├── .editorconfig ├── .gitattributes ├── .gitignore ├── Directory.Build.props ├── Example.cs ├── Icon.ico ├── Icon.svg ├── LICENSE.txt ├── README.md ├── System.Unicode.Build.Core ├── BinaryWriterExtensions.cs ├── CharExtensions.cs ├── CharacterDecompositionMapping.cs ├── DataSourceProvider.cs ├── DataSources │ ├── FileDataSource.cs │ ├── HttpDataSource.cs │ ├── InMemoryDataSource.cs │ └── ZipDataSource.cs ├── EnumHelper.cs ├── HexCodePoint.cs ├── IDataSource.cs ├── System.Unicode.Build.Core.csproj ├── UnicodeCharacterDataBuilder.cs ├── UnicodeDataFileReader.cs ├── UnicodeDataProcessor.cs ├── UnicodeDatabaseGenerator.cs ├── UnicodeInfoBuilder.cs ├── UnihanCharacterDataBuilder.cs ├── UnihanDataFileReader.cs ├── UnihanProperty.cs ├── UnihanProperty.tt └── Utf8Buffer.cs ├── System.Unicode.Build.DatabaseGenerator ├── Program.cs ├── Properties │ └── launchSettings.json └── System.Unicode.Build.DatabaseGenerator.csproj ├── System.Unicode.Build.Tasks ├── AsyncTask.cs ├── GenerateUnicodeDatabase.cs ├── GetUnicodeDatabaseVersion.cs ├── System.Unicode.Build.Tasks.csproj ├── System.Unicode.Build.Tasks.props └── System.Unicode.Build.Tasks.targets ├── System.Unicode.Tests ├── CodePointEnumerableTests.cs ├── ImportRequestedUnicodeVersion.targets ├── PermissiveCodePointEnumerableTests.cs ├── System.Unicode.Tests.csproj ├── UnicodeCodePointRangeTests.cs ├── UnicodeInfoTests.cs ├── UnicodeRationalNumerTests.cs ├── UnihanCharacterDataTests.cs └── XUnitSerializableString.cs ├── System.Unicode.snk ├── System.Unicode ├── BidirectionalClass.cs ├── CanonicalCombiningClass.cs ├── CjkRadicalData.cs ├── CjkRadicalInfo.cs ├── CodePointEnumerable.cs ├── CodePointEnumerator.cs ├── CompatibilityFormattingTag.cs ├── ContributoryProperties.cs ├── CoreProperties.cs ├── EmojiProperties.cs ├── EnumHelper.cs ├── GenerateUnicodeDatabase.proj ├── HangulInfo.cs ├── PermissiveCodePointEnumerable.cs ├── PermissiveCodePointEnumerator.cs ├── StringExtensions.cs ├── System.Unicode.csproj ├── UcdFields.cs ├── 
UnicodeBlock.cs ├── UnicodeCategoryExtensions.cs ├── UnicodeCategoryInfo.cs ├── UnicodeCharInfo.cs ├── UnicodeCharacterData.cs ├── UnicodeCodePointRange.cs ├── UnicodeCrossReferenceCollection.cs ├── UnicodeData.cs ├── UnicodeInfo.cs ├── UnicodeNameAlias.cs ├── UnicodeNameAliasCollection.cs ├── UnicodeNameAliasKind.cs ├── UnicodeNumericType.cs ├── UnicodeRadicalStrokeCount.cs ├── UnicodeRadicalStrokeCountCollection.cs ├── UnicodeRationalNumber.cs ├── UnihanCharacterData.Generated.cs ├── UnihanCharacterData.cs ├── UnihanCharacterData.tt ├── UnihanFields.cs ├── UnihanNumericType.cs ├── ValueNameAttribute.cs ├── packageIcon.png └── ucd.dat ├── UnicodeCharacterInspector ├── UnicodeCharacterInspector.ico └── UnicodeCharacterInspector.svg ├── UnicodeInformation.sln ├── UnicodeVersion.txt └── azure-pipelines.yml /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). 
Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 
53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.sln.docstates 8 | 9 | # Build results 10 | [Dd]ebug/ 11 | [Dd]ebugPublic/ 12 | [Rr]elease/ 13 | x64/ 14 | build/ 15 | bld/ 16 | [Bb]in/ 17 | [Oo]bj/ 18 | 19 | # Roslyn cache directories 20 | *.ide/ 21 | .vs/ 22 | 23 | # MSTest test Results 24 | [Tt]est[Rr]esult*/ 25 | [Bb]uild[Ll]og.* 26 | 27 | #NUNIT 28 | *.VisualState.xml 29 | TestResult.xml 30 | 31 | # Build Results of an ATL Project 32 | [Dd]ebugPS/ 33 | [Rr]eleasePS/ 34 | dlldata.c 35 | 36 | *_i.c 37 | *_p.c 38 | *_i.h 39 | *.ilk 40 | *.meta 41 | *.obj 42 | *.pch 43 | *.pdb 44 | *.pgc 45 | *.pgd 46 | *.rsp 47 | *.sbr 48 | *.tlb 49 | *.tli 50 | *.tlh 51 | *.tmp 52 | *.tmp_proj 53 | *.log 54 | *.vspscc 55 | *.vssscc 56 | .builds 57 | *.pidb 58 | *.svclog 59 | *.scc 60 | 61 | # Chutzpah Test files 62 | _Chutzpah* 63 | 64 | # Visual C++ cache files 65 | ipch/ 66 | *.aps 67 | *.ncb 68 | *.opensdf 69 | *.sdf 70 | *.cachefile 71 | 72 | # Visual Studio profiler 73 | *.psess 74 | *.vsp 75 | *.vspx 76 | 77 | # TFS 2012 Local Workspace 78 | $tf/ 79 | 80 | # Guidance Automation Toolkit 81 | *.gpState 82 | 83 | # ReSharper is a .NET coding add-in 84 | _ReSharper*/ 85 | *.[Rr]e[Ss]harper 86 | *.DotSettings.user 87 | 88 | # JustCode is a .NET coding addin-in 89 | .JustCode 90 | 91 | # TeamCity is a build 
add-in 92 | _TeamCity* 93 | 94 | # DotCover is a Code Coverage Tool 95 | *.dotCover 96 | 97 | # NCrunch 98 | _NCrunch_* 99 | .*crunch*.local.xml 100 | 101 | # MightyMoose 102 | *.mm.* 103 | AutoTest.Net/ 104 | 105 | # Web workbench (sass) 106 | .sass-cache/ 107 | 108 | # Installshield output folder 109 | [Ee]xpress/ 110 | 111 | # DocProject is a documentation generator add-in 112 | DocProject/buildhelp/ 113 | DocProject/Help/*.HxT 114 | DocProject/Help/*.HxC 115 | DocProject/Help/*.hhc 116 | DocProject/Help/*.hhk 117 | DocProject/Help/*.hhp 118 | DocProject/Help/Html2 119 | DocProject/Help/html 120 | 121 | # Click-Once directory 122 | publish/ 123 | 124 | # Publish Web Output 125 | *.[Pp]ublish.xml 126 | *.azurePubxml 127 | ## TODO: Comment the next line if you want to checkin your 128 | ## web deploy settings but do note that will include unencrypted 129 | ## passwords 130 | #*.pubxml 131 | 132 | # NuGet Packages Directory 133 | packages/* 134 | ## TODO: If the tool you use requires repositories.config 135 | ## uncomment the next line 136 | #!packages/repositories.config 137 | 138 | # Enable "build/" folder in the NuGet Packages folder since 139 | # NuGet packages use it for MSBuild targets. 140 | # This line needs to be after the ignore of the build folder 141 | # (and the packages folder if the line above has been uncommented) 142 | !packages/build/ 143 | 144 | # Windows Azure Build Output 145 | csx/ 146 | *.build.csdef 147 | 148 | # Windows Store app package directory 149 | AppPackages/ 150 | 151 | # Others 152 | sql/ 153 | *.Cache 154 | ClientBin/ 155 | [Ss]tyle[Cc]op.* 156 | ~$* 157 | *~ 158 | *.dbmdl 159 | *.dbproj.schemaview 160 | *.pfx 161 | *.publishsettings 162 | node_modules/ 163 | 164 | # RIA/Silverlight projects 165 | Generated_Code/ 166 | 167 | # Backup & report files from converting an old project file 168 | # to a newer Visual Studio version. 
Backup files are not needed, 169 | # because we have git ;-) 170 | _UpgradeReport_Files/ 171 | Backup*/ 172 | UpgradeLog*.XML 173 | UpgradeLog*.htm 174 | 175 | # SQL Server files 176 | *.mdf 177 | *.ldf 178 | 179 | # Business Intelligence projects 180 | *.rdl.data 181 | *.bim.layout 182 | *.bim_*.settings 183 | 184 | # Microsoft Fakes 185 | FakesAssemblies/ 186 | 187 | # LightSwitch generated files 188 | GeneratedArtifacts/ 189 | _Pvt_Extensions/ 190 | ModelManifest.xml 191 | -------------------------------------------------------------------------------- /Directory.Build.props: -------------------------------------------------------------------------------- 1 | 2 | 3 | 9.0 4 | .NET Unicode Information 5 | Fabien Barbier 6 | Copyright © Fabien Barbier 2014-2019 7 | en 8 | 2.7.1 9 | 2.7.1 10 | 11 | 12 | true 13 | $(MSBuildThisFileDirectory)System.Unicode.snk 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /Example.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Text; 3 | using System.Unicode; 4 | 5 | namespace Example 6 | { 7 | internal static class Program 8 | { 9 | private static void Main() 10 | { 11 | Console.OutputEncoding = Encoding.Unicode; 12 | PrintCodePointInfo('A'); 13 | PrintCodePointInfo('∞'); 14 | PrintCodePointInfo(0x1F600); 15 | } 16 | 17 | private static void PrintCodePointInfo(int codePoint) 18 | { 19 | var charInfo = UnicodeInfo.GetCharInfo(codePoint); 20 | Console.WriteLine(UnicodeInfo.GetDisplayText(charInfo)); 21 | Console.WriteLine("U+" + codePoint.ToString("X4")); 22 | Console.WriteLine(charInfo.Name ?? 
charInfo.OldName); 23 | Console.WriteLine(charInfo.Category); 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /Icon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hexawyz/NetUnicodeInfo/c2ab5227094d8f934b34fe6d3186c7f1e2be5e74/Icon.ico -------------------------------------------------------------------------------- /Icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 14 | 16 | 18 | 19 | 21 | image/svg+xml 22 | 24 | 25 | 26 | 27 | 28 | 31 | 36 | 40 | 45 | 49 | 53 | 57 | 61 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Fabien Barbier 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # .NET Unicode Information Library 2 | 3 | [![Build Status](https://dev.azure.com/goldencrystal/UnicodeInformation/_apis/build/status/GoldenCrystal.NetUnicodeInfo?branchName=master)](https://dev.azure.com/goldencrystal/UnicodeInformation/_build/latest?definitionId=1&branchName=master) 4 | 5 | ## Summary 6 | 7 | This project consists of a library that provides access to some of the data contained in the Unicode Character Database. 8 | 9 | ## Version of Unicode supported 10 | 11 | Unicode 13.0 12 | Emoji 13.0 13 | 14 | ## Breaking changes from versions 1.x to 2.x 15 | 16 | UnicodeRadicalStrokeCount.StrokeCount is now of type System.SByte instead of type System.Byte. 17 | 18 | ## Using the library 19 | 20 | ### Reference the NuGet package 21 | 22 | Grab the latest version of the package on NuGet: https://www.nuget.org/packages/UnicodeInformation/. 23 | Once the library is installed in your project, you will find everything you need in the System.Unicode namespace. 24 | 25 | ### Basic information 26 | 27 | Everything provided by the library will be under the namespace `System.Unicode`. 28 | XML documentation should be complete enough so that you can navigate the API without getting lost. 29 | 30 | In its current state, the project is written in C# 7.3, compilable by [Roslyn](http://roslyn.codeplex.com/), and targets both .NET Standard 2.0 and .NET Standard 1.1. 
31 | The library UnicodeInformation includes a (large) subset of the official [Unicode Character Database](http://www.unicode.org/Public/UCD/latest/) stored in a custom file format. 32 | 33 | ### Example usage 34 | 35 | The following program will display information about a few characters: 36 | 37 | ```csharp 38 | using System; 39 | using System.Text; 40 | using System.Unicode; 41 | 42 | namespace Example 43 | { 44 | internal static class Program 45 | { 46 | private static void Main() 47 | { 48 | Console.OutputEncoding = Encoding.Unicode; 49 | PrintCodePointInfo('A'); 50 | PrintCodePointInfo('∞'); 51 | PrintCodePointInfo(0x1F600); 52 | } 53 | 54 | private static void PrintCodePointInfo(int codePoint) 55 | { 56 | var charInfo = UnicodeInfo.GetCharInfo(codePoint); 57 | Console.WriteLine(UnicodeInfo.GetDisplayText(charInfo)); 58 | Console.WriteLine("U+" + codePoint.ToString("X4")); 59 | Console.WriteLine(charInfo.Name ?? charInfo.OldName); 60 | Console.WriteLine(charInfo.Category); 61 | } 62 | } 63 | } 64 | ``` 65 | 66 | Explanations: 67 | 68 | * `UnicodeInfo.GetCharInfo(int)` returns a structure `UnicodeCharInfo` that provides access to various bits of information associated with the specified code point. 69 | * `UnicodeInfo.GetDisplayText(UnicodeCharInfo)` is a helper method that computes a display text for the specified code point. 70 | Since some code points are not designed to be displayed in a standalone fashion, this will try to make the specified character more displayable. 71 | The algorithm used to provide a display text is quite simplistic, and will only affect very specific code points. (e.g. Control Characters) 72 | For most code points, this will simply return the direct string representation. 73 | * `UnicodeCharInfo.Name` returns the name of the code point as specified by the Unicode standard. 74 | Please note that some characters will, by design, not have any name assigned to them in the standard. (e.g. 
control characters) 75 | Those characters, however, may have alternate names assigned to them that you can use as fallbacks. (e.g. `UnicodeCharInfo.OldName`) 76 | * `UnicodeCharInfo.OldName` returns the name of the character as defined in Unicode 1.0, when applicable and different from the current name. 77 | * `UnicodeCharInfo.Category` returns the category assigned to the specified code point. 78 | 79 | 80 | ### Included Properties 81 | 82 | #### From UCD 83 | * Name 84 | * General_Category 85 | * Canonical_Combining_Class 86 | * Bidi_Class 87 | * Decomposition_Type 88 | * Decomposition_Mapping 89 | * Numeric_Type (See also kAccountingNumeric/kOtherNumeric/kPrimaryNumeric. Those will set Numeric_Type to Numeric.) 90 | * Numeric_Value 91 | * Bidi_Mirrored 92 | * Unicode_1_Name 93 | * Simple_Uppercase_Mapping 94 | * Simple_Lowercase_Mapping 95 | * Simple_Titlecase_Mapping 96 | * Name_Alias 97 | * Block 98 | * ASCII_Hex_Digit 99 | * Bidi_Control 100 | * Dash 101 | * Deprecated 102 | * Diacritic 103 | * Extender 104 | * Hex_Digit 105 | * Hyphen 106 | * Ideographic 107 | * IDS_Binary_Operator 108 | * IDS_Trinary_Operator 109 | * Join_Control 110 | * Logical_Order_Exception 111 | * Noncharacter_Code_Point 112 | * Other_Alphabetic 113 | * Other_Default_Ignorable_Code_Point 114 | * Other_Grapheme_Extend 115 | * Other_ID_Continue 116 | * Other_ID_Start 117 | * Other_Lowercase 118 | * Other_Math 119 | * Other_Uppercase 120 | * Pattern_Syntax 121 | * Pattern_White_Space 122 | * Quotation_Mark 123 | * Radical 124 | * Soft_Dotted 125 | * STerm 126 | * Terminal_Punctuation 127 | * Unified_Ideograph 128 | * Variation_Selector 129 | * White_Space 130 | * Lowercase 131 | * Uppercase 132 | * Cased 133 | * Case_Ignorable 134 | * Changes_When_Lowercased 135 | * Changes_When_Uppercased 136 | * Changes_When_Titlecased 137 | * Changes_When_Casefolded 138 | * Changes_When_Casemapped 139 | * Alphabetic 140 | * Default_Ignorable_Code_Point 141 | * Grapheme_Base 142 | * Grapheme_Extend 143 | 
* Grapheme_Link 144 | * Math 145 | * ID_Start 146 | * ID_Continue 147 | * XID_Start 148 | * XID_Continue 149 | * Unicode_Radical_Stroke (This is actually kRSUnicode from the Unihan database) 150 | * Code point cross references extracted from NamesList.txt 151 | 152 | NB: The UCD property ISO_Comment will never be included since this one is empty in all new Unicode versions. 153 | 154 | #### From Unicode Emoji 155 | 156 | * Emoji 157 | * Emoji_Presentation 158 | * Emoji_Modifier 159 | * Emoji_Modifier_Base 160 | * Emoji_Component 161 | * Extended_Pictographic 162 | 163 | #### From Unihan 164 | * kAccountingNumeric 165 | * kOtherNumeric 166 | * kPrimaryNumeric 167 | * kRSUnicode 168 | * kDefinition 169 | * kMandarin 170 | * kCantonese 171 | * kJapaneseKun 172 | * kJapaneseOn 173 | * kKorean 174 | * kHangul 175 | * kVietnamese 176 | * kSimplifiedVariant 177 | * kTraditionalVariant 178 | 179 | ### Regenerating the data 180 | The project UnicodeInformation.Builder takes care of generating a file named ucd.dat. This file contains Unicode data compressed by .NET's deflate algorithm, and should be included in UnicodeInformation.dll at compilation. 
181 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/BinaryWriterExtensions.cs:
--------------------------------------------------------------------------------
using System.IO;
using System.Text;

namespace System.Unicode.Build.Core
{
    /// <summary>Extension methods for <see cref="BinaryWriter"/> that emit the compact encodings defined below.</summary>
    public static class BinaryWriterExtensions
    {
        /// <summary>Writes a 24-bit unsigned integer as three bytes, least significant byte first.</summary>
        /// <param name="writer">The binary writer to use.</param>
        /// <param name="value">The value to write, between 0 and 0xFFFFFF inclusive.</param>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="value"/> is negative or does not fit in 24 bits.</exception>
        public static void WriteUInt24(this BinaryWriter writer, int value)
        {
            if (value < 0 || value > 0xFFFFFF) throw new ArgumentOutOfRangeException(nameof(value));

            writer.Write((byte)value);
            writer.Write((byte)(value >> 8));
            writer.Write((byte)(value >> 16));
        }

        /// <summary>Writes a 64-bit unsigned integer in groups of 7 bits, least significant group first.</summary>
        /// <remarks>Every byte except the last one has its most significant bit set, marking that more bytes follow.</remarks>
        /// <param name="writer">The binary writer to use.</param>
        /// <param name="value">The value to write.</param>
        public static void WriteVariableUInt64(this BinaryWriter writer, ulong value)
        {
            byte b = (byte)(value & 0x7F);
            value >>= 7;

            while (value > 0)
            {
                // More groups follow: flag the current byte as a continuation.
                writer.Write((byte)(b | 0x80));
                b = (byte)(value & 0x7F);
                value >>= 7;
            }
            writer.Write(b);
        }

        /// <summary>Writes code point in a custom, but compact encoding.</summary>
        /// <remarks>
        /// Unlike UTF-8, this encoding will consume at most 3 bytes.
        /// The lead byte selects the form: 0x00-0x9F is the value itself; 0xA0-0xBF and 0xC0-0xDF carry
        /// 5 high bits followed by one byte; 0xE0-0xFF carries 5 high bits followed by two bytes.
        /// It can therefore store values between 0x0 and 0x20409F, which is more than the Unicode code space needs.
        /// </remarks>
        /// <param name="writer">The binary writer to use.</param>
        /// <param name="value">The value to write</param>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="value"/> is outside of the encodable range.</exception>
        public static void WriteCodePoint(this BinaryWriter writer, int value)
        {
            // 0x20409F = 0x40A0 + 0x1FFFFF: the three byte form only has 5 + 16 payload bits.
            // Anything above would overflow into the lead byte marker bits and alias a smaller value.
            if (value < 0 || value > 0x20409F) throw new ArgumentOutOfRangeException(nameof(value));

            if (value < 0xA0) writer.Write((byte)value);
            else if (value < 0x20A0)
            {
                value -= 0xA0;
                writer.Write((byte)((byte)(value >> 8) | 0xA0));
                writer.Write((byte)value);
            }
            else if (value < 0x40A0)
            {
                value -= 0x20A0;
                writer.Write((byte)((byte)(value >> 8) | 0xC0));
                writer.Write((byte)value);
            }
            else
            {
                value -= 0x40A0;
                writer.Write((byte)((byte)(value >> 16) | 0xE0));
                writer.Write((byte)(value >> 8));
                writer.Write((byte)value);
            }
        }

        /// <summary>Writes a character name alias.</summary>
        /// <remarks>We assume that character names will not exceed 64 bytes in length.</remarks>
        /// <param name="writer">The writer to use.</param>
        /// <param name="nameAlias">The name alias value to write.</param>
        public static void WriteNameAliasToFile(this BinaryWriter writer, UnicodeNameAlias nameAlias)
        {
            // BinaryWriter.Write(string) emits its own length prefix before the encoded characters.
            writer.Write(nameAlias.Name);
            writer.Write((byte)nameAlias.Kind);
        }

        /// <summary>Writes a character name, packing two information bits along with the length.</summary>
        /// <remarks>We assume that character names will not exceed 128 bytes in length.</remarks>
        /// <param name="writer">The writer to use.</param>
        /// <param name="name">The name to write.</param>
        /// <exception cref="InvalidOperationException">The UTF-8 encoded name is longer than 128 bytes.</exception>
        public static void WriteNamePropertyToFile(this BinaryWriter writer, string name)
        {
            var bytes = Encoding.UTF8.GetBytes(name);
            if (bytes.Length > 128) throw new InvalidOperationException("Did not expect UTF-8 encoded name to be longer than 128 bytes.");
            // The prefix must describe the number of bytes that follow, not the number of UTF-16 chars in the string.
            // The most significant bit will always be cleared, because it will be used for other cases.
            // NOTE(review): an empty name would produce 0xFF here; callers are presumably never passing one - confirm.
            writer.Write((byte)(bytes.Length - 1));
            writer.Write(bytes);
        }

        /// <summary>Writes a 6 bits length packed with two extra bits.</summary>
        /// <remarks>The parameters have a restricted range, which must be respected.</remarks>
        /// <param name="writer">The writer used to perform the operation.</param>
/// <param name="extraBits">The value of the two extra bits.</param>
        /// <param name="length">The length to write.</param>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="extraBits"/> is greater than 3, or <paramref name="length"/> is not between 1 and 64.</exception>
        public static void WritePackedLength(this BinaryWriter writer, byte extraBits, int length)
        {
            if (extraBits > 3) throw new ArgumentOutOfRangeException(nameof(extraBits));
            if (length < 1 || length > 64) throw new ArgumentOutOfRangeException(nameof(length));

            // Bits 7-6 hold the extra bits; bits 5-0 hold (length - 1), so that lengths 1..64 fit in 6 bits.
            writer.Write((byte)((extraBits << 6) | (length - 1)));
        }
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.Core/CharExtensions.cs:
--------------------------------------------------------------------------------
namespace System.Unicode.Build.Core
{
    public static class CharExtensions
    {
        /// <summary>Determines whether the character is one of 0-9, A-F or a-f.</summary>
        /// <param name="c">The character to test.</param>
        /// <returns><see langword="true"/> if the character is an ASCII hexadecimal digit; otherwise <see langword="false"/>.</returns>
        public static bool IsHexDigit(this char c)
            => c >= '0' && c <= 'f' && (c <= '9' || c <= 'F' && c >= 'A' || c >= 'a');
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.Core/CharacterDecompositionMapping.cs:
--------------------------------------------------------------------------------
namespace System.Unicode.Build.Core
{
    /// <summary>Holds a decomposition type together with the decomposed text.</summary>
    public struct CharacterDecompositionMapping
    {
        public readonly CompatibilityFormattingTag DecompositionType;
        public readonly string DecompositionMapping;

        public CharacterDecompositionMapping(CompatibilityFormattingTag decompositionType, string decompositionMapping)
        {
            DecompositionType = decompositionType;
            DecompositionMapping = decompositionMapping;
        }

        /// <summary>Parses an optional <c>&lt;tag&gt;</c> prefix followed by space separated hexadecimal code points.</summary>
        /// <param name="s">The text to parse. A null or empty value yields the default mapping.</param>
        /// <returns>The parsed mapping, whose text is the UTF-16 encoding of the listed code points.</returns>
        /// <exception cref="FormatException">
        /// The tag is unterminated or unknown, a code point is not valid hexadecimal, or a code point is above U+10FFFF.
        /// </exception>
        public unsafe static CharacterDecompositionMapping Parse(string s)
        {
            if (string.IsNullOrEmpty(s)) return default;

            var tag = CompatibilityFormattingTag.Canonical;

            int index;

            if (s[0] == '<')
            {
                index = s.IndexOf('>');

                // A missing '>' is a malformed input, not an argument range problem of Substring.
                if (index < 0 || !EnumHelper<CompatibilityFormattingTag>.TryGetNamedValue(s.Substring(1, index - 1), out tag))
                    throw new FormatException();
                ++index;
            }
            else
            {
                index = 0;
            }

            var buffer = stackalloc char[36]; // From the Unicode docs, a decomposition cannot have more than 18 code points.
            int charIndex = 0;

            // Stopping at 35 guarantees that a surrogate pair (two chars) always fits in the buffer.
            while (index < s.Length && charIndex < 35)
            {
                char c = s[index];

                if (c == ' ') ++index;
                else
                {
                    int codePoint = HexCodePoint.Parse(s, ref index);

                    if (codePoint < 0x10000)
                        buffer[charIndex++] = (char)codePoint;
                    else if (codePoint <= 0x10FFFF) // U+10FFFF is the last valid code point and must be accepted.
                    {
                        // Supplementary code points are stored as a high/low surrogate pair.
                        codePoint -= 0x10000;
                        buffer[charIndex++] = (char)((codePoint >> 10) + 0xD800);
                        buffer[charIndex++] = (char)((codePoint & 0x3FF) + 0xDC00);
                    }
                    else
                    {
                        throw new FormatException("The code point was outside of the allowed range.");
                    }
                }
            }

            return new CharacterDecompositionMapping(tag, new string(buffer, 0, charIndex));
        }
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.Core/DataSourceProvider.cs:
--------------------------------------------------------------------------------
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Net.Http;
using System.Threading.Tasks;
using System.Unicode.Build.Core.DataSources;

namespace System.Unicode.Build.Core
{
    /// <summary>Locates, downloads and opens the data files required to build the database.</summary>
    public static class DataSourceProvider
    {
        public static readonly Uri UnicodeCharacterDataUri = new Uri("http://www.unicode.org/Public/UCD/latest/ucd/", UriKind.Absolute);
        public static readonly Uri UcdEmojiDataUri = new Uri("http://www.unicode.org/Public/UCD/latest/ucd/emoji/", UriKind.Absolute);
        public static readonly Uri EmojiDataUri = new Uri("http://www.unicode.org/Public/emoji/latest/", UriKind.Absolute);

        public const string UnihanDataSourceName = "Unihan";
        public const string UcdDataSourceName = "UCD";
        public const string EmojiDataSourceName = "Emoji";

        public static readonly string[] UcdRequiredFiles = new[]
        {
            "UnicodeData.txt",
            "PropList.txt",
            "DerivedCoreProperties.txt",
            "CJKRadicals.txt",
            //"Jamo.txt", // Not used right now, as the hangul syllable algorithm implementation takes care of this.
            "NameAliases.txt",
            "NamesList.txt",
            "Blocks.txt",
        };

        public static readonly string[] UcdEmojiRequiredFiles = new[]
        {
            "emoji-data.txt",
            "emoji-variation-sequences.txt",
        };

        public static readonly string[] UnihanRequiredFiles = new[]
        {
            "Unihan_NumericValues.txt",
            "Unihan_Readings.txt",
            "Unihan_Variants.txt",
            "Unihan_IRGSources.txt",
        };

        public static readonly string[] EmojiRequiredFiles = new[]
        {
            //"emoji-data.txt",
            "emoji-sequences.txt",
            //"emoji-variation-sequences.txt",
            "emoji-zwj-sequences.txt",
        };

        /// <summary>Downloads the file named <paramref name="archiveName"/>, resolved against <paramref name="baseUri"/>, into memory.</summary>
        private static Task<byte[]> DownloadDataFileAsync(HttpClient httpClient, Uri baseUri, string archiveName)
            => httpClient.GetByteArrayAsync(new Uri(baseUri, archiveName));

        /// <summary>Gets a data source, using local files when they are available and allowed, and downloading otherwise.</summary>
        /// <param name="httpClient">The client used for downloads.</param>
        /// <param name="baseUri">The URI against which file names are resolved when downloading.</param>
        /// <param name="baseDirectory">The directory that contains (or will contain) the data directory and archive.</param>
        /// <param name="dataSourceName">The name of the data directory; the archive is this name followed by <c>.zip</c>.</param>
        /// <param name="requiredFiles">The files that must all exist for a local directory to be used, and that are downloaded when no archive is used.</param>
        /// <param name="useArchive">Whether the data is obtained as a single zip archive rather than as individual files.</param>
        /// <param name="shouldDownload"><see langword="true"/> skips the local lookup; <see langword="false"/> forbids downloading; <see langword="null"/> downloads only when nothing usable is found locally.</param>
        /// <param name="shouldSaveFiles"><see langword="true"/> to write downloaded data to disk; otherwise it is kept in memory.</param>
        /// <param name="shouldExtract"><see langword="true"/> to extract archives into the data directory instead of reading from them.</param>
        /// <returns>A data source giving access to the files.</returns>
        /// <exception cref="InvalidOperationException">Downloading was forbidden and no usable local data was found.</exception>
        public static async Task<IDataSource> GetDataSourceAsync(HttpClient httpClient, Uri baseUri, string baseDirectory, string dataSourceName, string[] requiredFiles, bool useArchive, bool? shouldDownload, bool? shouldSaveFiles, bool? shouldExtract)
        {
            string dataDirectory = Path.GetFullPath(Path.Combine(baseDirectory, dataSourceName));
            string dataArchiveFileName = dataSourceName + ".zip";
            string dataArchivePath = dataDirectory + ".zip";

            if (shouldDownload != true)
            {
                bool hasValidDirectory = Directory.Exists(dataDirectory);

                if (hasValidDirectory)
                {
                    // The directory is only usable when every required file is present.
                    foreach (string requiredFile in requiredFiles)
                    {
                        if (!File.Exists(Path.Combine(dataDirectory, requiredFile)))
                        {
                            hasValidDirectory = false;
                            break;
                        }
                    }
                }

                if (hasValidDirectory)
                {
                    return new FileDataSource(dataDirectory);
                }

                if (useArchive && File.Exists(dataArchivePath))
                {
                    if (shouldExtract == true)
                    {
                        ZipFile.ExtractToDirectory(dataArchivePath, dataDirectory);
                        return new FileDataSource(dataDirectory);
                    }
                    else
                    {
                        return new ZipDataSource(File.OpenRead(dataArchivePath));
                    }
                }
            }

            if (shouldDownload != false)
            {
                if (useArchive)
                {
                    var dataArchiveData = await DownloadDataFileAsync(httpClient, baseUri, dataArchiveFileName).ConfigureAwait(false);

                    if (shouldSaveFiles == true)
                    {
                        // The stream must outlive this method when it is handed over to a ZipDataSource,
                        // so it cannot be wrapped in a using block: it is only disposed here if it was not handed over.
                        var stream = File.Open(dataArchivePath, FileMode.Create, FileAccess.ReadWrite, FileShare.Read);
                        try
                        {
                            await stream.WriteAsync(dataArchiveData, 0, dataArchiveData.Length).ConfigureAwait(false);
                            dataArchiveData = null; // Release the reference now, since we won't need it anymore.

                            if (shouldExtract == true)
                            {
                                using (var archive = new ZipArchive(stream, ZipArchiveMode.Read, false))
                                {
                                    archive.ExtractToDirectory(dataDirectory);
                                }

                                return new FileDataSource(dataDirectory);
                            }

                            var dataSource = new ZipDataSource(stream);
                            stream = null; // Ownership was transferred: the archive held by the data source disposes the stream.
                            return dataSource;
                        }
                        finally
                        {
                            stream?.Dispose();
                        }
                    }
                    else
                    {
                        return new ZipDataSource(new MemoryStream(dataArchiveData));
                    }
                }
                else
                {
                    // Download all the required files concurrently.
                    var downloadedFiles = await Task.WhenAll
                    (
                        Array.ConvertAll
                        (
                            requiredFiles,
                            async requiredFile =>
                            (
                                Name: requiredFile,
                                Data: await DownloadDataFileAsync(httpClient, baseUri, requiredFile).ConfigureAwait(false)
                            )
                        )
                    ).ConfigureAwait(false);

                    if (shouldSaveFiles == true)
                    {
                        Directory.CreateDirectory(dataDirectory);

                        await Task.WhenAll
                        (
                            Array.ConvertAll
                            (
                                downloadedFiles,
                                //file => File.WriteAllBytesAsync(Path.Combine(dataDirectory, file.Name), file.Data)
                                async file =>
                                {
                                    using (var stream = File.Open(Path.Combine(dataDirectory, file.Name), FileMode.Create, FileAccess.ReadWrite, FileShare.Read))
                                    {
                                        await stream.WriteAsync(file.Data, 0, file.Data.Length).ConfigureAwait(false);
                                    }
                                }
                            )
                        ).ConfigureAwait(false);

                        return new FileDataSource(dataDirectory);
                    }
                    else
                    {
                        return new InMemoryDataSource(downloadedFiles.ToDictionary(f => f.Name, f => f.Data));
                    }
                }
            }

            // Reached only when downloading is forbidden and nothing usable exists locally.
            throw new InvalidOperationException("No local data was found for the data source \"" + dataSourceName + "\", and downloading it was not allowed.");
        }
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.Core/DataSources/FileDataSource.cs:
--------------------------------------------------------------------------------
using System.IO;
using System.Threading.Tasks;

namespace System.Unicode.Build.Core.DataSources
{
    public sealed class
FileDataSource : IDataSource
    {
        private readonly string _baseDirectory;

        /// <summary>Creates a data source reading files from the specified directory.</summary>
        /// <param name="baseDirectory">The directory containing the files; it is converted to a full path.</param>
        public FileDataSource(string baseDirectory)
            => _baseDirectory = Path.GetFullPath(baseDirectory);

        public void Dispose()
        {
        }

        /// <summary>Opens the specified file of the base directory for reading.</summary>
        public ValueTask<Stream> OpenDataFileAsync(string fileName)
            => new ValueTask<Stream>(File.OpenRead(Path.Combine(_baseDirectory, fileName)));
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.Core/DataSources/HttpDataSource.cs:
--------------------------------------------------------------------------------
using System.IO;
using System.Net.Http;
using System.Threading.Tasks;

namespace System.Unicode.Build.Core.DataSources
{
    /// <summary>A data source that fetches every file over HTTP, relative to a base URI.</summary>
    public class HttpDataSource : IDataSource
    {
        private readonly HttpClient _httpClient;
        private readonly Uri _baseUri;

        /// <summary>Creates a data source downloading files relative to <paramref name="baseUri"/>.</summary>
        /// <param name="baseUri">The URI against which file names are resolved.</param>
        /// <param name="httpClient">The client to use, or <see langword="null"/> to create a new one.</param>
        /// <exception cref="ArgumentNullException"><paramref name="baseUri"/> is <see langword="null"/>.</exception>
        public HttpDataSource(Uri baseUri, HttpClient httpClient)
        {
            // Fail at construction rather than on the first request.
            _baseUri = baseUri ?? throw new ArgumentNullException(nameof(baseUri));
            _httpClient = httpClient ?? new HttpClient();
        }

        // NOTE(review): this also disposes a client that was supplied by the caller - confirm that this ownership transfer is intended.
        public void Dispose() => _httpClient.Dispose();

        /// <summary>Starts downloading the specified file and returns its content stream.</summary>
        public ValueTask<Stream> OpenDataFileAsync(string fileName)
            // Resolve the name with Uri semantics instead of concatenating the textual form of the base URI.
            => new ValueTask<Stream>(_httpClient.GetStreamAsync(new Uri(_baseUri, fileName)));
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.Core/DataSources/InMemoryDataSource.cs:
--------------------------------------------------------------------------------
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;

namespace System.Unicode.Build.Core.DataSources
{
    /// <summary>A data source serving files from byte arrays held in memory, keyed by file name.</summary>
    internal class InMemoryDataSource : IDataSource
    {
        private readonly Dictionary<string, byte[]> _files;

        public InMemoryDataSource(Dictionary<string, byte[]> files) => _files = files;

        public void Dispose() { }

        /// <summary>Returns a read-only stream over the bytes stored for the specified file name.</summary>
        public ValueTask<Stream> OpenDataFileAsync(string fileName)
            => new ValueTask<Stream>(new MemoryStream(_files[fileName], false));
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.Core/DataSources/ZipDataSource.cs:
--------------------------------------------------------------------------------
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Threading.Tasks;

namespace System.Unicode.Build.Core.DataSources
{
    /// <summary>A data source reading files from the entries of a zip archive.</summary>
    public sealed class ZipDataSource : IDataSource
    {
        private readonly ZipArchive _archive;

        /// <summary>Opens the archive contained in the specified stream for reading.</summary>
        /// <param name="stream">The stream containing the archive. It is disposed together with this instance.</param>
        public ZipDataSource(Stream stream) => _archive = new ZipArchive(stream, ZipArchiveMode.Read, false);

        public void Dispose() => _archive.Dispose();

        /// <summary>Opens the archive entry whose full name is exactly <paramref name="fileName"/>.</summary>
        /// <exception cref="FileNotFoundException">The archive has no such entry.</exception>
        public ValueTask<Stream> OpenDataFileAsync(string fileName)
        {
            var entry = _archive.Entries.FirstOrDefault(e => e.FullName == fileName);

            // Report which file is missing, so that an incomplete archive can be diagnosed.
            if (entry == null) throw new FileNotFoundException("The file was not found in the archive.", fileName);

            return new ValueTask<Stream>(entry.Open());
        }
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.Core/EnumHelper.cs:
--------------------------------------------------------------------------------
using System.Collections.Generic;
using System.Linq;
using System.Reflection;

namespace System.Unicode.Build.Core
{
    /// <summary>Maps the names attached to the members of an enum through attributes back to the enum values.</summary>
    internal static class EnumHelper<T>
        where T : struct, Enum
    {
        private static readonly Dictionary<string, T> NamedValueDictionary = CreateNamedValueDictionary();

        /// <summary>Builds the case-insensitive name lookup from the public constant fields of <typeparamref name="T"/>.</summary>
        private static Dictionary<string, T> CreateNamedValueDictionary()
        {
            var type = typeof(T).GetTypeInfo();

            if (!type.IsEnum) throw new InvalidOperationException();

            // NOTE(review): the attribute type argument was missing from the reviewed text; ValueNameAttribute is assumed
            // because it is the only attribute listed in the project and exposes the Name used here - confirm.
            return
            (
                from field in type.DeclaredFields
                where field.IsPublic && field.IsLiteral
                from attr in field.GetCustomAttributes<ValueNameAttribute>()
                where attr.Name != null
                select new KeyValuePair<string, T>(attr.Name, (T)field.GetValue(null))
            ).ToDictionary(kvp => kvp.Key, kvp => kvp.Value, StringComparer.OrdinalIgnoreCase);
        }

        /// <summary>Looks up the enum value associated with the specified name, ignoring case.</summary>
        /// <returns><see langword="true"/> if a value was found; otherwise <see langword="false"/>.</returns>
        public static bool TryGetNamedValue(string name, out T value)
        {
            return NamedValueDictionary.TryGetValue(name, out value);
        }
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.Core/HexCodePoint.cs:
--------------------------------------------------------------------------------
namespace System.Unicode.Build.Core
{
    /// <summary>Parses hexadecimal numbers as they are written for code points.</summary>
    public static class HexCodePoint
    {
        /// <summary>Parses a value written in the form <c>U+nnnn</c>.</summary>
        /// <exception cref="FormatException">The text does not start with <c>U+</c>, or the digits are invalid.</exception>
        public static int ParsePrefixed(string s)
        {
            // Ordinal comparison: the prefix is a fixed ASCII marker and must not depend on the current culture.
            if (!s.StartsWith("U+", StringComparison.Ordinal))
            {
                throw new FormatException("Expected a code point in the form U+nnnn.");
            }
            return Parse(s, 2);
        }

        /// <summary>Parses the hexadecimal digits found at <paramref name="index"/>, up to the next space or the end of the text.</summary>
        public static int Parse(string s, int index) => Parse(s, ref index);

        /// <summary>Parses the hexadecimal digits found at <paramref name="index"/>, up to the next space or the end of the text.</summary>
        /// <param name="s">The text to parse.</param>
        /// <param name="index">The position of the first digit. On return, the position at which parsing stopped.</param>
        /// <returns>The parsed value, or 0 when there was no digit to read.</returns>
        /// <exception cref="FormatException">A character is not a hexadecimal digit, or the value does not fit in a positive 32-bit integer.</exception>
        public static int Parse(string s, ref int index)
        {
            int i = index;
            int accum = 0;

            while (i < s.Length)
            {
                char c = s[i];

                if (c == ' ') break;

                int digit;

                if (c >= '0' && c <= '9') digit = c - '0';
                else if (c >= 'A' && c <= 'F') digit = c - 'A' + 0xA;
                else if (c >= 'a' && c <= 'f') digit = c - 'a' + 0xA;
                else throw new FormatException();

                // Shifting a value that already uses the top bits would silently wrap around.
                if (accum > (int.MaxValue >> 4)) throw new FormatException("The hexadecimal value is too large.");

                accum = (accum << 4) | digit;

                ++i;
            }

            index = i;
            return accum;
        }
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.Core/IDataSource.cs:
--------------------------------------------------------------------------------
using System.IO;
using System.Threading.Tasks;

namespace System.Unicode.Build.Core
{
    /// <summary>Gives access to a set of named data files.</summary>
    public interface IDataSource : IDisposable
    {
        /// <summary>Opens the specified data file for reading.</summary>
        ValueTask<Stream> OpenDataFileAsync(string fileName);
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.Core/System.Unicode.Build.Core.csproj:
--------------------------------------------------------------------------------
1 | 2 | 3 | 4 | netstandard2.0 5 | True 6 | $(DefineConstants);BUILD_SYSTEM 7 | 8 | 9 | 10 | 11 | 
12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | TextTemplatingFileGenerator 47 | UnihanProperty.cs 48 | 49 | 50 | 51 | 52 | 53 | True 54 | True 55 | UnihanProperty.tt 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/UnicodeCharacterDataBuilder.cs:
--------------------------------------------------------------------------------
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.IO;
using System.Linq;

namespace System.Unicode.Build.Core
{
    /// <summary>
    /// Mutable builder that collects the properties of one code point or one code point range.
    /// The collected data can be turned into a <c>UnicodeCharacterData</c> or serialized with a <see cref="BinaryWriter"/>.
    /// </summary>
    // NOTE(review): the generic type arguments of List/IList are missing from this listing.
    // ToCharacterData() implies UnicodeNameAlias for the aliases; the cross references are enumerated as int.
    [DebuggerDisplay("{CodePointRange} {DisplayName,nq}")]
    public sealed class UnicodeCharacterDataBuilder
    {
        private UnicodeCategory _category = UnicodeCategory.OtherNotAssigned;

        private readonly List _nameAliases = new List();
        private readonly List _crossRerefences = new List();

        public UnicodeCodePointRange CodePointRange { get; }

        public string Name { get; set; }

        public IList NameAliases => _nameAliases;

        /// <summary>Gets or sets the general category.</summary>
        /// <exception cref="ArgumentOutOfRangeException">The value is not a member of <see cref="UnicodeCategory"/>.</exception>
        public UnicodeCategory Category
        {
            get => _category;
            set => _category = Enum.IsDefined(typeof(UnicodeCategory), value) ?
                value :
                throw new ArgumentOutOfRangeException(nameof(value));
        }

        public CanonicalCombiningClass CanonicalCombiningClass { get; set; } // Even values not defined in the enum are allowed here.
        public BidirectionalClass BidirectionalClass { get; set; }
        public CharacterDecompositionMapping CharacterDecompositionMapping { get; set; }
        public UnicodeNumericType NumericType { get; set; }
        public UnicodeRationalNumber NumericValue { get; set; }
        public string OldName { get; set; }
        public bool BidirectionalMirrored { get; set; }
        public string SimpleUpperCaseMapping { get; set; }
        public string SimpleLowerCaseMapping { get; set; }
        public string SimpleTitleCaseMapping { get; set; }
        public ContributoryProperties ContributoryProperties { get; set; }
        public CoreProperties CoreProperties { get; set; }
        public EmojiProperties EmojiProperties { get; set; }
        public IList CrossRerefences => _crossRerefences;

        /// <summary>Creates a builder for a single code point.</summary>
        public UnicodeCharacterDataBuilder(int codePoint)
            : this(new UnicodeCodePointRange(codePoint))
        {
        }

        /// <summary>Creates a builder for a code point range; the category starts as <see cref="UnicodeCategory.OtherNotAssigned"/>.</summary>
        public UnicodeCharacterDataBuilder(UnicodeCodePointRange codePointRange)
        {
            CodePointRange = codePointRange;
            _category = UnicodeCategory.OtherNotAssigned;
        }

        // Only referenced by the DebuggerDisplay attribute on the class.
        private string DisplayName => Name ?? OldName;

        /// <summary>Creates a <c>UnicodeCharacterData</c> from the current state of the builder.</summary>
        /// <remarks>
        /// Core and emoji properties are passed as a single integer: the core properties OR-ed with the emoji properties shifted left by 24 bits.
        /// An empty alias list is passed as <c>UnicodeNameAlias.EmptyArray</c>, an empty cross reference list as <see langword="null"/>.
        /// </remarks>
        internal UnicodeCharacterData ToCharacterData()
            => new UnicodeCharacterData
            (
                CodePointRange,
                Name,
                _nameAliases.Count > 0 ? _nameAliases.ToArray() : UnicodeNameAlias.EmptyArray,
                Category,
                CanonicalCombiningClass,
                BidirectionalClass,
                CharacterDecompositionMapping.DecompositionType,
                CharacterDecompositionMapping.DecompositionMapping,
                NumericType,
                NumericValue,
                BidirectionalMirrored,
                OldName,
                SimpleUpperCaseMapping,
                SimpleLowerCaseMapping,
                SimpleTitleCaseMapping,
                ContributoryProperties,
                (int)CoreProperties | (int)EmojiProperties << 24,
                CrossRerefences.Count > 0 ? CrossRerefences.ToArray() : null
            );

        /// <summary>Serializes this entry: a 16-bit field mask, the first code point, then every field flagged in the mask, in a fixed order.</summary>
        /// <param name="writer">The writer receiving the binary record.</param>
        /// <exception cref="InvalidDataException">There are more than 64 name aliases.</exception>
        /// <exception cref="OverflowException">The number of cross references minus one does not fit in a byte.</exception>
        internal void WriteToFile(BinaryWriter writer)
        {
            if (_nameAliases.Count > 64) throw new InvalidDataException("Cannot handle more than 64 name aliases.");

            // First pass: work out which fields carry a non-default value. Only those are written.
            UcdFields fields = default;

            if (!CodePointRange.IsSingleCodePoint) fields = UcdFields.CodePointRange;

            if (Name != null || _nameAliases.Count > 0) fields |= UcdFields.Name; // This field combines name and alias.
            if (_category != UnicodeCategory.OtherNotAssigned) fields |= UcdFields.Category;
            if (CanonicalCombiningClass != CanonicalCombiningClass.NotReordered) fields |= UcdFields.CanonicalCombiningClass;
            if (BidirectionalClass != 0) fields |= UcdFields.BidirectionalClass;
            if (CharacterDecompositionMapping.DecompositionMapping != null) fields |= UcdFields.DecompositionMapping;
            // The numeric type is stored directly inside the mask, starting at bit 6.
            // NOTE(review): presumably these are the bits tested through UcdFields.NumericNumeric below — confirm against UcdFields.
            fields |= (UcdFields)((int)NumericType << 6);
            if (BidirectionalMirrored) fields |= UcdFields.BidirectionalMirrored;
            if (OldName != null) fields |= UcdFields.OldName;
            if (SimpleUpperCaseMapping != null) fields |= UcdFields.SimpleUpperCaseMapping;
            if (SimpleLowerCaseMapping != null) fields |= UcdFields.SimpleLowerCaseMapping;
            if (SimpleTitleCaseMapping != null) fields |= UcdFields.SimpleTitleCaseMapping;
            if (ContributoryProperties != 0) fields |= UcdFields.ContributoryProperties;
            if (CoreProperties != 0 || EmojiProperties != 0) fields |= UcdFields.CorePropertiesAndEmojiProperties;
            if (_crossRerefences.Count > 0) fields |= UcdFields.CrossRerefences;

            // Second pass: write the mask, then the fields. The order below is the on-disk layout and must not change.
            writer.Write((ushort)fields);

            writer.WriteCodePoint(CodePointRange.FirstCodePoint);
            if ((fields & UcdFields.CodePointRange) != 0) writer.WriteCodePoint(CodePointRange.LastCodePoint);

            if ((fields & UcdFields.Name) != 0)
            {
                // We write the names by optimizing for the common case.
                // i.e. Most characters have only one name.
                // The first 8 bit sequence will encode either the length of the name property alone,
                // or the number of aliases and a bit indicating the presence of the name property.

                if (_nameAliases.Count > 0)
                {
                    // The first argument is 3 when a name precedes the aliases and 2 when only aliases are present.
                    writer.WritePackedLength((byte)(Name != null ? 3 : 2), _nameAliases.Count);

                    if (Name != null)
                        writer.WriteNamePropertyToFile(Name);

                    foreach (var nameAlias in _nameAliases)
                        writer.WriteNameAliasToFile(nameAlias);
                }
                else
                {
                    writer.WriteNamePropertyToFile(Name);
                }
            }
            if ((fields & UcdFields.Category) != 0) writer.Write((byte)_category);
            if ((fields & UcdFields.CanonicalCombiningClass) != 0) writer.Write((byte)CanonicalCombiningClass);
            if ((fields & UcdFields.BidirectionalClass) != 0) writer.Write((byte)BidirectionalClass);
            if ((fields & UcdFields.DecompositionMapping) != 0)
            {
                writer.Write((byte)CharacterDecompositionMapping.DecompositionType);
                writer.Write(CharacterDecompositionMapping.DecompositionMapping);
            }
            if ((fields & UcdFields.NumericNumeric) != 0)
            {
                writer.Write(NumericValue.Numerator);
                writer.WriteVariableUInt64(NumericValue.Denominator);
            }
            if ((fields & UcdFields.OldName) != 0) writer.Write(OldName);
            if ((fields & UcdFields.SimpleUpperCaseMapping) != 0) writer.Write(SimpleUpperCaseMapping);
            if ((fields & UcdFields.SimpleLowerCaseMapping) != 0) writer.Write(SimpleLowerCaseMapping);
            if ((fields & UcdFields.SimpleTitleCaseMapping) != 0) writer.Write(SimpleTitleCaseMapping);
            if ((fields & UcdFields.ContributoryProperties) != 0) writer.Write((int)ContributoryProperties);
            if ((fields & UcdFields.CorePropertiesAndEmojiProperties) != 0)
            {
                // This encoding is very dirty and needs to be reworked. For now I just want to make this work.
                // First byte has its 2 MSB indicating presence of 1) Emoji P. 2) Core P. Value 00xxxxxx is invalid & not used at all.
                // If emoji properties are present, they are contained in the first byte, possibly followed by an Int24 for core properties.
                // If emoji properties are absent, the byte is the high part of core properties, followed by an Int16 for the rest.
                if (CoreProperties != 0)
                {
                    if (EmojiProperties != 0)
                    {
                        // 192 = 0b11000000: both emoji and core properties follow.
                        writer.Write((byte)(192 | (byte)EmojiProperties));
                        writer.WriteUInt24((int)CoreProperties & 0x00FFFFFF);
                    }
                    else
                    {
                        // 64 = 0b01000000: core properties only. The shift binds tighter than the OR, so this is 64 | (core >> 16).
                        writer.Write((byte)(64 | (int)CoreProperties >> 16));
                        writer.Write((ushort)CoreProperties);
                    }
                }
                else
                {
                    // 128 = 0b10000000: emoji properties only.
                    writer.Write((byte)(128 | (byte)EmojiProperties));
                }
            }
            if ((fields & UcdFields.CrossRerefences) != 0)
            {
                // The count is stored minus one, since the field is only written when there is at least one cross reference.
                writer.Write(checked((byte)(_crossRerefences.Count - 1)));
                foreach (int crossReference in _crossRerefences)
                    writer.WriteCodePoint(crossReference);
            }
        }
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.Core/UnicodeDataFileReader.cs:
--------------------------------------------------------------------------------
using System.IO;

namespace System.Unicode.Build.Core
{
    /// <summary>Forward-only reader for line-oriented data files made of fields separated by a single character.</summary>
    public class UnicodeDataFileReader : IDisposable
    {
        private readonly Stream _stream;
        private readonly byte[] _byteBuffer;
        // Position of the next unread byte in _byteBuffer.
        private int _index;
        // Number of valid bytes in _byteBuffer; 0 until the first refill.
        private int _length;
        private readonly char _fieldSeparator;
        // True while the current line still has fields to read.
        private bool _hasField = false;
        private readonly bool _leaveOpen;

        /// <summary>Creates a reader that disposes <paramref name="stream"/> when it is disposed.</summary>
        public UnicodeDataFileReader(Stream stream, char fieldSeparator)
            : this(stream, fieldSeparator, false)
        {
        }

        /// <summary>Creates a reader over <paramref name="stream"/> with an 8192-byte read buffer.</summary>
        /// <param name="stream">The stream to read from.</param>
        /// <param name="fieldSeparator">The character separating the fields of a line.</param>
        /// <param name="leaveOpen"><see langword="true"/> to leave the stream open when the reader is disposed.</param>
        public UnicodeDataFileReader(Stream stream, char fieldSeparator, bool leaveOpen)
        {
            _stream = stream;
            _fieldSeparator = fieldSeparator;
            _byteBuffer = new byte[8192];
            _leaveOpen = leaveOpen;
        }

        /// <summary>Disposes the underlying stream, unless the reader was created with <c>leaveOpen</c> set.</summary>
        public void Dispose()
        {
            if
(!_leaveOpen) _stream.Dispose();
        }

        // Reads the next chunk of the stream into the buffer and returns true if at least one byte was read.
        // The right-hand side resets _index to 0 and evaluates to 0, so the comparison is really "bytes read != 0".
        private bool RefillBuffer()
            // Evilish line of code. 😈
            => (_length = _stream.Read(_byteBuffer, 0, _byteBuffer.Length)) != (_index = 0);

        // A line feed ends a row, and '#' starts a comment which runs to the end of the line.
        private static bool IsNewLineOrComment(byte b)
            => b == '\n' || b == '#';

        /// <summary>Moves the stream to the next valid data row.</summary>
        /// <returns><see langword="true"/> if data is available; <see langword="false"/> otherwise.</returns>
        public bool MoveToNextLine()
        {
            // First call: nothing has been read yet, so the first row starts at the very first byte of the stream.
            if (_length == 0)
            {
                if (RefillBuffer())
                {
                    if (!IsNewLineOrComment(_byteBuffer[_index]))
                    {
                        _hasField = true;
                        goto Completed;
                    }
                }
                else
                {
                    return false;
                }
            }

            // Skip the rest of the current line, then every following line which is empty or starts with a comment.
            do
            {
                while (_index < _length)
                {
                    if (_byteBuffer[_index++] == '\n')
                    {
                        // The byte after the line feed may live in the next chunk, hence the refill inside the condition.
                        if ((_index < _length || RefillBuffer()) && !IsNewLineOrComment(_byteBuffer[_index]))
                        {
                            _hasField = true;
                            goto Completed;
                        }
                    }
                }
            } while (RefillBuffer());

            _hasField = false;
        Completed:;
            return _hasField;
        }

        // Shared implementation of ReadField and ReadTrimmedField.
        // Throws InvalidOperationException when the buffer holds no data, which is the case before the first MoveToNextLine call.
        private string ReadFieldInternal(bool trim)
        {
            if (_length == 0) throw new InvalidOperationException();

            if (!_hasField) return null;
            else if (_index >= _length) RefillBuffer();

            // If the current character is a new line or a comment, we are at the end of a line.
            if (IsNewLineOrComment(_byteBuffer[_index]))
            {
                // A separator was the last thing read on this line, so there is one final, empty field.
                if (_hasField)
                {
                    _hasField = false;
                    return string.Empty;
                }
                else
                {
                    return null;
                }
            }

            using (var buffer = Utf8Buffer.Get())
            {
                int startOffset;
                int endOffset;

                // A field may span several chunks of the stream: accumulate its bytes until a terminator is found.
                do
                {
                    startOffset = _index;
                    endOffset = -1;

                    while (_index < _length)
                    {
                        byte b = _byteBuffer[_index];

                        if (IsNewLineOrComment(b)) // NB: Do not advance to the next byte when end of line has been reached.
                        {
                            endOffset = _index;
                            _hasField = false;
                            break;
                        }
                        else if (b == _fieldSeparator)
                        {
                            // The separator is consumed but is not part of the field.
                            endOffset = _index++;
                            break;
                        }
                        else
                        {
                            ++_index;
                        }
                    }

                    if (endOffset >= 0)
                    {
                        buffer.Append(_byteBuffer, startOffset, endOffset - startOffset);
                        break;
                    }
                    else if (_index > startOffset)
                    {
                        // The chunk ended in the middle of the field: keep what was read and continue with the next chunk.
                        buffer.Append(_byteBuffer, startOffset, _index - startOffset);
                    }
                } while (RefillBuffer());

                return trim ? buffer.ToTrimmedString() : buffer.ToString();
            }
        }

        /// <summary>Reads the next data field.</summary>
        /// <remarks>This method will return <see langword="null"/> on end of line.</remarks>
        /// <returns>The text value of the read field, if available; <see langword="null"/> otherwise.</returns>
        public string ReadField() => ReadFieldInternal(false);

        /// <summary>Reads the next data field as a trimmed value.</summary>
        /// <remarks>This method will return <see langword="null"/> on end of line.</remarks>
        /// <returns>The trimmed text value of the read field, if available; <see langword="null"/> otherwise.</returns>
        public string ReadTrimmedField() => ReadFieldInternal(true);

        /// <summary>Skips the next data field.</summary>
        /// <remarks>This method will return <see langword="false"/> on end of line.</remarks>
        /// <returns><see langword="true"/> if a field was skipped; <see langword="false"/> otherwise.</returns>
        public bool SkipField()
        {
            if (_length == 0) throw new InvalidOperationException();

            if (!_hasField) return false;
            else if (_index >= _length) RefillBuffer();

            // If the current character is a new line or a comment, we are at the end of a line.
            if (IsNewLineOrComment(_byteBuffer[_index]))
            {
                _hasField = false;
                return false;
            }

            do
            {
                while (_index < _length)
                {
                    byte b = _byteBuffer[_index];

                    if (IsNewLineOrComment(b)) // NB: Do not advance to the next byte when end of line has been reached.
                    {
                        _hasField = false;
                        return true;
                    }
                    else
                    {
                        ++_index;

                        if (b == _fieldSeparator)
                        {
                            return true;
                        }
                    }
                }
            } while (RefillBuffer());

            // The stream ended in the middle of the field: the field still counts as skipped.
            return true;
        }
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.Core/UnicodeDatabaseGenerator.cs:
--------------------------------------------------------------------------------
using System.IO;
using System.IO.Compression;
using System.Net.Http;
using System.Threading.Tasks;
using static System.Unicode.Build.Core.DataSourceProvider;

namespace System.Unicode.Build.Core
{
    /// <summary>Entry point of the database generation.</summary>
    public static class UnicodeDatabaseGenerator
    {
        /// <summary>Builds the data from the UCD, Unihan and emoji data sources, then writes it deflate-compressed to <paramref name="outputFilePath"/>.</summary>
        /// <param name="httpClient">The HTTP client handed to the data source provider.</param>
        /// <param name="baseDirectory">The directory handed to the data source provider; the current directory is used when this is null or white space.</param>
        /// <param name="outputFilePath">The path of the file to create.</param>
        /// <param name="shouldDownloadFiles">Forwarded unchanged to the data source provider.</param>
        /// <param name="shouldSaveFiles">Forwarded unchanged to the data source provider.</param>
        /// <param name="shouldExtractFiles">Forwarded unchanged to the data source provider.</param>
        public static async ValueTask GenerateDatabase(HttpClient httpClient, string baseDirectory, string outputFilePath, bool? shouldDownloadFiles, bool? shouldSaveFiles, bool? shouldExtractFiles)
        {
            UnicodeInfoBuilder data;

            baseDirectory = string.IsNullOrWhiteSpace(baseDirectory) ?
Environment.CurrentDirectory :
                Path.GetFullPath(baseDirectory);

            // The three sources are only needed while the data is built; they are disposed before the output is written.
            using (var ucdSource = await GetDataSourceAsync(httpClient, UnicodeCharacterDataUri, baseDirectory, UcdDataSourceName, UcdRequiredFiles, true, shouldDownloadFiles, shouldSaveFiles, shouldExtractFiles))
            using (var unihanSource = await GetDataSourceAsync(httpClient, UnicodeCharacterDataUri, baseDirectory, UnihanDataSourceName, UnihanRequiredFiles, true, shouldDownloadFiles, shouldSaveFiles, shouldExtractFiles))
            using (var ucdEmojiSource = await GetDataSourceAsync(httpClient, UcdEmojiDataUri, baseDirectory, EmojiDataSourceName, UcdEmojiRequiredFiles, false, shouldDownloadFiles, shouldSaveFiles, shouldExtractFiles))
            //using (var emojiSource = await GetDataSourceAsync(httpClient, EmojiDataUri, baseDirectory, EmojiDataSourceName, EmojiRequiredFiles, false, shouldDownloadFiles, shouldSaveFiles, shouldExtractFiles))
            {
                data = await UnicodeDataProcessor.BuildDataAsync(ucdSource, unihanSource, ucdEmojiSource);
            }

            // This part is actually highly susceptible to framework version. Different frameworks give different results.
            // In order to consistently produce the same result, the framework executing this code must be fixed.
            using (var stream = new DeflateStream(File.Create(outputFilePath), CompressionLevel.Optimal, false))
            {
                data.WriteToStream(stream);
                stream.Flush();
            }
        }
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.Core/UnihanCharacterDataBuilder.cs:
--------------------------------------------------------------------------------
using System.Collections.Generic;
using System.IO;

namespace System.Unicode.Build.Core
{
    /// <summary>
    /// Mutable builder that collects the Unihan properties of one code point.
    /// The collected data can be turned into a <c>UnihanCharacterData</c> or serialized with a <see cref="BinaryWriter"/>.
    /// </summary>
    // NOTE(review): the generic type arguments of List/IList are missing from this listing.
    // The element type exposes Radical and RawStrokeCount, as used by WriteToFile.
    public sealed class UnihanCharacterDataBuilder
    {
        public int CodePoint { get; }
        public UnihanNumericType NumericType { get; set; }
        public long NumericValue { get; set; }
        public string Definition { get; set; }
        public string MandarinReading { get; set; }
        public string CantoneseReading { get; set; }
        public string JapaneseKunReading { get; set; }
        public string JapaneseOnReading { get; set; }
        public string KoreanReading { get; set; }
        public string HangulReading { get; set; }
        public string VietnameseReading { get; set; }
        public string SimplifiedVariant { get; set; }
        public string TraditionalVariant { get; set; }
        public IList UnicodeRadicalStrokeCounts => _unicodeRadicalStrokeCounts;

        private readonly List _unicodeRadicalStrokeCounts = new List();

        internal UnihanCharacterDataBuilder(int codePoint) => CodePoint = codePoint;

        /// <summary>Creates a <c>UnihanCharacterData</c> from the current state of the builder.</summary>
        internal UnihanCharacterData ToCharacterData()
            => new UnihanCharacterData
            (
                CodePoint,
                NumericType,
                NumericValue,
                _unicodeRadicalStrokeCounts.ToArray(),
                Definition,
                MandarinReading,
                CantoneseReading,
                JapaneseKunReading,
                JapaneseOnReading,
                KoreanReading,
                HangulReading,
                VietnameseReading,
                SimplifiedVariant,
                TraditionalVariant
            );

        /// <summary>Serializes this entry: a 16-bit field mask, the packed code point, then every field flagged in the mask, in a fixed order.</summary>
        /// <param name="writer">The writer receiving the binary record.</param>
        /// <exception cref="OverflowException">There are three or more radical/stroke counts and their number minus three does not fit in a byte.</exception>
        internal void WriteToFile(BinaryWriter writer)
        {
            UnihanFields fields = default;

            // The numeric type is stored directly inside the field mask.
            fields |= (UnihanFields)NumericType;
            // For now, we have enough bits to encode the length of the array in the field specifier, so we'll do that.
            // (NB: A quick analysis of the files revealed that there are almost always exactly one Radical/Stroke count, and occasionally two, yet never more.)
            if (_unicodeRadicalStrokeCounts.Count > 0)
            {
                if (_unicodeRadicalStrokeCounts.Count == 1) fields |= UnihanFields.UnicodeRadicalStrokeCount;
                else if (_unicodeRadicalStrokeCounts.Count == 2) fields |= UnihanFields.UnicodeRadicalStrokeCountTwice;
                else fields |= UnihanFields.UnicodeRadicalStrokeCountMore;
            }
            if (Definition != null) fields |= UnihanFields.Definition;
            if (MandarinReading != null) fields |= UnihanFields.MandarinReading;
            if (CantoneseReading != null) fields |= UnihanFields.CantoneseReading;
            if (JapaneseKunReading != null) fields |= UnihanFields.JapaneseKunReading;
            if (JapaneseOnReading != null) fields |= UnihanFields.JapaneseOnReading;
            if (KoreanReading != null) fields |= UnihanFields.KoreanReading;
            if (HangulReading != null) fields |= UnihanFields.HangulReading;
            if (VietnameseReading != null) fields |= UnihanFields.VietnameseReading;
            if (SimplifiedVariant != null) fields |= UnihanFields.SimplifiedVariant;
            if (TraditionalVariant != null) fields |= UnihanFields.TraditionalVariant;

            // The order below is the on-disk layout and must not change.
            writer.Write((ushort)fields);

            writer.WriteCodePoint(UnihanCharacterData.PackCodePoint(CodePoint));
            if ((fields & UnihanFields.OtherNumeric) != 0) writer.Write(NumericValue);

            // NOTE(review): this test is meant to be true for one, two or more counts, which presumes that
            // UnicodeRadicalStrokeCountMore covers the bits of the "once" and "twice" flags — confirm against UnihanFields.
            if ((fields & UnihanFields.UnicodeRadicalStrokeCountMore) != 0)
            {
                // An explicit count byte is only written for three or more items, stored minus three.
                if ((fields & (UnihanFields.UnicodeRadicalStrokeCountMore)) == UnihanFields.UnicodeRadicalStrokeCountMore)
                    writer.Write(checked((byte)(_unicodeRadicalStrokeCounts.Count - 3)));

                foreach (var radicalStrokeCount in _unicodeRadicalStrokeCounts)
                {
                    writer.Write(radicalStrokeCount.Radical);
                    writer.Write(radicalStrokeCount.RawStrokeCount);
                }
            }

            if ((fields & UnihanFields.Definition) != 0) writer.Write(Definition);
            if ((fields & UnihanFields.MandarinReading) != 0) writer.Write(MandarinReading);
            if ((fields & UnihanFields.CantoneseReading) != 0) writer.Write(CantoneseReading);
            if ((fields & UnihanFields.JapaneseKunReading) != 0) writer.Write(JapaneseKunReading);
            if ((fields & UnihanFields.JapaneseOnReading) != 0) writer.Write(JapaneseOnReading);
            if ((fields & UnihanFields.KoreanReading) != 0) writer.Write(KoreanReading);
            if ((fields & UnihanFields.HangulReading) != 0) writer.Write(HangulReading);
            if ((fields & UnihanFields.VietnameseReading) != 0) writer.Write(VietnameseReading);
            if ((fields & UnihanFields.SimplifiedVariant) != 0) writer.Write(SimplifiedVariant);
            if ((fields & UnihanFields.TraditionalVariant) != 0) writer.Write(TraditionalVariant);
        }
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.Core/UnihanDataFileReader.cs:
--------------------------------------------------------------------------------
using System.IO;

namespace System.Unicode.Build.Core
{
    /// <summary>Reads tab-separated rows made of a <c>U+nnnn</c> code point, a property name and a property value.</summary>
    public sealed class UnihanDataFileReader : IDisposable
    {
        private readonly UnicodeDataFileReader _reader;

        /// <summary>Gets the code point of the current row; <c>0</c> once there are no more rows.</summary>
        public int CodePoint { get; private set; }

        /// <summary>Gets the property name of the current row; <see langword="null"/> once there are no more rows.</summary>
        public string PropertyName { get; private set; }

        /// <summary>Gets the property value of the current row; <see langword="null"/> once there are no more rows.</summary>
        public string PropertyValue { get; private set; }

        /// <summary>Creates a reader that disposes <paramref name="stream"/> when it is disposed.</summary>
        public UnihanDataFileReader(Stream stream)
            : this(stream, false)
        {
        }

        public UnihanDataFileReader(Stream stream, bool leaveOpen) => _reader = new UnicodeDataFileReader(stream, '\t', leaveOpen);

        public void Dispose() => _reader.Dispose();

        /// <summary>Advances to the next data row and loads its three fields into the properties of this reader.</summary>
        /// <returns><see langword="true"/> if a row was read; <see langword="false"/> if there are no more rows.</returns>
        public bool Read()
        {
            bool result;

            if (result = _reader.MoveToNextLine())
            {
                CodePoint = HexCodePoint.ParsePrefixed(_reader.ReadField());
PropertyName = _reader.ReadField();
                PropertyValue = _reader.ReadField();
            }
            else
            {
                // No more rows: reset the row state so that stale values cannot be observed.
                CodePoint = 0;
                PropertyName = null;
                PropertyValue = null;
            }

            return result;
        }
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.Core/UnihanProperty.cs:
--------------------------------------------------------------------------------
namespace System.Unicode.Build.Core
{
    // Names of the Unihan database properties. Each constant holds the property name as written in the data files,
    // and is named after it without the leading 'k'.
    // NOTE(review): this file appears to be the output of UnihanProperty.tt — if so, change the template rather than this file.
    public static class UnihanProperty
    {
        public const string AccountingNumeric = "kAccountingNumeric";
        public const string BigFive = "kBigFive";
        public const string Cangjie = "kCangjie";
        public const string Cantonese = "kCantonese";
        public const string CCCII = "kCCCII";
        public const string CheungBauer = "kCheungBauer";
        public const string CheungBauerIndex = "kCheungBauerIndex";
        public const string CihaiT = "kCihaiT";
        public const string CNS1986 = "kCNS1986";
        public const string CNS1992 = "kCNS1992";
        public const string CompatibilityVariant = "kCompatibilityVariant";
        public const string Cowles = "kCowles";
        public const string DaeJaweon = "kDaeJaweon";
        public const string Definition = "kDefinition";
        public const string EACC = "kEACC";
        public const string Fenn = "kFenn";
        public const string FennIndex = "kFennIndex";
        public const string FourCornerCode = "kFourCornerCode";
        public const string Frequency = "kFrequency";
        public const string GB0 = "kGB0";
        public const string GB1 = "kGB1";
        public const string GB3 = "kGB3";
        public const string GB5 = "kGB5";
        public const string GB7 = "kGB7";
        public const string GB8 = "kGB8";
        public const string GradeLevel = "kGradeLevel";
        public const string GSR = "kGSR";
        public const string Hangul = "kHangul";
        public const string HanYu = "kHanYu";
        public const string HanyuPinlu = "kHanyuPinlu";
        public const string HanyuPinyin = "kHanyuPinyin";
        public const string HDZRadBreak = "kHDZRadBreak";
        public const string HKGlyph = "kHKGlyph";
        public const string HKSCS = "kHKSCS";
        public const string IBMJapan = "kIBMJapan";
        public const string IICore = "kIICore";
        public const string IRG_GSource = "kIRG_GSource";
        public const string IRG_HSource = "kIRG_HSource";
        public const string IRG_JSource = "kIRG_JSource";
        public const string IRG_KPSource = "kIRG_KPSource";
        public const string IRG_KSource = "kIRG_KSource";
        public const string IRG_MSource = "kIRG_MSource";
        public const string IRG_TSource = "kIRG_TSource";
        public const string IRG_USource = "kIRG_USource";
        public const string IRG_VSource = "kIRG_VSource";
        public const string IRGDaeJaweon = "kIRGDaeJaweon";
        public const string IRGDaiKanwaZiten = "kIRGDaiKanwaZiten";
        public const string IRGHanyuDaZidian = "kIRGHanyuDaZidian";
        public const string IRGKangXi = "kIRGKangXi";
        public const string JapaneseKun = "kJapaneseKun";
        public const string JapaneseOn = "kJapaneseOn";
        public const string Jis0 = "kJis0";
        public const string Jis1 = "kJis1";
        public const string JIS0213 = "kJIS0213";
        public const string KangXi = "kKangXi";
        public const string Karlgren = "kKarlgren";
        public const string Korean = "kKorean";
        public const string KPS0 = "kKPS0";
        public const string KPS1 = "kKPS1";
        public const string KSC0 = "kKSC0";
        public const string KSC1 = "kKSC1";
        public const string Lau = "kLau";
        public const string MainlandTelegraph = "kMainlandTelegraph";
        public const string Mandarin = "kMandarin";
        public const string Matthews = "kMatthews";
        public const string MeyerWempe = "kMeyerWempe";
        public const string Morohashi = "kMorohashi";
        public const string Nelson = "kNelson";
        public const string OtherNumeric = "kOtherNumeric";
        public const string Phonetic = "kPhonetic";
        public const string PrimaryNumeric = "kPrimaryNumeric";
        public const string PseudoGB1 = "kPseudoGB1";
        public const string RSAdobe_Japan1_6 = "kRSAdobe_Japan1_6";
        public const string RSJapanese = "kRSJapanese";
        public const string RSKangXi = "kRSKangXi";
        public const string RSKanWa = "kRSKanWa";
        public const string RSKorean = "kRSKorean";
        public const string RSUnicode = "kRSUnicode";
        public const string SBGY = "kSBGY";
        public const string SemanticVariant = "kSemanticVariant";
        public const string SimplifiedVariant = "kSimplifiedVariant";
        public const string SpecializedSemanticVariant = "kSpecializedSemanticVariant";
        public const string TaiwanTelegraph = "kTaiwanTelegraph";
        public const string Tang = "kTang";
        public const string TotalStrokes = "kTotalStrokes";
        public const string TraditionalVariant = "kTraditionalVariant";
        public const string Vietnamese = "kVietnamese";
        public const string Xerox = "kXerox";
        public const string XHC1983 = "kXHC1983";
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.Core/UnihanProperty.tt:
--------------------------------------------------------------------------------
<#@ template debug="false" hostspecific="false" language="C#" #>
<#@ assembly name="System.Core" #>
<#@ import namespace="System.Linq" #>
<#@ import namespace="System.Text" #>
<#@ import namespace="System.Collections.Generic" #>
<#@ output extension=".cs" #>
<#
    // Unihan property names, as written in the data files.
    // The template emits one string constant per entry, named after the entry without its leading 'k'.
    var propertyNames = new[]
    {
        "kAccountingNumeric",
        "kBigFive",
        "kCangjie",
        "kCantonese",
        "kCCCII",
        "kCheungBauer",
        "kCheungBauerIndex",
        "kCihaiT",
        "kCNS1986",
        "kCNS1992",
        "kCompatibilityVariant",
        "kCowles",
        "kDaeJaweon",
        "kDefinition",
        "kEACC",
        "kFenn",
        "kFennIndex",
        "kFourCornerCode",
        "kFrequency",
        "kGB0",
        "kGB1",
        "kGB3",
        "kGB5",
        "kGB7",
        "kGB8",
        "kGradeLevel",
        "kGSR",
        "kHangul",
        "kHanYu",
        "kHanyuPinlu",
        "kHanyuPinyin",
        "kHDZRadBreak",
        "kHKGlyph",
        "kHKSCS",
        "kIBMJapan",
        "kIICore",
        "kIRG_GSource",
        "kIRG_HSource",
        "kIRG_JSource",
        "kIRG_KPSource",
        "kIRG_KSource",
        "kIRG_MSource",
        "kIRG_TSource",
        "kIRG_USource",
        "kIRG_VSource",
        "kIRGDaeJaweon",
        "kIRGDaiKanwaZiten",
        "kIRGHanyuDaZidian",
        "kIRGKangXi",
        "kJapaneseKun",
        "kJapaneseOn",
        "kJis0",
        "kJis1",
        "kJIS0213",
        "kKangXi",
        "kKarlgren",
        "kKorean",
        "kKPS0",
        "kKPS1",
        "kKSC0",
        "kKSC1",
        "kLau",
        "kMainlandTelegraph",
        "kMandarin",
        "kMatthews",
        "kMeyerWempe",
        "kMorohashi",
        "kNelson",
        "kOtherNumeric",
        "kPhonetic",
        "kPrimaryNumeric",
        "kPseudoGB1",
        "kRSAdobe_Japan1_6",
        "kRSJapanese",
        "kRSKangXi",
        "kRSKanWa",
        "kRSKorean",
        "kRSUnicode",
        "kSBGY",
        "kSemanticVariant",
        "kSimplifiedVariant",
        "kSpecializedSemanticVariant",
        "kTaiwanTelegraph",
        "kTang",
        "kTotalStrokes",
        "kTraditionalVariant",
        "kVietnamese",
        "kXerox",
        "kXHC1983",
    };
#>
namespace System.Unicode.Build.Core
{
    public static class UnihanProperty
    {
<# foreach (string propertyName in propertyNames) { #>
        public const string <#=propertyName[0] == 'k' ?
propertyName.Substring(1) : propertyName#> = "<#=propertyName#>";
<# } #>
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.Core/Utf8Buffer.cs:
--------------------------------------------------------------------------------
using System.Collections.Concurrent;
using System.Text;

namespace System.Unicode.Build.Core
{
    /// <summary>Growable buffer of UTF-8 bytes whose backing array is taken from, and returned to, a shared pool.</summary>
    /// <remarks>
    /// This is a mutable struct: copying an instance copies the array reference and the length, not the contents.
    /// Call <see cref="Dispose"/> exactly once, on the instance that was used, to return the array to the pool.
    /// </remarks>
    public struct Utf8Buffer : IDisposable
    {
        private static readonly ConcurrentStack<byte[]> BufferStack = new ConcurrentStack<byte[]>();

        /// <summary>Gets an empty buffer, reusing a pooled array when one is available.</summary>
        public static Utf8Buffer Get() => new Utf8Buffer(BufferStack.TryPop(out var buffer) ? buffer : new byte[100]);

        private byte[] _buffer;

        /// <summary>Gets the number of bytes appended so far.</summary>
        public int Length { get; private set; }

        private Utf8Buffer(byte[] buffer)
        {
            _buffer = buffer;
            Length = 0;
        }

        /// <summary>Returns the backing array to the pool and resets this instance to its default state.</summary>
        public void Dispose()
        {
            if (_buffer != null)
            {
                BufferStack.Push(_buffer);
                this = default;
            }
        }

        // Grows the backing array so that `count` more bytes fit; the capacity at least doubles when it grows.
        private void EnsureExtraCapacity(int count)
        {
            if (count < 0) throw new ArgumentOutOfRangeException(nameof(count));
            if (_buffer.Length < checked(Length + count))
                Array.Resize(ref _buffer, Math.Max(Length + count, _buffer.Length << 1));
        }

        /// <summary>Appends <paramref name="count"/> bytes of <paramref name="value"/>, starting at <paramref name="startIndex"/>.</summary>
        /// <param name="value">The array to copy from.</param>
        /// <param name="startIndex">The index of the first byte to copy.</param>
        /// <param name="count">The number of bytes to copy.</param>
        /// <exception cref="ArgumentNullException"><paramref name="value"/> is <see langword="null"/>.</exception>
        /// <exception cref="ArgumentOutOfRangeException">The requested range does not lie within <paramref name="value"/>.</exception>
        public void Append(byte[] value, int startIndex, int count)
        {
            if (value == null) throw new ArgumentNullException(nameof(value));
            if (startIndex < 0 || startIndex > value.Length) throw new ArgumentOutOfRangeException(nameof(startIndex));
            if (count < 0 || count > value.Length - startIndex) throw new ArgumentOutOfRangeException(nameof(count));

            // Reserve room for the bytes actually appended, not for the whole source array:
            // callers pass slices of a large read buffer, and pooled arrays should stay proportional to field sizes.
            EnsureExtraCapacity(count);

            Array.Copy(value, startIndex, _buffer, Length, count);
            Length += count;
        }

        /// <summary>Decodes the contents of the buffer as UTF-8.</summary>
        public override string ToString() => Length > 0 ? Encoding.UTF8.GetString(_buffer, 0, Length) : string.Empty;

        /// <summary>Decodes the contents of the buffer as UTF-8, without the leading and trailing space characters.</summary>
        /// <remarks>Only the space character (U+0020) is trimmed.</remarks>
        public string ToTrimmedString()
        {
            if (Length == 0) return string.Empty;

            var buffer = _buffer;
            int start = 0;
            int end = Length;

            while (buffer[start] == ' ') if (++start == Length) return string.Empty;
            while (buffer[--end] == ' ') ;

            // Both indexes now designate non-space bytes and end >= start. They are equal when exactly one
            // byte remains, which is a valid result and must not be reported as an empty string.
            return Encoding.UTF8.GetString(buffer, start, end - start + 1);
        }
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.DatabaseGenerator/Program.cs:
--------------------------------------------------------------------------------
using System.Net.Http;
using System.Threading.Tasks;
using System.Unicode.Build.Core;

namespace System.Unicode.Build.DatabaseGenerator
{
    internal static class Program
    {
        /// <summary>Generates the database file.</summary>
        /// <param name="args">Two arguments: the base directory of the data files, then the path of the output file.</param>
        private static async Task Main(string[] args)
        {
            // Fail with a readable message rather than an IndexOutOfRangeException when arguments are missing.
            if (args.Length < 2)
            {
                Console.Error.WriteLine("Usage: <baseDirectory> <outputFilePath>");
                Environment.ExitCode = 1;
                return;
            }

            // The sole purpose of this program is to consistently generate the database using one fixed .NET runtime.
            // NOTE(review): this comment used to name .NET Core 2.2, while the project file lists net7.0 — confirm which is intended.
            using (var httpClient = new HttpClient())
            {
                await UnicodeDatabaseGenerator.GenerateDatabase(httpClient, args[0], args[1], null, null, null);
            }
        }
    }
}

--------------------------------------------------------------------------------
/System.Unicode.Build.DatabaseGenerator/Properties/launchSettings.json:
--------------------------------------------------------------------------------
1 | { 2 | "profiles": { 3 | "System.Unicode.Build.DatabaseGenerator": { 4 | "commandName": "Project", 5 | "commandLineArgs": ". 
using System.Collections.Concurrent;
using System.Resources;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Build.Framework;
#if NETSTANDARD2_0
using BuildTask = Microsoft.Build.Utilities.Task;
#else
using BuildTask = Microsoft.Build.Utilities.AppDomainIsolatedTask;
#endif

namespace System.Unicode.Build.Tasks
{
    /// <summary>Base class for build tasks whose work is written as an asynchronous method.</summary>
    /// <remarks>
    /// <see cref="Execute"/> installs a synchronization context that queues posted callbacks,
    /// then runs those callbacks on the calling thread until the task returned by <see cref="ExecuteAsync"/> completes.
    /// </remarks>
    [RunInMTA]
    [LoadInSeparateAppDomain]
    public abstract class AsyncTask : BuildTask, ICancelableTask
    {
        /// <summary>Synchronization context that forwards posted callbacks to a queue drained by <see cref="Execute"/>.</summary>
        private sealed class AsyncTaskSynchronizationContext : SynchronizationContext, IDisposable
        {
            private readonly BlockingCollection<(SendOrPostCallback d, object state)> _queuedMessages;
            private readonly SynchronizationContext _oldSynchronizationContext;

            /// <summary>Installs this context as the current one, remembering the previous context.</summary>
            public AsyncTaskSynchronizationContext(BlockingCollection<(SendOrPostCallback d, object state)> queuedMessages)
            {
                _queuedMessages = queuedMessages;
                _oldSynchronizationContext = Current;
                SetSynchronizationContext(this);
            }

            /// <summary>Restores the synchronization context that was current before this one was installed.</summary>
            public void Dispose() => SetSynchronizationContext(_oldSynchronizationContext);

            public override void OperationStarted() => throw new NotSupportedException();

            public override void OperationCompleted() => throw new NotSupportedException();

            public override void Post(SendOrPostCallback d, object state) => _queuedMessages.Add((d, state));

            public override void Send(SendOrPostCallback d, object state) => throw new NotSupportedException();
        }

        // Non-null only while Execute is running.
        private CancellationTokenSource _cancellationTokenSource;

        protected AsyncTask()
        {
        }

        protected AsyncTask(ResourceManager taskResources) : base(taskResources)
        {
        }

        protected AsyncTask(ResourceManager taskResources, string helpKeywordPrefix) : base(taskResources, helpKeywordPrefix)
        {
        }

        /// <summary>Returns a token that becomes cancelled once <paramref name="task"/> has completed.</summary>
        private static CancellationToken CancelOnCompletion(Task task)
        {
            if (task.IsCompleted) return new CancellationToken(true);

            var cts = new CancellationTokenSource();

            task.ContinueWith
            (
                (t, state) =>
                {
                    ((CancellationTokenSource)state).Cancel();
                },
                cts,
                TaskContinuationOptions.ExecuteSynchronously
            );

            return cts.Token;
        }

        /// <summary>Runs <see cref="ExecuteAsync"/> and processes its queued continuations on the calling thread until it completes.</summary>
        /// <returns>The result of the task returned by <see cref="ExecuteAsync"/>.</returns>
        public sealed override bool Execute()
        {
            _cancellationTokenSource = new CancellationTokenSource();
            try
            {
                var queuedMessages = new BlockingCollection<(SendOrPostCallback callback, object state)>();

                using (new AsyncTaskSynchronizationContext(queuedMessages))
                {
                    var task = ExecuteAsync(_cancellationTokenSource.Token);

                    // This token is cancelled when the asynchronous work completes, which unblocks Take below.
                    var ct = CancelOnCompletion(task);

                    while (!ct.IsCancellationRequested)
                    {
                        SendOrPostCallback callback;
                        object state;

                        try
                        {
                            (callback, state) = queuedMessages.Take(ct);
                        }
                        catch (OperationCanceledException) when (ct.IsCancellationRequested || _cancellationTokenSource.IsCancellationRequested)
                        {
                            break;
                        }

                        callback(state);
                    }

                    return task.Result;
                }
            }
            finally
            {
                _cancellationTokenSource.Dispose();
                _cancellationTokenSource = null;
            }
        }

        /// <summary>Requests cancellation of the running task, if any.</summary>
        public void Cancel()
        {
            try
            {
                _cancellationTokenSource?.Cancel();
            }
            catch (ObjectDisposedException)
            {
                // Execute disposed the source between the null check and the call: the task has
                // already finished, so there is nothing left to cancel.
            }
        }

        /// <summary>Performs the work of the task.</summary>
        /// <param name="cancellationToken">Token that is cancelled when <see cref="Cancel"/> is called.</param>
        /// <returns>A task whose result is returned from <see cref="Execute"/>.</returns>
        protected abstract Task<bool> ExecuteAsync(CancellationToken cancellationToken);
    }
}
using System.Buffers.Binary;
using System.IO;
using System.IO.Compression;
using System.Resources;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Build.Framework;

namespace System.Unicode.Build.Tasks
{
    /// <summary>Build task that reads the Unicode version stored in the header of a generated database file.</summary>
    [RunInMTA]
    public sealed class GetUnicodeDatabaseVersion : AsyncTask
    {
        // Header layout as parsed by TryReadHeader: 'U' 'C' 'D' 2, UInt16 (little endian) major, byte minor, byte build.
        private const int HeaderLength = 8;

        public GetUnicodeDatabaseVersion()
        {
        }

        public GetUnicodeDatabaseVersion(ResourceManager taskResources) : base(taskResources)
        {
        }

        public GetUnicodeDatabaseVersion(ResourceManager taskResources, string helpKeywordPrefix) : base(taskResources, helpKeywordPrefix)
        {
        }

        /// <summary>Path of the deflate-compressed database file to inspect.</summary>
        [Required]
        public string DatabasePath { get; set; }

        /// <summary>The version read from the database, formatted with three components.</summary>
        [Output]
        public string UnicodeDatabaseVersion { get; private set; }

        /// <summary>Reads the database header and publishes the version it contains.</summary>
        /// <returns><c>true</c> when a valid header was read; <c>false</c> (after logging an error) otherwise.</returns>
        protected override async Task<bool> ExecuteAsync(CancellationToken cancellationToken)
        {
            var buffer = new byte[HeaderLength];
            int totalRead = 0;

            using (var file = new DeflateStream(File.OpenRead(DatabasePath), CompressionMode.Decompress))
            {
                // A single read may legitimately return fewer bytes than requested,
                // so keep reading until the header is complete or the stream ends.
                while (totalRead < buffer.Length)
                {
                    int read = await file.ReadAsync(buffer, totalRead, buffer.Length - totalRead, cancellationToken);
                    if (read == 0) break;
                    totalRead += read;
                }
            }

            // A truncated file must not be parsed from a partially filled buffer.
            if (totalRead == buffer.Length && TryReadHeader(buffer, out var version))
            {
                UnicodeDatabaseVersion = version.ToString(3);
                return true;
            }

            Log.LogError("The database contained an invalid header.");

            return false;
        }

        /// <summary>Parses the database header.</summary>
        /// <param name="buffer">The first bytes of the decompressed database.</param>
        /// <param name="version">On success, the version stored in the header; otherwise <c>null</c>.</param>
        /// <returns><c>true</c> if the buffer is long enough and starts with the expected signature.</returns>
        private static bool TryReadHeader(ReadOnlySpan<byte> buffer, out Version version)
        {
            if (buffer.Length < HeaderLength || !buffer.StartsWith(new byte[] { (byte)'U', (byte)'C', (byte)'D', 2 }))
            {
                version = null;
                return false;
            }

            // Skip the four signature bytes.
            buffer = buffer.Slice(4);

            ushort major = BinaryPrimitives.ReadUInt16LittleEndian(buffer);

            buffer = buffer.Slice(sizeof(ushort));

            byte minor = buffer[0];
            byte build = buffer[1];

            version = new Version(major, minor, build);
            return true;
        }
    }
}
using System.Collections;
using System.Linq;
using Xunit;

namespace System.Unicode.Tests
{
    /// <summary>Tests for the strict code point enumeration obtained from <c>AsCodePointEnumerable</c>.</summary>
    public class CodePointEnumerableTests
    {
        /// <summary>Valid strings together with the code points their enumeration is expected to yield.</summary>
        public static readonly TheoryData<int[], string> EnumerationTestData = new TheoryData<int[], string>
        {
            { new int[0], "" },
            { new int[] { 0x0041, 0x1F600, 0x00E9 }, "\u0041\U0001F600\u00E9" },
        };

        [Theory]
        [MemberData(nameof(EnumerationTestData))]
        public void EnumerationShouldHaveExpectedResults(int[] expectedCharacters, string text)
        {
            var enumerable = text.AsCodePointEnumerable();

            // Test C# foreach enumeration
            {
                int i = 0;
                foreach (int codePoint in enumerable)
                {
                    // Fail with an assertion, not an IndexOutOfRangeException, if too many code points are produced.
                    Assert.True(i < expectedCharacters.Length);
                    Assert.Equal(expectedCharacters[i++], codePoint);
                }
                Assert.Equal(expectedCharacters.Length, i);
            }

            // Test generic enumerable
            Assert.Equal(expectedCharacters, from codePoint in enumerable select codePoint);

            // Test legacy enumeration
            {
                // We could use Enumerable.Cast<>, but we can't guarantee that the LINQ implementation we use wouldn't be smart and cast IEnumerable back to IEnumerable<int>
                var legacyEnumerator = ((IEnumerable)enumerable).GetEnumerator();

                int index = 0;

                while (legacyEnumerator.MoveNext())
                {
                    Assert.True(index < expectedCharacters.Length);
                    Assert.Equal(expectedCharacters[index++], Assert.IsType<int>(legacyEnumerator.Current));
                }

                Assert.Equal(expectedCharacters.Length, index);
            }
        }

        [Fact]
        public void NullArgumentShouldThrowArgumentNullException()
            => Assert.Throws<ArgumentNullException>(() => { foreach (int codePoint in (null as string).AsCodePointEnumerable()) { } });

        /// <summary>Strings containing unpaired or misordered surrogates.</summary>
        public static readonly TheoryData<XUnitSerializableString> EnumerationFailureTestData = new TheoryData<XUnitSerializableString>
        {
            "\uDA00",
            "\uDCD0",
            "\uDCD0\uDA00",
            "\u0041\uDA00",
            "\u0041\uDCD0",
            "\uDA00\u0041",
            "\uDCD0\u0041",
            "\uDA00\u0041\uDCD0\u0041",
            "\u0041\uDA00\u0041\uDCD0\u0041",
        };

        [Theory]
        [MemberData(nameof(EnumerationFailureTestData))]
        public void EnumerationOfInvalidUtf16StringsShouldThrowArgumentException(XUnitSerializableString text)
            => Assert.Throws<ArgumentException>(() => { foreach (int codePoint in ((string)text).AsCodePointEnumerable()) { } });
    }
}
0xDCD0, 0x0041 }, "\u0041\uDA00\u0041\uDCD0\u0041" }, 21 | { new int[] { 0x0041, 0x1F600, 0x00E9 }, "\u0041\U0001F600\u00E9" }, 22 | }; 23 | 24 | [Theory] 25 | [MemberData(nameof(EnumerationTestData))] 26 | public void EnumerationShouldHaveExpectedResults(int[] expectedCharacters, XUnitSerializableString text) 27 | { 28 | var enumerable = ((string)text).AsPermissiveCodePointEnumerable(); 29 | 30 | // Test C# foreach enumeration 31 | { 32 | int i = 0; 33 | foreach (int codePoint in enumerable) 34 | { 35 | Assert.Equal(expectedCharacters[i++], codePoint); 36 | } 37 | Assert.Equal(expectedCharacters.Length, i); 38 | } 39 | 40 | // Test generic enumerable 41 | Assert.Equal(expectedCharacters, from codePoint in enumerable select codePoint); 42 | 43 | // Test legacy enumeration 44 | { 45 | // We could use Enumerable.Cast<>, but we can't guarantee that the LINQ implementation we use wouldn't be smart and cast IEnumerable back to IEnumerable 46 | var legacyEnumerator = ((IEnumerable)enumerable).GetEnumerator(); 47 | 48 | int index = 0; 49 | 50 | while (legacyEnumerator.MoveNext()) 51 | { 52 | Assert.True(index < expectedCharacters.Length); 53 | Assert.Equal(expectedCharacters[index++], Assert.IsType(legacyEnumerator.Current)); 54 | } 55 | 56 | Assert.Equal(expectedCharacters.Length, index); 57 | } 58 | } 59 | 60 | [Fact] 61 | public void NullArgumentShouldThrowArgumentNullException() 62 | => Assert.Throws(() => { foreach (int c in (null as string).AsPermissiveCodePointEnumerable()) { } }); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /System.Unicode.Tests/System.Unicode.Tests.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net7.0 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | all 17 | runtime; build; native; contentfiles; analyzers; buildtransitive 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 
-------------------------------------------------------------------------------- /System.Unicode.Tests/UnicodeCodePointRangeTests.cs: -------------------------------------------------------------------------------- 1 | using System.Collections; 2 | using Xunit; 3 | 4 | namespace System.Unicode.Tests 5 | { 6 | public class UnicodeCodePointRangeTests 7 | { 8 | [Theory] 9 | [InlineData(0, 0x10FFFF)] 10 | public void MultiCodePointRangeShouldHaveExpectedResults(int firstCodePoint, int lastCodePoint) 11 | { 12 | var range = new UnicodeCodePointRange(firstCodePoint, lastCodePoint); 13 | 14 | Assert.Equal(firstCodePoint, range.FirstCodePoint); 15 | Assert.Equal(lastCodePoint, range.LastCodePoint); 16 | Assert.False(range.IsSingleCodePoint); 17 | } 18 | 19 | [Theory] 20 | [InlineData((int)'A')] 21 | [InlineData(0x0)] 22 | [InlineData(0x10FFFF)] 23 | public void SingleCodePointRangeShouldHaveExpectedResults(int codePoint) 24 | { 25 | var range = new UnicodeCodePointRange(codePoint); 26 | 27 | Assert.Equal(codePoint, range.FirstCodePoint); 28 | Assert.Equal(codePoint, range.LastCodePoint); 29 | Assert.True(range.IsSingleCodePoint); 30 | } 31 | 32 | [Theory] 33 | [InlineData(0, 0, "0000")] 34 | [InlineData(0x1, 0x30, "0001..0030")] 35 | [InlineData(0x41, 0x5A, "0041..005A")] 36 | [InlineData(0x0, 0xFFFF, "0000..FFFF")] 37 | [InlineData(0xFFFF, 0xFFFF, "FFFF")] 38 | [InlineData(0xFFFF, 0x10000, "FFFF..10000")] 39 | [InlineData(0x10000, 0x10000, "10000")] 40 | [InlineData(0, 0xF0000, "0000..F0000")] 41 | [InlineData(0xFFFFF, 0xFFFFF, "FFFFF")] 42 | [InlineData(0, 0xFFFFF, "0000..FFFFF")] 43 | [InlineData(0, 0x10FFFF, "0000..10FFFF")] 44 | [InlineData(0xFFFF, 0x10FFFF, "FFFF..10FFFF")] 45 | [InlineData(0x1FFFF, 0x10FFFF, "1FFFF..10FFFF")] 46 | [InlineData(0x10FFFE, 0x10FFFF, "10FFFE..10FFFF")] 47 | [InlineData(0x10FFFF, 0x10FFFF, "10FFFF")] 48 | public void ToStringShouldProduceExpectedResultForCodePoints(int firstCodePoint, int lastCodePoint, string expectedResult) 49 | { 50 | 
var range = new UnicodeCodePointRange(firstCodePoint, lastCodePoint); 51 | 52 | Assert.Equal(expectedResult, range.ToString()); 53 | } 54 | 55 | [Theory] 56 | [InlineData((int)'A', "0041")] 57 | [InlineData(0x0, "0000")] 58 | [InlineData(0xFFFF, "FFFF")] 59 | [InlineData(0x10000, "10000")] 60 | [InlineData(0x1FFFF, "1FFFF")] 61 | [InlineData(0xFFFFF, "FFFFF")] 62 | [InlineData(0x10FFFF, "10FFFF")] 63 | public void ToStringShouldProduceExpectedResultForCodePoint(int codePoint, string expectedResult) 64 | { 65 | var range = new UnicodeCodePointRange(codePoint); 66 | 67 | Assert.Equal(expectedResult, range.ToString()); 68 | } 69 | 70 | [Theory] 71 | [InlineData(-1)] 72 | [InlineData(0x110000)] 73 | [InlineData(int.MaxValue)] 74 | public void ConstructorShouldFailForInvalidCodePoint(int codePoint) 75 | => Assert.Throws(() => new UnicodeCodePointRange(codePoint)); 76 | 77 | [Theory] 78 | [InlineData(-1, 10)] 79 | [InlineData(10, 0x110000)] 80 | [InlineData(-1, 0x110000)] 81 | public void ConstructorShouldFailForInvalidCodePoints(int firstCodePoint, int lastCodePoint) 82 | => Assert.Throws(() => new UnicodeCodePointRange(firstCodePoint, lastCodePoint)); 83 | 84 | [Theory] 85 | [InlineData(0xA3F, 0x105F)] 86 | public void EnumerationShouldHaveExpectedResults(int firstCodePoint, int lastCodePoint) 87 | { 88 | // Generic test 89 | { 90 | int i = firstCodePoint; 91 | 92 | foreach (int n in new UnicodeCodePointRange(firstCodePoint, lastCodePoint)) 93 | { 94 | Assert.Equal(i++, n); 95 | } 96 | } 97 | 98 | // Nongeneric test 99 | { 100 | int i = firstCodePoint; 101 | 102 | var enumerator = (IEnumerator)new UnicodeCodePointRange(firstCodePoint, lastCodePoint).GetEnumerator(); 103 | 104 | while (enumerator.MoveNext()) 105 | { 106 | Assert.Equal(i++, enumerator.Current); 107 | } 108 | 109 | enumerator.Reset(); 110 | 111 | Assert.True(enumerator.MoveNext()); 112 | Assert.Equal(firstCodePoint, enumerator.Current); 113 | } 114 | } 115 | } 116 | } 117 | 
-------------------------------------------------------------------------------- /System.Unicode.Tests/UnicodeRationalNumerTests.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using Xunit; 3 | 4 | namespace System.Unicode.Tests 5 | { 6 | public class UnicodeRationalNumerTests 7 | { 8 | [Fact] 9 | public void DefaultValueShouldBeDetectedAsSuch() 10 | { 11 | Assert.True(default(UnicodeRationalNumber).IsDefaultValue); 12 | Assert.Equal(string.Empty, default(UnicodeRationalNumber).ToString()); 13 | } 14 | 15 | public static readonly TheoryData Numerators = new TheoryData 16 | { 17 | 0, 18 | 1, 19 | long.MaxValue, 20 | long.MinValue 21 | }; 22 | 23 | [Theory] 24 | [MemberData(nameof(Numerators))] 25 | public void NumbersAndFractionOverOneShouldBeEqual(long numerator) 26 | { 27 | Assert.Equal(new UnicodeRationalNumber(numerator), new UnicodeRationalNumber(numerator, 1)); 28 | Assert.Equal(new UnicodeRationalNumber(numerator).GetHashCode(), new UnicodeRationalNumber(numerator, 1).GetHashCode()); 29 | } 30 | 31 | [Theory] 32 | [InlineData("1/10", "10/1")] 33 | [InlineData("2/10", "1/10")] 34 | [InlineData("1/20", "1/10")] 35 | [InlineData("2/2", "1/1")] 36 | [InlineData("2/1", "1/2")] 37 | public void DifferentRationalNumbersShouldNotBeDeterminedEqual(string number1, string number2) 38 | { 39 | Assert.NotEqual(UnicodeRationalNumber.Parse(number1), UnicodeRationalNumber.Parse(number2)); 40 | Assert.NotEqual(UnicodeRationalNumber.Parse(number2), UnicodeRationalNumber.Parse(number1)); 41 | } 42 | 43 | public static readonly TheoryData StringConversionTestData = new TheoryData 44 | { 45 | { "0", 0, 1 }, 46 | { "1", 1, 1 }, 47 | { "1/100", 1, 100 }, 48 | { "-20/7", -20, 7 }, 49 | { "-5", -5, 1 }, 50 | { "-9223372036854775808", long.MinValue, 1 }, 51 | { "9223372036854775807", long.MaxValue, 1 }, 52 | { "9223372036854775807/255", long.MaxValue, byte.MaxValue }, 53 | }; 54 | 55 | [Theory] 56 | 
[MemberData(nameof(StringConversionTestData))] 57 | public void MethodToStringShouldReturnExpectedResult(string expectedText, long numerator, byte denominator) 58 | => Assert.Equal(expectedText, new UnicodeRationalNumber(numerator, denominator).ToString()); 59 | 60 | [Fact] 61 | public void ParsingNullValueShoudlFail() 62 | => Assert.Throws(() => UnicodeRationalNumber.Parse(null)); 63 | 64 | [Fact] 65 | public void ParsingEmptyValueShoudlFail() 66 | => Assert.Throws(() => UnicodeRationalNumber.Parse(string.Empty)); 67 | 68 | [Theory] 69 | [InlineData(0, "0")] 70 | [InlineData(0, "0/1")] 71 | [InlineData(1, "1")] 72 | [InlineData(1, "1/1")] 73 | [InlineData(long.MaxValue, "9223372036854775807")] 74 | [InlineData(long.MaxValue, "9223372036854775807/1")] 75 | [InlineData(long.MinValue, "-9223372036854775808")] 76 | [InlineData(long.MinValue, "-9223372036854775808/1")] 77 | public void ParsingCanReturnSimpleNumber(long expectedNumber, string text) 78 | => Assert.Equal(new UnicodeRationalNumber(expectedNumber), UnicodeRationalNumber.Parse(text)); 79 | 80 | public static readonly TheoryData FractionParsingTestData = new TheoryData 81 | { 82 | { 0, 1, "0" }, 83 | { 0, 1, "0/1" }, 84 | { 1, 1, "1" }, 85 | { 1, 1, "1/1" }, 86 | { 1, 10, "1/10" }, 87 | { 1, 255, "1/255" }, 88 | { 3, 4, "3/4" }, 89 | { 6, 8, "6/8" }, 90 | { 1, 255, "1/255" }, 91 | { long.MaxValue, 1, "9223372036854775807" }, 92 | { long.MaxValue, 1, "9223372036854775807/1" }, 93 | { long.MinValue, 1, "-9223372036854775808" }, 94 | { long.MinValue, 1, "-9223372036854775808/1" }, 95 | { long.MaxValue, byte.MaxValue, "9223372036854775807/255" }, 96 | }; 97 | 98 | [Theory] 99 | [MemberData(nameof(FractionParsingTestData))] 100 | public void ParsingCanReturnFraction(long expectedNumerator, byte expectedDenominator, string text) 101 | => Assert.Equal(new UnicodeRationalNumber(expectedNumerator, expectedDenominator), UnicodeRationalNumber.Parse(text)); 102 | 103 | [Fact] 104 | public void 
EqualityComparisonAndHashCodeShouldWorkAsExpected() 105 | { 106 | var numbers = new[] 107 | { 108 | default, 109 | new UnicodeRationalNumber(0), 110 | new UnicodeRationalNumber(1), 111 | new UnicodeRationalNumber(1, 10), 112 | new UnicodeRationalNumber(1, 100), 113 | new UnicodeRationalNumber(10), 114 | new UnicodeRationalNumber(100), 115 | new UnicodeRationalNumber(1000), 116 | new UnicodeRationalNumber(1000000), 117 | new UnicodeRationalNumber(1000000000), 118 | new UnicodeRationalNumber(1000000000000), 119 | }; 120 | 121 | var hashSet = new HashSet(); 122 | 123 | // Verify that all numbers are unique 124 | foreach (var number in numbers) 125 | Assert.True(hashSet.Add(number)); 126 | 127 | // Verify that all numbers are already in the list 128 | foreach (var number in numbers) 129 | Assert.False(hashSet.Add(number)); 130 | } 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /System.Unicode.Tests/UnihanCharacterDataTests.cs: -------------------------------------------------------------------------------- 1 | using System.Linq; 2 | using Xunit; 3 | 4 | namespace System.Unicode.Tests 5 | { 6 | public sealed class UnihanCharacterDataTests 7 | { 8 | private static readonly UnicodeBlock[] Blocks = UnicodeInfo.GetBlocks(); 9 | 10 | [Theory] 11 | [InlineData("CJK Unified Ideographs")] 12 | [InlineData("CJK Unified Ideographs Extension A")] 13 | [InlineData("CJK Unified Ideographs Extension B")] 14 | [InlineData("CJK Unified Ideographs Extension C")] 15 | [InlineData("CJK Unified Ideographs Extension D")] 16 | [InlineData("CJK Unified Ideographs Extension E")] 17 | [InlineData("CJK Unified Ideographs Extension F")] 18 | [InlineData("CJK Unified Ideographs Extension G")] 19 | [InlineData("CJK Unified Ideographs Extension H")] 20 | [InlineData("CJK Compatibility Ideographs")] 21 | [InlineData("CJK Compatibility Ideographs Supplement")] 22 | public void CodePointPackingShouldRoundTrip(string blockName) 23 | { 24 | var 
using System.Text;
using Xunit.Abstractions;

namespace System.Unicode.Tests
{
    // This class is needed because apparently, somewhere in the process of unit testing, strings with invalid UTF-16 sequences are "fixed", which totally messes up the tests here.
    // This is just a wrapper over regular strings… Data is serialized as an array of chars instead of a string. This seems to do the trick.
    public class XUnitSerializableString : IEquatable<XUnitSerializableString>, IXunitSerializable
    {
        // Not readonly: assigned by Deserialize after the parameterless constructor has run.
        private string _value;

        /// <summary>Creates an instance wrapping <c>null</c>.</summary>
        public XUnitSerializableString() : this(null) { }

        /// <summary>Creates an instance wrapping <paramref name="value"/>.</summary>
        public XUnitSerializableString(string value)
        {
            _value = value;
        }

        void IXunitSerializable.Deserialize(IXunitSerializationInfo info)
        {
            var chars = info.GetValue<char[]>("Chars");

            _value = chars != null ?
                new string(chars) :
                null;
        }

        void IXunitSerializable.Serialize(IXunitSerializationInfo info)
            => info.AddValue("Chars", _value?.ToCharArray(), typeof(char[]));

        /// <summary>Returns the wrapped text with every UTF-16 code unit written as a <c>\uXXXX</c> escape.</summary>
        /// <returns>The escaped text; the wrapped value itself when it is <c>null</c> or empty.</returns>
        public override string ToString()
        {
            if (string.IsNullOrEmpty(_value)) return _value;

            var sb = new StringBuilder(_value.Length * 6);

            foreach (char c in _value)
            {
                sb.Append(@"\u")
                    .Append(((ushort)c).ToString("X4"));
            }

            return sb.ToString();
        }

        /// <summary>Compares the wrapped strings ordinally. A <c>null</c> <paramref name="other"/> is never equal.</summary>
        public bool Equals(XUnitSerializableString other) => other != null && _value == other._value;

        public override bool Equals(object obj) => obj is XUnitSerializableString other && Equals(other);

        // StringComparer.GetHashCode rejects null, which is a legal wrapped value here.
        public override int GetHashCode() => _value is null ? 0 : StringComparer.Ordinal.GetHashCode(_value);

        /// <summary>Unwraps the string. A <c>null</c> wrapper converts to a <c>null</c> string.</summary>
        public static implicit operator string(XUnitSerializableString text) => text?._value;

        public static implicit operator XUnitSerializableString(string text) => new XUnitSerializableString(text);
    }
}
10 | [ValueName("L"), ValueName("Left_To_Right"), Display(Name = "Left_To_Right", Description = "Any strong left-to-right character.")] 11 | LeftToRight, 12 | /// Represents the value Right_To_Left. 13 | /// Any strong right-to-left (non-Arabic-type) character. 14 | [ValueName("R"), ValueName("Right_To_Left"), Display(Name = "Right_To_Left", Description = "Any strong right-to-left (non-Arabic-type) character.")] 15 | RightToLeft, 16 | /// Represents the value Arabic_Letter. 17 | /// Any strong right-to-left (Arabic-type) character. 18 | [ValueName("AL"), ValueName("Arabic_Letter"), Display(Name = "Arabic_Letter", Description = "Any strong right-to-left (Arabic-type) character.")] 19 | ArabicLetter, 20 | /// Represents the value European_Number. 21 | /// Any ASCII digit or Eastern Arabic-Indic digit. 22 | [ValueName("EN"), ValueName("European_Number"), Display(Name = "European_Number", Description = "Any ASCII digit or Eastern Arabic-Indic digit.")] 23 | EuropeanNumber, 24 | /// Represents the value European_Separator. 25 | /// Plus and minus signs. 26 | [ValueName("ES"), ValueName("European_Separator"), Display(Name = "European_Separator", Description = "Plus and minus signs.")] 27 | EuropeanSeparator, 28 | /// Represents the value European_Terminator. 29 | /// A terminator in a numeric format context, includes currency signs. 30 | [ValueName("ET"), ValueName("European_Terminator"), Display(Name = "European_Terminator", Description = "A terminator in a numeric format context, includes currency signs.")] 31 | EuropeanTerminator, 32 | /// Represents the value Arabic_Number. 33 | /// Any Arabic-Indic digit. 34 | [ValueName("AN"), ValueName("Arabic_Number"), Display(Name = "Arabic_Number", Description = "Any Arabic-Indic digit.")] 35 | ArabicNumber, 36 | /// Represents the value Common_Separator. 37 | /// Commas, colons, and slashes. 
38 | [ValueName("CS"), ValueName("Common_Separator"), Display(Name = "Common_Separator", Description = "Commas, colons, and slashes.")] 39 | CommonSeparator, 40 | /// Represents the value Nonspacing_Mark. 41 | /// Any nonspacing mark. 42 | [ValueName("NSM"), ValueName("Nonspacing_Mark"), Display(Name = "Nonspacing_Mark", Description = "Any nonspacing mark.")] 43 | NonSpacingMark, 44 | /// Represents the value Boundary_Neutral. 45 | /// Most format characters, control codes, or noncharacters. 46 | [ValueName("BN"), ValueName("Boundary_Neutral"), Display(Name = "Boundary_Neutral", Description = "Most format characters, control codes, or noncharacters.")] 47 | BoundaryNeutral, 48 | /// Represents the value Paragraph_Separator. 49 | /// Various newline characters. 50 | [ValueName("B"), ValueName("Paragraph_Separator"), Display(Name = "Paragraph_Separator", Description = "Various newline characters.")] 51 | ParagraphSeparator, 52 | /// Represents the value Segment_Separator. 53 | /// Various segment-related control codes. 54 | [ValueName("S"), ValueName("Segment_Separator"), Display(Name = "Segment_Separator", Description = "Various segment-related control codes.")] 55 | SegmentSeparator, 56 | /// Represents the value White_Space. 57 | /// Spaces. 58 | [ValueName("WS"), ValueName("White_Space"), Display(Name = "White_Space", Description = "Spaces.")] 59 | WhiteSpace, 60 | /// Represents the value Other_Neutral. 61 | /// Most other symbols and punctuation marks. 62 | [ValueName("ON"), ValueName("Other_Neutral"), Display(Name = "Other_Neutral", Description = "Most other symbols and punctuation marks.")] 63 | OtherNeutral, 64 | /// Represents the value Left_To_Right_Embedding. 65 | /// U+202A: the LR embedding control. 66 | [ValueName("LRE"), ValueName("Left_To_Right_Embedding"), Display(Name = "Left_To_Right_Embedding", Description = "U+202A: the LR embedding control.")] 67 | LeftToRightEmbedding, 68 | /// Represents the value Left_To_Right_Override. 
69 | /// U+202D: the LR override control. 70 | [ValueName("LRO"), ValueName("Left_To_Right_Override"), Display(Name = "Left_To_Right_Override", Description = "U+202D: the LR override control.")] 71 | LeftToRightOverride, 72 | /// Represents the value Right_To_Left_Embedding. 73 | /// U+202B: the RL embedding control. 74 | [ValueName("RLE"), ValueName("Right_To_Left_Embedding"), Display(Name = "Right_To_Left_Embedding", Description = "U+202B: the RL embedding control.")] 75 | RightToLeftEmbedding, 76 | /// Represents the value Right_To_Left_Override. 77 | /// U+202E: the RL override control. 78 | [ValueName("RLO"), ValueName("Right_To_Left_Override"), Display(Name = "Right_To_Left_Override", Description = "U+202E: the RL override control.")] 79 | RightToLeftOverride, 80 | /// Represents the value Pop_Directional_Format. 81 | /// U+202C: terminates an embedding or override control. 82 | [ValueName("PDF"), ValueName("Pop_Directional_Format"), Display(Name = "Pop_Directional_Format", Description = "U+202C: terminates an embedding or override control.")] 83 | PopDirectionalFormat, 84 | /// Represents the value Left_To_Right_Isolate. 85 | /// U+2066: the LR isolate control. 86 | [ValueName("LRI"), ValueName("Left_To_Right_Isolate"), Display(Name = "Left_To_Right_Isolate", Description = "U+2066: the LR isolate control.")] 87 | LeftToRightIsolate, 88 | /// Represents the value Right_To_Left_Isolate. 89 | /// U+2067: the RL isolate control. 90 | [ValueName("RLI"), ValueName("Right_To_Left_Isolate"), Display(Name = "Right_To_Left_Isolate", Description = "U+2067: the RL isolate control.")] 91 | RightToLeftIsolate, 92 | /// Represents the value First_Strong_Isolate. 93 | /// U+2068: the first strong isolate control. 94 | [ValueName("FSI"), ValueName("First_Strong_Isolate"), Display(Name = "First_Strong_Isolate", Description = "U+2068: the first strong isolate control.")] 95 | FirstStrongIsolate, 96 | /// Represents the value Pop_Directional_Isolate. 
97 | /// U+2069: terminates an isolate control. 98 | [ValueName("PDI"), ValueName("Pop_Directional_Isolate"), Display(Name = "Pop_Directional_Isolate", Description = "U+2069: terminates an isolate control.")] 99 | PopDirectionalIsolate, 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /System.Unicode/CjkRadicalData.cs: --------------------------------------------------------------------------------
namespace System.Unicode
{
    /// <summary>Holds the four code points describing a CJK radical: radical and character, each in traditional and simplified form.</summary>
    /// <remarks>The type is public only when <c>BUILD_SYSTEM</c> is defined; otherwise it is internal.</remarks>
#if BUILD_SYSTEM
    public
#else
    internal
#endif
    readonly struct CjkRadicalData
    {
        /// <summary>Code point of the radical in its traditional form.</summary>
        public readonly char TraditionalRadicalCodePoint;
        /// <summary>Code point of the character associated with the radical, in its traditional form.</summary>
        public readonly char TraditionalCharacterCodePoint;
        /// <summary>Code point of the radical in its simplified form (equal to the traditional one when built with the two-argument constructor).</summary>
        public readonly char SimplifiedRadicalCodePoint;
        /// <summary>Code point of the character associated with the radical, in its simplified form (equal to the traditional one when built with the two-argument constructor).</summary>
        public readonly char SimplifiedCharacterCodePoint;

        /// <summary>Creates an entry whose simplified code points are the same as its traditional ones.</summary>
        /// <param name="radicalCodePoint">Radical code point, used for both forms.</param>
        /// <param name="characterCodePoint">Character code point, used for both forms.</param>
        internal CjkRadicalData(char radicalCodePoint, char characterCodePoint)
            : this(radicalCodePoint, characterCodePoint, radicalCodePoint, characterCodePoint)
        {
        }

        /// <summary>Creates an entry with explicitly specified traditional and simplified code points.</summary>
        /// <param name="traditionalRadicalCodePoint">Radical code point, traditional form.</param>
        /// <param name="traditionalCharacterCodePoint">Character code point, traditional form.</param>
        /// <param name="simplifiedRadicalCodePoint">Radical code point, simplified form.</param>
        /// <param name="simplifiedCharacterCodePoint">Character code point, simplified form.</param>
        internal CjkRadicalData(char traditionalRadicalCodePoint, char traditionalCharacterCodePoint, char simplifiedRadicalCodePoint, char simplifiedCharacterCodePoint)
        {
            SimplifiedCharacterCodePoint = simplifiedCharacterCodePoint;
            SimplifiedRadicalCodePoint = simplifiedRadicalCodePoint;
            TraditionalCharacterCodePoint = traditionalCharacterCodePoint;
            TraditionalRadicalCodePoint = traditionalRadicalCodePoint;
        }

        /// <summary>Gets a value indicating whether at least one simplified code point differs from its traditional counterpart.</summary>
        public bool HasSimplifiedForm
        {
            get
            {
                bool sameRadical = TraditionalRadicalCodePoint == SimplifiedRadicalCodePoint;
                bool sameCharacter = TraditionalCharacterCodePoint == SimplifiedCharacterCodePoint;

                return !(sameRadical && sameCharacter);
            }
        }
    }
}
-------------------------------------------------------------------------------- /System.Unicode/CjkRadicalInfo.cs: -------------------------------------------------------------------------------- 1 | using
System.Diagnostics; 2 | 3 | namespace System.Unicode 4 | { 5 | /// Provides information on a specific CJK radical. 6 | [DebuggerDisplay("{RadicalIndex} - {TraditionalRadicalCodePoint.ToString(),nq} / {SimplifiedRadicalCodePoint.ToString(),nq}")] 7 | public readonly struct CjkRadicalInfo 8 | { 9 | /// The index of the radical in the Kangxi dictionary. 10 | /// There are 214 radicals, numbered from 1 to 214. 11 | public byte RadicalIndex { get; } 12 | 13 | private readonly CjkRadicalData _radicalData; 14 | 15 | /// Gets a code point representing the CJK radical in its traditional form. 16 | public char TraditionalRadicalCodePoint => _radicalData.TraditionalRadicalCodePoint; 17 | /// Gets the code point of a traditional character composed only of the CJK radical. 18 | /// 19 | /// Usually, the glyph of this code point will be the same as the one used for <see cref="TraditionalRadicalCodePoint"/>. 20 | /// However, the code point returned will have a meaning associated, contrary to the one returned by <see cref="TraditionalRadicalCodePoint"/>, which only represents the radical. 21 | /// 22 | public char TraditionalCharacterCodePoint => _radicalData.TraditionalCharacterCodePoint; 23 | /// Gets a code point representing the CJK radical in its simplified form, which may be the same as the traditional form. 24 | /// Most of the time, the value returned will be the same as <see cref="TraditionalRadicalCodePoint"/>. 25 | public char SimplifiedRadicalCodePoint => _radicalData.SimplifiedRadicalCodePoint; 26 | /// Gets the code point of a simplified character composed only of the CJK radical. 27 | /// 28 | /// Usually, the glyph of this code point will be the same as the one used for <see cref="SimplifiedRadicalCodePoint"/>. 29 | /// However, the code point returned will have a meaning associated, contrary to the one returned by <see cref="SimplifiedRadicalCodePoint"/>, which only represents the radical. 30 | /// 31 | public char SimplifiedCharacterCodePoint => _radicalData.SimplifiedCharacterCodePoint; 32 | 33 | /// Gets a value indicating whether a simplified form exists for the given radical.
34 | public bool HasSimplifiedForm => _radicalData.HasSimplifiedForm; 35 | 36 | internal CjkRadicalInfo(byte radicalIndex, CjkRadicalData radicalData) 37 | { 38 | RadicalIndex = radicalIndex; 39 | _radicalData = radicalData; 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /System.Unicode/CodePointEnumerable.cs: --------------------------------------------------------------------------------
using System.Collections;
using System.Collections.Generic;

namespace System.Unicode
{
    /// <summary>Allows enumeration of the code points contained in an encapsulated string.</summary>
    /// <remarks>
    /// This enumerable will only allow enumeration of valid UTF-16 strings.
    /// For incomplete or invalid UTF-16 strings, please use <see cref="PermissiveCodePointEnumerable"/> instead.
    /// </remarks>
    public readonly struct CodePointEnumerable : IEnumerable<int>
    {
        /// <summary>Initializes a new instance of the struct <see cref="CodePointEnumerable"/>.</summary>
        /// <param name="text">The string whose code points must be enumerated.</param>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is <see langword="null"/>.</exception>
        public CodePointEnumerable(string text) => Text = text ?? throw new ArgumentNullException(nameof(text));

        /// <summary>Gets the text whose code points are being enumerated.</summary>
        /// <remarks>This is <see langword="null"/> only for a default-initialized instance, which bypasses the constructor check.</remarks>
        public string Text { get; }

        /// <summary>Gets an enumerator which can be used to enumerate the code points in the text.</summary>
        /// <returns>A new <see cref="CodePointEnumerator"/> over <see cref="Text"/>.</returns>
        public CodePointEnumerator GetEnumerator() => new CodePointEnumerator(Text);

        // The generic and non-generic interface implementations must be distinct members:
        // both forward to the strongly typed enumerator above.
        IEnumerator<int> IEnumerable<int>.GetEnumerator() => GetEnumerator();

        IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
    }
}
-------------------------------------------------------------------------------- /System.Unicode/CodePointEnumerator.cs: -------------------------------------------------------------------------------- 1 | using System.Collections; 2 | using System.Collections.Generic; 3 | 4 | namespace System.Unicode 5 | { 6 | /// Supports a standard iteration of code points in a .
7 | public struct CodePointEnumerator : IEnumerator 8 | { 9 | private readonly string _text; 10 | private int _current; 11 | private int _index; 12 | 13 | /// Initializes a new instance of the struct. 14 | /// The text whose code point should be enumerated. 15 | /// is . 16 | public CodePointEnumerator(string text) 17 | { 18 | _text = text ?? throw new ArgumentNullException(nameof(text)); 19 | _current = 0; 20 | _index = -1; 21 | } 22 | 23 | /// Gets the element in the collection at the current position of the enumerator.. 24 | /// The element in the collection at the current position of the enumerator. 25 | public int Current => _current; 26 | 27 | object IEnumerator.Current => _current; 28 | 29 | void IDisposable.Dispose() { } 30 | 31 | /// Advances the enumerator to the next element of the collection. 32 | /// if the enumerator was successfully advanced to the next element; if the enumerator has passed the end of the collection. 33 | public bool MoveNext() 34 | { 35 | if (_index < _text.Length && (_index += _current > 0xFFFF ? 2 : 1) < _text.Length) 36 | { 37 | _current = char.ConvertToUtf32(_text, _index); 38 | return true; 39 | } 40 | else 41 | { 42 | _current = 0; 43 | return false; 44 | } 45 | } 46 | 47 | void IEnumerator.Reset() => (_current, _index) = (0, -1); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /System.Unicode/CompatibilityFormattingTag.cs: -------------------------------------------------------------------------------- 1 | using System.ComponentModel.DataAnnotations; 2 | 3 | namespace System.Unicode 4 | { 5 | /// Provides information on the kind of compatibility decomposition provided. 6 | /// The default value of indicates canonical decomposition of the code point. 7 | public enum CompatibilityFormattingTag : byte 8 | { 9 | /// Canonical form. 10 | Canonical = 0, 11 | /// Font variant (for example, a blackletter form). 
12 | [ValueName("font"), Display(Name = "font", Description = "Font variant (for example, a blackletter form).")] 13 | Font, 14 | /// No-break version of a space or hyphen. 15 | [ValueName("noBreak"), Display(Name = "noBreak", Description = "No-break version of a space or hyphen.")] 16 | NoBreak, 17 | /// Initial presentation form (Arabic). 18 | [ValueName("initial"), Display(Name = "initial", Description = "Initial presentation form (Arabic).")] 19 | Initial, 20 | /// Medial presentation form (Arabic). 21 | [ValueName("medial"), Display(Name = "medial", Description = "Medial presentation form (Arabic).")] 22 | Medial, 23 | /// Final presentation form (Arabic). 24 | [ValueName("final"), Display(Name = "final", Description = "Final presentation form (Arabic).")] 25 | Final, 26 | /// Isolated presentation form (Arabic). 27 | [ValueName("isolated"), Display(Name = "isolated", Description = "Isolated presentation form (Arabic).")] 28 | Isolated, 29 | /// Encircled form. 30 | [ValueName("circle"), Display(Name = "circle", Description = "Encircled form.")] 31 | Circle, 32 | /// Superscript form. 33 | [ValueName("super"), Display(Name = "super", Description = "Superscript form.")] 34 | Super, 35 | /// Subscript form. 36 | [ValueName("sub"), Display(Name = "sub", Description = "Subscript form.")] 37 | Sub, 38 | /// Vertical layout presentation form. 39 | [ValueName("vertical"), Display(Name = "vertical", Description = "Vertical layout presentation form.")] 40 | Vertical, 41 | /// Wide (or zenkaku) compatibility character. 42 | [ValueName("wide"), Display(Name = "wide", Description = "Wide (or zenkaku) compatibility character.")] 43 | Wide, 44 | /// Narrow (or hankaku) compatibility character. 45 | [ValueName("narrow"), Display(Name = "narrow", Description = "Narrow (or hankaku) compatibility character.")] 46 | Narrow, 47 | /// Small variant form (CNS compatibility). 
48 | [ValueName("small"), Display(Name = "small", Description = "Small variant form (CNS compatibility).")] 49 | Small, 50 | /// CJK squared font variant. 51 | [ValueName("square"), Display(Name = "square", Description = "CJK squared font variant.")] 52 | Square, 53 | /// Vulgar fraction form. 54 | [ValueName("fraction"), Display(Name = "fraction", Description = "Vulgar fraction form.")] 55 | Fraction, 56 | /// Otherwise unspecified compatibility character. 57 | [ValueName("compat"), Display(Name = "compat", Description = "Otherwise unspecified compatibility character.")] 58 | Compat, 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /System.Unicode/CoreProperties.cs: -------------------------------------------------------------------------------- 1 | using System.ComponentModel.DataAnnotations; 2 | 3 | namespace System.Unicode 4 | { 5 | /// A bitmask of the various available core properties. 6 | /// Core properties are normative, and derived from various properties as well as . 7 | [Flags] 8 | public enum CoreProperties : int 9 | { 10 | // ⚠️ Be careful when adding new properties to the enum. Only up to 22 bits should be consumed. 11 | 12 | /// Represents the Lowercase property. 13 | [ValueName("Lowercase"), ValueName("Lower"), Display(Name = "Lowercase")] 14 | Lowercase = 0b_0000_0000_0000_0000_0000_0001, 15 | /// Represents the Uppercase property. 16 | [ValueName("Uppercase"), ValueName("Upper"), Display(Name = "Uppercase")] 17 | Uppercase = 0b_0000_0000_0000_0000_0000_0010, 18 | /// Represents the Cased property. 19 | [ValueName("Cased"), Display(Name = "Cased")] 20 | Cased = 0b_0000_0000_0000_0000_0000_0100, 21 | /// Represents the Case_Ignorable property. 22 | [ValueName("Case_Ignorable"), ValueName("CI"), Display(Name = "Case_Ignorable")] 23 | CaseIgnorable = 0b_0000_0000_0000_0000_0000_1000, 24 | /// Represents the Changes_When_Lowercased property. 
25 | [ValueName("Changes_When_Lowercased"), ValueName("CWL"), Display(Name = "Changes_When_Lowercased")] 26 | ChangesWhenLowercased = 0b_0000_0000_0000_0000_0001_0000, 27 | /// Represents the Changes_When_Uppercased property. 28 | [ValueName("Changes_When_Uppercased"), ValueName("CWU"), Display(Name = "Changes_When_Uppercased")] 29 | ChangesWhenUppercased = 0b_0000_0000_0000_0000_0010_0000, 30 | /// Represents the Changes_When_Titlecased property. 31 | [ValueName("Changes_When_Titlecased"), ValueName("CWT"), Display(Name = "Changes_When_Titlecased")] 32 | ChangesWhenTitlecased = 0b_0000_0000_0000_0000_0100_0000, 33 | /// Represents the Changes_When_Casefolded property. 34 | [ValueName("Changes_When_Casefolded"), ValueName("CWCF"), Display(Name = "Changes_When_Casefolded")] 35 | ChangesWhenCasefolded = 0b_0000_0000_0000_0000_1000_0000, 36 | /// Represents the Changes_When_Casemapped property. 37 | [ValueName("Changes_When_Casemapped"), ValueName("CWCM"), Display(Name = "Changes_When_Casemapped")] 38 | ChangesWhenCasemapped = 0b_0000_0000_0000_0001_0000_0000, 39 | /// Represents the Alphabetic property. 40 | [ValueName("Alphabetic"), ValueName("Alpha"), Display(Name = "Alphabetic")] 41 | Alphabetic = 0b_0000_0000_0000_0010_0000_0000, 42 | /// Represents the Default_Ignorable_Code_Point property. 43 | [ValueName("Default_Ignorable_Code_Point"), ValueName("DI"), Display(Name = "Default_Ignorable_Code_Point")] 44 | DefaultIgnorableCodePoint = 0b_0000_0000_0000_0100_0000_0000, 45 | /// Represents the Grapheme_Base property. 46 | [ValueName("Grapheme_Base"), ValueName("Gr_Base"), Display(Name = "Grapheme_Base")] 47 | GraphemeBase = 0b_0000_0000_0000_1000_0000_0000, 48 | /// Represents the Grapheme_Extend property. 49 | [ValueName("Grapheme_Extend"), ValueName("Gr_Ext"), Display(Name = "Grapheme_Extend")] 50 | GraphemeExtend = 0b_0000_0000_0001_0000_0000_0000, 51 | /// Represents the Grapheme_Link property. 
52 | [ValueName("Grapheme_Link"), ValueName("Gr_Link"), Display(Name = "Grapheme_Link")] 53 | GraphemeLink = 0b_0000_0000_0010_0000_0000_0000, 54 | /// Represents the Math property. 55 | [ValueName("Math"), Display(Name = "Math")] 56 | Math = 0b_0000_0000_0100_0000_0000_0000, 57 | /// Represents the ID_Start property. 58 | [ValueName("ID_Start"), ValueName("IDS"), Display(Name = "ID_Start")] 59 | IdentifierStart = 0b_0000_0000_1000_0000_0000_0000, 60 | /// Represents the ID_Continue property. 61 | [ValueName("ID_Continue"), ValueName("IDC"), Display(Name = "ID_Continue")] 62 | IdentifierContinue = 0b_0000_0001_0000_0000_0000_0000, 63 | /// Represents the XID_Start property. 64 | [ValueName("XID_Start"), ValueName("XIDS"), Display(Name = "XID_Start")] 65 | ExtendedIdentifierStart = 0b_0000_0010_0000_0000_0000_0000, 66 | /// Represents the XID_Continue property. 67 | [ValueName("XID_Continue"), ValueName("XIDC"), Display(Name = "XID_Continue")] 68 | ExtendedIdentifierContinue = 0b_0000_0100_0000_0000_0000_0000, 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /System.Unicode/EmojiProperties.cs: -------------------------------------------------------------------------------- 1 | using System.ComponentModel.DataAnnotations; 2 | 3 | namespace System.Unicode 4 | { 5 | /// A bitmask of the various available emoji properties. 6 | /// Emoji properties are not formally part of UCD, but . 7 | [Flags] 8 | public enum EmojiProperties : byte 9 | { 10 | // ⚠️ Only 6 bits can be used here at the moment. Refactoring of the encoding is required to use 8 or more bits. 11 | // Reason: EmojiProperties does not have its own bit in UcdFields. 12 | 13 | /// Represents the Emoji property. 14 | [ValueName("Emoji"), Display(Name = "Emoji")] 15 | Emoji = 0b_00_0001, 16 | /// Represents the Emoji_Presentation property. 
17 | [ValueName("Emoji_Presentation"), ValueName("EPres"), Display(Name = "Emoji_Presentation")] 18 | EmojiPresentation = 0b_00_0010, 19 | /// Represents the Emoji_Modifier property. 20 | [ValueName("Emoji_Modifier"), ValueName("EMod"), Display(Name = "Emoji_Modifier")] 21 | EmojiModifier = 0b_01_0000, 22 | /// Represents the Emoji_Modifier_Base property. 23 | [ValueName("Emoji_Modifier_Base"), ValueName("EBase"), Display(Name = "Emoji_Modifier_Base")] 24 | EmojiModifierBase = 0b_00_0100, 25 | /// Represents the Emoji_Component property. 26 | [ValueName("Emoji_Component"), ValueName("EComp"), Display(Name = "Emoji_Component")] 27 | EmojiComponent = 0b_00_1000, 28 | /// Represents the Extended_Pictographic property. 29 | [ValueName("Extended_Pictographic"), ValueName("ExtPict"), Display(Name = "Extended_Pictographic")] 30 | ExtendedPictographic = 0b_10_0000, 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /System.Unicode/EnumHelper.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Linq; 3 | using System.Reflection; 4 | 5 | namespace System.Unicode 6 | { 7 | internal static class EnumHelper 8 | where T : struct, Enum 9 | { 10 | private static readonly Dictionary ValueNameDictionary = CreateValueNameDictionary(); 11 | 12 | private static Dictionary CreateValueNameDictionary() 13 | { 14 | var type = typeof(T).GetTypeInfo(); 15 | 16 | if (!type.IsEnum) throw new InvalidOperationException(); 17 | 18 | return 19 | ( 20 | from field in type.DeclaredFields 21 | where field.IsPublic && field.IsLiteral 22 | select new KeyValuePair 23 | ( 24 | (T)field.GetValue(null), 25 | ( 26 | from attr in field.GetCustomAttributes() 27 | where attr.Name != null 28 | select attr.Name 29 | ).ToArray() 30 | ) 31 | ).ToDictionary(kvp => kvp.Key, kvp => kvp.Value); 32 | } 33 | 34 | public static string[] GetValueNames(T value) => 
ValueNameDictionary.TryGetValue(value, out string[] names) ? names : null; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /System.Unicode/GenerateUnicodeDatabase.proj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | false 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /System.Unicode/HangulInfo.cs: --------------------------------------------------------------------------------
namespace System.Unicode
{
    /// <summary>Computes names for the Hangul syllable code points in the range starting at U+AC00.</summary>
    internal static class HangulInfo
    {
        // Constants defined on page 144 of the Unicode 7.0 Standard (3.12)
        private const ushort SBase = 0xAC00;
        //private const ushort LBase = 0x1100;
        //private const ushort VBase = 0x1161;
        //private const ushort TBase = 0x11A7;
        private const int LCount = 19;
        private const int VCount = 21;
        private const int TCount = 28;
        private const int NCount = VCount * TCount;
        private const int SCount = LCount * NCount;

        // Name fragment for each leading consonant (LCount entries).
        private static readonly string[] JamoLTable =
        {
            "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
            "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
        };

        // Name fragment for each vowel (VCount entries).
        private static readonly string[] JamoVTable =
        {
            "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
            "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
            "YU", "EU", "YI", "I"
        };

        // Name fragment for each trailing consonant (TCount entries); index 0 means "no trailing consonant".
        private static readonly string[] JamoTTable =
        {
            "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
            "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
            "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
        };

        // Algorithm defined on page 150 of the Unicode 7.0 Standard (3.12)
        /// <summary>Builds the name of a Hangul syllable from its code point.</summary>
        /// <param name="codePoint">A code point for which <see cref="IsHangul"/> returns <see langword="true"/>.</param>
        /// <returns>"HANGUL SYLLABLE " followed by the leading, vowel and trailing name fragments.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="codePoint"/> is outside the Hangul syllable range.</exception>
        internal static string GetHangulName(char codePoint)
        {
            if (!IsHangul(codePoint))
            {
                throw new ArgumentOutOfRangeException(nameof(codePoint));
            }

            int syllableOffset = codePoint - SBase;

            string leading = JamoLTable[syllableOffset / NCount];
            string vowel = JamoVTable[(syllableOffset % NCount) / TCount];
            string trailing = JamoTTable[syllableOffset % TCount];

            return string.Concat("HANGUL SYLLABLE ", leading, vowel, trailing);
        }

        /// <summary>Determines whether a code point lies in the range [SBase, SBase + SCount).</summary>
        /// <param name="codePoint">The code point to test.</param>
        /// <returns><see langword="true"/> when the code point is inside that range.</returns>
        internal static bool IsHangul(int codePoint)
        {
            if (codePoint < SBase)
            {
                return false;
            }

            return codePoint < SBase + SCount;
        }
    }
}
-------------------------------------------------------------------------------- /System.Unicode/PermissiveCodePointEnumerable.cs: -------------------------------------------------------------------------------- 1 | using System.Collections; 2 | using System.Collections.Generic; 3 | 4 | namespace System.Unicode 5 | { 6 | /// Allows enumeration of the code points contained in an encapsulated string, even when this one contains lone surrogates. 7 | /// 8 | /// This enumerable will allow enumeration of UTF-16 strings containing lone surrogates. 9 | /// For a more conformant enumeration of code points, please use instead. 10 | /// 11 | public readonly struct PermissiveCodePointEnumerable : IEnumerable 12 | { 13 | /// Initializes a new instance of the struct . 14 | /// The string whose code points must be enumerated. 15 | public PermissiveCodePointEnumerable(string text) => Text = text ?? throw new ArgumentNullException(nameof(text)); 16 | 17 | /// Gets the text whose code points are being enumerated. 18 | public string Text { get; } 19 | 20 | /// Gets an enumerator which can be used to enumerate the code points in the text.
21 | /// 22 | public PermissiveCodePointEnumerator GetEnumerator() => new PermissiveCodePointEnumerator(Text); 23 | 24 | IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); 25 | 26 | IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /System.Unicode/PermissiveCodePointEnumerator.cs: --------------------------------------------------------------------------------
using System.Collections;
using System.Collections.Generic;

namespace System.Unicode
{
    /// <summary>Supports a permissive iteration of code points in a <see cref="string"/>.</summary>
    /// <remarks>A surrogate that is not part of a valid high/low pair is returned as its own value instead of causing an exception.</remarks>
    public struct PermissiveCodePointEnumerator : IEnumerator<int>
    {
        // Text being enumerated; null only for a default-initialized struct.
        private readonly string _text;
        // Value at the current position; reset to 0 when MoveNext returns false.
        private int _current;
        // UTF-16 index of the current element; -1 before the first MoveNext.
        private int _index;

        /// <summary>Initializes a new instance of the <see cref="PermissiveCodePointEnumerator"/> struct.</summary>
        /// <param name="text">The text whose code point should be enumerated.</param>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is <see langword="null"/>.</exception>
        public PermissiveCodePointEnumerator(string text)
        {
            _text = text ?? throw new ArgumentNullException(nameof(text));
            _current = 0;
            _index = -1;
        }

        /// <summary>Gets the element in the collection at the current position of the enumerator.</summary>
        /// <value>The code point (or lone surrogate value) at the current position, or 0 once <see cref="MoveNext"/> has returned <see langword="false"/>.</value>
        public int Current => _current;

        object IEnumerator.Current => _current;

        void IDisposable.Dispose() { }

        /// <summary>Advances the enumerator to the next element of the collection.</summary>
        /// <returns><see langword="true"/> if the enumerator was successfully advanced to the next element; <see langword="false"/> if the enumerator has passed the end of the collection.</returns>
        public bool MoveNext()
        {
            if (_index < _text.Length)
            {
                // Skip the element just returned: two UTF-16 code units when it was a
                // combined surrogate pair (value above U+FFFF), one otherwise. On the
                // first call _current is 0 and _index is -1, so this lands on index 0.
                _index += _current > 0xFFFF ? 2 : 1;

                if (_index < _text.Length)
                {
                    _current = GetUtf32(_text, _index);
                    return true;
                }
            }

            // Past the end: _index stays >= Length, so further calls keep returning false.
            _current = 0;
            return false;
        }

        void IEnumerator.Reset() => (_current, _index) = (0, -1);

        // Decodes the element starting at index: a high surrogate immediately followed by a
        // low surrogate yields the combined code point; anything else yields the code unit itself.
        private static int GetUtf32(string s, int index)
        {
            char first = s[index];
            int next = index + 1;

            if (char.IsHighSurrogate(first) && next < s.Length && char.IsLowSurrogate(s[next]))
            {
                return char.ConvertToUtf32(first, s[next]);
            }

            return first;
        }
    }
}
-------------------------------------------------------------------------------- /System.Unicode/StringExtensions.cs: -------------------------------------------------------------------------------- 1 | namespace System.Unicode 2 | { 3 | /// Contains extension methods applicable to the type. 4 | public static class StringExtensions 5 | { 6 | /// Encapsulates the string in an object which can be used to enumerate code points. 7 | /// 8 | /// The enumerable returned by this method enumerates code points in a strict manner. 9 | /// If the string contains lone surrogates, the enumeration will throw. 10 | /// 11 | /// The string to encapsulate. 12 | /// An enumerable object, which can be used to enumerate code points in the string. 13 | public static CodePointEnumerable AsCodePointEnumerable(this string s) => new(s); 14 | 15 | /// Encapsulates the string in an object which can be used to enumerate code points in a permissive way. 16 | /// 17 | /// The enumerable returned by this method is permissive, regarding the code points represented. 18 | /// It allows invalid sequences, such as lone surrogates, the enumeration will handle those gracefully. 19 | /// 20 | /// The string to encapsulate. 21 | /// An enumerable object, which can be used to enumerate code points in the string.
22 | public static PermissiveCodePointEnumerable AsPermissiveCodePointEnumerable(this string s) => new(s); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /System.Unicode/System.Unicode.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net5.0;netstandard2.1;netstandard2.0;netstandard1.1;net45 5 | en-US 6 | true 7 | True 8 | true 9 | true 10 | true 11 | snupkg 12 | UnicodeInformation 13 | true 14 | $(MSBuildThisFileDirectory)=C:\Sources\NetUnicodeInfo\System.Unicode 15 | $(NoWarn);NETSDK1138 16 | 17 | 18 | 19 | $(DefineConstants);HAS_NATIVE_SPAN 20 | 21 | 22 | 23 | UnicodeInformation 24 | .NET Unicode Information Library 25 | .NET Unicode Information Library 26 | Library providing access to Unicode data to .NET clients. 27 | Unicode Unihan Data .NET C# String Text Char Character CodePoint Code Point 28 | MIT 29 | https://github.com/GoldenCrystal/NetUnicodeInfo 30 | packageIcon.png 31 | https://github.com/GoldenCrystal/NetUnicodeInfo.git 32 | git 33 | Version 2.7.1 34 | ------------- 35 | Fix startup performance regression at the cost of more memory usage during startup. 36 | 37 | Version 2.7.0 38 | ------------- 39 | Support for Unicode 15.0 40 | 41 | Version 2.6.0 42 | ------------- 43 | Support for Unicode 14.0 44 | Bugfix in CjkRadicalData 45 | Reduce string allocations for a few methods on frameworks where native Span is available. 46 | 47 | Version 2.5.1 48 | ------------- 49 | Fix for .NET 6 50 | 51 | Version 2.5.0 52 | ------------- 53 | Support for Unicode 13.0. 54 | Version 2.4.0 55 | ------------- 56 | Support for Unicode 12.1. 57 | Added the missing Emoji properties Extended_Pictographic and EmojiModifier. 🎉 58 | Structs that were immutable have been marked as readonly. 59 | 60 | Version 2.3.0 61 | ------------- 62 | Support for Unicode 12.0. 63 | Target .NET Standard 2.0.
64 | 65 | Version 2.2.1 66 | ------------- 67 | Added DebuggerDisplay attributes on various types. 68 | 69 | Version 2.2.0 70 | ------------- 71 | Added emoji properties. 72 | 73 | Version 2.1.0 74 | ------------- 75 | Support for Unicode 10.0. 76 | 77 | Version 2.0.0 78 | ------------- 79 | Following migration to Unicode 9.0.0, UnicodeRadicalStrokeCount.StrokeCount is now of type System.SByte instead of type System.Byte. 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | all 123 | runtime; build; native; contentfiles; analyzers; buildtransitive 124 | 125 | 126 | 127 | 128 | 129 | UnihanCharacterData.Generated.cs 130 | TextTemplatingFileGenerator 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | True 141 | True 142 | UnihanCharacterData.tt 143 | 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /System.Unicode/UcdFields.cs: -------------------------------------------------------------------------------- 1 | namespace System.Unicode 2 | { 3 | /// Represents the fields available for an UCD entry. 4 | /// Not all the enumeration member directly map to a field. 5 | [Flags] 6 | internal enum UcdFields : ushort 7 | { 8 | // Not really a field, just here to indicate that the entry is a range 9 | CodePointRange = 0b_0000_0000_0000_0001, 10 | 11 | Name = 0b_0000_0000_0000_0010, // Will stand in for official name as well as related names. 
12 | Category = 0b_0000_0000_0000_0100, 13 | CanonicalCombiningClass = 0b_0000_0000_0000_1000, 14 | BidirectionalClass = 0b_0000_0000_0001_0000, 15 | DecompositionMapping = 0b_0000_0000_0010_0000, 16 | 17 | // NumericType / NumericValue : Not exactly a bit mask here… More like [0…3] << 6 18 | NumericDecimal = 0b_0000_0000_0100_0000, 19 | NumericDigit = 0b_0000_0000_1000_0000, 20 | NumericNumeric = 0b_0000_0000_1100_0000, 21 | 22 | // This is a yes/no field, so obviously, no extra storage is required for this one… 23 | BidirectionalMirrored = 0b_0000_0001_0000_0000, 24 | 25 | OldName = 0b_0000_0010_0000_0000, 26 | SimpleUpperCaseMapping = 0b_0000_0100_0000_0000, 27 | SimpleLowerCaseMapping = 0b_0000_1000_0000_0000, 28 | SimpleTitleCaseMapping = 0b_0001_0000_0000_0000, 29 | 30 | ContributoryProperties = 0b_0010_0000_0000_0000, 31 | CorePropertiesAndEmojiProperties = 0b_0100_0000_0000_0000, 32 | 33 | CrossRerefences = 0b_1000_0000_0000_0000, 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /System.Unicode/UnicodeBlock.cs: -------------------------------------------------------------------------------- 1 | using System.Diagnostics; 2 | 3 | namespace System.Unicode 4 | { 5 | /// Represents a Unicode block. 6 | [DebuggerDisplay("[{CodePointRange.ToString(),nq}] {Name,nq}")] 7 | public readonly struct UnicodeBlock 8 | { 9 | /// The code point range of this block. 10 | public readonly UnicodeCodePointRange CodePointRange; 11 | /// The name of this block. 
12 | public readonly string Name; 13 | 14 | internal UnicodeBlock(UnicodeCodePointRange codePointRange, string name) 15 | { 16 | CodePointRange = codePointRange; 17 | Name = name; 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /System.Unicode/UnicodeCategoryExtensions.cs: -------------------------------------------------------------------------------- 1 | using System.Globalization; 2 | 3 | namespace System.Unicode 4 | { 5 | /// Provides extensions to the type. 6 | public static class UnicodeCategoryExtensions 7 | { 8 | /// Gets the short name of the unicode category. 9 | /// The category whose short name should be retrieved. 10 | /// The short name of the unicode category. 11 | public static string GetShortName(this UnicodeCategory category) 12 | => UnicodeCategoryInfo.Get(category).ShortName; 13 | 14 | /// Gets the long name of the unicode category. 15 | /// The category whose long name should be retrieved. 16 | /// The long name of the unicode category. 17 | public static string GetLongName(this UnicodeCategory category) 18 | => UnicodeCategoryInfo.Get(category).LongName; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /System.Unicode/UnicodeCategoryInfo.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.Globalization; 3 | 4 | namespace System.Unicode 5 | { 6 | /// Provides complementary information on values. 
7 | public readonly struct UnicodeCategoryInfo : IEquatable 8 | { 9 | private static readonly UnicodeCategoryInfo[] Categories = 10 | { 11 | new UnicodeCategoryInfo(UnicodeCategory.UppercaseLetter, "Lu", "Uppercase_Letter"), 12 | new UnicodeCategoryInfo(UnicodeCategory.LowercaseLetter, "Ll", "Lowercase_Letter"), 13 | new UnicodeCategoryInfo(UnicodeCategory.TitlecaseLetter, "Lt", "Titlecase_Letter"), 14 | new UnicodeCategoryInfo(UnicodeCategory.ModifierLetter, "Lm", "Modifier_Letter"), 15 | new UnicodeCategoryInfo(UnicodeCategory.OtherLetter, "Lo", "Other_Letter"), 16 | new UnicodeCategoryInfo(UnicodeCategory.NonSpacingMark, "Mn", "Nonspacing_Mark"), 17 | new UnicodeCategoryInfo(UnicodeCategory.SpacingCombiningMark, "Mc", "Spacing_Mark"), 18 | new UnicodeCategoryInfo(UnicodeCategory.EnclosingMark, "Me", "Enclosing_Mark"), 19 | new UnicodeCategoryInfo(UnicodeCategory.DecimalDigitNumber, "Nd", "Decimal_Number"), 20 | new UnicodeCategoryInfo(UnicodeCategory.LetterNumber, "Nl", "Letter_Number"), 21 | new UnicodeCategoryInfo(UnicodeCategory.OtherNumber, "No", "Other_Number"), 22 | new UnicodeCategoryInfo(UnicodeCategory.SpaceSeparator, "Zs", "Space_Separator"), 23 | new UnicodeCategoryInfo(UnicodeCategory.LineSeparator, "Zl", "Line_Separator"), 24 | new UnicodeCategoryInfo(UnicodeCategory.ParagraphSeparator, "Zp", "Paragraph_Separator"), 25 | new UnicodeCategoryInfo(UnicodeCategory.Control, "Cc", "Control"), 26 | new UnicodeCategoryInfo(UnicodeCategory.Format, "Cf", "Format"), 27 | new UnicodeCategoryInfo(UnicodeCategory.Surrogate, "Cs", "Surrogate"), 28 | new UnicodeCategoryInfo(UnicodeCategory.PrivateUse, "Co", "Private_Use"), 29 | new UnicodeCategoryInfo(UnicodeCategory.ConnectorPunctuation, "Pc", "Connector_Punctuation"), 30 | new UnicodeCategoryInfo(UnicodeCategory.DashPunctuation, "Pd", "Dash_Punctuation"), 31 | new UnicodeCategoryInfo(UnicodeCategory.OpenPunctuation, "Ps", "Open_Punctuation"), 32 | new UnicodeCategoryInfo(UnicodeCategory.ClosePunctuation, "Pe", 
"Close_Punctuation"), 33 | new UnicodeCategoryInfo(UnicodeCategory.InitialQuotePunctuation, "Pi", "Initial_Punctuation"), 34 | new UnicodeCategoryInfo(UnicodeCategory.FinalQuotePunctuation, "Pf", "Final_Punctuation"), 35 | new UnicodeCategoryInfo(UnicodeCategory.OtherPunctuation, "Po", "Other_Punctuation"), 36 | new UnicodeCategoryInfo(UnicodeCategory.MathSymbol, "Sm", "Math_Symbol"), 37 | new UnicodeCategoryInfo(UnicodeCategory.CurrencySymbol, "Sc", "Currency_Symbol"), 38 | new UnicodeCategoryInfo(UnicodeCategory.ModifierSymbol, "Sk", "Modifier_Symbol"), 39 | new UnicodeCategoryInfo(UnicodeCategory.OtherSymbol, "So", "Other_Symbol"), 40 | new UnicodeCategoryInfo(UnicodeCategory.OtherNotAssigned, "Cn", "Unassigned"), 41 | }; 42 | 43 | private static readonly Dictionary UnicodeShortNameToCategoryDictionary = BuildShortNameDictionary(); 44 | private static readonly Dictionary UnicodeLongNameToCategoryDictionary = BuildLongNameDictionary(); 45 | 46 | private static Dictionary BuildShortNameDictionary() 47 | { 48 | var dictionary = new Dictionary(StringComparer.OrdinalIgnoreCase); 49 | 50 | foreach (var info in Categories) 51 | { 52 | dictionary.Add(info.ShortName, info.Category); 53 | } 54 | 55 | return dictionary; 56 | } 57 | 58 | private static Dictionary BuildLongNameDictionary() 59 | { 60 | var dictionary = new Dictionary(StringComparer.OrdinalIgnoreCase); 61 | 62 | foreach (var info in Categories) 63 | { 64 | dictionary.Add(info.LongName, info.Category); 65 | } 66 | 67 | return dictionary; 68 | } 69 | 70 | private static UnicodeCategory GetCategoryFromShortName(string name) 71 | => UnicodeShortNameToCategoryDictionary[name]; 72 | 73 | private static UnicodeCategory GetCategoryFromLongName(string name) 74 | => UnicodeLongNameToCategoryDictionary[name]; 75 | 76 | /// Gets an value providing information on the specified unicode category. 77 | /// The category on which information should be retrieved. 78 | /// Information on the specified category. 
79 | public static UnicodeCategoryInfo Get(UnicodeCategory category) => Categories[(int)category]; 80 | 81 | /// Gets an value providing information on the unicode category, accessed by its short name, as per the Unicode standard. 82 | /// The short name for which information should be retrieved . 83 | /// Information on the specified category. 84 | public static UnicodeCategoryInfo FromShortName(string name) => Get(GetCategoryFromShortName(name)); 85 | 86 | /// Gets an value providing information on the unicode category, accessed by its long name, as per the Unicode standard. 87 | /// The long name for which information should be retrieved . 88 | /// Information on the specified category. 89 | public static UnicodeCategoryInfo FromLongName(string name) => Get(GetCategoryFromLongName(name)); 90 | 91 | /// The unicode category described. 92 | public readonly UnicodeCategory Category; 93 | /// Short name of the category, as per the Unicode standard. 94 | public readonly string ShortName; 95 | /// Long name of the category, as per the Unicode standard. 96 | public readonly string LongName; 97 | 98 | private UnicodeCategoryInfo(UnicodeCategory category, string shortName, string longName) 99 | { 100 | Category = category; 101 | ShortName = shortName; 102 | LongName = longName; 103 | } 104 | 105 | /// Returns a that represents this instance. 106 | /// A that represents this instance. 107 | public override string ToString() => Category.ToString(); 108 | 109 | /// Determines whether the specified , is equal to this instance. 110 | /// The to compare with this instance. 111 | /// if the specified is equal to this instance; otherwise, . 112 | public override bool Equals(object obj) => obj is UnicodeCategoryInfo other && Equals(other); 113 | 114 | /// Indicates whether the current object is equal to another object of the same type. 115 | /// An object to compare with this object. 116 | /// if the current object is equal to the other parameter; otherwise, . 
117 | public bool Equals(UnicodeCategoryInfo other) => other.Category == Category && (other.ShortName is null) == (ShortName is null); // Both sides must agree on being initialized: default(UnicodeCategoryInfo) shares Category == 0 with UppercaseLetter and is told apart by its null ShortName. This keeps equality reflexive and symmetric. 118 | 119 | /// <summary>Returns a hash code for this instance.</summary> 120 | /// <returns>A hash code for this instance, suitable for use in hashing algorithms and data structures like a hash table.</returns> 121 | public override int GetHashCode() => (int)Category; 122 | 123 | /// <summary>Performs an implicit conversion from <see cref="UnicodeCategoryInfo"/> to <see cref="UnicodeCategory"/>.</summary> 124 | /// <param name="info">The information.</param> 125 | /// <returns>The result of the conversion.</returns> 126 | public static implicit operator UnicodeCategory(UnicodeCategoryInfo info) => info.Category; 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /System.Unicode/UnicodeCharacterData.cs: -------------------------------------------------------------------------------- 1 | using System.Globalization; 2 | 3 | namespace System.Unicode 4 | { 5 | internal readonly struct UnicodeCharacterData 6 | { 7 | public readonly UnicodeCodePointRange CodePointRange; 8 | public readonly string Name; 9 | public readonly UnicodeNameAlias[] NameAliases; 10 | public readonly UnicodeCategory Category; 11 | public readonly CanonicalCombiningClass CanonicalCombiningClass; 12 | public readonly BidirectionalClass BidirectionalClass; 13 | public readonly CompatibilityFormattingTag DecompositionType; 14 | public readonly string DecompositionMapping; 15 | public readonly UnicodeNumericType NumericType; 16 | private readonly UnicodeRationalNumber _numericValue; 17 | public readonly bool BidirectionalMirrored; 18 | public readonly string OldName; 19 | public readonly string SimpleUpperCaseMapping; 20 | public readonly string SimpleLowerCaseMapping; 21 | public readonly string SimpleTitleCaseMapping; 22 | public readonly ContributoryProperties ContributoryProperties; 23 | private readonly int _corePropertiesAndEmojiProperties; 24 | public CoreProperties CoreProperties => (CoreProperties)(_corePropertiesAndEmojiProperties & 0x003FFFFF); 25 | public
EmojiProperties EmojiProperties => (EmojiProperties)(_corePropertiesAndEmojiProperties >> 24); 26 | 27 | public readonly int[] CrossRerefences; // NB: It seems that parsing NamesList is required in order to provide data for this field ? 28 | 29 | internal UnicodeCharacterData 30 | ( 31 | UnicodeCodePointRange codePointRange, 32 | string name, 33 | UnicodeNameAlias[] nameAliases, 34 | UnicodeCategory category, 35 | CanonicalCombiningClass canonicalCombiningClass, 36 | BidirectionalClass bidirectionalClass, 37 | CompatibilityFormattingTag decompositionType, 38 | string decompositionMapping, 39 | UnicodeNumericType numericType, 40 | UnicodeRationalNumber numericValue, 41 | bool bidirectionalMirrored, 42 | string oldName, 43 | string simpleUpperCaseMapping, 44 | string simpleLowerCaseMapping, 45 | string simpleTitleCaseMapping, 46 | ContributoryProperties contributoryProperties, 47 | int corePropertiesAndEmojiProperties, 48 | int[] crossRerefences 49 | ) 50 | { 51 | CodePointRange = codePointRange; 52 | Name = name; 53 | NameAliases = nameAliases; 54 | Category = category; 55 | CanonicalCombiningClass = canonicalCombiningClass; 56 | BidirectionalClass = bidirectionalClass; 57 | DecompositionType = decompositionType; 58 | DecompositionMapping = decompositionMapping; 59 | NumericType = numericType; 60 | _numericValue = numericValue; 61 | BidirectionalMirrored = bidirectionalMirrored; 62 | OldName = oldName; 63 | SimpleUpperCaseMapping = simpleUpperCaseMapping; 64 | SimpleLowerCaseMapping = simpleLowerCaseMapping; 65 | SimpleTitleCaseMapping = simpleTitleCaseMapping; 66 | ContributoryProperties = contributoryProperties; 67 | _corePropertiesAndEmojiProperties = corePropertiesAndEmojiProperties; 68 | CrossRerefences = crossRerefences; 69 | } 70 | 71 | public UnicodeRationalNumber? NumericValue => NumericType != UnicodeNumericType.None ? 
_numericValue : null as UnicodeRationalNumber?; 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /System.Unicode/UnicodeCodePointRange.cs: -------------------------------------------------------------------------------- 1 | using System.Collections; 2 | using System.Collections.Generic; 3 | using System.Globalization; 4 | 5 | namespace System.Unicode 6 | { 7 | /// Represents a range of Unicode code points. 8 | public readonly struct UnicodeCodePointRange : IEnumerable 9 | { 10 | /// Represents an enumerator which enumerated through all the code points in the . 11 | public struct Enumerator : IEnumerator 12 | { 13 | private readonly int _start; 14 | private readonly int _end; 15 | private int _index; 16 | 17 | /// Initializes a new instance of the struct. 18 | /// The start of the range. 19 | /// The end of the range. 20 | internal Enumerator(int start, int end) 21 | { 22 | _start = start; 23 | _end = end; 24 | _index = start - 1; 25 | } 26 | 27 | /// Does nothing. 28 | public void Dispose() { } 29 | 30 | /// Gets the element in the collection at the current position of the enumerator.. 31 | /// The element in the collection at the current position of the enumerator. 32 | public int Current => _index; 33 | 34 | object IEnumerator.Current => _index; 35 | 36 | /// Advances the enumerator to the next element of the collection. 37 | /// true if the enumerator was successfully advanced to the next element; false if the enumerator has passed the end of the collection. 38 | public bool MoveNext() => _index < _end && ++_index == _index; 39 | 40 | void IEnumerator.Reset() => _index = _start - 1; 41 | } 42 | 43 | /// The first code point in the range. 44 | public readonly int FirstCodePoint; 45 | /// The last code point in the range. 46 | public readonly int LastCodePoint; 47 | 48 | /// Gets a value indicating whether this value represents a single code point. 
49 | /// <value><see langword="true"/> if this value represents a single code point; otherwise, <see langword="false"/>.</value> 50 | public bool IsSingleCodePoint => FirstCodePoint == LastCodePoint; 51 | 52 | /// <summary>Initializes a new instance of the <see cref="UnicodeCodePointRange"/> struct for a single code point.</summary> 53 | /// <param name="codePoint">The code point.</param> 54 | /// <exception cref="ArgumentOutOfRangeException"><paramref name="codePoint"/> is less than 0 or greater than 0x10FFFF.</exception> 55 | public UnicodeCodePointRange(int codePoint) 56 | { 57 | if (codePoint < 0 || codePoint > 0x10FFFF) throw new ArgumentOutOfRangeException(nameof(codePoint)); 58 | 59 | FirstCodePoint = codePoint; 60 | LastCodePoint = codePoint; 61 | } 62 | 63 | /// <summary>Initializes a new instance of the <see cref="UnicodeCodePointRange"/> struct with specified bounds.</summary> 64 | /// <param name="firstCodePoint">The first code point in the range.</param> 65 | /// <param name="lastCodePoint">The last code point in the range.</param> 66 | /// <exception cref="ArgumentOutOfRangeException"> 67 | /// <paramref name="firstCodePoint"/> is less than 0 or greater than 0x10FFFF, 68 | /// or <paramref name="lastCodePoint"/> is less than <paramref name="firstCodePoint"/> or greater than 0x10FFFF. 69 | /// </exception> 70 | public UnicodeCodePointRange(int firstCodePoint, int lastCodePoint) 71 | { 72 | if (firstCodePoint < 0 || firstCodePoint > 0x10FFFF) throw new ArgumentOutOfRangeException(nameof(firstCodePoint)); 73 | if (lastCodePoint < firstCodePoint || lastCodePoint > 0x10FFFF) throw new ArgumentOutOfRangeException(nameof(lastCodePoint)); 74 | 75 | FirstCodePoint = firstCodePoint; 76 | LastCodePoint = lastCodePoint; 77 | } 78 | 79 | /// <summary>Determines whether the range contains the specific code point.</summary> 80 | /// <remarks>This method does not validate its inputs, but will always return <see langword="false"/> for any invalid code point.</remarks> 81 | /// <param name="i">The integer to check against the range.</param> 82 | /// <returns><see langword="true"/> if the range contains the specified code point; otherwise, <see langword="false"/>.</returns> 83 | public bool Contains(int i) 84 | // Since the first and last code points have been checked or are at their default value of zero, the method will always exclude invalid code points. 85 | => i >= FirstCodePoint & i <= LastCodePoint; 86 | // Returns -1 when codePoint is below the range, 0 when it lies inside the range, and 1 when it is above the range. 87 | internal int CompareCodePoint(int codePoint) 88 | => FirstCodePoint <= codePoint ? LastCodePoint < codePoint ? 1 : 0 : -1; 89 | 90 | /// <summary>Returns a <see cref="string"/> that represents this instance.</summary> 91 | /// <returns>A <see cref="string"/> that represents this instance.</returns>
92 | public override string ToString() 93 | #if !HAS_NATIVE_SPAN 94 | => FirstCodePoint == LastCodePoint ? FirstCodePoint.ToString("X4") : FirstCodePoint.ToString("X4") + ".." + LastCodePoint.ToString("X4"); 95 | #else 96 | => FirstCodePoint == LastCodePoint ? FirstCodePoint.ToString("X4") : RangeToString(); 97 | 98 | private string RangeToString() 99 | { 100 | Span<char> buffer = stackalloc char[14]; // Worst case: 6 hex digits + ".." + 6 hex digits, since code points never exceed 0x10FFFF. 101 | 102 | FirstCodePoint.TryFormat(buffer, out int length, "X4", CultureInfo.InvariantCulture); 103 | buffer.Slice(length, 2).Fill('.'); 104 | length += 2; 105 | LastCodePoint.TryFormat(buffer[length..], out int l, "X4", CultureInfo.InvariantCulture); 106 | length += l; 107 | 108 | return buffer[..length].ToString(); 109 | } 110 | #endif 111 | 112 | /// <summary>Parses the specified <see cref="string"/> into a <see cref="UnicodeCodePointRange"/>.</summary> 113 | /// <remarks>Code point ranges are encoded as one unprefixed hexadecimal number for single code points, or a pair of unprefixed hexadecimal numbers separated by the characters "..".</remarks> 114 | /// <param name="s">The text to parse.</param> 115 | /// <returns>The parsed value.</returns> 116 | /// <exception cref="FormatException">The parameter <paramref name="s"/> was not in an allowed format.</exception> 117 | public static UnicodeCodePointRange Parse(string s) 118 | { 119 | int start, end; 120 | 121 | int rangeSeparatorOffset = s.IndexOf("..", StringComparison.Ordinal); // Ordinal: the separator is a fixed ASCII token, so the search must not depend on the current culture's collation rules. 122 | 123 | if (rangeSeparatorOffset == 0) throw new FormatException(); 124 | else if (rangeSeparatorOffset < 0) 125 | { 126 | start = end = int.Parse(s, NumberStyles.HexNumber); 127 | } 128 | else 129 | { 130 | #if HAS_NATIVE_SPAN 131 | start = int.Parse(s.AsSpan(0, rangeSeparatorOffset), NumberStyles.HexNumber); 132 | end = int.Parse(s.AsSpan(rangeSeparatorOffset + 2), NumberStyles.HexNumber); 133 | #else 134 | start = int.Parse(s.Substring(0, rangeSeparatorOffset), NumberStyles.HexNumber); 135 | end = int.Parse(s.Substring(rangeSeparatorOffset + 2), NumberStyles.HexNumber); 136 | #endif 137 | } 138 | 139 | return new UnicodeCodePointRange(start, end); 140 | } 141 | 142 | /// <summary>Returns an enumerator that iterates through the collection.</summary>
143 | /// A that can be used to iterate through the collection. 144 | public Enumerator GetEnumerator() => new(FirstCodePoint, LastCodePoint); 145 | IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); 146 | IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /System.Unicode/UnicodeCrossReferenceCollection.cs: -------------------------------------------------------------------------------- 1 | using System.Collections; 2 | using System.Collections.Generic; 3 | 4 | namespace System.Unicode 5 | { 6 | /// Represents a collection of code point cross-references. 7 | public readonly struct UnicodeCrossReferenceCollection : IList 8 | { 9 | #if NETSTANDARD1_1 || NET45 10 | private static readonly int[] EmptyArray = new int[0]; 11 | #endif 12 | 13 | /// Represents an enumerator for the class. 14 | public struct Enumerator : IEnumerator 15 | { 16 | private readonly int[] _items; 17 | private int _index; 18 | 19 | /// Initializes a new instance of the struct. 20 | /// The items to enumerate. 21 | internal Enumerator(int[] items) 22 | { 23 | _items = items; 24 | _index = -1; 25 | } 26 | 27 | /// Does nothing. 28 | public void Dispose() { } 29 | 30 | /// Gets the element in the collection at the current position of the enumerator.. 31 | /// The element in the collection at the current position of the enumerator. 32 | public int Current => _items[_index]; 33 | object IEnumerator.Current => Current; 34 | 35 | /// Advances the enumerator to the next element of the collection. 36 | /// true if the enumerator was successfully advanced to the next element; false if the enumerator has passed the end of the collection. 37 | public bool MoveNext() => _index < _items.Length && ++_index < _items.Length; 38 | 39 | void IEnumerator.Reset() => _index = -1; 40 | } 41 | 42 | /// Gets an empty struct. 
43 | public static readonly UnicodeCrossReferenceCollection Empty = 44 | #if NETSTANDARD1_1 || NET45 45 | new UnicodeCrossReferenceCollection(EmptyArray); 46 | #else 47 | new UnicodeCrossReferenceCollection(Array.Empty()); 48 | #endif 49 | 50 | private readonly int[] _items; 51 | 52 | internal UnicodeCrossReferenceCollection(int[] items) 53 | => _items = items 54 | #if NETSTANDARD1_1 || NET45 55 | ?? EmptyArray; 56 | #else 57 | ?? Array.Empty(); 58 | #endif 59 | 60 | /// Gets the cross-referenced code point at the specified index. 61 | /// The cross-referenced code point. 62 | /// The index. 63 | /// The cross-referenced code point at the specified index. 64 | public int this[int index] => _items[index]; 65 | 66 | int IList.this[int index] 67 | { 68 | get => _items[index]; 69 | set => throw new NotSupportedException(); 70 | } 71 | 72 | /// Gets the number of elements contained in the . 73 | /// The number of elements contained in the . 74 | public int Count => _items.Length; 75 | 76 | bool ICollection.IsReadOnly => true; 77 | 78 | void ICollection.Add(int item) => throw new NotSupportedException(); 79 | void IList.Insert(int index, int item) => throw new NotSupportedException(); 80 | 81 | bool ICollection.Remove(int item) => throw new NotSupportedException(); 82 | void IList.RemoveAt(int index) => throw new NotSupportedException(); 83 | 84 | void ICollection.Clear() => throw new NotSupportedException(); 85 | 86 | /// Determines the index of a specific item in the . 87 | /// The object to locate in the . 88 | /// The index of the item if found in the list; otherwise, -1. 89 | public int IndexOf(int item) => Array.IndexOf(_items, item); 90 | 91 | /// <summary>Determines whether the <see cref="UnicodeCrossReferenceCollection"/> contains a specific value.</summary> 92 | /// <param name="item">The object to locate in the <see cref="UnicodeCrossReferenceCollection"/>.</param> 93 | /// <returns><see langword="true"/> if item is found in the <see cref="UnicodeCrossReferenceCollection"/>; <see langword="false"/> otherwise.</returns> 94 | public bool Contains(int item) => IndexOf(item) >= 0; 95 | 96 | /// 97 | /// Copies the elements of the UnicodeCrossReferenceCollection to an , starting at a particular index.
98 | /// 99 | /// <param name="array">The one-dimensional <see cref="Array"/> that is the destination of the elements to copy from UnicodeCrossReferenceCollection. The <see cref="Array"/> must have zero-based indexing.</param> 100 | /// <param name="arrayIndex">The zero-based index in array at which copy begins.</param> 101 | public void CopyTo(int[] array, int arrayIndex) => _items.CopyTo(array, arrayIndex); 102 | 103 | /// Returns an enumerator that iterates through the collection. 104 | /// A that can be used to iterate through the collection. 105 | public Enumerator GetEnumerator() => new Enumerator(_items); 106 | 107 | IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); 108 | IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /System.Unicode/UnicodeNameAlias.cs: -------------------------------------------------------------------------------- 1 | using System.Diagnostics; 2 | using System.Linq; 3 | 4 | namespace System.Unicode 5 | { 6 | /// Represents a name alias for an Unicode code point. 7 | [DebuggerDisplay("{DisplayText,nq}")] 8 | public readonly struct UnicodeNameAlias 9 | { 10 | internal static readonly UnicodeNameAlias[] EmptyArray = new UnicodeNameAlias[0]; 11 | 12 | /// Gets the alias name. 13 | /// The name. 14 | public string Name { get; } 15 | 16 | /// Gets the kind of alias. 17 | /// The kind of alias. 18 | public UnicodeNameAliasKind Kind { get; } 19 | 20 | private string DisplayText => (Kind != 0 ?
"<" + EnumHelper.GetValueNames(Kind).FirstOrDefault() + "> " : string.Empty) + Name; 21 | 22 | internal UnicodeNameAlias(string name, UnicodeNameAliasKind kind) 23 | { 24 | Name = name; 25 | Kind = kind; 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /System.Unicode/UnicodeNameAliasCollection.cs: -------------------------------------------------------------------------------- 1 | using System.Collections; 2 | using System.Collections.Generic; 3 | 4 | namespace System.Unicode 5 | { 6 | /// Represents a collection of name aliases. 7 | public readonly struct UnicodeNameAliasCollection : IList 8 | { 9 | /// Represents an enumerator for the class. 10 | public struct Enumerator : IEnumerator 11 | { 12 | private readonly UnicodeNameAlias[] _items; 13 | private int _index; 14 | 15 | /// Initializes a new instance of the struct. 16 | /// The items to enumerate. 17 | internal Enumerator(UnicodeNameAlias[] items) 18 | { 19 | _items = items; 20 | _index = -1; 21 | } 22 | 23 | /// Does nothing. 24 | public void Dispose() { } 25 | 26 | /// Gets the element in the collection at the current position of the enumerator.. 27 | /// The element in the collection at the current position of the enumerator. 28 | public UnicodeNameAlias Current => _items[_index]; 29 | object IEnumerator.Current => Current; 30 | 31 | /// Advances the enumerator to the next element of the collection. 32 | /// true if the enumerator was successfully advanced to the next element; false if the enumerator has passed the end of the collection. 33 | public bool MoveNext() => _index < _items.Length && ++_index < _items.Length; 34 | 35 | void IEnumerator.Reset() => _index = -1; 36 | } 37 | 38 | /// Gets an empty struct. 
39 | public static readonly UnicodeNameAliasCollection Empty = new UnicodeNameAliasCollection(UnicodeNameAlias.EmptyArray); 40 | 41 | private readonly UnicodeNameAlias[] _items; 42 | 43 | internal UnicodeNameAliasCollection(UnicodeNameAlias[] items) => _items = items ?? UnicodeNameAlias.EmptyArray; 44 | 45 | /// Gets the at the specified index. 46 | /// The . 47 | /// The index. 48 | /// The at the specified index. 49 | public UnicodeNameAlias this[int index] => _items[index]; 50 | 51 | UnicodeNameAlias IList.this[int index] 52 | { 53 | get => _items[index]; 54 | set => throw new NotSupportedException(); 55 | } 56 | 57 | /// Gets the number of elements contained in the . 58 | /// The number of elements contained in the . 59 | public int Count => _items.Length; 60 | 61 | bool ICollection.IsReadOnly => true; 62 | 63 | void ICollection.Add(UnicodeNameAlias item) => throw new NotSupportedException(); 64 | void IList.Insert(int index, UnicodeNameAlias item) => throw new NotSupportedException(); 65 | 66 | bool ICollection.Remove(UnicodeNameAlias item) => throw new NotSupportedException(); 67 | void IList.RemoveAt(int index) => throw new NotSupportedException(); 68 | 69 | void ICollection.Clear() => throw new NotSupportedException(); 70 | 71 | /// Determines the index of a specific item in the . 72 | /// The object to locate in the . 73 | /// The index of the item if found in the list; otherwise, -1. 74 | public int IndexOf(UnicodeNameAlias item) => Array.IndexOf(_items, item); 75 | 76 | /// <summary>Determines whether the <see cref="UnicodeNameAliasCollection"/> contains a specific value.</summary> 77 | /// <param name="item">The object to locate in the <see cref="UnicodeNameAliasCollection"/>.</param> 78 | /// <returns><see langword="true"/> if item is found in the <see cref="UnicodeNameAliasCollection"/>; <see langword="false"/> otherwise.</returns> 79 | public bool Contains(UnicodeNameAlias item) => IndexOf(item) >= 0; 80 | 81 | /// 82 | /// Copies the elements of the UnicodeNameAliasCollection to an , starting at a particular index. 83 | /// 84 | /// The one-dimensional that is the destination of the elements to copy from UnicodeNameAliasCollection. The must have zero-based indexing.
85 | /// <param name="arrayIndex">The zero-based index in array at which copy begins.</param> 86 | public void CopyTo(UnicodeNameAlias[] array, int arrayIndex) => _items.CopyTo(array, arrayIndex); 87 | 88 | /// Returns an enumerator that iterates through the collection. 89 | /// An that can be used to iterate through the collection. 90 | public Enumerator GetEnumerator() => new Enumerator(_items); 91 | 92 | IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); 93 | IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /System.Unicode/UnicodeNameAliasKind.cs: -------------------------------------------------------------------------------- 1 | namespace System.Unicode 2 | { 3 | /// Provides information on the kind of name alias provided for a code point. 4 | public enum UnicodeNameAliasKind : byte 5 | { 6 | /// The alias is a correction of a serious problem in the original name. 7 | [ValueName("correction")] 8 | Correction = 1, 9 | /// The alias provides the ISO 6429 name for C0 and C1 control functions of a control code, or another commonly occurring name for the control code. 10 | [ValueName("control")] 11 | Control = 2, 12 | /// The alias is a widely used alternate name for a format character. 13 | [ValueName("alternate")] 14 | Alternate = 3, 15 | /// The alias is a documented non-standardized label for C1 control code points. 16 | [ValueName("figment")] 17 | Figment = 4, 18 | /// The alias is a commonly occurring abbreviation (or acronym) for control codes, format characters, spaces, and variation selectors. 19 | [ValueName("abbreviation")] 20 | Abbreviation = 5 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /System.Unicode/UnicodeNumericType.cs: -------------------------------------------------------------------------------- 1 | namespace System.Unicode 2 | { 3 | /// Represents the value of the Numeric_Type property.
4 | public enum UnicodeNumericType : byte 5 | { 6 | /// <summary>The code point has no numeric value.</summary> 7 | None = 0, 8 | /// <summary>The code point represents a decimal digit which is part of a contiguous ascending range of characters from 0 to 9, and can be used in a decimal radix positional numeral system.</summary> 9 | Decimal = 1, 10 | /// <summary>The code point represents a digit between 0 and 9 and requires special handling.</summary> 11 | Digit = 2, 12 | /// <summary>The code point represents another kind of numeric value.</summary> 13 | Numeric = 3 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /System.Unicode/UnicodeRadicalStrokeCount.cs: -------------------------------------------------------------------------------- 1 | using System.Diagnostics; 2 | 3 | namespace System.Unicode 4 | { 5 | /// <summary>Provides information on radical and additional stroke count for a code point.</summary> 6 | /// <remarks>Values of this type are usually associated with the property kRSUnicode (aka. Unicode_Radical_Stroke).</remarks> 7 | [DebuggerDisplay(@"{IsSimplified ? ""Simplified"" : ""Traditional"",nq} Radical {Radical} + {StrokeCount} Strokes")] 8 | public readonly struct UnicodeRadicalStrokeCount 9 | { 10 | #if NETSTANDARD1_1 || NET45 11 | internal static readonly UnicodeRadicalStrokeCount[] EmptyArray = new UnicodeRadicalStrokeCount[0]; 12 | #endif 13 | 14 | /// <summary>Initializes a new instance of the <see cref="UnicodeRadicalStrokeCount"/> struct from raw data.</summary> 15 | /// <param name="rawRadical">The raw value to use for <see cref="Radical"/>.</param> 16 | /// <param name="rawStrokeCount">The raw value to use for <see cref="RawStrokeCount"/>.</param> 17 | internal UnicodeRadicalStrokeCount(byte rawRadical, byte rawStrokeCount) 18 | { 19 | Radical = rawRadical; 20 | RawStrokeCount = rawStrokeCount; 21 | } 22 | 23 | /// <summary>Initializes a new instance of the <see cref="UnicodeRadicalStrokeCount"/> struct.</summary> 24 | /// <remarks><paramref name="strokeCount"/> must be between -8 and 119 included.</remarks> 25 | /// <param name="radical">The index of the Kangxi radical of the character.</param> 26 | /// <param name="strokeCount">The number of additional strokes required to form the character from the radical.</param> 27 | /// <param name="isSimplified">Indicates whether the character is simplified.</param> 28 | /// <exception cref="ArgumentOutOfRangeException"><paramref name="strokeCount"/> is outside of the allowed range of -8 to 119 inclusive.</exception>
29 | internal UnicodeRadicalStrokeCount(byte radical, sbyte strokeCount, bool isSimplified) 30 | { 31 | // Two's complement doesn't work anymore there, as we have some code points with more than 64 additional strokes. 32 | // Negative strokes don't seem to go below -5 for now, so we'll map value between -8 and 119 as 120..127;0..119. 33 | if (strokeCount < -8 || strokeCount > 127 - 8) throw new ArgumentOutOfRangeException(nameof(strokeCount)); 34 | 35 | Radical = radical; 36 | // Pack strokeCount together with isSimplified in a single byte. 37 | RawStrokeCount = unchecked((byte)(strokeCount & 0x7F | (isSimplified ? 0x80 : 0x00))); 38 | } 39 | 40 | /// Gets the index of the Kangxi radical of the character. 41 | /// The Kangxi radicals are numbered from 1 to 214 inclusive. 42 | /// The index of the Kangxi radical. 43 | public byte Radical { get; } 44 | 45 | /// Gets the value of packed with . 46 | /// 47 | /// The stroke count is stored as a 7bit value, together with the flag as a 1bit value. 48 | /// Raw values between 120 and 127 represent negative stroke counts -8 to -1. 49 | /// 50 | /// The raw value of . 51 | internal byte RawStrokeCount { get; } 52 | 53 | /// Gets the additional stroke count. 54 | /// The additional stroke count. 55 | public sbyte StrokeCount => (RawStrokeCount & 0x7F) is int c && c > 119 ? unchecked((sbyte)(c - 128)) : unchecked((sbyte)c); 56 | 57 | /// Gets a value indicating whether the information is based on the simplified form of the radical. 58 | /// if the information is based on the simplified form of the radical; otherwise, . 
59 | public bool IsSimplified => (RawStrokeCount & 0x80) != 0; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /System.Unicode/UnicodeRadicalStrokeCountCollection.cs: -------------------------------------------------------------------------------- 1 | using System.Collections; 2 | using System.Collections.Generic; 3 | 4 | namespace System.Unicode 5 | { 6 | /// Represents a collection of values for the kRSUnicode (aka. Unicode_Radical_Stroke) property. 7 | public readonly struct UnicodeRadicalStrokeCountCollection : IList 8 | { 9 | /// Represents an enumerator for the class. 10 | public struct Enumerator : IEnumerator 11 | { 12 | private readonly UnicodeRadicalStrokeCount[] _items; 13 | private int _index; 14 | 15 | /// Initializes a new instance of the struct. 16 | /// The items to enumerate. 17 | internal Enumerator(UnicodeRadicalStrokeCount[] items) 18 | { 19 | _items = items; 20 | _index = -1; 21 | } 22 | 23 | /// Does nothing. 24 | public void Dispose() { } 25 | 26 | /// Gets the element in the collection at the current position of the enumerator.. 27 | /// The element in the collection at the current position of the enumerator. 28 | public UnicodeRadicalStrokeCount Current => _items[_index]; 29 | object IEnumerator.Current => Current; 30 | 31 | /// Advances the enumerator to the next element of the collection. 32 | /// true if the enumerator was successfully advanced to the next element; false if the enumerator has passed the end of the collection. 33 | public bool MoveNext() => _index < _items.Length && ++_index < _items.Length; 34 | 35 | void IEnumerator.Reset() => _index = -1; 36 | } 37 | 38 | /// Gets an empty struct. 
39 | public static readonly UnicodeRadicalStrokeCountCollection Empty = new UnicodeRadicalStrokeCountCollection(null); 40 | 41 | private readonly UnicodeRadicalStrokeCount[] _items; 42 | 43 | internal UnicodeRadicalStrokeCountCollection(UnicodeRadicalStrokeCount[] items) 44 | => _items = items 45 | #if NETSTANDARD1_1 || NET45 46 | ?? UnicodeRadicalStrokeCount.EmptyArray; 47 | #else 48 | ?? Array.Empty(); 49 | #endif 50 | 51 | /// Gets the at the specified index. 52 | /// The . 53 | /// The index. 54 | /// The at the specified index. 55 | public UnicodeRadicalStrokeCount this[int index] => _items[index]; 56 | 57 | UnicodeRadicalStrokeCount IList.this[int index] 58 | { 59 | get => _items[index]; 60 | set => throw new NotSupportedException(); 61 | } 62 | 63 | /// Gets the number of elements contained in the . 64 | /// The number of elements contained in the . 65 | public int Count => _items.Length; 66 | 67 | bool ICollection.IsReadOnly => true; 68 | 69 | void ICollection.Add(UnicodeRadicalStrokeCount item) => throw new NotSupportedException(); 70 | void IList.Insert(int index, UnicodeRadicalStrokeCount item) => throw new NotSupportedException(); 71 | 72 | bool ICollection.Remove(UnicodeRadicalStrokeCount item) => throw new NotSupportedException(); 73 | void IList.RemoveAt(int index) => throw new NotSupportedException(); 74 | 75 | void ICollection.Clear() => throw new NotSupportedException(); 76 | 77 | /// Determines the index of a specific item in the . 78 | /// The object to locate in the . 79 | /// The index of the item if found in the list; otherwise, -1. 80 | public int IndexOf(UnicodeRadicalStrokeCount item) => Array.IndexOf(_items, item); 81 | 82 | /// Determines whether the contains a specific value. 83 | /// The object to locate in the . 84 | /// if item is fount in the ; otherwise. 
85 | public bool Contains(UnicodeRadicalStrokeCount item) => IndexOf(item) >= 0; 86 | 87 | /// Copies the elements of the UnicodeRadicalStrokeCountCollection to an , starting at a particular index. 88 | /// The one-dimensional that is the destination of the elements to copy from UnicodeRadicalStrokeCountCollection. The must have zero-based indexing. 89 | /// The zeo-based index in array at which copy begins. 90 | public void CopyTo(UnicodeRadicalStrokeCount[] array, int arrayIndex) => _items.CopyTo(array, arrayIndex); 91 | 92 | /// 93 | /// Returns an enumerator that iterates through the collection. 94 | /// 95 | /// 96 | /// A that can be used to iterate through the collection. 97 | /// 98 | public Enumerator GetEnumerator() => new Enumerator(_items); 99 | 100 | IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); 101 | IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /System.Unicode/UnicodeRationalNumber.cs: -------------------------------------------------------------------------------- 1 | using System.Globalization; 2 | 3 | namespace System.Unicode 4 | { 5 | /// Represents a rational number in a format compatible with the Unicode standard. 6 | public readonly struct UnicodeRationalNumber : IEquatable 7 | { 8 | /// Parses a rational number from a string representation. 9 | /// 10 | /// Valid text representations should match the regex pattern /-?[0-9]+(?:\/[0-9]+)/. 11 | /// The numerator part should fit in a , and the denominator part should fit in a . 12 | /// 13 | /// The string to parse. 14 | /// The rational number parsed from the string. 15 | /// The parameter is . 16 | /// The parameter is empty. 
17 | public static UnicodeRationalNumber Parse(string s) 18 | { 19 | if (s == null) throw new ArgumentNullException(nameof(s)); 20 | if (s.Length == 0) throw new ArgumentException(); 21 | 22 | int fractionBarIndex = s.IndexOf('/'); 23 | #if HAS_NATIVE_SPAN 24 | return new UnicodeRationalNumber(long.Parse(fractionBarIndex >= 0 ? s.AsSpan(0, fractionBarIndex) : s), fractionBarIndex >= 0 ? ushort.Parse(s.AsSpan(fractionBarIndex + 1)) : (byte)1); 25 | #else 26 | return new UnicodeRationalNumber(long.Parse(fractionBarIndex >= 0 ? s.Substring(0, fractionBarIndex) : s), fractionBarIndex >= 0 ? ushort.Parse(s.Substring(fractionBarIndex + 1)) : (byte)1); 27 | #endif 28 | } 29 | 30 | /// The numerator of the fraction. 31 | public readonly long Numerator; 32 | /// The denominator of the fraction. 33 | public readonly ushort Denominator; 34 | 35 | /// Initializes a new instance of the structure that represents a signed integer.. 36 | /// The number which should be represented as a rational number. 37 | public UnicodeRationalNumber(long number) 38 | { 39 | Numerator = number; 40 | Denominator = 1; 41 | } 42 | 43 | /// Initializes a new instance of the structure that represents a signed integer.. 44 | /// The number which should be used as numerator in the rational number. 45 | /// The number which should be used as denominator in the rational number. 46 | public UnicodeRationalNumber(long numerator, ushort denominator) 47 | { 48 | Numerator = numerator; 49 | Denominator = denominator; 50 | } 51 | 52 | /// Gets a value indicating whether the current value is the default value of the type. 53 | /// The default value is an invalid fraction of 0/0. 54 | public bool IsDefaultValue => Numerator == 0 && Denominator == 0; 55 | 56 | /// Creates a string representation of the current rational number. 57 | /// The created representation is culture invariant, and will be parsable by the method. 58 | public override string ToString() 59 | #if !HAS_NATIVE_SPAN 60 | => !IsDefaultValue ? 
Denominator != 1 ? Numerator.ToString() + "/" + Denominator.ToString() : Numerator.ToString() : string.Empty; 61 | #else 62 | => !IsDefaultValue ? 63 | Denominator != 1 ? 64 | FractionToString() : 65 | Numerator.ToString() : 66 | string.Empty; 67 | 68 | private string FractionToString() 69 | { 70 | Span buffer = stackalloc char[26]; 71 | 72 | Numerator.TryFormat(buffer, out int length, "D", CultureInfo.InvariantCulture); 73 | buffer[length++] = '/'; 74 | Denominator.TryFormat(buffer[length..], out int l, "D", CultureInfo.InvariantCulture); 75 | length += l; 76 | 77 | return buffer[..length].ToString(); 78 | } 79 | #endif 80 | 81 | /// Determines whether the specified rational number is equal to the current value. 82 | /// The other value to compare to the current one. 83 | /// if the two values are the same; otherwise. 84 | public bool Equals(UnicodeRationalNumber other) 85 | { 86 | // We don't consider 1/2 and 2/4 equal here, as, that wouldn't be the same character. 87 | return other.Numerator == Numerator && other.Denominator == Denominator; 88 | } 89 | 90 | /// Determines whether the specified object is equal to the current rational number. 91 | /// The object to compare to the current rational number. 92 | /// if the object represents the same rational number; otherwise. 93 | public override bool Equals(object obj) 94 | { 95 | return base.Equals(obj); 96 | } 97 | 98 | /// Returns the hash code for the current rational number. 99 | /// A 32-bit signed integer hash code. 
100 | public override int GetHashCode() 101 | { 102 | return (int)(Numerator << 8) | (Denominator) ^ (byte)(Numerator >> 56); 103 | } 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /System.Unicode/UnihanCharacterData.Generated.cs: -------------------------------------------------------------------------------- 1 | namespace System.Unicode 2 | { 3 | partial struct UnihanCharacterData 4 | { 5 | // This method densely packs code points by predicted order of importance (it may be wrong) 6 | // Its purpose is to avoid skipping numbers so that file encoding can be more efficient. 7 | internal static int PackCodePoint(int codePoint) 8 | { 9 | if (codePoint >= 0x3400) 10 | { 11 | // 3400..4DBF; CJK Unified Ideographs Extension A 12 | if (codePoint < 0x4DC0) return codePoint + 0x01E00; 13 | else if (codePoint >= 0x4E00) 14 | { 15 | // 4E00..9FFF; CJK Unified Ideographs 16 | if (codePoint < 0xA000) return codePoint - 0x04E00; 17 | else if (codePoint >= 0xF900) 18 | { 19 | // F900..FAFF; CJK Compatibility Ideographs 20 | if (codePoint < 0xFB00) return codePoint + 0x08240; 21 | else if (codePoint >= 0x20000) 22 | { 23 | // 20000..2A6DF; CJK Unified Ideographs Extension B 24 | if (codePoint < 0x2A6E0) return codePoint - 0x19440; 25 | else if (codePoint >= 0x2A700) 26 | { 27 | // 2A700..2B73F; CJK Unified Ideographs Extension C 28 | // 2B740..2B81F; CJK Unified Ideographs Extension D 29 | // 2B820..2CEAF; CJK Unified Ideographs Extension E 30 | // 2CEB0..2EBEF; CJK Unified Ideographs Extension F 31 | if (codePoint < 0x2EBF0) return codePoint - 0x19460; 32 | else if (codePoint >= 0x2F800) 33 | { 34 | // 2F800..2FA1F; CJK Compatibility Ideographs Supplement 35 | if (codePoint < 0x2FA20) return codePoint - 0x17AC0; 36 | else if (codePoint >= 0x30000) 37 | { 38 | // 30000..3134F; CJK Unified Ideographs Extension G 39 | // 31350..323AF; CJK Unified Ideographs Extension H 40 | if (codePoint < 0x323B0) return codePoint - 0x1A870; 
41 | } 42 | } 43 | } 44 | } 45 | } 46 | } 47 | } 48 | 49 | throw new ArgumentOutOfRangeException(nameof(codePoint)); 50 | } 51 | 52 | // Reverses the packing done by the PackCodePoint method. 53 | internal static int UnpackCodePoint(int packedCodePoint) 54 | { 55 | if (packedCodePoint >= 0) 56 | { 57 | // 4E00..9FFF; CJK Unified Ideographs 58 | if (packedCodePoint < 0x05200) return packedCodePoint + 0x4E00; 59 | // 3400..4DBF; CJK Unified Ideographs Extension A 60 | else if (packedCodePoint < 0x06BC0) return packedCodePoint - 0x1E00; 61 | // 20000..2A6DF; CJK Unified Ideographs Extension B 62 | else if (packedCodePoint < 0x112A0) return packedCodePoint + 0x19440; 63 | // 2A700..2B73F; CJK Unified Ideographs Extension C 64 | // 2B740..2B81F; CJK Unified Ideographs Extension D 65 | // 2B820..2CEAF; CJK Unified Ideographs Extension E 66 | // 2CEB0..2EBEF; CJK Unified Ideographs Extension F 67 | else if (packedCodePoint < 0x15790) return packedCodePoint + 0x19460; 68 | // 30000..3134F; CJK Unified Ideographs Extension G 69 | // 31350..323AF; CJK Unified Ideographs Extension H 70 | else if (packedCodePoint < 0x17B40) return packedCodePoint + 0x1A870; 71 | // F900..FAFF; CJK Compatibility Ideographs 72 | else if (packedCodePoint < 0x17D40) return packedCodePoint - 0x8240; 73 | // 2F800..2FA1F; CJK Compatibility Ideographs Supplement 74 | else if (packedCodePoint < 0x17F60) return packedCodePoint + 0x17AC0; 75 | } 76 | throw new ArgumentOutOfRangeException(nameof(packedCodePoint)); 77 | } 78 | } 79 | } 80 | 81 | -------------------------------------------------------------------------------- /System.Unicode/UnihanCharacterData.cs: -------------------------------------------------------------------------------- 1 | namespace System.Unicode 2 | { 3 | internal readonly partial struct UnihanCharacterData 4 | { 5 | public readonly int CodePoint; 6 | public readonly UnihanNumericType NumericType; 7 | public readonly long NumericValue; 8 | public readonly 
UnicodeRadicalStrokeCount[] UnicodeRadicalStrokeCounts; 9 | public readonly string Definition; 10 | public readonly string MandarinReading; 11 | public readonly string CantoneseReading; 12 | public readonly string JapaneseKunReading; 13 | public readonly string JapaneseOnReading; 14 | public readonly string KoreanReading; 15 | public readonly string HangulReading; 16 | public readonly string VietnameseReading; 17 | public readonly string SimplifiedVariant; 18 | public readonly string TraditionalVariant; 19 | 20 | internal UnihanCharacterData 21 | ( 22 | int codePoint, 23 | UnihanNumericType numericType, 24 | long numericValue, 25 | UnicodeRadicalStrokeCount[] unicodeRadicalStrokeCounts, 26 | string definition, 27 | string mandarinReading, 28 | string cantoneseReading, 29 | string japaneseKunReading, 30 | string japaneseOnReading, 31 | string koreanReading, 32 | string hangulReading, 33 | string vietnameseReading, 34 | string simplifiedVariant, 35 | string traditionalVariant 36 | ) 37 | { 38 | CodePoint = codePoint; 39 | NumericType = numericType; 40 | NumericValue = numericValue; 41 | UnicodeRadicalStrokeCounts = unicodeRadicalStrokeCounts; 42 | Definition = definition; 43 | MandarinReading = mandarinReading; 44 | CantoneseReading = cantoneseReading; 45 | JapaneseKunReading = japaneseKunReading; 46 | JapaneseOnReading = japaneseOnReading; 47 | KoreanReading = koreanReading; 48 | HangulReading = hangulReading; 49 | VietnameseReading = vietnameseReading; 50 | SimplifiedVariant = simplifiedVariant; 51 | TraditionalVariant = traditionalVariant; 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /System.Unicode/UnihanCharacterData.tt: -------------------------------------------------------------------------------- 1 | <#@ template debug="false" hostspecific="false" language="C#" #> 2 | <#@ assembly name="System.Core" #> 3 | <#@ import namespace="System.Linq" #> 4 | <#@ import namespace="System.Text" #> 5 | <#@ 
import namespace="System.Collections.Generic" #> 6 | <#@ output extension=".Generated.cs" #> 7 | <# 8 | // This file will generate the code point packing and unpacking code for unihan data. 9 | // Since Unihan data covers pretty specific code point ranges, we can rebase those ranges closer to zero in order to get a better encoding in files. 10 | // The algorithm now generated a densely packed map, as opposed to the previous handwritten code. 11 | 12 | // Declare the blocks to pack and unpack in the arbitrarily chosen order. 13 | var blocks = new UnicodeBlockList 14 | { 15 | { 0x4E00, 0x9FFF, "CJK Unified Ideographs" }, 16 | { 0x3400, 0x4DBF, "CJK Unified Ideographs Extension A" }, 17 | { 0x20000, 0x2A6DF, "CJK Unified Ideographs Extension B" }, 18 | { 0x2A700, 0x2B73F, "CJK Unified Ideographs Extension C" }, 19 | { 0x2B740, 0x2B81F, "CJK Unified Ideographs Extension D" }, 20 | { 0x2B820, 0x2CEAF, "CJK Unified Ideographs Extension E" }, 21 | { 0x2CEB0, 0x2EBEF, "CJK Unified Ideographs Extension F" }, 22 | { 0x30000, 0x3134F, "CJK Unified Ideographs Extension G" }, 23 | { 0x31350, 0x323AF, "CJK Unified Ideographs Extension H" }, 24 | { 0xF900, 0xFAFF, "CJK Compatibility Ideographs" }, 25 | { 0x2F800, 0x2FA1F, "CJK Compatibility Ideographs Supplement" }, 26 | }; 27 | 28 | // Assign the (re)base(d) index for each block. 29 | { 30 | int baseIndex = 0; 31 | foreach (var block in blocks) 32 | { 33 | block.RebasedStartIndex = baseIndex; 34 | baseIndex += block.CodePointCount; 35 | } 36 | } 37 | 38 | // Merge contiguous blocks together in order to avoid useless branches 39 | { 40 | int firstMergeIndex = -1; 41 | UnicodeBlock lastBlock = null; 42 | for (int i = 0; i < blocks.Count; i++) 43 | { 44 | var block = blocks[i]; 45 | if (lastBlock is object) 46 | { 47 | bool isContiguous = block.FirstCodePoint - lastBlock.LastCodePoint == 1; 48 | 49 | if (!isContiguous || i == blocks.Count - 1) 50 | { 51 | int blockCount = i - firstMergeIndex + (isContiguous ? 
1 : 0); 52 | 53 | if (blockCount > 1) 54 | { 55 | var mergedBlocks = blocks.GetRange(firstMergeIndex, blockCount).ToArray(); 56 | blocks[firstMergeIndex] = new UnicodeBlock(mergedBlocks); 57 | blocks.RemoveRange(firstMergeIndex + 1, blockCount - 1); 58 | } 59 | firstMergeIndex = i -= blockCount - 1; 60 | } 61 | } 62 | else 63 | { 64 | firstMergeIndex = i; 65 | } 66 | lastBlock = block; 67 | } 68 | } 69 | 70 | // Sort blocks by first code point 71 | var sortedBlocks = blocks.ToArray(); 72 | Array.Sort(sortedBlocks, (a, b) => Comparer.Default.Compare(a.FirstCodePoint, b.FirstCodePoint)); 73 | #> 74 | namespace System.Unicode 75 | { 76 | partial struct UnihanCharacterData 77 | { 78 | // This method densely packs code points by predicted order of importance (it may be wrong) 79 | // Its purpose is to avoid skipping numbers so that file encoding can be more efficient. 80 | internal static int PackCodePoint(int codePoint) 81 | { 82 | <# 83 | { 84 | int lastCodePoint = -1; 85 | int indentCount = 0; 86 | 87 | foreach (var block in sortedBlocks) 88 | { 89 | bool isContiguous = block.FirstCodePoint - lastCodePoint <= 1; 90 | 91 | if (!isContiguous) 92 | { 93 | #> 94 | <#= lastCodePoint >= 0 ? "else " : "" #>if (codePoint >= 0x<#= block.FirstCodePoint.ToString("X4") #>) 95 | { 96 | <# 97 | indentCount++; 98 | PushIndent("\t"); 99 | } 100 | 101 | foreach (var mergedBlock in block.MergedBlocks) 102 | { 103 | #> 104 | // <#= mergedBlock #> 105 | <# 106 | } 107 | 108 | int offset = block.RebasedStartIndex - block.FirstCodePoint; 109 | #> 110 | <#= isContiguous ? "else " : "" #>if (codePoint < 0x<#= (block.LastCodePoint + 1).ToString("X4") #>) return codePoint <#= offset < 0 ? 
"-" : "+" #> 0x<#= Math.Abs(offset).ToString("X5") #>; 111 | <# 112 | 113 | lastCodePoint = block.LastCodePoint; 114 | } 115 | 116 | while (indentCount-- > 0) 117 | { 118 | PopIndent(); 119 | #> 120 | } 121 | <# 122 | } 123 | } 124 | #> 125 | 126 | throw new ArgumentOutOfRangeException(nameof(codePoint)); 127 | } 128 | 129 | // Reverses the packing done by the PackCodePoint method. 130 | internal static int UnpackCodePoint(int packedCodePoint) 131 | { 132 | if (packedCodePoint >= 0) 133 | { 134 | <# 135 | { 136 | foreach (var block in blocks) 137 | { 138 | foreach (var mergedBlock in block.MergedBlocks) 139 | { 140 | #> 141 | // <#= mergedBlock #> 142 | <# 143 | } 144 | 145 | int offset = block.FirstCodePoint - block.RebasedStartIndex; 146 | #> 147 | <#= block.RebasedStartIndex > 0 ? "else " : "" #>if (packedCodePoint < 0x<#= (block.RebasedStartIndex + block.CodePointCount).ToString("X5") #>) return packedCodePoint <#= offset < 0 ? "-" : "+" #> 0x<#= Math.Abs(offset).ToString("X4") #>; 148 | <# 149 | } 150 | } 151 | #> 152 | } 153 | throw new ArgumentOutOfRangeException(nameof(packedCodePoint)); 154 | } 155 | } 156 | } 157 | 158 | <#+ 159 | class UnicodeBlock 160 | { 161 | public int FirstCodePoint { get; } 162 | public int LastCodePoint { get; } 163 | public string Name { get; } 164 | public UnicodeBlock[] MergedBlocks { get; } 165 | public int CodePointCount => LastCodePoint - FirstCodePoint + 1; 166 | public int RebasedStartIndex { get; set; } 167 | 168 | public UnicodeBlock(int firstCodePoint, int lastCodePoint, string name) 169 | => (FirstCodePoint, LastCodePoint, Name, MergedBlocks) = (firstCodePoint, lastCodePoint, name, new[] { this }); 170 | 171 | public UnicodeBlock(UnicodeBlock[] mergedBlocks) 172 | => (FirstCodePoint, LastCodePoint, Name, RebasedStartIndex, MergedBlocks) = (mergedBlocks[0].FirstCodePoint, mergedBlocks[mergedBlocks.Length - 1].LastCodePoint, "MERGED Block", mergedBlocks[0].RebasedStartIndex, mergedBlocks); 173 | 174 | public override 
string ToString() 175 | => $"{FirstCodePoint:X4}..{LastCodePoint:X4}; {Name}"; 176 | } 177 | 178 | class UnicodeBlockList : List 179 | { 180 | public void Add(int firstCodePoint, int lastCodePoint, string name) 181 | => Add(new UnicodeBlock(firstCodePoint, lastCodePoint, name)); 182 | } 183 | #> 184 | -------------------------------------------------------------------------------- /System.Unicode/UnihanFields.cs: -------------------------------------------------------------------------------- 1 | namespace System.Unicode 2 | { 3 | [Flags] 4 | internal enum UnihanFields : ushort 5 | { 6 | // NumericType / NumericValue : Not exactly a bit mask here… 7 | PrimaryNumeric = 1, 8 | AccountingNumeric = 2, 9 | OtherNumeric = 3, 10 | 11 | // UnicodeRadicalStroke : Not exactly a bit mask… 12 | UnicodeRadicalStrokeCount = 4, // Will indicate exactly one value for Unicode_Radical_Stroke. 13 | UnicodeRadicalStrokeCountTwice = 8, // Will indicate exactly two values for Unicode_Radical_Stroke. 14 | UnicodeRadicalStrokeCountMore = 12, // Will indicate three or more values for Unicode_Radical_Stroke. This combination should never happen in the current files. 15 | 16 | Definition = 16, 17 | MandarinReading = 32, 18 | CantoneseReading = 64, 19 | JapaneseKunReading = 128, 20 | JapaneseOnReading = 256, 21 | KoreanReading = 512, 22 | HangulReading = 1024, 23 | VietnameseReading = 2048, 24 | 25 | SimplifiedVariant = 4096, 26 | TraditionalVariant = 8192, 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /System.Unicode/UnihanNumericType.cs: -------------------------------------------------------------------------------- 1 | namespace System.Unicode 2 | { 3 | /// Represents the different numeric types from the Unihan database. 4 | public enum UnihanNumericType : byte 5 | { 6 | /// Indicates that there is no Unihan numeric property defined for the code point. 
7 | None = 0, 8 | /// Indicates that the propery kPrimaryNumeric is defined for this code point. 9 | /// The kPrimaryNumeric property is used for ideographs wich are standard numerals. 10 | [ValueName("kPrimaryNumeric")] 11 | Primary = 1, 12 | /// Indicates that the propery kAccountingNumeric is defined for this code point. 13 | /// The kAccountingNumeric property is used for ideographs used as accounting numerals. 14 | [ValueName("kAccountingNumeric")] 15 | Accounting = 2, 16 | /// Indicates that the propery kOtherNumeric is defined for this code point. 17 | /// The kOtherNumeric property is used for ideographs wich are used as numerals in non common contexts. 18 | [ValueName("kOtherNumeric")] 19 | Other = 3, 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /System.Unicode/ValueNameAttribute.cs: -------------------------------------------------------------------------------- 1 | namespace System.Unicode 2 | { 3 | /// Declares a name for a specific value. 4 | /// 5 | /// Since this project tries to stick to the .NET Framework naming conventions, this attribute may be used to indicate standard property names and values names where applicable. 6 | /// It may also be of use when aliases are available for a given property or value. 7 | /// 8 | [AttributeUsage(AttributeTargets.Field | AttributeTargets.Property, AllowMultiple = true)] 9 | public sealed class ValueNameAttribute : Attribute 10 | { 11 | /// The name given to the property or value. 12 | public string Name { get; } 13 | 14 | /// Initializes an instance of the class . 15 | /// The name given to the property or value on which this attribute is to be applied. 
16 | public ValueNameAttribute(string name) => Name = name; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /System.Unicode/packageIcon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hexawyz/NetUnicodeInfo/c2ab5227094d8f934b34fe6d3186c7f1e2be5e74/System.Unicode/packageIcon.png -------------------------------------------------------------------------------- /System.Unicode/ucd.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hexawyz/NetUnicodeInfo/c2ab5227094d8f934b34fe6d3186c7f1e2be5e74/System.Unicode/ucd.dat -------------------------------------------------------------------------------- /UnicodeCharacterInspector/UnicodeCharacterInspector.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hexawyz/NetUnicodeInfo/c2ab5227094d8f934b34fe6d3186c7f1e2be5e74/UnicodeCharacterInspector/UnicodeCharacterInspector.ico -------------------------------------------------------------------------------- /UnicodeCharacterInspector/UnicodeCharacterInspector.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 14 | 16 | 18 | 19 | 21 | image/svg+xml 22 | 24 | 25 | 26 | 27 | 28 | 31 | 36 | 40 | 45 | 49 | 53 | 57 | 61 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /UnicodeInformation.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.28721.148 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "System.Unicode", "System.Unicode\System.Unicode.csproj", "{CB722958-A1C4-4121-804B-7D5A671491B1}" 7 | EndProject 8 | 
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "System.Unicode.Tests", "System.Unicode.Tests\System.Unicode.Tests.csproj", "{50337426-E884-4394-9E1A-F6F7A555F5D9}" 9 | EndProject 10 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "System.Unicode.Build.Tasks", "System.Unicode.Build.Tasks\System.Unicode.Build.Tasks.csproj", "{8DFDEE6C-4F0D-4DE1-B346-574CB56D2B8B}" 11 | EndProject 12 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{73097DF3-04B7-4C5F-B4EA-0EB800E40702}" 13 | ProjectSection(SolutionItems) = preProject 14 | .editorconfig = .editorconfig 15 | azure-pipelines.yml = azure-pipelines.yml 16 | Directory.Build.props = Directory.Build.props 17 | Example.cs = Example.cs 18 | Icon.ico = Icon.ico 19 | Icon.svg = Icon.svg 20 | LICENSE.txt = LICENSE.txt 21 | README.md = README.md 22 | System.Unicode.snk = System.Unicode.snk 23 | UnicodeVersion.txt = UnicodeVersion.txt 24 | EndProjectSection 25 | EndProject 26 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "System.Unicode.Build.Core", "System.Unicode.Build.Core\System.Unicode.Build.Core.csproj", "{A872B696-86A2-4B74-9878-08CD4742338A}" 27 | EndProject 28 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "System.Unicode.Build.DatabaseGenerator", "System.Unicode.Build.DatabaseGenerator\System.Unicode.Build.DatabaseGenerator.csproj", "{723A80B0-34A9-44BA-BB2C-B6921FEEDD56}" 29 | EndProject 30 | Global 31 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 32 | Debug|Any CPU = Debug|Any CPU 33 | Release|Any CPU = Release|Any CPU 34 | EndGlobalSection 35 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 36 | {CB722958-A1C4-4121-804B-7D5A671491B1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 37 | {CB722958-A1C4-4121-804B-7D5A671491B1}.Debug|Any CPU.Build.0 = Debug|Any CPU 38 | {CB722958-A1C4-4121-804B-7D5A671491B1}.Release|Any CPU.ActiveCfg = Release|Any CPU 39 | {CB722958-A1C4-4121-804B-7D5A671491B1}.Release|Any CPU.Build.0 = Release|Any CPU 
40 | {50337426-E884-4394-9E1A-F6F7A555F5D9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 41 | {50337426-E884-4394-9E1A-F6F7A555F5D9}.Debug|Any CPU.Build.0 = Debug|Any CPU 42 | {50337426-E884-4394-9E1A-F6F7A555F5D9}.Release|Any CPU.ActiveCfg = Release|Any CPU 43 | {50337426-E884-4394-9E1A-F6F7A555F5D9}.Release|Any CPU.Build.0 = Release|Any CPU 44 | {8DFDEE6C-4F0D-4DE1-B346-574CB56D2B8B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 45 | {8DFDEE6C-4F0D-4DE1-B346-574CB56D2B8B}.Debug|Any CPU.Build.0 = Debug|Any CPU 46 | {8DFDEE6C-4F0D-4DE1-B346-574CB56D2B8B}.Release|Any CPU.ActiveCfg = Release|Any CPU 47 | {8DFDEE6C-4F0D-4DE1-B346-574CB56D2B8B}.Release|Any CPU.Build.0 = Release|Any CPU 48 | {A872B696-86A2-4B74-9878-08CD4742338A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 49 | {A872B696-86A2-4B74-9878-08CD4742338A}.Debug|Any CPU.Build.0 = Debug|Any CPU 50 | {A872B696-86A2-4B74-9878-08CD4742338A}.Release|Any CPU.ActiveCfg = Release|Any CPU 51 | {A872B696-86A2-4B74-9878-08CD4742338A}.Release|Any CPU.Build.0 = Release|Any CPU 52 | {723A80B0-34A9-44BA-BB2C-B6921FEEDD56}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 53 | {723A80B0-34A9-44BA-BB2C-B6921FEEDD56}.Debug|Any CPU.Build.0 = Debug|Any CPU 54 | {723A80B0-34A9-44BA-BB2C-B6921FEEDD56}.Release|Any CPU.ActiveCfg = Release|Any CPU 55 | {723A80B0-34A9-44BA-BB2C-B6921FEEDD56}.Release|Any CPU.Build.0 = Release|Any CPU 56 | EndGlobalSection 57 | GlobalSection(SolutionProperties) = preSolution 58 | HideSolutionNode = FALSE 59 | EndGlobalSection 60 | GlobalSection(ExtensibilityGlobals) = postSolution 61 | SolutionGuid = {B155A7AA-DB01-4F49-8985-33AC25BC4B98} 62 | EndGlobalSection 63 | EndGlobal 64 | -------------------------------------------------------------------------------- /UnicodeVersion.txt: -------------------------------------------------------------------------------- 1 | 15.0.0 2 | -------------------------------------------------------------------------------- /azure-pipelines.yml: 
# .NET Desktop
# Build and run tests for .NET Desktop or Windows classic desktop solutions.
# Add steps that publish symbols, save build artifacts, and more:
# https://docs.microsoft.com/azure/devops/pipelines/apps/windows/dot-net

# Build on every push to master.
trigger:
- master

pool:
  vmImage: 'windows-latest'

variables:
  solution: '**/*.sln'
  buildPlatform: 'Any CPU'
  buildConfiguration: 'Release'

steps:
- task: NuGetToolInstaller@1

- task: NuGetCommand@2
  inputs:
    restoreSolution: '$(solution)'

- task: VSBuild@1
  inputs:
    solution: '$(solution)'
    platform: '$(buildPlatform)'
    configuration: '$(buildConfiguration)'
    # The standard MSBuild property is spelled "ContinuousIntegrationBuild"; the previous value
    # ("ContiniousIntegrationBuild") is not a property the SDK / Source Link recognize.
    # NOTE(review): if any project or props file tests the old misspelled name, update it as well.
    msbuildArgs: '/p:ContinuousIntegrationBuild=true'

- task: VSTest@2
  inputs:
    platform: '$(buildPlatform)'
    configuration: '$(buildConfiguration)'
    testSelector: 'testAssemblies'
    testAssemblyVer2: |
      **\*.Tests.dll
      !**\obj\**
      !**\ref\**
    searchFolder: '$(System.DefaultWorkingDirectory)'
    runOnlyImpactedTests: false

# Collect the NuGet packages (and symbol packages) produced by the System.Unicode project.
- task: CopyFiles@2
  inputs:
    SourceFolder: '$(Build.SourcesDirectory)'
    Contents: 'System.Unicode/bin/$(buildConfiguration)/?(*.nupkg|*.snupkg)'
    TargetFolder: '$(Build.ArtifactStagingDirectory)'

- task: PublishBuildArtifacts@1
  inputs:
    PathtoPublish: '$(Build.ArtifactStagingDirectory)'
    ArtifactName: 'NuGet'
    publishLocation: 'Container'