├── .editorconfig
├── .gitattributes
├── .gitignore
├── Directory.Build.props
├── Example.cs
├── Icon.ico
├── Icon.svg
├── LICENSE.txt
├── README.md
├── System.Unicode.Build.Core
├── BinaryWriterExtensions.cs
├── CharExtensions.cs
├── CharacterDecompositionMapping.cs
├── DataSourceProvider.cs
├── DataSources
│ ├── FileDataSource.cs
│ ├── HttpDataSource.cs
│ ├── InMemoryDataSource.cs
│ └── ZipDataSource.cs
├── EnumHelper.cs
├── HexCodePoint.cs
├── IDataSource.cs
├── System.Unicode.Build.Core.csproj
├── UnicodeCharacterDataBuilder.cs
├── UnicodeDataFileReader.cs
├── UnicodeDataProcessor.cs
├── UnicodeDatabaseGenerator.cs
├── UnicodeInfoBuilder.cs
├── UnihanCharacterDataBuilder.cs
├── UnihanDataFileReader.cs
├── UnihanProperty.cs
├── UnihanProperty.tt
└── Utf8Buffer.cs
├── System.Unicode.Build.DatabaseGenerator
├── Program.cs
├── Properties
│ └── launchSettings.json
└── System.Unicode.Build.DatabaseGenerator.csproj
├── System.Unicode.Build.Tasks
├── AsyncTask.cs
├── GenerateUnicodeDatabase.cs
├── GetUnicodeDatabaseVersion.cs
├── System.Unicode.Build.Tasks.csproj
├── System.Unicode.Build.Tasks.props
└── System.Unicode.Build.Tasks.targets
├── System.Unicode.Tests
├── CodePointEnumerableTests.cs
├── ImportRequestedUnicodeVersion.targets
├── PermissiveCodePointEnumerableTests.cs
├── System.Unicode.Tests.csproj
├── UnicodeCodePointRangeTests.cs
├── UnicodeInfoTests.cs
├── UnicodeRationalNumerTests.cs
├── UnihanCharacterDataTests.cs
└── XUnitSerializableString.cs
├── System.Unicode.snk
├── System.Unicode
├── BidirectionalClass.cs
├── CanonicalCombiningClass.cs
├── CjkRadicalData.cs
├── CjkRadicalInfo.cs
├── CodePointEnumerable.cs
├── CodePointEnumerator.cs
├── CompatibilityFormattingTag.cs
├── ContributoryProperties.cs
├── CoreProperties.cs
├── EmojiProperties.cs
├── EnumHelper.cs
├── GenerateUnicodeDatabase.proj
├── HangulInfo.cs
├── PermissiveCodePointEnumerable.cs
├── PermissiveCodePointEnumerator.cs
├── StringExtensions.cs
├── System.Unicode.csproj
├── UcdFields.cs
├── UnicodeBlock.cs
├── UnicodeCategoryExtensions.cs
├── UnicodeCategoryInfo.cs
├── UnicodeCharInfo.cs
├── UnicodeCharacterData.cs
├── UnicodeCodePointRange.cs
├── UnicodeCrossReferenceCollection.cs
├── UnicodeData.cs
├── UnicodeInfo.cs
├── UnicodeNameAlias.cs
├── UnicodeNameAliasCollection.cs
├── UnicodeNameAliasKind.cs
├── UnicodeNumericType.cs
├── UnicodeRadicalStrokeCount.cs
├── UnicodeRadicalStrokeCountCollection.cs
├── UnicodeRationalNumber.cs
├── UnihanCharacterData.Generated.cs
├── UnihanCharacterData.cs
├── UnihanCharacterData.tt
├── UnihanFields.cs
├── UnihanNumericType.cs
├── ValueNameAttribute.cs
├── packageIcon.png
└── ucd.dat
├── UnicodeCharacterInspector
├── UnicodeCharacterInspector.ico
└── UnicodeCharacterInspector.svg
├── UnicodeInformation.sln
├── UnicodeVersion.txt
└── azure-pipelines.yml
/.gitattributes:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Set default behavior to automatically normalize line endings.
3 | ###############################################################################
4 | * text=auto
5 |
6 | ###############################################################################
7 | # Set default behavior for command prompt diff.
8 | #
9 | # This is needed for earlier builds of msysgit that do not have it on by
10 | # default for csharp files.
11 | # Note: This is only used by command line
12 | ###############################################################################
13 | #*.cs diff=csharp
14 |
15 | ###############################################################################
16 | # Set the merge driver for project and solution files
17 | #
18 | # Merging from the command prompt will add diff markers to the files if there
19 | # are conflicts (Merging from VS is not affected by the settings below, in VS
20 | # the diff markers are never inserted). Diff markers may cause the following
21 | # file extensions to fail to load in VS. An alternative would be to treat
22 | # these files as binary and thus will always conflict and require user
23 | # intervention with every merge. To do so, just uncomment the entries below
24 | ###############################################################################
25 | #*.sln merge=binary
26 | #*.csproj merge=binary
27 | #*.vbproj merge=binary
28 | #*.vcxproj merge=binary
29 | #*.vcproj merge=binary
30 | #*.dbproj merge=binary
31 | #*.fsproj merge=binary
32 | #*.lsproj merge=binary
33 | #*.wixproj merge=binary
34 | #*.modelproj merge=binary
35 | #*.sqlproj merge=binary
36 | #*.wwaproj merge=binary
37 |
38 | ###############################################################################
39 | # behavior for image files
40 | #
41 | # image files are treated as binary by default.
42 | ###############################################################################
43 | #*.jpg binary
44 | #*.png binary
45 | #*.gif binary
46 |
47 | ###############################################################################
48 | # diff behavior for common document formats
49 | #
50 | # Convert binary document formats to text before diffing them. This feature
51 | # is only available from the command line. Turn it on by uncommenting the
52 | # entries below.
53 | ###############################################################################
54 | #*.doc diff=astextplain
55 | #*.DOC diff=astextplain
56 | #*.docx diff=astextplain
57 | #*.DOCX diff=astextplain
58 | #*.dot diff=astextplain
59 | #*.DOT diff=astextplain
60 | #*.pdf diff=astextplain
61 | #*.PDF diff=astextplain
62 | #*.rtf diff=astextplain
63 | #*.RTF diff=astextplain
64 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 |
4 | # User-specific files
5 | *.suo
6 | *.user
7 | *.sln.docstates
8 |
9 | # Build results
10 | [Dd]ebug/
11 | [Dd]ebugPublic/
12 | [Rr]elease/
13 | x64/
14 | build/
15 | bld/
16 | [Bb]in/
17 | [Oo]bj/
18 |
19 | # Roslyn cache directories
20 | *.ide/
21 | .vs/
22 |
23 | # MSTest test Results
24 | [Tt]est[Rr]esult*/
25 | [Bb]uild[Ll]og.*
26 |
27 | #NUNIT
28 | *.VisualState.xml
29 | TestResult.xml
30 |
31 | # Build Results of an ATL Project
32 | [Dd]ebugPS/
33 | [Rr]eleasePS/
34 | dlldata.c
35 |
36 | *_i.c
37 | *_p.c
38 | *_i.h
39 | *.ilk
40 | *.meta
41 | *.obj
42 | *.pch
43 | *.pdb
44 | *.pgc
45 | *.pgd
46 | *.rsp
47 | *.sbr
48 | *.tlb
49 | *.tli
50 | *.tlh
51 | *.tmp
52 | *.tmp_proj
53 | *.log
54 | *.vspscc
55 | *.vssscc
56 | .builds
57 | *.pidb
58 | *.svclog
59 | *.scc
60 |
61 | # Chutzpah Test files
62 | _Chutzpah*
63 |
64 | # Visual C++ cache files
65 | ipch/
66 | *.aps
67 | *.ncb
68 | *.opensdf
69 | *.sdf
70 | *.cachefile
71 |
72 | # Visual Studio profiler
73 | *.psess
74 | *.vsp
75 | *.vspx
76 |
77 | # TFS 2012 Local Workspace
78 | $tf/
79 |
80 | # Guidance Automation Toolkit
81 | *.gpState
82 |
83 | # ReSharper is a .NET coding add-in
84 | _ReSharper*/
85 | *.[Rr]e[Ss]harper
86 | *.DotSettings.user
87 |
88 | # JustCode is a .NET coding add-in
89 | .JustCode
90 |
91 | # TeamCity is a build add-in
92 | _TeamCity*
93 |
94 | # DotCover is a Code Coverage Tool
95 | *.dotCover
96 |
97 | # NCrunch
98 | _NCrunch_*
99 | .*crunch*.local.xml
100 |
101 | # MightyMoose
102 | *.mm.*
103 | AutoTest.Net/
104 |
105 | # Web workbench (sass)
106 | .sass-cache/
107 |
108 | # Installshield output folder
109 | [Ee]xpress/
110 |
111 | # DocProject is a documentation generator add-in
112 | DocProject/buildhelp/
113 | DocProject/Help/*.HxT
114 | DocProject/Help/*.HxC
115 | DocProject/Help/*.hhc
116 | DocProject/Help/*.hhk
117 | DocProject/Help/*.hhp
118 | DocProject/Help/Html2
119 | DocProject/Help/html
120 |
121 | # Click-Once directory
122 | publish/
123 |
124 | # Publish Web Output
125 | *.[Pp]ublish.xml
126 | *.azurePubxml
127 | ## TODO: Comment the next line if you want to checkin your
128 | ## web deploy settings but do note that will include unencrypted
129 | ## passwords
130 | #*.pubxml
131 |
132 | # NuGet Packages Directory
133 | packages/*
134 | ## TODO: If the tool you use requires repositories.config
135 | ## uncomment the next line
136 | #!packages/repositories.config
137 |
138 | # Enable "build/" folder in the NuGet Packages folder since
139 | # NuGet packages use it for MSBuild targets.
140 | # This line needs to be after the ignore of the build folder
141 | # (and the packages folder if the line above has been uncommented)
142 | !packages/build/
143 |
144 | # Windows Azure Build Output
145 | csx/
146 | *.build.csdef
147 |
148 | # Windows Store app package directory
149 | AppPackages/
150 |
151 | # Others
152 | sql/
153 | *.Cache
154 | ClientBin/
155 | [Ss]tyle[Cc]op.*
156 | ~$*
157 | *~
158 | *.dbmdl
159 | *.dbproj.schemaview
160 | *.pfx
161 | *.publishsettings
162 | node_modules/
163 |
164 | # RIA/Silverlight projects
165 | Generated_Code/
166 |
167 | # Backup & report files from converting an old project file
168 | # to a newer Visual Studio version. Backup files are not needed,
169 | # because we have git ;-)
170 | _UpgradeReport_Files/
171 | Backup*/
172 | UpgradeLog*.XML
173 | UpgradeLog*.htm
174 |
175 | # SQL Server files
176 | *.mdf
177 | *.ldf
178 |
179 | # Business Intelligence projects
180 | *.rdl.data
181 | *.bim.layout
182 | *.bim_*.settings
183 |
184 | # Microsoft Fakes
185 | FakesAssemblies/
186 |
187 | # LightSwitch generated files
188 | GeneratedArtifacts/
189 | _Pvt_Extensions/
190 | ModelManifest.xml
191 |
--------------------------------------------------------------------------------
/Directory.Build.props:
--------------------------------------------------------------------------------
1 |
2 |
3 | 9.0
4 | .NET Unicode Information
5 | Fabien Barbier
6 | Copyright © Fabien Barbier 2014-2019
7 | en
8 | 2.7.1
9 | 2.7.1
10 |
11 |
12 | true
13 | $(MSBuildThisFileDirectory)System.Unicode.snk
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/Example.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Text;
3 | using System.Unicode;
4 |
5 | namespace Example
6 | {
7 | internal static class Program // Sample console program printing information about a few code points.
8 | {
9 | private static void Main()
10 | {
11 | Console.OutputEncoding = Encoding.Unicode; // Console output is switched to UTF-16 before any character is printed.
12 | PrintCodePointInfo('A'); // The char literal is implicitly converted to its int code point.
13 | PrintCodePointInfo('∞');
14 | PrintCodePointInfo(0x1F600); // A code point above U+FFFF, passed directly as an integer.
15 | }
16 |
17 | private static void PrintCodePointInfo(int codePoint) // Prints, one per line: display text, U+XXXX notation, name (or old name), and category.
18 | {
19 | var charInfo = UnicodeInfo.GetCharInfo(codePoint);
20 | Console.WriteLine(UnicodeInfo.GetDisplayText(charInfo));
21 | Console.WriteLine("U+" + codePoint.ToString("X4")); // Hexadecimal, zero-padded to at least four digits.
22 | Console.WriteLine(charInfo.Name ?? charInfo.OldName); // Falls back to OldName when Name is null.
23 | Console.WriteLine(charInfo.Category);
24 | }
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/Icon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hexawyz/NetUnicodeInfo/c2ab5227094d8f934b34fe6d3186c7f1e2be5e74/Icon.ico
--------------------------------------------------------------------------------
/Icon.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
67 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2014 Fabien Barbier
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # .NET Unicode Information Library
2 |
3 | [Build status](https://dev.azure.com/goldencrystal/UnicodeInformation/_build/latest?definitionId=1&branchName=master)
4 |
5 | ## Summary
6 |
7 | This project consists of a library that provides access to some of the data contained in the Unicode Character Database.
8 |
9 | ## Version of Unicode supported
10 |
11 | Unicode 13.0
12 | Emoji 13.0
13 |
14 | ## Breaking changes from versions 1.x to 2.x
15 |
16 | UnicodeRadicalStrokeCount.StrokeCount is now of type System.SByte instead of type System.Byte.
17 |
18 | ## Using the library
19 |
20 | ### Reference the NuGet package
21 |
22 | Grab the latest version of the package on NuGet: https://www.nuget.org/packages/UnicodeInformation/.
23 | Once the library is installed in your project, you will find everything you need in the System.Unicode namespace.
24 |
25 | ### Basic information
26 |
27 | Everything provided by the library will be under the namespace `System.Unicode`.
28 | XML documentation should be complete enough so that you can navigate the API without getting lost.
29 |
30 | In its current state, the project is written in C# 7.3, compilable by [Roslyn](http://roslyn.codeplex.com/), and targets both .NET Standard 2.0 and .NET Standard 1.1.
31 | The library UnicodeInformation includes a (large) subset of the official [Unicode Character Database](http://www.unicode.org/Public/UCD/latest/) stored in a custom file format.
32 |
33 | ### Example usage
34 |
35 | The following program will display information about a few characters:
36 |
37 | ```csharp
38 | using System;
39 | using System.Text;
40 | using System.Unicode;
41 |
42 | namespace Example
43 | {
44 | internal static class Program
45 | {
46 | private static void Main()
47 | {
48 | Console.OutputEncoding = Encoding.Unicode;
49 | PrintCodePointInfo('A');
50 | PrintCodePointInfo('∞');
51 | PrintCodePointInfo(0x1F600);
52 | }
53 |
54 | private static void PrintCodePointInfo(int codePoint)
55 | {
56 | var charInfo = UnicodeInfo.GetCharInfo(codePoint);
57 | Console.WriteLine(UnicodeInfo.GetDisplayText(charInfo));
58 | Console.WriteLine("U+" + codePoint.ToString("X4"));
59 | Console.WriteLine(charInfo.Name ?? charInfo.OldName);
60 | Console.WriteLine(charInfo.Category);
61 | }
62 | }
63 | }
64 | ```
65 |
66 | Explanations:
67 |
68 | * `UnicodeInfo.GetCharInfo(int)` returns a structure `UnicodeCharInfo` that provides access to various bits of information associated with the specified code point.
69 | * `UnicodeInfo.GetDisplayText(UnicodeCharInfo)` is a helper method that computes a display text for the specified code point.
70 | Since some code points are not designed to be displayed in a standalone fashion, this will try to make the specified character more displayable.
71 | The algorithm used to provide a display text is quite simplistic, and will only affect very specific code points. (e.g. Control Characters)
72 | For most code points, this will simply return the direct string representation.
73 | * `UnicodeCharInfo.Name` returns the name of the code point as specified by the Unicode standard.
74 | Please note that some characters will, by design, not have any name assigned to them in the standard. (e.g. control characters)
75 | Those characters, however, may have alternate names assigned to them that you can use as fallbacks. (e.g. `UnicodeCharInfo.OldName`)
76 | * `UnicodeCharInfo.OldName` returns the name of the character as defined in Unicode 1.0, when applicable and different from the current name.
77 | * `UnicodeCharInfo.Category` returns the category assigned to the specified code point.
78 |
79 |
80 | ### Included Properties
81 |
82 | #### From UCD
83 | * Name
84 | * General_Category
85 | * Canonical_Combining_Class
86 | * Bidi_Class
87 | * Decomposition_Type
88 | * Decomposition_Mapping
89 | * Numeric_Type (See also kAccountingNumeric/kOtherNumeric/kPrimaryNumeric. Those will set Numeric_Type to Numeric.)
90 | * Numeric_Value
91 | * Bidi_Mirrored
92 | * Unicode_1_Name
93 | * Simple_Uppercase_Mapping
94 | * Simple_Lowercase_Mapping
95 | * Simple_Titlecase_Mapping
96 | * Name_Alias
97 | * Block
98 | * ASCII_Hex_Digit
99 | * Bidi_Control
100 | * Dash
101 | * Deprecated
102 | * Diacritic
103 | * Extender
104 | * Hex_Digit
105 | * Hyphen
106 | * Ideographic
107 | * IDS_Binary_Operator
108 | * IDS_Trinary_Operator
109 | * Join_Control
110 | * Logical_Order_Exception
111 | * Noncharacter_Code_Point
112 | * Other_Alphabetic
113 | * Other_Default_Ignorable_Code_Point
114 | * Other_Grapheme_Extend
115 | * Other_ID_Continue
116 | * Other_ID_Start
117 | * Other_Lowercase
118 | * Other_Math
119 | * Other_Uppercase
120 | * Pattern_Syntax
121 | * Pattern_White_Space
122 | * Quotation_Mark
123 | * Radical
124 | * Soft_Dotted
125 | * STerm
126 | * Terminal_Punctuation
127 | * Unified_Ideograph
128 | * Variation_Selector
129 | * White_Space
130 | * Lowercase
131 | * Uppercase
132 | * Cased
133 | * Case_Ignorable
134 | * Changes_When_Lowercased
135 | * Changes_When_Uppercased
136 | * Changes_When_Titlecased
137 | * Changes_When_Casefolded
138 | * Changes_When_Casemapped
139 | * Alphabetic
140 | * Default_Ignorable_Code_Point
141 | * Grapheme_Base
142 | * Grapheme_Extend
143 | * Grapheme_Link
144 | * Math
145 | * ID_Start
146 | * ID_Continue
147 | * XID_Start
148 | * XID_Continue
149 | * Unicode_Radical_Stroke (This is actually kRSUnicode from the Unihan database)
150 | * Code point cross references extracted from NamesList.txt
151 |
152 | NB: The UCD property ISO_Comment will never be included since this one is empty in all new Unicode versions.
153 |
154 | #### From Unicode Emoji
155 |
156 | * Emoji
157 | * Emoji_Presentation
158 | * Emoji_Modifier
159 | * Emoji_Modifier_Base
160 | * Emoji_Component
161 | * Extended_Pictographic
162 |
163 | #### From Unihan
164 | * kAccountingNumeric
165 | * kOtherNumeric
166 | * kPrimaryNumeric
167 | * kRSUnicode
168 | * kDefinition
169 | * kMandarin
170 | * kCantonese
171 | * kJapaneseKun
172 | * kJapaneseOn
173 | * kKorean
174 | * kHangul
175 | * kVietnamese
176 | * kSimplifiedVariant
177 | * kTraditionalVariant
178 |
179 | ### Regenerating the data
180 | The project UnicodeInformation.Builder takes care of generating a file named ucd.dat. This file contains Unicode data compressed by .NET's deflate algorithm, and should be included in UnicodeInformation.dll at compilation.
181 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/BinaryWriterExtensions.cs:
--------------------------------------------------------------------------------
1 | using System.IO;
2 | using System.Text;
3 |
4 | namespace System.Unicode.Build.Core
5 | {
6 | public static class BinaryWriterExtensions
7 | {
8 | public static void WriteUInt24(this BinaryWriter writer, int value)
9 | {
10 | // Only values that fit on 24 bits are accepted; the three bytes are written least significant first.
11 | if ((value & ~0xFFFFFF) != 0) throw new ArgumentOutOfRangeException(nameof(value));
12 | writer.Write((byte)(value & 0xFF));
13 | writer.Write((byte)((value >> 8) & 0xFF));
14 | writer.Write((byte)((value >> 16) & 0xFF));
15 | }
16 |
17 | public static void WriteVariableUInt64(this BinaryWriter writer, ulong value)
18 | {
19 | // Variable-length encoding: 7-bit groups, least significant group first.
20 | // Bit 7 of each byte is set when at least one more group follows.
21 | while (value >= 0x80)
22 | {
23 | writer.Write((byte)((value & 0x7F) | 0x80));
24 | value >>= 7;
25 | }
26 |
27 | // What remains fits on 7 bits, so the final byte has its continuation bit clear.
28 | writer.Write((byte)value);
29 | }
30 |
31 | /// <summary>Writes a code point in a custom, but compact encoding.</summary>
32 | /// <remarks>
33 | /// Unlike UTF-8, this encoding will consume at most 3 bytes.
34 | /// It can store values between 0x0 and 0x20409F, although nothing above 0x10FFFF is a valid code point.
35 | /// </remarks>
36 | /// <param name="writer">The binary writer to use.</param>
37 | /// <param name="value">The value to write, between 0x0 and 0x20409F inclusive; anything else throws ArgumentOutOfRangeException.</param>
38 | public static void WriteCodePoint(this BinaryWriter writer, int value)
39 | {
40 | if (value < 0 || value > 0x20409F) throw new ArgumentOutOfRangeException(nameof(value)); // 0x40A0 + 0x1FFFFF: the 3-byte lead byte only has 5 payload bits.
41 |
42 | if (value < 0xA0) writer.Write((byte)value); // One byte, 0x00-0x9F: the value itself.
43 | else if (value < 0x20A0)
44 | {
45 | value -= 0xA0; // 13 bits remain: 5 go in the lead byte (0xA0-0xBF), 8 in the second byte.
46 | writer.Write((byte)((byte)(value >> 8) | 0xA0));
47 | writer.Write((byte)value);
48 | }
49 | else if (value < 0x40A0)
50 | {
51 | value -= 0x20A0; // Same layout as above, with lead bytes 0xC0-0xDF.
52 | writer.Write((byte)((byte)(value >> 8) | 0xC0));
53 | writer.Write((byte)value);
54 | }
55 | else
56 | {
57 | value -= 0x40A0; // 21 bits remain: 5 go in the lead byte (0xE0-0xFF), 16 in the next two bytes.
58 | writer.Write((byte)((byte)(value >> 16) | 0xE0));
59 | writer.Write((byte)(value >> 8));
60 | writer.Write((byte)value);
61 | }
62 | }
63 |
64 | /// <summary>Writes a character name alias.</summary>
65 | /// <remarks>We assume that character names will not exceed 64 bytes in length; this method does not check it.</remarks>
66 | /// <param name="writer">The writer to use.</param>
67 | /// <param name="nameAlias">The name alias value to write.</param>
68 | public static void WriteNameAliasToFile(this BinaryWriter writer, UnicodeNameAlias nameAlias)
69 | {
70 | writer.Write(nameAlias.Name); // NOTE(review): presumably the length-prefixed BinaryWriter.Write(string) overload; confirm that Name is a string.
71 | writer.Write((byte)nameAlias.Kind); // The alias kind is stored on a single byte.
72 | }
73 |
74 | /// <summary>Writes a character name as one length byte (UTF-8 byte count minus one) followed by the UTF-8 bytes.</summary>
75 | /// <remarks>We assume that character names will not exceed 128 bytes in length. An empty name cannot be represented by this format.</remarks>
76 | /// <param name="writer">The writer to use.</param>
77 | /// <param name="name">The name to write; InvalidOperationException is thrown when its UTF-8 form is longer than 128 bytes.</param>
78 | public static void WriteNamePropertyToFile(this BinaryWriter writer, string name)
79 | {
80 | var bytes = Encoding.UTF8.GetBytes(name);
81 | if (bytes.Length > 128) throw new InvalidOperationException("Did not expect UTF-8 encoded name to be longer than 128 bytes.");
82 | writer.Write((byte)(bytes.Length - 1)); // Counts encoded bytes, not UTF-16 chars. The most significant bit will always be cleared, because it will be used for other cases.
83 | writer.Write(bytes);
84 | }
85 |
86 | /// <summary>Writes a single byte holding a 6-bit length together with two extra bits.</summary>
87 | /// <remarks>Both parameters have a restricted range; a value outside of it throws ArgumentOutOfRangeException.</remarks>
88 | /// <param name="writer">The writer used to perform the operation.</param>
89 | /// <param name="extraBits">The value of the two extra bits, between 0 and 3.</param>
90 | /// <param name="length">The length to write, between 1 and 64.</param>
91 | public static void WritePackedLength(this BinaryWriter writer, byte extraBits, int length)
92 | {
93 | if ((extraBits & ~0x03) != 0) throw new ArgumentOutOfRangeException(nameof(extraBits));
94 | if (length <= 0 || length >= 65) throw new ArgumentOutOfRangeException(nameof(length));
95 | // The two extra bits land in bits 7-6, and (length - 1) fills bits 5-0.
96 | writer.Write((byte)(extraBits * 0x40 + (length - 1)));
97 | }
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/CharExtensions.cs:
--------------------------------------------------------------------------------
1 | namespace System.Unicode.Build.Core
2 | {
3 | public static class CharExtensions
4 | {
5 | public static bool IsHexDigit(this char c) // True only for the ASCII digits 0-9 and the letters A-F / a-f.
6 | => (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
7 | }
8 | }
9 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/CharacterDecompositionMapping.cs:
--------------------------------------------------------------------------------
1 | namespace System.Unicode.Build.Core
2 | {
3 | public struct CharacterDecompositionMapping // Decomposition type and decomposition mapping of a character.
4 | {
5 | public readonly CompatibilityFormattingTag DecompositionType;
6 | public readonly string DecompositionMapping;
7 |
8 | public CharacterDecompositionMapping(CompatibilityFormattingTag decompositionType, string decompositionMapping)
9 | {
10 | DecompositionType = decompositionType;
11 | DecompositionMapping = decompositionMapping;
12 | }
13 | // Parses an optional <tag> followed by space-separated code points; null or empty text gives the default value, malformed text throws FormatException.
14 | public static unsafe CharacterDecompositionMapping Parse(string s)
15 | {
16 | if (string.IsNullOrEmpty(s)) return default;
17 |
18 | var tag = CompatibilityFormattingTag.Canonical; // Used when the text does not start with a <tag>.
19 |
20 | int index;
21 |
22 | if (s[0] == '<')
23 | {
24 | if ((index = s.IndexOf('>')) < 0 || !EnumHelper.TryGetNamedValue(s.Substring(1, index - 1), out tag)) // An unterminated tag is a format error too.
25 | throw new FormatException();
26 | ++index;
27 | }
28 | else
29 | {
30 | index = 0;
31 | }
32 |
33 | var buffer = stackalloc char[36]; // From the Unicode docs, a decomposition cannot have more than 18 code points.
34 | int charIndex = 0;
35 |
36 | while (index < s.Length && charIndex < 35) // Always leaves room for a surrogate pair; input that does not fit in the buffer is ignored.
37 | {
38 | char c = s[index];
39 |
40 | if (c == ' ') ++index;
41 | else
42 | {
43 | int codePoint = HexCodePoint.Parse(s, ref index); // NOTE(review): presumably advances index past the parsed digits; confirm in HexCodePoint.
44 |
45 | if (codePoint < 0x10000)
46 | buffer[charIndex++] = (char)codePoint;
47 | else if (codePoint <= 0x10FFFF) // U+10FFFF is the last valid code point and must be accepted.
48 | {
49 | codePoint -= 0x10000; // Split the remaining 20 bits into a high and a low surrogate.
50 | buffer[charIndex++] = (char)((codePoint >> 10) + 0xD800);
51 | buffer[charIndex++] = (char)((codePoint & 0x3FF) + 0xDC00);
52 | }
53 | else
54 | {
55 | throw new FormatException("The code point was outside of the allowed range.");
56 | }
57 | }
58 | }
59 |
60 | return new CharacterDecompositionMapping(tag, new string(buffer, 0, charIndex));
61 | }
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/DataSourceProvider.cs:
--------------------------------------------------------------------------------
1 | using System.IO;
2 | using System.IO.Compression;
3 | using System.Linq;
4 | using System.Net.Http;
5 | using System.Threading.Tasks;
6 | using System.Unicode.Build.Core.DataSources;
7 |
8 | namespace System.Unicode.Build.Core
9 | {
10 | public static class DataSourceProvider
11 | {
12 | public static readonly Uri UnicodeCharacterDataUri = new Uri("http://www.unicode.org/Public/UCD/latest/ucd/", UriKind.Absolute);
13 | public static readonly Uri UcdEmojiDataUri = new Uri("http://www.unicode.org/Public/UCD/latest/ucd/emoji/", UriKind.Absolute);
14 | public static readonly Uri EmojiDataUri = new Uri("http://www.unicode.org/Public/emoji/latest/", UriKind.Absolute);
15 |
16 | public const string UnihanDataSourceName = "Unihan";
17 | public const string UcdDataSourceName = "UCD";
18 | public const string EmojiDataSourceName = "Emoji";
19 |
20 | public static readonly string[] UcdRequiredFiles = new[]
21 | {
22 | "UnicodeData.txt",
23 | "PropList.txt",
24 | "DerivedCoreProperties.txt",
25 | "CJKRadicals.txt",
26 | //"Jamo.txt", // Not used right now, as the hangul syllable algorithm implementation takes care of this.
27 | "NameAliases.txt",
28 | "NamesList.txt",
29 | "Blocks.txt",
30 | };
31 |
32 | public static readonly string[] UcdEmojiRequiredFiles = new[]
33 | {
34 | "emoji-data.txt",
35 | "emoji-variation-sequences.txt",
36 | };
37 |
38 | public static readonly string[] UnihanRequiredFiles = new[]
39 | {
40 | "Unihan_NumericValues.txt",
41 | "Unihan_Readings.txt",
42 | "Unihan_Variants.txt",
43 | "Unihan_IRGSources.txt",
44 | };
45 |
46 | public static readonly string[] EmojiRequiredFiles = new[]
47 | {
48 | //"emoji-data.txt",
49 | "emoji-sequences.txt",
50 | //"emoji-variation-sequences.txt",
51 | "emoji-zwj-sequences.txt",
52 | };
53 |
54 | private static Task DownloadDataFileAsync(HttpClient httpClient, Uri baseUri, string archiveName) // Downloads the resource named archiveName, resolved against baseUri, as a byte array.
55 | => httpClient.GetByteArrayAsync(new Uri(baseUri, archiveName));
56 |
57 | public static async Task GetDataSourceAsync(HttpClient httpClient, Uri baseUri, string baseDirectory, string dataSourceName, string[] requiredFiles, bool useArchive, bool? shouldDownload, bool? shouldSaveFiles, bool? shouldExtract)
58 | {
59 | string dataDirectory = Path.GetFullPath(Path.Combine(baseDirectory, dataSourceName));
60 | string dataArchiveFileName = dataSourceName + ".zip";
61 | string dataArchivePath = dataDirectory + ".zip";
62 | // Unless a download is forced (shouldDownload == true), look for usable local data first.
63 | if (shouldDownload != true)
64 | {
65 | bool hasValidDirectory = Directory.Exists(dataDirectory);
66 |
67 | if (hasValidDirectory)
68 | {
69 | foreach (string requiredFile in requiredFiles) // The directory is only usable when every required file is present.
70 | {
71 | if (!File.Exists(Path.Combine(dataDirectory, requiredFile)))
72 | {
73 | hasValidDirectory = false;
74 | break;
75 | }
76 | }
77 | }
78 |
79 | if (hasValidDirectory)
80 | {
81 | return new FileDataSource(dataDirectory);
82 | }
83 |
84 | if (useArchive && File.Exists(dataArchivePath))
85 | {
86 | if (shouldExtract == true)
87 | {
88 | ZipFile.ExtractToDirectory(dataArchivePath, dataDirectory);
89 | return new FileDataSource(dataDirectory);
90 | }
91 | else
92 | {
93 | return new ZipDataSource(File.OpenRead(dataArchivePath));
94 | }
95 | }
96 | }
97 | // Download unless explicitly forbidden (shouldDownload == false); the data stays in memory unless shouldSaveFiles is true.
98 | if (shouldDownload != false)
99 | {
100 | if (useArchive)
101 | {
102 | var dataArchiveData = await DownloadDataFileAsync(httpClient, baseUri, dataArchiveFileName).ConfigureAwait(false);
103 |
104 | if (shouldSaveFiles == true)
105 | {
106 | // No using block: on success the stream belongs to the ZipDataSource or ZipArchive created below, which close it when disposed.
107 | var stream = File.Open(dataArchivePath, FileMode.Create, FileAccess.ReadWrite, FileShare.Read);
108 | try
109 | {
110 | await stream.WriteAsync(dataArchiveData, 0, dataArchiveData.Length).ConfigureAwait(false);
111 | dataArchiveData = null; // Release the reference now, since we won't need it anymore.
112 |
113 | if (shouldExtract != true) return new ZipDataSource(stream); // The data source reads from this stream later, so it must stay open.
114 | using (var archive = new ZipArchive(stream, ZipArchiveMode.Read, false))
115 | {
116 | archive.ExtractToDirectory(dataDirectory);
117 | return new FileDataSource(dataDirectory);
118 | }
119 | }
120 | catch
121 | {
122 | stream.Dispose(); // Do not leak the file handle when writing or opening the archive fails.
123 | throw;
124 | }
125 | }
126 | else
127 | {
128 | return new ZipDataSource(new MemoryStream(dataArchiveData));
129 | }
130 | }
131 | else
132 | {
133 | var downloadedFiles = await Task.WhenAll
134 | (
135 | Array.ConvertAll
136 | (
137 | requiredFiles,
138 | async requiredFile =>
139 | (
140 | Name: requiredFile,
141 | Data: await DownloadDataFileAsync(httpClient, baseUri, requiredFile).ConfigureAwait(false)
142 | )
143 | )
144 | ).ConfigureAwait(false);
145 |
146 | if (shouldSaveFiles == true)
147 | {
148 | Directory.CreateDirectory(dataDirectory);
149 |
150 | await Task.WhenAll
151 | (
152 | Array.ConvertAll
153 | (
154 | downloadedFiles,
155 | //file => File.WriteAllBytesAsync(Path.Combine(dataDirectory, file.Name), file.Data)
156 | async file =>
157 | {
158 | using (var stream = File.Open(Path.Combine(dataDirectory, file.Name), FileMode.Create, FileAccess.ReadWrite, FileShare.Read))
159 | {
160 | await stream.WriteAsync(file.Data, 0, file.Data.Length).ConfigureAwait(false);
161 | }
162 | }
163 | )
164 | ).ConfigureAwait(false);
165 |
166 | return new FileDataSource(dataDirectory);
167 | }
168 | else
169 | {
170 | return new InMemoryDataSource(downloadedFiles.ToDictionary(f => f.Name, f => f.Data));
171 | }
172 | }
173 | }
174 | // Only reached when downloading is forbidden and neither a complete directory nor an archive was found locally.
175 | throw new InvalidOperationException($"No usable local data was found for \"{dataSourceName}\", and downloading it was not allowed.");
176 | }
177 | }
178 | }
179 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/DataSources/FileDataSource.cs:
--------------------------------------------------------------------------------
1 | using System.IO;
2 | using System.Threading.Tasks;
3 |
4 | namespace System.Unicode.Build.Core.DataSources
5 | {
6 | public sealed class FileDataSource : IDataSource // Data source reading data files from a directory on disk.
7 | {
8 | private readonly string _baseDirectory;
9 |
10 | public FileDataSource(string baseDirectory)
11 | => _baseDirectory = Path.GetFullPath(baseDirectory); // Stored as an absolute path.
12 |
13 | public void Dispose() // Nothing to release: the only state is the directory path.
14 | {
15 | }
16 |
17 | public ValueTask OpenDataFileAsync(string fileName) // Opens the file synchronously for reading and wraps the resulting stream.
18 | => new ValueTask(File.OpenRead(Path.Combine(_baseDirectory, fileName)));
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/DataSources/HttpDataSource.cs:
--------------------------------------------------------------------------------
1 | using System.IO;
2 | using System.Net.Http;
3 | using System.Threading.Tasks;
4 |
5 | namespace System.Unicode.Build.Core.DataSources
6 | {
/// <summary>Data source that downloads each data file on demand over HTTP.</summary>
public class HttpDataSource : IDataSource
{
    private readonly HttpClient _httpClient;
    private readonly Uri _baseUri;
    // True only when this instance created the HttpClient itself and is therefore responsible for disposing it.
    private readonly bool _ownsHttpClient;

    /// <summary>Initializes a new instance of the <see cref="HttpDataSource"/> class.</summary>
    /// <param name="baseUri">The URI that file names are appended to.</param>
    /// <param name="httpClient">The client used for downloads, or <see langword="null"/> to let this instance create its own.</param>
    public HttpDataSource(Uri baseUri, HttpClient httpClient)
    {
        _ownsHttpClient = httpClient == null;
        _httpClient = httpClient ?? new HttpClient();
        _baseUri = baseUri;
    }

    /// <summary>Disposes the underlying <see cref="HttpClient"/> when it was created by this instance.</summary>
    /// <remarks>A client supplied by the caller may be shared with other data sources, so it is left untouched.</remarks>
    public void Dispose()
    {
        if (_ownsHttpClient) _httpClient.Dispose();
    }

    /// <summary>Starts downloading the specified file.</summary>
    /// <param name="fileName">The file name, appended as-is to the textual form of the base URI.</param>
    /// <returns>A task producing the response content stream.</returns>
    public ValueTask<Stream> OpenDataFileAsync(string fileName)
        => new ValueTask<Stream>(_httpClient.GetStreamAsync(_baseUri + fileName));
}
23 | }
24 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/DataSources/InMemoryDataSource.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.IO;
3 | using System.Threading.Tasks;
4 |
5 | namespace System.Unicode.Build.Core.DataSources
6 | {
/// <summary>Data source that serves data files from byte arrays kept in memory.</summary>
internal class InMemoryDataSource : IDataSource
{
    private readonly Dictionary<string, byte[]> _files;

    /// <summary>Initializes a new instance of the <see cref="InMemoryDataSource"/> class.</summary>
    /// <param name="files">The file contents, keyed by file name. The dictionary is used directly, not copied.</param>
    public InMemoryDataSource(Dictionary<string, byte[]> files) => _files = files;

    /// <summary>Does nothing: there is nothing to release.</summary>
    public void Dispose() { }

    /// <summary>Opens a read-only stream over the content of the specified file.</summary>
    /// <param name="fileName">The key of the file in the dictionary.</param>
    /// <returns>An already completed task wrapping a non-writable <see cref="MemoryStream"/>.</returns>
    /// <exception cref="KeyNotFoundException">No file is registered under <paramref name="fileName"/>.</exception>
    public ValueTask<Stream> OpenDataFileAsync(string fileName)
        => new ValueTask<Stream>(new MemoryStream(_files[fileName], false));
}
18 | }
19 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/DataSources/ZipDataSource.cs:
--------------------------------------------------------------------------------
1 | using System.IO;
2 | using System.IO.Compression;
3 | using System.Linq;
4 | using System.Threading.Tasks;
5 |
6 | namespace System.Unicode.Build.Core.DataSources
7 | {
/// <summary>Data source that serves data files from the entries of a ZIP archive.</summary>
public sealed class ZipDataSource : IDataSource
{
    private readonly ZipArchive _archive;

    /// <summary>Initializes a new instance of the <see cref="ZipDataSource"/> class.</summary>
    /// <param name="stream">The stream containing the archive. It is not left open when the archive is disposed.</param>
    public ZipDataSource(Stream stream) => _archive = new ZipArchive(stream, ZipArchiveMode.Read, false);

    /// <summary>Disposes the underlying archive.</summary>
    public void Dispose() => _archive.Dispose();

    /// <summary>Opens the archive entry whose full name is exactly <paramref name="fileName"/>.</summary>
    /// <param name="fileName">The full name of the entry inside the archive.</param>
    /// <returns>An already completed task wrapping the entry stream.</returns>
    /// <exception cref="FileNotFoundException">The archive has no entry with that name.</exception>
    public ValueTask<Stream> OpenDataFileAsync(string fileName)
    {
        var entry = _archive.Entries.FirstOrDefault(e => e.FullName == fileName);

        // Carry the missing name in the exception so that a failed build can be diagnosed.
        if (entry == null) throw new FileNotFoundException("The requested data file was not found in the archive.", fileName);

        return new ValueTask<Stream>(entry.Open());
    }
}
25 | }
26 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/EnumHelper.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Linq;
3 | using System.Reflection;
4 |
5 | namespace System.Unicode.Build.Core
6 | {
/// <summary>Resolves the values of an enumeration from the names attached to its members through attributes.</summary>
/// <typeparam name="T">The enumeration type.</typeparam>
internal static class EnumHelper<T>
    where T : struct, Enum
{
    // Built once per enumeration type; lookups ignore case.
    private static readonly Dictionary<string, T> NamedValueDictionary = CreateNamedValueDictionary();

    /// <summary>Builds the name-to-value map from the attributes found on the public literal fields of <typeparamref name="T"/>.</summary>
    private static Dictionary<string, T> CreateNamedValueDictionary()
    {
        var type = typeof(T).GetTypeInfo();

        if (!type.IsEnum) throw new InvalidOperationException();

        // NOTE(review): the attribute type argument was lost from this listing; ValueNameAttribute is
        // presumed (an attribute exposing a Name property) and must be confirmed against the project.
        return
        (
            from field in type.DeclaredFields
            where field.IsPublic && field.IsLiteral
            from attr in field.GetCustomAttributes<ValueNameAttribute>()
            where attr.Name != null
            select new KeyValuePair<string, T>(attr.Name, (T)field.GetValue(null))
        ).ToDictionary(kvp => kvp.Key, kvp => kvp.Value, StringComparer.OrdinalIgnoreCase);
    }

    /// <summary>Looks up the enumeration value registered under the specified name, ignoring case.</summary>
    /// <param name="name">The name to look up.</param>
    /// <param name="value">On success, the matching value; otherwise the default value of <typeparamref name="T"/>.</param>
    /// <returns><see langword="true"/> if a value is registered under <paramref name="name"/>; <see langword="false"/> otherwise.</returns>
    public static bool TryGetNamedValue(string name, out T value)
    {
        return NamedValueDictionary.TryGetValue(name, out value);
    }
}
33 | }
34 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/HexCodePoint.cs:
--------------------------------------------------------------------------------
1 | namespace System.Unicode.Build.Core
2 | {
/// <summary>Parses hexadecimal code point values such as the ones found in Unicode data files.</summary>
public static class HexCodePoint
{
    /// <summary>Parses a code point written in the form <c>U+nnnn</c>.</summary>
    /// <param name="s">The text to parse.</param>
    /// <returns>The parsed value.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is <see langword="null"/>.</exception>
    /// <exception cref="FormatException">The text does not start with <c>U+</c>, or the digits are invalid.</exception>
    public static int ParsePrefixed(string s)
    {
        if (s == null) throw new ArgumentNullException(nameof(s));

        // The prefix is a fixed ASCII marker: compare ordinally so the current culture cannot affect the result.
        if (!s.StartsWith("U+", StringComparison.Ordinal))
        {
            throw new FormatException("Expected a code point in the form U+nnnn.");
        }
        return Parse(s, 2);
    }

    /// <summary>Parses the hexadecimal value starting at the specified index.</summary>
    /// <param name="s">The text to parse.</param>
    /// <param name="index">The index of the first digit.</param>
    /// <returns>The parsed value.</returns>
    public static int Parse(string s, int index) => Parse(s, ref index);

    /// <summary>Parses the hexadecimal value starting at the specified index, up to the next space or the end of the text.</summary>
    /// <param name="s">The text to parse.</param>
    /// <param name="index">On entry, the index of the first digit; on return, the index of the terminating space or the length of the text.</param>
    /// <returns>The parsed value, or 0 when no digit was found before the terminator.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="s"/> is <see langword="null"/>.</exception>
    /// <exception cref="FormatException">A character is not a hexadecimal digit, or the value does not fit in a non-negative <see cref="int"/>.</exception>
    public static int Parse(string s, ref int index)
    {
        if (s == null) throw new ArgumentNullException(nameof(s));

        int i = index;
        int accum = 0;

        while (i < s.Length)
        {
            char c = s[i];

            if (c == ' ') break;

            int digit;

            if (c >= '0' && c <= '9') digit = c - '0';
            else if (c >= 'A' && c <= 'F') digit = c - 'A' + 0xA;
            else if (c >= 'a' && c <= 'f') digit = c - 'a' + 0xA;
            else throw new FormatException();

            // Shifting any further would push bits into or past the sign bit and silently corrupt the value.
            if (accum > (int.MaxValue >> 4)) throw new FormatException("The hexadecimal value is too large.");

            accum = (accum << 4) | digit;

            ++i;
        }

        index = i;
        return accum;
    }
}
41 | }
42 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/IDataSource.cs:
--------------------------------------------------------------------------------
1 | using System.IO;
2 | using System.Threading.Tasks;
3 |
4 | namespace System.Unicode.Build.Core
5 | {
/// <summary>Represents a location from which named data files can be opened for reading.</summary>
public interface IDataSource : IDisposable
{
    /// <summary>Opens the data file with the specified name.</summary>
    /// <param name="fileName">The name of the file to open.</param>
    /// <returns>A task producing a stream over the content of the file.</returns>
    ValueTask<Stream> OpenDataFileAsync(string fileName);
}
10 | }
11 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/System.Unicode.Build.Core.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | netstandard2.0
5 | True
6 | $(DefineConstants);BUILD_SYSTEM
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 | TextTemplatingFileGenerator
47 | UnihanProperty.cs
48 |
49 |
50 |
51 |
52 |
53 | True
54 | True
55 | UnihanProperty.tt
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/UnicodeCharacterDataBuilder.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Diagnostics;
3 | using System.Globalization;
4 | using System.IO;
5 | using System.Linq;
6 |
7 | namespace System.Unicode.Build.Core
8 | {
/// <summary>Mutable holder for the properties of a code point or code point range, able to serialize itself to the database format.</summary>
[DebuggerDisplay("{CodePointRange} {DisplayName,nq}")]
public sealed class UnicodeCharacterDataBuilder
{
    private UnicodeCategory _category = UnicodeCategory.OtherNotAssigned;

    private readonly List<UnicodeNameAlias> _nameAliases = new List<UnicodeNameAlias>();
    private readonly List<int> _crossRerefences = new List<int>();

    /// <summary>Gets the code point or range of code points described by this builder.</summary>
    public UnicodeCodePointRange CodePointRange { get; }

    public string Name { get; set; }

    /// <summary>Gets the modifiable list of name aliases.</summary>
    public IList<UnicodeNameAlias> NameAliases => _nameAliases;

    /// <summary>Gets or sets the general category.</summary>
    /// <exception cref="ArgumentOutOfRangeException">The value is not a defined <see cref="UnicodeCategory"/> member.</exception>
    public UnicodeCategory Category
    {
        get => _category;
        set => _category = Enum.IsDefined(typeof(UnicodeCategory), value) ?
            value :
            throw new ArgumentOutOfRangeException(nameof(value));
    }

    public CanonicalCombiningClass CanonicalCombiningClass { get; set; } // Even values not defined in the enum are allowed here.
    public BidirectionalClass BidirectionalClass { get; set; }
    public CharacterDecompositionMapping CharacterDecompositionMapping { get; set; }
    public UnicodeNumericType NumericType { get; set; }
    public UnicodeRationalNumber NumericValue { get; set; }
    public string OldName { get; set; }
    public bool BidirectionalMirrored { get; set; }
    public string SimpleUpperCaseMapping { get; set; }
    public string SimpleLowerCaseMapping { get; set; }
    public string SimpleTitleCaseMapping { get; set; }
    public ContributoryProperties ContributoryProperties { get; set; }
    public CoreProperties CoreProperties { get; set; }
    public EmojiProperties EmojiProperties { get; set; }

    /// <summary>Gets the modifiable list of cross-referenced code points.</summary>
    public IList<int> CrossRerefences => _crossRerefences;

    /// <summary>Initializes a builder for a single code point.</summary>
    /// <param name="codePoint">The code point.</param>
    public UnicodeCharacterDataBuilder(int codePoint)
        : this(new UnicodeCodePointRange(codePoint))
    {
    }

    /// <summary>Initializes a builder for a range of code points.</summary>
    /// <param name="codePointRange">The range of code points.</param>
    public UnicodeCharacterDataBuilder(UnicodeCodePointRange codePointRange)
    {
        // The category already defaults to OtherNotAssigned through its field initializer.
        CodePointRange = codePointRange;
    }

    // Referenced by the DebuggerDisplay attribute only.
    private string DisplayName => Name ?? OldName;

    /// <summary>Creates the character data object corresponding to the current state of the builder.</summary>
    internal UnicodeCharacterData ToCharacterData()
        => new UnicodeCharacterData
        (
            CodePointRange,
            Name,
            _nameAliases.Count > 0 ? _nameAliases.ToArray() : UnicodeNameAlias.EmptyArray,
            Category,
            CanonicalCombiningClass,
            BidirectionalClass,
            CharacterDecompositionMapping.DecompositionType,
            CharacterDecompositionMapping.DecompositionMapping,
            NumericType,
            NumericValue,
            BidirectionalMirrored,
            OldName,
            SimpleUpperCaseMapping,
            SimpleLowerCaseMapping,
            SimpleTitleCaseMapping,
            ContributoryProperties,
            // Core properties in the low bits, emoji properties shifted into the top byte.
            (int)CoreProperties | ((int)EmojiProperties << 24),
            CrossRerefences.Count > 0 ? CrossRerefences.ToArray() : null
        );

    /// <summary>Writes this entry to the database: a 16-bit field mask, the code point(s), then every field flagged in the mask.</summary>
    /// <param name="writer">The writer receiving the data.</param>
    /// <exception cref="InvalidDataException">More than 64 name aliases are defined.</exception>
    internal void WriteToFile(BinaryWriter writer)
    {
        if (_nameAliases.Count > 64) throw new InvalidDataException("Cannot handle more than 64 name aliases.");

        UcdFields fields = default;

        if (!CodePointRange.IsSingleCodePoint) fields = UcdFields.CodePointRange;

        if (Name != null || _nameAliases.Count > 0) fields |= UcdFields.Name; // This field combines name and alias.
        if (_category != UnicodeCategory.OtherNotAssigned) fields |= UcdFields.Category;
        if (CanonicalCombiningClass != CanonicalCombiningClass.NotReordered) fields |= UcdFields.CanonicalCombiningClass;
        if (BidirectionalClass != 0) fields |= UcdFields.BidirectionalClass;
        if (CharacterDecompositionMapping.DecompositionMapping != null) fields |= UcdFields.DecompositionMapping;
        // The numeric type is stored directly inside the field mask, starting at bit 6.
        fields |= (UcdFields)((int)NumericType << 6);
        if (BidirectionalMirrored) fields |= UcdFields.BidirectionalMirrored;
        if (OldName != null) fields |= UcdFields.OldName;
        if (SimpleUpperCaseMapping != null) fields |= UcdFields.SimpleUpperCaseMapping;
        if (SimpleLowerCaseMapping != null) fields |= UcdFields.SimpleLowerCaseMapping;
        if (SimpleTitleCaseMapping != null) fields |= UcdFields.SimpleTitleCaseMapping;
        if (ContributoryProperties != 0) fields |= UcdFields.ContributoryProperties;
        if (CoreProperties != 0 || EmojiProperties != 0) fields |= UcdFields.CorePropertiesAndEmojiProperties;
        if (_crossRerefences.Count > 0) fields |= UcdFields.CrossRerefences;

        writer.Write((ushort)fields);

        writer.WriteCodePoint(CodePointRange.FirstCodePoint);
        if ((fields & UcdFields.CodePointRange) != 0) writer.WriteCodePoint(CodePointRange.LastCodePoint);

        if ((fields & UcdFields.Name) != 0)
        {
            // We write the names by optimizing for the common case.
            // i.e. Most characters have only one name.
            // The first 8 bit sequence will encode either the length of the name property alone,
            // or the number of aliases and a bit indicating the presence of the name property.

            if (_nameAliases.Count > 0)
            {
                writer.WritePackedLength((byte)(Name != null ? 3 : 2), _nameAliases.Count);

                if (Name != null)
                    writer.WriteNamePropertyToFile(Name);

                foreach (var nameAlias in _nameAliases)
                    writer.WriteNameAliasToFile(nameAlias);
            }
            else
            {
                writer.WriteNamePropertyToFile(Name);
            }
        }
        if ((fields & UcdFields.Category) != 0) writer.Write((byte)_category);
        if ((fields & UcdFields.CanonicalCombiningClass) != 0) writer.Write((byte)CanonicalCombiningClass);
        if ((fields & UcdFields.BidirectionalClass) != 0) writer.Write((byte)BidirectionalClass);
        if ((fields & UcdFields.DecompositionMapping) != 0)
        {
            writer.Write((byte)CharacterDecompositionMapping.DecompositionType);
            writer.Write(CharacterDecompositionMapping.DecompositionMapping);
        }
        if ((fields & UcdFields.NumericNumeric) != 0)
        {
            writer.Write(NumericValue.Numerator);
            writer.WriteVariableUInt64(NumericValue.Denominator);
        }
        if ((fields & UcdFields.OldName) != 0) writer.Write(OldName);
        if ((fields & UcdFields.SimpleUpperCaseMapping) != 0) writer.Write(SimpleUpperCaseMapping);
        if ((fields & UcdFields.SimpleLowerCaseMapping) != 0) writer.Write(SimpleLowerCaseMapping);
        if ((fields & UcdFields.SimpleTitleCaseMapping) != 0) writer.Write(SimpleTitleCaseMapping);
        if ((fields & UcdFields.ContributoryProperties) != 0) writer.Write((int)ContributoryProperties);
        if ((fields & UcdFields.CorePropertiesAndEmojiProperties) != 0)
        {
            // This encoding is very dirty and needs to be reworked. For now I just want to make this work.
            // First byte has its 2 MSB indicating presence of 1) Emoji P. 2) Core P. Value 00xxxxxx is invalid & not used at all.
            // If emoji properties are present, they are contained in the first byte, possibly followed by an Int24 for core properties.
            // If emoji properties are absent, the byte is the high part of core properties, followed by an Int16 for the rest.
            if (CoreProperties != 0)
            {
                if (EmojiProperties != 0)
                {
                    writer.Write((byte)(192 | (byte)EmojiProperties));
                    writer.WriteUInt24((int)CoreProperties & 0x00FFFFFF);
                }
                else
                {
                    writer.Write((byte)(64 | ((int)CoreProperties >> 16)));
                    writer.Write((ushort)CoreProperties);
                }
            }
            else
            {
                writer.Write((byte)(128 | (byte)EmojiProperties));
            }
        }
        if ((fields & UcdFields.CrossRerefences) != 0)
        {
            // The count is stored minus one; the checked conversion fails when it does not fit in a byte.
            writer.Write(checked((byte)(_crossRerefences.Count - 1)));
            foreach (int crossReference in _crossRerefences)
                writer.WriteCodePoint(crossReference);
        }
    }
}
182 | }
183 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/UnicodeDataFileReader.cs:
--------------------------------------------------------------------------------
1 | using System.IO;
2 |
3 | namespace System.Unicode.Build.Core
4 | {
/// <summary>Forward-only reader for line-oriented data files made of separator-delimited fields and <c>#</c> comments.</summary>
public class UnicodeDataFileReader : IDisposable
{
    private readonly Stream _stream;
    private readonly byte[] _byteBuffer;
    private int _index;
    private int _length;
    private readonly char _fieldSeparator;
    private bool _hasField = false;
    private readonly bool _leaveOpen;

    /// <summary>Initializes a reader that disposes the stream when it is itself disposed.</summary>
    /// <param name="stream">The stream to read from.</param>
    /// <param name="fieldSeparator">The character separating the fields of a line.</param>
    public UnicodeDataFileReader(Stream stream, char fieldSeparator)
        : this(stream, fieldSeparator, false)
    {
    }

    /// <summary>Initializes a reader over the specified stream.</summary>
    /// <param name="stream">The stream to read from.</param>
    /// <param name="fieldSeparator">The character separating the fields of a line.</param>
    /// <param name="leaveOpen"><see langword="true"/> to keep the stream open when the reader is disposed.</param>
    public UnicodeDataFileReader(Stream stream, char fieldSeparator, bool leaveOpen)
    {
        _stream = stream;
        _fieldSeparator = fieldSeparator;
        _byteBuffer = new byte[8192];
        _leaveOpen = leaveOpen;
    }

    /// <summary>Disposes the underlying stream, unless the reader was asked to leave it open.</summary>
    public void Dispose()
    {
        if (!_leaveOpen) _stream.Dispose();
    }

    // Reads the next chunk of the stream into the buffer and rewinds the read position.
    // Returns false once the stream has no more data.
    private bool RefillBuffer()
    {
        _length = _stream.Read(_byteBuffer, 0, _byteBuffer.Length);
        _index = 0;
        return _length != 0;
    }

    private static bool IsNewLineOrComment(byte b)
        => b == '\n' || b == '#';

    /// <summary>Moves the stream to the next valid data row.</summary>
    /// <returns><see langword="true"/> if data is available; <see langword="false"/> otherwise.</returns>
    public bool MoveToNextLine()
    {
        if (_length == 0)
        {
            if (!RefillBuffer()) return false;

            // Freshly filled buffer: the current line may already be a data row.
            if (!IsNewLineOrComment(_byteBuffer[_index]))
            {
                _hasField = true;
                return true;
            }
        }

        do
        {
            while (_index < _length)
            {
                if (_byteBuffer[_index++] != '\n') continue;

                // A row starts right after a line break, unless the line is empty or a comment.
                if ((_index < _length || RefillBuffer()) && !IsNewLineOrComment(_byteBuffer[_index]))
                {
                    _hasField = true;
                    return true;
                }
            }
        } while (RefillBuffer());

        _hasField = false;
        return false;
    }

    private string ReadFieldInternal(bool trim)
    {
        if (_length == 0) throw new InvalidOperationException();

        if (!_hasField) return null;

        if (_index >= _length) RefillBuffer();

        // If the current character is a new line or a comment, we are at the end of a line:
        // the last field of the line is empty.
        if (IsNewLineOrComment(_byteBuffer[_index]))
        {
            _hasField = false;
            return string.Empty;
        }

        using (var buffer = Utf8Buffer.Get())
        {
            do
            {
                int chunkStart = _index;
                int fieldEnd = -1;

                while (_index < _length)
                {
                    byte current = _byteBuffer[_index];

                    if (IsNewLineOrComment(current)) // NB: Do not advance to the next byte when end of line has been reached.
                    {
                        fieldEnd = _index;
                        _hasField = false;
                        break;
                    }

                    if (current == _fieldSeparator)
                    {
                        fieldEnd = _index++;
                        break;
                    }

                    ++_index;
                }

                if (fieldEnd >= 0)
                {
                    buffer.Append(_byteBuffer, chunkStart, fieldEnd - chunkStart);
                    break;
                }

                // The field continues past the buffered data: keep what was scanned before refilling.
                if (_index > chunkStart)
                {
                    buffer.Append(_byteBuffer, chunkStart, _index - chunkStart);
                }
            } while (RefillBuffer());

            return trim ? buffer.ToTrimmedString() : buffer.ToString();
        }
    }

    /// <summary>Reads the next data field.</summary>
    /// <remarks>This method will return <see langword="null"/> on end of line.</remarks>
    /// <returns>The text value of the read field, if available; <see langword="null"/> otherwise.</returns>
    public string ReadField() => ReadFieldInternal(false);

    /// <summary>Reads the next data field as a trimmed value.</summary>
    /// <remarks>This method will return <see langword="null"/> on end of line.</remarks>
    /// <returns>The trimmed text value of the read field, if available; <see langword="null"/> otherwise.</returns>
    public string ReadTrimmedField() => ReadFieldInternal(true);

    /// <summary>Skips the next data field.</summary>
    /// <remarks>This method will return <see langword="false"/> on end of line.</remarks>
    /// <returns><see langword="true"/> if a field was skipped; <see langword="false"/> otherwise.</returns>
    public bool SkipField()
    {
        if (_length == 0) throw new InvalidOperationException();

        if (!_hasField) return false;

        if (_index >= _length) RefillBuffer();

        // If the current character is a new line or a comment, we are at the end of a line.
        if (IsNewLineOrComment(_byteBuffer[_index]))
        {
            _hasField = false;
            return false;
        }

        do
        {
            while (_index < _length)
            {
                byte current = _byteBuffer[_index];

                if (IsNewLineOrComment(current)) // NB: Do not advance to the next byte when end of line has been reached.
                {
                    _hasField = false;
                    return true;
                }

                ++_index;

                if (current == _fieldSeparator) return true;
            }
        } while (RefillBuffer());

        return true;
    }
}
200 | }
201 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/UnicodeDatabaseGenerator.cs:
--------------------------------------------------------------------------------
1 | using System.IO;
2 | using System.IO.Compression;
3 | using System.Net.Http;
4 | using System.Threading.Tasks;
5 | using static System.Unicode.Build.Core.DataSourceProvider;
6 |
7 | namespace System.Unicode.Build.Core
8 | {
/// <summary>Builds the compressed Unicode database file from the Unicode data sources.</summary>
public static class UnicodeDatabaseGenerator
{
    /// <summary>Obtains the UCD, Unihan and emoji data sources, processes them and writes the resulting database.</summary>
    /// <param name="httpClient">The HTTP client handed to the data source provider.</param>
    /// <param name="baseDirectory">The directory handed to the data source provider; the current directory is used when this is null or white space.</param>
    /// <param name="outputFilePath">The path of the database file to create.</param>
    /// <param name="shouldDownloadFiles">Passed through to the data source provider.</param>
    /// <param name="shouldSaveFiles">Passed through to the data source provider.</param>
    /// <param name="shouldExtractFiles">Passed through to the data source provider.</param>
    /// <returns>A task that completes once the database file has been written.</returns>
    public static async ValueTask GenerateDatabase(HttpClient httpClient, string baseDirectory, string outputFilePath, bool? shouldDownloadFiles, bool? shouldSaveFiles, bool? shouldExtractFiles)
    {
        UnicodeInfoBuilder data;

        baseDirectory = string.IsNullOrWhiteSpace(baseDirectory) ?
            Environment.CurrentDirectory :
            Path.GetFullPath(baseDirectory);

        // Library code: none of the continuations needs the caller's synchronization context.
        using (var ucdSource = await GetDataSourceAsync(httpClient, UnicodeCharacterDataUri, baseDirectory, UcdDataSourceName, UcdRequiredFiles, true, shouldDownloadFiles, shouldSaveFiles, shouldExtractFiles).ConfigureAwait(false))
        using (var unihanSource = await GetDataSourceAsync(httpClient, UnicodeCharacterDataUri, baseDirectory, UnihanDataSourceName, UnihanRequiredFiles, true, shouldDownloadFiles, shouldSaveFiles, shouldExtractFiles).ConfigureAwait(false))
        using (var ucdEmojiSource = await GetDataSourceAsync(httpClient, UcdEmojiDataUri, baseDirectory, EmojiDataSourceName, UcdEmojiRequiredFiles, false, shouldDownloadFiles, shouldSaveFiles, shouldExtractFiles).ConfigureAwait(false))
        //using (var emojiSource = await GetDataSourceAsync(httpClient, EmojiDataUri, baseDirectory, EmojiDataSourceName, EmojiRequiredFiles, false, shouldDownloadFiles, shouldSaveFiles, shouldExtractFiles))
        {
            data = await UnicodeDataProcessor.BuildDataAsync(ucdSource, unihanSource, ucdEmojiSource).ConfigureAwait(false);
        }

        // This part is actually highly susceptible to framework version. Different frameworks give a different results.
        // In order to consistently produce the same result, the framework executing this code must be fixed.
        using (var stream = new DeflateStream(File.Create(outputFilePath), CompressionLevel.Optimal, false))
        {
            data.WriteToStream(stream);
            stream.Flush();
        }
    }
}
36 | }
37 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/UnihanCharacterDataBuilder.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.IO;
3 |
4 | namespace System.Unicode.Build.Core
5 | {
/// <summary>Mutable holder for the Unihan properties of one code point, able to serialize itself to the database format.</summary>
public sealed class UnihanCharacterDataBuilder
{
    public int CodePoint { get; }
    public UnihanNumericType NumericType { get; set; }
    public long NumericValue { get; set; }
    public string Definition { get; set; }
    public string MandarinReading { get; set; }
    public string CantoneseReading { get; set; }
    public string JapaneseKunReading { get; set; }
    public string JapaneseOnReading { get; set; }
    public string KoreanReading { get; set; }
    public string HangulReading { get; set; }
    public string VietnameseReading { get; set; }
    public string SimplifiedVariant { get; set; }
    public string TraditionalVariant { get; set; }

    /// <summary>Gets the modifiable list of radical/stroke counts.</summary>
    public IList<UnicodeRadicalStrokeCount> UnicodeRadicalStrokeCounts => _unicodeRadicalStrokeCounts;

    // NOTE(review): the element type argument was lost from this listing; UnicodeRadicalStrokeCount is
    // presumed (a type exposing Radical and RawStrokeCount) and must be confirmed against the project.
    private readonly List<UnicodeRadicalStrokeCount> _unicodeRadicalStrokeCounts = new List<UnicodeRadicalStrokeCount>();

    internal UnihanCharacterDataBuilder(int codePoint) => CodePoint = codePoint;

    /// <summary>Creates the character data object corresponding to the current state of the builder.</summary>
    internal UnihanCharacterData ToCharacterData()
        => new UnihanCharacterData
        (
            CodePoint,
            NumericType,
            NumericValue,
            _unicodeRadicalStrokeCounts.ToArray(),
            Definition,
            MandarinReading,
            CantoneseReading,
            JapaneseKunReading,
            JapaneseOnReading,
            KoreanReading,
            HangulReading,
            VietnameseReading,
            SimplifiedVariant,
            TraditionalVariant
        );

    /// <summary>Writes this entry to the database: a 16-bit field mask, the packed code point, then every field flagged in the mask.</summary>
    /// <param name="writer">The writer receiving the data.</param>
    internal void WriteToFile(BinaryWriter writer)
    {
        UnihanFields fields = default;

        // The numeric type is stored directly inside the field mask.
        fields |= (UnihanFields)NumericType;
        // For now, we have enough bits to encode the length of the array in the field specifier, so we'll do that.
        // (NB: A quick analysis of the files revealed that there are almost always exactly one Radical/Stroke count, and occasionally two, yet never more.)
        if (_unicodeRadicalStrokeCounts.Count > 0)
        {
            if (_unicodeRadicalStrokeCounts.Count == 1) fields |= UnihanFields.UnicodeRadicalStrokeCount;
            else if (_unicodeRadicalStrokeCounts.Count == 2) fields |= UnihanFields.UnicodeRadicalStrokeCountTwice;
            else fields |= UnihanFields.UnicodeRadicalStrokeCountMore;
        }
        if (Definition != null) fields |= UnihanFields.Definition;
        if (MandarinReading != null) fields |= UnihanFields.MandarinReading;
        if (CantoneseReading != null) fields |= UnihanFields.CantoneseReading;
        if (JapaneseKunReading != null) fields |= UnihanFields.JapaneseKunReading;
        if (JapaneseOnReading != null) fields |= UnihanFields.JapaneseOnReading;
        if (KoreanReading != null) fields |= UnihanFields.KoreanReading;
        if (HangulReading != null) fields |= UnihanFields.HangulReading;
        if (VietnameseReading != null) fields |= UnihanFields.VietnameseReading;
        if (SimplifiedVariant != null) fields |= UnihanFields.SimplifiedVariant;
        if (TraditionalVariant != null) fields |= UnihanFields.TraditionalVariant;

        writer.Write((ushort)fields);

        writer.WriteCodePoint(UnihanCharacterData.PackCodePoint(CodePoint));
        if ((fields & UnihanFields.OtherNumeric) != 0) writer.Write(NumericValue);

        if ((fields & UnihanFields.UnicodeRadicalStrokeCountMore) != 0)
        {
            // An explicit count byte (stored minus three) is written only when every bit of the "more" flag is set.
            if ((fields & (UnihanFields.UnicodeRadicalStrokeCountMore)) == UnihanFields.UnicodeRadicalStrokeCountMore)
                writer.Write(checked((byte)(_unicodeRadicalStrokeCounts.Count - 3)));

            foreach (var radicalStrokeCount in _unicodeRadicalStrokeCounts)
            {
                writer.Write(radicalStrokeCount.Radical);
                writer.Write(radicalStrokeCount.RawStrokeCount);
            }
        }

        if ((fields & UnihanFields.Definition) != 0) writer.Write(Definition);
        if ((fields & UnihanFields.MandarinReading) != 0) writer.Write(MandarinReading);
        if ((fields & UnihanFields.CantoneseReading) != 0) writer.Write(CantoneseReading);
        if ((fields & UnihanFields.JapaneseKunReading) != 0) writer.Write(JapaneseKunReading);
        if ((fields & UnihanFields.JapaneseOnReading) != 0) writer.Write(JapaneseOnReading);
        if ((fields & UnihanFields.KoreanReading) != 0) writer.Write(KoreanReading);
        if ((fields & UnihanFields.HangulReading) != 0) writer.Write(HangulReading);
        if ((fields & UnihanFields.VietnameseReading) != 0) writer.Write(VietnameseReading);
        if ((fields & UnihanFields.SimplifiedVariant) != 0) writer.Write(SimplifiedVariant);
        if ((fields & UnihanFields.TraditionalVariant) != 0) writer.Write(TraditionalVariant);
    }
}
99 | }
100 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/UnihanDataFileReader.cs:
--------------------------------------------------------------------------------
1 | using System.IO;
2 |
3 | namespace System.Unicode.Build.Core
4 | {
/// <summary>Reads Unihan data files as a sequence of tab-separated (code point, property name, property value) records.</summary>
public sealed class UnihanDataFileReader : IDisposable
{
    private readonly UnicodeDataFileReader _reader;

    /// <summary>Gets the code point of the current record, or 0 when there is none.</summary>
    public int CodePoint { get; private set; }

    /// <summary>Gets the property name of the current record, or <see langword="null"/> when there is none.</summary>
    public string PropertyName { get; private set; }

    /// <summary>Gets the property value of the current record, or <see langword="null"/> when there is none.</summary>
    public string PropertyValue { get; private set; }

    /// <summary>Initializes a reader that disposes the stream when it is itself disposed.</summary>
    /// <param name="stream">The stream to read from.</param>
    public UnihanDataFileReader(Stream stream)
        : this(stream, false)
    {
    }

    /// <summary>Initializes a reader over the specified stream.</summary>
    /// <param name="stream">The stream to read from.</param>
    /// <param name="leaveOpen">Forwarded to the underlying <see cref="UnicodeDataFileReader"/>.</param>
    public UnihanDataFileReader(Stream stream, bool leaveOpen)
    {
        _reader = new UnicodeDataFileReader(stream, '\t', leaveOpen);
    }

    /// <summary>Disposes the underlying reader.</summary>
    public void Dispose()
    {
        _reader.Dispose();
    }

    /// <summary>Advances to the next record and loads its three fields.</summary>
    /// <returns><see langword="true"/> if a record was read; <see langword="false"/> when no more data is available, in which case the properties are reset.</returns>
    public bool Read()
    {
        if (!_reader.MoveToNextLine())
        {
            CodePoint = 0;
            PropertyName = null;
            PropertyValue = null;
            return false;
        }

        CodePoint = HexCodePoint.ParsePrefixed(_reader.ReadField());
        PropertyName = _reader.ReadField();
        PropertyValue = _reader.ReadField();
        return true;
    }
}
44 | }
45 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/UnihanProperty.cs:
--------------------------------------------------------------------------------
1 | namespace System.Unicode.Build.Core
2 | {
/// <summary>Names of the Unihan database properties, each exposed as a constant holding the <c>k</c>-prefixed property tag.</summary>
public static class UnihanProperty
{
    public const string AccountingNumeric = "kAccountingNumeric";
    public const string BigFive = "kBigFive";
    public const string Cangjie = "kCangjie";
    public const string Cantonese = "kCantonese";
    public const string CCCII = "kCCCII";
    public const string CheungBauer = "kCheungBauer";
    public const string CheungBauerIndex = "kCheungBauerIndex";
    public const string CihaiT = "kCihaiT";
    public const string CNS1986 = "kCNS1986";
    public const string CNS1992 = "kCNS1992";
    public const string CompatibilityVariant = "kCompatibilityVariant";
    public const string Cowles = "kCowles";
    public const string DaeJaweon = "kDaeJaweon";
    public const string Definition = "kDefinition";
    public const string EACC = "kEACC";
    public const string Fenn = "kFenn";
    public const string FennIndex = "kFennIndex";
    public const string FourCornerCode = "kFourCornerCode";
    public const string Frequency = "kFrequency";
    public const string GB0 = "kGB0";
    public const string GB1 = "kGB1";
    public const string GB3 = "kGB3";
    public const string GB5 = "kGB5";
    public const string GB7 = "kGB7";
    public const string GB8 = "kGB8";
    public const string GradeLevel = "kGradeLevel";
    public const string GSR = "kGSR";
    public const string Hangul = "kHangul";
    public const string HanYu = "kHanYu";
    public const string HanyuPinlu = "kHanyuPinlu";
    public const string HanyuPinyin = "kHanyuPinyin";
    public const string HDZRadBreak = "kHDZRadBreak";
    public const string HKGlyph = "kHKGlyph";
    public const string HKSCS = "kHKSCS";
    public const string IBMJapan = "kIBMJapan";
    public const string IICore = "kIICore";
    public const string IRG_GSource = "kIRG_GSource";
    public const string IRG_HSource = "kIRG_HSource";
    public const string IRG_JSource = "kIRG_JSource";
    public const string IRG_KPSource = "kIRG_KPSource";
    public const string IRG_KSource = "kIRG_KSource";
    public const string IRG_MSource = "kIRG_MSource";
    public const string IRG_TSource = "kIRG_TSource";
    public const string IRG_USource = "kIRG_USource";
    public const string IRG_VSource = "kIRG_VSource";
    public const string IRGDaeJaweon = "kIRGDaeJaweon";
    public const string IRGDaiKanwaZiten = "kIRGDaiKanwaZiten";
    public const string IRGHanyuDaZidian = "kIRGHanyuDaZidian";
    public const string IRGKangXi = "kIRGKangXi";
    public const string JapaneseKun = "kJapaneseKun";
    public const string JapaneseOn = "kJapaneseOn";
    public const string Jis0 = "kJis0";
    public const string Jis1 = "kJis1";
    public const string JIS0213 = "kJIS0213";
    public const string KangXi = "kKangXi";
    public const string Karlgren = "kKarlgren";
    public const string Korean = "kKorean";
    public const string KPS0 = "kKPS0";
    public const string KPS1 = "kKPS1";
    public const string KSC0 = "kKSC0";
    public const string KSC1 = "kKSC1";
    public const string Lau = "kLau";
    public const string MainlandTelegraph = "kMainlandTelegraph";
    public const string Mandarin = "kMandarin";
    public const string Matthews = "kMatthews";
    public const string MeyerWempe = "kMeyerWempe";
    public const string Morohashi = "kMorohashi";
    public const string Nelson = "kNelson";
    public const string OtherNumeric = "kOtherNumeric";
    public const string Phonetic = "kPhonetic";
    public const string PrimaryNumeric = "kPrimaryNumeric";
    public const string PseudoGB1 = "kPseudoGB1";
    public const string RSAdobe_Japan1_6 = "kRSAdobe_Japan1_6";
    public const string RSJapanese = "kRSJapanese";
    public const string RSKangXi = "kRSKangXi";
    public const string RSKanWa = "kRSKanWa";
    public const string RSKorean = "kRSKorean";
    public const string RSUnicode = "kRSUnicode";
    public const string SBGY = "kSBGY";
    public const string SemanticVariant = "kSemanticVariant";
    public const string SimplifiedVariant = "kSimplifiedVariant";
    public const string SpecializedSemanticVariant = "kSpecializedSemanticVariant";
    public const string TaiwanTelegraph = "kTaiwanTelegraph";
    public const string Tang = "kTang";
    public const string TotalStrokes = "kTotalStrokes";
    public const string TraditionalVariant = "kTraditionalVariant";
    public const string Vietnamese = "kVietnamese";
    public const string Xerox = "kXerox";
    public const string XHC1983 = "kXHC1983";
}
95 | }
96 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/UnihanProperty.tt:
--------------------------------------------------------------------------------
1 | <#@ template debug="false" hostspecific="false" language="C#" #>
2 | <#@ assembly name="System.Core" #>
3 | <#@ import namespace="System.Linq" #>
4 | <#@ import namespace="System.Text" #>
5 | <#@ import namespace="System.Collections.Generic" #>
6 | <#@ output extension=".cs" #>
<#
    // Unihan property names for which a string constant is generated below.
    string[] propertyNames =
    {
        "kAccountingNumeric",
        "kBigFive",
        "kCangjie",
        "kCantonese",
        "kCCCII",
        "kCheungBauer",
        "kCheungBauerIndex",
        "kCihaiT",
        "kCNS1986",
        "kCNS1992",
        "kCompatibilityVariant",
        "kCowles",
        "kDaeJaweon",
        "kDefinition",
        "kEACC",
        "kFenn",
        "kFennIndex",
        "kFourCornerCode",
        "kFrequency",
        "kGB0",
        "kGB1",
        "kGB3",
        "kGB5",
        "kGB7",
        "kGB8",
        "kGradeLevel",
        "kGSR",
        "kHangul",
        "kHanYu",
        "kHanyuPinlu",
        "kHanyuPinyin",
        "kHDZRadBreak",
        "kHKGlyph",
        "kHKSCS",
        "kIBMJapan",
        "kIICore",
        "kIRG_GSource",
        "kIRG_HSource",
        "kIRG_JSource",
        "kIRG_KPSource",
        "kIRG_KSource",
        "kIRG_MSource",
        "kIRG_TSource",
        "kIRG_USource",
        "kIRG_VSource",
        "kIRGDaeJaweon",
        "kIRGDaiKanwaZiten",
        "kIRGHanyuDaZidian",
        "kIRGKangXi",
        "kJapaneseKun",
        "kJapaneseOn",
        "kJis0",
        "kJis1",
        "kJIS0213",
        "kKangXi",
        "kKarlgren",
        "kKorean",
        "kKPS0",
        "kKPS1",
        "kKSC0",
        "kKSC1",
        "kLau",
        "kMainlandTelegraph",
        "kMandarin",
        "kMatthews",
        "kMeyerWempe",
        "kMorohashi",
        "kNelson",
        "kOtherNumeric",
        "kPhonetic",
        "kPrimaryNumeric",
        "kPseudoGB1",
        "kRSAdobe_Japan1_6",
        "kRSJapanese",
        "kRSKangXi",
        "kRSKanWa",
        "kRSKorean",
        "kRSUnicode",
        "kSBGY",
        "kSemanticVariant",
        "kSimplifiedVariant",
        "kSpecializedSemanticVariant",
        "kTaiwanTelegraph",
        "kTang",
        "kTotalStrokes",
        "kTraditionalVariant",
        "kVietnamese",
        "kXerox",
        "kXHC1983",
    };
#>
namespace System.Unicode.Build.Core
{
    public static class UnihanProperty
    {
<# foreach (string tag in propertyNames) { string identifier = tag[0] == 'k' ? tag.Substring(1) : tag; /* constant name = property name without its leading 'k' */ #>
        public const string <#=identifier#> = "<#=tag#>";
<# } #>
    }
}
110 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Core/Utf8Buffer.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Concurrent;
2 | using System.Text;
3 |
4 | namespace System.Unicode.Build.Core
5 | {
/// <summary>Growable accumulator of UTF-8 bytes whose backing array is taken from, and returned to, a shared pool.</summary>
/// <remarks>
/// This is a mutable struct: a copy shares the backing array but not <see cref="Length"/>.
/// Keep an instance in a single variable and dispose it once.
/// </remarks>
public struct Utf8Buffer : IDisposable
{
    // Arrays handed back by Dispose, waiting to be reused by Get.
    private static readonly ConcurrentStack<byte[]> BufferStack = new ConcurrentStack<byte[]>();

    /// <summary>Gets an empty buffer, reusing a pooled array when one is available and allocating 100 bytes otherwise.</summary>
    public static Utf8Buffer Get() => new Utf8Buffer(BufferStack.TryPop(out var buffer) ? buffer : new byte[100]);

    private byte[] _buffer;

    /// <summary>Gets the number of bytes appended so far.</summary>
    public int Length { get; private set; }

    private Utf8Buffer(byte[] buffer)
    {
        _buffer = buffer;
        Length = 0;
    }

    /// <summary>Returns the backing array to the pool and resets this instance to its default state.</summary>
    public void Dispose()
    {
        if (_buffer != null)
        {
            BufferStack.Push(_buffer);
            this = default;
        }
    }

    /// <summary>Grows the backing array, at least doubling it, so that <paramref name="count"/> more bytes fit.</summary>
    private void EnsureExtraCapacity(int count)
    {
        if (count < 0) throw new ArgumentOutOfRangeException(nameof(count));

        int required = checked(Length + count);

        if (_buffer.Length < required)
        {
            Array.Resize(ref _buffer, Math.Max(required, _buffer.Length << 1));
        }
    }

    /// <summary>Appends <paramref name="count"/> bytes of <paramref name="value"/>, starting at <paramref name="startIndex"/>.</summary>
    /// <param name="value">The array to copy from.</param>
    /// <param name="startIndex">Index of the first byte to copy.</param>
    /// <param name="count">Number of bytes to copy.</param>
    /// <exception cref="ArgumentNullException"><paramref name="value"/> is <c>null</c>.</exception>
    /// <exception cref="ArgumentOutOfRangeException">The requested range does not lie within <paramref name="value"/>.</exception>
    public void Append(byte[] value, int startIndex, int count)
    {
        if (value == null) throw new ArgumentNullException(nameof(value));
        if (startIndex < 0 || startIndex > value.Length) throw new ArgumentOutOfRangeException(nameof(startIndex));
        // Written as a subtraction so the check cannot overflow.
        if (count < 0 || count > value.Length - startIndex) throw new ArgumentOutOfRangeException(nameof(count));

        if (count == 0) return;

        // Reserve room for the bytes actually copied, not for the whole source array.
        EnsureExtraCapacity(count);

        Array.Copy(value, startIndex, _buffer, Length, count);
        Length += count;
    }

    /// <summary>Decodes the accumulated bytes as UTF-8.</summary>
    public override string ToString() => Length > 0 ? Encoding.UTF8.GetString(_buffer, 0, Length) : string.Empty;

    /// <summary>Decodes the accumulated bytes as UTF-8, ignoring leading and trailing space (0x20) bytes.</summary>
    /// <returns>The trimmed text, or an empty string when the buffer is empty or contains only spaces.</returns>
    public string ToTrimmedString()
    {
        if (Length == 0) return string.Empty;

        var buffer = _buffer;
        int start = 0;
        int end = Length; // exclusive

        while (start < end && buffer[start] == ' ') start++;
        while (end > start && buffer[end - 1] == ' ') end--;

        // With an exclusive end, a single remaining byte (end == start + 1) is kept.
        return end > start ? Encoding.UTF8.GetString(buffer, start, end - start) : string.Empty;
    }
}
70 | }
71 |
--------------------------------------------------------------------------------
/System.Unicode.Build.DatabaseGenerator/Program.cs:
--------------------------------------------------------------------------------
1 | using System.Net.Http;
2 | using System.Threading.Tasks;
3 | using System.Unicode.Build.Core;
4 |
5 | namespace System.Unicode.Build.DatabaseGenerator
6 | {
/// <summary>Command-line entry point that generates the Unicode database file.</summary>
internal static class Program
{
    /// <summary>Generates the database from the command-line arguments.</summary>
    /// <param name="args">
    /// Two values, forwarded in order to <c>UnicodeDatabaseGenerator.GenerateDatabase</c>:
    /// the base directory, then the path of the database file.
    /// </param>
    private static async Task Main(string[] args)
    {
        // Fail with a readable message instead of an IndexOutOfRangeException when arguments are missing.
        if (args == null || args.Length < 2)
        {
            Console.Error.WriteLine("Expected arguments: <base directory> <database path>");
            Environment.ExitCode = 1;
            return;
        }

        // The sole purpose of this program is to generate the database consistently, using the runtime this project targets.
        using (var httpClient = new HttpClient())
        {
            await UnicodeDatabaseGenerator.GenerateDatabase(httpClient, args[0], args[1], null, null, null);
        }
    }
}
18 | }
19 |
--------------------------------------------------------------------------------
/System.Unicode.Build.DatabaseGenerator/Properties/launchSettings.json:
--------------------------------------------------------------------------------
1 | {
2 | "profiles": {
3 | "System.Unicode.Build.DatabaseGenerator": {
4 | "commandName": "Project",
5 | "commandLineArgs": ". ucd.dat"
6 | }
7 | }
8 | }
9 |
--------------------------------------------------------------------------------
/System.Unicode.Build.DatabaseGenerator/System.Unicode.Build.DatabaseGenerator.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Exe
5 | net7.0
6 | GenerateDatabase
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Tasks/AsyncTask.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Concurrent;
2 | using System.Resources;
3 | using System.Threading;
4 | using System.Threading.Tasks;
5 | using Microsoft.Build.Framework;
6 | #if NETSTANDARD2_0
7 | using BuildTask = Microsoft.Build.Utilities.Task;
8 | #else
9 | using BuildTask = Microsoft.Build.Utilities.AppDomainIsolatedTask;
10 | #endif
11 |
12 | namespace System.Unicode.Build.Tasks
13 | {
/// <summary>Base class for MSBuild tasks whose work is written as an asynchronous method.</summary>
/// <remarks>
/// <see cref="Execute"/> installs a temporary <see cref="SynchronizationContext"/> and runs the callbacks posted to it
/// on the calling thread until the task returned by <see cref="ExecuteAsync"/> has completed.
/// </remarks>
[RunInMTA]
[LoadInSeparateAppDomain]
public abstract class AsyncTask : BuildTask, ICancelableTask
{
    /// <summary>Synchronization context that queues posted callbacks; it is current from construction until disposal.</summary>
    private sealed class AsyncTaskSynchronizationContext : SynchronizationContext, IDisposable
    {
        private readonly BlockingCollection<(SendOrPostCallback d, object state)> _queuedMessages;
        private readonly SynchronizationContext _oldSynchronizationContext;

        public AsyncTaskSynchronizationContext(BlockingCollection<(SendOrPostCallback d, object state)> queuedMessages)
        {
            _queuedMessages = queuedMessages;
            _oldSynchronizationContext = Current;
            SetSynchronizationContext(this);
        }

        // Restores the context that was current when this one was installed.
        public void Dispose() => SetSynchronizationContext(_oldSynchronizationContext);

        public override void OperationStarted() => throw new NotSupportedException();

        public override void OperationCompleted() => throw new NotSupportedException();

        // Callbacks are only queued here; Execute dequeues and runs them.
        public override void Post(SendOrPostCallback d, object state) => _queuedMessages.Add((d, state));

        public override void Send(SendOrPostCallback d, object state) => throw new NotSupportedException();
    }

    // Non-null only while Execute is running.
    private CancellationTokenSource _cancellationTokenSource;

    protected AsyncTask()
    {
    }

    protected AsyncTask(ResourceManager taskResources) : base(taskResources)
    {
    }

    protected AsyncTask(ResourceManager taskResources, string helpKeywordPrefix) : base(taskResources, helpKeywordPrefix)
    {
    }

    /// <summary>Returns a token that becomes cancelled once <paramref name="task"/> has completed.</summary>
    private static CancellationToken CancelOnCompletion(Task task)
    {
        if (task.IsCompleted) return new CancellationToken(true);

        var cts = new CancellationTokenSource();

        task.ContinueWith
        (
            (t, state) =>
            {
                ((CancellationTokenSource)state).Cancel();
            },
            cts,
            TaskContinuationOptions.ExecuteSynchronously
        );

        return cts.Token;
    }

    /// <summary>Runs <see cref="ExecuteAsync"/> to completion, executing its posted callbacks on the calling thread.</summary>
    /// <returns>The result produced by <see cref="ExecuteAsync"/>.</returns>
    public sealed override bool Execute()
    {
        _cancellationTokenSource = new CancellationTokenSource();
        try
        {
            var queuedMessages = new BlockingCollection<(SendOrPostCallback callback, object state)>();

            using (new AsyncTaskSynchronizationContext(queuedMessages))
            {
                var task = ExecuteAsync(_cancellationTokenSource.Token);

                // Cancelled when the task completes, which unblocks the Take call below.
                var ct = CancelOnCompletion(task);

                while (!ct.IsCancellationRequested)
                {
                    SendOrPostCallback callback;
                    object state;

                    try
                    {
                        (callback, state) = queuedMessages.Take(ct);
                    }
                    catch (OperationCanceledException) when (ct.IsCancellationRequested || _cancellationTokenSource.IsCancellationRequested)
                    {
                        break;
                    }

                    callback(state);
                }

                // Surfaces the original exception instead of the AggregateException that Task.Result would throw.
                return task.GetAwaiter().GetResult();
            }
        }
        finally
        {
            _cancellationTokenSource.Dispose();
            _cancellationTokenSource = null;
        }
    }

    /// <summary>Requests cancellation of the running operation; does nothing when no execution is in progress.</summary>
    public void Cancel()
    {
        try
        {
            _cancellationTokenSource?.Cancel();
        }
        catch (ObjectDisposedException)
        {
            // Execute finished and disposed the source between the null check and the call: nothing left to cancel.
        }
    }

    /// <summary>Performs the work of the task.</summary>
    /// <param name="cancellationToken">Token that is cancelled when <see cref="Cancel"/> is called.</param>
    /// <returns>The value that <see cref="Execute"/> returns.</returns>
    protected abstract Task<bool> ExecuteAsync(CancellationToken cancellationToken);
}
118 | }
119 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Tasks/GenerateUnicodeDatabase.cs:
--------------------------------------------------------------------------------
1 | using System.IO;
2 | using System.Net.Http;
3 | using System.Resources;
4 | using System.Threading;
5 | using System.Threading.Tasks;
6 | using System.Unicode.Build.Core;
7 | using Microsoft.Build.Framework;
8 |
9 | namespace System.Unicode.Build.Tasks
10 | {
/// <summary>MSBuild task that generates the Unicode database file through <c>UnicodeDatabaseGenerator.GenerateDatabase</c>.</summary>
[RunInMTA]
public sealed class GenerateUnicodeDatabase : AsyncTask
{
    public GenerateUnicodeDatabase()
    {
    }

    public GenerateUnicodeDatabase(ResourceManager taskResources) : base(taskResources)
    {
    }

    public GenerateUnicodeDatabase(ResourceManager taskResources, string helpKeywordPrefix) : base(taskResources, helpKeywordPrefix)
    {
    }

    /// <summary>Path of the database file, passed to the generator.</summary>
    [Required]
    public string DatabasePath { get; set; }

    /// <summary>Base directory passed to the generator; the current directory is used when this is null or blank.</summary>
    public string IntermediateDirectory { get; set; }

    /// <summary>Optional flag forwarded unchanged to the generator.</summary>
    public bool? ShouldDownloadFiles { get; set; }

    /// <summary>Optional flag forwarded unchanged to the generator.</summary>
    public bool? ShouldSaveFiles { get; set; }

    /// <summary>Optional flag forwarded unchanged to the generator.</summary>
    public bool? ShouldExtractFiles { get; set; }

    /// <summary>Resolves the base directory and runs the generator.</summary>
    /// <param name="cancellationToken">Not forwarded to the generator by this implementation.</param>
    /// <returns><c>true</c> once generation has completed without throwing.</returns>
    protected override async Task<bool> ExecuteAsync(CancellationToken cancellationToken)
    {
        string requestedDirectory = IntermediateDirectory;
        string workingDirectory;

        if (string.IsNullOrWhiteSpace(requestedDirectory))
        {
            workingDirectory = Environment.CurrentDirectory;
        }
        else
        {
            workingDirectory = Path.GetFullPath(requestedDirectory);
        }

        using (var client = new HttpClient())
        {
            await UnicodeDatabaseGenerator.GenerateDatabase(client, workingDirectory, DatabasePath, ShouldDownloadFiles, ShouldSaveFiles, ShouldExtractFiles);
        }

        return true;
    }
}
53 | }
54 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Tasks/GetUnicodeDatabaseVersion.cs:
--------------------------------------------------------------------------------
1 | using System.Buffers.Binary;
2 | using System.IO;
3 | using System.IO.Compression;
4 | using System.Resources;
5 | using System.Threading;
6 | using System.Threading.Tasks;
7 | using Microsoft.Build.Framework;
8 |
9 | namespace System.Unicode.Build.Tasks
10 | {
/// <summary>MSBuild task that reads the Unicode version stored in the header of a generated database file.</summary>
[RunInMTA]
public sealed class GetUnicodeDatabaseVersion : AsyncTask
{
    // Header layout: 'U' 'C' 'D', the byte 2, a little-endian 16-bit major number, then minor and build bytes.
    private const int HeaderLength = 8;

    private static readonly byte[] HeaderSignature = { (byte)'U', (byte)'C', (byte)'D', 2 };

    public GetUnicodeDatabaseVersion()
    {
    }

    public GetUnicodeDatabaseVersion(ResourceManager taskResources) : base(taskResources)
    {
    }

    public GetUnicodeDatabaseVersion(ResourceManager taskResources, string helpKeywordPrefix) : base(taskResources, helpKeywordPrefix)
    {
    }

    /// <summary>Path of the deflate-compressed database file to inspect.</summary>
    [Required]
    public string DatabasePath { get; set; }

    /// <summary>Version read from the header, formatted as <c>major.minor.build</c>.</summary>
    [Output]
    public string UnicodeDatabaseVersion { get; private set; }

    /// <summary>Decompresses the first bytes of the database and extracts the version from its header.</summary>
    /// <param name="cancellationToken">Token used to cancel the read.</param>
    /// <returns><c>true</c> when a valid header was read; otherwise <c>false</c>, after logging an error.</returns>
    protected override async Task<bool> ExecuteAsync(CancellationToken cancellationToken)
    {
        var buffer = new byte[HeaderLength];
        int totalRead = 0;

        using (var file = new DeflateStream(File.OpenRead(DatabasePath), CompressionMode.Decompress))
        {
            // A single read may return fewer bytes than requested, so keep reading until the header is complete or the stream ends.
            while (totalRead < buffer.Length)
            {
                int read = await file.ReadAsync(buffer, totalRead, buffer.Length - totalRead, cancellationToken);

                if (read == 0) break;

                totalRead += read;
            }
        }

        // A truncated header is reported like any other invalid header.
        if (totalRead == buffer.Length && TryReadHeader(buffer, out var version))
        {
            UnicodeDatabaseVersion = version.ToString(3);
            return true;
        }

        Log.LogError("The database contained an invalid header.");

        return false;
    }

    /// <summary>Parses the 8-byte header.</summary>
    /// <param name="buffer">The first <see cref="HeaderLength"/> bytes of the decompressed database.</param>
    /// <param name="version">The decoded version, or <c>null</c> when the signature does not match.</param>
    /// <returns><c>true</c> when <paramref name="buffer"/> starts with the expected signature.</returns>
    private static bool TryReadHeader(ReadOnlySpan<byte> buffer, out Version version)
    {
        if (!buffer.StartsWith(HeaderSignature))
        {
            version = null;
            return false;
        }

        buffer = buffer.Slice(HeaderSignature.Length);

        ushort major = BinaryPrimitives.ReadUInt16LittleEndian(buffer);

        buffer = buffer.Slice(sizeof(ushort));

        byte minor = buffer[0];
        byte build = buffer[1];

        version = new Version(major, minor, build);
        return true;
    }
}
73 | }
74 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Tasks/System.Unicode.Build.Tasks.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | net472;netstandard2.0
5 | $(NoWarn);NETSDK1138
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Tasks/System.Unicode.Build.Tasks.props:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | false
5 | netstandard2.0
6 | net472
7 | TaskHostFactory
8 | $(MSBuildThisFileDirectory)bin\$(Configuration)\$(SystemUnicodeBuildTasksTargetFramework)\$(MSBuildThisFileName).dll
9 |
10 |
11 |
16 |
17 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/System.Unicode.Build.Tasks/System.Unicode.Build.Tasks.targets:
--------------------------------------------------------------------------------
1 |
2 |
3 |
9 |
10 |
11 | ../UnicodeVersion.txt
12 | ucd.dat
13 | false
14 | true
15 | $(UnicodeDatabaseName)
16 | true
17 |
18 |
19 |
20 |
21 | $(BaseIntermediateOutputPath)Unicode/
22 | $(BaseIntermediateOutputPath)$(UnicodeDatabaseName)
23 |
24 |
25 |
26 |
27 | ResolveProjectReferences
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
42 |
43 |
44 |
45 |
46 |
47 | true
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 | true
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
75 |
76 |
79 |
80 |
81 |
82 |
83 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
--------------------------------------------------------------------------------
/System.Unicode.Tests/CodePointEnumerableTests.cs:
--------------------------------------------------------------------------------
1 | using System.Collections;
2 | using System.Linq;
3 | using Xunit;
4 |
5 | namespace System.Unicode.Tests
6 | {
/// <summary>Tests for the strict code point enumeration returned by <c>AsCodePointEnumerable</c>.</summary>
public class CodePointEnumerableTests
{
    /// <summary>Pairs of expected code points and the UTF-16 text that encodes them.</summary>
    public static readonly TheoryData<int[], string> EnumerationTestData = new TheoryData<int[], string>
    {
        { new int[0], "" },
        { new int[] { 0x0041, 0x1F600, 0x00E9 }, "\u0041\U0001F600\u00E9" },
    };

    [Theory]
    [MemberData(nameof(EnumerationTestData))]
    public void EnumerationShouldHaveExpectedResults(int[] expectedCharacters, string text)
    {
        var codePoints = text.AsCodePointEnumerable();

        // Enumeration through the C# foreach statement.
        int visited = 0;

        foreach (int codePoint in codePoints)
        {
            Assert.Equal(expectedCharacters[visited], codePoint);
            visited++;
        }

        Assert.Equal(expectedCharacters.Length, visited);

        // Enumeration through the generic interface.
        Assert.Equal(expectedCharacters, codePoints.Select(codePoint => codePoint));

        // Enumeration through the non-generic interface.
        // Enumerable.Cast<> is avoided: a LINQ implementation could be smart and use the generic interface anyway.
        IEnumerator legacyEnumerator = ((IEnumerable)codePoints).GetEnumerator();
        int position = 0;

        while (legacyEnumerator.MoveNext())
        {
            Assert.True(position < expectedCharacters.Length);

            int current = Assert.IsType<int>(legacyEnumerator.Current);

            Assert.Equal(expectedCharacters[position], current);
            position++;
        }

        Assert.Equal(expectedCharacters.Length, position);
    }

    [Fact]
    public void NullArgumentShouldThrowArgumentNullException()
    {
        Assert.Throws<ArgumentNullException>(() =>
        {
            foreach (int codePoint in ((string)null).AsCodePointEnumerable())
            {
            }
        });
    }

    /// <summary>Strings containing unpaired or misordered surrogates.</summary>
    public static readonly TheoryData<XUnitSerializableString> EnumerationFailureTestData = new TheoryData<XUnitSerializableString>
    {
        "\uDA00",
        "\uDCD0",
        "\uDCD0\uDA00",
        "\u0041\uDA00",
        "\u0041\uDCD0",
        "\uDA00\u0041",
        "\uDCD0\u0041",
        "\uDA00\u0041\uDCD0\u0041",
        "\u0041\uDA00\u0041\uDCD0\u0041",
    };

    [Theory]
    [MemberData(nameof(EnumerationFailureTestData))]
    public void EnumerationOfInvalidUtf16StringsShouldThrowArgumentException(XUnitSerializableString text)
    {
        string invalidText = text;

        Assert.Throws<ArgumentException>(() =>
        {
            foreach (int codePoint in invalidText.AsCodePointEnumerable())
            {
            }
        });
    }
}
73 | }
74 |
--------------------------------------------------------------------------------
/System.Unicode.Tests/ImportRequestedUnicodeVersion.targets:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | ../UnicodeVersion.txt
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/System.Unicode.Tests/PermissiveCodePointEnumerableTests.cs:
--------------------------------------------------------------------------------
1 | using System.Collections;
2 | using System.Linq;
3 | using Xunit;
4 |
5 | namespace System.Unicode.Tests
6 | {
/// <summary>Tests for the permissive code point enumeration returned by <c>AsPermissiveCodePointEnumerable</c>.</summary>
public class PermissiveCodePointEnumerableTests
{
    /// <summary>Pairs of expected code points and UTF-16 text; unpaired surrogates are expected to be returned as-is.</summary>
    public static readonly TheoryData<int[], XUnitSerializableString> EnumerationTestData = new TheoryData<int[], XUnitSerializableString>
    {
        { new int[0], "" },
        { new int[] { 0xDA00 }, "\uDA00" },
        { new int[] { 0xDCD0 }, "\uDCD0" },
        { new int[] { 0xDCD0, 0xDA00 }, "\uDCD0\uDA00" },
        { new int[] { 0x0041, 0xDA00 }, "\u0041\uDA00" },
        { new int[] { 0x0041, 0xDCD0 }, "\u0041\uDCD0" },
        { new int[] { 0xDA00, 0x0041 }, "\uDA00\u0041" },
        { new int[] { 0xDCD0, 0x0041 }, "\uDCD0\u0041" },
        { new int[] { 0xDA00, 0x0041, 0xDCD0, 0x0041 }, "\uDA00\u0041\uDCD0\u0041" },
        { new int[] { 0x0041, 0xDA00, 0x0041, 0xDCD0, 0x0041 }, "\u0041\uDA00\u0041\uDCD0\u0041" },
        { new int[] { 0x0041, 0x1F600, 0x00E9 }, "\u0041\U0001F600\u00E9" },
    };

    [Theory]
    [MemberData(nameof(EnumerationTestData))]
    public void EnumerationShouldHaveExpectedResults(int[] expectedCharacters, XUnitSerializableString text)
    {
        string rawText = text;
        var codePoints = rawText.AsPermissiveCodePointEnumerable();

        // Enumeration through the C# foreach statement.
        int visited = 0;

        foreach (int codePoint in codePoints)
        {
            Assert.Equal(expectedCharacters[visited], codePoint);
            visited++;
        }

        Assert.Equal(expectedCharacters.Length, visited);

        // Enumeration through the generic interface.
        Assert.Equal(expectedCharacters, codePoints.Select(codePoint => codePoint));

        // Enumeration through the non-generic interface.
        // Enumerable.Cast<> is avoided: a LINQ implementation could be smart and use the generic interface anyway.
        IEnumerator legacyEnumerator = ((IEnumerable)codePoints).GetEnumerator();
        int position = 0;

        while (legacyEnumerator.MoveNext())
        {
            Assert.True(position < expectedCharacters.Length);

            int current = Assert.IsType<int>(legacyEnumerator.Current);

            Assert.Equal(expectedCharacters[position], current);
            position++;
        }

        Assert.Equal(expectedCharacters.Length, position);
    }

    [Fact]
    public void NullArgumentShouldThrowArgumentNullException()
    {
        Assert.Throws<ArgumentNullException>(() =>
        {
            foreach (int c in ((string)null).AsPermissiveCodePointEnumerable())
            {
            }
        });
    }
}
64 | }
65 |
--------------------------------------------------------------------------------
/System.Unicode.Tests/System.Unicode.Tests.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | net7.0
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 | all
17 | runtime; build; native; contentfiles; analyzers; buildtransitive
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/System.Unicode.Tests/UnicodeCodePointRangeTests.cs:
--------------------------------------------------------------------------------
1 | using System.Collections;
2 | using Xunit;
3 |
4 | namespace System.Unicode.Tests
5 | {
/// <summary>Tests for construction, formatting and enumeration of <c>UnicodeCodePointRange</c>.</summary>
public class UnicodeCodePointRangeTests
{
    [Theory]
    [InlineData(0, 0x10FFFF)]
    public void MultiCodePointRangeShouldHaveExpectedResults(int firstCodePoint, int lastCodePoint)
    {
        var range = new UnicodeCodePointRange(firstCodePoint, lastCodePoint);

        Assert.Equal(firstCodePoint, range.FirstCodePoint);
        Assert.Equal(lastCodePoint, range.LastCodePoint);
        Assert.False(range.IsSingleCodePoint);
    }

    [Theory]
    [InlineData((int)'A')]
    [InlineData(0x0)]
    [InlineData(0x10FFFF)]
    public void SingleCodePointRangeShouldHaveExpectedResults(int codePoint)
    {
        var range = new UnicodeCodePointRange(codePoint);

        Assert.Equal(codePoint, range.FirstCodePoint);
        Assert.Equal(codePoint, range.LastCodePoint);
        Assert.True(range.IsSingleCodePoint);
    }

    [Theory]
    [InlineData(0, 0, "0000")]
    [InlineData(0x1, 0x30, "0001..0030")]
    [InlineData(0x41, 0x5A, "0041..005A")]
    [InlineData(0x0, 0xFFFF, "0000..FFFF")]
    [InlineData(0xFFFF, 0xFFFF, "FFFF")]
    [InlineData(0xFFFF, 0x10000, "FFFF..10000")]
    [InlineData(0x10000, 0x10000, "10000")]
    [InlineData(0, 0xF0000, "0000..F0000")]
    [InlineData(0xFFFFF, 0xFFFFF, "FFFFF")]
    [InlineData(0, 0xFFFFF, "0000..FFFFF")]
    [InlineData(0, 0x10FFFF, "0000..10FFFF")]
    [InlineData(0xFFFF, 0x10FFFF, "FFFF..10FFFF")]
    [InlineData(0x1FFFF, 0x10FFFF, "1FFFF..10FFFF")]
    [InlineData(0x10FFFE, 0x10FFFF, "10FFFE..10FFFF")]
    [InlineData(0x10FFFF, 0x10FFFF, "10FFFF")]
    public void ToStringShouldProduceExpectedResultForCodePoints(int firstCodePoint, int lastCodePoint, string expectedResult)
    {
        var range = new UnicodeCodePointRange(firstCodePoint, lastCodePoint);

        Assert.Equal(expectedResult, range.ToString());
    }

    [Theory]
    [InlineData((int)'A', "0041")]
    [InlineData(0x0, "0000")]
    [InlineData(0xFFFF, "FFFF")]
    [InlineData(0x10000, "10000")]
    [InlineData(0x1FFFF, "1FFFF")]
    [InlineData(0xFFFFF, "FFFFF")]
    [InlineData(0x10FFFF, "10FFFF")]
    public void ToStringShouldProduceExpectedResultForCodePoint(int codePoint, string expectedResult)
    {
        var range = new UnicodeCodePointRange(codePoint);

        Assert.Equal(expectedResult, range.ToString());
    }

    // NOTE(review): the expected exception type was lost from this listing; ArgumentOutOfRangeException is assumed — confirm against the UnicodeCodePointRange constructors.
    [Theory]
    [InlineData(-1)]
    [InlineData(0x110000)]
    [InlineData(int.MaxValue)]
    public void ConstructorShouldFailForInvalidCodePoint(int codePoint)
        => Assert.Throws<ArgumentOutOfRangeException>(() => new UnicodeCodePointRange(codePoint));

    // NOTE(review): same assumption about the exception type as above.
    [Theory]
    [InlineData(-1, 10)]
    [InlineData(10, 0x110000)]
    [InlineData(-1, 0x110000)]
    public void ConstructorShouldFailForInvalidCodePoints(int firstCodePoint, int lastCodePoint)
        => Assert.Throws<ArgumentOutOfRangeException>(() => new UnicodeCodePointRange(firstCodePoint, lastCodePoint));

    [Theory]
    [InlineData(0xA3F, 0x105F)]
    public void EnumerationShouldHaveExpectedResults(int firstCodePoint, int lastCodePoint)
    {
        // Generic test
        {
            int i = firstCodePoint;

            foreach (int n in new UnicodeCodePointRange(firstCodePoint, lastCodePoint))
            {
                Assert.Equal(i++, n);
            }

            // Without this check an enumeration that stops early, or yields nothing, would pass.
            Assert.Equal(lastCodePoint + 1, i);
        }

        // Nongeneric test
        {
            int i = firstCodePoint;

            var enumerator = (IEnumerator)new UnicodeCodePointRange(firstCodePoint, lastCodePoint).GetEnumerator();

            while (enumerator.MoveNext())
            {
                Assert.Equal(i++, enumerator.Current);
            }

            Assert.Equal(lastCodePoint + 1, i);

            enumerator.Reset();

            Assert.True(enumerator.MoveNext());
            Assert.Equal(firstCodePoint, enumerator.Current);
        }
    }
}
116 | }
117 |
--------------------------------------------------------------------------------
/System.Unicode.Tests/UnicodeRationalNumerTests.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using Xunit;
3 |
4 | namespace System.Unicode.Tests
5 | {
/// <summary>Tests for equality, formatting and parsing of <c>UnicodeRationalNumber</c>.</summary>
public class UnicodeRationalNumerTests
{
    [Fact]
    public void DefaultValueShouldBeDetectedAsSuch()
    {
        Assert.True(default(UnicodeRationalNumber).IsDefaultValue);
        Assert.Equal(string.Empty, default(UnicodeRationalNumber).ToString());
    }

    public static readonly TheoryData<long> Numerators = new TheoryData<long>
    {
        0,
        1,
        long.MaxValue,
        long.MinValue
    };

    [Theory]
    [MemberData(nameof(Numerators))]
    public void NumbersAndFractionOverOneShouldBeEqual(long numerator)
    {
        Assert.Equal(new UnicodeRationalNumber(numerator), new UnicodeRationalNumber(numerator, 1));
        Assert.Equal(new UnicodeRationalNumber(numerator).GetHashCode(), new UnicodeRationalNumber(numerator, 1).GetHashCode());
    }

    [Theory]
    [InlineData("1/10", "10/1")]
    [InlineData("2/10", "1/10")]
    [InlineData("1/20", "1/10")]
    [InlineData("2/2", "1/1")]
    [InlineData("2/1", "1/2")]
    public void DifferentRationalNumbersShouldNotBeDeterminedEqual(string number1, string number2)
    {
        // Checked in both directions so that an asymmetric Equals implementation is caught.
        Assert.NotEqual(UnicodeRationalNumber.Parse(number1), UnicodeRationalNumber.Parse(number2));
        Assert.NotEqual(UnicodeRationalNumber.Parse(number2), UnicodeRationalNumber.Parse(number1));
    }

    public static readonly TheoryData<string, long, byte> StringConversionTestData = new TheoryData<string, long, byte>
    {
        { "0", 0, 1 },
        { "1", 1, 1 },
        { "1/100", 1, 100 },
        { "-20/7", -20, 7 },
        { "-5", -5, 1 },
        { "-9223372036854775808", long.MinValue, 1 },
        { "9223372036854775807", long.MaxValue, 1 },
        { "9223372036854775807/255", long.MaxValue, byte.MaxValue },
    };

    [Theory]
    [MemberData(nameof(StringConversionTestData))]
    public void MethodToStringShouldReturnExpectedResult(string expectedText, long numerator, byte denominator)
        => Assert.Equal(expectedText, new UnicodeRationalNumber(numerator, denominator).ToString());

    [Fact]
    public void ParsingNullValueShoudlFail()
        => Assert.Throws<ArgumentNullException>(() => UnicodeRationalNumber.Parse(null));

    // NOTE(review): the expected exception type was lost from this listing; ArgumentException is assumed — confirm against UnicodeRationalNumber.Parse.
    [Fact]
    public void ParsingEmptyValueShoudlFail()
        => Assert.Throws<ArgumentException>(() => UnicodeRationalNumber.Parse(string.Empty));

    [Theory]
    [InlineData(0, "0")]
    [InlineData(0, "0/1")]
    [InlineData(1, "1")]
    [InlineData(1, "1/1")]
    [InlineData(long.MaxValue, "9223372036854775807")]
    [InlineData(long.MaxValue, "9223372036854775807/1")]
    [InlineData(long.MinValue, "-9223372036854775808")]
    [InlineData(long.MinValue, "-9223372036854775808/1")]
    public void ParsingCanReturnSimpleNumber(long expectedNumber, string text)
        => Assert.Equal(new UnicodeRationalNumber(expectedNumber), UnicodeRationalNumber.Parse(text));

    // Each row appears once: a repeated row only produces a second test case with the same identity.
    public static readonly TheoryData<long, byte, string> FractionParsingTestData = new TheoryData<long, byte, string>
    {
        { 0, 1, "0" },
        { 0, 1, "0/1" },
        { 1, 1, "1" },
        { 1, 1, "1/1" },
        { 1, 10, "1/10" },
        { 1, 255, "1/255" },
        { 3, 4, "3/4" },
        { 6, 8, "6/8" },
        { long.MaxValue, 1, "9223372036854775807" },
        { long.MaxValue, 1, "9223372036854775807/1" },
        { long.MinValue, 1, "-9223372036854775808" },
        { long.MinValue, 1, "-9223372036854775808/1" },
        { long.MaxValue, byte.MaxValue, "9223372036854775807/255" },
    };

    [Theory]
    [MemberData(nameof(FractionParsingTestData))]
    public void ParsingCanReturnFraction(long expectedNumerator, byte expectedDenominator, string text)
        => Assert.Equal(new UnicodeRationalNumber(expectedNumerator, expectedDenominator), UnicodeRationalNumber.Parse(text));

    [Fact]
    public void EqualityComparisonAndHashCodeShouldWorkAsExpected()
    {
        var numbers = new UnicodeRationalNumber[]
        {
            default,
            new UnicodeRationalNumber(0),
            new UnicodeRationalNumber(1),
            new UnicodeRationalNumber(1, 10),
            new UnicodeRationalNumber(1, 100),
            new UnicodeRationalNumber(10),
            new UnicodeRationalNumber(100),
            new UnicodeRationalNumber(1000),
            new UnicodeRationalNumber(1000000),
            new UnicodeRationalNumber(1000000000),
            new UnicodeRationalNumber(1000000000000),
        };

        var hashSet = new HashSet<UnicodeRationalNumber>();

        // Verify that all numbers are unique
        foreach (var number in numbers)
        {
            Assert.True(hashSet.Add(number));
        }

        // Verify that all numbers are already in the set
        foreach (var number in numbers)
        {
            Assert.False(hashSet.Add(number));
        }
    }
}
132 | }
133 |
--------------------------------------------------------------------------------
/System.Unicode.Tests/UnihanCharacterDataTests.cs:
--------------------------------------------------------------------------------
1 | using System.Linq;
2 | using Xunit;
3 |
4 | namespace System.Unicode.Tests
5 | {
/// <summary>Tests for the code point packing helpers of <c>UnihanCharacterData</c>.</summary>
public sealed class UnihanCharacterDataTests
{
    // Retrieved once and shared by every test case.
    private static readonly UnicodeBlock[] Blocks = UnicodeInfo.GetBlocks();

    /// <summary>Checks that every code point of the named block is unchanged after being packed and unpacked.</summary>
    [Theory]
    [InlineData("CJK Unified Ideographs")]
    [InlineData("CJK Unified Ideographs Extension A")]
    [InlineData("CJK Unified Ideographs Extension B")]
    [InlineData("CJK Unified Ideographs Extension C")]
    [InlineData("CJK Unified Ideographs Extension D")]
    [InlineData("CJK Unified Ideographs Extension E")]
    [InlineData("CJK Unified Ideographs Extension F")]
    [InlineData("CJK Unified Ideographs Extension G")]
    [InlineData("CJK Unified Ideographs Extension H")]
    [InlineData("CJK Compatibility Ideographs")]
    [InlineData("CJK Compatibility Ideographs Supplement")]
    public void CodePointPackingShouldRoundTrip(string blockName)
    {
        // Single fails the test if the name matches no block, or more than one.
        var matchingBlock = Blocks.Single(candidate => candidate.Name == blockName);

        foreach (int original in matchingBlock.CodePointRange)
        {
            var packed = UnihanCharacterData.PackCodePoint(original);
            var roundTripped = UnihanCharacterData.UnpackCodePoint(packed);

            Assert.Equal(original, roundTripped);
        }
    }
}
32 | }
33 |
--------------------------------------------------------------------------------
/System.Unicode.Tests/XUnitSerializableString.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 | using Xunit.Abstractions;
3 |
4 | namespace System.Unicode.Tests
5 | {
// This class is needed because, somewhere in the unit testing process, strings containing invalid UTF-16 sequences are "fixed", which breaks the tests here.
// It is only a wrapper over a regular string: the data is serialized as an array of chars instead of a string, which avoids the problem.
public class XUnitSerializableString : IEquatable<XUnitSerializableString>, IXunitSerializable
{
    // Not readonly: Deserialize assigns it after the parameterless constructor has run.
    private string _value;

    public XUnitSerializableString() : this(null) { }

    public XUnitSerializableString(string value)
    {
        _value = value;
    }

    void IXunitSerializable.Deserialize(IXunitSerializationInfo info)
    {
        var chars = info.GetValue<char[]>("Chars");

        _value = chars != null ?
            new string(chars) :
            null;
    }

    void IXunitSerializable.Serialize(IXunitSerializationInfo info)
        => info.AddValue("Chars", _value?.ToCharArray(), typeof(char[]));

    /// <summary>Returns the wrapped text with every UTF-16 code unit written as a <c>\uXXXX</c> escape.</summary>
    /// <returns>The escaped text; a null or empty value is returned unchanged.</returns>
    public override string ToString()
    {
        if (string.IsNullOrEmpty(_value)) return _value;

        var sb = new StringBuilder(_value.Length * 6);

        foreach (char c in _value)
        {
            sb.Append(@"\u")
                .Append(((ushort)c).ToString("X4"));
        }

        return sb.ToString();
    }

    /// <summary>Compares the wrapped strings ordinally; a null <paramref name="other"/> is never equal.</summary>
    public bool Equals(XUnitSerializableString other) => !(other is null) && _value == other._value;

    public override bool Equals(object obj) => Equals(obj as XUnitSerializableString);

    // A null value is a legal state (parameterless constructor), and StringComparer.GetHashCode rejects null.
    public override int GetHashCode() => _value != null ? StringComparer.Ordinal.GetHashCode(_value) : 0;

    // A null wrapper converts to a null string instead of throwing.
    public static implicit operator string(XUnitSerializableString text) => text?._value;
    public static implicit operator XUnitSerializableString(string text) => new XUnitSerializableString(text);
}
53 | }
54 |
--------------------------------------------------------------------------------
/System.Unicode.snk:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hexawyz/NetUnicodeInfo/c2ab5227094d8f934b34fe6d3186c7f1e2be5e74/System.Unicode.snk
--------------------------------------------------------------------------------
/System.Unicode/BidirectionalClass.cs:
--------------------------------------------------------------------------------
1 | using System.ComponentModel.DataAnnotations;
2 |
3 | namespace System.Unicode
4 | {
5 | /// <summary>Represents possible values for the Bidi_Class unicode property.</summary>
6 | public enum BidirectionalClass : byte
7 | {
8 | /// <summary>Represents the value <c>Left_To_Right</c>.</summary>
9 | /// <remarks>Any strong left-to-right character.</remarks>
10 | [ValueName("L"), ValueName("Left_To_Right"), Display(Name = "Left_To_Right", Description = "Any strong left-to-right character.")]
11 | LeftToRight,
12 | /// <summary>Represents the value <c>Right_To_Left</c>.</summary>
13 | /// <remarks>Any strong right-to-left (non-Arabic-type) character.</remarks>
14 | [ValueName("R"), ValueName("Right_To_Left"), Display(Name = "Right_To_Left", Description = "Any strong right-to-left (non-Arabic-type) character.")]
15 | RightToLeft,
16 | /// <summary>Represents the value <c>Arabic_Letter</c>.</summary>
17 | /// <remarks>Any strong right-to-left (Arabic-type) character.</remarks>
18 | [ValueName("AL"), ValueName("Arabic_Letter"), Display(Name = "Arabic_Letter", Description = "Any strong right-to-left (Arabic-type) character.")]
19 | ArabicLetter,
20 | /// <summary>Represents the value <c>European_Number</c>.</summary>
21 | /// <remarks>Any ASCII digit or Eastern Arabic-Indic digit.</remarks>
22 | [ValueName("EN"), ValueName("European_Number"), Display(Name = "European_Number", Description = "Any ASCII digit or Eastern Arabic-Indic digit.")]
23 | EuropeanNumber,
24 | /// <summary>Represents the value <c>European_Separator</c>.</summary>
25 | /// <remarks>Plus and minus signs.</remarks>
26 | [ValueName("ES"), ValueName("European_Separator"), Display(Name = "European_Separator", Description = "Plus and minus signs.")]
27 | EuropeanSeparator,
28 | /// <summary>Represents the value <c>European_Terminator</c>.</summary>
29 | /// <remarks>A terminator in a numeric format context, includes currency signs.</remarks>
30 | [ValueName("ET"), ValueName("European_Terminator"), Display(Name = "European_Terminator", Description = "A terminator in a numeric format context, includes currency signs.")]
31 | EuropeanTerminator,
32 | /// <summary>Represents the value <c>Arabic_Number</c>.</summary>
33 | /// <remarks>Any Arabic-Indic digit.</remarks>
34 | [ValueName("AN"), ValueName("Arabic_Number"), Display(Name = "Arabic_Number", Description = "Any Arabic-Indic digit.")]
35 | ArabicNumber,
36 | /// <summary>Represents the value <c>Common_Separator</c>.</summary>
37 | /// <remarks>Commas, colons, and slashes.</remarks>
38 | [ValueName("CS"), ValueName("Common_Separator"), Display(Name = "Common_Separator", Description = "Commas, colons, and slashes.")]
39 | CommonSeparator,
40 | /// <summary>Represents the value <c>Nonspacing_Mark</c>.</summary>
41 | /// <remarks>Any nonspacing mark.</remarks>
42 | [ValueName("NSM"), ValueName("Nonspacing_Mark"), Display(Name = "Nonspacing_Mark", Description = "Any nonspacing mark.")]
43 | NonSpacingMark,
44 | /// <summary>Represents the value <c>Boundary_Neutral</c>.</summary>
45 | /// <remarks>Most format characters, control codes, or noncharacters.</remarks>
46 | [ValueName("BN"), ValueName("Boundary_Neutral"), Display(Name = "Boundary_Neutral", Description = "Most format characters, control codes, or noncharacters.")]
47 | BoundaryNeutral,
48 | /// <summary>Represents the value <c>Paragraph_Separator</c>.</summary>
49 | /// <remarks>Various newline characters.</remarks>
50 | [ValueName("B"), ValueName("Paragraph_Separator"), Display(Name = "Paragraph_Separator", Description = "Various newline characters.")]
51 | ParagraphSeparator,
52 | /// <summary>Represents the value <c>Segment_Separator</c>.</summary>
53 | /// <remarks>Various segment-related control codes.</remarks>
54 | [ValueName("S"), ValueName("Segment_Separator"), Display(Name = "Segment_Separator", Description = "Various segment-related control codes.")]
55 | SegmentSeparator,
56 | /// <summary>Represents the value <c>White_Space</c>.</summary>
57 | /// <remarks>Spaces.</remarks>
58 | [ValueName("WS"), ValueName("White_Space"), Display(Name = "White_Space", Description = "Spaces.")]
59 | WhiteSpace,
60 | /// <summary>Represents the value <c>Other_Neutral</c>.</summary>
61 | /// <remarks>Most other symbols and punctuation marks.</remarks>
62 | [ValueName("ON"), ValueName("Other_Neutral"), Display(Name = "Other_Neutral", Description = "Most other symbols and punctuation marks.")]
63 | OtherNeutral,
64 | /// <summary>Represents the value <c>Left_To_Right_Embedding</c>.</summary>
65 | /// <remarks>U+202A: the LR embedding control.</remarks>
66 | [ValueName("LRE"), ValueName("Left_To_Right_Embedding"), Display(Name = "Left_To_Right_Embedding", Description = "U+202A: the LR embedding control.")]
67 | LeftToRightEmbedding,
68 | /// <summary>Represents the value <c>Left_To_Right_Override</c>.</summary>
69 | /// <remarks>U+202D: the LR override control.</remarks>
70 | [ValueName("LRO"), ValueName("Left_To_Right_Override"), Display(Name = "Left_To_Right_Override", Description = "U+202D: the LR override control.")]
71 | LeftToRightOverride,
72 | /// <summary>Represents the value <c>Right_To_Left_Embedding</c>.</summary>
73 | /// <remarks>U+202B: the RL embedding control.</remarks>
74 | [ValueName("RLE"), ValueName("Right_To_Left_Embedding"), Display(Name = "Right_To_Left_Embedding", Description = "U+202B: the RL embedding control.")]
75 | RightToLeftEmbedding,
76 | /// <summary>Represents the value <c>Right_To_Left_Override</c>.</summary>
77 | /// <remarks>U+202E: the RL override control.</remarks>
78 | [ValueName("RLO"), ValueName("Right_To_Left_Override"), Display(Name = "Right_To_Left_Override", Description = "U+202E: the RL override control.")]
79 | RightToLeftOverride,
80 | /// <summary>Represents the value <c>Pop_Directional_Format</c>.</summary>
81 | /// <remarks>U+202C: terminates an embedding or override control.</remarks>
82 | [ValueName("PDF"), ValueName("Pop_Directional_Format"), Display(Name = "Pop_Directional_Format", Description = "U+202C: terminates an embedding or override control.")]
83 | PopDirectionalFormat,
84 | /// <summary>Represents the value <c>Left_To_Right_Isolate</c>.</summary>
85 | /// <remarks>U+2066: the LR isolate control.</remarks>
86 | [ValueName("LRI"), ValueName("Left_To_Right_Isolate"), Display(Name = "Left_To_Right_Isolate", Description = "U+2066: the LR isolate control.")]
87 | LeftToRightIsolate,
88 | /// <summary>Represents the value <c>Right_To_Left_Isolate</c>.</summary>
89 | /// <remarks>U+2067: the RL isolate control.</remarks>
90 | [ValueName("RLI"), ValueName("Right_To_Left_Isolate"), Display(Name = "Right_To_Left_Isolate", Description = "U+2067: the RL isolate control.")]
91 | RightToLeftIsolate,
92 | /// <summary>Represents the value <c>First_Strong_Isolate</c>.</summary>
93 | /// <remarks>U+2068: the first strong isolate control.</remarks>
94 | [ValueName("FSI"), ValueName("First_Strong_Isolate"), Display(Name = "First_Strong_Isolate", Description = "U+2068: the first strong isolate control.")]
95 | FirstStrongIsolate,
96 | /// <summary>Represents the value <c>Pop_Directional_Isolate</c>.</summary>
97 | /// <remarks>U+2069: terminates an isolate control.</remarks>
98 | [ValueName("PDI"), ValueName("Pop_Directional_Isolate"), Display(Name = "Pop_Directional_Isolate", Description = "U+2069: terminates an isolate control.")]
99 | PopDirectionalIsolate,
100 | }
101 | }
102 |
--------------------------------------------------------------------------------
/System.Unicode/CjkRadicalData.cs:
--------------------------------------------------------------------------------
1 | namespace System.Unicode
2 | {
3 |     // Code points describing one CJK radical, in its traditional and simplified forms.
4 | #if BUILD_SYSTEM
5 |     public
6 | #else
7 |     internal
8 | #endif
9 |     readonly struct CjkRadicalData
10 |     {
11 |         public readonly char TraditionalRadicalCodePoint;
12 |         public readonly char TraditionalCharacterCodePoint;
13 |         public readonly char SimplifiedRadicalCodePoint;
14 |         public readonly char SimplifiedCharacterCodePoint;
15 |
16 |         // Radical without a distinct simplified form: the simplified fields mirror the traditional ones.
17 |         internal CjkRadicalData(char radicalCodePoint, char characterCodePoint)
18 |             : this(radicalCodePoint, characterCodePoint, radicalCodePoint, characterCodePoint)
19 |         {
20 |         }
21 |
22 |         // Radical whose traditional and simplified forms are given separately.
23 |         internal CjkRadicalData(char traditionalRadicalCodePoint, char traditionalCharacterCodePoint, char simplifiedRadicalCodePoint, char simplifiedCharacterCodePoint)
24 |         {
25 |             SimplifiedRadicalCodePoint = simplifiedRadicalCodePoint;
26 |             SimplifiedCharacterCodePoint = simplifiedCharacterCodePoint;
27 |             TraditionalRadicalCodePoint = traditionalRadicalCodePoint;
28 |             TraditionalCharacterCodePoint = traditionalCharacterCodePoint;
29 |         }
30 |
31 |         // True as soon as either simplified code point differs from its traditional counterpart.
32 |         public bool HasSimplifiedForm
33 |         {
34 |             get
35 |             {
36 |                 bool sameRadical = SimplifiedRadicalCodePoint == TraditionalRadicalCodePoint;
37 |                 bool sameCharacter = SimplifiedCharacterCodePoint == TraditionalCharacterCodePoint;
38 |
39 |                 return !(sameRadical && sameCharacter);
40 |             }
41 |         }
42 |     }
43 | }
44 |
--------------------------------------------------------------------------------
/System.Unicode/CjkRadicalInfo.cs:
--------------------------------------------------------------------------------
1 | using System.Diagnostics;
2 |
3 | namespace System.Unicode
4 | {
5 |     /// <summary>Provides information on a specific CJK radical.</summary>
6 |     [DebuggerDisplay("{RadicalIndex} - {TraditionalRadicalCodePoint.ToString(),nq} / {SimplifiedRadicalCodePoint.ToString(),nq}")]
7 |     public readonly struct CjkRadicalInfo
8 |     {
9 |         /// <summary>The index of the radical in the Kangxi dictionary.</summary>
10 |         /// <remarks>There are 214 radicals, numbered from 1 to 214.</remarks>
11 |         public byte RadicalIndex { get; }
12 |
13 |         private readonly CjkRadicalData _radicalData;
14 |
15 |         internal CjkRadicalInfo(byte radicalIndex, CjkRadicalData radicalData)
16 |         {
17 |             RadicalIndex = radicalIndex;
18 |             _radicalData = radicalData;
19 |         }
20 |
21 |         /// <summary>Gets a code point representing the CJK radical in its traditional form.</summary>
22 |         public char TraditionalRadicalCodePoint
23 |         {
24 |             get { return _radicalData.TraditionalRadicalCodePoint; }
25 |         }
26 |
27 |         /// <summary>Gets the code point of a traditional character composed only of the CJK radical.</summary>
28 |         /// <remarks>
29 |         /// Usually, the glyph of this code point will be the same as the one used for <see cref="TraditionalRadicalCodePoint"/>.
30 |         /// However, the code point returned will have a meaning associated, contrary to the one returned by <see cref="TraditionalRadicalCodePoint"/>, which only represents the radical.
31 |         /// </remarks>
32 |         public char TraditionalCharacterCodePoint
33 |         {
34 |             get { return _radicalData.TraditionalCharacterCodePoint; }
35 |         }
36 |
37 |         /// <summary>Gets a code point representing the CJK radical in its simplified form, which may be the same as the traditional form.</summary>
38 |         /// <remarks>Most of the time, the value returned will be the same as <see cref="TraditionalRadicalCodePoint"/>.</remarks>
39 |         public char SimplifiedRadicalCodePoint
40 |         {
41 |             get { return _radicalData.SimplifiedRadicalCodePoint; }
42 |         }
43 |
44 |         /// <summary>Gets the code point of a simplified character composed only of the CJK radical.</summary>
45 |         /// <remarks>
46 |         /// Usually, the glyph of this code point will be the same as the one used for <see cref="SimplifiedRadicalCodePoint"/>.
47 |         /// However, the code point returned will have a meaning associated, contrary to the one returned by <see cref="SimplifiedRadicalCodePoint"/>, which only represents the radical.
48 |         /// </remarks>
49 |         public char SimplifiedCharacterCodePoint
50 |         {
51 |             get { return _radicalData.SimplifiedCharacterCodePoint; }
52 |         }
53 |
54 |         /// <summary>Gets a value indicating whether a simplified form exists for the given radical.</summary>
55 |         public bool HasSimplifiedForm
56 |         {
57 |             get { return _radicalData.HasSimplifiedForm; }
58 |         }
59 |     }
60 | }
61 |
--------------------------------------------------------------------------------
/System.Unicode/CodePointEnumerable.cs:
--------------------------------------------------------------------------------
1 | using System.Collections;
2 | using System.Collections.Generic;
3 |
4 | namespace System.Unicode
5 | {
6 |     /// <summary>Allows enumeration of the code points contained in an encapsulated string.</summary>
7 |     /// <remarks>
8 |     /// This enumerable will only allow enumeration of valid UTF-16 strings.
9 |     /// For incomplete or invalid UTF-16 strings, please use <see cref="PermissiveCodePointEnumerable"/> instead.
10 |     /// </remarks>
11 |     public readonly struct CodePointEnumerable : IEnumerable<int>
12 |     {
13 |         /// <summary>Initializes a new instance of the struct <see cref="CodePointEnumerable"/>.</summary>
14 |         /// <param name="text">The string whose code points must be enumerated; a <see langword="null"/> value throws <see cref="ArgumentNullException"/>.</param>
15 |         public CodePointEnumerable(string text) => Text = text ?? throw new ArgumentNullException(nameof(text));
16 |
17 |         /// <summary>Gets the text whose code points are being enumerated.</summary>
18 |         public string Text { get; }
19 |
20 |         /// <summary>Gets an enumerator which can be used to enumerate the code points in the text.</summary>
21 |         public CodePointEnumerator GetEnumerator() => new CodePointEnumerator(Text);
22 |
23 |         IEnumerator<int> IEnumerable<int>.GetEnumerator() => GetEnumerator();
24 |
25 |         IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
26 |     }
27 | }
28 |
--------------------------------------------------------------------------------
/System.Unicode/CodePointEnumerator.cs:
--------------------------------------------------------------------------------
1 | using System.Collections;
2 | using System.Collections.Generic;
3 |
4 | namespace System.Unicode
5 | {
6 |     /// <summary>Supports a standard iteration of code points in a <see cref="string"/>.</summary>
7 |     public struct CodePointEnumerator : IEnumerator<int>
8 |     {
9 |         private readonly string _text;
10 |         private int _current;
11 |         private int _index;
12 |
13 |         /// <summary>Initializes a new instance of the <see cref="CodePointEnumerator"/> struct.</summary>
14 |         /// <param name="text">The text whose code point should be enumerated.</param>
15 |         /// <exception cref="ArgumentNullException"><paramref name="text"/> is <see langword="null"/>.</exception>
16 |         public CodePointEnumerator(string text)
17 |         {
18 |             _text = text ?? throw new ArgumentNullException(nameof(text));
19 |             _current = 0;
20 |             _index = -1;
21 |         }
22 |
23 |         /// <summary>Gets the element in the collection at the current position of the enumerator.</summary>
24 |         /// <value>The element in the collection at the current position of the enumerator.</value>
25 |         public int Current => _current;
26 |
27 |         object IEnumerator.Current => _current;
28 |
29 |         void IDisposable.Dispose() { }
30 |
31 |         /// <summary>Advances the enumerator to the next element of the collection.</summary>
32 |         /// <returns><see langword="true"/> if the enumerator was successfully advanced to the next element; <see langword="false"/> if the enumerator has passed the end of the collection.</returns>
33 |         public bool MoveNext()
34 |         {
35 |             // Step over the previous code point: two UTF-16 code units when it was above 0xFFFF, one otherwise.
36 |             // From the initial state (_index = -1, _current = 0) this lands on index 0; once at or past the end, _index is no longer advanced.
37 |             if (_index < _text.Length && (_index += _current > 0xFFFF ? 2 : 1) < _text.Length)
38 |             {
39 |                 _current = char.ConvertToUtf32(_text, _index);
40 |                 return true;
41 |             }
42 |             else
43 |             {
44 |                 _current = 0;
45 |                 return false;
46 |             }
47 |         }
48 |
49 |         void IEnumerator.Reset() => (_current, _index) = (0, -1);
50 |     }
51 | }
52 |
--------------------------------------------------------------------------------
/System.Unicode/CompatibilityFormattingTag.cs:
--------------------------------------------------------------------------------
1 | using System.ComponentModel.DataAnnotations;
2 |
3 | namespace System.Unicode
4 | {
5 | /// <summary>Provides information on the kind of compatibility decomposition provided.</summary>
6 | /// <remarks>The default value of <see cref="Canonical"/> indicates canonical decomposition of the code point.</remarks>
7 | public enum CompatibilityFormattingTag : byte
8 | {
9 | /// <summary>Canonical form.</summary>
10 | Canonical = 0,
11 | /// <summary>Font variant (for example, a blackletter form).</summary>
12 | [ValueName("font"), Display(Name = "font", Description = "Font variant (for example, a blackletter form).")]
13 | Font,
14 | /// <summary>No-break version of a space or hyphen.</summary>
15 | [ValueName("noBreak"), Display(Name = "noBreak", Description = "No-break version of a space or hyphen.")]
16 | NoBreak,
17 | /// <summary>Initial presentation form (Arabic).</summary>
18 | [ValueName("initial"), Display(Name = "initial", Description = "Initial presentation form (Arabic).")]
19 | Initial,
20 | /// <summary>Medial presentation form (Arabic).</summary>
21 | [ValueName("medial"), Display(Name = "medial", Description = "Medial presentation form (Arabic).")]
22 | Medial,
23 | /// <summary>Final presentation form (Arabic).</summary>
24 | [ValueName("final"), Display(Name = "final", Description = "Final presentation form (Arabic).")]
25 | Final,
26 | /// <summary>Isolated presentation form (Arabic).</summary>
27 | [ValueName("isolated"), Display(Name = "isolated", Description = "Isolated presentation form (Arabic).")]
28 | Isolated,
29 | /// <summary>Encircled form.</summary>
30 | [ValueName("circle"), Display(Name = "circle", Description = "Encircled form.")]
31 | Circle,
32 | /// <summary>Superscript form.</summary>
33 | [ValueName("super"), Display(Name = "super", Description = "Superscript form.")]
34 | Super,
35 | /// <summary>Subscript form.</summary>
36 | [ValueName("sub"), Display(Name = "sub", Description = "Subscript form.")]
37 | Sub,
38 | /// <summary>Vertical layout presentation form.</summary>
39 | [ValueName("vertical"), Display(Name = "vertical", Description = "Vertical layout presentation form.")]
40 | Vertical,
41 | /// <summary>Wide (or zenkaku) compatibility character.</summary>
42 | [ValueName("wide"), Display(Name = "wide", Description = "Wide (or zenkaku) compatibility character.")]
43 | Wide,
44 | /// <summary>Narrow (or hankaku) compatibility character.</summary>
45 | [ValueName("narrow"), Display(Name = "narrow", Description = "Narrow (or hankaku) compatibility character.")]
46 | Narrow,
47 | /// <summary>Small variant form (CNS compatibility).</summary>
48 | [ValueName("small"), Display(Name = "small", Description = "Small variant form (CNS compatibility).")]
49 | Small,
50 | /// <summary>CJK squared font variant.</summary>
51 | [ValueName("square"), Display(Name = "square", Description = "CJK squared font variant.")]
52 | Square,
53 | /// <summary>Vulgar fraction form.</summary>
54 | [ValueName("fraction"), Display(Name = "fraction", Description = "Vulgar fraction form.")]
55 | Fraction,
56 | /// <summary>Otherwise unspecified compatibility character.</summary>
57 | [ValueName("compat"), Display(Name = "compat", Description = "Otherwise unspecified compatibility character.")]
58 | Compat,
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/System.Unicode/CoreProperties.cs:
--------------------------------------------------------------------------------
1 | using System.ComponentModel.DataAnnotations;
2 |
3 | namespace System.Unicode
4 | {
5 | /// <summary>A bitmask of the various available core properties.</summary>
6 | /// <remarks>Core properties are normative, and derived from various properties as well as <see cref="ContributoryProperties"/>.</remarks>
7 | [Flags]
8 | public enum CoreProperties : int
9 | {
10 | // ⚠️ Be careful when adding new properties to the enum. Only up to 22 bits should be consumed.
11 |
12 | /// <summary>Represents the <c>Lowercase</c> property.</summary>
13 | [ValueName("Lowercase"), ValueName("Lower"), Display(Name = "Lowercase")]
14 | Lowercase = 0b_0000_0000_0000_0000_0000_0001,
15 | /// <summary>Represents the <c>Uppercase</c> property.</summary>
16 | [ValueName("Uppercase"), ValueName("Upper"), Display(Name = "Uppercase")]
17 | Uppercase = 0b_0000_0000_0000_0000_0000_0010,
18 | /// <summary>Represents the <c>Cased</c> property.</summary>
19 | [ValueName("Cased"), Display(Name = "Cased")]
20 | Cased = 0b_0000_0000_0000_0000_0000_0100,
21 | /// <summary>Represents the <c>Case_Ignorable</c> property.</summary>
22 | [ValueName("Case_Ignorable"), ValueName("CI"), Display(Name = "Case_Ignorable")]
23 | CaseIgnorable = 0b_0000_0000_0000_0000_0000_1000,
24 | /// <summary>Represents the <c>Changes_When_Lowercased</c> property.</summary>
25 | [ValueName("Changes_When_Lowercased"), ValueName("CWL"), Display(Name = "Changes_When_Lowercased")]
26 | ChangesWhenLowercased = 0b_0000_0000_0000_0000_0001_0000,
27 | /// <summary>Represents the <c>Changes_When_Uppercased</c> property.</summary>
28 | [ValueName("Changes_When_Uppercased"), ValueName("CWU"), Display(Name = "Changes_When_Uppercased")]
29 | ChangesWhenUppercased = 0b_0000_0000_0000_0000_0010_0000,
30 | /// <summary>Represents the <c>Changes_When_Titlecased</c> property.</summary>
31 | [ValueName("Changes_When_Titlecased"), ValueName("CWT"), Display(Name = "Changes_When_Titlecased")]
32 | ChangesWhenTitlecased = 0b_0000_0000_0000_0000_0100_0000,
33 | /// <summary>Represents the <c>Changes_When_Casefolded</c> property.</summary>
34 | [ValueName("Changes_When_Casefolded"), ValueName("CWCF"), Display(Name = "Changes_When_Casefolded")]
35 | ChangesWhenCasefolded = 0b_0000_0000_0000_0000_1000_0000,
36 | /// <summary>Represents the <c>Changes_When_Casemapped</c> property.</summary>
37 | [ValueName("Changes_When_Casemapped"), ValueName("CWCM"), Display(Name = "Changes_When_Casemapped")]
38 | ChangesWhenCasemapped = 0b_0000_0000_0000_0001_0000_0000,
39 | /// <summary>Represents the <c>Alphabetic</c> property.</summary>
40 | [ValueName("Alphabetic"), ValueName("Alpha"), Display(Name = "Alphabetic")]
41 | Alphabetic = 0b_0000_0000_0000_0010_0000_0000,
42 | /// <summary>Represents the <c>Default_Ignorable_Code_Point</c> property.</summary>
43 | [ValueName("Default_Ignorable_Code_Point"), ValueName("DI"), Display(Name = "Default_Ignorable_Code_Point")]
44 | DefaultIgnorableCodePoint = 0b_0000_0000_0000_0100_0000_0000,
45 | /// <summary>Represents the <c>Grapheme_Base</c> property.</summary>
46 | [ValueName("Grapheme_Base"), ValueName("Gr_Base"), Display(Name = "Grapheme_Base")]
47 | GraphemeBase = 0b_0000_0000_0000_1000_0000_0000,
48 | /// <summary>Represents the <c>Grapheme_Extend</c> property.</summary>
49 | [ValueName("Grapheme_Extend"), ValueName("Gr_Ext"), Display(Name = "Grapheme_Extend")]
50 | GraphemeExtend = 0b_0000_0000_0001_0000_0000_0000,
51 | /// <summary>Represents the <c>Grapheme_Link</c> property.</summary>
52 | [ValueName("Grapheme_Link"), ValueName("Gr_Link"), Display(Name = "Grapheme_Link")]
53 | GraphemeLink = 0b_0000_0000_0010_0000_0000_0000,
54 | /// <summary>Represents the <c>Math</c> property.</summary>
55 | [ValueName("Math"), Display(Name = "Math")]
56 | Math = 0b_0000_0000_0100_0000_0000_0000,
57 | /// <summary>Represents the <c>ID_Start</c> property.</summary>
58 | [ValueName("ID_Start"), ValueName("IDS"), Display(Name = "ID_Start")]
59 | IdentifierStart = 0b_0000_0000_1000_0000_0000_0000,
60 | /// <summary>Represents the <c>ID_Continue</c> property.</summary>
61 | [ValueName("ID_Continue"), ValueName("IDC"), Display(Name = "ID_Continue")]
62 | IdentifierContinue = 0b_0000_0001_0000_0000_0000_0000,
63 | /// <summary>Represents the <c>XID_Start</c> property.</summary>
64 | [ValueName("XID_Start"), ValueName("XIDS"), Display(Name = "XID_Start")]
65 | ExtendedIdentifierStart = 0b_0000_0010_0000_0000_0000_0000,
66 | /// <summary>Represents the <c>XID_Continue</c> property.</summary>
67 | [ValueName("XID_Continue"), ValueName("XIDC"), Display(Name = "XID_Continue")]
68 | ExtendedIdentifierContinue = 0b_0000_0100_0000_0000_0000_0000,
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/System.Unicode/EmojiProperties.cs:
--------------------------------------------------------------------------------
1 | using System.ComponentModel.DataAnnotations;
2 |
3 | namespace System.Unicode
4 | {
5 | /// <summary>A bitmask of the various available emoji properties.</summary>
6 | /// <remarks>Emoji properties are not formally part of UCD, but are specified by Unicode Technical Standard #51 (Unicode Emoji).</remarks>
7 | [Flags]
8 | public enum EmojiProperties : byte
9 | {
10 | // ⚠️ Only 6 bits can be used here at the moment. Refactoring of the encoding is required to use 8 or more bits.
11 | // Reason: EmojiProperties does not have its own bit in UcdFields.
12 |
13 | /// <summary>Represents the <c>Emoji</c> property.</summary>
14 | [ValueName("Emoji"), Display(Name = "Emoji")]
15 | Emoji = 0b_00_0001,
16 | /// <summary>Represents the <c>Emoji_Presentation</c> property.</summary>
17 | [ValueName("Emoji_Presentation"), ValueName("EPres"), Display(Name = "Emoji_Presentation")]
18 | EmojiPresentation = 0b_00_0010,
19 | /// <summary>Represents the <c>Emoji_Modifier</c> property.</summary>
20 | [ValueName("Emoji_Modifier"), ValueName("EMod"), Display(Name = "Emoji_Modifier")]
21 | EmojiModifier = 0b_01_0000, // NOTE(review): bit values are not in declaration order; presumably kept stable for already-encoded data. Confirm before renumbering.
22 | /// <summary>Represents the <c>Emoji_Modifier_Base</c> property.</summary>
23 | [ValueName("Emoji_Modifier_Base"), ValueName("EBase"), Display(Name = "Emoji_Modifier_Base")]
24 | EmojiModifierBase = 0b_00_0100,
25 | /// <summary>Represents the <c>Emoji_Component</c> property.</summary>
26 | [ValueName("Emoji_Component"), ValueName("EComp"), Display(Name = "Emoji_Component")]
27 | EmojiComponent = 0b_00_1000,
28 | /// <summary>Represents the <c>Extended_Pictographic</c> property.</summary>
29 | [ValueName("Extended_Pictographic"), ValueName("ExtPict"), Display(Name = "Extended_Pictographic")]
30 | ExtendedPictographic = 0b_10_0000,
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/System.Unicode/EnumHelper.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Linq;
3 | using System.Reflection;
4 |
5 | namespace System.Unicode
6 | {
7 |     /// <summary>Caches, per enum type, the names declared on each member through <see cref="ValueNameAttribute"/>.</summary>
8 |     /// <typeparam name="T">The enum type whose members are inspected.</typeparam>
9 |     internal static class EnumHelper<T>
10 |         where T : struct, Enum
11 |     {
12 |         // One dictionary per closed generic type, built by the static field initializer.
13 |         private static readonly Dictionary<T, string[]> ValueNameDictionary = CreateValueNameDictionary();
14 |
15 |         // Reflects over the public constant fields of T and collects the non-null names of their ValueName attributes.
16 |         private static Dictionary<T, string[]> CreateValueNameDictionary()
17 |         {
18 |             var type = typeof(T).GetTypeInfo();
19 |
20 |             if (!type.IsEnum) throw new InvalidOperationException();
21 |
22 |             return
23 |             (
24 |                 from field in type.DeclaredFields
25 |                 where field.IsPublic && field.IsLiteral
26 |                 select new KeyValuePair<T, string[]>
27 |                 (
28 |                     (T)field.GetValue(null),
29 |                     (
30 |                         from attr in field.GetCustomAttributes<ValueNameAttribute>()
31 |                         where attr.Name != null
32 |                         select attr.Name
33 |                     ).ToArray()
34 |                 )
35 |             ).ToDictionary(kvp => kvp.Key, kvp => kvp.Value);
36 |         }
37 |
38 |         /// <summary>Gets the names declared for the specified enum value.</summary>
39 |         /// <param name="value">The enum value to look up.</param>
40 |         /// <returns>The declared names, or <see langword="null"/> when the value is not a key of the cache.</returns>
41 |         public static string[] GetValueNames(T value) => ValueNameDictionary.TryGetValue(value, out string[] names) ? names : null;
42 |     }
43 | }
44 |
--------------------------------------------------------------------------------
/System.Unicode/GenerateUnicodeDatabase.proj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | false
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/System.Unicode/HangulInfo.cs:
--------------------------------------------------------------------------------
1 | namespace System.Unicode
2 | {
3 |     // Computes the algorithmic names of precomposed Hangul syllables.
4 |     internal static class HangulInfo
5 |     {
6 |         // Constants defined on page 144 of the Unicode 7.0 Standard (3.12)
7 |         private const ushort SBase = 0xAC00;
8 |         //private const ushort LBase = 0x1100;
9 |         //private const ushort VBase = 0x1161;
10 |         //private const ushort TBase = 0x11A7;
11 |         private const int LCount = 19;
12 |         private const int VCount = 21;
13 |         private const int TCount = 28;
14 |         private const int NCount = VCount * TCount;
15 |         private const int SCount = LCount * NCount;
16 |
17 |         // Short names of the LCount leading consonants.
18 |         private static readonly string[] JamoLTable =
19 |         {
20 |             "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
21 |             "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
22 |         };
23 |
24 |         // Short names of the VCount vowels.
25 |         private static readonly string[] JamoVTable =
26 |         {
27 |             "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
28 |             "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
29 |             "YU", "EU", "YI", "I"
30 |         };
31 |
32 |         // Short names of the TCount trailing consonants; the first entry is empty (no trailing consonant).
33 |         private static readonly string[] JamoTTable =
34 |         {
35 |             "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
36 |             "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
37 |             "S", "SS", "NG", "J", "C", "K", "T", "P", "H"
38 |         };
39 |
40 |         // Algorithm defined on page 150 of the Unicode 7.0 Standard (3.12)
41 |         // Throws ArgumentOutOfRangeException when codePoint lies outside [SBase, SBase + SCount).
42 |         internal static string GetHangulName(char codePoint)
43 |         {
44 |             if (!IsHangul(codePoint)) throw new ArgumentOutOfRangeException(nameof(codePoint));
45 |
46 |             int syllableIndex = codePoint - SBase;
47 |
48 |             string leading = JamoLTable[syllableIndex / NCount];
49 |             string vowel = JamoVTable[(syllableIndex % NCount) / TCount];
50 |             string trailing = JamoTTable[syllableIndex % TCount];
51 |
52 |             return string.Concat("HANGUL SYLLABLE ", leading, vowel, trailing);
53 |         }
54 |
55 |         // True when codePoint falls inside the SCount code points starting at SBase.
56 |         internal static bool IsHangul(int codePoint)
57 |         {
58 |             return SBase <= codePoint && codePoint < SBase + SCount;
59 |         }
60 |     }
61 | }
62 |
--------------------------------------------------------------------------------
/System.Unicode/PermissiveCodePointEnumerable.cs:
--------------------------------------------------------------------------------
1 | using System.Collections;
2 | using System.Collections.Generic;
3 |
4 | namespace System.Unicode
5 | {
6 |     /// <summary>Allows enumeration of the code points contained in an encapsulated string, even when this one contains lone surrogates.</summary>
7 |     /// <remarks>
8 |     /// This enumerable will allow enumeration of UTF-16 strings containing lone surrogates.
9 |     /// For a more conformant enumeration of code points, please use <see cref="CodePointEnumerable"/> instead.
10 |     /// </remarks>
11 |     public readonly struct PermissiveCodePointEnumerable : IEnumerable<int>
12 |     {
13 |         /// <summary>Initializes a new instance of the struct <see cref="PermissiveCodePointEnumerable"/>.</summary>
14 |         /// <param name="text">The string whose code points must be enumerated; a <see langword="null"/> value throws <see cref="ArgumentNullException"/>.</param>
15 |         public PermissiveCodePointEnumerable(string text) => Text = text ?? throw new ArgumentNullException(nameof(text));
16 |
17 |         /// <summary>Gets the text whose code points are being enumerated.</summary>
18 |         public string Text { get; }
19 |
20 |         /// <summary>Gets an enumerator which can be used to enumerate the code points in the text.</summary>
21 |         /// <returns>A new <see cref="PermissiveCodePointEnumerator"/> over <see cref="Text"/>.</returns>
22 |         public PermissiveCodePointEnumerator GetEnumerator() => new PermissiveCodePointEnumerator(Text);
23 |
24 |         IEnumerator<int> IEnumerable<int>.GetEnumerator() => GetEnumerator();
25 |
26 |         IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
27 |     }
28 | }
29 |
--------------------------------------------------------------------------------
/System.Unicode/PermissiveCodePointEnumerator.cs:
--------------------------------------------------------------------------------
1 | using System.Collections;
2 | using System.Collections.Generic;
3 |
4 | namespace System.Unicode
5 | {
6 |     /// <summary>Supports a permissive iteration of code points in a <see cref="string"/>.</summary>
7 |     public struct PermissiveCodePointEnumerator : IEnumerator<int>
8 |     {
9 |         private readonly string _text;
10 |         private int _current;
11 |         private int _index;
12 |
13 |         /// <summary>Initializes a new instance of the <see cref="PermissiveCodePointEnumerator"/> struct.</summary>
14 |         /// <param name="text">The text whose code point should be enumerated.</param>
15 |         /// <exception cref="ArgumentNullException"><paramref name="text"/> is <see langword="null"/>.</exception>
16 |         public PermissiveCodePointEnumerator(string text)
17 |         {
18 |             _text = text ?? throw new ArgumentNullException(nameof(text));
19 |             _current = 0;
20 |             _index = -1;
21 |         }
22 |
23 |         /// <summary>Gets the element in the collection at the current position of the enumerator.</summary>
24 |         /// <value>The element in the collection at the current position of the enumerator.</value>
25 |         public int Current => _current;
26 |
27 |         object IEnumerator.Current => _current;
28 |
29 |         void IDisposable.Dispose() { }
30 |
31 |         /// <summary>Advances the enumerator to the next element of the collection.</summary>
32 |         /// <returns><see langword="true"/> if the enumerator was successfully advanced to the next element; <see langword="false"/> if the enumerator has passed the end of the collection.</returns>
33 |         public bool MoveNext()
34 |         {
35 |             // Step over the previous code point: two UTF-16 code units when it was above 0xFFFF (a combined pair), one otherwise.
36 |             // From the initial state (_index = -1, _current = 0) this lands on index 0; once at or past the end, _index is no longer advanced.
37 |             if (_index < _text.Length && (_index += _current > 0xFFFF ? 2 : 1) < _text.Length)
38 |             {
39 |                 _current = GetUtf32(_text, _index);
40 |                 return true;
41 |             }
42 |             else
43 |             {
44 |                 _current = 0;
45 |                 return false;
46 |             }
47 |         }
48 |
49 |         void IEnumerator.Reset() => (_current, _index) = (0, -1);
50 |
51 |         // Decodes the code point starting at index. A high surrogate immediately followed by a low surrogate is combined;
52 |         // anything else, including a lone surrogate, is returned as the single UTF-16 code unit found at index.
53 |         private static int GetUtf32(string s, int index)
54 |         {
55 |             char c1 = s[index];
56 |
57 |             if (char.IsHighSurrogate(c1) && ++index < s.Length)
58 |             {
59 |                 char c2 = s[index];
60 |
61 |                 if (char.IsLowSurrogate(c2)) return char.ConvertToUtf32(c1, c2);
62 |             }
63 |
64 |             return c1;
65 |         }
66 |     }
67 | }
68 |
--------------------------------------------------------------------------------
/System.Unicode/StringExtensions.cs:
--------------------------------------------------------------------------------
1 | namespace System.Unicode
2 | {
3 |     /// <summary>Contains extension methods applicable to the <see cref="string"/> type.</summary>
4 |     public static class StringExtensions
5 |     {
6 |         /// <summary>Encapsulates the string in an object which can be used to enumerate code points.</summary>
7 |         /// <remarks>
8 |         /// The enumerable returned by this method enumerates code points in a strict manner.
9 |         /// If the string contains lone surrogates, the enumeration will throw.
10 |         /// </remarks>
11 |         /// <param name="s">The string to encapsulate.</param>
12 |         /// <returns>An enumerable object, which can be used to enumerate code points in the string.</returns>
13 |         public static CodePointEnumerable AsCodePointEnumerable(this string s)
14 |         {
15 |             return new CodePointEnumerable(s);
16 |         }
17 |
18 |         /// <summary>Encapsulates the string in an object which can be used to enumerate code points in a permissive way.</summary>
19 |         /// <remarks>
20 |         /// The enumerable returned by this method is permissive, regarding the code points represented.
21 |         /// It allows invalid sequences, such as lone surrogates, the enumeration will handle those gracefully.
22 |         /// </remarks>
23 |         /// <param name="s">The string to encapsulate.</param>
24 |         /// <returns>An enumerable object, which can be used to enumerate code points in the string.</returns>
25 |         public static PermissiveCodePointEnumerable AsPermissiveCodePointEnumerable(this string s)
26 |         {
27 |             return new PermissiveCodePointEnumerable(s);
28 |         }
29 |     }
30 | }
31 |
--------------------------------------------------------------------------------
/System.Unicode/System.Unicode.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | net5.0;netstandard2.1;netstandard2.0;netstandard1.1;net45
5 | en-US
6 | true
7 | True
8 | true
9 | true
10 | true
11 | snupkg
12 | UnicodeInformation
13 | true
14 | $(MSBuildThisFileDirectory)=C:\Sources\NetUnicodeInfo\System.Unicode
15 | $(NoWarn);NETSDK1138
16 |
17 |
18 |
19 | $(DefineConstants);HAS_NATIVE_SPAN
20 |
21 |
22 |
23 | UnicodeInformation
24 | .NET Unicode Information Library
25 | .NET Unicode Information Library
26 | Library providing access to Unicode data to .NET clients.
27 | Unicode Unihan Data .NET C# String Text Char Character CodePoint Code Point
28 | MIT
29 | https://github.com/GoldenCrystal/NetUnicodeInfo
30 | packageIcon.png
31 | https://github.com/GoldenCrystal/NetUnicodeInfo.git
32 | git
33 | Version 2.7.1
34 | -------------
35 | Fix startup performance regression at the cost of more memory usage during startup.
36 |
37 | Version 2.7.0
38 | -------------
39 | Support for Unicode 15.0
40 |
41 | Version 2.6.0
42 | -------------
43 | Support for Unicode 14.0
44 | Bugfix in CjkRadicalData
45 | Reduce string allocations for a few methods on frameworks where native Span is available.
46 |
47 | Version 2.5.1
48 | -------------
49 | Fix for .NET 6
50 |
51 | Version 2.5.0
52 | -------------
53 | Support for Unicode 13.0.
54 |
55 | Version 2.4.0
55 | -------------
56 | Support for Unicode 12.1.
57 | Added the missing Emoji properties Extended_Pictographic and EmojiModifier that were missing. 🎉
58 | Structs that were immutable have been marked as readonly.
59 |
60 | Version 2.3.0
61 | -------------
62 | Support for Unicode 12.0.
63 | Target .NET Standard 2.0.
64 |
65 | Version 2.2.1
66 | -------------
67 | Added DebuggerDisplay attributes on various types.
68 |
69 | Version 2.2.0
70 | -------------
71 | Added emoji properties.
72 |
73 | Version 2.1.0
74 | -------------
75 | Support for Unicode 10.0.
76 |
77 | Version 2.0.0
78 | -------------
79 | Following migration to Unicode 9.0.0, UnicodeRadicalStrokeCount.StrokeCount is now of type System.SByte instead of type System.Byte.
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 | all
123 | runtime; build; native; contentfiles; analyzers; buildtransitive
124 |
125 |
126 |
127 |
128 |
129 | UnihanCharacterData.Generated.cs
130 | TextTemplatingFileGenerator
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 | True
141 | True
142 | UnihanCharacterData.tt
143 |
144 |
145 |
146 |
147 |
--------------------------------------------------------------------------------
/System.Unicode/UcdFields.cs:
--------------------------------------------------------------------------------
namespace System.Unicode
{
    /// <summary>Represents the fields available for an UCD entry.</summary>
    /// <remarks>Not all the enumeration members directly map to a field.</remarks>
    [Flags]
    internal enum UcdFields : ushort
    {
        // Not really a field, just here to indicate that the entry is a range.
        CodePointRange = 1 << 0,

        // Will stand in for the official name as well as related names.
        Name = 1 << 1,
        Category = 1 << 2,
        CanonicalCombiningClass = 1 << 3,
        BidirectionalClass = 1 << 4,
        DecompositionMapping = 1 << 5,

        // NumericType / NumericValue: not exactly a bit mask here, more like a value in [0…3] shifted left by 6.
        NumericDecimal = 1 << 6,
        NumericDigit = 2 << 6,
        NumericNumeric = 3 << 6,

        // This is a yes/no field, so no extra storage is required for this one.
        BidirectionalMirrored = 1 << 8,

        OldName = 1 << 9,
        SimpleUpperCaseMapping = 1 << 10,
        SimpleLowerCaseMapping = 1 << 11,
        SimpleTitleCaseMapping = 1 << 12,

        ContributoryProperties = 1 << 13,
        CorePropertiesAndEmojiProperties = 1 << 14,

        // The spelling of this member is kept as is, because the name is referenced outside of this enumeration.
        CrossRerefences = 1 << 15,
    }
}
36 |
--------------------------------------------------------------------------------
/System.Unicode/UnicodeBlock.cs:
--------------------------------------------------------------------------------
1 | using System.Diagnostics;
2 |
namespace System.Unicode
{
    /// <summary>Represents a Unicode block.</summary>
    [DebuggerDisplay("[{CodePointRange.ToString(),nq}] {Name,nq}")]
    public readonly struct UnicodeBlock
    {
        /// <summary>The code point range of this block.</summary>
        public readonly UnicodeCodePointRange CodePointRange;

        /// <summary>The name of this block.</summary>
        public readonly string Name;

        /// <summary>Initializes a new instance of the <see cref="UnicodeBlock"/> struct.</summary>
        /// <param name="codePointRange">The code point range stored in <see cref="CodePointRange"/>.</param>
        /// <param name="name">The name stored in <see cref="Name"/>.</param>
        internal UnicodeBlock(UnicodeCodePointRange codePointRange, string name)
            => (CodePointRange, Name) = (codePointRange, name);
    }
}
21 |
--------------------------------------------------------------------------------
/System.Unicode/UnicodeCategoryExtensions.cs:
--------------------------------------------------------------------------------
1 | using System.Globalization;
2 |
namespace System.Unicode
{
    /// <summary>Provides extensions to the <see cref="UnicodeCategory"/> type.</summary>
    public static class UnicodeCategoryExtensions
    {
        /// <summary>Gets the short name of the unicode category.</summary>
        /// <param name="category">The category whose short name should be retrieved.</param>
        /// <returns>The short name of the unicode category.</returns>
        public static string GetShortName(this UnicodeCategory category)
        {
            UnicodeCategoryInfo info = UnicodeCategoryInfo.Get(category);
            return info.ShortName;
        }

        /// <summary>Gets the long name of the unicode category.</summary>
        /// <param name="category">The category whose long name should be retrieved.</param>
        /// <returns>The long name of the unicode category.</returns>
        public static string GetLongName(this UnicodeCategory category)
        {
            UnicodeCategoryInfo info = UnicodeCategoryInfo.Get(category);
            return info.LongName;
        }
    }
}
21 |
--------------------------------------------------------------------------------
/System.Unicode/UnicodeCategoryInfo.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.Globalization;
3 |
namespace System.Unicode
{
    /// <summary>Provides complementary information on <see cref="UnicodeCategory"/> values.</summary>
    public readonly struct UnicodeCategoryInfo : IEquatable<UnicodeCategoryInfo>
    {
        // This table is indexed by the numeric value of UnicodeCategory (see Get), so entries must stay in enum order.
        private static readonly UnicodeCategoryInfo[] Categories =
        {
            new UnicodeCategoryInfo(UnicodeCategory.UppercaseLetter, "Lu", "Uppercase_Letter"),
            new UnicodeCategoryInfo(UnicodeCategory.LowercaseLetter, "Ll", "Lowercase_Letter"),
            new UnicodeCategoryInfo(UnicodeCategory.TitlecaseLetter, "Lt", "Titlecase_Letter"),
            new UnicodeCategoryInfo(UnicodeCategory.ModifierLetter, "Lm", "Modifier_Letter"),
            new UnicodeCategoryInfo(UnicodeCategory.OtherLetter, "Lo", "Other_Letter"),
            new UnicodeCategoryInfo(UnicodeCategory.NonSpacingMark, "Mn", "Nonspacing_Mark"),
            new UnicodeCategoryInfo(UnicodeCategory.SpacingCombiningMark, "Mc", "Spacing_Mark"),
            new UnicodeCategoryInfo(UnicodeCategory.EnclosingMark, "Me", "Enclosing_Mark"),
            new UnicodeCategoryInfo(UnicodeCategory.DecimalDigitNumber, "Nd", "Decimal_Number"),
            new UnicodeCategoryInfo(UnicodeCategory.LetterNumber, "Nl", "Letter_Number"),
            new UnicodeCategoryInfo(UnicodeCategory.OtherNumber, "No", "Other_Number"),
            new UnicodeCategoryInfo(UnicodeCategory.SpaceSeparator, "Zs", "Space_Separator"),
            new UnicodeCategoryInfo(UnicodeCategory.LineSeparator, "Zl", "Line_Separator"),
            new UnicodeCategoryInfo(UnicodeCategory.ParagraphSeparator, "Zp", "Paragraph_Separator"),
            new UnicodeCategoryInfo(UnicodeCategory.Control, "Cc", "Control"),
            new UnicodeCategoryInfo(UnicodeCategory.Format, "Cf", "Format"),
            new UnicodeCategoryInfo(UnicodeCategory.Surrogate, "Cs", "Surrogate"),
            new UnicodeCategoryInfo(UnicodeCategory.PrivateUse, "Co", "Private_Use"),
            new UnicodeCategoryInfo(UnicodeCategory.ConnectorPunctuation, "Pc", "Connector_Punctuation"),
            new UnicodeCategoryInfo(UnicodeCategory.DashPunctuation, "Pd", "Dash_Punctuation"),
            new UnicodeCategoryInfo(UnicodeCategory.OpenPunctuation, "Ps", "Open_Punctuation"),
            new UnicodeCategoryInfo(UnicodeCategory.ClosePunctuation, "Pe", "Close_Punctuation"),
            new UnicodeCategoryInfo(UnicodeCategory.InitialQuotePunctuation, "Pi", "Initial_Punctuation"),
            new UnicodeCategoryInfo(UnicodeCategory.FinalQuotePunctuation, "Pf", "Final_Punctuation"),
            new UnicodeCategoryInfo(UnicodeCategory.OtherPunctuation, "Po", "Other_Punctuation"),
            new UnicodeCategoryInfo(UnicodeCategory.MathSymbol, "Sm", "Math_Symbol"),
            new UnicodeCategoryInfo(UnicodeCategory.CurrencySymbol, "Sc", "Currency_Symbol"),
            new UnicodeCategoryInfo(UnicodeCategory.ModifierSymbol, "Sk", "Modifier_Symbol"),
            new UnicodeCategoryInfo(UnicodeCategory.OtherSymbol, "So", "Other_Symbol"),
            new UnicodeCategoryInfo(UnicodeCategory.OtherNotAssigned, "Cn", "Unassigned"),
        };

        // NB: These two fields must stay declared after Categories, because static field initializers run in textual order.
        private static readonly Dictionary<string, UnicodeCategory> UnicodeShortNameToCategoryDictionary = BuildNameDictionary(info => info.ShortName);
        private static readonly Dictionary<string, UnicodeCategory> UnicodeLongNameToCategoryDictionary = BuildNameDictionary(info => info.LongName);

        // Builds a case-insensitive lookup from the name picked by the selector to the matching category.
        private static Dictionary<string, UnicodeCategory> BuildNameDictionary(Func<UnicodeCategoryInfo, string> nameSelector)
        {
            var dictionary = new Dictionary<string, UnicodeCategory>(Categories.Length, StringComparer.OrdinalIgnoreCase);

            foreach (var info in Categories)
            {
                dictionary.Add(nameSelector(info), info.Category);
            }

            return dictionary;
        }

        private static UnicodeCategory GetCategoryFromShortName(string name)
            => UnicodeShortNameToCategoryDictionary[name];

        private static UnicodeCategory GetCategoryFromLongName(string name)
            => UnicodeLongNameToCategoryDictionary[name];

        /// <summary>Gets an <see cref="UnicodeCategoryInfo"/> value providing information on the specified unicode category.</summary>
        /// <param name="category">The category on which information should be retrieved.</param>
        /// <returns>Information on the specified category.</returns>
        public static UnicodeCategoryInfo Get(UnicodeCategory category) => Categories[(int)category];

        /// <summary>Gets an <see cref="UnicodeCategoryInfo"/> value providing information on the unicode category, accessed by its short name, as per the Unicode standard.</summary>
        /// <remarks>The lookup ignores case.</remarks>
        /// <param name="name">The short name for which information should be retrieved.</param>
        /// <returns>Information on the specified category.</returns>
        /// <exception cref="KeyNotFoundException">No category has the specified short name.</exception>
        public static UnicodeCategoryInfo FromShortName(string name) => Get(GetCategoryFromShortName(name));

        /// <summary>Gets an <see cref="UnicodeCategoryInfo"/> value providing information on the unicode category, accessed by its long name, as per the Unicode standard.</summary>
        /// <remarks>The lookup ignores case.</remarks>
        /// <param name="name">The long name for which information should be retrieved.</param>
        /// <returns>Information on the specified category.</returns>
        /// <exception cref="KeyNotFoundException">No category has the specified long name.</exception>
        public static UnicodeCategoryInfo FromLongName(string name) => Get(GetCategoryFromLongName(name));

        /// <summary>The unicode category described.</summary>
        public readonly UnicodeCategory Category;
        /// <summary>Short name of the category, as per the Unicode standard.</summary>
        public readonly string ShortName;
        /// <summary>Long name of the category, as per the Unicode standard.</summary>
        public readonly string LongName;

        private UnicodeCategoryInfo(UnicodeCategory category, string shortName, string longName)
        {
            Category = category;
            ShortName = shortName;
            LongName = longName;
        }

        /// <summary>Returns a <see cref="string"/> that represents this instance.</summary>
        /// <returns>A <see cref="string"/> that represents this instance.</returns>
        public override string ToString() => Category.ToString();

        /// <summary>Determines whether the specified <see cref="object"/> is equal to this instance.</summary>
        /// <param name="obj">The <see cref="object"/> to compare with this instance.</param>
        /// <returns><see langword="true"/> if the specified <see cref="object"/> is equal to this instance; otherwise, <see langword="false"/>.</returns>
        public override bool Equals(object obj) => obj is UnicodeCategoryInfo other && Equals(other);

        /// <summary>Indicates whether the current object is equal to another object of the same type.</summary>
        /// <remarks>
        /// The default value of this struct has a category value of zero, like the entry for
        /// <see cref="UnicodeCategory.UppercaseLetter"/>, but it has no names. The two are told apart by
        /// checking on both sides whether a short name is present, which keeps the comparison reflexive and symmetric.
        /// </remarks>
        /// <param name="other">An object to compare with this object.</param>
        /// <returns><see langword="true"/> if the current object is equal to the other parameter; otherwise, <see langword="false"/>.</returns>
        public bool Equals(UnicodeCategoryInfo other)
            => other.Category == Category && (other.ShortName is null) == (ShortName is null);

        /// <summary>Returns a hash code for this instance.</summary>
        /// <returns>A hash code for this instance, suitable for use in hashing algorithms and data structures like a hash table.</returns>
        public override int GetHashCode() => (int)Category;

        /// <summary>Performs an implicit conversion from <see cref="UnicodeCategoryInfo"/> to <see cref="UnicodeCategory"/>.</summary>
        /// <param name="info">The information.</param>
        /// <returns>The result of the conversion.</returns>
        public static implicit operator UnicodeCategory(UnicodeCategoryInfo info) => info.Category;
    }
}
129 |
--------------------------------------------------------------------------------
/System.Unicode/UnicodeCharacterData.cs:
--------------------------------------------------------------------------------
1 | using System.Globalization;
2 |
namespace System.Unicode
{
    /// <summary>Holds the data stored for one code point or one range of code points.</summary>
    internal readonly struct UnicodeCharacterData
    {
        public readonly UnicodeCodePointRange CodePointRange;
        public readonly string Name;
        public readonly UnicodeNameAlias[] NameAliases;
        public readonly UnicodeCategory Category;
        public readonly CanonicalCombiningClass CanonicalCombiningClass;
        public readonly BidirectionalClass BidirectionalClass;
        public readonly CompatibilityFormattingTag DecompositionType;
        public readonly string DecompositionMapping;
        public readonly UnicodeNumericType NumericType;
        private readonly UnicodeRationalNumber _numericValue;
        public readonly bool BidirectionalMirrored;
        public readonly string OldName;
        public readonly string SimpleUpperCaseMapping;
        public readonly string SimpleLowerCaseMapping;
        public readonly string SimpleTitleCaseMapping;
        public readonly ContributoryProperties ContributoryProperties;

        // Core properties and emoji properties are packed together in a single 32-bit value.
        private readonly int _corePropertiesAndEmojiProperties;

        public readonly int[] CrossRerefences; // NB: It seems that parsing NamesList is required in order to provide data for this field ?

        internal UnicodeCharacterData
        (
            UnicodeCodePointRange codePointRange,
            string name,
            UnicodeNameAlias[] nameAliases,
            UnicodeCategory category,
            CanonicalCombiningClass canonicalCombiningClass,
            BidirectionalClass bidirectionalClass,
            CompatibilityFormattingTag decompositionType,
            string decompositionMapping,
            UnicodeNumericType numericType,
            UnicodeRationalNumber numericValue,
            bool bidirectionalMirrored,
            string oldName,
            string simpleUpperCaseMapping,
            string simpleLowerCaseMapping,
            string simpleTitleCaseMapping,
            ContributoryProperties contributoryProperties,
            int corePropertiesAndEmojiProperties,
            int[] crossRerefences
        )
        {
            // Identification.
            CodePointRange = codePointRange;
            Name = name;
            NameAliases = nameAliases;
            OldName = oldName;

            // Classification.
            Category = category;
            CanonicalCombiningClass = canonicalCombiningClass;
            BidirectionalClass = bidirectionalClass;
            BidirectionalMirrored = bidirectionalMirrored;

            // Decomposition.
            DecompositionType = decompositionType;
            DecompositionMapping = decompositionMapping;

            // Numeric information.
            NumericType = numericType;
            _numericValue = numericValue;

            // Simple case mappings.
            SimpleUpperCaseMapping = simpleUpperCaseMapping;
            SimpleLowerCaseMapping = simpleLowerCaseMapping;
            SimpleTitleCaseMapping = simpleTitleCaseMapping;

            // Property sets and cross-references.
            ContributoryProperties = contributoryProperties;
            _corePropertiesAndEmojiProperties = corePropertiesAndEmojiProperties;
            CrossRerefences = crossRerefences;
        }

        /// <summary>Gets the core properties, read from the low 22 bits of the packed value.</summary>
        public CoreProperties CoreProperties
        {
            get { return (CoreProperties)(_corePropertiesAndEmojiProperties & 0x003FFFFF); }
        }

        /// <summary>Gets the emoji properties, read from the packed value shifted right by 24 bits.</summary>
        public EmojiProperties EmojiProperties
        {
            get { return (EmojiProperties)(_corePropertiesAndEmojiProperties >> 24); }
        }

        /// <summary>Gets the numeric value, or <see langword="null"/> when <see cref="NumericType"/> is <see cref="UnicodeNumericType.None"/>.</summary>
        public UnicodeRationalNumber? NumericValue
        {
            get
            {
                if (NumericType == UnicodeNumericType.None)
                {
                    return null;
                }

                return _numericValue;
            }
        }
    }
}
74 |
--------------------------------------------------------------------------------
/System.Unicode/UnicodeCodePointRange.cs:
--------------------------------------------------------------------------------
1 | using System.Collections;
2 | using System.Collections.Generic;
3 | using System.Globalization;
4 |
namespace System.Unicode
{
    /// <summary>Represents a range of Unicode code points.</summary>
    public readonly struct UnicodeCodePointRange : IEnumerable<int>
    {
        /// <summary>Represents an enumerator which enumerates through all the code points in the <see cref="UnicodeCodePointRange"/>.</summary>
        public struct Enumerator : IEnumerator<int>
        {
            private readonly int _start;
            private readonly int _end;
            private int _index;

            /// <summary>Initializes a new instance of the <see cref="Enumerator"/> struct.</summary>
            /// <param name="start">The start of the range.</param>
            /// <param name="end">The end of the range.</param>
            internal Enumerator(int start, int end)
            {
                _start = start;
                _end = end;
                // Positioned one before the start, so that the first MoveNext lands on the start.
                _index = start - 1;
            }

            /// <summary>Does nothing.</summary>
            public void Dispose() { }

            /// <summary>Gets the element in the collection at the current position of the enumerator.</summary>
            /// <value>The element in the collection at the current position of the enumerator.</value>
            public int Current => _index;

            object IEnumerator.Current => _index;

            /// <summary>Advances the enumerator to the next element of the collection.</summary>
            /// <returns><see langword="true"/> if the enumerator was successfully advanced to the next element; <see langword="false"/> if the enumerator has passed the end of the collection.</returns>
            public bool MoveNext()
            {
                if (_index >= _end)
                {
                    return false;
                }

                ++_index;
                return true;
            }

            void IEnumerator.Reset() => _index = _start - 1;
        }

        /// <summary>The first code point in the range.</summary>
        public readonly int FirstCodePoint;
        /// <summary>The last code point in the range.</summary>
        public readonly int LastCodePoint;

        /// <summary>Gets a value indicating whether this value represents a single code point.</summary>
        /// <value><see langword="true"/> if this value represents a single code point; otherwise, <see langword="false"/>.</value>
        public bool IsSingleCodePoint => FirstCodePoint == LastCodePoint;

        /// <summary>Initializes a new instance of the <see cref="UnicodeCodePointRange"/> struct for a single code point.</summary>
        /// <param name="codePoint">The code point.</param>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="codePoint"/> is less than 0 or greater than 0x10FFFF.</exception>
        public UnicodeCodePointRange(int codePoint)
        {
            if (codePoint < 0 || codePoint > 0x10FFFF) throw new ArgumentOutOfRangeException(nameof(codePoint));

            FirstCodePoint = codePoint;
            LastCodePoint = codePoint;
        }

        /// <summary>Initializes a new instance of the <see cref="UnicodeCodePointRange"/> struct with specified bounds.</summary>
        /// <param name="firstCodePoint">The first code point in the range.</param>
        /// <param name="lastCodePoint">The last code point in the range.</param>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="firstCodePoint"/> is less than 0 or greater than 0x10FFFF,
        /// or <paramref name="lastCodePoint"/> is less than <paramref name="firstCodePoint"/> or greater than 0x10FFFF.
        /// </exception>
        public UnicodeCodePointRange(int firstCodePoint, int lastCodePoint)
        {
            if (firstCodePoint < 0 || firstCodePoint > 0x10FFFF) throw new ArgumentOutOfRangeException(nameof(firstCodePoint));
            if (lastCodePoint < firstCodePoint || lastCodePoint > 0x10FFFF) throw new ArgumentOutOfRangeException(nameof(lastCodePoint));

            FirstCodePoint = firstCodePoint;
            LastCodePoint = lastCodePoint;
        }

        /// <summary>Determines whether the range contains the specific code point.</summary>
        /// <remarks>This method does not validate its inputs, but will always return <see langword="false"/> for any invalid code point.</remarks>
        /// <param name="i">The integer to check against the range.</param>
        /// <returns><see langword="true"/> if the range contains the specified code point; otherwise, <see langword="false"/>.</returns>
        public bool Contains(int i)
            // Since the first and last code points have been checked or are at their default value of zero, the method will always exclude invalid code points.
            => i >= FirstCodePoint & i <= LastCodePoint;

        // Returns -1 when the code point is below the range, 1 when it is above the range, and 0 when it is inside.
        internal int CompareCodePoint(int codePoint)
            => FirstCodePoint <= codePoint ? LastCodePoint < codePoint ? 1 : 0 : -1;

        /// <summary>Returns a <see cref="string"/> that represents this instance.</summary>
        /// <returns>A <see cref="string"/> that represents this instance.</returns>
        public override string ToString()
#if !HAS_NATIVE_SPAN
            => FirstCodePoint == LastCodePoint
                ? FirstCodePoint.ToString("X4", CultureInfo.InvariantCulture)
                : FirstCodePoint.ToString("X4", CultureInfo.InvariantCulture) + ".." + LastCodePoint.ToString("X4", CultureInfo.InvariantCulture);
#else
            => FirstCodePoint == LastCodePoint ? FirstCodePoint.ToString("X4", CultureInfo.InvariantCulture) : RangeToString();

        private string RangeToString()
        {
            // The longest possible text is "10FFFF..10FFFF", which is 14 characters.
            Span<char> buffer = stackalloc char[14];

            FirstCodePoint.TryFormat(buffer, out int length, "X4", CultureInfo.InvariantCulture);
            buffer.Slice(length, 2).Fill('.');
            length += 2;
            LastCodePoint.TryFormat(buffer[length..], out int l, "X4", CultureInfo.InvariantCulture);
            length += l;

            return buffer[..length].ToString();
        }
#endif

        /// <summary>Parses the specified <see cref="string"/> into a <see cref="UnicodeCodePointRange"/>.</summary>
        /// <remarks>Code point ranges are encoded as one unprefixed hexadecimal number for single code points, or a pair of unprefixed hexadecimal numbers separated by the characters "..".</remarks>
        /// <param name="s">The text to parse.</param>
        /// <returns>The parsed value.</returns>
        /// <exception cref="ArgumentNullException">The parameter <paramref name="s"/> is <see langword="null"/>.</exception>
        /// <exception cref="FormatException">The parameter <paramref name="s"/> was not in an allowed format.</exception>
        /// <exception cref="ArgumentOutOfRangeException">The parsed bounds are rejected by the constructor.</exception>
        public static UnicodeCodePointRange Parse(string s)
        {
            if (s is null) throw new ArgumentNullException(nameof(s));

            int start, end;

            // The separator is a machine-readable token: search for it ordinally, independently of the current culture.
            int rangeSeparatorOffset = s.IndexOf("..", StringComparison.Ordinal);

            if (rangeSeparatorOffset == 0) throw new FormatException();
            else if (rangeSeparatorOffset < 0)
            {
                start = end = int.Parse(s, NumberStyles.HexNumber, CultureInfo.InvariantCulture);
            }
            else
            {
#if HAS_NATIVE_SPAN
                start = int.Parse(s.AsSpan(0, rangeSeparatorOffset), NumberStyles.HexNumber, CultureInfo.InvariantCulture);
                end = int.Parse(s.AsSpan(rangeSeparatorOffset + 2), NumberStyles.HexNumber, CultureInfo.InvariantCulture);
#else
                start = int.Parse(s.Substring(0, rangeSeparatorOffset), NumberStyles.HexNumber, CultureInfo.InvariantCulture);
                end = int.Parse(s.Substring(rangeSeparatorOffset + 2), NumberStyles.HexNumber, CultureInfo.InvariantCulture);
#endif
            }

            return new UnicodeCodePointRange(start, end);
        }

        /// <summary>Returns an enumerator that iterates through the collection.</summary>
        /// <returns>An <see cref="Enumerator"/> that can be used to iterate through the collection.</returns>
        public Enumerator GetEnumerator() => new(FirstCodePoint, LastCodePoint);
        IEnumerator<int> IEnumerable<int>.GetEnumerator() => GetEnumerator();
        IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
    }
}
149 |
--------------------------------------------------------------------------------
/System.Unicode/UnicodeCrossReferenceCollection.cs:
--------------------------------------------------------------------------------
1 | using System.Collections;
2 | using System.Collections.Generic;
3 |
namespace System.Unicode
{
    /// <summary>Represents a collection of code point cross-references.</summary>
    /// <remarks>The default value of this struct behaves as an empty collection.</remarks>
    public readonly struct UnicodeCrossReferenceCollection : IList<int>
    {
        // Array.Empty<T>() is not used on these two targets, so a shared empty array is allocated once instead.
#if NETSTANDARD1_1 || NET45
        private static readonly int[] EmptyArray = new int[0];
#else
        private static int[] EmptyArray => Array.Empty<int>();
#endif

        /// <summary>Represents an enumerator for the <see cref="UnicodeCrossReferenceCollection"/> struct.</summary>
        public struct Enumerator : IEnumerator<int>
        {
            private readonly int[] _items;
            private int _index;

            /// <summary>Initializes a new instance of the <see cref="Enumerator"/> struct.</summary>
            /// <param name="items">The items to enumerate.</param>
            internal Enumerator(int[] items)
            {
                _items = items;
                _index = -1;
            }

            /// <summary>Does nothing.</summary>
            public void Dispose() { }

            /// <summary>Gets the element in the collection at the current position of the enumerator.</summary>
            /// <value>The element in the collection at the current position of the enumerator.</value>
            public int Current => _items[_index];
            object IEnumerator.Current => Current;

            /// <summary>Advances the enumerator to the next element of the collection.</summary>
            /// <returns><see langword="true"/> if the enumerator was successfully advanced to the next element; <see langword="false"/> if the enumerator has passed the end of the collection.</returns>
            public bool MoveNext() => _index < _items.Length && ++_index < _items.Length;

            void IEnumerator.Reset() => _index = -1;
        }

        /// <summary>Gets an empty <see cref="UnicodeCrossReferenceCollection"/> struct.</summary>
        public static readonly UnicodeCrossReferenceCollection Empty = new UnicodeCrossReferenceCollection(EmptyArray);

        private readonly int[] _items;

        internal UnicodeCrossReferenceCollection(int[] items) => _items = items ?? EmptyArray;

        // default(UnicodeCrossReferenceCollection) bypasses the constructor and leaves _items null,
        // so every member reads the items through this property.
        private int[] Items => _items ?? EmptyArray;

        /// <summary>Gets the cross-referenced code point at the specified index.</summary>
        /// <value>The cross-referenced code point.</value>
        /// <param name="index">The index.</param>
        /// <returns>The cross-referenced code point at the specified index.</returns>
        public int this[int index] => Items[index];

        int IList<int>.this[int index]
        {
            get => Items[index];
            set => throw new NotSupportedException();
        }

        /// <summary>Gets the number of elements contained in the <see cref="UnicodeCrossReferenceCollection"/>.</summary>
        /// <value>The number of elements contained in the <see cref="UnicodeCrossReferenceCollection"/>.</value>
        public int Count => Items.Length;

        bool ICollection<int>.IsReadOnly => true;

        void ICollection<int>.Add(int item) => throw new NotSupportedException();
        void IList<int>.Insert(int index, int item) => throw new NotSupportedException();

        bool ICollection<int>.Remove(int item) => throw new NotSupportedException();
        void IList<int>.RemoveAt(int index) => throw new NotSupportedException();

        void ICollection<int>.Clear() => throw new NotSupportedException();

        /// <summary>Determines the index of a specific item in the <see cref="UnicodeCrossReferenceCollection"/>.</summary>
        /// <param name="item">The object to locate in the <see cref="UnicodeCrossReferenceCollection"/>.</param>
        /// <returns>The index of the item if found in the list; otherwise, -1.</returns>
        public int IndexOf(int item) => Array.IndexOf(Items, item);

        /// <summary>Determines whether the <see cref="UnicodeCrossReferenceCollection"/> contains a specific value.</summary>
        /// <param name="item">The object to locate in the <see cref="UnicodeCrossReferenceCollection"/>.</param>
        /// <returns><see langword="true"/> if item is found in the <see cref="UnicodeCrossReferenceCollection"/>; <see langword="false"/> otherwise.</returns>
        public bool Contains(int item) => IndexOf(item) >= 0;

        /// <summary>
        /// Copies the elements of the UnicodeCrossReferenceCollection to an <see cref="Array"/>, starting at a particular <see cref="Array"/> index.
        /// </summary>
        /// <param name="array">The one-dimensional <see cref="Array"/> that is the destination of the elements to copy from UnicodeCrossReferenceCollection. The <see cref="Array"/> must have zero-based indexing.</param>
        /// <param name="arrayIndex">The zero-based index in array at which copy begins.</param>
        public void CopyTo(int[] array, int arrayIndex) => Items.CopyTo(array, arrayIndex);

        /// <summary>Returns an enumerator that iterates through the collection.</summary>
        /// <returns>An <see cref="Enumerator"/> that can be used to iterate through the collection.</returns>
        public Enumerator GetEnumerator() => new Enumerator(Items);

        IEnumerator<int> IEnumerable<int>.GetEnumerator() => GetEnumerator();
        IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
    }
}
111 |
--------------------------------------------------------------------------------
/System.Unicode/UnicodeNameAlias.cs:
--------------------------------------------------------------------------------
1 | using System.Diagnostics;
2 | using System.Linq;
3 |
namespace System.Unicode
{
    /// <summary>Represents a name alias for an Unicode code point.</summary>
    [DebuggerDisplay("{DisplayText,nq}")]
    public readonly struct UnicodeNameAlias
    {
        internal static readonly UnicodeNameAlias[] EmptyArray = new UnicodeNameAlias[0];

        /// <summary>Gets the alias name.</summary>
        /// <value>The name.</value>
        public string Name { get; }

        /// <summary>Gets the kind of alias.</summary>
        /// <value>The kind of alias.</value>
        public UnicodeNameAliasKind Kind { get; }

        // Text shown by the debugger: the name, preceded by the kind between angle brackets when a kind is set.
        private string DisplayText
        {
            get
            {
                var prefix = Kind != 0
                    ? "<" + EnumHelper.GetValueNames(Kind).FirstOrDefault() + "> "
                    : string.Empty;

                return prefix + Name;
            }
        }

        /// <summary>Initializes a new instance of the <see cref="UnicodeNameAlias"/> struct.</summary>
        /// <param name="name">The alias name.</param>
        /// <param name="kind">The kind of alias.</param>
        internal UnicodeNameAlias(string name, UnicodeNameAliasKind kind)
        {
            Kind = kind;
            Name = name;
        }
    }
}
29 |
--------------------------------------------------------------------------------
/System.Unicode/UnicodeNameAliasCollection.cs:
--------------------------------------------------------------------------------
1 | using System.Collections;
2 | using System.Collections.Generic;
3 |
namespace System.Unicode
{
    /// <summary>Represents a collection of name aliases.</summary>
    /// <remarks>The default value of this struct behaves as an empty collection.</remarks>
    public readonly struct UnicodeNameAliasCollection : IList<UnicodeNameAlias>
    {
        /// <summary>Represents an enumerator for the <see cref="UnicodeNameAliasCollection"/> struct.</summary>
        public struct Enumerator : IEnumerator<UnicodeNameAlias>
        {
            private readonly UnicodeNameAlias[] _items;
            private int _index;

            /// <summary>Initializes a new instance of the <see cref="Enumerator"/> struct.</summary>
            /// <param name="items">The items to enumerate.</param>
            internal Enumerator(UnicodeNameAlias[] items)
            {
                _items = items;
                _index = -1;
            }

            /// <summary>Does nothing.</summary>
            public void Dispose() { }

            /// <summary>Gets the element in the collection at the current position of the enumerator.</summary>
            /// <value>The element in the collection at the current position of the enumerator.</value>
            public UnicodeNameAlias Current => _items[_index];
            object IEnumerator.Current => Current;

            /// <summary>Advances the enumerator to the next element of the collection.</summary>
            /// <returns><see langword="true"/> if the enumerator was successfully advanced to the next element; <see langword="false"/> if the enumerator has passed the end of the collection.</returns>
            public bool MoveNext() => _index < _items.Length && ++_index < _items.Length;

            void IEnumerator.Reset() => _index = -1;
        }

        /// <summary>Gets an empty <see cref="UnicodeNameAliasCollection"/> struct.</summary>
        public static readonly UnicodeNameAliasCollection Empty = new UnicodeNameAliasCollection(UnicodeNameAlias.EmptyArray);

        private readonly UnicodeNameAlias[] _items;

        internal UnicodeNameAliasCollection(UnicodeNameAlias[] items) => _items = items ?? UnicodeNameAlias.EmptyArray;

        // default(UnicodeNameAliasCollection) bypasses the constructor and leaves _items null,
        // so every member reads the items through this property.
        private UnicodeNameAlias[] Items => _items ?? UnicodeNameAlias.EmptyArray;

        /// <summary>Gets the <see cref="UnicodeNameAlias"/> at the specified index.</summary>
        /// <value>The <see cref="UnicodeNameAlias"/>.</value>
        /// <param name="index">The index.</param>
        /// <returns>The <see cref="UnicodeNameAlias"/> at the specified index.</returns>
        public UnicodeNameAlias this[int index] => Items[index];

        UnicodeNameAlias IList<UnicodeNameAlias>.this[int index]
        {
            get => Items[index];
            set => throw new NotSupportedException();
        }

        /// <summary>Gets the number of elements contained in the <see cref="UnicodeNameAliasCollection"/>.</summary>
        /// <value>The number of elements contained in the <see cref="UnicodeNameAliasCollection"/>.</value>
        public int Count => Items.Length;

        bool ICollection<UnicodeNameAlias>.IsReadOnly => true;

        void ICollection<UnicodeNameAlias>.Add(UnicodeNameAlias item) => throw new NotSupportedException();
        void IList<UnicodeNameAlias>.Insert(int index, UnicodeNameAlias item) => throw new NotSupportedException();

        bool ICollection<UnicodeNameAlias>.Remove(UnicodeNameAlias item) => throw new NotSupportedException();
        void IList<UnicodeNameAlias>.RemoveAt(int index) => throw new NotSupportedException();

        void ICollection<UnicodeNameAlias>.Clear() => throw new NotSupportedException();

        /// <summary>Determines the index of a specific item in the <see cref="UnicodeNameAliasCollection"/>.</summary>
        /// <param name="item">The object to locate in the <see cref="UnicodeNameAliasCollection"/>.</param>
        /// <returns>The index of the item if found in the list; otherwise, -1.</returns>
        public int IndexOf(UnicodeNameAlias item) => Array.IndexOf(Items, item);

        /// <summary>Determines whether the <see cref="UnicodeNameAliasCollection"/> contains a specific value.</summary>
        /// <param name="item">The object to locate in the <see cref="UnicodeNameAliasCollection"/>.</param>
        /// <returns><see langword="true"/> if item is found in the <see cref="UnicodeNameAliasCollection"/>; <see langword="false"/> otherwise.</returns>
        public bool Contains(UnicodeNameAlias item) => IndexOf(item) >= 0;

        /// <summary>
        /// Copies the elements of the UnicodeNameAliasCollection to an <see cref="Array"/>, starting at a particular <see cref="Array"/> index.
        /// </summary>
        /// <param name="array">The one-dimensional <see cref="Array"/> that is the destination of the elements to copy from UnicodeNameAliasCollection. The <see cref="Array"/> must have zero-based indexing.</param>
        /// <param name="arrayIndex">The zero-based index in array at which copy begins.</param>
        public void CopyTo(UnicodeNameAlias[] array, int arrayIndex) => Items.CopyTo(array, arrayIndex);

        /// <summary>Returns an enumerator that iterates through the collection.</summary>
        /// <returns>An <see cref="Enumerator"/> that can be used to iterate through the collection.</returns>
        public Enumerator GetEnumerator() => new Enumerator(Items);

        IEnumerator<UnicodeNameAlias> IEnumerable<UnicodeNameAlias>.GetEnumerator() => GetEnumerator();
        IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
    }
}
96 |
--------------------------------------------------------------------------------
/System.Unicode/UnicodeNameAliasKind.cs:
--------------------------------------------------------------------------------
1 | namespace System.Unicode
2 | {
3 | /// <summary>Provides information on the kind of name alias provided for a code point.</summary>
4 | public enum UnicodeNameAliasKind : byte
5 | {
6 | /// <summary>The alias is a correction of a serious problem in the original name.</summary>
7 | [ValueName("correction")]
8 | Correction = 1,
9 | /// <summary>The alias provides the ISO 6429 name for C0 and C1 control functions of a control code, or another commonly occurring name for the control code.</summary>
10 | [ValueName("control")]
11 | Control = 2,
12 | /// <summary>The alias is a widely used alternate name for a format character.</summary>
13 | [ValueName("alternate")]
14 | Alternate = 3,
15 | /// <summary>The alias is a documented non-standardized label for C1 control code points.</summary>
16 | [ValueName("figment")]
17 | Figment = 4,
18 | /// <summary>The alias is a commonly occurring abbreviation (or acronym) for control codes, format characters, spaces, and variation selectors.</summary>
19 | [ValueName("abbreviation")]
20 | Abbreviation = 5
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/System.Unicode/UnicodeNumericType.cs:
--------------------------------------------------------------------------------
1 | namespace System.Unicode
2 | {
3 | /// <summary>Represents the value of the Numeric_Type property.</summary>
4 | public enum UnicodeNumericType : byte
5 | {
6 | /// <summary>The code point has no numeric value.</summary>
7 | None = 0,
8 | /// <summary>The code point represents a decimal digit which is part of a contiguous ascending range of characters from 0 to 9, and can be used in a decimal radix positional numeral system.</summary>
9 | Decimal = 1,
10 | /// <summary>The code point represents a digit between 0 and 9 and requires special handling.</summary>
11 | Digit = 2,
12 | /// <summary>The code point represents another kind of numeric value.</summary>
13 | Numeric = 3
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/System.Unicode/UnicodeRadicalStrokeCount.cs:
--------------------------------------------------------------------------------
1 | using System.Diagnostics;
2 |
3 | namespace System.Unicode
4 | {
5 |     /// <summary>Describes the radical and the additional stroke count recorded for a code point.</summary>
6 |     /// <remarks>Values of this type are usually associated with the property kRSUnicode (aka. Unicode_Radical_Stroke).</remarks>
7 |     [DebuggerDisplay(@"{IsSimplified ? ""Simplified"" : ""Traditional"",nq} Radical {Radical} + {StrokeCount} Strokes")]
8 |     public readonly struct UnicodeRadicalStrokeCount
9 |     {
10 | #if NETSTANDARD1_1 || NET45
11 |         internal static readonly UnicodeRadicalStrokeCount[] EmptyArray = new UnicodeRadicalStrokeCount[0];
12 | #endif
13 |
14 |         // Packed layout of RawStrokeCount: bit 7 is the simplified flag, bits 0-6 hold the stroke count.
15 |         private const int SimplifiedFlag = 0x80;
16 |         private const int StrokeCountMask = 0x7F;
17 |
18 |         // Inclusive bounds of the stroke counts that can be packed; negative counts are stored as 120..127.
19 |         private const int MinStrokeCount = -8;
20 |         private const int MaxStrokeCount = 119;
21 |
22 |         /// <summary>Initializes a new instance of the <see cref="UnicodeRadicalStrokeCount"/> struct from raw data.</summary>
23 |         /// <param name="rawRadical">The raw value to use for <see cref="Radical"/>.</param>
24 |         /// <param name="rawStrokeCount">The raw value to use for <see cref="RawStrokeCount"/>; it is stored as is, without validation.</param>
25 |         internal UnicodeRadicalStrokeCount(byte rawRadical, byte rawStrokeCount)
26 |         {
27 |             Radical = rawRadical;
28 |             RawStrokeCount = rawStrokeCount;
29 |         }
30 |
31 |         /// <summary>Initializes a new instance of the <see cref="UnicodeRadicalStrokeCount"/> struct from unpacked values.</summary>
32 |         /// <remarks><paramref name="strokeCount"/> must be between -8 and 119 included.</remarks>
33 |         /// <param name="radical">The index of the Kangxi radical of the character.</param>
34 |         /// <param name="strokeCount">The number of additional strokes required to form the character from the radical.</param>
35 |         /// <param name="isSimplified">Indicates whether the character is simplified.</param>
36 |         /// <exception cref="ArgumentOutOfRangeException"><paramref name="strokeCount"/> is outside of the allowed range of -8 to 119 inclusive.</exception>
37 |         internal UnicodeRadicalStrokeCount(byte radical, sbyte strokeCount, bool isSimplified)
38 |         {
39 |             // A plain 7-bit two's complement encoding is not enough, as some code points have more than 64 additional strokes.
40 |             // Negative stroke counts don't seem to go below -5 for now, so -8..119 is mapped to 120..127;0..119.
41 |             if (strokeCount < MinStrokeCount || strokeCount > MaxStrokeCount)
42 |             {
43 |                 throw new ArgumentOutOfRangeException(nameof(strokeCount));
44 |             }
45 |
46 |             // The simplified flag shares the byte with the stroke count.
47 |             int simplifiedBit = isSimplified ? SimplifiedFlag : 0x00;
48 |
49 |             Radical = radical;
50 |             RawStrokeCount = unchecked((byte)((strokeCount & StrokeCountMask) | simplifiedBit));
51 |         }
52 |
53 |         /// <summary>Gets the index of the Kangxi radical of the character.</summary>
54 |         /// <remarks>The Kangxi radicals are numbered from 1 to 214 inclusive.</remarks>
55 |         /// <value>The index of the Kangxi radical.</value>
56 |         public byte Radical { get; }
57 |
58 |         /// <summary>Gets the value of <see cref="StrokeCount"/> packed with <see cref="IsSimplified"/>.</summary>
59 |         /// <remarks>
60 |         /// The stroke count is stored as a 7bit value, together with the <see cref="IsSimplified"/> flag as a 1bit value.
61 |         /// Raw values between 120 and 127 represent negative stroke counts -8 to -1.
62 |         /// </remarks>
63 |         /// <value>The raw value of <see cref="StrokeCount"/>.</value>
64 |         internal byte RawStrokeCount { get; }
65 |
66 |         /// <summary>Gets the additional stroke count.</summary>
67 |         /// <value>The additional stroke count, decoded from the low seven bits of <see cref="RawStrokeCount"/>.</value>
68 |         public sbyte StrokeCount
69 |         {
70 |             get
71 |             {
72 |                 int packed = RawStrokeCount & StrokeCountMask;
73 |
74 |                 // Packed values above 119 stand for the negative counts -8..-1.
75 |                 if (packed > MaxStrokeCount)
76 |                 {
77 |                     packed -= 128;
78 |                 }
79 |
80 |                 return unchecked((sbyte)packed);
81 |             }
82 |         }
83 |
84 |         /// <summary>Gets a value indicating whether the information is based on the simplified form of the radical.</summary>
85 |         /// <value><see langword="true"/> if the information is based on the simplified form of the radical; otherwise, <see langword="false"/>.</value>
86 |         public bool IsSimplified => (RawStrokeCount & SimplifiedFlag) != 0;
87 |     }
88 | }
89 |
--------------------------------------------------------------------------------
/System.Unicode/UnicodeRadicalStrokeCountCollection.cs:
--------------------------------------------------------------------------------
1 | using System.Collections;
2 | using System.Collections.Generic;
3 |
4 | namespace System.Unicode
5 | {
6 |     /// <summary>Represents a read-only collection of values for the kRSUnicode (aka. Unicode_Radical_Stroke) property.</summary>
7 |     /// <remarks>
8 |     /// The default value of this struct behaves like <see cref="Empty"/>: its missing backing array is treated as an empty one.
9 |     /// All mutating <see cref="IList{T}"/> members throw <see cref="NotSupportedException"/>.
10 |     /// </remarks>
11 |     public readonly struct UnicodeRadicalStrokeCountCollection : IList<UnicodeRadicalStrokeCount>
12 |     {
13 |         /// <summary>Represents an enumerator for the <see cref="UnicodeRadicalStrokeCountCollection"/> struct.</summary>
14 |         public struct Enumerator : IEnumerator<UnicodeRadicalStrokeCount>
15 |         {
16 |             private readonly UnicodeRadicalStrokeCount[] _items;
17 |             private int _index;
18 |
19 |             /// <summary>Initializes a new instance of the <see cref="Enumerator"/> struct.</summary>
20 |             /// <param name="items">The items to enumerate.</param>
21 |             internal Enumerator(UnicodeRadicalStrokeCount[] items)
22 |             {
23 |                 _items = items;
24 |                 // Positioned before the first element until MoveNext is called.
25 |                 _index = -1;
26 |             }
27 |
28 |             /// <summary>Does nothing.</summary>
29 |             public void Dispose() { }
30 |
31 |             /// <summary>Gets the element in the collection at the current position of the enumerator.</summary>
32 |             /// <value>The element in the collection at the current position of the enumerator.</value>
33 |             public UnicodeRadicalStrokeCount Current => _items[_index];
34 |             object IEnumerator.Current => Current;
35 |
36 |             /// <summary>Advances the enumerator to the next element of the collection.</summary>
37 |             /// <returns>true if the enumerator was successfully advanced to the next element; false if the enumerator has passed the end of the collection.</returns>
38 |             // The first test keeps _index from growing any further once it has reached the end.
39 |             public bool MoveNext() => _index < _items.Length && ++_index < _items.Length;
40 |
41 |             void IEnumerator.Reset() => _index = -1;
42 |         }
43 |
44 |         /// <summary>Gets an empty <see cref="UnicodeRadicalStrokeCountCollection"/> struct.</summary>
45 |         public static readonly UnicodeRadicalStrokeCountCollection Empty = new UnicodeRadicalStrokeCountCollection(null);
46 |
47 |         // Null only for default(UnicodeRadicalStrokeCountCollection); always read through Items.
48 |         private readonly UnicodeRadicalStrokeCount[] _items;
49 |
50 |         /// <summary>Initializes a new instance of the <see cref="UnicodeRadicalStrokeCountCollection"/> struct.</summary>
51 |         /// <param name="items">The array to wrap (not copied); <see langword="null"/> is replaced by a shared empty array.</param>
52 |         internal UnicodeRadicalStrokeCountCollection(UnicodeRadicalStrokeCount[] items)
53 |             => _items = items ?? EmptyItems;
54 |
55 |         // Shared empty array; Array.Empty<T>() is not available on the oldest target frameworks.
56 |         private static UnicodeRadicalStrokeCount[] EmptyItems =>
57 | #if NETSTANDARD1_1 || NET45
58 |             UnicodeRadicalStrokeCount.EmptyArray;
59 | #else
60 |             Array.Empty<UnicodeRadicalStrokeCount>();
61 | #endif
62 |
63 |         // Backing array with the default-struct case mapped to an empty array, so that no member throws NullReferenceException.
64 |         private UnicodeRadicalStrokeCount[] Items => _items ?? EmptyItems;
65 |
66 |         /// <summary>Gets the <see cref="UnicodeRadicalStrokeCount"/> at the specified index.</summary>
67 |         /// <param name="index">The index.</param>
68 |         /// <value>The <see cref="UnicodeRadicalStrokeCount"/> at the specified index.</value>
69 |         public UnicodeRadicalStrokeCount this[int index] => Items[index];
70 |
71 |         UnicodeRadicalStrokeCount IList<UnicodeRadicalStrokeCount>.this[int index]
72 |         {
73 |             get => Items[index];
74 |             set => throw new NotSupportedException();
75 |         }
76 |
77 |         /// <summary>Gets the number of elements contained in the <see cref="UnicodeRadicalStrokeCountCollection"/>.</summary>
78 |         /// <value>The number of elements contained in the <see cref="UnicodeRadicalStrokeCountCollection"/>.</value>
79 |         public int Count => Items.Length;
80 |
81 |         bool ICollection<UnicodeRadicalStrokeCount>.IsReadOnly => true;
82 |
83 |         void ICollection<UnicodeRadicalStrokeCount>.Add(UnicodeRadicalStrokeCount item) => throw new NotSupportedException();
84 |         void IList<UnicodeRadicalStrokeCount>.Insert(int index, UnicodeRadicalStrokeCount item) => throw new NotSupportedException();
85 |
86 |         bool ICollection<UnicodeRadicalStrokeCount>.Remove(UnicodeRadicalStrokeCount item) => throw new NotSupportedException();
87 |         void IList<UnicodeRadicalStrokeCount>.RemoveAt(int index) => throw new NotSupportedException();
88 |
89 |         void ICollection<UnicodeRadicalStrokeCount>.Clear() => throw new NotSupportedException();
90 |
91 |         /// <summary>Determines the index of a specific item in the <see cref="UnicodeRadicalStrokeCountCollection"/>.</summary>
92 |         /// <param name="item">The object to locate in the <see cref="UnicodeRadicalStrokeCountCollection"/>.</param>
93 |         /// <returns>The index of the item if found in the list; otherwise, -1.</returns>
94 |         public int IndexOf(UnicodeRadicalStrokeCount item) => Array.IndexOf(Items, item);
95 |
96 |         /// <summary>Determines whether the <see cref="UnicodeRadicalStrokeCountCollection"/> contains a specific value.</summary>
97 |         /// <param name="item">The object to locate in the <see cref="UnicodeRadicalStrokeCountCollection"/>.</param>
98 |         /// <returns><see langword="true"/> if item is found in the <see cref="UnicodeRadicalStrokeCountCollection"/>; <see langword="false"/> otherwise.</returns>
99 |         public bool Contains(UnicodeRadicalStrokeCount item) => IndexOf(item) >= 0;
100 |
101 |         /// <summary>Copies the elements of the UnicodeRadicalStrokeCountCollection to an <see cref="Array"/>, starting at a particular index.</summary>
102 |         /// <param name="array">The one-dimensional <see cref="Array"/> that is the destination of the elements to copy from UnicodeRadicalStrokeCountCollection. The <see cref="Array"/> must have zero-based indexing.</param>
103 |         /// <param name="arrayIndex">The zero-based index in array at which copying begins.</param>
104 |         public void CopyTo(UnicodeRadicalStrokeCount[] array, int arrayIndex) => Items.CopyTo(array, arrayIndex);
105 |
106 |         /// <summary>Returns an enumerator that iterates through the collection.</summary>
107 |         /// <returns>An <see cref="Enumerator"/> that can be used to iterate through the collection.</returns>
108 |         public Enumerator GetEnumerator() => new Enumerator(Items);
109 |
110 |         IEnumerator<UnicodeRadicalStrokeCount> IEnumerable<UnicodeRadicalStrokeCount>.GetEnumerator() => GetEnumerator();
111 |         IEnumerator IEnumerable.GetEnumerator() => GetEnumerator();
112 |     }
113 | }
114 |
--------------------------------------------------------------------------------
/System.Unicode/UnicodeRationalNumber.cs:
--------------------------------------------------------------------------------
1 | using System.Globalization;
2 |
3 | namespace System.Unicode
4 | {
5 |     /// <summary>Represents a rational number in a format compatible with the Unicode standard.</summary>
6 |     public readonly struct UnicodeRationalNumber : IEquatable<UnicodeRationalNumber>
7 |     {
8 |         /// <summary>Parses a rational number from a string representation.</summary>
9 |         /// <remarks>
10 |         /// Valid text representations should match the regex pattern /-?[0-9]+(?:\/[0-9]+)?/.
11 |         /// The numerator part should fit in a <see cref="long"/>, and the denominator part should fit in a <see cref="ushort"/>.
12 |         /// Parsing always uses the invariant culture, so the result does not depend on the current culture.
13 |         /// </remarks>
14 |         /// <param name="s">The string to parse.</param>
15 |         /// <returns>The rational number parsed from the string; the denominator is 1 when the string contains no fraction bar.</returns>
16 |         /// <exception cref="ArgumentNullException">The parameter <paramref name="s"/> is <see langword="null"/>.</exception>
17 |         /// <exception cref="ArgumentException">The parameter <paramref name="s"/> is empty.</exception>
18 |         public static UnicodeRationalNumber Parse(string s)
19 |         {
20 |             if (s == null)
21 |             {
22 |                 throw new ArgumentNullException(nameof(s));
23 |             }
24 |
25 |             if (s.Length == 0)
26 |             {
27 |                 throw new ArgumentException("The string to parse must not be empty.", nameof(s));
28 |             }
29 |
30 |             int fractionBarIndex = s.IndexOf('/');
31 |
32 |             // No fraction bar: the whole string is the numerator of an integer value.
33 |             if (fractionBarIndex < 0)
34 |             {
35 |                 return new UnicodeRationalNumber(long.Parse(s, NumberStyles.Integer, CultureInfo.InvariantCulture));
36 |             }
37 |
38 | #if HAS_NATIVE_SPAN
39 |             // Slice the two parts without allocating intermediate strings.
40 |             return new UnicodeRationalNumber(
41 |                 long.Parse(s.AsSpan(0, fractionBarIndex), NumberStyles.Integer, CultureInfo.InvariantCulture),
42 |                 ushort.Parse(s.AsSpan(fractionBarIndex + 1), NumberStyles.Integer, CultureInfo.InvariantCulture));
43 | #else
44 |             return new UnicodeRationalNumber(
45 |                 long.Parse(s.Substring(0, fractionBarIndex), NumberStyles.Integer, CultureInfo.InvariantCulture),
46 |                 ushort.Parse(s.Substring(fractionBarIndex + 1), NumberStyles.Integer, CultureInfo.InvariantCulture));
47 | #endif
48 |         }
49 |
50 |         /// <summary>The numerator of the fraction.</summary>
51 |         public readonly long Numerator;
52 |         /// <summary>The denominator of the fraction.</summary>
53 |         public readonly ushort Denominator;
54 |
55 |         /// <summary>Initializes a new instance of the <see cref="UnicodeRationalNumber"/> structure that represents a signed integer.</summary>
56 |         /// <param name="number">The number which should be represented as a rational number.</param>
57 |         public UnicodeRationalNumber(long number)
58 |         {
59 |             Numerator = number;
60 |             Denominator = 1;
61 |         }
62 |
63 |         /// <summary>Initializes a new instance of the <see cref="UnicodeRationalNumber"/> structure that represents a fraction.</summary>
64 |         /// <param name="numerator">The number which should be used as numerator in the rational number.</param>
65 |         /// <param name="denominator">The number which should be used as denominator in the rational number.</param>
66 |         public UnicodeRationalNumber(long numerator, ushort denominator)
67 |         {
68 |             Numerator = numerator;
69 |             Denominator = denominator;
70 |         }
71 |
72 |         /// <summary>Gets a value indicating whether the current value is the default value of the type.</summary>
73 |         /// <remarks>The default value is an invalid fraction of 0/0.</remarks>
74 |         public bool IsDefaultValue => Numerator == 0 && Denominator == 0;
75 |
76 |         /// <summary>Creates a string representation of the current rational number.</summary>
77 |         /// <remarks>
78 |         /// The created representation is culture invariant, and will be parsable by the <see cref="Parse(string)"/> method,
79 |         /// except for the default value, which is rendered as an empty string.
80 |         /// </remarks>
81 |         /// <returns>An empty string for the default value, the numerator alone when the denominator is 1, and "numerator/denominator" otherwise.</returns>
82 |         public override string ToString()
83 |         {
84 |             if (IsDefaultValue)
85 |             {
86 |                 return string.Empty;
87 |             }
88 |
89 |             if (Denominator == 1)
90 |             {
91 |                 return Numerator.ToString(CultureInfo.InvariantCulture);
92 |             }
93 |
94 | #if HAS_NATIVE_SPAN
95 |             return FractionToString();
96 | #else
97 |             return Numerator.ToString(CultureInfo.InvariantCulture) + "/" + Denominator.ToString(CultureInfo.InvariantCulture);
98 | #endif
99 |         }
100 |
101 | #if HAS_NATIVE_SPAN
102 |         // Formats "numerator/denominator" into a stack buffer so that only the final string is allocated.
103 |         private string FractionToString()
104 |         {
105 |             // 20 chars for the longest long (sign included) + 1 for the fraction bar + 5 for the longest ushort.
106 |             Span<char> buffer = stackalloc char[26];
107 |
108 |             Numerator.TryFormat(buffer, out int length, "D", CultureInfo.InvariantCulture);
109 |             buffer[length++] = '/';
110 |             Denominator.TryFormat(buffer[length..], out int l, "D", CultureInfo.InvariantCulture);
111 |             length += l;
112 |
113 |             return buffer[..length].ToString();
114 |         }
115 | #endif
116 |
117 |         /// <summary>Determines whether the specified rational number is equal to the current value.</summary>
118 |         /// <param name="other">The other value to compare to the current one.</param>
119 |         /// <returns><see langword="true"/> if the two values are the same; <see langword="false"/> otherwise.</returns>
120 |         public bool Equals(UnicodeRationalNumber other)
121 |         {
122 |             // We don't consider 1/2 and 2/4 equal here, as that wouldn't be the same character.
123 |             return other.Numerator == Numerator && other.Denominator == Denominator;
124 |         }
125 |
126 |         /// <summary>Determines whether the specified object is equal to the current rational number.</summary>
127 |         /// <param name="obj">The object to compare to the current rational number.</param>
128 |         /// <returns><see langword="true"/> if the object represents the same rational number; <see langword="false"/> otherwise.</returns>
129 |         public override bool Equals(object obj)
130 |         {
131 |             // Typed comparison instead of the reflection-based ValueType.Equals.
132 |             return obj is UnicodeRationalNumber other && Equals(other);
133 |         }
134 |
135 |         /// <summary>Returns the hash code for the current rational number.</summary>
136 |         /// <returns>A 32-bit signed integer hash code.</returns>
137 |         public override int GetHashCode()
138 |         {
139 |             // The parentheses spell out the grouping C# already applies, since '^' binds tighter than '|'.
140 |             return (int)(Numerator << 8) | (Denominator ^ (byte)(Numerator >> 56));
141 |         }
142 |     }
143 | }
144 |
--------------------------------------------------------------------------------
/System.Unicode/UnihanCharacterData.Generated.cs:
--------------------------------------------------------------------------------
1 | namespace System.Unicode
2 | {
3 | partial struct UnihanCharacterData
4 | {
5 | // Densely packs code points of the CJK ranges listed below into consecutive integers, in a predicted order of importance (which may be wrong).
6 | // Its purpose is to avoid skipping numbers so that file encoding can be more efficient; any other code point raises ArgumentOutOfRangeException.
7 | internal static int PackCodePoint(int codePoint)
8 | {
9 | if (codePoint >= 0x3400)
10 | {
11 | // 3400..4DBF; CJK Unified Ideographs Extension A
12 | if (codePoint < 0x4DC0) return codePoint + 0x01E00;
13 | else if (codePoint >= 0x4E00)
14 | {
15 | // 4E00..9FFF; CJK Unified Ideographs
16 | if (codePoint < 0xA000) return codePoint - 0x04E00;
17 | else if (codePoint >= 0xF900)
18 | {
19 | // F900..FAFF; CJK Compatibility Ideographs
20 | if (codePoint < 0xFB00) return codePoint + 0x08240;
21 | else if (codePoint >= 0x20000)
22 | {
23 | // 20000..2A6DF; CJK Unified Ideographs Extension B
24 | if (codePoint < 0x2A6E0) return codePoint - 0x19440;
25 | else if (codePoint >= 0x2A700)
26 | {
27 | // 2A700..2B73F; CJK Unified Ideographs Extension C
28 | // 2B740..2B81F; CJK Unified Ideographs Extension D
29 | // 2B820..2CEAF; CJK Unified Ideographs Extension E
30 | // 2CEB0..2EBEF; CJK Unified Ideographs Extension F
31 | if (codePoint < 0x2EBF0) return codePoint - 0x19460;
32 | else if (codePoint >= 0x2F800)
33 | {
34 | // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
35 | if (codePoint < 0x2FA20) return codePoint - 0x17AC0;
36 | else if (codePoint >= 0x30000)
37 | {
38 | // 30000..3134F; CJK Unified Ideographs Extension G
39 | // 31350..323AF; CJK Unified Ideographs Extension H
40 | if (codePoint < 0x323B0) return codePoint - 0x1A870;
41 | }
42 | }
43 | }
44 | }
45 | }
46 | }
47 | }
48 |
49 | throw new ArgumentOutOfRangeException(nameof(codePoint)); // Reached for any code point below 0x3400, in a gap between ranges, or at/after 0x323B0.
50 | }
51 |
52 | // Reverses the packing done by the PackCodePoint method: each branch covers one packed range, tested in ascending packed order.
53 | internal static int UnpackCodePoint(int packedCodePoint)
54 | {
55 | if (packedCodePoint >= 0)
56 | {
57 | // 4E00..9FFF; CJK Unified Ideographs
58 | if (packedCodePoint < 0x05200) return packedCodePoint + 0x4E00;
59 | // 3400..4DBF; CJK Unified Ideographs Extension A
60 | else if (packedCodePoint < 0x06BC0) return packedCodePoint - 0x1E00;
61 | // 20000..2A6DF; CJK Unified Ideographs Extension B
62 | else if (packedCodePoint < 0x112A0) return packedCodePoint + 0x19440;
63 | // 2A700..2B73F; CJK Unified Ideographs Extension C
64 | // 2B740..2B81F; CJK Unified Ideographs Extension D
65 | // 2B820..2CEAF; CJK Unified Ideographs Extension E
66 | // 2CEB0..2EBEF; CJK Unified Ideographs Extension F
67 | else if (packedCodePoint < 0x15790) return packedCodePoint + 0x19460;
68 | // 30000..3134F; CJK Unified Ideographs Extension G
69 | // 31350..323AF; CJK Unified Ideographs Extension H
70 | else if (packedCodePoint < 0x17B40) return packedCodePoint + 0x1A870;
71 | // F900..FAFF; CJK Compatibility Ideographs
72 | else if (packedCodePoint < 0x17D40) return packedCodePoint - 0x8240;
73 | // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
74 | else if (packedCodePoint < 0x17F60) return packedCodePoint + 0x17AC0;
75 | }
76 | throw new ArgumentOutOfRangeException(nameof(packedCodePoint)); // Reached for negative values and for values at or above 0x17F60.
77 | }
78 | }
79 | }
80 |
81 |
--------------------------------------------------------------------------------
/System.Unicode/UnihanCharacterData.cs:
--------------------------------------------------------------------------------
1 | namespace System.Unicode
2 | {
3 |     /// <summary>Immutable bundle of the Unihan-related values held for one code point.</summary>
4 |     /// <remarks>Every field is set once by the constructor, which stores its arguments without validating or copying them.</remarks>
5 |     internal readonly partial struct UnihanCharacterData
6 |     {
7 |         public readonly int CodePoint;
8 |
9 |         // Numeric information: the kind of numeric property and its value.
10 |         public readonly UnihanNumericType NumericType;
11 |         public readonly long NumericValue;
12 |
13 |         // Radical / stroke count entries; the array reference is stored as given.
14 |         public readonly UnicodeRadicalStrokeCount[] UnicodeRadicalStrokeCounts;
15 |
16 |         public readonly string Definition;
17 |
18 |         // Readings.
19 |         public readonly string MandarinReading;
20 |         public readonly string CantoneseReading;
21 |         public readonly string JapaneseKunReading;
22 |         public readonly string JapaneseOnReading;
23 |         public readonly string KoreanReading;
24 |         public readonly string HangulReading;
25 |         public readonly string VietnameseReading;
26 |
27 |         // Variants.
28 |         public readonly string SimplifiedVariant;
29 |         public readonly string TraditionalVariant;
30 |
31 |         /// <summary>Initializes a new instance of the <see cref="UnihanCharacterData"/> struct, assigning each argument to the field of the same name.</summary>
32 |         internal UnihanCharacterData(
33 |             int codePoint,
34 |             UnihanNumericType numericType,
35 |             long numericValue,
36 |             UnicodeRadicalStrokeCount[] unicodeRadicalStrokeCounts,
37 |             string definition,
38 |             string mandarinReading,
39 |             string cantoneseReading,
40 |             string japaneseKunReading,
41 |             string japaneseOnReading,
42 |             string koreanReading,
43 |             string hangulReading,
44 |             string vietnameseReading,
45 |             string simplifiedVariant,
46 |             string traditionalVariant)
47 |         {
48 |             this.CodePoint = codePoint;
49 |
50 |             this.NumericType = numericType;
51 |             this.NumericValue = numericValue;
52 |
53 |             this.UnicodeRadicalStrokeCounts = unicodeRadicalStrokeCounts;
54 |
55 |             this.Definition = definition;
56 |
57 |             this.MandarinReading = mandarinReading;
58 |             this.CantoneseReading = cantoneseReading;
59 |             this.JapaneseKunReading = japaneseKunReading;
60 |             this.JapaneseOnReading = japaneseOnReading;
61 |             this.KoreanReading = koreanReading;
62 |             this.HangulReading = hangulReading;
63 |             this.VietnameseReading = vietnameseReading;
64 |
65 |             this.SimplifiedVariant = simplifiedVariant;
66 |             this.TraditionalVariant = traditionalVariant;
67 |         }
68 |     }
69 | }
70 |
--------------------------------------------------------------------------------
/System.Unicode/UnihanCharacterData.tt:
--------------------------------------------------------------------------------
1 | <#@ template debug="false" hostspecific="false" language="C#" #>
2 | <#@ assembly name="System.Core" #>
3 | <#@ import namespace="System.Linq" #>
4 | <#@ import namespace="System.Text" #>
5 | <#@ import namespace="System.Collections.Generic" #>
6 | <#@ output extension=".Generated.cs" #>
7 | <#
8 | // This template generates the code point packing and unpacking code for Unihan data (the PackCodePoint and UnpackCodePoint methods).
9 | // Since Unihan data covers pretty specific code point ranges, we can rebase those ranges closer to zero in order to get a better encoding in files.
10 | // The algorithm now generates a densely packed map, as opposed to the previous handwritten code.
11 |
12 | // Declare the blocks to pack and unpack in the arbitrarily chosen order; this order decides the packed index of each block.
13 | var blocks = new UnicodeBlockList
14 | {
15 | { 0x4E00, 0x9FFF, "CJK Unified Ideographs" },
16 | { 0x3400, 0x4DBF, "CJK Unified Ideographs Extension A" },
17 | { 0x20000, 0x2A6DF, "CJK Unified Ideographs Extension B" },
18 | { 0x2A700, 0x2B73F, "CJK Unified Ideographs Extension C" },
19 | { 0x2B740, 0x2B81F, "CJK Unified Ideographs Extension D" },
20 | { 0x2B820, 0x2CEAF, "CJK Unified Ideographs Extension E" },
21 | { 0x2CEB0, 0x2EBEF, "CJK Unified Ideographs Extension F" },
22 | { 0x30000, 0x3134F, "CJK Unified Ideographs Extension G" },
23 | { 0x31350, 0x323AF, "CJK Unified Ideographs Extension H" },
24 | { 0xF900, 0xFAFF, "CJK Compatibility Ideographs" },
25 | { 0x2F800, 0x2FA1F, "CJK Compatibility Ideographs Supplement" },
26 | };
27 |
28 | // Assign the (re)base(d) index for each block: blocks are laid out back to back, starting at zero, in declaration order.
29 | {
30 | int baseIndex = 0;
31 | foreach (var block in blocks)
32 | {
33 | block.RebasedStartIndex = baseIndex;
34 | baseIndex += block.CodePointCount;
35 | }
36 | }
37 |
38 | // Merge contiguous blocks together in order to avoid useless branches in the generated code.
39 | {
40 | int firstMergeIndex = -1;
41 | UnicodeBlock lastBlock = null;
42 | for (int i = 0; i < blocks.Count; i++)
43 | {
44 | var block = blocks[i];
45 | if (lastBlock is object) // i.e. not the first iteration: lastBlock is still null then.
46 | {
47 | bool isContiguous = block.FirstCodePoint - lastBlock.LastCodePoint == 1; // The block starts exactly one past the previous block's end.
48 |
49 | if (!isContiguous || i == blocks.Count - 1) // Close the current run when contiguity breaks, or when the last block has been reached.
50 | {
51 | int blockCount = i - firstMergeIndex + (isContiguous ? 1 : 0); // Length of the run starting at firstMergeIndex; the current block only counts when it is contiguous.
52 |
53 | if (blockCount > 1)
54 | {
55 | var mergedBlocks = blocks.GetRange(firstMergeIndex, blockCount).ToArray();
56 | blocks[firstMergeIndex] = new UnicodeBlock(mergedBlocks);
57 | blocks.RemoveRange(firstMergeIndex + 1, blockCount - 1);
58 | }
59 | firstMergeIndex = i -= blockCount - 1; // Step i back by the number of list entries a merge removes (zero when the run had a single block) and start the next run there.
60 | }
61 | }
62 | else
63 | {
64 | firstMergeIndex = i;
65 | }
66 | lastBlock = block;
67 | }
68 | }
69 |
70 | // Sort a copy of the blocks by first code point; the 'blocks' list itself keeps its packed order.
71 | var sortedBlocks = blocks.ToArray();
72 | Array.Sort(sortedBlocks, (a, b) => Comparer.Default.Compare(a.FirstCodePoint, b.FirstCodePoint)); // NOTE(review): "Comparer" presumably carried a generic argument (Comparer<int>) lost in extraction — confirm.
73 | #>
74 | namespace System.Unicode
75 | {
76 | partial struct UnihanCharacterData
77 | {
78 | // This method densely packs code points by predicted order of importance (it may be wrong)
79 | // Its purpose is to avoid skipping numbers so that file encoding can be more efficient.
80 | internal static int PackCodePoint(int codePoint)
81 | {
82 | <#
83 | {
84 | int lastCodePoint = -1;
85 | int indentCount = 0;
86 |
87 | foreach (var block in sortedBlocks)
88 | {
89 | bool isContiguous = block.FirstCodePoint - lastCodePoint <= 1; // True when the block begins no more than one past the last code point of the previously emitted block.
90 |
91 | if (!isContiguous)
92 | {
93 | #>
94 | <#= lastCodePoint >= 0 ? "else " : "" #>if (codePoint >= 0x<#= block.FirstCodePoint.ToString("X4") #>)
95 | {
96 | <#
97 | indentCount++;
98 | PushIndent("\t");
99 | }
100 |
101 | foreach (var mergedBlock in block.MergedBlocks)
102 | {
103 | #>
104 | // <#= mergedBlock #>
105 | <#
106 | }
107 |
108 | int offset = block.RebasedStartIndex - block.FirstCodePoint; // Amount added to a code point of this block to obtain its packed index.
109 | #>
110 | <#= isContiguous ? "else " : "" #>if (codePoint < 0x<#= (block.LastCodePoint + 1).ToString("X4") #>) return codePoint <#= offset < 0 ? "-" : "+" #> 0x<#= Math.Abs(offset).ToString("X5") #>;
111 | <#
112 |
113 | lastCodePoint = block.LastCodePoint;
114 | }
115 |
116 | while (indentCount-- > 0) // Emit one closing brace per opening brace written by the loop above.
117 | {
118 | PopIndent();
119 | #>
120 | }
121 | <#
122 | }
123 | }
124 | #>
125 |
126 | throw new ArgumentOutOfRangeException(nameof(codePoint));
127 | }
128 |
129 | // Reverses the packing done by the PackCodePoint method.
130 | internal static int UnpackCodePoint(int packedCodePoint)
131 | {
132 | if (packedCodePoint >= 0)
133 | {
134 | <#
135 | {
136 | foreach (var block in blocks) // Packed order, so each generated test only needs the upper bound of its packed range.
137 | {
138 | foreach (var mergedBlock in block.MergedBlocks)
139 | {
140 | #>
141 | // <#= mergedBlock #>
142 | <#
143 | }
144 |
145 | int offset = block.FirstCodePoint - block.RebasedStartIndex; // Amount added to a packed index of this block to recover the code point.
146 | #>
147 | <#= block.RebasedStartIndex > 0 ? "else " : "" #>if (packedCodePoint < 0x<#= (block.RebasedStartIndex + block.CodePointCount).ToString("X5") #>) return packedCodePoint <#= offset < 0 ? "-" : "+" #> 0x<#= Math.Abs(offset).ToString("X4") #>;
148 | <#
149 | }
150 | }
151 | #>
152 | }
153 | throw new ArgumentOutOfRangeException(nameof(packedCodePoint));
154 | }
155 | }
156 | }
157 |
158 | <#+
159 | class UnicodeBlock // A code point range together with its packed start index; merged ranges keep the list of their source blocks.
160 | {
161 | public int FirstCodePoint { get; }
162 | public int LastCodePoint { get; } // Inclusive.
163 | public string Name { get; }
164 | public UnicodeBlock[] MergedBlocks { get; } // The block itself for a plain block, the source blocks for a merged one.
165 | public int CodePointCount => LastCodePoint - FirstCodePoint + 1;
166 | public int RebasedStartIndex { get; set; } // Packed index assigned to FirstCodePoint.
167 |
168 | public UnicodeBlock(int firstCodePoint, int lastCodePoint, string name)
169 | => (FirstCodePoint, LastCodePoint, Name, MergedBlocks) = (firstCodePoint, lastCodePoint, name, new[] { this });
170 |
171 | public UnicodeBlock(UnicodeBlock[] mergedBlocks) // Spans from the first merged block's start to the last one's end, and inherits the first block's packed start index.
172 | => (FirstCodePoint, LastCodePoint, Name, RebasedStartIndex, MergedBlocks) = (mergedBlocks[0].FirstCodePoint, mergedBlocks[mergedBlocks.Length - 1].LastCodePoint, "MERGED Block", mergedBlocks[0].RebasedStartIndex, mergedBlocks);
173 |
174 | public override string ToString()
175 | => $"{FirstCodePoint:X4}..{LastCodePoint:X4}; {Name}";
176 | }
177 |
178 | class UnicodeBlockList : List // NOTE(review): presumably List<UnicodeBlock>, with the generic argument lost in extraction — confirm.
179 | {
180 | public void Add(int firstCodePoint, int lastCodePoint, string name) // Enables the three-value collection initializer syntax used for 'blocks'.
181 | => Add(new UnicodeBlock(firstCodePoint, lastCodePoint, name));
182 | }
183 | #>
184 |
--------------------------------------------------------------------------------
/System.Unicode/UnihanFields.cs:
--------------------------------------------------------------------------------
1 | namespace System.Unicode
2 | {
3 | [Flags]
4 | internal enum UnihanFields : ushort
5 | {
6 | // NumericType / NumericValue : Not exactly a bit mask here… the two lowest bits hold one of three values (1, 2 or 3).
7 | PrimaryNumeric = 1,
8 | AccountingNumeric = 2,
9 | OtherNumeric = 3,
10 |
11 | // UnicodeRadicalStroke : Not exactly a bit mask… the next two bits hold a count indicator (4, 8 or 12).
12 | UnicodeRadicalStrokeCount = 4, // Will indicate exactly one value for Unicode_Radical_Stroke.
13 | UnicodeRadicalStrokeCountTwice = 8, // Will indicate exactly two values for Unicode_Radical_Stroke.
14 | UnicodeRadicalStrokeCountMore = 12, // Will indicate three or more values for Unicode_Radical_Stroke. This combination should never happen in the current files.
15 |
16 | Definition = 16, // From here on, each member is a single independent bit.
17 | MandarinReading = 32,
18 | CantoneseReading = 64,
19 | JapaneseKunReading = 128,
20 | JapaneseOnReading = 256,
21 | KoreanReading = 512,
22 | HangulReading = 1024,
23 | VietnameseReading = 2048,
24 |
25 | SimplifiedVariant = 4096,
26 | TraditionalVariant = 8192,
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/System.Unicode/UnihanNumericType.cs:
--------------------------------------------------------------------------------
1 | namespace System.Unicode
2 | {
3 | /// <summary>Represents the different numeric types from the Unihan database.</summary>
4 | public enum UnihanNumericType : byte
5 | {
6 | /// <summary>Indicates that there is no Unihan numeric property defined for the code point.</summary>
7 | None = 0,
8 | /// <summary>Indicates that the property kPrimaryNumeric is defined for this code point.</summary>
9 | /// <remarks>The kPrimaryNumeric property is used for ideographs which are standard numerals.</remarks>
10 | [ValueName("kPrimaryNumeric")]
11 | Primary = 1,
12 | /// <summary>Indicates that the property kAccountingNumeric is defined for this code point.</summary>
13 | /// <remarks>The kAccountingNumeric property is used for ideographs used as accounting numerals.</remarks>
14 | [ValueName("kAccountingNumeric")]
15 | Accounting = 2,
16 | /// <summary>Indicates that the property kOtherNumeric is defined for this code point.</summary>
17 | /// <remarks>The kOtherNumeric property is used for ideographs which are used as numerals in non-common contexts.</remarks>
18 | [ValueName("kOtherNumeric")]
19 | Other = 3,
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/System.Unicode/ValueNameAttribute.cs:
--------------------------------------------------------------------------------
1 | namespace System.Unicode
2 | {
3 |     /// <summary>Attaches a name to a field or property.</summary>
4 |     /// <remarks>
5 |     /// Since this project tries to stick to the .NET Framework naming conventions, this attribute may be used to indicate standard property names and values names where applicable.
6 |     /// It may also be of use when aliases are available for a given property or value, as the attribute can be applied several times to the same member.
7 |     /// </remarks>
8 |     [AttributeUsage(AttributeTargets.Field | AttributeTargets.Property, AllowMultiple = true)]
9 |     public sealed class ValueNameAttribute : Attribute
10 |     {
11 |         /// <summary>Initializes an instance of the <see cref="ValueNameAttribute"/> class.</summary>
12 |         /// <param name="name">The name given to the property or value on which this attribute is to be applied; stored as is.</param>
13 |         public ValueNameAttribute(string name)
14 |         {
15 |             Name = name;
16 |         }
17 |
18 |         /// <summary>Gets the name given to the property or value.</summary>
19 |         public string Name { get; }
20 |     }
21 | }
22 |
--------------------------------------------------------------------------------
/System.Unicode/packageIcon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hexawyz/NetUnicodeInfo/c2ab5227094d8f934b34fe6d3186c7f1e2be5e74/System.Unicode/packageIcon.png
--------------------------------------------------------------------------------
/System.Unicode/ucd.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hexawyz/NetUnicodeInfo/c2ab5227094d8f934b34fe6d3186c7f1e2be5e74/System.Unicode/ucd.dat
--------------------------------------------------------------------------------
/UnicodeCharacterInspector/UnicodeCharacterInspector.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hexawyz/NetUnicodeInfo/c2ab5227094d8f934b34fe6d3186c7f1e2be5e74/UnicodeCharacterInspector/UnicodeCharacterInspector.ico
--------------------------------------------------------------------------------
/UnicodeCharacterInspector/UnicodeCharacterInspector.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
67 |
--------------------------------------------------------------------------------
/UnicodeInformation.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 16
4 | VisualStudioVersion = 16.0.28721.148
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "System.Unicode", "System.Unicode\System.Unicode.csproj", "{CB722958-A1C4-4121-804B-7D5A671491B1}"
7 | EndProject
8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "System.Unicode.Tests", "System.Unicode.Tests\System.Unicode.Tests.csproj", "{50337426-E884-4394-9E1A-F6F7A555F5D9}"
9 | EndProject
10 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "System.Unicode.Build.Tasks", "System.Unicode.Build.Tasks\System.Unicode.Build.Tasks.csproj", "{8DFDEE6C-4F0D-4DE1-B346-574CB56D2B8B}"
11 | EndProject
12 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{73097DF3-04B7-4C5F-B4EA-0EB800E40702}"
13 | ProjectSection(SolutionItems) = preProject
14 | .editorconfig = .editorconfig
15 | azure-pipelines.yml = azure-pipelines.yml
16 | Directory.Build.props = Directory.Build.props
17 | Example.cs = Example.cs
18 | Icon.ico = Icon.ico
19 | Icon.svg = Icon.svg
20 | LICENSE.txt = LICENSE.txt
21 | README.md = README.md
22 | System.Unicode.snk = System.Unicode.snk
23 | UnicodeVersion.txt = UnicodeVersion.txt
24 | EndProjectSection
25 | EndProject
26 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "System.Unicode.Build.Core", "System.Unicode.Build.Core\System.Unicode.Build.Core.csproj", "{A872B696-86A2-4B74-9878-08CD4742338A}"
27 | EndProject
28 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "System.Unicode.Build.DatabaseGenerator", "System.Unicode.Build.DatabaseGenerator\System.Unicode.Build.DatabaseGenerator.csproj", "{723A80B0-34A9-44BA-BB2C-B6921FEEDD56}"
29 | EndProject
30 | Global
31 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
32 | Debug|Any CPU = Debug|Any CPU
33 | Release|Any CPU = Release|Any CPU
34 | EndGlobalSection
35 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
36 | {CB722958-A1C4-4121-804B-7D5A671491B1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
37 | {CB722958-A1C4-4121-804B-7D5A671491B1}.Debug|Any CPU.Build.0 = Debug|Any CPU
38 | {CB722958-A1C4-4121-804B-7D5A671491B1}.Release|Any CPU.ActiveCfg = Release|Any CPU
39 | {CB722958-A1C4-4121-804B-7D5A671491B1}.Release|Any CPU.Build.0 = Release|Any CPU
40 | {50337426-E884-4394-9E1A-F6F7A555F5D9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
41 | {50337426-E884-4394-9E1A-F6F7A555F5D9}.Debug|Any CPU.Build.0 = Debug|Any CPU
42 | {50337426-E884-4394-9E1A-F6F7A555F5D9}.Release|Any CPU.ActiveCfg = Release|Any CPU
43 | {50337426-E884-4394-9E1A-F6F7A555F5D9}.Release|Any CPU.Build.0 = Release|Any CPU
44 | {8DFDEE6C-4F0D-4DE1-B346-574CB56D2B8B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
45 | {8DFDEE6C-4F0D-4DE1-B346-574CB56D2B8B}.Debug|Any CPU.Build.0 = Debug|Any CPU
46 | {8DFDEE6C-4F0D-4DE1-B346-574CB56D2B8B}.Release|Any CPU.ActiveCfg = Release|Any CPU
47 | {8DFDEE6C-4F0D-4DE1-B346-574CB56D2B8B}.Release|Any CPU.Build.0 = Release|Any CPU
48 | {A872B696-86A2-4B74-9878-08CD4742338A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
49 | {A872B696-86A2-4B74-9878-08CD4742338A}.Debug|Any CPU.Build.0 = Debug|Any CPU
50 | {A872B696-86A2-4B74-9878-08CD4742338A}.Release|Any CPU.ActiveCfg = Release|Any CPU
51 | {A872B696-86A2-4B74-9878-08CD4742338A}.Release|Any CPU.Build.0 = Release|Any CPU
52 | {723A80B0-34A9-44BA-BB2C-B6921FEEDD56}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
53 | {723A80B0-34A9-44BA-BB2C-B6921FEEDD56}.Debug|Any CPU.Build.0 = Debug|Any CPU
54 | {723A80B0-34A9-44BA-BB2C-B6921FEEDD56}.Release|Any CPU.ActiveCfg = Release|Any CPU
55 | {723A80B0-34A9-44BA-BB2C-B6921FEEDD56}.Release|Any CPU.Build.0 = Release|Any CPU
56 | EndGlobalSection
57 | GlobalSection(SolutionProperties) = preSolution
58 | HideSolutionNode = FALSE
59 | EndGlobalSection
60 | GlobalSection(ExtensibilityGlobals) = postSolution
61 | SolutionGuid = {B155A7AA-DB01-4F49-8985-33AC25BC4B98}
62 | EndGlobalSection
63 | EndGlobal
64 |
--------------------------------------------------------------------------------
/UnicodeVersion.txt:
--------------------------------------------------------------------------------
1 | 15.0.0
2 |
--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
1 | # .NET Desktop
2 | # Build and run tests for .NET Desktop or Windows classic desktop solutions.
3 | # Add steps that publish symbols, save build artifacts, and more:
4 | # https://docs.microsoft.com/azure/devops/pipelines/apps/windows/dot-net
5 |
6 | trigger:
7 | - master
8 |
9 | pool:
10 | vmImage: 'windows-latest'
11 |
12 | variables:
13 | solution: '**/*.sln'
14 | buildPlatform: 'Any CPU'
15 | buildConfiguration: 'Release'
16 |
17 | steps:
18 | - task: NuGetToolInstaller@1
19 |
20 | - task: NuGetCommand@2
21 | inputs:
22 | restoreSolution: '$(solution)'
23 |
24 | - task: VSBuild@1
25 | inputs:
26 | solution: '$(solution)'
27 | platform: '$(buildPlatform)'
28 | configuration: '$(buildConfiguration)'
29 | msbuildArgs: '/p:ContinuousIntegrationBuild=true'
30 |
31 | - task: VSTest@2
32 | inputs:
33 | platform: '$(buildPlatform)'
34 | configuration: '$(buildConfiguration)'
35 | testSelector: 'testAssemblies'
36 | testAssemblyVer2: |
37 | **\*.Tests.dll
38 | !**\obj\**
39 | !**\ref\**
40 | searchFolder: '$(System.DefaultWorkingDirectory)'
41 | runOnlyImpactedTests: false
42 |
43 | - task: CopyFiles@2
44 | inputs:
45 | SourceFolder: '$(Build.SourcesDirectory)'
46 | Contents: 'System.Unicode/bin/$(buildConfiguration)/?(*.nupkg|*.snupkg)'
47 | TargetFolder: '$(Build.ArtifactStagingDirectory)'
48 |
49 | - task: PublishBuildArtifacts@1
50 | inputs:
51 | PathtoPublish: '$(Build.ArtifactStagingDirectory)'
52 | ArtifactName: 'NuGet'
53 | publishLocation: 'Container'
54 |
--------------------------------------------------------------------------------