├── .gitignore ├── .vs └── AsoSoftLibrary │ └── FileContentIndex │ └── read.lock ├── AsoSoft-logo.png ├── AsoSoftLibrary - Backup.csproj ├── AsoSoftLibrary.csproj ├── AsoSoftLibrary.csproj.user ├── AsoSoftLibrary.sln ├── G2P.cs ├── Normalize.cs ├── Number2Word.cs ├── PoemClassifier.cs ├── README.md ├── Sort.cs ├── Transliteration.cs ├── resFiles.Designer.cs ├── resFiles.resx └── resources ├── G2PCertain.csv ├── G2PExceptions.csv ├── NormalizeUnicodeAdditional.csv ├── NormalizeUnicodeDeep.csv ├── Phoneme2Ascii.csv ├── Phoneme2IPA.csv └── PoemPatterns.csv /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Build results 17 | [Dd]ebug/ 18 | [Dd]ebugPublic/ 19 | [Rr]elease/ 20 | [Rr]eleases/ 21 | x64/ 22 | x86/ 23 | [Aa][Rr][Mm]/ 24 | [Aa][Rr][Mm]64/ 25 | bld/ 26 | [Bb]in/ 27 | [Oo]bj/ 28 | [Ll]og/ 29 | 30 | # Visual Studio 2015/2017 cache/options directory 31 | .vs/ 32 | # Uncomment if you have tasks that create the project's static files in wwwroot 33 | #wwwroot/ 34 | 35 | # Visual Studio 2017 auto generated files 36 | Generated\ Files/ 37 | 38 | # MSTest test Results 39 | [Tt]est[Rr]esult*/ 40 | [Bb]uild[Ll]og.* 41 | 42 | # NUNIT 43 | *.VisualState.xml 44 | TestResult.xml 45 | 46 | # Build Results of an ATL Project 47 | [Dd]ebugPS/ 48 | [Rr]eleasePS/ 49 | dlldata.c 50 | 51 | # Benchmark Results 52 | BenchmarkDotNet.Artifacts/ 53 | 54 | # .NET Core 55 | project.lock.json 56 | project.fragment.lock.json 57 | artifacts/ 58 | 59 | # StyleCop 60 | StyleCopReport.xml 61 | 62 | # Files built by Visual Studio 63 | *_i.c 64 
| *_p.c 65 | *_h.h 66 | *.ilk 67 | *.meta 68 | *.obj 69 | *.iobj 70 | *.pch 71 | *.pdb 72 | *.ipdb 73 | *.pgc 74 | *.pgd 75 | *.rsp 76 | *.sbr 77 | *.tlb 78 | *.tli 79 | *.tlh 80 | *.tmp 81 | *.tmp_proj 82 | *_wpftmp.csproj 83 | *.log 84 | *.vspscc 85 | *.vssscc 86 | .builds 87 | *.pidb 88 | *.svclog 89 | *.scc 90 | 91 | # Chutzpah Test files 92 | _Chutzpah* 93 | 94 | # Visual C++ cache files 95 | ipch/ 96 | *.aps 97 | *.ncb 98 | *.opendb 99 | *.opensdf 100 | *.sdf 101 | *.cachefile 102 | *.VC.db 103 | *.VC.VC.opendb 104 | 105 | # Visual Studio profiler 106 | *.psess 107 | *.vsp 108 | *.vspx 109 | *.sap 110 | 111 | # Visual Studio Trace Files 112 | *.e2e 113 | 114 | # TFS 2012 Local Workspace 115 | $tf/ 116 | 117 | # Guidance Automation Toolkit 118 | *.gpState 119 | 120 | # ReSharper is a .NET coding add-in 121 | _ReSharper*/ 122 | *.[Rr]e[Ss]harper 123 | *.DotSettings.user 124 | 125 | # JustCode is a .NET coding add-in 126 | .JustCode 127 | 128 | # TeamCity is a build add-in 129 | _TeamCity* 130 | 131 | # DotCover is a Code Coverage Tool 132 | *.dotCover 133 | 134 | # AxoCover is a Code Coverage Tool 135 | .axoCover/* 136 | !.axoCover/settings.json 137 | 138 | # Visual Studio code coverage results 139 | *.coverage 140 | *.coveragexml 141 | 142 | # NCrunch 143 | _NCrunch_* 144 | .*crunch*.local.xml 145 | nCrunchTemp_* 146 | 147 | # MightyMoose 148 | *.mm.* 149 | AutoTest.Net/ 150 | 151 | # Web workbench (sass) 152 | .sass-cache/ 153 | 154 | # Installshield output folder 155 | [Ee]xpress/ 156 | 157 | # DocProject is a documentation generator add-in 158 | DocProject/buildhelp/ 159 | DocProject/Help/*.HxT 160 | DocProject/Help/*.HxC 161 | DocProject/Help/*.hhc 162 | DocProject/Help/*.hhk 163 | DocProject/Help/*.hhp 164 | DocProject/Help/Html2 165 | DocProject/Help/html 166 | 167 | # Click-Once directory 168 | publish/ 169 | 170 | # Publish Web Output 171 | *.[Pp]ublish.xml 172 | *.azurePubxml 173 | # Note: Comment the next line if you want to checkin your web deploy 
settings, 174 | # but database connection strings (with potential passwords) will be unencrypted 175 | *.pubxml 176 | *.publishproj 177 | 178 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 179 | # checkin your Azure Web App publish settings, but sensitive information contained 180 | # in these scripts will be unencrypted 181 | PublishScripts/ 182 | 183 | # NuGet Packages 184 | *.nupkg 185 | # The packages folder can be ignored because of Package Restore 186 | **/[Pp]ackages/* 187 | # except build/, which is used as an MSBuild target. 188 | !**/[Pp]ackages/build/ 189 | # Uncomment if necessary however generally it will be regenerated when needed 190 | #!**/[Pp]ackages/repositories.config 191 | # NuGet v3's project.json files produces more ignorable files 192 | *.nuget.props 193 | *.nuget.targets 194 | 195 | # Microsoft Azure Build Output 196 | csx/ 197 | *.build.csdef 198 | 199 | # Microsoft Azure Emulator 200 | ecf/ 201 | rcf/ 202 | 203 | # Windows Store app package directories and files 204 | AppPackages/ 205 | BundleArtifacts/ 206 | Package.StoreAssociation.xml 207 | _pkginfo.txt 208 | *.appx 209 | 210 | # Visual Studio cache files 211 | # files ending in .cache can be ignored 212 | *.[Cc]ache 213 | # but keep track of directories ending in .cache 214 | !?*.[Cc]ache/ 215 | 216 | # Others 217 | ClientBin/ 218 | ~$* 219 | *~ 220 | *.dbmdl 221 | *.dbproj.schemaview 222 | *.jfm 223 | *.pfx 224 | *.publishsettings 225 | orleans.codegen.cs 226 | 227 | # Including strong name files can present a security risk 228 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 229 | #*.snk 230 | 231 | # Since there are multiple workflows, uncomment next line to ignore bower_components 232 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 233 | #bower_components/ 234 | 235 | # RIA/Silverlight projects 236 | Generated_Code/ 237 | 238 | # Backup & report files from converting an old project file 239 | # to a 
newer Visual Studio version. Backup files are not needed, 240 | # because we have git ;-) 241 | _UpgradeReport_Files/ 242 | Backup*/ 243 | UpgradeLog*.XML 244 | UpgradeLog*.htm 245 | ServiceFabricBackup/ 246 | *.rptproj.bak 247 | 248 | # SQL Server files 249 | *.mdf 250 | *.ldf 251 | *.ndf 252 | 253 | # Business Intelligence projects 254 | *.rdl.data 255 | *.bim.layout 256 | *.bim_*.settings 257 | *.rptproj.rsuser 258 | *- Backup*.rdl 259 | 260 | # Microsoft Fakes 261 | FakesAssemblies/ 262 | 263 | # GhostDoc plugin setting file 264 | *.GhostDoc.xml 265 | 266 | # Node.js Tools for Visual Studio 267 | .ntvs_analysis.dat 268 | node_modules/ 269 | 270 | # Visual Studio 6 build log 271 | *.plg 272 | 273 | # Visual Studio 6 workspace options file 274 | *.opt 275 | 276 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 277 | *.vbw 278 | 279 | # Visual Studio LightSwitch build output 280 | **/*.HTMLClient/GeneratedArtifacts 281 | **/*.DesktopClient/GeneratedArtifacts 282 | **/*.DesktopClient/ModelManifest.xml 283 | **/*.Server/GeneratedArtifacts 284 | **/*.Server/ModelManifest.xml 285 | _Pvt_Extensions 286 | 287 | # Paket dependency manager 288 | .paket/paket.exe 289 | paket-files/ 290 | 291 | # FAKE - F# Make 292 | .fake/ 293 | 294 | # JetBrains Rider 295 | .idea/ 296 | *.sln.iml 297 | 298 | # CodeRush personal settings 299 | .cr/personal 300 | 301 | # Python Tools for Visual Studio (PTVS) 302 | __pycache__/ 303 | *.pyc 304 | 305 | # Cake - Uncomment if you are using it 306 | # tools/** 307 | # !tools/packages.config 308 | 309 | # Tabs Studio 310 | *.tss 311 | 312 | # Telerik's JustMock configuration file 313 | *.jmconfig 314 | 315 | # BizTalk build output 316 | *.btp.cs 317 | *.btm.cs 318 | *.odx.cs 319 | *.xsd.cs 320 | 321 | # OpenCover UI analysis results 322 | OpenCover/ 323 | 324 | # Azure Stream Analytics local run output 325 | ASALocalRun/ 326 | 327 | # MSBuild Binary and Structured Log 328 | *.binlog 329 | 330 | # NVidia 
Nsight GPU debugger configuration file 331 | *.nvuser 332 | 333 | # MFractors (Xamarin productivity tool) working folder 334 | .mfractor/ 335 | 336 | # Local History for Visual Studio 337 | .localhistory/ 338 | 339 | # BeatPulse healthcheck temp database 340 | healthchecksdb -------------------------------------------------------------------------------- /.vs/AsoSoftLibrary/FileContentIndex/read.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AsoSoft/AsoSoft-Library/f69d510e0a180c40511145691e8af1deb305aee5/.vs/AsoSoftLibrary/FileContentIndex/read.lock -------------------------------------------------------------------------------- /AsoSoft-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AsoSoft/AsoSoft-Library/f69d510e0a180c40511145691e8af1deb305aee5/AsoSoft-logo.png -------------------------------------------------------------------------------- /AsoSoftLibrary - Backup.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | netcoreapp3.1 5 | AsoSoft 6 | AsoSoft Class Library 7 | Aso Mahmudi 8 | AsoSoft Class Library offers basic natural language processing (NLP) algorithms for the Kurdish Language (ckb: Central branch of Kurdish). 9 | MIT 10 | https://github.com/AsoSoft/AsoSoft-Library 11 | AsoSoft-logo.png 12 | kurdish normalization natural-language-processing 13 | AsoSoft Library for the Kurdish language processing (ckb: Central branch of Kurdish). 
14 | Normalizer and Numeral Converter classes 15 | https://github.com/AsoSoft/AsoSoft-Library 16 | 2.0.1 17 | ReadMe.md 18 | True 19 | True 20 | True 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | True 50 | \ 51 | 52 | 53 | True 54 | \ 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /AsoSoftLibrary.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | netcoreapp3.1 5 | AsoSoft 6 | AsoSoft Class Library 7 | Aso Mahmudi 8 | AsoSoft Class Library offers basic natural language processing (NLP) algorithms for the Kurdish Language (ckb: Central branch of Kurdish). 9 | MIT 10 | https://github.com/AsoSoft/AsoSoft-Library 11 | AsoSoft-logo.png 12 | kurdish normalization natural-language-processing 13 | AsoSoft Library for the Kurdish language processing (ckb: Central branch of Kurdish). 14 | Normalizer and Numeral Converter classes 15 | https://github.com/AsoSoft/AsoSoft-Library 16 | 2.1.3 17 | README.md 18 | True 19 | True 20 | True 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | True 35 | True 36 | resFiles.resx 37 | 38 | 39 | 40 | 41 | 42 | PublicResXFileCodeGenerator 43 | resFiles.Designer.cs 44 | 45 | 46 | 47 | 48 | 49 | True 50 | \ 51 | 52 | 53 | True 54 | \ 55 | Always 56 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /AsoSoftLibrary.csproj.user: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | <_LastSelectedProfileId>D:\DEV\AsoSoftLibrary\Properties\PublishProfiles\FolderProfile.pubxml 5 | 6 | 7 | 8 | Designer 9 | 10 | 11 | -------------------------------------------------------------------------------- /AsoSoftLibrary.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio 
Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.2.32616.157 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AsoSoftLibrary", "AsoSoftLibrary.csproj", "{69039AA0-A7AD-4F12-B1B9-13263A9DC47F}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Any CPU = Debug|Any CPU 11 | Release|Any CPU = Release|Any CPU 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {69039AA0-A7AD-4F12-B1B9-13263A9DC47F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 15 | {69039AA0-A7AD-4F12-B1B9-13263A9DC47F}.Debug|Any CPU.Build.0 = Debug|Any CPU 16 | {69039AA0-A7AD-4F12-B1B9-13263A9DC47F}.Release|Any CPU.ActiveCfg = Release|Any CPU 17 | {69039AA0-A7AD-4F12-B1B9-13263A9DC47F}.Release|Any CPU.Build.0 = Release|Any CPU 18 | EndGlobalSection 19 | GlobalSection(SolutionProperties) = preSolution 20 | HideSolutionNode = FALSE 21 | EndGlobalSection 22 | GlobalSection(ExtensibilityGlobals) = postSolution 23 | SolutionGuid = {95A3F2E7-0611-4D99-8A85-055D3FE5E265} 24 | EndGlobalSection 25 | EndGlobal 26 | -------------------------------------------------------------------------------- /G2P.cs: -------------------------------------------------------------------------------- 1 | // Automated Grapheme-to-Phoneme Conversion for Central Kurdish based on Optimality Theory 2 | // Copyright (C) 2019 Aso Mahmudi, Hadi Veisi 3 | // Maintainer: Aso Mahmudi (aso.mehmudi@gmail.com) 4 | // Demo: https://asosoft.github.io/g2p/ 5 | // Source Code: https://github.com/AsoSoft/AsoSoft-Library 6 | // Test-set: https://github.com/AsoSoft/Kurdish-G2P-dataset 7 | // Paper: https://www.sciencedirect.com/science/article/abs/pii/S0885230821000292 8 | // Cite: 9 | // @article{mahmudi2021automated, 10 | // title={Automated grapheme-to-phoneme conversion for Central Kurdish based on optimality theory}, 11 | // author={Mahmudi, Aso and Veisi, Hadi}, 
12 | // journal={Computer Speech \& Language}, 13 | // volume={70}, 14 | // pages={101222}, 15 | // year={2021}, 16 | // publisher={Elsevier} 17 | // } 18 | 19 | using System.Collections.Generic; 20 | using System.Linq; 21 | using System.Text; 22 | using System.Text.RegularExpressions; 23 | 24 | namespace AsoSoftLibrary 25 | { 26 | public static partial class AsoSoft 27 | { 28 | private static Dictionary History = new Dictionary(); 29 | 30 | /// Converts Central Kurdish text in standard Arabic script into syllabified phonemic Latin script (i.e. graphemes to phonems) 31 | public static string G2P(string text, 32 | bool convertNumbersToWord = false, 33 | bool backMergeConjunction = true, 34 | bool singleOutputPerWord = true) 35 | { 36 | var sb = new StringBuilder(); 37 | text = UnifyNumerals(text, "en"); 38 | if (convertNumbersToWord) 39 | text = Number2Word(text); 40 | 41 | text = g2pNormalize(text.Trim()); 42 | // 43 | var ku = "ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنوۆەهیێ" + "ۋۉۊڎڴݵݸ"; 44 | var wordss = Regex.Matches(text, "([" + ku + "]+|[^" + ku + "]+)"); 45 | for (int i = 0; i < wordss.Count; i++) 46 | { 47 | var word = wordss[i].Value; 48 | if (Regex.IsMatch(word, "[" + ku + "]") && word != "و") 49 | sb.Append(WordG2P(Regex.Replace(word, "[^" + ku + "]+", ""), singleOutputPerWord)); 50 | else 51 | sb.Append(word); 52 | } 53 | var output = sb.ToString(); 54 | 55 | // conjunction و 56 | output = Regex.Replace(output, "(^|[?!.] 
?)" + "و", "$1ˈwe"); 57 | if (!backMergeConjunction) 58 | output = Regex.Replace(output, "و", "û"); 59 | else 60 | { 61 | // if there are candidates preceeding conjunction (e.g ˈbîst¶ˈbîˈsit و) 62 | 63 | output = Regex.Replace(output, "(\\w+)¶(\\w+)¶(\\w+) و" 64 | , "$1 و¶$2 و¶$3 و"); 65 | output = Regex.Replace(output, "(\\w+)¶(\\w+) و" 66 | , "$1 و¶$2 و"); 67 | 68 | // ('bi'ra + w => bi'raw) 69 | output = Regex.Replace(output, "([aeêouûiî]) و", "$1w"); 70 | // ('be'fir + û => 'bef'rû) 71 | output = Regex.Replace(output, "(?<=\\w)ˈ([^aeêouûiî])i([^aeêouûiî]) و", "$1ˈ$2û"); 72 | // ('ser + û => 'se'rû) 73 | // ('sard + û => 'sar'dû) 74 | // ('min + û => 'mi'nû) 75 | // ('bi'gir + û => 'bi'gi'rû) 76 | // ('gir'tin + û => 'gir'ti'nû) 77 | output = Regex.Replace(output, "([^aeêouûiî]) و", "ˈ$1û"); 78 | // if conjunction makes candidates the same (e.g ˈbîsˈtû¶ˈbîsˈtû) 79 | output = Regex.Replace(output, "(?\\w+)¶\\k(?=\\s|$)", "$1"); 80 | } 81 | return output.TrimEnd(); 82 | } 83 | 84 | 85 | // chooses the best candidates for the word 86 | private static string Evaluator(string gr, List Candidates) 87 | { 88 | var Output = new List(); 89 | var evaluatedCandidates = EVAL(Candidates); 90 | if (evaluatedCandidates.Count() > 0) 91 | { 92 | var LowestPenalt = evaluatedCandidates.First().Value; 93 | foreach (var item in evaluatedCandidates) 94 | if (item.Value < LowestPenalt + 5) 95 | Output.Add(item.Key); 96 | } 97 | return (Output.Count() == 0) ? 
gr : string.Join('¶', Output); 98 | } 99 | 100 | // Normalizion 101 | private static string g2pNormalize(string text) 102 | { 103 | var s = new string[] 104 | { 105 | " +", " " , 106 | "دٚ", "ڎ", 107 | "گٚ", "ڴ", 108 | @"(^|\s)چ بکە", "$1چبکە", 109 | "َ", "ە", // فتحه 110 | "ِ", "ی", // کسره 111 | "ُ", "و", // ضمه 112 | "ء", "ئ", // Hamza 113 | "أ", "ئە", 114 | "إ", "ئی", 115 | "آ", "ئا", 116 | "ظ|ذ|ض", "ز", 117 | "ص|ث", "س", 118 | "ط", "ت", 119 | "ك", "ک", 120 | "ي|ى", "ی", 121 | "ه‌", "ە", 122 | "ھ", "ه", 123 | "ـ", "", // tatweel 124 | "؟", "?", 125 | "،", ",", 126 | "؛", ";", 127 | "\r", "", 128 | }; 129 | for (int i = 0; i < s.Length; i += 2) 130 | text = Regex.Replace(text, s[i], s[i + 1]); 131 | return text; 132 | } 133 | 134 | private static string WordG2P(string gr, bool SingleOutputPerWord) 135 | { 136 | // Check history for speed up 137 | if (!History.ContainsKey(gr)) 138 | History.Add(gr, Evaluator(gr, Generator(gr))); 139 | return SingleOutputPerWord ? History[gr].Split('¶')[0] : History[gr]; 140 | } 141 | 142 | // GEN: generates all possible candidates: 143 | // e.g. 
بوون => bûn, buwn, bwun 144 | private static List Generator(string gr) 145 | { 146 | // Converting exceptional words 147 | var G2PExceptions = resFiles.G2PExceptions.Split('\n'); 148 | for (int i = 1; i < G2PExceptions.Length; i++) 149 | { 150 | var item = G2PExceptions[i].Split(','); 151 | gr = Regex.Replace(gr, item[0], item[1]); 152 | } 153 | 154 | // Converting certain characters 155 | var G2PCertain = resFiles.G2PCertain.Split('\n'); 156 | for (int i = 1; i < G2PCertain.Length; i++) 157 | { 158 | var item = G2PCertain[i].Split(','); 159 | gr = Regex.Replace(gr, item[0], item[1]); 160 | } 161 | 162 | // Uncertainty in "و" and "ی" 163 | var CandList1 = new List { "" }; 164 | while (gr.Length > 0) 165 | { 166 | var temp = new List(); 167 | if (Regex.IsMatch(gr, "^ووووو")) 168 | { 169 | temp.AddRange(new List 170 | { "uwuwu", "uwuww", "uwwuw", "uwûw", 171 | "wuwwu", "wuwuw", "wuwû", "wûww", "wwuwu", "wwuww", "wwûw", "wûwu", 172 | "ûwwu", "ûwuw", "ûwû"}); 173 | gr = gr.Substring(5); 174 | } 175 | else if (Regex.IsMatch(gr, "^وووو")) 176 | { 177 | temp.AddRange(new List 178 | { "uwwu", "uwuw", "uwû", 179 | "wwuw", "wwû", "wuww", "wuwu", "wûw", 180 | "ûwu", "ûww", }); 181 | gr = gr.Substring(4); 182 | } 183 | else if (Regex.IsMatch(gr, "^ووو")) 184 | { 185 | temp.AddRange(new List 186 | { "wuw", "wwu", "wû", 187 | "uww", "uwu", 188 | "ûw" }); 189 | gr = gr.Substring(3); 190 | } 191 | else if (Regex.IsMatch(gr, "^وو")) 192 | { 193 | temp.AddRange(new List { "wu", "uw", "ww", "û" }); 194 | gr = gr.Substring(2); 195 | } 196 | else if (Regex.IsMatch(gr, "^و")) 197 | { 198 | temp.AddRange(new List { "u", "w" }); 199 | gr = gr.Substring(1); 200 | } 201 | else if (Regex.IsMatch(gr, "^یی")) 202 | { 203 | temp.AddRange(new List { "îy", "yî" }); 204 | gr = gr.Substring(2); 205 | } 206 | else if (Regex.IsMatch(gr, "^ی")) 207 | { 208 | temp.AddRange(new List { "y", "î" }); 209 | gr = gr.Substring(1); 210 | } 211 | else 212 | { 213 | temp.Add(gr[0].ToString()); 214 | gr = 
gr.Substring(1); 215 | } 216 | 217 | var Count = CandList1.Count; 218 | var TempList = new List(); 219 | foreach (var item in CandList1) 220 | TempList.Add(item); 221 | CandList1.Clear(); 222 | for (int i = 0; i < Count; i++) 223 | { 224 | for (int j = 0; j < temp.Count; j++) 225 | { 226 | var WW = Regex.IsMatch(temp[j], "^ww"); 227 | var IsPreviousVowel = Regex.IsMatch(TempList[i], "[aeêouûiîüȯė]$"); 228 | var IsNowVowel = Regex.IsMatch(temp[j], "^[aeêouûiîüȯė]"); 229 | var ConsonantBeforeWW = !IsPreviousVowel && WW; 230 | var hiatus = IsPreviousVowel && IsNowVowel; 231 | if (!hiatus && !ConsonantBeforeWW) 232 | CandList1.Add(TempList[i] + temp[j]); 233 | } 234 | } 235 | } 236 | // Adding "i" between Consonant Clusters 237 | var Candidates = iInsertion(CandList1); 238 | 239 | // ======= Syllabification for each candidate 240 | var OutputCandidates = Syllabification(Candidates); 241 | 242 | // for speed up: remove candidates that has 1) syllable without vowel or 2) more than 3 consonants in coda 243 | var cCount = OutputCandidates.Count; 244 | if (cCount > 1) 245 | { 246 | for (int i = cCount - 1; i > -1; i--) 247 | if (Regex.IsMatch(OutputCandidates[i], "ˈ[^aeêouûiîüȯė]+(ˈ|$)") 248 | || Regex.IsMatch(OutputCandidates[i], "[aeêouûiîüȯė][^aeêouûiîüȯėˈ]{4,}")) 249 | OutputCandidates.RemoveAt(i); 250 | } 251 | 252 | return OutputCandidates; 253 | } 254 | 255 | // insertion of hidden /i/ vowel 256 | // e.g. 
brd => bird, brid, birid
        // Insertion of the hidden /i/ vowel.
        // For every candidate, emits all variants obtained by optionally inserting "i"
        // between each pair of adjacent non-vowel characters.
        // The variant without the inserted "i" is always emitted before the one with it,
        // so the relative order of the output is stable.
        // e.g. brd => brd, brid, bird, birid
        private static List<string> iInsertion(List<string> Cands)
        {
            var Candidates = new List<string>();
            foreach (var cand in Cands)
            {
                // Null/empty candidates are passed through unchanged.
                if (string.IsNullOrEmpty(cand))
                {
                    Candidates.Add(cand);
                    continue;
                }

                var variants = new List<string> { cand[0].ToString() };
                for (int j = 1; j < cand.Length; j++)
                {
                    // The cluster test depends only on (cand, j), so evaluate it once per position
                    // instead of once per accumulated variant.
                    var isCluster = Regex.IsMatch(cand.Substring(j - 1, 2), @"[^aeêouûiîüȯė][^aeêouûiîüȯė]");
                    var extended = new List<string>(variants.Count * 2);
                    foreach (var prefix in variants)
                    {
                        extended.Add(prefix + cand[j]);
                        if (isCluster)
                            extended.Add(prefix + "i" + cand[j]);
                    }
                    variants = extended;
                }
                Candidates.AddRange(variants);
            }
            return Candidates;
        }

        // Syllabification of candidates: marks every syllable start with "ˈ".
        // e.g. dexom => ˈdeˈxom
        // The list is modified in place and the same instance is returned. Alternative
        // syllabifications are appended to the end of the list; only the candidates that
        // were present on entry are processed by the loop.
        private static List<string> Syllabification(List<string> Candidates)
        {
            var initialCount = Candidates.Count;
            for (int i = 0; i < initialCount; i++)
            {
                // Onset C(C)V
                var syllabified = Regex.Replace(Candidates[i],
                    "([^aeêouûiîȯėwy][wy]|[^aeêouûiîȯė])([aeêouûiîȯė])", "ˈ$1$2");
                // if there is no ˈ at the beginning (grˈtin => ˈgrˈtin)
                syllabified = Regex.Replace(syllabified, "^([^ˈ])", "ˈ$1");
                Candidates[i] = syllabified;

                // add an alternative candidate ('be'sye => + 'bes'ye)
                if (Regex.IsMatch(syllabified, "[aeêouûiîȯė][^aeêouûiîȯė]?ˈ[^aeêouûiîȯėwy][wy]"))
                    Candidates.Add(Regex.Replace(syllabified,
                        "([aeêouûiîȯė][^aeêouûiîȯė]?)ˈ([^aeêouûiîȯėwy])([wy])", "$1$2ˈ$3"));
            }
            return Candidates;
        }
 309 | 310 | // EVAL: specifies a penalty number for each syllabified candidate 311 | private static Dictionary EVAL(List Candidates) 312 | { 313 | var output = new Dictionary(); 314 | if (Candidates.Count > 0) 315 | { 316 | var Penalty = new Dictionary(); 317 | for (int i = 0; i <
Candidates.Count; i++) 318 | { 319 | var P = 0; 320 | // ================= types of penalties ============ 321 | // Complex Onset 322 | P += Regex.Matches(Candidates[i], "ˈ([^aeêouûiîȯėˈ]{2,}[wy]|[^aeêouûiîȯėˈ]+[^wy])[aeêouûiîȯė]").Count * 20; 323 | 324 | // Complex Coda 325 | if (Candidates[i] != "ˈpoynt") 326 | P += Regex.Matches(Candidates[i], "[aeêouûiîȯė][^aeêouûiîȯėˈ]{3}").Count * 10; 327 | 328 | P += Regex.Matches(Candidates[i], "[^aeêouûiîȯėˈ][wy][aeêouûiîȯė][wy][^aeêouûiîȯėˈ]").Count * 20; 329 | 330 | // SSP: ascending Sonority in coda 331 | var codas = Regex.Matches(Candidates[i], "(?<=[aeêouûiîȯė])[^aeêouûiîȯėˈ]{2,}"); 332 | foreach (var coda in codas) 333 | { 334 | var chars = coda.ToString(); 335 | for (int j = 0; j < chars.Length - 1; j++) 336 | if (SonorityIndex(chars[j]) <= SonorityIndex(chars[j + 1])) 337 | P += 10; 338 | } 339 | // DEP: i insertion 340 | P += Regex.Matches(Candidates[i], "i").Count * 2; 341 | //=========================== 342 | 343 | P += Regex.Matches(Candidates[i], "kˈr").Count * 3; 344 | 345 | // ('kurd'si'tan => 'kur'dis'tan) 346 | P += Regex.Matches(Candidates[i], "[^aeêouûiîȯėˈ]ˈsiˈtaˈ?n").Count * 3; 347 | 348 | //"(kewt|newt|ḧewt|rext|sext|dest|pest|řast|mest|pişt|wîst|hest|bîst|heşt|şest)" 349 | // suffix /it/ and /im/ ('sert => 'se'rit) ('xewt !! 
'xe'wit / 'xewt) 350 | if (!Regex.IsMatch(Candidates[i], 351 | "(rift|neft|kurt|girt|xirt|germ|term|port)")) 352 | P += Regex.Matches(Candidates[i], "[aeêouûiîȯė]([^aeêouûiîyȯėˈ]m|[^aeêouûiîysşxwˈ]t)$").Count * 3; 353 | 354 | // (ˈdyu/ => ˈdîw) and (ˈkwiř => ˈkuř) 355 | P += Regex.Matches(Candidates[i], "yu").Count * 5; 356 | P += Regex.Matches(Candidates[i], "uy").Count * 5; 357 | P += Regex.Matches(Candidates[i], "yi").Count * 5; 358 | P += Regex.Matches(Candidates[i], "iˈ?y").Count * 5; // bes'ti'yan 359 | P += Regex.Matches(Candidates[i], "wu").Count * 5; 360 | P += Regex.Matches(Candidates[i], "uˈ?w").Count * 2; // 'bi'bu'wî 361 | P += Regex.Matches(Candidates[i], "wi").Count * 2; 362 | P += Regex.Matches(Candidates[i], "iw").Count * 2; 363 | P += Regex.Matches(Candidates[i], "wû").Count * 5; 364 | 365 | // ˈdiˈrêˈjayˈyî => ˈdiˈrêˈjaˈyîy (not heyyî and teyyî) 366 | // ˈdiˈrêjˈyî => ˈdiˈrêˈjîy 367 | // (NOT ˈḧeyˈyî teyˈyî") 368 | P += Regex.Matches(Candidates[i], "[^aeêouûiîȯė]ˈyî").Count * 3; 369 | 370 | // [CV]'CyV => [CV]C'yV (ˈdiˈrêˈjyî => ˈdiˈrêˈjîy) ('bes'tye'tî => 'best'ye'tî) 371 | P += Regex.Matches(Candidates[i], "(? 
CC'yV (bir'dyan => bird'yan) ˈswênˈdyan 374 | P += Regex.Matches(Candidates[i], "[^aeêouûiî]ˈ[^aeêouûiî][y][aeêouûî]").Count * 2; 375 | 376 | // twîˈwur => tu'yûr 377 | P += Regex.Matches(Candidates[i], "[^aeêouûiî]wîˈw").Count * 3; 378 | //=========================== 379 | // Cix (řê'kix'raw => řêk'xi'raw 380 | P += Regex.Matches(Candidates[i], "[^aeêouûiî]ixˈ").Count * 2; 381 | 382 | // ^'hełC' => ^'heł'C 383 | P += Regex.Matches(Candidates[i], "^ˈhe(ł[^aeêouûiîˈ]ˈ|ˈłi)").Count * 3; 384 | 385 | // (he'jarn => 'he'ja'rin) 386 | P += Regex.Matches(Candidates[i], "rn").Count * 5; 387 | 388 | // ('xawn => 'xa'win) ('pyawn => pya'win) 389 | P += Regex.Matches(Candidates[i], "[aêoûî][w][^aeêouûiîˈ]").Count * 5; 390 | //=========================== 391 | 392 | // ('lab'ri'di'nî => 'la'bir'di'nî) 393 | P += Regex.Matches(Candidates[i], "[aeêouûiî][^aeêouûiîˈ]ˈriˈ").Count * 5; 394 | // 395 | // 'ser'nic, 'dek'rid, gir'fit => 'se'rinc, 'de'kird, 'gi'rift (NOT gir'tin) 396 | var pat = Regex.Match(Candidates[i], "([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])i([^aeêouûiîˈ])"); 397 | if (pat.Success) 398 | { 399 | var C = Regex.Replace(pat.Value, "[iˈ]", ""); 400 | if (SonorityIndex(C[1]) > SonorityIndex(C[2])) 401 | P += 3; // 402 | } 403 | // ('sern'cê => 'se'rin'cê) 404 | pat = Regex.Match(Candidates[i], "([^aeêouûiîˈ])([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])"); 405 | if (pat.Success) 406 | { 407 | var C = Regex.Replace(pat.Value, "[iˈ]", ""); 408 | if (SonorityIndex(C[0]) > SonorityIndex(C[1])) 409 | P += 3; 410 | } 411 | // ('ser'ni'cê => 'se'rin'cê) 412 | pat = Regex.Match(Candidates[i], "([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])iˈ([^aeêouûiîˈ])"); 413 | if (pat.Success) 414 | { 415 | var C = Regex.Replace(pat.Value, "[iˈ]", ""); 416 | if (SonorityIndex(C[0]) > SonorityIndex(C[1]) && SonorityIndex(C[1]) > SonorityIndex(C[2])) 417 | P += 3; 418 | } 419 | // ('gi'rit'nê => 'gir'ti'nê) ('ku'şit'ne => 'kuş'ti'ne) 420 | pat = Regex.Match(Candidates[i], "[aeêouûiî]ˈ([^aeêouûiîˈ])i([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])"); 421 
| if (pat.Success) 422 | { 423 | var C = Regex.Replace(pat.Value, "[aeêouûiîˈ]", ""); 424 | if (SonorityIndex(C[2]) >= SonorityIndex(C[1])) 425 | P += 3; 426 | } 427 | Penalty.Add(Candidates[i], P); 428 | } 429 | output = Penalty.OrderBy(x => x.Value).ToDictionary(x => x.Key, x => x.Value); 430 | } 431 | return output; 432 | } 433 |
        // Sonority Sequencing Principle in EVAL needs a phoneme ranking.
        // Returns the sonority rank of a single phoneme character (higher = more sonorous).
        // Any character not listed below is ranked 1 (treated as a stop).
        // Plain ordinal set lookups are used instead of building a Regex for every character;
        // the character sets are exactly those of the original character classes.
        private static int SonorityIndex(char ch)
        {
            if ("wy".IndexOf(ch) >= 0) // approximant
                return 6;
            if ("lłrř".IndexOf(ch) >= 0) // liquid (lateral / rhotic)
                return 5;
            if ("mn".IndexOf(ch) >= 0) // nasal
                return 4;
            if ("fvszşjxẍƹḧh".IndexOf(ch) >= 0) // fricative
                return 3;
            if ("cç".IndexOf(ch) >= 0) // affricate
                return 2;
            return 1; // stop
        }

        /// Only for tests: normalizes the grapheme, generates every candidate and returns
        /// each candidate with its penalty, as produced by EVAL.
        public static Dictionary<string, int> AllCandidates(string grapheme)
        {
            return EVAL(Generator(g2pNormalize(grapheme)));
        }
 457 | } 458 | } -------------------------------------------------------------------------------- /Normalize.cs: -------------------------------------------------------------------------------- 1 | // Automated Kurdish Text Normalization خاوێن کردنی ئۆتۆماتیکی دەقی کوردی 2 | // Copyright (C) 2019 Aso Mahmudi, Hadi Veisi, Mohammad MohammadAmini, Hawre Hosseini 3 | // Developer and Maintainer: Aso Mahmudi (aso.mehmudi@gmail.com) 4 | 5 | // Source Code: https://github.com/AsoSoft/AsoSoft-Library 6 | // Paper: https://www.researchgate.net/publication/333729065 7 | // Cite: 8 | // @inproceedings{mahmudi2019automated, 9 | // title={Automated Kurdish Text Normalization}, 10 | // author={Mahmudi, Aso and Veisi, Hadi and MohammadAmini, Mohammad and Hosseini, Hawre}, 11 | // booktitle={The Second International Conference on Kurdish and Persian Languages and Literature}, 12 | // year={2019} 13 | // } 14 | 15 | using System.Collections.Generic; 16 | using System.Text; 
17 | using System.Text.RegularExpressions; 18 | 19 | namespace AsoSoftLibrary 20 | { 21 | public static partial class AsoSoft 22 | { 23 | 24 | static Dictionary DeepReplacements = LoadNormalizerReplaces(resFiles.NormalizerDeep); 25 | static Dictionary additionalReplacements = LoadNormalizerReplaces(resFiles.NormalizerAdditional); 26 | 27 | // ================= Converting Non-Standard Fonts ================= 28 | 29 | /// Converts Kurdish text written in AliK fonts into Unicode standard 30 | public static string AliK2Unicode(string text) => replaceByList(text, normalizationReplaces["AliK2Unicode"]); 31 | 32 | /// Converts Kurdish text written in AliWeb fonts into Unicode standard 33 | public static string AliWeb2Unicode(string text) => replaceByList(text, normalizationReplaces["AliWeb2Unicode"]); 34 | 35 | /// Converts Kurdish text written in KDylan fonts into Unicode standard 36 | public static string Dylan2Unicode(string text) => replaceByList(text, normalizationReplaces["Dylan2Unicode"]); 37 | 38 | /// Converts Kurdish text written in Zarnegar fonts into Unicode standard 39 | public static string Zarnegar2Unicode(string text) => replaceByList(text, normalizationReplaces["Zarnegar2Unicode"]); 40 | 41 | static string Ku = "ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنوۆەهھیێأإآثذصضطظكيىةڎۊؤ" 42 | + "\u064B-\u065F"; // Haraka 43 | static string joiners = "ئبپتثجچحخسشصضطظعغفڤقکكگلڵمنیيهھێ"; 44 | private static readonly Dictionary> normalizationReplaces = new Dictionary> 45 | { 46 | {"NormalizeKurdish1", new List() { 47 | //========= Tatweels (U+0640) 48 | "\u0640{2,}", "\u0640", // merge 49 | $"(?<=[{joiners}])\u0640(?=[{Ku}])", "", // delete unnecessary tatweel e.g. 
هـا to ها 50 | // replace tatweel nonadjacent to Kurdish letters with dash 51 | $"(?<=[{joiners}])\u0640", "\uF640", // temporal preserve 52 | $"\u0640(?=[{Ku}])", "\uF640", // temporal preserve 53 | "\u0640", "-", 54 | "\uF640", "\u0640", 55 | 56 | //========= Zero-Width Non-Joiner 57 | "[\uFEFF\u200C]+", "\u200C", //Standardize and remove dublicated ZWNJ 58 | // remove unnecessary ZWNJ 59 | "\u200C(?=(\\s|\\p{P}|$))", "", // ZWNJ + white spaces 60 | $"(? ماهـ 65 | $"(?() { 68 | //========= standard H, E, Y, K 69 | "ه" + "\u200C", "ە", // Heh+ZWNJ => kurdish AE 70 | "ه" + "(?=([^" + Ku +"ـ]|$))", "ە", //final Heh looks like Ae 71 | "ھ" + "(?=([^" + Ku +"]|$))", "هـ", // final Heh Doachashmee 72 | "ھ" , "ه", // non-final Heh Doachashmee 73 | "ى|ي", "ی", // Alef maksura | Arabic Ye => Farsi ye 74 | "ك", "ک", // Arabic Kaf => Farsi Ke 75 | "\u200C" + "و ", " و ", // شوێن‌و جێ => شوێن و جێ 76 | //"\u200C" + "دا" + "(?![" + Ku + @"]($|[ \t]))", "دا", // شوێن‌دا => شوێندا 77 | //"(? بێ شوێن 78 | 79 | //========= errors from font conversion 80 | "لاَ|لاً|لأ", "ڵا", 81 | "(ی|ێ)" + "[\u064E\u064B]+", "ێ", //FATHA & FATHATAN 82 | "(و|ۆ)" + "[\u064E\u064B]+", "ۆ", 83 | "(ل|ڵ)" + "[\u064E\u064B]+", "ڵ", 84 | "(ر|ڕ)" + "\u0650+", "ڕ", //KASRA 85 | }}, 86 | {"NormalizeKurdish3", new List() { 87 | "(?() { 92 | "لاَ|لآ|لاً", "ڵا", 93 | "لً|لَ|لأ", "ڵ", 94 | "ة", "ە", 95 | "ه" + "(?!([ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنوۆەهھیێأإآثذصضطظكيىةڎۊؤ]|$))", "هـ", 96 | "ض", "چ", 97 | "ث", "پ", 98 | "ظ", "ڤ", 99 | "ط", "گ", 100 | "ك", "ک", 101 | "ىَ|يَ|یَ|آ", "ێ", 102 | "رِ", "ڕ", 103 | "ؤ|وَ", "ۆ", 104 | "ي|ى", "ی", 105 | "ء", "\u200Cو", 106 | "ِ", "", 107 | "ذ", "ژ" 108 | }}, 109 | {"AliWeb2Unicode", new List() { 110 | "لاَ|لآ|لاً", "ڵا", 111 | "لَ|پ", "ڵ", 112 | "ة", "ە", 113 | "ه", "ھ", 114 | "ه", "ھ", 115 | "رِ|أ", "ڕ", 116 | "ؤ|وَ", "ۆ", 117 | "يَ|یَ", "ێ", 118 | "ص", "ێ", 119 | "ي", "ی", 120 | "ط", "ڭ", //swap ط and گ 121 | "گ", "ط", // 122 | "ڭ", "گ", // 123 | "ض", "چ", 124 | "ث", "پ", 125 
| "ظ", "ڤ", 126 | "ْ|ُ", "", 127 | "ى", "*", 128 | "ك", "ک", 129 | "ذ", "ژ" 130 | }}, 131 | {"Dylan2Unicode", new List() { 132 | "لإ|لأ|لآ", "ڵا", 133 | "ؤ|وَ", "ۆ", 134 | "ة", "ە", 135 | "ض", "ڤ", 136 | "ص", "ڵ", 137 | "ث", "ێ", 138 | "ؤ", "ۆ", 139 | "ه", "ھ", 140 | "ك", "ک", 141 | "ي|ى", "ی", 142 | "ذ", "ڕ" 143 | }}, 144 | {"Zarnegar2Unicode", new List() { 145 | "لاٌ", "ڵا", 146 | "ى|ي", "ی", 147 | "یٌ", "ێ", 148 | "ه‏", "ە", 149 | "لٌ", "ڵ", 150 | "رٍ", "ڕ", 151 | "وٌ", "ۆ" 152 | }}, 153 | {"SeperateDigits", new List() { 154 | "(?() { 159 | "\\(\\(", "«", 160 | "\\)\\)", "»", 161 | "»", "\uF8FA", // temp replacement «x»eke 162 | "\\)", "\uF8FB", //temp replacement 163 | "([!.:;?،؛؟]+)(\\p{Pi})", "$1 $2", 164 | "(\\p{P}+)(?![\\s\\p{P}])", "$1 ", // Seprate all punctuations 165 | "\uF8FA", "»", // undo temp replacement 166 | "\uF8FB", ")", // undo temp replacement 167 | "(?() { 173 | " ((\\p{Pe}|\\p{Pf})+)", "$1", // A ) B => A) B 174 | "((\\p{Ps}|\\p{Pi})+) ", "$1", // A ( B => A (B 175 | " ([!.:;?،؛؟]+)", "$1", // A ! => A! 176 | }}, 177 | {"NormalizePunctuations3", new List() { 178 | "(? 
A " B 179 | "(\uF8FD)(?![ \\t\\p{P}])", "$1 ", // A "B => A " B 180 | }} 181 | }; 182 | 183 | private static string replaceByList(string text, List replaceList) 184 | { 185 | for (int i = 0; i < replaceList.Count; i += 2) 186 | text = Regex.Replace(text, replaceList[i], replaceList[i + 1]); 187 | return text; 188 | } 189 | // ================= Normalization ================= 190 | private static Dictionary LoadNormalizerReplaces(string file) 191 | { 192 | var output = new Dictionary(); 193 | 194 | var items = file.Trim().Split('\n'); 195 | for (int i = 1; i < items.Length; i++) 196 | { 197 | var item = items[i].Split(','); 198 | var chOld = System.Convert.ToChar(System.Convert.ToUInt32(item[0], 16)); 199 | var chNew = ""; 200 | foreach (var ch in item[1].Split(' ')) 201 | if (ch != "") 202 | chNew += System.Convert.ToChar(System.Convert.ToUInt32(ch, 16)); 203 | if (!output.ContainsKey(chOld)) 204 | output.Add(chOld, chNew); 205 | } 206 | return output; 207 | } 208 | 209 | /// Unicode Normalization for Central Kurdish 210 | public static string Normalize(string text) 211 | { 212 | return Normalize(text, true, true, true, true, new Dictionary()); 213 | } 214 | 215 | /// Main Unicode Normalization for Central Kurdish 216 | public static string Normalize(string text, 217 | bool isOnlyKurdish, 218 | bool changeInitialR, 219 | bool deepUnicodeCorrectios, 220 | bool additionalUnicodeCorrections, 221 | Dictionary usersReplaceList) 222 | { 223 | var replaces = new Dictionary(); 224 | // Character-based replacement (ReplaceList and Private Use Area) 225 | var CharList = new List(); 226 | for (int i = 0; i < text.Length; i++) 227 | if (!CharList.Contains(text[i])) 228 | CharList.Add(text[i]); 229 | 230 | if (deepUnicodeCorrectios) 231 | foreach (var item in DeepReplacements) 232 | if (CharList.Contains(item.Key)) 233 | replaces.Add(item.Key, item.Value); 234 | if (additionalUnicodeCorrections) 235 | foreach (var item in additionalReplacements) 236 | if 
(CharList.Contains(item.Key) && !replaces.ContainsKey(item.Key)) 237 | replaces.Add(item.Key, item.Value); 238 | foreach (var item in usersReplaceList) 239 | if (CharList.Contains(item.Key) && !replaces.ContainsKey(item.Key)) 240 | replaces.Add(item.Key, item.Value); 241 | 242 | foreach (var ch in CharList) 243 | { 244 | if (replaces.ContainsKey(ch)) //ReplaceList 245 | text = text.Replace(ch.ToString(), replaces[ch]); 246 | else if (ch > 57343 && ch < 63744) //Private Use Area 247 | text = text.Replace(ch, '□'); // u25A1 White Square 248 | } 249 | 250 | text = replaceByList(text, normalizationReplaces["NormalizeKurdish1"]); 251 | 252 | // if the text is Monolingual (only Central Kurdish) 253 | if (isOnlyKurdish) 254 | { 255 | text = replaceByList(text, normalizationReplaces["NormalizeKurdish2"]); 256 | //========= Initial r 257 | if (changeInitialR) 258 | text = replaceByList(text, normalizationReplaces["NormalizeKurdish3"]); 259 | } 260 | return text; 261 | } 262 | 263 | // ===== Unifying Numerals ===== 264 | private static readonly string[] digits = new string[]{ 265 | "۰", "٠", "0", 266 | "۱", "١", "1", 267 | "۲", "٢", "2", 268 | "۳", "٣", "3", 269 | "۴", "٤", "4", 270 | "۵", "٥", "5", 271 | "۶", "٦", "6", 272 | "۷", "٧", "7", 273 | "۸", "٨", "8", 274 | "۹", "٩", "9", }; 275 | 276 | /// unifies numeral characters into desired numeral type from en (0123456789) or ar (٠١٢٣٤٥٦٧٨٩). 277 | public static string UnifyNumerals(string text, string NumeralType) 278 | { 279 | for (int i = 0; i < digits.Length; i += 3) 280 | { 281 | if (NumeralType == "en") 282 | text = Regex.Replace(text, digits[i] + "|" + digits[i + 1], digits[i + 2]); 283 | else if (NumeralType == "ar") 284 | text = Regex.Replace(text, digits[i] + "|" + digits[i + 2], digits[i + 1]); 285 | } 286 | return text; 287 | } 288 | 289 | /// Seperate digits from words (e.g. 
/// replacing "12a" with "12 a")
/// Applies the "SeperateDigits" regex replacement list to the text.
public static string SeperateDigits(string text)
{
    return replaceByList(text, normalizationReplaces["SeperateDigits"]);
}

/// <summary>Normalize Punctuations: corrects the spacing around punctuation marks.</summary>
/// <param name="text">Text to normalize.</param>
/// <param name="seprateAllPunctuations">
/// When false, the "NormalizePunctuations2" list runs after the common first pass
/// (e.g. "A ) B" becomes "A) B"); when true, the "NormalizePunctuations3" list runs
/// instead (e.g. A "B becomes A " B).
/// </param>
/// <returns>The text with normalized punctuation spacing.</returns>
public static string NormalizePunctuations(string text, bool seprateAllPunctuations)
{
    // Park straight double quotes on the private-use code point U+F8FD so the
    // replacement lists can address them explicitly; they are restored below.
    text = text.Replace('"', '\uF8FD');
    text = replaceByList(text, normalizationReplaces["NormalizePunctuations1"]);

    var secondPass = seprateAllPunctuations ? "NormalizePunctuations3" : "NormalizePunctuations2";
    text = replaceByList(text, normalizationReplaces[secondPass]);

    return text.Replace('\uF8FD', '"'); // undo the temporary replacement
}


/// <summary>
/// Trim white spaces of a line, including the zero-width characters
/// U+200B (zero-width space), U+200C (ZWNJ) and U+FEFF (BOM / zero-width no-break space).
/// </summary>
/// <param name="line">The line to trim; must not be null.</param>
/// <returns>
/// The line without any leading or trailing run of white space and zero-width
/// characters. Characters inside the line are never touched.
/// </returns>
public static string TrimLine(string line)
{
    // A single scan over both edges removes runs in which white space and
    // zero-width characters are interleaved in any order. The earlier
    // Trim()/Regex alternation stripped only a fixed number of layers and could
    // leave a zero-width character behind (e.g. "text \u200C \u200C").
    bool IsTrimmable(char c) =>
        char.IsWhiteSpace(c) || c == '\u200B' || c == '\u200C' || c == '\uFEFF';

    int start = 0;
    int end = line.Length;
    while (start < end && IsTrimmable(line[start]))
        start++;
    while (end > start && IsTrimmable(line[end - 1]))
        end--;
    return line.Substring(start, end - start);
}

/// HTML Entity replacement for web crawled texts (e.g.
"&eacute;" with "é") 315 | public static string ReplaceHtmlEntity(string text) 316 | { 317 | return Regex.Replace(text, "&[a-zA-Z]+;", m => System.Net.WebUtility.HtmlDecode(m.Value)); 318 | } 319 | 320 | /// Replace URLs and Emails with a certain word (improves language models) 321 | public static string ReplaceUrlEmail(string text) 322 | { 323 | text = Regex.Replace(text, "([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+\\.[a-zA-Z]{2,5})", "EmailAddress"); 324 | text = Regex.Replace(text, "((http[s]?|ftp)?://([\\w-]+\\.)+[\\w-]+)(/[\\w-~./?%+&=]*)?", "URL"); 325 | return text; 326 | } 327 | 328 | /// Character replacement for ANSI CodePage 329 | public static string Char2CharReplacment(string text, Dictionary Codepage) 330 | { 331 | foreach (var item in Codepage) 332 | text = text.Replace(item.Key, item.Value); 333 | return text; 334 | } 335 | 336 | /// Correction Table (word replacement ) 337 | public static string Word2WordReplacement(string line, Dictionary wordReplacements) 338 | { 339 | return Regex.Replace(line, "(? wordReplacements.ContainsKey(m.Value) ? wordReplacements[m.Value] : m.Value); 341 | } 342 | 343 | //================= have to be improved: ================= 344 | 345 | /// Delete non-CK lines (fast but not accurate; we need a language detector.) 
346 | public static string DeleteNonKurdish(string line, int KurdishRateThreshold) 347 | { 348 | float KuPersent = Regex.Matches(line, "[پچژگڵۆڕێڤەھ]").Count / (float)line.Length; 349 | if (KuPersent < KurdishRateThreshold / 100.0) 350 | line = ""; 351 | return line; 352 | } 353 | 354 | /// Embrace sentences with start/end tags 355 | public static string MarkSentence(string line, string sentenceTag) 356 | { 357 | var tagStart = "<" + sentenceTag + ">"; 358 | var tagEnd = ""; 359 | 360 | // ending punctuations !?؟ 361 | line = Regex.Replace(line.TrimEnd(), "([!?؟]+)(?!$)", "$1 " + tagEnd + tagStart); 362 | // full stop 363 | line = Regex.Replace(line, "([\\w\u200C]{2,} ?\\.)(?!([0-9a-zA-Z.]|$))", "$1 " + tagEnd + tagStart); 364 | 365 | return tagStart + line + tagEnd; 366 | } 367 | } 368 | } 369 | 370 | // ================= Regex Hints ================= 371 | // docs.microsoft.com/en-us/dotnet/standard/base-types/character-classes-in-regular-expressions 372 | // Lookbehind Positive: (?<=a)b 373 | // Lookbehind Negative: (?converts numerals into Central Kurdish words. It is useful in text-to-speech tools. 14 | public static string Number2Word(string text) 15 | { 16 | // convert numbers to latin 17 | var unifyNumbers = new string[]{ 18 | "٠|۰", "0", 19 | "١|۱", "1", 20 | "٢|۲", "2", 21 | "٣|۳", "3", 22 | "٤|۴", "4", 23 | "٥|۵", "5", 24 | "٦|۶", "6", 25 | "٧|۷", "7", 26 | "٨|۸", "8", 27 | "٩|۹", "9" }; 28 | for (int i = 0; i < unifyNumbers.Length; i += 2) 29 | text = Regex.Replace(text, unifyNumbers[i], unifyNumbers[i + 1]); 30 | 31 | text = Regex.Replace(text, "([0-9]{1,3})[,،](?=[0-9]{3})", "$1"); // remove thousend seperator 12,345,678 => 12345678 32 | text = Regex.Replace(text, "(? 
floatName(m.Groups[1].Value.ToString(), m.Groups[2].Value.ToString())); 42 | 43 | //convert remaining integr numbers 44 | text = Regex.Replace(text, "([0-9]+)", 45 | m => integerName(m.Groups[1].Value.ToString())); 46 | 47 | return text; 48 | } 49 | 50 | private static string floatName(string integerPart, string decimalPart) 51 | { 52 | var point = " پۆینت " + Regex.Replace(decimalPart, "(?<=^|0)0", " سفر "); 53 | point = Regex.Replace(point, "[0-9]", ""); 54 | return integerName(integerPart) + point + integerName(decimalPart); 55 | } 56 | 57 | private static string integerName(string inputInteger) 58 | { 59 | var output = ""; 60 | if (inputInteger != "0") 61 | { 62 | string[] ones = { "", "یەک", "دوو", "سێ", "چوار", "پێنج", "شەش", "حەوت", "هەشت", "نۆ" }; 63 | string[] teens = { "دە", "یازدە", "دوازدە", "سێزدە", "چواردە", "پازدە", "شازدە", "حەڤدە", "هەژدە", "نۆزدە" }; 64 | string[] tens = { "", "", "بیست", "سی", "چل", "پەنجا", "شەست", "هەفتا", "هەشتا", "نەوەد" }; 65 | string[] hundreds = { "", "سەد", "دووسەد", "سێسەد", "چوارسەد", "پێنسەد", "شەشسەد", "حەوتسەد", "هەشتسەد", "نۆسەد" }; 66 | string[] thousands = { "", " هەزار", " ملیۆن", " ملیار", " بلیۆن", " بلیار", " تریلیۆن", " تریلیار", " کوادرلیۆن" }; 67 | var temp = inputInteger; 68 | for (int i = 0; i < inputInteger.Length; i = i + 3) 69 | { 70 | string currentThree = Regex.Match(temp, "([0-9]{1,3})$").Result("$1"); 71 | temp = temp.Substring(0, temp.Length - currentThree.Length); 72 | currentThree = currentThree.PadLeft(3, '0'); 73 | var C = Int32.Parse(currentThree[0].ToString()); 74 | var X = Int32.Parse(currentThree[1].ToString()); 75 | var I = Int32.Parse(currentThree[2].ToString()); 76 | var conjunction1 = ((C != 0) && (X != 0 || I != 0)) ? " و " : ""; 77 | var conjunction2 = (X != 0 && I != 0) ? 
" و " : ""; 78 | if (X == 1) 79 | currentThree = hundreds[C] + conjunction1 + teens[I]; 80 | else 81 | currentThree = hundreds[C] + conjunction1 + tens[X] + conjunction2 + ones[I]; 82 | var M = (currentThree == "") ? "" : thousands[(int)(Math.Floor(i / 3.0))]; 83 | currentThree += M; 84 | var conjunction3 = (output == "") ? "" : " و "; 85 | if (currentThree != "") 86 | output = currentThree + conjunction3 + output; 87 | } 88 | output = output.Replace("یەک هەزار", "هەزار"); 89 | } 90 | else // if input number = 0 91 | output = "سفر"; 92 | return output; 93 | } 94 | } 95 | } -------------------------------------------------------------------------------- /PoemClassifier.cs: -------------------------------------------------------------------------------- 1 | // Automatic Meter Classification of Kurdish Poems 2 | // Copyright (C) 2019 Aso Mahmudi, Hadi Veisi 3 | // Maintainer: Aso Mahmudi (aso.mehmudi@gmail.com) 4 | // Demo: https://asosoft.github.io/poem/ 5 | // Source Code: https://github.com/AsoSoft/AsoSoft-Library 6 | // Test-set: https://github.com/AsoSoft/Vejinbooks-Poem-Dataset 7 | // Paper: https://arxiv.org/abs/2102.12109 8 | // Cite: 9 | //@article{mahmudi2021automatic, 10 | // title={Automatic Meter Classification of Kurdish Poems}, 11 | // author={Mahmudi, Aso and Veisi, Hadi}, 12 | // journal={arXiv preprint arXiv: 2102.12109}, 13 | // year={2021} 14 | //} 15 | 16 | using System; 17 | using System.Collections.Generic; 18 | using System.Linq; 19 | using System.Text.RegularExpressions; 20 | 21 | namespace AsoSoftLibrary 22 | { 23 | 24 | /// 25 | public class Pattern 26 | { 27 | public int freq { get; set; } 28 | public string weights { get; set; } 29 | public string title { get; set; } 30 | } 31 | 32 | /// 33 | public class ScannedHemistich 34 | { 35 | public int lineNo { get; set; } 36 | public string scanned { get; set; } 37 | public int meterID { get; set; } 38 | public int dist { get; set; } 39 | } 40 | 41 | /// 42 | public class ResultSet 43 | { 44 | 
public int syllabic { get; set; } 45 | public double syllabicConfidence { get; set; } 46 | public string quantitative { get; set; } 47 | public double quantitativeConfidence { get; set; } 48 | public string overalPattern { get; set; } 49 | public string overalMeterType { get; set; } 50 | public List details { get; set; } 51 | } 52 | 53 | public static partial class AsoSoft 54 | { 55 | /// Common patterns of Kurdish quantitative verses (VejinBooks corpus, up to 2019/12/1) 56 | public static List CommonPatterns = new List(); 57 | 58 | private static void loadPoemPatterns() 59 | { 60 | var PoemPatterns = resFiles.PoemPatterns.Split('\n'); 61 | for (int i = 1; i < PoemPatterns.Length; i++) 62 | { 63 | var item = PoemPatterns[i].Split(','); 64 | CommonPatterns.Add(new Pattern() { freq = Convert.ToInt32(item[0]), weights = item[1], title = item[2] }); 65 | } 66 | } 67 | 68 | const int _maxDist = 4; 69 | private static int[] patternScores = new int[27]; 70 | 71 | /// Classifies the input Kurdish poem 72 | public static ResultSet PoemClassification(string[] sHemistiches) 73 | { 74 | if (CommonPatterns.Count == 0) 75 | loadPoemPatterns(); 76 | Array.Clear(patternScores, 0, patternScores.Length); 77 | var output = new ResultSet(); 78 | //===== syallabic analysis 79 | var syllableCounts = new List(); 80 | for (int i = 0; i < sHemistiches.Length; i++) 81 | { 82 | var sCount = sHemistiches[i].Split('ˈ').Length - 1; 83 | if (sCount > 0) 84 | syllableCounts.Add(sCount); 85 | } 86 | var HemistichesCount = syllableCounts.Count; 87 | var mode = syllableCounts 88 | .GroupBy(x => x) 89 | .OrderByDescending(y => y.Count()) 90 | .First().Key; 91 | output.syllabic = mode; 92 | output.syllabicConfidence = (double)syllableCounts.Where(x => x == mode).Count() 93 | / HemistichesCount * 100; 94 | 95 | //===== quantitative analysis 96 | var AcceptableCandidates = new List(); 97 | for (int i = 0; i < sHemistiches.Length; i++) 98 | 
AcceptableCandidates.AddRange(PatternMatch(Convert2CV(sHemistiches[i]), i)); 99 | 100 | var highScore = Array.IndexOf(patternScores, patternScores.Max()); 101 | output.quantitative = CommonPatterns[highScore].title; 102 | output.quantitativeConfidence = ((double)patternScores[highScore] / _maxDist) / HemistichesCount * 100; 103 | 104 | //===== final output for each hemistich 105 | var final = new List(); 106 | for (int i = 0; i < sHemistiches.Length; i++) 107 | { 108 | var highScoreMatches = AcceptableCandidates 109 | .Where(x => x.lineNo == i && x.meterID == highScore); 110 | if (highScoreMatches.Count() > 0) 111 | final.Add(highScoreMatches.First()); 112 | else 113 | final.Add(new ScannedHemistich()); 114 | } 115 | output.details = final; 116 | 117 | //===== overal poem classification 118 | var stdDev = CalculateStdDev(syllableCounts); 119 | var metricalMargin = (output.syllabic > 10) ? 40 : 50; 120 | var stdDevMargin = (double)output.syllabic / 10; 121 | if (stdDev > stdDevMargin) 122 | { 123 | output.overalMeterType = "Free Verse/شیعری نوێ"; 124 | } 125 | else if (output.quantitativeConfidence >= metricalMargin) // metrical when: 126 | { 127 | output.overalMeterType = "Quantitative/عەرووزی"; 128 | output.overalPattern = output.quantitative; 129 | } 130 | else if (output.syllabicConfidence >= 40 && stdDev < 1) // syllabic when: 131 | { 132 | output.overalMeterType = "Syllabic/بڕگەیی"; 133 | output.overalPattern = output.syllabic + "Syllabic"; 134 | } 135 | return output; 136 | } 137 | 138 | // input: "ˈgerˈçî ˈtûˈşî ˈřenˈceˈřoˈyîw ˈḧesˈreˈtû ˈderˈdim ˈʔeˈmin " 139 | // output: List<"∪––––∪–––∪–––∪–", "∪––––∪–––∪––∪∪–"> 140 | private static List Convert2CV(string syllabified) 141 | { 142 | if (syllabified.Length > 100) // abort if line is too long 143 | syllabified = " "; 144 | var CV = syllabified; 145 | CV = Regex.Replace(CV, @"[\[\]«»]", ""); // remove "] [" 146 | CV = Regex.Replace(CV + "\n", @"[\n\r\?,;! 
]+", "¤"); // open junctures (punctuation and end of line) => ¤ 147 | CV = Regex.Replace(CV, @" ˈ¤", "¤"); 148 | CV = Regex.Replace(CV, "îˈye", "iˈye"); // (ˈnîˈye => ˈniˈye) 149 | CV = Regex.Replace(CV, "([^ieuaêoîûˈ])([yw])", "$1ɰ"); // gyan-gîyan, xiwa-xuwa => – or ∪– 150 | CV = Regex.Replace(CV, "[bcçdfghḧjklłmnpqrřsşṣtvwxẍyzʔƹ]", "C"); 151 | var syllables = CV.Split('ˈ').Skip(1).ToList(); 152 | var output = new List(); 153 | output.Add(""); 154 | for (int i = 0; i < syllables.Count(); i++) 155 | { 156 | var count = output.Count; 157 | if (Regex.IsMatch(syllables[i], "ɰ")) 158 | { // CVcC(C) syllable (e.g. گیان خوا) 159 | for (int j = 0; j < count; j++) 160 | { 161 | output.Add(output[j] + "–"); 162 | output[j] += "∪–"; 163 | } 164 | } 165 | else if (Regex.IsMatch(syllables[i], "([ieuaêoîû]C+|[aêoû]$|[aêo]¤$)")) 166 | { // heavy syllable 167 | if (i < 2) 168 | { // at first position may be light 169 | for (int j = 0; j < count; j++) 170 | { 171 | output.Add(output[j] + "∪"); 172 | output[j] += "–"; 173 | } 174 | } 175 | else 176 | for (int j = 0; j < count; j++) 177 | output[j] += "–"; 178 | } 179 | else if (Regex.IsMatch(syllables[i], "([ieu]$|i¤$)")) 180 | { // light syllable 181 | for (int j = 0; j < count; j++) 182 | output[j] += "∪"; 183 | } 184 | else if (Regex.IsMatch(syllables[i], "([euîû]¤$|î$)")) 185 | { // may be both 186 | for (int j = 0; j < count; j++) 187 | { 188 | output.Add(output[j] + "∪"); 189 | output[j] += "–"; 190 | } 191 | } 192 | } 193 | return output; 194 | } 195 | 196 | // input: List of "∪–"s 197 | // output: List of nearests of 27 common meter patterns 198 | private static List PatternMatch(List cands, int lineNumber) 199 | { 200 | if (CommonPatterns.Count == 0) 201 | loadPoemPatterns(); 202 | var output = new List(); 203 | if (!string.IsNullOrEmpty(cands[0].Trim())) 204 | { 205 | for (int i = 0; i < CommonPatterns.Count; i++) 206 | { // for 27 common meter patterns 207 | var distances = new Dictionary(); 208 | for (int j = 0; j < 
cands.Count; j++) // for each candidate 209 | distances.Add(j, Levenshtein(cands[j], CommonPatterns[i].weights)); 210 | var lowestDist = distances.OrderBy(x => x.Value).First().Value; 211 | if (lowestDist <= _maxDist) 212 | { 213 | patternScores[i] += _maxDist - lowestDist; 214 | foreach (var item in distances.Where(x => x.Value == lowestDist)) 215 | { 216 | output.Add(new ScannedHemistich() 217 | { 218 | lineNo = lineNumber, 219 | scanned = cands[item.Key], 220 | meterID = i, 221 | dist = item.Value 222 | }); 223 | } 224 | } 225 | } 226 | } 227 | return output; 228 | } 229 | 230 | //================================================== 231 | 232 | /// Normalizes the input text for classification steps. 233 | public static string PoemNormalization(string text) 234 | { 235 | text = Regex.Replace(text, "ط", "ت"); 236 | text = Regex.Replace(text, "[صث]", "س"); 237 | text = Regex.Replace(text, "[ضذظ]", "ز"); 238 | text = Regex.Replace(text, "( و)([.،؟!])", "$1"); 239 | return text; 240 | } 241 | 242 | private static double CalculateStdDev(List values) 243 | { 244 | double ret = 0; 245 | if (values.Count() > 0) 246 | { 247 | double avg = values.Average(); 248 | double sum = values.Sum(d => Math.Pow(d - avg, 2)); 249 | ret = Math.Sqrt((sum) / (values.Count() - 1)); 250 | } 251 | return ret; 252 | } 253 | 254 | private static double CalculateStdDev(List values, double avg) 255 | { 256 | double ret = 0; 257 | if (values.Count() > 0) 258 | { 259 | double sum = values.Sum(d => Math.Pow(d - avg, 2)); 260 | ret = Math.Sqrt((sum) / (values.Count() - 1)); 261 | } 262 | return ret; 263 | } 264 | 265 | private static int Levenshtein(string s1, string s2) 266 | { 267 | if (string.IsNullOrEmpty(s1)) 268 | { 269 | if (!string.IsNullOrEmpty(s2)) 270 | return s2.Length; 271 | return 0; 272 | } 273 | if (string.IsNullOrEmpty(s2)) 274 | { 275 | if (!string.IsNullOrEmpty(s1)) 276 | return s1.Length; 277 | return 0; 278 | } 279 | var m = s1.Length + 1; 280 | var n = s2.Length + 1; 281 | 
int[,] d = new int[m, n]; 282 | 283 | for (int i = 0; i < m; i++) 284 | d[i, 0] = i; 285 | for (int i = 0; i < n; i++) 286 | d[0, i] = i; 287 | 288 | for (int i = 1; i < m; i++) 289 | { 290 | for (int j = 1; j < n; j++) 291 | { 292 | var cost = (s1[i - 1] == s2[j - 1]) ? 0 : 2; // or 2 293 | var min1 = d[i - 1, j] + 1; 294 | var min2 = d[i, j - 1] + 1; 295 | var min3 = d[i - 1, j - 1] + cost; 296 | d[i, j] = Math.Min(Math.Min(min1, min2), min3); 297 | } 298 | } 299 | return d[m - 1, n - 1]; 300 | } 301 | } 302 | } 303 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AsoSoft Library 2 | AsoSoft Library offers basic natural language processing (NLP) algorithms for the Kurdish Language (ckb: Central branch of Kurdish). 3 | AsoSoft Library is written in C#. 4 | - **Grapheme-to-Phoneme (G2P) converter and Transliteration**: converts Kurdish text into syllabified phoneme string. Also transliterates Kurdish texts from Arabic script into Latin script and vice versa. 5 | - **Normalizer**: normalizes the Kurdish text and punctuation marks, unifies numerals, replaces Html Entities, extracts and replaces URLs and emails, and more. 6 | - **Numeral Converter**: converts any type of numbers into Kurdish words. 7 | - **Sort**: Sorts a list in correct Kurdish alphabet order. 8 | - **Poem Meter Classifier**: Classifies the meter of the input Kurdish poem 9 | 10 | ## Grapheme-to-Phoneme (G2P) converter and Transliteration 11 | This function is based on the study "[Automated Grapheme-to-Phoneme Conversion for Central Kurdish based on Optimality Theory](https://www.sciencedirect.com/science/article/abs/pii/S0885230821000292)". 12 | 13 | ### Kurdish G2P converter 14 | Converts Central Kurdish text in standard Arabic script into **syllabified phonemic** Latin script (i.e. 
graphemes to phonems) 15 | 16 | General format: 17 | ```cs 18 | AsoSoft.G2P(string text, 19 | bool convertNumbersToWord = false, 20 | bool backMergeConjunction = true, 21 | bool singleOutputPerWord = true); 22 | ``` 23 | An example: 24 | ```cs 25 | AsoSoft.G2P("شەو و ڕۆژ بووین بە گرفت. درێژیی دیوارەکەی گرتن"); 26 | >ˈşeˈwû ˈřoj ˈbûyn ˈbe ˈgiˈrift. ˈdiˈrêˈjîy ˈdîˈwaˈreˈkey ˈgirˈtin< 27 | ``` 28 | ### Transliteration 29 | 30 | Arabic script into Hawar Latin script (ح‌غ‌ڕڵ→ḧẍřł): 31 | ```cs 32 | AsoSoft.Ar2La("گیرۆدەی خاڵی ڕەشتە؛ گوێت لە نەغمەی تویوورە؟"); 33 | >gîrodey xałî řeşte; gwêt le neẍmey tuyûre?< 34 | ``` 35 | 36 | Arabic script into simplified (ح‌غ‌ڕڵ→hxrl) Hawar Latin script: 37 | ```cs 38 | AsoSoft.Ar2LaSimple("گیرۆدەی خاڵی ڕەشتە؛ گوێت لە نەغمەی تویوورە؟"); 39 | >gîrodey xalî reşte; gwêt le nexmey tuyûre?< 40 | ``` 41 | 42 | Latin script (Hawar) into Arabic script: 43 | ```cs 44 | AsoSoft.La2Ar("Gelî keç û xortên kurdan, hûn hemû bi xêr biçin"); 45 | >گەلی کەچ و خۆرتێن کوردان، هوون هەموو ب خێر بچن< 46 | ``` 47 | 48 | Arabic script into IPA: 49 | ```cs 50 | AsoSoft.Phonemes2IPA(AsoSoft.G2P("شەو و ڕۆژ بووین بە گرفت. درێژیی دیوارەکە گرتن")); 51 | >ʃa·wu ro̞ʒ bujn ba gɪ·ɾɪft. dɪ·ɾɛ·ʒij di·wä·ɾa·ka gɪɾ·tɪn< 52 | ``` 53 | ## Kurdish Text Normalizer 54 | Several functions needed for Central Kurdish text normalization: 55 | 56 | ### Normalize Kurdish 57 | Two character replacement lists are provided as the resources of the library: 58 | - Deep Unicode Corrections: 59 | - replacing deprecated Arabic Presentation Forms (FB50–FDFF and FE70–FEFF) with corresponding standard characters. 60 | - replacing different types of dashes and spaces 61 | - removing Unicode control character 62 | - Additional Unicode Corrections 63 | - replacing special Arabic math signs with corresponding Latin characters 64 | - replacing similar, but different letters with standard characters (e.g. 
ڪ,ے,ٶ with ک,ی,ؤ) 65 | 66 | The normalization task in this function: 67 | - for all Arabic scripts (including Kurdish, Arabic, and Persian): 68 | - Character-based replacement: 69 | - Above mentioned replacement lists 70 | - Private Use Area (U+E000 to U+F8FF) with White Square character 71 | - Standardizing and removing duplicated or unnecessary Zero-Width characters 72 | - removing unnecessary Tatweels (U+0640) 73 | - only for Central Kurdish: 74 | - standardizing Kurdish characters: ە, هـ, ی, and ک 75 | - correcting miss-converted characters from non-Unicode fonts 76 | - replacing word-initial ر with ڕ 77 | 78 | the simple overloading: 79 | ```cs 80 | AsoSoft.Normalize("دەقے شیَعري خـــۆش. ره‌نگه‌كاني خاك"); 81 | >دەقی شێعری خۆش. ڕەنگەکانی خاک< 82 | ``` 83 | 84 | or the complete overloading: 85 | ```cs 86 | AsoSoft.Normalize(string text, 87 | bool isOnlyKurdish, 88 | bool changeInitialR, 89 | bool deepUnicodeCorrectios, 90 | bool additionalUnicodeCorrections, 91 | Dictionary usersReplaceList); 92 | ``` 93 | 94 | ### AliK to Unicode 95 | `AliK2Unicode` converts Kurdish text written in AliK fonts (developed by Abas Majid in 1997) into Unicode standard. Ali-K fonts: *Alwand, Azzam, Hasan, Jiddah, kanaqen, Khalid, Sahifa, Sahifa Bold, Samik, Sayid, Sharif, Shrif Bold, Sulaimania, Traditional* 96 | ```cs 97 | AsoSoft.AliK2Unicode("ئاشناكردنى خويَندكار بة طوَرِانكاريية كوَمةلاَيةتييةكان"); 98 | >ئاشناکردنی خوێندکار بە گۆڕانکارییە کۆمەڵایەتییەکان< 99 | ``` 100 | 101 | ### AliWeb to Unicode 102 | `AliWeb2Unicode` converts Kurdish text written in AliK fonts into Unicode standard. 
Ali-Web fonts: *Malper, Malper Bold, Samik, Traditional, Traditional Bold* 103 | ```cs 104 | AsoSoft.AliWeb2Unicode("هةر جةرةيانصکي مصذووُيي کة أوو دةدا"); 105 | >ھەر جەرەیانێکی مێژوویی کە ڕوو دەدا< 106 | ``` 107 | 108 | ### Dylan to Unicode 109 | `Dylan2Unicode` converts Kurdish text written in Dylan fonts (developed by Dylan Saleh at [KurdSoft]( https://web.archive.org/web/20020528231610/http://www.kurdsoft.com/) in 2001) into Unicode standard. 110 | ```cs 111 | AsoSoft.Dylan2Unicode("لثكؤلثنةران بؤيان دةركةوتووة كة دةتوانث بؤ لةش بةكةصك بث"); 112 | >لێکۆلێنەران بۆیان دەرکەوتووە کە دەتوانێ بۆ لەش بەکەڵک بێ< 113 | ``` 114 | ### Zarnegar to Unicode 115 | `Zarnegar2Unicode` converts Kurdish text written in Zarnegar word processor (developed by [SinaSoft](http://www.sinasoft.com/fa/zarnegar.html) with RDF converter by [NoorSoft](https://www.noorsoft.org/fa/software/view/6561)) and into Unicode standard. 116 | ```cs 117 | AsoSoft.Zarnegar2Unicode("بلٌيٌين و بگه‏رٍيٌين بوٌ هه‏لاٌلٌه‏ى سىٌيه‏مى فه‏لسه‏فه‏"); 118 | >بڵێین و بگەڕێین بۆ هەڵاڵەی سێیەمی فەلسەفە< 119 | ``` 120 | ### NormalizePunctuations 121 | `NormalizePunctuations` corrects spaces before and after of the punctuations. When `seprateAllPunctuations` is true, 122 | ```cs 123 | AsoSoft.NormalizePunctuations("دەقی«کوردی » و ڕێنووس ،((خاڵبەندی )) چۆنە ؟", false); 124 | >دەقی «کوردی» و ڕێنووس، «خاڵبەندی» چۆنە؟< 125 | ``` 126 | ### Trim Line 127 | Trim starting and ending white spaces (including zero width spaces) of line, 128 | `TrimLine` 129 | ```cs 130 | AsoSoft.TrimLine(" دەق\u200c "); 131 | >دەق< 132 | ``` 133 | 134 | ### Replace Html Entities 135 | `ReplaceHtmlEntity` replaces HTML Entities with single Unicode characters (e.g. "é" with "é"). It is useful in web crawled corpora. 
136 | ```cs 137 | AsoSoft.ReplaceHtmlEntity("ئێوە &quot;دەق&quot; لە زمانی &lt;کوردی&gt; دەنووسن"); 138 | >ئێوە "دەق" بە زمانی <کوردی> دەنووسن< 139 | ``` 140 | ### Replace URLs and emails 141 | `ReplaceUrlEmail` replaces URLs and email addresses with the fixed placeholder words `URL` and `EmailAddress`. It improves language models. 142 | 143 | ### Unify Numerals 144 | `UnifyNumerals` converts all numeral characters to the desired numeral type, either `en` (0123456789) or `ar` (٠١٢٣٤٥٦٧٨٩). 145 | ```cs 146 | AsoSoft.UnifyNumerals("ژمارەکانی ٤٥٦ و ۴۵۶ و 456", "en"); 147 | >ژمارەکانی 456 و 456 و 456< 148 | ``` 149 | 150 | ### Separate Digits from words 151 | `SeperateDigits` adds a space between joined numerals and words (e.g. replacing "12کەس" with "12 کەس"). It improves language models. 152 | ```cs 153 | AsoSoft.SeperateDigits("لە ساڵی1950دا1000دۆلاریان بە 5کەس دا"); 154 | >لە ساڵی 1950 دا 1000 دۆلاریان بە 5 کەس دا< 155 | ``` 156 | 157 | ### Word to Word Replacement 158 | `Word2WordReplacement` applies a "string to string" replacement dictionary to the text. It replaces only whole-word matches, never a part of a longer word. 159 | ```cs 160 | var dict = new Dictionary<string, string>() { { "مال", "ماڵ" } }; 161 | AsoSoft.Word2WordReplacement("مال، نووری مالیکی", dict); 162 | >ماڵ، نووری مالیکی< 163 | ``` 164 | 165 | ### Character to Character Replacement 166 | `Char2CharReplacment` applies a "char to char" replacement dictionary to the text. It is used as the final step needed for some non-Unicode systems. 167 | 168 | ## Kurdish Numeral converter 169 | It converts numerals into Central Kurdish words. It is useful in text-to-speech tools. 
170 | - integers (1100 => ) 171 | - floats (10.11) 172 | - negatives (-10.11) 173 | - percent (100% or %100) 174 | - currency marks ($100, £100, and €100) 175 | 176 | ```cs 177 | AsoSoft.Number2Word("لە ساڵی 1999دا بڕی 40% لە پارەکەیان واتە $102.1 یان وەرگرت"); 178 | >لە ساڵی هەزار و نۆسەد و نەوەد و نۆدا بڕی چل لە سەد لە پارەکەیان واتە سەد و دوو پۆینت یەک دۆلاریان وەرگرت< 179 | ``` 180 | 181 | ## Kurdish Sort 182 | Sorts a string list in the correct order of the Kurdish alphabet ("ئءاآأإبپتثجچحخدڎذرڕزژسشصضطظعغفڤقكکگلڵمنوۆۊۉهھەیێ") 183 | ```cs 184 | var myList = new List<string>{"یەک", "ڕەنگ", "ئەو", "ئاو", "ڤەژین", "فڵان"}; 185 | AsoSoft.KurdishSort(myList); 186 | >"ئاو", "ئەو", "ڕەنگ", "فڵان", "ڤەژین", "یەک"< 187 | ``` 188 | or using your custom order: 189 | ```cs 190 | AsoSoft.CustomSort(List<string> inputList, List<char> inputOrder); 191 | ``` 192 | ## Poem Meter Classifier 193 | It classifies the meter of the input Kurdish poem typed in Arabic script. The lines of the poem should be separated by a newline character ('\n'). 194 | You can find Kurdish poems at https://books.vejin.net/. 195 | ```cs 196 | var poem = AsoSoft.PoemNormalization(@"گەرچی تووشی ڕەنجەڕۆیی و حەسرەت و دەردم ئەمن 197 | قەت لەدەس ئەم چەرخە سپڵە نابەزم مەردم ئەمن 198 | ئاشقی چاوی کەژاڵ و گەردنی پڕ خاڵ نیم 199 | ئاشقی کێو و تەلان و بەندەن و بەردم ئەمن"); 200 | var syllabified = AsoSoft.G2P(poem, true, true, true).Split('\n'); 201 | var classified = AsoSoft.PoemClassification(syllabified); 202 | var poemType = classified.overalMeterType; 203 | var poemMeter = classified.overalPattern; 204 | ``` 205 | 206 | ## How to use? 207 | Install the [AsoSoft Library package](https://www.nuget.org/packages/AsoSoftLibrary) via the NuGet Gallery. 208 | Then add `using AsoSoftLibrary;` to the using directives of your code. 209 | 210 | ## Development 211 | AsoSoft Library is developed and maintained by Aso Mahmudi. 212 | AsoSoft Library is written in C# (.NET 6). 
213 | -------------------------------------------------------------------------------- /Sort.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Text; 4 | 5 | namespace AsoSoftLibrary 6 | { 7 | public static partial class AsoSoft 8 | { 9 | /// Sorting a string list in correct order of Kurdish alphabet. 10 | public static List KurdishSort(List inputList) 11 | { 12 | var ku = new List(); 13 | ku.AddRange("ئءاآأإبپتثجچحخدڎڊذرڕزژسشصضطظعغفڤقكکگڴلڵمنوۆۊۉۋهھەیێ"); 14 | return CustomSort(inputList, ku); 15 | } 16 | /// Sorting a string list in custom order. 17 | public static List CustomSort(List inputList, List inputOrder) 18 | { 19 | var baseChar = 62000;// 9472; 20 | var order = new List(); 21 | for (int i = 0; i < inputOrder.Count; i++) 22 | order.Add((char)(baseChar + i)); 23 | for (int i = 0; i < inputList.Count; i++) 24 | for (int j = 0; j < order.Count; j++) 25 | inputList[i] = inputList[i].Replace(inputOrder[j], order[j]); 26 | inputList.Sort(); 27 | for (int i = 0; i < inputList.Count; i++) 28 | for (int j = 0; j < order.Count; j++) 29 | inputList[i] = inputList[i].Replace(order[j], inputOrder[j]); 30 | return inputList; 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /Transliteration.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Text.RegularExpressions; 6 | using System.Threading.Tasks; 7 | 8 | namespace AsoSoftLibrary 9 | { 10 | public static partial class AsoSoft 11 | { 12 | 13 | private static readonly string latinLetters = "a-zêîûçşéúıŕřĺɫƚḧẍḍṿʔ"; 14 | 15 | private static readonly Dictionary> TransliterationReplaces = new Dictionary> 16 | { 17 | {"LaDi2Ar", new List() { 18 | "gh", "ẍ", 19 | "hh", "ḧ", 20 | "ll", "ɫ", 21 | "rr", "ř" 22 | }}, 
{"La2Ar", new List<string>() {
                "\u201C", "«",
                "\u201D", "»",
                $"([0-9])([\'’-])([aeiouêîûéú])", "$1$3", // (e.g. 1990'an 5'ê)
                "ʔ", "", // glottal stop
                $"(^|[^{latinLetters}0-9\"’])([aeiouêîûéú])", "$1ئ$2", //insert initial hamza
                "([aeouêîûéú])([aeiouêîûéú])", "$1ئ$2", //insert hamza between adjacent vowels
                $"(ئ)([uû])([^{latinLetters}0-9])", "و$3", //omit the inserted hamza for "û" (=and)
                "a", "ا",
                "b", "ب",
                "ç", "چ",
                "c", "ج",
                "d", "د",
                "ḍ", "ڎ", // a Horami consonant
                "ê|é", "ێ",
                "e", "ە",
                "f", "ف",
                "g", "گ",
                "h", "ه",
                "ḧ", "ح",
                "i|ı", "",
                "î|y|í", "ی",
                "j", "ژ",
                "k", "ک",
                "l", "ل",
                "ɫ|ł|ƚ|Ɨ|ĺ", "ڵ",
                "m", "م",
                "n", "ن",
                "ŋ", "نگ",
                "o", "ۆ",
                "ö", "وێ",
                "p", "پ",
                "q", "ق",
                "r", "ر",
                "ř|ŕ", "ڕ",
                "s", "س",
                "ş|š|ș|s̩", "ش",
                "ṣ", "ص",
                "t", "ت",
                "ṭ", "ط",
                "û|ú", "وو",
                "u|w", "و",
                "ü", "ۊ",
                "v", "ڤ",
                "x", "خ",
                "ẍ", "غ",
                "z", "ز",
                "ه" + "($|[^ابپتجچحخدرڕزژسشصعغفڤقکگلڵمنوۆهەیێ])", "هـ" + "$1", // word-final h
                "\"|’", "ئ", // need checking, not sure "ع" or "ئ"
                "\\u003F", "؟", //question mark
                ",", "،", //comma
                ";", "؛" //semicolon
            }}
        };

        /// Transliterating the Latin script into Arabic script of Kurdish (e.g. çak→چاک)
        /// The input is lower-cased with the invariant culture, so that "I" always becomes "i"
        /// (a Turkish/Azeri thread culture would yield "ı", which the vowel patterns above do not
        /// contain), and is then passed with the "La2Ar" list to replaceByList.
        /// NOTE(review): replaceByList is defined outside this section; presumably it applies each
        /// pattern/replacement pair in list order — confirm.
        public static string La2Ar(string text)
        {
            text = replaceByList(text.ToLowerInvariant(), TransliterationReplaces["La2Ar"]);
            return text;
        }

        /// Transliterating the Latin script with digraphs into Arabic script of Kurdish (e.g. chall→چاڵ)
        /// Same as La2Ar, except that the "LaDi2Ar" digraph list is applied first.
        public static string LaDigraph2Ar(string text)
        {
            text = text.ToLowerInvariant();
            text = replaceByList(text, TransliterationReplaces["LaDi2Ar"]);
            text = replaceByList(text, TransliterationReplaces["La2Ar"]);
            return text;
        }

        /// Transliterating the Arabic script of Kurdish into Latin script, i.e. the reverse of La2Ar (which maps e.g.
/// çak→چاک)
        /// Direction of this method: Arabic script → Latin (Hawar) script. It runs G2P on the
        /// text (with backMergeConjunction disabled) and renders the result with Phonemes2Hawar.
        public static string Ar2La(string text)
        {
            return Phonemes2Hawar(G2P(text, backMergeConjunction:false));
        }

        /// Transliterating the Arabic script of Kurdish into simplified Latin script:
        /// same pipeline as Ar2La, after which ḧ, ř, ł and ẍ are replaced by plain h, r, l and x.
        public static string Ar2LaSimple(string text)
        {
            text = Phonemes2Hawar(G2P(text, backMergeConjunction: false));
            text = text.Replace("ḧ", "h");
            text = text.Replace("ř", "r");
            text = text.Replace("ł", "l");
            text = text.Replace("ẍ", "x");
            return text;
        }

        /// Converts the output of the G2P into IPA (e.g. ˈdeˈçê→da·t͡ʃɛ)
        /// A stress mark at the start of a word is dropped; every other one becomes a middle dot.
        /// The phoneme table is read from the Phoneme2IPA resource: its first row is the header,
        /// and each following "phoneme,IPA" row is applied in file order as a regex replacement.
        public static string Phonemes2IPA(string text)
        {
            text = Regex.Replace(text, "(?<=(^|\\W))ˈ", "");
            text = Regex.Replace(text, "ˈ", "·"); //middle dot
            // Split on both CR and LF and drop empty rows: a CRLF resource must not leak '\r'
            // into the replacements, and a trailing newline must not produce a row without columns.
            var Phoneme2IPA = resFiles.Phoneme2IPA.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
            for (int i = 1; i < Phoneme2IPA.Length; i++)
            {
                var item = Phoneme2IPA[i].Split(',');
                if (item.Length < 2 || item[0].Length == 0)
                    continue; // malformed row: nothing to replace
                text = Regex.Replace(text, item[0], item[1]);
            }
            return text;
        }

        /// Converts the output of the G2P into Hawar (e.g. ˈʔeˈłêm→ełêm)
        /// Stress marks are removed, a glottal stop at the start of a word is dropped, and any
        /// remaining ʔ or ƹ is written as ’.
        public static string Phonemes2Hawar(string text)
        {
            text = text.Replace("ˈ", "");
            text = Regex.Replace(text, "(?<=(^|\\W))ʔ", "");
            text = Regex.Replace(text, "[ʔƹ]", "’");
            return text;
        }

        /// Converts the output of the G2P into Jira's ASCII format (e.g.
/// ˈdeˈçim→D▪A▪CH▪M▪)
        /// The short vowel "i" and the stress mark are removed first. The phoneme table is read
        /// from the Phoneme2Ascii resource: its first row is the header, and each following
        /// "phoneme,ASCII" row is applied in file order as a regex replacement. Every converted
        /// phoneme, including the last one, is followed by the separator ▪.
        public static string Phonemes2ASCII(string text)
        {
            text = Regex.Replace(text, @"[iˈ]", "");
            // Split on both CR and LF and drop empty rows: a CRLF resource must not leak '\r'
            // into the output, and a trailing newline must not produce a row without columns.
            var Phoneme2Ascii = resFiles.Phoneme2Ascii.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
            for (int i = 1; i < Phoneme2Ascii.Length; i++)
            {
                var item = Phoneme2Ascii[i].Split(',');
                if (item.Length < 2 || item[0].Length == 0)
                    continue; // malformed row: nothing to replace
                text = Regex.Replace(text, item[0], item[1] + "▪");
            }
            return text;
        }
    }
}
--------------------------------------------------------------------------------
/resFiles.Designer.cs:
--------------------------------------------------------------------------------
//------------------------------------------------------------------------------
//
// This code was generated by a tool.
// Runtime Version:4.0.30319.42000
//
// Changes to this file may cause incorrect behavior and will be lost if
// the code is regenerated.
//
//------------------------------------------------------------------------------

namespace AsoSoftLibrary {
    using System;


    ///
    /// A strongly-typed resource class, for looking up localized strings, etc.
    ///
    // This class was auto-generated by the StronglyTypedResourceBuilder
    // class via a tool like ResGen or Visual Studio.
    // To add or remove a member, edit your .ResX file then rerun ResGen
    // with the /str option, or rebuild your VS project.
22 | [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "17.0.0.0")] 23 | [global::System.Diagnostics.DebuggerNonUserCodeAttribute()] 24 | [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()] 25 | public class resFiles { 26 | 27 | private static global::System.Resources.ResourceManager resourceMan; 28 | 29 | private static global::System.Globalization.CultureInfo resourceCulture; 30 | 31 | [global::System.Diagnostics.CodeAnalysis.SuppressMessageAttribute("Microsoft.Performance", "CA1811:AvoidUncalledPrivateCode")] 32 | internal resFiles() { 33 | } 34 | 35 | /// 36 | /// Returns the cached ResourceManager instance used by this class. 37 | /// 38 | [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)] 39 | public static global::System.Resources.ResourceManager ResourceManager { 40 | get { 41 | if (object.ReferenceEquals(resourceMan, null)) { 42 | global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("AsoSoftLibrary.resFiles", typeof(resFiles).Assembly); 43 | resourceMan = temp; 44 | } 45 | return resourceMan; 46 | } 47 | } 48 | 49 | /// 50 | /// Overrides the current thread's CurrentUICulture property for all 51 | /// resource lookups using this strongly typed resource class. 
52 | /// 53 | [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)] 54 | public static global::System.Globalization.CultureInfo Culture { 55 | get { 56 | return resourceCulture; 57 | } 58 | set { 59 | resourceCulture = value; 60 | } 61 | } 62 | 63 | /// 64 | /// Looks up a localized string similar to G,P,Desc 65 | ///ڴ,ĝ,Garusi Consonant 66 | ///ڎ,đ,Hewrami Consonant 67 | ///ۉ,ŵ,Hewrami Consonant 68 | ///ݵ,ė,Hewrami Vowel 69 | ///ݸ,ȯ,Hewrami Vowel 70 | ///ۊ,ẅ,Southern Vowel 71 | ///ئ,ʔ, 72 | ///ب,b, 73 | ///پ,p, 74 | ///ت,t, 75 | ///ج,c, 76 | ///چ,ç, 77 | ///ح,ḧ, 78 | ///خ,x, 79 | ///د,d, 80 | ///ر,r, 81 | ///ڕ,ř, 82 | ///ز,z, 83 | ///ژ,j, 84 | ///س,s, 85 | ///ش,ş, 86 | ///ع,ƹ, 87 | ///غ,ẍ, 88 | ///ف,f, 89 | ///ڤ,v, 90 | ///ق,q, 91 | ///ک,k, 92 | ///گ,g, 93 | ///ل,l, 94 | ///ڵ,ł, 95 | ///م,m, 96 | ///ن,n, 97 | ///ه,h, 98 | ///ا,a, 99 | ///ۆ,o, 100 | ///ە,e, 101 | ///ێ,ê, 102 | ///^ی,y, 103 | ///^و,w, 104 | ///(?<=[aeêo])و,w,after vowel 105 | ///و(?=[aeêo]),w,before vowel 106 | ///(?<=[aeêo])ی,y,after vowel 107 | ///ی(?=[aeêo]),y,before vowel 108 | ///^([bçdjl])$,$1i,چ=>çi bcçdfghḧjklłmnpqrřsştvwxẍyzʔƹ. 109 | /// 110 | public static string G2PCertain { 111 | get { 112 | return ResourceManager.GetString("G2PCertain", resourceCulture); 113 | } 114 | } 115 | 116 | /// 117 | /// Looks up a localized string similar to Graphemes,Phonems 118 | ///حەییی,ḧeyyî 119 | ///تەییی,teyyî 120 | ///ئاگر,ʔagir 121 | ///قانع,qaniƹ 122 | ///سالم,salim 123 | ///عاشق,ƹaşiq. 
124 | /// 125 | public static string G2PExceptions { 126 | get { 127 | return ResourceManager.GetString("G2PExceptions", resourceCulture); 128 | } 129 | } 130 | 131 | /// 132 | /// Looks up a localized string similar to From,To,Desc 133 | ///00AC,200C,Wrong ZWNJ by MS Word 134 | ///066A,0025,Arabic PERCENT SIGN 135 | ///066B,002E,Arabic DECIMAL SEPARATOR 136 | ///066C,002C,Arabic THOUSANDS SEPARATOR 137 | ///066D,002A,Arabic FIVE POINTED STAR 138 | ///0751,062B,ݑ 139 | ///0752,067E,ݒ 140 | ///0750,067E,ݐ 141 | ///0753,062A,ݓ 142 | ///067F,062A,ٿ 143 | ///0679,062A,ٹ 144 | ///0758,0686,ݘ 145 | ///0689,062F,ډ 146 | ///068A,062F,ڊ 147 | ///068B,062F,ڋ 148 | ///068C,062F,ڌ 149 | ///068D,062F,ڍ 150 | ///068F,062F,ڏ 151 | ///0690,062F,ڐ 152 | ///0759,062F,ݙ 153 | ///075A,062F,ݚ 154 | ///076C,0695,ݬ 155 | ///0691,0695,ڑ 156 | ///0692,0695,ڒ 157 | ///0693,0695,ړ 158 | ///0694,0695,ڔ 159 | ///0696,0695,ږ 160 | ///0697,0698,ڗ 161 | ///0699,0698,ڙ 162 | ///076B,0698,ݫ 163 | ///069A,0633,ښ 164 | ///069B,0633,ڛ 165 | ///069C,0 [rest of string was truncated]";. 
166 | /// 167 | public static string NormalizerAdditional { 168 | get { 169 | return ResourceManager.GetString("NormalizerAdditional", resourceCulture); 170 | } 171 | } 172 | 173 | /// 174 | /// Looks up a localized string similar to From,To,Desc 175 | ///A78C,0027,Latin Small Letter Saltillo ꞌ 176 | ///FEFF,200C,ZERO WIDTH NO-BREAK SPACE 177 | ///200B,200C,ZERO WIDTH SPACE 178 | ///2010,002D,HYPHEN 179 | ///2011,002D,NON-BREAKING HYPHEN 180 | ///2012,002D,FIGURE DASH 181 | ///2013,002D,EN DASH 182 | ///2014,002D,EM DASH 183 | ///2015,002D,HORIZONTAL BAR 184 | ///2212,002D,Minus 185 | ///00AD,002D,Soft Hyphen 186 | ///FE58,002D,SMALL EM DASH 187 | ///FE63,002D,MALL HYPHEN-MINUS 188 | ///FF0D,002D,FULLWIDTH HYPHEN-MINUS 189 | ///1680,0020,OGHAM SPACE MARK 190 | ///2000,0020,EN QUAD 191 | ///2001,0020,EM QUAD 192 | ///2002,0020,EN SPACE 193 | ///2003,0020,EM SPACE 194 | ///2004,0020,THREE-PER-EM SPACE 195 | ///2005,0020,FOU [rest of string was truncated]";. 196 | /// 197 | public static string NormalizerDeep { 198 | get { 199 | return ResourceManager.GetString("NormalizerDeep", resourceCulture); 200 | } 201 | } 202 | 203 | /// 204 | /// Looks up a localized string similar to Phoneme,ASCII 205 | ///ʔ,EH 206 | ///a,AA 207 | ///b,B 208 | ///p,P 209 | ///t,T 210 | ///c,JE 211 | ///ç,CH 212 | ///ḧ,HE 213 | ///x,X 214 | ///d,D 215 | ///r,R 216 | ///ř,RR 217 | ///z,Z 218 | ///j,ZH 219 | ///s,S 220 | ///ş,SH 221 | ///ƹ,AH 222 | ///ẍ,XE 223 | ///f,F 224 | ///v,V 225 | ///q,Q 226 | ///k,K 227 | ///g,G 228 | ///l,L 229 | ///ł,LL 230 | ///m,M 231 | ///n,N 232 | ///o,O 233 | ///e,A 234 | ///h,H 235 | ///ê,E 236 | ///î,I 237 | ///y,Y 238 | ///w,W 239 | ///u,U 240 | ///û,UU. 
241 | /// 242 | public static string Phoneme2Ascii { 243 | get { 244 | return ResourceManager.GetString("Phoneme2Ascii", resourceCulture); 245 | } 246 | } 247 | 248 | /// 249 | /// Looks up a localized string similar to Phoneme,IPA 250 | ///ng,ŋg 251 | ///ʔ,ʔ 252 | ///b,b 253 | ///p,p 254 | ///t,t 255 | ///c,d͡ʒ 256 | ///ç,t͡ʃ 257 | ///ḧ,ħ 258 | ///x,x 259 | ///d,d 260 | ///r,ɾ 261 | ///ř,r 262 | ///z,z 263 | ///j,ʒ 264 | ///s,s 265 | ///ş,ʃ 266 | ///ƹ,ʕ 267 | ///ẍ,ɣ 268 | ///f,f 269 | ///v,v 270 | ///q,q 271 | ///k,k 272 | ///g,g 273 | ///l,l 274 | ///ł,ɫ 275 | ///m,m 276 | ///n,n 277 | ///w,w 278 | ///u,ʊ 279 | ///û,u 280 | ///o,o̞ 281 | ///h,h 282 | ///y,j 283 | ///a,ä 284 | ///e,a 285 | ///ê,ɛ 286 | ///i,ɪ 287 | ///î,i 288 | ///ĝ,ŋ 289 | ///đ,đ 290 | ///ü,y 291 | ///ô,ô 292 | ///õ,õ. 293 | /// 294 | public static string Phoneme2IPA { 295 | get { 296 | return ResourceManager.GetString("Phoneme2IPA", resourceCulture); 297 | } 298 | } 299 | 300 | /// 301 | /// Looks up a localized string similar to Frequency,WeightPattern,Title 302 | ///1044,–∪–––∪–––∪–––∪–,فاعلاتن فاعلاتن فاعلاتن فاعلن 303 | ///999,∪–––∪–––∪–––∪–––,مفاعیلن مفاعیلن مفاعیلن مفاعیلن 304 | ///386,∪–––∪–––∪––,مفاعیلن مفاعیلن فعولن 305 | ///334,––∪∪––∪∪––∪∪––,مفعولُ مفاعیلُ مفاعیلُ فعولن 306 | ///272,––∪∪––∪∪––∪∪–,مفعولُ مفاعیلُ مفاعیلُ فعل 307 | ///213,––∪–∪–∪∪––∪–∪–,مفعولُ فاعلاتُ مفاعیلُ فاعلن 308 | ///138,∪∪––∪∪––∪∪––∪∪–,فعلاتن فعلاتن فعلاتن فعلن 309 | ///131,––∪∪–∪–∪––,مفعولُ مفاعلن فعولن 310 | ///62,–∪–––∪–––∪–,فاعلاتن فاعلاتن فاعلن 311 | ///45,∪∪––∪–∪–∪∪–,فعلاتن مفاعلن فعلن 312 | ///40,∪–∪–∪∪––∪–∪–∪∪–,مفاعلن فعلاتن مفاعلن فعلن 313 | ///31 [rest of string was truncated]";. 
314 | /// 315 | public static string PoemPatterns { 316 | get { 317 | return ResourceManager.GetString("PoemPatterns", resourceCulture); 318 | } 319 | } 320 | } 321 | } 322 | -------------------------------------------------------------------------------- /resFiles.resx: -------------------------------------------------------------------------------- 1 |  2 | 3 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | text/microsoft-resx 110 | 111 | 112 | 2.0 113 | 114 | 115 | System.Resources.ResXResourceReader, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 116 | 117 | 118 | System.Resources.ResXResourceWriter, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 119 | 120 | 121 | 122 | resources\G2PCertain.csv;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8 123 | 124 | 125 | resources\G2PExceptions.csv;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8 126 | 127 | 128 | resources\NormalizeUnicodeAdditional.csv;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8 129 | 130 | 131 | resources\NormalizeUnicodeDeep.csv;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8 132 | 133 | 134 | resources\Phoneme2Ascii.csv;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8 135 | 136 | 137 | resources\Phoneme2IPA.csv;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8 138 | 139 | 140 | resources\PoemPatterns.csv;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8 141 | 142 | 
-------------------------------------------------------------------------------- /resources/G2PCertain.csv: -------------------------------------------------------------------------------- 1 | G,P,Desc 2 | ڴ,ĝ,Garusi Consonant 3 | ڎ,đ,Hewrami Consonant 4 | ۉ,ŵ,Hewrami Consonant 5 | ݵ,ė,Hewrami Vowel 6 | ݸ,ȯ,Hewrami Vowel 7 | ۊ,ẅ,Southern Vowel 8 | ئ,ʔ, 9 | ب,b, 10 | پ,p, 11 | ت,t, 12 | ج,c, 13 | چ,ç, 14 | ح,ḧ, 15 | خ,x, 16 | د,d, 17 | ر,r, 18 | ڕ,ř, 19 | ز,z, 20 | ژ,j, 21 | س,s, 22 | ش,ş, 23 | ع,ƹ, 24 | غ,ẍ, 25 | ف,f, 26 | ڤ,v, 27 | ق,q, 28 | ک,k, 29 | گ,g, 30 | ل,l, 31 | ڵ,ł, 32 | م,m, 33 | ن,n, 34 | ه,h, 35 | ا,a, 36 | ۆ,o, 37 | ە,e, 38 | ێ,ê, 39 | ^ی,y, 40 | ^و,w, 41 | (?<=[aeêo])و,w,after vowel 42 | و(?=[aeêo]),w,before vowel 43 | (?<=[aeêo])ی,y,after vowel 44 | ی(?=[aeêo]),y,before vowel 45 | ^([bçdjl])$,$1i,چ=>çi bcçdfghḧjklłmnpqrřsştvwxẍyzʔƹ -------------------------------------------------------------------------------- /resources/G2PExceptions.csv: -------------------------------------------------------------------------------- 1 | Graphemes,Phonems 2 | حەییی,ḧeyyî 3 | تەییی,teyyî 4 | ئاگر,ʔagir 5 | قانع,qaniƹ 6 | سالم,salim 7 | عاشق,ƹaşiq -------------------------------------------------------------------------------- /resources/NormalizeUnicodeAdditional.csv: -------------------------------------------------------------------------------- 1 | From,To,Desc 2 | 00AC,200C,Wrong ZWNJ by MS Word 3 | 066A,0025,Arabic PERCENT SIGN 4 | 066B,002E,Arabic DECIMAL SEPARATOR 5 | 066C,002C,Arabic THOUSANDS SEPARATOR 6 | 066D,002A,Arabic FIVE POINTED STAR 7 | 0751,062B,ݑ 8 | 0752,067E,ݒ 9 | 0750,067E,ݐ 10 | 0753,062A,ݓ 11 | 067F,062A,ٿ 12 | 0679,062A,ٹ 13 | 0758,0686,ݘ 14 | 0689,062F,ډ 15 | 068A,062F,ڊ 16 | 068B,062F,ڋ 17 | 068C,062F,ڌ 18 | 068D,062F,ڍ 19 | 068F,062F,ڏ 20 | 0690,062F,ڐ 21 | 0759,062F,ݙ 22 | 075A,062F,ݚ 23 | 076C,0695,ݬ 24 | 0691,0695,ڑ 25 | 0692,0695,ڒ 26 | 0693,0695,ړ 27 | 0694,0695,ڔ 28 | 0696,0695,ږ 29 | 0697,0698,ڗ 30 | 0699,0698,ڙ 31 | 076B,0698,ݫ 
32 | 069A,0633,ښ 33 | 069B,0633,ڛ 34 | 069C,0634,ڜ 35 | 06FA,0634,ۺ 36 | 069D,0635,ڝ 37 | 069E,0636,ڞ 38 | 06FB,0636,ۻ 39 | 069F,0638,ڟ 40 | 06A0,063A,ڠ 41 | 06FC,063A,ۼ 42 | 06A1,0641,ڡ 43 | 06A2,0641,ڢ 44 | 06A3,0641,ڣ 45 | 06A5,06A4,ڥ 46 | 06A4,06A4,ڤ 47 | 06A7,0642,ڧ 48 | 06A8,0642,ڨ 49 | 06A8,0642,ڨ 50 | 06AA,06A9,ڪ 51 | 06AB,06A9,ګ 52 | 06AC,06A9,ڬ 53 | 06AD,06A9,ڭ 54 | 06AE,06A9,ڮ 55 | 063B,06A9,ػ 56 | 063C,06A9,ؼ 57 | 06B0,06AF,ڰ 58 | 06B1,06AF,ڱ 59 | 06B2,06AF,ڲ 60 | 06B3,06AF,ڳ 61 | 06B4,06AF,ڴ 62 | 06D2,06CC,ے 63 | 06CD,06CC,ۍ 64 | 06B6,06B5,ڶ 65 | 06B7,06B5,ڷ 66 | 06B8,06B5,ڸ 67 | 076A,06B5,ݪ 68 | 0765,0645,ݥ 69 | 0766,0645,ݦ 70 | 06B9,0646,ڹ 71 | 06BA,0646,ں 72 | 06BB,0646,ڻ 73 | 06BC,0646,ڼ 74 | 06BD,0646,ڽ 75 | 0767,0646,ݧ 76 | 0768,0646,ݨ 77 | 0769,0646,ݩ 78 | 06C4,06C6,ۄ 79 | 06C5,06C6,ۅ 80 | 06C8,06C6,ۈ 81 | 06C9,06C6,ۉ 82 | 06CB,06C6,ۋ 83 | 0676,06C6,ٶ 84 | 06C9,06C6,ۉ 85 | 06C7,0648 0648,ۇ -------------------------------------------------------------------------------- /resources/NormalizeUnicodeDeep.csv: -------------------------------------------------------------------------------- 1 | From,To,Desc 2 | A78C,0027,Latin Small Letter Saltillo ꞌ 3 | FEFF,200C,ZERO WIDTH NO-BREAK SPACE 4 | 200B,200C,ZERO WIDTH SPACE 5 | 2010,002D,HYPHEN 6 | 2011,002D,NON-BREAKING HYPHEN 7 | 2012,002D,FIGURE DASH 8 | 2013,002D,EN DASH 9 | 2014,002D,EM DASH 10 | 2015,002D,HORIZONTAL BAR 11 | 2212,002D,Minus 12 | 00AD,002D,Soft Hyphen 13 | FE58,002D,SMALL EM DASH 14 | FE63,002D,MALL HYPHEN-MINUS 15 | FF0D,002D,FULLWIDTH HYPHEN-MINUS 16 | 1680,0020,OGHAM SPACE MARK 17 | 2000,0020,EN QUAD 18 | 2001,0020,EM QUAD 19 | 2002,0020,EN SPACE 20 | 2003,0020,EM SPACE 21 | 2004,0020,THREE-PER-EM SPACE 22 | 2005,0020,FOUR-PER-EM SPACE 23 | 2006,0020,SIX-PER-EM SPACE 24 | 205F,0020,MEDIUM MATHEMATICAL SPACE 25 | 3000,0020,IDEOGRAPHIC SPACE 26 | 2007,0020,FIGURE SPACE 27 | 2008,0020,PUNCTUATION SPACE 28 | 2009,0020,THIN SPACE 29 | 200A,0020,HAIR SPACE 30 | 00A0,0020,NO-BREAK SPACE 
31 | 202F,0020,NARROW NO-BREAK SPACE 32 | 200E,0020,LEFT-TO-RIGHT MARK 33 | 200F,0020,RIGHT-TO-LEFT MARK 34 | 202A,0020,LEFT-TO-RIGHT EMBEDDING 35 | 202B,0020,RIGHT-TO-LEFT EMBEDDING 36 | 202C,0020,POP DIRECTIONAL FORMATTING 37 | 202D,0020,LEFT-TO-RIGHT OVERRIDE 38 | 202E,0020,RIGHT-TO-LEFT OVERRIDE 39 | 0000,0020,Control 40 | 0001,0020, 41 | 0002,0020, 42 | 0003,0020, 43 | 0004,0020, 44 | 0005,0020, 45 | 0006,0020, 46 | 0007,0020, 47 | 0008,0020, 48 | 000B,0020, 49 | 000C,0020, 50 | 000E,0020, 51 | 000F,0020, 52 | 0010,0020, 53 | 0011,0020, 54 | 0012,0020, 55 | 0013,0020, 56 | 0014,0020, 57 | 0015,0020, 58 | 0016,0020, 59 | 0017,0020, 60 | 0018,0020, 61 | 0019,0020, 62 | 001A,0020, 63 | 001B,0020, 64 | 001C,0020, 65 | 001D,0020, 66 | 001E,0020, 67 | 001F,0020, 68 | 007F,0020, 69 | 0080,0020, 70 | 0081,0020, 71 | 0082,0020, 72 | 0083,0020, 73 | 0084,0020, 74 | 0085,0020, 75 | 0086,0020, 76 | 0087,0020, 77 | 0088,0020, 78 | 0089,0020, 79 | 008A,0020, 80 | 008B,0020, 81 | 008C,0020, 82 | 008D,0020, 83 | 008E,0020, 84 | 008F,0020, 85 | 0090,0020, 86 | 0091,0020, 87 | 0092,0020, 88 | 0093,0020, 89 | 0094,0020, 90 | 0095,0020, 91 | 0096,0020, 92 | 0097,0020, 93 | 0098,0020, 94 | 0099,0020, 95 | 009A,0020, 96 | 009B,0020, 97 | 009C,0020, 98 | 009D,0020, 99 | 009E,0020, 100 | 009F,0020, 101 | 0610,,Arabic Nonspacing Marks 102 | 0611,, 103 | 0612,, 104 | 0613,, 105 | 0614,, 106 | 0615,, 107 | 0616,, 108 | 0617,, 109 | 0618,, 110 | 0619,, 111 | 061A,, 112 | 0653,, 113 | 0654,, 114 | 0655,, 115 | 0656,, 116 | 0657,, 117 | 0658,, 118 | 0659,, 119 | 065A,, 120 | 065B,, 121 | 065C,, 122 | 065D,, 123 | 065E,, 124 | 065F,, 125 | 0670,, 126 | 06D6,, 127 | 06D7,, 128 | 06D8,, 129 | 06D9,, 130 | 06DA,, 131 | 06DB,, 132 | 06DC,, 133 | 06DF,, 134 | 06E0,, 135 | 06E1,, 136 | 06E2,, 137 | 06E3,, 138 | 06E4,, 139 | 06E7,, 140 | 06E8,, 141 | 06EA,, 142 | 06EB,, 143 | 06EC,, 144 | 06ED,, 145 | FB50,0671,Arabic Presentation Forms 146 | FB51,0671, 147 | FB52,067B, 148 | FB53,067B, 149 | 
FB54,067B, 150 | FB55,067B, 151 | FB56,067E, 152 | FB57,067E, 153 | FB58,067E, 154 | FB59,067E, 155 | FB5A,0680, 156 | FB5B,0680, 157 | FB5C,0680, 158 | FB5D,0680, 159 | FB5E,067A, 160 | FB5F,067A, 161 | FB60,067A, 162 | FB61,067A, 163 | FB62,067F, 164 | FB63,067F, 165 | FB64,067F, 166 | FB65,067F, 167 | FB66,0679, 168 | FB67,0679, 169 | FB68,0679, 170 | FB69,0679, 171 | FB6A,06A4, 172 | FB6B,06A4, 173 | FB6C,06A4, 174 | FB6D,06A4, 175 | FB6E,06A6, 176 | FB6F,06A6, 177 | FB70,06A6, 178 | FB71,06A6, 179 | FB72,0684, 180 | FB73,0684, 181 | FB74,0684, 182 | FB75,0684, 183 | FB76,0683, 184 | FB77,0683, 185 | FB78,0683, 186 | FB79,0683, 187 | FB7A,0686, 188 | FB7B,0686, 189 | FB7C,0686, 190 | FB7D,0686, 191 | FB7E,0687, 192 | FB7F,0687, 193 | FB80,0687, 194 | FB81,0687, 195 | FB82,068D, 196 | FB83,068D, 197 | FB84,068C, 198 | FB85,068C, 199 | FB86,068E, 200 | FB87,068E, 201 | FB88,0688, 202 | FB89,0688, 203 | FB8A,0698, 204 | FB8B,0698, 205 | FB8C,0691, 206 | FB8D,0691, 207 | FB8E,06A9, 208 | FB8F,06A9, 209 | FB90,06A9, 210 | FB91,06A9, 211 | FB92,06AF, 212 | FB93,06AF, 213 | FB94,06AF, 214 | FB95,06AF, 215 | FB96,06B3, 216 | FB97,06B3, 217 | FB98,06B3, 218 | FB99,06B3, 219 | FB9A,06B1, 220 | FB9B,06B1, 221 | FB9C,06B1, 222 | FB9D,06B1, 223 | FB9E,06BA, 224 | FB9F,06BA, 225 | FBA0,06BB, 226 | FBA1,06BB, 227 | FBA2,06BB, 228 | FBA3,06BB, 229 | FBA4,06C0, 230 | FBA5,06C0, 231 | FBA6,06C1, 232 | FBA7,06C1, 233 | FBA8,06C1, 234 | FBA9,06C1, 235 | FBAA,06BE, 236 | FBAB,06BE, 237 | FBAC,06BE, 238 | FBAD,06BE, 239 | FBAE,06D2, 240 | FBAF,06D2, 241 | FBB0,06D3, 242 | FBB1,06D3, 243 | FBD3,06AD, 244 | FBD4,06AD, 245 | FBD5,06AD, 246 | FBD6,06AD, 247 | FBD7,06C7, 248 | FBD8,06C7, 249 | FBD9,06C6, 250 | FBDA,06C6, 251 | FBDB,06C8, 252 | FBDC,06C8, 253 | FBDD,0677, 254 | FBDE,06CB, 255 | FBDF,06CB, 256 | FBE0,06C5, 257 | FBE1,06C5, 258 | FBE2,06C9, 259 | FBE3,06C9, 260 | FBE4,06D0, 261 | FBE5,06D0, 262 | FBE6,06D0, 263 | FBE7,06D0, 264 | FBE8,0649, 265 | FBE9,0649, 266 | FBEA,0626 
0627, 267 | FBEB,0626 0627, 268 | FBEC,0626 06D5, 269 | FBED,0626 06D5, 270 | FBEE,0626 0648, 271 | FBEF,0626 0648, 272 | FBF0,0626 06C7, 273 | FBF1,0626 06C7, 274 | FBF2,0626 06C6, 275 | FBF3,0626 06C6, 276 | FBF4,0626 06C8, 277 | FBF5,0626 06C8, 278 | FBF6,0626 06D0, 279 | FBF7,0626 06D0, 280 | FBF8,0626 06D0, 281 | FBF9,0626 0649, 282 | FBFA,0626 0649, 283 | FBFB,0626 0649, 284 | FBFC,06CC, 285 | FBFD,06CC, 286 | FBFE,06CC, 287 | FBFF,06CC, 288 | FC00,0626 062C, 289 | FC01,0626 062D, 290 | FC02,0626 0645, 291 | FC03,0626 0649, 292 | FC04,0626 064A, 293 | FC05,0628 062C, 294 | FC06,0628 062D, 295 | FC07,0628 062E, 296 | FC08,0628 0645, 297 | FC09,0628 0649, 298 | FC0A,0628 064A, 299 | FC0B,062A 062C, 300 | FC0C,062A 062D, 301 | FC0D,062A 062E, 302 | FC0E,062A 0645, 303 | FC0F,062A 0649, 304 | FC10,062A 064A, 305 | FC11,062B 062C, 306 | FC12,062B 0645, 307 | FC13,062B 0649, 308 | FC14,062B 064A, 309 | FC15,062C 062D, 310 | FC16,062C 0645, 311 | FC17,062D 062C, 312 | FC18,062D 0645, 313 | FC19,062E 062C, 314 | FC1A,062E 062D, 315 | FC1B,062E 0645, 316 | FC1C,0633 062C, 317 | FC1D,0633 062D, 318 | FC1E,0633 062E, 319 | FC1F,0633 0645, 320 | FC20,0635 062D, 321 | FC21,0635 0645, 322 | FC22,0636 062C, 323 | FC23,0636 062D, 324 | FC24,0636 062E, 325 | FC25,0636 0645, 326 | FC26,0637 062D, 327 | FC27,0637 0645, 328 | FC28,0638 0645, 329 | FC29,0639 062C, 330 | FC2A,0639 0645, 331 | FC2B,063A 062C, 332 | FC2C,063A 0645, 333 | FC2D,0641 062C, 334 | FC2E,0641 062D, 335 | FC2F,0641 062E, 336 | FC30,0641 0645, 337 | FC31,0641 0649, 338 | FC32,0641 064A, 339 | FC33,0642 062D, 340 | FC34,0642 0645, 341 | FC35,0642 0649, 342 | FC36,0642 064A, 343 | FC37,0643 0627, 344 | FC38,0643 062C, 345 | FC39,0643 062D, 346 | FC3A,0643 062E, 347 | FC3B,0643 0644, 348 | FC3C,0643 0645, 349 | FC3D,0643 0649, 350 | FC3E,0643 064A, 351 | FC3F,0644 062C, 352 | FC40,0644 062D, 353 | FC41,0644 062E, 354 | FC42,0644 0645, 355 | FC43,0644 0649, 356 | FC44,0644 064A, 357 | FC45,0645 062C, 358 | 
FC46,0645 062D, 359 | FC47,0645 062E, 360 | FC48,0645 0645, 361 | FC49,0645 0649, 362 | FC4A,0645 064A, 363 | FC4B,0646 062C, 364 | FC4C,0646 062D, 365 | FC4D,0646 062E, 366 | FC4E,0646 0645, 367 | FC4F,0646 0649, 368 | FC50,0646 064A, 369 | FC51,0647 062C, 370 | FC52,0647 0645, 371 | FC53,0647 0649, 372 | FC54,0647 064A, 373 | FC55,064A 062C, 374 | FC56,064A 062D, 375 | FC57,064A 062E, 376 | FC58,064A 0645, 377 | FC59,064A 0649, 378 | FC5A,064A 064A, 379 | FC5B,0630 0670, 380 | FC5C,0631 0670, 381 | FC5D,0649 0670, 382 | FC5E,0020 064C 0651, 383 | FC5F,0020 064D 0651, 384 | FC60,0020 064E 0651, 385 | FC61,0020 064F 0651, 386 | FC62,0020 0650 0651, 387 | FC63,0020 0651 0670, 388 | FC64,0626 0631, 389 | FC65,0626 0632, 390 | FC66,0626 0645, 391 | FC67,0626 0646, 392 | FC68,0626 0649, 393 | FC69,0626 064A, 394 | FC6A,0628 0631, 395 | FC6B,0628 0632, 396 | FC6C,0628 0645, 397 | FC6D,0628 0646, 398 | FC6E,0628 0649, 399 | FC6F,0628 064A, 400 | FC70,062A 0631, 401 | FC71,062A 0632, 402 | FC72,062A 0645, 403 | FC73,062A 0646, 404 | FC74,062A 0649, 405 | FC75,062A 064A, 406 | FC76,062B 0631, 407 | FC77,062B 0632, 408 | FC78,062B 0645, 409 | FC79,062B 0646, 410 | FC7A,062B 0649, 411 | FC7B,062B 064A, 412 | FC7C,0641 0649, 413 | FC7D,0641 064A, 414 | FC7E,0642 0649, 415 | FC7F,0642 064A, 416 | FC80,0643 0627, 417 | FC81,0643 0644, 418 | FC82,0643 0645, 419 | FC83,0643 0649, 420 | FC84,0643 064A, 421 | FC85,0644 0645, 422 | FC86,0644 0649, 423 | FC87,0644 064A, 424 | FC88,0645 0627, 425 | FC89,0645 0645, 426 | FC8A,0646 0631, 427 | FC8B,0646 0632, 428 | FC8C,0646 0645, 429 | FC8D,0646 0646, 430 | FC8E,0646 0649, 431 | FC8F,0646 064A, 432 | FC90,0649 0670, 433 | FC91,064A 0631, 434 | FC92,064A 0632, 435 | FC93,064A 0645, 436 | FC94,064A 0646, 437 | FC95,064A 0649, 438 | FC96,064A 064A, 439 | FC97,0626 062C, 440 | FC98,0626 062D, 441 | FC99,0626 062E, 442 | FC9A,0626 0645, 443 | FC9B,0626 0647, 444 | FC9C,0628 062C, 445 | FC9D,0628 062D, 446 | FC9E,0628 062E, 447 | FC9F,0628 
0645, 448 | FCA0,0628 0647, 449 | FCA1,062A 062C, 450 | FCA2,062A 062D, 451 | FCA3,062A 062E, 452 | FCA4,062A 0645, 453 | FCA5,062A 0647, 454 | FCA6,062B 0645, 455 | FCA7,062C 062D, 456 | FCA8,062C 0645, 457 | FCA9,062D 062C, 458 | FCAA,062D 0645, 459 | FCAB,062E 062C, 460 | FCAC,062E 0645, 461 | FCAD,0633 062C, 462 | FCAE,0633 062D, 463 | FCAF,0633 062E, 464 | FCB0,0633 0645, 465 | FCB1,0635 062D, 466 | FCB2,0635 062E, 467 | FCB3,0635 0645, 468 | FCB4,0636 062C, 469 | FCB5,0636 062D, 470 | FCB6,0636 062E, 471 | FCB7,0636 0645, 472 | FCB8,0637 062D, 473 | FCB9,0638 0645, 474 | FCBA,0639 062C, 475 | FCBB,0639 0645, 476 | FCBC,063A 062C, 477 | FCBD,063A 0645, 478 | FCBE,0641 062C, 479 | FCBF,0641 062D, 480 | FCC0,0641 062E, 481 | FCC1,0641 0645, 482 | FCC2,0642 062D, 483 | FCC3,0642 0645, 484 | FCC4,0643 062C, 485 | FCC5,0643 062D, 486 | FCC6,0643 062E, 487 | FCC7,0643 0644, 488 | FCC8,0643 0645, 489 | FCC9,0644 062C, 490 | FCCA,0644 062D, 491 | FCCB,0644 062E, 492 | FCCC,0644 0645, 493 | FCCD,0644 0647, 494 | FCCE,0645 062C, 495 | FCCF,0645 062D, 496 | FCD0,0645 062E, 497 | FCD1,0645 0645, 498 | FCD2,0646 062C, 499 | FCD3,0646 062D, 500 | FCD4,0646 062E, 501 | FCD5,0646 0645, 502 | FCD6,0646 0647, 503 | FCD7,0647 062C, 504 | FCD8,0647 0645, 505 | FCD9,0647 0670, 506 | FCDA,064A 062C, 507 | FCDB,064A 062D, 508 | FCDC,064A 062E, 509 | FCDD,064A 0645, 510 | FCDE,064A 0647, 511 | FCDF,0626 0645, 512 | FCE0,0626 0647, 513 | FCE1,0628 0645, 514 | FCE2,0628 0647, 515 | FCE3,062A 0645, 516 | FCE4,062A 0647, 517 | FCE5,062B 0645, 518 | FCE6,062B 0647, 519 | FCE7,0633 0645, 520 | FCE8,0633 0647, 521 | FCE9,0634 0645, 522 | FCEA,0634 0647, 523 | FCEB,0643 0644, 524 | FCEC,0643 0645, 525 | FCED,0644 0645, 526 | FCEE,0646 0645, 527 | FCEF,0646 0647, 528 | FCF0,064A 0645, 529 | FCF1,064A 0647, 530 | FCF2,0640 064E 0651, 531 | FCF3,0640 064F 0651, 532 | FCF4,0640 0650 0651, 533 | FCF5,0637 0649, 534 | FCF6,0637 064A, 535 | FCF7,0639 0649, 536 | FCF8,0639 064A, 537 | FCF9,063A 
0649, 538 | FCFA,063A 064A, 539 | FCFB,0633 0649, 540 | FCFC,0633 064A, 541 | FCFD,0634 0649, 542 | FCFE,0634 064A, 543 | FCFF,062D 0649, 544 | FD00,062D 064A, 545 | FD01,062C 0649, 546 | FD02,062C 064A, 547 | FD03,062E 0649, 548 | FD04,062E 064A, 549 | FD05,0635 0649, 550 | FD06,0635 064A, 551 | FD07,0636 0649, 552 | FD08,0636 064A, 553 | FD09,0634 062C, 554 | FD0A,0634 062D, 555 | FD0B,0634 062E, 556 | FD0C,0634 0645, 557 | FD0D,0634 0631, 558 | FD0E,0633 0631, 559 | FD0F,0635 0631, 560 | FD10,0636 0631, 561 | FD11,0637 0649, 562 | FD12,0637 064A, 563 | FD13,0639 0649, 564 | FD14,0639 064A, 565 | FD15,063A 0649, 566 | FD16,063A 064A, 567 | FD17,0633 0649, 568 | FD18,0633 064A, 569 | FD19,0634 0649, 570 | FD1A,0634 064A, 571 | FD1B,062D 0649, 572 | FD1C,062D 064A, 573 | FD1D,062C 0649, 574 | FD1E,062C 064A, 575 | FD1F,062E 0649, 576 | FD20,062E 064A, 577 | FD21,0635 0649, 578 | FD22,0635 064A, 579 | FD23,0636 0649, 580 | FD24,0636 064A, 581 | FD25,0634 062C, 582 | FD26,0634 062D, 583 | FD27,0634 062E, 584 | FD28,0634 0645, 585 | FD29,0634 0631, 586 | FD2A,0633 0631, 587 | FD2B,0635 0631, 588 | FD2C,0636 0631, 589 | FD2D,0634 062C, 590 | FD2E,0634 062D, 591 | FD2F,0634 062E, 592 | FD30,0634 0645, 593 | FD31,0633 0647, 594 | FD32,0634 0647, 595 | FD33,0637 0645, 596 | FD34,0633 062C, 597 | FD35,0633 062D, 598 | FD36,0633 062E, 599 | FD37,0634 062C, 600 | FD38,0634 062D, 601 | FD39,0634 062E, 602 | FD3A,0637 0645, 603 | FD3B,0638 0645, 604 | FD3C,0627 064B, 605 | FD3D,0627 064B, 606 | FD50,062A 062C 0645, 607 | FD51,062A 062D 062C, 608 | FD52,062A 062D 062C, 609 | FD53,062A 062D 0645, 610 | FD54,062A 062E 0645, 611 | FD55,062A 0645 062C, 612 | FD56,062A 0645 062D, 613 | FD57,062A 0645 062E, 614 | FD58,062C 0645 062D, 615 | FD59,062C 0645 062D, 616 | FD5A,062D 0645 064A, 617 | FD5B,062D 0645 0649, 618 | FD5C,0633 062D 062C, 619 | FD5D,0633 062C 062D, 620 | FD5E,0633 062C 0649, 621 | FD5F,0633 0645 062D, 622 | FD60,0633 0645 062D, 623 | FD61,0633 0645 062C, 624 | 
FD62,0633 0645 0645, 625 | FD63,0633 0645 0645, 626 | FD64,0635 062D 062D, 627 | FD65,0635 062D 062D, 628 | FD66,0635 0645 0645, 629 | FD67,0634 062D 0645, 630 | FD68,0634 062D 0645, 631 | FD69,0634 062C 064A, 632 | FD6A,0634 0645 062E, 633 | FD6B,0634 0645 062E, 634 | FD6C,0634 0645 0645, 635 | FD6D,0634 0645 0645, 636 | FD6E,0636 062D 0649, 637 | FD6F,0636 062E 0645, 638 | FD70,0636 062E 0645, 639 | FD71,0637 0645 062D, 640 | FD72,0637 0645 062D, 641 | FD73,0637 0645 0645, 642 | FD74,0637 0645 064A, 643 | FD75,0639 062C 0645, 644 | FD76,0639 0645 0645, 645 | FD77,0639 0645 0645, 646 | FD78,0639 0645 0649, 647 | FD79,063A 0645 0645, 648 | FD7A,063A 0645 064A, 649 | FD7B,063A 0645 0649, 650 | FD7C,0641 062E 0645, 651 | FD7D,0641 062E 0645, 652 | FD7E,0642 0645 062D, 653 | FD7F,0642 0645 0645, 654 | FD80,0644 062D 0645, 655 | FD81,0644 062D 064A, 656 | FD82,0644 062D 0649, 657 | FD83,0644 062C 062C, 658 | FD84,0644 062C 062C, 659 | FD85,0644 062E 0645, 660 | FD86,0644 062E 0645, 661 | FD87,0644 0645 062D, 662 | FD88,0644 0645 062D, 663 | FD89,0645 062D 062C, 664 | FD8A,0645 062D 0645, 665 | FD8B,0645 062D 064A, 666 | FD8C,0645 062C 062D, 667 | FD8D,0645 062C 0645, 668 | FD8E,0645 062E 062C, 669 | FD8F,0645 062E 0645, 670 | FD92,0645 062C 062E, 671 | FD93,0647 0645 062C, 672 | FD94,0647 0645 0645, 673 | FD95,0646 062D 0645, 674 | FD96,0646 062D 0649, 675 | FD97,0646 062C 0645, 676 | FD98,0646 062C 0645, 677 | FD99,0646 062C 0649, 678 | FD9A,0646 0645 064A, 679 | FD9B,0646 0645 0649, 680 | FD9C,064A 0645 0645, 681 | FD9D,064A 0645 0645, 682 | FD9E,0628 062E 064A, 683 | FD9F,062A 062C 064A, 684 | FDA0,062A 062C 0649, 685 | FDA1,062A 062E 064A, 686 | FDA2,062A 062E 0649, 687 | FDA3,062A 0645 064A, 688 | FDA4,062A 0645 0649, 689 | FDA5,062C 0645 064A, 690 | FDA6,062C 062D 0649, 691 | FDA7,062C 0645 0649, 692 | FDA8,0633 062E 0649, 693 | FDA9,0635 062D 064A, 694 | FDAA,0634 062D 064A, 695 | FDAB,0636 062D 064A, 696 | FDAC,0644 062C 064A, 697 | FDAD,0644 0645 064A, 698 | 
FDAE,064A 062D 064A, 699 | FDAF,064A 062C 064A, 700 | FDB0,064A 0645 064A, 701 | FDB1,0645 0645 064A, 702 | FDB2,0642 0645 064A, 703 | FDB3,0646 062D 064A, 704 | FDB4,0642 0645 062D, 705 | FDB5,0644 062D 0645, 706 | FDB6,0639 0645 064A, 707 | FDB7,0643 0645 064A, 708 | FDB8,0646 062C 062D, 709 | FDB9,0645 062E 064A, 710 | FDBA,0644 062C 0645, 711 | FDBB,0643 0645 0645, 712 | FDBC,0644 062C 0645, 713 | FDBD,0646 062C 062D, 714 | FDBE,062C 062D 064A, 715 | FDBF,062D 062C 064A, 716 | FDC0,0645 062C 064A, 717 | FDC1,0641 0645 064A, 718 | FDC2,0628 062D 064A, 719 | FDC3,0643 0645 0645, 720 | FDC4,0639 062C 0645, 721 | FDC5,0635 0645 0645, 722 | FDC6,0633 062E 064A, 723 | FDC7,0646 062C 064A, 724 | FDF0,0635 0644 06D2, 725 | FDF1,0642 0644 06D2, 726 | FDF2,0627 0644 0644 0647, 727 | FDF3,0627 0643 0628 0631, 728 | FDF4,0645 062D 0645 062F, 729 | FDF5,0635 0644 0639 0645, 730 | FDF6,0631 0633 0648 0644, 731 | FDF7,0639 0644 064A 0647, 732 | FDF8,0648 0633 0644 0645, 733 | FDF9,0635 0644 0649, 734 | FDFA,0635 0644 0649 0020 0627 0644 0644 0647 0020 0639 0644 064A 0647 0020 0648 0633 0644 0645, 735 | FDFB,062C 0644 0020 062C 0644 0627 0644 0647, 736 | FDFC,0631 06CC 0627 0644, 737 | FE70,0020 064B, 738 | FE71,0640 064B, 739 | FE72,0020 064C, 740 | FE74,0020 064D, 741 | FE76,0020 064E, 742 | FE77,0640 064E, 743 | FE78,0020 064F, 744 | FE79,0640 064F, 745 | FE7A,0020 0650, 746 | FE7B,0640 0650, 747 | FE7C,0020 0651, 748 | FE7D,0640 0651, 749 | FE7E,0020 0652, 750 | FE7F,0640 0652, 751 | FE80,0621, 752 | FE81,0622, 753 | FE82,0622, 754 | FE83,0623, 755 | FE84,0623, 756 | FE85,0624, 757 | FE86,0624, 758 | FE87,0625, 759 | FE88,0625, 760 | FE89,0626, 761 | FE8A,0626, 762 | FE8B,0626, 763 | FE8C,0626, 764 | FE8D,0627, 765 | FE8E,0627, 766 | FE8F,0628, 767 | FE90,0628, 768 | FE91,0628, 769 | FE92,0628, 770 | FE93,0629, 771 | FE94,0629, 772 | FE95,062A, 773 | FE96,062A, 774 | FE97,062A, 775 | FE98,062A, 776 | FE99,062B, 777 | FE9A,062B, 778 | FE9B,062B, 779 | FE9C,062B, 780 | 
FE9D,062C, 781 | FE9E,062C, 782 | FE9F,062C, 783 | FEA0,062C, 784 | FEA1,062D, 785 | FEA2,062D, 786 | FEA3,062D, 787 | FEA4,062D, 788 | FEA5,062E, 789 | FEA6,062E, 790 | FEA7,062E, 791 | FEA8,062E, 792 | FEA9,062F, 793 | FEAA,062F, 794 | FEAB,0630, 795 | FEAC,0630, 796 | FEAD,0631, 797 | FEAE,0631, 798 | FEAF,0632, 799 | FEB0,0632, 800 | FEB1,0633, 801 | FEB2,0633, 802 | FEB3,0633, 803 | FEB4,0633, 804 | FEB5,0634, 805 | FEB6,0634, 806 | FEB7,0634, 807 | FEB8,0634, 808 | FEB9,0635, 809 | FEBA,0635, 810 | FEBB,0635, 811 | FEBC,0635, 812 | FEBD,0636, 813 | FEBE,0636, 814 | FEBF,0636, 815 | FEC0,0636, 816 | FEC1,0637, 817 | FEC2,0637, 818 | FEC3,0637, 819 | FEC4,0637, 820 | FEC5,0638, 821 | FEC6,0638, 822 | FEC7,0638, 823 | FEC8,0638, 824 | FEC9,0639, 825 | FECA,0639, 826 | FECB,0639, 827 | FECC,0639, 828 | FECD,063A, 829 | FECE,063A, 830 | FECF,063A, 831 | FED0,063A, 832 | FED1,0641, 833 | FED2,0641, 834 | FED3,0641, 835 | FED4,0641, 836 | FED5,0642, 837 | FED6,0642, 838 | FED7,0642, 839 | FED8,0642, 840 | FED9,0643, 841 | FEDA,0643, 842 | FEDB,0643, 843 | FEDC,0643, 844 | FEDD,0644, 845 | FEDE,0644, 846 | FEDF,0644, 847 | FEE0,0644, 848 | FEE1,0645, 849 | FEE2,0645, 850 | FEE3,0645, 851 | FEE4,0645, 852 | FEE5,0646, 853 | FEE6,0646, 854 | FEE7,0646, 855 | FEE8,0646, 856 | FEE9,0647, 857 | FEEA,0647, 858 | FEEB,0647, 859 | FEEC,0647, 860 | FEED,0648, 861 | FEEE,0648, 862 | FEEF,0649, 863 | FEF0,0649, 864 | FEF1,064A, 865 | FEF2,064A, 866 | FEF3,064A, 867 | FEF4,064A, 868 | FEF5,0644 0622, 869 | FEF6,0644 0622, 870 | FEF7,0644 0623, 871 | FEF8,0644 0623, 872 | FEF9,0644 0625, 873 | FEFA,0644 0625, 874 | FEFB,0644 0627, 875 | FEFC,0644 0627, 876 | FF01,0021,FullWidth 877 | FF02,0022, 878 | FF03,0023, 879 | FF04,0024, 880 | FF05,0025, 881 | FF06,0026, 882 | FF07,0027, 883 | FF08,0028, 884 | FF09,0029, 885 | FF0A,002A, 886 | FF0B,002B, 887 | FF0C,002C, 888 | FF0D,002D, 889 | FF0E,002E, 890 | FF0F,002F, 891 | FF10,0030, 892 | FF11,0031, 893 | FF12,0032, 894 | FF13,0033, 
895 | FF14,0034, 896 | FF15,0035, 897 | FF16,0036, 898 | FF17,0037, 899 | FF18,0038, 900 | FF19,0039, 901 | FF1A,003A, 902 | FF1B,003B, 903 | FF1C,003C, 904 | FF1D,003D, 905 | FF1E,003E, 906 | FF1F,003F, 907 | FF20,0040, 908 | FF21,0041, 909 | FF22,0042, 910 | FF23,0043, 911 | FF24,0044, 912 | FF25,0045, 913 | FF26,0046, 914 | FF27,0047, 915 | FF28,0048, 916 | FF29,0049, 917 | FF2A,004A, 918 | FF2B,004B, 919 | FF2C,004C, 920 | FF2D,004D, 921 | FF2E,004E, 922 | FF2F,004F, 923 | FF30,0050, 924 | FF31,0051, 925 | FF32,0052, 926 | FF33,0053, 927 | FF34,0054, 928 | FF35,0055, 929 | FF36,0056, 930 | FF37,0057, 931 | FF38,0058, 932 | FF39,0059, 933 | FF3A,005A, 934 | FF3B,005B, 935 | FF3C,005C, 936 | FF3D,005D, 937 | FF3E,005E, 938 | FF3F,005F, 939 | FF40,0060, 940 | FF41,0061, 941 | FF42,0062, 942 | FF43,0063, 943 | FF44,0064, 944 | FF45,0065, 945 | FF46,0066, 946 | FF47,0067, 947 | FF48,0068, 948 | FF49,0069, 949 | FF4A,006A, 950 | FF4B,006B, 951 | FF4C,006C, 952 | FF4D,006D, 953 | FF4E,006E, 954 | FF4F,006F, 955 | FF50,0070, 956 | FF51,0071, 957 | FF52,0072, 958 | FF53,0073, 959 | FF54,0074, 960 | FF55,0075, 961 | FF56,0076, 962 | FF57,0077, 963 | FF58,0078, 964 | FF59,0079, 965 | FF5A,007A, 966 | FF5B,007B, 967 | FF5C,007C, 968 | FF5D,007D, 969 | FF5E,007E, 970 | FF5F,2985, 971 | FF60,2986, 972 | FFE0,00A2, 973 | FFE1,00A3, 974 | FFE2,00AC, 975 | FFE3,00AF, 976 | FFE4,00A6, 977 | FFE5,00A5, 978 | FFE6,20A9, 979 | FFF0,0020,Specials 980 | FFF1,0020, 981 | FFF2,0020, 982 | FFF3,0020, 983 | FFF4,0020, 984 | FFF5,0020, 985 | FFF6,0020, 986 | FFF7,0020, 987 | FFF8,0020, 988 | FFF9,0020, 989 | FFFA,0020, 990 | FFFB,0020, 991 | FFFC,0020, 992 | FFFD,0020, 993 | FFFE,0020, 994 | FFFF,0020 -------------------------------------------------------------------------------- /resources/Phoneme2Ascii.csv: -------------------------------------------------------------------------------- 1 | Phoneme,ASCII 2 | ʔ,EH 3 | a,AA 4 | b,B 5 | p,P 6 | t,T 7 | c,JE 8 | ç,CH 9 | ḧ,HE 10 | x,X 11 | 
d,D 12 | r,R 13 | ř,RR 14 | z,Z 15 | j,ZH 16 | s,S 17 | ş,SH 18 | ƹ,AH 19 | ẍ,XE 20 | f,F 21 | v,V 22 | q,Q 23 | k,K 24 | g,G 25 | l,L 26 | ł,LL 27 | m,M 28 | n,N 29 | o,O 30 | e,A 31 | h,H 32 | ê,E 33 | î,I 34 | y,Y 35 | w,W 36 | u,U 37 | û,UU -------------------------------------------------------------------------------- /resources/Phoneme2IPA.csv: -------------------------------------------------------------------------------- 1 | Phoneme,IPA 2 | ng,ŋg 3 | ʔ,ʔ 4 | b,b 5 | p,p 6 | t,t 7 | c,d͡ʒ 8 | ç,t͡ʃ 9 | ḧ,ħ 10 | x,x 11 | d,d 12 | r,ɾ 13 | ř,r 14 | z,z 15 | j,ʒ 16 | s,s 17 | ş,ʃ 18 | ƹ,ʕ 19 | ẍ,ɣ 20 | f,f 21 | v,v 22 | q,q 23 | k,k 24 | g,g 25 | l,l 26 | ł,ɫ 27 | m,m 28 | n,n 29 | w,w 30 | u,ʊ 31 | û,u 32 | o,o̞ 33 | h,h 34 | y,j 35 | a,ä 36 | e,a 37 | ê,ɛ 38 | i,ɪ 39 | î,i 40 | ĝ,ŋ 41 | đ,đ 42 | ü,y 43 | ô,ô 44 | õ,õ -------------------------------------------------------------------------------- /resources/PoemPatterns.csv: -------------------------------------------------------------------------------- 1 | Frequency,WeightPattern,Title 2 | 1044,–∪–––∪–––∪–––∪–,فاعلاتن فاعلاتن فاعلاتن فاعلن 3 | 999,∪–––∪–––∪–––∪–––,مفاعیلن مفاعیلن مفاعیلن مفاعیلن 4 | 386,∪–––∪–––∪––,مفاعیلن مفاعیلن فعولن 5 | 334,––∪∪––∪∪––∪∪––,مفعولُ مفاعیلُ مفاعیلُ فعولن 6 | 272,––∪∪––∪∪––∪∪–,مفعولُ مفاعیلُ مفاعیلُ فعل 7 | 213,––∪–∪–∪∪––∪–∪–,مفعولُ فاعلاتُ مفاعیلُ فاعلن 8 | 138,∪∪––∪∪––∪∪––∪∪–,فعلاتن فعلاتن فعلاتن فعلن 9 | 131,––∪∪–∪–∪––,مفعولُ مفاعلن فعولن 10 | 62,–∪–––∪–––∪–,فاعلاتن فاعلاتن فاعلن 11 | 45,∪∪––∪–∪–∪∪–,فعلاتن مفاعلن فعلن 12 | 40,∪–∪–∪∪––∪–∪–∪∪–,مفاعلن فعلاتن مفاعلن فعلن 13 | 31,––∪∪–––––∪∪–––,مفعولُ مفاعیلن مفعولُ مفاعیلن 14 | 28,––∪–∪––––∪–∪––,مفعولُ فاعلاتن مفعولُ فاعلاتن 15 | 20,∪∪––∪∪––∪∪–,فعلاتن فعلاتن فعلن 16 | 19,––∪–––∪–––∪–––∪–,مستفعلن مستفعلن مستفعلن مستفعلن 17 | 14,∪––∪––∪––∪–,فعولن فعولن فعولن فعل 18 | 13,∪–∪–∪––∪–∪–∪––,مفاعلن فعولن مفاعلن فعولن 19 | 9,–∪∪––∪––∪∪––∪–,مفتعلن فاعلن مفتعلن فاعلن 20 | 8,–∪∪––∪∪––∪–,مفتعلن مفتعلن فاعلن 21 | 
8,–∪–––∪–––∪–––∪––,فاعلاتن فاعلاتن فاعلاتن فاعلاتن 22 | 7,∪–∪–∪–∪–∪–∪–∪–∪–,مفاعلن مفاعلن مفاعلن مفاعلن 23 | 7,–∪∪–∪–∪––∪∪–∪–∪–,مفتعلن مفاعلن مفتعلن مفاعلن 24 | 6,∪––∪––∪––∪––,فعولن فعولن فعولن فعولن 25 | 5,∪––∪∪––∪∪––∪∪––,مفاعیلُ مفاعیلُ مفاعیلُ فعولن 26 | 3,∪∪–∪–∪∪–∪–∪∪–∪–∪∪–∪–,متفاعلن متفاعلن متفاعلن متفاعلن 27 | 2,∪∪–∪–∪––∪∪–∪–∪––,فعلاتُ فاعلاتن فعلاتُ فاعلاتن 28 | 2,∪–∪–∪∪––∪–∪–∪∪––,مفاعلن فعلاتن مفاعلن فعلاتن --------------------------------------------------------------------------------