├── .gitignore
├── .vs
│   └── AsoSoftLibrary
│       └── FileContentIndex
│           └── read.lock
├── AsoSoft-logo.png
├── AsoSoftLibrary - Backup.csproj
├── AsoSoftLibrary.csproj
├── AsoSoftLibrary.csproj.user
├── AsoSoftLibrary.sln
├── G2P.cs
├── Normalize.cs
├── Number2Word.cs
├── PoemClassifier.cs
├── README.md
├── Sort.cs
├── Transliteration.cs
├── resFiles.Designer.cs
├── resFiles.resx
└── resources
    ├── G2PCertain.csv
    ├── G2PExceptions.csv
    ├── NormalizeUnicodeAdditional.csv
    ├── NormalizeUnicodeDeep.csv
    ├── Phoneme2Ascii.csv
    ├── Phoneme2IPA.csv
    └── PoemPatterns.csv
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ##
4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
5 |
6 | # User-specific files
7 | *.rsuser
8 | *.suo
9 | *.user
10 | *.userosscache
11 | *.sln.docstates
12 |
13 | # User-specific files (MonoDevelop/Xamarin Studio)
14 | *.userprefs
15 |
16 | # Build results
17 | [Dd]ebug/
18 | [Dd]ebugPublic/
19 | [Rr]elease/
20 | [Rr]eleases/
21 | x64/
22 | x86/
23 | [Aa][Rr][Mm]/
24 | [Aa][Rr][Mm]64/
25 | bld/
26 | [Bb]in/
27 | [Oo]bj/
28 | [Ll]og/
29 |
30 | # Visual Studio 2015/2017 cache/options directory
31 | .vs/
32 | # Uncomment if you have tasks that create the project's static files in wwwroot
33 | #wwwroot/
34 |
35 | # Visual Studio 2017 auto generated files
36 | Generated\ Files/
37 |
38 | # MSTest test Results
39 | [Tt]est[Rr]esult*/
40 | [Bb]uild[Ll]og.*
41 |
42 | # NUNIT
43 | *.VisualState.xml
44 | TestResult.xml
45 |
46 | # Build Results of an ATL Project
47 | [Dd]ebugPS/
48 | [Rr]eleasePS/
49 | dlldata.c
50 |
51 | # Benchmark Results
52 | BenchmarkDotNet.Artifacts/
53 |
54 | # .NET Core
55 | project.lock.json
56 | project.fragment.lock.json
57 | artifacts/
58 |
59 | # StyleCop
60 | StyleCopReport.xml
61 |
62 | # Files built by Visual Studio
63 | *_i.c
64 | *_p.c
65 | *_h.h
66 | *.ilk
67 | *.meta
68 | *.obj
69 | *.iobj
70 | *.pch
71 | *.pdb
72 | *.ipdb
73 | *.pgc
74 | *.pgd
75 | *.rsp
76 | *.sbr
77 | *.tlb
78 | *.tli
79 | *.tlh
80 | *.tmp
81 | *.tmp_proj
82 | *_wpftmp.csproj
83 | *.log
84 | *.vspscc
85 | *.vssscc
86 | .builds
87 | *.pidb
88 | *.svclog
89 | *.scc
90 |
91 | # Chutzpah Test files
92 | _Chutzpah*
93 |
94 | # Visual C++ cache files
95 | ipch/
96 | *.aps
97 | *.ncb
98 | *.opendb
99 | *.opensdf
100 | *.sdf
101 | *.cachefile
102 | *.VC.db
103 | *.VC.VC.opendb
104 |
105 | # Visual Studio profiler
106 | *.psess
107 | *.vsp
108 | *.vspx
109 | *.sap
110 |
111 | # Visual Studio Trace Files
112 | *.e2e
113 |
114 | # TFS 2012 Local Workspace
115 | $tf/
116 |
117 | # Guidance Automation Toolkit
118 | *.gpState
119 |
120 | # ReSharper is a .NET coding add-in
121 | _ReSharper*/
122 | *.[Rr]e[Ss]harper
123 | *.DotSettings.user
124 |
125 | # JustCode is a .NET coding add-in
126 | .JustCode
127 |
128 | # TeamCity is a build add-in
129 | _TeamCity*
130 |
131 | # DotCover is a Code Coverage Tool
132 | *.dotCover
133 |
134 | # AxoCover is a Code Coverage Tool
135 | .axoCover/*
136 | !.axoCover/settings.json
137 |
138 | # Visual Studio code coverage results
139 | *.coverage
140 | *.coveragexml
141 |
142 | # NCrunch
143 | _NCrunch_*
144 | .*crunch*.local.xml
145 | nCrunchTemp_*
146 |
147 | # MightyMoose
148 | *.mm.*
149 | AutoTest.Net/
150 |
151 | # Web workbench (sass)
152 | .sass-cache/
153 |
154 | # Installshield output folder
155 | [Ee]xpress/
156 |
157 | # DocProject is a documentation generator add-in
158 | DocProject/buildhelp/
159 | DocProject/Help/*.HxT
160 | DocProject/Help/*.HxC
161 | DocProject/Help/*.hhc
162 | DocProject/Help/*.hhk
163 | DocProject/Help/*.hhp
164 | DocProject/Help/Html2
165 | DocProject/Help/html
166 |
167 | # Click-Once directory
168 | publish/
169 |
170 | # Publish Web Output
171 | *.[Pp]ublish.xml
172 | *.azurePubxml
173 | # Note: Comment the next line if you want to checkin your web deploy settings,
174 | # but database connection strings (with potential passwords) will be unencrypted
175 | *.pubxml
176 | *.publishproj
177 |
178 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
179 | # checkin your Azure Web App publish settings, but sensitive information contained
180 | # in these scripts will be unencrypted
181 | PublishScripts/
182 |
183 | # NuGet Packages
184 | *.nupkg
185 | # The packages folder can be ignored because of Package Restore
186 | **/[Pp]ackages/*
187 | # except build/, which is used as an MSBuild target.
188 | !**/[Pp]ackages/build/
189 | # Uncomment if necessary however generally it will be regenerated when needed
190 | #!**/[Pp]ackages/repositories.config
191 | # NuGet v3's project.json files produces more ignorable files
192 | *.nuget.props
193 | *.nuget.targets
194 |
195 | # Microsoft Azure Build Output
196 | csx/
197 | *.build.csdef
198 |
199 | # Microsoft Azure Emulator
200 | ecf/
201 | rcf/
202 |
203 | # Windows Store app package directories and files
204 | AppPackages/
205 | BundleArtifacts/
206 | Package.StoreAssociation.xml
207 | _pkginfo.txt
208 | *.appx
209 |
210 | # Visual Studio cache files
211 | # files ending in .cache can be ignored
212 | *.[Cc]ache
213 | # but keep track of directories ending in .cache
214 | !?*.[Cc]ache/
215 |
216 | # Others
217 | ClientBin/
218 | ~$*
219 | *~
220 | *.dbmdl
221 | *.dbproj.schemaview
222 | *.jfm
223 | *.pfx
224 | *.publishsettings
225 | orleans.codegen.cs
226 |
227 | # Including strong name files can present a security risk
228 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
229 | #*.snk
230 |
231 | # Since there are multiple workflows, uncomment next line to ignore bower_components
232 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
233 | #bower_components/
234 |
235 | # RIA/Silverlight projects
236 | Generated_Code/
237 |
238 | # Backup & report files from converting an old project file
239 | # to a newer Visual Studio version. Backup files are not needed,
240 | # because we have git ;-)
241 | _UpgradeReport_Files/
242 | Backup*/
243 | UpgradeLog*.XML
244 | UpgradeLog*.htm
245 | ServiceFabricBackup/
246 | *.rptproj.bak
247 |
248 | # SQL Server files
249 | *.mdf
250 | *.ldf
251 | *.ndf
252 |
253 | # Business Intelligence projects
254 | *.rdl.data
255 | *.bim.layout
256 | *.bim_*.settings
257 | *.rptproj.rsuser
258 | *- Backup*.rdl
259 |
260 | # Microsoft Fakes
261 | FakesAssemblies/
262 |
263 | # GhostDoc plugin setting file
264 | *.GhostDoc.xml
265 |
266 | # Node.js Tools for Visual Studio
267 | .ntvs_analysis.dat
268 | node_modules/
269 |
270 | # Visual Studio 6 build log
271 | *.plg
272 |
273 | # Visual Studio 6 workspace options file
274 | *.opt
275 |
276 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
277 | *.vbw
278 |
279 | # Visual Studio LightSwitch build output
280 | **/*.HTMLClient/GeneratedArtifacts
281 | **/*.DesktopClient/GeneratedArtifacts
282 | **/*.DesktopClient/ModelManifest.xml
283 | **/*.Server/GeneratedArtifacts
284 | **/*.Server/ModelManifest.xml
285 | _Pvt_Extensions
286 |
287 | # Paket dependency manager
288 | .paket/paket.exe
289 | paket-files/
290 |
291 | # FAKE - F# Make
292 | .fake/
293 |
294 | # JetBrains Rider
295 | .idea/
296 | *.sln.iml
297 |
298 | # CodeRush personal settings
299 | .cr/personal
300 |
301 | # Python Tools for Visual Studio (PTVS)
302 | __pycache__/
303 | *.pyc
304 |
305 | # Cake - Uncomment if you are using it
306 | # tools/**
307 | # !tools/packages.config
308 |
309 | # Tabs Studio
310 | *.tss
311 |
312 | # Telerik's JustMock configuration file
313 | *.jmconfig
314 |
315 | # BizTalk build output
316 | *.btp.cs
317 | *.btm.cs
318 | *.odx.cs
319 | *.xsd.cs
320 |
321 | # OpenCover UI analysis results
322 | OpenCover/
323 |
324 | # Azure Stream Analytics local run output
325 | ASALocalRun/
326 |
327 | # MSBuild Binary and Structured Log
328 | *.binlog
329 |
330 | # NVidia Nsight GPU debugger configuration file
331 | *.nvuser
332 |
333 | # MFractors (Xamarin productivity tool) working folder
334 | .mfractor/
335 |
336 | # Local History for Visual Studio
337 | .localhistory/
338 |
339 | # BeatPulse healthcheck temp database
340 | healthchecksdb
--------------------------------------------------------------------------------
/.vs/AsoSoftLibrary/FileContentIndex/read.lock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AsoSoft/AsoSoft-Library/f69d510e0a180c40511145691e8af1deb305aee5/.vs/AsoSoftLibrary/FileContentIndex/read.lock
--------------------------------------------------------------------------------
/AsoSoft-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AsoSoft/AsoSoft-Library/f69d510e0a180c40511145691e8af1deb305aee5/AsoSoft-logo.png
--------------------------------------------------------------------------------
/AsoSoftLibrary - Backup.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | netcoreapp3.1
5 | AsoSoft
6 | AsoSoft Class Library
7 | Aso Mahmudi
8 | AsoSoft Class Library offers basic natural language processing (NLP) algorithms for the Kurdish Language (ckb: Central branch of Kurdish).
9 | MIT
10 | https://github.com/AsoSoft/AsoSoft-Library
11 | AsoSoft-logo.png
12 | kurdish normalization natural-language-processing
13 | AsoSoft Library for the Kurdish language processing (ckb: Central branch of Kurdish).
14 | Normalizer and Numeral Converter classes
15 | https://github.com/AsoSoft/AsoSoft-Library
16 | 2.0.1
17 | ReadMe.md
18 | True
19 | True
20 | True
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 | True
50 | \
51 |
52 |
53 | True
54 | \
55 |
56 |
57 |
58 |
59 |
--------------------------------------------------------------------------------
/AsoSoftLibrary.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | netcoreapp3.1
5 | AsoSoft
6 | AsoSoft Class Library
7 | Aso Mahmudi
8 | AsoSoft Class Library offers basic natural language processing (NLP) algorithms for the Kurdish Language (ckb: Central branch of Kurdish).
9 | MIT
10 | https://github.com/AsoSoft/AsoSoft-Library
11 | AsoSoft-logo.png
12 | kurdish normalization natural-language-processing
13 | AsoSoft Library for the Kurdish language processing (ckb: Central branch of Kurdish).
14 | Normalizer and Numeral Converter classes
15 | https://github.com/AsoSoft/AsoSoft-Library
16 | 2.1.3
17 | README.md
18 | True
19 | True
20 | True
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 | True
35 | True
36 | resFiles.resx
37 |
38 |
39 |
40 |
41 |
42 | PublicResXFileCodeGenerator
43 | resFiles.Designer.cs
44 |
45 |
46 |
47 |
48 |
49 | True
50 | \
51 |
52 |
53 | True
54 | \
55 | Always
56 |
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/AsoSoftLibrary.csproj.user:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | <_LastSelectedProfileId>D:\DEV\AsoSoftLibrary\Properties\PublishProfiles\FolderProfile.pubxml
5 |
6 |
7 |
8 | Designer
9 |
10 |
11 |
--------------------------------------------------------------------------------
/AsoSoftLibrary.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 17
4 | VisualStudioVersion = 17.2.32616.157
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "AsoSoftLibrary", "AsoSoftLibrary.csproj", "{69039AA0-A7AD-4F12-B1B9-13263A9DC47F}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|Any CPU = Debug|Any CPU
11 | Release|Any CPU = Release|Any CPU
12 | EndGlobalSection
13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | {69039AA0-A7AD-4F12-B1B9-13263A9DC47F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15 | {69039AA0-A7AD-4F12-B1B9-13263A9DC47F}.Debug|Any CPU.Build.0 = Debug|Any CPU
16 | {69039AA0-A7AD-4F12-B1B9-13263A9DC47F}.Release|Any CPU.ActiveCfg = Release|Any CPU
17 | {69039AA0-A7AD-4F12-B1B9-13263A9DC47F}.Release|Any CPU.Build.0 = Release|Any CPU
18 | EndGlobalSection
19 | GlobalSection(SolutionProperties) = preSolution
20 | HideSolutionNode = FALSE
21 | EndGlobalSection
22 | GlobalSection(ExtensibilityGlobals) = postSolution
23 | SolutionGuid = {95A3F2E7-0611-4D99-8A85-055D3FE5E265}
24 | EndGlobalSection
25 | EndGlobal
26 |
--------------------------------------------------------------------------------
/G2P.cs:
--------------------------------------------------------------------------------
1 | // Automated Grapheme-to-Phoneme Conversion for Central Kurdish based on Optimality Theory
2 | // Copyright (C) 2019 Aso Mahmudi, Hadi Veisi
3 | // Maintainer: Aso Mahmudi (aso.mehmudi@gmail.com)
4 | // Demo: https://asosoft.github.io/g2p/
5 | // Source Code: https://github.com/AsoSoft/AsoSoft-Library
6 | // Test-set: https://github.com/AsoSoft/Kurdish-G2P-dataset
7 | // Paper: https://www.sciencedirect.com/science/article/abs/pii/S0885230821000292
8 | // Cite:
9 | // @article{mahmudi2021automated,
10 | // title={Automated grapheme-to-phoneme conversion for Central Kurdish based on optimality theory},
11 | // author={Mahmudi, Aso and Veisi, Hadi},
12 | // journal={Computer Speech \& Language},
13 | // volume={70},
14 | // pages={101222},
15 | // year={2021},
16 | // publisher={Elsevier}
17 | // }
18 |
19 | using System.Collections.Generic;
20 | using System.Linq;
21 | using System.Text;
22 | using System.Text.RegularExpressions;
23 |
24 | namespace AsoSoftLibrary
25 | {
26 | public static partial class AsoSoft
27 | {
28 | private static readonly Dictionary<string, string> History = new Dictionary<string, string>(); // cache used by WordG2P: word -> evaluated candidates joined by '¶'
29 |
30 | /// <summary>Converts Central Kurdish text in standard Arabic script into syllabified phonemic Latin script (i.e. graphemes to phonemes).</summary>
31 | public static string G2P(string text,
32 | bool convertNumbersToWord = false, // true: numbers are spelled out with Number2Word before conversion
33 | bool backMergeConjunction = true, // true: the conjunction "و" is merged into the preceding word; false: it is emitted as "û"
34 | bool singleOutputPerWord = true) // true: only the first candidate of each word is kept; false: all of them, separated by '¶'
35 | {
36 | var sb = new StringBuilder();
37 | text = UnifyNumerals(text, "en");
38 | if (convertNumbersToWord)
39 | text = Number2Word(text);
40 |
41 | text = g2pNormalize(text.Trim());
42 | // split the text into alternating runs of Kurdish letters and of everything else
43 | var ku = "ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنوۆەهیێ" + "ۋۉۊڎڴݵݸ";
44 | var wordss = Regex.Matches(text, "([" + ku + "]+|[^" + ku + "]+)");
45 | for (int i = 0; i < wordss.Count; i++)
46 | {
47 | var word = wordss[i].Value;
48 | if (Regex.IsMatch(word, "[" + ku + "]") && word != "و")
49 | sb.Append(WordG2P(Regex.Replace(word, "[^" + ku + "]+", ""), singleOutputPerWord));
50 | else
51 | sb.Append(word);
52 | }
53 | var output = sb.ToString();
54 |
55 | // conjunction و (at the start of the text or of a sentence it is read as "we")
56 | output = Regex.Replace(output, "(^|[?!.] ?)" + "و", "$1ˈwe");
57 | if (!backMergeConjunction)
58 | output = Regex.Replace(output, "و", "û");
59 | else
60 | {
61 | // if there are candidates preceding conjunction (e.g ˈbîst¶ˈbîˈsit و)
62 | // the conjunction is first copied behind every candidate
63 | output = Regex.Replace(output, "(\\w+)¶(\\w+)¶(\\w+) و"
64 | , "$1 و¶$2 و¶$3 و");
65 | output = Regex.Replace(output, "(\\w+)¶(\\w+) و"
66 | , "$1 و¶$2 و");
67 |
68 | // ('bi'ra + w => bi'raw)
69 | output = Regex.Replace(output, "([aeêouûiî]) و", "$1w");
70 | // ('be'fir + û => 'bef'rû)
71 | output = Regex.Replace(output, "(?<=\\w)ˈ([^aeêouûiî])i([^aeêouûiî]) و", "$1ˈ$2û");
72 | // ('ser + û => 'se'rû)
73 | // ('sard + û => 'sar'dû)
74 | // ('min + û => 'mi'nû)
75 | // ('bi'gir + û => 'bi'gi'rû)
76 | // ('gir'tin + û => 'gir'ti'nû)
77 | output = Regex.Replace(output, "([^aeêouûiî]) و", "ˈ$1û");
78 | // if conjunction makes candidates the same (e.g ˈbîsˈtû¶ˈbîsˈtû), keep a single copy
79 | output = Regex.Replace(output, "(\\w+)¶\\1(?=\\s|$)", "$1");
80 | }
81 | return output.TrimEnd();
82 | }
83 |
84 |
85 | // Chooses the best candidates for the word: every candidate within 5 penalty points of the best one, best first, joined by '¶'
86 | private static string Evaluator(string gr, List<string> Candidates)
87 | {
88 | // Sort explicitly (OrderBy is stable) instead of relying on the enumeration order of the Dictionary returned by EVAL
89 | var ranked = EVAL(Candidates).OrderBy(c => c.Value).ToList();
90 | if (ranked.Count == 0)
91 | return gr; // no candidate was evaluated: fall back to the input word unchanged
92 |
93 | // keep the lowest penalty and everything less than 5 points above it
94 | var lowestPenalty = ranked[0].Value;
95 | var winners = ranked.Where(c => c.Value < lowestPenalty + 5).Select(c => c.Key);
96 |
97 | return string.Join('¶', winners);
98 | }
99 |
100 | // Normalization: rewrites the text with an ordered list of regex replacements before G2P conversion
101 | private static string g2pNormalize(string text)
102 | {
103 | var s = new string[] // flat list of pairs: a regex pattern followed by its replacement
104 | {
105 | " +", " " ,
106 | "دٚ", "ڎ",
107 | "گٚ", "ڴ",
108 | @"(^|\s)چ بکە", "$1چبکە",
109 | "َ", "ە", // fatha
110 | "ِ", "ی", // kasra
111 | "ُ", "و", // damma
112 | "ء", "ئ", // Hamza
113 | "أ", "ئە",
114 | "إ", "ئی",
115 | "آ", "ئا",
116 | "ظ|ذ|ض", "ز",
117 | "ص|ث", "س",
118 | "ط", "ت",
119 | "ك", "ک",
120 | "ي|ى", "ی",
121 | "ه", "ە",
122 | "ھ", "ه",
123 | "ـ", "", // tatweel
124 | "؟", "?",
125 | "،", ",",
126 | "؛", ";",
127 | "\r", "",
128 | };
129 | for (int i = 0; i < s.Length; i += 2) // pairs are applied in list order: s[i] is the pattern, s[i + 1] the replacement
130 | text = Regex.Replace(text, s[i], s[i + 1]);
131 | return text;
132 | }
133 |
134 | private static string WordG2P(string gr, bool SingleOutputPerWord)
135 | {
136 | // History memoises results: a word is generated and evaluated only the first time it is seen
137 | if (!History.TryGetValue(gr, out var cached))
138 | History.Add(gr, cached = Evaluator(gr, Generator(gr)));
139 | return SingleOutputPerWord ? cached.Split('¶')[0] : cached;
140 | }
141 |
142 | // GEN: generates all possible syllabified candidates for a word:
143 | // e.g. بوون => bûn, buwn, bwun
144 | private static List<string> Generator(string gr)
145 | {
146 | // Converting exceptional words (row 0 of the CSV is skipped, presumably a header)
147 | var G2PExceptions = resFiles.G2PExceptions.Split('\n');
148 | for (int i = 1; i < G2PExceptions.Length; i++)
149 | {
150 | var item = G2PExceptions[i].TrimEnd('\r').Split(','); // tolerate CRLF line endings
151 | if (item.Length > 1) gr = Regex.Replace(gr, item[0], item[1]); // blank or malformed rows are ignored
152 | }
153 |
154 | // Converting certain characters (row 0 of the CSV is skipped, presumably a header)
155 | var G2PCertain = resFiles.G2PCertain.Split('\n');
156 | for (int i = 1; i < G2PCertain.Length; i++)
157 | {
158 | var item = G2PCertain[i].TrimEnd('\r').Split(','); // tolerate CRLF line endings
159 | if (item.Length > 1) gr = Regex.Replace(gr, item[0], item[1]); // blank or malformed rows are ignored
160 | }
161 |
162 | // Uncertainty in "و" and "ی": consume gr from the left, collecting the possible readings of each piece
163 | var CandList1 = new List<string> { "" };
164 | while (gr.Length > 0)
165 | {
166 | var temp = new List<string>();
167 | if (Regex.IsMatch(gr, "^ووووو"))
168 | {
169 | temp.AddRange(new List<string>
170 | { "uwuwu", "uwuww", "uwwuw", "uwûw",
171 | "wuwwu", "wuwuw", "wuwû", "wûww", "wwuwu", "wwuww", "wwûw", "wûwu",
172 | "ûwwu", "ûwuw", "ûwû"});
173 | gr = gr.Substring(5);
174 | }
175 | else if (Regex.IsMatch(gr, "^وووو"))
176 | {
177 | temp.AddRange(new List<string>
178 | { "uwwu", "uwuw", "uwû",
179 | "wwuw", "wwû", "wuww", "wuwu", "wûw",
180 | "ûwu", "ûww", });
181 | gr = gr.Substring(4);
182 | }
183 | else if (Regex.IsMatch(gr, "^ووو"))
184 | {
185 | temp.AddRange(new List<string>
186 | { "wuw", "wwu", "wû",
187 | "uww", "uwu",
188 | "ûw" });
189 | gr = gr.Substring(3);
190 | }
191 | else if (Regex.IsMatch(gr, "^وو"))
192 | {
193 | temp.AddRange(new List<string> { "wu", "uw", "ww", "û" });
194 | gr = gr.Substring(2);
195 | }
196 | else if (Regex.IsMatch(gr, "^و"))
197 | {
198 | temp.AddRange(new List<string> { "u", "w" });
199 | gr = gr.Substring(1);
200 | }
201 | else if (Regex.IsMatch(gr, "^یی"))
202 | {
203 | temp.AddRange(new List<string> { "îy", "yî" });
204 | gr = gr.Substring(2);
205 | }
206 | else if (Regex.IsMatch(gr, "^ی"))
207 | {
208 | temp.AddRange(new List<string> { "y", "î" });
209 | gr = gr.Substring(1);
210 | }
211 | else
212 | {
213 | temp.Add(gr[0].ToString());
214 | gr = gr.Substring(1);
215 | }
216 |
217 | var Count = CandList1.Count;
218 | var TempList = new List<string>();
219 | foreach (var item in CandList1)
220 | TempList.Add(item);
221 | CandList1.Clear();
222 | for (int i = 0; i < Count; i++) // every prefix built so far ...
223 | {
224 | for (int j = 0; j < temp.Count; j++) // ... is extended with every reading of the current piece
225 | {
226 | var WW = Regex.IsMatch(temp[j], "^ww");
227 | var IsPreviousVowel = Regex.IsMatch(TempList[i], "[aeêouûiîüȯė]$");
228 | var IsNowVowel = Regex.IsMatch(temp[j], "^[aeêouûiîüȯė]");
229 | var ConsonantBeforeWW = !IsPreviousVowel && WW;
230 | var hiatus = IsPreviousVowel && IsNowVowel;
231 | if (!hiatus && !ConsonantBeforeWW) // drop vowel-vowel sequences and "ww" not preceded by a vowel
232 | CandList1.Add(TempList[i] + temp[j]);
233 | }
234 | }
235 | }
236 | // Adding "i" between Consonant Clusters
237 | var Candidates = iInsertion(CandList1);
238 |
239 | // ======= Syllabification for each candidate
240 | var OutputCandidates = Syllabification(Candidates);
241 |
242 | // for speed up: remove candidates that has 1) syllable without vowel or 2) more than 3 consonants in coda
243 | var cCount = OutputCandidates.Count;
244 | if (cCount > 1)
245 | {
246 | for (int i = cCount - 1; i > -1; i--) // walk backwards so RemoveAt does not shift unvisited items
247 | if (Regex.IsMatch(OutputCandidates[i], "ˈ[^aeêouûiîüȯė]+(ˈ|$)")
248 | || Regex.IsMatch(OutputCandidates[i], "[aeêouûiîüȯė][^aeêouûiîüȯėˈ]{4,}"))
249 | OutputCandidates.RemoveAt(i);
250 | }
251 |
252 | return OutputCandidates;
253 | }
254 |
255 | // insertion of hidden /i/ vowel: for every pair of adjacent non-vowels, emits variants with and without an "i" between them
256 | // e.g. brd => bird, brid, birid
257 | private static List<string> iInsertion(List<string> Cands)
258 | {
259 | var Candidates = new List<string>();
260 | for (int i = 0; i < Cands.Count; i++)
261 | {
262 | var ThisCand = new List<string>();
263 | if (!string.IsNullOrEmpty(Cands[i]))
264 | {
265 | ThisCand.Add(Cands[i][0].ToString());
266 | for (int j = 1; j < Cands[i].Length; j++)
267 | {
268 | var Count = ThisCand.Count;
269 | var TempList = new List<string>();
270 | foreach (var item in ThisCand)
271 | TempList.Add(item);
272 | ThisCand.Clear();
273 | for (int k = 0; k < Count; k++)
274 | {
275 | ThisCand.Add(TempList[k] + Cands[i][j]); // variant without insertion
276 | if (Regex.IsMatch(Cands[i].Substring(j - 1, 2), @"[^aeêouûiîüȯė][^aeêouûiîüȯė]"))
277 | ThisCand.Add(TempList[k] + "i" + Cands[i][j]); // variant with "i" splitting the cluster
278 | }
279 | }
280 | }
281 | else
282 | ThisCand.Add(Cands[i]); // null or empty candidates are passed through unchanged
283 | foreach (var item in ThisCand)
284 | Candidates.Add(item);
285 |
286 | }
287 | return Candidates;
288 | }
289 |
290 | // Syllabification of candidates: marks syllable onsets with ˈ in place and appends alternative splits to the same list
291 | // e.g. dexom => ˈdeˈxom
292 | private static List<string> Syllabification(List<string> Candidates)
293 | {
294 | var cCount = Candidates.Count; // fixed up front: candidates appended below are not processed again
295 | for (int i = 0; i < cCount; i++)
296 | {
297 | // Onset C(C)V
298 | Candidates[i] = Regex.Replace(Candidates[i],
299 | "([^aeêouûiîȯėwy][wy]|[^aeêouûiîȯė])([aeêouûiîȯė])", "ˈ$1$2");
300 | // if no ˈ at beginning (grˈtin => ˈgrˈtin)
301 | Candidates[i] = Regex.Replace(Candidates[i],
302 | "^([^ˈ])", "ˈ$1");
303 | // add candidate ( 'be'sye => + 'bes'ye)
304 | if (Regex.IsMatch(Candidates[i], "[aeêouûiîȯė][^aeêouûiîȯė]?ˈ[^aeêouûiîȯėwy][wy]"))
305 | Candidates.Add(Regex.Replace(Candidates[i], "([aeêouûiîȯė][^aeêouûiîȯė]?)ˈ([^aeêouûiîȯėwy])([wy])", "$1$2ˈ$3"));
306 | }
307 | return Candidates;
308 | }
309 |
310 | // EVAL: specifies a penalty number for each syllabified candidate
311 | private static Dictionary EVAL(List Candidates)
312 | {
313 | var output = new Dictionary();
314 | if (Candidates.Count > 0)
315 | {
316 | var Penalty = new Dictionary();
317 | for (int i = 0; i < Candidates.Count; i++)
318 | {
319 | var P = 0;
320 | // ================= types of penalties ============
321 | // Complex Onset
322 | P += Regex.Matches(Candidates[i], "ˈ([^aeêouûiîȯėˈ]{2,}[wy]|[^aeêouûiîȯėˈ]+[^wy])[aeêouûiîȯė]").Count * 20;
323 |
324 | // Complex Coda
325 | if (Candidates[i] != "ˈpoynt")
326 | P += Regex.Matches(Candidates[i], "[aeêouûiîȯė][^aeêouûiîȯėˈ]{3}").Count * 10;
327 |
328 | P += Regex.Matches(Candidates[i], "[^aeêouûiîȯėˈ][wy][aeêouûiîȯė][wy][^aeêouûiîȯėˈ]").Count * 20;
329 |
330 | // SSP: ascending Sonority in coda
331 | var codas = Regex.Matches(Candidates[i], "(?<=[aeêouûiîȯė])[^aeêouûiîȯėˈ]{2,}");
332 | foreach (var coda in codas)
333 | {
334 | var chars = coda.ToString();
335 | for (int j = 0; j < chars.Length - 1; j++)
336 | if (SonorityIndex(chars[j]) <= SonorityIndex(chars[j + 1]))
337 | P += 10;
338 | }
339 | // DEP: i insertion
340 | P += Regex.Matches(Candidates[i], "i").Count * 2;
341 | //===========================
342 |
343 | P += Regex.Matches(Candidates[i], "kˈr").Count * 3;
344 |
345 | // ('kurd'si'tan => 'kur'dis'tan)
346 | P += Regex.Matches(Candidates[i], "[^aeêouûiîȯėˈ]ˈsiˈtaˈ?n").Count * 3;
347 |
348 | //"(kewt|newt|ḧewt|rext|sext|dest|pest|řast|mest|pişt|wîst|hest|bîst|heşt|şest)"
349 | // suffix /it/ and /im/ ('sert => 'se'rit) ('xewt !! 'xe'wit / 'xewt)
350 | if (!Regex.IsMatch(Candidates[i],
351 | "(rift|neft|kurt|girt|xirt|germ|term|port)"))
352 | P += Regex.Matches(Candidates[i], "[aeêouûiîȯė]([^aeêouûiîyȯėˈ]m|[^aeêouûiîysşxwˈ]t)$").Count * 3;
353 |
354 | // (ˈdyu/ => ˈdîw) and (ˈkwiř => ˈkuř)
355 | P += Regex.Matches(Candidates[i], "yu").Count * 5;
356 | P += Regex.Matches(Candidates[i], "uy").Count * 5;
357 | P += Regex.Matches(Candidates[i], "yi").Count * 5;
358 | P += Regex.Matches(Candidates[i], "iˈ?y").Count * 5; // bes'ti'yan
359 | P += Regex.Matches(Candidates[i], "wu").Count * 5;
360 | P += Regex.Matches(Candidates[i], "uˈ?w").Count * 2; // 'bi'bu'wî
361 | P += Regex.Matches(Candidates[i], "wi").Count * 2;
362 | P += Regex.Matches(Candidates[i], "iw").Count * 2;
363 | P += Regex.Matches(Candidates[i], "wû").Count * 5;
364 |
365 | // ˈdiˈrêˈjayˈyî => ˈdiˈrêˈjaˈyîy (not heyyî and teyyî)
366 | // ˈdiˈrêjˈyî => ˈdiˈrêˈjîy
367 | // (NOT ˈḧeyˈyî teyˈyî")
368 | P += Regex.Matches(Candidates[i], "[^aeêouûiîȯė]ˈyî").Count * 3;
369 |
370 | // [CV]'CyV => [CV]C'yV (ˈdiˈrêˈjyî => ˈdiˈrêˈjîy) ('bes'tye'tî => 'best'ye'tî)
371 | P += Regex.Matches(Candidates[i], "(? CC'yV (bir'dyan => bird'yan) ˈswênˈdyan
374 | P += Regex.Matches(Candidates[i], "[^aeêouûiî]ˈ[^aeêouûiî][y][aeêouûî]").Count * 2;
375 |
376 | // twîˈwur => tu'yûr
377 | P += Regex.Matches(Candidates[i], "[^aeêouûiî]wîˈw").Count * 3;
378 | //===========================
379 | // Cix (řê'kix'raw => řêk'xi'raw
380 | P += Regex.Matches(Candidates[i], "[^aeêouûiî]ixˈ").Count * 2;
381 |
382 | // ^'hełC' => ^'heł'C
383 | P += Regex.Matches(Candidates[i], "^ˈhe(ł[^aeêouûiîˈ]ˈ|ˈłi)").Count * 3;
384 |
385 | // (he'jarn => 'he'ja'rin)
386 | P += Regex.Matches(Candidates[i], "rn").Count * 5;
387 |
388 | // ('xawn => 'xa'win) ('pyawn => pya'win)
389 | P += Regex.Matches(Candidates[i], "[aêoûî][w][^aeêouûiîˈ]").Count * 5;
390 | //===========================
391 |
392 | // ('lab'ri'di'nî => 'la'bir'di'nî)
393 | P += Regex.Matches(Candidates[i], "[aeêouûiî][^aeêouûiîˈ]ˈriˈ").Count * 5;
394 | //
395 | // 'ser'nic, 'dek'rid, gir'fit => 'se'rinc, 'de'kird, 'gi'rift (NOT gir'tin)
396 | var pat = Regex.Match(Candidates[i], "([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])i([^aeêouûiîˈ])");
397 | if (pat.Success)
398 | {
399 | var C = Regex.Replace(pat.Value, "[iˈ]", "");
400 | if (SonorityIndex(C[1]) > SonorityIndex(C[2]))
401 | P += 3; //
402 | }
403 | // ('sern'cê => 'se'rin'cê)
404 | pat = Regex.Match(Candidates[i], "([^aeêouûiîˈ])([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])");
405 | if (pat.Success)
406 | {
407 | var C = Regex.Replace(pat.Value, "[iˈ]", "");
408 | if (SonorityIndex(C[0]) > SonorityIndex(C[1]))
409 | P += 3;
410 | }
411 | // ('ser'ni'cê => 'se'rin'cê)
412 | pat = Regex.Match(Candidates[i], "([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])iˈ([^aeêouûiîˈ])");
413 | if (pat.Success)
414 | {
415 | var C = Regex.Replace(pat.Value, "[iˈ]", "");
416 | if (SonorityIndex(C[0]) > SonorityIndex(C[1]) && SonorityIndex(C[1]) > SonorityIndex(C[2]))
417 | P += 3;
418 | }
419 | // ('gi'rit'nê => 'gir'ti'nê) ('ku'şit'ne => 'kuş'ti'ne)
420 | pat = Regex.Match(Candidates[i], "[aeêouûiî]ˈ([^aeêouûiîˈ])i([^aeêouûiîˈ])ˈ([^aeêouûiîˈ])");
421 | if (pat.Success)
422 | {
423 | var C = Regex.Replace(pat.Value, "[aeêouûiîˈ]", "");
424 | if (SonorityIndex(C[2]) >= SonorityIndex(C[1]))
425 | P += 3;
426 | }
427 | Penalty.Add(Candidates[i], P);
428 | }
429 | output = Penalty.OrderBy(x => x.Value).ToDictionary(x => x.Key, x => x.Value);
430 | }
431 | return output;
432 | }
433 |
434 | // Phoneme ranking required by the Sonority Sequencing Principle checks in EVAL (higher = more sonorous)
435 | private static int SonorityIndex(char ch)
436 | {
437 | var phoneme = ch.ToString();
438 | // character classes are probed top-down; the first one that matches decides the rank
439 | var ranking = new (string Pattern, int Rank)[]
440 | {
441 | ("[wy]", 6), // Approximant
442 | ("[lłrř]", 5), // lateral
443 | ("[mn]", 4), // nasal
444 | ("[fvszşjxẍƹḧh]", 3), // fricative
445 | ("[cç]", 2), // affricate
446 | };
447 | foreach (var entry in ranking)
448 | if (Regex.IsMatch(phoneme, entry.Pattern)) return entry.Rank;
449 | return 1; // stop
450 | }
451 |
452 | /// <summary>Only for tests: normalizes the grapheme, generates its candidates and returns each one with the penalty assigned by EVAL.</summary>
453 | public static Dictionary<string, int> AllCandidates(string grapheme)
454 | {
455 | return EVAL(Generator(g2pNormalize(grapheme)));
456 | }
457 | }
458 | }
--------------------------------------------------------------------------------
/Normalize.cs:
--------------------------------------------------------------------------------
1 | // Automated Kurdish Text Normalization خاوێن کردنی ئۆتۆماتیکی دەقی کوردی
2 | // Copyright (C) 2019 Aso Mahmudi, Hadi Veisi, Mohammad MohammadAmini, Hawre Hosseini
3 | // Developer and Maintainer: Aso Mahmudi (aso.mehmudi@gmail.com)
4 |
5 | // Source Code: https://github.com/AsoSoft/AsoSoft-Library
6 | // Paper: https://www.researchgate.net/publication/333729065
7 | // Cite:
8 | // @inproceedings{mahmudi2019automated,
9 | // title={Automated Kurdish Text Normalization},
10 | // author={Mahmudi, Aso and Veisi, Hadi and MohammadAmini, Mohammad and Hosseini, Hawre},
11 | // booktitle={The Second International Conference on Kurdish and Persian Languages and Literature},
12 | // year={2019}
13 | // }
14 |
15 | using System.Collections.Generic;
16 | using System.Text;
17 | using System.Text.RegularExpressions;
18 |
19 | namespace AsoSoftLibrary
20 | {
21 | public static partial class AsoSoft
22 | {
23 |
24 | static Dictionary DeepReplacements = LoadNormalizerReplaces(resFiles.NormalizerDeep);
25 | static Dictionary additionalReplacements = LoadNormalizerReplaces(resFiles.NormalizerAdditional);
26 |
27 | // ================= Converting Non-Standard Fonts =================
28 |
29 | /// <summary>Converts Kurdish text written in AliK fonts into Unicode standard, using the "AliK2Unicode" replacement list.</summary>
30 | public static string AliK2Unicode(string text) => replaceByList(text, normalizationReplaces["AliK2Unicode"]);
31 |
32 | /// <summary>Converts Kurdish text written in AliWeb fonts into Unicode standard, using the "AliWeb2Unicode" replacement list.</summary>
33 | public static string AliWeb2Unicode(string text) => replaceByList(text, normalizationReplaces["AliWeb2Unicode"]);
34 |
35 | /// <summary>Converts Kurdish text written in KDylan fonts into Unicode standard, using the "Dylan2Unicode" replacement list.</summary>
36 | public static string Dylan2Unicode(string text) => replaceByList(text, normalizationReplaces["Dylan2Unicode"]);
37 |
38 | /// <summary>Converts Kurdish text written in Zarnegar fonts into Unicode standard, using the "Zarnegar2Unicode" replacement list.</summary>
39 | public static string Zarnegar2Unicode(string text) => replaceByList(text, normalizationReplaces["Zarnegar2Unicode"]);
40 |
41 | static string Ku = "ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنوۆەهھیێأإآثذصضطظكيىةڎۊؤ"
42 | + "\u064B-\u065F"; // Haraka
43 | static string joiners = "ئبپتثجچحخسشصضطظعغفڤقکكگلڵمنیيهھێ";
44 | private static readonly Dictionary> normalizationReplaces = new Dictionary>
45 | {
46 | {"NormalizeKurdish1", new List() {
47 | //========= Tatweels (U+0640)
48 | "\u0640{2,}", "\u0640", // merge
49 | $"(?<=[{joiners}])\u0640(?=[{Ku}])", "", // delete unnecessary tatweel e.g. هـا to ها
50 | // replace tatweel nonadjacent to Kurdish letters with dash
51 | $"(?<=[{joiners}])\u0640", "\uF640", // temporal preserve
52 | $"\u0640(?=[{Ku}])", "\uF640", // temporal preserve
53 | "\u0640", "-",
54 | "\uF640", "\u0640",
55 |
56 | //========= Zero-Width Non-Joiner
57 | "[\uFEFF\u200C]+", "\u200C", //Standardize and remove dublicated ZWNJ
58 | // remove unnecessary ZWNJ
59 | "\u200C(?=(\\s|\\p{P}|$))", "", // ZWNJ + white spaces
60 | $"(? ماهـ
65 | $"(?() {
68 | //========= standard H, E, Y, K
69 | "ه" + "\u200C", "ە", // Heh+ZWNJ => kurdish AE
70 | "ه" + "(?=([^" + Ku +"ـ]|$))", "ە", //final Heh looks like Ae
71 | "ھ" + "(?=([^" + Ku +"]|$))", "هـ", // final Heh Doachashmee
72 | "ھ" , "ه", // non-final Heh Doachashmee
73 | "ى|ي", "ی", // Alef maksura | Arabic Ye => Farsi ye
74 | "ك", "ک", // Arabic Kaf => Farsi Ke
75 | "\u200C" + "و ", " و ", // شوێنو جێ => شوێن و جێ
76 | //"\u200C" + "دا" + "(?)", "دا", // شوێندا => شوێندا
77 | //"(? بێ شوێن
78 |
79 | //========= errors from font conversion
80 | "لاَ|لاً|لأ", "ڵا",
81 | "(ی|ێ)" + "[\u064E\u064B]+", "ێ", //FATHA & FATHATAN
82 | "(و|ۆ)" + "[\u064E\u064B]+", "ۆ",
83 | "(ل|ڵ)" + "[\u064E\u064B]+", "ڵ",
84 | "(ر|ڕ)" + "\u0650+", "ڕ", //KASRA
85 | }},
86 | {"NormalizeKurdish3", new List() {
87 | "(?() {
92 | "لاَ|لآ|لاً", "ڵا",
93 | "لً|لَ|لأ", "ڵ",
94 | "ة", "ە",
95 | "ه" + "(?!([ئابپتجچحخدرڕزژسشعغفڤقکگلڵمنوۆەهھیێأإآثذصضطظكيىةڎۊؤ]|$))", "هـ",
96 | "ض", "چ",
97 | "ث", "پ",
98 | "ظ", "ڤ",
99 | "ط", "گ",
100 | "ك", "ک",
101 | "ىَ|يَ|یَ|آ", "ێ",
102 | "رِ", "ڕ",
103 | "ؤ|وَ", "ۆ",
104 | "ي|ى", "ی",
105 | "ء", "\u200Cو",
106 | "ِ", "",
107 | "ذ", "ژ"
108 | }},
109 | {"AliWeb2Unicode", new List() {
110 | "لاَ|لآ|لاً", "ڵا",
111 | "لَ|پ", "ڵ",
112 | "ة", "ە",
113 | "ه", "ھ",
114 | "ه", "ھ",
115 | "رِ|أ", "ڕ",
116 | "ؤ|وَ", "ۆ",
117 | "يَ|یَ", "ێ",
118 | "ص", "ێ",
119 | "ي", "ی",
120 | "ط", "ڭ", //swap ط and گ
121 | "گ", "ط", //
122 | "ڭ", "گ", //
123 | "ض", "چ",
124 | "ث", "پ",
125 | "ظ", "ڤ",
126 | "ْ|ُ", "",
127 | "ى", "*",
128 | "ك", "ک",
129 | "ذ", "ژ"
130 | }},
131 | {"Dylan2Unicode", new List() {
132 | "لإ|لأ|لآ", "ڵا",
133 | "ؤ|وَ", "ۆ",
134 | "ة", "ە",
135 | "ض", "ڤ",
136 | "ص", "ڵ",
137 | "ث", "ێ",
138 | "ؤ", "ۆ",
139 | "ه", "ھ",
140 | "ك", "ک",
141 | "ي|ى", "ی",
142 | "ذ", "ڕ"
143 | }},
144 | {"Zarnegar2Unicode", new List() {
145 | "لاٌ", "ڵا",
146 | "ى|ي", "ی",
147 | "یٌ", "ێ",
148 | "ه", "ە",
149 | "لٌ", "ڵ",
150 | "رٍ", "ڕ",
151 | "وٌ", "ۆ"
152 | }},
153 | {"SeperateDigits", new List() {
154 | "(?() {
159 | "\\(\\(", "«",
160 | "\\)\\)", "»",
161 | "»", "\uF8FA", // temp replacement «x»eke
162 | "\\)", "\uF8FB", //temp replacement
163 | "([!.:;?،؛؟]+)(\\p{Pi})", "$1 $2",
164 | "(\\p{P}+)(?![\\s\\p{P}])", "$1 ", // Seprate all punctuations
165 | "\uF8FA", "»", // undo temp replacement
166 | "\uF8FB", ")", // undo temp replacement
167 | "(?() {
173 | " ((\\p{Pe}|\\p{Pf})+)", "$1", // A ) B => A) B
174 | "((\\p{Ps}|\\p{Pi})+) ", "$1", // A ( B => A (B
175 | " ([!.:;?،؛؟]+)", "$1", // A ! => A!
176 | }},
177 | {"NormalizePunctuations3", new List() {
178 | "(? A " B
179 | "(\uF8FD)(?![ \\t\\p{P}])", "$1 ", // A "B => A " B
180 | }}
181 | };
182 |         /// <summary>Applies a flat list of (regex pattern, replacement) pairs to the text, one pair after another in list order.</summary>
183 |         private static string replaceByList(string text, List<string> replaceList)
184 |         {
185 |             for (int i = 0; i < replaceList.Count; i += 2) // even index = pattern, odd index = its replacement
186 |                 text = Regex.Replace(text, replaceList[i], replaceList[i + 1]);
187 |             return text;
188 |         }
189 | // ================= Normalization =================
190 |         private static Dictionary<char, string> LoadNormalizerReplaces(string file)
191 |         {
192 |             // Parses rows "hex code point,space-separated hex code points" into a map old char -> replacement string.
193 |             var output = new Dictionary<char, string>();
194 |             var items = file.Trim().Split('\n');
195 |             for (int i = 1; i < items.Length; i++) // the first row is skipped (presumably the CSV header)
196 |             {
197 |                 var item = items[i].Trim().Split(','); // Trim() drops the '\r' left behind by CRLF line endings
198 |                 if (item.Length < 2 || item[0] == "") continue; // blank or incomplete row
199 |                 var chOld = System.Convert.ToChar(System.Convert.ToUInt32(item[0], 16));
200 |                 var chNew = "";
201 |                 foreach (var ch in item[1].Split(' '))
202 |                     if (ch != "")
203 |                         chNew += System.Convert.ToChar(System.Convert.ToUInt32(ch, 16));
204 |                 if (!output.ContainsKey(chOld)) output.Add(chOld, chNew); // the first mapping of a character wins
205 |             }
206 |             return output;
207 |         }
208 |
209 |         /// <summary>Unicode Normalization for Central Kurdish: calls the main overload with every option enabled and an empty user replacement list.</summary>
210 |         public static string Normalize(string text)
211 |         {
212 |             return Normalize(text, true, true, true, true, new Dictionary<char, string>());
213 |         }
214 |
215 |         /// <summary>Main Unicode Normalization for Central Kurdish: character-level replacements first, then the regex rule lists.</summary>
216 |         public static string Normalize(string text,
217 |             bool isOnlyKurdish,                 // also run the Kurdish-only rule list "NormalizeKurdish2"
218 |             bool changeInitialR,                // with isOnlyKurdish: also run "NormalizeKurdish3"
219 |             bool deepUnicodeCorrectios,         // use the DeepReplacements table
220 |             bool additionalUnicodeCorrections,  // use the additionalReplacements table
221 |             Dictionary<char, string> usersReplaceList) // caller-supplied replacements (lowest priority); null is treated as empty
222 |         {
223 |             var replaces = new Dictionary<char, string>();
224 |             // Distinct characters of the text in order of first appearance; the set gives O(1) membership tests.
225 |             var CharList = new List<char>();
226 |             var charSet = new HashSet<char>();
227 |             foreach (var c in text)
228 |                 if (charSet.Add(c))
229 |                     CharList.Add(c);
230 |
231 |             if (deepUnicodeCorrectios)
232 |                 foreach (var item in DeepReplacements)
233 |                     if (charSet.Contains(item.Key))
234 |                         replaces.Add(item.Key, item.Value);
235 |             if (additionalUnicodeCorrections)
236 |                 foreach (var item in additionalReplacements)
237 |                     if (charSet.Contains(item.Key) && !replaces.ContainsKey(item.Key))
238 |                         replaces.Add(item.Key, item.Value);
239 |             if (usersReplaceList != null)
240 |                 foreach (var item in usersReplaceList)
241 |                     if (charSet.Contains(item.Key) && !replaces.ContainsKey(item.Key))
242 |                         replaces.Add(item.Key, item.Value);
243 |
244 |             foreach (var ch in CharList)
245 |             {
246 |                 if (replaces.TryGetValue(ch, out var replacement)) //ReplaceList
247 |                     text = text.Replace(ch.ToString(), replacement);
248 |                 else if (ch >= '\uE000' && ch <= '\uF8FF') //Private Use Area
249 |                     text = text.Replace(ch, '□'); // u25A1 White Square
250 |             }
251 |             text = replaceByList(text, normalizationReplaces["NormalizeKurdish1"]);
252 |             // if the text is Monolingual (only Central Kurdish)
253 |             if (isOnlyKurdish)
254 |             {
255 |                 text = replaceByList(text, normalizationReplaces["NormalizeKurdish2"]);
256 |                 //========= Initial r
257 |                 if (changeInitialR)
258 |                     text = replaceByList(text, normalizationReplaces["NormalizeKurdish3"]);
259 |             }
260 |             return text;
261 |         }
262 |
263 | // ===== Unifying Numerals =====
264 | private static readonly string[] digits = new string[]{
265 | "۰", "٠", "0",
266 | "۱", "١", "1",
267 | "۲", "٢", "2",
268 | "۳", "٣", "3",
269 | "۴", "٤", "4",
270 | "۵", "٥", "5",
271 | "۶", "٦", "6",
272 | "۷", "٧", "7",
273 | "۸", "٨", "8",
274 | "۹", "٩", "9", };
275 |
276 |         /// <summary>Unifies numeral characters into the desired numeral type: "en" (0123456789) or "ar" (٠١٢٣٤٥٦٧٨٩); any other value leaves the text unchanged.</summary>
277 |         public static string UnifyNumerals(string text, string NumeralType)
278 |         {
279 |             // digits is read in triples; offset 2 is the form written for "en", offset 1 the form written for "ar".
280 |             int target = NumeralType == "en" ? 2 : NumeralType == "ar" ? 1 : -1;
281 |             if (target < 0)
282 |                 return text;
283 |             int other = 3 - target; // the remaining offset (1 or 2) that is rewritten together with offset 0
284 |             for (int row = 0; row < digits.Length; row += 3)
285 |                 text = Regex.Replace(text, digits[row] + "|" + digits[row + other], digits[row + target]);
286 |             return text;
287 |         }
288 |
289 | /// <summary>Separates digits from words (e.g. replacing "12a" with "12 a") by applying the "SeperateDigits" replacement list.</summary>
290 | public static string SeperateDigits(string text) => replaceByList(text, normalizationReplaces["SeperateDigits"]);
291 |
292 |         /// <summary>Normalizes punctuation: runs rule list 1, then list 3 when seprateAllPunctuations is true or list 2 otherwise.</summary>
293 |         public static string NormalizePunctuations(string text, bool seprateAllPunctuations)
294 |         {
295 |             const char quoteStandIn = '\uF8FD'; // temporary replacement for '"' while the rule lists run
296 |             var followUp = seprateAllPunctuations
297 |                 ? "NormalizePunctuations3"
298 |                 : "NormalizePunctuations2";
299 |             var work = text.Replace('"', quoteStandIn);
300 |             work = replaceByList(work, normalizationReplaces["NormalizePunctuations1"]);
301 |             work = replaceByList(work, normalizationReplaces[followUp]);
302 |             return work.Replace(quoteStandIn, '"'); // undo the temporary replacement
303 |         }
304 |
305 |
306 |         /// <summary>Trims white space and zero-width characters (U+200B, U+200C, U+FEFF) from both ends of a line.</summary>
307 |         public static string TrimLine(string line)
308 |         {
309 |             // One character class per end strips mixed runs such as "x \u200C \u200C" completely;
310 |             // alternating Trim() with separate zero-width passes used to leave "x \u200C" behind.
311 |             return Regex.Replace(line, "^[\\s\u200B\u200C\uFEFF]+|[\\s\u200B\u200C\uFEFF]+$", "");
312 |         }
313 |
314 |         /// <summary>HTML entity replacement for web crawled texts: decodes named (&amp;eacute;), decimal (&amp;#233;) and hexadecimal (&amp;#xE9;) references, e.g. into "é".</summary>
315 |         public static string ReplaceHtmlEntity(string text)
316 |         {
317 |             return Regex.Replace(text, "&(#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);", m => System.Net.WebUtility.HtmlDecode(m.Value));
318 |         }
319 |
320 |         /// <summary>Replaces e-mail addresses with "EmailAddress" and then URLs with "URL" (improves language models).</summary>
321 |         public static string ReplaceUrlEmail(string text)
322 |         {
323 |             const string emailPattern = "([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+\\.[a-zA-Z]{2,5})";
324 |             const string urlPattern = "((http[s]?|ftp)?://([\\w-]+\\.)+[\\w-]+)(/[\\w-~./?%+&=]*)?";
325 |             return Regex.Replace(Regex.Replace(text, emailPattern, "EmailAddress"), urlPattern, "URL"); // e-mails first, URLs second
326 |         }
327 |
328 |         /// <summary>Character replacement for ANSI CodePage: every character of the text is looked up once in Codepage and replaced when present.</summary>
329 |         public static string Char2CharReplacment(string text, Dictionary<char, char> Codepage)
330 |         {
331 |             // Single pass over the characters: chained entries (a→b, b→c) can no longer cascade and turn 'a' into 'c'.
332 |             var mapped = System.Array.ConvertAll(text.ToCharArray(), c => Codepage.TryGetValue(c, out var to) ? to : c);
333 |             return new string(mapped);
334 |         }
335 |
336 |         /// <summary>Correction table: replaces whole words that are keys of wordReplacements; parts of longer words are left untouched.</summary>
337 |         public static string Word2WordReplacement(string line, Dictionary<string, string> wordReplacements)
338 |         {
339 |             // A "word" is a maximal run of word characters and ZWNJ (U+200C), so only complete words are looked up.
340 |             return Regex.Replace(line, "(?<![\\w\u200C])[\\w\u200C]+(?![\\w\u200C])", m => wordReplacements.TryGetValue(m.Value, out var corrected) ? corrected : m.Value);
341 |         }
342 |
343 | //================= have to be improved: =================
344 |
345 |         /// <summary>Deletes non-CK lines: returns "" when the share of Kurdish-specific letters is below KurdishRateThreshold percent (fast but not accurate; a language detector is needed).</summary>
346 |         public static string DeleteNonKurdish(string line, int KurdishRateThreshold)
347 |         {
348 |             int kurdishLetters = Regex.Matches(line, "[پچژگڵۆڕێڤەھ]").Count;
349 |             float share = kurdishLetters / (float)line.Length; // NaN for an empty line; the comparison below is then false
350 |             bool belowThreshold = share < KurdishRateThreshold / 100.0;
351 |             return belowThreshold ? "" : line;
352 |         }
353 |
354 |         /// <summary>Embraces sentences with start/end tags: for tag "s" every sentence is wrapped as &lt;s&gt;...&lt;/s&gt;.</summary>
355 |         public static string MarkSentence(string line, string sentenceTag)
356 |         {
357 |             var tagStart = "<" + sentenceTag + ">";
358 |             var tagEnd = "</" + sentenceTag + ">"; // the closing tag needs its "</" prefix
359 |
360 |             // ending punctuations !?؟ (not at the very end of the line, which is closed below)
361 |             line = Regex.Replace(line.TrimEnd(), "([!?؟]+)(?!$)", "$1 " + tagEnd + tagStart);
362 |             // full stop after a word of two or more characters, not followed by a digit, Latin letter or another dot
363 |             line = Regex.Replace(line, "([\\w\u200C]{2,} ?\\.)(?!([0-9a-zA-Z.]|$))", "$1 " + tagEnd + tagStart);
364 |
365 |             return tagStart + line + tagEnd;
366 |         }
367 | }
368 | }
369 |
370 | // ================= Regex Hints =================
371 | // docs.microsoft.com/en-us/dotnet/standard/base-types/character-classes-in-regular-expressions
372 | // Lookbehind Positive: (?<=a)b
373 | // Lookbehind Negative: (?<!a)b
13 | /// <summary>converts numerals into Central Kurdish words. It is useful in text-to-speech tools.</summary>
14 | public static string Number2Word(string text)
15 | {
16 | // convert numbers to latin
17 | var unifyNumbers = new string[]{
18 | "٠|۰", "0",
19 | "١|۱", "1",
20 | "٢|۲", "2",
21 | "٣|۳", "3",
22 | "٤|۴", "4",
23 | "٥|۵", "5",
24 | "٦|۶", "6",
25 | "٧|۷", "7",
26 | "٨|۸", "8",
27 | "٩|۹", "9" };
28 | for (int i = 0; i < unifyNumbers.Length; i += 2)
29 | text = Regex.Replace(text, unifyNumbers[i], unifyNumbers[i + 1]);
30 |
31 | text = Regex.Replace(text, "([0-9]{1,3})[,،](?=[0-9]{3})", "$1"); // remove thousend seperator 12,345,678 => 12345678
32 | text = Regex.Replace(text, "(? floatName(m.Groups[1].Value.ToString(), m.Groups[2].Value.ToString()));
42 |
43 | //convert remaining integr numbers
44 | text = Regex.Replace(text, "([0-9]+)",
45 | m => integerName(m.Groups[1].Value.ToString()));
46 |
47 | return text;
48 | }
49 |         /// <summary>Spells a decimal number: the integer part, " پۆینت ", every leading zero of the fraction as " سفر ", then the rest of the fraction read as an integer.</summary>
50 |         private static string floatName(string integerPart, string decimalPart)
51 |         {
52 |             var point = " پۆینت " + Regex.Replace(decimalPart, "(?<=^|0)0", " سفر ");
53 |             point = Regex.Replace(point, "[0-9]", "");
54 |             return integerName(integerPart) + point + integerName(decimalPart.TrimStart('0')); // leading zeros are already spoken; a fraction "0" used to give "سفر" twice
55 |         }
56 |         /// <summary>Spells a non-negative integer given as a string of ASCII digits in Central Kurdish words.</summary>
57 |         private static string integerName(string inputInteger)
58 |         {
59 |             if (inputInteger == "0") // if input number = 0
60 |                 return "سفر";
61 |             string[] ones = { "", "یەک", "دوو", "سێ", "چوار", "پێنج", "شەش", "حەوت", "هەشت", "نۆ" };
62 |             string[] teens = { "دە", "یازدە", "دوازدە", "سێزدە", "چواردە", "پازدە", "شازدە", "حەڤدە", "هەژدە", "نۆزدە" };
63 |             string[] tens = { "", "", "بیست", "سی", "چل", "پەنجا", "شەست", "هەفتا", "هەشتا", "نەوەد" };
64 |             string[] hundreds = { "", "سەد", "دووسەد", "سێسەد", "چوارسەد", "پێنسەد", "شەشسەد", "حەوتسەد", "هەشتسەد", "نۆسەد" };
65 |             string[] thousands = { "", " هەزار", " ملیۆن", " ملیار", " بلیۆن", " بلیار", " تریلیۆن", " تریلیار", " کوادرلیۆن" };
66 |             var output = "";
67 |             var rest = inputInteger;
68 |             // Consume the digits from the right, three at a time: group 0 = units, 1 = thousands, 2 = millions, ...
69 |             for (int group = 0; rest.Length > 0; group++)
70 |             {
71 |                 var cut = Math.Max(0, rest.Length - 3);
72 |                 var three = rest.Substring(cut).PadLeft(3, '0');
73 |                 rest = rest.Substring(0, cut);
74 |                 int C = three[0] - '0'; // hundreds digit
75 |                 int X = three[1] - '0'; // tens digit
76 |                 int I = three[2] - '0'; // ones digit
77 |                 var conjunction1 = (C != 0 && (X != 0 || I != 0)) ? " و " : "";
78 |                 var conjunction2 = (X != 0 && I != 0) ? " و " : "";
79 |                 var name = (X == 1)
80 |                     ? hundreds[C] + conjunction1 + teens[I]
81 |                     : hundreds[C] + conjunction1 + tens[X] + conjunction2 + ones[I];
82 |                 if (name == "")
83 |                     continue; // an all-zero group contributes nothing
84 |                 // Exactly one thousand is read "هەزار" instead of "یەک هەزار". Deciding this per group keeps the
85 |                 // "یەک" of 21,000 or 101,000, which a text replacement on the finished string would delete.
86 |                 if (group == 1 && C == 0 && X == 0 && I == 1)
87 |                     name = "هەزار";
88 |                 else
89 |                     name += thousands[group];
90 |                 output = (output == "") ? name : name + " و " + output;
91 |             }
92 |             return output;
93 |         }
94 | }
95 | }
--------------------------------------------------------------------------------
/PoemClassifier.cs:
--------------------------------------------------------------------------------
1 | // Automatic Meter Classification of Kurdish Poems
2 | // Copyright (C) 2019 Aso Mahmudi, Hadi Veisi
3 | // Maintainer: Aso Mahmudi (aso.mehmudi@gmail.com)
4 | // Demo: https://asosoft.github.io/poem/
5 | // Source Code: https://github.com/AsoSoft/AsoSoft-Library
6 | // Test-set: https://github.com/AsoSoft/Vejinbooks-Poem-Dataset
7 | // Paper: https://arxiv.org/abs/2102.12109
8 | // Cite:
9 | //@article{mahmudi2021automatic,
10 | // title={Automatic Meter Classification of Kurdish Poems},
11 | // author={Mahmudi, Aso and Veisi, Hadi},
12 | // journal={arXiv preprint arXiv: 2102.12109},
13 | // year={2021}
14 | //}
15 |
16 | using System;
17 | using System.Collections.Generic;
18 | using System.Linq;
19 | using System.Text.RegularExpressions;
20 |
21 | namespace AsoSoftLibrary
22 | {
23 |
24 | /// <summary>One common meter pattern; loadPoemPatterns fills freq, weights and title from the three columns of a PoemPatterns row.</summary>
25 | public class Pattern
26 | {
27 | public int freq { get; set; }
28 | public string weights { get; set; } // compared with the scanned candidates by Levenshtein in PatternMatch
29 | public string title { get; set; } // reported as ResultSet.quantitative for the best-scoring pattern
30 | }
31 |
32 | /// <summary>A scansion candidate of one hemistich together with the meter pattern it was matched to (built in PatternMatch).</summary>
33 | public class ScannedHemistich
34 | {
35 | public int lineNo { get; set; } // index of the hemistich in the input array
36 | public string scanned { get; set; } // the candidate weight string
37 | public int meterID { get; set; } // index of the matched pattern in CommonPatterns
38 | public int dist { get; set; } // Levenshtein distance between scanned and the pattern's weights
39 | }
40 |
41 |     /// <summary>Result of PoemClassification: syllabic and quantitative findings, the overall verdict and per-hemistich details.</summary>
42 |     public class ResultSet
43 |     {
44 |         public int syllabic { get; set; }                   // most frequent syllable count per hemistich
45 |         public double syllabicConfidence { get; set; }      // percentage of hemistiches having that count
46 |         public string quantitative { get; set; }            // title of the best-scoring common pattern
47 |         public double quantitativeConfidence { get; set; }  // score of that pattern, in percent
48 |         public string overalPattern { get; set; }           // set only for quantitative or syllabic poems
49 |         public string overalMeterType { get; set; }         // free verse, quantitative or syllabic; null when undecided
50 |         public List<ScannedHemistich> details { get; set; } // one entry per input hemistich
51 |     }
52 |
53 | public static partial class AsoSoft
54 | {
55 |         /// <summary>Common patterns of Kurdish quantitative verses (VejinBooks corpus, up to 2019/12/1); filled by loadPoemPatterns on first use.</summary>
56 |         public static List<Pattern> CommonPatterns = new List<Pattern>();
57 |
58 |         private static void loadPoemPatterns()
59 |         {
60 |             // The first row is skipped (presumably the CSV header). Trim() tolerates a trailing newline and CRLF line endings.
61 |             foreach (var row in resFiles.PoemPatterns.Trim().Split('\n').Skip(1))
62 |             {
63 |                 var item = row.Trim().Split(',');
64 |                 CommonPatterns.Add(new Pattern() { freq = Convert.ToInt32(item[0]), weights = item[1], title = item[2] });
65 |             }
66 |         }
67 |
68 | const int _maxDist = 4;
69 | private static int[] patternScores = new int[27];
70 |
71 |         /// <summary>Classifies the input Kurdish poem; each array item is one syllabified hemistich whose syllables are marked with 'ˈ'.</summary>
72 |         public static ResultSet PoemClassification(string[] sHemistiches)
73 |         {
74 |             if (CommonPatterns.Count == 0)
75 |                 loadPoemPatterns();
76 |             Array.Clear(patternScores, 0, patternScores.Length);
77 |             var output = new ResultSet();
78 |             //===== syllabic analysis
79 |             var syllableCounts = new List<int>();
80 |             for (int i = 0; i < sHemistiches.Length; i++)
81 |             {
82 |                 var sCount = sHemistiches[i].Split('ˈ').Length - 1;
83 |                 if (sCount > 0)
84 |                     syllableCounts.Add(sCount);
85 |             }
86 |             var HemistichesCount = syllableCounts.Count;
87 |             if (HemistichesCount == 0)
88 |             {   // no hemistich carries a syllable mark: First() below would throw on the empty sequence
89 |                 output.details = sHemistiches.Select(h => new ScannedHemistich()).ToList();
90 |                 return output;
91 |             }
92 |             var mode = syllableCounts.GroupBy(x => x).OrderByDescending(y => y.Count()).First().Key;
93 |             output.syllabic = mode;
94 |             output.syllabicConfidence = (double)syllableCounts.Where(x => x == mode).Count() / HemistichesCount * 100;
95 |             //===== quantitative analysis
96 |             var AcceptableCandidates = new List<ScannedHemistich>();
97 |             for (int i = 0; i < sHemistiches.Length; i++)
98 |                 AcceptableCandidates.AddRange(PatternMatch(Convert2CV(sHemistiches[i]), i));
99 |
100 |             var highScore = Array.IndexOf(patternScores, patternScores.Max());
101 |             output.quantitative = CommonPatterns[highScore].title;
102 |             output.quantitativeConfidence = ((double)patternScores[highScore] / _maxDist) / HemistichesCount * 100;
103 |
104 |             //===== final output for each hemistich
105 |             var final = new List<ScannedHemistich>();
106 |             for (int i = 0; i < sHemistiches.Length; i++)
107 |             {
108 |                 var highScoreMatches = AcceptableCandidates
109 |                     .Where(x => x.lineNo == i && x.meterID == highScore);
110 |                 if (highScoreMatches.Count() > 0)
111 |                     final.Add(highScoreMatches.First());
112 |                 else
113 |                     final.Add(new ScannedHemistich()); // no acceptable match for this line
114 |             }
115 |             output.details = final;
116 |
117 |             //===== overall poem classification
118 |             var stdDev = CalculateStdDev(syllableCounts);
119 |             var metricalMargin = (output.syllabic > 10) ? 40 : 50;
120 |             var stdDevMargin = (double)output.syllabic / 10;
121 |             if (stdDev > stdDevMargin)
122 |             {
123 |                 output.overalMeterType = "Free Verse/شیعری نوێ";
124 |             }
125 |             else if (output.quantitativeConfidence >= metricalMargin) // metrical when:
126 |             {
127 |                 output.overalMeterType = "Quantitative/عەرووزی";
128 |                 output.overalPattern = output.quantitative;
129 |             }
130 |             else if (output.syllabicConfidence >= 40 && stdDev < 1) // syllabic when:
131 |             {
132 |                 output.overalMeterType = "Syllabic/بڕگەیی";
133 |                 output.overalPattern = output.syllabic + "Syllabic";
134 |             }
135 |             return output;
136 |         }
137 |
138 |         // input: "ˈgerˈçî ˈtûˈşî ˈřenˈceˈřoˈyîw ˈḧesˈreˈtû ˈderˈdim ˈʔeˈmin "
139 |         // output: List<"∪––––∪–––∪–––∪–", "∪––––∪–––∪––∪∪–">
140 |         private static List<string> Convert2CV(string syllabified)
141 |         {
142 |             if (syllabified.Length > 100) // abort if line is too long
143 |                 syllabified = " ";
144 |             var CV = syllabified;
145 |             CV = Regex.Replace(CV, @"[\[\]«»]", ""); // remove "] ["
146 |             CV = Regex.Replace(CV + "\n", @"[\n\r\?,;! ]+", "¤"); // open junctures (punctuation and end of line) => ¤
147 |             CV = Regex.Replace(CV, @" ˈ¤", "¤");
148 |             CV = Regex.Replace(CV, "îˈye", "iˈye"); // (ˈnîˈye => ˈniˈye)
149 |             CV = Regex.Replace(CV, "([^ieuaêoîûˈ])([yw])", "$1ɰ"); // gyan-gîyan, xiwa-xuwa => – or ∪–
150 |             CV = Regex.Replace(CV, "[bcçdfghḧjklłmnpqrřsşṣtvwxẍyzʔƹ]", "C");
151 |             var syllables = CV.Split('ˈ').Skip(1).ToList();
152 |             var output = new List<string>(); // all scansion candidates built so far
153 |             output.Add("");
154 |             for (int i = 0; i < syllables.Count; i++)
155 |             {
156 |                 var count = output.Count; // ambiguous syllables fork every candidate present before this syllable
157 |                 if (Regex.IsMatch(syllables[i], "ɰ"))
158 |                 { // CVcC(C) syllable (e.g. گیان خوا)
159 |                     for (int j = 0; j < count; j++)
160 |                     {
161 |                         output.Add(output[j] + "–");
162 |                         output[j] += "∪–";
163 |                     }
164 |                 }
165 |                 else if (Regex.IsMatch(syllables[i], "([ieuaêoîû]C+|[aêoû]$|[aêo]¤$)"))
166 |                 { // heavy syllable
167 |                     if (i < 2)
168 |                     { // at first position may be light
169 |                         for (int j = 0; j < count; j++)
170 |                         {
171 |                             output.Add(output[j] + "∪");
172 |                             output[j] += "–";
173 |                         }
174 |                     }
175 |                     else
176 |                         for (int j = 0; j < count; j++)
177 |                             output[j] += "–";
178 |                 }
179 |                 else if (Regex.IsMatch(syllables[i], "([ieu]$|i¤$)"))
180 |                 { // light syllable
181 |                     for (int j = 0; j < count; j++)
182 |                         output[j] += "∪";
183 |                 }
184 |                 else if (Regex.IsMatch(syllables[i], "([euîû]¤$|î$)"))
185 |                 { // may be both
186 |                     for (int j = 0; j < count; j++)
187 |                     {
188 |                         output.Add(output[j] + "∪");
189 |                         output[j] += "–";
190 |                     }
191 |                 }
192 |             }
193 |             return output;
194 |         }
195 |
196 |         // input: List of "∪–"s (scansion candidates of one hemistich) and the index of that hemistich
197 |         // output: for every common meter pattern within _maxDist, the nearest candidate(s)
198 |         private static List<ScannedHemistich> PatternMatch(List<string> cands, int lineNumber)
199 |         {
200 |             if (CommonPatterns.Count == 0)
201 |                 loadPoemPatterns();
202 |             var output = new List<ScannedHemistich>();
203 |             if (string.IsNullOrEmpty(cands[0].Trim()))
204 |                 return output; // empty scansion: nothing to match
205 |             for (int i = 0; i < CommonPatterns.Count; i++)
206 |             { // for each common meter pattern
207 |                 var distances = cands.Select(c => Levenshtein(c, CommonPatterns[i].weights)).ToList();
208 |                 var lowestDist = distances.Min();
209 |                 if (lowestDist > _maxDist)
210 |                     continue;
211 |                 patternScores[i] += _maxDist - lowestDist; // closer matches add more to the pattern's score
212 |                 for (int j = 0; j < cands.Count; j++) // keep every candidate that reaches the lowest distance
213 |                     if (distances[j] == lowestDist)
214 |                         output.Add(new ScannedHemistich()
215 |                         {
216 |                             lineNo = lineNumber,
217 |                             scanned = cands[j],
218 |                             meterID = i,
219 |                             dist = lowestDist
220 |                         });
221 |             }
222 |             return output;
223 |         }
229 |
230 | //==================================================
231 |
232 |         /// <summary>Normalizes the input text for classification steps.</summary>
233 |         public static string PoemNormalization(string text)
234 |         {
235 |             // flat (pattern, replacement) pairs, applied in this order
236 |             string[] rules = { "ط", "ت", "[صث]", "س", "[ضذظ]", "ز", "( و)([.،؟!])", "$1" };
237 |             for (int k = 0; k < rules.Length; k += 2)
238 |                 text = Regex.Replace(text, rules[k], rules[k + 1]);
239 |             return text;
240 |         }
241 |
242 |         private static double CalculateStdDev(List<int> values)
243 |         {
244 |             // Sample standard deviation (n - 1 in the denominator). It is undefined for fewer than two
245 |             // values, so 0 is reported instead of the NaN that 0 / 0 produced for a single value.
246 |             if (values.Count < 2)
247 |                 return 0;
248 |             double avg = values.Average();
249 |             double sum = values.Sum(d => Math.Pow(d - avg, 2));
250 |             return Math.Sqrt(sum / (values.Count - 1));
251 |         }
253 |
254 |         private static double CalculateStdDev(List<int> values, double avg)
255 |         {
256 |             // Same as the one-argument overload, but around a caller-supplied mean; 0 for fewer than two values (no NaN).
257 |             if (values.Count < 2)
258 |                 return 0;
259 |             double sum = values.Sum(d => Math.Pow(d - avg, 2));
260 |             return Math.Sqrt(sum / (values.Count - 1));
261 |         }
264 |
265 |         private static int Levenshtein(string s1, string s2)
266 |         {
267 |             // Edit distance with cost 1 for an insertion or deletion and cost 2 for a substitution.
268 |             if (string.IsNullOrEmpty(s1))
269 |                 return string.IsNullOrEmpty(s2) ? 0 : s2.Length;
270 |             if (string.IsNullOrEmpty(s2))
271 |                 return s1.Length;
272 |             var previous = new int[s2.Length + 1]; // distances for the prefix of s1 one character shorter
273 |             var current = new int[s2.Length + 1];
274 |             for (int col = 0; col <= s2.Length; col++)
275 |                 previous[col] = col;
276 |             for (int row = 1; row <= s1.Length; row++)
277 |             {
278 |                 current[0] = row;
279 |                 for (int col = 1; col <= s2.Length; col++)
280 |                 {
281 |                     int substitute = previous[col - 1] + (s1[row - 1] == s2[col - 1] ? 0 : 2);
282 |                     int remove = previous[col] + 1;
283 |                     int insert = current[col - 1] + 1;
284 |                     current[col] = Math.Min(Math.Min(remove, insert), substitute);
285 |                 }
286 |                 var swap = previous; previous = current; current = swap;
287 |             }
288 |             return previous[s2.Length];
289 |         }
301 | }
302 | }
303 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AsoSoft Library
2 | AsoSoft Library offers basic natural language processing (NLP) algorithms for the Kurdish Language (ckb: Central branch of Kurdish).
3 | AsoSoft Library is written in C#.
4 | - **Grapheme-to-Phoneme (G2P) converter and Transliteration**: converts Kurdish text into syllabified phoneme string. Also transliterates Kurdish texts from Arabic script into Latin script and vice versa.
5 | - **Normalizer**: normalizes the Kurdish text and punctuation marks, unifies numerals, replaces Html Entities, extracts and replaces URLs and emails, and more.
6 | - **Numeral Converter**: converts any type of numbers into Kurdish words.
7 | - **Sort**: Sorts a list in correct Kurdish alphabet order.
8 | - **Poem Meter Classifier**: Classifies the meter of the input Kurdish poem
9 |
10 | ## Grapheme-to-Phoneme (G2P) converter and Transliteration
11 | This function is based on the study "[Automated Grapheme-to-Phoneme Conversion for Central Kurdish based on Optimality Theory](https://www.sciencedirect.com/science/article/abs/pii/S0885230821000292)".
12 |
13 | ### Kurdish G2P converter
14 | Converts Central Kurdish text in standard Arabic script into **syllabified phonemic** Latin script (i.e. graphemes to phonemes).
15 |
16 | General format:
17 | ```cs
18 | AsoSoft.G2P(string text,
19 | bool convertNumbersToWord = false,
20 | bool backMergeConjunction = true,
21 | bool singleOutputPerWord = true);
22 | ```
23 | An example:
24 | ```cs
25 | AsoSoft.G2P("شەو و ڕۆژ بووین بە گرفت. درێژیی دیوارەکەی گرتن");
26 | >ˈşeˈwû ˈřoj ˈbûyn ˈbe ˈgiˈrift. ˈdiˈrêˈjîy ˈdîˈwaˈreˈkey ˈgirˈtin<
27 | ```
28 | ### Transliteration
29 |
30 | Arabic script into Hawar Latin script (حغڕڵ→ḧẍřł):
31 | ```cs
32 | AsoSoft.Ar2La("گیرۆدەی خاڵی ڕەشتە؛ گوێت لە نەغمەی تویوورە؟");
33 | >gîrodey xałî řeşte; gwêt le neẍmey tuyûre?<
34 | ```
35 |
36 | Arabic script into simplified (حغڕڵ→hxrl) Hawar Latin script:
37 | ```cs
38 | AsoSoft.Ar2LaSimple("گیرۆدەی خاڵی ڕەشتە؛ گوێت لە نەغمەی تویوورە؟");
39 | >gîrodey xalî reşte; gwêt le nexmey tuyûre?<
40 | ```
41 |
42 | Latin script (Hawar) into Arabic script:
43 | ```cs
44 | AsoSoft.La2Ar("Gelî keç û xortên kurdan, hûn hemû bi xêr biçin");
45 | >گەلی کەچ و خۆرتێن کوردان، هوون هەموو ب خێر بچن<
46 | ```
47 |
48 | Arabic script into IPA:
49 | ```cs
50 | AsoSoft.Phonemes2IPA(AsoSoft.G2P("شەو و ڕۆژ بووین بە گرفت. درێژیی دیوارەکە گرتن"));
51 | >ʃa·wu ro̞ʒ bujn ba gɪ·ɾɪft. dɪ·ɾɛ·ʒij di·wä·ɾa·ka gɪɾ·tɪn<
52 | ```
53 | ## Kurdish Text Normalizer
54 | Several functions needed for Central Kurdish text normalization:
55 |
56 | ### Normalize Kurdish
57 | Two character replacement lists are provided as the resources of the library:
58 | - Deep Unicode Corrections:
59 | - replacing deprecated Arabic Presentation Forms (FB50–FDFF and FE70–FEFF) with corresponding standard characters.
60 | - replacing different types of dashes and spaces
61 | - removing Unicode control character
62 | - Additional Unicode Corrections
63 | - replacing special Arabic math signs with corresponding Latin characters
64 | - replacing similar, but different letters with standard characters (e.g. ڪ,ے,ٶ with ک,ی,ؤ)
65 |
66 | The normalization task in this function:
67 | - for all Arabic scripts (including Kurdish, Arabic, and Persian):
68 | - Character-based replacement:
69 | - Above mentioned replacement lists
70 | - Private Use Area (U+E000 to U+F8FF) with White Square character
71 | - Standardizing and removing duplicated or unnecessary Zero-Width characters
72 | - removing unnecessary Tatweels (U+0640)
73 | - only for Central Kurdish:
74 | - standardizing Kurdish characters: ە, هـ, ی, and ک
75 | - correcting mis-converted characters from non-Unicode fonts
76 | - replacing word-initial ر with ڕ
77 |
78 | the simple overloading:
79 | ```cs
80 | AsoSoft.Normalize("دەقے شیَعري خـــۆش. رهنگهكاني خاك");
81 | >دەقی شێعری خۆش. ڕەنگەکانی خاک<
82 | ```
83 |
84 | or the complete overloading:
85 | ```cs
86 | AsoSoft.Normalize(string text,
87 | bool isOnlyKurdish,
88 | bool changeInitialR,
89 | bool deepUnicodeCorrectios,
90 | bool additionalUnicodeCorrections,
91 | Dictionary<char, string> usersReplaceList);
92 | ```
93 |
94 | ### AliK to Unicode
95 | `AliK2Unicode` converts Kurdish text written in AliK fonts (developed by Abas Majid in 1997) into Unicode standard. Ali-K fonts: *Alwand, Azzam, Hasan, Jiddah, kanaqen, Khalid, Sahifa, Sahifa Bold, Samik, Sayid, Sharif, Shrif Bold, Sulaimania, Traditional*
96 | ```cs
97 | AsoSoft.AliK2Unicode("ئاشناكردنى خويَندكار بة طوَرِانكاريية كوَمةلاَيةتييةكان");
98 | >ئاشناکردنی خوێندکار بە گۆڕانکارییە کۆمەڵایەتییەکان<
99 | ```
100 |
101 | ### AliWeb to Unicode
102 | `AliWeb2Unicode` converts Kurdish text written in AliWeb fonts into Unicode standard. Ali-Web fonts: *Malper, Malper Bold, Samik, Traditional, Traditional Bold*
103 | ```cs
104 | AsoSoft.AliWeb2Unicode("هةر جةرةيانصکي مصذووُيي کة أوو دةدا");
105 | >ھەر جەرەیانێکی مێژوویی کە ڕوو دەدا<
106 | ```
107 |
108 | ### Dylan to Unicode
109 | `Dylan2Unicode` converts Kurdish text written in Dylan fonts (developed by Dylan Saleh at [KurdSoft]( https://web.archive.org/web/20020528231610/http://www.kurdsoft.com/) in 2001) into Unicode standard.
110 | ```cs
111 | AsoSoft.Dylan2Unicode("لثكؤلثنةران بؤيان دةركةوتووة كة دةتوانث بؤ لةش بةكةصك بث");
112 | >لێکۆلێنەران بۆیان دەرکەوتووە کە دەتوانێ بۆ لەش بەکەڵک بێ<
113 | ```
114 | ### Zarnegar to Unicode
115 | `Zarnegar2Unicode` converts Kurdish text written in Zarnegar word processor (developed by [SinaSoft](http://www.sinasoft.com/fa/zarnegar.html) with RDF converter by [NoorSoft](https://www.noorsoft.org/fa/software/view/6561)) into Unicode standard.
116 | ```cs
117 | AsoSoft.Zarnegar2Unicode("بلٌيٌين و بگهرٍيٌين بوٌ ههلاٌلٌهى سىٌيهمى فهلسهفه");
118 | >بڵێین و بگەڕێین بۆ هەڵاڵەی سێیەمی فەلسەفە<
119 | ```
120 | ### NormalizePunctuations
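121 | `NormalizePunctuations` corrects the spaces before and after punctuation marks. When `seprateAllPunctuations` is true, punctuation marks are kept separated from the adjacent words by spaces; when it is false (as in the example below), the spaces inside brackets and quotation marks and before sentence punctuation are removed.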
122 | ```cs
123 | AsoSoft.NormalizePunctuations("دەقی«کوردی » و ڕێنووس ،((خاڵبەندی )) چۆنە ؟", false);
124 | >دەقی «کوردی» و ڕێنووس، «خاڵبەندی» چۆنە؟<
125 | ```
126 | ### Trim Line
127 | Trim starting and ending white spaces (including zero width spaces) of line,
128 | `TrimLine`
129 | ```cs
130 | AsoSoft.TrimLine(" دەق\u200c ");
131 | >دەق<
132 | ```
133 |
134 | ### Replace Html Entities
135 | `ReplaceHtmlEntity` replaces HTML Entities with single Unicode characters (e.g. "&eacute;" with "é"). It is useful in web crawled corpora.
136 | ```cs
137 | AsoSoft.ReplaceHtmlEntity("ئێوە &quot;دەق&quot; بە زمانی &lt;کوردی&gt; دەنووسن");
138 | >ئێوە "دەق" بە زمانی <کوردی> دەنووسن<
139 | ```
140 | ### Replace URLs and emails
141 | `ReplaceUrlEmail` replaces URLs and emails with a certain word. It improves language models.
142 |
143 | ### Unify Numerals
144 | `UnifyNumerals` unifies numeral characters into desired numeral type from `en` (0123456789) or `ar` (٠١٢٣٤٥٦٧٨٩)
145 | ```cs
146 | AsoSoft.UnifyNumerals("ژمارەکانی ٤٥٦ و ۴۵۶ و 456", "en");
147 | >ژمارەکانی 456 و 456 و 456<
148 | ```
149 |
150 | ### Separate Digits from words
151 | `SeperateDigits` adds a space between joined numerals and words (e.g. replacing "12کەس" with "12 کەس"). It improves language models.
152 | ```cs
153 | AsoSoft.SeperateDigits("لە ساڵی1950دا1000دۆلاریان بە 5کەس دا");
154 | >لە ساڵی 1950 دا 1000 دۆلاریان بە 5 کەس دا<
155 | ```
156 |
157 | ### Word to Word Replacement
158 | `Word2WordReplacement` applies a "string to string" replacement dictionary on the text. It replaces the full-matched words not a part of them.
159 | ```cs
160 | var dict = new Dictionary<string, string>() { { "مال", "ماڵ" } };
161 | AsoSoft.Word2WordReplacement("مال، نووری مالیکی", dict);
162 | >ماڵ، نووری مالیکی<
163 | ```
164 |
165 | ### Character to Character Replacement
166 | `Char2CharReplacment` applies a "char to char" replacement dictionary on the text. It is used as the final step needed for some non-Unicode systems.
167 |
168 | ## Kurdish Numeral converter
169 | It converts numerals into Central Kurdish words. It is useful in text-to-speech tools.
170 | - integers (1100 => هەزار و سەد)
171 | - floats (10.11)
172 | - negatives (-10.11)
173 | - percent (100% or %100)
174 | - currency marks ($100, £100, and €100)
175 |
176 | ```cs
177 | AsoSoft.Number2Word("لە ساڵی 1999دا بڕی 40% لە پارەکەیان واتە $102.1 یان وەرگرت");
178 | >لە ساڵی هەزار و نۆسەد و نەوەد و نۆدا بڕی چل لە سەد لە پارەکەیان واتە سەد و دوو پۆینت یەک دۆلاریان وەرگرت<
179 | ```
180 |
181 | ## Kurdish Sort
182 | Sorting a string list in correct order of Kurdish alphabet ("ئءاآأإبپتثجچحخدڎڊذرڕزژسشصضطظعغفڤقكکگڴلڵمنوۆۊۉۋهھەیێ"):
183 | ```cs
184 | var myList = new List<string>{"یەک", "ڕەنگ", "ئەو", "ئاو", "ڤەژین", "فڵان"};
185 | AsoSoft.KurdishSort(myList);
186 | >"ئاو", "ئەو", "ڕەنگ", "فڵان", "ڤەژین", "یەک"<
187 | ```
188 | or using your custom order:
189 | ```cs
190 | AsoSoft.CustomSort(List<string> inputList, List<char> inputOrder);
191 | ```
192 | ## Poem Meter Classifier
193 | It classifies the meter of the input Kurdish poem typed in Arabic script. The lines of the poem should be separated by a new line character ('\n').
194 | You can find Kurdish poems in https://books.vejin.net/.
195 | ```cs
196 | var poem = AsoSoft.PoemNormalization(@"گەرچی تووشی ڕەنجەڕۆیی و حەسرەت و دەردم ئەمن
197 | قەت لەدەس ئەم چەرخە سپڵە نابەزم مەردم ئەمن
198 | ئاشقی چاوی کەژاڵ و گەردنی پڕ خاڵ نیم
199 | ئاشقی کێو و تەلان و بەندەن و بەردم ئەمن");
200 | var syllabified = AsoSoft.G2P(poem, true, true, true).Split('\n');
201 | var classified = AsoSoft.PoemClassification(syllabified);
202 | var poemType = classified.overalMeterType;
203 | var poemMeter = classified.overalPattern;
204 | ```
205 |
206 | ## How to use?
207 | Install [AsoSoft Library package](https://www.nuget.org/packages/AsoSoftLibrary) via NuGet Gallery.
208 | Then, add `using AsoSoftLibrary;` to the using directives of your code.
209 |
210 | ## Development
211 | AsoSoft Library is developed and maintained by Aso Mahmudi.
212 | AsoSoft Library is written in C# (.NET 6).
213 |
--------------------------------------------------------------------------------
/Sort.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Text;
4 |
5 | namespace AsoSoftLibrary
6 | {
public static partial class AsoSoft
{
    /// <summary>
    /// Sorts a list of strings in the order of the Kurdish (Arabic-script) alphabet.
    /// </summary>
    /// <param name="inputList">Strings to sort. The list is reordered in place.</param>
    /// <returns>The same <paramref name="inputList"/> instance, now sorted.</returns>
    public static List<string> KurdishSort(List<string> inputList)
    {
        var ku = new List<char>("ئءاآأإبپتثجچحخدڎڊذرڕزژسشصضطظعغفڤقكکگڴلڵمنوۆۊۉۋهھەیێ");
        return CustomSort(inputList, ku);
    }

    /// <summary>
    /// Sorts a list of strings using a caller-supplied character order.
    /// </summary>
    /// <remarks>
    /// Each string gets a sort key in which the i-th character of
    /// <paramref name="inputOrder"/> is replaced by the code point 62000 + i.
    /// The keys are compared with the default string comparer (the same one
    /// <c>List&lt;string&gt;.Sort()</c> uses) and the original strings are written
    /// back in key order. The strings themselves are never rewritten, so input
    /// that already contains code points from the placeholder range is returned
    /// unchanged instead of being mapped back to alphabet letters.
    /// </remarks>
    /// <param name="inputList">Strings to sort. The list is reordered in place.</param>
    /// <param name="inputOrder">Characters in the desired order; if a character
    /// occurs more than once, its first position is used.</param>
    /// <returns>The same <paramref name="inputList"/> instance, now sorted.</returns>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static List<string> CustomSort(List<string> inputList, List<char> inputOrder)
    {
        if (inputList == null) throw new ArgumentNullException(nameof(inputList));
        if (inputOrder == null) throw new ArgumentNullException(nameof(inputOrder));

        const int baseChar = 62000; // U+F230; earlier experiments used 9472

        // Character -> placeholder whose code point encodes the position in inputOrder.
        var placeholder = new Dictionary<char, char>();
        for (int i = 0; i < inputOrder.Count; i++)
            if (!placeholder.ContainsKey(inputOrder[i]))
                placeholder.Add(inputOrder[i], (char)(baseChar + i));

        var items = inputList.ToArray();
        var keys = new string[items.Length];
        var buffer = new StringBuilder();
        for (int i = 0; i < items.Length; i++)
        {
            buffer.Clear();
            foreach (char c in items[i])
                buffer.Append(placeholder.TryGetValue(c, out char mapped) ? mapped : c);
            keys[i] = buffer.ToString();
        }

        // Sorts keys with Comparer<string>.Default and moves items along with them.
        Array.Sort(keys, items);

        // Callers rely on the list being sorted in place.
        for (int i = 0; i < items.Length; i++)
            inputList[i] = items[i];
        return inputList;
    }
}
33 | }
34 |
--------------------------------------------------------------------------------
/Transliteration.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Text;
5 | using System.Text.RegularExpressions;
6 | using System.Threading.Tasks;
7 |
8 | namespace AsoSoftLibrary
9 | {
10 | public static partial class AsoSoft
11 | {
12 |
// Body of a regex character class listing the Latin letters; it is interpolated
// into the La2Ar patterns below to recognise "not a Latin letter" contexts.
private static readonly string latinLetters = "a-zêîûçşéúıŕřĺɫƚḧẍḍṿʔ";

// Replacement tables keyed by transliteration direction. Each list is a flat
// sequence of (pattern, replacement) pairs handed to replaceByList.
// NOTE(review): the pairs look like regex patterns applied in list order
// (later rules depend on earlier ones) - confirm against replaceByList.
private static readonly Dictionary<string, List<string>> TransliterationReplaces = new Dictionary<string, List<string>>
{
    {"LaDi2Ar", new List<string>() {
        "gh", "ẍ",
        "hh", "ḧ",
        "ll", "ɫ",
        "rr", "ř"
    }},
    {"La2Ar", new List<string>() {
        "\u201C", "«",
        "\u201D", "»",
        $"([0-9])([\'’-])([aeiouêîûéú])", "$1$3", // (e.g. 1990'an 5'ê)
        "ʔ", "", // glottal stop
        $"(^|[^{latinLetters}0-9\"’])([aeiouêîûéú])", "$1ئ$2", //insert initial hamza
        "([aeouêîûéú])([aeiouêîûéú])", "$1ئ$2", //insert hamza between adjacent vowels
        $"(ئ)([uû])([^{latinLetters}0-9])", "و$3", //omit the inserted hamza for "û" (=and)
        "a", "ا",
        "b", "ب",
        "ç", "چ",
        "c", "ج",
        "d", "د",
        "ḍ", "ڎ", // a Horami consonant
        "ê|é", "ێ",
        "e", "ە",
        "f", "ف",
        "g", "گ",
        "h", "ه",
        "ḧ", "ح",
        "i|ı", "",
        "î|y|í", "ی",
        "j", "ژ",
        "k", "ک",
        "l", "ل",
        "ɫ|ł|ƚ|Ɨ|ĺ", "ڵ",
        "m", "م",
        "n", "ن",
        "ŋ", "نگ",
        "o", "ۆ",
        "ö", "وێ",
        "p", "پ",
        "q", "ق",
        "r", "ر",
        "ř|ŕ", "ڕ",
        // Must precede the plain "s" rule: "s̩" is "s" plus a combining mark and
        // can no longer match once the bare "s" has been replaced.
        "ş|š|ș|s̩", "ش",
        "s", "س",
        "ṣ", "ص",
        "t", "ت",
        "ṭ", "ط",
        "û|ú", "وو",
        "u|w", "و",
        "ü", "ۊ",
        "v", "ڤ",
        "x", "خ",
        "ẍ", "غ",
        "z", "ز",
        "ه" + "($|[^ابپتجچحخدرڕزژسشصعغفڤقکگلڵمنوۆهەیێ])", "هـ" + "$1", // word-final h
        "\"|’", "ئ", // need checking, not sure "ع" or "ئ"
        "\\u003F", "؟", //question mark
        ",", "،", //comma
        ";", "؛" //semicolon
    }}
};
77 |
/// <summary>
/// Transliterates Kurdish text from Latin script into Arabic script (e.g. çak→چاک).
/// </summary>
/// <param name="text">Latin-script text; it is lower-cased before the rules run.</param>
/// <returns>The text after the "La2Ar" replacement list has been applied.</returns>
public static string La2Ar(string text)
{
    var rules = TransliterationReplaces["La2Ar"];
    return replaceByList(text.ToLower(), rules);
}
84 |
/// <summary>
/// Transliterates Latin-script Kurdish that spells some letters as digraphs into
/// Arabic script. The "LaDi2Ar" list (gh, hh, ll, rr → ẍ, ḧ, ɫ, ř) is applied
/// first, then the regular "La2Ar" list.
/// </summary>
/// <param name="text">Latin-script text; it is lower-cased before the rules run.</param>
/// <returns>The text after both replacement lists have been applied.</returns>
public static string LaDigraph2Ar(string text)
{
    var lowered = text.ToLower();
    var singleLetters = replaceByList(lowered, TransliterationReplaces["LaDi2Ar"]);
    return replaceByList(singleLetters, TransliterationReplaces["La2Ar"]);
}
93 |
/// <summary>
/// Transliterates Kurdish text from Arabic script into Latin script: the text is
/// run through <c>G2P</c> (with <c>backMergeConjunction</c> disabled) and the
/// result is passed to <see cref="Phonemes2Hawar"/>.
/// </summary>
/// <param name="text">Arabic-script text.</param>
/// <returns>The Latin-script rendering produced by <see cref="Phonemes2Hawar"/>.</returns>
public static string Ar2La(string text)
{
    var phonemes = G2P(text, backMergeConjunction: false);
    return Phonemes2Hawar(phonemes);
}
/// <summary>
/// Transliterates Kurdish text from Arabic script into a simplified Latin script:
/// same pipeline as <see cref="Ar2La"/>, after which ḧ, ř, ł and ẍ are replaced
/// by plain h, r, l and x.
/// </summary>
/// <param name="text">Arabic-script text.</param>
/// <returns>The simplified Latin-script rendering.</returns>
public static string Ar2LaSimple(string text)
{
    return Phonemes2Hawar(G2P(text, backMergeConjunction: false))
        .Replace("ḧ", "h")
        .Replace("ř", "r")
        .Replace("ł", "l")
        .Replace("ẍ", "x");
}
109 |
/// <summary>
/// Converts the output of the G2P into IPA.
/// </summary>
/// <remarks>
/// A stress mark (ˈ) at the start of the text or after a non-word character is
/// dropped; every remaining one becomes a middle dot (·). Then each row of the
/// Phoneme2IPA resource table is applied in order as a regex replacement
/// (column 0 = pattern, column 1 = replacement).
/// </remarks>
/// <param name="text">Phoneme string produced by the G2P.</param>
/// <returns>The IPA rendering of <paramref name="text"/>.</returns>
public static string Phonemes2IPA(string text)
{
    text = Regex.Replace(text, "(?<=(^|\\W))ˈ", "");
    text = Regex.Replace(text, "ˈ", "·"); //middle dot
    var rows = resFiles.Phoneme2IPA.Split('\n');
    for (int i = 1; i < rows.Length; i++) // row 0 is the CSV header
    {
        // The table may be stored with CRLF line endings; without the trim the
        // '\r' would become part of every replacement string.
        var item = rows[i].TrimEnd('\r').Split(',');
        if (item.Length < 2 || item[0].Length == 0)
            continue; // blank or malformed row (e.g. a trailing newline)
        text = Regex.Replace(text, item[0], item[1]);
    }
    return text;
}
123 |
/// <summary>
/// Converts the output of the G2P into Hawar (e.g. ˈʔeˈłêm→ełêm).
/// </summary>
/// <remarks>
/// Stress marks (ˈ) are removed, a glottal stop (ʔ) at the start of the text or
/// after a non-word character is dropped, and every remaining ʔ or ƹ is written
/// as ’.
/// </remarks>
/// <param name="text">Phoneme string produced by the G2P.</param>
/// <returns>The Hawar rendering of <paramref name="text"/>.</returns>
public static string Phonemes2Hawar(string text)
{
    var withoutStress = text.Replace("ˈ", "");
    var withoutInitialStops = Regex.Replace(withoutStress, "(?<=(^|\\W))ʔ", "");
    return Regex.Replace(withoutInitialStops, "[ʔƹ]", "’");
}
132 |
/// <summary>
/// Converts the output of the G2P into Jira's ASCII format.
/// </summary>
/// <remarks>
/// Every "i" and stress mark (ˈ) is removed first. Then each row of the
/// Phoneme2Ascii resource table is applied in order as a regex replacement
/// (column 0 = pattern, column 1 = ASCII code), with "▪" appended after each
/// inserted code.
/// </remarks>
/// <param name="text">Phoneme string produced by the G2P.</param>
/// <returns>The ASCII rendering of <paramref name="text"/>.</returns>
public static string Phonemes2ASCII(string text)
{
    text = Regex.Replace(text, @"[iˈ]", "");
    var rows = resFiles.Phoneme2Ascii.Split('\n');
    for (int i = 1; i < rows.Length; i++) // row 0 is the CSV header
    {
        // The table may be stored with CRLF line endings; without the trim the
        // '\r' would end up between the ASCII code and the "▪" separator.
        var item = rows[i].TrimEnd('\r').Split(',');
        if (item.Length < 2 || item[0].Length == 0)
            continue; // blank or malformed row (e.g. a trailing newline)
        text = Regex.Replace(text, item[0], item[1] + "▪");
    }
    return text;
}
145 | }
146 | }
147 |
--------------------------------------------------------------------------------
/resFiles.Designer.cs:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------------------------
2 | //
3 | // This code was generated by a tool.
4 | // Runtime Version:4.0.30319.42000
5 | //
6 | // Changes to this file may cause incorrect behavior and will be lost if
7 | // the code is regenerated.
8 | //
9 | //------------------------------------------------------------------------------
10 |
11 | namespace AsoSoftLibrary {
12 | using System;
13 |
14 |
15 | ///
16 | /// A strongly-typed resource class, for looking up localized strings, etc.
17 | ///
18 | // This class was auto-generated by the StronglyTypedResourceBuilder
19 | // class via a tool like ResGen or Visual Studio.
20 | // To add or remove a member, edit your .ResX file then rerun ResGen
21 | // with the /str option, or rebuild your VS project.
22 | [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "17.0.0.0")]
23 | [global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
24 | [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()]
25 | public class resFiles {
26 |
// Lazily created by the ResourceManager property below.
private static global::System.Resources.ResourceManager resourceMan;

// Culture passed to every GetString lookup; stays null until Culture is assigned.
private static global::System.Globalization.CultureInfo resourceCulture;

[global::System.Diagnostics.CodeAnalysis.SuppressMessageAttribute("Microsoft.Performance", "CA1811:AvoidUncalledPrivateCode")]
internal resFiles() {
}

/// <summary>
///   Returns the cached ResourceManager instance used by this class.
/// </summary>
[global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)]
public static global::System.Resources.ResourceManager ResourceManager {
    get {
        if (resourceMan == null) {
            // First access: create the manager for the "AsoSoftLibrary.resFiles" resources.
            resourceMan = new global::System.Resources.ResourceManager("AsoSoftLibrary.resFiles", typeof(resFiles).Assembly);
        }
        return resourceMan;
    }
}

/// <summary>
///   Overrides the current thread's CurrentUICulture property for all
///   resource lookups using this strongly typed resource class.
/// </summary>
[global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)]
public static global::System.Globalization.CultureInfo Culture {
    get => resourceCulture;
    set => resourceCulture = value;
}
62 |
63 | ///
64 | /// Looks up a localized string similar to G,P,Desc
65 | ///ڴ,ĝ,Garusi Consonant
66 | ///ڎ,đ,Hewrami Consonant
67 | ///ۉ,ŵ,Hewrami Consonant
68 | ///ݵ,ė,Hewrami Vowel
69 | ///ݸ,ȯ,Hewrami Vowel
70 | ///ۊ,ẅ,Southern Vowel
71 | ///ئ,ʔ,
72 | ///ب,b,
73 | ///پ,p,
74 | ///ت,t,
75 | ///ج,c,
76 | ///چ,ç,
77 | ///ح,ḧ,
78 | ///خ,x,
79 | ///د,d,
80 | ///ر,r,
81 | ///ڕ,ř,
82 | ///ز,z,
83 | ///ژ,j,
84 | ///س,s,
85 | ///ش,ş,
86 | ///ع,ƹ,
87 | ///غ,ẍ,
88 | ///ف,f,
89 | ///ڤ,v,
90 | ///ق,q,
91 | ///ک,k,
92 | ///گ,g,
93 | ///ل,l,
94 | ///ڵ,ł,
95 | ///م,m,
96 | ///ن,n,
97 | ///ه,h,
98 | ///ا,a,
99 | ///ۆ,o,
100 | ///ە,e,
101 | ///ێ,ê,
102 | ///^ی,y,
103 | ///^و,w,
104 | ///(?<=[aeêo])و,w,after vowel
105 | ///و(?=[aeêo]),w,before vowel
106 | ///(?<=[aeêo])ی,y,after vowel
107 | ///ی(?=[aeêo]),y,before vowel
108 | ///^([bçdjl])$,$1i,چ=>çi bcçdfghḧjklłmnpqrřsştvwxẍyzʔƹ.
109 | ///
public static string G2PCertain =>
    ResourceManager.GetString("G2PCertain", resourceCulture); // string stored under the key "G2PCertain"
115 |
116 | ///
117 | /// Looks up a localized string similar to Graphemes,Phonems
118 | ///حەییی,ḧeyyî
119 | ///تەییی,teyyî
120 | ///ئاگر,ʔagir
121 | ///قانع,qaniƹ
122 | ///سالم,salim
123 | ///عاشق,ƹaşiq.
124 | ///
public static string G2PExceptions =>
    ResourceManager.GetString("G2PExceptions", resourceCulture); // string stored under the key "G2PExceptions"
130 |
131 | ///
132 | /// Looks up a localized string similar to From,To,Desc
133 | ///00AC,200C,Wrong ZWNJ by MS Word
134 | ///066A,0025,Arabic PERCENT SIGN
135 | ///066B,002E,Arabic DECIMAL SEPARATOR
136 | ///066C,002C,Arabic THOUSANDS SEPARATOR
137 | ///066D,002A,Arabic FIVE POINTED STAR
138 | ///0751,062B,ݑ
139 | ///0752,067E,ݒ
140 | ///0750,067E,ݐ
141 | ///0753,062A,ݓ
142 | ///067F,062A,ٿ
143 | ///0679,062A,ٹ
144 | ///0758,0686,ݘ
145 | ///0689,062F,ډ
146 | ///068A,062F,ڊ
147 | ///068B,062F,ڋ
148 | ///068C,062F,ڌ
149 | ///068D,062F,ڍ
150 | ///068F,062F,ڏ
151 | ///0690,062F,ڐ
152 | ///0759,062F,ݙ
153 | ///075A,062F,ݚ
154 | ///076C,0695,ݬ
155 | ///0691,0695,ڑ
156 | ///0692,0695,ڒ
157 | ///0693,0695,ړ
158 | ///0694,0695,ڔ
159 | ///0696,0695,ږ
160 | ///0697,0698,ڗ
161 | ///0699,0698,ڙ
162 | ///076B,0698,ݫ
163 | ///069A,0633,ښ
164 | ///069B,0633,ڛ
165 | ///069C,0 [rest of string was truncated]";.
166 | ///
public static string NormalizerAdditional =>
    ResourceManager.GetString("NormalizerAdditional", resourceCulture); // string stored under the key "NormalizerAdditional"
172 |
173 | ///
174 | /// Looks up a localized string similar to From,To,Desc
175 | ///A78C,0027,Latin Small Letter Saltillo ꞌ
176 | ///FEFF,200C,ZERO WIDTH NO-BREAK SPACE
177 | ///200B,200C,ZERO WIDTH SPACE
178 | ///2010,002D,HYPHEN
179 | ///2011,002D,NON-BREAKING HYPHEN
180 | ///2012,002D,FIGURE DASH
181 | ///2013,002D,EN DASH
182 | ///2014,002D,EM DASH
183 | ///2015,002D,HORIZONTAL BAR
184 | ///2212,002D,Minus
185 | ///00AD,002D,Soft Hyphen
186 | ///FE58,002D,SMALL EM DASH
187 | ///FE63,002D,MALL HYPHEN-MINUS
188 | ///FF0D,002D,FULLWIDTH HYPHEN-MINUS
189 | ///1680,0020,OGHAM SPACE MARK
190 | ///2000,0020,EN QUAD
191 | ///2001,0020,EM QUAD
192 | ///2002,0020,EN SPACE
193 | ///2003,0020,EM SPACE
194 | ///2004,0020,THREE-PER-EM SPACE
195 | ///2005,0020,FOU [rest of string was truncated]";.
196 | ///
public static string NormalizerDeep =>
    ResourceManager.GetString("NormalizerDeep", resourceCulture); // string stored under the key "NormalizerDeep"
202 |
203 | ///
204 | /// Looks up a localized string similar to Phoneme,ASCII
205 | ///ʔ,EH
206 | ///a,AA
207 | ///b,B
208 | ///p,P
209 | ///t,T
210 | ///c,JE
211 | ///ç,CH
212 | ///ḧ,HE
213 | ///x,X
214 | ///d,D
215 | ///r,R
216 | ///ř,RR
217 | ///z,Z
218 | ///j,ZH
219 | ///s,S
220 | ///ş,SH
221 | ///ƹ,AH
222 | ///ẍ,XE
223 | ///f,F
224 | ///v,V
225 | ///q,Q
226 | ///k,K
227 | ///g,G
228 | ///l,L
229 | ///ł,LL
230 | ///m,M
231 | ///n,N
232 | ///o,O
233 | ///e,A
234 | ///h,H
235 | ///ê,E
236 | ///î,I
237 | ///y,Y
238 | ///w,W
239 | ///u,U
240 | ///û,UU.
241 | ///
public static string Phoneme2Ascii =>
    ResourceManager.GetString("Phoneme2Ascii", resourceCulture); // string stored under the key "Phoneme2Ascii"
247 |
248 | ///
249 | /// Looks up a localized string similar to Phoneme,IPA
250 | ///ng,ŋg
251 | ///ʔ,ʔ
252 | ///b,b
253 | ///p,p
254 | ///t,t
255 | ///c,d͡ʒ
256 | ///ç,t͡ʃ
257 | ///ḧ,ħ
258 | ///x,x
259 | ///d,d
260 | ///r,ɾ
261 | ///ř,r
262 | ///z,z
263 | ///j,ʒ
264 | ///s,s
265 | ///ş,ʃ
266 | ///ƹ,ʕ
267 | ///ẍ,ɣ
268 | ///f,f
269 | ///v,v
270 | ///q,q
271 | ///k,k
272 | ///g,g
273 | ///l,l
274 | ///ł,ɫ
275 | ///m,m
276 | ///n,n
277 | ///w,w
278 | ///u,ʊ
279 | ///û,u
280 | ///o,o̞
281 | ///h,h
282 | ///y,j
283 | ///a,ä
284 | ///e,a
285 | ///ê,ɛ
286 | ///i,ɪ
287 | ///î,i
288 | ///ĝ,ŋ
289 | ///đ,đ
290 | ///ü,y
291 | ///ô,ô
292 | ///õ,õ.
293 | ///
public static string Phoneme2IPA =>
    ResourceManager.GetString("Phoneme2IPA", resourceCulture); // string stored under the key "Phoneme2IPA"
299 |
300 | ///
301 | /// Looks up a localized string similar to Frequency,WeightPattern,Title
302 | ///1044,–∪–––∪–––∪–––∪–,فاعلاتن فاعلاتن فاعلاتن فاعلن
303 | ///999,∪–––∪–––∪–––∪–––,مفاعیلن مفاعیلن مفاعیلن مفاعیلن
304 | ///386,∪–––∪–––∪––,مفاعیلن مفاعیلن فعولن
305 | ///334,––∪∪––∪∪––∪∪––,مفعولُ مفاعیلُ مفاعیلُ فعولن
306 | ///272,––∪∪––∪∪––∪∪–,مفعولُ مفاعیلُ مفاعیلُ فعل
307 | ///213,––∪–∪–∪∪––∪–∪–,مفعولُ فاعلاتُ مفاعیلُ فاعلن
308 | ///138,∪∪––∪∪––∪∪––∪∪–,فعلاتن فعلاتن فعلاتن فعلن
309 | ///131,––∪∪–∪–∪––,مفعولُ مفاعلن فعولن
310 | ///62,–∪–––∪–––∪–,فاعلاتن فاعلاتن فاعلن
311 | ///45,∪∪––∪–∪–∪∪–,فعلاتن مفاعلن فعلن
312 | ///40,∪–∪–∪∪––∪–∪–∪∪–,مفاعلن فعلاتن مفاعلن فعلن
313 | ///31 [rest of string was truncated]";.
314 | ///
public static string PoemPatterns =>
    ResourceManager.GetString("PoemPatterns", resourceCulture); // string stored under the key "PoemPatterns"
320 | }
321 | }
322 |
--------------------------------------------------------------------------------
/resFiles.resx:
--------------------------------------------------------------------------------
1 |
2 |
3 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 | text/microsoft-resx
110 |
111 |
112 | 2.0
113 |
114 |
115 | System.Resources.ResXResourceReader, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089
116 |
117 |
118 | System.Resources.ResXResourceWriter, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089
119 |
120 |
121 |
122 | resources\G2PCertain.csv;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8
123 |
124 |
125 | resources\G2PExceptions.csv;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8
126 |
127 |
128 | resources\NormalizeUnicodeAdditional.csv;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8
129 |
130 |
131 | resources\NormalizeUnicodeDeep.csv;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8
132 |
133 |
134 | resources\Phoneme2Ascii.csv;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8
135 |
136 |
137 | resources\Phoneme2IPA.csv;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8
138 |
139 |
140 | resources\PoemPatterns.csv;System.String, mscorlib, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089;utf-8
141 |
142 |
--------------------------------------------------------------------------------
/resources/G2PCertain.csv:
--------------------------------------------------------------------------------
1 | G,P,Desc
2 | ڴ,ĝ,Garusi Consonant
3 | ڎ,đ,Hewrami Consonant
4 | ۉ,ŵ,Hewrami Consonant
5 | ݵ,ė,Hewrami Vowel
6 | ݸ,ȯ,Hewrami Vowel
7 | ۊ,ẅ,Southern Vowel
8 | ئ,ʔ,
9 | ب,b,
10 | پ,p,
11 | ت,t,
12 | ج,c,
13 | چ,ç,
14 | ح,ḧ,
15 | خ,x,
16 | د,d,
17 | ر,r,
18 | ڕ,ř,
19 | ز,z,
20 | ژ,j,
21 | س,s,
22 | ش,ş,
23 | ع,ƹ,
24 | غ,ẍ,
25 | ف,f,
26 | ڤ,v,
27 | ق,q,
28 | ک,k,
29 | گ,g,
30 | ل,l,
31 | ڵ,ł,
32 | م,m,
33 | ن,n,
34 | ه,h,
35 | ا,a,
36 | ۆ,o,
37 | ە,e,
38 | ێ,ê,
39 | ^ی,y,
40 | ^و,w,
41 | (?<=[aeêo])و,w,after vowel
42 | و(?=[aeêo]),w,before vowel
43 | (?<=[aeêo])ی,y,after vowel
44 | ی(?=[aeêo]),y,before vowel
45 | ^([bçdjl])$,$1i,چ=>çi bcçdfghḧjklłmnpqrřsştvwxẍyzʔƹ
--------------------------------------------------------------------------------
/resources/G2PExceptions.csv:
--------------------------------------------------------------------------------
1 | Graphemes,Phonems
2 | حەییی,ḧeyyî
3 | تەییی,teyyî
4 | ئاگر,ʔagir
5 | قانع,qaniƹ
6 | سالم,salim
7 | عاشق,ƹaşiq
--------------------------------------------------------------------------------
/resources/NormalizeUnicodeAdditional.csv:
--------------------------------------------------------------------------------
1 | From,To,Desc
2 | 00AC,200C,Wrong ZWNJ by MS Word
3 | 066A,0025,Arabic PERCENT SIGN
4 | 066B,002E,Arabic DECIMAL SEPARATOR
5 | 066C,002C,Arabic THOUSANDS SEPARATOR
6 | 066D,002A,Arabic FIVE POINTED STAR
7 | 0751,062B,ݑ
8 | 0752,067E,ݒ
9 | 0750,067E,ݐ
10 | 0753,062A,ݓ
11 | 067F,062A,ٿ
12 | 0679,062A,ٹ
13 | 0758,0686,ݘ
14 | 0689,062F,ډ
15 | 068A,062F,ڊ
16 | 068B,062F,ڋ
17 | 068C,062F,ڌ
18 | 068D,062F,ڍ
19 | 068F,062F,ڏ
20 | 0690,062F,ڐ
21 | 0759,062F,ݙ
22 | 075A,062F,ݚ
23 | 076C,0695,ݬ
24 | 0691,0695,ڑ
25 | 0692,0695,ڒ
26 | 0693,0695,ړ
27 | 0694,0695,ڔ
28 | 0696,0695,ږ
29 | 0697,0698,ڗ
30 | 0699,0698,ڙ
31 | 076B,0698,ݫ
32 | 069A,0633,ښ
33 | 069B,0633,ڛ
34 | 069C,0634,ڜ
35 | 06FA,0634,ۺ
36 | 069D,0635,ڝ
37 | 069E,0636,ڞ
38 | 06FB,0636,ۻ
39 | 069F,0638,ڟ
40 | 06A0,063A,ڠ
41 | 06FC,063A,ۼ
42 | 06A1,0641,ڡ
43 | 06A2,0641,ڢ
44 | 06A3,0641,ڣ
45 | 06A5,06A4,ڥ
46 | 06A4,06A4,ڤ
47 | 06A7,0642,ڧ
48 | 06A8,0642,ڨ
49 | 06A8,0642,ڨ
50 | 06AA,06A9,ڪ
51 | 06AB,06A9,ګ
52 | 06AC,06A9,ڬ
53 | 06AD,06A9,ڭ
54 | 06AE,06A9,ڮ
55 | 063B,06A9,ػ
56 | 063C,06A9,ؼ
57 | 06B0,06AF,ڰ
58 | 06B1,06AF,ڱ
59 | 06B2,06AF,ڲ
60 | 06B3,06AF,ڳ
61 | 06B4,06AF,ڴ
62 | 06D2,06CC,ے
63 | 06CD,06CC,ۍ
64 | 06B6,06B5,ڶ
65 | 06B7,06B5,ڷ
66 | 06B8,06B5,ڸ
67 | 076A,06B5,ݪ
68 | 0765,0645,ݥ
69 | 0766,0645,ݦ
70 | 06B9,0646,ڹ
71 | 06BA,0646,ں
72 | 06BB,0646,ڻ
73 | 06BC,0646,ڼ
74 | 06BD,0646,ڽ
75 | 0767,0646,ݧ
76 | 0768,0646,ݨ
77 | 0769,0646,ݩ
78 | 06C4,06C6,ۄ
79 | 06C5,06C6,ۅ
80 | 06C8,06C6,ۈ
81 | 06C9,06C6,ۉ
82 | 06CB,06C6,ۋ
83 | 0676,06C6,ٶ
84 | 06C9,06C6,ۉ
85 | 06C7,0648 0648,ۇ
--------------------------------------------------------------------------------
/resources/NormalizeUnicodeDeep.csv:
--------------------------------------------------------------------------------
1 | From,To,Desc
2 | A78C,0027,Latin Small Letter Saltillo ꞌ
3 | FEFF,200C,ZERO WIDTH NO-BREAK SPACE
4 | 200B,200C,ZERO WIDTH SPACE
5 | 2010,002D,HYPHEN
6 | 2011,002D,NON-BREAKING HYPHEN
7 | 2012,002D,FIGURE DASH
8 | 2013,002D,EN DASH
9 | 2014,002D,EM DASH
10 | 2015,002D,HORIZONTAL BAR
11 | 2212,002D,Minus
12 | 00AD,002D,Soft Hyphen
13 | FE58,002D,SMALL EM DASH
14 | FE63,002D,SMALL HYPHEN-MINUS
15 | FF0D,002D,FULLWIDTH HYPHEN-MINUS
16 | 1680,0020,OGHAM SPACE MARK
17 | 2000,0020,EN QUAD
18 | 2001,0020,EM QUAD
19 | 2002,0020,EN SPACE
20 | 2003,0020,EM SPACE
21 | 2004,0020,THREE-PER-EM SPACE
22 | 2005,0020,FOUR-PER-EM SPACE
23 | 2006,0020,SIX-PER-EM SPACE
24 | 205F,0020,MEDIUM MATHEMATICAL SPACE
25 | 3000,0020,IDEOGRAPHIC SPACE
26 | 2007,0020,FIGURE SPACE
27 | 2008,0020,PUNCTUATION SPACE
28 | 2009,0020,THIN SPACE
29 | 200A,0020,HAIR SPACE
30 | 00A0,0020,NO-BREAK SPACE
31 | 202F,0020,NARROW NO-BREAK SPACE
32 | 200E,0020,LEFT-TO-RIGHT MARK
33 | 200F,0020,RIGHT-TO-LEFT MARK
34 | 202A,0020,LEFT-TO-RIGHT EMBEDDING
35 | 202B,0020,RIGHT-TO-LEFT EMBEDDING
36 | 202C,0020,POP DIRECTIONAL FORMATTING
37 | 202D,0020,LEFT-TO-RIGHT OVERRIDE
38 | 202E,0020,RIGHT-TO-LEFT OVERRIDE
39 | 0000,0020,Control
40 | 0001,0020,
41 | 0002,0020,
42 | 0003,0020,
43 | 0004,0020,
44 | 0005,0020,
45 | 0006,0020,
46 | 0007,0020,
47 | 0008,0020,
48 | 000B,0020,
49 | 000C,0020,
50 | 000E,0020,
51 | 000F,0020,
52 | 0010,0020,
53 | 0011,0020,
54 | 0012,0020,
55 | 0013,0020,
56 | 0014,0020,
57 | 0015,0020,
58 | 0016,0020,
59 | 0017,0020,
60 | 0018,0020,
61 | 0019,0020,
62 | 001A,0020,
63 | 001B,0020,
64 | 001C,0020,
65 | 001D,0020,
66 | 001E,0020,
67 | 001F,0020,
68 | 007F,0020,
69 | 0080,0020,
70 | 0081,0020,
71 | 0082,0020,
72 | 0083,0020,
73 | 0084,0020,
74 | 0085,0020,
75 | 0086,0020,
76 | 0087,0020,
77 | 0088,0020,
78 | 0089,0020,
79 | 008A,0020,
80 | 008B,0020,
81 | 008C,0020,
82 | 008D,0020,
83 | 008E,0020,
84 | 008F,0020,
85 | 0090,0020,
86 | 0091,0020,
87 | 0092,0020,
88 | 0093,0020,
89 | 0094,0020,
90 | 0095,0020,
91 | 0096,0020,
92 | 0097,0020,
93 | 0098,0020,
94 | 0099,0020,
95 | 009A,0020,
96 | 009B,0020,
97 | 009C,0020,
98 | 009D,0020,
99 | 009E,0020,
100 | 009F,0020,
101 | 0610,,Arabic Nonspacing Marks
102 | 0611,,
103 | 0612,,
104 | 0613,,
105 | 0614,,
106 | 0615,,
107 | 0616,,
108 | 0617,,
109 | 0618,,
110 | 0619,,
111 | 061A,,
112 | 0653,,
113 | 0654,,
114 | 0655,,
115 | 0656,,
116 | 0657,,
117 | 0658,,
118 | 0659,,
119 | 065A,,
120 | 065B,,
121 | 065C,,
122 | 065D,,
123 | 065E,,
124 | 065F,,
125 | 0670,,
126 | 06D6,,
127 | 06D7,,
128 | 06D8,,
129 | 06D9,,
130 | 06DA,,
131 | 06DB,,
132 | 06DC,,
133 | 06DF,,
134 | 06E0,,
135 | 06E1,,
136 | 06E2,,
137 | 06E3,,
138 | 06E4,,
139 | 06E7,,
140 | 06E8,,
141 | 06EA,,
142 | 06EB,,
143 | 06EC,,
144 | 06ED,,
145 | FB50,0671,Arabic Presentation Forms
146 | FB51,0671,
147 | FB52,067B,
148 | FB53,067B,
149 | FB54,067B,
150 | FB55,067B,
151 | FB56,067E,
152 | FB57,067E,
153 | FB58,067E,
154 | FB59,067E,
155 | FB5A,0680,
156 | FB5B,0680,
157 | FB5C,0680,
158 | FB5D,0680,
159 | FB5E,067A,
160 | FB5F,067A,
161 | FB60,067A,
162 | FB61,067A,
163 | FB62,067F,
164 | FB63,067F,
165 | FB64,067F,
166 | FB65,067F,
167 | FB66,0679,
168 | FB67,0679,
169 | FB68,0679,
170 | FB69,0679,
171 | FB6A,06A4,
172 | FB6B,06A4,
173 | FB6C,06A4,
174 | FB6D,06A4,
175 | FB6E,06A6,
176 | FB6F,06A6,
177 | FB70,06A6,
178 | FB71,06A6,
179 | FB72,0684,
180 | FB73,0684,
181 | FB74,0684,
182 | FB75,0684,
183 | FB76,0683,
184 | FB77,0683,
185 | FB78,0683,
186 | FB79,0683,
187 | FB7A,0686,
188 | FB7B,0686,
189 | FB7C,0686,
190 | FB7D,0686,
191 | FB7E,0687,
192 | FB7F,0687,
193 | FB80,0687,
194 | FB81,0687,
195 | FB82,068D,
196 | FB83,068D,
197 | FB84,068C,
198 | FB85,068C,
199 | FB86,068E,
200 | FB87,068E,
201 | FB88,0688,
202 | FB89,0688,
203 | FB8A,0698,
204 | FB8B,0698,
205 | FB8C,0691,
206 | FB8D,0691,
207 | FB8E,06A9,
208 | FB8F,06A9,
209 | FB90,06A9,
210 | FB91,06A9,
211 | FB92,06AF,
212 | FB93,06AF,
213 | FB94,06AF,
214 | FB95,06AF,
215 | FB96,06B3,
216 | FB97,06B3,
217 | FB98,06B3,
218 | FB99,06B3,
219 | FB9A,06B1,
220 | FB9B,06B1,
221 | FB9C,06B1,
222 | FB9D,06B1,
223 | FB9E,06BA,
224 | FB9F,06BA,
225 | FBA0,06BB,
226 | FBA1,06BB,
227 | FBA2,06BB,
228 | FBA3,06BB,
229 | FBA4,06C0,
230 | FBA5,06C0,
231 | FBA6,06C1,
232 | FBA7,06C1,
233 | FBA8,06C1,
234 | FBA9,06C1,
235 | FBAA,06BE,
236 | FBAB,06BE,
237 | FBAC,06BE,
238 | FBAD,06BE,
239 | FBAE,06D2,
240 | FBAF,06D2,
241 | FBB0,06D3,
242 | FBB1,06D3,
243 | FBD3,06AD,
244 | FBD4,06AD,
245 | FBD5,06AD,
246 | FBD6,06AD,
247 | FBD7,06C7,
248 | FBD8,06C7,
249 | FBD9,06C6,
250 | FBDA,06C6,
251 | FBDB,06C8,
252 | FBDC,06C8,
253 | FBDD,0677,
254 | FBDE,06CB,
255 | FBDF,06CB,
256 | FBE0,06C5,
257 | FBE1,06C5,
258 | FBE2,06C9,
259 | FBE3,06C9,
260 | FBE4,06D0,
261 | FBE5,06D0,
262 | FBE6,06D0,
263 | FBE7,06D0,
264 | FBE8,0649,
265 | FBE9,0649,
266 | FBEA,0626 0627,
267 | FBEB,0626 0627,
268 | FBEC,0626 06D5,
269 | FBED,0626 06D5,
270 | FBEE,0626 0648,
271 | FBEF,0626 0648,
272 | FBF0,0626 06C7,
273 | FBF1,0626 06C7,
274 | FBF2,0626 06C6,
275 | FBF3,0626 06C6,
276 | FBF4,0626 06C8,
277 | FBF5,0626 06C8,
278 | FBF6,0626 06D0,
279 | FBF7,0626 06D0,
280 | FBF8,0626 06D0,
281 | FBF9,0626 0649,
282 | FBFA,0626 0649,
283 | FBFB,0626 0649,
284 | FBFC,06CC,
285 | FBFD,06CC,
286 | FBFE,06CC,
287 | FBFF,06CC,
288 | FC00,0626 062C,
289 | FC01,0626 062D,
290 | FC02,0626 0645,
291 | FC03,0626 0649,
292 | FC04,0626 064A,
293 | FC05,0628 062C,
294 | FC06,0628 062D,
295 | FC07,0628 062E,
296 | FC08,0628 0645,
297 | FC09,0628 0649,
298 | FC0A,0628 064A,
299 | FC0B,062A 062C,
300 | FC0C,062A 062D,
301 | FC0D,062A 062E,
302 | FC0E,062A 0645,
303 | FC0F,062A 0649,
304 | FC10,062A 064A,
305 | FC11,062B 062C,
306 | FC12,062B 0645,
307 | FC13,062B 0649,
308 | FC14,062B 064A,
309 | FC15,062C 062D,
310 | FC16,062C 0645,
311 | FC17,062D 062C,
312 | FC18,062D 0645,
313 | FC19,062E 062C,
314 | FC1A,062E 062D,
315 | FC1B,062E 0645,
316 | FC1C,0633 062C,
317 | FC1D,0633 062D,
318 | FC1E,0633 062E,
319 | FC1F,0633 0645,
320 | FC20,0635 062D,
321 | FC21,0635 0645,
322 | FC22,0636 062C,
323 | FC23,0636 062D,
324 | FC24,0636 062E,
325 | FC25,0636 0645,
326 | FC26,0637 062D,
327 | FC27,0637 0645,
328 | FC28,0638 0645,
329 | FC29,0639 062C,
330 | FC2A,0639 0645,
331 | FC2B,063A 062C,
332 | FC2C,063A 0645,
333 | FC2D,0641 062C,
334 | FC2E,0641 062D,
335 | FC2F,0641 062E,
336 | FC30,0641 0645,
337 | FC31,0641 0649,
338 | FC32,0641 064A,
339 | FC33,0642 062D,
340 | FC34,0642 0645,
341 | FC35,0642 0649,
342 | FC36,0642 064A,
343 | FC37,0643 0627,
344 | FC38,0643 062C,
345 | FC39,0643 062D,
346 | FC3A,0643 062E,
347 | FC3B,0643 0644,
348 | FC3C,0643 0645,
349 | FC3D,0643 0649,
350 | FC3E,0643 064A,
351 | FC3F,0644 062C,
352 | FC40,0644 062D,
353 | FC41,0644 062E,
354 | FC42,0644 0645,
355 | FC43,0644 0649,
356 | FC44,0644 064A,
357 | FC45,0645 062C,
358 | FC46,0645 062D,
359 | FC47,0645 062E,
360 | FC48,0645 0645,
361 | FC49,0645 0649,
362 | FC4A,0645 064A,
363 | FC4B,0646 062C,
364 | FC4C,0646 062D,
365 | FC4D,0646 062E,
366 | FC4E,0646 0645,
367 | FC4F,0646 0649,
368 | FC50,0646 064A,
369 | FC51,0647 062C,
370 | FC52,0647 0645,
371 | FC53,0647 0649,
372 | FC54,0647 064A,
373 | FC55,064A 062C,
374 | FC56,064A 062D,
375 | FC57,064A 062E,
376 | FC58,064A 0645,
377 | FC59,064A 0649,
378 | FC5A,064A 064A,
379 | FC5B,0630 0670,
380 | FC5C,0631 0670,
381 | FC5D,0649 0670,
382 | FC5E,0020 064C 0651,
383 | FC5F,0020 064D 0651,
384 | FC60,0020 064E 0651,
385 | FC61,0020 064F 0651,
386 | FC62,0020 0650 0651,
387 | FC63,0020 0651 0670,
388 | FC64,0626 0631,
389 | FC65,0626 0632,
390 | FC66,0626 0645,
391 | FC67,0626 0646,
392 | FC68,0626 0649,
393 | FC69,0626 064A,
394 | FC6A,0628 0631,
395 | FC6B,0628 0632,
396 | FC6C,0628 0645,
397 | FC6D,0628 0646,
398 | FC6E,0628 0649,
399 | FC6F,0628 064A,
400 | FC70,062A 0631,
401 | FC71,062A 0632,
402 | FC72,062A 0645,
403 | FC73,062A 0646,
404 | FC74,062A 0649,
405 | FC75,062A 064A,
406 | FC76,062B 0631,
407 | FC77,062B 0632,
408 | FC78,062B 0645,
409 | FC79,062B 0646,
410 | FC7A,062B 0649,
411 | FC7B,062B 064A,
412 | FC7C,0641 0649,
413 | FC7D,0641 064A,
414 | FC7E,0642 0649,
415 | FC7F,0642 064A,
416 | FC80,0643 0627,
417 | FC81,0643 0644,
418 | FC82,0643 0645,
419 | FC83,0643 0649,
420 | FC84,0643 064A,
421 | FC85,0644 0645,
422 | FC86,0644 0649,
423 | FC87,0644 064A,
424 | FC88,0645 0627,
425 | FC89,0645 0645,
426 | FC8A,0646 0631,
427 | FC8B,0646 0632,
428 | FC8C,0646 0645,
429 | FC8D,0646 0646,
430 | FC8E,0646 0649,
431 | FC8F,0646 064A,
432 | FC90,0649 0670,
433 | FC91,064A 0631,
434 | FC92,064A 0632,
435 | FC93,064A 0645,
436 | FC94,064A 0646,
437 | FC95,064A 0649,
438 | FC96,064A 064A,
439 | FC97,0626 062C,
440 | FC98,0626 062D,
441 | FC99,0626 062E,
442 | FC9A,0626 0645,
443 | FC9B,0626 0647,
444 | FC9C,0628 062C,
445 | FC9D,0628 062D,
446 | FC9E,0628 062E,
447 | FC9F,0628 0645,
448 | FCA0,0628 0647,
449 | FCA1,062A 062C,
450 | FCA2,062A 062D,
451 | FCA3,062A 062E,
452 | FCA4,062A 0645,
453 | FCA5,062A 0647,
454 | FCA6,062B 0645,
455 | FCA7,062C 062D,
456 | FCA8,062C 0645,
457 | FCA9,062D 062C,
458 | FCAA,062D 0645,
459 | FCAB,062E 062C,
460 | FCAC,062E 0645,
461 | FCAD,0633 062C,
462 | FCAE,0633 062D,
463 | FCAF,0633 062E,
464 | FCB0,0633 0645,
465 | FCB1,0635 062D,
466 | FCB2,0635 062E,
467 | FCB3,0635 0645,
468 | FCB4,0636 062C,
469 | FCB5,0636 062D,
470 | FCB6,0636 062E,
471 | FCB7,0636 0645,
472 | FCB8,0637 062D,
473 | FCB9,0638 0645,
474 | FCBA,0639 062C,
475 | FCBB,0639 0645,
476 | FCBC,063A 062C,
477 | FCBD,063A 0645,
478 | FCBE,0641 062C,
479 | FCBF,0641 062D,
480 | FCC0,0641 062E,
481 | FCC1,0641 0645,
482 | FCC2,0642 062D,
483 | FCC3,0642 0645,
484 | FCC4,0643 062C,
485 | FCC5,0643 062D,
486 | FCC6,0643 062E,
487 | FCC7,0643 0644,
488 | FCC8,0643 0645,
489 | FCC9,0644 062C,
490 | FCCA,0644 062D,
491 | FCCB,0644 062E,
492 | FCCC,0644 0645,
493 | FCCD,0644 0647,
494 | FCCE,0645 062C,
495 | FCCF,0645 062D,
496 | FCD0,0645 062E,
497 | FCD1,0645 0645,
498 | FCD2,0646 062C,
499 | FCD3,0646 062D,
500 | FCD4,0646 062E,
501 | FCD5,0646 0645,
502 | FCD6,0646 0647,
503 | FCD7,0647 062C,
504 | FCD8,0647 0645,
505 | FCD9,0647 0670,
506 | FCDA,064A 062C,
507 | FCDB,064A 062D,
508 | FCDC,064A 062E,
509 | FCDD,064A 0645,
510 | FCDE,064A 0647,
511 | FCDF,0626 0645,
512 | FCE0,0626 0647,
513 | FCE1,0628 0645,
514 | FCE2,0628 0647,
515 | FCE3,062A 0645,
516 | FCE4,062A 0647,
517 | FCE5,062B 0645,
518 | FCE6,062B 0647,
519 | FCE7,0633 0645,
520 | FCE8,0633 0647,
521 | FCE9,0634 0645,
522 | FCEA,0634 0647,
523 | FCEB,0643 0644,
524 | FCEC,0643 0645,
525 | FCED,0644 0645,
526 | FCEE,0646 0645,
527 | FCEF,0646 0647,
528 | FCF0,064A 0645,
529 | FCF1,064A 0647,
530 | FCF2,0640 064E 0651,
531 | FCF3,0640 064F 0651,
532 | FCF4,0640 0650 0651,
533 | FCF5,0637 0649,
534 | FCF6,0637 064A,
535 | FCF7,0639 0649,
536 | FCF8,0639 064A,
537 | FCF9,063A 0649,
538 | FCFA,063A 064A,
539 | FCFB,0633 0649,
540 | FCFC,0633 064A,
541 | FCFD,0634 0649,
542 | FCFE,0634 064A,
543 | FCFF,062D 0649,
544 | FD00,062D 064A,
545 | FD01,062C 0649,
546 | FD02,062C 064A,
547 | FD03,062E 0649,
548 | FD04,062E 064A,
549 | FD05,0635 0649,
550 | FD06,0635 064A,
551 | FD07,0636 0649,
552 | FD08,0636 064A,
553 | FD09,0634 062C,
554 | FD0A,0634 062D,
555 | FD0B,0634 062E,
556 | FD0C,0634 0645,
557 | FD0D,0634 0631,
558 | FD0E,0633 0631,
559 | FD0F,0635 0631,
560 | FD10,0636 0631,
561 | FD11,0637 0649,
562 | FD12,0637 064A,
563 | FD13,0639 0649,
564 | FD14,0639 064A,
565 | FD15,063A 0649,
566 | FD16,063A 064A,
567 | FD17,0633 0649,
568 | FD18,0633 064A,
569 | FD19,0634 0649,
570 | FD1A,0634 064A,
571 | FD1B,062D 0649,
572 | FD1C,062D 064A,
573 | FD1D,062C 0649,
574 | FD1E,062C 064A,
575 | FD1F,062E 0649,
576 | FD20,062E 064A,
577 | FD21,0635 0649,
578 | FD22,0635 064A,
579 | FD23,0636 0649,
580 | FD24,0636 064A,
581 | FD25,0634 062C,
582 | FD26,0634 062D,
583 | FD27,0634 062E,
584 | FD28,0634 0645,
585 | FD29,0634 0631,
586 | FD2A,0633 0631,
587 | FD2B,0635 0631,
588 | FD2C,0636 0631,
589 | FD2D,0634 062C,
590 | FD2E,0634 062D,
591 | FD2F,0634 062E,
592 | FD30,0634 0645,
593 | FD31,0633 0647,
594 | FD32,0634 0647,
595 | FD33,0637 0645,
596 | FD34,0633 062C,
597 | FD35,0633 062D,
598 | FD36,0633 062E,
599 | FD37,0634 062C,
600 | FD38,0634 062D,
601 | FD39,0634 062E,
602 | FD3A,0637 0645,
603 | FD3B,0638 0645,
604 | FD3C,0627 064B,
605 | FD3D,0627 064B,
606 | FD50,062A 062C 0645,
607 | FD51,062A 062D 062C,
608 | FD52,062A 062D 062C,
609 | FD53,062A 062D 0645,
610 | FD54,062A 062E 0645,
611 | FD55,062A 0645 062C,
612 | FD56,062A 0645 062D,
613 | FD57,062A 0645 062E,
614 | FD58,062C 0645 062D,
615 | FD59,062C 0645 062D,
616 | FD5A,062D 0645 064A,
617 | FD5B,062D 0645 0649,
618 | FD5C,0633 062D 062C,
619 | FD5D,0633 062C 062D,
620 | FD5E,0633 062C 0649,
621 | FD5F,0633 0645 062D,
622 | FD60,0633 0645 062D,
623 | FD61,0633 0645 062C,
624 | FD62,0633 0645 0645,
625 | FD63,0633 0645 0645,
626 | FD64,0635 062D 062D,
627 | FD65,0635 062D 062D,
628 | FD66,0635 0645 0645,
629 | FD67,0634 062D 0645,
630 | FD68,0634 062D 0645,
631 | FD69,0634 062C 064A,
632 | FD6A,0634 0645 062E,
633 | FD6B,0634 0645 062E,
634 | FD6C,0634 0645 0645,
635 | FD6D,0634 0645 0645,
636 | FD6E,0636 062D 0649,
637 | FD6F,0636 062E 0645,
638 | FD70,0636 062E 0645,
639 | FD71,0637 0645 062D,
640 | FD72,0637 0645 062D,
641 | FD73,0637 0645 0645,
642 | FD74,0637 0645 064A,
643 | FD75,0639 062C 0645,
644 | FD76,0639 0645 0645,
645 | FD77,0639 0645 0645,
646 | FD78,0639 0645 0649,
647 | FD79,063A 0645 0645,
648 | FD7A,063A 0645 064A,
649 | FD7B,063A 0645 0649,
650 | FD7C,0641 062E 0645,
651 | FD7D,0641 062E 0645,
652 | FD7E,0642 0645 062D,
653 | FD7F,0642 0645 0645,
654 | FD80,0644 062D 0645,
655 | FD81,0644 062D 064A,
656 | FD82,0644 062D 0649,
657 | FD83,0644 062C 062C,
658 | FD84,0644 062C 062C,
659 | FD85,0644 062E 0645,
660 | FD86,0644 062E 0645,
661 | FD87,0644 0645 062D,
662 | FD88,0644 0645 062D,
663 | FD89,0645 062D 062C,
664 | FD8A,0645 062D 0645,
665 | FD8B,0645 062D 064A,
666 | FD8C,0645 062C 062D,
667 | FD8D,0645 062C 0645,
668 | FD8E,0645 062E 062C,
669 | FD8F,0645 062E 0645,
670 | FD92,0645 062C 062E,
671 | FD93,0647 0645 062C,
672 | FD94,0647 0645 0645,
673 | FD95,0646 062D 0645,
674 | FD96,0646 062D 0649,
675 | FD97,0646 062C 0645,
676 | FD98,0646 062C 0645,
677 | FD99,0646 062C 0649,
678 | FD9A,0646 0645 064A,
679 | FD9B,0646 0645 0649,
680 | FD9C,064A 0645 0645,
681 | FD9D,064A 0645 0645,
682 | FD9E,0628 062E 064A,
683 | FD9F,062A 062C 064A,
684 | FDA0,062A 062C 0649,
685 | FDA1,062A 062E 064A,
686 | FDA2,062A 062E 0649,
687 | FDA3,062A 0645 064A,
688 | FDA4,062A 0645 0649,
689 | FDA5,062C 0645 064A,
690 | FDA6,062C 062D 0649,
691 | FDA7,062C 0645 0649,
692 | FDA8,0633 062E 0649,
693 | FDA9,0635 062D 064A,
694 | FDAA,0634 062D 064A,
695 | FDAB,0636 062D 064A,
696 | FDAC,0644 062C 064A,
697 | FDAD,0644 0645 064A,
698 | FDAE,064A 062D 064A,
699 | FDAF,064A 062C 064A,
700 | FDB0,064A 0645 064A,
701 | FDB1,0645 0645 064A,
702 | FDB2,0642 0645 064A,
703 | FDB3,0646 062D 064A,
704 | FDB4,0642 0645 062D,
705 | FDB5,0644 062D 0645,
706 | FDB6,0639 0645 064A,
707 | FDB7,0643 0645 064A,
708 | FDB8,0646 062C 062D,
709 | FDB9,0645 062E 064A,
710 | FDBA,0644 062C 0645,
711 | FDBB,0643 0645 0645,
712 | FDBC,0644 062C 0645,
713 | FDBD,0646 062C 062D,
714 | FDBE,062C 062D 064A,
715 | FDBF,062D 062C 064A,
716 | FDC0,0645 062C 064A,
717 | FDC1,0641 0645 064A,
718 | FDC2,0628 062D 064A,
719 | FDC3,0643 0645 0645,
720 | FDC4,0639 062C 0645,
721 | FDC5,0635 0645 0645,
722 | FDC6,0633 062E 064A,
723 | FDC7,0646 062C 064A,
724 | FDF0,0635 0644 06D2,
725 | FDF1,0642 0644 06D2,
726 | FDF2,0627 0644 0644 0647,
727 | FDF3,0627 0643 0628 0631,
728 | FDF4,0645 062D 0645 062F,
729 | FDF5,0635 0644 0639 0645,
730 | FDF6,0631 0633 0648 0644,
731 | FDF7,0639 0644 064A 0647,
732 | FDF8,0648 0633 0644 0645,
733 | FDF9,0635 0644 0649,
734 | FDFA,0635 0644 0649 0020 0627 0644 0644 0647 0020 0639 0644 064A 0647 0020 0648 0633 0644 0645,
735 | FDFB,062C 0644 0020 062C 0644 0627 0644 0647,
736 | FDFC,0631 06CC 0627 0644,
737 | FE70,0020 064B,
738 | FE71,0640 064B,
739 | FE72,0020 064C,
740 | FE74,0020 064D,
741 | FE76,0020 064E,
742 | FE77,0640 064E,
743 | FE78,0020 064F,
744 | FE79,0640 064F,
745 | FE7A,0020 0650,
746 | FE7B,0640 0650,
747 | FE7C,0020 0651,
748 | FE7D,0640 0651,
749 | FE7E,0020 0652,
750 | FE7F,0640 0652,
751 | FE80,0621,
752 | FE81,0622,
753 | FE82,0622,
754 | FE83,0623,
755 | FE84,0623,
756 | FE85,0624,
757 | FE86,0624,
758 | FE87,0625,
759 | FE88,0625,
760 | FE89,0626,
761 | FE8A,0626,
762 | FE8B,0626,
763 | FE8C,0626,
764 | FE8D,0627,
765 | FE8E,0627,
766 | FE8F,0628,
767 | FE90,0628,
768 | FE91,0628,
769 | FE92,0628,
770 | FE93,0629,
771 | FE94,0629,
772 | FE95,062A,
773 | FE96,062A,
774 | FE97,062A,
775 | FE98,062A,
776 | FE99,062B,
777 | FE9A,062B,
778 | FE9B,062B,
779 | FE9C,062B,
780 | FE9D,062C,
781 | FE9E,062C,
782 | FE9F,062C,
783 | FEA0,062C,
784 | FEA1,062D,
785 | FEA2,062D,
786 | FEA3,062D,
787 | FEA4,062D,
788 | FEA5,062E,
789 | FEA6,062E,
790 | FEA7,062E,
791 | FEA8,062E,
792 | FEA9,062F,
793 | FEAA,062F,
794 | FEAB,0630,
795 | FEAC,0630,
796 | FEAD,0631,
797 | FEAE,0631,
798 | FEAF,0632,
799 | FEB0,0632,
800 | FEB1,0633,
801 | FEB2,0633,
802 | FEB3,0633,
803 | FEB4,0633,
804 | FEB5,0634,
805 | FEB6,0634,
806 | FEB7,0634,
807 | FEB8,0634,
808 | FEB9,0635,
809 | FEBA,0635,
810 | FEBB,0635,
811 | FEBC,0635,
812 | FEBD,0636,
813 | FEBE,0636,
814 | FEBF,0636,
815 | FEC0,0636,
816 | FEC1,0637,
817 | FEC2,0637,
818 | FEC3,0637,
819 | FEC4,0637,
820 | FEC5,0638,
821 | FEC6,0638,
822 | FEC7,0638,
823 | FEC8,0638,
824 | FEC9,0639,
825 | FECA,0639,
826 | FECB,0639,
827 | FECC,0639,
828 | FECD,063A,
829 | FECE,063A,
830 | FECF,063A,
831 | FED0,063A,
832 | FED1,0641,
833 | FED2,0641,
834 | FED3,0641,
835 | FED4,0641,
836 | FED5,0642,
837 | FED6,0642,
838 | FED7,0642,
839 | FED8,0642,
840 | FED9,0643,
841 | FEDA,0643,
842 | FEDB,0643,
843 | FEDC,0643,
844 | FEDD,0644,
845 | FEDE,0644,
846 | FEDF,0644,
847 | FEE0,0644,
848 | FEE1,0645,
849 | FEE2,0645,
850 | FEE3,0645,
851 | FEE4,0645,
852 | FEE5,0646,
853 | FEE6,0646,
854 | FEE7,0646,
855 | FEE8,0646,
856 | FEE9,0647,
857 | FEEA,0647,
858 | FEEB,0647,
859 | FEEC,0647,
860 | FEED,0648,
861 | FEEE,0648,
862 | FEEF,0649,
863 | FEF0,0649,
864 | FEF1,064A,
865 | FEF2,064A,
866 | FEF3,064A,
867 | FEF4,064A,
868 | FEF5,0644 0622,
869 | FEF6,0644 0622,
870 | FEF7,0644 0623,
871 | FEF8,0644 0623,
872 | FEF9,0644 0625,
873 | FEFA,0644 0625,
874 | FEFB,0644 0627,
875 | FEFC,0644 0627,
876 | FF01,0021,FullWidth
877 | FF02,0022,
878 | FF03,0023,
879 | FF04,0024,
880 | FF05,0025,
881 | FF06,0026,
882 | FF07,0027,
883 | FF08,0028,
884 | FF09,0029,
885 | FF0A,002A,
886 | FF0B,002B,
887 | FF0C,002C,
888 | FF0D,002D,
889 | FF0E,002E,
890 | FF0F,002F,
891 | FF10,0030,
892 | FF11,0031,
893 | FF12,0032,
894 | FF13,0033,
895 | FF14,0034,
896 | FF15,0035,
897 | FF16,0036,
898 | FF17,0037,
899 | FF18,0038,
900 | FF19,0039,
901 | FF1A,003A,
902 | FF1B,003B,
903 | FF1C,003C,
904 | FF1D,003D,
905 | FF1E,003E,
906 | FF1F,003F,
907 | FF20,0040,
908 | FF21,0041,
909 | FF22,0042,
910 | FF23,0043,
911 | FF24,0044,
912 | FF25,0045,
913 | FF26,0046,
914 | FF27,0047,
915 | FF28,0048,
916 | FF29,0049,
917 | FF2A,004A,
918 | FF2B,004B,
919 | FF2C,004C,
920 | FF2D,004D,
921 | FF2E,004E,
922 | FF2F,004F,
923 | FF30,0050,
924 | FF31,0051,
925 | FF32,0052,
926 | FF33,0053,
927 | FF34,0054,
928 | FF35,0055,
929 | FF36,0056,
930 | FF37,0057,
931 | FF38,0058,
932 | FF39,0059,
933 | FF3A,005A,
934 | FF3B,005B,
935 | FF3C,005C,
936 | FF3D,005D,
937 | FF3E,005E,
938 | FF3F,005F,
939 | FF40,0060,
940 | FF41,0061,
941 | FF42,0062,
942 | FF43,0063,
943 | FF44,0064,
944 | FF45,0065,
945 | FF46,0066,
946 | FF47,0067,
947 | FF48,0068,
948 | FF49,0069,
949 | FF4A,006A,
950 | FF4B,006B,
951 | FF4C,006C,
952 | FF4D,006D,
953 | FF4E,006E,
954 | FF4F,006F,
955 | FF50,0070,
956 | FF51,0071,
957 | FF52,0072,
958 | FF53,0073,
959 | FF54,0074,
960 | FF55,0075,
961 | FF56,0076,
962 | FF57,0077,
963 | FF58,0078,
964 | FF59,0079,
965 | FF5A,007A,
966 | FF5B,007B,
967 | FF5C,007C,
968 | FF5D,007D,
969 | FF5E,007E,
970 | FF5F,2985,
971 | FF60,2986,
972 | FFE0,00A2,
973 | FFE1,00A3,
974 | FFE2,00AC,
975 | FFE3,00AF,
976 | FFE4,00A6,
977 | FFE5,00A5,
978 | FFE6,20A9,
979 | FFF0,0020,Specials
980 | FFF1,0020,
981 | FFF2,0020,
982 | FFF3,0020,
983 | FFF4,0020,
984 | FFF5,0020,
985 | FFF6,0020,
986 | FFF7,0020,
987 | FFF8,0020,
988 | FFF9,0020,
989 | FFFA,0020,
990 | FFFB,0020,
991 | FFFC,0020,
992 | FFFD,0020,
993 | FFFE,0020,
994 | FFFF,0020
--------------------------------------------------------------------------------
/resources/Phoneme2Ascii.csv:
--------------------------------------------------------------------------------
1 | Phoneme,ASCII
2 | ʔ,EH
3 | a,AA
4 | b,B
5 | p,P
6 | t,T
7 | c,JE
8 | ç,CH
9 | ḧ,HE
10 | x,X
11 | d,D
12 | r,R
13 | ř,RR
14 | z,Z
15 | j,ZH
16 | s,S
17 | ş,SH
18 | ƹ,AH
19 | ẍ,XE
20 | f,F
21 | v,V
22 | q,Q
23 | k,K
24 | g,G
25 | l,L
26 | ł,LL
27 | m,M
28 | n,N
29 | o,O
30 | e,A
31 | h,H
32 | ê,E
33 | î,I
34 | y,Y
35 | w,W
36 | u,U
37 | û,UU
--------------------------------------------------------------------------------
/resources/Phoneme2IPA.csv:
--------------------------------------------------------------------------------
1 | Phoneme,IPA
2 | ng,ŋg
3 | ʔ,ʔ
4 | b,b
5 | p,p
6 | t,t
7 | c,d͡ʒ
8 | ç,t͡ʃ
9 | ḧ,ħ
10 | x,x
11 | d,d
12 | r,ɾ
13 | ř,r
14 | z,z
15 | j,ʒ
16 | s,s
17 | ş,ʃ
18 | ƹ,ʕ
19 | ẍ,ɣ
20 | f,f
21 | v,v
22 | q,q
23 | k,k
24 | g,g
25 | l,l
26 | ł,ɫ
27 | m,m
28 | n,n
29 | w,w
30 | u,ʊ
31 | û,u
32 | o,o̞
33 | h,h
34 | y,j
35 | a,ä
36 | e,a
37 | ê,ɛ
38 | i,ɪ
39 | î,i
40 | ĝ,ŋ
41 | đ,đ
42 | ü,y
43 | ô,ô
44 | õ,õ
--------------------------------------------------------------------------------
/resources/PoemPatterns.csv:
--------------------------------------------------------------------------------
1 | Frequency,WeightPattern,Title
2 | 1044,–∪–––∪–––∪–––∪–,فاعلاتن فاعلاتن فاعلاتن فاعلن
3 | 999,∪–––∪–––∪–––∪–––,مفاعیلن مفاعیلن مفاعیلن مفاعیلن
4 | 386,∪–––∪–––∪––,مفاعیلن مفاعیلن فعولن
5 | 334,––∪∪––∪∪––∪∪––,مفعولُ مفاعیلُ مفاعیلُ فعولن
6 | 272,––∪∪––∪∪––∪∪–,مفعولُ مفاعیلُ مفاعیلُ فعل
7 | 213,––∪–∪–∪∪––∪–∪–,مفعولُ فاعلاتُ مفاعیلُ فاعلن
8 | 138,∪∪––∪∪––∪∪––∪∪–,فعلاتن فعلاتن فعلاتن فعلن
9 | 131,––∪∪–∪–∪––,مفعولُ مفاعلن فعولن
10 | 62,–∪–––∪–––∪–,فاعلاتن فاعلاتن فاعلن
11 | 45,∪∪––∪–∪–∪∪–,فعلاتن مفاعلن فعلن
12 | 40,∪–∪–∪∪––∪–∪–∪∪–,مفاعلن فعلاتن مفاعلن فعلن
13 | 31,––∪∪–––––∪∪–––,مفعولُ مفاعیلن مفعولُ مفاعیلن
14 | 28,––∪–∪––––∪–∪––,مفعولُ فاعلاتن مفعولُ فاعلاتن
15 | 20,∪∪––∪∪––∪∪–,فعلاتن فعلاتن فعلن
16 | 19,––∪–––∪–––∪–––∪–,مستفعلن مستفعلن مستفعلن مستفعلن
17 | 14,∪––∪––∪––∪–,فعولن فعولن فعولن فعل
18 | 13,∪–∪–∪––∪–∪–∪––,مفاعلن فعولن مفاعلن فعولن
19 | 9,–∪∪––∪––∪∪––∪–,مفتعلن فاعلن مفتعلن فاعلن
20 | 8,–∪∪––∪∪––∪–,مفتعلن مفتعلن فاعلن
21 | 8,–∪–––∪–––∪–––∪––,فاعلاتن فاعلاتن فاعلاتن فاعلاتن
22 | 7,∪–∪–∪–∪–∪–∪–∪–∪–,مفاعلن مفاعلن مفاعلن مفاعلن
23 | 7,–∪∪–∪–∪––∪∪–∪–∪–,مفتعلن مفاعلن مفتعلن مفاعلن
24 | 6,∪––∪––∪––∪––,فعولن فعولن فعولن فعولن
25 | 5,∪––∪∪––∪∪––∪∪––,مفاعیلُ مفاعیلُ مفاعیلُ فعولن
26 | 3,∪∪–∪–∪∪–∪–∪∪–∪–∪∪–∪–,متفاعلن متفاعلن متفاعلن متفاعلن
27 | 2,∪∪–∪–∪––∪∪–∪–∪––,فعلاتُ فاعلاتن فعلاتُ فاعلاتن
28 | 2,∪–∪–∪∪––∪–∪–∪∪––,مفاعلن فعلاتن مفاعلن فعلاتن
--------------------------------------------------------------------------------