├── .gitattributes
├── .gitconfig
├── .gitignore
├── CODE_OF_CONDUCT.md
├── LICENSE
├── NOTICE.md
├── README.md
├── SECURITY.md
├── cli
    ├── Program.cs
    └── README.txt
├── factored-segmenter.csproj
├── factored-segmenter.sln
├── lib
    ├── README.txt
    ├── RemoveThese.cs
    ├── SentencePieceInterop.cs
    ├── TextHelpers.cs
    ├── Types.cs
    └── Utils.cs
├── spm
    ├── CMakeLists.txt
    ├── SentencePieceInterop.cpp
    ├── spm.model
    ├── spm.vocab
    └── unicode_conversions.h
├── src
    ├── FactoredSegmenter.cs
    ├── FactoredSegmenterConfigs.cs
    ├── FactoredSegmenterScriptHelpers.cs
    ├── ProcessTools.cs
    ├── README.txt
    ├── SegmenterRuntime.cs
    ├── SentencePieceConfigs.cs
    ├── SentencePieceManaged.cs
    └── SentencePieceWrapper.cs
└── test
    ├── FactoredSegmenterScriptHelpersTests.cs
    ├── FactoredSegmenterTests.cs
    └── blns
        ├── blns.txt
        └── blns_README.txt


/.gitattributes:
--------------------------------------------------------------------------------
1 | * -text
2 | * diff
3 | .cer -diff
4 | .qencr -diff
5 | .dat -diff


--------------------------------------------------------------------------------
/.gitconfig:
--------------------------------------------------------------------------------
1 | # This file is *NOT* read by git by default (due to security issues)
2 | # So for mtmain, we read it in using MyInit.cmd
3 | 
4 | # For documentation on the settings in this file, see https://git-scm.com/book/en/v2/Customizing-Git-Git-Configuration
5 | 
6 | # Don't auto-convert CRLF->LF on git add and CRLF->LR on git checkout (just leave the line-endings alone!)
7 | [core]
8 |         autocrlf = false


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | ## Ignore Visual Studio temporary files, build results, and
  2 | ## files generated by popular Visual Studio add-ons.
  3 | ##
  4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
  5 | 
  6 | # User-specific files
  7 | *.rsuser
  8 | *.suo
  9 | *.user
 10 | *.userosscache
 11 | *.sln.docstates
 12 | 
 13 | # User-specific files (MonoDevelop/Xamarin Studio)
 14 | *.userprefs
 15 | 
 16 | # Mono auto generated files
 17 | mono_crash.*
 18 | 
 19 | # Build results
 20 | [Dd]ebug/
 21 | [Dd]ebugPublic/
 22 | [Rr]elease/
 23 | [Rr]eleases/
 24 | x64/
 25 | x86/
 26 | [Aa][Rr][Mm]/
 27 | [Aa][Rr][Mm]64/
 28 | bld/
 29 | [Bb]in/
 30 | [Oo]bj/
 31 | [Ll]og/
 32 | [Ll]ogs/
 33 | 
 34 | # Visual Studio 2015/2017 cache/options directory
 35 | .vs/
 36 | # Uncomment if you have tasks that create the project's static files in wwwroot
 37 | #wwwroot/
 38 | 
 39 | # Visual Studio 2017 auto generated files
 40 | Generated\ Files/
 41 | 
 42 | # MSTest test Results
 43 | [Tt]est[Rr]esult*/
 44 | [Bb]uild[Ll]og.*
 45 | 
 46 | # NUnit
 47 | *.VisualState.xml
 48 | TestResult.xml
 49 | nunit-*.xml
 50 | 
 51 | # Build Results of an ATL Project
 52 | [Dd]ebugPS/
 53 | [Rr]eleasePS/
 54 | dlldata.c
 55 | 
 56 | # Benchmark Results
 57 | BenchmarkDotNet.Artifacts/
 58 | 
 59 | # .NET Core
 60 | project.lock.json
 61 | project.fragment.lock.json
 62 | artifacts/
 63 | **/Properties/launchSettings.json
 64 | 
 65 | # StyleCop
 66 | StyleCopReport.xml
 67 | 
 68 | # Files built by Visual Studio
 69 | *_i.c
 70 | *_p.c
 71 | *_i.h
 72 | *_h.h
 73 | *.ilk
 74 | *.meta
 75 | *.obj
 76 | *.iobj
 77 | *.pch
 78 | *.pdb
 79 | *.ipdb
 80 | *.pgc
 81 | *.pgd
 82 | *.rsp
 83 | *.sbr
 84 | *.tlb
 85 | *.tli
 86 | *.tlh
 87 | *.tmp
 88 | *.tmp_proj
 89 | *_wpftmp.csproj
 90 | *.log
 91 | *.vspscc
 92 | *.vssscc
 93 | .builds
 94 | *.pidb
 95 | *.svclog
 96 | *.scc
 97 | 
 98 | # Chutzpah Test files
 99 | _Chutzpah*
100 | 
101 | # Visual C++ cache files
102 | ipch/
103 | *.aps
104 | *.ncb
105 | *.opendb
106 | *.opensdf
107 | *.sdf
108 | *.cachefile
109 | *.VC.db
110 | *.VC.VC.opendb
111 | 
112 | # Visual Studio profiler
113 | *.psess
114 | *.vsp
115 | *.vspx
116 | *.sap
117 | 
118 | # Visual Studio Trace Files
119 | *.e2e
120 | 
121 | # TFS 2012 Local Workspace
122 | $tf/
123 | 
124 | # Guidance Automation Toolkit
125 | *.gpState
126 | 
127 | # ReSharper is a .NET coding add-in
128 | _ReSharper*/
129 | *.[Rr]e[Ss]harper
130 | *.DotSettings.user
131 | 
132 | # JustCode is a .NET coding add-in
133 | .JustCode
134 | 
135 | # TeamCity is a build add-in
136 | _TeamCity*
137 | 
138 | # DotCover is a Code Coverage Tool
139 | *.dotCover
140 | 
141 | # AxoCover is a Code Coverage Tool
142 | .axoCover/*
143 | !.axoCover/settings.json
144 | 
145 | # Visual Studio code coverage results
146 | *.coverage
147 | *.coveragexml
148 | 
149 | # NCrunch
150 | _NCrunch_*
151 | .*crunch*.local.xml
152 | nCrunchTemp_*
153 | 
154 | # MightyMoose
155 | *.mm.*
156 | AutoTest.Net/
157 | 
158 | # Web workbench (sass)
159 | .sass-cache/
160 | 
161 | # Installshield output folder
162 | [Ee]xpress/
163 | 
164 | # DocProject is a documentation generator add-in
165 | DocProject/buildhelp/
166 | DocProject/Help/*.HxT
167 | DocProject/Help/*.HxC
168 | DocProject/Help/*.hhc
169 | DocProject/Help/*.hhk
170 | DocProject/Help/*.hhp
171 | DocProject/Help/Html2
172 | DocProject/Help/html
173 | 
174 | # Click-Once directory
175 | publish/
176 | 
177 | # Publish Web Output
178 | *.[Pp]ublish.xml
179 | *.azurePubxml
180 | # Note: Comment the next line if you want to checkin your web deploy settings,
181 | # but database connection strings (with potential passwords) will be unencrypted
182 | *.pubxml
183 | *.publishproj
184 | 
185 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
186 | # checkin your Azure Web App publish settings, but sensitive information contained
187 | # in these scripts will be unencrypted
188 | PublishScripts/
189 | 
190 | # NuGet Packages
191 | *.nupkg
192 | # NuGet Symbol Packages
193 | *.snupkg
194 | # The packages folder can be ignored because of Package Restore
195 | **/[Pp]ackages/*
196 | # except build/, which is used as an MSBuild target.
197 | !**/[Pp]ackages/build/
198 | # Uncomment if necessary however generally it will be regenerated when needed
199 | #!**/[Pp]ackages/repositories.config
200 | # NuGet v3's project.json files produces more ignorable files
201 | *.nuget.props
202 | *.nuget.targets
203 | 
204 | # Microsoft Azure Build Output
205 | csx/
206 | *.build.csdef
207 | 
208 | # Microsoft Azure Emulator
209 | ecf/
210 | rcf/
211 | 
212 | # Windows Store app package directories and files
213 | AppPackages/
214 | BundleArtifacts/
215 | Package.StoreAssociation.xml
216 | _pkginfo.txt
217 | *.appx
218 | *.appxbundle
219 | *.appxupload
220 | 
221 | # Visual Studio cache files
222 | # files ending in .cache can be ignored
223 | *.[Cc]ache
224 | # but keep track of directories ending in .cache
225 | !*.[Cc]ache/
226 | !?*.[Cc]ache/
227 | 
228 | # Others
229 | ClientBin/
230 | ~$*
231 | *~
232 | *.dbmdl
233 | *.dbproj.schemaview
234 | *.jfm
235 | *.pfx
236 | *.publishsettings
237 | orleans.codegen.cs
238 | 
239 | # Including strong name files can present a security risk
240 | # (https://github.com/github/gitignore/pull/2483#issue-259490424)
241 | #*.snk
242 | 
243 | # Since there are multiple workflows, uncomment next line to ignore bower_components
244 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
245 | #bower_components/
246 | 
247 | # RIA/Silverlight projects
248 | Generated_Code/
249 | 
250 | # Backup & report files from converting an old project file
251 | # to a newer Visual Studio version. Backup files are not needed,
252 | # because we have git ;-)
253 | _UpgradeReport_Files/
254 | Backup*/
255 | UpgradeLog*.XML
256 | UpgradeLog*.htm
257 | ServiceFabricBackup/
258 | *.rptproj.bak
259 | 
260 | # SQL Server files
261 | *.mdf
262 | *.ldf
263 | *.ndf
264 | 
265 | # Business Intelligence projects
266 | *.rdl.data
267 | *.bim.layout
268 | *.bim_*.settings
269 | *.rptproj.rsuser
270 | *- [Bb]ackup.rdl
271 | *- [Bb]ackup ([0-9]).rdl
272 | *- [Bb]ackup ([0-9][0-9]).rdl
273 | 
274 | # Microsoft Fakes
275 | FakesAssemblies/
276 | 
277 | # GhostDoc plugin setting file
278 | *.GhostDoc.xml
279 | 
280 | # Node.js Tools for Visual Studio
281 | .ntvs_analysis.dat
282 | node_modules/
283 | 
284 | # Visual Studio 6 build log
285 | *.plg
286 | 
287 | # Visual Studio 6 workspace options file
288 | *.opt
289 | 
290 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
291 | *.vbw
292 | 
293 | # Visual Studio LightSwitch build output
294 | **/*.HTMLClient/GeneratedArtifacts
295 | **/*.DesktopClient/GeneratedArtifacts
296 | **/*.DesktopClient/ModelManifest.xml
297 | **/*.Server/GeneratedArtifacts
298 | **/*.Server/ModelManifest.xml
299 | _Pvt_Extensions
300 | 
301 | # Paket dependency manager
302 | .paket/paket.exe
303 | paket-files/
304 | 
305 | # FAKE - F# Make
306 | .fake/
307 | 
308 | # JetBrains Rider
309 | .idea/
310 | *.sln.iml
311 | 
312 | # CodeRush
313 | .cr/
314 | # CodeRush personal settings
315 | .cr/personal
316 | 
317 | # Python Tools for Visual Studio (PTVS)
318 | __pycache__/
319 | *.pyc
320 | 
321 | # Cake - Uncomment if you are using it
322 | # tools/**
323 | # !tools/packages.config
324 | 
325 | # Tabs Studio
326 | *.tss
327 | 
328 | # Telerik's JustMock configuration file
329 | *.jmconfig
330 | 
331 | # BizTalk build output
332 | *.btp.cs
333 | *.btm.cs
334 | *.odx.cs
335 | *.xsd.cs
336 | 
337 | # OpenCover UI analysis results
338 | OpenCover/
339 | 
340 | # Azure Stream Analytics local run output
341 | ASALocalRun/
342 | 
343 | # MSBuild Binary and Structured Log
344 | *.binlog
345 | 
346 | # NVidia Nsight GPU debugger configuration file
347 | *.nvuser
348 | 
349 | # MFractors (Xamarin productivity tool) working folder
350 | .mfractor/
351 | 
352 | # Local History for Visual Studio
353 | .localhistory/
354 | 
355 | # BeatPulse healthcheck temp database
356 | healthchecksdb
357 | 
358 | # Backup folder for Package Reference Convert tool in Visual Studio 2017
359 | MigrationBackup/
360 | 
361 | # Ionide (cross platform F# VS Code tools) working folder
362 | .ionide/
363 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Microsoft Open Source Code of Conduct
 2 | 
 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
 4 | 
 5 | Resources:
 6 | 
 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |     MIT License
 2 | 
 3 |     Copyright (c) Microsoft Corporation.
 4 | 
 5 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 6 |     of this software and associated documentation files (the "Software"), to deal
 7 |     in the Software without restriction, including without limitation the rights
 8 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 |     copies of the Software, and to permit persons to whom the Software is
10 |     furnished to do so, subject to the following conditions:
11 | 
12 |     The above copyright notice and this permission notice shall be included in all
13 |     copies or substantial portions of the Software.
14 | 
15 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 |     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 |     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 |     SOFTWARE
22 | 


--------------------------------------------------------------------------------
/NOTICE.md:
--------------------------------------------------------------------------------
 1 | # THIRD-PARTY SOFTWARE NOTICES AND INFORMATION
 2 | Do Not Translate or Localize  
 3 | 
 4 | This project is based on or incorporates material from the projects listed below (Third Party IP). The original copyright notice and the license under which Microsoft received such Third Party IP, are set forth below. Such licenses and notices are provided for informational purposes only. Where permitted, Microsoft licenses the Third Party IP to you under the licensing terms for the Microsoft product. Microsoft reserves all other rights not expressly granted under this agreement, whether by implication, estoppel or otherwise.
 5 | 
 6 | **a. Big List of Naughty Strings**
 7 | 
 8 | In the test suite, `./test/blns/blns.txt` is a near copy of the Big List of Naughty Strings [https://github.com/minimaxir/big-list-of-naughty-strings].
 9 | 
10 | Original license [https://github.com/minimaxir/big-list-of-naughty-strings/blob/master/LICENSE]:
11 | 
12 | The MIT License (MIT)
13 | 
14 | Copyright (c) 2015-2020 Max Woolf
15 | 
16 | Permission is hereby granted, free of charge, to any person obtaining a copy
17 | of this software and associated documentation files (the "Software"), to deal
18 | in the Software without restriction, including without limitation the rights
19 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
20 | copies of the Software, and to permit persons to whom the Software is
21 | furnished to do so, subject to the following conditions:
22 | 
23 | The above copyright notice and this permission notice shall be included in all
24 | copies or substantial portions of the Software.
25 | 
26 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
27 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
28 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
29 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
30 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
31 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32 | SOFTWARE.
33 | 
34 | **b. C++ REST SDK**
35 | 
36 | `./spm/unicode_conversions.h` is an excerpt from `./Release/src/utilities/asyncrt_utils.cpp` in the C++ REST SDK [https://github.com/microsoft/cpprestsdk].
37 | 
38 | Original license [https://github.com/microsoft/cpprestsdk/blob/master/license.txt]:
39 | 
40 | C++ REST SDK 
41 | 
42 | The MIT License (MIT)
43 | 
44 | Copyright (c) Microsoft Corporation
45 | 
46 | All rights reserved.
47 | 
48 | Permission is hereby granted, free of charge, to any person obtaining a copy of
49 | this software and associated documentation files (the "Software"), to deal in
50 | the Software without restriction, including without limitation the rights to
51 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
52 | the Software, and to permit persons to whom the Software is furnished to do so,
53 | subject to the following conditions:
54 | 
55 | The above copyright notice and this permission notice shall be included in all
56 | copies or substantial portions of the Software.
57 | 
58 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
59 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
60 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
61 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
62 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
63 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
64 | SOFTWARE.
65 | 
66 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # FactoredSegmenter
  2 | 
  3 | FactoredSegmenter is the unsupervised text tokenizer for machine translation that aims at _factoring shared properties of words_, such as casing or spacing, and underlies Microsoft Translator.
  4 | It encodes tokens in the form `WORDPIECE|factor1|factor2|...|factorN`.
  5 | This encoding syntax is directly understood by the [Marian Neural Machine Translation Toolkit](https://github.com/marian-nmt/marian).
  6 | To use FactoredSegmenter with other toolkits, one must implement a parser for this format, modify the embedding lookup and, to use factors on the target side, the beam decoder.
  7 | The term "FactoredSegmenter" refers to both a segmentation library and an encoding of text.
  8 | 
  9 | FactoredSegmenter segments words into subwords, or _word pieces_, using the popular [SentencePiece](https://github.com/google/sentencepiece) library under the hood.
 10 | However, unlike SentencePiece in its common usage, spaces and capitalization are not encoded in the sub-word tokens themselves.
 11 | Instead, spacing and capitalization are encoded in _factors_ that are attached to each token.
 12 | The purpose of this is to allow the sharing of model parameters across all occurences of a word, be it
 13 | in the middle of a sentence, capitalized at the start of a sentence, at the start of a sentence enclosed in parentheses or quotation marks, or in all-caps in a social-media rant.
 14 | In SentencePiece, these are all distinct tokens, which is less robust.
 15 | For example, this distinction leads to poor translation accuracy for all-caps sentences, which is problematic when translating social-media posts.
 16 | 
 17 | #### Features of FactoredSegmenter
 18 | 
 19 |  * represents words and tokens as _tuples of factors_ to allow for parameter sharing. E.g. spacing and capitalization are separate factors on word pieces. An NMT training tool would form token embeddings by summing or concatenating embeddings of all factors in the factor tuple;
 20 |  * infrequent words are represented by _subwords_, aka word pieces, using the SentencePiece library;
 21 |  * robust treatment of _numerals_: Each digit is always split as its own token, in any writing system. We have observed that this reliably fixes a large class of translation errors for numerals, especially when translating between different numeric systems (such as Arabic numbers to Chinese);
 22 |  * support for _"phrase fixing,"_ where specific phrases are required to be translated in a very specific way. Such constrained translation is achieved with FactoredSegmenter by either replacing such phrases by a fixed token (where a factor is used to distinguish multiple such phrase fixes in a single sentence), or by inserting the desired target translation directly into the encoded source, where factors are used to distinguish the source from the target translation;
 23 |  * unknown-character handling: characters not covered by the word-piece vocabulary, for example rare Emojis, are encoded by their Unicode character code in a form that a translation system can learn to copy through;
 24 |  * round-trippable: allows to fully reconstruct the source sentence from the factored (sub)word representation (with minor exceptions);
 25 |  * support of continuous scripts, which have different rules for spacing, and combining marks.
 26 | 
 27 | ## Factors
 28 | 
 29 | Let's randomly pick a word of recent prominence, say "hydroxychloroquine." First, observe that whether it occurs at the beginning of the word (where it would normally be capitalized) or within the sentence, or whether it appears after a quotation mark (where it is lower-case but there is no space before it), it is still the same word, and it seems desirable to share embedding parameters across all four cases to some degree. Secondly, note that since "hydroxychloroquine" is a word rarely seen until recently, it may not have been seen frequently enough after a quotation mark to get its own token. Hence, in that situation it would not only not share its embedding, but it also may be segmented differently altogether from the other cases.
 30 | 
 31 | FactoredSegmenter attempts to remedy this problem by representing each (sub)word as a tuple. For example, "hydroxychloroquine" at sentence start would be represented by a tuple
 32 | that might be written in pseudo-code as
 33 | ```
 34 | {
 35 |     lemma = "hydroxychloroquine",
 36 |     capitalization = CAP_INITIAL,
 37 |     isWordBeginning = WORDBEG_YES,
 38 |     isWordEnd = WORDEND_YES
 39 | }
 40 | ```
 41 | Each tuple member is called a _factor_. The subword identity itself ("hydroxychloroquine") is also represented by a factor, which we call the _lemma_, meaning that it is the base form that may be modified by factors (this is inspired by the linguistic term [lemma](https://simple.wikipedia.org/wiki/Lemma_(linguistics)), which is a base form that gets modified by inflections).
 42 | In machine translation, the embedding of the tuple would be formed by composing embedding vectors for each individual factor in the tuple, e.g. by summing or concatenating them.
 43 | 
 44 | A factor has a type and a value. While the lemma is a string, the `capitalization` factor above is an enumeration with three values, representing three kinds of capitalization: capitalized first letter (beginning of a capitalized word, using the symbol `CAP_INITIAL`), all-caps (`CAP_ALL`), and no capitalized letters at all (a regular all-lowercase word, `CAP_NONE`). To represent mixed-case words, e.g. RuPaul, we break them into subwords. `isWordBeginning` is conceptually a boolean, but for simplicity, we give each factor a unique data type, so `isWordBeginning` is an enum with two values, `WORDBEG_YES` and `WORDBEG_NO`. Likewise for `isWordEnd`.
 45 | 
 46 | Different lemmas can have different factor sets. For example, digits and punctuation cannot be capitalized,
 47 | hence those lemmas not have a capitalization factor. However, for a given lemma, the set of factors is always the same.
 48 | The specific set of factors of a lemma is determined from heuristics represented in the FactoredSegmenter code, with some configurability via options.
 49 | 
 50 | For infrequent words or morphological variants, FactoredSegmenter supports subword units. A subword unit is used when a word is unseen in the training, or not seen often enough. FactoredSegmenter relies on the excellent SentencePiece library for determining suitable subword units.
 51 | 
 52 | For example, "hydroxychloroquine" might be rare enough to be represented by subwords, such as "hydro" + "xy" + "chloroquine". It would be represented as a sequence of three tuples:
 53 | ```
 54 | {
 55 |     lemma = "hydro",
 56 |     capitalization = CAP_INITIAL,
 57 |     isWordBeginning = WORDBEG_YES,
 58 |     isWordEnd = WORDEND_NO
 59 | },
 60 | {
 61 |     lemma = "xy",
 62 |     capitalization = CAP_NONE,
 63 |     isWordBeginning = WORDBEG_NO,
 64 |     isWordEnd = WORDEND_NO
 65 | },
 66 | {
 67 |     lemma = "chloroquine",
 68 |     capitalization = CAP_NONE,
 69 |     isWordBeginning = WORDBEG_NO,
 70 |     isWordEnd = WORDEND_YES
 71 | }
 72 | ```
 73 | The subword nature of the tuples is represented by the `isWordBeginning` and `isWordEnd` factors.
 74 | 
 75 | #### Factor Syntax
 76 | 
 77 | When written to a text file or when communicated to an NMT training toolkit, factor tuples are represented as strings following a specific syntax:
 78 | The factor values are concatenated, separated by vertical bars. A direct concatenation of the above example would give `hydroxychloroquine|CAP_INITIAL|WORDBEG_YES|WORDEND_YES`.
 79 | However, to avoid to dramatically increase data-file sizes, factors use short-hand notations when serialized. Also, to make those files a little more readable to us humans, lemmas are written in all-caps, while factors use lowercase (this also avoids name conflicts between factor names and real words). If "hydroxychloroquine" is a single word piece, the actual form as written to file of the above is:
 80 | ```
 81 | HYDROXYCHLORIQUINE|ci|wb|we
 82 | ```
 83 | The example above where it is represented by multiple subword units has the following serialized form:
 84 | ```
 85 | HYDRO|ci|wb|wen XY|cn|wbn|wen CHLOROQUINE|cn|wbn|we
 86 | ```
 87 | Any character that may be used as part of this syntax is escaped as a hex code. For example, if the vertical bar character itself was the lemma, it would be serialized as `\x7c`.
 88 | 
 89 | #### Representation of Space Between Tokens
 90 | 
 91 | If you are familiar with SentencePiece, you will notice that the tuples above do not directly encode whether there is a space before or after the word. Instead, it is encoded as factors whether a token is at the _boundary_ (beginning/end) of a word. For single-word tokens, both flags are true. Most of the time, a word boundary implies a spaces, but not always. For example, a word in quotation marks would not be enclosed in spaces; rather, the quotation marks would. For example, the sequence "Hydroxychloroquine works" would be encoded as:
 92 | ```
 93 | HYDRO|ci|wb|wen XY|cn|wbn|wen CHLOROQUINE|cn|wbn|we WORKS|cn|wb|we
 94 | ```
 95 | without explicit factors for spaces; rather, the space between "hydroxychloroquine" and "works" is implied by the word-boundary factors.
 96 | 
 97 | Hence, words do not carry factors determining space directly. Rather, spacing-related factors are carried by _punctuation marks_. By default, there is always a space at word boundaries, but punctuation carries factors stating whether a space surrounding the punctuation should rather be _elided_, whether the punctuation should be _glued_ to the surrounding token(s). For example, in the sentence "Hydroxychloroquine works!", the sentence-final exclamation point is glued to the word to the left, and would be represented by the following factor tuple:
 98 | ```
 99 | {
100 |     lemma = "!",
101 |     glueLeft = GLUE_LEFT_YES,
102 |     glueRight = GLUE_RIGHT_NO
103 | }
104 | ```
105 | The `glueLeft` factor indicates that the default space after `works` should be elided.
106 | The short-hand form that is used when writing to file is `gl+` and `gl-` and likewise `gr+` and `gr-`. The full sequence would be encoded as:
107 | ```
108 | HYDRO|ci|wb|wen XY|cn|wbn|wen CHLOROQUINE|cn|wbn|we WORKS|cn|wb|we !|gl+|gr-
109 | ```
110 | Note that the short-hands for boolean-like factors are a little inconsistent for historical reasons. Note also that this documentation makes no claims regarding the veracity of its example sentences.
111 | 
112 | #### Round-Trippability
113 | 
114 | An important property of the factor representation is that it allows to fully reconstruct the original input text, it is fully _round-trippable_. If we encode a text as factor tuples, and then decode it, the result will be the original input string. FactoredSegmenter is used in machine translation by training the translation system to translate text in factor representation to text in the target language that is likewise in factor representation. The final surface form is then recreated by decoding factor representation in the target language.
115 | 
116 | There are few exception to round-trippability. To support specifying specific translations for words ("phrase fixing"), FactoredSegmenter can replace token ranges by special placeholders that get translated as such. Alternatively, it can include the given target translation in the source string, using special factors or marker tags. The identity of such a token would get lost in the factored representation (instead, the translation system would remember its identity as side information). The C# API also allows replacing arbitrary character ranges on the fly (the original characters get lost).
117 | 
118 | Lastly, it should be noted that the specific factor sets depend on configuration variables. For example, empirically we found no practical benefit in the `isWordEnd` factor, so this is typically disabled by a configuration setting.
119 | 
120 | ## FactoredSegmenter in Code
121 | 
122 | FactoredSegmenter is manifested in code in two different ways. First, in the form of a C# library which allows to execute all functions, that is, training, encoding, and decoding. For example, each time a user invokes Microsoft Translator, e.g. via http://translate.bing.com, FactoredSegmenter is invoked via the C# interface twice, once to encode the source sentence, and once to decode the translation.
123 | 
124 | Secondly, a Linux command-line tool gives access to most of the library functions. This is used for training FactoredSegmenter models (subword representations), and it allows to build offline systems using the factored-segmenter tool and Marian alone.
125 | 
126 | ## Training and Factor Configuration
127 | 
128 | The FactoredSegmenter representation is rule-based, except for the subword units, which are based on SentencePiece. Hence, before one can tokenize text with FactoredSegmenter, a _FactoredSegmenter model_ must be trained. The training process first pre-tokenizes the input into units of consistent letter type, and then execute SentencePiece training on the resulting tokens. The result of the training process are two files:
129 | 
130 |  * an `.fsm` file, for "factored-segmenter model." An `.fsm` file contains everything needed to encode and decode. It holds all configuration options, the factor specification (which lemma has what factors), subword inventories, and also embeds the binary SentencePiece model for subword splitting.
131 |  * an `.fsv` file, for "factored-segmenter vocabulary." The `.fsv` file holds the subset of the `.fsm` model that is needed by the translation software (Marian) to interpret the factor representation.
132 | 
133 | At training time, the user must specify all options regarding which factors are used.
134 | 
135 | *TODO*: To be continued, e.g. need to document continuous-script handling, combining marks, some more on numerals; also all model options and command-line arguments
136 | 
137 | ## Prerequisites
138 | 
139 | To build FactoredSegmenter, you will need to install the following dependencies:
140 | 
141 | #### Linux
142 | ```
143 | sudo apt-get install dotnet-sdk-3.1
144 | sudo apt-get install dotnet-runtime-3.1
145 | ```
146 | And you need to install SentencePiece [from source](https://github.com/google/sentencepiece#c-from-source). SentencePiece is accessed both via executing a binary and via direct invocation of the C++ library.
147 | 
148 | #### Windows
149 | ```
150 | https://dotnet.microsoft.com/download/dotnet-core/thank-you/sdk-3.1.101-windows-x64-installer
151 | ```
152 | And SentencePiece. In the Windows version, SentencePiece is presently only invoked via the SentencePiece command-line tools. It has not been tested whether the [vcpkg installation](https://github.com/google/sentencepiece#installation) works.
153 | 
154 | ## How to build
155 | 
156 | #### Linux
157 | ```
158 | cd REPO/src
159 | dotnet publish -c Release -r linux-x64 -f netcoreapp3.1 /p:PublishSingleFile=true /p:PublishTrimmed=true \
160 |   ../factored-segmenter.csproj
161 | # now you can run the binary at REPO/src/bin/Release/netcoreapp3.1/linux-x64/publish/factored-segmenter
162 | ```
163 | 
164 | #### Windows
165 | Open `src` folder in Visual Studio 2017 or later. With 2017, it will complain that it cannot build the 3.1 SDK. F5 debugging still works (using 2.1), but you may need to hit F5 twice.
166 | 
167 | ## Example command lines
168 | 
169 | ### Encoding
170 | ```
171 | pigz -d -c /data1/SpeechTrans/ENU-DEU_Student.speech/normalize_src_training_sentences/sentenceonly.src.normalized.ENU.snt.gz \
172 |   | time   parallelized   env LC_ALL=en_US.UTF-8 \
173 |     ~/factored-segmenter/src/bin/Release/netcoreapp3.1/linux-x64/publish/factored-segmenter encode  --model ~/factored-segmenter/enu.deu.generalnn.joint.segmenter.fsm \
174 |   | pigz -c --best \
175 |   > /data1/SpeechTrans/Data/2019-12-ENU-DEU_Student/TN/TrainSingleSent/normalized.ENU.snt.fs.gz
176 | ```
177 | ### Training
178 | ```
179 | time   env LC_ALL=en_US.UTF-8 \
180 |   ~/factored-segmenter/src/bin/Release/netcoreapp3.1/linux-x64/publish/factored-segmenter train \
181 |     --model ~/factored-segmenter/out/enu.deu.generalnn.joint.segmenter.fsm \
182 |     --distinguish-initial-and-internal-pieces  --single-letter-case-factors  --serialize-indices-and-unrepresentables  --inline-fixes \
183 |     --min-piece-count 38  --min-char-count 2  --vocab-size 32000 \
184 |     /data1/SpeechTrans/ENU-DEU_Student.speech/train_segmenter.ENU.DEU.generalnn.joint/corpus.sampled
185 | ```
186 | # Contributing
187 | 
188 | This project welcomes contributions and suggestions.  Most contributions require you to agree to a
189 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
190 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
191 | 
192 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
193 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
194 | provided by the bot. You will only need to do this once across all repos using our CLA.
195 | 
196 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
197 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
198 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
199 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | <!-- BEGIN MICROSOFT SECURITY.MD V0.0.5 BLOCK -->
 2 | 
 3 | ## Security
 4 | 
 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
 6 | 
 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below.
 8 | 
 9 | ## Reporting Security Issues
10 | 
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 | 
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report).
14 | 
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc).
16 | 
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 
18 | 
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 | 
21 |   * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 |   * Full paths of source file(s) related to the manifestation of the issue
23 |   * The location of the affected source code (tag/branch/commit or direct URL)
24 |   * Any special configuration required to reproduce the issue
25 |   * Step-by-step instructions to reproduce the issue
26 |   * Proof-of-concept or exploit code (if possible)
27 |   * Impact of the issue, including how an attacker might exploit the issue
28 | 
29 | This information will help us triage your report more quickly.
30 | 
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs.
32 | 
33 | ## Preferred Languages
34 | 
35 | We prefer all communications to be in English.
36 | 
37 | ## Policy
38 | 
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).
40 | 
41 | <!-- END MICROSOFT SECURITY.MD BLOCK -->


--------------------------------------------------------------------------------
/cli/Program.cs:
--------------------------------------------------------------------------------
  1 | ﻿// Copyright (c) Microsoft Corporation.
  2 | // Licensed under the MIT license.
  3 | 
  4 | using Common.Collections.Extensions;
  5 | using Common.Utils;
  6 | using Microsoft.MT.Common.Tokenization;
  7 | using Microsoft.MT.Segmentation;
  8 | using System;
  9 | using System.Collections.Generic;
 10 | using System.IO;
 11 | using System.Linq;
 12 | using System.Text;
 13 | using System.Text.RegularExpressions;
 14 | 
 15 | namespace factored_segmenter
 16 | {
 17 |     class Program
 18 |     {
 19 |         /// <summary>
 20 |         /// Command-line format:
 21 |         ///   factored-segmenter train|encode|decode [--option]* [input file|-]
 22 |         /// </summary>
 23 |         static void Main(string[] args)
 24 |         {
 25 |             var (GetAndConsumeArg, GetArg) = IterateArgs(args);
 26 |             var action = GetAndConsumeArg();
 27 |             if (action != "train" && action != "encode" && action != "decode" && action != "runtests")
 28 |                 BadArgument("The first argument must be 'train', 'encode', 'decode', or 'runtests'");
 29 | 
 30 |             // parse options
 31 |             string dataOutPath = "-";
 32 |             string modelPath = null;
 33 |             string vocabOutputPath = null;
 34 |             string fieldSeparator = null;
 35 |             bool quiet = false;
 36 |             FactoredSegmenterModelTrainConfig newModelConfig = new FactoredSegmenterModelTrainConfig();
 37 |             while (GetArg() != null && ((GetArg().StartsWith("-") && GetArg().Length > 1) || GetArg().StartsWith("--")))  // --option, -o, and --
 38 |             {
 39 |                 bool GetBoolArg() // helper to parse bool options have an optional "true" or "false" follow them
 40 |                     => GetArg() == null || (GetArg() != "true" && GetArg() != "false") || GetAndConsumeArg() == "true";
 41 |                 var option = GetAndConsumeArg();
 42 |                 // common args
 43 |                 if ((option == "-o" || option == "--output") && action != "train") // output stream for encode and decode
 44 |                     dataOutPath = GetAndConsumeArg();
 45 |                 else if (option == "-m" || option == "--model") // model path: output for train, input for encode/decode
 46 |                     modelPath = GetAndConsumeArg();
 47 |                 else if ((option == "-v" || option == "--marian-vocab") && action == "train")
 48 |                     vocabOutputPath = GetAndConsumeArg();
 49 |                 else if (option == "--quiet") // avoid unnecessary logging
 50 |                     quiet = GetBoolArg();
 51 |                 else if (option == "-F") // field separator, e.g. set to "\t" to process TSV format
 52 |                     fieldSeparator = Regex.Unescape(GetAndConsumeArg()); // unescape so that we can pass \t
 53 |                 // new-model args
 54 |                 else if (option == "--right-word-glue")
 55 |                     newModelConfig.ModelOptions.RightWordGlue = GetBoolArg();
 56 |                 else if (option == "--distinguish-initial-and-internal-pieces")
 57 |                     newModelConfig.ModelOptions.DistinguishInitialAndInternalPieces = GetBoolArg();
 58 |                 else if (option == "--split-han")
 59 |                     newModelConfig.ModelOptions.SplitHan = GetBoolArg();
 60 |                 else if (option == "--single-letter-case-factors")
 61 |                     newModelConfig.ModelOptions.SingleLetterCaseFactors = GetBoolArg();
 62 |                 else if (option == "--serialize-indices-and-unrepresentables")
 63 |                     newModelConfig.ModelOptions.SerializeIndicesAndUnrepresentables = GetBoolArg();
 64 |                 else if (option == "--inline-fixes")
 65 |                     newModelConfig.ModelOptions.InlineFixes = GetBoolArg();
 66 |                 else if (option == "--inline-fix-use-tags")
 67 |                     newModelConfig.ModelOptions.InlineFixUseTags = GetBoolArg();
 68 |                 else if (option == "--no-sentence-piece")
 69 |                     newModelConfig.SentencePieceTrainingConfig = null;
 70 |                 // training args
 71 |                 else if (option == "--vocab-size" && action == "train")
 72 |                     newModelConfig.SentencePieceTrainingConfig.VocabSize = int.Parse(GetAndConsumeArg());
 73 |                 else if (option == "--character_coverage" && action == "train")
 74 |                     newModelConfig.SentencePieceTrainingConfig.CharacterCoverage = double.Parse(GetAndConsumeArg());
 75 |                 else if (option == "--training-sentence-size" && action == "train")
 76 |                     newModelConfig.TrainingSentenceSize = int.Parse(GetAndConsumeArg());
 77 |                 else if (option == "--min-piece-count" && action == "train")
 78 |                     newModelConfig.MinPieceCount = int.Parse(GetAndConsumeArg());
 79 |                 else if (option == "--min-char-count" && action == "train")
 80 |                     newModelConfig.MinCharCount = int.Parse(GetAndConsumeArg());
 81 |                 // other
 82 |                 else if (option == "--") // -- ends option processing
 83 |                     break;
 84 |                 else
 85 |                     BadArgument($"Unknown option {option}");
 86 |             }
 87 | 
 88 |             // parse remaining arguments (one or more input files)
 89 |             var inputPaths = new List<string>();
 90 |             while (GetArg() != null)
 91 |                 inputPaths.Add(GetAndConsumeArg());
 92 |             if (!inputPaths.Any()) // none given: read from stdin
 93 |                 inputPaths.Add("-");
 94 | 
 95 |             // open all input files
 96 |             var streams = from inputPath in inputPaths
 97 |                           select inputPath != "-" ?
 98 |                               new StreamReader(inputPath, encoding: Encoding.UTF8, detectEncodingFromByteOrderMarks: true, bufferSize: 1000000) :
 99 |                               Console.In;
100 | 
101 |             if (action == "train")
102 |             {
103 |                 if (!quiet)
104 |                     Log($"Creating model {modelPath} from input file(s) {" ".JoinItems(inputPaths)} ...");
105 |                 if (!modelPath.EndsWith(".fsm")) // @TODO: do this inside Train() where we create the temp pathnames
106 |                     BadArgument($"Extension .fsm is required for model path {modelPath}");
107 |                 var lines = from stream in streams
108 |                             from line in stream.ReadLines()
109 |                             select line;
110 |                 CreateDirectoryFor(modelPath); // @TODO: do this inside Train()
111 |                 var model = FactoredSegmenterModel.Train(newModelConfig, lines, sourceSentenceAnnotations: null, fsmModelPath: modelPath, spmBinDir: SentencePieceManaged.SpmBinaryDirPath);
112 | 
113 |                 // save the model
114 |                 // The SentencePiece model is embedded in 'model'; it is not a separate file.
115 |                 model.Save(modelPath);
116 |                 if (!quiet)
117 |                     Log($"Model file written to {modelPath}");
118 | 
119 |                 // save the vocab for Marian consumption
120 |                 if (model.FactorSpec != null && vocabOutputPath != null)
121 |                 {
122 |                     File.WriteAllLines(vocabOutputPath, model.FactorSpec, new UTF8Encoding(encoderShouldEmitUTF8Identifier: false));
123 |                     if (!quiet)
124 |                         Log($"Marian vocabulary file written to {vocabOutputPath}");
125 |                 }
126 |             }
127 |             else if (action == "encode" || action == "decode")
128 |             {
129 |                 if (!quiet)
130 |                     Log($"Processing input file(s) {" ".JoinItems(inputPaths)} with model {modelPath} ...");
131 |                 var lines = from stream in streams.ToList()  // ToList() eagerly opens all streams, to test upfront if all files are found
132 |                             from line in stream.ReadLines()
133 |                             select line;
134 |                 newModelConfig.ModelOptions.UseSentencePiece = false;
135 |                 var coderConfig = modelPath != null ?
136 |                     new FactoredSegmenterCoderConfig
137 |                     {
138 |                         ModelPath = modelPath
139 |                     } :
140 |                     new FactoredSegmenterCoderConfig  // no model specified: use untrained virgin model (without SentencePiece)
141 |                     {
142 |                         Model = new FactoredSegmenterModel(newModelConfig.ModelOptions)
143 |                     };
144 |                 var coder = new FactoredSegmenterCoder(coderConfig);
145 | 
146 |                 // write loop
147 |                 if (!quiet)
148 |                     Log($"Writing processed lines to {dataOutPath} ...");
149 |                 CreateDirectoryFor(dataOutPath);
150 |                 var outStream = dataOutPath != "-" ?  // open output stream (UTF-8 without BOM)
151 |                     new StreamWriter(dataOutPath, append: false, encoding: new UTF8Encoding(encoderShouldEmitUTF8Identifier: false), bufferSize: 1000000) :
152 |                     Console.Out;
153 |                 var linesProcessed = 0;
154 |                 string ProcessLine(string line)
155 |                 {
156 |                     try
157 |                     {
158 |                         return action == "encode" ?
159 |                                " ".JoinItems(coder.Encode(line).TokenStrings) :  // encode
160 |                                coder.Decode(line).ToString();                    // decode
161 |                     }
162 |                     catch (Exception e)
163 |                     {
164 |                         Log($"Failed to {action} input: {line}");
165 |                         Log($"Exception: {e.ToString()}");
166 |                         return "";  // back off to empty string, so that we can continue
167 |                     }
168 |                 }
169 |                 foreach (var line in lines)
170 |                 {
171 |                     string processedLine = fieldSeparator == null ?
172 |                         processedLine = ProcessLine(line) :
173 |                         processedLine = fieldSeparator.JoinItems(from field in line.Split(fieldSeparator) select ProcessLine(field));
174 |                     //Log($"{command} IN: {line} --> OUT: {processedLine}");
175 |                     outStream.WriteLine(processedLine);
176 |                     // @BUGBUG: Write errors are not caught, at least when writing to a pipe via stdout.
177 |                     linesProcessed++;
178 |                     if (!quiet && linesProcessed % 1000000 == 0)
179 |                         Log($"Completed processing of {linesProcessed:#,##0} lines so far.");
180 |                 }
181 |                 if (!quiet)
182 |                     Log($"Completed processing of {linesProcessed:#,##0} lines.");
183 | 
184 |                 outStream.Flush(); // hoping to elicit an exception in case flushing fails
185 |                 outStream.Close();
186 |             }
187 |             // @TODO: disabled for now since the tests don't build under Linux
188 |             //else if (action == "runtests")
189 |             //{
190 |             //    // This is for easier testing when debugging environment does not support tests.
191 |             //    // This must be manually maintained.
192 |             //    var tests = new TextSegmentation.Segmenter.FactoredSegmenter_GitSubmodule.src.Test.FactoredSegmenterTests();
193 |             //    tests.ReversibilityAndBasicBreakingTests();
194 |             //    tests.DecodeIntoConsecutiveSegmentsTest();
195 |             //    //tests.ReversibilityAndBasicBreakingTestsOnNaughtyData();  // fails in standalone build because data file is our other repo
196 |             //    tests.RunTraining();
197 |             //    var tests1 = new TextSegmentation.Segmenter.FactoredSegmenter_GitSubmodule.src.Test.FactoredSegmenterScriptHelperTests();
198 |             //    tests1.ScriptEdgeCasesTest();
199 |             //    tests1.ClassificationEdgeCaseTests();
200 |             //}
201 |         }
202 | 
203 |         static void Log(string what) => Logger.WriteLine(string.Format("{0:yyyy/MM/dd HH:mm:ss.fff} factored-segmenter: ", DateTime.Now) + what);
204 | 
205 |         static void BadArgument(string what)
206 |         {
207 |             Log(what);
208 |             Environment.Exit(1);
209 |         }
210 | 
211 |         static (Func<string> GetAndConsumeArg, Func<string> GetArg) IterateArgs(string[] args)
212 |         {
213 |             var e = args.GetEnumerator();
214 |             var b = e.MoveNext();
215 |             return (GetAndConsumeArg: () =>
216 |                     {
217 |                         if (!b)
218 |                             BadArgument("At least one more argument was expected.");
219 |                         var res = e.Current as string;
220 |                         b = e.MoveNext(); // b is boxed, so this persists across calls
221 |                         return res;
222 |                     },
223 |                     GetArg: () => b ? e.Current as string : null);
224 |         }
225 | 
226 |         static void CreateDirectoryFor(string filePath)
227 |         {
228 |             if (filePath == "-")
229 |                 return;
230 |             var dirName = Path.GetDirectoryName(filePath);
231 |             if (dirName != "")
232 |                 Directory.CreateDirectory(dirName);
233 |         }
234 |     }
235 | }
236 | 


--------------------------------------------------------------------------------
/cli/README.txt:
--------------------------------------------------------------------------------
1 | This directory contains the source files to implement the command-line wrapper
2 | around FactoredSegmenter. They are only used in the standalone build (i.e.
3 | FactoredSegmenter as a standalone command-line tool).
4 | 


--------------------------------------------------------------------------------
/factored-segmenter.csproj:
--------------------------------------------------------------------------------
 1 | <Project Sdk="Microsoft.NET.Sdk">
 2 | 
 3 |   <PropertyGroup>
 4 |     <OutputType>Exe</OutputType>
 5 |     <TargetFramework>netcoreapp3.1</TargetFramework>
 6 |     <RootNamespace>factored_segmenter</RootNamespace>
 7 |   </PropertyGroup>
 8 |   <ItemGroup>
 9 |     <Content Remove="test\**" />
10 |     <Compile Remove="test\**" />
11 |     <EmbeddedResource Remove="test\**" />
12 |     <None Remove="test\**" />
13 |   </ItemGroup>
14 | 
15 | </Project>
16 | 


--------------------------------------------------------------------------------
/factored-segmenter.sln:
--------------------------------------------------------------------------------
 1 | ﻿
 2 | Microsoft Visual Studio Solution File, Format Version 12.00
 3 | # Visual Studio 15
 4 | VisualStudioVersion = 15.0.28307.852
 5 | MinimumVisualStudioVersion = 10.0.40219.1
 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "factored-segmenter", "factored-segmenter.csproj", "{ADC50319-8AB7-415A-A501-A65073B281E4}"
 7 | EndProject
 8 | Global
 9 | 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | 		Debug|Any CPU = Debug|Any CPU
11 | 		Release|Any CPU = Release|Any CPU
12 | 	EndGlobalSection
13 | 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | 		{ADC50319-8AB7-415A-A501-A65073B281E4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15 | 		{ADC50319-8AB7-415A-A501-A65073B281E4}.Debug|Any CPU.Build.0 = Debug|Any CPU
16 | 		{ADC50319-8AB7-415A-A501-A65073B281E4}.Release|Any CPU.ActiveCfg = Release|Any CPU
17 | 		{ADC50319-8AB7-415A-A501-A65073B281E4}.Release|Any CPU.Build.0 = Release|Any CPU
18 | 	EndGlobalSection
19 | 	GlobalSection(SolutionProperties) = preSolution
20 | 		HideSolutionNode = FALSE
21 | 	EndGlobalSection
22 | 	GlobalSection(ExtensibilityGlobals) = postSolution
23 | 		SolutionGuid = {B3B448B5-6719-445D-B09A-A781B80286B9}
24 | 	EndGlobalSection
25 | EndGlobal
26 | 


--------------------------------------------------------------------------------
/lib/README.txt:
--------------------------------------------------------------------------------
 1 | This directory contains support libraries that are used by the main
 2 | FactoredSegmenter sources in the src/ directory.
 3 | 
 4 | The standalone command-line tool build uses these libraries here.
 5 | 
 6 | The production build uses a different version of this library, which is
 7 | included in our production environment, and is proprietary. The files in this
 8 | directory contain a subset of those production libraries that implements only
 9 | those classes and methods that are used by the standalone build, sometimes in
10 | greatly simplified versions.
11 | 


--------------------------------------------------------------------------------
/lib/RemoveThese.cs:
--------------------------------------------------------------------------------
 1 | // Copyright (c) Microsoft Corporation.
 2 | // Licensed under the MIT license.
 3 | 
 4 | // This file contains stubs to things that are referenced but not actually needed.
 5 | // These should be fixed in the shared code, e.g. by moving out the parts that
 6 | // reference these into a separate source file.
 7 | 
 8 | using Common.MT.Segments;
 9 | using System.Collections.Generic;
10 | 
11 | // @TODO: find out how to install this correctly in dotnet 2.x and 3.x. Then delete this.
12 | namespace Microsoft.VisualStudio.TestTools.UnitTesting
13 | {
14 |     public static class Assert
15 |     {
16 |         public static void IsTrue(bool condition) { System.Diagnostics.Debug.Assert(condition); }
17 |         public static void IsFalse(bool condition) { IsTrue(!condition); }
18 |         public static void AreEqual<T>(T a, T b) { IsTrue(a.Equals(b)); } // @TODO: correct?
19 |     }
20 | }
21 | namespace Microsoft.VisualStudio.TestTools.UnitTesting
22 | {
23 |     using System;
24 | 
25 |     [AttributeUsage(AttributeTargets.Class, AllowMultiple = false)]
26 |     public sealed class TestClassAttribute : System.Attribute
27 |     {
28 |         public TestClassAttribute() { }
29 |     }
30 |     [AttributeUsage(AttributeTargets.Assembly | AttributeTargets.Class | AttributeTargets.Struct | AttributeTargets.Constructor | AttributeTargets.Method | AttributeTargets.Property | AttributeTargets.Event, Inherited = false, AllowMultiple = false)]
31 |     public sealed class TestMethodAttribute : Attribute
32 |     {
33 |         public TestMethodAttribute() { }
34 |     }
35 |     [AttributeUsage(AttributeTargets.Class | AttributeTargets.Method, AllowMultiple = true)]
36 |     public sealed class DeploymentItemAttribute : Attribute
37 |     {
38 |         public DeploymentItemAttribute(string path, string outputDirectory) { }
39 |     }
40 | }
41 | 


--------------------------------------------------------------------------------
/lib/SentencePieceInterop.cs:
--------------------------------------------------------------------------------
  1 | ﻿// Copyright (c) Microsoft Corporation.
  2 | // Licensed under the MIT license.
  3 | 
  4 | // Wrapper around the SentencePiece runtime library.
  5 | // This is currently emulated by a process-based interface,
  6 | // until a real P/invoke implementation is completed.
  7 | 
  8 | using System;
  9 | using System.IO;
 10 | using System.Text;
 11 | using System.Collections.Concurrent;
 12 | using static Common.Utils.ProcessTool;
 13 | using Common.Contracts;
 14 | using Common.Collections.Extensions;
 15 | using System.Linq;
 16 | using Common.Utils;
 17 | using System.Collections.Generic;
 18 | using System.Runtime.InteropServices;
 19 | 
 20 | namespace Microsoft.MT.Segmentation
 21 | {
 22 |     public class SentencePieceManaged // : IDisposable
 23 |     {
 24 |         static readonly string spmBinaryDirPathLinux = "/usr/local/bin/";
 25 |         static readonly string spmBinaryDirPathWindows = @"c:\work\mtmain\target\Retail\amd64\Tokenization\";
 26 | 
 27 |         public static string SpmBinaryDirPath =>
 28 |             RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? spmBinaryDirPathWindows : spmBinaryDirPathLinux;
 29 | 
 30 |         HashSet<string> m_vocabulary;
 31 |         readonly string m_tempModelPath;
 32 |         readonly string m_tempVocabPath;
 33 |         readonly ConcurrentQueue<ProcessPipe> m_serverPool;
 34 |         public SentencePieceManaged(string loadMe, string[] vocabulary)
 35 |         {
 36 |             m_tempModelPath = Path.GetTempFileName();
 37 |             m_tempVocabPath = Path.GetTempFileName();
 38 |             m_serverPool = new ConcurrentQueue<ProcessPipe>(); // pool of SPM helper processes. We need multiple if running multi-threaded.
 39 |             m_vocabulary = vocabulary?.ToHashSet();
 40 |             // save to file during the lifetime of this object
 41 |             File.WriteAllBytes(m_tempModelPath, File.ReadAllBytes(loadMe));
 42 |             if (vocabulary != null)
 43 |                 File.WriteAllLines(m_tempVocabPath, vocabulary, encoding: new UTF8Encoding(encoderShouldEmitUTF8Identifier: false));
 44 |         }
 45 | 
 46 |         // @TODO: get destruction right, delete the temp files
 47 |         //public sealed override void Dispose()
 48 |         //{
 49 |         //    //Dispose(true);
 50 |         //    GC.SuppressFinalize(this);
 51 |         //}
 52 |         //[HandleProcessCorruptedStateExceptions]
 53 |         //protected virtual void Dispose(bool A_0) { }
 54 | 
 55 |         // This is the only interface into SPM used by FactoredSegmenter.
 56 |         // It determines the split points where SPM would split.
 57 |         // @TODO: change return type to IList type, which will save one operation in this build, while costing nothing in MTMAIN
 58 |         public int[] GetSplitPoints(string segmentMe)
 59 |         {
 60 |             if (segmentMe.Length <= 1) // nothing to split. This includes space, which is SPM's break symbol, and should not be sent.
 61 |                 return null;
 62 |             // obtain a server process if available, or create a new one if all are in use
 63 |             if (!m_serverPool.TryDequeue(out var processPipe))
 64 |             {
 65 |                 var argv = new List<string> { SpmBinaryDirPath + "spm_encode", "--model", m_tempModelPath };
 66 |                 if (m_vocabulary != null)
 67 |                     argv.AddRange(new List<string> { "--vocabulary", m_tempVocabPath });
 68 |                 Logger.WriteLine($"starting SentencePiece instance as: {" ".JoinItems(argv)}");
 69 |                 processPipe = new ProcessPipe(argv, envirVariables: new Dictionary<string, string> { { "LC_ALL", "en_US.UTF-8" } });
 70 |                 // @TODO: do we need the environment variable for spm_encode?
 71 |             }
 72 |             //Logger.WriteLine($"SPM-encoding word {segmentMe}");
 73 |             processPipe.process.StandardInput.WriteLine(segmentMe); // @TODO: how do we know/ensure this is UTF-8?
 74 |             var encodedWord = processPipe.process.StandardOutput.ReadLine();
 75 |             Sanity.Requires(encodedWord != null, "spm_encode unexpectedly terminated");
 76 |             // return the process back into the pool
 77 |             m_serverPool.Enqueue(processPipe);
 78 | 
 79 |             var pieces = encodedWord.Split(' ', options: StringSplitOptions.RemoveEmptyEntries);
 80 |             if ("".JoinItems(pieces) != segmentMe)
 81 |             {
 82 |                 Logger.WriteLine($"ignoring word: SentencePiece did not just split the word ('{segmentMe}', -> '{" ".JoinItems(pieces)}')");
 83 |                 return null;
 84 |             }
 85 | 
 86 |             // create array of segmentation points
 87 |             // E.g. if "abcde" got broken into "ab cde", then we return the split points (0, 2, 5).
 88 |             // This code handles the special case of OOV pieces.
 89 |             // E.g. if there is no '+' in the SentencePiece vocab, then spm_encode will keep
 90 |             // it as '++++'. We must break those up into individual pieces.
 91 |             List<int> res = null; // (created lazily)
 92 |             int n = 0; // accumulator for split points
 93 |             for (int i = 0; i < pieces.Length; i++)
 94 |             {
 95 |                 var piece = pieces[i];
 96 |                 if (m_vocabulary == null || m_vocabulary.Contains(piece))
 97 |                 {
 98 |                     n += piece.Length;
 99 |                     if (n < segmentMe.Length || res != null) // (in the frequent special case of an unbroken single token, we return null for efficiency)
100 |                     {
101 |                         if (res == null)
102 |                             res = new List<int> { 0, n };
103 |                         else
104 |                             res.Add(n);
105 |                     }
106 |                 }
107 |                 else // special case: OOV. Break at each character.
108 |                     for (int j = 0; j < piece.Length; /*j += n*/)
109 |                     {
110 |                         // length of this piece is 1 Unicode character. Surrogate pairs are 2 characters in C#'s UCS-2 encoding.
111 |                         var ucs2Len = (char.IsHighSurrogate(piece[j]) && j + 2 <= piece.Length) ? 2 : 1;
112 |                         n += ucs2Len;
113 |                         j += ucs2Len;
114 |                         if (n < segmentMe.Length || res != null)
115 |                         {
116 |                             if (res == null)
117 |                                 res = new List<int> { 0, n };
118 |                             else
119 |                                 res.Add(n);
120 |                         }
121 |                     }
122 |             }
123 |             return res?.ToArray();
124 |         }
125 |         public string[] Segment(string segmentMe) { throw new NotImplementedException("Segment() not implemented in this build."); }
126 |         public string Unsegment(string[] unsegmentMe) { throw new NotImplementedException("Unsegment() not implemented in this build."); }
127 | 
128 |         //public static bool IsHighSurrogate(char c) { return true; }
129 |     }
130 | }
131 | 


--------------------------------------------------------------------------------
/lib/TextHelpers.cs:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Microsoft Corporation.
  2 | // Licensed under the MIT license.
  3 | 
  4 | using System;
  5 | using System.Collections.Generic;
  6 | using System.Globalization;
  7 | using System.Linq;
  8 | 
  9 | namespace Common.Text
 10 | {
 11 |     /// <summary>Helper functions for Unicode</summary>
 12 |     public static class Unicode
 13 |     {
 14 |         /// <summary>
 15 |         /// Translate the UnicodeCategory into the two-letter Unicode-designation representation
 16 |         /// </summary>
 17 |         public static string GetUnicodeDesignation(this char c)
 18 |         {
 19 |             // derived from UnicodeCategory enum, which has these strings in the comment
 20 |             switch (CharUnicodeInfo.GetUnicodeCategory(c))
 21 |             {
 22 |                 case UnicodeCategory.UppercaseLetter:         return "Lu"; // (letter, uppercase)
 23 |                 case UnicodeCategory.LowercaseLetter:         return "Ll"; // (letter, lowercase)
 24 |                 case UnicodeCategory.TitlecaseLetter:         return "Lt"; // (letter, titlecase)
 25 |                 case UnicodeCategory.ModifierLetter:          return "Lm"; // (letter, modifier)
 26 |                 case UnicodeCategory.OtherLetter:             return "Lo"; // (letter, other)
 27 |                 case UnicodeCategory.NonSpacingMark:          return "Mn"; // (mark, nonspacing) combined with another and so not consuming additional horizontal space
 28 |                 case UnicodeCategory.SpacingCombiningMark:    return "Mc"; // (mark, spacing combining)
 29 |                 case UnicodeCategory.EnclosingMark:           return "Me"; // (mark, enclosing)
 30 |                 case UnicodeCategory.DecimalDigitNumber:      return "Nd"; // (number, decimal digit)
 31 |                 case UnicodeCategory.LetterNumber:            return "Nl"; // (number, letter)
 32 |                 case UnicodeCategory.OtherNumber:             return "No"; // (number other)
 33 |                 case UnicodeCategory.SpaceSeparator:          return "Zs"; // (separator, space)
 34 |                 case UnicodeCategory.LineSeparator:           return "Zl"; // (separator, line)
 35 |                 case UnicodeCategory.ParagraphSeparator:      return "Zp"; // (separator, paragraph)
 36 |                 case UnicodeCategory.Control:                 return "Cc"; // (other, control)
 37 |                 case UnicodeCategory.Format:                  return "Cf"; // (other format)
 38 |                 case UnicodeCategory.Surrogate:               return "Cs"; // (other surrogate)
 39 |                 case UnicodeCategory.PrivateUse:              return "Co"; // (other, private use)
 40 |                 case UnicodeCategory.ConnectorPunctuation:    return "Pc"; // (punctuation, connector)
 41 |                 case UnicodeCategory.DashPunctuation:         return "Pd"; // (punctuation dash)
 42 |                 case UnicodeCategory.OpenPunctuation:         return "Ps"; // (punctuation open)
 43 |                 case UnicodeCategory.ClosePunctuation:        return "Pe"; // (punctuation close)
 44 |                 case UnicodeCategory.InitialQuotePunctuation: return "Pi"; // (punctuation, initial quote)
 45 |                 case UnicodeCategory.FinalQuotePunctuation:   return "Pf"; // (punctuation, final quote)
 46 |                 case UnicodeCategory.OtherPunctuation:        return "Po"; // (punctuation, other)
 47 |                 case UnicodeCategory.MathSymbol:              return "Sm"; // (symbol, math)
 48 |                 case UnicodeCategory.CurrencySymbol:          return "Sc"; // (symbol currency)
 49 |                 case UnicodeCategory.ModifierSymbol:          return "Sk"; // (symbol, modifier)
 50 |                 case UnicodeCategory.OtherSymbol:             return "So"; // (symbol, other)
 51 |                 case UnicodeCategory.OtherNotAssigned:        return "Cn"; // (other, not assigned)
 52 |                 default: throw new ArgumentOutOfRangeException();
 53 |             }
 54 |         }
 55 | 
 56 |         /// <summary>
 57 |         /// Translate the UnicodeCategory into the one-letter major Unicode-designation representation
 58 |         /// @TODO: Find a way to handle surrogate pairs
 59 |         /// </summary>
 60 |         public static char GetUnicodeMajorDesignation(this char c) => GetUnicodeDesignation(c)[0];
 61 | 
 62 |         /// <summary>
 63 |         /// Test whether a script is continuous (not written with spaces).
 64 |         /// This informs FactoredSegmenter which factor to use for word/segment boundaries,
 65 |         /// which affects which rules the system learns regarding inserting spaces.
 66 |         /// </summary>
 67 |         public static bool IsContinuousScript(this char c)
 68 |         {
 69 |             var script = GetScript(c);
 70 |             return script == Script.Han ||
 71 |                    script == Script.Hiragana || script == Script.Katakana ||
 72 |                    script == Script.Thai;
 73 |         }
 74 | 
 75 |         /// <summary>Names of Unicode Scripts.  Scripts are set of chars like Arabic, Latin, Cyrillic, etc</summary>
 76 |         [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Design", "CA1034:NestedTypesShouldNotBeVisible"), System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Design", "CA1028:EnumStorageShouldBeInt32", Justification = "Use Byte for Enum to optimize storage of 65Kb array")]
 77 |         public enum Script : byte
 78 |         {
 79 |             None,      // not a script (i.e. "null" script value)
 80 |             Arabic,
 81 |             Common,    // commonly used in more than one script
 82 |             Cyrillic,
 83 |             Devanagari,
 84 |             Greek,
 85 |             Han,
 86 |             Hangul,
 87 |             Hebrew,
 88 |             Hiragana,
 89 |             Inherited, // considered to have the same script as that of the preceding character
 90 |             Katakana,
 91 |             Latin,
 92 |             Thai,
 93 |             Unknown    // valid Unicode codepoint with unknown script (per Unicode spec)
 94 |         }
 95 |         /// <summary>
 96 |         /// Returns the <see cref="Script"/> value for the given Unicode code point
 97 |         /// </summary>
 98 |         /// <param name="chUtf32">The Unicode code point of the desired character</param>
 99 |         /// <returns>The <see cref="Script"/> value for the given Unicode code point</returns>
100 |         private static Script GetValue(int chUtf32)
101 |         {
102 |             if (chUtf32 < s_ScriptByChar.Length)
103 |                 return s_ScriptByChar[chUtf32];
104 |             else
105 |                 return Script.Unknown;
106 |         }
107 |         /// <summary>Returns a script of a given character</summary>
108 |         /// <param name="value">The character to check</param>
109 |         /// <returns>The <see cref="Script"/> value for the given character</returns>
110 |         public static Script GetScript(char value)
111 |         {
112 |             // @BUGBUG: This interface is flawed. We must handle surrogate pairs correctly.
113 |             return !char.IsSurrogate(value) ?
114 |                 GetValue(value) : Script.None;
115 |         }
116 | 
117 |         private static Script[] s_ScriptByChar; // [unicode code point] -> Script
118 |         static Unicode()
119 |         {
120 |             // initialize the static script-mapping table
121 |             // process for getting this (only doing this once, so not writing a script for it):
122 |             //  - run MTMAIN Common.Text.Unicode.Scripts.Create() and get result of GetScriptRanges().
123 |             //  - replace Name= by Name=Script., replace DynamicValueStart by a number
124 |             //  - format as a 3-column table, e.g. "0	31	Script.Common"
125 |             //  - write to a file
126 |             //  - delete all numeric scripts, collate consecutive ranges
127 |             //    grep -v 'Script.[0-9]' d:\me\x1 | sort -n | gawk "{if ($1==P2+1 && P3==$3) {P2=$2} else {print P1, P2, P3; P1=$1;P2=$2;P3=$3}}END{print P1, P2, P3}" | grep Script | gawk '{if (NR%4 == 0) {print ""}; printf("(%i,%i,%s), ", $1, $2, $3)}' | clip
128 |             var ranges = new (int min, int max, Script script)[]
129 |             {
130 |                 (0,64,Script.Common), (65,90,Script.Latin), (91,96,Script.Common),
131 |                 (97,122,Script.Latin), (123,169,Script.Common), (170,170,Script.Latin), (171,185,Script.Common),
132 |                 (186,186,Script.Latin), (187,191,Script.Common), (192,214,Script.Latin), (215,215,Script.Common),
133 |                 (216,246,Script.Latin), (247,247,Script.Common), (248,696,Script.Latin), (697,735,Script.Common),
134 |                 (736,740,Script.Latin), (741,745,Script.Common), (748,767,Script.Common), (768,879,Script.Inherited),
135 |                 (880,883,Script.Greek), (884,884,Script.Common), (885,887,Script.Greek), (890,893,Script.Greek),
136 |                 (894,894,Script.Common), (900,900,Script.Greek), (901,901,Script.Common), (902,902,Script.Greek),
137 |                 (903,903,Script.Common), (904,906,Script.Greek), (908,908,Script.Greek), (910,929,Script.Greek),
138 |                 (931,993,Script.Greek), (1008,1023,Script.Greek), (1024,1156,Script.Cyrillic), (1157,1158,Script.Inherited),
139 |                 (1159,1319,Script.Cyrillic), (1417,1417,Script.Common), (1425,1479,Script.Hebrew), (1488,1514,Script.Hebrew),
140 |                 (1520,1524,Script.Hebrew), (1536,1540,Script.Arabic), (1542,1547,Script.Arabic), (1548,1548,Script.Common),
141 |                 (1549,1562,Script.Arabic), (1563,1563,Script.Common), (1566,1566,Script.Arabic), (1567,1567,Script.Common),
142 |                 (1568,1599,Script.Arabic), (1600,1600,Script.Common), (1601,1610,Script.Arabic), (1611,1621,Script.Inherited),
143 |                 (1622,1631,Script.Arabic), (1632,1641,Script.Common), (1642,1647,Script.Arabic), (1648,1648,Script.Inherited),
144 |                 (1649,1756,Script.Arabic), (1757,1757,Script.Common), (1758,1791,Script.Arabic), (1872,1919,Script.Arabic),
145 |                 (2208,2208,Script.Arabic), (2210,2220,Script.Arabic), (2276,2302,Script.Arabic), (2304,2384,Script.Devanagari),
146 |                 (2385,2386,Script.Inherited), (2387,2403,Script.Devanagari), (2404,2405,Script.Common), (2406,2423,Script.Devanagari),
147 |                 (2425,2431,Script.Devanagari), (3585,3642,Script.Thai), (3647,3647,Script.Common), (3648,3675,Script.Thai),
148 |                 (4053,4056,Script.Common), (4347,4347,Script.Common), (4352,4607,Script.Hangul), (5867,5869,Script.Common),
149 |                 (5941,5942,Script.Common), (6146,6147,Script.Common), (6149,6149,Script.Common), (7376,7378,Script.Inherited),
150 |                 (7379,7379,Script.Common), (7380,7392,Script.Inherited), (7393,7393,Script.Common), (7394,7400,Script.Inherited),
151 |                 (7401,7404,Script.Common), (7405,7405,Script.Inherited), (7406,7411,Script.Common), (7412,7412,Script.Inherited),
152 |                 (7413,7414,Script.Common), (7424,7461,Script.Latin), (7462,7466,Script.Greek), (7467,7467,Script.Cyrillic),
153 |                 (7468,7516,Script.Latin), (7517,7521,Script.Greek), (7522,7525,Script.Latin), (7526,7530,Script.Greek),
154 |                 (7531,7543,Script.Latin), (7544,7544,Script.Cyrillic), (7545,7614,Script.Latin), (7615,7615,Script.Greek),
155 |                 (7616,7654,Script.Inherited), (7676,7679,Script.Inherited), (7680,7935,Script.Latin), (7936,7957,Script.Greek),
156 |                 (7960,7965,Script.Greek), (7968,8005,Script.Greek), (8008,8013,Script.Greek), (8016,8023,Script.Greek),
157 |                 (8025,8025,Script.Greek), (8027,8027,Script.Greek), (8029,8029,Script.Greek), (8031,8061,Script.Greek),
158 |                 (8064,8116,Script.Greek), (8118,8132,Script.Greek), (8134,8147,Script.Greek), (8150,8155,Script.Greek),
159 |                 (8157,8175,Script.Greek), (8178,8180,Script.Greek), (8182,8190,Script.Greek), (8192,8203,Script.Common),
160 |                 (8204,8205,Script.Inherited), (8206,8292,Script.Common), (8298,8304,Script.Common), (8305,8305,Script.Latin),
161 |                 (8308,8318,Script.Common), (8319,8319,Script.Latin), (8320,8334,Script.Common), (8336,8348,Script.Latin),
162 |                 (8352,8378,Script.Common), (8400,8432,Script.Inherited), (8448,8485,Script.Common), (8486,8486,Script.Greek),
163 |                 (8487,8489,Script.Common), (8490,8491,Script.Latin), (8492,8497,Script.Common), (8498,8498,Script.Latin),
164 |                 (8499,8525,Script.Common), (8526,8526,Script.Latin), (8527,8543,Script.Common), (8544,8584,Script.Latin),
165 |                 (8585,8585,Script.Common), (8592,9203,Script.Common), (9216,9254,Script.Common), (9280,9290,Script.Common),
166 |                 (9312,9983,Script.Common), (9985,10239,Script.Common), (10496,11084,Script.Common), (11088,11097,Script.Common),
167 |                 (11360,11391,Script.Latin), (11744,11775,Script.Cyrillic), (11776,11835,Script.Common), (11904,11929,Script.Han),
168 |                 (11931,12019,Script.Han), (12032,12245,Script.Han), (12272,12283,Script.Common), (12288,12292,Script.Common),
169 |                 (12293,12293,Script.Han), (12294,12294,Script.Common), (12295,12295,Script.Han), (12296,12320,Script.Common),
170 |                 (12321,12329,Script.Han), (12330,12333,Script.Inherited), (12334,12335,Script.Hangul), (12336,12343,Script.Common),
171 |                 (12344,12347,Script.Han), (12348,12351,Script.Common), (12353,12438,Script.Hiragana), (12441,12442,Script.Inherited),
172 |                 (12443,12444,Script.Common), (12445,12447,Script.Hiragana), (12448,12448,Script.Common), (12449,12538,Script.Katakana),
173 |                 (12539,12540,Script.Common), (12541,12543,Script.Katakana), (12593,12686,Script.Hangul), (12688,12703,Script.Common),
174 |                 (12736,12771,Script.Common), (12784,12799,Script.Katakana), (12800,12830,Script.Hangul), (12832,12895,Script.Common),
175 |                 (12896,12926,Script.Hangul), (12927,13007,Script.Common), (13008,13054,Script.Katakana), (13056,13143,Script.Katakana),
176 |                 (13144,13311,Script.Common), (13312,19893,Script.Han), (19904,19967,Script.Common), (19968,40908,Script.Han),
177 |                 (42560,42647,Script.Cyrillic), (42655,42655,Script.Cyrillic), (42752,42785,Script.Common), (42786,42887,Script.Latin),
178 |                 (42888,42890,Script.Common), (42891,42894,Script.Latin), (42896,42899,Script.Latin), (42912,42922,Script.Latin),
179 |                 (43000,43007,Script.Latin), (43056,43065,Script.Common), (43232,43259,Script.Devanagari), (43360,43388,Script.Hangul),
180 |                 (44032,55203,Script.Hangul), (55216,55238,Script.Hangul), (55243,55291,Script.Hangul), (63744,64109,Script.Han),
181 |                 (64112,64217,Script.Han), (64256,64262,Script.Latin), (64285,64310,Script.Hebrew), (64312,64316,Script.Hebrew),
182 |                 (64318,64318,Script.Hebrew), (64320,64321,Script.Hebrew), (64323,64324,Script.Hebrew), (64326,64335,Script.Hebrew),
183 |                 (64336,64449,Script.Arabic), (64467,64829,Script.Arabic), (64830,64831,Script.Common), (64848,64911,Script.Arabic),
184 |                 (64914,64967,Script.Arabic), (65008,65020,Script.Arabic), (65021,65021,Script.Common), (65024,65039,Script.Inherited),
185 |                 (65040,65049,Script.Common), (65056,65062,Script.Inherited), (65072,65106,Script.Common), (65108,65126,Script.Common),
186 |                 (65128,65131,Script.Common), (65136,65140,Script.Arabic), (65142,65276,Script.Arabic), (65279,65279,Script.Common),
187 |                 (65281,65312,Script.Common), (65313,65338,Script.Latin), (65339,65344,Script.Common), (65345,65370,Script.Latin),
188 |                 (65371,65381,Script.Common), (65382,65391,Script.Katakana), (65392,65392,Script.Common), (65393,65437,Script.Katakana),
189 |                 (65438,65439,Script.Common), (65440,65470,Script.Hangul), (65474,65479,Script.Hangul), (65482,65487,Script.Hangul),
190 |                 (65490,65495,Script.Hangul), (65498,65500,Script.Hangul), (65504,65510,Script.Common), (65512,65518,Script.Common),
191 |                 (65529,65533,Script.Common), (65792,65794,Script.Common), (65799,65843,Script.Common), (65847,65855,Script.Common),
192 |                 (65856,65930,Script.Greek), (65936,65947,Script.Common), (66000,66044,Script.Common), (66045,66045,Script.Inherited),
193 |                 (69216,69246,Script.Arabic), (110592,110592,Script.Katakana), (110593,110593,Script.Hiragana), (118784,119029,Script.Common),
194 |                 (119040,119078,Script.Common), (119081,119142,Script.Common), (119143,119145,Script.Inherited), (119146,119162,Script.Common),
195 |                 (119163,119170,Script.Inherited), (119171,119172,Script.Common), (119173,119179,Script.Inherited), (119180,119209,Script.Common),
196 |                 (119210,119213,Script.Inherited), (119214,119261,Script.Common), (119296,119365,Script.Greek), (119552,119638,Script.Common),
197 |                 (119648,119665,Script.Common), (119808,119892,Script.Common), (119894,119964,Script.Common), (119966,119967,Script.Common),
198 |                 (119970,119970,Script.Common), (119973,119974,Script.Common), (119977,119980,Script.Common), (119982,119993,Script.Common),
199 |                 (119995,119995,Script.Common), (119997,120003,Script.Common), (120005,120069,Script.Common), (120071,120074,Script.Common),
200 |                 (120077,120084,Script.Common), (120086,120092,Script.Common), (120094,120121,Script.Common), (120123,120126,Script.Common),
201 |                 (120128,120132,Script.Common), (120134,120134,Script.Common), (120138,120144,Script.Common), (120146,120485,Script.Common),
202 |                 (120488,120779,Script.Common), (120782,120831,Script.Common), (126464,126467,Script.Arabic), (126469,126495,Script.Arabic),
203 |                 (126497,126498,Script.Arabic), (126500,126500,Script.Arabic), (126503,126503,Script.Arabic), (126505,126514,Script.Arabic),
204 |                 (126516,126519,Script.Arabic), (126521,126521,Script.Arabic), (126523,126523,Script.Arabic), (126530,126530,Script.Arabic),
205 |                 (126535,126535,Script.Arabic), (126537,126537,Script.Arabic), (126539,126539,Script.Arabic), (126541,126543,Script.Arabic),
206 |                 (126545,126546,Script.Arabic), (126548,126548,Script.Arabic), (126551,126551,Script.Arabic), (126553,126553,Script.Arabic),
207 |                 (126555,126555,Script.Arabic), (126557,126557,Script.Arabic), (126559,126559,Script.Arabic), (126561,126562,Script.Arabic),
208 |                 (126564,126564,Script.Arabic), (126567,126570,Script.Arabic), (126572,126578,Script.Arabic), (126580,126583,Script.Arabic),
209 |                 (126585,126588,Script.Arabic), (126590,126590,Script.Arabic), (126592,126601,Script.Arabic), (126603,126619,Script.Arabic),
210 |                 (126625,126627,Script.Arabic), (126629,126633,Script.Arabic), (126635,126651,Script.Arabic), (126704,126705,Script.Arabic),
211 |                 (126976,127019,Script.Common), (127024,127123,Script.Common), (127136,127150,Script.Common), (127153,127166,Script.Common),
212 |                 (127169,127183,Script.Common), (127185,127199,Script.Common), (127232,127242,Script.Common), (127248,127278,Script.Common),
213 |                 (127280,127339,Script.Common), (127344,127386,Script.Common), (127462,127487,Script.Common), (127488,127488,Script.Hiragana),
214 |                 (127489,127490,Script.Common), (127504,127546,Script.Common), (127552,127560,Script.Common), (127568,127569,Script.Common),
215 |                 (127744,127776,Script.Common), (127792,127797,Script.Common), (127799,127868,Script.Common), (127872,127891,Script.Common),
216 |                 (127904,127940,Script.Common), (127942,127946,Script.Common), (127968,127984,Script.Common), (128000,128062,Script.Common),
217 |                 (128064,128064,Script.Common), (128066,128247,Script.Common), (128249,128252,Script.Common), (128256,128317,Script.Common),
218 |                 (128320,128323,Script.Common), (128336,128359,Script.Common), (128507,128576,Script.Common), (128581,128591,Script.Common),
219 |                 (128640,128709,Script.Common), (128768,128883,Script.Common), (131072,173782,Script.Han), (173824,177972,Script.Han),
220 |                 (177984,178205,Script.Han), (194560,195101,Script.Han), (917505,917505,Script.Common), (917536,917631,Script.Common),
221 |                 (917760,917999,Script.Inherited)
222 |             };
223 |             s_ScriptByChar = Enumerable.Repeat(Script.Unknown, ranges.Last().max + 1).ToArray();
224 |             foreach (var range in ranges)
225 |                 for (int i = range.min; i <= range.max; i++)
226 |                     s_ScriptByChar[i] = range.script;
227 |         }
228 |     }
229 | }
230 | 


--------------------------------------------------------------------------------
/lib/Types.cs:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Microsoft Corporation.
  2 | // Licensed under the MIT license.
  3 | 
  4 | // This file contains a collection of enums, data structures, and interfaces
  5 | // referenced in FactoredSegmenter. These have been extracted from a larger library.
  6 | 
  7 | using System;
  8 | using System.Collections.Generic;
  9 | 
 10 | namespace Common.MT.Segments
 11 | {
 12 |     public class AlignmentLink : IComparable<AlignmentLink>, IEquatable<AlignmentLink>
 13 |     {
 14 |         public int SourceIndex { get; }
 15 |         public int TargetIndex { get; }
 16 |         public float Confidence { get; }
 17 |         public override int GetHashCode() => SourceIndex ^ (TargetIndex << 16);
 18 |         public bool Equals(AlignmentLink that) => that != null && this.SourceIndex == that.SourceIndex && this.TargetIndex == that.TargetIndex;
 19 |         public override string ToString() => $"{SourceIndex}:{TargetIndex}";
 20 |         public int CompareTo(AlignmentLink other)
 21 |         {
 22 |             int c1 = SourceIndex.CompareTo(other.SourceIndex);
 23 |             if (c1 != 0)
 24 |                 return c1;
 25 |             return TargetIndex.CompareTo(other.TargetIndex);
 26 |         }
 27 |     }
 28 |     public class Alignment
 29 |     {
 30 |         public List<AlignmentLink> Links { get; private set; }
 31 |         public Alignment InsertMissingTarget(int sourceIndex, int targetIndex)
 32 |             => throw new NotImplementedException("InsertMissingTarget is not supported");
 33 |         public int GetTargetIndexToInsert(int originalSrcIndex) => -1;
 34 |         public override string ToString() => string.Join(" ", Links);
 35 |     }
 36 | }
 37 | namespace Microsoft.MT.TextSegmentation.SpanFinder
 38 | {
 39 |     public enum AnnotatedSpanClassType
 40 |     {
 41 |         PhraseFix
 42 |     }
 43 |     public enum AnnotatedSpanInstructions  // note: ignored in standalone build
 44 |     {
 45 |         ForceDecodeAs,
 46 |         EncodeAsIf
 47 |     }
 48 |     public class AnnotatedSpan
 49 |     {
 50 |         public int StartIndex { get; private set; } // coordinates into the raw source string
 51 |         public int Length { get; private set; }
 52 |         /// <summary>
 53 |         /// If given, then Encode() will pretend that the character range was this string instead of the original.
 54 |         /// Casing and word/continuous-script factors are derived as if these characters were in the original.
 55 |         /// </summary>
 56 |         public string EncodeAsIf { get; private set; }
 57 |         /// <summary>
 58 |         /// If given, then Decode() will decode this token as the given string. Use this for PhraseFix.
 59 |         /// If not given, then Decode() will reproduce the original character string. Used internally for unencodable characters.
 60 |         /// This requires a class type. @TODO: In the future, it can also be a parenthesized pass-through (A|B).
 61 |         /// @BUGBUG: For now, we do not handle casing; DecodeAs is just applied as-is. Need to decide what to do here.
 62 |         /// </summary>
 63 |         public string DecodeAs { get; private set; }
 64 |         /// <summary>
 65 |         /// If given, this is the class type to use to represent this token in Marian.
 66 |         /// The reason to use different class types is that different
 67 |         /// classes may occur in different grammatical contexts (e.g. PhraseFix vs. Url).
 68 |         /// </summary>
 69 |         public AnnotatedSpanClassType? ClassType { get; private set; } // if non-null, then use this class token
 70 |         public AnnotatedSpan(
 71 |             int startIndex,
 72 |             int length,
 73 |             AnnotatedSpanClassType? classType,
 74 |             AnnotatedSpanInstructions instructions = AnnotatedSpanInstructions.ForceDecodeAs,  // note: ignored in standalone build
 75 |             string decodeAs = null,
 76 |             string encodeAsIf = null)
 77 |         {
 78 |             StartIndex = startIndex;
 79 |             Length = length;
 80 |             ClassType = classType;
 81 |             DecodeAs = decodeAs;
 82 |             EncodeAsIf = encodeAsIf;
 83 |         }
 84 |     }
 85 | }
 86 | namespace Microsoft.MT.Common.Tokenization
 87 | {
 88 |     public enum SegmenterKind
 89 |     {
 90 |         FactoredSegmenter,
 91 |         SentencePiece, // (not actually supported in this library)
 92 |         Unknown
 93 |     }
 94 |     public interface ISegmenterConfig { }
 95 |     public interface ISentencePieceConfig : ISegmenterConfig { }
 96 |     public interface IFactoredSegmenterConfig : ISegmenterConfig { }
 97 |     public class SegmenterConfigBase { }
 98 |     public abstract class SegmenterTrainConfigBase : SegmenterConfigBase
 99 |     {
100 |         /// <summary>
101 |         /// Maximum size of sentences to train sentence pieces
102 |         /// </summary>
103 |         public abstract int? TrainingSentenceSize { get; set; }
104 |     }
105 |     public class SegmenterEncodeConfigBase : SegmenterConfigBase { }
106 |     public class SegmenterDecodeConfigBase : SegmenterConfigBase { }
107 |     public class ProcessedToken
108 |     {
109 |         public static ProcessedToken CreateRegularToken(string sourceWord, List<string> origSource = null, int rawCharStart = -1, int rawCharLength = -1)
110 |             => throw new NotImplementedException("The ProcessedToken interface is not available in this build.");
111 |     }
112 | }
113 | 


--------------------------------------------------------------------------------
/lib/Utils.cs:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Microsoft Corporation.
  2 | // Licensed under the MIT license.
  3 | 
  4 | // This file contains a collection of utility functions. This is an extract
  5 | // from a larger library, reduced to what is actually used by this project.
  6 | 
  7 | using Common.Collections;
  8 | using Common.Collections.Extensions;
  9 | using Common.Contracts;
 10 | using System;
 11 | using System.Collections.Generic;
 12 | using System.Diagnostics;
 13 | using System.Globalization;
 14 | using System.IO;
 15 | using System.Linq;
 16 | using System.Text;
 17 | using System.Threading;
 18 | 
 19 | namespace Common.Collections.Extensions
 20 | {
 21 |     public static class StringExtensions
 22 |     {
 23 |         /// <summary>
 24 |         /// Convenience version of string.Join() that follows the Python syntax where the joiner is 'this'.
 25 |         /// </summary>
 26 |         public static string JoinItems<T>(this string separator, IEnumerable<T> items) => string.Join(separator, items);
 27 |     }
 28 |     public static class EnumerableExtensions
 29 |     {
 30 |         /// <summary>
 31 |         /// Create a sequence of overlapping pairs of the input.
 32 |         /// E.g. a b c d -> (a,b) (b,c) (c,d)
 33 |         /// </summary>
 34 |         /// <param name="sequence">Sequence of items. The sequence must have at least one element.</param>
 35 |         /// <returns>Sequence of bigrams</returns>
 36 |         public static IEnumerable<(T, T)> Bigrams<T>(this IEnumerable<T> sequence)
 37 |         {
 38 |             var seqEnum = sequence.GetEnumerator();
 39 |             bool movedNext = seqEnum.MoveNext();
 40 |             Sanity.Requires(movedNext, "Bigram() requires a non-empty input");
 41 |             T lastVal = seqEnum.Current;
 42 |             while (seqEnum.MoveNext())
 43 |             {
 44 |                 T thisVal = seqEnum.Current;
 45 |                 yield return (lastVal, thisVal);
 46 |                 lastVal = thisVal;
 47 |             }
 48 |         }
 49 |     }
 50 |     public static class DictionaryExtensions
 51 |     {
 52 |         /// <summary>
 53 |         /// Same as Enumerable.SequenceEquals(), except that arguments may also be null.
 54 |         /// Amazingly, a.NullableSequenceEquals(b) works for a=null, thanks to the magic
 55 |         /// of extension methods.
 56 |         /// </summary>
 57 |         /// <typeparam name="T"></typeparam>
 58 |         /// <param name="a">sequence or null</param>
 59 |         /// <param name="b">sequence or null</param>
 60 |         /// <returns>True if both args are null, or if both are non-null and sequences match</returns>
 61 |         public static bool NullableSequenceEquals<T>(this IEnumerable<T> a, IEnumerable<T> b)
 62 |         {
 63 |             return (a == null && b == null) ||
 64 |                    (a != null && b != null && Enumerable.SequenceEqual(a, b));
 65 |         }
 66 |     }
 67 |     public static class IOExtensions
 68 |     {
 69 |         /// <summary>
 70 |         /// Implements ReadLines() on the TextReader interface.
 71 |         /// </summary>
 72 |         public static IEnumerable<string> ReadLines(this TextReader textReader)
 73 |         {
 74 |             var line = textReader.ReadLine();
 75 |             while (line != null)
 76 |             {
 77 |                 yield return line;
 78 |                 line = textReader.ReadLine();
 79 |             }
 80 |         }
 81 |     }
 82 | }
 83 | namespace Common.Contracts
 84 | {
 85 |     public static class Sanity
 86 |     {
 87 |         public static bool Requires(bool condition, string errorMessage, params object[] args)
 88 |         {
 89 |             if (!condition)
 90 |             {
 91 |                 if (args.Length == 0)
 92 |                     throw new ArgumentException(errorMessage);
 93 |                 else
 94 |                     throw new ArgumentException(string.Format(CultureInfo.InvariantCulture, errorMessage, args));
 95 |             }
 96 |             return true; // allows to use it in an expression
 97 |         }
 98 |     }
 99 | }
100 | namespace Common.Utils
101 | {
102 |     public static class ProcessTool
103 |     {
104 |         static char[] k_ArgToCommandLineInvalidChars = Enumerable.Concat(from c in Enumerable.Range(0, (int)' ') select (char)c, new char[] { '"', '^' }).ToArray();
105 |         /// <summary>
106 |         /// escape an argument to a command line as needed in order to be parsed by CommandLineToArgv(), C++ CRT, or C#.
107 |         /// Some characters that are tricky to handle consistently. For now, we simply forbid them.
108 |         /// These include all control characters (0x00..0x1f), " (quotation marks inside string), and ^ (CMD shell escape).
109 |         /// To handle " and ^ correctly, we may need additional context on whether this is run via CMD, and there is
110 |         /// supposedly also a difference between CommandLineToArgV() and the C++ CRT (C# unknown) regarding sequences of double quotes.
111 |         /// </summary>
112 |         /// <param name="arg">Argument as the final string that the tool should receive, without escaping.</param>
113 |         /// <returns>Escaped version of argument, or unmodified argument if no escaping is needed.</returns>
114 |         static string ArgToCommandLine(string arg)
115 |         {
116 |             if (-1 != arg.IndexOfAny(k_ArgToCommandLineInvalidChars))
117 |                 throw new NotImplementedException($"ArgToCommandLine: presently cannot handle certain special characters (e.g. \" and ^) in: {arg}");
118 |             if (!arg.Any() || arg.Contains(' '))  // space is the delimiter, so we must surround the arg by quotes
119 |                 return $"\"{arg}\"";
120 |             else                    // otherwise, no need to escape (it would be OK to escape, but not escaping is better for log readability
121 |                 return arg;
122 |         }
123 |         /// <summary>
124 |         /// convert an array of string arguments to a command line as needed in order to be parsed by CommandLineToArgv(), C++ CRT, or C#.
125 |         /// </summary>
126 |         public static string ArgsToCommandLine(IEnumerable<string> args)
127 |             => string.Join(" ", from arg in args select ArgToCommandLine(arg));
128 | 
129 |         private static Process CreateProcess(string exe, string args,
130 |                                              IEnumerable<KeyValuePair<string, string>> envirVariables, bool isPipe,
131 |                                              TextWriter stderr)
132 |         {
133 |             var psi = new ProcessStartInfo(exe, args)
134 |             {
135 |                 UseShellExecute = false,
136 |                 CreateNoWindow = true,
137 |                 ErrorDialog = false,
138 |             };
139 |             if (isPipe)
140 |             {
141 |                 psi.RedirectStandardInput = true;
142 |                 psi.RedirectStandardOutput = true;
143 |                 psi.StandardInputEncoding = Encoding.UTF8;
144 |                 psi.StandardOutputEncoding = Encoding.UTF8;
145 |             }
146 |             if (stderr != null)
147 |             {
148 |                 psi.RedirectStandardError = true;
149 |                 psi.StandardErrorEncoding = Encoding.UTF8; // @REVIEW: needed?
150 |             }
151 |             if (envirVariables != null)
152 |                 foreach (KeyValuePair<string, string> pair in envirVariables)
153 |                     psi.EnvironmentVariables[pair.Key] = pair.Value;
154 | 
155 |             var process = new Process();
156 |             process.StartInfo = psi;
157 |             if (stderr != null)
158 |                 process.ErrorDataReceived += (sender, e) => { stderr.WriteLine(e.Data); };
159 |             process.Start();
160 |             if (stderr != null)
161 |                 process.BeginErrorReadLine();
162 |             return process;
163 |         }
164 | 
165 |         // @TODO: do we need IDisposable interface, so we can WaitForExit() for the process?
166 |         public class ProcessPipe
167 |         {
168 |             public readonly Process process;
169 |             public ProcessPipe(IList<string> argv, IEnumerable<KeyValuePair<string, string>> envirVariables = null) // UNIX-style argv array incl. exe itself
170 |             {
171 |                 process = CreateProcess(argv.First(), ArgsToCommandLine(argv.Skip(1)), envirVariables: envirVariables, isPipe: true, stderr: null);
172 |                 process.StandardInput.AutoFlush = true;
173 |             }
174 |         }
175 | 
176 |         public static int RunCommand(
177 |                    string exe,
178 |                    string args,
179 |                    string stdoutPath, // must be null in this version
180 |                    string stderrPath, // may be null
181 |                    bool throwOnFailure = true,
182 |                    IEnumerable<KeyValuePair<string, string>> envirVariables = null)
183 |         {
184 |             Sanity.Requires(stdoutPath == null, "This reduced version of RunCommand() does not support stdout redirection");
185 |             Logger.WriteLine($"executing command: {exe} {args}");
186 |             using (TextWriter stderrWriter = stderrPath == null ? null : new StreamWriter(stderrPath, append: false, encoding: new UTF8Encoding(encoderShouldEmitUTF8Identifier: false)) { AutoFlush = true })
187 |             using (var process = CreateProcess(exe, args, envirVariables, isPipe: false, stderr: stderrWriter))
188 |             {
189 |                 process.WaitForExit();
190 |                 if (throwOnFailure && process.ExitCode != 0)
191 |                     throw new IOException($"Exit code {process.ExitCode} was returned by external process: {exe} {args}");
192 |                 else
193 |                     return process.ExitCode;
194 |             }
195 |         }
196 |     }
197 | }
198 | namespace Common.Utils
199 | {
200 |     public static class Logger
201 |     {
202 |         public static void WriteLine(string format, params object[] args) => Console.Error.WriteLine(format, args);
203 | 
204 |         public static void WriteLine(string s) => Console.Error.WriteLine(s);
205 |     }
206 | }
207 | namespace Common.IO
208 | {
209 |     /// <summary>
210 |     /// Contains static creator methods for various types of writers that will typically be used
211 |     /// with AtomicFileWriter
212 |     /// </summary>
213 |     public static class AtomicFileWriter
214 |     {
215 |         /// <summary>
216 |         /// move a file to a target location that gets deleted first if existing
217 |         /// TODO: This seems to be duplicated about 20 times throughout the Solution; clean it up.
218 |         /// This operation is faked to be "atomic" in that race conditions are handled that arise from concurrent attempts of doing the same thing on a parallel process.
219 |         /// Note: if the source cannot be moved for whatever reason, but the target can be deleted, then this function will cause harm.
220 |         /// TODO: the class name AtomicFileWriter does not seem fully appropriate for this function
221 |         /// </summary>
222 |         /// <param name="from"></param>
223 |         /// <param name="to"></param>
224 |         static void MoveReplace(string from, string to)
225 |         {
226 |             // This loop caters to the situation that two processes try to do this concurrently on
227 |             // the same target path. The semantics should be that one of them wins. The special case
228 |             // is that when this process deletes the target location, but then fails because a file at
229 |             // the target path has magically reappeared. This must have been a concurrent process.
230 |             // In this case, we just try again.
231 |             while (true)
232 |             {
233 |                 File.Delete(to);
234 |                 try
235 |                 {
236 |                     File.Move(from, to);
237 |                     return; // success
238 |                 }
239 |                 catch (IOException)
240 |                 {
241 |                     if (!File.Exists(to)) // file not there: failed due to some other problem
242 |                         throw;
243 |                     // target file magically reappeared:
244 |                     // This must be a concurrent thread. Just try again. If we cannot delete this new one, we will fail in Delete().
245 |                 }
246 |             }
247 |         }
248 | 
249 |         /// <summary>
250 |         /// helper to save an object to disk via an intermediate tmp file and a lambda
251 |         /// The caller must provide a lambda that accepts a (temporary) file name, and save to that.
252 |         /// That temp file will then be atomically renamed into the target location.
253 |         /// It is "atomic" in the sense that in case of an error, it will not leave a partially written file
254 |         /// under the target name, and only overwrite a potentially existing one if the write operation succeeded.
255 |         /// The outFilePath may be null. In that case, the SaveFunc() is called with null. This allows
256 |         /// for nested Save() calls with multiple temp files, where some are optional.
257 |         /// </summary>
258 |         /// <param name="outFilePath">final output goes here</param>
259 |         /// <param name="SaveFunc">lambda that creates a file (to which this function passes a temp path, which then gets renamed)</param>
260 |         public static void Save(string outFilePath, Action<string> SaveFunc)
261 |         {
262 |             if (outFilePath == null)
263 |             {
264 |                 SaveFunc(null);
265 |                 return;
266 |             }
267 |             string tmpPath = $"{outFilePath}.{Thread.CurrentThread.ManagedThreadId}$$";
268 |             try
269 |             {
270 |                 SaveFunc(tmpPath);
271 |                 MoveReplace(tmpPath, outFilePath);
272 |             }
273 |             catch
274 |             {
275 |                 File.Delete(tmpPath); // best-effort cleanup (which may file e.g. in case of network disconnect)
276 |                 throw;
277 |             }
278 |         }
279 |     }
280 | }
281 | namespace Common.Collections
282 | {
283 |     /// <summary>
284 |     /// Wrapper around Dictionary with the following properties:
285 |     /// (1) Dictionary is only allowed to grow to m_maxSize
286 |     /// (2) Access is synchronized and read-write until the dictionary is full. 
287 |     /// Once the dictionary is full it becomes read-only (subsequent adds are no-ops) and lock-free.
288 |     /// The class was designed for use as a cache to store computations on key streams that are assumed to 
289 |     /// exhibit Zipfian distribution.
290 |     /// </summary>
291 |     /// <typeparam name="K"></typeparam>
292 |     /// <typeparam name="V"></typeparam>
293 |     public class BoundedSizedLockingCache<K, V> //: IDictionary<K, V>
294 |     {
295 |         private object m_locker = new object();
296 |         private int m_maxSize;
297 |         private Dictionary<K, V> m_dict = new Dictionary<K, V>();
298 |         volatile bool m_full = false;
299 | 
300 |         private void MaybeLock(Action act)
301 |         {
302 |             if (m_full)
303 |             {
304 |                 act();
305 |                 return;
306 |             }
307 |             else
308 |             {
309 |                 lock (m_locker)
310 |                 {
311 |                     act();
312 |                 }
313 |             }
314 |         }
315 | 
316 |         private RetT MaybeLock<RetT>(Func<RetT> func)
317 |         {
318 |             if (m_full)
319 |             {
320 |                 return func();
321 |             }
322 |             lock (m_locker)
323 |             {
324 |                 return func();
325 |             }
326 |         }
327 | 
328 |         private void MaybeSetFull()
329 |         {
330 |             MaybeLock(() => { if (m_dict.Count >= m_maxSize) { m_full = true; } });
331 |         }
332 | 
333 |         /// <summary>
334 |         /// Create a cache 
335 |         /// </summary>
336 |         /// <param name="maxSize">Maximum size of cache. Setting to 0 effectively disables the cache.</param>
337 |         public BoundedSizedLockingCache(int maxSize)
338 |         {
339 |             m_maxSize = maxSize;
340 |             MaybeSetFull();
341 |         }
342 |         /// <summary>
343 |         /// If the dictionary has room, add key and value. Otherwise this is a no-op.
344 |         /// </summary>
345 |         public void Add(K key, V value)
346 |         {
347 |             if (m_full)
348 |                 return;
349 |         
350 |             MaybeLock(() =>
351 |             {
352 |                 if (!m_dict.ContainsKey(key))
353 |                     m_dict.Add(key, value);
354 |                 MaybeSetFull();
355 |             });
356 |         }
357 |         public bool TryGetValue(K key, out V value)
358 |         {
359 |             if (m_full)
360 |             {
361 |                 return m_dict.TryGetValue(key, out value);
362 |             }
363 |             lock (m_locker)
364 |             {
365 |         
366 |                 return m_dict.TryGetValue(key, out value);
367 |             }
368 |         }
369 |     }
370 | }
371 | namespace Microsoft.MT.Common.Tokenization
372 | {
373 |     public static class CachedFunction
374 |     {
375 |         /// <summary>
376 |         /// If an entry exists in the cache for key, return it. Otherwise, call unary function func and add it to cache.
377 |         /// </summary>
378 |         public static int[] Memoize(BoundedSizedLockingCache<string, int[]> cache, string key, Func<string, int[]> func)
379 |         {
380 |             if (cache.TryGetValue(key, out var ret))
381 |                 return ret;
382 |             ret = func(key);
383 |             cache.Add(key, ret);
384 |             return ret;
385 |         }
386 |     }
387 | }
388 | 


--------------------------------------------------------------------------------
/spm/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | ﻿# CMakeList.txt : CMake project for SentencePieceWrapper, include source and define
 2 | # project specific logic here.
 3 | #
 4 | cmake_minimum_required (VERSION 3.8)
 5 | 
 6 | # Add source to this project's executable.
 7 | add_executable (SentencePieceInterop "SentencePieceInterop.cpp" "unicode_conversions.h")
 8 | 
 9 | # TODO: Add tests and install targets if needed.
10 | 


--------------------------------------------------------------------------------
/spm/SentencePieceInterop.cpp:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Microsoft Corporation.
  2 | // Licensed under the MIT license.
  3 | 
  4 | #include <stdio.h>
  5 | #include <iostream>
  6 | 
  7 | #include <sentencepiece_processor.h>
  8 | #include <memory>  // for unique_ptr
  9 | #include <fstream>
 10 | #include "unicode_conversions.h"
 11 | 
 12 | // This requires libsentencepiece.so. If you follow the build instructions
 13 | // from C++ source on https://github.com/google/sentencepiece, the necessary
 14 | // header file and library will be installed in the system.
 15 | 
 16 | // ---------------------------------------------------------------------------
 17 | // C++ implementation of the functionality
 18 | // ---------------------------------------------------------------------------
 19 | 
 20 | class SentencePieceInterop
 21 | {
 22 |     std::unique_ptr<sentencepiece::SentencePieceProcessor> m_processor;
 23 | 
 24 |     void check_status(sentencepiece::util::Status status, const char* what)
 25 |     {
 26 |         if (status.ok())
 27 |             return;
 28 |         std::string err = status.ToString();
 29 |         std::cerr << err << std::endl;
 30 |         throw std::runtime_error(std::string("SentencePiece error ") + what + ": " + err);
 31 |     }
 32 | public:
 33 |     SentencePieceInterop(const uint16_t* modelPath, const uint16_t** vocab, size_t vocabSize)
 34 |     {
 35 |         m_processor.reset(new sentencepiece::SentencePieceProcessor());
 36 |         // load the model file
 37 |         const auto status = m_processor->Load(utf16_to_utf8(utf16string(modelPath)));
 38 |         // implant the restricted vocabulary, if given
 39 |         if (vocab && vocabSize > 0)
 40 |         {
 41 |             std::vector<std::string> vocab_str;
 42 |             for (size_t i = 0; i < vocabSize; i++)
 43 |             {
 44 |                 vocab_str.push_back(utf16_to_utf8(utf16string(vocab[i])));
 45 |             }
 46 | 
 47 |             m_processor->SetVocabulary(vocab_str);
 48 |         }
 49 |         check_status(status, "loading");
 50 |     }
 51 | 
 52 |     int EncodeAsIds(const uint16_t* word, int* pieceIdBuffer, size_t pieceIdBufferSize)
 53 |     {
 54 |         std::string wordInUtf8 = utf16_to_utf8(utf16string(word));
 55 |         auto piece_ids = m_processor->EncodeAsIds(sentencepiece::util::min_string_view(wordInUtf8));
 56 |         if (piece_ids.size() > pieceIdBufferSize)
 57 |            return -((int)piece_ids.size());
 58 |             
 59 |         std::copy(piece_ids.begin(), piece_ids.end(), pieceIdBuffer);
 60 |         return (int)piece_ids.size();
 61 |     }
 62 | 
 63 |     int UCS2LengthOfPieceId(int pieceId)
 64 |     {
 65 |         if (m_processor->IsUnknown(pieceId))
 66 |             return -1;
 67 |         auto utf8 = m_processor->IdToPiece(pieceId);
 68 |         return (int)count_utf8_to_utf16(utf8);
 69 |     }
 70 | };
 71 | 
 72 | // ---------------------------------------------------------------------------
 73 | // C/C++ interop and exported C functions
 74 | //  - intptr_t object = LoadModel(void* model, size_t modelSize, char** vocab, size_t vocabSize)
 75 | //  - length = EncodeAsIds(intptr_t object, const char* wordInUtf8, int* pieceIdBuffer, size_t pieceIdBufferSIze)  // pieceIdBuffer size >= strlen(word)+1
 76 | //  - n = UCS2LengthOfPieceId(intptr_t object, int pieceId)
 77 | //  - UnloadModel(intptr_t object)
 78 | // ---------------------------------------------------------------------------
 79 | 
 80 | #if defined(_MSC_VER)
 81 |     //  Microsoft 
 82 | #define EXPORT __declspec(dllexport)
 83 | #define IMPORT __declspec(dllimport)
 84 | #elif defined(__GNUC__)
 85 |     //  GCC
 86 | #define EXPORT __attribute__((visibility("default")))
 87 | #define IMPORT
 88 | #else
 89 |     //  do nothing and hope for the best?
 90 | #define EXPORT
 91 | #define IMPORT
 92 | #pragma warning Unknown dynamic link import/export semantics.
 93 | #endif
 94 | 
 95 | extern "C" {
 96 | 
 97 | intptr_t EXPORT LoadModel(const uint16_t* modelPath, const uint16_t** vocab, size_t vocabSize)
 98 | {
 99 |     try
100 |     {
101 |         return (intptr_t) new SentencePieceInterop(modelPath, vocab, vocabSize);
102 |     }
103 |     catch(...)  // @TODO: how to return meaningful error information?
104 |     {
105 |         return (intptr_t) nullptr;
106 |     }
107 | }
108 | 
109 | int EXPORT EncodeAsIds(intptr_t object, const uint16_t* word, int* pieceIdBuffer, size_t pieceIdBufferSize)
110 | {
111 |     try
112 |     {
113 |         return (int)((SentencePieceInterop*)object)->EncodeAsIds(word, pieceIdBuffer, pieceIdBufferSize);
114 |     }
115 |     catch(...)  // @TODO: how to return meaningful error information?
116 |     {
117 |         return -1;
118 |     }
119 | }
120 | 
121 | int EXPORT UCS2LengthOfPieceId(intptr_t object, int pieceId)
122 | {
123 |     try
124 |     {
125 |         return ((SentencePieceInterop*)object)->UCS2LengthOfPieceId(pieceId);
126 |     }
127 |     catch(...)  // @TODO: how to return meaningful error information?
128 |     {
129 |         return 0;  // 0 is an invalid length
130 |     }
131 | }
132 | 
133 | void EXPORT UnloadModel(intptr_t object)
134 | {
135 |     delete (SentencePieceInterop*)object;
136 | }
137 | 
138 | }
139 | 
140 | // ---------------------------------------------------------------------------
141 | // BELOW IS MY DEV WRAPPER
142 | // ---------------------------------------------------------------------------
143 | 
144 | // how to build:
145 | //  - clang -lstdc++ -std=c++11 -lsentencepiece -Wall -Werror SentencePieceInterop.cpp
146 | // how the SPM files for testing were obtained:
147 | //  - run factored-segmenter encode --model /marcinjdeu.blob.core.windows.net/forfrank/model-99995c.fsm
148 | //  - you will see a log msg like this:
149 | //    starting SentencePiece instance as: /usr/local/bin/spm_encode --model /tmp/tmpg9BX8N.tmp --vocabulary /tmp/tmpFslYfv.tmp
150 | //  - copy out the --model and --vocab temp files
151 | 
152 | using namespace std;
153 | 
154 | const char* spmModelPath = "/home/fseide/factored-segmenter/spm/spm.model";
155 | const char* spmVocabPath = "/home/fseide/factored-segmenter/spm/spm.vocab";
156 | 
157 | //vector<string> test_strings =
158 | //{
159 | //    "\u2581HELLO",
160 | //    "\u2581OBAMA",
161 | //    "OBAMA",
162 | //    "HELL\u2582\u2582O"  // out-of-vocab example
163 | //};
164 | 
165 | void fail(const char* msg) { cerr << "FAILED: " << msg << endl; exit(1); }
166 | 
167 | int main()
168 | {
169 |     // load the model file into RAM
170 |     ifstream f_model(spmModelPath);
171 |     auto modelBytes = vector<char>(istreambuf_iterator<char>(f_model), istreambuf_iterator<char>());
172 |     if (f_model.bad() || modelBytes.empty())  // note: bad bit does not get set if file not found (??)
173 |         fail("Failed to read SPM model file.");
174 | 
175 |     // load the vocab file
176 |     ifstream f_vocab(spmVocabPath);
177 |     vector<string> vocab;
178 |     while (f_vocab)
179 |     {
180 |         string line;
181 |         getline(f_vocab, line);
182 |         vocab.push_back(line);
183 |     }
184 |     //vector<const uint16_t*> vocab_ptr;
185 |     //for (const auto& line : vocab)
186 |     //    vocab_ptr.push_back(line.c_str());
187 | 
188 |     //auto object = LoadModel(spmModelPath, vocab_ptr.data(), vocab_ptr.size());
189 | 
190 |     //for (const auto& test_string : test_strings)
191 |     //{
192 |     //    cerr << "Testing: " << test_string << endl;
193 |     //    vector<int> piece_ids(test_string.size() + 1);
194 |     //    auto num_pieces = EncodeAsIds(object, test_string.c_str(), piece_ids.data(), piece_ids.size());
195 |     //    if (num_pieces < 0)
196 |     //        fail("Failed to EncodeAsIds.");
197 |     //    piece_ids.resize(num_pieces);
198 |     //    for (auto piece_id : piece_ids)
199 |     //        cerr << " piece id " << piece_id << " has " << UCS2LengthOfPieceId(object, piece_id) << " UCS-2 characters" << endl;
200 |     //}
201 |     //UnloadModel(object);
202 |     cerr << "Done." << endl;
203 | }
204 | 


--------------------------------------------------------------------------------
/spm/spm.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/factored-segmenter/cf3a8bd099719a67d886eab907d996b187d924f6/spm/spm.model


--------------------------------------------------------------------------------
/spm/unicode_conversions.h:
--------------------------------------------------------------------------------
  1 | // Copyright (c) Microsoft Corporation.
  2 | // Licensed under the MIT license.
  3 | 
  4 | // This was extracted from https://github.com/microsoft/cpprestsdk/blob/cdae258bfb22f948c7b768b4dc56f5f4a2d9b2ce/Release/src/utilities/asyncrt_utils.cpp#L305
  5 | 
  6 | #include <string>
  7 | #include <stdexcept>
  8 | 
  9 | typedef std::basic_string<uint16_t> utf16string;
 10 | 
 11 | #define LOW_3BITS 0x7
 12 | #define LOW_4BITS 0xF
 13 | #define LOW_5BITS 0x1F
 14 | #define LOW_6BITS 0x3F
 15 | #define BIT4 0x8
 16 | #define BIT5 0x10
 17 | #define BIT6 0x20
 18 | #define BIT7 0x40
 19 | #define BIT8 0x80
 20 | #define L_SURROGATE_START 0xDC00
 21 | #define L_SURROGATE_END 0xDFFF
 22 | #define H_SURROGATE_START 0xD800
 23 | #define H_SURROGATE_END 0xDBFF
 24 | #define SURROGATE_PAIR_START 0x10000
 25 | 
 26 | // Create a dedicated type for characters to avoid the issue
 27 | // of different platforms defaulting char to be either signed
 28 | // or unsigned.
 29 | using UtilCharInternal_t = signed char;
 30 | 
 31 | inline size_t count_utf8_to_utf16(const std::string& s)
 32 | {
 33 |     const size_t sSize = s.size();
 34 |     auto const sData = reinterpret_cast<const UtilCharInternal_t*>(s.data());
 35 |     size_t result {sSize};
 36 | 
 37 |     for (size_t index = 0; index < sSize;)
 38 |     {
 39 |         if (sData[index] >= 0)
 40 |         {
 41 |             // use fast inner loop to skip single byte code points (which are
 42 |             // expected to be the most frequent)
 43 |             while ((++index < sSize) && (sData[index] >= 0))
 44 |                 ;
 45 | 
 46 |             if (index >= sSize) break;
 47 |         }
 48 | 
 49 |         // start special handling for multi-byte code points
 50 |         const UtilCharInternal_t c {sData[index++]};
 51 | 
 52 |         if ((c & BIT7) == 0)
 53 |         {
 54 |             throw std::range_error("UTF-8 string character can never start with 10xxxxxx");
 55 |         }
 56 |         else if ((c & BIT6) == 0) // 2 byte character, 0x80 to 0x7FF
 57 |         {
 58 |             if (index == sSize)
 59 |             {
 60 |                 throw std::range_error("UTF-8 string is missing bytes in character");
 61 |             }
 62 | 
 63 |             const UtilCharInternal_t c2 {sData[index++]};
 64 |             if ((c2 & 0xC0) != BIT8)
 65 |             {
 66 |                 throw std::range_error("UTF-8 continuation byte is missing leading bit mask");
 67 |             }
 68 | 
 69 |             // can't require surrogates for 7FF
 70 |             --result;
 71 |         }
 72 |         else if ((c & BIT5) == 0) // 3 byte character, 0x800 to 0xFFFF
 73 |         {
 74 |             if (sSize - index < 2)
 75 |             {
 76 |                 throw std::range_error("UTF-8 string is missing bytes in character");
 77 |             }
 78 | 
 79 |             const UtilCharInternal_t c2 {sData[index++]};
 80 |             const UtilCharInternal_t c3 {sData[index++]};
 81 |             if (((c2 | c3) & 0xC0) != BIT8)
 82 |             {
 83 |                 throw std::range_error("UTF-8 continuation byte is missing leading bit mask");
 84 |             }
 85 | 
 86 |             result -= 2;
 87 |         }
 88 |         else if ((c & BIT4) == 0) // 4 byte character, 0x10000 to 0x10FFFF
 89 |         {
 90 |             if (sSize - index < 3)
 91 |             {
 92 |                 throw std::range_error("UTF-8 string is missing bytes in character");
 93 |             }
 94 | 
 95 |             const UtilCharInternal_t c2 {sData[index++]};
 96 |             const UtilCharInternal_t c3 {sData[index++]};
 97 |             const UtilCharInternal_t c4 {sData[index++]};
 98 |             if (((c2 | c3 | c4) & 0xC0) != BIT8)
 99 |             {
100 |                 throw std::range_error("UTF-8 continuation byte is missing leading bit mask");
101 |             }
102 | 
103 |             const uint32_t codePoint =
104 |                 ((c & LOW_3BITS) << 18) | ((c2 & LOW_6BITS) << 12) | ((c3 & LOW_6BITS) << 6) | (c4 & LOW_6BITS);
105 |             result -= (3 - (codePoint >= SURROGATE_PAIR_START));
106 |         }
107 |         else
108 |         {
109 |             throw std::range_error("UTF-8 string has invalid Unicode code point");
110 |         }
111 |     }
112 | 
113 |     return result;
114 | }
115 | 
116 | utf16string /*__cdecl conversions::*/utf8_to_utf16(const std::string& s)
117 | {
118 |     // Save repeated heap allocations, use the length of resulting sequence.
119 |     const size_t srcSize = s.size();
120 |     auto const srcData = reinterpret_cast<const UtilCharInternal_t*>(s.data());
121 |     utf16string dest(count_utf8_to_utf16(s), L'\0');
122 |     utf16string::value_type* const destData = &dest[0];
123 |     size_t destIndex = 0;
124 | 
125 |     for (size_t index = 0; index < srcSize; ++index)
126 |     {
127 |         UtilCharInternal_t src = srcData[index];
128 |         switch (src & 0xF0)
129 |         {
130 |             case 0xF0: // 4 byte character, 0x10000 to 0x10FFFF
131 |             {
132 |                 const UtilCharInternal_t c2 {srcData[++index]};
133 |                 const UtilCharInternal_t c3 {srcData[++index]};
134 |                 const UtilCharInternal_t c4 {srcData[++index]};
135 |                 uint32_t codePoint =
136 |                     ((src & LOW_3BITS) << 18) | ((c2 & LOW_6BITS) << 12) | ((c3 & LOW_6BITS) << 6) | (c4 & LOW_6BITS);
137 |                 if (codePoint >= SURROGATE_PAIR_START)
138 |                 {
139 |                     // In UTF-16 U+10000 to U+10FFFF are represented as two 16-bit code units, surrogate pairs.
140 |                     //  - 0x10000 is subtracted from the code point
141 |                     //  - high surrogate is 0xD800 added to the top ten bits
142 |                     //  - low surrogate is 0xDC00 added to the low ten bits
143 |                     codePoint -= SURROGATE_PAIR_START;
144 |                     destData[destIndex++] = static_cast<utf16string::value_type>((codePoint >> 10) | H_SURROGATE_START);
145 |                     destData[destIndex++] =
146 |                         static_cast<utf16string::value_type>((codePoint & 0x3FF) | L_SURROGATE_START);
147 |                 }
148 |                 else
149 |                 {
150 |                     // In UTF-16 U+0000 to U+D7FF and U+E000 to U+FFFF are represented exactly as the Unicode code point
151 |                     // value. U+D800 to U+DFFF are not valid characters, for simplicity we assume they are not present
152 |                     // but will encode them if encountered.
153 |                     destData[destIndex++] = static_cast<utf16string::value_type>(codePoint);
154 |                 }
155 |             }
156 |             break;
157 |             case 0xE0: // 3 byte character, 0x800 to 0xFFFF
158 |             {
159 |                 const UtilCharInternal_t c2 {srcData[++index]};
160 |                 const UtilCharInternal_t c3 {srcData[++index]};
161 |                 destData[destIndex++] = static_cast<utf16string::value_type>(
162 |                     ((src & LOW_4BITS) << 12) | ((c2 & LOW_6BITS) << 6) | (c3 & LOW_6BITS));
163 |             }
164 |             break;
165 |             case 0xD0: // 2 byte character, 0x80 to 0x7FF
166 |             case 0xC0:
167 |             {
168 |                 const UtilCharInternal_t c2 {srcData[++index]};
169 |                 destData[destIndex++] =
170 |                     static_cast<utf16string::value_type>(((src & LOW_5BITS) << 6) | (c2 & LOW_6BITS));
171 |             }
172 |             break;
173 |             default: // single byte character, 0x0 to 0x7F
174 |                 // try to use a fast inner loop for following single byte characters,
175 |                 // since they are quite probable
176 |                 do
177 |                 {
178 |                     destData[destIndex++] = static_cast<utf16string::value_type>(srcData[index++]);
179 |                 } while (index < srcSize && srcData[index] > 0);
180 |                 // adjust index since it will be incremented by the for loop
181 |                 --index;
182 |         }
183 |     }
184 |     return dest;
185 | }
186 | 
187 | inline size_t count_utf16_to_utf8(const utf16string& w)
188 | {
189 |     const utf16string::value_type* const srcData = &w[0];
190 |     const size_t srcSize = w.size();
191 |     size_t destSize(srcSize);
192 |     for (size_t index = 0; index < srcSize; ++index)
193 |     {
194 |         const utf16string::value_type ch(srcData[index]);
195 |         if (ch <= 0x7FF)
196 |         {
197 |             if (ch > 0x7F) // 2 bytes needed (11 bits used)
198 |             {
199 |                 ++destSize;
200 |             }
201 |         }
202 |         // Check for high surrogate.
203 |         else if (ch >= H_SURROGATE_START && ch <= H_SURROGATE_END) // 4 bytes needed (21 bits used)
204 |         {
205 |             ++index;
206 |             if (index == srcSize)
207 |             {
208 |                 throw std::range_error("UTF-16 string is missing low surrogate");
209 |             }
210 | 
211 |             const auto lowSurrogate = srcData[index];
212 |             if (lowSurrogate < L_SURROGATE_START || lowSurrogate > L_SURROGATE_END)
213 |             {
214 |                 throw std::range_error("UTF-16 string has invalid low surrogate");
215 |             }
216 | 
217 |             destSize += 2;
218 |         }
219 |         else // 3 bytes needed (16 bits used)
220 |         {
221 |             destSize += 2;
222 |         }
223 |     }
224 | 
225 |     return destSize;
226 | }
227 | 
228 | std::string /*__cdecl conversions::*/utf16_to_utf8(const utf16string& w)
229 | {
230 |     const size_t srcSize = w.size();
231 |     const utf16string::value_type* const srcData = &w[0];
232 |     std::string dest(count_utf16_to_utf8(w), '\0');
233 |     std::string::value_type* const destData = &dest[0];
234 |     size_t destIndex(0);
235 | 
236 |     for (size_t index = 0; index < srcSize; ++index)
237 |     {
238 |         const utf16string::value_type src = srcData[index];
239 |         if (src <= 0x7FF)
240 |         {
241 |             if (src <= 0x7F) // single byte character
242 |             {
243 |                 destData[destIndex++] = static_cast<char>(src);
244 |             }
245 |             else // 2 bytes needed (11 bits used)
246 |             {
247 |                 destData[destIndex++] = static_cast<char>(char((src >> 6) | 0xC0));        // leading 5 bits
248 |                 destData[destIndex++] = static_cast<char>(char((src & LOW_6BITS) | BIT8)); // trailing 6 bits
249 |             }
250 |         }
251 |         // Check for high surrogate.
252 |         else if (src >= H_SURROGATE_START && src <= H_SURROGATE_END)
253 |         {
254 |             const auto highSurrogate = src;
255 |             const auto lowSurrogate = srcData[++index];
256 | 
257 |             // To get from surrogate pair to Unicode code point:
258 |             // - subtract 0xD800 from high surrogate, this forms top ten bits
259 |             // - subtract 0xDC00 from low surrogate, this forms low ten bits
260 |             // - add 0x10000
261 |             // Leaves a code point in U+10000 to U+10FFFF range.
262 |             uint32_t codePoint = highSurrogate - H_SURROGATE_START;
263 |             codePoint <<= 10;
264 |             codePoint |= lowSurrogate - L_SURROGATE_START;
265 |             codePoint += SURROGATE_PAIR_START;
266 | 
267 |             // 4 bytes needed (21 bits used)
268 |             destData[destIndex++] = static_cast<char>((codePoint >> 18) | 0xF0);               // leading 3 bits
269 |             destData[destIndex++] = static_cast<char>(((codePoint >> 12) & LOW_6BITS) | BIT8); // next 6 bits
270 |             destData[destIndex++] = static_cast<char>(((codePoint >> 6) & LOW_6BITS) | BIT8);  // next 6 bits
271 |             destData[destIndex++] = static_cast<char>((codePoint & LOW_6BITS) | BIT8);         // trailing 6 bits
272 |         }
273 |         else // 3 bytes needed (16 bits used)
274 |         {
275 |             destData[destIndex++] = static_cast<char>((src >> 12) | 0xE0);              // leading 4 bits
276 |             destData[destIndex++] = static_cast<char>(((src >> 6) & LOW_6BITS) | BIT8); // middle 6 bits
277 |             destData[destIndex++] = static_cast<char>((src & LOW_6BITS) | BIT8);        // trailing 6 bits
278 |         }
279 |     }
280 | 
281 |     return dest;
282 | }
283 | 


--------------------------------------------------------------------------------
/src/FactoredSegmenterConfigs.cs:
--------------------------------------------------------------------------------
  1 | ﻿// Copyright (c) Microsoft Corporation.
  2 | // Licensed under the MIT license.
  3 | 
  4 | using System;
  5 | using System.IO;
  6 | using System.Xml.Serialization;
  7 | using Common.Utils;
  8 | 
  9 | namespace Microsoft.MT.Common.Tokenization
 10 | {
 11 |     /// <summary>
 12 |     /// Configurable options for FactoredSegmenter models.
 13 |     /// All options that are kept inside the model file go here.
 14 |     /// </summary>
 15 |     public class FactoredSegmenterModelOptions
 16 |     {
 17 |         /// <summary>
 18 |         /// if false, do not emit |we nor |ce factors
 19 |         /// </summary>
 20 |         public bool RightWordGlue { get; set; } = false;
 21 | 
 22 |         /// <summary>
 23 |         /// if true, word-internal and word-initial pieces use distinct lemmas
 24 |         /// Without this, as piece xyz can exist in at least four forms, which in original
 25 |         /// SentencePiece notation would be written as xyz, Xyz, _xyz, and _Xyz.
 26 |         /// The latter three are all word boundaries, while the first is word-internal.
 27 |         /// I.e. two fundamentally different units are mapped onto the same piece.
 28 |         /// With this flag set, the latter three will use a different symbol.
 29 |         /// ...This is experimental, and not yet confirmed to help.
 30 |         /// </summary>
 31 |         public bool DistinguishInitialAndInternalPieces { get; set; } = false;
 32 | 
 33 |         public bool SplitHan { get; set; } = false;
 34 | 
 35 |         /// <summary>
 36 |         /// separate case factors for single letters
 37 |         /// For single letters, it is not clear whether to use |ca or |ci.
 38 |         /// With this option, we use a completely different factor |scu or |scl for single-letter words.
 39 |         /// This seems to quite robustly improve capitalization for English "I" and "U.S." for example.
 40 |         /// </summary>
 41 |         public bool SingleLetterCaseFactors { get; set; } = false;
 42 | 
 43 |         /// <summary>
 44 |         /// serialize phrase-fix indices and unrepresentable characters
 45 |         /// With this option, the index factor is no longer an additive factor,
 46 |         /// but instead is represented as a sequence of digits. This frees bits in Marian for other factors.
 47 |         /// Likewise, unrepresentable characters (=single characters not found in the
 48 |         /// SentencePiece vocabulary) are also serialized as their Unicode in digit form.
 49 |         /// This allows for any character to be represented (and hopefully translated by at least copying it through).
 50 |         /// ...This is experimental.
 51 |         public bool SerializeIndicesAndUnrepresentables { get; set; } = false;
 52 | 
 53 |         /// <summary>
 54 |         /// If true, phrase fixes are encoded by including them in the source.
 55 |         /// ...This is experimental.
 56 |         /// ...This is ongoing work. The following will be addressed once we know whether this works at all:
 57 |         ///  - no correct escaping of our internal delimiter chars if they occur in real text
 58 |         ///  - delimiter chars should be encoded in the form as XML tags, and have no glue factors
 59 |         ///  - glue/boundary-factor determination must correctly see through the delimited ranges
 60 |         ///  - currently if the decoder decides to output the delimiter chars, they will not be removed
 61 |         ///  - delimiter chars should be excluded from shortlists (as should sentence-start)
 62 |         /// </summary>
 63 |         public bool InlineFixes { get; set; } = false;
 64 | 
 65 |         /// <summary>
 66 |         /// If true, use start/middle/end tags (which is known to not work well).
 67 |         /// If false, then use INLINE_FIX_TYPE factors for the inline-fix tokens.
 68 |         /// Only used if InlineFixes == true.
 69 |         /// </summary>
 70 |         public bool InlineFixUseTags { get; set; } = false;
 71 | 
 72 |         /// <summary>
 73 |         /// Enables context-dependent capitalization factors for single letters.
 74 |         /// Workaround for Bug #101419 "Training of allcaps factors is inconsistent".
 75 |         /// The Marian all-caps routine changes all factors to "ca", causing an inconsistency
 76 |         /// with measurable impact. With this flag set, FactoredSegmenter will try to guess
 77 |         /// whether a single uppercase letter is part of an all-caps word sequence.
 78 |         /// ...This does not seem to work well, and may be removed.
 79 |         /// </summary>
 80 |         public bool UseContextDependentSingleLetterCapitalizationFactors { get; set; } = false;
 81 | 
 82 |         /// <summary>
 83 |         /// For sentence-level annotations, e.g. multi-lingual systems, this string
 84 |         /// declares the types of annotations. E.g. to enable sentence-level annotations
 85 |         /// for the sentence target language, e.g. "target_language=ENU", the string
 86 |         /// "target_language" is the type, and it must be declared here as a model option.
 87 |         /// </summary>      
 88 |         public string SourceSentenceAnnotationTypes { get; set; } = "";
 89 | 
 90 |         /// <summary>
 91 |         /// The list of source sentence annotation types.
 92 |         /// Note that this is not a property that can be specified by the user. User should instead specify SourceSentenceAnnotationTypes in above.
 93 |         /// </summary>
 94 |         [XmlIgnore]
 95 |         internal string[] SourceSentenceAnnotationTypeList => SourceSentenceAnnotationTypes != null ?
 96 |                                                               SourceSentenceAnnotationTypes.Split(new string[] { ";" }, StringSplitOptions.RemoveEmptyEntries) :
 97 |                                                               new string[0];
 98 | 
 99 |         // system-managed options persisted to file follow; not to be specified by user
100 | 
101 |         /// <summary>
102 |         /// if false then skip SentencePiece. If true, then SPM model file is FS model path s/\.model$/\.fsm/
103 |         /// </summary>
104 |         public bool? UseSentencePiece { get; set; }
105 |     }
106 | 
107 |     /// <summary>
108 |     /// Class to hold all parameters for the FactoredSegmenter training tool.
109 |     /// </summary>
110 |     public class FactoredSegmenterModelTrainConfig : SegmenterTrainConfigBase, IFactoredSegmenterConfig
111 |     {
112 |         /// <summary>
113 |         /// options persisted with the model, e.g. whether to use certain factors
114 |         /// </summary>
115 |         public FactoredSegmenterModelOptions ModelOptions { get; set; } = new FactoredSegmenterModelOptions(); 
116 |         /// <summary>
117 |         /// Number of sentences to use for determining the Marian vocab and for training
118 |         /// the underlying SentencePiece model. Normally set to 10 million.
119 |         /// This many sentences are sampled from the training corpus.
120 |         /// For joint training, this is the total number of sentences across both languages.
121 |         /// </summary>
122 |         public override int? TrainingSentenceSize { get; set; }
123 |         /// <summary>
124 |         /// Only keep SentencePiece units ("pieces") with at least this many observations
125 |         /// in the entire training set. Any unit with fewer observations will be represented
126 |         /// as multiple shorter pieces. The rationale is that too rare observations will
127 |         /// not get a properly trained embedding.
128 |         /// The total Marian vocabulary consists of these pieces plus single characters.
129 |         /// If TrainingSentenceSize is set, only a subset is processed. In this case,
130 |         /// this count is adjusted automatically internally accordingly.
131 |         /// </summary>
132 |         public int MinPieceCount { get; set; } = 0;
133 |         /// <summary>
134 |         /// Only keep single characters with at least this many observations in the entire
135 |         /// training data. Any character sequence that is not covered by units in the
136 |         /// SentencePiece vocabulary will be represented as single characters.
137 |         /// Many of these single characters are very rare, e.g. graphical characters
138 |         /// or Cyrillic characters in a Chinese corpus, and cannot be learned properly.
139 |         /// This parameter allows to eliminate rare characters from the vocab (they will
140 |         /// be treated as unrepresentable, which presently means UNK).
141 |         /// This threshold needs to be smaller than MinPieceCount to have an effect.
142 |         /// If TrainingSentenceSize is set, only a subset is processed. In this case,
143 |         /// this count is adjusted automatically internally accordingly.
144 |         /// </summary>
145 |         public int MinCharCount  { get; set; } = 0;
146 |         /// <summary>
147 |         /// Config for the underlying SentencePiece training (or null to indicate to not use SentencePiece).
148 |         /// </summary>
149 |         public SentencePieceTrainConfig SentencePieceTrainingConfig { get; set; } = new SentencePieceTrainConfig();
150 |         // @BUGBUG: now ^^ this is created by default, so there is no way to turn it off -> @TODO: ModelOptions->UseSentencePiece = false says 'ignore this'
151 |     }
152 | 
153 |     /// <summary>
154 |     /// Class to hold all parameters for the FactoredSegmenter encoding tool.
155 |     /// </summary>
156 |     public class FactoredSegmenterEncodeConfig : SegmenterEncodeConfigBase, IFactoredSegmenterConfig
157 |     {
158 |         public SentencePieceEncodeConfig SentencePieceEncodeConfig { get; set; } // for the underlying SentencePiece module
159 | 
160 |         // for debugging:
161 |         public int CheckEvery { get; set; } = 100; // decode each N-th encoded sentence and verify against source
162 |     }
163 | 
164 |     /// <summary>
165 |     /// Class to hold all parameters for the FactoredSegmenter decoding tool.
166 |     /// </summary>
167 |     public class FactoredSegmenterDecodeConfig : SegmenterDecodeConfigBase, IFactoredSegmenterConfig
168 |     {
169 |         SentencePieceDecodeConfig SentencePieceDecodeConfig { get; set; } // for the underlying SentencePiece module
170 |     }
171 | }
172 | 


--------------------------------------------------------------------------------
/src/FactoredSegmenterScriptHelpers.cs:
--------------------------------------------------------------------------------
  1 | ﻿// Copyright (c) Microsoft Corporation.
  2 | // Licensed under the MIT license.
  3 | 
  4 | // This is meant as an extension of Unicode.cs. It should be merged into there,
  5 | // once the code in here has reached a sufficient level of maturity and generality
  6 | // across languages, and generally support surrogate pairs.
  7 | 
  8 | using System.Collections.Generic;
  9 | using System.Linq;
 10 | 
 11 | namespace Common.Text
 12 | {
 13 |     /// <summary>
 14 |     /// Helper class for Unicode characters.
 15 |     /// @BUGBUG: These do not work with surrogate pairs.
 16 |     /// </summary>
 17 |     public static class ScriptExtensions
 18 |     {
 19 |         /// <summary>
 20 |         /// Helper to test whether a character has a character code in range min..max
 21 |         /// </summary>
 22 |         public static bool IsInRange(this char c, int min, int max) => (c >= (char)min && c <= (char)max);
 23 | 
 24 |         /// <summary>
 25 |         /// Is character a combining character?
 26 |         /// </summary>
 27 |         /// <param name="c"></param>
 28 |         /// <returns></returns>
 29 |         public static bool IsCombiner(this char c) => c.GetUnicodeMajorDesignation() == 'M';
 30 | 
 31 |         /// <summary>
 32 |         /// Is character a Variation Selector? [https://en.wikipedia.org/wiki/Variation_Selectors_(Unicode_block)]
 33 |         /// Note that these are included in IsCombiner as well.
 34 |         /// </summary>
 35 |         //public static bool IsVariationSelector(this char c) => c.IsInRange(0xfe00, 0xfe0f);
 36 | 
 37 |         /// <summary>
 38 |         /// Helper to determine whether a character is a numeral.
 39 |         /// This includes numeral characters that are not classified as such in Unicode,
 40 |         /// such as Chinese numbers.
 41 |         /// This is meant for FactoredSegmenter, which uses this to prevent numeral characters
 42 |         /// from being merged in SentencePiece.
 43 |         public static bool IsNumeral(this char c)
 44 |         {
 45 |             // @BUGBUG: currently known failures:
 46 |             //  - Arabic fractions: ٠٫٢٥
 47 | 
 48 |             // Chinese numeral letters are not classified as digits in Unicode
 49 |             if (ScriptHelpers.ChineseDigits.Contains(c))
 50 |                 return true;
 51 |             else
 52 |                 return Unicode.GetUnicodeMajorDesignation(c) == 'N';
 53 |         }
 54 | 
 55 |         /// <summary>
 56 |         /// Is this character considered a letter inside FactoredSegmenter?
 57 |         /// This also returns true for combiners that are typically used with letters.
 58 |         /// @TODO: decide how to handle wide letters, or all sorts of weird letters such as exponents
 59 |         ///        Are those letters? Are those capitalizable?
 60 |         ///        Then remove this wrapper.
 61 |         /// </summary>
 62 |         public static bool IsLetterOrLetterCombiner(this char c)
 63 |             => char.IsLetter(c) ||
 64 |                (c.IsCombiner() && c.GetCombinerTypicalMajorDesignation() == 'L');
 65 | 
 66 |         ///// <summary>
 67 |         ///// Combine IsLetter() and IsNumeral(), which is used a few times in this combination.
 68 |         ///// </summary>
 69 |         ///// <param name="c"></param>
 70 |         ///// <returns></returns>
 71 |         //public static bool IsLetterOrNumeral(this char c) => c.IsLetter() || c.IsNumeral();
 72 | 
 73 |         /// <summary>
 74 |         /// Tests whether a character is a bicameral letter.
 75 |         /// @TODO: should we consider German ess-zet as bicameral? Lower=upper, but
 76 |         /// as of recently, an upper-case ess-zet exist.
 77 |         /// One can all-caps a word with ess-zet. This is currently special-cased in FactoredSegmenter.
 78 |         /// </summary>
 79 |         public static bool IsBicameral(this char c) => char.ToLowerInvariant(c) != char.ToUpperInvariant(c);
 80 | 
 81 |         /// <summary>
 82 |         /// Replacement for IsLower() that handles Roman numeral X correctly
 83 |         /// We define a lower-case letter as one that is bicameral in the first place, and of the lower kind.
 84 |         /// </summary>
 85 |         public static bool HasAndIsLower(this char c) => c != char.ToUpperInvariant(c);
 86 |         /// <summary>
 87 |         /// Same as HasAndIsLower() except for upper-case.
 88 |         /// </summary>
 89 |         public static bool HasAndIsUpper(this char c) => char.ToLowerInvariant(c) != c;
 90 | 
 91 |         /// <summary>
 92 |         /// String/index version of HasAndIsLower().
 93 |         /// @BUGBUG: Does not handle surrogate pairs.
 94 |         /// </summary>
 95 |         public static bool HasAndIsLowerAt(this string s, int index) => s[index].HasAndIsLower();
 96 | 
 97 |         /// <summary>
 98 |         /// String/index version of HasAndIsUpper().
 99 |         /// @BUGBUG: Does not handle surrogate pairs.
100 |         /// </summary>
101 |         public static bool HasAndIsUpperAt(this string s, int index) => s[index].HasAndIsUpper();
102 | 
103 |         /// <summary>
104 |         /// Test if string is a single Unicode character, with support for surrogate pairs.
105 |         /// Used for detecting unrepresentable Unicode characters.
106 |         /// </summary>
107 |         public static bool IsSingleCharConsideringSurrogatePairs(this string s)
108 |         {
109 |             var length = s.Length;
110 |             return length == 1 ||
111 |                   (length == 2 && char.IsSurrogatePair(s, 0));
112 |         }
113 | 
114 |         /// <summary>
115 |         /// Capitalize the first letter of a string and return the result.
116 |         /// This function attempts to be efficient and not allocate a new string
117 |         /// if the string is unchanged.
118 |         /// </summary>
119 |         public static string Capitalized(this string s)
120 |         {
121 |             if (!string.IsNullOrEmpty(s) && s.First().HasAndIsLower())
122 |             {
123 |                 var a = s.ToArray();
124 |                 a[0] = char.ToUpperInvariant(a[0]);
125 |                 return new string(a);
126 |             }
127 |             else
128 |                 return s;
129 |         }
130 | 
131 |         /// <summary>
132 |         /// Define a "typical" use for combining marks. FactoredSegmenter requires pieces to
133 |         /// be classifyable as being of word nature or not. Combiners depend on context.
134 |         /// This can lead to a contradiction if a combiner gets separated from its preceding
135 |         /// character by SentencePiece (which we allow since in Hindi, some combiners are morphemes).
136 |         /// The problem is that each lemma has a unique factor set. But if the lemma is a
137 |         /// combiner that is used both with a letter or with punctuation in the corpus,
138 |         /// that lemma ends up with two different factor sets, which is forbidden.
139 |         /// As a 95%-5% solution, we uniquely define a single "typical" use for each combiner.
140 |         /// For example, the accent is considered to always imply a letter, although I have
141 |         /// seen it used on top of a space character (to mimic an apostrophe). We consider
142 |         /// these as abnormal uses, which will just lead to an additional forced word break
143 |         /// that can still be learned and resolved by the MT model itself.
144 |         /// </summary>
145 |         public static char GetCombinerTypicalMajorDesignation(this char c)
146 |         {
147 |             // @TODO: Spencer pointed out that the key-cap combiner combines with 0..9, #, and *
148 |             //        It probably should be considered punctuation, to avoid # key-cap A to form a word "key-cap A".
149 |             if (c.IsInRange(0xfe0e, 0xfe0f)) // Variation Selectors 15 and 16 apply to Emojis
150 |                 return 'P'; // punctuation
151 |             else
152 |                 return 'L'; // letter
153 |         }
154 | 
155 |         /// <summary>
156 |         /// Classify a character, using our special rules
157 |         ///  - number letters, e.g. Chinese numerals, are classified as 'N'
158 |         ///  - combiners have a single "typical" designation
159 |         /// </summary>
160 |         public static char GetUnicodeMajorDesignationWithOurSpecialRules(this char c) // helper to get character designation, with our special rules for numerals and combiners
161 |         {
162 |             if (c.IsNumeral())
163 |                 return 'N';
164 |             var d = c.GetUnicodeMajorDesignation();
165 |             if (d == 'M')
166 |                 return c.GetCombinerTypicalMajorDesignation();
167 |             else
168 |                 return d;
169 |         }
170 | 
171 |         /// <summary>
172 |         /// Get the major unicode designation at a character position.
173 |         /// In the special case that that position is a combiner, find the first non-combining
174 |         /// character and use its designation.
175 |         /// </summary>
176 |         //public static char GetUnicodeMajorDesignationBeforeCombinerAt(this string s, int pos)
177 |         //{
178 |         //    var majorDesignation = s[pos].GetUnicodeMajorDesignation();
179 |         //    // if combiner then search for base char (=last non-combining char)
180 |         //    while (majorDesignation == 'M' && pos --> 0)
181 |         //        majorDesignation = s[pos].GetUnicodeMajorDesignation();
182 |         //    return majorDesignation;
183 |         //}
184 |     }
185 | 
186 |     /// <summary>
187 |     /// Character-script (as in writing-system) related helpers for FactoredSegmenter. These helpers are at present
188 |     /// not yet generic or mature enough to warrant being moved into Common or Unicode.cs.
189 |     /// Once they are, they should be moved.
190 |     /// </summary>
191 |     public static class ScriptHelpers
192 |     {
193 |         public static HashSet<char> ChineseDigits = new HashSet<char>{
194 |             // cf https://en.wikipedia.org/wiki/Chinese_numerals
195 |             '〇', '一', '二', '三', '四', '五', '六', '七', '八', '九', // base digits
196 |             '０', '１', '２', '３', '４', '５', '６', '７', '８', '９', // full-width digits. Note: These are designated as digits
197 |             '十', '百', '千', '萬', '万', '億', '亿', '兆', // units
198 |             '零', '壹', '貳', '贰', '叄', '叁', '陸', '陆', '柒', '捌', '玖', '拾', '佰', '仟', // financial
199 |             '幺', '兩', '两', '倆', '仨', '呀', '念', '廿', '卅', '卌', '皕', // regional
200 |             // @TODO: how about fractions? E.g. 分 (fen)
201 |             '○' // Small White Circle" (U+25CB)
202 |             // It is commonly used as zero in Chinese, but technically not a numeral. Unicode desig is Other Symbol "So".
203 |             // @BUGBUG: For now, we treat it as one since all we care is that it does not get merged.
204 |             //          @TODO: Decide whether we can add a different category that also never gets merged.
205 |             // @TODO:
206 |             //  京 = 10^16 
207 |             //  壱 = formal 1
208 |             //  弐 = formal 2
209 |             //  参 = formal 3 (has other uses)
210 |             // @REVIEW: A native speaker of Chinese and Japanese should check whether some characters
211 |             //          above are commonly used in regular words as well, and assess whether we need them here.
212 |         };
213 | 
214 |         /// <summary>
215 |         /// Get script designators for each character in a line.
216 |         /// This function handles surrogate pairs and combining marks.   --@TODO: ...not yet, actually
217 |         /// The function can optionally operate on a substring.
218 |         /// </summary>
219 |         //public static Unicode.Script[] GetScripts(string line, int startIndex = 0, int length = int.MaxValue)
220 |         //{
221 |         //    if (length == int.MaxValue)
222 |         //        length = line.Length;
223 |         //    var scripts = new Unicode.Script[length];
224 |         //    for (var i = 0; i < length; i++)
225 |         //    {
226 |         //        // @TODO: Handle surrogates
227 |         //        char c = line[startIndex + i];
228 |         //        if (c.IsCombiner() && i > 0)
229 |         //            scripts[i] = scripts[i - 1];
230 |         //        else
231 |         //            scripts[i] = Unicode.GetScript(c);
232 |         //    }
233 |         //    return scripts;
234 |         //}
235 | 
236 |         /// <summary>
237 |         /// Simplistic word-boundary detector.
238 |         /// This function attempts to detect word boundaries that can be detected in a language-independent
239 |         /// fashion from the surface form, and without additional knowledge sources.
240 |         /// I.e. it looks for a change in script and some changes in Unicode character designation.
241 |         /// This does not detect word breaks in continuous scripts, which require additional knowledge sources.
242 |         /// This function handles these special cases:
243 |         ///  - some known allowed punctuation between characters, such as ' in words and . in numbers
244 |         ///    @TODO: This rule may not apply to all scripts.
245 |         ///  - surrogate pairs    --@TODO
246 |         ///  - combiners inherit the script of the character to the left
247 |         ///  - combiners are classified as the char type (major designation) they are "typically"
248 |         ///    applied to (not depending on actual char).
249 |         ///    This is needed so that combiners that end up as single SentencePieces are classifyable.
250 |         ///    Any error this causes must be learned by the model.
251 |         ///  - designation changes only are a boundary if a letter or a number is on either side,
252 |         ///    but e.g. not a punctuation symbol next to a space or math symbol
253 |         ///  - (special rule: Hiragana is not split from Kanji. Currently this rule is disabled.)
254 |         /// Each space gives rise to two boundaries (one on each side).
255 |         /// It returns a cut list. An empty string is not cut.
256 |         /// </summary>
257 |         public static IList<int> DetectUnambiguousWordBreaks(string line) // @TODO: Better name for this?
258 |         {
259 |             // First, determines the major Unicode designation and script for each character, but with modifications,
260 |             // for purpose of simple word breaking:
261 |             //  - allowed punctuation marks inside words are flipped to 'L'
262 |             //  - allowed punctuation marks inside numbers are flipped to 'N'
263 |             //  - unambiguous CJK number letters are flipped to 'N'
264 |             //  - combining marks carry over both designation and script from their main character
265 |             var scripts = new Unicode.Script[line.Length];
266 |             var designations = new char[line.Length];
267 |             for (var i = 0; i < line.Length; i++)
268 |             {
269 |                 var c = line[i];
270 |                 // @TODO: handle surrogate pairs
271 |                 var m = Unicode.GetUnicodeMajorDesignation(c);
272 |                 var s = Unicode.GetScript(c);
273 |                 // special case: consider unambiguous CJK number symbols as numerals
274 |                 if (c.IsNumeral())
275 |                     designations[i] = 'N';
276 |                 // special case: combining marks carry over main character's script, and are classified as their most likely use (for consistency)
277 |                 else if (m == 'M')
278 |                 {
279 |                     m = c.GetCombinerTypicalMajorDesignation();
280 |                     if (i > 0)
281 |                         s = scripts[i - 1];
282 |                 }
283 |                 designations[i] = m;
284 |                 scripts[i] = s;
285 |                 // special case: allowed punctuation inside a word  --@TODO: Likely script dependent, maybe language dependent
286 |                 if (i - 2 >= 0 && designations[i] == 'L' && designations[i - 2] == 'L' && IsValidPuncInsideWord(line[i - 1]))
287 |                     designations[i - 1] = 'L';
288 |                 // special case: allowed punctuation inside a number  --@TODO: Likely script dependent, maybe language-locale dependent
289 |                 else if (i - 2 >= 0 && designations[i] == 'N' && designations[i - 2] == 'N' && IsValidPuncInsideNumber(line[i - 1]))
290 |                     designations[i - 1] = 'N';
291 |                 // @TODO: double-check handling of space characters: non-breaking space; optional hyphen
292 |             }
293 | 
294 |             // This function operates on a string, so we can handle the case of Unicode.Script 1 - Common - Unicode.Script 2
295 |             // This presently breaks this as (Unicode.Script 1 - Common, Unicode.Script 2).
296 |             // Without further knowledge, we can only make an arbitrary hard choice here.
297 |             // This is used by FactoredSegmenter, where that is OK because characters in Common are
298 |             // typically broken off anyways.
299 | 
300 |             if (line.Length == 0) // graceful exit in case of empty input
301 |                 return new List<int>{ 0, 0 }; // empty input is not cut
302 | 
303 |             var cutList = new List<int>(200) { 0 }; // (0=line start, which the resulting cut list must include)
304 |             var lastNonCommonScript = scripts[0];
305 |             //if (lastNonCommonScript == Unicode.Script.Hiragana)
306 |             //    lastNonCommonScript = Unicode.Script.Han; // no boundary between Kanji and Hiragana
307 |             for (var pos = 1; pos < line.Length; pos++)
308 |             {
309 |                 // detect change in character designation
310 |                 //  - break at number boundaries
311 |                 //     - add number factor
312 |                 //     - can numbers be part of words that need to be kept together for determining word-level factors?
313 |                 //  - break at word boundaries
314 |                 //     - letter/non-letter transitions
315 |                 //     - don't break apostrophes and hyphens with letters on both sides
316 |                 //     - break at script boundaries
317 |                 bool atDesignationChange = (designations[pos - 1] != designations[pos] &&
318 |                                             (designations[pos - 1] == 'N' || designations[pos] == 'N' ||
319 |                                              designations[pos - 1] == 'L' || designations[pos] == 'L'));
320 | 
321 |                 // detect script change
322 |                 var thisScript = scripts[pos];
323 |                 //if (thisScript == Unicode.Script.Hiragana) // the jury is still out whether we should do this or not
324 |                 //    thisScript = Unicode.Script.Han;
325 |                 bool atScriptChange = lastNonCommonScript != thisScript && thisScript != Unicode.Script.Common;
326 |                 // Note: If there is a script change across Common, we choose one arbitrarily.
327 |                 if (thisScript != Unicode.Script.Common || atDesignationChange) // condition 'atDesignationChange' is for back compat only; maybe not needed
328 |                     lastNonCommonScript = thisScript;
329 | 
330 |                 // add cut point if one was found
331 |                 if (atDesignationChange || atScriptChange)
332 |                     cutList.Add(pos);
333 |             }
334 |             cutList.Add(line.Length);
335 |             return cutList;
336 |         }
337 |         // @TODO: These next two functions should likely be script-dependent (and possibly language-dependent).
338 |         static bool IsValidPuncInsideWord(char c) => (c == '\'' || c == '-' || c == '\u00AD'/*soft hyphen*/); // true if words may contain this punctuation symbol inside, e.g. "It's", "well-behaved"
339 |         static bool IsValidPuncInsideNumber(char c) => (c == '.' || c == ',' || c == '\u2009'/*thin space*/); // true if numbers may contain this punctuation symbol inside, e.g. "1,234.56"
340 |     }
341 | }
342 | 


--------------------------------------------------------------------------------
/src/ProcessTools.cs:
--------------------------------------------------------------------------------
  1 | ﻿// Copyright (c) Microsoft Corporation.
  2 | // Licensed under the MIT license.
  3 | 
  4 | using Common.Contracts;
  5 | using Common.Utils;
  6 | using System;
  7 | using System.Collections.Generic;
  8 | using System.Diagnostics;
  9 | using System.IO;
 10 | using System.Linq;
 11 | using System.Text;
 12 | 
 13 | namespace Microsoft.MT.Common.Tokenization
 14 | {
 15 |     public static class ProcessTools
 16 |     {
 17 |         public static int RunCommand(
 18 |            string exe,
 19 |            string args,
 20 |            string stdoutPath, // must be null in this version
 21 |            string stderrPath, // may be null
 22 |            bool throwOnFailure = true,
 23 |            IEnumerable<KeyValuePair<string, string>> envirVariables = null)
 24 |         {
 25 |             Sanity.Requires(stdoutPath == null, "This reduced version of RunCommand() does not support stdout redirection");
 26 |             Logger.WriteLine($"executing command: {exe} {args}");
 27 |             using (TextWriter stderrWriter = stderrPath == null ? null : new StreamWriter(stderrPath, append: false, encoding: new UTF8Encoding(encoderShouldEmitUTF8Identifier: false)) { AutoFlush = true })
 28 |             using (var process = CreateProcess(exe, args, envirVariables, isPipe: false, stderr: stderrWriter))
 29 |             {
 30 |                 process.WaitForExit();
 31 |                 if (throwOnFailure && process.ExitCode != 0)
 32 |                     throw new IOException($"Exit code {process.ExitCode} was returned by external process: {exe} {args}");
 33 |                 else
 34 |                     return process.ExitCode;
 35 |             }
 36 |         }
 37 | 
 38 | 
 39 |         static char[] k_ArgToCommandLineInvalidChars = Enumerable.Concat(from c in Enumerable.Range(0, (int)' ') select (char)c, new char[] { '"', '^' }).ToArray();
 40 |         /// <summary>
 41 |         /// escape an argument to a command line as needed in order to be parsed by CommandLineToArgv(), C++ CRT, or C#.
 42 |         /// Some characters that are tricky to handle consistently. For now, we simply forbid them.
 43 |         /// These include all control characters (0x00..0x1f), " (quotation marks inside string), and ^ (CMD shell escape).
 44 |         /// To handle " and ^ correctly, we may need additional context on whether this is run via CMD, and there is
 45 |         /// supposedly also a difference between CommandLineToArgV() and the C++ CRT (C# unknown) regarding sequences of double quotes.
 46 |         /// </summary>
 47 |         /// <param name="arg">Argument as the final string that the tool should receive, without escaping.</param>
 48 |         /// <returns>Escaped version of argument, or unmodified argument if no escaping is needed.</returns>
 49 |         static string ArgToCommandLine(string arg)
 50 |         {
 51 |             if (-1 != arg.IndexOfAny(k_ArgToCommandLineInvalidChars))
 52 |                 throw new NotImplementedException($"ArgToCommandLine: presently cannot handle certain special characters (e.g. \" and ^) in: {arg}");
 53 |             if (!arg.Any() || arg.Contains(' '))  // space is the delimiter, so we must surround the arg by quotes
 54 |                 return $"\"{arg}\"";
 55 |             else                    // otherwise, no need to escape (it would be OK to escape, but not escaping is better for log readability
 56 |                 return arg;
 57 |         }
 58 |         /// <summary>
 59 |         /// convert an array of string arguments to a command line as needed in order to be parsed by CommandLineToArgv(), C++ CRT, or C#.
 60 |         /// </summary>
 61 |         public static string ArgsToCommandLine(IEnumerable<string> args)
 62 |             => string.Join(" ", from arg in args select ArgToCommandLine(arg));
 63 | 
 64 |         private static Process CreateProcess(string exe, string args,
 65 |                                              IEnumerable<KeyValuePair<string, string>> envirVariables, bool isPipe,
 66 |                                              TextWriter stderr)
 67 |         {
 68 |             var psi = new ProcessStartInfo(exe, args)
 69 |             {
 70 |                 UseShellExecute = false,
 71 |                 CreateNoWindow = true,
 72 |                 ErrorDialog = false,
 73 |             };
 74 |             if (isPipe)
 75 |             {
 76 |                 psi.RedirectStandardInput = true;
 77 |                 psi.RedirectStandardOutput = true;
 78 |                 psi.StandardOutputEncoding = Encoding.UTF8;
 79 |             }
 80 |             if (stderr != null)
 81 |             {
 82 |                 psi.RedirectStandardError = true;
 83 |                 psi.StandardErrorEncoding = Encoding.UTF8; // @REVIEW: needed?
 84 |             }
 85 |             if (envirVariables != null)
 86 |                 foreach (KeyValuePair<string, string> pair in envirVariables)
 87 |                     psi.EnvironmentVariables[pair.Key] = pair.Value;
 88 | 
 89 |             var process = new Process();
 90 |             process.StartInfo = psi;
 91 |             if (stderr != null)
 92 |                 process.ErrorDataReceived += (sender, e) => { stderr.WriteLine(e.Data); };
 93 |             process.Start();
 94 |             if (stderr != null)
 95 |                 process.BeginErrorReadLine();
 96 |             return process;
 97 |         }
 98 |     }
 99 | }
100 | 


--------------------------------------------------------------------------------
/src/README.txt:
--------------------------------------------------------------------------------
1 | This directory contains all sources that implement the actual FactoredSegmenter functionality.
2 | 
3 | The content of this directory is shared between the standalone build in this repo and our production build.
4 | 


--------------------------------------------------------------------------------
/src/SegmenterRuntime.cs:
--------------------------------------------------------------------------------
  1 | ﻿// Copyright (c) Microsoft Corporation.
  2 | // Licensed under the MIT license.
  3 | 
  4 | using Common.Collections.Extensions;
  5 | using Common.MT.Segments;
  6 | using Common.Text;
  7 | using Microsoft.MT.TextSegmentation.SpanFinder;
  8 | using System;
  9 | using System.Collections.Generic;
 10 | using System.IO;
 11 | using System.Linq;
 12 | 
 13 | namespace Microsoft.MT.Common.Tokenization.Segmenter
 14 | {
 15 |     public class SegmenterCoderConfig
 16 |     {
 17 |         public SegmenterKind SegmenterKind { get; set; }
 18 |         public string ModelPath { get; set; }
 19 | 
 20 |         // The Equals() function is for the parallel coder so that it can  determine whether
 21 |         // source and target configs are the same. If they are, the parallel coder will only
 22 |         // instantiate one segmenter and use it for both source and target.
 23 |         public override bool Equals(object obj)
 24 |         {
 25 |             return
 26 |                 obj is SegmenterCoderConfig other &&
 27 |                 SegmenterKind == other.SegmenterKind && ModelPath == other.ModelPath;
 28 |         }
 29 |         public override int GetHashCode() { return ModelPath.GetHashCode(); }
 30 |     }
 31 | 
 32 |     /// <summary>
 33 |     /// A reference to a segment of raw source text, as used in DecodedSegment.SourceLink
 34 |     /// </summary>
 35 |     public class EncodedSegmentReference
 36 |     {
 37 |         public string RawSourceText; // full raw source string   --@TODO: make private if not actually needed public
 38 |         public int StartIndex;       // character coordinates of source token in the raw source string
 39 |         public int Length;
 40 |         public bool IsWordTokenStart, IsWordTokenEnd;
 41 |         public bool IsSpacingWordStart, IsSpacingWordEnd;
 42 |         public string SurfaceForm => RawSourceText.Substring(StartIndex, Length);
 43 | 
 44 |         // is this eligible for alignment links from target tokens? Alignments from Marian to segments for which this is false will be discarded
 45 |         // in Decode. At present, this will be false for sentence annotation tokens, and true for others. 
 46 |         public bool CanBeAlignedTo;
 47 | 
 48 |         public override bool Equals(object obj)
 49 |         {
 50 |             return
 51 |                 obj is EncodedSegmentReference other &&
 52 |                 RawSourceText == other.RawSourceText && StartIndex == other.StartIndex && Length == other.Length &&
 53 |                 IsWordTokenStart == other.IsWordTokenStart && IsWordTokenEnd == other.IsWordTokenEnd &&
 54 |                 IsSpacingWordStart == other.IsSpacingWordStart && IsSpacingWordEnd == other.IsSpacingWordEnd &&
 55 |                 CanBeAlignedTo == other.CanBeAlignedTo;
 56 |         }
 57 |         public override int GetHashCode() { return RawSourceText.GetHashCode(); }
 58 |         // for debugging
 59 |         public override string ToString() => SurfaceForm;
 60 |     }
 61 | 
 62 |     /// <summary>
 63 |     /// The decoder outputs one of these for each Marian token, and additional ones for reconstructed spaces.
 64 |     /// The encoder uses this to return the segmentation of the source string.
 65 |     /// </summary>
 66 |     public struct DecodedSegment : IEquatable<DecodedSegment>
 67 |     {
 68 |         public readonly string SurfaceForm; // final plain-text form
 69 |         public readonly bool IsWordTokenStart, IsWordTokenEnd;
 70 |         public readonly bool IsSpacingWordStart, IsSpacingWordEnd;
 71 | 
 72 |         public struct SourceLink : IEquatable<SourceLink>// for representing alignment information
 73 |         {
 74 |             public EncodedSegmentReference SourceSegment; // contains the character alignment
 75 |             public float Confidence; // @TODO: unit? prob or log prob?
 76 |             public bool Equals(SourceLink other)
 77 |             {
 78 |                 return
 79 |                     ((SourceSegment == null) == (other.SourceSegment == null) ||
 80 |                      (SourceSegment != null) && SourceSegment.Equals(other.SourceSegment)) &&
 81 |                     Confidence == other.Confidence;
 82 |             }
 83 |         }
 84 |         public readonly List<SourceLink> SourceAlignment; // character range(s) (and confidence) of original source string(s)
 85 | 
 86 |         /// <summary>
 87 |         /// True if this segment's surface string was set using the DecodeAs mechanism (e.g. for phrasefix, urls, etc)
 88 |         /// </summary>
 89 |         public bool IsForceDecode { get; set; }
 90 |         public DecodedSegment(string surfaceForm, bool isWordTokenStart, bool isWordTokenEnd, List<SourceLink> sourceLinks, bool isForceDecode, bool isSpacingWordStart, bool isSpacingWordEnd)
 91 |         {
 92 |             SurfaceForm = surfaceForm;
 93 |             IsWordTokenStart = isWordTokenStart;
 94 |             IsWordTokenEnd = isWordTokenEnd;
 95 |             SourceAlignment = sourceLinks;
 96 |             IsForceDecode = isForceDecode;
 97 |             IsSpacingWordStart = isSpacingWordStart;
 98 |             IsSpacingWordEnd = isSpacingWordEnd;
 99 |         }
100 |         public bool Equals(DecodedSegment other)
101 |         {
102 |             return SurfaceForm == other.SurfaceForm && IsWordTokenStart == other.IsWordTokenStart && IsWordTokenEnd == other.IsWordTokenEnd &&
103 |                    SourceAlignment.NullableSequenceEquals(other.SourceAlignment) && IsForceDecode == other.IsForceDecode &&
104 |                    IsSpacingWordStart == other.IsSpacingWordStart && IsSpacingWordEnd == other.IsSpacingWordEnd;
105 |         }
106 | 
107 |         // for debugging
108 |         public override string ToString() => SurfaceForm;
109 | 
110 |         /// <summary>
111 |         /// Clone this object, but with a replaced surface form. This is inteded to be used for making modifications
112 |         /// to surface forms during postprocessing without disrupting alignment or word boundary flags. Examples are
113 |         /// ensuring that all question marks in Chinese are full width ('？' rather than '?') or that the newer and
114 |         /// more correct form of certain T/S diacritics for Romanian ('Ș' rather than 'Ş').
115 |         /// 
116 |         /// Using this has the potential to create a situation where some of the assumptions associated with word boundary
117 |         /// flags are violated. For example, IsSpacingWordStart/End can be true between two continuous script segments (e.g.
118 |         /// Japanese or Chinese), but would not be true between two spacing script segments in general (e.g. Latin or Cyrillic). 
119 |         /// If we replace surface forms for two consecutive Japanese segments with Latin strings, we would then have a
120 |         /// two consecutive Latin segments with IsSpacingWord* set to true, allowing tags and character alignment boundaries
121 |         /// to be placed at that boundary.
122 |         /// 
123 |         /// For these reasons, it is much safer to use this to change surface forms within like segment classes (e.g.
124 |         /// punctuation, characters within a script, etc).
125 |         /// 
126 |         /// For example, if we wanted to clone a DecodedSegment, questionMarkSeg, whose surface form was "?", but
127 |         /// wanted to the clone to have a full width question mark, we could use the following code:
128 |         /// var fullWidthSeg = questionMarkSeg.WithSurfaceForm("？");
129 |         /// </summary>
130 |         /// <param name="newSurfaceForm">A new surface form that will given to the clone</param>
131 |         /// <returns>A clone of this object, but with surface form replaced by specified argument</returns>
132 |         public DecodedSegment WithSurfaceForm(string newSurfaceForm)
133 |         {
134 |             return new DecodedSegment(
135 |                 surfaceForm: newSurfaceForm,
136 |                 isWordTokenStart: IsWordTokenStart,
137 |                 isWordTokenEnd: IsWordTokenEnd,
138 |                 sourceLinks: SourceAlignment,
139 |                 isForceDecode: IsForceDecode,
140 |                 isSpacingWordStart: IsSpacingWordStart,
141 |                 isSpacingWordEnd: IsSpacingWordEnd);
142 |         }
143 |     }
144 | 
145 |     /// <summary>
146 |     /// This is an opaque object returned by the Segmenter.Encode that gives instructions for how to replace word classed tokens 
147 |     /// (e.g. phrasefix) at Segmenter.Decode time.
148 |     /// </summary>
149 |     public interface IDecoderPackage { } // @TODO: find a better name
150 | 
151 |     /// <summary>
152 |     /// The result of Encode(), which consists of
153 |     ///  - tokens in their serialized string form, for use by Marian NMT training
154 |     ///  - segmentation information, for use in alignment
155 |     /// </summary>
156 |     public abstract class IEncoded
157 |     {
158 |         /// <summary>
159 |         /// The original source line of raw plain text that was to be encoded.
160 |         /// </summary>
161 |         public abstract string OriginalSourceText { get; }
162 |         /// <summary>
163 |         /// Source line segments that correspond to the encoded tokens.
164 |         /// Each token carries additional word-boundary information.
165 |         /// Tokens are in left-to-right order, but possibly with repeats, and may have gaps.
166 |         /// This array allows to find the set of segmentation boundaries, for example for
167 |         /// training an alignment model or tag manipulations.
168 |         /// Do NOT, however, use this to reconstruct the original source line, because:
169 |         ///  - spaces are not included (since they get elided in encoding)
170 |         ///  - any replaced ranges (phrase fixes, EncodeAsIf) only have their outer boundaries
171 |         ///  - if a replaced range gets SentencePiece'd, then we get multiple tokens that each
172 |         ///    span the full original replaced region
173 |         /// Examples:
174 |         ///  - "abc defg hi" with defg inline-phrase-fixed to XYZ, with SPM-splits de fg and XY Z.
175 |         ///    Tokens will be something like "abc (( de fg || XY Z )) hi" (factors not shown).
176 |         ///    Resulting source text segments will be "abc '' '' '' '' defg defg '' hi".
177 |         ///    (@TODO: A future version may retain de and fg as well.)
178 |         /// </summary>
179 |         public abstract EncodedSegmentReference[] OriginalSourceTextSegments { get; }
180 |         /// <summary>
181 |         /// The encoding result expressed as a sequence of ProcessToken items.
182 |         /// </summary>
183 |         public abstract List<ProcessedToken> ProcessedTokens { get; }
184 |         /// <summary>
185 |         /// The encoding result expressed as a sequence of tokens in their serialized (encoded) form.
186 |         /// </summary>
187 |         public abstract IEnumerable<string> TokenStrings { get; }
188 |         /// <summary>
189 |         /// The encoding result expressed as a sequence of tokens in their serialized (encoded) form.
190 |         /// This is different from TokenStrings() since e.g. for FactoredSegmenter, the aligner
191 |         /// should not receive factors.
192 |         /// </summary>
193 |         public abstract IEnumerable<string> TokenStringsForAligner { get; }
194 |         /// <summary>
195 |         /// Number of tokens. All properties above except OriginalSourceTextSegments return this many items.
196 |         /// </summary>
197 |         public abstract int Count { get; }
198 |         /// <summary>
199 |         /// The original source sentence annotations that was passed to Encode().
200 |         /// </summary>
201 |         public abstract Dictionary<string, string> OriginalSourceSentenceAnnotations { get; }
202 |         /// <summary>
203 |         /// The result expressed as a single text line; meant for debugger visualization only.
204 |         /// </summary>
205 |         public override string ToString() => " ".JoinItems(TokenStrings);
206 |         /// <summary>
207 |         /// This property holds an opaque package of information that should be passed on to
208 |         /// the Decode() function.
209 |         /// </summary>
210 |         public abstract IDecoderPackage DecoderPackage { get; }
211 |     }
212 | 
213 |     /// <summary>
214 |     /// Result of the Decode() function. Predominantly an array of SegmenterTokens,
215 |     /// which carry surface form, boundary flags, and alignment info.
216 |     /// </summary>
217 |     public abstract class IDecoded
218 |     {
219 |         /// <summary>
220 |         /// The decoded line as consecutive sub-strings that represent the original tokenization from translation,
221 |         /// but with spaces inserted. The decoded line can be formed by straight concatenation of the tokens' SurfaceForm fields.
222 |         /// </summary>
223 |         public abstract DecodedSegment[] Tokens { get; }
224 |         /// <summary>
225 |         /// The final decoded line as raw plain text. Same as concatenating all SegmenterToken[].SurfaceForm
226 |         /// </summary>
227 |         public override string ToString() => "".JoinItems(from token in Tokens select token.SurfaceForm);
228 |     }
229 | 
230 |     /// <summary>
231 |     /// Base class that is used to invoke segmenters (SentencePiece or FactoredSegmenter).
232 |     /// A segmenter is an object that can encode / decode a single language's strings (We need the parallel segmenter for runtime and training)
233 |     /// at runtime.
234 |     /// Such a segmenter does (ideally) not know about the translation process (the parallel segmenter does).
235 |     /// </summary>
236 |     public abstract class SegmenterCoderBase
237 |     {
238 |         public abstract IEncoded Encode(string line,
239 |                                         List<AnnotatedSpan> annotatedSpans = null, Dictionary<string, string> sourceSentenceAnnotations = null,
240 |                                         int? seed = null);
241 | 
242 |         /// <summary>
243 |         /// Decode a line of tokens in Marian-internal string format from in-memory data structures.
244 |         /// Spaces are individual tokens in the output.
245 |         /// </summary>
246 |         public abstract IDecoded Decode(IEnumerable<string> encodedTokensFromMT,
247 |                                         Alignment alignmentFromMT,
248 |                                         IDecoderPackage decoderPackage);
249 | 
250 |         /// <summary>
251 |         /// Decode a line of tokens in serialized Marian-NMT form, e.g. the result of Marian
252 |         /// translation as written to a file. This overload does not support alignments.
253 |         /// </summary>
254 |         public IDecoded Decode(string line, IDecoderPackage decoderPackage = null)
255 |         {
256 |             return Decode(line.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries).ToList(), null, decoderPackage);
257 |         }
258 | 
259 |         /// <summary>
260 |         /// Create a SegmenterCoder from a config. The kind of segmenter is determined
261 |         /// from the config's actual type.
262 |         /// </summary>
263 |         public static SegmenterCoderBase CreateForKindOf(SegmenterCoderConfig config)
264 |         {
265 |             if (config == null)
266 |                 return null;
267 |             switch (config.SegmenterKind)
268 |             {
269 |                 case SegmenterKind.FactoredSegmenter:
270 |                     // @TODO: what do we put into the SegmenterCoderConfig config? Maybe a FactoredSegmenterCoderConfig?
271 |                     return new FactoredSegmenterCoder(new FactoredSegmenterCoderConfig { ModelPath = config.ModelPath });
272 |                 case SegmenterKind.SentencePiece:
273 |                 default:
274 |                     throw new NotImplementedException();
275 |             }
276 |         }
277 | 
278 |         // special functions for shortlist generation, for use by PureNeuralTools/lex_trans_to_shortlist
279 | 
280 |         /// <summary>
281 |         /// Retrieve the shortlist vocabulary. This is for use by PureNeuralTools/lex_trans_to_shortlist.
282 |         /// </summary>
283 |         public abstract string[] ShortlistVocab { get; }
284 | 
285 |         /// <summary>
286 |         /// Transcode a token (in segmenter-encoded form) into the shortlist token (in segmenter-encoded form).
287 |         /// </summary>
288 |         public abstract string TranscodeTokenToShortlist(string token);
289 | 
290 |         /// <summary>
291 |         /// Why we need this flag? We may want to log strings, do additional checks, fail during training code path. However, at runtime -- when running in our cluster
292 |         /// we have strict requirements. This flag is used to indicate the same.  if this is set to false (default = true),we cannot log any user strings at runtime.
293 |         /// </summary>
294 |         public bool IsTrainingScenario { get; set; }
295 | 
296 |         /// <summary>
297 |         /// Find character spans that are out of vocabulary for the model and cannot be encoded.
298 |         /// </summary>
299 |         public abstract IEnumerable<(int StartIndex, int Length)> FindUnrepresentableSpans(string line);
300 |     }
301 | 
302 |     // Unimplemented version, if we wanted to use raw sentence piece instead of Factored segmenter. We'd need to figure out how to handle tags and other spans.
303 |     public class SentencePieceSegmenterCoder : SegmenterCoderBase
304 |     {
305 |         SentencePieceCoder coder;
306 | 
307 |         public SentencePieceSegmenterCoder(string modelPath)
308 |         {
309 |             coder = new SentencePieceCoder(new SentencePieceCoderConfig { SentencePieceModel = SentencePieceModel.Load(modelPath) });
310 |         }
311 | 
312 |         public override IEncoded Encode(string line, List<AnnotatedSpan> annotatedSpans = null, Dictionary<string, string> sourceSentenceAnnotations = null, int? seed = null)
313 |         {
314 |             throw new NotImplementedException();
315 |         }
316 | 
317 |         public override IDecoded Decode(
318 |             IEnumerable<string> encodedTokensFromMT,
319 |             Alignment alignmentFromMT,
320 |             IDecoderPackage decoderPackage)
321 |         {
322 |             throw new NotImplementedException();
323 |         }
324 | 
325 |         public override string[] ShortlistVocab { get { throw new NotImplementedException(); } }
326 | 
327 |         public override string TranscodeTokenToShortlist(string token)
328 |         {
329 |             throw new NotImplementedException();
330 |         }
331 | 
332 |         public override IEnumerable<(int StartIndex, int Length)> FindUnrepresentableSpans(string line)
333 |         {
334 |             throw new NotImplementedException();
335 |         }
336 |     }
337 | }
338 | 


--------------------------------------------------------------------------------
/src/SentencePieceConfigs.cs:
--------------------------------------------------------------------------------
  1 | ﻿// Copyright (c) Microsoft Corporation.
  2 | // Licensed under the MIT license.
  3 | 
  4 | using System;
  5 | using System.Collections.Generic;
  6 | using System.Linq;
  7 | using System.Text;
  8 | using System.Threading.Tasks;
  9 | using System.Xml.Serialization;
 10 | 
 11 | namespace Microsoft.MT.Common.Tokenization
 12 | {
 13 |     // types for SentencePiece
 14 |     public enum SentencePieceModelType
 15 |     {
 16 |         [XmlEnum(Name = "unigram")]
 17 |         Unigram = 0,
 18 |         [XmlEnum(Name = "bpe")]
 19 |         Bpe,
 20 |         [XmlEnum(Name = "word")]
 21 |         Word,
 22 |         [XmlEnum(Name = "char")]
 23 |         Char
 24 |     }
 25 | 
 26 |     public enum SentencePieceNormalizationRuleName
 27 |     {
 28 |         [XmlEnum(Name = "nmt_nfkc")]
 29 |         Nfkc = 0,
 30 |         [XmlEnum(Name = "identity")]
 31 |         Identity
 32 |     }
 33 | 
 34 |     // Note: The following cannot be specified by Flo users, as these are under Flo's control.
 35 |     public enum SentencePieceInputFormat
 36 |     {
 37 |         [XmlEnum(Name = "text")]
 38 |         Text = 0,
 39 |         [XmlEnum(Name = "tsv")]
 40 |         Tsv
 41 |     }
 42 | 
 43 |     public enum SentencePieceEncodeFormat
 44 |     {
 45 |         [XmlEnum(Name = "piece")]
 46 |         Piece = 0,
 47 |         [XmlEnum(Name = "id")]
 48 |         Id,
 49 |         [XmlEnum(Name = "proto")]
 50 |         Proto,
 51 |         [XmlEnum(Name = "nbest_piece")]
 52 |         NBest_Piece,
 53 |         [XmlEnum(Name = "nbest_id")]
 54 |         NBest_Id,
 55 |         [XmlEnum(Name = "nbest_proto")]
 56 |         NBest_Proto
 57 |     }
 58 | 
 59 |     public enum SentencePieceDecodeInputFormat
 60 |     {
 61 |         [XmlEnum(Name = "piece")]
 62 |         Piece = 0,
 63 |         [XmlEnum(Name = "id")]
 64 |         Id
 65 |     }
 66 | 
 67 |     public enum SentencePieceDecodeOutputFormat
 68 |     {
 69 |         [XmlEnum(Name = "string")]
 70 |         String = 0,
 71 |         [XmlEnum(Name = "proto")]
 72 |         Proto
 73 |     }
 74 | 
 75 |     /// <summary>
 76 |     /// Class to hold all parameters for the SentencePiece training tool.
 77 |     /// </summary>
 78 |     public class SentencePieceTrainConfig : SegmenterTrainConfigBase, ISentencePieceConfig
 79 |     {
 80 |         /// <summary>
 81 |         /// comma-separated list of languages this model can accept
 82 |         /// </summary>
 83 |         public string AcceptLanguage { get; set; }
 84 |         /// <summary>
 85 |         /// Add dummy whitespace at the beginning of text ( default: true )
 86 |         /// </summary>
 87 |         public bool? AddDummyPrefix { get; set; }
 88 |         /// <summary>
 89 |         /// Override BOS (&lt;s&gt;) id. Set -1 to disable BOS ( default: -1 )
 90 |         /// @BUGBUG: BosId, eosId and UnkId should not be user-specifyable, as they are controlled by Flo
 91 |         /// </summary>
 92 |         public Int32 BosId { get; set; } = -1;
 93 |         /// <summary>
 94 |         /// Character coverage to determine the minimum symbols ( default: 0.9995 )
 95 |         /// </summary>
 96 |         public double? CharacterCoverage { get; set; }
 97 |         /// <summary>
 98 |         /// Comma separated list of control symbols
 99 |         /// </summary>
100 |         public string ControlSymbols { get; set; }
101 |         /// <summary>
102 |         /// Override EOS ((&lt;/s&gt;)) id. Set -1 to disable EOS. ( default: 0 )
103 |         /// @BUGBUG: BosId, eosId and UnkId should not be user-specifyable, as they are controlled by Flo
104 |         /// </summary>
105 |         public Int32 EosId { get; set; } = 0;
106 |         /// <summary>
107 |         /// If set to false, --vocab_size is considered as a soft limit. ( default: true )
108 |         /// </summary>
109 |         public bool? HardVocabLimit { get; set; }
110 |         /// <summary>
111 |         /// Comma separated list of input sentences )  type: string 
112 |         /// </summary>
113 |         public string input { get; set; }
114 |         /// <summary>
115 |         /// Input format. Supported format is 'text' or 'tsv'. ( default: 'text' )
116 |         /// </summary>
117 |         public SentencePieceInputFormat? InputFormat { get; set; }
118 |         /// <summary>
119 |         /// Maximum size of sentences the trainer loads ( default: 10000000 )
120 |         /// </summary>
121 |         public Int32? InputSentenceSize { get; set; }
122 |         /// <summary>
123 |         /// Maximum length of sentence in bytes ( default: 2048)
124 |         /// </summary>
125 |         public Int32? MaxSentenceLength { get; set; }
126 |         /// <summary>
127 |         /// Maximum length of sentence piece ( default: 16 )
128 |         /// </summary>
129 |         public Int32? MaxSentencepieceLength { get; set; }
130 |         /// <summary>
131 |         /// Maximum size of sentences to make seed sentence piece ( default: 2000000 )
132 |         /// </summary>
133 |         public Int32? MiningSentenceSize { get; set; }
134 |         /// <summary>
135 |         /// Output model prefix
136 |         /// </summary>
137 |         public string ModelPrefix { get; set; }
138 |         /// <summary>
139 |         /// Model algorithm: unigram, bpe, word or char ( default: unigram )
140 |         /// </summary>
141 |         public SentencePieceModelType? ModelType { get; set; }
142 |         /// <summary>
143 |         /// Normalization rule name. Choose from nfkc or identity ( default: nmt_nfkc )
144 |         /// </summary>
145 |         public SentencePieceNormalizationRuleName? NormalizationRuleName { get; set; }
146 |         /// <summary>
147 |         /// Normalization rule TSV file. 
148 |         /// </summary>
149 |         public string NormalizationRuleTsv { get; set; }
150 |         /// <summary>
151 |         /// Number of EM sub-iterations ( default: 2 )
152 |         /// </summary>
153 |         public Int32? NumSubIterations { get; set; }
154 |         /// <summary>
155 |         /// Number of threads for training ( default: 16 )
156 |         /// </summary>
157 |         public Int32? NumThreads { get; set; }
158 |         /// <summary>
159 |         /// Override PAD (&lt;pad&gt;) id. Set -1 to disable PAD. ( default: -1 )
160 |         /// </summary>
161 |         public Int32? PadId { get; set; }
162 |         /// <summary>
163 |         /// Removes leading, trailing, and duplicate internal whitespace ( default: true )
164 |         /// </summary>
165 |         public bool? RemoveExtraWhitespaces { get; set; }
166 |         /// <summary>
167 |         /// The size of seed sentencepieces ( default: 1000000 )
168 |         /// </summary>
169 |         public Int32? SeedSentencepieceSize { get; set; }
170 |         /// <summary>
171 |         /// The size of self test samples ( default: 0 )
172 |         /// </summary>
173 |         public Int32? SelfTestSampleSize { get; set; }
174 |         /// <summary>
175 |         /// Keeps top shrinking_factor pieces with respect to the loss ( default: 0.75 )
176 |         /// </summary>
177 |         public double? ShrinkingFactor { get; set; }
178 |         /// <summary>
179 |         /// Use Unicode script to split sentence pieces ( default: true )
180 |         /// </summary>
181 |         public bool? SplitByUnicodeScript { get; set; }
182 |         /// <summary>
183 |         /// Use a white space to split sentence pieces ( default: true )
184 |         /// </summary>
185 |         public bool? SplitByWhitespace { get; set; }
186 |         /// <summary>
187 |         /// Maximum size of sentences to train sentence pieces ( default: 10000000 )
188 |         /// </summary>
189 |         public override Int32? TrainingSentenceSize { get; set; }
190 |         /// <summary>
191 |         /// Override UNK (&lt;unk&gt;) id. ( default: 1 )
192 |         /// </summary>
193 |         public Int32 UnkId { get; set; } = 1;
194 |         /// <summary>
195 |         /// Dummy surface string for &lt;unk&gt;. In decoding &lt;unk&gt; is decoded to `unk_surface`.
196 |         /// @BUGBUG: BosId, eosId and UnkId should not be user-specifyable, as they are controlled by Flo
197 |         /// </summary>
198 |         public string UnkSurface { get; set; }
199 |         /// <summary>
200 |         /// If set to true, use all tokens as vocab.Valid for word/char models. ( default: false )
201 |         /// </summary>
202 |         public bool? UseAllVocab { get; set; }
203 |         /// <summary>
204 |         /// Comma separated list of user defined symbols
205 |         /// </summary>
206 |         public string UserDefinedSymbols { get; set; }
207 |         /// <summary>
208 |         /// Vocabulary size ( default: 32000 )
209 |         /// </summary>
210 |         public int? VocabSize { get; set; } = 32000;
211 |     }
212 | 
213 |     /// <summary>
214 |     /// Class to hold all parameters for the SentencePiece encoding tool.
215 |     /// </summary>
216 |     public class SentencePieceEncodeConfig : SegmenterEncodeConfigBase, ISentencePieceConfig
217 |     {
218 |         /// <summary>
219 |         /// Smoothing parameter for sampling mode ( default: 0.5 )
220 |         /// </summary>
221 |         public double? Alpha { get; set; }
222 |         /// <summary>
223 |         /// ':' separated encoder extra options, e.g., "reverse:bos:eos"
224 |         /// </summary>
225 |         public string ExtraOptions { get; set; }
226 |         /// <summary>
227 |         /// Generates vocabulary file instead of segmentation ( default: false )
228 |         /// Internal use only; cannot be specified by Flo user.
229 |         /// </summary>
230 |         public bool? GenerateVocabulary { get; set; }
231 |         /// <summary>
232 |         /// NBest size ( default: 10 ). Only used if OutputFormat is nbest_XXX.
233 |         /// </summary>
234 |         public Int32? NBest_Size { get; set; }
235 |         /// <summary>
236 |         /// choose from piece, id, proto, nbest_piece, nbest_id, or nbest_proto ( default: piece) 
237 |         /// Internal use only; cannot be specified by Flo user.
238 |         /// </summary>
239 |         public SentencePieceEncodeFormat? OutputFormat { get; set; }
240 |         /// <summary>
241 |         /// Restrict the vocabulary. The encoder only emits the tokens in "vocabulary" file
242 |         /// </summary>
243 |         public string Vocabulary { get; set; }
244 |         /// <summary>
245 |         /// Words with frequency below threshold will be treated as OOV ( default: 0 )
246 |         /// </summary>
247 |         public Int32? VocabularyThreshold { get; set; }
248 |     }
249 | 
250 |     /// <summary>
251 |     /// Class to hold all parameters for the SentencePiece decoding tool.
252 |     /// </summary>
253 |     public class SentencePieceDecodeConfig : SegmenterDecodeConfigBase, ISentencePieceConfig
254 |     {
255 |         /// <summary>
256 |         /// ':' separated encoder extra options, e.g., "reverse:bos:eos"
257 |         /// </summary>
258 |         public string ExtraOptions { get; set; }
259 |         /// <summary>
260 |         /// choose from piece, id. Default: piece
261 |         /// Internal use only; cannot be specified by Flo user.
262 |         /// </summary>
263 |         public SentencePieceDecodeInputFormat? InputFormat { get; set; }
264 |         /// <summary>
265 |         /// choose from string or proto. Default: string
266 |         /// Internal use only; cannot be specified by Flo user.
267 |         /// </summary>
268 |         public SentencePieceDecodeOutputFormat? OutputFormat { get; set; }
269 |     }
270 | }
271 | 


--------------------------------------------------------------------------------
/src/SentencePieceManaged.cs:
--------------------------------------------------------------------------------
  1 | ﻿// Copyright (c) Microsoft Corporation.
  2 | // Licensed under the MIT license.
  3 | 
  4 | // This file is not used in the standalone/Linux build.
  5 | 
  6 | using System;
  7 | using System.Collections.Generic;
  8 | using System.Linq;
  9 | using System.Runtime.InteropServices;
 10 | 
 11 | namespace Segmentation
 12 | {
 13 |     public class SentencePieceManaged
 14 |     {
 15 |         private readonly IntPtr model;
 16 | 
 17 |         private static class NativeMethods
 18 |         {
 19 |             private const string DllName = "SentencePieceInterop";
 20 |             [DllImport(DllName, CharSet = CharSet.Unicode, CallingConvention = CallingConvention.Cdecl)]
 21 |             public static extern IntPtr LoadModel(String modelPath,
 22 |                 [MarshalAs(UnmanagedType.LPArray, ArraySubType = UnmanagedType.LPWStr, SizeParamIndex = 2)]String[] vocab, ulong vocabSize);
 23 | 
 24 |             [DllImport(DllName, CharSet = CharSet.Unicode, CallingConvention = CallingConvention.Cdecl)]
 25 |             public static extern int EncodeAsIds(IntPtr model, string word, 
 26 |                 [MarshalAs(UnmanagedType.LPArray, ArraySubType = UnmanagedType.I4, SizeParamIndex = 3)]int[] pieceIdBuffer, ulong pieceIdBufferSize);
 27 | 
 28 |             [DllImport(DllName, CallingConvention = CallingConvention.Cdecl)]
 29 |             public static extern int UCS2LengthOfPieceId(IntPtr model, int pieceId);
 30 | 
 31 |             [DllImport(DllName, CallingConvention = CallingConvention.Cdecl)]
 32 |             public static extern void UnloadModel(IntPtr model);
 33 | 
 34 |         }
 35 | 
 36 | 
 37 |         public SentencePieceManaged(String modelPath, string[] vocab)
 38 |         {
 39 |             ulong vocabLength = (ulong?)vocab?.Length ?? 0UL;
 40 |             IntPtr local = NativeMethods.LoadModel(modelPath, vocab, (ulong) vocabLength);
 41 |             if (local == IntPtr.Zero)
 42 |                 throw new ArgumentNullException($"Could not load model file from path {modelPath}");
 43 | 
 44 |             this.model = local;
 45 |         }
 46 | 
 47 |         ~SentencePieceManaged()
 48 |         {
 49 |             if (model != IntPtr.Zero)
 50 |             {
 51 |                 NativeMethods.UnloadModel(this.model);
 52 |             }
 53 |         }
 54 | 
 55 |         /// <summary>
 56 |         /// This function splits a string (typically a word) into pieces. Instead of returning the pieces, it returns the indices of the split points as an array of integers (including 0 and N).
 57 |         /// In the frequent case that nothing is split, we instead return null to save a memory allocation.
 58 |         /// </summary>
 59 |         /// <param name="segment">The word to split</param>
 60 |         /// <returns>Array representing indices where to split the word, including 0 and N, or null which maps to [0,N]</returns>
 61 |         public int[] GetSplitPoints(String segment)
 62 |         {
 63 |             if (String.IsNullOrEmpty(segment) || segment.Length == 1)
 64 |                 return null;
 65 |             int[] pieceIds = new int[segment.Length];
 66 |             // break string using SentencePiece library
 67 |             int size = NativeMethods.EncodeAsIds(model, segment, pieceIds, (ulong)pieceIds.Length); ;
 68 |             if(size < 0)
 69 |             {
 70 |                 throw new InvalidOperationException("SentencePiece returned a negative size array");
 71 |             } 
 72 | 
 73 |             if (size == 1)
 74 |             {
 75 |                 int length = NativeMethods.UCS2LengthOfPieceId(this.model, pieceIds[0]);
 76 |                 // if it's length 1 and not an UNK token, we return null
 77 |                 if (length != -1)
 78 |                     return null;
 79 |             }
 80 |             
 81 |             // create the array of offsets, by aggregating the lengths of all pieces
 82 |             int segmentSize = segment.Length;
 83 |             List<int> cutList = new List<int>();
 84 |             cutList.Add(0); // 0 is always included in the cut-list
 85 |             
 86 |             bool done = false;
 87 |             while (!done) // retry loop used in case of unencodable characters
 88 |             {
 89 |                 done = true;
 90 |                 for (int i = 0; i < size; i++)
 91 |                 {
 92 |                     if (cutList.Last() >= segmentSize) // logic error
 93 |                         throw new InvalidOperationException($"Unexpectedly hit the end while splitting {segment}");
 94 |                     int pieceId = pieceIds[i];
 95 |                     int pieceLength = NativeMethods.UCS2LengthOfPieceId(this.model, pieceId);
 96 |                     // handle unknown character
 97 |                     // Unfortunately, SPM just returns a single <unk> token for any sequence of unencodable
 98 |                     // characters, without telling us how many source characters it is made up of.
 99 |                     // To work around this, we split off the first char of the <unk> token, but then
100 |                     // call Encode() again with the remaining string. If the <unk> consisted of
101 |                     // more than one unencodable, the same mechanism will then kick in to split off the next
102 |                     // char, call Encode() again etc. This has square complexity w.r.t. string length,
103 |                     // but sequences are short, and this does not happen too frequently.
104 |                     if (pieceLength == -1)                  // -1 indicates one or more unknown characters
105 |                     {
106 |                         bool skipLow = Char.IsHighSurrogate(segment, cutList.Last()) && cutList.Last() + 2 <= segmentSize;
107 |                         cutList.Add(cutList.Last() + 1 + (skipLow ? 1 : 0)); // consume it (skip two if surrogate pair)
108 |                         if (cutList.Last() == segmentSize)             // none left
109 |                             ;
110 |                         else if (cutList.Last() + 1 == segmentSize)    // single char left
111 |                             cutList.Add(segmentSize);
112 |                         else                                           // more left: go again with remainder
113 |                         {
114 |                             // find the substring from the last index that had a length to the end
115 |                             String copySegment = segment.Substring(cutList.Last());
116 |                             size = NativeMethods.EncodeAsIds(model, copySegment, pieceIds, (ulong)pieceIds.Length);
117 |                             if(size < 0)
118 |                                 throw new InvalidOperationException("Substring should use less space than original");
119 |                             done = false;
120 |                         }
121 | 
122 |                         // if we found an unk, break the current loop, and start a new loop over, if there are any characters left
123 |                         break;
124 |                     }
125 |                     // regular case
126 |                     else
127 |                     {
128 |                         cutList.Add(cutList.Last() + pieceLength);
129 |                     }
130 |                 }
131 |             }
132 | 
133 |             if (cutList.Last() != segmentSize)
134 |                 throw new InvalidOperationException("Sentence pieces do not reconstruct original string??");
135 |             return cutList.ToArray();
136 |         }
137 | 
138 |         public string[] Segment(String line)
139 |         {
140 |             throw new NotImplementedException();
141 |         }
142 | 
143 |         public String Unsegment(string[] pieces)
144 |         {
145 |             throw new NotImplementedException();
146 |         }
147 |     }
148 | }
149 | 


--------------------------------------------------------------------------------
/src/SentencePieceWrapper.cs:
--------------------------------------------------------------------------------
  1 | ﻿// Copyright (c) Microsoft Corporation.
  2 | // Licensed under the MIT license.
  3 | 
  4 | using System;
  5 | using System.Collections.Generic;
  6 | using System.IO;
  7 | using System.Linq;
  8 | using System.Runtime.InteropServices;
  9 | using System.Text;
 10 | using Common.Collections;
 11 | using Common.Collections.Extensions;
 12 | using Common.Contracts;
 13 | using Common.IO;
 14 | using Common.Utils;
 15 | 
 16 | namespace Microsoft.MT.Common.Tokenization
 17 | {
 18 |     /// <summary>
 19 |     /// Wrapper for SentencePiece that supports
 20 |     ///  - training an SPM model via invoking the spm_train executable
 21 |     ///  - encoding of words as pieces via an in-memory object/lambda
 22 |     /// </summary>
 23 |     public class SentencePieceModel
 24 |     {
 25 |         const string spmModelExt = ".model"; // these are required/hard-coded by the spm_train tool
 26 |         const string spmVocabExt = ".vocab";
 27 | 
 28 |         // model data
 29 |         public byte[] Bytes { get; }
 30 | 
 31 |         /// <summary>
 32 |         /// Construct an SPM model from file.
 33 |         /// </summary>
 34 |         public static SentencePieceModel Load(string path)
 35 |         {
 36 |             return new SentencePieceModel(File.ReadAllBytes(path));
 37 |         }
 38 | 
 39 |         /// <summary>
 40 |         /// Construct an SPM model from a byte array.
 41 |         /// </summary>
 42 |         public SentencePieceModel(byte[] modelBlob)
 43 |         {
 44 |             Bytes = modelBlob;
 45 |         }
 46 | 
 47 |         /// <summary>
 48 |         /// Construct an SPM model from data; that is, train one.
 49 |         /// The input is passed as an IEnumerable or a ParallelQuery of lines of raw plain-text.
 50 |         /// The model is returned as a binary blob (for later use in encoding/decoding).
 51 |         /// Underneath, this uses the spm_train executable, which needs to store the model as a file. That location is
 52 |         /// passed in as 'tempSPMModelPath'. These output files are temporary and local to this function, but
 53 |         /// it is useful to keep them around for diagnostics and debugging; they are not (meant to be) used after this.
 54 |         /// 'minPieceCount' allows to set a minimum observation count for word pieces. spm_train does not support this,
 55 |         /// so we emulate/approximate it by running spm_train twice.
 56 |         /// </summary>
 57 |         public static SentencePieceModel Train<Enumerable>(Enumerable tokenStrings, string tempSPMModelPath,
 58 |             SentencePieceTrainConfig spmParams, int minPieceCount, string spmBinDir)
 59 |             where Enumerable : IEnumerable<string> // using template so we won't loose parallelism (is this needed?)
 60 |         {
 61 |             Sanity.Requires(tempSPMModelPath.EndsWith(spmModelExt), $"FactoredSegmenter SentencePiece model path must end in {spmModelExt}");
 62 |             var modelPrefix = tempSPMModelPath.Substring(0, tempSPMModelPath.Length - spmModelExt.Length);
 63 | 
 64 | #if false   // helper during debugging of final Training stage when models already exist
 65 |             LoadSPMModelFiles(modelPrefix, out var spmModelBlob, out var spmVocab);
 66 | #else
 67 | 
 68 |             // write the tokens to a temp file
 69 |             var tempInputDataPath = modelPrefix + ".data";
 70 |             Logger.WriteLine($"FactoredSegmenter: Writing to temp file {tempInputDataPath} for SPM training...");
 71 |             AtomicFileWriter.Save(tempInputDataPath, tmpPath => File.WriteAllLines(tmpPath, tokenStrings, new UTF8Encoding()));
 72 |             // atomic writing allows the impatient user to know when the writing has completed and spm_train has taken over
 73 | 
 74 |             // invoke spm_train
 75 |             SPMTrain(tempInputDataPath, modelPrefix, spmParams, spmBinDir, null);
 76 | 
 77 |             // fetch the content of the generated .model and .vocab file into in-memory data structures
 78 |             // After this, the spm_train-generated files are no longer used; and only kept for debugging purposes.
 79 |             LoadSPMModelFiles(modelPrefix, out var spmModelBlob, out var spmVocab);
 80 | 
 81 |             // enforce minimum piece-count constraint
 82 |             if (minPieceCount > 1)
 83 |             {
 84 |                 // encode the SPM training data and count each token's occurence
 85 |                 Logger.WriteLine($"FactoredSegmenter: Minimum-count constraint ({minPieceCount}), counting SPM tokens...");
 86 |                 var coder = new SentencePieceCoder(new SentencePieceCoderConfig { SentencePieceModel = new SentencePieceModel(spmModelBlob) });
 87 |                 var counts = CountEncodedTokens(tempInputDataPath, coder);
 88 |                 File.WriteAllLines(tempSPMModelPath + $".{spmVocab.Length}.counts", // save it for diagnostics only
 89 |                     from kvp in counts orderby -kvp.Value, kvp.Key select $"{kvp.Key}\t{kvp.Value}");
 90 |                 // count number of SPM vocab items that should be kept (above the threshold or single character which we always keep)
 91 |                 var spmVocabSet = new HashSet<string>(spmVocab);
 92 |                 int adjustedVocabSize = counts.Count(kvp => spmVocabSet.Contains(kvp.Key) && (kvp.Key.Length == 1 || kvp.Value >= minPieceCount));
 93 |                 // if there are units below the threshold, reduce the SPM vocab size and retrain
 94 |                 if (adjustedVocabSize < spmVocab.Length)
 95 |                 {
 96 |                     Logger.WriteLine($"FactoredSegmenter: Only {adjustedVocabSize} out of {spmVocab.Length} sentence pieces have {minPieceCount} or more observations." +
 97 |                                      $" Retraining SPM model with reduced vocabSize {adjustedVocabSize}");
 98 |                     // invoke spm_train a second time
 99 |                     SPMTrain(tempInputDataPath, modelPrefix, spmParams, spmBinDir, adjustedVocabSize);
100 |                     LoadSPMModelFiles(modelPrefix, out spmModelBlob, out spmVocab); // reload the new model
101 |                 }
102 |                 // count once again for diagnostics only
103 |                 Logger.WriteLine($"FactoredSegmenter: Re-counting SPM tokens after reduction to {adjustedVocabSize}...");
104 |                 coder = new SentencePieceCoder(new SentencePieceCoderConfig { SentencePieceModel = new SentencePieceModel(spmModelBlob) });
105 |                 counts = counts = CountEncodedTokens(tempInputDataPath, coder);
106 |                 File.WriteAllLines(tempSPMModelPath + $".{adjustedVocabSize}.counts", // save for diagnostics only
107 |                     from kvp in counts orderby -kvp.Value, kvp.Key select $"{kvp.Key}\t{kvp.Value}");
108 |             }
109 | 
110 |             // delete the temp file   --except if it failed, so user can double-check what's going on
111 |             // commented out temporarily to aid debugging
112 |             //File.Delete(tempPath);
113 | #endif
114 | 
115 |             return new SentencePieceModel(spmModelBlob);
116 |         }
117 | 
118 |         // helper to count encoded tokens
119 |         private static Dictionary<string, int> CountEncodedTokens(string tempInputDataPath, SentencePieceCoder coder)
120 |         {
121 |             var counts = new Dictionary<string, int>();
122 |             var pieces = from s in File.ReadLines(tempInputDataPath).AsParallel() // note: AsParallel() makes things out of order
123 |                          let cutList = coder.Split(s)
124 |                          from range in (cutList == null) ? new[] { (0, s.Length) } : cutList.Bigrams()
125 |                          select s.Substring(range.Item1, range.Item2 - range.Item1);
126 |             foreach (var piece in pieces)
127 |             {
128 |                 counts.TryGetValue(piece, out var count);
129 |                 counts[piece] = count + 1;
130 |             }
131 |             return counts;
132 |         }
133 | 
134 |         // invoke spm_train tool
135 |         // Reads input data from file, and creates model and vocab to modelPrefix.model and .vocab, respectively.
136 |         private static void SPMTrain(string inputPath, string modelPrefix, SentencePieceTrainConfig spmParams, string spmBinDir, int? vocabSize)
137 |         {
138 |             // e.g.
139 |             // spm_train \
140 |             //    --input=/philly/wu3/msrmt/fseide/WMT.paracrawl/data/all.paracrawl.8M.norm.$units.ende.sub \
141 |             //    --model_prefix=/philly/wu3/msrmt/fseide/WMT.paracrawl/model/all.paracrawl.8M.norm.$units.ende \
142 |             //    --vocab_size=32000  --character_coverage=1.0  --model_type=unigram  --shuffle_input_sentence=false
143 |             string exe = Path.Combine(spmBinDir, "spm_train");
144 |             if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
145 |             {
146 |                 exe = Path.Combine(spmBinDir, "spm_train.exe");
147 |             }
148 |             var args = new List<string> { "--input", inputPath, "--model_prefix", modelPrefix };
149 |             var extraArgs = from extraParam in new Dictionary<string, object>
150 |                             { // @TODO: use generic Flo method that parses the struct type directly
151 |                                 ["vocab_size"] = vocabSize ?? spmParams.VocabSize,
152 |                                 ["character_coverage"] = spmParams.CharacterCoverage,
153 |                                 ["model_type"] = spmParams.ModelType.ToString().ToLower(),
154 |                                 //["shuffle_input_sentence"]   = spmParams.ShuffleInputSentence.ToString().ToLower(), // not supported in the SPM package version used in Flo
155 |                                 ["add_dummy_prefix"] = spmParams.AddDummyPrefix.ToString().ToLower(),
156 |                                 ["normalization_rule_name"] = spmParams.NormalizationRuleName.ToString().ToLower(),
157 |                                 ["split_by_whitespace"] = spmParams.SplitByWhitespace.ToString().ToLower(),
158 |                                 ["remove_extra_whitespaces"] = spmParams.RemoveExtraWhitespaces.ToString().ToLower(),
159 |                                 ["input_sentence_size"] = spmParams.InputSentenceSize,
160 |                                 ["mining_sentence_size"] = spmParams.MiningSentenceSize,
161 |                                 ["training_sentence_size"] = spmParams.TrainingSentenceSize,
162 |                                 ["seed_sentencepiece_size"] = spmParams.SeedSentencepieceSize,
163 |                                 ["max_sentence_length"] = spmParams.MaxSentenceLength
164 |                             }
165 |                             where extraParam.Value != null
166 |                             let val = extraParam.Value.ToString()
167 |                             where val != ""
168 |                             from arg in new string[] { "--" + extraParam.Key, val }
169 |                             select arg; // unroll into form --arg1 argval1 --arg2 argval2 ...
170 |             args.AddRange(extraArgs);
171 |             var envirVariables = new Dictionary<string, string> { { "LC_ALL", "C" } }; // (not sure if this matters; better safe than sorry)
172 |             ProcessTools.RunCommand(exe, ProcessTools.ArgsToCommandLine(args), null, modelPrefix + ".log", throwOnFailure: true, envirVariables: envirVariables);
173 |         }
174 | 
175 |         // helper to fetch .model and .vocab file written out by SPMTrain above into in-memory variables
176 |         private static void LoadSPMModelFiles(string modelPrefix, out byte[] spmModel, out string[] spmVocab)
177 |         {
178 |             spmModel = File.ReadAllBytes(modelPrefix + spmModelExt);
179 |             spmVocab = (from line in File.ReadAllLines(modelPrefix + spmVocabExt)
180 |                         select line.Split('\t').First())                   // .vocab file has the form "TOKEN\tLOGPROB"; we only want TOKEN
181 |                        .OrderBy(t => t.ToString(), StringComparer.Ordinal) // sort it for neatness
182 |                        .ToArray();
183 |         }
184 |     }
185 | 
186 | 
187 |     public class SentencePieceCoderConfig
188 |     {
189 |         /// <summary>
190 |         /// The underlying native SentencePiece model
191 |         /// </summary>
192 |         public SentencePieceModel SentencePieceModel { get; set; }
193 |         /// <summary>
194 |         /// If set, then SPM will be restricted to pieces in this set.
195 |         /// We have seen examples where the internal SPM vocab contains a few units
196 |         /// that are not observed when encoding the SPM training set. I suspect that
197 |         /// is because training uses a soft forward-backward method, while the
198 |         /// re-encoding of the SPM training set uses a best path. To circumvent that
199 |         /// situation, we pass the set of pieces determined by re-encoding the training set.
200 |         /// </summary>
201 |         public HashSet<string> VocabSubset { get; set; } = null;
202 |         /// <summary>
203 |         /// If defined, size of cache to use for split(). Currently only used for caching splits at word level when called from factored segmenter.
204 |         /// </summary>
205 |         public int SplitCacheSize { get; set; } = 0;
206 |     }
207 | 
208 | 
209 |     /// <summary>
210 |     /// Shallow wrapper over the SentencePieceManaged lib for encoding and decoding,
211 |     /// which follows our design of accepting a corresponding model as an input, and providing
212 |     /// encode and decode functions.
213 |     /// </summary>
214 |     public class SentencePieceCoder
215 |     {
216 |         readonly Segmentation.SentencePieceManaged spm;
217 |         private BoundedSizedLockingCache<string, int[]> m_splitCache;
218 | 
219 |         /// <summary>
220 |         /// Construct a coder from a SentencePieceModel.
221 |         /// @TODO: Should this also have a config object?
222 |         /// </summary>
223 |         /// <param name="model">Sentencepiece model to delegate calls to</param>
224 |         /// <param name="cacheSize">Size of cache for calls to Split() (segmentation is very resource intensive)</param>
225 |         public SentencePieceCoder(SentencePieceCoderConfig config)
226 |         {
227 |             // Save BLOB to file, since the current SentencePieceManaged wrapper can only load the model from a file,
228 |             // @TODO: The SentencePiece native API also supports reading from a std::istream,
229 |             //        so we should pass the blob via a a simple istream class that reads from memory, cf.
230 |             //        https://stackoverflow.com/questions/2079912/simpler-way-to-create-a-c-memorystream-from-char-size-t-without-copying-t
231 |             var spmTempModelPath = Path.GetTempFileName();
232 |             File.WriteAllBytes(spmTempModelPath, config.SentencePieceModel.Bytes);
233 |             spm = new Segmentation.SentencePieceManaged(spmTempModelPath, config.VocabSubset?.ToArray());
234 |             File.Delete(spmTempModelPath);
235 |             m_splitCache = new BoundedSizedLockingCache<string, int[]>(config.SplitCacheSize);
236 |         }
237 | 
238 |         /// <summary>
239 |         /// Invoke SPM encode on a text line
240 |         /// </summary>
241 |         public string[] Encode(string line) => spm.Segment(line);
242 | 
243 |         /// <summary>
244 |         /// Encode a word (or continuous-script segment) and return the result a list of split points.
245 |         /// E.g. if SPM splits an input word "hello" into "hel" and "lo",
246 |         /// this function returns (0, 3, 5). The result includes start (0) and length.
247 |         /// If the word was not split, it returns null, to save some memory allocation overhead.
248 |         /// Characters that cannot be represented by the sentence-piece inventory are
249 |         /// returned as individual characters.
250 |         /// This function is not meant to be used with unsegmented input. Its behavior for inputs
251 |         /// that include spaces is not tested or known.
252 |         /// </summary>
253 |         /// <param name="s">Character sequence to split.</param>
254 |         /// <param name="adjustForWordBegPrefix">If true, s has a leading _. Subtract 1 from every offset.</param>
255 |         /// <returns>List of split offsets (including 0 and the string length) or null if not split.</returns>
256 |         public int[] Split(string s, bool adjustForWordBegPrefix = false) => CachedFunction.Memoize(m_splitCache, s, x =>
257 |         //public int[] Split(string s, bool adjustForWordBegPrefix = false) => CachedFunction.Memoize<int[], string>(m_splitCache, s, x =>
258 |         {
259 |             var cutList = spm.GetSplitPoints(x);
260 |             if (adjustForWordBegPrefix && cutList != null) // source string had leading boundary prefix--account for it
261 |                 for (int i = 1; i < cutList.Length; i++)
262 |                     cutList[i]--;
263 |             return cutList;
264 |         });
265 | 
266 |         /// <summary>
267 |         /// Invoke SPM decode on an array of pieces
268 |         /// </summary>
269 |         public string Decode(string[] pieces) => spm.Unsegment(pieces);
270 |     }
271 | }
272 | 


--------------------------------------------------------------------------------
/test/FactoredSegmenterScriptHelpersTests.cs:
--------------------------------------------------------------------------------
 1 | ﻿// Copyright (c) Microsoft Corporation.
 2 | // Licensed under the MIT license.
 3 | 
 4 | namespace TextSegmentation.Segmenter.FactoredSegmenter_GitSubmodule.src.Test
 5 | {
 6 |     using Common.Text;
 7 |     using Microsoft.VisualStudio.TestTools.UnitTesting;
 8 |     using System.Diagnostics.CodeAnalysis;
 9 | 
10 |     /// <summary>
11 |     /// Unit tests
12 |     /// </summary>
13 |     [TestClass]
14 |     [ExcludeFromCodeCoverage]
15 |     public class FactoredSegmenterScriptHelperTests
16 |     {
17 |         [TestMethod]
18 |         public void ScriptEdgeCasesTest()
19 |         {
20 |             // We put stuff here to be sure how stuff is classified (e.g. Chinese letter 6 (六) is not considered a number by C#).
21 |             // This is less of a regression test and more of a "documentation" on what we think is true for a few edge cases.
22 |             Assert.IsTrue(Unicode.GetUnicodeMajorDesignation('는') == 'L'); // Korean case markers: make sure they are just like letters
23 |             Assert.IsTrue(Unicode.GetScript('는') == Unicode.Script.Hangul);
24 |             Assert.IsTrue(Unicode.GetUnicodeMajorDesignation('＄') == 'S');
25 |             Assert.IsTrue(Unicode.GetUnicodeMajorDesignation('，') == 'P'); // full-width (incl. Chinese)
26 |             Assert.IsTrue(Unicode.GetUnicodeMajorDesignation('、') == 'P'); // Chinese
27 |             Assert.IsTrue(Unicode.GetUnicodeMajorDesignation('。') == 'P'); // Chinese
28 |             Assert.IsTrue(Unicode.GetUnicodeMajorDesignation('।') == 'P'); // Hindi Danda
29 |             Assert.IsTrue(Unicode.GetUnicodeMajorDesignation('॥') == 'P'); // Hindi Danda
30 |         }
31 | 
32 |         [TestMethod]
33 |         public void ClassificationEdgeCaseTests()
34 |         {
35 |             // put stuff here to be sure how stuff is classified (e.g. Chinese letter 6 (六) is not considered a number by C#)
36 |             Assert.IsTrue('Ａ'.HasAndIsUpper());
37 |             Assert.IsTrue('Ａ'.IsBicameral());
38 |             Assert.IsTrue(!'ß'.HasAndIsUpper());
39 |             Assert.IsTrue('１'.IsNumeral());
40 |             Assert.IsTrue('〇'.IsNumeral());
41 |             Assert.IsTrue('○'.IsNumeral());     // medium small white circle; is used in Chinese as a zero
42 |             Assert.IsTrue('十'.IsNumeral());
43 |             Assert.IsTrue('六'.IsNumeral());
44 |             Assert.IsTrue('२'.IsNumeral());     // Hindi numeral
45 |             Assert.IsTrue('Ⅹ'.IsNumeral());     // Roman numeral
46 |             Assert.IsTrue('Ⅹ'.HasAndIsUpper()); // Roman numeral--C# IsUpper() gets this wrong
47 |         }
48 |     }
49 | }
50 | 


--------------------------------------------------------------------------------
/test/blns/blns_README.txt:
--------------------------------------------------------------------------------
 1 | source of blns.txt:
 2 | 
 3 | https://github.com/minimaxir/big-list-of-naughty-strings/blob/master/blns.txt
 4 | 
 5 | Latest commit f56ff6e  on Nov 16, 2018
 6 | 
 7 | then manually removed a few SQL-injection strings to avoid malicious use
 8 | 
 9 | (MIT license)
10 | 


--------------------------------------------------------------------------------