├── .gitattributes
├── .gitconfig
├── .gitignore
├── CODE_OF_CONDUCT.md
├── LICENSE
├── NOTICE.md
├── README.md
├── SECURITY.md
├── cli
│   ├── Program.cs
│   └── README.txt
├── factored-segmenter.csproj
├── factored-segmenter.sln
├── lib
│   ├── README.txt
│   ├── RemoveThese.cs
│   ├── SentencePieceInterop.cs
│   ├── TextHelpers.cs
│   ├── Types.cs
│   └── Utils.cs
├── spm
│   ├── CMakeLists.txt
│   ├── SentencePieceInterop.cpp
│   ├── spm.model
│   ├── spm.vocab
│   └── unicode_conversions.h
├── src
│   ├── FactoredSegmenter.cs
│   ├── FactoredSegmenterConfigs.cs
│   ├── FactoredSegmenterScriptHelpers.cs
│   ├── ProcessTools.cs
│   ├── README.txt
│   ├── SegmenterRuntime.cs
│   ├── SentencePieceConfigs.cs
│   ├── SentencePieceManaged.cs
│   └── SentencePieceWrapper.cs
└── test
    ├── FactoredSegmenterScriptHelpersTests.cs
    ├── FactoredSegmenterTests.cs
    └── blns
        ├── blns.txt
        └── blns_README.txt

/.gitattributes:
--------------------------------------------------------------------------------
1 | * -text
2 | * diff
3 | *.cer -diff
4 | *.qencr -diff
5 | *.dat -diff
--------------------------------------------------------------------------------
/.gitconfig:
--------------------------------------------------------------------------------
1 | # This file is *NOT* read by git by default (due to security issues)
2 | # So for mtmain, we read it in using MyInit.cmd
3 | 
4 | # For documentation on the settings in this file, see https://git-scm.com/book/en/v2/Customizing-Git-Git-Configuration
5 | 
6 | # Don't auto-convert CRLF->LF on git add and LF->CRLF on git checkout (just leave the line-endings alone!)
7 | [core]
8 | autocrlf = false
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.rsuser 8 | *.suo 9 | *.user 10 | *.userosscache 11 | *.sln.docstates 12 | 13 | # User-specific files (MonoDevelop/Xamarin Studio) 14 | *.userprefs 15 | 16 | # Mono auto generated files 17 | mono_crash.* 18 | 19 | # Build results 20 | [Dd]ebug/ 21 | [Dd]ebugPublic/ 22 | [Rr]elease/ 23 | [Rr]eleases/ 24 | x64/ 25 | x86/ 26 | [Aa][Rr][Mm]/ 27 | [Aa][Rr][Mm]64/ 28 | bld/ 29 | [Bb]in/ 30 | [Oo]bj/ 31 | [Ll]og/ 32 | [Ll]ogs/ 33 | 34 | # Visual Studio 2015/2017 cache/options directory 35 | .vs/ 36 | # Uncomment if you have tasks that create the project's static files in wwwroot 37 | #wwwroot/ 38 | 39 | # Visual Studio 2017 auto generated files 40 | Generated\ Files/ 41 | 42 | # MSTest test Results 43 | [Tt]est[Rr]esult*/ 44 | [Bb]uild[Ll]og.* 45 | 46 | # NUnit 47 | *.VisualState.xml 48 | TestResult.xml 49 | nunit-*.xml 50 | 51 | # Build Results of an ATL Project 52 | [Dd]ebugPS/ 53 | [Rr]eleasePS/ 54 | dlldata.c 55 | 56 | # Benchmark Results 57 | BenchmarkDotNet.Artifacts/ 58 | 59 | # .NET Core 60 | project.lock.json 61 | project.fragment.lock.json 62 | artifacts/ 63 | **/Properties/launchSettings.json 64 | 65 | # StyleCop 66 | StyleCopReport.xml 67 | 68 | # Files built by Visual Studio 69 | *_i.c 70 | *_p.c 71 | *_i.h 72 | *_h.h 73 | *.ilk 74 | *.meta 75 | *.obj 76 | *.iobj 77 | *.pch 78 | *.pdb 79 | *.ipdb 80 | *.pgc 81 | *.pgd 82 | *.rsp 83 | *.sbr 84 | *.tlb 85 | *.tli 86 | *.tlh 87 | *.tmp 88 | *.tmp_proj 89 | *_wpftmp.csproj 90 | *.log 91 | *.vspscc 92 | *.vssscc 93 | .builds 94 | *.pidb 95 | *.svclog 96 | *.scc 97 | 98 | # Chutzpah Test files 99 | _Chutzpah* 100 | 101 | # Visual C++ cache files 102 | ipch/ 103 | *.aps 104 | *.ncb 105 | *.opendb 106 | *.opensdf 107 | *.sdf 108 | *.cachefile 109 | *.VC.db 110 | *.VC.VC.opendb 111 | 112 | # Visual Studio profiler 113 | *.psess 114 | *.vsp 115 | *.vspx 116 | *.sap 117 | 118 | # Visual Studio Trace Files 119 | *.e2e 120 | 121 | # TFS 2012 Local Workspace 122 | $tf/ 123 | 124 | # Guidance Automation Toolkit 125 | *.gpState 126 | 127 | # ReSharper is a .NET coding add-in 128 | _ReSharper*/ 129 | *.[Rr]e[Ss]harper 130 | *.DotSettings.user 131 | 132 | # JustCode is a .NET coding add-in 133 | .JustCode 134 | 135 | # TeamCity is a build add-in 136 | _TeamCity* 137 | 138 | # DotCover is a Code Coverage Tool 139 | *.dotCover 140 | 141 | # AxoCover is a Code Coverage Tool 142 | .axoCover/* 143 | !.axoCover/settings.json 144 | 145 | # Visual Studio code coverage results 146 | *.coverage 147 | *.coveragexml 148 | 149 | # NCrunch 150 | _NCrunch_* 151 | .*crunch*.local.xml 152 | nCrunchTemp_* 153 | 154 | # MightyMoose 155 | *.mm.* 156 | AutoTest.Net/ 157 | 158 | # Web workbench (sass) 159 | .sass-cache/ 160 | 161 | # Installshield output folder 162 | [Ee]xpress/ 163 | 164 | # DocProject is a documentation generator add-in 165 | DocProject/buildhelp/ 166 | DocProject/Help/*.HxT 167 | DocProject/Help/*.HxC 168 | DocProject/Help/*.hhc 169 | DocProject/Help/*.hhk 170 | DocProject/Help/*.hhp 171 | DocProject/Help/Html2 172 | DocProject/Help/html 173 | 174 | # Click-Once directory 175 | publish/ 176 | 177 | # Publish Web Output 178 | *.[Pp]ublish.xml 179 | *.azurePubxml 180 | # Note: Comment the next line if you want to checkin your web deploy settings, 181 | # but database connection strings (with potential passwords) will be unencrypted 182 | *.pubxml 183 | *.publishproj 184 | 185 | # Microsoft Azure Web App publish settings. 
Comment the next line if you want to 186 | # checkin your Azure Web App publish settings, but sensitive information contained 187 | # in these scripts will be unencrypted 188 | PublishScripts/ 189 | 190 | # NuGet Packages 191 | *.nupkg 192 | # NuGet Symbol Packages 193 | *.snupkg 194 | # The packages folder can be ignored because of Package Restore 195 | **/[Pp]ackages/* 196 | # except build/, which is used as an MSBuild target. 197 | !**/[Pp]ackages/build/ 198 | # Uncomment if necessary however generally it will be regenerated when needed 199 | #!**/[Pp]ackages/repositories.config 200 | # NuGet v3's project.json files produces more ignorable files 201 | *.nuget.props 202 | *.nuget.targets 203 | 204 | # Microsoft Azure Build Output 205 | csx/ 206 | *.build.csdef 207 | 208 | # Microsoft Azure Emulator 209 | ecf/ 210 | rcf/ 211 | 212 | # Windows Store app package directories and files 213 | AppPackages/ 214 | BundleArtifacts/ 215 | Package.StoreAssociation.xml 216 | _pkginfo.txt 217 | *.appx 218 | *.appxbundle 219 | *.appxupload 220 | 221 | # Visual Studio cache files 222 | # files ending in .cache can be ignored 223 | *.[Cc]ache 224 | # but keep track of directories ending in .cache 225 | !*.[Cc]ache/ 226 | !?*.[Cc]ache/ 227 | 228 | # Others 229 | ClientBin/ 230 | ~$* 231 | *~ 232 | *.dbmdl 233 | *.dbproj.schemaview 234 | *.jfm 235 | *.pfx 236 | *.publishsettings 237 | orleans.codegen.cs 238 | 239 | # Including strong name files can present a security risk 240 | # (https://github.com/github/gitignore/pull/2483#issue-259490424) 241 | #*.snk 242 | 243 | # Since there are multiple workflows, uncomment next line to ignore bower_components 244 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 245 | #bower_components/ 246 | 247 | # RIA/Silverlight projects 248 | Generated_Code/ 249 | 250 | # Backup & report files from converting an old project file 251 | # to a newer Visual Studio version. Backup files are not needed, 252 | # because we have git ;-) 253 | _UpgradeReport_Files/ 254 | Backup*/ 255 | UpgradeLog*.XML 256 | UpgradeLog*.htm 257 | ServiceFabricBackup/ 258 | *.rptproj.bak 259 | 260 | # SQL Server files 261 | *.mdf 262 | *.ldf 263 | *.ndf 264 | 265 | # Business Intelligence projects 266 | *.rdl.data 267 | *.bim.layout 268 | *.bim_*.settings 269 | *.rptproj.rsuser 270 | *- [Bb]ackup.rdl 271 | *- [Bb]ackup ([0-9]).rdl 272 | *- [Bb]ackup ([0-9][0-9]).rdl 273 | 274 | # Microsoft Fakes 275 | FakesAssemblies/ 276 | 277 | # GhostDoc plugin setting file 278 | *.GhostDoc.xml 279 | 280 | # Node.js Tools for Visual Studio 281 | .ntvs_analysis.dat 282 | node_modules/ 283 | 284 | # Visual Studio 6 build log 285 | *.plg 286 | 287 | # Visual Studio 6 workspace options file 288 | *.opt 289 | 290 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 
291 | *.vbw 292 | 293 | # Visual Studio LightSwitch build output 294 | **/*.HTMLClient/GeneratedArtifacts 295 | **/*.DesktopClient/GeneratedArtifacts 296 | **/*.DesktopClient/ModelManifest.xml 297 | **/*.Server/GeneratedArtifacts 298 | **/*.Server/ModelManifest.xml 299 | _Pvt_Extensions 300 | 301 | # Paket dependency manager 302 | .paket/paket.exe 303 | paket-files/ 304 | 305 | # FAKE - F# Make 306 | .fake/ 307 | 308 | # JetBrains Rider 309 | .idea/ 310 | *.sln.iml 311 | 312 | # CodeRush 313 | .cr/ 314 | # CodeRush personal settings 315 | .cr/personal 316 | 317 | # Python Tools for Visual Studio (PTVS) 318 | __pycache__/ 319 | *.pyc 320 | 321 | # Cake - Uncomment if you are using it 322 | # tools/** 323 | # !tools/packages.config 324 | 325 | # Tabs Studio 326 | *.tss 327 | 328 | # Telerik's JustMock configuration file 329 | *.jmconfig 330 | 331 | # BizTalk build output 332 | *.btp.cs 333 | *.btm.cs 334 | *.odx.cs 335 | *.xsd.cs 336 | 337 | # OpenCover UI analysis results 338 | OpenCover/ 339 | 340 | # Azure Stream Analytics local run output 341 | ASALocalRun/ 342 | 343 | # MSBuild Binary and Structured Log 344 | *.binlog 345 | 346 | # NVidia Nsight GPU debugger configuration file 347 | *.nvuser 348 | 349 | # MFractors (Xamarin productivity tool) working folder 350 | .mfractor/ 351 | 352 | # Local History for Visual Studio 353 | .localhistory/ 354 | 355 | # BeatPulse healthcheck temp database 356 | healthchecksdb 357 | 358 | # Backup folder for Package Reference Convert tool in Visual Studio 2017 359 | MigrationBackup/ 360 | 361 | # Ionide (cross platform F# VS Code tools) working folder 362 | .ionide/ 363 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /NOTICE.md: -------------------------------------------------------------------------------- 1 | # THIRD-PARTY SOFTWARE NOTICES AND INFORMATION 2 | Do Not Translate or Localize 3 | 4 | This project is based on or incorporates material from the projects listed below (Third Party IP). The original copyright notice and the license under which Microsoft received such Third Party IP, are set forth below. Such licenses and notices are provided for informational purposes only. Where permitted, Microsoft licenses the Third Party IP to you under the licensing terms for the Microsoft product. Microsoft reserves all other rights not expressly granted under this agreement, whether by implication, estoppel or otherwise. 5 | 6 | **a. Big List of Naughty Strings** 7 | 8 | In the test suite, `./test/blns/blns.txt` is a near copy of the Big List of Naughty Strings [https://github.com/minimaxir/big-list-of-naughty-strings]. 9 | 10 | Original license [https://github.com/minimaxir/big-list-of-naughty-strings/blob/master/LICENSE]: 11 | 12 | The MIT License (MIT) 13 | 14 | Copyright (c) 2015-2020 Max Woolf 15 | 16 | Permission is hereby granted, free of charge, to any person obtaining a copy 17 | of this software and associated documentation files (the "Software"), to deal 18 | in the Software without restriction, including without limitation the rights 19 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 20 | copies of the Software, and to permit persons to whom the Software is 21 | furnished to do so, subject to the following conditions: 22 | 23 | The above copyright notice and this permission notice shall be included in all 24 | copies or substantial portions of the Software. 25 | 26 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 27 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 28 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 29 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 30 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 31 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 32 | SOFTWARE. 33 | 34 | **b. C++ REST SDK** 35 | 36 | `./spm/unicode_conversions.h` is an excerpt from `./Release/src/utilities/asyncrt_utils.cpp` in the C++ REST SDK [https://github.com/microsoft/cpprestsdk]. 37 | 38 | Original license [https://github.com/microsoft/cpprestsdk/blob/master/license.txt]: 39 | 40 | C++ REST SDK 41 | 42 | The MIT License (MIT) 43 | 44 | Copyright (c) Microsoft Corporation 45 | 46 | All rights reserved. 
47 | 
48 | Permission is hereby granted, free of charge, to any person obtaining a copy of
49 | this software and associated documentation files (the "Software"), to deal in
50 | the Software without restriction, including without limitation the rights to
51 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
52 | the Software, and to permit persons to whom the Software is furnished to do so,
53 | subject to the following conditions:
54 | 
55 | The above copyright notice and this permission notice shall be included in all
56 | copies or substantial portions of the Software.
57 | 
58 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
59 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
60 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
61 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
62 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
63 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
64 | SOFTWARE.
65 | 
66 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # FactoredSegmenter
2 | 
3 | FactoredSegmenter is the unsupervised text tokenizer for machine translation that aims at _factoring shared properties of words_, such as casing or spacing, and underlies Microsoft Translator.
4 | It encodes tokens in the form `WORDPIECE|factor1|factor2|...|factorN`.
5 | This encoding syntax is directly understood by the [Marian Neural Machine Translation Toolkit](https://github.com/marian-nmt/marian).
6 | To use FactoredSegmenter with other toolkits, one must implement a parser for this format, modify the embedding lookup and, to use factors on the target side, the beam decoder.
7 | The term "FactoredSegmenter" refers to both a segmentation library and an encoding of text.
8 | 
9 | FactoredSegmenter segments words into subwords, or _word pieces_, using the popular [SentencePiece](https://github.com/google/sentencepiece) library under the hood.
10 | However, unlike SentencePiece in its common usage, spaces and capitalization are not encoded in the sub-word tokens themselves.
11 | Instead, spacing and capitalization are encoded in _factors_ that are attached to each token.
12 | The purpose of this is to allow the sharing of model parameters across all occurrences of a word, be it
13 | in the middle of a sentence, capitalized at the start of a sentence, at the start of a sentence enclosed in parentheses or quotation marks, or in all-caps in a social-media rant.
14 | In SentencePiece, these are all distinct tokens, which is less robust.
15 | For example, this distinction leads to poor translation accuracy for all-caps sentences, which is problematic when translating social-media posts.
16 | 
17 | #### Features of FactoredSegmenter
18 | 
19 | * represents words and tokens as _tuples of factors_ to allow for parameter sharing. E.g. spacing and capitalization are separate factors on word pieces. An NMT training tool would form token embeddings by summing or concatenating embeddings of all factors in the factor tuple;
20 | * infrequent words are represented by _subwords_, aka word pieces, using the SentencePiece library;
21 | * robust treatment of _numerals_: Each digit is always split as its own token, in any writing system.
We have observed that this reliably fixes a large class of translation errors for numerals, especially when translating between different numeric systems (such as Arabic numbers to Chinese);
22 | * support for _"phrase fixing,"_ where specific phrases are required to be translated in a very specific way. Such constrained translation is achieved with FactoredSegmenter by either replacing such phrases by a fixed token (where a factor is used to distinguish multiple such phrase fixes in a single sentence), or by inserting the desired target translation directly into the encoded source, where factors are used to distinguish the source from the target translation;
23 | * unknown-character handling: characters not covered by the word-piece vocabulary, for example rare Emojis, are encoded by their Unicode character code in a form that a translation system can learn to copy through;
24 | * round-trippable: allows full reconstruction of the source sentence from the factored (sub)word representation (with minor exceptions);
25 | * support for continuous scripts, which have different rules for spacing, and for combining marks.
26 | 
27 | ## Factors
28 | 
29 | Let's randomly pick a word of recent prominence, say "hydroxychloroquine." First, observe that whether it occurs at the beginning of the sentence (where it would normally be capitalized) or within the sentence, or whether it appears after a quotation mark (where it is lower-case but there is no space before it), it is still the same word, and it seems desirable to share embedding parameters across all of these cases to some degree. Secondly, note that since "hydroxychloroquine" is a word rarely seen until recently, it may not have been seen frequently enough after a quotation mark to get its own token. Hence, in that situation it would not only not share its embedding, but it may also be segmented differently altogether from the other cases.
30 | 
31 | FactoredSegmenter attempts to remedy this problem by representing each (sub)word as a tuple. For example, "hydroxychloroquine" at sentence start would be represented by a tuple
32 | that might be written in pseudo-code as
33 | ```
34 | {
35 |     lemma = "hydroxychloroquine",
36 |     capitalization = CAP_INITIAL,
37 |     isWordBeginning = WORDBEG_YES,
38 |     isWordEnd = WORDEND_YES
39 | }
40 | ```
41 | Each tuple member is called a _factor_. The subword identity itself ("hydroxychloroquine") is also represented by a factor, which we call the _lemma_, meaning that it is the base form that may be modified by factors (this is inspired by the linguistic term [lemma](https://simple.wikipedia.org/wiki/Lemma_(linguistics)), which is a base form that gets modified by inflections).
42 | In machine translation, the embedding of the tuple would be formed by composing embedding vectors for each individual factor in the tuple, e.g. by summing or concatenating them.
43 | 
44 | A factor has a type and a value. While the lemma is a string, the `capitalization` factor above is an enumeration with three values, representing three kinds of capitalization: capitalized first letter (beginning of a capitalized word, using the symbol `CAP_INITIAL`), all-caps (`CAP_ALL`), and no capitalized letters at all (a regular all-lowercase word, `CAP_NONE`). To represent mixed-case words, e.g. RuPaul, we break them into subwords. `isWordBeginning` is conceptually a boolean, but for simplicity, we give each factor a unique data type, so `isWordBeginning` is an enum with two values, `WORDBEG_YES` and `WORDBEG_NO`. Likewise for `isWordEnd`.
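To make this concrete, here is a minimal C# sketch of how such a factor tuple could be modeled. The type and member names below are illustrative only and mirror the pseudo-code above; they are not the actual types used in `src/FactoredSegmenter.cs`:
```
// Illustrative only: enums mirroring the pseudo-code factor values above.
enum Capitalization { CAP_NONE, CAP_INITIAL, CAP_ALL }
enum WordBeginning { WORDBEG_NO, WORDBEG_YES }
enum WordEnd { WORDEND_NO, WORDEND_YES }

// A (sub)word token as a tuple of factors; the lemma is itself a factor.
class FactorTuple
{
    public string Lemma;                  // e.g. "hydroxychloroquine"
    public Capitalization Capitalization; // e.g. CAP_INITIAL at sentence start
    public WordBeginning IsWordBeginning; // WORDBEG_YES if the token begins a word
    public WordEnd IsWordEnd;             // WORDEND_YES if the token ends a word
}
```
An NMT toolkit would then look up one embedding vector per factor and sum or concatenate them to form the embedding of the whole tuple.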
45 | 
46 | Different lemmas can have different factor sets. For example, digits and punctuation cannot be capitalized,
47 | hence those lemmas do not have a capitalization factor. However, for a given lemma, the set of factors is always the same.
48 | The specific set of factors of a lemma is determined from heuristics represented in the FactoredSegmenter code, with some configurability via options.
49 | 
50 | For infrequent words or morphological variants, FactoredSegmenter supports subword units. A subword unit is used when a word is unseen in the training data, or not seen often enough. FactoredSegmenter relies on the excellent SentencePiece library for determining suitable subword units.
51 | 
52 | For example, "hydroxychloroquine" might be rare enough to be represented by subwords, such as "hydro" + "xy" + "chloroquine". It would be represented as a sequence of three tuples:
53 | ```
54 | {
55 |     lemma = "hydro",
56 |     capitalization = CAP_INITIAL,
57 |     isWordBeginning = WORDBEG_YES,
58 |     isWordEnd = WORDEND_NO
59 | },
60 | {
61 |     lemma = "xy",
62 |     capitalization = CAP_NONE,
63 |     isWordBeginning = WORDBEG_NO,
64 |     isWordEnd = WORDEND_NO
65 | },
66 | {
67 |     lemma = "chloroquine",
68 |     capitalization = CAP_NONE,
69 |     isWordBeginning = WORDBEG_NO,
70 |     isWordEnd = WORDEND_YES
71 | }
72 | ```
73 | The subword nature of the tuples is represented by the `isWordBeginning` and `isWordEnd` factors.
74 | 
75 | #### Factor Syntax
76 | 
77 | When written to a text file or when communicated to an NMT training toolkit, factor tuples are represented as strings following a specific syntax:
78 | The factor values are concatenated, separated by vertical bars. A direct concatenation of the above example would give `hydroxychloroquine|CAP_INITIAL|WORDBEG_YES|WORDEND_YES`.
79 | However, to avoid dramatically increasing data-file sizes, factors use short-hand notations when serialized. Also, to make those files a little more readable to us humans, lemmas are written in all-caps, while factors use lowercase (this also avoids name conflicts between factor names and real words). If "hydroxychloroquine" is a single word piece, the actual form of the above as written to file is:
80 | ```
81 | HYDROXYCHLOROQUINE|ci|wb|we
82 | ```
83 | The example above where it is represented by multiple subword units has the following serialized form:
84 | ```
85 | HYDRO|ci|wb|wen XY|cn|wbn|wen CHLOROQUINE|cn|wbn|we
86 | ```
87 | Any character that may be used as part of this syntax is escaped as a hex code. For example, if the vertical bar character itself were the lemma, it would be serialized as `\x7c`.
88 | 
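Since external toolkits must parse this serialized form themselves, here is a minimal C# sketch of such a parser. It handles only the vertical-bar syntax and the `\xNN` hex escaping described above; the names `FactorSyntax`, `FactoredToken`, and `ParseToken` are hypothetical, not part of the FactoredSegmenter API:
```
using System;
using System.Linq;
using System.Text.RegularExpressions;

// Illustrative parser for serialized tokens such as "HYDRO|ci|wb|wen".
public static class FactorSyntax
{
    public sealed class FactoredToken
    {
        public string Lemma;     // e.g. "HYDRO"
        public string[] Factors; // e.g. { "ci", "wb", "wen" }
    }

    public static FactoredToken ParseToken(string token)
    {
        var fields = token.Split('|');
        return new FactoredToken
        {
            Lemma = Unescape(fields[0]),
            Factors = fields.Skip(1).ToArray(),
        };
    }

    // Undo the \xNN hex escaping used for syntax characters such as '|'.
    static string Unescape(string s) =>
        Regex.Replace(s, @"\\x([0-9a-fA-F]{2})",
            m => ((char)Convert.ToInt32(m.Groups[1].Value, 16)).ToString());
}
```
For example, `ParseToken("WORKS|cn|wb|we")` yields the lemma `WORKS` with the factors `cn`, `wb`, and `we`.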
89 | #### Representation of Space Between Tokens
90 | 
91 | If you are familiar with SentencePiece, you will notice that the tuples above do not directly encode whether there is a space before or after the word. Instead, factors encode whether a token is at the _boundary_ (beginning/end) of a word. For single-word tokens, both flags are true. Most of the time, a word boundary implies a space, but not always. For example, a word in quotation marks would not be enclosed in spaces; rather, the quotation marks would. As an example, the sequence "Hydroxychloroquine works" would be encoded as:
92 | ```
93 | HYDRO|ci|wb|wen XY|cn|wbn|wen CHLOROQUINE|cn|wbn|we WORKS|cn|wb|we
94 | ```
95 | without explicit factors for spaces; rather, the space between "hydroxychloroquine" and "works" is implied by the word-boundary factors.
96 | 
97 | Hence, words do not carry factors determining space directly. Rather, spacing-related factors are carried by _punctuation marks_. By default, there is always a space at word boundaries, but punctuation carries factors stating whether a space surrounding the punctuation should rather be _elided_, i.e. whether the punctuation should be _glued_ to the surrounding token(s). For example, in the sentence "Hydroxychloroquine works!", the sentence-final exclamation point is glued to the word to the left, and would be represented by the following factor tuple:
98 | ```
99 | {
100 |     lemma = "!",
101 |     glueLeft = GLUE_LEFT_YES,
102 |     glueRight = GLUE_RIGHT_NO
103 | }
104 | ```
105 | The `glueLeft` factor indicates that the default space after `works` should be elided.
106 | The short-hand forms used when writing to file are `gl+` and `gl-`, and likewise `gr+` and `gr-`. The full sequence would be encoded as:
107 | ```
108 | HYDRO|ci|wb|wen XY|cn|wbn|wen CHLOROQUINE|cn|wbn|we WORKS|cn|wb|we !|gl+|gr-
109 | ```
110 | Note that the short-hands for boolean-like factors are a little inconsistent for historical reasons. Note also that this documentation makes no claims regarding the veracity of its example sentences.
111 | 
112 | #### Round-Trippability
113 | 
114 | An important property of the factor representation is that it allows the original input text to be fully reconstructed; it is fully _round-trippable_. If we encode a text as factor tuples, and then decode it, the result will be the original input string. FactoredSegmenter is used in machine translation by training the translation system to translate text in factor representation to text in the target language that is likewise in factor representation. The final surface form is then recreated by decoding the factor representation in the target language.
115 | 
116 | There are a few exceptions to round-trippability. To support specifying specific translations for words ("phrase fixing"), FactoredSegmenter can replace token ranges by special placeholders that get translated as such. Alternatively, it can include the given target translation in the source string, using special factors or marker tags. The identity of such a token would get lost in the factored representation (instead, the translation system would remember its identity as side information). The C# API also allows replacing arbitrary character ranges on the fly (the original characters get lost).
117 | 
118 | Lastly, it should be noted that the specific factor sets depend on configuration variables. For example, empirically we found no practical benefit in the `isWordEnd` factor, so this is typically disabled by a configuration setting.
119 | 
120 | ## FactoredSegmenter in Code
121 | 
122 | FactoredSegmenter is manifested in code in two different ways. First, as a C# library that allows all functions to be executed, that is, training, encoding, and decoding. For example, each time a user invokes Microsoft Translator, e.g. via http://translate.bing.com, FactoredSegmenter is invoked via the C# interface twice, once to encode the source sentence, and once to decode the translation.
123 | 
124 | Secondly, a Linux command-line tool gives access to most of the library functions. This is used for training FactoredSegmenter models (subword representations), and it allows building offline systems using the factored-segmenter tool and Marian alone.
125 | 
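As a rough sketch of what the encode/decode path looks like in C#, the fragment below uses the types that also appear in `cli/Program.cs` (`FactoredSegmenterCoderConfig` and `FactoredSegmenterCoder`); the model path is a placeholder, and error handling is omitted:
```
using Microsoft.MT.Common.Tokenization;

// Load a trained FactoredSegmenter model (path is a placeholder).
var coder = new FactoredSegmenterCoder(new FactoredSegmenterCoderConfig
{
    ModelPath = "enu.deu.generalnn.joint.segmenter.fsm"
});

// Encode a raw sentence into space-separated factored tokens...
string encoded = string.Join(" ", coder.Encode("Hydroxychloroquine works!").TokenStrings);

// ...and decode a (translated) factored token sequence back into surface text.
string decoded = coder.Decode(encoded).ToString();
```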
126 | ## Training and Factor Configuration
127 | 
128 | The FactoredSegmenter representation is rule-based, except for the subword units, which are based on SentencePiece. Hence, before one can tokenize text with FactoredSegmenter, a _FactoredSegmenter model_ must be trained. The training process first pre-tokenizes the input into units of consistent letter type, and then executes SentencePiece training on the resulting tokens. The training process produces two files:
129 | 
130 | * an `.fsm` file, for "factored-segmenter model." An `.fsm` file contains everything needed to encode and decode. It holds all configuration options, the factor specification (which lemma has what factors), subword inventories, and also embeds the binary SentencePiece model for subword splitting.
131 | * an `.fsv` file, for "factored-segmenter vocabulary." The `.fsv` file holds the subset of the `.fsm` model that is needed by the translation software (Marian) to interpret the factor representation.
132 | 
133 | At training time, the user must specify all options regarding which factors are used.
134 | 
135 | *TODO*: To be continued, e.g. need to document continuous-script handling, combining marks, some more on numerals; also all model options and command-line arguments
136 | 
137 | ## Prerequisites
138 | 
139 | To build FactoredSegmenter, you will need to install the following dependencies:
140 | 
141 | #### Linux
142 | ```
143 | sudo apt-get install dotnet-sdk-3.1
144 | sudo apt-get install dotnet-runtime-3.1
145 | ```
146 | And you need to install SentencePiece [from source](https://github.com/google/sentencepiece#c-from-source). SentencePiece is accessed both via executing a binary and via direct invocation of the C++ library.
147 | 
148 | #### Windows
149 | ```
150 | https://dotnet.microsoft.com/download/dotnet-core/thank-you/sdk-3.1.101-windows-x64-installer
151 | ```
152 | And SentencePiece. In the Windows version, SentencePiece is presently only invoked via the SentencePiece command-line tools. It has not been tested whether the [vcpkg installation](https://github.com/google/sentencepiece#installation) works.
153 | 
154 | ## How to build
155 | 
156 | #### Linux
157 | ```
158 | cd REPO/src
159 | dotnet publish -c Release -r linux-x64 -f netcoreapp3.1 /p:PublishSingleFile=true /p:PublishTrimmed=true \
160 |     ../factored-segmenter.csproj
161 | # now you can run the binary at REPO/src/bin/Release/netcoreapp3.1/linux-x64/publish/factored-segmenter
162 | ```
163 | 
164 | #### Windows
165 | Open the `src` folder in Visual Studio 2017 or later. With 2017, it will complain that it cannot build the 3.1 SDK. F5 debugging still works (using 2.1), but you may need to hit F5 twice.
166 | 167 | ## Example command lines 168 | 169 | ### Encoding 170 | ``` 171 | pigz -d -c /data1/SpeechTrans/ENU-DEU_Student.speech/normalize_src_training_sentences/sentenceonly.src.normalized.ENU.snt.gz \ 172 | | time parallelized env LC_ALL=en_US.UTF-8 \ 173 | ~/factored-segmenter/src/bin/Release/netcoreapp3.1/linux-x64/publish/factored-segmenter encode --model ~/factored-segmenter/enu.deu.generalnn.joint.segmenter.fsm \ 174 | | pigz -c --best \ 175 | > /data1/SpeechTrans/Data/2019-12-ENU-DEU_Student/TN/TrainSingleSent/normalized.ENU.snt.fs.gz 176 | ``` 177 | ### Training 178 | ``` 179 | time env LC_ALL=en_US.UTF-8 \ 180 | ~/factored-segmenter/src/bin/Release/netcoreapp3.1/linux-x64/publish/factored-segmenter train \ 181 | --model ~/factored-segmenter/out/enu.deu.generalnn.joint.segmenter.fsm \ 182 | --distinguish-initial-and-internal-pieces --single-letter-case-factors --serialize-indices-and-unrepresentables --inline-fixes \ 183 | --min-piece-count 38 --min-char-count 2 --vocab-size 32000 \ 184 | /data1/SpeechTrans/ENU-DEU_Student.speech/train_segmenter.ENU.DEU.generalnn.joint/corpus.sampled 185 | ``` 186 | # Contributing 187 | 188 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 189 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 190 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 191 | 192 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 193 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 194 | provided by the bot. You will only need to do this once across all repos using our CLA. 195 | 196 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 197 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 198 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 199 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). 
If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /cli/Program.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 
3 | 
4 | using Common.Collections.Extensions;
5 | using Common.Utils;
6 | using Microsoft.MT.Common.Tokenization;
7 | using Microsoft.MT.Segmentation;
8 | using System;
9 | using System.Collections.Generic;
10 | using System.IO;
11 | using System.Linq;
12 | using System.Text;
13 | using System.Text.RegularExpressions;
14 | 
15 | namespace factored_segmenter
16 | {
17 |     class Program
18 |     {
19 |         /// <summary>
20 |         /// Command-line format:
21 |         ///     factored-segmenter train|encode|decode [--option]* [input file|-]
22 |         /// </summary>
23 |         static void Main(string[] args)
24 |         {
25 |             var (GetAndConsumeArg, GetArg) = IterateArgs(args);
26 |             var action = GetAndConsumeArg();
27 |             if (action != "train" && action != "encode" && action != "decode" && action != "runtests")
28 |                 BadArgument("The first argument must be 'train', 'encode', 'decode', or 'runtests'");
29 | 
30 |             // parse options
31 |             string dataOutPath = "-";
32 |             string modelPath = null;
33 |             string vocabOutputPath = null;
34 |             string fieldSeparator = null;
35 |             bool quiet = false;
36 |             FactoredSegmenterModelTrainConfig newModelConfig = new FactoredSegmenterModelTrainConfig();
37 |             while (GetArg() != null && ((GetArg().StartsWith("-") && GetArg().Length > 1) || GetArg().StartsWith("--"))) // --option, -o, and --
38 |             {
39 |                 bool GetBoolArg() // helper to parse bool options that have an optional "true" or "false" following them
40 |                     => GetArg() == null || (GetArg() != "true" && GetArg() != "false") || GetAndConsumeArg() == "true";
41 |                 var option = GetAndConsumeArg();
42 |                 // common args
43 |                 if ((option == "-o" || option == "--output") && action != "train") // output stream for encode and decode
44 |                     dataOutPath = GetAndConsumeArg();
45 |                 else if (option == "-m" || option == "--model") // model path: output for train, input for encode/decode
46 |                     modelPath = GetAndConsumeArg();
47 |                 else if ((option == "-v" || option == "--marian-vocab") && action == "train")
48 |                     vocabOutputPath = GetAndConsumeArg();
49 |                 else if (option == "--quiet") // avoid unnecessary logging
50 |                     quiet = GetBoolArg();
51 |                 else if (option == "-F") // field separator, e.g. set to "\t" to process TSV format
set to "\t" to process TSV format 52 | fieldSeparator = Regex.Unescape(GetAndConsumeArg()); // unescape so that we can pass \t 53 | // new-model args 54 | else if (option == "--right-word-glue") 55 | newModelConfig.ModelOptions.RightWordGlue = GetBoolArg(); 56 | else if (option == "--distinguish-initial-and-internal-pieces") 57 | newModelConfig.ModelOptions.DistinguishInitialAndInternalPieces = GetBoolArg(); 58 | else if (option == "--split-han") 59 | newModelConfig.ModelOptions.SplitHan = GetBoolArg(); 60 | else if (option == "--single-letter-case-factors") 61 | newModelConfig.ModelOptions.SingleLetterCaseFactors = GetBoolArg(); 62 | else if (option == "--serialize-indices-and-unrepresentables") 63 | newModelConfig.ModelOptions.SerializeIndicesAndUnrepresentables = GetBoolArg(); 64 | else if (option == "--inline-fixes") 65 | newModelConfig.ModelOptions.InlineFixes = GetBoolArg(); 66 | else if (option == "--inline-fix-use-tags") 67 | newModelConfig.ModelOptions.InlineFixUseTags = GetBoolArg(); 68 | else if (option == "--no-sentence-piece") 69 | newModelConfig.SentencePieceTrainingConfig = null; 70 | // training args 71 | else if (option == "--vocab-size" && action == "train") 72 | newModelConfig.SentencePieceTrainingConfig.VocabSize = int.Parse(GetAndConsumeArg()); 73 | else if (option == "--character_coverage" && action == "train") 74 | newModelConfig.SentencePieceTrainingConfig.CharacterCoverage = double.Parse(GetAndConsumeArg()); 75 | else if (option == "--training-sentence-size" && action == "train") 76 | newModelConfig.TrainingSentenceSize = int.Parse(GetAndConsumeArg()); 77 | else if (option == "--min-piece-count" && action == "train") 78 | newModelConfig.MinPieceCount = int.Parse(GetAndConsumeArg()); 79 | else if (option == "--min-char-count" && action == "train") 80 | newModelConfig.MinCharCount = int.Parse(GetAndConsumeArg()); 81 | // other 82 | else if (option == "--") // -- ends option processing 83 | break; 84 | else 85 | BadArgument($"Unknown option {option}"); 86 | } 87 | 88 | // parse remaining arguments (one or more input files) 89 | var inputPaths = new List(); 90 | while (GetArg() != null) 91 | inputPaths.Add(GetAndConsumeArg()); 92 | if (!inputPaths.Any()) // none given: read from stdin 93 | inputPaths.Add("-"); 94 | 95 | // open all input files 96 | var streams = from inputPath in inputPaths 97 | select inputPath != "-" ? 98 | new StreamReader(inputPath, encoding: Encoding.UTF8, detectEncodingFromByteOrderMarks: true, bufferSize: 1000000) : 99 | Console.In; 100 | 101 | if (action == "train") 102 | { 103 | if (!quiet) 104 | Log($"Creating model {modelPath} from input file(s) {" ".JoinItems(inputPaths)} ..."); 105 | if (!modelPath.EndsWith(".fsm")) // @TODO: do this inside Train() where we create the temp pathnames 106 | BadArgument($"Extension .fsm is required for model path {modelPath}"); 107 | var lines = from stream in streams 108 | from line in stream.ReadLines() 109 | select line; 110 | CreateDirectoryFor(modelPath); // @TODO: do this inside Train() 111 | var model = FactoredSegmenterModel.Train(newModelConfig, lines, sourceSentenceAnnotations: null, fsmModelPath: modelPath, spmBinDir: SentencePieceManaged.SpmBinaryDirPath); 112 | 113 | // save the model 114 | // The SentencePiece model is embedded in 'model'; it is not a separate file. 
115 |                 model.Save(modelPath);
116 |                 if (!quiet)
117 |                     Log($"Model file written to {modelPath}");
118 | 
119 |                 // save the vocab for Marian consumption
120 |                 if (model.FactorSpec != null && vocabOutputPath != null)
121 |                 {
122 |                     File.WriteAllLines(vocabOutputPath, model.FactorSpec, new UTF8Encoding(encoderShouldEmitUTF8Identifier: false));
123 |                     if (!quiet)
124 |                         Log($"Marian vocabulary file written to {vocabOutputPath}");
125 |                 }
126 |             }
127 |             else if (action == "encode" || action == "decode")
128 |             {
129 |                 if (!quiet)
130 |                     Log($"Processing input file(s) {" ".JoinItems(inputPaths)} with model {modelPath} ...");
131 |                 var lines = from stream in streams.ToList() // ToList() eagerly opens all streams, to test upfront if all files are found
132 |                             from line in stream.ReadLines()
133 |                             select line;
134 |                 newModelConfig.ModelOptions.UseSentencePiece = false;
135 |                 var coderConfig = modelPath != null ?
136 |                     new FactoredSegmenterCoderConfig
137 |                     {
138 |                         ModelPath = modelPath
139 |                     } :
140 |                     new FactoredSegmenterCoderConfig // no model specified: use untrained virgin model (without SentencePiece)
141 |                     {
142 |                         Model = new FactoredSegmenterModel(newModelConfig.ModelOptions)
143 |                     };
144 |                 var coder = new FactoredSegmenterCoder(coderConfig);
145 | 
146 |                 // write loop
147 |                 if (!quiet)
148 |                     Log($"Writing processed lines to {dataOutPath} ...");
149 |                 CreateDirectoryFor(dataOutPath);
150 |                 var outStream = dataOutPath != "-" ? // open output stream (UTF-8 without BOM)
151 |                     new StreamWriter(dataOutPath, append: false, encoding: new UTF8Encoding(encoderShouldEmitUTF8Identifier: false), bufferSize: 1000000) :
152 |                     Console.Out;
153 |                 var linesProcessed = 0;
154 |                 string ProcessLine(string line)
155 |                 {
156 |                     try
157 |                     {
158 |                         return action == "encode" ?
159 |                             " ".JoinItems(coder.Encode(line).TokenStrings) : // encode
160 |                             coder.Decode(line).ToString(); // decode
161 |                     }
162 |                     catch (Exception e)
163 |                     {
164 |                         Log($"Failed to {action} input: {line}");
165 |                         Log($"Exception: {e.ToString()}");
166 |                         return ""; // back off to empty string, so that we can continue
167 |                     }
168 |                 }
169 |                 foreach (var line in lines)
170 |                 {
171 |                     string processedLine = fieldSeparator == null ?
172 |                         ProcessLine(line) :
173 |                         fieldSeparator.JoinItems(from field in line.Split(fieldSeparator) select ProcessLine(field));
174 |                     //Log($"{command} IN: {line} --> OUT: {processedLine}");
175 |                     outStream.WriteLine(processedLine);
176 |                     // @BUGBUG: Write errors are not caught, at least when writing to a pipe via stdout.
177 |                     linesProcessed++;
178 |                     if (!quiet && linesProcessed % 1000000 == 0)
179 |                         Log($"Completed processing of {linesProcessed:#,##0} lines so far.");
180 |                 }
181 |                 if (!quiet)
182 |                     Log($"Completed processing of {linesProcessed:#,##0} lines.");
183 | 
184 |                 outStream.Flush(); // hoping to elicit an exception in case flushing fails
185 |                 outStream.Close();
186 |             }
187 |             // @TODO: disabled for now since the tests don't build under Linux
188 |             //else if (action == "runtests")
189 |             //{
190 |             //    // This is for easier testing when debugging environment does not support tests.
191 |             //    // This must be manually maintained.
192 |             //    var tests = new TextSegmentation.Segmenter.FactoredSegmenter_GitSubmodule.src.Test.FactoredSegmenterTests();
193 |             //    tests.ReversibilityAndBasicBreakingTests();
194 |             //    tests.DecodeIntoConsecutiveSegmentsTest();
195 |             //    //tests.ReversibilityAndBasicBreakingTestsOnNaughtyData(); // fails in standalone build because data file is our other repo
196 |             //    tests.RunTraining();
197 |             //    var tests1 = new TextSegmentation.Segmenter.FactoredSegmenter_GitSubmodule.src.Test.FactoredSegmenterScriptHelperTests();
198 |             //    tests1.ScriptEdgeCasesTest();
199 |             //    tests1.ClassificationEdgeCaseTests();
200 |             //}
201 |         }
202 | 
203 |         static void Log(string what) => Logger.WriteLine(string.Format("{0:yyyy/MM/dd HH:mm:ss.fff} factored-segmenter: ", DateTime.Now) + what);
204 | 
205 |         static void BadArgument(string what)
206 |         {
207 |             Log(what);
208 |             Environment.Exit(1);
209 |         }
210 | 
211 |         static (Func<string> GetAndConsumeArg, Func<string> GetArg) IterateArgs(string[] args)
212 |         {
213 |             var e = args.GetEnumerator();
214 |             var b = e.MoveNext();
215 |             return (GetAndConsumeArg: () =>
216 |             {
217 |                 if (!b)
218 |                     BadArgument("At least one more argument was expected.");
219 |                 var res = e.Current as string;
220 |                 b = e.MoveNext(); // b is captured by both closures, so this persists across calls
221 |                 return res;
222 |             },
223 |             GetArg: () => b ? e.Current as string : null);
224 |         }
225 | 
226 |         static void CreateDirectoryFor(string filePath)
227 |         {
228 |             if (filePath == "-")
229 |                 return;
230 |             var dirName = Path.GetDirectoryName(filePath);
231 |             if (dirName != "")
232 |                 Directory.CreateDirectory(dirName);
233 |         }
234 |     }
235 | }
236 | 
--------------------------------------------------------------------------------
/cli/README.txt:
--------------------------------------------------------------------------------
1 | This directory contains the source files to implement the command-line wrapper
2 | around FactoredSegmenter. They are only used in the standalone build (i.e.
3 | FactoredSegmenter as a standalone command-line tool).
4 | 
--------------------------------------------------------------------------------
/factored-segmenter.csproj:
--------------------------------------------------------------------------------
1 | <Project Sdk="Microsoft.NET.Sdk">
2 | 
3 |   <PropertyGroup>
4 |     <OutputType>Exe</OutputType>
5 |     <TargetFramework>netcoreapp3.1</TargetFramework>
6 |     <RootNamespace>factored_segmenter</RootNamespace>
7 |   </PropertyGroup>
8 | 
9 | 
10 | 
11 | 
12 | 
13 | 
14 | 
15 | 
16 | </Project>
--------------------------------------------------------------------------------
/factored-segmenter.sln:
--------------------------------------------------------------------------------
1 | 
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 15
4 | VisualStudioVersion = 15.0.28307.852
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "factored-segmenter", "factored-segmenter.csproj", "{ADC50319-8AB7-415A-A501-A65073B281E4}"
7 | EndProject
8 | Global
9 | 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | 		Debug|Any CPU = Debug|Any CPU
11 | 		Release|Any CPU = Release|Any CPU
12 | 	EndGlobalSection
13 | 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | 		{ADC50319-8AB7-415A-A501-A65073B281E4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15 | 		{ADC50319-8AB7-415A-A501-A65073B281E4}.Debug|Any CPU.Build.0 = Debug|Any CPU
16 | 		{ADC50319-8AB7-415A-A501-A65073B281E4}.Release|Any CPU.ActiveCfg = Release|Any CPU
17 | 		{ADC50319-8AB7-415A-A501-A65073B281E4}.Release|Any CPU.Build.0 = Release|Any CPU
18 | 	EndGlobalSection
19 | 	GlobalSection(SolutionProperties) = preSolution
20 | 		HideSolutionNode = FALSE
21 | 	EndGlobalSection
22 | 	GlobalSection(ExtensibilityGlobals) = postSolution
23 | 		SolutionGuid = {B3B448B5-6719-445D-B09A-A781B80286B9}
24 | 	EndGlobalSection
25 | EndGlobal
26 | 
--------------------------------------------------------------------------------
/lib/README.txt:
--------------------------------------------------------------------------------
1 | This directory contains support libraries that are used by the main
2 | FactoredSegmenter sources in the src/ directory.
3 | 
4 | The standalone command-line tool build uses these libraries here.
5 | 
6 | The production build uses a different version of this library, which is
7 | included in our production environment, and is proprietary. The files in this
8 | directory contain a subset of those production libraries that implements only
9 | those classes and methods that are used by the standalone build, sometimes in
10 | greatly simplified versions.
--------------------------------------------------------------------------------
/lib/RemoveThese.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT license.
3 | 
4 | // This file contains stubs to things that are referenced but not actually needed.
5 | // These should be fixed in the shared code, e.g. by moving out the parts that
6 | // reference these into a separate source file.
7 | 
8 | using Common.MT.Segments;
9 | using System.Collections.Generic;
10 | 
11 | // @TODO: find out how to install this correctly in dotnet 2.x and 3.x. Then delete this.
12 | namespace Microsoft.VisualStudio.TestTools.UnitTesting
13 | {
14 |     public static class Assert
15 |     {
16 |         public static void IsTrue(bool condition) { System.Diagnostics.Debug.Assert(condition); }
17 |         public static void IsFalse(bool condition) { IsTrue(!condition); }
18 |         public static void AreEqual<T>(T a, T b) { IsTrue(a.Equals(b)); } // @TODO: correct?
19 |     }
20 | }
21 | namespace Microsoft.VisualStudio.TestTools.UnitTesting
22 | {
23 |     using System;
24 | 
25 |     [AttributeUsage(AttributeTargets.Class, AllowMultiple = false)]
26 |     public sealed class TestClassAttribute : System.Attribute
27 |     {
28 |         public TestClassAttribute() { }
29 |     }
30 |     [AttributeUsage(AttributeTargets.Assembly | AttributeTargets.Class | AttributeTargets.Struct | AttributeTargets.Constructor | AttributeTargets.Method | AttributeTargets.Property | AttributeTargets.Event, Inherited = false, AllowMultiple = false)]
31 |     public sealed class TestMethodAttribute : Attribute
32 |     {
33 |         public TestMethodAttribute() { }
34 |     }
35 |     [AttributeUsage(AttributeTargets.Class | AttributeTargets.Method, AllowMultiple = true)]
36 |     public sealed class DeploymentItemAttribute : Attribute
37 |     {
38 |         public DeploymentItemAttribute(string path, string outputDirectory) { }
39 |     }
40 | }
41 | 
--------------------------------------------------------------------------------
/lib/SentencePieceInterop.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT license.
3 | 
4 | // Wrapper around the SentencePiece runtime library.
5 | // This is currently emulated by a process-based interface,
6 | // until a real P/invoke implementation is completed.
7 | 
8 | using System;
9 | using System.IO;
10 | using System.Text;
11 | using System.Collections.Concurrent;
12 | using static Common.Utils.ProcessTool;
13 | using Common.Contracts;
14 | using Common.Collections.Extensions;
15 | using System.Linq;
16 | using Common.Utils;
17 | using System.Collections.Generic;
18 | using System.Runtime.InteropServices;
19 | 
20 | namespace Microsoft.MT.Segmentation
21 | {
22 |     public class SentencePieceManaged // : IDisposable
23 |     {
24 |         static readonly string spmBinaryDirPathLinux = "/usr/local/bin/";
25 |         static readonly string spmBinaryDirPathWindows = @"c:\work\mtmain\target\Retail\amd64\Tokenization\";
26 | 
27 |         public static string SpmBinaryDirPath =>
28 |             RuntimeInformation.IsOSPlatform(OSPlatform.Windows) ? spmBinaryDirPathWindows : spmBinaryDirPathLinux;
29 | 
30 |         HashSet<string> m_vocabulary;
31 |         readonly string m_tempModelPath;
32 |         readonly string m_tempVocabPath;
33 |         readonly ConcurrentQueue<ProcessPipe> m_serverPool;
34 |         public SentencePieceManaged(string loadMe, string[] vocabulary)
35 |         {
36 |             m_tempModelPath = Path.GetTempFileName();
37 |             m_tempVocabPath = Path.GetTempFileName();
38 |             m_serverPool = new ConcurrentQueue<ProcessPipe>(); // pool of SPM helper processes. We need multiple if running multi-threaded.
39 |             m_vocabulary = vocabulary?.ToHashSet();
40 |             // save to file during the lifetime of this object
41 |             File.WriteAllBytes(m_tempModelPath, File.ReadAllBytes(loadMe));
42 |             if (vocabulary != null)
43 |                 File.WriteAllLines(m_tempVocabPath, vocabulary, encoding: new UTF8Encoding(encoderShouldEmitUTF8Identifier: false));
44 |         }
45 | 
46 |         // @TODO: get destruction right, delete the temp files
47 |         //public sealed override void Dispose()
48 |         //{
49 |         //    //Dispose(true);
50 |         //    GC.SuppressFinalize(this);
51 |         //}
52 |         //[HandleProcessCorruptedStateExceptions]
53 |         //protected virtual void Dispose(bool A_0) { }
54 | 
55 |         // This is the only interface into SPM used by FactoredSegmenter.
56 |         // It determines the split points where SPM would split.
57 |         // @TODO: change return type to IList<int>, which will save one operation in this build, while costing nothing in MTMAIN
58 |         public int[] GetSplitPoints(string segmentMe)
59 |         {
60 |             if (segmentMe.Length <= 1) // nothing to split. This includes space, which is SPM's break symbol, and should not be sent.
61 |                 return null;
62 |             // obtain a server process if available, or create a new one if all are in use
63 |             if (!m_serverPool.TryDequeue(out var processPipe))
64 |             {
65 |                 var argv = new List<string> { SpmBinaryDirPath + "spm_encode", "--model", m_tempModelPath };
66 |                 if (m_vocabulary != null)
67 |                     argv.AddRange(new List<string> { "--vocabulary", m_tempVocabPath });
68 |                 Logger.WriteLine($"starting SentencePiece instance as: {" ".JoinItems(argv)}");
69 |                 processPipe = new ProcessPipe(argv, envirVariables: new Dictionary<string, string> { { "LC_ALL", "en_US.UTF-8" } });
70 |                 // @TODO: do we need the environment variable for spm_encode?
71 |             }
72 |             //Logger.WriteLine($"SPM-encoding word {segmentMe}");
73 |             processPipe.process.StandardInput.WriteLine(segmentMe); // @TODO: how do we know/ensure this is UTF-8?
74 |             var encodedWord = processPipe.process.StandardOutput.ReadLine();
75 |             Sanity.Requires(encodedWord != null, "spm_encode unexpectedly terminated");
76 |             // return the process back into the pool
77 |             m_serverPool.Enqueue(processPipe);
78 | 
79 |             var pieces = encodedWord.Split(' ', options: StringSplitOptions.RemoveEmptyEntries);
80 |             if ("".JoinItems(pieces) != segmentMe)
81 |             {
82 |                 Logger.WriteLine($"ignoring word: SentencePiece did not just split the word ('{segmentMe}', -> '{" ".JoinItems(pieces)}')");
83 |                 return null;
84 |             }
85 | 
86 |             // create array of segmentation points
87 |             // E.g. if "abcde" got broken into "ab cde", then we return the split points (0, 2, 5).
88 |             // This code handles the special case of OOV pieces.
89 |             // E.g. if there is no '+' in the SentencePiece vocab, then spm_encode will keep
90 |             // it as '++++'. We must break those up into individual pieces.
91 |             List<int> res = null; // (created lazily)
92 |             int n = 0; // accumulator for split points
93 |             for (int i = 0; i < pieces.Length; i++)
94 |             {
95 |                 var piece = pieces[i];
96 |                 if (m_vocabulary == null || m_vocabulary.Contains(piece))
97 |                 {
98 |                     n += piece.Length;
99 |                     if (n < segmentMe.Length || res != null) // (in the frequent special case of an unbroken single token, we return null for efficiency)
100 |                     {
101 |                         if (res == null)
102 |                             res = new List<int> { 0, n };
103 |                         else
104 |                             res.Add(n);
105 |                     }
106 |                 }
107 |                 else // special case: OOV. Break at each character.
108 |                     for (int j = 0; j < piece.Length; /*j += n*/)
109 |                     {
110 |                         // length of this piece is 1 Unicode character. Surrogate pairs are 2 characters in C#'s UCS-2 encoding.
111 |                         var ucs2Len = (char.IsHighSurrogate(piece[j]) && j + 2 <= piece.Length) ? 2 : 1;
112 |                         n += ucs2Len;
113 |                         j += ucs2Len;
114 |                         if (n < segmentMe.Length || res != null)
115 |                         {
116 |                             if (res == null)
117 |                                 res = new List<int> { 0, n };
118 |                             else
119 |                                 res.Add(n);
120 |                         }
121 |                     }
122 |             }
123 |             return res?.ToArray();
124 |         }
125 |         public string[] Segment(string segmentMe) { throw new NotImplementedException("Segment() not implemented in this build."); }
126 |         public string Unsegment(string[] unsegmentMe) { throw new NotImplementedException("Unsegment() not implemented in this build."); }
127 | 
128 |         //public static bool IsHighSurrogate(char c) { return true; }
129 |     }
130 | }
131 | 
--------------------------------------------------------------------------------
/lib/TextHelpers.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT license.
3 | 
4 | using System;
5 | using System.Collections.Generic;
6 | using System.Globalization;
7 | using System.Linq;
8 | 
9 | namespace Common.Text
10 | {
11 |     /// <summary>Helper functions for Unicode</summary>
12 |     public static class Unicode
13 |     {
14 |         /// <summary>
15 |         /// Translate the UnicodeCategory into the two-letter Unicode-designation representation
16 |         /// </summary>
17 |         public static string GetUnicodeDesignation(this char c)
18 |         {
19 |             // derived from UnicodeCategory enum, which has these strings in the comment
20 |             switch (CharUnicodeInfo.GetUnicodeCategory(c))
21 |             {
22 |                 case UnicodeCategory.UppercaseLetter: return "Lu"; // (letter, uppercase)
23 |                 case UnicodeCategory.LowercaseLetter: return "Ll"; // (letter, lowercase)
24 |                 case UnicodeCategory.TitlecaseLetter: return "Lt"; // (letter, titlecase)
25 |                 case UnicodeCategory.ModifierLetter: return "Lm"; // (letter, modifier)
26 |                 case UnicodeCategory.OtherLetter: return "Lo"; // (letter, other)
27 |                 case UnicodeCategory.NonSpacingMark: return "Mn"; // (mark, nonspacing) combined with another and so not consuming additional horizontal space
28 |                 case UnicodeCategory.SpacingCombiningMark: return "Mc"; // (mark, spacing combining)
29 |                 case UnicodeCategory.EnclosingMark: return "Me"; // (mark, enclosing)
30 |                 case UnicodeCategory.DecimalDigitNumber: return "Nd"; // (number, decimal digit)
31 |                 case UnicodeCategory.LetterNumber: return "Nl"; // (number, letter)
32 |                 case UnicodeCategory.OtherNumber: return "No"; // (number other)
33 |                 case UnicodeCategory.SpaceSeparator: return "Zs"; // (separator, space)
34 |                 case UnicodeCategory.LineSeparator: return "Zl"; // (separator, line)
35 |                 case UnicodeCategory.ParagraphSeparator: return "Zp"; // (separator, paragraph)
36 |                 case UnicodeCategory.Control: return "Cc"; // (other, control)
37 |                 case UnicodeCategory.Format: return "Cf"; // (other format)
38 |                 case UnicodeCategory.Surrogate: return "Cs"; // (other surrogate)
39 |                 case UnicodeCategory.PrivateUse: return "Co"; // (other, private use)
40 |                 case UnicodeCategory.ConnectorPunctuation: return "Pc"; // (punctuation, connector)
41 |                 case UnicodeCategory.DashPunctuation: return "Pd"; // (punctuation dash)
42 |                 case UnicodeCategory.OpenPunctuation: return "Ps"; // (punctuation open)
43 |                 case UnicodeCategory.ClosePunctuation: return "Pe"; // (punctuation close)
44 |                 case UnicodeCategory.InitialQuotePunctuation: return "Pi"; // (punctuation, initial quote)
45 |                 case UnicodeCategory.FinalQuotePunctuation: return "Pf"; // (punctuation, final quote)
46 |                 case UnicodeCategory.OtherPunctuation: return "Po"; // (punctuation, other)
47 |                 case UnicodeCategory.MathSymbol: return "Sm"; // (symbol, math)
48 |                 case UnicodeCategory.CurrencySymbol: return "Sc"; // (symbol currency)
UnicodeCategory.CurrencySymbol: return "Sc"; // (symbol currency) 49 | case UnicodeCategory.ModifierSymbol: return "Sk"; // (symbol, modifier) 50 | case UnicodeCategory.OtherSymbol: return "So"; // (symbol, other) 51 | case UnicodeCategory.OtherNotAssigned: return "Cn"; // (other, not assigned) 52 | default: throw new ArgumentOutOfRangeException(); 53 | } 54 | } 55 | 56 | /// 57 | /// Translate the UnicodeCategory into the one-letter major Unicode-designation representation 58 | /// @TODO: Find a way to handle surrogate pairs 59 | /// 60 | public static char GetUnicodeMajorDesignation(this char c) => GetUnicodeDesignation(c)[0]; 61 | 62 | /// 63 | /// Test whether a script is continuous (not written with spaces). 64 | /// This informs FactoredSegmenter which factor to use for word/segment boundaries, 65 | /// which affects which rules the system learns regarding inserting spaces. 66 | /// 67 | public static bool IsContinuousScript(this char c) 68 | { 69 | var script = GetScript(c); 70 | return script == Script.Han || 71 | script == Script.Hiragana || script == Script.Katakana || 72 | script == Script.Thai; 73 | } 74 | 75 | /// Names of Unicode Scripts. Scripts are set of chars like Arabic, Latin, Cyrillic, etc 76 | [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Design", "CA1034:NestedTypesShouldNotBeVisible"), System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Design", "CA1028:EnumStorageShouldBeInt32", Justification = "Use Byte for Enum to optimize storage of 65Kb array")] 77 | public enum Script : byte 78 | { 79 | None, // not a script (i.e. "null" script value) 80 | Arabic, 81 | Common, // commonly used in more than one script 82 | Cyrillic, 83 | Devanagari, 84 | Greek, 85 | Han, 86 | Hangul, 87 | Hebrew, 88 | Hiragana, 89 | Inherited, // considered to have the same script as that of the preceding character 90 | Katakana, 91 | Latin, 92 | Thai, 93 | Unknown // valid Unicode codepoint with unknown script (per Unicode spec) 94 | } 95 | /// 96 | /// Returns the value for the given Unicode code point 97 | /// 98 | /// The Unicode code point of the desired character 99 | /// The value for the given Unicode code point 100 | private static Script GetValue(int chUtf32) 101 | { 102 | if (chUtf32 < s_ScriptByChar.Length) 103 | return s_ScriptByChar[chUtf32]; 104 | else 105 | return Script.Unknown; 106 | } 107 | /// Returns a script of a given character 108 | /// The character to check 109 | /// The value for the given character 110 | public static Script GetScript(char value) 111 | { 112 | // @BUGBUG: This interface is flawed. We must handle surrogate pairs correctly. 113 | return !char.IsSurrogate(value) ? 114 | GetValue(value) : Script.None; 115 | } 116 | 117 | private static Script[] s_ScriptByChar; // [unicode code point] -> Script 118 | static Unicode() 119 | { 120 | // initialize the static script-mapping table 121 | // process for getting this (only doing this once, so not writing a script for it): 122 | // - run MTMAIN Common.Text.Unicode.Scripts.Create() and get result of GetScriptRanges(). 123 | // - replace Name= by Name=Script., replace DynamicValueStart by a number 124 | // - format as a 3-column table, e.g. 
"0 31 Script.Common" 125 | // - write to a file 126 | // - delete all numeric scripts, collate consecutive ranges 127 | // grep -v 'Script.[0-9]' d:\me\x1 | sort -n | gawk "{if ($1==P2+1 && P3==$3) {P2=$2} else {print P1, P2, P3; P1=$1;P2=$2;P3=$3}}END{print P1, P2, P3}" | grep Script | gawk '{if (NR%4 == 0) {print ""}; printf("(%i,%i,%s), ", $1, $2, $3)}' | clip 128 | var ranges = new (int min, int max, Script script)[] 129 | { 130 | (0,64,Script.Common), (65,90,Script.Latin), (91,96,Script.Common), 131 | (97,122,Script.Latin), (123,169,Script.Common), (170,170,Script.Latin), (171,185,Script.Common), 132 | (186,186,Script.Latin), (187,191,Script.Common), (192,214,Script.Latin), (215,215,Script.Common), 133 | (216,246,Script.Latin), (247,247,Script.Common), (248,696,Script.Latin), (697,735,Script.Common), 134 | (736,740,Script.Latin), (741,745,Script.Common), (748,767,Script.Common), (768,879,Script.Inherited), 135 | (880,883,Script.Greek), (884,884,Script.Common), (885,887,Script.Greek), (890,893,Script.Greek), 136 | (894,894,Script.Common), (900,900,Script.Greek), (901,901,Script.Common), (902,902,Script.Greek), 137 | (903,903,Script.Common), (904,906,Script.Greek), (908,908,Script.Greek), (910,929,Script.Greek), 138 | (931,993,Script.Greek), (1008,1023,Script.Greek), (1024,1156,Script.Cyrillic), (1157,1158,Script.Inherited), 139 | (1159,1319,Script.Cyrillic), (1417,1417,Script.Common), (1425,1479,Script.Hebrew), (1488,1514,Script.Hebrew), 140 | (1520,1524,Script.Hebrew), (1536,1540,Script.Arabic), (1542,1547,Script.Arabic), (1548,1548,Script.Common), 141 | (1549,1562,Script.Arabic), (1563,1563,Script.Common), (1566,1566,Script.Arabic), (1567,1567,Script.Common), 142 | (1568,1599,Script.Arabic), (1600,1600,Script.Common), (1601,1610,Script.Arabic), (1611,1621,Script.Inherited), 143 | (1622,1631,Script.Arabic), (1632,1641,Script.Common), (1642,1647,Script.Arabic), (1648,1648,Script.Inherited), 144 | (1649,1756,Script.Arabic), (1757,1757,Script.Common), (1758,1791,Script.Arabic), (1872,1919,Script.Arabic), 145 | (2208,2208,Script.Arabic), (2210,2220,Script.Arabic), (2276,2302,Script.Arabic), (2304,2384,Script.Devanagari), 146 | (2385,2386,Script.Inherited), (2387,2403,Script.Devanagari), (2404,2405,Script.Common), (2406,2423,Script.Devanagari), 147 | (2425,2431,Script.Devanagari), (3585,3642,Script.Thai), (3647,3647,Script.Common), (3648,3675,Script.Thai), 148 | (4053,4056,Script.Common), (4347,4347,Script.Common), (4352,4607,Script.Hangul), (5867,5869,Script.Common), 149 | (5941,5942,Script.Common), (6146,6147,Script.Common), (6149,6149,Script.Common), (7376,7378,Script.Inherited), 150 | (7379,7379,Script.Common), (7380,7392,Script.Inherited), (7393,7393,Script.Common), (7394,7400,Script.Inherited), 151 | (7401,7404,Script.Common), (7405,7405,Script.Inherited), (7406,7411,Script.Common), (7412,7412,Script.Inherited), 152 | (7413,7414,Script.Common), (7424,7461,Script.Latin), (7462,7466,Script.Greek), (7467,7467,Script.Cyrillic), 153 | (7468,7516,Script.Latin), (7517,7521,Script.Greek), (7522,7525,Script.Latin), (7526,7530,Script.Greek), 154 | (7531,7543,Script.Latin), (7544,7544,Script.Cyrillic), (7545,7614,Script.Latin), (7615,7615,Script.Greek), 155 | (7616,7654,Script.Inherited), (7676,7679,Script.Inherited), (7680,7935,Script.Latin), (7936,7957,Script.Greek), 156 | (7960,7965,Script.Greek), (7968,8005,Script.Greek), (8008,8013,Script.Greek), (8016,8023,Script.Greek), 157 | (8025,8025,Script.Greek), (8027,8027,Script.Greek), (8029,8029,Script.Greek), (8031,8061,Script.Greek), 158 | 
(8064,8116,Script.Greek), (8118,8132,Script.Greek), (8134,8147,Script.Greek), (8150,8155,Script.Greek), 159 | (8157,8175,Script.Greek), (8178,8180,Script.Greek), (8182,8190,Script.Greek), (8192,8203,Script.Common), 160 | (8204,8205,Script.Inherited), (8206,8292,Script.Common), (8298,8304,Script.Common), (8305,8305,Script.Latin), 161 | (8308,8318,Script.Common), (8319,8319,Script.Latin), (8320,8334,Script.Common), (8336,8348,Script.Latin), 162 | (8352,8378,Script.Common), (8400,8432,Script.Inherited), (8448,8485,Script.Common), (8486,8486,Script.Greek), 163 | (8487,8489,Script.Common), (8490,8491,Script.Latin), (8492,8497,Script.Common), (8498,8498,Script.Latin), 164 | (8499,8525,Script.Common), (8526,8526,Script.Latin), (8527,8543,Script.Common), (8544,8584,Script.Latin), 165 | (8585,8585,Script.Common), (8592,9203,Script.Common), (9216,9254,Script.Common), (9280,9290,Script.Common), 166 | (9312,9983,Script.Common), (9985,10239,Script.Common), (10496,11084,Script.Common), (11088,11097,Script.Common), 167 | (11360,11391,Script.Latin), (11744,11775,Script.Cyrillic), (11776,11835,Script.Common), (11904,11929,Script.Han), 168 | (11931,12019,Script.Han), (12032,12245,Script.Han), (12272,12283,Script.Common), (12288,12292,Script.Common), 169 | (12293,12293,Script.Han), (12294,12294,Script.Common), (12295,12295,Script.Han), (12296,12320,Script.Common), 170 | (12321,12329,Script.Han), (12330,12333,Script.Inherited), (12334,12335,Script.Hangul), (12336,12343,Script.Common), 171 | (12344,12347,Script.Han), (12348,12351,Script.Common), (12353,12438,Script.Hiragana), (12441,12442,Script.Inherited), 172 | (12443,12444,Script.Common), (12445,12447,Script.Hiragana), (12448,12448,Script.Common), (12449,12538,Script.Katakana), 173 | (12539,12540,Script.Common), (12541,12543,Script.Katakana), (12593,12686,Script.Hangul), (12688,12703,Script.Common), 174 | (12736,12771,Script.Common), (12784,12799,Script.Katakana), (12800,12830,Script.Hangul), (12832,12895,Script.Common), 175 | (12896,12926,Script.Hangul), (12927,13007,Script.Common), (13008,13054,Script.Katakana), (13056,13143,Script.Katakana), 176 | (13144,13311,Script.Common), (13312,19893,Script.Han), (19904,19967,Script.Common), (19968,40908,Script.Han), 177 | (42560,42647,Script.Cyrillic), (42655,42655,Script.Cyrillic), (42752,42785,Script.Common), (42786,42887,Script.Latin), 178 | (42888,42890,Script.Common), (42891,42894,Script.Latin), (42896,42899,Script.Latin), (42912,42922,Script.Latin), 179 | (43000,43007,Script.Latin), (43056,43065,Script.Common), (43232,43259,Script.Devanagari), (43360,43388,Script.Hangul), 180 | (44032,55203,Script.Hangul), (55216,55238,Script.Hangul), (55243,55291,Script.Hangul), (63744,64109,Script.Han), 181 | (64112,64217,Script.Han), (64256,64262,Script.Latin), (64285,64310,Script.Hebrew), (64312,64316,Script.Hebrew), 182 | (64318,64318,Script.Hebrew), (64320,64321,Script.Hebrew), (64323,64324,Script.Hebrew), (64326,64335,Script.Hebrew), 183 | (64336,64449,Script.Arabic), (64467,64829,Script.Arabic), (64830,64831,Script.Common), (64848,64911,Script.Arabic), 184 | (64914,64967,Script.Arabic), (65008,65020,Script.Arabic), (65021,65021,Script.Common), (65024,65039,Script.Inherited), 185 | (65040,65049,Script.Common), (65056,65062,Script.Inherited), (65072,65106,Script.Common), (65108,65126,Script.Common), 186 | (65128,65131,Script.Common), (65136,65140,Script.Arabic), (65142,65276,Script.Arabic), (65279,65279,Script.Common), 187 | (65281,65312,Script.Common), (65313,65338,Script.Latin), (65339,65344,Script.Common), 
(65345,65370,Script.Latin), 188 | (65371,65381,Script.Common), (65382,65391,Script.Katakana), (65392,65392,Script.Common), (65393,65437,Script.Katakana), 189 | (65438,65439,Script.Common), (65440,65470,Script.Hangul), (65474,65479,Script.Hangul), (65482,65487,Script.Hangul), 190 | (65490,65495,Script.Hangul), (65498,65500,Script.Hangul), (65504,65510,Script.Common), (65512,65518,Script.Common), 191 | (65529,65533,Script.Common), (65792,65794,Script.Common), (65799,65843,Script.Common), (65847,65855,Script.Common), 192 | (65856,65930,Script.Greek), (65936,65947,Script.Common), (66000,66044,Script.Common), (66045,66045,Script.Inherited), 193 | (69216,69246,Script.Arabic), (110592,110592,Script.Katakana), (110593,110593,Script.Hiragana), (118784,119029,Script.Common), 194 | (119040,119078,Script.Common), (119081,119142,Script.Common), (119143,119145,Script.Inherited), (119146,119162,Script.Common), 195 | (119163,119170,Script.Inherited), (119171,119172,Script.Common), (119173,119179,Script.Inherited), (119180,119209,Script.Common), 196 | (119210,119213,Script.Inherited), (119214,119261,Script.Common), (119296,119365,Script.Greek), (119552,119638,Script.Common), 197 | (119648,119665,Script.Common), (119808,119892,Script.Common), (119894,119964,Script.Common), (119966,119967,Script.Common), 198 | (119970,119970,Script.Common), (119973,119974,Script.Common), (119977,119980,Script.Common), (119982,119993,Script.Common), 199 | (119995,119995,Script.Common), (119997,120003,Script.Common), (120005,120069,Script.Common), (120071,120074,Script.Common), 200 | (120077,120084,Script.Common), (120086,120092,Script.Common), (120094,120121,Script.Common), (120123,120126,Script.Common), 201 | (120128,120132,Script.Common), (120134,120134,Script.Common), (120138,120144,Script.Common), (120146,120485,Script.Common), 202 | (120488,120779,Script.Common), (120782,120831,Script.Common), (126464,126467,Script.Arabic), (126469,126495,Script.Arabic), 203 | (126497,126498,Script.Arabic), (126500,126500,Script.Arabic), (126503,126503,Script.Arabic), (126505,126514,Script.Arabic), 204 | (126516,126519,Script.Arabic), (126521,126521,Script.Arabic), (126523,126523,Script.Arabic), (126530,126530,Script.Arabic), 205 | (126535,126535,Script.Arabic), (126537,126537,Script.Arabic), (126539,126539,Script.Arabic), (126541,126543,Script.Arabic), 206 | (126545,126546,Script.Arabic), (126548,126548,Script.Arabic), (126551,126551,Script.Arabic), (126553,126553,Script.Arabic), 207 | (126555,126555,Script.Arabic), (126557,126557,Script.Arabic), (126559,126559,Script.Arabic), (126561,126562,Script.Arabic), 208 | (126564,126564,Script.Arabic), (126567,126570,Script.Arabic), (126572,126578,Script.Arabic), (126580,126583,Script.Arabic), 209 | (126585,126588,Script.Arabic), (126590,126590,Script.Arabic), (126592,126601,Script.Arabic), (126603,126619,Script.Arabic), 210 | (126625,126627,Script.Arabic), (126629,126633,Script.Arabic), (126635,126651,Script.Arabic), (126704,126705,Script.Arabic), 211 | (126976,127019,Script.Common), (127024,127123,Script.Common), (127136,127150,Script.Common), (127153,127166,Script.Common), 212 | (127169,127183,Script.Common), (127185,127199,Script.Common), (127232,127242,Script.Common), (127248,127278,Script.Common), 213 | (127280,127339,Script.Common), (127344,127386,Script.Common), (127462,127487,Script.Common), (127488,127488,Script.Hiragana), 214 | (127489,127490,Script.Common), (127504,127546,Script.Common), (127552,127560,Script.Common), (127568,127569,Script.Common), 215 | 
(127744,127776,Script.Common), (127792,127797,Script.Common), (127799,127868,Script.Common), (127872,127891,Script.Common), 216 | (127904,127940,Script.Common), (127942,127946,Script.Common), (127968,127984,Script.Common), (128000,128062,Script.Common), 217 | (128064,128064,Script.Common), (128066,128247,Script.Common), (128249,128252,Script.Common), (128256,128317,Script.Common), 218 | (128320,128323,Script.Common), (128336,128359,Script.Common), (128507,128576,Script.Common), (128581,128591,Script.Common), 219 | (128640,128709,Script.Common), (128768,128883,Script.Common), (131072,173782,Script.Han), (173824,177972,Script.Han), 220 | (177984,178205,Script.Han), (194560,195101,Script.Han), (917505,917505,Script.Common), (917536,917631,Script.Common), 221 | (917760,917999,Script.Inherited) 222 | }; 223 | s_ScriptByChar = Enumerable.Repeat(Script.Unknown, ranges.Last().max + 1).ToArray(); 224 | foreach (var range in ranges) 225 | for (int i = range.min; i <= range.max; i++) 226 | s_ScriptByChar[i] = range.script; 227 | } 228 | } 229 | } 230 | -------------------------------------------------------------------------------- /lib/Types.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | // This file contains a collection of enums, data structures, and interfaces 5 | // referenced in FactoredSegmenter. These have been extracted from a larger library. 6 | 7 | using System; 8 | using System.Collections.Generic; 9 | 10 | namespace Common.MT.Segments 11 | { 12 | public class AlignmentLink : IComparable, IEquatable 13 | { 14 | public int SourceIndex { get; } 15 | public int TargetIndex { get; } 16 | public float Confidence { get; } 17 | public override int GetHashCode() => SourceIndex ^ (TargetIndex << 16); 18 | public bool Equals(AlignmentLink that) => that != null && this.SourceIndex == that.SourceIndex && this.TargetIndex == that.TargetIndex; 19 | public override string ToString() => $"{SourceIndex}:{TargetIndex}"; 20 | public int CompareTo(AlignmentLink other) 21 | { 22 | int c1 = SourceIndex.CompareTo(other.SourceIndex); 23 | if (c1 != 0) 24 | return c1; 25 | return TargetIndex.CompareTo(other.TargetIndex); 26 | } 27 | } 28 | public class Alignment 29 | { 30 | public List Links { get; private set; } 31 | public Alignment InsertMissingTarget(int sourceIndex, int targetIndex) 32 | => throw new NotImplementedException("InsertMissingTarget is not supported"); 33 | public int GetTargetIndexToInsert(int originalSrcIndex) => -1; 34 | public override string ToString() => string.Join(" ", Links); 35 | } 36 | } 37 | namespace Microsoft.MT.TextSegmentation.SpanFinder 38 | { 39 | public enum AnnotatedSpanClassType 40 | { 41 | PhraseFix 42 | } 43 | public enum AnnotatedSpanInstructions // note: ignored in standalone build 44 | { 45 | ForceDecodeAs, 46 | EncodeAsIf 47 | } 48 | public class AnnotatedSpan 49 | { 50 | public int StartIndex { get; private set; } // coordinates into the raw source string 51 | public int Length { get; private set; } 52 | /// 53 | /// If given, then Encode() will pretend that the character range was this string instead of the original. 54 | /// Casing and word/continuous-script factors are derived as if these characters were in the original. 55 | /// 56 | public string EncodeAsIf { get; private set; } 57 | /// 58 | /// If given, then Decode() will decode this token as the given string. Use this for PhraseFix. 
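/// (Illustrative example, not from the original source: a phrase-fix span might pass decodeAs: "Contoso", so that Decode() emits "Contoso" verbatim for that token.)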
59 | /// If not given, then Decode() will reproduce the original character string. Used internally for unencodable characters. 60 | /// This requires a class type. @TODO: In the future, it can also be a parenthesized pass-through (A|B). 61 | /// @BUGBUG: For now, we do not handle casing; DecodeAs is just applied as-is. Need to decide what to do here. 62 | /// 63 | public string DecodeAs { get; private set; } 64 | /// 65 | /// If given, this is the class type to use to represent this token in Marian. 66 | /// The reason to use different class types is that different 67 | /// classes may occur in different grammatical contexts (e.g. PhraseFix vs. Url). 68 | /// 69 | public AnnotatedSpanClassType? ClassType { get; private set; } // if non-null, then use this class token 70 | public AnnotatedSpan( 71 | int startIndex, 72 | int length, 73 | AnnotatedSpanClassType? classType, 74 | AnnotatedSpanInstructions instructions = AnnotatedSpanInstructions.ForceDecodeAs, // note: ignored in standalone build 75 | string decodeAs = null, 76 | string encodeAsIf = null) 77 | { 78 | StartIndex = startIndex; 79 | Length = length; 80 | ClassType = classType; 81 | DecodeAs = decodeAs; 82 | EncodeAsIf = encodeAsIf; 83 | } 84 | } 85 | } 86 | namespace Microsoft.MT.Common.Tokenization 87 | { 88 | public enum SegmenterKind 89 | { 90 | FactoredSegmenter, 91 | SentencePiece, // (not actually supported in this library) 92 | Unknown 93 | } 94 | public interface ISegmenterConfig { } 95 | public interface ISentencePieceConfig : ISegmenterConfig { } 96 | public interface IFactoredSegmenterConfig : ISegmenterConfig { } 97 | public class SegmenterConfigBase { } 98 | public abstract class SegmenterTrainConfigBase : SegmenterConfigBase 99 | { 100 | /// 101 | /// Maximum size of sentences to train sentence pieces 102 | /// 103 | public abstract int? TrainingSentenceSize { get; set; } 104 | } 105 | public class SegmenterEncodeConfigBase : SegmenterConfigBase { } 106 | public class SegmenterDecodeConfigBase : SegmenterConfigBase { } 107 | public class ProcessedToken 108 | { 109 | public static ProcessedToken CreateRegularToken(string sourceWord, List origSource = null, int rawCharStart = -1, int rawCharLength = -1) 110 | => throw new NotImplementedException("The ProcessedToken interface is not available in this build."); 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /lib/Utils.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | // This file contains a collection of utility functions. This is an extract 5 | // from a larger library, reduced to what is actually used by this project. 6 | 7 | using Common.Collections; 8 | using Common.Collections.Extensions; 9 | using Common.Contracts; 10 | using System; 11 | using System.Collections.Generic; 12 | using System.Diagnostics; 13 | using System.Globalization; 14 | using System.IO; 15 | using System.Linq; 16 | using System.Text; 17 | using System.Threading; 18 | 19 | namespace Common.Collections.Extensions 20 | { 21 | public static class StringExtensions 22 | { 23 | /// 24 | /// Convenience version of string.Join() that follows the Python syntax where the joiner is 'this'. 
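/// Example (illustrative): ",".JoinItems(new[] { "a", "b", "c" }) returns "a,b,c", i.e. the same as string.Join(",", ...).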
25 | /// 
26 | public static string JoinItems<T>(this string separator, IEnumerable<T> items) => string.Join(separator, items);
27 | }
28 | public static class EnumerableExtensions
29 | {
30 | /// 
31 | /// Create a sequence of overlapping pairs of the input.
32 | /// E.g. a b c d -> (a,b) (b,c) (c,d)
33 | /// 
34 | /// Sequence of items. The sequence must have at least one element.
35 | /// Sequence of bigrams
36 | public static IEnumerable<(T, T)> Bigrams<T>(this IEnumerable<T> sequence)
37 | {
38 | var seqEnum = sequence.GetEnumerator();
39 | bool movedNext = seqEnum.MoveNext();
40 | Sanity.Requires(movedNext, "Bigram() requires a non-empty input");
41 | T lastVal = seqEnum.Current;
42 | while (seqEnum.MoveNext())
43 | {
44 | T thisVal = seqEnum.Current;
45 | yield return (lastVal, thisVal);
46 | lastVal = thisVal;
47 | }
48 | }
49 | }
50 | public static class DictionaryExtensions
51 | {
52 | /// 
53 | /// Same as Enumerable.SequenceEquals(), except that arguments may also be null.
54 | /// Amazingly, a.NullableSequenceEquals(b) works for a=null, thanks to the magic
55 | /// of extension methods.
56 | /// 
57 | /// 
58 | /// sequence or null
59 | /// sequence or null
60 | /// True if both args are null, or if both are non-null and sequences match
61 | public static bool NullableSequenceEquals<T>(this IEnumerable<T> a, IEnumerable<T> b)
62 | {
63 | return (a == null && b == null) ||
64 | (a != null && b != null && Enumerable.SequenceEqual(a, b));
65 | }
66 | }
67 | public static class IOExtensions
68 | {
69 | /// 
70 | /// Implements ReadLines() on the TextReader interface.
71 | /// 
72 | public static IEnumerable<string> ReadLines(this TextReader textReader)
73 | {
74 | var line = textReader.ReadLine();
75 | while (line != null)
76 | {
77 | yield return line;
78 | line = textReader.ReadLine();
79 | }
80 | }
81 | }
82 | }
83 | namespace Common.Contracts
84 | {
85 | public static class Sanity
86 | {
87 | public static bool Requires(bool condition, string errorMessage, params object[] args)
88 | {
89 | if (!condition)
90 | {
91 | if (args.Length == 0)
92 | throw new ArgumentException(errorMessage);
93 | else
94 | throw new ArgumentException(string.Format(CultureInfo.InvariantCulture, errorMessage, args));
95 | }
96 | return true; // allows use in an expression
97 | }
98 | }
99 | }
100 | namespace Common.Utils
101 | {
102 | public static class ProcessTool
103 | {
104 | static char[] k_ArgToCommandLineInvalidChars = Enumerable.Concat(from c in Enumerable.Range(0, (int)' ') select (char)c, new char[] { '"', '^' }).ToArray();
105 | /// 
106 | /// escape an argument to a command line as needed in order to be parsed by CommandLineToArgv(), C++ CRT, or C#.
107 | /// Some characters are tricky to handle consistently. For now, we simply forbid them.
108 | /// These include all control characters (0x00..0x1f), " (quotation marks inside string), and ^ (CMD shell escape).
109 | /// To handle " and ^ correctly, we may need additional context on whether this is run via CMD, and there is
110 | /// supposedly also a difference between CommandLineToArgV() and the C++ CRT (C# unknown) regarding sequences of double quotes.
111 | /// 
112 | /// Argument as the final string that the tool should receive, without escaping.
113 | /// Escaped version of argument, or unmodified argument if no escaping is needed.
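/// Example (illustrative): ArgToCommandLine("two words") returns "\"two words\"" because space is the delimiter, while ArgToCommandLine("oneword") is returned unchanged.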
114 | static string ArgToCommandLine(string arg)
115 | {
116 | if (-1 != arg.IndexOfAny(k_ArgToCommandLineInvalidChars))
117 | throw new NotImplementedException($"ArgToCommandLine: presently cannot handle certain special characters (e.g. \" and ^) in: {arg}");
118 | if (!arg.Any() || arg.Contains(' ')) // space is the delimiter, so we must surround the arg by quotes
119 | return $"\"{arg}\"";
120 | else // otherwise, no need to escape (it would be OK to escape, but not escaping is better for log readability)
121 | return arg;
122 | }
123 | /// 
124 | /// convert an array of string arguments to a command line as needed in order to be parsed by CommandLineToArgv(), C++ CRT, or C#.
125 | /// 
126 | public static string ArgsToCommandLine(IEnumerable<string> args)
127 | => string.Join(" ", from arg in args select ArgToCommandLine(arg));
128 | 
129 | private static Process CreateProcess(string exe, string args,
130 | IEnumerable<KeyValuePair<string, string>> envirVariables, bool isPipe,
131 | TextWriter stderr)
132 | {
133 | var psi = new ProcessStartInfo(exe, args)
134 | {
135 | UseShellExecute = false,
136 | CreateNoWindow = true,
137 | ErrorDialog = false,
138 | };
139 | if (isPipe)
140 | {
141 | psi.RedirectStandardInput = true;
142 | psi.RedirectStandardOutput = true;
143 | psi.StandardInputEncoding = Encoding.UTF8;
144 | psi.StandardOutputEncoding = Encoding.UTF8;
145 | }
146 | if (stderr != null)
147 | {
148 | psi.RedirectStandardError = true;
149 | psi.StandardErrorEncoding = Encoding.UTF8; // @REVIEW: needed?
150 | }
151 | if (envirVariables != null)
152 | foreach (KeyValuePair<string, string> pair in envirVariables)
153 | psi.EnvironmentVariables[pair.Key] = pair.Value;
154 | 
155 | var process = new Process();
156 | process.StartInfo = psi;
157 | if (stderr != null)
158 | process.ErrorDataReceived += (sender, e) => { stderr.WriteLine(e.Data); };
159 | process.Start();
160 | if (stderr != null)
161 | process.BeginErrorReadLine();
162 | return process;
163 | }
164 | 
165 | // @TODO: do we need IDisposable interface, so we can WaitForExit() for the process?
166 | public class ProcessPipe
167 | {
168 | public readonly Process process;
169 | public ProcessPipe(IList<string> argv, IEnumerable<KeyValuePair<string, string>> envirVariables = null) // UNIX-style argv array incl. exe itself
170 | {
171 | process = CreateProcess(argv.First(), ArgsToCommandLine(argv.Skip(1)), envirVariables: envirVariables, isPipe: true, stderr: null);
172 | process.StandardInput.AutoFlush = true;
173 | }
174 | }
175 | 
176 | public static int RunCommand(
177 | string exe,
178 | string args,
179 | string stdoutPath, // must be null in this version
180 | string stderrPath, // may be null
181 | bool throwOnFailure = true,
182 | IEnumerable<KeyValuePair<string, string>> envirVariables = null)
183 | {
184 | Sanity.Requires(stdoutPath == null, "This reduced version of RunCommand() does not support stdout redirection");
185 | Logger.WriteLine($"executing command: {exe} {args}");
186 | using (TextWriter stderrWriter = stderrPath == null ? 
null : new StreamWriter(stderrPath, append: false, encoding: new UTF8Encoding(encoderShouldEmitUTF8Identifier: false)) { AutoFlush = true }) 187 | using (var process = CreateProcess(exe, args, envirVariables, isPipe: false, stderr: stderrWriter)) 188 | { 189 | process.WaitForExit(); 190 | if (throwOnFailure && process.ExitCode != 0) 191 | throw new IOException($"Exit code {process.ExitCode} was returned by external process: {exe} {args}"); 192 | else 193 | return process.ExitCode; 194 | } 195 | } 196 | } 197 | } 198 | namespace Common.Utils 199 | { 200 | public static class Logger 201 | { 202 | public static void WriteLine(string format, params object[] args) => Console.Error.WriteLine(format, args); 203 | 204 | public static void WriteLine(string s) => Console.Error.WriteLine(s); 205 | } 206 | } 207 | namespace Common.IO 208 | { 209 | /// 210 | /// Contains static creator methods for various types of writers that will typically be used 211 | /// with AtomicFileWriter 212 | /// 213 | public static class AtomicFileWriter 214 | { 215 | /// 216 | /// move a file to a target location that gets deleted first if existing 217 | /// TODO: This seems to be duplicated about 20 times throughout the Solution; clean it up. 218 | /// This operation is faked to be "atomic" in that race conditions are handled that arise from concurrent attempts of doing the same thing on a parallel process. 219 | /// Note: if the source cannot be moved for whatever reason, but the target can be deleted, then this function will cause harm. 220 | /// TODO: the class name AtomicFileWriter does not seem fully appropriate for this function 221 | /// 222 | /// 223 | /// 224 | static void MoveReplace(string from, string to) 225 | { 226 | // This loop caters to the situation that two processes try to do this concurrently on 227 | // the same target path. The semantics should be that one of them wins. The special case 228 | // is that when this process deletes the target location, but then fails because a file at 229 | // the target path has magically reappeared. This must have been a concurrent process. 230 | // In this case, we just try again. 231 | while (true) 232 | { 233 | File.Delete(to); 234 | try 235 | { 236 | File.Move(from, to); 237 | return; // success 238 | } 239 | catch (IOException) 240 | { 241 | if (!File.Exists(to)) // file not there: failed due to some other problem 242 | throw; 243 | // target file magically reappeared: 244 | // This must be a concurrent thread. Just try again. If we cannot delete this new one, we will fail in Delete(). 245 | } 246 | } 247 | } 248 | 249 | /// 250 | /// helper to save an object to disk via an intermediate tmp file and a lambda 251 | /// The caller must provide a lambda that accepts a (temporary) file name, and save to that. 252 | /// That temp file will then be atomically renamed into the target location. 253 | /// It is "atomic" in the sense that in case of an error, it will not leave a partially written file 254 | /// under the target name, and only overwrite a potentially existing one if the write operation succeeded. 255 | /// The outFilePath may be null. In that case, the SaveFunc() is called with null. This allows 256 | /// for nested Save() calls with multiple temp files, where some are optional. 
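/// Example (illustrative, with text being any string in scope): AtomicFileWriter.Save("vocab.txt", tmpPath => File.WriteAllText(tmpPath, text)); first writes a temp file next to "vocab.txt", then renames it into place.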
257 | /// 
258 | /// final output goes here
259 | /// lambda that creates a file (to which this function passes a temp path, which then gets renamed)
260 | public static void Save(string outFilePath, Action<string> SaveFunc)
261 | {
262 | if (outFilePath == null)
263 | {
264 | SaveFunc(null);
265 | return;
266 | }
267 | string tmpPath = $"{outFilePath}.{Thread.CurrentThread.ManagedThreadId}$$";
268 | try
269 | {
270 | SaveFunc(tmpPath);
271 | MoveReplace(tmpPath, outFilePath);
272 | }
273 | catch
274 | {
275 | File.Delete(tmpPath); // best-effort cleanup (which may fail, e.g. in case of network disconnect)
276 | throw;
277 | }
278 | }
279 | }
280 | }
281 | namespace Common.Collections
282 | {
283 | /// 
284 | /// Wrapper around Dictionary with the following properties:
285 | /// (1) Dictionary is only allowed to grow to m_maxSize
286 | /// (2) Access is synchronized and read-write until the dictionary is full.
287 | /// Once the dictionary is full it becomes read-only (subsequent adds are no-ops) and lock-free.
288 | /// The class was designed for use as a cache to store computations on key streams that are assumed to
289 | /// exhibit a Zipfian distribution.
290 | /// 
291 | /// 
292 | /// 
293 | public class BoundedSizedLockingCache<K, V> //: IDictionary<K, V>
294 | {
295 | private object m_locker = new object();
296 | private int m_maxSize;
297 | private Dictionary<K, V> m_dict = new Dictionary<K, V>();
298 | volatile bool m_full = false;
299 | 
300 | private void MaybeLock(Action act)
301 | {
302 | if (m_full)
303 | {
304 | act();
305 | return;
306 | }
307 | else
308 | {
309 | lock (m_locker)
310 | {
311 | act();
312 | }
313 | }
314 | }
315 | 
316 | private RetT MaybeLock<RetT>(Func<RetT> func)
317 | {
318 | if (m_full)
319 | {
320 | return func();
321 | }
322 | lock (m_locker)
323 | {
324 | return func();
325 | }
326 | }
327 | 
328 | private void MaybeSetFull()
329 | {
330 | MaybeLock(() => { if (m_dict.Count >= m_maxSize) { m_full = true; } });
331 | }
332 | 
333 | /// 
334 | /// Create a cache
335 | /// 
336 | /// Maximum size of cache. Setting to 0 effectively disables the cache.
337 | public BoundedSizedLockingCache(int maxSize)
338 | {
339 | m_maxSize = maxSize;
340 | MaybeSetFull();
341 | }
342 | /// 
343 | /// If the dictionary has room, add key and value. Otherwise this is a no-op.
344 | /// 
345 | public void Add(K key, V value)
346 | {
347 | if (m_full)
348 | return;
349 | 
350 | MaybeLock(() =>
351 | {
352 | if (!m_dict.ContainsKey(key))
353 | m_dict.Add(key, value);
354 | MaybeSetFull();
355 | });
356 | }
357 | public bool TryGetValue(K key, out V value)
358 | {
359 | if (m_full)
360 | {
361 | return m_dict.TryGetValue(key, out value);
362 | }
363 | lock (m_locker)
364 | {
365 | 
366 | return m_dict.TryGetValue(key, out value);
367 | }
368 | }
369 | }
370 | }
371 | namespace Microsoft.MT.Common.Tokenization
372 | {
373 | public static class CachedFunction
374 | {
375 | /// 
376 | /// If an entry exists in the cache for key, return it. Otherwise, call unary function func and add it to cache.
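/// Example (illustrative; ComputeSplitPoints is a hypothetical string -> int[] function):
/// var cache = new BoundedSizedLockingCache<string, int[]>(100000);
/// var splits = CachedFunction.Memoize(cache, word, w => ComputeSplitPoints(w));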
377 | /// 
378 | public static int[] Memoize(BoundedSizedLockingCache<string, int[]> cache, string key, Func<string, int[]> func)
379 | {
380 | if (cache.TryGetValue(key, out var ret))
381 | return ret;
382 | ret = func(key);
383 | cache.Add(key, ret);
384 | return ret;
385 | }
386 | }
387 | }
388 | 
-------------------------------------------------------------------------------- /spm/CMakeLists.txt: --------------------------------------------------------------------------------
1 | # CMakeList.txt : CMake project for SentencePieceWrapper, include source and define
2 | # project specific logic here.
3 | #
4 | cmake_minimum_required (VERSION 3.8)
5 | 
6 | # Add source to this project's executable.
7 | add_executable (SentencePieceInterop "SentencePieceInterop.cpp" "unicode_conversions.h")
8 | 
9 | # TODO: Add tests and install targets if needed.
-------------------------------------------------------------------------------- /spm/SentencePieceInterop.cpp: --------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT license.
3 | 
4 | #include <stddef.h> // for size_t
5 | #include <stdint.h> // for uint16_t, intptr_t
6 | 
7 | #include <sentencepiece_processor.h>
8 | #include <memory> // for unique_ptr
9 | #include <iostream>
10 | #include "unicode_conversions.h"
11 | 
12 | // This requires libsentencepiece.so. If you follow the build instructions
13 | // from C++ source on https://github.com/google/sentencepiece, the necessary
14 | // header file and library will be installed in the system.
15 | 
16 | // ---------------------------------------------------------------------------
17 | // C++ implementation of the functionality
18 | // ---------------------------------------------------------------------------
19 | 
20 | class SentencePieceInterop
21 | {
22 | std::unique_ptr<sentencepiece::SentencePieceProcessor> m_processor;
23 | 
24 | void check_status(sentencepiece::util::Status status, const char* what)
25 | {
26 | if (status.ok())
27 | return;
28 | std::string err = status.ToString();
29 | std::cerr << err << std::endl;
30 | throw std::runtime_error(std::string("SentencePiece error ") + what + ": " + err);
31 | }
32 | public:
33 | SentencePieceInterop(const uint16_t* modelPath, const uint16_t** vocab, size_t vocabSize)
34 | {
35 | m_processor.reset(new sentencepiece::SentencePieceProcessor());
36 | // load the model file
37 | const auto status = m_processor->Load(utf16_to_utf8(utf16string(modelPath)));
38 | // implant the restricted vocabulary, if given
39 | if (vocab && vocabSize > 0)
40 | {
41 | std::vector<std::string> vocab_str;
42 | for (size_t i = 0; i < vocabSize; i++)
43 | {
44 | vocab_str.push_back(utf16_to_utf8(utf16string(vocab[i])));
45 | }
46 | 
47 | m_processor->SetVocabulary(vocab_str);
48 | }
49 | check_status(status, "loading");
50 | }
51 | 
52 | int EncodeAsIds(const uint16_t* word, int* pieceIdBuffer, size_t pieceIdBufferSize)
53 | {
54 | std::string wordInUtf8 = utf16_to_utf8(utf16string(word));
55 | auto piece_ids = m_processor->EncodeAsIds(sentencepiece::util::min_string_view(wordInUtf8));
56 | if (piece_ids.size() > pieceIdBufferSize)
57 | return -((int)piece_ids.size());
58 | 
59 | std::copy(piece_ids.begin(), piece_ids.end(), pieceIdBuffer);
60 | return (int)piece_ids.size();
61 | }
62 | 
63 | int UCS2LengthOfPieceId(int pieceId)
64 | {
65 | if (m_processor->IsUnknown(pieceId))
66 | return -1;
67 | auto utf8 = m_processor->IdToPiece(pieceId);
68 | return (int)count_utf8_to_utf16(utf8);
69 | }
70 | };
71 | 
72 | // ---------------------------------------------------------------------------
73 | // C/C++ interop and exported C functions
74 | // - intptr_t object = 
LoadModel(void* model, size_t modelSize, char** vocab, size_t vocabSize) 75 | // - length = EncodeAsIds(intptr_t object, const char* wordInUtf8, int* pieceIdBuffer, size_t pieceIdBufferSIze) // pieceIdBuffer size >= strlen(word)+1 76 | // - n = UCS2LengthOfPieceId(intptr_t object, int pieceId) 77 | // - UnloadModel(intptr_t object) 78 | // --------------------------------------------------------------------------- 79 | 80 | #if defined(_MSC_VER) 81 | // Microsoft 82 | #define EXPORT __declspec(dllexport) 83 | #define IMPORT __declspec(dllimport) 84 | #elif defined(__GNUC__) 85 | // GCC 86 | #define EXPORT __attribute__((visibility("default"))) 87 | #define IMPORT 88 | #else 89 | // do nothing and hope for the best? 90 | #define EXPORT 91 | #define IMPORT 92 | #pragma warning Unknown dynamic link import/export semantics. 93 | #endif 94 | 95 | extern "C" { 96 | 97 | intptr_t EXPORT LoadModel(const uint16_t* modelPath, const uint16_t** vocab, size_t vocabSize) 98 | { 99 | try 100 | { 101 | return (intptr_t) new SentencePieceInterop(modelPath, vocab, vocabSize); 102 | } 103 | catch(...) // @TODO: how to return meaningful error information? 104 | { 105 | return (intptr_t) nullptr; 106 | } 107 | } 108 | 109 | int EXPORT EncodeAsIds(intptr_t object, const uint16_t* word, int* pieceIdBuffer, size_t pieceIdBufferSize) 110 | { 111 | try 112 | { 113 | return (int)((SentencePieceInterop*)object)->EncodeAsIds(word, pieceIdBuffer, pieceIdBufferSize); 114 | } 115 | catch(...) // @TODO: how to return meaningful error information? 116 | { 117 | return -1; 118 | } 119 | } 120 | 121 | int EXPORT UCS2LengthOfPieceId(intptr_t object, int pieceId) 122 | { 123 | try 124 | { 125 | return ((SentencePieceInterop*)object)->UCS2LengthOfPieceId(pieceId); 126 | } 127 | catch(...) // @TODO: how to return meaningful error information? 128 | { 129 | return 0; // 0 is an invalid length 130 | } 131 | } 132 | 133 | void EXPORT UnloadModel(intptr_t object) 134 | { 135 | delete (SentencePieceInterop*)object; 136 | } 137 | 138 | } 139 | 140 | // --------------------------------------------------------------------------- 141 | // BELOW IS MY DEV WRAPPER 142 | // --------------------------------------------------------------------------- 143 | 144 | // how to build: 145 | // - clang -lstdc++ -std=c++11 -lsentencepiece -Wall -Werror SentencePieceInterop.cpp 146 | // how the SPM files for testing were obtained: 147 | // - run factored-segmenter encode --model /marcinjdeu.blob.core.windows.net/forfrank/model-99995c.fsm 148 | // - you will see a log msg like this: 149 | // starting SentencePiece instance as: /usr/local/bin/spm_encode --model /tmp/tmpg9BX8N.tmp --vocabulary /tmp/tmpFslYfv.tmp 150 | // - copy out the --model and --vocab temp files 151 | 152 | using namespace std; 153 | 154 | const char* spmModelPath = "/home/fseide/factored-segmenter/spm/spm.model"; 155 | const char* spmVocabPath = "/home/fseide/factored-segmenter/spm/spm.vocab"; 156 | 157 | //vector test_strings = 158 | //{ 159 | // "\u2581HELLO", 160 | // "\u2581OBAMA", 161 | // "OBAMA", 162 | // "HELL\u2582\u2582O" // out-of-vocab example 163 | //}; 164 | 165 | void fail(const char* msg) { cerr << "FAILED: " << msg << endl; exit(1); } 166 | 167 | int main() 168 | { 169 | // load the model file into RAM 170 | ifstream f_model(spmModelPath); 171 | auto modelBytes = vector(istreambuf_iterator(f_model), istreambuf_iterator()); 172 | if (f_model.bad() || modelBytes.empty()) // note: bad bit does not get set if file not found (??) 
173 | fail("Failed to read SPM model file.");
174 | 
175 | // load the vocab file
176 | ifstream f_vocab(spmVocabPath);
177 | vector<string> vocab;
178 | while (f_vocab)
179 | {
180 | string line;
181 | getline(f_vocab, line);
182 | vocab.push_back(line);
183 | }
184 | //vector<const char*> vocab_ptr;
185 | //for (const auto& line : vocab)
186 | // vocab_ptr.push_back(line.c_str());
187 | 
188 | //auto object = LoadModel(spmModelPath, vocab_ptr.data(), vocab_ptr.size());
189 | 
190 | //for (const auto& test_string : test_strings)
191 | //{
192 | // cerr << "Testing: " << test_string << endl;
193 | // vector<int> piece_ids(test_string.size() + 1);
194 | // auto num_pieces = EncodeAsIds(object, test_string.c_str(), piece_ids.data(), piece_ids.size());
195 | // if (num_pieces < 0)
196 | // fail("Failed to EncodeAsIds.");
197 | // piece_ids.resize(num_pieces);
198 | // for (auto piece_id : piece_ids)
199 | // cerr << " piece id " << piece_id << " has " << UCS2LengthOfPieceId(object, piece_id) << " UCS-2 characters" << endl;
200 | //}
201 | //UnloadModel(object);
202 | cerr << "Done." << endl;
203 | }
-------------------------------------------------------------------------------- /spm/spm.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/factored-segmenter/cf3a8bd099719a67d886eab907d996b187d924f6/spm/spm.model --------------------------------------------------------------------------------
-------------------------------------------------------------------------------- /spm/unicode_conversions.h: --------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT license.
3 | 
4 | // This was extracted from https://github.com/microsoft/cpprestsdk/blob/cdae258bfb22f948c7b768b4dc56f5f4a2d9b2ce/Release/src/utilities/asyncrt_utils.cpp#L305
5 | 
6 | #include <string> // for std::basic_string
7 | #include <stdexcept> // for std::range_error
8 | 
9 | typedef std::basic_string<uint16_t> utf16string;
10 | 
11 | #define LOW_3BITS 0x7
12 | #define LOW_4BITS 0xF
13 | #define LOW_5BITS 0x1F
14 | #define LOW_6BITS 0x3F
15 | #define BIT4 0x8
16 | #define BIT5 0x10
17 | #define BIT6 0x20
18 | #define BIT7 0x40
19 | #define BIT8 0x80
20 | #define L_SURROGATE_START 0xDC00
21 | #define L_SURROGATE_END 0xDFFF
22 | #define H_SURROGATE_START 0xD800
23 | #define H_SURROGATE_END 0xDBFF
24 | #define SURROGATE_PAIR_START 0x10000
25 | 
26 | // Create a dedicated type for characters to avoid the issue
27 | // of different platforms defaulting char to be either signed
28 | // or unsigned. 
29 | using UtilCharInternal_t = signed char; 30 | 31 | inline size_t count_utf8_to_utf16(const std::string& s) 32 | { 33 | const size_t sSize = s.size(); 34 | auto const sData = reinterpret_cast(s.data()); 35 | size_t result {sSize}; 36 | 37 | for (size_t index = 0; index < sSize;) 38 | { 39 | if (sData[index] >= 0) 40 | { 41 | // use fast inner loop to skip single byte code points (which are 42 | // expected to be the most frequent) 43 | while ((++index < sSize) && (sData[index] >= 0)) 44 | ; 45 | 46 | if (index >= sSize) break; 47 | } 48 | 49 | // start special handling for multi-byte code points 50 | const UtilCharInternal_t c {sData[index++]}; 51 | 52 | if ((c & BIT7) == 0) 53 | { 54 | throw std::range_error("UTF-8 string character can never start with 10xxxxxx"); 55 | } 56 | else if ((c & BIT6) == 0) // 2 byte character, 0x80 to 0x7FF 57 | { 58 | if (index == sSize) 59 | { 60 | throw std::range_error("UTF-8 string is missing bytes in character"); 61 | } 62 | 63 | const UtilCharInternal_t c2 {sData[index++]}; 64 | if ((c2 & 0xC0) != BIT8) 65 | { 66 | throw std::range_error("UTF-8 continuation byte is missing leading bit mask"); 67 | } 68 | 69 | // can't require surrogates for 7FF 70 | --result; 71 | } 72 | else if ((c & BIT5) == 0) // 3 byte character, 0x800 to 0xFFFF 73 | { 74 | if (sSize - index < 2) 75 | { 76 | throw std::range_error("UTF-8 string is missing bytes in character"); 77 | } 78 | 79 | const UtilCharInternal_t c2 {sData[index++]}; 80 | const UtilCharInternal_t c3 {sData[index++]}; 81 | if (((c2 | c3) & 0xC0) != BIT8) 82 | { 83 | throw std::range_error("UTF-8 continuation byte is missing leading bit mask"); 84 | } 85 | 86 | result -= 2; 87 | } 88 | else if ((c & BIT4) == 0) // 4 byte character, 0x10000 to 0x10FFFF 89 | { 90 | if (sSize - index < 3) 91 | { 92 | throw std::range_error("UTF-8 string is missing bytes in character"); 93 | } 94 | 95 | const UtilCharInternal_t c2 {sData[index++]}; 96 | const UtilCharInternal_t c3 {sData[index++]}; 97 | const UtilCharInternal_t c4 {sData[index++]}; 98 | if (((c2 | c3 | c4) & 0xC0) != BIT8) 99 | { 100 | throw std::range_error("UTF-8 continuation byte is missing leading bit mask"); 101 | } 102 | 103 | const uint32_t codePoint = 104 | ((c & LOW_3BITS) << 18) | ((c2 & LOW_6BITS) << 12) | ((c3 & LOW_6BITS) << 6) | (c4 & LOW_6BITS); 105 | result -= (3 - (codePoint >= SURROGATE_PAIR_START)); 106 | } 107 | else 108 | { 109 | throw std::range_error("UTF-8 string has invalid Unicode code point"); 110 | } 111 | } 112 | 113 | return result; 114 | } 115 | 116 | utf16string /*__cdecl conversions::*/utf8_to_utf16(const std::string& s) 117 | { 118 | // Save repeated heap allocations, use the length of resulting sequence. 
119 | const size_t srcSize = s.size(); 120 | auto const srcData = reinterpret_cast(s.data()); 121 | utf16string dest(count_utf8_to_utf16(s), L'\0'); 122 | utf16string::value_type* const destData = &dest[0]; 123 | size_t destIndex = 0; 124 | 125 | for (size_t index = 0; index < srcSize; ++index) 126 | { 127 | UtilCharInternal_t src = srcData[index]; 128 | switch (src & 0xF0) 129 | { 130 | case 0xF0: // 4 byte character, 0x10000 to 0x10FFFF 131 | { 132 | const UtilCharInternal_t c2 {srcData[++index]}; 133 | const UtilCharInternal_t c3 {srcData[++index]}; 134 | const UtilCharInternal_t c4 {srcData[++index]}; 135 | uint32_t codePoint = 136 | ((src & LOW_3BITS) << 18) | ((c2 & LOW_6BITS) << 12) | ((c3 & LOW_6BITS) << 6) | (c4 & LOW_6BITS); 137 | if (codePoint >= SURROGATE_PAIR_START) 138 | { 139 | // In UTF-16 U+10000 to U+10FFFF are represented as two 16-bit code units, surrogate pairs. 140 | // - 0x10000 is subtracted from the code point 141 | // - high surrogate is 0xD800 added to the top ten bits 142 | // - low surrogate is 0xDC00 added to the low ten bits 143 | codePoint -= SURROGATE_PAIR_START; 144 | destData[destIndex++] = static_cast((codePoint >> 10) | H_SURROGATE_START); 145 | destData[destIndex++] = 146 | static_cast((codePoint & 0x3FF) | L_SURROGATE_START); 147 | } 148 | else 149 | { 150 | // In UTF-16 U+0000 to U+D7FF and U+E000 to U+FFFF are represented exactly as the Unicode code point 151 | // value. U+D800 to U+DFFF are not valid characters, for simplicity we assume they are not present 152 | // but will encode them if encountered. 153 | destData[destIndex++] = static_cast(codePoint); 154 | } 155 | } 156 | break; 157 | case 0xE0: // 3 byte character, 0x800 to 0xFFFF 158 | { 159 | const UtilCharInternal_t c2 {srcData[++index]}; 160 | const UtilCharInternal_t c3 {srcData[++index]}; 161 | destData[destIndex++] = static_cast( 162 | ((src & LOW_4BITS) << 12) | ((c2 & LOW_6BITS) << 6) | (c3 & LOW_6BITS)); 163 | } 164 | break; 165 | case 0xD0: // 2 byte character, 0x80 to 0x7FF 166 | case 0xC0: 167 | { 168 | const UtilCharInternal_t c2 {srcData[++index]}; 169 | destData[destIndex++] = 170 | static_cast(((src & LOW_5BITS) << 6) | (c2 & LOW_6BITS)); 171 | } 172 | break; 173 | default: // single byte character, 0x0 to 0x7F 174 | // try to use a fast inner loop for following single byte characters, 175 | // since they are quite probable 176 | do 177 | { 178 | destData[destIndex++] = static_cast(srcData[index++]); 179 | } while (index < srcSize && srcData[index] > 0); 180 | // adjust index since it will be incremented by the for loop 181 | --index; 182 | } 183 | } 184 | return dest; 185 | } 186 | 187 | inline size_t count_utf16_to_utf8(const utf16string& w) 188 | { 189 | const utf16string::value_type* const srcData = &w[0]; 190 | const size_t srcSize = w.size(); 191 | size_t destSize(srcSize); 192 | for (size_t index = 0; index < srcSize; ++index) 193 | { 194 | const utf16string::value_type ch(srcData[index]); 195 | if (ch <= 0x7FF) 196 | { 197 | if (ch > 0x7F) // 2 bytes needed (11 bits used) 198 | { 199 | ++destSize; 200 | } 201 | } 202 | // Check for high surrogate. 
203 | else if (ch >= H_SURROGATE_START && ch <= H_SURROGATE_END) // 4 bytes needed (21 bits used) 204 | { 205 | ++index; 206 | if (index == srcSize) 207 | { 208 | throw std::range_error("UTF-16 string is missing low surrogate"); 209 | } 210 | 211 | const auto lowSurrogate = srcData[index]; 212 | if (lowSurrogate < L_SURROGATE_START || lowSurrogate > L_SURROGATE_END) 213 | { 214 | throw std::range_error("UTF-16 string has invalid low surrogate"); 215 | } 216 | 217 | destSize += 2; 218 | } 219 | else // 3 bytes needed (16 bits used) 220 | { 221 | destSize += 2; 222 | } 223 | } 224 | 225 | return destSize; 226 | } 227 | 228 | std::string /*__cdecl conversions::*/utf16_to_utf8(const utf16string& w) 229 | { 230 | const size_t srcSize = w.size(); 231 | const utf16string::value_type* const srcData = &w[0]; 232 | std::string dest(count_utf16_to_utf8(w), '\0'); 233 | std::string::value_type* const destData = &dest[0]; 234 | size_t destIndex(0); 235 | 236 | for (size_t index = 0; index < srcSize; ++index) 237 | { 238 | const utf16string::value_type src = srcData[index]; 239 | if (src <= 0x7FF) 240 | { 241 | if (src <= 0x7F) // single byte character 242 | { 243 | destData[destIndex++] = static_cast(src); 244 | } 245 | else // 2 bytes needed (11 bits used) 246 | { 247 | destData[destIndex++] = static_cast(char((src >> 6) | 0xC0)); // leading 5 bits 248 | destData[destIndex++] = static_cast(char((src & LOW_6BITS) | BIT8)); // trailing 6 bits 249 | } 250 | } 251 | // Check for high surrogate. 252 | else if (src >= H_SURROGATE_START && src <= H_SURROGATE_END) 253 | { 254 | const auto highSurrogate = src; 255 | const auto lowSurrogate = srcData[++index]; 256 | 257 | // To get from surrogate pair to Unicode code point: 258 | // - subtract 0xD800 from high surrogate, this forms top ten bits 259 | // - subtract 0xDC00 from low surrogate, this forms low ten bits 260 | // - add 0x10000 261 | // Leaves a code point in U+10000 to U+10FFFF range. 262 | uint32_t codePoint = highSurrogate - H_SURROGATE_START; 263 | codePoint <<= 10; 264 | codePoint |= lowSurrogate - L_SURROGATE_START; 265 | codePoint += SURROGATE_PAIR_START; 266 | 267 | // 4 bytes needed (21 bits used) 268 | destData[destIndex++] = static_cast((codePoint >> 18) | 0xF0); // leading 3 bits 269 | destData[destIndex++] = static_cast(((codePoint >> 12) & LOW_6BITS) | BIT8); // next 6 bits 270 | destData[destIndex++] = static_cast(((codePoint >> 6) & LOW_6BITS) | BIT8); // next 6 bits 271 | destData[destIndex++] = static_cast((codePoint & LOW_6BITS) | BIT8); // trailing 6 bits 272 | } 273 | else // 3 bytes needed (16 bits used) 274 | { 275 | destData[destIndex++] = static_cast((src >> 12) | 0xE0); // leading 4 bits 276 | destData[destIndex++] = static_cast(((src >> 6) & LOW_6BITS) | BIT8); // middle 6 bits 277 | destData[destIndex++] = static_cast((src & LOW_6BITS) | BIT8); // trailing 6 bits 278 | } 279 | } 280 | 281 | return dest; 282 | } 283 | -------------------------------------------------------------------------------- /src/FactoredSegmenterConfigs.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | using System; 5 | using System.IO; 6 | using System.Xml.Serialization; 7 | using Common.Utils; 8 | 9 | namespace Microsoft.MT.Common.Tokenization 10 | { 11 | /// 12 | /// Configurable options for FactoredSegmenter models. 13 | /// All options that are kept inside the model file go here. 
14 | /// 
15 | public class FactoredSegmenterModelOptions
16 | {
17 | /// 
18 | /// if false, do not emit |we or |ce factors
19 | /// 
20 | public bool RightWordGlue { get; set; } = false;
21 | 
22 | /// 
23 | /// if true, word-internal and word-initial pieces use distinct lemmas
24 | /// Without this, a piece xyz can exist in at least four forms, which in original
25 | /// SentencePiece notation would be written as xyz, Xyz, _xyz, and _Xyz.
26 | /// The latter three are all word boundaries, while the first is word-internal.
27 | /// I.e. two fundamentally different units are mapped onto the same piece.
28 | /// With this flag set, the latter three will use a different symbol.
29 | /// ...This is experimental, and not yet confirmed to help.
30 | /// 
31 | public bool DistinguishInitialAndInternalPieces { get; set; } = false;
32 | 
33 | public bool SplitHan { get; set; } = false;
34 | 
35 | /// 
36 | /// separate case factors for single letters
37 | /// For single letters, it is not clear whether to use |ca or |ci.
38 | /// With this option, we use a completely different factor |scu or |scl for single-letter words.
39 | /// This seems to quite robustly improve capitalization for English "I" and "U.S.", for example.
40 | /// 
41 | public bool SingleLetterCaseFactors { get; set; } = false;
42 | 
43 | /// 
44 | /// serialize phrase-fix indices and unrepresentable characters
45 | /// With this option, the index factor is no longer an additive factor,
46 | /// but instead is represented as a sequence of digits. This frees bits in Marian for other factors.
47 | /// Likewise, unrepresentable characters (=single characters not found in the
48 | /// SentencePiece vocabulary) are also serialized as their Unicode in digit form.
49 | /// This allows for any character to be represented (and hopefully translated by at least copying it through).
50 | /// ...This is experimental.
51 | public bool SerializeIndicesAndUnrepresentables { get; set; } = false;
52 | 
53 | /// 
54 | /// If true, phrase fixes are encoded by including them in the source.
55 | /// ...This is experimental.
56 | /// ...This is ongoing work. The following will be addressed once we know whether this works at all:
57 | /// - no correct escaping of our internal delimiter chars if they occur in real text
58 | /// - delimiter chars should be encoded in the same form as XML tags, and have no glue factors
59 | /// - glue/boundary-factor determination must correctly see through the delimited ranges
60 | /// - currently if the decoder decides to output the delimiter chars, they will not be removed
61 | /// - delimiter chars should be excluded from shortlists (as should sentence-start)
62 | /// 
63 | public bool InlineFixes { get; set; } = false;
64 | 
65 | /// 
66 | /// If true, use start/middle/end tags (which is known to not work well).
67 | /// If false, then use INLINE_FIX_TYPE factors for the inline-fix tokens.
68 | /// Only used if InlineFixes == true.
69 | /// 
70 | public bool InlineFixUseTags { get; set; } = false;
71 | 
72 | /// 
73 | /// Enables context-dependent capitalization factors for single letters.
74 | /// Workaround for Bug #101419 "Training of allcaps factors is inconsistent".
75 | /// The Marian all-caps routine changes all factors to "ca", causing an inconsistency
76 | /// with measurable impact. With this flag set, FactoredSegmenter will try to guess
77 | /// whether a single uppercase letter is part of an all-caps word sequence.
78 | /// ...This does not seem to work well, and may be removed. 
79 | /// 80 | public bool UseContextDependentSingleLetterCapitalizationFactors { get; set; } = false; 81 | 82 | /// 83 | /// For sentence-level annotations, e.g. multi-lingual systems, this string 84 | /// declares the types of annotations. E.g. to enable sentence-level annotations 85 | /// for the sentence target language, e.g. "target_language=ENU", the string 86 | /// "target_language" is the type, and it must be declared here as a model option. 87 | /// 88 | public string SourceSentenceAnnotationTypes { get; set; } = ""; 89 | 90 | /// 91 | /// The list of source sentence annotation types. 92 | /// Note that this is not a property that can be specified by the user. User should instead specify SourceSentenceAnnotationTypes in above. 93 | /// 94 | [XmlIgnore] 95 | internal string[] SourceSentenceAnnotationTypeList => SourceSentenceAnnotationTypes != null ? 96 | SourceSentenceAnnotationTypes.Split(new string[] { ";" }, StringSplitOptions.RemoveEmptyEntries) : 97 | new string[0]; 98 | 99 | // system-managed options persisted to file follow; not to be specified by user 100 | 101 | /// 102 | /// if false then skip SentencePiece. If true, then SPM model file is FS model path s/\.model$/\.fsm/ 103 | /// 104 | public bool? UseSentencePiece { get; set; } 105 | } 106 | 107 | /// 108 | /// Class to hold all parameters for the FactoredSegmenter training tool. 109 | /// 110 | public class FactoredSegmenterModelTrainConfig : SegmenterTrainConfigBase, IFactoredSegmenterConfig 111 | { 112 | /// 113 | /// options persisted with the model, e.g. whether to use certain factors 114 | /// 115 | public FactoredSegmenterModelOptions ModelOptions { get; set; } = new FactoredSegmenterModelOptions(); 116 | /// 117 | /// Number of sentences to use for determining the Marian vocab and for training 118 | /// the underlying SentencePiece model. Normally set to 10 million. 119 | /// This many sentences are sampled from the training corpus. 120 | /// For joint training, this is the total number of sentences across both languages. 121 | /// 122 | public override int? TrainingSentenceSize { get; set; } 123 | /// 124 | /// Only keep SentencePiece units ("pieces") with at least this many observations 125 | /// in the entire training set. Any unit with fewer observations will be represented 126 | /// as multiple shorter pieces. The rationale is that too rare observations will 127 | /// not get a properly trained embedding. 128 | /// The total Marian vocabulary consists of these pieces plus single characters. 129 | /// If TrainingSentenceSize is set, only a subset is processed. In this case, 130 | /// this count is adjusted automatically internally accordingly. 131 | /// 132 | public int MinPieceCount { get; set; } = 0; 133 | /// 134 | /// Only keep single characters with at least this many observations in the entire 135 | /// training data. Any character sequence that is not covered by units in the 136 | /// SentencePiece vocabulary will be represented as single characters. 137 | /// Many of these single characters are very rare, e.g. graphical characters 138 | /// or Cyrillic characters in a Chinese corpus, and cannot be learned properly. 139 | /// This parameter allows to eliminate rare characters from the vocab (they will 140 | /// be treated as unrepresentable, which presently means UNK). 141 | /// This threshold needs to be smaller than MinPieceCount to have an effect. 142 | /// If TrainingSentenceSize is set, only a subset is processed. 
In this case, 143 | /// this count is automatically adjusted internally. 144 | /// 145 | public int MinCharCount { get; set; } = 0; 146 | /// 147 | /// Config for the underlying SentencePiece training (or null to indicate that SentencePiece is not used). 148 | /// 149 | public SentencePieceTrainConfig SentencePieceTrainingConfig { get; set; } = new SentencePieceTrainConfig(); 150 | // @BUGBUG: now ^^ this is created by default, so there is no way to turn it off -> @TODO: ModelOptions->UseSentencePiece = false says 'ignore this' 151 | } 152 | 153 | /// 154 | /// Class to hold all parameters for the FactoredSegmenter encoding tool. 155 | /// 156 | public class FactoredSegmenterEncodeConfig : SegmenterEncodeConfigBase, IFactoredSegmenterConfig 157 | { 158 | public SentencePieceEncodeConfig SentencePieceEncodeConfig { get; set; } // for the underlying SentencePiece module 159 | 160 | // for debugging: 161 | public int CheckEvery { get; set; } = 100; // decode each N-th encoded sentence and verify against source 162 | } 163 | 164 | /// 165 | /// Class to hold all parameters for the FactoredSegmenter decoding tool. 166 | /// 167 | public class FactoredSegmenterDecodeConfig : SegmenterDecodeConfigBase, IFactoredSegmenterConfig 168 | { 169 | public SentencePieceDecodeConfig SentencePieceDecodeConfig { get; set; } // for the underlying SentencePiece module 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /src/FactoredSegmenterScriptHelpers.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | // This is meant as an extension of Unicode.cs. It should be merged in there 5 | // once the code in here has reached a sufficient level of maturity and generality 6 | // across languages, and generally supports surrogate pairs. 7 | 8 | using System.Collections.Generic; 9 | using System.Linq; 10 | 11 | namespace Common.Text 12 | { 13 | /// 14 | /// Helper class for Unicode characters. 15 | /// @BUGBUG: These do not work with surrogate pairs. 16 | /// 17 | public static class ScriptExtensions 18 | { 19 | /// 20 | /// Helper to test whether a character has a character code in range min..max 21 | /// 22 | public static bool IsInRange(this char c, int min, int max) => (c >= (char)min && c <= (char)max); 23 | 24 | /// 25 | /// Is character a combining character? 26 | /// 27 | /// 28 | /// 29 | public static bool IsCombiner(this char c) => c.GetUnicodeMajorDesignation() == 'M'; 30 | 31 | /// 32 | /// Is character a Variation Selector? [https://en.wikipedia.org/wiki/Variation_Selectors_(Unicode_block)] 33 | /// Note that these are included in IsCombiner as well. 34 | /// 35 | //public static bool IsVariationSelector(this char c) => c.IsInRange(0xfe00, 0xfe0f); 36 | 37 | /// 38 | /// Helper to determine whether a character is a numeral. 39 | /// This includes numeral characters that are not classified as such in Unicode, 40 | /// such as Chinese numbers. 41 | /// This is meant for FactoredSegmenter, which uses this to prevent numeral characters 42 | /// from being merged in SentencePiece.
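/// E.g. '7'.IsNumeral() and '三'.IsNumeral() return true, while 'x'.IsNumeral() returns false.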
43 | public static bool IsNumeral(this char c) 44 | { 45 | // @BUGBUG: currently known failures: 46 | // - Arabic fractions: ٠٫٢٥ 47 | 48 | // Chinese numeral letters are not classified as digits in Unicode 49 | if (ScriptHelpers.ChineseDigits.Contains(c)) 50 | return true; 51 | else 52 | return Unicode.GetUnicodeMajorDesignation(c) == 'N'; 53 | } 54 | 55 | /// 56 | /// Is this character considered a letter inside FactoredSegmenter? 57 | /// This also returns true for combiners that are typically used with letters. 58 | /// @TODO: decide how to handle wide letters, or all sorts of weird letters such as exponents 59 | /// Are those letters? Are those capitalizable? 60 | /// Then remove this wrapper. 61 | /// 62 | public static bool IsLetterOrLetterCombiner(this char c) 63 | => char.IsLetter(c) || 64 | (c.IsCombiner() && c.GetCombinerTypicalMajorDesignation() == 'L'); 65 | 66 | ///// 67 | ///// Combine IsLetter() and IsNumeral(), which is used a few times in this combination. 68 | ///// 69 | ///// 70 | ///// 71 | //public static bool IsLetterOrNumeral(this char c) => c.IsLetter() || c.IsNumeral(); 72 | 73 | /// 74 | /// Tests whether a character is a bicameral letter. 75 | /// @TODO: should we consider German ess-zet as bicameral? Lower=upper, but 76 | /// as of recently, an upper-case ess-zet exists. 77 | /// One can all-caps a word with ess-zet. This is currently special-cased in FactoredSegmenter. 78 | /// 79 | public static bool IsBicameral(this char c) => char.ToLowerInvariant(c) != char.ToUpperInvariant(c); 80 | 81 | /// 82 | /// Replacement for IsLower() that handles Roman numeral X correctly 83 | /// We define a lower-case letter as one that is bicameral in the first place, and of the lower kind. 84 | /// 85 | public static bool HasAndIsLower(this char c) => c != char.ToUpperInvariant(c); 86 | /// 87 | /// Same as HasAndIsLower() except for upper-case. 88 | /// 89 | public static bool HasAndIsUpper(this char c) => char.ToLowerInvariant(c) != c; 90 | 91 | /// 92 | /// String/index version of HasAndIsLower(). 93 | /// @BUGBUG: Does not handle surrogate pairs. 94 | /// 95 | public static bool HasAndIsLowerAt(this string s, int index) => s[index].HasAndIsLower(); 96 | 97 | /// 98 | /// String/index version of HasAndIsUpper(). 99 | /// @BUGBUG: Does not handle surrogate pairs. 100 | /// 101 | public static bool HasAndIsUpperAt(this string s, int index) => s[index].HasAndIsUpper(); 102 | 103 | /// 104 | /// Test if string is a single Unicode character, with support for surrogate pairs. 105 | /// Used for detecting unrepresentable Unicode characters. 106 | /// 107 | public static bool IsSingleCharConsideringSurrogatePairs(this string s) 108 | { 109 | var length = s.Length; 110 | return length == 1 || 111 | (length == 2 && char.IsSurrogatePair(s, 0)); 112 | } 113 | 114 | /// 115 | /// Capitalize the first letter of a string and return the result. 116 | /// This function attempts to be efficient and not allocate a new string 117 | /// if the string is unchanged. 118 | /// 119 | public static string Capitalized(this string s) 120 | { 121 | if (!string.IsNullOrEmpty(s) && s.First().HasAndIsLower()) 122 | { 123 | var a = s.ToArray(); 124 | a[0] = char.ToUpperInvariant(a[0]); 125 | return new string(a); 126 | } 127 | else 128 | return s; 129 | } 130 | 131 | /// 132 | /// Define a "typical" use for combining marks. FactoredSegmenter requires pieces to 133 | /// be classifiable as being of word nature or not. Combiners depend on context.
134 | /// This can lead to a contradiction if a combiner gets separated from its preceding 135 | /// character by SentencePiece (which we allow since in Hindi, some combiners are morphemes). 136 | /// The problem is that each lemma has a unique factor set. But if the lemma is a 137 | /// combiner that is used both with a letter and with punctuation in the corpus, 138 | /// that lemma ends up with two different factor sets, which is forbidden. 139 | /// As a 95%-5% solution, we uniquely define a single "typical" use for each combiner. 140 | /// For example, the accent is considered to always imply a letter, although I have 141 | /// seen it used on top of a space character (to mimic an apostrophe). We consider 142 | /// these as abnormal uses, which will just lead to an additional forced word break 143 | /// that can still be learned and resolved by the MT model itself. 144 | /// 145 | public static char GetCombinerTypicalMajorDesignation(this char c) 146 | { 147 | // @TODO: Spencer pointed out that the key-cap combiner combines with 0..9, #, and * 148 | // It probably should be considered punctuation, to avoid # key-cap A forming a word "key-cap A". 149 | if (c.IsInRange(0xfe0e, 0xfe0f)) // Variation Selectors 15 and 16 apply to Emojis 150 | return 'P'; // punctuation 151 | else 152 | return 'L'; // letter 153 | } 154 | 155 | /// 156 | /// Classify a character, using our special rules 157 | /// - number letters, e.g. Chinese numerals, are classified as 'N' 158 | /// - combiners have a single "typical" designation 159 | /// 160 | public static char GetUnicodeMajorDesignationWithOurSpecialRules(this char c) // helper to get character designation, with our special rules for numerals and combiners 161 | { 162 | if (c.IsNumeral()) 163 | return 'N'; 164 | var d = c.GetUnicodeMajorDesignation(); 165 | if (d == 'M') 166 | return c.GetCombinerTypicalMajorDesignation(); 167 | else 168 | return d; 169 | } 170 | 171 | /// 172 | /// Get the major unicode designation at a character position. 173 | /// In the special case that that position is a combiner, find the first non-combining 174 | /// character and use its designation. 175 | /// 176 | //public static char GetUnicodeMajorDesignationBeforeCombinerAt(this string s, int pos) 177 | //{ 178 | // var majorDesignation = s[pos].GetUnicodeMajorDesignation(); 179 | // // if combiner then search for base char (=last non-combining char) 180 | // while (majorDesignation == 'M' && pos --> 0) 181 | // majorDesignation = s[pos].GetUnicodeMajorDesignation(); 182 | // return majorDesignation; 183 | //} 184 | } 185 | 186 | /// 187 | /// Character-script (as in writing-system) related helpers for FactoredSegmenter. These helpers are at present 188 | /// not yet generic or mature enough to warrant being moved into Common or Unicode.cs. 189 | /// Once they are, they should be moved. 190 | /// 191 | public static class ScriptHelpers 192 | { 193 | public static HashSet<char> ChineseDigits = new HashSet<char>{ 194 | // cf https://en.wikipedia.org/wiki/Chinese_numerals 195 | '〇', '一', '二', '三', '四', '五', '六', '七', '八', '九', // base digits 196 | '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', // full-width digits. Note: These are designated as digits 197 | '十', '百', '千', '萬', '万', '億', '亿', '兆', // units 198 | '零', '壹', '貳', '贰', '叄', '叁', '陸', '陆', '柒', '捌', '玖', '拾', '佰', '仟', // financial 199 | '幺', '兩', '两', '倆', '仨', '呀', '念', '廿', '卅', '卌', '皕', // regional 200 | // @TODO: how about fractions? E.g.
分 (fen) 201 | '○' // "Small White Circle" (U+25CB) 202 | // It is commonly used as zero in Chinese, but technically not a numeral. Unicode desig is Other Symbol "So". 203 | // @BUGBUG: For now, we treat it as one since all we care about is that it does not get merged. 204 | // @TODO: Decide whether we can add a different category that also never gets merged. 205 | // @TODO: 206 | // 京 = 10^16 207 | // 壱 = formal 1 208 | // 弐 = formal 2 209 | // 参 = formal 3 (has other uses) 210 | // @REVIEW: A native speaker of Chinese and Japanese should check whether some characters 211 | // above are commonly used in regular words as well, and assess whether we need them here. 212 | }; 213 | 214 | /// 215 | /// Get script designators for each character in a line. 216 | /// This function handles surrogate pairs and combining marks. --@TODO: ...not yet, actually 217 | /// The function can optionally operate on a substring. 218 | /// 219 | //public static Unicode.Script[] GetScripts(string line, int startIndex = 0, int length = int.MaxValue) 220 | //{ 221 | // if (length == int.MaxValue) 222 | // length = line.Length; 223 | // var scripts = new Unicode.Script[length]; 224 | // for (var i = 0; i < length; i++) 225 | // { 226 | // // @TODO: Handle surrogates 227 | // char c = line[startIndex + i]; 228 | // if (c.IsCombiner() && i > 0) 229 | // scripts[i] = scripts[i - 1]; 230 | // else 231 | // scripts[i] = Unicode.GetScript(c); 232 | // } 233 | // return scripts; 234 | //} 235 | 236 | /// 237 | /// Simplistic word-boundary detector. 238 | /// This function attempts to detect word boundaries that can be detected in a language-independent 239 | /// fashion from the surface form, and without additional knowledge sources. 240 | /// I.e. it looks for a change in script and some changes in Unicode character designation. 241 | /// This does not detect word breaks in continuous scripts, which require additional knowledge sources. 242 | /// This function handles these special cases: 243 | /// - some known allowed punctuation between characters, such as ' in words and . in numbers 244 | /// @TODO: This rule may not apply to all scripts. 245 | /// - surrogate pairs --@TODO 246 | /// - combiners inherit the script of the character to the left 247 | /// - combiners are classified as the char type (major designation) they are "typically" 248 | /// applied to (not depending on actual char). 249 | /// This is needed so that combiners that end up as single SentencePieces are classifiable. 250 | /// Any error this causes must be learned by the model. 251 | /// - designation changes only are a boundary if a letter or a number is on either side, 252 | /// but e.g. not a punctuation symbol next to a space or math symbol 253 | /// - (special rule: Hiragana is not split from Kanji. Currently this rule is disabled.) 254 | /// Each space gives rise to two boundaries (one on each side). 255 | /// It returns a cut list. An empty string is not cut. See the illustrative example below. 256 | /// 257 | public static IList<int> DetectUnambiguousWordBreaks(string line) // @TODO: Better name for this?
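// Illustrative example (not from the original source): for the input "abc日本語123", the cut list
// would be [0, 3, 6, 9] -- a script change (Latin -> Han) at position 3, a designation change
// (letter -> number) at position 6, plus the mandatory start (0) and end (9) entries.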
258 | { 259 | // First, determine the major Unicode designation and script for each character, but with modifications, 260 | // for purpose of simple word breaking: 261 | // - allowed punctuation marks inside words are flipped to 'L' 262 | // - allowed punctuation marks inside numbers are flipped to 'N' 263 | // - unambiguous CJK number letters are flipped to 'N' 264 | // - combining marks carry over both designation and script from their main character 265 | var scripts = new Unicode.Script[line.Length]; 266 | var designations = new char[line.Length]; 267 | for (var i = 0; i < line.Length; i++) 268 | { 269 | var c = line[i]; 270 | // @TODO: handle surrogate pairs 271 | var m = Unicode.GetUnicodeMajorDesignation(c); 272 | var s = Unicode.GetScript(c); 273 | // special case: consider unambiguous CJK number symbols as numerals 274 | if (c.IsNumeral()) 275 | m = 'N'; 276 | // special case: combining marks carry over main character's script, and are classified as their most likely use (for consistency) 277 | else if (m == 'M') 278 | { 279 | m = c.GetCombinerTypicalMajorDesignation(); 280 | if (i > 0) 281 | s = scripts[i - 1]; 282 | } 283 | designations[i] = m; 284 | scripts[i] = s; 285 | // special case: allowed punctuation inside a word --@TODO: Likely script dependent, maybe language dependent 286 | if (i - 2 >= 0 && designations[i] == 'L' && designations[i - 2] == 'L' && IsValidPuncInsideWord(line[i - 1])) 287 | designations[i - 1] = 'L'; 288 | // special case: allowed punctuation inside a number --@TODO: Likely script dependent, maybe language-locale dependent 289 | else if (i - 2 >= 0 && designations[i] == 'N' && designations[i - 2] == 'N' && IsValidPuncInsideNumber(line[i - 1])) 290 | designations[i - 1] = 'N'; 291 | // @TODO: double-check handling of space characters: non-breaking space; optional hyphen 292 | } 293 | 294 | // This function operates on a string, so we can handle the case of script1 - Common - script2. 295 | // This presently breaks as (script1 - Common, script2). 296 | // Without further knowledge, we can only make an arbitrary hard choice here. 297 | // This is used by FactoredSegmenter, where that is OK because characters in Common are 298 | // typically broken off anyways. 299 | 300 | if (line.Length == 0) // graceful exit in case of empty input 301 | return new List<int>{ 0, 0 }; // empty input is not cut 302 | 303 | var cutList = new List<int>(200) { 0 }; // (0=line start, which the resulting cut list must include) 304 | var lastNonCommonScript = scripts[0]; 305 | //if (lastNonCommonScript == Unicode.Script.Hiragana) 306 | // lastNonCommonScript = Unicode.Script.Han; // no boundary between Kanji and Hiragana 307 | for (var pos = 1; pos < line.Length; pos++) 308 | { 309 | // detect change in character designation 310 | // - break at number boundaries 311 | // - add number factor 312 | // - can numbers be part of words that need to be kept together for determining word-level factors?
313 | // - break at word boundaries 314 | // - letter/non-letter transitions 315 | // - don't break at apostrophes and hyphens with letters on both sides 316 | // - break at script boundaries 317 | bool atDesignationChange = (designations[pos - 1] != designations[pos] && 318 | (designations[pos - 1] == 'N' || designations[pos] == 'N' || 319 | designations[pos - 1] == 'L' || designations[pos] == 'L')); 320 | 321 | // detect script change 322 | var thisScript = scripts[pos]; 323 | //if (thisScript == Unicode.Script.Hiragana) // the jury is still out whether we should do this or not 324 | // thisScript = Unicode.Script.Han; 325 | bool atScriptChange = lastNonCommonScript != thisScript && thisScript != Unicode.Script.Common; 326 | // Note: If there is a script change across Common, we choose one arbitrarily. 327 | if (thisScript != Unicode.Script.Common || atDesignationChange) // condition 'atDesignationChange' is for back compat only; maybe not needed 328 | lastNonCommonScript = thisScript; 329 | 330 | // add cut point if one was found 331 | if (atDesignationChange || atScriptChange) 332 | cutList.Add(pos); 333 | } 334 | cutList.Add(line.Length); 335 | return cutList; 336 | } 337 | // @TODO: These next two functions should likely be script-dependent (and possibly language-dependent). 338 | static bool IsValidPuncInsideWord(char c) => (c == '\'' || c == '-' || c == '\u00AD'/*soft hyphen*/); // true if words may contain this punctuation symbol inside, e.g. "It's", "well-behaved" 339 | static bool IsValidPuncInsideNumber(char c) => (c == '.' || c == ',' || c == '\u2009'/*thin space*/); // true if numbers may contain this punctuation symbol inside, e.g. "1,234.56" 340 | } 341 | } 342 | -------------------------------------------------------------------------------- /src/ProcessTools.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | using Common.Contracts; 5 | using Common.Utils; 6 | using System; 7 | using System.Collections.Generic; 8 | using System.Diagnostics; 9 | using System.IO; 10 | using System.Linq; 11 | using System.Text; 12 | 13 | namespace Microsoft.MT.Common.Tokenization 14 | { 15 | public static class ProcessTools 16 | { 17 | public static int RunCommand( 18 | string exe, 19 | string args, 20 | string stdoutPath, // must be null in this version 21 | string stderrPath, // may be null 22 | bool throwOnFailure = true, 23 | IEnumerable<KeyValuePair<string, string>> envirVariables = null) 24 | { 25 | Sanity.Requires(stdoutPath == null, "This reduced version of RunCommand() does not support stdout redirection"); 26 | Logger.WriteLine($"executing command: {exe} {args}"); 27 | using (TextWriter stderrWriter = stderrPath == null ?
null : new StreamWriter(stderrPath, append: false, encoding: new UTF8Encoding(encoderShouldEmitUTF8Identifier: false)) { AutoFlush = true }) 28 | using (var process = CreateProcess(exe, args, envirVariables, isPipe: false, stderr: stderrWriter)) 29 | { 30 | process.WaitForExit(); 31 | if (throwOnFailure && process.ExitCode != 0) 32 | throw new IOException($"Exit code {process.ExitCode} was returned by external process: {exe} {args}"); 33 | else 34 | return process.ExitCode; 35 | } 36 | } 37 | 38 | 39 | static char[] k_ArgToCommandLineInvalidChars = Enumerable.Concat(from c in Enumerable.Range(0, (int)' ') select (char)c, new char[] { '"', '^' }).ToArray(); 40 | /// 41 | /// escape an argument to a command line as needed in order to be parsed by CommandLineToArgv(), C++ CRT, or C#. 42 | /// Some characters are tricky to handle consistently. For now, we simply forbid them. 43 | /// These include all control characters (0x00..0x1f), " (quotation marks inside string), and ^ (CMD shell escape). 44 | /// To handle " and ^ correctly, we may need additional context on whether this is run via CMD, and there is 45 | /// supposedly also a difference between CommandLineToArgV() and the C++ CRT (C# unknown) regarding sequences of double quotes. 46 | /// 47 | /// Argument as the final string that the tool should receive, without escaping. 48 | /// Escaped version of argument, or unmodified argument if no escaping is needed. 49 | static string ArgToCommandLine(string arg) 50 | { 51 | if (-1 != arg.IndexOfAny(k_ArgToCommandLineInvalidChars)) 52 | throw new NotImplementedException($"ArgToCommandLine: presently cannot handle certain special characters (e.g. \" and ^) in: {arg}"); 53 | if (!arg.Any() || arg.Contains(' ')) // space is the delimiter, so we must surround the arg by quotes 54 | return $"\"{arg}\""; 55 | else // otherwise, no need to escape (it would be OK to escape, but not escaping is better for log readability) 56 | return arg; 57 | } 58 | /// 59 | /// convert an array of string arguments to a command line as needed in order to be parsed by CommandLineToArgv(), C++ CRT, or C#. 60 | /// 61 | public static string ArgsToCommandLine(IEnumerable<string> args) 62 | => string.Join(" ", from arg in args select ArgToCommandLine(arg)); 63 | 64 | private static Process CreateProcess(string exe, string args, 65 | IEnumerable<KeyValuePair<string, string>> envirVariables, bool isPipe, 66 | TextWriter stderr) 67 | { 68 | var psi = new ProcessStartInfo(exe, args) 69 | { 70 | UseShellExecute = false, 71 | CreateNoWindow = true, 72 | ErrorDialog = false, 73 | }; 74 | if (isPipe) 75 | { 76 | psi.RedirectStandardInput = true; 77 | psi.RedirectStandardOutput = true; 78 | psi.StandardOutputEncoding = Encoding.UTF8; 79 | } 80 | if (stderr != null) 81 | { 82 | psi.RedirectStandardError = true; 83 | psi.StandardErrorEncoding = Encoding.UTF8; // @REVIEW: needed?
84 | } 85 | if (envirVariables != null) 86 | foreach (KeyValuePair<string, string> pair in envirVariables) 87 | psi.EnvironmentVariables[pair.Key] = pair.Value; 88 | 89 | var process = new Process(); 90 | process.StartInfo = psi; 91 | if (stderr != null) 92 | process.ErrorDataReceived += (sender, e) => { stderr.WriteLine(e.Data); }; 93 | process.Start(); 94 | if (stderr != null) 95 | process.BeginErrorReadLine(); 96 | return process; 97 | } 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/README.txt: -------------------------------------------------------------------------------- 1 | This directory contains all sources that implement the actual FactoredSegmenter functionality. 2 | 3 | The content of this directory is shared between the standalone build in this repo and our production build. -------------------------------------------------------------------------------- /src/SegmenterRuntime.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | using Common.Collections.Extensions; 5 | using Common.MT.Segments; 6 | using Common.Text; 7 | using Microsoft.MT.TextSegmentation.SpanFinder; 8 | using System; 9 | using System.Collections.Generic; 10 | using System.IO; 11 | using System.Linq; 12 | 13 | namespace Microsoft.MT.Common.Tokenization.Segmenter 14 | { 15 | public class SegmenterCoderConfig 16 | { 17 | public SegmenterKind SegmenterKind { get; set; } 18 | public string ModelPath { get; set; } 19 | 20 | // The Equals() function is for the parallel coder so that it can determine whether 21 | // source and target configs are the same. If they are, the parallel coder will only 22 | // instantiate one segmenter and use it for both source and target. 23 | public override bool Equals(object obj) 24 | { 25 | return 26 | obj is SegmenterCoderConfig other && 27 | SegmenterKind == other.SegmenterKind && ModelPath == other.ModelPath; 28 | } 29 | public override int GetHashCode() { return ModelPath.GetHashCode(); } 30 | } 31 | 32 | /// 33 | /// A reference to a segment of raw source text, as used in DecodedSegment.SourceLink 34 | /// 35 | public class EncodedSegmentReference 36 | { 37 | public string RawSourceText; // full raw source string --@TODO: make private if not actually needed public 38 | public int StartIndex; // character coordinates of source token in the raw source string 39 | public int Length; 40 | public bool IsWordTokenStart, IsWordTokenEnd; 41 | public bool IsSpacingWordStart, IsSpacingWordEnd; 42 | public string SurfaceForm => RawSourceText.Substring(StartIndex, Length); 43 | 44 | // is this eligible for alignment links from target tokens? Alignments from Marian to segments for which this is false will be discarded 45 | // in Decode. At present, this will be false for sentence annotation tokens, and true for others.
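// (E.g. a sentence-level annotation pseudo-token such as target_language=ENU is not real source text,
// so its CanBeAlignedTo is false, while ordinary word and punctuation tokens get true.)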
46 | public bool CanBeAlignedTo; 47 | 48 | public override bool Equals(object obj) 49 | { 50 | return 51 | obj is EncodedSegmentReference other && 52 | RawSourceText == other.RawSourceText && StartIndex == other.StartIndex && Length == other.Length && 53 | IsWordTokenStart == other.IsWordTokenStart && IsWordTokenEnd == other.IsWordTokenEnd && 54 | IsSpacingWordStart == other.IsSpacingWordStart && IsSpacingWordEnd == other.IsSpacingWordEnd && 55 | CanBeAlignedTo == other.CanBeAlignedTo; 56 | } 57 | public override int GetHashCode() { return RawSourceText.GetHashCode(); } 58 | // for debugging 59 | public override string ToString() => SurfaceForm; 60 | } 61 | 62 | /// 63 | /// The decoder outputs one of these for each Marian token, and additional ones for reconstructed spaces. 64 | /// The encoder uses this to return the segmentation of the source string. 65 | /// 66 | public struct DecodedSegment : IEquatable<DecodedSegment> 67 | { 68 | public readonly string SurfaceForm; // final plain-text form 69 | public readonly bool IsWordTokenStart, IsWordTokenEnd; 70 | public readonly bool IsSpacingWordStart, IsSpacingWordEnd; 71 | 72 | public struct SourceLink : IEquatable<SourceLink> // for representing alignment information 73 | { 74 | public EncodedSegmentReference SourceSegment; // contains the character alignment 75 | public float Confidence; // @TODO: unit? prob or log prob? 76 | public bool Equals(SourceLink other) 77 | { 78 | return 79 | ((SourceSegment == null) == (other.SourceSegment == null) || 80 | (SourceSegment != null) && SourceSegment.Equals(other.SourceSegment)) && 81 | Confidence == other.Confidence; 82 | } 83 | } 84 | public readonly List<SourceLink> SourceAlignment; // character range(s) (and confidence) of original source string(s) 85 | 86 | /// 87 | /// True if this segment's surface string was set using the DecodeAs mechanism (e.g. for phrasefix, urls, etc) 88 | /// 89 | public bool IsForceDecode { get; set; } 90 | public DecodedSegment(string surfaceForm, bool isWordTokenStart, bool isWordTokenEnd, List<SourceLink> sourceLinks, bool isForceDecode, bool isSpacingWordStart, bool isSpacingWordEnd) 91 | { 92 | SurfaceForm = surfaceForm; 93 | IsWordTokenStart = isWordTokenStart; 94 | IsWordTokenEnd = isWordTokenEnd; 95 | SourceAlignment = sourceLinks; 96 | IsForceDecode = isForceDecode; 97 | IsSpacingWordStart = isSpacingWordStart; 98 | IsSpacingWordEnd = isSpacingWordEnd; 99 | } 100 | public bool Equals(DecodedSegment other) 101 | { 102 | return SurfaceForm == other.SurfaceForm && IsWordTokenStart == other.IsWordTokenStart && IsWordTokenEnd == other.IsWordTokenEnd && 103 | SourceAlignment.NullableSequenceEquals(other.SourceAlignment) && IsForceDecode == other.IsForceDecode && 104 | IsSpacingWordStart == other.IsSpacingWordStart && IsSpacingWordEnd == other.IsSpacingWordEnd; 105 | } 106 | 107 | // for debugging 108 | public override string ToString() => SurfaceForm; 109 | 110 | /// 111 | /// Clone this object, but with a replaced surface form. This is intended to be used for making modifications 112 | /// to surface forms during postprocessing without disrupting alignment or word boundary flags. Examples are 113 | /// ensuring that all question marks in Chinese are full width ('?' rather than '?') or using the newer and 114 | /// more correct form of certain T/S diacritics for Romanian ('Ș' rather than 'Ş'). 115 | /// 116 | /// Using this has the potential to create a situation where some of the assumptions associated with word boundary 117 | /// flags are violated.
For example, IsSpacingWordStart/End can be true between two continuous script segments (e.g. 118 | /// Japanese or Chinese), but would not be true between two spacing script segments in general (e.g. Latin or Cyrillic). 119 | /// If we replace surface forms for two consecutive Japanese segments with Latin strings, we would then have 120 | /// two consecutive Latin segments with IsSpacingWord* set to true, allowing tags and character alignment boundaries 121 | /// to be placed at that boundary. 122 | /// 123 | /// For these reasons, it is much safer to use this to change surface forms within like segment classes (e.g. 124 | /// punctuation, characters within a script, etc). 125 | /// 126 | /// For example, if we wanted to clone a DecodedSegment, questionMarkSeg, whose surface form was "?", but 127 | /// wanted the clone to have a full width question mark, we could use the following code: 128 | /// var fullWidthSeg = questionMarkSeg.WithSurfaceForm("?"); 129 | /// 130 | /// A new surface form that will be given to the clone 131 | /// A clone of this object, but with surface form replaced by specified argument 132 | public DecodedSegment WithSurfaceForm(string newSurfaceForm) 133 | { 134 | return new DecodedSegment( 135 | surfaceForm: newSurfaceForm, 136 | isWordTokenStart: IsWordTokenStart, 137 | isWordTokenEnd: IsWordTokenEnd, 138 | sourceLinks: SourceAlignment, 139 | isForceDecode: IsForceDecode, 140 | isSpacingWordStart: IsSpacingWordStart, 141 | isSpacingWordEnd: IsSpacingWordEnd); 142 | } 143 | } 144 | 145 | /// 146 | /// This is an opaque object returned by Segmenter.Encode that gives instructions for how to replace word classed tokens 147 | /// (e.g. phrasefix) at Segmenter.Decode time. 148 | /// 149 | public interface IDecoderPackage { } // @TODO: find a better name 150 | 151 | /// 152 | /// The result of Encode(), which consists of 153 | /// - tokens in their serialized string form, for use by Marian NMT training 154 | /// - segmentation information, for use in alignment 155 | /// 156 | public abstract class IEncoded 157 | { 158 | /// 159 | /// The original source line of raw plain text that was to be encoded. 160 | /// 161 | public abstract string OriginalSourceText { get; } 162 | /// 163 | /// Source line segments that correspond to the encoded tokens. 164 | /// Each token carries additional word-boundary information. 165 | /// Tokens are in left-to-right order, but possibly with repeats, and may have gaps. 166 | /// This array allows finding the set of segmentation boundaries, for example for 167 | /// training an alignment model or tag manipulations. 168 | /// Do NOT, however, use this to reconstruct the original source line, because: 169 | /// - spaces are not included (since they get elided in encoding) 170 | /// - any replaced ranges (phrase fixes, EncodeAsIf) only have their outer boundaries 171 | /// - if a replaced range gets SentencePiece'd, then we get multiple tokens that each 172 | /// span the full original replaced region 173 | /// Examples: 174 | /// - "abc defg hi" with defg inline-phrase-fixed to XYZ, with SPM-splits de fg and XY Z. 175 | /// Tokens will be something like "abc (( de fg || XY Z )) hi" (factors not shown). 176 | /// Resulting source text segments will be "abc '' '' '' '' defg defg '' hi". 177 | /// (@TODO: A future version may retain de and fg as well.) 178 | /// 179 | public abstract EncodedSegmentReference[] OriginalSourceTextSegments { get; } 180 | /// 181 | /// The encoding result expressed as a sequence of ProcessToken items.
182 | /// 183 | public abstract List<ProcessToken> ProcessedTokens { get; } 184 | /// 185 | /// The encoding result expressed as a sequence of tokens in their serialized (encoded) form. 186 | /// 187 | public abstract IEnumerable<string> TokenStrings { get; } 188 | /// 189 | /// The encoding result expressed as a sequence of tokens in their serialized (encoded) form. 190 | /// This is different from TokenStrings() since e.g. for FactoredSegmenter, the aligner 191 | /// should not receive factors. 192 | /// 193 | public abstract IEnumerable<string> TokenStringsForAligner { get; } 194 | /// 195 | /// Number of tokens. All properties above except OriginalSourceTextSegments return this many items. 196 | /// 197 | public abstract int Count { get; } 198 | /// 199 | /// The original source sentence annotations that were passed to Encode(). 200 | /// 201 | public abstract Dictionary<string, string> OriginalSourceSentenceAnnotations { get; } 202 | /// 203 | /// The result expressed as a single text line; meant for debugger visualization only. 204 | /// 205 | public override string ToString() => " ".JoinItems(TokenStrings); 206 | /// 207 | /// This property holds an opaque package of information that should be passed on to 208 | /// the Decode() function. 209 | /// 210 | public abstract IDecoderPackage DecoderPackage { get; } 211 | } 212 | 213 | /// 214 | /// Result of the Decode() function. Predominantly an array of SegmenterTokens, 215 | /// which carry surface form, boundary flags, and alignment info. 216 | /// 217 | public abstract class IDecoded 218 | { 219 | /// 220 | /// The decoded line as consecutive sub-strings that represent the original tokenization from translation, 221 | /// but with spaces inserted. The decoded line can be formed by straight concatenation of the tokens' SurfaceForm fields. 222 | /// 223 | public abstract DecodedSegment[] Tokens { get; } 224 | /// 225 | /// The final decoded line as raw plain text. Same as concatenating all SegmenterToken[].SurfaceForm 226 | /// 227 | public override string ToString() => "".JoinItems(from token in Tokens select token.SurfaceForm); 228 | } 229 | 230 | /// 231 | /// Base class that is used to invoke segmenters (SentencePiece or FactoredSegmenter). 232 | /// A segmenter is an object that can encode / decode a single language's strings at runtime 233 | /// (we need the parallel segmenter for runtime and training). 234 | /// Such a segmenter does (ideally) not know about the translation process (the parallel segmenter does). 235 | /// 236 | public abstract class SegmenterCoderBase 237 | { 238 | public abstract IEncoded Encode(string line, 239 | List<AnnotatedSpan> annotatedSpans = null, Dictionary<string, string> sourceSentenceAnnotations = null, 240 | int? seed = null); 241 | 242 | /// 243 | /// Decode a line of tokens in Marian-internal string format from in-memory data structures. 244 | /// Spaces are individual tokens in the output. 245 | /// 246 | public abstract IDecoded Decode(IEnumerable<string> encodedTokensFromMT, 247 | Alignment alignmentFromMT, 248 | IDecoderPackage decoderPackage); 249 | 250 | /// 251 | /// Decode a line of tokens in serialized Marian-NMT form, e.g. the result of Marian 252 | /// translation as written to a file. This overload does not support alignments. 253 | /// 254 | public IDecoded Decode(string line, IDecoderPackage decoderPackage = null) 255 | { 256 | return Decode(line.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries).ToList(), null, decoderPackage); 257 | } 258 | 259 | /// 260 | /// Create a SegmenterCoder from a config.
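/// Illustrative usage (hypothetical model path): var coder = SegmenterCoderBase.CreateForKindOf(new SegmenterCoderConfig { SegmenterKind = SegmenterKind.FactoredSegmenter, ModelPath = "model.fsm" });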
The kind of segmenter is determined 261 | /// from the config's actual type. 262 | /// 263 | public static SegmenterCoderBase CreateForKindOf(SegmenterCoderConfig config) 264 | { 265 | if (config == null) 266 | return null; 267 | switch (config.SegmenterKind) 268 | { 269 | case SegmenterKind.FactoredSegmenter: 270 | // @TODO: what do we put into the SegmenterCoderConfig config? Maybe a FactoredSegmenterCoderConfig? 271 | return new FactoredSegmenterCoder(new FactoredSegmenterCoderConfig { ModelPath = config.ModelPath }); 272 | case SegmenterKind.SentencePiece: 273 | default: 274 | throw new NotImplementedException(); 275 | } 276 | } 277 | 278 | // special functions for shortlist generation, for use by PureNeuralTools/lex_trans_to_shortlist 279 | 280 | /// 281 | /// Retrieve the shortlist vocabulary. This is for use by PureNeuralTools/lex_trans_to_shortlist. 282 | /// 283 | public abstract string[] ShortlistVocab { get; } 284 | 285 | /// 286 | /// Transcode a token (in segmenter-encoded form) into the shortlist token (in segmenter-encoded form). 287 | /// 288 | public abstract string TranscodeTokenToShortlist(string token); 289 | 290 | /// 291 | /// Why do we need this flag? During training, we may want to log strings, do additional checks, or fail. 292 | /// However, at runtime -- when running in our cluster -- we have strict requirements. This flag indicates the scenario: if it is set to false (default = true), we cannot log any user strings at runtime. 293 | /// 294 | public bool IsTrainingScenario { get; set; } 295 | 296 | /// 297 | /// Find character spans that are out of vocabulary for the model and cannot be encoded. 298 | /// 299 | public abstract IEnumerable<(int StartIndex, int Length)> FindUnrepresentableSpans(string line); 300 | } 301 | 302 | // Unimplemented version, if we wanted to use raw SentencePiece instead of FactoredSegmenter. We'd need to figure out how to handle tags and other spans. 303 | public class SentencePieceSegmenterCoder : SegmenterCoderBase 304 | { 305 | SentencePieceCoder coder; 306 | 307 | public SentencePieceSegmenterCoder(string modelPath) 308 | { 309 | coder = new SentencePieceCoder(new SentencePieceCoderConfig { SentencePieceModel = SentencePieceModel.Load(modelPath) }); 310 | } 311 | 312 | public override IEncoded Encode(string line, List<AnnotatedSpan> annotatedSpans = null, Dictionary<string, string> sourceSentenceAnnotations = null, int? seed = null) 313 | { 314 | throw new NotImplementedException(); 315 | } 316 | 317 | public override IDecoded Decode( 318 | IEnumerable<string> encodedTokensFromMT, 319 | Alignment alignmentFromMT, 320 | IDecoderPackage decoderPackage) 321 | { 322 | throw new NotImplementedException(); 323 | } 324 | 325 | public override string[] ShortlistVocab { get { throw new NotImplementedException(); } } 326 | 327 | public override string TranscodeTokenToShortlist(string token) 328 | { 329 | throw new NotImplementedException(); 330 | } 331 | 332 | public override IEnumerable<(int StartIndex, int Length)> FindUnrepresentableSpans(string line) 333 | { 334 | throw new NotImplementedException(); 335 | } 336 | } 337 | } -------------------------------------------------------------------------------- /src/SentencePieceConfigs.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license.
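// Illustrative example (not from the original source): constructing a minimal training configuration in code;
// nullable properties that are left unset fall back to the spm_train defaults noted in the comments below.
//   var spmTrainConfig = new SentencePieceTrainConfig { VocabSize = 32000, ModelType = SentencePieceModelType.Unigram, CharacterCoverage = 0.9995 };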
3 | 4 | using System; 5 | using System.Collections.Generic; 6 | using System.Linq; 7 | using System.Text; 8 | using System.Threading.Tasks; 9 | using System.Xml.Serialization; 10 | 11 | namespace Microsoft.MT.Common.Tokenization 12 | { 13 | // types for SentencePiece 14 | public enum SentencePieceModelType 15 | { 16 | [XmlEnum(Name = "unigram")] 17 | Unigram = 0, 18 | [XmlEnum(Name = "bpe")] 19 | Bpe, 20 | [XmlEnum(Name = "word")] 21 | Word, 22 | [XmlEnum(Name = "char")] 23 | Char 24 | } 25 | 26 | public enum SentencePieceNormalizationRuleName 27 | { 28 | [XmlEnum(Name = "nmt_nfkc")] 29 | Nfkc = 0, 30 | [XmlEnum(Name = "identity")] 31 | Identity 32 | } 33 | 34 | // Note: The following cannot be specified by Flo users, as these are under Flo's control. 35 | public enum SentencePieceInputFormat 36 | { 37 | [XmlEnum(Name = "text")] 38 | Text = 0, 39 | [XmlEnum(Name = "tsv")] 40 | Tsv 41 | } 42 | 43 | public enum SentencePieceEncodeFormat 44 | { 45 | [XmlEnum(Name = "piece")] 46 | Piece = 0, 47 | [XmlEnum(Name = "id")] 48 | Id, 49 | [XmlEnum(Name = "proto")] 50 | Proto, 51 | [XmlEnum(Name = "nbest_piece")] 52 | NBest_Piece, 53 | [XmlEnum(Name = "nbest_id")] 54 | NBest_Id, 55 | [XmlEnum(Name = "nbest_proto")] 56 | NBest_Proto 57 | } 58 | 59 | public enum SentencePieceDecodeInputFormat 60 | { 61 | [XmlEnum(Name = "piece")] 62 | Piece = 0, 63 | [XmlEnum(Name = "id")] 64 | Id 65 | } 66 | 67 | public enum SentencePieceDecodeOutputFormat 68 | { 69 | [XmlEnum(Name = "string")] 70 | String = 0, 71 | [XmlEnum(Name = "proto")] 72 | Proto 73 | } 74 | 75 | /// 76 | /// Class to hold all parameters for the SentencePiece training tool. 77 | /// 78 | public class SentencePieceTrainConfig : SegmenterTrainConfigBase, ISentencePieceConfig 79 | { 80 | /// 81 | /// comma-separated list of languages this model can accept 82 | /// 83 | public string AcceptLanguage { get; set; } 84 | /// 85 | /// Add dummy whitespace at the beginning of text ( default: true ) 86 | /// 87 | public bool? AddDummyPrefix { get; set; } 88 | /// 89 | /// Override BOS (<s>) id. Set -1 to disable BOS ( default: -1 ) 90 | /// @BUGBUG: BosId, eosId and UnkId should not be user-specifiable, as they are controlled by Flo 91 | /// 92 | public Int32 BosId { get; set; } = -1; 93 | /// 94 | /// Character coverage to determine the minimum symbols ( default: 0.9995 ) 95 | /// 96 | public double? CharacterCoverage { get; set; } 97 | /// 98 | /// Comma-separated list of control symbols 99 | /// 100 | public string ControlSymbols { get; set; } 101 | /// 102 | /// Override EOS (</s>) id. Set -1 to disable EOS. ( default: 0 ) 103 | /// @BUGBUG: BosId, eosId and UnkId should not be user-specifiable, as they are controlled by Flo 104 | /// 105 | public Int32 EosId { get; set; } = 0; 106 | /// 107 | /// If set to false, --vocab_size is considered as a soft limit. ( default: true ) 108 | /// 109 | public bool? HardVocabLimit { get; set; } 110 | /// 111 | /// Comma-separated list of input sentences 112 | /// 113 | public string input { get; set; } 114 | /// 115 | /// Input format. Supported format is 'text' or 'tsv'. ( default: 'text' ) 116 | /// 117 | public SentencePieceInputFormat? InputFormat { get; set; } 118 | /// 119 | /// Maximum size of sentences the trainer loads ( default: 10000000 ) 120 | /// 121 | public Int32? InputSentenceSize { get; set; } 122 | /// 123 | /// Maximum length of sentence in bytes ( default: 2048) 124 | /// 125 | public Int32?
MaxSentenceLength { get; set; } 126 | /// 127 | /// Maximum length of sentence piece ( default: 16 ) 128 | /// 129 | public Int32? MaxSentencepieceLength { get; set; } 130 | /// 131 | /// Maximum size of sentences to make seed sentence piece ( default: 2000000 ) 132 | /// 133 | public Int32? MiningSentenceSize { get; set; } 134 | /// 135 | /// Output model prefix 136 | /// 137 | public string ModelPrefix { get; set; } 138 | /// 139 | /// Model algorithm: unigram, bpe, word or char ( default: unigram ) 140 | /// 141 | public SentencePieceModelType? ModelType { get; set; } 142 | /// 143 | /// Normalization rule name. Choose from nfkc or identity ( default: nmt_nfkc ) 144 | /// 145 | public SentencePieceNormalizationRuleName? NormalizationRuleName { get; set; } 146 | /// 147 | /// Normalization rule TSV file. 148 | /// 149 | public string NormalizationRuleTsv { get; set; } 150 | /// 151 | /// Number of EM sub-iterations ( default: 2 ) 152 | /// 153 | public Int32? NumSubIterations { get; set; } 154 | /// 155 | /// Number of threads for training ( default: 16 ) 156 | /// 157 | public Int32? NumThreads { get; set; } 158 | /// 159 | /// Override PAD (<pad>) id. Set -1 to disable PAD. ( default: -1 ) 160 | /// 161 | public Int32? PadId { get; set; } 162 | /// 163 | /// Removes leading, trailing, and duplicate internal whitespace ( default: true ) 164 | /// 165 | public bool? RemoveExtraWhitespaces { get; set; } 166 | /// 167 | /// The size of seed sentencepieces ( default: 1000000 ) 168 | /// 169 | public Int32? SeedSentencepieceSize { get; set; } 170 | /// 171 | /// The size of self test samples ( default: 0 ) 172 | /// 173 | public Int32? SelfTestSampleSize { get; set; } 174 | /// 175 | /// Keeps top shrinking_factor pieces with respect to the loss ( default: 0.75 ) 176 | /// 177 | public double? ShrinkingFactor { get; set; } 178 | /// 179 | /// Use Unicode script to split sentence pieces ( default: true ) 180 | /// 181 | public bool? SplitByUnicodeScript { get; set; } 182 | /// 183 | /// Use a white space to split sentence pieces ( default: true ) 184 | /// 185 | public bool? SplitByWhitespace { get; set; } 186 | /// 187 | /// Maximum size of sentences to train sentence pieces ( default: 10000000 ) 188 | /// 189 | public override Int32? TrainingSentenceSize { get; set; } 190 | /// 191 | /// Override UNK (<unk>) id. ( default: 1 ) 192 | /// 193 | public Int32 UnkId { get; set; } = 1; 194 | /// 195 | /// Dummy surface string for <unk>. In decoding <unk> is decoded to `unk_surface`. 196 | /// @BUGBUG: BosId, eosId and UnkId should not be user-specifiable, as they are controlled by Flo 197 | /// 198 | public string UnkSurface { get; set; } 199 | /// 200 | /// If set to true, use all tokens as vocab. Valid for word/char models. ( default: false ) 201 | /// 202 | public bool? UseAllVocab { get; set; } 203 | /// 204 | /// Comma-separated list of user-defined symbols 205 | /// 206 | public string UserDefinedSymbols { get; set; } 207 | /// 208 | /// Vocabulary size ( default: 32000 ) 209 | /// 210 | public int? VocabSize { get; set; } = 32000; 211 | } 212 | 213 | /// 214 | /// Class to hold all parameters for the SentencePiece encoding tool. 215 | /// 216 | public class SentencePieceEncodeConfig : SegmenterEncodeConfigBase, ISentencePieceConfig 217 | { 218 | /// 219 | /// Smoothing parameter for sampling mode ( default: 0.5 ) 220 | /// 221 | public double?
Alpha { get; set; } 222 | /// 223 | /// ':' separated encoder extra options, e.g., "reverse:bos:eos" 224 | /// 225 | public string ExtraOptions { get; set; } 226 | /// 227 | /// Generates vocabulary file instead of segmentation ( default: false ) 228 | /// Internal use only; cannot be specified by Flo user. 229 | /// 230 | public bool? GenerateVocabulary { get; set; } 231 | /// 232 | /// NBest size ( default: 10 ). Only used if OutputFormat is nbest_XXX. 233 | /// 234 | public Int32? NBest_Size { get; set; } 235 | /// 236 | /// choose from piece, id, proto, nbest_piece, nbest_id, or nbest_proto ( default: piece) 237 | /// Internal use only; cannot be specified by Flo user. 238 | /// 239 | public SentencePieceEncodeFormat? OutputFormat { get; set; } 240 | /// 241 | /// Restrict the vocabulary. The encoder only emits the tokens in "vocabulary" file 242 | /// 243 | public string Vocabulary { get; set; } 244 | /// 245 | /// Words with frequency below threshold will be treated as OOV ( default: 0 ) 246 | /// 247 | public Int32? VocabularyThreshold { get; set; } 248 | } 249 | 250 | /// 251 | /// Class to hold all parameters for the SentencePiece decoding tool. 252 | /// 253 | public class SentencePieceDecodeConfig : SegmenterDecodeConfigBase, ISentencePieceConfig 254 | { 255 | /// 256 | /// ':' separated encoder extra options, e.g., "reverse:bos:eos" 257 | /// 258 | public string ExtraOptions { get; set; } 259 | /// 260 | /// choose from piece, id. Default: piece 261 | /// Internal use only; cannot be specified by Flo user. 262 | /// 263 | public SentencePieceDecodeInputFormat? InputFormat { get; set; } 264 | /// 265 | /// choose from string or proto. Default: string 266 | /// Internal use only; cannot be specified by Flo user. 267 | /// 268 | public SentencePieceDecodeOutputFormat? OutputFormat { get; set; } 269 | } 270 | } 271 | -------------------------------------------------------------------------------- /src/SentencePieceManaged.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | // This file is not used in the standalone/Linux build. 
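// Illustrative usage (hypothetical model path and split):
//   var spm = new SentencePieceManaged("spm.model", vocab: null);
//   int[] cuts = spm.GetSplitPoints("unbelievable"); // e.g. [0, 2, 9, 12] for pieces "un" + "believa" + "ble"
//   // a null result means the word was left unsplit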
5 | 6 | using System; 7 | using System.Collections.Generic; 8 | using System.Linq; 9 | using System.Runtime.InteropServices; 10 | 11 | namespace Segmentation 12 | { 13 | public class SentencePieceManaged 14 | { 15 | private readonly IntPtr model; 16 | 17 | private static class NativeMethods 18 | { 19 | private const string DllName = "SentencePieceInterop"; 20 | [DllImport(DllName, CharSet = CharSet.Unicode, CallingConvention = CallingConvention.Cdecl)] 21 | public static extern IntPtr LoadModel(String modelPath, 22 | [MarshalAs(UnmanagedType.LPArray, ArraySubType = UnmanagedType.LPWStr, SizeParamIndex = 2)]String[] vocab, ulong vocabSize); 23 | 24 | [DllImport(DllName, CharSet = CharSet.Unicode, CallingConvention = CallingConvention.Cdecl)] 25 | public static extern int EncodeAsIds(IntPtr model, string word, 26 | [MarshalAs(UnmanagedType.LPArray, ArraySubType = UnmanagedType.I4, SizeParamIndex = 3)]int[] pieceIdBuffer, ulong pieceIdBufferSize); 27 | 28 | [DllImport(DllName, CallingConvention = CallingConvention.Cdecl)] 29 | public static extern int UCS2LengthOfPieceId(IntPtr model, int pieceId); 30 | 31 | [DllImport(DllName, CallingConvention = CallingConvention.Cdecl)] 32 | public static extern void UnloadModel(IntPtr model); 33 | 34 | } 35 | 36 | 37 | public SentencePieceManaged(String modelPath, string[] vocab) 38 | { 39 | ulong vocabLength = (ulong?)vocab?.Length ?? 0UL; 40 | IntPtr local = NativeMethods.LoadModel(modelPath, vocab, (ulong) vocabLength); 41 | if (local == IntPtr.Zero) 42 | throw new ArgumentNullException($"Could not load model file from path {modelPath}"); 43 | 44 | this.model = local; 45 | } 46 | 47 | ~SentencePieceManaged() 48 | { 49 | if (model != IntPtr.Zero) 50 | { 51 | NativeMethods.UnloadModel(this.model); 52 | } 53 | } 54 | 55 | /// 56 | /// This function splits a string (typically a word) into pieces. Instead of returning the pieces, it returns the indices of the split points as an array of integers (including 0 and N). 57 | /// In the frequent case that nothing is split, we instead return null to save a memory allocation. 
58 | /// 59 | /// The word to split 60 | /// Array representing indices where to split the word, including 0 and N, or null which maps to [0,N] 61 | public int[] GetSplitPoints(String segment) 62 | { 63 | if (String.IsNullOrEmpty(segment) || segment.Length == 1) 64 | return null; 65 | int[] pieceIds = new int[segment.Length]; 66 | // break string using SentencePiece library 67 | int size = NativeMethods.EncodeAsIds(model, segment, pieceIds, (ulong)pieceIds.Length); 68 | if(size < 0) 69 | { 70 | throw new InvalidOperationException("SentencePiece returned a negative size array"); 71 | } 72 | 73 | if (size == 1) 74 | { 75 | int length = NativeMethods.UCS2LengthOfPieceId(this.model, pieceIds[0]); 76 | // if it's length 1 and not an UNK token, we return null 77 | if (length != -1) 78 | return null; 79 | } 80 | 81 | // create the array of offsets, by aggregating the lengths of all pieces 82 | int segmentSize = segment.Length; 83 | List<int> cutList = new List<int>(); 84 | cutList.Add(0); // 0 is always included in the cut-list 85 | 86 | bool done = false; 87 | while (!done) // retry loop used in case of unencodable characters 88 | { 89 | done = true; 90 | for (int i = 0; i < size; i++) 91 | { 92 | if (cutList.Last() >= segmentSize) // logic error 93 | throw new InvalidOperationException($"Unexpectedly hit the end while splitting {segment}"); 94 | int pieceId = pieceIds[i]; 95 | int pieceLength = NativeMethods.UCS2LengthOfPieceId(this.model, pieceId); 96 | // handle unknown character 97 | // Unfortunately, SPM just returns a single token for any sequence of unencodable 98 | // characters, without telling us how many source characters it is made up of. 99 | // To work around this, we split off the first char of the token, but then 100 | // call Encode() again with the remaining string. If the remainder consisted of 101 | // more than one unencodable character, the same mechanism will then kick in to split off the next 102 | // char, call Encode() again etc. This has quadratic complexity w.r.t. string length, 103 | // but sequences are short, and this does not happen too frequently. 104 | if (pieceLength == -1) // -1 indicates one or more unknown characters 105 | { 106 | bool skipLow = Char.IsHighSurrogate(segment, cutList.Last()) && cutList.Last() + 2 <= segmentSize; 107 | cutList.Add(cutList.Last() + 1 + (skipLow ?
1 : 0)); // consume it (skip two if surrogate pair) 108 | if (cutList.Last() == segmentSize) // none left 109 | ; 110 | else if (cutList.Last() + 1 == segmentSize) // single char left 111 | cutList.Add(segmentSize); 112 | else // more left: go again with remainder 113 | { 114 | // take the substring from the last cut point to the end 115 | String copySegment = segment.Substring(cutList.Last()); 116 | size = NativeMethods.EncodeAsIds(model, copySegment, pieceIds, (ulong)pieceIds.Length); 117 | if(size < 0) 118 | throw new InvalidOperationException("Substring should use less space than original"); 119 | done = false; 120 | } 121 | 122 | // if we found an unk, break the current loop, and start a new loop over, if there are any characters left 123 | break; 124 | } 125 | // regular case 126 | else 127 | { 128 | cutList.Add(cutList.Last() + pieceLength); 129 | } 130 | } 131 | } 132 | 133 | if (cutList.Last() != segmentSize) 134 | throw new InvalidOperationException("Sentence pieces do not reconstruct original string??"); 135 | return cutList.ToArray(); 136 | } 137 | 138 | public string[] Segment(String line) 139 | { 140 | throw new NotImplementedException(); 141 | } 142 | 143 | public String Unsegment(string[] pieces) 144 | { 145 | throw new NotImplementedException(); 146 | } 147 | } 148 | } 149 | -------------------------------------------------------------------------------- /src/SentencePieceWrapper.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. 2 | // Licensed under the MIT license. 3 | 4 | using System; 5 | using System.Collections.Generic; 6 | using System.IO; 7 | using System.Linq; 8 | using System.Runtime.InteropServices; 9 | using System.Text; 10 | using Common.Collections; 11 | using Common.Collections.Extensions; 12 | using Common.Contracts; 13 | using Common.IO; 14 | using Common.Utils; 15 | 16 | namespace Microsoft.MT.Common.Tokenization 17 | { 18 | /// 19 | /// Wrapper for SentencePiece that supports 20 | /// - training an SPM model via invoking the spm_train executable 21 | /// - encoding of words as pieces via an in-memory object/lambda 22 | /// 23 | public class SentencePieceModel 24 | { 25 | const string spmModelExt = ".model"; // these are required/hard-coded by the spm_train tool 26 | const string spmVocabExt = ".vocab"; 27 | 28 | // model data 29 | public byte[] Bytes { get; } 30 | 31 | /// 32 | /// Construct an SPM model from file. 33 | /// 34 | public static SentencePieceModel Load(string path) 35 | { 36 | return new SentencePieceModel(File.ReadAllBytes(path)); 37 | } 38 | 39 | /// 40 | /// Construct an SPM model from a byte array. 41 | /// 42 | public SentencePieceModel(byte[] modelBlob) 43 | { 44 | Bytes = modelBlob; 45 | } 46 | 47 | /// 48 | /// Construct an SPM model from data; that is, train one. 49 | /// The input is passed as an IEnumerable or a ParallelQuery of lines of raw plain-text. 50 | /// The model is returned as a binary blob (for later use in encoding/decoding). 51 | /// Underneath, this uses the spm_train executable, which needs to store the model as a file. That location is 52 | /// passed in as 'tempSPMModelPath'. These output files are temporary and local to this function, but 53 | /// it is useful to keep them around for diagnostics and debugging; they are not (meant to be) used after this. 54 | /// 'minPieceCount' allows setting a minimum observation count for word pieces.
spm_train does not support this, 55 | /// so we emulate/approximate it by running spm_train twice. 56 | /// 57 | public static SentencePieceModel Train<Enumerable>(Enumerable tokenStrings, string tempSPMModelPath, 58 | SentencePieceTrainConfig spmParams, int minPieceCount, string spmBinDir) 59 | where Enumerable : IEnumerable<string> // using template so we won't lose parallelism (is this needed?) 60 | { 61 | Sanity.Requires(tempSPMModelPath.EndsWith(spmModelExt), $"FactoredSegmenter SentencePiece model path must end in {spmModelExt}"); 62 | var modelPrefix = tempSPMModelPath.Substring(0, tempSPMModelPath.Length - spmModelExt.Length); 63 | 64 | #if false // helper during debugging of final Training stage when models already exist 65 | LoadSPMModelFiles(modelPrefix, out var spmModelBlob, out var spmVocab); 66 | #else 67 | 68 | // write the tokens to a temp file 69 | var tempInputDataPath = modelPrefix + ".data"; 70 | Logger.WriteLine($"FactoredSegmenter: Writing to temp file {tempInputDataPath} for SPM training..."); 71 | AtomicFileWriter.Save(tempInputDataPath, tmpPath => File.WriteAllLines(tmpPath, tokenStrings, new UTF8Encoding())); 72 | // atomic writing allows the impatient user to know when the writing has completed and spm_train has taken over 73 | 74 | // invoke spm_train 75 | SPMTrain(tempInputDataPath, modelPrefix, spmParams, spmBinDir, null); 76 | 77 | // fetch the content of the generated .model and .vocab file into in-memory data structures 78 | // After this, the spm_train-generated files are no longer used; and only kept for debugging purposes. 79 | LoadSPMModelFiles(modelPrefix, out var spmModelBlob, out var spmVocab); 80 | 81 | // enforce minimum piece-count constraint 82 | if (minPieceCount > 1) 83 | { 84 | // encode the SPM training data and count each token's occurrence 85 | Logger.WriteLine($"FactoredSegmenter: Minimum-count constraint ({minPieceCount}), counting SPM tokens..."); 86 | var coder = new SentencePieceCoder(new SentencePieceCoderConfig { SentencePieceModel = new SentencePieceModel(spmModelBlob) }); 87 | var counts = CountEncodedTokens(tempInputDataPath, coder); 88 | File.WriteAllLines(tempSPMModelPath + $".{spmVocab.Length}.counts", // save it for diagnostics only 89 | from kvp in counts orderby -kvp.Value, kvp.Key select $"{kvp.Key}\t{kvp.Value}"); 90 | // count number of SPM vocab items that should be kept (above the threshold or single character which we always keep) 91 | var spmVocabSet = new HashSet<string>(spmVocab); 92 | int adjustedVocabSize = counts.Count(kvp => spmVocabSet.Contains(kvp.Key) && (kvp.Key.Length == 1 || kvp.Value >= minPieceCount)); 93 | // if there are units below the threshold, reduce the SPM vocab size and retrain 94 | if (adjustedVocabSize < spmVocab.Length) 95 | { 96 | Logger.WriteLine($"FactoredSegmenter: Only {adjustedVocabSize} out of {spmVocab.Length} sentence pieces have {minPieceCount} or more observations."
97 |                         $" Retraining SPM model with reduced vocabSize {adjustedVocabSize}");
98 |                     // invoke spm_train a second time
99 |                     SPMTrain(tempInputDataPath, modelPrefix, spmParams, spmBinDir, adjustedVocabSize);
100 |                     LoadSPMModelFiles(modelPrefix, out spmModelBlob, out spmVocab); // reload the new model
101 |                 }
102 |                 // count once again, for diagnostics only
103 |                 Logger.WriteLine($"FactoredSegmenter: Re-counting SPM tokens after reduction to {adjustedVocabSize}...");
104 |                 coder = new SentencePieceCoder(new SentencePieceCoderConfig { SentencePieceModel = new SentencePieceModel(spmModelBlob) });
105 |                 counts = CountEncodedTokens(tempInputDataPath, coder);
106 |                 File.WriteAllLines(tempSPMModelPath + $".{adjustedVocabSize}.counts", // saved for diagnostics only
107 |                     from kvp in counts orderby -kvp.Value, kvp.Key select $"{kvp.Key}\t{kvp.Value}");
108 |             }
109 | 
110 |             // delete the temp file --except if training failed, so the user can double-check what's going on
111 |             // commented out temporarily to aid debugging
112 |             //File.Delete(tempPath);
113 | #endif
114 | 
115 |             return new SentencePieceModel(spmModelBlob);
116 |         }
117 | 
118 |         // helper to count encoded tokens
119 |         private static Dictionary<string, int> CountEncodedTokens(string tempInputDataPath, SentencePieceCoder coder)
120 |         {
121 |             var counts = new Dictionary<string, int>();
122 |             var pieces = from s in File.ReadLines(tempInputDataPath).AsParallel() // note: AsParallel() returns results out of order
123 |                          let cutList = coder.Split(s)
124 |                          from range in (cutList == null) ? new[] { (0, s.Length) } : cutList.Bigrams() // adjacent pairs of cut points delimit one piece each
125 |                          select s.Substring(range.Item1, range.Item2 - range.Item1);
126 |             foreach (var piece in pieces)
127 |             {
128 |                 counts.TryGetValue(piece, out var count);
129 |                 counts[piece] = count + 1;
130 |             }
131 |             return counts;
132 |         }
133 | 
134 |         // invoke the spm_train tool
135 |         // Reads input data from a file, and writes the model and vocab to modelPrefix.model and .vocab, respectively.
136 |         private static void SPMTrain(string inputPath, string modelPrefix, SentencePieceTrainConfig spmParams, string spmBinDir, int? vocabSize)
137 |         {
138 |             // e.g.
139 |             // spm_train \
140 |             //     --input=/philly/wu3/msrmt/fseide/WMT.paracrawl/data/all.paracrawl.8M.norm.$units.ende.sub \
141 |             //     --model_prefix=/philly/wu3/msrmt/fseide/WMT.paracrawl/model/all.paracrawl.8M.norm.$units.ende \
142 |             //     --vocab_size=32000 --character_coverage=1.0 --model_type=unigram --shuffle_input_sentence=false
143 |             string exe = Path.Combine(spmBinDir, "spm_train");
144 |             if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
145 |             {
146 |                 exe = Path.Combine(spmBinDir, "spm_train.exe");
147 |             }
148 |             var args = new List<string> { "--input", inputPath, "--model_prefix", modelPrefix };
149 |             var extraArgs = from extraParam in new Dictionary<string, object>
150 |                             { // @TODO: use a generic Flo method that parses the struct type directly
151 |                                 ["vocab_size"] = vocabSize ?? spmParams.VocabSize,
152 |                                 ["character_coverage"] = spmParams.CharacterCoverage,
153 |                                 ["model_type"] = spmParams.ModelType.ToString().ToLower(),
154 |                                 //["shuffle_input_sentence"] = spmParams.ShuffleInputSentence.ToString().ToLower(), // not supported in the SPM package version used in Flo
155 |                                 ["add_dummy_prefix"] = spmParams.AddDummyPrefix.ToString().ToLower(),
156 |                                 ["normalization_rule_name"] = spmParams.NormalizationRuleName.ToString().ToLower(),
157 |                                 ["split_by_whitespace"] = spmParams.SplitByWhitespace.ToString().ToLower(),
158 |                                 ["remove_extra_whitespaces"] = spmParams.RemoveExtraWhitespaces.ToString().ToLower(),
159 |                                 ["input_sentence_size"] = spmParams.InputSentenceSize,
160 |                                 ["mining_sentence_size"] = spmParams.MiningSentenceSize,
161 |                                 ["training_sentence_size"] = spmParams.TrainingSentenceSize,
162 |                                 ["seed_sentencepiece_size"] = spmParams.SeedSentencepieceSize,
163 |                                 ["max_sentence_length"] = spmParams.MaxSentenceLength
164 |                             }
165 |                             where extraParam.Value != null
166 |                             let val = extraParam.Value.ToString()
167 |                             where val != ""
168 |                             from arg in new string[] { "--" + extraParam.Key, val }
169 |                             select arg; // unroll into the form --arg1 argval1 --arg2 argval2 ...
170 |             args.AddRange(extraArgs);
171 |             var envirVariables = new Dictionary<string, string> { { "LC_ALL", "C" } }; // (not sure if this matters; better safe than sorry)
172 |             ProcessTools.RunCommand(exe, ProcessTools.ArgsToCommandLine(args), null, modelPrefix + ".log", throwOnFailure: true, envirVariables: envirVariables);
173 |         }
174 | 
175 |         // helper to fetch the .model and .vocab files written by SPMTrain above into in-memory variables
176 |         private static void LoadSPMModelFiles(string modelPrefix, out byte[] spmModel, out string[] spmVocab)
177 |         {
178 |             spmModel = File.ReadAllBytes(modelPrefix + spmModelExt);
179 |             spmVocab = (from line in File.ReadAllLines(modelPrefix + spmVocabExt)
180 |                         select line.Split('\t').First()) // the .vocab file has the form "TOKEN\tLOGPROB"; we only want TOKEN
181 |                        .OrderBy(t => t.ToString(), StringComparer.Ordinal) // sort it for neatness
182 |                        .ToArray();
183 |         }
184 |     }
185 | 
186 | 
187 |     public class SentencePieceCoderConfig
188 |     {
189 |         /// <summary>
190 |         /// The underlying native SentencePiece model
191 |         /// </summary>
192 |         public SentencePieceModel SentencePieceModel { get; set; }
193 |         /// <summary>
194 |         /// If set, then SPM will be restricted to the pieces in this set.
195 |         /// We have seen examples where the internal SPM vocab contains a few units
196 |         /// that are not observed when encoding the SPM training set. I suspect this
197 |         /// is because training uses a soft forward-backward method, while the
198 |         /// re-encoding of the SPM training set uses a best path. To circumvent that
199 |         /// situation, we pass in the set of pieces determined by re-encoding the training set.
200 |         /// </summary>
201 |         public HashSet<string> VocabSubset { get; set; } = null;
202 |         /// <summary>
203 |         /// If non-zero, the size of the cache used by Split(). Currently only used for caching word-level splits when called from FactoredSegmenter.
204 |         /// </summary>
205 |         public int SplitCacheSize { get; set; } = 0;
206 |     }
207 | 
208 | 
209 |     /// <summary>
210 |     /// Shallow wrapper over the SentencePieceManaged lib for encoding and decoding,
211 |     /// which follows our design of accepting a corresponding model as an input, and providing
212 |     /// encode and decode functions.
213 |     /// </summary>
214 |     public class SentencePieceCoder
215 |     {
216 |         readonly Segmentation.SentencePieceManaged spm;
217 |         private BoundedSizedLockingCache<string, int[]> m_splitCache;
218 | 
219 |         /// <summary>
220 |         /// Construct a coder from a SentencePieceCoderConfig.
221 |         /// </summary>
222 |         /// <param name="config">Holds the SentencePiece model to delegate calls to, an optional vocabulary subset,
223 |         /// and the size of the cache for calls to Split() (segmentation is very resource-intensive)</param>
224 |         public SentencePieceCoder(SentencePieceCoderConfig config)
225 |         {
226 |             // Save the blob to a file, since the current SentencePieceManaged wrapper can only load the model from a file.
227 |             // @TODO: The SentencePiece native API also supports reading from a std::istream,
228 |             //        so we should pass the blob via a simple istream class that reads from memory, cf.
229 |             //        https://stackoverflow.com/questions/2079912/simpler-way-to-create-a-c-memorystream-from-char-size-t-without-copying-t
230 |             var spmTempModelPath = Path.GetTempFileName();
231 |             File.WriteAllBytes(spmTempModelPath, config.SentencePieceModel.Bytes);
232 |             spm = new Segmentation.SentencePieceManaged(spmTempModelPath, config.VocabSubset?.ToArray());
233 |             File.Delete(spmTempModelPath);
234 |             m_splitCache = new BoundedSizedLockingCache<string, int[]>(config.SplitCacheSize);
235 |         }
236 | 
237 |         /// <summary>
238 |         /// Invoke SPM encode on a text line.
239 |         /// </summary>
240 |         public string[] Encode(string line) => spm.Segment(line);
241 | 
242 |         /// <summary>
243 |         /// Encode a word (or continuous-script segment) and return the result as a list of split points.
244 |         /// E.g. if SPM splits an input word "hello" into "hel" and "lo",
245 |         /// this function returns (0, 3, 5). The result includes the start (0) and the total length.
246 |         /// If the word was not split, it returns null, to save some memory-allocation overhead.
247 |         /// Characters that cannot be represented by the sentence-piece inventory are
248 |         /// returned as individual characters.
249 |         /// This function is not meant to be used with unsegmented input. Its behavior for inputs
250 |         /// that include spaces is not tested or known.
251 |         /// </summary>
252 |         /// <param name="s">Character sequence to split.</param>
253 |         /// <param name="adjustForWordBegPrefix">If true, s has a leading _; subtract 1 from every offset.</param>
254 |         /// <returns>List of split offsets (including 0 and the string length), or null if not split.</returns>
255 |         public int[] Split(string s, bool adjustForWordBegPrefix = false) => CachedFunction.Memoize(m_splitCache, s, x =>
256 |         {
257 |             var cutList = spm.GetSplitPoints(x);
258 |             if (adjustForWordBegPrefix && cutList != null) // the source string had a leading boundary prefix--account for it
259 |                 for (int i = 1; i < cutList.Length; i++)
260 |                     cutList[i]--;
261 |             return cutList;
262 |         });
263 | 
264 |         /// <summary>
265 |         /// Invoke SPM decode on an array of pieces.
266 |         /// </summary>
267 |         public string Decode(string[] pieces) => spm.Unsegment(pieces);
268 |     }
269 | }
270 | 
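271 | // Usage sketch (hypothetical driver code, shown for illustration only; SentencePieceModel.Load,
272 | // SentencePieceCoderConfig, and Split() are the real entry points defined above, and spm/spm.model
273 | // is the model file shipped in this repo; the cache size is an arbitrary example value):
274 | //
275 | //     var coder = new SentencePieceCoder(new SentencePieceCoderConfig {
276 | //         SentencePieceModel = SentencePieceModel.Load("spm/spm.model"),
277 | //         SplitCacheSize = 100000 });
278 | //     int[] cuts = coder.Split("hello");  // e.g. { 0, 3, 5 } if SPM splits "hello" into "hel"+"lo";
279 | //                                         // null if "hello" is kept as a single piece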
--------------------------------------------------------------------------------
/test/FactoredSegmenterScriptHelpersTests.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation.
2 | // Licensed under the MIT license.
3 | 
4 | namespace TextSegmentation.Segmenter.FactoredSegmenter_GitSubmodule.src.Test
5 | {
6 |     using Common.Text;
7 |     using Microsoft.VisualStudio.TestTools.UnitTesting;
8 |     using System.Diagnostics.CodeAnalysis;
9 | 
10 |     /// <summary>
11 |     /// Unit tests
12 |     /// </summary>
13 |     [TestClass]
14 |     [ExcludeFromCodeCoverage]
15 |     public class FactoredSegmenterScriptHelperTests
16 |     {
17 |         [TestMethod]
18 |         public void ScriptEdgeCasesTest()
19 |         {
20 |             // We put cases here to pin down how edge cases are classified (e.g. the Chinese character for 6 (六) is not considered a number by C#).
21 |             // This is less of a regression test and more of a "documentation" of what we think is true for a few edge cases.
22 |             Assert.IsTrue(Unicode.GetUnicodeMajorDesignation('는') == 'L'); // Korean case markers: make sure they are treated just like letters
23 |             Assert.IsTrue(Unicode.GetScript('는') == Unicode.Script.Hangul);
24 |             Assert.IsTrue(Unicode.GetUnicodeMajorDesignation('$') == 'S');
25 |             Assert.IsTrue(Unicode.GetUnicodeMajorDesignation('，') == 'P'); // full-width comma (incl. Chinese)
26 |             Assert.IsTrue(Unicode.GetUnicodeMajorDesignation('、') == 'P'); // Chinese
27 |             Assert.IsTrue(Unicode.GetUnicodeMajorDesignation('。') == 'P'); // Chinese
28 |             Assert.IsTrue(Unicode.GetUnicodeMajorDesignation('।') == 'P'); // Hindi danda
29 |             Assert.IsTrue(Unicode.GetUnicodeMajorDesignation('॥') == 'P'); // Hindi double danda
30 |         }
31 | 
32 |         [TestMethod]
33 |         public void ClassificationEdgeCaseTests()
34 |         {
35 |             // put cases here to pin down how edge cases are classified (e.g. the Chinese character for 6 (六) is not considered a number by C#)
36 |             Assert.IsTrue('A'.HasAndIsUpper());
37 |             Assert.IsTrue('A'.IsBicameral());
38 |             Assert.IsTrue(!'ß'.HasAndIsUpper());
39 |             Assert.IsTrue('1'.IsNumeral());
40 |             Assert.IsTrue('〇'.IsNumeral());
41 |             Assert.IsTrue('○'.IsNumeral()); // medium small white circle; used in Chinese as a zero
42 |             Assert.IsTrue('十'.IsNumeral());
43 |             Assert.IsTrue('六'.IsNumeral());
44 |             Assert.IsTrue('२'.IsNumeral()); // Hindi numeral
45 |             Assert.IsTrue('Ⅹ'.IsNumeral()); // Roman numeral
46 |             Assert.IsTrue('Ⅹ'.HasAndIsUpper()); // Roman numeral--C#'s IsUpper() gets this wrong
47 |         }
48 |     }
49 | }
50 | 
--------------------------------------------------------------------------------
/test/blns/blns_README.txt:
--------------------------------------------------------------------------------
1 | source of blns.txt:
2 | 
3 | https://github.com/minimaxir/big-list-of-naughty-strings/blob/master/blns.txt
4 | 
5 | Latest commit f56ff6e on Nov 16, 2018
6 | 
7 | then manually removed a few SQL-injection strings to avoid malicious use
8 | 
9 | (MIT license)
10 | 
--------------------------------------------------------------------------------
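A note on how this data is typically used: naughty strings like these exercise the segmenter's
robustness. The sketch below shows the kind of round-trip invariant they can check, based on the
Split() contract documented in src/SentencePieceWrapper.cs above; the test method itself is
hypothetical, not code from this repository (it assumes System.IO, System.Linq, and a working
spm/spm.model).

    [TestMethod]
    public void NaughtyStringsSplitInvariantTest() // hypothetical name
    {
        var coder = new SentencePieceCoder(new SentencePieceCoderConfig {
            SentencePieceModel = SentencePieceModel.Load("spm/spm.model") });
        foreach (var line in File.ReadLines("test/blns/blns.txt"))
            foreach (var word in line.Split(' ').Where(w => w.Length > 0)) // Split() expects single words
            {
                int[] cuts = coder.Split(word); // null means "kept as one piece"
                if (cuts == null)
                    continue;
                Assert.AreEqual(0, cuts.First());          // the cut list starts at 0 ...
                Assert.AreEqual(word.Length, cuts.Last()); // ... and ends at word.Length,
            }                                              // i.e. the pieces tile the word
    }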