├── .gitignore ├── BERTTokenizers.sln ├── CHANGELOG.md ├── LICENSE.txt ├── README.md ├── src ├── Assets │ └── logo.png ├── BERTTokenizers.csproj ├── Base │ ├── CasedTokenizer.cs │ ├── TokenizerBase.cs │ ├── Tokens.cs │ └── UncasedTokenizer.cs ├── BertBaseTokenizer.cs ├── BertCasedCustomVocabulary.cs ├── BertGermanTokenizer.cs ├── BertLargeTokenizer.cs ├── BertMultilingualTokenizer.cs ├── BertUncasedBaseTokenizer.cs ├── BertUncasedCustomVocabulary.cs ├── BertUncasedLargeTokenizer.cs ├── Extensions │ └── StringExtension.cs ├── Helpers │ └── VocabularyReader.cs └── Vocabularies │ ├── base_cased.txt │ ├── base_cased_german.txt │ ├── base_cased_large.txt │ ├── base_cased_multilingual.txt │ ├── base_uncased.txt │ └── base_uncased_large.txt └── tests ├── BERTTokenizers.Tests.csproj ├── BertBaseTokenizerShould.cs ├── BertBaseTokenizerUncasedShould.cs ├── BertGermanTokenizerShould.cs ├── BertLargeTokenizerShould.cs ├── BertLargeTokenizerUncasedShould.cs └── BertMultilingualTokenizerShould.cs /.gitignore: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # This .gitignore file was automatically created by Microsoft(R) Visual Studio. 3 | ################################################################################ 4 | 5 | /src/bin 6 | /src/obj 7 | /tests/obj 8 | /tests/bin 9 | /.vs 10 | -------------------------------------------------------------------------------- /BERTTokenizers.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.31410.357 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "BERTTokenizers", "src\BERTTokenizers.csproj", "{23A1F782-E9DF-422F-96DA-10F4D952BD00}" 7 | EndProject 8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "BERTTokenizers.Tests", "tests\BERTTokenizers.Tests.csproj", "{5D2946F0-0C6F-4BA9-9CFC-C9932A8AF9D9}" 9 | EndProject 10 | Global 11 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 12 | Debug|Any CPU = Debug|Any CPU 13 | Release|Any CPU = Release|Any CPU 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {23A1F782-E9DF-422F-96DA-10F4D952BD00}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 17 | {23A1F782-E9DF-422F-96DA-10F4D952BD00}.Debug|Any CPU.Build.0 = Debug|Any CPU 18 | {23A1F782-E9DF-422F-96DA-10F4D952BD00}.Release|Any CPU.ActiveCfg = Release|Any CPU 19 | {23A1F782-E9DF-422F-96DA-10F4D952BD00}.Release|Any CPU.Build.0 = Release|Any CPU 20 | {5D2946F0-0C6F-4BA9-9CFC-C9932A8AF9D9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 21 | {5D2946F0-0C6F-4BA9-9CFC-C9932A8AF9D9}.Debug|Any CPU.Build.0 = Debug|Any CPU 22 | {5D2946F0-0C6F-4BA9-9CFC-C9932A8AF9D9}.Release|Any CPU.ActiveCfg = Release|Any CPU 23 | {5D2946F0-0C6F-4BA9-9CFC-C9932A8AF9D9}.Release|Any CPU.Build.0 = Release|Any CPU 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 | SolutionGuid = {79FDFC75-2E13-4DF9-B610-ADFB1AD1E03E} 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## v1.0.0 4 | 5 | ### Added 
or Changed 6 | - Added this changelog 7 | - Initial implementation 8 | - Added Readme.md -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Othneil Drew 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 9 | 10 | [![Donate][donate-shield]][donate-url] 11 | [![Contributors][contributors-shield]][contributors-url] 12 | [![Forks][forks-shield]][forks-url] 13 | [![Stargazers][stars-shield]][stars-url] 14 | [![Issues][issues-shield]][issues-url] 15 | [![MIT License][license-shield]][license-url] 16 | [![LinkedIn][linkedin-shield]][linkedin-url] 17 | 18 | 19 |
20 |
21 | 22 | Logo 23 | 24 | 25 |

BERTTokenizer for C#

26 | 27 |

28 | Source code of a NuGet package for tokenizing sentences and creating inputs for BERT models.
29 | 
30 | 
31 | [Report Bug](https://github.com/NMZivkovic/BertTokenizers/issues)
32 | ·
33 | [Request Feature](https://github.com/NMZivkovic/BertTokenizers/issues)
34 | 

35 |
36 | 37 | 38 | 39 | 40 |
41 | Table of Contents
42 | 
43 | 1. [About The Project](#about-the-project)
44 |    - [Built With](#built-with)
45 | 2. [Getting Started](#getting-started)
46 |    - [Installation](#installation)
47 | 3. [Usage](#usage)
48 | 4. [License](#license)
49 | 5. [Contact](#contact)
50 | 6. [Acknowledgments](#acknowledgments)
51 | 
62 | 
63 | 
64 | ## About The Project
65 | 
66 | While working with BERT models from Hugging Face in combination with ML.NET, I stumbled upon several challenges.
67 | I documented them [here](https://rubikscode.net/2021/10/25/using-huggingface-transformers-with-ml-net/).
68 | However, the biggest challenge by far was that I needed to implement my own tokenizer and pair it with the correct vocabulary.
69 | So I decided to polish that implementation and publish it as a NuGet package and an open-source project.
70 | More info about this project can be found in this [blog post](https://rubikscode.net/2021/11/01/bert-tokenizers-for-ml-net/).
71 | 
72 | This repository contains tokenizers for the following models:
73 | · BERT Base
74 | · BERT Large
75 | · BERT German
76 | · BERT Multilingual
77 | · BERT Base Uncased
78 | · BERT Large Uncased
79 | 
80 | There are also classes that let you load your own custom vocabulary, as sketched below.
81 | 
82 | 
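For illustration, here is a minimal sketch of loading a custom vocabulary. The path `./my_vocab.txt` is a hypothetical placeholder; the file is expected to contain one token per line, like the files under `src/Vocabularies`:

```csharp
using BERTTokenizers;

// Hypothetical path to your own vocabulary file (one token per line).
var tokenizer = new BertCasedCustomVocabulary("./my_vocab.txt");

// A custom-vocabulary tokenizer is used exactly like the bundled ones.
var tokens = tokenizer.Tokenize("I love you");
```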

(back to top)

83 | 84 | ### Built With 85 | 86 | * [.NET 6](https://dotnet.microsoft.com/download/dotnet/6.0) 87 | 88 |

(back to top)

89 | 
90 | 
91 | ## Getting Started
92 | 
93 | The project is available as a NuGet package.
94 | 
95 | ### Installation
96 | 
97 | To add BERT Tokenizers to your project, use the dotnet CLI:
98 | 
99 | ```sh
100 | dotnet add package BERTTokenizers
101 | ```
102 | 
103 | 
104 | Or install it from the Package Manager Console:
105 | 
106 | ```powershell
107 | Install-Package BERTTokenizers
108 | ```
109 | 
110 | 
111 | ## Usage
112 | 
113 | For example, say you want to use a Hugging Face BERT Base model whose input is defined like this:
114 | 
115 | ```csharp
116 | 
117 | public class BertInput
118 | {
119 |     [VectorType(1, 256)]
120 |     [ColumnName("input_ids")]
121 |     public long[] InputIds { get; set; }
122 | 
123 |     [VectorType(1, 256)]
124 |     [ColumnName("attention_mask")]
125 |     public long[] AttentionMask { get; set; }
126 | 
127 |     [VectorType(1, 256)]
128 |     [ColumnName("token_type_ids")]
129 |     public long[] TypeIds { get; set; }
130 | }
131 | 
132 | ```
133 | 
134 | To build this input, you need to encode sentences like this:
135 | 
136 | ```csharp
137 | 
138 | var sentence = "I love you";
139 | 
140 | var tokenizer = new BertBaseTokenizer();
141 | 
142 | var encoded = tokenizer.Encode(256, sentence);
143 | 
144 | var bertInput = new BertInput()
145 | {
146 |     InputIds = encoded.Select(t => t.InputIds).ToArray(),
147 |     AttentionMask = encoded.Select(t => t.AttentionMask).ToArray(),
148 |     TypeIds = encoded.Select(t => t.TokenTypeIds).ToArray()
149 | };
150 | 
151 | ```
152 | 
153 | _For more examples, please refer to this [Blog Post](https://rubikscode.net/2021/11/01/bert-tokenizers-for-ml-net/)_
154 | 
155 | See the [open issues](https://github.com/NMZivkovic/BertTokenizers/issues) for a full list of proposed features (and known issues).
156 | 
157 | 
158 | 
159 | ## Contributing
160 | 
161 | Contributions are what make the open source community such an amazing place to learn, inspire, and create. Any contributions you make are **greatly appreciated**.
162 | 
163 | If you have a suggestion that would make this better, please fork the repo and create a pull request. You can also simply open an issue with the tag "enhancement".
164 | Don't forget to give the project a star! Thanks again!
165 | 
166 | 1. Fork the Project
167 | 2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`)
168 | 3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`)
169 | 4. Push to the Branch (`git push origin feature/AmazingFeature`)
170 | 5. Open a Pull Request
171 | 
172 | 
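If your change touches the tokenizers themselves, please run the test suite before opening the pull request. Assuming you are in the repository root, the standard .NET CLI invocation is:

```sh
dotnet test
```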

(back to top)

173 | 174 | 175 | 176 | 177 | ## License 178 | 179 | Distributed under the MIT License. See `LICENSE.txt` for more information. 180 | 181 |

(back to top)

182 | 183 | 184 | 185 | 186 | ## Contact 187 | 188 | Nikola M. Zivkovic
189 | n.zivkovic@rubikscode.net
190 | [LinkedIn](https://www.linkedin.com/in/nmzivkovic/)
191 | [@NMZivkovic](https://twitter.com/NMZivkovic)
192 | 193 |

(back to top)

194 | 195 | 196 | ## Acknowledgments 197 | 198 | * Gianluca Bertani - Performance Improvements 199 | * [Paul Calot](https://github.com/PaulCalot) - First Token bugfix 200 | 201 |

(back to top)

202 | 203 | 204 | 205 | [contributors-shield]: https://img.shields.io/github/contributors/NMZivkovic/BertTokenizers.svg?style=for-the-badge 206 | [contributors-url]: https://github.com/NMZivkovic/BertTokenizers/graphs/contributors 207 | [donate-shield]: https://img.shields.io/badge/Donate-!-555?style=for-the-badge 208 | [donate-url]: https://www.paypal.com/paypalme/rubikscode 209 | [forks-shield]: https://img.shields.io/github/forks/NMZivkovic/BertTokenizers.svg?style=for-the-badge 210 | [forks-url]: https://github.com/NMZivkovic/BertTokenizers/network/members 211 | [stars-shield]: https://img.shields.io/github/stars/NMZivkovic/BertTokenizers.svg?style=for-the-badge 212 | [stars-url]: https://github.com/NMZivkovic/BertTokenizers/stargazers 213 | [issues-shield]: https://img.shields.io/github/issues/NMZivkovic/BertTokenizers.svg?style=for-the-badge 214 | [issues-url]: https://github.com/NMZivkovic/BertTokenizers/issues 215 | [license-shield]: https://img.shields.io/github/license/NMZivkovic/BertTokenizers.svg?style=for-the-badge 216 | [license-url]: https://github.com/NMZivkovic/BertTokenizers/blob/master/LICENSE.txt 217 | [linkedin-shield]: https://img.shields.io/badge/-LinkedIn-black.svg?style=for-the-badge&logo=linkedin&colorB=555 218 | [linkedin-url]: https://www.linkedin.com/in/nmzivkovic/ 219 | -------------------------------------------------------------------------------- /src/Assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NMZivkovic/BertTokenizers/150e40a178902bd258d4c9986dc1485c25c404b3/src/Assets/logo.png -------------------------------------------------------------------------------- /src/BERTTokenizers.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net6.0;net5.0 5 | Nikola M. Zivkovic 6 | Rubik's Code (rubikscode.net) 7 | Rubik's Code 8 | logo.png 9 | 10 | S 11 | 12 | https://github.com/NMZivkovic/BertTokenizers 13 | https://github.com/NMZivkovic/BertTokenizers 14 | LICENSE.txt 15 | This package contains tokenizers for following models: 16 | · BERT Base 17 | · BERT Large 18 | · BERT German 19 | · BERT Multilingual 20 | · BERT Base Uncased 21 | · BERT Large Uncased 22 | Open-source project for BERT tokenizers that can be used in C#. 
23 | BERT, Tokenizer, charp, dotnet 24 | 1.2.0 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | Always 39 | Always 40 | true 41 | 42 | 43 | Always 44 | Always 45 | true 46 | 47 | 48 | Always 49 | Always 50 | true 51 | 52 | 53 | Always 54 | Always 55 | true 56 | 57 | 58 | Always 59 | Always 60 | true 61 | 62 | 63 | Always 64 | Always 65 | true 66 | 67 | 68 | 69 | 70 | 71 | True 72 | 73 | 74 | 75 | True 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /src/Base/CasedTokenizer.cs: -------------------------------------------------------------------------------- 1 | using BERTTokenizers.Extensions; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | 6 | namespace BERTTokenizers.Base 7 | { 8 | public abstract class CasedTokenizer : TokenizerBase 9 | { 10 | protected CasedTokenizer(string vocabularyFilePath) : base(vocabularyFilePath) { } 11 | 12 | protected override IEnumerable TokenizeSentence(string text) 13 | { 14 | return text.Split(new string[] { " ", " ", "\r\n" }, StringSplitOptions.None) 15 | .SelectMany(o => o.SplitAndKeep(".,;:\\/?!#$%()=+-*\"'–_`<>&^@{}[]|~'".ToArray())); 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/Base/TokenizerBase.cs: -------------------------------------------------------------------------------- 1 | using BERTTokenizers.Helpers; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Text.RegularExpressions; 6 | 7 | namespace BERTTokenizers.Base 8 | { 9 | public abstract class TokenizerBase 10 | { 11 | protected readonly List _vocabulary; 12 | protected readonly Dictionary _vocabularyDict; 13 | 14 | public TokenizerBase(string vocabularyFilePath) 15 | { 16 | _vocabulary = VocabularyReader.ReadFile(vocabularyFilePath); 17 | 18 | _vocabularyDict = new Dictionary(); 19 | for (int i = 0; i < _vocabulary.Count; i++) 20 | _vocabularyDict[_vocabulary[i]] = i; 21 | } 22 | 23 | 24 | public List<(long InputIds, long TokenTypeIds, long AttentionMask)> Encode(int sequenceLength, params string[] texts) 25 | { 26 | var tokens = Tokenize(texts); 27 | 28 | var padding = Enumerable.Repeat(0L, sequenceLength - tokens.Count).ToList(); 29 | 30 | var tokenIndexes = tokens.Select(token => (long)token.VocabularyIndex).Concat(padding).ToArray(); 31 | var segmentIndexes = tokens.Select(token => token.SegmentIndex).Concat(padding).ToArray(); 32 | var inputMask = tokens.Select(o => 1L).Concat(padding).ToArray(); 33 | 34 | var output = tokenIndexes.Zip(segmentIndexes, Tuple.Create) 35 | .Zip(inputMask, (t, z) => Tuple.Create(t.Item1, t.Item2, z)); 36 | 37 | return output.Select(x => (InputIds: x.Item1, TokenTypeIds: x.Item2, AttentionMask:x.Item3)).ToList(); 38 | } 39 | 40 | public string IdToToken(int id) 41 | { 42 | return _vocabulary[id]; 43 | } 44 | 45 | public List Untokenize(List tokens) 46 | { 47 | var currentToken = string.Empty; 48 | var untokens = new List(); 49 | tokens.Reverse(); 50 | 51 | tokens.ForEach(token => 52 | { 53 | if (token.StartsWith("##")) 54 | { 55 | currentToken = token.Replace("##", "") + currentToken; 56 | } 57 | else 58 | { 59 | currentToken = token + currentToken; 60 | untokens.Add(currentToken); 61 | currentToken = string.Empty; 62 | } 63 | }); 64 | 65 | untokens.Reverse(); 66 | 67 | return untokens; 68 | } 69 | 70 | public List<(string Token, int VocabularyIndex, long SegmentIndex)> Tokenize(params string[] texts) 71 | { 72 | 
IEnumerable tokens = new string[] { Tokens.Classification }; 73 | 74 | foreach (var text in texts) 75 | { 76 | tokens = tokens.Concat(TokenizeSentence(text)); 77 | tokens = tokens.Concat(new string[] { Tokens.Separation }); 78 | } 79 | 80 | var tokenAndIndex = tokens 81 | .SelectMany(TokenizeSubwords) 82 | .ToList(); 83 | 84 | var segmentIndexes = SegmentIndex(tokenAndIndex); 85 | 86 | return tokenAndIndex.Zip(segmentIndexes, (tokenindex, segmentindex) 87 | => (tokenindex.Token, tokenindex.VocabularyIndex, segmentindex)).ToList(); 88 | } 89 | 90 | private IEnumerable SegmentIndex(List<(string token, int index)> tokens) 91 | { 92 | var segmentIndex = 0; 93 | var segmentIndexes = new List(); 94 | 95 | foreach (var (token, index) in tokens) 96 | { 97 | segmentIndexes.Add(segmentIndex); 98 | 99 | if (token == Tokens.Separation) 100 | { 101 | segmentIndex++; 102 | } 103 | } 104 | 105 | return segmentIndexes; 106 | } 107 | 108 | private IEnumerable<(string Token, int VocabularyIndex)> TokenizeSubwords(string word) 109 | { 110 | if (_vocabularyDict.ContainsKey(word)) 111 | { 112 | return new (string, int)[] { (word, _vocabularyDict[word]) }; 113 | } 114 | 115 | var tokens = new List<(string, int)>(); 116 | var remaining = word; 117 | 118 | while (!string.IsNullOrEmpty(remaining) && remaining.Length > 2) 119 | { 120 | string prefix = null; 121 | int subwordLength = remaining.Length; 122 | while (subwordLength >= 1) // was initially 2, which prevents using "character encoding" 123 | { 124 | string subword = remaining.Substring(0, subwordLength); 125 | if (!_vocabularyDict.ContainsKey(subword)) 126 | { 127 | subwordLength--; 128 | continue; 129 | } 130 | 131 | prefix = subword; 132 | break; 133 | } 134 | 135 | if (prefix == null) 136 | { 137 | tokens.Add((Tokens.Unknown, _vocabularyDict[Tokens.Unknown])); 138 | 139 | return tokens; 140 | } 141 | 142 | var regex = new Regex(prefix); 143 | remaining = regex.Replace(remaining, "##", 1); 144 | 145 | tokens.Add((prefix, _vocabularyDict[prefix])); 146 | } 147 | 148 | if (!string.IsNullOrWhiteSpace(word) && !tokens.Any()) 149 | { 150 | tokens.Add((Tokens.Unknown, _vocabularyDict[Tokens.Unknown])); 151 | } 152 | 153 | return tokens; 154 | } 155 | 156 | protected abstract IEnumerable TokenizeSentence(string text); 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /src/Base/Tokens.cs: -------------------------------------------------------------------------------- 1 | namespace BERTTokenizers.Base 2 | { 3 | public class Tokens 4 | { 5 | public const string Padding = ""; 6 | public const string Unknown = "[UNK]"; 7 | public const string Classification = "[CLS]"; 8 | public const string Separation = "[SEP]"; 9 | public const string Mask = "[MASK]"; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/Base/UncasedTokenizer.cs: -------------------------------------------------------------------------------- 1 | using BERTTokenizers.Extensions; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | 6 | namespace BERTTokenizers.Base 7 | { 8 | public abstract class UncasedTokenizer : TokenizerBase 9 | { 10 | protected UncasedTokenizer(string vocabularyFilePath) : base(vocabularyFilePath) 11 | { 12 | } 13 | 14 | protected override IEnumerable TokenizeSentence(string text) 15 | { 16 | return text.Split(new string[] { " ", " ", "\r\n" }, StringSplitOptions.None) 17 | .SelectMany(o => 
o.SplitAndKeep(".,;:\\/?!#$%()=+-*\"'–_`<>&^@{}[]|~'".ToArray()))
18 |                 .Select(o => o.ToLower());
19 |         }
20 |     }
21 | }
22 | 
--------------------------------------------------------------------------------
/src/BertBaseTokenizer.cs:
--------------------------------------------------------------------------------
 1 | using BERTTokenizers.Base;
 2 | 
 3 | namespace BERTTokenizers
 4 | {
 5 |     public class BertBaseTokenizer : CasedTokenizer
 6 |     {
 7 |         public BertBaseTokenizer() : base("./Vocabularies/base_cased.txt")
 8 |         {
 9 |         }
10 |     }
11 | }
12 | 
--------------------------------------------------------------------------------
/src/BertCasedCustomVocabulary.cs:
--------------------------------------------------------------------------------
 1 | using BERTTokenizers.Base;
 2 | 
 3 | namespace BERTTokenizers
 4 | {
 5 |     public class BertCasedCustomVocabulary : CasedTokenizer
 6 |     {
 7 |         public BertCasedCustomVocabulary(string vocabularyFilePath) : base(vocabularyFilePath) { }
 8 | 
 9 |     }
10 | 
11 | }
12 | 
--------------------------------------------------------------------------------
/src/BertGermanTokenizer.cs:
--------------------------------------------------------------------------------
 1 | using BERTTokenizers.Base;
 2 | 
 3 | namespace BERTTokenizers
 4 | {
 5 |     public class BertGermanTokenizer : CasedTokenizer
 6 |     {
 7 |         public BertGermanTokenizer() : base("./Vocabularies/base_cased_german.txt")
 8 |         {
 9 |         }
10 |     }
11 | }
12 | 
--------------------------------------------------------------------------------
/src/BertLargeTokenizer.cs:
--------------------------------------------------------------------------------
 1 | using BERTTokenizers.Base;
 2 | 
 3 | namespace BERTTokenizers
 4 | {
 5 |     public class BertLargeTokenizer : CasedTokenizer
 6 |     {
 7 |         public BertLargeTokenizer() : base("./Vocabularies/base_cased_large.txt")
 8 |         {
 9 |         }
10 |     }
11 | }
12 | 
--------------------------------------------------------------------------------
/src/BertMultilingualTokenizer.cs:
--------------------------------------------------------------------------------
 1 | using BERTTokenizers.Base;
 2 | 
 3 | namespace BERTTokenizers
 4 | {
 5 |     public class BertMultilingualTokenizer : CasedTokenizer
 6 |     {
 7 |         public BertMultilingualTokenizer() : base("./Vocabularies/base_cased_multilingual.txt")
 8 |         {
 9 |         }
10 |     }
11 | }
12 | 
--------------------------------------------------------------------------------
/src/BertUncasedBaseTokenizer.cs:
--------------------------------------------------------------------------------
 1 | using BERTTokenizers.Base;
 2 | 
 3 | namespace BERTTokenizers
 4 | {
 5 |     public class BertUncasedBaseTokenizer : UncasedTokenizer
 6 |     {
 7 |         public BertUncasedBaseTokenizer() : base("./Vocabularies/base_uncased.txt")
 8 |         {
 9 |         }
10 |     }
11 | }
12 | 
--------------------------------------------------------------------------------
/src/BertUncasedCustomVocabulary.cs:
--------------------------------------------------------------------------------
 1 | using BERTTokenizers.Base;
 2 | 
 3 | namespace BERTTokenizers
 4 | {
 5 |     public class BertUncasedCustomVocabulary : UncasedTokenizer
 6 |     {
 7 |         public BertUncasedCustomVocabulary(string vocabularyFilePath) : base(vocabularyFilePath) { }
 8 | 
 9 |     }
10 | }
11 | 
--------------------------------------------------------------------------------
/src/BertUncasedLargeTokenizer.cs:
--------------------------------------------------------------------------------
 1 | using BERTTokenizers.Base;
 2 | 
 3 | namespace BERTTokenizers
 4 | {
 5 |     public class BertUncasedLargeTokenizer : UncasedTokenizer
 6 |     {
 7 | 
public BertUncasedLargeTokenizer() : base("./Vocabularies/base_uncased_large.txt") 8 | { 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/Extensions/StringExtension.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Threading.Tasks; 6 | 7 | namespace BERTTokenizers.Extensions 8 | { 9 | static class StringExtension 10 | { 11 | public static IEnumerable SplitAndKeep( 12 | this string inputString, params char[] delimiters) 13 | { 14 | int start = 0, index; 15 | 16 | while ((index = inputString.IndexOfAny(delimiters, start)) != -1) 17 | { 18 | if (index - start > 0) 19 | yield return inputString.Substring(start, index - start); 20 | 21 | yield return inputString.Substring(index, 1); 22 | 23 | start = index + 1; 24 | } 25 | 26 | if (start < inputString.Length) 27 | { 28 | yield return inputString.Substring(start); 29 | } 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/Helpers/VocabularyReader.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.IO; 3 | 4 | namespace BERTTokenizers.Helpers 5 | { 6 | public class VocabularyReader 7 | { 8 | public static List ReadFile(string filename) 9 | { 10 | var result = new List(); 11 | 12 | using (var reader = new StreamReader(filename)) 13 | { 14 | string line; 15 | 16 | while ((line = reader.ReadLine()) != null) 17 | { 18 | if (!string.IsNullOrWhiteSpace(line)) 19 | { 20 | result.Add(line); 21 | } 22 | } 23 | } 24 | 25 | return result; 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /tests/BERTTokenizers.Tests.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net6.0 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | all 13 | runtime; build; native; contentfiles; analyzers; buildtransitive 14 | 15 | 16 | all 17 | runtime; build; native; contentfiles; analyzers; buildtransitive 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /tests/BertBaseTokenizerShould.cs: -------------------------------------------------------------------------------- 1 | using BERTTokenizers; 2 | using System.Collections.Generic; 3 | using Xunit; 4 | 5 | namespace BERTTokenizersTests 6 | { 7 | public class BertBaseTokenizerShould 8 | { 9 | private BertBaseTokenizer _tokenizer; 10 | 11 | public BertBaseTokenizerShould() 12 | { 13 | _tokenizer = new BertBaseTokenizer(); 14 | } 15 | 16 | [Fact] 17 | public void Tokenize_sentence() 18 | { 19 | var sentence = "I love you"; 20 | 21 | var tokens = _tokenizer.Tokenize(sentence); 22 | Assert.Equal(5, tokens.Count); 23 | Assert.Equal(("[CLS]", 101, 0), tokens[0]); 24 | Assert.Equal(("I", 146, 0), tokens[1]); 25 | Assert.Equal(("love", 1567, 0), tokens[2]); 26 | Assert.Equal(("you", 1128, 0), tokens[3]); 27 | Assert.Equal(("[SEP]", 102, 0), tokens[4]); 28 | 29 | } 30 | 31 | [Fact] 32 | public void Encode_sentence() 33 | { 34 | var sentence = "I love you"; 35 | 36 | var encoded = _tokenizer.Encode(6, sentence); 37 | Assert.Equal(6, encoded.Count); 38 | Assert.Equal((101, 0, 1), encoded[0]); 39 | Assert.Equal((146, 0, 1), encoded[1]); 40 | Assert.Equal((1567, 0, 1), encoded[2]); 41 | Assert.Equal((1128, 0, 1), 
encoded[3]); 42 | Assert.Equal((102, 0, 1), encoded[4]); 43 | Assert.Equal((0, 0, 0), encoded[5]); 44 | } 45 | 46 | [Fact] 47 | public void Unokenize_sentence() 48 | { 49 | var tokens = new List(){ "she", "##s" }; 50 | 51 | var sentence = _tokenizer.Untokenize(tokens); 52 | Assert.Single(sentence); 53 | Assert.Equal("shes", sentence[0]); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /tests/BertBaseTokenizerUncasedShould.cs: -------------------------------------------------------------------------------- 1 | using BERTTokenizers; 2 | using System.Collections.Generic; 3 | using Xunit; 4 | 5 | namespace BERTTokenizersTests 6 | { 7 | public class BertBaseTokenizerUncasedShould 8 | { 9 | private BertUncasedBaseTokenizer _tokenizer; 10 | 11 | public BertBaseTokenizerUncasedShould() 12 | { 13 | _tokenizer = new BertUncasedBaseTokenizer(); 14 | } 15 | 16 | [Fact] 17 | public void Tokenize_sentence() 18 | { 19 | var sentence = "I love you"; 20 | 21 | var tokens = _tokenizer.Tokenize(sentence); 22 | Assert.Equal(5, tokens.Count); 23 | Assert.Equal(("[CLS]", 101, 0), tokens[0]); 24 | Assert.Equal(("i", 1045, 0), tokens[1]); 25 | Assert.Equal(("love", 2293, 0), tokens[2]); 26 | Assert.Equal(("you", 2017, 0), tokens[3]); 27 | Assert.Equal(("[SEP]", 102, 0), tokens[4]); 28 | 29 | } 30 | 31 | [Fact] 32 | public void Encode_admin_example() 33 | { 34 | var sentence = "Joe is an admin"; 35 | 36 | var encoded = _tokenizer.Encode(8, sentence); 37 | Assert.Equal(8, encoded.Count); 38 | Assert.Equal((101, 0, 1), encoded[0]); 39 | Assert.Equal((3533, 0, 1), encoded[1]); 40 | Assert.Equal((2003, 0, 1), encoded[2]); 41 | Assert.Equal((2019, 0, 1), encoded[3]); 42 | Assert.Equal((4748, 0, 1), encoded[4]); 43 | Assert.Equal((10020, 0, 1), encoded[5]); 44 | Assert.Equal((102, 0, 1), encoded[6]); 45 | Assert.Equal((0, 0, 0), encoded[7]); 46 | } 47 | 48 | [Fact] 49 | public void Encode_sentence() 50 | { 51 | var sentence = "I love you"; 52 | 53 | var encoded = _tokenizer.Encode(6, sentence); 54 | Assert.Equal(6, encoded.Count); 55 | Assert.Equal((101, 0, 1), encoded[0]); 56 | Assert.Equal((1045, 0, 1), encoded[1]); 57 | Assert.Equal((2293, 0, 1), encoded[2]); 58 | Assert.Equal((2017, 0, 1), encoded[3]); 59 | Assert.Equal((102, 0, 1), encoded[4]); 60 | Assert.Equal((0, 0, 0), encoded[5]); 61 | } 62 | 63 | [Fact] 64 | public void Unokenize_sentence() 65 | { 66 | var tokens = new List(){ "she", "##s" }; 67 | 68 | var sentence = _tokenizer.Untokenize(tokens); 69 | Assert.Single(sentence); 70 | Assert.Equal("shes", sentence[0]); 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /tests/BertGermanTokenizerShould.cs: -------------------------------------------------------------------------------- 1 | using BERTTokenizers; 2 | using System.Collections.Generic; 3 | using Xunit; 4 | 5 | namespace BERTTokenizersTests 6 | { 7 | public class BertGermanTokenizerShould 8 | { 9 | private BertGermanTokenizer _tokenizer; 10 | 11 | public BertGermanTokenizerShould() 12 | { 13 | _tokenizer = new BertGermanTokenizer(); 14 | } 15 | 16 | [Fact] 17 | public void Encode_sentence() 18 | { 19 | var sentence = "Ich liebe dich"; 20 | 21 | var encoded = _tokenizer.Encode(6, sentence); 22 | Assert.Equal(6, encoded.Count); 23 | Assert.Equal((102, 0, 1), encoded[0]); 24 | Assert.Equal((395, 0, 1), encoded[1]); 25 | Assert.Equal((6230, 0, 1), encoded[2]); 26 | Assert.Equal((1199, 0, 1), encoded[3]); 27 | Assert.Equal((103, 0, 1), 
encoded[4]); 28 | Assert.Equal((0, 0, 0), encoded[5]); 29 | } 30 | 31 | [Fact] 32 | public void Tokenize_sentence() 33 | { 34 | var sentence = "Ich liebe dich"; 35 | 36 | var tokens = _tokenizer.Tokenize(sentence); 37 | Assert.Equal(5, tokens.Count); 38 | Assert.Equal(("[CLS]", 102, 0), tokens[0]); 39 | Assert.Equal(("Ich", 395, 0), tokens[1]); 40 | Assert.Equal(("liebe", 6230, 0), tokens[2]); 41 | Assert.Equal(("dich", 1199, 0), tokens[3]); 42 | Assert.Equal(("[SEP]", 103, 0), tokens[4]); 43 | 44 | } 45 | 46 | [Fact] 47 | public void Unokenize_sentence() 48 | { 49 | var tokens = new List(){ "she", "##s" }; 50 | 51 | var sentence = _tokenizer.Untokenize(tokens); 52 | Assert.Single(sentence); 53 | Assert.Equal("shes", sentence[0]); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /tests/BertLargeTokenizerShould.cs: -------------------------------------------------------------------------------- 1 | using BERTTokenizers; 2 | using System.Collections.Generic; 3 | using Xunit; 4 | 5 | namespace BERTTokenizersTests 6 | { 7 | public class BertLargeTokenizerShould 8 | { 9 | private BertLargeTokenizer _tokenizer; 10 | 11 | public BertLargeTokenizerShould() 12 | { 13 | _tokenizer = new BertLargeTokenizer(); 14 | } 15 | 16 | [Fact] 17 | public void Encode_sentence() 18 | { 19 | var sentence = "I love you"; 20 | 21 | var encoded = _tokenizer.Encode(6, sentence); 22 | Assert.Equal(6, encoded.Count); 23 | Assert.Equal((101, 0, 1), encoded[0]); 24 | Assert.Equal((146, 0, 1), encoded[1]); 25 | Assert.Equal((1567, 0, 1), encoded[2]); 26 | Assert.Equal((1128, 0, 1), encoded[3]); 27 | Assert.Equal((102, 0, 1), encoded[4]); 28 | Assert.Equal((0, 0, 0), encoded[5]); 29 | } 30 | 31 | [Fact] 32 | public void Tokenize_sentence() 33 | { 34 | var sentence = "I love you"; 35 | 36 | var tokens = _tokenizer.Tokenize(sentence); 37 | Assert.Equal(5, tokens.Count); 38 | Assert.Equal(("[CLS]", 101, 0), tokens[0]); 39 | Assert.Equal(("I", 146, 0), tokens[1]); 40 | Assert.Equal(("love", 1567, 0), tokens[2]); 41 | Assert.Equal(("you", 1128, 0), tokens[3]); 42 | Assert.Equal(("[SEP]", 102, 0), tokens[4]); 43 | 44 | } 45 | 46 | [Fact] 47 | public void Unokenize_sentence() 48 | { 49 | var tokens = new List(){ "she", "##s" }; 50 | 51 | var sentence = _tokenizer.Untokenize(tokens); 52 | Assert.Single(sentence); 53 | Assert.Equal("shes", sentence[0]); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /tests/BertLargeTokenizerUncasedShould.cs: -------------------------------------------------------------------------------- 1 | using BERTTokenizers; 2 | using System.Collections.Generic; 3 | using Xunit; 4 | 5 | namespace BERTTokenizersTests 6 | { 7 | public class BertLargeTokenizerUncasedShould 8 | { 9 | private BertUncasedLargeTokenizer _tokenizer; 10 | 11 | public BertLargeTokenizerUncasedShould() 12 | { 13 | _tokenizer = new BertUncasedLargeTokenizer(); 14 | } 15 | 16 | [Fact] 17 | public void Tokenize_sentence() 18 | { 19 | var sentence = "I love you"; 20 | 21 | var tokens = _tokenizer.Tokenize(sentence); 22 | Assert.Equal(5, tokens.Count); 23 | Assert.Equal(("[CLS]", 101, 0), tokens[0]); 24 | Assert.Equal(("i", 1045, 0), tokens[1]); 25 | Assert.Equal(("love", 2293, 0), tokens[2]); 26 | Assert.Equal(("you", 2017, 0), tokens[3]); 27 | Assert.Equal(("[SEP]", 102, 0), tokens[4]); 28 | 29 | } 30 | 31 | [Fact] 32 | public void Encode_sentence() 33 | { 34 | var sentence = "I love you"; 35 | 36 | var encoded = 
_tokenizer.Encode(6, sentence); 37 | Assert.Equal(6, encoded.Count); 38 | Assert.Equal((101, 0, 1), encoded[0]); 39 | Assert.Equal((1045, 0, 1), encoded[1]); 40 | Assert.Equal((2293, 0, 1), encoded[2]); 41 | Assert.Equal((2017, 0, 1), encoded[3]); 42 | Assert.Equal((102, 0, 1), encoded[4]); 43 | Assert.Equal((0, 0, 0), encoded[5]); 44 | } 45 | 46 | [Fact] 47 | public void Unokenize_sentence() 48 | { 49 | var tokens = new List(){ "she", "##s" }; 50 | 51 | var sentence = _tokenizer.Untokenize(tokens); 52 | Assert.Single(sentence); 53 | Assert.Equal("shes", sentence[0]); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /tests/BertMultilingualTokenizerShould.cs: -------------------------------------------------------------------------------- 1 | using BERTTokenizers; 2 | using System.Collections.Generic; 3 | using Xunit; 4 | 5 | namespace BERTTokenizersTests 6 | { 7 | public class BertMultilingualTokenizerShould 8 | { 9 | private BertMultilingualTokenizer _tokenizer; 10 | 11 | public BertMultilingualTokenizerShould() 12 | { 13 | _tokenizer = new BertMultilingualTokenizer(); 14 | } 15 | 16 | [Fact] 17 | public void Encode_sentence() 18 | { 19 | var sentence = "Je vous aime"; 20 | 21 | var encoded = _tokenizer.Encode(6, sentence); 22 | Assert.Equal(6, encoded.Count); 23 | Assert.Equal((101, 0, 1), encoded[0]); 24 | Assert.Equal((13796, 0, 1), encoded[1]); 25 | Assert.Equal((24931, 0, 1), encoded[2]); 26 | Assert.Equal((62691, 0, 1), encoded[3]); 27 | Assert.Equal((102, 0, 1), encoded[4]); 28 | Assert.Equal((0, 0, 0), encoded[5]); 29 | } 30 | 31 | [Fact] 32 | public void Tokenize_sentence() 33 | { 34 | var sentence = "Je vous aime"; 35 | 36 | var tokens = _tokenizer.Tokenize(sentence); 37 | Assert.Equal(5, tokens.Count); 38 | Assert.Equal(("[CLS]", 101, 0), tokens[0]); 39 | Assert.Equal(("Je", 13796, 0), tokens[1]); 40 | Assert.Equal(("vous", 24931, 0), tokens[2]); 41 | Assert.Equal(("aime", 62691, 0), tokens[3]); 42 | Assert.Equal(("[SEP]", 102, 0), tokens[4]); 43 | } 44 | 45 | [Fact] 46 | public void Unokenize_sentence() 47 | { 48 | var tokens = new List(){ "she", "##s" }; 49 | 50 | var sentence = _tokenizer.Untokenize(tokens); 51 | Assert.Single(sentence); 52 | Assert.Equal("shes", sentence[0]); 53 | } 54 | } 55 | } 56 | --------------------------------------------------------------------------------