├── .gitignore ├── BERTTokenizers.sln ├── CHANGELOG.md ├── LICENSE.txt ├── README.md ├── src ├── Assets │ └── logo.png ├── BERTTokenizers.csproj ├── Base │ ├── CasedTokenizer.cs │ ├── TokenizerBase.cs │ ├── Tokens.cs │ └── UncasedTokenizer.cs ├── BertBaseTokenizer.cs ├── BertCasedCustomVocabulary.cs ├── BertGermanTokenizer.cs ├── BertLargeTokenizer.cs ├── BertMultilingualTokenizer.cs ├── BertUncasedBaseTokenizer.cs ├── BertUncasedCustomVocabulary.cs ├── BertUncasedLargeTokenizer.cs ├── Extensions │ └── StringExtension.cs ├── Helpers │ └── VocabularyReader.cs └── Vocabularies │ ├── base_cased.txt │ ├── base_cased_german.txt │ ├── base_cased_large.txt │ ├── base_cased_multilingual.txt │ ├── base_uncased.txt │ └── base_uncased_large.txt └── tests ├── BERTTokenizers.Tests.csproj ├── BertBaseTokenizerShould.cs ├── BertBaseTokenizerUncasedShould.cs ├── BertGermanTokenizerShould.cs ├── BertLargeTokenizerShould.cs ├── BertLargeTokenizerUncasedShould.cs └── BertMultilingualTokenizerShould.cs /.gitignore: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # This .gitignore file was automatically created by Microsoft(R) Visual Studio. 3 | ################################################################################ 4 | 5 | /src/bin 6 | /src/obj 7 | /tests/obj 8 | /tests/bin 9 | /.vs 10 | -------------------------------------------------------------------------------- /BERTTokenizers.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 16 4 | VisualStudioVersion = 16.0.31410.357 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "BERTTokenizers", "src\BERTTokenizers.csproj", "{23A1F782-E9DF-422F-96DA-10F4D952BD00}" 7 | EndProject 8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "BERTTokenizers.Tests", "tests\BERTTokenizers.Tests.csproj", "{5D2946F0-0C6F-4BA9-9CFC-C9932A8AF9D9}" 9 | EndProject 10 | Global 11 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 12 | Debug|Any CPU = Debug|Any CPU 13 | Release|Any CPU = Release|Any CPU 14 | EndGlobalSection 15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 16 | {23A1F782-E9DF-422F-96DA-10F4D952BD00}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 17 | {23A1F782-E9DF-422F-96DA-10F4D952BD00}.Debug|Any CPU.Build.0 = Debug|Any CPU 18 | {23A1F782-E9DF-422F-96DA-10F4D952BD00}.Release|Any CPU.ActiveCfg = Release|Any CPU 19 | {23A1F782-E9DF-422F-96DA-10F4D952BD00}.Release|Any CPU.Build.0 = Release|Any CPU 20 | {5D2946F0-0C6F-4BA9-9CFC-C9932A8AF9D9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 21 | {5D2946F0-0C6F-4BA9-9CFC-C9932A8AF9D9}.Debug|Any CPU.Build.0 = Debug|Any CPU 22 | {5D2946F0-0C6F-4BA9-9CFC-C9932A8AF9D9}.Release|Any CPU.ActiveCfg = Release|Any CPU 23 | {5D2946F0-0C6F-4BA9-9CFC-C9932A8AF9D9}.Release|Any CPU.Build.0 = Release|Any CPU 24 | EndGlobalSection 25 | GlobalSection(SolutionProperties) = preSolution 26 | HideSolutionNode = FALSE 27 | EndGlobalSection 28 | GlobalSection(ExtensibilityGlobals) = postSolution 29 | SolutionGuid = {79FDFC75-2E13-4DF9-B610-ADFB1AD1E03E} 30 | EndGlobalSection 31 | EndGlobal 32 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## v1.0.0 4 | 5 | ### Added 
or Changed 6 | - Added this changelog 7 | - Initial implementation 8 | - Added Readme.md -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Othneil Drew 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 9 | 10 | [![Donate][donate-shield]][donate-url] 11 | [![Contributors][contributors-shield]][contributors-url] 12 | [![Forks][forks-shield]][forks-url] 13 | [![Stargazers][stars-shield]][stars-url] 14 | [![Issues][issues-shield]][issues-url] 15 | [![MIT License][license-shield]][license-url] 16 | [![LinkedIn][linkedin-shield]][linkedin-url] 17 | 18 | 19 |
20 |
21 | 22 | Logo 23 | 24 | 25 |

BERTTokenizer for C#

26 | 27 |

28 | Source code of a NuGet package for tokenizing sentences and creating inputs for BERT models.
29 | 
30 | 
31 | [Report Bug](https://github.com/NMZivkovic/BertTokenizers/issues)
32 | ·
33 | [Request Feature](https://github.com/NMZivkovic/BertTokenizers/issues)
34 | 

35 |
36 | 37 | 38 | 39 | 40 |
41 | Table of Contents
42 | 
43 | 1. [About The Project](#about-the-project)
44 |    - [Built With](#built-with)
45 | 2. [Getting Started](#getting-started)
46 |    - [Installation](#installation)
47 | 3. [Usage](#usage)
48 | 4. [License](#license)
49 | 5. [Contact](#contact)
50 | 6. [Acknowledgments](#acknowledgments)
51 | 
62 | 
63 | 
64 | ## About The Project
65 | 
66 | While working with BERT models from Hugging Face in combination with ML.NET, I stumbled upon several challenges.
67 | I documented them [here](https://rubikscode.net/2021/10/25/using-huggingface-transformers-with-ml-net/).
68 | However, the biggest challenge by far was that I needed to implement my own tokenizer and pair it with the correct vocabulary.
69 | So I decided to polish that implementation and publish it as a NuGet package and an open-source project.
70 | More info about this project can be found in this [blog post](https://rubikscode.net/2021/11/01/bert-tokenizers-for-ml-net/).
71 | 
72 | This repository contains tokenizers for the following models:
73 | · BERT Base
74 | · BERT Large
75 | · BERT German
76 | · BERT Multilingual
77 | · BERT Base Uncased
78 | · BERT Large Uncased
79 | 
80 | There are also classes that let you load your own custom vocabulary, as sketched below.
81 | 
82 | 
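For illustration, here is a minimal sketch of loading a custom vocabulary. The path `./my_vocab.txt` is a hypothetical placeholder; the file is expected to contain one token per line, like the files under `src/Vocabularies`:

```csharp
using BERTTokenizers;

// Hypothetical path to your own vocabulary file (one token per line).
var tokenizer = new BertCasedCustomVocabulary("./my_vocab.txt");

// A custom-vocabulary tokenizer is used exactly like the bundled ones.
var tokens = tokenizer.Tokenize("I love you");
```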

(back to top)

83 | 84 | ### Built With 85 | 86 | * [.NET 6](https://dotnet.microsoft.com/download/dotnet/6.0) 87 | 88 |

(back to top)

89 | 
90 | 
91 | ## Getting Started
92 | 
93 | The project is available as a NuGet package.
94 | 
95 | ### Installation
96 | 
97 | To add BERT Tokenizers to your project, use the dotnet CLI:
98 | 
99 | ```sh
100 | dotnet add package BERTTokenizers
101 | ```
102 | 
103 | 
104 | Or install it from the Package Manager Console:
105 | 
106 | ```powershell
107 | Install-Package BERTTokenizers
108 | ```
109 | 
110 | 
111 | ## Usage
112 | 
113 | For example, say you want to use a Hugging Face BERT Base model whose input is defined like this:
114 | 
115 | ```csharp
116 | 
117 | public class BertInput
118 | {
119 |     [VectorType(1, 256)]
120 |     [ColumnName("input_ids")]
121 |     public long[] InputIds { get; set; }
122 | 
123 |     [VectorType(1, 256)]
124 |     [ColumnName("attention_mask")]
125 |     public long[] AttentionMask { get; set; }
126 | 
127 |     [VectorType(1, 256)]
128 |     [ColumnName("token_type_ids")]
129 |     public long[] TypeIds { get; set; }
130 | }
131 | 
132 | ```
133 | 
134 | To build this input, you need to encode sentences like this:
135 | 
136 | ```csharp
137 | 
138 | var sentence = "I love you";
139 | 
140 | var tokenizer = new BertBaseTokenizer();
141 | 
142 | var encoded = tokenizer.Encode(256, sentence);
143 | 
144 | var bertInput = new BertInput()
145 | {
146 |     InputIds = encoded.Select(t => t.InputIds).ToArray(),
147 |     AttentionMask = encoded.Select(t => t.AttentionMask).ToArray(),
148 |     TypeIds = encoded.Select(t => t.TokenTypeIds).ToArray()
149 | };
150 | 
151 | ```
152 | 
153 | _For more examples, please refer to this [Blog Post](https://rubikscode.net/2021/11/01/bert-tokenizers-for-ml-net/)_
154 | 
155 | See the [open issues](https://github.com/NMZivkovic/BertTokenizers/issues) for a full list of proposed features (and known issues).
156 | 
157 | 
158 | 
159 | ## Contributing
160 | 
161 | Contributions are what make the open source community such an amazing place to learn, inspire, and create. Any contributions you make are **greatly appreciated**.
162 | 
163 | If you have a suggestion that would make this better, please fork the repo and create a pull request. You can also simply open an issue with the tag "enhancement".
164 | Don't forget to give the project a star! Thanks again!
165 | 
166 | 1. Fork the Project
167 | 2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`)
168 | 3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`)
169 | 4. Push to the Branch (`git push origin feature/AmazingFeature`)
170 | 5. Open a Pull Request
171 | 
172 | 
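If your change touches the tokenizers themselves, please run the test suite before opening the pull request. Assuming you are in the repository root, the standard .NET CLI invocation is:

```sh
dotnet test
```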

(back to top)

173 | 174 | 175 | 176 | 177 | ## License 178 | 179 | Distributed under the MIT License. See `LICENSE.txt` for more information. 180 | 181 |

(back to top)

182 | 183 | 184 | 185 | 186 | ## Contact 187 | 188 | Nikola M. Zivkovic
189 | n.zivkovic@rubikscode.net
190 | [LinkedIn](https://www.linkedin.com/in/nmzivkovic/)
191 | [@NMZivkovic](https://twitter.com/NMZivkovic)
192 | 193 |

(back to top)

194 | 195 | 196 | ## Acknowledgments 197 | 198 | * Gianluca Bertani - Performance Improvements 199 | * [Paul Calot](https://github.com/PaulCalot) - First Token bugfix 200 | 201 |

(back to top)

202 | 203 | 204 | 205 | [contributors-shield]: https://img.shields.io/github/contributors/NMZivkovic/BertTokenizers.svg?style=for-the-badge 206 | [contributors-url]: https://github.com/NMZivkovic/BertTokenizers/graphs/contributors 207 | [donate-shield]: https://img.shields.io/badge/Donate-!-555?style=for-the-badge 208 | [donate-url]: https://www.paypal.com/paypalme/rubikscode 209 | [forks-shield]: https://img.shields.io/github/forks/NMZivkovic/BertTokenizers.svg?style=for-the-badge 210 | [forks-url]: https://github.com/NMZivkovic/BertTokenizers/network/members 211 | [stars-shield]: https://img.shields.io/github/stars/NMZivkovic/BertTokenizers.svg?style=for-the-badge 212 | [stars-url]: https://github.com/NMZivkovic/BertTokenizers/stargazers 213 | [issues-shield]: https://img.shields.io/github/issues/NMZivkovic/BertTokenizers.svg?style=for-the-badge 214 | [issues-url]: https://github.com/NMZivkovic/BertTokenizers/issues 215 | [license-shield]: https://img.shields.io/github/license/NMZivkovic/BertTokenizers.svg?style=for-the-badge 216 | [license-url]: https://github.com/NMZivkovic/BertTokenizers/blob/master/LICENSE.txt 217 | [linkedin-shield]: https://img.shields.io/badge/-LinkedIn-black.svg?style=for-the-badge&logo=linkedin&colorB=555 218 | [linkedin-url]: https://www.linkedin.com/in/nmzivkovic/ 219 | -------------------------------------------------------------------------------- /src/Assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NMZivkovic/BertTokenizers/150e40a178902bd258d4c9986dc1485c25c404b3/src/Assets/logo.png -------------------------------------------------------------------------------- /src/BERTTokenizers.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net6.0;net5.0 5 | Nikola M. Zivkovic 6 | Rubik's Code (rubikscode.net) 7 | Rubik's Code 8 | logo.png 9 | 10 | S 11 | 12 | https://github.com/NMZivkovic/BertTokenizers 13 | https://github.com/NMZivkovic/BertTokenizers 14 | LICENSE.txt 15 | This package contains tokenizers for following models: 16 | · BERT Base 17 | · BERT Large 18 | · BERT German 19 | · BERT Multilingual 20 | · BERT Base Uncased 21 | · BERT Large Uncased 22 | Open-source project for BERT tokenizers that can be used in C#. 
23 | BERT, Tokenizer, charp, dotnet 24 | 1.2.0 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | Always 39 | Always 40 | true 41 | 42 | 43 | Always 44 | Always 45 | true 46 | 47 | 48 | Always 49 | Always 50 | true 51 | 52 | 53 | Always 54 | Always 55 | true 56 | 57 | 58 | Always 59 | Always 60 | true 61 | 62 | 63 | Always 64 | Always 65 | true 66 | 67 | 68 | 69 | 70 | 71 | True 72 | 73 | 74 | 75 | True 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /src/Base/CasedTokenizer.cs: -------------------------------------------------------------------------------- 1 | using BERTTokenizers.Extensions; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | 6 | namespace BERTTokenizers.Base 7 | { 8 | public abstract class CasedTokenizer : TokenizerBase 9 | { 10 | protected CasedTokenizer(string vocabularyFilePath) : base(vocabularyFilePath) { } 11 | 12 | protected override IEnumerable TokenizeSentence(string text) 13 | { 14 | return text.Split(new string[] { " ", " ", "\r\n" }, StringSplitOptions.None) 15 | .SelectMany(o => o.SplitAndKeep(".,;:\\/?!#$%()=+-*\"'–_`<>&^@{}[]|~'".ToArray())); 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/Base/TokenizerBase.cs: -------------------------------------------------------------------------------- 1 | using BERTTokenizers.Helpers; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Text.RegularExpressions; 6 | 7 | namespace BERTTokenizers.Base 8 | { 9 | public abstract class TokenizerBase 10 | { 11 | protected readonly List _vocabulary; 12 | protected readonly Dictionary _vocabularyDict; 13 | 14 | public TokenizerBase(string vocabularyFilePath) 15 | { 16 | _vocabulary = VocabularyReader.ReadFile(vocabularyFilePath); 17 | 18 | _vocabularyDict = new Dictionary(); 19 | for (int i = 0; i < _vocabulary.Count; i++) 20 | _vocabularyDict[_vocabulary[i]] = i; 21 | } 22 | 23 | 24 | public List<(long InputIds, long TokenTypeIds, long AttentionMask)> Encode(int sequenceLength, params string[] texts) 25 | { 26 | var tokens = Tokenize(texts); 27 | 28 | var padding = Enumerable.Repeat(0L, sequenceLength - tokens.Count).ToList(); 29 | 30 | var tokenIndexes = tokens.Select(token => (long)token.VocabularyIndex).Concat(padding).ToArray(); 31 | var segmentIndexes = tokens.Select(token => token.SegmentIndex).Concat(padding).ToArray(); 32 | var inputMask = tokens.Select(o => 1L).Concat(padding).ToArray(); 33 | 34 | var output = tokenIndexes.Zip(segmentIndexes, Tuple.Create) 35 | .Zip(inputMask, (t, z) => Tuple.Create(t.Item1, t.Item2, z)); 36 | 37 | return output.Select(x => (InputIds: x.Item1, TokenTypeIds: x.Item2, AttentionMask:x.Item3)).ToList(); 38 | } 39 | 40 | public string IdToToken(int id) 41 | { 42 | return _vocabulary[id]; 43 | } 44 | 45 | public List Untokenize(List tokens) 46 | { 47 | var currentToken = string.Empty; 48 | var untokens = new List(); 49 | tokens.Reverse(); 50 | 51 | tokens.ForEach(token => 52 | { 53 | if (token.StartsWith("##")) 54 | { 55 | currentToken = token.Replace("##", "") + currentToken; 56 | } 57 | else 58 | { 59 | currentToken = token + currentToken; 60 | untokens.Add(currentToken); 61 | currentToken = string.Empty; 62 | } 63 | }); 64 | 65 | untokens.Reverse(); 66 | 67 | return untokens; 68 | } 69 | 70 | public List<(string Token, int VocabularyIndex, long SegmentIndex)> Tokenize(params string[] texts) 71 | { 72 | 
IEnumerable tokens = new string[] { Tokens.Classification }; 73 | 74 | foreach (var text in texts) 75 | { 76 | tokens = tokens.Concat(TokenizeSentence(text)); 77 | tokens = tokens.Concat(new string[] { Tokens.Separation }); 78 | } 79 | 80 | var tokenAndIndex = tokens 81 | .SelectMany(TokenizeSubwords) 82 | .ToList(); 83 | 84 | var segmentIndexes = SegmentIndex(tokenAndIndex); 85 | 86 | return tokenAndIndex.Zip(segmentIndexes, (tokenindex, segmentindex) 87 | => (tokenindex.Token, tokenindex.VocabularyIndex, segmentindex)).ToList(); 88 | } 89 | 90 | private IEnumerable SegmentIndex(List<(string token, int index)> tokens) 91 | { 92 | var segmentIndex = 0; 93 | var segmentIndexes = new List(); 94 | 95 | foreach (var (token, index) in tokens) 96 | { 97 | segmentIndexes.Add(segmentIndex); 98 | 99 | if (token == Tokens.Separation) 100 | { 101 | segmentIndex++; 102 | } 103 | } 104 | 105 | return segmentIndexes; 106 | } 107 | 108 | private IEnumerable<(string Token, int VocabularyIndex)> TokenizeSubwords(string word) 109 | { 110 | if (_vocabularyDict.ContainsKey(word)) 111 | { 112 | return new (string, int)[] { (word, _vocabularyDict[word]) }; 113 | } 114 | 115 | var tokens = new List<(string, int)>(); 116 | var remaining = word; 117 | 118 | while (!string.IsNullOrEmpty(remaining) && remaining.Length > 2) 119 | { 120 | string prefix = null; 121 | int subwordLength = remaining.Length; 122 | while (subwordLength >= 1) // was initially 2, which prevents using "character encoding" 123 | { 124 | string subword = remaining.Substring(0, subwordLength); 125 | if (!_vocabularyDict.ContainsKey(subword)) 126 | { 127 | subwordLength--; 128 | continue; 129 | } 130 | 131 | prefix = subword; 132 | break; 133 | } 134 | 135 | if (prefix == null) 136 | { 137 | tokens.Add((Tokens.Unknown, _vocabularyDict[Tokens.Unknown])); 138 | 139 | return tokens; 140 | } 141 | 142 | var regex = new Regex(prefix); 143 | remaining = regex.Replace(remaining, "##", 1); 144 | 145 | tokens.Add((prefix, _vocabularyDict[prefix])); 146 | } 147 | 148 | if (!string.IsNullOrWhiteSpace(word) && !tokens.Any()) 149 | { 150 | tokens.Add((Tokens.Unknown, _vocabularyDict[Tokens.Unknown])); 151 | } 152 | 153 | return tokens; 154 | } 155 | 156 | protected abstract IEnumerable TokenizeSentence(string text); 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /src/Base/Tokens.cs: -------------------------------------------------------------------------------- 1 | namespace BERTTokenizers.Base 2 | { 3 | public class Tokens 4 | { 5 | public const string Padding = ""; 6 | public const string Unknown = "[UNK]"; 7 | public const string Classification = "[CLS]"; 8 | public const string Separation = "[SEP]"; 9 | public const string Mask = "[MASK]"; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/Base/UncasedTokenizer.cs: -------------------------------------------------------------------------------- 1 | using BERTTokenizers.Extensions; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | 6 | namespace BERTTokenizers.Base 7 | { 8 | public abstract class UncasedTokenizer : TokenizerBase 9 | { 10 | protected UncasedTokenizer(string vocabularyFilePath) : base(vocabularyFilePath) 11 | { 12 | } 13 | 14 | protected override IEnumerable TokenizeSentence(string text) 15 | { 16 | return text.Split(new string[] { " ", " ", "\r\n" }, StringSplitOptions.None) 17 | .SelectMany(o => 
o.SplitAndKeep(".,;:\\/?!#$%()=+-*\"'–_`<>&^@{}[]|~'".ToArray()))
18 |                 .Select(o => o.ToLower());
19 |         }
20 |     }
21 | }
22 | 
--------------------------------------------------------------------------------
/src/BertBaseTokenizer.cs:
--------------------------------------------------------------------------------
 1 | using BERTTokenizers.Base;
 2 | 
 3 | namespace BERTTokenizers
 4 | {
 5 |     public class BertBaseTokenizer : CasedTokenizer
 6 |     {
 7 |         public BertBaseTokenizer() : base("./Vocabularies/base_cased.txt")
 8 |         {
 9 |         }
10 |     }
11 | }
12 | 
--------------------------------------------------------------------------------
/src/BertCasedCustomVocabulary.cs:
--------------------------------------------------------------------------------
 1 | using BERTTokenizers.Base;
 2 | 
 3 | namespace BERTTokenizers
 4 | {
 5 |     public class BertCasedCustomVocabulary : CasedTokenizer
 6 |     {
 7 |         public BertCasedCustomVocabulary(string vocabularyFilePath) : base(vocabularyFilePath) { }
 8 | 
 9 |     }
10 | 
11 | }
12 | 
--------------------------------------------------------------------------------
/src/BertGermanTokenizer.cs:
--------------------------------------------------------------------------------
 1 | using BERTTokenizers.Base;
 2 | 
 3 | namespace BERTTokenizers
 4 | {
 5 |     public class BertGermanTokenizer : CasedTokenizer
 6 |     {
 7 |         public BertGermanTokenizer() : base("./Vocabularies/base_cased_german.txt")
 8 |         {
 9 |         }
10 |     }
11 | }
12 | 
--------------------------------------------------------------------------------
/src/BertLargeTokenizer.cs:
--------------------------------------------------------------------------------
 1 | using BERTTokenizers.Base;
 2 | 
 3 | namespace BERTTokenizers
 4 | {
 5 |     public class BertLargeTokenizer : CasedTokenizer
 6 |     {
 7 |         public BertLargeTokenizer() : base("./Vocabularies/base_cased_large.txt")
 8 |         {
 9 |         }
10 |     }
11 | }
12 | 
--------------------------------------------------------------------------------
/src/BertMultilingualTokenizer.cs:
--------------------------------------------------------------------------------
 1 | using BERTTokenizers.Base;
 2 | 
 3 | namespace BERTTokenizers
 4 | {
 5 |     public class BertMultilingualTokenizer : CasedTokenizer
 6 |     {
 7 |         public BertMultilingualTokenizer() : base("./Vocabularies/base_cased_multilingual.txt")
 8 |         {
 9 |         }
10 |     }
11 | }
12 | 
--------------------------------------------------------------------------------
/src/BertUncasedBaseTokenizer.cs:
--------------------------------------------------------------------------------
 1 | using BERTTokenizers.Base;
 2 | 
 3 | namespace BERTTokenizers
 4 | {
 5 |     public class BertUncasedBaseTokenizer : UncasedTokenizer
 6 |     {
 7 |         public BertUncasedBaseTokenizer() : base("./Vocabularies/base_uncased.txt")
 8 |         {
 9 |         }
10 |     }
11 | }
12 | 
--------------------------------------------------------------------------------
/src/BertUncasedCustomVocabulary.cs:
--------------------------------------------------------------------------------
 1 | using BERTTokenizers.Base;
 2 | 
 3 | namespace BERTTokenizers
 4 | {
 5 |     public class BertUncasedCustomVocabulary : UncasedTokenizer
 6 |     {
 7 |         public BertUncasedCustomVocabulary(string vocabularyFilePath) : base(vocabularyFilePath) { }
 8 | 
 9 |     }
10 | }
11 | 
--------------------------------------------------------------------------------
/src/BertUncasedLargeTokenizer.cs:
--------------------------------------------------------------------------------
 1 | using BERTTokenizers.Base;
 2 | 
 3 | namespace BERTTokenizers
 4 | {
 5 |     public class BertUncasedLargeTokenizer : UncasedTokenizer
 6 |     {
 7 | 
public BertUncasedLargeTokenizer() : base("./Vocabularies/base_uncased_large.txt") 8 | { 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/Extensions/StringExtension.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | using System.Collections.Generic; 3 | using System.Linq; 4 | using System.Text; 5 | using System.Threading.Tasks; 6 | 7 | namespace BERTTokenizers.Extensions 8 | { 9 | static class StringExtension 10 | { 11 | public static IEnumerable SplitAndKeep( 12 | this string inputString, params char[] delimiters) 13 | { 14 | int start = 0, index; 15 | 16 | while ((index = inputString.IndexOfAny(delimiters, start)) != -1) 17 | { 18 | if (index - start > 0) 19 | yield return inputString.Substring(start, index - start); 20 | 21 | yield return inputString.Substring(index, 1); 22 | 23 | start = index + 1; 24 | } 25 | 26 | if (start < inputString.Length) 27 | { 28 | yield return inputString.Substring(start); 29 | } 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/Helpers/VocabularyReader.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Generic; 2 | using System.IO; 3 | 4 | namespace BERTTokenizers.Helpers 5 | { 6 | public class VocabularyReader 7 | { 8 | public static List ReadFile(string filename) 9 | { 10 | var result = new List(); 11 | 12 | using (var reader = new StreamReader(filename)) 13 | { 14 | string line; 15 | 16 | while ((line = reader.ReadLine()) != null) 17 | { 18 | if (!string.IsNullOrWhiteSpace(line)) 19 | { 20 | result.Add(line); 21 | } 22 | } 23 | } 24 | 25 | return result; 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /tests/BERTTokenizers.Tests.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net6.0 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | all 13 | runtime; build; native; contentfiles; analyzers; buildtransitive 14 | 15 | 16 | all 17 | runtime; build; native; contentfiles; analyzers; buildtransitive 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /tests/BertBaseTokenizerShould.cs: -------------------------------------------------------------------------------- 1 | using BERTTokenizers; 2 | using System.Collections.Generic; 3 | using Xunit; 4 | 5 | namespace BERTTokenizersTests 6 | { 7 | public class BertBaseTokenizerShould 8 | { 9 | private BertBaseTokenizer _tokenizer; 10 | 11 | public BertBaseTokenizerShould() 12 | { 13 | _tokenizer = new BertBaseTokenizer(); 14 | } 15 | 16 | [Fact] 17 | public void Tokenize_sentence() 18 | { 19 | var sentence = "I love you"; 20 | 21 | var tokens = _tokenizer.Tokenize(sentence); 22 | Assert.Equal(5, tokens.Count); 23 | Assert.Equal(("[CLS]", 101, 0), tokens[0]); 24 | Assert.Equal(("I", 146, 0), tokens[1]); 25 | Assert.Equal(("love", 1567, 0), tokens[2]); 26 | Assert.Equal(("you", 1128, 0), tokens[3]); 27 | Assert.Equal(("[SEP]", 102, 0), tokens[4]); 28 | 29 | } 30 | 31 | [Fact] 32 | public void Encode_sentence() 33 | { 34 | var sentence = "I love you"; 35 | 36 | var encoded = _tokenizer.Encode(6, sentence); 37 | Assert.Equal(6, encoded.Count); 38 | Assert.Equal((101, 0, 1), encoded[0]); 39 | Assert.Equal((146, 0, 1), encoded[1]); 40 | Assert.Equal((1567, 0, 1), encoded[2]); 41 | Assert.Equal((1128, 0, 1), 
encoded[3]); 42 | Assert.Equal((102, 0, 1), encoded[4]); 43 | Assert.Equal((0, 0, 0), encoded[5]); 44 | } 45 | 46 | [Fact] 47 | public void Unokenize_sentence() 48 | { 49 | var tokens = new List(){ "she", "##s" }; 50 | 51 | var sentence = _tokenizer.Untokenize(tokens); 52 | Assert.Single(sentence); 53 | Assert.Equal("shes", sentence[0]); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /tests/BertBaseTokenizerUncasedShould.cs: -------------------------------------------------------------------------------- 1 | using BERTTokenizers; 2 | using System.Collections.Generic; 3 | using Xunit; 4 | 5 | namespace BERTTokenizersTests 6 | { 7 | public class BertBaseTokenizerUncasedShould 8 | { 9 | private BertUncasedBaseTokenizer _tokenizer; 10 | 11 | public BertBaseTokenizerUncasedShould() 12 | { 13 | _tokenizer = new BertUncasedBaseTokenizer(); 14 | } 15 | 16 | [Fact] 17 | public void Tokenize_sentence() 18 | { 19 | var sentence = "I love you"; 20 | 21 | var tokens = _tokenizer.Tokenize(sentence); 22 | Assert.Equal(5, tokens.Count); 23 | Assert.Equal(("[CLS]", 101, 0), tokens[0]); 24 | Assert.Equal(("i", 1045, 0), tokens[1]); 25 | Assert.Equal(("love", 2293, 0), tokens[2]); 26 | Assert.Equal(("you", 2017, 0), tokens[3]); 27 | Assert.Equal(("[SEP]", 102, 0), tokens[4]); 28 | 29 | } 30 | 31 | [Fact] 32 | public void Encode_admin_example() 33 | { 34 | var sentence = "Joe is an admin"; 35 | 36 | var encoded = _tokenizer.Encode(8, sentence); 37 | Assert.Equal(8, encoded.Count); 38 | Assert.Equal((101, 0, 1), encoded[0]); 39 | Assert.Equal((3533, 0, 1), encoded[1]); 40 | Assert.Equal((2003, 0, 1), encoded[2]); 41 | Assert.Equal((2019, 0, 1), encoded[3]); 42 | Assert.Equal((4748, 0, 1), encoded[4]); 43 | Assert.Equal((10020, 0, 1), encoded[5]); 44 | Assert.Equal((102, 0, 1), encoded[6]); 45 | Assert.Equal((0, 0, 0), encoded[7]); 46 | } 47 | 48 | [Fact] 49 | public void Encode_sentence() 50 | { 51 | var sentence = "I love you"; 52 | 53 | var encoded = _tokenizer.Encode(6, sentence); 54 | Assert.Equal(6, encoded.Count); 55 | Assert.Equal((101, 0, 1), encoded[0]); 56 | Assert.Equal((1045, 0, 1), encoded[1]); 57 | Assert.Equal((2293, 0, 1), encoded[2]); 58 | Assert.Equal((2017, 0, 1), encoded[3]); 59 | Assert.Equal((102, 0, 1), encoded[4]); 60 | Assert.Equal((0, 0, 0), encoded[5]); 61 | } 62 | 63 | [Fact] 64 | public void Unokenize_sentence() 65 | { 66 | var tokens = new List(){ "she", "##s" }; 67 | 68 | var sentence = _tokenizer.Untokenize(tokens); 69 | Assert.Single(sentence); 70 | Assert.Equal("shes", sentence[0]); 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /tests/BertGermanTokenizerShould.cs: -------------------------------------------------------------------------------- 1 | using BERTTokenizers; 2 | using System.Collections.Generic; 3 | using Xunit; 4 | 5 | namespace BERTTokenizersTests 6 | { 7 | public class BertGermanTokenizerShould 8 | { 9 | private BertGermanTokenizer _tokenizer; 10 | 11 | public BertGermanTokenizerShould() 12 | { 13 | _tokenizer = new BertGermanTokenizer(); 14 | } 15 | 16 | [Fact] 17 | public void Encode_sentence() 18 | { 19 | var sentence = "Ich liebe dich"; 20 | 21 | var encoded = _tokenizer.Encode(6, sentence); 22 | Assert.Equal(6, encoded.Count); 23 | Assert.Equal((102, 0, 1), encoded[0]); 24 | Assert.Equal((395, 0, 1), encoded[1]); 25 | Assert.Equal((6230, 0, 1), encoded[2]); 26 | Assert.Equal((1199, 0, 1), encoded[3]); 27 | Assert.Equal((103, 0, 1), 
encoded[4]); 28 | Assert.Equal((0, 0, 0), encoded[5]); 29 | } 30 | 31 | [Fact] 32 | public void Tokenize_sentence() 33 | { 34 | var sentence = "Ich liebe dich"; 35 | 36 | var tokens = _tokenizer.Tokenize(sentence); 37 | Assert.Equal(5, tokens.Count); 38 | Assert.Equal(("[CLS]", 102, 0), tokens[0]); 39 | Assert.Equal(("Ich", 395, 0), tokens[1]); 40 | Assert.Equal(("liebe", 6230, 0), tokens[2]); 41 | Assert.Equal(("dich", 1199, 0), tokens[3]); 42 | Assert.Equal(("[SEP]", 103, 0), tokens[4]); 43 | 44 | } 45 | 46 | [Fact] 47 | public void Unokenize_sentence() 48 | { 49 | var tokens = new List(){ "she", "##s" }; 50 | 51 | var sentence = _tokenizer.Untokenize(tokens); 52 | Assert.Single(sentence); 53 | Assert.Equal("shes", sentence[0]); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /tests/BertLargeTokenizerShould.cs: -------------------------------------------------------------------------------- 1 | using BERTTokenizers; 2 | using System.Collections.Generic; 3 | using Xunit; 4 | 5 | namespace BERTTokenizersTests 6 | { 7 | public class BertLargeTokenizerShould 8 | { 9 | private BertLargeTokenizer _tokenizer; 10 | 11 | public BertLargeTokenizerShould() 12 | { 13 | _tokenizer = new BertLargeTokenizer(); 14 | } 15 | 16 | [Fact] 17 | public void Encode_sentence() 18 | { 19 | var sentence = "I love you"; 20 | 21 | var encoded = _tokenizer.Encode(6, sentence); 22 | Assert.Equal(6, encoded.Count); 23 | Assert.Equal((101, 0, 1), encoded[0]); 24 | Assert.Equal((146, 0, 1), encoded[1]); 25 | Assert.Equal((1567, 0, 1), encoded[2]); 26 | Assert.Equal((1128, 0, 1), encoded[3]); 27 | Assert.Equal((102, 0, 1), encoded[4]); 28 | Assert.Equal((0, 0, 0), encoded[5]); 29 | } 30 | 31 | [Fact] 32 | public void Tokenize_sentence() 33 | { 34 | var sentence = "I love you"; 35 | 36 | var tokens = _tokenizer.Tokenize(sentence); 37 | Assert.Equal(5, tokens.Count); 38 | Assert.Equal(("[CLS]", 101, 0), tokens[0]); 39 | Assert.Equal(("I", 146, 0), tokens[1]); 40 | Assert.Equal(("love", 1567, 0), tokens[2]); 41 | Assert.Equal(("you", 1128, 0), tokens[3]); 42 | Assert.Equal(("[SEP]", 102, 0), tokens[4]); 43 | 44 | } 45 | 46 | [Fact] 47 | public void Unokenize_sentence() 48 | { 49 | var tokens = new List(){ "she", "##s" }; 50 | 51 | var sentence = _tokenizer.Untokenize(tokens); 52 | Assert.Single(sentence); 53 | Assert.Equal("shes", sentence[0]); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /tests/BertLargeTokenizerUncasedShould.cs: -------------------------------------------------------------------------------- 1 | using BERTTokenizers; 2 | using System.Collections.Generic; 3 | using Xunit; 4 | 5 | namespace BERTTokenizersTests 6 | { 7 | public class BertLargeTokenizerUncasedShould 8 | { 9 | private BertUncasedLargeTokenizer _tokenizer; 10 | 11 | public BertLargeTokenizerUncasedShould() 12 | { 13 | _tokenizer = new BertUncasedLargeTokenizer(); 14 | } 15 | 16 | [Fact] 17 | public void Tokenize_sentence() 18 | { 19 | var sentence = "I love you"; 20 | 21 | var tokens = _tokenizer.Tokenize(sentence); 22 | Assert.Equal(5, tokens.Count); 23 | Assert.Equal(("[CLS]", 101, 0), tokens[0]); 24 | Assert.Equal(("i", 1045, 0), tokens[1]); 25 | Assert.Equal(("love", 2293, 0), tokens[2]); 26 | Assert.Equal(("you", 2017, 0), tokens[3]); 27 | Assert.Equal(("[SEP]", 102, 0), tokens[4]); 28 | 29 | } 30 | 31 | [Fact] 32 | public void Encode_sentence() 33 | { 34 | var sentence = "I love you"; 35 | 36 | var encoded = 
_tokenizer.Encode(6, sentence); 37 | Assert.Equal(6, encoded.Count); 38 | Assert.Equal((101, 0, 1), encoded[0]); 39 | Assert.Equal((1045, 0, 1), encoded[1]); 40 | Assert.Equal((2293, 0, 1), encoded[2]); 41 | Assert.Equal((2017, 0, 1), encoded[3]); 42 | Assert.Equal((102, 0, 1), encoded[4]); 43 | Assert.Equal((0, 0, 0), encoded[5]); 44 | } 45 | 46 | [Fact] 47 | public void Unokenize_sentence() 48 | { 49 | var tokens = new List(){ "she", "##s" }; 50 | 51 | var sentence = _tokenizer.Untokenize(tokens); 52 | Assert.Single(sentence); 53 | Assert.Equal("shes", sentence[0]); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /tests/BertMultilingualTokenizerShould.cs: -------------------------------------------------------------------------------- 1 | using BERTTokenizers; 2 | using System.Collections.Generic; 3 | using Xunit; 4 | 5 | namespace BERTTokenizersTests 6 | { 7 | public class BertMultilingualTokenizerShould 8 | { 9 | private BertMultilingualTokenizer _tokenizer; 10 | 11 | public BertMultilingualTokenizerShould() 12 | { 13 | _tokenizer = new BertMultilingualTokenizer(); 14 | } 15 | 16 | [Fact] 17 | public void Encode_sentence() 18 | { 19 | var sentence = "Je vous aime"; 20 | 21 | var encoded = _tokenizer.Encode(6, sentence); 22 | Assert.Equal(6, encoded.Count); 23 | Assert.Equal((101, 0, 1), encoded[0]); 24 | Assert.Equal((13796, 0, 1), encoded[1]); 25 | Assert.Equal((24931, 0, 1), encoded[2]); 26 | Assert.Equal((62691, 0, 1), encoded[3]); 27 | Assert.Equal((102, 0, 1), encoded[4]); 28 | Assert.Equal((0, 0, 0), encoded[5]); 29 | } 30 | 31 | [Fact] 32 | public void Tokenize_sentence() 33 | { 34 | var sentence = "Je vous aime"; 35 | 36 | var tokens = _tokenizer.Tokenize(sentence); 37 | Assert.Equal(5, tokens.Count); 38 | Assert.Equal(("[CLS]", 101, 0), tokens[0]); 39 | Assert.Equal(("Je", 13796, 0), tokens[1]); 40 | Assert.Equal(("vous", 24931, 0), tokens[2]); 41 | Assert.Equal(("aime", 62691, 0), tokens[3]); 42 | Assert.Equal(("[SEP]", 102, 0), tokens[4]); 43 | } 44 | 45 | [Fact] 46 | public void Unokenize_sentence() 47 | { 48 | var tokens = new List(){ "she", "##s" }; 49 | 50 | var sentence = _tokenizer.Untokenize(tokens); 51 | Assert.Single(sentence); 52 | Assert.Equal("shes", sentence[0]); 53 | } 54 | } 55 | } 56 | --------------------------------------------------------------------------------