├── .gitignore
├── BERTTokenizers.sln
├── CHANGELOG.md
├── LICENSE.txt
├── README.md
├── src
│   ├── Assets
│   │   └── logo.png
│   ├── BERTTokenizers.csproj
│   ├── Base
│   │   ├── CasedTokenizer.cs
│   │   ├── TokenizerBase.cs
│   │   ├── Tokens.cs
│   │   └── UncasedTokenizer.cs
│   ├── BertBaseTokenizer.cs
│   ├── BertCasedCustomVocabulary.cs
│   ├── BertGermanTokenizer.cs
│   ├── BertLargeTokenizer.cs
│   ├── BertMultilingualTokenizer.cs
│   ├── BertUncasedBaseTokenizer.cs
│   ├── BertUncasedCustomVocabulary.cs
│   ├── BertUncasedLargeTokenizer.cs
│   ├── Extensions
│   │   └── StringExtension.cs
│   ├── Helpers
│   │   └── VocabularyReader.cs
│   └── Vocabularies
│       ├── base_cased.txt
│       ├── base_cased_german.txt
│       ├── base_cased_large.txt
│       ├── base_cased_multilingual.txt
│       ├── base_uncased.txt
│       └── base_uncased_large.txt
└── tests
    ├── BERTTokenizers.Tests.csproj
    ├── BertBaseTokenizerShould.cs
    ├── BertBaseTokenizerUncasedShould.cs
    ├── BertGermanTokenizerShould.cs
    ├── BertLargeTokenizerShould.cs
    ├── BertLargeTokenizerUncasedShould.cs
    └── BertMultilingualTokenizerShould.cs
/.gitignore:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # This .gitignore file was automatically created by Microsoft(R) Visual Studio.
3 | ################################################################################
4 |
5 | /src/bin
6 | /src/obj
7 | /tests/obj
8 | /tests/bin
9 | /.vs
10 |
--------------------------------------------------------------------------------
/BERTTokenizers.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio Version 16
4 | VisualStudioVersion = 16.0.31410.357
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "BERTTokenizers", "src\BERTTokenizers.csproj", "{23A1F782-E9DF-422F-96DA-10F4D952BD00}"
7 | EndProject
8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "BERTTokenizers.Tests", "tests\BERTTokenizers.Tests.csproj", "{5D2946F0-0C6F-4BA9-9CFC-C9932A8AF9D9}"
9 | EndProject
10 | Global
11 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
12 | Debug|Any CPU = Debug|Any CPU
13 | Release|Any CPU = Release|Any CPU
14 | EndGlobalSection
15 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
16 | {23A1F782-E9DF-422F-96DA-10F4D952BD00}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
17 | {23A1F782-E9DF-422F-96DA-10F4D952BD00}.Debug|Any CPU.Build.0 = Debug|Any CPU
18 | {23A1F782-E9DF-422F-96DA-10F4D952BD00}.Release|Any CPU.ActiveCfg = Release|Any CPU
19 | {23A1F782-E9DF-422F-96DA-10F4D952BD00}.Release|Any CPU.Build.0 = Release|Any CPU
20 | {5D2946F0-0C6F-4BA9-9CFC-C9932A8AF9D9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
21 | {5D2946F0-0C6F-4BA9-9CFC-C9932A8AF9D9}.Debug|Any CPU.Build.0 = Debug|Any CPU
22 | {5D2946F0-0C6F-4BA9-9CFC-C9932A8AF9D9}.Release|Any CPU.ActiveCfg = Release|Any CPU
23 | {5D2946F0-0C6F-4BA9-9CFC-C9932A8AF9D9}.Release|Any CPU.Build.0 = Release|Any CPU
24 | EndGlobalSection
25 | GlobalSection(SolutionProperties) = preSolution
26 | HideSolutionNode = FALSE
27 | EndGlobalSection
28 | GlobalSection(ExtensibilityGlobals) = postSolution
29 | SolutionGuid = {79FDFC75-2E13-4DF9-B610-ADFB1AD1E03E}
30 | EndGlobalSection
31 | EndGlobal
32 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 |
3 | ## v1.0.0
4 |
5 | ### Added or Changed
6 | - Added this changelog
7 | - Initial implementation
8 | - Added README.md
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Othneil Drew
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
9 |
10 | [![Donate][donate-shield]][donate-url]
11 | [![Contributors][contributors-shield]][contributors-url]
12 | [![Forks][forks-shield]][forks-url]
13 | [![Stargazers][stars-shield]][stars-url]
14 | [![Issues][issues-shield]][issues-url]
15 | [![MIT License][license-shield]][license-url]
16 | [![LinkedIn][linkedin-shield]][linkedin-url]
17 |
18 | # BERTTokenizer for C#
19 |
20 | Source code of the NuGet package for tokenizing sentences and creating inputs for BERT models.
21 |
22 | [Report Bug](https://github.com/NMZivkovic/BertTokenizers/issues) · [Request Feature](https://github.com/NMZivkovic/BertTokenizers/issues)
23 |
24 | ## Table of Contents
25 |
26 | - [About The Project](#about-the-project)
27 |   - [Built With](#built-with)
28 | - [Getting Started](#getting-started)
29 |   - [Installation](#installation)
30 | - [Usage](#usage)
31 | - [Contributing](#contributing)
32 | - [License](#license)
33 | - [Contact](#contact)
34 | - [Acknowledgments](#acknowledgments)
35 |
64 | ## About The Project
65 |
66 | While working with BERT models from Hugging Face in combination with ML.NET, I stumbled upon several challenges.
67 | I documented them [here](https://rubikscode.net/2021/10/25/using-huggingface-transformers-with-ml-net/).
68 | However, the biggest challenge by far was that I needed to implement my own tokenizer and pair it with the correct vocabulary.
69 | So, I decided to extend that implementation and publish it as a NuGet package and an open-source project.
70 | More info about this project can be found in this [blog post](https://rubikscode.net/2021/11/01/bert-tokenizers-for-ml-net/).
71 |
72 | This repository contains tokenizers for the following models:
73 | * BERT Base
74 | * BERT Large
75 | * BERT German
76 | * BERT Multilingual
77 | * BERT Base Uncased
78 | * BERT Large Uncased
79 |
80 | There are also classes that let you load your own custom vocabulary.
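
A minimal sketch of loading a custom vocabulary (the file path below is hypothetical; the file is expected to be a WordPiece vocabulary with one token per line):

```csharp
using BERTTokenizers;

// Hypothetical path to your own WordPiece vocabulary file.
var tokenizer = new BertCasedCustomVocabulary("./Resources/my_vocab.txt");

var tokens = tokenizer.Tokenize("Hello world");
```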
81 |
82 | (back to top)
83 |
84 | ### Built With
85 |
86 | * [.NET 6](https://dotnet.microsoft.com/download/dotnet/6.0)
87 |
88 | (back to top)
89 |
90 |
91 | ## Getting Started
92 |
93 | The project is available as a NuGet package.
94 |
95 | ### Installation
96 |
97 | To add BERT Tokenizers to your project, use the dotnet CLI:
98 |
99 | ```sh
100 | dotnet add package BERTTokenizers
101 | ```
102 |
103 |
104 | Or install it via the NuGet Package Manager Console:
105 |
106 | ```powershell
107 | Install-Package BERTTokenizers
108 | ```
109 |
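Once installed, all tokenizers live in the `BERTTokenizers` namespace:

```csharp
using BERTTokenizers;

var tokenizer = new BertBaseTokenizer();
```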
110 |
111 | ## Usage
112 |
113 | Say, for example, that you want to use a Hugging Face BERT Base model whose input is defined like this:
114 |
115 | ```csharp
116 |
117 | public class BertInput
118 | {
119 | [VectorType(1, 256)]
120 | [ColumnName("input_ids")]
121 | public long[] InputIds { get; set; }
122 |
123 | [VectorType(1, 256)]
124 | [ColumnName("attention_mask")]
125 | public long[] AttentionMask { get; set; }
126 |
127 | [VectorType(1, 256)]
128 | [ColumnName("token_type_ids")]
129 | public long[] TypeIds { get; set; }
130 | }
131 |
132 | ```
133 |
134 | For this, you need to encode your sentences as follows:
135 |
136 | ```csharp
137 |
138 | var sentence = "I love you";
139 |
140 | var tokenizer = new BertBaseTokenizer();
141 |
142 | var encoded = tokenizer.Encode(256, sentence);
143 |
144 | var bertInput = new BertInput()
145 | {
146 | InputIds = encoded.Select(t => t.InputIds).ToArray(),
147 | AttentionMask = encoded.Select(t => t.AttentionMask).ToArray(),
148 | TypeIds = encoded.Select(t => t.TokenTypeIds).ToArray()
149 | };
150 |
151 | ```
152 |
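Besides `Encode`, the tokenizer also exposes `Tokenize` and `Untokenize`, which are handy for inspecting how a sentence is split into WordPiece sub-tokens. A short sketch (the exact indices depend on the vocabulary):

```csharp
using System;
using System.Linq;
using BERTTokenizers;

var tokenizer = new BertBaseTokenizer();

// Tokenize returns (Token, VocabularyIndex, SegmentIndex) triples,
// wrapped in [CLS] and [SEP].
var tokens = tokenizer.Tokenize("I love you");

foreach (var (token, index, segment) in tokens)
{
    Console.WriteLine($"{token} -> {index} (segment {segment})");
}

// Untokenize merges "##" continuation pieces back into whole words.
var words = tokenizer.Untokenize(tokens.Select(t => t.Token).ToList());
```
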
153 | _For more examples, please refer to this [Blog Post](https://rubikscode.net/2021/11/01/bert-tokenizers-for-ml-net/)_
154 |
155 | See the [open issues](https://github.com/NMZivkovic/BertTokenizers/issues) for a full list of proposed features (and known issues).
156 |
157 |
158 |
159 | ## Contributing
160 |
161 | Contributions are what make the open source community such an amazing place to learn, inspire, and create. Any contributions you make are **greatly appreciated**.
162 |
163 | If you have a suggestion that would make this better, please fork the repo and create a pull request. You can also simply open an issue with the tag "enhancement".
164 | Don't forget to give the project a star! Thanks again!
165 |
166 | 1. Fork the Project
167 | 2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`)
168 | 3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`)
169 | 4. Push to the Branch (`git push origin feature/AmazingFeature`)
170 | 5. Open a Pull Request
171 |
172 | (back to top)
173 |
174 |
175 |
176 |
177 | ## License
178 |
179 | Distributed under the MIT License. See `LICENSE.txt` for more information.
180 |
181 | (back to top)
182 |
183 |
184 |
185 |
186 | ## Contact
187 |
188 | Nikola M. Zivkovic
189 | n.zivkovic@rubikscode.net
190 | [LinkedIn](https://www.linkedin.com/in/nmzivkovic/)
191 | [@NMZivkovic](https://twitter.com/NMZivkovic)
192 |
193 | (back to top)
194 |
195 |
196 | ## Acknowledgments
197 |
198 | * Gianluca Bertani - Performance Improvements
199 | * [Paul Calot](https://github.com/PaulCalot) - First Token bugfix
200 |
201 | (back to top)
202 |
203 |
204 |
205 | [contributors-shield]: https://img.shields.io/github/contributors/NMZivkovic/BertTokenizers.svg?style=for-the-badge
206 | [contributors-url]: https://github.com/NMZivkovic/BertTokenizers/graphs/contributors
207 | [donate-shield]: https://img.shields.io/badge/Donate-!-555?style=for-the-badge
208 | [donate-url]: https://www.paypal.com/paypalme/rubikscode
209 | [forks-shield]: https://img.shields.io/github/forks/NMZivkovic/BertTokenizers.svg?style=for-the-badge
210 | [forks-url]: https://github.com/NMZivkovic/BertTokenizers/network/members
211 | [stars-shield]: https://img.shields.io/github/stars/NMZivkovic/BertTokenizers.svg?style=for-the-badge
212 | [stars-url]: https://github.com/NMZivkovic/BertTokenizers/stargazers
213 | [issues-shield]: https://img.shields.io/github/issues/NMZivkovic/BertTokenizers.svg?style=for-the-badge
214 | [issues-url]: https://github.com/NMZivkovic/BertTokenizers/issues
215 | [license-shield]: https://img.shields.io/github/license/NMZivkovic/BertTokenizers.svg?style=for-the-badge
216 | [license-url]: https://github.com/NMZivkovic/BertTokenizers/blob/master/LICENSE.txt
217 | [linkedin-shield]: https://img.shields.io/badge/-LinkedIn-black.svg?style=for-the-badge&logo=linkedin&colorB=555
218 | [linkedin-url]: https://www.linkedin.com/in/nmzivkovic/
219 |
--------------------------------------------------------------------------------
/src/Assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NMZivkovic/BertTokenizers/150e40a178902bd258d4c9986dc1485c25c404b3/src/Assets/logo.png
--------------------------------------------------------------------------------
/src/BERTTokenizers.csproj:
--------------------------------------------------------------------------------
1 | <Project Sdk="Microsoft.NET.Sdk">
2 |
3 |   <PropertyGroup>
4 |     <TargetFrameworks>net6.0;net5.0</TargetFrameworks>
5 |     <Authors>Nikola M. Zivkovic</Authors>
6 |     <Company>Rubik's Code (rubikscode.net)</Company>
7 |     <Product>Rubik's Code</Product>
8 |     <PackageIcon>logo.png</PackageIcon>
9 |     <PackageProjectUrl>https://github.com/NMZivkovic/BertTokenizers</PackageProjectUrl>
10 |     <RepositoryUrl>https://github.com/NMZivkovic/BertTokenizers</RepositoryUrl>
11 |     <PackageLicenseFile>LICENSE.txt</PackageLicenseFile>
12 |     <Description>This package contains tokenizers for the following models:
13 | · BERT Base
14 | · BERT Large
15 | · BERT German
16 | · BERT Multilingual
17 | · BERT Base Uncased
18 | · BERT Large Uncased
19 | Open-source project for BERT tokenizers that can be used in C#.</Description>
20 |     <PackageTags>BERT, Tokenizer, csharp, dotnet</PackageTags>
21 |     <Version>1.2.0</Version>
22 |   </PropertyGroup>
23 |
24 |   <!-- Ship the vocabulary files next to the assembly and inside the package. -->
25 |   <ItemGroup>
26 |     <Content Include="Vocabularies\base_cased.txt">
27 |       <CopyToOutputDirectory>Always</CopyToOutputDirectory>
28 |       <CopyToPublishDirectory>Always</CopyToPublishDirectory>
29 |       <PackageCopyToOutput>true</PackageCopyToOutput>
30 |     </Content>
31 |     <Content Include="Vocabularies\base_cased_german.txt">
32 |       <CopyToOutputDirectory>Always</CopyToOutputDirectory>
33 |       <CopyToPublishDirectory>Always</CopyToPublishDirectory>
34 |       <PackageCopyToOutput>true</PackageCopyToOutput>
35 |     </Content>
36 |     <Content Include="Vocabularies\base_cased_large.txt">
37 |       <CopyToOutputDirectory>Always</CopyToOutputDirectory>
38 |       <CopyToPublishDirectory>Always</CopyToPublishDirectory>
39 |       <PackageCopyToOutput>true</PackageCopyToOutput>
40 |     </Content>
41 |     <Content Include="Vocabularies\base_cased_multilingual.txt">
42 |       <CopyToOutputDirectory>Always</CopyToOutputDirectory>
43 |       <CopyToPublishDirectory>Always</CopyToPublishDirectory>
44 |       <PackageCopyToOutput>true</PackageCopyToOutput>
45 |     </Content>
46 |     <Content Include="Vocabularies\base_uncased.txt">
47 |       <CopyToOutputDirectory>Always</CopyToOutputDirectory>
48 |       <CopyToPublishDirectory>Always</CopyToPublishDirectory>
49 |       <PackageCopyToOutput>true</PackageCopyToOutput>
50 |     </Content>
51 |     <Content Include="Vocabularies\base_uncased_large.txt">
52 |       <CopyToOutputDirectory>Always</CopyToOutputDirectory>
53 |       <CopyToPublishDirectory>Always</CopyToPublishDirectory>
54 |       <PackageCopyToOutput>true</PackageCopyToOutput>
55 |     </Content>
56 |   </ItemGroup>
57 |
58 |   <!-- Package assets: icon and license. -->
59 |   <ItemGroup>
60 |     <None Include="Assets\logo.png" Pack="True" PackagePath="" />
61 |     <None Include="..\LICENSE.txt" Pack="True" PackagePath="" />
62 |   </ItemGroup>
63 |
64 | </Project>
--------------------------------------------------------------------------------
/src/Base/CasedTokenizer.cs:
--------------------------------------------------------------------------------
1 | using BERTTokenizers.Extensions;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 |
6 | namespace BERTTokenizers.Base
7 | {
8 | public abstract class CasedTokenizer : TokenizerBase
9 | {
10 | protected CasedTokenizer(string vocabularyFilePath) : base(vocabularyFilePath) { }
11 |
12 | protected override IEnumerable<string> TokenizeSentence(string text)
13 | {
14 | return text.Split(new string[] { " ", " ", "\r\n" }, StringSplitOptions.None)
15 | .SelectMany(o => o.SplitAndKeep(".,;:\\/?!#$%()=+-*\"'–_`<>&^@{}[]|~'".ToArray()));
16 | }
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/src/Base/TokenizerBase.cs:
--------------------------------------------------------------------------------
1 | using BERTTokenizers.Helpers;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 | using System.Text.RegularExpressions;
6 |
7 | namespace BERTTokenizers.Base
8 | {
9 | public abstract class TokenizerBase
10 | {
11 | protected readonly List<string> _vocabulary;
12 | protected readonly Dictionary<string, int> _vocabularyDict;
13 |
14 | public TokenizerBase(string vocabularyFilePath)
15 | {
16 | _vocabulary = VocabularyReader.ReadFile(vocabularyFilePath);
17 |
18 | _vocabularyDict = new Dictionary<string, int>();
19 | for (int i = 0; i < _vocabulary.Count; i++)
20 | _vocabularyDict[_vocabulary[i]] = i;
21 | }
22 |
23 |
24 | public List<(long InputIds, long TokenTypeIds, long AttentionMask)> Encode(int sequenceLength, params string[] texts)
25 | {
26 | var tokens = Tokenize(texts);
27 |
28 | var padding = Enumerable.Repeat(0L, sequenceLength - tokens.Count).ToList(); // assumes the tokenized input is not longer than sequenceLength
29 |
30 | var tokenIndexes = tokens.Select(token => (long)token.VocabularyIndex).Concat(padding).ToArray();
31 | var segmentIndexes = tokens.Select(token => token.SegmentIndex).Concat(padding).ToArray();
32 | var inputMask = tokens.Select(o => 1L).Concat(padding).ToArray();
33 |
34 | var output = tokenIndexes.Zip(segmentIndexes, Tuple.Create)
35 | .Zip(inputMask, (t, z) => Tuple.Create(t.Item1, t.Item2, z));
36 |
37 | return output.Select(x => (InputIds: x.Item1, TokenTypeIds: x.Item2, AttentionMask:x.Item3)).ToList();
38 | }
39 |
40 | public string IdToToken(int id)
41 | {
42 | return _vocabulary[id];
43 | }
44 |
45 | public List<string> Untokenize(List<string> tokens)
46 | {
47 | var currentToken = string.Empty;
48 | var untokens = new List<string>();
49 | tokens.Reverse();
50 |
51 | tokens.ForEach(token =>
52 | {
53 | if (token.StartsWith("##"))
54 | {
55 | currentToken = token.Replace("##", "") + currentToken;
56 | }
57 | else
58 | {
59 | currentToken = token + currentToken;
60 | untokens.Add(currentToken);
61 | currentToken = string.Empty;
62 | }
63 | });
64 |
65 | untokens.Reverse();
66 |
67 | return untokens;
68 | }
69 |
70 | public List<(string Token, int VocabularyIndex, long SegmentIndex)> Tokenize(params string[] texts)
71 | {
72 | IEnumerable<string> tokens = new string[] { Tokens.Classification };
73 |
74 | foreach (var text in texts)
75 | {
76 | tokens = tokens.Concat(TokenizeSentence(text));
77 | tokens = tokens.Concat(new string[] { Tokens.Separation });
78 | }
79 |
80 | var tokenAndIndex = tokens
81 | .SelectMany(TokenizeSubwords)
82 | .ToList();
83 |
84 | var segmentIndexes = SegmentIndex(tokenAndIndex);
85 |
86 | return tokenAndIndex.Zip(segmentIndexes, (tokenindex, segmentindex)
87 | => (tokenindex.Token, tokenindex.VocabularyIndex, segmentindex)).ToList();
88 | }
89 |
90 | private IEnumerable<long> SegmentIndex(List<(string token, int index)> tokens)
91 | {
92 | var segmentIndex = 0;
93 | var segmentIndexes = new List<long>();
94 |
95 | foreach (var (token, index) in tokens)
96 | {
97 | segmentIndexes.Add(segmentIndex);
98 |
99 | if (token == Tokens.Separation)
100 | {
101 | segmentIndex++;
102 | }
103 | }
104 |
105 | return segmentIndexes;
106 | }
107 |
108 | private IEnumerable<(string Token, int VocabularyIndex)> TokenizeSubwords(string word)
109 | {
110 | if (_vocabularyDict.ContainsKey(word))
111 | {
112 | return new (string, int)[] { (word, _vocabularyDict[word]) };
113 | }
114 |
115 | var tokens = new List<(string, int)>();
116 | var remaining = word;
117 |
118 | while (!string.IsNullOrEmpty(remaining) && remaining.Length > 2)
119 | {
120 | string prefix = null;
121 | int subwordLength = remaining.Length;
122 | while (subwordLength >= 1) // a lower bound of 2 would prevent falling back to single-character tokens
123 | {
124 | string subword = remaining.Substring(0, subwordLength);
125 | if (!_vocabularyDict.ContainsKey(subword))
126 | {
127 | subwordLength--;
128 | continue;
129 | }
130 |
131 | prefix = subword;
132 | break;
133 | }
134 |
135 | if (prefix == null)
136 | {
137 | tokens.Add((Tokens.Unknown, _vocabularyDict[Tokens.Unknown]));
138 |
139 | return tokens;
140 | }
141 |
142 | var regex = new Regex(Regex.Escape(prefix)); // escape, so punctuation in the prefix is matched literally
143 | remaining = regex.Replace(remaining, "##", 1);
144 |
145 | tokens.Add((prefix, _vocabularyDict[prefix]));
146 | }
147 |
148 | if (!string.IsNullOrWhiteSpace(word) && !tokens.Any())
149 | {
150 | tokens.Add((Tokens.Unknown, _vocabularyDict[Tokens.Unknown]));
151 | }
152 |
153 | return tokens;
154 | }
155 |
156 | protected abstract IEnumerable<string> TokenizeSentence(string text);
157 | }
158 | }
159 |
--------------------------------------------------------------------------------
/src/Base/Tokens.cs:
--------------------------------------------------------------------------------
1 | namespace BERTTokenizers.Base
2 | {
3 | public class Tokens
4 | {
5 | public const string Padding = "";
6 | public const string Unknown = "[UNK]";
7 | public const string Classification = "[CLS]";
8 | public const string Separation = "[SEP]";
9 | public const string Mask = "[MASK]";
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/src/Base/UncasedTokenizer.cs:
--------------------------------------------------------------------------------
1 | using BERTTokenizers.Extensions;
2 | using System;
3 | using System.Collections.Generic;
4 | using System.Linq;
5 |
6 | namespace BERTTokenizers.Base
7 | {
8 | public abstract class UncasedTokenizer : TokenizerBase
9 | {
10 | protected UncasedTokenizer(string vocabularyFilePath) : base(vocabularyFilePath)
11 | {
12 | }
13 |
14 | protected override IEnumerable<string> TokenizeSentence(string text)
15 | {
16 | return text.Split(new string[] { " ", " ", "\r\n" }, StringSplitOptions.None)
17 | .SelectMany(o => o.SplitAndKeep(".,;:\\/?!#$%()=+-*\"'–_`<>&^@{}[]|~'".ToArray()))
18 | .Select(o => o.ToLower());
19 | }
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/src/BertBaseTokenizer.cs:
--------------------------------------------------------------------------------
1 | using BERTTokenizers.Base;
2 |
3 | namespace BERTTokenizers
4 | {
5 | public class BertBaseTokenizer : CasedTokenizer
6 | {
7 | public BertBaseTokenizer() : base("./Vocabularies/base_cased.txt")
8 | {
9 | }
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/src/BertCasedCustomVocabulary.cs:
--------------------------------------------------------------------------------
1 | using BERTTokenizers.Base;
2 |
3 | namespace BERTTokenizers
4 | {
5 | public class BertCasedCustomVocabulary : CasedTokenizer
6 | {
7 | public BertCasedCustomVocabulary(string vocabularyFilePath) : base(vocabularyFilePath) { }
8 |
9 | }
10 |
11 | }
12 |
--------------------------------------------------------------------------------
/src/BertGermanTokenizer.cs:
--------------------------------------------------------------------------------
1 | using BERTTokenizers.Base;
2 |
3 | namespace BERTTokenizers
4 | {
5 | public class BertGermanTokenizer : CasedTokenizer
6 | {
7 | public BertGermanTokenizer() : base("./Vocabularies/base_cased_german.txt")
8 | {
9 | }
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/src/BertLargeTokenizer.cs:
--------------------------------------------------------------------------------
1 | using BERTTokenizers.Base;
2 |
3 | namespace BERTTokenizers
4 | {
5 | public class BertLargeTokenizer : CasedTokenizer
6 | {
7 | public BertLargeTokenizer() : base("./Vocabularies/base_cased_large.txt")
8 | {
9 | }
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/src/BertMultilingualTokenizer.cs:
--------------------------------------------------------------------------------
1 | using BERTTokenizers.Base;
2 |
3 | namespace BERTTokenizers
4 | {
5 | public class BertMultilingualTokenizer : CasedTokenizer
6 | {
7 | public BertMultilingualTokenizer() : base("./Vocabularies/base_cased_multilingual.txt")
8 | {
9 | }
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/src/BertUncasedBaseTokenizer.cs:
--------------------------------------------------------------------------------
1 | using BERTTokenizers.Base;
2 |
3 | namespace BERTTokenizers
4 | {
5 | public class BertUncasedBaseTokenizer : UncasedTokenizer
6 | {
7 | public BertUncasedBaseTokenizer() : base("./Vocabularies/base_uncased.txt")
8 | {
9 | }
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/src/BertUncasedCustomVocabulary.cs:
--------------------------------------------------------------------------------
1 | using BERTTokenizers.Base;
2 |
3 | namespace BERTTokenizers
4 | {
5 | public class BertUncasedCustomVocabulary : UncasedTokenizer
6 | {
7 | public BertUncasedCustomVocabulary(string vocabularyFilePath) : base(vocabularyFilePath) { }
8 |
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/src/BertUncasedLargeTokenizer.cs:
--------------------------------------------------------------------------------
1 | using BERTTokenizers.Base;
2 |
3 | namespace BERTTokenizers
4 | {
5 | public class BertUncasedLargeTokenizer : UncasedTokenizer
6 | {
7 | public BertUncasedLargeTokenizer() : base("./Vocabularies/base_uncased_large.txt")
8 | {
9 | }
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/src/Extensions/StringExtension.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using System.Text;
5 | using System.Threading.Tasks;
6 |
7 | namespace BERTTokenizers.Extensions
8 | {
9 | static class StringExtension
10 | {
11 | public static IEnumerable<string> SplitAndKeep(
12 | this string inputString, params char[] delimiters)
13 | {
14 | int start = 0, index;
15 |
16 | while ((index = inputString.IndexOfAny(delimiters, start)) != -1)
17 | {
18 | if (index - start > 0)
19 | yield return inputString.Substring(start, index - start);
20 |
21 | yield return inputString.Substring(index, 1);
22 |
23 | start = index + 1;
24 | }
25 |
26 | if (start < inputString.Length)
27 | {
28 | yield return inputString.Substring(start);
29 | }
30 | }
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/Helpers/VocabularyReader.cs:
--------------------------------------------------------------------------------
1 | using System.Collections.Generic;
2 | using System.IO;
3 |
4 | namespace BERTTokenizers.Helpers
5 | {
6 | public class VocabularyReader
7 | {
8 | public static List<string> ReadFile(string filename)
9 | {
10 | var result = new List<string>();
11 |
12 | using (var reader = new StreamReader(filename))
13 | {
14 | string line;
15 |
16 | while ((line = reader.ReadLine()) != null)
17 | {
18 | if (!string.IsNullOrWhiteSpace(line))
19 | {
20 | result.Add(line);
21 | }
22 | }
23 | }
24 |
25 | return result;
26 | }
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/tests/BERTTokenizers.Tests.csproj:
--------------------------------------------------------------------------------
1 | <Project Sdk="Microsoft.NET.Sdk">
2 |
3 |   <PropertyGroup>
4 |     <TargetFramework>net6.0</TargetFramework>
5 |   </PropertyGroup>
6 |
7 |   <ItemGroup>
8 |     <!-- Standard xUnit test packages; versions are indicative. -->
9 |     <PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.0.0" />
10 |     <PackageReference Include="xunit" Version="2.4.1" />
11 |     <PackageReference Include="xunit.runner.visualstudio" Version="2.4.3">
12 |       <PrivateAssets>all</PrivateAssets>
13 |       <IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
14 |     </PackageReference>
15 |     <PackageReference Include="coverlet.collector" Version="3.1.0">
16 |       <PrivateAssets>all</PrivateAssets>
17 |       <IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
18 |     </PackageReference>
19 |   </ItemGroup>
20 |
21 |   <ItemGroup>
22 |     <ProjectReference Include="..\src\BERTTokenizers.csproj" />
23 |   </ItemGroup>
24 |
25 | </Project>
--------------------------------------------------------------------------------
/tests/BertBaseTokenizerShould.cs:
--------------------------------------------------------------------------------
1 | using BERTTokenizers;
2 | using System.Collections.Generic;
3 | using Xunit;
4 |
5 | namespace BERTTokenizersTests
6 | {
7 | public class BertBaseTokenizerShould
8 | {
9 | private BertBaseTokenizer _tokenizer;
10 |
11 | public BertBaseTokenizerShould()
12 | {
13 | _tokenizer = new BertBaseTokenizer();
14 | }
15 |
16 | [Fact]
17 | public void Tokenize_sentence()
18 | {
19 | var sentence = "I love you";
20 |
21 | var tokens = _tokenizer.Tokenize(sentence);
22 | Assert.Equal(5, tokens.Count);
23 | Assert.Equal(("[CLS]", 101, 0), tokens[0]);
24 | Assert.Equal(("I", 146, 0), tokens[1]);
25 | Assert.Equal(("love", 1567, 0), tokens[2]);
26 | Assert.Equal(("you", 1128, 0), tokens[3]);
27 | Assert.Equal(("[SEP]", 102, 0), tokens[4]);
28 |
29 | }
30 |
31 | [Fact]
32 | public void Encode_sentence()
33 | {
34 | var sentence = "I love you";
35 |
36 | var encoded = _tokenizer.Encode(6, sentence);
37 | Assert.Equal(6, encoded.Count);
38 | Assert.Equal((101, 0, 1), encoded[0]);
39 | Assert.Equal((146, 0, 1), encoded[1]);
40 | Assert.Equal((1567, 0, 1), encoded[2]);
41 | Assert.Equal((1128, 0, 1), encoded[3]);
42 | Assert.Equal((102, 0, 1), encoded[4]);
43 | Assert.Equal((0, 0, 0), encoded[5]);
44 | }
45 |
46 | [Fact]
47 | public void Untokenize_sentence()
48 | {
49 | var tokens = new List<string> { "she", "##s" };
50 |
51 | var sentence = _tokenizer.Untokenize(tokens);
52 | Assert.Single(sentence);
53 | Assert.Equal("shes", sentence[0]);
54 | }
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/tests/BertBaseTokenizerUncasedShould.cs:
--------------------------------------------------------------------------------
1 | using BERTTokenizers;
2 | using System.Collections.Generic;
3 | using Xunit;
4 |
5 | namespace BERTTokenizersTests
6 | {
7 | public class BertBaseTokenizerUncasedShould
8 | {
9 | private BertUncasedBaseTokenizer _tokenizer;
10 |
11 | public BertBaseTokenizerUncasedShould()
12 | {
13 | _tokenizer = new BertUncasedBaseTokenizer();
14 | }
15 |
16 | [Fact]
17 | public void Tokenize_sentence()
18 | {
19 | var sentence = "I love you";
20 |
21 | var tokens = _tokenizer.Tokenize(sentence);
22 | Assert.Equal(5, tokens.Count);
23 | Assert.Equal(("[CLS]", 101, 0), tokens[0]);
24 | Assert.Equal(("i", 1045, 0), tokens[1]);
25 | Assert.Equal(("love", 2293, 0), tokens[2]);
26 | Assert.Equal(("you", 2017, 0), tokens[3]);
27 | Assert.Equal(("[SEP]", 102, 0), tokens[4]);
28 |
29 | }
30 |
31 | [Fact]
32 | public void Encode_admin_example()
33 | {
34 | var sentence = "Joe is an admin";
35 |
36 | var encoded = _tokenizer.Encode(8, sentence);
37 | Assert.Equal(8, encoded.Count);
38 | Assert.Equal((101, 0, 1), encoded[0]);
39 | Assert.Equal((3533, 0, 1), encoded[1]);
40 | Assert.Equal((2003, 0, 1), encoded[2]);
41 | Assert.Equal((2019, 0, 1), encoded[3]);
42 | Assert.Equal((4748, 0, 1), encoded[4]);
43 | Assert.Equal((10020, 0, 1), encoded[5]);
44 | Assert.Equal((102, 0, 1), encoded[6]);
45 | Assert.Equal((0, 0, 0), encoded[7]);
46 | }
47 |
48 | [Fact]
49 | public void Encode_sentence()
50 | {
51 | var sentence = "I love you";
52 |
53 | var encoded = _tokenizer.Encode(6, sentence);
54 | Assert.Equal(6, encoded.Count);
55 | Assert.Equal((101, 0, 1), encoded[0]);
56 | Assert.Equal((1045, 0, 1), encoded[1]);
57 | Assert.Equal((2293, 0, 1), encoded[2]);
58 | Assert.Equal((2017, 0, 1), encoded[3]);
59 | Assert.Equal((102, 0, 1), encoded[4]);
60 | Assert.Equal((0, 0, 0), encoded[5]);
61 | }
62 |
63 | [Fact]
64 | public void Untokenize_sentence()
65 | {
66 | var tokens = new List<string> { "she", "##s" };
67 |
68 | var sentence = _tokenizer.Untokenize(tokens);
69 | Assert.Single(sentence);
70 | Assert.Equal("shes", sentence[0]);
71 | }
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/tests/BertGermanTokenizerShould.cs:
--------------------------------------------------------------------------------
1 | using BERTTokenizers;
2 | using System.Collections.Generic;
3 | using Xunit;
4 |
5 | namespace BERTTokenizersTests
6 | {
7 | public class BertGermanTokenizerShould
8 | {
9 | private BertGermanTokenizer _tokenizer;
10 |
11 | public BertGermanTokenizerShould()
12 | {
13 | _tokenizer = new BertGermanTokenizer();
14 | }
15 |
16 | [Fact]
17 | public void Encode_sentence()
18 | {
19 | var sentence = "Ich liebe dich";
20 |
21 | var encoded = _tokenizer.Encode(6, sentence);
22 | Assert.Equal(6, encoded.Count);
23 | Assert.Equal((102, 0, 1), encoded[0]);
24 | Assert.Equal((395, 0, 1), encoded[1]);
25 | Assert.Equal((6230, 0, 1), encoded[2]);
26 | Assert.Equal((1199, 0, 1), encoded[3]);
27 | Assert.Equal((103, 0, 1), encoded[4]);
28 | Assert.Equal((0, 0, 0), encoded[5]);
29 | }
30 |
31 | [Fact]
32 | public void Tokenize_sentence()
33 | {
34 | var sentence = "Ich liebe dich";
35 |
36 | var tokens = _tokenizer.Tokenize(sentence);
37 | Assert.Equal(5, tokens.Count);
38 | Assert.Equal(("[CLS]", 102, 0), tokens[0]);
39 | Assert.Equal(("Ich", 395, 0), tokens[1]);
40 | Assert.Equal(("liebe", 6230, 0), tokens[2]);
41 | Assert.Equal(("dich", 1199, 0), tokens[3]);
42 | Assert.Equal(("[SEP]", 103, 0), tokens[4]);
43 |
44 | }
45 |
46 | [Fact]
47 | public void Untokenize_sentence()
48 | {
49 | var tokens = new List<string> { "she", "##s" };
50 |
51 | var sentence = _tokenizer.Untokenize(tokens);
52 | Assert.Single(sentence);
53 | Assert.Equal("shes", sentence[0]);
54 | }
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/tests/BertLargeTokenizerShould.cs:
--------------------------------------------------------------------------------
1 | using BERTTokenizers;
2 | using System.Collections.Generic;
3 | using Xunit;
4 |
5 | namespace BERTTokenizersTests
6 | {
7 | public class BertLargeTokenizerShould
8 | {
9 | private BertLargeTokenizer _tokenizer;
10 |
11 | public BertLargeTokenizerShould()
12 | {
13 | _tokenizer = new BertLargeTokenizer();
14 | }
15 |
16 | [Fact]
17 | public void Encode_sentence()
18 | {
19 | var sentence = "I love you";
20 |
21 | var encoded = _tokenizer.Encode(6, sentence);
22 | Assert.Equal(6, encoded.Count);
23 | Assert.Equal((101, 0, 1), encoded[0]);
24 | Assert.Equal((146, 0, 1), encoded[1]);
25 | Assert.Equal((1567, 0, 1), encoded[2]);
26 | Assert.Equal((1128, 0, 1), encoded[3]);
27 | Assert.Equal((102, 0, 1), encoded[4]);
28 | Assert.Equal((0, 0, 0), encoded[5]);
29 | }
30 |
31 | [Fact]
32 | public void Tokenize_sentence()
33 | {
34 | var sentence = "I love you";
35 |
36 | var tokens = _tokenizer.Tokenize(sentence);
37 | Assert.Equal(5, tokens.Count);
38 | Assert.Equal(("[CLS]", 101, 0), tokens[0]);
39 | Assert.Equal(("I", 146, 0), tokens[1]);
40 | Assert.Equal(("love", 1567, 0), tokens[2]);
41 | Assert.Equal(("you", 1128, 0), tokens[3]);
42 | Assert.Equal(("[SEP]", 102, 0), tokens[4]);
43 |
44 | }
45 |
46 | [Fact]
47 | public void Untokenize_sentence()
48 | {
49 | var tokens = new List<string> { "she", "##s" };
50 |
51 | var sentence = _tokenizer.Untokenize(tokens);
52 | Assert.Single(sentence);
53 | Assert.Equal("shes", sentence[0]);
54 | }
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/tests/BertLargeTokenizerUncasedShould.cs:
--------------------------------------------------------------------------------
1 | using BERTTokenizers;
2 | using System.Collections.Generic;
3 | using Xunit;
4 |
5 | namespace BERTTokenizersTests
6 | {
7 | public class BertLargeTokenizerUncasedShould
8 | {
9 | private BertUncasedLargeTokenizer _tokenizer;
10 |
11 | public BertLargeTokenizerUncasedShould()
12 | {
13 | _tokenizer = new BertUncasedLargeTokenizer();
14 | }
15 |
16 | [Fact]
17 | public void Tokenize_sentence()
18 | {
19 | var sentence = "I love you";
20 |
21 | var tokens = _tokenizer.Tokenize(sentence);
22 | Assert.Equal(5, tokens.Count);
23 | Assert.Equal(("[CLS]", 101, 0), tokens[0]);
24 | Assert.Equal(("i", 1045, 0), tokens[1]);
25 | Assert.Equal(("love", 2293, 0), tokens[2]);
26 | Assert.Equal(("you", 2017, 0), tokens[3]);
27 | Assert.Equal(("[SEP]", 102, 0), tokens[4]);
28 |
29 | }
30 |
31 | [Fact]
32 | public void Encode_sentence()
33 | {
34 | var sentence = "I love you";
35 |
36 | var encoded = _tokenizer.Encode(6, sentence);
37 | Assert.Equal(6, encoded.Count);
38 | Assert.Equal((101, 0, 1), encoded[0]);
39 | Assert.Equal((1045, 0, 1), encoded[1]);
40 | Assert.Equal((2293, 0, 1), encoded[2]);
41 | Assert.Equal((2017, 0, 1), encoded[3]);
42 | Assert.Equal((102, 0, 1), encoded[4]);
43 | Assert.Equal((0, 0, 0), encoded[5]);
44 | }
45 |
46 | [Fact]
47 | public void Untokenize_sentence()
48 | {
49 | var tokens = new List<string> { "she", "##s" };
50 |
51 | var sentence = _tokenizer.Untokenize(tokens);
52 | Assert.Single(sentence);
53 | Assert.Equal("shes", sentence[0]);
54 | }
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/tests/BertMultilingualTokenizerShould.cs:
--------------------------------------------------------------------------------
1 | using BERTTokenizers;
2 | using System.Collections.Generic;
3 | using Xunit;
4 |
5 | namespace BERTTokenizersTests
6 | {
7 | public class BertMultilingualTokenizerShould
8 | {
9 | private BertMultilingualTokenizer _tokenizer;
10 |
11 | public BertMultilingualTokenizerShould()
12 | {
13 | _tokenizer = new BertMultilingualTokenizer();
14 | }
15 |
16 | [Fact]
17 | public void Encode_sentence()
18 | {
19 | var sentence = "Je vous aime";
20 |
21 | var encoded = _tokenizer.Encode(6, sentence);
22 | Assert.Equal(6, encoded.Count);
23 | Assert.Equal((101, 0, 1), encoded[0]);
24 | Assert.Equal((13796, 0, 1), encoded[1]);
25 | Assert.Equal((24931, 0, 1), encoded[2]);
26 | Assert.Equal((62691, 0, 1), encoded[3]);
27 | Assert.Equal((102, 0, 1), encoded[4]);
28 | Assert.Equal((0, 0, 0), encoded[5]);
29 | }
30 |
31 | [Fact]
32 | public void Tokenize_sentence()
33 | {
34 | var sentence = "Je vous aime";
35 |
36 | var tokens = _tokenizer.Tokenize(sentence);
37 | Assert.Equal(5, tokens.Count);
38 | Assert.Equal(("[CLS]", 101, 0), tokens[0]);
39 | Assert.Equal(("Je", 13796, 0), tokens[1]);
40 | Assert.Equal(("vous", 24931, 0), tokens[2]);
41 | Assert.Equal(("aime", 62691, 0), tokens[3]);
42 | Assert.Equal(("[SEP]", 102, 0), tokens[4]);
43 | }
44 |
45 | [Fact]
46 | public void Untokenize_sentence()
47 | {
48 | var tokens = new List<string> { "she", "##s" };
49 |
50 | var sentence = _tokenizer.Untokenize(tokens);
51 | Assert.Single(sentence);
52 | Assert.Equal("shes", sentence[0]);
53 | }
54 | }
55 | }
56 |
--------------------------------------------------------------------------------