├── .gitattributes ├── .github └── workflows │ └── ci.yml ├── .readthedocs.yml ├── LICENSE ├── Makefile ├── README.rst ├── SECURITY.md ├── docs ├── .gitignore ├── FAQ.rst ├── Introduction.rst ├── JavaScript │ ├── Alignment.rst │ ├── BiString.rst │ ├── BiStringBuilder.rst │ ├── Tokenization.rst │ ├── Tokenizer.rst │ └── index.rst ├── Makefile ├── Pipfile ├── Pipfile.lock ├── Python │ ├── Alignment.rst │ ├── BistrBuilder.rst │ ├── Tokenization.rst │ ├── Tokenizer.rst │ ├── bistr.rst │ └── index.rst ├── conf.py ├── index.rst ├── make.bat ├── package-lock.json ├── package.json └── requirements.txt ├── js ├── .gitignore ├── Makefile ├── README.md ├── package-lock.json ├── package.json ├── rollup.config.mjs ├── scripts │ └── generate_unicode.py ├── src │ ├── alignment.ts │ ├── bistring.ts │ ├── builder.ts │ ├── index.ts │ ├── infer.ts │ ├── regex.ts │ ├── token.ts │ └── unicode.ts ├── tests │ ├── alignment.test.ts │ ├── bistring.test.ts │ ├── builder.test.ts │ └── token.test.ts └── tsconfig.json └── python ├── .gitignore ├── Makefile ├── Pipfile ├── Pipfile.lock ├── README.rst ├── bistring ├── __init__.py ├── _alignment.py ├── _bistr.py ├── _builder.py ├── _icu.py ├── _infer.py ├── _regex.py ├── _token.py ├── _typing.py └── py.typed ├── mypy.ini ├── pyproject.toml ├── setup.py ├── stubs └── icu.pyi └── tests ├── __init__.py ├── test_alignment.py ├── test_bistr.py ├── test_builder.py └── test_token.py /.gitattributes: -------------------------------------------------------------------------------- 1 | Pipfile.lock binary 2 | package-lock.json binary 3 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | 15 | strategy: 16 | matrix: 17 | python-version: ["3.10", "3.11"] 18 
| 19 | steps: 20 | - uses: actions/checkout@v2 21 | 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | 27 | - name: Install dependencies 28 | run: | 29 | pip install pipenv 30 | echo bistring >python/.venv 31 | echo bistring >docs/.venv 32 | make deps 33 | 34 | - name: Run tests 35 | run: | 36 | make check 37 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | sphinx: 7 | configuration: docs/conf.py 8 | 9 | build: 10 | os: "ubuntu-20.04" 11 | tools: 12 | python: "3" 13 | nodejs: "16" 14 | 15 | formats: all 16 | 17 | python: 18 | install: 19 | - requirements: docs/requirements.txt 20 | - method: pip 21 | path: ./python 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | +$(MAKE) -C python 3 | +$(MAKE) -C js 4 | +$(MAKE) -C docs html 5 | 6 | deps: 7 | +$(MAKE) -C python deps 8 | +$(MAKE) -C js deps 9 | +$(MAKE) -C docs deps 10 | 11 | check: 12 | +$(MAKE) -C python check 13 | +$(MAKE) -C js check 14 | +$(MAKE) -C docs doctest 15 | 16 | clean: 17 | +$(MAKE) -C python clean 18 | +$(MAKE) -C js clean 19 | +$(MAKE) -C docs clean 20 | 21 | .PHONY: all deps check clean 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | bistring 2 | ======== 3 | 4 | |Build status| |Documentation status| 5 | 6 | The bistring library provides non-destructive versions of common string processing operations like normalization, case folding, and find/replace. 7 | Each bistring remembers the original string, and how its substrings map to substrings of the modified version. 8 | 9 | For example: 10 | 11 | .. 
code-block:: python 12 | 13 | >>> from bistring import bistr 14 | >>> s = bistr('𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 🐶') 15 | >>> s = s.normalize('NFKD') # Unicode normalization 16 | >>> s = s.casefold() # Case-insensitivity 17 | >>> s = s.replace('🦊', 'fox') # Replace emoji with text 18 | >>> s = s.replace('🐶', 'dog') 19 | >>> s = s.sub(r'[^\w\s]+', '') # Strip everything but letters and spaces 20 | >>> s = s[:19] # Extract a substring 21 | >>> s.modified # The modified substring, after changes 22 | 'the quick brown fox' 23 | >>> s.original # The original substring, before changes 24 | '𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊' 25 | 26 | 27 | Languages 28 | --------- 29 | 30 | |PyPI version| |npm version| 31 | 32 | bistring is available in multiple languages, currently `Python `_ and `JavaScript/TypeScript `_. 33 | Ports to other languages are planned for the near future. 34 | 35 | The code is structured similarly in each language to make it easy to share algorithms, tests, and fixes between them. 36 | The main differences come from trying to mirror the language's built-in string API. 37 | If you want to contribute a bug fix or a new feature, feel free to implement it in any one of the supported languages, and we'll try to port it to the rest of them. 38 | 39 | 40 | Demo 41 | ---- 42 | 43 | `Click here `_ for a live demo of the bistring library in your browser. 44 | 45 | 46 | Contributing 47 | ------------ 48 | 49 | This project welcomes contributions and suggestions. 50 | Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. 51 | For details, visit https://cla.microsoft.com. 52 | 53 | When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). 54 | Simply follow the instructions provided by the bot. 
You will only need to do this once across all repos using our CLA. 55 | 56 | This project has adopted the `Microsoft Open Source Code of Conduct `_. 57 | For more information see the `Code of Conduct FAQ `_ or contact `opencode@microsoft.com `_ with any additional questions or comments. 58 | 59 | 60 | .. |Build status| image:: https://github.com/microsoft/bistring/actions/workflows/ci.yml/badge.svg 61 | :target: https://github.com/microsoft/bistring/actions/workflows/ci.yml 62 | .. |Documentation status| image:: https://readthedocs.org/projects/bistring/badge/?version=latest 63 | :target: https://bistring.readthedocs.io/en/latest/?badge=latest 64 | .. |PyPI version| image:: https://badge.fury.io/py/bistring.svg 65 | :target: https://pypi.org/project/bistring/ 66 | .. |npm version| image:: https://badge.fury.io/js/bistring.svg 67 | :target: https://www.npmjs.com/package/bistring 68 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 
8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 
36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Sphinx documentation 7 | _build/ 8 | 9 | # Dependency directories 10 | node_modules/ 11 | -------------------------------------------------------------------------------- /docs/FAQ.rst: -------------------------------------------------------------------------------- 1 | Frequently Asked Questions 2 | ========================== 3 | 4 | 5 | What is a bistring, anyway? 6 | --------------------------- 7 | 8 | Simply put, a `bistring` is a pair of strings, an original string and a modified one, along with information about how they align with each other. 9 | The :class:`bistring.bistr` class has an API very similar to the built-in :class:`str`, but all its operations keep track of the original string and the alignment for you. 10 | 11 | >>> from bistring import bistr 12 | >>> s = bistr('HELLO WORLD') 13 | >>> print(s) 14 | ⮎'HELLO WORLD'⮌ 15 | >>> s = s.lower() 16 | >>> print(s) 17 | ('HELLO WORLD' ⇋ 'hello world') 18 | >>> print(s[6:]) 19 | ('WORLD' ⇋ 'world') 20 | 21 | 22 | Why am I getting more text than I expect when slicing? 23 | ------------------------------------------------------ 24 | 25 | When a bistring doesn't have precise enough alignment information to slice exactly, it will give you back the smallest string it knows for certain contains a match for the region you requested. 26 | In the worst case, that may be the entire string! 
27 | This happens, for example, when you use the two-argument `bistr` constructor, which makes no effort to infer a granular alignment between the strings: 28 | 29 | >>> s = bistr('color', 'colour') 30 | >>> print(s[3:5]) 31 | ('color' ⇋ 'ou') 32 | 33 | Instead, you should start from your original string as a `bistr`, and then transform it how you want: 34 | 35 | >>> s = bistr('color') 36 | >>> s = s.sub(r'(?<=col)o(?=r)', 'ou') 37 | >>> print(s) 38 | ('color' ⇋ 'colour') 39 | >>> print(s[3:5]) 40 | ('o' ⇋ 'ou') 41 | 42 | Alternatively, you can piece many smaller bistrings together to achieve the alignment you want manually: 43 | 44 | >>> s = bistr('col') + bistr('o', 'ou') + bistr('r') 45 | >>> print(s) 46 | ('color' ⇋ 'colour') 47 | >>> print(s[3:5]) 48 | ('o' ⇋ 'ou') 49 | 50 | 51 | What if I don't know the alignment? 52 | ----------------------------------- 53 | 54 | If at all possible, you should use `bistring` all the way through your text processing code, which will ensure an accurate alignment is tracked for you. 55 | If you don't control that code, or there are other reasons it won't work with `bistring`, you can still have us guess an alignment for you in simple cases with :meth:`bistring.bistr.infer`. 56 | 57 | >>> s = bistr.infer('color', 'colour') 58 | >>> print(s[0:3]) 59 | ⮎'col'⮌ 60 | >>> print(s[3:5]) 61 | ('o' ⇋ 'ou') 62 | >>> print(s[5:6]) 63 | ⮎'r'⮌ 64 | 65 | `infer()` is an expensive operation (``O(N*M)`` in the length of the strings), so if you absolutely need it, try to use it only for short strings. 66 | 67 | 68 | How do I get the actual indices, rather than just substrings? 
69 | ------------------------------------------------------------- 70 | 71 | Use :attr:`bistring.bistr.alignment`: 72 | 73 | >>> s = bistr('The quick, brown 🦊') 74 | >>> s = s.replace(',', '') 75 | >>> s = s.replace('🦊', 'fox') 76 | >>> print(s[16:19]) 77 | ('🦊' ⇋ 'fox') 78 | >>> s.alignment.original_bounds(16, 19) 79 | (17, 18) 80 | >>> s.alignment.modified_bounds(11, 16) 81 | (10, 15) 82 | >>> print(s[10:15]) 83 | ⮎'brown'⮌ 84 | 85 | See :class:`bistring.Alignment` for more details. 86 | 87 | 88 | How do I perform case-insensitive operations? 89 | --------------------------------------------- 90 | 91 | Use :meth:`bistring.bistr.casefold`. 92 | Do not use :meth:`~bistring.bistr.lower`, :meth:`~bistring.bistr.upper`, or any other method, as you will get wrong results for many non-English languages. 93 | 94 | To check case-insensitive equality, you don't even need `bistring`: 95 | 96 | >>> 'HELLO WORLD!'.casefold() == 'HeLlO wOrLd!'.casefold() 97 | True 98 | 99 | To search for a substring case-insensitively: 100 | 101 | >>> s = bistr('Bundesstraße').casefold() 102 | >>> s.find_bounds('STRASSE'.casefold()) 103 | (6, 13) 104 | >>> print(s[6:13]) 105 | ('straße' ⇋ 'strasse') 106 | 107 | 108 | Forget case insensitivity, how do I make sure that identical looking strings compare equal? 109 | ------------------------------------------------------------------------------------------- 110 | 111 | This is a hard problem with Unicode strings. 112 | To start with, you should at least perform some kind of `Unicode normalization `_. 113 | That ensures that different ways of writing the semantically identical thing (e.g. with precomposed accented characters vs. 
combining accents) become actually identical: 114 | 115 | >>> a = bistr('\u00EAtre') # 'être' with a single character for the ê 116 | >>> b = bistr('e\u0302tre') # 'être' with an 'e' and a combining '^' 117 | >>> a.normalize('NFC').modified == b.normalize('NFC').modified 118 | True 119 | >>> a.normalize('NFD').modified == b.normalize('NFD').modified 120 | True 121 | 122 | Normalization form NFC tries to keep precomposed characters together whenever possible, while NFD always decomposes them. 123 | In general, NFC is more convenient for people to work with, but NFD can be useful for things like removing accents and other combining marks from text. 124 | 125 | 126 | What about similar-looking strings, that aren't necessarily identical? 127 | ---------------------------------------------------------------------- 128 | 129 | Unicode contains things like ligatures, alternative scripts, and other oddities that can result in similar-looking strings that are represented very differently. 130 | Here is where the "compatibility" normalization forms, NFKC and NFKD, can help: 131 | 132 | >>> s = bistr('𝕳𝖊𝖑𝖑𝖔 𝖜𝖔𝖗𝖑𝖉') 133 | >>> s = s.normalize('NFKC') 134 | >>> print(s) 135 | ('𝕳𝖊𝖑𝖑𝖔 𝖜𝖔𝖗𝖑𝖉' ⇋ 'Hello world') 136 | >>> print(s[6:]) 137 | ('𝖜𝖔𝖗𝖑𝖉' ⇋ 'world') 138 | 139 | 140 | How do I ensure I get the same results on every machine? 141 | -------------------------------------------------------- 142 | 143 | Always pass an explicit locale to any `bistr` method that takes one. 144 | Many of Python's string APIs implicitly use the system's default locale, which may be quite different than the one you developed with. 145 | While this may be the right behaviour if you're displaying strings to the current user, it's rarely the right behaviour if you're dealing with text that originated or will be displayed elsewhere, e.g. for cloud software. 
146 | `bistr` always accepts a locale parameter in these APIs, to ensure reproducible and sensible results: 147 | 148 | >>> # s will be 'I' in most locales, but 'İ' in Turkish locales! 149 | >>> s = bistr('i').upper() 150 | >>> # An English locale guarantees a dotless capital I 151 | >>> print(bistr('i').upper('en_US')) 152 | ('i' ⇋ 'I') 153 | >>> # A Turkish locale gives a dotted capital İ 154 | >>> print(bistr('i').upper('tr_TR')) 155 | ('i' ⇋ 'İ') 156 | 157 | 158 | Tokenization 159 | ------------ 160 | 161 | How do I tokenize text in a reversible way? 162 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 163 | 164 | `bistring` provides some convenient tokenization APIs that track string indices. 165 | To use Unicode word boundary rules, for example: 166 | 167 | >>> from bistring import WordTokenizer 168 | >>> tokenizer = WordTokenizer('en_US') 169 | >>> tokens = tokenizer.tokenize('The quick, brown fox jumps over the lazy dog') 170 | >>> print(tokens[1]) 171 | [4:9]=⮎'quick'⮌ 172 | 173 | 174 | How do I find the whole substring of text for some tokens? 175 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 176 | 177 | :meth:`bistring.Tokenization.substring` gives the substring itself. 178 | :meth:`bistring.Tokenization.text_bounds` gives the bounds of that substring. 179 | 180 | >>> print(tokens.substring(1, 3)) 181 | ⮎'quick, brown'⮌ 182 | >>> tokens.text_bounds(1, 3) 183 | (4, 16) 184 | 185 | 186 | How do I find the tokens for a substring of text? 187 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 188 | 189 | :meth:`bistring.Tokenization.bounds_for_text` 190 | 191 | >>> tokens.bounds_for_text(4, 16) 192 | (1, 3) 193 | >>> print(tokens.substring(1, 3)) 194 | ⮎'quick, brown'⮌ 195 | 196 | 197 | How do I snap a substring of text to the nearest token boundaries? 
198 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 199 | 200 | :meth:`bistring.Tokenization.snap_text_bounds` 201 | 202 | >>> print(tokens.text[6:14]) 203 | ⮎'ick, bro'⮌ 204 | >>> tokens.snap_text_bounds(6, 14) 205 | (4, 16) 206 | >>> print(tokens.text[4:16]) 207 | ⮎'quick, brown'⮌ 208 | 209 | 210 | What if I don't know the token positions? 211 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 212 | 213 | If at all possible, you should use a :class:`bistring.Tokenizer` or some other method that tokenizes with position information. 214 | If you can't, you can use :meth:`bistring.Tokenization.infer` to guess the alignment for you: 215 | 216 | >>> from bistring import Tokenization 217 | >>> tokens = Tokenization.infer('hello, world!', ['hello', 'world']) 218 | >>> print(tokens[0]) 219 | [0:5]=⮎'hello'⮌ 220 | >>> print(tokens[1]) 221 | [7:12]=⮎'world'⮌ 222 | -------------------------------------------------------------------------------- /docs/Introduction.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============ 3 | 4 | Many operations commonly performed on text strings are destructive; that is, they lose some information about the original string. 5 | Systems that deal with text will commonly perform many of these operations on their input, whether it's changing case, performing unicode normalization, collapsing whitespace, stripping punctuation, etc. 6 | This helps systems behave in a more uniform manner regarding the many different ways you or I might express the same thing. 7 | But the consequence is that when handling parts of this processed text, it may be hard to know what exactly the user originally wrote. 8 | Sometimes those details can be very important to the user. 9 | 10 | Consider an AI personal assistant, for example, that is helping a user send a text message to a friend. 11 | The user writes, 12 | 13 | send jane a text that says, "Hey! How are you? 
Haven't seen you in a while, what's up 😀" 14 | 15 | The system may perform some normalization on that text, such that it ends up looking like this, with casing and punctuation gone: 16 | 17 | send jane a text that says hey how are you havent seen you in a while whats up emoji 18 | 19 | The AI may then identify that the body of the message should be: 20 | 21 | hey how are you havent seen you in a while whats up emoji 22 | 23 | However, that message wouldn't make much sense as-is. 24 | If the assistant uses `bistring` though, it's easy for it to match that with the original text the user intended: 25 | 26 | >>> from bistring import bistr 27 | >>> query = bistr( 28 | ... 'send jane a text that says, ' 29 | ... '"Hey! How are you? Haven\'t seen you in a while, what\'s up 😀"' 30 | ... ) 31 | 32 | >>> # Get rid of upper-/lower-case distinctions 33 | >>> query = query.casefold() 34 | >>> print(query.modified) 35 | send jane a text that says, "hey! how are you? haven't seen you in a while, what's up 😀" 36 | 37 | >>> import regex 38 | >>> # Remove all punctuation 39 | >>> query = query.sub(regex.compile(r'\pP'), '') 40 | >>> # Replace all symbols with 'emoji' 41 | >>> query = query.sub(regex.compile(r'\pS'), 'emoji') 42 | >>> print(query.modified) 43 | send jane a text that says hey how are you havent seen you in a while whats up emoji 44 | 45 | >>> # Extract the substring we care about, the message body 46 | >>> message = query[27:84] 47 | >>> print(message.modified) 48 | hey how are you havent seen you in a while whats up emoji 49 | >>> print(message.original) 50 | Hey! How are you? Haven't seen you in a while, what's up 😀 51 | 52 | Every `bistr` keeps track of the original string it started with, and maintains a sequence alignment between the original and the modified strings. 53 | This alignment means that it knows exactly what substring of the original text is associated with every chunk of the modified text. 
54 | So when you slice a `bistr`, you get the matching slice of original text automatically! 55 | -------------------------------------------------------------------------------- /docs/JavaScript/Alignment.rst: -------------------------------------------------------------------------------- 1 | Alignment 2 | ========= 3 | 4 | .. js:autoclass:: Alignment 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/JavaScript/BiString.rst: -------------------------------------------------------------------------------- 1 | BiString 2 | ======== 3 | 4 | .. js:autoclass:: BiString 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/JavaScript/BiStringBuilder.rst: -------------------------------------------------------------------------------- 1 | BiStringBuilder 2 | =============== 3 | 4 | .. js:autoclass:: BiStringBuilder 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/JavaScript/Tokenization.rst: -------------------------------------------------------------------------------- 1 | Tokenization 2 | ============ 3 | 4 | .. js:autoclass:: Token 5 | :members: 6 | 7 | .. js:autoclass:: Tokenization 8 | :members: 9 | -------------------------------------------------------------------------------- /docs/JavaScript/Tokenizer.rst: -------------------------------------------------------------------------------- 1 | Tokenizer 2 | ========= 3 | 4 | .. js:autoclass:: Tokenizer 5 | :members: 6 | 7 | .. js:autoclass:: RegExpTokenizer 8 | :members: 9 | 10 | .. js:autoclass:: SplittingTokenizer 11 | :members: 12 | -------------------------------------------------------------------------------- /docs/JavaScript/index.rst: -------------------------------------------------------------------------------- 1 | JavaScript 2 | ========== 3 | 4 | .. 
toctree:: 5 | 6 | BiString 7 | BiStringBuilder 8 | Alignment 9 | Tokenization 10 | Tokenizer 11 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= pipenv run sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | deps: 16 | pipenv sync 17 | npm install 18 | 19 | .PHONY: help deps Makefile 20 | 21 | # Catch-all target: route all unknown targets to Sphinx using the new 22 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 23 | %: Makefile 24 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 25 | -------------------------------------------------------------------------------- /docs/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | regex = "*" 10 | sphinx = "*" 11 | sphinx-autodoc-typehints = "*" 12 | sphinx-rtd-theme = "*" 13 | sphinx-js = {git = "https://github.com/pyodide/sphinx-js", ref = "28105a224ec6c4c3141d10e182b9feaae7f10183"} 14 | pydantic = "<2.0" 15 | bistring = {editable = true, path = "./../python"} 16 | -------------------------------------------------------------------------------- /docs/Python/Alignment.rst: -------------------------------------------------------------------------------- 1 | Alignment 2 | ========= 3 | 4 | .. testsetup:: * 5 | 6 | from bistring import Alignment 7 | 8 | .. 
autoclass:: bistring.Alignment 9 | -------------------------------------------------------------------------------- /docs/Python/BistrBuilder.rst: -------------------------------------------------------------------------------- 1 | BistrBuilder 2 | ============ 3 | 4 | .. testsetup:: * 5 | 6 | from bistring import BistrBuilder 7 | 8 | .. autoclass:: bistring.BistrBuilder 9 | -------------------------------------------------------------------------------- /docs/Python/Tokenization.rst: -------------------------------------------------------------------------------- 1 | Tokenization 2 | ============ 3 | 4 | .. testsetup:: * 5 | 6 | from bistring import Tokenization 7 | 8 | .. autoclass:: bistring.Token 9 | 10 | .. autoclass:: bistring.Tokenization 11 | -------------------------------------------------------------------------------- /docs/Python/Tokenizer.rst: -------------------------------------------------------------------------------- 1 | Tokenizer 2 | ========= 3 | 4 | .. testsetup:: * 5 | 6 | from bistring import RegexTokenizer, SplittingTokenizer, CharacterTokenizer, WordTokenizer, SentenceTokenizer 7 | 8 | .. autoclass:: bistring.Tokenizer 9 | 10 | .. autoclass:: bistring.RegexTokenizer 11 | 12 | .. autoclass:: bistring.SplittingTokenizer 13 | 14 | .. autoclass:: bistring.CharacterTokenizer 15 | 16 | .. autoclass:: bistring.WordTokenizer 17 | 18 | .. autoclass:: bistring.SentenceTokenizer 19 | 20 | -------------------------------------------------------------------------------- /docs/Python/bistr.rst: -------------------------------------------------------------------------------- 1 | bistr 2 | ===== 3 | 4 | .. testsetup:: * 5 | 6 | from bistring import bistr, Alignment 7 | 8 | .. autoclass:: bistring.bistr 9 | -------------------------------------------------------------------------------- /docs/Python/index.rst: -------------------------------------------------------------------------------- 1 | Python 2 | ====== 3 | 4 | .. 
toctree:: 5 | 6 | bistr 7 | BistrBuilder 8 | Alignment 9 | Tokenization 10 | Tokenizer 11 | 12 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 3 | 4 | # Configuration file for the Sphinx documentation builder. 5 | # 6 | # This file only contains a selection of the most common options. For a full 7 | # list see the documentation: 8 | # http://www.sphinx-doc.org/en/master/config 9 | 10 | # -- Path setup -------------------------------------------------------------- 11 | 12 | # If extensions (or modules to document with autodoc) are in another directory, 13 | # add these directories to sys.path here. If the directory is relative to the 14 | # documentation root, use os.path.abspath to make it absolute, like shown here. 15 | 16 | import os 17 | from pathlib import Path 18 | import subprocess 19 | 20 | 21 | # -- Project information ----------------------------------------------------- 22 | 23 | project = 'bistring' 24 | copyright = '2022, Microsoft' 25 | author = 'Tavian Barnes' 26 | 27 | # The full version, including alpha/beta/rc tags 28 | release = '0.5.0' 29 | 30 | 31 | # -- General configuration --------------------------------------------------- 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = [ 37 | 'sphinx.ext.autodoc', 38 | 'sphinx.ext.doctest', 39 | 'sphinx.ext.intersphinx', 40 | 'sphinx_autodoc_typehints', 41 | 'sphinx_js', 42 | ] 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # List of patterns, relative to source directory, that match files and 48 | # directories to ignore when looking for source files. 
49 | # This pattern also affects html_static_path and html_extra_path. 50 | exclude_patterns = [ 51 | 'node_modules', 52 | '_build', 53 | 'Thumbs.db', 54 | '.DS_Store', 55 | ] 56 | 57 | 58 | # -- Intersphinx configuration ----------------------------------------------- 59 | 60 | intersphinx_mapping = { 61 | 'python': ('https://docs.python.org/3', None), 62 | } 63 | 64 | 65 | # -- Autodoc configuration --------------------------------------------------- 66 | 67 | autoclass_content = 'both' 68 | 69 | autodoc_default_options = { 70 | 'members': True, 71 | 'member-order': 'bysource', 72 | 'show-inheritance': True, 73 | 'special-members': '__getitem__', 74 | } 75 | 76 | autodoc_inherit_docstrings = False 77 | 78 | 79 | # -- sphinx-js configuration ------------------------------------------------- 80 | 81 | parent = Path(__file__).parent.resolve() 82 | npm_bin = parent/'node_modules/.bin' 83 | os.environ["PATH"] = str(npm_bin) + ":" + os.environ["PATH"] 84 | 85 | js_language = 'typescript' 86 | 87 | js_source_path = '../js/src' 88 | 89 | jsdoc_config_path = '../js/tsconfig.json' 90 | 91 | root_for_relative_js_paths = '..' 92 | 93 | def npm_install(app, config): 94 | node_modules = parent/'node_modules' 95 | if not node_modules.exists(): 96 | subprocess.run(['npm', '--prefix=' + str(parent), 'install']) 97 | 98 | def setup(app): 99 | app.connect('config-inited', npm_install) 100 | 101 | 102 | # -- Options for HTML output ------------------------------------------------- 103 | 104 | # The theme to use for HTML and HTML Help pages. See the documentation for 105 | # a list of builtin themes. 
106 | # 107 | html_theme = 'sphinx_rtd_theme' 108 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | bistring 2 | ======== 3 | 4 | The bistring library provides non-destructive versions of common string processing operations like normalization, case folding, and find/replace. 5 | Each bistring remembers the original string, and how its substrings map to substrings of the modified version. 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | :caption: Contents: 10 | 11 | Introduction 12 | FAQ 13 | Python/index 14 | JavaScript/index 15 | 16 | 17 | Indices and tables 18 | ================== 19 | 20 | * :ref:`genindex` 21 | * :ref:`modindex` 22 | * :ref:`search` 23 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "docs", 3 | "lockfileVersion": 3, 4 | "requires": true, 5 | "packages": { 6 | "": { 7 | "dependencies": { 8 | "typedoc": "^0.22.11", 9 | "typescript": "^4.7.0" 10 | } 11 | }, 12 | "node_modules/balanced-match": { 13 | "version": "1.0.2", 14 | "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", 15 | "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==" 16 | }, 17 | "node_modules/brace-expansion": { 18 | "version": "2.0.1", 19 | "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz", 20 | "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==", 21 | "dependencies": { 22 | "balanced-match": "^1.0.0" 23 | } 24 | }, 25 | "node_modules/fs.realpath": { 26 | "version": "1.0.0", 27 | "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", 28 | "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==" 29 | }, 30 | "node_modules/glob": { 31 | "version": "8.1.0", 32 | "resolved": "https://registry.npmjs.org/glob/-/glob-8.1.0.tgz", 33 | "integrity": "sha512-r8hpEjiQEYlF2QU0df3dS+nxxSIreXQS1qRhMJM0Q5NDdR386C7jb7Hwwod8Fgiuex+k0GFjgft18yvxm5XoCQ==", 34 | "dependencies": { 35 | "fs.realpath": "^1.0.0", 36 | "inflight": "^1.0.4", 37 | "inherits": "2", 38 | "minimatch": "^5.0.1", 39 | "once": "^1.3.0" 40 | }, 41 | "engines": { 42 | 
"node": ">=12" 43 | }, 44 | "funding": { 45 | "url": "https://github.com/sponsors/isaacs" 46 | } 47 | }, 48 | "node_modules/inflight": { 49 | "version": "1.0.6", 50 | "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", 51 | "integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==", 52 | "dependencies": { 53 | "once": "^1.3.0", 54 | "wrappy": "1" 55 | } 56 | }, 57 | "node_modules/inherits": { 58 | "version": "2.0.4", 59 | "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", 60 | "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==" 61 | }, 62 | "node_modules/jsonc-parser": { 63 | "version": "3.2.0", 64 | "resolved": "https://registry.npmjs.org/jsonc-parser/-/jsonc-parser-3.2.0.tgz", 65 | "integrity": "sha512-gfFQZrcTc8CnKXp6Y4/CBT3fTc0OVuDofpre4aEeEpSBPV5X5v4+Vmx+8snU7RLPrNHPKSgLxGo9YuQzz20o+w==" 66 | }, 67 | "node_modules/lunr": { 68 | "version": "2.3.9", 69 | "resolved": "https://registry.npmjs.org/lunr/-/lunr-2.3.9.tgz", 70 | "integrity": "sha512-zTU3DaZaF3Rt9rhN3uBMGQD3dD2/vFQqnvZCDv4dl5iOzq2IZQqTxu90r4E5J+nP70J3ilqVCrbho2eWaeW8Ow==" 71 | }, 72 | "node_modules/marked": { 73 | "version": "4.3.0", 74 | "resolved": "https://registry.npmjs.org/marked/-/marked-4.3.0.tgz", 75 | "integrity": "sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A==", 76 | "bin": { 77 | "marked": "bin/marked.js" 78 | }, 79 | "engines": { 80 | "node": ">= 12" 81 | } 82 | }, 83 | "node_modules/minimatch": { 84 | "version": "5.1.6", 85 | "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-5.1.6.tgz", 86 | "integrity": "sha512-lKwV/1brpG6mBUFHtb7NUmtABCb2WZZmm2wNiOA5hAb8VdCS4B3dtMWyvcoViccwAW/COERjXLt0zP1zXUN26g==", 87 | "dependencies": { 88 | "brace-expansion": "^2.0.1" 89 | }, 90 | "engines": { 91 | "node": ">=10" 92 | } 93 | }, 94 | "node_modules/once": { 95 | "version": "1.4.0", 96 | 
"resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", 97 | "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==", 98 | "dependencies": { 99 | "wrappy": "1" 100 | } 101 | }, 102 | "node_modules/shiki": { 103 | "version": "0.10.1", 104 | "resolved": "https://registry.npmjs.org/shiki/-/shiki-0.10.1.tgz", 105 | "integrity": "sha512-VsY7QJVzU51j5o1+DguUd+6vmCmZ5v/6gYu4vyYAhzjuNQU6P/vmSy4uQaOhvje031qQMiW0d2BwgMH52vqMng==", 106 | "dependencies": { 107 | "jsonc-parser": "^3.0.0", 108 | "vscode-oniguruma": "^1.6.1", 109 | "vscode-textmate": "5.2.0" 110 | } 111 | }, 112 | "node_modules/typedoc": { 113 | "version": "0.22.18", 114 | "resolved": "https://registry.npmjs.org/typedoc/-/typedoc-0.22.18.tgz", 115 | "integrity": "sha512-NK9RlLhRUGMvc6Rw5USEYgT4DVAUFk7IF7Q6MYfpJ88KnTZP7EneEa4RcP+tX1auAcz7QT1Iy0bUSZBYYHdoyA==", 116 | "dependencies": { 117 | "glob": "^8.0.3", 118 | "lunr": "^2.3.9", 119 | "marked": "^4.0.16", 120 | "minimatch": "^5.1.0", 121 | "shiki": "^0.10.1" 122 | }, 123 | "bin": { 124 | "typedoc": "bin/typedoc" 125 | }, 126 | "engines": { 127 | "node": ">= 12.10.0" 128 | }, 129 | "peerDependencies": { 130 | "typescript": "4.0.x || 4.1.x || 4.2.x || 4.3.x || 4.4.x || 4.5.x || 4.6.x || 4.7.x" 131 | } 132 | }, 133 | "node_modules/typescript": { 134 | "version": "4.7.4", 135 | "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.7.4.tgz", 136 | "integrity": "sha512-C0WQT0gezHuw6AdY1M2jxUO83Rjf0HP7Sk1DtXj6j1EwkQNZrHAg2XPWlq62oqEhYvONq5pkC2Y9oPljWToLmQ==", 137 | "bin": { 138 | "tsc": "bin/tsc", 139 | "tsserver": "bin/tsserver" 140 | }, 141 | "engines": { 142 | "node": ">=4.2.0" 143 | } 144 | }, 145 | "node_modules/vscode-oniguruma": { 146 | "version": "1.7.0", 147 | "resolved": "https://registry.npmjs.org/vscode-oniguruma/-/vscode-oniguruma-1.7.0.tgz", 148 | "integrity": "sha512-L9WMGRfrjOhgHSdOYgCt/yRMsXzLDJSL7BPrOZt73gU0iWO4mpqzqQzOz5srxqTvMBaR0XZTSrVWo4j55Rc6cA==" 149 | }, 150 | 
"node_modules/vscode-textmate": { 151 | "version": "5.2.0", 152 | "resolved": "https://registry.npmjs.org/vscode-textmate/-/vscode-textmate-5.2.0.tgz", 153 | "integrity": "sha512-Uw5ooOQxRASHgu6C7GVvUxisKXfSgW4oFlO+aa+PAkgmH89O3CXxEEzNRNtHSqtXFTl0nAC1uYj0GMSH27uwtQ==" 154 | }, 155 | "node_modules/wrappy": { 156 | "version": "1.0.2", 157 | "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", 158 | "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==" 159 | } 160 | } 161 | } 162 | -------------------------------------------------------------------------------- /docs/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "typedoc": "^0.22.11", 4 | "typescript": "^4.7.0" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | -i https://pypi.org/simple 2 | alabaster==0.7.13; python_version >= '3.6' 3 | babel==2.12.1; python_version >= '3.7' 4 | ./../python 5 | certifi==2023.7.22; python_version >= '3.6' 6 | charset-normalizer==3.2.0; python_full_version >= '3.7.0' 7 | docutils==0.18.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' 8 | idna==3.4; python_version >= '3.5' 9 | imagesize==1.4.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' 10 | jinja2==3.1.2; python_version >= '3.7' 11 | markupsafe==2.0.1; python_version >= '3.6' 12 | packaging==23.1; python_version >= '3.7' 13 | parsimonious==0.10.0 14 | pydantic==1.10.12 15 | pygments==2.15.1; python_version >= '3.7' 16 | pyicu==2.11 17 | regex==2023.6.3 18 | requests==2.31.0; python_version >= '3.7' 19 | snowballstemmer==2.2.0 20 | sphinx==5.3.0 21 | sphinx-autodoc-typehints==1.23.0 22 | sphinx-js @ 
git+https://github.com/pyodide/sphinx-js@28105a224ec6c4c3141d10e182b9feaae7f10183 23 | sphinx-rtd-theme==1.2.2 24 | sphinxcontrib-applehelp==1.0.4; python_version >= '3.8' 25 | sphinxcontrib-devhelp==1.0.2; python_version >= '3.5' 26 | sphinxcontrib-htmlhelp==2.0.1; python_version >= '3.8' 27 | sphinxcontrib-jquery==4.1; python_version >= '2.7' 28 | sphinxcontrib-jsmath==1.0.1; python_version >= '3.5' 29 | sphinxcontrib-qthelp==1.0.3; python_version >= '3.5' 30 | sphinxcontrib-serializinghtml==1.1.5; python_version >= '3.5' 31 | typing-extensions==4.7.1; python_version >= '3.7' 32 | urllib3==2.0.7; python_version >= '3.7' 33 | -------------------------------------------------------------------------------- /js/.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | 9 | # Diagnostic reports (https://nodejs.org/api/report.html) 10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 11 | 12 | # Runtime data 13 | pids 14 | *.pid 15 | *.seed 16 | *.pid.lock 17 | 18 | # Directory for instrumented libs generated by jscoverage/JSCover 19 | lib-cov 20 | 21 | # Coverage directory used by tools like istanbul 22 | coverage 23 | *.lcov 24 | 25 | # nyc test coverage 26 | .nyc_output 27 | 28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 29 | .grunt 30 | 31 | # Bower dependency directory (https://bower.io/) 32 | bower_components 33 | 34 | # node-waf configuration 35 | .lock-wscript 36 | 37 | # Compiled binary addons (https://nodejs.org/api/addons.html) 38 | build/Release 39 | 40 | # Dependency directories 41 | node_modules/ 42 | jspm_packages/ 43 | 44 | # TypeScript v1 declaration files 45 | typings/ 46 | 47 | # TypeScript cache 48 | *.tsbuildinfo 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Optional REPL history 57 | 
.node_repl_history 58 | 59 | # Output of 'npm pack' 60 | *.tgz 61 | 62 | # Yarn Integrity file 63 | .yarn-integrity 64 | 65 | # dotenv environment variables file 66 | .env 67 | .env.test 68 | 69 | # parcel-bundler cache (https://parceljs.org/) 70 | .cache 71 | 72 | # next.js build output 73 | .next 74 | 75 | # nuxt.js build output 76 | .nuxt 77 | 78 | # vuepress build output 79 | .vuepress/dist 80 | 81 | # Serverless directories 82 | .serverless/ 83 | 84 | # FuseBox cache 85 | .fusebox/ 86 | 87 | # DynamoDB Local files 88 | .dynamodb/ 89 | 90 | # Jest test output 91 | test-results/ 92 | 93 | # rollup cache 94 | .rpt2_cache/ 95 | 96 | # Transpilation outputs 97 | dist/ 98 | -------------------------------------------------------------------------------- /js/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | npm run build 3 | 4 | deps: 5 | npm install --ignore-scripts 6 | 7 | check: all 8 | npm test 9 | 10 | clean: 11 | $(RM) -r dist 12 | 13 | .PHONY: all deps check clean 14 | -------------------------------------------------------------------------------- /js/README.md: -------------------------------------------------------------------------------- 1 | bistring 2 | ======== 3 | 4 | [![npm version](https://badge.fury.io/js/bistring.svg)](https://www.npmjs.com/package/bistring) 5 | 6 | The bistring library provides non-destructive versions of common string processing operations like normalization, case folding, and find/replace. 7 | Each bistring remembers the original string, and how its substrings map to substrings of the modified version. 
8 | 9 | For example: 10 | 11 | ```js 12 | import BiString from "bistring"; 13 | 14 | let s = new BiString("𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 🐶"); 15 | s = s.normalize("NFKD"); // Unicode normalization 16 | s = s.toLowerCase(); // Case-insensitivity 17 | s = s.replace("🦊", "fox"); // Replace emoji with text 18 | s = s.replace("🐶", "dog"); 19 | s = s.replace(/[^\w\s]+/g, ""); // Strip everything but letters and spaces 20 | s = s.substring(0, 19); // Extract a substring 21 | console.log(s.modified); // The modified substring, after changes 22 | // the quick brown fox 23 | console.log(s.original); // The original substring, before changes 24 | // 𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 25 | ``` 26 | 27 | This allows you to perform very aggressive text processing completely invisibly. 28 | 29 | 30 | Demo 31 | ---- 32 | 33 | [Click here](https://microsoft.github.io/bistring/demo.html) for a live demo of the bistring library in your browser. 34 | -------------------------------------------------------------------------------- /js/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "bistring", 3 | "version": "0.5.0", 4 | "description": "Bidirectionally transformed strings", 5 | "repository": { 6 | "type": "git", 7 | "url": "git+https://github.com/microsoft/bistring.git" 8 | }, 9 | "author": "msrmtle@microsoft.com", 10 | "license": "MIT", 11 | "bugs": { 12 | "url": "https://github.com/microsoft/bistring/issues" 13 | }, 14 | "homepage": "https://github.com/microsoft/bistring#readme", 15 | "main": "dist/index.js", 16 | "typings": "dist/index.d.ts", 17 | "module": "dist/index.mjs", 18 | "browser": "dist/index.browser.js", 19 | "files": [ 20 | "dist" 21 | ], 22 | "scripts": { 23 | "generate": "./scripts/generate_unicode.py", 24 | "prepare": "npm run build", 25 | "build": "rollup -c", 26 | "watch": "rollup -cw", 27 | "test": "jest" 28 | }, 29 | "devDependencies": { 30 | "@babel/preset-env": "^7.22.9", 31 | "@rollup/plugin-babel": 
"^6.0.3", 32 | "@rollup/plugin-commonjs": "^25.0.3", 33 | "@rollup/plugin-typescript": "^11.1.2", 34 | "@types/jest": "^29.5.3", 35 | "core-js": "^3.32.0", 36 | "jest": "^29.6.2", 37 | "jest-junit": "^16.0.0", 38 | "rollup": "^3.27.0", 39 | "ts-jest": "^29.1.1", 40 | "tslib": "^2.6.1", 41 | "typescript": "^5.1.6" 42 | }, 43 | "jest": { 44 | "reporters": [ 45 | "default", 46 | "jest-junit" 47 | ], 48 | "testRegex": ".*\\.(spec|test)\\.[jt]s$", 49 | "transform": { 50 | ".*\\.ts?$": "ts-jest" 51 | } 52 | }, 53 | "jest-junit": { 54 | "outputDirectory": "./test-results" 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /js/rollup.config.mjs: -------------------------------------------------------------------------------- 1 | import babel from "@rollup/plugin-babel"; 2 | import commonjs from "@rollup/plugin-commonjs"; 3 | import typescript from "@rollup/plugin-typescript"; 4 | import fs from "fs"; 5 | 6 | import pkg from "./package.json" assert { type: "json" }; 7 | 8 | export default [ 9 | { 10 | input: "src/index.ts", 11 | output: [ 12 | { 13 | file: pkg.main, 14 | format: "cjs", 15 | exports: "named", 16 | sourcemap: true, 17 | }, 18 | { 19 | file: pkg.module, 20 | format: "esm", 21 | exports: "named", 22 | sourcemap: true, 23 | }, 24 | ], 25 | external: [ 26 | ...Object.keys(pkg.dependencies || {}), 27 | ], 28 | plugins: [ 29 | typescript({ 30 | tsconfig: "./tsconfig.json", 31 | }), 32 | babel({ 33 | exclude: "node_modules/**", 34 | extensions: [".js", ".ts"], 35 | babelHelpers: "bundled", 36 | presets: [ 37 | [ 38 | "@babel/preset-env", 39 | { 40 | corejs: 3, 41 | targets: { 42 | node: "current", 43 | }, 44 | useBuiltIns: "usage", 45 | shippedProposals: true, 46 | }, 47 | ], 48 | ], 49 | }), 50 | commonjs(), 51 | ], 52 | }, 53 | { 54 | input: "src/index.ts", 55 | output: { 56 | file: pkg.browser, 57 | format: "umd", 58 | exports: "named", 59 | name: "bistring", 60 | sourcemap: true, 61 | }, 62 | external: [ 63 | 
...Object.keys(pkg.dependencies || {}), 64 | "regenerator-runtime/runtime", 65 | ], 66 | plugins: [ 67 | typescript({ 68 | tsconfig: "./tsconfig.json", 69 | }), 70 | babel({ 71 | exclude: "node_modules/**", 72 | extensions: [".js", ".ts"], 73 | babelHelpers: "bundled", 74 | presets: [ 75 | [ 76 | "@babel/preset-env", 77 | { 78 | corejs: 3, 79 | targets: { 80 | browsers: "> 2%, not dead", 81 | }, 82 | useBuiltIns: "usage", 83 | shippedProposals: true, 84 | }, 85 | ], 86 | ], 87 | }), 88 | commonjs(), 89 | ], 90 | }, 91 | ]; 92 | -------------------------------------------------------------------------------- /js/scripts/generate_unicode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright (c) Microsoft Corporation. All rights reserved. 4 | # Licensed under the MIT license. 5 | 6 | 7 | import icu 8 | from typing import IO 9 | 10 | 11 | def escape(cp: int) -> str: 12 | if cp < 0x10000: 13 | return f"\\u{cp:04x}" 14 | else: 15 | return f"\\u{{{cp:x}}}" 16 | 17 | 18 | def gen_boundary_regex(normalizer: icu.Normalizer2) -> str: 19 | ranges = [] 20 | for cp in range(0x110000): 21 | if not normalizer.hasBoundaryBefore(chr(cp)): 22 | if ranges and cp == ranges[-1].stop: 23 | ranges[-1] = range(ranges[-1].start, cp + 1) 24 | else: 25 | ranges.append(range(cp, cp + 1)) 26 | 27 | chunks = ['/.['] 28 | for r in ranges: 29 | chunks.append(escape(r.start)) 30 | if len(r) > 1: 31 | chunks.append('-') 32 | chunks.append(escape(r.stop - 1)) 33 | chunks.append(']*/gsu') 34 | 35 | return "".join(chunks) 36 | 37 | 38 | def export_boundary_regex(f: IO[str], form: str) -> None: 39 | f.write(f'/**\n') 40 | f.write(f' * Matches until the next {form} normalization boundary.\n') 41 | f.write(f' */\n') 42 | f.write(f'export const {form}_CHUNK = ') 43 | 44 | normalizer = getattr(icu.Normalizer2, "get" + form + "Instance")() 45 | f.write(gen_boundary_regex(normalizer)) 46 | f.write(';\n') 47 | 48 | 49 | if __name__ 
== "__main__": 50 | with open('src/unicode.ts', 'w') as f: 51 | f.write('/**\n') 52 | f.write(' * GENERATED BY scripts/generate_unicode.py.\n') 53 | f.write(' * DO NOT EDIT BY HAND.\n') 54 | f.write(' */\n\n') 55 | 56 | export_boundary_regex(f, "NFC") 57 | f.write('\n') 58 | export_boundary_regex(f, "NFD") 59 | f.write('\n') 60 | export_boundary_regex(f, "NFKC") 61 | f.write('\n') 62 | export_boundary_regex(f, "NFKD") 63 | -------------------------------------------------------------------------------- /js/src/bistring.ts: -------------------------------------------------------------------------------- 1 | /*! 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | 6 | import Alignment, { Bounds } from "./alignment"; 7 | import BiStringBuilder from "./builder"; 8 | import heuristicInfer from "./infer"; 9 | import { Replacer, normalizeReplacer, cloneRegExp } from "./regex"; 10 | import * as unicode from "./unicode"; 11 | 12 | export type AnyString = string | BiString; 13 | 14 | /** 15 | * A bidirectionally transformed string. 16 | */ 17 | export default class BiString implements Iterable { 18 | /** The original string, before any modifications. */ 19 | readonly original: string; 20 | /** The current value of the string, after all modifications. */ 21 | readonly modified: string; 22 | /** The sequence alignment between `original` and `modified`. */ 23 | readonly alignment: Alignment; 24 | /** The length of the modified string. */ 25 | readonly length: number; 26 | /** Indexes the code units of the modified string. */ 27 | readonly [i: number]: string; 28 | 29 | /** 30 | * A BiString can be constructed from only a single string, which will give it identical original and modified 31 | * strings and an identity alignment: 32 | * 33 | * .. code-block:: ts 34 | * 35 | * new BiString("test"); 36 | * 37 | * You can also explicitly specify both the original and modified strings. 
The inferred alignment will be as course 38 | * as possible: 39 | * 40 | * .. code-block:: ts 41 | * 42 | * new BiString("TEST", "test"); 43 | * 44 | * Finally, you can specify the alignment explicitly, if you know it: 45 | * 46 | * .. code-block:: ts 47 | * 48 | * new BiString("TEST", "test", Alignment.identity(4)); 49 | * 50 | * @param original 51 | * The original string, before any modifications. 52 | * @param modified 53 | * The modified string, after any modifications. 54 | * @param alignment 55 | * The alignment between the original and modified strings. 56 | */ 57 | constructor(original: string, modified?: string, alignment?: Alignment) { 58 | if (typeof(original) !== "string") { 59 | throw new TypeError("original was not a string"); 60 | } 61 | this.original = original; 62 | 63 | if (modified === undefined) { 64 | modified = original; 65 | if (alignment === undefined) { 66 | alignment = Alignment.identity(original.length); 67 | } 68 | } else if (typeof(modified) === "string") { 69 | if (alignment === undefined) { 70 | alignment = new Alignment([[0, 0], [original.length, modified.length]]); 71 | } 72 | } else { 73 | throw new TypeError("modified was not a string"); 74 | } 75 | this.modified = modified; 76 | 77 | if (!(alignment instanceof Alignment)) { 78 | throw new TypeError("alignment was not an Alignment"); 79 | } 80 | 81 | const [ostart, oend] = alignment.originalBounds(); 82 | if (ostart !== 0 || oend !== original.length) { 83 | throw new RangeError("Alignment incompatible with original string"); 84 | } 85 | 86 | const [mstart, mend] = alignment.modifiedBounds(); 87 | if (mstart !== 0 || mend !== modified.length) { 88 | throw new RangeError("Alignment incompatible with modified string"); 89 | } 90 | 91 | this.alignment = alignment; 92 | 93 | this.length = this.modified.length; 94 | 95 | for (let i = 0; i < this.length; ++i) { 96 | // @ts-ignore: https://github.com/microsoft/TypeScript/issues/6781 97 | this[i] = this.modified[i]; 98 | } 99 | 100 | 
Object.freeze(this); 101 | } 102 | 103 | /** 104 | * Create a `BiString` from a string-like object. 105 | * 106 | * @param str 107 | * Either a `string` or a `BiString`. 108 | * @returns 109 | * The input coerced to a `BiString`. 110 | */ 111 | static from(str: AnyString): BiString { 112 | if (str instanceof BiString) { 113 | return str; 114 | } else { 115 | return new BiString(str); 116 | } 117 | } 118 | 119 | /** 120 | * Create a `BiString`, automatically inferring an alignment between the `original` and `modified` strings. 121 | * 122 | * @param original 123 | * The original string. 124 | * @param modified 125 | * The modified string. 126 | */ 127 | static infer(original: string, modified: string): BiString { 128 | return heuristicInfer(original, modified); 129 | } 130 | 131 | /** 132 | * Iterates over the code points in the modified string. 133 | */ 134 | [Symbol.iterator](): IterableIterator { 135 | return this.modified[Symbol.iterator](); 136 | } 137 | 138 | /** 139 | * Like :js:meth:`String.prototype.charAt`, returns a code unit as a string from the modified string. 140 | */ 141 | charAt(pos: number): string { 142 | return this.modified.charAt(pos); 143 | } 144 | 145 | /** 146 | * Like :js:meth:`String.prototype.charCodeAt`, returns a code unit as a number from the modified string. 147 | */ 148 | charCodeAt(pos: number): number { 149 | return this.modified.charCodeAt(pos); 150 | } 151 | 152 | /** 153 | * Like :js:meth:`String.prototype.codePointAt`, returns a code point from the modified string. 154 | */ 155 | codePointAt(pos: number): number | undefined { 156 | return this.modified.codePointAt(pos); 157 | } 158 | 159 | /** 160 | * Extract a substring of this BiString, with similar semantics to :js:meth:`String.prototype.substring`. 
161 | */ 162 | substring(start: number, end?: number): BiString { 163 | if (end === undefined) { 164 | end = this.length; 165 | } 166 | if (start > end) { 167 | [start, end] = [end, start]; 168 | } 169 | start = Math.max(0, Math.min(start, this.length)); 170 | end = Math.max(0, Math.min(end, this.length)); 171 | return this.slice(start, end); 172 | } 173 | 174 | /** 175 | * Extract a slice of this BiString, with similar semantics to :js:meth:`String.prototype.slice`. 176 | */ 177 | slice(start: number, end?: number): BiString { 178 | if (end === undefined) { 179 | end = this.length; 180 | } 181 | if (start < 0) { 182 | start += this.length; 183 | } 184 | if (end < 0) { 185 | end += this.length; 186 | } 187 | if (end < start) { 188 | end = start; 189 | } 190 | start = Math.max(0, Math.min(start, this.length)); 191 | end = Math.max(0, Math.min(end, this.length)); 192 | 193 | const alignment = this.alignment.sliceByModified(start, end); 194 | const modified = this.modified.slice(...alignment.modifiedBounds()); 195 | const original = this.original.slice(...alignment.originalBounds()); 196 | const [o0, m0] = alignment.values[0]; 197 | return new BiString(original, modified, alignment.shift(-o0, -m0)); 198 | } 199 | 200 | /** 201 | * Concatenate this string together with one or more others. The additional strings can be either BiStrings or 202 | * normal strings. 203 | */ 204 | concat(...others: AnyString[]): BiString { 205 | let original = this.original; 206 | let modified = this.modified; 207 | let alignment = this.alignment; 208 | 209 | for (const other of others) { 210 | const biother = BiString.from(other); 211 | alignment = alignment.concat(biother.alignment.shift(original.length, modified.length)); 212 | original += biother.original; 213 | modified += biother.modified; 214 | } 215 | 216 | return new BiString(original, modified, alignment); 217 | } 218 | 219 | /** 220 | * @returns 221 | * The inverse of this string, swapping the original and modified strings. 
222 | */ 223 | inverse(): BiString { 224 | return new BiString(this.modified, this.original, this.alignment.inverse()); 225 | } 226 | 227 | /** 228 | * @returns 229 | * Whether this BiString is equal to another. 230 | */ 231 | equals(other: BiString): boolean { 232 | return this.original === other.original 233 | && this.modified === other.modified 234 | && this.alignment.equals(other.alignment); 235 | } 236 | 237 | /** 238 | * Like :js:meth:`String.prototype.indexOf`, finds the first occurrence of a substring. 239 | */ 240 | indexOf(searchValue: string, fromIndex?: number): number { 241 | return this.modified.indexOf(searchValue, fromIndex); 242 | } 243 | 244 | /** 245 | * Like :js:meth:`String.prototype.lastIndexOf`, finds the last occurrence of a substring. 246 | */ 247 | lastIndexOf(searchValue: string, fromIndex?: number): number { 248 | return this.modified.lastIndexOf(searchValue, fromIndex); 249 | } 250 | 251 | /** 252 | * Like :js:meth:`indexOf`, but returns both the start and end positions for convenience. 253 | */ 254 | boundsOf(searchValue: string, fromIndex?: number): Bounds { 255 | let start = this.indexOf(searchValue, fromIndex); 256 | if (start === -1) { 257 | return [-1, -1]; 258 | } else { 259 | return [start, start + searchValue.length]; 260 | } 261 | } 262 | 263 | /** 264 | * Like :js:meth:`lastIndexOf`, but returns both the start and end positions for convenience. 265 | */ 266 | lastBoundsOf(searchValue: string, fromIndex?: number): Bounds { 267 | let start = this.lastIndexOf(searchValue, fromIndex); 268 | if (start === -1) { 269 | return [-1, -1]; 270 | } else { 271 | return [start, start + searchValue.length]; 272 | } 273 | } 274 | 275 | /** 276 | * Like :js:meth:`String.prototype.search`, finds the position of the first match of a regular expression. 
277 | */ 278 | search(regexp: RegExp): number { 279 | return this.modified.search(regexp); 280 | } 281 | 282 | /** 283 | * Like :js:meth:`search`, but returns both the start and end positions for convenience. 284 | */ 285 | searchBounds(regexp: RegExp): Bounds { 286 | const match = regexp.exec(this.modified); 287 | if (match === null) { 288 | return [-1, -1]; 289 | } else { 290 | return [match.index, match.index + match[0].length]; 291 | } 292 | } 293 | 294 | /** 295 | * Like :js:meth:`String.prototype.match`, returns the result of a regular expression match. 296 | */ 297 | match(regexp: RegExp): RegExpMatchArray | null { 298 | return this.modified.match(regexp); 299 | } 300 | 301 | /** 302 | * Like :js:meth:`String.prototype.matchAll`, returns an iterator over all regular expression matches. 303 | */ 304 | matchAll(regexp: RegExp): IterableIterator { 305 | return this.modified.matchAll(regexp); 306 | } 307 | 308 | private _replaceString(pattern: string, replacement: string | Replacer): BiString { 309 | const replacer = normalizeReplacer(replacement); 310 | const builder = new BiStringBuilder(this); 311 | 312 | while (!builder.isComplete) { 313 | const next = this.indexOf(pattern, builder.position); 314 | if (next < 0) { 315 | break; 316 | } 317 | builder.skip(next - builder.position); 318 | 319 | const match = [this.modified.slice(next, next + pattern.length)] as RegExpMatchArray; 320 | match.index = next; 321 | match.input = this.modified; 322 | builder.replace(pattern.length, replacer(match)); 323 | } 324 | 325 | builder.skipRest(); 326 | return builder.build(); 327 | } 328 | 329 | private _replaceRegExp(pattern: RegExp, replacement: string | Replacer): BiString { 330 | const builder = new BiStringBuilder(this); 331 | builder.replaceAll(pattern, replacement); 332 | return builder.build(); 333 | } 334 | 335 | /** 336 | * Like :js:meth:`String.prototype.replace`, returns a new string with regex or fixed-string matches replaced. 
337 | */ 338 | replace(pattern: string | RegExp, replacement: string | Replacer): BiString { 339 | if (typeof(pattern) === "string") { 340 | return this._replaceString(pattern, replacement); 341 | } else { 342 | return this._replaceRegExp(pattern, replacement); 343 | } 344 | } 345 | 346 | /** 347 | * Like :js:meth:`String.prototype.trim`, returns a new string with leading and trailing whitespace removed. 348 | */ 349 | trim(): BiString { 350 | return this.replace(/^[\s\uFEFF\xA0]+|[\s\uFEFF\xA0]+$/g, ""); 351 | } 352 | 353 | /** 354 | * Like :js:meth:`String.prototype.trim`, returns a new string with leading whitespace removed. 355 | */ 356 | trimStart(): BiString { 357 | return this.replace(/^[\s\uFEFF\xA0]+/, ""); 358 | } 359 | 360 | /** 361 | * Like :js:meth:`String.prototype.trim`, returns a new string with trailing whitespace removed. 362 | */ 363 | trimEnd(): BiString { 364 | return this.replace(/[\s\uFEFF\xA0]+$/, ""); 365 | } 366 | 367 | /** 368 | * Like :js:meth:`String.prototype.padStart`, pads a string at the beginning to a target length. 369 | */ 370 | padStart(targetLength: number, padString: string = " "): BiString { 371 | const padLength = targetLength - this.length; 372 | if (padLength <= 0) { 373 | return this; 374 | } 375 | if (padString.length < padLength) { 376 | padString += padString.repeat(targetLength / padString.length); 377 | } 378 | padString = padString.slice(0, padLength); 379 | return new BiString("", padString).concat(this); 380 | } 381 | 382 | /** 383 | * Like :js:meth:`String.prototype.padEnd`, pads a string at the end to a target length. 
     */
    padEnd(targetLength: number, padString: string = " "): BiString {
        const padLength = targetLength - this.length;
        if (padLength <= 0) {
            return this;
        }
        // Repeat the padding until it's long enough, then trim it to fit exactly.
        if (padString.length < padLength) {
            padString += padString.repeat(targetLength / padString.length);
        }
        padString = padString.slice(0, padLength);
        return this.concat(new BiString("", padString));
    }

    /**
     * Like :js:meth:`String.prototype.startsWith`, returns whether this string starts with the given prefix.
     */
    startsWith(searchString: string, position?: number): boolean {
        return this.modified.startsWith(searchString, position);
    }

    /**
     * Like :js:meth:`String.prototype.endsWith`, returns whether this string ends with the given suffix.
     */
    endsWith(searchString: string, position?: number): boolean {
        return this.modified.endsWith(searchString, position);
    }

    /**
     * Fixed-string split.  An empty pattern splits between every character, like String.prototype.split("").
     */
    private _splitString(pattern: string, limit?: number): BiString[] {
        if (limit === undefined) {
            limit = Infinity;
        }

        const result = [];

        // i: start of the current chunk; j: start of the separator; k: end of the separator.
        for (let i = 0, j, k; i >= 0 && result.length < limit; i = k) {
            if (pattern.length === 0) {
                // Empty separator: advance one character at a time, stopping before the last one.
                if (i + 1 < this.length) {
                    j = k = i + 1;
                } else {
                    j = k = -1;
                }
            } else {
                [j, k] = this.boundsOf(pattern, i);
            }

            if (j >= 0) {
                result.push(this.slice(i, j));
            } else {
                // No more separators: the rest of the string is the final chunk.
                result.push(this.slice(i));
            }
        }

        return result;
    }

    /**
     * Regex split.  The pattern is cloned with /g (and without /y) so matchAll() finds every separator.
     */
    private _splitRegExp(pattern: RegExp, limit?: number): BiString[] {
        pattern = cloneRegExp(pattern, "g", "y");
        if (limit === undefined) {
            limit = Infinity;
        }

        const result = [];

        let last = 0;
        for (const match of this.matchAll(pattern)) {
            if (result.length >= limit) {
                break;
            }

            const start = match.index!;
            const end = start + match[0].length;

            result.push(this.slice(last, start));

            // String.prototype.split() will include any captured substrings in the result.  But we can't support that
            // easily, since JS regexes give us no information about the position of matched capture groups
            if (match.length > 1) {
                throw new Error("split() with capture groups is not supported");
            }

            last = end;
        }

        if (result.length < limit) {
            result.push(this.slice(last));
        }

        return result;
    }

    /**
     * Like :js:meth:`String.prototype.split`, splits this string into chunks using a separator.
     */
    split(separator?: string | RegExp, limit?: number): BiString[] {
        if (separator === undefined) {
            return [this];
        } else if (typeof(separator) === "string") {
            return this._splitString(separator, limit);
        } else {
            return this._splitRegExp(separator, limit);
        }
    }

    /**
     * Like :js:meth:`Array.prototype.join`, joins a sequence together with this `BiString` as the separator.
     */
    join(items: Iterable<AnyString>): BiString {
        let [first, ...rest] = items;
        if (first === undefined) {
            // Empty sequence: join to an empty string.
            return new BiString("");
        }

        first = BiString.from(first);
        // Interleave this separator between the remaining items.
        rest = rest.flatMap(s => [this, s]);
        return first.concat(...rest);
    }

    /**
     * Map a normalization form name to the generated regex matching one normalization chunk.
     */
    private static _normalFormRegex(form: string) {
        switch (form) {
            case "NFC":
                return unicode.NFC_CHUNK;
            case "NFD":
                return unicode.NFD_CHUNK;
            case "NFKC":
                return unicode.NFKC_CHUNK;
            case "NFKD":
                return unicode.NFKD_CHUNK;
            default:
                throw new RangeError(`Expected a normalization form (NFC, NFD, NFKC, NFKD); found ${form}`);
        }
    }

    /**
     * Like :js:meth:`String.prototype.normalize`, applies a Unicode normalization form.
     *
     * @param form
     *     The normalization form to apply, one of "NFC", "NFD", "NFKC", or "NFKD".
     */
    normalize(form: "NFC" | "NFD" | "NFKC" | "NFKD"): BiString {
        const regex = BiString._normalFormRegex(form);
        return this.replace(regex, m => {
            const result = m.normalize(form);
            if (result === m) {
                // Already normalized: keep a 1:1 alignment for this chunk.
                return new BiString(m);
            } else {
                // Changed by normalization: align the whole chunk to its normalized form.
                return new BiString(m, result);
            }
        });
    }

    /**
     * Whether the capital sigma at `index` is in "final" position (Unicode Final_Sigma condition),
     * i.e. preceded by a cased character (ignoring case-ignorable ones) and not followed by one.
     * JS regexes lack possessive quantifiers and reliable lookbehind, so the two contexts are
     * scanned manually, stepping over surrogate pairs.
     */
    private _isFinalSigmaAt(index: number): boolean {
        if (this[index] !== "Σ") {
            return false;
        }

        // Emulate negative lookahead: (?!\p{Case_Ignorable}*+\p{Cased})
        for (let i = index + 1; i < this.length; ++i) {
            const cp = this.codePointAt(i)!;
            const c = String.fromCodePoint(cp);
            if (/\P{Case_Ignorable}/uy.test(c)) {
                if (/\p{Cased}/uy.test(c)) {
                    return false;
                } else {
                    break;
                }
            }
            if (cp > 0xFFFF) {
                // Skip the low surrogate of an astral code point.
                ++i;
            }
        }

        // Emulate positive lookbehind: (?<=\p{Cased}\p{Case_Ignorable}*+)
        for (let i = index; i-- > 0;) {
            let cp = this.charCodeAt(i);
            // If we landed on a low surrogate preceded by a high surrogate, back up to the pair's start.
            if (i > 0 && (cp & 0xFC00) == 0xDC00 && (this.charCodeAt(i - 1) & 0xFC00) == 0xD800) {
                --i;
                cp = this.codePointAt(i)!;
            }
            const c = String.fromCodePoint(cp);
            if (/\P{Case_Ignorable}/uy.test(c)) {
                if (/\p{Cased}/uy.test(c)) {
                    return true;
                } else {
                    break;
                }
            }
        }

        return false;
    }

    /**
     * Like :js:meth:`String.prototype.toLowerCase`, converts a string to lowercase.
     */
    toLowerCase(): BiString {
        return this.replace(/\p{Changes_When_Lowercased}/gu, (m, ...args) => {
            // This is the only contextual but non-language-specific mapping in SpecialCasing.txt as of Unicode 12.1.
            // The replacer's trailing args are (offset, input), so args[args.length - 2] is the match offset.
            if (this._isFinalSigmaAt(args[args.length - 2])) {
                return "ς";
            } else {
                return m.toLowerCase();
            }
        });
    }

    /**
     * Like :js:meth:`String.prototype.toUpperCase`, converts a string to uppercase.
     */
    toUpperCase(): BiString {
        return this.replace(/\p{Changes_When_Uppercased}/gu, m => m.toUpperCase());
    }
}
--------------------------------------------------------------------------------
/js/src/builder.ts:
--------------------------------------------------------------------------------
/*!
 * Copyright (c) Microsoft Corporation. All rights reserved.
 * Licensed under the MIT license.
 */

import Alignment, { BiIndex } from "./alignment";
import BiString, { AnyString } from "./bistring";
import { Replacer, normalizeReplacer, cloneRegExp, isStatefulRegExp } from "./regex";

/**
 * Bidirectionally transformed string builder.
 *
 * A `BiStringBuilder` builds a transformed version of a source string iteratively.  Each builder has an immutable
 * original string, a current string, and the in-progress modified string, with alignments between each.  For example:
 *
 * .. code-block:: text
 *
 *     original: |The| |quick,| |brown| |🦊| |jumps| |over| |the| |lazy| |🐶|
 *     current:  |The| |quick,| |brown| |fox| |jumps| |over| |the| |lazy| |dog|
 *     modified: |the| |quick| |brown| ...
 *
 * The modified string is built in pieces by calling :js:meth:`replace` to change `n` characters of the current string
 * into new ones in the modified string.
 * Convenience methods like :js:meth:`skip`, :js:meth:`insert`, and
 * :js:meth:`discard` are implemented on top of this basic primitive.
 */
export default class BiStringBuilder {
    // The BiString being transformed; its .modified is this builder's "current" string.
    private _original: BiString;
    // Pieces of the modified string built so far (joined on demand by the getter).
    private _modified: string[];
    // Alignment being built from the current string to the modified string.
    private _alignment: BiIndex[];
    // Position reached so far in the current string.
    private _oPos: number;
    // Position reached so far in the modified string.
    private _mPos: number;

    /**
     * Construct a BiStringBuilder.
     *
     * @param original
     *     Either an original string or a BiString to start from.
     */
    constructor(original: AnyString) {
        this._original = BiString.from(original);
        this._modified = [];
        this._alignment = [[0, 0]];
        this._oPos = 0;
        this._mPos = 0;
    }

    /**
     * The original string being modified.
     */
    get original(): string {
        return this._original.original;
    }

    /**
     * The current string before modifications.
     */
    get current(): string {
        return this._original.modified;
    }

    /**
     * The modified string as built so far.
     */
    get modified(): string {
        return this._modified.join("");
    }

    /**
     * The alignment as built so far from `this.current` to `this.modified`.
     */
    get alignment(): Alignment {
        return new Alignment(this._alignment);
    }

    /**
     * The position of the builder in `this.current`.
     */
    get position(): number {
        return this._oPos;
    }

    /**
     * The number of characters of the current string left to process.
     */
    get remaining(): number {
        return this.current.length - this.position;
    }

    /**
     * Whether we've completely processed the string.  In other words, whether the modified string aligns with the end
     * of the current string.
     */
    get isComplete(): boolean {
        return this.remaining === 0;
    }

    /**
     * Peek at the next few characters.
     *
     * @param n
     *     The number of characters to peek at.
     */
    peek(n: number): string {
        return this.current.slice(this._oPos, this._oPos + n);
    }

    /**
     * Record that `oCount` characters of the current string correspond to `mCount` characters of the
     * modified string, extending the alignment.  A (0, 0) advance adds no alignment point.
     */
    private _advance(oCount: number, mCount: number) {
        this._oPos += oCount;
        this._mPos += mCount;
        if (oCount > 0 || mCount > 0) {
            this._alignment.push([this._oPos, this._mPos]);
        }
    }

    /**
     * Skip the next `n` characters, copying them unchanged.
     */
    skip(n: number) {
        if (n > 0) {
            this._modified.push(this.peek(n));
            // Align each copied character individually (1:1), for the finest possible alignment.
            for (let i = 0; i < n; ++i) {
                this._advance(1, 1);
            }
        }
    }

    /**
     * Skip the rest of the string, copying it unchanged.
     */
    skipRest() {
        this.skip(this.remaining);
    }

    /**
     * Insert a substring into the string.
     */
    insert(str: string) {
        this.replace(0, str);
    }

    /**
     * Discard a portion of the original string.
     */
    discard(n: number) {
        this.replace(n, "");
    }

    /**
     * Discard the rest of the original string.
     */
    discardRest() {
        this.discard(this.remaining);
    }

    /**
     * Replace the next `n` characters with a new string.  This is the basic primitive the other
     * mutators are built on.  Passing a BiString preserves its internal alignment.
     */
    replace(n: number, str: AnyString) {
        if (typeof(str) === "string") {
            if (str.length > 0) {
                this._modified.push(str);
            }
            // One coarse alignment point: n current characters -> str.length modified characters.
            this._advance(n, str.length);
        } else {
            if (str.original !== this.peek(n)) {
                throw new Error("BiString doesn't match the current string");
            }

            this._modified.push(str.modified);

            // Replay the BiString's own alignment so its fine-grained correspondences are preserved.
            const alignment = str.alignment.values;
            for (let i = 1; i < alignment.length; ++i) {
                const [o0, m0] = alignment[i - 1];
                const [o1, m1] = alignment[i];
                this._advance(o1 - o0, m1 - m0);
            }
        }
    }

    /**
     * Append a BiString.  The original value of the BiString must match the current string being processed.
     */
    append(bs: BiString) {
        this.replace(bs.original.length, bs);
    }

    /**
     * Find the next match of `pattern` at or after the current position.  Non-stateful patterns are
     * defensively cloned with /g so setting lastIndex is safe.
     */
    private _match(pattern: RegExp): RegExpExecArray | null {
        if (!isStatefulRegExp(pattern)) {
            pattern = cloneRegExp(pattern, "g");
        }
        pattern.lastIndex = this.position;
        return pattern.exec(this.current);
    }

    /**
     * Iterate over matches of `pattern` starting from the current position.  Global patterns yield
     * every match; non-global ones yield at most one (anchored here if sticky).
     */
    private * _matchAll(pattern: RegExp): IterableIterator<RegExpExecArray> {
        if (pattern.global) {
            pattern.lastIndex = this.position;
            let match;
            while ((match = pattern.exec(this.current))) {
                yield match;
            }
        } else {
            if (!pattern.sticky) {
                pattern = cloneRegExp(pattern, "g");
            }
            pattern.lastIndex = this.position;
            let match;
            if ((match = pattern.exec(this.current))) {
                yield match;
            }
        }
    }

    /**
     * Skip a substring matching a regex, copying it unchanged.
     *
     * @param pattern
     *     The pattern to match.  Must have either the sticky flag, forcing it to match at the current position, or
     *     the global flag, finding the next match.
     * @returns
     *     Whether a match was found.
     */
    skipMatch(pattern: RegExp): boolean {
        if (this._match(pattern)) {
            // The flag requirement above means pattern is stateful, so _match() used it directly
            // and its lastIndex now marks the end of the match.
            this.skip(pattern.lastIndex - this.position);
            return true;
        } else {
            return false;
        }
    }

    /**
     * Discard a substring that matches a regex.
     *
     * @param pattern
     *     The pattern to match.  Must have either the sticky flag, forcing it to match at the current position, or
     *     the global flag, finding the next match.
     * @returns
     *     Whether a match was found.
     */
    discardMatch(pattern: RegExp): boolean {
        const match = this._match(pattern);
        if (match) {
            // Copy everything up to the match unchanged, then drop the matched text.
            this.skip(match.index - this.position);
            this.discard(match[0].length);
            return true;
        } else {
            return false;
        }
    }

    /**
     * Replace a substring that matches a regex.
     *
     * @param pattern
     *     The pattern to match.  Must have either the sticky flag, forcing it to match at the current position, or
     *     the global flag, finding the next match.
     * @param replacement
     *     The replacement string or function, as in :js:meth:`String.prototype.replace`.
     * @returns
     *     Whether a match was found.
     */
    replaceMatch(pattern: RegExp, replacement: string | Replacer): boolean {
        const replacer = normalizeReplacer(replacement);
        const match = this._match(pattern);
        if (match) {
            this.skip(match.index - this.position);
            this.replace(match[0].length, replacer(match));
            return true;
        } else {
            return false;
        }
    }

    /**
     * Replace all occurrences of a regex, like :js:meth:`String.prototype.replace`.
     *
     * @param pattern
     *     The pattern to match.  The global flag (/g) must be set to get multiple matches.
     * @param replacement
     *     The replacement string or function, as in :js:meth:`String.prototype.replace`.
     */
    replaceAll(pattern: RegExp, replacement: string | Replacer) {
        const replacer = normalizeReplacer(replacement);

        for (const match of this._matchAll(pattern)) {
            this.skip(match.index - this.position);
            this.replace(match[0].length, replacer(match));
        }

        // Copy anything after the last match unchanged.
        this.skipRest();
    }

    /**
     * Build the :js:class:`BiString`.
     */
    build(): BiString {
        if (!this.isComplete) {
            throw new Error(`The string is not completely built yet (${this.remaining} characters remaining)`);
        }

        // Compose original->current with current->modified to get original->modified.
        const alignment = this._original.alignment.compose(this.alignment);
        return new BiString(this.original, this.modified, alignment);
    }

    /**
     * Reset this builder to apply another transformation.
     */
    rewind() {
        // The string built so far becomes the new baseline to transform.
        this._original = this.build();
        this._modified = [];
        this._alignment = [[0, 0]];
        this._oPos = 0;
        this._mPos = 0;
    }
}
--------------------------------------------------------------------------------
/js/src/index.ts:
--------------------------------------------------------------------------------
/*!
 * Copyright (c) Microsoft Corporation. All rights reserved.
 * Licensed under the MIT license.
 */

export { default as Alignment } from "./alignment";
export { default as BiString } from "./bistring";
export { default as BiStringBuilder } from "./builder";
export * from "./token";

export { default } from "./bistring";
--------------------------------------------------------------------------------
/js/src/infer.ts:
--------------------------------------------------------------------------------
import Alignment, { BiIndex } from "./alignment";
import BiString from "./bistring";

// The Unicode general categories, in the order they appear as capture groups in CATEGORY_REGEXP below.
// https://unicode.org/reports/tr44/#GC_Values_Table
const CATEGORIES = [
    "Lu", "Ll", "Lt", "Lm", "Lo",
    "Mn", "Mc", "Me",
    "Nd", "Nl", "No",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
    "Sm", "Sc", "Sk", "So",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Cs", "Co", "Cn",
];

// TODO: Babel doesn't polyfill this, and the source transformation doesn't catch it
// const CATEGORY_REGEXP = new RegExp(CATEGORIES.map(c => `(\\p{${c}})`).join("|"), "u");
const CATEGORY_REGEXP =
/(\p{Lu})|(\p{Ll})|(\p{Lt})|(\p{Lm})|(\p{Lo})|(\p{Mn})|(\p{Mc})|(\p{Me})|(\p{Nd})|(\p{Nl})|(\p{No})|(\p{Pc})|(\p{Pd})|(\p{Ps})|(\p{Pe})|(\p{Pi})|(\p{Pf})|(\p{Po})|(\p{Sm})|(\p{Sc})|(\p{Sk})|(\p{So})|(\p{Zs})|(\p{Zl})|(\p{Zp})|(\p{Cc})|(\p{Cf})|(\p{Cs})|(\p{Co})|(\p{Cn})/u; 18 | 19 | function category(cp: string): string { 20 | const match = cp.match(CATEGORY_REGEXP)!; 21 | const index = match.indexOf(cp, 1) - 1; 22 | return CATEGORIES[index]; 23 | } 24 | 25 | /** 26 | * A single glyph in a string, augmented with some extra information. 27 | */ 28 | class AugmentedGlyph { 29 | /** The original form of the glyph. */ 30 | readonly original: string; 31 | /** The Unicode compatibility normalized form of the glyph. */ 32 | readonly normalized: string; 33 | /** The uppercase form of the glyph. */ 34 | readonly upper: string; 35 | /** The root code point of the grapheme cluster. */ 36 | readonly root: string; 37 | /** The specific Unicode category of the glyph (Lu, Po, Zs, etc.). */ 38 | readonly category: string; 39 | /** The top-level Unicode category of the glyph (L, P, Z, etc.). 
*/ 40 | readonly topCategory: string; 41 | 42 | constructor(original: string, normalized: string, upper: string) { 43 | this.original = original; 44 | this.normalized = normalized; 45 | this.upper = upper; 46 | this.root = String.fromCodePoint(upper.codePointAt(0)!); 47 | this.category = category(this.root); 48 | this.topCategory = this.category[0]; 49 | } 50 | 51 | static costFn(a?: AugmentedGlyph, b?: AugmentedGlyph) { 52 | if (!a || !b) { 53 | // cost(insert) + cost(delete) (4 + 4) should be more than cost(substitute) (6) 54 | return 4; 55 | } 56 | 57 | let result = 0; 58 | result += +(a.original !== b.original); 59 | result += +(a.normalized !== b.normalized); 60 | result += +(a.upper !== b.upper); 61 | result += +(a.root !== b.root); 62 | result += +(a.category !== b.topCategory); 63 | result += +(a.topCategory !== b.topCategory); 64 | return result; 65 | } 66 | } 67 | 68 | /** 69 | * Not quite as good as UAX #29 grapheme clusters, but we're waiting on Intl.Segmenter. 70 | */ 71 | const GLYPH_REGEXP = /\P{M}\p{M}*|^\p{M}+/gu; 72 | 73 | /** 74 | * A string augmented with some extra information about each glyph. 75 | */ 76 | class AugmentedString { 77 | /** The original string. */ 78 | readonly original: string; 79 | /** The augmented glyphs of the string. */ 80 | readonly glyphs: readonly AugmentedGlyph[]; 81 | /** The alignment between the original string and the augmented glyphs. 
     */
    readonly alignment: Alignment;

    constructor(original: string) {
        // Normalize (NFKD) and uppercase the string, keeping the alignment of each step.
        const normalized = new BiString(original).normalize("NFKD");
        const upper = new BiString(normalized.modified).toUpperCase();

        const glyphs = [];
        const alignment: BiIndex[] = [[0, 0]];
        for (const match of upper.matchAll(GLYPH_REGEXP)) {
            // [o, m]: position reached so far in (normalized string, glyph index).
            const [o, m] = alignment[alignment.length - 1];

            const upperC = match[0];

            // Map the uppercased glyph back to its NFKD form...
            const normBounds = upper.alignment.originalBounds(o, o + upperC.length);
            const normC = upper.original.slice(...normBounds);

            // ...and from there back to the original text.
            const origBounds = normalized.alignment.originalBounds(normBounds);
            const origC = normalized.original.slice(...origBounds);

            glyphs.push(new AugmentedGlyph(origC, normC, upperC));

            alignment.push([o + normC.length, m + 1]);
        }
        this.original = original;
        this.glyphs = glyphs;
        // Compose original->normalized with normalized->glyphs to get original->glyphs.
        this.alignment = normalized.alignment.compose(new Alignment(alignment));
    }
}

/**
 * Infer the alignment between two strings with a "smart" heuristic.
 *
 * We use Unicode normalization and case mapping to minimize differences that are due to case, accents, ligatures, etc.
 */
export default function heuristicInfer(original: string, modified: string): BiString {
    const augOrig = new AugmentedString(original);
    const augMod = new AugmentedString(modified);

    // Align glyph-to-glyph, then translate both ends back to character positions.
    let alignment = Alignment.infer(augOrig.glyphs, augMod.glyphs, AugmentedGlyph.costFn);
    alignment = augOrig.alignment.compose(alignment);
    alignment = alignment.compose(augMod.alignment.inverse());

    return new BiString(original, modified, alignment);
}
--------------------------------------------------------------------------------
/js/src/regex.ts:
--------------------------------------------------------------------------------
/*!
 * Copyright (c) Microsoft Corporation. All rights reserved.
 * Licensed under the MIT license.
 */

import BiString, { AnyString } from "./bistring";

export type Replacer = (match: string, ...args: any[]) => string | BiString;
export type MatchReplacer = (match: RegExpMatchArray) => string | BiString;

/**
 * A replacement function that behaves the same as a fixed string supplied to :js:meth:`String.prototype.replace`,
 * expanding the $-sequences ($$, $&, $`, $', $1..$99) against the given match.
 */
function expandReplacement(replacement: string, match: RegExpMatchArray): string {
    const index = match.index!;
    const input = match.input!;

    let result = "";
    for (let i = 0; i < replacement.length; ++i) {
        const c = replacement[i];
        if (c === "$" && i + 1 < replacement.length) {
            let n = replacement[++i];
            switch (n) {
                case "$":
                    result += "$";
                    continue;
                case "&":
                    // The whole matched substring.
                    result += match[0];
                    continue;
                case "`":
                    // Everything before the match.
                    result += input.slice(0, index);
                    continue;
                case "'":
                    // Everything after the match.
                    result += input.slice(index + match[0].length);
                    continue;
            }

            // $1..$99: capture-group references, consuming up to two digits.
            if ("0123456789".includes(n)) {
                const n2 = replacement[i + 1];
                if ("0123456789".includes(n2)) {
                    n += n2;
                    ++i;
                }
                const index = parseInt(n, 10);
                if (index >= 1 && index < match.length) {
                    result += match[index];
                    continue;
                }
            }

            // Unrecognized $-sequences are copied through literally.
            result += c + n;
        } else {
            result += c;
        }
    }

    return result;
}

/**
 * Unify the second argument to :js:meth:`String.prototype.replace` into a replacement function with a nicer signature.
62 | */ 63 | export function normalizeReplacer(replacement: string | Replacer): MatchReplacer { 64 | if (typeof(replacement) === "string") { 65 | return match => expandReplacement(replacement, match); 66 | } else { 67 | const replacer: (...args: any[]) => AnyString = replacement; 68 | return match => replacer(...match, match.index, match.input); 69 | } 70 | } 71 | 72 | /** 73 | * Check if a regexp is stateful (can start from arbitrary offsets). 74 | */ 75 | export function isStatefulRegExp(regexp: RegExp) { 76 | return regexp.global || regexp.sticky; 77 | } 78 | 79 | /** 80 | * Make a defensive copy of a regular expression. 81 | */ 82 | export function cloneRegExp(regexp: RegExp, addFlags: string = "", removeFlags: string = "") { 83 | let flags = ""; 84 | for (const flag of regexp.flags) { 85 | if (!removeFlags.includes(flag)) { 86 | flags += flag; 87 | } 88 | } 89 | for (const flag of addFlags) { 90 | if (!flags.includes(flag)) { 91 | flags += flag; 92 | } 93 | } 94 | 95 | return new RegExp(regexp.source, flags); 96 | } 97 | -------------------------------------------------------------------------------- /js/src/token.ts: -------------------------------------------------------------------------------- 1 | /*! 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 4 | */ 5 | 6 | import Alignment, { BiIndex, Bounds } from "./alignment"; 7 | import BiString, { AnyString } from "./bistring"; 8 | import { cloneRegExp } from "./regex"; 9 | 10 | /** 11 | * A token extracted from a string. 12 | */ 13 | export class Token { 14 | /** The actual text of the token. */ 15 | readonly text: BiString; 16 | /** The start position of the token. */ 17 | readonly start: number; 18 | /** The end position of the token. */ 19 | readonly end: number; 20 | 21 | /** 22 | * Create a token. 23 | * 24 | * @param text 25 | * The text of this token. 26 | * @param start 27 | * The start position of the token. 
     * @param end
     *     The end position of the token.
     */
    constructor(text: AnyString, start: number, end: number) {
        this.text = BiString.from(text);
        this.start = start;
        this.end = end;
        // Tokens are immutable value objects.
        Object.freeze(this);
    }

    /**
     * Create a token from a slice of a string.
     *
     * @param text
     *     The text to slice.
     * @param start
     *     The start index of the token.
     * @param end
     *     The end index of the token.
     */
    static slice(text: AnyString, start: number, end: number): Token {
        return new Token(BiString.from(text).slice(start, end), start, end);
    }

    /**
     * The original value of the token.
     */
    get original(): string {
        return this.text.original;
    }

    /**
     * The modified value of the token.
     */
    get modified(): string {
        return this.text.modified;
    }
}

/**
 * A string and its tokenization.
 */
export class Tokenization {
    /** The text that was tokenized. */
    readonly text: BiString;
    /** The tokens extracted from the text. */
    readonly tokens: readonly Token[];
    /** The alignment between the text and the tokens. */
    readonly alignment: Alignment;
    /** The number of tokens. */
    readonly length: number;

    /**
     * Create a `Tokenization`.
     *
     * @param text
     *     The text from which the tokens have been extracted.
     * @param tokens
     *     The tokens extracted from the text.
     */
    constructor(text: AnyString, tokens: Iterable<Token>) {
        this.text = BiString.from(text);
        this.tokens = Object.freeze(Array.from(tokens));

        // Build the text<->token alignment: each token maps its [start, end) text span to
        // the token-index span [i, i + 1).
        const alignment: BiIndex[] = [[0, 0]];
        this.tokens.forEach((token, i) => {
            alignment.push([token.start, i]);
            alignment.push([token.end, i + 1]);
        });
        alignment.push([this.text.length, this.tokens.length]);
        this.alignment = new Alignment(alignment);

        this.length = this.tokens.length;

        Object.freeze(this);
    }

    /**
     * Infer a `Tokenization` from a sequence of tokens.
     *
     * Due to the possibility of ambiguity, it is much better to use a :js:class:`Tokenizer` or some other method of
     * producing :js:class:`Token`\ s with their positions explicitly set.
     *
     * @param text
     *     The text that was tokenized.
     * @param tokens
     *     The extracted tokens.
     * @returns
     *     The inferred tokenization, with token positions found by simple forward search.
     */
    static infer(text: AnyString, tokens: Iterable<string>) {
        text = BiString.from(text);

        const result = [];
        let start = 0, end;
        for (const token of tokens) {
            // Search forward from the end of the previous token.
            [start, end] = text.boundsOf(token, start);
            if (start < 0) {
                throw new Error(`Couldn't find the token "${token}" in the text`);
            }
            result.push(Token.slice(text, start, end));
            start = end;
        }

        return new Tokenization(text, result);
    }

    /**
     * Compute a slice of this tokenization.
     *
     * @param start
     *     The position to start from.
     * @param end
     *     The position to end at.
     * @returns
     *     The requested slice as a new `Tokenization`.
     */
    slice(start?: number, end?: number): Tokenization {
        return new Tokenization(this.text, this.tokens.slice(start, end));
    }

    /**
     * Map a span of tokens to the corresponding substring.
     */
    substring(start?: number, end?: number): BiString {
        const [first, last] = this.textBounds(start, end);
        return this.text.substring(first, last);
    }

    /**
     * Map a span of tokens to the bounds of the corresponding text.
     */
    textBounds(start?: number, end?: number): Bounds {
        // Default to the whole token range.
        if (start === undefined) {
            start = 0;
        }
        if (end === undefined) {
            end = this.length;
        }
        return this.alignment.originalBounds(start, end);
    }

    /**
     * Map a span of tokens to the bounds of the corresponding original text.
     */
    originalBounds(start?: number, end?: number): Bounds {
        return this.text.alignment.originalBounds(this.textBounds(start, end));
    }

    /**
     * Map a span of text to the bounds of the corresponding span of tokens.
     */
    boundsForText(start: number, end: number): Bounds {
        return this.alignment.modifiedBounds(start, end);
    }

    /**
     * Map a span of original text to the bounds of the corresponding span of tokens.
     */
    boundsForOriginal(start: number, end: number): Bounds {
        // original text -> (modified) text, then text -> tokens.
        const textBounds = this.text.alignment.modifiedBounds(start, end);
        return this.boundsForText(...textBounds);
    }

    /**
     * Map a span of text to the corresponding span of tokens.
     */
    sliceByText(start: number, end: number): Tokenization {
        return this.slice(...this.boundsForText(start, end));
    }

    /**
     * Map a span of original text to the corresponding span of tokens.
     */
    sliceByOriginal(start: number, end: number): Tokenization {
        return this.slice(...this.boundsForOriginal(start, end));
    }

    /**
     * Expand a span of text to align it with token boundaries.
     */
    snapTextBounds(start: number, end: number): Bounds {
        // Round-trip through token indices to widen the span to whole tokens.
        return this.textBounds(...this.boundsForText(start, end));
    }

    /**
     * Expand a span of original text to align it with token boundaries.
     */
    snapOriginalBounds(start: number, end: number): Bounds {
        return this.originalBounds(...this.boundsForOriginal(start, end));
    }
}

/**
 * A tokenizer that produces :js:class:`Tokenization`\ s.
 */
export interface Tokenizer {
    /**
     * Tokenize a string.
     *
     * @param text
     *     The text to tokenize, either a string or a :js:class:`BiString`.
     * @returns
     *     A :js:class:`Tokenization` holding the text and its tokens.
     */
    tokenize(text: AnyString): Tokenization;
}

/**
 * Breaks text into tokens based on a :js:class:`RegExp`.
 */
export class RegExpTokenizer implements Tokenizer {
    private readonly _pattern: RegExp;

    /**
     * Create a `RegExpTokenizer`.
     *
     * @param pattern
     *     The regex that will match tokens.
     */
    constructor(pattern: RegExp) {
        // Clone with /g so matchAll() can iterate every token.
        this._pattern = cloneRegExp(pattern, "g");
    }

    tokenize(text: AnyString): Tokenization {
        text = BiString.from(text);

        const tokens = [];
        for (const match of text.matchAll(this._pattern)) {
            const start = match.index!;
            const end = start + match[0].length;
            tokens.push(Token.slice(text, start, end));
        }

        return new Tokenization(text, tokens);
    }
}

/**
 * Splits text into tokens based on a :js:class:`RegExp`.
 */
export class SplittingTokenizer implements Tokenizer {
    private readonly _pattern: RegExp;

    /**
     * Create a `SplittingTokenizer`.
     *
     * @param pattern
     *     A regex that matches the regions between tokens.
277 | */ 278 | constructor(pattern: RegExp) { 279 | this._pattern = cloneRegExp(pattern, "g"); 280 | } 281 | 282 | tokenize(text: AnyString): Tokenization { 283 | text = BiString.from(text); 284 | 285 | const tokens = []; 286 | let last = 0; 287 | for (const match of text.matchAll(this._pattern)) { 288 | const start = match.index!; 289 | if (start > last) { 290 | tokens.push(Token.slice(text, last, start)); 291 | } 292 | last = start + match[0].length; 293 | } 294 | 295 | if (text.length > last) { 296 | tokens.push(Token.slice(text, last, text.length)); 297 | } 298 | 299 | return new Tokenization(text, tokens); 300 | } 301 | } 302 | -------------------------------------------------------------------------------- /js/src/unicode.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * GENERATED BY scripts/generate_unicode.py. 3 | * DO NOT EDIT BY HAND. 4 | */ 5 | 6 | /** 7 | * Matches until the next NFC normalization boundary. 8 | */ 9 | export const NFC_CHUNK = 
/.[\u0300-\u034e\u0350-\u036f\u0483-\u0487\u0591-\u05bd\u05bf\u05c1-\u05c2\u05c4-\u05c5\u05c7\u0610-\u061a\u064b-\u065f\u0670\u06d6-\u06dc\u06df-\u06e4\u06e7-\u06e8\u06ea-\u06ed\u0711\u0730-\u074a\u07eb-\u07f3\u07fd\u0816-\u0819\u081b-\u0823\u0825-\u0827\u0829-\u082d\u0859-\u085b\u08d3-\u08e1\u08e3-\u08ff\u093c\u094d\u0951-\u0954\u09bc\u09be\u09cd\u09d7\u09fe\u0a3c\u0a4d\u0abc\u0acd\u0b3c\u0b3e\u0b4d\u0b56-\u0b57\u0bbe\u0bcd\u0bd7\u0c4d\u0c55-\u0c56\u0cbc\u0cc2\u0ccd\u0cd5-\u0cd6\u0d3b-\u0d3c\u0d3e\u0d4d\u0d57\u0dca\u0dcf\u0ddf\u0e38-\u0e3a\u0e48-\u0e4b\u0eb8-\u0eba\u0ec8-\u0ecb\u0f18-\u0f19\u0f35\u0f37\u0f39\u0f71-\u0f75\u0f7a-\u0f7d\u0f80-\u0f84\u0f86-\u0f87\u0fc6\u102e\u1037\u1039-\u103a\u108d\u1161-\u1175\u11a8-\u11c2\u135d-\u135f\u1714\u1734\u17d2\u17dd\u18a9\u1939-\u193b\u1a17-\u1a18\u1a60\u1a75-\u1a7c\u1a7f\u1ab0-\u1abd\u1b34-\u1b35\u1b44\u1b6b-\u1b73\u1baa-\u1bab\u1be6\u1bf2-\u1bf3\u1c37\u1cd0-\u1cd2\u1cd4-\u1ce0\u1ce2-\u1ce8\u1ced\u1cf4\u1cf8-\u1cf9\u1dc0-\u1df9\u1dfb-\u1dff\u20d0-\u20dc\u20e1\u20e5-\u20f0\u2cef-\u2cf1\u2d7f\u2de0-\u2dff\u302a-\u302f\u3099-\u309a\ua66f\ua674-\ua67d\ua69e-\ua69f\ua6f0-\ua6f1\ua806\ua8c4\ua8e0-\ua8f1\ua92b-\ua92d\ua953\ua9b3\ua9c0\uaab0\uaab2-\uaab4\uaab7-\uaab8\uaabe-\uaabf\uaac1\uaaf6\uabed\ufb1e\ufe20-\ufe2f\u{101fd}\u{102e0}\u{10376}-\u{1037a}\u{10a0d}\u{10a0f}\u{10a38}-\u{10a3a}\u{10a3f}\u{10ae5}-\u{10ae6}\u{10d24}-\u{10d27}\u{10f46}-\u{10f50}\u{11046}\u{1107f}\u{110b9}-\u{110ba}\u{11100}-\u{11102}\u{11127}\u{11133}-\u{11134}\u{11173}\u{111c0}\u{111ca}\u{11235}-\u{11236}\u{112e9}-\u{112ea}\u{1133b}-\u{1133c}\u{1133e}\u{1134d}\u{11357}\u{11366}-\u{1136c}\u{11370}-\u{11374}\u{11442}\u{11446}\u{1145e}\u{114b0}\u{114ba}\u{114bd}\u{114c2}-\u{114c3}\u{115af}\u{115bf}-\u{115c0}\u{1163f}\u{116b6}-\u{116b7}\u{1172b}\u{11839}-\u{1183a}\u{119e0}\u{11a34}\u{11a47}\u{11a99}\u{11c3f}\u{11d42}\u{11d44}-\u{11d45}\u{11d97}\u{16af0}-\u{16af4}\u{16b30}-\u{16b36}\u{1bc9e}\u{1d165}-\u{1d169}\u{1d16d}-\u{1d172}\u{1d17b}-\u{1d182}\u{1d185}-\u{
1d18b}\u{1d1aa}-\u{1d1ad}\u{1d242}-\u{1d244}\u{1e000}-\u{1e006}\u{1e008}-\u{1e018}\u{1e01b}-\u{1e021}\u{1e023}-\u{1e024}\u{1e026}-\u{1e02a}\u{1e130}-\u{1e136}\u{1e2ec}-\u{1e2ef}\u{1e8d0}-\u{1e8d6}\u{1e944}-\u{1e94a}]*/gsu; 10 | 11 | /** 12 | * Matches until the next NFD normalization boundary. 13 | */ 14 | export const NFD_CHUNK = /.[\u0300-\u034e\u0350-\u036f\u0483-\u0487\u0591-\u05bd\u05bf\u05c1-\u05c2\u05c4-\u05c5\u05c7\u0610-\u061a\u064b-\u065f\u0670\u06d6-\u06dc\u06df-\u06e4\u06e7-\u06e8\u06ea-\u06ed\u0711\u0730-\u074a\u07eb-\u07f3\u07fd\u0816-\u0819\u081b-\u0823\u0825-\u0827\u0829-\u082d\u0859-\u085b\u08d3-\u08e1\u08e3-\u08ff\u093c\u094d\u0951-\u0954\u09bc\u09cd\u09fe\u0a3c\u0a4d\u0abc\u0acd\u0b3c\u0b4d\u0bcd\u0c4d\u0c55-\u0c56\u0cbc\u0ccd\u0d3b-\u0d3c\u0d4d\u0dca\u0e38-\u0e3a\u0e48-\u0e4b\u0eb8-\u0eba\u0ec8-\u0ecb\u0f18-\u0f19\u0f35\u0f37\u0f39\u0f71-\u0f75\u0f7a-\u0f7d\u0f80-\u0f84\u0f86-\u0f87\u0fc6\u1037\u1039-\u103a\u108d\u135d-\u135f\u1714\u1734\u17d2\u17dd\u18a9\u1939-\u193b\u1a17-\u1a18\u1a60\u1a75-\u1a7c\u1a7f\u1ab0-\u1abd\u1b34\u1b44\u1b6b-\u1b73\u1baa-\u1bab\u1be6\u1bf2-\u1bf3\u1c37\u1cd0-\u1cd2\u1cd4-\u1ce0\u1ce2-\u1ce8\u1ced\u1cf4\u1cf8-\u1cf9\u1dc0-\u1df9\u1dfb-\u1dff\u20d0-\u20dc\u20e1\u20e5-\u20f0\u2cef-\u2cf1\u2d7f\u2de0-\u2dff\u302a-\u302f\u3099-\u309a\ua66f\ua674-\ua67d\ua69e-\ua69f\ua6f0-\ua6f1\ua806\ua8c4\ua8e0-\ua8f1\ua92b-\ua92d\ua953\ua9b3\ua9c0\uaab0\uaab2-\uaab4\uaab7-\uaab8\uaabe-\uaabf\uaac1\uaaf6\uabed\ufb1e\ufe20-\ufe2f\u{101fd}\u{102e0}\u{10376}-\u{1037a}\u{10a0d}\u{10a0f}\u{10a38}-\u{10a3a}\u{10a3f}\u{10ae5}-\u{10ae6}\u{10d24}-\u{10d27}\u{10f46}-\u{10f50}\u{11046}\u{1107f}\u{110b9}-\u{110ba}\u{11100}-\u{11102}\u{11133}-\u{11134}\u{11173}\u{111c0}\u{111ca}\u{11235}-\u{11236}\u{112e9}-\u{112ea}\u{1133b}-\u{1133c}\u{1134d}\u{11366}-\u{1136c}\u{11370}-\u{11374}\u{11442}\u{11446}\u{1145e}\u{114c2}-\u{114c3}\u{115bf}-\u{115c0}\u{1163f}\u{116b6}-\u{116b7}\u{1172b}\u{11839}-\u{1183a}\u{119e0}\u{11a34}\u{11a47}\u{11a99}\u{11c3f}\u{11d42}
\u{11d44}-\u{11d45}\u{11d97}\u{16af0}-\u{16af4}\u{16b30}-\u{16b36}\u{1bc9e}\u{1d165}-\u{1d169}\u{1d16d}-\u{1d172}\u{1d17b}-\u{1d182}\u{1d185}-\u{1d18b}\u{1d1aa}-\u{1d1ad}\u{1d242}-\u{1d244}\u{1e000}-\u{1e006}\u{1e008}-\u{1e018}\u{1e01b}-\u{1e021}\u{1e023}-\u{1e024}\u{1e026}-\u{1e02a}\u{1e130}-\u{1e136}\u{1e2ec}-\u{1e2ef}\u{1e8d0}-\u{1e8d6}\u{1e944}-\u{1e94a}]*/gsu; 15 | 16 | /** 17 | * Matches until the next NFKC normalization boundary. 18 | */ 19 | export const NFKC_CHUNK = /.[\u0300-\u034e\u0350-\u036f\u0483-\u0487\u0591-\u05bd\u05bf\u05c1-\u05c2\u05c4-\u05c5\u05c7\u0610-\u061a\u064b-\u065f\u0670\u06d6-\u06dc\u06df-\u06e4\u06e7-\u06e8\u06ea-\u06ed\u0711\u0730-\u074a\u07eb-\u07f3\u07fd\u0816-\u0819\u081b-\u0823\u0825-\u0827\u0829-\u082d\u0859-\u085b\u08d3-\u08e1\u08e3-\u08ff\u093c\u094d\u0951-\u0954\u09bc\u09be\u09cd\u09d7\u09fe\u0a3c\u0a4d\u0abc\u0acd\u0b3c\u0b3e\u0b4d\u0b56-\u0b57\u0bbe\u0bcd\u0bd7\u0c4d\u0c55-\u0c56\u0cbc\u0cc2\u0ccd\u0cd5-\u0cd6\u0d3b-\u0d3c\u0d3e\u0d4d\u0d57\u0dca\u0dcf\u0ddf\u0e38-\u0e3a\u0e48-\u0e4b\u0eb8-\u0eba\u0ec8-\u0ecb\u0f18-\u0f19\u0f35\u0f37\u0f39\u0f71-\u0f75\u0f7a-\u0f7d\u0f80-\u0f84\u0f86-\u0f87\u0fc6\u102e\u1037\u1039-\u103a\u108d\u1161-\u1175\u11a8-\u11c2\u135d-\u135f\u1714\u1734\u17d2\u17dd\u18a9\u1939-\u193b\u1a17-\u1a18\u1a60\u1a75-\u1a7c\u1a7f\u1ab0-\u1abd\u1b34-\u1b35\u1b44\u1b6b-\u1b73\u1baa-\u1bab\u1be6\u1bf2-\u1bf3\u1c37\u1cd0-\u1cd2\u1cd4-\u1ce0\u1ce2-\u1ce8\u1ced\u1cf4\u1cf8-\u1cf9\u1dc0-\u1df9\u1dfb-\u1dff\u20d0-\u20dc\u20e1\u20e5-\u20f0\u2cef-\u2cf1\u2d7f\u2de0-\u2dff\u302a-\u302f\u3099-\u309a\u3133\u3135-\u3136\u313a-\u313f\u314f-\u3163\ua66f\ua674-\ua67d\ua69e-\ua69f\ua6f0-\ua6f1\ua806\ua8c4\ua8e0-\ua8f1\ua92b-\ua92d\ua953\ua9b3\ua9c0\uaab0\uaab2-\uaab4\uaab7-\uaab8\uaabe-\uaabf\uaac1\uaaf6\uabed\ufb1e\ufe20-\ufe2f\uff9e-\uff9f\uffa3\uffa5-\uffa6\uffaa-\uffaf\uffc2-\uffc7\uffca-\uffcf\uffd2-\uffd7\uffda-\uffdc\u{101fd}\u{102e0}\u{10376}-\u{1037a}\u{10a0d}\u{10a0f}\u{10a38}-\u{10a3a}\u{10a3f}\u{10ae5}-\u{10ae6}\u{10
d24}-\u{10d27}\u{10f46}-\u{10f50}\u{11046}\u{1107f}\u{110b9}-\u{110ba}\u{11100}-\u{11102}\u{11127}\u{11133}-\u{11134}\u{11173}\u{111c0}\u{111ca}\u{11235}-\u{11236}\u{112e9}-\u{112ea}\u{1133b}-\u{1133c}\u{1133e}\u{1134d}\u{11357}\u{11366}-\u{1136c}\u{11370}-\u{11374}\u{11442}\u{11446}\u{1145e}\u{114b0}\u{114ba}\u{114bd}\u{114c2}-\u{114c3}\u{115af}\u{115bf}-\u{115c0}\u{1163f}\u{116b6}-\u{116b7}\u{1172b}\u{11839}-\u{1183a}\u{119e0}\u{11a34}\u{11a47}\u{11a99}\u{11c3f}\u{11d42}\u{11d44}-\u{11d45}\u{11d97}\u{16af0}-\u{16af4}\u{16b30}-\u{16b36}\u{1bc9e}\u{1d165}-\u{1d169}\u{1d16d}-\u{1d172}\u{1d17b}-\u{1d182}\u{1d185}-\u{1d18b}\u{1d1aa}-\u{1d1ad}\u{1d242}-\u{1d244}\u{1e000}-\u{1e006}\u{1e008}-\u{1e018}\u{1e01b}-\u{1e021}\u{1e023}-\u{1e024}\u{1e026}-\u{1e02a}\u{1e130}-\u{1e136}\u{1e2ec}-\u{1e2ef}\u{1e8d0}-\u{1e8d6}\u{1e944}-\u{1e94a}]*/gsu; 20 | 21 | /** 22 | * Matches until the next NFKD normalization boundary. 23 | */ 24 | export const NFKD_CHUNK = /.[\u0300-\u034e\u0350-\u036f\u0483-\u0487\u0591-\u05bd\u05bf\u05c1-\u05c2\u05c4-\u05c5\u05c7\u0610-\u061a\u064b-\u065f\u0670\u06d6-\u06dc\u06df-\u06e4\u06e7-\u06e8\u06ea-\u06ed\u0711\u0730-\u074a\u07eb-\u07f3\u07fd\u0816-\u0819\u081b-\u0823\u0825-\u0827\u0829-\u082d\u0859-\u085b\u08d3-\u08e1\u08e3-\u08ff\u093c\u094d\u0951-\u0954\u09bc\u09cd\u09fe\u0a3c\u0a4d\u0abc\u0acd\u0b3c\u0b4d\u0bcd\u0c4d\u0c55-\u0c56\u0cbc\u0ccd\u0d3b-\u0d3c\u0d4d\u0dca\u0e38-\u0e3a\u0e48-\u0e4b\u0eb8-\u0eba\u0ec8-\u0ecb\u0f18-\u0f19\u0f35\u0f37\u0f39\u0f71-\u0f75\u0f7a-\u0f7d\u0f80-\u0f84\u0f86-\u0f87\u0fc6\u1037\u1039-\u103a\u108d\u135d-\u135f\u1714\u1734\u17d2\u17dd\u18a9\u1939-\u193b\u1a17-\u1a18\u1a60\u1a75-\u1a7c\u1a7f\u1ab0-\u1abd\u1b34\u1b44\u1b6b-\u1b73\u1baa-\u1bab\u1be6\u1bf2-\u1bf3\u1c37\u1cd0-\u1cd2\u1cd4-\u1ce0\u1ce2-\u1ce8\u1ced\u1cf4\u1cf8-\u1cf9\u1dc0-\u1df9\u1dfb-\u1dff\u20d0-\u20dc\u20e1\u20e5-\u20f0\u2cef-\u2cf1\u2d7f\u2de0-\u2dff\u302a-\u302f\u3099-\u309a\ua66f\ua674-\ua67d\ua69e-\ua69f\ua6f0-\ua6f1\ua806\ua8c4\ua8e0-\ua8f1\ua92b-\ua
92d\ua953\ua9b3\ua9c0\uaab0\uaab2-\uaab4\uaab7-\uaab8\uaabe-\uaabf\uaac1\uaaf6\uabed\ufb1e\ufe20-\ufe2f\uff9e-\uff9f\u{101fd}\u{102e0}\u{10376}-\u{1037a}\u{10a0d}\u{10a0f}\u{10a38}-\u{10a3a}\u{10a3f}\u{10ae5}-\u{10ae6}\u{10d24}-\u{10d27}\u{10f46}-\u{10f50}\u{11046}\u{1107f}\u{110b9}-\u{110ba}\u{11100}-\u{11102}\u{11133}-\u{11134}\u{11173}\u{111c0}\u{111ca}\u{11235}-\u{11236}\u{112e9}-\u{112ea}\u{1133b}-\u{1133c}\u{1134d}\u{11366}-\u{1136c}\u{11370}-\u{11374}\u{11442}\u{11446}\u{1145e}\u{114c2}-\u{114c3}\u{115bf}-\u{115c0}\u{1163f}\u{116b6}-\u{116b7}\u{1172b}\u{11839}-\u{1183a}\u{119e0}\u{11a34}\u{11a47}\u{11a99}\u{11c3f}\u{11d42}\u{11d44}-\u{11d45}\u{11d97}\u{16af0}-\u{16af4}\u{16b30}-\u{16b36}\u{1bc9e}\u{1d165}-\u{1d169}\u{1d16d}-\u{1d172}\u{1d17b}-\u{1d182}\u{1d185}-\u{1d18b}\u{1d1aa}-\u{1d1ad}\u{1d242}-\u{1d244}\u{1e000}-\u{1e006}\u{1e008}-\u{1e018}\u{1e01b}-\u{1e021}\u{1e023}-\u{1e024}\u{1e026}-\u{1e02a}\u{1e130}-\u{1e136}\u{1e2ec}-\u{1e2ef}\u{1e8d0}-\u{1e8d6}\u{1e944}-\u{1e94a}]*/gsu; 25 | -------------------------------------------------------------------------------- /js/tests/alignment.test.ts: -------------------------------------------------------------------------------- 1 | /*! 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 
4 | */ 5 | 6 | import { Alignment } from ".."; 7 | 8 | test("Empty Alignment", () => { 9 | expect(() => new Alignment([])).toThrow(); 10 | 11 | const a = Alignment.identity(0); 12 | expect(a.values).toEqual([[0, 0]]); 13 | 14 | expect(a.originalBounds()).toEqual([0, 0]); 15 | expect(a.modifiedBounds()).toEqual([0, 0]); 16 | 17 | expect(a.originalBounds(0, 0)).toEqual([0, 0]); 18 | expect(a.modifiedBounds(0, 0)).toEqual([0, 0]); 19 | }); 20 | 21 | test("Alignment.identity()", () => { 22 | const a = Alignment.identity(1, 16); 23 | 24 | const values = []; 25 | for (let i = 1; i <= 16; ++i) { 26 | values.push([i, i]); 27 | } 28 | expect(a.values).toEqual(values); 29 | 30 | expect(a.originalBounds()).toEqual([1, 16]); 31 | expect(a.modifiedBounds()).toEqual([1, 16]); 32 | 33 | expect(a.originalBounds(4, 7)).toEqual([4, 7]); 34 | expect(a.modifiedBounds(4, 7)).toEqual([4, 7]); 35 | }); 36 | 37 | test("Alignment", () => { 38 | const a = new Alignment([[0, 0], [1, 2], [2, 4], [3, 6]]); 39 | 40 | expect(a.originalBounds()).toEqual([0, 3]); 41 | expect(a.modifiedBounds()).toEqual([0, 6]); 42 | 43 | expect(a.originalBounds(0, 0)).toEqual([0, 0]); 44 | expect(a.originalBounds(0, 1)).toEqual([0, 1]); 45 | expect(a.originalBounds(0, 2)).toEqual([0, 1]); 46 | expect(a.originalBounds(0, 3)).toEqual([0, 2]); 47 | expect(a.originalBounds(1, 1)).toEqual([0, 1]); 48 | expect(a.originalBounds(1, 3)).toEqual([0, 2]); 49 | expect(a.originalBounds(1, 4)).toEqual([0, 2]); 50 | expect(a.originalBounds(2, 2)).toEqual([1, 1]); 51 | expect(a.originalBounds(2, 4)).toEqual([1, 2]); 52 | expect(a.originalBounds(2, 5)).toEqual([1, 3]); 53 | expect(a.originalBounds(3, 3)).toEqual([1, 2]); 54 | 55 | expect(a.modifiedBounds(0, 0)).toEqual([0, 0]); 56 | expect(a.modifiedBounds(0, 1)).toEqual([0, 2]); 57 | expect(a.modifiedBounds(0, 2)).toEqual([0, 4]); 58 | expect(a.modifiedBounds(0, 3)).toEqual([0, 6]); 59 | expect(a.modifiedBounds(1, 1)).toEqual([2, 2]); 60 | expect(a.modifiedBounds(2, 
2)).toEqual([4, 4]); 61 | }); 62 | 63 | test("Alignment canonicalization", () => { 64 | let a = new Alignment([[0, 0], [1, 2], [1, 2], [2, 4]]); 65 | expect(a.values).toEqual([[0, 0], [1, 2], [2, 4]]); 66 | 67 | a = new Alignment([[0, 0], [1, 2]]) 68 | .concat(new Alignment([[1, 2], [2, 4]])); 69 | expect(a.values).toEqual([[0, 0], [1, 2], [2, 4]]); 70 | }); 71 | 72 | function test_composition(first: Alignment, second: Alignment) { 73 | const composed = first.compose(second); 74 | 75 | const [of, ol] = composed.originalBounds(); 76 | const [mf, ml] = composed.modifiedBounds(); 77 | 78 | expect([of, ol]).toEqual(first.originalBounds()); 79 | expect([mf, ml]).toEqual(second.modifiedBounds()); 80 | 81 | for (let i = of; i <= ol; ++i) { 82 | for (let j = i; j <= ol; ++j) { 83 | expect(composed.modifiedBounds(i, j)) 84 | .toEqual(second.modifiedBounds(first.modifiedBounds(i, j))); 85 | } 86 | } 87 | 88 | for (let i = mf; i <= ml; ++i) { 89 | for (let j = i; j <= ml; ++j) { 90 | expect(composed.originalBounds(i, j)) 91 | .toEqual(first.originalBounds(second.originalBounds(i, j))); 92 | } 93 | } 94 | } 95 | 96 | test("Alignment.compose", () => { 97 | const first = new Alignment([ 98 | [0, 0], 99 | [1, 2], 100 | [2, 4], 101 | [3, 6], 102 | ]); 103 | const second = new Alignment([ 104 | [0, 0], 105 | [1, 2], 106 | [2, 4], 107 | [3, 6], 108 | [4, 8], 109 | [5, 10], 110 | [6, 11], 111 | ]); 112 | test_composition(first, second); 113 | }); 114 | 115 | function test_identity_composition(alignment: Alignment) { 116 | test_composition(alignment, Alignment.identity(alignment.modifiedBounds())); 117 | test_composition(Alignment.identity(alignment.originalBounds()), alignment); 118 | } 119 | 120 | test("Alignment.compose(Alignment.identity)", () => { 121 | const a = new Alignment([ 122 | [0, 2], 123 | [2, 2], 124 | [4, 4], 125 | [6, 6], 126 | [8, 6], 127 | ]); 128 | 129 | // Modified sequence is smaller 130 | test_identity_composition(a); 131 | 132 | // Original sequence is smaller 
133 | test_identity_composition(a.inverse()); 134 | }); 135 | 136 | test("Alignment.infer", () => { 137 | let a = Alignment.infer("test", "test"); 138 | let b = Alignment.identity(4); 139 | expect(a.equals(b)).toBe(true); 140 | 141 | a = Alignment.infer("asdf", "jkl;"); 142 | expect(a.equals(b)).toBe(true); 143 | 144 | a = Alignment.infer("color", "colour"); 145 | b = new Alignment([ 146 | [0, 0], 147 | [1, 1], 148 | [2, 2], 149 | [3, 3], 150 | [4, 4], 151 | [4, 5], 152 | [5, 6], 153 | ]); 154 | expect(a.equals(b)).toBe(true); 155 | 156 | a = Alignment.infer("ab---", "ab"); 157 | b = new Alignment([ 158 | [0, 0], 159 | [1, 1], 160 | [2, 2], 161 | [3, 2], 162 | [4, 2], 163 | [5, 2], 164 | ]); 165 | expect(a.equals(b)).toBe(true); 166 | }); 167 | -------------------------------------------------------------------------------- /js/tests/bistring.test.ts: -------------------------------------------------------------------------------- 1 | /*! 2 | * Copyright (c) Microsoft Corporation. All rights reserved. 3 | * Licensed under the MIT license. 
 */

import { BiString, Alignment } from "..";

test("new BiString", () => {
    // Non-string arguments are rejected with TypeError.
    expect(() => new BiString(42 as any)).toThrow(TypeError);
    expect(() => new BiString("fourty-two", 42 as any)).toThrow(TypeError);
    expect(() => new BiString("fourty-two", "42", 42 as any)).toThrow(TypeError);

    // Alignments that don't span the full strings are rejected with RangeError.
    expect(() => new BiString("fourty-two", "42", new Alignment([
        [0, 0],
        [9, 2],
    ])))
        .toThrow(RangeError);

    expect(() => new BiString("fourty-two", "42", new Alignment([
        [0, 0],
        [10, 1],
    ])))
        .toThrow(RangeError);

    // Valid constructions: modified-only, original+modified, and explicit alignment.
    new BiString("42");
    new BiString("fourty-two", "42");
    new BiString("fourty-two", "42", new Alignment([
        [0, 0],
        [6, 1],
        [7, 1],
        [10, 2],
    ]));
});

test("BiString.infer", () => {
    let bs = BiString.infer("test", "test");
    expect(bs.equals(new BiString("test"))).toBe(true);

    bs = BiString.infer("color", "colour");
    expect(bs.substring(3, 5).original).toBe("o");
    expect(bs.inverse().equals(BiString.infer("colour", "color"))).toBe(true);

    // infer() should align squared-letter emoji with their ASCII equivalents.
    bs = BiString.infer(
        "🅃🄷🄴 🅀🅄🄸🄲🄺, 🄱🅁🄾🅆🄽 🦊 🄹🅄🄼🄿🅂 🄾🅅🄴🅁 🅃🄷🄴 🄻🄰🅉🅈 🐶",
        "the quick brown fox jumps over the lazy dog",
    );
    expect(bs.substring(0, 3).original).toBe("🅃🄷🄴");
    expect(bs.substring(0, 3).modified).toBe("the");
    expect(bs.substring(4, 9).original).toBe("🅀🅄🄸🄲🄺");
    expect(bs.substring(4, 9).modified).toBe("quick");
    expect(bs.substring(10, 15).original).toBe("🄱🅁🄾🅆🄽");
    expect(bs.substring(10, 15).modified).toBe("brown");
    expect(bs.substring(16, 19).original).toBe("🦊");
    expect(bs.substring(16, 19).modified).toBe("fox");
    expect(bs.substring(20, 25).original).toBe("🄹🅄🄼🄿🅂");
    expect(bs.substring(20, 25).modified).toBe("jumps");
    expect(bs.substring(40, 43).original).toBe("🐶");
    expect(bs.substring(40, 43).modified).toBe("dog");

    // Diacritics should align one-to-one with the plain letters.
    bs = BiString.infer(
        "Ṫḧë qüïċḳ, ḅṛöẅṅ 🦊 jüṁṗṡ öṿëṛ ẗḧë ḷäżÿ 🐶",
        "the quick brown fox jumps over the lazy dog",
    );
    expect(bs.substring(0, 3).equals(new BiString("Ṫḧë", "the", Alignment.identity(3)))).toBe(true);
    expect(bs.substring(4, 9).equals(new BiString("qüïċḳ", "quick", Alignment.identity(5)))).toBe(true);
    expect(bs.substring(10, 15).equals(new BiString("ḅṛöẅṅ", "brown", Alignment.identity(5)))).toBe(true);
    expect(bs.substring(16, 19).original).toBe("🦊");
    expect(bs.substring(16, 19).modified).toBe("fox");
    expect(bs.substring(20, 25).equals(new BiString("jüṁṗṡ", "jumps", Alignment.identity(5)))).toBe(true);
    expect(bs.substring(40, 43).original).toBe("🐶");
    expect(bs.substring(40, 43).modified).toBe("dog");

    // Heavily-stacked combining characters ("Zalgo" text) should still align
    // each modified character to an original cluster starting with it.
    bs = BiString.infer("Z̴̡̪̫̖̥̔̿̃̈̏̎͠͝á̸̪̠̖̻̬̖̪̞͙͇̮̠͎̆͋́̐͌̒͆̓l̶͉̭̳̤̬̮̩͎̟̯̜͇̥̠̘͑͐̌͂̄́̀̂̌̈͛̊̄̚͜ģ̸̬̼̞̙͇͕͎̌̾̒̐̿̎̆̿̌̃̏̌́̾̈͘͜o̶̢̭͕͔̩͐ ̴̡̡̜̥̗͔̘̦͉̣̲͚͙̐̈́t̵͈̰̉̀͒̎̈̿̔̄̽͑͝͠ẹ̵̫̲̫̄͜͜x̵͕̳͈̝̤̭̼̼̻͓̿̌̽̂̆̀̀̍̒͐́̈̀̚͝t̸̡̨̥̺̣̟͎̝̬̘̪͔͆́̄̅̚", "Zalgo text");
    for (let i = 0; i < bs.length; ++i) {
        expect(bs.substring(i, i + 1).original.startsWith(bs[i])).toBe(true);
    }

    // Degenerate cases: empty on either or both sides.
    expect(BiString.infer("", "").equals(new BiString(""))).toBe(true);
    expect(BiString.infer("a", "").equals(new BiString("a", ""))).toBe(true);
    expect(BiString.infer("", "a").equals(new BiString("", "a"))).toBe(true);
});

test("BiString.concat", () => {
    let bs = new BiString(" ", "").concat(
        "Hello",
        new BiString(" ", " "),
        "world!",
        new BiString(" ", ""),
    );

    expect(bs.original).toBe(" Hello world! ");
    expect(bs.modified).toBe("Hello world!");

    // Substrings of a concatenation pull from the right pieces.
    bs = bs.substring(4, 7);
    expect(bs.original).toBe("o w");
    expect(bs.modified).toBe("o w");

    bs = bs.substring(1, 2);
    expect(bs.original).toBe(" ");
    expect(bs.modified).toBe(" ");
});

test("BiString.indexOf", () => {
    const bs = new BiString("dysfunction");

    expect(bs.indexOf("dis")).toBe(-1);
    expect(bs.indexOf("fun")).toBe(3);
    expect(bs.indexOf("n")).toBe(5);
    expect(bs.indexOf("n", 6)).toBe(10);
    expect(bs.indexOf("n", 11)).toBe(-1);

    // boundsOf() returns [start, end) rather than just the start.
    expect(bs.boundsOf("dis")).toEqual([-1, -1]);
    expect(bs.boundsOf("fun")).toEqual([3, 6]);
    expect(bs.boundsOf("n")).toEqual([5, 6]);
    expect(bs.boundsOf("n", 6)).toEqual([10, 11]);
    expect(bs.boundsOf("n", 11)).toEqual([-1, -1]);
});

test("BiString.lastIndexOf", () => {
    const bs = new BiString("dysfunction");

    expect(bs.lastIndexOf("dis")).toBe(-1);
    expect(bs.lastIndexOf("fun")).toBe(3);
    expect(bs.lastIndexOf("n")).toBe(10);
    expect(bs.lastIndexOf("n", 9)).toBe(5);
    expect(bs.lastIndexOf("n", 4)).toBe(-1);

    expect(bs.lastBoundsOf("dis")).toEqual([-1, -1]);
    expect(bs.lastBoundsOf("fun")).toEqual([3, 6]);
    expect(bs.lastBoundsOf("n")).toEqual([10, 11]);
    expect(bs.lastBoundsOf("n", 9)).toEqual([5, 6]);
    expect(bs.lastBoundsOf("n", 4)).toEqual([-1, -1]);
});

test("BiString.{starts,ends}With", () => {
    const bs = new BiString("Beginning, middle, ending");

    expect(bs.startsWith("Begin")).toBe(true);
    expect(bs.endsWith("ing")).toBe(true);

    expect(bs.startsWith("ending")).toBe(false);
    expect(bs.endsWith("Beginning")).toBe(false);
});

test("BiString.pad*", () => {
    const bs = new BiString("Hello world!");

    // Padding to a width the string already exceeds is a no-op.
    expect(bs.padStart(5).equals(bs)).toBe(true);
    expect(bs.padEnd(5).equals(bs)).toBe(true);

    // Pad characters have no original text behind them.
    let pad = new BiString("", " ");
    expect(bs.padStart(16).equals(pad.concat(bs))).toBe(true);
    expect(bs.padEnd(16).equals(bs.concat(pad))).toBe(true);
});

test("BiString.split", () => {
    let bs = new BiString("The quick, brown fox jumps over the lazy dog");

    // split() should mirror String.prototype.split for every separator kind.
    expect(bs.split()).toEqual([bs]);

    expect(bs.split("").map(s => s.modified)).toEqual(bs.modified.split(""));

    expect(bs.split(" ").map(s => s.modified)).toEqual(bs.modified.split(" "));
    expect(bs.split(/ /).map(s => s.modified)).toEqual(bs.modified.split(/ /));

    expect(bs.split(/ /y).map(s => s.modified)).toEqual(bs.modified.split(/ /y));

    // ...including the limit argument...
    expect(bs.split("", 0).map(s => s.modified)).toEqual(bs.modified.split("", 0));
    expect(bs.split(" ", 0).map(s => s.modified)).toEqual(bs.modified.split(" ", 0));
    expect(bs.split(/ /, 0).map(s => s.modified)).toEqual(bs.modified.split(/ /, 0));

    expect(bs.split("", 3).map(s => s.modified)).toEqual(bs.modified.split("", 3));
    expect(bs.split(" ", 3).map(s => s.modified)).toEqual(bs.modified.split(" ", 3));
    expect(bs.split(/ /, 3).map(s => s.modified)).toEqual(bs.modified.split(/ /, 3));

    expect(bs.split("", 20).map(s => s.modified)).toEqual(bs.modified.split("", 20));
    expect(bs.split(" ", 20).map(s => s.modified)).toEqual(bs.modified.split(" ", 20));
    expect(bs.split(/ /, 20).map(s => s.modified)).toEqual(bs.modified.split(/ /, 20));

    // ...and leading/trailing separators.
    bs = new BiString(" The quick, brown fox");
    expect(bs.split(" ").map(s => s.modified)).toEqual(bs.modified.split(" "));
    expect(bs.split(/ /).map(s => s.modified)).toEqual(bs.modified.split(/ /));

    bs = new BiString("The quick, brown fox ");
    expect(bs.split(" ").map(s => s.modified)).toEqual(bs.modified.split(" "));
    expect(bs.split(/ /).map(s => s.modified)).toEqual(bs.modified.split(/ /));

    bs = new BiString(" The quick, brown fox ");
    expect(bs.split(" ").map(s => s.modified)).toEqual(bs.modified.split(" "));
    expect(bs.split(/ /).map(s => s.modified)).toEqual(bs.modified.split(/ /));
});

test("BiString.join", () => {
    // The separator's own original/modified texts appear between the chunks.
    const sep = new BiString(" ", ", ");
    const chunks = new BiString("The quick brown fox").split(" ");
    const bs = sep.join(chunks);
    expect(bs.original).toBe("The quick brown fox");
    expect(bs.modified).toBe("The, quick, brown, fox");
});

test("BiString.trim{,Start,End}", () => {
    let bs = new BiString(" Hello world! ");
    expect(bs.trim().modified).toBe("Hello world!");
    expect(bs.trimStart().modified).toBe("Hello world! ");
    expect(bs.trimEnd().modified).toBe(" Hello world!");

    // All-whitespace input trims to empty.
    bs = new BiString(" ");
    expect(bs.trim().modified).toBe("");
    expect(bs.trimStart().modified).toBe("");
    expect(bs.trimEnd().modified).toBe("");
});

test("BiString.normalize", () => {
    // "Héllö" -- é is composed but ö has a combining diaeresis
    let bs = new BiString("H\u00E9llo\u0308").normalize("NFC");
    expect(bs.original).toBe("H\u00E9llo\u0308");
    expect(bs.modified).toBe("H\u00E9ll\u00F6");
    expect(bs.modified).toBe(bs.original.normalize("NFC"));
    expect(bs.slice(1, 2).equals(new BiString("\u00E9"))).toBe(true);
    expect(bs.slice(4, 5).equals(new BiString("o\u0308", "\u00F6"))).toBe(true);

    bs = new BiString("H\u00E9llo\u0308").normalize("NFD");
    expect(bs.original).toBe("H\u00E9llo\u0308");
    expect(bs.modified).toBe("He\u0301llo\u0308");
    expect(bs.modified).toBe(bs.original.normalize("NFD"));
    expect(bs.slice(1, 3).equals(new BiString("\u00E9", "e\u0301"))).toBe(true);
    expect(bs.slice(5, 7).original).toBe("o\u0308");
    expect(bs.slice(5, 7).modified).toBe("o\u0308");
    expect(bs.slice(5, 7).equals(new BiString("o\u0308"))).toBe(true);
});

test("BiString.toLowerCase", () => {
    let bs = new BiString("Hello World").toLowerCase();
    let expected = new BiString("Hello World", "hello world", Alignment.identity(11));
    expect(bs.equals(expected)).toBe(true);

    // Odysseus
    bs = new BiString("ὈΔΥΣΣΕΎΣ").toLowerCase();
    expected = new BiString("ὈΔΥΣΣΕΎΣ", "ὀδυσσεύς", Alignment.identity(8));
    expect(bs.equals(expected)).toBe(true);

    // Examples from The Unicode Standard, Version 12.0, Chapter 3.13
    bs = new BiString("ᾼΣͅ").toLowerCase();
    expected = new BiString("ᾼΣͅ", "ᾳςͅ", Alignment.identity(4));
    expect(bs.equals(expected)).toBe(true);

    bs = new BiString("ͅΣͅ").toLowerCase();
    expected = new BiString("ͅΣͅ", "ͅσͅ", Alignment.identity(3));
    expect(bs.equals(expected)).toBe(true);

    bs = new BiString("ᾼΣᾼ").toLowerCase();
    expected = new BiString("ᾼΣᾼ", "ᾳσᾳ", Alignment.identity(5));
    expect(bs.equals(expected)).toBe(true);

    bs = new BiString("Σ").toLowerCase();
    expected = new BiString("Σ", "σ");
    expect(bs.equals(expected)).toBe(true);
});

test("BiString.toUpperCase", () => {
    let bs = new BiString("Hello World").toUpperCase();
    let expected = new BiString("Hello World", "HELLO WORLD", Alignment.identity(11));
    expect(bs.equals(expected)).toBe(true);

    // ß expands to SS, so the alignment is no longer the identity there.
    bs = new BiString("straße").toUpperCase();
    expected = new BiString("stra", "STRA", Alignment.identity(4)).concat(
        new BiString("ß", "SS"),
        new BiString("e", "E"),
    );
    expect(bs.equals(expected)).toBe(true);

    // Odysseus
    bs = new BiString("Ὀδυσσεύς").toUpperCase();
    expected = new BiString("Ὀδυσσεύς", "ὈΔΥΣΣΕΎΣ", Alignment.identity(8));
    expect(bs.equals(expected)).toBe(true);
});

test("README", () => {
    // The worked example from the project README: several chained transforms,
    // then slicing the result still maps back to the original text.
    let bs = new BiString("𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 🐶");
    bs = bs.normalize("NFKD");
    bs = bs.toLowerCase();
    bs = bs.replace("🦊", "fox")
    bs = bs.replace("🐶", "dog")
    bs = bs.replace(/[^\w\s]+/g, "");
    bs = bs.slice(0, 19);
    expect(bs.modified).toBe("the quick brown fox");
    expect(bs.original).toBe("𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊");
});
--------------------------------------------------------------------------------
/js/tests/builder.test.ts:
--------------------------------------------------------------------------------
import { BiString, BiStringBuilder, Alignment } from "..";

test("BiStringBuilder word chunks", () => {
    // Build the string word-by-word with fixed-length operations.
    const builder = new BiStringBuilder(" the quick brown fox ");
    builder.discard(2);
    builder.replace(3, "the");
    builder.skip(1);
    builder.replace(5, "quick");
    builder.replace(2, " ");
    builder.replace(5, "brown");
    builder.skip(1);
    builder.replace(3, "fox");
    builder.discard(1);

    const bs = builder.build();

    expect(bs.original).toBe(" the quick brown fox ");
    expect(bs.modified).toBe("the quick brown fox");

    // Word-sized chunks: any slice inside a word maps to the whole word.
    expect(bs.slice(0, 1).original).toBe("the");
    expect(bs.slice(1, 2).original).toBe("the");
    expect(bs.slice(2, 3).original).toBe("the");

    expect(bs.slice(0, 3).original).toBe("the");
    expect(bs.slice(1, 3).original).toBe("the");

    expect(bs.slice(0, 4).original).toBe("the ");

    expect(bs.slice(3, 4).original).toBe(" ");
    expect(bs.slice(9, 10).original).toBe(" ");

    expect(bs.slice(4, 15).original).toBe("quick brown");
    expect(bs.slice(5, 14).original).toBe("quick brown");

    expect(bs.slice(0, 0).original).toBe("");
    expect(bs.slice(10, 10).original).toBe("");
});

test("BiStringBuilder char chunks", () => {
    // Build the same string with regex-driven operations, which produce
    // character-level alignments instead of word-level ones.
    const builder = new BiStringBuilder(" the quick brown fox ");
    builder.discardMatch(/\s+/y);
    while (!builder.isComplete) {
        builder.skipMatch(/\S+/y);
        builder.replaceMatch(/\s+(?=\S)/y,
" ");
        builder.discardMatch(/\s+$/y);
    }

    const bs = builder.build();

    expect(bs.original).toBe(" the quick brown fox ");
    expect(bs.modified).toBe("the quick brown fox");

    // Character-sized chunks: slices map back character-for-character.
    expect(bs.slice(0, 1).original).toBe("t");
    expect(bs.slice(1, 2).original).toBe("h");
    expect(bs.slice(2, 3).original).toBe("e");

    expect(bs.slice(0, 3).original).toBe("the");
    expect(bs.slice(1, 3).original).toBe("he");

    expect(bs.slice(0, 4).original).toBe("the ");
    expect(bs.slice(1, 4).original).toBe("he ");

    expect(bs.slice(3, 4).original).toBe(" ");
    expect(bs.slice(9, 10).original).toBe(" ");

    expect(bs.slice(4, 15).original).toBe("quick brown");
    expect(bs.slice(5, 14).original).toBe("uick brow");

    expect(bs.slice(0, 0).original).toBe("");
    expect(bs.slice(10, 10).original).toBe("");
});

test("BiStringBuilder('')", () => {
    // The empty string builds trivially.
    const builder = new BiStringBuilder("");
    const bs = builder.build();
    expect(bs.original).toBe("");
    expect(bs.modified).toBe("");
    expect(bs.slice(0, 0).original).toBe("");
});

test("BiStringBuilder.rewind", () => {
    // A first pass can be thrown away with rewind() and redone differently.
    const builder = new BiStringBuilder("I wish I wouldn't've spent one thousand dollars.");
    builder.skipMatch(/[^.]*/y);
    builder.discardRest();
    builder.rewind();
    builder.skipMatch(/I wish I would/y);
    builder.replaceMatch(/n't/y, " not");
    builder.replaceMatch(/'ve/y, " have");
    builder.skipMatch(/ spent /y);
    builder.replaceMatch(/one thousand dollars/y, "$1,000");

    const bs = builder.build();
    expect(bs.original).toBe("I wish I wouldn't've spent one thousand dollars.");
    expect(bs.modified).toBe("I wish I would not have spent $1,000");
});

test("BiStringBuilder.replaceAll", () => {
    // replaceAll() only touches matches after the current position, so the
    // first "that" (already replaced above) is left alone.
    const builder = new BiStringBuilder("the cheese that the mouse that the cat that the dog chased played with ate");
    builder.replaceMatch(/that/, "which");
    builder.replaceAll(/that/g, "whom");

    const bs = builder.build();
    expect(bs.original).toBe("the cheese that the mouse that the cat that the dog chased played with ate");
    expect(bs.modified).toBe("the cheese which the mouse whom the cat whom the dog chased played with ate");
});

test("BiStringBuilder.replaceAll back-references", () => {
    // $1-style back-references are substituted into the replacement.
    const builder = new BiStringBuilder("it doesn't work and stuff doesn't get replaced");
    builder.replaceAll(/\bdoesn't (\S+)/g, "$1s");

    const bs = builder.build();
    expect(bs.original).toBe("it doesn't work and stuff doesn't get replaced");
    expect(bs.modified).toBe("it works and stuff gets replaced");
});

test("BiStringBuilder.append", () => {
    // append() splices in a pre-aligned BiString, keeping its alignment.
    const builder = new BiStringBuilder("hello WORLD");
    builder.append(new BiString("hello", "HELLO", Alignment.identity(5)));
    builder.skip(1)
    builder.append(new BiString("WORLD", "world", Alignment.identity(5)));

    const bs = builder.build();
    expect(bs.slice(1, 4).equals(new BiString("ell", "ELL", Alignment.identity(3)))).toBe(true);
    expect(bs.slice(7, 10).equals(new BiString("ORL", "orl", Alignment.identity(3)))).toBe(true);
});
--------------------------------------------------------------------------------
/js/tests/token.test.ts:
--------------------------------------------------------------------------------
/*!
 * Copyright (c) Microsoft Corporation. All rights reserved.
 * Licensed under the MIT license.
4 | */ 5 | 6 | import BiString, { Token, Tokenization, RegExpTokenizer, SplittingTokenizer } from ".."; 7 | 8 | test("Tokenization", () => { 9 | let text = new BiString(" The quick, brown fox jumps over the lazy dog "); 10 | text = text.replace(",", ""); 11 | text = text.replace(/^ +| +$/g, ""); 12 | 13 | let tokens = new Tokenization(text, [ 14 | Token.slice(text, 0, 3), 15 | Token.slice(text, 4, 9), 16 | Token.slice(text, 10, 15), 17 | Token.slice(text, 16, 19), 18 | Token.slice(text, 20, 25), 19 | Token.slice(text, 26, 30), 20 | Token.slice(text, 31, 34), 21 | Token.slice(text, 35, 39), 22 | Token.slice(text, 40, 43), 23 | ]); 24 | expect(tokens.text.equals(text)).toBe(true); 25 | expect(tokens.textBounds(1, 3)).toEqual([4, 15]); 26 | expect(tokens.originalBounds(1, 3)).toEqual([6, 18]); 27 | expect(tokens.boundsForText(0, 13)).toEqual([0, 3]); 28 | expect(tokens.boundsForOriginal(0, 13)).toEqual([0, 2]); 29 | expect(tokens.sliceByText(34, 43).substring().equals(new BiString("lazy dog"))).toBe(true); 30 | expect(tokens.sliceByOriginal(36, 48).substring().equals(new BiString("the lazy dog"))).toBe(true); 31 | expect(tokens.snapTextBounds(2, 13)).toEqual([0, 15]); 32 | expect(tokens.snapOriginalBounds(36, 47)).toEqual([34, 46]); 33 | }); 34 | 35 | test("Tokenization.infer", () => { 36 | const text = "the quick, brown fox" 37 | const tokens = Tokenization.infer(text, ["the", "quick", "brown", "fox"]); 38 | expect(tokens.substring(1, 3).equals(new BiString("quick, brown"))); 39 | 40 | expect(() => Tokenization.infer(text, ["the", "quick", "red", "fox"])).toThrow(); 41 | }); 42 | 43 | test("RegExpTokenizer", () => { 44 | const text = new BiString(" The quick, brown fox jumps over the lazy dog "); 45 | 46 | const tokenizer = new RegExpTokenizer(/\w+/g); 47 | const tokens = tokenizer.tokenize(text); 48 | 49 | expect(tokens.text).toBe(text); 50 | expect(tokens.length).toBe(9); 51 | expect(tokens.textBounds(0, 2)).toEqual([1, 10]); 52 | expect(tokens.slice(0, 
2).substring().equals(text.slice(1, 10))).toBe(true); 53 | expect(tokens.sliceByText(5, 10).length).toBe(1); 54 | expect(tokens.sliceByText(5, 11).length).toBe(1); 55 | expect(tokens.sliceByText(3, 13).length).toBe(3); 56 | }); 57 | 58 | test("SplittingTokenizer", () => { 59 | const text = new BiString(" The quick, brown fox jumps over the lazy dog "); 60 | 61 | const tokenizer = new SplittingTokenizer(/\s+/g); 62 | const tokens = tokenizer.tokenize(text); 63 | 64 | expect(tokens.text).toBe(text); 65 | expect(tokens.length).toBe(9); 66 | expect(tokens.textBounds(0, 2)).toEqual([1, 11]); 67 | expect(tokens.slice(0, 2).substring().equals(text.slice(1, 11))).toBe(true); 68 | expect(tokens.sliceByText(5, 10).length).toBe(1); 69 | expect(tokens.sliceByText(5, 11).length).toBe(1); 70 | expect(tokens.sliceByText(3, 13).length).toBe(3); 71 | }); 72 | -------------------------------------------------------------------------------- /js/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es2020", 4 | "module": "esnext", 5 | "moduleResolution": "node", 6 | "esModuleInterop": true, 7 | "removeComments": true, 8 | "declaration": true, 9 | "outDir": "dist", 10 | "declarationDir": ".", 11 | "strict": true, 12 | "sourceMap": true, 13 | "declarationMap": true 14 | }, 15 | "include": [ 16 | "src/**/*" 17 | ], 18 | "exclude": [ 19 | "dist" 20 | ], 21 | "typedocOptions": { 22 | "excludePrivate": true 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /python/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | 
var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | db.sqlite3-journal 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # pipenv 87 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 88 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 89 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 90 | # install all needed dependencies. 
91 | #Pipfile.lock 92 | 93 | # celery beat schedule file 94 | celerybeat-schedule 95 | 96 | # SageMath parsed files 97 | *.sage.py 98 | 99 | # Environments 100 | .env 101 | .venv 102 | env/ 103 | venv/ 104 | ENV/ 105 | env.bak/ 106 | venv.bak/ 107 | 108 | # Spyder project settings 109 | .spyderproject 110 | .spyproject 111 | 112 | # Rope project settings 113 | .ropeproject 114 | 115 | # mkdocs documentation 116 | /site 117 | 118 | # mypy 119 | .mypy_cache/ 120 | .dmypy.json 121 | dmypy.json 122 | 123 | # Pyre type checker 124 | .pyre/ 125 | -------------------------------------------------------------------------------- /python/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | pipenv run mypy -p bistring --html-report=build/mypy 3 | 4 | deps: 5 | pipenv sync --dev 6 | 7 | check: all 8 | pipenv run pytest 9 | 10 | clean: 11 | pipenv run python setup.py clean --all 12 | 13 | .PHONY: all deps check clean 14 | -------------------------------------------------------------------------------- /python/Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | bistring = {editable = true,extras = ["dev"],path = "."} 8 | 9 | [packages] 10 | bistring = {path = "."} 11 | -------------------------------------------------------------------------------- /python/README.rst: -------------------------------------------------------------------------------- 1 | bistring 2 | ======== 3 | 4 | |PyPI version| 5 | 6 | The bistring library provides non-destructive versions of common string processing operations like normalization, case folding, and find/replace. 7 | Each bistring remembers the original string, and how its substrings map to substrings of the modified version. 8 | 9 | For example: 10 | 11 | .. 
class BistrBuilder:
    r"""
    Bidirectionally transformed string builder.

    A `BistrBuilder` builds a transformed version of a source string iteratively. Each builder has an immutable
    original string, a current string, and the in-progress modified string, with alignments between each. For example:

    .. code-block:: text

        original: |The| |quick,| |brown| |🦊| |jumps| |over| |the| |lazy| |🐶|
                    |      |        |     \ \    \      \      \     \     \ \
        current:  |The| |quick,| |brown| |fox| |jumps| |over| |the| |lazy| |dog|
                    |      |        |     / / /
        modified: |the| |quick| |brown| ...

    The modified string is built in pieces by calling :meth:`replace` to change `n` characters of the current string
    into new ones in the modified string. Convenience methods like :meth:`skip`, :meth:`insert`, and :meth:`discard`
    are implemented on top of this basic primitive.

    >>> b = BistrBuilder('The quick, brown 🦊 jumps over the lazy 🐶')
    >>> b.skip(17)
    >>> b.peek(1)
    '🦊'
    >>> b.replace(1, 'fox')
    >>> b.skip(21)
    >>> b.peek(1)
    '🐶'
    >>> b.replace(1, 'dog')
    >>> b.is_complete
    True
    >>> b.rewind()
    >>> b.peek(3)
    'The'
    >>> b.replace(3, 'the')
    >>> b.skip(1)
    >>> b.peek(6)
    'quick,'
    >>> b.replace(6, 'quick')
    >>> b.skip_rest()
    >>> s = b.build()
    >>> s.modified
    'the quick brown fox jumps over the lazy dog'
    """

    _original: bistr            # the source, possibly already carrying original→current alignment
    _modified: List[str]        # pieces of the modified string, joined lazily by `modified`
    _alignment: List[BiIndex]   # (current, modified) index pairs built so far
    _opos: int                  # position in the current string
    _mpos: int                  # position in the modified string

    def __init__(self, original: String):
        """
        :param original:
            The string to start from.
        """

        self._original = bistr(original)
        self._modified = []
        self._alignment = [(0, 0)]
        self._opos = 0
        self._mpos = 0

    @property
    def original(self) -> str:
        """
        The original string being modified.
        """
        return self._original.original

    @property
    def current(self) -> str:
        """
        The current string before modifications.
        """
        return self._original.modified

    @property
    def modified(self) -> str:
        """
        The modified string as built so far.
        """
        return ''.join(self._modified)

    @property
    def alignment(self) -> Alignment:
        """
        The alignment built so far from self.current to self.modified.
        """
        return Alignment(self._alignment)

    @property
    def position(self) -> int:
        """
        The position of the builder in self.current.
        """
        return self._opos

    @property
    def remaining(self) -> int:
        """
        The number of characters of the current string left to process.
        """
        return len(self.current) - self._opos

    @property
    def is_complete(self) -> bool:
        """
        Whether we've completely processed the string. In other words, whether the modified string aligns with the end
        of the current string.
        """
        return self.remaining == 0

    def peek(self, n: int) -> str:
        """
        Peek at the next `n` characters of the current string.
        """
        return self.current[self._opos:self._opos+n]

    def _advance(self, ocount: int, mcount: int) -> None:
        # Move both cursors, recording a new alignment pair only when something
        # actually advanced, so zero-length operations don't add duplicate points.
        self._opos += ocount
        self._mpos += mcount
        if ocount > 0 or mcount > 0:
            self._alignment.append((self._opos, self._mpos))

    def skip(self, n: int) -> None:
        """
        Skip the next `n` characters, copying them unchanged.
        """
        if n > 0:
            self._modified.append(self.peek(n))
            # Advance one character at a time so the alignment maps each
            # skipped character 1:1 rather than as a single n:n chunk.
            for i in range(n):
                self._advance(1, 1)

    def skip_rest(self) -> None:
        """
        Skip the rest of the string, copying it unchanged.
        """
        self.skip(self.remaining)

    def insert(self, string: str) -> None:
        """
        Insert a substring into the string.
        """
        self.replace(0, string)

    def discard(self, n: int) -> None:
        """
        Discard a portion of the original string.
        """
        self.replace(n, '')

    def discard_rest(self) -> None:
        """
        Discard the rest of the original string.
        """
        self.discard(self.remaining)

    def replace(self, n: int, repl: str) -> None:
        """
        Replace the next `n` characters with a new string.
        """
        if len(repl) > 0:
            self._modified.append(repl)
        self._advance(n, len(repl))

    def append(self, bs: bistr) -> None:
        """
        Append a bistr. The original value of the bistr must match the current string being processed.
        """
        if bs.original != self.peek(len(bs.original)):
            raise ValueError("bistr doesn't match the current string")
        self._modified.append(bs.modified)
        # Replay the appended bistr's alignment pairwise so its internal
        # correspondence is preserved in this builder's alignment.
        for (o0, m0), (o1, m1) in zip(bs.alignment, bs.alignment[1:]):
            self._advance(o1 - o0, m1 - m0)

    def _match(self, regex: Regex) -> Optional[Match[str]]:
        # Anchored match at the builder's current position.
        pattern = compile_regex(regex)
        return pattern.match(self.current, pos=self._opos)

    def _search(self, regex: Regex) -> Optional[Match[str]]:
        # Unanchored search starting from the builder's current position.
        pattern = compile_regex(regex)
        return pattern.search(self.current, pos=self._opos)

    def _finditer(self, regex: Regex) -> Iterable[Match[str]]:
        pattern = compile_regex(regex)
        return pattern.finditer(self.current, pos=self._opos)

    def skip_match(self, regex: Regex) -> bool:
        """
        Skip a substring matching a regex, copying it unchanged.

        :param regex:
            The (possibly compiled) regular expression to match.
        :returns:
            Whether a match was found.
        """

        match = self._match(regex)
        if match:
            self.skip(match.end() - match.start())
            return True
        else:
            return False

    def discard_match(self, regex: Regex) -> bool:
        """
        Discard a substring that matches a regex.

        :param regex:
            The (possibly compiled) regular expression to match.
        :returns:
            Whether a match was found.
        """

        match = self._match(regex)
        if match:
            self.discard(match.end() - match.start())
            return True
        else:
            return False

    def replace_match(self, regex: Regex, repl: Replacement) -> bool:
        """
        Replace a substring that matches a regex.

        :param regex:
            The (possibly compiled) regular expression to match.
        :param repl:
            The replacement to use. Can be a string, which is interpreted as in :meth:`re.Match.expand`, or a
            `callable`, which will receive each match and return the replacement string.
        :returns:
            Whether a match was found.
        """

        match = self._match(regex)
        if match:
            self.replace(match.end() - match.start(), expand_template(match, repl))
            return True
        else:
            return False

    def replace_next(self, regex: Regex, repl: Replacement) -> bool:
        """
        Replace the next occurrence of a regex.

        :param regex:
            The (possibly compiled) regular expression to match.
        :param repl:
            The replacement to use.
        :returns:
            Whether a match was found.
        """

        match = self._search(regex)
        if match:
            # Copy everything up to the match unchanged, then substitute.
            self.skip(match.start() - self._opos)
            self.replace(match.end() - match.start(), expand_template(match, repl))
            return True
        else:
            return False

    def replace_all(self, regex: Regex, repl: Replacement) -> None:
        """
        Replace all occurrences of a regex.

        :param regex:
            The (possibly compiled) regular expression to match.
        :param repl:
            The replacement to use.
        """

        for match in self._finditer(regex):
            self.skip(match.start() - self._opos)
            self.replace(match.end() - match.start(), expand_template(match, repl))
        self.skip_rest()

    def build(self) -> bistr:
        """
        Build the `bistr`.

        :returns:
            A `bistr` from the original string to the new modified string.
        :raises:
            :class:`ValueError` if the modified string is not completely built yet.
        """

        if not self.is_complete:
            raise ValueError(f'The string is not completely built yet ({self.remaining} characters remaining)')

        # Compose original→current with current→modified to get original→modified.
        alignment = self._original.alignment.compose(self.alignment)
        return bistr(self.original, self.modified, alignment)

    def rewind(self) -> None:
        """
        Reset this builder to apply another transformation.

        :raises:
            :class:`ValueError` if the modified string is not completely built yet.
        """
        # The just-built bistr becomes the new source, so the next pass
        # transforms the output of this one while keeping the full alignment chain.
        self._original = self.build()
        self._modified = []
        self._alignment = [(0, 0)]
        self._opos = 0
        self._mpos = 0
def _edit(bs: bistr, op: Callable, locale: Optional[str] = None) -> bistr:
    """
    Apply an ICU case-mapping operation `op` to `bs`, tracking the edits it makes.

    :param bs:
        The bistr to transform.
    :param op:
        An ``icu.CaseMap`` operation taking ``(ucur, edits)`` or ``(locale, ucur, edits)``.
    :param locale:
        An optional locale name passed to the operation.
    """
    builder = BistrBuilder(bs)
    edits = icu.Edits()
    ucur = icu.UnicodeString(builder.current)

    if locale is None:
        umod = icu.UnicodeString(op(ucur, edits))
    else:
        umod = icu.UnicodeString(op(icu.Locale(locale), ucur, edits))

    # Replay ICU's fine-grained edit record into the builder.
    # NOTE(review): the iterator's lengths/indices appear to be UTF-16 code
    # units; countChar32 converts the span length to code points, which is what
    # BistrBuilder counts — confirm against the PyICU Edits API.
    for is_change, old_len, new_len, old_i, new_i, _ in edits.getFineIterator():
        old_len = ucur.countChar32(old_i, old_len)
        if is_change:
            repl = str(umod[new_i:new_i+new_len])
            builder.replace(old_len, repl)
        else:
            builder.skip(old_len)

    return builder.build()


def casefold(bs: bistr) -> bistr:
    """Case-fold `bs` while preserving alignment."""
    return _edit(bs, icu.CaseMap.fold)


def lower(bs: bistr, locale: Optional[str]) -> bistr:
    """Lowercase `bs` (optionally locale-sensitive) while preserving alignment."""
    return _edit(bs, icu.CaseMap.toLower, locale)


def upper(bs: bistr, locale: Optional[str]) -> bistr:
    """Uppercase `bs` (optionally locale-sensitive) while preserving alignment."""
    return _edit(bs, icu.CaseMap.toUpper, locale)


def title(bs: bistr, locale: Optional[str]) -> bistr:
    """Titlecase `bs` (optionally locale-sensitive) while preserving alignment."""
    return _edit(bs, icu.CaseMap.toTitle, locale)


def _normalize(normalizer: icu.Normalizer2, bs: bistr) -> bistr:
    """Normalize `bs` chunk-by-chunk so the alignment stays fine-grained."""
    builder = BistrBuilder(bs)
    current = builder.current

    while not builder.is_complete:
        # Grow the chunk until the next normalization boundary.
        i = builder.position
        j = i + 1
        while j < len(current) and not normalizer.hasBoundaryBefore(current[j]):
            j += 1

        chunk = current[i:j]
        repl = normalizer.normalize(chunk)
        # Record unchanged chunks as skips so they align 1:1.
        if repl == chunk:
            builder.skip(len(chunk))
        else:
            builder.replace(len(chunk), repl)

    return builder.build()


# Supported Unicode normalization forms, mapped to their ICU factory functions.
_NORMALIZERS = {
    'NFC': icu.Normalizer2.getNFCInstance,
    'NFKC': icu.Normalizer2.getNFKCInstance,
    'NFD': icu.Normalizer2.getNFDInstance,
    'NFKD': icu.Normalizer2.getNFKDInstance,
}

def normalize(bs: bistr, form: str) -> bistr:
    """
    Apply Unicode normalization `form` ('NFC', 'NFKC', 'NFD', or 'NFKD') to `bs`.

    :raises:
        :class:`ValueError` if `form` is not a supported normalization form.
    """
    factory = _NORMALIZERS.get(form)
    if factory:
        return _normalize(factory(), bs)
    else:
        raise ValueError('invalid normalization form')
@dataclass(frozen=True)
class AugmentedChar:
    """
    A single character (grapheme cluster) augmented with extra information.
    """

    top_category: str
    """
    The top-level Unicode category of the char (L, P, Z, etc.).
    """

    category: str
    """
    The specific Unicode category of the char (Lu, Po, Zs, etc.).
    """

    root: str
    """
    The root code point of the grapheme cluster.
    """

    folded: str
    """
    The case-folded form of the char.
    """

    normalized: str
    """
    The Unicode compatibility normalized form of the char.
    """

    original: str
    """
    The original form of the char.
    """

    @classmethod
    def cost_fn(cls, a: Optional[AugmentedChar], b: Optional[AugmentedChar]) -> int:
        """
        The cost function between augmented chars. Each attribute contributes one "point" towards their distance.
        """

        if a is None or b is None:
            # cost(insert) + cost(delete) (4 + 4) should be more than cost(substitute) (6)
            return 4

        # One point of distance per differing attribute, so chars that differ
        # only in case or accents are much closer than unrelated chars.
        result = 0
        result += int(a.top_category != b.top_category)
        result += int(a.category != b.category)
        result += int(a.root != b.root)
        result += int(a.folded != b.folded)
        result += int(a.normalized != b.normalized)
        result += int(a.original != b.original)
        return result


# Shared tokenizer that splits text into grapheme clusters ('root' locale).
TOKENIZER = CharacterTokenizer('root')


@dataclass(frozen=True)
class AugmentedString:
    """
    A string augmented with extra information about each character.
    """

    original: str
    """
    The original string.
    """

    chars: List[AugmentedChar]
    """
    The augmented characters of the string.
    """

    alignment: Alignment
    """
    The alignment between the original string and the augmented chars.
    """

    @classmethod
    def augment(cls, original: str) -> AugmentedString:
        """
        Build an :class:`AugmentedString` by normalizing, case-folding, and
        splitting `original` into grapheme clusters, keeping the alignment
        through every stage.
        """
        normalized = bistr(original).normalize('NFKD')
        folded = bistr(normalized.modified).casefold()
        glyphs = TOKENIZER.tokenize(folded)

        chars = []
        for glyph in glyphs:
            fold_c = glyph.text.modified
            root = fold_c[0]

            # Map the glyph back through each transformation stage to recover
            # its normalized and original spellings.
            norm_slice = folded.alignment.original_slice(glyph.start, glyph.end)
            norm_c = folded.original[norm_slice]

            orig_slice = normalized.alignment.original_slice(norm_slice)
            orig_c = normalized.original[orig_slice]

            cat = unicodedata.category(root)
            top_cat = cat[0]

            chars.append(AugmentedChar(top_cat, cat, root, fold_c, norm_c, orig_c))

        # Compose original→normalized→folded→glyph alignments into one.
        alignment = normalized.alignment
        alignment = alignment.compose(folded.alignment)
        alignment = alignment.compose(glyphs.alignment)
        return cls(original, chars, alignment)


def heuristic_infer(original: str, modified: str) -> bistr:
    """
    Infer the alignment between two strings with a "smart" heuristic.

    We use Unicode normalization and case folding to minimize differences that are due to case, accents, ligatures, etc.
    """

    aug_orig = AugmentedString.augment(original)
    aug_mod = AugmentedString.augment(modified)

    # Align the augmented characters, then lift that alignment back to the
    # raw original/modified strings via each side's internal alignment.
    alignment = Alignment.infer(aug_orig.chars, aug_mod.chars, AugmentedChar.cost_fn)
    alignment = aug_orig.alignment.compose(alignment)
    alignment = alignment.compose(aug_mod.alignment.inverse())

    return bistr(original, modified, alignment)
def compile_regex(regex: Regex) -> Pattern[str]:
    """Return `regex` as a compiled pattern, compiling it first if it is a plain string."""
    return re.compile(regex) if isinstance(regex, str) else regex


def expand_template(match: Match[str], repl: Replacement) -> str:
    """
    Produce the replacement text for `match`.

    A callable `repl` is invoked with the match; anything else is treated as a
    template string, as in :meth:`re.Match.expand`.
    """
    if not callable(repl):
        return match.expand(repl)
    return repl(match)
@dataclass(frozen=True)
class Token:
    """
    A token extracted from a string.
    """

    text: bistr
    """
    The actual text of the token.
    """

    start: int
    """
    The start position of the token.
    """

    end: int
    """
    The end position of the token.
    """

    def __init__(self, text: String, start: int, end: int):
        """
        :param text:
            The text of this token.
        :param start:
            The starting index of this token.
        :param end:
            The ending index of this token.
        """

        # The dataclass is frozen, so assignment must go through
        # object.__setattr__ to bypass the generated immutability check.
        super().__setattr__('text', bistr(text))
        super().__setattr__('start', start)
        super().__setattr__('end', end)

    @property
    def original(self) -> str:
        """
        The original value of this token.
        """
        return self.text.original

    @property
    def modified(self) -> str:
        """
        The modified value of this token.
        """
        return self.text.modified

    @classmethod
    def slice(cls, text: String, start: int, end: int) -> Token:
        """
        Create a Token from a slice of a bistr.

        :param text:
            The (bi)string to slice.
        :param start:
            The starting index of the token.
        :param end:
            The ending index of the token.
        """
        return cls(text[start:end], start, end)

    def __str__(self) -> str:
        return f'[{self.start}:{self.end}]={self.text}'

    def __repr__(self) -> str:
        return f'Token({self.text!r}, start={self.start}, end={self.end})'
@dataclass(frozen=True)
class Tokenization:
    """
    A string and its tokenization.
    """

    text: bistr
    """
    The text that was tokenized.
    """

    alignment: Alignment
    """
    The alignment from text indices to token indices.
    """

    _tokens: Sequence[Token]

    def __init__(self, text: String, tokens: Iterable[Token]):
        """
        :param text:
            The text from which the tokens have been extracted.
        :param tokens:
            The tokens extracted from the text.
        """
        text = bistr(text)
        tokens = tuple(tokens)

        # Build a text-position ↔ token-index alignment: each token contributes
        # (start, i) and (end, i + 1), with sentinel pairs at both ends so gaps
        # before/between/after tokens are covered.
        alignment = [(0, 0)]
        for i, token in enumerate(tokens):
            alignment.append((token.start, i))
            alignment.append((token.end, i + 1))
        alignment.append((len(text), len(tokens)))

        # Frozen dataclass: assignment goes through object.__setattr__.
        super().__setattr__('text', text)
        super().__setattr__('_tokens', tokens)
        super().__setattr__('alignment', Alignment(alignment))

    @classmethod
    def infer(cls, text: String, tokens: Iterable[str]) -> Tokenization:
        r"""
        Infer a `Tokenization` from a sequence of tokens.

        >>> tokens = Tokenization.infer('hello, world!', ['hello', 'world'])
        >>> tokens[0]
        Token(bistr('hello'), start=0, end=5)
        >>> tokens[1]
        Token(bistr('world'), start=7, end=12)

        Due to the possibility of ambiguity, it is much better to use a :class:`Tokenizer` or some other method of
        producing :class:`Token`\ s with their positions explicitly set.

        :returns:
            The inferred tokenization, with token positions found by simple forward search.
        :raises:
            :class:`ValueError` if the tokens can't be found in the source string.
        """

        text = bistr(text)

        result = []
        start = 0
        for token in tokens:
            # Find each token at or after the end of the previous one.
            start, end = text.index_bounds(token, start)
            result.append(Token.slice(text, start, end))
            start = end

        return cls(text, result)

    def __iter__(self) -> Iterator[Token]:
        return iter(self._tokens)

    def __len__(self) -> int:
        return len(self._tokens)

    @overload
    def __getitem__(self, index: int) -> Token: ...

    @overload
    def __getitem__(self, index: slice) -> Tokenization: ...

    def __getitem__(self, index: Index) -> Union[Token, Tokenization]:
        r"""
        Indexing a `Tokenization` returns the nth token:

        >>> tokens = Tokenization.infer(
        ...     "The quick, brown fox",
        ...     ["The", "quick", "brown", "fox"],
        ... )
        >>> tokens[0]
        Token(bistr('The'), start=0, end=3)

        Slicing a `Tokenization` returns a new one with the requested slice of tokens:

        >>> tokens = tokens[1:-1]
        >>> tokens[0]
        Token(bistr('quick'), start=4, end=9)
        """

        if isinstance(index, slice):
            return Tokenization(self.text, self._tokens[index])
        else:
            return self._tokens[index]

    def __str__(self) -> str:
        tokens = ', '.join(map(str, self))
        return f'Tokenization({self.text}, [{tokens}])'

    def __repr__(self) -> str:
        return f'Tokenization({self.text!r}, {self._tokens!r})'

    def substring(self, *args: AnyBounds) -> bistr:
        """
        Map a span of tokens to the corresponding substring. With no arguments, returns the substring from the first
        to the last token.
        """
        i, j = self.text_bounds(*args)
        return self.text[i:j]

    def text_bounds(self, *args: AnyBounds) -> Bounds:
        """
        Map a span of tokens to the bounds of the corresponding text. With no arguments, returns the bounds from the
        first to the last token.
        """
        if len(args) == 0:
            args = (0, len(self))
        return self.alignment.original_bounds(*args)

    def original_bounds(self, *args: AnyBounds) -> Bounds:
        """
        Map a span of tokens to the bounds of the corresponding original text. With no arguments, returns the bounds from the
        first to the last token.
        """
        return self.text.alignment.original_bounds(self.text_bounds(*args))

    def bounds_for_text(self, *args: AnyBounds) -> Bounds:
        """
        Map a span of text to the bounds of the corresponding span of tokens.
        """
        return self.alignment.modified_bounds(*args)

    def bounds_for_original(self, *args: AnyBounds) -> Bounds:
        """
        Map a span of original text to the bounds of the corresponding span of
        tokens.
        """
        # original text → modified text → token indices
        text_bounds = self.text.alignment.modified_bounds(*args)
        return self.alignment.modified_bounds(text_bounds)

    def slice_by_text(self, *args: AnyBounds) -> Tokenization:
        """
        Map a span of text to the corresponding span of tokens.
        """
        i, j = self.bounds_for_text(*args)
        return self[i:j]

    def slice_by_original(self, *args: AnyBounds) -> Tokenization:
        """
        Map a span of the original text to the corresponding span of tokens.
        """
        i, j = self.bounds_for_original(*args)
        return self[i:j]

    def snap_text_bounds(self, *args: AnyBounds) -> Bounds:
        """
        Expand a span of text to align it with token boundaries.
        """
        return self.text_bounds(self.bounds_for_text(*args))

    def snap_original_bounds(self, *args: AnyBounds) -> Bounds:
        """
        Expand a span of original text to align it with token boundaries.
        """
        return self.original_bounds(self.bounds_for_original(*args))
        """
        # Round-trip text -> tokens -> text widens the span to whole tokens.
        return self.text_bounds(self.bounds_for_text(*args))

    def snap_original_bounds(self, *args: AnyBounds) -> Bounds:
        """
        Expand a span of original text to align it with token boundaries.
        """
        return self.original_bounds(self.bounds_for_original(*args))


class Tokenizer(ABC):
    """
    Abstract base class for tokenizers.
    """

    @abstractmethod
    def tokenize(self, text: String) -> Tokenization:
        """
        Tokenize some text.

        :param text: The text to tokenize, as either an `str` or
                     :class:`~bistring.bistr`.  A plain `str` should be
                     converted to a `bistr` before processing.

        :returns: A :class:`~bistring.Tokenization` holding the text and its
                  tokens.
        """

        pass


class RegexTokenizer(Tokenizer):
    r"""
    Breaks text into tokens based on a regex.

    >>> tokenizer = RegexTokenizer(r'\w+')
    >>> tokens = tokenizer.tokenize('the quick brown fox jumps over the lazy dog')
    >>> tokens[0]
    Token(bistr('the'), start=0, end=3)
    >>> tokens[1]
    Token(bistr('quick'), start=4, end=9)
    """

    def __init__(self, regex: Regex):
        """
        :param regex:
            A (possibly compiled) regular expression that matches tokens to extract.
        """

        self._pattern = compile_regex(regex)

    def tokenize(self, text: String) -> Tokenization:
        text = bistr(text)
        tokens = []
        # finditer() yields non-overlapping matches over the modified text;
        # each match becomes one token.
        for match in self._pattern.finditer(text.modified):
            tokens.append(Token.slice(text, match.start(), match.end()))
        return Tokenization(text, tokens)


class SplittingTokenizer(Tokenizer):
    r"""
    Splits text into tokens based on a regex.

    >>> tokenizer = SplittingTokenizer(r'\s+')
    >>> tokens = tokenizer.tokenize('the quick brown fox jumps over the lazy dog')
    >>> tokens[0]
    Token(bistr('the'), start=0, end=3)
    >>> tokens[1]
    Token(bistr('quick'), start=4, end=9)
    """

    def __init__(self, regex: Regex):
        """
        :param regex:
            A (possibly compiled) regular expression that matches the regions between tokens.
        """

        self._pattern = compile_regex(regex)

    def tokenize(self, text: String) -> Tokenization:
        text = bistr(text)
        tokens = []

        # Tokens are the gaps between separator matches; the `if` guards drop
        # empty tokens when separators touch or sit at the string edges.
        last = 0
        for match in self._pattern.finditer(text.modified):
            start = match.start()
            if start > last:
                tokens.append(Token.slice(text, last, start))
            last = match.end()

        end = len(text.modified)
        if end > last:
            tokens.append(Token.slice(text, last, end))

        return Tokenization(text, tokens)


class _IcuTokenizer(Tokenizer):
    """
    Base class for ICU BreakIterator-based tokenizers.
    """

    def __init__(self, locale: str, constructor: Callable[[icu.Locale], icu.BreakIterator]):
        # BreakIterator is not a thread-safe API, so store a cache of
        # thread-local iterators
        self._locale = icu.Locale(locale)
        self._constructor = constructor
        self._local = threading.local()

        # Eagerly construct one on this thread as an optimization, and to check
        # for errors
        self._break_iterator()

    def _break_iterator(self) -> icu.BreakIterator:
        # Lazily create (and memoize) one BreakIterator per thread.
        bi: Optional[icu.BreakIterator] = getattr(self._local, 'bi', None)
        if bi is None:
            bi = self._constructor(self._locale)
            self._local.bi = bi
        return bi

    def tokenize(self, text: String) -> Tokenization:
        text = bistr(text)
        tokens = []

        bi = self._break_iterator()

        utext = icu.UnicodeString(text.modified)
        bi.setText(utext)

        # ui/uj are boundary positions in the ICU string (presumably UTF-16
        # code units); i/j are the corresponding Python str (code point)
        # offsets, converted via countChar32().
        ui = bi.first()
        uj = bi.nextBoundary()
        i = 0
        while uj != icu.BreakIterator.DONE:
            j = i + utext.countChar32(ui, uj - ui)
            if self._check_token(bi.getRuleStatus()):
                tokens.append(Token.slice(text, i, j))
            ui = uj
            uj = bi.nextBoundary()
            i = j

        return Tokenization(text, tokens)

    def _check_token(self, tag: int) -> bool:
        # Hook for subclasses to filter segments by ICU rule status; by
        # default every segment becomes a token.
        return True


class CharacterTokenizer(_IcuTokenizer):
    """
    Splits text into user-perceived characters/grapheme clusters.

    >>> tokenizer = CharacterTokenizer('th_TH')
    >>> tokens = tokenizer.tokenize('กำนัล')
    >>> tokens[0]
    Token(bistr('กำ'), start=0, end=2)
    >>> tokens[1]
    Token(bistr('นั'), start=2, end=4)
    >>> tokens[2]
    Token(bistr('ล'), start=4, end=5)
    """

    def __init__(self, locale: str):
        """
        :param locale:
            The name of the locale to use for computing user-perceived character boundaries.
        """
        super().__init__(locale, icu.BreakIterator.createCharacterInstance)


class WordTokenizer(_IcuTokenizer):
    """
    Splits text into words based on Unicode rules.

    >>> tokenizer = WordTokenizer('en_US')
    >>> tokens = tokenizer.tokenize('the quick brown fox jumps over the lazy dog')
    >>> tokens[0]
    Token(bistr('the'), start=0, end=3)
    >>> tokens[1]
    Token(bistr('quick'), start=4, end=9)
    """

    def __init__(self, locale: str):
        """
        :param locale:
            The name of the locale to use for computing word boundaries.
        """
        super().__init__(locale, icu.BreakIterator.createWordInstance)

    def _check_token(self, tag: int) -> bool:
        # Keep only segments ICU classifies as actual words (status >=
        # UBRK_WORD_NONE_LIMIT), dropping whitespace/punctuation runs.
        return tag >= 100  # UBRK_WORD_NONE_LIMIT


class SentenceTokenizer(_IcuTokenizer):
    """
    Splits text into sentences based on Unicode rules.

    >>> tokenizer = SentenceTokenizer('en_US')
    >>> tokens = tokenizer.tokenize(
    ...     'Word, sentence, etc. boundaries are hard.  Luckily, Unicode can help.'
    ... )
    >>> tokens[0]
    Token(bistr('Word, sentence, etc. boundaries are hard.  '), start=0, end=42)
    >>> tokens[1]
    Token(bistr('Luckily, Unicode can help.'), start=42, end=68)
    """

    def __init__(self, locale: str):
        """
        :param locale:
            The name of the locale to use for computing sentence boundaries.
        """
        super().__init__(locale, icu.BreakIterator.createSentenceInstance)
--------------------------------------------------------------------------------
/python/bistring/_typing.py:
--------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.

from typing import Callable, Match, Pattern, Tuple, Union


# A position (original, modified) into both sides of a bistr.
BiIndex = Tuple[int, int]

# A half-open [start, end) span within a single string.
Bounds = Tuple[int, int]

# Anything accepted where a span is expected: a bare int, range, slice, or
# (start, end) tuple.
AnyBounds = Union[int, range, slice, Bounds]

# Anything accepted by __getitem__: a single position or a slice.
Index = Union[int, slice]

Range = Union[range, slice, Bounds]

# A regex as either a pattern string or a pre-compiled Pattern.
Regex = Union[str, Pattern[str]]

# A substitution target: a template string or a callable applied to each Match.
Replacement = Union[str, Callable[[Match[str]], str]]
--------------------------------------------------------------------------------
/python/bistring/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/bistring/739b2620d0de1936ab915b5e92c9de141c169d51/python/bistring/py.typed
--------------------------------------------------------------------------------
/python/mypy.ini:
--------------------------------------------------------------------------------
[mypy]
mypy_path = ./stubs
disallow_subclassing_any = True
disallow_untyped_calls = True
disallow_untyped_defs = True
disallow_incomplete_defs = True
disallow_untyped_decorators = True
no_implicit_optional = True
warn_no_return = True
warn_return_any = True
warn_unreachable = True
warn_incomplete_stub = True
warn_redundant_casts = True
show_error_context = True
--------------------------------------------------------------------------------
/python/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"
--------------------------------------------------------------------------------
/python/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.

from pathlib import Path
from setuptools import setup


def readme():
    # Read the long description from the adjacent README.
    # NOTE(review): no explicit encoding= here; relies on the platform default
    # locale encoding — confirm the README is plain ASCII or pass 'utf-8'.
    with open(Path(__file__).parent/'README.rst') as f:
        return f.read()


setup(
    name='bistring',
    version='0.5.0',
    description='Bidirectionally transformed strings',
    long_description=readme(),
    long_description_content_type='text/x-rst',
    classifiers=[
        'Development Status :: 4 - Beta',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.10',
        'Topic :: Text Processing :: General',
        'Typing :: Typed',
    ],
    keywords='bistring string non-destructive',
    url='https://github.com/microsoft/bistring',
    author='Microsoft Research Montreal',
    author_email='msrmtle@microsoft.com',
    license='MIT',
    packages=[
        'bistring',
    ],
    package_data={
        'bistring': [
            # PEP 561 marker so type checkers use the inline annotations.
            'py.typed',
        ],
    },
    zip_safe=False,
    test_suite='tests',
    python_requires='>=3.10',
    setup_requires=[
        'pytest-runner',
    ],
    install_requires=[
        'pyicu',
    ],
    extras_require={
        'dev': [
            'exceptiongroup',
            'lxml',
            'mypy',
            'pytest',
            'regex',
            'tomli',
        ],
    },
    tests_require=[
        'bistring[dev]',
    ],
)
--------------------------------------------------------------------------------
/python/stubs/icu.pyi:
--------------------------------------------------------------------------------
# Minimal type stubs for the parts of PyICU that bistring uses.
from typing import Iterator, Tuple, Union, overload


UString = Union[str, 'UnicodeString']


class BreakIterator:
    DONE: int

    @classmethod
    def createCharacterInstance(cls, locale: Locale) -> BreakIterator: ...

    @classmethod
    def createWordInstance(cls, locale: Locale) -> BreakIterator: ...

    @classmethod
    def createSentenceInstance(cls, locale: Locale) -> BreakIterator: ...

    def setText(self, text: UString) -> None: ...

    def first(self) -> int: ...

    def nextBoundary(self) -> int: ...

    def getRuleStatus(self) -> int: ...


class CaseMap:
    @classmethod
    def fold(cls, text: UString, edits: Edits) -> str: ...

    @classmethod
    def toLower(cls, locale: Locale, text: UString, edits: Edits) -> str: ...

    @classmethod
    def toUpper(cls, locale: Locale, text: UString, edits: Edits) -> str: ...

    @classmethod
    def toTitle(cls, locale: Locale, text: UString, edits: Edits) -> str: ...


class Edits:
    def __init__(self) -> None: ...

    def getFineIterator(self) -> Iterator[Tuple[bool, int, int, int, int, int]]: ...


class Locale:
    def __init__(self, name: str): ...


class Normalizer2:
    @classmethod
    def getNFCInstance(cls) -> Normalizer2: ...

    @classmethod
    def getNFDInstance(cls) -> Normalizer2: ...

    @classmethod
    def getNFKCInstance(cls) -> Normalizer2: ...

    @classmethod
    def getNFKDInstance(cls) -> Normalizer2: ...

    def normalize(self, text: UString) -> str: ...

    def hasBoundaryBefore(self, c: UString) -> bool: ...


class UnicodeString:
    def __init__(self, string: str): ...

    @overload
    def __getitem__(self, index: int) -> str: ...

    @overload
    def __getitem__(self, index: slice) -> UnicodeString: ...

    def __len__(self) -> int: ...

    @overload
    def countChar32(self) -> int: ...

    @overload
    def countChar32(self, start: int, length: int) -> int: ...

    def charAt(self, index: int) -> int: ...

    def char32At(self, index: int) -> int: ...
--------------------------------------------------------------------------------
/python/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/bistring/739b2620d0de1936ab915b5e92c9de141c169d51/python/tests/__init__.py
--------------------------------------------------------------------------------
/python/tests/test_alignment.py:
--------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.

from bistring import Alignment
import pytest


def test_empty():
    # An alignment needs at least one pair; the zero-length identity is the
    # smallest valid alignment.
    pytest.raises(ValueError, Alignment, [])

    alignment = Alignment.identity(0)
    assert list(alignment) == [(0, 0)]

    assert alignment.original_bounds() == (0, 0)
    assert alignment.modified_bounds() == (0, 0)

    assert alignment.original_bounds(0, 0) == (0, 0)
    assert alignment.modified_bounds(0, 0) == (0, 0)


def test_indexing():
    data = [
        (0, 1),
        (1, 2),
        (2, 4),
        (3, 8),
        (4, 16),
    ]
    length = len(data)
    alignment = Alignment(data)

    assert len(alignment) == length

    # Positive and negative indexing, plus every slice, mirror the raw list.
    for i in range(length):
        assert alignment[i] == data[i]
        assert alignment[i - length] == data[i - length]

        for j in range(i + 1, length + 1):
            assert list(alignment[i:j]) == data[i:j]


def test_identity():
    alignment = Alignment.identity(1, 16)

    assert alignment == Alignment((i, i) for i in range(1, 17))
    assert list(alignment) == [(i, i) for i in range(1, 17)]

    assert alignment.original_bounds() == (1, 16)
    assert alignment.modified_bounds() == (1, 16)

    assert alignment.original_bounds(4, 7) == (4, 7)
    assert alignment.modified_bounds(4, 7) == (4, 7)


def test_aligning():
    alignment = Alignment([(0, 0), (1, 2), (2, 4), (3, 6)])

    assert alignment.original_bounds() == (0, 3)
    assert alignment.modified_bounds() == (0, 6)

    assert alignment.original_bounds(0, 0) == (0, 0)
    assert alignment.original_bounds(0, 1) == (0, 1)
    assert alignment.original_bounds(0, 2) == (0, 1)
    assert alignment.original_bounds(0, 3) == (0, 2)
    assert alignment.original_bounds(1, 1) == (0, 1)
    assert alignment.original_bounds(1, 3) == (0, 2)
    assert alignment.original_bounds(1, 4) == (0, 2)
    assert alignment.original_bounds(2, 2) == (1, 1)
    assert alignment.original_bounds(2, 4) == (1, 2)
    assert alignment.original_bounds(2, 5) == (1, 3)
    assert alignment.original_bounds(3, 3) == (1, 2)

    assert alignment.modified_bounds(0, 0) == (0, 0)
    assert alignment.modified_bounds(0, 1) == (0, 2)
    assert alignment.modified_bounds(0, 2) == (0, 4)
    assert alignment.modified_bounds(0, 3) == (0, 6)
    assert alignment.modified_bounds(1, 1) == (2, 2)
    assert alignment.modified_bounds(2, 2) == (4, 4)


def test_canonicalization():
    # Duplicate pairs collapse, and concatenation merges at the shared pair.
    assert Alignment([(0, 0), (1, 2), (1, 2), (2, 4)]) == Alignment([(0, 0), (1, 2), (2, 4)])

    assert Alignment([(0, 0), (1, 2)]) + Alignment([(1, 2), (2, 4)]) == Alignment([(0, 0), (1, 2), (2, 4)])


def _test_composition(first, second):
    # Exhaustively check that composing alignments agrees with chaining the
    # individual bound mappings over every sub-span.
    composed = first.compose(second)

    of, ol = composed.original_bounds()
    mf, ml = composed.modified_bounds()

    assert (of, ol) == first.original_bounds()
    assert (mf, ml) == second.modified_bounds()

    for i in range(of, ol + 1):
        for j in range(i, ol + 1):
            assert composed.modified_bounds(i, j) == second.modified_bounds(first.modified_bounds(i, j))

    for i in range(mf, ml + 1):
        for j in range(i, ml + 1):
            assert composed.original_bounds(i, j) == first.original_bounds(second.original_bounds(i, j))


def test_compose():
    first = Alignment((i, 2 * i) for i in
range(4))
    second = Alignment((i, 2 * i) for i in range(7))
    _test_composition(first, second)


def _test_identity_composition(alignment):
    # Composing with an identity alignment on either side must be a no-op.
    _test_composition(alignment, Alignment.identity(alignment.modified_range()))
    _test_composition(Alignment.identity(alignment.original_range()), alignment)


def test_compose_identity():
    alignment = Alignment([
        (0, 2),
        (2, 2),
        (4, 4),
        (6, 6),
        (8, 6),
    ])

    # Modified sequence is smaller
    _test_identity_composition(alignment)

    # Original sequence is smaller
    _test_identity_composition(alignment.inverse())


def test_infer():
    assert Alignment.infer('test', 'test') == Alignment.identity(4)
    assert Alignment.infer('asdf', 'jkl;') == Alignment.identity(4)

    assert Alignment.infer('color', 'colour') == Alignment([
        (0, 0),
        (1, 1),
        (2, 2),
        (3, 3),
        (4, 4),
        (4, 5),
        (5, 6),
    ])

    assert Alignment.infer('color', 'colour') == Alignment.infer('colour', 'color').inverse()

    assert Alignment.infer("ab---", "ab") == Alignment([
        (0, 0),
        (1, 1),
        (2, 2),
        (3, 2),
        (4, 2),
        (5, 2),
    ])
--------------------------------------------------------------------------------
/python/tests/test_bistr.py:
--------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.

from bistring import Alignment, bistr
import pytest
import unicodedata


def test_new():
    # Constructor argument validation.
    pytest.raises(TypeError, bistr, 42)
    pytest.raises(TypeError, bistr, 'fourty-two', 42)
    pytest.raises(TypeError, bistr, 'fourty-two', '42', 42)

    # Alignments must span exactly both strings.
    pytest.raises(ValueError, bistr, 'fourty-two', '42', Alignment([
        (0, 0),
        (9, 2),
    ]))
    pytest.raises(ValueError, bistr, 'fourty-two', '42', Alignment([
        (0, 0),
        (10, 1),
    ]))

    bistr('42')
    bistr('fourty-two', '42')
    bistr('fourty-two', '42', Alignment([
        (0, 0),
        (6, 1),
        (7, 1),
        (10, 2),
    ]))


def test_infer():
    bs = bistr.infer('test', 'test')
    assert bs == bistr('test', 'test', Alignment.identity(4))

    bs = bistr.infer('color', 'colour')
    assert bs[3:5].original == 'o'

    assert bs.inverse() == bistr.infer('colour', 'color')

    bs = bistr.infer("--Hello, world!--", "hello world")
    assert bs[:5] == bistr("Hello", "hello", Alignment.identity(5))
    assert bs[6:] == bistr("world")

    bs = bistr.infer(
        '🅃🄷🄴 🅀🅄🄸🄲🄺, 🄱🅁🄾🅆🄽 🦊 🄹🅄🄼🄿🅂 🄾🅅🄴🅁 🅃🄷🄴 🄻🄰🅉🅈 🐶',
        'the quick brown fox jumps over the lazy dog',
    )
    assert bs[0:3] == bistr('🅃🄷🄴', 'the', Alignment.identity(3))
    assert bs[4:9] == bistr('🅀🅄🄸🄲🄺', 'quick', Alignment.identity(5))
    assert bs[10:15] == bistr('🄱🅁🄾🅆🄽', 'brown', Alignment.identity(5))
    assert bs[16:19].original == '🦊'
    assert bs[16:19].modified == 'fox'
    assert bs[20:25] == bistr('🄹🅄🄼🄿🅂', 'jumps', Alignment.identity(5))
    assert bs[40:43].original == '🐶'
    assert bs[40:43].modified == 'dog'

    bs = bistr.infer(
        'Ṫḧë qüïċḳ, ḅṛöẅṅ 🦊 jüṁṗṡ öṿëṛ ẗḧë ḷäżÿ 🐶',
        'the quick brown fox jumps over the lazy dog',
    )
    assert bs[0:3] == bistr('Ṫḧë', 'the', Alignment.identity(3))
    assert bs[4:9] == bistr('qüïċḳ', 'quick', Alignment.identity(5))
    assert bs[10:15] == bistr('ḅṛöẅṅ', 'brown',
Alignment.identity(5)) 66 | assert bs[16:19].original == '🦊' 67 | assert bs[16:19].modified == 'fox' 68 | assert bs[20:25] == bistr('jüṁṗṡ', 'jumps', Alignment.identity(5)) 69 | assert bs[40:43].original == '🐶' 70 | assert bs[40:43].modified == 'dog' 71 | 72 | bs = bistr.infer('Z̴̡̪̫̖̥̔̿̃̈̏̎͠͝á̸̪̠̖̻̬̖̪̞͙͇̮̠͎̆͋́̐͌̒͆̓l̶͉̭̳̤̬̮̩͎̟̯̜͇̥̠̘͑͐̌͂̄́̀̂̌̈͛̊̄̚͜ģ̸̬̼̞̙͇͕͎̌̾̒̐̿̎̆̿̌̃̏̌́̾̈͘͜o̶̢̭͕͔̩͐ ̴̡̡̜̥̗͔̘̦͉̣̲͚͙̐̈́t̵͈̰̉̀͒̎̈̿̔̄̽͑͝͠ẹ̵̫̲̫̄͜͜x̵͕̳͈̝̤̭̼̼̻͓̿̌̽̂̆̀̀̍̒͐́̈̀̚͝t̸̡̨̥̺̣̟͎̝̬̘̪͔͆́̄̅̚', 'Zalgo text') 73 | for i, c in enumerate(bs): 74 | assert bs[i:i+1].original.startswith(c) 75 | 76 | 77 | def test_concat(): 78 | bs = bistr(' ', '') 79 | bs += 'Hello' 80 | bs += bistr(' ', ' ') 81 | bs += 'world!' 82 | bs += bistr(' ', '') 83 | 84 | assert bs.original == ' Hello world! ' 85 | assert bs.modified == 'Hello world!' 86 | 87 | bs = bs[4:7] 88 | assert bs.original == 'o w' 89 | assert bs.modified == 'o w' 90 | 91 | bs = bs[1:2] 92 | assert bs.original == ' ' 93 | assert bs.modified == ' ' 94 | 95 | 96 | def test_find_index(): 97 | bs = bistr('dysfunction') 98 | 99 | assert bs.find('dis') == -1 100 | assert bs.find('fun') == 3 101 | assert bs.find('n') == 5 102 | assert bs.find('n', 6) == 10 103 | 104 | assert bs.find_bounds('dis') == (-1, -1) 105 | assert bs.find_bounds('fun') == (3, 6) 106 | assert bs.find_bounds('n') == (5, 6) 107 | assert bs.find_bounds('n', 6) == (10, 11) 108 | 109 | pytest.raises(ValueError, bs.index, 'dis') 110 | pytest.raises(ValueError, bs.index_bounds, 'dis') 111 | 112 | assert bs.index('fun') == 3 113 | assert bs.index_bounds('fun') == (3, 6) 114 | assert bs.index_bounds('n') == (5, 6) 115 | assert bs.index_bounds('n', 6) == (10, 11) 116 | 117 | 118 | def test_rfind_rindex(): 119 | bs = bistr('dysfunction') 120 | 121 | assert bs.rfind('dis') == -1 122 | assert bs.rfind('fun') == 3 123 | assert bs.rfind('n') == 10 124 | assert bs.rfind('n', None, 9) == 5 125 | 126 | assert bs.rfind_bounds('dis') == (-1, -1) 127 | assert bs.rfind_bounds('fun') == (3, 6) 128 | 
assert bs.rfind_bounds('n') == (10, 11) 129 | assert bs.rfind_bounds('n', None, 9) == (5, 6) 130 | 131 | pytest.raises(ValueError, bs.index, 'dis') 132 | pytest.raises(ValueError, bs.index_bounds, 'dis') 133 | 134 | assert bs.rindex('fun') == 3 135 | assert bs.rindex_bounds('fun') == (3, 6) 136 | assert bs.rindex_bounds('n') == (10, 11) 137 | assert bs.rindex_bounds('n', None, 9) == (5, 6) 138 | 139 | 140 | def test_starts_ends_with(): 141 | bs = bistr('Beginning, middle, ending') 142 | 143 | assert bs.startswith('Begin') 144 | assert bs.endswith('ing') 145 | 146 | assert not bs.startswith('ending') 147 | assert not bs.endswith('Beginning') 148 | 149 | assert bs.startswith(('Begin', 'End')) 150 | assert bs.endswith(('beginning', 'ending')) 151 | 152 | 153 | def test_justify(): 154 | bs = bistr('Hello world!') 155 | 156 | assert bs.center(5) == bs 157 | assert bs.center(20) == bistr('', ' ') + bs + bistr('', ' ') 158 | assert bs.center(21) == bistr('', ' ') + bs + bistr('', ' ') 159 | 160 | assert bs.ljust(5) == bs 161 | assert bs.ljust(16) == bs + bistr('', ' ') 162 | 163 | assert bs.rjust(5) == bs 164 | assert bs.rjust(16) == bistr('', ' ') + bs 165 | 166 | 167 | def test_join(): 168 | assert bistr('').join([]) == bistr('') 169 | 170 | sep = bistr('|', '::') 171 | args = ['Hello', bistr('WORLD').lower()] 172 | assert sep.join(args) == args[0] + sep + args[1] 173 | 174 | 175 | def test_split(): 176 | bs = bistr('1,2,3') 177 | assert bs.split(',') == [bistr('1'), bistr('2'), bistr('3')] 178 | assert bs.split(',', 1) == [bistr('1'), bistr('2,3')] 179 | 180 | assert bistr('1,2,,3,').split(',') == [bistr('1'), bistr('2'), bistr(''), bistr('3'), bistr('')] 181 | 182 | assert bistr('').split(',') == [bistr('')] 183 | 184 | assert bistr('1<>2<>3').split('<>') == [bistr('1'), bistr('2'), bistr('3')] 185 | 186 | bs = bistr(' 1 2 3 ') 187 | assert bs.split() == [bistr('1'), bistr('2'), bistr('3')] 188 | assert bs.split(maxsplit=-1) == [bistr('1'), bistr('2'), bistr('3')] 189 
    assert bs.split(maxsplit=2) == [bistr('1'), bistr('2'), bistr('3 ')]
    assert bs.split(maxsplit=1) == [bistr('1'), bistr('2  3 ')]

    assert bistr('').split() == []


def test_partition():
    bs = bistr('left::middle::right')

    left, sep, right = bs.partition('::')
    assert left == bistr('left')
    assert sep == bistr('::')
    assert right == bistr('middle::right')

    # A separator that never occurs leaves everything in `left`.
    left, sep, right = bs.partition(':::')
    assert left == bs
    assert sep == bistr('')
    assert right == bistr('')

    left, sep, right = bs.rpartition('::')
    assert left == bistr('left::middle')
    assert sep == bistr('::')
    assert right == bistr('right')

    # ...and rpartition puts everything in `right` instead.
    left, sep, right = bs.rpartition(':::')
    assert left == bistr('')
    assert sep == bistr('')
    assert right == bs


def test_expandtabs():
    bs = bistr(' \tHello\t\tworld!\n\tGoodbye \tworld!')
    bs = bs.expandtabs()

    assert bs.modified == bs.original.expandtabs()
    assert bs[0:1] == bistr(' ')
    assert bs[1:8] == bistr('\t', '       ')
    assert bs[8:13] == bistr('Hello')
    assert bs[13:16] == bistr('\t', '   ')
    assert bs[16:24] == bistr('\t', '        ')
    assert bs[24:30] == bistr('world!')
    assert bs[30:31] == bistr('\n')


def test_strip():
    bs = bistr('  Hello  world!  ')
    assert bs.original == '  Hello  world!  '
    assert bs.modified == '  Hello  world!  '

    # strip() removes from the modified string but keeps the original intact.
    bs = bs.strip()
    assert bs.original == '  Hello  world!  '
    assert bs.modified == 'Hello  world!'

    bs = bistr('    ').strip()
    assert bs.modified == ''
    assert bs.original == '    '


def test_casefold():
    # 'Híffi'
    # í has a combining acute accent, ffi is a ligature
    bs = bistr('Hi\u0301\uFB03').casefold()
    assert bs.original == 'Hi\u0301\uFB03'
    assert bs.modified == 'hi\u0301ffi'
    assert bs.modified == bs.original.casefold()

    assert bs[:3].original == 'Hi\u0301'
    assert bs[:3].modified == 'hi\u0301'

    assert bs[4:5].original == '\uFB03'
    assert bs[4:5].modified == 'f'

    # Odysseus
    bs = bistr('Ὀδυσσεύς').casefold()
    assert bs.original == 'Ὀδυσσεύς'
    assert bs.modified == 'ὀδυσσεύσ'


def test_lower():
    # Locale-sensitive: Turkish dotted/dotless I differs from English.
    bs = bistr('DİYARBAKIR').lower('en_US')
    assert bs.original == 'DİYARBAKIR'
    assert bs.modified == 'di̇yarbakir'

    bs = bistr('DİYARBAKIR').lower('tr_TR')
    assert bs.original == 'DİYARBAKIR'
    assert bs.modified == 'diyarbakır'

    # Odysseus
    bs = bistr('ὈΔΥΣΣΕΎΣ').lower('el_GR')
    assert bs.original == 'ὈΔΥΣΣΕΎΣ'
    assert bs.modified == 'ὀδυσσεύς'

    # Examples from The Unicode Standard, Version 12.0, Chapter 3.13
    bs = bistr('ᾼΣͅ').lower('el_GR')
    assert bs.original == 'ᾼΣͅ'
    assert bs.modified == 'ᾳςͅ'

    bs = bistr('ͅΣͅ').lower('el_GR')
    assert bs.original == 'ͅΣͅ'
    assert bs.modified == 'ͅσͅ'

    bs = bistr('ᾼΣᾼ').lower('el_GR')
    assert bs.original == 'ᾼΣᾼ'
    assert bs.modified == 'ᾳσᾳ'

    bs = bistr('Σ').lower('el_GR')
    assert bs.original == 'Σ'
    assert bs.modified == 'σ'


def test_upper():
    # ß uppercases to the two-character 'SS'.
    bs = bistr('straße').upper('de_DE')
    assert bs.original == 'straße'
    assert bs.modified == 'STRASSE'
    assert bs[4:6].original == 'ß'
    assert bs[4:6].modified == 'SS'

    bs = bistr('Diyarbakır').upper('tr_TR')
    assert bs.original == 'Diyarbakır'
    assert bs.modified == 'DİYARBAKIR'

    # Odysseus
    bs = bistr('Ὀδυσσεύς').upper('und')
    assert bs.original == 'Ὀδυσσεύς'
    assert bs.modified == 'ὈΔΥΣΣΕΎΣ'


def test_title():
    bs = bistr('istanbul').title('en_US')
    assert bs.original == 'istanbul'
    assert bs.modified == 'Istanbul'

    bs = bistr('istanbul').title('tr_TR')
    assert bs.original == 'istanbul'
    assert bs.modified == 'İstanbul'


def test_capitalize():
    bs = bistr('hello WORLD').capitalize('en_US')
    assert bs.original == 'hello WORLD'
    assert bs.modified == 'Hello world'
    assert bs.alignment == Alignment.identity(11)

    bs = bistr('τελικός').capitalize('el_GR')
    assert bs.original == 'τελικός'
    assert bs.modified == 'Τελικός'
    assert bs.alignment == Alignment.identity(7)

    bs = bistr('ἴΣ').capitalize('el_GR')
    assert bs.original == 'ἴΣ'
    assert bs.modified == 'Ἴς'
    assert bs.alignment == Alignment.identity(2)


def test_swapcase():
    bs = bistr('hello WORLD').swapcase('en_US')
    assert bs.original == 'hello WORLD'
    assert bs.modified == 'HELLO world'
    assert bs.alignment == Alignment.identity(11)

    # Ligatures/digraphs in title case don't have a swapped form
    bs = bistr('Ljepòta').swapcase('hr_HR')
    assert bs.original == 'Ljepòta'
    assert bs.modified == 'LjEPÒTA'
    assert bs.alignment == Alignment.identity(6)

    bs = bistr('Ljepòta').normalize('NFKC').swapcase('hr_HR')
    assert bs.original == 'Ljepòta'
    assert bs.modified == 'lJEPÒTA'
    assert bs[0:2] == bistr('Lj', 'lJ')


def test_normalize():
    # "Héllö" -- é is composed but ö has a combining diaeresis
    bs = bistr('H\u00E9llo\u0308').normalize('NFC')
    assert bs.original == 'H\u00E9llo\u0308'
    assert bs.modified == 'H\u00E9ll\u00F6'
    assert bs.modified == unicodedata.normalize('NFC', bs.original)
    assert bs[1:2] == bistr('\u00E9')
    assert bs[4:5] == bistr('o\u0308', '\u00F6')

    bs = bistr('H\u00E9llo\u0308').normalize('NFD')
    assert bs.original == 'H\u00E9llo\u0308'
    assert bs.modified == 'He\u0301llo\u0308'
    assert bs.modified == unicodedata.normalize('NFD', bs.original)
    assert bs[1:3] == bistr('\u00E9', 'e\u0301')
    assert bs[5:7] == bistr('o\u0308')


def test_readme():
    # Mirrors the pipeline shown in the project README.
    bs = bistr('𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 🐶')
    bs = bs.normalize('NFKD')
    bs = bs.casefold()
    bs = bs.replace('🦊', 'fox')
    bs = bs.replace('🐶', 'dog')
    bs = bs.sub(r'[^\w\s]+', '')
    bs = bs[:19]
    assert bs.modified == 'the quick brown fox'
    assert bs.original == '𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊'


def test_equality():
    # Equality depends on original, modified, and the alignment itself.
    bs1 = bistr('  Hello world  ').strip().casefold()
    bs2 = bistr('  Hello world  ', 'hello world', Alignment([
        (0, 0),
        (2, 0),
        (3, 1),
        (4, 2),
        (5, 3),
        (6, 4),
        (7, 5),
        (8, 6),
        (9, 7),
        (10, 8),
        (11, 9),
        (12, 10),
        (13, 11),
        (15, 11),
    ]))
    assert bs1 == bs2


def test_alternative_regex():
    # bistr.sub() accepts patterns compiled by the third-party `regex` module.
    import regex

    bs = bistr('The quick, brown 🦊 jumps over the lazy 🐶')
    bs = bs.sub(regex.compile(r'\pS'), lambda m: unicodedata.name(m.group()))
    assert bs[17:25] == bistr('🦊', 'FOX FACE')
    assert bs[46:] == bistr('🐶', 'DOG FACE')
--------------------------------------------------------------------------------
/python/tests/test_builder.py:
--------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
3 | 4 | from bistring import bistr, Alignment, BistrBuilder 5 | 6 | 7 | def test_chunk_words(): 8 | builder = BistrBuilder(' the quick brown fox ') 9 | builder.discard(2) 10 | builder.replace(3, 'the') 11 | builder.skip(1) 12 | builder.replace(5, 'quick') 13 | builder.replace(2, ' ') 14 | builder.replace(5, 'brown') 15 | builder.skip(1) 16 | builder.replace(3, 'fox') 17 | builder.discard(1) 18 | bs = builder.build() 19 | 20 | assert bs.original == ' the quick brown fox ' 21 | assert bs.modified == 'the quick brown fox' 22 | 23 | assert bs[0:1].original == 'the' 24 | assert bs[1:2].original == 'the' 25 | assert bs[2:3].original == 'the' 26 | 27 | assert bs[0:3].original == 'the' 28 | assert bs[1:3].original == 'the' 29 | 30 | assert bs[0:4].original == 'the ' 31 | assert bs[1:4].original == 'the ' 32 | 33 | assert bs[3:4].original == ' ' 34 | assert bs[9:10].original == ' ' 35 | 36 | assert bs[4:15].original == 'quick brown' 37 | assert bs[5:14].original == 'quick brown' 38 | 39 | assert bs[0:0].original == '' 40 | assert bs[10:10].original == '' 41 | 42 | 43 | def test_chunk_chars(): 44 | builder = BistrBuilder(' the quick brown fox ') 45 | builder.discard_match(r'\s+') 46 | while not builder.is_complete: 47 | builder.skip_match(r'\S+') 48 | builder.replace_match(r'\s+(?=\S)', ' ') 49 | builder.discard_match(r'\s+$') 50 | 51 | bs = builder.build() 52 | 53 | assert bs.original == ' the quick brown fox ' 54 | assert bs.modified == 'the quick brown fox' 55 | 56 | assert bs[0:1].original == 't' 57 | assert bs[1:2].original == 'h' 58 | assert bs[2:3].original == 'e' 59 | 60 | assert bs[0:3].original == 'the' 61 | assert bs[1:3].original == 'he' 62 | 63 | assert bs[0:4].original == 'the ' 64 | assert bs[1:4].original == 'he ' 65 | 66 | assert bs[3:4].original == ' ' 67 | assert bs[9:10].original == ' ' 68 | 69 | assert bs[4:15].original == 'quick brown' 70 | assert bs[5:14].original == 'uick brow' 71 | 72 | assert bs[0:0].original == '' 73 | assert bs[10:10].original == 
'' 74 | 75 | 76 | def test_empty_string(): 77 | builder = BistrBuilder('') 78 | bs = builder.build() 79 | assert bs.original == '' 80 | assert bs.modified == '' 81 | assert bs[0:0].original == '' 82 | 83 | 84 | def test_iterative(): 85 | builder = BistrBuilder("I wish I wouldn't've spent one thousand dollars.") 86 | builder.skip_match(r'[^.]*') 87 | builder.discard_rest() 88 | builder.rewind() 89 | builder.skip_match(r'I wish I '); 90 | builder.replace_match(r"wouldn't've", 'would not have'); 91 | builder.skip_match(r' spent '); 92 | builder.replace_match(r'one thousand dollars', '$1,000'); 93 | 94 | bs = builder.build() 95 | assert bs.original == "I wish I wouldn't've spent one thousand dollars." 96 | assert bs.modified == 'I wish I would not have spent $1,000' 97 | 98 | 99 | def test_replace_matches(): 100 | builder = BistrBuilder('the cheese that the mouse that the cat that the dog chased played with ate') 101 | builder.replace_next(r'that', 'which') 102 | builder.replace_all(r'that', 'whom') 103 | 104 | bs = builder.build() 105 | assert bs.original == 'the cheese that the mouse that the cat that the dog chased played with ate' 106 | assert bs.modified == 'the cheese which the mouse whom the cat whom the dog chased played with ate' 107 | 108 | 109 | def test_replace_backreference(): 110 | builder = BistrBuilder("it doesn't work and stuff doesn't get replaced") 111 | builder.replace_all(r"\bdoesn't (\S+)", r'\1s') 112 | 113 | bs = builder.build() 114 | assert bs.original == "it doesn't work and stuff doesn't get replaced" 115 | assert bs.modified == 'it works and stuff gets replaced' 116 | 117 | 118 | def test_append(): 119 | builder = BistrBuilder('hello WORLD') 120 | builder.append(bistr(builder.peek(5)).upper('en_US')) 121 | builder.skip(1) 122 | builder.append(bistr(builder.peek(5)).lower('en_US')) 123 | 124 | bs = builder.build() 125 | assert bs[1:4] == bistr('ell', 'ELL', Alignment.identity(3)) 126 | assert bs[7:10] == bistr('ORL', 'orl', 
Alignment.identity(3)) 127 | -------------------------------------------------------------------------------- /python/tests/test_token.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. All rights reserved. 2 | # Licensed under the MIT license. 3 | 4 | from bistring import bistr, Token, Tokenization, Tokenizer 5 | import pytest 6 | 7 | 8 | def test_tokenization(): 9 | text = bistr(' The quick, brown fox jumps over the lazy dog ') 10 | text = text.replace(',', '') 11 | text = text.sub(r'^ +| +$', '') 12 | 13 | tokens = Tokenization(text, [ 14 | Token.slice(text, 0, 3), 15 | Token.slice(text, 4, 9), 16 | Token.slice(text, 10, 15), 17 | Token.slice(text, 16, 19), 18 | Token.slice(text, 20, 25), 19 | Token.slice(text, 26, 30), 20 | Token.slice(text, 31, 34), 21 | Token.slice(text, 35, 39), 22 | Token.slice(text, 40, 43), 23 | ]) 24 | assert tokens.text == text 25 | assert tokens.text_bounds(1, 3) == (4, 15) 26 | assert tokens.original_bounds(1, 3) == (6, 18) 27 | assert tokens.bounds_for_text(0, 13) == (0, 3) 28 | assert tokens.bounds_for_original(0, 13) == (0, 2) 29 | assert tokens.slice_by_text(34, 43).substring() == bistr('lazy dog') 30 | assert tokens.slice_by_original(36, 48).substring() == bistr('the lazy dog') 31 | assert tokens.snap_text_bounds(2, 13) == (0, 15) 32 | assert tokens.snap_original_bounds(36, 47) == (34, 46) 33 | 34 | 35 | def test_infer(): 36 | text = 'the quick, brown fox' 37 | tokens = Tokenization.infer(text, ['the', 'quick', 'brown', 'fox']) 38 | assert tokens.substring(1, 3) == bistr('quick, brown') 39 | 40 | pytest.raises(ValueError, Tokenization.infer, text, ['the', 'quick', 'red', 'fox']) 41 | 42 | 43 | def test_regex_tokenizer(): 44 | from bistring import RegexTokenizer 45 | 46 | text = bistr(' 𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌 ') 47 | text = text.normalize('NFKD') 48 | text = text.casefold() 49 | 50 | tokenizer = RegexTokenizer(r'\w+') 51 | assert 
isinstance(tokenizer, Tokenizer) 52 | 53 | tokens = tokenizer.tokenize(text) 54 | assert tokens.text == text 55 | assert len(tokens) == 9 56 | assert tokens.text_bounds(0, 2) == (1, 10) 57 | assert tokens[0:2].substring() == text[1:10] 58 | assert len(tokens.slice_by_text(5, 10)) == 1 59 | assert len(tokens.slice_by_text(5, 11)) == 1 60 | assert len(tokens.slice_by_text(3, 13)) == 3 61 | 62 | 63 | def test_splitting_tokenizer(): 64 | from bistring import SplittingTokenizer 65 | 66 | text = bistr(' 𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌 ') 67 | text = text.normalize('NFKD') 68 | text = text.casefold() 69 | 70 | tokenizer = SplittingTokenizer(r'\s+') 71 | assert isinstance(tokenizer, Tokenizer) 72 | 73 | tokens = tokenizer.tokenize(text) 74 | assert tokens.text == text 75 | assert len(tokens) == 9 76 | assert tokens.text_bounds(0, 2) == (1, 11) 77 | assert tokens[0:2].substring() == text[1:11] 78 | assert len(tokens.slice_by_text(5, 10)) == 1 79 | assert len(tokens.slice_by_text(5, 11)) == 1 80 | assert len(tokens.slice_by_text(3, 13)) == 3 81 | 82 | 83 | def test_character_tokenizer(): 84 | from bistring import CharacterTokenizer 85 | 86 | text = bistr(' 𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌 ') 87 | 88 | tokenizer = CharacterTokenizer('en_US') 89 | assert isinstance(tokenizer, Tokenizer) 90 | 91 | tokens = tokenizer.tokenize(text) 92 | assert tokens.text == text 93 | assert all(token.text == text[i:i+1] for i, token in enumerate(tokens)) 94 | 95 | 96 | def test_word_tokenizer(): 97 | from bistring import WordTokenizer 98 | 99 | text = bistr(' 𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌 ') 100 | 101 | tokenizer = WordTokenizer('en_US') 102 | assert isinstance(tokenizer, Tokenizer) 103 | 104 | tokens = tokenizer.tokenize(text) 105 | assert tokens.text == text 106 | assert len(tokens) == 9 107 | assert tokens.text_bounds(0, 2) == (1, 10) 108 | assert tokens[0:2].substring() == text[1:10] 109 | assert len(tokens.slice_by_text(5, 10)) == 1 110 | assert 
len(tokens.slice_by_text(5, 11)) == 1 111 | assert len(tokens.slice_by_text(3, 13)) == 3 112 | 113 | 114 | def test_sentence_tokenizer(): 115 | from bistring import SentenceTokenizer 116 | 117 | text = bistr('The following sentence is true. The preceeding sentence, surprisingly, is false.') 118 | 119 | tokenizer = SentenceTokenizer('en_US') 120 | assert isinstance(tokenizer, Tokenizer) 121 | 122 | tokens = tokenizer.tokenize(text) 123 | assert tokens.text == text 124 | assert len(tokens) == 2 125 | assert tokens[0].text == text[:33] 126 | assert tokens[1].text == text[33:] 127 | --------------------------------------------------------------------------------