├── tests ├── nim.cfg └── tests.nim ├── CHANGELOG.md ├── .gitignore ├── segmentation.nimble ├── .github └── workflows │ └── ci.yml ├── README.md ├── LICENSE ├── gen └── gen_re_words.nim ├── src └── segmentation.nim └── docs └── index.html /tests/nim.cfg: -------------------------------------------------------------------------------- 1 | --path:"../src/" 2 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | v0.0.1 2 | ================== 3 | 4 | * Initial release 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | nimcache/ 2 | tests/tests 3 | src/segmentation 4 | gen/gen_re_words 5 | docs/ugh 6 | config.nims 7 | -------------------------------------------------------------------------------- /segmentation.nimble: -------------------------------------------------------------------------------- 1 | # Package 2 | 3 | version = "0.1.0" 4 | author = "Esteban Castro Borsani (@nitely)" 5 | description = "Unicode text segmentation tr29" 6 | license = "MIT" 7 | srcDir = "src" 8 | skipDirs = @["tests", "gen"] 9 | 10 | requires "nim >= 0.19.0" 11 | requires "unicodedb >= 0.8.0" 12 | 13 | task test, "Test": 14 | exec "nim c -r src/segmentation.nim" 15 | exec "nim c -r tests/tests.nim" 16 | 17 | # Test runnable examples 18 | #exec "nim doc -o:./docs/ugh/ugh.html ./src/segmentation.nim" 19 | 20 | task docs, "Docs": 21 | exec "nim doc -o:./docs/index.html ./src/segmentation.nim" 22 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | 9 | jobs: 10 | test: 11 | name: Nim ${{ matrix.nim }} 12 | runs-on: 
ubuntu-latest 13 | strategy: 14 | matrix: 15 | nim: [1.0.0, 1.0.2, 1.0.4, 1.0.10, 1.2.0, 1.2.2, 1.2.4, 1.2.6, 1.2.8, 1.2.12, 1.4.0, 1.4.2, 1.4.4, 1.4.8, 1.6.0] 16 | steps: 17 | - uses: actions/checkout@v2 18 | - name: Run Tests 19 | run: | 20 | docker pull nimlang/nim:${{ matrix.nim }} 21 | docker run --rm -v `pwd`:/usr/src/app -w /usr/src/app nimlang/nim:${{ matrix.nim }} /bin/bash -c "nimble install -y; nimble test" 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Segmentation 2 | 3 | [![licence](https://img.shields.io/github/license/nitely/nim-segmentation.svg?style=flat-square)](https://raw.githubusercontent.com/nitely/nim-segmentation/master/LICENSE) 4 | 5 | An implementation of [Unicode Text Segmentation](https://unicode.org/reports/tr29/) (tr29). The splitting is made through a fast DFA. 6 | 7 | > See [nim-graphemes](https://github.com/nitely/nim-graphemes) for grapheme cluster segmentation 8 | 9 | ## Install 10 | 11 | ``` 12 | nimble install segmentation 13 | ``` 14 | 15 | ## Compatibility 16 | 17 | Nim 0.19, 0.20, 1.0.4+ 18 | 19 | ## Usage 20 | 21 | ```nim 22 | import sequtils 23 | import segmentation 24 | 25 | assert toSeq("The (“brown”) fox can’t jump 32.3 feet, right?".words) == 26 | @["The", " ", "(", "“", "brown", "”", ")", " ", "fox", " ", 27 | "can’t", " ", "jump", " ", "32.3", " ", "feet", ",", " ", 28 | "right", "?"] 29 | ``` 30 | 31 | ## Docs 32 | 33 | [Read the docs](https://nitely.github.io/nim-segmentation/) 34 | 35 | ## Tests 36 | 37 | ``` 38 | nimble test 39 | ``` 40 | 41 | ## LICENSE 42 | 43 | MIT 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Esteban Castro Borsani 4 | 5 | Permission is hereby granted, free of charge, to any person 
obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /tests/tests.nim: -------------------------------------------------------------------------------- 1 | import unittest, strutils 2 | import unicode except strip 3 | import sequtils 4 | 5 | import segmentation 6 | 7 | proc wbreak(s: string): seq[string] = 8 | toSeq(s.words) 9 | 10 | test "Test words break": 11 | var i = 0 12 | for line in "./tests/WordBreakTest.txt".lines: 13 | var text = line.split('#', 1)[0] 14 | if text.strip.len == 0: 15 | continue 16 | var wordsFromTest: seq[string] 17 | for ch1 in text.split("÷"): 18 | if ch1.strip.len == 0: 19 | continue 20 | var words = "" 21 | for ch2 in ch1.split("×"): 22 | if ch2.strip.len == 0: 23 | continue 24 | words.add ch2.strip.parseHexInt.Rune.toUTF8 25 | wordsFromTest.add words 26 | check toSeq(wordsFromTest.join.words) == wordsFromTest 27 | inc i 28 | echo "$# words tested" % [$i] 29 | 30 | test "Test some words": 31 | # From the txt file 32 | check wbreak("1,̈1.⁠") == @["1,̈1", ".⁠"] 33 | check wbreak("\n̈‍") == @["\n", "̈‍"] # 0xa 0x308 0x200d 34 | check wbreak("〱A") == @["〱", "A"] 35 | check wbreak("A_0_〱_") == @["A_0_〱_"] 36 | # ZWJ, checked at https://unicode.org/cldr/utility/breaks.jsp 37 | check "🛑‍🛑".wbreak == @["🛑‍🛑"] 38 | check "a🇦🇧🇨🇩b".wbreak == @["a", "🇦🇧", "🇨🇩", "b"] 39 | check "a‍🛑".wbreak == @["a‍🛑"] 40 | check "👶🏿̈‍👶🏿".wbreak == @["👶🏿̈‍👶🏿"] 41 | check " ‍ن".wbreak == @[" ‍", "ن"] # Space ZWJ letter 42 | check " ‍🛑".wbreak == @[" ‍🛑"] # Space Space ZWJ Emoji 43 | 44 | test "Test misc": 45 | check wbreak("11 aa 22 bb 1.2 1,2 $1,2 $1") == 46 | @["11", " ", "aa", " ", "22", " ", "bb", " ", "1.2", " ", 47 | "1,2", " ", "$", "1,2", " ", "$", "1"] 48 | check wbreak("abc abc ghi can't") == 49 | @["abc", " ", "abc", " ", "ghi", " ", "can\'t"] 50 | check wbreak("The quick? 
(“brown”) fox can’t jump 32.3 feet, right?") == 51 | @["The", " ", "quick", "?", " ", "(", "“", "brown", "”", ")", 52 | " ", "fox", " ", "can’t", " ", "jump", " ", "32.3", " ", "feet", 53 | ",", " ", "right", "?"] 54 | check wbreak("3.2 3a 3.2a 3.2a3.2a a3.2 3. a3a a3.2a 1to1 1-1 1'1 1'a 1''1") == 55 | @["3.2", " ", "3a", " ", "3.2a", " ", "3.2a3.2a", " ", "a3.2", 56 | " ", "3", ".", " ", "a3a", " ", "a3.2a", " ", "1to1", " ", "1", 57 | "-", "1", " ", "1'1", " ", "1", "'", "a", " ", "1", "'", "'", "1"] 58 | 59 | test "Test wordsBounds": 60 | check toSeq("abc def?".wordsBounds) == 61 | @[0 .. 2, 3 .. 3, 4 .. 6, 7 .. 7] 62 | -------------------------------------------------------------------------------- /gen/gen_re_words.nim: -------------------------------------------------------------------------------- 1 | import strutils 2 | 3 | const 4 | unicodeVersion* = "12.1.0" 5 | specVersion* = "29" 6 | specURL* = "http://www.unicode.org/reports/tr29/" 7 | 8 | # Rules without "Ignore Format and Extend characters" 9 | #[ 10 | ( 11 | CR LF 12 | | Newline | CR | LF 13 | | ZWJ Extended_Pictographic 14 | | WSegSpace+ 15 | | AHLetter+ 16 | | AHLetter ((MidLetter | MidNumLetQ) AHLetter)+ 17 | | Hebrew_Letter Single_Quote 18 | | Hebrew_Letter (Double_Quote Hebrew_Letter)+ 19 | | Numeric+ 20 | | (AHLetter Numeric)+ 21 | | (Numeric AHLetter)+ 22 | | Numeric ((MidNum | MidNumLetQ) Numeric)+ 23 | | Katakana+ 24 | | ((AHLetter | Numeric | Katakana | ExtendNumLet) ExtendNumLet)+ 25 | | (RI RI)+ 26 | | Other 27 | ) 28 | 29 | The following rule handles: AHLetter+, Numeric+, (AHLetter | Numeric)+, 30 | and merge of rules (AHLetter ((MidLetter | MidNumLetQ) AHLetter)+) 31 | and (Numeric ((MidNum | MidNumLetQ) Numeric)+), 32 | also (Hebrew_Letter Single_Quote Hebrew_Letter)+ 33 | 34 | ( 35 | AHLetter ((MidLetter | MidNumLetQ) AHLetter)* 36 | | Numeric ((MidNum | MidNumLetQ) Numeric)* 37 | )+ 38 | ]# 39 | 40 | # Handmade regex based on the word-break table in the spec 41 | # Apparently 
anything can be before "ZWJ EMOJI", albeit the spec does 42 | # not mention it 43 | # Reference (X: (Extend | Format | ZWJ)*) 44 | const pattern = 45 | """ 46 | ( 47 | CR LF 48 | | Newline | CR | LF 49 | | ( 50 | ZWJ Extended_Pictographic 51 | | WSegSpace+ 52 | | ( 53 | AHLetter X ((MidLetter | MidNumLetQ) X AHLetter X)* 54 | | Numeric X ((MidNum | MidNumLetQ) X Numeric X)* 55 | | ExtendNumLet X (Katakana+ X ExtendNumLet X)* 56 | )+ 57 | | Hebrew_Letter X Single_Quote 58 | | Hebrew_Letter X (Double_Quote X Hebrew_Letter X)+ 59 | | ((Katakana | ExtendNumLet) X)+ 60 | | RegionalIndicator X RegionalIndicator 61 | | Other 62 | ) X (ZWJ Extended_Pictographic X)* 63 | ) 64 | """ 65 | 66 | # IDs must be in non-overlapping substring order (i.e longest to shortest) 67 | const identifiers = [ 68 | "__EOF__", # Reserved for the DFA 69 | "Extended_Pictographic", 70 | "RegionalIndicator", 71 | "Hebrew_Letter", 72 | "Single_Quote", 73 | "Double_Quote", 74 | "ExtendNumLet", 75 | "MidNumLet", 76 | #"MidNumLetQ", 77 | "WSegSpace", 78 | "MidLetter", 79 | "Katakana", 80 | "ALetter", 81 | #"AHLetter", 82 | "Numeric", 83 | "Newline", 84 | "Extend", 85 | "Format", 86 | "MidNum", 87 | "Other", 88 | "ZWJ", 89 | "CR", 90 | "LF" 91 | ] 92 | 93 | const anyOther = [ 94 | "Extended_Pictographic", 95 | "RegionalIndicator", 96 | "Hebrew_Letter", 97 | "Single_Quote", 98 | "Double_Quote", 99 | "ExtendNumLet", 100 | "MidNumLet", 101 | "WSegSpace", 102 | "MidLetter", 103 | "Katakana", 104 | "ALetter", 105 | "Numeric", 106 | #"Newline", 107 | "Extend", 108 | "Format", 109 | "MidNum", 110 | "Other", 111 | "ZWJ", 112 | #"CR", 113 | #"LF" 114 | ] 115 | 116 | var letters = "" 117 | for c in 'a' .. 
'z': 118 | letters.add(c) 119 | 120 | proc buildRePattern(p: string): string = 121 | assert len(identifiers) <= len(letters) 122 | result = p 123 | result = replace(result, "Other", "(" & anyOther.join(" | ") & ")") 124 | result = replace(result, "AHLetter", "(ALetter | Hebrew_Letter)") 125 | result = replace(result, "MidNumLetQ", "(MidNumLet | Single_Quote)") 126 | result = replace(result, "X", "(Extend | Format | ZWJ)*") 127 | result = replace(result, "(", "(?:") 128 | for i, id in identifiers: 129 | result = replace(result, id, "" & letters[i]) 130 | result = replace(result, " ") 131 | result = replace(result, "\p") 132 | result = replace(result, "\n") 133 | 134 | when isMainModule: 135 | echo "pattern:" 136 | echo buildRePattern(pattern) 137 | -------------------------------------------------------------------------------- /src/segmentation.nim: -------------------------------------------------------------------------------- 1 | ## This library implements Unicode Text Segmentation (tr29) 2 | 3 | import macros 4 | import unicode 5 | 6 | import unicodedb/segmentation 7 | 8 | # Not every state can exit, so this needs backtracking 9 | # Auto generated with github@nitely/regexy 10 | # See ../gen/gen_re_words.nim for the original regex 11 | const wordBreakTable = [ 12 | [-1'i8, 82, 80, 72, 71, 70, 58, 57, 56, 55, 53, 52, 14, 85, 13, 12, 11, 10, 1, 84, 83], 13 | [0'i8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 8, 7, -1, -1, 2, -1, -1], 14 | [0'i8, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6, 5, -1, -1, 2, -1, -1], 15 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 3, -1, -1, 4, -1, -1], 16 | [0'i8, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 3, -1, -1, 4, -1, -1], 17 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1], 18 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1], 19 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, 
-1], 20 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1], 21 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1], 22 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1], 23 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1], 24 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1], 25 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1], 26 | [0'i8, -1, -1, 47, 49, -1, 46, 50, -1, -1, -1, 48, 14, -1, 14, 14, 51, -1, 15, -1, -1], 27 | [0'i8, 3, -1, 39, 41, -1, 16, 44, -1, -1, -1, 40, 14, -1, 14, 14, 45, -1, 15, -1, -1], 28 | [0'i8, -1, -1, 36, -1, -1, 16, -1, -1, -1, 38, 37, 14, -1, 16, 16, -1, -1, 17, -1, -1], 29 | [0'i8, 3, -1, 18, -1, -1, 16, -1, -1, -1, 30, 29, 14, -1, 16, 16, -1, -1, 17, -1, -1], 30 | [0'i8, -1, -1, 18, 26, -1, 16, 27, -1, 28, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1], 31 | [0'i8, 3, -1, 18, 20, -1, 16, 24, -1, 25, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1], 32 | [-1'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 23, -1, -1, 20, 20, -1, -1, 20, -1, -1], 33 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 21, 21, -1, -1, 22, -1, -1], 34 | [0'i8, 3, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 21, 21, -1, -1, 22, -1, -1], 35 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 21, 21, -1, -1, 22, -1, -1], 36 | [-1'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 20, 20, -1, -1, 20, -1, -1], 37 | [-1'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 20, 20, -1, -1, 20, -1, -1], 38 | [-1'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 20, 20, -1, -1, 20, -1, -1], 39 | [-1'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 20, 20, -1, -1, 20, -1, -1], 40 | [-1'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 20, 20, -1, -1, 20, -1, -1], 41 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 
14, -1, 18, 18, -1, -1, 19, -1, -1], 42 | [-1'i8, -1, -1, -1, -1, -1, 31, -1, -1, -1, 30, -1, -1, -1, 35, 34, -1, -1, 33, -1, -1], 43 | [0'i8, -1, -1, 18, -1, -1, 16, -1, -1, -1, 30, 18, 14, -1, 31, 31, -1, -1, 32, -1, -1], 44 | [0'i8, 3, -1, 18, -1, -1, 16, -1, -1, -1, 30, 18, 14, -1, 31, 31, -1, -1, 32, -1, -1], 45 | [-1'i8, -1, -1, -1, -1, -1, 31, -1, -1, -1, -1, -1, -1, -1, 33, 33, -1, -1, 33, -1, -1], 46 | [-1'i8, -1, -1, -1, -1, -1, 31, -1, -1, -1, -1, -1, -1, -1, 33, 33, -1, -1, 33, -1, -1], 47 | [-1'i8, -1, -1, -1, -1, -1, 31, -1, -1, -1, -1, -1, -1, -1, 33, 33, -1, -1, 33, -1, -1], 48 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1], 49 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1], 50 | [-1'i8, -1, -1, -1, -1, -1, 31, -1, -1, -1, 30, -1, -1, -1, 33, 33, -1, -1, 33, -1, -1], 51 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1], 52 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1], 53 | [-1'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 42, -1, 41, 41, -1, -1, 41, -1, -1], 54 | [0'i8, -1, -1, 18, 41, -1, 16, 41, -1, -1, -1, 18, 14, -1, 42, 42, 41, -1, 43, -1, -1], 55 | [0'i8, 3, -1, 18, 41, -1, 16, 41, -1, -1, -1, 18, 14, -1, 42, 42, 41, -1, 43, -1, -1], 56 | [-1'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 42, -1, 41, 41, -1, -1, 41, -1, -1], 57 | [-1'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 42, -1, 41, 41, -1, -1, 41, -1, -1], 58 | [0'i8, -1, -1, 18, -1, -1, 16, -1, -1, -1, 30, 18, 14, -1, 16, 16, -1, -1, 17, -1, -1], 59 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1], 60 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1], 61 | [-1'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 42, -1, 41, 41, -1, -1, 41, -1, -1], 62 | [-1'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 42, -1, 41, 41, -1, -1, 41, -1, -1], 
63 | [-1'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 42, -1, 41, 41, -1, -1, 41, -1, -1], 64 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1], 65 | [0'i8, -1, -1, -1, -1, -1, 53, -1, -1, -1, 53, -1, -1, -1, 53, 53, -1, -1, 54, -1, -1], 66 | [0'i8, 3, -1, -1, -1, -1, 53, -1, -1, -1, 53, -1, -1, -1, 53, 53, -1, -1, 54, -1, -1], 67 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1], 68 | [0'i8, -1, -1, -1, -1, -1, -1, -1, 56, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1], 69 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1], 70 | [0'i8, -1, -1, 18, -1, -1, 58, -1, -1, -1, 59, 18, 14, -1, 58, 58, -1, -1, 69, -1, -1], 71 | [0'i8, -1, -1, -1, -1, -1, 68, -1, -1, -1, 59, -1, -1, -1, 67, 66, -1, -1, 60, -1, -1], 72 | [0'i8, 3, -1, -1, -1, -1, 65, -1, -1, -1, 53, -1, -1, -1, 64, 61, -1, -1, 60, -1, -1], 73 | [0'i8, -1, -1, -1, -1, -1, 62, -1, -1, -1, 53, -1, -1, -1, 61, 61, -1, -1, 60, -1, -1], 74 | [0'i8, -1, -1, 18, -1, -1, 58, -1, -1, -1, 59, 18, 14, -1, 62, 62, -1, -1, 63, -1, -1], 75 | [0'i8, 3, -1, 18, -1, -1, 58, -1, -1, -1, 59, 18, 14, -1, 62, 62, -1, -1, 63, -1, -1], 76 | [0'i8, -1, -1, -1, -1, -1, 62, -1, -1, -1, 53, -1, -1, -1, 61, 61, -1, -1, 60, -1, -1], 77 | [0'i8, -1, -1, 18, -1, -1, 58, -1, -1, -1, 59, 18, 14, -1, 62, 62, -1, -1, 63, -1, -1], 78 | [0'i8, -1, -1, -1, -1, -1, 62, -1, -1, -1, 53, -1, -1, -1, 61, 61, -1, -1, 60, -1, -1], 79 | [0'i8, -1, -1, -1, -1, -1, 62, -1, -1, -1, 53, -1, -1, -1, 61, 61, -1, -1, 60, -1, -1], 80 | [0'i8, -1, -1, 18, -1, -1, 58, -1, -1, -1, 59, 18, 14, -1, 62, 62, -1, -1, 63, -1, -1], 81 | [0'i8, 3, -1, 18, -1, -1, 58, -1, -1, -1, 59, 18, 14, -1, 58, 58, -1, -1, 69, -1, -1], 82 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1], 83 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1], 84 | [0'i8, -1, -1, 18, 79, 73, 16, 20, -1, 20, -1, 18, 14, -1, 
72, 72, -1, -1, 76, -1, -1], 85 | [-1'i8, -1, -1, 74, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 73, 73, -1, -1, 73, -1, -1], 86 | [0'i8, -1, -1, -1, -1, 73, -1, -1, -1, -1, -1, -1, -1, -1, 74, 74, -1, -1, 75, -1, -1], 87 | [0'i8, 3, -1, -1, -1, 73, -1, -1, -1, -1, -1, -1, -1, -1, 74, 74, -1, -1, 75, -1, -1], 88 | [0'i8, 3, -1, 18, 77, 73, 16, 20, -1, 20, -1, 18, 14, -1, 72, 72, -1, -1, 76, -1, -1], 89 | [0'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 77, 77, -1, -1, 78, -1, -1], 90 | [0'i8, 3, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 77, 77, -1, -1, 78, -1, -1], 91 | [0'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 77, 77, -1, -1, 78, -1, -1], 92 | [0'i8, -1, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 80, 80, -1, -1, 81, -1, -1], 93 | [0'i8, 3, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 80, 80, -1, -1, 81, -1, -1], 94 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1], 95 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], 96 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 83], 97 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]] 98 | 99 | func genWordBreakMap(prop: NimNode): NimNode = 100 | ## Gen mapping from word-break prop to DFA column 101 | # from gen/gen_re_words.nim 102 | const idnts = [ 103 | "__EOF__", # Reserved for the DFA 104 | "Extended_Pictographic", 105 | "RegionalIndicator", 106 | "Hebrew_Letter", 107 | "Single_Quote", 108 | "Double_Quote", 109 | "ExtendNumLet", 110 | "MidNumLet", 111 | #"MidNumLetQ", 112 | "WSegSpace", 113 | "MidLetter", 114 | "Katakana", 115 | "ALetter", 116 | #"AHLetter", 117 | "Numeric", 118 | "Newline", 119 | "Extend", 120 | "Format", 121 | "MidNum", 122 | "Other", 123 | "ZWJ", 124 | "CR", 125 | "LF" 126 | ] 127 | var caseStmt: seq[NimNode] 128 | caseStmt.add(prop) 129 | for i in 1 .. 
idnts.len-1: 130 | caseStmt.add(newTree(nnkOfBranch, 131 | ident("sgw" & idnts[i]), 132 | newLit i)) 133 | let falseLit = newLit false 134 | let badResultLit = newLit -1 135 | caseStmt.add(newTree(nnkElse, 136 | quote do: 137 | doAssert `falseLit` 138 | `badResultLit`)) 139 | result = newStmtList( 140 | newTree(nnkCaseStmt, caseStmt)) 141 | 142 | macro genWordBreakMap(prop: SgWord): untyped = 143 | result = genWordBreakMap(prop) 144 | when defined(reDumpWordBreak) or defined(reDumpWrodBreak): # the misspelled legacy flag is still honoured 145 | echo "==== genWordBreakMap ====" 146 | echo repr(result) 147 | 148 | # XXX wordBounds (not words) 149 | iterator wordsBounds*(s: string): Slice[int] {.inline.} = 150 | ## Return each word boundary in `s`. Boundaries are inclusive 151 | var 152 | state, a, b, c = 0 153 | r: Rune 154 | while b < s.len: 155 | state = 0 156 | while true: 157 | fastRuneAt(s, b, r, true) 158 | let prop = genWordBreakMap(wordBreakProp(r)) 159 | let next = wordBreakTable[state][prop] 160 | if next == -1: 161 | doAssert state > 0 162 | b = c 163 | break 164 | # save point 165 | if wordBreakTable[next][0] == 0: 166 | c = b 167 | if b >= s.len: 168 | b = c 169 | break 170 | state = next 171 | doAssert b > a 172 | yield a .. b-1 173 | a = b 174 | 175 | iterator words*(s: string): string {.inline.} = 176 | ## Return each word in `s` 177 | for b in s.wordsBounds: 178 | yield s[b] 179 | 180 | when isMainModule: 181 | block: 182 | echo "Test genWordBreakMap" 183 | var i = 0 184 | for cp in 0 .. 0x10FFFF: 185 | doAssert genWordBreakMap(wordBreakProp(Rune(cp))) >= 0 186 | inc i 187 | doAssert i == 0x10FFFF+1 188 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | segmentation 20 | 786 | 787 | 788 | 789 | 802 | 803 | 804 | 805 |
806 |
807 |

segmentation

808 |
809 |
810 | 814 |
815 | Search: 817 |
818 |
819 | Group by: 820 | 824 |
825 | 838 | 839 |
840 |
841 |
842 | 843 |

This library implements Unicode Text Segmentation (tr29)

844 |
845 |

Iterators

846 |
847 | 848 |
iterator wordsBounds(s: string): Slice[int] {...}{.inline, raises: [], tags: [].}
849 |
850 | 851 | Return each word boundary in s. Boundaries are inclusive 852 | 853 |
854 | 855 |
iterator words(s: string): string {...}{.inline, raises: [], tags: [].}
856 |
857 | 858 | Return each word in s 859 | 860 |
861 | 862 |
863 | 864 |
865 |
866 | 867 |
868 | 873 |
874 |
875 |
876 | 877 | 878 | 879 | --------------------------------------------------------------------------------