├── tests
├── nim.cfg
└── tests.nim
├── CHANGELOG.md
├── .gitignore
├── segmentation.nimble
├── .github
└── workflows
│ └── ci.yml
├── README.md
├── LICENSE
├── gen
└── gen_re_words.nim
├── src
└── segmentation.nim
└── docs
└── index.html
/tests/nim.cfg:
--------------------------------------------------------------------------------
1 | --path:"../src/"
2 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | v0.1.0
2 | ==================
3 |
4 | * Initial release
5 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | nimcache/
2 | tests/tests
3 | src/segmentation
4 | gen/gen_re_words
5 | docs/ugh
6 | config.nims
7 |
--------------------------------------------------------------------------------
/segmentation.nimble:
--------------------------------------------------------------------------------
1 | # Package
2 |
3 | version = "0.1.0"
4 | author = "Esteban Castro Borsani (@nitely)"
5 | description = "Unicode text segmentation tr29"
6 | license = "MIT"
7 | srcDir = "src"
8 | skipDirs = @["tests", "gen"]
9 |
10 | requires "nim >= 0.19.0"
11 | requires "unicodedb >= 0.8.0"
12 |
13 | task test, "Test":
14 | exec "nim c -r src/segmentation.nim"
15 | exec "nim c -r tests/tests.nim"
16 |
17 | # Test runnable examples
18 | #exec "nim doc -o:./docs/ugh/ugh.html ./src/segmentation.nim"
19 |
20 | task docs, "Docs":
21 | exec "nim doc -o:./docs/index.html ./src/segmentation.nim"
22 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 | push:
5 | branches:
6 | - master
7 | pull_request:
8 |
9 | jobs:
10 | test:
11 | name: Nim ${{ matrix.nim }}
12 | runs-on: ubuntu-latest
13 | strategy:
14 | matrix:
15 | nim: [1.0.0, 1.0.2, 1.0.4, 1.0.10, 1.2.0, 1.2.2, 1.2.4, 1.2.6, 1.2.8, 1.2.12, 1.4.0, 1.4.2, 1.4.4, 1.4.8, 1.6.0]
16 | steps:
17 | - uses: actions/checkout@v2
18 | - name: Run Tests
19 | run: |
20 | docker pull nimlang/nim:${{ matrix.nim }}
21 | docker run --rm -v `pwd`:/usr/src/app -w /usr/src/app nimlang/nim:${{ matrix.nim }} /bin/bash -c "nimble install -y; nimble test"
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Segmentation
2 |
3 | [MIT License](https://raw.githubusercontent.com/nitely/nim-segmentation/master/LICENSE)
4 |
5 | An implementation of [Unicode Text Segmentation](https://unicode.org/reports/tr29/) (tr29). The splitting is made through a fast DFA.
6 |
7 | > See [nim-graphemes](https://github.com/nitely/nim-graphemes) for grapheme cluster segmentation
8 |
9 | ## Install
10 |
11 | ```
12 | nimble install segmentation
13 | ```
14 |
15 | ## Compatibility
16 |
17 | Nim 0.19, 0.20, and 1.0.4 or later
18 |
19 | ## Usage
20 |
21 | ```nim
22 | import sequtils
23 | import segmentation
24 |
25 | assert toSeq("The (“brown”) fox can’t jump 32.3 feet, right?".words) ==
26 | @["The", " ", "(", "“", "brown", "”", ")", " ", "fox", " ",
27 | "can’t", " ", "jump", " ", "32.3", " ", "feet", ",", " ",
28 | "right", "?"]
29 | ```
30 |
31 | ## Docs
32 |
33 | [Read the docs](https://nitely.github.io/nim-segmentation/)
34 |
35 | ## Tests
36 |
37 | ```
38 | nimble test
39 | ```
40 |
41 | ## LICENSE
42 |
43 | MIT
44 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Esteban Castro Borsani
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/tests/tests.nim:
--------------------------------------------------------------------------------
1 | import unittest, strutils
2 | import unicode except strip
3 | import sequtils
4 |
5 | import segmentation
6 |
proc wbreak(s: string): seq[string] =
  ## Collect every segment yielded by `words` for `s` into a seq.
  for segment in s.words:
    result.add segment
9 |
test "Test words break":
  # Everything after the first '#' on a line is discarded. What remains
  # is split on "÷" into expected segments, and each segment is split on
  # "×" into hexadecimal code points that are re-encoded as UTF-8.
  var tested = 0
  for rawLine in "./tests/WordBreakTest.txt".lines:
    let spec = rawLine.split('#', 1)[0]
    if spec.strip.len > 0:
      var expected: seq[string]
      for segment in spec.split("÷"):
        if segment.strip.len > 0:
          var word = ""
          for hexCodePoint in segment.split("×"):
            let trimmed = hexCodePoint.strip
            if trimmed.len > 0:
              word.add trimmed.parseHexInt.Rune.toUTF8
          expected.add word
      # Segmenting the concatenation must give back the same pieces
      check toSeq(expected.join.words) == expected
      inc tested
  echo "$# words tested" % [$tested]
29 |
30 | test "Test some words":
31 | # From the txt file
32 | check wbreak("1,̈1.") == @["1,̈1", "."]
33 | check wbreak("\n̈") == @["\n", "̈"] # 0xa 0x308 0x200d
34 | check wbreak("〱A") == @["〱", "A"]
35 | check wbreak("A_0_〱_") == @["A_0_〱_"]
36 | # ZWJ, checked at https://unicode.org/cldr/utility/breaks.jsp
37 | check "🛑🛑".wbreak == @["🛑🛑"]
38 | check "a🇦🇧🇨🇩b".wbreak == @["a", "🇦🇧", "🇨🇩", "b"]
39 | check "a🛑".wbreak == @["a🛑"]
40 | check "👶🏿̈👶🏿".wbreak == @["👶🏿̈👶🏿"]
41 | check " ن".wbreak == @[" ", "ن"] # Space ZWJ letter
42 | check " 🛑".wbreak == @[" 🛑"] # Space Space ZWJ Emoji
43 |
test "Test misc":
  # Letters, digits, decimal/thousand separators and currency signs
  check "11 aa 22 bb 1.2 1,2 $1,2 $1".wbreak == @[
    "11", " ", "aa", " ", "22", " ", "bb", " ", "1.2", " ", "1,2", " ",
    "$", "1,2", " ", "$", "1"]
  # ASCII apostrophe inside a word
  check "abc abc ghi can't".wbreak == @[
    "abc", " ", "abc", " ", "ghi", " ", "can\'t"]
  # Typographic quotes and punctuation
  check "The quick? (“brown”) fox can’t jump 32.3 feet, right?".wbreak == @[
    "The", " ", "quick", "?", " ", "(", "“", "brown", "”", ")", " ",
    "fox", " ", "can’t", " ", "jump", " ", "32.3", " ", "feet", ",", " ",
    "right", "?"]
  # Mixed digit/letter runs with mid-number and mid-letter characters
  check "3.2 3a 3.2a 3.2a3.2a a3.2 3. a3a a3.2a 1to1 1-1 1'1 1'a 1''1".wbreak == @[
    "3.2", " ", "3a", " ", "3.2a", " ", "3.2a3.2a", " ", "a3.2", " ",
    "3", ".", " ", "a3a", " ", "a3.2a", " ", "1to1", " ",
    "1", "-", "1", " ", "1'1", " ", "1", "'", "a", " ",
    "1", "'", "'", "1"]
58 |
test "Test wordsBounds":
  # Bounds are inclusive byte slices: "abc", " ", "def", "?"
  let bounds = toSeq("abc def?".wordsBounds)
  check bounds == @[0 .. 2, 3 .. 3, 4 .. 6, 7 .. 7]
62 |
--------------------------------------------------------------------------------
/gen/gen_re_words.nim:
--------------------------------------------------------------------------------
1 | import strutils
2 |
3 | const
4 | unicodeVersion* = "12.1.0"
5 | specVersion* = "29"
6 | specURL* = "http://www.unicode.org/reports/tr29/"
7 |
8 | # Rules without "Ignore Format and Extend characters"
9 | #[
10 | (
11 | CR LF
12 | | Newline | CR | LF
13 | | ZWJ Extended_Pictographic
14 | | WSegSpace+
15 | | AHLetter+
16 | | AHLetter ((MidLetter | MidNumLetQ) AHLetter)+
17 | | Hebrew_Letter Single_Quote
18 | | Hebrew_Letter (Double_Quote Hebrew_Letter)+
19 | | Numeric+
20 | | (AHLetter Numeric)+
21 | | (Numeric AHLetter)+
22 | | Numeric ((MidNum | MidNumLetQ) Numeric)+
23 | | Katakana+
24 | | ((AHLetter | Numeric | Katakana | ExtendNumLet) ExtendNumLet)+
25 | | (RI RI)+
26 | | Other
27 | )
28 |
29 | The following rule handles: AHLetter+, Numeric+, (AHLetter | Numeric)+,
30 | and merge of rules (AHLetter ((MidLetter | MidNumLetQ) AHLetter)+)
31 | and (Numeric ((MidNum | MidNumLetQ) Numeric)+),
32 | also (Hebrew_Letter Single_Quote Hebrew_Letter)+
33 |
34 | (
35 | AHLetter ((MidLetter | MidNumLetQ) AHLetter)*
36 | | Numeric ((MidNum | MidNumLetQ) Numeric)*
37 | )+
38 | ]#
39 |
40 | # Handmade regex based on the word-break table in the spec
41 | # Apparently anything can be before "ZWJ EMOJI", albeit the spec does
42 | # not mention it
43 | # Reference (X: (Extend | Format | ZWJ)*)
44 | const pattern =
45 | """
46 | (
47 | CR LF
48 | | Newline | CR | LF
49 | | (
50 | ZWJ Extended_Pictographic
51 | | WSegSpace+
52 | | (
53 | AHLetter X ((MidLetter | MidNumLetQ) X AHLetter X)*
54 | | Numeric X ((MidNum | MidNumLetQ) X Numeric X)*
55 | | ExtendNumLet X (Katakana+ X ExtendNumLet X)*
56 | )+
57 | | Hebrew_Letter X Single_Quote
58 | | Hebrew_Letter X (Double_Quote X Hebrew_Letter X)+
59 | | ((Katakana | ExtendNumLet) X)+
60 | | RegionalIndicator X RegionalIndicator
61 | | Other
62 | ) X (ZWJ Extended_Pictographic X)*
63 | )
64 | """
65 |
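66 | # An ID that contains another ID as a substring (e.g. "MidNumLet" contains "MidNum") must be listed first, i.e. longest to shortest, because buildRePattern replaces the IDs in list order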
67 | const identifiers = [
68 | "__EOF__", # Reserved for the DFA
69 | "Extended_Pictographic",
70 | "RegionalIndicator",
71 | "Hebrew_Letter",
72 | "Single_Quote",
73 | "Double_Quote",
74 | "ExtendNumLet",
75 | "MidNumLet",
76 | #"MidNumLetQ",
77 | "WSegSpace",
78 | "MidLetter",
79 | "Katakana",
80 | "ALetter",
81 | #"AHLetter",
82 | "Numeric",
83 | "Newline",
84 | "Extend",
85 | "Format",
86 | "MidNum",
87 | "Other",
88 | "ZWJ",
89 | "CR",
90 | "LF"
91 | ]
92 |
93 | const anyOther = [
94 | "Extended_Pictographic",
95 | "RegionalIndicator",
96 | "Hebrew_Letter",
97 | "Single_Quote",
98 | "Double_Quote",
99 | "ExtendNumLet",
100 | "MidNumLet",
101 | "WSegSpace",
102 | "MidLetter",
103 | "Katakana",
104 | "ALetter",
105 | "Numeric",
106 | #"Newline",
107 | "Extend",
108 | "Format",
109 | "MidNum",
110 | "Other",
111 | "ZWJ",
112 | #"CR",
113 | #"LF"
114 | ]
115 |
# One lowercase ASCII letter per identifier; buildRePattern uses
# letters[i] as the single-character stand-in for identifiers[i].
var letters = ""
for code in ord('a') .. ord('z'):
  letters.add chr(code)
119 |
proc buildRePattern(p: string): string =
  ## Turn the human-readable pattern `p` into a compact regex string.
  ##
  ## Shorthand names are expanded first, every group is made
  ## non-capturing, each identifier is replaced by its one-letter
  ## stand-in from `letters`, and finally all spaces and newlines
  ## are removed.
  assert identifiers.len <= letters.len
  # The order matters: shorthands are expanded before "(" is rewritten,
  # so the groups they introduce become non-capturing as well.
  let expansions = [
    ["Other", "(" & anyOther.join(" | ") & ")"],
    ["AHLetter", "(ALetter | Hebrew_Letter)"],
    ["MidNumLetQ", "(MidNumLet | Single_Quote)"],
    ["X", "(Extend | Format | ZWJ)*"],
    ["(", "(?:"]]
  result = p
  for expansion in expansions:
    result = result.replace(expansion[0], expansion[1])
  for idx in 0 ..< identifiers.len:
    result = result.replace(identifiers[idx], $letters[idx])
  # Strip layout whitespace (replace with the default empty string)
  for layout in [" ", "\p", "\n"]:
    result = result.replace(layout)
133 |
when isMainModule:
  # Print the compact regex built from `pattern`
  echo "pattern:"
  let compact = buildRePattern(pattern)
  echo compact
137 |
--------------------------------------------------------------------------------
/src/segmentation.nim:
--------------------------------------------------------------------------------
1 | ## This library implements Unicode Text Segmentation (tr29)
2 |
3 | import macros
4 | import unicode
5 |
6 | import unicodedb/segmentation
7 |
8 | # Not every state can exit, so this needs backtracking
9 | # Auto generated with github@nitely/regexy
10 | # See ../gen/gen_re_words.nim for the original regex
11 | const wordBreakTable = [
12 | [-1'i8, 82, 80, 72, 71, 70, 58, 57, 56, 55, 53, 52, 14, 85, 13, 12, 11, 10, 1, 84, 83],
13 | [0'i8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 8, 7, -1, -1, 2, -1, -1],
14 | [0'i8, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6, 5, -1, -1, 2, -1, -1],
15 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 3, -1, -1, 4, -1, -1],
16 | [0'i8, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 3, -1, -1, 4, -1, -1],
17 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
18 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
19 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
20 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
21 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
22 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
23 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
24 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
25 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
26 | [0'i8, -1, -1, 47, 49, -1, 46, 50, -1, -1, -1, 48, 14, -1, 14, 14, 51, -1, 15, -1, -1],
27 | [0'i8, 3, -1, 39, 41, -1, 16, 44, -1, -1, -1, 40, 14, -1, 14, 14, 45, -1, 15, -1, -1],
28 | [0'i8, -1, -1, 36, -1, -1, 16, -1, -1, -1, 38, 37, 14, -1, 16, 16, -1, -1, 17, -1, -1],
29 | [0'i8, 3, -1, 18, -1, -1, 16, -1, -1, -1, 30, 29, 14, -1, 16, 16, -1, -1, 17, -1, -1],
30 | [0'i8, -1, -1, 18, 26, -1, 16, 27, -1, 28, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
31 | [0'i8, 3, -1, 18, 20, -1, 16, 24, -1, 25, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
32 | [-1'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 23, -1, -1, 20, 20, -1, -1, 20, -1, -1],
33 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 21, 21, -1, -1, 22, -1, -1],
34 | [0'i8, 3, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 21, 21, -1, -1, 22, -1, -1],
35 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 21, 21, -1, -1, 22, -1, -1],
36 | [-1'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 20, 20, -1, -1, 20, -1, -1],
37 | [-1'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 20, 20, -1, -1, 20, -1, -1],
38 | [-1'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 20, 20, -1, -1, 20, -1, -1],
39 | [-1'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 20, 20, -1, -1, 20, -1, -1],
40 | [-1'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 20, 20, -1, -1, 20, -1, -1],
41 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
42 | [-1'i8, -1, -1, -1, -1, -1, 31, -1, -1, -1, 30, -1, -1, -1, 35, 34, -1, -1, 33, -1, -1],
43 | [0'i8, -1, -1, 18, -1, -1, 16, -1, -1, -1, 30, 18, 14, -1, 31, 31, -1, -1, 32, -1, -1],
44 | [0'i8, 3, -1, 18, -1, -1, 16, -1, -1, -1, 30, 18, 14, -1, 31, 31, -1, -1, 32, -1, -1],
45 | [-1'i8, -1, -1, -1, -1, -1, 31, -1, -1, -1, -1, -1, -1, -1, 33, 33, -1, -1, 33, -1, -1],
46 | [-1'i8, -1, -1, -1, -1, -1, 31, -1, -1, -1, -1, -1, -1, -1, 33, 33, -1, -1, 33, -1, -1],
47 | [-1'i8, -1, -1, -1, -1, -1, 31, -1, -1, -1, -1, -1, -1, -1, 33, 33, -1, -1, 33, -1, -1],
48 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
49 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
50 | [-1'i8, -1, -1, -1, -1, -1, 31, -1, -1, -1, 30, -1, -1, -1, 33, 33, -1, -1, 33, -1, -1],
51 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
52 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
53 | [-1'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 42, -1, 41, 41, -1, -1, 41, -1, -1],
54 | [0'i8, -1, -1, 18, 41, -1, 16, 41, -1, -1, -1, 18, 14, -1, 42, 42, 41, -1, 43, -1, -1],
55 | [0'i8, 3, -1, 18, 41, -1, 16, 41, -1, -1, -1, 18, 14, -1, 42, 42, 41, -1, 43, -1, -1],
56 | [-1'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 42, -1, 41, 41, -1, -1, 41, -1, -1],
57 | [-1'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 42, -1, 41, 41, -1, -1, 41, -1, -1],
58 | [0'i8, -1, -1, 18, -1, -1, 16, -1, -1, -1, 30, 18, 14, -1, 16, 16, -1, -1, 17, -1, -1],
59 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
60 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
61 | [-1'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 42, -1, 41, 41, -1, -1, 41, -1, -1],
62 | [-1'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 42, -1, 41, 41, -1, -1, 41, -1, -1],
63 | [-1'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 42, -1, 41, 41, -1, -1, 41, -1, -1],
64 | [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
65 | [0'i8, -1, -1, -1, -1, -1, 53, -1, -1, -1, 53, -1, -1, -1, 53, 53, -1, -1, 54, -1, -1],
66 | [0'i8, 3, -1, -1, -1, -1, 53, -1, -1, -1, 53, -1, -1, -1, 53, 53, -1, -1, 54, -1, -1],
67 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
68 | [0'i8, -1, -1, -1, -1, -1, -1, -1, 56, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
69 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
70 | [0'i8, -1, -1, 18, -1, -1, 58, -1, -1, -1, 59, 18, 14, -1, 58, 58, -1, -1, 69, -1, -1],
71 | [0'i8, -1, -1, -1, -1, -1, 68, -1, -1, -1, 59, -1, -1, -1, 67, 66, -1, -1, 60, -1, -1],
72 | [0'i8, 3, -1, -1, -1, -1, 65, -1, -1, -1, 53, -1, -1, -1, 64, 61, -1, -1, 60, -1, -1],
73 | [0'i8, -1, -1, -1, -1, -1, 62, -1, -1, -1, 53, -1, -1, -1, 61, 61, -1, -1, 60, -1, -1],
74 | [0'i8, -1, -1, 18, -1, -1, 58, -1, -1, -1, 59, 18, 14, -1, 62, 62, -1, -1, 63, -1, -1],
75 | [0'i8, 3, -1, 18, -1, -1, 58, -1, -1, -1, 59, 18, 14, -1, 62, 62, -1, -1, 63, -1, -1],
76 | [0'i8, -1, -1, -1, -1, -1, 62, -1, -1, -1, 53, -1, -1, -1, 61, 61, -1, -1, 60, -1, -1],
77 | [0'i8, -1, -1, 18, -1, -1, 58, -1, -1, -1, 59, 18, 14, -1, 62, 62, -1, -1, 63, -1, -1],
78 | [0'i8, -1, -1, -1, -1, -1, 62, -1, -1, -1, 53, -1, -1, -1, 61, 61, -1, -1, 60, -1, -1],
79 | [0'i8, -1, -1, -1, -1, -1, 62, -1, -1, -1, 53, -1, -1, -1, 61, 61, -1, -1, 60, -1, -1],
80 | [0'i8, -1, -1, 18, -1, -1, 58, -1, -1, -1, 59, 18, 14, -1, 62, 62, -1, -1, 63, -1, -1],
81 | [0'i8, 3, -1, 18, -1, -1, 58, -1, -1, -1, 59, 18, 14, -1, 58, 58, -1, -1, 69, -1, -1],
82 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
83 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
84 | [0'i8, -1, -1, 18, 79, 73, 16, 20, -1, 20, -1, 18, 14, -1, 72, 72, -1, -1, 76, -1, -1],
85 | [-1'i8, -1, -1, 74, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 73, 73, -1, -1, 73, -1, -1],
86 | [0'i8, -1, -1, -1, -1, 73, -1, -1, -1, -1, -1, -1, -1, -1, 74, 74, -1, -1, 75, -1, -1],
87 | [0'i8, 3, -1, -1, -1, 73, -1, -1, -1, -1, -1, -1, -1, -1, 74, 74, -1, -1, 75, -1, -1],
88 | [0'i8, 3, -1, 18, 77, 73, 16, 20, -1, 20, -1, 18, 14, -1, 72, 72, -1, -1, 76, -1, -1],
89 | [0'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 77, 77, -1, -1, 78, -1, -1],
90 | [0'i8, 3, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 77, 77, -1, -1, 78, -1, -1],
91 | [0'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 77, 77, -1, -1, 78, -1, -1],
92 | [0'i8, -1, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 80, 80, -1, -1, 81, -1, -1],
93 | [0'i8, 3, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 80, 80, -1, -1, 81, -1, -1],
94 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
95 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
96 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 83],
97 | [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]]
98 |
func genWordBreakMap(prop: NimNode): NimNode =
  ## Build a `case` statement over `prop` with one branch per
  ## `sgw<Name>` identifier, each evaluating to that name's column
  ## index in the DFA table. The `else` branch asserts and yields -1.
  # Column names, in the same order as in gen/gen_re_words.nim
  const columnNames = [
    "__EOF__",  # Reserved for the DFA
    "Extended_Pictographic",
    "RegionalIndicator",
    "Hebrew_Letter",
    "Single_Quote",
    "Double_Quote",
    "ExtendNumLet",
    "MidNumLet",
    "WSegSpace",
    "MidLetter",
    "Katakana",
    "ALetter",
    "Numeric",
    "Newline",
    "Extend",
    "Format",
    "MidNum",
    "Other",
    "ZWJ",
    "CR",
    "LF"
  ]
  let
    neverTrue = newLit false
    invalidColumn = newLit(-1)
  var selector = newTree(nnkCaseStmt, prop)
  # Index 0 (__EOF__) gets no branch, so start at 1
  for column in 1 ..< columnNames.len:
    selector.add(newTree(nnkOfBranch,
      ident("sgw" & columnNames[column]),
      newLit column))
  selector.add(newTree(nnkElse,
    quote do:
      doAssert `neverTrue`
      `invalidColumn`))
  result = newStmtList(selector)
141 |
macro genWordBreakMap(prop: SgWord): untyped =
  ## Expand to the `case` statement produced by the `NimNode` overload
  ## of `genWordBreakMap`, mapping the word-break property `prop` to
  ## its DFA column.
  ##
  ## Compile with `-d:reDumpWordBreak` to print the generated code at
  ## compile time. The original misspelled flag `-d:reDumpWrodBreak`
  ## is still honoured so existing build scripts keep working.
  result = genWordBreakMap(prop)
  when defined(reDumpWordBreak) or defined(reDumpWrodBreak):
    echo "==== genWordBreakMap ===="
    echo repr(result)
147 |
148 | # XXX wordBounds (not words)
iterator wordsBounds*(s: string): Slice[int] {.inline.} =
  ## Return each word boundary in `s`. Boundaries are inclusive
  ##
  ## The string is scanned rune by rune through `wordBreakTable`.
  ## Whenever the state just entered has `0` in its first column, the
  ## current position is remembered as a save point. When no transition
  ## exists, or the input is exhausted, the scan rewinds to that save
  ## point and the slice up to it is yielded.
  var
    wordStart = 0   # first byte of the word being scanned
    pos = 0         # read cursor, advanced by `fastRuneAt`
    savePoint = 0   # cursor value at the last save point
    current: Rune
  while pos < s.len:
    var state = 0
    while true:
      fastRuneAt(s, pos, current, true)
      let column = genWordBreakMap(wordBreakProp(current))
      let target = wordBreakTable[state][column]
      if target == -1:
        # No transition: the start state must never be stuck
        doAssert state > 0
        break
      if wordBreakTable[target][0] == 0:
        savePoint = pos
      if pos >= s.len:
        break
      state = target
    # Both exits rewind to the last save point
    pos = savePoint
    doAssert pos > wordStart
    yield wordStart .. pos-1
    wordStart = pos
174 |
iterator words*(s: string): string {.inline.} =
  ## Yield each word of `s` as its own string, i.e. the substring
  ## covered by each slice from `wordsBounds`
  for bounds in s.wordsBounds:
    yield s[bounds.a .. bounds.b]
179 |
when isMainModule:
  block:
    echo "Test genWordBreakMap"
    # Every code point in 0 .. 0x10FFFF must map to a non-negative column
    var checked = 0
    for codePoint in 0 .. 0x10FFFF:
      let column = genWordBreakMap(wordBreakProp(Rune(codePoint)))
      doAssert column >= 0
      checked += 1
    doAssert checked == 0x10FFFF+1
188 |
--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 | segmentation
20 |
786 |
787 |
788 |
789 |
802 |
803 |
804 |
805 |
806 |
807 |
segmentation
808 |
809 |
810 |
814 |
815 | Search:
817 |
818 |
819 | Group by:
820 |
824 |
825 |
826 | -
827 | Iterators
828 |
835 |
836 |
837 |
838 |
839 |
840 |
841 |
842 |
843 |
This library implements Unicode Text Segmentation (tr29)
844 |
845 |
846 |
847 |
848 | iterator wordsBounds(s: string): Slice[int] {...}{.inline, raises: [], tags: [].}
849 | -
850 |
851 | Return each word boundary in s. Boundaries are inclusive
852 |
853 |
854 |
855 | iterator words(s: string): string {...}{.inline, raises: [], tags: [].}
856 | -
857 |
858 | Return each word in s
859 |
860 |
861 |
862 |
863 |
864 |
865 |
866 |
867 |
868 |
869 |
870 |
871 | Made with Nim. Generated: 2020-02-15 14:05:26 UTC
872 |
873 |
874 |
875 |
876 |
877 |
878 |
879 |
--------------------------------------------------------------------------------