├── tests
    ├── nim.cfg
    └── tests.nim
├── CHANGELOG.md
├── .gitignore
├── segmentation.nimble
├── .github
    └── workflows
    │   └── ci.yml
├── README.md
├── LICENSE
├── gen
    └── gen_re_words.nim
├── src
    └── segmentation.nim
└── docs
    └── index.html


/tests/nim.cfg:
--------------------------------------------------------------------------------
1 | --path:"../src/"
2 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | v0.0.1
2 | ==================
3 | 
4 | * Initial release
5 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | nimcache/
2 | tests/tests
3 | src/segmentation
4 | gen/gen_re_words
5 | docs/ugh
6 | config.nims
7 | 


--------------------------------------------------------------------------------
/segmentation.nimble:
--------------------------------------------------------------------------------
 1 | # Package
 2 | 
 3 | version = "0.1.0"
 4 | author = "Esteban Castro Borsani (@nitely)"
 5 | description = "Unicode text segmentation tr29"
 6 | license = "MIT"
 7 | srcDir = "src"
 8 | skipDirs = @["tests", "gen"]
 9 | 
10 | requires "nim >= 0.19.0"
11 | requires "unicodedb >= 0.8.0"
12 | 
13 | task test, "Test":
14 |   exec "nim c -r src/segmentation.nim"
15 |   exec "nim c -r tests/tests.nim"
16 | 
17 |   # Test runnable examples
18 |   #exec "nim doc -o:./docs/ugh/ugh.html ./src/segmentation.nim"
19 | 
20 | task docs, "Docs":
21 |   exec "nim doc -o:./docs/index.html ./src/segmentation.nim"
22 | 


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - master
 7 |   pull_request:
 8 | 
 9 | jobs:
10 |   test:
11 |     name: Nim ${{ matrix.nim }}
12 |     runs-on: ubuntu-latest
13 |     strategy:
14 |       matrix:
15 |         nim: [1.0.0, 1.0.2, 1.0.4, 1.0.10, 1.2.0, 1.2.2, 1.2.4, 1.2.6, 1.2.8, 1.2.12, 1.4.0, 1.4.2, 1.4.4, 1.4.8, 1.6.0]
16 |     steps:
17 |     - uses: actions/checkout@v2
18 |     - name: Run Tests
19 |       run: |
20 |         docker pull nimlang/nim:${{ matrix.nim }}
21 |         docker run --rm -v `pwd`:/usr/src/app -w /usr/src/app nimlang/nim:${{ matrix.nim }} /bin/bash -c "nimble install -y; nimble test"
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Segmentation
 2 | 
 3 | [![licence](https://img.shields.io/github/license/nitely/nim-segmentation.svg?style=flat-square)](https://raw.githubusercontent.com/nitely/nim-segmentation/master/LICENSE)
 4 | 
 5 | An implementation of [Unicode Text Segmentation](https://unicode.org/reports/tr29/) (tr29). The splitting is made through a fast DFA.
 6 | 
 7 | > See [nim-graphemes](https://github.com/nitely/nim-graphemes) for grapheme cluster segmentation
 8 | 
 9 | ## Install
10 | 
11 | ```
12 | nimble install segmentation
13 | ```
14 | 
15 | ## Compatibility
16 | 
17 | Nim 0.19, 0.20, +1.0.4
18 | 
19 | ## Usage
20 | 
21 | ```nim
22 | import sequtils
23 | import segmentation
24 | 
25 | assert toSeq("The (“brown”) fox can’t jump 32.3 feet, right?".words) ==
26 |   @["The", " ", "(", "“", "brown", "”", ")", " ", "fox", " ",
27 |     "can’t", " ", "jump", " ", "32.3", " ", "feet", ",", " ",
28 |     "right", "?"]
29 | ```
30 | 
31 | ## Docs
32 | 
33 | [Read the docs](https://nitely.github.io/nim-segmentation/)
34 | 
35 | ## Tests
36 | 
37 | ```
38 | nimble test
39 | ```
40 | 
41 | ## LICENSE
42 | 
43 | MIT
44 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Esteban Castro Borsani
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/tests/tests.nim:
--------------------------------------------------------------------------------
 1 | import unittest, strutils
 2 | import unicode except strip
 3 | import sequtils
 4 | 
 5 | import segmentation
 6 | 
 7 | proc wbreak(s: string): seq[string] =
 8 |   result = toSeq(words(s))  # materialise the `words` iterator into a seq
 9 | 
10 | test "Test words break":
11 |   # Per data line: text after `#` is ignored, `÷` separates the expected
12 |   # words and `×` separates the hex code points that make up one word
13 |   var tested = 0
14 |   for line in lines("./tests/WordBreakTest.txt"):
15 |     let spec = line.split('#', 1)[0]
16 |     if spec.strip.len == 0: continue
17 |     var expected: seq[string]
18 |     for chunk in spec.split("÷"):
19 |       if chunk.strip.len == 0: continue
20 |       var word = ""
21 |       for hex in chunk.split("×"):
22 |         let digits = hex.strip
23 |         if digits.len > 0:
24 |           word.add digits.parseHexInt.Rune.toUTF8
25 |       expected.add word
26 |     check toSeq(expected.join.words) == expected
27 |     inc tested
28 |   echo "$# words tested" % [$tested]
29 | 
30 | test "Test some words":
31 |   # From the txt file
32 |   check wbreak("1,̈1.⁠") == @["1,̈1", ".⁠"]
33 |   check wbreak("\n̈‍") == @["\n", "̈‍"]  # 0xa 0x308 0x200d
34 |   check wbreak("〱A") == @["〱", "A"]
35 |   check wbreak("A_0_〱_") == @["A_0_〱_"]
36 |   # ZWJ, checked at https://unicode.org/cldr/utility/breaks.jsp
37 |   check "🛑‍🛑".wbreak == @["🛑‍🛑"]
38 |   check "a🇦🇧🇨🇩b".wbreak == @["a", "🇦🇧", "🇨🇩", "b"]
39 |   check "a‍🛑".wbreak == @["a‍🛑"]
40 |   check "👶🏿̈‍👶🏿".wbreak == @["👶🏿̈‍👶🏿"]
41 |   check " ‍ن".wbreak == @[" ‍", "ن"]  # Space ZWJ letter
42 |   check "  ‍🛑".wbreak == @["  ‍🛑"]  # Space Space ZWJ Emoji
43 | 
44 | test "Test misc":  # hand-written cases: digits, letters, quotes, punctuation
45 |   check wbreak("11 aa 22 bb 1.2 1,2 $1,2 $1") ==
46 |     @["11", " ", "aa", " ", "22", " ", "bb", " ", "1.2", " ",
47 |     "1,2", " ", "$", "1,2", " ", "$", "1"]  # "$" is split from the number
48 |   check wbreak("abc abc ghi can't") ==
49 |     @["abc", " ", "abc", " ", "ghi", " ", "can\'t"]  # apostrophe stays inside
50 |   check wbreak("The quick? (“brown”) fox can’t jump 32.3 feet, right?") ==
51 |     @["The", " ", "quick", "?", " ", "(", "“", "brown", "”", ")",
52 |     " ", "fox", " ", "can’t", " ", "jump", " ", "32.3", " ", "feet",
53 |     ",", " ", "right", "?"]  # curly quotes split off, curly apostrophe joins
54 |   check wbreak("3.2 3a 3.2a 3.2a3.2a a3.2 3. a3a a3.2a 1to1 1-1 1'1 1'a 1''1") ==
55 |     @["3.2", " ", "3a", " ", "3.2a", " ", "3.2a3.2a", " ", "a3.2",
56 |     " ", "3", ".", " ", "a3a", " ", "a3.2a", " ", "1to1", " ", "1",
57 |     "-", "1", " ", "1'1", " ", "1", "'", "a", " ", "1", "'", "'", "1"]
58 | 
59 | test "Test wordsBounds":
60 |   let bounds = toSeq(wordsBounds("abc def?"))
61 |   check bounds == @[0 .. 2, 3 .. 3, 4 .. 6, 7 .. 7]  # "abc", " ", "def", "?"
62 | 


--------------------------------------------------------------------------------
/gen/gen_re_words.nim:
--------------------------------------------------------------------------------
  1 | import strutils
  2 | 
  3 | const
  4 |   unicodeVersion* = "12.1.0"
  5 |   specVersion* = "29"
  6 |   specURL* = "http://www.unicode.org/reports/tr29/"
  7 | 
  8 | # Rules without "Ignore Format and Extend characters"
  9 | #[
 10 |   (
 11 |     CR LF
 12 |     | Newline | CR | LF
 13 |     | ZWJ Extended_Pictographic
 14 |     | WSegSpace+
 15 |     | AHLetter+
 16 |     | AHLetter ((MidLetter | MidNumLetQ) AHLetter)+
 17 |     | Hebrew_Letter Single_Quote
 18 |     | Hebrew_Letter (Double_Quote Hebrew_Letter)+
 19 |     | Numeric+
 20 |     | (AHLetter Numeric)+
 21 |     | (Numeric AHLetter)+
 22 |     | Numeric ((MidNum | MidNumLetQ) Numeric)+
 23 |     | Katakana+
 24 |     | ((AHLetter | Numeric | Katakana | ExtendNumLet) ExtendNumLet)+
 25 |     | (RI RI)+
 26 |     | Other
 27 |   )
 28 | 
 29 |   The following rule handles: AHLetter+, Numeric+, (AHLetter | Numeric)+,
 30 |   and merge of rules (AHLetter ((MidLetter | MidNumLetQ) AHLetter)+)
 31 |   and (Numeric ((MidNum | MidNumLetQ) Numeric)+),
 32 |   also (Hebrew_Letter Single_Quote Hebrew_Letter)+
 33 | 
 34 |   (
 35 |     AHLetter ((MidLetter | MidNumLetQ) AHLetter)*
 36 |     | Numeric ((MidNum | MidNumLetQ) Numeric)*
 37 |   )+
 38 | ]#
 39 | 
 40 | # Handmade regex based on the word-break table in the spec
 41 | # Apparently anything can be before "ZWJ EMOJI", albeit the spec does
 42 | # not mention it
 43 | # Reference (X: (Extend | Format | ZWJ)*)
 44 | const pattern =
 45 |   """
 46 |   (
 47 |     CR LF
 48 |     | Newline | CR | LF
 49 |     | (
 50 |       ZWJ Extended_Pictographic
 51 |       | WSegSpace+
 52 |       | (
 53 |           AHLetter X ((MidLetter | MidNumLetQ) X AHLetter X)*
 54 |           | Numeric X ((MidNum | MidNumLetQ) X Numeric X)*
 55 |           | ExtendNumLet X (Katakana+ X ExtendNumLet X)*
 56 |         )+
 57 |       | Hebrew_Letter X Single_Quote
 58 |       | Hebrew_Letter X (Double_Quote X Hebrew_Letter X)+
 59 |       | ((Katakana | ExtendNumLet) X)+
 60 |       | RegionalIndicator X RegionalIndicator
 61 |       | Other
 62 |     ) X (ZWJ Extended_Pictographic X)*
 63 |   )
 64 |   """
 65 | 
 66 | # IDs must be ordered so no ID is a substring of a later one (i.e. longest to shortest)
 67 | const identifiers = [
 68 |   "__EOF__",  # Reserved for the DFA
 69 |   "Extended_Pictographic",
 70 |   "RegionalIndicator",
 71 |   "Hebrew_Letter",
 72 |   "Single_Quote",
 73 |   "Double_Quote",
 74 |   "ExtendNumLet",
 75 |   "MidNumLet",
 76 |   #"MidNumLetQ",
 77 |   "WSegSpace",
 78 |   "MidLetter",
 79 |   "Katakana",
 80 |   "ALetter",
 81 |   #"AHLetter",
 82 |   "Numeric",
 83 |   "Newline",
 84 |   "Extend",
 85 |   "Format",
 86 |   "MidNum",
 87 |   "Other",
 88 |   "ZWJ",
 89 |   "CR",
 90 |   "LF"
 91 | ]
 92 | 
 93 | const anyOther = [
 94 |   "Extended_Pictographic",
 95 |   "RegionalIndicator",
 96 |   "Hebrew_Letter",
 97 |   "Single_Quote",
 98 |   "Double_Quote",
 99 |   "ExtendNumLet",
100 |   "MidNumLet",
101 |   "WSegSpace",
102 |   "MidLetter",
103 |   "Katakana",
104 |   "ALetter",
105 |   "Numeric",
106 |   #"Newline",
107 |   "Extend",
108 |   "Format",
109 |   "MidNum",
110 |   "Other",
111 |   "ZWJ",
112 |   #"CR",
113 |   #"LF"
114 | ]
115 | 
116 | var letters = newStringOfCap(26)  # 'a'..'z', one alias letter per identifier
117 | for ch in 'a' .. 'z':
118 |   letters.add ch
119 | 
120 | proc buildRePattern(p: string): string =
121 |   ## Expand the aliases in `p`, then map each identifier to a single letter
122 |   assert identifiers.len <= letters.len  # one letter per identifier
123 |   result = p.replace("Other", "(" & anyOther.join(" | ") & ")")
124 |   result = result.replace("AHLetter", "(ALetter | Hebrew_Letter)")
125 |   result = result.replace("MidNumLetQ", "(MidNumLet | Single_Quote)")
126 |   result = result.replace("X", "(Extend | Format | ZWJ)*")
127 |   result = result.replace("(", "(?:")  # make every group non-capturing
128 |   for idx, name in identifiers:  # order matters: longer names come first
129 |     result = result.replace(name, $letters[idx])
130 |   # Strip the layout whitespace of the multi-line pattern
131 |   for blank in [" ", "\p", "\n"]:
132 |     result = result.replace(blank)
133 | 
134 | when isMainModule:
135 |   # Print the compact one-letter regex built from `pattern`
136 |   echo "pattern:\n", buildRePattern(pattern)
137 | 


--------------------------------------------------------------------------------
/src/segmentation.nim:
--------------------------------------------------------------------------------
  1 | ## This library implements Unicode Text Segmentation (tr29)
  2 | 
  3 | import macros
  4 | import unicode
  5 | 
  6 | import unicodedb/segmentation
  7 | 
  8 | # Not every state can exit, so this needs backtracking
  9 | # Auto generated with github@nitely/regexy
 10 | # See ../gen/gen_re_words.nim for the original regex
 11 | const wordBreakTable = [
 12 |   [-1'i8, 82, 80, 72, 71, 70, 58, 57, 56, 55, 53, 52, 14, 85, 13, 12, 11, 10, 1, 84, 83],
 13 |   [0'i8, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 8, 7, -1, -1, 2, -1, -1],
 14 |   [0'i8, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 6, 5, -1, -1, 2, -1, -1],
 15 |   [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 3, -1, -1, 4, -1, -1],
 16 |   [0'i8, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 3, -1, -1, 4, -1, -1],
 17 |   [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
 18 |   [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
 19 |   [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
 20 |   [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
 21 |   [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
 22 |   [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
 23 |   [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
 24 |   [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
 25 |   [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
 26 |   [0'i8, -1, -1, 47, 49, -1, 46, 50, -1, -1, -1, 48, 14, -1, 14, 14, 51, -1, 15, -1, -1],
 27 |   [0'i8, 3, -1, 39, 41, -1, 16, 44, -1, -1, -1, 40, 14, -1, 14, 14, 45, -1, 15, -1, -1],
 28 |   [0'i8, -1, -1, 36, -1, -1, 16, -1, -1, -1, 38, 37, 14, -1, 16, 16, -1, -1, 17, -1, -1],
 29 |   [0'i8, 3, -1, 18, -1, -1, 16, -1, -1, -1, 30, 29, 14, -1, 16, 16, -1, -1, 17, -1, -1],
 30 |   [0'i8, -1, -1, 18, 26, -1, 16, 27, -1, 28, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
 31 |   [0'i8, 3, -1, 18, 20, -1, 16, 24, -1, 25, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
 32 |   [-1'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 23, -1, -1, 20, 20, -1, -1, 20, -1, -1],
 33 |   [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 21, 21, -1, -1, 22, -1, -1],
 34 |   [0'i8, 3, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 21, 21, -1, -1, 22, -1, -1],
 35 |   [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 21, 21, -1, -1, 22, -1, -1],
 36 |   [-1'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 20, 20, -1, -1, 20, -1, -1],
 37 |   [-1'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 20, 20, -1, -1, 20, -1, -1],
 38 |   [-1'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 20, 20, -1, -1, 20, -1, -1],
 39 |   [-1'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 20, 20, -1, -1, 20, -1, -1],
 40 |   [-1'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 20, 20, -1, -1, 20, -1, -1],
 41 |   [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
 42 |   [-1'i8, -1, -1, -1, -1, -1, 31, -1, -1, -1, 30, -1, -1, -1, 35, 34, -1, -1, 33, -1, -1],
 43 |   [0'i8, -1, -1, 18, -1, -1, 16, -1, -1, -1, 30, 18, 14, -1, 31, 31, -1, -1, 32, -1, -1],
 44 |   [0'i8, 3, -1, 18, -1, -1, 16, -1, -1, -1, 30, 18, 14, -1, 31, 31, -1, -1, 32, -1, -1],
 45 |   [-1'i8, -1, -1, -1, -1, -1, 31, -1, -1, -1, -1, -1, -1, -1, 33, 33, -1, -1, 33, -1, -1],
 46 |   [-1'i8, -1, -1, -1, -1, -1, 31, -1, -1, -1, -1, -1, -1, -1, 33, 33, -1, -1, 33, -1, -1],
 47 |   [-1'i8, -1, -1, -1, -1, -1, 31, -1, -1, -1, -1, -1, -1, -1, 33, 33, -1, -1, 33, -1, -1],
 48 |   [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
 49 |   [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
 50 |   [-1'i8, -1, -1, -1, -1, -1, 31, -1, -1, -1, 30, -1, -1, -1, 33, 33, -1, -1, 33, -1, -1],
 51 |   [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
 52 |   [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
 53 |   [-1'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 42, -1, 41, 41, -1, -1, 41, -1, -1],
 54 |   [0'i8, -1, -1, 18, 41, -1, 16, 41, -1, -1, -1, 18, 14, -1, 42, 42, 41, -1, 43, -1, -1],
 55 |   [0'i8, 3, -1, 18, 41, -1, 16, 41, -1, -1, -1, 18, 14, -1, 42, 42, 41, -1, 43, -1, -1],
 56 |   [-1'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 42, -1, 41, 41, -1, -1, 41, -1, -1],
 57 |   [-1'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 42, -1, 41, 41, -1, -1, 41, -1, -1],
 58 |   [0'i8, -1, -1, 18, -1, -1, 16, -1, -1, -1, 30, 18, 14, -1, 16, 16, -1, -1, 17, -1, -1],
 59 |   [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
 60 |   [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
 61 |   [-1'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 42, -1, 41, 41, -1, -1, 41, -1, -1],
 62 |   [-1'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 42, -1, 41, 41, -1, -1, 41, -1, -1],
 63 |   [-1'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 42, -1, 41, 41, -1, -1, 41, -1, -1],
 64 |   [0'i8, -1, -1, 18, 20, -1, 16, 20, -1, 20, -1, 18, 14, -1, 18, 18, -1, -1, 19, -1, -1],
 65 |   [0'i8, -1, -1, -1, -1, -1, 53, -1, -1, -1, 53, -1, -1, -1, 53, 53, -1, -1, 54, -1, -1],
 66 |   [0'i8, 3, -1, -1, -1, -1, 53, -1, -1, -1, 53, -1, -1, -1, 53, 53, -1, -1, 54, -1, -1],
 67 |   [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
 68 |   [0'i8, -1, -1, -1, -1, -1, -1, -1, 56, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
 69 |   [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
 70 |   [0'i8, -1, -1, 18, -1, -1, 58, -1, -1, -1, 59, 18, 14, -1, 58, 58, -1, -1, 69, -1, -1],
 71 |   [0'i8, -1, -1, -1, -1, -1, 68, -1, -1, -1, 59, -1, -1, -1, 67, 66, -1, -1, 60, -1, -1],
 72 |   [0'i8, 3, -1, -1, -1, -1, 65, -1, -1, -1, 53, -1, -1, -1, 64, 61, -1, -1, 60, -1, -1],
 73 |   [0'i8, -1, -1, -1, -1, -1, 62, -1, -1, -1, 53, -1, -1, -1, 61, 61, -1, -1, 60, -1, -1],
 74 |   [0'i8, -1, -1, 18, -1, -1, 58, -1, -1, -1, 59, 18, 14, -1, 62, 62, -1, -1, 63, -1, -1],
 75 |   [0'i8, 3, -1, 18, -1, -1, 58, -1, -1, -1, 59, 18, 14, -1, 62, 62, -1, -1, 63, -1, -1],
 76 |   [0'i8, -1, -1, -1, -1, -1, 62, -1, -1, -1, 53, -1, -1, -1, 61, 61, -1, -1, 60, -1, -1],
 77 |   [0'i8, -1, -1, 18, -1, -1, 58, -1, -1, -1, 59, 18, 14, -1, 62, 62, -1, -1, 63, -1, -1],
 78 |   [0'i8, -1, -1, -1, -1, -1, 62, -1, -1, -1, 53, -1, -1, -1, 61, 61, -1, -1, 60, -1, -1],
 79 |   [0'i8, -1, -1, -1, -1, -1, 62, -1, -1, -1, 53, -1, -1, -1, 61, 61, -1, -1, 60, -1, -1],
 80 |   [0'i8, -1, -1, 18, -1, -1, 58, -1, -1, -1, 59, 18, 14, -1, 62, 62, -1, -1, 63, -1, -1],
 81 |   [0'i8, 3, -1, 18, -1, -1, 58, -1, -1, -1, 59, 18, 14, -1, 58, 58, -1, -1, 69, -1, -1],
 82 |   [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
 83 |   [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
 84 |   [0'i8, -1, -1, 18, 79, 73, 16, 20, -1, 20, -1, 18, 14, -1, 72, 72, -1, -1, 76, -1, -1],
 85 |   [-1'i8, -1, -1, 74, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 73, 73, -1, -1, 73, -1, -1],
 86 |   [0'i8, -1, -1, -1, -1, 73, -1, -1, -1, -1, -1, -1, -1, -1, 74, 74, -1, -1, 75, -1, -1],
 87 |   [0'i8, 3, -1, -1, -1, 73, -1, -1, -1, -1, -1, -1, -1, -1, 74, 74, -1, -1, 75, -1, -1],
 88 |   [0'i8, 3, -1, 18, 77, 73, 16, 20, -1, 20, -1, 18, 14, -1, 72, 72, -1, -1, 76, -1, -1],
 89 |   [0'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 77, 77, -1, -1, 78, -1, -1],
 90 |   [0'i8, 3, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 77, 77, -1, -1, 78, -1, -1],
 91 |   [0'i8, -1, -1, 21, -1, -1, -1, -1, -1, -1, -1, 21, -1, -1, 77, 77, -1, -1, 78, -1, -1],
 92 |   [0'i8, -1, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 80, 80, -1, -1, 81, -1, -1],
 93 |   [0'i8, 3, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 80, 80, -1, -1, 81, -1, -1],
 94 |   [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 5, -1, -1, 2, -1, -1],
 95 |   [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
 96 |   [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 83],
 97 |   [0'i8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]]
 98 | 
 99 | func genWordBreakMap(prop: NimNode): NimNode =
100 |   ## Build a `case` over `prop` that yields each property's DFA column
101 |   # Column order, copied from gen/gen_re_words.nim
102 |   const idnts = [
103 |     "__EOF__",  # Reserved for the DFA
104 |     "Extended_Pictographic",
105 |     "RegionalIndicator",
106 |     "Hebrew_Letter",
107 |     "Single_Quote",
108 |     "Double_Quote",
109 |     "ExtendNumLet",
110 |     "MidNumLet",
111 |     #"MidNumLetQ",
112 |     "WSegSpace",
113 |     "MidLetter",
114 |     "Katakana",
115 |     "ALetter",
116 |     #"AHLetter",
117 |     "Numeric",
118 |     "Newline",
119 |     "Extend",
120 |     "Format",
121 |     "MidNum",
122 |     "Other",
123 |     "ZWJ",
124 |     "CR",
125 |     "LF"
126 |   ]
127 |   var branches = @[prop]  # the selector is the first child of a case stmt
128 |   # Column 0 (__EOF__) is reserved for the DFA, so the mapping starts at 1
129 |   for col in 1 ..< idnts.len:
130 |     branches.add newTree(
131 |       nnkOfBranch, ident("sgw" & idnts[col]), newLit col)
132 |   # Any other value fails an assertion; `-1` follows it as the branch value
133 |   let falseLit = newLit false
134 |   let badResultLit = newLit -1
135 |   branches.add(newTree(nnkElse,
136 |     quote do:
137 |       doAssert `falseLit`
138 |       `badResultLit`))
139 |   result = newStmtList(
140 |     newTree(nnkCaseStmt, branches))
141 | 
142 | macro genWordBreakMap(prop: SgWord): untyped =
143 |   ## Expand to a `case` expression mapping `prop` to its DFA column index
144 |   result = genWordBreakMap(prop)
145 |   when defined(reDumpWordBreak) or defined(reDumpWrodBreak):  # old typo kept
146 |     echo "==== genWordBreakMap ====\n" & repr(result)
147 | 
148 | # XXX wordBounds (not words)
149 | iterator wordsBounds*(s: string): Slice[int] {.inline.} =
150 |   ## Yield the inclusive byte-index slice of each word in `s`, in order
151 |   var
152 |     state, a, b, c = 0  # a: word start, b: read cursor, c: last save point
153 |     r: Rune
154 |   while b < s.len:
155 |     state = 0  # restart the DFA for every word
156 |     while true:
157 |       fastRuneAt(s, b, r, true)  # decode one rune, advancing `b` past it
158 |       let prop = genWordBreakMap(wordBreakProp(r))  # table column for `r`
159 |       let next = wordBreakTable[state][prop]
160 |       if next == -1:  # no transition: cut the word at the last save point
161 |         doAssert state > 0
162 |         b = c
163 |         break
164 |       # save point: `next` has 0 in column 0 (__EOF__); presumably it may exit
165 |       if wordBreakTable[next][0] == 0:
166 |         c = b
167 |       if b >= s.len:  # input exhausted: fall back to the last save point
168 |         b = c
169 |         break
170 |       state = next
171 |     doAssert b > a  # each pass must consume at least one byte
172 |     yield a .. b-1
173 |     a = b
174 | 
175 | iterator words*(s: string): string {.inline.} =
176 |   ## Yield every word of `s`, in order, as its own string
177 |   for span in wordsBounds(s):
178 |     yield s[span.a .. span.b]
179 | 
180 | when isMainModule:
181 |   block:
182 |     echo "Test genWordBreakMap"
183 |     var checked = 0
184 |     for codePoint in 0 .. 0x10FFFF:  # every code point must map to a column
185 |       doAssert genWordBreakMap(wordBreakProp(Rune(codePoint))) >= 0
186 |       checked += 1
187 |     doAssert checked == 0x10FFFF + 1
188 | 


--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="utf-8" ?>
  2 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  3 |   "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
  4 | <!--  This file is generated by Nim. -->
  5 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
  6 | <head>
  7 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  8 | 
  9 | <meta name="viewport" content="width=device-width, initial-scale=1.0">
 10 | 
 11 | <!-- Favicon -->
 12 | <link rel="shortcut icon" href="data:image/x-icon;base64,AAABAAEAEBAAAAEAIABoBAAAFgAAACgAAAAQAAAAIAAAAAEAIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AAAAAAUAAAAF////AP///wD///8A////AP///wD///8A////AP///wD///8A////AAAAAAIAAABbAAAAlQAAAKIAAACbAAAAmwAAAKIAAACVAAAAWwAAAAL///8A////AP///wD///8A////AAAAABQAAADAAAAAYwAAAA3///8A////AP///wD///8AAAAADQAAAGMAAADAAAAAFP///wD///8A////AP///wAAAACdAAAAOv///wD///8A////AP///wD///8A////AP///wD///8AAAAAOgAAAJ3///8A////AP///wAAAAAnAAAAcP///wAAAAAoAAAASv///wD///8A////AP///wAAAABKAAAAKP///wAAAABwAAAAJ////wD///8AAAAAgQAAABwAAACIAAAAkAAAAJMAAACtAAAAFQAAABUAAACtAAAAkwAAAJAAAACIAAAAHAAAAIH///8A////AAAAAKQAAACrAAAAaP///wD///8AAAAARQAAANIAAADSAAAARf///wD///8AAAAAaAAAAKsAAACk////AAAAADMAAACcAAAAnQAAABj///8A////AP///wAAAAAYAAAAGP///wD///8A////AAAAABgAAACdAAAAnAAAADMAAAB1AAAAwwAAAP8AAADpAAAAsQAAAE4AAAAb////AP///wAAAAAbAAAATgAAALEAAADpAAAA/wAAAMMAAAB1AAAAtwAAAOkAAAD/AAAA/wAAAP8AAADvAAAA3gAAAN4AAADeAAAA3gAAAO8AAAD/AAAA/wAAAP8AAADpAAAAtwAAAGUAAAA/AAAA3wAAAP8AAAD/AAAA/wAAAP8AAAD/AAAA/wAAAP8AAAD/AAAA/wAAAP8AAADfAAAAPwAAAGX///8A////AAAAAEgAAADtAAAAvwAAAL0AAADGAAAA7wAAAO8AAADGAAAAvQAAAL8AAADtAAAASP///wD///8A////AP///wD///8AAAAAO////wD///8A////AAAAAIcAAACH////AP///wD///8AAAAAO////wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A//8AAP//AAD4HwAA7/cAAN/7AAD//wAAoYUAAJ55AACf+QAAh+EAAAAAAADAAwAA4AcAAP5/AAD//wAA//8AAA=="/>
 13 | 
 14 | <!-- Google fonts -->
 15 | <link href='https://fonts.googleapis.com/css?family=Lato:400,600,900' rel='stylesheet' type='text/css'/>
 16 | <link href='https://fonts.googleapis.com/css?family=Source+Code+Pro:400,500,600' rel='stylesheet' type='text/css'/>
 17 | 
 18 | <!-- CSS -->
 19 | <title>segmentation</title>
 20 | <style type="text/css" >
 21 | /*
 22 | Stylesheet for use with Docutils/rst2html.
 23 | 
 24 | See http://docutils.sf.net/docs/howto/html-stylesheets.html for how to
 25 | customize this style sheet.
 26 | 
 27 | Modified from Chad Skeeters' rst2html-style
 28 | https://bitbucket.org/cskeeters/rst2html-style/
 29 | 
 30 | Modified by Boyd Greenfield and narimiran
 31 | */
 32 | 
 33 | html {
 34 |   font-size: 100%;
 35 |   -webkit-text-size-adjust: 100%;
 36 |   -ms-text-size-adjust: 100%; }
 37 | 
 38 | body {
 39 |   font-family: "Lato", "Helvetica Neue", "HelveticaNeue", Helvetica, Arial, sans-serif;
 40 |   font-weight: 400;
 41 |   font-size: 1.125em;
 42 |   line-height: 1.5;
 43 |   color: #222;
 44 |   background-color: #FCFCFC; }
 45 | 
 46 | /* Skeleton grid */
 47 | .container {
 48 |   position: relative;
 49 |   width: 100%;
 50 |   max-width: 1050px;
 51 |   margin: 0 auto;
 52 |   padding: 0;
 53 |   box-sizing: border-box; }
 54 | 
 55 | .column,
 56 | .columns {
 57 |   width: 100%;
 58 |   float: left;
 59 |   box-sizing: border-box;
 60 |   margin-left: 1%;
 61 | }
 62 | 
 63 | .column:first-child,
 64 | .columns:first-child {
 65 |   margin-left: 0; }
 66 | 
 67 | .three.columns {
 68 |   width: 19%; }
 69 | 
 70 | .nine.columns {
 71 |   width: 80.0%; }
 72 | 
 73 | .twelve.columns {
 74 |   width: 100%;
 75 |   margin-left: 0; }
 76 | 
 77 | @media screen and (max-width: 860px) {
 78 |   .three.columns {
 79 |     display: none;
 80 |   }
 81 |   .nine.columns {
 82 |     width: 98.0%;
 83 |   }
 84 |   body {
 85 |     font-size: 1em;
 86 |     line-height: 1.35;
 87 |   }
 88 | }
 89 | 
 90 | cite {
 91 |   font-style: italic !important; }
 92 | 
 93 | 
 94 | /* Nim search input */
 95 | div#searchInputDiv {
 96 |   margin-bottom: 1em;
 97 | }
 98 | input#searchInput {
 99 |   width: 80%;
100 | }
101 | 
102 | /*
103 |  * Some custom formatting for input forms.
104 |  * This also fixes input form colors on Firefox with a dark system theme on Linux.
105 |  */
106 | input {
107 |   -moz-appearance: none;
108 |   color: #333;
109 |   background-color: #f8f8f8;
110 |   border: 1px solid #aaa;
111 |   font-family: "Lato", "Helvetica Neue", "HelveticaNeue", Helvetica, Arial, sans-serif;
112 |   font-size: 0.9em;
113 |   padding: 6px;
114 | }
115 | input:focus {
116 |   border: 1px solid #1fa0eb;
117 |   box-shadow: 0 0 2px #1fa0eb;
118 | }
119 | 
120 | /* Docgen styles */
121 | /* Links */
122 | a {
123 |   color: #07b;
124 |   text-decoration: none;
125 | }
126 | 
127 | a span.Identifier {
128 |   text-decoration: underline;
129 |   text-decoration-color: #aab;
130 | }
131 | 
132 | a.reference-toplevel {
133 |   font-weight: bold;
134 | }
135 | 
136 | a.toc-backref {
137 |   text-decoration: none;
138 |   color: #222; }
139 | 
140 | a.link-seesrc {
141 |   color: #607c9f;
142 |   font-size: 0.9em;
143 |   font-style: italic; }
144 | 
145 | a:hover,
146 | a:focus {
147 |   color: #607c9f;
148 |   text-decoration: underline; }
149 | 
150 | a:hover span.Identifier {
151 |   color: #607c9f;
152 | }
153 | 
154 | 
155 | sub,
156 | sup {
157 |   position: relative;
158 |   font-size: 75%;
159 |   line-height: 0;
160 |   vertical-align: baseline; }
161 | 
162 | sup {
163 |   top: -0.5em; }
164 | 
165 | sub {
166 |   bottom: -0.25em; }
167 | 
168 | img {
169 |   width: auto;
170 |   height: auto;
171 |   max-width: 100%;
172 |   vertical-align: middle;
173 |   border: 0;
174 |   -ms-interpolation-mode: bicubic; }
175 | 
176 | @media print {
177 |   * {
178 |     color: black !important;
179 |     text-shadow: none !important;
180 |     background: transparent !important;
181 |     box-shadow: none !important; }
182 | 
183 |   a,
184 |   a:visited {
185 |     text-decoration: underline; }
186 | 
187 |   a[href]:after {
188 |     content: " (" attr(href) ")"; }
189 | 
190 |   abbr[title]:after {
191 |     content: " (" attr(title) ")"; }
192 | 
193 |   .ir a:after,
194 |   a[href^="javascript:"]:after,
195 |   a[href^="#"]:after {
196 |     content: ""; }
197 | 
198 |   pre,
199 |   blockquote {
200 |     border: 1px solid #999;
201 |     page-break-inside: avoid; }
202 | 
203 |   thead {
204 |     display: table-header-group; }
205 | 
206 |   tr,
207 |   img {
208 |     page-break-inside: avoid; }
209 | 
210 |   img {
211 |     max-width: 100% !important; }
212 | 
213 |   @page {
214 |     margin: 0.5cm; }
215 | 
216 |   h1 {
217 |     page-break-before: always; }
218 | 
219 |   h1.title {
220 |     page-break-before: avoid; }
221 | 
222 |   p,
223 |   h2,
224 |   h3 {
225 |     orphans: 3;
226 |     widows: 3; }
227 | 
228 |   h2,
229 |   h3 {
230 |     page-break-after: avoid; }
231 | }
232 | 
233 | 
234 | p {
235 |   margin-top: 0.5em;
236 |   margin-bottom: 0.5em;
237 | }
238 | 
239 | small {
240 |   font-size: 85%; }
241 | 
242 | strong {
243 |   font-weight: 600;
244 |   font-size: 0.95em;
245 |   color: #3c3c3c;
246 | }
247 | 
248 | em {
249 |   font-style: italic; }
250 | 
251 | h1 {
252 |   font-size: 1.8em;
253 |   font-weight: 400;
254 |   padding-bottom: .25em;
255 |   border-bottom: 1px solid #aaa;
256 |   margin-top: 2.5em;
257 |   margin-bottom: 1em;
258 |   line-height: 1.2em; }
259 | 
260 | h1.title {
261 |   padding-bottom: 1em;
262 |   border-bottom: 0px;
263 |   font-size: 2.5em;
264 |   text-align: center;
265 |   font-weight: 900;
266 |   margin-top: 0.75em;
267 |   margin-bottom: 0em;
268 | }
269 | 
270 | h2 {
271 |   font-size: 1.3em;
272 |   margin-top: 2em; }
273 | 
274 | h2.subtitle {
275 |   text-align: center; }
276 | 
277 | h3 {
278 |   font-size: 1.125em;
279 |   font-style: italic;
280 |   margin-top: 1.5em; }
281 | 
282 | h4 {
283 |   font-size: 1.125em;
284 |   margin-top: 1em; }
285 | 
286 | h5 {
287 |   font-size: 1.125em;
288 |   margin-top: 0.75em; }
289 | 
290 | h6 {
291 |   font-size: 1.1em; }
292 | 
293 | 
294 | ul,
295 | ol {
296 |   padding: 0;
297 |   margin-top: 0.5em;
298 |   margin-left: 0.75em; }
299 | 
300 | ul ul,
301 | ul ol,
302 | ol ol,
303 | ol ul {
304 |   margin-bottom: 0;
305 |   margin-left: 1.25em; }
306 | 
307 | li {
308 |     list-style-type: circle;
309 | }
310 | 
311 | ul.simple-boot li {
312 |     list-style-type: none;
313 |     margin-left: 0em;
314 |     margin-bottom: 0.5em;
315 | }
316 | 
317 | ol.simple > li, ul.simple > li {
318 |   margin-bottom: 0.25em;
319 |   margin-left: 0.4em }
320 | 
321 | ul.simple.simple-toc > li {
322 |     margin-top: 1em;
323 | }
324 | 
325 | ul.simple-toc {
326 |   list-style: none;
327 |   font-size: 0.9em;
328 |   margin-left: -0.3em;
329 |   margin-top: 1em; }
330 | 
331 | ul.simple-toc > li {
332 |     list-style-type: none;
333 | }
334 | 
335 | ul.simple-toc-section {
336 |   list-style-type: circle;
337 |   margin-left: 1em;
338 |   color: #6c9aae; }
339 | 
340 | 
341 | ol.arabic {
342 |   list-style: decimal; }
343 | 
344 | ol.loweralpha {
345 |   list-style: lower-alpha; }
346 | 
347 | ol.upperalpha {
348 |   list-style: upper-alpha; }
349 | 
350 | ol.lowerroman {
351 |   list-style: lower-roman; }
352 | 
353 | ol.upperroman {
354 |   list-style: upper-roman; }
355 | 
356 | ul.auto-toc {
357 |   list-style-type: none; }
358 | 
359 | 
360 | dl {
361 |   margin-bottom: 1.5em; }
362 | 
363 | dt {
364 |   margin-bottom: -0.5em;
365 |   margin-left: 0.0em; }
366 | 
367 | dd {
368 |   margin-left: 2.0em;
369 |   margin-bottom: 3.0em;
370 |   margin-top: 0.5em; }
371 | 
372 | 
373 | hr {
374 |   margin: 2em 0;
375 |   border: 0;
376 |   border-top: 1px solid #aaa; }
377 | 
378 | blockquote {
379 |   font-size: 0.9em;
380 |   font-style: italic;
381 |   padding-left: 0.5em;
382 |   margin-left: 0;
383 |   border-left: 5px solid #bbc;
384 | }
385 | 
386 | .pre {
387 |   font-family: "Source Code Pro", Monaco, Menlo, Consolas, "Courier New", monospace;
388 |   font-weight: 500;
389 |   font-size: 0.85em;
390 |   background-color: #f0f3ff;
391 |   padding-left: 3px;
392 |   padding-right: 3px;
393 |   border-radius: 4px;
394 | }
395 | 
396 | pre {
397 |   font-family: "Source Code Pro", Monaco, Menlo, Consolas, "Courier New", monospace;
398 |   color: #222;
399 |   font-weight: 500;
400 |   display: inline-block;
401 |   box-sizing: border-box;
402 |   min-width: 100%;
403 |   padding: 0.5em;
404 |   margin-top: 0.5em;
405 |   margin-bottom: 0.5em;
406 |   font-size: 0.85em;
407 |   white-space: pre !important;
408 |   overflow-y: hidden;
409 |   overflow-x: visible;
410 |   background-color: ghostwhite;
411 |   border: 1px solid #dde;
412 |   -webkit-border-radius: 6px;
413 |   -moz-border-radius: 6px;
414 |   border-radius: 6px; }
415 | 
416 | .pre-scrollable {
417 |   max-height: 340px;
418 |   overflow-y: scroll; }
419 | 
420 | 
421 | /* Nim line-numbered tables */
422 | .line-nums-table {
423 |   width: 100%;
424 |   table-layout: fixed; }
425 | 
426 | table.line-nums-table {
427 |   border-radius: 4px;
428 |   border: 1px solid #cccccc;
429 |   background-color: ghostwhite;
430 |   border-collapse: separate;
431 |   margin-top: 15px;
432 |   margin-bottom: 25px; }
433 | 
434 | .line-nums-table tbody {
435 |   border: none; }
436 | 
437 | .line-nums-table td pre {
438 |   border: none;
439 |   background-color: transparent; }
440 | 
441 | .line-nums-table td.blob-line-nums {
442 |   width: 28px; }
443 | 
444 | .line-nums-table td.blob-line-nums pre {
445 |   color: #b0b0b0;
446 |   -webkit-filter: opacity(75%);
447 |   text-align: right;
448 |   border-color: transparent;
449 |   background-color: transparent;
450 |   padding-left: 0px;
451 |   margin-left: 0px;
452 |   padding-right: 0px;
453 |   margin-right: 0px; }
454 | 
455 | 
456 | table {
457 |   max-width: 100%;
458 |   background-color: transparent;
459 |   margin-top: 0.5em;
460 |   margin-bottom: 1.5em;
461 |   border-collapse: collapse;
462 |   border-color: #ccc;
463 |   border-spacing: 0;
464 |   font-size: 0.9em;
465 | }
466 | 
467 | table th, table td {
468 |   padding: 0px 0.5em 0px;
469 | }
470 | 
471 | table th {
472 |   background-color: #e8e8e8;
473 |   font-weight: bold; }
474 | 
475 | table th.docinfo-name {
476 |     background-color: transparent;
477 | }
478 | 
479 | table tr:hover {
480 |   background-color: ghostwhite; }
481 | 
482 | 
483 | /* rst2html default used to remove borders from tables and images */
484 | .borderless, table.borderless td, table.borderless th {
485 |   border: 0; }
486 | 
487 | table.borderless td, table.borderless th {
488 |   /* Override padding for "table.docutils td" with "! important".
489 |      The right padding separates the table cells. */
490 |   padding: 0 0.5em 0 0 !important; }
491 | 
492 | .first {
493 |   /* Override more specific margin styles with "! important". */
494 |   margin-top: 0 !important; }
495 | 
496 | .last, .with-subtitle {
497 |   margin-bottom: 0 !important; }
498 | 
499 | .hidden {
500 |   display: none; }
501 | 
502 | blockquote.epigraph {
503 |   margin: 2em 5em; }
504 | 
505 | dl.docutils dd {
506 |   margin-bottom: 0.5em; }
507 | 
508 | object[type="image/svg+xml"], object[type="application/x-shockwave-flash"] {
509 |   overflow: hidden; }
510 | 
511 | 
512 | div.figure {
513 |   margin-left: 2em;
514 |   margin-right: 2em; }
515 | 
516 | div.footer, div.header {
517 |   clear: both;
518 |   text-align: center;
519 |   color: #666;
520 |   font-size: smaller; }
521 | 
522 | div.footer {
523 |     padding-top: 5em;
524 | }
525 | 
526 | div.line-block {
527 |   display: block;
528 |   margin-top: 1em;
529 |   margin-bottom: 1em; }
530 | 
531 | div.line-block div.line-block {
532 |   margin-top: 0;
533 |   margin-bottom: 0;
534 |   margin-left: 1.5em; }
535 | 
536 | div.topic {
537 |   margin: 2em; }
538 | 
539 | div.search_results {
540 |   background-color: antiquewhite;
541 |   margin: 3em;
542 |   padding: 1em;
543 |   border: 1px solid #4d4d4d;
544 | }
545 | 
546 | div#global-links ul {
547 |   margin-left: 0;
548 |   list-style-type: none;
549 | }
550 | 
551 | div#global-links > .simple-boot { /* class selector: simple-boot is a class (cf. ul.simple-boot li) */
552 |     margin-left: 3em;
553 | }
554 | 
555 | hr.docutils {
556 |   width: 75%; }
557 | 
558 | img.align-left, .figure.align-left, object.align-left {
559 |   clear: left;
560 |   float: left;
561 |   margin-right: 1em; }
562 | 
563 | img.align-right, .figure.align-right, object.align-right {
564 |   clear: right;
565 |   float: right;
566 |   margin-left: 1em; }
567 | 
568 | img.align-center, .figure.align-center, object.align-center {
569 |   display: block;
570 |   margin-left: auto;
571 |   margin-right: auto; }
572 | 
573 | .align-left {
574 |   text-align: left; }
575 | 
576 | .align-center {
577 |   clear: both;
578 |   text-align: center; }
579 | 
580 | .align-right {
581 |   text-align: right; }
582 | 
583 | /* reset inner alignment in figures */
584 | div.align-right {
585 |   text-align: inherit; }
586 | 
587 | p.attribution {
588 |   text-align: right;
589 |   margin-left: 50%; }
590 | 
591 | p.caption {
592 |   font-style: italic; }
593 | 
594 | p.credits {
595 |   font-style: italic;
596 |   font-size: smaller; }
597 | 
598 | p.label {
599 |   white-space: nowrap; }
600 | 
601 | p.rubric {
602 |   font-weight: bold;
603 |   font-size: larger;
604 |   color: maroon;
605 |   text-align: center; }
606 | 
607 | p.topic-title {
608 |   font-weight: bold; }
609 | 
610 | pre.address {
611 |   margin-bottom: 0;
612 |   margin-top: 0;
613 |   font: inherit; }
614 | 
615 | pre.literal-block, pre.doctest-block, pre.math, pre.code {
616 |   margin-left: 2em;
617 |   margin-right: 2em; }
618 | 
619 | pre.code .ln {
620 |   color: grey; }
621 | 
622 | /* background shared by code blocks and inline code (line numbers: see .ln above) */
623 | pre.code, code {
624 |   background-color: #eeeeee; }
625 | 
626 | pre.code .comment, code .comment {
627 |   color: #5c6576; }
628 | 
629 | pre.code .keyword, code .keyword {
630 |   color: #3B0D06;
631 |   font-weight: bold; }
632 | 
633 | pre.code .literal.string, code .literal.string {
634 |   color: #0c5404; }
635 | 
636 | pre.code .name.builtin, code .name.builtin {
637 |   color: #352b84; }
638 | 
639 | pre.code .deleted, code .deleted {
640 |   background-color: #DEB0A1; }
641 | 
642 | pre.code .inserted, code .inserted {
643 |   background-color: #A3D289; }
644 | 
645 | span.classifier {
646 |   font-style: oblique; }
647 | 
648 | span.classifier-delimiter {
649 |   font-weight: bold; }
650 | 
651 | span.option {
652 |   white-space: nowrap; }
653 | 
654 | span.problematic {
655 |   color: #b30000; }
656 | 
657 | span.section-subtitle {
658 |   /* font-size relative to parent (h1..h6 element) */
659 |   font-size: 80%; }
660 | 
661 | span.DecNumber {
662 |   color: #252dbe; }
663 | 
664 | span.BinNumber {
665 |   color: #252dbe; }
666 | 
667 | span.HexNumber {
668 |   color: #252dbe; }
669 | 
670 | span.OctNumber {
671 |   color: #252dbe; }
672 | 
673 | span.FloatNumber {
674 |   color: #252dbe; }
675 | 
676 | span.Identifier {
677 |   color: #222; }
678 | 
679 | span.Keyword {
680 |   font-weight: 600;
681 |   color: #5e8f60; }
682 | 
683 | span.StringLit {
684 |   color: #a4255b; }
685 | 
686 | span.LongStringLit {
687 |   color: #a4255b; }
688 | 
689 | span.CharLit {
690 |   color: #a4255b; }
691 | 
692 | span.EscapeSequence {
693 |   color: black; }
694 | 
695 | span.Operator {
696 |   color: black; }
697 | 
698 | span.Punctuation {
699 |   color: black; }
700 | 
701 | span.Comment, span.LongComment {
702 |   font-style: italic;
703 |   font-weight: 400;
704 |   color: #484a86; }
705 | 
706 | span.RegularExpression {
707 |   color: darkviolet; }
708 | 
709 | span.TagStart {
710 |   color: darkviolet; }
711 | 
712 | span.TagEnd {
713 |   color: darkviolet; }
714 | 
715 | span.Key {
716 |   color: #252dbe; }
717 | 
718 | span.Value {
719 |   color: #252dbe; }
720 | 
721 | span.RawData {
722 |   color: #a4255b; }
723 | 
724 | span.Assembler {
725 |   color: #252dbe; }
726 | 
727 | span.Preprocessor {
728 |   color: #252dbe; }
729 | 
730 | span.Directive {
731 |   color: #252dbe; }
732 | 
733 | span.Command, span.Rule, span.Hyperlink, span.Label, span.Reference,
734 | span.Other {
735 |   color: black; }
736 | 
737 | /* Pop type, const, proc, and iterator defs in nim def blocks */
738 | dt pre > span.Identifier, dt pre > span.Operator {
739 |   color: #155da4;
740 |   font-weight: 700; }
741 | 
742 | dt pre > span.Keyword ~ span.Identifier, dt pre > span.Identifier ~ span.Identifier,
743 | dt pre > span.Operator ~ span.Identifier, dt pre > span.Other ~ span.Identifier {
744 |   color: inherit;
745 |   font-weight: inherit; }
746 | 
747 | /* Nim sprite for the footer (taken from main page favicon) */
748 | .nim-sprite {
749 |   display: inline-block;
750 |   height: 16px;
751 |   width: 16px;
752 |   background-position: 0 0;
753 |   background-size: 16px 16px;
754 |   -webkit-filter: opacity(50%);
755 |   background-repeat: no-repeat;
756 |   background-image: url("data:image/x-icon;base64,AAABAAEAEBAAAAEAIABoBAAAFgAAACgAAAAQAAAAIAAAAAEAIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AAAAAAUAAAAF////AP///wD///8A////AP///wD///8A////AP///wD///8A////AAAAAAIAAABbAAAAlQAAAKIAAACbAAAAmwAAAKIAAACVAAAAWwAAAAL///8A////AP///wD///8A////AAAAABQAAADAAAAAYwAAAA3///8A////AP///wD///8AAAAADQAAAGMAAADAAAAAFP///wD///8A////AP///wAAAACdAAAAOv///wD///8A////AP///wD///8A////AP///wD///8AAAAAOgAAAJ3///8A////AP///wAAAAAnAAAAcP///wAAAAAoAAAASv///wD///8A////AP///wAAAABKAAAAKP///wAAAABwAAAAJ////wD///8AAAAAgQAAABwAAACIAAAAkAAAAJMAAACtAAAAFQAAABUAAACtAAAAkwAAAJAAAACIAAAAHAAAAIH///8A////AAAAAKQAAACrAAAAaP///wD///8AAAAARQAAANIAAADSAAAARf///wD///8AAAAAaAAAAKsAAACk////AAAAADMAAACcAAAAnQAAABj///8A////AP///wAAAAAYAAAAGP///wD///8A////AAAAABgAAACdAAAAnAAAADMAAAB1AAAAwwAAAP8AAADpAAAAsQAAAE4AAAAb////AP///wAAAAAbAAAATgAAALEAAADpAAAA/wAAAMMAAAB1AAAAtwAAAOkAAAD/AAAA/wAAAP8AAADvAAAA3gAAAN4AAADeAAAA3gAAAO8AAAD/AAAA/wAAAP8AAADpAAAAtwAAAGUAAAA/AAAA3wAAAP8AAAD/AAAA/wAAAP8AAAD/AAAA/wAAAP8AAAD/AAAA/wAAAP8AAADfAAAAPwAAAGX///8A////AAAAAEgAAADtAAAAvwAAAL0AAADGAAAA7wAAAO8AAADGAAAAvQAAAL8AAADtAAAASP///wD///8A////AP///wD///8AAAAAO////wD///8A////AAAAAIcAAACH////AP///wD///8AAAAAO////wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A////AP///wD///8A//8AAP//AAD4HwAA7/cAAN/7AAD//wAAoYUAAJ55AACf+QAAh+EAAAAAAADAAwAA4AcAAP5/AAD//wAA//8AAA==");
757 |   margin-bottom: -5px; }
758 | 
759 | span.pragmadots { /* the clickable "..." placeholder shown in place of a pragma list */
760 |   /* Position: relative frees us up to make the dots
761 |   look really nice without breaking the layout and
762 |   causing bulging in the parent container */
763 |   position: relative;
764 |   /* 1px down looks slightly nicer */
765 |   top: 1px;
766 |   padding: 2px;
767 |   background-color: #e8e8e8;
768 |   border-radius: 4px;
769 |   margin: 0 2px;
770 |   cursor: pointer;
771 |   font-size: 0.8em;
772 | }
773 | 
774 | span.pragmadots:hover {
775 |   background-color: #DBDBDB;
776 | }
777 | span.pragmawrap {
778 |   display: none;
779 | }
780 | 
781 | span.attachedType {
782 |   display: none;
783 |   visibility: hidden;
784 | }
785 | </style>
786 | 
787 | <script type="text/javascript" src="dochack.js"></script>
788 | 
789 | <script type="text/javascript">
790 | function main() {
791 |   var pragmaDots = document.getElementsByClassName("pragmadots");
792 |   for (var i = 0; i < pragmaDots.length; i++) {
793 |     pragmaDots[i].onclick = function(event) {
794 |       // Hide tease
795 |       event.target.parentNode.style.display = "none";
796 |       // Show actual
797 |       event.target.parentNode.nextElementSibling.style.display = "inline";
798 |     }
799 |   }
800 | }
801 | </script>
802 | 
803 | </head>
804 | <body onload="main()">
805 | <div class="document" id="documentId">
806 |   <div class="container">
807 |     <h1 class="title">segmentation</h1>
808 |     <div class="row">
809 |   <div class="three columns">
810 |   <div id="global-links">
811 |     <ul class="simple">
812 |     </ul>
813 |   </div>
814 |   <div id="searchInputDiv">
815 |     Search: <input type="text" id="searchInput"
816 |       onkeyup="search()" />
817 |   </div>
818 |   <div>
819 |     Group by:
820 |     <select onchange="groupBy(this.value)">
821 |       <option value="section">Section</option>
822 |       <option value="type">Type</option>
823 |     </select>
824 |   </div>
825 |   <ul class="simple simple-toc" id="toc-list">
826 | <li>
827 |   <a class="reference reference-toplevel" href="#15" id="65">Iterators</a>
828 |   <ul class="simple simple-toc-section">
829 |       <li><a class="reference" href="#wordsBounds.i%2Cstring"
830 |     title="wordsBounds(s: string): Slice[int]"><wbr />words<wbr />Bounds<span class="attachedType"></span></a></li>
831 |   <li><a class="reference" href="#words.i%2Cstring"
832 |     title="words(s: string): string"><wbr />words<span class="attachedType"></span></a></li>
833 | 
834 |   </ul>
835 | </li>
836 | 
837 | </ul>
838 | 
839 |   </div>
840 |   <div class="nine columns" id="content">
841 |   <div id="tocRoot"></div>
842 |   
843 |   <p class="module-desc">This library implements Unicode Text Segmentation (tr29)</p>
844 |   <div class="section" id="15">
845 | <h1><a class="toc-backref" href="#15">Iterators</a></h1>
846 | <dl class="item">
847 | <a id="wordsBounds.i,string"></a>
848 | <dt><pre><span class="Keyword">iterator</span> <a href="#wordsBounds.i%2Cstring"><span class="Identifier">wordsBounds</span></a><span class="Other">(</span><span class="Identifier">s</span><span class="Other">:</span> <span class="Identifier">string</span><span class="Other">)</span><span class="Other">:</span> <span class="Identifier">Slice</span><span class="Other">[</span><span class="Identifier">int</span><span class="Other">]</span> <span><span class="Other">{</span><span class="Other pragmadots">...</span><span class="Other">}</span></span><span class="pragmawrap"><span class="Other">{.</span><span class="pragma"><span class="Identifier">inline</span><span class="Other">,</span> <span class="Identifier">raises</span><span class="Other">:</span> <span class="Other">[</span><span class="Other">]</span><span class="Other">,</span> <span class="Identifier">tags</span><span class="Other">:</span> <span class="Other">[</span><span class="Other">]</span></span><span class="Other">.}</span></span></pre></dt>
849 | <dd>
850 | 
851 | Return each word boundary in <tt class="docutils literal"><span class="pre">s</span></tt>. Boundaries are inclusive
852 | 
853 | </dd>
854 | <a id="words.i,string"></a>
855 | <dt><pre><span class="Keyword">iterator</span> <a href="#words.i%2Cstring"><span class="Identifier">words</span></a><span class="Other">(</span><span class="Identifier">s</span><span class="Other">:</span> <span class="Identifier">string</span><span class="Other">)</span><span class="Other">:</span> <span class="Identifier">string</span> <span><span class="Other">{</span><span class="Other pragmadots">...</span><span class="Other">}</span></span><span class="pragmawrap"><span class="Other">{.</span><span class="pragma"><span class="Identifier">inline</span><span class="Other">,</span> <span class="Identifier">raises</span><span class="Other">:</span> <span class="Other">[</span><span class="Other">]</span><span class="Other">,</span> <span class="Identifier">tags</span><span class="Other">:</span> <span class="Other">[</span><span class="Other">]</span></span><span class="Other">.}</span></span></pre></dt>
856 | <dd>
857 | 
858 | Return each word in <tt class="docutils literal"><span class="pre">s</span></tt>
859 | 
860 | </dd>
861 | 
862 | </dl></div>
863 | 
864 |   </div>
865 | </div>
866 | 
867 |     <div class="row">
868 |       <div class="twelve-columns footer">
869 |         <span class="nim-sprite"></span>
870 |         <br/>
871 |         <small>Made with Nim. Generated: 2020-02-15 14:05:26 UTC</small>
872 |       </div>
873 |     </div>
874 |   </div>
875 | </div>
876 | 
877 | </body>
878 | </html>
879 | 


--------------------------------------------------------------------------------