├── .formatter.exs ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE.md ├── README.md ├── benchee ├── casing.exs ├── compare.exs └── next.exs ├── config └── config.exs ├── lib ├── tasks │ └── download_dictionaries.ex └── unicode │ ├── break.ex │ ├── case │ ├── folding.ex │ ├── greek_upper.ex │ └── mapping.ex │ ├── dictionary.ex │ ├── segment.ex │ ├── string.ex │ └── trie.ex ├── logo.png ├── mix.exs ├── mix.lock ├── mix └── myapp_backend.ex ├── priv ├── dictionaries │ └── .gitkeep └── segments │ ├── de.xml │ ├── el.xml │ ├── en.xml │ ├── en_US.xml │ ├── en_US_POSIX.xml │ ├── es.xml │ ├── fi.xml │ ├── fr.xml │ ├── it.xml │ ├── ja.xml │ ├── pt.xml │ ├── root.xml │ ├── ru.xml │ ├── sv.xml │ ├── zh.xml │ └── zh_Hant.xml ├── test ├── casing_test.exs ├── line_break_test.exs ├── segment_test.exs ├── sentence_break_test.exs ├── support │ ├── test_data │ │ ├── grapheme_break_test.txt │ │ ├── line_break_test.txt │ │ ├── sentence_break_test.txt │ │ └── word_break_test.txt │ └── test_data_parser.ex ├── test_helper.exs ├── unicode_string_test.exs └── word_break_test.exs ├── update_segment_data └── update_test_data /.formatter.exs: -------------------------------------------------------------------------------- 1 | # Used by "mix format" 2 | [ 3 | inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"] 4 | ] 5 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | env: 13 | MIX_ENV: test 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | include: 18 | - pair: 19 | elixir: 1.18.1-otp-27 20 | otp: 27.2 21 | lint: lint 22 | steps: 23 | - uses: actions/checkout@v3 24 | 25 | - uses: erlef/setup-beam@v1 26 | with: 27 | otp-version: ${{matrix.pair.otp}} 28 | 
elixir-version: ${{matrix.pair.elixir}} 29 | 30 | - uses: actions/cache@v3 31 | with: 32 | path: | 33 | deps 34 | _build 35 | key: ${{ runner.os }}-mix-${{matrix.pair.elixir}}-${{matrix.pair.otp}}-${{ hashFiles('**/mix.lock') }} 36 | 37 | - run: mix deps.get 38 | 39 | - run: mix deps --check-unused 40 | if: ${{ matrix.lint }} 41 | 42 | - run: mix deps.compile 43 | 44 | - run: mix compile --warnings-as-errors 45 | if: ${{ matrix.lint }} 46 | 47 | - run: mix dialyzer 48 | if: ${{ matrix.lint }} 49 | 50 | - run: mix unicode.string.download.dictionaries 51 | 52 | - run: mix test 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /_build 2 | /cover 3 | /deps 4 | /doc 5 | /references 6 | *.snapshot 7 | erl_crash.dump 8 | *.ez 9 | *.tar 10 | .DS_Store 11 | .iex.exs 12 | 13 | # The xml downloaded from Unicode 14 | /downloads 15 | 16 | # Generated erlang source 17 | /src/*.erl 18 | 19 | # asdf 20 | .tool-versions 21 | 22 | # Don't store the dictionaries 23 | /priv/dictionaries/*.txt 24 | 25 | # Mise 26 | mise.toml 27 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## Unicode String v1.7.0 4 | 5 | This is the changelog for Unicode String v1.7.0 released on March 29th. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 6 | 7 | ### Bug Fixes 8 | 9 | * Converts all compile-time regex compilation to runtime to be compatible with OTP 28. Performance implications are not yet known. 10 | 11 | ## Unicode String v1.6.0 12 | 13 | This is the changelog for Unicode String v1.6.0 released on March 17th, 2025. 
For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 14 | 15 | ### Bug Fixes 16 | 17 | * Fix word break detection when a `\p{word_break=extend}` codepoint is preceeded by a letter and followed by a letter. 18 | 19 | ### Enhancements 20 | 21 | * Updated to [CLDR 47](https://cldr.unicode.org/downloads/cldr-47) break rules and test data. 22 | 23 | ## Unicode String v1.5.0 24 | 25 | This is the changelog for Unicode String v1.5.0 released on January 1st, 2025. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 26 | 27 | ### Enhancements 28 | 29 | * Update to CLDR 46.1 segmentation data and tests. 30 | 31 | * Pass dialyzer with `:underspecs` flag set. 32 | 33 | ## Unicode String v1.4.1 34 | 35 | This is the changelog for Unicode String v1.4.1 released on March 14th, 2024. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 36 | 37 | ### Bug Fixes 38 | 39 | * Fix performance regressing in `Uncode.String.Break.next/4`. Added the script `bench/next.exs` to allow for regression testing. Thanks to @mntns for the report. Closes #6. 40 | 41 | ## Unicode String v1.4.0 42 | 43 | This is the changelog for Unicode String v1.4.0 released on March 10th, 2024. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 44 | 45 | ### Enhancements 46 | 47 | * Adds dictionary-based work breaking for Chinese (zh, zh-Hant, zh-Hans, zh-Hant-HK, yue, yue-Hans), Japanese (ja), Thai (th), Lao (lo), Khmer (km) and Burmese (my). These languages don't typically use whitespace to separate words so a dictionary lookup is more appropriate - although not perfect. The same dictionary is used for Chinese and Japanese. 
The dictionaries implemented are those used in the [CLDR](https://cldr.unicode.org) since they are under an open source license and also for consistency with [ICU](https://icu.unicode.org). Note that these dictionaries need to be downloaded with `mix unicode.string.download.dictionaries` prior to use. Each dictionary will be parsed and loaded into [persistent_term](https://www.erlang.org/doc/man/persistent_term) on demand. Each dictionary has a sizable memory footprint as measured by `:persistent_term.info/0`: 48 | 49 | | Dictionary | Memory Mb | 50 | | ----------- | ----------: | 51 | | Chinese | 104.8 | 52 | | Thai | 9.6 | 53 | | Lao | 11.4 | 54 | | Khmer | 38.8 | 55 | | Burmese | 23.1 | 56 | 57 | ## Unicode String v1.3.1 58 | 59 | This is the changelog for Unicode String v1.3.1 released on March 6th, 2024. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 60 | 61 | ### Bug Fixes 62 | 63 | * Fix `Unicode.String.split/2` and `Unicode.String.next/2` when the passing rule is `:no_break` rule. Thanks to @GregLMcDonald for the report. Closes #5. 64 | 65 | ## Unicode String v1.3.0 66 | 67 | This is the changelog for Unicode String v1.3.0 released on February 27th, 2024. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 68 | 69 | ### Bug Fixes 70 | 71 | * Fix case folding for codepoints that fold to themselves. 72 | 73 | ### Enhancements 74 | 75 | * Adds case mapping functions `Unicode.String.upcase/2`, `Unicode.String.downcase/2` and `Unicode/String.titlecase/2`. These functions implement the full [Unicode Casing algorithm](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf) including conditiional mappings. 
They are locale-aware and a locale can be specified as a string, atom or a [Cldr.LanguageTag](https://hexdocs.pm/ex_cldr/Cldr.LanguageTag.html) thereby providing basic integration between `unicode_string` and [ex_cldr](https://hex.pm/packages/ex_cldr). 76 | 77 | * Case folding always follows the `:full` path which allows mapping of single code points to multiple code points. There is no practical reason to implement the `:simple` path. As a result, the `type` parameter to `Unicode.String.Case.Folding.fold/2` is no longer required or supported. 78 | 79 | * Support an [ex_cldr](https://hex.pm/packages/ex_cldr) [Language Tag](https://hexdocs.pm/ex_cldr/Cldr.LanguageTag.html) as a parameter to `Unicode.String.Case.Folding.fold/2`. In fact any map that has a `:language` key with a value that is an [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) language code as a lower cased atom may be passed as a parameter. 80 | 81 | ## Unicode String v1.2.1 82 | 83 | This is the changelog for Unicode String v1.2.1 released on June 2nd, 2023. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 84 | 85 | ### Bug Fixes 86 | 87 | * Resolve segments dir at runtime, not compile time. Thanks to @crkent for the report. Closes #4. 88 | 89 | ## Unicode String v1.2.0 90 | 91 | This is the changelog for Unicode String v1.2.0 released on March 14th, 2023. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 92 | 93 | ### Enhancements 94 | 95 | * Adds `Unicode.String.stream/2` to support streaming graphemes, words, sentences and line breaks. 96 | 97 | ## Unicode String v1.1.0 98 | 99 | This is the changelog for Unicode String v1.1.0 released on September 21st, 2022. 
For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 100 | 101 | ### Enhancements 102 | 103 | * Updates the segmentation supplemental data (including locales) for CLDR. This adds the "sv" and "fi" locale data for sentence break suppressions. 104 | 105 | ## Unicode String v1.0.1 106 | 107 | This is the changelog for Unicode String v1.0.1 released on September 15th, 2021. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 108 | 109 | ### Bug Fixes 110 | 111 | * Woops, the priv/segments directory was not included in the build artifact 112 | 113 | ## Unicode String v1.0.0 114 | 115 | This is the changelog for Unicode String v1.0.0 released on September 14th, 2021. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 116 | 117 | ### Enhancements 118 | 119 | * Update to use [Unicode 14](https://unicode.org/versions/Unicode14.0.0) release data. 120 | 121 | ## Unicode String v0.3.0 122 | 123 | This is the changelog for Unicode String v0.3.0 released on October 11th, 2020. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 124 | 125 | ### Bug Fixes 126 | 127 | * Correct deps and docs to align with Elixir 1.11 and recent releases of `ex_unicode`. 128 | 129 | # Unicode String v0.2.0 130 | 131 | This is the changelog for Unicode String v0.2.0 released on July 12th, 2020. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 132 | 133 | ### Enhancements 134 | 135 | This release implements the Unicode break rules for graphemes, words, lines (word-wrapping) and sentences. 
136 | 137 | * Adds `Unicode.String.split/2` 138 | 139 | * Adds `Unicode.String.break?/2` 140 | 141 | * Adds `Unicode.String.break/2` 142 | 143 | * Adds `Unicode.String.splitter/2` 144 | 145 | * Adds `Unicode.String.next/2` 146 | 147 | # Unicode String v0.1.0 148 | 149 | This is the changelog for Unicode String v0.1.0 released on May 17th, 2020. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 150 | 151 | ### Enhancements 152 | 153 | * Initial release 154 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # License 2 | 3 | Copyright 2018-2023 Kip Cole 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 6 | compliance with the License. You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software distributed under the License 11 | is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | implied. See the License for the specific language governing permissions and limitations under the 13 | License. 
14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Unicode String 2 | 3 | ![Build status](https://github.com/elixir-unicode/unicode_string/actions/workflows/ci.yml/badge.svg) 4 | [![Hex.pm](https://img.shields.io/hexpm/v/unicode_string.svg)](https://hex.pm/packages/unicode_string) 5 | [![Hex.pm](https://img.shields.io/hexpm/dw/unicode_string.svg?)](https://hex.pm/packages/unicode_string) 6 | [![Hex.pm](https://img.shields.io/hexpm/l/unicode_string.svg)](https://hex.pm/packages/unicode_string) 7 | 8 | Adds functions supporting some string algorithms in the Unicode standard. For example: 9 | 10 | * The [Unicode Case Folding](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf) algorithm to provide case-independent equality checking irrespective of language or script with `Unicode.String.fold/2` and `Unicode.String.equals_ignoring_case?/2` 11 | 12 | * The [Unicode Code Mapping](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf) algorithm that implements locale-aware `Unicode.String.upcase/2`, `Unicode.String.downcase/2` and `Unicode.String.titlecase/2`. 13 | 14 | * The [Unicode Segmentation](https://unicode.org/reports/tr29/) algorithm to detect, break, split or stream strings into grapheme clusters, words and sentences. 15 | 16 | * The [Unicode Line Breaking](https://www.unicode.org/reports/tr14/) algorithm to determine line breaks (breaks meaning where word-wrapping would be acceptable). 17 | 18 | ## Installation 19 | 20 | The package can be installed by adding `:unicode_string` to your list of dependencies in `mix.exs`: 21 | 22 | ```elixir 23 | def deps do 24 | [ 25 | {:unicode_string, "~> 1.0"}, 26 | ... 27 | ] 28 | end 29 | ``` 30 | 31 | Then run `mix dep.get`. 
32 | 33 | > #### Word Break Dictionary Download {: .info} 34 | > 35 | > If you plan to perform word break segmentation on Chinese, Japanese, Lao, 36 | > Burmese, Thai or Khmer languages you will need to download the word break dictionaries 37 | > by running `mix unicode.string.download.dictionaries`. 38 | 39 | ## Casing 40 | 41 | ### Case Folding 42 | 43 | The [Unicode Case Folding](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf) algorithm defines how to perform case folding. This allows comparison of strings in a case-insensitive fashion. It does not define the means to compare ignoring diacritical marks (accents). Some examples follow, for details see: 44 | 45 | * `Unicode.String.fold/2` 46 | * `Unicode.String.equals_ignoring_case?/3` 47 | 48 | > #### Note {: .info} 49 | > 50 | > Although the folding algorithm commonly downcases characters, folding is not a general purpose downcasing process. It exists only to facilitate case insensitive string comparison. 51 | 52 | 53 | ```elixir 54 | iex> Unicode.String.equals_ignoring_case? "ABC", "abc" 55 | true 56 | 57 | iex> Unicode.String.equals_ignoring_case? "beißen", "beissen" 58 | true 59 | 60 | iex> Unicode.String.equals_ignoring_case? "grüßen", "grussen" 61 | false 62 | ``` 63 | 64 | ### Case Mapping 65 | 66 | The [Unicode Case Mapping](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf) algorithm defines the process and data to transform text into upper case, lower case or title case. Since most languages are not bicameral, characters which have no case mapping remain unchanged. 67 | 68 | Three case mapping functions are provided: 69 | 70 | * `Unicode.String.upcase/2` which will convert text to upper case characters. 71 | * `Unicode.String.downcase/2` which will convert text to lower case characters. 72 | * `Unicode.String.titlecase/2` which will convert text to title case. 
Title case means that the first character of each word is set to upper case
Some examples follow, for details see: 111 | 112 | * `Unicode.String.split/2` 113 | * `Unicode.String.break?/2` 114 | * `Unicode.String.break/2` 115 | * `Unicode.String.splitter/2` 116 | * `Unicode.String.next/2` 117 | * `Unicode.String.stream/2` 118 | 119 | ```elixir 120 | # Split text at a word boundary. 121 | iex> Unicode.String.split "This is a sentence. And another.", break: :word 122 | ["This", " ", "is", " ", "a", " ", "sentence", ".", " ", "And", " ", "another", "."] 123 | 124 | # Split text at a word boundary but omit any whitespace 125 | iex> Unicode.String.split "This is a sentence. And another.", break: :word, trim: true 126 | ["This", "is", "a", "sentence", ".", "And", "another", "."] 127 | 128 | # Split text at a sentence boundary. 129 | iex> Unicode.String.split "This is a sentence. And another.", break: :sentence 130 | ["This is a sentence. ", "And another."] 131 | 132 | # By default, common abbreviations are suppressed (ie 133 | # they do not cause a break) 134 | iex> Unicode.String.split "No, I don't have a Ph.D. but I don't think it matters.", break: :word, trim: true 135 | ["No", ",", "I", "don't", "have", "a", "Ph.D", ".", "but", "I", "don't", 136 | "think", "it", "matters", "."] 137 | 138 | iex> Unicode.String.split "No, I don't have a Ph.D. but I don't think it matters.", break: :sentence, trim: true 139 | ["No, I don't have a Ph.D. but I don't think it matters."] 140 | 141 | # Sentence Break suppressions are locale sensitive. 142 | iex> Unicode.String.Segment.known_locales 143 | ["de", "el", "en", "en-US", "en-US-POSIX", "es", "fi", "fr", "it", "ja", "pt", 144 | "root", "ru", "sv", "zh", "zh-Hant"] 145 | 146 | iex> Unicode.String.split "Non, c'est M. Dubois.", break: :sentence, trim: true, locale: "fr" 147 | ["Non, c'est M. Dubois."] 148 | 149 | # Note that break: :line does NOT mean split the string 150 | # at newlines. It splits the string where a line break would be 151 | # acceptable. 
This is very useful for calculating where 152 | # to perform word-wrap on some text. 153 | iex> Unicode.String.split "This is a sentence. And another.", break: :line 154 | ["This ", "is ", "a ", "sentence. ", "And ", "another."] 155 | ``` 156 | 157 | ### Dictionary-based word segmentation 158 | 159 | Some languages, commonly east asian languages, don't typically use whitespace to separate words so a dictionary lookup is more appropriate - although not perfect. 160 | 161 | This implementation supports dictionary-based word breaking for: 162 | 163 | * Chinese (`zh`, `zh-Hant`, `zh-Hans`, `zh-Hant-HK`, `yue`, `yue-Hans`) locales, 164 | * Japanese (`ja`) using the same dictionary as for Chinese, 165 | * Thai (`th`), 166 | * Lao (`lo`), 167 | * Khmer (`km`) and 168 | * Burmese (`my`). 169 | 170 | The dictionaries implemented are those used in the [CLDR](https://cldr.unicode.org) since they are under an open source license and also for consistency with [ICU](https://icu.unicode.org). 171 | 172 | Note that these dictionaries need to be downloaded with `mix unicode.string.download.dictionaries` prior to use. Each dictionary will be parsed and loaded into [persistent_term](https://www.erlang.org/doc/man/persistent_term) on demand. Note that each dictionary has a sizable memory footprint as measured by `:persistent_term.info/0`: 173 | 174 | | Dictionary | Memory Mb | 175 | | ----------- | ----------: | 176 | | Chinese | 104.8 | 177 | | Thai | 9.6 | 178 | | Lao | 11.4 | 179 | | Khmer | 38.8 | 180 | | Burmese | 23.1 | 181 | 182 | ## Segment Streaming 183 | 184 | Segmentation can also be streamed using `Unicode.String.stream/2`. For large strings this may improve memory usage since the intermediate segments will be garbage collected when they fall out of scope. 
185 | 186 | ```elixir 187 | iex> Enum.to_list Unicode.String.stream("this is a list of words", trim: true) ["this", "is", "a", "list", "of", "words"] 188 | 189 | iex> Enum.map Unicode.String.stream("this is a list of words", trim: true), 190 | ...> fn word -> %{word: word, length: String.length(word)} end 191 | [ 192 | %{length: 4, word: "this"}, 193 | %{length: 2, word: "is"}, 194 | %{length: 1, word: "a"}, 195 | %{length: 3, word: "list"}, 196 | %{length: 2, word: "of"}, 197 | %{length: 5, word: "words"} 198 | ] 199 | ``` 200 | 201 | ## References 202 | 203 | * Unicode maintains a [break testing utility](https://util.unicode.org/UnicodeJsps/breaks.jsp). 204 | 205 | -------------------------------------------------------------------------------- /benchee/casing.exs: -------------------------------------------------------------------------------- 1 | s = "THIS IS A STRING WE ARE GOING TO DOWNCASE WITH CHARACTERS THAT HAVE MAPPING AND THOSE THAT DONT 1234^&*&^%$)(*)}" 2 | 3 | Benchee.run(%{ 4 | "Unicode.String.Case.Mapping.downcase" => 5 | fn -> Unicode.String.Case.Mapping.downcase(s) end, 6 | "String.downcase default mode" => 7 | fn -> String.downcase(s) end, 8 | "String.downcase ASCII mode" => 9 | fn -> String.downcase(s, :ascii) end, 10 | }) 11 | 12 | -------------------------------------------------------------------------------- /benchee/compare.exs: -------------------------------------------------------------------------------- 1 | s1 = "ABC" 2 | s2 = "abc" 3 | 4 | Benchee.run(%{ 5 | "Unicode.String.equal_ignoring_case?" 
=> 6 | fn -> Unicode.String.equals_ignoring_case?(s1, s2) end, 7 | "String.==" => 8 | fn -> s1 == s2 end, 9 | "String.downcase compare" => 10 | fn -> String.downcase(s1) == String.downcase(s2) end, 11 | }) -------------------------------------------------------------------------------- /benchee/next.exs: -------------------------------------------------------------------------------- 1 | Benchee.run(%{ 2 | "Unicode.String.Break.next/4" => 3 | fn -> Unicode.String.Break.next("test123 ", "root", :word, []) end, 4 | }) -------------------------------------------------------------------------------- /config/config.exs: -------------------------------------------------------------------------------- 1 | import Config 2 | 3 | config :ex_cldr, 4 | default_backend: MyApp.Cldr 5 | -------------------------------------------------------------------------------- /lib/tasks/download_dictionaries.ex: -------------------------------------------------------------------------------- 1 | defmodule Mix.Tasks.Unicode.String.Download.Dictionaries do 2 | @moduledoc """ 3 | Downloads the ICU (Unicode) dictionaries supporting word breaks 4 | for Chinese, Japanese, Thai, Burmese and Laotion languages. 
5 | 6 | """ 7 | 8 | use Mix.Task 9 | require Logger 10 | 11 | @shortdoc "Download Unicode ICU Word Break Dictionaries" 12 | 13 | @root_url "https://raw.githubusercontent.com/unicode-org/icu/main/icu4c/source/data/brkitr/dictionaries" 14 | 15 | @unicode_unsafe_https "UNICODE_UNSAFE_HTTPS" 16 | @unicode_default_timeout "120000" 17 | @unicode_default_connection_timeout "60000" 18 | 19 | @app_name :unicode_string 20 | 21 | @doc false 22 | def run(_) do 23 | Application.ensure_all_started(:inets) 24 | Application.ensure_all_started(:ssl) 25 | 26 | Enum.each(required_files(), &download_file/1) 27 | end 28 | 29 | defp required_files do 30 | [ 31 | {Path.join(root_url(), "/thaidict.txt"), data_path("thai.txt")}, 32 | {Path.join(root_url(), "/laodict.txt"), data_path("lao.txt")}, 33 | {Path.join(root_url(), "/khmerdict.txt"), data_path("khmer.txt")}, 34 | {Path.join(root_url(), "/cjdict.txt"), data_path("chinese_japanese.txt")}, 35 | {Path.join(root_url(), "/burmesedict.txt"), data_path("burmese.txt")} 36 | ] 37 | end 38 | 39 | def root_url do 40 | @root_url 41 | end 42 | 43 | defp download_file({url, destination}) do 44 | case get(url) do 45 | {:ok, body} -> 46 | File.write!(destination, body) 47 | Logger.info("Downloaded #{inspect(url)} to #{inspect(destination)}") 48 | {:ok, destination} 49 | 50 | error -> 51 | error 52 | end 53 | end 54 | 55 | @doc """ 56 | Securely download https content from 57 | a URL. 58 | 59 | This function uses the built-in `:httpc` 60 | client but enables certificate verification 61 | which is not enabled by `:httpc` by default. 62 | 63 | See also https://erlef.github.io/security-wg/secure_coding_and_deployment_hardening/ssl 64 | 65 | ### Arguments 66 | 67 | * `url` is a binary URL or a `{url, list_of_headers}` tuple. If 68 | provided the headers are a list of `{'header_name', 'header_value'}` 69 | tuples. Note that the name and value are both charlists, not 70 | strings. 71 | 72 | * `options` is a keyword list of options. 
73 | 74 | ### Options 75 | 76 | * `:verify_peer` is a boolean value indicating 77 | if peer verification should be done for this request. 78 | The default is `true` in which case the default 79 | `:ssl` options follow the [erlef guidelines](https://erlef.github.io/security-wg/secure_coding_and_deployment_hardening/ssl) 80 | noted above. 81 | 82 | * `:timeout` is the number of milliseconds available 83 | for the request to complete. The default is 84 | #{inspect @unicode_default_timeout}. This option may also be 85 | set with the `CLDR_HTTP_TIMEOUT` environment variable. 86 | 87 | * `:connection_timeout` is the number of milliseconds 88 | available for the a connection to be estabklished to 89 | the remote host. The default is #{inspect @unicode_default_connection_timeout}. 90 | This option may also be set with the 91 | `CLDR_HTTP_CONNECTION_TIMEOUT` environment variable. 92 | 93 | ### Returns 94 | 95 | * `{:ok, body}` if the return is successful. 96 | 97 | * `{:not_modified, headers}` if the request would result in 98 | returning the same results as one matching an etag. 99 | 100 | * `{:error, error}` if the download is 101 | unsuccessful. An error will also be logged 102 | in these cases. 103 | 104 | ### Unsafe HTTPS 105 | 106 | If the environment variable `CLDR_UNSAFE_HTTPS` is 107 | set to anything other than `FALSE`, `false`, `nil` 108 | or `NIL` then no peer verification of certificates 109 | is performed. Setting this variable is not recommended 110 | but may be required is where peer verification for 111 | unidentified reasons. Please [open an issue](https://github.com/elixir-cldr/cldr/issues) 112 | if this occurs. 113 | 114 | ### Certificate stores 115 | 116 | In order to keep dependencies to a minimum, 117 | `get/1` attempts to locate an already installed 118 | certificate store. It will try to locate a 119 | store in the following order which is intended 120 | to satisfy most host systems. 
The certificate 121 | store is expected to be a path name on the 122 | host system. 123 | 124 | ```elixir 125 | # A certificate store configured by the 126 | # developer 127 | Application.get_env(:ex_cldr, :cacertfile) 128 | 129 | # Populated if hex package `CAStore` is configured 130 | CAStore.file_path() 131 | 132 | # Populated if hex package `certfi` is configured 133 | :certifi.cacertfile() 134 | 135 | # Debian/Ubuntu/Gentoo etc. 136 | "/etc/ssl/certs/ca-certificates.crt", 137 | 138 | # Fedora/RHEL 6 139 | "/etc/pki/tls/certs/ca-bundle.crt", 140 | 141 | # OpenSUSE 142 | "/etc/ssl/ca-bundle.pem", 143 | 144 | # OpenELEC 145 | "/etc/pki/tls/cacert.pem", 146 | 147 | # CentOS/RHEL 7 148 | "/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem", 149 | 150 | # Open SSL on MacOS 151 | "/usr/local/etc/openssl/cert.pem", 152 | 153 | # MacOS & Alpine Linux 154 | "/etc/ssl/cert.pem" 155 | ``` 156 | 157 | """ 158 | @spec get(String.t | {String.t, list()}, options :: Keyword.t) :: 159 | {:ok, binary} | {:not_modified, any()} | {:error, any} 160 | 161 | def get(url, options \\ []) 162 | 163 | def get(url, options) when is_binary(url) and is_list(options) do 164 | case get_with_headers(url, options) do 165 | {:ok, _headers, body} -> {:ok, body} 166 | other -> other 167 | end 168 | end 169 | 170 | def get({url, headers}, options) when is_binary(url) and is_list(headers) and is_list(options) do 171 | case get_with_headers({url, headers}, options) do 172 | {:ok, _headers, body} -> {:ok, body} 173 | other -> other 174 | end 175 | end 176 | 177 | @doc """ 178 | Securely download https content from 179 | a URL. 180 | 181 | This function uses the built-in `:httpc` 182 | client but enables certificate verification 183 | which is not enabled by `:httc` by default. 184 | 185 | See also https://erlef.github.io/security-wg/secure_coding_and_deployment_hardening/ssl 186 | 187 | ### Arguments 188 | 189 | * `url` is a binary URL or a `{url, list_of_headers}` tuple. 
If 190 | provided the headers are a list of `{'header_name', 'header_value'}` 191 | tuples. Note that the name and value are both charlists, not 192 | strings. 193 | 194 | * `options` is a keyword list of options. 195 | 196 | ### Options 197 | 198 | * `:verify_peer` is a boolean value indicating 199 | if peer verification should be done for this request. 200 | The default is `true` in which case the default 201 | `:ssl` options follow the [erlef guidelines](https://erlef.github.io/security-wg/secure_coding_and_deployment_hardening/ssl) 202 | noted above. 203 | 204 | * `:timeout` is the number of milliseconds available 205 | for the request to complete. The default is 206 | #{inspect @unicode_default_timeout}. This option may also be 207 | set with the `CLDR_HTTP_TIMEOUT` environment variable. 208 | 209 | * `:connection_timeout` is the number of milliseconds 210 | available for the a connection to be estabklished to 211 | the remote host. The default is #{inspect @unicode_default_connection_timeout}. 212 | This option may also be set with the 213 | `CLDR_HTTP_CONNECTION_TIMEOUT` environment variable. 214 | 215 | * `:https_proxy` is the URL of an https proxy to be used. The 216 | default is `nil`. 217 | 218 | ### Returns 219 | 220 | * `{:ok, body, headers}` if the return is successful. 221 | 222 | * `{:not_modified, headers}` if the request would result in 223 | returning the same results as one matching an etag. 224 | 225 | * `{:error, error}` if the download is 226 | unsuccessful. An error will also be logged 227 | in these cases. 228 | 229 | ### Unsafe HTTPS 230 | 231 | If the environment variable `CLDR_UNSAFE_HTTPS` is 232 | set to anything other than `FALSE`, `false`, `nil` 233 | or `NIL` then no peer verification of certificates 234 | is performed. Setting this variable is not recommended 235 | but may be required is where peer verification for 236 | unidentified reasons. Please [open an issue](https://github.com/elixir-cldr/cldr/issues) 237 | if this occurs. 
238 | 239 | ### Https Proxy 240 | 241 | `Cldr.Http.get/2` will look for a proxy URL in the following 242 | locations in the order presented: 243 | 244 | * `options[:https_proxy]` 245 | * `ex_cldr` compile-time configuration under the 246 | key `:ex_cldr[:https_proxy]` 247 | * The environment variable `HTTPS_PROXY` 248 | * The environment variable `https_proxy` 249 | 250 | ### Certificate stores 251 | 252 | In order to keep dependencies to a minimum, 253 | `get/1` attempts to locate an already installed 254 | certificate store. It will try to locate a 255 | store in the following order which is intended 256 | to satisfy most host systems. The certificate 257 | store is expected to be a path name on the 258 | host system. 259 | 260 | ```elixir 261 | # A certificate store configured by the 262 | # developer 263 | Application.get_env(:ex_cldr, :cacertfile) 264 | 265 | # Populated if hex package `CAStore` is configured 266 | CAStore.file_path() 267 | 268 | # Populated if hex package `certifi` is configured 269 | :certifi.cacertfile() 270 | 271 | # Debian/Ubuntu/Gentoo etc. 
272 | "/etc/ssl/certs/ca-certificates.crt", 273 | 274 | # Fedora/RHEL 6 275 | "/etc/pki/tls/certs/ca-bundle.crt", 276 | 277 | # OpenSUSE 278 | "/etc/ssl/ca-bundle.pem", 279 | 280 | # OpenELEC 281 | "/etc/pki/tls/cacert.pem", 282 | 283 | # CentOS/RHEL 7 284 | "/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem", 285 | 286 | # Open SSL on MacOS 287 | "/usr/local/etc/openssl/cert.pem", 288 | 289 | # MacOS & Alpine Linux 290 | "/etc/ssl/cert.pem" 291 | ``` 292 | 293 | """ 294 | @doc since: "2.21.0" 295 | 296 | @spec get_with_headers(String.t | {String.t, list()}, options :: Keyword.t) :: 297 | {:ok, list(), binary} | {:not_modified, any()} | {:error, any} 298 | 299 | def get_with_headers(request, options \\ []) 300 | 301 | def get_with_headers(url, options) when is_binary(url) do 302 | get_with_headers({url, []}, options) 303 | end 304 | 305 | def get_with_headers({url, headers}, options) when is_binary(url) and is_list(headers) and is_list(options) do 306 | require Logger 307 | 308 | hostname = String.to_charlist(URI.parse(url).host) 309 | url = String.to_charlist(url) 310 | http_options = http_opts(hostname, options) 311 | https_proxy = https_proxy(options) 312 | 313 | if https_proxy do 314 | case URI.parse(https_proxy) do 315 | %{host: host, port: port} when is_binary(host) and is_integer(port) -> 316 | :httpc.set_options([{:https_proxy, {{String.to_charlist(host), port}, []}}]) 317 | _other -> 318 | Logger.bare_log(:warning, "https_proxy was set to an invalid value. Found #{inspect https_proxy}.") 319 | end 320 | end 321 | 322 | case :httpc.request(:get, {url, headers}, http_options, []) do 323 | {:ok, {{_version, 200, _}, headers, body}} -> 324 | {:ok, headers, body} 325 | 326 | {:ok, {{_version, 304, _}, headers, _body}} -> 327 | {:not_modified, headers} 328 | 329 | {_, {{_version, code, message}, _headers, _body}} -> 330 | Logger.bare_log( 331 | :error, 332 | "Failed to download #{inspect url}. 
" <> 333 | "HTTP Error: (#{code}) #{inspect(message)}" 334 | ) 335 | 336 | {:error, code} 337 | 338 | {:error, {:failed_connect, [{_, {host, _port}}, {_, _, sys_message}]}} -> 339 | if sys_message == :timeout do 340 | Logger.bare_log( 341 | :error, 342 | "Timeout connecting to #{inspect(host)} to download #{inspect url}. " <> 343 | "Connection time exceeded #{http_options[:connect_timeout]}ms." 344 | ) 345 | 346 | {:error, :connection_timeout} 347 | else 348 | Logger.bare_log( 349 | :error, 350 | "Failed to connect to #{inspect(host)} to download #{inspect url}" 351 | ) 352 | 353 | {:error, sys_message} 354 | end 355 | 356 | {:error, {other}} -> 357 | Logger.bare_log( 358 | :error, 359 | "Failed to download #{inspect url}. Error #{inspect other}" 360 | ) 361 | 362 | {:error, other} 363 | 364 | {:error, :timeout} -> 365 | Logger.bare_log( 366 | :error, 367 | "Timeout downloading from #{inspect url}. " <> 368 | "Request exceeded #{http_options[:timeout]}ms." 369 | ) 370 | {:error, :timeout} 371 | end 372 | end 373 | 374 | @static_certificate_locations [ 375 | # Debian/Ubuntu/Gentoo etc. 
376 | "/etc/ssl/certs/ca-certificates.crt", 377 | 378 | # Fedora/RHEL 6 379 | "/etc/pki/tls/certs/ca-bundle.crt", 380 | 381 | # OpenSUSE 382 | "/etc/ssl/ca-bundle.pem", 383 | 384 | # OpenELEC 385 | "/etc/pki/tls/cacert.pem", 386 | 387 | # CentOS/RHEL 7 388 | "/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem", 389 | 390 | # Open SSL on MacOS 391 | "/usr/local/etc/openssl/cert.pem", 392 | 393 | # MacOS & Alpine Linux 394 | "/etc/ssl/cert.pem" 395 | ] 396 | 397 | defp dynamic_certificate_locations do 398 | [ 399 | # Configured cacertfile 400 | Application.get_env(:ex_cldr, :cacertfile), 401 | 402 | # Populated if hex package CAStore is configured 403 | if(Code.ensure_loaded?(CAStore), do: apply(CAStore, :file_path, [])), 404 | 405 | # Populated if hex package certfi is configured 406 | if(Code.ensure_loaded?(:certifi), do: apply(:certifi, :cacertfile, []) |> List.to_string()) 407 | ] 408 | |> Enum.reject(&is_nil/1) 409 | end 410 | 411 | def certificate_locations() do 412 | dynamic_certificate_locations() ++ @static_certificate_locations 413 | end 414 | 415 | @doc false 416 | defp certificate_store do 417 | certificate_locations() 418 | |> Enum.find(&File.exists?/1) 419 | |> raise_if_no_cacertfile! 420 | |> :erlang.binary_to_list() 421 | end 422 | 423 | defp raise_if_no_cacertfile!(nil) do 424 | raise RuntimeError, """ 425 | No certificate trust store was found. 426 | Tried looking for: #{inspect(certificate_locations())} 427 | 428 | A certificate trust store is required in 429 | order to download locales for your configuration. 430 | 431 | Since ex_cldr could not detect a system 432 | installed certificate trust store one of the 433 | following actions may be taken: 434 | 435 | 1. Install the hex package `castore`. It will 436 | be automatically detected after recompilation. 437 | 438 | 2. Install the hex package `certifi`. It will 439 | be automatically detected after recomilation. 440 | 441 | 3. 
Specify the location of a certificate trust store 442 | by configuring it in `config.exs` or `runtime.exs`: 443 | 444 | config :ex_cldr, 445 | cacertfile: "/path/to/cacertfile", 446 | ... 447 | 448 | """ 449 | end 450 | 451 | defp raise_if_no_cacertfile!(file) do 452 | file 453 | end 454 | 455 | # Builds the :httpc option list. Env var names must match the ones the 456 | # @doc for get/2 promises (CLDR_*), not the TZWORLD_* names. 457 | defp http_opts(hostname, options) do 458 | default_timeout = 459 | "CLDR_HTTP_TIMEOUT" 460 | |> System.get_env(@unicode_default_timeout) 461 | |> String.to_integer() 462 | 463 | default_connection_timeout = 464 | "CLDR_HTTP_CONNECTION_TIMEOUT" 465 | |> System.get_env(@unicode_default_connection_timeout) 466 | |> String.to_integer() 467 | 468 | verify_peer? = Keyword.get(options, :verify_peer, true) 469 | ssl_options = https_ssl_opts(hostname, verify_peer?) 470 | timeout = Keyword.get(options, :timeout, default_timeout) 471 | connection_timeout = Keyword.get(options, :connection_timeout, default_connection_timeout) 472 | 473 | [timeout: timeout, connect_timeout: connection_timeout, ssl: ssl_options] 474 | end 475 | 476 | @doc false 477 | def user_agent do 478 | "erlang httpc/unicode OTP version #{otp_version()}" 479 | |> String.to_charlist() 480 | end 481 | 482 | defp https_ssl_opts(hostname, verify_peer?) do 483 | if secure_ssl?() and verify_peer? 
do 482 | [ 483 | verify: :verify_peer, 484 | cacertfile: certificate_store(), 485 | depth: 4, 486 | ciphers: preferred_ciphers(), 487 | versions: protocol_versions(), 488 | eccs: preferred_eccs(), 489 | reuse_sessions: true, 490 | server_name_indication: hostname, 491 | secure_renegotiate: true, 492 | customize_hostname_check: [ 493 | match_fun: :public_key.pkix_verify_hostname_match_fun(:https) 494 | ] 495 | ] 496 | else 497 | # Peer verification disabled: no cacertfile/hostname checks are applied. 498 | [ 499 | verify: :verify_none, 500 | server_name_indication: hostname, 501 | secure_renegotiate: true, 502 | reuse_sessions: true, 503 | versions: protocol_versions(), 504 | ciphers: preferred_ciphers() 505 | ] 506 | end 507 | end 508 | 509 | defp preferred_ciphers do 510 | preferred_ciphers = 511 | [ 512 | # Cipher suites (TLS 1.3): TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256 513 | %{cipher: :aes_128_gcm, key_exchange: :any, mac: :aead, prf: :sha256}, 514 | %{cipher: :aes_256_gcm, key_exchange: :any, mac: :aead, prf: :sha384}, 515 | %{cipher: :chacha20_poly1305, key_exchange: :any, mac: :aead, prf: :sha256}, 516 | 517 | # Cipher suites (TLS 1.2): ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256: 518 | # ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305: 519 | # ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384 520 | %{cipher: :aes_128_gcm, key_exchange: :ecdhe_ecdsa, mac: :aead, prf: :sha256}, 521 | %{cipher: :aes_128_gcm, key_exchange: :ecdhe_rsa, mac: :aead, prf: :sha256}, 522 | %{cipher: :aes_256_gcm, key_exchange: :ecdh_ecdsa, mac: :aead, prf: :sha384}, 523 | %{cipher: :aes_256_gcm, key_exchange: :ecdh_rsa, mac: :aead, prf: :sha384}, 524 | %{cipher: :chacha20_poly1305, key_exchange: :ecdhe_ecdsa, mac: :aead, prf: :sha256}, 525 | %{cipher: :chacha20_poly1305, key_exchange: :ecdhe_rsa, mac: :aead, prf: :sha256}, 526 | %{cipher: :aes_128_gcm, key_exchange: :dhe_rsa, mac: :aead, prf: :sha256}, 527 
| %{cipher: :aes_256_gcm, key_exchange: :dhe_rsa, mac: :aead, prf: :sha384} 528 | ] 529 | 530 | :ssl.filter_cipher_suites(preferred_ciphers, []) 531 | end 532 | 533 | defp protocol_versions do 534 | if otp_version() < 25 do 535 | [:"tlsv1.2"] 536 | else 537 | [:"tlsv1.2", :"tlsv1.3"] 538 | end 539 | end 540 | 541 | defp preferred_eccs do 542 | # TLS curves: X25519, prime256v1, secp384r1 543 | preferred_eccs = [:secp256r1, :secp384r1] 544 | :ssl.eccs() -- (:ssl.eccs() -- preferred_eccs) 545 | end 546 | 547 | defp secure_ssl? do 548 | case String.upcase(System.get_env(@unicode_unsafe_https, "TRUE")) do 549 | "FALSE" -> false 550 | "NIL" -> false 551 | _other -> true 552 | end 553 | end 554 | 555 | defp https_proxy(options) do 556 | options[:https_proxy] || 557 | Application.get_env(:unicode, :https_proxy) || 558 | System.get_env("HTTPS_PROXY") || 559 | System.get_env("https_proxy") 560 | end 561 | 562 | def otp_version do 563 | :erlang.system_info(:otp_release) |> List.to_integer 564 | end 565 | 566 | def data_path(filename) do 567 | priv_dir = :code.priv_dir(@app_name) |> to_string() 568 | Path.join(priv_dir, ["dictionaries/", filename]) 569 | end 570 | end 571 | 572 | -------------------------------------------------------------------------------- /lib/unicode/break.ex: -------------------------------------------------------------------------------- 1 | defmodule Unicode.String.Break do 2 | @moduledoc """ 3 | Implements the Unicode break algorithm for words 4 | and lines. 
5 | 6 | """ 7 | 8 | alias Unicode.String.Segment 9 | alias Unicode.String.Dictionary 10 | 11 | @dictionary_locales Dictionary.known_dictionary_locales() 12 | 13 | @break_map %{ 14 | grapheme: :grapheme_cluster_break, 15 | word: :word_break, 16 | sentence: :sentence_break, 17 | line: :line_break, 18 | graphemes: :grapheme_cluster_break, 19 | grapheme_cluster: :grapheme_cluster_break, 20 | words: :word_break, 21 | sentences: :sentence_break, 22 | lines: :line_break 23 | } 24 | 25 | @break_keys Map.keys(@break_map) 26 | 27 | @doc false 28 | def break(string, locale, break, options) when break in @break_keys do 29 | break_at(string, locale, Map.fetch!(@break_map, break), options) 30 | end 31 | 32 | @doc false 33 | def break_at("", _locale, _segment_type, _options) do 34 | {:no_break, {"", {"", ""}}} 35 | end 36 | 37 | def break_at(string, locale, segment_type, options) when is_binary(string) do 38 | break_at({"", string}, locale, segment_type, options) 39 | end 40 | 41 | def break_at({"", string_after}, _locale, _segment_type, _options) do 42 | {:break, {"", {"", string_after}}} 43 | end 44 | 45 | def break_at({string_before, string_after}, locale, segment_type, options) do 46 | suppress? = Keyword.get(options, :suppressions, true) 47 | {:ok, rules} = rules(locale, segment_type, suppress?) 
48 | 49 | {string_before, string_after} 50 | |> Segment.evaluate_rules(rules) 51 | end 52 | 53 | @doc false 54 | def split(string, locale, break, options) when break in @break_keys do 55 | case next(string, locale, break, options) do 56 | {fore, aft} -> 57 | [fore | split(aft, locale, break, options)] 58 | 59 | nil -> 60 | [] 61 | end 62 | end 63 | 64 | @doc false 65 | def next("", _locale, _break, _options) do 66 | nil 67 | end 68 | 69 | def next(string, locale, :word = break, options) when locale in @dictionary_locales do 70 | <> = string 71 | 72 | case next_at({<>, rest}, locale, :word, options) do 73 | {fore, {_match, rest}} -> 74 | {fore, rest} 75 | 76 | {fore, rest} -> 77 | {fore, rest} 78 | end 79 | |> repeat_if_trimming_required(locale, break, options, options[:trim]) 80 | end 81 | 82 | def next(string, locale, break, options) when break in @break_keys and is_binary(string) do 83 | <> = string 84 | 85 | case next_at({<>, rest}, locale, Map.fetch!(@break_map, break), options) do 86 | {fore, {match, rest}} -> 87 | {<> <> fore, match <> rest} 88 | 89 | {fore, rest} -> 90 | {<> <> fore, rest} 91 | end 92 | |> repeat_if_trimming_required(locale, break, options, options[:trim]) 93 | end 94 | 95 | defp repeat_if_trimming_required({match, rest}, locale, break, options, true) do 96 | if Unicode.Property.white_space?(match) do 97 | next(rest, locale, break, options) 98 | else 99 | {match, rest} 100 | end 101 | end 102 | 103 | defp repeat_if_trimming_required({match, rest}, _locale, _break, _options, _) do 104 | {match, rest} 105 | end 106 | 107 | defp next_at({string_before, ""}, locale, :word, _options) 108 | when locale in @dictionary_locales do 109 | {string_before, ""} 110 | end 111 | 112 | defp next_at({string_before, string_after}, locale, :word = break, options) 113 | when locale in @dictionary_locales do 114 | <> = string_after 115 | word = string_before <> <> 116 | 117 | case Dictionary.find_prefix(word, locale) do 118 | {:ok, _} -> 119 | next_at({word, 
rest}, locale, break, options) 120 | :prefix -> 121 | # If its a prefix then we keep going to see if we have a word 122 | # But if the next step doesn't produce either a prefix or 123 | # a word then it should be a break here 124 | case next_at({word, rest}, locale, break, options) do 125 | {fore, _aft} when fore == word -> 126 | {string_before, string_after} 127 | other -> 128 | other 129 | end 130 | :error -> 131 | {string_before, string_after} 132 | end 133 | end 134 | 135 | defp next_at({string_before, string_after}, locale, segment_type, options) do 136 | suppress? = Keyword.get(options, :suppressions, true) 137 | {:ok, rules} = rules(locale, segment_type, suppress?) 138 | 139 | {string_before, string_after} 140 | |> Segment.evaluate_rules(rules) 141 | |> do_next(rules, "") 142 | end 143 | 144 | defp do_next({:break, {_string_before, {"", ""}}}, _rules, acc) do 145 | {acc, ""} 146 | end 147 | 148 | defp do_next({:break, {_string_before, {fore, ""}}}, _rules, acc) do 149 | {acc, fore} 150 | end 151 | 152 | defp do_next({:break, {_string_before, rest}}, _rules, acc) do 153 | {acc, rest} 154 | end 155 | 156 | defp do_next({:no_break, {_string_before, {fore, ""}}}, _rules, acc) do 157 | {acc <> fore, ""} 158 | end 159 | 160 | # Previously we were doing {acc <> fore, aft} but more context 161 | # is needed for some rules so now its {string_before <> fore, aft} 162 | 163 | defp do_next({:no_break, {string_before, {fore, aft}}}, rules, acc) do 164 | {string_before <> fore, aft} 165 | |> Segment.evaluate_rules(rules) 166 | |> do_next(rules, acc <> fore) 167 | end 168 | 169 | # Recompile this module if any of the segment 170 | # files change. 171 | 172 | for {_locale, file} <- Segment.locale_map() do 173 | @external_resource Path.join(Segment.segments_dir(), file) 174 | end 175 | 176 | @suppression_rules %{ 177 | sentence_break: %{id: 10.5, value: "$Sp+ $Suppressions $Close* $Sp* ($ParaSep?) 
×"} 178 | } 179 | 180 | # Returns a list of rules applicable for 181 | # a given locale and segment type. 182 | defp rules(locale, segment_type) 183 | 184 | # Returns the variable definitions for 185 | # a given locale and segment type. 186 | @doc false 187 | def variables(locale, segment_type) 188 | 189 | # Returns a list of suppressions 190 | # (abbreviations) that can be used 191 | # to suppress an otherwise acceptable 192 | # break point. 193 | 194 | # Examples 195 | # 196 | # => Unicode.String.Break.variables "en", :sentence_break 197 | # [ 198 | # %{name: "$CR", value: "\\p{Sentence_Break=CR}"}, 199 | # %{name: "$LF", value: "\\p{Sentence_Break=LF}"}, 200 | # %{name: "$Extend", value: "\\p{Sentence_Break=Extend}"}, 201 | # %{name: "$Format", value: "\\p{Sentence_Break=Format}"}, 202 | # %{name: "$Sep", value: "\\p{Sentence_Break=Sep}"}, 203 | # %{name: "$Sp", value: "\\p{Sentence_Break=Sp}"}, 204 | # %{name: "$Lower", value: "\\p{Sentence_Break=Lower}"}, 205 | # ... 206 | # ] 207 | @doc false 208 | def suppressions(locale, segment_type) 209 | 210 | @doc false 211 | def suppressions_rule(locale, segment_type) 212 | 213 | for locale <- Segment.known_segmentation_locales() do 214 | {:ok, segments} = Segment.segments(locale) 215 | 216 | for segment_type <- Map.keys(segments) do 217 | defp rules(unquote(locale), unquote(segment_type)) do 218 | unquote(Macro.escape(Segment.rules(locale, segment_type))) 219 | end 220 | 221 | def variables(unquote(locale), unquote(segment_type)) do 222 | unquote(Macro.escape(get_in(segments, [segment_type, :variables]))) 223 | end 224 | 225 | def suppressions(unquote(locale), unquote(segment_type)) do 226 | unquote(Macro.escape(Segment.suppressions!(locale, segment_type))) 227 | end 228 | 229 | suppressions_rule = Map.get(@suppression_rules, segment_type) 230 | suppressions_variable = Segment.suppressions_variable(locale, segment_type) 231 | 232 | if suppressions_rule && suppressions_variable do 233 | variables = 234 | 
get_in(segments, [segment_type, :variables]) 235 | |> Segment.expand_variables([suppressions_variable]) 236 | 237 | rule = Segment.compile_rule(suppressions_rule, variables, [:caseless]) 238 | 239 | def suppressions_rule(unquote(locale), unquote(segment_type)) do 240 | unquote(Macro.escape(rule)) 241 | end 242 | end 243 | end 244 | end 245 | 246 | @default_locale :root 247 | 248 | defp rules(_other, segment_type) do 249 | rules(@default_locale, segment_type) 250 | end 251 | 252 | def suppressions_rule(_locale, _segment_type) do 253 | nil 254 | end 255 | 256 | @doc false 257 | def rules(locale, break_type, true) do 258 | if suppressions_rule = suppressions_rule(locale, break_type) do 259 | {:ok, rules} = rules(locale, break_type) 260 | {:ok, sort_rules([suppressions_rule | rules])} 261 | else 262 | rules(locale, break_type) 263 | end 264 | end 265 | 266 | def rules(locale, break_type, _) do 267 | rules(locale, break_type) 268 | end 269 | 270 | defp sort_rules(rules) do 271 | Enum.sort_by(rules, &elem(&1, 0)) 272 | end 273 | end 274 | -------------------------------------------------------------------------------- /lib/unicode/case/folding.ex: -------------------------------------------------------------------------------- 1 | defmodule Unicode.String.Case.Folding do 2 | @moduledoc """ 3 | Implements the Unicode Case Folding algorithm. 4 | 5 | The intention of case folding is to facilitate 6 | case-insensitive string comparisons. It is not 7 | intended to be a general purpose transformation. 8 | 9 | Although case folding does generally use lower 10 | case as its normal form, it is not true for 11 | all scripts and codepoints. Therefore case 12 | folding should not be used as an alternative 13 | to `String.downcase/1`. 14 | 15 | """ 16 | 17 | @turkic_languages [:tr, :az] 18 | @fold_status [:turkic, :common, :full] 19 | 20 | @doc """ 21 | Case fold a string. 22 | 23 | Returns a string after applying the Unicode 24 | Case Folding algorithm. 
25 | 26 | Case folding is intended to support case 27 | insensitive string comparisons such as that 28 | implemented by `Unicode.String.equals_ignoring_case?/2` which 29 | calls this function on its parameters. 30 | 31 | ### Arguments 32 | 33 | * `string` is any `String.t()` 34 | 35 | * `mode or language tag` is either the atoms `:turkic` or `nil` 36 | or a map that includes the key `:language` with a value that 37 | is a lowercase atom representing an [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) 38 | language code. The [CLDR language tag](https://hexdocs.pm/ex_cldr/Cldr.LanguageTag.html) which is defined 39 | as part of the [ex_cldr](https://hex.pm/packages/ex_cldr) is one 40 | such example. See [Cldr.validate_locale/2](https://hexdocs.pm/ex_cldr/Cldr.html#validate_locale/2) 41 | for further information. The default is `nil`. 42 | 43 | ### Returns 44 | 45 | * The case folded string 46 | 47 | ### Notes 48 | 49 | * No normalization is applied to the 50 | string on either input or output. 51 | 52 | * Case folding does not apply any transformation 53 | to accented characters. `"ü"` will not case fold 54 | to `"u"` for example. 
55 | 56 | ### Examples 57 | 58 | iex> Unicode.String.Case.Folding.fold("THIS") 59 | "this" 60 | 61 | iex> Unicode.String.Case.Folding.fold("grüßen") 62 | "grüssen" 63 | 64 | iex(13)> Unicode.String.Case.Folding.fold("I") 65 | "i" 66 | 67 | # Turkic languages such as Turkish and Azerbaijani have 68 | # a dotless lower case "i" 69 | iex> Unicode.String.Case.Folding.fold("I", :turkic) 70 | "ı" 71 | 72 | iex> Unicode.String.Case.Folding.fold("I", %{language: :az}) 73 | "ı" 74 | 75 | """ 76 | def fold(string) when is_binary(string) do 77 | fold(string, :full, nil) 78 | end 79 | 80 | def fold(string, %{language: language}) when language in @turkic_languages do 81 | fold(string, :full, :turkic) 82 | end 83 | 84 | def fold(string, language) when language in @turkic_languages do 85 | fold(string, :full, :turkic) 86 | end 87 | 88 | def fold(string, %{language: _language}) do 89 | fold(string, :full, nil) 90 | end 91 | 92 | def fold(string, :turkic) when is_binary(string) do 93 | fold(string, :full, :turkic) 94 | end 95 | 96 | def fold(string, _other) when is_binary(string) do 97 | fold(string, :full, nil) 98 | end 99 | 100 | for [status, from, to] <- Unicode.Utils.case_folding(), status in @fold_status do 101 | to = if is_list(to), do: List.to_string(to), else: List.to_string([to]) 102 | 103 | case status do 104 | :turkic -> 105 | defp fold(<>, _status, :turkic) do 106 | <> 107 | end 108 | 109 | :common -> 110 | defp fold(<>, status, mode) do 111 | <> 112 | end 113 | 114 | :full -> 115 | defp fold(<>, unquote(status), mode) do 116 | <> 117 | end 118 | end 119 | end 120 | 121 | defp fold(<>, status, mode) do 122 | <> 123 | end 124 | 125 | defp fold("", _, _) do 126 | "" 127 | end 128 | end 129 | -------------------------------------------------------------------------------- /lib/unicode/case/greek_upper.ex: -------------------------------------------------------------------------------- 1 | defmodule Unicode.String.Case.Mapping.Greek do 2 | @moduledoc """ 3 | Implements the 
special upper casing rules 4 | for the Greek language. 5 | 6 | """ 7 | 8 | @remove_accents Unicode.Regex.expand_regex( 9 | "[^[:ccc=Not_Reordered:][:ccc=Above:]]*?[\\u0313\\u0314\\u0301\\u0300\\u0306\\u0342\\u0308\\u0304]" 10 | ) 11 | @remove_iota Unicode.Regex.expand_regex("[^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*?[\\u0345]") 12 | 13 | @doc """ 14 | This implementation currently implements the `el-Upper` transform 15 | from CLDR. 16 | 17 | ### CLDR algorithm 18 | 19 | According to CLDR all accents on all characters are omitted when 20 | upcasing. 21 | 22 | Remove 0301 following Greek, with possible intervening 0308 marks. 23 | ::NFD(); 24 | For uppercasing (not titlecasing!) remove all greek accents from greek letters. 25 | This is done in two groups, to account for canonical ordering. 26 | [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ; 27 | [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ; 28 | ::NFC(); 29 | 30 | That transform basically says remove all accents except a 31 | subscripted iota. It doesn't handle diphthongs correctly. 32 | 33 | ### Mozilla algorithm 34 | 35 | Mozilla has a thread on a [bug report](https://bugzilla.mozilla.org/show_bug.cgi?id=307039) 36 | that: 37 | 38 | > Greek accented letters should be converted to the respective non-accented uppercase 39 | > letters. 
The required conversions are the following (in Unicode): 40 | > 41 | > ά -> Α 42 | > έ -> Ε 43 | > ή -> Η 44 | > ί -> Ι 45 | > ΐ -> Ϊ 46 | > ό -> Ο 47 | > ύ -> Υ 48 | > ΰ -> Ϋ 49 | > ώ -> Ω 50 | > 51 | > Also diphthongs (two-vowel constructs) should be converted as follows, when the 52 | > first vowel is accented: 53 | > 54 | > άι -> ΑΪ 55 | > έι -> ΕΪ 56 | > όι -> ΟΪ 57 | > ύι -> ΥΪ 58 | > άυ -> ΑΫ 59 | > έυ -> ΕΫ 60 | > ήυ -> ΗΫ 61 | > όυ -> ΟΫ 62 | 63 | That thread seems to align with current-day [Mozilla](https://developer.mozilla.org/en-US/docs/Web/CSS/text-transform) 64 | which says the rules are: 65 | 66 | > In Greek (el), vowels lose their accent when the whole word is in 67 | > uppercase (ά/Α), except for the disjunctive eta (ή/Ή). Also, diphthongs 68 | > with an accent on the first vowel lose the accent and gain a diaeresis 69 | > on the second vowel (άι/ΑΪ). 70 | 71 | """ 72 | def upcase(string) do 73 | string 74 | |> String.normalize(:nfd) 75 | |> String.replace(~r/#{@remove_accents}/u, "") 76 | |> String.replace(~r/#{@remove_iota}/u, "") 77 | |> String.normalize(:nfc) 78 | |> Unicode.String.Case.Mapping.upcase(:any) 79 | end 80 | end 81 | -------------------------------------------------------------------------------- /lib/unicode/case/mapping.ex: -------------------------------------------------------------------------------- 1 | defmodule Unicode.String.Case.Mapping do 2 | @moduledoc """ 3 | The [Unicode Case Mapping](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf) algorithm 4 | defines the process and data to transform text into upper case, lower case or title case. 5 | 6 | Since most languages are not bicameral, characters which have no appropriate mapping remain unchanged. 7 | 8 | Three case mapping functions are provided as a public API which have their implementations in this module: 9 | 10 | * `Unicode.String.upcase/2` which will convert text to upper case characters. 
11 | * `Unicode.String.downcase/2` which will convert text to lower case characters. 12 | * `Unicode.String.titlecase/2` which will convert text to title case. Title case means 13 | that the first character of each word is set to upper case and all other characters in 14 | the word are set to lower case. `Unicode.String.split/2` is used to split the string 15 | into words before title casing. 16 | 17 | Each function operates in a locale-aware manner implementing some basic capabilities: 18 | 19 | * Casing rules for the Turkish dotted capital `I` and dotless small `i`. 20 | * Casing rules for the retention of dots over `i` for Lithuanian letters with additional accents. 21 | * Titlecasing of IJ at the start of words in Dutch. 22 | * Removal of accents when upper casing letters in Greek. 23 | 24 | There are other casing rules that are not currently implemented such as: 25 | 26 | * Titlecasing of second or subsequent letters in words in orthographies that include 27 | caseless letters such as apostrophes. 28 | * Uppercasing of U+00DF `ß` latin small letter sharp `s` to U+1E9E `ẞ` latin capital letter 29 | sharp `s`. 
30 | 31 | ### Examples 32 | 33 | # Basic case transformation 34 | iex> Unicode.String.Case.Mapping.upcase("the quick brown fox") 35 | "THE QUICK BROWN FOX" 36 | 37 | # Dotted-I in Turkish and Azeri 38 | iex> Unicode.String.Case.Mapping.upcase("Diyarbakır", :tr) 39 | "DİYARBAKIR" 40 | 41 | # Upper case in Greek removes diacritics 42 | iex> Unicode.String.Case.Mapping.upcase("Πατάτα, Αέρας, Μυστήριο", :el) 43 | "ΠΑΤΑΤΑ, ΑΕΡΑΣ, ΜΥΣΤΗΡΙΟ" 44 | 45 | # Lower case Greek with a final sigma 46 | iex> Unicode.String.Case.Mapping.downcase("ὈΔΥΣΣΕΎΣ", :el) 47 | "ὀδυσσεύς" 48 | 49 | # Title case Dutch with leading dipthong 50 | iex> Unicode.String.Case.Mapping.titlecase("ijsselmeer", :nl) 51 | "IJsselmeer" 52 | 53 | """ 54 | 55 | alias Unicode.Utils 56 | 57 | @sigma 0x03A3 58 | @lower_sigma <<0x03C3::utf8>> 59 | @sigma_byte_size byte_size(<<@sigma::utf8>>) 60 | 61 | # See table Table 3-17 of https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf 62 | # for details of the contexts 63 | 64 | # These regexes can probably be converted to another form 65 | # which may further enable binary optimmization. 
66 | @final_sigma_before Unicode.Regex.expand_regex("\\p{cased}(\\p{Case_Ignorable})*") 67 | @final_sigma_after Unicode.Regex.expand_regex("(\\p{Case_Ignorable})*\\p{cased}") 68 | 69 | @after_soft_dotted Unicode.Regex.expand_regex("[\\p{Soft_Dotted}]([^\\p{ccc=230}\\p{ccc=0}])*") 70 | @more_above Unicode.Regex.expand_regex("[^\\p{ccc=230}\\p{ccc=0}]*[\\p{ccc=230}]") 71 | @before_dot Unicode.Regex.expand_regex("([^\\p{ccc=230}\\p{ccc=0}])*[\u0307]") 72 | @after_i Unicode.Regex.expand_regex("[I]([^\\p{ccc=230}\\p{ccc=0}])*") 73 | 74 | utf8_bytes_for_codepoint = fn codepoint -> 75 | byte_size(<>) 76 | end 77 | 78 | define_casing_function = fn 79 | casing, codepoint, replace, language, nil -> 80 | codepoint_bytes = utf8_bytes_for_codepoint.(codepoint) 81 | replacement = :unicode.characters_to_binary(replace) 82 | 83 | defp casing( 84 | string, 85 | <>, 86 | unquote(casing), 87 | unquote(language), 88 | bytes_so_far, 89 | acc 90 | ) do 91 | bytes_so_far = bytes_so_far + unquote(codepoint_bytes) 92 | 93 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [ 94 | unquote(replacement) | acc 95 | ]) 96 | end 97 | 98 | casing, codepoint, replace, language, "final_sigma" -> 99 | codepoint_bytes = utf8_bytes_for_codepoint.(codepoint) 100 | replacement = :unicode.characters_to_binary(replace) 101 | 102 | defp casing( 103 | string, 104 | <<@sigma::utf8, rest::binary>>, 105 | unquote(casing), 106 | unquote(language), 107 | bytes_so_far, 108 | acc 109 | ) do 110 | <> = string 111 | bytes_so_far = bytes_so_far + unquote(codepoint_bytes) 112 | 113 | if Regex.match?(~r/#{@final_sigma_before}/u, prior) && !Regex.match?(~r/#{@final_sigma_after}/u, rest) do 114 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [ 115 | unquote(replacement) | acc 116 | ]) 117 | else 118 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [ 119 | @lower_sigma | acc 120 | ]) 121 | end 122 | end 123 | 124 | casing, codepoint, replace, language, 
"not_before_dot" -> 125 | codepoint_bytes = utf8_bytes_for_codepoint.(codepoint) 126 | replacement = :unicode.characters_to_binary(replace) 127 | 128 | defp casing( 129 | string, 130 | <>, 131 | unquote(casing), 132 | unquote(language), 133 | bytes_so_far, 134 | acc 135 | ) do 136 | <> = string 137 | bytes_so_far = bytes_so_far + unquote(codepoint_bytes) 138 | 139 | if !Regex.match?(~r/#{@before_dot}/u, prior) do 140 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [ 141 | unquote(replacement) | acc 142 | ]) 143 | else 144 | this = 145 | casing( 146 | <>, 147 | <>, 148 | unquote(casing), 149 | :any, 150 | 0, 151 | acc 152 | ) 153 | 154 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [this | acc]) 155 | end 156 | end 157 | 158 | casing, codepoint, replace, language, "more_above" -> 159 | codepoint_bytes = utf8_bytes_for_codepoint.(codepoint) 160 | replacement = :unicode.characters_to_binary(replace) 161 | 162 | defp casing( 163 | string, 164 | <>, 165 | unquote(casing), 166 | unquote(language), 167 | bytes_so_far, 168 | acc 169 | ) do 170 | bytes_so_far = bytes_so_far + unquote(codepoint_bytes) 171 | 172 | if Regex.match?(~r/#{@more_above}/u, rest) do 173 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [ 174 | unquote(replacement) | acc 175 | ]) 176 | else 177 | this = 178 | casing( 179 | <>, 180 | <>, 181 | unquote(casing), 182 | :any, 183 | 0, 184 | acc 185 | ) 186 | 187 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [this | acc]) 188 | end 189 | end 190 | 191 | casing, codepoint, replace, language, "after_soft_dotted" -> 192 | codepoint_bytes = utf8_bytes_for_codepoint.(codepoint) 193 | replacement = :unicode.characters_to_binary(replace) 194 | 195 | defp casing( 196 | string, 197 | <>, 198 | unquote(casing), 199 | unquote(language), 200 | bytes_so_far, 201 | acc 202 | ) do 203 | <> = string 204 | bytes_so_far = bytes_so_far + unquote(codepoint_bytes) 205 | 206 | 
if Regex.match?(~r/#{@after_soft_dotted}/u, prior) do 207 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [ 208 | unquote(replacement) | acc 209 | ]) 210 | else 211 | this = 212 | casing( 213 | <>, 214 | <>, 215 | unquote(casing), 216 | :any, 217 | 0, 218 | acc 219 | ) 220 | 221 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [this | acc]) 222 | end 223 | end 224 | 225 | casing, codepoint, replace, language, "after_i" -> 226 | codepoint_bytes = utf8_bytes_for_codepoint.(codepoint) 227 | replacement = :unicode.characters_to_binary(replace) 228 | 229 | defp casing( 230 | string, 231 | <>, 232 | unquote(casing), 233 | unquote(language), 234 | bytes_so_far, 235 | acc 236 | ) do 237 | <> = string 238 | bytes_so_far = bytes_so_far + unquote(codepoint_bytes) 239 | 240 | if Regex.match?(~r/#{@after_i}/u, prior) do 241 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [ 242 | unquote(replacement) | acc 243 | ]) 244 | else 245 | this = 246 | casing( 247 | <>, 248 | <>, 249 | unquote(casing), 250 | :any, 251 | 0, 252 | acc 253 | ) 254 | 255 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [this | acc]) 256 | end 257 | end 258 | end 259 | 260 | @doc """ 261 | Replace lower case characters with their 262 | uppercase equivalents. 263 | 264 | Lower case characters are replaced with their 265 | upper case equivalents. All other characters 266 | remain unchanged. 267 | 268 | For the Greek language (`:el`), all accents are 269 | removed prior to capitalization as is the normal 270 | practise for this language. 
271 | 272 | """ 273 | def upcase(string, language \\ :any) 274 | 275 | def upcase(string, :el) do 276 | Unicode.String.Case.Mapping.Greek.upcase(string) 277 | end 278 | 279 | def upcase(string, language) when is_atom(language) do 280 | casing(string, string, :upcase, language, 0, []) 281 | end 282 | 283 | @doc """ 284 | Replace upper case characters with their 285 | lower case equivalents. 286 | 287 | """ 288 | def downcase(string, language \\ :any) 289 | 290 | def downcase(string, language) when is_atom(language) do 291 | casing(string, string, :downcase, language, 0, []) 292 | end 293 | 294 | @doc """ 295 | Apply the Unicode title case algorithm. 296 | 297 | """ 298 | def titlecase(string, language \\ :any) 299 | 300 | def titlecase(<>, :nl) 301 | when i in [?i, ?I] and j in [?j, ?J] do 302 | "IJ" <> casing(rest, rest, :downcase, :any, 0, []) 303 | end 304 | 305 | def titlecase(<>, language) when is_atom(language) do 306 | casing(<>, <>, :titlecase, language, 0, []) <> 307 | downcase(rest, language) 308 | end 309 | 310 | # These next four function clauses optimize for ASCII characters. 
311 | # We need to omit the `i` from all ranges since in Turkish and Azeri 312 | # they upcase to a dotted-capital-I 313 | 314 | defp casing( 315 | string, 316 | <>, 317 | :downcase = casing, 318 | language, 319 | bytes_so_far, 320 | acc 321 | ) 322 | when byte >= ?A and byte <= ?Z and byte != ?I do 323 | casing(string, rest, casing, language, bytes_so_far + 1, [byte + 32 | acc]) 324 | end 325 | 326 | defp casing(string, <>, casing, language, bytes_so_far, acc) 327 | when casing in [:upcase, :titlecase] and byte >= ?a and byte <= ?z and byte != ?i do 328 | casing(string, rest, casing, language, bytes_so_far + 1, [byte - 32 | acc]) 329 | end 330 | 331 | defp casing(string, <>, casing, language, bytes_so_far, acc) 332 | when casing in [:upcase, :titlecase] and byte != ?i and byte <= ?~ do 333 | casing(string, rest, casing, language, bytes_so_far + 1, [byte | acc]) 334 | end 335 | 336 | defp casing( 337 | string, 338 | <>, 339 | :downcase = casing, 340 | language, 341 | bytes_so_far, 342 | acc 343 | ) 344 | when byte != ?I and byte <= ?~ do 345 | casing(string, rest, casing, language, bytes_so_far + 1, [byte | acc]) 346 | end 347 | 348 | # Generate the mapping functions 349 | 350 | for %{codepoint: codepoint, upper: upper} = casing <- Utils.casing_in_order(), 351 | upper && upper != codepoint && (codepoint == ?i or codepoint > ?~) do 352 | %{context: context, language: language} = casing 353 | 354 | define_casing_function.(:upcase, codepoint, upper, language, context) 355 | end 356 | 357 | for %{codepoint: codepoint, lower: lower} = casing <- Utils.casing_in_order(), 358 | lower && lower != codepoint && (codepoint == ?I or codepoint > ?~) do 359 | %{language: language, context: context} = casing 360 | 361 | # Special casing for capital sigma with no context. 362 | # see the default implementations of casing/5 at the 363 | # end of this file. Don't generate a function clause for 364 | # this codepoint here. 
365 | unless codepoint == @sigma and is_nil(context) do 366 | define_casing_function.(:downcase, codepoint, lower, language, context) 367 | end 368 | end 369 | 370 | for %{codepoint: codepoint, title: title} = casing <- Utils.casing_in_order(), 371 | title && title != codepoint && codepoint > ?~ do 372 | %{context: context, language: language} = casing 373 | 374 | define_casing_function.(:titlecase, codepoint, title, language, context) 375 | end 376 | 377 | # End of string, return accumulator 378 | defp casing(_string, "", _casing, _language, _bytes_so_far, acc) do 379 | acc 380 | |> :lists.reverse() 381 | |> IO.iodata_to_binary() 382 | end 383 | 384 | # Special case for Greek sigma when no context. This is the only codepoint 385 | # that has two cases for the language :any. One case with "final_sigma" context 386 | # and one with no context. This means we can't generate two distinct function 387 | # clauses for casing/5 so we define a special one here for the "no context" 388 | # version and generate the one with the context in the normal flow. 389 | defp casing( 390 | string, 391 | <<@sigma::utf8, rest::binary>>, 392 | :downcase = casing, 393 | :any = language, 394 | bytes_so_far, 395 | acc 396 | ) do 397 | bytes_so_far = bytes_so_far + @sigma_byte_size 398 | 399 | casing(string, rest, casing, language, bytes_so_far, [@lower_sigma | acc]) 400 | end 401 | 402 | # Pass the character through since there is no casing data. 
403 | # Optimize for ASCII bytes (byte value is less than 127) 404 | defp casing(string, <>, casing, :any = language, bytes_so_far, acc) 405 | when byte <= ?~ do 406 | bytes_so_far = bytes_so_far + 1 407 | 408 | casing(string, rest, casing, language, bytes_so_far, [byte | acc]) 409 | end 410 | 411 | defp casing(string, <>, casing, :any = language, bytes_so_far, acc) do 412 | next = <> 413 | bytes_so_far = bytes_so_far + byte_size(next) 414 | 415 | casing(string, rest, casing, language, bytes_so_far, [next | acc]) 416 | end 417 | 418 | # If the language version has no casing, use the default casing by 419 | # using the :any language. 420 | defp casing(string, rest, casing, _language, bytes_so_far, acc) do 421 | casing(string, rest, casing, :any, bytes_so_far, acc) 422 | end 423 | 424 | @doc false 425 | def unknown_locale_error(locale) do 426 | "Unknown locale #{inspect(locale)}" 427 | end 428 | end 429 | -------------------------------------------------------------------------------- /lib/unicode/dictionary.ex: -------------------------------------------------------------------------------- 1 | defmodule Unicode.String.Dictionary do 2 | @moduledoc """ 3 | Implements basic dictionary functions for dictionary-based 4 | word break. 5 | 6 | This implementation supports dictionary-based word breaking for: 7 | 8 | * Chinese (`zh`, `zh-Hant`, `zh-Hant-HK`, `yue`, `yue-Hant`, `yue-Hans`) locales, 9 | * Japanese (`ja`) using the same dictionary as for Chinese, 10 | * Thai (`th`), 11 | * Lao (`lo`), 12 | * Khmer (`km`) and 13 | * Burmese (`my`). 14 | 15 | The dictionaries implemented are those used in the [CLDR](https://cldr.unicode.org) since 16 | they are under an open source license and also for consistency with 17 | [ICU](https://icu.unicode.org). 18 | 19 | Note that these dictionaries need to be downloaded with 20 | `mix unicode.string.download.dictionaries` prior to use. 
Each dictionary 21 | will be parsed and loaded into [persistent_term](https://www.erlang.org/doc/man/persistent_term) 22 | on demand. Note that each dictionary has a sizable memory footprint as measured 23 | by `:persistent_term.info/0`: 24 | 25 | | Dictionary | Memory Mb | 26 | | ----------- | ----------: | 27 | | Chinese | 104.8 | 28 | | Thai | 9.6 | 29 | | Lao | 11.4 | 30 | | Khmer | 38.8 | 31 | | Burmese | 23.1 | 32 | 33 | """ 34 | 35 | alias Unicode.String.Trie 36 | 37 | @app_name :unicode_string 38 | @dictionary_dir "dictionaries/" 39 | 40 | @dictionary_locales [ 41 | :zh, :th, :lo, :my, :km, :ja, :"zh-Hant", :"zh-Hant-HK", :yue, :"yue-Hant", :"yue-Hans" 42 | ] 43 | 44 | @doc """ 45 | Returns the locales that have a dictionary supporting 46 | word breaking. 47 | 48 | """ 49 | def known_dictionary_locales do 50 | @dictionary_locales 51 | end 52 | 53 | @doc false 54 | def ensure_dictionary_loaded_if_available(locale) when locale in @dictionary_locales do 55 | require Logger 56 | 57 | with {:ok, locale} <- dictionary_locale(locale) do 58 | status = 59 | if dictionary = dictionary(locale) do 60 | {:ok, dictionary} 61 | else 62 | load(locale) 63 | end 64 | 65 | case status do 66 | {:ok, dictionary} -> 67 | {:ok, dictionary} 68 | 69 | _other -> 70 | message = "No dictionary for #{locale} found. Have you run `mix unicode.string.download.dictionaries`?"
71 | Logger.debug(message) 72 | {:error, message} 73 | end 74 | end 75 | end 76 | 77 | def ensure_dictionary_loaded_if_available(locale) do 78 | {:ok, "No dictionary for #{inspect locale} found"} 79 | end 80 | 81 | @doc false 82 | def load(locale) do 83 | with {:ok, locale} <- dictionary_locale(locale) do 84 | load_dictionary(locale) 85 | end 86 | end 87 | 88 | @doc false 89 | def is_loaded(locale) do 90 | with {:ok, locale} <- dictionary_locale(locale) do 91 | :persistent_term.get({@app_name, locale}, false) && true 92 | else 93 | _other -> false 94 | end 95 | end 96 | 97 | @doc false 98 | def dictionary(locale) when locale in @dictionary_locales do 99 | :persistent_term.get({@app_name, locale}, nil) 100 | end 101 | 102 | @doc false 103 | def has_key(string, locale) do 104 | with {:ok, locale} <- dictionary_locale(locale) do 105 | dictionary = :persistent_term.get({@app_name, locale}) 106 | Trie.has_key(string, dictionary) 107 | end 108 | end 109 | 110 | @doc false 111 | def find_prefix(string, locale) do 112 | with {:ok, locale} <- dictionary_locale(locale) do 113 | dictionary = :persistent_term.get({@app_name, locale}) 114 | Trie.find_prefix(string, dictionary) 115 | end 116 | end 117 | 118 | @doc false 119 | @dialyzer {:nowarn_function, load_dictionary: 1} 120 | defp load_dictionary(:zh), do: load_dictionary(:zh, "chinese_japanese.txt") 121 | defp load_dictionary(:ja), do: load_dictionary(:zh) 122 | defp load_dictionary(:lo), do: load_dictionary(:lo, "lao.txt") 123 | defp load_dictionary(:th), do: load_dictionary(:th, "thai.txt") 124 | defp load_dictionary(:my), do: load_dictionary(:my, "burmese.txt") 125 | defp load_dictionary(:km), do: load_dictionary(:km, "khmer.txt") 126 | 127 | @comment_marker ["#", " #", " #", "\uFEFF #"] 128 | 129 | defp load_dictionary(locale, file_name) do 130 | require Logger 131 | 132 | trie = 133 | file_name 134 | |> read_dictionary() 135 | |> String.split("\n") 136 | |> Enum.reject(&String.starts_with?(&1, @comment_marker)) 137 | 
|> Enum.reject(&(String.length(&1) == 0)) 138 | |> Enum.map(fn line -> 139 | case String.split(line, "\t") do 140 | [word] -> word 141 | [word, value] -> {word, String.to_integer(value)} 142 | end 143 | end) 144 | |> Trie.new() 145 | 146 | :ok = :persistent_term.put({@app_name, locale}, trie) 147 | trie = :persistent_term.get({@app_name, locale}) 148 | 149 | # Logger.debug("[unicode_string] Loaded word break dictionary for locale #{inspect locale}") 150 | {:ok, trie} 151 | end 152 | 153 | defp read_dictionary(file_name) do 154 | priv_dir = :code.priv_dir(@app_name) |> to_string 155 | path = Path.join(priv_dir, [@dictionary_dir, file_name]) 156 | File.read!(path) 157 | end 158 | 159 | @doc false 160 | def dictionary_locale(:zh), do: {:ok, :zh} 161 | def dictionary_locale(:"zh-Hant"), do: {:ok, :zh} 162 | def dictionary_locale(:"zh-Hant-HK"), do: {:ok, :zh} 163 | def dictionary_locale(:yue), do: {:ok, :zh} 164 | def dictionary_locale(:"yue-Hant"), do: {:ok, :zh} 165 | def dictionary_locale(:"yue-Hans"), do: {:ok, :zh} 166 | 167 | def dictionary_locale(:lo), do: {:ok, :lo} 168 | def dictionary_locale(:my), do: {:ok, :my} 169 | def dictionary_locale(:th), do: {:ok, :th} 170 | def dictionary_locale(:km), do: {:ok, :km} 171 | def dictionary_locale(:ja), do: {:ok, :zh} 172 | def dictionary_locale(%{language: language}), do: dictionary_locale(language) 173 | def dictionary_locale(language), do: {:error, "No dictionary for #{inspect language} found."} 174 | 175 | end -------------------------------------------------------------------------------- /lib/unicode/segment.ex: -------------------------------------------------------------------------------- 1 | defmodule Unicode.String.Segment do 2 | @moduledoc """ 3 | Implements the compilation of the Unicode 4 | segment rules. 
5 | 6 | """ 7 | 8 | import SweetXml 9 | require Unicode.Set 10 | 11 | @root_locale "root" 12 | @suppressions_variable "$Suppressions" 13 | 14 | # This is the formal definition but it takes a while to compile 15 | # and all of the known variable names are in the Latin-1 set 16 | # defguard is_id_start(char) when Unicode.Set.match?(char, "\\p{ID_start}") 17 | # defguard is_id_continue(char) when Unicode.Set.match?(char, "\\p{ID_continue}") 18 | 19 | @doc "Identifies if a codepoint is a valid start of an identifier" 20 | defguard is_id_start(char) 21 | when char in ?A..?Z 22 | 23 | @doc "Identifies if a codepoint is a valid identifier character" 24 | defguard is_id_continue(char) 25 | when char in ?a..?z or char in ?A..?Z or char in ?0..?9 or char == ?_ 26 | 27 | @doc """ 28 | Return the rules as defined by CLDR for a given 29 | locale and break type. 30 | 31 | """ 32 | def rules(locale, segment_type, additional_variables \\ []) do 33 | with {:ok, segment} <- segments(locale, segment_type) do 34 | variables = Map.fetch!(segment, :variables) |> expand_variables(additional_variables) 35 | rules = Map.fetch!(segment, :rules) 36 | 37 | rules 38 | |> compile_rules(variables, []) 39 | |> wrap(:ok) 40 | end 41 | end 42 | 43 | @doc """ 44 | Return the rules as defined by CLDR for a given 45 | locale and break type and raises on error. 46 | 47 | """ 48 | def rules!(locale, segment_type, additional_variables \\ []) do 49 | case rules(locale, segment_type, additional_variables) do 50 | {:ok, rules} -> rules 51 | {:error, reason} -> raise ArgumentError, reason 52 | end 53 | end 54 | 55 | def compile_rules(rules, variables, regex_options) when is_list(rules) do 56 | rules 57 | |> expand_rules(variables) 58 | |> compile_rules(regex_options) 59 | end 60 | 61 | # These options set unicode mode. 
Interpret certain 62 | # codes like \B and \w in the unicode space, ignore 63 | # unescaped whitespace in regexs 64 | @regex_options [:unicode, :extended, :ucp, :dollar_endonly, :dotall, :bsr_unicode] 65 | @rule_splitter "[×÷]" 66 | 67 | defp compile_rules(rules, regex_options) do 68 | Enum.map(rules, fn {sequence, rule} -> 69 | [left, operator, right] = Regex.split(~r/#{@rule_splitter}/u, rule, include_captures: true) 70 | operator = if operator == "×", do: :no_break, else: :break 71 | 72 | left = if left != "", do: left <> "$", else: left 73 | right = if right != "", do: "^" <> right, else: right 74 | 75 | {sequence, 76 | {operator, expand_regex(left, regex_options), expand_regex(right, regex_options)}} 77 | end) 78 | end 79 | 80 | @doc """ 81 | Compiles a segment rule in the context of a list 82 | of variables. 83 | 84 | The compile rule can then be inserted into a 85 | rule set. 86 | 87 | """ 88 | def compile_rule(rule, variables, regex_options \\ []) when is_map(rule) do 89 | compile_rules([rule], variables, regex_options) 90 | |> hd 91 | end 92 | 93 | @doc false 94 | def suppressions_variable(locale, segment_type) do 95 | variable = 96 | locale 97 | |> suppressions!(segment_type) 98 | |> suppressions_regex() 99 | 100 | if variable do 101 | %{name: @suppressions_variable, value: variable} 102 | else 103 | nil 104 | end 105 | end 106 | 107 | defp suppressions_regex([]) do 108 | nil 109 | end 110 | 111 | defp suppressions_regex(suppressions) do 112 | suppression_regex = Enum.map_join(suppressions, "|", &String.replace(&1, ".", "\\.")) 113 | 114 | "(" <> suppression_regex <> ")" 115 | end 116 | 117 | @doc """ 118 | Returns a list of the suppressions for a given 119 | locale and segment type. 
120 | 121 | """ 122 | def suppressions(locale, segment_type) do 123 | with {:ok, segment} <- segments(locale, segment_type) do 124 | {:ok, Map.get(segment, :suppressions, [])} 125 | end 126 | end 127 | 128 | @doc """ 129 | Returns a list of the suppressions for a given 130 | locale and segment type and raises on error. 131 | 132 | """ 133 | def suppressions!(locale, segment_type) do 134 | case suppressions(locale, segment_type) do 135 | {:ok, suppressions} -> suppressions 136 | {:error, reason} -> raise ArgumentError, reason 137 | end 138 | end 139 | 140 | defp expand_regex("", _regex_options) do 141 | :any 142 | end 143 | 144 | # Delete spaces because PCRE doesn't ignore them 145 | 146 | defp expand_regex(string, regex_options) do 147 | string 148 | |> String.trim() 149 | |> String.replace(" ", "") 150 | |> Unicode.Regex.expand_regex(@regex_options ++ regex_options) 151 | end 152 | 153 | @doc """ 154 | Evaluates a list of rules against a given 155 | string. 156 | 157 | """ 158 | def evaluate_rules(string, rules) when is_binary(string) do 159 | evaluate_rules({"", string}, rules) 160 | end 161 | 162 | def evaluate_rules({string_before, string_after}, rules) do 163 | Enum.reduce_while(rules, [], fn rule, _acc -> 164 | {_rule_number, {operator, _fore, _aft}} = rule 165 | 166 | case evaluate_rule({string_before, string_after}, rule) do 167 | {:pass, result} -> 168 | {:halt, {:pass, operator, result}} 169 | 170 | {:fail, string} -> 171 | {:cont, {:fail, string}} 172 | end 173 | end) 174 | |> return_break_or_no_break 175 | end 176 | 177 | # The final implicit rule is to to break. 
ie: :any ÷ :any 178 | defp return_break_or_no_break({:fail, {before_string, ""}}) do 179 | {:break, {before_string, {"", ""}}} 180 | end 181 | 182 | defp return_break_or_no_break({:fail, {before_string, after_string}}) do 183 | <> = after_string 184 | {:break, {before_string, {<>, rest}}} 185 | end 186 | 187 | defp return_break_or_no_break({:pass, operator, result}) do 188 | {operator, result} 189 | end 190 | 191 | @split_options [parts: 2, include_captures: true, trim: true] 192 | 193 | # Process an `:any op regex` rule at end of string 194 | defp evaluate_rule({string_before, <<_::utf8>> = string_after}, {_seq, {_operator, :any, {aft, regex_options}}}) do 195 | aft = Regex.compile!(aft, regex_options) 196 | 197 | if Regex.match?(aft, string_after) do 198 | {:pass, {string_before, {string_after, ""}}} 199 | else 200 | {:fail, {string_before, string_after}} 201 | end 202 | end 203 | 204 | defp evaluate_rule({string_before, string_after}, {_seq, {_operator, :any, {aft, regex_options}}}) do 205 | aft = Regex.compile!(aft, regex_options) 206 | 207 | case Regex.split(aft, string_after, @split_options) do 208 | [match, rest] -> {:pass, {string_before, {match, rest}}} 209 | _other -> {:fail, {string_before, string_after}} 210 | end 211 | end 212 | 213 | # Ignore suppressions at end of the string 214 | defp evaluate_rule({string_before, string_after}, {10.5, {_operator, {fore, regex_options}, :any}}) do 215 | fore = Regex.compile!(fore, regex_options) 216 | 217 | if Regex.match?(fore, string_before) do 218 | # IO.inspect {string_before, string_after}, label: "Matched Rule 10.5" 219 | case Regex.split(fore, string_before, @split_options) do 220 | [match] -> 221 | # IO.inspect {operator, match}, label: "Matched One" 222 | {:pass, {string_before, {match, ""}}} 223 | 224 | [match, rest] -> 225 | # IO.inspect {operator, match, rest}, label: "Matched" 226 | {:pass, {string_before, {match, rest}}} 227 | end 228 | else 229 | # IO.inspect {string_before, string_after}, label: "Did 
not match Rule 10.5" 230 | {:fail, {string_before, string_after}} 231 | end 232 | end 233 | 234 | # :any matches end of string 235 | defp evaluate_rule({string_before, "" = string_after}, {_seq, {_operator, {fore, regex_options}, :any}}) do 236 | fore = Regex.compile!(fore, regex_options) 237 | 238 | if Regex.match?(fore, string_before) do 239 | {:pass, {string_before, {"", ""}}} 240 | else 241 | {:fail, {string_before, string_after}} 242 | end 243 | end 244 | 245 | defp evaluate_rule({string_before, string_after}, {_seq, {_operator, {fore, regex_options}, :any}}) do 246 | fore = Regex.compile!(fore, regex_options) 247 | 248 | if Regex.match?(fore, string_before) do 249 | <> = string_after 250 | {:pass, {string_before, {<>, rest}}} 251 | else 252 | {:fail, {string_before, string_after}} 253 | end 254 | end 255 | 256 | defp evaluate_rule({string_before, string_after}, {_seq, {_operator, {fore, fore_regex_options}, {aft, aft_regex_options}}}) do 257 | fore = Regex.compile!(fore, fore_regex_options) 258 | aft = Regex.compile!(aft, aft_regex_options) 259 | 260 | if Regex.match?(fore, string_before) && Regex.match?(aft, string_after) do 261 | case Regex.split(aft, string_after, @split_options) do 262 | [match, rest] -> {:pass, {string_before, {match, rest}}} 263 | [match] -> {:pass, {string_before, {match, ""}}} 264 | end 265 | else 266 | {:fail, {string_before, string_after}} 267 | end 268 | end 269 | 270 | defp expand_rules(rules, variables) do 271 | Enum.reduce(rules, [], fn %{id: sequence, value: rule}, acc -> 272 | rule = 273 | rule 274 | |> String.trim() 275 | |> substitute_variables(variables) 276 | 277 | [{sequence, rule} | acc] 278 | end) 279 | |> Enum.sort() 280 | end 281 | 282 | def expand_variables(variables, additional_variables) 283 | when is_list(variables) and is_list(additional_variables) do 284 | Enum.reduce(variables ++ additional_variables, %{}, fn 285 | %{name: <<"$", name::binary>>, value: value}, variables -> 286 | new_value = 
substitute_variables(value, variables) 287 | Map.put(variables, name, new_value) 288 | end) 289 | end 290 | 291 | defp substitute_variables("", _variables) do 292 | "" 293 | end 294 | 295 | defp substitute_variables(<<"$", char::utf8, rest::binary>>, variables) 296 | when is_id_start(char) do 297 | {name, rest} = extract_variable_name(<> <> rest) 298 | Map.fetch!(variables, name) <> substitute_variables(rest, variables) 299 | end 300 | 301 | defp substitute_variables(<>, variables) do 302 | char <> substitute_variables(rest, variables) 303 | end 304 | 305 | defp extract_variable_name("" = string) do 306 | {string, ""} 307 | end 308 | 309 | defp extract_variable_name(<>) 310 | when is_id_continue(char) do 311 | {string, rest} = extract_variable_name(rest) 312 | {<> <> string, rest} 313 | end 314 | 315 | defp extract_variable_name(rest) do 316 | {"", rest} 317 | end 318 | 319 | @app_name Mix.Project.config()[:app] 320 | 321 | @doctype "" 322 | 323 | @doc false 324 | def segments_dir do 325 | Path.join(:code.priv_dir(@app_name), "/segments") 326 | end 327 | 328 | @doc false 329 | def locale_map do 330 | segments_dir() 331 | |> File.ls!() 332 | |> Enum.map(fn locale_file -> 333 | locale = 334 | locale_file 335 | |> String.split(".xml") 336 | |> hd 337 | |> String.replace("_", "-") 338 | 339 | {locale, locale_file} 340 | end) 341 | |> Map.new() 342 | end 343 | 344 | @doc """ 345 | Returns a list of the known locales that have 346 | segmentation data. 347 | 348 | """ 349 | def known_segmentation_locales do 350 | locale_map() 351 | |> Map.keys() 352 | |> Enum.map(&String.to_atom/1) 353 | end 354 | 355 | @doc """ 356 | Returns a list of the ancestor locales 357 | of the a given locale. 358 | 359 | The list includes the given locale. 
360 | 361 | """ 362 | 363 | def ancestors(locale_name) do 364 | if Map.get(locale_map(), locale_name) do 365 | case String.split(locale_name, "-") do 366 | [locale] -> [locale, @root_locale] 367 | [locale, _territory] -> [locale_name, locale, @root_locale] 368 | [locale, script, _territory] -> [locale_name, "#{locale}-#{script}", locale, @root_locale] 369 | end 370 | |> wrap(:ok) 371 | else 372 | {:error, unknown_locale_error(locale_name)} 373 | end 374 | end 375 | 376 | @doc false 377 | def merge_ancestors(@root_locale) do 378 | raw_segments!(@root_locale) 379 | |> wrap(:ok) 380 | end 381 | 382 | def merge_ancestors(locale) when is_binary(locale) do 383 | with {:ok, ancestors} <- ancestors(locale) do 384 | merge_ancestors(ancestors) 385 | |> wrap(:ok) 386 | end 387 | end 388 | 389 | @doc false 390 | def merge_ancestors([locale, root]) do 391 | merge_ancestor(locale, raw_segments!(root)) 392 | end 393 | 394 | def merge_ancestors([locale | rest]) do 395 | merge_ancestor(locale, merge_ancestors(rest)) 396 | end 397 | 398 | # For each segment type, add the variables, rules and 399 | # suppressions from locale to other 400 | defp merge_ancestor(locale, other) do 401 | locale_segments = raw_segments!(locale) 402 | 403 | Enum.map(other, fn {segment_type, content} -> 404 | variables = 405 | Map.fetch!(content, :variables) ++ 406 | (get_in(locale_segments, [segment_type, :variables]) || []) 407 | 408 | rules = 409 | Map.fetch!(content, :rules) ++ 410 | (get_in(locale_segments, [segment_type, :rules]) || []) 411 | 412 | suppressions = 413 | Map.fetch!(content, :suppressions) ++ 414 | (get_in(locale_segments, [segment_type, :suppressions]) || []) 415 | 416 | {segment_type, %{content | variables: variables, rules: rules, suppressions: suppressions}} 417 | end) 418 | |> Map.new() 419 | end 420 | 421 | defp raw_segments(locale) do 422 | if file = Map.get(locale_map(), locale) do 423 | content = 424 | segments_dir() 425 | |> Path.join(file) 426 | |> File.read!() 427 | |> 
String.replace(@doctype, "") 428 | |> xpath(~x"//segmentation"l, 429 | type: ~x"./@type"s, 430 | variables: [ 431 | ~x".//variable"l, 432 | name: ~x"./@id"s, 433 | value: ~x"./text()"s 434 | ], 435 | rules: [ 436 | ~x".//rule"l, 437 | id: ~x"./@id"f, 438 | value: ~x"./text()"s 439 | ], 440 | suppressions: ~x".//suppression/text()"ls 441 | ) 442 | 443 | Enum.map(content, fn c -> 444 | type = 445 | c.type 446 | |> Macro.underscore() 447 | |> String.replace("__", "_") 448 | |> String.to_atom() 449 | 450 | {type, %{rules: c.rules, variables: c.variables, suppressions: c.suppressions}} 451 | end) 452 | |> Map.new() 453 | |> wrap(:ok) 454 | else 455 | {:error, unknown_locale_error(locale)} 456 | end 457 | end 458 | 459 | defp raw_segments!(locale) do 460 | case raw_segments(locale) do 461 | {:ok, segments} -> segments 462 | {:error, reason} -> raise ArgumentError, reason 463 | end 464 | end 465 | 466 | @doc false 467 | def segments(locale) when is_binary(locale) do 468 | merge_ancestors(locale) 469 | end 470 | 471 | def segments(locale) when is_atom(locale) do 472 | locale 473 | |> Atom.to_string() 474 | |> segments() 475 | end 476 | 477 | @doc false 478 | def segments(locale, segment_type) do 479 | with {:ok, segments} <- segments(to_string(locale)) do 480 | if segment = Map.get(segments, segment_type) do 481 | {:ok, segment} 482 | else 483 | {:error, unknown_segment_type_error(segment_type)} 484 | end 485 | end 486 | end 487 | 488 | defp wrap(term, atom) do 489 | {atom, term} 490 | end 491 | 492 | @doc false 493 | def unknown_locale_error(locale) do 494 | "Unknown locale #{inspect(locale)}" 495 | end 496 | 497 | @doc false 498 | def unknown_segment_type_error(segment_type) do 499 | "Unknown segment type #{inspect(segment_type)}" 500 | end 501 | end 502 | -------------------------------------------------------------------------------- /lib/unicode/string.ex: -------------------------------------------------------------------------------- 1 | defmodule Unicode.String do 
2 | @moduledoc """ 3 | This module provides functions that implement some 4 | of the [Unicode](https://unicode.org) standards: 5 | 6 | * The [Unicode Case Mapping](https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf) algorithm 7 | to provide mapping to upper, lower and title case text. 8 | 9 | * The [Unicode Case Folding](https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf) algorithm 10 | to provide case-independent equality checking irrespective of language or script. 11 | 12 | * The [Unicode Segmentation](https://unicode.org/reports/tr29/) algorithm to detect, 13 | break or split strings into grapheme clusters, words and sentences. 14 | 15 | * The [Unicode Line Breaking](https://www.unicode.org/reports/tr14/) algorithm to determine 16 | line break placement to support word-wrapping. 17 | 18 | """ 19 | 20 | alias Unicode.Property 21 | alias Unicode.String.Break 22 | alias Unicode.String.Segment 23 | alias Unicode.String.Case 24 | alias Unicode.String.Dictionary 25 | 26 | defdelegate fold(string), to: Unicode.String.Case.Folding 27 | defdelegate fold(string, type), to: Unicode.String.Case.Folding 28 | 29 | defguard is_language(language) when (byte_size(language) == 2 or byte_size(language) == 3) 30 | defguard is_script(script) when byte_size(script) == 4 31 | defguard is_territory(territory) when byte_size(territory) == 2 32 | 33 | @type string_interval :: {String.t(), String.t()} 34 | @type break_type :: :grapheme | :word | :line | :sentence 35 | @type error_return :: {:error, String.t()} 36 | 37 | @type option :: {:locale, String.t() | map} 38 | | {:break, break_type} 39 | | {:suppressions, boolean} 40 | 41 | 42 | @type split_option :: {:locale, String.t() | map} 43 | | {:break, break_type} 44 | | {:suppressions, boolean} 45 | | {:trim, boolean} 46 | 47 | @type break_or_no_break :: :break | :no_break 48 | 49 | @type break_match :: 50 | {break_or_no_break, {String.t(), {String.t(), String.t()}}} 51 | | {break_or_no_break, {String.t(), String.t()}} 52 | 53 
| @type mode_or_language :: :turkic | nil | %{language: atom()} 54 | 55 | @default_locale "root" 56 | @default_break :word 57 | 58 | @doc """ 59 | Compares two strings in a case insensitive 60 | manner. 61 | 62 | Case folding is applied to the two string 63 | arguments which are then compared with the 64 | `==` operator. 65 | 66 | ## Arguments 67 | 68 | * `string_a` and `string_b` are two strings 69 | to be compared 70 | 71 | ## Returns 72 | 73 | * `true` or `false` 74 | 75 | ## Notes 76 | 77 | * This function applies the [Unicode Case Folding 78 | algorithm](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf) 79 | 80 | * The algorithm does not apply any treatment to diacritical 81 | marks hence "compare strings without accents" is not 82 | part of this function. 83 | 84 | * No string normalization is performed. Where the 85 | normalization state of the string cannot be guaranteed 86 | it is recommended they be normalized before comparison 87 | using `String.normalize(string, :nfc)`. 88 | 89 | ## Examples 90 | 91 | iex> Unicode.String.equals_ignoring_case? "ABC", "abc" 92 | true 93 | 94 | iex> Unicode.String.equals_ignoring_case? "beißen", "beissen" 95 | true 96 | 97 | iex> Unicode.String.equals_ignoring_case? "grüßen", "grussen" 98 | false 99 | 100 | """ 101 | @spec equals_ignoring_case?(String.t(), String.t(), mode_or_language()) :: boolean 102 | def equals_ignoring_case?(string_a, string_b, mode_or_language_tag \\ nil) do 103 | fold(string_a, mode_or_language_tag) == fold(string_b, mode_or_language_tag) 104 | end 105 | 106 | @doc """ 107 | Returns a boolean indicating if the 108 | requested break is applicable 109 | at the point between the two string 110 | segments represented by `{string_before, string_after}`. 111 | 112 | ## Arguments 113 | 114 | * `string_interval` is any 2-tuple consisting 115 | of the string before a possible break and the string 116 | after a possible break. 117 | 118 | * `options` is a keyword list of 119 | options. 
120 | 121 | ## Options 122 | 123 | * `:locale` is any locale returned by 124 | `Unicode.String.Segment.known_segmentation_locales/0` or 125 | `Unicode.String.Dictionary.known_dictionary_locales/0`. 126 | The default is #{inspect(@default_locale)} which corresponds 127 | to the break rules defined by the 128 | [Unicode Segmentation](https://unicode.org/reports/tr29/) rules. 129 | 130 | * `:break` is the type of break. It is one of 131 | `:grapheme`, `:word`, `:line` or `:sentence`. The 132 | default is `#{inspect(@default_break)}`. 133 | 134 | * `:suppressions` is a boolean which, 135 | if `true`, will suppress breaks for common 136 | abbreviations defined for the `locale`. The 137 | default is `true`. 138 | 139 | ## Returns 140 | 141 | * `true` or `false` or 142 | 143 | * raises an exception if there is an error. 144 | 145 | ## Examples 146 | 147 | iex> Unicode.String.break? {"This is ", "some words"} 148 | true 149 | 150 | iex> Unicode.String.break? {"This is ", "some words"}, break: :sentence 151 | false 152 | 153 | iex> Unicode.String.break? {"This is one. ", "This is some words."}, break: :sentence 154 | true 155 | 156 | """ 157 | @spec break?(string_interval :: string_interval(), options :: list(option())) :: 158 | boolean | no_return() 159 | 160 | def break?({string_before, string_after}, options \\ []) do 161 | case break({string_before, string_after}, options) do 162 | {:break, _} -> true 163 | {:no_break, _} -> false 164 | {:error, reason} -> raise ArgumentError, reason 165 | end 166 | end 167 | 168 | @doc """ 169 | Returns match data indicating if the 170 | requested break is applicable 171 | at the point between the two string 172 | segments represented by `{string_before, string_after}`. 173 | 174 | ## Arguments 175 | 176 | * `string_interval` is any 2-tuple consisting 177 | of the string before a possible break and the string 178 | after a possible break. 179 | 180 | * `options` is a keyword list of 181 | options. 
182 | 183 | ## Options 184 | 185 | * `:locale` is any locale returned by 186 | `Unicode.String.Segment.known_segmentation_locales/0` or 187 | `Unicode.String.Dictionary.known_dictionary_locales/0`. 188 | The default is #{inspect(@default_locale)} which corresponds 189 | to the break rules defined by the 190 | [Unicode Segmentation](https://unicode.org/reports/tr29/) rules. 191 | 192 | * `:break` is the type of break. It is one of 193 | `:grapheme`, `:word`, `:line` or `:sentence`. The 194 | default is `#{inspect(@default_break)}`. 195 | 196 | * `:suppressions` is a boolean which, 197 | if `true`, will suppress breaks for common 198 | abbreviations defined for the `locale`. The 199 | default is `true`. 200 | 201 | ## Returns 202 | 203 | A tuple indicating if a break would 204 | be applicable at this point between 205 | `string_before` and `string_after`. 206 | 207 | * `{:break, {string_before, {matched_string, remaining_string}}}` or 208 | 209 | * `{:no_break, {string_before, {matched_string, remaining_string}}}` or 210 | 211 | * `{:error, reason}`. 212 | 213 | ## Examples 214 | 215 | iex> Unicode.String.break {"This is ", "some words"} 216 | {:break, {"This is ", {"s", "ome words"}}} 217 | 218 | iex> Unicode.String.break {"This is ", "some words"}, break: :sentence 219 | {:no_break, {"This is ", {"s", "ome words"}}} 220 | 221 | iex> Unicode.String.break {"This is one. ", "This is some words."}, break: :sentence 222 | {:break, {"This is one. 
", {"T", "his is some words."}}} 223 | 224 | """ 225 | @spec break(string_interval :: string_interval(), options :: list(option())) :: 226 | break_match | error_return 227 | 228 | def break({string_before, string_after}, options \\ []) do 229 | break = Keyword.get(options, :break, @default_break) 230 | 231 | with {:ok, break} <- validate(:break, break), 232 | {:ok, locale} <- segmentation_locale_from_options(break, options), 233 | {:ok, _dictionary} <- Dictionary.ensure_dictionary_loaded_if_available(locale) do 234 | Break.break({string_before, string_after}, locale, break, options) 235 | end 236 | end 237 | 238 | @doc """ 239 | Returns an enumerable that splits a string on demand. 240 | 241 | ## Arguments 242 | 243 | * `string` is any `t:String.t/0`. 244 | 245 | * `options` is a keyword list of 246 | options. 247 | 248 | ## Returns 249 | 250 | * A function that implements the enumerable 251 | protocol or 252 | 253 | * `{:error, reason}` 254 | 255 | ## Options 256 | 257 | * `:locale` is any locale returned by 258 | `Unicode.String.Segment.known_segmentation_locales/0` or 259 | `Unicode.String.Dictionary.known_dictionary_locales/0`. 260 | The default is #{inspect(@default_locale)} which corresponds 261 | to the break rules defined by the 262 | [Unicode Segmentation](https://unicode.org/reports/tr29/) rules. 263 | 264 | * `:break` is the type of break. It is one of 265 | `:grapheme`, `:word`, `:line` or `:sentence`. The 266 | default is `#{inspect(@default_break)}`. 267 | 268 | * `:suppressions` is a boolean which, 269 | if `true`, will suppress breaks for common 270 | abbreviations defined for the `locale`. The 271 | default is `true`. 272 | 273 | * `:trim` is a boolean indicating if segments 274 | the are comprised of only white space are to be 275 | excluded from the returned list. The default 276 | is `false`. 277 | 278 | ## Examples 279 | 280 | iex> enum = Unicode.String.splitter "This is a sentence. 
And another.", break: :word, trim: true 281 | iex> Enum.take enum, 3 282 | ["This", "is", "a"] 283 | 284 | """ 285 | @spec splitter(string :: String.t(), split_options :: list(split_option)) :: 286 | function | error_return 287 | 288 | def splitter(string, options) when is_binary(string) do 289 | break = Keyword.get(options, :break, @default_break) 290 | 291 | with {:ok, break} <- validate(:break, break), 292 | {:ok, locale} <- segmentation_locale_from_options(break, options), 293 | {:ok, _dictionary} <- Dictionary.ensure_dictionary_loaded_if_available(locale) do 294 | Stream.unfold(string, &Break.next(&1, locale, break, options)) 295 | end 296 | end 297 | 298 | @doc """ 299 | Returns next segment in a string. 300 | 301 | ## Arguments 302 | 303 | * `string` is any `t:String.t/0`. 304 | 305 | * `options` is a keyword list of 306 | options. 307 | 308 | ## Returns 309 | 310 | A tuple with the segment and the remainder of the string or `""` 311 | in case the String reached its end. 312 | 313 | * `{next_string, rest_of_the_string}` or 314 | 315 | * `{:error, reason}` 316 | 317 | ## Options 318 | 319 | * `:locale` is any locale returned by 320 | `Unicode.String.Segment.known_segmentation_locales/0` or 321 | `Unicode.String.Dictionary.known_dictionary_locales/0` or 322 | a [Cldr.LanguageTag](https://hexdocs.pm/ex_cldr/Cldr.LanguageTag.html) 323 | struct. The default is #{inspect(@default_locale)} which corresponds 324 | to the break rules defined by the 325 | [Unicode Segmentation](https://unicode.org/reports/tr29/) rules. 326 | 327 | * `:break` is the type of break. It is one of 328 | `:grapheme`, `:word`, `:line` or `:sentence`. The 329 | default is `#{inspect(@default_break)}`. 330 | 331 | * `:suppressions` is a boolean which, 332 | if `true`, will suppress breaks for common 333 | abbreviations defined for the `locale`. The 334 | default is `true`. 335 | 336 | ## Examples 337 | 338 | iex> Unicode.String.next "This is a sentence. 
And another.", break: :word 339 | {"This", " is a sentence. And another."} 340 | 341 | iex> Unicode.String.next "This is a sentence. And another.", break: :sentence 342 | {"This is a sentence. ", "And another."} 343 | 344 | """ 345 | @spec next(string :: String.t(), split_options :: list(split_option)) :: 346 | String.t() | nil | error_return 347 | 348 | def next(string, options \\ []) when is_binary(string) do 349 | break = Keyword.get(options, :break, @default_break) 350 | 351 | with {:ok, break} <- validate(:break, break), 352 | {:ok, locale} <- segmentation_locale_from_options(break, options) do 353 | Break.next(string, locale, break, options) 354 | end 355 | end 356 | 357 | @doc """ 358 | Splits a string according to the 359 | specified break type. 360 | 361 | ## Arguments 362 | 363 | * `string` is any `t:String.t/0`. 364 | 365 | * `options` is a keyword list of 366 | options. 367 | 368 | ## Returns 369 | 370 | * A list of strings after applying the 371 | specified break rules or 372 | 373 | * `{:error, reason}` 374 | 375 | ## Options 376 | 377 | * `:locale` is any locale returned by 378 | `Unicode.String.Segment.known_segmentation_locales/0` or 379 | `Unicode.String.Dictionary.known_dictionary_locales/0` or 380 | a [Cldr.LanguageTag](https://hexdocs.pm/ex_cldr/Cldr.LanguageTag.html) 381 | struct. The default is #{inspect(@default_locale)} which corresponds 382 | to the break rules defined by the 383 | [Unicode Segmentation](https://unicode.org/reports/tr29/) rules. 384 | 385 | * `:break` is the type of break. It is one of 386 | `:grapheme`, `:word`, `:line` or `:sentence`. The 387 | default is `#{inspect(@default_break)}`. 388 | 389 | * `:suppressions` is a boolean which, 390 | if `true`, will suppress breaks for common 391 | abbreviations defined for the `locale`. The 392 | default is `true`. 393 | 394 | * `:trim` is a boolean indicating if segments 395 | the are comprised of only white space are to be 396 | excluded from the returned list. 
The default 397 | is `false`. 398 | 399 | ## Examples 400 | 401 | iex> Unicode.String.split "This is a sentence. And another.", break: :word 402 | ["This", " ", "is", " ", "a", " ", "sentence", ".", " ", "And", " ", "another", "."] 403 | 404 | iex> Unicode.String.split "This is a sentence. And another.", break: :word, trim: true 405 | ["This", "is", "a", "sentence", ".", "And", "another", "."] 406 | 407 | iex> Unicode.String.split "This is a sentence. And another.", break: :sentence 408 | ["This is a sentence. ", "And another."] 409 | 410 | """ 411 | @spec split(string :: String.t(), split_options :: list(split_option)) :: 412 | [String.t(), ...] | error_return 413 | 414 | def split(string, options \\ []) when is_binary(string) do 415 | break = Keyword.get(options, :break, @default_break) 416 | 417 | with {:ok, break} <- validate(:break, break), 418 | {:ok, locale} <- segmentation_locale_from_options(break, options) do 419 | Break.split(string, locale, break, options) 420 | end 421 | |> maybe_trim(options[:trim]) 422 | end 423 | 424 | defp maybe_trim(list, true) when is_list(list) do 425 | Enum.reject(list, &Property.white_space?/1) 426 | end 427 | 428 | defp maybe_trim(list, _) do 429 | list 430 | end 431 | 432 | @doc """ 433 | Return a stream that breaks a string into 434 | graphemes, words, sentences or line breaks. 435 | 436 | ## Arguments 437 | 438 | * `string` is any `t:String.t/0`. 439 | 440 | * `options` is a keyword list of 441 | options. 442 | 443 | ## Returns 444 | 445 | * A stream that is an `t:Enumerable.t/0` that 446 | can be used with the functions in the `Stream` 447 | or `Enum` modules. 448 | 449 | * `{:error, reason}` 450 | 451 | ## Options 452 | 453 | * `:locale` is any locale returned by 454 | `Unicode.String.Segment.known_segmentation_locales/0` or 455 | `Unicode.String.Dictionary.known_dictionary_locales/0` or 456 | a [Cldr.LanguageTag](https://hexdocs.pm/ex_cldr/Cldr.LanguageTag.html) 457 | struct. 
The default is #{inspect(@default_locale)} which corresponds 458 | to the break rules defined by the 459 | [Unicode Segmentation](https://unicode.org/reports/tr29/) rules. 460 | 461 | * `:break` is the type of break. It is one of 462 | `:grapheme`, `:word`, `:line` or `:sentence`. The 463 | default is `#{inspect(@default_break)}`. 464 | 465 | * `:suppressions` is a boolean which, 466 | if `true`, will suppress breaks for common 467 | abbreviations defined for the `locale`. The 468 | default is `true`. 469 | 470 | * `:trim` is a boolean indicating if segments 471 | the are comprised of only white space are to be 472 | excluded from the returned list. The default 473 | is `false`. 474 | 475 | ## Examples 476 | 477 | iex> Enum.to_list Unicode.String.stream("this is a set of words", trim: true) 478 | ["this", "is", "a", "set", "of", "words"] 479 | 480 | iex> Enum.to_list Unicode.String.stream("this is a set of words", break: :sentence, trim: true) 481 | ["this is a set of words"] 482 | 483 | """ 484 | @doc since: "1.2.0" 485 | 486 | @spec stream(string :: String.t(), split_options :: list(split_option)) :: 487 | Enumerable.t() | {:error, String.t()} 488 | 489 | def stream(string, options \\ []) do 490 | break = Keyword.get(options, :break, @default_break) 491 | 492 | with {:ok, break} <- validate(:break, break), 493 | {:ok, locale} <- segmentation_locale_from_options(break, options) do 494 | Stream.resource( 495 | fn -> string end, 496 | fn string -> 497 | case Break.next(string, locale, break, options) do 498 | nil -> {:halt, ""} 499 | {break, rest} -> {[break], rest} 500 | end 501 | end, 502 | fn _ -> :ok end 503 | ) 504 | end 505 | end 506 | 507 | @doc """ 508 | Converts all characters in the given string to upper case 509 | according to the Unicode Casing algorithm. 510 | 511 | ### Arguments 512 | 513 | * `string` is any `t:String.t/0`. 514 | 515 | * `options` is a keyword list of options. 
516 | 517 | ### Options 518 | 519 | * `:locale` is any [ISO 639](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) 520 | language code or a [LanguageTag](https://hexdocs.pm/ex_cldr/Cldr.LanguageTag.html) 521 | which provides integration with [ex_cldr](https://hex.pm/packages/ex_cldr) 522 | applications. The default is `:any` which signifies the 523 | application of the base Unicode casing algorithm. 524 | 525 | ### Notes 526 | 527 | * The locale option determines the use of certain locale-specific 528 | casing rules. Where no specific casing rules apply to 529 | the given locale, the base Unicode casing algorithm is 530 | applied. The locales which have customized casing rules 531 | are returned by `Unicode.String.special_casing_locales/0`. 532 | 533 | ### Returns 534 | 535 | * `upcased_string` 536 | 537 | ### Examples 538 | 539 | # Basic case transformation 540 | iex> Unicode.String.upcase("the quick brown fox") 541 | "THE QUICK BROWN FOX" 542 | 543 | # Dotted-I in Turkish and Azeri 544 | iex> Unicode.String.upcase("Diyarbakır", locale: :tr) 545 | "DİYARBAKIR" 546 | 547 | # Upper case in Greek removes diacritics 548 | iex> Unicode.String.upcase("Πατάτα, Αέρας, Μυστήριο", locale: :el) 549 | "ΠΑΤΑΤΑ, ΑΕΡΑΣ, ΜΥΣΤΗΡΙΟ" 550 | 551 | """ 552 | @doc since: "1.3.0" 553 | 554 | @spec upcase(String.t(), Keyword.t()) :: String.t() 555 | def upcase(string, options \\ []) when is_list(options) do 556 | with {:ok, locale} <- casing_locale_from_options(options) do 557 | Case.Mapping.upcase(string, locale) 558 | end 559 | end 560 | 561 | @doc """ 562 | Converts all characters in the given string to lower case 563 | according to the Unicode Casing algorithm. 564 | 565 | ### Arguments 566 | 567 | * `string` is any `t:String.t/0`. 568 | 569 | * `options` is a keyword list of options. 
570 | 571 | ### Options 572 | 573 | * `:locale` is any [ISO 639](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) 574 | language code or a [LanguageTag](https://hexdocs.pm/ex_cldr/Cldr.LanguageTag.html) 575 | which provides integration with [ex_cldr](https://hex.pm/packages/ex_cldr) 576 | applications. The default is `:any` which signifies the 577 | application of the base Unicode casing algorithm. 578 | 579 | ### Notes 580 | 581 | * The locale option determines the use of certain locale-specific 582 | casing rules. Where no specific casing rules apply to 583 | the given locale, the base Unicode casing algorithm is 584 | applied. The locales which have customized casing rules 585 | are returned by `Unicode.String.special_casing_locales/0`. 586 | 587 | ### Returns 588 | 589 | * `downcased_string` 590 | 591 | ### Examples 592 | 593 | iex> Unicode.String.downcase("THE QUICK BROWN FOX") 594 | "the quick brown fox" 595 | 596 | # Lower case Greek with a final sigma 597 | iex> Unicode.String.downcase("ὈΔΥΣΣΕΎΣ", locale: :el) 598 | "ὀδυσσεύς" 599 | 600 | # Lower case in Turkish and Azeri correctly handles 601 | # undotted-i and undotted-I 602 | iex> Unicode.String.downcase("DİYARBAKIR", locale: :tr) 603 | "diyarbakır" 604 | 605 | """ 606 | @doc since: "1.3.0" 607 | 608 | @spec downcase(String.t(), Keyword.t()) :: String.t() 609 | def downcase(string, options \\ []) when is_list(options) do 610 | with {:ok, locale} <- casing_locale_from_options(options) do 611 | Case.Mapping.downcase(string, locale) 612 | end 613 | end 614 | 615 | @doc """ 616 | Converts the given string to title case 617 | according to the Unicode Casing algorithm. 618 | 619 | Title casing is the process of transforming 620 | the first character of each word in a string 621 | to upper case and the following characters 622 | in the word to lower case. 623 | 624 | As a result this algorithm does not conform 625 | to the norms of all languages and cultures. 
626 | However special processing is performed for 627 | the Dutch diphthong "IJ" when using the `:nl` 628 | casing locale. 629 | 630 | Further work will focus on improving title 631 | casing of Greek diphthongs. 632 | 633 | ### Arguments 634 | 635 | * `string` is any `t:String.t/0`. 636 | 637 | * `options` is a keyword list of options. 638 | 639 | ### Options 640 | 641 | * `:locale` is any [ISO 639](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) 642 | language code or a [LanguageTag](https://hexdocs.pm/ex_cldr/Cldr.LanguageTag.html) 643 | which provides integration with [ex_cldr](https://hex.pm/packages/ex_cldr) 644 | applications. The default is `:any` which signifies the 645 | application of the base Unicode casing algorithm. 646 | 647 | ### Notes 648 | 649 | * The locale option determines the use of certain locale-specific 650 | casing rules. Where no specific casing rules apply to 651 | the given locale, the base Unicode casing algorithm is 652 | applied. The locales which have customized casing rules 653 | are returned by `Unicode.String.special_casing_locales/0`. 654 | 655 | * The string is broken into words using 656 | `Unicode.String.break/2` which implements the 657 | [Unicode segmentation algorithm](https://unicode.org/reports/tr29/). 658 | 659 | ### Returns 660 | 661 | * `title_cased_string`. 
662 | 663 | ### Examples 664 | 665 | iex> Unicode.String.titlecase("THE QUICK BROWN FOX") 666 | "The Quick Brown Fox" 667 | 668 | # Title case Dutch with leading diphthong 669 | iex> Unicode.String.titlecase("ijsselmeer", locale: :nl) 670 | "IJsselmeer" 671 | 672 | """ 673 | @doc since: "1.3.0" 674 | 675 | @spec titlecase(String.t(), Keyword.t()) :: String.t() 676 | def titlecase(string, options \\ []) when is_list(options) do 677 | with {:ok, casing_locale} <- casing_locale_from_options(options), 678 | {:ok, segmentation_locale} <- segmentation_locale_from_options(:word, options) do 679 | stream_options = Keyword.merge(options, break: :word, locale: segmentation_locale) 680 | 681 | string 682 | |> stream(stream_options) 683 | |> Enum.map(&Case.Mapping.titlecase(&1, casing_locale)) 684 | |> Enum.join() 685 | end 686 | end 687 | 688 | # These locales have some additional processing 689 | # beyond that specified in SpecialCasing.txt 690 | @special_casing_locales [:nl, :el] 691 | @casing_locales (@special_casing_locales ++ Unicode.Utils.known_casing_locales()) 692 | |> Enum.sort() 693 | 694 | @doc """ 695 | Returns a list of locales that have special 696 | casing rules. 
697 | 698 | ### Example 699 | 700 | iex> Unicode.String.special_casing_locales() 701 | [:az, :el, :lt, :nl, :tr] 702 | 703 | """ 704 | def special_casing_locales do 705 | @casing_locales 706 | end 707 | 708 | # 709 | # Helpers 710 | # 711 | 712 | @doc false 713 | def casing_locale(locale) do 714 | casing_locale_from_options(locale: locale) 715 | end 716 | 717 | @doc false 718 | def segmentation_locale(break, locale) do 719 | segmentation_locale_from_options(break, locale: locale) 720 | end 721 | 722 | defp casing_locale_from_options(options) do 723 | options 724 | |> Keyword.get(:locale) 725 | |> match_locale(@casing_locales, :any) 726 | |> wrap(:ok) 727 | end 728 | 729 | @segmentation_locales Segment.known_segmentation_locales() 730 | @dictionary_locales Dictionary.known_dictionary_locales() 731 | 732 | defp segmentation_locale_from_options(:word, options) do 733 | locale = Keyword.get(options, :locale) 734 | segmentation_locale = match_locale(locale, @segmentation_locales, :root) 735 | dictionary_locale = match_locale(locale, @dictionary_locales, nil) 736 | 737 | if dictionary_locale do 738 | Dictionary.ensure_dictionary_loaded_if_available(dictionary_locale) 739 | end 740 | 741 | (dictionary_locale || segmentation_locale) 742 | |> wrap(:ok) 743 | end 744 | 745 | defp segmentation_locale_from_options(_break, options) do 746 | options 747 | |> Keyword.get(:locale) 748 | |> match_locale(@segmentation_locales, :root) 749 | |> wrap(:ok) 750 | end 751 | 752 | @doc false 753 | def dictionary_locale(locale) do 754 | dictionary_locale_from_options(locale: locale) 755 | end 756 | 757 | @dictionary_locales Dictionary.known_dictionary_locales() 758 | 759 | defp dictionary_locale_from_options(options) do 760 | options 761 | |> Keyword.get(:locale) 762 | |> match_locale(@dictionary_locales, nil) 763 | |> wrap(:ok) 764 | end 765 | 766 | defp wrap({:error, _} = error, _) do 767 | error 768 | end 769 | 770 | defp wrap(term, atom) do 771 | {atom, term} 772 | end 773 | 774 | defp 
match_locale(nil, _known_locales, default) do 775 | default 776 | end 777 | 778 | # The Enum.sort/1 here relies on the coincidental fact that the three fields 779 | # are alphabetically in the order we already want 780 | 781 | defp match_locale(locale, known_locales, default) when is_struct(locale, Cldr.LanguageTag) do 782 | locale 783 | |> Map.take([:canonical_locale_name, :cldr_locale_name, :language]) 784 | |> Enum.sort() 785 | |> Keyword.values() 786 | |> Enum.uniq() 787 | |> Enum.map(&atomize/1) 788 | |> find_matching_locale(known_locales, default) 789 | end 790 | 791 | defp match_locale(locale, known_locales, default) when is_binary(locale) do 792 | locale 793 | |> String.split(["-", "_"]) 794 | |> build_candidate_locales() 795 | |> find_matching_locale(known_locales, default) 796 | end 797 | 798 | defp match_locale(locale, known_locales, default) when is_atom(locale) do 799 | if locale in known_locales do 800 | locale 801 | else 802 | match_locale(to_string(locale), known_locales, default) 803 | end 804 | end 805 | 806 | # Means it was a segment match request 807 | defp match_locale(locale, _known_locales, :root) do 808 | {:error, Segment.unknown_locale_error(locale)} 809 | end 810 | 811 | # Means it was a casing match request 812 | defp match_locale(locale, _known_locales, :any) do 813 | {:error, Case.Mapping.unknown_locale_error(locale)} 814 | end 815 | 816 | def find_matching_locale(candidates, known_locales, default) do 817 | Enum.reduce_while(candidates, default, fn candidate, default -> 818 | if candidate in known_locales do 819 | {:halt, candidate} 820 | else 821 | {:cont, default} 822 | end 823 | end) 824 | end 825 | 826 | defp build_candidate_locales([language]) when is_language(language) do 827 | language 828 | |> String.downcase() 829 | |> atomize() 830 | |> List.wrap() 831 | |> Enum.reject(&is_nil/1) 832 | end 833 | 834 | defp build_candidate_locales([language, territory | _rest]) 835 | when is_language(language) and is_territory(territory) do 836 
| language = downcase(language) 837 | territory = upcase(territory) 838 | 839 | Enum.reject([atomize("#{language}-#{territory}"), atomize(language)], &is_nil/1) 840 | end 841 | 842 | defp build_candidate_locales([language, script, territory | _rest]) 843 | when is_language(language) and is_script(script) and is_territory(territory) do 844 | language = downcase(language) 845 | script = titlecase(script) 846 | territory = upcase(territory) 847 | 848 | Enum.reject([ 849 | atomize("#{language}-#{territory}"), 850 | atomize("#{language}-#{script}"), 851 | atomize(language) 852 | ], &is_nil/1) 853 | end 854 | 855 | defp build_candidate_locales([language, script | _rest]) 856 | when is_language(language) and is_script(script) do 857 | language = downcase(language) 858 | script = titlecase(script) 859 | 860 | Enum.reject([atomize("#{language}-#{script}"), atomize(language)], &is_nil/1) 861 | end 862 | 863 | defp build_candidate_locales([language | _rest]) when is_language(language) do 864 | build_candidate_locales([language]) 865 | end 866 | 867 | defp build_candidate_locales(["root"]) do 868 | [:root] 869 | end 870 | 871 | defp build_candidate_locales(_other) do 872 | [] 873 | end 874 | 875 | defp atomize(string) do 876 | String.to_existing_atom(string) 877 | rescue 878 | ArgumentError -> 879 | nil 880 | end 881 | 882 | @breaks [:word, :grapheme, :line, :sentence] 883 | 884 | defp validate(:break, break) do 885 | if break in @breaks do 886 | {:ok, break} 887 | else 888 | {:error, "Unknown break #{inspect(break)}. 
Valid breaks are #{inspect(@breaks)}"} 889 | end 890 | end 891 | end 892 | -------------------------------------------------------------------------------- /lib/unicode/trie.ex: -------------------------------------------------------------------------------- 1 | defmodule Unicode.String.Trie do 2 | def new(list) do 3 | :btrie.new(list) 4 | end 5 | 6 | def find_prefix(string, dictionary) do 7 | :btrie.find_prefix(string, dictionary) 8 | end 9 | 10 | def has_key(string, dictionary) do 11 | :btrie.is_key(string, dictionary) 12 | end 13 | end -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elixir-unicode/unicode_string/2ae703bb30551cf25d0dbf4100908c4986a6bbfc/logo.png -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | defmodule Unicode.String.MixProject do 2 | use Mix.Project 3 | 4 | @version "1.7.0" 5 | 6 | def project do 7 | [ 8 | app: :unicode_string, 9 | version: @version, 10 | elixir: "~> 1.11", 11 | start_permanent: Mix.env() == :prod, 12 | build_embedded: Mix.env() == :prod, 13 | deps: deps(), 14 | docs: docs(), 15 | name: "Unicode String", 16 | source_url: "https://github.com/elixir-unicode/unicode_string", 17 | description: description(), 18 | package: package(), 19 | elixirc_paths: elixirc_paths(Mix.env()), 20 | dialyzer: [ 21 | plt_add_apps: ~w(mix sweet_xml)a, 22 | flags: [:underspecs] 23 | ] 24 | ] 25 | end 26 | 27 | defp description do 28 | """ 29 | Unicode locale-aware case folding, case mapping (upcase, downcase and titlecase) 30 | case-insensitive equality as well as word, line, grapheme and sentence 31 | breaking and streaming. 
32 | """ 33 | end 34 | 35 | defp package do 36 | [ 37 | maintainers: ["Kip Cole"], 38 | licenses: ["Apache-2.0"], 39 | logo: "logo.png", 40 | links: links(), 41 | files: [ 42 | "lib", 43 | "priv", 44 | "logo.png", 45 | "mix.exs", 46 | "README*", 47 | "CHANGELOG*", 48 | "LICENSE*" 49 | ] 50 | ] 51 | end 52 | 53 | def application do 54 | [ 55 | extra_applications: [:logger] 56 | ] 57 | end 58 | 59 | defp deps do 60 | [ 61 | {:unicode, "~> 1.19"}, 62 | {:unicode_set, "~> 1.5"}, 63 | 64 | {:trie, "~> 2.0"}, 65 | {:ex_cldr, "~> 2.38", optional: true}, 66 | {:jason, "~> 1.0", optional: true}, 67 | {:sweet_xml, "~> 0.7", runtime: false}, 68 | {:dialyxir, "~> 1.0", only: [:dev, :test], runtime: false}, 69 | # {:benchee, "~> 1.0", only: :dev, optional: true}, 70 | {:ex_doc, "~> 0.23", only: [:dev, :release], optional: true, runtime: false} 71 | ] 72 | end 73 | 74 | def links do 75 | %{ 76 | "GitHub" => "https://github.com/elixir-unicode/unicode_string", 77 | "Readme" => "https://github.com/elixir-unicode/unicode_string/blob/v#{@version}/README.md", 78 | "Changelog" => 79 | "https://github.com/elixir-unicode/unicode_string/blob/v#{@version}/CHANGELOG.md" 80 | } 81 | end 82 | 83 | def docs do 84 | [ 85 | source_ref: "v#{@version}", 86 | main: "readme", 87 | logo: "logo.png", 88 | extras: [ 89 | "README.md", 90 | "LICENSE.md", 91 | "CHANGELOG.md" 92 | ], 93 | skip_undefined_reference_warnings_on: ["changelog", "CHANGELOG.md"], 94 | formatters: ["html"] 95 | ] 96 | end 97 | 98 | defp elixirc_paths(:test), do: ["lib", "mix", "src", "test"] 99 | defp elixirc_paths(:dev), do: ["lib", "mix", "src", "bench"] 100 | defp elixirc_paths(_), do: ["lib", "src"] 101 | end 102 | -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "cldr_utils": {:hex, :cldr_utils, "2.28.2", "f500667164a9043369071e4f9dcef31f88b8589b2e2c07a1eb9f9fa53cb1dce9", [:mix], [{:castore, 
"~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:certifi, "~> 2.5", [hex: :certifi, repo: "hexpm", optional: true]}, {:decimal, "~> 1.9 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}], "hexpm", "c506eb1a170ba7cdca59b304ba02a56795ed119856662f6b1a420af80ec42551"}, 3 | "decimal": {:hex, :decimal, "2.3.0", "3ad6255aa77b4a3c4f818171b12d237500e63525c2fd056699967a3e7ea20f62", [:mix], [], "hexpm", "a4d66355cb29cb47c3cf30e71329e58361cfcb37c34235ef3bf1d7bf3773aeac"}, 4 | "dialyxir": {:hex, :dialyxir, "1.4.5", "ca1571ac18e0f88d4ab245f0b60fa31ff1b12cbae2b11bd25d207f865e8ae78a", [:mix], [{:erlex, ">= 0.2.7", [hex: :erlex, repo: "hexpm", optional: false]}], "hexpm", "b0fb08bb8107c750db5c0b324fa2df5ceaa0f9307690ee3c1f6ba5b9eb5d35c3"}, 5 | "earmark_parser": {:hex, :earmark_parser, "1.4.44", "f20830dd6b5c77afe2b063777ddbbff09f9759396500cdbe7523efd58d7a339c", [:mix], [], "hexpm", "4778ac752b4701a5599215f7030989c989ffdc4f6df457c5f36938cc2d2a2750"}, 6 | "erlex": {:hex, :erlex, "0.2.7", "810e8725f96ab74d17aac676e748627a07bc87eb950d2b83acd29dc047a30595", [:mix], [], "hexpm", "3ed95f79d1a844c3f6bf0cea61e0d5612a42ce56da9c03f01df538685365efb0"}, 7 | "ex_cldr": {:hex, :ex_cldr, "2.42.0", "17ea930e88b8802b330e1c1e288cdbaba52cbfafcccf371ed34b299a47101ffb", [:mix], [{:cldr_utils, "~> 2.28", [hex: :cldr_utils, repo: "hexpm", optional: false]}, {:decimal, "~> 1.6 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}, {:gettext, "~> 0.19", [hex: :gettext, repo: "hexpm", optional: true]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: true]}], "hexpm", "07264a7225810ecae6bdd6715d8800c037a1248dc0063923cddc4ca3c4888df6"}, 8 | "ex_doc": {:hex, :ex_doc, "0.37.3", "f7816881a443cd77872b7d6118e8a55f547f49903aef8747dbcb345a75b462f9", [:mix], [{:earmark_parser, "~> 1.4.42", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: 
:makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "e6aebca7156e7c29b5da4daa17f6361205b2ae5f26e5c7d8ca0d3f7e18972233"}, 9 | "jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"}, 10 | "makeup": {:hex, :makeup, "1.2.1", "e90ac1c65589ef354378def3ba19d401e739ee7ee06fb47f94c687016e3713d1", [:mix], [{:nimble_parsec, "~> 1.4", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "d36484867b0bae0fea568d10131197a4c2e47056a6fbe84922bf6ba71c8d17ce"}, 11 | "makeup_elixir": {:hex, :makeup_elixir, "1.0.1", "e928a4f984e795e41e3abd27bfc09f51db16ab8ba1aebdba2b3a575437efafc2", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "7284900d412a3e5cfd97fdaed4f5ed389b8f2b4cb49efc0eb3bd10e2febf9507"}, 12 | "makeup_erlang": {:hex, :makeup_erlang, "1.0.2", "03e1804074b3aa64d5fad7aa64601ed0fb395337b982d9bcf04029d68d51b6a7", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "af33ff7ef368d5893e4a267933e7744e46ce3cf1f61e2dccf53a111ed3aa3727"}, 13 | "nimble_parsec": {:hex, :nimble_parsec, "1.4.2", "8efba0122db06df95bfaa78f791344a89352ba04baedd3849593bfce4d0dc1c6", [:mix], [], "hexpm", "4b21398942dda052b403bbe1da991ccd03a053668d147d53fb8c4e0efe09c973"}, 14 | "sweet_xml": {:hex, :sweet_xml, "0.7.5", "803a563113981aaac202a1dbd39771562d0ad31004ddbfc9b5090bdcd5605277", [:mix], [], "hexpm", 
"193b28a9b12891cae351d81a0cead165ffe67df1b73fe5866d10629f4faefb12"}, 15 | "trie": {:hex, :trie, "2.0.7", "09fa6b08cda978fe97e5b68cd4fca68c6d6fba8e941a9e66c75a4b4bf383af91", [:rebar3], [], "hexpm", "6b86092654bc6383d5c72dfbb32b466d3a70d3e95be37538bb5500ee888fa944"}, 16 | "unicode": {:hex, :unicode, "1.20.0", "10189cfe98b03ebb8be6efd00df0936c1c94d75bfbd62cba2bdf958fef3ee4a7", [:mix], [], "hexpm", "fa581cf80b3b1b7f42e4d24a69109dfac465cec27a62c661306c81f4ab35894c"}, 17 | "unicode_set": {:hex, :unicode_set, "1.5.0", "f2dcc40b1e8daf1a04433c705d9a8fb8ccdfc8fd5763a92d414a3e0775414cfb", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}, {:unicode, "~> 1.13", [hex: :unicode, repo: "hexpm", optional: false]}], "hexpm", "6c7f200e52fb90434d6b783eaa4e0ea303cfc4844ea25b2fc1ba3eb8a6901b11"}, 18 | } 19 | -------------------------------------------------------------------------------- /mix/myapp_backend.ex: -------------------------------------------------------------------------------- 1 | defmodule MyApp.Cldr do 2 | use Cldr, 3 | locales: ["en", "de", "tr", "az", "fr-CA", "lt", "fr", "sv", "ar"], 4 | default_locale: "en", 5 | providers: [] 6 | end -------------------------------------------------------------------------------- /priv/dictionaries/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elixir-unicode/unicode_string/2ae703bb30551cf25d0dbf4100908c4986a6bbfc/priv/dictionaries/.gitkeep -------------------------------------------------------------------------------- /priv/segments/de.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | Port. 13 | Alt. 14 | Di. 15 | Ges. 16 | frz. 17 | entspr. 18 | Gebr. 19 | erw. 20 | Frl. 21 | Inh. 22 | k.u.k. 23 | Ca. 24 | J.D. 25 | Ausg. 26 | evtl. 27 | So. 28 | i.B. 29 | s.a. 30 | kgl. 31 | Sept. 32 | o.B. 33 | Sa. 34 | ev. 
35 | Dez. 36 | am. 37 | i.R. 38 | eigtl. 39 | i.J. 40 | u.U. 41 | G. 42 | z.Hd. 43 | u.A.w.g. 44 | Kl. 45 | Spezif. 46 | Obj. 47 | Ing. 48 | D. h. 49 | Folg. 50 | Akt. 51 | i.A. 52 | Msp. 53 | U.U. 54 | Chr. 55 | R. 56 | Einh. 57 | schwäb. 58 | Vgl. 59 | Aug. 60 | Dipl.-Ing. 61 | W. 62 | B. 63 | U. U. 64 | J. 65 | Fa. 66 | Mo. 67 | n.u.Z. 68 | Op. 69 | Mrd. 70 | e.h. 71 | Hr. 72 | Hrn. 73 | Ztr. 74 | k. u. k. 75 | Bibl. 76 | d.Ä. 77 | b. 78 | M. 79 | i.H. 80 | v.R.w. 81 | o.A. 82 | St. 83 | Dr. 84 | Fn. 85 | Abs. 86 | Rd. 87 | Dtzd. 88 | Jahrh. 89 | Z. 90 | Std. 91 | n. Chr. 92 | möbl. 93 | tägl. 94 | gest. 95 | gesch. 96 | z.B. 97 | Hbf. 98 | Abt. 99 | A.M. 100 | e.Wz. 101 | v.T. 102 | Nov. 103 | z. 104 | Prot. 105 | U.S. 106 | Wg. 107 | u.v.a. 108 | Adr. 109 | App. 110 | ggf. 111 | ggfs. 112 | Jan. 113 | O. 114 | Rel. 115 | od. 116 | Pfd. 117 | a.a.O. 118 | p.Adr. 119 | P. 120 | Gem. 121 | v. Chr. 122 | Art. 123 | z.Z. 124 | S.A. 125 | i.V. 126 | verh. 127 | Ausschl. 128 | m.W. 129 | Dir. 130 | Verf. 131 | Sek. 132 | r. 133 | Chin. 134 | Feb. 135 | Int. 136 | Sep. 137 | Gesch. 138 | schweiz. 139 | Bed. 140 | a.Rh. 141 | jew. 142 | vgl. 143 | a.M. 144 | Str. 145 | exkl. 146 | gek. 147 | Erf. 148 | u.Ä. 149 | ehem. 150 | näml. 151 | u. Z. 152 | v. u. Z. 153 | sog. 154 | C. 155 | Dipl.-Kfm. 156 | mtl. 157 | Hrsg. 158 | Qu. 159 | röm. 160 | u. 161 | U. 162 | Adj. 163 | Kap. 164 | hpts. 165 | a.D. 166 | gedr. 167 | Best. 168 | N. 169 | v.u.Z. 170 | Phys. 171 | Fr. 172 | d.J. 173 | Reg.-Bez. 174 | m.E. 175 | schles. 176 | Max. 177 | Ltd. 178 | südd. 179 | inkl. 180 | geb. 181 | Ggf. 182 | Inc. 183 | kath. 184 | kfm. 185 | Nr. 186 | Proz. 187 | Dim. 188 | verw. 189 | Reg. 190 | Dat. 191 | Evtl. 192 | led. 193 | F. 194 | Test. 195 | Schr. 196 | Do. 197 | PIN. 198 | Z. Zt. 199 | v.Chr. 200 | Tägl. 201 | s. 202 | amtl. 203 | Temp. 204 | Mind. 205 | e.V. 206 | Abw. 207 | P.M. 208 | F.f. 209 | a.a.S. 210 | Mod. 211 | Co. 212 | Min. 213 | Allg. 214 | Geograph. 215 | Jr. 
216 | Urspr. 217 | Apr. 218 | Z. B. 219 | v.H. 220 | A. 221 | einschl. 222 | Trans. 223 | zzgl. 224 | StR. 225 | Fam. 226 | I. 227 | jhrl. 228 | u.a. 229 | Ben. 230 | o.g. 231 | Kfm. 232 | Konv. 233 | Mi. 234 | L. 235 | beil. 236 | T. 237 | Ursprüngl. 238 | röm.-kath. 239 | Okt. 240 | u.ä. 241 | Tel. 242 | D. 243 | Ber. 244 | Kop. 245 | Mio. 246 | Y. 247 | U.S.A. 248 | v. H. 249 | Forts. f. 250 | Rep. 251 | Hptst. 252 | österr. 253 | 254 | 255 | 256 | 257 | 258 | -------------------------------------------------------------------------------- /priv/segments/el.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | [[$STerm] [\u003B \u037E]] 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /priv/segments/en.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | L.P. 13 | Alt. 14 | Approx. 15 | E.G. 16 | O. 17 | Maj. 18 | Misc. 19 | P.O. 20 | J.D. 21 | Jam. 22 | Card. 23 | Dec. 24 | Sept. 25 | MR. 26 | Long. 27 | Hat. 28 | G. 29 | Link. 30 | DC. 31 | D.C. 32 | M.T. 33 | Hz. 34 | Mrs. 35 | By. 36 | Act. 37 | Var. 38 | N.V. 39 | Aug. 40 | B. 41 | S.A. 42 | Up. 43 | Job. 44 | Num. 45 | M.I.T. 46 | Ok. 47 | Org. 48 | Ex. 49 | Cont. 50 | U. 51 | Mart. 52 | Fn. 53 | Abs. 54 | Lt. 55 | OK. 56 | Z. 57 | E. 58 | Kb. 59 | Est. 60 | A.M. 61 | L.A. 62 | Prof. 63 | U.S. 64 | Nov. 65 | Ph.D. 66 | Mar. 67 | I.T. 68 | exec. 69 | Jan. 70 | N.Y. 71 | X. 72 | Md. 73 | Op. 74 | vs. 75 | D.A. 76 | A.D. 77 | R.L. 78 | P.M. 79 | Or. 80 | M.R. 81 | Cap. 82 | PC. 83 | Feb. 84 | Exec. 85 | I.e. 86 | Sep. 87 | Gb. 88 | K. 89 | U.S.C. 90 | Mt. 91 | S. 92 | A.S. 93 | C.O.D. 94 | Capt. 95 | Col. 96 | In. 97 | C.F. 98 | Adj. 99 | AD. 100 | I.D. 101 | Mgr. 102 | R.T. 103 | B.V. 104 | M. 105 | Conn. 106 | Yr. 107 | Rev. 108 | Phys. 109 | pp. 110 | Ms. 111 | To. 
112 | Sgt. 113 | J.K. 114 | Nr. 115 | Jun. 116 | Fri. 117 | S.A.R. 118 | Lev. 119 | Lt.Cdr. 120 | Def. 121 | F. 122 | Do. 123 | Joe. 124 | Id. 125 | Mr. 126 | Dept. 127 | Is. 128 | Pvt. 129 | Diff. 130 | Hon.B.A. 131 | Q. 132 | Mb. 133 | On. 134 | Min. 135 | J.B. 136 | Ed. 137 | AB. 138 | A. 139 | S.p.A. 140 | I. 141 | a.m. 142 | Comm. 143 | Go. 144 | VS. 145 | L. 146 | All. 147 | PP. 148 | P.V. 149 | T. 150 | K.R. 151 | Etc. 152 | D. 153 | Adv. 154 | Lib. 155 | E.g. 156 | Pro. 157 | U.S.A. 158 | S.E. 159 | AA. 160 | Rep. 161 | Sq. 162 | As. 163 | 164 | 165 | 166 | 167 | 168 | -------------------------------------------------------------------------------- /priv/segments/en_US.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /priv/segments/en_US_POSIX.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | [[$MidNumLet]-[.]] 19 | [[$MidNum] [.]] 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /priv/segments/es.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | Rdos. 13 | JJ.OO. 14 | Sres. 15 | fig. 16 | may. 17 | RR.HH. 18 | oct. 19 | cap. 20 | mié. 21 | doc. 22 | Excmo. 23 | Trab. 24 | Excmos. 25 | Kit. 26 | Inc. 27 | FF.CC. 28 | DC. 29 | ago. 30 | trad. 31 | SA. 32 | Rvdos. 33 | ed. 34 | Exmo. 35 | jul. 36 | col. 37 | RAM. 38 | Srtas. 39 | ene. 40 | Rol. 41 | Fabric. 42 | Comm. 43 | vid. 44 | Da. 45 | dic. 46 | ss. 47 | abr. 48 | ntra. 49 | Sra. 50 | dtor. 51 | cf. 52 | dom. 53 | prov. 54 | Emm. 55 | Sr. 56 | licdo. 57 | p.ej. 58 | bol. 59 | figs. 60 | Vda. 61 | Dr. 62 | ntro. 63 | Desv. 64 | O.M. 65 | Ldo. 66 | Drs. 67 | sáb. 68 | feb. 69 | Ltda. 
70 | Lcda. 71 | Exma. 72 | C.V. 73 | SS.MM. 74 | Lda. 75 | U.S. 76 | hnos. 77 | R.D. 78 | Korn. 79 | v.gr. 80 | vs. 81 | Ilmas. 82 | Rdo. 83 | ej. 84 | vie. 85 | jue. 86 | a. C. 87 | Ilmos. 88 | e. c. 89 | Excma. 90 | afma. 91 | licda. 92 | Em. 93 | K. 94 | sras. 95 | MM. 96 | fund. 97 | Mons. 98 | Lcdo. 99 | afmo. 100 | C. 101 | A.C. 102 | dptos. 103 | Col. 104 | Srta. 105 | Av. 106 | Ant. 107 | depto. 108 | Var. 109 | H.P. 110 | D. 111 | M. 112 | C.P. 113 | Rev. 114 | Rvdmos. 115 | Fr. 116 | Ilmo. 117 | afmos. 118 | Ltd. 119 | afmas. 120 | prof. 121 | lun. 122 | SS.AA. 123 | Sol. 124 | nov. 125 | mss. 126 | Dña. 127 | Seg. 128 | mar. 129 | Rvdmo. 130 | Reg. 131 | ms. 132 | Sras. 133 | sres. 134 | U.S.A. 135 | Sta. 136 | Sdad. 137 | Dra. 138 | srs. 139 | R.U. 140 | deptos. 141 | dpto. 142 | jun. 143 | bco. 144 | Cía. 145 | Id. 146 | Mr. 147 | e.g. 148 | C.S. 149 | Excmas. 150 | Dª. 151 | Rvdo. 152 | Lic. 153 | cfr. 154 | Corp. 155 | Dto. 156 | Ilma. 157 | L. 158 | All. 159 | PP. 160 | d. C. 161 | Ltdo. 162 | mtro. 163 | Mrs. 164 | Desc. 165 | Avda. 166 | Exmas. 167 | a. e. c. 168 | Bien. 169 | Exmos. 170 | AA. 171 | Sto. 172 | CA. 173 | sept. 174 | Exc. 175 | c/c. 176 | 177 | 178 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /priv/segments/fi.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | \p{Word_Break=MidLetter} 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /priv/segments/fr.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | aux. 13 | config. 14 | collab. 15 | M. 16 | dim. 17 | imprim. 18 | oct. 19 | syst. 20 | bull. 21 | MM. 22 | doc. 23 | P.O. 24 | hôp. 25 | Mart. 26 | juil. 27 | broch. 28 | adr. 29 | symb. 30 | C. 31 | anc. 32 | voit. 33 | Jr. 34 | graph. 35 | dir. 
36 | éd. 37 | fig. 38 | édit. 39 | niv. 40 | quart. 41 | cam. 42 | éval. 43 | anon. 44 | réf. 45 | Comm. 46 | Prof. 47 | févr. 48 | indus. 49 | DC. 50 | équiv. 51 | illustr. 52 | acoust. 53 | nov. 54 | L. 55 | All. 56 | U.S. 57 | S.M.A.R.T. 58 | sept. 59 | avr. 60 | jeu. 61 | dest. 62 | P.-D. G. 63 | ill. 64 | coll. 65 | encycl. 66 | mer. 67 | Desc. 68 | ven. 69 | P. 70 | lun. 71 | Inc. 72 | sam. 73 | D. 74 | append. 75 | Var. 76 | categ. 77 | janv. 78 | S.A. 79 | imm. 80 | U.S.A. 81 | mar. 82 | exempl. 83 | déc. 84 | ann. 85 | U. 86 | synth. 87 | dict. 88 | av. J.-C. 89 | W. 90 | Op. 91 | ap. J.-C. 92 | gouv. 93 | trav. publ. 94 | 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /priv/segments/it.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | N.B. 13 | div. 14 | a.C. 15 | fig. 16 | d.p.R. 17 | c.c.p. 18 | Cfr. 19 | vol. 20 | Geom. 21 | O.d.G. 22 | S.p.A. 23 | ver. 24 | N.d.A. 25 | dott. 26 | arch. 27 | d.C. 28 | N.d.T. 29 | rag. 30 | Sig. 31 | Mod. 32 | pag. 33 | dr. 34 | tav. 35 | N.d.E. 36 | DC. 37 | mitt. 38 | Ing. 39 | int. 40 | on. 41 | C.P. 42 | ag. 43 | L. 44 | U.S. 45 | S.M.A.R.T. 46 | p.i. 47 | tab. 48 | Ltd. 49 | Liv. 50 | D. 51 | U.S.A. 52 | sez. 53 | avv. 54 | S.A.R. 55 | all. 56 | p. 
57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /priv/segments/ja.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | [[\p{Line_Break=Ideographic}] [$CJ]] 17 | \p{Line_Break=Nonstarter} 18 | 19 | 20 | 21 | 22 | \p{Hiragana} 23 | ($Hiragana $FE*) 24 | [[\p{Ideographic}] [\u3005 \u3007 \u303B]] 25 | ($Ideographic $FE*) 26 | 27 | 28 | $Hiragana × $Hiragana 29 | $Ideographic × $Ideographic 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /priv/segments/pt.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | psicol. 13 | fig. 14 | compl. 15 | rep. 16 | cap. 17 | doc. 18 | fisiol. 19 | dipl. 20 | astron. 21 | port. 22 | eletrôn. 23 | geom. 24 | mov. 25 | ago. 26 | trad. 27 | arquit. 28 | dez. 29 | ed. 30 | apt. 31 | Exmo. 32 | col. 33 | ff. 34 | univ. 35 | res. 36 | R. 37 | transp. 38 | D.C 39 | l. 40 | des. 41 | fev. 42 | abr. 43 | liter. 44 | lat. 45 | Dir. 46 | cf. 47 | adm. 48 | fot. 49 | p.m. 50 | P.M. 51 | créd. 52 | jur. 53 | com. 54 | anat. 55 | dir. 56 | end. 57 | fís. 58 | E. 59 | Est. 60 | cont. 61 | matem. 62 | Drs. 63 | gên. 64 | neol. 65 | pág. 66 | índ. 67 | Ltda. 68 | Exma. 69 | esp. 70 | ingl. 71 | tecnol. 72 | Mar. 73 | símb. 74 | Pe. 75 | pal. 76 | filos. 77 | V.T. 78 | fasc. 79 | vs. 80 | mai. 81 | S.A. 82 | profa. 83 | N.Sra. 84 | r.s.v.p. 85 | cel. 86 | mat. 87 | abrev. 88 | out. 89 | long. 90 | aux. 91 | arit. 92 | aer. 93 | jul. 94 | lin. 95 | S. 96 | méd. 97 | odontol. 98 | org. 99 | A.C. 100 | jun. 101 | déb. 102 | Av. 103 | álg. 104 | sup. 105 | fl. 106 | odont. 107 | caps. 108 | relat. 109 | organiz. 110 | hist. 111 | Fr. 112 | Ilmo. 113 | fem. 114 | ap. 115 | Ltd. 116 | pol. 117 | séc. 118 | prof. 119 | cx. 120 | nov. 121 | quím. 122 | mús. 123 | agric. 
124 | mar. 125 | W.C. 126 | fr. 127 | cat. 128 | jan. 129 | pron. 130 | rel. 131 | autom. 132 | Sta. 133 | Dra. 134 | p. 135 | tel. 136 | div. 137 | p. ex. 138 | a.C. 139 | bras. 140 | Alm. 141 | Dr. 142 | comp. 143 | pq. 144 | arqueol. 145 | náut. 146 | biogr. 147 | f. 148 | círc. 149 | fac. 150 | d.C. 151 | apart. 152 | ex. 153 | Jr. 154 | set. 155 | tec. 156 | sociol. 157 | gram. 158 | ind. 159 | Ilma. 160 | vol. 161 | eng. 162 | rod. 163 | Ph.D. 164 | Dras. 165 | pp. 166 | elem. 167 | máq. 168 | cód. 169 | eletr. 170 | prod. 171 | ref. 172 | fil. 173 | a.m. 174 | A.M 175 | obs. 176 | N.T. 177 | contab. 178 | Sto. 179 | lit. 180 | educ. 181 | rementente 182 | desc. 183 | próx. 184 | 185 | 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /priv/segments/root.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | \p{Grapheme_Cluster_Break=CR} 18 | \p{Grapheme_Cluster_Break=LF} 19 | \p{Grapheme_Cluster_Break=Control} 20 | \p{Grapheme_Cluster_Break=Extend} 21 | \p{Grapheme_Cluster_Break=ZWJ} 22 | \p{Grapheme_Cluster_Break=Regional_Indicator} 23 | \p{Grapheme_Cluster_Break=Prepend} 24 | \p{Grapheme_Cluster_Break=SpacingMark} 25 | \p{Grapheme_Cluster_Break=L} 26 | \p{Grapheme_Cluster_Break=V} 27 | \p{Grapheme_Cluster_Break=T} 28 | \p{Grapheme_Cluster_Break=LV} 29 | \p{Grapheme_Cluster_Break=LVT} 30 | 31 | 32 | 33 | 34 | [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] 35 | \p{Indic_Conjunct_Break=Linker} 36 | \p{Indic_Conjunct_Break=Consonant} 37 | \p{Extended_Pictographic} 38 | [\p{Indic_Conjunct_Break=Linker}\p{Indic_Conjunct_Break=Extend}] 39 | 40 | 41 | 42 | 43 | 44 | $CR × $LF 45 | ( $Control | $CR | $LF ) ÷ 46 | ÷ ( $Control | $CR | $LF ) 47 | 48 | $L × ( $L | $V | $LV | $LVT ) 49 | ( $LV | $V ) × ( $V | $T ) 50 | ( $LVT | $T) × $T 51 | × ($Extend | $ZWJ) 52 | 53 | × $SpacingMark 54 | $Prepend × 55 | 
$LinkingConsonant $ExtCccZwj* $ConjunctLinker $ExtCccZwj* × $LinkingConsonant 56 | $ExtPict $Extend* $ZWJ × $ExtPict 57 | 58 | ^ ($RI $RI)* $RI × $RI 59 | [^$RI] ($RI $RI)* $RI × $RI 60 | 61 | 62 | 63 | 64 | 65 | 66 | \p{Line_Break=Ambiguous} 67 | \p{Line_Break=Aksara} 68 | \p{Line_Break=Alphabetic} 69 | \p{Line_Break=Aksara_Prebase} 70 | \p{Line_Break=Aksara_Start} 71 | \p{Line_Break=Break_Both} 72 | \p{Line_Break=Break_After} 73 | \p{Line_Break=Break_Before} 74 | \p{Line_Break=Mandatory_Break} 75 | \p{Line_Break=Contingent_Break} 76 | \p{Line_Break=Close_Punctuation} 77 | \p{Line_Break=CP} 78 | \p{Line_Break=Combining_Mark} 79 | \p{Line_Break=Carriage_Return} 80 | \p{Line_Break=Exclamation} 81 | \p{Line_Break=Glue} 82 | \p{Line_Break=H2} 83 | \p{Line_Break=H3} 84 | \p{Line_Break=HL} 85 | \p{Line_Break=Hyphen} 86 | \p{Line_Break=Ideographic} 87 | \p{Line_Break=Inseparable} 88 | \p{Line_Break=Infix_Numeric} 89 | \p{Line_Break=JL} 90 | \p{Line_Break=JT} 91 | \p{Line_Break=JV} 92 | \p{Line_Break=Line_Feed} 93 | \p{Line_Break=Next_Line} 94 | \p{Line_Break=Nonstarter} 95 | \p{Line_Break=Numeric} 96 | \p{Line_Break=Open_Punctuation} 97 | \p{Line_Break=Postfix_Numeric} 98 | \p{Line_Break=Prefix_Numeric} 99 | \p{Line_Break=Quotation} 100 | \p{Line_Break=Complex_Context} 101 | \p{Line_Break=Surrogate} 102 | \p{Line_Break=Space} 103 | \p{Line_Break=Break_Symbols} 104 | \p{Line_Break=Virama_Final} 105 | \p{Line_Break=Virama} 106 | \p{Line_Break=Word_Joiner} 107 | \p{Line_Break=Unknown} 108 | \p{Line_Break=ZWSpace} 109 | \p{Line_Break=Conditional_Japanese_Starter} 110 | \p{Line_Break=Regional_Indicator} 111 | \p{Line_Break=E_Base} 112 | \p{Line_Break=E_Modifier} 113 | \p{Line_Break=ZWJ} 114 | \p{Line_Break=ZWJ} 115 | [$QU & \p{gc=Pi}] 116 | [$QU & \p{gc=Pf}] 117 | [$QU - \p{gc=Pi}] 118 | [$QU - \p{gc=Pf}] 119 | [^\p{ea=F}\p{ea=W}\p{ea=H}] 120 | [$BA & $NotEastAsian] 121 | 122 | [\u2010] 123 | [$CP-[\p{ea=F}\p{ea=W}\p{ea=H}]] 124 | [$OP-[\p{ea=F}\p{ea=W}\p{ea=H}]] 125 | 
[\p{Extended_Pictographic}&\p{gc=Cn}] 126 | 127 | 128 | 129 | ^ 130 | (?!.) 131 | 132 | [$CM1 $ZWJ] 133 | 134 | 135 | 136 | [$AI $AL $SG $XX $SA] 137 | [$NS $CJ] 138 | 139 | 140 | 141 | $CM* 142 | 143 | [$SP $BK $CR $LF $NL $ZW] 144 | [^ $SP $BK $CR $LF $NL $ZW] 145 | [^ $SP $BA $HY $CM] 146 | [^ $BA $HY $CM] 147 | [^ $NU $CM] 148 | ($AI $X) 149 | ($AK $X) 150 | ($AL $X) 151 | ($AP $X) 152 | ($AS $X) 153 | ($B2 $X) 154 | ($BA $X) 155 | ($BB $X) 156 | ($CB $X) 157 | ($CL $X) 158 | ($CP $X) 159 | ($CM $X) 160 | ($EX $X) 161 | ($GL $X) 162 | ($H2 $X) 163 | ($H3 $X) 164 | ($HL $X) 165 | ($HY $X) 166 | ($ID $X) 167 | ($IN $X) 168 | ($IS $X) 169 | ($JL $X) 170 | ($JT $X) 171 | ($JV $X) 172 | ($NS $X) 173 | ($NU $X) 174 | ($OP $X) 175 | ($PO $X) 176 | ($PR $X) 177 | ($QU $X) 178 | ($SA $X) 179 | ($SG $X) 180 | ($SY $X) 181 | ($VF $X) 182 | ($VI $X) 183 | ($WJ $X) 184 | ($XX $X) 185 | ($RI $X) 186 | ($EB $X) 187 | ($EM $X) 188 | ($ZWJ $X) 189 | ($QU_Pi $X) 190 | ($QU_Pf $X) 191 | ($QUmPi $X) 192 | ($QUmPf $X) 193 | ( $NotEastAsian | [$NotEastAsian - $Spec1_] $X) 194 | (NonEastAsianBA $X) 195 | ($DottedCircle $X) 196 | ($Hyphen $X) 197 | ($CP30 $X) 198 | ($OP30 $X) 199 | 200 | 201 | ($AL | ^ $CM | (?<=$Spec1_) $CM) 202 | ( $NotEastAsian | ^ $CM | (?<=$Spec1_) $CM ) 203 | 204 | 205 | 206 | 207 | $BK ÷ 208 | 209 | $CR × $LF 210 | $CR ÷ 211 | $LF ÷ 212 | $NL ÷ 213 | 214 | × ( $BK | $CR | $LF | $NL ) 215 | 216 | × $SP 217 | × $ZW 218 | 219 | $ZW $SP* ÷ 220 | 221 | $ZWJ_O × 222 | 223 | 224 | $Spec2_ × $CM 225 | × $WJ 226 | $WJ × 227 | 228 | $GL × 229 | $Spec3a_ × $GL 230 | $Spec3b_ $CM+ × $GL 231 | ^ $CM+ × $GL 232 | 233 | × $EX 234 | × $CL 235 | × $CP 236 | × $SY 237 | 238 | $OP $SP* × 239 | 240 | 241 | 242 | ( $sot | $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW ) $QU_Pi $SP* × 243 | 244 | 245 | × $QU_Pf ( $SP | $GL | $WJ | $CL | $QU | $CP | $EX | $IS | $SY | $BK | $CR | $LF | $NL | $ZW | $eot ) 246 | 247 | $SP ÷ $IS $NU 248 | 249 | × $IS 250 | 251 | ($CL | $CP) $SP* × 
$NS 252 | 253 | $B2 $SP* × $B2 254 | 255 | $SP ÷ 256 | 257 | × $QUmPi 258 | $QUmPf × 259 | 260 | $NotEastAsian × $QU 261 | × $QU ( $NotEastAsian | $eot ) 262 | $QU × $NotEastAsian 263 | ( $sot | $NotEastAsian ) $QU × 264 | 265 | ÷ $CB 266 | $CB ÷ 267 | 268 | ( $sot | $BK | $CR | $LF | $NL | $SP | $ZW | $CB | $GL ) ( $HY | $Hyphen ) × $AL 269 | 270 | × $BA 271 | × $HY 272 | × $NS 273 | $BB × 274 | 275 | $HL ($HY | $NonEastAsianBA) × [^$HL] 276 | 277 | $SY × $HL 278 | 279 | × $IN 280 | 281 | ($AL | $HL) × $NU 282 | $NU × ($AL | $HL) 283 | 284 | $PR × ($ID | $EB | $EM) 285 | ($ID | $EB | $EM) × $PO 286 | 287 | ($PR | $PO) × ($AL | $HL) 288 | ($AL | $HL) × ($PR | $PO) 289 | 290 | $NU ( $SY | $IS )* $CL × $PO 291 | $NU ( $SY | $IS )* $CP × $PO 292 | $NU ( $SY | $IS )* $CL × $PR 293 | $NU ( $SY | $IS )* $CP × $PR 294 | $NU ( $SY | $IS )* × $PO 295 | $NU ( $SY | $IS )* × $PR 296 | $PO × $OP $NU 297 | $PO × $OP $IS $NU 298 | $PO × $NU 299 | $PR × $OP $NU 300 | $PR × $OP $IS $NU 301 | $PR × $NU 302 | $HY × $NU 303 | $IS × $NU 304 | $NU ( $SY | $IS )* × $NU 305 | 306 | $JL × $JL | $JV | $H2 | $H3 307 | $JV | $H2 × $JV | $JT 308 | $JT | $H3 × $JT 309 | 310 | $JL | $JV | $JT | $H2 | $H3 × $PO 311 | $PR × $JL | $JV | $JT | $H2 | $H3 312 | 313 | ($AL | $HL) × ($AL | $HL) 314 | 315 | $AP × ($AK | $DottedCircle | $AS) 316 | ($AK | $DottedCircle | $AS) × ($VF | $VI) 317 | ($AK | $DottedCircle | $AS) $VI × ($AK | $DottedCircle) 318 | ($AK | $DottedCircle | $AS) × ($AK | $DottedCircle | $AS) $VF 319 | 320 | $IS × ($AL | $HL) 321 | 322 | ($AL | $HL | $NU) × $OP30 323 | $CP30 × ($AL | $HL | $NU) 324 | 325 | $sot ($RI $RI)* $RI × $RI 326 | [^$RI] ($RI $RI)* $RI × $RI 327 | $RI ÷ $RI 328 | 329 | $EB × $EM 330 | $ExtPictUnassigned × $EM 331 | 332 | 333 | 334 | 335 | 336 | \p{Sentence_Break=CR} 337 | \p{Sentence_Break=LF} 338 | \p{Sentence_Break=Extend} 339 | \p{Sentence_Break=Format} 340 | \p{Sentence_Break=Sep} 341 | \p{Sentence_Break=Sp} 342 | \p{Sentence_Break=Lower} 343 | 
\p{Sentence_Break=Upper} 344 | \p{Sentence_Break=OLetter} 345 | \p{Sentence_Break=Numeric} 346 | \p{Sentence_Break=ATerm} 347 | \p{Sentence_Break=STerm} 348 | \p{Sentence_Break=Close} 349 | \p{Sentence_Break=SContinue} 350 | . 351 | 352 | 353 | [$Format $Extend] 354 | [^ $OLetter $Upper $Lower $Sep $CR $LF $STerm $ATerm] 355 | ($Sp $FE*) 356 | ($Lower $FE*) 357 | ($Upper $FE*) 358 | ($OLetter $FE*) 359 | ($Numeric $FE*) 360 | ($ATerm $FE*) 361 | ($STerm $FE*) 362 | ($Close $FE*) 363 | ($SContinue $FE*) 364 | 365 | ($Sep | $CR | $LF) 366 | ($STerm | $ATerm) 367 | 368 | 369 | 370 | 371 | 372 | $CR × $LF 373 | 374 | $ParaSep ÷ 375 | 376 | 377 | 378 | × [$Format $Extend] 379 | 380 | 381 | 382 | 383 | $ATerm × $Numeric 384 | ($Upper | $Lower) $ATerm × $Upper 385 | $ATerm $Close* $Sp* × $NotPreLower_* $Lower 386 | $SATerm $Close* $Sp* × ($SContinue | $SATerm) 387 | 388 | $SATerm $Close* × ( $Close | $Sp | $ParaSep ) 389 | 390 | $SATerm $Close* $Sp* × ( $Sp | $ParaSep ) 391 | $SATerm $Close* $Sp* $ParaSep? 
÷ 392 | 393 | × $Any 394 | 395 | 396 | 397 | 398 | 399 | \p{Word_Break=CR} 400 | \p{Word_Break=LF} 401 | \p{Word_Break=Newline} 402 | \p{Word_Break=Extend} 403 | 404 | [\p{Word_Break=Format}] 405 | \p{Word_Break=Katakana} 406 | \p{Word_Break=ALetter} 407 | \p{Word_Break=MidLetter} 408 | \p{Word_Break=MidNum} 409 | \p{Word_Break=MidNumLet} 410 | \p{Word_Break=Numeric} 411 | \p{Word_Break=ExtendNumLet} 412 | \p{Word_Break=Regional_Indicator} 413 | \p{Word_Break=Hebrew_Letter} 414 | \p{Word_Break=Double_Quote} 415 | \p{Word_Break=Single_Quote} 416 | \p{Word_Break=ZWJ} 417 | 418 | \p{Extended_Pictographic} 419 | \p{Word_Break=WSegSpace} 420 | 421 | ($ALetter | $Hebrew_Letter) 422 | ($MidNumLet | $Single_Quote) 423 | 424 | 425 | [$Format $Extend $ZWJ] 426 | [^ $Newline $CR $LF ] 427 | ($Katakana $FE*) 428 | ($ALetter $FE*) 429 | ($MidLetter $FE*) 430 | ($MidNum $FE*) 431 | ($MidNumLet $FE*) 432 | ($Numeric $FE*) 433 | ($ExtendNumLet $FE*) 434 | ($RI $FE*) 435 | ($Hebrew_Letter $FE*) 436 | ($Double_Quote $FE*) 437 | ($Single_Quote $FE*) 438 | ($AHLetter $FE*) 439 | ($MidNumLetQ $FE*) 440 | 441 | 442 | 443 | 444 | 445 | $CR × $LF 446 | 447 | ($Newline | $CR | $LF) ÷ 448 | ÷ ($Newline | $CR | $LF) 449 | 450 | $ZWJ × $ExtPict 451 | $WSegSpace × $WSegSpace 452 | 453 | 454 | 455 | $NotBreak_ × [$Format $Extend $ZWJ] 456 | 457 | 458 | $AHLetter × $AHLetter 459 | 460 | $AHLetter × ($MidLetter | $MidNumLetQ) $AHLetter 461 | $AHLetter ($MidLetter | $MidNumLetQ) × $AHLetter 462 | $Hebrew_Letter × $Single_Quote 463 | $Hebrew_Letter × $Double_Quote $Hebrew_Letter 464 | $Hebrew_Letter $Double_Quote × $Hebrew_Letter 465 | 466 | $Numeric × $Numeric 467 | $AHLetter × $Numeric 468 | $Numeric × $AHLetter 469 | 470 | $Numeric ($MidNum | $MidNumLetQ) × $Numeric 471 | $Numeric × ($MidNum | $MidNumLetQ) $Numeric 472 | 473 | $Katakana × $Katakana 474 | 475 | ($AHLetter | $Numeric | $Katakana | $ExtendNumLet) × $ExtendNumLet 476 | $ExtendNumLet × ($AHLetter | $Numeric | $Katakana) 477 | 478 | ^ 
($RI $RI)* $RI × $RI 479 | [^$RI] ($RI $RI)* $RI × $RI 480 | 481 | 482 | 483 | 484 | 485 | -------------------------------------------------------------------------------- /priv/segments/ru.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | руб. 13 | янв. 14 | до н. э. 15 | сент. 16 | тел. 17 | дек. 18 | февр. 19 | нояб. 20 | апр. 21 | н. э. 22 | окт. 23 | тыс. 24 | авг. 25 | проф. 26 | н.э. 27 | кв. 28 | ул. 29 | отд. 30 | 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /priv/segments/sv.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | \p{Word_Break=MidLetter} 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /priv/segments/zh.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | [[\p{Line_Break=Ideographic}] [$CJ]] 18 | \p{Line_Break=Nonstarter} 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /priv/segments/zh_Hant.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 |