├── .formatter.exs ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE.md ├── README.md ├── benchee ├── casing.exs ├── compare.exs └── next.exs ├── config └── config.exs ├── lib ├── tasks │ └── download_dictionaries.ex └── unicode │ ├── break.ex │ ├── case │ ├── folding.ex │ ├── greek_upper.ex │ └── mapping.ex │ ├── dictionary.ex │ ├── segment.ex │ ├── string.ex │ └── trie.ex ├── logo.png ├── mix.exs ├── mix.lock ├── mix └── myapp_backend.ex ├── priv ├── dictionaries │ └── .gitkeep └── segments │ ├── de.xml │ ├── el.xml │ ├── en.xml │ ├── en_US.xml │ ├── en_US_POSIX.xml │ ├── es.xml │ ├── fi.xml │ ├── fr.xml │ ├── it.xml │ ├── ja.xml │ ├── pt.xml │ ├── root.xml │ ├── ru.xml │ ├── sv.xml │ ├── zh.xml │ └── zh_Hant.xml ├── test ├── casing_test.exs ├── line_break_test.exs ├── segment_test.exs ├── sentence_break_test.exs ├── support │ ├── test_data │ │ ├── grapheme_break_test.txt │ │ ├── line_break_test.txt │ │ ├── sentence_break_test.txt │ │ └── word_break_test.txt │ └── test_data_parser.ex ├── test_helper.exs ├── unicode_string_test.exs └── word_break_test.exs ├── update_segment_data └── update_test_data /.formatter.exs: -------------------------------------------------------------------------------- 1 | # Used by "mix format" 2 | [ 3 | inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"] 4 | ] 5 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - main 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | env: 13 | MIX_ENV: test 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | include: 18 | - pair: 19 | elixir: 1.18.1-otp-27 20 | otp: 27.2 21 | lint: lint 22 | steps: 23 | - uses: actions/checkout@v3 24 | 25 | - uses: erlef/setup-beam@v1 26 | with: 27 | otp-version: ${{matrix.pair.otp}} 28 | 
elixir-version: ${{matrix.pair.elixir}} 29 | 30 | - uses: actions/cache@v3 31 | with: 32 | path: | 33 | deps 34 | _build 35 | key: ${{ runner.os }}-mix-${{matrix.pair.elixir}}-${{matrix.pair.otp}}-${{ hashFiles('**/mix.lock') }} 36 | 37 | - run: mix deps.get 38 | 39 | - run: mix deps --check-unused 40 | if: ${{ matrix.lint }} 41 | 42 | - run: mix deps.compile 43 | 44 | - run: mix compile --warnings-as-errors 45 | if: ${{ matrix.lint }} 46 | 47 | - run: mix dialyzer 48 | if: ${{ matrix.lint }} 49 | 50 | - run: mix unicode.string.download.dictionaries 51 | 52 | - run: mix test 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /_build 2 | /cover 3 | /deps 4 | /doc 5 | /references 6 | *.snapshot 7 | erl_crash.dump 8 | *.ez 9 | *.tar 10 | .DS_Store 11 | .iex.exs 12 | 13 | # The xml downloaded from Unicode 14 | /downloads 15 | 16 | # Generated erlang source 17 | /src/*.erl 18 | 19 | # asdf 20 | .tool-versions 21 | 22 | # Don't store the dictionaries 23 | /priv/dictionaries/*.txt 24 | 25 | # Mise 26 | mise.toml 27 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## Unicode String v1.7.0 4 | 5 | This is the changelog for Unicode String v1.7.0 released on March 29th. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 6 | 7 | ### Bug Fixes 8 | 9 | * Converts all compile-time regex compilation to runtime to be compatible with OTP 28. Performance implications are not yet known. 10 | 11 | ## Unicode String v1.6.0 12 | 13 | This is the changelog for Unicode String v1.6.0 released on March 17th, 2025. 
For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 14 | 15 | ### Bug Fixes 16 | 17 | * Fix word break detection when a `\p{word_break=extend}` codepoint is preceeded by a letter and followed by a letter. 18 | 19 | ### Enhancements 20 | 21 | * Updated to [CLDR 47](https://cldr.unicode.org/downloads/cldr-47) break rules and test data. 22 | 23 | ## Unicode String v1.5.0 24 | 25 | This is the changelog for Unicode String v1.5.0 released on January 1st, 2025. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 26 | 27 | ### Enhancements 28 | 29 | * Update to CLDR 46.1 segmentation data and tests. 30 | 31 | * Pass dialyzer with `:underspecs` flag set. 32 | 33 | ## Unicode String v1.4.1 34 | 35 | This is the changelog for Unicode String v1.4.1 released on March 14th, 2024. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 36 | 37 | ### Bug Fixes 38 | 39 | * Fix performance regressing in `Uncode.String.Break.next/4`. Added the script `bench/next.exs` to allow for regression testing. Thanks to @mntns for the report. Closes #6. 40 | 41 | ## Unicode String v1.4.0 42 | 43 | This is the changelog for Unicode String v1.4.0 released on March 10th, 2024. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 44 | 45 | ### Enhancements 46 | 47 | * Adds dictionary-based work breaking for Chinese (zh, zh-Hant, zh-Hans, zh-Hant-HK, yue, yue-Hans), Japanese (ja), Thai (th), Lao (lo), Khmer (km) and Burmese (my). These languages don't typically use whitespace to separate words so a dictionary lookup is more appropriate - although not perfect. The same dictionary is used for Chinese and Japanese. 
The dictionaries implemented are those used in the [CLDR](https://cldr.unicode.org) since they are under an open source license and also for consistency with [ICU](https://icu.unicode.org). Note that these dictionaries need to be downloaded with `mix unicode.string.download.dictionaries` prior to use. Each dictionary will be parsed and loaded into [persistent_term](https://www.erlang.org/doc/man/persistent_term) on demand. Each dictionary has a sizable memory footprint as measured by `:persistent_term.info/0`: 48 | 49 | | Dictionary | Memory Mb | 50 | | ----------- | ----------: | 51 | | Chinese | 104.8 | 52 | | Thai | 9.6 | 53 | | Lao | 11.4 | 54 | | Khmer | 38.8 | 55 | | Burmese | 23.1 | 56 | 57 | ## Unicode String v1.3.1 58 | 59 | This is the changelog for Unicode String v1.3.1 released on March 6th, 2024. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 60 | 61 | ### Bug Fixes 62 | 63 | * Fix `Unicode.String.split/2` and `Unicode.String.next/2` when the passing rule is `:no_break` rule. Thanks to @GregLMcDonald for the report. Closes #5. 64 | 65 | ## Unicode String v1.3.0 66 | 67 | This is the changelog for Unicode String v1.3.0 released on February 27th, 2024. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 68 | 69 | ### Bug Fixes 70 | 71 | * Fix case folding for codepoints that fold to themselves. 72 | 73 | ### Enhancements 74 | 75 | * Adds case mapping functions `Unicode.String.upcase/2`, `Unicode.String.downcase/2` and `Unicode/String.titlecase/2`. These functions implement the full [Unicode Casing algorithm](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf) including conditiional mappings. 
They are locale-aware and a locale can be specified as a string, atom or a [Cldr.LanguageTag](https://hexdocs.pm/ex_cldr/Cldr.LanguageTag.html) thereby providing basic integration between `unicode_string` and [ex_cldr](https://hex.pm/packages/ex_cldr). 76 | 77 | * Case folding always follows the `:full` path which allows mapping of single code points to multiple code points. There is no practical reason to implement the `:simple` path. As a result, the `type` parameter to `Unicode.String.Case.Folding.fold/2` is no longer required or supported. 78 | 79 | * Support an [ex_cldr](https://hex.pm/packages/ex_cldr) [Language Tag](https://hexdocs.pm/ex_cldr/Cldr.LanguageTag.html) as a parameter to `Unicode.String.Case.Folding.fold/2`. In fact any map that has a `:language` key with a value that is an [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) language code as a lower cased atom may be passed as a parameter. 80 | 81 | ## Unicode String v1.2.1 82 | 83 | This is the changelog for Unicode String v1.2.1 released on June 2nd, 2023. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 84 | 85 | ### Bug Fixes 86 | 87 | * Resolve segments dir at runtime, not compile time. Thanks to @crkent for the report. Closes #4. 88 | 89 | ## Unicode String v1.2.0 90 | 91 | This is the changelog for Unicode String v1.2.0 released on March 14th, 2023. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 92 | 93 | ### Enhancements 94 | 95 | * Adds `Unicode.String.stream/2` to support streaming graphemes, words, sentences and line breaks. 96 | 97 | ## Unicode String v1.1.0 98 | 99 | This is the changelog for Unicode String v1.1.0 released on September 21st, 2022. 
For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 100 | 101 | ### Enhancements 102 | 103 | * Updates the segmentation supplemental data (including locales) for CLDR. This adds the "sv" and "fi" locale data for sentence break suppressions. 104 | 105 | ## Unicode String v1.0.1 106 | 107 | This is the changelog for Unicode String v1.0.1 released on September 15th, 2021. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 108 | 109 | ### Bug Fixes 110 | 111 | * Woops, the priv/segments directory was not included in the build artifact 112 | 113 | ## Unicode String v1.0.0 114 | 115 | This is the changelog for Unicode String v1.0.0 released on September 14th, 2021. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 116 | 117 | ### Enhancements 118 | 119 | * Update to use [Unicode 14](https://unicode.org/versions/Unicode14.0.0) release data. 120 | 121 | ## Unicode String v0.3.0 122 | 123 | This is the changelog for Unicode String v0.3.0 released on October 11th, 2020. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 124 | 125 | ### Bug Fixes 126 | 127 | * Correct deps and docs to align with Elixir 1.11 and recent releases of `ex_unicode`. 128 | 129 | # Unicode String v0.2.0 130 | 131 | This is the changelog for Unicode String v0.2.0 released on July 12th, 2020. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 132 | 133 | ### Enhancements 134 | 135 | This release implements the Unicode break rules for graphemes, words, lines (word-wrapping) and sentences. 
136 | 137 | * Adds `Unicode.String.split/2` 138 | 139 | * Adds `Unicode.String.break?/2` 140 | 141 | * Adds `Unicode.String.break/2` 142 | 143 | * Adds `Unicode.String.splitter/2` 144 | 145 | * Adds `Unicode.String.next/2` 146 | 147 | # Unicode String v0.1.0 148 | 149 | This is the changelog for Unicode String v0.1.0 released on May 17th, 2020. For older changelogs please consult the release tag on [GitHub](https://github.com/elixir-unicode/unicode_string/tags) 150 | 151 | ### Enhancements 152 | 153 | * Initial release 154 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # License 2 | 3 | Copyright 2018-2023 Kip Cole 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in 6 | compliance with the License. You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software distributed under the License 11 | is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 12 | implied. See the License for the specific language governing permissions and limitations under the 13 | License. 
14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Unicode String 2 | 3 | ![Build status](https://github.com/elixir-unicode/unicode_string/actions/workflows/ci.yml/badge.svg) 4 | [![Hex.pm](https://img.shields.io/hexpm/v/unicode_string.svg)](https://hex.pm/packages/unicode_string) 5 | [![Hex.pm](https://img.shields.io/hexpm/dw/unicode_string.svg?)](https://hex.pm/packages/unicode_string) 6 | [![Hex.pm](https://img.shields.io/hexpm/l/unicode_string.svg)](https://hex.pm/packages/unicode_string) 7 | 8 | Adds functions supporting some string algorithms in the Unicode standard. For example: 9 | 10 | * The [Unicode Case Folding](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf) algorithm to provide case-independent equality checking irrespective of language or script with `Unicode.String.fold/2` and `Unicode.String.equals_ignoring_case?/2` 11 | 12 | * The [Unicode Code Mapping](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf) algorithm that implements locale-aware `Unicode.String.upcase/2`, `Unicode.String.downcase/2` and `Unicode.String.titlecase/2`. 13 | 14 | * The [Unicode Segmentation](https://unicode.org/reports/tr29/) algorithm to detect, break, split or stream strings into grapheme clusters, words and sentences. 15 | 16 | * The [Unicode Line Breaking](https://www.unicode.org/reports/tr14/) algorithm to determine line breaks (breaks meaning where word-wrapping would be acceptable). 17 | 18 | ## Installation 19 | 20 | The package can be installed by adding `:unicode_string` to your list of dependencies in `mix.exs`: 21 | 22 | ```elixir 23 | def deps do 24 | [ 25 | {:unicode_string, "~> 1.0"}, 26 | ... 27 | ] 28 | end 29 | ``` 30 | 31 | Then run `mix dep.get`. 
32 | 33 | > #### Word Break Dictionary Download {: .info} 34 | > 35 | > If you plan to perform word break segmentation on Chinese, Japanese, Lao, 36 | > Burmese, Thai or Khmer languages you will need to download the word break dictionaries 37 | > by running `mix unicode.string.download.dictionaries`. 38 | 39 | ## Casing 40 | 41 | ### Case Folding 42 | 43 | The [Unicode Case Folding](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf) algorithm defines how to perform case folding. This allows comparison of strings in a case-insensitive fashion. It does not define the means to compare ignoring diacritical marks (accents). Some examples follow, for details see: 44 | 45 | * `Unicode.String.fold/2` 46 | * `Unicode.String.equals_ignoring_case?/3` 47 | 48 | > #### Note {: .info} 49 | > 50 | > Although the folding algorithm commonly downcases characters, folding is not a general purpose downcasing process. It exists only to facilitate case insensitive string comparison. 51 | 52 | 53 | ```elixir 54 | iex> Unicode.String.equals_ignoring_case? "ABC", "abc" 55 | true 56 | 57 | iex> Unicode.String.equals_ignoring_case? "beißen", "beissen" 58 | true 59 | 60 | iex> Unicode.String.equals_ignoring_case? "grüßen", "grussen" 61 | false 62 | ``` 63 | 64 | ### Case Mapping 65 | 66 | The [Unicode Case Mapping](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf) algorithm defines the process and data to transform text into upper case, lower case or title case. Since most languages are not bicameral, characters which have no case mapping remain unchanged. 67 | 68 | Three case mapping functions are provided: 69 | 70 | * `Unicode.String.upcase/2` which will convert text to upper case characters. 71 | * `Unicode.String.downcase/2` which will convert text to lower case characters. 72 | * `Unicode.String.titlecase/2` which will convert text to title case. 
Title case means that the first character of each word is set to upper case
Some examples follow, for details see: 111 | 112 | * `Unicode.String.split/2` 113 | * `Unicode.String.break?/2` 114 | * `Unicode.String.break/2` 115 | * `Unicode.String.splitter/2` 116 | * `Unicode.String.next/2` 117 | * `Unicode.String.stream/2` 118 | 119 | ```elixir 120 | # Split text at a word boundary. 121 | iex> Unicode.String.split "This is a sentence. And another.", break: :word 122 | ["This", " ", "is", " ", "a", " ", "sentence", ".", " ", "And", " ", "another", "."] 123 | 124 | # Split text at a word boundary but omit any whitespace 125 | iex> Unicode.String.split "This is a sentence. And another.", break: :word, trim: true 126 | ["This", "is", "a", "sentence", ".", "And", "another", "."] 127 | 128 | # Split text at a sentence boundary. 129 | iex> Unicode.String.split "This is a sentence. And another.", break: :sentence 130 | ["This is a sentence. ", "And another."] 131 | 132 | # By default, common abbreviations are suppressed (ie 133 | # they do not cause a break) 134 | iex> Unicode.String.split "No, I don't have a Ph.D. but I don't think it matters.", break: :word, trim: true 135 | ["No", ",", "I", "don't", "have", "a", "Ph.D", ".", "but", "I", "don't", 136 | "think", "it", "matters", "."] 137 | 138 | iex> Unicode.String.split "No, I don't have a Ph.D. but I don't think it matters.", break: :sentence, trim: true 139 | ["No, I don't have a Ph.D. but I don't think it matters."] 140 | 141 | # Sentence Break suppressions are locale sensitive. 142 | iex> Unicode.String.Segment.known_locales 143 | ["de", "el", "en", "en-US", "en-US-POSIX", "es", "fi", "fr", "it", "ja", "pt", 144 | "root", "ru", "sv", "zh", "zh-Hant"] 145 | 146 | iex> Unicode.String.split "Non, c'est M. Dubois.", break: :sentence, trim: true, locale: "fr" 147 | ["Non, c'est M. Dubois."] 148 | 149 | # Note that break: :line does NOT mean split the string 150 | # at newlines. It splits the string where a line break would be 151 | # acceptable. 
This is very useful for calculating where 152 | # to perform word-wrap on some text. 153 | iex> Unicode.String.split "This is a sentence. And another.", break: :line 154 | ["This ", "is ", "a ", "sentence. ", "And ", "another."] 155 | ``` 156 | 157 | ### Dictionary-based word segmentation 158 | 159 | Some languages, commonly east asian languages, don't typically use whitespace to separate words so a dictionary lookup is more appropriate - although not perfect. 160 | 161 | This implementation supports dictionary-based word breaking for: 162 | 163 | * Chinese (`zh`, `zh-Hant`, `zh-Hans`, `zh-Hant-HK`, `yue`, `yue-Hans`) locales, 164 | * Japanese (`ja`) using the same dictionary as for Chinese, 165 | * Thai (`th`), 166 | * Lao (`lo`), 167 | * Khmer (`km`) and 168 | * Burmese (`my`). 169 | 170 | The dictionaries implemented are those used in the [CLDR](https://cldr.unicode.org) since they are under an open source license and also for consistency with [ICU](https://icu.unicode.org). 171 | 172 | Note that these dictionaries need to be downloaded with `mix unicode.string.download.dictionaries` prior to use. Each dictionary will be parsed and loaded into [persistent_term](https://www.erlang.org/doc/man/persistent_term) on demand. Note that each dictionary has a sizable memory footprint as measured by `:persistent_term.info/0`: 173 | 174 | | Dictionary | Memory Mb | 175 | | ----------- | ----------: | 176 | | Chinese | 104.8 | 177 | | Thai | 9.6 | 178 | | Lao | 11.4 | 179 | | Khmer | 38.8 | 180 | | Burmese | 23.1 | 181 | 182 | ## Segment Streaming 183 | 184 | Segmentation can also be streamed using `Unicode.String.stream/2`. For large strings this may improve memory usage since the intermediate segments will be garbage collected when they fall out of scope. 
185 | 186 | ```elixir 187 | iex> Enum.to_list Unicode.String.stream("this is a list of words", trim: true) ["this", "is", "a", "list", "of", "words"] 188 | 189 | iex> Enum.map Unicode.String.stream("this is a list of words", trim: true), 190 | ...> fn word -> %{word: word, length: String.length(word)} end 191 | [ 192 | %{length: 4, word: "this"}, 193 | %{length: 2, word: "is"}, 194 | %{length: 1, word: "a"}, 195 | %{length: 3, word: "list"}, 196 | %{length: 2, word: "of"}, 197 | %{length: 5, word: "words"} 198 | ] 199 | ``` 200 | 201 | ## References 202 | 203 | * Unicode maintains a [break testing utility](https://util.unicode.org/UnicodeJsps/breaks.jsp). 204 | 205 | -------------------------------------------------------------------------------- /benchee/casing.exs: -------------------------------------------------------------------------------- 1 | s = "THIS IS A STRING WE ARE GOING TO DOWNCASE WITH CHARACTERS THAT HAVE MAPPING AND THOSE THAT DONT 1234^&*&^%$)(*)}" 2 | 3 | Benchee.run(%{ 4 | "Unicode.String.Case.Mapping.downcase" => 5 | fn -> Unicode.String.Case.Mapping.downcase(s) end, 6 | "String.downcase default mode" => 7 | fn -> String.downcase(s) end, 8 | "String.downcase ASCII mode" => 9 | fn -> String.downcase(s, :ascii) end, 10 | }) 11 | 12 | -------------------------------------------------------------------------------- /benchee/compare.exs: -------------------------------------------------------------------------------- 1 | s1 = "ABC" 2 | s2 = "abc" 3 | 4 | Benchee.run(%{ 5 | "Unicode.String.equal_ignoring_case?" 
=> 6 | fn -> Unicode.String.equals_ignoring_case?(s1, s2) end, 7 | "String.==" => 8 | fn -> s1 == s2 end, 9 | "String.downcase compare" => 10 | fn -> String.downcase(s1) == String.downcase(s2) end, 11 | }) -------------------------------------------------------------------------------- /benchee/next.exs: -------------------------------------------------------------------------------- 1 | Benchee.run(%{ 2 | "Unicode.String.Break.next/4" => 3 | fn -> Unicode.String.Break.next("test123 ", "root", :word, []) end, 4 | }) -------------------------------------------------------------------------------- /config/config.exs: -------------------------------------------------------------------------------- 1 | import Config 2 | 3 | config :ex_cldr, 4 | default_backend: MyApp.Cldr 5 | -------------------------------------------------------------------------------- /lib/tasks/download_dictionaries.ex: -------------------------------------------------------------------------------- 1 | defmodule Mix.Tasks.Unicode.String.Download.Dictionaries do 2 | @moduledoc """ 3 | Downloads the ICU (Unicode) dictionaries supporting word breaks 4 | for Chinese, Japanese, Thai, Burmese and Laotion languages. 
5 | 6 | """ 7 | 8 | use Mix.Task 9 | require Logger 10 | 11 | @shortdoc "Download Unicode ICU Word Break Dictionaries" 12 | 13 | @root_url "https://raw.githubusercontent.com/unicode-org/icu/main/icu4c/source/data/brkitr/dictionaries" 14 | 15 | @unicode_unsafe_https "UNICODE_UNSAFE_HTTPS" 16 | @unicode_default_timeout "120000" 17 | @unicode_default_connection_timeout "60000" 18 | 19 | @app_name :unicode_string 20 | 21 | @doc false 22 | def run(_) do 23 | Application.ensure_all_started(:inets) 24 | Application.ensure_all_started(:ssl) 25 | 26 | Enum.each(required_files(), &download_file/1) 27 | end 28 | 29 | defp required_files do 30 | [ 31 | {Path.join(root_url(), "/thaidict.txt"), data_path("thai.txt")}, 32 | {Path.join(root_url(), "/laodict.txt"), data_path("lao.txt")}, 33 | {Path.join(root_url(), "/khmerdict.txt"), data_path("khmer.txt")}, 34 | {Path.join(root_url(), "/cjdict.txt"), data_path("chinese_japanese.txt")}, 35 | {Path.join(root_url(), "/burmesedict.txt"), data_path("burmese.txt")} 36 | ] 37 | end 38 | 39 | def root_url do 40 | @root_url 41 | end 42 | 43 | defp download_file({url, destination}) do 44 | case get(url) do 45 | {:ok, body} -> 46 | File.write!(destination, body) 47 | Logger.info("Downloaded #{inspect(url)} to #{inspect(destination)}") 48 | {:ok, destination} 49 | 50 | error -> 51 | error 52 | end 53 | end 54 | 55 | @doc """ 56 | Securely download https content from 57 | a URL. 58 | 59 | This function uses the built-in `:httpc` 60 | client but enables certificate verification 61 | which is not enabled by `:httpc` by default. 62 | 63 | See also https://erlef.github.io/security-wg/secure_coding_and_deployment_hardening/ssl 64 | 65 | ### Arguments 66 | 67 | * `url` is a binary URL or a `{url, list_of_headers}` tuple. If 68 | provided the headers are a list of `{'header_name', 'header_value'}` 69 | tuples. Note that the name and value are both charlists, not 70 | strings. 71 | 72 | * `options` is a keyword list of options. 
73 | 74 | ### Options 75 | 76 | * `:verify_peer` is a boolean value indicating 77 | if peer verification should be done for this request. 78 | The default is `true` in which case the default 79 | `:ssl` options follow the [erlef guidelines](https://erlef.github.io/security-wg/secure_coding_and_deployment_hardening/ssl) 80 | noted above. 81 | 82 | * `:timeout` is the number of milliseconds available 83 | for the request to complete. The default is 84 | #{inspect @unicode_default_timeout}. This option may also be 85 | set with the `CLDR_HTTP_TIMEOUT` environment variable. 86 | 87 | * `:connection_timeout` is the number of milliseconds 88 | available for the a connection to be estabklished to 89 | the remote host. The default is #{inspect @unicode_default_connection_timeout}. 90 | This option may also be set with the 91 | `CLDR_HTTP_CONNECTION_TIMEOUT` environment variable. 92 | 93 | ### Returns 94 | 95 | * `{:ok, body}` if the return is successful. 96 | 97 | * `{:not_modified, headers}` if the request would result in 98 | returning the same results as one matching an etag. 99 | 100 | * `{:error, error}` if the download is 101 | unsuccessful. An error will also be logged 102 | in these cases. 103 | 104 | ### Unsafe HTTPS 105 | 106 | If the environment variable `CLDR_UNSAFE_HTTPS` is 107 | set to anything other than `FALSE`, `false`, `nil` 108 | or `NIL` then no peer verification of certificates 109 | is performed. Setting this variable is not recommended 110 | but may be required is where peer verification for 111 | unidentified reasons. Please [open an issue](https://github.com/elixir-cldr/cldr/issues) 112 | if this occurs. 113 | 114 | ### Certificate stores 115 | 116 | In order to keep dependencies to a minimum, 117 | `get/1` attempts to locate an already installed 118 | certificate store. It will try to locate a 119 | store in the following order which is intended 120 | to satisfy most host systems. 
The certificate 121 | store is expected to be a path name on the 122 | host system. 123 | 124 | ```elixir 125 | # A certificate store configured by the 126 | # developer 127 | Application.get_env(:ex_cldr, :cacertfile) 128 | 129 | # Populated if hex package `CAStore` is configured 130 | CAStore.file_path() 131 | 132 | # Populated if hex package `certfi` is configured 133 | :certifi.cacertfile() 134 | 135 | # Debian/Ubuntu/Gentoo etc. 136 | "/etc/ssl/certs/ca-certificates.crt", 137 | 138 | # Fedora/RHEL 6 139 | "/etc/pki/tls/certs/ca-bundle.crt", 140 | 141 | # OpenSUSE 142 | "/etc/ssl/ca-bundle.pem", 143 | 144 | # OpenELEC 145 | "/etc/pki/tls/cacert.pem", 146 | 147 | # CentOS/RHEL 7 148 | "/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem", 149 | 150 | # Open SSL on MacOS 151 | "/usr/local/etc/openssl/cert.pem", 152 | 153 | # MacOS & Alpine Linux 154 | "/etc/ssl/cert.pem" 155 | ``` 156 | 157 | """ 158 | @spec get(String.t | {String.t, list()}, options :: Keyword.t) :: 159 | {:ok, binary} | {:not_modified, any()} | {:error, any} 160 | 161 | def get(url, options \\ []) 162 | 163 | def get(url, options) when is_binary(url) and is_list(options) do 164 | case get_with_headers(url, options) do 165 | {:ok, _headers, body} -> {:ok, body} 166 | other -> other 167 | end 168 | end 169 | 170 | def get({url, headers}, options) when is_binary(url) and is_list(headers) and is_list(options) do 171 | case get_with_headers({url, headers}, options) do 172 | {:ok, _headers, body} -> {:ok, body} 173 | other -> other 174 | end 175 | end 176 | 177 | @doc """ 178 | Securely download https content from 179 | a URL. 180 | 181 | This function uses the built-in `:httpc` 182 | client but enables certificate verification 183 | which is not enabled by `:httc` by default. 184 | 185 | See also https://erlef.github.io/security-wg/secure_coding_and_deployment_hardening/ssl 186 | 187 | ### Arguments 188 | 189 | * `url` is a binary URL or a `{url, list_of_headers}` tuple. 
If 190 | provided the headers are a list of `{'header_name', 'header_value'}` 191 | tuples. Note that the name and value are both charlists, not 192 | strings. 193 | 194 | * `options` is a keyword list of options. 195 | 196 | ### Options 197 | 198 | * `:verify_peer` is a boolean value indicating 199 | if peer verification should be done for this request. 200 | The default is `true` in which case the default 201 | `:ssl` options follow the [erlef guidelines](https://erlef.github.io/security-wg/secure_coding_and_deployment_hardening/ssl) 202 | noted above. 203 | 204 | * `:timeout` is the number of milliseconds available 205 | for the request to complete. The default is 206 | #{inspect @unicode_default_timeout}. This option may also be 207 | set with the `CLDR_HTTP_TIMEOUT` environment variable. 208 | 209 | * `:connection_timeout` is the number of milliseconds 210 | available for the a connection to be estabklished to 211 | the remote host. The default is #{inspect @unicode_default_connection_timeout}. 212 | This option may also be set with the 213 | `CLDR_HTTP_CONNECTION_TIMEOUT` environment variable. 214 | 215 | * `:https_proxy` is the URL of an https proxy to be used. The 216 | default is `nil`. 217 | 218 | ### Returns 219 | 220 | * `{:ok, body, headers}` if the return is successful. 221 | 222 | * `{:not_modified, headers}` if the request would result in 223 | returning the same results as one matching an etag. 224 | 225 | * `{:error, error}` if the download is 226 | unsuccessful. An error will also be logged 227 | in these cases. 228 | 229 | ### Unsafe HTTPS 230 | 231 | If the environment variable `CLDR_UNSAFE_HTTPS` is 232 | set to anything other than `FALSE`, `false`, `nil` 233 | or `NIL` then no peer verification of certificates 234 | is performed. Setting this variable is not recommended 235 | but may be required is where peer verification for 236 | unidentified reasons. Please [open an issue](https://github.com/elixir-cldr/cldr/issues) 237 | if this occurs. 
238 | 239 | ### Https Proxy 240 | 241 | `Cldr.Http.get/2` will look for a proxy URL in the following 242 | locations in the order presented: 243 | 244 | * `options[:https_proxy]` 245 | * `ex_cldr` compile-time configuration under the 246 | key `:ex_cldr[:https_proxy]` 247 | * The environment variable `HTTPS_PROXY` 248 | * The environment variable `https_proxy` 249 | 250 | ### Certificate stores 251 | 252 | In order to keep dependencies to a minimum, 253 | `get/1` attempts to locate an already installed 254 | certificate store. It will try to locate a 255 | store in the following order which is intended 256 | to satisfy most host systems. The certificate 257 | store is expected to be a path name on the 258 | host system. 259 | 260 | ```elixir 261 | # A certificate store configured by the 262 | # developer 263 | Application.get_env(:ex_cldr, :cacertfile) 264 | 265 | # Populated if hex package `CAStore` is configured 266 | CAStore.file_path() 267 | 268 | # Populated if hex package `certifi` is configured 269 | :certifi.cacertfile() 270 | 271 | # Debian/Ubuntu/Gentoo etc. 
272 | "/etc/ssl/certs/ca-certificates.crt", 273 | 274 | # Fedora/RHEL 6 275 | "/etc/pki/tls/certs/ca-bundle.crt", 276 | 277 | # OpenSUSE 278 | "/etc/ssl/ca-bundle.pem", 279 | 280 | # OpenELEC 281 | "/etc/pki/tls/cacert.pem", 282 | 283 | # CentOS/RHEL 7 284 | "/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem", 285 | 286 | # Open SSL on MacOS 287 | "/usr/local/etc/openssl/cert.pem", 288 | 289 | # MacOS & Alpine Linux 290 | "/etc/ssl/cert.pem" 291 | ``` 292 | 293 | """ 294 | @doc since: "2.21.0" 295 | 296 | @spec get_with_headers(String.t | {String.t, list()}, options :: Keyword.t) :: 297 | {:ok, list(), binary} | {:not_modified, any()} | {:error, any} 298 | 299 | def get_with_headers(request, options \\ []) 300 | 301 | def get_with_headers(url, options) when is_binary(url) do 302 | get_with_headers({url, []}, options) 303 | end 304 | 305 | def get_with_headers({url, headers}, options) when is_binary(url) and is_list(headers) and is_list(options) do 306 | require Logger 307 | 308 | hostname = String.to_charlist(URI.parse(url).host) 309 | url = String.to_charlist(url) 310 | http_options = http_opts(hostname, options) 311 | https_proxy = https_proxy(options) 312 | 313 | if https_proxy do 314 | case URI.parse(https_proxy) do 315 | %{host: host, port: port} when is_binary(host) and is_integer(port) -> 316 | :httpc.set_options([{:https_proxy, {{String.to_charlist(host), port}, []}}]) 317 | _other -> 318 | Logger.bare_log(:warning, "https_proxy was set to an invalid value. Found #{inspect https_proxy}.") 319 | end 320 | end 321 | 322 | case :httpc.request(:get, {url, headers}, http_options, []) do 323 | {:ok, {{_version, 200, _}, headers, body}} -> 324 | {:ok, headers, body} 325 | 326 | {:ok, {{_version, 304, _}, headers, _body}} -> 327 | {:not_modified, headers} 328 | 329 | {_, {{_version, code, message}, _headers, _body}} -> 330 | Logger.bare_log( 331 | :error, 332 | "Failed to download #{inspect url}. 
" <> 333 | "HTTP Error: (#{code}) #{inspect(message)}" 334 | ) 335 | 336 | {:error, code} 337 | 338 | {:error, {:failed_connect, [{_, {host, _port}}, {_, _, sys_message}]}} -> 339 | if sys_message == :timeout do 340 | Logger.bare_log( 341 | :error, 342 | "Timeout connecting to #{inspect(host)} to download #{inspect url}. " <> 343 | "Connection time exceeded #{http_options[:connect_timeout]}ms." 344 | ) 345 | 346 | {:error, :connection_timeout} 347 | else 348 | Logger.bare_log( 349 | :error, 350 | "Failed to connect to #{inspect(host)} to download #{inspect url}" 351 | ) 352 | 353 | {:error, sys_message} 354 | end 355 | 356 | {:error, {other}} -> 357 | Logger.bare_log( 358 | :error, 359 | "Failed to download #{inspect url}. Error #{inspect other}" 360 | ) 361 | 362 | {:error, other} 363 | 364 | {:error, :timeout} -> 365 | Logger.bare_log( 366 | :error, 367 | "Timeout downloading from #{inspect url}. " <> 368 | "Request exceeded #{http_options[:timeout]}ms." 369 | ) 370 | {:error, :timeout} 371 | end 372 | end 373 | 374 | @static_certificate_locations [ 375 | # Debian/Ubuntu/Gentoo etc. 
376 | "/etc/ssl/certs/ca-certificates.crt", 377 | 378 | # Fedora/RHEL 6 379 | "/etc/pki/tls/certs/ca-bundle.crt", 380 | 381 | # OpenSUSE 382 | "/etc/ssl/ca-bundle.pem", 383 | 384 | # OpenELEC 385 | "/etc/pki/tls/cacert.pem", 386 | 387 | # CentOS/RHEL 7 388 | "/etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem", 389 | 390 | # Open SSL on MacOS 391 | "/usr/local/etc/openssl/cert.pem", 392 | 393 | # MacOS & Alpine Linux 394 | "/etc/ssl/cert.pem" 395 | ] 396 | 397 | defp dynamic_certificate_locations do 398 | [ 399 | # Configured cacertfile 400 | Application.get_env(:ex_cldr, :cacertfile), 401 | 402 | # Populated if hex package CAStore is configured 403 | if(Code.ensure_loaded?(CAStore), do: apply(CAStore, :file_path, [])), 404 | 405 | # Populated if hex package certfi is configured 406 | if(Code.ensure_loaded?(:certifi), do: apply(:certifi, :cacertfile, []) |> List.to_string()) 407 | ] 408 | |> Enum.reject(&is_nil/1) 409 | end 410 | 411 | def certificate_locations() do 412 | dynamic_certificate_locations() ++ @static_certificate_locations 413 | end 414 | 415 | @doc false 416 | defp certificate_store do 417 | certificate_locations() 418 | |> Enum.find(&File.exists?/1) 419 | |> raise_if_no_cacertfile! 420 | |> :erlang.binary_to_list() 421 | end 422 | 423 | defp raise_if_no_cacertfile!(nil) do 424 | raise RuntimeError, """ 425 | No certificate trust store was found. 426 | Tried looking for: #{inspect(certificate_locations())} 427 | 428 | A certificate trust store is required in 429 | order to download locales for your configuration. 430 | 431 | Since ex_cldr could not detect a system 432 | installed certificate trust store one of the 433 | following actions may be taken: 434 | 435 | 1. Install the hex package `castore`. It will 436 | be automatically detected after recompilation. 437 | 438 | 2. Install the hex package `certifi`. It will 439 | be automatically detected after recomilation. 440 | 441 | 3. 
Specify the location of a certificate trust store 442 | by configuring it in `config.exs` or `runtime.exs`: 443 | 444 | config :ex_cldr, 445 | cacertfile: "/path/to/cacertfile", 446 | ... 447 | 448 | """ 449 | end 450 | 451 | defp raise_if_no_cacertfile!(file) do 452 | file 453 | end 454 | 455 | # Builds the :httpc option list. Env var names must match the ones the 456 | # @doc for get/2 promises (CLDR_*), not the TZWORLD_* names. 457 | defp http_opts(hostname, options) do 458 | default_timeout = 459 | "CLDR_HTTP_TIMEOUT" 460 | |> System.get_env(@unicode_default_timeout) 461 | |> String.to_integer() 462 | 463 | default_connection_timeout = 464 | "CLDR_HTTP_CONNECTION_TIMEOUT" 465 | |> System.get_env(@unicode_default_connection_timeout) 466 | |> String.to_integer() 467 | 468 | verify_peer? = Keyword.get(options, :verify_peer, true) 469 | ssl_options = https_ssl_opts(hostname, verify_peer?) 470 | timeout = Keyword.get(options, :timeout, default_timeout) 471 | connection_timeout = Keyword.get(options, :connection_timeout, default_connection_timeout) 472 | 473 | [timeout: timeout, connect_timeout: connection_timeout, ssl: ssl_options] 474 | end 475 | 476 | @doc false 477 | def user_agent do 478 | "erlang httpc/unicode OTP version #{otp_version()}" 479 | |> String.to_charlist() 480 | end 481 | 482 | defp https_ssl_opts(hostname, verify_peer?) do 483 | if secure_ssl?() and verify_peer? 
do 482 | [ 483 | verify: :verify_peer, 484 | cacertfile: certificate_store(), 485 | depth: 4, 486 | ciphers: preferred_ciphers(), 487 | versions: protocol_versions(), 488 | eccs: preferred_eccs(), 489 | reuse_sessions: true, 490 | server_name_indication: hostname, 491 | secure_renegotiate: true, 492 | customize_hostname_check: [ 493 | match_fun: :public_key.pkix_verify_hostname_match_fun(:https) 494 | ] 495 | ] 496 | else 497 | # Peer verification disabled: no cacertfile/hostname checks are applied. 498 | [ 499 | verify: :verify_none, 500 | server_name_indication: hostname, 501 | secure_renegotiate: true, 502 | reuse_sessions: true, 503 | versions: protocol_versions(), 504 | ciphers: preferred_ciphers() 505 | ] 506 | end 507 | end 508 | 509 | defp preferred_ciphers do 510 | preferred_ciphers = 511 | [ 512 | # Cipher suites (TLS 1.3): TLS_AES_128_GCM_SHA256:TLS_AES_256_GCM_SHA384:TLS_CHACHA20_POLY1305_SHA256 513 | %{cipher: :aes_128_gcm, key_exchange: :any, mac: :aead, prf: :sha256}, 514 | %{cipher: :aes_256_gcm, key_exchange: :any, mac: :aead, prf: :sha384}, 515 | %{cipher: :chacha20_poly1305, key_exchange: :any, mac: :aead, prf: :sha256}, 516 | 517 | # Cipher suites (TLS 1.2): ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256: 518 | # ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:ECDHE-ECDSA-CHACHA20-POLY1305: 519 | # ECDHE-RSA-CHACHA20-POLY1305:DHE-RSA-AES128-GCM-SHA256:DHE-RSA-AES256-GCM-SHA384 520 | %{cipher: :aes_128_gcm, key_exchange: :ecdhe_ecdsa, mac: :aead, prf: :sha256}, 521 | %{cipher: :aes_128_gcm, key_exchange: :ecdhe_rsa, mac: :aead, prf: :sha256}, 522 | %{cipher: :aes_256_gcm, key_exchange: :ecdh_ecdsa, mac: :aead, prf: :sha384}, 523 | %{cipher: :aes_256_gcm, key_exchange: :ecdh_rsa, mac: :aead, prf: :sha384}, 524 | %{cipher: :chacha20_poly1305, key_exchange: :ecdhe_ecdsa, mac: :aead, prf: :sha256}, 525 | %{cipher: :chacha20_poly1305, key_exchange: :ecdhe_rsa, mac: :aead, prf: :sha256}, 526 | %{cipher: :aes_128_gcm, key_exchange: :dhe_rsa, mac: :aead, prf: :sha256}, 527 
| %{cipher: :aes_256_gcm, key_exchange: :dhe_rsa, mac: :aead, prf: :sha384} 528 | ] 529 | 530 | :ssl.filter_cipher_suites(preferred_ciphers, []) 531 | end 532 | 533 | defp protocol_versions do 534 | if otp_version() < 25 do 535 | [:"tlsv1.2"] 536 | else 537 | [:"tlsv1.2", :"tlsv1.3"] 538 | end 539 | end 540 | 541 | defp preferred_eccs do 542 | # TLS curves: X25519, prime256v1, secp384r1 543 | preferred_eccs = [:secp256r1, :secp384r1] 544 | :ssl.eccs() -- (:ssl.eccs() -- preferred_eccs) 545 | end 546 | 547 | defp secure_ssl? do 548 | case String.upcase(System.get_env(@unicode_unsafe_https, "TRUE")) do 549 | "FALSE" -> false 550 | "NIL" -> false 551 | _other -> true 552 | end 553 | end 554 | 555 | defp https_proxy(options) do 556 | options[:https_proxy] || 557 | Application.get_env(:unicode, :https_proxy) || 558 | System.get_env("HTTPS_PROXY") || 559 | System.get_env("https_proxy") 560 | end 561 | 562 | def otp_version do 563 | :erlang.system_info(:otp_release) |> List.to_integer 564 | end 565 | 566 | def data_path(filename) do 567 | priv_dir = :code.priv_dir(@app_name) |> to_string() 568 | Path.join(priv_dir, ["dictionaries/", filename]) 569 | end 570 | end 571 | 572 | -------------------------------------------------------------------------------- /lib/unicode/break.ex: -------------------------------------------------------------------------------- 1 | defmodule Unicode.String.Break do 2 | @moduledoc """ 3 | Implements the Unicode break algorithm for words 4 | and lines. 
5 | 6 | """ 7 | 8 | alias Unicode.String.Segment 9 | alias Unicode.String.Dictionary 10 | 11 | @dictionary_locales Dictionary.known_dictionary_locales() 12 | 13 | @break_map %{ 14 | grapheme: :grapheme_cluster_break, 15 | word: :word_break, 16 | sentence: :sentence_break, 17 | line: :line_break, 18 | graphemes: :grapheme_cluster_break, 19 | grapheme_cluster: :grapheme_cluster_break, 20 | words: :word_break, 21 | sentences: :sentence_break, 22 | lines: :line_break 23 | } 24 | 25 | @break_keys Map.keys(@break_map) 26 | 27 | @doc false 28 | def break(string, locale, break, options) when break in @break_keys do 29 | break_at(string, locale, Map.fetch!(@break_map, break), options) 30 | end 31 | 32 | @doc false 33 | def break_at("", _locale, _segment_type, _options) do 34 | {:no_break, {"", {"", ""}}} 35 | end 36 | 37 | def break_at(string, locale, segment_type, options) when is_binary(string) do 38 | break_at({"", string}, locale, segment_type, options) 39 | end 40 | 41 | def break_at({"", string_after}, _locale, _segment_type, _options) do 42 | {:break, {"", {"", string_after}}} 43 | end 44 | 45 | def break_at({string_before, string_after}, locale, segment_type, options) do 46 | suppress? = Keyword.get(options, :suppressions, true) 47 | {:ok, rules} = rules(locale, segment_type, suppress?) 
48 | 49 | {string_before, string_after} 50 | |> Segment.evaluate_rules(rules) 51 | end 52 | 53 | @doc false 54 | def split(string, locale, break, options) when break in @break_keys do 55 | case next(string, locale, break, options) do 56 | {fore, aft} -> 57 | [fore | split(aft, locale, break, options)] 58 | 59 | nil -> 60 | [] 61 | end 62 | end 63 | 64 | @doc false 65 | def next("", _locale, _break, _options) do 66 | nil 67 | end 68 | 69 | def next(string, locale, :word = break, options) when locale in @dictionary_locales do 70 | <> = string 71 | 72 | case next_at({<>, rest}, locale, :word, options) do 73 | {fore, {_match, rest}} -> 74 | {fore, rest} 75 | 76 | {fore, rest} -> 77 | {fore, rest} 78 | end 79 | |> repeat_if_trimming_required(locale, break, options, options[:trim]) 80 | end 81 | 82 | def next(string, locale, break, options) when break in @break_keys and is_binary(string) do 83 | <> = string 84 | 85 | case next_at({<>, rest}, locale, Map.fetch!(@break_map, break), options) do 86 | {fore, {match, rest}} -> 87 | {<> <> fore, match <> rest} 88 | 89 | {fore, rest} -> 90 | {<> <> fore, rest} 91 | end 92 | |> repeat_if_trimming_required(locale, break, options, options[:trim]) 93 | end 94 | 95 | defp repeat_if_trimming_required({match, rest}, locale, break, options, true) do 96 | if Unicode.Property.white_space?(match) do 97 | next(rest, locale, break, options) 98 | else 99 | {match, rest} 100 | end 101 | end 102 | 103 | defp repeat_if_trimming_required({match, rest}, _locale, _break, _options, _) do 104 | {match, rest} 105 | end 106 | 107 | defp next_at({string_before, ""}, locale, :word, _options) 108 | when locale in @dictionary_locales do 109 | {string_before, ""} 110 | end 111 | 112 | defp next_at({string_before, string_after}, locale, :word = break, options) 113 | when locale in @dictionary_locales do 114 | <> = string_after 115 | word = string_before <> <> 116 | 117 | case Dictionary.find_prefix(word, locale) do 118 | {:ok, _} -> 119 | next_at({word, 
rest}, locale, break, options) 120 | :prefix -> 121 | # If its a prefix then we keep going to see if we have a word 122 | # But if the next step doesn't produce either a prefix or 123 | # a word then it should be a break here 124 | case next_at({word, rest}, locale, break, options) do 125 | {fore, _aft} when fore == word -> 126 | {string_before, string_after} 127 | other -> 128 | other 129 | end 130 | :error -> 131 | {string_before, string_after} 132 | end 133 | end 134 | 135 | defp next_at({string_before, string_after}, locale, segment_type, options) do 136 | suppress? = Keyword.get(options, :suppressions, true) 137 | {:ok, rules} = rules(locale, segment_type, suppress?) 138 | 139 | {string_before, string_after} 140 | |> Segment.evaluate_rules(rules) 141 | |> do_next(rules, "") 142 | end 143 | 144 | defp do_next({:break, {_string_before, {"", ""}}}, _rules, acc) do 145 | {acc, ""} 146 | end 147 | 148 | defp do_next({:break, {_string_before, {fore, ""}}}, _rules, acc) do 149 | {acc, fore} 150 | end 151 | 152 | defp do_next({:break, {_string_before, rest}}, _rules, acc) do 153 | {acc, rest} 154 | end 155 | 156 | defp do_next({:no_break, {_string_before, {fore, ""}}}, _rules, acc) do 157 | {acc <> fore, ""} 158 | end 159 | 160 | # Previously we were doing {acc <> fore, aft} but more context 161 | # is needed for some rules so now its {string_before <> fore, aft} 162 | 163 | defp do_next({:no_break, {string_before, {fore, aft}}}, rules, acc) do 164 | {string_before <> fore, aft} 165 | |> Segment.evaluate_rules(rules) 166 | |> do_next(rules, acc <> fore) 167 | end 168 | 169 | # Recompile this module if any of the segment 170 | # files change. 171 | 172 | for {_locale, file} <- Segment.locale_map() do 173 | @external_resource Path.join(Segment.segments_dir(), file) 174 | end 175 | 176 | @suppression_rules %{ 177 | sentence_break: %{id: 10.5, value: "$Sp+ $Suppressions $Close* $Sp* ($ParaSep?) 
×"} 178 | } 179 | 180 | # Returns a list of rules applicable for 181 | # a given locale and segment type. 182 | defp rules(locale, segment_type) 183 | 184 | # Returns the variable definitions for 185 | # a given locale and segment type. 186 | @doc false 187 | def variables(locale, segment_type) 188 | 189 | # Returns a list of suppressions 190 | # (abbreviations) that can be used 191 | # to suppress an otherwise acceptable 192 | # break point. 193 | 194 | # Examples 195 | # 196 | # => Unicode.String.Break.variables "en", :sentence_break 197 | # [ 198 | # %{name: "$CR", value: "\\p{Sentence_Break=CR}"}, 199 | # %{name: "$LF", value: "\\p{Sentence_Break=LF}"}, 200 | # %{name: "$Extend", value: "\\p{Sentence_Break=Extend}"}, 201 | # %{name: "$Format", value: "\\p{Sentence_Break=Format}"}, 202 | # %{name: "$Sep", value: "\\p{Sentence_Break=Sep}"}, 203 | # %{name: "$Sp", value: "\\p{Sentence_Break=Sp}"}, 204 | # %{name: "$Lower", value: "\\p{Sentence_Break=Lower}"}, 205 | # ... 206 | # ] 207 | @doc false 208 | def suppressions(locale, segment_type) 209 | 210 | @doc false 211 | def suppressions_rule(locale, segment_type) 212 | 213 | for locale <- Segment.known_segmentation_locales() do 214 | {:ok, segments} = Segment.segments(locale) 215 | 216 | for segment_type <- Map.keys(segments) do 217 | defp rules(unquote(locale), unquote(segment_type)) do 218 | unquote(Macro.escape(Segment.rules(locale, segment_type))) 219 | end 220 | 221 | def variables(unquote(locale), unquote(segment_type)) do 222 | unquote(Macro.escape(get_in(segments, [segment_type, :variables]))) 223 | end 224 | 225 | def suppressions(unquote(locale), unquote(segment_type)) do 226 | unquote(Macro.escape(Segment.suppressions!(locale, segment_type))) 227 | end 228 | 229 | suppressions_rule = Map.get(@suppression_rules, segment_type) 230 | suppressions_variable = Segment.suppressions_variable(locale, segment_type) 231 | 232 | if suppressions_rule && suppressions_variable do 233 | variables = 234 | 
get_in(segments, [segment_type, :variables]) 235 | |> Segment.expand_variables([suppressions_variable]) 236 | 237 | rule = Segment.compile_rule(suppressions_rule, variables, [:caseless]) 238 | 239 | def suppressions_rule(unquote(locale), unquote(segment_type)) do 240 | unquote(Macro.escape(rule)) 241 | end 242 | end 243 | end 244 | end 245 | 246 | @default_locale :root 247 | 248 | defp rules(_other, segment_type) do 249 | rules(@default_locale, segment_type) 250 | end 251 | 252 | def suppressions_rule(_locale, _segment_type) do 253 | nil 254 | end 255 | 256 | @doc false 257 | def rules(locale, break_type, true) do 258 | if suppressions_rule = suppressions_rule(locale, break_type) do 259 | {:ok, rules} = rules(locale, break_type) 260 | {:ok, sort_rules([suppressions_rule | rules])} 261 | else 262 | rules(locale, break_type) 263 | end 264 | end 265 | 266 | def rules(locale, break_type, _) do 267 | rules(locale, break_type) 268 | end 269 | 270 | defp sort_rules(rules) do 271 | Enum.sort_by(rules, &elem(&1, 0)) 272 | end 273 | end 274 | -------------------------------------------------------------------------------- /lib/unicode/case/folding.ex: -------------------------------------------------------------------------------- 1 | defmodule Unicode.String.Case.Folding do 2 | @moduledoc """ 3 | Implements the Unicode Case Folding algorithm. 4 | 5 | The intention of case folding is to facilitate 6 | case-insensitive string comparisons. It is not 7 | intended to be a general purpose transformation. 8 | 9 | Although case folding does generally use lower 10 | case as its normal form, it is not true for 11 | all scripts and codepoints. Therefore case 12 | folding should not be used as an alternative 13 | to `String.downcase/1`. 14 | 15 | """ 16 | 17 | @turkic_languages [:tr, :az] 18 | @fold_status [:turkic, :common, :full] 19 | 20 | @doc """ 21 | Case fold a string. 22 | 23 | Returns a string after applying the Unicode 24 | Case Folding algorithm. 
25 | 26 | Case folding is intended to support case 27 | insensitive string comparisons such as that 28 | implemented by `Unicode.String.equals_ignoring_case?/2` which 29 | calls this function on its parameters. 30 | 31 | ### Arguments 32 | 33 | * `string` is any `String.t()` 34 | 35 | * `mode or language tag` is either the atoms `:turkic` or `nil` 36 | or a map that includes the key `:language` with a value that 37 | is a lowercase atom representing an [ISO 639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) 38 | language code. The [CLDR language tag](https://hexdocs.pm/ex_cldr/Cldr.LanguageTag.html) which is defined 39 | as part of the [ex_cldr](https://hex.pm/packages/ex_cldr) is one 40 | such example. See [Cldr.validate_locale/2](https://hexdocs.pm/ex_cldr/Cldr.html#validate_locale/2) 41 | for further information. The default is `nil`. 42 | 43 | ### Returns 44 | 45 | * The case folded string 46 | 47 | ### Notes 48 | 49 | * No normalization is applied to the 50 | string on either input or output. 51 | 52 | * Case folding does not apply any transformation 53 | to accented characters. `"ü"` will not case fold 54 | to `"u"` for example. 
55 | 56 | ### Examples 57 | 58 | iex> Unicode.String.Case.Folding.fold("THIS") 59 | "this" 60 | 61 | iex> Unicode.String.Case.Folding.fold("grüßen") 62 | "grüssen" 63 | 64 | iex(13)> Unicode.String.Case.Folding.fold("I") 65 | "i" 66 | 67 | # Turkic languages such as Turkish and Azerbaijani have 68 | # a dotless lower case "i" 69 | iex> Unicode.String.Case.Folding.fold("I", :turkic) 70 | "ı" 71 | 72 | iex> Unicode.String.Case.Folding.fold("I", %{language: :az}) 73 | "ı" 74 | 75 | """ 76 | def fold(string) when is_binary(string) do 77 | fold(string, :full, nil) 78 | end 79 | 80 | def fold(string, %{language: language}) when language in @turkic_languages do 81 | fold(string, :full, :turkic) 82 | end 83 | 84 | def fold(string, language) when language in @turkic_languages do 85 | fold(string, :full, :turkic) 86 | end 87 | 88 | def fold(string, %{language: _language}) do 89 | fold(string, :full, nil) 90 | end 91 | 92 | def fold(string, :turkic) when is_binary(string) do 93 | fold(string, :full, :turkic) 94 | end 95 | 96 | def fold(string, _other) when is_binary(string) do 97 | fold(string, :full, nil) 98 | end 99 | 100 | for [status, from, to] <- Unicode.Utils.case_folding(), status in @fold_status do 101 | to = if is_list(to), do: List.to_string(to), else: List.to_string([to]) 102 | 103 | case status do 104 | :turkic -> 105 | defp fold(<>, _status, :turkic) do 106 | <> 107 | end 108 | 109 | :common -> 110 | defp fold(<>, status, mode) do 111 | <> 112 | end 113 | 114 | :full -> 115 | defp fold(<>, unquote(status), mode) do 116 | <> 117 | end 118 | end 119 | end 120 | 121 | defp fold(<>, status, mode) do 122 | <> 123 | end 124 | 125 | defp fold("", _, _) do 126 | "" 127 | end 128 | end 129 | -------------------------------------------------------------------------------- /lib/unicode/case/greek_upper.ex: -------------------------------------------------------------------------------- 1 | defmodule Unicode.String.Case.Mapping.Greek do 2 | @moduledoc """ 3 | Implements the 
special upper casing rules 4 | for the Greek language. 5 | 6 | """ 7 | 8 | @remove_accents Unicode.Regex.expand_regex( 9 | "[^[:ccc=Not_Reordered:][:ccc=Above:]]*?[\\u0313\\u0314\\u0301\\u0300\\u0306\\u0342\\u0308\\u0304]" 10 | ) 11 | @remove_iota Unicode.Regex.expand_regex("[^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*?[\\u0345]") 12 | 13 | @doc """ 14 | This implementation currently implements the `el-Upper` transform 15 | from CLDR. 16 | 17 | ### CLDR algorithm 18 | 19 | According to CLDR all accents on all characters are omitted when 20 | upcasing. 21 | 22 | Remove 0301 following Greek, with possible intervening 0308 marks. 23 | ::NFD(); 24 | For uppercasing (not titlecasing!) remove all greek accents from greek letters. 25 | This is done in two groups, to account for canonical ordering. 26 | [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ; 27 | [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ; 28 | ::NFC(); 29 | 30 | That transform basically says remove all accents except a 31 | subscripted iota. It doesn't handle diphthongs correctly. 32 | 33 | ### Mozilla algorithm 34 | 35 | Mozilla has a thread on a [bug report](https://bugzilla.mozilla.org/show_bug.cgi?id=307039) 36 | that: 37 | 38 | > Greek accented letters should be converted to the respective non-accented uppercase 39 | > letters. 
The required conversions are the following (in Unicode): 40 | > 41 | > ά -> Α 42 | > έ -> Ε 43 | > ή -> Η 44 | > ί -> Ι 45 | > ΐ -> Ϊ 46 | > ό -> Ο 47 | > ύ -> Υ 48 | > ΰ -> Ϋ 49 | > ώ -> Ω 50 | > 51 | > Also diphthongs (two-vowel constructs) should be converted as follows, when the 52 | > first vowel is accented: 53 | > 54 | > άι -> ΑΪ 55 | > έι -> ΕΪ 56 | > όι -> ΟΪ 57 | > ύι -> ΥΪ 58 | > άυ -> ΑΫ 59 | > έυ -> ΕΫ 60 | > ήυ -> ΗΫ 61 | > όυ -> ΟΫ 62 | 63 | That thread seems to align with current-day [Mozilla](https://developer.mozilla.org/en-US/docs/Web/CSS/text-transform) 64 | which says the rules are: 65 | 66 | > In Greek (el), vowels lose their accent when the whole word is in 67 | > uppercase (ά/Α), except for the disjunctive eta (ή/Ή). Also, diphthongs 68 | > with an accent on the first vowel lose the accent and gain a diaeresis 69 | > on the second vowel (άι/ΑΪ). 70 | 71 | """ 72 | def upcase(string) do 73 | string 74 | |> String.normalize(:nfd) 75 | |> String.replace(~r/#{@remove_accents}/u, "") 76 | |> String.replace(~r/#{@remove_iota}/u, "") 77 | |> String.normalize(:nfc) 78 | |> Unicode.String.Case.Mapping.upcase(:any) 79 | end 80 | end 81 | -------------------------------------------------------------------------------- /lib/unicode/case/mapping.ex: -------------------------------------------------------------------------------- 1 | defmodule Unicode.String.Case.Mapping do 2 | @moduledoc """ 3 | The [Unicode Case Mapping](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf) algorithm 4 | defines the process and data to transform text into upper case, lower case or title case. 5 | 6 | Since most languages are not bicameral, characters which have no appropriate mapping remain unchanged. 7 | 8 | Three case mapping functions are provided as a public API which have their implementations in this module: 9 | 10 | * `Unicode.String.upcase/2` which will convert text to upper case characters. 
11 | * `Unicode.String.downcase/2` which will convert text to lower case characters. 12 | * `Unicode.String.titlecase/2` which will convert text to title case. Title case means 13 | that the first character of each word is set to upper case and all other characters in 14 | the word are set to lower case. `Unicode.String.split/2` is used to split the string 15 | into words before title casing. 16 | 17 | Each function operates in a locale-aware manner implementing some basic capabilities: 18 | 19 | * Casing rules for the Turkish dotted capital `I` and dotless small `i`. 20 | * Casing rules for the retention of dots over `i` for Lithuanian letters with additional accents. 21 | * Titlecasing of IJ at the start of words in Dutch. 22 | * Removal of accents when upper casing letters in Greek. 23 | 24 | There are other casing rules that are not currently implemented such as: 25 | 26 | * Titlecasing of second or subsequent letters in words in orthographies that include 27 | caseless letters such as apostrophes. 28 | * Uppercasing of U+00DF `ß` latin small letter sharp `s` to U+1E9E `ẞ` latin capital letter 29 | sharp `s`. 
30 | 31 | ### Examples 32 | 33 | # Basic case transformation 34 | iex> Unicode.String.Case.Mapping.upcase("the quick brown fox") 35 | "THE QUICK BROWN FOX" 36 | 37 | # Dotted-I in Turkish and Azeri 38 | iex> Unicode.String.Case.Mapping.upcase("Diyarbakır", :tr) 39 | "DİYARBAKIR" 40 | 41 | # Upper case in Greek removes diacritics 42 | iex> Unicode.String.Case.Mapping.upcase("Πατάτα, Αέρας, Μυστήριο", :el) 43 | "ΠΑΤΑΤΑ, ΑΕΡΑΣ, ΜΥΣΤΗΡΙΟ" 44 | 45 | # Lower case Greek with a final sigma 46 | iex> Unicode.String.Case.Mapping.downcase("ὈΔΥΣΣΕΎΣ", :el) 47 | "ὀδυσσεύς" 48 | 49 | # Title case Dutch with leading dipthong 50 | iex> Unicode.String.Case.Mapping.titlecase("ijsselmeer", :nl) 51 | "IJsselmeer" 52 | 53 | """ 54 | 55 | alias Unicode.Utils 56 | 57 | @sigma 0x03A3 58 | @lower_sigma <<0x03C3::utf8>> 59 | @sigma_byte_size byte_size(<<@sigma::utf8>>) 60 | 61 | # See table Table 3-17 of https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf 62 | # for details of the contexts 63 | 64 | # These regexes can probably be converted to another form 65 | # which may further enable binary optimmization. 
66 | @final_sigma_before Unicode.Regex.expand_regex("\\p{cased}(\\p{Case_Ignorable})*") 67 | @final_sigma_after Unicode.Regex.expand_regex("(\\p{Case_Ignorable})*\\p{cased}") 68 | 69 | @after_soft_dotted Unicode.Regex.expand_regex("[\\p{Soft_Dotted}]([^\\p{ccc=230}\\p{ccc=0}])*") 70 | @more_above Unicode.Regex.expand_regex("[^\\p{ccc=230}\\p{ccc=0}]*[\\p{ccc=230}]") 71 | @before_dot Unicode.Regex.expand_regex("([^\\p{ccc=230}\\p{ccc=0}])*[\u0307]") 72 | @after_i Unicode.Regex.expand_regex("[I]([^\\p{ccc=230}\\p{ccc=0}])*") 73 | 74 | utf8_bytes_for_codepoint = fn codepoint -> 75 | byte_size(<>) 76 | end 77 | 78 | define_casing_function = fn 79 | casing, codepoint, replace, language, nil -> 80 | codepoint_bytes = utf8_bytes_for_codepoint.(codepoint) 81 | replacement = :unicode.characters_to_binary(replace) 82 | 83 | defp casing( 84 | string, 85 | <>, 86 | unquote(casing), 87 | unquote(language), 88 | bytes_so_far, 89 | acc 90 | ) do 91 | bytes_so_far = bytes_so_far + unquote(codepoint_bytes) 92 | 93 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [ 94 | unquote(replacement) | acc 95 | ]) 96 | end 97 | 98 | casing, codepoint, replace, language, "final_sigma" -> 99 | codepoint_bytes = utf8_bytes_for_codepoint.(codepoint) 100 | replacement = :unicode.characters_to_binary(replace) 101 | 102 | defp casing( 103 | string, 104 | <<@sigma::utf8, rest::binary>>, 105 | unquote(casing), 106 | unquote(language), 107 | bytes_so_far, 108 | acc 109 | ) do 110 | <> = string 111 | bytes_so_far = bytes_so_far + unquote(codepoint_bytes) 112 | 113 | if Regex.match?(~r/#{@final_sigma_before}/u, prior) && !Regex.match?(~r/#{@final_sigma_after}/u, rest) do 114 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [ 115 | unquote(replacement) | acc 116 | ]) 117 | else 118 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [ 119 | @lower_sigma | acc 120 | ]) 121 | end 122 | end 123 | 124 | casing, codepoint, replace, language, 
"not_before_dot" -> 125 | codepoint_bytes = utf8_bytes_for_codepoint.(codepoint) 126 | replacement = :unicode.characters_to_binary(replace) 127 | 128 | defp casing( 129 | string, 130 | <>, 131 | unquote(casing), 132 | unquote(language), 133 | bytes_so_far, 134 | acc 135 | ) do 136 | <> = string 137 | bytes_so_far = bytes_so_far + unquote(codepoint_bytes) 138 | 139 | if !Regex.match?(~r/#{@before_dot}/u, prior) do 140 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [ 141 | unquote(replacement) | acc 142 | ]) 143 | else 144 | this = 145 | casing( 146 | <>, 147 | <>, 148 | unquote(casing), 149 | :any, 150 | 0, 151 | acc 152 | ) 153 | 154 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [this | acc]) 155 | end 156 | end 157 | 158 | casing, codepoint, replace, language, "more_above" -> 159 | codepoint_bytes = utf8_bytes_for_codepoint.(codepoint) 160 | replacement = :unicode.characters_to_binary(replace) 161 | 162 | defp casing( 163 | string, 164 | <>, 165 | unquote(casing), 166 | unquote(language), 167 | bytes_so_far, 168 | acc 169 | ) do 170 | bytes_so_far = bytes_so_far + unquote(codepoint_bytes) 171 | 172 | if Regex.match?(~r/#{@more_above}/u, rest) do 173 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [ 174 | unquote(replacement) | acc 175 | ]) 176 | else 177 | this = 178 | casing( 179 | <>, 180 | <>, 181 | unquote(casing), 182 | :any, 183 | 0, 184 | acc 185 | ) 186 | 187 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [this | acc]) 188 | end 189 | end 190 | 191 | casing, codepoint, replace, language, "after_soft_dotted" -> 192 | codepoint_bytes = utf8_bytes_for_codepoint.(codepoint) 193 | replacement = :unicode.characters_to_binary(replace) 194 | 195 | defp casing( 196 | string, 197 | <>, 198 | unquote(casing), 199 | unquote(language), 200 | bytes_so_far, 201 | acc 202 | ) do 203 | <> = string 204 | bytes_so_far = bytes_so_far + unquote(codepoint_bytes) 205 | 206 | 
if Regex.match?(~r/#{@after_soft_dotted}/u, prior) do 207 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [ 208 | unquote(replacement) | acc 209 | ]) 210 | else 211 | this = 212 | casing( 213 | <>, 214 | <>, 215 | unquote(casing), 216 | :any, 217 | 0, 218 | acc 219 | ) 220 | 221 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [this | acc]) 222 | end 223 | end 224 | 225 | casing, codepoint, replace, language, "after_i" -> 226 | codepoint_bytes = utf8_bytes_for_codepoint.(codepoint) 227 | replacement = :unicode.characters_to_binary(replace) 228 | 229 | defp casing( 230 | string, 231 | <>, 232 | unquote(casing), 233 | unquote(language), 234 | bytes_so_far, 235 | acc 236 | ) do 237 | <> = string 238 | bytes_so_far = bytes_so_far + unquote(codepoint_bytes) 239 | 240 | if Regex.match?(~r/#{@after_i}/u, prior) do 241 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [ 242 | unquote(replacement) | acc 243 | ]) 244 | else 245 | this = 246 | casing( 247 | <>, 248 | <>, 249 | unquote(casing), 250 | :any, 251 | 0, 252 | acc 253 | ) 254 | 255 | casing(string, rest, unquote(casing), unquote(language), bytes_so_far, [this | acc]) 256 | end 257 | end 258 | end 259 | 260 | @doc """ 261 | Replace lower case characters with their 262 | uppercase equivalents. 263 | 264 | Lower case characters are replaced with their 265 | upper case equivalents. All other characters 266 | remain unchanged. 267 | 268 | For the Greek language (`:el`), all accents are 269 | removed prior to capitalization as is the normal 270 | practise for this language. 
271 | 272 | """ 273 | def upcase(string, language \\ :any) 274 | 275 | def upcase(string, :el) do 276 | Unicode.String.Case.Mapping.Greek.upcase(string) 277 | end 278 | 279 | def upcase(string, language) when is_atom(language) do 280 | casing(string, string, :upcase, language, 0, []) 281 | end 282 | 283 | @doc """ 284 | Replace upper case characters with their 285 | lower case equivalents. 286 | 287 | """ 288 | def downcase(string, language \\ :any) 289 | 290 | def downcase(string, language) when is_atom(language) do 291 | casing(string, string, :downcase, language, 0, []) 292 | end 293 | 294 | @doc """ 295 | Apply the Unicode title case algorithm. 296 | 297 | """ 298 | def titlecase(string, language \\ :any) 299 | 300 | def titlecase(<>, :nl) 301 | when i in [?i, ?I] and j in [?j, ?J] do 302 | "IJ" <> casing(rest, rest, :downcase, :any, 0, []) 303 | end 304 | 305 | def titlecase(<>, language) when is_atom(language) do 306 | casing(<>, <>, :titlecase, language, 0, []) <> 307 | downcase(rest, language) 308 | end 309 | 310 | # These next four function clauses optimize for ASCII characters. 
311 | # We need to omit the `i` from all ranges since in Turkish and Azeri 312 | # they upcase to a dotted-capital-I 313 | 314 | defp casing( 315 | string, 316 | <>, 317 | :downcase = casing, 318 | language, 319 | bytes_so_far, 320 | acc 321 | ) 322 | when byte >= ?A and byte <= ?Z and byte != ?I do 323 | casing(string, rest, casing, language, bytes_so_far + 1, [byte + 32 | acc]) 324 | end 325 | 326 | defp casing(string, <>, casing, language, bytes_so_far, acc) 327 | when casing in [:upcase, :titlecase] and byte >= ?a and byte <= ?z and byte != ?i do 328 | casing(string, rest, casing, language, bytes_so_far + 1, [byte - 32 | acc]) 329 | end 330 | 331 | defp casing(string, <>, casing, language, bytes_so_far, acc) 332 | when casing in [:upcase, :titlecase] and byte != ?i and byte <= ?~ do 333 | casing(string, rest, casing, language, bytes_so_far + 1, [byte | acc]) 334 | end 335 | 336 | defp casing( 337 | string, 338 | <>, 339 | :downcase = casing, 340 | language, 341 | bytes_so_far, 342 | acc 343 | ) 344 | when byte != ?I and byte <= ?~ do 345 | casing(string, rest, casing, language, bytes_so_far + 1, [byte | acc]) 346 | end 347 | 348 | # Generate the mapping functions 349 | 350 | for %{codepoint: codepoint, upper: upper} = casing <- Utils.casing_in_order(), 351 | upper && upper != codepoint && (codepoint == ?i or codepoint > ?~) do 352 | %{context: context, language: language} = casing 353 | 354 | define_casing_function.(:upcase, codepoint, upper, language, context) 355 | end 356 | 357 | for %{codepoint: codepoint, lower: lower} = casing <- Utils.casing_in_order(), 358 | lower && lower != codepoint && (codepoint == ?I or codepoint > ?~) do 359 | %{language: language, context: context} = casing 360 | 361 | # Special casing for capital sigma with no context. 362 | # see the default implementations of casing/5 at the 363 | # end of this file. Don't generate a function clause for 364 | # this codepoint here. 
365 | unless codepoint == @sigma and is_nil(context) do 366 | define_casing_function.(:downcase, codepoint, lower, language, context) 367 | end 368 | end 369 | 370 | for %{codepoint: codepoint, title: title} = casing <- Utils.casing_in_order(), 371 | title && title != codepoint && codepoint > ?~ do 372 | %{context: context, language: language} = casing 373 | 374 | define_casing_function.(:titlecase, codepoint, title, language, context) 375 | end 376 | 377 | # End of string, return accumulator 378 | defp casing(_string, "", _casing, _language, _bytes_so_far, acc) do 379 | acc 380 | |> :lists.reverse() 381 | |> IO.iodata_to_binary() 382 | end 383 | 384 | # Special case for Greek sigma when no context. This is the only codepoint 385 | # that has two cases for the language :any. One case with "final_sigma" context 386 | # and one with no context. This means we can't generate two distinct function 387 | # clauses for casing/5 so we define a special one here for the "no context" 388 | # version and generate the one with the context in the normal flow. 389 | defp casing( 390 | string, 391 | <<@sigma::utf8, rest::binary>>, 392 | :downcase = casing, 393 | :any = language, 394 | bytes_so_far, 395 | acc 396 | ) do 397 | bytes_so_far = bytes_so_far + @sigma_byte_size 398 | 399 | casing(string, rest, casing, language, bytes_so_far, [@lower_sigma | acc]) 400 | end 401 | 402 | # Pass the character through since there is no casing data. 
403 | # Optimize for ASCII bytes (byte value is less than 127) 404 | defp casing(string, <>, casing, :any = language, bytes_so_far, acc) 405 | when byte <= ?~ do 406 | bytes_so_far = bytes_so_far + 1 407 | 408 | casing(string, rest, casing, language, bytes_so_far, [byte | acc]) 409 | end 410 | 411 | defp casing(string, <>, casing, :any = language, bytes_so_far, acc) do 412 | next = <> 413 | bytes_so_far = bytes_so_far + byte_size(next) 414 | 415 | casing(string, rest, casing, language, bytes_so_far, [next | acc]) 416 | end 417 | 418 | # If the language version has no casing, use the default casing by 419 | # using the :any language. 420 | defp casing(string, rest, casing, _language, bytes_so_far, acc) do 421 | casing(string, rest, casing, :any, bytes_so_far, acc) 422 | end 423 | 424 | @doc false 425 | def unknown_locale_error(locale) do 426 | "Unknown locale #{inspect(locale)}" 427 | end 428 | end 429 | -------------------------------------------------------------------------------- /lib/unicode/dictionary.ex: -------------------------------------------------------------------------------- 1 | defmodule Unicode.String.Dictionary do 2 | @moduledoc """ 3 | Implements basic dictionary functions for dictionary-based 4 | word break. 5 | 6 | This implementation supports dictionary-based word breaking for: 7 | 8 | * Chinese (`zh`, `zh-Hant`, `zh-Hant-HK`, `yue`, `yue-Hant`, `yue-Hans`) locales, 9 | * Japanese (`ja`) using the same dictionary as for Chinese, 10 | * Thai (`th`), 11 | * Lao (`lo`), 12 | * Khmer (`km`) and 13 | * Burmese (`my`). 14 | 15 | The dictionaries implemented are those used in the [CLDR](https://cldr.unicode.org) since 16 | they are under an open source license and also for consistency with 17 | [ICU](https://icu.unicode.org). 18 | 19 | Note that these dictionaries need to be downloaded with 20 | `mix unicode.string.download.dictionaries` prior to use. 
Each dictionary 21 | will be parsed and loaded into [persistent_term](https://www.erlang.org/doc/man/persistent_term) 22 | on demand. Note that each dictionary has a sizable memory footprint as measured 23 | by `:persistent_term.info/0`: 24 | 25 | | Dictionary | Memory Mb | 26 | | ----------- | ----------: | 27 | | Chinese | 104.8 | 28 | | Thai | 9.6 | 29 | | Lao | 11.4 | 30 | | Khmer | 38.8 | 31 | | Burmese | 23.1 | 32 | 33 | """ 34 | 35 | alias Unicode.String.Trie 36 | 37 | @app_name :unicode_string 38 | @dictionary_dir "dictionaries/" 39 | 40 | @dictionary_locales [ 41 | :zh, :th, :lo, :my, :km, :ja, :"zh-Hant", :"zh-Hant-HK", :yue, :"yue-Hant", :"yue-Hans" 42 | ] 43 | 44 | @doc """ 45 | Returns the locales that have a dictionary supporting 46 | word breaking. 47 | 48 | """ 49 | def known_dictionary_locales do 50 | @dictionary_locales 51 | end 52 | 53 | @doc false 54 | def ensure_dictionary_loaded_if_available(locale) when locale in @dictionary_locales do 55 | require Logger 56 | 57 | with {:ok, locale} <- dictionary_locale(locale) do 58 | status = 59 | if dictionary = dictionary(locale) do 60 | {:ok, dictionary} 61 | else 62 | load(locale) 63 | end 64 | 65 | case status do 66 | {:ok, dictionary} -> 67 | {:ok, dictionary} 68 | 69 | _other -> 70 | message = "No dictionary for #{locale} found. Have you run `mix unicode.string.download.dictionaries`?"
71 | Logger.debug(message) 72 | {:error, message} 73 | end 74 | end 75 | end 76 | 77 | def ensure_dictionary_loaded_if_available(locale) do 78 | {:ok, "No dictionary for #{inspect locale} found"} 79 | end 80 | 81 | @doc false 82 | def load(locale) do 83 | with {:ok, locale} <- dictionary_locale(locale) do 84 | load_dictionary(locale) 85 | end 86 | end 87 | 88 | @doc false 89 | def is_loaded(locale) do 90 | with {:ok, locale} <- dictionary_locale(locale) do 91 | :persistent_term.get({@app_name, locale}, false) && true 92 | else 93 | _other -> false 94 | end 95 | end 96 | 97 | @doc false 98 | def dictionary(locale) when locale in @dictionary_locales do 99 | :persistent_term.get({@app_name, locale}, nil) 100 | end 101 | 102 | @doc false 103 | def has_key(string, locale) do 104 | with {:ok, locale} <- dictionary_locale(locale) do 105 | dictionary = :persistent_term.get({@app_name, locale}) 106 | Trie.has_key(string, dictionary) 107 | end 108 | end 109 | 110 | @doc false 111 | def find_prefix(string, locale) do 112 | with {:ok, locale} <- dictionary_locale(locale) do 113 | dictionary = :persistent_term.get({@app_name, locale}) 114 | Trie.find_prefix(string, dictionary) 115 | end 116 | end 117 | 118 | @doc false 119 | @dialyzer {:nowarn_function, load_dictionary: 1} 120 | defp load_dictionary(:zh), do: load_dictionary(:zh, "chinese_japanese.txt") 121 | defp load_dictionary(:ja), do: load_dictionary(:zh) 122 | defp load_dictionary(:lo), do: load_dictionary(:lo, "lao.txt") 123 | defp load_dictionary(:th), do: load_dictionary(:th, "thai.txt") 124 | defp load_dictionary(:my), do: load_dictionary(:my, "burmese.txt") 125 | defp load_dictionary(:km), do: load_dictionary(:km, "khmer.txt") 126 | 127 | @comment_marker ["#", " #", " #", "\uFEFF #"] 128 | 129 | defp load_dictionary(locale, file_name) do 130 | require Logger 131 | 132 | trie = 133 | file_name 134 | |> read_dictionary() 135 | |> String.split("\n") 136 | |> Enum.reject(&String.starts_with?(&1, @comment_marker)) 137 | 
|> Enum.reject(&(String.length(&1) == 0)) 138 | |> Enum.map(fn line -> 139 | case String.split(line, "\t") do 140 | [word] -> word 141 | [word, value] -> {word, String.to_integer(value)} 142 | end 143 | end) 144 | |> Trie.new() 145 | 146 | :ok = :persistent_term.put({@app_name, locale}, trie) 147 | trie = :persistent_term.get({@app_name, locale}) 148 | 149 | # Logger.debug("[unicode_string] Loaded word break dictionary for locale #{inspect locale}") 150 | {:ok, trie} 151 | end 152 | 153 | defp read_dictionary(file_name) do 154 | priv_dir = :code.priv_dir(@app_name) |> to_string 155 | path = Path.join(priv_dir, [@dictionary_dir, file_name]) 156 | File.read!(path) 157 | end 158 | 159 | @doc false 160 | def dictionary_locale(:zh), do: {:ok, :zh} 161 | def dictionary_locale(:"zh-Hant"), do: {:ok, :zh} 162 | def dictionary_locale(:"zh-Hant-HK"), do: {:ok, :zh} 163 | def dictionary_locale(:yue), do: {:ok, :zh} 164 | def dictionary_locale(:"yue-Hant"), do: {:ok, :zh} 165 | def dictionary_locale(:"yue-Hans"), do: {:ok, :zh} 166 | 167 | def dictionary_locale(:lo), do: {:ok, :lo} 168 | def dictionary_locale(:my), do: {:ok, :my} 169 | def dictionary_locale(:th), do: {:ok, :th} 170 | def dictionary_locale(:km), do: {:ok, :km} 171 | def dictionary_locale(:ja), do: {:ok, :zh} 172 | def dictionary_locale(%{language: language}), do: dictionary_locale(language) 173 | def dictionary_locale(language), do: {:error, "No dictionary for #{inspect language} found."} 174 | 175 | end -------------------------------------------------------------------------------- /lib/unicode/segment.ex: -------------------------------------------------------------------------------- 1 | defmodule Unicode.String.Segment do 2 | @moduledoc """ 3 | Implements the compilation of the Unicode 4 | segment rules. 
5 | 6 | """ 7 | 8 | import SweetXml 9 | require Unicode.Set 10 | 11 | @root_locale "root" 12 | @suppressions_variable "$Suppressions" 13 | 14 | # This is the formal definition but it takes a while to compile 15 | # and all of the known variable names are in the Latin-1 set 16 | # defguard is_id_start(char) when Unicode.Set.match?(char, "\\p{ID_start}") 17 | # defguard is_id_continue(char) when Unicode.Set.match?(char, "\\p{ID_continue}") 18 | 19 | @doc "Identifies if a codepoint is a valid start of an identifier" 20 | defguard is_id_start(char) 21 | when char in ?A..?Z 22 | 23 | @doc "Identifies if a codepoint is a valid identifier character" 24 | defguard is_id_continue(char) 25 | when char in ?a..?z or char in ?A..?Z or char in ?0..?9 or char == ?_ 26 | 27 | @doc """ 28 | Return the rules as defined by CLDR for a given 29 | locale and break type. 30 | 31 | """ 32 | def rules(locale, segment_type, additional_variables \\ []) do 33 | with {:ok, segment} <- segments(locale, segment_type) do 34 | variables = Map.fetch!(segment, :variables) |> expand_variables(additional_variables) 35 | rules = Map.fetch!(segment, :rules) 36 | 37 | rules 38 | |> compile_rules(variables, []) 39 | |> wrap(:ok) 40 | end 41 | end 42 | 43 | @doc """ 44 | Return the rules as defined by CLDR for a given 45 | locale and break type and raises on error. 46 | 47 | """ 48 | def rules!(locale, segment_type, additional_variables \\ []) do 49 | case rules(locale, segment_type, additional_variables) do 50 | {:ok, rules} -> rules 51 | {:error, reason} -> raise ArgumentError, reason 52 | end 53 | end 54 | 55 | def compile_rules(rules, variables, regex_options) when is_list(rules) do 56 | rules 57 | |> expand_rules(variables) 58 | |> compile_rules(regex_options) 59 | end 60 | 61 | # These options set unicode mode. 
Interpret certain 62 | # codes like \B and \w in the unicode space, ignore 63 | # unescaped whitespace in regexs 64 | @regex_options [:unicode, :extended, :ucp, :dollar_endonly, :dotall, :bsr_unicode] 65 | @rule_splitter "[×÷]" 66 | 67 | defp compile_rules(rules, regex_options) do 68 | Enum.map(rules, fn {sequence, rule} -> 69 | [left, operator, right] = Regex.split(~r/#{@rule_splitter}/u, rule, include_captures: true) 70 | operator = if operator == "×", do: :no_break, else: :break 71 | 72 | left = if left != "", do: left <> "$", else: left 73 | right = if right != "", do: "^" <> right, else: right 74 | 75 | {sequence, 76 | {operator, expand_regex(left, regex_options), expand_regex(right, regex_options)}} 77 | end) 78 | end 79 | 80 | @doc """ 81 | Compiles a segment rule in the context of a list 82 | of variables. 83 | 84 | The compile rule can then be inserted into a 85 | rule set. 86 | 87 | """ 88 | def compile_rule(rule, variables, regex_options \\ []) when is_map(rule) do 89 | compile_rules([rule], variables, regex_options) 90 | |> hd 91 | end 92 | 93 | @doc false 94 | def suppressions_variable(locale, segment_type) do 95 | variable = 96 | locale 97 | |> suppressions!(segment_type) 98 | |> suppressions_regex() 99 | 100 | if variable do 101 | %{name: @suppressions_variable, value: variable} 102 | else 103 | nil 104 | end 105 | end 106 | 107 | defp suppressions_regex([]) do 108 | nil 109 | end 110 | 111 | defp suppressions_regex(suppressions) do 112 | suppression_regex = Enum.map_join(suppressions, "|", &String.replace(&1, ".", "\\.")) 113 | 114 | "(" <> suppression_regex <> ")" 115 | end 116 | 117 | @doc """ 118 | Returns a list of the suppressions for a given 119 | locale and segment type. 
120 | 121 | """ 122 | def suppressions(locale, segment_type) do 123 | with {:ok, segment} <- segments(locale, segment_type) do 124 | {:ok, Map.get(segment, :suppressions, [])} 125 | end 126 | end 127 | 128 | @doc """ 129 | Returns a list of the suppressions for a given 130 | locale and segment type and raises on error. 131 | 132 | """ 133 | def suppressions!(locale, segment_type) do 134 | case suppressions(locale, segment_type) do 135 | {:ok, suppressions} -> suppressions 136 | {:error, reason} -> raise ArgumentError, reason 137 | end 138 | end 139 | 140 | defp expand_regex("", _regex_options) do 141 | :any 142 | end 143 | 144 | # Delete spaces because PCRE doesn't ignore them 145 | 146 | defp expand_regex(string, regex_options) do 147 | string 148 | |> String.trim() 149 | |> String.replace(" ", "") 150 | |> Unicode.Regex.expand_regex(@regex_options ++ regex_options) 151 | end 152 | 153 | @doc """ 154 | Evaluates a list of rules against a given 155 | string. 156 | 157 | """ 158 | def evaluate_rules(string, rules) when is_binary(string) do 159 | evaluate_rules({"", string}, rules) 160 | end 161 | 162 | def evaluate_rules({string_before, string_after}, rules) do 163 | Enum.reduce_while(rules, [], fn rule, _acc -> 164 | {_rule_number, {operator, _fore, _aft}} = rule 165 | 166 | case evaluate_rule({string_before, string_after}, rule) do 167 | {:pass, result} -> 168 | {:halt, {:pass, operator, result}} 169 | 170 | {:fail, string} -> 171 | {:cont, {:fail, string}} 172 | end 173 | end) 174 | |> return_break_or_no_break 175 | end 176 | 177 | # The final implicit rule is to to break. 
ie: :any ÷ :any 178 | defp return_break_or_no_break({:fail, {before_string, ""}}) do 179 | {:break, {before_string, {"", ""}}} 180 | end 181 | 182 | defp return_break_or_no_break({:fail, {before_string, after_string}}) do 183 | <> = after_string 184 | {:break, {before_string, {<>, rest}}} 185 | end 186 | 187 | defp return_break_or_no_break({:pass, operator, result}) do 188 | {operator, result} 189 | end 190 | 191 | @split_options [parts: 2, include_captures: true, trim: true] 192 | 193 | # Process an `:any op regex` rule at end of string 194 | defp evaluate_rule({string_before, <<_::utf8>> = string_after}, {_seq, {_operator, :any, {aft, regex_options}}}) do 195 | aft = Regex.compile!(aft, regex_options) 196 | 197 | if Regex.match?(aft, string_after) do 198 | {:pass, {string_before, {string_after, ""}}} 199 | else 200 | {:fail, {string_before, string_after}} 201 | end 202 | end 203 | 204 | defp evaluate_rule({string_before, string_after}, {_seq, {_operator, :any, {aft, regex_options}}}) do 205 | aft = Regex.compile!(aft, regex_options) 206 | 207 | case Regex.split(aft, string_after, @split_options) do 208 | [match, rest] -> {:pass, {string_before, {match, rest}}} 209 | _other -> {:fail, {string_before, string_after}} 210 | end 211 | end 212 | 213 | # Ignore suppressions at end of the string 214 | defp evaluate_rule({string_before, string_after}, {10.5, {_operator, {fore, regex_options}, :any}}) do 215 | fore = Regex.compile!(fore, regex_options) 216 | 217 | if Regex.match?(fore, string_before) do 218 | # IO.inspect {string_before, string_after}, label: "Matched Rule 10.5" 219 | case Regex.split(fore, string_before, @split_options) do 220 | [match] -> 221 | # IO.inspect {operator, match}, label: "Matched One" 222 | {:pass, {string_before, {match, ""}}} 223 | 224 | [match, rest] -> 225 | # IO.inspect {operator, match, rest}, label: "Matched" 226 | {:pass, {string_before, {match, rest}}} 227 | end 228 | else 229 | # IO.inspect {string_before, string_after}, label: "Did 
not match Rule 10.5" 230 | {:fail, {string_before, string_after}} 231 | end 232 | end 233 | 234 | # :any matches end of string 235 | defp evaluate_rule({string_before, "" = string_after}, {_seq, {_operator, {fore, regex_options}, :any}}) do 236 | fore = Regex.compile!(fore, regex_options) 237 | 238 | if Regex.match?(fore, string_before) do 239 | {:pass, {string_before, {"", ""}}} 240 | else 241 | {:fail, {string_before, string_after}} 242 | end 243 | end 244 | 245 | defp evaluate_rule({string_before, string_after}, {_seq, {_operator, {fore, regex_options}, :any}}) do 246 | fore = Regex.compile!(fore, regex_options) 247 | 248 | if Regex.match?(fore, string_before) do 249 | <> = string_after 250 | {:pass, {string_before, {<>, rest}}} 251 | else 252 | {:fail, {string_before, string_after}} 253 | end 254 | end 255 | 256 | defp evaluate_rule({string_before, string_after}, {_seq, {_operator, {fore, fore_regex_options}, {aft, aft_regex_options}}}) do 257 | fore = Regex.compile!(fore, fore_regex_options) 258 | aft = Regex.compile!(aft, aft_regex_options) 259 | 260 | if Regex.match?(fore, string_before) && Regex.match?(aft, string_after) do 261 | case Regex.split(aft, string_after, @split_options) do 262 | [match, rest] -> {:pass, {string_before, {match, rest}}} 263 | [match] -> {:pass, {string_before, {match, ""}}} 264 | end 265 | else 266 | {:fail, {string_before, string_after}} 267 | end 268 | end 269 | 270 | defp expand_rules(rules, variables) do 271 | Enum.reduce(rules, [], fn %{id: sequence, value: rule}, acc -> 272 | rule = 273 | rule 274 | |> String.trim() 275 | |> substitute_variables(variables) 276 | 277 | [{sequence, rule} | acc] 278 | end) 279 | |> Enum.sort() 280 | end 281 | 282 | def expand_variables(variables, additional_variables) 283 | when is_list(variables) and is_list(additional_variables) do 284 | Enum.reduce(variables ++ additional_variables, %{}, fn 285 | %{name: <<"$", name::binary>>, value: value}, variables -> 286 | new_value = 
substitute_variables(value, variables) 287 | Map.put(variables, name, new_value) 288 | end) 289 | end 290 | 291 | defp substitute_variables("", _variables) do 292 | "" 293 | end 294 | 295 | defp substitute_variables(<<"$", char::utf8, rest::binary>>, variables) 296 | when is_id_start(char) do 297 | {name, rest} = extract_variable_name(<> <> rest) 298 | Map.fetch!(variables, name) <> substitute_variables(rest, variables) 299 | end 300 | 301 | defp substitute_variables(<>, variables) do 302 | char <> substitute_variables(rest, variables) 303 | end 304 | 305 | defp extract_variable_name("" = string) do 306 | {string, ""} 307 | end 308 | 309 | defp extract_variable_name(<>) 310 | when is_id_continue(char) do 311 | {string, rest} = extract_variable_name(rest) 312 | {<> <> string, rest} 313 | end 314 | 315 | defp extract_variable_name(rest) do 316 | {"", rest} 317 | end 318 | 319 | @app_name Mix.Project.config()[:app] 320 | 321 | @doctype "" 322 | 323 | @doc false 324 | def segments_dir do 325 | Path.join(:code.priv_dir(@app_name), "/segments") 326 | end 327 | 328 | @doc false 329 | def locale_map do 330 | segments_dir() 331 | |> File.ls!() 332 | |> Enum.map(fn locale_file -> 333 | locale = 334 | locale_file 335 | |> String.split(".xml") 336 | |> hd 337 | |> String.replace("_", "-") 338 | 339 | {locale, locale_file} 340 | end) 341 | |> Map.new() 342 | end 343 | 344 | @doc """ 345 | Returns a list of the known locales that have 346 | segmentation data. 347 | 348 | """ 349 | def known_segmentation_locales do 350 | locale_map() 351 | |> Map.keys() 352 | |> Enum.map(&String.to_atom/1) 353 | end 354 | 355 | @doc """ 356 | Returns a list of the ancestor locales 357 | of the a given locale. 358 | 359 | The list includes the given locale. 
360 | 361 | """ 362 | 363 | def ancestors(locale_name) do 364 | if Map.get(locale_map(), locale_name) do 365 | case String.split(locale_name, "-") do 366 | [locale] -> [locale, @root_locale] 367 | [locale, _territory] -> [locale_name, locale, @root_locale] 368 | [locale, script, _territory] -> [locale_name, "#{locale}-#{script}", locale, @root_locale] 369 | end 370 | |> wrap(:ok) 371 | else 372 | {:error, unknown_locale_error(locale_name)} 373 | end 374 | end 375 | 376 | @doc false 377 | def merge_ancestors(@root_locale) do 378 | raw_segments!(@root_locale) 379 | |> wrap(:ok) 380 | end 381 | 382 | def merge_ancestors(locale) when is_binary(locale) do 383 | with {:ok, ancestors} <- ancestors(locale) do 384 | merge_ancestors(ancestors) 385 | |> wrap(:ok) 386 | end 387 | end 388 | 389 | @doc false 390 | def merge_ancestors([locale, root]) do 391 | merge_ancestor(locale, raw_segments!(root)) 392 | end 393 | 394 | def merge_ancestors([locale | rest]) do 395 | merge_ancestor(locale, merge_ancestors(rest)) 396 | end 397 | 398 | # For each segment type, add the variables, rules and 399 | # suppressions from locale to other 400 | defp merge_ancestor(locale, other) do 401 | locale_segments = raw_segments!(locale) 402 | 403 | Enum.map(other, fn {segment_type, content} -> 404 | variables = 405 | Map.fetch!(content, :variables) ++ 406 | (get_in(locale_segments, [segment_type, :variables]) || []) 407 | 408 | rules = 409 | Map.fetch!(content, :rules) ++ 410 | (get_in(locale_segments, [segment_type, :rules]) || []) 411 | 412 | suppressions = 413 | Map.fetch!(content, :suppressions) ++ 414 | (get_in(locale_segments, [segment_type, :suppressions]) || []) 415 | 416 | {segment_type, %{content | variables: variables, rules: rules, suppressions: suppressions}} 417 | end) 418 | |> Map.new() 419 | end 420 | 421 | defp raw_segments(locale) do 422 | if file = Map.get(locale_map(), locale) do 423 | content = 424 | segments_dir() 425 | |> Path.join(file) 426 | |> File.read!() 427 | |> 
String.replace(@doctype, "") 428 | |> xpath(~x"//segmentation"l, 429 | type: ~x"./@type"s, 430 | variables: [ 431 | ~x".//variable"l, 432 | name: ~x"./@id"s, 433 | value: ~x"./text()"s 434 | ], 435 | rules: [ 436 | ~x".//rule"l, 437 | id: ~x"./@id"f, 438 | value: ~x"./text()"s 439 | ], 440 | suppressions: ~x".//suppression/text()"ls 441 | ) 442 | 443 | Enum.map(content, fn c -> 444 | type = 445 | c.type 446 | |> Macro.underscore() 447 | |> String.replace("__", "_") 448 | |> String.to_atom() 449 | 450 | {type, %{rules: c.rules, variables: c.variables, suppressions: c.suppressions}} 451 | end) 452 | |> Map.new() 453 | |> wrap(:ok) 454 | else 455 | {:error, unknown_locale_error(locale)} 456 | end 457 | end 458 | 459 | defp raw_segments!(locale) do 460 | case raw_segments(locale) do 461 | {:ok, segments} -> segments 462 | {:error, reason} -> raise ArgumentError, reason 463 | end 464 | end 465 | 466 | @doc false 467 | def segments(locale) when is_binary(locale) do 468 | merge_ancestors(locale) 469 | end 470 | 471 | def segments(locale) when is_atom(locale) do 472 | locale 473 | |> Atom.to_string() 474 | |> segments() 475 | end 476 | 477 | @doc false 478 | def segments(locale, segment_type) do 479 | with {:ok, segments} <- segments(to_string(locale)) do 480 | if segment = Map.get(segments, segment_type) do 481 | {:ok, segment} 482 | else 483 | {:error, unknown_segment_type_error(segment_type)} 484 | end 485 | end 486 | end 487 | 488 | defp wrap(term, atom) do 489 | {atom, term} 490 | end 491 | 492 | @doc false 493 | def unknown_locale_error(locale) do 494 | "Unknown locale #{inspect(locale)}" 495 | end 496 | 497 | @doc false 498 | def unknown_segment_type_error(segment_type) do 499 | "Unknown segment type #{inspect(segment_type)}" 500 | end 501 | end 502 | -------------------------------------------------------------------------------- /lib/unicode/string.ex: -------------------------------------------------------------------------------- 1 | defmodule Unicode.String do 
2 | @moduledoc """ 3 | This module provides functions that implement some 4 | of the [Unicode](https://unicode.org) standards: 5 | 6 | * The [Unicode Case Mapping](https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf) algorithm 7 | to provide mapping to upper, lower and title case text. 8 | 9 | * The [Unicode Case Folding](https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf) algorithm 10 | to provide case-independent equality checking irrespective of language or script. 11 | 12 | * The [Unicode Segmentation](https://unicode.org/reports/tr29/) algorithm to detect, 13 | break or split strings into grapheme clusters, words and sentences. 14 | 15 | * The [Unicode Line Breaking](https://www.unicode.org/reports/tr14/) algorithm to determine 16 | line break placement to support word-wrapping. 17 | 18 | """ 19 | 20 | alias Unicode.Property 21 | alias Unicode.String.Break 22 | alias Unicode.String.Segment 23 | alias Unicode.String.Case 24 | alias Unicode.String.Dictionary 25 | 26 | defdelegate fold(string), to: Unicode.String.Case.Folding 27 | defdelegate fold(string, type), to: Unicode.String.Case.Folding 28 | 29 | defguard is_language(language) when (byte_size(language) == 2 or byte_size(language) == 3) 30 | defguard is_script(script) when byte_size(script) == 4 31 | defguard is_territory(territory) when byte_size(territory) == 2 32 | 33 | @type string_interval :: {String.t(), String.t()} 34 | @type break_type :: :grapheme | :word | :line | :sentence 35 | @type error_return :: {:error, String.t()} 36 | 37 | @type option :: {:locale, String.t() | map} 38 | | {:break, break_type} 39 | | {:suppressions, boolean} 40 | 41 | 42 | @type split_option :: {:locale, String.t() | map} 43 | | {:break, break_type} 44 | | {:suppressions, boolean} 45 | | {:trim, boolean} 46 | 47 | @type break_or_no_break :: :break | :no_break 48 | 49 | @type break_match :: 50 | {break_or_no_break, {String.t(), {String.t(), String.t()}}} 51 | | {break_or_no_break, {String.t(), String.t()}} 52 | 53 
| @type mode_or_language :: :turkic | nil | %{language: atom()} 54 | 55 | @default_locale "root" 56 | @default_break :word 57 | 58 | @doc """ 59 | Compares two strings in a case insensitive 60 | manner. 61 | 62 | Case folding is applied to the two string 63 | arguments which are then compared with the 64 | `==` operator. 65 | 66 | ## Arguments 67 | 68 | * `string_a` and `string_b` are two strings 69 | to be compared 70 | 71 | ## Returns 72 | 73 | * `true` or `false` 74 | 75 | ## Notes 76 | 77 | * This function applies the [Unicode Case Folding 78 | algorithm](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf) 79 | 80 | * The algorithm does not apply any treatment to diacritical 81 | marks hence "compare strings without accents" is not 82 | part of this function. 83 | 84 | * No string normalization is performed. Where the 85 | normalization state of the string cannot be guaranteed 86 | it is recommended they be normalized before comparison 87 | using `String.normalize(string, :nfc)`. 88 | 89 | ## Examples 90 | 91 | iex> Unicode.String.equals_ignoring_case? "ABC", "abc" 92 | true 93 | 94 | iex> Unicode.String.equals_ignoring_case? "beißen", "beissen" 95 | true 96 | 97 | iex> Unicode.String.equals_ignoring_case? "grüßen", "grussen" 98 | false 99 | 100 | """ 101 | @spec equals_ignoring_case?(String.t(), String.t(), mode_or_language()) :: boolean 102 | def equals_ignoring_case?(string_a, string_b, mode_or_language_tag \\ nil) do 103 | fold(string_a, mode_or_language_tag) == fold(string_b, mode_or_language_tag) 104 | end 105 | 106 | @doc """ 107 | Returns a boolean indicating if the 108 | requested break is applicable 109 | at the point between the two string 110 | segments represented by `{string_before, string_after}`. 111 | 112 | ## Arguments 113 | 114 | * `string_interval` is any 2-tuple consisting 115 | of the string before a possible break and the string 116 | after a possible break. 117 | 118 | * `options` is a keyword list of 119 | options. 
120 | 121 | ## Options 122 | 123 | * `:locale` is any locale returned by 124 | `Unicode.String.Segment.known_segmentation_locales/0` or 125 | `Unicode.String.Dictionary.known_dictionary_locales/0`. 126 | The default is #{inspect(@default_locale)} which corresponds 127 | to the break rules defined by the 128 | [Unicode Segmentation](https://unicode.org/reports/tr29/) rules. 129 | 130 | * `:break` is the type of break. It is one of 131 | `:grapheme`, `:word`, `:line` or `:sentence`. The 132 | default is `#{inspect(@default_break)}`. 133 | 134 | * `:suppressions` is a boolean which, 135 | if `true`, will suppress breaks for common 136 | abbreviations defined for the `locale`. The 137 | default is `true`. 138 | 139 | ## Returns 140 | 141 | * `true` or `false` or 142 | 143 | * raises an exception if there is an error. 144 | 145 | ## Examples 146 | 147 | iex> Unicode.String.break? {"This is ", "some words"} 148 | true 149 | 150 | iex> Unicode.String.break? {"This is ", "some words"}, break: :sentence 151 | false 152 | 153 | iex> Unicode.String.break? {"This is one. ", "This is some words."}, break: :sentence 154 | true 155 | 156 | """ 157 | @spec break?(string_interval :: string_interval(), options :: list(option())) :: 158 | boolean | no_return() 159 | 160 | def break?({string_before, string_after}, options \\ []) do 161 | case break({string_before, string_after}, options) do 162 | {:break, _} -> true 163 | {:no_break, _} -> false 164 | {:error, reason} -> raise ArgumentError, reason 165 | end 166 | end 167 | 168 | @doc """ 169 | Returns match data indicating if the 170 | requested break is applicable 171 | at the point between the two string 172 | segments represented by `{string_before, string_after}`. 173 | 174 | ## Arguments 175 | 176 | * `string_interval` is any 2-tuple consisting 177 | of the string before a possible break and the string 178 | after a possible break. 179 | 180 | * `options` is a keyword list of 181 | options. 
182 | 183 | ## Options 184 | 185 | * `:locale` is any locale returned by 186 | `Unicode.String.Segment.known_segmentation_locales/0` or 187 | `Unicode.String.Dictionary.known_dictionary_locales/0`. 188 | The default is #{inspect(@default_locale)} which corresponds 189 | to the break rules defined by the 190 | [Unicode Segmentation](https://unicode.org/reports/tr29/) rules. 191 | 192 | * `:break` is the type of break. It is one of 193 | `:grapheme`, `:word`, `:line` or `:sentence`. The 194 | default is `#{inspect(@default_break)}`. 195 | 196 | * `:suppressions` is a boolean which, 197 | if `true`, will suppress breaks for common 198 | abbreviations defined for the `locale`. The 199 | default is `true`. 200 | 201 | ## Returns 202 | 203 | A tuple indicating if a break would 204 | be applicable at this point between 205 | `string_before` and `string_after`. 206 | 207 | * `{:break, {string_before, {matched_string, remaining_string}}}` or 208 | 209 | * `{:no_break, {string_before, {matched_string, remaining_string}}}` or 210 | 211 | * `{:error, reason}`. 212 | 213 | ## Examples 214 | 215 | iex> Unicode.String.break {"This is ", "some words"} 216 | {:break, {"This is ", {"s", "ome words"}}} 217 | 218 | iex> Unicode.String.break {"This is ", "some words"}, break: :sentence 219 | {:no_break, {"This is ", {"s", "ome words"}}} 220 | 221 | iex> Unicode.String.break {"This is one. ", "This is some words."}, break: :sentence 222 | {:break, {"This is one. 
", {"T", "his is some words."}}} 223 | 224 | """ 225 | @spec break(string_interval :: string_interval(), options :: list(option())) :: 226 | break_match | error_return 227 | 228 | def break({string_before, string_after}, options \\ []) do 229 | break = Keyword.get(options, :break, @default_break) 230 | 231 | with {:ok, break} <- validate(:break, break), 232 | {:ok, locale} <- segmentation_locale_from_options(break, options), 233 | {:ok, _dictionary} <- Dictionary.ensure_dictionary_loaded_if_available(locale) do 234 | Break.break({string_before, string_after}, locale, break, options) 235 | end 236 | end 237 | 238 | @doc """ 239 | Returns an enumerable that splits a string on demand. 240 | 241 | ## Arguments 242 | 243 | * `string` is any `t:String.t/0`. 244 | 245 | * `options` is a keyword list of 246 | options. 247 | 248 | ## Returns 249 | 250 | * A function that implements the enumerable 251 | protocol or 252 | 253 | * `{:error, reason}` 254 | 255 | ## Options 256 | 257 | * `:locale` is any locale returned by 258 | `Unicode.String.Segment.known_segmentation_locales/0` or 259 | `Unicode.String.Dictionary.known_dictionary_locales/0`. 260 | The default is #{inspect(@default_locale)} which corresponds 261 | to the break rules defined by the 262 | [Unicode Segmentation](https://unicode.org/reports/tr29/) rules. 263 | 264 | * `:break` is the type of break. It is one of 265 | `:grapheme`, `:word`, `:line` or `:sentence`. The 266 | default is `#{inspect(@default_break)}`. 267 | 268 | * `:suppressions` is a boolean which, 269 | if `true`, will suppress breaks for common 270 | abbreviations defined for the `locale`. The 271 | default is `true`. 272 | 273 | * `:trim` is a boolean indicating if segments 274 | the are comprised of only white space are to be 275 | excluded from the returned list. The default 276 | is `false`. 277 | 278 | ## Examples 279 | 280 | iex> enum = Unicode.String.splitter "This is a sentence. 
And another.", break: :word, trim: true 281 | iex> Enum.take enum, 3 282 | ["This", "is", "a"] 283 | 284 | """ 285 | @spec splitter(string :: String.t(), split_options :: list(split_option)) :: 286 | function | error_return 287 | 288 | def splitter(string, options) when is_binary(string) do 289 | break = Keyword.get(options, :break, @default_break) 290 | 291 | with {:ok, break} <- validate(:break, break), 292 | {:ok, locale} <- segmentation_locale_from_options(break, options), 293 | {:ok, _dictionary} <- Dictionary.ensure_dictionary_loaded_if_available(locale) do 294 | Stream.unfold(string, &Break.next(&1, locale, break, options)) 295 | end 296 | end 297 | 298 | @doc """ 299 | Returns next segment in a string. 300 | 301 | ## Arguments 302 | 303 | * `string` is any `t:String.t/0`. 304 | 305 | * `options` is a keyword list of 306 | options. 307 | 308 | ## Returns 309 | 310 | A tuple with the segment and the remainder of the string or `""` 311 | in case the String reached its end. 312 | 313 | * `{next_string, rest_of_the_string}` or 314 | 315 | * `{:error, reason}` 316 | 317 | ## Options 318 | 319 | * `:locale` is any locale returned by 320 | `Unicode.String.Segment.known_segmentation_locales/0` or 321 | `Unicode.String.Dictionary.known_dictionary_locales/0` or 322 | a [Cldr.LanguageTag](https://hexdocs.pm/ex_cldr/Cldr.LanguageTag.html) 323 | struct. The default is #{inspect(@default_locale)} which corresponds 324 | to the break rules defined by the 325 | [Unicode Segmentation](https://unicode.org/reports/tr29/) rules. 326 | 327 | * `:break` is the type of break. It is one of 328 | `:grapheme`, `:word`, `:line` or `:sentence`. The 329 | default is `#{inspect(@default_break)}`. 330 | 331 | * `:suppressions` is a boolean which, 332 | if `true`, will suppress breaks for common 333 | abbreviations defined for the `locale`. The 334 | default is `true`. 335 | 336 | ## Examples 337 | 338 | iex> Unicode.String.next "This is a sentence. 
And another.", break: :word 339 | {"This", " is a sentence. And another."} 340 | 341 | iex> Unicode.String.next "This is a sentence. And another.", break: :sentence 342 | {"This is a sentence. ", "And another."} 343 | 344 | """ 345 | @spec next(string :: String.t(), split_options :: list(split_option)) :: 346 | String.t() | nil | error_return 347 | 348 | def next(string, options \\ []) when is_binary(string) do 349 | break = Keyword.get(options, :break, @default_break) 350 | 351 | with {:ok, break} <- validate(:break, break), 352 | {:ok, locale} <- segmentation_locale_from_options(break, options) do 353 | Break.next(string, locale, break, options) 354 | end 355 | end 356 | 357 | @doc """ 358 | Splits a string according to the 359 | specified break type. 360 | 361 | ## Arguments 362 | 363 | * `string` is any `t:String.t/0`. 364 | 365 | * `options` is a keyword list of 366 | options. 367 | 368 | ## Returns 369 | 370 | * A list of strings after applying the 371 | specified break rules or 372 | 373 | * `{:error, reason}` 374 | 375 | ## Options 376 | 377 | * `:locale` is any locale returned by 378 | `Unicode.String.Segment.known_segmentation_locales/0` or 379 | `Unicode.String.Dictionary.known_dictionary_locales/0` or 380 | a [Cldr.LanguageTag](https://hexdocs.pm/ex_cldr/Cldr.LanguageTag.html) 381 | struct. The default is #{inspect(@default_locale)} which corresponds 382 | to the break rules defined by the 383 | [Unicode Segmentation](https://unicode.org/reports/tr29/) rules. 384 | 385 | * `:break` is the type of break. It is one of 386 | `:grapheme`, `:word`, `:line` or `:sentence`. The 387 | default is `#{inspect(@default_break)}`. 388 | 389 | * `:suppressions` is a boolean which, 390 | if `true`, will suppress breaks for common 391 | abbreviations defined for the `locale`. The 392 | default is `true`. 393 | 394 | * `:trim` is a boolean indicating if segments 395 | the are comprised of only white space are to be 396 | excluded from the returned list. 
The default 397 | is `false`. 398 | 399 | ## Examples 400 | 401 | iex> Unicode.String.split "This is a sentence. And another.", break: :word 402 | ["This", " ", "is", " ", "a", " ", "sentence", ".", " ", "And", " ", "another", "."] 403 | 404 | iex> Unicode.String.split "This is a sentence. And another.", break: :word, trim: true 405 | ["This", "is", "a", "sentence", ".", "And", "another", "."] 406 | 407 | iex> Unicode.String.split "This is a sentence. And another.", break: :sentence 408 | ["This is a sentence. ", "And another."] 409 | 410 | """ 411 | @spec split(string :: String.t(), split_options :: list(split_option)) :: 412 | [String.t(), ...] | error_return 413 | 414 | def split(string, options \\ []) when is_binary(string) do 415 | break = Keyword.get(options, :break, @default_break) 416 | 417 | with {:ok, break} <- validate(:break, break), 418 | {:ok, locale} <- segmentation_locale_from_options(break, options) do 419 | Break.split(string, locale, break, options) 420 | end 421 | |> maybe_trim(options[:trim]) 422 | end 423 | 424 | defp maybe_trim(list, true) when is_list(list) do 425 | Enum.reject(list, &Property.white_space?/1) 426 | end 427 | 428 | defp maybe_trim(list, _) do 429 | list 430 | end 431 | 432 | @doc """ 433 | Return a stream that breaks a string into 434 | graphemes, words, sentences or line breaks. 435 | 436 | ## Arguments 437 | 438 | * `string` is any `t:String.t/0`. 439 | 440 | * `options` is a keyword list of 441 | options. 442 | 443 | ## Returns 444 | 445 | * A stream that is an `t:Enumerable.t/0` that 446 | can be used with the functions in the `Stream` 447 | or `Enum` modules. 448 | 449 | * `{:error, reason}` 450 | 451 | ## Options 452 | 453 | * `:locale` is any locale returned by 454 | `Unicode.String.Segment.known_segmentation_locales/0` or 455 | `Unicode.String.Dictionary.known_dictionary_locales/0` or 456 | a [Cldr.LanguageTag](https://hexdocs.pm/ex_cldr/Cldr.LanguageTag.html) 457 | struct. 
The default is #{inspect(@default_locale)} which corresponds 458 | to the break rules defined by the 459 | [Unicode Segmentation](https://unicode.org/reports/tr29/) rules. 460 | 461 | * `:break` is the type of break. It is one of 462 | `:grapheme`, `:word`, `:line` or `:sentence`. The 463 | default is `#{inspect(@default_break)}`. 464 | 465 | * `:suppressions` is a boolean which, 466 | if `true`, will suppress breaks for common 467 | abbreviations defined for the `locale`. The 468 | default is `true`. 469 | 470 | * `:trim` is a boolean indicating if segments 471 | the are comprised of only white space are to be 472 | excluded from the returned list. The default 473 | is `false`. 474 | 475 | ## Examples 476 | 477 | iex> Enum.to_list Unicode.String.stream("this is a set of words", trim: true) 478 | ["this", "is", "a", "set", "of", "words"] 479 | 480 | iex> Enum.to_list Unicode.String.stream("this is a set of words", break: :sentence, trim: true) 481 | ["this is a set of words"] 482 | 483 | """ 484 | @doc since: "1.2.0" 485 | 486 | @spec stream(string :: String.t(), split_options :: list(split_option)) :: 487 | Enumerable.t() | {:error, String.t()} 488 | 489 | def stream(string, options \\ []) do 490 | break = Keyword.get(options, :break, @default_break) 491 | 492 | with {:ok, break} <- validate(:break, break), 493 | {:ok, locale} <- segmentation_locale_from_options(break, options) do 494 | Stream.resource( 495 | fn -> string end, 496 | fn string -> 497 | case Break.next(string, locale, break, options) do 498 | nil -> {:halt, ""} 499 | {break, rest} -> {[break], rest} 500 | end 501 | end, 502 | fn _ -> :ok end 503 | ) 504 | end 505 | end 506 | 507 | @doc """ 508 | Converts all characters in the given string to upper case 509 | according to the Unicode Casing algorithm. 510 | 511 | ### Arguments 512 | 513 | * `string` is any `t:String.t/0`. 514 | 515 | * `options` is a keyword list of options. 
516 | 517 | ### Options 518 | 519 | * `:locale` is any [ISO 639](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) 520 | language code or a [LanguageTag](https://hexdocs.pm/ex_cldr/Cldr.LanguageTag.html) 521 | which provides integration with [ex_cldr](https://hex.pm/packages/ex_cldr) 522 | applications. The default is `:any` which signifies the 523 | application of the base Unicode casing algorithm. 524 | 525 | ### Notes 526 | 527 | * The locale option determines the use of certain locale-specific 528 | casing rules. Where no specific casing rules apply to 529 | the given locale, the base Unicode casing algorithm is 530 | applied. The locales which have customized casing rules 531 | are returned by `Unicode.String.special_casing_locales/0`. 532 | 533 | ### Returns 534 | 535 | * `upcased_string` 536 | 537 | ### Examples 538 | 539 | # Basic case transformation 540 | iex> Unicode.String.upcase("the quick brown fox") 541 | "THE QUICK BROWN FOX" 542 | 543 | # Dotted-I in Turkish and Azeri 544 | iex> Unicode.String.upcase("Diyarbakır", locale: :tr) 545 | "DİYARBAKIR" 546 | 547 | # Upper case in Greek removes diacritics 548 | iex> Unicode.String.upcase("Πατάτα, Αέρας, Μυστήριο", locale: :el) 549 | "ΠΑΤΑΤΑ, ΑΕΡΑΣ, ΜΥΣΤΗΡΙΟ" 550 | 551 | """ 552 | @doc since: "1.3.0" 553 | 554 | @spec upcase(String.t(), Keyword.t()) :: String.t() 555 | def upcase(string, options \\ []) when is_list(options) do 556 | with {:ok, locale} <- casing_locale_from_options(options) do 557 | Case.Mapping.upcase(string, locale) 558 | end 559 | end 560 | 561 | @doc """ 562 | Converts all characters in the given string to lower case 563 | according to the Unicode Casing algorithm. 564 | 565 | ### Arguments 566 | 567 | * `string` is any `t:String.t/0`. 568 | 569 | * `options` is a keyword list of options. 
570 | 571 | ### Options 572 | 573 | * `:locale` is any [ISO 639](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) 574 | language code or a [LanguageTag](https://hexdocs.pm/ex_cldr/Cldr.LanguageTag.html) 575 | which provides integration with [ex_cldr](https://hex.pm/packages/ex_cldr) 576 | applications. The default is `:any` which signifies the 577 | application of the base Unicode casing algorithm. 578 | 579 | ### Notes 580 | 581 | * The locale option determines the use of certain locale-specific 582 | casing rules. Where no specific casing rules apply to 583 | the given locale, the base Unicode casing algorithm is 584 | applied. The locales which have customized casing rules 585 | are returned by `Unicode.String.special_casing_locales/0`. 586 | 587 | ### Returns 588 | 589 | * `downcased_string` 590 | 591 | ### Examples 592 | 593 | iex> Unicode.String.downcase("THE QUICK BROWN FOX") 594 | "the quick brown fox" 595 | 596 | # Lower case Greek with a final sigma 597 | iex> Unicode.String.downcase("ὈΔΥΣΣΕΎΣ", locale: :el) 598 | "ὀδυσσεύς" 599 | 600 | # Lower case in Turkish and Azeri correctly handles 601 | # undotted-i and undotted-I 602 | iex> Unicode.String.downcase("DİYARBAKIR", locale: :tr) 603 | "diyarbakır" 604 | 605 | """ 606 | @doc since: "1.3.0" 607 | 608 | @spec downcase(String.t(), Keyword.t()) :: String.t() 609 | def downcase(string, options \\ []) when is_list(options) do 610 | with {:ok, locale} <- casing_locale_from_options(options) do 611 | Case.Mapping.downcase(string, locale) 612 | end 613 | end 614 | 615 | @doc """ 616 | Converts the given string to title case 617 | according to the Unicode Casing algorithm. 618 | 619 | Title casing is the process of transforming 620 | the first character of each word in a string 621 | to upper case and the following characters 622 | in the word to lower case. 623 | 624 | As a result this algorithm does not conform 625 | to the norms of all languages and cultures. 
626 | However special processing is performed for 627 | the Dutch diphthong "IJ" when using the `:nl` 628 | casing locale. 629 | 630 | Further work will focus on improving title 631 | casing of Greek diphthongs. 632 | 633 | ### Arguments 634 | 635 | * `string` is any `t:String.t/0`. 636 | 637 | * `options` is a keyword list of options. 638 | 639 | ### Options 640 | 641 | * `:locale` is any [ISO 639](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) 642 | language code or a [LanguageTag](https://hexdocs.pm/ex_cldr/Cldr.LanguageTag.html) 643 | which provides integration with [ex_cldr](https://hex.pm/packages/ex_cldr) 644 | applications. The default is `:any` which signifies the 645 | application of the base Unicode casing algorithm. 646 | 647 | ### Notes 648 | 649 | * The locale option determines the use of certain locale-specific 650 | casing rules. Where no specific casing rules apply to 651 | the given locale, the base Unicode casing algorithm is 652 | applied. The locales which have customized casing rules 653 | are returned by `Unicode.String.special_casing_locales/0`. 654 | 655 | * The string is broken into words using 656 | `Unicode.String.break/2` which implements the 657 | [Unicode segmentation algorithm](https://unicode.org/reports/tr29/). 658 | 659 | ### Returns 660 | 661 | * `title_cased_string`. 
662 | 663 | ### Examples 664 | 665 | iex> Unicode.String.titlecase("THE QUICK BROWN FOX") 666 | "The Quick Brown Fox" 667 | 668 | # Title case Dutch with leading diphthong 669 | iex> Unicode.String.titlecase("ijsselmeer", locale: :nl) 670 | "IJsselmeer" 671 | 672 | """ 673 | @doc since: "1.3.0" 674 | 675 | @spec titlecase(String.t(), Keyword.t()) :: String.t() 676 | def titlecase(string, options \\ []) when is_list(options) do 677 | with {:ok, casing_locale} <- casing_locale_from_options(options), 678 | {:ok, segmentation_locale} <- segmentation_locale_from_options(:word, options) do 679 | stream_options = Keyword.merge(options, break: :word, locale: segmentation_locale) 680 | 681 | string 682 | |> stream(stream_options) 683 | |> Enum.map(&Case.Mapping.titlecase(&1, casing_locale)) 684 | |> Enum.join() 685 | end 686 | end 687 | 688 | # These locales have some additional processing 689 | # beyond that specified in SpecialCasing.txt 690 | @special_casing_locales [:nl, :el] 691 | @casing_locales (@special_casing_locales ++ Unicode.Utils.known_casing_locales()) 692 | |> Enum.sort() 693 | 694 | @doc """ 695 | Returns a list of locales that have special 696 | casing rules. 
697 | 698 | ### Example 699 | 700 | iex> Unicode.String.special_casing_locales() 701 | [:az, :el, :lt, :nl, :tr] 702 | 703 | """ 704 | def special_casing_locales do 705 | @casing_locales 706 | end 707 | 708 | # 709 | # Helpers 710 | # 711 | 712 | @doc false 713 | def casing_locale(locale) do 714 | casing_locale_from_options(locale: locale) 715 | end 716 | 717 | @doc false 718 | def segmentation_locale(break, locale) do 719 | segmentation_locale_from_options(break, locale: locale) 720 | end 721 | 722 | defp casing_locale_from_options(options) do 723 | options 724 | |> Keyword.get(:locale) 725 | |> match_locale(@casing_locales, :any) 726 | |> wrap(:ok) 727 | end 728 | 729 | @segmentation_locales Segment.known_segmentation_locales() 730 | @dictionary_locales Dictionary.known_dictionary_locales() 731 | 732 | defp segmentation_locale_from_options(:word, options) do 733 | locale = Keyword.get(options, :locale) 734 | segmentation_locale = match_locale(locale, @segmentation_locales, :root) 735 | dictionary_locale = match_locale(locale, @dictionary_locales, nil) 736 | 737 | if dictionary_locale do 738 | Dictionary.ensure_dictionary_loaded_if_available(dictionary_locale) 739 | end 740 | 741 | (dictionary_locale || segmentation_locale) 742 | |> wrap(:ok) 743 | end 744 | 745 | defp segmentation_locale_from_options(_break, options) do 746 | options 747 | |> Keyword.get(:locale) 748 | |> match_locale(@segmentation_locales, :root) 749 | |> wrap(:ok) 750 | end 751 | 752 | @doc false 753 | def dictionary_locale(locale) do 754 | dictionary_locale_from_options(locale: locale) 755 | end 756 | 757 | @dictionary_locales Dictionary.known_dictionary_locales() 758 | 759 | defp dictionary_locale_from_options(options) do 760 | options 761 | |> Keyword.get(:locale) 762 | |> match_locale(@dictionary_locales, nil) 763 | |> wrap(:ok) 764 | end 765 | 766 | defp wrap({:error, _} = error, _) do 767 | error 768 | end 769 | 770 | defp wrap(term, atom) do 771 | {atom, term} 772 | end 773 | 774 | defp 
match_locale(nil, _known_locales, default) do 775 | default 776 | end 777 | 778 | # The Enum.sort/1 here relies on the coincidental fact that the three fields 779 | # are alphabetically in the order we already want 780 | 781 | defp match_locale(locale, known_locales, default) when is_struct(locale, Cldr.LanguageTag) do 782 | locale 783 | |> Map.take([:canonical_locale_name, :cldr_locale_name, :language]) 784 | |> Enum.sort() 785 | |> Keyword.values() 786 | |> Enum.uniq() 787 | |> Enum.map(&atomize/1) 788 | |> find_matching_locale(known_locales, default) 789 | end 790 | 791 | defp match_locale(locale, known_locales, default) when is_binary(locale) do 792 | locale 793 | |> String.split(["-", "_"]) 794 | |> build_candidate_locales() 795 | |> find_matching_locale(known_locales, default) 796 | end 797 | 798 | defp match_locale(locale, known_locales, default) when is_atom(locale) do 799 | if locale in known_locales do 800 | locale 801 | else 802 | match_locale(to_string(locale), known_locales, default) 803 | end 804 | end 805 | 806 | # Means it was a segment match request 807 | defp match_locale(locale, _known_locales, :root) do 808 | {:error, Segment.unknown_locale_error(locale)} 809 | end 810 | 811 | # Means it was a casing match request 812 | defp match_locale(locale, _known_locales, :any) do 813 | {:error, Case.Mapping.unknown_locale_error(locale)} 814 | end 815 | 816 | def find_matching_locale(candidates, known_locales, default) do 817 | Enum.reduce_while(candidates, default, fn candidate, default -> 818 | if candidate in known_locales do 819 | {:halt, candidate} 820 | else 821 | {:cont, default} 822 | end 823 | end) 824 | end 825 | 826 | defp build_candidate_locales([language]) when is_language(language) do 827 | language 828 | |> String.downcase() 829 | |> atomize() 830 | |> List.wrap() 831 | |> Enum.reject(&is_nil/1) 832 | end 833 | 834 | defp build_candidate_locales([language, territory | _rest]) 835 | when is_language(language) and is_territory(territory) do 836 
| language = downcase(language) 837 | territory = upcase(territory) 838 | 839 | Enum.reject([atomize("#{language}-#{territory}"), atomize(language)], &is_nil/1) 840 | end 841 | 842 | defp build_candidate_locales([language, script, territory | _rest]) 843 | when is_language(language) and is_script(script) and is_territory(territory) do 844 | language = downcase(language) 845 | script = titlecase(script) 846 | territory = upcase(territory) 847 | 848 | Enum.reject([ 849 | atomize("#{language}-#{territory}"), 850 | atomize("#{language}-#{script}"), 851 | atomize(language) 852 | ], &is_nil/1) 853 | end 854 | 855 | defp build_candidate_locales([language, script | _rest]) 856 | when is_language(language) and is_script(script) do 857 | language = downcase(language) 858 | script = titlecase(script) 859 | 860 | Enum.reject([atomize("#{language}-#{script}"), atomize(language)], &is_nil/1) 861 | end 862 | 863 | defp build_candidate_locales([language | _rest]) when is_language(language) do 864 | build_candidate_locales([language]) 865 | end 866 | 867 | defp build_candidate_locales(["root"]) do 868 | [:root] 869 | end 870 | 871 | defp build_candidate_locales(_other) do 872 | [] 873 | end 874 | 875 | defp atomize(string) do 876 | String.to_existing_atom(string) 877 | rescue 878 | ArgumentError -> 879 | nil 880 | end 881 | 882 | @breaks [:word, :grapheme, :line, :sentence] 883 | 884 | defp validate(:break, break) do 885 | if break in @breaks do 886 | {:ok, break} 887 | else 888 | {:error, "Unknown break #{inspect(break)}. 
Valid breaks are #{inspect(@breaks)}"} 889 | end 890 | end 891 | end 892 | -------------------------------------------------------------------------------- /lib/unicode/trie.ex: -------------------------------------------------------------------------------- 1 | defmodule Unicode.String.Trie do 2 | def new(list) do 3 | :btrie.new(list) 4 | end 5 | 6 | def find_prefix(string, dictionary) do 7 | :btrie.find_prefix(string, dictionary) 8 | end 9 | 10 | def has_key(string, dictionary) do 11 | :btrie.is_key(string, dictionary) 12 | end 13 | end -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elixir-unicode/unicode_string/2ae703bb30551cf25d0dbf4100908c4986a6bbfc/logo.png -------------------------------------------------------------------------------- /mix.exs: -------------------------------------------------------------------------------- 1 | defmodule Unicode.String.MixProject do 2 | use Mix.Project 3 | 4 | @version "1.7.0" 5 | 6 | def project do 7 | [ 8 | app: :unicode_string, 9 | version: @version, 10 | elixir: "~> 1.11", 11 | start_permanent: Mix.env() == :prod, 12 | build_embedded: Mix.env() == :prod, 13 | deps: deps(), 14 | docs: docs(), 15 | name: "Unicode String", 16 | source_url: "https://github.com/elixir-unicode/unicode_string", 17 | description: description(), 18 | package: package(), 19 | elixirc_paths: elixirc_paths(Mix.env()), 20 | dialyzer: [ 21 | plt_add_apps: ~w(mix sweet_xml)a, 22 | flags: [:underspecs] 23 | ] 24 | ] 25 | end 26 | 27 | defp description do 28 | """ 29 | Unicode locale-aware case folding, case mapping (upcase, downcase and titlecase) 30 | case-insensitive equality as well as word, line, grapheme and sentence 31 | breaking and streaming. 
32 | """ 33 | end 34 | 35 | defp package do 36 | [ 37 | maintainers: ["Kip Cole"], 38 | licenses: ["Apache-2.0"], 39 | logo: "logo.png", 40 | links: links(), 41 | files: [ 42 | "lib", 43 | "priv", 44 | "logo.png", 45 | "mix.exs", 46 | "README*", 47 | "CHANGELOG*", 48 | "LICENSE*" 49 | ] 50 | ] 51 | end 52 | 53 | def application do 54 | [ 55 | extra_applications: [:logger] 56 | ] 57 | end 58 | 59 | defp deps do 60 | [ 61 | {:unicode, "~> 1.19"}, 62 | {:unicode_set, "~> 1.5"}, 63 | 64 | {:trie, "~> 2.0"}, 65 | {:ex_cldr, "~> 2.38", optional: true}, 66 | {:jason, "~> 1.0", optional: true}, 67 | {:sweet_xml, "~> 0.7", runtime: false}, 68 | {:dialyxir, "~> 1.0", only: [:dev, :test], runtime: false}, 69 | # {:benchee, "~> 1.0", only: :dev, optional: true}, 70 | {:ex_doc, "~> 0.23", only: [:dev, :release], optional: true, runtime: false} 71 | ] 72 | end 73 | 74 | def links do 75 | %{ 76 | "GitHub" => "https://github.com/elixir-unicode/unicode_string", 77 | "Readme" => "https://github.com/elixir-unicode/unicode_string/blob/v#{@version}/README.md", 78 | "Changelog" => 79 | "https://github.com/elixir-unicode/unicode_string/blob/v#{@version}/CHANGELOG.md" 80 | } 81 | end 82 | 83 | def docs do 84 | [ 85 | source_ref: "v#{@version}", 86 | main: "readme", 87 | logo: "logo.png", 88 | extras: [ 89 | "README.md", 90 | "LICENSE.md", 91 | "CHANGELOG.md" 92 | ], 93 | skip_undefined_reference_warnings_on: ["changelog", "CHANGELOG.md"], 94 | formatters: ["html"] 95 | ] 96 | end 97 | 98 | defp elixirc_paths(:test), do: ["lib", "mix", "src", "test"] 99 | defp elixirc_paths(:dev), do: ["lib", "mix", "src", "bench"] 100 | defp elixirc_paths(_), do: ["lib", "src"] 101 | end 102 | -------------------------------------------------------------------------------- /mix.lock: -------------------------------------------------------------------------------- 1 | %{ 2 | "cldr_utils": {:hex, :cldr_utils, "2.28.2", "f500667164a9043369071e4f9dcef31f88b8589b2e2c07a1eb9f9fa53cb1dce9", [:mix], [{:castore, 
"~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: true]}, {:certifi, "~> 2.5", [hex: :certifi, repo: "hexpm", optional: true]}, {:decimal, "~> 1.9 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}], "hexpm", "c506eb1a170ba7cdca59b304ba02a56795ed119856662f6b1a420af80ec42551"}, 3 | "decimal": {:hex, :decimal, "2.3.0", "3ad6255aa77b4a3c4f818171b12d237500e63525c2fd056699967a3e7ea20f62", [:mix], [], "hexpm", "a4d66355cb29cb47c3cf30e71329e58361cfcb37c34235ef3bf1d7bf3773aeac"}, 4 | "dialyxir": {:hex, :dialyxir, "1.4.5", "ca1571ac18e0f88d4ab245f0b60fa31ff1b12cbae2b11bd25d207f865e8ae78a", [:mix], [{:erlex, ">= 0.2.7", [hex: :erlex, repo: "hexpm", optional: false]}], "hexpm", "b0fb08bb8107c750db5c0b324fa2df5ceaa0f9307690ee3c1f6ba5b9eb5d35c3"}, 5 | "earmark_parser": {:hex, :earmark_parser, "1.4.44", "f20830dd6b5c77afe2b063777ddbbff09f9759396500cdbe7523efd58d7a339c", [:mix], [], "hexpm", "4778ac752b4701a5599215f7030989c989ffdc4f6df457c5f36938cc2d2a2750"}, 6 | "erlex": {:hex, :erlex, "0.2.7", "810e8725f96ab74d17aac676e748627a07bc87eb950d2b83acd29dc047a30595", [:mix], [], "hexpm", "3ed95f79d1a844c3f6bf0cea61e0d5612a42ce56da9c03f01df538685365efb0"}, 7 | "ex_cldr": {:hex, :ex_cldr, "2.42.0", "17ea930e88b8802b330e1c1e288cdbaba52cbfafcccf371ed34b299a47101ffb", [:mix], [{:cldr_utils, "~> 2.28", [hex: :cldr_utils, repo: "hexpm", optional: false]}, {:decimal, "~> 1.6 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}, {:gettext, "~> 0.19", [hex: :gettext, repo: "hexpm", optional: true]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: true]}, {:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: true]}], "hexpm", "07264a7225810ecae6bdd6715d8800c037a1248dc0063923cddc4ca3c4888df6"}, 8 | "ex_doc": {:hex, :ex_doc, "0.37.3", "f7816881a443cd77872b7d6118e8a55f547f49903aef8747dbcb345a75b462f9", [:mix], [{:earmark_parser, "~> 1.4.42", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_c, ">= 0.1.0", [hex: 
:makeup_c, repo: "hexpm", optional: true]}, {:makeup_elixir, "~> 0.14 or ~> 1.0", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1 or ~> 1.0", [hex: :makeup_erlang, repo: "hexpm", optional: false]}, {:makeup_html, ">= 0.1.0", [hex: :makeup_html, repo: "hexpm", optional: true]}], "hexpm", "e6aebca7156e7c29b5da4daa17f6361205b2ae5f26e5c7d8ca0d3f7e18972233"}, 9 | "jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"}, 10 | "makeup": {:hex, :makeup, "1.2.1", "e90ac1c65589ef354378def3ba19d401e739ee7ee06fb47f94c687016e3713d1", [:mix], [{:nimble_parsec, "~> 1.4", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "d36484867b0bae0fea568d10131197a4c2e47056a6fbe84922bf6ba71c8d17ce"}, 11 | "makeup_elixir": {:hex, :makeup_elixir, "1.0.1", "e928a4f984e795e41e3abd27bfc09f51db16ab8ba1aebdba2b3a575437efafc2", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "7284900d412a3e5cfd97fdaed4f5ed389b8f2b4cb49efc0eb3bd10e2febf9507"}, 12 | "makeup_erlang": {:hex, :makeup_erlang, "1.0.2", "03e1804074b3aa64d5fad7aa64601ed0fb395337b982d9bcf04029d68d51b6a7", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "af33ff7ef368d5893e4a267933e7744e46ce3cf1f61e2dccf53a111ed3aa3727"}, 13 | "nimble_parsec": {:hex, :nimble_parsec, "1.4.2", "8efba0122db06df95bfaa78f791344a89352ba04baedd3849593bfce4d0dc1c6", [:mix], [], "hexpm", "4b21398942dda052b403bbe1da991ccd03a053668d147d53fb8c4e0efe09c973"}, 14 | "sweet_xml": {:hex, :sweet_xml, "0.7.5", "803a563113981aaac202a1dbd39771562d0ad31004ddbfc9b5090bdcd5605277", [:mix], [], "hexpm", 
"193b28a9b12891cae351d81a0cead165ffe67df1b73fe5866d10629f4faefb12"}, 15 | "trie": {:hex, :trie, "2.0.7", "09fa6b08cda978fe97e5b68cd4fca68c6d6fba8e941a9e66c75a4b4bf383af91", [:rebar3], [], "hexpm", "6b86092654bc6383d5c72dfbb32b466d3a70d3e95be37538bb5500ee888fa944"}, 16 | "unicode": {:hex, :unicode, "1.20.0", "10189cfe98b03ebb8be6efd00df0936c1c94d75bfbd62cba2bdf958fef3ee4a7", [:mix], [], "hexpm", "fa581cf80b3b1b7f42e4d24a69109dfac465cec27a62c661306c81f4ab35894c"}, 17 | "unicode_set": {:hex, :unicode_set, "1.5.0", "f2dcc40b1e8daf1a04433c705d9a8fb8ccdfc8fd5763a92d414a3e0775414cfb", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}, {:unicode, "~> 1.13", [hex: :unicode, repo: "hexpm", optional: false]}], "hexpm", "6c7f200e52fb90434d6b783eaa4e0ea303cfc4844ea25b2fc1ba3eb8a6901b11"}, 18 | } 19 | -------------------------------------------------------------------------------- /mix/myapp_backend.ex: -------------------------------------------------------------------------------- 1 | defmodule MyApp.Cldr do 2 | use Cldr, 3 | locales: ["en", "de", "tr", "az", "fr-CA", "lt", "fr", "sv", "ar"], 4 | default_locale: "en", 5 | providers: [] 6 | end -------------------------------------------------------------------------------- /priv/dictionaries/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elixir-unicode/unicode_string/2ae703bb30551cf25d0dbf4100908c4986a6bbfc/priv/dictionaries/.gitkeep -------------------------------------------------------------------------------- /priv/segments/de.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | Port. 13 | Alt. 14 | Di. 15 | Ges. 16 | frz. 17 | entspr. 18 | Gebr. 19 | erw. 20 | Frl. 21 | Inh. 22 | k.u.k. 23 | Ca. 24 | J.D. 25 | Ausg. 26 | evtl. 27 | So. 28 | i.B. 29 | s.a. 30 | kgl. 31 | Sept. 32 | o.B. 33 | Sa. 34 | ev. 
35 | Dez. 36 | am. 37 | i.R. 38 | eigtl. 39 | i.J. 40 | u.U. 41 | G. 42 | z.Hd. 43 | u.A.w.g. 44 | Kl. 45 | Spezif. 46 | Obj. 47 | Ing. 48 | D. h. 49 | Folg. 50 | Akt. 51 | i.A. 52 | Msp. 53 | U.U. 54 | Chr. 55 | R. 56 | Einh. 57 | schwäb. 58 | Vgl. 59 | Aug. 60 | Dipl.-Ing. 61 | W. 62 | B. 63 | U. U. 64 | J. 65 | Fa. 66 | Mo. 67 | n.u.Z. 68 | Op. 69 | Mrd. 70 | e.h. 71 | Hr. 72 | Hrn. 73 | Ztr. 74 | k. u. k. 75 | Bibl. 76 | d.Ä. 77 | b. 78 | M. 79 | i.H. 80 | v.R.w. 81 | o.A. 82 | St. 83 | Dr. 84 | Fn. 85 | Abs. 86 | Rd. 87 | Dtzd. 88 | Jahrh. 89 | Z. 90 | Std. 91 | n. Chr. 92 | möbl. 93 | tägl. 94 | gest. 95 | gesch. 96 | z.B. 97 | Hbf. 98 | Abt. 99 | A.M. 100 | e.Wz. 101 | v.T. 102 | Nov. 103 | z. 104 | Prot. 105 | U.S. 106 | Wg. 107 | u.v.a. 108 | Adr. 109 | App. 110 | ggf. 111 | ggfs. 112 | Jan. 113 | O. 114 | Rel. 115 | od. 116 | Pfd. 117 | a.a.O. 118 | p.Adr. 119 | P. 120 | Gem. 121 | v. Chr. 122 | Art. 123 | z.Z. 124 | S.A. 125 | i.V. 126 | verh. 127 | Ausschl. 128 | m.W. 129 | Dir. 130 | Verf. 131 | Sek. 132 | r. 133 | Chin. 134 | Feb. 135 | Int. 136 | Sep. 137 | Gesch. 138 | schweiz. 139 | Bed. 140 | a.Rh. 141 | jew. 142 | vgl. 143 | a.M. 144 | Str. 145 | exkl. 146 | gek. 147 | Erf. 148 | u.Ä. 149 | ehem. 150 | näml. 151 | u. Z. 152 | v. u. Z. 153 | sog. 154 | C. 155 | Dipl.-Kfm. 156 | mtl. 157 | Hrsg. 158 | Qu. 159 | röm. 160 | u. 161 | U. 162 | Adj. 163 | Kap. 164 | hpts. 165 | a.D. 166 | gedr. 167 | Best. 168 | N. 169 | v.u.Z. 170 | Phys. 171 | Fr. 172 | d.J. 173 | Reg.-Bez. 174 | m.E. 175 | schles. 176 | Max. 177 | Ltd. 178 | südd. 179 | inkl. 180 | geb. 181 | Ggf. 182 | Inc. 183 | kath. 184 | kfm. 185 | Nr. 186 | Proz. 187 | Dim. 188 | verw. 189 | Reg. 190 | Dat. 191 | Evtl. 192 | led. 193 | F. 194 | Test. 195 | Schr. 196 | Do. 197 | PIN. 198 | Z. Zt. 199 | v.Chr. 200 | Tägl. 201 | s. 202 | amtl. 203 | Temp. 204 | Mind. 205 | e.V. 206 | Abw. 207 | P.M. 208 | F.f. 209 | a.a.S. 210 | Mod. 211 | Co. 212 | Min. 213 | Allg. 214 | Geograph. 215 | Jr. 
216 | Urspr. 217 | Apr. 218 | Z. B. 219 | v.H. 220 | A. 221 | einschl. 222 | Trans. 223 | zzgl. 224 | StR. 225 | Fam. 226 | I. 227 | jhrl. 228 | u.a. 229 | Ben. 230 | o.g. 231 | Kfm. 232 | Konv. 233 | Mi. 234 | L. 235 | beil. 236 | T. 237 | Ursprüngl. 238 | röm.-kath. 239 | Okt. 240 | u.ä. 241 | Tel. 242 | D. 243 | Ber. 244 | Kop. 245 | Mio. 246 | Y. 247 | U.S.A. 248 | v. H. 249 | Forts. f. 250 | Rep. 251 | Hptst. 252 | österr. 253 | 254 | 255 | 256 | 257 | 258 | -------------------------------------------------------------------------------- /priv/segments/el.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | [[$STerm] [\u003B \u037E]] 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /priv/segments/en.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | L.P. 13 | Alt. 14 | Approx. 15 | E.G. 16 | O. 17 | Maj. 18 | Misc. 19 | P.O. 20 | J.D. 21 | Jam. 22 | Card. 23 | Dec. 24 | Sept. 25 | MR. 26 | Long. 27 | Hat. 28 | G. 29 | Link. 30 | DC. 31 | D.C. 32 | M.T. 33 | Hz. 34 | Mrs. 35 | By. 36 | Act. 37 | Var. 38 | N.V. 39 | Aug. 40 | B. 41 | S.A. 42 | Up. 43 | Job. 44 | Num. 45 | M.I.T. 46 | Ok. 47 | Org. 48 | Ex. 49 | Cont. 50 | U. 51 | Mart. 52 | Fn. 53 | Abs. 54 | Lt. 55 | OK. 56 | Z. 57 | E. 58 | Kb. 59 | Est. 60 | A.M. 61 | L.A. 62 | Prof. 63 | U.S. 64 | Nov. 65 | Ph.D. 66 | Mar. 67 | I.T. 68 | exec. 69 | Jan. 70 | N.Y. 71 | X. 72 | Md. 73 | Op. 74 | vs. 75 | D.A. 76 | A.D. 77 | R.L. 78 | P.M. 79 | Or. 80 | M.R. 81 | Cap. 82 | PC. 83 | Feb. 84 | Exec. 85 | I.e. 86 | Sep. 87 | Gb. 88 | K. 89 | U.S.C. 90 | Mt. 91 | S. 92 | A.S. 93 | C.O.D. 94 | Capt. 95 | Col. 96 | In. 97 | C.F. 98 | Adj. 99 | AD. 100 | I.D. 101 | Mgr. 102 | R.T. 103 | B.V. 104 | M. 105 | Conn. 106 | Yr. 107 | Rev. 108 | Phys. 109 | pp. 110 | Ms. 111 | To. 
112 | Sgt. 113 | J.K. 114 | Nr. 115 | Jun. 116 | Fri. 117 | S.A.R. 118 | Lev. 119 | Lt.Cdr. 120 | Def. 121 | F. 122 | Do. 123 | Joe. 124 | Id. 125 | Mr. 126 | Dept. 127 | Is. 128 | Pvt. 129 | Diff. 130 | Hon.B.A. 131 | Q. 132 | Mb. 133 | On. 134 | Min. 135 | J.B. 136 | Ed. 137 | AB. 138 | A. 139 | S.p.A. 140 | I. 141 | a.m. 142 | Comm. 143 | Go. 144 | VS. 145 | L. 146 | All. 147 | PP. 148 | P.V. 149 | T. 150 | K.R. 151 | Etc. 152 | D. 153 | Adv. 154 | Lib. 155 | E.g. 156 | Pro. 157 | U.S.A. 158 | S.E. 159 | AA. 160 | Rep. 161 | Sq. 162 | As. 163 | 164 | 165 | 166 | 167 | 168 | -------------------------------------------------------------------------------- /priv/segments/en_US.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /priv/segments/en_US_POSIX.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | [[$MidNumLet]-[.]] 19 | [[$MidNum] [.]] 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /priv/segments/es.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | Rdos. 13 | JJ.OO. 14 | Sres. 15 | fig. 16 | may. 17 | RR.HH. 18 | oct. 19 | cap. 20 | mié. 21 | doc. 22 | Excmo. 23 | Trab. 24 | Excmos. 25 | Kit. 26 | Inc. 27 | FF.CC. 28 | DC. 29 | ago. 30 | trad. 31 | SA. 32 | Rvdos. 33 | ed. 34 | Exmo. 35 | jul. 36 | col. 37 | RAM. 38 | Srtas. 39 | ene. 40 | Rol. 41 | Fabric. 42 | Comm. 43 | vid. 44 | Da. 45 | dic. 46 | ss. 47 | abr. 48 | ntra. 49 | Sra. 50 | dtor. 51 | cf. 52 | dom. 53 | prov. 54 | Emm. 55 | Sr. 56 | licdo. 57 | p.ej. 58 | bol. 59 | figs. 60 | Vda. 61 | Dr. 62 | ntro. 63 | Desv. 64 | O.M. 65 | Ldo. 66 | Drs. 67 | sáb. 68 | feb. 69 | Ltda. 
70 | Lcda. 71 | Exma. 72 | C.V. 73 | SS.MM. 74 | Lda. 75 | U.S. 76 | hnos. 77 | R.D. 78 | Korn. 79 | v.gr. 80 | vs. 81 | Ilmas. 82 | Rdo. 83 | ej. 84 | vie. 85 | jue. 86 | a. C. 87 | Ilmos. 88 | e. c. 89 | Excma. 90 | afma. 91 | licda. 92 | Em. 93 | K. 94 | sras. 95 | MM. 96 | fund. 97 | Mons. 98 | Lcdo. 99 | afmo. 100 | C. 101 | A.C. 102 | dptos. 103 | Col. 104 | Srta. 105 | Av. 106 | Ant. 107 | depto. 108 | Var. 109 | H.P. 110 | D. 111 | M. 112 | C.P. 113 | Rev. 114 | Rvdmos. 115 | Fr. 116 | Ilmo. 117 | afmos. 118 | Ltd. 119 | afmas. 120 | prof. 121 | lun. 122 | SS.AA. 123 | Sol. 124 | nov. 125 | mss. 126 | Dña. 127 | Seg. 128 | mar. 129 | Rvdmo. 130 | Reg. 131 | ms. 132 | Sras. 133 | sres. 134 | U.S.A. 135 | Sta. 136 | Sdad. 137 | Dra. 138 | srs. 139 | R.U. 140 | deptos. 141 | dpto. 142 | jun. 143 | bco. 144 | Cía. 145 | Id. 146 | Mr. 147 | e.g. 148 | C.S. 149 | Excmas. 150 | Dª. 151 | Rvdo. 152 | Lic. 153 | cfr. 154 | Corp. 155 | Dto. 156 | Ilma. 157 | L. 158 | All. 159 | PP. 160 | d. C. 161 | Ltdo. 162 | mtro. 163 | Mrs. 164 | Desc. 165 | Avda. 166 | Exmas. 167 | a. e. c. 168 | Bien. 169 | Exmos. 170 | AA. 171 | Sto. 172 | CA. 173 | sept. 174 | Exc. 175 | c/c. 176 | 177 | 178 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /priv/segments/fi.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | \p{Word_Break=MidLetter} 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /priv/segments/fr.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | aux. 13 | config. 14 | collab. 15 | M. 16 | dim. 17 | imprim. 18 | oct. 19 | syst. 20 | bull. 21 | MM. 22 | doc. 23 | P.O. 24 | hôp. 25 | Mart. 26 | juil. 27 | broch. 28 | adr. 29 | symb. 30 | C. 31 | anc. 32 | voit. 33 | Jr. 34 | graph. 35 | dir. 
36 | éd. 37 | fig. 38 | édit. 39 | niv. 40 | quart. 41 | cam. 42 | éval. 43 | anon. 44 | réf. 45 | Comm. 46 | Prof. 47 | févr. 48 | indus. 49 | DC. 50 | équiv. 51 | illustr. 52 | acoust. 53 | nov. 54 | L. 55 | All. 56 | U.S. 57 | S.M.A.R.T. 58 | sept. 59 | avr. 60 | jeu. 61 | dest. 62 | P.-D. G. 63 | ill. 64 | coll. 65 | encycl. 66 | mer. 67 | Desc. 68 | ven. 69 | P. 70 | lun. 71 | Inc. 72 | sam. 73 | D. 74 | append. 75 | Var. 76 | categ. 77 | janv. 78 | S.A. 79 | imm. 80 | U.S.A. 81 | mar. 82 | exempl. 83 | déc. 84 | ann. 85 | U. 86 | synth. 87 | dict. 88 | av. J.-C. 89 | W. 90 | Op. 91 | ap. J.-C. 92 | gouv. 93 | trav. publ. 94 | 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /priv/segments/it.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | N.B. 13 | div. 14 | a.C. 15 | fig. 16 | d.p.R. 17 | c.c.p. 18 | Cfr. 19 | vol. 20 | Geom. 21 | O.d.G. 22 | S.p.A. 23 | ver. 24 | N.d.A. 25 | dott. 26 | arch. 27 | d.C. 28 | N.d.T. 29 | rag. 30 | Sig. 31 | Mod. 32 | pag. 33 | dr. 34 | tav. 35 | N.d.E. 36 | DC. 37 | mitt. 38 | Ing. 39 | int. 40 | on. 41 | C.P. 42 | ag. 43 | L. 44 | U.S. 45 | S.M.A.R.T. 46 | p.i. 47 | tab. 48 | Ltd. 49 | Liv. 50 | D. 51 | U.S.A. 52 | sez. 53 | avv. 54 | S.A.R. 55 | all. 56 | p. 
57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /priv/segments/ja.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | [[\p{Line_Break=Ideographic}] [$CJ]] 17 | \p{Line_Break=Nonstarter} 18 | 19 | 20 | 21 | 22 | \p{Hiragana} 23 | ($Hiragana $FE*) 24 | [[\p{Ideographic}] [\u3005 \u3007 \u303B]] 25 | ($Ideographic $FE*) 26 | 27 | 28 | $Hiragana × $Hiragana 29 | $Ideographic × $Ideographic 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /priv/segments/pt.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | psicol. 13 | fig. 14 | compl. 15 | rep. 16 | cap. 17 | doc. 18 | fisiol. 19 | dipl. 20 | astron. 21 | port. 22 | eletrôn. 23 | geom. 24 | mov. 25 | ago. 26 | trad. 27 | arquit. 28 | dez. 29 | ed. 30 | apt. 31 | Exmo. 32 | col. 33 | ff. 34 | univ. 35 | res. 36 | R. 37 | transp. 38 | D.C 39 | l. 40 | des. 41 | fev. 42 | abr. 43 | liter. 44 | lat. 45 | Dir. 46 | cf. 47 | adm. 48 | fot. 49 | p.m. 50 | P.M. 51 | créd. 52 | jur. 53 | com. 54 | anat. 55 | dir. 56 | end. 57 | fís. 58 | E. 59 | Est. 60 | cont. 61 | matem. 62 | Drs. 63 | gên. 64 | neol. 65 | pág. 66 | índ. 67 | Ltda. 68 | Exma. 69 | esp. 70 | ingl. 71 | tecnol. 72 | Mar. 73 | símb. 74 | Pe. 75 | pal. 76 | filos. 77 | V.T. 78 | fasc. 79 | vs. 80 | mai. 81 | S.A. 82 | profa. 83 | N.Sra. 84 | r.s.v.p. 85 | cel. 86 | mat. 87 | abrev. 88 | out. 89 | long. 90 | aux. 91 | arit. 92 | aer. 93 | jul. 94 | lin. 95 | S. 96 | méd. 97 | odontol. 98 | org. 99 | A.C. 100 | jun. 101 | déb. 102 | Av. 103 | álg. 104 | sup. 105 | fl. 106 | odont. 107 | caps. 108 | relat. 109 | organiz. 110 | hist. 111 | Fr. 112 | Ilmo. 113 | fem. 114 | ap. 115 | Ltd. 116 | pol. 117 | séc. 118 | prof. 119 | cx. 120 | nov. 121 | quím. 122 | mús. 123 | agric. 
124 | mar. 125 | W.C. 126 | fr. 127 | cat. 128 | jan. 129 | pron. 130 | rel. 131 | autom. 132 | Sta. 133 | Dra. 134 | p. 135 | tel. 136 | div. 137 | p. ex. 138 | a.C. 139 | bras. 140 | Alm. 141 | Dr. 142 | comp. 143 | pq. 144 | arqueol. 145 | náut. 146 | biogr. 147 | f. 148 | círc. 149 | fac. 150 | d.C. 151 | apart. 152 | ex. 153 | Jr. 154 | set. 155 | tec. 156 | sociol. 157 | gram. 158 | ind. 159 | Ilma. 160 | vol. 161 | eng. 162 | rod. 163 | Ph.D. 164 | Dras. 165 | pp. 166 | elem. 167 | máq. 168 | cód. 169 | eletr. 170 | prod. 171 | ref. 172 | fil. 173 | a.m. 174 | A.M 175 | obs. 176 | N.T. 177 | contab. 178 | Sto. 179 | lit. 180 | educ. 181 | rementente 182 | desc. 183 | próx. 184 | 185 | 186 | 187 | 188 | 189 | -------------------------------------------------------------------------------- /priv/segments/root.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | \p{Grapheme_Cluster_Break=CR} 18 | \p{Grapheme_Cluster_Break=LF} 19 | \p{Grapheme_Cluster_Break=Control} 20 | \p{Grapheme_Cluster_Break=Extend} 21 | \p{Grapheme_Cluster_Break=ZWJ} 22 | \p{Grapheme_Cluster_Break=Regional_Indicator} 23 | \p{Grapheme_Cluster_Break=Prepend} 24 | \p{Grapheme_Cluster_Break=SpacingMark} 25 | \p{Grapheme_Cluster_Break=L} 26 | \p{Grapheme_Cluster_Break=V} 27 | \p{Grapheme_Cluster_Break=T} 28 | \p{Grapheme_Cluster_Break=LV} 29 | \p{Grapheme_Cluster_Break=LVT} 30 | 31 | 32 | 33 | 34 | [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] 35 | \p{Indic_Conjunct_Break=Linker} 36 | \p{Indic_Conjunct_Break=Consonant} 37 | \p{Extended_Pictographic} 38 | [\p{Indic_Conjunct_Break=Linker}\p{Indic_Conjunct_Break=Extend}] 39 | 40 | 41 | 42 | 43 | 44 | $CR × $LF 45 | ( $Control | $CR | $LF ) ÷ 46 | ÷ ( $Control | $CR | $LF ) 47 | 48 | $L × ( $L | $V | $LV | $LVT ) 49 | ( $LV | $V ) × ( $V | $T ) 50 | ( $LVT | $T) × $T 51 | × ($Extend | $ZWJ) 52 | 53 | × $SpacingMark 54 | $Prepend × 55 | 
$LinkingConsonant $ExtCccZwj* $ConjunctLinker $ExtCccZwj* × $LinkingConsonant 56 | $ExtPict $Extend* $ZWJ × $ExtPict 57 | 58 | ^ ($RI $RI)* $RI × $RI 59 | [^$RI] ($RI $RI)* $RI × $RI 60 | 61 | 62 | 63 | 64 | 65 | 66 | \p{Line_Break=Ambiguous} 67 | \p{Line_Break=Aksara} 68 | \p{Line_Break=Alphabetic} 69 | \p{Line_Break=Aksara_Prebase} 70 | \p{Line_Break=Aksara_Start} 71 | \p{Line_Break=Break_Both} 72 | \p{Line_Break=Break_After} 73 | \p{Line_Break=Break_Before} 74 | \p{Line_Break=Mandatory_Break} 75 | \p{Line_Break=Contingent_Break} 76 | \p{Line_Break=Close_Punctuation} 77 | \p{Line_Break=CP} 78 | \p{Line_Break=Combining_Mark} 79 | \p{Line_Break=Carriage_Return} 80 | \p{Line_Break=Exclamation} 81 | \p{Line_Break=Glue} 82 | \p{Line_Break=H2} 83 | \p{Line_Break=H3} 84 | \p{Line_Break=HL} 85 | \p{Line_Break=Hyphen} 86 | \p{Line_Break=Ideographic} 87 | \p{Line_Break=Inseparable} 88 | \p{Line_Break=Infix_Numeric} 89 | \p{Line_Break=JL} 90 | \p{Line_Break=JT} 91 | \p{Line_Break=JV} 92 | \p{Line_Break=Line_Feed} 93 | \p{Line_Break=Next_Line} 94 | \p{Line_Break=Nonstarter} 95 | \p{Line_Break=Numeric} 96 | \p{Line_Break=Open_Punctuation} 97 | \p{Line_Break=Postfix_Numeric} 98 | \p{Line_Break=Prefix_Numeric} 99 | \p{Line_Break=Quotation} 100 | \p{Line_Break=Complex_Context} 101 | \p{Line_Break=Surrogate} 102 | \p{Line_Break=Space} 103 | \p{Line_Break=Break_Symbols} 104 | \p{Line_Break=Virama_Final} 105 | \p{Line_Break=Virama} 106 | \p{Line_Break=Word_Joiner} 107 | \p{Line_Break=Unknown} 108 | \p{Line_Break=ZWSpace} 109 | \p{Line_Break=Conditional_Japanese_Starter} 110 | \p{Line_Break=Regional_Indicator} 111 | \p{Line_Break=E_Base} 112 | \p{Line_Break=E_Modifier} 113 | \p{Line_Break=ZWJ} 114 | \p{Line_Break=ZWJ} 115 | [$QU & \p{gc=Pi}] 116 | [$QU & \p{gc=Pf}] 117 | [$QU - \p{gc=Pi}] 118 | [$QU - \p{gc=Pf}] 119 | [^\p{ea=F}\p{ea=W}\p{ea=H}] 120 | [$BA & $NotEastAsian] 121 | 122 | [\u2010] 123 | [$CP-[\p{ea=F}\p{ea=W}\p{ea=H}]] 124 | [$OP-[\p{ea=F}\p{ea=W}\p{ea=H}]] 125 | 
[\p{Extended_Pictographic}&\p{gc=Cn}] 126 | 127 | 128 | 129 | ^ 130 | (?!.) 131 | 132 | [$CM1 $ZWJ] 133 | 134 | 135 | 136 | [$AI $AL $SG $XX $SA] 137 | [$NS $CJ] 138 | 139 | 140 | 141 | $CM* 142 | 143 | [$SP $BK $CR $LF $NL $ZW] 144 | [^ $SP $BK $CR $LF $NL $ZW] 145 | [^ $SP $BA $HY $CM] 146 | [^ $BA $HY $CM] 147 | [^ $NU $CM] 148 | ($AI $X) 149 | ($AK $X) 150 | ($AL $X) 151 | ($AP $X) 152 | ($AS $X) 153 | ($B2 $X) 154 | ($BA $X) 155 | ($BB $X) 156 | ($CB $X) 157 | ($CL $X) 158 | ($CP $X) 159 | ($CM $X) 160 | ($EX $X) 161 | ($GL $X) 162 | ($H2 $X) 163 | ($H3 $X) 164 | ($HL $X) 165 | ($HY $X) 166 | ($ID $X) 167 | ($IN $X) 168 | ($IS $X) 169 | ($JL $X) 170 | ($JT $X) 171 | ($JV $X) 172 | ($NS $X) 173 | ($NU $X) 174 | ($OP $X) 175 | ($PO $X) 176 | ($PR $X) 177 | ($QU $X) 178 | ($SA $X) 179 | ($SG $X) 180 | ($SY $X) 181 | ($VF $X) 182 | ($VI $X) 183 | ($WJ $X) 184 | ($XX $X) 185 | ($RI $X) 186 | ($EB $X) 187 | ($EM $X) 188 | ($ZWJ $X) 189 | ($QU_Pi $X) 190 | ($QU_Pf $X) 191 | ($QUmPi $X) 192 | ($QUmPf $X) 193 | ( $NotEastAsian | [$NotEastAsian - $Spec1_] $X) 194 | (NonEastAsianBA $X) 195 | ($DottedCircle $X) 196 | ($Hyphen $X) 197 | ($CP30 $X) 198 | ($OP30 $X) 199 | 200 | 201 | ($AL | ^ $CM | (?<=$Spec1_) $CM) 202 | ( $NotEastAsian | ^ $CM | (?<=$Spec1_) $CM ) 203 | 204 | 205 | 206 | 207 | $BK ÷ 208 | 209 | $CR × $LF 210 | $CR ÷ 211 | $LF ÷ 212 | $NL ÷ 213 | 214 | × ( $BK | $CR | $LF | $NL ) 215 | 216 | × $SP 217 | × $ZW 218 | 219 | $ZW $SP* ÷ 220 | 221 | $ZWJ_O × 222 | 223 | 224 | $Spec2_ × $CM 225 | × $WJ 226 | $WJ × 227 | 228 | $GL × 229 | $Spec3a_ × $GL 230 | $Spec3b_ $CM+ × $GL 231 | ^ $CM+ × $GL 232 | 233 | × $EX 234 | × $CL 235 | × $CP 236 | × $SY 237 | 238 | $OP $SP* × 239 | 240 | 241 | 242 | ( $sot | $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW ) $QU_Pi $SP* × 243 | 244 | 245 | × $QU_Pf ( $SP | $GL | $WJ | $CL | $QU | $CP | $EX | $IS | $SY | $BK | $CR | $LF | $NL | $ZW | $eot ) 246 | 247 | $SP ÷ $IS $NU 248 | 249 | × $IS 250 | 251 | ($CL | $CP) $SP* × 
$NS 252 | 253 | $B2 $SP* × $B2 254 | 255 | $SP ÷ 256 | 257 | × $QUmPi 258 | $QUmPf × 259 | 260 | $NotEastAsian × $QU 261 | × $QU ( $NotEastAsian | $eot ) 262 | $QU × $NotEastAsian 263 | ( $sot | $NotEastAsian ) $QU × 264 | 265 | ÷ $CB 266 | $CB ÷ 267 | 268 | ( $sot | $BK | $CR | $LF | $NL | $SP | $ZW | $CB | $GL ) ( $HY | $Hyphen ) × $AL 269 | 270 | × $BA 271 | × $HY 272 | × $NS 273 | $BB × 274 | 275 | $HL ($HY | $NonEastAsianBA) × [^$HL] 276 | 277 | $SY × $HL 278 | 279 | × $IN 280 | 281 | ($AL | $HL) × $NU 282 | $NU × ($AL | $HL) 283 | 284 | $PR × ($ID | $EB | $EM) 285 | ($ID | $EB | $EM) × $PO 286 | 287 | ($PR | $PO) × ($AL | $HL) 288 | ($AL | $HL) × ($PR | $PO) 289 | 290 | $NU ( $SY | $IS )* $CL × $PO 291 | $NU ( $SY | $IS )* $CP × $PO 292 | $NU ( $SY | $IS )* $CL × $PR 293 | $NU ( $SY | $IS )* $CP × $PR 294 | $NU ( $SY | $IS )* × $PO 295 | $NU ( $SY | $IS )* × $PR 296 | $PO × $OP $NU 297 | $PO × $OP $IS $NU 298 | $PO × $NU 299 | $PR × $OP $NU 300 | $PR × $OP $IS $NU 301 | $PR × $NU 302 | $HY × $NU 303 | $IS × $NU 304 | $NU ( $SY | $IS )* × $NU 305 | 306 | $JL × $JL | $JV | $H2 | $H3 307 | $JV | $H2 × $JV | $JT 308 | $JT | $H3 × $JT 309 | 310 | $JL | $JV | $JT | $H2 | $H3 × $PO 311 | $PR × $JL | $JV | $JT | $H2 | $H3 312 | 313 | ($AL | $HL) × ($AL | $HL) 314 | 315 | $AP × ($AK | $DottedCircle | $AS) 316 | ($AK | $DottedCircle | $AS) × ($VF | $VI) 317 | ($AK | $DottedCircle | $AS) $VI × ($AK | $DottedCircle) 318 | ($AK | $DottedCircle | $AS) × ($AK | $DottedCircle | $AS) $VF 319 | 320 | $IS × ($AL | $HL) 321 | 322 | ($AL | $HL | $NU) × $OP30 323 | $CP30 × ($AL | $HL | $NU) 324 | 325 | $sot ($RI $RI)* $RI × $RI 326 | [^$RI] ($RI $RI)* $RI × $RI 327 | $RI ÷ $RI 328 | 329 | $EB × $EM 330 | $ExtPictUnassigned × $EM 331 | 332 | 333 | 334 | 335 | 336 | \p{Sentence_Break=CR} 337 | \p{Sentence_Break=LF} 338 | \p{Sentence_Break=Extend} 339 | \p{Sentence_Break=Format} 340 | \p{Sentence_Break=Sep} 341 | \p{Sentence_Break=Sp} 342 | \p{Sentence_Break=Lower} 343 | 
\p{Sentence_Break=Upper} 344 | \p{Sentence_Break=OLetter} 345 | \p{Sentence_Break=Numeric} 346 | \p{Sentence_Break=ATerm} 347 | \p{Sentence_Break=STerm} 348 | \p{Sentence_Break=Close} 349 | \p{Sentence_Break=SContinue} 350 | . 351 | 352 | 353 | [$Format $Extend] 354 | [^ $OLetter $Upper $Lower $Sep $CR $LF $STerm $ATerm] 355 | ($Sp $FE*) 356 | ($Lower $FE*) 357 | ($Upper $FE*) 358 | ($OLetter $FE*) 359 | ($Numeric $FE*) 360 | ($ATerm $FE*) 361 | ($STerm $FE*) 362 | ($Close $FE*) 363 | ($SContinue $FE*) 364 | 365 | ($Sep | $CR | $LF) 366 | ($STerm | $ATerm) 367 | 368 | 369 | 370 | 371 | 372 | $CR × $LF 373 | 374 | $ParaSep ÷ 375 | 376 | 377 | 378 | × [$Format $Extend] 379 | 380 | 381 | 382 | 383 | $ATerm × $Numeric 384 | ($Upper | $Lower) $ATerm × $Upper 385 | $ATerm $Close* $Sp* × $NotPreLower_* $Lower 386 | $SATerm $Close* $Sp* × ($SContinue | $SATerm) 387 | 388 | $SATerm $Close* × ( $Close | $Sp | $ParaSep ) 389 | 390 | $SATerm $Close* $Sp* × ( $Sp | $ParaSep ) 391 | $SATerm $Close* $Sp* $ParaSep? 
÷ 392 | 393 | × $Any 394 | 395 | 396 | 397 | 398 | 399 | \p{Word_Break=CR} 400 | \p{Word_Break=LF} 401 | \p{Word_Break=Newline} 402 | \p{Word_Break=Extend} 403 | 404 | [\p{Word_Break=Format}] 405 | \p{Word_Break=Katakana} 406 | \p{Word_Break=ALetter} 407 | \p{Word_Break=MidLetter} 408 | \p{Word_Break=MidNum} 409 | \p{Word_Break=MidNumLet} 410 | \p{Word_Break=Numeric} 411 | \p{Word_Break=ExtendNumLet} 412 | \p{Word_Break=Regional_Indicator} 413 | \p{Word_Break=Hebrew_Letter} 414 | \p{Word_Break=Double_Quote} 415 | \p{Word_Break=Single_Quote} 416 | \p{Word_Break=ZWJ} 417 | 418 | \p{Extended_Pictographic} 419 | \p{Word_Break=WSegSpace} 420 | 421 | ($ALetter | $Hebrew_Letter) 422 | ($MidNumLet | $Single_Quote) 423 | 424 | 425 | [$Format $Extend $ZWJ] 426 | [^ $Newline $CR $LF ] 427 | ($Katakana $FE*) 428 | ($ALetter $FE*) 429 | ($MidLetter $FE*) 430 | ($MidNum $FE*) 431 | ($MidNumLet $FE*) 432 | ($Numeric $FE*) 433 | ($ExtendNumLet $FE*) 434 | ($RI $FE*) 435 | ($Hebrew_Letter $FE*) 436 | ($Double_Quote $FE*) 437 | ($Single_Quote $FE*) 438 | ($AHLetter $FE*) 439 | ($MidNumLetQ $FE*) 440 | 441 | 442 | 443 | 444 | 445 | $CR × $LF 446 | 447 | ($Newline | $CR | $LF) ÷ 448 | ÷ ($Newline | $CR | $LF) 449 | 450 | $ZWJ × $ExtPict 451 | $WSegSpace × $WSegSpace 452 | 453 | 454 | 455 | $NotBreak_ × [$Format $Extend $ZWJ] 456 | 457 | 458 | $AHLetter × $AHLetter 459 | 460 | $AHLetter × ($MidLetter | $MidNumLetQ) $AHLetter 461 | $AHLetter ($MidLetter | $MidNumLetQ) × $AHLetter 462 | $Hebrew_Letter × $Single_Quote 463 | $Hebrew_Letter × $Double_Quote $Hebrew_Letter 464 | $Hebrew_Letter $Double_Quote × $Hebrew_Letter 465 | 466 | $Numeric × $Numeric 467 | $AHLetter × $Numeric 468 | $Numeric × $AHLetter 469 | 470 | $Numeric ($MidNum | $MidNumLetQ) × $Numeric 471 | $Numeric × ($MidNum | $MidNumLetQ) $Numeric 472 | 473 | $Katakana × $Katakana 474 | 475 | ($AHLetter | $Numeric | $Katakana | $ExtendNumLet) × $ExtendNumLet 476 | $ExtendNumLet × ($AHLetter | $Numeric | $Katakana) 477 | 478 | ^ 
($RI $RI)* $RI × $RI 479 | [^$RI] ($RI $RI)* $RI × $RI 480 | 481 | 482 | 483 | 484 | 485 | -------------------------------------------------------------------------------- /priv/segments/ru.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | руб. 13 | янв. 14 | до н. э. 15 | сент. 16 | тел. 17 | дек. 18 | февр. 19 | нояб. 20 | апр. 21 | н. э. 22 | окт. 23 | тыс. 24 | авг. 25 | проф. 26 | н.э. 27 | кв. 28 | ул. 29 | отд. 30 | 31 | 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /priv/segments/sv.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | \p{Word_Break=MidLetter} 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /priv/segments/zh.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | [[\p{Line_Break=Ideographic}] [$CJ]] 18 | \p{Line_Break=Nonstarter} 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /priv/segments/zh_Hant.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 8 | 9 | 10 | 11 | 12 |