├── .github
    ├── dependabot.yml
    └── workflows
    │   ├── cabal-mac-win.yml
    │   ├── haskell-ci.yml
    │   └── stack.yml
├── .gitignore
├── Data
    └── Text
    │   ├── ICU.hs
    │   └── ICU
    │       ├── BiDi.hsc
    │       ├── BiDi
    │           └── Internal.hs
    │       ├── BitMask.hs
    │       ├── Break.hsc
    │       ├── Break
    │           ├── Pure.hs
    │           └── Types.hs
    │       ├── Calendar.hsc
    │       ├── CaseMap.hsc
    │       ├── Char.hsc
    │       ├── CharsetDetection.hsc
    │       ├── CharsetDetection
    │           └── Internal.hsc
    │       ├── Collate.hsc
    │       ├── Collate
    │           ├── Internal.hs
    │           └── Pure.hs
    │       ├── Convert.hs
    │       ├── Convert
    │           └── Internal.hs
    │       ├── DateFormatter.hsc
    │       ├── Enumerator.hsc
    │       ├── Error.hsc
    │       ├── Error
    │           └── Internal.hsc
    │       ├── Internal.hsc
    │       ├── Iterator.hs
    │       ├── Locale.hsc
    │       ├── Normalize.hsc
    │       ├── Normalize
    │           └── Internal.hsc
    │       ├── Normalize2.hsc
    │       ├── Number.hsc
    │       ├── Number
    │           └── Internal.hs
    │       ├── NumberFormatter.hsc
    │       ├── Regex.hs
    │       ├── Regex
    │           ├── Internal.hsc
    │           └── Pure.hs
    │       ├── Shape.hsc
    │       ├── Spoof.hsc
    │       ├── Spoof
    │           ├── Internal.hs
    │           └── Pure.hs
    │       ├── Text.hs
    │       └── Types.hs
├── LICENSE
├── README.markdown
├── Setup.lhs
├── benchmarks
    └── Breaker.hs
├── cabal.haskell-ci
├── cbits
    └── text_icu.c
├── changelog.md
├── hackage-docs.sh
├── include
    └── hs_text_icu.h
├── tests
    ├── Properties.hs
    ├── QuickCheckUtils.hs
    └── Tests.hs
└── text-icu.cabal


/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # From:
 2 | # - https://github.com/rhysd/actionlint/issues/228#issuecomment-1272493095
 3 | # - https://docs.github.com/en/code-security/dependabot/working-with-dependabot/keeping-your-actions-up-to-date-with-dependabot
 4 | 
 5 | # Set update schedule for GitHub Actions
 6 | 
 7 | version: 2
 8 | updates:
 9 | 
10 |   - package-ecosystem: "github-actions"
11 |     directory: "/"
12 |     schedule:
13 |       # Check for updates to GitHub Actions every week
14 |       interval: "weekly"
15 | 


--------------------------------------------------------------------------------
/.github/workflows/cabal-mac-win.yml:
--------------------------------------------------------------------------------
  1 | name: CI for macOS and Windows building with cabal
  2 | on:
  3 |   push:
  4 |     branches:
  5 |     - master
  6 |   pull_request:
  7 |     branches:
  8 |     - master
  9 | 
 10 | jobs:
 11 | 
 12 |   cabal:
 13 |     name: Cabal
 14 |     runs-on: ${{ matrix.os }}
 15 |     strategy:
 16 |       matrix:
 17 |         os: [macOS-latest, windows-latest]
 18 |         ghc:
 19 |           # - 9.2.8
 20 |           - 9.4.8
 21 |           - 9.6.6
 22 |           - 9.8.4
 23 |           - 9.10.1
 24 |           - 9.12.1
 25 |       fail-fast: false
 26 | 
 27 |     steps:
 28 | 
 29 |     # Andreas, 2023-02-12:
 30 |     # Putting the MSYS2 /usr/bin (and thus 'tar') into the PATH destroys the Cache action v3.
 31 |     # See: https://github.com/actions/cache/issues/1073
 32 |     # However the /mingw64/bin PATH for the MSYS2-installed 'pkg-config' is fine.
 33 |     #
 34 |     - name: Setup MSYS path for pkg-config
 35 |       if: ${{ runner.os == 'Windows' }}
 36 |       run: |
 37 |         echo "C:\msys64\mingw64\bin" | Out-File -FilePath "$env:GITHUB_PATH" -Append
 38 | 
 39 |     - name: Install the ICU library (Windows)
 40 |       if: ${{ runner.os == 'Windows' }}
 41 |       shell: pwsh
 42 |         ## Even though pwsh is default for windows, keep this for sake of actionlint/shellcheck!
 43 |       run: |
 44 |         $env:PATH = "C:\msys64\usr\bin;$env:PATH"
 45 |         pacman --noconfirm -S mingw-w64-x86_64-pkg-config mingw-w64-x86_64-icu
 46 | 
 47 |     # Installing ICU is not necessary, macOS-latest already has v69.1
 48 |     # - name: Install the ICU library (macOS)
 49 |     #   shell: bash
 50 |     #   run: |
 51 |     #     brew install icu4c
 52 | 
 53 |     # Let `pkg-config` find the ICU libs.
 54 |     # Also test whether it actually works, and print some debug information.
 55 |     - name: Set up pkg-config for the ICU library (macOS)
 56 |       if: ${{ runner.os == 'macOS' }}
 57 |       run: |
 58 |         PKG_CONFIG_PATH=$(brew --prefix)/opt/icu4c/lib/pkgconfig
 59 |         echo "PKG_CONFIG_PATH=${PKG_CONFIG_PATH}" >> "${GITHUB_ENV}"
 60 |         ## The rest is debug info:
 61 |         echo "$ ls -l ${PKG_CONFIG_PATH}/"
 62 |         ls -l "${PKG_CONFIG_PATH}/"
 63 |         export PKG_CONFIG_PATH
 64 |         echo "$ pkg-config --modversion icu-i18n"
 65 |         pkg-config --modversion icu-i18n
 66 |         echo "$ pkg-config --libs --static icu-i18n"
 67 |         pkg-config --libs --static icu-i18n
 68 | 
 69 |     # # Test of `pkg-config --list-all` in connection with macOS-11/12 env bug:
 70 |     # # https://github.com/actions/runner-images/issues/6364
 71 |     # # This was fixed upstream 2022-10-10.
 72 |     # - name: Check integrity of pkg-config database
 73 |     #   if: ${{ runner.os == 'macOS' }}
 74 |     #   run: |
 75 |     #     echo "$ pkg-config --list-all"
 76 |     #     pkg-config --list-all
 77 |     #     echo "========================================================================"
 78 |     #     echo "$ pkg-config --list-all | cut -f 1 -d ' '"
 79 |     #     pkg-config --list-all | cut -f 1 -d ' '
 80 |     #     echo "========================================================================"
 81 |     #     echo "$ pkg-config --list-all | cut -f 1 -d ' ' | xargs pkg-config --modversion"
 82 |     #     pkg-config --list-all | cut -f 1 -d ' ' | xargs pkg-config --modversion
 83 | 
 84 |     - name: Determine the ICU version
 85 |       shell: bash
 86 |       run: |
 87 |         ICU_VER=$(pkg-config --modversion icu-i18n)
 88 |         echo "ICU_VER=${ICU_VER}"
 89 |         echo "ICU_VER=${ICU_VER}" >> "${GITHUB_ENV}"
 90 | 
 91 |     - name: Checkout
 92 |       uses: actions/checkout@v4
 93 | 
 94 |     - name: Setup Haskell
 95 |       uses: haskell-actions/setup@v2
 96 |       id: setup
 97 |       with:
 98 |         ghc-version: ${{ matrix.ghc }}
 99 |         cabal-update: true
100 | 
101 |     - name: Configure
102 |       run: |
103 |         cabal configure --enable-tests
104 |         cabal build --dry-run
105 |       # cabal build --dry-run creates dist-newstyle/cache/plan.json
106 | 
107 |     - name: Restore cached build products
108 |       uses: actions/cache/restore@v4
109 |       id: cache
110 |       with:
111 |         path: |
112 |           ${{ steps.setup.outputs.cabal-store }}
113 |           dist-newstyle
114 |         key: ${{ env.key }}-cabal-${{ steps.setup.outputs.cabal-version }}-plan-${{ hashFiles('dist-newstyle/cache/plan.json') }}
115 |         restore-keys: |
116 |              ${{ env.key }}-cabal-${{ steps.setup.outputs.cabal-version }}-
117 |              ${{ env.key }}-
118 |       env:
119 |         key: ${{ runner.os }}-ghc-${{ steps.setup.outputs.ghc-version }}-icu-${{ env.ICU_VER }}-
120 | 
121 |     - name: Build
122 |       run: |
123 |         cabal build all
124 | 
125 |     - name: Test
126 |       run: |
127 |         cabal test all --test-show-details=direct
128 | 
129 |     - name: Cache build products
130 |       uses: actions/cache/save@v4
131 |       if:   always() && steps.cache.outputs.cache-hit != 'true'
132 |       with:
133 |         path: |
134 |           ${{ steps.setup.outputs.cabal-store }}
135 |           dist-newstyle
136 |         key: ${{ steps.cache.outputs.cache-primary-key }}
137 | 


--------------------------------------------------------------------------------
/.github/workflows/haskell-ci.yml:
--------------------------------------------------------------------------------
  1 | # This GitHub workflow config has been generated by a script via
  2 | #
  3 | #   haskell-ci 'github' 'text-icu.cabal'
  4 | #
  5 | # To regenerate the script (for example after adjusting tested-with) run
  6 | #
  7 | #   haskell-ci regenerate
  8 | #
  9 | # For more information, see https://github.com/haskell-CI/haskell-ci
 10 | #
 11 | # version: 0.19.20250216
 12 | #
 13 | # REGENDATA ("0.19.20250216",["github","text-icu.cabal"])
 14 | #
 15 | name: Haskell-CI
 16 | on:
 17 |   push:
 18 |     branches:
 19 |       - master
 20 |   pull_request:
 21 |     branches:
 22 |       - master
 23 | jobs:
 24 |   linux:
 25 |     name: Haskell-CI - Linux - ${{ matrix.compiler }}
 26 |     runs-on: ubuntu-24.04
 27 |     timeout-minutes:
 28 |       60
 29 |     container:
 30 |       image: buildpack-deps:jammy
 31 |     continue-on-error: ${{ matrix.allow-failure }}
 32 |     strategy:
 33 |       matrix:
 34 |         include:
 35 |           - compiler: ghc-9.12.1
 36 |             compilerKind: ghc
 37 |             compilerVersion: 9.12.1
 38 |             setup-method: ghcup
 39 |             allow-failure: false
 40 |           - compiler: ghc-9.10.1
 41 |             compilerKind: ghc
 42 |             compilerVersion: 9.10.1
 43 |             setup-method: ghcup
 44 |             allow-failure: false
 45 |           - compiler: ghc-9.8.4
 46 |             compilerKind: ghc
 47 |             compilerVersion: 9.8.4
 48 |             setup-method: ghcup
 49 |             allow-failure: false
 50 |           - compiler: ghc-9.6.6
 51 |             compilerKind: ghc
 52 |             compilerVersion: 9.6.6
 53 |             setup-method: ghcup
 54 |             allow-failure: false
 55 |           - compiler: ghc-9.4.8
 56 |             compilerKind: ghc
 57 |             compilerVersion: 9.4.8
 58 |             setup-method: ghcup
 59 |             allow-failure: false
 60 |           - compiler: ghc-9.2.8
 61 |             compilerKind: ghc
 62 |             compilerVersion: 9.2.8
 63 |             setup-method: ghcup
 64 |             allow-failure: false
 65 |           - compiler: ghc-9.0.2
 66 |             compilerKind: ghc
 67 |             compilerVersion: 9.0.2
 68 |             setup-method: ghcup
 69 |             allow-failure: false
 70 |           - compiler: ghc-8.10.7
 71 |             compilerKind: ghc
 72 |             compilerVersion: 8.10.7
 73 |             setup-method: ghcup
 74 |             allow-failure: false
 75 |           - compiler: ghc-8.8.4
 76 |             compilerKind: ghc
 77 |             compilerVersion: 8.8.4
 78 |             setup-method: ghcup
 79 |             allow-failure: false
 80 |           - compiler: ghc-8.6.5
 81 |             compilerKind: ghc
 82 |             compilerVersion: 8.6.5
 83 |             setup-method: ghcup
 84 |             allow-failure: false
 85 |           - compiler: ghc-8.4.4
 86 |             compilerKind: ghc
 87 |             compilerVersion: 8.4.4
 88 |             setup-method: ghcup
 89 |             allow-failure: false
 90 |           - compiler: ghc-8.2.2
 91 |             compilerKind: ghc
 92 |             compilerVersion: 8.2.2
 93 |             setup-method: ghcup
 94 |             allow-failure: false
 95 |           - compiler: ghc-8.0.2
 96 |             compilerKind: ghc
 97 |             compilerVersion: 8.0.2
 98 |             setup-method: ghcup
 99 |             allow-failure: false
100 |       fail-fast: false
101 |     steps:
102 |       - name: apt-get install
103 |         run: |
104 |           apt-get update
105 |           apt-get install -y --no-install-recommends gnupg ca-certificates dirmngr curl git software-properties-common libtinfo5 libnuma-dev
106 |       - name: Install GHCup
107 |         run: |
108 |           mkdir -p "$HOME/.ghcup/bin"
109 |           curl -sL https://downloads.haskell.org/ghcup/0.1.30.0/x86_64-linux-ghcup-0.1.30.0 > "$HOME/.ghcup/bin/ghcup"
110 |           chmod a+x "$HOME/.ghcup/bin/ghcup"
111 |       - name: Install cabal-install
112 |         run: |
113 |           "$HOME/.ghcup/bin/ghcup" install cabal 3.12.1.0 || (cat "$HOME"/.ghcup/logs/*.* && false)
114 |           echo "CABAL=$HOME/.ghcup/bin/cabal-3.12.1.0 -vnormal+nowrap" >> "$GITHUB_ENV"
115 |       - name: Install GHC (GHCup)
116 |         if: matrix.setup-method == 'ghcup'
117 |         run: |
118 |           "$HOME/.ghcup/bin/ghcup" install ghc "$HCVER" || (cat "$HOME"/.ghcup/logs/*.* && false)
119 |           HC=$("$HOME/.ghcup/bin/ghcup" whereis ghc "$HCVER")
120 |           HCPKG=$(echo "$HC" | sed 's#ghc$#ghc-pkg#')
121 |           HADDOCK=$(echo "$HC" | sed 's#ghc$#haddock#')
122 |           echo "HC=$HC" >> "$GITHUB_ENV"
123 |           echo "HCPKG=$HCPKG" >> "$GITHUB_ENV"
124 |           echo "HADDOCK=$HADDOCK" >> "$GITHUB_ENV"
125 |         env:
126 |           HCKIND: ${{ matrix.compilerKind }}
127 |           HCNAME: ${{ matrix.compiler }}
128 |           HCVER: ${{ matrix.compilerVersion }}
129 |       - name: Set PATH and environment variables
130 |         run: |
131 |           echo "$HOME/.cabal/bin" >> $GITHUB_PATH
132 |           echo "LANG=C.UTF-8" >> "$GITHUB_ENV"
133 |           echo "CABAL_DIR=$HOME/.cabal" >> "$GITHUB_ENV"
134 |           echo "CABAL_CONFIG=$HOME/.cabal/config" >> "$GITHUB_ENV"
135 |           HCNUMVER=$(${HC} --numeric-version|perl -ne '/^(\d+)\.(\d+)\.(\d+)(\.(\d+))?$/; print(10000 * $1 + 100 * $2 + ($3 == 0 ? $5 != 1 : $3))')
136 |           echo "HCNUMVER=$HCNUMVER" >> "$GITHUB_ENV"
137 |           echo "ARG_TESTS=--enable-tests" >> "$GITHUB_ENV"
138 |           echo "ARG_BENCH=--enable-benchmarks" >> "$GITHUB_ENV"
139 |           echo "HEADHACKAGE=false" >> "$GITHUB_ENV"
140 |           echo "ARG_COMPILER=--$HCKIND --with-compiler=$HC" >> "$GITHUB_ENV"
141 |         env:
142 |           HCKIND: ${{ matrix.compilerKind }}
143 |           HCNAME: ${{ matrix.compiler }}
144 |           HCVER: ${{ matrix.compilerVersion }}
145 |       - name: env
146 |         run: |
147 |           env
148 |       - name: write cabal config
149 |         run: |
150 |           mkdir -p $CABAL_DIR
151 |           cat >> $CABAL_CONFIG <<EOF
152 |           remote-build-reporting: anonymous
153 |           write-ghc-environment-files: never
154 |           remote-repo-cache: $CABAL_DIR/packages
155 |           logs-dir:          $CABAL_DIR/logs
156 |           world-file:        $CABAL_DIR/world
157 |           extra-prog-path:   $CABAL_DIR/bin
158 |           symlink-bindir:    $CABAL_DIR/bin
159 |           installdir:        $CABAL_DIR/bin
160 |           build-summary:     $CABAL_DIR/logs/build.log
161 |           store-dir:         $CABAL_DIR/store
162 |           install-dirs user
163 |             prefix: $CABAL_DIR
164 |           repository hackage.haskell.org
165 |             url: http://hackage.haskell.org/
166 |           EOF
167 |           cat >> $CABAL_CONFIG <<EOF
168 |           program-default-options
169 |             ghc-options: $GHCJOBS +RTS -M3G -RTS
170 |           EOF
171 |           cat $CABAL_CONFIG
172 |       - name: versions
173 |         run: |
174 |           $HC --version || true
175 |           $HC --print-project-git-commit-id || true
176 |           $CABAL --version || true
177 |       - name: update cabal index
178 |         run: |
179 |           $CABAL v2-update -v
180 |       - name: install cabal-plan
181 |         run: |
182 |           mkdir -p $HOME/.cabal/bin
183 |           curl -sL https://github.com/haskell-hvr/cabal-plan/releases/download/v0.7.3.0/cabal-plan-0.7.3.0-x86_64-linux.xz > cabal-plan.xz
184 |           echo 'f62ccb2971567a5f638f2005ad3173dba14693a45154c1508645c52289714cb2  cabal-plan.xz' | sha256sum -c -
185 |           xz -d < cabal-plan.xz > $HOME/.cabal/bin/cabal-plan
186 |           rm -f cabal-plan.xz
187 |           chmod a+x $HOME/.cabal/bin/cabal-plan
188 |           cabal-plan --version
189 |       - name: checkout
190 |         uses: actions/checkout@v4
191 |         with:
192 |           path: source
193 |       - name: initial cabal.project for sdist
194 |         run: |
195 |           touch cabal.project
196 |           echo "packages: $GITHUB_WORKSPACE/source/." >> cabal.project
197 |           cat cabal.project
198 |       - name: sdist
199 |         run: |
200 |           mkdir -p sdist
201 |           $CABAL sdist all --output-dir $GITHUB_WORKSPACE/sdist
202 |       - name: unpack
203 |         run: |
204 |           mkdir -p unpacked
205 |           find sdist -maxdepth 1 -type f -name '*.tar.gz' -exec tar -C $GITHUB_WORKSPACE/unpacked -xzvf {} \;
206 |       - name: generate cabal.project
207 |         run: |
208 |           PKGDIR_text_icu="$(find "$GITHUB_WORKSPACE/unpacked" -maxdepth 1 -type d -regex '.*/text-icu-[0-9.]*')"
209 |           echo "PKGDIR_text_icu=${PKGDIR_text_icu}" >> "$GITHUB_ENV"
210 |           rm -f cabal.project cabal.project.local
211 |           touch cabal.project
212 |           touch cabal.project.local
213 |           echo "packages: ${PKGDIR_text_icu}" >> cabal.project
214 |           if [ $((HCNUMVER >= 80200)) -ne 0 ] ; then echo "package text-icu" >> cabal.project ; fi
215 |           if [ $((HCNUMVER >= 80200)) -ne 0 ] ; then echo "    ghc-options: -Werror=missing-methods" >> cabal.project ; fi
216 |           cat >> cabal.project <<EOF
217 |           EOF
218 |           $HCPKG list --simple-output --names-only | perl -ne 'for (split /\s+/) { print "constraints: any.$_ installed\n" unless /^(text-icu)$/; }' >> cabal.project.local
219 |           cat cabal.project
220 |           cat cabal.project.local
221 |       - name: dump install plan
222 |         run: |
223 |           $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dry-run all
224 |           cabal-plan
225 |       - name: restore cache
226 |         uses: actions/cache/restore@v4
227 |         with:
228 |           key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }}
229 |           path: ~/.cabal/store
230 |           restore-keys: ${{ runner.os }}-${{ matrix.compiler }}-
231 |       - name: install dependencies
232 |         run: |
233 |           $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks --dependencies-only -j2 all
234 |           $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dependencies-only -j2 all
235 |       - name: build w/o tests
236 |         run: |
237 |           $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all
238 |       - name: build
239 |         run: |
240 |           $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --write-ghc-environment-files=always
241 |       - name: tests
242 |         run: |
243 |           $CABAL v2-test $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --test-show-details=direct
244 |       - name: cabal check
245 |         run: |
246 |           cd ${PKGDIR_text_icu} || false
247 |           ${CABAL} -vnormal check
248 |       - name: haddock
249 |         run: |
250 |           $CABAL v2-haddock --disable-documentation --haddock-all $ARG_COMPILER --with-haddock $HADDOCK $ARG_TESTS $ARG_BENCH all
251 |       - name: unconstrained build
252 |         run: |
253 |           rm -f cabal.project.local
254 |           $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all
255 |       - name: save cache
256 |         if: always()
257 |         uses: actions/cache/save@v4
258 |         with:
259 |           key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }}
260 |           path: ~/.cabal/store
261 | 


--------------------------------------------------------------------------------
/.github/workflows/stack.yml:
--------------------------------------------------------------------------------
  1 | # This workflow uses 'stack' as installed on the github runner.
  2 | # GHC is installed via stack.
  3 | 
  4 | name: Stack build
  5 | 
  6 | on:
  7 |   push:
  8 |     branches:
  9 |       - master
 10 |   pull_request:
 11 |     branches:
 12 |       - master
 13 | 
 14 | defaults:
 15 |   run:
 16 |     shell: bash
 17 | 
 18 | jobs:
 19 |   stack:
 20 |     name: ${{ matrix.os }} Stack ${{ matrix.plan.resolver }}
 21 |     strategy:
 22 |       fail-fast: false
 23 |       matrix:
 24 |         os:
 25 |           - macOS-latest
 26 |           - ubuntu-22.04
 27 |           - ubuntu-24.04
 28 |           - windows-latest
 29 |         plan:
 30 |           - resolver: 'nightly'
 31 |           - resolver: 'lts'
 32 | 
 33 |     runs-on: ${{ matrix.os }}
 34 |     env:
 35 |       STACK: stack --no-terminal --resolver ${{ matrix.plan.resolver }}
 36 | 
 37 |     steps:
 38 |     - uses: actions/checkout@v4
 39 | 
 40 |     - uses: haskell-actions/setup@v2
 41 |       with:
 42 |         enable-stack: true
 43 |         stack-no-global: true
 44 | 
 45 |     - name: Configure
 46 |       run: |
 47 |         $STACK init
 48 | 
 49 |     - name: Install GHC via stack
 50 |       run: |
 51 |         $STACK ghc -- --version
 52 | 
 53 |     - name: Haskell versions
 54 |       run: |
 55 |         STACK_VERSION=$(${STACK} --numeric-version)
 56 |         echo "STACK_VERSION=${STACK_VERSION}" >> "${GITHUB_ENV}"
 57 |         GHC_VERSION=$(${STACK} ghc -- --numeric-version)
 58 |         echo "GHC_VERSION=${GHC_VERSION}" >> "${GITHUB_ENV}"
 59 | 
 60 |       ## This causes troubles on Windows (spaces in env variable)?
 61 |       # STACK_ROOT=$(${STACK} path --stack-root)
 62 |       # echo "STACK_ROOT=${STACK_ROOT}" >> "${GITHUB_ENV}"
 63 | 
 64 |     - name: Set up for the ICU library (macOS)
 65 |       if: runner.os == 'macOS'
 66 |       run: |
 67 |         ICU4C=$(brew --prefix)/opt/icu4c
 68 |         echo "PKG_CONFIG_PATH=${ICU4C}/lib/pkgconfig" >> "${GITHUB_ENV}"
 69 | 
 70 |     - name: Install the ICU library (Windows)
 71 |       # We also install pkgconf, which superseds pkg-config.
 72 |       if: ${{ runner.os == 'Windows' }}
 73 |       run: |
 74 |         $STACK exec -- pacman --noconfirm -S msys2-keyring
 75 |         $STACK exec -- pacman --noconfirm -S mingw-w64-x86_64-pkgconf
 76 |         $STACK exec -- pacman --noconfirm -S mingw-w64-x86_64-icu
 77 |       ## Alternatively, in the last line, install a specific version of ICU, like 71:
 78 |       # $STACK exec -- bash -c "curl -LO ${ICU_URL} && pacman --noconfirm -U *.pkg.tar.zst"
 79 |       # env:
 80 |       #   ICU_URL: "https://repo.msys2.org/mingw/mingw64/mingw-w64-x86_64-icu-71.1-1-any.pkg.tar.zst"
 81 | 
 82 |     - name: Determine the ICU version
 83 |       run: |
 84 |         ICU_VERSION=$($STACK exec -- pkg-config --modversion icu-i18n)
 85 |         echo "ICU version ${ICU_VERSION}"
 86 |         echo "ICU_VERSION=${ICU_VERSION}" >> "${GITHUB_ENV}"
 87 | 
 88 |     ## Caching ~/.stack without --system-ghc is probably not a good idea:
 89 |     ## - too fat
 90 |     ## - should be sensibly restored before installing GHC via stack,
 91 |     ##   but then we don't know the GHC version; so at least 'lts' and 'nightly' would be brittle
 92 |     ##
 93 |     # - uses: actions/cache@v3
 94 |     #   with:
 95 |     #     path: ${{ env.STACK_ROOT }}
 96 |     #     key: ${{ runner.os }}-stack-${{ env.STACK_VERSION }}-ghc-${{ env.GHC_VERSION }}-icu-${{ env.ICU_VER }}-resolver-${{ matrix.plan.resolver }}-sha-${{ github.sha }}
 97 |     #     restore-keys: ${{ runner.os }}-stack-${{ env.STACK_VERSION }}-ghc-${{ env.GHC_VERSION }}-icu-${{ env.ICU_VER }}-resolver-${{ matrix.plan.resolver }}-
 98 | 
 99 |     - name: Install dependencies
100 |       run: |
101 |         $STACK test --only-dependencies
102 | 
103 |     - name: Build
104 |       run: |
105 |         $STACK build --haddock --no-haddock-deps
106 | 
107 |     - name: Test
108 |       run: |
109 |         $STACK -j 1 test --haddock --no-haddock-deps
110 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | dist-newstyle/
2 | cabal.project.local
3 | cabal.project.local~*
4 | .stack-work/
5 | stack*.yaml.lock


--------------------------------------------------------------------------------
/Data/Text/ICU.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE CPP, NoImplicitPrelude #-}
  2 | -- |
  3 | -- Module      : Data.Text.ICU
  4 | -- Copyright   : (c) 2010 Bryan O'Sullivan
  5 | --
  6 | -- License     : BSD-style
  7 | -- Maintainer  : bos@serpentine.com
  8 | -- Stability   : experimental
  9 | -- Portability : GHC
 10 | --
 11 | -- Commonly used functions for Unicode, implemented as bindings to the
 12 | -- International Components for Unicode (ICU) libraries.
 13 | --
 14 | -- This module contains only the most commonly used types and
 15 | -- functions.  Other modules in this package expose richer interfaces.
 16 | module Data.Text.ICU
 17 |     (
 18 |     -- * Data representation
 19 |     -- $data
 20 | 
 21 |     -- * Types
 22 |       LocaleName(..)
 23 |     -- * Locales
 24 |     , availableLocales
 25 |     -- * Boundary analysis
 26 |     -- $break
 27 |     , Breaker
 28 |     , Break
 29 |     , brkPrefix
 30 |     , brkBreak
 31 |     , brkSuffix
 32 |     , brkStatus
 33 |     , Line(..)
 34 |     , Word(..)
 35 |     , breakCharacter
 36 |     , breakLine
 37 |     , breakSentence
 38 |     , breakWord
 39 |     , breaks
 40 |     , breaksRight
 41 |     -- * Case mapping
 42 |     , toCaseFold
 43 |     , toLower
 44 |     , toUpper
 45 |     -- * Iteration
 46 |     , CharIterator
 47 |     , fromString
 48 |     , fromText
 49 |     , fromUtf8
 50 |     -- * Normalization
 51 |     -- $compat
 52 |     -- ** Normalize unicode strings
 53 |     , nfc, nfd, nfkc, nfkd, nfkcCasefold
 54 |     -- ** Checks for normalization
 55 |     , quickCheck, isNormalized
 56 |     -- * String comparison
 57 |     -- ** Normalization-sensitive string comparison
 58 |     , CompareOption(..)
 59 |     , compareUnicode
 60 |     -- ** Locale-sensitive string collation
 61 |     -- $collate
 62 |     , Collator
 63 |     , collator
 64 |     , collatorWith
 65 |     , collatorFromRules
 66 |     , collatorFromRulesWith
 67 |     , collate
 68 |     , collateIter
 69 |     , sortKey
 70 |     , uca
 71 |     -- * Regular expressions
 72 |     , MatchOption(..)
 73 |     , ParseError(errError, errLine, errOffset)
 74 |     , Match
 75 |     , Regex
 76 |     , Regular
 77 |     -- ** Construction
 78 |     , regex
 79 |     , regex'
 80 |     -- ** Inspection
 81 |     , pattern
 82 |     -- ** Searching
 83 |     , find
 84 |     , findAll
 85 |     -- ** Match groups
 86 |     -- $group
 87 |     , groupCount
 88 |     , unfold
 89 |     , span
 90 |     , group
 91 |     , prefix
 92 |     , suffix
 93 |     -- * Spoof checking
 94 |     -- $spoof
 95 |     , Spoof
 96 |     , SpoofParams(..)
 97 |     , S.SpoofCheck(..)
 98 |     , S.RestrictionLevel(..)
 99 |     , S.SpoofCheckResult(..)
100 |     -- ** Construction
101 |     , spoof
102 |     , spoofWithParams
103 |     , spoofFromSource
104 |     , spoofFromSerialized
105 |     -- ** String checking
106 |     , areConfusable
107 |     , spoofCheck
108 |     , getSkeleton
109 |     -- ** Configuration
110 |     , getChecks
111 |     , getAllowedLocales
112 |     , getRestrictionLevel
113 |     -- ** Persistence
114 |     , serialize
115 |     -- * Calendars
116 |     , Calendar, CalendarType(..), SystemTimeZoneType(..), CalendarField(..),
117 |     -- ** Construction
118 |     calendar,
119 |     -- ** Operations on calendars
120 |     roll, add, set1, set, get,
121 |     -- * Number formatting
122 |     NumberFormatter, numberFormatter, formatIntegral, formatIntegral', formatDouble, formatDouble',
123 |     -- * Date formatting
124 |     DateFormatter, FormatStyle(..), DateFormatSymbolType(..), standardDateFormatter, patternDateFormatter, dateSymbols, formatCalendar,
125 |     ) where
126 | 
127 | import Data.Text.ICU.Break.Pure
128 | import Data.Text.ICU.Calendar
129 | import Data.Text.ICU.Collate.Pure
130 | import Data.Text.ICU.DateFormatter
131 | import Data.Text.ICU.Internal
132 | import Data.Text.ICU.Iterator
133 | import Data.Text.ICU.Locale
134 | import Data.Text.ICU.Normalize2
135 | import Data.Text.ICU.NumberFormatter
136 | import Data.Text.ICU.Regex.Pure
137 | import qualified Data.Text.ICU.Spoof as S
138 | import Data.Text.ICU.Spoof.Pure
139 | import Data.Text.ICU.Text
140 | #if defined(__HADDOCK__)
141 | import Data.Text.Foreign
142 | import Data.Text (Text)
143 | #endif
144 | 
145 | -- $data
146 | --
147 | -- The Haskell 'Text' type is implemented as an array in the Haskell
148 | -- heap.  This means that its location is not pinned; it may be copied
149 | -- during a garbage collection pass.  ICU, on the other hand, works
150 | -- with strings that are allocated in the normal system heap and have
151 | -- a fixed address.
152 | --
153 | -- To accommodate this need, these bindings use the functions from
154 | -- "Data.Text.Foreign" to copy data between the Haskell heap and the
155 | -- system heap.  The copied strings are still managed automatically,
156 | -- but the need to duplicate data does add some performance and memory
157 | -- overhead.
158 | 
159 | -- $break
160 | --
161 | -- Text boundary analysis is the process of locating linguistic
162 | -- boundaries while formatting and handling text. Examples of this
163 | -- process include:
164 | --
165 | -- * Locating appropriate points to word-wrap text to fit within
166 | --   specific margins while displaying or printing.
167 | --
168 | -- * Counting characters, words, sentences, or paragraphs.
169 | --
170 | -- * Making a list of the unique words in a document.
171 | --
172 | -- * Figuring out if a given range of text contains only whole words.
173 | --
174 | -- * Capitalizing the first letter of each word.
175 | --
176 | -- * Locating a particular unit of the text (For example, finding the
177 | --   third word in the document).
178 | --
179 | -- The 'Breaker' type was designed to support these kinds of
180 | -- tasks.
181 | --
182 | -- For the impure boundary analysis API (which is richer, but less
183 | -- easy to use than the pure API), see the "Data.Text.ICU.Break"
184 | -- module.  The impure API supports some uses that may be less
185 | -- efficient via the pure API, including:
186 | --
187 | -- * Locating the beginning of a word that the user has selected.
188 | --
189 | -- * Determining how far to move the text cursor when the user hits an
190 | --   arrow key (Some characters require more than one position in the
191 | --   text store and some characters in the text store do not display
192 | --   at all).
193 | 
194 | -- $collate
195 | --
196 | -- For the impure collation API (which is richer, but less easy to
197 | -- use than the pure API), see the "Data.Text.ICU.Collate"
198 | -- module.
199 | 
200 | -- $group
201 | --
202 | -- Capturing groups are numbered starting from zero.  Group zero is
203 | -- always the entire matching text.  Groups greater than zero contain
204 | -- the text matching each capturing group in a regular expression.
205 | 
206 | -- $spoof
207 | --
208 | -- The 'Spoof' type performs security checks on visually confusable
209 | -- (spoof) strings.  For the impure spoof checking API (which is
210 | -- richer, but less easy to use than the pure API), see the
211 | -- "Data.Text.ICU.Spoof" module.
212 | --
213 | -- See <http://unicode.org/reports/tr36/ UTR #36> and
214 | -- <http://unicode.org/reports/tr39/ UTS #39> for detailed information
215 | -- about the underlying algorithms and databases used by these functions.
216 | 
217 | -- $formatting
218 | --
219 | -- You create a 'NumberFormat' with 'numberFormatter' according to a locale
220 | -- and a choice of pre-defined formats. A 'NumberFormat' provides a formatting
221 | -- facility that 'format's numbers
222 | -- according to the chosen locale. Alternatively create and apply a 'NumberFormat'
223 | -- in a single step with 'formatNumber'' (it may be faster to re-use a NumberFormat though).
224 | -- See the section \"Patterns\" at <https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classDecimalFormat.html#Patterns>
225 | -- for further details regarding pattern strings.
226 | 
227 | -- $compat
228 | -- See module 'Data.Text.ICU.Normalization2' for the full interface which provides some compatibility with the former API.
229 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/BiDi.hsc:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE ForeignFunctionInterface #-}
  2 | -- |
  3 | -- Module      : Data.Text.ICU.BiDi
  4 | -- Copyright   : (c) 2018 Ondrej Palkovsky
  5 | --
  6 | -- License     : BSD-style
  7 | -- Maintainer  : bos@serpentine.com
  8 | -- Stability   : experimental
  9 | -- Portability : GHC
 10 | --
 11 | -- Implementation of the Unicode Bidirectional Algorithm. See the documentation
 12 | -- of the libicu library for additional details.
 13 | --
 14 | -- -- /Note/: this module is not thread safe. /Do not/ call the
 15 | -- functions on one BiDi object from more than one thread!
 16 | 
 17 | module Data.Text.ICU.BiDi
 18 |   (
 19 |     BiDi
 20 |   -- ** Basic functions
 21 |   , open
 22 |   , openSized
 23 |   -- ** Set data
 24 |   , setPara
 25 |   , setLine
 26 |   -- ** Access the BiDi object
 27 |   , countParagraphs
 28 |   , getParagraphByIndex
 29 |   , getProcessedLength
 30 |   -- ** Output text
 31 |   , writeReordered
 32 |   , WriteOption(..)
 33 |   -- ** High-level functions
 34 |   , reorderParagraphs
 35 |   ) where
 36 | 
 37 | #include <unicode/ubidi.h>
 38 | 
 39 | import Data.Text.ICU.BiDi.Internal
 40 | import Foreign.Marshal.Utils (with)
 41 | import Foreign.Storable (peek)
 42 | import Foreign.Ptr (FunPtr, Ptr)
 43 | import Data.Int (Int32, Int16)
 44 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError, handleOverflowError)
 45 | import Data.Text (Text)
 46 | import Data.Text.ICU.Internal (UChar, useAsUCharPtr, fromUCharPtr, newICUPtr)
 47 | import Foreign.C.Types (CInt(..))
 48 | import Data.List (foldl')
 49 | import Data.Bits ((.|.))
 50 | import System.IO.Unsafe (unsafePerformIO)
 51 | import Data.Traversable (for)
 52 | 
 53 | -- | Allocate a BiDi structure.
 54 | open :: IO BiDi
 55 | open = newICUPtr BiDi ubidi_close ubidi_open
 56 | 
 57 | -- | Allocate a BiDi structure with preallocated memory for internal structures.
 58 | openSized ::
 59 |      Int32 -- ^ is the maximum text or line length that internal memory will be preallocated for.
 60 |            -- An attempt to associate this object with a longer text will fail, unless this value is 0.
 61 |   -> Int32 -- ^ is the maximum anticipated number of same-level runs that internal memory will be preallocated for.
 62 |            -- An attempt to access visual runs on an object that was not preallocated for as many runs as the text was actually resolved to will fail, unless this value is 0.
 63 |   -> IO BiDi
 64 | openSized maxlen maxruncount =
 65 |   newICUPtr BiDi ubidi_close $ handleError (ubidi_openSized maxlen maxruncount)
 66 | 
 67 | -- | Perform the Unicode Bidi algorithm. It is defined in the Unicode Standard Annex #9, version 13,
 68 | -- also described in The Unicode Standard, Version 4.0.
 69 | -- This function takes a piece of plain text containing one or more paragraphs,
 70 | -- with or without externally specified embedding levels from styled text and
 71 | -- computes the left-right-directionality of each character.
 72 | setPara ::
 73 |      BiDi
 74 |   -> Text
 75 |   -> Int32 -- ^ specifies the default level for the text; it is typically 0 (LTR) or 1 (RTL)
 76 |   -> IO ()
 77 | setPara bidi t paraLevel =
 78 |   withBiDi bidi $ \bptr ->
 79 |     useAsUCharPtr t $ \sptr slen -> handleError (ubidi_setPara bptr sptr (fromIntegral slen) paraLevel)
 80 | 
 81 | -- | Sets a BiDi to contain the reordering information, especially the resolved levels,
 82 | -- for all the characters in a line of text
 83 | setLine ::
 84 |      BiDi -- ^  the parent paragraph object. It must have been set by a successful call to 'setPara'.
 85 |   -> Int32 -- ^ is the line's first index into the text
 86 |   -> Int32 -- ^ is just behind the line's last index into the text (its last index +1).
 87 |   -> BiDi -- ^ is the object that will now represent a line of the text
 88 |   -> IO ()
 89 | setLine paraBidi start limit lineBidi =
 90 |   withBiDi paraBidi $ \paraptr ->
 91 |     withBiDi lineBidi $ \lineptr ->
 92 |       handleError (ubidi_setLine paraptr start limit lineptr)
 93 | 
 94 | -- | Get the number of paragraphs.
 95 | countParagraphs :: BiDi -> IO Int32
 96 | countParagraphs bidi = withBiDi bidi ubidi_countParagraphs
 97 | 
 98 | -- | Get a paragraph, given the index of this paragraph.
 99 | getParagraphByIndex ::
100 |      BiDi
101 |   -> Int32 -- ^ is the number of the paragraph, in the range [0..ubidi_countParagraphs(pBiDi)-1].
102 |   -> IO (Int32, Int32) -- ^ index of the first character of the paragraph in the text and limit of the paragraph
103 | getParagraphByIndex bidi paraIndex =
104 |   withBiDi bidi $ \bptr ->
105 |     with 0 $ \pstart ->
106 |       with 0 $ \pend -> do
107 |         handleError (ubidi_getParagraphByIndex bptr paraIndex pstart pend)
108 |         (,) <$> (fromIntegral <$> peek pstart)
109 |             <*> (fromIntegral <$> peek pend)
110 | 
111 | -- | Get the length of the source text processed by the last call to 'setPara'.
112 | getProcessedLength :: BiDi -> IO Int32
113 | getProcessedLength bidi = withBiDi bidi ubidi_getProcessedLength
114 | 
115 | data WriteOption =
116 |   DoMirroring
117 |   -- ^ replace characters with the "mirrored" property in RTL runs by their mirror-image mappings
118 |   | InsertLrmForNumeric
119 |   -- ^ surround the run with LRMs if necessary; this is part of the approximate "inverse Bidi" algorithm
120 |   | KeepBaseCombining
121 |   -- ^ keep combining characters after their base characters in RTL runs
122 |   | OutputReverse
123 |   -- ^ write the output in reverse order
124 |   | RemoveBidiControls
125 |   -- ^ remove Bidi control characters (this does not affect InsertLrmForNumeric)
126 |   deriving (Show)
127 | 
128 | reduceWriteOpts :: [WriteOption] -> Int16
129 | reduceWriteOpts = foldl' orO 0
130 |     where a `orO` b = a .|. fromWriteOption b
131 | 
132 | fromWriteOption :: WriteOption -> Int16
133 | fromWriteOption DoMirroring   = #const UBIDI_DO_MIRRORING
134 | fromWriteOption InsertLrmForNumeric   = #const UBIDI_INSERT_LRM_FOR_NUMERIC
135 | fromWriteOption KeepBaseCombining   = #const UBIDI_KEEP_BASE_COMBINING
136 | fromWriteOption OutputReverse   = #const UBIDI_OUTPUT_REVERSE
137 | fromWriteOption RemoveBidiControls   = #const UBIDI_REMOVE_BIDI_CONTROLS
138 | 
139 | -- | Take a BiDi object containing the reordering information for a piece of text
140 | -- (one or more paragraphs) set by 'setPara' or for a line of text set by 'setLine'
141 | -- and write a reordered string to the destination buffer.
142 | writeReordered :: BiDi -> [WriteOption] -> IO Text
143 | writeReordered bidi opts = do
144 |   destLen <- getProcessedLength bidi
145 |   let options' = reduceWriteOpts opts
146 |   withBiDi bidi $ \bptr ->
147 |     handleOverflowError (fromIntegral destLen)
148 |       (\dptr dlen -> ubidi_writeReordered bptr dptr (fromIntegral dlen) options')
149 |       (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen))
150 | 
151 | foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_open" ubidi_open
152 |   :: IO (Ptr UBiDi)
153 | 
154 | foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_openSized" ubidi_openSized
155 |   :: Int32 -> Int32 -> Ptr UErrorCode -> IO (Ptr UBiDi)
156 | 
157 | foreign import ccall unsafe "hs_text_icu.h &__hs_ubidi_close" ubidi_close
158 |   :: FunPtr (Ptr UBiDi -> IO ())
159 | 
160 | foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_setPara" ubidi_setPara
161 |   :: Ptr UBiDi -> Ptr UChar -> Int32 -> Int32 -> Ptr UErrorCode -> IO ()
162 | 
163 | foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_countParagraphs" ubidi_countParagraphs
164 |   :: Ptr UBiDi -> IO Int32
165 | 
166 | foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_getParagraphByIndex" ubidi_getParagraphByIndex
167 |   :: Ptr UBiDi -> Int32 -> Ptr CInt -> Ptr CInt -> Ptr UErrorCode -> IO ()
168 | 
169 | foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_getProcessedLength" ubidi_getProcessedLength
170 |   :: Ptr UBiDi -> IO Int32
171 | 
172 | foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_writeReordered" ubidi_writeReordered
173 |   :: Ptr UBiDi -> Ptr UChar -> Int32 -> Int16 -> Ptr UErrorCode -> IO Int32
174 | 
175 | foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_setLine" ubidi_setLine
176 |   :: Ptr UBiDi -> Int32 -> Int32 -> Ptr UBiDi -> Ptr UErrorCode -> IO ()
177 | 
178 | -- | Helper function to reorder a text to a series of paragraphs.
179 | reorderParagraphs :: [WriteOption] -> Text -> [Text]
180 | reorderParagraphs options input =
181 |   unsafePerformIO $ do
182 |     bidi <- open
183 |     setPara bidi input 0
184 |     pcount <- countParagraphs bidi
185 |     lineBidi <- open
186 |     for [0..pcount-1] $ \pidx -> do
187 |         (start,limit) <- getParagraphByIndex bidi pidx
188 |         setLine bidi start limit lineBidi
189 |         writeReordered lineBidi options
190 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/BiDi/Internal.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE DeriveDataTypeable, EmptyDataDecls #-}
 2 | -- |
 3 | -- Module      : Data.Text.ICU.Bidi.Internal
 4 | -- Copyright   : (c) Ondrej Palkovsky 2018
 5 | --
 6 | -- License     : BSD-style
 7 | -- Maintainer  : bos@serpentine.com
 8 | -- Stability   : experimental
 9 | -- Portability : GHC
10 | --
11 | -- Internal types for Unicode bidirectional algorithm
12 | 
13 | module Data.Text.ICU.BiDi.Internal
14 |     (
15 |       BiDi(..)
16 |     , UBiDi
17 |     , withBiDi
18 |     ) where
19 | 
20 | import Data.Typeable (Typeable)
21 | import Foreign.ForeignPtr (ForeignPtr, withForeignPtr)
22 | import Foreign.Ptr (Ptr)
23 | 
24 | data UBiDi
25 | 
26 | -- | BiDi object.  /Note/: this structure is not
27 | -- thread safe. It is /not/ safe to use value of this type
28 | -- simultaneously from multiple threads.
29 | newtype BiDi = BiDi (ForeignPtr UBiDi)
30 |                  deriving (Eq, Typeable)
31 | 
32 | instance Show BiDi where
33 |     show _ = "BiDi"
34 | 
35 | withBiDi :: BiDi -> (Ptr UBiDi -> IO a) -> IO a
36 | {-# INLINE withBiDi #-}
37 | withBiDi (BiDi cnv) = withForeignPtr cnv
38 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/BitMask.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE DefaultSignatures, ScopedTypeVariables #-}
 2 | 
 3 | -- From http://stackoverflow.com/a/15911213
 4 | 
 5 | module Data.Text.ICU.BitMask
 6 |        (
 7 |     -- * Bit mask twiddling API
 8 |     -- $api
 9 |     -- * Types
10 |       ToBitMask(..)
11 |     -- * Functions
12 |     , fromBitMask
13 |     , highestValueInBitMask
14 |     ) where
15 | 
16 | import Data.Bits ((.&.), (.|.))
17 | import Data.Maybe (listToMaybe)
18 | 
19 | -- $api
20 | -- Conversion to and from enumerated types representable as
21 | -- a compact bitmask.
22 | 
23 | class ToBitMask a where
24 |   toBitMask :: a -> Int
25 | 
26 | instance (ToBitMask a) => ToBitMask [a] where
27 |   toBitMask = foldr ((.|.) . toBitMask) 0
28 | 
29 | fromBitMask :: (Enum a, Bounded a, ToBitMask a) => Int -> [a]
30 | fromBitMask bm = filter inBitMask $ enumFrom minBound
31 |   where inBitMask val = (bm .&. toBitMask val) == toBitMask val
32 | 
33 | highestValueInBitMask :: (Enum a, Bounded a, ToBitMask a) => Int -> Maybe a
34 | highestValueInBitMask = listToMaybe . reverse . fromBitMask
35 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Break.hsc:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns, ForeignFunctionInterface, RecordWildCards #-}
  2 | -- |
  3 | -- Module      : Data.Text.ICU.Break
  4 | -- Copyright   : (c) 2010 Bryan O'Sullivan
  5 | --
  6 | -- License     : BSD-style
  7 | -- Maintainer  : bos@serpentine.com
  8 | -- Stability   : experimental
  9 | -- Portability : GHC
 10 | --
 11 | -- String breaking functions for Unicode, implemented as bindings to
 12 | -- the International Components for Unicode (ICU) libraries.
 13 | --
 14 | -- The text boundary positions are found according to the rules described in
 15 | -- Unicode Standard Annex #29, Text Boundaries, and Unicode Standard Annex
 16 | -- #14, Line Breaking Properties.  These are available at
 17 | -- <http://www.unicode.org/reports/tr14/> and
 18 | -- <http://www.unicode.org/reports/tr29/>.
 19 | 
 20 | module Data.Text.ICU.Break
 21 |     (
 22 |     -- * Types
 23 |       BreakIterator
 24 |     , Line(..)
 25 |     , Data.Text.ICU.Break.Word(..)
 26 |     -- * Breaking functions
 27 |     , breakCharacter
 28 |     , breakLine
 29 |     , breakSentence
 30 |     , breakWord
 31 |     , clone
 32 |     , setText
 33 |     -- * Iteration functions
 34 |     -- $indices
 35 |     , current
 36 |     , first
 37 |     , last
 38 |     , next
 39 |     , previous
 40 |     , preceding
 41 |     , following
 42 |     , isBoundary
 43 |     -- * Iterator status
 44 |     , getStatus
 45 |     , getStatuses
 46 |     -- * Locales
 47 |     , available
 48 |     ) where
 49 | 
 50 | #include <unicode/ubrk.h>
 51 | 
 52 | import Control.DeepSeq (NFData(..))
 53 | import Control.Monad (forM)
 54 | import Data.IORef (newIORef, writeIORef)
 55 | import Data.Int (Int32)
 56 | import Data.Text (Text)
 57 | import Data.Text.ICU.Break.Types (BreakIterator(..), UBreakIterator)
 58 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError)
 59 | import Data.Text.ICU.Internal (LocaleName(..), UBool, UChar, asBool, withLocaleName, TextI, UText, asUTextPtr, withUTextPtr, newICUPtr)
 60 | import Foreign.C.String (CString, peekCString)
 61 | import Foreign.C.Types (CInt(..))
 62 | import Foreign.ForeignPtr (withForeignPtr)
 63 | import Foreign.Marshal.Array (allocaArray, peekArray)
 64 | import Foreign.Marshal.Utils (with)
 65 | import Foreign.Ptr (FunPtr, Ptr, nullPtr)
 66 | import Prelude hiding (last)
 67 | import System.IO.Unsafe (unsafePerformIO)
 68 | 
 69 | -- $indices
 70 | --
 71 | -- /Important note/: All of the indices accepted and returned by
 72 | -- functions in this module are offsets into the raw UTF-16 text
 73 | -- array, /not/ a count of codepoints.
 74 | 
 75 | -- | Line break status.
 76 | data Line = Soft                -- ^ A soft line break is a position at
 77 |                                 -- which a line break is acceptable, but not
 78 |                                 -- required.
 79 |           | Hard
 80 |             deriving (Eq, Show, Enum)
 81 | 
 82 | instance NFData Line where
 83 |     rnf !_ = ()
 84 | 
 85 | -- | Word break status.
 86 | data Word = Uncategorized       -- ^ A \"word\" that does not fit into another
 87 |                                 -- category.  Includes spaces and most
 88 |                                 -- punctuation.
 89 |           | Number              -- ^ A word that appears to be a number.
 90 |           | Letter              -- ^ A word containing letters, excluding
 91 |                                 -- hiragana, katakana or ideographic
 92 |                                 -- characters.
 93 |           | Kana                -- ^ A word containing kana characters.
 94 |           | Ideograph           -- ^ A word containing ideographic characters.
 95 |             deriving (Eq, Show, Enum)
 96 | 
 97 | instance NFData Data.Text.ICU.Break.Word where
 98 |     rnf !_ = ()
 99 | 
100 | -- | Break a string on character boundaries.
101 | --
102 | -- Character boundary analysis identifies the boundaries of \"Extended
103 | -- Grapheme Clusters\", which are groupings of codepoints that should be
104 | -- treated as character-like units for many text operations.  Please see
105 | -- Unicode Standard Annex #29, Unicode Text Segmentation,
106 | -- <http://www.unicode.org/reports/tr29/> for additional information on
107 | -- grapheme clusters and guidelines on their use.
108 | breakCharacter :: LocaleName -> Text -> IO (BreakIterator ())
109 | breakCharacter = open (#const UBRK_CHARACTER) (const ())
110 | 
111 | -- | Break a string on line boundaries.
112 | --
113 | -- Line boundary analysis determines where a text string can be broken when
114 | -- line wrapping. The mechanism correctly handles punctuation and hyphenated
115 | -- words.
116 | breakLine :: LocaleName -> Text -> IO (BreakIterator Line)
117 | breakLine = open (#const UBRK_LINE) asLine
118 |   where
119 |     asLine i
120 |       | i < (#const UBRK_LINE_SOFT_LIMIT) = Soft
121 |       | i < (#const UBRK_LINE_HARD_LIMIT) = Hard
122 |       | otherwise = error $ "unknown line break status " ++ show i
123 | 
124 | -- | Break a string on sentence boundaries.
125 | --
126 | -- Sentence boundary analysis allows selection with correct interpretation
127 | -- of periods within numbers and abbreviations, and trailing punctuation
128 | -- marks such as quotation marks and parentheses.
129 | breakSentence :: LocaleName -> Text -> IO (BreakIterator ())
130 | breakSentence = open (#const UBRK_SENTENCE) (const ())
131 | 
132 | -- | Break a string on word boundaries.
133 | --
134 | -- Word boundary analysis is used by search and replace functions, as well
135 | -- as within text editing applications that allow the user to select words
136 | -- with a double click. Word selection provides correct interpretation of
137 | -- punctuation marks within and following words. Characters that are not
138 | -- part of a word, such as symbols or punctuation marks, have word breaks on
139 | -- both sides.
140 | breakWord :: LocaleName -> Text -> IO (BreakIterator Data.Text.ICU.Break.Word)
141 | breakWord = open (#const UBRK_WORD) asWord
142 |   where
143 |     asWord i
144 |       | i < (#const UBRK_WORD_NONE_LIMIT) = Uncategorized
145 |       | i < (#const UBRK_WORD_NUMBER_LIMIT) = Number
146 |       | i < (#const UBRK_WORD_LETTER_LIMIT) = Letter
147 |       | i < (#const UBRK_WORD_KANA_LIMIT) = Kana
148 |       | i < (#const UBRK_WORD_IDEO_LIMIT) = Ideograph
149 |       | otherwise = error $ "unknown word break status " ++ show i
150 | 
151 | -- | Create a new 'BreakIterator' for locating text boundaries in the
152 | -- specified locale.
153 | open :: UBreakIteratorType -> (Int32 -> a) -> LocaleName -> Text
154 |      -> IO (BreakIterator a)
155 | open brk f loc t = withLocaleName loc $ \locale -> do
156 |   r <- newIORef undefined
157 |   b <- newICUPtr (BR r f) ubrk_close $
158 |     handleError $ ubrk_open brk locale nullPtr 0
159 |   setText b t
160 |   return b
161 | 
162 | -- | Point an existing 'BreakIterator' at a new piece of text.
163 | setText :: BreakIterator a -> Text -> IO ()
164 | setText BR{..} t = do
165 |   fp <- asUTextPtr t
166 |   withUTextPtr fp $ \ ptr -> do
167 |     withForeignPtr brIter $ \p -> handleError $ ubrk_setUText p ptr
168 |     writeIORef brText fp
169 | 
170 | -- | Thread safe cloning operation.  This is substantially faster than
171 | -- creating a new 'BreakIterator' from scratch.
172 | clone :: BreakIterator a -> IO (BreakIterator a)
173 | clone BR{..} = newICUPtr (BR brText brStatus) ubrk_close $
174 |   withForeignPtr brIter $ \p ->
175 |     with 1 $ handleError . ubrk_safeClone p nullPtr
176 | 
177 | asIndex :: (Ptr UBreakIterator -> IO Int32) -> BreakIterator a -> IO (Maybe TextI)
178 | asIndex act BR{..} = do
179 |   i <- withForeignPtr brIter act
180 |   return $! if i == (#const UBRK_DONE)
181 |             then Nothing
182 |             else Just $! fromIntegral i
183 | 
184 | -- | Reset the breaker to the beginning of the text to be scanned.
185 | first :: BreakIterator a -> IO TextI
186 | first BR{..} = fromIntegral `fmap` withForeignPtr brIter ubrk_first
187 | 
188 | -- | Reset the breaker to the end of the text to be scanned.
189 | last :: BreakIterator a -> IO TextI
190 | last BR{..} = fromIntegral `fmap` withForeignPtr brIter ubrk_last
191 | 
192 | -- | Advance the iterator and break at the text boundary that follows the
193 | -- current text boundary.
194 | next :: BreakIterator a -> IO (Maybe TextI)
195 | next = asIndex ubrk_next
196 | 
197 | -- | Advance the iterator and break at the text boundary that precedes the
198 | -- current text boundary.
199 | previous :: BreakIterator a -> IO (Maybe TextI)
200 | previous = asIndex ubrk_previous
201 | 
202 | -- | Determine the text boundary preceding the specified offset.
203 | preceding :: BreakIterator a -> Int -> IO (Maybe TextI)
204 | preceding bi i = asIndex (flip ubrk_preceding (fromIntegral i)) bi
205 | 
206 | -- | Determine the text boundary following the specified offset.
207 | following :: BreakIterator a -> Int -> IO (Maybe TextI)
208 | following bi i = asIndex (flip ubrk_following (fromIntegral i)) bi
209 | 
210 | -- | Return the character index most recently returned by 'next',
211 | -- 'previous', 'first', or 'last'.
212 | current :: BreakIterator a -> IO (Maybe TextI)
213 | current = asIndex ubrk_current
214 | 
215 | -- | Return the status from the break rule that determined the most recently
216 | -- returned break position.  For rules that do not specify a status, a
217 | -- default value of @()@ is returned.
218 | getStatus :: BreakIterator a -> IO a
219 | getStatus BR{..} = brStatus `fmap` withForeignPtr brIter ubrk_getRuleStatus
220 | 
221 | -- | Return statuses from all of the break rules that determined the most
222 | -- recently returned break position.
223 | getStatuses :: BreakIterator a -> IO [a]
224 | getStatuses BR{..} =
225 |   withForeignPtr brIter $ \brk -> do
226 |     n <- handleError $ ubrk_getRuleStatusVec brk nullPtr 0
227 |     allocaArray (fromIntegral n) $ \ptr -> do
228 |       _ <- handleError $ ubrk_getRuleStatusVec brk ptr n
229 |       map brStatus `fmap` peekArray (fromIntegral n) ptr
230 | 
231 | -- | Determine whether the specified position is a boundary position.
232 | -- As a side effect, leaves the iterator pointing to the first
233 | -- boundary position at or after the given offset.
234 | isBoundary :: BreakIterator a -> Int -> IO Bool
235 | isBoundary BR{..} i = asBool `fmap` withForeignPtr brIter (flip ubrk_isBoundary (fromIntegral i))
236 | 
237 | -- | Locales for which text breaking information is available.  A
238 | -- 'BreakIterator' in a locale in this list will perform the correct
239 | -- text breaking for the locale.
240 | available :: [LocaleName]
241 | available = unsafePerformIO $ do
242 |   n <- ubrk_countAvailable
243 |   forM [0..n-1] $ \i -> ubrk_getAvailable i >>= fmap Locale . peekCString
244 | {-# NOINLINE available #-}
245 | 
246 | type UBreakIteratorType = CInt
247 | 
248 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_open" ubrk_open
249 |     :: UBreakIteratorType -> CString -> Ptr UChar -> Int32 -> Ptr UErrorCode
250 |     -> IO (Ptr UBreakIterator)
251 | 
252 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_setUText" ubrk_setUText
253 |     :: Ptr UBreakIterator -> Ptr UText -> Ptr UErrorCode
254 |     -> IO ()
255 | 
256 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_safeClone" ubrk_safeClone
257 |     :: Ptr UBreakIterator -> Ptr a -> Ptr Int32 -> Ptr UErrorCode
258 |     -> IO (Ptr UBreakIterator)
259 | 
260 | foreign import ccall unsafe "hs_text_icu.h &__hs_ubrk_close" ubrk_close
261 |     :: FunPtr (Ptr UBreakIterator -> IO ())
262 | 
263 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_current" ubrk_current
264 |     :: Ptr UBreakIterator -> IO Int32
265 | 
266 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_first" ubrk_first
267 |     :: Ptr UBreakIterator -> IO Int32
268 | 
269 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_last" ubrk_last
270 |     :: Ptr UBreakIterator -> IO Int32
271 | 
272 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_next" ubrk_next
273 |     :: Ptr UBreakIterator -> IO Int32
274 | 
275 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_previous" ubrk_previous
276 |     :: Ptr UBreakIterator -> IO Int32
277 | 
278 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_preceding" ubrk_preceding
279 |     :: Ptr UBreakIterator -> Int32 -> IO Int32
280 | 
281 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_following" ubrk_following
282 |     :: Ptr UBreakIterator -> Int32 -> IO Int32
283 | 
284 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_getRuleStatus" ubrk_getRuleStatus
285 |     :: Ptr UBreakIterator -> IO Int32
286 | 
287 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_getRuleStatusVec" ubrk_getRuleStatusVec
288 |     :: Ptr UBreakIterator -> Ptr Int32 -> Int32 -> Ptr UErrorCode -> IO Int32
289 | 
290 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_isBoundary" ubrk_isBoundary
291 |     :: Ptr UBreakIterator -> Int32 -> IO UBool
292 | 
293 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_countAvailable" ubrk_countAvailable
294 |     :: IO Int32
295 | 
296 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_getAvailable" ubrk_getAvailable
297 |     :: Int32 -> IO CString
298 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Break/Pure.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns, RecordWildCards, CPP #-}
  2 | -- |
  3 | -- Module      : Data.Text.ICU.Break.Pure
  4 | -- Copyright   : (c) 2010 Bryan O'Sullivan
  5 | --
  6 | -- License     : BSD-style
  7 | -- Maintainer  : bos@serpentine.com
  8 | -- Stability   : experimental
  9 | -- Portability : GHC
 10 | --
 11 | -- String breaking functions for Unicode, implemented as bindings to
 12 | -- the International Components for Unicode (ICU) libraries.
 13 | --
 14 | -- The text boundary positions are found according to the rules described in
 15 | -- Unicode Standard Annex #29, Text Boundaries, and Unicode Standard Annex
 16 | -- #14, Line Breaking Properties.  These are available at
 17 | -- <http://www.unicode.org/reports/tr14/> and
 18 | -- <http://www.unicode.org/reports/tr29/>.
 19 | 
 20 | module Data.Text.ICU.Break.Pure
 21 |     (
 22 |     -- * Types
 23 |       Breaker
 24 |     , Break
 25 |     , brkPrefix
 26 |     , brkBreak
 27 |     , brkSuffix
 28 |     , brkStatus
 29 |     , Line(..)
 30 |     , Data.Text.ICU.Break.Word(..)
 31 |     -- * Breaking functions
 32 |     , breakCharacter
 33 |     , breakLine
 34 |     , breakSentence
 35 |     , breakWord
 36 |     -- * Iteration
 37 |     , breaks
 38 |     , breaksRight
 39 |     ) where
 40 | 
 41 | import Control.DeepSeq (NFData(..))
 42 | import Data.Text (Text, empty)
 43 | import Data.Text.ICU.Break (Line, Word)
 44 | import Data.Text.ICU.Break.Types (BreakIterator(..))
 45 | import Data.Text.ICU.Internal (LocaleName, takeWord, dropWord)
 46 | import System.IO.Unsafe (unsafeInterleaveIO, unsafePerformIO)
 47 | import qualified Data.Text.ICU.Break as IO
 48 | 
 49 | -- | A boundary analyser.
 50 | newtype Breaker a = B (BreakIterator a)
 51 | 
 52 | new :: (LocaleName -> Text -> IO (BreakIterator a)) -> LocaleName -> Breaker a
 53 | new act loc = unsafePerformIO $ B `fmap` act loc empty
 54 | 
 55 | -- | Break a string on character boundaries.
 56 | --
 57 | -- Character boundary analysis identifies the boundaries of "Extended
 58 | -- Grapheme Clusters", which are groupings of codepoints that should be
 59 | -- treated as character-like units for many text operations.  Please see
 60 | -- Unicode Standard Annex #29, Unicode Text Segmentation,
 61 | -- <http://www.unicode.org/reports/tr29/> for additional information on
 62 | -- grapheme clusters and guidelines on their use.
 63 | breakCharacter :: LocaleName -> Breaker ()
 64 | breakCharacter = new IO.breakCharacter
 65 | 
 66 | -- | Break a string on line boundaries.
 67 | --
 68 | -- Line boundary analysis determines where a text string can be broken when
 69 | -- line wrapping. The mechanism correctly handles punctuation and hyphenated
 70 | -- words.
 71 | breakLine :: LocaleName -> Breaker Line
 72 | breakLine = new IO.breakLine
 73 | 
 74 | -- | Break a string on sentence boundaries.
 75 | --
 76 | -- Sentence boundary analysis allows selection with correct interpretation
 77 | -- of periods within numbers and abbreviations, and trailing punctuation
 78 | -- marks such as quotation marks and parentheses.
 79 | breakSentence :: LocaleName -> Breaker ()
 80 | breakSentence = new IO.breakSentence
 81 | 
 82 | -- | Break a string on word boundaries.
 83 | --
 84 | -- Word boundary analysis is used by search and replace functions, as well
 85 | -- as within text editing applications that allow the user to select words
 86 | -- with a double click. Word selection provides correct interpretation of
 87 | -- punctuation marks within and following words. Characters that are not
 88 | -- part of a word, such as symbols or punctuation marks, have word breaks on
 89 | -- both sides.
 90 | breakWord :: LocaleName -> Breaker Data.Text.ICU.Break.Word
 91 | breakWord = new IO.breakWord
 92 | 
 93 | -- | A break in a string.
 94 | data Break a = Break {
 95 |       brkPrefix :: {-# UNPACK #-} !Text -- ^ Prefix of the current break.
 96 |     , brkBreak :: {-# UNPACK #-} !Text  -- ^ Text of the current break.
 97 |     , brkSuffix :: {-# UNPACK #-} !Text -- ^ Suffix of the current break.
 98 |     , brkStatus :: !a
 99 |     -- ^ Status of the current break (only meaningful if 'Line' or 'Word').
100 |     } deriving (Eq, Show)
101 | 
102 | instance (NFData a) => NFData (Break a) where
103 |     rnf Break{..} = rnf brkStatus
104 | 
105 | -- | Return a list of all breaks in a string, from left to right.
106 | breaks :: Breaker a -> Text -> [Break a]
107 | breaks (B b) t = unsafePerformIO $ do
108 |   bi <- IO.clone b
109 |   IO.setText bi t
110 |   let go p = do
111 |         mix <- IO.next bi
112 |         case mix of
113 |           Nothing -> return []
114 |           Just n -> do
115 |             s <- IO.getStatus bi
116 |             let d = n-p
117 |                 u = dropWord p t
118 |             (Break (takeWord p t) (takeWord d u) (dropWord d u) s :) `fmap` go n
119 |   unsafeInterleaveIO $ go =<< IO.first bi
120 | 
121 | -- | Return a list of all breaks in a string, from right to left.
122 | breaksRight :: Breaker a -> Text -> [Break a]
123 | breaksRight (B b) t = unsafePerformIO $ do
124 |   bi <- IO.clone b
125 |   IO.setText bi t
126 |   let go p = do
127 |         mix <- IO.previous bi
128 |         case mix of
129 |           Nothing -> return []
130 |           Just n -> do
131 |             s <- IO.getStatus bi
132 |             let d = p-n
133 |                 u = dropWord n t
134 |             (Break (takeWord n t) (takeWord d u) (dropWord d u) s :) `fmap` go n
135 |   unsafeInterleaveIO $ go =<< IO.last bi
136 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Break/Types.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE EmptyDataDecls #-}
 2 | -- |
 3 | -- Module      : Data.Text.ICU.Break.Internal
 4 | -- Copyright   : (c) 2010 Bryan O'Sullivan
 5 | --
 6 | -- License     : BSD-style
 7 | -- Maintainer  : bos@serpentine.com
 8 | -- Stability   : experimental
 9 | -- Portability : GHC
10 | 
11 | module Data.Text.ICU.Break.Types
12 |     (
13 |       BreakIterator(..)
14 |     , UBreakIterator
15 |     ) where
16 | 
17 | import Data.IORef (IORef)
18 | import Data.Int (Int32)
19 | import Foreign.ForeignPtr (ForeignPtr)
20 | import Data.Text.ICU.Internal (UTextPtr)
21 | 
22 | -- A boundary breaker.
23 | data BreakIterator a = BR {
24 |       brText :: IORef UTextPtr
25 |     , brStatus :: Int32 -> a
26 |     , brIter :: ForeignPtr UBreakIterator
27 |     }
28 | 
29 | data UBreakIterator
30 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/CaseMap.hsc:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE DeriveDataTypeable, EmptyDataDecls, ForeignFunctionInterface #-}
 2 | -- |
 3 | -- Module      : Data.Text.ICU.CaseMap
 4 | -- Copyright   : (c) 2010 Bryan O'Sullivan
 5 | --
 6 | -- License     : BSD-style
 7 | -- Maintainer  : bos@serpentine.com
 8 | -- Stability   : experimental
 9 | -- Portability : GHC
10 | --
11 | -- Character set case mapping functions for Unicode, implemented as
12 | -- bindings to the International Components for Unicode (ICU)
13 | -- libraries.
14 | 
15 | module Data.Text.ICU.CaseMap
16 |     (
17 |       CaseMap
18 |     , CaseOption(..)
19 |     , caseMap
20 |     ) where
21 | 
22 | #include <unicode/ucasemap.h>
23 | #include <unicode/uchar.h>
24 | 
25 | import Data.Bits ((.|.))
26 | import Data.List (foldl')
27 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError)
28 | import Data.Text.ICU.Internal (LocaleName, withLocaleName)
29 | import Data.Typeable (Typeable)
30 | import Data.Word (Word32)
31 | import Foreign.C.String (CString)
32 | import Foreign.ForeignPtr (ForeignPtr)
33 | import Foreign.Ptr (FunPtr, Ptr)
34 | import System.IO.Unsafe (unsafePerformIO)
35 | 
36 | data UCaseMap
37 | 
38 | data CaseMap = CaseMap {-# UNPACK #-} !(ForeignPtr UCaseMap)
39 |                deriving (Eq, Typeable)
40 | 
41 | data CaseOption = FoldCaseExcludeSpecialI
42 |                 | TitleCaseNoLowerCase
43 |                 | TitleCaseNoBreakAdjustment
44 |                   deriving (Eq, Enum)
45 | 
46 | instance NFData CaseOption where
47 |     rnf !_ = ()
48 | 
49 | fromCaseMapOption :: CaseOption -> Word32
50 | fromCaseMapOption FoldCaseExcludeSpecialI    = #const U_FOLD_CASE_EXCLUDE_SPECIAL_I
51 | fromCaseMapOption TitleCaseNoLowerCase       = #const U_TITLECASE_NO_LOWERCASE
52 | fromCaseMapOption TitleCaseNoBreakAdjustment = #const U_TITLECASE_NO_BREAK_ADJUSTMENT
53 | 
54 | reduceCaseMapOptions :: [CaseOption] -> Word32
55 | reduceCaseMapOptions = foldl' (.|.) (#const U_FOLD_CASE_DEFAULT) .
56 |                        map fromCaseMapOption
57 | 
58 | caseMap :: LocaleName -> [CaseOption] -> CaseMap
59 | caseMap name opts = unsafePerformIO $ newICUPtr CaseMap ucasemap_close $
60 |   withLocaleName name $ \nptr -> handleError $ ucasemap_open nptr $
61 |     reduceCaseMapOptions $ opts
62 | 
63 | foreign import ccall unsafe "hs_text_icu.h __hs_ucasemap_open" ucasemap_open
64 |     :: CString -> Word32 -> Ptr UErrorCode -> IO (Ptr UCaseMap)
65 | 
66 | foreign import ccall unsafe "hs_text_icu.h &__hs_ucasemap_close" ucasemap_close
67 |     :: FunPtr (Ptr UCaseMap -> IO ())
68 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/CharsetDetection.hsc:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE ForeignFunctionInterface #-}
  2 | -- |
  3 | -- Module      : Data.Text.ICU.CharsetDetection
  4 | -- Copyright   : (c) 2017 Zac Slade
  5 | --
  6 | -- License     : BSD-style
  7 | -- Maintainer  : bos@serpentine.com
  8 | -- Stability   : experimental
  9 | -- Portability : GHC
 10 | --
 11 | -- Access to the Unicode Character Set Detection facilities, implemented in
 12 | -- the International Components for Unicode (ICU) libraries.
 13 | --
 14 | -- For more information see the \"Character Set Detection\" chapter
 15 | -- in the ICU User Guide
 16 | -- <http://userguide.icu-project.org/conversion/detection>.
 17 | module Data.Text.ICU.CharsetDetection
 18 |     (
 19 |       setText
 20 |     , detect
 21 |     , mkCharsetDetector
 22 |     , withCharsetDetector
 23 |     , wrapUCharsetMatch
 24 |     , CharsetMatch
 25 |     , CharsetDetector
 26 |     , getConfidence
 27 |     , getName
 28 |     , getLanguage
 29 |     ) where
 30 | 
 31 | import Foreign.Ptr (Ptr)
 32 | import Foreign.C.String (CString)
 33 | import Foreign.C.Types (CChar)
 34 | import qualified Data.ByteString as BS
 35 | import Data.ByteString (ByteString)
 36 | import qualified Data.Text.Encoding as TE
 37 | import Data.Text (Text)
 38 | 
 39 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError)
 40 | import Data.Text.ICU.CharsetDetection.Internal (UCharsetMatch, UCharsetDetector,
 41 |                                                 CharsetDetector, CharsetMatch,
 42 |                                                 mkCharsetDetector,
 43 |                                                 withCharsetDetector,
 44 |                                                 withCharsetMatch,
 45 |                                                 wrapUCharsetMatch)
 46 | 
 47 | #include <unicode/ucsdet.h>
 48 | 
 49 | -- | From the ICU C API documentation:
 50 | -- "Character set detection is at best an imprecise operation. The
 51 | -- detection process will attempt to identify the charset that best matches
 52 | -- the characteristics of the byte data, but the process is partly statistical
 53 | -- in nature, and the results can not be guaranteed to always be correct.
 54 | --
 55 | -- For best accuracy in charset detection, the input data should be primarily
 56 | -- in a single language, and a minimum of a few hundred bytes worth of plain
 57 | -- text in the language are needed. The detection process will attempt to
 58 | -- ignore html or xml style markup that could otherwise obscure the content."
 59 | 
 60 | -- | Use the first 512 bytes, if available, as the text in the
 61 | -- 'CharsetDetector' object. This function is low-level and used by the more
 62 | -- high-level 'detect' function.
 63 | setText :: ByteString -> CharsetDetector -> IO ()
 64 | setText bs ucsd = withCharsetDetector ucsd go
 65 |   where
 66 |     go u = if BS.length bs < 512
 67 |               then BS.useAsCStringLen bs (\(text,size) -> handleError $ ucsdet_setText u text size)
 68 |               else BS.useAsCStringLen (BS.take 512 bs) (\(text,size) -> handleError $ ucsdet_setText u text size)
 69 | 
 70 | -- | Attempt to perform a detection without an input filter. The best match
 71 | -- will be returned.
 72 | detect :: ByteString -> IO CharsetMatch
 73 | detect bs = do
 74 |     ucsd <- mkCharsetDetector
 75 |     setText bs ucsd
 76 |     wrapUCharsetMatch ucsd $ withCharsetDetector ucsd (handleError . ucsdet_detect)
 77 | 
 78 | -- | See the confidence score from 0-100 of the 'CharsetMatch' object.
 79 | getConfidence :: CharsetMatch -> IO Int
 80 | getConfidence ucm = withCharsetMatch ucm $ handleError . ucsdet_getConfidence
 81 | 
 82 | -- | Extract the character set encoding name from the 'CharsetMatch'
 83 | -- object.
 84 | getName :: CharsetMatch -> IO Text
 85 | getName ucsm = do
 86 |     bs <- withCharsetMatch ucsm (handleError . ucsdet_getName) >>= BS.packCString
 87 |     return $ TE.decodeUtf8 bs
 88 | 
 89 | -- | Extracts the three letter ISO code for the language encoded in the
 90 | -- 'CharsetMatch'.
 91 | getLanguage :: CharsetMatch -> IO Text
 92 | getLanguage ucsm = do
 93 |     bs <- withCharsetMatch ucsm (handleError . ucsdet_getLanguage) >>= BS.packCString
 94 |     return $ TE.decodeUtf8 bs
 95 | 
 96 | foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_setText" ucsdet_setText
 97 |     :: Ptr UCharsetDetector -> Ptr CChar -> Int -> Ptr UErrorCode -> IO ()
 98 | 
 99 | foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_detect" ucsdet_detect
100 |     :: Ptr UCharsetDetector -> Ptr UErrorCode -> IO (Ptr UCharsetMatch)
101 | 
102 | foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_getConfidence" ucsdet_getConfidence
103 |     :: Ptr UCharsetMatch -> Ptr UErrorCode -> IO Int
104 | 
105 | foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_getName" ucsdet_getName
106 |     :: Ptr UCharsetMatch -> Ptr UErrorCode -> IO CString
107 | 
108 | foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_getLanguage" ucsdet_getLanguage
109 |     :: Ptr UCharsetMatch -> Ptr UErrorCode -> IO CString
110 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/CharsetDetection/Internal.hsc:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE DeriveDataTypeable, ForeignFunctionInterface, EmptyDataDecls #-}
 2 | -- |
 3 | -- Module      : Data.Text.ICU.CharsetDetection.Internal
 4 | -- Copyright   : (c) 2017 Zac Slade
 5 | --
 6 | -- License     : BSD-style
 7 | -- Maintainer  : bos@serpentine.com
 8 | -- Stability   : experimental
 9 | -- Portability : GHC
10 | --
11 | -- Access to the Unicode Character Set Detection facilities, implemented in
12 | -- the International Components for Unicode (ICU) libraries.
13 | --
14 | -- For more information see the \"Character Set Detection\" chapter
15 | -- in the ICU User Guide
16 | -- <http://userguide.icu-project.org/conversion/detection>.
17 | module Data.Text.ICU.CharsetDetection.Internal
18 |     (
19 |       UCharsetDetector
20 |     , UCharsetMatch
21 |     , CharsetMatch(..)
22 |     , CharsetDetector(..)
23 |     , withCharsetDetector
24 |     , wrapUCharsetDetector
25 |     , wrapUCharsetMatch
26 |     , mkCharsetDetector
27 |     , withCharsetMatch
28 |     ) where
29 | 
30 | import Data.Typeable (Typeable)
31 | import Foreign.ForeignPtr (ForeignPtr, withForeignPtr)
32 | import Foreign.Ptr (FunPtr, Ptr)
33 | 
34 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError)
35 | import Data.Text.ICU.Internal (newICUPtr)
36 | 
37 | #include <unicode/ucsdet.h>
38 | 
39 | -- | Opaque handle to a character set detector
40 | data UCharsetDetector
41 | 
42 | -- | Handy wrapper for the pointer to the 'UCharsetDetector'. We must
43 | -- always call ucsdet_close on any UCharsetDetector when we are done. The
44 | -- 'withCharsetDetector' and 'wrapUCharsetDetector' functions simplify
45 | -- management of the pointers.
46 | data CharsetDetector = CharsetDetector {
47 |     charsetDetectorPtr :: {-# UNPACK #-} !(ForeignPtr UCharsetDetector)
48 | } deriving (Typeable)
49 | 
50 | mkCharsetDetector :: IO CharsetDetector
51 | mkCharsetDetector = wrapUCharsetDetector $ handleError ucsdet_open
52 | 
53 | -- | Temporarily unwraps an 'CharsetDetector' to perform operations on its
54 | -- raw 'UCharsetDetector' handle.
55 | withCharsetDetector :: CharsetDetector -> (Ptr UCharsetDetector -> IO a) -> IO a
56 | withCharsetDetector (CharsetDetector ucsd) = withForeignPtr ucsd
57 | {-# INLINE withCharsetDetector #-}
58 | 
59 | -- | Wraps a raw 'UCharsetDetector' in an 'CharsetDetector', closing the
60 | -- handle when the last reference to the object is dropped.
61 | wrapUCharsetDetector :: IO (Ptr UCharsetDetector) -> IO CharsetDetector
62 | wrapUCharsetDetector = newICUPtr CharsetDetector ucsdet_close
63 | {-# INLINE wrapUCharsetDetector #-}
64 | 
65 | -- | Opaque handle to a character set match
66 | data UCharsetMatch
67 | 
68 | -- | Opaque character set match handle. The memory backing these objects is
69 | -- managed entirely by the ICU C library.
70 | -- TODO: UCharsetMatch is reset after the setText call. We need to handle it.
71 | data CharsetMatch =
72 |     CharsetMatch
73 |     { charsetMatchPtr :: {-# UNPACK #-} !(Ptr UCharsetMatch)
74 |     , charsetMatchDetector :: CharsetDetector
75 |     -- ^ keep reference since UCharsetMatch object is owned
76 |     -- by the UCharsetDetector.
77 |     }
78 |     deriving (Typeable)
79 | 
80 | wrapUCharsetMatch :: CharsetDetector -> IO (Ptr UCharsetMatch) -> IO CharsetMatch
81 | wrapUCharsetMatch cd = fmap $ flip CharsetMatch cd
82 | 
83 | withCharsetMatch :: CharsetMatch -> (Ptr UCharsetMatch -> IO a) -> IO a
84 | withCharsetMatch (CharsetMatch ucsm _) f = f ucsm
85 | 
86 | foreign import ccall unsafe "hs_text_icu.h &__hs_ucsdet_close" ucsdet_close
87 |     :: FunPtr (Ptr UCharsetDetector -> IO ())
88 | 
89 | foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_open" ucsdet_open
90 |     :: Ptr UErrorCode -> IO (Ptr UCharsetDetector)
91 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Collate/Internal.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE DeriveDataTypeable, EmptyDataDecls, ForeignFunctionInterface #-}
 2 | -- |
 3 | -- Module      : Data.Text.ICU.Collate.Internal
 4 | -- Copyright   : (c) 2010 Bryan O'Sullivan
 5 | --
 6 | -- License     : BSD-style
 7 | -- Maintainer  : bos@serpentine.com
 8 | -- Stability   : experimental
 9 | -- Portability : GHC
10 | --
11 | -- Internals of the string collation infrastructure.
12 | 
13 | module Data.Text.ICU.Collate.Internal
14 |     (
15 |     -- * Unicode collation API
16 |       MCollator(..)
17 |     , Collator(..)
18 |     , UCollator
19 |     , withCollator
20 |     , wrap
21 |     ) where
22 | 
23 | import Data.Typeable (Typeable)
24 | import Foreign.ForeignPtr (ForeignPtr, withForeignPtr)
25 | import Foreign.Ptr (FunPtr, Ptr)
26 | import Data.Text.ICU.Internal (newICUPtr)
27 | 
28 | -- $api
29 | --
30 | 
31 | data UCollator
32 | 
33 | -- | String collator type.
34 | data MCollator = MCollator {-# UNPACK #-} !(ForeignPtr UCollator)
35 |                  deriving (Typeable)
36 | 
37 | -- | String collator type.
38 | newtype Collator = C MCollator
39 |     deriving (Typeable)
40 | 
41 | withCollator :: MCollator -> (Ptr UCollator -> IO a) -> IO a
42 | withCollator (MCollator col) action = withForeignPtr col action
43 | {-# INLINE withCollator #-}
44 | 
45 | wrap :: IO (Ptr UCollator) -> IO MCollator
46 | wrap = newICUPtr MCollator ucol_close
47 | {-# INLINE wrap #-}
48 | 
49 | foreign import ccall unsafe "hs_text_icu.h &__hs_ucol_close" ucol_close
50 |     :: FunPtr (Ptr UCollator -> IO ())
51 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Collate/Pure.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE DeriveDataTypeable, ForeignFunctionInterface, ScopedTypeVariables  #-}
  2 | -- |
  3 | -- Module      : Data.Text.ICU.Collate.Pure
  4 | -- Copyright   : (c) 2010 Bryan O'Sullivan
  5 | --
  6 | -- License     : BSD-style
  7 | -- Maintainer  : bos@serpentine.com
  8 | -- Stability   : experimental
  9 | -- Portability : GHC
 10 | --
 11 | -- Pure string collation functions for Unicode, implemented as
 12 | -- bindings to the International Components for Unicode (ICU)
 13 | -- libraries.
 14 | --
 15 | -- For the impure collation API (which is richer, but less easy to
 16 | -- use), see the "Data.Text.ICU.Collate" module.
 17 | 
 18 | module Data.Text.ICU.Collate.Pure
 19 |     (
 20 |     -- * Unicode collation API
 21 |     -- $api
 22 |       Collator
 23 |     , collator
 24 |     , collatorWith
 25 |     , collatorFromRules
 26 |     , collatorFromRulesWith
 27 |     , collate
 28 |     , collateIter
 29 |     , rules
 30 |     , sortKey
 31 |     , uca
 32 |     ) where
 33 | 
 34 | import qualified Control.Exception as E
 35 | import Control.Monad (forM_)
 36 | import Data.ByteString (ByteString)
 37 | import Data.Text (Text)
 38 | import Data.Text.ICU.Error.Internal (ParseError(..))
 39 | import Data.Text.ICU.Collate.Internal (Collator(..))
 40 | import Data.Text.ICU.Internal (CharIterator, LocaleName(..))
 41 | import System.IO.Unsafe (unsafePerformIO)
 42 | import qualified Data.Text.ICU.Collate as IO
 43 | 
 44 | -- $api
 45 | --
 46 | 
 47 | -- | Create an immutable 'Collator' for comparing strings.
 48 | --
 49 | -- If 'Root' is passed as the locale, UCA collation rules will be
 50 | -- used.
 51 | collator :: LocaleName -> Collator
 52 | collator loc = unsafePerformIO $ C `fmap` IO.open loc
 53 | 
 54 | -- | Create an immutable 'Collator' with the given 'Attribute's.
 55 | collatorWith :: LocaleName -> [IO.Attribute] -> Collator
 56 | collatorWith loc atts = unsafePerformIO $ do
 57 |   mc <- IO.open loc
 58 |   forM_ atts $ IO.setAttribute mc
 59 |   return (C mc)
 60 | 
 61 | -- | Create an immutable 'Collator' from the given collation rules.
 62 | collatorFromRules :: Text -> Either ParseError Collator
 63 | collatorFromRules rul = collatorFromRulesWith rul []
 64 | 
 65 | -- | Create an immutable 'Collator' from the given collation rules with the given 'Attribute's.
 66 | collatorFromRulesWith :: Text -> [IO.Attribute] -> Either ParseError Collator
 67 | collatorFromRulesWith rul atts = unsafePerformIO $
 68 |   (Right `fmap` openAndSetAtts)
 69 |   `E.catch` \(err::ParseError) -> return (Left err)
 70 |   where
 71 |     openAndSetAtts = do
 72 |       mc <- IO.openRules rul Nothing Nothing
 73 |       forM_ atts $ IO.setAttribute mc
 74 |       return (C mc)
 75 | 
 76 | -- | Get rules for the given 'Collator'.
 77 | rules :: Collator -> Text
 78 | rules (C c) = unsafePerformIO $ IO.getRules c
 79 | 
 80 | -- | Compare two strings.
 81 | collate :: Collator -> Text -> Text -> Ordering
 82 | collate (C c) a b = unsafePerformIO $ IO.collate c a b
 83 | {-# INLINE collate #-}
 84 | 
 85 | -- | Compare two 'CharIterator's.
 86 | --
 87 | -- If either iterator was constructed from a 'ByteString', it does not
 88 | -- need to be copied or converted beforehand, so this function can be
 89 | -- quite cheap.
 90 | collateIter :: Collator -> CharIterator -> CharIterator -> Ordering
 91 | collateIter (C c) a b = unsafePerformIO $ IO.collateIter c a b
 92 | {-# INLINE collateIter #-}
 93 | 
 94 | -- | Create a key for sorting the 'Text' using the given 'Collator'.
 95 | -- The result of comparing two 'ByteString's that have been
 96 | -- transformed with 'sortKey' will be the same as the result of
 97 | -- 'collate' on the two untransformed 'Text's.
 98 | sortKey :: Collator -> Text -> ByteString
 99 | sortKey (C c) = unsafePerformIO . IO.sortKey c
100 | {-# INLINE sortKey #-}
101 | 
102 | -- | A 'Collator' that uses the Unicode Collation Algorithm (UCA).
103 | uca :: Collator
104 | uca = collator Root
105 | {-# NOINLINE uca #-}
106 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Convert.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE ForeignFunctionInterface, CPP #-}
  2 | -- |
  3 | -- Module      : Data.Text.ICU.Convert
  4 | -- Copyright   : (c) 2009, 2010 Bryan O'Sullivan
  5 | --
  6 | -- License     : BSD-style
  7 | -- Maintainer  : bos@serpentine.com
  8 | -- Stability   : experimental
  9 | -- Portability : GHC
 10 | --
 11 | -- Character set conversion functions for Unicode, implemented as
 12 | -- bindings to the International Components for Unicode (ICU)
 13 | -- libraries.
 14 | module Data.Text.ICU.Convert
 15 |     (
 16 |     -- * Character set conversion
 17 |       Converter
 18 |     -- ** Basic functions
 19 |     , open
 20 |     , fromUnicode
 21 |     , toUnicode
 22 |     -- ** Converter metadata
 23 |     , getName
 24 |     , usesFallback
 25 |     , isAmbiguous
 26 |     -- * Functions for controlling global behavior
 27 |     , getDefaultName
 28 |     , setDefaultName
 29 |     -- * Miscellaneous functions
 30 |     , compareNames
 31 |     , aliases
 32 |     -- * Metadata
 33 |     , converterNames
 34 |     , standardNames
 35 |     ) where
 36 | 
 37 | import Data.ByteString.Internal (ByteString, createAndTrim)
 38 | import Data.ByteString.Unsafe (unsafeUseAsCStringLen)
 39 | import Data.Int (Int32)
 40 | import Data.Text (Text)
 41 | import Data.Text.Foreign (fromPtr, useAsPtr)
 42 | #if !MIN_VERSION_text(2,0,0)
 43 | import Data.Text.ICU.Internal (UChar)
 44 | #endif
 45 | import Data.Text.ICU.Internal (lengthWord)
 46 | import Data.Text.ICU.Convert.Internal
 47 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError)
 48 | import Data.Word (Word8, Word16)
 49 | import Foreign.C.String (CString, peekCString, withCString)
 50 | import Foreign.C.Types (CInt(..))
 51 | import Foreign.Marshal.Array (allocaArray)
 52 | import Foreign.Ptr (FunPtr, Ptr)
 53 | import System.IO.Unsafe (unsafePerformIO)
 54 | import Data.Text.ICU.Internal (UBool, asBool, asOrdering, withName, newICUPtr)
 55 | 
 56 | -- | Do a fuzzy compare of two converter/alias names.  The comparison
 57 | -- is case-insensitive, ignores leading zeroes if they are not
 58 | -- followed by further digits, and ignores all but letters and digits.
 59 | -- Thus the strings @\"UTF-8\"@, @\"utf_8\"@, @\"u*T\@f08\"@ and
 60 | -- @\"Utf 8\"@ are exactly equivalent.  See section 1.4, Charset Alias
 61 | -- Matching in Unicode Technical Standard #22 at
 62 | -- <http://www.unicode.org/reports/tr22/>
 63 | compareNames :: String -> String -> Ordering
 64 | compareNames a b =
 65 |   unsafePerformIO . withCString a $ \aptr ->
 66 |     fmap asOrdering . withCString b $ ucnv_compareNames aptr
 67 | 
 68 | -- | Create a 'Converter' with the name of a coded character set
 69 | -- specified as a string.  The actual name will be resolved with the
 70 | -- alias file using a case-insensitive string comparison that ignores
 71 | -- leading zeroes and all non-alphanumeric characters.  E.g., the
 72 | -- names @\"UTF8\"@, @\"utf-8\"@, @\"u*T\@f08\"@ and @\"Utf 8\"@ are
 73 | -- all equivalent (see also 'compareNames').  If an empty string is
 74 | -- passed for the converter name, it will create one with the
 75 | -- 'getDefaultName' return value.
 76 | --
 77 | -- A converter name may contain options like a locale specification to
 78 | -- control the specific behavior of the newly instantiated converter.
 79 | -- The meaning of the options depends on the particular converter.  If
 80 | -- an option is not defined for or recognized by a given converter,
 81 | -- then it is ignored.
 82 | --
 83 | -- Options are appended to the converter name string, with a comma
 84 | -- between the name and the first option and also between adjacent
 85 | -- options.
 86 | --
 87 | -- If the alias is ambiguous, then the preferred converter is used.
 88 | --
 89 | -- The conversion behavior and names can vary between platforms. ICU
 90 | -- may convert some characters differently from other
 91 | -- platforms. Details on this topic are in the ICU User's Guide at
 92 | -- <http://icu-project.org/userguide/conversion.html>. Aliases
 93 | -- starting with a @\"cp\"@ prefix have no specific meaning other than
 94 | -- its an alias starting with the letters @\"cp\"@. Please do not
 95 | -- associate any meaning to these aliases.
 96 | open :: String                  -- ^ Name of the converter to use.
 97 |      -> Maybe Bool              -- ^ Whether to use fallback mappings
 98 |                                 -- (see 'usesFallback' for details).
 99 |      -> IO Converter
100 | open name mf = do
101 |   c <- newICUPtr Converter ucnv_close $ withName name (handleError . ucnv_open)
102 |   case mf of
103 |     Just f -> withConverter c $ \p -> ucnv_setFallback p . fromIntegral . fromEnum $ f
104 |     _ -> return ()
105 |   return c
106 | 
107 | -- | Encode a Unicode string into a code page string using the given converter.
108 | fromUnicode :: Converter -> Text -> ByteString
109 | fromUnicode cnv t =
110 |   unsafePerformIO . useAsPtr t $ \tptr tlen ->
111 |     withConverter cnv $ \cptr -> do
112 |       let capacity = fromIntegral . ucnv_max_bytes_for_string cptr . fromIntegral $
113 |                      lengthWord t
114 |       createAndTrim (fromIntegral capacity) $ \sptr ->
115 |         fmap fromIntegral . handleError $
116 | #if MIN_VERSION_text(2,0,0)
117 |            ucnv_fromAlgorithmic_UTF8
118 | #else
119 |            ucnv_fromUChars
120 | #endif
121 |            cptr sptr capacity tptr (fromIntegral tlen)
122 | 
123 | -- | Decode an encoded string into a Unicode string using the given converter.
124 | toUnicode :: Converter -> ByteString -> Text
125 | toUnicode cnv bs =
126 |   unsafePerformIO . unsafeUseAsCStringLen bs $ \(sptr, slen) ->
127 |     withConverter cnv $ \cptr -> do
128 |       let (capacity, conv) =
129 | #if MIN_VERSION_text(2,0,0)
130 |               (slen * 4, ucnv_toAlgorithmic_UTF8)
131 | #else
132 |               (slen * 2, ucnv_toUChars)
133 | #endif
134 |       allocaArray capacity $ \tptr ->
135 |         fromPtr tptr =<< (fmap fromIntegral . handleError $
136 |                           conv cptr tptr (fromIntegral capacity) sptr
137 |                                         (fromIntegral slen))
138 | 
139 | -- | Determines whether the converter uses fallback mappings or not.
140 | -- This flag has restrictions.  Regardless of this flag, the converter
141 | -- will always use fallbacks from Unicode Private Use codepoints, as
142 | -- well as reverse fallbacks (to Unicode).  For details see \".ucm
143 | -- File Format\" in the Conversion Data chapter of the ICU User Guide:
144 | -- <http://www.icu-project.org/userguide/conversion-data.html#ucmformat>
145 | usesFallback :: Converter -> Bool
146 | usesFallback cnv = unsafePerformIO $
147 |                    asBool `fmap` withConverter cnv ucnv_usesFallback
148 | 
149 | -- | Returns the current default converter name. If you want to 'open'
150 | -- a default converter, you do not need to use this function.  It is
151 | -- faster to pass the empty string to 'open' the default converter.
152 | getDefaultName :: IO String
153 | getDefaultName = peekCString =<< ucnv_getDefaultName
154 | 
155 | -- | Indicates whether the converter contains ambiguous mappings of
156 | -- the same character or not.
157 | isAmbiguous :: Converter -> Bool
158 | isAmbiguous cnv = asBool . unsafePerformIO $ withConverter cnv ucnv_isAmbiguous
159 | 
160 | -- | Sets the current default converter name. If this function needs
161 | -- to be called, it should be called during application
162 | -- initialization. Most of the time, the results from 'getDefaultName'
163 | -- or 'open' with an empty string argument is sufficient for your
164 | -- application.
165 | --
166 | -- /Note/: this function is not thread safe. /Do not/ call this
167 | -- function when /any/ ICU function is being used from more than one
168 | -- thread!
169 | setDefaultName :: String -> IO ()
170 | setDefaultName s = withCString s $ ucnv_setDefaultName
171 | 
172 | -- | A list of the canonical names of all available converters.
173 | converterNames :: [String]
174 | {-# NOINLINE converterNames #-}
175 | converterNames = unsafePerformIO $
176 |   mapM ((peekCString =<<) . ucnv_getAvailableName) [0..ucnv_countAvailable-1]
177 | 
178 | -- | The list of supported standard names.
179 | standardNames :: [String]
180 | {-# NOINLINE standardNames #-}
181 | standardNames = filter (not . null) . unsafePerformIO $
182 |   mapM ((peekCString =<<) . handleError . ucnv_getStandard) [0..ucnv_countStandards-1]
183 | 
184 | -- | Return the aliases for a given converter or alias name.
185 | aliases :: String -> [String]
186 | aliases name = unsafePerformIO . withCString name $ \ptr -> do
187 |   count <- handleError $ ucnv_countAliases ptr
188 |   if count == 0
189 |     then return []
190 |     else mapM ((peekCString =<<) . handleError . ucnv_getAlias ptr) [0..count-1]
191 | 
192 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_open" ucnv_open
193 |     :: CString -> Ptr UErrorCode -> IO (Ptr UConverter)
194 | 
195 | foreign import ccall unsafe "hs_text_icu.h &__hs_ucnv_close" ucnv_close
196 |     :: FunPtr (Ptr UConverter -> IO ())
197 | 
198 | foreign import ccall unsafe "__hs_ucnv_get_max_bytes_for_string" ucnv_max_bytes_for_string
199 |     :: Ptr UConverter -> CInt -> CInt
200 | 
201 | #if MIN_VERSION_text(2,0,0)
202 | 
203 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_toAlgorithmic_UTF8" ucnv_toAlgorithmic_UTF8
204 |     :: Ptr UConverter -> Ptr Word8 -> Int32 -> CString -> Int32
205 |     -> Ptr UErrorCode -> IO Int32
206 | 
207 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_fromAlgorithmic_UTF8" ucnv_fromAlgorithmic_UTF8
208 |     :: Ptr UConverter -> Ptr Word8 -> Int32 -> Ptr Word8 -> Int32
209 |     -> Ptr UErrorCode -> IO Int32
210 | 
211 | #else
212 | 
213 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_toUChars" ucnv_toUChars
214 |     :: Ptr UConverter -> Ptr UChar -> Int32 -> CString -> Int32
215 |     -> Ptr UErrorCode -> IO Int32
216 | 
217 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_fromUChars" ucnv_fromUChars
218 |     :: Ptr UConverter -> Ptr Word8 -> Int32 -> Ptr UChar -> Int32
219 |     -> Ptr UErrorCode -> IO Int32
220 | 
221 | #endif
222 | 
223 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_compareNames" ucnv_compareNames
224 |     :: CString -> CString -> IO CInt
225 | 
226 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getDefaultName" ucnv_getDefaultName
227 |     :: IO CString
228 | 
229 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_setDefaultName" ucnv_setDefaultName
230 |     :: CString -> IO ()
231 | 
232 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_countAvailable" ucnv_countAvailable
233 |     :: Int32
234 | 
235 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getAvailableName" ucnv_getAvailableName
236 |     :: Int32 -> IO CString
237 | 
238 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_countAliases" ucnv_countAliases
239 |     :: CString -> Ptr UErrorCode -> IO Word16
240 | 
241 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getAlias" ucnv_getAlias
242 |     :: CString -> Word16 -> Ptr UErrorCode -> IO CString
243 | 
244 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_countStandards" ucnv_countStandards
245 |     :: Word16
246 | 
247 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getStandard" ucnv_getStandard
248 |     :: Word16 -> Ptr UErrorCode -> IO CString
249 | 
250 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_usesFallback" ucnv_usesFallback
251 |     :: Ptr UConverter -> IO UBool
252 | 
253 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_setFallback" ucnv_setFallback
254 |     :: Ptr UConverter -> UBool -> IO ()
255 | 
256 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_isAmbiguous" ucnv_isAmbiguous
257 |     :: Ptr UConverter -> IO UBool
258 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Convert/Internal.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE DeriveDataTypeable, EmptyDataDecls, ForeignFunctionInterface #-}
 2 | -- |
 3 | -- Module      : Data.Text.ICU.Convert.Internal
 4 | -- Copyright   : (c) Bryan O'Sullivan 2009
 5 | --
 6 | -- License     : BSD-style
 7 | -- Maintainer  : bos@serpentine.com
 8 | -- Stability   : experimental
 9 | -- Portability : GHC
10 | --
11 | -- Low-level character set types and functions.
12 | 
13 | module Data.Text.ICU.Convert.Internal
14 |     (
15 |       Converter(..)
16 |     , UConverter
17 |     , getName
18 |     , withConverter
19 |     ) where
20 | 
21 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError)
22 | import Data.Typeable (Typeable)
23 | import Foreign.C.String (CString, peekCString)
24 | import Foreign.ForeignPtr (ForeignPtr, withForeignPtr)
25 | import Foreign.Ptr (Ptr)
26 | import System.IO.Unsafe (unsafePerformIO)
27 | 
28 | data UConverter
29 | 
30 | -- | Character set converter type.  /Note/: this structure is not
31 | -- thread safe. It is /not/ safe to use value of this type
32 | -- simultaneously from multiple threads.
33 | data Converter = Converter {-# UNPACK #-} !(ForeignPtr UConverter)
34 |                  deriving (Eq, Typeable)
35 | 
36 | instance Show Converter where
37 |     show c = "Converter " ++ show (getName c)
38 | 
39 | withConverter :: Converter -> (Ptr UConverter -> IO a) -> IO a
40 | {-# INLINE withConverter #-}
41 | withConverter (Converter cnv) action = withForeignPtr cnv action
42 | 
43 | -- | Gets the internal, canonical name of the converter.
44 | getName :: Converter -> String
45 | getName cnv = unsafePerformIO .
46 |   withConverter cnv $ \ptr ->
47 |     peekCString =<< handleError (ucnv_getName ptr)
48 | 
49 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getName" ucnv_getName
50 |     :: Ptr UConverter -> Ptr UErrorCode -> IO CString
51 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Enumerator.hsc:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE EmptyDataDecls, BangPatterns, ForeignFunctionInterface, RecordWildCards #-}
 2 | -- |
 3 | -- Module      : Data.Text.ICU.Calendar
 4 | -- Copyright   : (c) 2021 Torsten Kemps-Benedix
 5 | --
 6 | -- License     : BSD-style
 7 | -- Maintainer  : bos@serpentine.com
 8 | -- Stability   : experimental
 9 | -- Portability : GHC
10 | --
11 | -- Calendar functions implemented as bindings to
12 | -- the International Components for Unicode (ICU) libraries.
13 | 
14 | module Data.Text.ICU.Enumerator
15 |     (next, toList, createEnumerator, Enumerator, UEnumerator,
16 |     ) where
17 | 
18 | #include <unicode/uenum.h>
19 | 
20 | import Data.Int (Int32)
21 | import Data.Text (Text)
22 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError)
23 | import Data.Text.ICU.Internal (UChar, newICUPtr, fromUCharPtr)
24 | import Foreign.ForeignPtr (withForeignPtr, ForeignPtr)
25 | import Foreign.Marshal.Alloc (alloca)
26 | import Foreign.Ptr (FunPtr, Ptr, nullPtr)
27 | import Foreign.Storable (peek)
28 | import Prelude hiding (last)
29 | 
30 | data UEnumerator
31 | 
32 | newtype Enumerator = Enumerator {enumeratorToForeignPtr :: ForeignPtr UEnumerator}
33 | 
34 | createEnumerator :: IO (Ptr UEnumerator) -> IO Enumerator
35 | createEnumerator = newICUPtr Enumerator uenum_close
36 | 
37 | next :: Enumerator -> IO (Maybe Text)
38 | next enum = withForeignPtr (enumeratorToForeignPtr enum) $ \enumPtr ->
39 |   alloca $ \lenPtr -> do
40 |     textPtr <- handleError $ uenum_unext enumPtr lenPtr
41 |     if textPtr == nullPtr
42 |       then pure Nothing
43 |       else do
44 |           n <- peek lenPtr
45 |           t <- fromUCharPtr textPtr (fromIntegral n)
46 |           pure $ Just t
47 | 
48 | toList :: Enumerator -> IO [Text]
49 | toList enum = reverse <$> go []
50 |   where
51 |     go l = do
52 |       mx <- next enum
53 |       case mx of
54 |         Nothing -> pure l
55 |         Just x -> go (x:l)
56 | 
57 | foreign import ccall unsafe "hs_text_icu.h &__hs_uenum_close" uenum_close
58 |     :: FunPtr (Ptr UEnumerator -> IO ())
59 | foreign import ccall unsafe "hs_text_icu.h __hs_uenum_unext" uenum_unext
60 |     :: Ptr UEnumerator -> Ptr Int32 -> Ptr UErrorCode
61 |     -> IO (Ptr UChar)
62 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Error.hsc:
--------------------------------------------------------------------------------
  1 | -- |
  2 | -- Module      : Data.Text.ICU.Error
  3 | -- Copyright   : (c) 2010 Bryan O'Sullivan
  4 | --
  5 | -- License     : BSD-style
  6 | -- Maintainer  : bos@serpentine.com
  7 | -- Stability   : experimental
  8 | -- Portability : GHC
  9 | --
 10 | -- Errors thrown by bindings to the International Components for
 11 | -- Unicode (ICU) libraries.
 12 | --
 13 | -- Most ICU functions can throw an 'ICUError' value as an exception.
 14 | -- Some can additionally throw a 'ParseError', if more detailed error
 15 | -- information is necessary.
 16 | module Data.Text.ICU.Error
 17 |     (
 18 |      -- * Types
 19 |      ICUError,
 20 |      ParseError(errError, errLine, errOffset),
 21 | 
 22 |      -- * Functions
 23 |      isSuccess,
 24 |      isFailure,
 25 |      errorName,
 26 |      isRegexError,
 27 | 
 28 |      -- * Warnings
 29 |      u_USING_FALLBACK_WARNING,
 30 |      u_USING_DEFAULT_WARNING,
 31 |      u_SAFECLONE_ALLOCATED_WARNING,
 32 |      u_STATE_OLD_WARNING,
 33 |      u_STRING_NOT_TERMINATED_WARNING,
 34 |      u_SORT_KEY_TOO_SHORT_WARNING,
 35 |      u_AMBIGUOUS_ALIAS_WARNING,
 36 |      u_DIFFERENT_UCA_VERSION,
 37 | 
 38 |      -- * Errors
 39 |      u_ILLEGAL_ARGUMENT_ERROR,
 40 |      u_MISSING_RESOURCE_ERROR,
 41 |      u_INVALID_FORMAT_ERROR,
 42 |      u_FILE_ACCESS_ERROR,
 43 |      u_INTERNAL_PROGRAM_ERROR,
 44 |      u_MESSAGE_PARSE_ERROR,
 45 |      u_MEMORY_ALLOCATION_ERROR,
 46 |      u_INDEX_OUTOFBOUNDS_ERROR,
 47 |      u_PARSE_ERROR,
 48 |      u_INVALID_CHAR_FOUND,
 49 |      u_TRUNCATED_CHAR_FOUND,
 50 |      u_ILLEGAL_CHAR_FOUND,
 51 |      u_INVALID_TABLE_FORMAT,
 52 |      u_INVALID_TABLE_FILE,
 53 |      u_BUFFER_OVERFLOW_ERROR,
 54 |      u_UNSUPPORTED_ERROR,
 55 |      u_RESOURCE_TYPE_MISMATCH,
 56 |      u_ILLEGAL_ESCAPE_SEQUENCE,
 57 |      u_UNSUPPORTED_ESCAPE_SEQUENCE,
 58 |      u_NO_SPACE_AVAILABLE,
 59 |      u_CE_NOT_FOUND_ERROR,
 60 |      u_PRIMARY_TOO_LONG_ERROR,
 61 |      u_STATE_TOO_OLD_ERROR,
 62 |      u_TOO_MANY_ALIASES_ERROR,
 63 |      u_ENUM_OUT_OF_SYNC_ERROR,
 64 |      u_INVARIANT_CONVERSION_ERROR,
 65 |      u_INVALID_STATE_ERROR,
 66 |      u_COLLATOR_VERSION_MISMATCH,
 67 |      u_USELESS_COLLATOR_ERROR,
 68 |      u_NO_WRITE_PERMISSION,
 69 | 
 70 |      -- ** Transliterator errors
 71 |      u_BAD_VARIABLE_DEFINITION,
 72 |      u_MALFORMED_RULE,
 73 |      u_MALFORMED_SET,
 74 |      u_MALFORMED_UNICODE_ESCAPE,
 75 |      u_MALFORMED_VARIABLE_DEFINITION,
 76 |      u_MALFORMED_VARIABLE_REFERENCE,
 77 |      u_MISPLACED_CURSOR_OFFSET,
 78 |      u_MISPLACED_QUANTIFIER,
 79 |      u_MISSING_OPERATOR,
 80 |      u_MULTIPLE_ANTE_CONTEXTS,
 81 |      u_MULTIPLE_CURSORS,
 82 |      u_MULTIPLE_POST_CONTEXTS,
 83 |      u_TRAILING_BACKSLASH,
 84 |      u_UNDEFINED_SEGMENT_REFERENCE,
 85 |      u_UNDEFINED_VARIABLE,
 86 |      u_UNQUOTED_SPECIAL,
 87 |      u_UNTERMINATED_QUOTE,
 88 |      u_RULE_MASK_ERROR,
 89 |      u_MISPLACED_COMPOUND_FILTER,
 90 |      u_MULTIPLE_COMPOUND_FILTERS,
 91 |      u_INVALID_RBT_SYNTAX,
 92 |      u_MALFORMED_PRAGMA,
 93 |      u_UNCLOSED_SEGMENT,
 94 |      u_VARIABLE_RANGE_EXHAUSTED,
 95 |      u_VARIABLE_RANGE_OVERLAP,
 96 |      u_ILLEGAL_CHARACTER,
 97 |      u_INTERNAL_TRANSLITERATOR_ERROR,
 98 |      u_INVALID_ID,
 99 |      u_INVALID_FUNCTION,
100 | 
101 |      -- ** Formatting API parsing errors
102 |      u_UNEXPECTED_TOKEN,
103 |      u_MULTIPLE_DECIMAL_SEPARATORS,
104 |      u_MULTIPLE_EXPONENTIAL_SYMBOLS,
105 |      u_MALFORMED_EXPONENTIAL_PATTERN,
106 |      u_MULTIPLE_PERCENT_SYMBOLS,
107 |      u_MULTIPLE_PERMILL_SYMBOLS,
108 |      u_MULTIPLE_PAD_SPECIFIERS,
109 |      u_PATTERN_SYNTAX_ERROR,
110 |      u_ILLEGAL_PAD_POSITION,
111 |      u_UNMATCHED_BRACES,
112 |      u_ARGUMENT_TYPE_MISMATCH,
113 |      u_DUPLICATE_KEYWORD,
114 |      u_UNDEFINED_KEYWORD,
115 |      u_DEFAULT_KEYWORD_MISSING,
116 | 
117 |      -- ** Break iterator errors
118 |      u_BRK_INTERNAL_ERROR,
119 |      u_BRK_HEX_DIGITS_EXPECTED,
120 |      u_BRK_SEMICOLON_EXPECTED,
121 |      u_BRK_RULE_SYNTAX,
122 |      u_BRK_UNCLOSED_SET,
123 |      u_BRK_ASSIGN_ERROR,
124 |      u_BRK_VARIABLE_REDFINITION,
125 |      u_BRK_MISMATCHED_PAREN,
126 |      u_BRK_NEW_LINE_IN_QUOTED_STRING,
127 |      u_BRK_UNDEFINED_VARIABLE,
128 |      u_BRK_INIT_ERROR,
129 |      u_BRK_RULE_EMPTY_SET,
130 |      u_BRK_UNRECOGNIZED_OPTION,
131 |      u_BRK_MALFORMED_RULE_TAG,
132 | 
133 |      -- ** Regular expression errors
134 |      u_REGEX_INTERNAL_ERROR,
135 |      u_REGEX_RULE_SYNTAX,
136 |      u_REGEX_INVALID_STATE,
137 |      u_REGEX_BAD_ESCAPE_SEQUENCE,
138 |      u_REGEX_PROPERTY_SYNTAX,
139 |      u_REGEX_UNIMPLEMENTED,
140 |      u_REGEX_MISMATCHED_PAREN,
141 |      u_REGEX_NUMBER_TOO_BIG,
142 |      u_REGEX_BAD_INTERVAL,
143 |      u_REGEX_MAX_LT_MIN,
144 |      u_REGEX_INVALID_BACK_REF,
145 |      u_REGEX_INVALID_FLAG,
146 |      u_REGEX_SET_CONTAINS_STRING,
147 |      u_REGEX_OCTAL_TOO_BIG,
148 |      u_REGEX_INVALID_RANGE,
149 |      u_REGEX_STACK_OVERFLOW,
150 |      u_REGEX_TIME_OUT,
151 |      u_REGEX_STOPPED_BY_CALLER,
152 | 
153 |      -- ** IDNA errors
154 |      u_IDNA_PROHIBITED_ERROR,
155 |      u_IDNA_UNASSIGNED_ERROR,
156 |      u_IDNA_CHECK_BIDI_ERROR,
157 |      u_IDNA_STD3_ASCII_RULES_ERROR,
158 |      u_IDNA_ACE_PREFIX_ERROR,
159 |      u_IDNA_VERIFICATION_ERROR,
160 |      u_IDNA_LABEL_TOO_LONG_ERROR,
161 |      u_IDNA_ZERO_LENGTH_LABEL_ERROR,
162 |      u_IDNA_DOMAIN_NAME_TOO_LONG_ERROR
163 |     ) where
164 | 
165 | #ifdef mingw32_HOST_OS
166 | #define U_HAVE_INTTYPES_H 1
167 | #endif
168 | 
169 | #include <unicode/utypes.h>
170 | 
171 | import Data.Text.ICU.Error.Internal
172 | 
173 | #{enum ICUError, ICUError,
174 |   u_USING_FALLBACK_WARNING = U_USING_FALLBACK_WARNING,
175 |   u_USING_DEFAULT_WARNING = U_USING_DEFAULT_WARNING,
176 |   u_SAFECLONE_ALLOCATED_WARNING = U_SAFECLONE_ALLOCATED_WARNING,
177 |   u_STATE_OLD_WARNING = U_STATE_OLD_WARNING,
178 |   u_STRING_NOT_TERMINATED_WARNING = U_STRING_NOT_TERMINATED_WARNING,
179 |   u_SORT_KEY_TOO_SHORT_WARNING = U_SORT_KEY_TOO_SHORT_WARNING,
180 |   u_AMBIGUOUS_ALIAS_WARNING = U_AMBIGUOUS_ALIAS_WARNING,
181 |   u_DIFFERENT_UCA_VERSION = U_DIFFERENT_UCA_VERSION,
182 |   u_ILLEGAL_ARGUMENT_ERROR = U_ILLEGAL_ARGUMENT_ERROR,
183 |   u_MISSING_RESOURCE_ERROR = U_MISSING_RESOURCE_ERROR,
184 |   u_INVALID_FORMAT_ERROR = U_INVALID_FORMAT_ERROR,
185 |   u_FILE_ACCESS_ERROR = U_FILE_ACCESS_ERROR,
186 |   u_INTERNAL_PROGRAM_ERROR = U_INTERNAL_PROGRAM_ERROR,
187 |   u_MESSAGE_PARSE_ERROR = U_MESSAGE_PARSE_ERROR,
188 |   u_MEMORY_ALLOCATION_ERROR = U_MEMORY_ALLOCATION_ERROR,
189 |   u_INDEX_OUTOFBOUNDS_ERROR = U_INDEX_OUTOFBOUNDS_ERROR,
190 |   u_PARSE_ERROR = U_PARSE_ERROR,
191 |   u_INVALID_CHAR_FOUND = U_INVALID_CHAR_FOUND,
192 |   u_TRUNCATED_CHAR_FOUND = U_TRUNCATED_CHAR_FOUND,
193 |   u_ILLEGAL_CHAR_FOUND = U_ILLEGAL_CHAR_FOUND,
194 |   u_INVALID_TABLE_FORMAT = U_INVALID_TABLE_FORMAT,
195 |   u_INVALID_TABLE_FILE = U_INVALID_TABLE_FILE,
196 |   u_BUFFER_OVERFLOW_ERROR = U_BUFFER_OVERFLOW_ERROR,
197 |   u_UNSUPPORTED_ERROR = U_UNSUPPORTED_ERROR,
198 |   u_RESOURCE_TYPE_MISMATCH = U_RESOURCE_TYPE_MISMATCH,
199 |   u_ILLEGAL_ESCAPE_SEQUENCE = U_ILLEGAL_ESCAPE_SEQUENCE,
200 |   u_UNSUPPORTED_ESCAPE_SEQUENCE = U_UNSUPPORTED_ESCAPE_SEQUENCE,
201 |   u_NO_SPACE_AVAILABLE = U_NO_SPACE_AVAILABLE,
202 |   u_CE_NOT_FOUND_ERROR = U_CE_NOT_FOUND_ERROR,
203 |   u_PRIMARY_TOO_LONG_ERROR = U_PRIMARY_TOO_LONG_ERROR,
204 |   u_STATE_TOO_OLD_ERROR = U_STATE_TOO_OLD_ERROR,
205 |   u_TOO_MANY_ALIASES_ERROR = U_TOO_MANY_ALIASES_ERROR,
206 |   u_ENUM_OUT_OF_SYNC_ERROR = U_ENUM_OUT_OF_SYNC_ERROR,
207 |   u_INVARIANT_CONVERSION_ERROR = U_INVARIANT_CONVERSION_ERROR,
208 |   u_INVALID_STATE_ERROR = U_INVALID_STATE_ERROR,
209 |   u_COLLATOR_VERSION_MISMATCH = U_COLLATOR_VERSION_MISMATCH,
210 |   u_USELESS_COLLATOR_ERROR = U_USELESS_COLLATOR_ERROR,
211 |   u_NO_WRITE_PERMISSION = U_NO_WRITE_PERMISSION,
212 |   u_BAD_VARIABLE_DEFINITION = U_BAD_VARIABLE_DEFINITION,
213 |   u_MALFORMED_RULE = U_MALFORMED_RULE,
214 |   u_MALFORMED_SET = U_MALFORMED_SET,
215 |   u_MALFORMED_UNICODE_ESCAPE = U_MALFORMED_UNICODE_ESCAPE,
216 |   u_MALFORMED_VARIABLE_DEFINITION = U_MALFORMED_VARIABLE_DEFINITION,
217 |   u_MALFORMED_VARIABLE_REFERENCE = U_MALFORMED_VARIABLE_REFERENCE,
218 |   u_MISPLACED_CURSOR_OFFSET = U_MISPLACED_CURSOR_OFFSET,
219 |   u_MISPLACED_QUANTIFIER = U_MISPLACED_QUANTIFIER,
220 |   u_MISSING_OPERATOR = U_MISSING_OPERATOR,
221 |   u_MULTIPLE_ANTE_CONTEXTS = U_MULTIPLE_ANTE_CONTEXTS,
222 |   u_MULTIPLE_CURSORS = U_MULTIPLE_CURSORS,
223 |   u_MULTIPLE_POST_CONTEXTS = U_MULTIPLE_POST_CONTEXTS,
224 |   u_TRAILING_BACKSLASH = U_TRAILING_BACKSLASH,
225 |   u_UNDEFINED_SEGMENT_REFERENCE = U_UNDEFINED_SEGMENT_REFERENCE,
226 |   u_UNDEFINED_VARIABLE = U_UNDEFINED_VARIABLE,
227 |   u_UNQUOTED_SPECIAL = U_UNQUOTED_SPECIAL,
228 |   u_UNTERMINATED_QUOTE = U_UNTERMINATED_QUOTE,
229 |   u_RULE_MASK_ERROR = U_RULE_MASK_ERROR,
230 |   u_MISPLACED_COMPOUND_FILTER = U_MISPLACED_COMPOUND_FILTER,
231 |   u_MULTIPLE_COMPOUND_FILTERS = U_MULTIPLE_COMPOUND_FILTERS,
232 |   u_INVALID_RBT_SYNTAX = U_INVALID_RBT_SYNTAX,
233 |   u_MALFORMED_PRAGMA = U_MALFORMED_PRAGMA,
234 |   u_UNCLOSED_SEGMENT = U_UNCLOSED_SEGMENT,
235 |   u_VARIABLE_RANGE_EXHAUSTED = U_VARIABLE_RANGE_EXHAUSTED,
236 |   u_VARIABLE_RANGE_OVERLAP = U_VARIABLE_RANGE_OVERLAP,
237 |   u_ILLEGAL_CHARACTER = U_ILLEGAL_CHARACTER,
238 |   u_INTERNAL_TRANSLITERATOR_ERROR = U_INTERNAL_TRANSLITERATOR_ERROR,
239 |   u_INVALID_ID = U_INVALID_ID,
240 |   u_INVALID_FUNCTION = U_INVALID_FUNCTION,
241 |   u_UNEXPECTED_TOKEN = U_UNEXPECTED_TOKEN,
242 |   u_MULTIPLE_DECIMAL_SEPARATORS = U_MULTIPLE_DECIMAL_SEPARATORS,
243 |   u_MULTIPLE_EXPONENTIAL_SYMBOLS = U_MULTIPLE_EXPONENTIAL_SYMBOLS,
244 |   u_MALFORMED_EXPONENTIAL_PATTERN = U_MALFORMED_EXPONENTIAL_PATTERN,
245 |   u_MULTIPLE_PERCENT_SYMBOLS = U_MULTIPLE_PERCENT_SYMBOLS,
246 |   u_MULTIPLE_PERMILL_SYMBOLS = U_MULTIPLE_PERMILL_SYMBOLS,
247 |   u_MULTIPLE_PAD_SPECIFIERS = U_MULTIPLE_PAD_SPECIFIERS,
248 |   u_PATTERN_SYNTAX_ERROR = U_PATTERN_SYNTAX_ERROR,
249 |   u_ILLEGAL_PAD_POSITION = U_ILLEGAL_PAD_POSITION,
250 |   u_UNMATCHED_BRACES = U_UNMATCHED_BRACES,
251 |   u_ARGUMENT_TYPE_MISMATCH = U_ARGUMENT_TYPE_MISMATCH,
252 |   u_DUPLICATE_KEYWORD = U_DUPLICATE_KEYWORD,
253 |   u_UNDEFINED_KEYWORD = U_UNDEFINED_KEYWORD,
254 |   u_DEFAULT_KEYWORD_MISSING = U_DEFAULT_KEYWORD_MISSING,
255 |   u_BRK_INTERNAL_ERROR = U_BRK_INTERNAL_ERROR,
256 |   u_BRK_HEX_DIGITS_EXPECTED = U_BRK_HEX_DIGITS_EXPECTED,
257 |   u_BRK_SEMICOLON_EXPECTED = U_BRK_SEMICOLON_EXPECTED,
258 |   u_BRK_RULE_SYNTAX = U_BRK_RULE_SYNTAX,
259 |   u_BRK_UNCLOSED_SET = U_BRK_UNCLOSED_SET,
260 |   u_BRK_ASSIGN_ERROR = U_BRK_ASSIGN_ERROR,
261 |   u_BRK_VARIABLE_REDFINITION = U_BRK_VARIABLE_REDFINITION,
262 |   u_BRK_MISMATCHED_PAREN = U_BRK_MISMATCHED_PAREN,
263 |   u_BRK_NEW_LINE_IN_QUOTED_STRING = U_BRK_NEW_LINE_IN_QUOTED_STRING,
264 |   u_BRK_UNDEFINED_VARIABLE = U_BRK_UNDEFINED_VARIABLE,
265 |   u_BRK_INIT_ERROR = U_BRK_INIT_ERROR,
266 |   u_BRK_RULE_EMPTY_SET = U_BRK_RULE_EMPTY_SET,
267 |   u_BRK_UNRECOGNIZED_OPTION = U_BRK_UNRECOGNIZED_OPTION,
268 |   u_BRK_MALFORMED_RULE_TAG = U_BRK_MALFORMED_RULE_TAG,
269 |   u_REGEX_INTERNAL_ERROR = U_REGEX_INTERNAL_ERROR,
270 |   u_REGEX_RULE_SYNTAX = U_REGEX_RULE_SYNTAX,
271 |   u_REGEX_INVALID_STATE = U_REGEX_INVALID_STATE,
272 |   u_REGEX_BAD_ESCAPE_SEQUENCE = U_REGEX_BAD_ESCAPE_SEQUENCE,
273 |   u_REGEX_PROPERTY_SYNTAX = U_REGEX_PROPERTY_SYNTAX,
274 |   u_REGEX_UNIMPLEMENTED = U_REGEX_UNIMPLEMENTED,
275 |   u_REGEX_MISMATCHED_PAREN = U_REGEX_MISMATCHED_PAREN,
276 |   u_REGEX_NUMBER_TOO_BIG = U_REGEX_NUMBER_TOO_BIG,
277 |   u_REGEX_BAD_INTERVAL = U_REGEX_BAD_INTERVAL,
278 |   u_REGEX_MAX_LT_MIN = U_REGEX_MAX_LT_MIN,
279 |   u_REGEX_INVALID_BACK_REF = U_REGEX_INVALID_BACK_REF,
280 |   u_REGEX_INVALID_FLAG = U_REGEX_INVALID_FLAG,
281 |   u_REGEX_SET_CONTAINS_STRING = U_REGEX_SET_CONTAINS_STRING,
282 |   u_REGEX_OCTAL_TOO_BIG = U_REGEX_OCTAL_TOO_BIG,
283 |   u_REGEX_INVALID_RANGE = U_REGEX_INVALID_RANGE,
284 |   u_REGEX_STACK_OVERFLOW = U_REGEX_STACK_OVERFLOW,
285 |   u_REGEX_TIME_OUT = U_REGEX_TIME_OUT,
286 |   u_REGEX_STOPPED_BY_CALLER = U_REGEX_STOPPED_BY_CALLER,
287 |   u_IDNA_PROHIBITED_ERROR = U_IDNA_PROHIBITED_ERROR,
288 |   u_IDNA_UNASSIGNED_ERROR = U_IDNA_UNASSIGNED_ERROR,
289 |   u_IDNA_CHECK_BIDI_ERROR = U_IDNA_CHECK_BIDI_ERROR,
290 |   u_IDNA_STD3_ASCII_RULES_ERROR = U_IDNA_STD3_ASCII_RULES_ERROR,
291 |   u_IDNA_ACE_PREFIX_ERROR = U_IDNA_ACE_PREFIX_ERROR,
292 |   u_IDNA_VERIFICATION_ERROR = U_IDNA_VERIFICATION_ERROR,
293 |   u_IDNA_LABEL_TOO_LONG_ERROR = U_IDNA_LABEL_TOO_LONG_ERROR,
294 |   u_IDNA_ZERO_LENGTH_LABEL_ERROR = U_IDNA_ZERO_LENGTH_LABEL_ERROR,
295 |   u_IDNA_DOMAIN_NAME_TOO_LONG_ERROR = U_IDNA_DOMAIN_NAME_TOO_LONG_ERROR
296 | }
297 | 
298 | isRegexError :: ICUError -> Bool
299 | isRegexError (ICUError err) =
300 |     err >= #{const U_REGEX_ERROR_START} && err < #{const U_REGEX_ERROR_LIMIT}
301 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Error/Internal.hsc:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns, DeriveDataTypeable, ForeignFunctionInterface,
  2 |     RecordWildCards, ScopedTypeVariables #-}
  3 | 
  4 | module Data.Text.ICU.Error.Internal
  5 |     (
  6 |     -- * Types
  7 |       ICUError(..)
  8 |     -- ** Low-level types
  9 |     , UErrorCode
 10 |     , ParseError(errError, errLine, errOffset)
 11 |     , UParseError
 12 |     -- * Functions
 13 |     , isFailure
 14 |     , isSuccess
 15 |     , errorName
 16 |     , handleError
 17 |     , handleOverflowError
 18 |     , handleParseError
 19 |     , throwOnError
 20 |     , withError
 21 |     ) where
 22 | 
 23 | import Control.DeepSeq (NFData(..))
 24 | import Control.Exception (Exception, throwIO)
 25 | import Data.Function (fix)
 26 | import Foreign.Ptr (Ptr)
 27 | import Foreign.Marshal.Alloc (alloca, allocaBytes)
 28 | import Foreign.Marshal.Utils (with)
 29 | import Foreign.Marshal.Array (allocaArray)
 30 | import Data.Int (Int32)
 31 | import Data.Typeable (Typeable)
 32 | import Foreign.C.String (CString, peekCString)
 33 | import Foreign.C.Types (CInt(..))
 34 | import Foreign.Storable (Storable(..))
 35 | import System.IO.Unsafe (unsafePerformIO)
 36 | 
 37 | #include <unicode/parseerr.h>
 38 | #include <unicode/utypes.h>
 39 | 
 40 | type UErrorCode = CInt
 41 | 
 42 | -- | ICU error type.  This is an instance of the 'Exception' type
 43 | -- class.  A value of this type may be thrown as an exception by most
 44 | -- ICU functions.
 45 | newtype ICUError = ICUError {
 46 |       fromErrorCode :: UErrorCode
 47 |     } deriving (Eq, Typeable)
 48 | 
 49 | instance Show ICUError where
 50 |     show code = "ICUError " ++ errorName code
 51 | 
 52 | instance Exception ICUError
 53 | 
 54 | instance NFData ICUError where
 55 |     rnf !_ = ()
 56 | 
 57 | -- | Detailed information about parsing errors.  Used by ICU parsing
 58 | -- engines that parse long rules, patterns, or programs, where the
 59 | -- text being parsed is long enough that more information than an
 60 | -- 'ICUError' is needed to localize the error.
 61 | data ParseError = ParseError {
 62 |       errError :: ICUError
 63 |     , errLine :: !(Maybe Int)
 64 |     -- ^ The line on which the error occured.  If the parser uses this
 65 |     -- field, it sets it to the line number of the source text line on
 66 |     -- which the error appears, which will be be a positive value.  If
 67 |     -- the parser does not support line numbers, the value will be
 68 |     -- 'Nothing'.
 69 |     , errOffset :: !(Maybe Int)
 70 |     -- ^ The character offset to the error.  If the 'errLine' field is
 71 |     -- 'Just' some value, then this field contains the offset from the
 72 |     -- beginning of the line that contains the error.  Otherwise, it
 73 |     -- represents the offset from the start of the text.  If the
 74 |     -- parser does not support this field, it will have a value of
 75 |     -- 'Nothing'.
 76 |     } deriving (Show, Typeable)
 77 | 
 78 | instance NFData ParseError where
 79 |     rnf ParseError{..} = rnf errError `seq` rnf errLine `seq` rnf errOffset
 80 | 
 81 | type UParseError = ParseError
 82 | 
 83 | instance Exception ParseError
 84 | 
 85 | -- | Indicate whether the given error code is a success.
 86 | isSuccess :: ICUError -> Bool
 87 | {-# INLINE isSuccess #-}
 88 | isSuccess = (<= 0) . fromErrorCode
 89 | 
 90 | -- | Indicate whether the given error code is a failure.
 91 | isFailure :: ICUError -> Bool
 92 | {-# INLINE isFailure #-}
 93 | isFailure = (> 0) . fromErrorCode
 94 | 
 95 | -- | Throw an exception if the given code is actually an error.
 96 | throwOnError :: UErrorCode -> IO ()
 97 | {-# INLINE throwOnError #-}
 98 | throwOnError code = do
 99 |   let err = (ICUError code)
100 |   if isFailure err
101 |     then throwIO err
102 |     else return ()
103 | 
104 | withError :: (Ptr UErrorCode -> IO a) -> IO (ICUError, a)
105 | {-# INLINE withError #-}
106 | withError action = with 0 $ \errPtr -> do
107 |                      ret <- action errPtr
108 |                      err <- peek errPtr
109 |                      return (ICUError err, ret)
110 | 
111 | handleError :: (Ptr UErrorCode -> IO a) -> IO a
112 | {-# INLINE handleError #-}
113 | handleError action = with 0 $ \errPtr -> do
114 |                        ret <- action errPtr
115 |                        throwOnError =<< peek errPtr
116 |                        return ret
117 | 
118 | -- | Deal with ICU functions that report a buffer overflow error if we
119 | -- give them an insufficiently large buffer.  Our first call will
120 | -- report a buffer overflow, in which case we allocate a correctly
121 | -- sized buffer and try again.
122 | handleOverflowError :: (Storable a) =>
123 |                        Int
124 |                     -- ^ Initial guess at buffer size.
125 |                     -> (Ptr a -> Int32 -> Ptr UErrorCode -> IO Int32)
126 |                     -- ^ Function that retrieves data.
127 |                     -> (Ptr a -> Int -> IO b)
128 |                     -- ^ Function that fills destination buffer if no
129 |                     -- overflow occurred.
130 |                     -> IO b
131 | handleOverflowError guess fill retrieve =
132 |   alloca $ \uerrPtr -> flip fix guess $ \loop n ->
133 |     (either (loop . fromIntegral) return =<<) . allocaArray n $ \ptr -> do
134 |       poke uerrPtr 0
135 |       ret <- fill ptr (fromIntegral n) uerrPtr
136 |       err <- peek uerrPtr
137 |       case undefined of
138 |         _| err == (#const U_BUFFER_OVERFLOW_ERROR)
139 |                      -> return (Left ret)
140 |          | err > 0   -> throwIO (ICUError err)
141 |          | otherwise -> Right `fmap` retrieve ptr (fromIntegral ret)
142 | 
143 | handleParseError :: (ICUError -> Bool)
144 |                  -> (Ptr UParseError -> Ptr UErrorCode -> IO a) -> IO a
145 | handleParseError isParseError action = with 0 $ \uerrPtr ->
146 |   allocaBytes (#{size UParseError}) $ \perrPtr -> do
147 |     ret <- action perrPtr uerrPtr
148 |     err <- ICUError `fmap` peek uerrPtr
149 |     case undefined of
150 |      _| isParseError err -> throwParseError perrPtr err
151 |       | isFailure err -> throwIO err
152 |       | otherwise     -> return ret
153 | 
154 | throwParseError :: Ptr UParseError -> ICUError -> IO a
155 | throwParseError ptr err = do
156 |   (line::Int32) <- #{peek UParseError, line} ptr
157 |   (offset::Int32) <- #{peek UParseError, offset} ptr
158 |   let wrap k = if k == -1 then Nothing else Just $! fromIntegral k
159 |   throwIO $! ParseError err (wrap line) (wrap offset)
160 | 
161 | -- | Return a string representing the name of the given error code.
162 | errorName :: ICUError -> String
163 | errorName code = unsafePerformIO $
164 |                  peekCString (u_errorName (fromErrorCode code))
165 | 
166 | foreign import ccall unsafe "hs_text_icu.h __hs_u_errorName" u_errorName
167 |     :: UErrorCode -> CString
168 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Internal.hsc:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE EmptyDataDecls, ForeignFunctionInterface, GeneralizedNewtypeDeriving, TupleSections #-}
  2 | 
  3 | module Data.Text.ICU.Internal
  4 |     (
  5 |       LocaleName(..)
  6 |     , UBool
  7 |     , UChar
  8 |     , UChar32
  9 |     , UCharIterator
 10 |     , CharIterator(..)
 11 |     , UText, UTextPtr
 12 |     , asBool
 13 |     , asOrdering
 14 |     , withCharIterator
 15 |     , withLocaleName
 16 |     , withName
 17 |     , useAsUCharPtr, fromUCharPtr, I16, asUCharForeignPtr
 18 |     , asUTextPtr, withUTextPtr, withUTextPtrText, emptyUTextPtr, utextPtrLength
 19 |     , TextI, takeWord, dropWord, lengthWord
 20 |     , newICUPtr
 21 |     ) where
 22 | 
 23 | #include <unicode/uiter.h>
 24 | 
 25 | import Control.Exception (mask_)
 26 | import Control.DeepSeq (NFData(..))
 27 | import Data.ByteString.Internal (ByteString(..))
 28 | import Data.Int (Int8, Int32, Int64)
 29 | import Data.String (IsString(..))
 30 | import Data.Text (Text, empty)
 31 | import Data.Text.Encoding (decodeUtf8)
 32 | import Data.Text.Foreign (useAsPtr, asForeignPtr, fromPtr)
 33 | #if MIN_VERSION_text(2,0,0)
 34 | import Data.Text.Foreign (I8, dropWord8, takeWord8, lengthWord8)
 35 | import Foreign.ForeignPtr (mallocForeignPtrArray)
 36 | import Foreign.Marshal.Array (allocaArray)
 37 | import Foreign.Storable (peek)
 38 | #else
 39 | import Data.Text.Foreign (I16, dropWord16, takeWord16, lengthWord16)
 40 | #endif
 41 | import Data.Word (Word8, Word16, Word32)
 42 | import Foreign.C.String (CString, withCString)
 43 | import Foreign.ForeignPtr (withForeignPtr, ForeignPtr, newForeignPtr, FinalizerPtr)
 44 | import Foreign.Marshal.Alloc (allocaBytes)
 45 | import Foreign.Marshal.Utils (with)
 46 | import Foreign.Ptr (Ptr, nullPtr, FunPtr)
 47 | import Data.Text.ICU.Error.Internal (UErrorCode)
 48 | import System.IO.Unsafe (unsafePerformIO)
 49 | 
 50 | -- | A type that supports efficient iteration over Unicode characters.
 51 | --
 52 | -- As an example of where this may be useful, a function using this
 53 | -- type may be able to iterate over a UTF-8 'ByteString' directly,
 54 | -- rather than first copying and converting it to an intermediate
 55 | -- form.  This type also allows e.g. comparison between 'Text' and
 56 | -- 'ByteString', with minimal overhead.
 57 | data CharIterator = CIText !Text
 58 |                   | CIUTF8 !ByteString
 59 | 
 60 | instance Show CharIterator where
 61 |     show (CIText t)  = show t
 62 |     show (CIUTF8 bs) = show (decodeUtf8 bs)
 63 | 
 64 | data UCharIterator
 65 | 
 66 | -- | Temporarily allocate a 'UCharIterator' and use it with the
 67 | -- contents of the to-be-iterated-over string.
 68 | withCharIterator :: CharIterator -> (Ptr UCharIterator -> IO a) -> IO a
 69 | withCharIterator (CIUTF8 (PS fp _ l)) act =
 70 |     allocaBytes (#{size UCharIterator}) $ \i -> withForeignPtr fp $ \p ->
 71 |     uiter_setUTF8 i p (fromIntegral l) >> act i
 72 | withCharIterator (CIText t) act =
 73 |     allocaBytes (#{size UCharIterator}) $ \i -> useAsPtr t $ \p l ->
 74 | #if MIN_VERSION_text(2,0,0)
 75 |     uiter_setUTF8 i p (fromIntegral l) >> act i
 76 | #else
 77 |     uiter_setString i p (fromIntegral l) >> act i
 78 | #endif
 79 | 
 80 | type UBool   = Int8
 81 | type UChar   = Word16
 82 | type UChar32 = Word32
 83 | 
 84 | asBool :: Integral a => a -> Bool
 85 | {-# INLINE asBool #-}
 86 | asBool = (/=0)
 87 | 
 88 | asOrdering :: Integral a => a -> Ordering
 89 | {-# INLINE asOrdering #-}
 90 | asOrdering i
 91 |     | i < 0     = LT
 92 |     | i == 0    = EQ
 93 |     | otherwise = GT
 94 | 
 95 | withName :: String -> (CString -> IO a) -> IO a
 96 | withName name act
 97 |     | null name = act nullPtr
 98 |     | otherwise = withCString name act
 99 | 
100 | -- | The name of a locale.
101 | data LocaleName = Root
102 |                 -- ^ The root locale.  For a description of resource bundles
103 |                 -- and the root resource, see
104 |                 -- <http://userguide.icu-project.org/locale/resources>.
105 |                 | Locale String -- ^ A specific locale.
106 |                 | Current       -- ^ The program's current locale.
107 |                   deriving (Eq, Ord, Read, Show)
108 | 
109 | instance NFData LocaleName where
110 |     rnf Root       = ()
111 |     rnf (Locale l) = rnf l
112 |     rnf Current    = ()
113 | 
114 | instance IsString LocaleName where
115 |     fromString = Locale
116 | 
117 | withLocaleName :: LocaleName -> (CString -> IO a) -> IO a
118 | withLocaleName Current act = act nullPtr
119 | withLocaleName Root act = withCString "" act
120 | withLocaleName (Locale n) act = withCString n act
121 | 
122 | #if !MIN_VERSION_text(2,0,0)
123 | foreign import ccall unsafe "hs_text_icu.h __hs_uiter_setString" uiter_setString
124 |     :: Ptr UCharIterator -> Ptr UChar -> Int32 -> IO ()
125 | #endif
126 | 
127 | foreign import ccall unsafe "hs_text_icu.h __hs_uiter_setUTF8" uiter_setUTF8
128 |     :: Ptr UCharIterator -> Ptr Word8 -> Int32 -> IO ()
129 | 
130 | 
131 | data UText
132 | 
133 | -- | Pointer to UText which also keeps pointer to source text so it won't be
134 | -- garbage collected.
135 | data UTextPtr
136 |     = UTextPtr
137 |       { utextPtr :: ForeignPtr UText
138 |       , utextPtrText :: ForeignPtr TextChar
139 |       , utextPtrLength :: TextI
140 |       }
141 | 
142 | emptyUTextPtr :: UTextPtr
143 | emptyUTextPtr = unsafePerformIO $ asUTextPtr empty
144 | {-# NOINLINE emptyUTextPtr #-}
145 | 
146 | withUTextPtr :: UTextPtr -> (Ptr UText -> IO a) -> IO a
147 | withUTextPtr = withForeignPtr . utextPtr
148 | 
149 | withUTextPtrText :: UTextPtr -> (Ptr TextChar -> IO a) -> IO a
150 | withUTextPtrText = withForeignPtr . utextPtrText
151 | 
152 | -- | Returns UTF-8 UText for text >= 2.0 or UTF-16 UText for previous versions.
153 | asUTextPtr :: Text -> IO UTextPtr
154 | asUTextPtr t = do
155 |     (fp,l) <- asForeignPtr t
156 |     with 0 $ \ e -> withForeignPtr fp $ \ p ->
157 |       newICUPtr (\ ut -> UTextPtr ut fp l) utext_close $
158 | #if MIN_VERSION_text(2,0,0)
159 |         utext_openUTF8
160 | #else
161 |         utext_openUChars
162 | #endif
163 |           nullPtr p (fromIntegral l) e
164 | 
165 | foreign import ccall unsafe "hs_text_icu.h &__hs_utext_close" utext_close
166 |     :: FunPtr (Ptr UText -> IO ())
167 | 
168 | useAsUCharPtr :: Text -> (Ptr UChar -> I16 -> IO a) -> IO a
169 | asUCharForeignPtr :: Text -> IO (ForeignPtr UChar, I16)
170 | fromUCharPtr :: Ptr UChar -> I16 -> IO Text
171 | 
172 | dropWord, takeWord :: TextI -> Text -> Text
173 | lengthWord :: Text -> Int
174 | 
175 | #if MIN_VERSION_text(2,0,0)
176 | newtype I16 = I16 Int
177 |     deriving (Bounded, Enum, Eq, Integral, Num, Ord, Read, Real, Show)
178 | 
179 | type TextChar = Word8
180 | type TextI = I8
181 | 
182 | useAsUCharPtr t act = useAsPtr t $ \tptr tlen ->
183 |     allocaArray (fromIntegral tlen) $ \ dst ->
184 |         act dst =<< fromUtf8 dst tptr tlen
185 | 
186 | asUCharForeignPtr t = useAsPtr t $ \tptr tlen -> do
187 |     fp <- mallocForeignPtrArray (fromIntegral tlen)
188 |     withForeignPtr fp $ \ dst -> (fp,) <$> fromUtf8 dst tptr tlen
189 | 
190 | fromUtf8 :: Ptr UChar -> Ptr Word8 -> I8 -> IO I16
191 | fromUtf8 dst tptr tlen =
192 |     with 0 $ \ err ->
193 |     with 0 $ \ dstLen -> do
194 |         _ <- u_strFromUTF8Lenient dst (fromIntegral tlen) dstLen tptr
195 |               (fromIntegral tlen) err
196 |         fromIntegral <$> peek dstLen
197 | 
198 | fromUCharPtr p l =
199 |     with 0 $ \ err ->
200 |     with 0 $ \ dstLen ->
201 |     allocaArray capacity $ \ dst -> do
202 |         _ <- u_strToUTF8 dst (fromIntegral capacity) dstLen p
203 |             (fromIntegral l) err
204 |         dl <- peek dstLen
205 |         fromPtr dst (fromIntegral dl)
206 |     where capacity = fromIntegral l * 3
207 | 
208 | dropWord = dropWord8
209 | takeWord = takeWord8
210 | lengthWord = lengthWord8
211 | 
212 | foreign import ccall unsafe "hs_text_icu.h __hs_u_strFromUTF8Lenient" u_strFromUTF8Lenient
213 |     :: Ptr UChar -> Int32 -> Ptr Int32 -> Ptr Word8 -> Int32 -> Ptr UErrorCode
214 |     -> IO CString
215 | 
216 | foreign import ccall unsafe "hs_text_icu.h __hs_u_strToUTF8" u_strToUTF8
217 |     :: Ptr Word8 -> Int32 -> Ptr Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode
218 |     -> IO CString
219 | 
220 | foreign import ccall unsafe "hs_text_icu.h __hs_utext_openUTF8" utext_openUTF8
221 |     :: Ptr UText -> Ptr Word8 -> Int64 -> Ptr UErrorCode -> IO (Ptr UText)
222 | 
223 | #else
224 | 
225 | type TextChar = UChar
226 | type TextI = I16
227 | 
228 | -- text < 2.0 has UChar as internal representation.
229 | useAsUCharPtr = useAsPtr
230 | asUCharForeignPtr = asForeignPtr
231 | fromUCharPtr = fromPtr
232 | 
233 | dropWord = dropWord16
234 | takeWord = takeWord16
235 | lengthWord = lengthWord16
236 | 
237 | foreign import ccall unsafe "hs_text_icu.h __hs_utext_openUChars" utext_openUChars
238 |     :: Ptr UText -> Ptr UChar -> Int64 -> Ptr UErrorCode -> IO (Ptr UText)
239 | 
240 | #endif
241 | 
242 | -- | Allocate new ICU data structure (usually via @*_open@ call),
243 | -- add finalizer (@*_close@ call) and wrap resulting 'ForeignPtr'.
244 | --
245 | -- Exceptions are masked since the memory leak is possible if any
246 | -- asynchronous exception (such as a timeout) is raised between
247 | -- allocating C data and 'newForeignPtr' call.
248 | newICUPtr :: (ForeignPtr a -> i) -> FinalizerPtr a -> IO (Ptr a) -> IO i
249 | newICUPtr wrap close open = fmap wrap $ mask_ $ newForeignPtr close =<< open
250 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Iterator.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE ForeignFunctionInterface #-}
 2 | {-# OPTIONS_GHC -fno-warn-orphans #-}
 3 | -- |
 4 | -- Module      : Data.Text.ICU.Iterator
 5 | -- Copyright   : (c) 2010 Bryan O'Sullivan
 6 | --
 7 | -- License     : BSD-style
 8 | -- Maintainer  : bos@serpentine.com
 9 | -- Stability   : experimental
10 | -- Portability : GHC
11 | --
12 | -- Iteration functions for Unicode, implemented as bindings to the
13 | -- International Components for Unicode (ICU) libraries.
14 | --
15 | -- Unlike the C and C++ @UCharIterator@ type, the Haskell
16 | -- 'CharIterator' type is immutable, and can safely be used in pure
17 | -- code.
18 | --
19 | -- Functions using these iterators may be more efficient than their
20 | -- counterparts.  For instance, the 'CharIterator' type allows a UTF-8
21 | -- 'ByteString' to be compared against a 'Text', without first
22 | -- converting the 'ByteString':
23 | --
24 | -- > fromUtf8 bs == fromText t
25 | module Data.Text.ICU.Iterator
26 |     (
27 |     -- * Types and constructors
28 |       CharIterator
29 |     , fromString
30 |     , fromText
31 |     , fromUtf8
32 |     ) where
33 | 
34 | import Data.ByteString (ByteString)
35 | import Data.Int (Int32)
36 | import Data.Text (Text, pack)
37 | import Data.Text.ICU.Internal (CharIterator(..), UCharIterator, asOrdering,
38 |                                withCharIterator)
39 | import Foreign.Ptr (Ptr)
40 | import System.IO.Unsafe (unsafePerformIO)
41 | 
42 | instance Eq CharIterator where
43 |     a == b = compareIter a b == EQ
44 | 
45 | instance Ord CharIterator where
46 |     compare = compareIter
47 | 
48 | -- | Compare two 'CharIterator's.
49 | compareIter :: CharIterator -> CharIterator -> Ordering
50 | compareIter a b = unsafePerformIO . fmap asOrdering .
51 |   withCharIterator a $ withCharIterator b . u_strCompareIter
52 | 
53 | -- | Construct a 'CharIterator' from a Unicode string.
54 | fromString :: String -> CharIterator
55 | fromString = CIText . pack
56 | {-# INLINE fromString #-}
57 | 
58 | -- | Construct a 'CharIterator' from a Unicode string.
59 | fromText :: Text -> CharIterator
60 | fromText = CIText
61 | {-# INLINE fromText #-}
62 | 
63 | -- | Construct a 'CharIterator' from a Unicode string encoded as a
64 | -- UTF-8 'ByteString'. The validity of the encoded string is *not*
65 | -- checked.
66 | fromUtf8 :: ByteString -> CharIterator
67 | fromUtf8 = CIUTF8
68 | {-# INLINE fromUtf8 #-}
69 | 
70 | foreign import ccall unsafe "hs_text_icu.h __hs_u_strCompareIter" u_strCompareIter
71 |     :: Ptr UCharIterator -> Ptr UCharIterator -> IO Int32
72 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Locale.hsc:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE RankNTypes, BangPatterns, ForeignFunctionInterface, RecordWildCards #-}
 2 | -- |
 3 | -- Module      : Data.Text.ICU.Locale
 4 | -- Copyright   : (c) 2021 Torsten Kemps-Benedix
 5 | --
 6 | -- License     : BSD-style
 7 | -- Maintainer  : bos@serpentine.com
 8 | -- Stability   : experimental
 9 | -- Portability : GHC
10 | --
11 | -- Locale functions implemented as bindings to
12 | -- the International Components for Unicode (ICU) libraries.
13 | 
14 | module Data.Text.ICU.Locale
15 |     (availableLocales
16 |     ) where
17 | 
18 | #include <unicode/uloc.h>
19 | 
20 | import Control.Monad (forM)
21 | import Data.Int (Int32)
22 | import Foreign.C.String (CString, peekCString)
23 | import Prelude hiding (last)
24 | 
25 | -- |  Get the available default locales, i.e. locales that return data when passed to ICU 
26 | -- APIs, but not including legacy or alias locales. 
27 | availableLocales :: IO [String]
28 | availableLocales = do
29 |   n <- uloc_countAvailable
30 |   forM [0..n-1] $ \i -> uloc_getAvailable i >>= peekCString
31 | 
32 | foreign import ccall unsafe "hs_text_icu.h __hs_uloc_getAvailable" uloc_getAvailable
33 |     :: Int32 -> IO CString
34 | foreign import ccall unsafe "hs_text_icu.h __hs_uloc_countAvailable" uloc_countAvailable
35 |     :: IO Int32
36 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Normalize.hsc:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE CPP, DeriveDataTypeable, ForeignFunctionInterface #-}
  2 | -- |
  3 | -- Module      : Data.Text.ICU.Normalize
  4 | -- Copyright   : (c) 2009, 2010 Bryan O'Sullivan
  5 | --
  6 | -- License     : BSD-style
  7 | -- Maintainer  : bos@serpentine.com
  8 | -- Stability   : experimental
  9 | -- Portability : GHC
 10 | --
 11 | -- Character set normalization functions for Unicode, implemented as
 12 | -- bindings to the International Components for Unicode (ICU)
 13 | -- libraries.
 14 | --
 15 | -- This module is based on the now deprecated "unorm.h" functions.
 16 | -- Please use Data.Text.ICU.Normalize2 instead.
 17 | 
 18 | module Data.Text.ICU.Normalize {-# DEPRECATED "Use Data.Text.ICU.Normalize2 instead" #-}
 19 |     (
 20 |     -- * Unicode normalization API
 21 |     -- $api
 22 |       NormalizationMode(..)
 23 |     -- * Normalization functions
 24 |     , normalize
 25 |     -- * Normalization checks
 26 |     , quickCheck
 27 |     , isNormalized
 28 |     -- * Normalization-sensitive comparison
 29 |     , CompareOption(..)
 30 |     , compare
 31 |     ) where
 32 | 
 33 | #ifdef mingw32_HOST_OS
 34 | #define U_HAVE_INTTYPES_H 1
 35 | #endif
 36 | 
 37 | #include <unicode/uchar.h>
 38 | #include <unicode/unorm.h>
 39 | 
 40 | import Data.Text (Text)
 41 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError, handleOverflowError)
 42 | import Data.Text.ICU.Internal (UBool, UChar, asBool, asOrdering, useAsUCharPtr, fromUCharPtr)
 43 | import Data.Text.ICU.Normalize.Internal (UNormalizationCheckResult, toNCR)
 44 | import Data.Typeable (Typeable)
 45 | import Data.Int (Int32)
 46 | import Data.Word (Word32)
 47 | import Foreign.C.Types (CInt(..))
 48 | import Foreign.Ptr (Ptr)
 49 | import System.IO.Unsafe (unsafePerformIO)
 50 | import Prelude hiding (compare)
 51 | import Data.List (foldl')
 52 | import Data.Bits ((.|.))
 53 | 
 54 | -- $api
 55 | --
 56 | -- The 'normalize' function transforms Unicode text into an equivalent
 57 | -- composed or decomposed form, allowing for easier sorting and
 58 | -- searching of text.  'normalize' supports the standard normalization
 59 | -- forms described in <http://www.unicode.org/unicode/reports/tr15/>,
 60 | -- Unicode Standard Annex #15: Unicode Normalization Forms.
 61 | --
 62 | -- Characters with accents or other adornments can be encoded in
 63 | -- several different ways in Unicode.  For example, take the character A-acute.
 64 | -- In Unicode, this can be encoded as a single character (the
 65 | -- \"composed\" form):
 66 | --
 67 | -- @
 68 | --      00C1    LATIN CAPITAL LETTER A WITH ACUTE
 69 | -- @
 70 | --
 71 | -- or as two separate characters (the \"decomposed\" form):
 72 | --
 73 | -- @
 74 | --      0041    LATIN CAPITAL LETTER A
 75 | --      0301    COMBINING ACUTE ACCENT
 76 | -- @
 77 | --
 78 | -- To a user of your program, however, both of these sequences should
 79 | -- be treated as the same \"user-level\" character \"A with acute
 80 | -- accent\".  When you are searching or comparing text, you must
 81 | -- ensure that these two sequences are treated equivalently.  In
 82 | -- addition, you must handle characters with more than one accent.
 83 | -- Sometimes the order of a character's combining accents is
 84 | -- significant, while in other cases accent sequences in different
 85 | -- orders are really equivalent.
 86 | --
 87 | -- Similarly, the string \"ffi\" can be encoded as three separate letters:
 88 | --
 89 | -- @
 90 | --      0066    LATIN SMALL LETTER F
 91 | --      0066    LATIN SMALL LETTER F
 92 | --      0069    LATIN SMALL LETTER I
 93 | -- @
 94 | --
 95 | -- or as the single character
 96 | --
 97 | -- @
 98 | --      FB03    LATIN SMALL LIGATURE FFI
 99 | -- @
100 | --
101 | -- The \"ffi\" ligature is not a distinct semantic character, and
102 | -- strictly speaking it shouldn't be in Unicode at all, but it was
103 | -- included for compatibility with existing character sets that
104 | -- already provided it.  The Unicode standard identifies such
105 | -- characters by giving them \"compatibility\" decompositions into the
106 | -- corresponding semantic characters.  When sorting and searching, you
107 | -- will often want to use these mappings.
108 | --
109 | -- 'normalize' helps solve these problems by transforming text into
110 | -- the canonical composed and decomposed forms as shown in the first
111 | -- example above.  In addition, you can have it perform compatibility
112 | -- decompositions so that you can treat compatibility characters the
113 | -- same as their equivalents.  Finally, 'normalize' rearranges accents
114 | -- into the proper canonical order, so that you do not have to worry
115 | -- about accent rearrangement on your own.
116 | --
117 | -- Form 'FCD', \"Fast C or D\", is also designed for collation.  It
118 | -- allows to work on strings that are not necessarily normalized with
119 | -- an algorithm (like in collation) that works under \"canonical
120 | -- closure\", i.e., it treats precomposed characters and their
121 | -- decomposed equivalents the same.
122 | --
123 | -- It is not a normalization form because it does not provide for
124 | -- uniqueness of representation. Multiple strings may be canonically
125 | -- equivalent (their NFDs are identical) and may all conform to 'FCD'
126 | -- without being identical themselves.
127 | --
128 | -- The form is defined such that the \"raw decomposition\", the
129 | -- recursive canonical decomposition of each character, results in a
130 | -- string that is canonically ordered. This means that precomposed
131 | -- characters are allowed for as long as their decompositions do not
132 | -- need canonical reordering.
133 | --
134 | -- Its advantage for a process like collation is that all 'NFD' and
135 | -- most 'NFC' texts - and many unnormalized texts - already conform to
136 | -- 'FCD' and do not need to be normalized ('NFD') for such a
137 | -- process. The 'FCD' 'quickCheck' will return 'Yes' for most strings
138 | -- in practice.
139 | --
140 | -- @'normalize' 'FCD'@ may be implemented with 'NFD'.
141 | --
142 | -- For more details on 'FCD' see the collation design document:
143 | -- <http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm>
144 | --
145 | -- ICU collation performs either 'NFD' or 'FCD' normalization
146 | -- automatically if normalization is turned on for the collator
147 | -- object.  Beyond collation and string search, normalized strings may
148 | -- be useful for string equivalence comparisons,
149 | -- transliteration/transcription, unique representations, etc.
150 | --
151 | -- The W3C generally recommends to exchange texts in 'NFC'.  Note also
152 | -- that most legacy character encodings use only precomposed forms and
153 | -- often do not encode any combining marks by themselves. For
154 | -- conversion to such character encodings the Unicode text needs to be
155 | -- normalized to 'NFC'.  For more usage examples, see the Unicode
156 | -- Standard Annex.
157 | 
158 | type UCompareOption = Word32
159 | 
160 | -- | Options to 'compare'.
161 | data CompareOption = InputIsFCD
162 |                    -- ^ The caller knows that both strings fulfill the
163 |                    -- 'FCD' conditions.  If /not/ set, 'compare' will
164 |                    -- 'quickCheck' for 'FCD' and normalize if
165 |                    -- necessary.
166 |                    | CompareIgnoreCase
167 |                    -- ^ Compare strings case-insensitively using case
168 |                    -- folding, instead of case-sensitively.  If set,
169 |                    -- then the following case folding options are
170 |                    -- used.
171 |                    | FoldCaseExcludeSpecialI
172 |                    -- ^ When case folding, exclude the special I
173 |                    -- character.  For use with Turkic
174 |                    -- (Turkish/Azerbaijani) text data.
175 |                      deriving (Eq, Show, Enum, Typeable)
176 | 
177 | fromCompareOption :: CompareOption -> UCompareOption
178 | fromCompareOption InputIsFCD              = #const UNORM_INPUT_IS_FCD
179 | fromCompareOption CompareIgnoreCase       = #const U_COMPARE_IGNORE_CASE
180 | fromCompareOption FoldCaseExcludeSpecialI = #const U_FOLD_CASE_EXCLUDE_SPECIAL_I
181 | 
182 | reduceCompareOptions :: [CompareOption] -> UCompareOption
183 | reduceCompareOptions = foldl' orO (#const U_COMPARE_CODE_POINT_ORDER)
184 |     where a `orO` b = a .|. fromCompareOption b
185 | 
186 | type UNormalizationMode = CInt
187 | 
188 | -- | Normalization modes.
189 | data NormalizationMode
190 |     = None   -- ^ No decomposition/composition.
191 |     | NFD    -- ^ Canonical decomposition.
192 |     | NFKD   -- ^ Compatibility decomposition.
193 |     | NFC    -- ^ Canonical decomposition followed by canonical composition.
194 |     | NFKC   -- ^ Compatibility decomposition followed by canonical composition.
195 |     | FCD    -- ^ \"Fast C or D\" form.
196 |       deriving (Eq, Show, Enum, Typeable)
197 | 
198 | toNM :: NormalizationMode -> UNormalizationMode
199 | toNM None = #const UNORM_NONE
200 | toNM NFD  = #const UNORM_NFD
201 | toNM NFKD = #const UNORM_NFKD
202 | toNM NFC  = #const UNORM_NFC
203 | toNM NFKC = #const UNORM_NFKC
204 | toNM FCD  = #const UNORM_FCD
205 | 
206 | -- | Normalize a string according to the specified normalization mode.
207 | normalize :: NormalizationMode -> Text -> Text
208 | normalize mode t = unsafePerformIO . useAsUCharPtr t $ \sptr slen ->
209 |   let slen' = fromIntegral slen
210 |       mode' = toNM mode
211 |   in handleOverflowError (fromIntegral slen)
212 |      (\dptr dlen -> unorm_normalize sptr slen' mode' 0 dptr (fromIntegral dlen))
213 |      (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen))
214 | 
215 | 
216 | -- | Perform an efficient check on a string, to quickly determine if
217 | -- the string is in a particular normalization form.
218 | --
219 | -- A 'Nothing' result indicates that a definite answer could not be
220 | -- determined quickly, and a more thorough check is required,
221 | -- e.g. with 'isNormalized'.  The user may have to convert the string
222 | -- to its normalized form and compare the results.
223 | --
224 | -- A result of 'Just' 'True' or 'Just' 'False' indicates that the
225 | -- string definitely is, or is not, in the given normalization form.
226 | quickCheck :: NormalizationMode -> Text -> Maybe Bool
227 | quickCheck mode t =
228 |   unsafePerformIO . useAsUCharPtr t $ \ptr len ->
229 |     fmap toNCR . handleError $ unorm_quickCheck ptr (fromIntegral len)
230 |                                (toNM mode)
231 | 
232 | -- | Indicate whether a string is in a given normalization form.
233 | --
234 | -- Unlike 'quickCheck', this function returns a definitive result.
235 | -- For 'NFD', 'NFKD', and 'FCD' normalization forms, both functions
236 | -- work in exactly the same ways.  For 'NFC' and 'NFKC' forms, where
237 | -- 'quickCheck' may return 'Nothing', this function will perform
238 | -- further tests to arrive at a definitive result.
239 | isNormalized :: NormalizationMode -> Text -> Bool
240 | isNormalized mode t =
241 |   unsafePerformIO . useAsUCharPtr t $ \ptr len ->
242 |     fmap asBool . handleError $ unorm_isNormalized ptr (fromIntegral len)
243 |                                 (toNM mode)
244 | 
245 | -- | Compare two strings for canonical equivalence.  Further options
246 | -- include case-insensitive comparison and codepoint order (as
247 | -- opposed to code unit order).
248 | --
249 | -- Canonical equivalence between two strings is defined as their
250 | -- normalized forms ('NFD' or 'NFC') being identical.  This function
251 | -- compares strings incrementally instead of normalizing (and
252 | -- optionally case-folding) both strings entirely, improving
253 | -- performance significantly.
254 | --
255 | -- Bulk normalization is only necessary if the strings do not fulfill
256 | -- the 'FCD' conditions. Only in this case, and only if the strings
257 | -- are relatively long, is memory allocated temporarily.  For 'FCD'
258 | -- strings and short non-'FCD' strings there is no memory allocation.
259 | compare :: [CompareOption] -> Text -> Text -> Ordering
260 | compare opts a b = unsafePerformIO .
261 |   useAsUCharPtr a $ \aptr alen ->
262 |     useAsUCharPtr b $ \bptr blen ->
263 |       fmap asOrdering . handleError $
264 |       unorm_compare aptr (fromIntegral alen) bptr (fromIntegral blen)
265 |                     (reduceCompareOptions opts)
266 | 
267 | foreign import ccall unsafe "hs_text_icu.h __hs_unorm_compare" unorm_compare
268 |     :: Ptr UChar -> Int32 
269 |     -> Ptr UChar -> Int32 
270 |     -> Word32
271 |     -> Ptr UErrorCode 
272 |     -> IO Int32
273 | 
274 | foreign import ccall unsafe "hs_text_icu.h __hs_unorm_quickCheck" unorm_quickCheck
275 |     :: Ptr UChar -> Int32 
276 |     -> UNormalizationMode 
277 |     -> Ptr UErrorCode
278 |     -> IO UNormalizationCheckResult
279 | 
280 | foreign import ccall unsafe "hs_text_icu.h __hs_unorm_isNormalized" unorm_isNormalized
281 |     :: Ptr UChar -> Int32 
282 |     -> UNormalizationMode 
283 |     -> Ptr UErrorCode 
284 |     -> IO UBool
285 | 
286 | foreign import ccall unsafe "hs_text_icu.h __hs_unorm_normalize" unorm_normalize
287 |     :: Ptr UChar -> Int32 
288 |     -> UNormalizationMode 
289 |     -> Int32
290 |     -> Ptr UChar -> Int32 
291 |     -> Ptr UErrorCode 
292 |     -> IO Int32
293 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Normalize/Internal.hsc:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE CPP, DeriveDataTypeable, ForeignFunctionInterface #-}
 2 | -- |
 3 | -- Module      : Data.Text.ICU.Normalize.Internal
 4 | -- Copyright   : (c) 2010 Bryan O'Sullivan
 5 | --
 6 | -- License     : BSD-style
 7 | -- Maintainer  : bos@serpentine.com
 8 | -- Stability   : experimental
 9 | -- Portability : GHC
10 | 
11 | module Data.Text.ICU.Normalize.Internal
12 |     (
13 |       UNormalizationCheckResult
14 |     , toNCR
15 |     ) where
16 | 
17 | #include <unicode/unorm.h>
18 | 
19 | import Foreign.C.Types (CInt)
20 | 
21 | type UNormalizationCheckResult = CInt
22 | 
23 | toNCR :: UNormalizationCheckResult -> Maybe Bool
24 | toNCR (#const UNORM_NO)    = Just False
25 | toNCR (#const UNORM_MAYBE) = Nothing
26 | toNCR (#const UNORM_YES)   = Just True
27 | toNCR _                    = error "toNormalizationCheckResult"
28 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Number.hsc:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE CPP, DeriveDataTypeable, ForeignFunctionInterface #-}
  2 | -- |
  3 | -- Module      : Data.Text.ICU.Number
  4 | -- Copyright   : (c) 2020 Torsten Kemps-Benedix
  5 | --
  6 | -- License     : BSD-style
  7 | -- Maintainer  : tkx68@icloud.com
  8 | -- Stability   : experimental
  9 | -- Portability : GHC
 10 | --
 11 | -- New users with are strongly encouraged to see
 12 | -- if Data.Text.ICU.NumberFormatter fits their use case.
 13 | -- Although not deprecated, this header is provided for backwards
 14 | -- compatibility only.
 15 | 
 16 | module Data.Text.ICU.Number
 17 |     (
 18 |     -- * Unicode number formatting API
 19 |     -- $api
 20 |     numberFormatter
 21 |     , FormattableNumber, formatNumber, formatNumber'
 22 |     , NumberFormatStyle(..)
 23 |     , NumberFormat
 24 | ) where
 25 | 
 26 | #ifdef mingw32_HOST_OS
 27 | #define U_HAVE_INTTYPES_H 1
 28 | #endif
 29 | 
 30 | #include <unicode/unum.h>
 31 | #include <unicode/parseerr.h>
 32 | 
 33 | import GHC.Natural
 34 | import Data.Text (Text)
 35 | import qualified Data.Text as T
 36 | import Data.Text.ICU.Error
 37 | import Data.Text.ICU.Error.Internal (UErrorCode, UParseError, handleParseError, handleOverflowError)
 38 | import Data.Text.ICU.Internal (UChar, useAsUCharPtr, fromUCharPtr)
 39 | import Data.Text.ICU.Internal (LocaleName, withLocaleName)
 40 | import Data.Typeable (Typeable)
 41 | import Data.Int (Int32)
 42 | import Foreign.C.Types (CInt(..), CDouble(..))
 43 | import Foreign.Ptr (Ptr)
 44 | import System.IO.Unsafe (unsafePerformIO)
 45 | import Prelude hiding (compare)
 46 | import Foreign.C.String (CString)
 47 | import Data.Text.ICU.Number.Internal
 48 | -- $api
 49 | --
 50 | -- This module helps you to format and parse numbers for any locale. Your code
 51 | -- can be completely independent of the locale conventions for decimal points,
 52 | -- thousands-separators, or even the particular decimal digits used, or whether
 53 | -- the number format is even decimal. There are different number format styles
 54 | -- like decimal, currency, percent and spelled-out.
 55 | --
 56 | -- Use 'formatter' to create a formatter and 'format' to format numbers.
 57 | 
 58 | -- | The possible number format styles.
 59 | data NumberFormatStyle
 60 |     = NUM_PATTERN_DECIMAL Text -- ^ Decimal format defined by a pattern string. See the section \"Patterns\" at <https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classDecimalFormat.html#Patterns> for further details regarding pattern strings.
 61 |     | NUM_DECIMAL -- ^ Decimal format ("normal" style).
 62 |     | NUM_CURRENCY -- ^ Currency format (generic). Defaults to UNUM_CURRENCY_STANDARD style (using currency symbol, e.g., "$1.00", with non-accounting style for negative values e.g. using minus sign). The specific style may be specified using the -cf- locale key.
 63 |     | NUM_PERCENT -- ^ Percent format.
 64 |     | NUM_SCIENTIFIC -- ^ Scientific format.
 65 |     | NUM_SPELLOUT -- ^ Spellout rule-based format. The default ruleset can be specified/changed using unum_setTextAttribute with UNUM_DEFAULT_RULESET; the available public rulesets can be listed using unum_getTextAttribute with UNUM_PUBLIC_RULESETS.
 66 |     | NUM_ORDINAL -- ^ Ordinal rule-based format. The default ruleset can be specified/changed using unum_setTextAttribute with UNUM_DEFAULT_RULESET; the available public rulesets can be listed using unum_getTextAttribute with UNUM_PUBLIC_RULESETS.
 67 |     | NUM_DURATION -- ^ Duration rule-based format.
 68 |     | NUM_NUMBERING_SYSTEM -- ^ Numbering system rule-based format.
 69 |     | NUM_PATTERN_RULEBASED Text -- ^ Rule-based format defined by a pattern string.  See the section \"Patterns\" at <https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classDecimalFormat.html#Patterns> for further details regarding pattern strings.
 70 |     | NUM_CURRENCY_ISO -- ^ Currency format with an ISO currency code, e.g., "USD1.00".
 71 |     | NUM_CURRENCY_PLURAL -- ^ Currency format with a pluralized currency name, e.g., "1.00 US dollar" and "3.00 US dollars".
 72 |     | NUM_CURRENCY_ACCOUNTING -- ^ Currency format for accounting, e.g., "($3.00)" for negative currency amount instead of "-$3.00" (UNUM_CURRENCY). Overrides any style specified using -cf- key in locale.
 73 |     | NUM_CASH_CURRENCY -- ^ Currency format with a currency symbol given CASH usage, e.g., "NT$3" instead of "NT$3.23".
 74 |     | NUM_DECIMAL_COMPACT_SHORT -- ^ Decimal format expressed using compact notation (short form, corresponds to UNumberCompactStyle=UNUM_SHORT) e.g. "23K", "45B"
 75 |     | NUM_DECIMAL_COMPACT_LONG -- ^ Decimal format expressed using compact notation (long form, corresponds to UNumberCompactStyle=UNUM_LONG) e.g. "23 thousand", "45 billion"
 76 |     | NUM_CURRENCY_STANDARD -- ^ Currency format with a currency symbol, e.g., "$1.00", using non-accounting style for negative values (e.g. minus sign). Overrides any style specified using -cf- key in locale.
 77 |     | NUM_FORMAT_STYLE_COUNT -- ^ One more than the highest normal UNumberFormatStyle value. Deprecated: ICU 58 The numeric value may change over time, see ICU ticket #12420.
 78 |     | NUM_DEFAULT -- ^ Default format.
 79 |     | NUM_IGNORE -- ^ Alias for NUM_PATTERN_DECIMAL.
 80 |       deriving (Eq, Show, Typeable)
 81 | 
 82 | type UNumberFormatStyle = CInt
 83 | 
 84 | toNFS :: NumberFormatStyle -> UNumberFormatStyle
 85 | toNFS (NUM_PATTERN_DECIMAL _) = #const UNUM_PATTERN_DECIMAL
 86 | toNFS NUM_DECIMAL = #const UNUM_DECIMAL
 87 | toNFS NUM_CURRENCY = #const UNUM_CURRENCY
 88 | toNFS NUM_PERCENT = #const UNUM_PERCENT
 89 | toNFS NUM_SCIENTIFIC = #const UNUM_SCIENTIFIC
 90 | toNFS NUM_SPELLOUT = #const UNUM_SPELLOUT
 91 | toNFS NUM_ORDINAL = #const UNUM_ORDINAL
 92 | toNFS NUM_DURATION = #const UNUM_DURATION
 93 | toNFS NUM_NUMBERING_SYSTEM = #const UNUM_NUMBERING_SYSTEM
 94 | toNFS (NUM_PATTERN_RULEBASED _) = #const UNUM_PATTERN_RULEBASED
 95 | toNFS NUM_CURRENCY_ISO = #const UNUM_CURRENCY_ISO
 96 | toNFS NUM_CURRENCY_PLURAL = #const UNUM_CURRENCY_PLURAL
 97 | toNFS NUM_CURRENCY_ACCOUNTING = #const UNUM_CURRENCY_ACCOUNTING
 98 | toNFS NUM_CASH_CURRENCY = #const UNUM_CASH_CURRENCY
 99 | toNFS NUM_DECIMAL_COMPACT_SHORT = #const UNUM_DECIMAL_COMPACT_SHORT
100 | toNFS NUM_DECIMAL_COMPACT_LONG = #const UNUM_DECIMAL_COMPACT_LONG
101 | toNFS NUM_CURRENCY_STANDARD = #const UNUM_CURRENCY_STANDARD
102 | toNFS NUM_FORMAT_STYLE_COUNT = #const UNUM_FORMAT_STYLE_COUNT
103 | toNFS NUM_DEFAULT = #const UNUM_DEFAULT
104 | toNFS NUM_IGNORE = #const UNUM_IGNORE
105 | 
106 | -- | Create and return a new NumberFormat for formatting and parsing numbers.
107 | --
108 | -- A NumberFormat may be used to format numbers by calling unum_format, and
109 | -- to parse numbers by calling unum_parse. The caller must call unum_close when
110 | -- done to release resources used by this object.
111 | numberFormatter :: NumberFormatStyle -- ^ The type of number format to open. If NUM_PATTERN_DECIMAL or NUM_PATTERN_RULEBASED is passed then the number format is opened using the given pattern, which must conform to the syntax described in DecimalFormat or RuleBasedNumberFormat, respectively.
112 |         -> LocaleName -- ^ 	A locale identifier to use to determine formatting and parsing conventions, or NULL to use the default locale, e.g. "de_DE".
113 |         -> NumberFormat
114 | numberFormatter sty@(NUM_PATTERN_DECIMAL pattern) loc = numberFormatter' (toNFS sty) pattern loc
115 | numberFormatter sty@(NUM_PATTERN_RULEBASED pattern) loc = numberFormatter' (toNFS sty) pattern loc
116 | numberFormatter style loc = numberFormatter' (toNFS style) T.empty loc
117 | 
118 | numberFormatter' :: UNumberFormatStyle -- ^ The type of number format to open. If NUM_PATTERN_DECIMAL or NUM_PATTERN_RULEBASED is passed then the number format is opened using the given pattern, which must conform to the syntax described in DecimalFormat or RuleBasedNumberFormat, respectively.
119 |         -> Text
120 |         -> LocaleName -- ^ 	A locale identifier to use to determine formatting and parsing conventions, or NULL to use the default locale, e.g. "de_DE".
121 |         -> NumberFormat
122 | numberFormatter' style pattern loc =
123 |     System.IO.Unsafe.unsafePerformIO $ fmap C $ wrap $
124 |     useAsUCharPtr pattern $ \patternPtr patternLen ->
125 |             withLocaleName loc $
126 |                 handleParseError (== u_PARSE_ERROR) . (unum_open style patternPtr (fromIntegral patternLen))
127 | 
128 | foreign import ccall unsafe "hs_text_icu.h __hs_unum_open" unum_open
129 |     :: UNumberFormatStyle -> Ptr UChar -> Int32 -> CString -> Ptr UParseError  -> Ptr UErrorCode -> IO (Ptr UNumberFormat)
130 | 
131 | -- | Format an integer using a NumberFormat.
132 | --
133 | -- The integer will be formatted according to the UNumberFormat's locale.
134 | class FormattableNumber n where
135 |     formatNumber :: NumberFormat -- ^ The formatter to use.
136 |                 -> n -- ^ The number to format.
137 |                 -> Text
138 | 
139 | -- | Create a formatter and apply it in one step.
140 | formatNumber' :: (FormattableNumber n)
141 |             => NumberFormatStyle -- ^ The type of number format to open. If NUM_PATTERN_DECIMAL or NUM_PATTERN_RULEBASED is passed then the number format is opened using the given pattern, which must conform to the syntax described in DecimalFormat or RuleBasedNumberFormat, respectively.
142 |             -> LocaleName -- ^ 	A locale identifier to use to determine formatting and parsing conventions, or NULL to use the default locale, e.g. "de_DE".
143 |             -> n -- ^ The number to format.
144 |             -> Text
145 | formatNumber' style loc x = formatNumber (numberFormatter style loc) x
146 | 
147 | instance FormattableNumber Integer where
148 |     formatNumber (C nf) x = numberFormatInt nf (fromIntegral x)
149 | 
150 | instance FormattableNumber Natural where
151 |     formatNumber (C nf) x = numberFormatInt nf (fromIntegral x)
152 | 
153 | instance FormattableNumber Int where
154 |     formatNumber (C nf) x = numberFormatInt nf x
155 | 
156 | instance FormattableNumber Double where
157 |     formatNumber (C nf) x = numberFormatDouble nf x
158 | 
159 | instance FormattableNumber Float where
160 |     formatNumber (C nf) x = numberFormatDouble nf (fromRational $ toRational x)
161 | 
162 | -- | Create a number format.
163 | numberFormatInt :: MNumberFormat -> Int -> Text
164 | numberFormatInt nf x = System.IO.Unsafe.unsafePerformIO $
165 |   withNumberFormat nf $ \nptr ->
166 |     handleOverflowError 100
167 |         (\dptr dlen ec -> unum_formatInt64 nptr (fromIntegral x) dptr (fromIntegral dlen) ec)
168 |         (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen))
169 | 
170 | -- | Format a number.
171 | numberFormatDouble :: MNumberFormat -> Double -> Text
172 | numberFormatDouble nf x = System.IO.Unsafe.unsafePerformIO $
173 |   withNumberFormat nf $ \nptr ->
174 |     handleOverflowError 100
175 |         (\dptr dlen ec -> unum_formatDouble nptr (CDouble x) dptr (fromIntegral dlen) ec)
176 |         (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen))
177 | 
178 | foreign import ccall unsafe "hs_text_icu.h __hs_unum_formatInt64" unum_formatInt64
179 |     :: Ptr UNumberFormat -> Int -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32
180 | foreign import ccall unsafe "hs_text_icu.h __hs_unum_formatDouble" unum_formatDouble
181 |     :: Ptr UNumberFormat -> CDouble -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32
182 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Number/Internal.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE DeriveDataTypeable, EmptyDataDecls, ForeignFunctionInterface #-}
 2 | -- |
 3 | -- Module      : Data.Text.ICU.Collate.Internal
 4 | -- Copyright   : (c) 2010 Bryan O'Sullivan
 5 | --
 6 | -- License     : BSD-style
 7 | -- Maintainer  : bos@serpentine.com
 8 | -- Stability   : experimental
 9 | -- Portability : GHC
10 | --
11 | -- Internals of the string collation infrastructure.
12 | 
13 | module Data.Text.ICU.Number.Internal
14 |     (
15 |     -- * Unicode collation API
16 |       MNumberFormat(..)
17 |     , NumberFormat(..)
18 |     , UNumberFormat
19 |     , withNumberFormat
20 |     , wrap
21 |     )
22 | where
23 | 
24 | import Data.Typeable (Typeable)
25 | import Foreign.ForeignPtr (ForeignPtr, withForeignPtr)
26 | import Foreign.Ptr (FunPtr, Ptr)
27 | import Data.Text.ICU.Internal (newICUPtr)
28 | 
29 | -- $api
30 | --
31 | 
32 | data UNumberFormat
33 | 
34 | -- | This is the number formatter. It can be created with 'formatter'. Use it to format numbers with the 'format' function.
35 | data MNumberFormat = MNumberFormat {-# UNPACK #-} !(ForeignPtr UNumberFormat)
36 |                  deriving (Typeable)
37 | 
38 | -- | This is the number formatter. It can be created with 'formatter'. Use it to format numbers with the 'format' function.
39 | newtype NumberFormat = C MNumberFormat
40 |     deriving (Typeable)
41 | 
42 | withNumberFormat :: MNumberFormat -> (Ptr UNumberFormat -> IO a) -> IO a
43 | withNumberFormat (MNumberFormat col) action = withForeignPtr col action
44 | {-# INLINE withNumberFormat #-}
45 | 
46 | wrap :: IO (Ptr UNumberFormat) -> IO MNumberFormat
47 | wrap = newICUPtr MNumberFormat unum_close
48 | {-# INLINE wrap #-}
49 | 
50 | foreign import ccall unsafe "hs_text_icu.h &__hs_unum_close" unum_close
51 |     :: FunPtr (Ptr UNumberFormat -> IO ())
52 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/NumberFormatter.hsc:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE EmptyDataDecls, RankNTypes, BangPatterns, ForeignFunctionInterface, RecordWildCards #-}
  2 | -- |
  3 | -- Module      : Data.Text.ICU.NumberFormatter
  4 | -- Copyright   : (c) 2021 Torsten Kemps-Benedix
  5 | --
  6 | -- License     : BSD-style
  7 | -- Maintainer  : bos@serpentine.com
  8 | -- Stability   : experimental
  9 | -- Portability : GHC
 10 | --
 11 | -- Number formatter implemented as bindings to
 12 | -- the International Components for Unicode (ICU) libraries.
 13 | 
 14 | module Data.Text.ICU.NumberFormatter
 15 |     (
 16 |       -- * Data
 17 |       NumberFormatter,
 18 |       -- * Formatter
 19 |       numberFormatter,
 20 |       -- $skeleton
 21 |       -- * Formatting functions
 22 |       formatIntegral, formatIntegral', formatDouble, formatDouble'
 23 |     ) where
 24 | 
 25 | #include <unicode/unumberformatter.h>
 26 | 
 27 | import Data.Int (Int32, Int64)
 28 | import Data.Text (Text)
 29 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError, handleOverflowError)
 30 | import Data.Text.ICU.Internal (LocaleName(..), UChar, withLocaleName, newICUPtr, fromUCharPtr, useAsUCharPtr)
 31 | import Foreign.C.String (CString)
 32 | import Foreign.C.Types (CDouble(..))
 33 | import Foreign.ForeignPtr (withForeignPtr, ForeignPtr)
 34 | import Foreign.Ptr (FunPtr, Ptr)
 35 | import Prelude hiding (last)
 36 | import System.IO.Unsafe (unsafePerformIO)
 37 | 
 38 | -- $skeleton
 39 | --
 40 | -- Here are some examples for number skeletons, see
 41 | -- https://unicode-org.github.io/icu/userguide/format_parse/numbers/skeletons.html#examples for more:
 42 | --
 43 | -- +----------------------------+-----------------+--------+--------------+-------------------------------------------------------------+
 44 | -- | Long Skeleton              | Concise Skeleton | Input | en-US Output | Comments                                                    |
 45 | -- +============================+==================+=======+==============+=============================================================+
 46 | -- | percent                    | %                | 25    | 25%          |                                                             |
 47 | -- | .00                        |.00               | 25    | 25.00        | Equivalent to Precision::fixedFraction(2)                   |
 48 | -- | percent .00                | % .00            | 25    | 25.00%       |                                                             |
 49 | -- | scale/100                  | scale/100        | 0.3   | 30           | Multiply by 100 before formatting                           |
 50 | -- | percent scale/100          | %x100            | 0.3   | 30%          |                                                             |
 51 | -- | measure-unit/length-meter  | unit/meter       | 5     | 5 m          | UnitWidth defaults to Short                                 |
 52 | -- | unit-width-full-name       | unit/meter       | 5     | 5 meters     |                                                             |
 53 | -- | compact-short              | K                | 5000  | 5K           |                                                             |
 54 | -- | compact-long               | KK               | 5000  | 5 thousand   |                                                             |
 55 | -- | group-min2                 | ,?               | 5000  | 5000         | Require 2 digits in group for separator                     |
 56 | -- | group-min2                 | ,?               | 15000 | 15,000       |                                                             |
 57 | -- | sign-always                | +!               | 60    | +60          | Show sign on all numbers                                    |
 58 | -- | sign-always                | +!               | 0     | +0           |                                                             |
 59 | -- | sign-except-zero           | +?               | 60    | +60          | Show sign on all numbers except 0                           |
 60 | -- | sign-except-zero           | +?               | 0     | 0            |                                                             |
 61 | -- +----------------------------+-----------------+--------+--------------+-------------------------------------------------------------+
 62 | 
 63 | data UNumberFormatter
 64 | data UFormattedNumber
 65 | 
 66 | newtype NumberFormatter = NumberFormatter (ForeignPtr UNumberFormatter)
 67 | 
 68 | -- | Create a new 'NumberFormatter'.
 69 | --
 70 | -- See https://unicode-org.github.io/icu/userguide/format_parse/numbers/skeletons.html for how to specify
 71 | -- the number skeletons. And use 'availableLocales' in order to find the allowed locale names. These
 72 | -- usuallly look like "en", "de", "de_AT" etc. See 'formatIntegral' and 'formatDouble' for some examples.
 73 | numberFormatter :: Text -> LocaleName -> IO NumberFormatter
 74 | numberFormatter skel loc =
 75 |   withLocaleName loc $ \locale ->
 76 |     useAsUCharPtr skel $ \skelPtr skelLen ->
 77 |       newICUPtr NumberFormatter unumf_close $
 78 |         handleError $ unumf_openForSkeletonAndLocale skelPtr (fromIntegral skelLen) locale
 79 | 
 80 | -- | Format an integral number.
 81 | --
 82 | -- See https://unicode-org.github.io/icu/userguide/format_parse/numbers/skeletons.html for how to specify
 83 | -- the number skeletons.
 84 | --
 85 | -- >>> import Data.Text
 86 | -- >>> nf <- numberFormatter (pack "precision-integer") (Locale "de")
 87 | -- >>> formatIntegral nf 12345
 88 | -- "12.345"
 89 | -- >>> nf2 <- numberFormatter (pack "precision-integer") (Locale "fr")
 90 | -- >>> formatIntegral nf2 12345
 91 | -- "12\8239\&345"
 92 | formatIntegral :: Integral a => NumberFormatter -> a -> Text
 93 | formatIntegral (NumberFormatter nf) x = unsafePerformIO $ do
 94 |   withForeignPtr nf $ \nfPtr -> do
 95 |     resultPtr <- newResult
 96 |     withForeignPtr resultPtr $ \resPtr -> do
 97 |       handleError $ unumf_formatInt nfPtr (fromIntegral x) resPtr
 98 |       t <- handleOverflowError (fromIntegral (64 :: Int))
 99 |         (\dptr dlen -> unumf_resultToString resPtr dptr dlen)
100 |         (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen))
101 |       pure t
102 | 
103 | -- | Create a number formatter and apply it to an integral number.
104 | formatIntegral' :: (Integral a) => Text -> LocaleName -> a -> Text
105 | formatIntegral' skel loc x = unsafePerformIO $ do
106 |   nf <- numberFormatter skel loc
107 |   pure $ formatIntegral nf x
108 | 
109 | -- | Format a Double.
110 | --
111 | -- See https://unicode-org.github.io/icu/userguide/format_parse/numbers/skeletons.html for how to specify
112 | -- the number skeletons.
113 | --
114 | -- >>> import Data.Text
115 | -- >>> nf3 <- numberFormatter (pack "precision-currency-cash") (Locale "it")
116 | -- >>> formatDouble nf3 12345.6789
117 | -- "12.345,68"
118 | formatDouble :: NumberFormatter -> Double -> Text
119 | formatDouble (NumberFormatter nf) x = unsafePerformIO $ do
120 |   withForeignPtr nf $ \nfPtr -> do
121 |     resultPtr <- newResult
122 |     withForeignPtr resultPtr $ \resPtr -> do
123 |       handleError $ unumf_formatDouble nfPtr (CDouble x) resPtr
124 |       t <- handleOverflowError (fromIntegral (64 :: Int))
125 |         (\dptr dlen -> unumf_resultToString resPtr dptr dlen)
126 |         (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen))
127 |       pure t
128 | 
129 | -- | Create a number formatter and apply it to a Double.
130 | formatDouble' :: Text -> LocaleName -> Double -> Text
131 | formatDouble' skel loc x = unsafePerformIO $ do
132 |   nf <- numberFormatter skel loc
133 |   pure $ formatDouble nf x
134 | 
135 | newResult :: IO (ForeignPtr UFormattedNumber)
136 | newResult = newICUPtr id unumf_closeResult $ handleError unumf_openResult
137 | 
138 | foreign import ccall unsafe "hs_text_icu.h __hs_unumf_openForSkeletonAndLocale" unumf_openForSkeletonAndLocale
139 |     :: Ptr UChar -> Int32 -> CString -> Ptr UErrorCode -> IO (Ptr UNumberFormatter)
140 | foreign import ccall unsafe "hs_text_icu.h &__hs_unumf_close" unumf_close
141 |     :: FunPtr (Ptr UNumberFormatter -> IO ())
142 | foreign import ccall unsafe "hs_text_icu.h __hs_unumf_openResult" unumf_openResult
143 |     :: Ptr UErrorCode -> IO (Ptr UFormattedNumber)
144 | foreign import ccall unsafe "hs_text_icu.h &__hs_unumf_closeResult" unumf_closeResult
145 |     :: FunPtr (Ptr UFormattedNumber -> IO ())
146 | foreign import ccall unsafe "hs_text_icu.h __hs_unumf_formatInt" unumf_formatInt
147 |     :: Ptr UNumberFormatter -> Int64 -> Ptr UFormattedNumber -> Ptr UErrorCode -> IO ()
148 | foreign import ccall unsafe "hs_text_icu.h __hs_unumf_formatDouble" unumf_formatDouble
149 |     :: Ptr UNumberFormatter -> CDouble -> Ptr UFormattedNumber -> Ptr UErrorCode -> IO ()
150 | foreign import ccall unsafe "hs_text_icu.h __hs_unumf_resultToString" unumf_resultToString
151 |     :: Ptr UFormattedNumber -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32
152 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Regex.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns, EmptyDataDecls, MagicHash, RecordWildCards,
  2 |     ScopedTypeVariables #-}
  3 | {-# OPTIONS_GHC -fno-warn-orphans #-}
  4 | 
  5 | -- |
  6 | -- Module      : Data.Text.ICU.Regex
  7 | -- Copyright   : (c) 2010 Bryan O'Sullivan
  8 | --
  9 | -- License     : BSD-style
 10 | -- Maintainer  : bos@serpentine.com
 11 | -- Stability   : experimental
 12 | -- Portability : GHC
 13 | --
 14 | -- Regular expression support for Unicode, implemented as bindings to
 15 | -- the International Components for Unicode (ICU) libraries.
 16 | --
 17 | -- The syntax and behaviour of ICU regular expressions are Perl-like.
 18 | -- For complete details, see the ICU User Guide entry at
 19 | -- <http://userguide.icu-project.org/strings/regexp>.
 20 | --
 21 | -- /Note/: The functions in this module are not thread safe.  For
 22 | -- thread safe use, see 'clone' below, or use the pure functions in
 23 | -- "Data.Text.ICU".
 24 | 
 25 | module Data.Text.ICU.Regex
 26 |     (
 27 |     -- * Types
 28 |       MatchOption(..)
 29 |     , ParseError(errError, errLine, errOffset)
 30 |     , Regex
 31 |     -- * Functions
 32 |     -- ** Construction
 33 |     , regex
 34 |     , regex'
 35 |     , clone
 36 |     -- ** Managing text to search
 37 |     , setText
 38 |     , getUTextPtr
 39 |     -- ** Inspection
 40 |     , pattern
 41 |     -- ** Searching
 42 |     , find
 43 |     , findNext
 44 |     -- ** Match groups
 45 |     -- $groups
 46 |     , groupCount
 47 |     , start
 48 |     , end
 49 |     , start_
 50 |     , end_
 51 |     ) where
 52 | 
 53 | import Data.Text.ICU.Regex.Internal
 54 | import qualified Control.Exception as E
 55 | import Data.IORef (newIORef, readIORef, writeIORef)
 56 | import Data.Text (Text)
 57 | import Data.Text.ICU.Internal (asBool, UTextPtr, asUTextPtr, emptyUTextPtr, TextI, withUTextPtr, fromUCharPtr, newICUPtr)
 58 | import Data.Text.ICU.Error.Internal (ParseError(..), handleError)
 59 | import Foreign.ForeignPtr (withForeignPtr)
 60 | import Foreign.Marshal.Alloc (alloca)
 61 | import Foreign.Storable (peek)
 62 | import System.IO.Unsafe (unsafePerformIO)
 63 | 
 64 | instance Show Regex where
 65 |     show re = "Regex " ++ show (pattern re)
 66 | 
 67 | -- $groups
 68 | --
 69 | -- Capturing groups are numbered starting from zero.  Group zero is
 70 | -- always the entire matching text.  Groups greater than zero contain
 71 | -- the text matching each capturing group in a regular expression.
 72 | 
 73 | -- | Compile a regular expression with the given options.  This is
 74 | -- safest to use when the pattern is constructed at run time.
 75 | regex' :: [MatchOption] -> Text -> IO (Either ParseError Regex)
 76 | regex' opts pat = (Right `fmap` regex opts pat) `E.catch` \(err::ParseError) ->
 77 |                   return (Left err)
 78 | 
 79 | -- | Set the subject text string upon which the regular expression
 80 | -- will look for matches.  This function may be called any number of
 81 | -- times, allowing the regular expression pattern to be applied to
 82 | -- different strings.
 83 | setText :: Regex -> Text -> IO ()
 84 | setText Regex{..} t = do
 85 |   hayfp <- asUTextPtr t
 86 |   withForeignPtr reRe $ \rePtr ->
 87 |     withUTextPtr hayfp $ \hayPtr -> handleError $
 88 |       uregex_setUText rePtr hayPtr
 89 |   writeIORef reText hayfp
 90 | 
 91 | -- | Get the subject text that is currently associated with this
 92 | -- regular expression object.
 93 | getUTextPtr :: Regex -> IO UTextPtr
 94 | getUTextPtr Regex{..} = readIORef reText
 95 | 
 96 | -- | Return the source form of the pattern used to construct this
 97 | -- regular expression or match.
 98 | pattern :: Regex -> Text
 99 | pattern Regex{..} = unsafePerformIO . withForeignPtr reRe $ \rePtr ->
100 |   alloca $ \lenPtr -> do
101 |     textPtr <- handleError $ uregex_pattern rePtr lenPtr
102 |     (fromUCharPtr textPtr . fromIntegral) =<< peek lenPtr
103 | 
104 | -- | Find the first matching substring of the input string that
105 | -- matches the pattern.
106 | --
107 | -- If /n/ is non-negative, the search for a match begins at the
108 | -- specified index, and any match region is reset.
109 | --
110 | -- If /n/ is -1, the search begins at the start of the input region,
111 | -- or at the start of the full string if no region has been specified.
112 | --
113 | -- If a match is found, 'start', 'end', and 'group' will provide more
114 | -- information regarding the match.
115 | find :: Regex -> TextI -> IO Bool
116 | find Regex{..} n =
117 |     fmap asBool . withForeignPtr reRe $ \rePtr -> handleError $
118 |     uregex_find rePtr (fromIntegral n)
119 | 
120 | -- | Find the next pattern match in the input string.  Begin searching
121 | -- the input at the location following the end of he previous match,
122 | -- or at the start of the string (or region) if there is no previous
123 | -- match.
124 | --
125 | -- If a match is found, 'start', 'end', and 'group' will provide more
126 | -- information regarding the match.
127 | findNext :: Regex -> IO Bool
128 | findNext Regex{..} =
129 |     fmap asBool . withForeignPtr reRe $ handleError . uregex_findNext
130 | 
131 | -- | Make a copy of a compiled regular expression.  Cloning a regular
132 | -- expression is faster than opening a second instance from the source
133 | -- form of the expression, and requires less memory.
134 | --
135 | -- Note that the current input string and the position of any matched
136 | -- text within it are not cloned; only the pattern itself and and the
137 | -- match mode flags are copied.
138 | --
139 | -- Cloning can be particularly useful to threaded applications that
140 | -- perform multiple match operations in parallel.  Each concurrent RE
141 | -- operation requires its own instance of a 'Regex'.
142 | clone :: Regex -> IO Regex
143 | {-# INLINE clone #-}
144 | clone Regex{..} = do
145 |   newICUPtr Regex uregex_close
146 |     (withForeignPtr reRe (handleError . uregex_clone))
147 |     <*>
148 |     newIORef emptyUTextPtr
149 | 
150 | -- | Return the number of capturing groups in this regular
151 | -- expression's pattern.
152 | groupCount :: Regex -> IO Int
153 | groupCount Regex{..} =
154 |     fmap fromIntegral . withForeignPtr reRe $ handleError . uregex_groupCount
155 | 
156 | -- | Returns the index in the input string of the start of the text
157 | -- matched by the specified capture group during the previous match
158 | -- operation.  Returns @-1@ if the capture group was not part of the
159 | -- last match.
160 | start_ :: Regex -> Int -> IO TextI
161 | start_ Regex{..} n =
162 |     fmap fromIntegral . withForeignPtr reRe $ \rePtr -> handleError $
163 |     uregex_start rePtr (fromIntegral n)
164 | 
165 | -- | Returns the index in the input string of the end of the text
166 | -- matched by the specified capture group during the previous match
167 | -- operation.  Returns @-1@ if the capture group was not part of
168 | -- the last match.
169 | end_ :: Regex -> Int -> IO TextI
170 | end_ Regex{..} n =
171 |   fmap fromIntegral . withForeignPtr reRe $ \rePtr -> handleError $
172 |   uregex_end rePtr (fromIntegral n)
173 | 
174 | -- | Returns the index in the input string of the start of the text
175 | -- matched by the specified capture group during the previous match
176 | -- operation.  Returns 'Nothing' if the capture group was not part of
177 | -- the last match.
178 | start :: Regex -> Int -> IO (Maybe TextI)
179 | start r n = check `fmap` start_ r n
180 | 
181 | -- | Returns the index in the input string of the end of the text
182 | -- matched by the specified capture group during the previous match
183 | -- operation.  Returns 'Nothing' if the capture group was not part of
184 | -- the last match.
185 | end :: Regex -> Int -> IO (Maybe TextI)
186 | end r n = check `fmap` end_ r n
187 | 
188 | check :: TextI -> Maybe TextI
189 | check (-1) = Nothing
190 | check k    = Just $! fromIntegral k
191 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Regex/Internal.hsc:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns, DeriveDataTypeable, EmptyDataDecls,
  2 |     ForeignFunctionInterface, MagicHash, RecordWildCards,
  3 |     ScopedTypeVariables #-}
  4 | 
  5 | -- |
  6 | -- Module      : Data.Text.ICU.Regex.Internal
  7 | -- Copyright   : (c) 2010 Bryan O'Sullivan
  8 | --
  9 | -- License     : BSD-style
 10 | -- Maintainer  : bos@serpentine.com
 11 | -- Stability   : experimental
 12 | -- Portability : GHC
 13 | --
 14 | -- Regular expression support for Unicode, implemented as bindings to
 15 | -- the International Components for Unicode (ICU) libraries.
 16 | --
 17 | -- The syntax and behaviour of ICU regular expressions are Perl-like.
 18 | -- For complete details, see the ICU User Guide entry at
 19 | -- <http://userguide.icu-project.org/strings/regexp>.
 20 | 
 21 | module Data.Text.ICU.Regex.Internal
 22 |     (
 23 |     -- * Types
 24 |       MatchOption(..)
 25 |     , Regex(..)
 26 |     , URegularExpression
 27 |     -- * Functions
 28 |     , regex
 29 |     , uregex_clone
 30 |     , uregex_close
 31 |     , uregex_end
 32 |     , uregex_find
 33 |     , uregex_findNext
 34 |     , uregex_getText
 35 |     , uregex_group
 36 |     , uregex_groupCount
 37 |     , uregex_pattern
 38 |     , uregex_setUText
 39 |     , uregex_start
 40 |     ) where
 41 | 
 42 | import Control.Monad (when)
 43 | import Data.IORef (IORef, newIORef)
 44 | import Data.Int (Int32)
 45 | import Data.Text (Text)
 46 | import Data.Text.ICU.Internal (UBool, UChar, UTextPtr, UText, useAsUCharPtr, withUTextPtr, emptyUTextPtr, newICUPtr)
 47 | import Data.Text.ICU.Error (isRegexError)
 48 | import Data.Text.ICU.Error.Internal (UParseError, UErrorCode,
 49 |                                      handleError, handleParseError)
 50 | import Data.Typeable (Typeable)
 51 | import Data.Word (Word32)
 52 | import Foreign.ForeignPtr (ForeignPtr)
 53 | import Foreign.Ptr (FunPtr, Ptr)
 54 | 
 55 | #include <unicode/uregex.h>
 56 | 
 57 | -- | Options for controlling matching behaviour.
 58 | data MatchOption
 59 |     = CaseInsensitive
 60 |     -- ^ Enable case insensitive matching.
 61 |     | Comments
 62 |     -- ^ Allow comments and white space within patterns.
 63 |     | DotAll
 64 |     -- ^ If set, @\'.\'@ matches line terminators. Otherwise @\'.\'@
 65 |     -- matching stops at line end.
 66 |     | Literal
 67 |     -- ^ If set, treat the entire pattern as a literal string.
 68 |     --  Metacharacters or escape sequences in the input sequence will
 69 |     --  be given no special meaning.
 70 |     --
 71 |     --  The option 'CaseInsensitive' retains its meanings on matching
 72 |     --  when used in conjunction with this option.  Other options
 73 |     --  become superfluous.
 74 |     | Multiline
 75 |     -- ^ Control behaviour of @\'$\'@ and @\'^\'@. If set, recognize
 76 |     -- line terminators within string, Otherwise, match only at start
 77 |     -- and end of input string.
 78 |     | HaskellLines
 79 |     -- ^ Haskell-only line endings.  When this mode is enabled, only
 80 |     -- @\'\\n\'@ is recognized as a line ending in the behavior of
 81 |     -- @\'.\'@, @\'^\'@, and @\'$\'@.
 82 |     | UnicodeWord
 83 |     -- ^ Unicode word boundaries.  If set, @\'\\\\b\'@ uses the
 84 |     -- Unicode TR 29 definition of word boundaries.
 85 |     --
 86 |     -- /Warning/: Unicode word boundaries are quite different from
 87 |     -- traditional regular expression word boundaries.  See
 88 |     -- <http://unicode.org/reports/tr29/#Word_Boundaries>.
 89 |     | ErrorOnUnknownEscapes
 90 |     -- ^ Throw an error on unrecognized backslash escapes.  If set,
 91 |     -- fail with an error on patterns that contain backslash-escaped
 92 |     -- ASCII letters without a known special meaning.  If this flag is
 93 |     -- not set, these escaped letters represent themselves.
 94 |     | WorkLimit Int
 95 |     -- ^ Set a processing limit for match operations.
 96 |     --
 97 |     -- Some patterns, when matching certain strings, can run in
 98 |     -- exponential time.  For practical purposes, the match operation
 99 |     -- may appear to be in an infinite loop.  When a limit is set a
100 |     -- match operation will fail with an error if the limit is
101 |     -- exceeded.
102 |     --
103 |     -- The units of the limit are steps of the match engine.
104 |     -- Correspondence with actual processor time will depend on the
105 |     -- speed of the processor and the details of the specific pattern,
106 |     -- but will typically be on the order of milliseconds.
107 |     --
108 |     -- By default, the matching time is not limited.
109 |     | StackLimit Int
110 |     -- ^ Set the amount of heap storage available for use by the match
111 |     -- backtracking stack.
112 |     --
113 |     -- ICU uses a backtracking regular expression engine, with the
114 |     -- backtrack stack maintained on the heap.  This function sets the
115 |     -- limit to the amount of memory that can be used for this
116 |     -- purpose.  A backtracking stack overflow will result in an error
117 |     -- from the match operation that caused it.
118 |     --
119 |     -- A limit is desirable because a malicious or poorly designed
120 |     -- pattern can use excessive memory, potentially crashing the
121 |     -- process.  A limit is enabled by default.
122 |       deriving (Eq, Show, Typeable)
123 | 
124 | -- | A compiled regular expression.
125 | --
126 | -- 'Regex' values are usually constructed using the 'regex' or
127 | -- 'regex'' functions.  This type is also an instance of 'IsString',
128 | -- so if you have the @OverloadedStrings@ language extension enabled,
129 | -- you can construct a 'Regex' by simply writing the pattern in
130 | -- quotes (though this does not allow you to specify any 'Option's).
131 | data Regex = Regex {
132 |       reRe :: ForeignPtr URegularExpression
133 |     , reText :: IORef UTextPtr
134 |     }
135 | 
136 | -- | Compile a regular expression with the given options.  This
137 | -- function throws a 'ParseError' if the pattern is invalid.
138 | --
139 | -- The 'Regex' is initialized with empty text to search against.
140 | regex :: [MatchOption] -> Text -> IO Regex
141 | regex opts pat = useAsUCharPtr pat $ \pptr plen ->
142 |   newICUPtr Regex uregex_close (do
143 |     ptr <- handleParseError isRegexError $
144 |            uregex_open pptr (fromIntegral plen) flags
145 |     withUTextPtr hayfp $ \hayPtr -> handleError $
146 |       uregex_setUText ptr hayPtr
147 |     when (workLimit > -1) .
148 |       handleError $ uregex_setTimeLimit ptr (fromIntegral workLimit)
149 |     when (stackLimit > -1) .
150 |       handleError $ uregex_setStackLimit ptr (fromIntegral stackLimit)
151 |     return ptr)
152 |   <*> newIORef hayfp
153 |   where (flags,workLimit,stackLimit) = toURegexpOpts opts
154 |         hayfp = emptyUTextPtr
155 | 
156 | data URegularExpression
157 | 
158 | type URegexpFlag = Word32
159 | 
160 | toURegexpOpts :: [MatchOption] -> (URegexpFlag,Int,Int)
161 | toURegexpOpts = foldl go (0,-1,-1)
162 |   where
163 |     go (!flag,work,stack) opt = (flag+flag',work',stack')
164 |      where
165 |       flag' = case opt of
166 |                 CaseInsensitive       -> #const UREGEX_CASE_INSENSITIVE
167 |                 Comments              -> #const UREGEX_COMMENTS
168 |                 DotAll                -> #const UREGEX_DOTALL
169 |                 Literal               -> #const UREGEX_LITERAL
170 |                 Multiline             -> #const UREGEX_MULTILINE
171 |                 HaskellLines          -> #const UREGEX_UNIX_LINES
172 |                 UnicodeWord           -> #const UREGEX_UWORD
173 |                 ErrorOnUnknownEscapes -> #const UREGEX_ERROR_ON_UNKNOWN_ESCAPES
174 |                 _                     -> 0
175 |       work' = case opt of
176 |                 WorkLimit limit       -> limit
177 |                 _                     -> work
178 |       stack' = case opt of
179 |                  StackLimit limit     -> limit
180 |                  _                    -> stack
181 | 
182 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_open" uregex_open
183 |     :: Ptr UChar -> Int32 -> Word32 -> Ptr UParseError -> Ptr UErrorCode
184 |     -> IO (Ptr URegularExpression)
185 | 
186 | foreign import ccall unsafe "hs_text_icu.h &__hs_uregex_close" uregex_close
187 |     :: FunPtr (Ptr URegularExpression -> IO ())
188 | 
189 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_clone" uregex_clone
190 |     :: Ptr URegularExpression -> Ptr UErrorCode
191 |     -> IO (Ptr URegularExpression)
192 | 
193 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_pattern" uregex_pattern
194 |     :: Ptr URegularExpression -> Ptr Int32 -> Ptr UErrorCode
195 |     -> IO (Ptr UChar)
196 | 
197 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_setUText" uregex_setUText
198 |     :: Ptr URegularExpression -> Ptr UText -> Ptr UErrorCode -> IO ()
199 | 
200 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_getText" uregex_getText
201 |     :: Ptr URegularExpression -> Ptr Int32 -> Ptr UErrorCode -> IO (Ptr UChar)
202 | 
203 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_find" uregex_find
204 |     :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO UBool
205 | 
206 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_findNext" uregex_findNext
207 |     :: Ptr URegularExpression -> Ptr UErrorCode -> IO UBool
208 | 
209 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_start" uregex_start
210 |     :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO Int32
211 | 
212 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_end" uregex_end
213 |     :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO Int32
214 | 
215 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_groupCount" uregex_groupCount
216 |     :: Ptr URegularExpression -> Ptr UErrorCode -> IO Int32
217 | 
218 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_group" uregex_group
219 |     :: Ptr URegularExpression -> Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode
220 |     -> IO Int32
221 | 
222 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_setTimeLimit" uregex_setTimeLimit
223 |     :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO ()
224 | 
225 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_setStackLimit" uregex_setStackLimit
226 |     :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO ()
227 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Regex/Pure.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns, EmptyDataDecls, ScopedTypeVariables #-}
  2 | 
  3 | -- |
  4 | -- Module      : Data.Text.ICU.Regex.Pure
  5 | -- Copyright   : (c) 2010 Bryan O'Sullivan
  6 | --
  7 | -- License     : BSD-style
  8 | -- Maintainer  : bos@serpentine.com
  9 | -- Stability   : experimental
 10 | -- Portability : GHC
 11 | --
 12 | -- Regular expression support for Unicode, implemented as bindings to
 13 | -- the International Components for Unicode (ICU) libraries.
 14 | --
 15 | -- The functions in this module are pure and hence thread safe, but
 16 | -- may not be as fast or as flexible as those in the
 17 | -- "Data.Text.ICU.Regex" module.
 18 | --
 19 | -- The syntax and behaviour of ICU regular expressions are Perl-like.
 20 | -- For complete details, see the ICU User Guide entry at
 21 | -- <http://userguide.icu-project.org/strings/regexp>.
 22 | 
 23 | module Data.Text.ICU.Regex.Pure
 24 |     (
 25 |     -- * Types
 26 |       MatchOption(..)
 27 |     , ParseError(errError, errLine, errOffset)
 28 |     , Match
 29 |     , Regex
 30 |     , Regular
 31 |     -- * Functions
 32 |     -- ** Construction
 33 |     , regex
 34 |     , regex'
 35 |     -- ** Inspection
 36 |     , pattern
 37 |     -- ** Searching
 38 |     , find
 39 |     , findAll
 40 |     -- ** Match groups
 41 |     -- $group
 42 |     , groupCount
 43 |     , unfold
 44 |     , span
 45 |     , group
 46 |     , prefix
 47 |     , suffix
 48 |     ) where
 49 | 
 50 | import qualified Control.Exception as E
 51 | import Data.String (IsString(..))
 52 | import Data.Text (Text)
 53 | import qualified Data.Text as T
 54 | import qualified Data.Text.Foreign as T
 55 | import Data.Text.ICU.Internal (TextI, fromUCharPtr, lengthWord, withUTextPtrText, utextPtrLength)
 56 | import Data.Text.ICU.Error.Internal (ParseError(..), handleError)
 57 | import qualified Data.Text.ICU.Regex as IO
 58 | import Data.Text.ICU.Regex.Internal hiding (Regex(..), regex)
 59 | import qualified Data.Text.ICU.Regex.Internal as Internal
 60 | import Foreign.ForeignPtr (ForeignPtr, withForeignPtr)
 61 | import Foreign.Marshal.Alloc (alloca)
 62 | import Foreign.Marshal.Array (advancePtr)
 63 | import Foreign.Storable (peek)
 64 | import Prelude hiding (span)
 65 | import System.IO.Unsafe (unsafeInterleaveIO, unsafePerformIO)
 66 | 
 67 | -- | A compiled regular expression.
 68 | --
 69 | -- 'Regex' values are usually constructed using the 'regex' or
 70 | -- 'regex'' functions.  This type is also an instance of 'IsString',
 71 | -- so if you have the @OverloadedStrings@ language extension enabled,
 72 | -- you can construct a 'Regex' by simply writing the pattern in
 73 | -- quotes (though this does not allow you to specify any 'Option's).
 74 | newtype Regex = Regex {
 75 |       reRe :: Internal.Regex
 76 |     }
 77 | 
 78 | instance Show Regex where
 79 |     show re = "Regex " ++ show (pattern re)
 80 | 
 81 | instance IsString Regex where
 82 |     fromString = regex [] . T.pack
 83 | 
 84 | -- | A match for a regular expression.
 85 | data Match = Match {
 86 |       matchRe :: Internal.Regex
 87 |     , _matchPrev :: TextI
 88 |     }
 89 | 
 90 | instance Show Match where
 91 |     show m = "Match " ++ show (unfold group m)
 92 | 
 93 | -- | A typeclass for functions common to both 'Match' and 'Regex'
 94 | -- types.
 95 | class Regular r where
 96 |     regRe :: r -> Internal.Regex
 97 | 
 98 |     regFp :: r -> ForeignPtr URegularExpression
 99 |     regFp = Internal.reRe . regRe
100 |     {-# INLINE regFp #-}
101 | 
102 | instance Regular Match where
103 |     regRe = matchRe
104 | 
105 | instance Regular Regex where
106 |     regRe = reRe
107 | 
108 | -- | Compile a regular expression with the given options.  This
109 | -- function throws a 'ParseError' if the pattern is invalid, so it is
110 | -- best for use when the pattern is statically known.
111 | regex :: [MatchOption] -> Text -> Regex
112 | regex opts pat = Regex . unsafePerformIO $ IO.regex opts pat
113 | 
114 | -- | Compile a regular expression with the given options.  This is
115 | -- safest to use when the pattern is constructed at run time.
116 | regex' :: [MatchOption] -> Text -> Either ParseError Regex
117 | regex' opts pat = unsafePerformIO $
118 |   ((Right . Regex) `fmap` Internal.regex opts pat) `E.catch`
119 |   \(err::ParseError) -> return (Left err)
120 | 
121 | -- | Return the source form of the pattern used to construct this
122 | -- regular expression or match.
123 | pattern :: Regular r => r -> Text
124 | pattern r = unsafePerformIO . withForeignPtr (regFp r) $ \rePtr ->
125 |   alloca $ \lenPtr -> do
126 |     textPtr <- handleError $ uregex_pattern rePtr lenPtr
127 |     (fromUCharPtr textPtr . fromIntegral) =<< peek lenPtr
128 | 
129 | -- | Find the first match for the regular expression in the given text.
130 | find :: Regex -> Text -> Maybe Match
131 | find re0 haystack = unsafePerformIO .
132 |   matching re0 haystack $ \re -> do
133 |     m <- IO.findNext re
134 |     return $! if m then Just (Match re 0) else Nothing
135 | 
136 | -- | Lazily find all matches for the regular expression in the given
137 | -- text.
138 | findAll :: Regex -> Text -> [Match]
139 | findAll re0 haystack = unsafePerformIO . unsafeInterleaveIO $ go 0
140 |   where
141 |     len = fromIntegral . lengthWord $ haystack
142 |     go !n | n >= len  = return []
143 |           | otherwise = matching re0 haystack $ \re -> do
144 |       found <- IO.find re n
145 |       if found
146 |         then do
147 |           n' <- IO.end_ re 0
148 |           (Match re n:) `fmap` go n'
149 |         else return []
150 | 
151 | matching :: Regex -> Text -> (IO.Regex -> IO a) -> IO a
152 | matching (Regex re0) haystack act = do
153 |   re <- IO.clone re0
154 |   IO.setText re haystack
155 |   act re
156 | 
157 | -- $group
158 | --
159 | -- Capturing groups are numbered starting from zero.  Group zero is
160 | -- always the entire matching text.  Groups greater than zero contain
161 | -- the text matching each capturing group in a regular expression.
162 | 
163 | -- | Return the number of capturing groups in this regular
164 | -- expression or match's pattern.
165 | groupCount :: Regular r => r -> Int
166 | groupCount = unsafePerformIO . IO.groupCount . regRe
167 | {-# INLINE groupCount #-}
168 | 
169 | -- | A combinator for returning a list of all capturing groups on a
170 | -- 'Match'.
171 | unfold :: (Int -> Match -> Maybe Text) -> Match -> [Text]
172 | unfold f m = go 0
173 |   where go !n = case f n m of
174 |                   Nothing -> []
175 |                   Just z  -> z : go (n+1)
176 | 
177 | -- | Return the /n/th capturing group in a match, or 'Nothing' if /n/
178 | -- is out of bounds.
179 | group :: Int -> Match -> Maybe Text
180 | group n m = grouping n m $ \re -> do
181 |   let n' = fromIntegral n
182 |   start <- fromIntegral `fmap` IO.start_ re n'
183 |   end <- fromIntegral `fmap` IO.end_ re n'
184 |   ut <- IO.getUTextPtr re
185 |   withUTextPtrText ut $ \ptr ->
186 |     T.fromPtr (ptr `advancePtr` fromIntegral start) (end - start)
187 | 
188 | -- | Return the prefix of the /n/th capturing group in a match (the
189 | -- text from the start of the string to the start of the match), or
190 | -- 'Nothing' if /n/ is out of bounds.
191 | prefix :: Int -> Match -> Maybe Text
192 | prefix n m = grouping n m $ \re -> do
193 |   start <- fromIntegral `fmap` IO.start_ re n
194 |   ut <- IO.getUTextPtr re
195 |   withUTextPtrText ut (`T.fromPtr` start)
196 | 
197 | -- | Return the span of text between the end of the previous match and
198 | -- the beginning of the current match.
199 | span :: Match -> Text
200 | span (Match re p) = unsafePerformIO $ do
201 |   start <- IO.start_ re 0
202 |   ut <- IO.getUTextPtr re
203 |   withUTextPtrText ut $ \ptr ->
204 |     T.fromPtr (ptr `advancePtr` fromIntegral p) (start - p)
205 | 
206 | -- | Return the suffix of the /n/th capturing group in a match (the
207 | -- text from the end of the match to the end of the string), or
208 | -- 'Nothing' if /n/ is out of bounds.
209 | suffix :: Int -> Match -> Maybe Text
210 | suffix n m = grouping n m $ \re -> do
211 |   end <- fromIntegral `fmap` IO.end_ re n
212 |   ut <- IO.getUTextPtr re
213 |   withUTextPtrText ut $ \ptr -> do
214 |     T.fromPtr (ptr `advancePtr` fromIntegral end) (utextPtrLength ut - end)
215 | 
216 | grouping :: Int -> Match -> (Internal.Regex -> IO a) -> Maybe a
217 | grouping n (Match m _) act = unsafePerformIO $ do
218 |   count <- IO.groupCount m
219 |   let n' = fromIntegral n
220 |   if n' == 0 || (n' >= 0 && n' <= count)
221 |     then Just `fmap` act m
222 |     else return Nothing
223 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Shape.hsc:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE CPP, ForeignFunctionInterface #-}
  2 | 
  3 | -- |
  4 | -- Module      : Data.Text.ICU.Shape
  5 | -- Copyright   : (c) 2018 Ondrej Palkovsky
  6 | --
  7 | -- License     : BSD-style
  8 | -- Maintainer  : bos@serpentine.com
  9 | -- Stability   : experimental
 10 | -- Portability : GHC
 11 | --
 12 | -- Bindings for arabic shaping, implemented as bindings to
 13 | -- the International Components for Unicode (ICU) libraries.
 14 | --
 15 | 
 16 | module Data.Text.ICU.Shape
 17 |   (
 18 |       shapeArabic
 19 |     , ShapeOption(..)
 20 |   ) where
 21 | 
 22 | #include <unicode/ushape.h>
 23 | 
 24 | import Data.List (foldl')
 25 | import Data.Text.ICU.Error.Internal (UErrorCode, handleOverflowError)
 26 | import Data.Bits ((.|.))
 27 | import Data.Int (Int32)
 28 | import Foreign.Ptr (Ptr)
 29 | import Data.Text.ICU.Internal (UChar, useAsUCharPtr, fromUCharPtr)
 30 | import Data.Text (Text)
 31 | import System.IO.Unsafe (unsafePerformIO)
 32 | 
 33 | -- | Options for the 'shapeArabic' function.
 34 | data ShapeOption =
 35 |     AggregateTaskheel
 36 |   -- ^ Tashkeel aggregation option: Replaces any combination of U+0651 with one of U+064C, U+064D, U+064E, U+064F, U+0650 with U+FC5E, U+FC5F, U+FC60, U+FC61, U+FC62 consecutively.
 37 |   | DigitTypeAnExtended
 38 |   -- ^ Digit type option: Use Eastern (Extended) Arabic-Indic digits (U+06f0...U+06f9).
 39 |   | DigitsAlen2AnInitAl
 40 |   -- ^ Replace European digits (U+0030...) by Arabic-Indic digits if the most recent strongly
 41 |   -- directional character is an Arabic letter (u_charDirection() result U_RIGHT_TO_LEFT_ARABIC [AL]).
 42 |   | DigitsAlen2AnInitLr
 43 |   -- ^ Digit shaping option: Replace European digits (U+0030...) by Arabic-Indic digits if the most recent strongly directional character is an Arabic letter (u_charDirection() result U_RIGHT_TO_LEFT_ARABIC [AL]).
 44 |   | DigitsAn2En
 45 |   -- ^ Digit shaping option: Replace Arabic-Indic digits by European digits (U+0030...).
 46 |   | DigitsEn2An
 47 |   -- ^ Digit shaping option: Replace European digits (U+0030...) by Arabic-Indic digits.
 48 |   | LengthFixedSpacesAtBeginning
 49 |   -- ^ If more room is necessary, then try to consume spaces at the beginning of the text.
 50 |   | LengthFixedSpacesAtEnd
 51 |   -- ^ If more room is necessary, then try to consume spaces at the end of the text.
 52 |   | LengthFixedSpacesNear
 53 |   -- ^ If more room is necessary, then try to consume spaces next to modified characters.
 54 |   | LettersShape
 55 |   -- ^ Letter shaping option: replace abstract letter characters by "shaped" ones.
 56 |   | LettersUnshape
 57 |   -- ^ Letter shaping option: replace "shaped" letter characters by abstract ones.
 58 |   | LettersShapeTashkeelIsolated
 59 |   -- ^ The only difference with LettersShape is that Tashkeel letters are always "shaped" into the isolated form instead of the medial form (selecting codepoints from the Arabic Presentation Forms-B block).
 60 |   | PreservePresentation
 61 |   -- ^ Presentation form option: Don't replace Arabic Presentation Forms-A and Arabic Presentation Forms-B characters with 0+06xx characters, before shaping.
 62 |   | TextDirectionVisualLTR
 63 |   -- ^ Direction indicator: the source is in visual LTR order, the leftmost displayed character stored first.
 64 |   deriving (Show)
 65 | 
 66 | reduceShapeOpts :: [ShapeOption] -> Int32
 67 | reduceShapeOpts = foldl' orO 0
 68 |     where a `orO` b = a .|. fromShapeOption b
 69 | 
 70 | fromShapeOption :: ShapeOption -> Int32
 71 | fromShapeOption AggregateTaskheel   = #const U_SHAPE_AGGREGATE_TASHKEEL
 72 | fromShapeOption DigitTypeAnExtended   = #const U_SHAPE_DIGIT_TYPE_AN_EXTENDED
 73 | fromShapeOption DigitsAlen2AnInitAl = #const U_SHAPE_DIGITS_ALEN2AN_INIT_AL
 74 | fromShapeOption DigitsAlen2AnInitLr = #const U_SHAPE_DIGITS_ALEN2AN_INIT_LR
 75 | fromShapeOption DigitsAn2En = #const U_SHAPE_DIGITS_AN2EN
 76 | fromShapeOption DigitsEn2An = #const U_SHAPE_DIGITS_EN2AN
 77 | fromShapeOption LengthFixedSpacesAtBeginning = #const U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING
 78 | fromShapeOption LengthFixedSpacesAtEnd = #const U_SHAPE_LENGTH_FIXED_SPACES_AT_END
 79 | fromShapeOption LengthFixedSpacesNear = #const U_SHAPE_LENGTH_FIXED_SPACES_NEAR
 80 | fromShapeOption LettersShape = #const U_SHAPE_LETTERS_SHAPE
 81 | fromShapeOption LettersUnshape = #const U_SHAPE_LETTERS_UNSHAPE
 82 | fromShapeOption LettersShapeTashkeelIsolated = #const U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED
 83 | fromShapeOption PreservePresentation = #const U_SHAPE_PRESERVE_PRESENTATION
 84 | fromShapeOption TextDirectionVisualLTR = #const U_SHAPE_TEXT_DIRECTION_VISUAL_LTR
 85 | 
 86 | -- | Shape Arabic text on a character basis.
 87 | --
 88 | -- Text-based shaping means that some character codepoints in the text are replaced by
 89 | -- others depending on the context. It transforms one kind of text into another.
 90 | -- In comparison, modern displays for Arabic text select appropriate, context-dependent font
 91 | -- glyphs for each text element, which means that they transform text into a glyph vector.
 92 | --
 93 | -- You probably want to call this with the LettersShape option in the default case.
 94 | shapeArabic :: [ShapeOption] -> Text -> Text
 95 | shapeArabic options t = unsafePerformIO . useAsUCharPtr t $ \sptr slen ->
 96 |   let slen' = fromIntegral slen
 97 |       options' = reduceShapeOpts options
 98 |   in handleOverflowError (fromIntegral slen)
 99 |       (\dptr dlen -> u_shapeArabic sptr slen' dptr (fromIntegral dlen) options')
100 |       (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen))
101 | 
102 | foreign import ccall unsafe "hs_text_icu.h __hs_u_shapeArabic" u_shapeArabic
103 |   :: Ptr UChar -> Int32
104 |   -> Ptr UChar -> Int32
105 |   -> Int32 -> Ptr UErrorCode -> IO Int32
106 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Spoof/Internal.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE DeriveDataTypeable, EmptyDataDecls, ForeignFunctionInterface #-}
 2 | -- |
 3 | -- Module      : Data.Text.ICU.Spoof.Internal
 4 | -- Copyright   : (c) 2015 Ben Hamilton
 5 | --
 6 | -- License     : BSD-style
 7 | -- Maintainer  : bgertzfield@gmail.com
 8 | -- Stability   : experimental
 9 | -- Portability : GHC
10 | --
11 | -- Internals of the spoofability check infrastructure.
12 | 
13 | module Data.Text.ICU.Spoof.Internal
14 |     (
15 |     -- * Unicode spoof checking API
16 |     -- $api
17 |     -- * Types
18 |       MSpoof(..)
19 |     , Spoof(..)
20 |     , USpoof
21 |     -- * Functions
22 |     , withSpoof
23 |     , wrap
24 |     , wrapWithSerialized
25 |     ) where
26 | 
27 | import Data.Typeable (Typeable)
28 | import Data.Word (Word8)
29 | import Foreign.ForeignPtr (ForeignPtr, withForeignPtr)
30 | import Foreign.Ptr (FunPtr, Ptr)
31 | import Data.Text.ICU.Internal (newICUPtr)
32 | 
33 | -- $api
34 | -- Low-level operations on spoof checkers.
35 | 
36 | -- | Opaque handle to a configurable spoof checker.
37 | data USpoof
38 | 
39 | -- | Configurable spoof checker wrapping an opaque handle
40 | -- and optionally wrapping a previously serialized instance.
41 | data MSpoof = MSpoof {
42 |     serializedBuf :: Maybe (ForeignPtr Word8)
43 |   , spoofPtr :: {-# UNPACK #-} !(ForeignPtr USpoof)
44 | } deriving (Typeable)
45 | 
46 | -- | Spoof checker type.
47 | newtype Spoof = S MSpoof
48 |     deriving (Typeable)
49 | 
50 | -- | Temporarily unwraps an 'MSpoof' to perform operations on its raw 'USpoof'
51 | -- handle.
52 | withSpoof :: MSpoof -> (Ptr USpoof -> IO a) -> IO a
53 | withSpoof (MSpoof _ spoof) = withForeignPtr spoof
54 | {-# INLINE withSpoof #-}
55 | 
56 | -- | Wraps a raw 'USpoof' handle in an 'MSpoof', closing the handle when
57 | -- the last reference to the object is dropped.
58 | wrap :: IO (Ptr USpoof) -> IO MSpoof
59 | wrap = newICUPtr (MSpoof Nothing) uspoof_close
60 | {-# INLINE wrap #-}
61 | 
62 | -- | Wraps a previously serialized spoof checker and raw 'USpoof' handle
63 | -- in an 'MSpoof', closing the handle and releasing the 'ForeignPtr' when
64 | -- the last reference to the object is dropped.
65 | wrapWithSerialized :: ForeignPtr Word8 -> IO (Ptr USpoof) -> IO MSpoof
66 | wrapWithSerialized s = newICUPtr (MSpoof $ Just s) uspoof_close
67 | {-# INLINE wrapWithSerialized #-}
68 | 
69 | foreign import ccall unsafe "hs_text_icu.h &__hs_uspoof_close" uspoof_close
70 |     :: FunPtr (Ptr USpoof -> IO ())
71 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Spoof/Pure.hs:
--------------------------------------------------------------------------------
  1 | -- |
  2 | -- Module      : Data.Text.ICU.Spoof.Pure
  3 | -- Copyright   : (c) 2015 Ben Hamilton
  4 | --
  5 | -- License     : BSD-style
  6 | -- Maintainer  : bgertzfield@gmail.com
  7 | -- Stability   : experimental
  8 | -- Portability : GHC
  9 | --
 10 | -- Pure string spoof checking functions for Unicode, implemented as
 11 | -- bindings to the International Components for Unicode (ICU)
 12 | -- libraries.
 13 | --
 14 | -- For the impure spoof checking API (which is richer, but less easy to
 15 | -- use), see the "Data.Text.ICU.Spoof" module.
 16 | 
 17 | module Data.Text.ICU.Spoof.Pure
 18 |     (
 19 |     -- * Types
 20 |       Spoof
 21 |     , SpoofParams(..)
 22 |     , spoof
 23 |     , spoofWithParams
 24 |     , spoofFromSource
 25 |     , spoofFromSerialized
 26 |     -- * String spoof checks
 27 |     , areConfusable
 28 |     , getSkeleton
 29 |     , spoofCheck
 30 |     -- * Configuration
 31 |     , getAllowedLocales
 32 |     , getChecks
 33 |     , getRestrictionLevel
 34 |     -- * Persistence
 35 |     , serialize
 36 |     ) where
 37 | 
 38 | import Data.ByteString (ByteString)
 39 | import Data.Foldable (forM_)
 40 | import Data.Text (Text)
 41 | import Data.Text.ICU.Spoof.Internal (Spoof(..))
 42 | import System.IO.Unsafe (unsafePerformIO)
 43 | import qualified Data.Text.ICU.Spoof as S
 44 | 
 45 | data SpoofParams
 46 |   -- | Used to configure a 'Spoof' checker via 'spoofWithParams'.
 47 |   = SpoofParams {
 48 |     -- | Optional 'S.SpoofCheck's to perform on a string. By default, performs
 49 |     -- all checks except 'CharLimit'.
 50 |     spoofChecks :: Maybe [S.SpoofCheck]
 51 |     -- | Optional 'S.RestrictionLevel' to which characters in the string will
 52 |     -- be limited. By default, uses 'HighlyRestrictive'.
 53 |   , level :: Maybe S.RestrictionLevel
 54 |     -- | Optional locale(s) whose scripts will be used to limit the
 55 |     -- set of allowed characters in a string. If set, automatically
 56 |     -- enables the 'CharLimit' spoof check.
 57 |   , locales :: Maybe [String]
 58 | } deriving (Show, Eq)
 59 | 
 60 | applySpoofParams :: SpoofParams -> S.MSpoof -> S.MSpoof
 61 | applySpoofParams (SpoofParams c lev loc) s = unsafePerformIO $ do
 62 |   forM_ c (S.setChecks s)
 63 |   forM_ lev (S.setRestrictionLevel s)
 64 |   forM_ loc (S.setAllowedLocales s)
 65 |   return s
 66 | 
 67 | -- | Open an immutable 'Spoof' checker with default options (all
 68 | -- 'S.SpoofCheck's except 'CharLimit').
 69 | spoof :: Spoof
 70 | spoof = unsafePerformIO $ S `fmap` S.open
 71 | {-# NOINLINE spoof #-}
 72 | 
 73 | -- | Open an immutable 'Spoof' checker with specific 'SpoofParams'
 74 | -- to control its behavior.
 75 | spoofWithParams :: SpoofParams -> Spoof
 76 | spoofWithParams p = unsafePerformIO $ do
 77 |   s <- S.open
 78 |   return (S $ applySpoofParams p s)
 79 | 
 80 | -- | Open a immutable 'Spoof' checker with specific 'SpoofParams'
 81 | -- to control its behavior and custom rules given the UTF-8 encoded
 82 | -- contents of the @confusables.txt@ and @confusablesWholeScript.txt@
 83 | -- files as described in <http://unicode.org/reports/tr39/ Unicode UAX #39>.
 84 | spoofFromSource :: (ByteString, ByteString) -> SpoofParams -> Spoof
 85 | spoofFromSource (confusables, confusablesWholeScript) p = unsafePerformIO $ do
 86 |   s <- S.openFromSource (confusables, confusablesWholeScript)
 87 |   return (S $ applySpoofParams p s)
 88 | 
 89 | -- | Create an immutable spoof checker with specific 'SpoofParams'
 90 | -- to control its behavior and custom rules previously returned
 91 | -- by 'serialize'.
 92 | spoofFromSerialized :: ByteString -> SpoofParams -> Spoof
 93 | spoofFromSerialized b p = unsafePerformIO $ do
 94 |   s <- S.openFromSerialized b
 95 |   return (S $ applySpoofParams p s)
 96 | 
 97 | -- | Check two strings for confusability.
 98 | areConfusable :: Spoof -> Text -> Text -> S.SpoofCheckResult
 99 | areConfusable (S s) t1 t2 = unsafePerformIO $ S.areConfusable s t1 t2
100 | 
101 | -- | Check a string for spoofing issues.
102 | spoofCheck :: Spoof -> Text -> S.SpoofCheckResult
103 | spoofCheck (S s) t = unsafePerformIO $ S.spoofCheck s t
104 | 
105 | -- | Generates re-usable \"skeleton\" strings which can be used (via
106 | -- Unicode equality) to check if an identifier is confusable
107 | -- with some large set of existing identifiers.
108 | --
109 | -- If you cache the returned strings in storage, you /must/ invalidate
110 | -- your cache any time the underlying confusables database changes
111 | -- (i.e., on ICU upgrade).
112 | --
113 | -- By default, assumes all input strings have been passed through
114 | -- 'toCaseFold' and are lower-case. To change this, pass
115 | -- 'SkeletonAnyCase'.
116 | --
117 | -- By default, builds skeletons which catch visually confusable
118 | -- characters across multiple scripts.  Pass 'SkeletonSingleScript' to
119 | -- override that behavior and build skeletons which catch visually
120 | -- confusable characters across single scripts.
121 | getSkeleton :: Spoof -> Maybe S.SkeletonTypeOverride -> Text -> Text
122 | getSkeleton (S s) o t = unsafePerformIO $ S.getSkeleton s o t
123 | 
124 | -- | Gets the restriction level currently configured in the spoof
125 | -- checker, if present.
126 | getRestrictionLevel :: Spoof -> Maybe S.RestrictionLevel
127 | getRestrictionLevel (S s) = unsafePerformIO $ S.getRestrictionLevel s
128 | 
129 | -- | Gets the checks currently configured in the spoof checker.
130 | getChecks :: Spoof -> [S.SpoofCheck]
131 | getChecks (S s) = unsafePerformIO $ S.getChecks s
132 | 
133 | -- | Gets the locales whose scripts are currently allowed by the spoof
134 | -- checker.  (We don't use 'LocaleName' since the root and default
135 | -- locales have no meaning here.)
136 | getAllowedLocales :: Spoof -> [String]
137 | getAllowedLocales (S s) = unsafePerformIO $ S.getAllowedLocales s
138 | 
139 | -- | Serializes the rules in this spoof checker to a byte array,
140 | -- suitable for re-use by 'spoofFromSerialized'.
141 | --
142 | -- Only includes any data provided to 'openFromSource'. Does not
143 | -- include any other state or configuration.
144 | serialize :: Spoof -> ByteString
145 | serialize (S s) = unsafePerformIO $ S.serialize s
146 | 
147 | {-# INLINE spoofCheck #-}
148 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Text.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE ForeignFunctionInterface #-}
 2 | -- |
 3 | -- Module      : Data.Text.ICU.Text
 4 | -- Copyright   : (c) 2010 Bryan O'Sullivan
 5 | --
 6 | -- License     : BSD-style
 7 | -- Maintainer  : bos@serpentine.com
 8 | -- Stability   : experimental
 9 | -- Portability : GHC
10 | --
11 | -- Functions for manipulating Unicode text, implemented as bindings to
12 | -- the International Components for Unicode (ICU) libraries.
13 | module Data.Text.ICU.Text
14 |     (
15 |     -- * Case conversion
16 |     -- $case
17 |       toCaseFold
18 |     , toLower
19 |     , toUpper
20 |     ) where
21 | 
22 | import Data.Int (Int32)
23 | import Data.Text (Text)
24 | import Data.Text.ICU.Error.Internal (UErrorCode, handleOverflowError)
25 | import Data.Text.ICU.Internal (LocaleName, UChar, withLocaleName, useAsUCharPtr, fromUCharPtr)
26 | import Data.Word (Word32)
27 | import Foreign.C.String (CString)
28 | import Foreign.Ptr (Ptr)
29 | import System.IO.Unsafe (unsafePerformIO)
30 | 
31 | -- $case
32 | --
33 | -- In some languages, case conversion is a locale- and
34 | -- context-dependent operation. The case conversion functions in this
35 | -- module are locale and context sensitive.
36 | 
37 | -- | Case-fold the characters in a string.
38 | --
39 | -- Case folding is locale independent and not context sensitive, but
40 | -- there is an option for treating the letter I specially for Turkic
41 | -- languages.  The result may be longer or shorter than the original.
42 | toCaseFold :: Bool -- ^ Whether to include or exclude mappings for
43 |                    -- dotted and dotless I and i that are marked with
44 |                    -- 'I' in @CaseFolding.txt@.
45 |            -> Text -> Text
46 | toCaseFold excludeI s = unsafePerformIO .
47 |   useAsUCharPtr s $ \sptr slen -> do
48 |     let opts = fromIntegral . fromEnum $ excludeI
49 |     handleOverflowError (fromIntegral slen)
50 |         (\dptr dlen -> u_strFoldCase dptr dlen sptr (fromIntegral slen) opts)
51 |         (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen))
52 | 
53 | type CaseMapper = Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> CString
54 |                 -> Ptr UErrorCode -> IO Int32
55 | 
56 | caseMap :: CaseMapper -> LocaleName -> Text -> Text
57 | caseMap mapFn loc s = unsafePerformIO .
58 |   withLocaleName loc $ \locale ->
59 |     useAsUCharPtr s $ \sptr slen ->
60 |       handleOverflowError (fromIntegral slen)
61 |       (\dptr dlen -> mapFn dptr dlen sptr (fromIntegral slen) locale)
62 |       (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen))
63 | 
64 | -- | Lowercase the characters in a string.
65 | --
66 | -- Casing is locale dependent and context sensitive.  The result may
67 | -- be longer or shorter than the original.
68 | toLower :: LocaleName -> Text -> Text
69 | toLower = caseMap u_strToLower
70 | 
71 | -- | Uppercase the characters in a string.
72 | --
73 | -- Casing is locale dependent and context sensitive.  The result may
74 | -- be longer or shorter than the original.
75 | toUpper :: LocaleName -> Text -> Text
76 | toUpper = caseMap u_strToUpper
77 | 
78 | foreign import ccall unsafe "hs_text_icu.h __hs_u_strFoldCase" u_strFoldCase
79 |     :: Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> Word32 -> Ptr UErrorCode
80 |     -> IO Int32
81 | 
82 | foreign import ccall unsafe "hs_text_icu.h __hs_u_strToLower" u_strToLower
83 |     :: CaseMapper
84 | 
85 | foreign import ccall unsafe "hs_text_icu.h __hs_u_strToUpper" u_strToUpper
86 |     :: CaseMapper
87 | 


--------------------------------------------------------------------------------
/Data/Text/ICU/Types.hs:
--------------------------------------------------------------------------------
 1 | -- |
 2 | -- Module      : Data.Text.ICU.Types
 3 | -- Copyright   : (c) 2010 Bryan O'Sullivan
 4 | --
 5 | -- License     : BSD-style
 6 | -- Maintainer  : bos@serpentine.com
 7 | -- Stability   : experimental
 8 | -- Portability : GHC
 9 | --
10 | -- Types for use when manipulating Unicode text, using the bindings to
11 | -- the International Components for Unicode (ICU) libraries.
12 | module Data.Text.ICU.Types
13 |     (
14 |     -- * Widely used types
15 |       LocaleName(..)
16 |     , ParseError(errError, errLine, errOffset)
17 |     ) where
18 | 
19 | import Data.Text.ICU.Error.Internal (ParseError(..))
20 | import Data.Text.ICU.Internal (LocaleName(..))
21 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2009, Bryan O'Sullivan
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions
 6 | are met:
 7 | 
 8 |     * Redistributions of source code must retain the above copyright
 9 |       notice, this list of conditions and the following disclaimer.
10 | 
11 |     * Redistributions in binary form must reproduce the above
12 |       copyright notice, this list of conditions and the following
13 |       disclaimer in the documentation and/or other materials provided
14 |       with the distribution.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | 


--------------------------------------------------------------------------------
/README.markdown:
--------------------------------------------------------------------------------
 1 | # Text-ICU: Comprehensive support for string manipulation
 2 | 
 3 | This package provides the Data.Text.ICU library, for performing
 4 | complex manipulation of Unicode text.  It provides features such as
 5 | the following:
 6 | 
 7 | * Unicode normalization
 8 | 
 9 | * Conversion to and from many common and obscure encodings
10 | 
11 | * Date and number formatting
12 | 
13 | * Comparison and collation
14 | 
15 | ## Prerequisites
16 | 
17 | This library is implemented as bindings to the well-respected [ICU
18 | library](https://icu.unicode.org/) (which is not bundled, and must
19 | be installed separately).
20 | 
21 | ### macOS
22 | 
23 |     brew install icu4c
24 |     brew link icu4c --force
25 | 
26 | You might need:
27 | 
28 |     export PKG_CONFIG_PATH="$(brew --prefix)/opt/icu4c/lib/pkgconfig"
29 | 
30 | ### Debian/Ubuntu
31 | 
32 |     sudo apt-get update
33 |     sudo apt-get install libicu-dev
34 | 
35 | ### Fedora/CentOS
36 | 
37 |     sudo dnf install unzip libicu-devel
38 | 
39 | ### Nix/NixOS
40 | 
41 |     nix-shell --packages icu
42 | 
43 | ### Windows/MSYS2
44 | 
45 | Under MSYS2, `ICU` can be installed via `pacman`.
46 | 
47 |     pacman --noconfirm -S mingw-w64-x86_64-icu
48 | 
49 | Depending on the age of the MSYS2 installation, the keyring might need
50 | to be updated to avoid certification issues, and `pkg-config` might
51 | need to be added.  In this case, do this first:
52 | 
53 |     pacman --noconfirm -Sy msys2-keyring
54 |     pacman --noconfirm -S mingw-w64-x86_64-pkgconf
55 | 
56 | ### Windows/stack
57 | 
58 | With `stack` on Windows, which comes with its _own_ bundled MSYS2, the
59 | following commands give up-to-date system dependencies for
60 | `text-icu-0.8.0` (tested 2023-09-30):
61 | 
62 |     stack exec -- pacman --noconfirm -Sy msys2-keyring
63 |     stack exec -- pacman --noconfirm -S mingw-w64-x86_64-pkgconf
64 |     stack exec -- pacman --noconfirm -S mingw-w64-x86_64-icu
65 | 
66 | 
67 | ## Compatibility
68 | 
69 | Upstream ICU occasionally introduces backwards-incompatible API
70 | breaks. This package tries to stay up to date with upstream, and is
71 | currently more or less in sync with ICU 72.
72 | 
73 | Minimum required version is ICU 62.
74 | 
75 | 
76 | ## Get involved!
77 | 
78 | Please report bugs via the
79 | [github issue tracker](https://github.com/haskell/text-icu/issues).
80 | 
81 | 
82 | ## Authors
83 | 
84 | This library was written by Bryan O'Sullivan.
85 | 


--------------------------------------------------------------------------------
/Setup.lhs:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env runhaskell
2 | > import Distribution.Simple
3 | > main = defaultMain
4 | 


--------------------------------------------------------------------------------
/benchmarks/Breaker.hs:
--------------------------------------------------------------------------------
 1 | -- Estimate the time difference between creating a breaker.
 2 | 
 3 | {-# LANGUAGE OverloadedStrings #-}
 4 | 
 5 | import Control.Monad
 6 | import qualified Data.Text as T
 7 | import Data.Text.IO as T
 8 | import Data.Text.ICU.Break as IO
 9 | import Data.Text.ICU as ICU
10 | import System.Environment
11 | 
12 | consume b = go
13 |   where
14 |     go = do
15 |       m <- next b
16 |       case m of
17 |         Nothing -> return ()
18 |         Just _ -> go
19 | 
20 | manyBreakers (t:ts) = do
21 |   b <- IO.breakWord "en" t
22 |   consume b
23 |   manyBreakers ts
24 | manyBreakers _ = return ()
25 | 
26 | oneBreaker ts = do
27 |   b <- IO.breakWord "en" ""
28 |   forM_ ts $ \t -> do
29 |     setText b t
30 |     consume b
31 | 
32 | cloneBreakers ts = do
33 |   b <- IO.breakWord "en" ""
34 |   forM_ ts $ \t -> do
35 |     b' <- clone b
36 |     setText b' t
37 |     consume b'
38 | 
39 | pureBreaker ts = do
40 |   let b = ICU.breakWord "en"
41 |   forM_ ts $ \t -> length (breaks b t) `seq` return ()
42 | 
43 | main = do
44 |   (kind:files) <- getArgs
45 |   let act = case kind of
46 |               "one" -> oneBreaker
47 |               "many" -> manyBreakers
48 |               "clone" -> cloneBreakers
49 |               "pure" -> pureBreaker
50 |   forM_ files $ \f -> T.readFile f >>= act . T.lines
51 | 


--------------------------------------------------------------------------------
/cabal.haskell-ci:
--------------------------------------------------------------------------------
 1 | -- Recreate .github/workflows/haskell-ci.yml with command `haskell-ci regenerate`.
 2 | 
 3 | branches: master
 4 | 
 5 | -- text-icu-0.8 requires a newer ICU lib (shipped not with Ubuntu 18.04, but with 20.04)
 6 | -- distribution: focal
 7 | 
 8 | -- installed: +all -binary -bytestring -containers -deepseq -directory -time -unix
 9 | 
10 | -- constraint-set text-2.1
11 | --   -- Somehow GHCs 8.4 - 9.2 pick up the wrong version of text
12 | --   -- See https://github.com/haskell/text-icu/actions/runs/6341389618/job/17224925091?pr=93
13 | --   ghc: >=8.2 && < 8.3 || >= 9.4
14 | --   constraints: text ^>= 2.1
15 | --   tests: True
16 | --   run-tests: True
17 | --
18 | -- raw-project
19 | --   allow-newer: text
20 | 


--------------------------------------------------------------------------------
/changelog.md:
--------------------------------------------------------------------------------
 1 | Unreleased
 2 | 
 3 | * Drop support for GHC 7.10
 4 | 
 5 | 0.8.0.5
 6 | 
 7 | *  Make homebrew optional #(99)
 8 | 
 9 | 0.8.0.4
10 | 
11 | * Fixed tests to work with ICU < 72 (#94)
12 | 
13 | 0.8.0.3
14 | 
15 | * Support for ICU 72 (#94)
16 | 
17 | 0.8.0.2
18 | 
19 | * Support for creating a collator from custom rules (#76)
20 | 
21 | 0.8.0.1
22 | 
23 | * Restore build with GHC 7.10 - 8.8 (#61)
24 | * New CI for Linux, macOS and Windows (#63, #64, #66, #69)
25 | 
26 | 0.8.0
27 | 
28 | * Support for text-2.0 (#57)
29 | * Support for ICU 69 and new features (#55)
30 | * Add lib/include dirs for newer homebrew (#54)
31 | * basic number formatting added (#46)
32 | * Declare pkg-config dependencies (#43)
33 | * Added support for arabic shaping and BiDi (#41)
34 | * Include icuio lib (#36)
35 | * Character Set Detection (#27)
36 | 
37 | 0.7.1.0
38 | 
39 | * Add fix for undefined TRUE value in cbits (#52)
40 | * Improve CI and documentation (#20)
41 | 
42 | Thanks to everyone who contributed!
43 | 
44 | 0.7.0.0
45 | 
46 | * Built and tested against ICU 53.
47 | 
48 | * The isoComment function has been deprecated, and will be removed in
49 |   the next major release.
50 | 
51 | * The Collator type is no longer an instance of Eq, as this
52 |   functionality has been removed from ICU 53.
53 | 
54 | * Many NFData instances have been added.
55 | 


--------------------------------------------------------------------------------
/hackage-docs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Run this script in the top-level of your package directory
 4 | # (where the .cabal file is) to compile documentation and
 5 | # upload it to hackage.
 6 | 
 7 | # Requirements:
 8 | # cabal-install-1.24 (for --for-hackage)
 9 | # haddock 2.17 (for the hyperlinked source)
10 | 
11 | set -e
12 | 
13 | dir=$(mktemp -d dist-docs.XXXXXX)
14 | trap 'rm -r "$dir"' EXIT
15 | 
16 | cabal haddock --builddir="$dir" --haddock-for-hackage --enable-doc
17 | cabal upload  --publish -d $dir/*-docs.tar.gz
18 | 


--------------------------------------------------------------------------------
/tests/Properties.hs:
--------------------------------------------------------------------------------
  1 | -- Tester beware!
  2 | --
  3 | -- Many of the tests below are "weak", i.e. they ensure that functions
  4 | -- return results, without checking whether the results are correct.
  5 | -- Weak tests are described as such.
  6 | 
  7 | {-# LANGUAGE CPP #-}
  8 | {-# LANGUAGE OverloadedStrings, LambdaCase #-}
  9 | {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
 10 | module Properties (propertyTests, testCases) where
 11 | 
 12 | import Control.Monad (unless)
 13 | import qualified Control.Exception as E
 14 | import Control.DeepSeq (NFData(..))
 15 | import Data.Function (on)
 16 | import Data.Maybe (fromMaybe)
 17 | import Data.Text (Text)
 18 | import Data.Text.ICU (LocaleName(..), ParseError(..))
 19 | import QuickCheckUtils (NonEmptyText(..), LatinSpoofableText(..),
 20 |                         NonSpoofableText(..), Utf8Text(..))
 21 | import Data.Text.ICU.Normalize2 (NormalizationMode(..))
 22 | import qualified Data.Text.ICU.Normalize2 as I
 23 | import Test.Framework (Test, testGroup)
 24 | import Test.Framework.Providers.QuickCheck2 (testProperty)
 25 | import Test.Framework.Providers.HUnit (hUnitTestToTests)
 26 | import Test.HUnit ((~?=), (@?=), (~:))
 27 | import qualified Test.HUnit (Test(..), assertFailure)
 28 | import Test.HUnit.Lang (HUnitFailure (..), FailureReason (..))
 29 | import Test.QuickCheck.Monadic (monadicIO, run, assert)
 30 | import qualified Data.Text as T
 31 | import qualified Data.Text.Encoding as T
 32 | import qualified Data.Text.ICU as I
 33 | import qualified Data.Text.ICU.BiDi as BiDi
 34 | import qualified Data.Text.ICU.Calendar as Cal
 35 | import qualified Data.Text.ICU.Convert as I
 36 | import qualified Data.Text.ICU.Char as I
 37 | import qualified Data.Text.ICU.CharsetDetection as CD
 38 | import qualified Data.Text.ICU.Error as Err
 39 | import qualified Data.Text.ICU.Number as N
 40 | import qualified Data.Text.ICU.Shape as S
 41 | import System.IO.Unsafe (unsafePerformIO)
 42 | 
 43 | #if !MIN_VERSION_base(4,11,0)
 44 | import Data.Semigroup ((<>))
 45 | #endif
 46 | 
 47 | {-# ANN module ("HLint: use camelCase"::String) #-}
 48 | 
 49 | t_rnf :: (NFData b) => (a -> b) -> a -> Bool
 50 | t_rnf f t = rnf (f t) == ()
 51 | 
 52 | t_nonEmpty :: (Text -> Text) -> Text -> Bool
 53 | t_nonEmpty f t
 54 |     | T.null t  = T.null ft
 55 |     | otherwise = T.length ft > 0
 56 |   where ft = f t
 57 | 
 58 | -- Case mapping
 59 | 
 60 | -- These tests are all fairly weak.
 61 | 
 62 | t_toCaseFold bool = t_nonEmpty $ I.toCaseFold bool
 63 | t_toLower locale = t_nonEmpty $ I.toLower locale
 64 | t_toUpper locale = t_nonEmpty $ I.toUpper locale
 65 | 
 66 | -- Iteration
 67 | 
 68 | t_charIterator_String a b = (compare `on` I.fromString) a b == compare a b
 69 | t_charIterator_Text a b = (compare `on` I.fromText) a b == compare a b
 70 | t_charIterator_Utf8 a b = (compare `on` I.fromUtf8) ba bb == compare ba bb
 71 |   where ba = T.encodeUtf8 a; bb = T.encodeUtf8 b
 72 | 
 73 | -- Normalization
 74 | 
 75 | t_quickCheck_isNormalized mode normMode txt
 76 |   | mode `elem` [NFD, NFKD] =          quickCheck == Just isNormalized
 77 |   | otherwise = fromMaybe isNormalized quickCheck ==      isNormalized
 78 |   where quickCheck   = I.quickCheck mode normTxt
 79 |         isNormalized = I.isNormalized mode normTxt
 80 |         normTxt      = I.normalize normMode txt
 81 | 
 82 | -- Collation
 83 | 
 84 | t_collate a b = c a b == flipOrdering (c b a)
 85 |     where c = I.collate I.uca
 86 | 
 87 | t_collate_emptyRule a b = I.collate cUca a b == I.collate cEmpty a b
 88 |   where
 89 |     cUca = I.uca
 90 |     cEmpty = either (error "Can’t create empty collator") id
 91 |       $ I.collatorFromRules ""
 92 | 
 93 | flipOrdering :: Ordering -> Ordering
 94 | flipOrdering = \ case
 95 |     GT -> LT
 96 |     LT -> GT
 97 |     EQ -> EQ
 98 | 
 99 | -- Convert
100 | 
101 | converter e = unsafePerformIO $ I.open e Nothing
102 | 
103 | t_convert a = I.toUnicode c (I.fromUnicode c a) == a
104 |     where c = converter "UTF-32"
105 | 
106 | 
107 | -- Unicode character database
108 | 
109 | -- These tests are weak.
110 | 
111 | t_blockCode = t_rnf I.blockCode
112 | t_charFullName c = I.charFromFullName (I.charFullName c) == Just c
113 | t_charName c = maybe True (==c) $ I.charFromName (I.charName c)
114 | t_combiningClass = t_rnf I.combiningClass
115 | t_direction = t_rnf I.direction
116 | -- t_property p = t_rnf $ I.property p
117 | t_isMirrored = t_rnf $ I.isMirrored
118 | t_mirror = t_rnf $ I.mirror
119 | t_digitToInt = t_rnf $ I.digitToInt
120 | t_numericValue = t_rnf $ I.numericValue
121 | 
122 | -- Spoofing
123 | 
124 | t_nonspoofable (NonSpoofableText t) = I.spoofCheck I.spoof t == I.CheckOK
125 | t_spoofable (LatinSpoofableText t) = I.spoofCheck I.spoof t ==
126 |                                      I.CheckFailed [I.RestrictionLevel]
127 | t_confusable (NonEmptyText t) = I.areConfusable I.spoof t t `elem`
128 |   [I.CheckFailed [I.MixedScriptConfusable]
129 |   ,I.CheckFailed [I.SingleScriptConfusable]]
130 | 
131 | -- Encoding Guessing
132 | 
133 | t_Utf8IsUtf8 a = monadicIO $ do
134 |     val <- run $ CD.detect (utf8Text a) >>= CD.getName
135 |     assert $ T.isPrefixOf "UTF-8" val
136 | 
137 | propertyTests :: Test
138 | propertyTests =
139 |   testGroup "Properties" [
140 |     testProperty "t_toCaseFold" t_toCaseFold
141 |   , testProperty "t_toLower" t_toLower
142 |   , testProperty "t_toUpper" t_toUpper
143 |   , testProperty "t_charIterator_String" t_charIterator_String
144 |   , testProperty "t_charIterator_Text" t_charIterator_Text
145 |   , testProperty "t_charIterator_Utf8" t_charIterator_Utf8
146 |   , testProperty "t_quickCheck_isNormalized" t_quickCheck_isNormalized
147 |   , testProperty "t_collate" t_collate
148 |   , testProperty "t_collate_emptyRule" t_collate_emptyRule
149 |   , testProperty "t_convert" t_convert
150 |   , testProperty "t_blockCode" t_blockCode
151 |   , testProperty "t_charFullName" t_charFullName
152 |   , testProperty "t_charName" t_charName
153 |   , testProperty "t_combiningClass" t_combiningClass
154 |   , testProperty "t_direction" $ t_direction
155 | --, testProperty "t_property" t_property
156 |   , testProperty "t_isMirrored" t_isMirrored
157 |   , testProperty "t_mirror" t_mirror
158 |   , testProperty "t_digitToInt" t_digitToInt
159 |   , testProperty "t_numericValue" t_numericValue
160 |   , testProperty "t_spoofable" t_spoofable
161 |   , testProperty "t_nonspoofable" t_nonspoofable
162 |   , testProperty "t_confusable" t_confusable
163 |   , testProperty "t_Utf8IsUtf8" t_Utf8IsUtf8
164 |   ]
165 | 
166 | testCases :: Test
167 | testCases =
168 |   testGroup "Test cases" $ hUnitTestToTests $ Test.HUnit.TestList $
169 |   [I.normalize NFC "Ame\x0301lie" ~?= "Amélie"
170 |   ,I.normalize NFC "(⊃｡•́︵•̀｡)⊃" ~?= "(⊃｡•́︵•̀｡)⊃"
171 |   ,map I.brkBreak (I.breaks (I.breakWord (Locale "en_US")) "Hi, Amélie!")
172 |      ~?= ["Hi",","," ","Amélie","!"]
173 |   ,map I.brkBreak (I.breaksRight (I.breakLine (Locale "ru")) "Привет, мир!")
174 |      ~?= ["мир!","Привет, "]
175 |   ,(I.unfold I.group <$> I.findAll "[abc]+" "xx b yy ac") ~?= [["b"],["ac"]]
176 |   ,I.toUpper (Locale "de-DE") "ß" ~?= "SS"
177 |   ,I.toCaseFold False "ﬂag" ~?= "flag"
178 |   ,map I.blockCode ['\x1FA50', '\203257', '\73494']
179 |      `oneOf`
180 |      [[I.ChessSymbols, I.CjkUnifiedIdeographsExtensionH, I.Kawi]
181 |      ,[I.ChessSymbols, I.NoBlock, I.NoBlock]
182 |       -- ICU < 72 does not have last two codes
183 |      ]
184 |   ,I.direction '\x2068' ~?= I.FirstStrongIsolate
185 |   ,I.getSkeleton I.spoof Nothing "\1089\1072t" ~?= "cat"
186 |   ,S.shapeArabic [S.LettersShape] (nosp "ا ب ت ث") ~?= (nosp "ﺍ ﺑ ﺘ ﺚ")
187 |   ,BiDi.reorderParagraphs [] (nosp "abc ا ب ت ث def\n123")
188 |      ~?= ["abc" <> T.reverse (nosp "ا ب ت ث") <> "def\n", "123"]
189 |   ,N.formatNumber (N.numberFormatter N.NUM_CURRENCY_PLURAL "en_US")
190 |      (12.5 :: Double) ~?= "12.50 US dollars"
191 | 
192 |   ,do
193 |      dfDe <- I.standardDateFormatter I.MediumFormatStyle I.LongFormatStyle
194 |        (Locale "de_DE") ""
195 |      c <- cal "CET" 2000 00 01 02 03 00
196 |      return $ I.formatCalendar dfDe (Cal.add c [(Cal.Hour, 25), (Cal.Second, 65)])
197 |    `ioEq`
198 |     "2. Januar 2000 um 03:04:05"
199 | 
200 |   ,do
201 |      dfAt <- I.standardDateFormatter I.LongFormatStyle I.LongFormatStyle
202 |        (Locale "de_AT") "CET"
203 |      return $ I.dateSymbols dfAt I.Months
204 |    `ioEq`
205 |    ["Jänner","Februar","März","April","Mai","Juni"
206 |    ,"Juli","August","September","Oktober","November","Dezember"]
207 | 
208 |   ,do
209 |      dfP <- I.patternDateFormatter
210 |        "MMMM dd, yyyy GGGG, hh 'o''clock' a, VVVV" (Locale "en_US") ""
211 |      c <- cal "America/Los_Angeles" 2000 00 02 03 04 05
212 |      return $ I.formatCalendar dfP c
213 |    `ioEq`
214 |     "January 02, 2000 Anno Domini, 03 o'clock AM, Los Angeles Time"
215 | 
216 |   ,(flip Cal.getField Cal.Year =<< cal "UTC" 1999 01 02 03 04 05) `ioEq` 1999
217 | 
218 |   ,(elem "en_US" <$> I.availableLocales) `ioEq` True
219 | 
220 |   ,(flip I.formatIntegral (12345 :: Int)
221 |     <$> I.numberFormatter "precision-integer" (Locale "fr"))
222 |    `ioEq` "12\8239\&345"
223 | 
224 |   ,(flip I.formatDouble 12345.6789
225 |     <$> I.numberFormatter "precision-currency-cash currency/EUR" (Locale "it"))
226 |    `ioEq` "12.345,68\160€"
227 | 
228 |   , Test.HUnit.TestLabel "collate" testCases_collate
229 | 
230 |   ]
231 |   <>
232 |   concat
233 |   [conv "ISO-2022-CN" "程序設計"  "\ESC$)A\SO3LPr\ESC$)G]CSS\SI"
234 |   ,conv "cp1251" "Привет, мир!"  "\207\240\232\226\229\242, \236\232\240!"
235 |   ]
236 |   where conv n f t = [I.fromUnicode c f ~?= t, I.toUnicode c t ~?= f]
237 |             where c = converter n
238 |         nosp = T.filter (/= ' ')
239 |         cal tz y m d h mn s = do
240 |             c <- Cal.calendar tz (Locale "en_US") Cal.TraditionalCalendarType
241 |             Cal.setDateTime c y m d h mn s
242 |             return c
243 |         ioEq io a = Test.HUnit.TestCase $ do
244 |             x <- io
245 |             x @?= a
246 |         oneOf actual expected = Test.HUnit.TestCase $
247 |           unless (actual `elem` expected) $
248 |             E.throwIO $ HUnitFailure Nothing $ ExpectedButGot Nothing
249 |               (unlines $ "one of:" : map show expected) (show actual)
250 | 
251 | testCases_collate :: Test.HUnit.Test
252 | testCases_collate = Test.HUnit.TestList $
253 |   [ Test.HUnit.TestLabel "invalid format" $
254 |     assertParseError (I.collatorFromRules "& a < <") Err.u_INVALID_FORMAT_ERROR (Just 0) (Just 4)
255 |   , Test.HUnit.TestLabel "custom collator" $ Test.HUnit.TestCase $ do
256 |       let c = either (error "Can’t create b<a collator") id
257 |               $ I.collatorFromRules "& b < a"
258 |       I.collate c "a" "b" @?= GT
259 |   ]
260 |   where
261 |     assertParseError (Left e) err line offset = Test.HUnit.TestList
262 |       [ "errError" ~: errError e ~?= err
263 |       , "errLine" ~: errLine e ~?= line
264 |       , "errOffset" ~: errOffset e ~?= offset
265 |       ]
266 |     assertParseError (Right _) _ _ _ = Test.HUnit.TestCase $ Test.HUnit.assertFailure "Expects a Left"
267 | 


--------------------------------------------------------------------------------
/tests/QuickCheckUtils.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE BangPatterns #-}
 2 | {-# LANGUAGE CPP #-}
 3 | {-# OPTIONS_GHC -fno-warn-orphans #-}
 4 | 
 5 | module QuickCheckUtils (NonEmptyText(..), LatinSpoofableText(..),
 6 |                         NonSpoofableText(..), Utf8Text(..)) where
 7 | 
 8 | import Data.Text.ICU (Collator, LocaleName(..))
 9 | import Data.Text.ICU.Normalize2
10 | import Data.Text.ICU.Break (available)
11 | import Test.QuickCheck (Arbitrary(..), Gen, elements, listOf1, suchThat, vectorOf)
12 | import Data.Word (Word8)
13 | import qualified Data.ByteString as BS
14 | import qualified Data.Text as T
15 | import qualified Data.Text.ICU as I
16 | 
17 | #if !MIN_VERSION_base(4,11,0)
18 | import Data.Semigroup ((<>))
19 | #endif
20 | 
21 | instance Arbitrary T.Text where
22 |     arbitrary = T.pack `fmap` arbitrary
23 |     shrink = map T.pack . shrink . T.unpack
24 | 
25 | instance Arbitrary BS.ByteString where
26 |     arbitrary = BS.pack <$> arbitrary
27 |     shrink xs = BS.pack <$> shrink (BS.unpack xs)
28 | 
29 | instance Arbitrary LocaleName where
30 |     arbitrary = elements (Root:available)
31 | 
32 | instance Arbitrary NormalizationMode where
33 |     arbitrary = elements [NFD .. NFKCCasefold]
34 | 
35 | instance Arbitrary Collator where
36 |     arbitrary = I.collator <$> arbitrary
37 | 
38 | newtype NonEmptyText = NonEmptyText { nonEmptyText :: T.Text } deriving Show
39 | 
40 | instance Arbitrary NonEmptyText where
41 |   arbitrary = NonEmptyText . T.pack <$> listOf1 arbitrary
42 | 
43 | newtype LatinSpoofableText = LatinSpoofableText { latinSpoofableText :: T.Text }
44 |                            deriving Show
45 | instance Arbitrary LatinSpoofableText where
46 |     arbitrary = LatinSpoofableText <$> T.pack . (<>) "latin" <$>
47 |                 listOf1 genCyrillicLatinSpoofableChar
48 | 
49 | genCyrillicLatinSpoofableChar :: Gen Char
50 | genCyrillicLatinSpoofableChar = elements (
51 |   "\x043A\x043E\x0433\x0435\x043A\x043C" ++
52 |   ['\x043E'..'\x0443'] ++
53 |   ['\x0445'..'\x0446'] ++
54 |   "\x044A" ++
55 |   ['\x0454'..'\x0456'] ++
56 |   "\x0458\x045B\x048D\x0491\x0493\x049B\x049F\x04AB\x04AD\x04AF\x04B1\x04BB" ++
57 |   "\x04BD\x04BF" ++
58 |   ['\x04CE'..'\x04CF'] ++
59 |   "\x04D5\x04D9\x04E9\x0501\x0511\x051B\x051D")
60 | 
61 | newtype NonSpoofableText = NonSpoofableText { nonSpoofableText :: T.Text }
62 |                          deriving Show
63 | 
64 | instance Arbitrary NonSpoofableText where
65 |     arbitrary = NonSpoofableText <$> T.pack <$> listOf1 genNonSpoofableChar
66 | 
67 | genNonSpoofableChar :: Gen Char
68 | genNonSpoofableChar = elements "QDFRz"
69 | 
70 | newtype Utf8Text = Utf8Text { utf8Text :: BS.ByteString }
71 |                  deriving Show
72 | 
73 | instance Arbitrary Utf8Text where
74 |     arbitrary = Utf8Text . BS.pack <$> vectorOf 300
75 |         (suchThat
76 |             (arbitrary :: Gen Word8)
77 |             (`elem` ([0x41..0x5A] ++ [0x61..0x7A]))
78 |         )
79 | 


--------------------------------------------------------------------------------
/tests/Tests.hs:
--------------------------------------------------------------------------------
1 | module Main (main) where
2 | 
3 | import Test.Framework (defaultMain)
4 | 
5 | import qualified Properties
6 | 
7 | main :: IO ()
8 | main = defaultMain [Properties.propertyTests, Properties.testCases]
9 | 


--------------------------------------------------------------------------------
/text-icu.cabal:
--------------------------------------------------------------------------------
  1 | cabal-version:  1.18
  2 |   -- 1.18 introduced extra-doc-files
  3 | name:           text-icu
  4 | version:        0.8.0.5
  5 | synopsis:       Bindings to the ICU library
  6 | homepage:       https://github.com/haskell/text-icu
  7 | bug-reports:    https://github.com/haskell/text-icu/issues
  8 | description:
  9 |   Haskell bindings to the International Components for Unicode (ICU)
 10 |   libraries.  These libraries provide robust and full-featured Unicode
 11 |   services on a wide variety of platforms.
 12 |   .
 13 |   Features include:
 14 |   .
 15 |   * Both pure and impure bindings, to allow for fine control over efficiency
 16 |     and ease of use.
 17 |   .
 18 |   * Breaking of strings on character, word, sentence, and line boundaries.
 19 |   .
 20 |   * Access to the Unicode Character Database (UCD) of character metadata.
 21 |   .
 22 |   * String collation functions, for locales where the conventions for
 23 |     lexicographic ordering differ from the simple numeric ordering of
 24 |     character codes.
 25 |   .
 26 |   * Character set conversion functions, allowing conversion between
 27 |     Unicode and over 220 character encodings.
 28 |   .
 29 |   * Unicode normalization.  (When implementations keep strings in a
 30 |     normalized form, they can be assured that equivalent strings have a
 31 |     unique binary representation.)
 32 |   .
 33 |   * Regular expression search and replace.
 34 |   .
 35 |   * Security checks for visually confusable (spoofable) strings.
 36 |   .
 37 |   * Bidirectional Unicode algorithm
 38 |   .
 39 |   * Calendar objects holding dates and times.
 40 |   .
 41 |   * Number and calendar formatting.
 42 | maintainer:     Vladimir Shabanov <dev@vshabanov.com>
 43 | copyright:      2009-2015 Bryan O'Sullivan
 44 | category:       Data, Text
 45 | license:        BSD3
 46 | license-file:   LICENSE
 47 | build-type:     Simple
 48 | 
 49 | extra-doc-files:
 50 |   README.markdown
 51 |   changelog.md
 52 | 
 53 | extra-source-files:
 54 |   benchmarks/Breaker.hs
 55 |   include/hs_text_icu.h
 56 | 
 57 | tested-with:
 58 |   GHC == 9.12.1
 59 |   GHC == 9.10.1
 60 |   GHC == 9.8.4
 61 |   GHC == 9.6.6
 62 |   GHC == 9.4.8
 63 |   GHC == 9.2.8
 64 |   GHC == 9.0.2
 65 |   GHC == 8.10.7
 66 |   GHC == 8.8.4
 67 |   GHC == 8.6.5
 68 |   GHC == 8.4.4
 69 |   GHC == 8.2.2
 70 |   GHC == 8.0.2
 71 | 
 72 | flag homebrew
 73 |   Description: Assume homebrew on macOS. Automatically add /usr/local/opt/ and /opt/homebrew/opt/ paths to extra-lib-dirs and include-dirs.
 74 |   Default:     True
 75 |   Manual:      True
 76 | 
 77 | library
 78 |   default-language:  Haskell98
 79 |   build-depends:
 80 |     base        >= 4.9      && < 5,
 81 |     bytestring  >= 0.9      && < 0.13,
 82 |     deepseq     >= 1.4.2.0  && < 1.6,
 83 |     text        >= 0.9.1.0  && < 1.3  || >= 2.0 && < 2.2,
 84 |     time        >= 1.5      && < 1.15
 85 |   pkgconfig-depends:
 86 |     icu-i18n    >= 62.1
 87 | 
 88 |   exposed-modules:
 89 |       Data.Text.ICU
 90 |       Data.Text.ICU.BiDi
 91 |       Data.Text.ICU.Calendar
 92 |       Data.Text.ICU.Break
 93 |       Data.Text.ICU.Char
 94 |       Data.Text.ICU.CharsetDetection
 95 |       Data.Text.ICU.Collate
 96 |       Data.Text.ICU.Convert
 97 |       Data.Text.ICU.DateFormatter
 98 |       Data.Text.ICU.Error
 99 |       Data.Text.ICU.Locale
100 |       Data.Text.ICU.Normalize
101 |       Data.Text.ICU.Number
102 |       Data.Text.ICU.Normalize2
103 |       Data.Text.ICU.NumberFormatter
104 |       Data.Text.ICU.Regex
105 |       Data.Text.ICU.Shape
106 |       Data.Text.ICU.Spoof
107 |       Data.Text.ICU.Types
108 |   other-modules:
109 |       Data.Text.ICU.BiDi.Internal
110 |       Data.Text.ICU.BitMask
111 |       Data.Text.ICU.Break.Pure
112 |       Data.Text.ICU.Break.Types
113 |       Data.Text.ICU.CharsetDetection.Internal
114 |       Data.Text.ICU.Collate.Internal
115 |       Data.Text.ICU.Collate.Pure
116 |       Data.Text.ICU.Number.Internal
117 |       Data.Text.ICU.Convert.Internal
118 |       Data.Text.ICU.Enumerator
119 |       Data.Text.ICU.Error.Internal
120 |       Data.Text.ICU.Internal
121 |       Data.Text.ICU.Iterator
122 |       Data.Text.ICU.Normalize.Internal
123 |       Data.Text.ICU.Regex.Internal
124 |       Data.Text.ICU.Regex.Pure
125 |       Data.Text.ICU.Spoof.Internal
126 |       Data.Text.ICU.Spoof.Pure
127 |       Data.Text.ICU.Text
128 |   c-sources: cbits/text_icu.c
129 |   cc-options: -Wall -Wextra -pedantic -Wno-deprecated
130 |   include-dirs: include
131 |   if os(darwin) && flag(homebrew)
132 |     extra-lib-dirs:
133 |       /usr/local/opt/icu4c/lib
134 |       /opt/homebrew/opt/icu4c/lib
135 |     include-dirs:
136 |       /usr/local/opt/icu4c/include
137 |       /opt/homebrew/opt/icu4c/include
138 |   extra-libraries: icuuc
139 |   if os(mingw32)
140 |     extra-libraries: icuin icudt icuio
141 |   else
142 |     extra-libraries: icui18n icudata
143 | 
144 |   ghc-options: -Wall
145 |   if impl(ghc >= 8.0)
146 |     ghc-options:  -Wcompat
147 | 
148 | test-suite tests
149 |   default-language: Haskell98
150 |   type:             exitcode-stdio-1.0
151 |   hs-source-dirs:   tests
152 |   main-is:          Tests.hs
153 |   other-modules:    Properties QuickCheckUtils
154 | 
155 |   ghc-options:
156 |     -Wall -threaded -rtsopts
157 | 
158 |   build-depends:
159 |     HUnit >= 1.2,
160 |     QuickCheck >= 2.4,
161 |     array,
162 |     base,
163 |     bytestring,
164 |     deepseq,
165 |     directory,
166 |     ghc-prim,
167 |     random,
168 |     test-framework >= 0.4,
169 |     test-framework-hunit >= 0.2,
170 |     test-framework-quickcheck2 >= 0.2,
171 |     text,
172 |     text-icu
173 |   if impl(ghc <= 8.2)
174 |     build-depends: semigroups
175 | 
176 | source-repository head
177 |   type:     git
178 |   location: https://github.com/haskell/text-icu
179 | 


--------------------------------------------------------------------------------