├── .github ├── dependabot.yml └── workflows │ ├── cabal-mac-win.yml │ ├── haskell-ci.yml │ └── stack.yml ├── .gitignore ├── Data └── Text │ ├── ICU.hs │ └── ICU │ ├── BiDi.hsc │ ├── BiDi │ └── Internal.hs │ ├── BitMask.hs │ ├── Break.hsc │ ├── Break │ ├── Pure.hs │ └── Types.hs │ ├── Calendar.hsc │ ├── CaseMap.hsc │ ├── Char.hsc │ ├── CharsetDetection.hsc │ ├── CharsetDetection │ └── Internal.hsc │ ├── Collate.hsc │ ├── Collate │ ├── Internal.hs │ └── Pure.hs │ ├── Convert.hs │ ├── Convert │ └── Internal.hs │ ├── DateFormatter.hsc │ ├── Enumerator.hsc │ ├── Error.hsc │ ├── Error │ └── Internal.hsc │ ├── Internal.hsc │ ├── Iterator.hs │ ├── Locale.hsc │ ├── Normalize.hsc │ ├── Normalize │ └── Internal.hsc │ ├── Normalize2.hsc │ ├── Number.hsc │ ├── Number │ └── Internal.hs │ ├── NumberFormatter.hsc │ ├── Regex.hs │ ├── Regex │ ├── Internal.hsc │ └── Pure.hs │ ├── Shape.hsc │ ├── Spoof.hsc │ ├── Spoof │ ├── Internal.hs │ └── Pure.hs │ ├── Text.hs │ └── Types.hs ├── LICENSE ├── README.markdown ├── Setup.lhs ├── benchmarks └── Breaker.hs ├── cabal.haskell-ci ├── cbits └── text_icu.c ├── changelog.md ├── hackage-docs.sh ├── include └── hs_text_icu.h ├── tests ├── Properties.hs ├── QuickCheckUtils.hs └── Tests.hs └── text-icu.cabal /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # From: 2 | # - https://github.com/rhysd/actionlint/issues/228#issuecomment-1272493095 3 | # - https://docs.github.com/en/code-security/dependabot/working-with-dependabot/keeping-your-actions-up-to-date-with-dependabot 4 | 5 | # Set update schedule for GitHub Actions 6 | 7 | version: 2 8 | updates: 9 | 10 | - package-ecosystem: "github-actions" 11 | directory: "/" 12 | schedule: 13 | # Check for updates to GitHub Actions every week 14 | interval: "weekly" 15 | -------------------------------------------------------------------------------- /.github/workflows/cabal-mac-win.yml: -------------------------------------------------------------------------------- 1 | name: CI for macOS and Windows building with cabal 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | branches: 8 | - master 9 | 10 | jobs: 11 | 12 | cabal: 13 | name: Cabal 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | os: [macOS-latest, windows-latest] 18 | ghc: 19 | # - 9.2.8 20 | - 9.4.8 21 | - 9.6.6 22 | - 9.8.4 23 | - 9.10.1 24 | - 9.12.1 25 | fail-fast: false 26 | 27 | steps: 28 | 29 | # Andreas, 2023-02-12: 30 | # Putting the MSYS2 /usr/bin (and thus 'tar') into the PATH destroys the Cache action v3. 31 | # See: https://github.com/actions/cache/issues/1073 32 | # However the /mingw64/bin PATH for the MSYS2-installed 'pkg-config' is fine. 33 | # 34 | - name: Setup MSYS path for pkg-config 35 | if: ${{ runner.os == 'Windows' }} 36 | run: | 37 | echo "C:\msys64\mingw64\bin" | Out-File -FilePath "$env:GITHUB_PATH" -Append 38 | 39 | - name: Install the ICU library (Windows) 40 | if: ${{ runner.os == 'Windows' }} 41 | shell: pwsh 42 | ## Even though pwsh is default for windows, keep this for sake of actionlint/shellcheck! 43 | run: | 44 | $env:PATH = "C:\msys64\usr\bin;$env:PATH" 45 | pacman --noconfirm -S mingw-w64-x86_64-pkg-config mingw-w64-x86_64-icu 46 | 47 | # Installing ICU is not necessary, macOS-latest already has v69.1 48 | # - name: Install the ICU library (macOS) 49 | # shell: bash 50 | # run: | 51 | # brew install icu4c 52 | 53 | # Let `pkg-config` find the ICU libs. 54 | # Also test whether it actually works, and print some debug information. 55 | - name: Set up pkg-config for the ICU library (macOS) 56 | if: ${{ runner.os == 'macOS' }} 57 | run: | 58 | PKG_CONFIG_PATH=$(brew --prefix)/opt/icu4c/lib/pkgconfig 59 | echo "PKG_CONFIG_PATH=${PKG_CONFIG_PATH}" >> "${GITHUB_ENV}" 60 | ## The rest is debug info: 61 | echo "$ ls -l ${PKG_CONFIG_PATH}/" 62 | ls -l "${PKG_CONFIG_PATH}/" 63 | export PKG_CONFIG_PATH 64 | echo "$ pkg-config --modversion icu-i18n" 65 | pkg-config --modversion icu-i18n 66 | echo "$ pkg-config --libs --static icu-i18n" 67 | pkg-config --libs --static icu-i18n 68 | 69 | # # Test of `pkg-config --list-all` in connection with macOS-11/12 env bug: 70 | # # https://github.com/actions/runner-images/issues/6364 71 | # # This was fixed upstream 2022-10-10. 72 | # - name: Check integrity of pkg-config database 73 | # if: ${{ runner.os == 'macOS' }} 74 | # run: | 75 | # echo "$ pkg-config --list-all" 76 | # pkg-config --list-all 77 | # echo "========================================================================" 78 | # echo "$ pkg-config --list-all | cut -f 1 -d ' '" 79 | # pkg-config --list-all | cut -f 1 -d ' ' 80 | # echo "========================================================================" 81 | # echo "$ pkg-config --list-all | cut -f 1 -d ' ' | xargs pkg-config --modversion" 82 | # pkg-config --list-all | cut -f 1 -d ' ' | xargs pkg-config --modversion 83 | 84 | - name: Determine the ICU version 85 | shell: bash 86 | run: | 87 | ICU_VER=$(pkg-config --modversion icu-i18n) 88 | echo "ICU_VER=${ICU_VER}" 89 | echo "ICU_VER=${ICU_VER}" >> "${GITHUB_ENV}" 90 | 91 | - name: Checkout 92 | uses: actions/checkout@v4 93 | 94 | - name: Setup Haskell 95 | uses: haskell-actions/setup@v2 96 | id: setup 97 | with: 98 | ghc-version: ${{ matrix.ghc }} 99 | cabal-update: true 100 | 101 | - name: Configure 102 | run: | 103 | cabal configure --enable-tests 104 | cabal build --dry-run 105 | # cabal build --dry-run creates dist-newstyle/cache/plan.json 106 | 107 | - name: Restore cached build products 108 | uses: actions/cache/restore@v4 109 | id: cache 110 | with: 111 | path: | 112 | ${{ steps.setup.outputs.cabal-store }} 113 | dist-newstyle 114 | key: ${{ env.key }}-cabal-${{ steps.setup.outputs.cabal-version }}-plan-${{ hashFiles('dist-newstyle/cache/plan.json') }} 115 | restore-keys: | 116 | ${{ env.key }}-cabal-${{ steps.setup.outputs.cabal-version }}- 117 | ${{ env.key }}- 118 | env: 119 | key: ${{ runner.os }}-ghc-${{ steps.setup.outputs.ghc-version }}-icu-${{ env.ICU_VER }}- 120 | 121 | - name: Build 122 | run: | 123 | cabal build all 124 | 125 | - name: Test 126 | run: | 127 | cabal test all --test-show-details=direct 128 | 129 | - name: Cache build products 130 | uses: actions/cache/save@v4 131 | if: always() && steps.cache.outputs.cache-hit != 'true' 132 | with: 133 | path: | 134 | ${{ steps.setup.outputs.cabal-store }} 135 | dist-newstyle 136 | key: ${{ steps.cache.outputs.cache-primary-key }} 137 | -------------------------------------------------------------------------------- /.github/workflows/haskell-ci.yml: -------------------------------------------------------------------------------- 1 | # This GitHub workflow config has been generated by a script via 2 | # 3 | # haskell-ci 'github' 'text-icu.cabal' 4 | # 5 | # To regenerate the script (for example after adjusting tested-with) run 6 | # 7 | # haskell-ci regenerate 8 | # 9 | # For more information, see https://github.com/haskell-CI/haskell-ci 10 | # 11 | # version: 0.19.20250216 12 | # 13 | # REGENDATA ("0.19.20250216",["github","text-icu.cabal"]) 14 | # 15 | name: Haskell-CI 16 | on: 17 | push: 18 | branches: 19 | - master 20 | pull_request: 21 | branches: 22 | - master 23 | jobs: 24 | linux: 25 | name: Haskell-CI - Linux - ${{ matrix.compiler }} 26 | runs-on: ubuntu-24.04 27 | timeout-minutes: 28 | 60 29 | container: 30 | image: buildpack-deps:jammy 31 | continue-on-error: ${{ matrix.allow-failure }} 32 | strategy: 33 | matrix: 34 | include: 35 | - compiler: ghc-9.12.1 36 | compilerKind: ghc 37 | compilerVersion: 9.12.1 38 | setup-method: ghcup 39 | allow-failure: false 40 | - compiler: ghc-9.10.1 41 | compilerKind: ghc 42 | compilerVersion: 9.10.1 43 | setup-method: ghcup 44 | allow-failure: false 45 | - compiler: ghc-9.8.4 46 | compilerKind: ghc 47 | compilerVersion: 9.8.4 48 | setup-method: ghcup 49 | allow-failure: false 50 | - compiler: ghc-9.6.6 51 | compilerKind: ghc 52 | compilerVersion: 9.6.6 53 | setup-method: ghcup 54 | allow-failure: false 55 | - compiler: ghc-9.4.8 56 | compilerKind: ghc 57 | compilerVersion: 9.4.8 58 | setup-method: ghcup 59 | allow-failure: false 60 | - compiler: ghc-9.2.8 61 | compilerKind: ghc 62 | compilerVersion: 9.2.8 63 | setup-method: ghcup 64 | allow-failure: false 65 | - compiler: ghc-9.0.2 66 | compilerKind: ghc 67 | compilerVersion: 9.0.2 68 | setup-method: ghcup 69 | allow-failure: false 70 | - compiler: ghc-8.10.7 71 | compilerKind: ghc 72 | compilerVersion: 8.10.7 73 | setup-method: ghcup 74 | allow-failure: false 75 | - compiler: ghc-8.8.4 76 | compilerKind: ghc 77 | compilerVersion: 8.8.4 78 | setup-method: ghcup 79 | allow-failure: false 80 | - compiler: ghc-8.6.5 81 | compilerKind: ghc 82 | compilerVersion: 8.6.5 83 | setup-method: ghcup 84 | allow-failure: false 85 | - compiler: ghc-8.4.4 86 | compilerKind: ghc 87 | compilerVersion: 8.4.4 88 | setup-method: ghcup 89 | allow-failure: false 90 | - compiler: ghc-8.2.2 91 | compilerKind: ghc 92 | compilerVersion: 8.2.2 93 | setup-method: ghcup 94 | allow-failure: false 95 | - compiler: ghc-8.0.2 96 | compilerKind: ghc 97 | compilerVersion: 8.0.2 98 | setup-method: ghcup 99 | allow-failure: false 100 | fail-fast: false 101 | steps: 102 | - name: apt-get install 103 | run: | 104 | apt-get update 105 | apt-get install -y --no-install-recommends gnupg ca-certificates dirmngr curl git software-properties-common libtinfo5 libnuma-dev 106 | - name: Install GHCup 107 | run: | 108 | mkdir -p "$HOME/.ghcup/bin" 109 | curl -sL https://downloads.haskell.org/ghcup/0.1.30.0/x86_64-linux-ghcup-0.1.30.0 > "$HOME/.ghcup/bin/ghcup" 110 | chmod a+x "$HOME/.ghcup/bin/ghcup" 111 | - name: Install cabal-install 112 | run: | 113 | "$HOME/.ghcup/bin/ghcup" install cabal 3.12.1.0 || (cat "$HOME"/.ghcup/logs/*.* && false) 114 | echo "CABAL=$HOME/.ghcup/bin/cabal-3.12.1.0 -vnormal+nowrap" >> "$GITHUB_ENV" 115 | - name: Install GHC (GHCup) 116 | if: matrix.setup-method == 'ghcup' 117 | run: | 118 | "$HOME/.ghcup/bin/ghcup" install ghc "$HCVER" || (cat "$HOME"/.ghcup/logs/*.* && false) 119 | HC=$("$HOME/.ghcup/bin/ghcup" whereis ghc "$HCVER") 120 | HCPKG=$(echo "$HC" | sed 's#ghc$#ghc-pkg#') 121 | HADDOCK=$(echo "$HC" | sed 's#ghc$#haddock#') 122 | echo "HC=$HC" >> "$GITHUB_ENV" 123 | echo "HCPKG=$HCPKG" >> "$GITHUB_ENV" 124 | echo "HADDOCK=$HADDOCK" >> "$GITHUB_ENV" 125 | env: 126 | HCKIND: ${{ matrix.compilerKind }} 127 | HCNAME: ${{ matrix.compiler }} 128 | HCVER: ${{ matrix.compilerVersion }} 129 | - name: Set PATH and environment variables 130 | run: | 131 | echo "$HOME/.cabal/bin" >> $GITHUB_PATH 132 | echo "LANG=C.UTF-8" >> "$GITHUB_ENV" 133 | echo "CABAL_DIR=$HOME/.cabal" >> "$GITHUB_ENV" 134 | echo "CABAL_CONFIG=$HOME/.cabal/config" >> "$GITHUB_ENV" 135 | HCNUMVER=$(${HC} --numeric-version|perl -ne '/^(\d+)\.(\d+)\.(\d+)(\.(\d+))?$/; print(10000 * $1 + 100 * $2 + ($3 == 0 ? $5 != 1 : $3))') 136 | echo "HCNUMVER=$HCNUMVER" >> "$GITHUB_ENV" 137 | echo "ARG_TESTS=--enable-tests" >> "$GITHUB_ENV" 138 | echo "ARG_BENCH=--enable-benchmarks" >> "$GITHUB_ENV" 139 | echo "HEADHACKAGE=false" >> "$GITHUB_ENV" 140 | echo "ARG_COMPILER=--$HCKIND --with-compiler=$HC" >> "$GITHUB_ENV" 141 | env: 142 | HCKIND: ${{ matrix.compilerKind }} 143 | HCNAME: ${{ matrix.compiler }} 144 | HCVER: ${{ matrix.compilerVersion }} 145 | - name: env 146 | run: | 147 | env 148 | - name: write cabal config 149 | run: | 150 | mkdir -p $CABAL_DIR 151 | cat >> $CABAL_CONFIG <> $CABAL_CONFIG < cabal-plan.xz 184 | echo 'f62ccb2971567a5f638f2005ad3173dba14693a45154c1508645c52289714cb2 cabal-plan.xz' | sha256sum -c - 185 | xz -d < cabal-plan.xz > $HOME/.cabal/bin/cabal-plan 186 | rm -f cabal-plan.xz 187 | chmod a+x $HOME/.cabal/bin/cabal-plan 188 | cabal-plan --version 189 | - name: checkout 190 | uses: actions/checkout@v4 191 | with: 192 | path: source 193 | - name: initial cabal.project for sdist 194 | run: | 195 | touch cabal.project 196 | echo "packages: $GITHUB_WORKSPACE/source/." >> cabal.project 197 | cat cabal.project 198 | - name: sdist 199 | run: | 200 | mkdir -p sdist 201 | $CABAL sdist all --output-dir $GITHUB_WORKSPACE/sdist 202 | - name: unpack 203 | run: | 204 | mkdir -p unpacked 205 | find sdist -maxdepth 1 -type f -name '*.tar.gz' -exec tar -C $GITHUB_WORKSPACE/unpacked -xzvf {} \; 206 | - name: generate cabal.project 207 | run: | 208 | PKGDIR_text_icu="$(find "$GITHUB_WORKSPACE/unpacked" -maxdepth 1 -type d -regex '.*/text-icu-[0-9.]*')" 209 | echo "PKGDIR_text_icu=${PKGDIR_text_icu}" >> "$GITHUB_ENV" 210 | rm -f cabal.project cabal.project.local 211 | touch cabal.project 212 | touch cabal.project.local 213 | echo "packages: ${PKGDIR_text_icu}" >> cabal.project 214 | if [ $((HCNUMVER >= 80200)) -ne 0 ] ; then echo "package text-icu" >> cabal.project ; fi 215 | if [ $((HCNUMVER >= 80200)) -ne 0 ] ; then echo " ghc-options: -Werror=missing-methods" >> cabal.project ; fi 216 | cat >> cabal.project <> cabal.project.local 219 | cat cabal.project 220 | cat cabal.project.local 221 | - name: dump install plan 222 | run: | 223 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dry-run all 224 | cabal-plan 225 | - name: restore cache 226 | uses: actions/cache/restore@v4 227 | with: 228 | key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }} 229 | path: ~/.cabal/store 230 | restore-keys: ${{ runner.os }}-${{ matrix.compiler }}- 231 | - name: install dependencies 232 | run: | 233 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks --dependencies-only -j2 all 234 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dependencies-only -j2 all 235 | - name: build w/o tests 236 | run: | 237 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all 238 | - name: build 239 | run: | 240 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --write-ghc-environment-files=always 241 | - name: tests 242 | run: | 243 | $CABAL v2-test $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --test-show-details=direct 244 | - name: cabal check 245 | run: | 246 | cd ${PKGDIR_text_icu} || false 247 | ${CABAL} -vnormal check 248 | - name: haddock 249 | run: | 250 | $CABAL v2-haddock --disable-documentation --haddock-all $ARG_COMPILER --with-haddock $HADDOCK $ARG_TESTS $ARG_BENCH all 251 | - name: unconstrained build 252 | run: | 253 | rm -f cabal.project.local 254 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all 255 | - name: save cache 256 | if: always() 257 | uses: actions/cache/save@v4 258 | with: 259 | key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }} 260 | path: ~/.cabal/store 261 | -------------------------------------------------------------------------------- /.github/workflows/stack.yml: -------------------------------------------------------------------------------- 1 | # This workflow uses 'stack' as installed on the github runner. 2 | # GHC is installed via stack. 3 | 4 | name: Stack build 5 | 6 | on: 7 | push: 8 | branches: 9 | - master 10 | pull_request: 11 | branches: 12 | - master 13 | 14 | defaults: 15 | run: 16 | shell: bash 17 | 18 | jobs: 19 | stack: 20 | name: ${{ matrix.os }} Stack ${{ matrix.plan.resolver }} 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | os: 25 | - macOS-latest 26 | - ubuntu-22.04 27 | - ubuntu-24.04 28 | - windows-latest 29 | plan: 30 | - resolver: 'nightly' 31 | - resolver: 'lts' 32 | 33 | runs-on: ${{ matrix.os }} 34 | env: 35 | STACK: stack --no-terminal --resolver ${{ matrix.plan.resolver }} 36 | 37 | steps: 38 | - uses: actions/checkout@v4 39 | 40 | - uses: haskell-actions/setup@v2 41 | with: 42 | enable-stack: true 43 | stack-no-global: true 44 | 45 | - name: Configure 46 | run: | 47 | $STACK init 48 | 49 | - name: Install GHC via stack 50 | run: | 51 | $STACK ghc -- --version 52 | 53 | - name: Haskell versions 54 | run: | 55 | STACK_VERSION=$(${STACK} --numeric-version) 56 | echo "STACK_VERSION=${STACK_VERSION}" >> "${GITHUB_ENV}" 57 | GHC_VERSION=$(${STACK} ghc -- --numeric-version) 58 | echo "GHC_VERSION=${GHC_VERSION}" >> "${GITHUB_ENV}" 59 | 60 | ## This causes troubles on Windows (spaces in env variable)? 61 | # STACK_ROOT=$(${STACK} path --stack-root) 62 | # echo "STACK_ROOT=${STACK_ROOT}" >> "${GITHUB_ENV}" 63 | 64 | - name: Set up for the ICU library (macOS) 65 | if: runner.os == 'macOS' 66 | run: | 67 | ICU4C=$(brew --prefix)/opt/icu4c 68 | echo "PKG_CONFIG_PATH=${ICU4C}/lib/pkgconfig" >> "${GITHUB_ENV}" 69 | 70 | - name: Install the ICU library (Windows) 71 | # We also install pkgconf, which superseds pkg-config. 72 | if: ${{ runner.os == 'Windows' }} 73 | run: | 74 | $STACK exec -- pacman --noconfirm -S msys2-keyring 75 | $STACK exec -- pacman --noconfirm -S mingw-w64-x86_64-pkgconf 76 | $STACK exec -- pacman --noconfirm -S mingw-w64-x86_64-icu 77 | ## Alternatively, in the last line, install a specific version of ICU, like 71: 78 | # $STACK exec -- bash -c "curl -LO ${ICU_URL} && pacman --noconfirm -U *.pkg.tar.zst" 79 | # env: 80 | # ICU_URL: "https://repo.msys2.org/mingw/mingw64/mingw-w64-x86_64-icu-71.1-1-any.pkg.tar.zst" 81 | 82 | - name: Determine the ICU version 83 | run: | 84 | ICU_VERSION=$($STACK exec -- pkg-config --modversion icu-i18n) 85 | echo "ICU version ${ICU_VERSION}" 86 | echo "ICU_VERSION=${ICU_VERSION}" >> "${GITHUB_ENV}" 87 | 88 | ## Caching ~/.stack without --system-ghc is probably not a good idea: 89 | ## - too fat 90 | ## - should be sensibly restored before installing GHC via stack, 91 | ## but then we don't know the GHC version; so at least 'lts' and 'nightly' would be brittle 92 | ## 93 | # - uses: actions/cache@v3 94 | # with: 95 | # path: ${{ env.STACK_ROOT }} 96 | # key: ${{ runner.os }}-stack-${{ env.STACK_VERSION }}-ghc-${{ env.GHC_VERSION }}-icu-${{ env.ICU_VER }}-resolver-${{ matrix.plan.resolver }}-sha-${{ github.sha }} 97 | # restore-keys: ${{ runner.os }}-stack-${{ env.STACK_VERSION }}-ghc-${{ env.GHC_VERSION }}-icu-${{ env.ICU_VER }}-resolver-${{ matrix.plan.resolver }}- 98 | 99 | - name: Install dependencies 100 | run: | 101 | $STACK test --only-dependencies 102 | 103 | - name: Build 104 | run: | 105 | $STACK build --haddock --no-haddock-deps 106 | 107 | - name: Test 108 | run: | 109 | $STACK -j 1 test --haddock --no-haddock-deps 110 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist-newstyle/ 2 | cabal.project.local 3 | cabal.project.local~* 4 | .stack-work/ 5 | stack*.yaml.lock -------------------------------------------------------------------------------- /Data/Text/ICU.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, NoImplicitPrelude #-} 2 | -- | 3 | -- Module : Data.Text.ICU 4 | -- Copyright : (c) 2010 Bryan O'Sullivan 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- Commonly used functions for Unicode, implemented as bindings to the 12 | -- International Components for Unicode (ICU) libraries. 13 | -- 14 | -- This module contains only the most commonly used types and 15 | -- functions. Other modules in this package expose richer interfaces. 16 | module Data.Text.ICU 17 | ( 18 | -- * Data representation 19 | -- $data 20 | 21 | -- * Types 22 | LocaleName(..) 23 | -- * Locales 24 | , availableLocales 25 | -- * Boundary analysis 26 | -- $break 27 | , Breaker 28 | , Break 29 | , brkPrefix 30 | , brkBreak 31 | , brkSuffix 32 | , brkStatus 33 | , Line(..) 34 | , Word(..) 35 | , breakCharacter 36 | , breakLine 37 | , breakSentence 38 | , breakWord 39 | , breaks 40 | , breaksRight 41 | -- * Case mapping 42 | , toCaseFold 43 | , toLower 44 | , toUpper 45 | -- * Iteration 46 | , CharIterator 47 | , fromString 48 | , fromText 49 | , fromUtf8 50 | -- * Normalization 51 | -- $compat 52 | -- ** Normalize unicode strings 53 | , nfc, nfd, nfkc, nfkd, nfkcCasefold 54 | -- ** Checks for normalization 55 | , quickCheck, isNormalized 56 | -- * String comparison 57 | -- ** Normalization-sensitive string comparison 58 | , CompareOption(..) 59 | , compareUnicode 60 | -- ** Locale-sensitive string collation 61 | -- $collate 62 | , Collator 63 | , collator 64 | , collatorWith 65 | , collatorFromRules 66 | , collatorFromRulesWith 67 | , collate 68 | , collateIter 69 | , sortKey 70 | , uca 71 | -- * Regular expressions 72 | , MatchOption(..) 73 | , ParseError(errError, errLine, errOffset) 74 | , Match 75 | , Regex 76 | , Regular 77 | -- ** Construction 78 | , regex 79 | , regex' 80 | -- ** Inspection 81 | , pattern 82 | -- ** Searching 83 | , find 84 | , findAll 85 | -- ** Match groups 86 | -- $group 87 | , groupCount 88 | , unfold 89 | , span 90 | , group 91 | , prefix 92 | , suffix 93 | -- * Spoof checking 94 | -- $spoof 95 | , Spoof 96 | , SpoofParams(..) 97 | , S.SpoofCheck(..) 98 | , S.RestrictionLevel(..) 99 | , S.SpoofCheckResult(..) 100 | -- ** Construction 101 | , spoof 102 | , spoofWithParams 103 | , spoofFromSource 104 | , spoofFromSerialized 105 | -- ** String checking 106 | , areConfusable 107 | , spoofCheck 108 | , getSkeleton 109 | -- ** Configuration 110 | , getChecks 111 | , getAllowedLocales 112 | , getRestrictionLevel 113 | -- ** Persistence 114 | , serialize 115 | -- * Calendars 116 | , Calendar, CalendarType(..), SystemTimeZoneType(..), CalendarField(..), 117 | -- ** Construction 118 | calendar, 119 | -- ** Operations on calendars 120 | roll, add, set1, set, get, 121 | -- * Number formatting 122 | NumberFormatter, numberFormatter, formatIntegral, formatIntegral', formatDouble, formatDouble', 123 | -- * Date formatting 124 | DateFormatter, FormatStyle(..), DateFormatSymbolType(..), standardDateFormatter, patternDateFormatter, dateSymbols, formatCalendar, 125 | ) where 126 | 127 | import Data.Text.ICU.Break.Pure 128 | import Data.Text.ICU.Calendar 129 | import Data.Text.ICU.Collate.Pure 130 | import Data.Text.ICU.DateFormatter 131 | import Data.Text.ICU.Internal 132 | import Data.Text.ICU.Iterator 133 | import Data.Text.ICU.Locale 134 | import Data.Text.ICU.Normalize2 135 | import Data.Text.ICU.NumberFormatter 136 | import Data.Text.ICU.Regex.Pure 137 | import qualified Data.Text.ICU.Spoof as S 138 | import Data.Text.ICU.Spoof.Pure 139 | import Data.Text.ICU.Text 140 | #if defined(__HADDOCK__) 141 | import Data.Text.Foreign 142 | import Data.Text (Text) 143 | #endif 144 | 145 | -- $data 146 | -- 147 | -- The Haskell 'Text' type is implemented as an array in the Haskell 148 | -- heap. This means that its location is not pinned; it may be copied 149 | -- during a garbage collection pass. ICU, on the other hand, works 150 | -- with strings that are allocated in the normal system heap and have 151 | -- a fixed address. 152 | -- 153 | -- To accommodate this need, these bindings use the functions from 154 | -- "Data.Text.Foreign" to copy data between the Haskell heap and the 155 | -- system heap. The copied strings are still managed automatically, 156 | -- but the need to duplicate data does add some performance and memory 157 | -- overhead. 158 | 159 | -- $break 160 | -- 161 | -- Text boundary analysis is the process of locating linguistic 162 | -- boundaries while formatting and handling text. Examples of this 163 | -- process include: 164 | -- 165 | -- * Locating appropriate points to word-wrap text to fit within 166 | -- specific margins while displaying or printing. 167 | -- 168 | -- * Counting characters, words, sentences, or paragraphs. 169 | -- 170 | -- * Making a list of the unique words in a document. 171 | -- 172 | -- * Figuring out if a given range of text contains only whole words. 173 | -- 174 | -- * Capitalizing the first letter of each word. 175 | -- 176 | -- * Locating a particular unit of the text (For example, finding the 177 | -- third word in the document). 178 | -- 179 | -- The 'Breaker' type was designed to support these kinds of 180 | -- tasks. 181 | -- 182 | -- For the impure boundary analysis API (which is richer, but less 183 | -- easy to use than the pure API), see the "Data.Text.ICU.Break" 184 | -- module. The impure API supports some uses that may be less 185 | -- efficient via the pure API, including: 186 | -- 187 | -- * Locating the beginning of a word that the user has selected. 188 | -- 189 | -- * Determining how far to move the text cursor when the user hits an 190 | -- arrow key (Some characters require more than one position in the 191 | -- text store and some characters in the text store do not display 192 | -- at all). 193 | 194 | -- $collate 195 | -- 196 | -- For the impure collation API (which is richer, but less easy to 197 | -- use than the pure API), see the "Data.Text.ICU.Collate" 198 | -- module. 199 | 200 | -- $group 201 | -- 202 | -- Capturing groups are numbered starting from zero. Group zero is 203 | -- always the entire matching text. Groups greater than zero contain 204 | -- the text matching each capturing group in a regular expression. 205 | 206 | -- $spoof 207 | -- 208 | -- The 'Spoof' type performs security checks on visually confusable 209 | -- (spoof) strings. For the impure spoof checking API (which is 210 | -- richer, but less easy to use than the pure API), see the 211 | -- "Data.Text.ICU.Spoof" module. 212 | -- 213 | -- See and 214 | -- for detailed information 215 | -- about the underlying algorithms and databases used by these functions. 216 | 217 | -- $formatting 218 | -- 219 | -- You create a 'NumberFormat' with 'numberFormatter' according to a locale 220 | -- and a choice of pre-defined formats. A 'NumberFormat' provides a formatting 221 | -- facility that 'format's numbers 222 | -- according to the chosen locale. Alternatively create and apply a 'NumberFormat' 223 | -- in a single step with 'formatNumber'' (it may be faster to re-use a NumberFormat though). 224 | -- See the section \"Patterns\" at 225 | -- for further details regarding pattern strings. 226 | 227 | -- $compat 228 | -- See module 'Data.Text.ICU.Normalization2' for the full interface which provides some compatibility with the former API. 229 | -------------------------------------------------------------------------------- /Data/Text/ICU/BiDi.hsc: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ForeignFunctionInterface #-} 2 | -- | 3 | -- Module : Data.Text.ICU.BiDi 4 | -- Copyright : (c) 2018 Ondrej Palkovsky 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- Implementation of the Unicode Bidirectional Algorithm. See the documentation 12 | -- of the libicu library for additional details. 13 | -- 14 | -- -- /Note/: this module is not thread safe. /Do not/ call the 15 | -- functions on one BiDi object from more than one thread! 16 | 17 | module Data.Text.ICU.BiDi 18 | ( 19 | BiDi 20 | -- ** Basic functions 21 | , open 22 | , openSized 23 | -- ** Set data 24 | , setPara 25 | , setLine 26 | -- ** Access the BiDi object 27 | , countParagraphs 28 | , getParagraphByIndex 29 | , getProcessedLength 30 | -- ** Output text 31 | , writeReordered 32 | , WriteOption(..) 33 | -- ** High-level functions 34 | , reorderParagraphs 35 | ) where 36 | 37 | #include 38 | 39 | import Data.Text.ICU.BiDi.Internal 40 | import Foreign.Marshal.Utils (with) 41 | import Foreign.Storable (peek) 42 | import Foreign.Ptr (FunPtr, Ptr) 43 | import Data.Int (Int32, Int16) 44 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError, handleOverflowError) 45 | import Data.Text (Text) 46 | import Data.Text.ICU.Internal (UChar, useAsUCharPtr, fromUCharPtr, newICUPtr) 47 | import Foreign.C.Types (CInt(..)) 48 | import Data.List (foldl') 49 | import Data.Bits ((.|.)) 50 | import System.IO.Unsafe (unsafePerformIO) 51 | import Data.Traversable (for) 52 | 53 | -- | Allocate a BiDi structure. 54 | open :: IO BiDi 55 | open = newICUPtr BiDi ubidi_close ubidi_open 56 | 57 | -- | Allocate a BiDi structure with preallocated memory for internal structures. 58 | openSized :: 59 | Int32 -- ^ is the maximum text or line length that internal memory will be preallocated for. 60 | -- An attempt to associate this object with a longer text will fail, unless this value is 0. 61 | -> Int32 -- ^ is the maximum anticipated number of same-level runs that internal memory will be preallocated for. 62 | -- An attempt to access visual runs on an object that was not preallocated for as many runs as the text was actually resolved to will fail, unless this value is 0. 63 | -> IO BiDi 64 | openSized maxlen maxruncount = 65 | newICUPtr BiDi ubidi_close $ handleError (ubidi_openSized maxlen maxruncount) 66 | 67 | -- | Perform the Unicode Bidi algorithm. It is defined in the Unicode Standard Annex #9, version 13, 68 | -- also described in The Unicode Standard, Version 4.0. 69 | -- This function takes a piece of plain text containing one or more paragraphs, 70 | -- with or without externally specified embedding levels from styled text and 71 | -- computes the left-right-directionality of each character. 72 | setPara :: 73 | BiDi 74 | -> Text 75 | -> Int32 -- ^ specifies the default level for the text; it is typically 0 (LTR) or 1 (RTL) 76 | -> IO () 77 | setPara bidi t paraLevel = 78 | withBiDi bidi $ \bptr -> 79 | useAsUCharPtr t $ \sptr slen -> handleError (ubidi_setPara bptr sptr (fromIntegral slen) paraLevel) 80 | 81 | -- | Sets a BiDi to contain the reordering information, especially the resolved levels, 82 | -- for all the characters in a line of text 83 | setLine :: 84 | BiDi -- ^ the parent paragraph object. It must have been set by a successful call to 'setPara'. 85 | -> Int32 -- ^ is the line's first index into the text 86 | -> Int32 -- ^ is just behind the line's last index into the text (its last index +1). 87 | -> BiDi -- ^ is the object that will now represent a line of the text 88 | -> IO () 89 | setLine paraBidi start limit lineBidi = 90 | withBiDi paraBidi $ \paraptr -> 91 | withBiDi lineBidi $ \lineptr -> 92 | handleError (ubidi_setLine paraptr start limit lineptr) 93 | 94 | -- | Get the number of paragraphs. 95 | countParagraphs :: BiDi -> IO Int32 96 | countParagraphs bidi = withBiDi bidi ubidi_countParagraphs 97 | 98 | -- | Get a paragraph, given the index of this paragraph. 99 | getParagraphByIndex :: 100 | BiDi 101 | -> Int32 -- ^ is the number of the paragraph, in the range [0..ubidi_countParagraphs(pBiDi)-1]. 102 | -> IO (Int32, Int32) -- ^ index of the first character of the paragraph in the text and limit of the paragraph 103 | getParagraphByIndex bidi paraIndex = 104 | withBiDi bidi $ \bptr -> 105 | with 0 $ \pstart -> 106 | with 0 $ \pend -> do 107 | handleError (ubidi_getParagraphByIndex bptr paraIndex pstart pend) 108 | (,) <$> (fromIntegral <$> peek pstart) 109 | <*> (fromIntegral <$> peek pend) 110 | 111 | -- | Get the length of the source text processed by the last call to 'setPara'. 112 | getProcessedLength :: BiDi -> IO Int32 113 | getProcessedLength bidi = withBiDi bidi ubidi_getProcessedLength 114 | 115 | data WriteOption = 116 | DoMirroring 117 | -- ^ replace characters with the "mirrored" property in RTL runs by their mirror-image mappings 118 | | InsertLrmForNumeric 119 | -- ^ surround the run with LRMs if necessary; this is part of the approximate "inverse Bidi" algorithm 120 | | KeepBaseCombining 121 | -- ^ keep combining characters after their base characters in RTL runs 122 | | OutputReverse 123 | -- ^ write the output in reverse order 124 | | RemoveBidiControls 125 | -- ^ remove Bidi control characters (this does not affect InsertLrmForNumeric) 126 | deriving (Show) 127 | 128 | reduceWriteOpts :: [WriteOption] -> Int16 129 | reduceWriteOpts = foldl' orO 0 130 | where a `orO` b = a .|. fromWriteOption b 131 | 132 | fromWriteOption :: WriteOption -> Int16 133 | fromWriteOption DoMirroring = #const UBIDI_DO_MIRRORING 134 | fromWriteOption InsertLrmForNumeric = #const UBIDI_INSERT_LRM_FOR_NUMERIC 135 | fromWriteOption KeepBaseCombining = #const UBIDI_KEEP_BASE_COMBINING 136 | fromWriteOption OutputReverse = #const UBIDI_OUTPUT_REVERSE 137 | fromWriteOption RemoveBidiControls = #const UBIDI_REMOVE_BIDI_CONTROLS 138 | 139 | -- | Take a BiDi object containing the reordering information for a piece of text 140 | -- (one or more paragraphs) set by 'setPara' or for a line of text set by 'setLine' 141 | -- and write a reordered string to the destination buffer. 142 | writeReordered :: BiDi -> [WriteOption] -> IO Text 143 | writeReordered bidi opts = do 144 | destLen <- getProcessedLength bidi 145 | let options' = reduceWriteOpts opts 146 | withBiDi bidi $ \bptr -> 147 | handleOverflowError (fromIntegral destLen) 148 | (\dptr dlen -> ubidi_writeReordered bptr dptr (fromIntegral dlen) options') 149 | (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) 150 | 151 | foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_open" ubidi_open 152 | :: IO (Ptr UBiDi) 153 | 154 | foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_openSized" ubidi_openSized 155 | :: Int32 -> Int32 -> Ptr UErrorCode -> IO (Ptr UBiDi) 156 | 157 | foreign import ccall unsafe "hs_text_icu.h &__hs_ubidi_close" ubidi_close 158 | :: FunPtr (Ptr UBiDi -> IO ()) 159 | 160 | foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_setPara" ubidi_setPara 161 | :: Ptr UBiDi -> Ptr UChar -> Int32 -> Int32 -> Ptr UErrorCode -> IO () 162 | 163 | foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_countParagraphs" ubidi_countParagraphs 164 | :: Ptr UBiDi -> IO Int32 165 | 166 | foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_getParagraphByIndex" ubidi_getParagraphByIndex 167 | :: Ptr UBiDi -> Int32 -> Ptr CInt -> Ptr CInt -> Ptr UErrorCode -> IO () 168 | 169 | foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_getProcessedLength" ubidi_getProcessedLength 170 | :: Ptr UBiDi -> IO Int32 171 | 172 | foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_writeReordered" ubidi_writeReordered 173 | :: Ptr UBiDi -> Ptr UChar -> Int32 -> Int16 -> Ptr UErrorCode -> IO Int32 174 | 175 | foreign import ccall unsafe "hs_text_icu.h __hs_ubidi_setLine" ubidi_setLine 176 | :: Ptr UBiDi -> Int32 -> Int32 -> Ptr UBiDi -> Ptr UErrorCode -> IO () 177 | 178 | -- | Helper function to reorder a text to a series of paragraphs. 179 | reorderParagraphs :: [WriteOption] -> Text -> [Text] 180 | reorderParagraphs options input = 181 | unsafePerformIO $ do 182 | bidi <- open 183 | setPara bidi input 0 184 | pcount <- countParagraphs bidi 185 | lineBidi <- open 186 | for [0..pcount-1] $ \pidx -> do 187 | (start,limit) <- getParagraphByIndex bidi pidx 188 | setLine bidi start limit lineBidi 189 | writeReordered lineBidi options 190 | -------------------------------------------------------------------------------- /Data/Text/ICU/BiDi/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveDataTypeable, EmptyDataDecls #-} 2 | -- | 3 | -- Module : Data.Text.ICU.Bidi.Internal 4 | -- Copyright : (c) Ondrej Palkovsky 2018 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- Internal types for Unicode bidirectional algorithm 12 | 13 | module Data.Text.ICU.BiDi.Internal 14 | ( 15 | BiDi(..) 16 | , UBiDi 17 | , withBiDi 18 | ) where 19 | 20 | import Data.Typeable (Typeable) 21 | import Foreign.ForeignPtr (ForeignPtr, withForeignPtr) 22 | import Foreign.Ptr (Ptr) 23 | 24 | data UBiDi 25 | 26 | -- | BiDi object. /Note/: this structure is not 27 | -- thread safe. It is /not/ safe to use value of this type 28 | -- simultaneously from multiple threads. 29 | newtype BiDi = BiDi (ForeignPtr UBiDi) 30 | deriving (Eq, Typeable) 31 | 32 | instance Show BiDi where 33 | show _ = "BiDi" 34 | 35 | withBiDi :: BiDi -> (Ptr UBiDi -> IO a) -> IO a 36 | {-# INLINE withBiDi #-} 37 | withBiDi (BiDi cnv) = withForeignPtr cnv 38 | -------------------------------------------------------------------------------- /Data/Text/ICU/BitMask.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DefaultSignatures, ScopedTypeVariables #-} 2 | 3 | -- From http://stackoverflow.com/a/15911213 4 | 5 | module Data.Text.ICU.BitMask 6 | ( 7 | -- * Bit mask twiddling API 8 | -- $api 9 | -- * Types 10 | ToBitMask(..) 11 | -- * Functions 12 | , fromBitMask 13 | , highestValueInBitMask 14 | ) where 15 | 16 | import Data.Bits ((.&.), (.|.)) 17 | import Data.Maybe (listToMaybe) 18 | 19 | -- $api 20 | -- Conversion to and from enumerated types representable as 21 | -- a compact bitmask. 22 | 23 | class ToBitMask a where 24 | toBitMask :: a -> Int 25 | 26 | instance (ToBitMask a) => ToBitMask [a] where 27 | toBitMask = foldr ((.|.) . toBitMask) 0 28 | 29 | fromBitMask :: (Enum a, Bounded a, ToBitMask a) => Int -> [a] 30 | fromBitMask bm = filter inBitMask $ enumFrom minBound 31 | where inBitMask val = (bm .&. toBitMask val) == toBitMask val 32 | 33 | highestValueInBitMask :: (Enum a, Bounded a, ToBitMask a) => Int -> Maybe a 34 | highestValueInBitMask = listToMaybe . reverse . fromBitMask 35 | -------------------------------------------------------------------------------- /Data/Text/ICU/Break.hsc: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, ForeignFunctionInterface, RecordWildCards #-} 2 | -- | 3 | -- Module : Data.Text.ICU.Break 4 | -- Copyright : (c) 2010 Bryan O'Sullivan 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- String breaking functions for Unicode, implemented as bindings to 12 | -- the International Components for Unicode (ICU) libraries. 13 | -- 14 | -- The text boundary positions are found according to the rules described in 15 | -- Unicode Standard Annex #29, Text Boundaries, and Unicode Standard Annex 16 | -- #14, Line Breaking Properties. These are available at 17 | -- and 18 | -- . 19 | 20 | module Data.Text.ICU.Break 21 | ( 22 | -- * Types 23 | BreakIterator 24 | , Line(..) 25 | , Data.Text.ICU.Break.Word(..) 26 | -- * Breaking functions 27 | , breakCharacter 28 | , breakLine 29 | , breakSentence 30 | , breakWord 31 | , clone 32 | , setText 33 | -- * Iteration functions 34 | -- $indices 35 | , current 36 | , first 37 | , last 38 | , next 39 | , previous 40 | , preceding 41 | , following 42 | , isBoundary 43 | -- * Iterator status 44 | , getStatus 45 | , getStatuses 46 | -- * Locales 47 | , available 48 | ) where 49 | 50 | #include 51 | 52 | import Control.DeepSeq (NFData(..)) 53 | import Control.Monad (forM) 54 | import Data.IORef (newIORef, writeIORef) 55 | import Data.Int (Int32) 56 | import Data.Text (Text) 57 | import Data.Text.ICU.Break.Types (BreakIterator(..), UBreakIterator) 58 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError) 59 | import Data.Text.ICU.Internal (LocaleName(..), UBool, UChar, asBool, withLocaleName, TextI, UText, asUTextPtr, withUTextPtr, newICUPtr) 60 | import Foreign.C.String (CString, peekCString) 61 | import Foreign.C.Types (CInt(..)) 62 | import Foreign.ForeignPtr (withForeignPtr) 63 | import Foreign.Marshal.Array (allocaArray, peekArray) 64 | import Foreign.Marshal.Utils (with) 65 | import Foreign.Ptr (FunPtr, Ptr, nullPtr) 66 | import Prelude hiding (last) 67 | import System.IO.Unsafe (unsafePerformIO) 68 | 69 | -- $indices 70 | -- 71 | -- /Important note/: All of the indices accepted and returned by 72 | -- functions in this module are offsets into the raw UTF-16 text 73 | -- array, /not/ a count of codepoints. 74 | 75 | -- | Line break status. 76 | data Line = Soft -- ^ A soft line break is a position at 77 | -- which a line break is acceptable, but not 78 | -- required. 79 | | Hard 80 | deriving (Eq, Show, Enum) 81 | 82 | instance NFData Line where 83 | rnf !_ = () 84 | 85 | -- | Word break status. 86 | data Word = Uncategorized -- ^ A \"word\" that does not fit into another 87 | -- category. Includes spaces and most 88 | -- punctuation. 89 | | Number -- ^ A word that appears to be a number. 90 | | Letter -- ^ A word containing letters, excluding 91 | -- hiragana, katakana or ideographic 92 | -- characters. 93 | | Kana -- ^ A word containing kana characters. 94 | | Ideograph -- ^ A word containing ideographic characters. 95 | deriving (Eq, Show, Enum) 96 | 97 | instance NFData Data.Text.ICU.Break.Word where 98 | rnf !_ = () 99 | 100 | -- | Break a string on character boundaries. 101 | -- 102 | -- Character boundary analysis identifies the boundaries of \"Extended 103 | -- Grapheme Clusters\", which are groupings of codepoints that should be 104 | -- treated as character-like units for many text operations. Please see 105 | -- Unicode Standard Annex #29, Unicode Text Segmentation, 106 | -- for additional information on 107 | -- grapheme clusters and guidelines on their use. 108 | breakCharacter :: LocaleName -> Text -> IO (BreakIterator ()) 109 | breakCharacter = open (#const UBRK_CHARACTER) (const ()) 110 | 111 | -- | Break a string on line boundaries. 112 | -- 113 | -- Line boundary analysis determines where a text string can be broken when 114 | -- line wrapping. The mechanism correctly handles punctuation and hyphenated 115 | -- words. 116 | breakLine :: LocaleName -> Text -> IO (BreakIterator Line) 117 | breakLine = open (#const UBRK_LINE) asLine 118 | where 119 | asLine i 120 | | i < (#const UBRK_LINE_SOFT_LIMIT) = Soft 121 | | i < (#const UBRK_LINE_HARD_LIMIT) = Hard 122 | | otherwise = error $ "unknown line break status " ++ show i 123 | 124 | -- | Break a string on sentence boundaries. 125 | -- 126 | -- Sentence boundary analysis allows selection with correct interpretation 127 | -- of periods within numbers and abbreviations, and trailing punctuation 128 | -- marks such as quotation marks and parentheses. 129 | breakSentence :: LocaleName -> Text -> IO (BreakIterator ()) 130 | breakSentence = open (#const UBRK_SENTENCE) (const ()) 131 | 132 | -- | Break a string on word boundaries. 133 | -- 134 | -- Word boundary analysis is used by search and replace functions, as well 135 | -- as within text editing applications that allow the user to select words 136 | -- with a double click. Word selection provides correct interpretation of 137 | -- punctuation marks within and following words. Characters that are not 138 | -- part of a word, such as symbols or punctuation marks, have word breaks on 139 | -- both sides. 140 | breakWord :: LocaleName -> Text -> IO (BreakIterator Data.Text.ICU.Break.Word) 141 | breakWord = open (#const UBRK_WORD) asWord 142 | where 143 | asWord i 144 | | i < (#const UBRK_WORD_NONE_LIMIT) = Uncategorized 145 | | i < (#const UBRK_WORD_NUMBER_LIMIT) = Number 146 | | i < (#const UBRK_WORD_LETTER_LIMIT) = Letter 147 | | i < (#const UBRK_WORD_KANA_LIMIT) = Kana 148 | | i < (#const UBRK_WORD_IDEO_LIMIT) = Ideograph 149 | | otherwise = error $ "unknown word break status " ++ show i 150 | 151 | -- | Create a new 'BreakIterator' for locating text boundaries in the 152 | -- specified locale. 153 | open :: UBreakIteratorType -> (Int32 -> a) -> LocaleName -> Text 154 | -> IO (BreakIterator a) 155 | open brk f loc t = withLocaleName loc $ \locale -> do 156 | r <- newIORef undefined 157 | b <- newICUPtr (BR r f) ubrk_close $ 158 | handleError $ ubrk_open brk locale nullPtr 0 159 | setText b t 160 | return b 161 | 162 | -- | Point an existing 'BreakIterator' at a new piece of text. 163 | setText :: BreakIterator a -> Text -> IO () 164 | setText BR{..} t = do 165 | fp <- asUTextPtr t 166 | withUTextPtr fp $ \ ptr -> do 167 | withForeignPtr brIter $ \p -> handleError $ ubrk_setUText p ptr 168 | writeIORef brText fp 169 | 170 | -- | Thread safe cloning operation. This is substantially faster than 171 | -- creating a new 'BreakIterator' from scratch. 172 | clone :: BreakIterator a -> IO (BreakIterator a) 173 | clone BR{..} = newICUPtr (BR brText brStatus) ubrk_close $ 174 | withForeignPtr brIter $ \p -> 175 | with 1 $ handleError . ubrk_safeClone p nullPtr 176 | 177 | asIndex :: (Ptr UBreakIterator -> IO Int32) -> BreakIterator a -> IO (Maybe TextI) 178 | asIndex act BR{..} = do 179 | i <- withForeignPtr brIter act 180 | return $! if i == (#const UBRK_DONE) 181 | then Nothing 182 | else Just $! fromIntegral i 183 | 184 | -- | Reset the breaker to the beginning of the text to be scanned. 185 | first :: BreakIterator a -> IO TextI 186 | first BR{..} = fromIntegral `fmap` withForeignPtr brIter ubrk_first 187 | 188 | -- | Reset the breaker to the end of the text to be scanned. 189 | last :: BreakIterator a -> IO TextI 190 | last BR{..} = fromIntegral `fmap` withForeignPtr brIter ubrk_last 191 | 192 | -- | Advance the iterator and break at the text boundary that follows the 193 | -- current text boundary. 194 | next :: BreakIterator a -> IO (Maybe TextI) 195 | next = asIndex ubrk_next 196 | 197 | -- | Advance the iterator and break at the text boundary that precedes the 198 | -- current text boundary. 199 | previous :: BreakIterator a -> IO (Maybe TextI) 200 | previous = asIndex ubrk_previous 201 | 202 | -- | Determine the text boundary preceding the specified offset. 203 | preceding :: BreakIterator a -> Int -> IO (Maybe TextI) 204 | preceding bi i = asIndex (flip ubrk_preceding (fromIntegral i)) bi 205 | 206 | -- | Determine the text boundary following the specified offset. 207 | following :: BreakIterator a -> Int -> IO (Maybe TextI) 208 | following bi i = asIndex (flip ubrk_following (fromIntegral i)) bi 209 | 210 | -- | Return the character index most recently returned by 'next', 211 | -- 'previous', 'first', or 'last'. 212 | current :: BreakIterator a -> IO (Maybe TextI) 213 | current = asIndex ubrk_current 214 | 215 | -- | Return the status from the break rule that determined the most recently 216 | -- returned break position. For rules that do not specify a status, a 217 | -- default value of @()@ is returned. 218 | getStatus :: BreakIterator a -> IO a 219 | getStatus BR{..} = brStatus `fmap` withForeignPtr brIter ubrk_getRuleStatus 220 | 221 | -- | Return statuses from all of the break rules that determined the most 222 | -- recently returned break position. 223 | getStatuses :: BreakIterator a -> IO [a] 224 | getStatuses BR{..} = 225 | withForeignPtr brIter $ \brk -> do 226 | n <- handleError $ ubrk_getRuleStatusVec brk nullPtr 0 227 | allocaArray (fromIntegral n) $ \ptr -> do 228 | _ <- handleError $ ubrk_getRuleStatusVec brk ptr n 229 | map brStatus `fmap` peekArray (fromIntegral n) ptr 230 | 231 | -- | Determine whether the specified position is a boundary position. 232 | -- As a side effect, leaves the iterator pointing to the first 233 | -- boundary position at or after the given offset. 234 | isBoundary :: BreakIterator a -> Int -> IO Bool 235 | isBoundary BR{..} i = asBool `fmap` withForeignPtr brIter (flip ubrk_isBoundary (fromIntegral i)) 236 | 237 | -- | Locales for which text breaking information is available. A 238 | -- 'BreakIterator' in a locale in this list will perform the correct 239 | -- text breaking for the locale. 240 | available :: [LocaleName] 241 | available = unsafePerformIO $ do 242 | n <- ubrk_countAvailable 243 | forM [0..n-1] $ \i -> ubrk_getAvailable i >>= fmap Locale . peekCString 244 | {-# NOINLINE available #-} 245 | 246 | type UBreakIteratorType = CInt 247 | 248 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_open" ubrk_open 249 | :: UBreakIteratorType -> CString -> Ptr UChar -> Int32 -> Ptr UErrorCode 250 | -> IO (Ptr UBreakIterator) 251 | 252 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_setUText" ubrk_setUText 253 | :: Ptr UBreakIterator -> Ptr UText -> Ptr UErrorCode 254 | -> IO () 255 | 256 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_safeClone" ubrk_safeClone 257 | :: Ptr UBreakIterator -> Ptr a -> Ptr Int32 -> Ptr UErrorCode 258 | -> IO (Ptr UBreakIterator) 259 | 260 | foreign import ccall unsafe "hs_text_icu.h &__hs_ubrk_close" ubrk_close 261 | :: FunPtr (Ptr UBreakIterator -> IO ()) 262 | 263 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_current" ubrk_current 264 | :: Ptr UBreakIterator -> IO Int32 265 | 266 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_first" ubrk_first 267 | :: Ptr UBreakIterator -> IO Int32 268 | 269 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_last" ubrk_last 270 | :: Ptr UBreakIterator -> IO Int32 271 | 272 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_next" ubrk_next 273 | :: Ptr UBreakIterator -> IO Int32 274 | 275 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_previous" ubrk_previous 276 | :: Ptr UBreakIterator -> IO Int32 277 | 278 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_preceding" ubrk_preceding 279 | :: Ptr UBreakIterator -> Int32 -> IO Int32 280 | 281 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_following" ubrk_following 282 | :: Ptr UBreakIterator -> Int32 -> IO Int32 283 | 284 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_getRuleStatus" ubrk_getRuleStatus 285 | :: Ptr UBreakIterator -> IO Int32 286 | 287 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_getRuleStatusVec" ubrk_getRuleStatusVec 288 | :: Ptr UBreakIterator -> Ptr Int32 -> Int32 -> Ptr UErrorCode -> IO Int32 289 | 290 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_isBoundary" ubrk_isBoundary 291 | :: Ptr UBreakIterator -> Int32 -> IO UBool 292 | 293 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_countAvailable" ubrk_countAvailable 294 | :: IO Int32 295 | 296 | foreign import ccall unsafe "hs_text_icu.h __hs_ubrk_getAvailable" ubrk_getAvailable 297 | :: Int32 -> IO CString 298 | -------------------------------------------------------------------------------- /Data/Text/ICU/Break/Pure.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, RecordWildCards, CPP #-} 2 | -- | 3 | -- Module : Data.Text.ICU.Break.Pure 4 | -- Copyright : (c) 2010 Bryan O'Sullivan 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- String breaking functions for Unicode, implemented as bindings to 12 | -- the International Components for Unicode (ICU) libraries. 13 | -- 14 | -- The text boundary positions are found according to the rules described in 15 | -- Unicode Standard Annex #29, Text Boundaries, and Unicode Standard Annex 16 | -- #14, Line Breaking Properties. These are available at 17 | -- and 18 | -- . 19 | 20 | module Data.Text.ICU.Break.Pure 21 | ( 22 | -- * Types 23 | Breaker 24 | , Break 25 | , brkPrefix 26 | , brkBreak 27 | , brkSuffix 28 | , brkStatus 29 | , Line(..) 30 | , Data.Text.ICU.Break.Word(..) 31 | -- * Breaking functions 32 | , breakCharacter 33 | , breakLine 34 | , breakSentence 35 | , breakWord 36 | -- * Iteration 37 | , breaks 38 | , breaksRight 39 | ) where 40 | 41 | import Control.DeepSeq (NFData(..)) 42 | import Data.Text (Text, empty) 43 | import Data.Text.ICU.Break (Line, Word) 44 | import Data.Text.ICU.Break.Types (BreakIterator(..)) 45 | import Data.Text.ICU.Internal (LocaleName, takeWord, dropWord) 46 | import System.IO.Unsafe (unsafeInterleaveIO, unsafePerformIO) 47 | import qualified Data.Text.ICU.Break as IO 48 | 49 | -- | A boundary analyser. 50 | newtype Breaker a = B (BreakIterator a) 51 | 52 | new :: (LocaleName -> Text -> IO (BreakIterator a)) -> LocaleName -> Breaker a 53 | new act loc = unsafePerformIO $ B `fmap` act loc empty 54 | 55 | -- | Break a string on character boundaries. 56 | -- 57 | -- Character boundary analysis identifies the boundaries of "Extended 58 | -- Grapheme Clusters", which are groupings of codepoints that should be 59 | -- treated as character-like units for many text operations. Please see 60 | -- Unicode Standard Annex #29, Unicode Text Segmentation, 61 | -- for additional information on 62 | -- grapheme clusters and guidelines on their use. 63 | breakCharacter :: LocaleName -> Breaker () 64 | breakCharacter = new IO.breakCharacter 65 | 66 | -- | Break a string on line boundaries. 67 | -- 68 | -- Line boundary analysis determines where a text string can be broken when 69 | -- line wrapping. The mechanism correctly handles punctuation and hyphenated 70 | -- words. 71 | breakLine :: LocaleName -> Breaker Line 72 | breakLine = new IO.breakLine 73 | 74 | -- | Break a string on sentence boundaries. 75 | -- 76 | -- Sentence boundary analysis allows selection with correct interpretation 77 | -- of periods within numbers and abbreviations, and trailing punctuation 78 | -- marks such as quotation marks and parentheses. 79 | breakSentence :: LocaleName -> Breaker () 80 | breakSentence = new IO.breakSentence 81 | 82 | -- | Break a string on word boundaries. 83 | -- 84 | -- Word boundary analysis is used by search and replace functions, as well 85 | -- as within text editing applications that allow the user to select words 86 | -- with a double click. Word selection provides correct interpretation of 87 | -- punctuation marks within and following words. Characters that are not 88 | -- part of a word, such as symbols or punctuation marks, have word breaks on 89 | -- both sides. 90 | breakWord :: LocaleName -> Breaker Data.Text.ICU.Break.Word 91 | breakWord = new IO.breakWord 92 | 93 | -- | A break in a string. 94 | data Break a = Break { 95 | brkPrefix :: {-# UNPACK #-} !Text -- ^ Prefix of the current break. 96 | , brkBreak :: {-# UNPACK #-} !Text -- ^ Text of the current break. 97 | , brkSuffix :: {-# UNPACK #-} !Text -- ^ Suffix of the current break. 98 | , brkStatus :: !a 99 | -- ^ Status of the current break (only meaningful if 'Line' or 'Word'). 100 | } deriving (Eq, Show) 101 | 102 | instance (NFData a) => NFData (Break a) where 103 | rnf Break{..} = rnf brkStatus 104 | 105 | -- | Return a list of all breaks in a string, from left to right. 106 | breaks :: Breaker a -> Text -> [Break a] 107 | breaks (B b) t = unsafePerformIO $ do 108 | bi <- IO.clone b 109 | IO.setText bi t 110 | let go p = do 111 | mix <- IO.next bi 112 | case mix of 113 | Nothing -> return [] 114 | Just n -> do 115 | s <- IO.getStatus bi 116 | let d = n-p 117 | u = dropWord p t 118 | (Break (takeWord p t) (takeWord d u) (dropWord d u) s :) `fmap` go n 119 | unsafeInterleaveIO $ go =<< IO.first bi 120 | 121 | -- | Return a list of all breaks in a string, from right to left. 122 | breaksRight :: Breaker a -> Text -> [Break a] 123 | breaksRight (B b) t = unsafePerformIO $ do 124 | bi <- IO.clone b 125 | IO.setText bi t 126 | let go p = do 127 | mix <- IO.previous bi 128 | case mix of 129 | Nothing -> return [] 130 | Just n -> do 131 | s <- IO.getStatus bi 132 | let d = p-n 133 | u = dropWord n t 134 | (Break (takeWord n t) (takeWord d u) (dropWord d u) s :) `fmap` go n 135 | unsafeInterleaveIO $ go =<< IO.last bi 136 | -------------------------------------------------------------------------------- /Data/Text/ICU/Break/Types.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE EmptyDataDecls #-} 2 | -- | 3 | -- Module : Data.Text.ICU.Break.Internal 4 | -- Copyright : (c) 2010 Bryan O'Sullivan 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | 11 | module Data.Text.ICU.Break.Types 12 | ( 13 | BreakIterator(..) 14 | , UBreakIterator 15 | ) where 16 | 17 | import Data.IORef (IORef) 18 | import Data.Int (Int32) 19 | import Foreign.ForeignPtr (ForeignPtr) 20 | import Data.Text.ICU.Internal (UTextPtr) 21 | 22 | -- A boundary breaker. 23 | data BreakIterator a = BR { 24 | brText :: IORef UTextPtr 25 | , brStatus :: Int32 -> a 26 | , brIter :: ForeignPtr UBreakIterator 27 | } 28 | 29 | data UBreakIterator 30 | -------------------------------------------------------------------------------- /Data/Text/ICU/CaseMap.hsc: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveDataTypeable, EmptyDataDecls, ForeignFunctionInterface #-} 2 | -- | 3 | -- Module : Data.Text.ICU.CaseMap 4 | -- Copyright : (c) 2010 Bryan O'Sullivan 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- Character set case mapping functions for Unicode, implemented as 12 | -- bindings to the International Components for Unicode (ICU) 13 | -- libraries. 14 | 15 | module Data.Text.ICU.CaseMap 16 | ( 17 | CaseMap 18 | , CaseOption(..) 19 | , caseMap 20 | ) where 21 | 22 | #include 23 | #include 24 | 25 | import Data.Bits ((.|.)) 26 | import Data.List (foldl') 27 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError) 28 | import Data.Text.ICU.Internal (LocaleName, withLocaleName) 29 | import Data.Typeable (Typeable) 30 | import Data.Word (Word32) 31 | import Foreign.C.String (CString) 32 | import Foreign.ForeignPtr (ForeignPtr) 33 | import Foreign.Ptr (FunPtr, Ptr) 34 | import System.IO.Unsafe (unsafePerformIO) 35 | 36 | data UCaseMap 37 | 38 | data CaseMap = CaseMap {-# UNPACK #-} !(ForeignPtr UCaseMap) 39 | deriving (Eq, Typeable) 40 | 41 | data CaseOption = FoldCaseExcludeSpecialI 42 | | TitleCaseNoLowerCase 43 | | TitleCaseNoBreakAdjustment 44 | deriving (Eq, Enum) 45 | 46 | instance NFData CaseOption where 47 | rnf !_ = () 48 | 49 | fromCaseMapOption :: CaseOption -> Word32 50 | fromCaseMapOption FoldCaseExcludeSpecialI = #const U_FOLD_CASE_EXCLUDE_SPECIAL_I 51 | fromCaseMapOption TitleCaseNoLowerCase = #const U_TITLECASE_NO_LOWERCASE 52 | fromCaseMapOption TitleCaseNoBreakAdjustment = #const U_TITLECASE_NO_BREAK_ADJUSTMENT 53 | 54 | reduceCaseMapOptions :: [CaseOption] -> Word32 55 | reduceCaseMapOptions = foldl' (.|.) (#const U_FOLD_CASE_DEFAULT) . 56 | map fromCaseMapOption 57 | 58 | caseMap :: LocaleName -> [CaseOption] -> CaseMap 59 | caseMap name opts = unsafePerformIO $ newICUPtr CaseMap ucasemap_close $ 60 | withLocaleName name $ \nptr -> handleError $ ucasemap_open nptr $ 61 | reduceCaseMapOptions $ opts 62 | 63 | foreign import ccall unsafe "hs_text_icu.h __hs_ucasemap_open" ucasemap_open 64 | :: CString -> Word32 -> Ptr UErrorCode -> IO (Ptr UCaseMap) 65 | 66 | foreign import ccall unsafe "hs_text_icu.h &__hs_ucasemap_close" ucasemap_close 67 | :: FunPtr (Ptr UCaseMap -> IO ()) 68 | -------------------------------------------------------------------------------- /Data/Text/ICU/CharsetDetection.hsc: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ForeignFunctionInterface #-} 2 | -- | 3 | -- Module : Data.Text.ICU.CharsetDetection 4 | -- Copyright : (c) 2017 Zac Slade 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- Access to the Unicode Character Set Detection facilities, implemented in 12 | -- the International Components for Unicode (ICU) libraries. 13 | -- 14 | -- For more information see the \"Character Set Detection\" chapter 15 | -- in the ICU User Guide 16 | -- . 17 | module Data.Text.ICU.CharsetDetection 18 | ( 19 | setText 20 | , detect 21 | , mkCharsetDetector 22 | , withCharsetDetector 23 | , wrapUCharsetMatch 24 | , CharsetMatch 25 | , CharsetDetector 26 | , getConfidence 27 | , getName 28 | , getLanguage 29 | ) where 30 | 31 | import Foreign.Ptr (Ptr) 32 | import Foreign.C.String (CString) 33 | import Foreign.C.Types (CChar) 34 | import qualified Data.ByteString as BS 35 | import Data.ByteString (ByteString) 36 | import qualified Data.Text.Encoding as TE 37 | import Data.Text (Text) 38 | 39 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError) 40 | import Data.Text.ICU.CharsetDetection.Internal (UCharsetMatch, UCharsetDetector, 41 | CharsetDetector, CharsetMatch, 42 | mkCharsetDetector, 43 | withCharsetDetector, 44 | withCharsetMatch, 45 | wrapUCharsetMatch) 46 | 47 | #include 48 | 49 | -- | From the ICU C API documentation: 50 | -- "Character set detection is at best an imprecise operation. The 51 | -- detection process will attempt to identify the charset that best matches 52 | -- the characteristics of the byte data, but the process is partly statistical 53 | -- in nature, and the results can not be guaranteed to always be correct. 54 | -- 55 | -- For best accuracy in charset detection, the input data should be primarily 56 | -- in a single language, and a minimum of a few hundred bytes worth of plain 57 | -- text in the language are needed. The detection process will attempt to 58 | -- ignore html or xml style markup that could otherwise obscure the content." 59 | 60 | -- | Use the first 512 bytes, if available, as the text in the 61 | -- 'CharsetDetector' object. This function is low-level and used by the more 62 | -- high-level 'detect' function. 63 | setText :: ByteString -> CharsetDetector -> IO () 64 | setText bs ucsd = withCharsetDetector ucsd go 65 | where 66 | go u = if BS.length bs < 512 67 | then BS.useAsCStringLen bs (\(text,size) -> handleError $ ucsdet_setText u text size) 68 | else BS.useAsCStringLen (BS.take 512 bs) (\(text,size) -> handleError $ ucsdet_setText u text size) 69 | 70 | -- | Attempt to perform a detection without an input filter. The best match 71 | -- will be returned. 72 | detect :: ByteString -> IO CharsetMatch 73 | detect bs = do 74 | ucsd <- mkCharsetDetector 75 | setText bs ucsd 76 | wrapUCharsetMatch ucsd $ withCharsetDetector ucsd (handleError . ucsdet_detect) 77 | 78 | -- | See the confidence score from 0-100 of the 'CharsetMatch' object. 79 | getConfidence :: CharsetMatch -> IO Int 80 | getConfidence ucm = withCharsetMatch ucm $ handleError . ucsdet_getConfidence 81 | 82 | -- | Extract the character set encoding name from the 'CharsetMatch' 83 | -- object. 84 | getName :: CharsetMatch -> IO Text 85 | getName ucsm = do 86 | bs <- withCharsetMatch ucsm (handleError . ucsdet_getName) >>= BS.packCString 87 | return $ TE.decodeUtf8 bs 88 | 89 | -- | Extracts the three letter ISO code for the language encoded in the 90 | -- 'CharsetMatch'. 91 | getLanguage :: CharsetMatch -> IO Text 92 | getLanguage ucsm = do 93 | bs <- withCharsetMatch ucsm (handleError . ucsdet_getLanguage) >>= BS.packCString 94 | return $ TE.decodeUtf8 bs 95 | 96 | foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_setText" ucsdet_setText 97 | :: Ptr UCharsetDetector -> Ptr CChar -> Int -> Ptr UErrorCode -> IO () 98 | 99 | foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_detect" ucsdet_detect 100 | :: Ptr UCharsetDetector -> Ptr UErrorCode -> IO (Ptr UCharsetMatch) 101 | 102 | foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_getConfidence" ucsdet_getConfidence 103 | :: Ptr UCharsetMatch -> Ptr UErrorCode -> IO Int 104 | 105 | foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_getName" ucsdet_getName 106 | :: Ptr UCharsetMatch -> Ptr UErrorCode -> IO CString 107 | 108 | foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_getLanguage" ucsdet_getLanguage 109 | :: Ptr UCharsetMatch -> Ptr UErrorCode -> IO CString 110 | -------------------------------------------------------------------------------- /Data/Text/ICU/CharsetDetection/Internal.hsc: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveDataTypeable, ForeignFunctionInterface, EmptyDataDecls #-} 2 | -- | 3 | -- Module : Data.Text.ICU.CharsetDetection.Internal 4 | -- Copyright : (c) 2017 Zac Slade 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- Access to the Unicode Character Set Detection facilities, implemented in 12 | -- the International Components for Unicode (ICU) libraries. 13 | -- 14 | -- For more information see the \"Character Set Detection\" chapter 15 | -- in the ICU User Guide 16 | -- . 17 | module Data.Text.ICU.CharsetDetection.Internal 18 | ( 19 | UCharsetDetector 20 | , UCharsetMatch 21 | , CharsetMatch(..) 22 | , CharsetDetector(..) 23 | , withCharsetDetector 24 | , wrapUCharsetDetector 25 | , wrapUCharsetMatch 26 | , mkCharsetDetector 27 | , withCharsetMatch 28 | ) where 29 | 30 | import Data.Typeable (Typeable) 31 | import Foreign.ForeignPtr (ForeignPtr, withForeignPtr) 32 | import Foreign.Ptr (FunPtr, Ptr) 33 | 34 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError) 35 | import Data.Text.ICU.Internal (newICUPtr) 36 | 37 | #include 38 | 39 | -- | Opaque handle to a character set detector 40 | data UCharsetDetector 41 | 42 | -- | Handy wrapper for the pointer to the 'UCharsetDetector'. We must 43 | -- always call ucsdet_close on any UCharsetDetector when we are done. The 44 | -- 'withCharsetDetector' and 'wrapUCharsetDetector' functions simplify 45 | -- management of the pointers. 46 | data CharsetDetector = CharsetDetector { 47 | charsetDetectorPtr :: {-# UNPACK #-} !(ForeignPtr UCharsetDetector) 48 | } deriving (Typeable) 49 | 50 | mkCharsetDetector :: IO CharsetDetector 51 | mkCharsetDetector = wrapUCharsetDetector $ handleError ucsdet_open 52 | 53 | -- | Temporarily unwraps an 'CharsetDetector' to perform operations on its 54 | -- raw 'UCharsetDetector' handle. 55 | withCharsetDetector :: CharsetDetector -> (Ptr UCharsetDetector -> IO a) -> IO a 56 | withCharsetDetector (CharsetDetector ucsd) = withForeignPtr ucsd 57 | {-# INLINE withCharsetDetector #-} 58 | 59 | -- | Wraps a raw 'UCharsetDetector' in an 'CharsetDetector', closing the 60 | -- handle when the last reference to the object is dropped. 61 | wrapUCharsetDetector :: IO (Ptr UCharsetDetector) -> IO CharsetDetector 62 | wrapUCharsetDetector = newICUPtr CharsetDetector ucsdet_close 63 | {-# INLINE wrapUCharsetDetector #-} 64 | 65 | -- | Opaque handle to a character set match 66 | data UCharsetMatch 67 | 68 | -- | Opaque character set match handle. The memory backing these objects is 69 | -- managed entirely by the ICU C library. 70 | -- TODO: UCharsetMatch is reset after the setText call. We need to handle it. 71 | data CharsetMatch = 72 | CharsetMatch 73 | { charsetMatchPtr :: {-# UNPACK #-} !(Ptr UCharsetMatch) 74 | , charsetMatchDetector :: CharsetDetector 75 | -- ^ keep reference since UCharsetMatch object is owned 76 | -- by the UCharsetDetector. 77 | } 78 | deriving (Typeable) 79 | 80 | wrapUCharsetMatch :: CharsetDetector -> IO (Ptr UCharsetMatch) -> IO CharsetMatch 81 | wrapUCharsetMatch cd = fmap $ flip CharsetMatch cd 82 | 83 | withCharsetMatch :: CharsetMatch -> (Ptr UCharsetMatch -> IO a) -> IO a 84 | withCharsetMatch (CharsetMatch ucsm _) f = f ucsm 85 | 86 | foreign import ccall unsafe "hs_text_icu.h &__hs_ucsdet_close" ucsdet_close 87 | :: FunPtr (Ptr UCharsetDetector -> IO ()) 88 | 89 | foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_open" ucsdet_open 90 | :: Ptr UErrorCode -> IO (Ptr UCharsetDetector) 91 | -------------------------------------------------------------------------------- /Data/Text/ICU/Collate/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveDataTypeable, EmptyDataDecls, ForeignFunctionInterface #-} 2 | -- | 3 | -- Module : Data.Text.ICU.Collate.Internal 4 | -- Copyright : (c) 2010 Bryan O'Sullivan 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- Internals of the string collation infrastructure. 12 | 13 | module Data.Text.ICU.Collate.Internal 14 | ( 15 | -- * Unicode collation API 16 | MCollator(..) 17 | , Collator(..) 18 | , UCollator 19 | , withCollator 20 | , wrap 21 | ) where 22 | 23 | import Data.Typeable (Typeable) 24 | import Foreign.ForeignPtr (ForeignPtr, withForeignPtr) 25 | import Foreign.Ptr (FunPtr, Ptr) 26 | import Data.Text.ICU.Internal (newICUPtr) 27 | 28 | -- $api 29 | -- 30 | 31 | data UCollator 32 | 33 | -- | String collator type. 34 | data MCollator = MCollator {-# UNPACK #-} !(ForeignPtr UCollator) 35 | deriving (Typeable) 36 | 37 | -- | String collator type. 38 | newtype Collator = C MCollator 39 | deriving (Typeable) 40 | 41 | withCollator :: MCollator -> (Ptr UCollator -> IO a) -> IO a 42 | withCollator (MCollator col) action = withForeignPtr col action 43 | {-# INLINE withCollator #-} 44 | 45 | wrap :: IO (Ptr UCollator) -> IO MCollator 46 | wrap = newICUPtr MCollator ucol_close 47 | {-# INLINE wrap #-} 48 | 49 | foreign import ccall unsafe "hs_text_icu.h &__hs_ucol_close" ucol_close 50 | :: FunPtr (Ptr UCollator -> IO ()) 51 | -------------------------------------------------------------------------------- /Data/Text/ICU/Collate/Pure.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveDataTypeable, ForeignFunctionInterface, ScopedTypeVariables #-} 2 | -- | 3 | -- Module : Data.Text.ICU.Collate.Pure 4 | -- Copyright : (c) 2010 Bryan O'Sullivan 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- Pure string collation functions for Unicode, implemented as 12 | -- bindings to the International Components for Unicode (ICU) 13 | -- libraries. 14 | -- 15 | -- For the impure collation API (which is richer, but less easy to 16 | -- use), see the "Data.Text.ICU.Collate" module. 17 | 18 | module Data.Text.ICU.Collate.Pure 19 | ( 20 | -- * Unicode collation API 21 | -- $api 22 | Collator 23 | , collator 24 | , collatorWith 25 | , collatorFromRules 26 | , collatorFromRulesWith 27 | , collate 28 | , collateIter 29 | , rules 30 | , sortKey 31 | , uca 32 | ) where 33 | 34 | import qualified Control.Exception as E 35 | import Control.Monad (forM_) 36 | import Data.ByteString (ByteString) 37 | import Data.Text (Text) 38 | import Data.Text.ICU.Error.Internal (ParseError(..)) 39 | import Data.Text.ICU.Collate.Internal (Collator(..)) 40 | import Data.Text.ICU.Internal (CharIterator, LocaleName(..)) 41 | import System.IO.Unsafe (unsafePerformIO) 42 | import qualified Data.Text.ICU.Collate as IO 43 | 44 | -- $api 45 | -- 46 | 47 | -- | Create an immutable 'Collator' for comparing strings. 48 | -- 49 | -- If 'Root' is passed as the locale, UCA collation rules will be 50 | -- used. 51 | collator :: LocaleName -> Collator 52 | collator loc = unsafePerformIO $ C `fmap` IO.open loc 53 | 54 | -- | Create an immutable 'Collator' with the given 'Attribute's. 55 | collatorWith :: LocaleName -> [IO.Attribute] -> Collator 56 | collatorWith loc atts = unsafePerformIO $ do 57 | mc <- IO.open loc 58 | forM_ atts $ IO.setAttribute mc 59 | return (C mc) 60 | 61 | -- | Create an immutable 'Collator' from the given collation rules. 62 | collatorFromRules :: Text -> Either ParseError Collator 63 | collatorFromRules rul = collatorFromRulesWith rul [] 64 | 65 | -- | Create an immutable 'Collator' from the given collation rules with the given 'Attribute's. 66 | collatorFromRulesWith :: Text -> [IO.Attribute] -> Either ParseError Collator 67 | collatorFromRulesWith rul atts = unsafePerformIO $ 68 | (Right `fmap` openAndSetAtts) 69 | `E.catch` \(err::ParseError) -> return (Left err) 70 | where 71 | openAndSetAtts = do 72 | mc <- IO.openRules rul Nothing Nothing 73 | forM_ atts $ IO.setAttribute mc 74 | return (C mc) 75 | 76 | -- | Get rules for the given 'Collator'. 77 | rules :: Collator -> Text 78 | rules (C c) = unsafePerformIO $ IO.getRules c 79 | 80 | -- | Compare two strings. 81 | collate :: Collator -> Text -> Text -> Ordering 82 | collate (C c) a b = unsafePerformIO $ IO.collate c a b 83 | {-# INLINE collate #-} 84 | 85 | -- | Compare two 'CharIterator's. 86 | -- 87 | -- If either iterator was constructed from a 'ByteString', it does not 88 | -- need to be copied or converted beforehand, so this function can be 89 | -- quite cheap. 90 | collateIter :: Collator -> CharIterator -> CharIterator -> Ordering 91 | collateIter (C c) a b = unsafePerformIO $ IO.collateIter c a b 92 | {-# INLINE collateIter #-} 93 | 94 | -- | Create a key for sorting the 'Text' using the given 'Collator'. 95 | -- The result of comparing two 'ByteString's that have been 96 | -- transformed with 'sortKey' will be the same as the result of 97 | -- 'collate' on the two untransformed 'Text's. 98 | sortKey :: Collator -> Text -> ByteString 99 | sortKey (C c) = unsafePerformIO . IO.sortKey c 100 | {-# INLINE sortKey #-} 101 | 102 | -- | A 'Collator' that uses the Unicode Collation Algorithm (UCA). 103 | uca :: Collator 104 | uca = collator Root 105 | {-# NOINLINE uca #-} 106 | -------------------------------------------------------------------------------- /Data/Text/ICU/Convert.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ForeignFunctionInterface, CPP #-} 2 | -- | 3 | -- Module : Data.Text.ICU.Convert 4 | -- Copyright : (c) 2009, 2010 Bryan O'Sullivan 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- Character set conversion functions for Unicode, implemented as 12 | -- bindings to the International Components for Unicode (ICU) 13 | -- libraries. 14 | module Data.Text.ICU.Convert 15 | ( 16 | -- * Character set conversion 17 | Converter 18 | -- ** Basic functions 19 | , open 20 | , fromUnicode 21 | , toUnicode 22 | -- ** Converter metadata 23 | , getName 24 | , usesFallback 25 | , isAmbiguous 26 | -- * Functions for controlling global behavior 27 | , getDefaultName 28 | , setDefaultName 29 | -- * Miscellaneous functions 30 | , compareNames 31 | , aliases 32 | -- * Metadata 33 | , converterNames 34 | , standardNames 35 | ) where 36 | 37 | import Data.ByteString.Internal (ByteString, createAndTrim) 38 | import Data.ByteString.Unsafe (unsafeUseAsCStringLen) 39 | import Data.Int (Int32) 40 | import Data.Text (Text) 41 | import Data.Text.Foreign (fromPtr, useAsPtr) 42 | #if !MIN_VERSION_text(2,0,0) 43 | import Data.Text.ICU.Internal (UChar) 44 | #endif 45 | import Data.Text.ICU.Internal (lengthWord) 46 | import Data.Text.ICU.Convert.Internal 47 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError) 48 | import Data.Word (Word8, Word16) 49 | import Foreign.C.String (CString, peekCString, withCString) 50 | import Foreign.C.Types (CInt(..)) 51 | import Foreign.Marshal.Array (allocaArray) 52 | import Foreign.Ptr (FunPtr, Ptr) 53 | import System.IO.Unsafe (unsafePerformIO) 54 | import Data.Text.ICU.Internal (UBool, asBool, asOrdering, withName, newICUPtr) 55 | 56 | -- | Do a fuzzy compare of two converter/alias names. The comparison 57 | -- is case-insensitive, ignores leading zeroes if they are not 58 | -- followed by further digits, and ignores all but letters and digits. 59 | -- Thus the strings @\"UTF-8\"@, @\"utf_8\"@, @\"u*T\@f08\"@ and 60 | -- @\"Utf 8\"@ are exactly equivalent. See section 1.4, Charset Alias 61 | -- Matching in Unicode Technical Standard #22 at 62 | -- 63 | compareNames :: String -> String -> Ordering 64 | compareNames a b = 65 | unsafePerformIO . withCString a $ \aptr -> 66 | fmap asOrdering . withCString b $ ucnv_compareNames aptr 67 | 68 | -- | Create a 'Converter' with the name of a coded character set 69 | -- specified as a string. The actual name will be resolved with the 70 | -- alias file using a case-insensitive string comparison that ignores 71 | -- leading zeroes and all non-alphanumeric characters. E.g., the 72 | -- names @\"UTF8\"@, @\"utf-8\"@, @\"u*T\@f08\"@ and @\"Utf 8\"@ are 73 | -- all equivalent (see also 'compareNames'). If an empty string is 74 | -- passed for the converter name, it will create one with the 75 | -- 'getDefaultName' return value. 76 | -- 77 | -- A converter name may contain options like a locale specification to 78 | -- control the specific behavior of the newly instantiated converter. 79 | -- The meaning of the options depends on the particular converter. If 80 | -- an option is not defined for or recognized by a given converter, 81 | -- then it is ignored. 82 | -- 83 | -- Options are appended to the converter name string, with a comma 84 | -- between the name and the first option and also between adjacent 85 | -- options. 86 | -- 87 | -- If the alias is ambiguous, then the preferred converter is used. 88 | -- 89 | -- The conversion behavior and names can vary between platforms. ICU 90 | -- may convert some characters differently from other 91 | -- platforms. Details on this topic are in the ICU User's Guide at 92 | -- . Aliases 93 | -- starting with a @\"cp\"@ prefix have no specific meaning other than 94 | -- its an alias starting with the letters @\"cp\"@. Please do not 95 | -- associate any meaning to these aliases. 96 | open :: String -- ^ Name of the converter to use. 97 | -> Maybe Bool -- ^ Whether to use fallback mappings 98 | -- (see 'usesFallback' for details). 99 | -> IO Converter 100 | open name mf = do 101 | c <- newICUPtr Converter ucnv_close $ withName name (handleError . ucnv_open) 102 | case mf of 103 | Just f -> withConverter c $ \p -> ucnv_setFallback p . fromIntegral . fromEnum $ f 104 | _ -> return () 105 | return c 106 | 107 | -- | Encode a Unicode string into a code page string using the given converter. 108 | fromUnicode :: Converter -> Text -> ByteString 109 | fromUnicode cnv t = 110 | unsafePerformIO . useAsPtr t $ \tptr tlen -> 111 | withConverter cnv $ \cptr -> do 112 | let capacity = fromIntegral . ucnv_max_bytes_for_string cptr . fromIntegral $ 113 | lengthWord t 114 | createAndTrim (fromIntegral capacity) $ \sptr -> 115 | fmap fromIntegral . handleError $ 116 | #if MIN_VERSION_text(2,0,0) 117 | ucnv_fromAlgorithmic_UTF8 118 | #else 119 | ucnv_fromUChars 120 | #endif 121 | cptr sptr capacity tptr (fromIntegral tlen) 122 | 123 | -- | Decode an encoded string into a Unicode string using the given converter. 124 | toUnicode :: Converter -> ByteString -> Text 125 | toUnicode cnv bs = 126 | unsafePerformIO . unsafeUseAsCStringLen bs $ \(sptr, slen) -> 127 | withConverter cnv $ \cptr -> do 128 | let (capacity, conv) = 129 | #if MIN_VERSION_text(2,0,0) 130 | (slen * 4, ucnv_toAlgorithmic_UTF8) 131 | #else 132 | (slen * 2, ucnv_toUChars) 133 | #endif 134 | allocaArray capacity $ \tptr -> 135 | fromPtr tptr =<< (fmap fromIntegral . handleError $ 136 | conv cptr tptr (fromIntegral capacity) sptr 137 | (fromIntegral slen)) 138 | 139 | -- | Determines whether the converter uses fallback mappings or not. 140 | -- This flag has restrictions. Regardless of this flag, the converter 141 | -- will always use fallbacks from Unicode Private Use codepoints, as 142 | -- well as reverse fallbacks (to Unicode). For details see \".ucm 143 | -- File Format\" in the Conversion Data chapter of the ICU User Guide: 144 | -- 145 | usesFallback :: Converter -> Bool 146 | usesFallback cnv = unsafePerformIO $ 147 | asBool `fmap` withConverter cnv ucnv_usesFallback 148 | 149 | -- | Returns the current default converter name. If you want to 'open' 150 | -- a default converter, you do not need to use this function. It is 151 | -- faster to pass the empty string to 'open' the default converter. 152 | getDefaultName :: IO String 153 | getDefaultName = peekCString =<< ucnv_getDefaultName 154 | 155 | -- | Indicates whether the converter contains ambiguous mappings of 156 | -- the same character or not. 157 | isAmbiguous :: Converter -> Bool 158 | isAmbiguous cnv = asBool . unsafePerformIO $ withConverter cnv ucnv_isAmbiguous 159 | 160 | -- | Sets the current default converter name. If this function needs 161 | -- to be called, it should be called during application 162 | -- initialization. Most of the time, the results from 'getDefaultName' 163 | -- or 'open' with an empty string argument is sufficient for your 164 | -- application. 165 | -- 166 | -- /Note/: this function is not thread safe. /Do not/ call this 167 | -- function when /any/ ICU function is being used from more than one 168 | -- thread! 169 | setDefaultName :: String -> IO () 170 | setDefaultName s = withCString s $ ucnv_setDefaultName 171 | 172 | -- | A list of the canonical names of all available converters. 173 | converterNames :: [String] 174 | {-# NOINLINE converterNames #-} 175 | converterNames = unsafePerformIO $ 176 | mapM ((peekCString =<<) . ucnv_getAvailableName) [0..ucnv_countAvailable-1] 177 | 178 | -- | The list of supported standard names. 179 | standardNames :: [String] 180 | {-# NOINLINE standardNames #-} 181 | standardNames = filter (not . null) . unsafePerformIO $ 182 | mapM ((peekCString =<<) . handleError . ucnv_getStandard) [0..ucnv_countStandards-1] 183 | 184 | -- | Return the aliases for a given converter or alias name. 185 | aliases :: String -> [String] 186 | aliases name = unsafePerformIO . withCString name $ \ptr -> do 187 | count <- handleError $ ucnv_countAliases ptr 188 | if count == 0 189 | then return [] 190 | else mapM ((peekCString =<<) . handleError . ucnv_getAlias ptr) [0..count-1] 191 | 192 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_open" ucnv_open 193 | :: CString -> Ptr UErrorCode -> IO (Ptr UConverter) 194 | 195 | foreign import ccall unsafe "hs_text_icu.h &__hs_ucnv_close" ucnv_close 196 | :: FunPtr (Ptr UConverter -> IO ()) 197 | 198 | foreign import ccall unsafe "__hs_ucnv_get_max_bytes_for_string" ucnv_max_bytes_for_string 199 | :: Ptr UConverter -> CInt -> CInt 200 | 201 | #if MIN_VERSION_text(2,0,0) 202 | 203 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_toAlgorithmic_UTF8" ucnv_toAlgorithmic_UTF8 204 | :: Ptr UConverter -> Ptr Word8 -> Int32 -> CString -> Int32 205 | -> Ptr UErrorCode -> IO Int32 206 | 207 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_fromAlgorithmic_UTF8" ucnv_fromAlgorithmic_UTF8 208 | :: Ptr UConverter -> Ptr Word8 -> Int32 -> Ptr Word8 -> Int32 209 | -> Ptr UErrorCode -> IO Int32 210 | 211 | #else 212 | 213 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_toUChars" ucnv_toUChars 214 | :: Ptr UConverter -> Ptr UChar -> Int32 -> CString -> Int32 215 | -> Ptr UErrorCode -> IO Int32 216 | 217 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_fromUChars" ucnv_fromUChars 218 | :: Ptr UConverter -> Ptr Word8 -> Int32 -> Ptr UChar -> Int32 219 | -> Ptr UErrorCode -> IO Int32 220 | 221 | #endif 222 | 223 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_compareNames" ucnv_compareNames 224 | :: CString -> CString -> IO CInt 225 | 226 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getDefaultName" ucnv_getDefaultName 227 | :: IO CString 228 | 229 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_setDefaultName" ucnv_setDefaultName 230 | :: CString -> IO () 231 | 232 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_countAvailable" ucnv_countAvailable 233 | :: Int32 234 | 235 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getAvailableName" ucnv_getAvailableName 236 | :: Int32 -> IO CString 237 | 238 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_countAliases" ucnv_countAliases 239 | :: CString -> Ptr UErrorCode -> IO Word16 240 | 241 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getAlias" ucnv_getAlias 242 | :: CString -> Word16 -> Ptr UErrorCode -> IO CString 243 | 244 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_countStandards" ucnv_countStandards 245 | :: Word16 246 | 247 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getStandard" ucnv_getStandard 248 | :: Word16 -> Ptr UErrorCode -> IO CString 249 | 250 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_usesFallback" ucnv_usesFallback 251 | :: Ptr UConverter -> IO UBool 252 | 253 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_setFallback" ucnv_setFallback 254 | :: Ptr UConverter -> UBool -> IO () 255 | 256 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_isAmbiguous" ucnv_isAmbiguous 257 | :: Ptr UConverter -> IO UBool 258 | -------------------------------------------------------------------------------- /Data/Text/ICU/Convert/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveDataTypeable, EmptyDataDecls, ForeignFunctionInterface #-} 2 | -- | 3 | -- Module : Data.Text.ICU.Convert.Internal 4 | -- Copyright : (c) Bryan O'Sullivan 2009 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- Low-level character set types and functions. 12 | 13 | module Data.Text.ICU.Convert.Internal 14 | ( 15 | Converter(..) 16 | , UConverter 17 | , getName 18 | , withConverter 19 | ) where 20 | 21 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError) 22 | import Data.Typeable (Typeable) 23 | import Foreign.C.String (CString, peekCString) 24 | import Foreign.ForeignPtr (ForeignPtr, withForeignPtr) 25 | import Foreign.Ptr (Ptr) 26 | import System.IO.Unsafe (unsafePerformIO) 27 | 28 | data UConverter 29 | 30 | -- | Character set converter type. /Note/: this structure is not 31 | -- thread safe. It is /not/ safe to use value of this type 32 | -- simultaneously from multiple threads. 33 | data Converter = Converter {-# UNPACK #-} !(ForeignPtr UConverter) 34 | deriving (Eq, Typeable) 35 | 36 | instance Show Converter where 37 | show c = "Converter " ++ show (getName c) 38 | 39 | withConverter :: Converter -> (Ptr UConverter -> IO a) -> IO a 40 | {-# INLINE withConverter #-} 41 | withConverter (Converter cnv) action = withForeignPtr cnv action 42 | 43 | -- | Gets the internal, canonical name of the converter. 44 | getName :: Converter -> String 45 | getName cnv = unsafePerformIO . 46 | withConverter cnv $ \ptr -> 47 | peekCString =<< handleError (ucnv_getName ptr) 48 | 49 | foreign import ccall unsafe "hs_text_icu.h __hs_ucnv_getName" ucnv_getName 50 | :: Ptr UConverter -> Ptr UErrorCode -> IO CString 51 | -------------------------------------------------------------------------------- /Data/Text/ICU/Enumerator.hsc: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE EmptyDataDecls, BangPatterns, ForeignFunctionInterface, RecordWildCards #-} 2 | -- | 3 | -- Module : Data.Text.ICU.Calendar 4 | -- Copyright : (c) 2021 Torsten Kemps-Benedix 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- Calendar functions implemented as bindings to 12 | -- the International Components for Unicode (ICU) libraries. 13 | 14 | module Data.Text.ICU.Enumerator 15 | (next, toList, createEnumerator, Enumerator, UEnumerator, 16 | ) where 17 | 18 | #include 19 | 20 | import Data.Int (Int32) 21 | import Data.Text (Text) 22 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError) 23 | import Data.Text.ICU.Internal (UChar, newICUPtr, fromUCharPtr) 24 | import Foreign.ForeignPtr (withForeignPtr, ForeignPtr) 25 | import Foreign.Marshal.Alloc (alloca) 26 | import Foreign.Ptr (FunPtr, Ptr, nullPtr) 27 | import Foreign.Storable (peek) 28 | import Prelude hiding (last) 29 | 30 | data UEnumerator 31 | 32 | newtype Enumerator = Enumerator {enumeratorToForeignPtr :: ForeignPtr UEnumerator} 33 | 34 | createEnumerator :: IO (Ptr UEnumerator) -> IO Enumerator 35 | createEnumerator = newICUPtr Enumerator uenum_close 36 | 37 | next :: Enumerator -> IO (Maybe Text) 38 | next enum = withForeignPtr (enumeratorToForeignPtr enum) $ \enumPtr -> 39 | alloca $ \lenPtr -> do 40 | textPtr <- handleError $ uenum_unext enumPtr lenPtr 41 | if textPtr == nullPtr 42 | then pure Nothing 43 | else do 44 | n <- peek lenPtr 45 | t <- fromUCharPtr textPtr (fromIntegral n) 46 | pure $ Just t 47 | 48 | toList :: Enumerator -> IO [Text] 49 | toList enum = reverse <$> go [] 50 | where 51 | go l = do 52 | mx <- next enum 53 | case mx of 54 | Nothing -> pure l 55 | Just x -> go (x:l) 56 | 57 | foreign import ccall unsafe "hs_text_icu.h &__hs_uenum_close" uenum_close 58 | :: FunPtr (Ptr UEnumerator -> IO ()) 59 | foreign import ccall unsafe "hs_text_icu.h __hs_uenum_unext" uenum_unext 60 | :: Ptr UEnumerator -> Ptr Int32 -> Ptr UErrorCode 61 | -> IO (Ptr UChar) 62 | -------------------------------------------------------------------------------- /Data/Text/ICU/Error.hsc: -------------------------------------------------------------------------------- 1 | -- | 2 | -- Module : Data.Text.ICU.Error 3 | -- Copyright : (c) 2010 Bryan O'Sullivan 4 | -- 5 | -- License : BSD-style 6 | -- Maintainer : bos@serpentine.com 7 | -- Stability : experimental 8 | -- Portability : GHC 9 | -- 10 | -- Errors thrown by bindings to the International Components for 11 | -- Unicode (ICU) libraries. 12 | -- 13 | -- Most ICU functions can throw an 'ICUError' value as an exception. 14 | -- Some can additionally throw a 'ParseError', if more detailed error 15 | -- information is necessary. 16 | module Data.Text.ICU.Error 17 | ( 18 | -- * Types 19 | ICUError, 20 | ParseError(errError, errLine, errOffset), 21 | 22 | -- * Functions 23 | isSuccess, 24 | isFailure, 25 | errorName, 26 | isRegexError, 27 | 28 | -- * Warnings 29 | u_USING_FALLBACK_WARNING, 30 | u_USING_DEFAULT_WARNING, 31 | u_SAFECLONE_ALLOCATED_WARNING, 32 | u_STATE_OLD_WARNING, 33 | u_STRING_NOT_TERMINATED_WARNING, 34 | u_SORT_KEY_TOO_SHORT_WARNING, 35 | u_AMBIGUOUS_ALIAS_WARNING, 36 | u_DIFFERENT_UCA_VERSION, 37 | 38 | -- * Errors 39 | u_ILLEGAL_ARGUMENT_ERROR, 40 | u_MISSING_RESOURCE_ERROR, 41 | u_INVALID_FORMAT_ERROR, 42 | u_FILE_ACCESS_ERROR, 43 | u_INTERNAL_PROGRAM_ERROR, 44 | u_MESSAGE_PARSE_ERROR, 45 | u_MEMORY_ALLOCATION_ERROR, 46 | u_INDEX_OUTOFBOUNDS_ERROR, 47 | u_PARSE_ERROR, 48 | u_INVALID_CHAR_FOUND, 49 | u_TRUNCATED_CHAR_FOUND, 50 | u_ILLEGAL_CHAR_FOUND, 51 | u_INVALID_TABLE_FORMAT, 52 | u_INVALID_TABLE_FILE, 53 | u_BUFFER_OVERFLOW_ERROR, 54 | u_UNSUPPORTED_ERROR, 55 | u_RESOURCE_TYPE_MISMATCH, 56 | u_ILLEGAL_ESCAPE_SEQUENCE, 57 | u_UNSUPPORTED_ESCAPE_SEQUENCE, 58 | u_NO_SPACE_AVAILABLE, 59 | u_CE_NOT_FOUND_ERROR, 60 | u_PRIMARY_TOO_LONG_ERROR, 61 | u_STATE_TOO_OLD_ERROR, 62 | u_TOO_MANY_ALIASES_ERROR, 63 | u_ENUM_OUT_OF_SYNC_ERROR, 64 | u_INVARIANT_CONVERSION_ERROR, 65 | u_INVALID_STATE_ERROR, 66 | u_COLLATOR_VERSION_MISMATCH, 67 | u_USELESS_COLLATOR_ERROR, 68 | u_NO_WRITE_PERMISSION, 69 | 70 | -- ** Transliterator errors 71 | u_BAD_VARIABLE_DEFINITION, 72 | u_MALFORMED_RULE, 73 | u_MALFORMED_SET, 74 | u_MALFORMED_UNICODE_ESCAPE, 75 | u_MALFORMED_VARIABLE_DEFINITION, 76 | u_MALFORMED_VARIABLE_REFERENCE, 77 | u_MISPLACED_CURSOR_OFFSET, 78 | u_MISPLACED_QUANTIFIER, 79 | u_MISSING_OPERATOR, 80 | u_MULTIPLE_ANTE_CONTEXTS, 81 | u_MULTIPLE_CURSORS, 82 | u_MULTIPLE_POST_CONTEXTS, 83 | u_TRAILING_BACKSLASH, 84 | u_UNDEFINED_SEGMENT_REFERENCE, 85 | u_UNDEFINED_VARIABLE, 86 | u_UNQUOTED_SPECIAL, 87 | u_UNTERMINATED_QUOTE, 88 | u_RULE_MASK_ERROR, 89 | u_MISPLACED_COMPOUND_FILTER, 90 | u_MULTIPLE_COMPOUND_FILTERS, 91 | u_INVALID_RBT_SYNTAX, 92 | u_MALFORMED_PRAGMA, 93 | u_UNCLOSED_SEGMENT, 94 | u_VARIABLE_RANGE_EXHAUSTED, 95 | u_VARIABLE_RANGE_OVERLAP, 96 | u_ILLEGAL_CHARACTER, 97 | u_INTERNAL_TRANSLITERATOR_ERROR, 98 | u_INVALID_ID, 99 | u_INVALID_FUNCTION, 100 | 101 | -- ** Formatting API parsing errors 102 | u_UNEXPECTED_TOKEN, 103 | u_MULTIPLE_DECIMAL_SEPARATORS, 104 | u_MULTIPLE_EXPONENTIAL_SYMBOLS, 105 | u_MALFORMED_EXPONENTIAL_PATTERN, 106 | u_MULTIPLE_PERCENT_SYMBOLS, 107 | u_MULTIPLE_PERMILL_SYMBOLS, 108 | u_MULTIPLE_PAD_SPECIFIERS, 109 | u_PATTERN_SYNTAX_ERROR, 110 | u_ILLEGAL_PAD_POSITION, 111 | u_UNMATCHED_BRACES, 112 | u_ARGUMENT_TYPE_MISMATCH, 113 | u_DUPLICATE_KEYWORD, 114 | u_UNDEFINED_KEYWORD, 115 | u_DEFAULT_KEYWORD_MISSING, 116 | 117 | -- ** Break iterator errors 118 | u_BRK_INTERNAL_ERROR, 119 | u_BRK_HEX_DIGITS_EXPECTED, 120 | u_BRK_SEMICOLON_EXPECTED, 121 | u_BRK_RULE_SYNTAX, 122 | u_BRK_UNCLOSED_SET, 123 | u_BRK_ASSIGN_ERROR, 124 | u_BRK_VARIABLE_REDFINITION, 125 | u_BRK_MISMATCHED_PAREN, 126 | u_BRK_NEW_LINE_IN_QUOTED_STRING, 127 | u_BRK_UNDEFINED_VARIABLE, 128 | u_BRK_INIT_ERROR, 129 | u_BRK_RULE_EMPTY_SET, 130 | u_BRK_UNRECOGNIZED_OPTION, 131 | u_BRK_MALFORMED_RULE_TAG, 132 | 133 | -- ** Regular expression errors 134 | u_REGEX_INTERNAL_ERROR, 135 | u_REGEX_RULE_SYNTAX, 136 | u_REGEX_INVALID_STATE, 137 | u_REGEX_BAD_ESCAPE_SEQUENCE, 138 | u_REGEX_PROPERTY_SYNTAX, 139 | u_REGEX_UNIMPLEMENTED, 140 | u_REGEX_MISMATCHED_PAREN, 141 | u_REGEX_NUMBER_TOO_BIG, 142 | u_REGEX_BAD_INTERVAL, 143 | u_REGEX_MAX_LT_MIN, 144 | u_REGEX_INVALID_BACK_REF, 145 | u_REGEX_INVALID_FLAG, 146 | u_REGEX_SET_CONTAINS_STRING, 147 | u_REGEX_OCTAL_TOO_BIG, 148 | u_REGEX_INVALID_RANGE, 149 | u_REGEX_STACK_OVERFLOW, 150 | u_REGEX_TIME_OUT, 151 | u_REGEX_STOPPED_BY_CALLER, 152 | 153 | -- ** IDNA errors 154 | u_IDNA_PROHIBITED_ERROR, 155 | u_IDNA_UNASSIGNED_ERROR, 156 | u_IDNA_CHECK_BIDI_ERROR, 157 | u_IDNA_STD3_ASCII_RULES_ERROR, 158 | u_IDNA_ACE_PREFIX_ERROR, 159 | u_IDNA_VERIFICATION_ERROR, 160 | u_IDNA_LABEL_TOO_LONG_ERROR, 161 | u_IDNA_ZERO_LENGTH_LABEL_ERROR, 162 | u_IDNA_DOMAIN_NAME_TOO_LONG_ERROR 163 | ) where 164 | 165 | #ifdef mingw32_HOST_OS 166 | #define U_HAVE_INTTYPES_H 1 167 | #endif 168 | 169 | #include 170 | 171 | import Data.Text.ICU.Error.Internal 172 | 173 | #{enum ICUError, ICUError, 174 | u_USING_FALLBACK_WARNING = U_USING_FALLBACK_WARNING, 175 | u_USING_DEFAULT_WARNING = U_USING_DEFAULT_WARNING, 176 | u_SAFECLONE_ALLOCATED_WARNING = U_SAFECLONE_ALLOCATED_WARNING, 177 | u_STATE_OLD_WARNING = U_STATE_OLD_WARNING, 178 | u_STRING_NOT_TERMINATED_WARNING = U_STRING_NOT_TERMINATED_WARNING, 179 | u_SORT_KEY_TOO_SHORT_WARNING = U_SORT_KEY_TOO_SHORT_WARNING, 180 | u_AMBIGUOUS_ALIAS_WARNING = U_AMBIGUOUS_ALIAS_WARNING, 181 | u_DIFFERENT_UCA_VERSION = U_DIFFERENT_UCA_VERSION, 182 | u_ILLEGAL_ARGUMENT_ERROR = U_ILLEGAL_ARGUMENT_ERROR, 183 | u_MISSING_RESOURCE_ERROR = U_MISSING_RESOURCE_ERROR, 184 | u_INVALID_FORMAT_ERROR = U_INVALID_FORMAT_ERROR, 185 | u_FILE_ACCESS_ERROR = U_FILE_ACCESS_ERROR, 186 | u_INTERNAL_PROGRAM_ERROR = U_INTERNAL_PROGRAM_ERROR, 187 | u_MESSAGE_PARSE_ERROR = U_MESSAGE_PARSE_ERROR, 188 | u_MEMORY_ALLOCATION_ERROR = U_MEMORY_ALLOCATION_ERROR, 189 | u_INDEX_OUTOFBOUNDS_ERROR = U_INDEX_OUTOFBOUNDS_ERROR, 190 | u_PARSE_ERROR = U_PARSE_ERROR, 191 | u_INVALID_CHAR_FOUND = U_INVALID_CHAR_FOUND, 192 | u_TRUNCATED_CHAR_FOUND = U_TRUNCATED_CHAR_FOUND, 193 | u_ILLEGAL_CHAR_FOUND = U_ILLEGAL_CHAR_FOUND, 194 | u_INVALID_TABLE_FORMAT = U_INVALID_TABLE_FORMAT, 195 | u_INVALID_TABLE_FILE = U_INVALID_TABLE_FILE, 196 | u_BUFFER_OVERFLOW_ERROR = U_BUFFER_OVERFLOW_ERROR, 197 | u_UNSUPPORTED_ERROR = U_UNSUPPORTED_ERROR, 198 | u_RESOURCE_TYPE_MISMATCH = U_RESOURCE_TYPE_MISMATCH, 199 | u_ILLEGAL_ESCAPE_SEQUENCE = U_ILLEGAL_ESCAPE_SEQUENCE, 200 | u_UNSUPPORTED_ESCAPE_SEQUENCE = U_UNSUPPORTED_ESCAPE_SEQUENCE, 201 | u_NO_SPACE_AVAILABLE = U_NO_SPACE_AVAILABLE, 202 | u_CE_NOT_FOUND_ERROR = U_CE_NOT_FOUND_ERROR, 203 | u_PRIMARY_TOO_LONG_ERROR = U_PRIMARY_TOO_LONG_ERROR, 204 | u_STATE_TOO_OLD_ERROR = U_STATE_TOO_OLD_ERROR, 205 | u_TOO_MANY_ALIASES_ERROR = U_TOO_MANY_ALIASES_ERROR, 206 | u_ENUM_OUT_OF_SYNC_ERROR = U_ENUM_OUT_OF_SYNC_ERROR, 207 | u_INVARIANT_CONVERSION_ERROR = U_INVARIANT_CONVERSION_ERROR, 208 | u_INVALID_STATE_ERROR = U_INVALID_STATE_ERROR, 209 | u_COLLATOR_VERSION_MISMATCH = U_COLLATOR_VERSION_MISMATCH, 210 | u_USELESS_COLLATOR_ERROR = U_USELESS_COLLATOR_ERROR, 211 | u_NO_WRITE_PERMISSION = U_NO_WRITE_PERMISSION, 212 | u_BAD_VARIABLE_DEFINITION = U_BAD_VARIABLE_DEFINITION, 213 | u_MALFORMED_RULE = U_MALFORMED_RULE, 214 | u_MALFORMED_SET = U_MALFORMED_SET, 215 | u_MALFORMED_UNICODE_ESCAPE = U_MALFORMED_UNICODE_ESCAPE, 216 | u_MALFORMED_VARIABLE_DEFINITION = U_MALFORMED_VARIABLE_DEFINITION, 217 | u_MALFORMED_VARIABLE_REFERENCE = U_MALFORMED_VARIABLE_REFERENCE, 218 | u_MISPLACED_CURSOR_OFFSET = U_MISPLACED_CURSOR_OFFSET, 219 | u_MISPLACED_QUANTIFIER = U_MISPLACED_QUANTIFIER, 220 | u_MISSING_OPERATOR = U_MISSING_OPERATOR, 221 | u_MULTIPLE_ANTE_CONTEXTS = U_MULTIPLE_ANTE_CONTEXTS, 222 | u_MULTIPLE_CURSORS = U_MULTIPLE_CURSORS, 223 | u_MULTIPLE_POST_CONTEXTS = U_MULTIPLE_POST_CONTEXTS, 224 | u_TRAILING_BACKSLASH = U_TRAILING_BACKSLASH, 225 | u_UNDEFINED_SEGMENT_REFERENCE = U_UNDEFINED_SEGMENT_REFERENCE, 226 | u_UNDEFINED_VARIABLE = U_UNDEFINED_VARIABLE, 227 | u_UNQUOTED_SPECIAL = U_UNQUOTED_SPECIAL, 228 | u_UNTERMINATED_QUOTE = U_UNTERMINATED_QUOTE, 229 | u_RULE_MASK_ERROR = U_RULE_MASK_ERROR, 230 | u_MISPLACED_COMPOUND_FILTER = U_MISPLACED_COMPOUND_FILTER, 231 | u_MULTIPLE_COMPOUND_FILTERS = U_MULTIPLE_COMPOUND_FILTERS, 232 | u_INVALID_RBT_SYNTAX = U_INVALID_RBT_SYNTAX, 233 | u_MALFORMED_PRAGMA = U_MALFORMED_PRAGMA, 234 | u_UNCLOSED_SEGMENT = U_UNCLOSED_SEGMENT, 235 | u_VARIABLE_RANGE_EXHAUSTED = U_VARIABLE_RANGE_EXHAUSTED, 236 | u_VARIABLE_RANGE_OVERLAP = U_VARIABLE_RANGE_OVERLAP, 237 | u_ILLEGAL_CHARACTER = U_ILLEGAL_CHARACTER, 238 | u_INTERNAL_TRANSLITERATOR_ERROR = U_INTERNAL_TRANSLITERATOR_ERROR, 239 | u_INVALID_ID = U_INVALID_ID, 240 | u_INVALID_FUNCTION = U_INVALID_FUNCTION, 241 | u_UNEXPECTED_TOKEN = U_UNEXPECTED_TOKEN, 242 | u_MULTIPLE_DECIMAL_SEPARATORS = U_MULTIPLE_DECIMAL_SEPARATORS, 243 | u_MULTIPLE_EXPONENTIAL_SYMBOLS = U_MULTIPLE_EXPONENTIAL_SYMBOLS, 244 | u_MALFORMED_EXPONENTIAL_PATTERN = U_MALFORMED_EXPONENTIAL_PATTERN, 245 | u_MULTIPLE_PERCENT_SYMBOLS = U_MULTIPLE_PERCENT_SYMBOLS, 246 | u_MULTIPLE_PERMILL_SYMBOLS = U_MULTIPLE_PERMILL_SYMBOLS, 247 | u_MULTIPLE_PAD_SPECIFIERS = U_MULTIPLE_PAD_SPECIFIERS, 248 | u_PATTERN_SYNTAX_ERROR = U_PATTERN_SYNTAX_ERROR, 249 | u_ILLEGAL_PAD_POSITION = U_ILLEGAL_PAD_POSITION, 250 | u_UNMATCHED_BRACES = U_UNMATCHED_BRACES, 251 | u_ARGUMENT_TYPE_MISMATCH = U_ARGUMENT_TYPE_MISMATCH, 252 | u_DUPLICATE_KEYWORD = U_DUPLICATE_KEYWORD, 253 | u_UNDEFINED_KEYWORD = U_UNDEFINED_KEYWORD, 254 | u_DEFAULT_KEYWORD_MISSING = U_DEFAULT_KEYWORD_MISSING, 255 | u_BRK_INTERNAL_ERROR = U_BRK_INTERNAL_ERROR, 256 | u_BRK_HEX_DIGITS_EXPECTED = U_BRK_HEX_DIGITS_EXPECTED, 257 | u_BRK_SEMICOLON_EXPECTED = U_BRK_SEMICOLON_EXPECTED, 258 | u_BRK_RULE_SYNTAX = U_BRK_RULE_SYNTAX, 259 | u_BRK_UNCLOSED_SET = U_BRK_UNCLOSED_SET, 260 | u_BRK_ASSIGN_ERROR = U_BRK_ASSIGN_ERROR, 261 | u_BRK_VARIABLE_REDFINITION = U_BRK_VARIABLE_REDFINITION, 262 | u_BRK_MISMATCHED_PAREN = U_BRK_MISMATCHED_PAREN, 263 | u_BRK_NEW_LINE_IN_QUOTED_STRING = U_BRK_NEW_LINE_IN_QUOTED_STRING, 264 | u_BRK_UNDEFINED_VARIABLE = U_BRK_UNDEFINED_VARIABLE, 265 | u_BRK_INIT_ERROR = U_BRK_INIT_ERROR, 266 | u_BRK_RULE_EMPTY_SET = U_BRK_RULE_EMPTY_SET, 267 | u_BRK_UNRECOGNIZED_OPTION = U_BRK_UNRECOGNIZED_OPTION, 268 | u_BRK_MALFORMED_RULE_TAG = U_BRK_MALFORMED_RULE_TAG, 269 | u_REGEX_INTERNAL_ERROR = U_REGEX_INTERNAL_ERROR, 270 | u_REGEX_RULE_SYNTAX = U_REGEX_RULE_SYNTAX, 271 | u_REGEX_INVALID_STATE = U_REGEX_INVALID_STATE, 272 | u_REGEX_BAD_ESCAPE_SEQUENCE = U_REGEX_BAD_ESCAPE_SEQUENCE, 273 | u_REGEX_PROPERTY_SYNTAX = U_REGEX_PROPERTY_SYNTAX, 274 | u_REGEX_UNIMPLEMENTED = U_REGEX_UNIMPLEMENTED, 275 | u_REGEX_MISMATCHED_PAREN = U_REGEX_MISMATCHED_PAREN, 276 | u_REGEX_NUMBER_TOO_BIG = U_REGEX_NUMBER_TOO_BIG, 277 | u_REGEX_BAD_INTERVAL = U_REGEX_BAD_INTERVAL, 278 | u_REGEX_MAX_LT_MIN = U_REGEX_MAX_LT_MIN, 279 | u_REGEX_INVALID_BACK_REF = U_REGEX_INVALID_BACK_REF, 280 | u_REGEX_INVALID_FLAG = U_REGEX_INVALID_FLAG, 281 | u_REGEX_SET_CONTAINS_STRING = U_REGEX_SET_CONTAINS_STRING, 282 | u_REGEX_OCTAL_TOO_BIG = U_REGEX_OCTAL_TOO_BIG, 283 | u_REGEX_INVALID_RANGE = U_REGEX_INVALID_RANGE, 284 | u_REGEX_STACK_OVERFLOW = U_REGEX_STACK_OVERFLOW, 285 | u_REGEX_TIME_OUT = U_REGEX_TIME_OUT, 286 | u_REGEX_STOPPED_BY_CALLER = U_REGEX_STOPPED_BY_CALLER, 287 | u_IDNA_PROHIBITED_ERROR = U_IDNA_PROHIBITED_ERROR, 288 | u_IDNA_UNASSIGNED_ERROR = U_IDNA_UNASSIGNED_ERROR, 289 | u_IDNA_CHECK_BIDI_ERROR = U_IDNA_CHECK_BIDI_ERROR, 290 | u_IDNA_STD3_ASCII_RULES_ERROR = U_IDNA_STD3_ASCII_RULES_ERROR, 291 | u_IDNA_ACE_PREFIX_ERROR = U_IDNA_ACE_PREFIX_ERROR, 292 | u_IDNA_VERIFICATION_ERROR = U_IDNA_VERIFICATION_ERROR, 293 | u_IDNA_LABEL_TOO_LONG_ERROR = U_IDNA_LABEL_TOO_LONG_ERROR, 294 | u_IDNA_ZERO_LENGTH_LABEL_ERROR = U_IDNA_ZERO_LENGTH_LABEL_ERROR, 295 | u_IDNA_DOMAIN_NAME_TOO_LONG_ERROR = U_IDNA_DOMAIN_NAME_TOO_LONG_ERROR 296 | } 297 | 298 | isRegexError :: ICUError -> Bool 299 | isRegexError (ICUError err) = 300 | err >= #{const U_REGEX_ERROR_START} && err < #{const U_REGEX_ERROR_LIMIT} 301 | -------------------------------------------------------------------------------- /Data/Text/ICU/Error/Internal.hsc: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, DeriveDataTypeable, ForeignFunctionInterface, 2 | RecordWildCards, ScopedTypeVariables #-} 3 | 4 | module Data.Text.ICU.Error.Internal 5 | ( 6 | -- * Types 7 | ICUError(..) 8 | -- ** Low-level types 9 | , UErrorCode 10 | , ParseError(errError, errLine, errOffset) 11 | , UParseError 12 | -- * Functions 13 | , isFailure 14 | , isSuccess 15 | , errorName 16 | , handleError 17 | , handleOverflowError 18 | , handleParseError 19 | , throwOnError 20 | , withError 21 | ) where 22 | 23 | import Control.DeepSeq (NFData(..)) 24 | import Control.Exception (Exception, throwIO) 25 | import Data.Function (fix) 26 | import Foreign.Ptr (Ptr) 27 | import Foreign.Marshal.Alloc (alloca, allocaBytes) 28 | import Foreign.Marshal.Utils (with) 29 | import Foreign.Marshal.Array (allocaArray) 30 | import Data.Int (Int32) 31 | import Data.Typeable (Typeable) 32 | import Foreign.C.String (CString, peekCString) 33 | import Foreign.C.Types (CInt(..)) 34 | import Foreign.Storable (Storable(..)) 35 | import System.IO.Unsafe (unsafePerformIO) 36 | 37 | #include 38 | #include 39 | 40 | type UErrorCode = CInt 41 | 42 | -- | ICU error type. This is an instance of the 'Exception' type 43 | -- class. A value of this type may be thrown as an exception by most 44 | -- ICU functions. 45 | newtype ICUError = ICUError { 46 | fromErrorCode :: UErrorCode 47 | } deriving (Eq, Typeable) 48 | 49 | instance Show ICUError where 50 | show code = "ICUError " ++ errorName code 51 | 52 | instance Exception ICUError 53 | 54 | instance NFData ICUError where 55 | rnf !_ = () 56 | 57 | -- | Detailed information about parsing errors. Used by ICU parsing 58 | -- engines that parse long rules, patterns, or programs, where the 59 | -- text being parsed is long enough that more information than an 60 | -- 'ICUError' is needed to localize the error. 61 | data ParseError = ParseError { 62 | errError :: ICUError 63 | , errLine :: !(Maybe Int) 64 | -- ^ The line on which the error occured. If the parser uses this 65 | -- field, it sets it to the line number of the source text line on 66 | -- which the error appears, which will be be a positive value. If 67 | -- the parser does not support line numbers, the value will be 68 | -- 'Nothing'. 69 | , errOffset :: !(Maybe Int) 70 | -- ^ The character offset to the error. If the 'errLine' field is 71 | -- 'Just' some value, then this field contains the offset from the 72 | -- beginning of the line that contains the error. Otherwise, it 73 | -- represents the offset from the start of the text. If the 74 | -- parser does not support this field, it will have a value of 75 | -- 'Nothing'. 76 | } deriving (Show, Typeable) 77 | 78 | instance NFData ParseError where 79 | rnf ParseError{..} = rnf errError `seq` rnf errLine `seq` rnf errOffset 80 | 81 | type UParseError = ParseError 82 | 83 | instance Exception ParseError 84 | 85 | -- | Indicate whether the given error code is a success. 86 | isSuccess :: ICUError -> Bool 87 | {-# INLINE isSuccess #-} 88 | isSuccess = (<= 0) . fromErrorCode 89 | 90 | -- | Indicate whether the given error code is a failure. 91 | isFailure :: ICUError -> Bool 92 | {-# INLINE isFailure #-} 93 | isFailure = (> 0) . fromErrorCode 94 | 95 | -- | Throw an exception if the given code is actually an error. 96 | throwOnError :: UErrorCode -> IO () 97 | {-# INLINE throwOnError #-} 98 | throwOnError code = do 99 | let err = (ICUError code) 100 | if isFailure err 101 | then throwIO err 102 | else return () 103 | 104 | withError :: (Ptr UErrorCode -> IO a) -> IO (ICUError, a) 105 | {-# INLINE withError #-} 106 | withError action = with 0 $ \errPtr -> do 107 | ret <- action errPtr 108 | err <- peek errPtr 109 | return (ICUError err, ret) 110 | 111 | handleError :: (Ptr UErrorCode -> IO a) -> IO a 112 | {-# INLINE handleError #-} 113 | handleError action = with 0 $ \errPtr -> do 114 | ret <- action errPtr 115 | throwOnError =<< peek errPtr 116 | return ret 117 | 118 | -- | Deal with ICU functions that report a buffer overflow error if we 119 | -- give them an insufficiently large buffer. Our first call will 120 | -- report a buffer overflow, in which case we allocate a correctly 121 | -- sized buffer and try again. 122 | handleOverflowError :: (Storable a) => 123 | Int 124 | -- ^ Initial guess at buffer size. 125 | -> (Ptr a -> Int32 -> Ptr UErrorCode -> IO Int32) 126 | -- ^ Function that retrieves data. 127 | -> (Ptr a -> Int -> IO b) 128 | -- ^ Function that fills destination buffer if no 129 | -- overflow occurred. 130 | -> IO b 131 | handleOverflowError guess fill retrieve = 132 | alloca $ \uerrPtr -> flip fix guess $ \loop n -> 133 | (either (loop . fromIntegral) return =<<) . allocaArray n $ \ptr -> do 134 | poke uerrPtr 0 135 | ret <- fill ptr (fromIntegral n) uerrPtr 136 | err <- peek uerrPtr 137 | case undefined of 138 | _| err == (#const U_BUFFER_OVERFLOW_ERROR) 139 | -> return (Left ret) 140 | | err > 0 -> throwIO (ICUError err) 141 | | otherwise -> Right `fmap` retrieve ptr (fromIntegral ret) 142 | 143 | handleParseError :: (ICUError -> Bool) 144 | -> (Ptr UParseError -> Ptr UErrorCode -> IO a) -> IO a 145 | handleParseError isParseError action = with 0 $ \uerrPtr -> 146 | allocaBytes (#{size UParseError}) $ \perrPtr -> do 147 | ret <- action perrPtr uerrPtr 148 | err <- ICUError `fmap` peek uerrPtr 149 | case undefined of 150 | _| isParseError err -> throwParseError perrPtr err 151 | | isFailure err -> throwIO err 152 | | otherwise -> return ret 153 | 154 | throwParseError :: Ptr UParseError -> ICUError -> IO a 155 | throwParseError ptr err = do 156 | (line::Int32) <- #{peek UParseError, line} ptr 157 | (offset::Int32) <- #{peek UParseError, offset} ptr 158 | let wrap k = if k == -1 then Nothing else Just $! fromIntegral k 159 | throwIO $! ParseError err (wrap line) (wrap offset) 160 | 161 | -- | Return a string representing the name of the given error code. 162 | errorName :: ICUError -> String 163 | errorName code = unsafePerformIO $ 164 | peekCString (u_errorName (fromErrorCode code)) 165 | 166 | foreign import ccall unsafe "hs_text_icu.h __hs_u_errorName" u_errorName 167 | :: UErrorCode -> CString 168 | -------------------------------------------------------------------------------- /Data/Text/ICU/Internal.hsc: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE EmptyDataDecls, ForeignFunctionInterface, GeneralizedNewtypeDeriving, TupleSections #-} 2 | 3 | module Data.Text.ICU.Internal 4 | ( 5 | LocaleName(..) 6 | , UBool 7 | , UChar 8 | , UChar32 9 | , UCharIterator 10 | , CharIterator(..) 11 | , UText, UTextPtr 12 | , asBool 13 | , asOrdering 14 | , withCharIterator 15 | , withLocaleName 16 | , withName 17 | , useAsUCharPtr, fromUCharPtr, I16, asUCharForeignPtr 18 | , asUTextPtr, withUTextPtr, withUTextPtrText, emptyUTextPtr, utextPtrLength 19 | , TextI, takeWord, dropWord, lengthWord 20 | , newICUPtr 21 | ) where 22 | 23 | #include 24 | 25 | import Control.Exception (mask_) 26 | import Control.DeepSeq (NFData(..)) 27 | import Data.ByteString.Internal (ByteString(..)) 28 | import Data.Int (Int8, Int32, Int64) 29 | import Data.String (IsString(..)) 30 | import Data.Text (Text, empty) 31 | import Data.Text.Encoding (decodeUtf8) 32 | import Data.Text.Foreign (useAsPtr, asForeignPtr, fromPtr) 33 | #if MIN_VERSION_text(2,0,0) 34 | import Data.Text.Foreign (I8, dropWord8, takeWord8, lengthWord8) 35 | import Foreign.ForeignPtr (mallocForeignPtrArray) 36 | import Foreign.Marshal.Array (allocaArray) 37 | import Foreign.Storable (peek) 38 | #else 39 | import Data.Text.Foreign (I16, dropWord16, takeWord16, lengthWord16) 40 | #endif 41 | import Data.Word (Word8, Word16, Word32) 42 | import Foreign.C.String (CString, withCString) 43 | import Foreign.ForeignPtr (withForeignPtr, ForeignPtr, newForeignPtr, FinalizerPtr) 44 | import Foreign.Marshal.Alloc (allocaBytes) 45 | import Foreign.Marshal.Utils (with) 46 | import Foreign.Ptr (Ptr, nullPtr, FunPtr) 47 | import Data.Text.ICU.Error.Internal (UErrorCode) 48 | import System.IO.Unsafe (unsafePerformIO) 49 | 50 | -- | A type that supports efficient iteration over Unicode characters. 51 | -- 52 | -- As an example of where this may be useful, a function using this 53 | -- type may be able to iterate over a UTF-8 'ByteString' directly, 54 | -- rather than first copying and converting it to an intermediate 55 | -- form. This type also allows e.g. comparison between 'Text' and 56 | -- 'ByteString', with minimal overhead. 57 | data CharIterator = CIText !Text 58 | | CIUTF8 !ByteString 59 | 60 | instance Show CharIterator where 61 | show (CIText t) = show t 62 | show (CIUTF8 bs) = show (decodeUtf8 bs) 63 | 64 | data UCharIterator 65 | 66 | -- | Temporarily allocate a 'UCharIterator' and use it with the 67 | -- contents of the to-be-iterated-over string. 68 | withCharIterator :: CharIterator -> (Ptr UCharIterator -> IO a) -> IO a 69 | withCharIterator (CIUTF8 (PS fp _ l)) act = 70 | allocaBytes (#{size UCharIterator}) $ \i -> withForeignPtr fp $ \p -> 71 | uiter_setUTF8 i p (fromIntegral l) >> act i 72 | withCharIterator (CIText t) act = 73 | allocaBytes (#{size UCharIterator}) $ \i -> useAsPtr t $ \p l -> 74 | #if MIN_VERSION_text(2,0,0) 75 | uiter_setUTF8 i p (fromIntegral l) >> act i 76 | #else 77 | uiter_setString i p (fromIntegral l) >> act i 78 | #endif 79 | 80 | type UBool = Int8 81 | type UChar = Word16 82 | type UChar32 = Word32 83 | 84 | asBool :: Integral a => a -> Bool 85 | {-# INLINE asBool #-} 86 | asBool = (/=0) 87 | 88 | asOrdering :: Integral a => a -> Ordering 89 | {-# INLINE asOrdering #-} 90 | asOrdering i 91 | | i < 0 = LT 92 | | i == 0 = EQ 93 | | otherwise = GT 94 | 95 | withName :: String -> (CString -> IO a) -> IO a 96 | withName name act 97 | | null name = act nullPtr 98 | | otherwise = withCString name act 99 | 100 | -- | The name of a locale. 101 | data LocaleName = Root 102 | -- ^ The root locale. For a description of resource bundles 103 | -- and the root resource, see 104 | -- . 105 | | Locale String -- ^ A specific locale. 106 | | Current -- ^ The program's current locale. 107 | deriving (Eq, Ord, Read, Show) 108 | 109 | instance NFData LocaleName where 110 | rnf Root = () 111 | rnf (Locale l) = rnf l 112 | rnf Current = () 113 | 114 | instance IsString LocaleName where 115 | fromString = Locale 116 | 117 | withLocaleName :: LocaleName -> (CString -> IO a) -> IO a 118 | withLocaleName Current act = act nullPtr 119 | withLocaleName Root act = withCString "" act 120 | withLocaleName (Locale n) act = withCString n act 121 | 122 | #if !MIN_VERSION_text(2,0,0) 123 | foreign import ccall unsafe "hs_text_icu.h __hs_uiter_setString" uiter_setString 124 | :: Ptr UCharIterator -> Ptr UChar -> Int32 -> IO () 125 | #endif 126 | 127 | foreign import ccall unsafe "hs_text_icu.h __hs_uiter_setUTF8" uiter_setUTF8 128 | :: Ptr UCharIterator -> Ptr Word8 -> Int32 -> IO () 129 | 130 | 131 | data UText 132 | 133 | -- | Pointer to UText which also keeps pointer to source text so it won't be 134 | -- garbage collected. 135 | data UTextPtr 136 | = UTextPtr 137 | { utextPtr :: ForeignPtr UText 138 | , utextPtrText :: ForeignPtr TextChar 139 | , utextPtrLength :: TextI 140 | } 141 | 142 | emptyUTextPtr :: UTextPtr 143 | emptyUTextPtr = unsafePerformIO $ asUTextPtr empty 144 | {-# NOINLINE emptyUTextPtr #-} 145 | 146 | withUTextPtr :: UTextPtr -> (Ptr UText -> IO a) -> IO a 147 | withUTextPtr = withForeignPtr . utextPtr 148 | 149 | withUTextPtrText :: UTextPtr -> (Ptr TextChar -> IO a) -> IO a 150 | withUTextPtrText = withForeignPtr . utextPtrText 151 | 152 | -- | Returns UTF-8 UText for text >= 2.0 or UTF-16 UText for previous versions. 153 | asUTextPtr :: Text -> IO UTextPtr 154 | asUTextPtr t = do 155 | (fp,l) <- asForeignPtr t 156 | with 0 $ \ e -> withForeignPtr fp $ \ p -> 157 | newICUPtr (\ ut -> UTextPtr ut fp l) utext_close $ 158 | #if MIN_VERSION_text(2,0,0) 159 | utext_openUTF8 160 | #else 161 | utext_openUChars 162 | #endif 163 | nullPtr p (fromIntegral l) e 164 | 165 | foreign import ccall unsafe "hs_text_icu.h &__hs_utext_close" utext_close 166 | :: FunPtr (Ptr UText -> IO ()) 167 | 168 | useAsUCharPtr :: Text -> (Ptr UChar -> I16 -> IO a) -> IO a 169 | asUCharForeignPtr :: Text -> IO (ForeignPtr UChar, I16) 170 | fromUCharPtr :: Ptr UChar -> I16 -> IO Text 171 | 172 | dropWord, takeWord :: TextI -> Text -> Text 173 | lengthWord :: Text -> Int 174 | 175 | #if MIN_VERSION_text(2,0,0) 176 | newtype I16 = I16 Int 177 | deriving (Bounded, Enum, Eq, Integral, Num, Ord, Read, Real, Show) 178 | 179 | type TextChar = Word8 180 | type TextI = I8 181 | 182 | useAsUCharPtr t act = useAsPtr t $ \tptr tlen -> 183 | allocaArray (fromIntegral tlen) $ \ dst -> 184 | act dst =<< fromUtf8 dst tptr tlen 185 | 186 | asUCharForeignPtr t = useAsPtr t $ \tptr tlen -> do 187 | fp <- mallocForeignPtrArray (fromIntegral tlen) 188 | withForeignPtr fp $ \ dst -> (fp,) <$> fromUtf8 dst tptr tlen 189 | 190 | fromUtf8 :: Ptr UChar -> Ptr Word8 -> I8 -> IO I16 191 | fromUtf8 dst tptr tlen = 192 | with 0 $ \ err -> 193 | with 0 $ \ dstLen -> do 194 | _ <- u_strFromUTF8Lenient dst (fromIntegral tlen) dstLen tptr 195 | (fromIntegral tlen) err 196 | fromIntegral <$> peek dstLen 197 | 198 | fromUCharPtr p l = 199 | with 0 $ \ err -> 200 | with 0 $ \ dstLen -> 201 | allocaArray capacity $ \ dst -> do 202 | _ <- u_strToUTF8 dst (fromIntegral capacity) dstLen p 203 | (fromIntegral l) err 204 | dl <- peek dstLen 205 | fromPtr dst (fromIntegral dl) 206 | where capacity = fromIntegral l * 3 207 | 208 | dropWord = dropWord8 209 | takeWord = takeWord8 210 | lengthWord = lengthWord8 211 | 212 | foreign import ccall unsafe "hs_text_icu.h __hs_u_strFromUTF8Lenient" u_strFromUTF8Lenient 213 | :: Ptr UChar -> Int32 -> Ptr Int32 -> Ptr Word8 -> Int32 -> Ptr UErrorCode 214 | -> IO CString 215 | 216 | foreign import ccall unsafe "hs_text_icu.h __hs_u_strToUTF8" u_strToUTF8 217 | :: Ptr Word8 -> Int32 -> Ptr Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode 218 | -> IO CString 219 | 220 | foreign import ccall unsafe "hs_text_icu.h __hs_utext_openUTF8" utext_openUTF8 221 | :: Ptr UText -> Ptr Word8 -> Int64 -> Ptr UErrorCode -> IO (Ptr UText) 222 | 223 | #else 224 | 225 | type TextChar = UChar 226 | type TextI = I16 227 | 228 | -- text < 2.0 has UChar as internal representation. 229 | useAsUCharPtr = useAsPtr 230 | asUCharForeignPtr = asForeignPtr 231 | fromUCharPtr = fromPtr 232 | 233 | dropWord = dropWord16 234 | takeWord = takeWord16 235 | lengthWord = lengthWord16 236 | 237 | foreign import ccall unsafe "hs_text_icu.h __hs_utext_openUChars" utext_openUChars 238 | :: Ptr UText -> Ptr UChar -> Int64 -> Ptr UErrorCode -> IO (Ptr UText) 239 | 240 | #endif 241 | 242 | -- | Allocate new ICU data structure (usually via @*_open@ call), 243 | -- add finalizer (@*_close@ call) and wrap resulting 'ForeignPtr'. 244 | -- 245 | -- Exceptions are masked since the memory leak is possible if any 246 | -- asynchronous exception (such as a timeout) is raised between 247 | -- allocating C data and 'newForeignPtr' call. 248 | newICUPtr :: (ForeignPtr a -> i) -> FinalizerPtr a -> IO (Ptr a) -> IO i 249 | newICUPtr wrap close open = fmap wrap $ mask_ $ newForeignPtr close =<< open 250 | -------------------------------------------------------------------------------- /Data/Text/ICU/Iterator.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ForeignFunctionInterface #-} 2 | {-# OPTIONS_GHC -fno-warn-orphans #-} 3 | -- | 4 | -- Module : Data.Text.ICU.Iterator 5 | -- Copyright : (c) 2010 Bryan O'Sullivan 6 | -- 7 | -- License : BSD-style 8 | -- Maintainer : bos@serpentine.com 9 | -- Stability : experimental 10 | -- Portability : GHC 11 | -- 12 | -- Iteration functions for Unicode, implemented as bindings to the 13 | -- International Components for Unicode (ICU) libraries. 14 | -- 15 | -- Unlike the C and C++ @UCharIterator@ type, the Haskell 16 | -- 'CharIterator' type is immutable, and can safely be used in pure 17 | -- code. 18 | -- 19 | -- Functions using these iterators may be more efficient than their 20 | -- counterparts. For instance, the 'CharIterator' type allows a UTF-8 21 | -- 'ByteString' to be compared against a 'Text', without first 22 | -- converting the 'ByteString': 23 | -- 24 | -- > fromUtf8 bs == fromText t 25 | module Data.Text.ICU.Iterator 26 | ( 27 | -- * Types and constructors 28 | CharIterator 29 | , fromString 30 | , fromText 31 | , fromUtf8 32 | ) where 33 | 34 | import Data.ByteString (ByteString) 35 | import Data.Int (Int32) 36 | import Data.Text (Text, pack) 37 | import Data.Text.ICU.Internal (CharIterator(..), UCharIterator, asOrdering, 38 | withCharIterator) 39 | import Foreign.Ptr (Ptr) 40 | import System.IO.Unsafe (unsafePerformIO) 41 | 42 | instance Eq CharIterator where 43 | a == b = compareIter a b == EQ 44 | 45 | instance Ord CharIterator where 46 | compare = compareIter 47 | 48 | -- | Compare two 'CharIterator's. 49 | compareIter :: CharIterator -> CharIterator -> Ordering 50 | compareIter a b = unsafePerformIO . fmap asOrdering . 51 | withCharIterator a $ withCharIterator b . u_strCompareIter 52 | 53 | -- | Construct a 'CharIterator' from a Unicode string. 54 | fromString :: String -> CharIterator 55 | fromString = CIText . pack 56 | {-# INLINE fromString #-} 57 | 58 | -- | Construct a 'CharIterator' from a Unicode string. 59 | fromText :: Text -> CharIterator 60 | fromText = CIText 61 | {-# INLINE fromText #-} 62 | 63 | -- | Construct a 'CharIterator' from a Unicode string encoded as a 64 | -- UTF-8 'ByteString'. The validity of the encoded string is *not* 65 | -- checked. 66 | fromUtf8 :: ByteString -> CharIterator 67 | fromUtf8 = CIUTF8 68 | {-# INLINE fromUtf8 #-} 69 | 70 | foreign import ccall unsafe "hs_text_icu.h __hs_u_strCompareIter" u_strCompareIter 71 | :: Ptr UCharIterator -> Ptr UCharIterator -> IO Int32 72 | -------------------------------------------------------------------------------- /Data/Text/ICU/Locale.hsc: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RankNTypes, BangPatterns, ForeignFunctionInterface, RecordWildCards #-} 2 | -- | 3 | -- Module : Data.Text.ICU.Locale 4 | -- Copyright : (c) 2021 Torsten Kemps-Benedix 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- Locale functions implemented as bindings to 12 | -- the International Components for Unicode (ICU) libraries. 13 | 14 | module Data.Text.ICU.Locale 15 | (availableLocales 16 | ) where 17 | 18 | #include 19 | 20 | import Control.Monad (forM) 21 | import Data.Int (Int32) 22 | import Foreign.C.String (CString, peekCString) 23 | import Prelude hiding (last) 24 | 25 | -- | Get the available default locales, i.e. locales that return data when passed to ICU 26 | -- APIs, but not including legacy or alias locales. 27 | availableLocales :: IO [String] 28 | availableLocales = do 29 | n <- uloc_countAvailable 30 | forM [0..n-1] $ \i -> uloc_getAvailable i >>= peekCString 31 | 32 | foreign import ccall unsafe "hs_text_icu.h __hs_uloc_getAvailable" uloc_getAvailable 33 | :: Int32 -> IO CString 34 | foreign import ccall unsafe "hs_text_icu.h __hs_uloc_countAvailable" uloc_countAvailable 35 | :: IO Int32 36 | -------------------------------------------------------------------------------- /Data/Text/ICU/Normalize.hsc: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, DeriveDataTypeable, ForeignFunctionInterface #-} 2 | -- | 3 | -- Module : Data.Text.ICU.Normalize 4 | -- Copyright : (c) 2009, 2010 Bryan O'Sullivan 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- Character set normalization functions for Unicode, implemented as 12 | -- bindings to the International Components for Unicode (ICU) 13 | -- libraries. 14 | -- 15 | -- This module is based on the now deprecated "unorm.h" functions. 16 | -- Please use Data.Text.ICU.Normalize2 instead. 17 | 18 | module Data.Text.ICU.Normalize {-# DEPRECATED "Use Data.Text.ICU.Normalize2 instead" #-} 19 | ( 20 | -- * Unicode normalization API 21 | -- $api 22 | NormalizationMode(..) 23 | -- * Normalization functions 24 | , normalize 25 | -- * Normalization checks 26 | , quickCheck 27 | , isNormalized 28 | -- * Normalization-sensitive comparison 29 | , CompareOption(..) 30 | , compare 31 | ) where 32 | 33 | #ifdef mingw32_HOST_OS 34 | #define U_HAVE_INTTYPES_H 1 35 | #endif 36 | 37 | #include 38 | #include 39 | 40 | import Data.Text (Text) 41 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError, handleOverflowError) 42 | import Data.Text.ICU.Internal (UBool, UChar, asBool, asOrdering, useAsUCharPtr, fromUCharPtr) 43 | import Data.Text.ICU.Normalize.Internal (UNormalizationCheckResult, toNCR) 44 | import Data.Typeable (Typeable) 45 | import Data.Int (Int32) 46 | import Data.Word (Word32) 47 | import Foreign.C.Types (CInt(..)) 48 | import Foreign.Ptr (Ptr) 49 | import System.IO.Unsafe (unsafePerformIO) 50 | import Prelude hiding (compare) 51 | import Data.List (foldl') 52 | import Data.Bits ((.|.)) 53 | 54 | -- $api 55 | -- 56 | -- The 'normalize' function transforms Unicode text into an equivalent 57 | -- composed or decomposed form, allowing for easier sorting and 58 | -- searching of text. 'normalize' supports the standard normalization 59 | -- forms described in , 60 | -- Unicode Standard Annex #15: Unicode Normalization Forms. 61 | -- 62 | -- Characters with accents or other adornments can be encoded in 63 | -- several different ways in Unicode. For example, take the character A-acute. 64 | -- In Unicode, this can be encoded as a single character (the 65 | -- \"composed\" form): 66 | -- 67 | -- @ 68 | -- 00C1 LATIN CAPITAL LETTER A WITH ACUTE 69 | -- @ 70 | -- 71 | -- or as two separate characters (the \"decomposed\" form): 72 | -- 73 | -- @ 74 | -- 0041 LATIN CAPITAL LETTER A 75 | -- 0301 COMBINING ACUTE ACCENT 76 | -- @ 77 | -- 78 | -- To a user of your program, however, both of these sequences should 79 | -- be treated as the same \"user-level\" character \"A with acute 80 | -- accent\". When you are searching or comparing text, you must 81 | -- ensure that these two sequences are treated equivalently. In 82 | -- addition, you must handle characters with more than one accent. 83 | -- Sometimes the order of a character's combining accents is 84 | -- significant, while in other cases accent sequences in different 85 | -- orders are really equivalent. 86 | -- 87 | -- Similarly, the string \"ffi\" can be encoded as three separate letters: 88 | -- 89 | -- @ 90 | -- 0066 LATIN SMALL LETTER F 91 | -- 0066 LATIN SMALL LETTER F 92 | -- 0069 LATIN SMALL LETTER I 93 | -- @ 94 | -- 95 | -- or as the single character 96 | -- 97 | -- @ 98 | -- FB03 LATIN SMALL LIGATURE FFI 99 | -- @ 100 | -- 101 | -- The \"ffi\" ligature is not a distinct semantic character, and 102 | -- strictly speaking it shouldn't be in Unicode at all, but it was 103 | -- included for compatibility with existing character sets that 104 | -- already provided it. The Unicode standard identifies such 105 | -- characters by giving them \"compatibility\" decompositions into the 106 | -- corresponding semantic characters. When sorting and searching, you 107 | -- will often want to use these mappings. 108 | -- 109 | -- 'normalize' helps solve these problems by transforming text into 110 | -- the canonical composed and decomposed forms as shown in the first 111 | -- example above. In addition, you can have it perform compatibility 112 | -- decompositions so that you can treat compatibility characters the 113 | -- same as their equivalents. Finally, 'normalize' rearranges accents 114 | -- into the proper canonical order, so that you do not have to worry 115 | -- about accent rearrangement on your own. 116 | -- 117 | -- Form 'FCD', \"Fast C or D\", is also designed for collation. It 118 | -- allows to work on strings that are not necessarily normalized with 119 | -- an algorithm (like in collation) that works under \"canonical 120 | -- closure\", i.e., it treats precomposed characters and their 121 | -- decomposed equivalents the same. 122 | -- 123 | -- It is not a normalization form because it does not provide for 124 | -- uniqueness of representation. Multiple strings may be canonically 125 | -- equivalent (their NFDs are identical) and may all conform to 'FCD' 126 | -- without being identical themselves. 127 | -- 128 | -- The form is defined such that the \"raw decomposition\", the 129 | -- recursive canonical decomposition of each character, results in a 130 | -- string that is canonically ordered. This means that precomposed 131 | -- characters are allowed for as long as their decompositions do not 132 | -- need canonical reordering. 133 | -- 134 | -- Its advantage for a process like collation is that all 'NFD' and 135 | -- most 'NFC' texts - and many unnormalized texts - already conform to 136 | -- 'FCD' and do not need to be normalized ('NFD') for such a 137 | -- process. The 'FCD' 'quickCheck' will return 'Yes' for most strings 138 | -- in practice. 139 | -- 140 | -- @'normalize' 'FCD'@ may be implemented with 'NFD'. 141 | -- 142 | -- For more details on 'FCD' see the collation design document: 143 | -- 144 | -- 145 | -- ICU collation performs either 'NFD' or 'FCD' normalization 146 | -- automatically if normalization is turned on for the collator 147 | -- object. Beyond collation and string search, normalized strings may 148 | -- be useful for string equivalence comparisons, 149 | -- transliteration/transcription, unique representations, etc. 150 | -- 151 | -- The W3C generally recommends to exchange texts in 'NFC'. Note also 152 | -- that most legacy character encodings use only precomposed forms and 153 | -- often do not encode any combining marks by themselves. For 154 | -- conversion to such character encodings the Unicode text needs to be 155 | -- normalized to 'NFC'. For more usage examples, see the Unicode 156 | -- Standard Annex. 157 | 158 | type UCompareOption = Word32 159 | 160 | -- | Options to 'compare'. 161 | data CompareOption = InputIsFCD 162 | -- ^ The caller knows that both strings fulfill the 163 | -- 'FCD' conditions. If /not/ set, 'compare' will 164 | -- 'quickCheck' for 'FCD' and normalize if 165 | -- necessary. 166 | | CompareIgnoreCase 167 | -- ^ Compare strings case-insensitively using case 168 | -- folding, instead of case-sensitively. If set, 169 | -- then the following case folding options are 170 | -- used. 171 | | FoldCaseExcludeSpecialI 172 | -- ^ When case folding, exclude the special I 173 | -- character. For use with Turkic 174 | -- (Turkish/Azerbaijani) text data. 175 | deriving (Eq, Show, Enum, Typeable) 176 | 177 | fromCompareOption :: CompareOption -> UCompareOption 178 | fromCompareOption InputIsFCD = #const UNORM_INPUT_IS_FCD 179 | fromCompareOption CompareIgnoreCase = #const U_COMPARE_IGNORE_CASE 180 | fromCompareOption FoldCaseExcludeSpecialI = #const U_FOLD_CASE_EXCLUDE_SPECIAL_I 181 | 182 | reduceCompareOptions :: [CompareOption] -> UCompareOption 183 | reduceCompareOptions = foldl' orO (#const U_COMPARE_CODE_POINT_ORDER) 184 | where a `orO` b = a .|. fromCompareOption b 185 | 186 | type UNormalizationMode = CInt 187 | 188 | -- | Normalization modes. 189 | data NormalizationMode 190 | = None -- ^ No decomposition/composition. 191 | | NFD -- ^ Canonical decomposition. 192 | | NFKD -- ^ Compatibility decomposition. 193 | | NFC -- ^ Canonical decomposition followed by canonical composition. 194 | | NFKC -- ^ Compatibility decomposition followed by canonical composition. 195 | | FCD -- ^ \"Fast C or D\" form. 196 | deriving (Eq, Show, Enum, Typeable) 197 | 198 | toNM :: NormalizationMode -> UNormalizationMode 199 | toNM None = #const UNORM_NONE 200 | toNM NFD = #const UNORM_NFD 201 | toNM NFKD = #const UNORM_NFKD 202 | toNM NFC = #const UNORM_NFC 203 | toNM NFKC = #const UNORM_NFKC 204 | toNM FCD = #const UNORM_FCD 205 | 206 | -- | Normalize a string according to the specified normalization mode. 207 | normalize :: NormalizationMode -> Text -> Text 208 | normalize mode t = unsafePerformIO . useAsUCharPtr t $ \sptr slen -> 209 | let slen' = fromIntegral slen 210 | mode' = toNM mode 211 | in handleOverflowError (fromIntegral slen) 212 | (\dptr dlen -> unorm_normalize sptr slen' mode' 0 dptr (fromIntegral dlen)) 213 | (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) 214 | 215 | 216 | -- | Perform an efficient check on a string, to quickly determine if 217 | -- the string is in a particular normalization form. 218 | -- 219 | -- A 'Nothing' result indicates that a definite answer could not be 220 | -- determined quickly, and a more thorough check is required, 221 | -- e.g. with 'isNormalized'. The user may have to convert the string 222 | -- to its normalized form and compare the results. 223 | -- 224 | -- A result of 'Just' 'True' or 'Just' 'False' indicates that the 225 | -- string definitely is, or is not, in the given normalization form. 226 | quickCheck :: NormalizationMode -> Text -> Maybe Bool 227 | quickCheck mode t = 228 | unsafePerformIO . useAsUCharPtr t $ \ptr len -> 229 | fmap toNCR . handleError $ unorm_quickCheck ptr (fromIntegral len) 230 | (toNM mode) 231 | 232 | -- | Indicate whether a string is in a given normalization form. 233 | -- 234 | -- Unlike 'quickCheck', this function returns a definitive result. 235 | -- For 'NFD', 'NFKD', and 'FCD' normalization forms, both functions 236 | -- work in exactly the same ways. For 'NFC' and 'NFKC' forms, where 237 | -- 'quickCheck' may return 'Nothing', this function will perform 238 | -- further tests to arrive at a definitive result. 239 | isNormalized :: NormalizationMode -> Text -> Bool 240 | isNormalized mode t = 241 | unsafePerformIO . useAsUCharPtr t $ \ptr len -> 242 | fmap asBool . handleError $ unorm_isNormalized ptr (fromIntegral len) 243 | (toNM mode) 244 | 245 | -- | Compare two strings for canonical equivalence. Further options 246 | -- include case-insensitive comparison and codepoint order (as 247 | -- opposed to code unit order). 248 | -- 249 | -- Canonical equivalence between two strings is defined as their 250 | -- normalized forms ('NFD' or 'NFC') being identical. This function 251 | -- compares strings incrementally instead of normalizing (and 252 | -- optionally case-folding) both strings entirely, improving 253 | -- performance significantly. 254 | -- 255 | -- Bulk normalization is only necessary if the strings do not fulfill 256 | -- the 'FCD' conditions. Only in this case, and only if the strings 257 | -- are relatively long, is memory allocated temporarily. For 'FCD' 258 | -- strings and short non-'FCD' strings there is no memory allocation. 259 | compare :: [CompareOption] -> Text -> Text -> Ordering 260 | compare opts a b = unsafePerformIO . 261 | useAsUCharPtr a $ \aptr alen -> 262 | useAsUCharPtr b $ \bptr blen -> 263 | fmap asOrdering . handleError $ 264 | unorm_compare aptr (fromIntegral alen) bptr (fromIntegral blen) 265 | (reduceCompareOptions opts) 266 | 267 | foreign import ccall unsafe "hs_text_icu.h __hs_unorm_compare" unorm_compare 268 | :: Ptr UChar -> Int32 269 | -> Ptr UChar -> Int32 270 | -> Word32 271 | -> Ptr UErrorCode 272 | -> IO Int32 273 | 274 | foreign import ccall unsafe "hs_text_icu.h __hs_unorm_quickCheck" unorm_quickCheck 275 | :: Ptr UChar -> Int32 276 | -> UNormalizationMode 277 | -> Ptr UErrorCode 278 | -> IO UNormalizationCheckResult 279 | 280 | foreign import ccall unsafe "hs_text_icu.h __hs_unorm_isNormalized" unorm_isNormalized 281 | :: Ptr UChar -> Int32 282 | -> UNormalizationMode 283 | -> Ptr UErrorCode 284 | -> IO UBool 285 | 286 | foreign import ccall unsafe "hs_text_icu.h __hs_unorm_normalize" unorm_normalize 287 | :: Ptr UChar -> Int32 288 | -> UNormalizationMode 289 | -> Int32 290 | -> Ptr UChar -> Int32 291 | -> Ptr UErrorCode 292 | -> IO Int32 293 | -------------------------------------------------------------------------------- /Data/Text/ICU/Normalize/Internal.hsc: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, DeriveDataTypeable, ForeignFunctionInterface #-} 2 | -- | 3 | -- Module : Data.Text.ICU.Normalize.Internal 4 | -- Copyright : (c) 2010 Bryan O'Sullivan 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | 11 | module Data.Text.ICU.Normalize.Internal 12 | ( 13 | UNormalizationCheckResult 14 | , toNCR 15 | ) where 16 | 17 | #include 18 | 19 | import Foreign.C.Types (CInt) 20 | 21 | type UNormalizationCheckResult = CInt 22 | 23 | toNCR :: UNormalizationCheckResult -> Maybe Bool 24 | toNCR (#const UNORM_NO) = Just False 25 | toNCR (#const UNORM_MAYBE) = Nothing 26 | toNCR (#const UNORM_YES) = Just True 27 | toNCR _ = error "toNormalizationCheckResult" 28 | -------------------------------------------------------------------------------- /Data/Text/ICU/Number.hsc: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, DeriveDataTypeable, ForeignFunctionInterface #-} 2 | -- | 3 | -- Module : Data.Text.ICU.Number 4 | -- Copyright : (c) 2020 Torsten Kemps-Benedix 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : tkx68@icloud.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- New users with are strongly encouraged to see 12 | -- if Data.Text.ICU.NumberFormatter fits their use case. 13 | -- Although not deprecated, this header is provided for backwards 14 | -- compatibility only. 15 | 16 | module Data.Text.ICU.Number 17 | ( 18 | -- * Unicode number formatting API 19 | -- $api 20 | numberFormatter 21 | , FormattableNumber, formatNumber, formatNumber' 22 | , NumberFormatStyle(..) 23 | , NumberFormat 24 | ) where 25 | 26 | #ifdef mingw32_HOST_OS 27 | #define U_HAVE_INTTYPES_H 1 28 | #endif 29 | 30 | #include 31 | #include 32 | 33 | import GHC.Natural 34 | import Data.Text (Text) 35 | import qualified Data.Text as T 36 | import Data.Text.ICU.Error 37 | import Data.Text.ICU.Error.Internal (UErrorCode, UParseError, handleParseError, handleOverflowError) 38 | import Data.Text.ICU.Internal (UChar, useAsUCharPtr, fromUCharPtr) 39 | import Data.Text.ICU.Internal (LocaleName, withLocaleName) 40 | import Data.Typeable (Typeable) 41 | import Data.Int (Int32) 42 | import Foreign.C.Types (CInt(..), CDouble(..)) 43 | import Foreign.Ptr (Ptr) 44 | import System.IO.Unsafe (unsafePerformIO) 45 | import Prelude hiding (compare) 46 | import Foreign.C.String (CString) 47 | import Data.Text.ICU.Number.Internal 48 | -- $api 49 | -- 50 | -- This module helps you to format and parse numbers for any locale. Your code 51 | -- can be completely independent of the locale conventions for decimal points, 52 | -- thousands-separators, or even the particular decimal digits used, or whether 53 | -- the number format is even decimal. There are different number format styles 54 | -- like decimal, currency, percent and spelled-out. 55 | -- 56 | -- Use 'formatter' to create a formatter and 'format' to format numbers. 57 | 58 | -- | The possible number format styles. 59 | data NumberFormatStyle 60 | = NUM_PATTERN_DECIMAL Text -- ^ Decimal format defined by a pattern string. See the section \"Patterns\" at for further details regarding pattern strings. 61 | | NUM_DECIMAL -- ^ Decimal format ("normal" style). 62 | | NUM_CURRENCY -- ^ Currency format (generic). Defaults to UNUM_CURRENCY_STANDARD style (using currency symbol, e.g., "$1.00", with non-accounting style for negative values e.g. using minus sign). The specific style may be specified using the -cf- locale key. 63 | | NUM_PERCENT -- ^ Percent format. 64 | | NUM_SCIENTIFIC -- ^ Scientific format. 65 | | NUM_SPELLOUT -- ^ Spellout rule-based format. The default ruleset can be specified/changed using unum_setTextAttribute with UNUM_DEFAULT_RULESET; the available public rulesets can be listed using unum_getTextAttribute with UNUM_PUBLIC_RULESETS. 66 | | NUM_ORDINAL -- ^ Ordinal rule-based format. The default ruleset can be specified/changed using unum_setTextAttribute with UNUM_DEFAULT_RULESET; the available public rulesets can be listed using unum_getTextAttribute with UNUM_PUBLIC_RULESETS. 67 | | NUM_DURATION -- ^ Duration rule-based format. 68 | | NUM_NUMBERING_SYSTEM -- ^ Numbering system rule-based format. 69 | | NUM_PATTERN_RULEBASED Text -- ^ Rule-based format defined by a pattern string. See the section \"Patterns\" at for further details regarding pattern strings. 70 | | NUM_CURRENCY_ISO -- ^ Currency format with an ISO currency code, e.g., "USD1.00". 71 | | NUM_CURRENCY_PLURAL -- ^ Currency format with a pluralized currency name, e.g., "1.00 US dollar" and "3.00 US dollars". 72 | | NUM_CURRENCY_ACCOUNTING -- ^ Currency format for accounting, e.g., "($3.00)" for negative currency amount instead of "-$3.00" (UNUM_CURRENCY). Overrides any style specified using -cf- key in locale. 73 | | NUM_CASH_CURRENCY -- ^ Currency format with a currency symbol given CASH usage, e.g., "NT$3" instead of "NT$3.23". 74 | | NUM_DECIMAL_COMPACT_SHORT -- ^ Decimal format expressed using compact notation (short form, corresponds to UNumberCompactStyle=UNUM_SHORT) e.g. "23K", "45B" 75 | | NUM_DECIMAL_COMPACT_LONG -- ^ Decimal format expressed using compact notation (long form, corresponds to UNumberCompactStyle=UNUM_LONG) e.g. "23 thousand", "45 billion" 76 | | NUM_CURRENCY_STANDARD -- ^ Currency format with a currency symbol, e.g., "$1.00", using non-accounting style for negative values (e.g. minus sign). Overrides any style specified using -cf- key in locale. 77 | | NUM_FORMAT_STYLE_COUNT -- ^ One more than the highest normal UNumberFormatStyle value. Deprecated: ICU 58 The numeric value may change over time, see ICU ticket #12420. 78 | | NUM_DEFAULT -- ^ Default format. 79 | | NUM_IGNORE -- ^ Alias for NUM_PATTERN_DECIMAL. 80 | deriving (Eq, Show, Typeable) 81 | 82 | type UNumberFormatStyle = CInt 83 | 84 | toNFS :: NumberFormatStyle -> UNumberFormatStyle 85 | toNFS (NUM_PATTERN_DECIMAL _) = #const UNUM_PATTERN_DECIMAL 86 | toNFS NUM_DECIMAL = #const UNUM_DECIMAL 87 | toNFS NUM_CURRENCY = #const UNUM_CURRENCY 88 | toNFS NUM_PERCENT = #const UNUM_PERCENT 89 | toNFS NUM_SCIENTIFIC = #const UNUM_SCIENTIFIC 90 | toNFS NUM_SPELLOUT = #const UNUM_SPELLOUT 91 | toNFS NUM_ORDINAL = #const UNUM_ORDINAL 92 | toNFS NUM_DURATION = #const UNUM_DURATION 93 | toNFS NUM_NUMBERING_SYSTEM = #const UNUM_NUMBERING_SYSTEM 94 | toNFS (NUM_PATTERN_RULEBASED _) = #const UNUM_PATTERN_RULEBASED 95 | toNFS NUM_CURRENCY_ISO = #const UNUM_CURRENCY_ISO 96 | toNFS NUM_CURRENCY_PLURAL = #const UNUM_CURRENCY_PLURAL 97 | toNFS NUM_CURRENCY_ACCOUNTING = #const UNUM_CURRENCY_ACCOUNTING 98 | toNFS NUM_CASH_CURRENCY = #const UNUM_CASH_CURRENCY 99 | toNFS NUM_DECIMAL_COMPACT_SHORT = #const UNUM_DECIMAL_COMPACT_SHORT 100 | toNFS NUM_DECIMAL_COMPACT_LONG = #const UNUM_DECIMAL_COMPACT_LONG 101 | toNFS NUM_CURRENCY_STANDARD = #const UNUM_CURRENCY_STANDARD 102 | toNFS NUM_FORMAT_STYLE_COUNT = #const UNUM_FORMAT_STYLE_COUNT 103 | toNFS NUM_DEFAULT = #const UNUM_DEFAULT 104 | toNFS NUM_IGNORE = #const UNUM_IGNORE 105 | 106 | -- | Create and return a new NumberFormat for formatting and parsing numbers. 107 | -- 108 | -- A NumberFormat may be used to format numbers by calling unum_format, and 109 | -- to parse numbers by calling unum_parse. The caller must call unum_close when 110 | -- done to release resources used by this object. 111 | numberFormatter :: NumberFormatStyle -- ^ The type of number format to open. If NUM_PATTERN_DECIMAL or NUM_PATTERN_RULEBASED is passed then the number format is opened using the given pattern, which must conform to the syntax described in DecimalFormat or RuleBasedNumberFormat, respectively. 112 | -> LocaleName -- ^ A locale identifier to use to determine formatting and parsing conventions, or NULL to use the default locale, e.g. "de_DE". 113 | -> NumberFormat 114 | numberFormatter sty@(NUM_PATTERN_DECIMAL pattern) loc = numberFormatter' (toNFS sty) pattern loc 115 | numberFormatter sty@(NUM_PATTERN_RULEBASED pattern) loc = numberFormatter' (toNFS sty) pattern loc 116 | numberFormatter style loc = numberFormatter' (toNFS style) T.empty loc 117 | 118 | numberFormatter' :: UNumberFormatStyle -- ^ The type of number format to open. If NUM_PATTERN_DECIMAL or NUM_PATTERN_RULEBASED is passed then the number format is opened using the given pattern, which must conform to the syntax described in DecimalFormat or RuleBasedNumberFormat, respectively. 119 | -> Text 120 | -> LocaleName -- ^ A locale identifier to use to determine formatting and parsing conventions, or NULL to use the default locale, e.g. "de_DE". 121 | -> NumberFormat 122 | numberFormatter' style pattern loc = 123 | System.IO.Unsafe.unsafePerformIO $ fmap C $ wrap $ 124 | useAsUCharPtr pattern $ \patternPtr patternLen -> 125 | withLocaleName loc $ 126 | handleParseError (== u_PARSE_ERROR) . (unum_open style patternPtr (fromIntegral patternLen)) 127 | 128 | foreign import ccall unsafe "hs_text_icu.h __hs_unum_open" unum_open 129 | :: UNumberFormatStyle -> Ptr UChar -> Int32 -> CString -> Ptr UParseError -> Ptr UErrorCode -> IO (Ptr UNumberFormat) 130 | 131 | -- | Format an integer using a NumberFormat. 132 | -- 133 | -- The integer will be formatted according to the UNumberFormat's locale. 134 | class FormattableNumber n where 135 | formatNumber :: NumberFormat -- ^ The formatter to use. 136 | -> n -- ^ The number to format. 137 | -> Text 138 | 139 | -- | Create a formatter and apply it in one step. 140 | formatNumber' :: (FormattableNumber n) 141 | => NumberFormatStyle -- ^ The type of number format to open. If NUM_PATTERN_DECIMAL or NUM_PATTERN_RULEBASED is passed then the number format is opened using the given pattern, which must conform to the syntax described in DecimalFormat or RuleBasedNumberFormat, respectively. 142 | -> LocaleName -- ^ A locale identifier to use to determine formatting and parsing conventions, or NULL to use the default locale, e.g. "de_DE". 143 | -> n -- ^ The number to format. 144 | -> Text 145 | formatNumber' style loc x = formatNumber (numberFormatter style loc) x 146 | 147 | instance FormattableNumber Integer where 148 | formatNumber (C nf) x = numberFormatInt nf (fromIntegral x) 149 | 150 | instance FormattableNumber Natural where 151 | formatNumber (C nf) x = numberFormatInt nf (fromIntegral x) 152 | 153 | instance FormattableNumber Int where 154 | formatNumber (C nf) x = numberFormatInt nf x 155 | 156 | instance FormattableNumber Double where 157 | formatNumber (C nf) x = numberFormatDouble nf x 158 | 159 | instance FormattableNumber Float where 160 | formatNumber (C nf) x = numberFormatDouble nf (fromRational $ toRational x) 161 | 162 | -- | Create a number format. 163 | numberFormatInt :: MNumberFormat -> Int -> Text 164 | numberFormatInt nf x = System.IO.Unsafe.unsafePerformIO $ 165 | withNumberFormat nf $ \nptr -> 166 | handleOverflowError 100 167 | (\dptr dlen ec -> unum_formatInt64 nptr (fromIntegral x) dptr (fromIntegral dlen) ec) 168 | (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) 169 | 170 | -- | Format a number. 171 | numberFormatDouble :: MNumberFormat -> Double -> Text 172 | numberFormatDouble nf x = System.IO.Unsafe.unsafePerformIO $ 173 | withNumberFormat nf $ \nptr -> 174 | handleOverflowError 100 175 | (\dptr dlen ec -> unum_formatDouble nptr (CDouble x) dptr (fromIntegral dlen) ec) 176 | (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) 177 | 178 | foreign import ccall unsafe "hs_text_icu.h __hs_unum_formatInt64" unum_formatInt64 179 | :: Ptr UNumberFormat -> Int -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32 180 | foreign import ccall unsafe "hs_text_icu.h __hs_unum_formatDouble" unum_formatDouble 181 | :: Ptr UNumberFormat -> CDouble -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32 182 | -------------------------------------------------------------------------------- /Data/Text/ICU/Number/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveDataTypeable, EmptyDataDecls, ForeignFunctionInterface #-} 2 | -- | 3 | -- Module : Data.Text.ICU.Collate.Internal 4 | -- Copyright : (c) 2010 Bryan O'Sullivan 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- Internals of the string collation infrastructure. 12 | 13 | module Data.Text.ICU.Number.Internal 14 | ( 15 | -- * Unicode collation API 16 | MNumberFormat(..) 17 | , NumberFormat(..) 18 | , UNumberFormat 19 | , withNumberFormat 20 | , wrap 21 | ) 22 | where 23 | 24 | import Data.Typeable (Typeable) 25 | import Foreign.ForeignPtr (ForeignPtr, withForeignPtr) 26 | import Foreign.Ptr (FunPtr, Ptr) 27 | import Data.Text.ICU.Internal (newICUPtr) 28 | 29 | -- $api 30 | -- 31 | 32 | data UNumberFormat 33 | 34 | -- | This is the number formatter. It can be created with 'formatter'. Use it to format numbers with the 'format' function. 35 | data MNumberFormat = MNumberFormat {-# UNPACK #-} !(ForeignPtr UNumberFormat) 36 | deriving (Typeable) 37 | 38 | -- | This is the number formatter. It can be created with 'formatter'. Use it to format numbers with the 'format' function. 39 | newtype NumberFormat = C MNumberFormat 40 | deriving (Typeable) 41 | 42 | withNumberFormat :: MNumberFormat -> (Ptr UNumberFormat -> IO a) -> IO a 43 | withNumberFormat (MNumberFormat col) action = withForeignPtr col action 44 | {-# INLINE withNumberFormat #-} 45 | 46 | wrap :: IO (Ptr UNumberFormat) -> IO MNumberFormat 47 | wrap = newICUPtr MNumberFormat unum_close 48 | {-# INLINE wrap #-} 49 | 50 | foreign import ccall unsafe "hs_text_icu.h &__hs_unum_close" unum_close 51 | :: FunPtr (Ptr UNumberFormat -> IO ()) 52 | -------------------------------------------------------------------------------- /Data/Text/ICU/NumberFormatter.hsc: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE EmptyDataDecls, RankNTypes, BangPatterns, ForeignFunctionInterface, RecordWildCards #-} 2 | -- | 3 | -- Module : Data.Text.ICU.NumberFormatter 4 | -- Copyright : (c) 2021 Torsten Kemps-Benedix 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- Number formatter implemented as bindings to 12 | -- the International Components for Unicode (ICU) libraries. 13 | 14 | module Data.Text.ICU.NumberFormatter 15 | ( 16 | -- * Data 17 | NumberFormatter, 18 | -- * Formatter 19 | numberFormatter, 20 | -- $skeleton 21 | -- * Formatting functions 22 | formatIntegral, formatIntegral', formatDouble, formatDouble' 23 | ) where 24 | 25 | #include 26 | 27 | import Data.Int (Int32, Int64) 28 | import Data.Text (Text) 29 | import Data.Text.ICU.Error.Internal (UErrorCode, handleError, handleOverflowError) 30 | import Data.Text.ICU.Internal (LocaleName(..), UChar, withLocaleName, newICUPtr, fromUCharPtr, useAsUCharPtr) 31 | import Foreign.C.String (CString) 32 | import Foreign.C.Types (CDouble(..)) 33 | import Foreign.ForeignPtr (withForeignPtr, ForeignPtr) 34 | import Foreign.Ptr (FunPtr, Ptr) 35 | import Prelude hiding (last) 36 | import System.IO.Unsafe (unsafePerformIO) 37 | 38 | -- $skeleton 39 | -- 40 | -- Here are some examples for number skeletons, see 41 | -- https://unicode-org.github.io/icu/userguide/format_parse/numbers/skeletons.html#examples for more: 42 | -- 43 | -- +----------------------------+-----------------+--------+--------------+-------------------------------------------------------------+ 44 | -- | Long Skeleton | Concise Skeleton | Input | en-US Output | Comments | 45 | -- +============================+==================+=======+==============+=============================================================+ 46 | -- | percent | % | 25 | 25% | | 47 | -- | .00 |.00 | 25 | 25.00 | Equivalent to Precision::fixedFraction(2) | 48 | -- | percent .00 | % .00 | 25 | 25.00% | | 49 | -- | scale/100 | scale/100 | 0.3 | 30 | Multiply by 100 before formatting | 50 | -- | percent scale/100 | %x100 | 0.3 | 30% | | 51 | -- | measure-unit/length-meter | unit/meter | 5 | 5 m | UnitWidth defaults to Short | 52 | -- | unit-width-full-name | unit/meter | 5 | 5 meters | | 53 | -- | compact-short | K | 5000 | 5K | | 54 | -- | compact-long | KK | 5000 | 5 thousand | | 55 | -- | group-min2 | ,? | 5000 | 5000 | Require 2 digits in group for separator | 56 | -- | group-min2 | ,? | 15000 | 15,000 | | 57 | -- | sign-always | +! | 60 | +60 | Show sign on all numbers | 58 | -- | sign-always | +! | 0 | +0 | | 59 | -- | sign-except-zero | +? | 60 | +60 | Show sign on all numbers except 0 | 60 | -- | sign-except-zero | +? | 0 | 0 | | 61 | -- +----------------------------+-----------------+--------+--------------+-------------------------------------------------------------+ 62 | 63 | data UNumberFormatter 64 | data UFormattedNumber 65 | 66 | newtype NumberFormatter = NumberFormatter (ForeignPtr UNumberFormatter) 67 | 68 | -- | Create a new 'NumberFormatter'. 69 | -- 70 | -- See https://unicode-org.github.io/icu/userguide/format_parse/numbers/skeletons.html for how to specify 71 | -- the number skeletons. And use 'availableLocales' in order to find the allowed locale names. These 72 | -- usuallly look like "en", "de", "de_AT" etc. See 'formatIntegral' and 'formatDouble' for some examples. 73 | numberFormatter :: Text -> LocaleName -> IO NumberFormatter 74 | numberFormatter skel loc = 75 | withLocaleName loc $ \locale -> 76 | useAsUCharPtr skel $ \skelPtr skelLen -> 77 | newICUPtr NumberFormatter unumf_close $ 78 | handleError $ unumf_openForSkeletonAndLocale skelPtr (fromIntegral skelLen) locale 79 | 80 | -- | Format an integral number. 81 | -- 82 | -- See https://unicode-org.github.io/icu/userguide/format_parse/numbers/skeletons.html for how to specify 83 | -- the number skeletons. 84 | -- 85 | -- >>> import Data.Text 86 | -- >>> nf <- numberFormatter (pack "precision-integer") (Locale "de") 87 | -- >>> formatIntegral nf 12345 88 | -- "12.345" 89 | -- >>> nf2 <- numberFormatter (pack "precision-integer") (Locale "fr") 90 | -- >>> formatIntegral nf2 12345 91 | -- "12\8239\&345" 92 | formatIntegral :: Integral a => NumberFormatter -> a -> Text 93 | formatIntegral (NumberFormatter nf) x = unsafePerformIO $ do 94 | withForeignPtr nf $ \nfPtr -> do 95 | resultPtr <- newResult 96 | withForeignPtr resultPtr $ \resPtr -> do 97 | handleError $ unumf_formatInt nfPtr (fromIntegral x) resPtr 98 | t <- handleOverflowError (fromIntegral (64 :: Int)) 99 | (\dptr dlen -> unumf_resultToString resPtr dptr dlen) 100 | (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) 101 | pure t 102 | 103 | -- | Create a number formatter and apply it to an integral number. 104 | formatIntegral' :: (Integral a) => Text -> LocaleName -> a -> Text 105 | formatIntegral' skel loc x = unsafePerformIO $ do 106 | nf <- numberFormatter skel loc 107 | pure $ formatIntegral nf x 108 | 109 | -- | Format a Double. 110 | -- 111 | -- See https://unicode-org.github.io/icu/userguide/format_parse/numbers/skeletons.html for how to specify 112 | -- the number skeletons. 113 | -- 114 | -- >>> import Data.Text 115 | -- >>> nf3 <- numberFormatter (pack "precision-currency-cash") (Locale "it") 116 | -- >>> formatDouble nf3 12345.6789 117 | -- "12.345,68" 118 | formatDouble :: NumberFormatter -> Double -> Text 119 | formatDouble (NumberFormatter nf) x = unsafePerformIO $ do 120 | withForeignPtr nf $ \nfPtr -> do 121 | resultPtr <- newResult 122 | withForeignPtr resultPtr $ \resPtr -> do 123 | handleError $ unumf_formatDouble nfPtr (CDouble x) resPtr 124 | t <- handleOverflowError (fromIntegral (64 :: Int)) 125 | (\dptr dlen -> unumf_resultToString resPtr dptr dlen) 126 | (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) 127 | pure t 128 | 129 | -- | Create a number formatter and apply it to a Double. 130 | formatDouble' :: Text -> LocaleName -> Double -> Text 131 | formatDouble' skel loc x = unsafePerformIO $ do 132 | nf <- numberFormatter skel loc 133 | pure $ formatDouble nf x 134 | 135 | newResult :: IO (ForeignPtr UFormattedNumber) 136 | newResult = newICUPtr id unumf_closeResult $ handleError unumf_openResult 137 | 138 | foreign import ccall unsafe "hs_text_icu.h __hs_unumf_openForSkeletonAndLocale" unumf_openForSkeletonAndLocale 139 | :: Ptr UChar -> Int32 -> CString -> Ptr UErrorCode -> IO (Ptr UNumberFormatter) 140 | foreign import ccall unsafe "hs_text_icu.h &__hs_unumf_close" unumf_close 141 | :: FunPtr (Ptr UNumberFormatter -> IO ()) 142 | foreign import ccall unsafe "hs_text_icu.h __hs_unumf_openResult" unumf_openResult 143 | :: Ptr UErrorCode -> IO (Ptr UFormattedNumber) 144 | foreign import ccall unsafe "hs_text_icu.h &__hs_unumf_closeResult" unumf_closeResult 145 | :: FunPtr (Ptr UFormattedNumber -> IO ()) 146 | foreign import ccall unsafe "hs_text_icu.h __hs_unumf_formatInt" unumf_formatInt 147 | :: Ptr UNumberFormatter -> Int64 -> Ptr UFormattedNumber -> Ptr UErrorCode -> IO () 148 | foreign import ccall unsafe "hs_text_icu.h __hs_unumf_formatDouble" unumf_formatDouble 149 | :: Ptr UNumberFormatter -> CDouble -> Ptr UFormattedNumber -> Ptr UErrorCode -> IO () 150 | foreign import ccall unsafe "hs_text_icu.h __hs_unumf_resultToString" unumf_resultToString 151 | :: Ptr UFormattedNumber -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32 152 | -------------------------------------------------------------------------------- /Data/Text/ICU/Regex.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, EmptyDataDecls, MagicHash, RecordWildCards, 2 | ScopedTypeVariables #-} 3 | {-# OPTIONS_GHC -fno-warn-orphans #-} 4 | 5 | -- | 6 | -- Module : Data.Text.ICU.Regex 7 | -- Copyright : (c) 2010 Bryan O'Sullivan 8 | -- 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com 11 | -- Stability : experimental 12 | -- Portability : GHC 13 | -- 14 | -- Regular expression support for Unicode, implemented as bindings to 15 | -- the International Components for Unicode (ICU) libraries. 16 | -- 17 | -- The syntax and behaviour of ICU regular expressions are Perl-like. 18 | -- For complete details, see the ICU User Guide entry at 19 | -- . 20 | -- 21 | -- /Note/: The functions in this module are not thread safe. For 22 | -- thread safe use, see 'clone' below, or use the pure functions in 23 | -- "Data.Text.ICU". 24 | 25 | module Data.Text.ICU.Regex 26 | ( 27 | -- * Types 28 | MatchOption(..) 29 | , ParseError(errError, errLine, errOffset) 30 | , Regex 31 | -- * Functions 32 | -- ** Construction 33 | , regex 34 | , regex' 35 | , clone 36 | -- ** Managing text to search 37 | , setText 38 | , getUTextPtr 39 | -- ** Inspection 40 | , pattern 41 | -- ** Searching 42 | , find 43 | , findNext 44 | -- ** Match groups 45 | -- $groups 46 | , groupCount 47 | , start 48 | , end 49 | , start_ 50 | , end_ 51 | ) where 52 | 53 | import Data.Text.ICU.Regex.Internal 54 | import qualified Control.Exception as E 55 | import Data.IORef (newIORef, readIORef, writeIORef) 56 | import Data.Text (Text) 57 | import Data.Text.ICU.Internal (asBool, UTextPtr, asUTextPtr, emptyUTextPtr, TextI, withUTextPtr, fromUCharPtr, newICUPtr) 58 | import Data.Text.ICU.Error.Internal (ParseError(..), handleError) 59 | import Foreign.ForeignPtr (withForeignPtr) 60 | import Foreign.Marshal.Alloc (alloca) 61 | import Foreign.Storable (peek) 62 | import System.IO.Unsafe (unsafePerformIO) 63 | 64 | instance Show Regex where 65 | show re = "Regex " ++ show (pattern re) 66 | 67 | -- $groups 68 | -- 69 | -- Capturing groups are numbered starting from zero. Group zero is 70 | -- always the entire matching text. Groups greater than zero contain 71 | -- the text matching each capturing group in a regular expression. 72 | 73 | -- | Compile a regular expression with the given options. This is 74 | -- safest to use when the pattern is constructed at run time. 75 | regex' :: [MatchOption] -> Text -> IO (Either ParseError Regex) 76 | regex' opts pat = (Right `fmap` regex opts pat) `E.catch` \(err::ParseError) -> 77 | return (Left err) 78 | 79 | -- | Set the subject text string upon which the regular expression 80 | -- will look for matches. This function may be called any number of 81 | -- times, allowing the regular expression pattern to be applied to 82 | -- different strings. 83 | setText :: Regex -> Text -> IO () 84 | setText Regex{..} t = do 85 | hayfp <- asUTextPtr t 86 | withForeignPtr reRe $ \rePtr -> 87 | withUTextPtr hayfp $ \hayPtr -> handleError $ 88 | uregex_setUText rePtr hayPtr 89 | writeIORef reText hayfp 90 | 91 | -- | Get the subject text that is currently associated with this 92 | -- regular expression object. 93 | getUTextPtr :: Regex -> IO UTextPtr 94 | getUTextPtr Regex{..} = readIORef reText 95 | 96 | -- | Return the source form of the pattern used to construct this 97 | -- regular expression or match. 98 | pattern :: Regex -> Text 99 | pattern Regex{..} = unsafePerformIO . withForeignPtr reRe $ \rePtr -> 100 | alloca $ \lenPtr -> do 101 | textPtr <- handleError $ uregex_pattern rePtr lenPtr 102 | (fromUCharPtr textPtr . fromIntegral) =<< peek lenPtr 103 | 104 | -- | Find the first matching substring of the input string that 105 | -- matches the pattern. 106 | -- 107 | -- If /n/ is non-negative, the search for a match begins at the 108 | -- specified index, and any match region is reset. 109 | -- 110 | -- If /n/ is -1, the search begins at the start of the input region, 111 | -- or at the start of the full string if no region has been specified. 112 | -- 113 | -- If a match is found, 'start', 'end', and 'group' will provide more 114 | -- information regarding the match. 115 | find :: Regex -> TextI -> IO Bool 116 | find Regex{..} n = 117 | fmap asBool . withForeignPtr reRe $ \rePtr -> handleError $ 118 | uregex_find rePtr (fromIntegral n) 119 | 120 | -- | Find the next pattern match in the input string. Begin searching 121 | -- the input at the location following the end of he previous match, 122 | -- or at the start of the string (or region) if there is no previous 123 | -- match. 124 | -- 125 | -- If a match is found, 'start', 'end', and 'group' will provide more 126 | -- information regarding the match. 127 | findNext :: Regex -> IO Bool 128 | findNext Regex{..} = 129 | fmap asBool . withForeignPtr reRe $ handleError . uregex_findNext 130 | 131 | -- | Make a copy of a compiled regular expression. Cloning a regular 132 | -- expression is faster than opening a second instance from the source 133 | -- form of the expression, and requires less memory. 134 | -- 135 | -- Note that the current input string and the position of any matched 136 | -- text within it are not cloned; only the pattern itself and and the 137 | -- match mode flags are copied. 138 | -- 139 | -- Cloning can be particularly useful to threaded applications that 140 | -- perform multiple match operations in parallel. Each concurrent RE 141 | -- operation requires its own instance of a 'Regex'. 142 | clone :: Regex -> IO Regex 143 | {-# INLINE clone #-} 144 | clone Regex{..} = do 145 | newICUPtr Regex uregex_close 146 | (withForeignPtr reRe (handleError . uregex_clone)) 147 | <*> 148 | newIORef emptyUTextPtr 149 | 150 | -- | Return the number of capturing groups in this regular 151 | -- expression's pattern. 152 | groupCount :: Regex -> IO Int 153 | groupCount Regex{..} = 154 | fmap fromIntegral . withForeignPtr reRe $ handleError . uregex_groupCount 155 | 156 | -- | Returns the index in the input string of the start of the text 157 | -- matched by the specified capture group during the previous match 158 | -- operation. Returns @-1@ if the capture group was not part of the 159 | -- last match. 160 | start_ :: Regex -> Int -> IO TextI 161 | start_ Regex{..} n = 162 | fmap fromIntegral . withForeignPtr reRe $ \rePtr -> handleError $ 163 | uregex_start rePtr (fromIntegral n) 164 | 165 | -- | Returns the index in the input string of the end of the text 166 | -- matched by the specified capture group during the previous match 167 | -- operation. Returns @-1@ if the capture group was not part of 168 | -- the last match. 169 | end_ :: Regex -> Int -> IO TextI 170 | end_ Regex{..} n = 171 | fmap fromIntegral . withForeignPtr reRe $ \rePtr -> handleError $ 172 | uregex_end rePtr (fromIntegral n) 173 | 174 | -- | Returns the index in the input string of the start of the text 175 | -- matched by the specified capture group during the previous match 176 | -- operation. Returns 'Nothing' if the capture group was not part of 177 | -- the last match. 178 | start :: Regex -> Int -> IO (Maybe TextI) 179 | start r n = check `fmap` start_ r n 180 | 181 | -- | Returns the index in the input string of the end of the text 182 | -- matched by the specified capture group during the previous match 183 | -- operation. Returns 'Nothing' if the capture group was not part of 184 | -- the last match. 185 | end :: Regex -> Int -> IO (Maybe TextI) 186 | end r n = check `fmap` end_ r n 187 | 188 | check :: TextI -> Maybe TextI 189 | check (-1) = Nothing 190 | check k = Just $! fromIntegral k 191 | -------------------------------------------------------------------------------- /Data/Text/ICU/Regex/Internal.hsc: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, DeriveDataTypeable, EmptyDataDecls, 2 | ForeignFunctionInterface, MagicHash, RecordWildCards, 3 | ScopedTypeVariables #-} 4 | 5 | -- | 6 | -- Module : Data.Text.ICU.Regex.Internal 7 | -- Copyright : (c) 2010 Bryan O'Sullivan 8 | -- 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com 11 | -- Stability : experimental 12 | -- Portability : GHC 13 | -- 14 | -- Regular expression support for Unicode, implemented as bindings to 15 | -- the International Components for Unicode (ICU) libraries. 16 | -- 17 | -- The syntax and behaviour of ICU regular expressions are Perl-like. 18 | -- For complete details, see the ICU User Guide entry at 19 | -- . 20 | 21 | module Data.Text.ICU.Regex.Internal 22 | ( 23 | -- * Types 24 | MatchOption(..) 25 | , Regex(..) 26 | , URegularExpression 27 | -- * Functions 28 | , regex 29 | , uregex_clone 30 | , uregex_close 31 | , uregex_end 32 | , uregex_find 33 | , uregex_findNext 34 | , uregex_getText 35 | , uregex_group 36 | , uregex_groupCount 37 | , uregex_pattern 38 | , uregex_setUText 39 | , uregex_start 40 | ) where 41 | 42 | import Control.Monad (when) 43 | import Data.IORef (IORef, newIORef) 44 | import Data.Int (Int32) 45 | import Data.Text (Text) 46 | import Data.Text.ICU.Internal (UBool, UChar, UTextPtr, UText, useAsUCharPtr, withUTextPtr, emptyUTextPtr, newICUPtr) 47 | import Data.Text.ICU.Error (isRegexError) 48 | import Data.Text.ICU.Error.Internal (UParseError, UErrorCode, 49 | handleError, handleParseError) 50 | import Data.Typeable (Typeable) 51 | import Data.Word (Word32) 52 | import Foreign.ForeignPtr (ForeignPtr) 53 | import Foreign.Ptr (FunPtr, Ptr) 54 | 55 | #include 56 | 57 | -- | Options for controlling matching behaviour. 58 | data MatchOption 59 | = CaseInsensitive 60 | -- ^ Enable case insensitive matching. 61 | | Comments 62 | -- ^ Allow comments and white space within patterns. 63 | | DotAll 64 | -- ^ If set, @\'.\'@ matches line terminators. Otherwise @\'.\'@ 65 | -- matching stops at line end. 66 | | Literal 67 | -- ^ If set, treat the entire pattern as a literal string. 68 | -- Metacharacters or escape sequences in the input sequence will 69 | -- be given no special meaning. 70 | -- 71 | -- The option 'CaseInsensitive' retains its meanings on matching 72 | -- when used in conjunction with this option. Other options 73 | -- become superfluous. 74 | | Multiline 75 | -- ^ Control behaviour of @\'$\'@ and @\'^\'@. If set, recognize 76 | -- line terminators within string, Otherwise, match only at start 77 | -- and end of input string. 78 | | HaskellLines 79 | -- ^ Haskell-only line endings. When this mode is enabled, only 80 | -- @\'\\n\'@ is recognized as a line ending in the behavior of 81 | -- @\'.\'@, @\'^\'@, and @\'$\'@. 82 | | UnicodeWord 83 | -- ^ Unicode word boundaries. If set, @\'\\\\b\'@ uses the 84 | -- Unicode TR 29 definition of word boundaries. 85 | -- 86 | -- /Warning/: Unicode word boundaries are quite different from 87 | -- traditional regular expression word boundaries. See 88 | -- . 89 | | ErrorOnUnknownEscapes 90 | -- ^ Throw an error on unrecognized backslash escapes. If set, 91 | -- fail with an error on patterns that contain backslash-escaped 92 | -- ASCII letters without a known special meaning. If this flag is 93 | -- not set, these escaped letters represent themselves. 94 | | WorkLimit Int 95 | -- ^ Set a processing limit for match operations. 96 | -- 97 | -- Some patterns, when matching certain strings, can run in 98 | -- exponential time. For practical purposes, the match operation 99 | -- may appear to be in an infinite loop. When a limit is set a 100 | -- match operation will fail with an error if the limit is 101 | -- exceeded. 102 | -- 103 | -- The units of the limit are steps of the match engine. 104 | -- Correspondence with actual processor time will depend on the 105 | -- speed of the processor and the details of the specific pattern, 106 | -- but will typically be on the order of milliseconds. 107 | -- 108 | -- By default, the matching time is not limited. 109 | | StackLimit Int 110 | -- ^ Set the amount of heap storage available for use by the match 111 | -- backtracking stack. 112 | -- 113 | -- ICU uses a backtracking regular expression engine, with the 114 | -- backtrack stack maintained on the heap. This function sets the 115 | -- limit to the amount of memory that can be used for this 116 | -- purpose. A backtracking stack overflow will result in an error 117 | -- from the match operation that caused it. 118 | -- 119 | -- A limit is desirable because a malicious or poorly designed 120 | -- pattern can use excessive memory, potentially crashing the 121 | -- process. A limit is enabled by default. 122 | deriving (Eq, Show, Typeable) 123 | 124 | -- | A compiled regular expression. 125 | -- 126 | -- 'Regex' values are usually constructed using the 'regex' or 127 | -- 'regex'' functions. This type is also an instance of 'IsString', 128 | -- so if you have the @OverloadedStrings@ language extension enabled, 129 | -- you can construct a 'Regex' by simply writing the pattern in 130 | -- quotes (though this does not allow you to specify any 'Option's). 131 | data Regex = Regex { 132 | reRe :: ForeignPtr URegularExpression 133 | , reText :: IORef UTextPtr 134 | } 135 | 136 | -- | Compile a regular expression with the given options. This 137 | -- function throws a 'ParseError' if the pattern is invalid. 138 | -- 139 | -- The 'Regex' is initialized with empty text to search against. 140 | regex :: [MatchOption] -> Text -> IO Regex 141 | regex opts pat = useAsUCharPtr pat $ \pptr plen -> 142 | newICUPtr Regex uregex_close (do 143 | ptr <- handleParseError isRegexError $ 144 | uregex_open pptr (fromIntegral plen) flags 145 | withUTextPtr hayfp $ \hayPtr -> handleError $ 146 | uregex_setUText ptr hayPtr 147 | when (workLimit > -1) . 148 | handleError $ uregex_setTimeLimit ptr (fromIntegral workLimit) 149 | when (stackLimit > -1) . 150 | handleError $ uregex_setStackLimit ptr (fromIntegral stackLimit) 151 | return ptr) 152 | <*> newIORef hayfp 153 | where (flags,workLimit,stackLimit) = toURegexpOpts opts 154 | hayfp = emptyUTextPtr 155 | 156 | data URegularExpression 157 | 158 | type URegexpFlag = Word32 159 | 160 | toURegexpOpts :: [MatchOption] -> (URegexpFlag,Int,Int) 161 | toURegexpOpts = foldl go (0,-1,-1) 162 | where 163 | go (!flag,work,stack) opt = (flag+flag',work',stack') 164 | where 165 | flag' = case opt of 166 | CaseInsensitive -> #const UREGEX_CASE_INSENSITIVE 167 | Comments -> #const UREGEX_COMMENTS 168 | DotAll -> #const UREGEX_DOTALL 169 | Literal -> #const UREGEX_LITERAL 170 | Multiline -> #const UREGEX_MULTILINE 171 | HaskellLines -> #const UREGEX_UNIX_LINES 172 | UnicodeWord -> #const UREGEX_UWORD 173 | ErrorOnUnknownEscapes -> #const UREGEX_ERROR_ON_UNKNOWN_ESCAPES 174 | _ -> 0 175 | work' = case opt of 176 | WorkLimit limit -> limit 177 | _ -> work 178 | stack' = case opt of 179 | StackLimit limit -> limit 180 | _ -> stack 181 | 182 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_open" uregex_open 183 | :: Ptr UChar -> Int32 -> Word32 -> Ptr UParseError -> Ptr UErrorCode 184 | -> IO (Ptr URegularExpression) 185 | 186 | foreign import ccall unsafe "hs_text_icu.h &__hs_uregex_close" uregex_close 187 | :: FunPtr (Ptr URegularExpression -> IO ()) 188 | 189 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_clone" uregex_clone 190 | :: Ptr URegularExpression -> Ptr UErrorCode 191 | -> IO (Ptr URegularExpression) 192 | 193 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_pattern" uregex_pattern 194 | :: Ptr URegularExpression -> Ptr Int32 -> Ptr UErrorCode 195 | -> IO (Ptr UChar) 196 | 197 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_setUText" uregex_setUText 198 | :: Ptr URegularExpression -> Ptr UText -> Ptr UErrorCode -> IO () 199 | 200 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_getText" uregex_getText 201 | :: Ptr URegularExpression -> Ptr Int32 -> Ptr UErrorCode -> IO (Ptr UChar) 202 | 203 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_find" uregex_find 204 | :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO UBool 205 | 206 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_findNext" uregex_findNext 207 | :: Ptr URegularExpression -> Ptr UErrorCode -> IO UBool 208 | 209 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_start" uregex_start 210 | :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO Int32 211 | 212 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_end" uregex_end 213 | :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO Int32 214 | 215 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_groupCount" uregex_groupCount 216 | :: Ptr URegularExpression -> Ptr UErrorCode -> IO Int32 217 | 218 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_group" uregex_group 219 | :: Ptr URegularExpression -> Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode 220 | -> IO Int32 221 | 222 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_setTimeLimit" uregex_setTimeLimit 223 | :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO () 224 | 225 | foreign import ccall unsafe "hs_text_icu.h __hs_uregex_setStackLimit" uregex_setStackLimit 226 | :: Ptr URegularExpression -> Int32 -> Ptr UErrorCode -> IO () 227 | -------------------------------------------------------------------------------- /Data/Text/ICU/Regex/Pure.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, EmptyDataDecls, ScopedTypeVariables #-} 2 | 3 | -- | 4 | -- Module : Data.Text.ICU.Regex.Pure 5 | -- Copyright : (c) 2010 Bryan O'Sullivan 6 | -- 7 | -- License : BSD-style 8 | -- Maintainer : bos@serpentine.com 9 | -- Stability : experimental 10 | -- Portability : GHC 11 | -- 12 | -- Regular expression support for Unicode, implemented as bindings to 13 | -- the International Components for Unicode (ICU) libraries. 14 | -- 15 | -- The functions in this module are pure and hence thread safe, but 16 | -- may not be as fast or as flexible as those in the 17 | -- "Data.Text.ICU.Regex" module. 18 | -- 19 | -- The syntax and behaviour of ICU regular expressions are Perl-like. 20 | -- For complete details, see the ICU User Guide entry at 21 | -- . 22 | 23 | module Data.Text.ICU.Regex.Pure 24 | ( 25 | -- * Types 26 | MatchOption(..) 27 | , ParseError(errError, errLine, errOffset) 28 | , Match 29 | , Regex 30 | , Regular 31 | -- * Functions 32 | -- ** Construction 33 | , regex 34 | , regex' 35 | -- ** Inspection 36 | , pattern 37 | -- ** Searching 38 | , find 39 | , findAll 40 | -- ** Match groups 41 | -- $group 42 | , groupCount 43 | , unfold 44 | , span 45 | , group 46 | , prefix 47 | , suffix 48 | ) where 49 | 50 | import qualified Control.Exception as E 51 | import Data.String (IsString(..)) 52 | import Data.Text (Text) 53 | import qualified Data.Text as T 54 | import qualified Data.Text.Foreign as T 55 | import Data.Text.ICU.Internal (TextI, fromUCharPtr, lengthWord, withUTextPtrText, utextPtrLength) 56 | import Data.Text.ICU.Error.Internal (ParseError(..), handleError) 57 | import qualified Data.Text.ICU.Regex as IO 58 | import Data.Text.ICU.Regex.Internal hiding (Regex(..), regex) 59 | import qualified Data.Text.ICU.Regex.Internal as Internal 60 | import Foreign.ForeignPtr (ForeignPtr, withForeignPtr) 61 | import Foreign.Marshal.Alloc (alloca) 62 | import Foreign.Marshal.Array (advancePtr) 63 | import Foreign.Storable (peek) 64 | import Prelude hiding (span) 65 | import System.IO.Unsafe (unsafeInterleaveIO, unsafePerformIO) 66 | 67 | -- | A compiled regular expression. 68 | -- 69 | -- 'Regex' values are usually constructed using the 'regex' or 70 | -- 'regex'' functions. This type is also an instance of 'IsString', 71 | -- so if you have the @OverloadedStrings@ language extension enabled, 72 | -- you can construct a 'Regex' by simply writing the pattern in 73 | -- quotes (though this does not allow you to specify any 'Option's). 74 | newtype Regex = Regex { 75 | reRe :: Internal.Regex 76 | } 77 | 78 | instance Show Regex where 79 | show re = "Regex " ++ show (pattern re) 80 | 81 | instance IsString Regex where 82 | fromString = regex [] . T.pack 83 | 84 | -- | A match for a regular expression. 85 | data Match = Match { 86 | matchRe :: Internal.Regex 87 | , _matchPrev :: TextI 88 | } 89 | 90 | instance Show Match where 91 | show m = "Match " ++ show (unfold group m) 92 | 93 | -- | A typeclass for functions common to both 'Match' and 'Regex' 94 | -- types. 95 | class Regular r where 96 | regRe :: r -> Internal.Regex 97 | 98 | regFp :: r -> ForeignPtr URegularExpression 99 | regFp = Internal.reRe . regRe 100 | {-# INLINE regFp #-} 101 | 102 | instance Regular Match where 103 | regRe = matchRe 104 | 105 | instance Regular Regex where 106 | regRe = reRe 107 | 108 | -- | Compile a regular expression with the given options. This 109 | -- function throws a 'ParseError' if the pattern is invalid, so it is 110 | -- best for use when the pattern is statically known. 111 | regex :: [MatchOption] -> Text -> Regex 112 | regex opts pat = Regex . unsafePerformIO $ IO.regex opts pat 113 | 114 | -- | Compile a regular expression with the given options. This is 115 | -- safest to use when the pattern is constructed at run time. 116 | regex' :: [MatchOption] -> Text -> Either ParseError Regex 117 | regex' opts pat = unsafePerformIO $ 118 | ((Right . Regex) `fmap` Internal.regex opts pat) `E.catch` 119 | \(err::ParseError) -> return (Left err) 120 | 121 | -- | Return the source form of the pattern used to construct this 122 | -- regular expression or match. 123 | pattern :: Regular r => r -> Text 124 | pattern r = unsafePerformIO . withForeignPtr (regFp r) $ \rePtr -> 125 | alloca $ \lenPtr -> do 126 | textPtr <- handleError $ uregex_pattern rePtr lenPtr 127 | (fromUCharPtr textPtr . fromIntegral) =<< peek lenPtr 128 | 129 | -- | Find the first match for the regular expression in the given text. 130 | find :: Regex -> Text -> Maybe Match 131 | find re0 haystack = unsafePerformIO . 132 | matching re0 haystack $ \re -> do 133 | m <- IO.findNext re 134 | return $! if m then Just (Match re 0) else Nothing 135 | 136 | -- | Lazily find all matches for the regular expression in the given 137 | -- text. 138 | findAll :: Regex -> Text -> [Match] 139 | findAll re0 haystack = unsafePerformIO . unsafeInterleaveIO $ go 0 140 | where 141 | len = fromIntegral . lengthWord $ haystack 142 | go !n | n >= len = return [] 143 | | otherwise = matching re0 haystack $ \re -> do 144 | found <- IO.find re n 145 | if found 146 | then do 147 | n' <- IO.end_ re 0 148 | (Match re n:) `fmap` go n' 149 | else return [] 150 | 151 | matching :: Regex -> Text -> (IO.Regex -> IO a) -> IO a 152 | matching (Regex re0) haystack act = do 153 | re <- IO.clone re0 154 | IO.setText re haystack 155 | act re 156 | 157 | -- $group 158 | -- 159 | -- Capturing groups are numbered starting from zero. Group zero is 160 | -- always the entire matching text. Groups greater than zero contain 161 | -- the text matching each capturing group in a regular expression. 162 | 163 | -- | Return the number of capturing groups in this regular 164 | -- expression or match's pattern. 165 | groupCount :: Regular r => r -> Int 166 | groupCount = unsafePerformIO . IO.groupCount . regRe 167 | {-# INLINE groupCount #-} 168 | 169 | -- | A combinator for returning a list of all capturing groups on a 170 | -- 'Match'. 171 | unfold :: (Int -> Match -> Maybe Text) -> Match -> [Text] 172 | unfold f m = go 0 173 | where go !n = case f n m of 174 | Nothing -> [] 175 | Just z -> z : go (n+1) 176 | 177 | -- | Return the /n/th capturing group in a match, or 'Nothing' if /n/ 178 | -- is out of bounds. 179 | group :: Int -> Match -> Maybe Text 180 | group n m = grouping n m $ \re -> do 181 | let n' = fromIntegral n 182 | start <- fromIntegral `fmap` IO.start_ re n' 183 | end <- fromIntegral `fmap` IO.end_ re n' 184 | ut <- IO.getUTextPtr re 185 | withUTextPtrText ut $ \ptr -> 186 | T.fromPtr (ptr `advancePtr` fromIntegral start) (end - start) 187 | 188 | -- | Return the prefix of the /n/th capturing group in a match (the 189 | -- text from the start of the string to the start of the match), or 190 | -- 'Nothing' if /n/ is out of bounds. 191 | prefix :: Int -> Match -> Maybe Text 192 | prefix n m = grouping n m $ \re -> do 193 | start <- fromIntegral `fmap` IO.start_ re n 194 | ut <- IO.getUTextPtr re 195 | withUTextPtrText ut (`T.fromPtr` start) 196 | 197 | -- | Return the span of text between the end of the previous match and 198 | -- the beginning of the current match. 199 | span :: Match -> Text 200 | span (Match re p) = unsafePerformIO $ do 201 | start <- IO.start_ re 0 202 | ut <- IO.getUTextPtr re 203 | withUTextPtrText ut $ \ptr -> 204 | T.fromPtr (ptr `advancePtr` fromIntegral p) (start - p) 205 | 206 | -- | Return the suffix of the /n/th capturing group in a match (the 207 | -- text from the end of the match to the end of the string), or 208 | -- 'Nothing' if /n/ is out of bounds. 209 | suffix :: Int -> Match -> Maybe Text 210 | suffix n m = grouping n m $ \re -> do 211 | end <- fromIntegral `fmap` IO.end_ re n 212 | ut <- IO.getUTextPtr re 213 | withUTextPtrText ut $ \ptr -> do 214 | T.fromPtr (ptr `advancePtr` fromIntegral end) (utextPtrLength ut - end) 215 | 216 | grouping :: Int -> Match -> (Internal.Regex -> IO a) -> Maybe a 217 | grouping n (Match m _) act = unsafePerformIO $ do 218 | count <- IO.groupCount m 219 | let n' = fromIntegral n 220 | if n' == 0 || (n' >= 0 && n' <= count) 221 | then Just `fmap` act m 222 | else return Nothing 223 | -------------------------------------------------------------------------------- /Data/Text/ICU/Shape.hsc: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, ForeignFunctionInterface #-} 2 | 3 | -- | 4 | -- Module : Data.Text.ICU.Shape 5 | -- Copyright : (c) 2018 Ondrej Palkovsky 6 | -- 7 | -- License : BSD-style 8 | -- Maintainer : bos@serpentine.com 9 | -- Stability : experimental 10 | -- Portability : GHC 11 | -- 12 | -- Bindings for arabic shaping, implemented as bindings to 13 | -- the International Components for Unicode (ICU) libraries. 14 | -- 15 | 16 | module Data.Text.ICU.Shape 17 | ( 18 | shapeArabic 19 | , ShapeOption(..) 20 | ) where 21 | 22 | #include 23 | 24 | import Data.List (foldl') 25 | import Data.Text.ICU.Error.Internal (UErrorCode, handleOverflowError) 26 | import Data.Bits ((.|.)) 27 | import Data.Int (Int32) 28 | import Foreign.Ptr (Ptr) 29 | import Data.Text.ICU.Internal (UChar, useAsUCharPtr, fromUCharPtr) 30 | import Data.Text (Text) 31 | import System.IO.Unsafe (unsafePerformIO) 32 | 33 | -- | Options for the 'shapeArabic' function. 34 | data ShapeOption = 35 | AggregateTaskheel 36 | -- ^ Tashkeel aggregation option: Replaces any combination of U+0651 with one of U+064C, U+064D, U+064E, U+064F, U+0650 with U+FC5E, U+FC5F, U+FC60, U+FC61, U+FC62 consecutively. 37 | | DigitTypeAnExtended 38 | -- ^ Digit type option: Use Eastern (Extended) Arabic-Indic digits (U+06f0...U+06f9). 39 | | DigitsAlen2AnInitAl 40 | -- ^ Replace European digits (U+0030...) by Arabic-Indic digits if the most recent strongly 41 | -- directional character is an Arabic letter (u_charDirection() result U_RIGHT_TO_LEFT_ARABIC [AL]). 42 | | DigitsAlen2AnInitLr 43 | -- ^ Digit shaping option: Replace European digits (U+0030...) by Arabic-Indic digits if the most recent strongly directional character is an Arabic letter (u_charDirection() result U_RIGHT_TO_LEFT_ARABIC [AL]). 44 | | DigitsAn2En 45 | -- ^ Digit shaping option: Replace Arabic-Indic digits by European digits (U+0030...). 46 | | DigitsEn2An 47 | -- ^ Digit shaping option: Replace European digits (U+0030...) by Arabic-Indic digits. 48 | | LengthFixedSpacesAtBeginning 49 | -- ^ If more room is necessary, then try to consume spaces at the beginning of the text. 50 | | LengthFixedSpacesAtEnd 51 | -- ^ If more room is necessary, then try to consume spaces at the end of the text. 52 | | LengthFixedSpacesNear 53 | -- ^ If more room is necessary, then try to consume spaces next to modified characters. 54 | | LettersShape 55 | -- ^ Letter shaping option: replace abstract letter characters by "shaped" ones. 56 | | LettersUnshape 57 | -- ^ Letter shaping option: replace "shaped" letter characters by abstract ones. 58 | | LettersShapeTashkeelIsolated 59 | -- ^ The only difference with LettersShape is that Tashkeel letters are always "shaped" into the isolated form instead of the medial form (selecting codepoints from the Arabic Presentation Forms-B block). 60 | | PreservePresentation 61 | -- ^ Presentation form option: Don't replace Arabic Presentation Forms-A and Arabic Presentation Forms-B characters with 0+06xx characters, before shaping. 62 | | TextDirectionVisualLTR 63 | -- ^ Direction indicator: the source is in visual LTR order, the leftmost displayed character stored first. 64 | deriving (Show) 65 | 66 | reduceShapeOpts :: [ShapeOption] -> Int32 67 | reduceShapeOpts = foldl' orO 0 68 | where a `orO` b = a .|. fromShapeOption b 69 | 70 | fromShapeOption :: ShapeOption -> Int32 71 | fromShapeOption AggregateTaskheel = #const U_SHAPE_AGGREGATE_TASHKEEL 72 | fromShapeOption DigitTypeAnExtended = #const U_SHAPE_DIGIT_TYPE_AN_EXTENDED 73 | fromShapeOption DigitsAlen2AnInitAl = #const U_SHAPE_DIGITS_ALEN2AN_INIT_AL 74 | fromShapeOption DigitsAlen2AnInitLr = #const U_SHAPE_DIGITS_ALEN2AN_INIT_LR 75 | fromShapeOption DigitsAn2En = #const U_SHAPE_DIGITS_AN2EN 76 | fromShapeOption DigitsEn2An = #const U_SHAPE_DIGITS_EN2AN 77 | fromShapeOption LengthFixedSpacesAtBeginning = #const U_SHAPE_LENGTH_FIXED_SPACES_AT_BEGINNING 78 | fromShapeOption LengthFixedSpacesAtEnd = #const U_SHAPE_LENGTH_FIXED_SPACES_AT_END 79 | fromShapeOption LengthFixedSpacesNear = #const U_SHAPE_LENGTH_FIXED_SPACES_NEAR 80 | fromShapeOption LettersShape = #const U_SHAPE_LETTERS_SHAPE 81 | fromShapeOption LettersUnshape = #const U_SHAPE_LETTERS_UNSHAPE 82 | fromShapeOption LettersShapeTashkeelIsolated = #const U_SHAPE_LETTERS_SHAPE_TASHKEEL_ISOLATED 83 | fromShapeOption PreservePresentation = #const U_SHAPE_PRESERVE_PRESENTATION 84 | fromShapeOption TextDirectionVisualLTR = #const U_SHAPE_TEXT_DIRECTION_VISUAL_LTR 85 | 86 | -- | Shape Arabic text on a character basis. 87 | -- 88 | -- Text-based shaping means that some character codepoints in the text are replaced by 89 | -- others depending on the context. It transforms one kind of text into another. 90 | -- In comparison, modern displays for Arabic text select appropriate, context-dependent font 91 | -- glyphs for each text element, which means that they transform text into a glyph vector. 92 | -- 93 | -- You probably want to call this with the LettersShape option in the default case. 94 | shapeArabic :: [ShapeOption] -> Text -> Text 95 | shapeArabic options t = unsafePerformIO . useAsUCharPtr t $ \sptr slen -> 96 | let slen' = fromIntegral slen 97 | options' = reduceShapeOpts options 98 | in handleOverflowError (fromIntegral slen) 99 | (\dptr dlen -> u_shapeArabic sptr slen' dptr (fromIntegral dlen) options') 100 | (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) 101 | 102 | foreign import ccall unsafe "hs_text_icu.h __hs_u_shapeArabic" u_shapeArabic 103 | :: Ptr UChar -> Int32 104 | -> Ptr UChar -> Int32 105 | -> Int32 -> Ptr UErrorCode -> IO Int32 106 | -------------------------------------------------------------------------------- /Data/Text/ICU/Spoof/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveDataTypeable, EmptyDataDecls, ForeignFunctionInterface #-} 2 | -- | 3 | -- Module : Data.Text.ICU.Spoof.Internal 4 | -- Copyright : (c) 2015 Ben Hamilton 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bgertzfield@gmail.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- Internals of the spoofability check infrastructure. 12 | 13 | module Data.Text.ICU.Spoof.Internal 14 | ( 15 | -- * Unicode spoof checking API 16 | -- $api 17 | -- * Types 18 | MSpoof(..) 19 | , Spoof(..) 20 | , USpoof 21 | -- * Functions 22 | , withSpoof 23 | , wrap 24 | , wrapWithSerialized 25 | ) where 26 | 27 | import Data.Typeable (Typeable) 28 | import Data.Word (Word8) 29 | import Foreign.ForeignPtr (ForeignPtr, withForeignPtr) 30 | import Foreign.Ptr (FunPtr, Ptr) 31 | import Data.Text.ICU.Internal (newICUPtr) 32 | 33 | -- $api 34 | -- Low-level operations on spoof checkers. 35 | 36 | -- | Opaque handle to a configurable spoof checker. 37 | data USpoof 38 | 39 | -- | Configurable spoof checker wrapping an opaque handle 40 | -- and optionally wrapping a previously serialized instance. 41 | data MSpoof = MSpoof { 42 | serializedBuf :: Maybe (ForeignPtr Word8) 43 | , spoofPtr :: {-# UNPACK #-} !(ForeignPtr USpoof) 44 | } deriving (Typeable) 45 | 46 | -- | Spoof checker type. 47 | newtype Spoof = S MSpoof 48 | deriving (Typeable) 49 | 50 | -- | Temporarily unwraps an 'MSpoof' to perform operations on its raw 'USpoof' 51 | -- handle. 52 | withSpoof :: MSpoof -> (Ptr USpoof -> IO a) -> IO a 53 | withSpoof (MSpoof _ spoof) = withForeignPtr spoof 54 | {-# INLINE withSpoof #-} 55 | 56 | -- | Wraps a raw 'USpoof' handle in an 'MSpoof', closing the handle when 57 | -- the last reference to the object is dropped. 58 | wrap :: IO (Ptr USpoof) -> IO MSpoof 59 | wrap = newICUPtr (MSpoof Nothing) uspoof_close 60 | {-# INLINE wrap #-} 61 | 62 | -- | Wraps a previously serialized spoof checker and raw 'USpoof' handle 63 | -- in an 'MSpoof', closing the handle and releasing the 'ForeignPtr' when 64 | -- the last reference to the object is dropped. 65 | wrapWithSerialized :: ForeignPtr Word8 -> IO (Ptr USpoof) -> IO MSpoof 66 | wrapWithSerialized s = newICUPtr (MSpoof $ Just s) uspoof_close 67 | {-# INLINE wrapWithSerialized #-} 68 | 69 | foreign import ccall unsafe "hs_text_icu.h &__hs_uspoof_close" uspoof_close 70 | :: FunPtr (Ptr USpoof -> IO ()) 71 | -------------------------------------------------------------------------------- /Data/Text/ICU/Spoof/Pure.hs: -------------------------------------------------------------------------------- 1 | -- | 2 | -- Module : Data.Text.ICU.Spoof.Pure 3 | -- Copyright : (c) 2015 Ben Hamilton 4 | -- 5 | -- License : BSD-style 6 | -- Maintainer : bgertzfield@gmail.com 7 | -- Stability : experimental 8 | -- Portability : GHC 9 | -- 10 | -- Pure string spoof checking functions for Unicode, implemented as 11 | -- bindings to the International Components for Unicode (ICU) 12 | -- libraries. 13 | -- 14 | -- For the impure spoof checking API (which is richer, but less easy to 15 | -- use), see the "Data.Text.ICU.Spoof" module. 16 | 17 | module Data.Text.ICU.Spoof.Pure 18 | ( 19 | -- * Types 20 | Spoof 21 | , SpoofParams(..) 22 | , spoof 23 | , spoofWithParams 24 | , spoofFromSource 25 | , spoofFromSerialized 26 | -- * String spoof checks 27 | , areConfusable 28 | , getSkeleton 29 | , spoofCheck 30 | -- * Configuration 31 | , getAllowedLocales 32 | , getChecks 33 | , getRestrictionLevel 34 | -- * Persistence 35 | , serialize 36 | ) where 37 | 38 | import Data.ByteString (ByteString) 39 | import Data.Foldable (forM_) 40 | import Data.Text (Text) 41 | import Data.Text.ICU.Spoof.Internal (Spoof(..)) 42 | import System.IO.Unsafe (unsafePerformIO) 43 | import qualified Data.Text.ICU.Spoof as S 44 | 45 | data SpoofParams 46 | -- | Used to configure a 'Spoof' checker via 'spoofWithParams'. 47 | = SpoofParams { 48 | -- | Optional 'S.SpoofCheck's to perform on a string. By default, performs 49 | -- all checks except 'CharLimit'. 50 | spoofChecks :: Maybe [S.SpoofCheck] 51 | -- | Optional 'S.RestrictionLevel' to which characters in the string will 52 | -- be limited. By default, uses 'HighlyRestrictive'. 53 | , level :: Maybe S.RestrictionLevel 54 | -- | Optional locale(s) whose scripts will be used to limit the 55 | -- set of allowed characters in a string. If set, automatically 56 | -- enables the 'CharLimit' spoof check. 57 | , locales :: Maybe [String] 58 | } deriving (Show, Eq) 59 | 60 | applySpoofParams :: SpoofParams -> S.MSpoof -> S.MSpoof 61 | applySpoofParams (SpoofParams c lev loc) s = unsafePerformIO $ do 62 | forM_ c (S.setChecks s) 63 | forM_ lev (S.setRestrictionLevel s) 64 | forM_ loc (S.setAllowedLocales s) 65 | return s 66 | 67 | -- | Open an immutable 'Spoof' checker with default options (all 68 | -- 'S.SpoofCheck's except 'CharLimit'). 69 | spoof :: Spoof 70 | spoof = unsafePerformIO $ S `fmap` S.open 71 | {-# NOINLINE spoof #-} 72 | 73 | -- | Open an immutable 'Spoof' checker with specific 'SpoofParams' 74 | -- to control its behavior. 75 | spoofWithParams :: SpoofParams -> Spoof 76 | spoofWithParams p = unsafePerformIO $ do 77 | s <- S.open 78 | return (S $ applySpoofParams p s) 79 | 80 | -- | Open a immutable 'Spoof' checker with specific 'SpoofParams' 81 | -- to control its behavior and custom rules given the UTF-8 encoded 82 | -- contents of the @confusables.txt@ and @confusablesWholeScript.txt@ 83 | -- files as described in . 84 | spoofFromSource :: (ByteString, ByteString) -> SpoofParams -> Spoof 85 | spoofFromSource (confusables, confusablesWholeScript) p = unsafePerformIO $ do 86 | s <- S.openFromSource (confusables, confusablesWholeScript) 87 | return (S $ applySpoofParams p s) 88 | 89 | -- | Create an immutable spoof checker with specific 'SpoofParams' 90 | -- to control its behavior and custom rules previously returned 91 | -- by 'serialize'. 92 | spoofFromSerialized :: ByteString -> SpoofParams -> Spoof 93 | spoofFromSerialized b p = unsafePerformIO $ do 94 | s <- S.openFromSerialized b 95 | return (S $ applySpoofParams p s) 96 | 97 | -- | Check two strings for confusability. 98 | areConfusable :: Spoof -> Text -> Text -> S.SpoofCheckResult 99 | areConfusable (S s) t1 t2 = unsafePerformIO $ S.areConfusable s t1 t2 100 | 101 | -- | Check a string for spoofing issues. 102 | spoofCheck :: Spoof -> Text -> S.SpoofCheckResult 103 | spoofCheck (S s) t = unsafePerformIO $ S.spoofCheck s t 104 | 105 | -- | Generates re-usable \"skeleton\" strings which can be used (via 106 | -- Unicode equality) to check if an identifier is confusable 107 | -- with some large set of existing identifiers. 108 | -- 109 | -- If you cache the returned strings in storage, you /must/ invalidate 110 | -- your cache any time the underlying confusables database changes 111 | -- (i.e., on ICU upgrade). 112 | -- 113 | -- By default, assumes all input strings have been passed through 114 | -- 'toCaseFold' and are lower-case. To change this, pass 115 | -- 'SkeletonAnyCase'. 116 | -- 117 | -- By default, builds skeletons which catch visually confusable 118 | -- characters across multiple scripts. Pass 'SkeletonSingleScript' to 119 | -- override that behavior and build skeletons which catch visually 120 | -- confusable characters across single scripts. 121 | getSkeleton :: Spoof -> Maybe S.SkeletonTypeOverride -> Text -> Text 122 | getSkeleton (S s) o t = unsafePerformIO $ S.getSkeleton s o t 123 | 124 | -- | Gets the restriction level currently configured in the spoof 125 | -- checker, if present. 126 | getRestrictionLevel :: Spoof -> Maybe S.RestrictionLevel 127 | getRestrictionLevel (S s) = unsafePerformIO $ S.getRestrictionLevel s 128 | 129 | -- | Gets the checks currently configured in the spoof checker. 130 | getChecks :: Spoof -> [S.SpoofCheck] 131 | getChecks (S s) = unsafePerformIO $ S.getChecks s 132 | 133 | -- | Gets the locales whose scripts are currently allowed by the spoof 134 | -- checker. (We don't use 'LocaleName' since the root and default 135 | -- locales have no meaning here.) 136 | getAllowedLocales :: Spoof -> [String] 137 | getAllowedLocales (S s) = unsafePerformIO $ S.getAllowedLocales s 138 | 139 | -- | Serializes the rules in this spoof checker to a byte array, 140 | -- suitable for re-use by 'spoofFromSerialized'. 141 | -- 142 | -- Only includes any data provided to 'openFromSource'. Does not 143 | -- include any other state or configuration. 144 | serialize :: Spoof -> ByteString 145 | serialize (S s) = unsafePerformIO $ S.serialize s 146 | 147 | {-# INLINE spoofCheck #-} 148 | -------------------------------------------------------------------------------- /Data/Text/ICU/Text.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ForeignFunctionInterface #-} 2 | -- | 3 | -- Module : Data.Text.ICU.Text 4 | -- Copyright : (c) 2010 Bryan O'Sullivan 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- Functions for manipulating Unicode text, implemented as bindings to 12 | -- the International Components for Unicode (ICU) libraries. 13 | module Data.Text.ICU.Text 14 | ( 15 | -- * Case conversion 16 | -- $case 17 | toCaseFold 18 | , toLower 19 | , toUpper 20 | ) where 21 | 22 | import Data.Int (Int32) 23 | import Data.Text (Text) 24 | import Data.Text.ICU.Error.Internal (UErrorCode, handleOverflowError) 25 | import Data.Text.ICU.Internal (LocaleName, UChar, withLocaleName, useAsUCharPtr, fromUCharPtr) 26 | import Data.Word (Word32) 27 | import Foreign.C.String (CString) 28 | import Foreign.Ptr (Ptr) 29 | import System.IO.Unsafe (unsafePerformIO) 30 | 31 | -- $case 32 | -- 33 | -- In some languages, case conversion is a locale- and 34 | -- context-dependent operation. The case conversion functions in this 35 | -- module are locale and context sensitive. 36 | 37 | -- | Case-fold the characters in a string. 38 | -- 39 | -- Case folding is locale independent and not context sensitive, but 40 | -- there is an option for treating the letter I specially for Turkic 41 | -- languages. The result may be longer or shorter than the original. 42 | toCaseFold :: Bool -- ^ Whether to include or exclude mappings for 43 | -- dotted and dotless I and i that are marked with 44 | -- 'I' in @CaseFolding.txt@. 45 | -> Text -> Text 46 | toCaseFold excludeI s = unsafePerformIO . 47 | useAsUCharPtr s $ \sptr slen -> do 48 | let opts = fromIntegral . fromEnum $ excludeI 49 | handleOverflowError (fromIntegral slen) 50 | (\dptr dlen -> u_strFoldCase dptr dlen sptr (fromIntegral slen) opts) 51 | (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) 52 | 53 | type CaseMapper = Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> CString 54 | -> Ptr UErrorCode -> IO Int32 55 | 56 | caseMap :: CaseMapper -> LocaleName -> Text -> Text 57 | caseMap mapFn loc s = unsafePerformIO . 58 | withLocaleName loc $ \locale -> 59 | useAsUCharPtr s $ \sptr slen -> 60 | handleOverflowError (fromIntegral slen) 61 | (\dptr dlen -> mapFn dptr dlen sptr (fromIntegral slen) locale) 62 | (\dptr dlen -> fromUCharPtr dptr (fromIntegral dlen)) 63 | 64 | -- | Lowercase the characters in a string. 65 | -- 66 | -- Casing is locale dependent and context sensitive. The result may 67 | -- be longer or shorter than the original. 68 | toLower :: LocaleName -> Text -> Text 69 | toLower = caseMap u_strToLower 70 | 71 | -- | Uppercase the characters in a string. 72 | -- 73 | -- Casing is locale dependent and context sensitive. The result may 74 | -- be longer or shorter than the original. 75 | toUpper :: LocaleName -> Text -> Text 76 | toUpper = caseMap u_strToUpper 77 | 78 | foreign import ccall unsafe "hs_text_icu.h __hs_u_strFoldCase" u_strFoldCase 79 | :: Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> Word32 -> Ptr UErrorCode 80 | -> IO Int32 81 | 82 | foreign import ccall unsafe "hs_text_icu.h __hs_u_strToLower" u_strToLower 83 | :: CaseMapper 84 | 85 | foreign import ccall unsafe "hs_text_icu.h __hs_u_strToUpper" u_strToUpper 86 | :: CaseMapper 87 | -------------------------------------------------------------------------------- /Data/Text/ICU/Types.hs: -------------------------------------------------------------------------------- 1 | -- | 2 | -- Module : Data.Text.ICU.Types 3 | -- Copyright : (c) 2010 Bryan O'Sullivan 4 | -- 5 | -- License : BSD-style 6 | -- Maintainer : bos@serpentine.com 7 | -- Stability : experimental 8 | -- Portability : GHC 9 | -- 10 | -- Types for use when manipulating Unicode text, using the bindings to 11 | -- the International Components for Unicode (ICU) libraries. 12 | module Data.Text.ICU.Types 13 | ( 14 | -- * Widely used types 15 | LocaleName(..) 16 | , ParseError(errError, errLine, errOffset) 17 | ) where 18 | 19 | import Data.Text.ICU.Error.Internal (ParseError(..)) 20 | import Data.Text.ICU.Internal (LocaleName(..)) 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2009, Bryan O'Sullivan 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | # Text-ICU: Comprehensive support for string manipulation 2 | 3 | This package provides the Data.Text.ICU library, for performing 4 | complex manipulation of Unicode text. It provides features such as 5 | the following: 6 | 7 | * Unicode normalization 8 | 9 | * Conversion to and from many common and obscure encodings 10 | 11 | * Date and number formatting 12 | 13 | * Comparison and collation 14 | 15 | ## Prerequisites 16 | 17 | This library is implemented as bindings to the well-respected [ICU 18 | library](https://icu.unicode.org/) (which is not bundled, and must 19 | be installed separately). 20 | 21 | ### macOS 22 | 23 | brew install icu4c 24 | brew link icu4c --force 25 | 26 | You might need: 27 | 28 | export PKG_CONFIG_PATH="$(brew --prefix)/opt/icu4c/lib/pkgconfig" 29 | 30 | ### Debian/Ubuntu 31 | 32 | sudo apt-get update 33 | sudo apt-get install libicu-dev 34 | 35 | ### Fedora/CentOS 36 | 37 | sudo dnf install unzip libicu-devel 38 | 39 | ### Nix/NixOS 40 | 41 | nix-shell --packages icu 42 | 43 | ### Windows/MSYS2 44 | 45 | Under MSYS2, `ICU` can be installed via `pacman`. 46 | 47 | pacman --noconfirm -S mingw-w64-x86_64-icu 48 | 49 | Depending on the age of the MSYS2 installation, the keyring might need 50 | to be updated to avoid certification issues, and `pkg-config` might 51 | need to be added. In this case, do this first: 52 | 53 | pacman --noconfirm -Sy msys2-keyring 54 | pacman --noconfirm -S mingw-w64-x86_64-pkgconf 55 | 56 | ### Windows/stack 57 | 58 | With `stack` on Windows, which comes with its _own_ bundled MSYS2, the 59 | following commands give up-to-date system dependencies for 60 | `text-icu-0.8.0` (tested 2023-09-30): 61 | 62 | stack exec -- pacman --noconfirm -Sy msys2-keyring 63 | stack exec -- pacman --noconfirm -S mingw-w64-x86_64-pkgconf 64 | stack exec -- pacman --noconfirm -S mingw-w64-x86_64-icu 65 | 66 | 67 | ## Compatibility 68 | 69 | Upstream ICU occasionally introduces backwards-incompatible API 70 | breaks. This package tries to stay up to date with upstream, and is 71 | currently more or less in sync with ICU 72. 72 | 73 | Minimum required version is ICU 62. 74 | 75 | 76 | ## Get involved! 77 | 78 | Please report bugs via the 79 | [github issue tracker](https://github.com/haskell/text-icu/issues). 80 | 81 | 82 | ## Authors 83 | 84 | This library was written by Bryan O'Sullivan. 85 | -------------------------------------------------------------------------------- /Setup.lhs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env runhaskell 2 | > import Distribution.Simple 3 | > main = defaultMain 4 | -------------------------------------------------------------------------------- /benchmarks/Breaker.hs: -------------------------------------------------------------------------------- 1 | -- Estimate the time difference between creating a breaker. 2 | 3 | {-# LANGUAGE OverloadedStrings #-} 4 | 5 | import Control.Monad 6 | import qualified Data.Text as T 7 | import Data.Text.IO as T 8 | import Data.Text.ICU.Break as IO 9 | import Data.Text.ICU as ICU 10 | import System.Environment 11 | 12 | consume b = go 13 | where 14 | go = do 15 | m <- next b 16 | case m of 17 | Nothing -> return () 18 | Just _ -> go 19 | 20 | manyBreakers (t:ts) = do 21 | b <- IO.breakWord "en" t 22 | consume b 23 | manyBreakers ts 24 | manyBreakers _ = return () 25 | 26 | oneBreaker ts = do 27 | b <- IO.breakWord "en" "" 28 | forM_ ts $ \t -> do 29 | setText b t 30 | consume b 31 | 32 | cloneBreakers ts = do 33 | b <- IO.breakWord "en" "" 34 | forM_ ts $ \t -> do 35 | b' <- clone b 36 | setText b' t 37 | consume b' 38 | 39 | pureBreaker ts = do 40 | let b = ICU.breakWord "en" 41 | forM_ ts $ \t -> length (breaks b t) `seq` return () 42 | 43 | main = do 44 | (kind:files) <- getArgs 45 | let act = case kind of 46 | "one" -> oneBreaker 47 | "many" -> manyBreakers 48 | "clone" -> cloneBreakers 49 | "pure" -> pureBreaker 50 | forM_ files $ \f -> T.readFile f >>= act . T.lines 51 | -------------------------------------------------------------------------------- /cabal.haskell-ci: -------------------------------------------------------------------------------- 1 | -- Recreate .github/workflows/haskell-ci.yml with command `haskell-ci regenerate`. 2 | 3 | branches: master 4 | 5 | -- text-icu-0.8 requires a newer ICU lib (shipped not with Ubuntu 18.04, but with 20.04) 6 | -- distribution: focal 7 | 8 | -- installed: +all -binary -bytestring -containers -deepseq -directory -time -unix 9 | 10 | -- constraint-set text-2.1 11 | -- -- Somehow GHCs 8.4 - 9.2 pick up the wrong version of text 12 | -- -- See https://github.com/haskell/text-icu/actions/runs/6341389618/job/17224925091?pr=93 13 | -- ghc: >=8.2 && < 8.3 || >= 9.4 14 | -- constraints: text ^>= 2.1 15 | -- tests: True 16 | -- run-tests: True 17 | -- 18 | -- raw-project 19 | -- allow-newer: text 20 | -------------------------------------------------------------------------------- /changelog.md: -------------------------------------------------------------------------------- 1 | Unreleased 2 | 3 | * Drop support for GHC 7.10 4 | 5 | 0.8.0.5 6 | 7 | * Make homebrew optional #(99) 8 | 9 | 0.8.0.4 10 | 11 | * Fixed tests to work with ICU < 72 (#94) 12 | 13 | 0.8.0.3 14 | 15 | * Support for ICU 72 (#94) 16 | 17 | 0.8.0.2 18 | 19 | * Support for creating a collator from custom rules (#76) 20 | 21 | 0.8.0.1 22 | 23 | * Restore build with GHC 7.10 - 8.8 (#61) 24 | * New CI for Linux, macOS and Windows (#63, #64, #66, #69) 25 | 26 | 0.8.0 27 | 28 | * Support for text-2.0 (#57) 29 | * Support for ICU 69 and new features (#55) 30 | * Add lib/include dirs for newer homebrew (#54) 31 | * basic number formatting added (#46) 32 | * Declare pkg-config dependencies (#43) 33 | * Added support for arabic shaping and BiDi (#41) 34 | * Include icuio lib (#36) 35 | * Character Set Detection (#27) 36 | 37 | 0.7.1.0 38 | 39 | * Add fix for undefined TRUE value in cbits (#52) 40 | * Improve CI and documentation (#20) 41 | 42 | Thanks to everyone who contributed! 43 | 44 | 0.7.0.0 45 | 46 | * Built and tested against ICU 53. 47 | 48 | * The isoComment function has been deprecated, and will be removed in 49 | the next major release. 50 | 51 | * The Collator type is no longer an instance of Eq, as this 52 | functionality has been removed from ICU 53. 53 | 54 | * Many NFData instances have been added. 55 | -------------------------------------------------------------------------------- /hackage-docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Run this script in the top-level of your package directory 4 | # (where the .cabal file is) to compile documentation and 5 | # upload it to hackage. 6 | 7 | # Requirements: 8 | # cabal-install-1.24 (for --for-hackage) 9 | # haddock 2.17 (for the hyperlinked source) 10 | 11 | set -e 12 | 13 | dir=$(mktemp -d dist-docs.XXXXXX) 14 | trap 'rm -r "$dir"' EXIT 15 | 16 | cabal haddock --builddir="$dir" --haddock-for-hackage --enable-doc 17 | cabal upload --publish -d $dir/*-docs.tar.gz 18 | -------------------------------------------------------------------------------- /tests/Properties.hs: -------------------------------------------------------------------------------- 1 | -- Tester beware! 2 | -- 3 | -- Many of the tests below are "weak", i.e. they ensure that functions 4 | -- return results, without checking whether the results are correct. 5 | -- Weak tests are described as such. 6 | 7 | {-# LANGUAGE CPP #-} 8 | {-# LANGUAGE OverloadedStrings, LambdaCase #-} 9 | {-# OPTIONS_GHC -fno-warn-missing-signatures #-} 10 | module Properties (propertyTests, testCases) where 11 | 12 | import Control.Monad (unless) 13 | import qualified Control.Exception as E 14 | import Control.DeepSeq (NFData(..)) 15 | import Data.Function (on) 16 | import Data.Maybe (fromMaybe) 17 | import Data.Text (Text) 18 | import Data.Text.ICU (LocaleName(..), ParseError(..)) 19 | import QuickCheckUtils (NonEmptyText(..), LatinSpoofableText(..), 20 | NonSpoofableText(..), Utf8Text(..)) 21 | import Data.Text.ICU.Normalize2 (NormalizationMode(..)) 22 | import qualified Data.Text.ICU.Normalize2 as I 23 | import Test.Framework (Test, testGroup) 24 | import Test.Framework.Providers.QuickCheck2 (testProperty) 25 | import Test.Framework.Providers.HUnit (hUnitTestToTests) 26 | import Test.HUnit ((~?=), (@?=), (~:)) 27 | import qualified Test.HUnit (Test(..), assertFailure) 28 | import Test.HUnit.Lang (HUnitFailure (..), FailureReason (..)) 29 | import Test.QuickCheck.Monadic (monadicIO, run, assert) 30 | import qualified Data.Text as T 31 | import qualified Data.Text.Encoding as T 32 | import qualified Data.Text.ICU as I 33 | import qualified Data.Text.ICU.BiDi as BiDi 34 | import qualified Data.Text.ICU.Calendar as Cal 35 | import qualified Data.Text.ICU.Convert as I 36 | import qualified Data.Text.ICU.Char as I 37 | import qualified Data.Text.ICU.CharsetDetection as CD 38 | import qualified Data.Text.ICU.Error as Err 39 | import qualified Data.Text.ICU.Number as N 40 | import qualified Data.Text.ICU.Shape as S 41 | import System.IO.Unsafe (unsafePerformIO) 42 | 43 | #if !MIN_VERSION_base(4,11,0) 44 | import Data.Semigroup ((<>)) 45 | #endif 46 | 47 | {-# ANN module ("HLint: use camelCase"::String) #-} 48 | 49 | t_rnf :: (NFData b) => (a -> b) -> a -> Bool 50 | t_rnf f t = rnf (f t) == () 51 | 52 | t_nonEmpty :: (Text -> Text) -> Text -> Bool 53 | t_nonEmpty f t 54 | | T.null t = T.null ft 55 | | otherwise = T.length ft > 0 56 | where ft = f t 57 | 58 | -- Case mapping 59 | 60 | -- These tests are all fairly weak. 61 | 62 | t_toCaseFold bool = t_nonEmpty $ I.toCaseFold bool 63 | t_toLower locale = t_nonEmpty $ I.toLower locale 64 | t_toUpper locale = t_nonEmpty $ I.toUpper locale 65 | 66 | -- Iteration 67 | 68 | t_charIterator_String a b = (compare `on` I.fromString) a b == compare a b 69 | t_charIterator_Text a b = (compare `on` I.fromText) a b == compare a b 70 | t_charIterator_Utf8 a b = (compare `on` I.fromUtf8) ba bb == compare ba bb 71 | where ba = T.encodeUtf8 a; bb = T.encodeUtf8 b 72 | 73 | -- Normalization 74 | 75 | t_quickCheck_isNormalized mode normMode txt 76 | | mode `elem` [NFD, NFKD] = quickCheck == Just isNormalized 77 | | otherwise = fromMaybe isNormalized quickCheck == isNormalized 78 | where quickCheck = I.quickCheck mode normTxt 79 | isNormalized = I.isNormalized mode normTxt 80 | normTxt = I.normalize normMode txt 81 | 82 | -- Collation 83 | 84 | t_collate a b = c a b == flipOrdering (c b a) 85 | where c = I.collate I.uca 86 | 87 | t_collate_emptyRule a b = I.collate cUca a b == I.collate cEmpty a b 88 | where 89 | cUca = I.uca 90 | cEmpty = either (error "Can’t create empty collator") id 91 | $ I.collatorFromRules "" 92 | 93 | flipOrdering :: Ordering -> Ordering 94 | flipOrdering = \ case 95 | GT -> LT 96 | LT -> GT 97 | EQ -> EQ 98 | 99 | -- Convert 100 | 101 | converter e = unsafePerformIO $ I.open e Nothing 102 | 103 | t_convert a = I.toUnicode c (I.fromUnicode c a) == a 104 | where c = converter "UTF-32" 105 | 106 | 107 | -- Unicode character database 108 | 109 | -- These tests are weak. 110 | 111 | t_blockCode = t_rnf I.blockCode 112 | t_charFullName c = I.charFromFullName (I.charFullName c) == Just c 113 | t_charName c = maybe True (==c) $ I.charFromName (I.charName c) 114 | t_combiningClass = t_rnf I.combiningClass 115 | t_direction = t_rnf I.direction 116 | -- t_property p = t_rnf $ I.property p 117 | t_isMirrored = t_rnf $ I.isMirrored 118 | t_mirror = t_rnf $ I.mirror 119 | t_digitToInt = t_rnf $ I.digitToInt 120 | t_numericValue = t_rnf $ I.numericValue 121 | 122 | -- Spoofing 123 | 124 | t_nonspoofable (NonSpoofableText t) = I.spoofCheck I.spoof t == I.CheckOK 125 | t_spoofable (LatinSpoofableText t) = I.spoofCheck I.spoof t == 126 | I.CheckFailed [I.RestrictionLevel] 127 | t_confusable (NonEmptyText t) = I.areConfusable I.spoof t t `elem` 128 | [I.CheckFailed [I.MixedScriptConfusable] 129 | ,I.CheckFailed [I.SingleScriptConfusable]] 130 | 131 | -- Encoding Guessing 132 | 133 | t_Utf8IsUtf8 a = monadicIO $ do 134 | val <- run $ CD.detect (utf8Text a) >>= CD.getName 135 | assert $ T.isPrefixOf "UTF-8" val 136 | 137 | propertyTests :: Test 138 | propertyTests = 139 | testGroup "Properties" [ 140 | testProperty "t_toCaseFold" t_toCaseFold 141 | , testProperty "t_toLower" t_toLower 142 | , testProperty "t_toUpper" t_toUpper 143 | , testProperty "t_charIterator_String" t_charIterator_String 144 | , testProperty "t_charIterator_Text" t_charIterator_Text 145 | , testProperty "t_charIterator_Utf8" t_charIterator_Utf8 146 | , testProperty "t_quickCheck_isNormalized" t_quickCheck_isNormalized 147 | , testProperty "t_collate" t_collate 148 | , testProperty "t_collate_emptyRule" t_collate_emptyRule 149 | , testProperty "t_convert" t_convert 150 | , testProperty "t_blockCode" t_blockCode 151 | , testProperty "t_charFullName" t_charFullName 152 | , testProperty "t_charName" t_charName 153 | , testProperty "t_combiningClass" t_combiningClass 154 | , testProperty "t_direction" $ t_direction 155 | --, testProperty "t_property" t_property 156 | , testProperty "t_isMirrored" t_isMirrored 157 | , testProperty "t_mirror" t_mirror 158 | , testProperty "t_digitToInt" t_digitToInt 159 | , testProperty "t_numericValue" t_numericValue 160 | , testProperty "t_spoofable" t_spoofable 161 | , testProperty "t_nonspoofable" t_nonspoofable 162 | , testProperty "t_confusable" t_confusable 163 | , testProperty "t_Utf8IsUtf8" t_Utf8IsUtf8 164 | ] 165 | 166 | testCases :: Test 167 | testCases = 168 | testGroup "Test cases" $ hUnitTestToTests $ Test.HUnit.TestList $ 169 | [I.normalize NFC "Ame\x0301lie" ~?= "Amélie" 170 | ,I.normalize NFC "(⊃。•́︵•̀。)⊃" ~?= "(⊃。•́︵•̀。)⊃" 171 | ,map I.brkBreak (I.breaks (I.breakWord (Locale "en_US")) "Hi, Amélie!") 172 | ~?= ["Hi",","," ","Amélie","!"] 173 | ,map I.brkBreak (I.breaksRight (I.breakLine (Locale "ru")) "Привет, мир!") 174 | ~?= ["мир!","Привет, "] 175 | ,(I.unfold I.group <$> I.findAll "[abc]+" "xx b yy ac") ~?= [["b"],["ac"]] 176 | ,I.toUpper (Locale "de-DE") "ß" ~?= "SS" 177 | ,I.toCaseFold False "flag" ~?= "flag" 178 | ,map I.blockCode ['\x1FA50', '\203257', '\73494'] 179 | `oneOf` 180 | [[I.ChessSymbols, I.CjkUnifiedIdeographsExtensionH, I.Kawi] 181 | ,[I.ChessSymbols, I.NoBlock, I.NoBlock] 182 | -- ICU < 72 does not have last two codes 183 | ] 184 | ,I.direction '\x2068' ~?= I.FirstStrongIsolate 185 | ,I.getSkeleton I.spoof Nothing "\1089\1072t" ~?= "cat" 186 | ,S.shapeArabic [S.LettersShape] (nosp "ا ب ت ث") ~?= (nosp "ﺍ ﺑ ﺘ ﺚ") 187 | ,BiDi.reorderParagraphs [] (nosp "abc ا ب ت ث def\n123") 188 | ~?= ["abc" <> T.reverse (nosp "ا ب ت ث") <> "def\n", "123"] 189 | ,N.formatNumber (N.numberFormatter N.NUM_CURRENCY_PLURAL "en_US") 190 | (12.5 :: Double) ~?= "12.50 US dollars" 191 | 192 | ,do 193 | dfDe <- I.standardDateFormatter I.MediumFormatStyle I.LongFormatStyle 194 | (Locale "de_DE") "" 195 | c <- cal "CET" 2000 00 01 02 03 00 196 | return $ I.formatCalendar dfDe (Cal.add c [(Cal.Hour, 25), (Cal.Second, 65)]) 197 | `ioEq` 198 | "2. Januar 2000 um 03:04:05" 199 | 200 | ,do 201 | dfAt <- I.standardDateFormatter I.LongFormatStyle I.LongFormatStyle 202 | (Locale "de_AT") "CET" 203 | return $ I.dateSymbols dfAt I.Months 204 | `ioEq` 205 | ["Jänner","Februar","März","April","Mai","Juni" 206 | ,"Juli","August","September","Oktober","November","Dezember"] 207 | 208 | ,do 209 | dfP <- I.patternDateFormatter 210 | "MMMM dd, yyyy GGGG, hh 'o''clock' a, VVVV" (Locale "en_US") "" 211 | c <- cal "America/Los_Angeles" 2000 00 02 03 04 05 212 | return $ I.formatCalendar dfP c 213 | `ioEq` 214 | "January 02, 2000 Anno Domini, 03 o'clock AM, Los Angeles Time" 215 | 216 | ,(flip Cal.getField Cal.Year =<< cal "UTC" 1999 01 02 03 04 05) `ioEq` 1999 217 | 218 | ,(elem "en_US" <$> I.availableLocales) `ioEq` True 219 | 220 | ,(flip I.formatIntegral (12345 :: Int) 221 | <$> I.numberFormatter "precision-integer" (Locale "fr")) 222 | `ioEq` "12\8239\&345" 223 | 224 | ,(flip I.formatDouble 12345.6789 225 | <$> I.numberFormatter "precision-currency-cash currency/EUR" (Locale "it")) 226 | `ioEq` "12.345,68\160€" 227 | 228 | , Test.HUnit.TestLabel "collate" testCases_collate 229 | 230 | ] 231 | <> 232 | concat 233 | [conv "ISO-2022-CN" "程序設計" "\ESC$)A\SO3LPr\ESC$)G]CSS\SI" 234 | ,conv "cp1251" "Привет, мир!" "\207\240\232\226\229\242, \236\232\240!" 235 | ] 236 | where conv n f t = [I.fromUnicode c f ~?= t, I.toUnicode c t ~?= f] 237 | where c = converter n 238 | nosp = T.filter (/= ' ') 239 | cal tz y m d h mn s = do 240 | c <- Cal.calendar tz (Locale "en_US") Cal.TraditionalCalendarType 241 | Cal.setDateTime c y m d h mn s 242 | return c 243 | ioEq io a = Test.HUnit.TestCase $ do 244 | x <- io 245 | x @?= a 246 | oneOf actual expected = Test.HUnit.TestCase $ 247 | unless (actual `elem` expected) $ 248 | E.throwIO $ HUnitFailure Nothing $ ExpectedButGot Nothing 249 | (unlines $ "one of:" : map show expected) (show actual) 250 | 251 | testCases_collate :: Test.HUnit.Test 252 | testCases_collate = Test.HUnit.TestList $ 253 | [ Test.HUnit.TestLabel "invalid format" $ 254 | assertParseError (I.collatorFromRules "& a < <") Err.u_INVALID_FORMAT_ERROR (Just 0) (Just 4) 255 | , Test.HUnit.TestLabel "custom collator" $ Test.HUnit.TestCase $ do 256 | let c = either (error "Can’t create b)) 19 | #endif 20 | 21 | instance Arbitrary T.Text where 22 | arbitrary = T.pack `fmap` arbitrary 23 | shrink = map T.pack . shrink . T.unpack 24 | 25 | instance Arbitrary BS.ByteString where 26 | arbitrary = BS.pack <$> arbitrary 27 | shrink xs = BS.pack <$> shrink (BS.unpack xs) 28 | 29 | instance Arbitrary LocaleName where 30 | arbitrary = elements (Root:available) 31 | 32 | instance Arbitrary NormalizationMode where 33 | arbitrary = elements [NFD .. NFKCCasefold] 34 | 35 | instance Arbitrary Collator where 36 | arbitrary = I.collator <$> arbitrary 37 | 38 | newtype NonEmptyText = NonEmptyText { nonEmptyText :: T.Text } deriving Show 39 | 40 | instance Arbitrary NonEmptyText where 41 | arbitrary = NonEmptyText . T.pack <$> listOf1 arbitrary 42 | 43 | newtype LatinSpoofableText = LatinSpoofableText { latinSpoofableText :: T.Text } 44 | deriving Show 45 | instance Arbitrary LatinSpoofableText where 46 | arbitrary = LatinSpoofableText <$> T.pack . (<>) "latin" <$> 47 | listOf1 genCyrillicLatinSpoofableChar 48 | 49 | genCyrillicLatinSpoofableChar :: Gen Char 50 | genCyrillicLatinSpoofableChar = elements ( 51 | "\x043A\x043E\x0433\x0435\x043A\x043C" ++ 52 | ['\x043E'..'\x0443'] ++ 53 | ['\x0445'..'\x0446'] ++ 54 | "\x044A" ++ 55 | ['\x0454'..'\x0456'] ++ 56 | "\x0458\x045B\x048D\x0491\x0493\x049B\x049F\x04AB\x04AD\x04AF\x04B1\x04BB" ++ 57 | "\x04BD\x04BF" ++ 58 | ['\x04CE'..'\x04CF'] ++ 59 | "\x04D5\x04D9\x04E9\x0501\x0511\x051B\x051D") 60 | 61 | newtype NonSpoofableText = NonSpoofableText { nonSpoofableText :: T.Text } 62 | deriving Show 63 | 64 | instance Arbitrary NonSpoofableText where 65 | arbitrary = NonSpoofableText <$> T.pack <$> listOf1 genNonSpoofableChar 66 | 67 | genNonSpoofableChar :: Gen Char 68 | genNonSpoofableChar = elements "QDFRz" 69 | 70 | newtype Utf8Text = Utf8Text { utf8Text :: BS.ByteString } 71 | deriving Show 72 | 73 | instance Arbitrary Utf8Text where 74 | arbitrary = Utf8Text . BS.pack <$> vectorOf 300 75 | (suchThat 76 | (arbitrary :: Gen Word8) 77 | (`elem` ([0x41..0x5A] ++ [0x61..0x7A])) 78 | ) 79 | -------------------------------------------------------------------------------- /tests/Tests.hs: -------------------------------------------------------------------------------- 1 | module Main (main) where 2 | 3 | import Test.Framework (defaultMain) 4 | 5 | import qualified Properties 6 | 7 | main :: IO () 8 | main = defaultMain [Properties.propertyTests, Properties.testCases] 9 | -------------------------------------------------------------------------------- /text-icu.cabal: -------------------------------------------------------------------------------- 1 | cabal-version: 1.18 2 | -- 1.18 introduced extra-doc-files 3 | name: text-icu 4 | version: 0.8.0.5 5 | synopsis: Bindings to the ICU library 6 | homepage: https://github.com/haskell/text-icu 7 | bug-reports: https://github.com/haskell/text-icu/issues 8 | description: 9 | Haskell bindings to the International Components for Unicode (ICU) 10 | libraries. These libraries provide robust and full-featured Unicode 11 | services on a wide variety of platforms. 12 | . 13 | Features include: 14 | . 15 | * Both pure and impure bindings, to allow for fine control over efficiency 16 | and ease of use. 17 | . 18 | * Breaking of strings on character, word, sentence, and line boundaries. 19 | . 20 | * Access to the Unicode Character Database (UCD) of character metadata. 21 | . 22 | * String collation functions, for locales where the conventions for 23 | lexicographic ordering differ from the simple numeric ordering of 24 | character codes. 25 | . 26 | * Character set conversion functions, allowing conversion between 27 | Unicode and over 220 character encodings. 28 | . 29 | * Unicode normalization. (When implementations keep strings in a 30 | normalized form, they can be assured that equivalent strings have a 31 | unique binary representation.) 32 | . 33 | * Regular expression search and replace. 34 | . 35 | * Security checks for visually confusable (spoofable) strings. 36 | . 37 | * Bidirectional Unicode algorithm 38 | . 39 | * Calendar objects holding dates and times. 40 | . 41 | * Number and calendar formatting. 42 | maintainer: Vladimir Shabanov 43 | copyright: 2009-2015 Bryan O'Sullivan 44 | category: Data, Text 45 | license: BSD3 46 | license-file: LICENSE 47 | build-type: Simple 48 | 49 | extra-doc-files: 50 | README.markdown 51 | changelog.md 52 | 53 | extra-source-files: 54 | benchmarks/Breaker.hs 55 | include/hs_text_icu.h 56 | 57 | tested-with: 58 | GHC == 9.12.1 59 | GHC == 9.10.1 60 | GHC == 9.8.4 61 | GHC == 9.6.6 62 | GHC == 9.4.8 63 | GHC == 9.2.8 64 | GHC == 9.0.2 65 | GHC == 8.10.7 66 | GHC == 8.8.4 67 | GHC == 8.6.5 68 | GHC == 8.4.4 69 | GHC == 8.2.2 70 | GHC == 8.0.2 71 | 72 | flag homebrew 73 | Description: Assume homebrew on macOS. Automatically add /usr/local/opt/ and /opt/homebrew/opt/ paths to extra-lib-dirs and include-dirs. 74 | Default: True 75 | Manual: True 76 | 77 | library 78 | default-language: Haskell98 79 | build-depends: 80 | base >= 4.9 && < 5, 81 | bytestring >= 0.9 && < 0.13, 82 | deepseq >= 1.4.2.0 && < 1.6, 83 | text >= 0.9.1.0 && < 1.3 || >= 2.0 && < 2.2, 84 | time >= 1.5 && < 1.15 85 | pkgconfig-depends: 86 | icu-i18n >= 62.1 87 | 88 | exposed-modules: 89 | Data.Text.ICU 90 | Data.Text.ICU.BiDi 91 | Data.Text.ICU.Calendar 92 | Data.Text.ICU.Break 93 | Data.Text.ICU.Char 94 | Data.Text.ICU.CharsetDetection 95 | Data.Text.ICU.Collate 96 | Data.Text.ICU.Convert 97 | Data.Text.ICU.DateFormatter 98 | Data.Text.ICU.Error 99 | Data.Text.ICU.Locale 100 | Data.Text.ICU.Normalize 101 | Data.Text.ICU.Number 102 | Data.Text.ICU.Normalize2 103 | Data.Text.ICU.NumberFormatter 104 | Data.Text.ICU.Regex 105 | Data.Text.ICU.Shape 106 | Data.Text.ICU.Spoof 107 | Data.Text.ICU.Types 108 | other-modules: 109 | Data.Text.ICU.BiDi.Internal 110 | Data.Text.ICU.BitMask 111 | Data.Text.ICU.Break.Pure 112 | Data.Text.ICU.Break.Types 113 | Data.Text.ICU.CharsetDetection.Internal 114 | Data.Text.ICU.Collate.Internal 115 | Data.Text.ICU.Collate.Pure 116 | Data.Text.ICU.Number.Internal 117 | Data.Text.ICU.Convert.Internal 118 | Data.Text.ICU.Enumerator 119 | Data.Text.ICU.Error.Internal 120 | Data.Text.ICU.Internal 121 | Data.Text.ICU.Iterator 122 | Data.Text.ICU.Normalize.Internal 123 | Data.Text.ICU.Regex.Internal 124 | Data.Text.ICU.Regex.Pure 125 | Data.Text.ICU.Spoof.Internal 126 | Data.Text.ICU.Spoof.Pure 127 | Data.Text.ICU.Text 128 | c-sources: cbits/text_icu.c 129 | cc-options: -Wall -Wextra -pedantic -Wno-deprecated 130 | include-dirs: include 131 | if os(darwin) && flag(homebrew) 132 | extra-lib-dirs: 133 | /usr/local/opt/icu4c/lib 134 | /opt/homebrew/opt/icu4c/lib 135 | include-dirs: 136 | /usr/local/opt/icu4c/include 137 | /opt/homebrew/opt/icu4c/include 138 | extra-libraries: icuuc 139 | if os(mingw32) 140 | extra-libraries: icuin icudt icuio 141 | else 142 | extra-libraries: icui18n icudata 143 | 144 | ghc-options: -Wall 145 | if impl(ghc >= 8.0) 146 | ghc-options: -Wcompat 147 | 148 | test-suite tests 149 | default-language: Haskell98 150 | type: exitcode-stdio-1.0 151 | hs-source-dirs: tests 152 | main-is: Tests.hs 153 | other-modules: Properties QuickCheckUtils 154 | 155 | ghc-options: 156 | -Wall -threaded -rtsopts 157 | 158 | build-depends: 159 | HUnit >= 1.2, 160 | QuickCheck >= 2.4, 161 | array, 162 | base, 163 | bytestring, 164 | deepseq, 165 | directory, 166 | ghc-prim, 167 | random, 168 | test-framework >= 0.4, 169 | test-framework-hunit >= 0.2, 170 | test-framework-quickcheck2 >= 0.2, 171 | text, 172 | text-icu 173 | if impl(ghc <= 8.2) 174 | build-depends: semigroups 175 | 176 | source-repository head 177 | type: git 178 | location: https://github.com/haskell/text-icu 179 | --------------------------------------------------------------------------------