├── src-bench ├── cabal.project ├── Bench.hs └── text-short-bench.cabal ├── .gitignore ├── cabal.project ├── src-ghc804 └── PrimOps.hs ├── cbits ├── memcmp.c └── cbits.c ├── cabal.haskell-ci ├── src └── Data │ └── Text │ ├── Short │ ├── Unsafe.hs │ ├── Partial.hs │ └── Internal.hs │ └── Short.hs ├── LICENSE ├── text-short.cabal ├── ChangeLog.md ├── .github └── workflows │ └── haskell-ci.yml └── src-test └── Tests.hs /src-bench/cabal.project: -------------------------------------------------------------------------------- 1 | packages: . 2 | -- benchmarks: True 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /dist-newstyle/ 2 | /dist/ 3 | *~ 4 | .ghc.environment.* 5 | -------------------------------------------------------------------------------- /cabal.project: -------------------------------------------------------------------------------- 1 | packages: . 2 | tests: True 3 | 4 | package text-short 5 | flags: +asserts 6 | -------------------------------------------------------------------------------- /src-ghc804/PrimOps.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MagicHash #-} 2 | {-# LANGUAGE Unsafe #-} 3 | 4 | module PrimOps ( compareByteArrays# ) where 5 | 6 | import GHC.Exts (compareByteArrays#) 7 | -------------------------------------------------------------------------------- /cbits/memcmp.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int 4 | hs_text_short_memcmp(const void *s1, const size_t s1ofs, const void *s2, const size_t s2ofs, const size_t n) 5 | { 6 | if (!n) return 0; 7 | 8 | const void *s1_ = s1+s1ofs; 9 | const void *s2_ = s2+s2ofs; 10 | 11 | return (s1_ == s2_) ? 0 : memcmp(s1_, s2_, n); 12 | } 13 | -------------------------------------------------------------------------------- /cabal.haskell-ci: -------------------------------------------------------------------------------- 1 | branches: master 2 | head-hackage: False 3 | 4 | constraint-set bytestring-0.11 5 | constraints: bytestring ==0.11.* 6 | ghc: <9.7 7 | tests: True 8 | run-tests: True 9 | 10 | constraint-set text-1 11 | ghc: <9.3 12 | constraints: text ^>=1.2.3.0 13 | tests: True 14 | run-tests: True 15 | 16 | constraint-set text-2 17 | ghc: >= 8.0 && <9.7 18 | constraints: text ^>=2 19 | tests: True 20 | run-tests: True 21 | -------------------------------------------------------------------------------- /src/Data/Text/Short/Unsafe.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE Unsafe #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Short.Unsafe 5 | -- Copyright : © Herbert Valerio Riedel 2017 6 | -- License : BSD3 7 | -- 8 | -- Maintainer : hvr@gnu.org 9 | -- Stability : stable 10 | -- 11 | -- Unsafe API 12 | -- 13 | -- This module provides unsafe conversion functions 14 | module Data.Text.Short.Unsafe 15 | ( fromShortByteStringUnsafe 16 | , fromByteStringUnsafe 17 | ) where 18 | 19 | import Data.Text.Short.Internal 20 | import Prelude () 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017, Herbert Valerio Riedel 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Herbert Valerio Riedel nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /src/Data/Text/Short/Partial.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE Trustworthy #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Short.Partial 5 | -- Copyright : © Herbert Valerio Riedel 2018 6 | -- License : BSD3 7 | -- 8 | -- Maintainer : hvr@gnu.org 9 | -- Stability : stable 10 | -- 11 | -- Partial functions vocabulary 12 | -- 13 | -- This module provides common partial functions for operating on 'ShortText'. 14 | -- 15 | -- The use of these functions is discouraged as they tend to be error-prone. 16 | -- 17 | -- @since 0.1.2 18 | module Data.Text.Short.Partial 19 | ( head 20 | , tail 21 | , init 22 | , last 23 | , index 24 | 25 | , foldl1 26 | , foldl1' 27 | , foldr1 28 | ) where 29 | 30 | import Data.Text.Short 31 | import Data.Text.Short.Internal 32 | import Prelude () 33 | 34 | -- | \(\mathcal{O}(1)\) Returns first character of a non-empty 'ShortText' 35 | -- 36 | -- >>> head "abcd" 37 | -- 'a' 38 | -- 39 | -- __Note__: Will throw an 'error' exception for empty 'ShortText's. 40 | -- Consider using the total functions 'uncons' or 'indexMaybe' 41 | -- instead. 42 | -- 43 | -- @since 0.1.2 44 | head :: ShortText -> Char 45 | head = maybe (error "head: empty ShortText") fst . uncons 46 | 47 | -- | \(\mathcal{O}(n)\) Drop first character from non-empty 'ShortText'. 48 | -- 49 | -- >>> tail "abcd" 50 | -- "bcd" 51 | -- 52 | -- __Note__: Will throw an 'error' exception for empty 'ShortText's. 53 | -- Consider using the total functions 'uncons' or 'drop' instead. 54 | -- 55 | -- @since 0.1.2 56 | tail :: ShortText -> ShortText 57 | tail = maybe (error "tail: empty ShortText") snd . uncons 58 | 59 | -- | \(\mathcal{O}(n)\) Drop last character from non-empty 'ShortText'. 60 | -- 61 | -- >>> tail "abcd" 62 | -- "bcd" 63 | -- 64 | -- __Note__: Will throw an 'error' exception for empty 'ShortText's. 65 | -- Consider using the total functions 'unsnoc' or 'dropEnd' instead. 66 | -- 67 | -- @since 0.1.2 68 | init :: ShortText -> ShortText 69 | init = maybe (error "init: empty ShortText") fst . unsnoc 70 | 71 | -- | \(\mathcal{O}(1)\) Return last character from non-empty 'ShortText'. 72 | -- 73 | -- >>> last "abcd" 74 | -- 'd' 75 | -- 76 | -- __Note__: Will throw an 'error' exception for empty 'ShortText's. 77 | -- Consider using the total functions 'unsnoc' or 'indexEndMaybe' 78 | -- instead. 79 | -- 80 | -- @since 0.1.2 81 | last :: ShortText -> Char 82 | last = maybe (error "last: empty ShortText") snd . unsnoc 83 | 84 | -- | \(\mathcal{O}(n)\) Retrieve \(i\)-th character (code-point) 85 | -- 86 | -- >>> index "abcd" 1 87 | -- 'b' 88 | -- 89 | -- __Note__: Will throw an 'error' exception if index is out of 90 | -- bounds. Consider using the total functions 'indexMaybe' or 91 | -- 'indexEndMaybe' instead. 92 | -- 93 | -- @since 0.1.2 94 | index :: ShortText -> Int -> Char 95 | index st i = case indexMaybe st i of 96 | Nothing -> error "index: not within ShortText" 97 | Just c -> c 98 | 99 | -- $setup 100 | -- >>> :set -XOverloadedStrings 101 | -------------------------------------------------------------------------------- /src-bench/Bench.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | 3 | import Criterion.Main 4 | 5 | import Control.Exception 6 | import qualified Data.Text as T 7 | import qualified Data.Text.Encoding as T 8 | import qualified Data.Text.Short.Internal as IUT 9 | 10 | -- The function we're benchmarking. 11 | fib m | m < 0 = error "negative!" 12 | | otherwise = go m 13 | where 14 | go 0 = 0 15 | go 1 = 1 16 | go n = go (n-1) + go (n-2) 17 | 18 | {-# NOINLINE allcharsBS1 #-} 19 | allcharsBS1 = T.encodeUtf8 (T.pack ['\x0'..'\x7f']) 20 | 21 | {-# NOINLINE allcharsST1 #-} 22 | Just allcharsST1 = IUT.fromByteString allcharsBS1 23 | 24 | {-# NOINLINE allcharsST1' #-} 25 | Just allcharsST1' = IUT.fromByteString (IUT.toByteString allcharsST1) 26 | 27 | {-# NOINLINE allcharsBS2 #-} 28 | allcharsBS2 = T.encodeUtf8 (T.pack $ ['\x0'..'\xd7ff'] ++ ['\xe000'..'\x10ffff']) 29 | 30 | {-# NOINLINE allcharsST2 #-} 31 | Just allcharsST2 = IUT.fromByteString allcharsBS2 32 | 33 | {-# NOINLINE allcharsST2' #-} 34 | Just allcharsST2' = IUT.fromByteString (IUT.toByteString allcharsST2) 35 | 36 | {-# NOINLINE allAscii128K #-} 37 | allAscii128K = mconcat (replicate 1024 allcharsST1) 38 | 39 | -- Our benchmark harness. 40 | main = do 41 | evaluate allcharsST1 42 | evaluate allcharsST1' 43 | evaluate allcharsST2 44 | evaluate allcharsST2' 45 | evaluate allAscii128K 46 | 47 | defaultMain 48 | [ bgroup "singleton" 49 | [ bench "'a' :: ShortText" $ whnf IUT.singleton 'a' 50 | , bench "'a' :: Text" $ whnf (T.singleton) 'a' 51 | , bench "U+10FFFF :: ShortText" $ whnf (IUT.singleton) '\x10ffff' 52 | , bench "U+10FFFF :: Text" $ whnf (T.singleton) '\x10ffff' 53 | ] 54 | , bgroup "toString" 55 | [ bench "t1" $ nf IUT.toString allcharsST1 56 | , bench "t2" $ nf IUT.toString allcharsST2 57 | , bench "t3" $ nf IUT.toString allAscii128K 58 | , bench "t1 (Text)" $ nf T.unpack (IUT.toText allcharsST1) 59 | , bench "t2 (Text)" $ nf T.unpack (IUT.toText allcharsST2) 60 | , bench "t3 (Text)" $ nf T.unpack (IUT.toText allAscii128K) 61 | ] 62 | 63 | , bgroup "length" 64 | [ bench "1" $ whnf IUT.length allcharsST1 65 | , bench "2" $ whnf IUT.length allcharsST2 66 | , bench "3" $ whnf IUT.length allAscii128K 67 | ] 68 | 69 | , bgroup "==" 70 | [ bench "== 1a" $ whnf (== allcharsST1) allcharsST1 71 | , bench "== 1b" $ whnf (== allcharsST1) allcharsST1' 72 | , bench "== 2a" $ whnf (== allcharsST2) allcharsST2 73 | , bench "== 2b" $ whnf (== allcharsST2) allcharsST2' 74 | ] 75 | 76 | , bgroup "isAscii" 77 | [ bench "isAscii 1" $ whnf IUT.isAscii allcharsST1 78 | , bench "isAscii 2" $ whnf IUT.isAscii allcharsST2 79 | , bench "isAscii 3" $ whnf IUT.isAscii allAscii128K 80 | ] 81 | 82 | , bgroup "isValidUtf8" 83 | [ bench "isValidUtf8 1" $ whnf IUT.isValidUtf8 allcharsST1 84 | , bench "fromByteString 1" $ whnf IUT.fromByteString allcharsBS1 85 | 86 | , bench "isValidUtf8 2" $ whnf IUT.isValidUtf8 allcharsST2 87 | , bench "fromByteString 2" $ whnf IUT.fromByteString allcharsBS2 88 | ] 89 | ] 90 | 91 | -------------------------------------------------------------------------------- /text-short.cabal: -------------------------------------------------------------------------------- 1 | cabal-version: 1.18 2 | name: text-short 3 | version: 0.1.6 4 | x-revision: 3 5 | synopsis: Memory-efficient representation of Unicode text strings 6 | license: BSD3 7 | license-file: LICENSE 8 | author: Herbert Valerio Riedel 9 | maintainer: hvr@gnu.org 10 | bug-reports: https://github.com/hvr/text-short/issues 11 | category: Data 12 | build-type: Simple 13 | description: 14 | This package provides the 'ShortText' type which is suitable for keeping many short strings in memory. This is similiar to how 'ShortByteString' relates to 'ByteString'. 15 | . 16 | The main difference between 'Text' and 'ShortText' is that 'ShortText' doesn't support zero-copy slicing (thereby saving 2 words), and, compared to text-1.*, that it uses UTF-8 instead of UTF-16 internally. Consequently, the memory footprint of a (boxed) 'ShortText' value is 4 words (2 words when unboxed) plus the length of the UTF-8 encoded payload. 17 | 18 | tested-with: 19 | GHC ==8.6.5 20 | || ==8.8.3 21 | || ==8.10.7 22 | || ==9.0.2 23 | || ==9.2.8 24 | || ==9.4.8 25 | || ==9.6.6 26 | || ==9.8.4 27 | || ==9.10.1 28 | || ==9.12.1 29 | 30 | extra-source-files: ChangeLog.md 31 | 32 | source-repository head 33 | type: git 34 | location: https://github.com/hvr/text-short.git 35 | 36 | flag asserts 37 | description: Enable runtime-checks via @assert@ 38 | default: False 39 | manual: True 40 | 41 | library 42 | exposed-modules: 43 | Data.Text.Short 44 | Data.Text.Short.Partial 45 | Data.Text.Short.Unsafe 46 | 47 | other-modules: Data.Text.Short.Internal 48 | build-depends: 49 | base >=4.12 && <4.22 50 | , binary >=0.8.6.0 && <0.9 51 | , bytestring >=0.10.8.2 && <0.13 52 | , deepseq >=1.4.4.0 && <1.6 53 | , ghc-prim >=0.5.3 && <0.14 54 | , hashable >=1.4.4.0 && <1.6 55 | , template-haskell >=2.14.0.0 && <2.24 56 | , text >=1.2.3.1 && <1.3 || >=2.0 && <2.2 57 | 58 | other-modules: PrimOps 59 | hs-source-dirs: src src-ghc804 60 | default-language: Haskell2010 61 | other-extensions: 62 | CPP 63 | GeneralizedNewtypeDeriving 64 | MagicHash 65 | TemplateHaskellQuotes 66 | Trustworthy 67 | UnliftedFFITypes 68 | Unsafe 69 | 70 | c-sources: cbits/cbits.c 71 | 72 | if flag(asserts) 73 | ghc-options: -fno-ignore-asserts 74 | 75 | else 76 | cc-options: -DNDEBUG=1 77 | 78 | ghc-options: -Wall 79 | cc-options: -Wall 80 | 81 | test-suite text-short-tests 82 | type: exitcode-stdio-1.0 83 | hs-source-dirs: src-test 84 | main-is: Tests.hs 85 | 86 | -- bytestring dependency for cabal_macros.h 87 | build-depends: 88 | base 89 | , binary 90 | , bytestring 91 | , template-haskell 92 | , text 93 | , text-short 94 | 95 | -- deps which don't inherit constraints from library stanza: 96 | build-depends: 97 | tasty >=1.4 && <1.6 98 | , tasty-hunit >=0.10.0 && <0.11 99 | , tasty-quickcheck >=0.10 && <0.12 100 | 101 | default-language: Haskell2010 102 | -------------------------------------------------------------------------------- /src-bench/text-short-bench.cabal: -------------------------------------------------------------------------------- 1 | cabal-version: 2.0 2 | 3 | name: text-short-bench 4 | version: 0.1.2 5 | synopsis: Memory-efficient representation of Unicode text strings 6 | license: BSD3 7 | license-file: LICENSE 8 | author: Herbert Valerio Riedel 9 | maintainer: hvr@gnu.org 10 | bug-reports: https://github.com/hvr/text-short/issues 11 | category: Data 12 | build-type: Simple 13 | description: This package provides the 'ShortText' type which is suitable for keeping many short strings in memory. This is similiar to how 'ShortByteString' relates to 'ByteString'. 14 | . 15 | The main difference between 'Text' and 'ShortText' is that 'ShortText' uses UTF-8 instead of UTF-16 internally and also doesn't support slicing (thereby saving 2 words). Consequently, the memory footprint of a (boxed) 'ShortText' value is 4 words (2 words when unboxed) plus the length of the UTF-8 encoded payload. 16 | 17 | tested-with: GHC==8.2.2, GHC==8.0.2, GHC==7.10.3, GHC==7.8.4 18 | extra-source-files: ChangeLog.md 19 | 20 | Source-Repository head 21 | Type: git 22 | Location: https://github.com/hvr/text-short.git 23 | 24 | flag asserts 25 | description: Enable runtime-checks via @assert@ 26 | default: False 27 | manual: True 28 | 29 | library text-short-iut 30 | exposed-modules: Data.Text.Short 31 | Data.Text.Short.Partial 32 | Data.Text.Short.Unsafe 33 | 34 | exposed-modules: Data.Text.Short.Internal 35 | 36 | build-depends: base >= 4.7 && < 4.11 37 | , bytestring >= 0.10.4 && < 0.11 38 | , hashable >= 1.2.6 && < 1.3 39 | , deepseq >= 1.3 && < 1.5 40 | , text >= 1.0 && < 1.3 41 | , binary >= 0.7.1 && < 0.9 42 | , ghc-prim >= 0.3.1 && < 0.6 43 | 44 | if !impl(ghc >= 8.0) 45 | build-depends: semigroups >= 0.18.2 && < 0.19 46 | 47 | -- GHC version specific PrimOps 48 | if impl(ghc >= 8.4) 49 | hs-source-dirs: ../src-ghc804 50 | else 51 | c-sources: ../cbits/memcmp.c 52 | hs-source-dirs: ../src-ghc708 53 | other-modules: PrimOps 54 | 55 | hs-source-dirs: ../src 56 | 57 | default-language: Haskell2010 58 | other-extensions: CPP 59 | , GeneralizedNewtypeDeriving 60 | , MagicHash 61 | , UnliftedFFITypes 62 | , Trustworthy 63 | , Unsafe 64 | 65 | c-sources: ../cbits/cbits.c 66 | 67 | if flag(asserts) 68 | ghc-options: -fno-ignore-asserts 69 | else 70 | cc-options: -DNDEBUG=1 71 | 72 | ghc-options: -Wall 73 | cc-options: -O3 -Wall 74 | 75 | executable text-short-bench 76 | default-language: Haskell2010 77 | -- type: exitcode-stdio-1.0 78 | hs-source-dirs: . 79 | main-is: Bench.hs 80 | 81 | build-depends: base 82 | , binary 83 | , text 84 | , text-short-iut 85 | -- deps which don't inherit constraints from library stanza: 86 | , criterion >= 1.3.0.0 && < 1.4 87 | -------------------------------------------------------------------------------- /ChangeLog.md: -------------------------------------------------------------------------------- 1 | ## 0.1.6 2 | 3 | * Drop support for GHC prior 8.6.5 4 | * Support GHC-9.10 (base-4.21) 5 | 6 | ## 0.1.5 7 | 8 | * text-2.0 support 9 | 10 | ## 0.1.4 11 | 12 | * Fix `fromString` for single character strings. 13 | https://github.com/haskell-hvr/text-short/issues/20 14 | * Add Template Haskell `Lift ShortText` instance. 15 | 16 | ## 0.1.3 17 | 18 | * Add `Data ShortText` instance 19 | * Define `Typeable ShortText` also for GHC 7.8 as well 20 | (NB: for GHC 7.10.3 and up `Typeable` instances are automatically 21 | defined even when not mentioned explicitly in a `deriving` clause) 22 | * Add equivalent verb `Data.Text.split` to `Data.Text.Short` API 23 | 24 | split :: (Char -> Bool) -> ShortText -> [ShortText] 25 | 26 | ## 0.1.2 27 | 28 | * Add `IsList ShortText` and `PrintfArg ShortText` instances 29 | * Expose partial functions via new `Data.Text.Short.Partial` module 30 | 31 | foldl1 :: (Char -> Char -> Char) -> ShortText -> Char 32 | foldl1' :: (Char -> Char -> Char) -> ShortText -> Char 33 | foldr1 :: (Char -> Char -> Char) -> ShortText -> Char 34 | head :: ShortText -> Char 35 | index :: ShortText -> Int -> Char 36 | init :: ShortText -> ShortText 37 | last :: ShortText -> Char 38 | tail :: ShortText -> ShortText 39 | 40 | * Add several `Data.Text` verbs to `Data.Text.Short` API 41 | 42 | (!?) :: ShortText -> Int -> Maybe Char 43 | all :: (Char -> Bool) -> ShortText -> Bool 44 | any :: (Char -> Bool) -> ShortText -> Bool 45 | append :: ShortText -> ShortText -> ShortText 46 | break :: (Char -> Bool) -> ShortText -> (ShortText, ShortText) 47 | breakEnd :: (Char -> Bool) -> ShortText -> (ShortText, ShortText) 48 | concat :: [ShortText] -> ShortText 49 | cons :: Char -> ShortText -> ShortText 50 | drop :: Int -> ShortText -> ShortText 51 | dropAround :: (Char -> Bool) -> ShortText -> ShortText 52 | dropEnd :: Int -> ShortText -> ShortText 53 | dropWhile :: (Char -> Bool) -> ShortText -> ShortText 54 | dropWhileEnd :: (Char -> Bool) -> ShortText -> ShortText 55 | empty :: ShortText 56 | filter :: (Char -> Bool) -> ShortText -> ShortText 57 | find :: (Char -> Bool) -> ShortText -> Maybe Char 58 | findIndex :: (Char -> Bool) -> ShortText -> Maybe Int 59 | foldl :: (a -> Char -> a) -> a -> ShortText -> a 60 | foldl' :: (a -> Char -> a) -> a -> ShortText -> a 61 | foldr :: (Char -> a -> a) -> a -> ShortText -> a 62 | indexEndMaybe :: ShortText -> Int -> Maybe Char 63 | indexMaybe :: ShortText -> Int -> Maybe Char 64 | intercalate :: ShortText -> [ShortText] -> ShortText 65 | intersperse :: Char -> ShortText -> ShortText 66 | isPrefixOf :: ShortText -> ShortText -> Bool 67 | isSuffixOf :: ShortText -> ShortText -> Bool 68 | pack :: [Char] -> ShortText 69 | replicate :: Int -> ShortText -> ShortText 70 | reverse :: ShortText -> ShortText 71 | singleton :: Char -> ShortText 72 | snoc :: ShortText -> Char -> ShortText 73 | span :: (Char -> Bool) -> ShortText -> (ShortText, ShortText) 74 | spanEnd :: (Char -> Bool) -> ShortText -> (ShortText, ShortText) 75 | splitAt :: Int -> ShortText -> (ShortText, ShortText) 76 | splitAtEnd :: Int -> ShortText -> (ShortText, ShortText) 77 | stripPrefix :: ShortText -> ShortText -> Maybe ShortText 78 | stripSuffix :: ShortText -> ShortText -> Maybe ShortText 79 | take :: Int -> ShortText -> ShortText 80 | takeEnd :: Int -> ShortText -> ShortText 81 | takeWhile :: (Char -> Bool) -> ShortText -> ShortText 82 | takeWhileEnd :: (Char -> Bool) -> ShortText -> ShortText 83 | uncons :: ShortText -> Maybe (Char, ShortText) 84 | unpack :: ShortText -> [Char] 85 | unsnoc :: ShortText -> Maybe (ShortText, Char) 86 | 87 | * Optimise low-level primitives 88 | * Add support for GHC 8.4 89 | 90 | ## 0.1.1 91 | 92 | * Expose *unsafe* conversion API via `Data.Text.Short.Unsafe` module 93 | * Minor documentation improvement 94 | 95 | ## 0.1 96 | 97 | * First version. Released on an unsuspecting world. 98 | -------------------------------------------------------------------------------- /src/Data/Text/Short.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE Trustworthy #-} 2 | -- | 3 | -- Module : Data.Text.Short 4 | -- Copyright : © Herbert Valerio Riedel 2017 5 | -- License : BSD3 6 | -- 7 | -- Maintainer : hvr@gnu.org 8 | -- Stability : stable 9 | -- 10 | -- Memory-efficient representation of Unicode text strings. 11 | -- 12 | -- This module is intended to be imported @qualified@, to avoid name 13 | -- clashes with "Prelude" functions, e.g. 14 | -- 15 | -- > import qualified Data.Text.Short as TS 16 | -- > import qualified Data.Text.Short (ShortText) 17 | -- 18 | -- This modules deliberately omits (common) partial functions, which 19 | -- can be found in "Data.Text.Short.Partial" instead. 20 | -- 21 | -- @since 0.1 22 | module Data.Text.Short 23 | ( -- * The 'ShortText' type 24 | ShortText 25 | 26 | -- * Basic operations 27 | -- ** Construction 28 | , empty 29 | , singleton 30 | , pack 31 | , append 32 | , concat 33 | , cons 34 | , snoc 35 | , replicate 36 | 37 | -- ** Deconstruction 38 | , unpack 39 | , uncons 40 | , unsnoc 41 | 42 | -- ** Querying & predicates 43 | , null 44 | , length 45 | , isAscii 46 | , all 47 | , any 48 | , find 49 | , isPrefixOf 50 | , isSuffixOf 51 | 52 | -- ** Lookup & indexing 53 | , (!?) 54 | , indexMaybe 55 | , indexEndMaybe 56 | , findIndex 57 | 58 | -- * Splitting 'ShortText's 59 | -- ** Basic functions 60 | , take 61 | , takeEnd 62 | , drop 63 | , dropEnd 64 | , takeWhile 65 | , takeWhileEnd 66 | , dropWhile 67 | , dropWhileEnd 68 | 69 | , dropAround 70 | 71 | -- ** Pair-valued functions 72 | , splitAt 73 | , splitAtEnd 74 | , span 75 | , break 76 | , spanEnd 77 | , breakEnd 78 | 79 | -- ** Breaking into many substrings 80 | , split 81 | 82 | -- ** Suffix & Prefix operations 83 | , stripPrefix 84 | , stripSuffix 85 | 86 | -- * Transformations 87 | , intersperse 88 | , intercalate 89 | , reverse 90 | , filter 91 | 92 | -- * Folds 93 | , foldl 94 | , foldl' 95 | , foldr 96 | 97 | -- * Conversions 98 | -- ** 'String' 99 | , fromString 100 | , toString 101 | 102 | -- ** 'Text' 103 | , fromText 104 | , toText 105 | 106 | -- ** 'ByteString' 107 | , fromShortByteString 108 | , toShortByteString 109 | 110 | , fromByteString 111 | , toByteString 112 | 113 | , toBuilder 114 | 115 | ) where 116 | 117 | import Data.Semigroup 118 | import Data.Text.Short.Internal 119 | import Prelude () 120 | 121 | -- | \(\mathcal{O}(n)\) Variant of 'span' with negated predicate. 122 | -- 123 | -- >>> break (> 'c') "abcdabcd" 124 | -- ("abc","dabcd") 125 | -- 126 | -- prop> break p t == span (not . p) t 127 | -- 128 | -- prop> fst (break p t) <> snd (break p t) == t 129 | -- 130 | -- @since 0.1.2 131 | break :: (Char -> Bool) -> ShortText -> (ShortText,ShortText) 132 | break p st = span (not . p) st 133 | 134 | -- | \(\mathcal{O}(n)\) Variant of 'spanEnd' with negated predicate. 135 | -- 136 | -- >>> breakEnd (< 'c') "abcdabcd" 137 | -- ("abcdab","cd") 138 | -- 139 | -- prop> breakEnd p t == spanEnd (not . p) t 140 | -- 141 | -- prop> fst (breakEnd p t) <> snd (breakEnd p t) == t 142 | -- 143 | -- @since 0.1.2 144 | breakEnd :: (Char -> Bool) -> ShortText -> (ShortText,ShortText) 145 | breakEnd p st = spanEnd (not . p) st 146 | 147 | -- | \(\mathcal{O}(n)\) Index /i/-th code-point in 'ShortText'. 148 | -- 149 | -- Infix operator alias of 'indexMaybe' 150 | -- 151 | -- >>> "abcdefg" !? 2 152 | -- Just 'c' 153 | -- 154 | -- @since 0.1.2 155 | (!?) :: ShortText -> Int -> Maybe Char 156 | (!?) = indexMaybe 157 | 158 | -- | \(\mathcal{O}(n)\) Test whether /any/ code points in 'ShortText' satisfy a predicate. 159 | -- 160 | -- >>> any (> 'c') "abcdabcd" 161 | -- True 162 | -- 163 | -- >>> any (const True) "" 164 | -- False 165 | -- 166 | -- >>> any (== 'c') "abdabd" 167 | -- False 168 | -- 169 | -- prop> any p t == not (all (not . p) t) 170 | -- 171 | -- @since 0.1.2 172 | any :: (Char -> Bool) -> ShortText -> Bool 173 | any p st = case find p st of 174 | Nothing -> False 175 | Just _ -> True 176 | 177 | -- | \(\mathcal{O}(n)\) Concatenate two 'ShortText's 178 | -- 179 | -- This is a type-specialised alias of '<>'. 180 | -- 181 | -- >>> append "foo" "bar" 182 | -- "foobar" 183 | -- 184 | -- prop> length (append t1 t2) == length t1 + length t2 185 | -- 186 | -- @since 0.1.2 187 | append :: ShortText -> ShortText -> ShortText 188 | append = (<>) 189 | 190 | -- | \(\mathcal{O}(n)\) Concatenate list of 'ShortText's 191 | -- 192 | -- This is a type-specialised alias of 'mconcat'. 193 | -- 194 | -- >>> concat [] 195 | -- "" 196 | -- 197 | -- >>> concat ["foo","bar","doo"] 198 | -- "foobardoo" 199 | -- 200 | -- @since 0.1.2 201 | concat :: [ShortText] -> ShortText 202 | concat = mconcat 203 | 204 | -- | \(\mathcal{O}(0)\) The /empty/ 'ShortText'. 205 | -- 206 | -- This is a type-specialised alias of 'mempty'. 207 | -- 208 | -- >>> empty 209 | -- "" 210 | -- 211 | -- >>> null empty 212 | -- True 213 | -- 214 | -- @since 0.1.2 215 | empty :: ShortText 216 | empty = mempty 217 | 218 | -- | \(\mathcal{O}(n)\) Construct a 'ShortText' from a list of 'Char's. 219 | -- 220 | -- This is an alias for 'fromString'. 221 | -- 222 | -- @since 0.1.2 223 | pack :: [Char] -> ShortText 224 | pack = fromString 225 | 226 | -- | \(\mathcal{O}(n)\) Convert 'ShortText' into a list of 'Char's. 227 | -- 228 | -- This is an alias for 'toString'. 229 | -- 230 | -- prop> (pack . unpack) t == t 231 | -- 232 | -- @since 0.1.2 233 | unpack :: ShortText -> [Char] 234 | unpack = toString 235 | 236 | -- | \(\mathcal{O}(n)\) Take prefix of given length or return whole 'ShortText' if too short. 237 | -- 238 | -- >>> take 3 "abcdef" 239 | -- "abc" 240 | -- 241 | -- >>> take 3 "ab" 242 | -- "ab" 243 | -- 244 | -- @since 0.1.2 245 | take :: Int -> ShortText -> ShortText 246 | take n = fst . splitAt n 247 | 248 | -- | \(\mathcal{O}(n)\) Take suffix of given length or return whole 'ShortText' if too short. 249 | -- 250 | -- >>> takeEnd 3 "abcdefg" 251 | -- "efg" 252 | -- 253 | -- >>> takeEnd 3 "ab" 254 | -- "ab" 255 | -- 256 | -- @since 0.1.2 257 | takeEnd :: Int -> ShortText -> ShortText 258 | takeEnd n = snd . splitAtEnd n 259 | 260 | -- | \(\mathcal{O}(n)\) Take remove prefix of given length from 'ShortText' or return 'empty' 'ShortText' if too short. 261 | -- 262 | -- >>> drop 4 "abcdef" 263 | -- "ef" 264 | -- 265 | -- >>> drop 4 "ab" 266 | -- "" 267 | -- 268 | -- @since 0.1.2 269 | drop :: Int -> ShortText -> ShortText 270 | drop n = snd . splitAt n 271 | 272 | -- | \(\mathcal{O}(n)\) Take remove suffix of given length from 'ShortText' or return 'empty' 'ShortText' if too short. 273 | -- 274 | -- >>> drop 4 "abcdefghi" 275 | -- "efghi" 276 | -- 277 | -- >>> drop 4 "ab" 278 | -- "" 279 | -- 280 | -- @since 0.1.2 281 | dropEnd :: Int -> ShortText -> ShortText 282 | dropEnd n = fst . splitAtEnd n 283 | 284 | -- | \(\mathcal{O}(n)\) Take longest prefix satisfying given predicate. 285 | -- 286 | -- prop> takeWhile p t == fst (span p t) 287 | -- 288 | -- >>> takeWhile (< 'c') "abcdabcd" 289 | -- "ab" 290 | -- 291 | -- @since 0.1.2 292 | takeWhile :: (Char -> Bool) -> ShortText -> ShortText 293 | takeWhile p = fst . span p 294 | 295 | -- | \(\mathcal{O}(n)\) Take longest suffix satisfying given predicate. 296 | -- 297 | -- prop> takeWhileEnd p t == snd (spanEnd p t) 298 | -- 299 | -- >>> takeWhileEnd (>= 'c') "abcdabcd" 300 | -- "cd" 301 | -- 302 | -- @since 0.1.2 303 | takeWhileEnd :: (Char -> Bool) -> ShortText -> ShortText 304 | takeWhileEnd p = snd . spanEnd p 305 | 306 | -- | \(\mathcal{O}(n)\) Remove longest prefix satisfying given predicate. 307 | -- 308 | -- prop> dropWhile p t == snd (span p t) 309 | -- 310 | -- >>> dropWhile (< 'c') "abcdabcd" 311 | -- "cdabcd" 312 | -- 313 | -- @since 0.1.2 314 | dropWhile :: (Char -> Bool) -> ShortText -> ShortText 315 | dropWhile p = snd . span p 316 | 317 | -- | \(\mathcal{O}(n)\) Remove longest suffix satisfying given predicate. 318 | -- 319 | -- prop> dropWhileEnd p t == fst (spanEnd p t) 320 | -- 321 | -- >>> dropWhileEnd (>= 'c') "abcdabcd" 322 | -- "abcdab" 323 | -- 324 | -- @since 0.1.2 325 | dropWhileEnd :: (Char -> Bool) -> ShortText -> ShortText 326 | dropWhileEnd p = fst . spanEnd p 327 | 328 | 329 | -- $setup 330 | -- >>> :set -XOverloadedStrings 331 | -- >>> import Text.Show.Functions () 332 | -- >>> import qualified Test.QuickCheck.Arbitrary as QC 333 | -- >>> instance QC.Arbitrary ShortText where { arbitrary = fmap fromString QC.arbitrary } 334 | -------------------------------------------------------------------------------- /.github/workflows/haskell-ci.yml: -------------------------------------------------------------------------------- 1 | # This GitHub workflow config has been generated by a script via 2 | # 3 | # haskell-ci 'github' 'cabal.project' 4 | # 5 | # To regenerate the script (for example after adjusting tested-with) run 6 | # 7 | # haskell-ci regenerate 8 | # 9 | # For more information, see https://github.com/haskell-CI/haskell-ci 10 | # 11 | # version: 0.19.20241220 12 | # 13 | # REGENDATA ("0.19.20241220",["github","cabal.project"]) 14 | # 15 | name: Haskell-CI 16 | on: 17 | push: 18 | branches: 19 | - master 20 | pull_request: 21 | branches: 22 | - master 23 | jobs: 24 | linux: 25 | name: Haskell-CI - Linux - ${{ matrix.compiler }} 26 | runs-on: ubuntu-20.04 27 | timeout-minutes: 28 | 60 29 | container: 30 | image: buildpack-deps:jammy 31 | continue-on-error: ${{ matrix.allow-failure }} 32 | strategy: 33 | matrix: 34 | include: 35 | - compiler: ghc-9.12.1 36 | compilerKind: ghc 37 | compilerVersion: 9.12.1 38 | setup-method: ghcup 39 | allow-failure: false 40 | - compiler: ghc-9.10.1 41 | compilerKind: ghc 42 | compilerVersion: 9.10.1 43 | setup-method: ghcup 44 | allow-failure: false 45 | - compiler: ghc-9.8.4 46 | compilerKind: ghc 47 | compilerVersion: 9.8.4 48 | setup-method: ghcup 49 | allow-failure: false 50 | - compiler: ghc-9.6.6 51 | compilerKind: ghc 52 | compilerVersion: 9.6.6 53 | setup-method: ghcup 54 | allow-failure: false 55 | - compiler: ghc-9.4.8 56 | compilerKind: ghc 57 | compilerVersion: 9.4.8 58 | setup-method: ghcup 59 | allow-failure: false 60 | - compiler: ghc-9.2.8 61 | compilerKind: ghc 62 | compilerVersion: 9.2.8 63 | setup-method: ghcup 64 | allow-failure: false 65 | - compiler: ghc-9.0.2 66 | compilerKind: ghc 67 | compilerVersion: 9.0.2 68 | setup-method: ghcup 69 | allow-failure: false 70 | - compiler: ghc-8.10.7 71 | compilerKind: ghc 72 | compilerVersion: 8.10.7 73 | setup-method: ghcup 74 | allow-failure: false 75 | - compiler: ghc-8.8.3 76 | compilerKind: ghc 77 | compilerVersion: 8.8.3 78 | setup-method: ghcup 79 | allow-failure: false 80 | - compiler: ghc-8.6.5 81 | compilerKind: ghc 82 | compilerVersion: 8.6.5 83 | setup-method: ghcup 84 | allow-failure: false 85 | fail-fast: false 86 | steps: 87 | - name: apt-get install 88 | run: | 89 | apt-get update 90 | apt-get install -y --no-install-recommends gnupg ca-certificates dirmngr curl git software-properties-common libtinfo5 91 | - name: Install GHCup 92 | run: | 93 | mkdir -p "$HOME/.ghcup/bin" 94 | curl -sL https://downloads.haskell.org/ghcup/0.1.30.0/x86_64-linux-ghcup-0.1.30.0 > "$HOME/.ghcup/bin/ghcup" 95 | chmod a+x "$HOME/.ghcup/bin/ghcup" 96 | - name: Install cabal-install (prerelease) 97 | run: | 98 | "$HOME/.ghcup/bin/ghcup" config add-release-channel https://raw.githubusercontent.com/haskell/ghcup-metadata/master/ghcup-prereleases-0.0.8.yaml; 99 | "$HOME/.ghcup/bin/ghcup" install cabal 3.15.0.0.2024.10.3 || (cat "$HOME"/.ghcup/logs/*.* && false) 100 | echo "CABAL=$HOME/.ghcup/bin/cabal-3.15.0.0.2024.10.3 -vnormal+nowrap" >> "$GITHUB_ENV" 101 | - name: Install GHC (GHCup) 102 | if: matrix.setup-method == 'ghcup' 103 | run: | 104 | "$HOME/.ghcup/bin/ghcup" install ghc "$HCVER" || (cat "$HOME"/.ghcup/logs/*.* && false) 105 | HC=$("$HOME/.ghcup/bin/ghcup" whereis ghc "$HCVER") 106 | HCPKG=$(echo "$HC" | sed 's#ghc$#ghc-pkg#') 107 | HADDOCK=$(echo "$HC" | sed 's#ghc$#haddock#') 108 | echo "HC=$HC" >> "$GITHUB_ENV" 109 | echo "HCPKG=$HCPKG" >> "$GITHUB_ENV" 110 | echo "HADDOCK=$HADDOCK" >> "$GITHUB_ENV" 111 | env: 112 | HCKIND: ${{ matrix.compilerKind }} 113 | HCNAME: ${{ matrix.compiler }} 114 | HCVER: ${{ matrix.compilerVersion }} 115 | - name: Set PATH and environment variables 116 | run: | 117 | echo "$HOME/.cabal/bin" >> $GITHUB_PATH 118 | echo "LANG=C.UTF-8" >> "$GITHUB_ENV" 119 | echo "CABAL_DIR=$HOME/.cabal" >> "$GITHUB_ENV" 120 | echo "CABAL_CONFIG=$HOME/.cabal/config" >> "$GITHUB_ENV" 121 | HCNUMVER=$(${HC} --numeric-version|perl -ne '/^(\d+)\.(\d+)\.(\d+)(\.(\d+))?$/; print(10000 * $1 + 100 * $2 + ($3 == 0 ? $5 != 1 : $3))') 122 | echo "HCNUMVER=$HCNUMVER" >> "$GITHUB_ENV" 123 | echo "ARG_TESTS=--enable-tests" >> "$GITHUB_ENV" 124 | echo "ARG_BENCH=--enable-benchmarks" >> "$GITHUB_ENV" 125 | echo "HEADHACKAGE=false" >> "$GITHUB_ENV" 126 | echo "ARG_COMPILER=--$HCKIND --with-compiler=$HC" >> "$GITHUB_ENV" 127 | env: 128 | HCKIND: ${{ matrix.compilerKind }} 129 | HCNAME: ${{ matrix.compiler }} 130 | HCVER: ${{ matrix.compilerVersion }} 131 | - name: env 132 | run: | 133 | env 134 | - name: write cabal config 135 | run: | 136 | mkdir -p $CABAL_DIR 137 | cat >> $CABAL_CONFIG <> $CABAL_CONFIG < cabal-plan.xz 170 | echo 'f62ccb2971567a5f638f2005ad3173dba14693a45154c1508645c52289714cb2 cabal-plan.xz' | sha256sum -c - 171 | xz -d < cabal-plan.xz > $HOME/.cabal/bin/cabal-plan 172 | rm -f cabal-plan.xz 173 | chmod a+x $HOME/.cabal/bin/cabal-plan 174 | cabal-plan --version 175 | - name: checkout 176 | uses: actions/checkout@v4 177 | with: 178 | path: source 179 | - name: initial cabal.project for sdist 180 | run: | 181 | touch cabal.project 182 | echo "packages: $GITHUB_WORKSPACE/source/." >> cabal.project 183 | cat cabal.project 184 | - name: sdist 185 | run: | 186 | mkdir -p sdist 187 | $CABAL sdist all --output-dir $GITHUB_WORKSPACE/sdist 188 | - name: unpack 189 | run: | 190 | mkdir -p unpacked 191 | find sdist -maxdepth 1 -type f -name '*.tar.gz' -exec tar -C $GITHUB_WORKSPACE/unpacked -xzvf {} \; 192 | - name: generate cabal.project 193 | run: | 194 | PKGDIR_text_short="$(find "$GITHUB_WORKSPACE/unpacked" -maxdepth 1 -type d -regex '.*/text-short-[0-9.]*')" 195 | echo "PKGDIR_text_short=${PKGDIR_text_short}" >> "$GITHUB_ENV" 196 | rm -f cabal.project cabal.project.local 197 | touch cabal.project 198 | touch cabal.project.local 199 | echo "packages: ${PKGDIR_text_short}" >> cabal.project 200 | echo "package text-short" >> cabal.project 201 | echo " ghc-options: -Werror=missing-methods" >> cabal.project 202 | cat >> cabal.project <> cabal.project.local 205 | cat cabal.project 206 | cat cabal.project.local 207 | - name: dump install plan 208 | run: | 209 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dry-run all 210 | cabal-plan 211 | - name: restore cache 212 | uses: actions/cache/restore@v4 213 | with: 214 | key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }} 215 | path: ~/.cabal/store 216 | restore-keys: ${{ runner.os }}-${{ matrix.compiler }}- 217 | - name: install dependencies 218 | run: | 219 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks --dependencies-only -j2 all 220 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dependencies-only -j2 all 221 | - name: build w/o tests 222 | run: | 223 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all 224 | - name: build 225 | run: | 226 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --write-ghc-environment-files=always 227 | - name: tests 228 | run: | 229 | $CABAL v2-test $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --test-show-details=direct 230 | - name: cabal check 231 | run: | 232 | cd ${PKGDIR_text_short} || false 233 | ${CABAL} -vnormal check 234 | - name: haddock 235 | run: | 236 | $CABAL v2-haddock --disable-documentation --haddock-all $ARG_COMPILER --with-haddock $HADDOCK $ARG_TESTS $ARG_BENCH all 237 | - name: unconstrained build 238 | run: | 239 | rm -f cabal.project.local 240 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all 241 | - name: prepare for constraint sets 242 | run: | 243 | rm -f cabal.project.local 244 | - name: constraint set text-2 245 | run: | 246 | if [ $((HCNUMVER < 90800)) -ne 0 ] ; then $CABAL v2-build $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='text ^>=2' all --dry-run ; fi 247 | if [ $((HCNUMVER < 90800)) -ne 0 ] ; then cabal-plan topo | sort ; fi 248 | if [ $((HCNUMVER < 90800)) -ne 0 ] ; then $CABAL v2-build $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='text ^>=2' --dependencies-only -j2 all ; fi 249 | if [ $((HCNUMVER < 90800)) -ne 0 ] ; then $CABAL v2-build $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='text ^>=2' all ; fi 250 | if [ $((HCNUMVER < 90800)) -ne 0 ] ; then $CABAL v2-test $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='text ^>=2' all ; fi 251 | - name: constraint set text-1 252 | run: | 253 | if [ $((HCNUMVER < 90400)) -ne 0 ] ; then $CABAL v2-build $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='text ^>=1.2.3.0' all --dry-run ; fi 254 | if [ $((HCNUMVER < 90400)) -ne 0 ] ; then cabal-plan topo | sort ; fi 255 | if [ $((HCNUMVER < 90400)) -ne 0 ] ; then $CABAL v2-build $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='text ^>=1.2.3.0' --dependencies-only -j2 all ; fi 256 | if [ $((HCNUMVER < 90400)) -ne 0 ] ; then $CABAL v2-build $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='text ^>=1.2.3.0' all ; fi 257 | if [ $((HCNUMVER < 90400)) -ne 0 ] ; then $CABAL v2-test $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='text ^>=1.2.3.0' all ; fi 258 | - name: constraint set bytestring-0.11 259 | run: | 260 | if [ $((HCNUMVER < 90800)) -ne 0 ] ; then $CABAL v2-build $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='bytestring ==0.11.*' all --dry-run ; fi 261 | if [ $((HCNUMVER < 90800)) -ne 0 ] ; then cabal-plan topo | sort ; fi 262 | if [ $((HCNUMVER < 90800)) -ne 0 ] ; then $CABAL v2-build $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='bytestring ==0.11.*' --dependencies-only -j2 all ; fi 263 | if [ $((HCNUMVER < 90800)) -ne 0 ] ; then $CABAL v2-build $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='bytestring ==0.11.*' all ; fi 264 | if [ $((HCNUMVER < 90800)) -ne 0 ] ; then $CABAL v2-test $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='bytestring ==0.11.*' all ; fi 265 | - name: save cache 266 | if: always() 267 | uses: actions/cache/save@v4 268 | with: 269 | key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }} 270 | path: ~/.cabal/store 271 | -------------------------------------------------------------------------------- /src-test/Tests.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | {-# LANGUAGE OverloadedLists #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE TemplateHaskell #-} 5 | {-# OPTIONS_GHC -fno-warn-orphans #-} 6 | 7 | #ifndef MIN_VERSION_GLASGOW_HASKELL 8 | #define MIN_VERSION_GLASGOW_HASKELL(x,y,z,w) ((x*100 + y) >= __GLASGOW_HASKELL__) 9 | #endif 10 | 11 | module Main(main) where 12 | 13 | import Data.Binary 14 | import Data.Char 15 | import Data.Maybe 16 | import Data.Monoid 17 | import qualified Data.String as D.S 18 | import qualified Data.ByteString as BS 19 | import qualified Data.Text as T 20 | import qualified Data.Text.Encoding as T 21 | import qualified Data.Text.Short as IUT 22 | import qualified Data.Text.Short.Partial as IUT 23 | import Test.Tasty 24 | import Test.Tasty.HUnit 25 | import Test.Tasty.QuickCheck as QC 26 | import Text.Show.Functions () 27 | 28 | fromByteStringRef = either (const Nothing) (Just . IUT.fromText) . T.decodeUtf8' 29 | 30 | main :: IO () 31 | main = defaultMain (adjustOption (QuickCheckTests 50000 `max`) $ tests) 32 | 33 | tests :: TestTree 34 | tests = testGroup "Tests" [unitTests,qcProps] 35 | 36 | -- ShortText w/ in-bounds index 37 | data STI = STI IUT.ShortText Int 38 | deriving (Eq,Show) 39 | 40 | newtype ST = ST IUT.ShortText 41 | deriving (Eq,Show) 42 | 43 | instance Arbitrary STI where 44 | arbitrary = do 45 | t <- arbitrary 46 | i <- choose (0, T.length t - 1) 47 | return $! STI (IUT.fromText t) i 48 | 49 | instance Arbitrary ST where 50 | arbitrary = fmap (ST . IUT.fromText) arbitrary 51 | shrink (ST st) = map (ST . IUT.fromText) (shrink (IUT.toText st)) 52 | 53 | qcProps :: TestTree 54 | qcProps = testGroup "Properties" 55 | [ QC.testProperty "length/fromText" $ \t -> IUT.length (IUT.fromText t) == T.length t 56 | , QC.testProperty "length/fromString" $ \s -> IUT.length (IUT.fromString s) == length s 57 | , QC.testProperty "length/append" $ \(ST t1) (ST t2) -> IUT.length t1 + IUT.length t2 == IUT.length (IUT.append t1 t2) 58 | , QC.testProperty "compare" $ \t1 t2 -> IUT.fromText t1 `compare` IUT.fromText t2 == t1 `compare` t2 59 | , QC.testProperty "(==)" $ \t1 t2 -> (IUT.fromText t1 == IUT.fromText t2) == (t1 == t2) 60 | , QC.testProperty "(!?)" $ \t -> 61 | let t' = IUT.fromText t 62 | in and ([ mapMaybe (t' IUT.!?) ([0 .. T.length t -1 ] :: [Int]) == T.unpack t 63 | , mapMaybe (t' IUT.!?) [-5 .. -1] == [] 64 | , mapMaybe (t' IUT.!?) [T.length t .. T.length t + 5] == [] 65 | ] :: [Bool]) 66 | , QC.testProperty "indexEndMaybe" $ \t -> 67 | let t' = IUT.fromText t 68 | in and ([ mapMaybe (IUT.indexEndMaybe t') [0 .. T.length t -1 ] == T.unpack (T.reverse t) 69 | , mapMaybe (IUT.indexEndMaybe t') [-5 .. -1] == [] 70 | , mapMaybe (IUT.indexEndMaybe t') [T.length t .. T.length t + 5] == [] 71 | ] :: [Bool]) 72 | , QC.testProperty "toText.fromText" $ \t -> (IUT.toText . IUT.fromText) t == t 73 | , QC.testProperty "fromByteString" $ \b -> IUT.fromByteString b == fromByteStringRef b 74 | , QC.testProperty "fromByteString.toByteString" $ \t -> let ts = IUT.fromText t in (IUT.fromByteString . IUT.toByteString) ts == Just ts 75 | , QC.testProperty "toString.fromString" $ \s -> (IUT.toString . IUT.fromString) s == s 76 | , QC.testProperty "isAscii" $ \s -> IUT.isAscii (IUT.fromString s) == all isAscii s 77 | , QC.testProperty "isAscii2" $ \t -> IUT.isAscii (IUT.fromText t) == T.all isAscii t 78 | , QC.testProperty "splitAt" $ \t -> 79 | let t' = IUT.fromText t 80 | mapBoth f (x,y) = (f x, f y) 81 | in and [ mapBoth IUT.toText (IUT.splitAt i t') == T.splitAt i t | i <- [-5 .. 5+T.length t ] ] 82 | , QC.testProperty "intercalate/split" $ \t c -> 83 | let t' = IUT.fromText t 84 | in IUT.intercalate (IUT.singleton c) (IUT.split (== c) t') == t' 85 | 86 | , QC.testProperty "intersperse" $ \t c -> IUT.intersperse c (IUT.fromText t) == IUT.fromText (T.intersperse c t) 87 | , QC.testProperty "intercalate" $ \t1 t2 -> IUT.intercalate (IUT.fromText t1) (map IUT.fromText t2) == IUT.fromText (T.intercalate t1 t2) 88 | , QC.testProperty "reverse.singleton" $ \c -> IUT.reverse (IUT.singleton c) == IUT.singleton c 89 | , QC.testProperty "reverse" $ \t -> IUT.reverse (IUT.fromText t) == IUT.fromText (T.reverse t) 90 | , QC.testProperty "filter" $ \p t -> IUT.filter p (IUT.fromText t) == IUT.fromText (T.filter p t) 91 | , QC.testProperty "replicate" $ \n t -> IUT.replicate n (IUT.fromText t) == IUT.fromText (T.replicate n t) 92 | , QC.testProperty "dropAround" $ \p t -> IUT.dropAround p (IUT.fromText t) == IUT.fromText (T.dropAround p t) 93 | 94 | , QC.testProperty "foldl" $ \f z t -> IUT.foldl f (z :: Char) (IUT.fromText t) == T.foldl f (z :: Char) t 95 | , QC.testProperty "foldl #2" $ \t -> IUT.foldl (\n _ -> (n+1)) 0 (IUT.fromText t) == T.length t 96 | , QC.testProperty "foldl #3" $ \t -> IUT.foldl (\s c -> c : s) [] (IUT.fromText t) == T.unpack (T.reverse t) 97 | 98 | , QC.testProperty "foldl'" $ \f z t -> IUT.foldl' f (z :: Char) (IUT.fromText t) == T.foldl' f (z :: Char) t 99 | , QC.testProperty "foldl' #2" $ \t -> IUT.foldl' (\n _ -> (n+1)) 0 (IUT.fromText t) == T.length t 100 | , QC.testProperty "foldl' #3" $ \t -> IUT.foldl' (\s c -> c : s) [] (IUT.fromText t) == T.unpack (T.reverse t) 101 | 102 | , QC.testProperty "foldr" $ \f z t -> IUT.foldr f (z :: Char) (IUT.fromText t) == T.foldr f (z :: Char) t 103 | , QC.testProperty "foldr #2" $ \t -> IUT.foldr (\_ n -> (n+1)) 0 (IUT.fromText t) == T.length t 104 | , QC.testProperty "foldr #3" $ \t -> IUT.foldr (:) [] (IUT.fromText t) == T.unpack t 105 | 106 | , QC.testProperty "foldr1" $ \f t -> (not (T.null t)) ==> IUT.foldr1 f (IUT.fromText t) == T.foldr1 f t 107 | , QC.testProperty "foldl1" $ \f t -> (not (T.null t)) ==> IUT.foldl1 f (IUT.fromText t) == T.foldl1 f t 108 | , QC.testProperty "foldl1'" $ \f t -> (not (T.null t)) ==> IUT.foldl1' f (IUT.fromText t) == T.foldl1' f t 109 | 110 | , QC.testProperty "splitAtEnd" $ \t -> 111 | let t' = IUT.fromText t 112 | n' = IUT.length t' 113 | in and [ (IUT.splitAt (n'-i) t') == IUT.splitAtEnd i t' | i <- [-5 .. 5+n' ] ] 114 | 115 | , QC.testProperty "find" $ \t -> IUT.find Data.Char.isAscii (IUT.fromText t) == T.find Data.Char.isAscii t 116 | , QC.testProperty "findIndex" $ \t -> IUT.findIndex Data.Char.isAscii (IUT.fromText t) == T.findIndex Data.Char.isAscii t 117 | 118 | , QC.testProperty "isSuffixOf" $ \t1 t2 -> IUT.fromText t1 `IUT.isSuffixOf` IUT.fromText t2 == t1 `T.isSuffixOf` t2 119 | , QC.testProperty "isPrefixOf" $ \t1 t2 -> IUT.fromText t1 `IUT.isPrefixOf` IUT.fromText t2 == t1 `T.isPrefixOf` t2 120 | 121 | , QC.testProperty "stripPrefix" $ \t1 t2 -> IUT.stripPrefix (IUT.fromText t1) (IUT.fromText t2) == 122 | fmap IUT.fromText (T.stripPrefix t1 t2) 123 | 124 | , QC.testProperty "stripSuffix" $ \t1 t2 -> IUT.stripSuffix (IUT.fromText t1) (IUT.fromText t2) == 125 | fmap IUT.fromText (T.stripSuffix t1 t2) 126 | 127 | , QC.testProperty "stripPrefix 2" $ \(STI t i) -> 128 | let (pfx,sfx) = IUT.splitAt i t 129 | in IUT.stripPrefix pfx t == Just sfx 130 | 131 | , QC.testProperty "stripSuffix 2" $ \(STI t i) -> 132 | let (pfx,sfx) = IUT.splitAt i t 133 | in IUT.stripSuffix sfx t == Just pfx 134 | 135 | , QC.testProperty "cons" $ \c t -> IUT.singleton c <> IUT.fromText t == IUT.cons c (IUT.fromText t) 136 | , QC.testProperty "snoc" $ \c t -> IUT.fromText t <> IUT.singleton c == IUT.snoc (IUT.fromText t) c 137 | 138 | , QC.testProperty "uncons" $ \c t -> IUT.uncons (IUT.singleton c <> IUT.fromText t) == Just (c, IUT.fromText t) 139 | 140 | , QC.testProperty "unsnoc" $ \c t -> IUT.unsnoc (IUT.fromText t <> IUT.singleton c) == Just (IUT.fromText t, c) 141 | 142 | , QC.testProperty "break" $ \t -> let (l,r) = IUT.break Data.Char.isAscii (IUT.fromText t) 143 | in T.break Data.Char.isAscii t == (IUT.toText l,IUT.toText r) 144 | 145 | , QC.testProperty "span" $ \t -> let (l,r) = IUT.span Data.Char.isAscii (IUT.fromText t) 146 | in T.span Data.Char.isAscii t == (IUT.toText l,IUT.toText r) 147 | 148 | , QC.testProperty "breakEnd" $ \t -> let (l,r) = IUT.breakEnd Data.Char.isAscii (IUT.fromText t) 149 | in t_breakEnd Data.Char.isAscii t == (IUT.toText l,IUT.toText r) 150 | 151 | , QC.testProperty "spanEnd" $ \t -> let (l,r) = IUT.spanEnd Data.Char.isAscii (IUT.fromText t) 152 | in t_spanEnd Data.Char.isAscii t == (IUT.toText l,IUT.toText r) 153 | 154 | , QC.testProperty "splitAt/isPrefixOf" $ \t -> 155 | let t' = IUT.fromText t 156 | in and [ IUT.isPrefixOf (fst (IUT.splitAt i t')) t' | i <- [-5 .. 5+T.length t ] ] 157 | , QC.testProperty "splitAt/isSuffixOf" $ \t -> 158 | let t' = IUT.fromText t 159 | in and [ IUT.isSuffixOf (snd (IUT.splitAt i t')) t' | i <- [-5 .. 5+T.length t ] ] 160 | ] 161 | 162 | t_breakEnd p t = t_spanEnd (not . p) t 163 | t_spanEnd p t = (T.dropWhileEnd p t, T.takeWhileEnd p t) 164 | 165 | unitTests = testGroup "Unit-tests" 166 | [ testCase "fromText mempty" $ IUT.fromText mempty @?= mempty 167 | , testCase "fromShortByteString [0xc0,0x80]" $ IUT.fromShortByteString "\xc0\x80" @?= Nothing 168 | , testCase "fromByteString [0xc0,0x80]" $ IUT.fromByteString "\xc0\x80" @?= Nothing 169 | , testCase "fromByteString [0xf0,0x90,0x80,0x80]" $ IUT.fromByteString "\xf0\x90\x80\x80" @?= Just "\x10000" 170 | , testCase "fromByteString [0xf4,0x90,0x80,0x80]" $ IUT.fromByteString "\244\144\128\128" @?= Nothing 171 | , testCase "IsString U+D800" $ "\xFFFD" @?= (IUT.fromString "\xD800") 172 | -- , testCase "IsString U+D800" $ (IUT.fromString "\xD800") @?= IUT.fromText ("\xD800" :: T.Text) 173 | 174 | #if !(MIN_VERSION_bytestring(0,11,0) && MIN_VERSION_GLASGOW_HASKELL(9,0,1,0) && !MIN_VERSION_GLASGOW_HASKELL(9,0,2,0)) 175 | -- https://gitlab.haskell.org/ghc/ghc/-/issues/19976 176 | , testCase "Binary.encode" $ encode ("Hello \8364 & \171581!\NUL" :: IUT.ShortText) @?= "\NUL\NUL\NUL\NUL\NUL\NUL\NUL\DC2Hello \226\130\172 & \240\169\184\189!\NUL" 177 | , testCase "Binary.decode" $ decode ("\NUL\NUL\NUL\NUL\NUL\NUL\NUL\DC2Hello \226\130\172 & \240\169\184\189!\NUL") @?= ("Hello \8364 & \171581!\NUL" :: IUT.ShortText) 178 | #endif 179 | , testCase "singleton" $ [ c | c <- [minBound..maxBound], IUT.singleton c /= IUT.fromText (T.singleton c) ] @?= [] 180 | 181 | , testCase "splitAtEnd" $ IUT.splitAtEnd 1 "€€" @?= ("€","€") 182 | , testCase "split#1" $ IUT.split (== 'a') "aabbaca" @?= ["", "", "bb", "c", ""] 183 | , testCase "split#2" $ IUT.split (const False) "aabbaca" @?= ["aabbaca"] 184 | , testCase "split#3" $ IUT.split (const True) "abc" @?= ["","","",""] 185 | , testCase "split#4" $ IUT.split (const True) "" @?= [""] 186 | 187 | , testCase "literal0" $ IUT.unpack testLit0 @?= [] 188 | , testCase "literal1" $ IUT.unpack testLit1 @?= ['€','\0','€','\0'] 189 | , testCase "literal2" $ IUT.unpack testLit2 @?= ['\xFFFD','\xD7FF','\xFFFD','\xE000'] 190 | , testCase "literal3" $ IUT.unpack testLit3 @?= ['\1'..'\x7f'] 191 | , testCase "literal4" $ IUT.unpack testLit4 @?= map toEnum [0,1,126,127,128,129,130,256,2046,2047,2048,2049,2050,65530,65531,65532,65533,65534,65533,65535,65536,65537,65538,1114110,1114111] 192 | , testCase "literal5" $ IUT.unpack testLit5 @?= map toEnum [28961] 193 | , testCase "literal6" $ IUT.unpack testLit6 @?= map toEnum [0] 194 | , testCase "literal7" $ IUT.unpack testLit7 @?= map toEnum [66328] 195 | , testCase "literal8" $ IUT.unpack testLit8 @?= map toEnum [127] 196 | 197 | -- list literals 198 | , testCase "literal9" $ [] @?= ("" :: IUT.ShortText) 199 | , testCase "literal10" $ ['¤','€','$'] @?= ("¤€$" :: IUT.ShortText) 200 | , testCase "literal12" $ IUT.unpack ['\xD800','\xD7FF','\xDFFF','\xE000'] @?= ['\xFFFD','\xD7FF','\xFFFD','\xE000'] 201 | 202 | -- template haskell 203 | , testCase "TH.Lift" $ do 204 | testLit1 @?= $([| testLit1 |]) 205 | testLit2 @?= $([| testLit2 |]) 206 | testLit3 @?= $([| testLit3 |]) 207 | testLit4 @?= $([| testLit4 |]) 208 | testLit5 @?= $([| testLit5 |]) 209 | testLit6 @?= $([| testLit6 |]) 210 | testLit7 @?= $([| testLit7 |]) 211 | testLit8 @?= $([| testLit8 |]) 212 | 213 | , testCase "TTH.Lift" $ do 214 | testLit1 @?= $$([|| testLit1 ||]) 215 | testLit2 @?= $$([|| testLit2 ||]) 216 | testLit3 @?= $$([|| testLit3 ||]) 217 | testLit4 @?= $$([|| testLit4 ||]) 218 | testLit5 @?= $$([|| testLit5 ||]) 219 | testLit6 @?= $$([|| testLit6 ||]) 220 | testLit7 @?= $$([|| testLit7 ||]) 221 | testLit8 @?= $$([|| testLit8 ||]) 222 | ] 223 | 224 | -- isScalar :: Char -> Bool 225 | -- isScalar c = c < '\xD800' || c >= '\xE000' 226 | 227 | 228 | {-# NOINLINE testLit0 #-} 229 | testLit0 :: IUT.ShortText 230 | testLit0 = "" 231 | 232 | {-# NOINLINE testLit1 #-} 233 | testLit1 :: IUT.ShortText 234 | testLit1 = "€\NUL€\NUL" 235 | 236 | {-# NOINLINE testLit2 #-} 237 | testLit2 :: IUT.ShortText 238 | testLit2 = "\xD800\xD7FF\xDFFF\xE000" 239 | 240 | {-# NOINLINE testLit3 #-} 241 | testLit3 :: IUT.ShortText 242 | testLit3 = "\SOH\STX\ETX\EOT\ENQ\ACK\a\b\t\n\v\f\r\SO\SI\DLE\DC1\DC2\DC3\DC4\NAK\SYN\ETB\CAN\EM\SUB\ESC\FS\GS\RS\US !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\DEL" 243 | 244 | {-# NOINLINE testLit4 #-} 245 | testLit4 :: IUT.ShortText 246 | testLit4 = "\NUL\SOH~\DEL\128\129\130\256\2046\2047\2048\2049\2050\65530\65531\65532\65533\65534\65533\65535\65536\65537\65538\1114110\1114111" 247 | 248 | {-# NOINLINE testLit5 #-} 249 | testLit5 :: IUT.ShortText 250 | testLit5 = "無" 251 | 252 | {-# NOINLINE testLit6 #-} 253 | testLit6 :: IUT.ShortText 254 | testLit6 = "\NUL" 255 | 256 | {-# NOINLINE testLit7 #-} 257 | testLit7 :: IUT.ShortText 258 | testLit7 = "𐌘" 259 | 260 | {-# NOINLINE testLit8 #-} 261 | testLit8 :: IUT.ShortText 262 | testLit8 = "\x7f" 263 | 264 | ------------------------------------------------------------------------------- 265 | -- orphans 266 | ------------------------------------------------------------------------------- 267 | 268 | -- orphan instances to not depend on quickcheck-instances 269 | -- which would cause cycles 270 | 271 | instance Arbitrary BS.ByteString where 272 | arbitrary = BS.pack `fmap` arbitrary 273 | shrink xs = BS.pack `fmap` shrink (BS.unpack xs) 274 | 275 | instance Arbitrary T.Text where 276 | arbitrary = T.pack `fmap` arbitrary 277 | shrink xs = T.pack `fmap` shrink (T.unpack xs) 278 | -------------------------------------------------------------------------------- /cbits/cbits.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017, Herbert Valerio Riedel 3 | * 4 | * All rights reserved. 5 | * 6 | * Redistribution and use in source and binary forms, with or without 7 | * modification, are permitted provided that the following conditions are met: 8 | * 9 | * * Redistributions of source code must retain the above copyright 10 | * notice, this list of conditions and the following disclaimer. 11 | * 12 | * * Redistributions in binary form must reproduce the above 13 | * copyright notice, this list of conditions and the following 14 | * disclaimer in the documentation and/or other materials provided 15 | * with the distribution. 16 | * 17 | * * Neither the name of Herbert Valerio Riedel nor the names of other 18 | * contributors may be used to endorse or promote products derived 19 | * from this software without specific prior written permission. 20 | * 21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 | */ 33 | 34 | #if !defined(NDEBUG) 35 | # warning assert(3) checks enabled 36 | #endif 37 | 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | 45 | #if !defined(SIZEOF_VOID_P) 46 | # error SIZEOF_VOID_P not defined 47 | #endif 48 | 49 | #if (SIZEOF_VOID_P) == 8 50 | const static bool is_64bit = true; 51 | #elif (SIZEOF_VOID_P) == 4 52 | const static bool is_64bit = false; 53 | #else 54 | # error unexpected SIZEOF_VOID_P value 55 | #endif 56 | 57 | #if (WORDS_BIGENDIAN) 58 | const static bool is_bigendian = true; 59 | #else 60 | const static bool is_bigendian = false; 61 | #endif 62 | 63 | #if defined(__GNUC__) 64 | # define likely(x) __builtin_expect(!!(x),1) 65 | # define unlikely(x) __builtin_expect(!!(x),0) 66 | #else 67 | # define likely(x) (x) 68 | # define unlikely(x) (x) 69 | #endif 70 | 71 | /* test whether octet in UTF-8 steam is not a continuation byte, i.e. a leading byte */ 72 | #define utf8_lead_p(octet) (((octet) & 0xc0) != 0x80) 73 | 74 | /* 0 <= x <= 0x110000 */ 75 | typedef HsWord codepoint_t; 76 | 77 | /* Count number of code-points in well-formed utf8 string */ 78 | size_t 79 | hs_text_short_length(const uint8_t buf[], const size_t n) 80 | { 81 | size_t j = 0; 82 | size_t l = 0; 83 | 84 | /* Both GCC & Clang are able to optimise the code below quite well at -O3 */ 85 | for (j = 0; j < n; j++) 86 | if (utf8_lead_p(buf[j])) 87 | l++; 88 | 89 | return l; 90 | } 91 | 92 | /* Locate offset of j-th code-point in well-formed utf8 string 93 | * 94 | */ 95 | size_t 96 | hs_text_short_index_ofs(const uint8_t buf[], const size_t n, const size_t i) 97 | { 98 | if (!n) 99 | return n; 100 | 101 | size_t m = 0; 102 | size_t j = 0; 103 | 104 | for (;;) { 105 | assert(m >= 0); 106 | assert(j <= i); 107 | assert(j <= m); 108 | 109 | if (j == i) 110 | return m; /* found */ 111 | 112 | if (i-j >= n-m) 113 | return n; /* i-th char is >= buf+n */ 114 | 115 | assert(m < n); 116 | const uint8_t b0 = buf[m]; 117 | 118 | if (!(b0 & 0x80)) 119 | m += 1; /* 0_______ */ 120 | else 121 | switch(b0 >> 4) { 122 | case 0xf: /* 11110___ */ 123 | m += 4; 124 | break; 125 | case 0xe: /* 1110____ */ 126 | m += 3; 127 | break; 128 | default: /* 110_____ */ 129 | m += 2; 130 | break; 131 | } 132 | 133 | j += 1; 134 | } 135 | 136 | assert(0); 137 | } 138 | 139 | /* Locate offset of j-th code-point (in reverse direction) in 140 | * well-formed utf8 string starting at end of buffer. 141 | * 142 | * The 0-th character from the end is the last character in the utf8 143 | * string (if it exists). 144 | * 145 | * Returns original 'n' if out of bounds. 146 | * 147 | */ 148 | size_t 149 | hs_text_short_index_ofs_rev(const uint8_t buf[], const size_t n, const size_t i) 150 | { 151 | size_t m = n; 152 | size_t j = i; 153 | 154 | for (;;) { 155 | assert(m <= n); 156 | assert(j >= 0); 157 | 158 | if (j >= m) 159 | return n; /* i-th char is < buf */ 160 | 161 | /* if (m == i-j) /\* suffix is made up only of ASCII chars, so we can shortcut *\/ */ 162 | /* return 0; */ 163 | 164 | /* scan until octet does not match 10_ */ 165 | assert(m > 0); 166 | if (!(buf[--m] & 0x80)) 167 | goto l_cont; 168 | 169 | assert(m > 0); 170 | if (utf8_lead_p(buf[--m])) { 171 | assert ((buf[m] & 0xe0) == 0xc0); /* 110_ */ 172 | goto l_cont; 173 | } 174 | 175 | assert(m > 0); 176 | if (utf8_lead_p(buf[--m])) { 177 | assert ((buf[m] & 0xf0) == 0xe0); /* 1110_ */ 178 | goto l_cont; 179 | } 180 | 181 | /* this must be a non-10_ octet in a well-formed stream */ 182 | assert(m > 0); 183 | m -= 1; 184 | 185 | assert ((buf[m] & 0xf8) == 0xf0); /* 11110_ */ 186 | 187 | l_cont: 188 | assert(utf8_lead_p(buf[m])); 189 | 190 | if (!j) 191 | return m; /* found */ 192 | 193 | j -= 1; 194 | } 195 | 196 | assert(0); 197 | } 198 | 199 | /* Decode UTF8 code units into code-point 200 | * Assumes buf[] points to start of a valid UTF8-encoded code-point 201 | */ 202 | static inline uint32_t 203 | hs_text_short_decode_cp(const uint8_t buf[]) 204 | { 205 | /* 7 bits | 0xxxxxxx 206 | * 11 bits | 110yyyyx 10xxxxxx 207 | * 16 bits | 1110yyyy 10yxxxxx 10xxxxxx 208 | * 21 bits | 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx 209 | */ 210 | 211 | const uint8_t b0 = buf[0]; 212 | 213 | if (!(b0 & 0x80)) 214 | return b0; 215 | 216 | uint32_t cp = 0; 217 | 218 | switch(b0 >> 4) { 219 | case 0xf: /* 11110___ */ 220 | assert((b0 & 0xf8) == 0xf0); 221 | assert(!utf8_lead_p(buf[1])); 222 | assert(!utf8_lead_p(buf[2])); 223 | assert(!utf8_lead_p(buf[3])); 224 | cp = ((uint32_t)(b0 & 0x07)) << (6+6+6); 225 | cp |= ((uint32_t)(buf[1] & 0x3f)) << (6+6); 226 | cp |= ((uint32_t)(buf[2] & 0x3f)) << 6; 227 | cp |= buf[3] & 0x3f; 228 | assert (cp > 0xffff); assert (cp < 0x110000); 229 | return cp; 230 | 231 | case 0xe: /* 1110____ */ 232 | assert(!utf8_lead_p(buf[1])); 233 | assert(!utf8_lead_p(buf[2])); 234 | cp = ((uint32_t)(b0 & 0x0f)) << (6+6); 235 | cp |= ((uint32_t)(buf[1] & 0x3f)) << 6; 236 | cp |= buf[2] & 0x3f; 237 | assert (cp > 0x7ff); assert (cp < 0x10000); 238 | assert (cp < 0xd800 || cp > 0xdfff); 239 | return cp; 240 | 241 | default: /* 110_____ */ 242 | assert((b0 & 0xe0) == 0xc0); 243 | assert(!utf8_lead_p(buf[1])); 244 | cp = ((uint32_t)(b0 & 0x1f)) << 6; 245 | cp |= buf[1] & 0x3f; 246 | assert (cp > 0x7f); assert (cp < 0x800); 247 | return cp; 248 | } 249 | } 250 | 251 | /* decode codepoint starting at buf[ofs] */ 252 | codepoint_t 253 | hs_text_short_ofs_cp(const uint8_t buf[], const size_t ofs) 254 | { 255 | return hs_text_short_decode_cp(buf+ofs); 256 | } 257 | 258 | /* reverse-decode codepoint starting at offset right after a code-point */ 259 | codepoint_t 260 | hs_text_short_ofs_cp_rev(const uint8_t *buf, const size_t ofs) 261 | { 262 | /* 7 bits | 0xxxxxxx 263 | * 11 bits | 110yyyyx 10xxxxxx 264 | * 16 bits | 1110yyyy 10yxxxxx 10xxxxxx 265 | * 21 bits | 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx 266 | */ 267 | 268 | buf = buf + ofs - 1; 269 | 270 | /* this octet is either 10_ or 0_ */ 271 | uint32_t cp = *buf; 272 | 273 | if (!(cp & 0x80)) 274 | return cp; 275 | 276 | assert (!utf8_lead_p(cp)); 277 | cp &= 0x3f; 278 | 279 | /* this octet is either 10_ or 110_ */ 280 | { 281 | const uint8_t b = *(--buf); 282 | assert (!utf8_lead_p(b) || ((b & 0xe0) == 0xc0)); 283 | 284 | cp |= (b & 0x3f) << 6; 285 | 286 | if (b & 0x40) { 287 | assert (cp > 0x7f); assert (cp < 0x800); 288 | return cp; 289 | } 290 | } 291 | 292 | /* this octet is either 10_ or 1110_ */ 293 | { 294 | const uint8_t b = *(--buf); 295 | assert (!utf8_lead_p(b) || ((b & 0xf0) == 0xe0)); 296 | 297 | if (b & 0x40) { 298 | cp |= (b & 0xf) << 12; 299 | 300 | assert (cp > 0x7ff); assert (cp < 0x10000); 301 | assert (cp < 0xd800 || cp > 0xdfff); 302 | return cp; 303 | } 304 | 305 | cp |= (b & 0x3f) << 12; 306 | } 307 | 308 | /* this octet must be 11110_ */ 309 | const uint8_t b = *(buf-1); 310 | assert ((b & 0xf8) == 0xf0); 311 | 312 | cp |= (b & 0x7) << 18; 313 | 314 | assert (cp > 0xffff); assert (cp < 0x110000); 315 | return cp; 316 | } 317 | 318 | /* Retrieve i-th code-point in (valid) UTF8 stream 319 | * 320 | * Returns -1 if out of bounds 321 | */ 322 | codepoint_t 323 | hs_text_short_index_cp(const uint8_t buf[], const size_t n, const size_t i) 324 | { 325 | const size_t ofs = hs_text_short_index_ofs(buf, n, i); 326 | 327 | if (ofs >= n) 328 | return -1; 329 | 330 | return hs_text_short_decode_cp(&buf[ofs]); 331 | } 332 | 333 | /* Retrieve i-th code-point in (valid) UTF8 stream 334 | * 335 | * Returns -1 if out of bounds 336 | */ 337 | codepoint_t 338 | hs_text_short_index_cp_rev(const uint8_t buf[], const size_t n, const size_t i) 339 | { 340 | const size_t ofs = hs_text_short_index_ofs_rev(buf, n, i); 341 | 342 | if (ofs >= n) 343 | return -1; 344 | 345 | return hs_text_short_decode_cp(&buf[ofs]); 346 | } 347 | 348 | /* Validate UTF8 encoding 349 | 350 | 7 bits | 0xxxxxxx 351 | 11 bits | 110yyyyx 10xxxxxx 352 | 16 bits | 1110yyyy 10yxxxxx 10xxxxxx 353 | 21 bits | 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx 354 | 355 | Valid code-points: 356 | 357 | [U+0000 .. U+D7FF] + [U+E000 .. U+10FFFF] 358 | 359 | Return values: 360 | 361 | 0 -> ok 362 | 363 | 1 -> invalid byte/code-point 364 | 365 | -1 -> truncated (1 byte missing) 366 | -2 -> truncated (2 byte missing) 367 | -3 -> truncated (3 byte missing) 368 | 369 | */ 370 | 371 | int 372 | hs_text_short_is_valid_utf8(const uint8_t buf[], const size_t n) 373 | { 374 | size_t j = 0; 375 | 376 | while (j < n) { 377 | const uint8_t b0 = buf[j++]; 378 | 379 | if (!(b0 & 0x80)) 380 | continue; /* b0 elem [ 0x00 .. 0x7f ] */ 381 | 382 | if ((b0 & 0xe0) == 0xc0) { /* [ 0xc0 .. 0xdf ] */ 383 | if (!(b0 & 0x1e)) return 1; /* 0xc0 or 0xc1; denorm */ 384 | if (j >= n) return -1; 385 | 386 | goto l_trail1; /* b1 */ 387 | } 388 | 389 | if ((b0 & 0xf0) == 0xe0) { /* [ 0xe0 .. 0xef ] */ 390 | if ((j+1) >= n) return (n-(j+2)); 391 | 392 | const uint8_t b1 = buf[j++]; 393 | if (utf8_lead_p(b1)) return 1; /* b1 elem [ 0x80 .. 0xbf ] */ 394 | 395 | /* if b0==0xe0: b1 elem [ 0xa0 .. 0xbf ] */ 396 | if (!((b0 & 0x0f) | (b1 & 0x20))) return 1; /* denorm */ 397 | 398 | /* UTF16 Surrogate pairs [U+D800 .. U+DFFF] */ 399 | /* if b0==0xed: b1 elem [ 0x80 .. 0x9f ] */ 400 | if ((b0 == 0xed) && (b1 & 0x20)) return 1; 401 | 402 | goto l_trail1; /* b2 */ 403 | } 404 | 405 | if ((b0 & 0xfc) == 0xf0) { /* [ 0xf0 .. 0xf3 ] */ 406 | if ((j+2) >= n) return (n-(j+3)); 407 | 408 | const uint8_t b1 = buf[j++]; 409 | 410 | if (utf8_lead_p(b1)) /* b1 elem [ 0x80 .. 0xbf ] */ 411 | return 1; 412 | 413 | if (!((b0 & 0x03) | (b1 & 0x30))) /* if b0==0xf0: b1 elem [ 0x90 .. 0xbf ] */ 414 | return 1; 415 | 416 | goto l_trail2; /* b1, b2 */ 417 | } 418 | 419 | if (b0 == 0xf4) { 420 | if ((j+2) >= n) return (n-(j+3)); 421 | 422 | /* b1 */ 423 | if ((buf[j++] & 0xf0) != 0x80) return 1; 424 | /* b1 elem [ 0x80 .. 0x8f ] */ 425 | 426 | l_trail2: 427 | /* b2 */ 428 | if (utf8_lead_p(buf[j++])) return 1; 429 | /* b2 elem [ 0x80 .. 0xbf ] */ 430 | 431 | l_trail1: 432 | /* b3 */ 433 | if (utf8_lead_p(buf[j++])) return 1; 434 | /* b3 elem [ 0x80 .. 0xbf ] */ 435 | 436 | continue; 437 | } 438 | 439 | /* invalid b0 byte */ 440 | return 1; 441 | } 442 | 443 | assert(j == n); 444 | 445 | return 0; 446 | } 447 | 448 | 449 | /* Returns length of longest ASCII-code-point prefix. 450 | */ 451 | size_t 452 | hs_text_short_ascii_length(const uint8_t buf[], const size_t n) 453 | { 454 | size_t j = 0; 455 | 456 | if (is_64bit) { 457 | /* "vectorized" optimisation checking 8 octets at once 458 | * 459 | * NB: A 64-bit aligned buffer is assumed. This is assumption is 460 | * justified when the buffer is the payload of a `ByteArray#`. 461 | */ 462 | const uint64_t *buf64 = (const uint64_t*)buf; 463 | 464 | for (; (j+7) < n; j+=8, ++buf64) 465 | if (*buf64 & UINT64_C(0x8080808080808080)) 466 | break; 467 | } else { 468 | /* "vectorized" optimisation checking 4 octets at once */ 469 | const uint32_t *buf32 = (const uint32_t*)buf; 470 | 471 | for (; (j+3) < n; j+=4, ++buf32) 472 | if (*buf32 & UINT64_C(0x80808080)) 473 | break; 474 | } 475 | 476 | for (; j < n; ++j) 477 | if (buf[j] & 0x80) 478 | return j; 479 | 480 | return j; 481 | } 482 | 483 | /* Test whether well-formed UTF8 string contains only ASCII code-points 484 | * returns 0 if not ASCII 485 | * 486 | * This code assumes a naturally aligned buf[] 487 | */ 488 | int 489 | hs_text_short_is_ascii(const uint8_t buf[], const size_t n) 490 | { 491 | size_t j = 0; 492 | 493 | if (n < 2) 494 | return 1; 495 | 496 | if (is_64bit) { 497 | /* "vectorized" optimisation checking 8 octets at once 498 | * 499 | * NB: A 64-bit aligned buffer is assumed. This is assumption is 500 | * justified when the buffer is the payload of a `ByteArray#`. 501 | * 502 | */ 503 | const uint64_t *buf64 = (const uint64_t*)buf; 504 | 505 | for (; (j+7) < n; j+=8, ++buf64) 506 | if (*buf64 & UINT64_C(0x8080808080808080)) 507 | return 0; 508 | 509 | if (j < n) { 510 | const int maskshift = (8 - (n - j)) << 3; 511 | const uint64_t mask = is_bigendian ? (UINT64_C(0x8080808080808080) << maskshift) /* big endian */ 512 | : (UINT64_C(0x8080808080808080) >> maskshift); /* little endian */ 513 | 514 | if (*buf64 & mask) 515 | return 0; 516 | } 517 | } else { 518 | /* "vectorized" optimisation checking 4 octets at once */ 519 | const uint32_t *buf32 = (const uint32_t*)buf; 520 | 521 | for (; (j+3) < n; j+=4, ++buf32) 522 | if (*buf32 & UINT64_C(0x80808080)) 523 | return 0; 524 | 525 | for (; j < n; ++j) 526 | if (buf[j] & 0x80) 527 | return 0; 528 | } 529 | 530 | return 1; 531 | } 532 | 533 | /* 534 | * Compute length of (transcoded) mutf8 literal 535 | * 536 | * If the mutf8 literal does not contain either surrogates nor escaped 537 | * NULs, a positive length is returned which matches what strlen(3) 538 | * would have returned. 539 | * 540 | * Otherwise, a negated size is returned which corresponds to the size 541 | * of a the mutf8->utf8 transcoded string. 542 | * 543 | */ 544 | HsInt 545 | hs_text_short_mutf8_strlen(const uint8_t buf[]) 546 | { 547 | size_t j = 0; 548 | size_t nulls = 0; 549 | bool surr_seen = false; 550 | 551 | for (;;) { 552 | const uint8_t b0 = buf[j]; 553 | 554 | if (unlikely(!b0)) 555 | break; 556 | 557 | if (likely(!(b0 & 0x80))) 558 | j += 1; /* 0_______ */ 559 | else 560 | switch(b0 >> 4) { 561 | case 0xf: /* 11110___ */ 562 | j += 4; 563 | break; 564 | case 0xe: /* 1110____ */ 565 | /* UTF16 Surrogate pairs [U+D800 .. U+DFFF] */ 566 | if (unlikely(!surr_seen && (b0 == 0xed) && (buf[j+1] & 0x20))) 567 | surr_seen = true; 568 | j += 3; 569 | break; 570 | default: /* 110_____ */ 571 | /* escaped NUL */ 572 | if (unlikely((b0 == 0xc0) && (buf[j+1] == 0x80))) 573 | nulls += 1; 574 | j += 2; 575 | break; 576 | } 577 | } /* for */ 578 | 579 | 580 | if ((nulls > 0) || surr_seen) 581 | return -(HsInt)(j - nulls); 582 | 583 | return j; 584 | } 585 | 586 | /* Transcode Modified UTF-8 to proper UTF-8 587 | * 588 | * This involves 589 | * 590 | * 1. Unescape denormal 2-byte NULs (0xC0 0x80) 591 | * 2. Rewrite surrogate pairs to U+FFFD 592 | */ 593 | void 594 | hs_text_short_mutf8_trans(const uint8_t src0[], uint8_t dst0[]) 595 | { 596 | const uint8_t *src = src0; 597 | uint8_t *dst = dst0; 598 | 599 | for (;;) { 600 | const uint8_t b0 = *src++; 601 | assert(utf8_lead_p(b0)); 602 | 603 | if (likely(!(b0 & 0x80))) { /* 0_______ */ 604 | if (unlikely(!b0)) 605 | break; 606 | 607 | *dst++ = b0; 608 | continue; 609 | } 610 | 611 | switch(b0 >> 4) { 612 | case 0xf: /* 11110___ */ 613 | assert(!utf8_lead_p(src[0])); 614 | assert(!utf8_lead_p(src[1])); 615 | assert(!utf8_lead_p(src[2])); 616 | *dst++ = b0; 617 | *dst++ = *src++; 618 | *dst++ = *src++; 619 | *dst++ = *src++; 620 | break; 621 | 622 | case 0xe: { /* 1110____ */ 623 | const uint8_t b1 = *src++; 624 | const uint8_t b2 = *src++; 625 | assert(!utf8_lead_p(b1)); 626 | assert(!utf8_lead_p(b2)); 627 | if (unlikely((b0 == 0xed) && (b1 & 0x20))) { 628 | /* UTF16 Surrogate pairs [U+D800 .. U+DFFF] 629 | * -> translate into U+FFFD 630 | */ 631 | *dst++ = 0xef; 632 | *dst++ = 0xbf; 633 | *dst++ = 0xbd; 634 | } else { 635 | *dst++ = b0; 636 | *dst++ = b1; 637 | *dst++ = b2; 638 | } 639 | break; 640 | } 641 | default: { /* 110_____ */ 642 | const uint8_t b1 = *src++; 643 | assert(!utf8_lead_p(b1)); 644 | if (unlikely((b0 == 0xc0) && (b1 == 0x80))) { 645 | /* escaped/denormal U+0000 -> normalize */ 646 | *dst++ = 0x00; 647 | } else { 648 | *dst++ = b0; 649 | *dst++ = b1; 650 | } 651 | break; 652 | } 653 | } /* switch */ 654 | } /* for */ 655 | 656 | assert(labs(hs_text_short_mutf8_strlen(src0)) == (dst - dst0)); 657 | } 658 | -------------------------------------------------------------------------------- /src/Data/Text/Short/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | {-# LANGUAGE DeriveDataTypeable #-} 4 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 5 | {-# LANGUAGE MagicHash #-} 6 | {-# LANGUAGE RankNTypes #-} 7 | {-# LANGUAGE TemplateHaskellQuotes #-} 8 | {-# LANGUAGE TypeFamilies #-} 9 | {-# LANGUAGE UnboxedTuples #-} 10 | {-# LANGUAGE UnliftedFFITypes #-} 11 | {-# LANGUAGE Unsafe #-} 12 | {-# LANGUAGE ViewPatterns #-} 13 | 14 | -- | 15 | -- Module : Data.Text.Short.Internal 16 | -- Copyright : © Herbert Valerio Riedel 2017 17 | -- License : BSD3 18 | -- 19 | -- Maintainer : hvr@gnu.org 20 | -- Stability : stable 21 | -- 22 | -- Memory-efficient representation of Unicode text strings. 23 | -- 24 | -- @since 0.1 25 | module Data.Text.Short.Internal 26 | ( -- * The 'ShortText' type 27 | ShortText(..) 28 | 29 | -- * Basic operations 30 | , null 31 | , length 32 | , isAscii 33 | , splitAt 34 | , splitAtEnd 35 | , indexEndMaybe 36 | , indexMaybe 37 | , isPrefixOf 38 | , stripPrefix 39 | , isSuffixOf 40 | , stripSuffix 41 | 42 | , cons 43 | , snoc 44 | , uncons 45 | , unsnoc 46 | 47 | , findIndex 48 | , find 49 | , all 50 | 51 | , span 52 | , spanEnd 53 | , split 54 | 55 | , intersperse 56 | , intercalate 57 | , reverse 58 | , replicate 59 | 60 | , filter 61 | , dropAround 62 | 63 | , foldl 64 | , foldl' 65 | , foldr 66 | , foldl1 67 | , foldl1' 68 | , foldr1 69 | 70 | -- * Conversions 71 | -- ** 'Char' 72 | , singleton 73 | 74 | -- ** 'String' 75 | , Data.Text.Short.Internal.fromString 76 | , toString 77 | 78 | -- ** 'T.Text' 79 | , fromText 80 | , toText 81 | 82 | -- ** 'BS.ByteString' 83 | , fromShortByteString 84 | , fromShortByteStringUnsafe 85 | , toShortByteString 86 | 87 | , fromByteString 88 | , fromByteStringUnsafe 89 | , toByteString 90 | 91 | , toBuilder 92 | 93 | -- * misc 94 | -- ** For Haddock 95 | 96 | , BS.ByteString 97 | , T.Text 98 | , module Prelude 99 | , module Data.Monoid 100 | 101 | -- ** Internals 102 | , isValidUtf8 103 | ) where 104 | 105 | import Prelude 106 | (Bool(..), Ordering(..), Int, Char, String, Maybe(..), IO, Eq, Ord, Num, Read, 107 | Show, ($), ($!), (.), (==), (/=), (+), (*), (-), (>>), (<=), (<), (>), (>=), 108 | compare, show, showsPrec, readsPrec, abs, return, fmap, error, 109 | undefined, otherwise, fromIntegral, max, min, not, fst, snd, map, seq, fail, maybe) 110 | 111 | import Control.DeepSeq (NFData) 112 | import Control.Monad.ST (stToIO) 113 | import Data.Binary 114 | import Data.Bits 115 | import qualified Data.ByteString as BS 116 | import qualified Data.ByteString.Builder as BB 117 | import Data.ByteString.Short (ShortByteString) 118 | import qualified Data.ByteString.Short as BSS 119 | import qualified Data.ByteString.Short.Internal as BSSI 120 | import Data.Char (ord) 121 | import Data.Data (Data(..),constrIndex, Constr, 122 | mkConstr, DataType, mkDataType, 123 | Fixity(Prefix)) 124 | import Data.Hashable (Hashable) 125 | import Data.Typeable (Typeable) 126 | import qualified Data.List as List 127 | import Data.Maybe (fromMaybe, isNothing) 128 | import Data.Monoid (Monoid, mempty, mconcat) 129 | import Data.Semigroup 130 | import qualified Data.String as S 131 | import qualified Data.Text as T 132 | import Foreign.C 133 | import GHC.Base (assert, unsafeChr) 134 | import qualified GHC.CString as GHC 135 | import GHC.Exts (Addr#, ByteArray#, Int (I#), 136 | Int#, MutableByteArray#, 137 | Ptr (..), RealWorld, Word (W#)) 138 | import qualified GHC.Exts 139 | import qualified GHC.Foreign as GHC 140 | import GHC.IO.Encoding 141 | import GHC.ST 142 | import System.IO.Unsafe 143 | import Text.Printf (PrintfArg, formatArg, 144 | formatString) 145 | 146 | import qualified Language.Haskell.TH.Syntax as TH 147 | 148 | #if MIN_VERSION_text(2,0,0) 149 | import qualified Data.Text.Internal as TI 150 | import qualified Data.Text.Array as TA 151 | #else 152 | import qualified Data.Text.Encoding as T 153 | #endif 154 | 155 | import qualified PrimOps 156 | 157 | -- | A compact representation of Unicode strings. 158 | -- 159 | -- A 'ShortText' value is a sequence of Unicode scalar values, as defined in 160 | -- ; 161 | -- This means that a 'ShortText' is a list of (scalar) Unicode code-points (i.e. code-points in the range @[U+00 .. U+D7FF] ∪ [U+E000 .. U+10FFFF]@). 162 | -- 163 | -- This type relates to 'T.Text' as 'ShortByteString' relates to 'BS.ByteString' by providing a more compact type. Please consult the documentation of "Data.ByteString.Short" for more information. 164 | -- 165 | -- Currently, a boxed unshared 'T.Text' has a memory footprint of 6 words (i.e. 48 bytes on 64-bit systems) plus 1, 2, 3 or 4 bytes per code-point for text-2 (due to the internal UTF-8 representation) or 2 or 4 bytes per code-point for text-1 (due to the internal UTF-16 representation). Each 'T.Text' value which can share its payload with another 'T.Text' requires only 4 words additionally. Unlike 'BS.ByteString', 'T.Text' use unpinned memory. 166 | -- 167 | -- In comparison, the footprint of a boxed 'ShortText' is only 4 words (i.e. 32 bytes on 64-bit systems) plus 1, 2, 3, or 4 bytes per code-point (due to the internal UTF-8 representation). 168 | -- 169 | -- It can be shown that for realistic data . 170 | -- 171 | -- __NOTE__: The `Typeable` instance isn't defined for GHC 7.8 (and older) prior to @text-short-0.1.3@ 172 | -- 173 | -- @since 0.1 174 | newtype ShortText = ShortText ShortByteString 175 | deriving (Hashable,Monoid,NFData,Data.Semigroup.Semigroup,Typeable) 176 | 177 | -- | It exposes a similar 'Data' instance abstraction as 'T.Text' (see 178 | -- discussion referenced there for more details), preserving the 179 | -- @[Char]@ data abstraction at the cost of inefficiency. 180 | -- 181 | -- @since 0.1.3 182 | instance Data ShortText where 183 | gfoldl f z txt = z fromString `f` (toString txt) 184 | toConstr _ = packConstr 185 | gunfold k z c = case constrIndex c of 186 | 1 -> k (z fromString) 187 | _ -> error "gunfold" 188 | dataTypeOf _ = shortTextDataType 189 | 190 | packConstr :: Constr 191 | packConstr = mkConstr shortTextDataType "fromString" [] Prefix 192 | 193 | shortTextDataType :: DataType 194 | shortTextDataType = mkDataType "Data.Text.Short" [packConstr] 195 | 196 | instance Eq ShortText where 197 | {-# INLINE (==) #-} 198 | (==) x y 199 | | lx /= ly = False 200 | | lx == 0 = True 201 | | otherwise = case PrimOps.compareByteArrays# (toByteArray# x) 0# (toByteArray# y) 0# n# of 202 | 0# -> True 203 | _ -> False 204 | where 205 | !lx@(I# n#) = toLength x 206 | !ly = toLength y 207 | 208 | instance Ord ShortText where 209 | compare t1 t2 210 | | n == 0 = compare n1 n2 211 | | otherwise = case PrimOps.compareByteArrays# ba1# 0# ba2# 0# n# of 212 | r# | I# r# < 0 -> LT 213 | | I# r# > 0 -> GT 214 | | n1 < n2 -> LT 215 | | n1 > n2 -> GT 216 | | otherwise -> EQ 217 | where 218 | ba1# = toByteArray# t1 219 | ba2# = toByteArray# t2 220 | !n1 = toLength t1 221 | !n2 = toLength t2 222 | !n@(I# n#) = n1 `min` n2 223 | 224 | instance Show ShortText where 225 | showsPrec p (ShortText b) = showsPrec p (decodeStringShort' utf8 b) 226 | show (ShortText b) = show (decodeStringShort' utf8 b) 227 | 228 | instance Read ShortText where 229 | readsPrec p = map (\(x,s) -> (ShortText $ encodeStringShort utf8 x,s)) . readsPrec p 230 | 231 | -- | @since 0.1.2 232 | instance PrintfArg ShortText where 233 | formatArg txt = formatString $ toString txt 234 | 235 | -- | The 'Binary' encoding matches the one for 'T.Text' 236 | instance Binary ShortText where 237 | put = put . toShortByteString 238 | get = do 239 | sbs <- get 240 | case fromShortByteString sbs of 241 | Nothing -> fail "Binary.get(ShortText): Invalid UTF-8 stream" 242 | Just st -> return st 243 | 244 | -- | Since 0.1.3 245 | instance TH.Lift ShortText where 246 | -- TODO: Use DeriveLift with bytestring-0.11.2.0 247 | lift t = [| fromString s |] 248 | where s = toString t 249 | 250 | #if MIN_VERSION_template_haskell(2,17,0) 251 | liftTyped = TH.unsafeCodeCoerce . TH.lift 252 | #elif MIN_VERSION_template_haskell(2,16,0) 253 | liftTyped = TH.unsafeTExpCoerce . TH.lift 254 | #endif 255 | 256 | -- | \(\mathcal{O}(1)\) Test whether a 'ShortText' is empty. 257 | -- 258 | -- >>> null "" 259 | -- True 260 | -- 261 | -- prop> null (singleton c) == False 262 | -- 263 | -- prop> null t == (length t == 0) 264 | -- 265 | -- @since 0.1 266 | null :: ShortText -> Bool 267 | null = BSS.null . toShortByteString 268 | 269 | -- | \(\mathcal{O}(n)\) Count the number of Unicode code-points in a 'ShortText'. 270 | -- 271 | -- >>> length "abcd€" 272 | -- 5 273 | -- 274 | -- >>> length "" 275 | -- 0 276 | -- 277 | -- prop> length t >= 0 278 | -- 279 | -- @since 0.1 280 | length :: ShortText -> Int 281 | length st = fromIntegral $ unsafeDupablePerformIO (c_text_short_length (toByteArray# st) (toCSize st)) 282 | 283 | foreign import ccall unsafe "hs_text_short_length" c_text_short_length :: ByteArray# -> CSize -> IO CSize 284 | 285 | -- | \(\mathcal{O}(n)\) Test whether 'ShortText' contains only ASCII code-points (i.e. only U+0000 through U+007F). 286 | -- 287 | -- This is a more efficient version of @'all' 'Data.Char.isAscii'@. 288 | -- 289 | -- >>> isAscii "" 290 | -- True 291 | -- 292 | -- >>> isAscii "abc\NUL" 293 | -- True 294 | -- 295 | -- >>> isAscii "abcd€" 296 | -- False 297 | -- 298 | -- prop> isAscii t == all (< '\x80') t 299 | -- 300 | -- @since 0.1 301 | isAscii :: ShortText -> Bool 302 | isAscii st = (/= 0) $ unsafeDupablePerformIO (c_text_short_is_ascii (toByteArray# st) sz) 303 | where 304 | sz = toCSize st 305 | 306 | foreign import ccall unsafe "hs_text_short_is_ascii" c_text_short_is_ascii :: ByteArray# -> CSize -> IO CInt 307 | 308 | -- | \(\mathcal{O}(n)\) Test whether /all/ code points in 'ShortText' satisfy a predicate. 309 | -- 310 | -- >>> all (const False) "" 311 | -- True 312 | -- 313 | -- >>> all (> 'c') "abcdabcd" 314 | -- False 315 | -- 316 | -- >>> all (/= 'c') "abdabd" 317 | -- True 318 | -- 319 | -- @since 0.1.2 320 | all :: (Char -> Bool) -> ShortText -> Bool 321 | all p st = isNothing (findOfs (not . p) st (B 0)) 322 | 323 | -- | \(\mathcal{O}(n)\) Return the left-most codepoint in 'ShortText' that satisfies the given predicate. 324 | -- 325 | -- >>> find (> 'b') "abcdabcd" 326 | -- Just 'c' 327 | -- 328 | -- >>> find (> 'b') "ababab" 329 | -- Nothing 330 | -- 331 | -- @since 0.1.2 332 | find :: (Char -> Bool) -> ShortText -> Maybe Char 333 | find p st = go 0 334 | where 335 | go !ofs 336 | | ofs >= sz = Nothing 337 | | otherwise = let (c,ofs') = decodeCharAtOfs st ofs 338 | in c `seq` ofs' `seq` 339 | if p c 340 | then Just c 341 | else go ofs' 342 | 343 | !sz = toB st 344 | 345 | -- | \(\mathcal{O}(n)\) Return the index of the left-most codepoint in 'ShortText' that satisfies the given predicate. 346 | -- 347 | -- >>> findIndex (> 'b') "abcdabcdef" 348 | -- Just 2 349 | -- 350 | -- >>> findIndex (> 'b') "ababab" 351 | -- Nothing 352 | -- 353 | -- prop> (indexMaybe t =<< findIndex p t) == find p t 354 | -- 355 | -- @since 0.1.2 356 | findIndex :: (Char -> Bool) -> ShortText -> Maybe Int 357 | findIndex p st = go 0 0 358 | where 359 | go !ofs !i 360 | | ofs >= sz = Nothing 361 | | otherwise = let (c,ofs') = decodeCharAtOfs st ofs 362 | in c `seq` ofs' `seq` 363 | if p c 364 | then Just i 365 | else go ofs' (i+1) 366 | 367 | !sz = toB st 368 | 369 | 370 | -- | \(\mathcal{O}(n)\) Splits a string into components delimited by separators, 371 | -- where the predicate returns True for a separator element. The 372 | -- resulting components do not contain the separators. Two adjacent 373 | -- separators result in an empty component in the output. eg. 374 | -- 375 | -- >>> split (=='a') "aabbaca" 376 | -- ["","","bb","c",""] 377 | -- 378 | -- >>> split (=='a') "" 379 | -- [""] 380 | -- 381 | -- prop> intercalate (singleton c) (split (== c) t) = t 382 | -- 383 | -- __NOTE__: 'split' never returns an empty list to match the semantics of its counterpart from "Data.Text". 384 | -- 385 | -- @since 0.1.3 386 | split :: (Char -> Bool) -> ShortText -> [ShortText] 387 | split p st0 = go 0 388 | where 389 | go !ofs0 = case findOfs' p st0 ofs0 of 390 | Just (ofs1,ofs2) -> slice st0 ofs0 (ofs1-ofs0) : go ofs2 391 | Nothing 392 | | ofs0 == 0 -> st0 : [] 393 | | otherwise -> slice st0 ofs0 (maxOfs-ofs0) : [] 394 | 395 | !maxOfs = toB st0 396 | 397 | -- internal helper 398 | {-# INLINE findOfs #-} 399 | findOfs :: (Char -> Bool) -> ShortText -> B -> Maybe B 400 | findOfs p st = go 401 | where 402 | go :: B -> Maybe B 403 | go !ofs | ofs >= sz = Nothing 404 | go !ofs | p c = Just ofs 405 | | otherwise = go ofs' 406 | where 407 | (c,ofs') = decodeCharAtOfs st ofs 408 | 409 | !sz = toB st 410 | 411 | {-# INLINE findOfs' #-} 412 | findOfs' :: (Char -> Bool) -> ShortText -> B -> Maybe (B,B) 413 | findOfs' p st = go 414 | where 415 | go :: B -> Maybe (B,B) 416 | go !ofs | ofs >= sz = Nothing 417 | go !ofs | p c = Just (ofs,ofs') 418 | | otherwise = go ofs' 419 | where 420 | (c,ofs') = decodeCharAtOfs st ofs 421 | 422 | !sz = toB st 423 | 424 | 425 | {-# INLINE findOfsRev #-} 426 | findOfsRev :: (Char -> Bool) -> ShortText -> B -> Maybe B 427 | findOfsRev p st = go 428 | where 429 | go (B 0) = Nothing 430 | go !ofs 431 | | p (cp2ch cp) = Just ofs 432 | | otherwise = go (ofs-cpLen cp) 433 | where 434 | !cp = readCodePointRev st ofs 435 | 436 | -- | \(\mathcal{O}(n)\) Split 'ShortText' into longest prefix satisfying the given predicate and the remaining suffix. 437 | -- 438 | -- >>> span (< 'c') "abcdabcd" 439 | -- ("ab","cdabcd") 440 | -- 441 | -- prop> fst (span p t) <> snd (span p t) == t 442 | -- 443 | -- @since 0.1.2 444 | span :: (Char -> Bool) -> ShortText -> (ShortText,ShortText) 445 | span p st 446 | | Just ofs <- findOfs (not . p) st (B 0) = splitAtOfs ofs st 447 | | otherwise = (st,mempty) 448 | 449 | -- | \(\mathcal{O}(n)\) Split 'ShortText' into longest suffix satisfying the given predicate and the preceding prefix. 450 | -- 451 | -- >>> spanEnd (> 'c') "abcdabcd" 452 | -- ("abcdabc","d") 453 | -- 454 | -- prop> fst (spanEnd p t) <> snd (spanEnd p t) == t 455 | -- 456 | -- @since 0.1.2 457 | spanEnd :: (Char -> Bool) -> ShortText -> (ShortText,ShortText) 458 | spanEnd p st 459 | | Just ofs <- findOfsRev (not . p) st (toB st) = splitAtOfs ofs st 460 | | otherwise = (mempty,st) 461 | 462 | ---------------------------------------------------------------------------- 463 | 464 | toCSize :: ShortText -> CSize 465 | toCSize = fromIntegral . BSS.length . toShortByteString 466 | 467 | toB :: ShortText -> B 468 | toB = fromIntegral . BSS.length . toShortByteString 469 | 470 | toLength :: ShortText -> Int 471 | toLength st = I# (toLength# st) 472 | 473 | toLength# :: ShortText -> Int# 474 | toLength# st = GHC.Exts.sizeofByteArray# (toByteArray# st) 475 | 476 | toByteArray# :: ShortText -> ByteArray# 477 | toByteArray# (ShortText (BSSI.SBS ba#)) = ba# 478 | 479 | -- | \(\mathcal{O}(0)\) Converts to UTF-8 encoded 'ShortByteString' 480 | -- 481 | -- This operation has effectively no overhead, as it's currently merely a @newtype@-cast. 482 | -- 483 | -- @since 0.1 484 | toShortByteString :: ShortText -> ShortByteString 485 | toShortByteString (ShortText b) = b 486 | 487 | -- | \(\mathcal{O}(n)\) Converts to UTF-8 encoded 'BS.ByteString' 488 | -- 489 | -- @since 0.1 490 | toByteString :: ShortText -> BS.ByteString 491 | toByteString = BSS.fromShort . toShortByteString 492 | 493 | -- | Construct a 'BB.Builder' that encodes 'ShortText' as UTF-8. 494 | -- 495 | -- @since 0.1 496 | toBuilder :: ShortText -> BB.Builder 497 | toBuilder = BB.shortByteString . toShortByteString 498 | 499 | -- | \(\mathcal{O}(n)\) Convert to 'String' 500 | -- 501 | -- prop> (fromString . toString) t == t 502 | -- 503 | -- __Note__: See documentation of 'fromString' for why @('toString' . 'fromString')@ is not an identity function. 504 | -- 505 | -- @since 0.1 506 | toString :: ShortText -> String 507 | -- NOTE: impl below beats 508 | -- toString = decodeStringShort' utf8 . toShortByteString 509 | -- except for smallish strings 510 | toString st = go 0 511 | where 512 | go !ofs 513 | | ofs >= sz = [] 514 | | otherwise = let (c,ofs') = decodeCharAtOfs st ofs 515 | in c `seq` ofs' `seq` (c : go ofs') 516 | 517 | !sz = toB st 518 | 519 | ---------------------------------------------------------------------------- 520 | -- Folds 521 | 522 | -- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with 523 | -- the binary operator and an initial in forward direction (i.e. from 524 | -- left to right). 525 | -- 526 | -- >>> foldl (\_ _ -> True) False "" 527 | -- False 528 | -- 529 | -- >>> foldl (\s c -> c : s) ['.'] "abcd" 530 | -- "dcba." 531 | -- 532 | -- @since 0.1.2 533 | foldl :: (a -> Char -> a) -> a -> ShortText -> a 534 | foldl f z st = go 0 z 535 | where 536 | go !ofs acc 537 | | ofs >= sz = acc 538 | | otherwise = let (c,ofs') = decodeCharAtOfs st ofs 539 | in c `seq` ofs' `seq` go ofs' (f acc c) 540 | 541 | !sz = toB st 542 | 543 | -- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with the binary operator. 544 | -- 545 | -- >>> foldl1 max "abcdcba" 546 | -- 'd' 547 | -- 548 | -- >>> foldl1 const "abcd" 549 | -- 'a' 550 | -- 551 | -- >>> foldl1 (flip const) "abcd" 552 | -- 'd' 553 | -- 554 | -- __Note__: Will throw an 'error' exception if index is out of bounds. 555 | -- 556 | -- @since 0.1.2 557 | foldl1 :: (Char -> Char -> Char) -> ShortText -> Char 558 | foldl1 f st 559 | | sz == 0 = error "foldl1: empty ShortText" 560 | | otherwise = go c0sz c0 561 | where 562 | go !ofs acc 563 | | ofs >= sz = acc 564 | | otherwise = let (c,ofs') = decodeCharAtOfs st ofs 565 | in c `seq` ofs' `seq` go ofs' (f acc c) 566 | !sz = toB st 567 | (c0,c0sz) = decodeCharAtOfs st (B 0) 568 | 569 | -- | \(\mathcal{O}(n)\) Strict version of 'foldl'. 570 | -- 571 | -- @since 0.1.2 572 | foldl' :: (a -> Char -> a) -> a -> ShortText -> a 573 | foldl' f !z st = go 0 z 574 | where 575 | go !ofs !acc 576 | | ofs >= sz = acc 577 | | otherwise = let (c,ofs') = decodeCharAtOfs st ofs 578 | in c `seq` ofs' `seq` go ofs' (f acc c) 579 | 580 | !sz = toB st 581 | 582 | -- | \(\mathcal{O}(n)\) Strict version of 'foldl1'. 583 | -- 584 | -- @since 0.1.2 585 | foldl1' :: (Char -> Char -> Char) -> ShortText -> Char 586 | foldl1' f st 587 | | sz == 0 = error "foldl1: empty ShortText" 588 | | otherwise = go c0sz c0 589 | where 590 | go !ofs !acc 591 | | ofs >= sz = acc 592 | | otherwise = let (c,ofs') = decodeCharAtOfs st ofs 593 | in c `seq` ofs' `seq` go ofs' (f acc c) 594 | !sz = toB st 595 | (c0,c0sz) = decodeCharAtOfs st (B 0) 596 | 597 | -- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with 598 | -- the binary operator and an initial in reverse direction (i.e. from 599 | -- right to left). 600 | -- 601 | -- >>> foldr (\_ _ -> True) False "" 602 | -- False 603 | -- 604 | -- >>> foldr (:) ['.'] "abcd" 605 | -- "abcd." 606 | -- 607 | -- @since 0.1.2 608 | foldr :: (Char -> a -> a) -> a -> ShortText -> a 609 | foldr f z st = go 0 610 | where 611 | go !ofs 612 | | ofs >= sz = z 613 | | otherwise = let (c,ofs') = decodeCharAtOfs st ofs 614 | in c `seq` ofs' `seq` f c (go ofs') 615 | 616 | !sz = toB st 617 | 618 | -- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with the binary operator. 619 | -- 620 | -- >>> foldr1 max "abcdcba" 621 | -- 'd' 622 | -- 623 | -- >>> foldr1 const "abcd" 624 | -- 'a' 625 | -- 626 | -- >>> foldr1 (flip const) "abcd" 627 | -- 'd' 628 | -- 629 | -- __Note__: Will throw an 'error' exception if index is out of bounds. 630 | -- 631 | -- @since 0.1.2 632 | foldr1 :: (Char -> Char -> Char) -> ShortText -> Char 633 | foldr1 f st 634 | | sz == 0 = error "foldr1: empty ShortText" 635 | | otherwise = go 0 636 | where 637 | go !ofs = let (c,ofs') = decodeCharAtOfs st ofs 638 | in c `seq` ofs' `seq` 639 | (if ofs' >= sz then c else f c (go ofs')) 640 | 641 | !sz = toB st 642 | 643 | -- | \(\mathcal{O}(n)\) Convert to 'T.Text' 644 | -- 645 | -- prop> (fromText . toText) t == t 646 | -- 647 | -- prop> (toText . fromText) t == t 648 | -- 649 | -- This is \(\mathcal{O}(1)\) with @text-2@. 650 | -- Previously it wasn't because 'T.Text' used UTF-16 as its internal representation. 651 | -- 652 | -- @since 0.1 653 | toText :: ShortText -> T.Text 654 | #if MIN_VERSION_text(2,0,0) 655 | toText (ShortText (BSSI.SBS ba)) = TI.Text (TA.ByteArray ba) 0 (I# (GHC.Exts.sizeofByteArray# ba)) 656 | #else 657 | toText = T.decodeUtf8 . toByteString 658 | #endif 659 | 660 | ---- 661 | 662 | -- | \(\mathcal{O}(n)\) Construct/pack from 'String' 663 | -- 664 | -- >>> fromString [] 665 | -- "" 666 | -- 667 | -- >>> fromString ['a','b','c'] 668 | -- "abc" 669 | -- 670 | -- >>> fromString ['\55295','\55296','\57343','\57344'] -- U+D7FF U+D800 U+DFFF U+E000 671 | -- "\55295\65533\65533\57344" 672 | -- 673 | -- __Note__: This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD. 674 | -- 675 | -- @since 0.1 676 | fromString :: String -> ShortText 677 | fromString s = case s of 678 | [] -> mempty 679 | [c] -> singleton $ r c 680 | _ -> ShortText . encodeStringShort utf8 . map r $ s 681 | where 682 | r c | isSurr (ord c) = '\xFFFD' 683 | | otherwise = c 684 | 685 | -- | \(\mathcal{O}(n)\) Construct 'ShortText' from 'T.Text' 686 | -- 687 | -- This is \(\mathcal{O}(1)\) with @text-2@ when the 'T.Text' is not sliced. 688 | -- Previously it wasn't because 'T.Text' used UTF-16 as its internal representation. 689 | -- 690 | -- @since 0.1 691 | fromText :: T.Text -> ShortText 692 | #if MIN_VERSION_text(2,0,0) 693 | fromText (TI.Text (TA.ByteArray ba) off len) = 694 | ShortText (BSSI.SBS (case sliceByteArray (TA.ByteArray ba) off len of TA.ByteArray ba' -> ba')) 695 | 696 | sliceByteArray :: TA.Array -> Int -> Int -> TA.Array 697 | sliceByteArray ta@(TA.ByteArray ba) 0 len 698 | | len == I# (GHC.Exts.sizeofByteArray# ba) 699 | = ta 700 | sliceByteArray ta off len = TA.run $ do 701 | ma <- TA.new len 702 | TA.copyI len ma 0 ta off 703 | return ma 704 | 705 | #else 706 | fromText = fromByteStringUnsafe . T.encodeUtf8 707 | #endif 708 | 709 | -- | \(\mathcal{O}(n)\) Construct 'ShortText' from UTF-8 encoded 'ShortByteString' 710 | -- 711 | -- This operation doesn't copy the input 'ShortByteString' but it 712 | -- cannot be \(\mathcal{O}(1)\) because we need to validate the UTF-8 encoding. 713 | -- 714 | -- Returns 'Nothing' in case of invalid UTF-8 encoding. 715 | -- 716 | -- >>> fromShortByteString "\x00\x38\xF0\x90\x8C\x9A" -- U+00 U+38 U+1031A 717 | -- Just "\NUL8\66330" 718 | -- 719 | -- >>> fromShortByteString "\xC0\x80" -- invalid denormalised U+00 720 | -- Nothing 721 | -- 722 | -- >>> fromShortByteString "\xED\xA0\x80" -- U+D800 (non-scalar code-point) 723 | -- Nothing 724 | -- 725 | -- >>> fromShortByteString "\xF4\x8f\xbf\xbf" -- U+10FFFF 726 | -- Just "\1114111" 727 | -- 728 | -- >>> fromShortByteString "\xF4\x90\x80\x80" -- U+110000 (invalid) 729 | -- Nothing 730 | -- 731 | -- prop> fromShortByteString (toShortByteString t) == Just t 732 | -- 733 | -- @since 0.1 734 | fromShortByteString :: ShortByteString -> Maybe ShortText 735 | fromShortByteString sbs 736 | | isValidUtf8 st = Just st 737 | | otherwise = Nothing 738 | where 739 | st = ShortText sbs 740 | 741 | -- | \(\mathcal{O}(0)\) Construct 'ShortText' from UTF-8 encoded 'ShortByteString' 742 | -- 743 | -- This operation has effectively no overhead, as it's currently merely a @newtype@-cast. 744 | -- 745 | -- __WARNING__: Unlike the safe 'fromShortByteString' conversion, this 746 | -- conversion is /unsafe/ as it doesn't validate the well-formedness of the 747 | -- UTF-8 encoding. 748 | -- 749 | -- @since 0.1.1 750 | fromShortByteStringUnsafe :: ShortByteString -> ShortText 751 | fromShortByteStringUnsafe = ShortText 752 | 753 | -- | \(\mathcal{O}(n)\) Construct 'ShortText' from UTF-8 encoded 'BS.ByteString' 754 | -- 755 | -- 'fromByteString' accepts (or rejects) the same input data as 'fromShortByteString'. 756 | -- 757 | -- Returns 'Nothing' in case of invalid UTF-8 encoding. 758 | -- 759 | -- @since 0.1 760 | fromByteString :: BS.ByteString -> Maybe ShortText 761 | fromByteString = fromShortByteString . BSS.toShort 762 | 763 | -- | \(\mathcal{O}(n)\) Construct 'ShortText' from UTF-8 encoded 'BS.ByteString' 764 | -- 765 | -- This operation is \(\mathcal{O}(n)\) because the 'BS.ByteString' needs to be 766 | -- copied into an unpinned 'ByteArray#'. 767 | -- 768 | -- __WARNING__: Unlike the safe 'fromByteString' conversion, this 769 | -- conversion is /unsafe/ as it doesn't validate the well-formedness of the 770 | -- UTF-8 encoding. 771 | -- 772 | -- @since 0.1.1 773 | fromByteStringUnsafe :: BS.ByteString -> ShortText 774 | fromByteStringUnsafe = ShortText . BSS.toShort 775 | 776 | ---------------------------------------------------------------------------- 777 | 778 | encodeString :: TextEncoding -> String -> BS.ByteString 779 | encodeString te str = unsafePerformIO $ GHC.withCStringLen te str BS.packCStringLen 780 | 781 | -- decodeString :: TextEncoding -> BS.ByteString -> Maybe String 782 | -- decodeString te bs = cvtEx $ unsafePerformIO $ try $ BS.useAsCStringLen bs (GHC.peekCStringLen te) 783 | -- where 784 | -- cvtEx :: Either IOException a -> Maybe a 785 | -- cvtEx = either (const Nothing) Just 786 | 787 | decodeString' :: TextEncoding -> BS.ByteString -> String 788 | decodeString' te bs = unsafePerformIO $ BS.useAsCStringLen bs (GHC.peekCStringLen te) 789 | 790 | decodeStringShort' :: TextEncoding -> ShortByteString -> String 791 | decodeStringShort' te = decodeString' te . BSS.fromShort 792 | 793 | encodeStringShort :: TextEncoding -> String -> BSS.ShortByteString 794 | encodeStringShort te = BSS.toShort . encodeString te 795 | 796 | -- isValidUtf8' :: ShortText -> Int 797 | -- isValidUtf8' st = fromIntegral $ unsafeDupablePerformIO (c_text_short_is_valid_utf8 (toByteArray# st) (toCSize st)) 798 | 799 | isValidUtf8 :: ShortText -> Bool 800 | isValidUtf8 st = (==0) $ unsafeDupablePerformIO (c_text_short_is_valid_utf8 (toByteArray# st) (toCSize st)) 801 | 802 | type CCodePoint = Word 803 | 804 | foreign import ccall unsafe "hs_text_short_is_valid_utf8" c_text_short_is_valid_utf8 :: ByteArray# -> CSize -> IO CInt 805 | 806 | foreign import ccall unsafe "hs_text_short_index_cp" c_text_short_index :: ByteArray# -> CSize -> CSize -> IO CCodePoint 807 | 808 | -- | \(\mathcal{O}(n)\) Lookup /i/-th code-point in 'ShortText'. 809 | -- 810 | -- Returns 'Nothing' if out of bounds. 811 | -- 812 | -- prop> indexMaybe (singleton c) 0 == Just c 813 | -- 814 | -- prop> indexMaybe t 0 == fmap fst (uncons t) 815 | -- 816 | -- prop> indexMaybe mempty i == Nothing 817 | -- 818 | -- @since 0.1.2 819 | indexMaybe :: ShortText -> Int -> Maybe Char 820 | indexMaybe st i 821 | | i < 0 = Nothing 822 | | otherwise = cp2chSafe cp 823 | where 824 | cp = CP $ unsafeDupablePerformIO (c_text_short_index (toByteArray# st) (toCSize st) (fromIntegral i)) 825 | 826 | -- | \(\mathcal{O}(n)\) Lookup /i/-th code-point from the end of 'ShortText'. 827 | -- 828 | -- Returns 'Nothing' if out of bounds. 829 | -- 830 | -- prop> indexEndMaybe (singleton c) 0 == Just c 831 | -- 832 | -- prop> indexEndMaybe t 0 == fmap snd (unsnoc t) 833 | -- 834 | -- prop> indexEndMaybe mempty i == Nothing 835 | -- 836 | -- @since 0.1.2 837 | indexEndMaybe :: ShortText -> Int -> Maybe Char 838 | indexEndMaybe st i 839 | | i < 0 = Nothing 840 | | otherwise = cp2chSafe cp 841 | where 842 | cp = CP $ unsafeDupablePerformIO (c_text_short_index_rev (toByteArray# st) (toCSize st) (fromIntegral i)) 843 | 844 | foreign import ccall unsafe "hs_text_short_index_cp_rev" c_text_short_index_rev :: ByteArray# -> CSize -> CSize -> IO CCodePoint 845 | 846 | 847 | -- | \(\mathcal{O}(n)\) Split 'ShortText' into two halves. 848 | -- 849 | -- @'splitAt' n t@ returns a pair of 'ShortText' with the following properties: 850 | -- 851 | -- prop> length (fst (splitAt n t)) == min (length t) (max 0 n) 852 | -- 853 | -- prop> fst (splitAt n t) <> snd (splitAt n t) == t 854 | -- 855 | -- >>> splitAt 2 "abcdef" 856 | -- ("ab","cdef") 857 | -- 858 | -- >>> splitAt 10 "abcdef" 859 | -- ("abcdef","") 860 | -- 861 | -- >>> splitAt (-1) "abcdef" 862 | -- ("","abcdef") 863 | -- 864 | -- @since 0.1.2 865 | splitAt :: Int -> ShortText -> (ShortText,ShortText) 866 | splitAt i st 867 | | i <= 0 = (mempty,st) 868 | | otherwise = splitAtOfs ofs st 869 | where 870 | ofs = csizeToB $ 871 | unsafeDupablePerformIO (c_text_short_index_ofs (toByteArray# st) stsz (fromIntegral i)) 872 | stsz = toCSize st 873 | 874 | -- | \(\mathcal{O}(n)\) Split 'ShortText' into two halves. 875 | -- 876 | -- @'splitAtEnd' n t@ returns a pair of 'ShortText' with the following properties: 877 | -- 878 | -- prop> length (snd (splitAtEnd n t)) == min (length t) (max 0 n) 879 | -- 880 | -- prop> fst (splitAtEnd n t) <> snd (splitAtEnd n t) == t 881 | -- 882 | -- prop> splitAtEnd n t == splitAt (length t - n) t 883 | -- 884 | -- >>> splitAtEnd 2 "abcdef" 885 | -- ("abcd","ef") 886 | -- 887 | -- >>> splitAtEnd 10 "abcdef" 888 | -- ("","abcdef") 889 | -- 890 | -- >>> splitAtEnd (-1) "abcdef" 891 | -- ("abcdef","") 892 | -- 893 | -- @since 0.1.2 894 | splitAtEnd :: Int -> ShortText -> (ShortText,ShortText) 895 | splitAtEnd i st 896 | | i <= 0 = (st,mempty) 897 | | ofs >= stsz = (mempty,st) 898 | | otherwise = splitAtOfs ofs st 899 | where 900 | ofs = csizeToB $ 901 | unsafeDupablePerformIO (c_text_short_index_ofs_rev (toByteArray# st) (toCSize st) (fromIntegral (i-1))) 902 | stsz = toB st 903 | 904 | {-# INLINE splitAtOfs #-} 905 | splitAtOfs :: B -> ShortText -> (ShortText,ShortText) 906 | splitAtOfs ofs st 907 | | ofs == 0 = (mempty,st) 908 | | ofs >= stsz = (st,mempty) 909 | | otherwise = (slice st 0 ofs, slice st ofs (stsz-ofs)) 910 | where 911 | !stsz = toB st 912 | 913 | foreign import ccall unsafe "hs_text_short_index_ofs" c_text_short_index_ofs :: ByteArray# -> CSize -> CSize -> IO CSize 914 | 915 | foreign import ccall unsafe "hs_text_short_index_ofs_rev" c_text_short_index_ofs_rev :: ByteArray# -> CSize -> CSize -> IO CSize 916 | 917 | 918 | -- | \(\mathcal{O}(n)\) Inverse operation to 'cons' 919 | -- 920 | -- Returns 'Nothing' for empty input 'ShortText'. 921 | -- 922 | -- prop> uncons (cons c t) == Just (c,t) 923 | -- 924 | -- >>> uncons "" 925 | -- Nothing 926 | -- 927 | -- >>> uncons "fmap" 928 | -- Just ('f',"map") 929 | -- 930 | -- @since 0.1.2 931 | uncons :: ShortText -> Maybe (Char,ShortText) 932 | uncons st 933 | | null st = Nothing 934 | | len2 == 0 = Just (c0, mempty) 935 | | otherwise = Just (c0, slice st ofs len2) 936 | where 937 | c0 = cp2ch cp0 938 | cp0 = readCodePoint st 0 939 | ofs = cpLen cp0 940 | len2 = toB st - ofs 941 | 942 | -- | \(\mathcal{O}(n)\) Inverse operation to 'snoc' 943 | -- 944 | -- Returns 'Nothing' for empty input 'ShortText'. 945 | -- 946 | -- prop> unsnoc (snoc t c) == Just (t,c) 947 | -- 948 | -- >>> unsnoc "" 949 | -- Nothing 950 | -- 951 | -- >>> unsnoc "fmap" 952 | -- Just ("fma",'p') 953 | -- 954 | -- @since 0.1.2 955 | unsnoc :: ShortText -> Maybe (ShortText,Char) 956 | unsnoc st 957 | | null st = Nothing 958 | | len1 == 0 = Just (mempty, c0) 959 | | otherwise = Just (slice st 0 len1, c0) 960 | where 961 | c0 = cp2ch cp0 962 | cp0 = readCodePointRev st stsz 963 | stsz = toB st 964 | len1 = stsz - cpLen cp0 965 | 966 | -- | \(\mathcal{O}(n)\) Tests whether the first 'ShortText' is a prefix of the second 'ShortText' 967 | -- 968 | -- >>> isPrefixOf "ab" "abcdef" 969 | -- True 970 | -- 971 | -- >>> isPrefixOf "ac" "abcdef" 972 | -- False 973 | -- 974 | -- prop> isPrefixOf "" t == True 975 | -- 976 | -- prop> isPrefixOf t t == True 977 | -- 978 | -- @since 0.1.2 979 | isPrefixOf :: ShortText -> ShortText -> Bool 980 | isPrefixOf x y 981 | | lx > ly = False 982 | | lx == 0 = True 983 | | otherwise = case PrimOps.compareByteArrays# (toByteArray# x) 0# (toByteArray# y) 0# n# of 984 | 0# -> True 985 | _ -> False 986 | where 987 | !lx@(I# n#) = toLength x 988 | !ly = toLength y 989 | 990 | -- | \(\mathcal{O}(n)\) Strip prefix from second 'ShortText' argument. 991 | -- 992 | -- Returns 'Nothing' if first argument is not a prefix of the second argument. 993 | -- 994 | -- >>> stripPrefix "text-" "text-short" 995 | -- Just "short" 996 | -- 997 | -- >>> stripPrefix "test-" "text-short" 998 | -- Nothing 999 | -- 1000 | -- @since 0.1.2 1001 | stripPrefix :: ShortText -> ShortText -> Maybe ShortText 1002 | stripPrefix pfx t 1003 | | isPrefixOf pfx t = Just $! snd (splitAtOfs (toB pfx) t) 1004 | | otherwise = Nothing 1005 | 1006 | -- | \(\mathcal{O}(n)\) Tests whether the first 'ShortText' is a suffix of the second 'ShortText' 1007 | -- 1008 | -- >>> isSuffixOf "ef" "abcdef" 1009 | -- True 1010 | -- 1011 | -- >>> isPrefixOf "df" "abcdef" 1012 | -- False 1013 | -- 1014 | -- prop> isSuffixOf "" t == True 1015 | -- 1016 | -- prop> isSuffixOf t t == True 1017 | -- 1018 | -- @since 0.1.2 1019 | isSuffixOf :: ShortText -> ShortText -> Bool 1020 | isSuffixOf x y 1021 | | lx > ly = False 1022 | | lx == 0 = True 1023 | | otherwise = case PrimOps.compareByteArrays# (toByteArray# x) 0# (toByteArray# y) ofs2# n# of 1024 | 0# -> True 1025 | _ -> False 1026 | where 1027 | !(I# ofs2#) = ly - lx 1028 | !lx@(I# n#) = toLength x 1029 | !ly = toLength y 1030 | 1031 | -- | \(\mathcal{O}(n)\) Strip suffix from second 'ShortText' argument. 1032 | -- 1033 | -- Returns 'Nothing' if first argument is not a suffix of the second argument. 1034 | -- 1035 | -- >>> stripSuffix "-short" "text-short" 1036 | -- Just "text" 1037 | -- 1038 | -- >>> stripSuffix "-utf8" "text-short" 1039 | -- Nothing 1040 | -- 1041 | -- @since 0.1.2 1042 | stripSuffix :: ShortText -> ShortText -> Maybe ShortText 1043 | stripSuffix sfx t 1044 | | isSuffixOf sfx t = Just $! fst (splitAtOfs pfxLen t) 1045 | | otherwise = Nothing 1046 | where 1047 | pfxLen = toB t - toB sfx 1048 | 1049 | ---------------------------------------------------------------------------- 1050 | 1051 | -- | \(\mathcal{O}(n)\) Insert character between characters of 'ShortText'. 1052 | -- 1053 | -- >>> intersperse '*' "_" 1054 | -- "_" 1055 | -- 1056 | -- >>> intersperse '*' "MASH" 1057 | -- "M*A*S*H" 1058 | -- 1059 | -- @since 0.1.2 1060 | intersperse :: Char -> ShortText -> ShortText 1061 | intersperse c st 1062 | | null st = mempty 1063 | | sn == 1 = st 1064 | | otherwise = create newsz $ \mba -> do 1065 | let !cp0 = readCodePoint st 0 1066 | !cp0sz = cpLen cp0 1067 | writeCodePointN cp0sz mba 0 cp0 1068 | go mba (sn - 1) cp0sz cp0sz 1069 | where 1070 | newsz = ssz + ((sn-1) `mulB` csz) 1071 | ssz = toB st 1072 | sn = length st 1073 | csz = cpLen cp 1074 | cp = ch2cp c 1075 | 1076 | go :: MBA s -> Int -> B -> B -> ST s () 1077 | go _ 0 !_ !_ = return () 1078 | go mba n ofs ofs2 = do 1079 | let !cp1 = readCodePoint st ofs2 1080 | !cp1sz = cpLen cp1 1081 | writeCodePointN csz mba ofs cp 1082 | writeCodePointN cp1sz mba (ofs+csz) cp1 1083 | go mba (n-1) (ofs+csz+cp1sz) (ofs2+cp1sz) 1084 | 1085 | -- | \(\mathcal{O}(n)\) Insert 'ShortText' inbetween list of 'ShortText's. 1086 | -- 1087 | -- >>> intercalate ", " [] 1088 | -- "" 1089 | -- 1090 | -- >>> intercalate ", " ["foo"] 1091 | -- "foo" 1092 | -- 1093 | -- >>> intercalate ", " ["foo","bar","doo"] 1094 | -- "foo, bar, doo" 1095 | -- 1096 | -- prop> intercalate "" ts == concat ts 1097 | -- 1098 | -- @since 0.1.2 1099 | intercalate :: ShortText -> [ShortText] -> ShortText 1100 | intercalate _ [] = mempty 1101 | intercalate _ [t] = t 1102 | intercalate sep ts 1103 | | null sep = mconcat ts 1104 | | otherwise = mconcat (List.intersperse sep ts) 1105 | 1106 | -- | \(\mathcal{O}(n*m)\) Replicate a 'ShortText'. 1107 | -- 1108 | -- A repetition count smaller than 1 results in an empty string result. 1109 | -- 1110 | -- >>> replicate 3 "jobs!" 1111 | -- "jobs!jobs!jobs!" 1112 | -- 1113 | -- >>> replicate 10000 "" 1114 | -- "" 1115 | -- 1116 | -- >>> replicate 0 "nothing" 1117 | -- "" 1118 | -- 1119 | -- prop> length (replicate n t) == max 0 n * length t 1120 | -- 1121 | -- @since 0.1.2 1122 | replicate :: Int -> ShortText -> ShortText 1123 | replicate n0 t 1124 | | n0 < 1 = mempty 1125 | | null t = mempty 1126 | | otherwise = create (n0 `mulB` sz) (go 0) 1127 | where 1128 | go :: Int -> MBA s -> ST s () 1129 | go j mba 1130 | | j == n0 = return () 1131 | | otherwise = do 1132 | copyByteArray t 0 mba (j `mulB` sz) sz 1133 | go (j+1) mba 1134 | 1135 | sz = toB t 1136 | 1137 | -- | \(\mathcal{O}(n)\) Reverse characters in 'ShortText'. 1138 | -- 1139 | -- >>> reverse "star live desserts" 1140 | -- "stressed evil rats" 1141 | -- 1142 | -- prop> reverse (singleton c) == singleton c 1143 | -- 1144 | -- prop> reverse (reverse t) == t 1145 | -- 1146 | -- @since 0.1.2 1147 | reverse :: ShortText -> ShortText 1148 | reverse st 1149 | | null st = mempty 1150 | | sn == 1 = st 1151 | | otherwise = create sz $ go sn 0 1152 | where 1153 | sz = toB st 1154 | sn = length st 1155 | 1156 | go :: Int -> B -> MBA s -> ST s () 1157 | go 0 !_ _ = return () 1158 | go i ofs mba = do 1159 | let !cp = readCodePoint st ofs 1160 | !cpsz = cpLen cp 1161 | !ofs' = ofs+cpsz 1162 | writeCodePointN cpsz mba (sz - ofs') cp 1163 | go (i-1) ofs' mba 1164 | 1165 | 1166 | -- | \(\mathcal{O}(n)\) Remove characters from 'ShortText' which don't satisfy given predicate. 1167 | -- 1168 | -- >>> filter (`notElem` ['a','e','i','o','u']) "You don't need vowels to convey information!" 1169 | -- "Y dn't nd vwls t cnvy nfrmtn!" 1170 | -- 1171 | -- prop> filter (const False) t == "" 1172 | -- 1173 | -- prop> filter (const True) t == t 1174 | -- 1175 | -- prop> length (filter p t) <= length t 1176 | -- 1177 | -- prop> filter p t == pack [ c | c <- unpack t, p c ] 1178 | -- 1179 | -- @since 0.1.2 1180 | filter :: (Char -> Bool) -> ShortText -> ShortText 1181 | filter p t 1182 | = case (mofs1,mofs2) of 1183 | (Nothing, _) -> t -- no non-accepted characters found 1184 | (Just 0, Nothing) -> mempty -- no accepted characters found 1185 | (Just ofs1, Nothing) -> slice t 0 ofs1 -- only prefix accepted 1186 | (Just ofs1, Just ofs2) -> createShrink (t0sz-(ofs2-ofs1)) $ \mba -> do 1187 | -- copy accepted prefix 1188 | copyByteArray t 0 mba 0 ofs1 1189 | -- [ofs1 .. ofs2) are a non-accepted region 1190 | -- filter rest after ofs2 1191 | t1sz <- go mba ofs2 ofs1 1192 | return t1sz 1193 | where 1194 | mofs1 = findOfs (not . p) t (B 0) -- first non-accepted Char 1195 | mofs2 = findOfs p t (fromMaybe (B 0) mofs1) -- first accepted Char 1196 | 1197 | t0sz = toB t 1198 | 1199 | go :: MBA s -> B -> B -> ST s B 1200 | go mba !t0ofs !t1ofs 1201 | | t0ofs >= t0sz = return t1ofs 1202 | | otherwise = let !cp = readCodePoint t t0ofs 1203 | !cpsz = cpLen cp 1204 | in if p (cp2ch cp) 1205 | then writeCodePointN cpsz mba t1ofs cp >> 1206 | go mba (t0ofs+cpsz) (t1ofs+cpsz) 1207 | else go mba (t0ofs+cpsz) t1ofs -- skip code-point 1208 | 1209 | -- | \(\mathcal{O}(n)\) Strip characters from the beginning end and of 'ShortText' which satisfy given predicate. 1210 | -- 1211 | -- >>> dropAround (== ' ') " white space " 1212 | -- "white space" 1213 | -- 1214 | -- >>> dropAround (> 'a') "bcdefghi" 1215 | -- "" 1216 | -- 1217 | -- @since 0.1.2 1218 | dropAround :: (Char -> Bool) -> ShortText -> ShortText 1219 | dropAround p t0 = case (mofs1,mofs2) of 1220 | (Nothing,_) -> mempty 1221 | (Just ofs1,Just ofs2) 1222 | | ofs1 == 0, ofs2 == t0sz -> t0 1223 | | ofs1 < ofs2 -> create (ofs2-ofs1) $ \mba -> do 1224 | copyByteArray t0 ofs1 mba (B 0) (ofs2-ofs1) 1225 | (_,_) -> error "dropAround: the impossible happened" 1226 | where 1227 | mofs1 = findOfs (not . p) t0 (B 0) 1228 | mofs2 = findOfsRev (not . p) t0 t0sz 1229 | t0sz = toB t0 1230 | 1231 | ---------------------------------------------------------------------------- 1232 | 1233 | -- | Construct a new 'ShortText' from an existing one by slicing 1234 | -- 1235 | -- NB: The 'CSize' arguments refer to byte-offsets 1236 | slice :: ShortText -> B -> B -> ShortText 1237 | slice st ofs len 1238 | | ofs < 0 = error "invalid offset" 1239 | | len < 0 = error "invalid length" 1240 | | len' == 0 = mempty 1241 | | otherwise = create len' $ \mba -> copyByteArray st ofs' mba 0 len' 1242 | where 1243 | len0 = toB st 1244 | len' = max 0 (min len (len0-ofs)) 1245 | ofs' = max 0 ofs 1246 | 1247 | ---------------------------------------------------------------------------- 1248 | -- low-level MutableByteArray# helpers 1249 | 1250 | -- | Byte offset (or size) in bytes 1251 | -- 1252 | -- This currently wraps an 'Int' because this is what GHC's primops 1253 | -- currently use for byte offsets/sizes. 1254 | newtype B = B { unB :: Int } 1255 | deriving (Ord,Eq,Num) 1256 | 1257 | {- TODO: introduce operators for 'B' to avoid 'Num' -} 1258 | 1259 | mulB :: Int -> B -> B 1260 | mulB n (B b) = B (n*b) 1261 | 1262 | csizeFromB :: B -> CSize 1263 | csizeFromB = fromIntegral . unB 1264 | 1265 | csizeToB :: CSize -> B 1266 | csizeToB = B . fromIntegral 1267 | 1268 | data MBA s = MBA# { unMBA# :: MutableByteArray# s } 1269 | 1270 | {-# INLINE create #-} 1271 | create :: B -> (forall s. MBA s -> ST s ()) -> ShortText 1272 | create n go = runST $ do 1273 | mba <- newByteArray n 1274 | go mba 1275 | unsafeFreeze mba 1276 | 1277 | {-# INLINE createShrink #-} 1278 | createShrink :: B -> (forall s. MBA s -> ST s B) -> ShortText 1279 | createShrink n go = runST $ do 1280 | mba <- newByteArray n 1281 | n' <- go mba 1282 | if n' < n 1283 | then unsafeFreezeShrink mba n' 1284 | else unsafeFreeze mba 1285 | 1286 | {-# INLINE unsafeFreeze #-} 1287 | unsafeFreeze :: MBA s -> ST s ShortText 1288 | unsafeFreeze (MBA# mba#) 1289 | = ST $ \s -> case GHC.Exts.unsafeFreezeByteArray# mba# s of 1290 | (# s', ba# #) -> (# s', ShortText (BSSI.SBS ba#) #) 1291 | 1292 | {-# INLINE copyByteArray #-} 1293 | copyByteArray :: ShortText -> B -> MBA s -> B -> B -> ST s () 1294 | copyByteArray (ShortText (BSSI.SBS src#)) (B (I# src_off#)) (MBA# dst#) (B (I# dst_off#)) (B (I# len#)) 1295 | = ST $ \s -> case GHC.Exts.copyByteArray# src# src_off# dst# dst_off# len# s of 1296 | s' -> (# s', () #) 1297 | 1298 | {-# INLINE newByteArray #-} 1299 | newByteArray :: B -> ST s (MBA s) 1300 | newByteArray (B (I# n#)) 1301 | = ST $ \s -> case GHC.Exts.newByteArray# n# s of 1302 | (# s', mba# #) -> (# s', MBA# mba# #) 1303 | 1304 | {-# INLINE writeWord8Array #-} 1305 | writeWord8Array :: MBA s -> B -> Word -> ST s () 1306 | writeWord8Array (MBA# mba#) (B (I# i#)) (W# w#) 1307 | = ST $ \s -> 1308 | #if __GLASGOW_HASKELL__ >= 902 1309 | case GHC.Exts.writeWord8Array# mba# i# (GHC.Exts.wordToWord8# w#) s of 1310 | #else 1311 | case GHC.Exts.writeWord8Array# mba# i# w# s of 1312 | #endif 1313 | s' -> (# s', () #) 1314 | {- not needed yet 1315 | {-# INLINE indexWord8Array #-} 1316 | indexWord8Array :: ShortText -> B -> Word 1317 | indexWord8Array (ShortText (BSSI.SBS ba#)) (B (I# i#)) = W# (GHC.Exts.indexWord8Array# ba# i#) 1318 | -} 1319 | 1320 | {-# INLINE copyAddrToByteArray #-} 1321 | copyAddrToByteArray :: Ptr a -> MBA RealWorld -> B -> B -> ST RealWorld () 1322 | copyAddrToByteArray (Ptr src#) (MBA# dst#) (B (I# dst_off#)) (B (I# len#)) 1323 | = ST $ \s -> case GHC.Exts.copyAddrToByteArray# src# dst# dst_off# len# s of 1324 | s' -> (# s', () #) 1325 | 1326 | ---------------------------------------------------------------------------- 1327 | -- unsafeFreezeShrink 1328 | 1329 | #if __GLASGOW_HASKELL__ >= 710 1330 | -- for GHC versions which have the 'shrinkMutableByteArray#' primop 1331 | {-# INLINE unsafeFreezeShrink #-} 1332 | unsafeFreezeShrink :: MBA s -> B -> ST s ShortText 1333 | unsafeFreezeShrink mba n = do 1334 | shrink mba n 1335 | unsafeFreeze mba 1336 | 1337 | {-# INLINE shrink #-} 1338 | shrink :: MBA s -> B -> ST s () 1339 | shrink (MBA# mba#) (B (I# i#)) 1340 | = ST $ \s -> case GHC.Exts.shrinkMutableByteArray# mba# i# s of 1341 | s' -> (# s', () #) 1342 | #else 1343 | -- legacy code for GHC versions which lack `shrinkMutableByteArray#` primop 1344 | {-# INLINE unsafeFreezeShrink #-} 1345 | unsafeFreezeShrink :: MBA s -> B -> ST s ShortText 1346 | unsafeFreezeShrink mba0 n = do 1347 | mba' <- newByteArray n 1348 | copyByteArray2 mba0 0 mba' 0 n 1349 | unsafeFreeze mba' 1350 | 1351 | {-# INLINE copyByteArray2 #-} 1352 | copyByteArray2 :: MBA s -> B -> MBA s -> B -> B -> ST s () 1353 | copyByteArray2 (MBA# src#) (B (I# src_off#)) (MBA# dst#) (B (I# dst_off#)) (B( I# len#)) 1354 | = ST $ \s -> case GHC.Exts.copyMutableByteArray# src# src_off# dst# dst_off# len# s of 1355 | s' -> (# s', () #) 1356 | #endif 1357 | 1358 | ---------------------------------------------------------------------------- 1359 | -- Helpers for encoding code points into UTF-8 code units 1360 | -- 1361 | -- 7 bits| < 0x80 | 0xxxxxxx 1362 | -- 11 bits| < 0x800 | 110yyyyx 10xxxxxx 1363 | -- 16 bits| < 0x10000 | 1110yyyy 10yxxxxx 10xxxxxx 1364 | -- 21 bits| | 11110yyy 10yyxxxx 10xxxxxx 10xxxxxx 1365 | 1366 | -- | Unicode Code-point 1367 | -- 1368 | -- Keeping it as a 'Word' is more convenient for bit-ops and FFI 1369 | newtype CP = CP Word 1370 | 1371 | {-# INLINE ch2cp #-} 1372 | ch2cp :: Char -> CP 1373 | ch2cp (ord -> ci) 1374 | | isSurr ci = CP 0xFFFD 1375 | | otherwise = CP (fromIntegral ci) 1376 | 1377 | {-# INLINE isSurr #-} 1378 | isSurr :: (Num i, Bits i) => i -> Bool 1379 | isSurr ci = ci .&. 0xfff800 == 0xd800 1380 | 1381 | {-# INLINE cp2ch #-} 1382 | cp2ch :: CP -> Char 1383 | cp2ch (CP w) = (w < 0x110000) `assert` unsafeChr (fromIntegral w) 1384 | 1385 | -- used/needed by index-lookup functions to encode out of bounds 1386 | cp2chSafe :: CP -> Maybe Char 1387 | cp2chSafe cp 1388 | | cpNull cp = Nothing 1389 | | otherwise = Just $! cp2ch cp 1390 | where 1391 | cpNull :: CP -> Bool 1392 | cpNull (CP w) = w >= 0x110000 1393 | 1394 | {-# INLINE cpLen #-} 1395 | cpLen :: CP -> B 1396 | cpLen (CP cp) 1397 | | cp < 0x80 = B 1 1398 | | cp < 0x800 = B 2 1399 | | cp < 0x10000 = B 3 1400 | | otherwise = B 4 1401 | 1402 | -- convenience wrapper; unsafe like readCodePoint 1403 | {-# INLINE decodeCharAtOfs #-} 1404 | decodeCharAtOfs :: ShortText -> B -> (Char,B) 1405 | decodeCharAtOfs st ofs = (c,ofs') 1406 | where 1407 | c = cp2ch cp 1408 | ofs' = ofs + cpLen cp 1409 | cp = readCodePoint st ofs 1410 | {- pure version of decodeCharAtOfs, but unfortunately significantly slower 1411 | 1412 | decodeCharAtOfs st ofs 1413 | | b0 < 0x80 = (cp2ch $ CP b0,ofs + B 1) 1414 | | otherwise = case b0 `unsafeShiftR` 4 of 1415 | 0xf -> (cp2ch $ CP go4, ofs + B 4) 1416 | 0xe -> (cp2ch $ CP go3, ofs + B 3) 1417 | _ -> (cp2ch $ CP go2, ofs + B 2) 1418 | where 1419 | b0 = buf 0 1420 | buf j = indexWord8Array st (ofs+j) 1421 | 1422 | go2 = ((b0 .&. 0x1f) `unsafeShiftL` 6) 1423 | .|. (buf 1 .&. 0x3f) 1424 | 1425 | go3 = ((b0 .&. 0x0f) `unsafeShiftL` (6+6)) 1426 | .|. ((buf 1 .&. 0x3f) `unsafeShiftL` 6) 1427 | .|. (buf 2 .&. 0x3f) 1428 | 1429 | go4 = ((b0 .&. 0x07) `unsafeShiftL` (6+6+6)) 1430 | .|. ((buf 1 .&. 0x3f) `unsafeShiftL` (6+6)) 1431 | .|. ((buf 2 .&. 0x3f) `unsafeShiftL` 6) 1432 | .|. (buf 3 .&. 0x3f) 1433 | -} 1434 | 1435 | 1436 | -- | \(\mathcal{O}(1)\) Construct 'ShortText' from single codepoint. 1437 | -- 1438 | -- prop> singleton c == pack [c] 1439 | -- 1440 | -- prop> length (singleton c) == 1 1441 | -- 1442 | -- >>> singleton 'A' 1443 | -- "A" 1444 | -- 1445 | -- >>> map singleton ['\55295','\55296','\57343','\57344'] -- U+D7FF U+D800 U+DFFF U+E000 1446 | -- ["\55295","\65533","\65533","\57344"] 1447 | -- 1448 | -- __Note__: This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD. 1449 | -- 1450 | -- @since 0.1.2 1451 | singleton :: Char -> ShortText 1452 | singleton = singleton' . ch2cp 1453 | 1454 | singleton' :: CP -> ShortText 1455 | singleton' cp@(CP cpw) 1456 | | cpw < 0x80 = create 1 $ \mba -> writeCodePoint1 mba 0 cp 1457 | | cpw < 0x800 = create 2 $ \mba -> writeCodePoint2 mba 0 cp 1458 | | cpw < 0x10000 = create 3 $ \mba -> writeCodePoint3 mba 0 cp 1459 | | otherwise = create 4 $ \mba -> writeCodePoint4 mba 0 cp 1460 | 1461 | -- | \(\mathcal{O}(n)\) Prepend a character to a 'ShortText'. 1462 | -- 1463 | -- prop> cons c t == singleton c <> t 1464 | -- 1465 | -- @since 0.1.2 1466 | cons :: Char -> ShortText -> ShortText 1467 | cons (ch2cp -> cp@(CP cpw)) sfx 1468 | | n == 0 = singleton' cp 1469 | | cpw < 0x80 = create (n+1) $ \mba -> writeCodePoint1 mba 0 cp >> copySfx 1 mba 1470 | | cpw < 0x800 = create (n+2) $ \mba -> writeCodePoint2 mba 0 cp >> copySfx 2 mba 1471 | | cpw < 0x10000 = create (n+3) $ \mba -> writeCodePoint3 mba 0 cp >> copySfx 3 mba 1472 | | otherwise = create (n+4) $ \mba -> writeCodePoint4 mba 0 cp >> copySfx 4 mba 1473 | where 1474 | !n = toB sfx 1475 | 1476 | copySfx :: B -> MBA s -> ST s () 1477 | copySfx ofs mba = copyByteArray sfx 0 mba ofs n 1478 | 1479 | -- | \(\mathcal{O}(n)\) Append a character to the ond of a 'ShortText'. 1480 | -- 1481 | -- prop> snoc t c == t <> singleton c 1482 | -- 1483 | -- @since 0.1.2 1484 | snoc :: ShortText -> Char -> ShortText 1485 | snoc pfx (ch2cp -> cp@(CP cpw)) 1486 | | n == 0 = singleton' cp 1487 | | cpw < 0x80 = create (n+1) $ \mba -> copyPfx mba >> writeCodePoint1 mba n cp 1488 | | cpw < 0x800 = create (n+2) $ \mba -> copyPfx mba >> writeCodePoint2 mba n cp 1489 | | cpw < 0x10000 = create (n+3) $ \mba -> copyPfx mba >> writeCodePoint3 mba n cp 1490 | | otherwise = create (n+4) $ \mba -> copyPfx mba >> writeCodePoint4 mba n cp 1491 | where 1492 | !n = toB pfx 1493 | 1494 | copyPfx :: MBA s -> ST s () 1495 | copyPfx mba = copyByteArray pfx 0 mba 0 n 1496 | 1497 | {- 1498 | writeCodePoint :: MBA s -> Int -> Word -> ST s () 1499 | writeCodePoint mba ofs cp 1500 | | cp < 0x80 = writeCodePoint1 mba ofs cp 1501 | | cp < 0x800 = writeCodePoint2 mba ofs cp 1502 | | cp < 0x10000 = writeCodePoint3 mba ofs cp 1503 | | otherwise = writeCodePoint4 mba ofs cp 1504 | -} 1505 | 1506 | writeCodePointN :: B -> MBA s -> B -> CP -> ST s () 1507 | writeCodePointN 1 = writeCodePoint1 1508 | writeCodePointN 2 = writeCodePoint2 1509 | writeCodePointN 3 = writeCodePoint3 1510 | writeCodePointN 4 = writeCodePoint4 1511 | writeCodePointN _ = undefined 1512 | 1513 | writeCodePoint1 :: MBA s -> B -> CP -> ST s () 1514 | writeCodePoint1 mba ofs (CP cp) = 1515 | writeWord8Array mba ofs cp 1516 | 1517 | writeCodePoint2 :: MBA s -> B -> CP -> ST s () 1518 | writeCodePoint2 mba ofs (CP cp) = do 1519 | writeWord8Array mba ofs (0xc0 .|. (cp `unsafeShiftR` 6)) 1520 | writeWord8Array mba (ofs+1) (0x80 .|. (cp .&. 0x3f)) 1521 | 1522 | writeCodePoint3 :: MBA s -> B -> CP -> ST s () 1523 | writeCodePoint3 mba ofs (CP cp) = do 1524 | writeWord8Array mba ofs (0xe0 .|. (cp `unsafeShiftR` 12)) 1525 | writeWord8Array mba (ofs+1) (0x80 .|. ((cp `unsafeShiftR` 6) .&. 0x3f)) 1526 | writeWord8Array mba (ofs+2) (0x80 .|. (cp .&. 0x3f)) 1527 | 1528 | writeCodePoint4 :: MBA s -> B -> CP -> ST s () 1529 | writeCodePoint4 mba ofs (CP cp) = do 1530 | writeWord8Array mba ofs (0xf0 .|. (cp `unsafeShiftR` 18)) 1531 | writeWord8Array mba (ofs+1) (0x80 .|. ((cp `unsafeShiftR` 12) .&. 0x3f)) 1532 | writeWord8Array mba (ofs+2) (0x80 .|. ((cp `unsafeShiftR` 6) .&. 0x3f)) 1533 | writeWord8Array mba (ofs+3) (0x80 .|. (cp .&. 0x3f)) 1534 | 1535 | -- beware: UNSAFE! 1536 | readCodePoint :: ShortText -> B -> CP 1537 | readCodePoint st (csizeFromB -> ofs) 1538 | = CP $ unsafeDupablePerformIO (c_text_short_ofs_cp (toByteArray# st) ofs) 1539 | 1540 | foreign import ccall unsafe "hs_text_short_ofs_cp" c_text_short_ofs_cp :: ByteArray# -> CSize -> IO CCodePoint 1541 | 1542 | readCodePointRev :: ShortText -> B -> CP 1543 | readCodePointRev st (csizeFromB -> ofs) 1544 | = CP $ unsafeDupablePerformIO (c_text_short_ofs_cp_rev (toByteArray# st) ofs) 1545 | 1546 | foreign import ccall unsafe "hs_text_short_ofs_cp_rev" c_text_short_ofs_cp_rev :: ByteArray# -> CSize -> IO CCodePoint 1547 | 1548 | ---------------------------------------------------------------------------- 1549 | -- string & list literals 1550 | 1551 | -- | __Note__: Surrogate pairs (@[U+D800 .. U+DFFF]@) character literals are replaced by U+FFFD. 1552 | -- 1553 | -- @since 0.1.2 1554 | instance GHC.Exts.IsList ShortText where 1555 | type (Item ShortText) = Char 1556 | fromList = fromString 1557 | toList = toString 1558 | 1559 | -- | __Note__: Surrogate pairs (@[U+D800 .. U+DFFF]@) in string literals are replaced by U+FFFD. 1560 | -- 1561 | -- This matches the behaviour of 'S.IsString' instance for 'T.Text'. 1562 | instance S.IsString ShortText where 1563 | fromString = fromStringLit 1564 | 1565 | -- i.e., don't inline before Phase 0 1566 | {-# INLINE [0] fromStringLit #-} 1567 | fromStringLit :: String -> ShortText 1568 | fromStringLit = fromString 1569 | 1570 | {-# RULES "ShortText empty literal" fromStringLit "" = mempty #-} 1571 | 1572 | -- TODO: this doesn't seem to fire 1573 | {-# RULES "ShortText singleton literal" forall c . fromStringLit [c] = singleton c #-} 1574 | 1575 | {-# RULES "ShortText literal ASCII" forall s . fromStringLit (GHC.unpackCString# s) = fromLitAsciiAddr# s #-} 1576 | 1577 | {-# RULES "ShortText literal UTF-8" forall s . fromStringLit (GHC.unpackCStringUtf8# s) = fromLitMUtf8Addr# s #-} 1578 | 1579 | {-# NOINLINE fromLitAsciiAddr# #-} 1580 | fromLitAsciiAddr# :: Addr# -> ShortText 1581 | fromLitAsciiAddr# (Ptr -> ptr) = unsafeDupablePerformIO $ do 1582 | sz <- csizeToB `fmap` c_strlen ptr 1583 | 1584 | case sz `compare` 0 of 1585 | EQ -> return mempty -- should not happen if rules fire correctly 1586 | GT -> stToIO $ do 1587 | mba <- newByteArray sz 1588 | copyAddrToByteArray ptr mba 0 sz 1589 | unsafeFreeze mba 1590 | LT -> return (error "fromLitAsciiAddr#") 1591 | -- NOTE: should never happen unless strlen(3) overflows (NB: CSize 1592 | -- is unsigned; the overflow would occur when converting to 1593 | -- 'B') 1594 | 1595 | foreign import ccall unsafe "strlen" c_strlen :: CString -> IO CSize 1596 | 1597 | -- GHC uses an encoding resembling Modified UTF-8 for non-ASCII string-literals 1598 | {-# NOINLINE fromLitMUtf8Addr# #-} 1599 | fromLitMUtf8Addr# :: Addr# -> ShortText 1600 | fromLitMUtf8Addr# (Ptr -> ptr) = unsafeDupablePerformIO $ do 1601 | sz <- B `fmap` c_text_short_mutf8_strlen ptr 1602 | 1603 | case sz `compare` 0 of 1604 | EQ -> return mempty -- should not happen if rules fire correctly 1605 | GT -> stToIO $ do 1606 | mba <- newByteArray sz 1607 | copyAddrToByteArray ptr mba 0 sz 1608 | unsafeFreeze mba 1609 | LT -> do 1610 | mba <- stToIO (newByteArray (abs sz)) 1611 | c_text_short_mutf8_trans ptr (unMBA# mba) 1612 | stToIO (unsafeFreeze mba) 1613 | 1614 | foreign import ccall unsafe "hs_text_short_mutf8_strlen" c_text_short_mutf8_strlen :: CString -> IO Int 1615 | 1616 | foreign import ccall unsafe "hs_text_short_mutf8_trans" c_text_short_mutf8_trans :: CString -> MutableByteArray# RealWorld -> IO () 1617 | 1618 | -- $setup 1619 | -- >>> :set -XOverloadedStrings 1620 | -- >>> import Data.Text.Short (pack, unpack, concat) 1621 | -- >>> import Text.Show.Functions () 1622 | -- >>> import qualified Test.QuickCheck.Arbitrary as QC 1623 | -- >>> import Test.QuickCheck.Instances () 1624 | -- >>> instance QC.Arbitrary ShortText where { arbitrary = fmap fromString QC.arbitrary } 1625 | --------------------------------------------------------------------------------