├── src-bench
    ├── cabal.project
    ├── Bench.hs
    └── text-short-bench.cabal
├── .gitignore
├── cabal.project
├── src-ghc804
    └── PrimOps.hs
├── cbits
    ├── memcmp.c
    └── cbits.c
├── cabal.haskell-ci
├── src
    └── Data
    │   └── Text
    │       ├── Short
    │           ├── Unsafe.hs
    │           ├── Partial.hs
    │           └── Internal.hs
    │       └── Short.hs
├── LICENSE
├── text-short.cabal
├── ChangeLog.md
├── .github
    └── workflows
    │   └── haskell-ci.yml
└── src-test
    └── Tests.hs


/src-bench/cabal.project:
--------------------------------------------------------------------------------
1 | packages: .
2 | -- benchmarks: True
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /dist-newstyle/
2 | /dist/
3 | *~
4 | .ghc.environment.*
5 | 


--------------------------------------------------------------------------------
/cabal.project:
--------------------------------------------------------------------------------
1 | packages: .
2 | tests: True
3 | 
4 | package text-short
5 |   flags: +asserts
6 | 


--------------------------------------------------------------------------------
/src-ghc804/PrimOps.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE MagicHash #-}
2 | {-# LANGUAGE Unsafe    #-}
3 | 
4 | module PrimOps ( compareByteArrays# ) where
5 | 
6 | import           GHC.Exts (compareByteArrays#)
7 | 


--------------------------------------------------------------------------------
/cbits/memcmp.c:
--------------------------------------------------------------------------------
 1 | #include <string.h>
 2 | 
 3 | int
 4 | hs_text_short_memcmp(const void *s1, const size_t s1ofs, const void *s2, const size_t s2ofs, const size_t n)
 5 | {
 6 |   if (!n) return 0;
 7 | 
 8 |   const void *s1_ = s1+s1ofs;
 9 |   const void *s2_ = s2+s2ofs;
10 | 
11 |   return (s1_ == s2_) ? 0 : memcmp(s1_, s2_, n);
12 | }
13 | 


--------------------------------------------------------------------------------
/cabal.haskell-ci:
--------------------------------------------------------------------------------
 1 | branches: master
 2 | head-hackage: False
 3 | 
 4 | constraint-set bytestring-0.11
 5 |   constraints: bytestring ==0.11.*
 6 |   ghc: <9.7
 7 |   tests: True
 8 |   run-tests: True
 9 | 
10 | constraint-set text-1
11 |   ghc: <9.3
12 |   constraints: text ^>=1.2.3.0
13 |   tests: True
14 |   run-tests: True
15 | 
16 | constraint-set text-2
17 |   ghc: >= 8.0 && <9.7
18 |   constraints: text ^>=2
19 |   tests: True
20 |   run-tests: True
21 | 


--------------------------------------------------------------------------------
/src/Data/Text/Short/Unsafe.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE Unsafe #-}
 2 | 
 3 | -- |
 4 | -- Module      : Data.Text.Short.Unsafe
 5 | -- Copyright   : © Herbert Valerio Riedel 2017
 6 | -- License     : BSD3
 7 | --
 8 | -- Maintainer  : hvr@gnu.org
 9 | -- Stability   : stable
10 | --
11 | -- Unsafe API
12 | --
13 | -- This module provides unsafe conversion functions
14 | module Data.Text.Short.Unsafe
15 |     ( fromShortByteStringUnsafe
16 |     , fromByteStringUnsafe
17 |     ) where
18 | 
19 | import Data.Text.Short.Internal
20 | import Prelude ()
21 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2017, Herbert Valerio Riedel
 2 | 
 3 | All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions are met:
 7 | 
 8 |     * Redistributions of source code must retain the above copyright
 9 |       notice, this list of conditions and the following disclaimer.
10 | 
11 |     * Redistributions in binary form must reproduce the above
12 |       copyright notice, this list of conditions and the following
13 |       disclaimer in the documentation and/or other materials provided
14 |       with the distribution.
15 | 
16 |     * Neither the name of Herbert Valerio Riedel nor the names of other
17 |       contributors may be used to endorse or promote products derived
18 |       from this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | 


--------------------------------------------------------------------------------
/src/Data/Text/Short/Partial.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE Trustworthy #-}
  2 | 
  3 | -- |
  4 | -- Module      : Data.Text.Short.Partial
  5 | -- Copyright   : © Herbert Valerio Riedel 2018
  6 | -- License     : BSD3
  7 | --
  8 | -- Maintainer  : hvr@gnu.org
  9 | -- Stability   : stable
 10 | --
 11 | -- Partial functions vocabulary
 12 | --
 13 | -- This module provides common partial functions for operating on 'ShortText'.
 14 | --
 15 | -- The use of these functions is discouraged as they tend to be error-prone.
 16 | --
 17 | -- @since 0.1.2
 18 | module Data.Text.Short.Partial
 19 |     ( head
 20 |     , tail
 21 |     , init
 22 |     , last
 23 |     , index
 24 | 
 25 |     , foldl1
 26 |     , foldl1'
 27 |     , foldr1
 28 |     ) where
 29 | 
 30 | import           Data.Text.Short
 31 | import           Data.Text.Short.Internal
 32 | import           Prelude                  ()
 33 | 
 34 | -- | \(\mathcal{O}(1)\) Returns first character of a non-empty 'ShortText'
 35 | --
 36 | -- >>> head "abcd"
 37 | -- 'a'
 38 | --
 39 | -- __Note__: Will throw an 'error' exception for empty 'ShortText's.
 40 | -- Consider using the total functions 'uncons' or 'indexMaybe'
 41 | -- instead.
 42 | --
 43 | -- @since 0.1.2
 44 | head :: ShortText -> Char
 45 | head = maybe (error "head: empty ShortText") fst . uncons
 46 | 
 47 | -- | \(\mathcal{O}(n)\) Drop first character from non-empty 'ShortText'.
 48 | --
 49 | -- >>> tail "abcd"
 50 | -- "bcd"
 51 | --
 52 | -- __Note__: Will throw an 'error' exception for empty 'ShortText's.
 53 | -- Consider using the total functions 'uncons' or 'drop' instead.
 54 | --
 55 | -- @since 0.1.2
 56 | tail :: ShortText -> ShortText
 57 | tail = maybe (error "tail: empty ShortText") snd . uncons
 58 | 
 59 | -- | \(\mathcal{O}(n)\) Drop last character from non-empty 'ShortText'.
 60 | --
 61 | -- >>> tail "abcd"
 62 | -- "bcd"
 63 | --
 64 | -- __Note__: Will throw an 'error' exception for empty 'ShortText's.
 65 | -- Consider using the total functions 'unsnoc' or 'dropEnd' instead.
 66 | --
 67 | -- @since 0.1.2
 68 | init :: ShortText -> ShortText
 69 | init = maybe (error "init: empty ShortText") fst . unsnoc
 70 | 
 71 | -- | \(\mathcal{O}(1)\) Return last character from non-empty 'ShortText'.
 72 | --
 73 | -- >>> last "abcd"
 74 | -- 'd'
 75 | --
 76 | -- __Note__: Will throw an 'error' exception for empty 'ShortText's.
 77 | -- Consider using the total functions 'unsnoc' or 'indexEndMaybe'
 78 | -- instead.
 79 | --
 80 | -- @since 0.1.2
 81 | last :: ShortText -> Char
 82 | last = maybe (error "last: empty ShortText") snd . unsnoc
 83 | 
 84 | -- | \(\mathcal{O}(n)\) Retrieve \(i\)-th character (code-point)
 85 | --
 86 | -- >>> index "abcd" 1
 87 | -- 'b'
 88 | --
 89 | -- __Note__: Will throw an 'error' exception if index is out of
 90 | -- bounds.  Consider using the total functions 'indexMaybe' or
 91 | -- 'indexEndMaybe' instead.
 92 | --
 93 | -- @since 0.1.2
 94 | index :: ShortText -> Int -> Char
 95 | index st i = case indexMaybe st i of
 96 |                Nothing -> error "index: not within ShortText"
 97 |                Just c  -> c
 98 | 
 99 | -- $setup
100 | -- >>> :set -XOverloadedStrings
101 | 


--------------------------------------------------------------------------------
/src-bench/Bench.hs:
--------------------------------------------------------------------------------
 1 | module Main where
 2 | 
 3 | import           Criterion.Main
 4 | 
 5 | import           Control.Exception
 6 | import qualified Data.Text                as T
 7 | import qualified Data.Text.Encoding       as T
 8 | import qualified Data.Text.Short.Internal as IUT
 9 | 
10 | -- The function we're benchmarking.
11 | fib m | m < 0     = error "negative!"
12 |       | otherwise = go m
13 |   where
14 |     go 0 = 0
15 |     go 1 = 1
16 |     go n = go (n-1) + go (n-2)
17 | 
18 | {-# NOINLINE allcharsBS1 #-}
19 | allcharsBS1 = T.encodeUtf8 (T.pack ['\x0'..'\x7f'])
20 | 
21 | {-# NOINLINE allcharsST1 #-}
22 | Just allcharsST1 = IUT.fromByteString allcharsBS1
23 | 
24 | {-# NOINLINE allcharsST1' #-}
25 | Just allcharsST1' = IUT.fromByteString (IUT.toByteString allcharsST1)
26 | 
27 | {-# NOINLINE allcharsBS2 #-}
28 | allcharsBS2 = T.encodeUtf8 (T.pack $ ['\x0'..'\xd7ff'] ++ ['\xe000'..'\x10ffff'])
29 | 
30 | {-# NOINLINE allcharsST2 #-}
31 | Just allcharsST2 = IUT.fromByteString allcharsBS2
32 | 
33 | {-# NOINLINE allcharsST2' #-}
34 | Just allcharsST2' = IUT.fromByteString (IUT.toByteString allcharsST2)
35 | 
36 | {-# NOINLINE allAscii128K #-}
37 | allAscii128K = mconcat (replicate 1024 allcharsST1)
38 | 
39 | -- Our benchmark harness.
40 | main = do
41 |   evaluate allcharsST1
42 |   evaluate allcharsST1'
43 |   evaluate allcharsST2
44 |   evaluate allcharsST2'
45 |   evaluate allAscii128K
46 | 
47 |   defaultMain
48 |     [ bgroup "singleton"
49 |       [ bench "'a' :: ShortText" $ whnf IUT.singleton 'a'
50 |       , bench "'a' :: Text" $ whnf (T.singleton) 'a'
51 |       , bench "U+10FFFF :: ShortText" $ whnf (IUT.singleton) '\x10ffff'
52 |       , bench "U+10FFFF :: Text" $ whnf (T.singleton) '\x10ffff'
53 |       ]
54 |     , bgroup "toString"
55 |       [ bench "t1" $ nf IUT.toString allcharsST1
56 |       , bench "t2" $ nf IUT.toString allcharsST2
57 |       , bench "t3" $ nf IUT.toString allAscii128K
58 |       , bench "t1 (Text)" $ nf T.unpack (IUT.toText allcharsST1)
59 |       , bench "t2 (Text)" $ nf T.unpack (IUT.toText allcharsST2)
60 |       , bench "t3 (Text)" $ nf T.unpack (IUT.toText allAscii128K)
61 |       ]
62 | 
63 |     , bgroup "length"
64 |       [ bench "1" $ whnf IUT.length allcharsST1
65 |       , bench "2" $ whnf IUT.length allcharsST2
66 |       , bench "3" $ whnf IUT.length allAscii128K
67 |       ]
68 | 
69 |     , bgroup "=="
70 |       [ bench "== 1a"  $ whnf (== allcharsST1) allcharsST1
71 |       , bench "== 1b"  $ whnf (== allcharsST1) allcharsST1'
72 |       , bench "== 2a"  $ whnf (== allcharsST2) allcharsST2
73 |       , bench "== 2b"  $ whnf (== allcharsST2) allcharsST2'
74 |       ]
75 | 
76 |     , bgroup "isAscii"
77 |       [ bench "isAscii 1" $ whnf IUT.isAscii allcharsST1
78 |       , bench "isAscii 2" $ whnf IUT.isAscii allcharsST2
79 |       , bench "isAscii 3" $ whnf IUT.isAscii allAscii128K
80 |       ]
81 | 
82 |     , bgroup "isValidUtf8"
83 |       [ bench "isValidUtf8 1" $ whnf IUT.isValidUtf8 allcharsST1
84 |       , bench "fromByteString 1" $ whnf IUT.fromByteString allcharsBS1
85 | 
86 |       , bench "isValidUtf8 2" $ whnf IUT.isValidUtf8 allcharsST2
87 |       , bench "fromByteString 2" $ whnf IUT.fromByteString allcharsBS2
88 |       ]
89 |     ]
90 | 
91 | 


--------------------------------------------------------------------------------
/text-short.cabal:
--------------------------------------------------------------------------------
  1 | cabal-version:      1.18
  2 | name:               text-short
  3 | version:            0.1.6
  4 | x-revision:         3
  5 | synopsis:           Memory-efficient representation of Unicode text strings
  6 | license:            BSD3
  7 | license-file:       LICENSE
  8 | author:             Herbert Valerio Riedel
  9 | maintainer:         hvr@gnu.org
 10 | bug-reports:        https://github.com/hvr/text-short/issues
 11 | category:           Data
 12 | build-type:         Simple
 13 | description:
 14 |   This package provides the 'ShortText' type which is suitable for keeping many short strings in memory. This is similiar to how 'ShortByteString' relates to 'ByteString'.
 15 |   .
 16 |   The main difference between 'Text' and 'ShortText' is that 'ShortText' doesn't support zero-copy slicing (thereby saving 2 words), and, compared to text-1.*, that it uses UTF-8 instead of UTF-16 internally. Consequently, the memory footprint of a (boxed) 'ShortText' value is 4 words (2 words when unboxed) plus the length of the UTF-8 encoded payload.
 17 | 
 18 | tested-with:
 19 |   GHC ==8.6.5
 20 |    || ==8.8.3
 21 |    || ==8.10.7
 22 |    || ==9.0.2
 23 |    || ==9.2.8
 24 |    || ==9.4.8
 25 |    || ==9.6.6
 26 |    || ==9.8.4
 27 |    || ==9.10.1
 28 |    || ==9.12.1
 29 | 
 30 | extra-source-files: ChangeLog.md
 31 | 
 32 | source-repository head
 33 |   type:     git
 34 |   location: https://github.com/hvr/text-short.git
 35 | 
 36 | flag asserts
 37 |   description: Enable runtime-checks via @assert@
 38 |   default:     False
 39 |   manual:      True
 40 | 
 41 | library
 42 |   exposed-modules:
 43 |     Data.Text.Short
 44 |     Data.Text.Short.Partial
 45 |     Data.Text.Short.Unsafe
 46 | 
 47 |   other-modules:    Data.Text.Short.Internal
 48 |   build-depends:
 49 |       base              >=4.12     && <4.22
 50 |     , binary            >=0.8.6.0  && <0.9
 51 |     , bytestring        >=0.10.8.2 && <0.13
 52 |     , deepseq           >=1.4.4.0  && <1.6
 53 |     , ghc-prim          >=0.5.3    && <0.14
 54 |     , hashable          >=1.4.4.0  && <1.6
 55 |     , template-haskell  >=2.14.0.0 && <2.24
 56 |     , text              >=1.2.3.1  && <1.3  || >=2.0 && <2.2
 57 | 
 58 |   other-modules:    PrimOps
 59 |   hs-source-dirs:   src src-ghc804
 60 |   default-language: Haskell2010
 61 |   other-extensions:
 62 |     CPP
 63 |     GeneralizedNewtypeDeriving
 64 |     MagicHash
 65 |     TemplateHaskellQuotes
 66 |     Trustworthy
 67 |     UnliftedFFITypes
 68 |     Unsafe
 69 | 
 70 |   c-sources:        cbits/cbits.c
 71 | 
 72 |   if flag(asserts)
 73 |     ghc-options: -fno-ignore-asserts
 74 | 
 75 |   else
 76 |     cc-options: -DNDEBUG=1
 77 | 
 78 |   ghc-options:      -Wall
 79 |   cc-options:       -Wall
 80 | 
 81 | test-suite text-short-tests
 82 |   type:             exitcode-stdio-1.0
 83 |   hs-source-dirs:   src-test
 84 |   main-is:          Tests.hs
 85 | 
 86 |   -- bytestring dependency for cabal_macros.h
 87 |   build-depends:
 88 |       base
 89 |     , binary
 90 |     , bytestring
 91 |     , template-haskell
 92 |     , text
 93 |     , text-short
 94 | 
 95 |   -- deps which don't inherit constraints from library stanza:
 96 |   build-depends:
 97 |       tasty             >=1.4    && <1.6
 98 |     , tasty-hunit       >=0.10.0 && <0.11
 99 |     , tasty-quickcheck  >=0.10   && <0.12
100 | 
101 |   default-language: Haskell2010
102 | 


--------------------------------------------------------------------------------
/src-bench/text-short-bench.cabal:
--------------------------------------------------------------------------------
 1 | cabal-version:       2.0
 2 | 
 3 | name:                text-short-bench
 4 | version:             0.1.2
 5 | synopsis:            Memory-efficient representation of Unicode text strings
 6 | license:             BSD3
 7 | license-file:        LICENSE
 8 | author:              Herbert Valerio Riedel
 9 | maintainer:          hvr@gnu.org
10 | bug-reports:         https://github.com/hvr/text-short/issues
11 | category:            Data
12 | build-type:          Simple
13 | description:         This package provides the 'ShortText' type which is suitable for keeping many short strings in memory. This is similiar to how 'ShortByteString' relates to 'ByteString'.
14 |                      .
15 |                      The main difference between 'Text' and 'ShortText' is that 'ShortText' uses UTF-8 instead of UTF-16 internally and also doesn't support slicing (thereby saving 2 words). Consequently, the memory footprint of a (boxed) 'ShortText' value is 4 words (2 words when unboxed) plus the length of the UTF-8 encoded payload.
16 | 
17 | tested-with:         GHC==8.2.2, GHC==8.0.2, GHC==7.10.3, GHC==7.8.4
18 | extra-source-files:  ChangeLog.md
19 | 
20 | Source-Repository head
21 |     Type:              git
22 |     Location:          https://github.com/hvr/text-short.git
23 | 
24 | flag asserts
25 |   description: Enable runtime-checks via @assert@
26 |   default: False
27 |   manual: True
28 | 
29 | library text-short-iut
30 |   exposed-modules:     Data.Text.Short
31 |                        Data.Text.Short.Partial
32 |                        Data.Text.Short.Unsafe
33 | 
34 |   exposed-modules:     Data.Text.Short.Internal
35 | 
36 |   build-depends:       base        >= 4.7    && < 4.11
37 |                      , bytestring  >= 0.10.4 && < 0.11
38 |                      , hashable    >= 1.2.6  && < 1.3
39 |                      , deepseq     >= 1.3    && < 1.5
40 |                      , text        >= 1.0    && < 1.3
41 |                      , binary      >= 0.7.1  && < 0.9
42 |                      , ghc-prim    >= 0.3.1  && < 0.6
43 | 
44 |   if !impl(ghc >= 8.0)
45 |      build-depends: semigroups >= 0.18.2 && < 0.19
46 | 
47 |   -- GHC version specific PrimOps
48 |   if impl(ghc >= 8.4)
49 |      hs-source-dirs:   ../src-ghc804
50 |   else
51 |      c-sources:        ../cbits/memcmp.c
52 |      hs-source-dirs:   ../src-ghc708
53 |   other-modules:       PrimOps
54 | 
55 |   hs-source-dirs:      ../src
56 | 
57 |   default-language:    Haskell2010
58 |   other-extensions:    CPP
59 |                      , GeneralizedNewtypeDeriving
60 |                      , MagicHash
61 |                      , UnliftedFFITypes
62 |                      , Trustworthy
63 |                      , Unsafe
64 | 
65 |   c-sources: ../cbits/cbits.c
66 | 
67 |   if flag(asserts)
68 |      ghc-options: -fno-ignore-asserts
69 |   else
70 |      cc-options: -DNDEBUG=1
71 | 
72 |   ghc-options: -Wall
73 |   cc-options: -O3 -Wall
74 | 
75 | executable text-short-bench
76 |   default-language:    Haskell2010
77 | --  type:                exitcode-stdio-1.0
78 |   hs-source-dirs:      .
79 |   main-is:             Bench.hs
80 | 
81 |   build-depends: base
82 |                , binary
83 |                , text
84 |                , text-short-iut
85 |                  -- deps which don't inherit constraints from library stanza:
86 |                , criterion >= 1.3.0.0 && < 1.4
87 | 


--------------------------------------------------------------------------------
/ChangeLog.md:
--------------------------------------------------------------------------------
 1 | ## 0.1.6
 2 | 
 3 |   * Drop support for GHC prior 8.6.5
 4 |   * Support GHC-9.10 (base-4.21)
 5 | 
 6 | ## 0.1.5
 7 | 
 8 |   * text-2.0 support
 9 | 
10 | ## 0.1.4
11 | 
12 |   * Fix `fromString` for single character strings.
13 |     https://github.com/haskell-hvr/text-short/issues/20
14 |   * Add Template Haskell `Lift ShortText` instance.
15 | 
16 | ## 0.1.3
17 | 
18 |   * Add `Data ShortText` instance
19 |   * Define `Typeable ShortText` also for GHC 7.8 as well
20 |     (NB: for GHC 7.10.3 and up `Typeable` instances are automatically
21 |      defined even when not mentioned explicitly in a `deriving` clause)
22 |   * Add equivalent verb `Data.Text.split` to `Data.Text.Short` API
23 | 
24 |         split :: (Char -> Bool) -> ShortText -> [ShortText]
25 | 
26 | ## 0.1.2
27 | 
28 |   * Add `IsList ShortText` and `PrintfArg ShortText` instances
29 |   * Expose partial functions via new `Data.Text.Short.Partial` module
30 | 
31 |         foldl1 :: (Char -> Char -> Char) -> ShortText -> Char
32 |         foldl1' :: (Char -> Char -> Char) -> ShortText -> Char
33 |         foldr1 :: (Char -> Char -> Char) -> ShortText -> Char
34 |         head :: ShortText -> Char
35 |         index :: ShortText -> Int -> Char
36 |         init :: ShortText -> ShortText
37 |         last :: ShortText -> Char
38 |         tail :: ShortText -> ShortText
39 | 
40 |   * Add several `Data.Text` verbs to `Data.Text.Short` API
41 | 
42 |         (!?) :: ShortText -> Int -> Maybe Char
43 |         all :: (Char -> Bool) -> ShortText -> Bool
44 |         any :: (Char -> Bool) -> ShortText -> Bool
45 |         append :: ShortText -> ShortText -> ShortText
46 |         break :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
47 |         breakEnd :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
48 |         concat :: [ShortText] -> ShortText
49 |         cons :: Char -> ShortText -> ShortText
50 |         drop :: Int -> ShortText -> ShortText
51 |         dropAround :: (Char -> Bool) -> ShortText -> ShortText
52 |         dropEnd :: Int -> ShortText -> ShortText
53 |         dropWhile :: (Char -> Bool) -> ShortText -> ShortText
54 |         dropWhileEnd :: (Char -> Bool) -> ShortText -> ShortText
55 |         empty :: ShortText
56 |         filter :: (Char -> Bool) -> ShortText -> ShortText
57 |         find :: (Char -> Bool) -> ShortText -> Maybe Char
58 |         findIndex :: (Char -> Bool) -> ShortText -> Maybe Int
59 |         foldl :: (a -> Char -> a) -> a -> ShortText -> a
60 |         foldl' :: (a -> Char -> a) -> a -> ShortText -> a
61 |         foldr :: (Char -> a -> a) -> a -> ShortText -> a
62 |         indexEndMaybe :: ShortText -> Int -> Maybe Char
63 |         indexMaybe :: ShortText -> Int -> Maybe Char
64 |         intercalate :: ShortText -> [ShortText] -> ShortText
65 |         intersperse :: Char -> ShortText -> ShortText
66 |         isPrefixOf :: ShortText -> ShortText -> Bool
67 |         isSuffixOf :: ShortText -> ShortText -> Bool
68 |         pack :: [Char] -> ShortText
69 |         replicate :: Int -> ShortText -> ShortText
70 |         reverse :: ShortText -> ShortText
71 |         singleton :: Char -> ShortText
72 |         snoc :: ShortText -> Char -> ShortText
73 |         span :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
74 |         spanEnd :: (Char -> Bool) -> ShortText -> (ShortText, ShortText)
75 |         splitAt :: Int -> ShortText -> (ShortText, ShortText)
76 |         splitAtEnd :: Int -> ShortText -> (ShortText, ShortText)
77 |         stripPrefix :: ShortText -> ShortText -> Maybe ShortText
78 |         stripSuffix :: ShortText -> ShortText -> Maybe ShortText
79 |         take :: Int -> ShortText -> ShortText
80 |         takeEnd :: Int -> ShortText -> ShortText
81 |         takeWhile :: (Char -> Bool) -> ShortText -> ShortText
82 |         takeWhileEnd :: (Char -> Bool) -> ShortText -> ShortText
83 |         uncons :: ShortText -> Maybe (Char, ShortText)
84 |         unpack :: ShortText -> [Char]
85 |         unsnoc :: ShortText -> Maybe (ShortText, Char)
86 | 
87 |   * Optimise low-level primitives
88 |   * Add support for GHC 8.4
89 | 
90 | ## 0.1.1
91 | 
92 | * Expose *unsafe* conversion API via `Data.Text.Short.Unsafe` module
93 | * Minor documentation improvement
94 | 
95 | ## 0.1
96 | 
97 | * First version. Released on an unsuspecting world.
98 | 


--------------------------------------------------------------------------------
/src/Data/Text/Short.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE Trustworthy #-}
  2 | -- |
  3 | -- Module      : Data.Text.Short
  4 | -- Copyright   : © Herbert Valerio Riedel 2017
  5 | -- License     : BSD3
  6 | --
  7 | -- Maintainer  : hvr@gnu.org
  8 | -- Stability   : stable
  9 | --
 10 | -- Memory-efficient representation of Unicode text strings.
 11 | --
 12 | -- This module is intended to be imported @qualified@, to avoid name
 13 | -- clashes with "Prelude" functions, e.g.
 14 | --
 15 | -- > import qualified Data.Text.Short as TS
 16 | -- > import qualified Data.Text.Short (ShortText)
 17 | --
 18 | -- This modules deliberately omits (common) partial functions, which
 19 | -- can be found in "Data.Text.Short.Partial" instead.
 20 | --
 21 | -- @since 0.1
 22 | module Data.Text.Short
 23 |     ( -- * The 'ShortText' type
 24 |       ShortText
 25 | 
 26 |       -- * Basic operations
 27 |       -- ** Construction
 28 |     , empty
 29 |     , singleton
 30 |     , pack
 31 |     , append
 32 |     , concat
 33 |     , cons
 34 |     , snoc
 35 |     , replicate
 36 | 
 37 |       -- ** Deconstruction
 38 |     , unpack
 39 |     , uncons
 40 |     , unsnoc
 41 | 
 42 |       -- ** Querying & predicates
 43 |     , null
 44 |     , length
 45 |     , isAscii
 46 |     , all
 47 |     , any
 48 |     , find
 49 |     , isPrefixOf
 50 |     , isSuffixOf
 51 | 
 52 |       -- ** Lookup & indexing
 53 |     , (!?)
 54 |     , indexMaybe
 55 |     , indexEndMaybe
 56 |     , findIndex
 57 | 
 58 |       -- * Splitting 'ShortText's
 59 |       -- ** Basic functions
 60 |     , take
 61 |     , takeEnd
 62 |     , drop
 63 |     , dropEnd
 64 |     , takeWhile
 65 |     , takeWhileEnd
 66 |     , dropWhile
 67 |     , dropWhileEnd
 68 | 
 69 |     , dropAround
 70 | 
 71 |       -- ** Pair-valued functions
 72 |     , splitAt
 73 |     , splitAtEnd
 74 |     , span
 75 |     , break
 76 |     , spanEnd
 77 |     , breakEnd
 78 | 
 79 |       -- ** Breaking into many substrings
 80 |     , split
 81 | 
 82 |       -- ** Suffix & Prefix operations
 83 |     , stripPrefix
 84 |     , stripSuffix
 85 | 
 86 |       -- * Transformations
 87 |     , intersperse
 88 |     , intercalate
 89 |     , reverse
 90 |     , filter
 91 | 
 92 |       -- * Folds
 93 |     , foldl
 94 |     , foldl'
 95 |     , foldr
 96 | 
 97 |       -- * Conversions
 98 |       -- ** 'String'
 99 |     , fromString
100 |     , toString
101 | 
102 |       -- ** 'Text'
103 |     , fromText
104 |     , toText
105 | 
106 |       -- ** 'ByteString'
107 |     , fromShortByteString
108 |     , toShortByteString
109 | 
110 |     , fromByteString
111 |     , toByteString
112 | 
113 |     , toBuilder
114 | 
115 |     ) where
116 | 
117 | import           Data.Semigroup
118 | import           Data.Text.Short.Internal
119 | import           Prelude                  ()
120 | 
121 | -- | \(\mathcal{O}(n)\) Variant of 'span' with negated predicate.
122 | --
123 | -- >>> break (> 'c') "abcdabcd"
124 | -- ("abc","dabcd")
125 | --
126 | -- prop> break p t == span (not . p) t
127 | --
128 | -- prop> fst (break p t) <> snd (break p t) == t
129 | --
130 | -- @since 0.1.2
131 | break :: (Char -> Bool) -> ShortText -> (ShortText,ShortText)
132 | break p st = span (not . p) st
133 | 
134 | -- | \(\mathcal{O}(n)\) Variant of 'spanEnd' with negated predicate.
135 | --
136 | -- >>> breakEnd (< 'c') "abcdabcd"
137 | -- ("abcdab","cd")
138 | --
139 | -- prop> breakEnd p t == spanEnd (not . p) t
140 | --
141 | -- prop> fst (breakEnd p t) <> snd (breakEnd p t) == t
142 | --
143 | -- @since 0.1.2
144 | breakEnd :: (Char -> Bool) -> ShortText -> (ShortText,ShortText)
145 | breakEnd p st = spanEnd (not . p) st
146 | 
147 | -- | \(\mathcal{O}(n)\) Index /i/-th code-point in 'ShortText'.
148 | --
149 | -- Infix operator alias of 'indexMaybe'
150 | --
151 | -- >>> "abcdefg" !? 2
152 | -- Just 'c'
153 | --
154 | -- @since 0.1.2
155 | (!?) :: ShortText -> Int -> Maybe Char
156 | (!?) = indexMaybe
157 | 
158 | -- | \(\mathcal{O}(n)\) Test whether /any/ code points in 'ShortText' satisfy a predicate.
159 | --
160 | -- >>> any (> 'c') "abcdabcd"
161 | -- True
162 | --
163 | -- >>> any (const True) ""
164 | -- False
165 | --
166 | -- >>> any (== 'c') "abdabd"
167 | -- False
168 | --
169 | -- prop> any p t == not (all (not . p) t)
170 | --
171 | -- @since 0.1.2
172 | any :: (Char -> Bool) -> ShortText -> Bool
173 | any p st = case find p st of
174 |              Nothing -> False
175 |              Just _  -> True
176 | 
177 | -- | \(\mathcal{O}(n)\) Concatenate two 'ShortText's
178 | --
179 | -- This is a type-specialised alias of '<>'.
180 | --
181 | -- >>> append "foo" "bar"
182 | -- "foobar"
183 | --
184 | -- prop> length (append t1 t2) == length t1 + length t2
185 | --
186 | -- @since 0.1.2
187 | append :: ShortText -> ShortText -> ShortText
188 | append = (<>)
189 | 
190 | -- | \(\mathcal{O}(n)\) Concatenate list of 'ShortText's
191 | --
192 | -- This is a type-specialised alias of 'mconcat'.
193 | --
194 | -- >>> concat []
195 | -- ""
196 | --
197 | -- >>> concat ["foo","bar","doo"]
198 | -- "foobardoo"
199 | --
200 | -- @since 0.1.2
201 | concat :: [ShortText] -> ShortText
202 | concat = mconcat
203 | 
204 | -- | \(\mathcal{O}(0)\) The /empty/ 'ShortText'.
205 | --
206 | -- This is a type-specialised alias of 'mempty'.
207 | --
208 | -- >>> empty
209 | -- ""
210 | --
211 | -- >>> null empty
212 | -- True
213 | --
214 | -- @since 0.1.2
215 | empty :: ShortText
216 | empty = mempty
217 | 
218 | -- | \(\mathcal{O}(n)\) Construct a 'ShortText' from a list of 'Char's.
219 | --
220 | -- This is an alias for 'fromString'.
221 | --
222 | -- @since 0.1.2
223 | pack :: [Char] -> ShortText
224 | pack = fromString
225 | 
226 | -- | \(\mathcal{O}(n)\) Convert 'ShortText' into a list of 'Char's.
227 | --
228 | -- This is an alias for 'toString'.
229 | --
230 | -- prop> (pack . unpack) t == t
231 | --
232 | -- @since 0.1.2
233 | unpack :: ShortText -> [Char]
234 | unpack = toString
235 | 
236 | -- | \(\mathcal{O}(n)\) Take prefix of given length or return whole 'ShortText' if too short.
237 | --
238 | -- >>> take 3 "abcdef"
239 | -- "abc"
240 | --
241 | -- >>> take 3 "ab"
242 | -- "ab"
243 | --
244 | -- @since 0.1.2
245 | take :: Int -> ShortText -> ShortText
246 | take n = fst . splitAt n
247 | 
248 | -- | \(\mathcal{O}(n)\) Take suffix of given length or return whole 'ShortText' if too short.
249 | --
250 | -- >>> takeEnd 3 "abcdefg"
251 | -- "efg"
252 | --
253 | -- >>> takeEnd 3 "ab"
254 | -- "ab"
255 | --
256 | -- @since 0.1.2
257 | takeEnd :: Int -> ShortText -> ShortText
258 | takeEnd n = snd . splitAtEnd n
259 | 
260 | -- | \(\mathcal{O}(n)\) Take remove prefix of given length from 'ShortText' or return 'empty' 'ShortText' if too short.
261 | --
262 | -- >>> drop 4 "abcdef"
263 | -- "ef"
264 | --
265 | -- >>> drop 4 "ab"
266 | -- ""
267 | --
268 | -- @since 0.1.2
269 | drop :: Int -> ShortText -> ShortText
270 | drop n = snd . splitAt n
271 | 
272 | -- | \(\mathcal{O}(n)\) Take remove suffix of given length from 'ShortText' or return 'empty' 'ShortText' if too short.
273 | --
274 | -- >>> drop 4 "abcdefghi"
275 | -- "efghi"
276 | --
277 | -- >>> drop 4 "ab"
278 | -- ""
279 | --
280 | -- @since 0.1.2
281 | dropEnd :: Int -> ShortText -> ShortText
282 | dropEnd n = fst . splitAtEnd n
283 | 
284 | -- | \(\mathcal{O}(n)\) Take longest prefix satisfying given predicate.
285 | --
286 | -- prop> takeWhile p t == fst (span p t)
287 | --
288 | -- >>> takeWhile (< 'c') "abcdabcd"
289 | -- "ab"
290 | --
291 | -- @since 0.1.2
292 | takeWhile :: (Char -> Bool) -> ShortText -> ShortText
293 | takeWhile p = fst . span p
294 | 
295 | -- | \(\mathcal{O}(n)\) Take longest suffix satisfying given predicate.
296 | --
297 | -- prop> takeWhileEnd p t == snd (spanEnd p t)
298 | --
299 | -- >>> takeWhileEnd (>= 'c') "abcdabcd"
300 | -- "cd"
301 | --
302 | -- @since 0.1.2
303 | takeWhileEnd :: (Char -> Bool) -> ShortText -> ShortText
304 | takeWhileEnd p = snd . spanEnd p
305 | 
306 | -- | \(\mathcal{O}(n)\) Remove longest prefix satisfying given predicate.
307 | --
308 | -- prop> dropWhile p t == snd (span p t)
309 | --
310 | -- >>> dropWhile (< 'c') "abcdabcd"
311 | -- "cdabcd"
312 | --
313 | -- @since 0.1.2
314 | dropWhile :: (Char -> Bool) -> ShortText -> ShortText
315 | dropWhile p = snd . span p
316 | 
317 | -- | \(\mathcal{O}(n)\) Remove longest suffix satisfying given predicate.
318 | --
319 | -- prop> dropWhileEnd p t == fst (spanEnd p t)
320 | --
321 | -- >>> dropWhileEnd (>= 'c') "abcdabcd"
322 | -- "abcdab"
323 | --
324 | -- @since 0.1.2
325 | dropWhileEnd :: (Char -> Bool) -> ShortText -> ShortText
326 | dropWhileEnd p = fst . spanEnd p
327 | 
328 | 
329 | -- $setup
330 | -- >>> :set -XOverloadedStrings
331 | -- >>> import Text.Show.Functions ()
332 | -- >>> import qualified Test.QuickCheck.Arbitrary as QC
333 | -- >>> instance QC.Arbitrary ShortText where { arbitrary = fmap fromString QC.arbitrary }
334 | 


--------------------------------------------------------------------------------
/.github/workflows/haskell-ci.yml:
--------------------------------------------------------------------------------
  1 | # This GitHub workflow config has been generated by a script via
  2 | #
  3 | #   haskell-ci 'github' 'cabal.project'
  4 | #
  5 | # To regenerate the script (for example after adjusting tested-with) run
  6 | #
  7 | #   haskell-ci regenerate
  8 | #
  9 | # For more information, see https://github.com/haskell-CI/haskell-ci
 10 | #
 11 | # version: 0.19.20241220
 12 | #
 13 | # REGENDATA ("0.19.20241220",["github","cabal.project"])
 14 | #
 15 | name: Haskell-CI
 16 | on:
 17 |   push:
 18 |     branches:
 19 |       - master
 20 |   pull_request:
 21 |     branches:
 22 |       - master
 23 | jobs:
 24 |   linux:
 25 |     name: Haskell-CI - Linux - ${{ matrix.compiler }}
 26 |     runs-on: ubuntu-20.04
 27 |     timeout-minutes:
 28 |       60
 29 |     container:
 30 |       image: buildpack-deps:jammy
 31 |     continue-on-error: ${{ matrix.allow-failure }}
 32 |     strategy:
 33 |       matrix:
 34 |         include:
 35 |           - compiler: ghc-9.12.1
 36 |             compilerKind: ghc
 37 |             compilerVersion: 9.12.1
 38 |             setup-method: ghcup
 39 |             allow-failure: false
 40 |           - compiler: ghc-9.10.1
 41 |             compilerKind: ghc
 42 |             compilerVersion: 9.10.1
 43 |             setup-method: ghcup
 44 |             allow-failure: false
 45 |           - compiler: ghc-9.8.4
 46 |             compilerKind: ghc
 47 |             compilerVersion: 9.8.4
 48 |             setup-method: ghcup
 49 |             allow-failure: false
 50 |           - compiler: ghc-9.6.6
 51 |             compilerKind: ghc
 52 |             compilerVersion: 9.6.6
 53 |             setup-method: ghcup
 54 |             allow-failure: false
 55 |           - compiler: ghc-9.4.8
 56 |             compilerKind: ghc
 57 |             compilerVersion: 9.4.8
 58 |             setup-method: ghcup
 59 |             allow-failure: false
 60 |           - compiler: ghc-9.2.8
 61 |             compilerKind: ghc
 62 |             compilerVersion: 9.2.8
 63 |             setup-method: ghcup
 64 |             allow-failure: false
 65 |           - compiler: ghc-9.0.2
 66 |             compilerKind: ghc
 67 |             compilerVersion: 9.0.2
 68 |             setup-method: ghcup
 69 |             allow-failure: false
 70 |           - compiler: ghc-8.10.7
 71 |             compilerKind: ghc
 72 |             compilerVersion: 8.10.7
 73 |             setup-method: ghcup
 74 |             allow-failure: false
 75 |           - compiler: ghc-8.8.3
 76 |             compilerKind: ghc
 77 |             compilerVersion: 8.8.3
 78 |             setup-method: ghcup
 79 |             allow-failure: false
 80 |           - compiler: ghc-8.6.5
 81 |             compilerKind: ghc
 82 |             compilerVersion: 8.6.5
 83 |             setup-method: ghcup
 84 |             allow-failure: false
 85 |       fail-fast: false
 86 |     steps:
 87 |       - name: apt-get install
 88 |         run: |
 89 |           apt-get update
 90 |           apt-get install -y --no-install-recommends gnupg ca-certificates dirmngr curl git software-properties-common libtinfo5
 91 |       - name: Install GHCup
 92 |         run: |
 93 |           mkdir -p "$HOME/.ghcup/bin"
 94 |           curl -sL https://downloads.haskell.org/ghcup/0.1.30.0/x86_64-linux-ghcup-0.1.30.0 > "$HOME/.ghcup/bin/ghcup"
 95 |           chmod a+x "$HOME/.ghcup/bin/ghcup"
 96 |       - name: Install cabal-install (prerelease)
 97 |         run: |
 98 |           "$HOME/.ghcup/bin/ghcup" config add-release-channel https://raw.githubusercontent.com/haskell/ghcup-metadata/master/ghcup-prereleases-0.0.8.yaml;
 99 |           "$HOME/.ghcup/bin/ghcup" install cabal 3.15.0.0.2024.10.3 || (cat "$HOME"/.ghcup/logs/*.* && false)
100 |           echo "CABAL=$HOME/.ghcup/bin/cabal-3.15.0.0.2024.10.3 -vnormal+nowrap" >> "$GITHUB_ENV"
101 |       - name: Install GHC (GHCup)
102 |         if: matrix.setup-method == 'ghcup'
103 |         run: |
104 |           "$HOME/.ghcup/bin/ghcup" install ghc "$HCVER" || (cat "$HOME"/.ghcup/logs/*.* && false)
105 |           HC=$("$HOME/.ghcup/bin/ghcup" whereis ghc "$HCVER")
106 |           HCPKG=$(echo "$HC" | sed 's#ghc$#ghc-pkg#')
107 |           HADDOCK=$(echo "$HC" | sed 's#ghc$#haddock#')
108 |           echo "HC=$HC" >> "$GITHUB_ENV"
109 |           echo "HCPKG=$HCPKG" >> "$GITHUB_ENV"
110 |           echo "HADDOCK=$HADDOCK" >> "$GITHUB_ENV"
111 |         env:
112 |           HCKIND: ${{ matrix.compilerKind }}
113 |           HCNAME: ${{ matrix.compiler }}
114 |           HCVER: ${{ matrix.compilerVersion }}
115 |       - name: Set PATH and environment variables
116 |         run: |
117 |           echo "$HOME/.cabal/bin" >> $GITHUB_PATH
118 |           echo "LANG=C.UTF-8" >> "$GITHUB_ENV"
119 |           echo "CABAL_DIR=$HOME/.cabal" >> "$GITHUB_ENV"
120 |           echo "CABAL_CONFIG=$HOME/.cabal/config" >> "$GITHUB_ENV"
121 |           HCNUMVER=$(${HC} --numeric-version|perl -ne '/^(\d+)\.(\d+)\.(\d+)(\.(\d+))?$/; print(10000 * $1 + 100 * $2 + ($3 == 0 ? $5 != 1 : $3))')
122 |           echo "HCNUMVER=$HCNUMVER" >> "$GITHUB_ENV"
123 |           echo "ARG_TESTS=--enable-tests" >> "$GITHUB_ENV"
124 |           echo "ARG_BENCH=--enable-benchmarks" >> "$GITHUB_ENV"
125 |           echo "HEADHACKAGE=false" >> "$GITHUB_ENV"
126 |           echo "ARG_COMPILER=--$HCKIND --with-compiler=$HC" >> "$GITHUB_ENV"
127 |         env:
128 |           HCKIND: ${{ matrix.compilerKind }}
129 |           HCNAME: ${{ matrix.compiler }}
130 |           HCVER: ${{ matrix.compilerVersion }}
131 |       - name: env
132 |         run: |
133 |           env
134 |       - name: write cabal config
135 |         run: |
136 |           mkdir -p $CABAL_DIR
137 |           cat >> $CABAL_CONFIG <<EOF
138 |           remote-build-reporting: anonymous
139 |           write-ghc-environment-files: never
140 |           remote-repo-cache: $CABAL_DIR/packages
141 |           logs-dir:          $CABAL_DIR/logs
142 |           world-file:        $CABAL_DIR/world
143 |           extra-prog-path:   $CABAL_DIR/bin
144 |           symlink-bindir:    $CABAL_DIR/bin
145 |           installdir:        $CABAL_DIR/bin
146 |           build-summary:     $CABAL_DIR/logs/build.log
147 |           store-dir:         $CABAL_DIR/store
148 |           install-dirs user
149 |             prefix: $CABAL_DIR
150 |           repository hackage.haskell.org
151 |             url: http://hackage.haskell.org/
152 |           EOF
153 |           cat >> $CABAL_CONFIG <<EOF
154 |           program-default-options
155 |             ghc-options: $GHCJOBS +RTS -M3G -RTS
156 |           EOF
157 |           cat $CABAL_CONFIG
158 |       - name: versions
159 |         run: |
160 |           $HC --version || true
161 |           $HC --print-project-git-commit-id || true
162 |           $CABAL --version || true
163 |       - name: update cabal index
164 |         run: |
165 |           $CABAL v2-update -v
166 |       - name: install cabal-plan
167 |         run: |
168 |           mkdir -p $HOME/.cabal/bin
169 |           curl -sL https://github.com/haskell-hvr/cabal-plan/releases/download/v0.7.3.0/cabal-plan-0.7.3.0-x86_64-linux.xz > cabal-plan.xz
170 |           echo 'f62ccb2971567a5f638f2005ad3173dba14693a45154c1508645c52289714cb2  cabal-plan.xz' | sha256sum -c -
171 |           xz -d < cabal-plan.xz > $HOME/.cabal/bin/cabal-plan
172 |           rm -f cabal-plan.xz
173 |           chmod a+x $HOME/.cabal/bin/cabal-plan
174 |           cabal-plan --version
175 |       - name: checkout
176 |         uses: actions/checkout@v4
177 |         with:
178 |           path: source
179 |       - name: initial cabal.project for sdist
180 |         run: |
181 |           touch cabal.project
182 |           echo "packages: $GITHUB_WORKSPACE/source/." >> cabal.project
183 |           cat cabal.project
184 |       - name: sdist
185 |         run: |
186 |           mkdir -p sdist
187 |           $CABAL sdist all --output-dir $GITHUB_WORKSPACE/sdist
188 |       - name: unpack
189 |         run: |
190 |           mkdir -p unpacked
191 |           find sdist -maxdepth 1 -type f -name '*.tar.gz' -exec tar -C $GITHUB_WORKSPACE/unpacked -xzvf {} \;
192 |       - name: generate cabal.project
193 |         run: |
194 |           PKGDIR_text_short="$(find "$GITHUB_WORKSPACE/unpacked" -maxdepth 1 -type d -regex '.*/text-short-[0-9.]*')"
195 |           echo "PKGDIR_text_short=${PKGDIR_text_short}" >> "$GITHUB_ENV"
196 |           rm -f cabal.project cabal.project.local
197 |           touch cabal.project
198 |           touch cabal.project.local
199 |           echo "packages: ${PKGDIR_text_short}" >> cabal.project
200 |           echo "package text-short" >> cabal.project
201 |           echo "    ghc-options: -Werror=missing-methods" >> cabal.project
202 |           cat >> cabal.project <<EOF
203 |           EOF
204 |           $HCPKG list --simple-output --names-only | perl -ne 'for (split /\s+/) { print "constraints: any.$_ installed\n" unless /^(text-short)$/; }' >> cabal.project.local
205 |           cat cabal.project
206 |           cat cabal.project.local
207 |       - name: dump install plan
208 |         run: |
209 |           $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dry-run all
210 |           cabal-plan
211 |       - name: restore cache
212 |         uses: actions/cache/restore@v4
213 |         with:
214 |           key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }}
215 |           path: ~/.cabal/store
216 |           restore-keys: ${{ runner.os }}-${{ matrix.compiler }}-
217 |       - name: install dependencies
218 |         run: |
219 |           $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks --dependencies-only -j2 all
220 |           $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dependencies-only -j2 all
221 |       - name: build w/o tests
222 |         run: |
223 |           $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all
224 |       - name: build
225 |         run: |
226 |           $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --write-ghc-environment-files=always
227 |       - name: tests
228 |         run: |
229 |           $CABAL v2-test $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --test-show-details=direct
230 |       - name: cabal check
231 |         run: |
232 |           cd ${PKGDIR_text_short} || false
233 |           ${CABAL} -vnormal check
234 |       - name: haddock
235 |         run: |
236 |           $CABAL v2-haddock --disable-documentation --haddock-all $ARG_COMPILER --with-haddock $HADDOCK $ARG_TESTS $ARG_BENCH all
237 |       - name: unconstrained build
238 |         run: |
239 |           rm -f cabal.project.local
240 |           $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all
241 |       - name: prepare for constraint sets
242 |         run: |
243 |           rm -f cabal.project.local
244 |       - name: constraint set text-2
245 |         run: |
246 |           if [ $((HCNUMVER < 90800)) -ne 0 ] ; then $CABAL v2-build $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='text ^>=2' all --dry-run ; fi
247 |           if [ $((HCNUMVER < 90800)) -ne 0 ] ; then cabal-plan topo | sort ; fi
248 |           if [ $((HCNUMVER < 90800)) -ne 0 ] ; then $CABAL v2-build $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='text ^>=2' --dependencies-only -j2 all ; fi
249 |           if [ $((HCNUMVER < 90800)) -ne 0 ] ; then $CABAL v2-build $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='text ^>=2' all ; fi
250 |           if [ $((HCNUMVER < 90800)) -ne 0 ] ; then $CABAL v2-test $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='text ^>=2' all ; fi
251 |       - name: constraint set text-1
252 |         run: |
253 |           if [ $((HCNUMVER < 90400)) -ne 0 ] ; then $CABAL v2-build $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='text ^>=1.2.3.0' all --dry-run ; fi
254 |           if [ $((HCNUMVER < 90400)) -ne 0 ] ; then cabal-plan topo | sort ; fi
255 |           if [ $((HCNUMVER < 90400)) -ne 0 ] ; then $CABAL v2-build $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='text ^>=1.2.3.0' --dependencies-only -j2 all ; fi
256 |           if [ $((HCNUMVER < 90400)) -ne 0 ] ; then $CABAL v2-build $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='text ^>=1.2.3.0' all ; fi
257 |           if [ $((HCNUMVER < 90400)) -ne 0 ] ; then $CABAL v2-test $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='text ^>=1.2.3.0' all ; fi
258 |       - name: constraint set bytestring-0.11
259 |         run: |
260 |           if [ $((HCNUMVER < 90800)) -ne 0 ] ; then $CABAL v2-build $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='bytestring ==0.11.*' all --dry-run ; fi
261 |           if [ $((HCNUMVER < 90800)) -ne 0 ] ; then cabal-plan topo | sort ; fi
262 |           if [ $((HCNUMVER < 90800)) -ne 0 ] ; then $CABAL v2-build $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='bytestring ==0.11.*' --dependencies-only -j2 all ; fi
263 |           if [ $((HCNUMVER < 90800)) -ne 0 ] ; then $CABAL v2-build $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='bytestring ==0.11.*' all ; fi
264 |           if [ $((HCNUMVER < 90800)) -ne 0 ] ; then $CABAL v2-test $ARG_COMPILER --enable-tests --disable-benchmarks --constraint='bytestring ==0.11.*' all ; fi
265 |       - name: save cache
266 |         if: always()
267 |         uses: actions/cache/save@v4
268 |         with:
269 |           key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }}
270 |           path: ~/.cabal/store
271 | 


--------------------------------------------------------------------------------
/src-test/Tests.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE CPP               #-}
  2 | {-# LANGUAGE OverloadedLists   #-}
  3 | {-# LANGUAGE OverloadedStrings #-}
  4 | {-# LANGUAGE TemplateHaskell   #-}
  5 | {-# OPTIONS_GHC -fno-warn-orphans #-}
  6 | 
  7 | #ifndef MIN_VERSION_GLASGOW_HASKELL
  8 | #define MIN_VERSION_GLASGOW_HASKELL(x,y,z,w) ((x*100 + y) >= __GLASGOW_HASKELL__)
  9 | #endif
 10 | 
 11 | module Main(main) where
 12 | 
 13 | import           Data.Binary
 14 | import           Data.Char
 15 | import           Data.Maybe
 16 | import           Data.Monoid
 17 | import qualified Data.String               as D.S
 18 | import qualified Data.ByteString           as BS
 19 | import qualified Data.Text                 as T
 20 | import qualified Data.Text.Encoding        as T
 21 | import qualified Data.Text.Short           as IUT
 22 | import qualified Data.Text.Short.Partial   as IUT
 23 | import           Test.Tasty
 24 | import           Test.Tasty.HUnit
 25 | import           Test.Tasty.QuickCheck     as QC
 26 | import           Text.Show.Functions       ()
 27 | 
 28 | fromByteStringRef = either (const Nothing) (Just . IUT.fromText) . T.decodeUtf8'
 29 | 
 30 | main :: IO ()
 31 | main = defaultMain (adjustOption (QuickCheckTests 50000 `max`) $ tests)
 32 | 
 33 | tests :: TestTree
 34 | tests = testGroup "Tests" [unitTests,qcProps]
 35 | 
 36 | -- ShortText w/ in-bounds index
 37 | data STI = STI IUT.ShortText Int
 38 |          deriving (Eq,Show)
 39 | 
 40 | newtype ST = ST IUT.ShortText
 41 |            deriving (Eq,Show)
 42 | 
 43 | instance Arbitrary STI where
 44 |   arbitrary = do
 45 |     t <- arbitrary
 46 |     i <- choose (0, T.length t - 1)
 47 |     return $! STI (IUT.fromText t) i
 48 | 
 49 | instance Arbitrary ST where
 50 |   arbitrary = fmap (ST . IUT.fromText) arbitrary
 51 |   shrink (ST st) = map (ST . IUT.fromText) (shrink (IUT.toText st))
 52 | 
 53 | qcProps :: TestTree
 54 | qcProps = testGroup "Properties"
 55 |   [ QC.testProperty "length/fromText"   $ \t -> IUT.length (IUT.fromText t) == T.length t
 56 |   , QC.testProperty "length/fromString" $ \s -> IUT.length (IUT.fromString s) == length s
 57 |   , QC.testProperty "length/append"     $ \(ST t1) (ST t2) -> IUT.length t1 + IUT.length t2 == IUT.length (IUT.append t1 t2)
 58 |   , QC.testProperty "compare" $ \t1 t2 -> IUT.fromText t1 `compare` IUT.fromText t2  == t1 `compare` t2
 59 |   , QC.testProperty "(==)" $ \t1 t2 -> (IUT.fromText t1 == IUT.fromText t2)  == (t1 == t2)
 60 |   , QC.testProperty "(!?)" $ \t ->
 61 |       let t' = IUT.fromText t
 62 |       in and ([ mapMaybe (t' IUT.!?) ([0 .. T.length t -1 ] :: [Int]) == T.unpack t
 63 |               , mapMaybe (t' IUT.!?) [-5 .. -1] == []
 64 |               , mapMaybe (t' IUT.!?) [T.length t .. T.length t + 5] == []
 65 |               ] :: [Bool])
 66 |   , QC.testProperty "indexEndMaybe" $ \t ->
 67 |       let t' = IUT.fromText t
 68 |       in and ([ mapMaybe (IUT.indexEndMaybe t') [0 .. T.length t -1 ] == T.unpack (T.reverse t)
 69 |               , mapMaybe (IUT.indexEndMaybe t') [-5 .. -1] == []
 70 |               , mapMaybe (IUT.indexEndMaybe t') [T.length t .. T.length t + 5] == []
 71 |               ] :: [Bool])
 72 |   , QC.testProperty "toText.fromText"   $ \t -> (IUT.toText . IUT.fromText) t == t
 73 |   , QC.testProperty "fromByteString"    $ \b -> IUT.fromByteString b == fromByteStringRef b
 74 |   , QC.testProperty "fromByteString.toByteString" $ \t -> let ts = IUT.fromText t in (IUT.fromByteString . IUT.toByteString) ts == Just ts
 75 |   , QC.testProperty "toString.fromString" $ \s -> (IUT.toString . IUT.fromString) s == s
 76 |   , QC.testProperty "isAscii"  $ \s -> IUT.isAscii (IUT.fromString s) == all isAscii s
 77 |   , QC.testProperty "isAscii2" $ \t -> IUT.isAscii (IUT.fromText t)   == T.all isAscii t
 78 |   , QC.testProperty "splitAt" $ \t ->
 79 |       let t' = IUT.fromText t
 80 |           mapBoth f (x,y) = (f x, f y)
 81 |       in and [ mapBoth IUT.toText (IUT.splitAt i t') == T.splitAt i t | i <- [-5 .. 5+T.length t ] ]
 82 |   , QC.testProperty "intercalate/split" $ \t c ->
 83 |       let t' = IUT.fromText t
 84 |       in IUT.intercalate (IUT.singleton c) (IUT.split (== c) t') == t'
 85 | 
 86 |   , QC.testProperty "intersperse" $ \t c -> IUT.intersperse c (IUT.fromText t) == IUT.fromText (T.intersperse c t)
 87 |   , QC.testProperty "intercalate" $ \t1 t2 -> IUT.intercalate (IUT.fromText t1) (map IUT.fromText t2) == IUT.fromText (T.intercalate t1 t2)
 88 |   , QC.testProperty "reverse.singleton" $ \c -> IUT.reverse (IUT.singleton c) == IUT.singleton c
 89 |   , QC.testProperty "reverse"     $ \t -> IUT.reverse (IUT.fromText t) == IUT.fromText (T.reverse t)
 90 |   , QC.testProperty "filter"      $ \p t -> IUT.filter p (IUT.fromText t) == IUT.fromText (T.filter p t)
 91 |   , QC.testProperty "replicate"   $ \n t -> IUT.replicate n (IUT.fromText t) == IUT.fromText (T.replicate n t)
 92 |   , QC.testProperty "dropAround"  $ \p t -> IUT.dropAround p (IUT.fromText t) == IUT.fromText (T.dropAround p t)
 93 | 
 94 |   , QC.testProperty "foldl"       $ \f z t -> IUT.foldl f (z :: Char) (IUT.fromText t) == T.foldl f (z :: Char) t
 95 |   , QC.testProperty "foldl #2"    $ \t -> IUT.foldl (\n _ -> (n+1)) 0 (IUT.fromText t) == T.length t
 96 |   , QC.testProperty "foldl #3"    $ \t -> IUT.foldl (\s c -> c : s) [] (IUT.fromText t) == T.unpack (T.reverse t)
 97 | 
 98 |   , QC.testProperty "foldl'"      $ \f z t -> IUT.foldl' f (z :: Char) (IUT.fromText t) == T.foldl' f (z :: Char) t
 99 |   , QC.testProperty "foldl' #2"   $ \t -> IUT.foldl' (\n _ -> (n+1)) 0 (IUT.fromText t) == T.length t
100 |   , QC.testProperty "foldl' #3"   $ \t -> IUT.foldl' (\s c -> c : s) [] (IUT.fromText t) == T.unpack (T.reverse t)
101 | 
102 |   , QC.testProperty "foldr"       $ \f z t -> IUT.foldr f (z :: Char) (IUT.fromText t) == T.foldr f (z :: Char) t
103 |   , QC.testProperty "foldr #2"    $ \t -> IUT.foldr (\_ n -> (n+1)) 0 (IUT.fromText t) == T.length t
104 |   , QC.testProperty "foldr #3"    $ \t -> IUT.foldr (:) [] (IUT.fromText t) == T.unpack t
105 | 
106 |   , QC.testProperty "foldr1"      $ \f t -> (not (T.null t)) ==> IUT.foldr1 f (IUT.fromText t) == T.foldr1 f t
107 |   , QC.testProperty "foldl1"      $ \f t -> (not (T.null t)) ==> IUT.foldl1 f (IUT.fromText t) == T.foldl1 f t
108 |   , QC.testProperty "foldl1'"     $ \f t -> (not (T.null t)) ==> IUT.foldl1' f (IUT.fromText t) == T.foldl1' f t
109 | 
110 |   , QC.testProperty "splitAtEnd" $ \t ->
111 |       let t' = IUT.fromText t
112 |           n' = IUT.length t'
113 |       in and [ (IUT.splitAt (n'-i) t') == IUT.splitAtEnd i t' | i <- [-5 .. 5+n' ] ]
114 | 
115 |   , QC.testProperty "find" $ \t -> IUT.find Data.Char.isAscii (IUT.fromText t) == T.find Data.Char.isAscii t
116 |   , QC.testProperty "findIndex" $ \t -> IUT.findIndex Data.Char.isAscii (IUT.fromText t) == T.findIndex Data.Char.isAscii t
117 | 
118 |   , QC.testProperty "isSuffixOf" $ \t1 t2 -> IUT.fromText t1 `IUT.isSuffixOf` IUT.fromText t2  == t1 `T.isSuffixOf` t2
119 |   , QC.testProperty "isPrefixOf" $ \t1 t2 -> IUT.fromText t1 `IUT.isPrefixOf` IUT.fromText t2  == t1 `T.isPrefixOf` t2
120 | 
121 |   , QC.testProperty "stripPrefix" $ \t1 t2 -> IUT.stripPrefix (IUT.fromText t1) (IUT.fromText t2) ==
122 |                                                 fmap IUT.fromText (T.stripPrefix t1 t2)
123 | 
124 |   , QC.testProperty "stripSuffix" $ \t1 t2 -> IUT.stripSuffix (IUT.fromText t1) (IUT.fromText t2) ==
125 |                                                 fmap IUT.fromText (T.stripSuffix t1 t2)
126 | 
127 |   , QC.testProperty "stripPrefix 2" $ \(STI t i) ->
128 |       let (pfx,sfx) = IUT.splitAt i t
129 |       in IUT.stripPrefix pfx t == Just sfx
130 | 
131 |   , QC.testProperty "stripSuffix 2" $ \(STI t i) ->
132 |       let (pfx,sfx) = IUT.splitAt i t
133 |       in IUT.stripSuffix sfx t == Just pfx
134 | 
135 |   , QC.testProperty "cons" $ \c t -> IUT.singleton c <> IUT.fromText t == IUT.cons c (IUT.fromText t)
136 |   , QC.testProperty "snoc" $ \c t -> IUT.fromText t <> IUT.singleton c == IUT.snoc (IUT.fromText t) c
137 | 
138 |   , QC.testProperty "uncons" $ \c t -> IUT.uncons (IUT.singleton c <> IUT.fromText t) == Just (c, IUT.fromText t)
139 | 
140 |   , QC.testProperty "unsnoc" $ \c t -> IUT.unsnoc (IUT.fromText t <> IUT.singleton c) == Just (IUT.fromText t, c)
141 | 
142 |   , QC.testProperty "break" $ \t -> let (l,r)   = IUT.break Data.Char.isAscii (IUT.fromText t)
143 |                                     in  T.break Data.Char.isAscii t == (IUT.toText l,IUT.toText r)
144 | 
145 |   , QC.testProperty "span"  $ \t -> let (l,r)   = IUT.span Data.Char.isAscii (IUT.fromText t)
146 |                                     in  T.span Data.Char.isAscii t == (IUT.toText l,IUT.toText r)
147 | 
148 |   , QC.testProperty "breakEnd" $ \t -> let (l,r)   = IUT.breakEnd Data.Char.isAscii (IUT.fromText t)
149 |                                        in  t_breakEnd Data.Char.isAscii t == (IUT.toText l,IUT.toText r)
150 | 
151 |   , QC.testProperty "spanEnd"  $ \t -> let (l,r)   = IUT.spanEnd Data.Char.isAscii (IUT.fromText t)
152 |                                        in  t_spanEnd Data.Char.isAscii t == (IUT.toText l,IUT.toText r)
153 | 
154 |   , QC.testProperty "splitAt/isPrefixOf" $ \t ->
155 |       let t' = IUT.fromText t
156 |       in and [ IUT.isPrefixOf (fst (IUT.splitAt i t')) t' | i <- [-5 .. 5+T.length t ] ]
157 |   , QC.testProperty "splitAt/isSuffixOf" $ \t ->
158 |       let t' = IUT.fromText t
159 |       in and [ IUT.isSuffixOf (snd (IUT.splitAt i t')) t' | i <- [-5 .. 5+T.length t ] ]
160 |   ]
161 | 
162 | t_breakEnd p t = t_spanEnd (not . p) t
163 | t_spanEnd  p t = (T.dropWhileEnd p t, T.takeWhileEnd p t)
164 | 
165 | unitTests = testGroup "Unit-tests"
166 |   [ testCase "fromText mempty" $ IUT.fromText mempty @?= mempty
167 |   , testCase "fromShortByteString [0xc0,0x80]" $ IUT.fromShortByteString "\xc0\x80" @?= Nothing
168 |   , testCase "fromByteString [0xc0,0x80]" $ IUT.fromByteString "\xc0\x80" @?= Nothing
169 |   , testCase "fromByteString [0xf0,0x90,0x80,0x80]" $ IUT.fromByteString "\xf0\x90\x80\x80" @?= Just "\x10000"
170 |   , testCase "fromByteString [0xf4,0x90,0x80,0x80]" $ IUT.fromByteString "\244\144\128\128" @?= Nothing
171 |   , testCase "IsString U+D800" $ "\xFFFD" @?= (IUT.fromString "\xD800")
172 | --  , testCase "IsString U+D800" $ (IUT.fromString "\xD800") @?= IUT.fromText ("\xD800" :: T.Text)
173 | 
174 | #if !(MIN_VERSION_bytestring(0,11,0) && MIN_VERSION_GLASGOW_HASKELL(9,0,1,0) && !MIN_VERSION_GLASGOW_HASKELL(9,0,2,0))
175 |   -- https://gitlab.haskell.org/ghc/ghc/-/issues/19976
176 |   , testCase "Binary.encode" $ encode ("Hello \8364 & \171581!\NUL" :: IUT.ShortText) @?= "\NUL\NUL\NUL\NUL\NUL\NUL\NUL\DC2Hello \226\130\172 & \240\169\184\189!\NUL"
177 |   , testCase "Binary.decode" $ decode ("\NUL\NUL\NUL\NUL\NUL\NUL\NUL\DC2Hello \226\130\172 & \240\169\184\189!\NUL") @?= ("Hello \8364 & \171581!\NUL" :: IUT.ShortText)
178 | #endif
179 |   , testCase "singleton" $ [ c | c <- [minBound..maxBound], IUT.singleton c /= IUT.fromText (T.singleton c) ] @?= []
180 | 
181 |   , testCase "splitAtEnd" $ IUT.splitAtEnd 1 "€€" @?= ("€","€")
182 |   , testCase "split#1" $ IUT.split (== 'a') "aabbaca" @?= ["", "", "bb", "c", ""]
183 |   , testCase "split#2" $ IUT.split (const False) "aabbaca" @?= ["aabbaca"]
184 |   , testCase "split#3" $ IUT.split (const True) "abc" @?= ["","","",""]
185 |   , testCase "split#4" $ IUT.split (const True) "" @?= [""]
186 | 
187 |   , testCase "literal0" $ IUT.unpack testLit0 @?= []
188 |   , testCase "literal1" $ IUT.unpack testLit1 @?= ['€','\0','€','\0']
189 |   , testCase "literal2" $ IUT.unpack testLit2 @?= ['\xFFFD','\xD7FF','\xFFFD','\xE000']
190 |   , testCase "literal3" $ IUT.unpack testLit3 @?= ['\1'..'\x7f']
191 |   , testCase "literal4" $ IUT.unpack testLit4 @?= map toEnum [0,1,126,127,128,129,130,256,2046,2047,2048,2049,2050,65530,65531,65532,65533,65534,65533,65535,65536,65537,65538,1114110,1114111]
192 |   , testCase "literal5" $ IUT.unpack testLit5 @?= map toEnum [28961]
193 |   , testCase "literal6" $ IUT.unpack testLit6 @?= map toEnum [0]
194 |   , testCase "literal7" $ IUT.unpack testLit7 @?= map toEnum [66328]
195 |   , testCase "literal8" $ IUT.unpack testLit8 @?= map toEnum [127]
196 | 
197 |     -- list literals
198 |   , testCase "literal9"  $ [] @?= ("" :: IUT.ShortText)
199 |   , testCase "literal10" $ ['¤','€','$'] @?= ("¤€$" :: IUT.ShortText)
200 |   , testCase "literal12" $ IUT.unpack ['\xD800','\xD7FF','\xDFFF','\xE000'] @?= ['\xFFFD','\xD7FF','\xFFFD','\xE000']
201 | 
202 |     -- template haskell
203 |   , testCase "TH.Lift" $ do
204 |       testLit1 @?= $([| testLit1 |])
205 |       testLit2 @?= $([| testLit2 |])
206 |       testLit3 @?= $([| testLit3 |])
207 |       testLit4 @?= $([| testLit4 |])
208 |       testLit5 @?= $([| testLit5 |])
209 |       testLit6 @?= $([| testLit6 |])
210 |       testLit7 @?= $([| testLit7 |])
211 |       testLit8 @?= $([| testLit8 |])
212 | 
213 |   , testCase "TTH.Lift" $ do
214 |       testLit1 @?= $$([|| testLit1 ||])
215 |       testLit2 @?= $$([|| testLit2 ||])
216 |       testLit3 @?= $$([|| testLit3 ||])
217 |       testLit4 @?= $$([|| testLit4 ||])
218 |       testLit5 @?= $$([|| testLit5 ||])
219 |       testLit6 @?= $$([|| testLit6 ||])
220 |       testLit7 @?= $$([|| testLit7 ||])
221 |       testLit8 @?= $$([|| testLit8 ||])
222 |  ]
223 | 
224 | -- isScalar :: Char -> Bool
225 | -- isScalar c = c < '\xD800' || c >= '\xE000'
226 | 
227 | 
228 | {-# NOINLINE testLit0 #-}
229 | testLit0 :: IUT.ShortText
230 | testLit0 = ""
231 | 
232 | {-# NOINLINE testLit1 #-}
233 | testLit1 :: IUT.ShortText
234 | testLit1 = "€\NUL€\NUL"
235 | 
236 | {-# NOINLINE testLit2 #-}
237 | testLit2 :: IUT.ShortText
238 | testLit2 = "\xD800\xD7FF\xDFFF\xE000"
239 | 
240 | {-# NOINLINE testLit3 #-}
241 | testLit3 :: IUT.ShortText
242 | testLit3 = "\SOH\STX\ETX\EOT\ENQ\ACK\a\b\t\n\v\f\r\SO\SI\DLE\DC1\DC2\DC3\DC4\NAK\SYN\ETB\CAN\EM\SUB\ESC\FS\GS\RS\US !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\DEL"
243 | 
244 | {-# NOINLINE testLit4 #-}
245 | testLit4 :: IUT.ShortText
246 | testLit4 = "\NUL\SOH~\DEL\128\129\130\256\2046\2047\2048\2049\2050\65530\65531\65532\65533\65534\65533\65535\65536\65537\65538\1114110\1114111"
247 | 
248 | {-# NOINLINE testLit5 #-}
249 | testLit5 :: IUT.ShortText
250 | testLit5 = "無"
251 | 
252 | {-# NOINLINE testLit6 #-}
253 | testLit6 :: IUT.ShortText
254 | testLit6 = "\NUL"
255 | 
256 | {-# NOINLINE testLit7 #-}
257 | testLit7 :: IUT.ShortText
258 | testLit7 = "𐌘"
259 | 
260 | {-# NOINLINE testLit8 #-}
261 | testLit8 :: IUT.ShortText
262 | testLit8 = "\x7f"
263 | 
264 | -------------------------------------------------------------------------------
265 | -- orphans
266 | -------------------------------------------------------------------------------
267 | 
268 | -- orphan instances to not depend on quickcheck-instances
269 | -- which would cause cycles
270 | 
271 | instance Arbitrary BS.ByteString where
272 |     arbitrary = BS.pack `fmap` arbitrary
273 |     shrink xs = BS.pack `fmap` shrink (BS.unpack xs)
274 | 
275 | instance Arbitrary T.Text where
276 |     arbitrary = T.pack `fmap` arbitrary
277 |     shrink xs = T.pack `fmap` shrink (T.unpack xs)
278 | 


--------------------------------------------------------------------------------
/cbits/cbits.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright (c) 2017, Herbert Valerio Riedel
  3 |  *
  4 |  * All rights reserved.
  5 |  *
  6 |  * Redistribution and use in source and binary forms, with or without
  7 |  * modification, are permitted provided that the following conditions are met:
  8 |  *
  9 |  *     * Redistributions of source code must retain the above copyright
 10 |  *       notice, this list of conditions and the following disclaimer.
 11 |  *
 12 |  *     * Redistributions in binary form must reproduce the above
 13 |  *       copyright notice, this list of conditions and the following
 14 |  *       disclaimer in the documentation and/or other materials provided
 15 |  *       with the distribution.
 16 |  *
 17 |  *     * Neither the name of Herbert Valerio Riedel nor the names of other
 18 |  *       contributors may be used to endorse or promote products derived
 19 |  *       from this software without specific prior written permission.
 20 |  *
 21 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 22 |  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 23 |  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 24 |  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 25 |  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 26 |  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 27 |  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 28 |  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 29 |  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 30 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 31 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 32 |  */
 33 | 
 34 | #if !defined(NDEBUG)
 35 | # warning assert(3) checks enabled
 36 | #endif
 37 | 
 38 | #include <stdint.h>
 39 | #include <stdbool.h>
 40 | #include <string.h>
 41 | #include <stdlib.h>
 42 | #include <assert.h>
 43 | #include <HsFFI.h>
 44 | 
 45 | #if !defined(SIZEOF_VOID_P)
 46 | # error <HsFFI.h> SIZEOF_VOID_P not defined
 47 | #endif
 48 | 
 49 | #if (SIZEOF_VOID_P) == 8
 50 | const static bool is_64bit = true;
 51 | #elif (SIZEOF_VOID_P) == 4
 52 | const static bool is_64bit = false;
 53 | #else
 54 | # error unexpected SIZEOF_VOID_P value
 55 | #endif
 56 | 
 57 | #if (WORDS_BIGENDIAN)
 58 | const static bool is_bigendian = true;
 59 | #else
 60 | const static bool is_bigendian = false;
 61 | #endif
 62 | 
 63 | #if defined(__GNUC__)
 64 | # define likely(x)     __builtin_expect(!!(x),1)
 65 | # define unlikely(x)   __builtin_expect(!!(x),0)
 66 | #else
 67 | # define likely(x)     (x)
 68 | # define unlikely(x)   (x)
 69 | #endif
 70 | 
 71 | /* test whether octet in UTF-8 steam is not a continuation byte, i.e. a leading byte */
 72 | #define utf8_lead_p(octet) (((octet) & 0xc0) != 0x80)
 73 | 
 74 | /* 0 <= x <= 0x110000 */
 75 | typedef HsWord codepoint_t;
 76 | 
 77 | /* Count number of code-points in well-formed utf8 string */
 78 | size_t
 79 | hs_text_short_length(const uint8_t buf[], const size_t n)
 80 | {
 81 |   size_t j = 0;
 82 |   size_t l = 0;
 83 | 
 84 |   /* Both GCC & Clang are able to optimise the code below quite well at -O3 */
 85 |   for (j = 0; j < n; j++)
 86 |     if (utf8_lead_p(buf[j]))
 87 |       l++;
 88 | 
 89 |   return l;
 90 | }
 91 | 
 92 | /* Locate offset of j-th code-point in well-formed utf8 string
 93 |  *
 94 |  */
 95 | size_t
 96 | hs_text_short_index_ofs(const uint8_t buf[], const size_t n, const size_t i)
 97 | {
 98 |   if (!n)
 99 |     return n;
100 | 
101 |   size_t m = 0;
102 |   size_t j = 0;
103 | 
104 |   for (;;) {
105 |     assert(m >= 0);
106 |     assert(j <= i);
107 |     assert(j <= m);
108 | 
109 |     if (j == i)
110 |       return m; /* found */
111 | 
112 |     if (i-j >= n-m)
113 |       return n; /* i-th char is >= buf+n */
114 | 
115 |     assert(m < n);
116 |     const uint8_t b0 = buf[m];
117 | 
118 |     if (!(b0 & 0x80))
119 |       m += 1;   /* 0_______ */
120 |     else
121 |       switch(b0 >> 4) {
122 |       case 0xf: /* 11110___ */
123 |         m += 4;
124 |         break;
125 |       case 0xe: /* 1110____ */
126 |         m += 3;
127 |         break;
128 |       default:  /* 110_____ */
129 |         m += 2;
130 |         break;
131 |       }
132 | 
133 |     j += 1;
134 |   }
135 | 
136 |   assert(0);
137 | }
138 | 
139 | /* Locate offset of j-th code-point (in reverse direction) in
140 |  * well-formed utf8 string starting at end of buffer.
141 |  *
142 |  * The 0-th character from the end is the last character in the utf8
143 |  * string (if it exists).
144 |  *
145 |  * Returns original 'n' if out of bounds.
146 |  *
147 |  */
148 | size_t
149 | hs_text_short_index_ofs_rev(const uint8_t buf[], const size_t n, const size_t i)
150 | {
151 |   size_t m = n;
152 |   size_t j = i;
153 | 
154 |   for (;;) {
155 |     assert(m <= n);
156 |     assert(j >= 0);
157 | 
158 |     if (j >= m)
159 |       return n; /* i-th char is < buf */
160 | 
161 |     /* if (m == i-j) /\* suffix is made up only of ASCII chars, so we can shortcut *\/ */
162 |     /*   return 0; */
163 | 
164 |     /* scan until octet does not match 10_ */
165 |     assert(m > 0);
166 |     if (!(buf[--m] & 0x80))
167 |       goto l_cont;
168 | 
169 |     assert(m > 0);
170 |     if (utf8_lead_p(buf[--m])) {
171 |       assert ((buf[m] & 0xe0) == 0xc0); /* 110_ */
172 |       goto l_cont;
173 |     }
174 | 
175 |     assert(m > 0);
176 |     if (utf8_lead_p(buf[--m])) {
177 |       assert ((buf[m] & 0xf0) == 0xe0); /* 1110_ */
178 |       goto l_cont;
179 |     }
180 | 
181 |     /* this must be a non-10_ octet in a well-formed stream */
182 |     assert(m > 0);
183 |     m -= 1;
184 | 
185 |     assert ((buf[m] & 0xf8) == 0xf0); /* 11110_ */
186 | 
187 |   l_cont:
188 |     assert(utf8_lead_p(buf[m]));
189 | 
190 |     if (!j)
191 |       return m; /* found */
192 | 
193 |     j -= 1;
194 |   }
195 | 
196 |   assert(0);
197 | }
198 | 
199 | /* Decode UTF8 code units into code-point
200 |  * Assumes buf[] points to start of a valid UTF8-encoded code-point
201 |  */
202 | static inline uint32_t
203 | hs_text_short_decode_cp(const uint8_t buf[])
204 | {
205 |   /*  7 bits | 0xxxxxxx
206 |    * 11 bits | 110yyyyx  10xxxxxx
207 |    * 16 bits | 1110yyyy  10yxxxxx  10xxxxxx
208 |    * 21 bits | 11110yyy  10yyxxxx  10xxxxxx  10xxxxxx
209 |    */
210 | 
211 |   const uint8_t b0 = buf[0];
212 | 
213 |   if (!(b0 & 0x80))
214 |     return b0;
215 | 
216 |   uint32_t cp = 0;
217 | 
218 |   switch(b0 >> 4) {
219 |   case 0xf: /* 11110___ */
220 |     assert((b0 & 0xf8) == 0xf0);
221 |     assert(!utf8_lead_p(buf[1]));
222 |     assert(!utf8_lead_p(buf[2]));
223 |     assert(!utf8_lead_p(buf[3]));
224 |     cp  = ((uint32_t)(b0     & 0x07)) << (6+6+6);
225 |     cp |= ((uint32_t)(buf[1] & 0x3f)) << (6+6);
226 |     cp |= ((uint32_t)(buf[2] & 0x3f)) << 6;
227 |     cp |=             buf[3] & 0x3f;
228 |     assert (cp > 0xffff); assert (cp < 0x110000);
229 |     return cp;
230 | 
231 |   case 0xe: /* 1110____ */
232 |     assert(!utf8_lead_p(buf[1]));
233 |     assert(!utf8_lead_p(buf[2]));
234 |     cp  = ((uint32_t)(b0     & 0x0f)) << (6+6);
235 |     cp |= ((uint32_t)(buf[1] & 0x3f)) << 6;
236 |     cp |=             buf[2] & 0x3f;
237 |     assert (cp > 0x7ff); assert (cp < 0x10000);
238 |     assert (cp < 0xd800 || cp > 0xdfff);
239 |     return cp;
240 | 
241 |   default:  /* 110_____ */
242 |     assert((b0 & 0xe0) == 0xc0);
243 |     assert(!utf8_lead_p(buf[1]));
244 |     cp  = ((uint32_t)(b0     & 0x1f)) << 6;
245 |     cp |=             buf[1] & 0x3f;
246 |     assert (cp > 0x7f); assert (cp < 0x800);
247 |     return cp;
248 |   }
249 | }
250 | 
251 | /* decode codepoint starting at buf[ofs] */
252 | codepoint_t
253 | hs_text_short_ofs_cp(const uint8_t buf[], const size_t ofs)
254 | {
255 |   return hs_text_short_decode_cp(buf+ofs);
256 | }
257 | 
258 | /* reverse-decode codepoint starting at offset right after a code-point */
259 | codepoint_t
260 | hs_text_short_ofs_cp_rev(const uint8_t *buf, const size_t ofs)
261 | {
262 |   /*  7 bits | 0xxxxxxx
263 |    * 11 bits | 110yyyyx  10xxxxxx
264 |    * 16 bits | 1110yyyy  10yxxxxx  10xxxxxx
265 |    * 21 bits | 11110yyy  10yyxxxx  10xxxxxx  10xxxxxx
266 |    */
267 | 
268 |   buf = buf + ofs - 1;
269 | 
270 |   /* this octet is either 10_ or 0_ */
271 |   uint32_t cp = *buf;
272 | 
273 |   if (!(cp & 0x80))
274 |     return cp;
275 | 
276 |   assert (!utf8_lead_p(cp));
277 |   cp &= 0x3f;
278 | 
279 |   /* this octet is either 10_ or 110_ */
280 |   {
281 |     const uint8_t b = *(--buf);
282 |     assert (!utf8_lead_p(b) || ((b & 0xe0) == 0xc0));
283 | 
284 |     cp |=  (b & 0x3f) << 6;
285 | 
286 |     if (b & 0x40) {
287 |       assert (cp > 0x7f); assert (cp < 0x800);
288 |       return cp;
289 |     }
290 |   }
291 | 
292 |   /* this octet is either 10_ or 1110_ */
293 |   {
294 |     const uint8_t b = *(--buf);
295 |     assert (!utf8_lead_p(b) || ((b & 0xf0) == 0xe0));
296 | 
297 |     if (b & 0x40) {
298 |       cp |= (b & 0xf) << 12;
299 | 
300 |       assert (cp > 0x7ff); assert (cp < 0x10000);
301 |       assert (cp < 0xd800 || cp > 0xdfff);
302 |       return cp;
303 |     }
304 | 
305 |     cp |= (b & 0x3f) << 12;
306 |   }
307 | 
308 |   /* this octet must be 11110_ */
309 |   const uint8_t b = *(buf-1);
310 |   assert ((b & 0xf8) == 0xf0);
311 | 
312 |   cp |= (b & 0x7) << 18;
313 | 
314 |   assert (cp > 0xffff); assert (cp < 0x110000);
315 |   return cp;
316 | }
317 | 
318 | /* Retrieve i-th code-point in (valid) UTF8 stream
319 |  *
320 |  * Returns -1 if out of bounds
321 |  */
322 | codepoint_t
323 | hs_text_short_index_cp(const uint8_t buf[], const size_t n, const size_t i)
324 | {
325 |   const size_t ofs = hs_text_short_index_ofs(buf, n, i);
326 | 
327 |   if (ofs >= n)
328 |     return -1;
329 | 
330 |   return hs_text_short_decode_cp(&buf[ofs]);
331 | }
332 | 
333 | /* Retrieve i-th code-point in (valid) UTF8 stream
334 |  *
335 |  * Returns -1 if out of bounds
336 |  */
337 | codepoint_t
338 | hs_text_short_index_cp_rev(const uint8_t buf[], const size_t n, const size_t i)
339 | {
340 |   const size_t ofs = hs_text_short_index_ofs_rev(buf, n, i);
341 | 
342 |   if (ofs >= n)
343 |     return -1;
344 | 
345 |   return hs_text_short_decode_cp(&buf[ofs]);
346 | }
347 | 
348 | /* Validate UTF8 encoding
349 | 
350 |  7 bits | 0xxxxxxx
351 | 11 bits | 110yyyyx  10xxxxxx
352 | 16 bits | 1110yyyy  10yxxxxx  10xxxxxx
353 | 21 bits | 11110yyy  10yyxxxx  10xxxxxx  10xxxxxx
354 | 
355 | Valid code-points:
356 | 
357 |  [U+0000 .. U+D7FF] + [U+E000 .. U+10FFFF]
358 | 
359 | Return values:
360 | 
361 |   0 -> ok
362 | 
363 |   1 -> invalid byte/code-point
364 | 
365 |  -1 -> truncated (1 byte missing)
366 |  -2 -> truncated (2 byte missing)
367 |  -3 -> truncated (3 byte missing)
368 | 
369 | */
370 | 
371 | int
372 | hs_text_short_is_valid_utf8(const uint8_t buf[], const size_t n)
373 | {
374 |   size_t j = 0;
375 | 
376 |   while (j < n) {
377 |     const uint8_t b0 = buf[j++];
378 | 
379 |     if (!(b0 & 0x80))
380 |       continue; /* b0 elem [ 0x00 .. 0x7f ] */
381 | 
382 |     if ((b0 & 0xe0) == 0xc0) { /* [ 0xc0 .. 0xdf ] */
383 |       if (!(b0 & 0x1e)) return 1; /* 0xc0 or 0xc1; denorm */
384 |       if (j >= n) return -1;
385 | 
386 |       goto l_trail1; /* b1 */
387 |     }
388 | 
389 |     if ((b0 & 0xf0) == 0xe0) { /* [ 0xe0 .. 0xef ] */
390 |       if ((j+1) >= n) return (n-(j+2));
391 | 
392 |       const uint8_t b1 = buf[j++];
393 |       if (utf8_lead_p(b1)) return 1; /* b1 elem [ 0x80 .. 0xbf ] */
394 | 
395 |       /* if b0==0xe0: b1 elem [ 0xa0 .. 0xbf ] */
396 |       if (!((b0 & 0x0f) | (b1 & 0x20))) return 1; /* denorm */
397 | 
398 |       /* UTF16 Surrogate pairs [U+D800 .. U+DFFF] */
399 |       /* if b0==0xed: b1 elem [ 0x80 .. 0x9f ] */
400 |       if ((b0 == 0xed) && (b1 & 0x20)) return 1;
401 | 
402 |       goto l_trail1; /* b2 */
403 |     }
404 | 
405 |     if ((b0 & 0xfc) == 0xf0) { /* [ 0xf0 .. 0xf3 ] */
406 |       if ((j+2) >= n) return (n-(j+3));
407 | 
408 |       const uint8_t b1 = buf[j++];
409 | 
410 |       if (utf8_lead_p(b1))         /* b1 elem [ 0x80 .. 0xbf ] */
411 |         return 1;
412 | 
413 |       if (!((b0 & 0x03) | (b1 & 0x30))) /* if b0==0xf0: b1 elem [ 0x90 .. 0xbf ] */
414 |         return 1;
415 | 
416 |       goto l_trail2; /* b1, b2 */
417 |     }
418 | 
419 |     if (b0 == 0xf4) {
420 |       if ((j+2) >= n) return (n-(j+3));
421 | 
422 |       /* b1 */
423 |       if ((buf[j++] & 0xf0) != 0x80) return 1;
424 |       /* b1 elem [ 0x80 .. 0x8f ] */
425 | 
426 |     l_trail2:
427 |       /* b2 */
428 |       if (utf8_lead_p(buf[j++])) return 1;
429 |       /* b2 elem [ 0x80 .. 0xbf ] */
430 | 
431 |     l_trail1:
432 |       /* b3 */
433 |       if (utf8_lead_p(buf[j++])) return 1;
434 |       /* b3 elem [ 0x80 .. 0xbf ] */
435 | 
436 |       continue;
437 |     }
438 | 
439 |     /* invalid b0 byte */
440 |     return 1;
441 |   }
442 | 
443 |   assert(j == n);
444 | 
445 |   return 0;
446 | }
447 | 
448 | 
449 | /* Returns length of longest ASCII-code-point prefix.
450 |  */
451 | size_t
452 | hs_text_short_ascii_length(const uint8_t buf[], const size_t n)
453 | {
454 |   size_t j = 0;
455 | 
456 |   if (is_64bit) {
457 |     /* "vectorized" optimisation checking 8 octets at once
458 |      *
459 |      * NB: A 64-bit aligned buffer is assumed. This is assumption is
460 |      * justified when the buffer is the payload of a `ByteArray#`.
461 |      */
462 |     const uint64_t *buf64 = (const uint64_t*)buf;
463 | 
464 |     for (; (j+7) < n; j+=8, ++buf64)
465 |       if (*buf64 & UINT64_C(0x8080808080808080))
466 |         break;
467 |   } else {
468 |     /* "vectorized" optimisation checking 4 octets at once */
469 |     const uint32_t *buf32 = (const uint32_t*)buf;
470 | 
471 |     for (; (j+3) < n; j+=4, ++buf32)
472 |       if (*buf32 & UINT64_C(0x80808080))
473 |         break;
474 |   }
475 | 
476 |   for (; j < n; ++j)
477 |     if (buf[j] & 0x80)
478 |       return j;
479 | 
480 |   return j;
481 | }
482 | 
483 | /* Test whether well-formed UTF8 string contains only ASCII code-points
484 |  * returns 0 if not ASCII
485 |  *
486 |  * This code assumes a naturally aligned buf[]
487 |  */
488 | int
489 | hs_text_short_is_ascii(const uint8_t buf[], const size_t n)
490 | {
491 |   size_t j = 0;
492 | 
493 |   if (n < 2)
494 |     return 1;
495 | 
496 |   if (is_64bit) {
497 |     /* "vectorized" optimisation checking 8 octets at once
498 |      *
499 |      * NB: A 64-bit aligned buffer is assumed. This is assumption is
500 |      * justified when the buffer is the payload of a `ByteArray#`.
501 |      *
502 |      */
503 |     const uint64_t *buf64 = (const uint64_t*)buf;
504 | 
505 |     for (; (j+7) < n; j+=8, ++buf64)
506 |       if (*buf64 & UINT64_C(0x8080808080808080))
507 |         return 0;
508 | 
509 |     if (j < n) {
510 |       const int maskshift = (8 - (n - j)) << 3;
511 |       const uint64_t mask = is_bigendian ? (UINT64_C(0x8080808080808080) << maskshift)  /* big endian */
512 |                                          : (UINT64_C(0x8080808080808080) >> maskshift); /* little endian */
513 | 
514 |       if (*buf64 & mask)
515 |         return 0;
516 |     }
517 |   } else {
518 |     /* "vectorized" optimisation checking 4 octets at once */
519 |     const uint32_t *buf32 = (const uint32_t*)buf;
520 | 
521 |     for (; (j+3) < n; j+=4, ++buf32)
522 |       if (*buf32 & UINT64_C(0x80808080))
523 |         return 0;
524 | 
525 |     for (; j < n; ++j)
526 |       if (buf[j] & 0x80)
527 |         return 0;
528 |   }
529 | 
530 |   return 1;
531 | }
532 | 
533 | /*
534 |  * Compute length of (transcoded) mutf8 literal
535 |  *
536 |  * If the mutf8 literal does not contain either surrogates nor escaped
537 |  * NULs, a positive length is returned which matches what strlen(3)
538 |  * would have returned.
539 |  *
540 |  * Otherwise, a negated size is returned which corresponds to the size
541 |  * of a the mutf8->utf8 transcoded string.
542 |  *
543 |  */
544 | HsInt
545 | hs_text_short_mutf8_strlen(const uint8_t buf[])
546 | {
547 |   size_t j = 0;
548 |   size_t nulls = 0;
549 |   bool surr_seen = false;
550 | 
551 |   for (;;) {
552 |     const uint8_t b0 = buf[j];
553 | 
554 |     if (unlikely(!b0))
555 |       break;
556 | 
557 |     if (likely(!(b0 & 0x80)))
558 |       j += 1;   /* 0_______ */
559 |     else
560 |       switch(b0 >> 4) {
561 |       case 0xf: /* 11110___ */
562 |         j += 4;
563 |         break;
564 |       case 0xe: /* 1110____ */
565 |         /* UTF16 Surrogate pairs [U+D800 .. U+DFFF] */
566 |         if (unlikely(!surr_seen && (b0 == 0xed) && (buf[j+1] & 0x20)))
567 |           surr_seen = true;
568 |         j += 3;
569 |         break;
570 |       default:  /* 110_____ */
571 |         /* escaped NUL */
572 |         if (unlikely((b0 == 0xc0) && (buf[j+1] == 0x80)))
573 |           nulls += 1;
574 |         j += 2;
575 |         break;
576 |       }
577 |   } /* for */
578 | 
579 | 
580 |   if ((nulls > 0) || surr_seen)
581 |     return -(HsInt)(j - nulls);
582 | 
583 |   return j;
584 | }
585 | 
586 | /* Transcode Modified UTF-8 to proper UTF-8
587 |  *
588 |  * This involves
589 |  *
590 |  *  1. Unescape denormal 2-byte NULs (0xC0 0x80)
591 |  *  2. Rewrite surrogate pairs to U+FFFD
592 |  */
593 | void
594 | hs_text_short_mutf8_trans(const uint8_t src0[], uint8_t dst0[])
595 | {
596 |   const uint8_t *src = src0;
597 |   uint8_t *dst = dst0;
598 | 
599 |   for (;;) {
600 |     const uint8_t b0 = *src++;
601 |     assert(utf8_lead_p(b0));
602 | 
603 |     if (likely(!(b0 & 0x80))) { /* 0_______ */
604 |       if (unlikely(!b0))
605 |         break;
606 | 
607 |       *dst++ = b0;
608 |       continue;
609 |     }
610 | 
611 |     switch(b0 >> 4) {
612 |     case 0xf: /* 11110___ */
613 |       assert(!utf8_lead_p(src[0]));
614 |       assert(!utf8_lead_p(src[1]));
615 |       assert(!utf8_lead_p(src[2]));
616 |       *dst++ = b0;
617 |       *dst++ = *src++;
618 |       *dst++ = *src++;
619 |       *dst++ = *src++;
620 |       break;
621 | 
622 |     case 0xe: { /* 1110____ */
623 |       const uint8_t b1 = *src++;
624 |       const uint8_t b2 = *src++;
625 |       assert(!utf8_lead_p(b1));
626 |       assert(!utf8_lead_p(b2));
627 |       if (unlikely((b0 == 0xed) && (b1 & 0x20))) {
628 |         /* UTF16 Surrogate pairs [U+D800 .. U+DFFF]
629 |          * -> translate into U+FFFD
630 |          */
631 |         *dst++ = 0xef;
632 |         *dst++ = 0xbf;
633 |         *dst++ = 0xbd;
634 |       } else {
635 |         *dst++ = b0;
636 |         *dst++ = b1;
637 |         *dst++ = b2;
638 |       }
639 |       break;
640 |     }
641 |     default: { /* 110_____ */
642 |       const uint8_t b1 = *src++;
643 |       assert(!utf8_lead_p(b1));
644 |       if (unlikely((b0 == 0xc0) && (b1 == 0x80))) {
645 |         /* escaped/denormal U+0000 -> normalize */
646 |         *dst++ = 0x00;
647 |       } else {
648 |         *dst++ = b0;
649 |         *dst++ = b1;
650 |       }
651 |       break;
652 |     }
653 |     } /* switch */
654 |   } /* for */
655 | 
656 |   assert(labs(hs_text_short_mutf8_strlen(src0)) == (dst - dst0));
657 | }
658 | 


--------------------------------------------------------------------------------
/src/Data/Text/Short/Internal.hs:
--------------------------------------------------------------------------------
   1 | {-# LANGUAGE BangPatterns               #-}
   2 | {-# LANGUAGE CPP                        #-}
   3 | {-# LANGUAGE DeriveDataTypeable         #-}
   4 | {-# LANGUAGE GeneralizedNewtypeDeriving #-}
   5 | {-# LANGUAGE MagicHash                  #-}
   6 | {-# LANGUAGE RankNTypes                 #-}
   7 | {-# LANGUAGE TemplateHaskellQuotes      #-}
   8 | {-# LANGUAGE TypeFamilies               #-}
   9 | {-# LANGUAGE UnboxedTuples              #-}
  10 | {-# LANGUAGE UnliftedFFITypes           #-}
  11 | {-# LANGUAGE Unsafe                     #-}
  12 | {-# LANGUAGE ViewPatterns               #-}
  13 | 
  14 | -- |
  15 | -- Module      : Data.Text.Short.Internal
  16 | -- Copyright   : © Herbert Valerio Riedel 2017
  17 | -- License     : BSD3
  18 | --
  19 | -- Maintainer  : hvr@gnu.org
  20 | -- Stability   : stable
  21 | --
  22 | -- Memory-efficient representation of Unicode text strings.
  23 | --
  24 | -- @since 0.1
  25 | module Data.Text.Short.Internal
  26 |     ( -- * The 'ShortText' type
  27 |       ShortText(..)
  28 | 
  29 |       -- * Basic operations
  30 |     , null
  31 |     , length
  32 |     , isAscii
  33 |     , splitAt
  34 |     , splitAtEnd
  35 |     , indexEndMaybe
  36 |     , indexMaybe
  37 |     , isPrefixOf
  38 |     , stripPrefix
  39 |     , isSuffixOf
  40 |     , stripSuffix
  41 | 
  42 |     , cons
  43 |     , snoc
  44 |     , uncons
  45 |     , unsnoc
  46 | 
  47 |     , findIndex
  48 |     , find
  49 |     , all
  50 | 
  51 |     , span
  52 |     , spanEnd
  53 |     , split
  54 | 
  55 |     , intersperse
  56 |     , intercalate
  57 |     , reverse
  58 |     , replicate
  59 | 
  60 |     , filter
  61 |     , dropAround
  62 | 
  63 |     , foldl
  64 |     , foldl'
  65 |     , foldr
  66 |     , foldl1
  67 |     , foldl1'
  68 |     , foldr1
  69 | 
  70 |       -- * Conversions
  71 |       -- ** 'Char'
  72 |     , singleton
  73 | 
  74 |       -- ** 'String'
  75 |     , Data.Text.Short.Internal.fromString
  76 |     , toString
  77 | 
  78 |       -- ** 'T.Text'
  79 |     , fromText
  80 |     , toText
  81 | 
  82 |       -- ** 'BS.ByteString'
  83 |     , fromShortByteString
  84 |     , fromShortByteStringUnsafe
  85 |     , toShortByteString
  86 | 
  87 |     , fromByteString
  88 |     , fromByteStringUnsafe
  89 |     , toByteString
  90 | 
  91 |     , toBuilder
  92 | 
  93 |       -- * misc
  94 |       -- ** For Haddock
  95 | 
  96 |     , BS.ByteString
  97 |     , T.Text
  98 |     , module Prelude
  99 |     , module Data.Monoid
 100 | 
 101 |       -- ** Internals
 102 |     , isValidUtf8
 103 |     ) where
 104 | 
 105 | import           Prelude
 106 |   (Bool(..), Ordering(..), Int, Char, String, Maybe(..), IO, Eq, Ord, Num, Read,
 107 |    Show, ($), ($!), (.), (==), (/=), (+), (*), (-), (>>), (<=), (<), (>), (>=),
 108 |    compare, show, showsPrec, readsPrec, abs, return, fmap, error,
 109 |    undefined, otherwise, fromIntegral, max, min, not, fst, snd, map, seq, fail, maybe)
 110 | 
 111 | import           Control.DeepSeq                (NFData)
 112 | import           Control.Monad.ST               (stToIO)
 113 | import           Data.Binary
 114 | import           Data.Bits
 115 | import qualified Data.ByteString                as BS
 116 | import qualified Data.ByteString.Builder        as BB
 117 | import           Data.ByteString.Short          (ShortByteString)
 118 | import qualified Data.ByteString.Short          as BSS
 119 | import qualified Data.ByteString.Short.Internal as BSSI
 120 | import           Data.Char                      (ord)
 121 | import           Data.Data                      (Data(..),constrIndex, Constr,
 122 |                                                  mkConstr, DataType, mkDataType,
 123 |                                                  Fixity(Prefix))
 124 | import           Data.Hashable                  (Hashable)
 125 | import           Data.Typeable                  (Typeable)
 126 | import qualified Data.List                      as List
 127 | import           Data.Maybe                     (fromMaybe, isNothing)
 128 | import           Data.Monoid                    (Monoid, mempty, mconcat)
 129 | import           Data.Semigroup
 130 | import qualified Data.String                    as S
 131 | import qualified Data.Text                      as T
 132 | import           Foreign.C
 133 | import           GHC.Base                       (assert, unsafeChr)
 134 | import qualified GHC.CString                    as GHC
 135 | import           GHC.Exts                       (Addr#, ByteArray#, Int (I#),
 136 |                                                  Int#, MutableByteArray#,
 137 |                                                  Ptr (..), RealWorld, Word (W#))
 138 | import qualified GHC.Exts
 139 | import qualified GHC.Foreign                    as GHC
 140 | import           GHC.IO.Encoding
 141 | import           GHC.ST
 142 | import           System.IO.Unsafe
 143 | import           Text.Printf                    (PrintfArg, formatArg,
 144 |                                                  formatString)
 145 | 
 146 | import qualified Language.Haskell.TH.Syntax     as TH
 147 | 
 148 | #if MIN_VERSION_text(2,0,0)
 149 | import qualified Data.Text.Internal             as TI
 150 | import qualified Data.Text.Array                as TA
 151 | #else
 152 | import qualified Data.Text.Encoding             as T
 153 | #endif
 154 | 
 155 | import qualified PrimOps
 156 | 
 157 | -- | A compact representation of Unicode strings.
 158 | --
 159 | -- A 'ShortText' value is a sequence of Unicode scalar values, as defined in
 160 | -- <http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#page=35 §3.9, definition D76 of the Unicode 5.2 standard >;
 161 | -- This means that a 'ShortText' is a list of (scalar) Unicode code-points (i.e. code-points in the range @[U+00 .. U+D7FF] ∪ [U+E000 .. U+10FFFF]@).
 162 | --
 163 | -- This type relates to 'T.Text' as 'ShortByteString' relates to 'BS.ByteString' by providing a more compact type. Please consult the documentation of "Data.ByteString.Short" for more information.
 164 | --
 165 | -- Currently, a boxed unshared 'T.Text' has a memory footprint of 6 words (i.e. 48 bytes on 64-bit systems) plus 1, 2, 3 or 4 bytes per code-point for text-2 (due to the internal UTF-8 representation) or 2 or 4 bytes per  code-point for text-1 (due to the internal UTF-16 representation). Each 'T.Text' value which can share its payload with another 'T.Text' requires only 4 words additionally. Unlike 'BS.ByteString', 'T.Text' use unpinned memory.
 166 | --
 167 | -- In comparison, the footprint of a boxed 'ShortText' is only 4 words (i.e. 32 bytes on 64-bit systems) plus 1, 2, 3, or 4 bytes per code-point (due to the internal UTF-8 representation).
 168 | --
 169 | -- It can be shown that for realistic data <http://utf8everywhere.org/#asian UTF-16, which is used by text-1, has a space overhead of 50% over UTF-8>.
 170 | --
 171 | -- __NOTE__: The `Typeable` instance isn't defined for GHC 7.8 (and older) prior to @text-short-0.1.3@
 172 | --
 173 | -- @since 0.1
 174 | newtype ShortText = ShortText ShortByteString
 175 |                   deriving (Hashable,Monoid,NFData,Data.Semigroup.Semigroup,Typeable)
 176 | 
 177 | -- | It exposes a similar 'Data' instance abstraction as 'T.Text' (see
 178 | -- discussion referenced there for more details), preserving the
 179 | -- @[Char]@ data abstraction at the cost of inefficiency.
 180 | --
 181 | -- @since 0.1.3
 182 | instance Data ShortText where
 183 |   gfoldl f z txt = z fromString `f` (toString txt)
 184 |   toConstr _ = packConstr
 185 |   gunfold k z c = case constrIndex c of
 186 |     1 -> k (z fromString)
 187 |     _ -> error "gunfold"
 188 |   dataTypeOf _ = shortTextDataType
 189 | 
 190 | packConstr :: Constr
 191 | packConstr = mkConstr shortTextDataType "fromString" [] Prefix
 192 | 
 193 | shortTextDataType :: DataType
 194 | shortTextDataType = mkDataType "Data.Text.Short" [packConstr]
 195 | 
 196 | instance Eq ShortText where
 197 |   {-# INLINE (==) #-}
 198 |   (==) x y
 199 |     | lx /= ly  = False
 200 |     | lx ==  0  = True
 201 |     | otherwise = case PrimOps.compareByteArrays# (toByteArray# x) 0# (toByteArray# y) 0# n# of
 202 |                     0# -> True
 203 |                     _  -> False
 204 |     where
 205 |       !lx@(I# n#) = toLength x
 206 |       !ly = toLength y
 207 | 
 208 | instance Ord ShortText where
 209 |   compare t1 t2
 210 |     | n == 0  = compare n1 n2
 211 |     | otherwise = case PrimOps.compareByteArrays# ba1# 0# ba2# 0# n# of
 212 |         r# | I# r# < 0 -> LT
 213 |            | I# r# > 0 -> GT
 214 |            | n1 < n2   -> LT
 215 |            | n1 > n2   -> GT
 216 |            | otherwise -> EQ
 217 |     where
 218 |       ba1# = toByteArray# t1
 219 |       ba2# = toByteArray# t2
 220 |       !n1 = toLength t1
 221 |       !n2 = toLength t2
 222 |       !n@(I# n#) = n1 `min` n2
 223 | 
 224 | instance Show ShortText where
 225 |     showsPrec p (ShortText b) = showsPrec p (decodeStringShort' utf8 b)
 226 |     show (ShortText b)        = show        (decodeStringShort' utf8 b)
 227 | 
 228 | instance Read ShortText where
 229 |     readsPrec p = map (\(x,s) -> (ShortText $ encodeStringShort utf8 x,s)) . readsPrec p
 230 | 
 231 | -- | @since 0.1.2
 232 | instance PrintfArg ShortText where
 233 |   formatArg txt = formatString $ toString txt
 234 | 
 235 | -- | The 'Binary' encoding matches the one for 'T.Text'
 236 | instance Binary ShortText where
 237 |     put = put . toShortByteString
 238 |     get = do
 239 |         sbs <- get
 240 |         case fromShortByteString sbs of
 241 |           Nothing -> fail "Binary.get(ShortText): Invalid UTF-8 stream"
 242 |           Just st -> return st
 243 | 
 244 | -- | Since 0.1.3
 245 | instance TH.Lift ShortText where
 246 |     -- TODO: Use DeriveLift with bytestring-0.11.2.0
 247 |     lift t = [| fromString s |]
 248 |       where s = toString t
 249 | 
 250 | #if MIN_VERSION_template_haskell(2,17,0)
 251 |     liftTyped = TH.unsafeCodeCoerce . TH.lift
 252 | #elif MIN_VERSION_template_haskell(2,16,0)
 253 |     liftTyped = TH.unsafeTExpCoerce . TH.lift
 254 | #endif
 255 | 
 256 | -- | \(\mathcal{O}(1)\) Test whether a 'ShortText' is empty.
 257 | --
 258 | -- >>> null ""
 259 | -- True
 260 | --
 261 | -- prop> null (singleton c) == False
 262 | --
 263 | -- prop> null t == (length t == 0)
 264 | --
 265 | -- @since 0.1
 266 | null :: ShortText -> Bool
 267 | null = BSS.null . toShortByteString
 268 | 
 269 | -- | \(\mathcal{O}(n)\) Count the number of Unicode code-points in a 'ShortText'.
 270 | --
 271 | -- >>> length "abcd€"
 272 | -- 5
 273 | --
 274 | -- >>> length ""
 275 | -- 0
 276 | --
 277 | -- prop> length t >= 0
 278 | --
 279 | -- @since 0.1
 280 | length :: ShortText -> Int
 281 | length st = fromIntegral $ unsafeDupablePerformIO (c_text_short_length (toByteArray# st) (toCSize st))
 282 | 
 283 | foreign import ccall unsafe "hs_text_short_length" c_text_short_length :: ByteArray# -> CSize -> IO CSize
 284 | 
 285 | -- | \(\mathcal{O}(n)\) Test whether 'ShortText' contains only ASCII code-points (i.e. only U+0000 through U+007F).
 286 | --
 287 | -- This is a more efficient version of @'all' 'Data.Char.isAscii'@.
 288 | --
 289 | -- >>> isAscii ""
 290 | -- True
 291 | --
 292 | -- >>> isAscii "abc\NUL"
 293 | -- True
 294 | --
 295 | -- >>> isAscii "abcd€"
 296 | -- False
 297 | --
 298 | -- prop> isAscii t == all (< '\x80') t
 299 | --
 300 | -- @since 0.1
 301 | isAscii :: ShortText -> Bool
 302 | isAscii st = (/= 0) $ unsafeDupablePerformIO (c_text_short_is_ascii (toByteArray# st) sz)
 303 |   where
 304 |     sz = toCSize st
 305 | 
 306 | foreign import ccall unsafe "hs_text_short_is_ascii" c_text_short_is_ascii :: ByteArray# -> CSize -> IO CInt
 307 | 
 308 | -- | \(\mathcal{O}(n)\) Test whether /all/ code points in 'ShortText' satisfy a predicate.
 309 | --
 310 | -- >>> all (const False) ""
 311 | -- True
 312 | --
 313 | -- >>> all (> 'c') "abcdabcd"
 314 | -- False
 315 | --
 316 | -- >>> all (/= 'c') "abdabd"
 317 | -- True
 318 | --
 319 | -- @since 0.1.2
 320 | all :: (Char -> Bool) -> ShortText -> Bool
 321 | all p st = isNothing (findOfs (not . p) st (B 0))
 322 | 
 323 | -- | \(\mathcal{O}(n)\) Return the left-most codepoint in 'ShortText' that satisfies the given predicate.
 324 | --
 325 | -- >>> find (> 'b') "abcdabcd"
 326 | -- Just 'c'
 327 | --
 328 | -- >>> find (> 'b') "ababab"
 329 | -- Nothing
 330 | --
 331 | -- @since 0.1.2
 332 | find :: (Char -> Bool) -> ShortText -> Maybe Char
 333 | find p st = go 0
 334 |   where
 335 |     go !ofs
 336 |       | ofs >= sz  = Nothing
 337 |       | otherwise  = let (c,ofs') = decodeCharAtOfs st ofs
 338 |                      in c `seq` ofs' `seq`
 339 |                         if p c
 340 |                         then Just c
 341 |                         else go ofs'
 342 | 
 343 |     !sz = toB st
 344 | 
 345 | -- | \(\mathcal{O}(n)\) Return the index of the left-most codepoint in 'ShortText' that satisfies the given predicate.
 346 | --
 347 | -- >>> findIndex (> 'b') "abcdabcdef"
 348 | -- Just 2
 349 | --
 350 | -- >>> findIndex (> 'b') "ababab"
 351 | -- Nothing
 352 | --
 353 | -- prop> (indexMaybe t =<< findIndex p t) == find p t
 354 | --
 355 | -- @since 0.1.2
 356 | findIndex :: (Char -> Bool) -> ShortText -> Maybe Int
 357 | findIndex p st = go 0 0
 358 |   where
 359 |     go !ofs !i
 360 |       | ofs >= sz  = Nothing
 361 |       | otherwise  = let (c,ofs') = decodeCharAtOfs st ofs
 362 |                      in c `seq` ofs' `seq`
 363 |                         if p c
 364 |                         then Just i
 365 |                         else go ofs' (i+1)
 366 | 
 367 |     !sz = toB st
 368 | 
 369 | 
 370 | -- | \(\mathcal{O}(n)\) Splits a string into components delimited by separators,
 371 | -- where the predicate returns True for a separator element.  The
 372 | -- resulting components do not contain the separators.  Two adjacent
 373 | -- separators result in an empty component in the output.  eg.
 374 | --
 375 | -- >>> split (=='a') "aabbaca"
 376 | -- ["","","bb","c",""]
 377 | --
 378 | -- >>> split (=='a') ""
 379 | -- [""]
 380 | --
 381 | -- prop> intercalate (singleton c) (split (== c) t) = t
 382 | --
 383 | -- __NOTE__: 'split' never returns an empty list to match the semantics of its counterpart from "Data.Text".
 384 | --
 385 | -- @since 0.1.3
 386 | split :: (Char -> Bool) -> ShortText -> [ShortText]
 387 | split p st0 = go 0
 388 |   where
 389 |     go !ofs0 = case findOfs' p st0 ofs0 of
 390 |       Just (ofs1,ofs2) -> slice st0 ofs0 (ofs1-ofs0) : go ofs2
 391 |       Nothing
 392 |         | ofs0 == 0 -> st0 : []
 393 |         | otherwise -> slice st0 ofs0 (maxOfs-ofs0) : []
 394 | 
 395 |     !maxOfs = toB st0
 396 | 
 397 | -- internal helper
 398 | {-# INLINE findOfs #-}
 399 | findOfs :: (Char -> Bool) -> ShortText -> B -> Maybe B
 400 | findOfs p st = go
 401 |   where
 402 |     go :: B -> Maybe B
 403 |     go !ofs | ofs >= sz = Nothing
 404 |     go !ofs | p c       = Just ofs
 405 |             | otherwise = go ofs'
 406 |       where
 407 |         (c,ofs') = decodeCharAtOfs st ofs
 408 | 
 409 |     !sz = toB st
 410 | 
 411 | {-# INLINE findOfs' #-}
 412 | findOfs' :: (Char -> Bool) -> ShortText -> B -> Maybe (B,B)
 413 | findOfs' p st = go
 414 |   where
 415 |     go :: B -> Maybe (B,B)
 416 |     go !ofs | ofs >= sz = Nothing
 417 |     go !ofs | p c       = Just (ofs,ofs')
 418 |             | otherwise = go ofs'
 419 |       where
 420 |         (c,ofs') = decodeCharAtOfs st ofs
 421 | 
 422 |     !sz = toB st
 423 | 
 424 | 
 425 | {-# INLINE findOfsRev #-}
 426 | findOfsRev :: (Char -> Bool) -> ShortText -> B -> Maybe B
 427 | findOfsRev p st = go
 428 |   where
 429 |     go (B 0) = Nothing
 430 |     go !ofs
 431 |       | p (cp2ch cp) = Just ofs
 432 |       | otherwise    = go (ofs-cpLen cp)
 433 |       where
 434 |         !cp = readCodePointRev st ofs
 435 | 
 436 | -- | \(\mathcal{O}(n)\) Split 'ShortText' into longest prefix satisfying the given predicate and the remaining suffix.
 437 | --
 438 | -- >>> span (< 'c') "abcdabcd"
 439 | -- ("ab","cdabcd")
 440 | --
 441 | -- prop> fst (span p t) <> snd (span p t) == t
 442 | --
 443 | -- @since 0.1.2
 444 | span :: (Char -> Bool) -> ShortText -> (ShortText,ShortText)
 445 | span p st
 446 |   | Just ofs <- findOfs (not . p) st (B 0) = splitAtOfs ofs st
 447 |   | otherwise = (st,mempty)
 448 | 
 449 | -- | \(\mathcal{O}(n)\) Split 'ShortText' into longest suffix satisfying the given predicate and the preceding prefix.
 450 | --
 451 | -- >>> spanEnd (> 'c') "abcdabcd"
 452 | -- ("abcdabc","d")
 453 | --
 454 | -- prop> fst (spanEnd p t) <> snd (spanEnd p t) == t
 455 | --
 456 | -- @since 0.1.2
 457 | spanEnd :: (Char -> Bool) -> ShortText -> (ShortText,ShortText)
 458 | spanEnd p st
 459 |   | Just ofs <- findOfsRev (not . p) st (toB st) = splitAtOfs ofs st
 460 |   | otherwise = (mempty,st)
 461 | 
 462 | ----------------------------------------------------------------------------
 463 | 
 464 | toCSize :: ShortText -> CSize
 465 | toCSize = fromIntegral . BSS.length . toShortByteString
 466 | 
 467 | toB :: ShortText -> B
 468 | toB = fromIntegral . BSS.length . toShortByteString
 469 | 
 470 | toLength :: ShortText -> Int
 471 | toLength st = I# (toLength# st)
 472 | 
 473 | toLength# :: ShortText -> Int#
 474 | toLength# st = GHC.Exts.sizeofByteArray# (toByteArray# st)
 475 | 
 476 | toByteArray# :: ShortText -> ByteArray#
 477 | toByteArray# (ShortText (BSSI.SBS ba#)) = ba#
 478 | 
 479 | -- | \(\mathcal{O}(0)\) Converts to UTF-8 encoded 'ShortByteString'
 480 | --
 481 | -- This operation has effectively no overhead, as it's currently merely a @newtype@-cast.
 482 | --
 483 | -- @since 0.1
 484 | toShortByteString :: ShortText -> ShortByteString
 485 | toShortByteString (ShortText b) = b
 486 | 
 487 | -- | \(\mathcal{O}(n)\) Converts to UTF-8 encoded 'BS.ByteString'
 488 | --
 489 | -- @since 0.1
 490 | toByteString :: ShortText -> BS.ByteString
 491 | toByteString = BSS.fromShort . toShortByteString
 492 | 
 493 | -- | Construct a 'BB.Builder' that encodes 'ShortText' as UTF-8.
 494 | --
 495 | -- @since 0.1
 496 | toBuilder :: ShortText -> BB.Builder
 497 | toBuilder = BB.shortByteString . toShortByteString
 498 | 
 499 | -- | \(\mathcal{O}(n)\) Convert to 'String'
 500 | --
 501 | -- prop> (fromString . toString) t == t
 502 | --
 503 | -- __Note__: See documentation of 'fromString' for why @('toString' . 'fromString')@ is not an identity function.
 504 | --
 505 | -- @since 0.1
 506 | toString :: ShortText -> String
 507 | -- NOTE: impl below beats
 508 | --   toString = decodeStringShort' utf8 . toShortByteString
 509 | -- except for smallish strings
 510 | toString st = go 0
 511 |   where
 512 |     go !ofs
 513 |       | ofs >= sz  = []
 514 |       | otherwise  = let (c,ofs') = decodeCharAtOfs st ofs
 515 |                      in c `seq` ofs' `seq` (c : go ofs')
 516 | 
 517 |     !sz = toB st
 518 | 
 519 | ----------------------------------------------------------------------------
 520 | -- Folds
 521 | 
 522 | -- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with
 523 | -- the binary operator and an initial in forward direction (i.e. from
 524 | -- left to right).
 525 | --
 526 | -- >>> foldl (\_ _ -> True) False ""
 527 | -- False
 528 | --
 529 | -- >>> foldl (\s c -> c : s) ['.'] "abcd"
 530 | -- "dcba."
 531 | --
 532 | -- @since 0.1.2
 533 | foldl :: (a -> Char -> a) -> a -> ShortText -> a
 534 | foldl f z st = go 0 z
 535 |   where
 536 |     go !ofs acc
 537 |       | ofs >= sz  = acc
 538 |       | otherwise  = let (c,ofs') = decodeCharAtOfs st ofs
 539 |                      in c `seq` ofs' `seq` go ofs' (f acc c)
 540 | 
 541 |     !sz = toB st
 542 | 
 543 | -- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with the binary operator.
 544 | --
 545 | -- >>> foldl1 max "abcdcba"
 546 | -- 'd'
 547 | --
 548 | -- >>> foldl1 const "abcd"
 549 | -- 'a'
 550 | --
 551 | -- >>> foldl1 (flip const) "abcd"
 552 | -- 'd'
 553 | --
 554 | -- __Note__: Will throw an 'error' exception if index is out of bounds.
 555 | --
 556 | -- @since 0.1.2
 557 | foldl1 :: (Char -> Char -> Char) -> ShortText -> Char
 558 | foldl1 f st
 559 |   | sz == 0    = error "foldl1: empty ShortText"
 560 |   | otherwise  = go c0sz c0
 561 |   where
 562 |     go !ofs acc
 563 |       | ofs >= sz  = acc
 564 |       | otherwise  = let (c,ofs') = decodeCharAtOfs st ofs
 565 |                      in c `seq` ofs' `seq` go ofs' (f acc c)
 566 |     !sz = toB st
 567 |     (c0,c0sz) = decodeCharAtOfs st (B 0)
 568 | 
 569 | -- | \(\mathcal{O}(n)\) Strict version of 'foldl'.
 570 | --
 571 | -- @since 0.1.2
 572 | foldl' :: (a -> Char -> a) -> a -> ShortText -> a
 573 | foldl' f !z st = go 0 z
 574 |   where
 575 |     go !ofs !acc
 576 |       | ofs >= sz  = acc
 577 |       | otherwise  = let (c,ofs') = decodeCharAtOfs st ofs
 578 |                      in c `seq` ofs' `seq` go ofs' (f acc c)
 579 | 
 580 |     !sz = toB st
 581 | 
 582 | -- | \(\mathcal{O}(n)\) Strict version of 'foldl1'.
 583 | --
 584 | -- @since 0.1.2
 585 | foldl1' :: (Char -> Char -> Char) -> ShortText -> Char
 586 | foldl1' f st
 587 |   | sz == 0    = error "foldl1: empty ShortText"
 588 |   | otherwise  = go c0sz c0
 589 |   where
 590 |     go !ofs !acc
 591 |       | ofs >= sz  = acc
 592 |       | otherwise  = let (c,ofs') = decodeCharAtOfs st ofs
 593 |                      in c `seq` ofs' `seq` go ofs' (f acc c)
 594 |     !sz = toB st
 595 |     (c0,c0sz) = decodeCharAtOfs st (B 0)
 596 | 
 597 | -- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with
 598 | -- the binary operator and an initial in reverse direction (i.e. from
 599 | -- right to left).
 600 | --
 601 | -- >>> foldr (\_ _ -> True) False ""
 602 | -- False
 603 | --
 604 | -- >>> foldr (:) ['.'] "abcd"
 605 | -- "abcd."
 606 | --
 607 | -- @since 0.1.2
 608 | foldr :: (Char -> a -> a) -> a -> ShortText -> a
 609 | foldr f z st = go 0
 610 |   where
 611 |     go !ofs
 612 |       | ofs >= sz  = z
 613 |       | otherwise  = let (c,ofs') = decodeCharAtOfs st ofs
 614 |                      in c `seq` ofs' `seq` f c (go ofs')
 615 | 
 616 |     !sz = toB st
 617 | 
 618 | -- | \(\mathcal{O}(n)\) Reduces the characters of the 'ShortText' with the binary operator.
 619 | --
 620 | -- >>> foldr1 max "abcdcba"
 621 | -- 'd'
 622 | --
 623 | -- >>> foldr1 const "abcd"
 624 | -- 'a'
 625 | --
 626 | -- >>> foldr1 (flip const) "abcd"
 627 | -- 'd'
 628 | --
 629 | -- __Note__: Will throw an 'error' exception if index is out of bounds.
 630 | --
 631 | -- @since 0.1.2
 632 | foldr1 :: (Char -> Char -> Char) -> ShortText -> Char
 633 | foldr1 f st
 634 |   | sz == 0    = error "foldr1: empty ShortText"
 635 |   | otherwise  = go 0
 636 |   where
 637 |     go !ofs = let (c,ofs') = decodeCharAtOfs st ofs
 638 |               in c `seq` ofs' `seq`
 639 |                  (if ofs' >= sz then c else f c (go ofs'))
 640 | 
 641 |     !sz = toB st
 642 | 
 643 | -- | \(\mathcal{O}(n)\) Convert to 'T.Text'
 644 | --
 645 | -- prop> (fromText . toText) t == t
 646 | --
 647 | -- prop> (toText . fromText) t == t
 648 | --
 649 | -- This is \(\mathcal{O}(1)\) with @text-2@.
 650 | -- Previously it wasn't because 'T.Text' used UTF-16 as its internal representation.
 651 | --
 652 | -- @since 0.1
 653 | toText :: ShortText -> T.Text
 654 | #if MIN_VERSION_text(2,0,0)
 655 | toText (ShortText (BSSI.SBS ba)) = TI.Text (TA.ByteArray ba) 0 (I# (GHC.Exts.sizeofByteArray# ba))
 656 | #else
 657 | toText = T.decodeUtf8 . toByteString
 658 | #endif
 659 | 
 660 | ----
 661 | 
 662 | -- | \(\mathcal{O}(n)\) Construct/pack from 'String'
 663 | --
 664 | -- >>> fromString []
 665 | -- ""
 666 | --
 667 | -- >>> fromString ['a','b','c']
 668 | -- "abc"
 669 | --
 670 | -- >>> fromString ['\55295','\55296','\57343','\57344'] -- U+D7FF U+D800 U+DFFF U+E000
 671 | -- "\55295\65533\65533\57344"
 672 | --
 673 | -- __Note__: This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD.
 674 | --
 675 | -- @since 0.1
 676 | fromString :: String -> ShortText
 677 | fromString s = case s of
 678 |   []  -> mempty
 679 |   [c] -> singleton $ r c
 680 |   _   -> ShortText . encodeStringShort utf8 . map r $ s
 681 |   where
 682 |     r c | isSurr (ord c) = '\xFFFD'
 683 |         | otherwise      = c
 684 | 
 685 | -- | \(\mathcal{O}(n)\) Construct 'ShortText' from 'T.Text'
 686 | --
 687 | -- This is \(\mathcal{O}(1)\) with @text-2@ when the 'T.Text' is not sliced.
 688 | -- Previously it wasn't because 'T.Text' used UTF-16 as its internal representation.
 689 | --
 690 | -- @since 0.1
 691 | fromText :: T.Text -> ShortText
 692 | #if MIN_VERSION_text(2,0,0)
 693 | fromText (TI.Text (TA.ByteArray ba) off len) =
 694 |     ShortText (BSSI.SBS (case sliceByteArray (TA.ByteArray ba) off len of TA.ByteArray ba' -> ba'))
 695 | 
 696 | sliceByteArray :: TA.Array -> Int -> Int -> TA.Array
 697 | sliceByteArray ta@(TA.ByteArray ba) 0 len
 698 |     | len == I# (GHC.Exts.sizeofByteArray# ba)
 699 |     = ta
 700 | sliceByteArray ta off len = TA.run $ do
 701 |     ma <- TA.new len
 702 |     TA.copyI len ma 0 ta off
 703 |     return ma
 704 | 
 705 | #else
 706 | fromText = fromByteStringUnsafe . T.encodeUtf8
 707 | #endif
 708 | 
 709 | -- | \(\mathcal{O}(n)\) Construct 'ShortText' from UTF-8 encoded 'ShortByteString'
 710 | --
 711 | -- This operation doesn't copy the input 'ShortByteString' but it
 712 | -- cannot be \(\mathcal{O}(1)\) because we need to validate the UTF-8 encoding.
 713 | --
 714 | -- Returns 'Nothing' in case of invalid UTF-8 encoding.
 715 | --
 716 | -- >>> fromShortByteString "\x00\x38\xF0\x90\x8C\x9A" -- U+00 U+38 U+1031A
 717 | -- Just "\NUL8\66330"
 718 | --
 719 | -- >>> fromShortByteString "\xC0\x80" -- invalid denormalised U+00
 720 | -- Nothing
 721 | --
 722 | -- >>> fromShortByteString "\xED\xA0\x80" -- U+D800 (non-scalar code-point)
 723 | -- Nothing
 724 | --
 725 | -- >>> fromShortByteString "\xF4\x8f\xbf\xbf" -- U+10FFFF
 726 | -- Just "\1114111"
 727 | --
 728 | -- >>> fromShortByteString "\xF4\x90\x80\x80" -- U+110000 (invalid)
 729 | -- Nothing
 730 | --
 731 | -- prop> fromShortByteString (toShortByteString t) == Just t
 732 | --
 733 | -- @since 0.1
 734 | fromShortByteString :: ShortByteString -> Maybe ShortText
 735 | fromShortByteString sbs
 736 |   | isValidUtf8 st  = Just st
 737 |   | otherwise       = Nothing
 738 |   where
 739 |     st = ShortText sbs
 740 | 
 741 | -- | \(\mathcal{O}(0)\) Construct 'ShortText' from UTF-8 encoded 'ShortByteString'
 742 | --
 743 | -- This operation has effectively no overhead, as it's currently merely a @newtype@-cast.
 744 | --
 745 | -- __WARNING__: Unlike the safe 'fromShortByteString' conversion, this
 746 | -- conversion is /unsafe/ as it doesn't validate the well-formedness of the
 747 | -- UTF-8 encoding.
 748 | --
 749 | -- @since 0.1.1
 750 | fromShortByteStringUnsafe :: ShortByteString -> ShortText
 751 | fromShortByteStringUnsafe = ShortText
 752 | 
 753 | -- | \(\mathcal{O}(n)\) Construct 'ShortText' from UTF-8 encoded 'BS.ByteString'
 754 | --
 755 | -- 'fromByteString' accepts (or rejects) the same input data as 'fromShortByteString'.
 756 | --
 757 | -- Returns 'Nothing' in case of invalid UTF-8 encoding.
 758 | --
 759 | -- @since 0.1
 760 | fromByteString :: BS.ByteString -> Maybe ShortText
 761 | fromByteString = fromShortByteString . BSS.toShort
 762 | 
 763 | -- | \(\mathcal{O}(n)\) Construct 'ShortText' from UTF-8 encoded 'BS.ByteString'
 764 | --
 765 | -- This operation is \(\mathcal{O}(n)\) because the 'BS.ByteString' needs to be
 766 | -- copied into an unpinned 'ByteArray#'.
 767 | --
 768 | -- __WARNING__: Unlike the safe 'fromByteString' conversion, this
 769 | -- conversion is /unsafe/ as it doesn't validate the well-formedness of the
 770 | -- UTF-8 encoding.
 771 | --
 772 | -- @since 0.1.1
 773 | fromByteStringUnsafe :: BS.ByteString -> ShortText
 774 | fromByteStringUnsafe = ShortText . BSS.toShort
 775 | 
 776 | ----------------------------------------------------------------------------
 777 | 
 778 | encodeString :: TextEncoding -> String -> BS.ByteString
 779 | encodeString te str = unsafePerformIO $ GHC.withCStringLen te str BS.packCStringLen
 780 | 
 781 | -- decodeString :: TextEncoding -> BS.ByteString -> Maybe String
 782 | -- decodeString te bs = cvtEx $ unsafePerformIO $ try $ BS.useAsCStringLen bs (GHC.peekCStringLen te)
 783 | --   where
 784 | --     cvtEx :: Either IOException a -> Maybe a
 785 | --     cvtEx = either (const Nothing) Just
 786 | 
 787 | decodeString' :: TextEncoding -> BS.ByteString -> String
 788 | decodeString' te bs = unsafePerformIO $ BS.useAsCStringLen bs (GHC.peekCStringLen te)
 789 | 
 790 | decodeStringShort' :: TextEncoding -> ShortByteString -> String
 791 | decodeStringShort' te = decodeString' te . BSS.fromShort
 792 | 
 793 | encodeStringShort :: TextEncoding -> String -> BSS.ShortByteString
 794 | encodeStringShort te = BSS.toShort . encodeString te
 795 | 
 796 | -- isValidUtf8' :: ShortText -> Int
 797 | -- isValidUtf8' st = fromIntegral $ unsafeDupablePerformIO (c_text_short_is_valid_utf8 (toByteArray# st) (toCSize st))
 798 | 
 799 | isValidUtf8 :: ShortText -> Bool
 800 | isValidUtf8 st = (==0) $ unsafeDupablePerformIO (c_text_short_is_valid_utf8 (toByteArray# st) (toCSize st))
 801 | 
 802 | type CCodePoint = Word
 803 | 
 804 | foreign import ccall unsafe "hs_text_short_is_valid_utf8" c_text_short_is_valid_utf8 :: ByteArray# -> CSize -> IO CInt
 805 | 
 806 | foreign import ccall unsafe "hs_text_short_index_cp" c_text_short_index :: ByteArray# -> CSize -> CSize -> IO CCodePoint
 807 | 
 808 | -- | \(\mathcal{O}(n)\) Lookup /i/-th code-point in 'ShortText'.
 809 | --
 810 | -- Returns 'Nothing' if out of bounds.
 811 | --
 812 | -- prop> indexMaybe (singleton c) 0 == Just c
 813 | --
 814 | -- prop> indexMaybe t 0 == fmap fst (uncons t)
 815 | --
 816 | -- prop> indexMaybe mempty i == Nothing
 817 | --
 818 | -- @since 0.1.2
 819 | indexMaybe :: ShortText -> Int -> Maybe Char
 820 | indexMaybe st i
 821 |   | i < 0      = Nothing
 822 |   | otherwise  = cp2chSafe cp
 823 |   where
 824 |     cp = CP $ unsafeDupablePerformIO (c_text_short_index (toByteArray# st) (toCSize st) (fromIntegral i))
 825 | 
 826 | -- | \(\mathcal{O}(n)\) Lookup /i/-th code-point from the end of 'ShortText'.
 827 | --
 828 | -- Returns 'Nothing' if out of bounds.
 829 | --
 830 | -- prop> indexEndMaybe (singleton c) 0 == Just c
 831 | --
 832 | -- prop> indexEndMaybe t 0 == fmap snd (unsnoc t)
 833 | --
 834 | -- prop> indexEndMaybe mempty i == Nothing
 835 | --
 836 | -- @since 0.1.2
 837 | indexEndMaybe :: ShortText -> Int -> Maybe Char
 838 | indexEndMaybe st i
 839 |   | i < 0      = Nothing
 840 |   | otherwise  = cp2chSafe cp
 841 |   where
 842 |     cp = CP $ unsafeDupablePerformIO (c_text_short_index_rev (toByteArray# st) (toCSize st) (fromIntegral i))
 843 | 
 844 | foreign import ccall unsafe "hs_text_short_index_cp_rev" c_text_short_index_rev :: ByteArray# -> CSize -> CSize -> IO CCodePoint
 845 | 
 846 | 
 847 | -- | \(\mathcal{O}(n)\) Split 'ShortText' into two halves.
 848 | --
 849 | -- @'splitAt' n t@ returns a pair of 'ShortText' with the following properties:
 850 | --
 851 | -- prop> length (fst (splitAt n t)) == min (length t) (max 0 n)
 852 | --
 853 | -- prop> fst (splitAt n t) <> snd (splitAt n t) == t
 854 | --
 855 | -- >>> splitAt 2 "abcdef"
 856 | -- ("ab","cdef")
 857 | --
 858 | -- >>> splitAt 10 "abcdef"
 859 | -- ("abcdef","")
 860 | --
 861 | -- >>> splitAt (-1) "abcdef"
 862 | -- ("","abcdef")
 863 | --
 864 | -- @since 0.1.2
 865 | splitAt :: Int -> ShortText -> (ShortText,ShortText)
 866 | splitAt i st
 867 |   | i <= 0    = (mempty,st)
 868 |   | otherwise = splitAtOfs ofs st
 869 |   where
 870 |     ofs   = csizeToB $
 871 |             unsafeDupablePerformIO (c_text_short_index_ofs (toByteArray# st) stsz (fromIntegral i))
 872 |     stsz  = toCSize st
 873 | 
 874 | -- | \(\mathcal{O}(n)\) Split 'ShortText' into two halves.
 875 | --
 876 | -- @'splitAtEnd' n t@ returns a pair of 'ShortText' with the following properties:
 877 | --
 878 | -- prop> length (snd (splitAtEnd n t)) == min (length t) (max 0 n)
 879 | --
 880 | -- prop> fst (splitAtEnd n t) <> snd (splitAtEnd n t) == t
 881 | --
 882 | -- prop> splitAtEnd n t == splitAt (length t - n) t
 883 | --
 884 | -- >>> splitAtEnd 2 "abcdef"
 885 | -- ("abcd","ef")
 886 | --
 887 | -- >>> splitAtEnd 10 "abcdef"
 888 | -- ("","abcdef")
 889 | --
 890 | -- >>> splitAtEnd (-1) "abcdef"
 891 | -- ("abcdef","")
 892 | --
 893 | -- @since 0.1.2
 894 | splitAtEnd :: Int -> ShortText -> (ShortText,ShortText)
 895 | splitAtEnd i st
 896 |   | i <= 0      = (st,mempty)
 897 |   | ofs >= stsz = (mempty,st)
 898 |   | otherwise   = splitAtOfs ofs st
 899 |   where
 900 |     ofs   = csizeToB $
 901 |             unsafeDupablePerformIO (c_text_short_index_ofs_rev (toByteArray# st) (toCSize st) (fromIntegral (i-1)))
 902 |     stsz  = toB st
 903 | 
 904 | {-# INLINE splitAtOfs #-}
 905 | splitAtOfs :: B -> ShortText -> (ShortText,ShortText)
 906 | splitAtOfs ofs st
 907 |   | ofs  == 0    = (mempty,st)
 908 |   | ofs  >= stsz = (st,mempty)
 909 |   | otherwise    = (slice st 0 ofs, slice st ofs (stsz-ofs))
 910 |   where
 911 |     !stsz  = toB st
 912 | 
 913 | foreign import ccall unsafe "hs_text_short_index_ofs" c_text_short_index_ofs :: ByteArray# -> CSize -> CSize -> IO CSize
 914 | 
 915 | foreign import ccall unsafe "hs_text_short_index_ofs_rev" c_text_short_index_ofs_rev :: ByteArray# -> CSize -> CSize -> IO CSize
 916 | 
 917 | 
 918 | -- | \(\mathcal{O}(n)\) Inverse operation to 'cons'
 919 | --
 920 | -- Returns 'Nothing' for empty input 'ShortText'.
 921 | --
 922 | -- prop> uncons (cons c t) == Just (c,t)
 923 | --
 924 | -- >>> uncons ""
 925 | -- Nothing
 926 | --
 927 | -- >>> uncons "fmap"
 928 | -- Just ('f',"map")
 929 | --
 930 | -- @since 0.1.2
 931 | uncons :: ShortText -> Maybe (Char,ShortText)
 932 | uncons st
 933 |   | null st    = Nothing
 934 |   | len2 == 0  = Just (c0, mempty)
 935 |   | otherwise  = Just (c0, slice st ofs len2)
 936 |   where
 937 |     c0  = cp2ch cp0
 938 |     cp0 = readCodePoint st 0
 939 |     ofs = cpLen cp0
 940 |     len2 = toB st - ofs
 941 | 
 942 | -- | \(\mathcal{O}(n)\) Inverse operation to 'snoc'
 943 | --
 944 | -- Returns 'Nothing' for empty input 'ShortText'.
 945 | --
 946 | -- prop> unsnoc (snoc t c) == Just (t,c)
 947 | --
 948 | -- >>> unsnoc ""
 949 | -- Nothing
 950 | --
 951 | -- >>> unsnoc "fmap"
 952 | -- Just ("fma",'p')
 953 | --
 954 | -- @since 0.1.2
 955 | unsnoc :: ShortText -> Maybe (ShortText,Char)
 956 | unsnoc st
 957 |   | null st    = Nothing
 958 |   | len1 == 0  = Just (mempty, c0)
 959 |   | otherwise  = Just (slice st 0 len1, c0)
 960 |   where
 961 |     c0  = cp2ch cp0
 962 |     cp0 = readCodePointRev st stsz
 963 |     stsz = toB st
 964 |     len1 = stsz - cpLen cp0
 965 | 
 966 | -- | \(\mathcal{O}(n)\) Tests whether the first 'ShortText' is a prefix of the second 'ShortText'
 967 | --
 968 | -- >>> isPrefixOf "ab" "abcdef"
 969 | -- True
 970 | --
 971 | -- >>> isPrefixOf "ac" "abcdef"
 972 | -- False
 973 | --
 974 | -- prop> isPrefixOf "" t == True
 975 | --
 976 | -- prop> isPrefixOf t t == True
 977 | --
 978 | -- @since 0.1.2
 979 | isPrefixOf :: ShortText -> ShortText -> Bool
 980 | isPrefixOf x y
 981 |   | lx > ly = False
 982 |   | lx == 0 = True
 983 |   | otherwise = case PrimOps.compareByteArrays# (toByteArray# x) 0# (toByteArray# y) 0# n# of
 984 |                   0# -> True
 985 |                   _  -> False
 986 |   where
 987 |     !lx@(I# n#) = toLength x
 988 |     !ly = toLength y
 989 | 
 990 | -- | \(\mathcal{O}(n)\) Strip prefix from second 'ShortText' argument.
 991 | --
 992 | -- Returns 'Nothing' if first argument is not a prefix of the second argument.
 993 | --
 994 | -- >>> stripPrefix "text-" "text-short"
 995 | -- Just "short"
 996 | --
 997 | -- >>> stripPrefix "test-" "text-short"
 998 | -- Nothing
 999 | --
1000 | -- @since 0.1.2
1001 | stripPrefix :: ShortText -> ShortText -> Maybe ShortText
1002 | stripPrefix pfx t
1003 |   | isPrefixOf pfx t = Just $! snd (splitAtOfs (toB pfx) t)
1004 |   | otherwise        = Nothing
1005 | 
1006 | -- | \(\mathcal{O}(n)\) Tests whether the first 'ShortText' is a suffix of the second 'ShortText'
1007 | --
1008 | -- >>> isSuffixOf "ef" "abcdef"
1009 | -- True
1010 | --
1011 | -- >>> isPrefixOf "df" "abcdef"
1012 | -- False
1013 | --
1014 | -- prop> isSuffixOf "" t == True
1015 | --
1016 | -- prop> isSuffixOf t t == True
1017 | --
1018 | -- @since 0.1.2
1019 | isSuffixOf :: ShortText -> ShortText -> Bool
1020 | isSuffixOf x y
1021 |   | lx > ly = False
1022 |   | lx == 0 = True
1023 |   | otherwise = case PrimOps.compareByteArrays# (toByteArray# x) 0# (toByteArray# y) ofs2# n# of
1024 |                   0# -> True
1025 |                   _  -> False
1026 |   where
1027 |     !(I# ofs2#) = ly - lx
1028 |     !lx@(I# n#) = toLength x
1029 |     !ly = toLength y
1030 | 
1031 | -- | \(\mathcal{O}(n)\) Strip suffix from second 'ShortText' argument.
1032 | --
1033 | -- Returns 'Nothing' if first argument is not a suffix of the second argument.
1034 | --
1035 | -- >>> stripSuffix "-short" "text-short"
1036 | -- Just "text"
1037 | --
1038 | -- >>> stripSuffix "-utf8" "text-short"
1039 | -- Nothing
1040 | --
1041 | -- @since 0.1.2
1042 | stripSuffix :: ShortText -> ShortText -> Maybe ShortText
1043 | stripSuffix sfx t
1044 |   | isSuffixOf sfx t = Just $! fst (splitAtOfs pfxLen t)
1045 |   | otherwise        = Nothing
1046 |   where
1047 |     pfxLen = toB t - toB sfx
1048 | 
1049 | ----------------------------------------------------------------------------
1050 | 
1051 | -- | \(\mathcal{O}(n)\) Insert character between characters of 'ShortText'.
1052 | --
1053 | -- >>> intersperse '*' "_"
1054 | -- "_"
1055 | --
1056 | -- >>> intersperse '*' "MASH"
1057 | -- "M*A*S*H"
1058 | --
1059 | -- @since 0.1.2
1060 | intersperse :: Char -> ShortText -> ShortText
1061 | intersperse c st
1062 |   | null st = mempty
1063 |   | sn == 1 = st
1064 |   | otherwise = create newsz $ \mba -> do
1065 |       let !cp0 = readCodePoint st 0
1066 |           !cp0sz = cpLen cp0
1067 |       writeCodePointN cp0sz mba 0 cp0
1068 |       go mba (sn - 1) cp0sz cp0sz
1069 |   where
1070 |     newsz = ssz + ((sn-1) `mulB` csz)
1071 |     ssz = toB st
1072 |     sn  = length st
1073 |     csz = cpLen cp
1074 |     cp  = ch2cp c
1075 | 
1076 |     go :: MBA s -> Int -> B -> B -> ST s ()
1077 |     go _   0 !_  !_   = return ()
1078 |     go mba n ofs ofs2 = do
1079 |       let !cp1 = readCodePoint st ofs2
1080 |           !cp1sz = cpLen cp1
1081 |       writeCodePointN csz   mba ofs cp
1082 |       writeCodePointN cp1sz mba (ofs+csz) cp1
1083 |       go mba (n-1) (ofs+csz+cp1sz) (ofs2+cp1sz)
1084 | 
1085 | -- | \(\mathcal{O}(n)\) Insert 'ShortText' inbetween list of 'ShortText's.
1086 | --
1087 | -- >>> intercalate ", " []
1088 | -- ""
1089 | --
1090 | -- >>> intercalate ", " ["foo"]
1091 | -- "foo"
1092 | --
1093 | -- >>> intercalate ", " ["foo","bar","doo"]
1094 | -- "foo, bar, doo"
1095 | --
1096 | -- prop> intercalate "" ts == concat ts
1097 | --
1098 | -- @since 0.1.2
1099 | intercalate :: ShortText -> [ShortText] -> ShortText
1100 | intercalate _ []  = mempty
1101 | intercalate _ [t] = t
1102 | intercalate sep ts
1103 |   | null sep   = mconcat ts
1104 |   | otherwise  = mconcat (List.intersperse sep ts)
1105 | 
1106 | -- | \(\mathcal{O}(n*m)\) Replicate a 'ShortText'.
1107 | --
1108 | -- A repetition count smaller than 1 results in an empty string result.
1109 | --
1110 | -- >>> replicate 3 "jobs!"
1111 | -- "jobs!jobs!jobs!"
1112 | --
1113 | -- >>> replicate 10000 ""
1114 | -- ""
1115 | --
1116 | -- >>> replicate 0 "nothing"
1117 | -- ""
1118 | --
1119 | -- prop> length (replicate n t) == max 0 n * length t
1120 | --
1121 | -- @since 0.1.2
1122 | replicate :: Int -> ShortText -> ShortText
1123 | replicate n0 t
1124 |   | n0 < 1     = mempty
1125 |   | null t    = mempty
1126 |   | otherwise = create (n0 `mulB` sz) (go 0)
1127 |   where
1128 |     go :: Int -> MBA s -> ST s ()
1129 |     go j mba
1130 |       | j == n0    = return ()
1131 |       | otherwise  = do
1132 |           copyByteArray t 0 mba (j `mulB` sz) sz
1133 |           go (j+1) mba
1134 | 
1135 |     sz = toB t
1136 | 
1137 | -- | \(\mathcal{O}(n)\) Reverse characters in 'ShortText'.
1138 | --
1139 | -- >>> reverse "star live desserts"
1140 | -- "stressed evil rats"
1141 | --
1142 | -- prop> reverse (singleton c) == singleton c
1143 | --
1144 | -- prop> reverse (reverse t) == t
1145 | --
1146 | -- @since 0.1.2
1147 | reverse :: ShortText -> ShortText
1148 | reverse st
1149 |   | null st   = mempty
1150 |   | sn == 1   = st
1151 |   | otherwise = create sz $ go sn 0
1152 |   where
1153 |     sz = toB st
1154 |     sn = length st
1155 | 
1156 |     go :: Int -> B -> MBA s -> ST s ()
1157 |     go 0 !_  _   = return ()
1158 |     go i ofs mba = do
1159 |       let !cp   = readCodePoint st ofs
1160 |           !cpsz = cpLen cp
1161 |           !ofs' = ofs+cpsz
1162 |       writeCodePointN cpsz mba (sz - ofs') cp
1163 |       go (i-1) ofs' mba
1164 | 
1165 | 
1166 | -- | \(\mathcal{O}(n)\) Remove characters from 'ShortText' which don't satisfy given predicate.
1167 | --
1168 | -- >>> filter (`notElem` ['a','e','i','o','u']) "You don't need vowels to convey information!"
1169 | -- "Y dn't nd vwls t cnvy nfrmtn!"
1170 | --
1171 | -- prop> filter (const False) t == ""
1172 | --
1173 | -- prop> filter (const True) t == t
1174 | --
1175 | -- prop> length (filter p t) <= length t
1176 | --
1177 | -- prop> filter p t == pack [ c | c <- unpack t, p c ]
1178 | --
1179 | -- @since 0.1.2
1180 | filter :: (Char -> Bool) -> ShortText -> ShortText
1181 | filter p t
1182 |   = case (mofs1,mofs2) of
1183 |       (Nothing,   _)       -> t -- no non-accepted characters found
1184 |       (Just 0,    Nothing) -> mempty -- no accepted characters found
1185 |       (Just ofs1, Nothing) -> slice t 0 ofs1 -- only prefix accepted
1186 |       (Just ofs1, Just ofs2) -> createShrink (t0sz-(ofs2-ofs1)) $ \mba -> do
1187 |         -- copy accepted prefix
1188 |         copyByteArray t 0 mba 0 ofs1
1189 |         -- [ofs1 .. ofs2) are a non-accepted region
1190 |         -- filter rest after ofs2
1191 |         t1sz <- go mba ofs2 ofs1
1192 |         return t1sz
1193 |   where
1194 |     mofs1 = findOfs (not . p) t (B 0) -- first non-accepted Char
1195 |     mofs2 = findOfs p t (fromMaybe (B 0) mofs1) -- first accepted Char
1196 | 
1197 |     t0sz = toB t
1198 | 
1199 |     go :: MBA s -> B -> B -> ST s B
1200 |     go mba !t0ofs !t1ofs
1201 |       | t0ofs >= t0sz = return t1ofs
1202 |       | otherwise = let !cp = readCodePoint t t0ofs
1203 |                         !cpsz = cpLen cp
1204 |                     in if p (cp2ch cp)
1205 |                        then writeCodePointN cpsz mba t1ofs cp >>
1206 |                             go mba (t0ofs+cpsz) (t1ofs+cpsz)
1207 |                        else go mba (t0ofs+cpsz) t1ofs -- skip code-point
1208 | 
1209 | -- | \(\mathcal{O}(n)\) Strip characters from the beginning end and of 'ShortText' which satisfy given predicate.
1210 | --
1211 | -- >>> dropAround (== ' ') "   white   space   "
1212 | -- "white   space"
1213 | --
1214 | -- >>> dropAround (> 'a') "bcdefghi"
1215 | -- ""
1216 | --
1217 | -- @since 0.1.2
1218 | dropAround :: (Char -> Bool) -> ShortText -> ShortText
1219 | dropAround p t0 = case (mofs1,mofs2) of
1220 |                     (Nothing,_) -> mempty
1221 |                     (Just ofs1,Just ofs2)
1222 |                       | ofs1 == 0, ofs2 == t0sz  -> t0
1223 |                       | ofs1 < ofs2  -> create (ofs2-ofs1) $ \mba -> do
1224 |                           copyByteArray t0 ofs1 mba (B 0) (ofs2-ofs1)
1225 |                     (_,_) -> error "dropAround: the impossible happened"
1226 |   where
1227 |     mofs1 = findOfs    (not . p) t0 (B 0)
1228 |     mofs2 = findOfsRev (not . p) t0 t0sz
1229 |     t0sz = toB t0
1230 | 
1231 | ----------------------------------------------------------------------------
1232 | 
1233 | -- | Construct a new 'ShortText' from an existing one by slicing
1234 | --
1235 | -- NB: The 'CSize' arguments refer to byte-offsets
1236 | slice :: ShortText -> B -> B -> ShortText
1237 | slice st ofs len
1238 |   | ofs < 0    = error "invalid offset"
1239 |   | len < 0    = error "invalid length"
1240 |   | len' == 0  = mempty
1241 |   | otherwise  = create len' $ \mba -> copyByteArray st ofs' mba 0 len'
1242 |   where
1243 |     len0 = toB st
1244 |     len' = max 0 (min len (len0-ofs))
1245 |     ofs' = max 0 ofs
1246 | 
1247 | ----------------------------------------------------------------------------
1248 | -- low-level MutableByteArray# helpers
1249 | 
1250 | -- | Byte offset (or size) in bytes
1251 | --
1252 | -- This currently wraps an 'Int' because this is what GHC's primops
1253 | -- currently use for byte offsets/sizes.
1254 | newtype B = B { unB :: Int }
1255 |           deriving (Ord,Eq,Num)
1256 | 
1257 | {- TODO: introduce operators for 'B' to avoid 'Num' -}
1258 | 
1259 | mulB :: Int -> B -> B
1260 | mulB n (B b) = B (n*b)
1261 | 
1262 | csizeFromB :: B -> CSize
1263 | csizeFromB = fromIntegral . unB
1264 | 
1265 | csizeToB :: CSize -> B
1266 | csizeToB = B . fromIntegral
1267 | 
1268 | data MBA s = MBA# { unMBA# :: MutableByteArray# s }
1269 | 
1270 | {-# INLINE create #-}
1271 | create :: B -> (forall s. MBA s -> ST s ()) -> ShortText
1272 | create n go = runST $ do
1273 |   mba <- newByteArray n
1274 |   go mba
1275 |   unsafeFreeze mba
1276 | 
1277 | {-# INLINE createShrink #-}
1278 | createShrink :: B -> (forall s. MBA s -> ST s B) -> ShortText
1279 | createShrink n go = runST $ do
1280 |   mba <- newByteArray n
1281 |   n' <- go mba
1282 |   if n' < n
1283 |     then unsafeFreezeShrink mba n'
1284 |     else unsafeFreeze mba
1285 | 
1286 | {-# INLINE unsafeFreeze #-}
1287 | unsafeFreeze :: MBA s -> ST s ShortText
1288 | unsafeFreeze (MBA# mba#)
1289 |   = ST $ \s -> case GHC.Exts.unsafeFreezeByteArray# mba# s of
1290 |                  (# s', ba# #) -> (# s', ShortText (BSSI.SBS ba#) #)
1291 | 
1292 | {-# INLINE copyByteArray #-}
1293 | copyByteArray :: ShortText -> B -> MBA s -> B -> B -> ST s ()
1294 | copyByteArray (ShortText (BSSI.SBS src#)) (B (I# src_off#)) (MBA# dst#) (B (I# dst_off#)) (B (I# len#))
1295 |   = ST $ \s -> case GHC.Exts.copyByteArray# src# src_off# dst# dst_off# len# s of
1296 |                  s' -> (# s', () #)
1297 | 
1298 | {-# INLINE newByteArray #-}
1299 | newByteArray :: B -> ST s (MBA s)
1300 | newByteArray (B (I# n#))
1301 |   = ST $ \s -> case GHC.Exts.newByteArray# n# s of
1302 |                  (# s', mba# #) -> (# s', MBA# mba# #)
1303 | 
1304 | {-# INLINE writeWord8Array #-}
1305 | writeWord8Array :: MBA s -> B -> Word -> ST s ()
1306 | writeWord8Array (MBA# mba#) (B (I# i#)) (W# w#)
1307 |   = ST $ \s ->
1308 | #if __GLASGOW_HASKELL__ >= 902
1309 |       case GHC.Exts.writeWord8Array# mba# i# (GHC.Exts.wordToWord8# w#) s of
1310 | #else
1311 |       case GHC.Exts.writeWord8Array# mba# i# w# s of
1312 | #endif
1313 |                  s' -> (# s', () #)
1314 | {- not needed yet
1315 | {-# INLINE indexWord8Array #-}
1316 | indexWord8Array :: ShortText -> B -> Word
1317 | indexWord8Array (ShortText (BSSI.SBS ba#)) (B (I# i#)) = W# (GHC.Exts.indexWord8Array# ba# i#)
1318 | -}
1319 | 
1320 | {-# INLINE copyAddrToByteArray #-}
1321 | copyAddrToByteArray :: Ptr a -> MBA RealWorld -> B -> B -> ST RealWorld ()
1322 | copyAddrToByteArray (Ptr src#) (MBA# dst#) (B (I# dst_off#)) (B (I# len#))
1323 |   = ST $ \s -> case GHC.Exts.copyAddrToByteArray# src# dst# dst_off# len# s of
1324 |                  s' -> (# s', () #)
1325 | 
1326 | ----------------------------------------------------------------------------
1327 | -- unsafeFreezeShrink
1328 | 
1329 | #if __GLASGOW_HASKELL__ >= 710
1330 | -- for GHC versions which have the 'shrinkMutableByteArray#' primop
1331 | {-# INLINE unsafeFreezeShrink #-}
1332 | unsafeFreezeShrink :: MBA s -> B -> ST s ShortText
1333 | unsafeFreezeShrink mba n = do
1334 |   shrink mba n
1335 |   unsafeFreeze mba
1336 | 
1337 | {-# INLINE shrink #-}
1338 | shrink :: MBA s -> B -> ST s ()
1339 | shrink (MBA# mba#) (B (I# i#))
1340 |   = ST $ \s -> case GHC.Exts.shrinkMutableByteArray# mba# i# s of
1341 |                  s' -> (# s', () #)
1342 | #else
1343 | -- legacy code for GHC versions which lack `shrinkMutableByteArray#` primop
1344 | {-# INLINE unsafeFreezeShrink #-}
1345 | unsafeFreezeShrink :: MBA s -> B -> ST s ShortText
1346 | unsafeFreezeShrink mba0 n = do
1347 |   mba' <- newByteArray n
1348 |   copyByteArray2 mba0 0 mba' 0 n
1349 |   unsafeFreeze mba'
1350 | 
1351 | {-# INLINE copyByteArray2 #-}
1352 | copyByteArray2 :: MBA s -> B -> MBA s -> B -> B -> ST s ()
1353 | copyByteArray2 (MBA# src#) (B (I# src_off#)) (MBA# dst#) (B (I# dst_off#)) (B( I# len#))
1354 |   = ST $ \s -> case GHC.Exts.copyMutableByteArray# src# src_off# dst# dst_off# len# s of
1355 |                  s' -> (# s', () #)
1356 | #endif
1357 | 
1358 | ----------------------------------------------------------------------------
1359 | -- Helpers for encoding code points into UTF-8 code units
1360 | --
1361 | --   7 bits| <    0x80 | 0xxxxxxx
1362 | --  11 bits| <   0x800 | 110yyyyx  10xxxxxx
1363 | --  16 bits| < 0x10000 | 1110yyyy  10yxxxxx  10xxxxxx
1364 | --  21 bits|           | 11110yyy  10yyxxxx  10xxxxxx  10xxxxxx
1365 | 
1366 | -- | Unicode Code-point
1367 | --
1368 | -- Keeping it as a 'Word' is more convenient for bit-ops and FFI
1369 | newtype CP = CP Word
1370 | 
1371 | {-# INLINE ch2cp #-}
1372 | ch2cp :: Char -> CP
1373 | ch2cp (ord -> ci)
1374 |   | isSurr ci  = CP 0xFFFD
1375 |   | otherwise  = CP (fromIntegral ci)
1376 | 
1377 | {-# INLINE isSurr #-}
1378 | isSurr :: (Num i, Bits i) => i -> Bool
1379 | isSurr ci = ci .&. 0xfff800 == 0xd800
1380 | 
1381 | {-# INLINE cp2ch #-}
1382 | cp2ch :: CP -> Char
1383 | cp2ch (CP w) = (w < 0x110000) `assert` unsafeChr (fromIntegral w)
1384 | 
1385 | -- used/needed by index-lookup functions to encode out of bounds
1386 | cp2chSafe :: CP -> Maybe Char
1387 | cp2chSafe cp
1388 |   | cpNull cp = Nothing
1389 |   | otherwise = Just $! cp2ch cp
1390 |   where
1391 |     cpNull :: CP -> Bool
1392 |     cpNull (CP w) = w >= 0x110000
1393 | 
1394 | {-# INLINE cpLen #-}
1395 | cpLen :: CP -> B
1396 | cpLen (CP cp)
1397 |   | cp <    0x80  = B 1
1398 |   | cp <   0x800  = B 2
1399 |   | cp < 0x10000  = B 3
1400 |   | otherwise     = B 4
1401 | 
1402 | -- convenience wrapper; unsafe like readCodePoint
1403 | {-# INLINE decodeCharAtOfs #-}
1404 | decodeCharAtOfs :: ShortText -> B -> (Char,B)
1405 | decodeCharAtOfs st ofs = (c,ofs')
1406 |   where
1407 |     c    = cp2ch cp
1408 |     ofs' = ofs + cpLen cp
1409 |     cp   = readCodePoint st ofs
1410 | {- pure version of decodeCharAtOfs, but unfortunately significantly slower
1411 | 
1412 | decodeCharAtOfs st ofs
1413 |   | b0 < 0x80 = (cp2ch $ CP b0,ofs + B 1)
1414 |   | otherwise = case b0 `unsafeShiftR` 4 of
1415 |                   0xf -> (cp2ch $ CP go4, ofs + B 4)
1416 |                   0xe -> (cp2ch $ CP go3, ofs + B 3)
1417 |                   _   -> (cp2ch $ CP go2, ofs + B 2)
1418 |   where
1419 |     b0    = buf 0
1420 |     buf j = indexWord8Array st (ofs+j)
1421 | 
1422 |     go2 =     ((b0    .&. 0x1f) `unsafeShiftL` 6)
1423 |           .|.  (buf 1 .&. 0x3f)
1424 | 
1425 |     go3 =     ((b0    .&. 0x0f) `unsafeShiftL` (6+6))
1426 |           .|. ((buf 1 .&. 0x3f) `unsafeShiftL` 6)
1427 |           .|.  (buf 2 .&. 0x3f)
1428 | 
1429 |     go4 =     ((b0    .&. 0x07) `unsafeShiftL` (6+6+6))
1430 |           .|. ((buf 1 .&. 0x3f) `unsafeShiftL` (6+6))
1431 |           .|. ((buf 2 .&. 0x3f) `unsafeShiftL` 6)
1432 |           .|.  (buf 3 .&. 0x3f)
1433 | -}
1434 | 
1435 | 
1436 | -- | \(\mathcal{O}(1)\) Construct 'ShortText' from single codepoint.
1437 | --
1438 | -- prop> singleton c == pack [c]
1439 | --
1440 | -- prop> length (singleton c) == 1
1441 | --
1442 | -- >>> singleton 'A'
1443 | -- "A"
1444 | --
1445 | -- >>> map singleton ['\55295','\55296','\57343','\57344'] -- U+D7FF U+D800 U+DFFF U+E000
1446 | -- ["\55295","\65533","\65533","\57344"]
1447 | --
1448 | -- __Note__: This function is total because it replaces the (invalid) code-points U+D800 through U+DFFF with the replacement character U+FFFD.
1449 | --
1450 | -- @since 0.1.2
1451 | singleton :: Char -> ShortText
1452 | singleton = singleton' . ch2cp
1453 | 
1454 | singleton' :: CP -> ShortText
1455 | singleton' cp@(CP cpw)
1456 |   | cpw <    0x80  = create 1 $ \mba -> writeCodePoint1 mba 0 cp
1457 |   | cpw <   0x800  = create 2 $ \mba -> writeCodePoint2 mba 0 cp
1458 |   | cpw < 0x10000  = create 3 $ \mba -> writeCodePoint3 mba 0 cp
1459 |   | otherwise      = create 4 $ \mba -> writeCodePoint4 mba 0 cp
1460 | 
1461 | -- | \(\mathcal{O}(n)\) Prepend a character to a 'ShortText'.
1462 | --
1463 | -- prop> cons c t == singleton c <> t
1464 | --
1465 | -- @since 0.1.2
1466 | cons :: Char -> ShortText -> ShortText
1467 | cons (ch2cp -> cp@(CP cpw)) sfx
1468 |   | n == 0         = singleton' cp
1469 |   | cpw <    0x80  = create (n+1) $ \mba -> writeCodePoint1 mba 0 cp >> copySfx 1 mba
1470 |   | cpw <   0x800  = create (n+2) $ \mba -> writeCodePoint2 mba 0 cp >> copySfx 2 mba
1471 |   | cpw < 0x10000  = create (n+3) $ \mba -> writeCodePoint3 mba 0 cp >> copySfx 3 mba
1472 |   | otherwise      = create (n+4) $ \mba -> writeCodePoint4 mba 0 cp >> copySfx 4 mba
1473 |   where
1474 |     !n = toB sfx
1475 | 
1476 |     copySfx :: B -> MBA s -> ST s ()
1477 |     copySfx ofs mba = copyByteArray sfx 0 mba ofs n
1478 | 
1479 | -- | \(\mathcal{O}(n)\) Append a character to the ond of a 'ShortText'.
1480 | --
1481 | -- prop> snoc t c == t <> singleton c
1482 | --
1483 | -- @since 0.1.2
1484 | snoc :: ShortText -> Char -> ShortText
1485 | snoc pfx (ch2cp -> cp@(CP cpw))
1486 |   | n == 0         = singleton' cp
1487 |   | cpw <    0x80  = create (n+1) $ \mba -> copyPfx mba >> writeCodePoint1 mba n cp
1488 |   | cpw <   0x800  = create (n+2) $ \mba -> copyPfx mba >> writeCodePoint2 mba n cp
1489 |   | cpw < 0x10000  = create (n+3) $ \mba -> copyPfx mba >> writeCodePoint3 mba n cp
1490 |   | otherwise      = create (n+4) $ \mba -> copyPfx mba >> writeCodePoint4 mba n cp
1491 |   where
1492 |     !n = toB pfx
1493 | 
1494 |     copyPfx :: MBA s -> ST s ()
1495 |     copyPfx mba = copyByteArray pfx 0 mba 0 n
1496 | 
1497 | {-
1498 | writeCodePoint :: MBA s -> Int -> Word -> ST s ()
1499 | writeCodePoint mba ofs cp
1500 |   | cp <    0x80  = writeCodePoint1 mba ofs cp
1501 |   | cp <   0x800  = writeCodePoint2 mba ofs cp
1502 |   | cp < 0x10000  = writeCodePoint3 mba ofs cp
1503 |   | otherwise     = writeCodePoint4 mba ofs cp
1504 | -}
1505 | 
1506 | writeCodePointN :: B -> MBA s -> B -> CP -> ST s ()
1507 | writeCodePointN 1 = writeCodePoint1
1508 | writeCodePointN 2 = writeCodePoint2
1509 | writeCodePointN 3 = writeCodePoint3
1510 | writeCodePointN 4 = writeCodePoint4
1511 | writeCodePointN _ = undefined
1512 | 
1513 | writeCodePoint1 :: MBA s -> B -> CP -> ST s ()
1514 | writeCodePoint1 mba ofs (CP cp) =
1515 |   writeWord8Array mba ofs cp
1516 | 
1517 | writeCodePoint2 :: MBA s -> B -> CP -> ST s ()
1518 | writeCodePoint2 mba ofs (CP cp) = do
1519 |   writeWord8Array mba  ofs    (0xc0 .|. (cp `unsafeShiftR` 6))
1520 |   writeWord8Array mba (ofs+1) (0x80 .|. (cp                     .&. 0x3f))
1521 | 
1522 | writeCodePoint3 :: MBA s -> B -> CP -> ST s ()
1523 | writeCodePoint3 mba ofs (CP cp) = do
1524 |   writeWord8Array mba  ofs    (0xe0 .|.  (cp `unsafeShiftR` 12))
1525 |   writeWord8Array mba (ofs+1) (0x80 .|. ((cp `unsafeShiftR` 6)  .&. 0x3f))
1526 |   writeWord8Array mba (ofs+2) (0x80 .|.  (cp                    .&. 0x3f))
1527 | 
1528 | writeCodePoint4 :: MBA s -> B -> CP -> ST s ()
1529 | writeCodePoint4 mba ofs (CP cp) = do
1530 |   writeWord8Array mba  ofs    (0xf0 .|.  (cp `unsafeShiftR` 18))
1531 |   writeWord8Array mba (ofs+1) (0x80 .|. ((cp `unsafeShiftR` 12) .&. 0x3f))
1532 |   writeWord8Array mba (ofs+2) (0x80 .|. ((cp `unsafeShiftR` 6)  .&. 0x3f))
1533 |   writeWord8Array mba (ofs+3) (0x80 .|. (cp                     .&. 0x3f))
1534 | 
1535 | -- beware: UNSAFE!
1536 | readCodePoint :: ShortText -> B -> CP
1537 | readCodePoint st (csizeFromB -> ofs)
1538 |   = CP $ unsafeDupablePerformIO (c_text_short_ofs_cp (toByteArray# st) ofs)
1539 | 
1540 | foreign import ccall unsafe "hs_text_short_ofs_cp" c_text_short_ofs_cp :: ByteArray# -> CSize -> IO CCodePoint
1541 | 
1542 | readCodePointRev :: ShortText -> B -> CP
1543 | readCodePointRev st (csizeFromB -> ofs)
1544 |   = CP $ unsafeDupablePerformIO (c_text_short_ofs_cp_rev (toByteArray# st) ofs)
1545 | 
1546 | foreign import ccall unsafe "hs_text_short_ofs_cp_rev" c_text_short_ofs_cp_rev :: ByteArray# -> CSize -> IO CCodePoint
1547 | 
1548 | ----------------------------------------------------------------------------
1549 | -- string & list literals
1550 | 
1551 | -- | __Note__: Surrogate pairs (@[U+D800 .. U+DFFF]@) character literals are replaced by U+FFFD.
1552 | --
1553 | -- @since 0.1.2
1554 | instance GHC.Exts.IsList ShortText where
1555 |     type (Item ShortText) = Char
1556 |     fromList = fromString
1557 |     toList   = toString
1558 | 
1559 | -- | __Note__: Surrogate pairs (@[U+D800 .. U+DFFF]@) in string literals are replaced by U+FFFD.
1560 | --
1561 | -- This matches the behaviour of 'S.IsString' instance for 'T.Text'.
1562 | instance S.IsString ShortText where
1563 |     fromString = fromStringLit
1564 | 
1565 | -- i.e., don't inline before Phase 0
1566 | {-# INLINE [0] fromStringLit #-}
1567 | fromStringLit :: String -> ShortText
1568 | fromStringLit = fromString
1569 | 
1570 | {-# RULES "ShortText empty literal" fromStringLit "" = mempty #-}
1571 | 
1572 | -- TODO: this doesn't seem to fire
1573 | {-# RULES "ShortText singleton literal" forall c . fromStringLit [c] = singleton c #-}
1574 | 
1575 | {-# RULES "ShortText literal ASCII" forall s . fromStringLit (GHC.unpackCString# s) = fromLitAsciiAddr# s #-}
1576 | 
1577 | {-# RULES "ShortText literal UTF-8" forall s . fromStringLit (GHC.unpackCStringUtf8# s) = fromLitMUtf8Addr# s #-}
1578 | 
1579 | {-# NOINLINE fromLitAsciiAddr# #-}
1580 | fromLitAsciiAddr# :: Addr# -> ShortText
1581 | fromLitAsciiAddr# (Ptr -> ptr) = unsafeDupablePerformIO $ do
1582 |   sz <- csizeToB `fmap` c_strlen ptr
1583 | 
1584 |   case sz `compare` 0 of
1585 |     EQ -> return mempty -- should not happen if rules fire correctly
1586 |     GT -> stToIO $ do
1587 |       mba <- newByteArray sz
1588 |       copyAddrToByteArray ptr mba 0 sz
1589 |       unsafeFreeze mba
1590 |     LT -> return (error "fromLitAsciiAddr#")
1591 |           -- NOTE: should never happen unless strlen(3) overflows (NB: CSize
1592 |           -- is unsigned; the overflow would occur when converting to
1593 |           -- 'B')
1594 | 
1595 | foreign import ccall unsafe "strlen" c_strlen :: CString -> IO CSize
1596 | 
1597 | -- GHC uses an encoding resembling Modified UTF-8 for non-ASCII string-literals
1598 | {-# NOINLINE fromLitMUtf8Addr# #-}
1599 | fromLitMUtf8Addr# :: Addr# -> ShortText
1600 | fromLitMUtf8Addr# (Ptr -> ptr) = unsafeDupablePerformIO $ do
1601 |   sz <- B `fmap` c_text_short_mutf8_strlen ptr
1602 | 
1603 |   case sz `compare` 0 of
1604 |     EQ -> return mempty -- should not happen if rules fire correctly
1605 |     GT -> stToIO $ do
1606 |       mba <- newByteArray sz
1607 |       copyAddrToByteArray ptr mba 0 sz
1608 |       unsafeFreeze mba
1609 |     LT -> do
1610 |       mba <- stToIO (newByteArray (abs sz))
1611 |       c_text_short_mutf8_trans ptr (unMBA# mba)
1612 |       stToIO (unsafeFreeze mba)
1613 | 
1614 | foreign import ccall unsafe "hs_text_short_mutf8_strlen" c_text_short_mutf8_strlen :: CString -> IO Int
1615 | 
1616 | foreign import ccall unsafe "hs_text_short_mutf8_trans" c_text_short_mutf8_trans :: CString -> MutableByteArray# RealWorld -> IO ()
1617 | 
1618 | -- $setup
1619 | -- >>> :set -XOverloadedStrings
1620 | -- >>> import Data.Text.Short (pack, unpack, concat)
1621 | -- >>> import Text.Show.Functions ()
1622 | -- >>> import qualified Test.QuickCheck.Arbitrary as QC
1623 | -- >>> import Test.QuickCheck.Instances ()
1624 | -- >>> instance QC.Arbitrary ShortText where { arbitrary = fmap fromString QC.arbitrary }
1625 | 


--------------------------------------------------------------------------------