├── .github
    ├── CODEOWNERS
    └── workflows
    │   ├── release.yaml
    │   └── build.yaml
├── cabal.project
├── src-ghc-cstrlen
    └── Cstrlen.hs
├── src
    └── Data
    │   ├── Bytes
    │       ├── Text
    │       │   ├── Windows1252.hs
    │       │   ├── Utf8.hs
    │       │   ├── Ascii.hs
    │       │   ├── AsciiExt.hs
    │       │   └── Latin1.hs
    │       ├── Indexed.hs
    │       ├── Internal
    │       │   └── Show.hs
    │       ├── IO.hs
    │       ├── Types.hs
    │       ├── Encode
    │       │   ├── BigEndian.hs
    │       │   └── LittleEndian.hs
    │       ├── Mutable.hs
    │       ├── Internal.hs
    │       ├── Search.hs
    │       ├── Byte.hs
    │       ├── Chunks.hs
    │       └── Pure.hs
    │   └── Bytes.hs
├── include
    └── bs_custom.h
├── src-no-ghc-cstrlen
    └── Cstrlen.hs
├── .gitignore
├── src-new-reps
    └── Reps.hs
├── src-old-reps
    └── Reps.hs
├── README.md
├── LICENSE
├── cbits
    └── bs_custom.c
├── fourmolu.yaml
├── bench
    └── Main.hs
├── byteslice.cabal
├── CHANGELOG.md
└── test
    └── Main.hs


/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | @byteverse/l3c
2 | 


--------------------------------------------------------------------------------
/cabal.project:
--------------------------------------------------------------------------------
1 | -- 2024-02-06: Including this to make "cabal test" work on build machine.
2 | packages: .
3 | tests: True
4 | 


--------------------------------------------------------------------------------
/src-ghc-cstrlen/Cstrlen.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE MagicHash #-}
2 | 
3 | module Cstrlen
4 |   ( cstringLength#
5 |   ) where
6 | 
7 | import GHC.Exts (cstringLength#)
8 | 


--------------------------------------------------------------------------------
/src/Data/Bytes/Text/Windows1252.hs:
--------------------------------------------------------------------------------
1 | {- | Placeholder module in case there is demand for treating 'Bytes' as
2 | Windows-1252-encoded text
3 | -}
4 | module Data.Bytes.Text.Windows1252 () where
5 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yaml:
--------------------------------------------------------------------------------
 1 | name: release
 2 | on:
 3 |   push:
 4 |     tags:
 5 |       - "*"
 6 | 
 7 | jobs:
 8 |   call-workflow:
 9 |     uses: byteverse/.github/.github/workflows/release.yaml@main
10 |     secrets: inherit
11 | 


--------------------------------------------------------------------------------
/.github/workflows/build.yaml:
--------------------------------------------------------------------------------
 1 | name: build
 2 | on:
 3 |   pull_request:
 4 |     branches:
 5 |       - "*"
 6 | 
 7 | jobs:
 8 |   call-workflow:
 9 |     uses: byteverse/.github/.github/workflows/build-matrix.yaml@main
10 |     with:
11 |       cabal-file: byteslice.cabal
12 | 


--------------------------------------------------------------------------------
/include/bs_custom.h:
--------------------------------------------------------------------------------
1 | #define _GNU_SOURCE
2 | 
3 | #include <string.h>
4 | #include "Rts.h"
5 | 
6 | HsInt memchr_ba_many(unsigned char *p, HsInt off, HsInt len, HsInt *sizes, HsInt sizesLen, unsigned char w);
7 | HsInt count_ba(unsigned char *p, HsInt off, HsInt len, unsigned char w);
8 | 


--------------------------------------------------------------------------------
/src-no-ghc-cstrlen/Cstrlen.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE MagicHash #-}
 2 | {-# LANGUAGE TypeFamilies #-}
 3 | {-# LANGUAGE UnliftedFFITypes #-}
 4 | 
 5 | module Cstrlen
 6 |   ( cstringLength#
 7 |   ) where
 8 | 
 9 | import GHC.Exts (Addr#, Int#)
10 | 
11 | foreign import ccall unsafe "strlen" c_strlen :: Addr# -> Int#
12 | 
13 | cstringLength# :: Addr# -> Int#
14 | cstringLength# = c_strlen
15 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .vscode/
 2 | dist
 3 | dist-*
 4 | cabal-dev
 5 | *.o
 6 | *.hi
 7 | *.chi
 8 | *.chs.h
 9 | *.dyn_o
10 | *.dyn_hi
11 | .hpc
12 | .hsenv
13 | .cabal-sandbox/
14 | cabal.sandbox.config
15 | *.prof
16 | *.aux
17 | *.hp
18 | *.eventlog
19 | .stack-work/
20 | cabal.project.local
21 | cabal.project.local~
22 | .HTF/
23 | .ghc.environment.*
24 | stack.yaml
25 | *.swm
26 | *.swo
27 | *.swp
28 | test_results/**
29 | 


--------------------------------------------------------------------------------
/src-new-reps/Reps.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE DataKinds #-}
 2 | {-# LANGUAGE GADTSyntax #-}
 3 | {-# LANGUAGE KindSignatures #-}
 4 | {-# LANGUAGE MagicHash #-}
 5 | {-# LANGUAGE UnboxedTuples #-}
 6 | {-# LANGUAGE UnliftedNewtypes #-}
 7 | 
 8 | module Reps
 9 |   ( Bytes# (..)
10 |   , word8ToWord#
11 |   ) where
12 | 
13 | import GHC.Exts (ByteArray#, Int#, Levity (Unlifted), RuntimeRep (..), TYPE, word8ToWord#)
14 | 
15 | newtype Bytes# :: TYPE ('TupleRep '[ 'BoxedRep 'Unlifted, 'IntRep, 'IntRep]) where
16 |   Bytes# :: (# ByteArray#, Int#, Int# #) -> Bytes#
17 | 


--------------------------------------------------------------------------------
/src-old-reps/Reps.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE GADTSyntax #-}
 2 | {-# LANGUAGE KindSignatures #-}
 3 | {-# LANGUAGE MagicHash #-}
 4 | {-# LANGUAGE TypeInType #-}
 5 | {-# LANGUAGE UnboxedTuples #-}
 6 | {-# LANGUAGE UnliftedNewtypes #-}
 7 | 
 8 | module Reps
 9 |   ( Bytes# (..)
10 |   , word8ToWord#
11 |   ) where
12 | 
13 | import GHC.Exts (ByteArray#, Int#, RuntimeRep (..), TYPE, Word#)
14 | 
15 | newtype Bytes# :: TYPE ('TupleRep '[ 'UnliftedRep, 'IntRep, 'IntRep]) where
16 |   Bytes# :: (# ByteArray#, Int#, Int# #) -> Bytes#
17 | 
18 | -- In GHC 9.2, the lifted Word8 type started being backed by the
19 | -- unlifted Word8# instead of by Word#. This is a compatibility hack.
20 | word8ToWord# :: Word# -> Word#
21 | {-# INLINE word8ToWord# #-}
22 | word8ToWord# w = w
23 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # byteslice
 2 | 
 3 | ## Purpose
 4 | 
 5 | Types for dealing with slices of 'ByteArray' and 'MutableByteArray'.
 6 | These are never supposed to introduce overhead. Rather, they exist
 7 | to clarify intent in type signatures.
 8 | 
 9 |     receive ::
10 |          Resource -- ^ Some scarce resource
11 |       -> MutableByteArray RealWorld -- ^ Buffer
12 |       -> Int -- ^ Offset
13 |       -> Int -- ^ Length
14 |       -> IO ()
15 | 
16 | With this library, we instead write
17 | 
18 |     receive ::
19 |          Resource -- ^ Some scarce resource
20 |       -> MutableBytes RealWorld -- ^ Buffer
21 |       -> IO ()
22 | 
23 | The combination of the worker-wrapper transformation and inlining means
24 | that we can expect these two to end up generating the same code in most
25 | situations.
26 | 


--------------------------------------------------------------------------------
/src/Data/Bytes/Indexed.hs:
--------------------------------------------------------------------------------
 1 | {-# language DataKinds #-}
 2 | {-# language MagicHash #-}
 3 | {-# language TypeOperators #-}
 4 | 
 5 | module Data.Bytes.Indexed
 6 |   ( ByteArrayN
 7 |   , append
 8 |   , length
 9 |   , length#
10 |   ) where
11 | 
12 | import Prelude hiding (length)
13 | 
14 | import Data.Primitive (ByteArray(ByteArray))
15 | import Data.Bytes.Types (ByteArrayN(ByteArrayN))
16 | import GHC.TypeNats (type (+))
17 | import Arithmetic.Types (Nat, Nat#)
18 | 
19 | import qualified Data.Primitive as PM
20 | import qualified Arithmetic.Unsafe as Unsafe
21 | import qualified GHC.Exts as Exts
22 | 
23 | append :: ByteArrayN m -> ByteArrayN n -> ByteArrayN (m + n)
24 | {-# inline append #-}
25 | append (ByteArrayN x) (ByteArrayN y) = ByteArrayN (x <> y)
26 | 
27 | -- | Recover a witness of the length.
28 | length :: ByteArrayN n -> Nat n
29 | {-# inline length #-}
30 | length (ByteArrayN x) = Unsafe.Nat (PM.sizeofByteArray x)
31 | 
32 | -- | Recover an unboxed witness of the length.
33 | length# :: ByteArrayN n -> Nat# n
34 | {-# inline length# #-}
35 | length# (ByteArrayN (ByteArray x)) = Unsafe.Nat# (Exts.sizeofByteArray# x)
36 | 


--------------------------------------------------------------------------------
/src/Data/Bytes/Internal/Show.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE BangPatterns #-}
 2 | 
 3 | module Data.Bytes.Internal.Show
 4 |   ( showsSlice
 5 |   ) where
 6 | 
 7 | import Data.Bits (unsafeShiftR, (.&.))
 8 | import Data.Char (ord)
 9 | import Data.Primitive (ByteArray)
10 | import Data.Word (Word8)
11 | import GHC.Base (unsafeChr)
12 | 
13 | import qualified Data.Primitive as PM
14 | 
15 | showsSlice :: ByteArray -> Int -> Int -> String -> String
16 | showsSlice arr off len s =
17 |   if len == 0
18 |     then showString "[]" s
19 |     else
20 |       showString "[0x" $
21 |         showHexDigitsWord8 (PM.indexByteArray arr off) $
22 |           showHexLoop (off + 1) (len - 1) arr $
23 |             showChar ']' $
24 |               s
25 | 
26 | showHexLoop :: Int -> Int -> ByteArray -> String -> String
27 | showHexLoop !ix !len !arr s =
28 |   if len > 0
29 |     then ',' : '0' : 'x' : showHexDigitsWord8 (PM.indexByteArray arr ix) (showHexLoop (ix + 1) (len - 1) arr s)
30 |     else s
31 | 
32 | showHexDigitsWord8 :: Word8 -> String -> String
33 | showHexDigitsWord8 !w s = word4ToChar (unsafeShiftR w 4) : word4ToChar (0x0F .&. w) : s
34 | 
35 | word4ToChar :: Word8 -> Char
36 | word4ToChar w =
37 |   if w < 10
38 |     then unsafeChr (ord '0' + fromIntegral w)
39 |     else unsafeChr (ord 'a' + (fromIntegral w) - 10)
40 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2019, Andrew Martin
 2 | 
 3 | All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions are met:
 7 | 
 8 |     * Redistributions of source code must retain the above copyright
 9 |       notice, this list of conditions and the following disclaimer.
10 | 
11 |     * Redistributions in binary form must reproduce the above
12 |       copyright notice, this list of conditions and the following
13 |       disclaimer in the documentation and/or other materials provided
14 |       with the distribution.
15 | 
16 |     * Neither the name of Andrew Martin nor the names of other
17 |       contributors may be used to endorse or promote products derived
18 |       from this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | 


--------------------------------------------------------------------------------
/cbits/bs_custom.c:
--------------------------------------------------------------------------------
 1 | #define _GNU_SOURCE
 2 | 
 3 | #include <bs_custom.h>
 4 | #include <string.h>
 5 | #include "Rts.h"
 6 | 
 7 | // Find all occurrences of a byte, writing the lengths of each piece
 8 | // to the sizes buffer. This uses rawmemchr, so the total number of
 9 | // occurrences of the delimiter must be computed in advance. This
10 | // returns the total length of all pieces processed.
11 | //
12 | // Portability: On Linux, this uses rawmemchr. On all other platforms,
13 | // it uses memchr. On Linux, the length of the byte sequence is not
14 | // used. On other platforms, this is repeatedly decremented to provide
15 | // an appropriate third argument for memchr.
16 | HsInt memchr_ba_many(unsigned char *p, HsInt off, HsInt len, HsInt *sizes, HsInt sizesLen, unsigned char w) {
17 |   HsInt szIx, delta, total;
18 |   unsigned char* pos;
19 |   p = p + off;
20 |   total = 0;
21 |   for (szIx = 0; szIx < sizesLen; ++szIx) {
22 | #if defined(__linux__) && !AVOID_RAWMEMCHR
23 |     pos = (unsigned char*)(rawmemchr((void*)p,(int)w));
24 |     delta = (HsInt)(pos - p);
25 | #else
26 |     pos = (unsigned char*)(memchr((void*)p,(int)w,(size_t)len));
27 |     delta = (HsInt)(pos - p);
28 |     len = len - (delta + 1);
29 | #endif
30 |     sizes[szIx] = delta;
31 |     total = total + delta + 1;
32 |     p = pos + 1;
33 |   }
34 |   return total;
35 | }
36 | 
37 | // TODO: Possibly use SIMD in here. Or check so see if gcc optimizes
38 | // this on its own.
39 | HsInt count_ba(unsigned char *p, HsInt off, HsInt len, unsigned char w) {
40 |   HsInt c;
41 |   p = p + off;
42 |   for (c = 0; len-- != 0; ++p)
43 |       if (*p == w)
44 |           ++c;
45 |   return c;
46 | }
47 | 


--------------------------------------------------------------------------------
/fourmolu.yaml:
--------------------------------------------------------------------------------
 1 | # Number of spaces per indentation step
 2 | indentation: 2
 3 | 
 4 | # Max line length for automatic line breaking
 5 | column-limit: 200
 6 | 
 7 | # Styling of arrows in type signatures (choices: trailing, leading, or leading-args)
 8 | function-arrows: trailing
 9 | 
10 | # How to place commas in multi-line lists, records, etc. (choices: leading or trailing)
11 | comma-style: leading
12 | 
13 | # Styling of import/export lists (choices: leading, trailing, or diff-friendly)
14 | import-export-style: leading
15 | 
16 | # Whether to full-indent or half-indent 'where' bindings past the preceding body
17 | indent-wheres: false
18 | 
19 | # Whether to leave a space before an opening record brace
20 | record-brace-space: true
21 | 
22 | # Number of spaces between top-level declarations
23 | newlines-between-decls: 1
24 | 
25 | # How to print Haddock comments (choices: single-line, multi-line, or multi-line-compact)
26 | haddock-style: multi-line
27 | 
28 | # How to print module docstring
29 | haddock-style-module: null
30 | 
31 | # Styling of let blocks (choices: auto, inline, newline, or mixed)
32 | let-style: auto
33 | 
34 | # How to align the 'in' keyword with respect to the 'let' keyword (choices: left-align, right-align, or no-space)
35 | in-style: right-align
36 | 
37 | # Whether to put parentheses around a single constraint (choices: auto, always, or never)
38 | single-constraint-parens: always
39 | 
40 | # Output Unicode syntax (choices: detect, always, or never)
41 | unicode: never
42 | 
43 | # Give the programmer more choice on where to insert blank lines
44 | respectful: true
45 | 
46 | # Fixity information for operators
47 | fixities: []
48 | 
49 | # Module reexports Fourmolu should know about
50 | reexports: []
51 | 
52 | 


--------------------------------------------------------------------------------
/src/Data/Bytes/IO.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE BlockArguments #-}
 2 | {-# LANGUAGE MagicHash #-}
 3 | {-# LANGUAGE TypeApplications #-}
 4 | {-# LANGUAGE UnboxedTuples #-}
 5 | 
 6 | module Data.Bytes.IO
 7 |   ( hGet
 8 |   , hPut
 9 |   ) where
10 | 
11 | import Data.Bytes.Pure (contents, pin)
12 | import Data.Bytes.Types (Bytes (Bytes))
13 | import Data.Primitive (ByteArray (..), MutableByteArray)
14 | import qualified Data.Primitive as PM
15 | import Data.Word (Word8)
16 | import Foreign.Ptr (Ptr)
17 | import qualified GHC.Exts as Exts
18 | import GHC.IO (IO (IO))
19 | import System.IO (Handle)
20 | import qualified System.IO as IO
21 | 
22 | {- | Read 'Bytes' directly from the specified 'Handle'. The resulting
23 | 'Bytes' are pinned. This is implemented with 'IO.hGetBuf'.
24 | -}
25 | hGet :: Handle -> Int -> IO Bytes
26 | hGet h i = createPinnedAndTrim i (\p -> IO.hGetBuf h p i)
27 | 
28 | {- | Outputs 'Bytes' to the specified 'Handle'. This is implemented
29 | with 'IO.hPutBuf'.
30 | -}
31 | hPut :: Handle -> Bytes -> IO ()
32 | hPut h b0 = do
33 |   let b1@(Bytes arr _ len) = pin b0
34 |   IO.hPutBuf h (contents b1) len
35 |   touchByteArrayIO arr
36 | 
37 | -- Only used internally.
38 | createPinnedAndTrim :: Int -> (Ptr Word8 -> IO Int) -> IO Bytes
39 | {-# INLINE createPinnedAndTrim #-}
40 | createPinnedAndTrim maxSz f = do
41 |   arr@(PM.MutableByteArray arr#) <- PM.newPinnedByteArray maxSz
42 |   sz <- f (PM.mutableByteArrayContents arr)
43 |   touchMutableByteArrayIO arr
44 |   PM.shrinkMutablePrimArray (PM.MutablePrimArray @Exts.RealWorld @Word8 arr#) sz
45 |   r <- PM.unsafeFreezeByteArray arr
46 |   pure (Bytes r 0 sz)
47 | 
48 | touchMutableByteArrayIO :: MutableByteArray s -> IO ()
49 | touchMutableByteArrayIO (PM.MutableByteArray x) =
50 |   IO (\s -> (# Exts.touch# x s, () #))
51 | 
52 | touchByteArrayIO :: ByteArray -> IO ()
53 | touchByteArrayIO (ByteArray x) =
54 |   IO (\s -> (# Exts.touch# x s, () #))
55 | 


--------------------------------------------------------------------------------
/bench/Main.hs:
--------------------------------------------------------------------------------
 1 | import Gauge.Main (bench, bgroup, defaultMain, whnf)
 2 | 
 3 | import Data.Bytes.Types
 4 | import Data.List (permutations)
 5 | 
 6 | import qualified Data.Bytes as Bytes
 7 | import qualified Data.Bytes.Text.Ascii as Ascii
 8 | import qualified Data.Bytes.Text.Utf8 as Utf8
 9 | 
10 | naiveMconcat :: [Bytes] -> Bytes
11 | naiveMconcat = foldr mappend mempty
12 | 
13 | main :: IO ()
14 | main =
15 |   defaultMain
16 |     [ bench "mconcat" $ whnf mconcat mconcatBytes
17 |     , bench "naiveMconcat" $ whnf naiveMconcat mconcatBytes
18 |     , bgroup
19 |         "replace"
20 |         [ bench "the-dog-and-the-shadow" (whnf replaceMeat theDogAndTheShadow)
21 |         ]
22 |     , bgroup
23 |         "ascii"
24 |         [ bench "toText" (whnf Ascii.toText theDogAndTheShadow)
25 |         ]
26 |     , bgroup
27 |         "utf8"
28 |         [ bench "toText" (whnf Utf8.toText theDogAndTheShadow)
29 |         ]
30 |     ]
31 | 
32 | mconcatBytes :: [Bytes]
33 | mconcatBytes = fmap Ascii.fromString $ permutations ['a' .. 'g']
34 | 
35 | replaceMeat :: Bytes -> Bytes
36 | {-# NOINLINE replaceMeat #-}
37 | replaceMeat x = Bytes.replace meat gruel x
38 | 
39 | meat :: Bytes
40 | meat = Ascii.fromString "meat"
41 | 
42 | gruel :: Bytes
43 | gruel = Ascii.fromString "gruel"
44 | 
45 | theDogAndTheShadow :: Bytes
46 | theDogAndTheShadow =
47 |   Ascii.fromString $
48 |     concat
49 |       [ "It happened that a Dog had got a piece of meat and was "
50 |       , "carrying it home in his mouth to eat it in peace. "
51 |       , "Now on his way home he had to cross a plank lying across "
52 |       , "a running brook. As he crossed, he looked down and saw his "
53 |       , "own shadow reflected in the water beneath. Thinking it was "
54 |       , "another dog with another piece of meat, he made up his mind "
55 |       , "to have that also. So he made a snap at the shadow in the water, "
56 |       , "but as he opened his mouth the piece of meat fell out, dropped "
57 |       , "into the water and was never seen more."
58 |       ]
59 | 


--------------------------------------------------------------------------------
/src/Data/Bytes/Text/Utf8.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE BangPatterns #-}
 2 | {-# LANGUAGE CPP #-}
 3 | 
 4 | -- | Convert 'Bytes' to and from 'Text' and 'ShortText'.
 5 | module Data.Bytes.Text.Utf8
 6 |   ( fromShortText
 7 |   , toShortText
 8 |   , fromText
 9 |   , toText
10 |   ) where
11 | 
12 | import Data.Bytes.Types (Bytes (Bytes))
13 | import Data.Primitive (ByteArray (ByteArray))
14 | import Data.Text (Text)
15 | import Data.Text.Short (ShortText)
16 | 
17 | import qualified Data.Bytes as Bytes
18 | import qualified Data.Text.Array as A
19 | import qualified Data.Text.Internal as I
20 | import qualified Data.Text.Short as TS
21 | 
22 | import qualified Data.Text.Internal.Validate
23 | 
24 | {- | Encode 'ShortText' using UTF-8. Since 'ShortText' is backed by a UTF-8
25 | byte sequence, this does not perform a copy.
26 | -}
27 | fromShortText :: ShortText -> Bytes
28 | {-# INLINE fromShortText #-}
29 | fromShortText = Bytes.fromShortByteString . TS.toShortByteString
30 | 
31 | {- | Attempt to interpret the byte sequence as UTF-8 encoded text. Returns
32 | 'Nothing' if the bytes are not UTF-8 encoded text.
33 | -}
34 | toShortText :: Bytes -> Maybe ShortText
35 | {-# INLINE toShortText #-}
36 | toShortText !b = TS.fromShortByteString (Bytes.toShortByteString b)
37 | 
38 | -- | Encode 'Text' using @UTF-8@. Only available when building with
39 | -- @text-2.0@ and newer. Since 'Text' is backed by a UTF-8
40 | -- byte sequence, this does not perform a copy.
41 | fromText :: Text -> Bytes
42 | {-# inline fromText #-}
43 | fromText (I.Text (A.ByteArray b) off len) = Bytes (ByteArray b) off len
44 | 
45 | -- | Attempt to interpret byte sequence as @UTF-8@ encoded 'Text'.
46 | -- Only available when building with @text-2.1@ and newer. Since
47 | -- 'Text' is backed by a UTF-8 byte sequence, this does not perform a
48 | -- copy.
49 | toText :: Bytes -> Maybe Text
50 | {-# inline toText #-}
51 | toText (Bytes b@(ByteArray b0) off len) =
52 |   if Data.Text.Internal.Validate.isValidUtf8ByteArray b off len
53 |     then Just (I.Text (A.ByteArray b0) off len)
54 |     else Nothing
55 | 


--------------------------------------------------------------------------------
/src/Data/Bytes/Types.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE DataKinds #-}
 2 | {-# LANGUAGE DuplicateRecordFields #-}
 3 | {-# LANGUAGE MagicHash #-}
 4 | {-# LANGUAGE PolyKinds #-}
 5 | {-# LANGUAGE ScopedTypeVariables #-}
 6 | {-# LANGUAGE TypeFamilies #-}
 7 | 
 8 | module Data.Bytes.Types
 9 |   ( Bytes (..)
10 |   , Bytes# (..)
11 |   , MutableBytes (..)
12 |   , UnmanagedBytes (..)
13 |   , BytesN (..)
14 |   , ByteArrayN (..)
15 |   ) where
16 | 
17 | import Data.Bytes.Internal (Bytes (..))
18 | import Data.Bytes.Internal.Show (showsSlice)
19 | import Data.Primitive (ByteArray (..), MutableByteArray (..))
20 | import Data.Primitive.Addr (Addr)
21 | import Data.Proxy (Proxy (Proxy))
22 | import GHC.Natural (naturalToInteger)
23 | import GHC.TypeNats (KnownNat, Nat, natVal)
24 | import Reps (Bytes# (..))
25 | 
26 | {- | A slice of a 'ByteArray' whose compile-time-known length is represented
27 | by a phantom type variable. Consumers of this data constructor must be
28 | careful to preserve the expected invariant.
29 | -}
30 | data BytesN (n :: Nat) = BytesN
31 |   { array :: {-# UNPACK #-} !ByteArray
32 |   , offset :: {-# UNPACK #-} !Int
33 |   }
34 | 
35 | instance (KnownNat n) => Show (BytesN n) where
36 |   showsPrec _ (BytesN arr off) s =
37 |     let len = fromInteger (naturalToInteger (natVal (Proxy :: Proxy n)))
38 |      in showsSlice arr off len s
39 | 
40 | {- | A 'ByteArray' whose compile-time-known length is represented
41 | by a phantom type variable. Consumers of this data constructor must be
42 | careful to preserve the expected invariant.
43 | -}
44 | newtype ByteArrayN (n :: Nat) = ByteArrayN
45 |   { array :: ByteArray
46 |   }
47 | 
48 | instance (KnownNat n) => Show (ByteArrayN n) where
49 |   showsPrec _ (ByteArrayN arr) s =
50 |     let len = fromInteger (naturalToInteger (natVal (Proxy :: Proxy n)))
51 |      in showsSlice arr 0 len s
52 | 
53 | -- | A slice of a 'MutableByteArray'.
54 | data MutableBytes s = MutableBytes
55 |   { array :: {-# UNPACK #-} !(MutableByteArray s)
56 |   , offset :: {-# UNPACK #-} !Int
57 |   , length :: {-# UNPACK #-} !Int
58 |   }
59 | 
60 | -- | A slice of unmanaged memory.
61 | data UnmanagedBytes = UnmanagedBytes
62 |   { address :: {-# UNPACK #-} !Addr
63 |   , length :: {-# UNPACK #-} !Int
64 |   }
65 | 


--------------------------------------------------------------------------------
/src/Data/Bytes/Encode/BigEndian.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE BangPatterns #-}
 2 | {-# LANGUAGE TypeApplications #-}
 3 | 
 4 | module Data.Bytes.Encode.BigEndian
 5 |   ( word16
 6 |   , word32
 7 |   , word64
 8 |   , int16
 9 |   , int32
10 |   , int64
11 |   ) where
12 | 
13 | import Control.Monad.ST.Run (runByteArrayST)
14 | import Data.Bits (unsafeShiftR)
15 | import Data.Bytes.Types (Bytes)
16 | import Data.Int (Int16, Int32, Int64)
17 | import Data.Primitive (ByteArray)
18 | import Data.Word (Word16, Word32, Word64, Word8)
19 | 
20 | import qualified Data.Bytes.Pure as Pure
21 | import qualified Data.Primitive as PM
22 | 
23 | -- | Encode a 32-bit signed integer as 4 bytes.
24 | int32 :: Int32 -> Bytes
25 | {-# INLINE int32 #-}
26 | int32 = word32 . fromIntegral @Int32 @Word32
27 | 
28 | -- | Encode a 32-bit unsigned integer as 4 bytes.
29 | word32 :: Word32 -> Bytes
30 | word32 !w = Pure.fromByteArray (word32U w)
31 | 
32 | word32U :: Word32 -> ByteArray
33 | word32U !w = runByteArrayST $ do
34 |   arr <- PM.newByteArray 4
35 |   PM.writeByteArray arr 0 (fromIntegral @Word32 @Word8 (unsafeShiftR w 24))
36 |   PM.writeByteArray arr 1 (fromIntegral @Word32 @Word8 (unsafeShiftR w 16))
37 |   PM.writeByteArray arr 2 (fromIntegral @Word32 @Word8 (unsafeShiftR w 8))
38 |   PM.writeByteArray arr 3 (fromIntegral @Word32 @Word8 w)
39 |   PM.unsafeFreezeByteArray arr
40 | 
41 | -- | Encode a 16-bit signed integer as 4 bytes.
42 | int16 :: Int16 -> Bytes
43 | {-# INLINE int16 #-}
44 | int16 = word16 . fromIntegral @Int16 @Word16
45 | 
46 | -- | Encode a 16-bit unsigned integer as 4 bytes.
47 | word16 :: Word16 -> Bytes
48 | word16 !w = Pure.fromByteArray (word16U w)
49 | 
50 | word16U :: Word16 -> ByteArray
51 | word16U !w = runByteArrayST $ do
52 |   arr <- PM.newByteArray 2
53 |   PM.writeByteArray arr 0 (fromIntegral @Word16 @Word8 (unsafeShiftR w 8))
54 |   PM.writeByteArray arr 1 (fromIntegral @Word16 @Word8 w)
55 |   PM.unsafeFreezeByteArray arr
56 | 
57 | -- | Encode a 16-bit signed integer as 4 bytes.
58 | int64 :: Int64 -> Bytes
59 | {-# INLINE int64 #-}
60 | int64 = word64 . fromIntegral @Int64 @Word64
61 | 
62 | -- | Encode a 16-bit unsigned integer as 4 bytes.
63 | word64 :: Word64 -> Bytes
64 | word64 !w = Pure.fromByteArray (word64U w)
65 | 
66 | word64U :: Word64 -> ByteArray
67 | word64U !w = runByteArrayST $ do
68 |   arr <- PM.newByteArray 8
69 |   PM.writeByteArray arr 0 (fromIntegral @Word64 @Word8 (unsafeShiftR w 56))
70 |   PM.writeByteArray arr 1 (fromIntegral @Word64 @Word8 (unsafeShiftR w 48))
71 |   PM.writeByteArray arr 2 (fromIntegral @Word64 @Word8 (unsafeShiftR w 40))
72 |   PM.writeByteArray arr 3 (fromIntegral @Word64 @Word8 (unsafeShiftR w 32))
73 |   PM.writeByteArray arr 4 (fromIntegral @Word64 @Word8 (unsafeShiftR w 24))
74 |   PM.writeByteArray arr 5 (fromIntegral @Word64 @Word8 (unsafeShiftR w 16))
75 |   PM.writeByteArray arr 6 (fromIntegral @Word64 @Word8 (unsafeShiftR w 8))
76 |   PM.writeByteArray arr 7 (fromIntegral @Word64 @Word8 w)
77 |   PM.unsafeFreezeByteArray arr
78 | 


--------------------------------------------------------------------------------
/src/Data/Bytes/Encode/LittleEndian.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE BangPatterns #-}
 2 | {-# LANGUAGE TypeApplications #-}
 3 | 
 4 | module Data.Bytes.Encode.LittleEndian
 5 |   ( word16
 6 |   , word32
 7 |   , word64
 8 |   , int16
 9 |   , int32
10 |   , int64
11 |   ) where
12 | 
13 | import Control.Monad.ST.Run (runByteArrayST)
14 | import Data.Bits (unsafeShiftR)
15 | import Data.Bytes.Types (Bytes)
16 | import Data.Int (Int16, Int32, Int64)
17 | import Data.Primitive (ByteArray)
18 | import Data.Word (Word16, Word32, Word64, Word8)
19 | 
20 | import qualified Data.Bytes.Pure as Pure
21 | import qualified Data.Primitive as PM
22 | 
23 | -- | Encode a 32-bit signed integer as 4 bytes.
24 | int32 :: Int32 -> Bytes
25 | {-# INLINE int32 #-}
26 | int32 = word32 . fromIntegral @Int32 @Word32
27 | 
28 | -- | Encode a 32-bit unsigned integer as 4 bytes.
29 | word32 :: Word32 -> Bytes
30 | word32 !w = Pure.fromByteArray (word32U w)
31 | 
32 | word32U :: Word32 -> ByteArray
33 | word32U !w = runByteArrayST $ do
34 |   arr <- PM.newByteArray 4
35 |   PM.writeByteArray arr 3 (fromIntegral @Word32 @Word8 (unsafeShiftR w 24))
36 |   PM.writeByteArray arr 2 (fromIntegral @Word32 @Word8 (unsafeShiftR w 16))
37 |   PM.writeByteArray arr 1 (fromIntegral @Word32 @Word8 (unsafeShiftR w 8))
38 |   PM.writeByteArray arr 0 (fromIntegral @Word32 @Word8 w)
39 |   PM.unsafeFreezeByteArray arr
40 | 
41 | -- | Encode a 16-bit signed integer as 4 bytes.
42 | int16 :: Int16 -> Bytes
43 | {-# INLINE int16 #-}
44 | int16 = word16 . fromIntegral @Int16 @Word16
45 | 
46 | -- | Encode a 16-bit unsigned integer as 4 bytes.
47 | word16 :: Word16 -> Bytes
48 | word16 !w = Pure.fromByteArray (word16U w)
49 | 
50 | word16U :: Word16 -> ByteArray
51 | word16U !w = runByteArrayST $ do
52 |   arr <- PM.newByteArray 2
53 |   PM.writeByteArray arr 1 (fromIntegral @Word16 @Word8 (unsafeShiftR w 8))
54 |   PM.writeByteArray arr 0 (fromIntegral @Word16 @Word8 w)
55 |   PM.unsafeFreezeByteArray arr
56 | 
57 | -- | Encode a 16-bit signed integer as 4 bytes.
58 | int64 :: Int64 -> Bytes
59 | {-# INLINE int64 #-}
60 | int64 = word64 . fromIntegral @Int64 @Word64
61 | 
62 | -- | Encode a 16-bit unsigned integer as 4 bytes.
63 | word64 :: Word64 -> Bytes
64 | word64 !w = Pure.fromByteArray (word64U w)
65 | 
66 | word64U :: Word64 -> ByteArray
67 | word64U !w = runByteArrayST $ do
68 |   arr <- PM.newByteArray 8
69 |   PM.writeByteArray arr 7 (fromIntegral @Word64 @Word8 (unsafeShiftR w 56))
70 |   PM.writeByteArray arr 6 (fromIntegral @Word64 @Word8 (unsafeShiftR w 48))
71 |   PM.writeByteArray arr 5 (fromIntegral @Word64 @Word8 (unsafeShiftR w 40))
72 |   PM.writeByteArray arr 4 (fromIntegral @Word64 @Word8 (unsafeShiftR w 32))
73 |   PM.writeByteArray arr 3 (fromIntegral @Word64 @Word8 (unsafeShiftR w 24))
74 |   PM.writeByteArray arr 2 (fromIntegral @Word64 @Word8 (unsafeShiftR w 16))
75 |   PM.writeByteArray arr 1 (fromIntegral @Word64 @Word8 (unsafeShiftR w 8))
76 |   PM.writeByteArray arr 0 (fromIntegral @Word64 @Word8 w)
77 |   PM.unsafeFreezeByteArray arr
78 | 


--------------------------------------------------------------------------------
/src/Data/Bytes/Mutable.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns #-}
  2 | {-# LANGUAGE LambdaCase #-}
  3 | 
  4 | {- | If you are interested in sub-arrays of 'MutableByteArray's (e.g. writing
  5 | quicksort), it would be grossly inefficient to make a copy of the sub-array.
  6 | On the other hand, it'd be really annoying to track limit indices by hand.
  7 | 
  8 | This module defines the 'MutableBytes' type which exposes a standard array
  9 | interface for a sub-arrays without copying and without manual index
 10 | manipulation. For immutable arrays, see 'Data.Bytes'.
 11 | -}
 12 | module Data.Bytes.Mutable
 13 |   ( -- * Types
 14 |     MutableBytes
 15 | 
 16 |     -- * Filtering
 17 |   , takeWhile
 18 |   , dropWhile
 19 | 
 20 |     -- * Unsafe Slicing
 21 |   , unsafeTake
 22 |   , unsafeDrop
 23 | 
 24 |     -- * Conversion
 25 |   , fromMutableByteArray
 26 |   ) where
 27 | 
 28 | import Prelude hiding (dropWhile, takeWhile)
 29 | 
 30 | import Control.Monad.Primitive (PrimMonad, PrimState)
 31 | import Data.Bytes.Types (MutableBytes (MutableBytes))
 32 | import Data.Primitive (MutableByteArray)
 33 | import Data.Word (Word8)
 34 | 
 35 | import qualified Data.Primitive as PM
 36 | 
 37 | {- | Take bytes while the predicate is true, aliasing the
 38 | argument array.
 39 | -}
 40 | takeWhile ::
 41 |   (PrimMonad m) =>
 42 |   (Word8 -> m Bool) ->
 43 |   MutableBytes (PrimState m) ->
 44 |   m (MutableBytes (PrimState m))
 45 | {-# INLINE takeWhile #-}
 46 | takeWhile k b = do
 47 |   n <- countWhile k b
 48 |   pure (unsafeTake n b)
 49 | 
 50 | {- | Drop bytes while the predicate is true, aliasing the
 51 | argument array.
 52 | -}
 53 | dropWhile ::
 54 |   (PrimMonad m) =>
 55 |   (Word8 -> m Bool) ->
 56 |   MutableBytes (PrimState m) ->
 57 |   m (MutableBytes (PrimState m))
 58 | {-# INLINE dropWhile #-}
 59 | dropWhile k b = do
 60 |   n <- countWhile k b
 61 |   pure (unsafeDrop n b)
 62 | 
 63 | -- | Take the first @n@ bytes from the argument, aliasing it.
 64 | unsafeTake :: Int -> MutableBytes s -> MutableBytes s
 65 | {-# INLINE unsafeTake #-}
 66 | unsafeTake n (MutableBytes arr off _) =
 67 |   MutableBytes arr off n
 68 | 
 69 | {- | Drop the first @n@ bytes from the argument, aliasing it.
 70 | The new length will be @len - n@.
 71 | -}
 72 | unsafeDrop :: Int -> MutableBytes s -> MutableBytes s
 73 | {-# INLINE unsafeDrop #-}
 74 | unsafeDrop n (MutableBytes arr off len) =
 75 |   MutableBytes arr (off + n) (len - n)
 76 | 
 77 | {- | Create a slice of 'MutableBytes' that spans the entire
 78 | argument array. This aliases the argument.
 79 | -}
 80 | fromMutableByteArray ::
 81 |   (PrimMonad m) =>
 82 |   MutableByteArray (PrimState m) ->
 83 |   m (MutableBytes (PrimState m))
 84 | {-# INLINE fromMutableByteArray #-}
 85 | fromMutableByteArray mba = do
 86 |   sz <- PM.getSizeofMutableByteArray mba
 87 |   pure (MutableBytes mba 0 sz)
 88 | 
 89 | -- Internal. The returns the number of bytes that match the
 90 | -- predicate until the first non-match occurs. If all bytes
 91 | -- match the predicate, this will return the length originally
 92 | -- provided.
 93 | countWhile ::
 94 |   (PrimMonad m) =>
 95 |   (Word8 -> m Bool) ->
 96 |   MutableBytes (PrimState m) ->
 97 |   m Int
 98 | {-# INLINE countWhile #-}
 99 | countWhile k (MutableBytes arr off0 len0) = go off0 len0 0
100 |  where
101 |   go !off !len !n =
102 |     if len > 0
103 |       then
104 |         (k =<< PM.readByteArray arr off) >>= \case
105 |           True -> go (off + 1) (len - 1) (n + 1)
106 |           False -> pure n
107 |       else pure n
108 | 


--------------------------------------------------------------------------------
/src/Data/Bytes/Internal.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE BangPatterns #-}
 2 | {-# LANGUAGE DataKinds #-}
 3 | {-# LANGUAGE DuplicateRecordFields #-}
 4 | {-# LANGUAGE MagicHash #-}
 5 | {-# LANGUAGE TypeFamilies #-}
 6 | 
 7 | -- This needs to be in its own module to prevent a cyclic dependency
 8 | -- between UnliftedBytes and Data.Bytes.Types
 9 | module Data.Bytes.Internal
10 |   ( Bytes (..)
11 |   ) where
12 | 
13 | import Control.Monad.ST (runST)
14 | import Control.Monad.ST.Run (runByteArrayST)
15 | import Data.Bytes.Internal.Show (showsSlice)
16 | import Data.Primitive (ByteArray (..))
17 | import Data.Word (Word8)
18 | import GHC.Exts (Int (I#), IsList (..), compareByteArrays#, isTrue#, sameMutableByteArray#, unsafeCoerce#)
19 | 
20 | import qualified Data.Foldable as F
21 | import qualified Data.List as L
22 | import qualified Data.Primitive as PM
23 | 
24 | -- | A slice of a 'ByteArray'.
25 | data Bytes = Bytes
26 |   { array :: {-# UNPACK #-} !ByteArray
27 |   , offset :: {-# UNPACK #-} !Int
28 |   , length :: {-# UNPACK #-} !Int
29 |   }
30 | 
31 | instance IsList Bytes where
32 |   type Item Bytes = Word8
33 |   fromListN n xs = Bytes (fromListN n xs) 0 n
34 |   fromList xs = fromListN (L.length xs) xs
35 |   toList (Bytes arr off len) = toListLoop off len arr
36 | 
37 | toListLoop :: Int -> Int -> ByteArray -> [Word8]
38 | toListLoop !off !len !arr =
39 |   if len > 0
40 |     then PM.indexByteArray arr off : toListLoop (off + 1) (len - 1) arr
41 |     else []
42 | 
43 | instance Show Bytes where
44 |   showsPrec _ (Bytes arr off len) s = showsSlice arr off len s
45 | 
46 | instance Eq Bytes where
47 |   Bytes arr1 off1 len1 == Bytes arr2 off2 len2
48 |     | len1 /= len2 = False
49 |     | sameByteArray arr1 arr2 && off1 == off2 = True
50 |     | otherwise = compareByteArrays arr1 off1 arr2 off2 len1 == EQ
51 | 
52 | instance Ord Bytes where
53 |   compare (Bytes arr1 off1 len1) (Bytes arr2 off2 len2)
54 |     | sameByteArray arr1 arr2 && off1 == off2 && len1 == len2 = EQ
55 |     | otherwise = compareByteArrays arr1 off1 arr2 off2 (min len1 len2) <> compare len1 len2
56 | 
57 | instance Semigroup Bytes where
58 |   -- TODO: Do the trick to move the data constructor to the outside
59 |   -- of runST.
60 |   Bytes arrA offA lenA <> Bytes arrB offB lenB = runST $ do
61 |     marr <- PM.newByteArray (lenA + lenB)
62 |     PM.copyByteArray marr 0 arrA offA lenA
63 |     PM.copyByteArray marr lenA arrB offB lenB
64 |     r <- PM.unsafeFreezeByteArray marr
65 |     pure (Bytes r 0 (lenA + lenB))
66 | 
67 | instance Monoid Bytes where
68 |   mempty = Bytes mempty 0 0
69 |   mconcat [] = mempty
70 |   mconcat [x] = x
71 |   mconcat bs = Bytes r 0 fullLen
72 |    where
73 |     !fullLen = L.foldl' (\acc (Bytes _ _ len) -> acc + len) 0 bs
74 |     r = runByteArrayST $ do
75 |       marr <- PM.newByteArray fullLen
76 |       !_ <-
77 |         F.foldlM
78 |           ( \ !currLen (Bytes arr off len) -> do
79 |               PM.copyByteArray marr currLen arr off len
80 |               pure (currLen + len)
81 |           )
82 |           0
83 |           bs
84 |       PM.unsafeFreezeByteArray marr
85 | 
86 | compareByteArrays :: ByteArray -> Int -> ByteArray -> Int -> Int -> Ordering
87 | {-# INLINE compareByteArrays #-}
88 | compareByteArrays (ByteArray ba1#) (I# off1#) (ByteArray ba2#) (I# off2#) (I# n#) =
89 |   compare (I# (compareByteArrays# ba1# off1# ba2# off2# n#)) 0
90 | 
91 | sameByteArray :: ByteArray -> ByteArray -> Bool
92 | {-# INLINE sameByteArray #-}
93 | sameByteArray (ByteArray ba1#) (ByteArray ba2#) =
94 |   isTrue# (sameMutableByteArray# (unsafeCoerce# ba1#) (unsafeCoerce# ba2#))
95 | 


--------------------------------------------------------------------------------
/byteslice.cabal:
--------------------------------------------------------------------------------
  1 | cabal-version:   2.4
  2 | name:            byteslice
  3 | version:         0.2.15.0
  4 | synopsis:        Slicing managed and unmanaged memory
  5 | description:
  6 |   This library provides types that allow the user to talk about a slice of
  7 |   a ByteArray or a MutableByteArray. It also offers UnmanagedBytes, which
  8 |   is kind of like a slice into unmanaged memory. However, it is just an
  9 |   address and a length.
 10 | 
 11 | homepage:        https://github.com/byteverse/byteslice
 12 | bug-reports:     https://github.com/byteverse/byteslice/issues
 13 | license:         BSD-3-Clause
 14 | license-file:    LICENSE
 15 | author:          Andrew Martin
 16 | maintainer:      amartin@layer3com.com
 17 | copyright:       2020 Andrew Martin
 18 | category:        Data
 19 | extra-doc-files:
 20 |   CHANGELOG.md
 21 |   README.md
 22 | 
 23 | tested-with:     GHC ==9.4.8 || ==9.6.3 || ==9.8.1
 24 | 
 25 | common build-settings
 26 |   default-language: Haskell2010
 27 |   ghc-options:      -Wall -Wunused-packages
 28 | 
 29 | flag avoid-rawmemchr
 30 |   default:     True
 31 |   description: Avoid using rawmemchr which is non-portable GNU libc only
 32 | 
 33 | library
 34 |   import:           build-settings
 35 |   exposed-modules:
 36 |     Data.Bytes
 37 |     Data.Bytes.Chunks
 38 |     Data.Bytes.Indexed
 39 |     Data.Bytes.Encode.BigEndian
 40 |     Data.Bytes.Encode.LittleEndian
 41 |     Data.Bytes.Internal
 42 |     Data.Bytes.Mutable
 43 |     Data.Bytes.Text.Ascii
 44 |     Data.Bytes.Text.AsciiExt
 45 |     Data.Bytes.Text.Latin1
 46 |     Data.Bytes.Text.Utf8
 47 |     Data.Bytes.Text.Windows1252
 48 |     Data.Bytes.Types
 49 | 
 50 |   other-modules:
 51 |     Cstrlen
 52 |     Data.Bytes.Byte
 53 |     Data.Bytes.Internal.Show
 54 |     Data.Bytes.IO
 55 |     Data.Bytes.Pure
 56 |     Data.Bytes.Search
 57 |     Reps
 58 | 
 59 |   build-depends:
 60 |     , base                >=4.14   && <5
 61 |     , bytestring          >=0.10.8 && <0.13
 62 |     , natural-arithmetic  >=0.1.4
 63 |     , primitive           >=0.7.4  && <0.10
 64 |     , primitive-addr      >=0.1    && <0.2
 65 |     , primitive-unlifted  >=0.1.2  && <2.3
 66 |     , run-st              >=0.1.1  && <0.2
 67 |     , text                >=2.1
 68 |     , text-short          >=0.1.3  && <0.2
 69 |     , tuples              >=0.1    && <0.2
 70 |     , vector              >=0.12   && <0.14
 71 | 
 72 |   hs-source-dirs:   src
 73 |   ghc-options:      -O2
 74 | 
 75 |   if impl(ghc >=9.2)
 76 |     hs-source-dirs: src-new-reps
 77 | 
 78 |   else
 79 |     hs-source-dirs: src-old-reps
 80 | 
 81 |   if impl(ghc >=9.0)
 82 |     hs-source-dirs: src-ghc-cstrlen
 83 | 
 84 |   else
 85 |     hs-source-dirs: src-no-ghc-cstrlen
 86 | 
 87 |   include-dirs:     include
 88 |   includes:         bs_custom.h
 89 |   install-includes: bs_custom.h
 90 |   c-sources:        cbits/bs_custom.c
 91 | 
 92 |   if flag(avoid-rawmemchr)
 93 |     cc-options: -DAVOID_RAWMEMCHR=1
 94 | 
 95 | test-suite test
 96 |   import:         build-settings
 97 |   type:           exitcode-stdio-1.0
 98 |   hs-source-dirs: test
 99 |   main-is:        Main.hs
100 |   build-depends:
101 |     , base                >=4.11.1 && <5
102 |     , byteslice
103 |     , bytestring
104 |     , primitive
105 |     , quickcheck-classes  >=0.6.4
106 |     , tasty
107 |     , tasty-hunit
108 |     , tasty-quickcheck
109 |     , text                >=2.1
110 |     , transformers
111 | 
112 | benchmark bench
113 |   import:         build-settings
114 |   type:           exitcode-stdio-1.0
115 |   build-depends:
116 |     , base
117 |     , byteslice
118 |     , gauge
119 | 
120 |   ghc-options:    -O2
121 |   hs-source-dirs: bench
122 |   main-is:        Main.hs
123 | 
124 | source-repository head
125 |   type:     git
126 |   location: git://github.com/byteverse/byteslice.git
127 | 


--------------------------------------------------------------------------------
/src/Data/Bytes/Text/Ascii.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns #-}
  2 | {-# LANGUAGE MagicHash #-}
  3 | {-# LANGUAGE BinaryLiterals #-}
  4 | {-# LANGUAGE CPP #-}
  5 | {-# LANGUAGE NumericUnderscores #-}
  6 | {-# LANGUAGE TypeApplications #-}
  7 | 
  8 | {- | This module treats 'Bytes' data as holding ASCII text. Providing bytes
  9 | outside the ASCII range (@U+0000@ -- @U+007F@) may cause a failure or
 10 | unspecified results, but such bytes will never be inspected.
 11 | 
 12 | For functions that can operate on ASCII-compatible encodings, see
 13 | 'Data.Bytes.Text.AsciiExt'.
 14 | -}
 15 | module Data.Bytes.Text.Ascii
 16 |   ( fromString
 17 |   , decodeDecWord
 18 |   , equalsCStringCaseInsensitive
 19 |   , toShortText
 20 |   , toShortTextU
 21 |   , toText
 22 |   ) where
 23 | 
 24 | import Data.Bits ((.&.))
 25 | import Data.ByteString.Short.Internal (ShortByteString (SBS))
 26 | import Data.Bytes.Text.Latin1 (decodeDecWord)
 27 | import Data.Bytes.Types (Bytes (Bytes))
 28 | import Data.Char (ord)
 29 | import Data.Primitive (ByteArray)
 30 | import Data.Text (Text)
 31 | import Data.Text.Short (ShortText)
 32 | import Data.Word (Word8)
 33 | import Foreign.C.String (CString)
 34 | import Foreign.Ptr (Ptr, castPtr, plusPtr)
 35 | import GHC.Exts (Int#,Word#,ByteArray#,(+#),(<#))
 36 | import GHC.Int (Int(I#))
 37 | 
 38 | import qualified Data.Bytes.Pure as Bytes
 39 | import qualified Data.Primitive as PM
 40 | import qualified Data.Primitive.Ptr as PM
 41 | import qualified Data.Text.Array as A
 42 | import qualified Data.Text.Internal as I
 43 | import qualified Data.Text.Short.Unsafe as TS
 44 | import qualified GHC.Exts as Exts
 45 | 
 46 | {- | Convert a 'String' consisting of only characters in the ASCII block
 47 | to a byte sequence. Any character with a codepoint above @U+007F@ is
 48 | replaced by @U+0000@.
 49 | -}
 50 | fromString :: String -> Bytes
 51 | fromString =
 52 |   Bytes.fromByteArray
 53 |     . Exts.fromList
 54 |     . map (\c -> let i = ord c in if i < 128 then fromIntegral @Int @Word8 i else 0)
 55 | 
 56 | -- TODO presumably also fromText and fromShortText
 57 | 
 58 | toShortText :: Bytes -> Maybe ShortText
 59 | {-# INLINE toShortText #-}
 60 | toShortText !b = case Bytes.foldr (\w acc -> w < 128 && acc) True b of
 61 |   True -> Just (TS.fromShortByteStringUnsafe (Bytes.toShortByteString b))
 62 |   False -> Nothing
 63 | 
 64 | toShortTextU :: ByteArray -> Maybe ShortText
 65 | {-# INLINE toShortTextU #-}
 66 | toShortTextU !b = case Bytes.foldr (\w acc -> w < 128 && acc) True (Bytes.fromByteArray b) of
 67 |   True -> Just (TS.fromShortByteStringUnsafe (case b of PM.ByteArray x -> SBS x))
 68 |   False -> Nothing
 69 | 
 70 | -- | Interpret byte sequence as ASCII codepoints.
 71 | -- Returns 'Nothing' if any of the bytes are outside of the
 72 | -- range @0x00-0x7F@
 73 | --
 74 | -- This does not work on 32-bit architectures. I am not certain if it
 75 | -- even compiles on those systems.
 76 | toText :: Bytes -> Maybe Text
 77 | toText (Bytes (PM.ByteArray arr) off@(I# off# ) len@(I# len# )) =
 78 |   let !r0 = validateAscii# arr off# len# (off# +# len# )
 79 |       !r1 = Exts.and# r0 0b1000_0000_1000_0000_1000_0000_1000_0000_1000_0000_1000_0000_1000_0000_1000_0000##
 80 |    in case r1 of
 81 |         0## -> Just (I.Text (A.ByteArray arr) off len)
 82 |         _ -> Nothing
 83 | 
 84 | -- returns 0 to mean that the input slice was all ascii text
 85 | -- any other number means that a non-ascii byte was encountered
 86 | -- Precondition: len and end must agree with one another.
 87 | validateAscii# :: ByteArray# -> Int# -> Int# -> Int# -> Word#
 88 | {-# noinline validateAscii# #-}
 89 | validateAscii# arr off len end
 90 |   -- This length check at the beginning is not just a performance optimization.
 91 |   -- If the length of the slice is less than 8, the calculations of "middle start"
 92 |   -- and "middle end" in the otherwise clause are out of bounds.
 93 |   | 1# <- len <# 20# = validateRangeSlowly# 0## off end arr
 94 |   | otherwise =
 95 |       let !middleStartSwar = Exts.uncheckedIShiftRL# (off +# 7#) 3#
 96 |           !middleEndSwar = Exts.uncheckedIShiftRL# end 3#
 97 |           !w0 = validateRangeSwar# 0## middleStartSwar middleEndSwar arr
 98 |           !w1 = validateRangeSlowly# w0 off (Exts.uncheckedIShiftL# middleStartSwar 3# ) arr
 99 |           !w2 = validateRangeSlowly# w1 (Exts.uncheckedIShiftL# middleEndSwar 3# ) end arr
100 |        in w2
101 | 
102 | -- Here, the offset and the end refer to 64-bit word units, not byte units
103 | -- like they do in validateRangeSlowly.
104 | validateRangeSwar# :: Word# -> Int# -> Int# -> ByteArray# -> Word#
105 | {-# inline validateRangeSwar# #-}
106 | validateRangeSwar# acc0 off0 end arr = go off0 acc0
107 |   where
108 |   go :: Int# -> Word# -> Word#
109 |   go off acc = case off <# end of
110 |     1# -> go (off +# 1#) (Exts.or# (Exts.indexWordArray# arr off) acc)
111 |     _ -> acc
112 | 
113 | -- Accepts a start and end position. The end position is exclusive.
114 | validateRangeSlowly# :: Word# -> Int# -> Int# -> ByteArray# -> Word#
115 | {-# inline validateRangeSlowly# #-}
116 | validateRangeSlowly# acc0 off0 end arr = go off0 acc0
117 |   where
118 |   go :: Int# -> Word# -> Word#
119 |   go off acc = case off <# end of
120 |     1# -> go (off +# 1#) (Exts.or# (Exts.word8ToWord# (Exts.indexWord8Array# arr off)) acc)
121 |     _ -> acc
122 | 
123 | {- | Is the byte sequence equal to the @NUL@-terminated C String?
124 | The C string must be a constant.
125 | -}
126 | equalsCStringCaseInsensitive :: CString -> Bytes -> Bool
127 | {-# INLINE equalsCStringCaseInsensitive #-}
128 | equalsCStringCaseInsensitive !ptr0 (Bytes arr off0 len0) = go (castPtr ptr0 :: Ptr Word8) off0 len0
129 |  where
130 |   go !ptr !off !len = case len of
131 |     0 -> PM.indexOffPtr ptr 0 == (0 :: Word8)
132 |     _ -> case PM.indexOffPtr ptr 0 of
133 |       0 -> False
134 |       c ->
135 |         (c .&. 0b1101_1111) == (PM.indexByteArray arr off .&. 0b1101_1111)
136 |           && go (plusPtr ptr 1) (off + 1) (len - 1)
137 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # Revision history for byteslice
  2 | 
  3 | ## 0.2.15.0 -- 2024-06-12
  4 | 
  5 | * Add functions for recovering length from `ByteArrayN`
  6 | * Add `Data.Bytes.Indexed` for functions on `ByteArrayN`
  7 | 
  8 | ## 0.2.14.0 -- 2024-02-26
  9 | 
 10 | * Add functions to `Data.Bytes.Text.AsciiExt`: `split(1|2|3|4)`,
 11 |   `splitTetragram1`, `anyEq`, `takeWhileNotEq`, `dropWhileNotEq`,
 12 |   `takeWhileEndNotEq`, dropWhileEndEq`.
 13 | * Increment upper bound on `primitive-unlifted`
 14 | 
 15 | ## 0.2.13.2 -- 2024-02-06
 16 | 
 17 | * Restore `Data.Bytes.Text.Utf8.toText`.
 18 | 
 19 | ## 0.2.13.1 -- 2024-02-01
 20 | 
 21 | * Update package metadata.
 22 | 
 23 | ## 0.2.13.0 -- 2024-01-12
 24 | 
 25 | * Add `replicate` and `replicateByte` to `Data.Bytes.Chunks`.
 26 | 
 27 | ## 0.2.12.0 -- 2023-12-14
 28 | 
 29 | * Add `Data.Bytes.Chunks.cons`.
 30 | * Add `Data.Bytes.fromLazyByteString`.
 31 | * Allow building with GHC 9.8.
 32 | 
 33 | ## 0.2.11.1 -- 2023-07-26
 34 | 
 35 | * Fix regression causing build failure in GHC 9.2.
 36 | 
 37 | ## 0.2.11.0 -- 2023-07-25
 38 | 
 39 | * Add `Data.Bytes.Encode.LittleEndian`.
 40 | * Add `splitTetragram1`.
 41 | 
 42 | ## 0.2.10.0 -- 2023-05-01
 43 | 
 44 | * Add `equals13`, `equals14`, `equals15`.
 45 | * Add `Show` instances for `BytesN` and `ByteArrayN`.
 46 | * Add `equalsCStringCaseInsensitive` for ascii text.
 47 | 
 48 | ## 0.2.9.0 -- 2022-12-08
 49 | 
 50 | * Add `toShortText`, `toShortTextU`, and `toText` to `Data.Byte.Text.Ascii`.
 51 | * Add `fromShortText` and `fromText` to `Data.Bytes.Text.Utf8`.
 52 | 
 53 | ## 0.2.8.0 -- 2022-11-21
 54 | 
 55 | * Add `Data.Bytes.replace` and `Data.Bytes.findIndices`
 56 | * Add `Data.Bytes.fromShortText`
 57 | * Add `Data.Bytes.fromPrimArray`
 58 | * Add `decodeDecWord`
 59 | * Add `concatArray` and `concatArrayU`.
 60 | 
 61 | ## 0.2.7.0 -- 2022-02-16
 62 | 
 63 | * Add support for GHC 9.2.
 64 | * Drop support for GHC 8.8 and earlier.
 65 | * Add `foldlM` and `foldrM` for mondic folds over byte sequences.
 66 | 
 67 | ## 0.2.6.0 -- 2021-09-15
 68 | 
 69 | * Add `BytesN` and `ByteArrayN`.
 70 | * Add `isInfixOf`.
 71 | * Add `hForLines_` and `hFoldLines`.
 72 | * Add `lift` and `unlift` for converting between `Bytes` and `Bytes#`.
 73 | * Move text-oriented functions from Data.Bytes to `Data.Bytes.Text.*`.
 74 |   Provide aliases with older names that come with deprecation warning.
 75 | 
 76 | ## 0.2.5.2 -- 2021-02-23
 77 | 
 78 | * Correct compatibility shims.
 79 | 
 80 | ## 0.2.5.1 -- 2021-02-22 (deprecated)
 81 | 
 82 | * Compatibility with GHC 9.0.
 83 | 
 84 | ## 0.2.5.0 -- 2021-01-22
 85 | 
 86 | * Add `Data.Bytes.Chunks.concatByteString`.
 87 | * Expose `pinnedToByteString` to end users.
 88 | 
 89 | ## 0.2.4.0 -- 2020-10-15
 90 | 
 91 | * Add `toByteString` and `fromByteString`.
 92 | * Add `fromShortByteString`.
 93 | * Add `equalsLatin(9|10|11|12)`.
 94 | * Add `toPinnedByteArray`, `toPinnedByteArrayClone`, and `concatPinnedU`.
 95 | * Add `toLowerAsciiByteArrayClone`.
 96 | * Add `intercalateByte2`.
 97 | * Add `splitEnd1`.
 98 | 
 99 | ## 0.2.3.0 -- 2020-04-30
100 | 
101 | * Add `fnv1a32` and `fnv1a64`, implementations of the 32-bit and
102 |   64-bit variants of the FNV-1a hash algorithm, to both `Data.Bytes`
103 |   and `Data.Bytes.Chunks`.
104 | * Add `Data.Bytes.Chunks.null`.
105 | * Add `readFile` to both `Data.Bytes` and `Data.Bytes.Chunks`.
106 | * Add `foldl'` to `Data.Bytes.Chunks`.
107 | * Add `split` to `Data.Bytes.Chunks`.
108 | * Add `splitStream` for splitting as a good stream-fusion producer.
109 | * Add `hPut` and `writeFile` to `Data.Bytes.Chunks`.
110 | * Add `fromCString#`.
111 | * Add `Bytes#` newtype on GHC 8.10 and up.
112 | 
113 | ## 0.2.2.0 -- 2020-02-27
114 | 
115 | * Add `split4`.
116 | * Add `equalsCString`.
117 | * Add `stripCStringPrefix`.
118 | * Add `equalsLatin8`.
119 | * Add `emptyPinned`.
120 | * Add `concatPinned` to `Data.Bytes.Chunks`.
121 | * Add `any` and `all`.
122 | 
123 | ## 0.2.1.0 -- 2020-01-22
124 | 
125 | * Add `longestCommonPrefix`.
126 | * Fix broken `Ord` instance of `Bytes`.
127 | 
128 | ## 0.2.0.0 -- 2020-01-20
129 | 
130 | * Change behavior of `split`. This function previously had a special case
131 |   for zero-length byte sequences to mirror the behavior `bytestring`'s
132 |   behavior. Now, `split` returns a singleton list with the empty byte
133 |   sequence in this case.
134 | * Add `splitNonEmpty` so that users who need to take advantage of the
135 |   non-null guarantee `split` provides can do so.
136 | * Add `splitU` and `splitInitU` for users who are going to split a
137 |   byte sequence without and consume the results more than once.
138 | * Make the C code compile on platforms that do not have `rawmemchr`.
139 | * Rename `splitOnce` to `split1`.
140 | * Add `split2` and `split3`.
141 | * Add `equalsLatin{1,2,3,4,5,6,7}`
142 | * Add `ifoldl'`.
143 | * Add `hGet` and `hPut`.
144 | * Move `Data.Bytes.Chunks` from `small-bytearray-builder` to `byteslice`.
145 | * Rename `Data.Bytes.Chunks.concat` to `concatU` (the U means unsliced),
146 |   and add a new `concat` that returns `Bytes`.
147 | * Add `fromBytes`, `fromByteArray`, and `unsafeCopy` to `Data.Bytes.Chunks`.
148 | * Add `hGetContents` to `Data.Bytes.Chunks`.
149 | * Add `isBytePrefixOf` and `isByteSuffixOf`.
150 | * Add `replicate` and `replicateU`.
151 | * Add `Monoid` instance for `Bytes`.
152 | * Add `singleton`, `doubleton`, `tripleton`, and their unsliced variants.
153 | * Rename `copy` to `unsafeCopy`.
154 | * Add `fromLatinString`.
155 | * Change the behavior of `fromAsciiString` to replace out-of-bounds codepoints
156 |   with NUL.
157 | * Add `unsnoc` and `uncons`.
158 | 
159 | ## 0.1.4.0 -- 2019-11-12
160 | 
161 | * Add `toLatinString`.
162 | * Add `stripPrefix`, `stripSuffix`, `stripOptionalPrefix`, and
163 |   `stripOptionalSuffix`.
164 | * Add `takeWhileEnd` and `dropWhileEnd`.
165 | * Add `count`.
166 | * Add an optimized `split` function.
167 | * Add `splitInit`.
168 | * Add `splitFirst`.
169 | * Add `copy`.
170 | * Add `pin`.
171 | * Add `touch`.
172 | * Add `elem`.
173 | * Add `unsafeIndex`.
174 | 
175 | ## 0.1.3.0 -- 2019-09-15
176 | 
177 | * Add `isPrefixOf` and `isSuffixOf`.
178 | * Add `foldl`, `foldr`, `foldl'`, and `foldr'`.
179 | 
180 | ## 0.1.2.0 -- 2019-08-21
181 | 
182 | * Add `Data.Bytes.Mutable` module.
183 | * Add `Data.Bytes` module.
184 | 
185 | ## 0.1.1.0 -- 2019-07-03
186 | 
187 | * Add record labels for Bytes and MutableBytes
188 | * Add UnmanagedBytes. This is just an Addr and a length.
189 | 
190 | ## 0.1.0.0 -- 2019-04-30
191 | 
192 | * First version.
193 | 


--------------------------------------------------------------------------------
/src/Data/Bytes/Text/AsciiExt.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns #-}
  2 | {-# LANGUAGE LambdaCase #-}
  3 | {-# LANGUAGE RankNTypes #-}
  4 | {-# LANGUAGE TypeApplications #-}
  5 | 
  6 | -- For functions that can fail for bytes outside the ASCII range, see
  7 | -- 'Data.Bytes.Ascii'. For functions that can inspect bytes outside ASCII, see
  8 | -- any of the modules for ASCII-compatible encodings (e.g. 'Data.Bytes.Utf8',
  9 | -- 'Data.Bytes.Latin1', and so on).
 10 | 
 11 | {- | This module contains functions which operate on supersets of 'Bytes' containing ASCII-encoded text.
 12 | That is, none of the functions here inspect bytes with a value greater than 127, and do not fail due to the presence of such bytes.
 13 | -}
 14 | module Data.Bytes.Text.AsciiExt
 15 |   ( -- * Line-Oriented IO
 16 |     hFoldLines
 17 |   , hForLines_
 18 | 
 19 |     -- ** Standard Handles
 20 |   , forLines_
 21 |   , foldLines
 22 | 
 23 |     -- * Predicates
 24 |   , anyEq
 25 | 
 26 |     -- * Filtering
 27 |   , takeWhileNotEq
 28 |   , dropWhileNotEq
 29 |   , takeWhileEndNotEq
 30 |   , dropWhileEndEq
 31 | 
 32 |     -- * Splitting
 33 |     -- ** Fixed from Beginning
 34 |   , split1
 35 |   , splitTetragram1
 36 |   , split2
 37 |   , split3
 38 |   , split4
 39 | 
 40 |     -- * Text Manipulation
 41 |   , toLowerU
 42 |   ) where
 43 | 
 44 | import Control.Monad.ST (ST)
 45 | import Control.Monad.ST.Run (runByteArrayST)
 46 | import Data.Bytes.Types (Bytes (..))
 47 | import Data.Char (ord)
 48 | import Data.Primitive (ByteArray)
 49 | import Data.Word (Word8)
 50 | import System.IO (Handle, hIsEOF, stdin)
 51 | 
 52 | import qualified Data.ByteString.Char8 as BC8
 53 | import qualified Data.Bytes.Pure as Bytes
 54 | import qualified Data.Bytes.Byte as Byte
 55 | import qualified Data.Primitive as PM
 56 | 
 57 | -- | `hForLines_` over `stdin`
 58 | forLines_ :: (Bytes -> IO a) -> IO ()
 59 | {-# INLINEABLE forLines_ #-}
 60 | forLines_ = hForLines_ stdin
 61 | 
 62 | -- | `hFoldLines` over `stdin`
 63 | foldLines :: a -> (a -> Bytes -> IO a) -> IO a
 64 | {-# INLINEABLE foldLines #-}
 65 | foldLines = hFoldLines stdin
 66 | 
 67 | {- | Perform an action on each line of the input, discarding results.
 68 | To maintain a running state, see 'hFoldLines'.
 69 | 
 70 | Lines are extracted with with 'BC8.hGetLine', which does not document its
 71 | detection algorithm. As of writing (bytestring v0.11.1.0), lines are
 72 | delimited by a single @\n@ character (UNIX-style, as all things should be).
 73 | -}
 74 | hForLines_ :: Handle -> (Bytes -> IO a) -> IO ()
 75 | hForLines_ h body = loop
 76 |  where
 77 |   loop =
 78 |     hIsEOF h >>= \case
 79 |       False -> do
 80 |         line <- Bytes.fromByteString <$> BC8.hGetLine h
 81 |         _ <- body line
 82 |         loop
 83 |       True -> pure ()
 84 | 
 85 | {- | Perform an action on each line of the input, threading state through the computation.
 86 | If you do not need to keep a state, see `hForLines_`.
 87 | 
 88 | Lines are extracted with with 'BC8.hGetLine', which does not document its
 89 | detection algorithm. As of writing (bytestring v0.11.1.0), lines are
 90 | delimited by a single @\n@ character (UNIX-style, as all things should be).
 91 | -}
 92 | hFoldLines :: Handle -> a -> (a -> Bytes -> IO a) -> IO a
 93 | hFoldLines h z body = loop z
 94 |  where
 95 |   loop !x =
 96 |     hIsEOF h >>= \case
 97 |       False -> do
 98 |         line <- Bytes.fromByteString <$> BC8.hGetLine h
 99 |         x' <- body x line
100 |         loop x'
101 |       True -> pure x
102 | 
103 | {- | /O(n)/ Convert ASCII letters to lowercase. This adds @0x20@ to bytes in the
104 | range @[0x41,0x5A]@ (@A-Z@ ⇒ @a-z@) and leaves all other bytes alone.
105 | Unconditionally copies the bytes.
106 | -}
107 | toLowerU :: Bytes -> ByteArray
108 | toLowerU (Bytes src off0 len0) =
109 |   runByteArrayST action
110 |  where
111 |   action :: forall s. ST s ByteArray
112 |   action = do
113 |     dst <- PM.newByteArray len0
114 |     let go !off !ix !len =
115 |           if len == 0
116 |             then pure ()
117 |             else do
118 |               let w = PM.indexByteArray src off :: Word8
119 |                   w' =
120 |                     if w >= 0x41 && w <= 0x5A
121 |                       then w + 32
122 |                       else w
123 |               PM.writeByteArray dst ix w'
124 |               go (off + 1) (ix + 1) (len - 1)
125 |     go off0 0 len0
126 |     PM.unsafeFreezeByteArray dst
127 | 
128 | -- | Throws an exception the 'Char' argument is non-ascii.
129 | split1 :: Char -> Bytes -> Maybe (Bytes, Bytes)
130 | {-# INLINE split1 #-}
131 | split1 !c !b
132 |   | c > '\DEL' = errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.split1: argument not in ASCII range"
133 |   | otherwise = Byte.split1 (c2w c) b
134 | 
135 | -- | Throws an exception the 'Char' argument is non-ascii.
136 | split2 :: Char -> Bytes -> Maybe (Bytes, Bytes, Bytes)
137 | {-# INLINE split2 #-}
138 | split2 !c !b
139 |   | c > '\DEL' = errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.split2: argument not in ASCII range"
140 |   | otherwise = Byte.split2 (c2w c) b
141 | 
142 | -- | Throws an exception the 'Char' argument is non-ascii.
143 | split3 :: Char -> Bytes -> Maybe (Bytes, Bytes, Bytes, Bytes)
144 | {-# INLINE split3 #-}
145 | split3 !c !b
146 |   | c > '\DEL' = errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.split3: argument not in ASCII range"
147 |   | otherwise = Byte.split3 (c2w c) b
148 | 
149 | -- | Throws an exception the 'Char' argument is non-ascii.
150 | split4 :: Char -> Bytes -> Maybe (Bytes, Bytes, Bytes, Bytes, Bytes)
151 | {-# INLINE split4 #-}
152 | split4 !c !b
153 |   | c > '\DEL' = errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.split4: argument not in ASCII range"
154 |   | otherwise = Byte.split4 (c2w c) b
155 | 
156 | -- | Throws an exception if any of the 'Char' arguments are non-ascii.
157 | splitTetragram1 :: Char -> Char -> Char -> Char -> Bytes -> Maybe (Bytes, Bytes)
158 | {-# inline splitTetragram1 #-}
159 | splitTetragram1 !c0 !c1 !c2 !c3 !b
160 |   | c0 > '\DEL' || c1 > '\DEL' || c2 > '\DEL' || c3 > '\DEL' =
161 |       errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.splitTetragram1: one of the characters is not in ASCII range"
162 |   | otherwise = Bytes.splitTetragram1 (c2w c0) (c2w c1) (c2w c2) (c2w c3) b
163 | 
164 | c2w :: Char -> Word8
165 | {-# inline c2w #-}
166 | c2w !c = fromIntegral @Int @Word8 (ord c)
167 | 
168 | -- | Throws an exception the 'Char' argument is non-ascii.
169 | dropWhileNotEq :: Char -> Bytes -> Bytes
170 | dropWhileNotEq !c !b
171 |   | c > '\DEL' = errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.dropWhileNotEq: argument not in ASCII range"
172 |   | otherwise =
173 |       let !w = c2w c
174 |        in Bytes.unsafeDrop (Bytes.countWhile (/= w) b) b
175 | 
176 | -- | Throws an exception the 'Char' argument is non-ascii.
177 | takeWhileNotEq :: Char -> Bytes -> Bytes
178 | takeWhileNotEq !c !b
179 |   | c > '\DEL' = errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.takeWhileNotEq: argument not in ASCII range"
180 |   | otherwise =
181 |       let !w = c2w c
182 |        in Bytes.unsafeTake (Bytes.countWhile (/= w) b) b
183 |   
184 | -- | Throws an exception the 'Char' argument is non-ascii.
185 | takeWhileEndNotEq :: Char -> Bytes -> Bytes
186 | takeWhileEndNotEq !c !b
187 |   | c > '\DEL' = errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.takeWhileEndNotEq: argument not in ASCII range"
188 |   | otherwise =
189 |       let !w = c2w c
190 |           !n = Bytes.countWhileEnd (/=w) b
191 |        in Bytes (array b) (offset b + Bytes.length b - n) n
192 | 
193 | -- | Throws an exception the 'Char' argument is non-ascii.
194 | dropWhileEndEq :: Char -> Bytes -> Bytes
195 | dropWhileEndEq !c !b
196 |   | c > '\DEL' = errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.dropWhileEndEq: argument not in ASCII range"
197 |   | otherwise =
198 |       let !w = c2w c
199 |           !n = Bytes.countWhileEnd (==w) b
200 |        in Bytes.unsafeTake (Bytes.length b - n) b
201 | 
202 | -- | Throws an exception the 'Char' argument is non-ascii.
203 | anyEq :: Char -> Bytes -> Bool
204 | anyEq !c !b
205 |   | c > '\DEL' = errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.takeWhileNotEq: argument not in ASCII range"
206 |   | otherwise =
207 |       let !w = c2w c
208 |        in Bytes.any (==w) b
209 | 


--------------------------------------------------------------------------------
/src/Data/Bytes/Search.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns #-}
  2 | {-# LANGUAGE BlockArguments #-}
  3 | {-# LANGUAGE DuplicateRecordFields #-}
  4 | {-# LANGUAGE MagicHash #-}
  5 | {-# LANGUAGE NamedFieldPuns #-}
  6 | {-# LANGUAGE RankNTypes #-}
  7 | 
  8 | -- This is broken out into a separate module to make it easier
  9 | -- to dump core and investigate performance issues.
 10 | module Data.Bytes.Search
 11 |   ( findIndices
 12 |   , replace
 13 |   , isInfixOf
 14 |   ) where
 15 | 
 16 | import Prelude hiding (all, any, dropWhile, elem, foldl, foldr, length, map, null, readFile, replicate, takeWhile)
 17 | 
 18 | import Control.Monad.ST.Run (runByteArrayST, runPrimArrayST)
 19 | import Data.Bits (finiteBitSize, shiftL, (.&.), (.|.))
 20 | import Data.Bytes.Pure (length, unsafeHead, unsafeIndex)
 21 | import Data.Bytes.Types (Bytes (Bytes, array, offset))
 22 | import Data.Primitive (ByteArray, PrimArray)
 23 | import GHC.Exts (Int (I#))
 24 | import GHC.Word (Word32)
 25 | 
 26 | import qualified Data.Bytes.Byte as Byte
 27 | import qualified Data.Bytes.Pure as Pure
 28 | import qualified Data.Bytes.Types as Types
 29 | import qualified Data.Primitive as PM
 30 | 
 31 | -- Implementation Notes
 32 | -- =====================
 33 | -- For karp rabin, there are some easy performance improvements
 34 | -- left on the table. The main optimization that has been done is making
 35 | -- sure that there is no unnecessary boxing of Int, Word32, or Bytes
 36 | -- going on. Here are some other things that have not been done:
 37 | --
 38 | 
 39 | -- * The hash is currently a Word32. It would be better to use either
 40 | 
 41 | --   Word or Word64 for this. We would need for hashKey to be different.
 42 | 
 43 | -- * In several places, we track an index into a Bytes. This index gets
 44 | 
 45 | --   repeatedly added to the base offset as we loop over the bytes. We
 46 | --   could instead track the true offset instead of repeatedly
 47 | --   recalculating it.
 48 | 
 49 | {- | Replace every non-overlapping occurrence of @needle@ in
 50 | @haystack@ with @replacement@.
 51 | -}
 52 | replace ::
 53 |   -- | needle, must not be empty
 54 |   Bytes ->
 55 |   -- | replacement
 56 |   Bytes ->
 57 |   -- | haystack
 58 |   Bytes ->
 59 |   Bytes
 60 | {-# NOINLINE replace #-}
 61 | -- Implementation note: there is a lot of room to improve the performance
 62 | -- of this function.
 63 | replace !needle !replacement !haystack@Bytes {array = haystackArray, offset = haystackIndex, length = haystackLength}
 64 |   | Pure.length needle == 0 = errorWithoutStackTrace "Data.Bytes.replace: needle of length zero"
 65 |   | Pure.length haystack == 0 = Pure.empty
 66 |   | Pure.length needle == 1
 67 |   , Pure.length replacement == 1 =
 68 |       let !needle0 = unsafeIndex needle 0
 69 |           !replacement0 = unsafeIndex replacement 0
 70 |        in Pure.map (\w -> if w == needle0 then replacement0 else w) haystack
 71 |   | otherwise =
 72 |       let !hp = rollingHash needle
 73 |           !ixs = findIndicesKarpRabin 0 hp needle haystackArray haystackIndex haystackLength
 74 |        in Pure.fromByteArray (replaceIndices ixs replacement (Pure.length needle) haystackArray haystackIndex haystackLength)
 75 | 
 76 | -- This is an internal function because it deals explicitly with
 77 | -- an offset into a byte array.
 78 | --
 79 | -- Example:
 80 | 
 81 | -- * haystack len: 39
 82 | 
 83 | -- * ixs: 7, 19, 33
 84 | 
 85 | -- * patLen: 5
 86 | 
 87 | -- * replacment: foo (len 3)
 88 | 
 89 | -- We want to perform these copies:
 90 | 
 91 | -- * src[0,7] -> dst[0,7]
 92 | 
 93 | -- * foo -> dst[7,3]
 94 | 
 95 | -- * src[12,7] -> dst[10,7]
 96 | 
 97 | -- * foo -> dst[17,3]
 98 | 
 99 | -- * src[24,9] -> dst[20,9]
100 | 
101 | -- * foo -> dst[29,3]
102 | 
103 | -- * src[38,1] -> dst[32,1]
104 | replaceIndices :: PrimArray Int -> Bytes -> Int -> ByteArray -> Int -> Int -> ByteArray
105 | replaceIndices !ixs !replacement !patLen !haystack !ix0 !len0 = runByteArrayST $ do
106 |   let !ixsLen = PM.sizeofPrimArray ixs
107 |   let !delta = Pure.length replacement - patLen
108 |   dst <- PM.newByteArray (len0 + ixsLen * delta)
109 |   let applyReplacement !ixIx !prevSrcIx =
110 |         if ixIx < ixsLen
111 |           then do
112 |             let !srcMatchIx = PM.indexPrimArray ixs ixIx
113 |             let !offset = ixIx * delta
114 |             let !dstIx = srcMatchIx + offset - ix0
115 |             Pure.unsafeCopy
116 |               dst
117 |               (prevSrcIx + offset - ix0)
118 |               Bytes {array = haystack, offset = prevSrcIx, length = srcMatchIx - prevSrcIx}
119 |             Pure.unsafeCopy dst dstIx replacement
120 |             applyReplacement (ixIx + 1) (srcMatchIx + patLen)
121 |           else do
122 |             let !offset = ixIx * delta
123 |             Pure.unsafeCopy
124 |               dst
125 |               (prevSrcIx + offset - ix0)
126 |               Bytes {array = haystack, offset = prevSrcIx, length = (len0 + ix0) - prevSrcIx}
127 |             PM.unsafeFreezeByteArray dst
128 |   applyReplacement 0 ix0
129 | 
130 | -- | Find locations of non-overlapping instances of @needle@ within @haystack@.
131 | findIndices ::
132 |   -- | needle
133 |   Bytes ->
134 |   -- | haystack
135 |   Bytes ->
136 |   PrimArray Int
137 | findIndices needle Bytes {array, offset = off, length = len}
138 |   | needleLen == 0 = errorWithoutStackTrace "Data.Bytes.findIndices: needle with length zero"
139 |   | len == 0 = mempty
140 |   | otherwise =
141 |       let !hp = rollingHash needle
142 |        in findIndicesKarpRabin (negate off) hp needle array off len
143 |  where
144 |   needleLen = Pure.length needle
145 | 
146 | -- Precondition: Haystack has non-zero length
147 | -- Precondition: Pattern has non-zero length
148 | -- Uses karp rabin to search.
149 | -- Easy opportunity to improve implementation. Instead of having karpRabin
150 | -- return two slices, we could have it just return a single index.
151 | findIndicesKarpRabin ::
152 |   Int -> -- Output index modifier. Set to negated initial index to make slicing invisible in results.
153 |   Word32 -> -- Hash to search for (must agree with pattern)
154 |   Bytes -> -- Pattern to search for
155 |   ByteArray ->
156 |   Int -> -- initial index
157 |   Int -> -- length
158 |   PrimArray Int
159 | findIndicesKarpRabin !ixModifier !hp !pat !haystack !ix0 !len0 = runPrimArrayST $ do
160 |   let dstLen = 1 + quot len0 (Pure.length pat)
161 |   dst <- PM.newPrimArray dstLen
162 |   let go !ix !len !ixIx = case karpRabin hp pat Bytes {array = haystack, offset = ix, length = len} of
163 |         (-1) -> do
164 |           PM.shrinkMutablePrimArray dst ixIx
165 |           PM.unsafeFreezePrimArray dst
166 |         skipCount -> do
167 |           let !advancement = skipCount - Pure.length pat
168 |           let !advancement' = advancement + Pure.length pat
169 |           PM.writePrimArray dst ixIx (ix + advancement + ixModifier)
170 |           let !ix' = ix + advancement'
171 |           go ix' (len - advancement') (ixIx + 1)
172 |   go ix0 len0 0
173 | 
174 | -- Output: Negative one means match not found. Other negative
175 | -- numbers should not occur. Zero may occur. Positive number
176 | -- means the number of bytes to skip to make it past the match.
177 | breakSubstring ::
178 |   -- | String to search for
179 |   Bytes ->
180 |   -- | String to search in
181 |   Bytes ->
182 |   Int
183 | breakSubstring !pat !haystack@(Bytes _ off0 _) =
184 |   case lp of
185 |     0 -> 0
186 |     1 -> case Byte.elemIndexLoop# (unsafeHead pat) haystack of
187 |       (-1#) -> (-1)
188 |       off -> 1 + (I# off) - off0
189 |     _ ->
190 |       if lp * 8 <= finiteBitSize (0 :: Word)
191 |         then shift haystack
192 |         else karpRabin (rollingHash pat) pat haystack
193 |  where
194 |   lp = length pat
195 |   {-# INLINE shift #-}
196 |   shift :: Bytes -> Int
197 |   shift !src
198 |     | length src < lp = (-1)
199 |     | otherwise = search (intoWord $ Pure.unsafeTake lp src) lp
200 |    where
201 |     intoWord :: Bytes -> Word
202 |     intoWord = Pure.foldl' (\w b -> (w `shiftL` 8) .|. fromIntegral b) 0
203 |     wp = intoWord pat
204 |     mask = (1 `shiftL` (8 * lp)) - 1
205 |     search :: Word -> Int -> Int
206 |     search !w !i
207 |       | w == wp = i
208 |       | length src <= i = (-1)
209 |       | otherwise = search w' (i + 1)
210 |      where
211 |       b = fromIntegral (Pure.unsafeIndex src i)
212 |       w' = mask .&. ((w `shiftL` 8) .|. b)
213 | 
214 | -- Only used for karp rabin
215 | rollingHash :: Bytes -> Word32
216 | {-# INLINE rollingHash #-}
217 | rollingHash = Pure.foldl' (\h b -> h * hashKey + fromIntegral b) 0
218 | 
219 | hashKey :: Word32
220 | {-# INLINE hashKey #-}
221 | hashKey = 2891336453
222 | 
223 | -- Precondition: Length of bytes is greater than or equal to 1.
224 | -- Precondition: Rolling hash agrees with pattern.
225 | -- Output: Negative one means match not found. Other negative
226 | -- numbers should not occur. Zero should not occur. Positive number
227 | -- means the number of bytes to skip to make it past the match.
228 | karpRabin :: Word32 -> Bytes -> Bytes -> Int
229 | karpRabin !hp !pat !src
230 |   | length src < lp = (-1)
231 |   | otherwise = search (rollingHash $ Pure.unsafeTake lp src) lp
232 |  where
233 |   lp :: Int
234 |   !lp = Pure.length pat
235 |   m :: Word32
236 |   !m = hashKey ^ lp
237 |   get :: Int -> Word32
238 |   get !ix = fromIntegral (Pure.unsafeIndex src ix)
239 |   search !hs !i
240 |     | hp == hs && eqBytesNoShortCut pat (Pure.unsafeTake lp (Pure.unsafeDrop (i - lp) src)) = i
241 |     | length src <= i = (-1)
242 |     | otherwise = search hs' (i + 1)
243 |    where
244 |     hs' =
245 |       hs * hashKey
246 |         + get i
247 |         - m * get (i - lp)
248 | 
249 | {- | Is the first argument an infix of the second argument?
250 | 
251 | Uses the Rabin-Karp algorithm: expected time @O(n+m)@, worst-case @O(nm)@.
252 | -}
253 | isInfixOf ::
254 |   -- | String to search for
255 |   Bytes ->
256 |   -- | String to search in
257 |   Bytes ->
258 |   Bool
259 | isInfixOf p s = Pure.null p || breakSubstring p s >= 0
260 | 
261 | -- Precondition: both arguments have the same length
262 | -- Skips the pointer equality check and the length check.
263 | eqBytesNoShortCut :: Bytes -> Bytes -> Bool
264 | {-# INLINE eqBytesNoShortCut #-}
265 | eqBytesNoShortCut (Bytes arr1 off1 len1) (Bytes arr2 off2 _) =
266 |   PM.compareByteArrays arr1 off1 arr2 off2 len1 == EQ
267 | 


--------------------------------------------------------------------------------
/src/Data/Bytes/Byte.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns #-}
  2 | {-# LANGUAGE BlockArguments #-}
  3 | {-# LANGUAGE MagicHash #-}
  4 | {-# LANGUAGE NamedFieldPuns #-}
  5 | {-# LANGUAGE ScopedTypeVariables #-}
  6 | {-# LANGUAGE UnliftedFFITypes #-}
  7 | 
  8 | -- This internal module has functions for splitting strings
  9 | -- on a particular byte and for counting occurences of that
 10 | -- byte.
 11 | module Data.Bytes.Byte
 12 |   ( -- Re-exported by Data.Bytes
 13 |     count
 14 |   , split
 15 |   , splitU
 16 |   , splitNonEmpty
 17 |   , splitStream
 18 |   , splitInit
 19 |   , splitInitU
 20 |   , split1
 21 |   , split2
 22 |   , split3
 23 |   , split4
 24 |   , splitEnd1
 25 |   -- Used by other internal modules
 26 |   , elemIndexLoop#
 27 |   ) where
 28 | 
 29 | import Prelude hiding (length)
 30 | 
 31 | import Control.Monad.ST (runST)
 32 | import Control.Monad.ST.Run (runPrimArrayST)
 33 | import Data.Bytes.Types (Bytes (..))
 34 | import Data.List.NonEmpty (NonEmpty ((:|)))
 35 | import Data.Primitive (ByteArray (..), MutablePrimArray (..), PrimArray (..))
 36 | import Data.Primitive.Unlifted.Array (UnliftedArray)
 37 | import Data.Tuple.Types (IntPair (IntPair))
 38 | import Data.Vector.Fusion.Stream.Monadic (Step (Done, Yield), Stream (Stream))
 39 | import Data.Word (Word8)
 40 | import GHC.Exts (ByteArray#, Int (I#), Int#, MutableByteArray#)
 41 | import GHC.IO (unsafeIOToST)
 42 | 
 43 | import qualified Data.Primitive as PM
 44 | import qualified Data.Primitive.Unlifted.Array as PM
 45 | import qualified GHC.Exts as Exts
 46 | 
 47 | -- | Count the number of times the byte appears in the sequence.
 48 | count :: Word8 -> Bytes -> Int
 49 | count !b (Bytes {array = ByteArray arr, offset, length}) =
 50 |   count_ba arr offset length b
 51 | 
 52 | {- | Variant of 'split' that returns an array of unsliced byte sequences.
 53 | Unlike 'split', this is not a good producer for list fusion. (It does
 54 | not return a list, so it could not be.) Prefer 'split' if the result
 55 | is going to be consumed exactly once by a good consumer. Prefer 'splitU'
 56 | if the result of the split is going to be around for a while and
 57 | inspected multiple times.
 58 | -}
 59 | splitU :: Word8 -> Bytes -> UnliftedArray ByteArray
 60 | splitU !w !bs =
 61 |   let !lens = splitLengthsAlt w bs
 62 |       !lensSz = PM.sizeofPrimArray lens
 63 |    in splitCommonU lens lensSz bs
 64 | 
 65 | {- | Variant of 'splitU' that drops the trailing element. See 'splitInit'
 66 | for an explanation of why this may be useful.
 67 | -}
 68 | splitInitU :: Word8 -> Bytes -> UnliftedArray ByteArray
 69 | splitInitU !w !bs =
 70 |   let !lens = splitLengthsAlt w bs
 71 |       !lensSz = PM.sizeofPrimArray lens
 72 |    in splitCommonU lens (lensSz - 1) bs
 73 | 
 74 | -- Internal function
 75 | splitCommonU ::
 76 |   PrimArray Int -> -- array of segment lengths
 77 |   Int -> -- number of lengths to consider
 78 |   Bytes ->
 79 |   UnliftedArray ByteArray
 80 | splitCommonU !lens !lensSz Bytes {array, offset = arrIx0} = runST do
 81 |   dst <- PM.unsafeNewUnliftedArray lensSz
 82 |   let go !lenIx !arrIx =
 83 |         if lenIx < lensSz
 84 |           then do
 85 |             let !len = PM.indexPrimArray lens lenIx
 86 |             buf <- PM.newByteArray len
 87 |             PM.copyByteArray buf 0 array arrIx len
 88 |             buf' <- PM.unsafeFreezeByteArray buf
 89 |             PM.writeUnliftedArray dst lenIx buf'
 90 |             go (lenIx + 1) (arrIx + len + 1)
 91 |           else pure ()
 92 |   go 0 arrIx0
 93 |   PM.unsafeFreezeUnliftedArray dst
 94 | 
 95 | {- | Break a byte sequence into pieces separated by the byte argument,
 96 | consuming the delimiter. This function is a good producer for list
 97 | fusion. It is common to immidiately consume the results of @split@
 98 | with @foldl'@, @traverse_@, @foldlM@, and being a good producer helps
 99 | in this situation.
100 | 
101 | Note: this function differs from its counterpart in @bytestring@.
102 | If the byte sequence is empty, this returns a singleton list with
103 | the empty byte sequence.
104 | -}
105 | split :: Word8 -> Bytes -> [Bytes]
106 | {-# INLINE split #-}
107 | split !w !bs@Bytes {array, offset = arrIx0} =
108 |   Exts.build
109 |     ( \g x0 ->
110 |         let go !lenIx !arrIx =
111 |               if lenIx < lensSz
112 |                 then
113 |                   let !len = PM.indexPrimArray lens lenIx
114 |                    in g (Bytes array arrIx len) (go (lenIx + 1) (arrIx + len + 1))
115 |                 else x0
116 |          in go 0 arrIx0
117 |     )
118 |  where
119 |   !lens = splitLengthsAlt w bs
120 |   !lensSz = PM.sizeofPrimArray lens
121 | 
122 | {- | Variant of 'split' that intended for use with stream fusion rather
123 | than @build@-@foldr@ fusion.
124 | -}
125 | splitStream :: forall m. (Applicative m) => Word8 -> Bytes -> Stream m Bytes
126 | {-# INLINE [1] splitStream #-}
127 | splitStream !w !bs@Bytes {array, offset = arrIx0} = Stream step (IntPair 0 arrIx0)
128 |  where
129 |   !lens = splitLengthsAlt w bs
130 |   !lensSz = PM.sizeofPrimArray lens
131 |   {-# INLINE [0] step #-}
132 |   step :: IntPair -> m (Step IntPair Bytes)
133 |   step (IntPair lenIx arrIx) =
134 |     if lenIx < lensSz
135 |       then do
136 |         let !len = PM.indexPrimArray lens lenIx
137 |             !element = Bytes array arrIx len
138 |             !acc = IntPair (lenIx + 1) (arrIx + len + 1)
139 |         pure (Yield element acc)
140 |       else pure Done
141 | 
142 | {- | Variant of 'split' that returns the result as a 'NonEmpty'
143 | instead of @[]@. This is also eligible for stream fusion.
144 | -}
145 | splitNonEmpty :: Word8 -> Bytes -> NonEmpty Bytes
146 | {-# INLINE splitNonEmpty #-}
147 | splitNonEmpty !w !bs@Bytes {array, offset = arrIx0} =
148 |   Bytes array arrIx0 len0
149 |     :| Exts.build
150 |       ( \g x0 ->
151 |           let go !lenIx !arrIx =
152 |                 if lenIx < lensSz
153 |                   then
154 |                     let !len = PM.indexPrimArray lens lenIx
155 |                      in g (Bytes array arrIx len) (go (lenIx + 1) (arrIx + len + 1))
156 |                   else x0
157 |            in go 1 (1 + (arrIx0 + len0))
158 |       )
159 |  where
160 |   !lens = splitLengthsAlt w bs
161 |   !lensSz = PM.sizeofPrimArray lens
162 |   !len0 = PM.indexPrimArray lens 0 :: Int
163 | 
164 | {- | Variant of 'split' that drops the trailing element. This behaves
165 | correctly even if the byte sequence is empty. This is a good producer
166 | for list fusion. This is useful when splitting a text file
167 | into lines.
168 | <https://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap03.html#tag_03_392 POSIX>
169 | mandates that text files end with a newline, so the list resulting
170 | from 'split' always has an empty byte sequence as its last element.
171 | With 'splitInit', that unwanted element is discarded.
172 | -}
173 | splitInit :: Word8 -> Bytes -> [Bytes]
174 | {-# INLINE splitInit #-}
175 | splitInit !w !bs@Bytes {array, offset = arrIx0} =
176 |   Exts.build
177 |     ( \g x0 ->
178 |         let go !lenIx !arrIx =
179 |               if lenIx < lensSz
180 |                 then
181 |                   let !len = PM.indexPrimArray lens lenIx
182 |                    in g (Bytes array arrIx len) (go (lenIx + 1) (arrIx + len + 1))
183 |                 else x0
184 |          in go 0 arrIx0
185 |     )
186 |  where
187 |   -- Remember, the resulting array from splitLengthsAlt always has
188 |   -- a length of at least one.
189 |   !lens = splitLengthsAlt w bs
190 |   !lensSz = PM.sizeofPrimArray lens - 1
191 | 
192 | -- Internal function. This is just like splitLengths except that
193 | -- it does not treat the empty byte sequences specially. The result
194 | -- for that byte sequence is a singleton array with the element zero.
195 | splitLengthsAlt :: Word8 -> Bytes -> PrimArray Int
196 | splitLengthsAlt b Bytes {array = ByteArray arr#, offset = off, length = len} = runPrimArrayST do
197 |   let !n = count_ba arr# off len b
198 |   dst@(MutablePrimArray dst#) :: MutablePrimArray s Int <- PM.newPrimArray (n + 1)
199 |   total <- unsafeIOToST (memchr_ba_many arr# off len dst# n b)
200 |   PM.writePrimArray dst n (len - total)
201 |   PM.unsafeFreezePrimArray dst
202 | 
203 | foreign import ccall unsafe "bs_custom.h memchr_ba_many"
204 |   memchr_ba_many ::
205 |     ByteArray# -> Int -> Int -> MutableByteArray# s -> Int -> Word8 -> IO Int
206 | 
207 | foreign import ccall unsafe "bs_custom.h count_ba"
208 |   count_ba ::
209 |     ByteArray# -> Int -> Int -> Word8 -> Int
210 | 
211 | {- | Split a byte sequence on the first occurrence of the target
212 | byte. The target is removed from the result. For example:
213 | 
214 | >>> split1 0xA [0x1,0x2,0xA,0xB]
215 | Just ([0x1,0x2],[0xB])
216 | -}
217 | split1 :: Word8 -> Bytes -> Maybe (Bytes, Bytes)
218 | {-# INLINE split1 #-}
219 | split1 w b@(Bytes arr off len) = case elemIndexLoop# w b of
220 |   (-1#) -> Nothing
221 |   i# ->
222 |     let i = I# i#
223 |      in Just (Bytes arr off (i - off), Bytes arr (i + 1) (len - (1 + i - off)))
224 | 
225 | {- | Split a byte sequence on the first and second occurrences
226 | of the target byte. The target is removed from the result.
227 | For example:
228 | 
229 | >>> split2 0xA [0x1,0x2,0xA,0xB,0xA,0xA,0xA]
230 | Just ([0x1,0x2],[0xB],[0xA,0xA])
231 | -}
232 | split2 :: Word8 -> Bytes -> Maybe (Bytes, Bytes, Bytes)
233 | {-# INLINE split2 #-}
234 | split2 w b@(Bytes arr off len) = case elemIndexLoop# w b of
235 |   (-1#) -> Nothing
236 |   i# ->
237 |     let i = I# i#
238 |      in case elemIndexLoop# w (Bytes arr (i + 1) (len - (1 + i - off))) of
239 |           (-1#) -> Nothing
240 |           j# ->
241 |             let j = I# j#
242 |              in Just
243 |                   ( Bytes arr off (i - off)
244 |                   , Bytes arr (i + 1) (j - (i + 1))
245 |                   , Bytes arr (j + 1) (len - (1 + j - off))
246 |                   )
247 | 
248 | {- | Split a byte sequence on the first, second, and third occurrences
249 | of the target byte. The target is removed from the result.
250 | For example:
251 | 
252 | >>> split3 0xA [0x1,0x2,0xA,0xB,0xA,0xA,0xA]
253 | Just ([0x1,0x2],[0xB],[],[0xA])
254 | -}
255 | split3 :: Word8 -> Bytes -> Maybe (Bytes, Bytes, Bytes, Bytes)
256 | {-# INLINE split3 #-}
257 | split3 w b@(Bytes arr off len) = case elemIndexLoop# w b of
258 |   (-1#) -> Nothing
259 |   i# ->
260 |     let i = I# i#
261 |      in case elemIndexLoop# w (Bytes arr (i + 1) (len - (1 + i - off))) of
262 |           (-1#) -> Nothing
263 |           j# ->
264 |             let j = I# j#
265 |              in case elemIndexLoop# w (Bytes arr (j + 1) (len - (1 + j - off))) of
266 |                   (-1#) -> Nothing
267 |                   k# ->
268 |                     let k = I# k#
269 |                      in Just
270 |                           ( Bytes arr off (i - off)
271 |                           , Bytes arr (i + 1) (j - (i + 1))
272 |                           , Bytes arr (j + 1) (k - (j + 1))
273 |                           , Bytes arr (k + 1) (len - (1 + k - off))
274 |                           )
275 | 
276 | {- | Split a byte sequence on the first, second, third, and fourth
277 | occurrences of the target byte. The target is removed from the result.
278 | For example:
279 | 
280 | >>> split4 0xA [0x1,0x2,0xA,0xB,0xA,0xA,0xA]
281 | Just ([0x1,0x2],[0xB],[],[],[])
282 | -}
283 | split4 :: Word8 -> Bytes -> Maybe (Bytes, Bytes, Bytes, Bytes, Bytes)
284 | {-# INLINE split4 #-}
285 | split4 w b@(Bytes arr off len) = case elemIndexLoop# w b of
286 |   (-1#) -> Nothing
287 |   i# ->
288 |     let i = I# i#
289 |      in case elemIndexLoop# w (Bytes arr (i + 1) (len - (1 + i - off))) of
290 |           (-1#) -> Nothing
291 |           j# ->
292 |             let j = I# j#
293 |              in case elemIndexLoop# w (Bytes arr (j + 1) (len - (1 + j - off))) of
294 |                   (-1#) -> Nothing
295 |                   k# ->
296 |                     let k = I# k#
297 |                      in case elemIndexLoop# w (Bytes arr (k + 1) (len - (1 + k - off))) of
298 |                           (-1#) -> Nothing
299 |                           m# ->
300 |                             let m = I# m#
301 |                              in Just
302 |                                   ( Bytes arr off (i - off)
303 |                                   , Bytes arr (i + 1) (j - (i + 1))
304 |                                   , Bytes arr (j + 1) (k - (j + 1))
305 |                                   , Bytes arr (k + 1) (m - (k + 1))
306 |                                   , Bytes arr (m + 1) (len - (1 + m - off))
307 |                                   )
308 | 
309 | -- This returns the offset into the byte array. This is not an index
310 | -- that will mean anything to the end user, so it cannot be returned
311 | -- to them.
312 | --
313 | -- Exported for use in other internal modules because it is needed in
314 | -- Data.Bytes.Search.
315 | elemIndexLoop# :: Word8 -> Bytes -> Int#
316 | {-# INLINE elemIndexLoop# #-}
317 | elemIndexLoop# !w (Bytes arr off@(I# off#) len) = case len of
318 |   0 -> (-1#)
319 |   _ ->
320 |     if PM.indexByteArray arr off == w
321 |       then off#
322 |       else elemIndexLoop# w (Bytes arr (off + 1) (len - 1))
323 | 
324 | -- Variant of elemIndexLoop# that starts at the end. Similarly, returns
325 | -- negative one if the element is not found.
326 | elemIndexLoopBackwards# :: Word8 -> ByteArray -> Int -> Int -> Int#
327 | elemIndexLoopBackwards# !w !arr !start !pos@(I# pos#) =
328 |   if pos < start
329 |     then (-1#)
330 |     else
331 |       if PM.indexByteArray arr pos == w
332 |         then pos#
333 |         else elemIndexLoopBackwards# w arr start (pos - 1)
334 | 
335 | {- | Split a byte sequence on the last occurrence of the target
336 | byte. The target is removed from the result. For example:
337 | 
338 | >>> split1 0xA [0x1,0x2,0xA,0xB,0xA,0xC]
339 | Just ([0x1,0x2,0xA,0xB],[0xC])
340 | -}
341 | splitEnd1 :: Word8 -> Bytes -> Maybe (Bytes, Bytes)
342 | {-# INLINE splitEnd1 #-}
343 | splitEnd1 !w (Bytes arr off len) = case elemIndexLoopBackwards# w arr off (off + len - 1) of
344 |   (-1#) -> Nothing
345 |   i# ->
346 |     let i = I# i#
347 |      in Just (Bytes arr off (i - off), Bytes arr (i + 1) (len - (1 + i - off)))
348 | 


--------------------------------------------------------------------------------
/src/Data/Bytes/Chunks.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns #-}
  2 | {-# LANGUAGE DerivingStrategies #-}
  3 | {-# LANGUAGE DuplicateRecordFields #-}
  4 | {-# LANGUAGE MagicHash #-}
  5 | {-# LANGUAGE NamedFieldPuns #-}
  6 | {-# LANGUAGE RankNTypes #-}
  7 | {-# LANGUAGE TypeApplications #-}
  8 | {-# LANGUAGE TypeFamilies #-}
  9 | {-# LANGUAGE UnboxedTuples #-}
 10 | 
 11 | {- | Chunks of bytes. This is useful as a target for a builder
 12 | or as a way to read a large amount of whose size is unknown
 13 | in advance. Structurally, this type is similar to
 14 | @Data.ByteString.Lazy.ByteString@. However, the type in this
 15 | module is strict in its spine. Additionally, none of the
 16 | @Handle@ functions perform lazy I\/O.
 17 | -}
 18 | module Data.Bytes.Chunks
 19 |   ( -- * Types
 20 |     Chunks (..)
 21 | 
 22 |     -- * Properties
 23 |   , length
 24 |   , null
 25 | 
 26 |     -- * Manipulate
 27 |   , cons
 28 |   , concat
 29 |   , concatPinned
 30 |   , concatU
 31 |   , concatPinnedU
 32 |   , concatByteString
 33 |   , reverse
 34 |   , reverseOnto
 35 |   , replicate
 36 |   , replicateByte
 37 | 
 38 |     -- * Folds
 39 |   , foldl'
 40 | 
 41 |     -- * Splitting
 42 |   , split
 43 | 
 44 |     -- * Hashing
 45 |   , fnv1a32
 46 |   , fnv1a64
 47 | 
 48 |     -- * Create
 49 |   , fromBytes
 50 |   , fromByteArray
 51 | 
 52 |     -- * Copy to buffer
 53 |   , unsafeCopy
 54 | 
 55 |     -- * I\/O with Handles
 56 |   , hGetContents
 57 |   , readFile
 58 |   , hPut
 59 |   , writeFile
 60 |   ) where
 61 | 
 62 | import Prelude hiding (Foldable (..), concat, readFile, replicate, reverse, writeFile)
 63 | 
 64 | import Control.Exception (IOException, catch)
 65 | import Control.Monad.ST.Run (runIntByteArrayST)
 66 | import Data.Bits (xor)
 67 | import Data.ByteString (ByteString)
 68 | import Data.Bytes.Types (Bytes (Bytes))
 69 | import Data.Primitive (ByteArray (..), MutableByteArray (..))
 70 | import Data.Word (Word32, Word64, Word8)
 71 | import GHC.Exts (ByteArray#, Int (I#), Int#, MutableByteArray#, State#, (+#))
 72 | import GHC.ST (ST (..))
 73 | import System.IO (Handle, IOMode (ReadMode, WriteMode), hFileSize, withBinaryFile)
 74 | 
 75 | import qualified Data.Bytes.Byte as Byte
 76 | import qualified Data.Bytes.IO as IO
 77 | import qualified Data.Bytes.Pure as Bytes
 78 | import qualified Data.Bytes.Types as B
 79 | import qualified Data.Primitive as PM
 80 | import qualified GHC.Exts as Exts
 81 | 
 82 | -- | A cons-list of byte sequences.
 83 | data Chunks
 84 |   = ChunksCons {-# UNPACK #-} !Bytes !Chunks
 85 |   | ChunksNil
 86 |   deriving stock (Show)
 87 | 
 88 | instance Semigroup Chunks where
 89 |   ChunksNil <> a = a
 90 |   cs@(ChunksCons _ _) <> ChunksNil = cs
 91 |   as@(ChunksCons _ _) <> bs@(ChunksCons _ _) =
 92 |     reverseOnto bs (reverse as)
 93 | 
 94 | instance Monoid Chunks where
 95 |   mempty = ChunksNil
 96 | 
 97 | -- | This uses @concat@ to form an equivalence class.
 98 | instance Eq Chunks where
 99 |   -- TODO: There is a more efficient way to do this, but
100 |   -- it is tedious.
101 |   a == b = concat a == concat b
102 | 
103 | -- | Add a byte sequence to the beginning.
104 | cons :: Bytes -> Chunks -> Chunks
105 | {-# INLINE cons #-}
106 | cons = ChunksCons
107 | 
108 | {- | Repeat the byte sequence over and over. Returns empty chunks when given
109 | a negative repetition count.
110 | -}
111 | replicate ::
112 |   Bytes ->
113 |   -- | Number of times to repeat the sequence.
114 |   Int ->
115 |   Chunks
116 | replicate !b@(Bytes _ _ len) !n
117 |   | n <= 0 = ChunksNil
118 |   | len == 0 = ChunksNil
119 |   | otherwise = go n ChunksNil
120 |  where
121 |   -- Implementation note: We do not have to reverse the chunks at the end.
122 |   go i !acc = case i of
123 |     0 -> acc
124 |     _ -> go (i - 1) (ChunksCons b acc)
125 | 
126 | {- | Repeat the byte over and over. This builds a single byte array that
127 | is at most 64KiB and shares that across every @ChunksCons@ cell.
128 | 
129 | An as example, creating a 2GiB chunks this way would use 64KiB for the
130 | byte array, and there would be the additional overhead of the 2^15
131 | @ChunksCons@ data constructors. On a 64-bit platform, @ChunksCons@
132 | takes 40 bytes, so the total memory consumption would be
133 | @2^16 + 40 * 2^15@, which is roughly 1.37MB. The same reasoning
134 | shows that it takes about 83.95MB to represent a 128GiB chunks.
135 | 
136 | The size of the shared payload is an implementation detail. Do not
137 | rely on this function producing 64KiB chunks. The implementation might
138 | one day change to something smarter that minimizes the memory footprint
139 | for very large chunks.
140 | -}
141 | replicateByte ::
142 |   Word8 ->
143 |   -- | Number of times to replicate the byte
144 |   Int ->
145 |   Chunks
146 | replicateByte !w !n
147 |   | n <= 0 = ChunksNil
148 |   | n < 65536 = ChunksCons (Bytes.replicate n w) ChunksNil
149 |   | otherwise = go (Bytes.replicateU 65536 w) n ChunksNil
150 |  where
151 |   go !shared !remaining !acc
152 |     | remaining == 0 = acc
153 |     | remaining < 65536 = ChunksCons (Bytes shared 0 remaining) acc
154 |     | otherwise = go shared (remaining - 65536) (ChunksCons (Bytes shared 0 65536) acc)
155 | 
156 | -- | Are there any bytes in the chunked byte sequences?
157 | null :: Chunks -> Bool
158 | null = go
159 |  where
160 |   go ChunksNil = True
161 |   go (ChunksCons (Bytes _ _ len) xs) = case len of
162 |     0 -> go xs
163 |     _ -> False
164 | 
165 | {- | Variant of 'concat' that ensure that the resulting byte
166 | sequence is pinned memory.
167 | -}
168 | concatPinned :: Chunks -> Bytes
169 | concatPinned x = case x of
170 |   ChunksNil -> Bytes.emptyPinned
171 |   ChunksCons b y -> case y of
172 |     ChunksNil -> Bytes.pin b
173 |     ChunksCons c z -> case concatPinnedFollowing2 b c z of
174 |       (# len, r #) -> Bytes (ByteArray r) 0 (I# len)
175 | 
176 | -- | Concatenate chunks into a strict bytestring.
177 | concatByteString :: Chunks -> ByteString
178 | concatByteString c = Bytes.pinnedToByteString (concatPinned c)
179 | 
180 | -- | Concatenate chunks into a single contiguous byte sequence.
181 | concat :: Chunks -> Bytes
182 | concat x = case x of
183 |   ChunksNil -> Bytes.empty
184 |   ChunksCons b y -> case y of
185 |     ChunksNil -> b
186 |     ChunksCons c z -> case concatFollowing2 b c z of
187 |       (# len, r #) -> Bytes (ByteArray r) 0 (I# len)
188 | 
189 | -- | Variant of 'concat' that returns an unsliced byte sequence.
190 | concatU :: Chunks -> ByteArray
191 | concatU x = case x of
192 |   ChunksNil -> mempty
193 |   ChunksCons b y -> case y of
194 |     ChunksNil -> Bytes.toByteArray b
195 |     ChunksCons c z -> case concatFollowing2 b c z of
196 |       (# _, r #) -> ByteArray r
197 | 
198 | -- | Variant of 'concatPinned' that returns an unsliced pinned byte sequence.
199 | concatPinnedU :: Chunks -> ByteArray
200 | concatPinnedU x = case x of
201 |   ChunksNil -> Bytes.emptyPinnedU
202 |   ChunksCons b y -> case y of
203 |     ChunksNil -> Bytes.toPinnedByteArray b
204 |     ChunksCons c z -> case concatPinnedFollowing2 b c z of
205 |       (# _, r #) -> ByteArray r
206 | 
207 | concatFollowing2 :: Bytes -> Bytes -> Chunks -> (# Int#, ByteArray# #)
208 | concatFollowing2 = internalConcatFollowing2 PM.newByteArray
209 | 
210 | concatPinnedFollowing2 :: Bytes -> Bytes -> Chunks -> (# Int#, ByteArray# #)
211 | concatPinnedFollowing2 = internalConcatFollowing2 PM.newPinnedByteArray
212 | 
213 | internalConcatFollowing2 ::
214 |   (forall s. Int -> ST s (MutableByteArray s)) ->
215 |   Bytes ->
216 |   Bytes ->
217 |   Chunks ->
218 |   (# Int#, ByteArray# #)
219 | {-# INLINE internalConcatFollowing2 #-}
220 | internalConcatFollowing2
221 |   allocate
222 |   (Bytes {array = c, offset = coff, length = szc})
223 |   (Bytes {array = d, offset = doff, length = szd})
224 |   ds =
225 |     let !(I# x, ByteArray y) = runIntByteArrayST $ do
226 |           let !szboth = szc + szd
227 |               !len = chunksLengthGo szboth ds
228 |           dst <- allocate len
229 |           PM.copyByteArray dst 0 c coff szc
230 |           PM.copyByteArray dst szc d doff szd
231 |           -- Note: len2 will always be the same as len.
232 |           !len2 <- unsafeCopy dst szboth ds
233 |           result <- PM.unsafeFreezeByteArray dst
234 |           pure (len2, result)
235 |      in (# x, y #)
236 | 
237 | -- | The total number of bytes in all the chunks.
238 | length :: Chunks -> Int
239 | length = chunksLengthGo 0
240 | 
241 | chunksLengthGo :: Int -> Chunks -> Int
242 | chunksLengthGo !n ChunksNil = n
243 | chunksLengthGo !n (ChunksCons (Bytes {B.length = len}) cs) =
244 |   chunksLengthGo (n + len) cs
245 | 
246 | {- | Copy the contents of the chunks into a mutable array.
247 | Precondition: The destination must have enough space to
248 | house the contents. This is not checked.
249 | -}
250 | unsafeCopy ::
251 |   -- | Destination
252 |   MutableByteArray s ->
253 |   -- | Destination offset
254 |   Int ->
255 |   -- | Source
256 |   Chunks ->
257 |   -- | Returns the next index into the destination after the payload
258 |   ST s Int
259 | {-# INLINE unsafeCopy #-}
260 | unsafeCopy (MutableByteArray dst) (I# off) cs =
261 |   ST
262 |     ( \s0 -> case copy# dst off cs s0 of
263 |         (# s1, nextOff #) -> (# s1, I# nextOff #)
264 |     )
265 | 
266 | copy# :: MutableByteArray# s -> Int# -> Chunks -> State# s -> (# State# s, Int# #)
267 | copy# _ off ChunksNil s0 = (# s0, off #)
268 | copy# marr off (ChunksCons (Bytes {B.array, B.offset, B.length = len}) cs) s0 =
269 |   case Exts.copyByteArray# (unBa array) (unI offset) marr off (unI len) s0 of
270 |     s1 -> copy# marr (off +# unI len) cs s1
271 | 
272 | -- | Reverse chunks but not the bytes within each chunk.
273 | reverse :: Chunks -> Chunks
274 | reverse = reverseOnto ChunksNil
275 | 
276 | {- | Variant of 'reverse' that allows the caller to provide
277 | an initial list of chunks that the reversed chunks will
278 | be pushed onto.
279 | -}
280 | reverseOnto :: Chunks -> Chunks -> Chunks
281 | reverseOnto !x ChunksNil = x
282 | reverseOnto !x (ChunksCons y ys) =
283 |   reverseOnto (ChunksCons y x) ys
284 | 
285 | unI :: Int -> Int#
286 | {-# INLINE unI #-}
287 | unI (I# i) = i
288 | 
289 | unBa :: ByteArray -> ByteArray#
290 | {-# INLINE unBa #-}
291 | unBa (ByteArray x) = x
292 | 
293 | -- | Read a handle's entire contents strictly into chunks.
294 | hGetContents :: Handle -> IO Chunks
295 | hGetContents !h = hGetContentsCommon ChunksNil h
296 | 
297 | -- | Read a handle's entire contents strictly into chunks.
298 | hGetContentsHint :: Int -> Handle -> IO Chunks
299 | hGetContentsHint !hint !h = do
300 |   c <- IO.hGet h hint
301 |   let !r = ChunksCons c ChunksNil
302 |   if Bytes.length c == hint
303 |     then pure r
304 |     else hGetContentsCommon r h
305 | 
306 | hGetContentsCommon ::
307 |   Chunks -> -- reversed chunks
308 |   Handle ->
309 |   IO Chunks
310 | hGetContentsCommon !acc0 !h = go acc0
311 |  where
312 |   go !acc = do
313 |     c <- IO.hGet h chunkSize
314 |     let !r = ChunksCons c acc
315 |     if Bytes.length c == chunkSize
316 |       then go r
317 |       else pure $! reverse r
318 | 
319 | {- | Read an entire file strictly into chunks. If reading from a
320 | regular file, this makes an effort read the file into a single
321 | chunk.
322 | -}
323 | readFile :: FilePath -> IO Chunks
324 | readFile f = withBinaryFile f ReadMode $ \h -> do
325 |   -- Implementation copied from bytestring.
326 |   -- hFileSize fails if file is not regular file (like
327 |   -- /dev/null). Catch exception and try reading anyway.
328 |   filesz <- catch (hFileSize h) useZeroIfNotRegularFile
329 |   let hint = (fromIntegral filesz `max` 255) + 1
330 |   hGetContentsHint hint h
331 |  where
332 |   -- Our initial size is one bigger than the file size so that in the
333 |   -- typical case we will read the whole file in one go and not have
334 |   -- to allocate any more chunks. We'll still do the right thing if the
335 |   -- file size is 0 or is changed before we do the read.
336 | 
337 |   useZeroIfNotRegularFile :: IOException -> IO Integer
338 |   useZeroIfNotRegularFile _ = return 0
339 | 
340 | chunkSize :: Int
341 | chunkSize = 16384 - 16
342 | 
343 | -- | Create a list of chunks with a single chunk.
344 | fromBytes :: Bytes -> Chunks
345 | fromBytes !b = ChunksCons b ChunksNil
346 | 
347 | -- | Variant of 'fromBytes' where the single chunk is unsliced.
348 | fromByteArray :: ByteArray -> Chunks
349 | fromByteArray !b = fromBytes (Bytes.fromByteArray b)
350 | 
351 | -- | Left fold over all bytes in the chunks, strict in the accumulator.
352 | foldl' :: (a -> Word8 -> a) -> a -> Chunks -> a
353 | {-# INLINE foldl' #-}
354 | foldl' g = go
355 |  where
356 |   go !a ChunksNil = a
357 |   go !a (ChunksCons c cs) = go (Bytes.foldl' g a c) cs
358 | 
359 | -- | Hash byte sequence with 32-bit variant of FNV-1a.
360 | fnv1a32 :: Chunks -> Word32
361 | fnv1a32 !b =
362 |   foldl'
363 |     ( \acc w -> (fromIntegral @Word8 @Word32 w `xor` acc) * 0x01000193
364 |     )
365 |     0x811c9dc5
366 |     b
367 | 
368 | -- | Hash byte sequence with 64-bit variant of FNV-1a.
369 | fnv1a64 :: Chunks -> Word64
370 | fnv1a64 !b =
371 |   foldl'
372 |     ( \acc w -> (fromIntegral @Word8 @Word64 w `xor` acc) * 0x00000100000001B3
373 |     )
374 |     0xcbf29ce484222325
375 |     b
376 | 
377 | {- | Outputs 'Chunks' to the specified 'Handle'. This is implemented
378 | with 'IO.hPut'.
379 | -}
380 | hPut :: Handle -> Chunks -> IO ()
381 | hPut h = go
382 |  where
383 |   go ChunksNil = pure ()
384 |   go (ChunksCons c cs) = IO.hPut h c *> go cs
385 | 
386 | {- | Write 'Chunks' to a file, replacing the previous contents of
387 | the file.
388 | -}
389 | writeFile :: FilePath -> Chunks -> IO ()
390 | writeFile path cs = withBinaryFile path WriteMode (\h -> hPut h cs)
391 | 
392 | {- | Break chunks of bytes into contiguous pieces separated by the
393 | byte argument. This is a good producer for list fusion. For this
394 | function to perform well, each chunk should contain multiple separators.
395 | Any piece that spans multiple chunks must be copied.
396 | -}
397 | split :: Word8 -> Chunks -> [Bytes]
398 | {-# INLINE split #-}
399 | split !w !cs0 =
400 |   Exts.build
401 |     ( \g x0 ->
402 |         -- It is possible to optimize for the common case where a
403 |         -- piece does not span multiple chunks. However, such an
404 |         -- optimization would actually cause this to tail call in
405 |         -- two places rather than one and may actually adversely
406 |         -- affect performance. It hasn't been benchmarked.
407 |         let go !cs = case splitOnto ChunksNil w cs of
408 |               (hd, tl) ->
409 |                 let !x = concat (reverse hd)
410 |                  in case tl of
411 |                       ChunksNil -> x0
412 |                       _ -> g x (go tl)
413 |          in go cs0
414 |     )
415 | 
416 | splitOnto :: Chunks -> Word8 -> Chunks -> (Chunks, Chunks)
417 | {-# INLINE splitOnto #-}
418 | splitOnto !acc0 !w !cs0 = go acc0 cs0
419 |  where
420 |   go !acc ChunksNil = (acc, ChunksNil)
421 |   go !acc (ChunksCons b bs) = case Byte.split1 w b of
422 |     Nothing -> go (ChunksCons b acc) bs
423 |     Just (hd, tl) ->
424 |       let !r1 = ChunksCons hd acc
425 |           !r2 = ChunksCons tl bs
426 |        in (r1, r2)
427 | 


--------------------------------------------------------------------------------
/src/Data/Bytes/Text/Latin1.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns #-}
  2 | {-# LANGUAGE MagicHash #-}
  3 | {-# LANGUAGE TypeApplications #-}
  4 | {-# LANGUAGE UnboxedTuples #-}
  5 | 
  6 | {- | This module treats 'Bytes' data as holding text encoded in ISO-8859-1. This
  7 | encoding can only encode codepoints strictly below @U+0100@, but this allows
  8 | each codepoint to be placed directly into a single byte. This range consists
  9 | of Unicode Basic Latin, Latin-1 Supplement and C0+C1 Controls, which includes
 10 | ASCII.
 11 | 
 12 | Strictly, ISO-8859-1 is not to be confused with ISO/IEC 8859-1 (which was the
 13 | default encoding for webpages before HTML5). ISO/IEC 8859-1 lacks encodings
 14 | for the C0 and C1 control characters.
 15 | 
 16 | With HTML5, the default encoding of webpages was changed to Windows-1252,
 17 | which is _not_ compatible with ISO-8859-1. Windows-1252 uses the C1 Control
 18 | range (@U+0080@ -- @U+009F@) mostly to encode a variety of printable
 19 | characters. For this encoding, see 'Data.Bytes.Text.Windows1252'.
 20 | -}
 21 | module Data.Bytes.Text.Latin1
 22 |   ( toString
 23 |   , fromString
 24 |   , decodeDecWord
 25 | 
 26 |     -- * Specialized Comparisons
 27 |   , equals1
 28 |   , equals2
 29 |   , equals3
 30 |   , equals4
 31 |   , equals5
 32 |   , equals6
 33 |   , equals7
 34 |   , equals8
 35 |   , equals9
 36 |   , equals10
 37 |   , equals11
 38 |   , equals12
 39 |   , equals13
 40 |   , equals14
 41 |   , equals15
 42 |   ) where
 43 | 
 44 | import Prelude hiding (length)
 45 | 
 46 | import Data.Bytes.Types (Bytes (..))
 47 | import Data.Char (chr, ord)
 48 | import Data.Primitive (ByteArray (ByteArray))
 49 | import Data.Word (Word8)
 50 | import GHC.Exts (Char (C#), Int (I#), Word (W#), Word#, int2Word#, ltWord#, or#)
 51 | 
 52 | import qualified Data.Bytes.Pure as Bytes
 53 | import qualified Data.Primitive as PM
 54 | import qualified GHC.Exts as Exts
 55 | 
 56 | {- | Convert a 'String' consisting of only characters representable
 57 | by ISO-8859-1. These are encoded with ISO-8859-1. Any character
 58 | with a codepoint above @U+00FF@ is replaced by an unspecified byte.
 59 | -}
 60 | fromString :: String -> Bytes
 61 | fromString =
 62 |   Bytes.fromByteArray . Exts.fromList . map (fromIntegral @Int @Word8 . ord)
 63 | 
 64 | -- | Interpret a byte sequence as text encoded by ISO-8859-1.
 65 | toString :: Bytes -> String
 66 | {-# INLINE toString #-}
 67 | toString = Bytes.foldr (\w xs -> chr (fromIntegral @Word8 @Int w) : xs) []
 68 | 
 69 | -- TODO presumably also fromText and fromShortText
 70 | 
 71 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
 72 | a singleton whose element matches the character?
 73 | -}
 74 | equals1 :: Char -> Bytes -> Bool
 75 | {-# INLINE equals1 #-}
 76 | equals1 !c0 (Bytes arr off len) = case len of
 77 |   1 -> c0 == indexCharArray arr off
 78 |   _ -> False
 79 | 
 80 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
 81 | a doubleton whose elements match the characters?
 82 | -}
 83 | equals2 :: Char -> Char -> Bytes -> Bool
 84 | equals2 !c0 !c1 (Bytes arr off len) = case len of
 85 |   2 ->
 86 |     c0 == indexCharArray arr off
 87 |       && c1 == indexCharArray arr (off + 1)
 88 |   _ -> False
 89 | 
 90 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
 91 | a tripleton whose elements match the characters?
 92 | -}
 93 | equals3 :: Char -> Char -> Char -> Bytes -> Bool
 94 | equals3 !c0 !c1 !c2 (Bytes arr off len) = case len of
 95 |   3 ->
 96 |     c0 == indexCharArray arr off
 97 |       && c1 == indexCharArray arr (off + 1)
 98 |       && c2 == indexCharArray arr (off + 2)
 99 |   _ -> False
100 | 
101 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
102 | a quadrupleton whose elements match the characters?
103 | -}
104 | equals4 :: Char -> Char -> Char -> Char -> Bytes -> Bool
105 | equals4 !c0 !c1 !c2 !c3 (Bytes arr off len) = case len of
106 |   4 ->
107 |     c0 == indexCharArray arr off
108 |       && c1 == indexCharArray arr (off + 1)
109 |       && c2 == indexCharArray arr (off + 2)
110 |       && c3 == indexCharArray arr (off + 3)
111 |   _ -> False
112 | 
113 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
114 | a quintupleton whose elements match the characters?
115 | -}
116 | equals5 :: Char -> Char -> Char -> Char -> Char -> Bytes -> Bool
117 | equals5 !c0 !c1 !c2 !c3 !c4 (Bytes arr off len) = case len of
118 |   5 ->
119 |     c0 == indexCharArray arr off
120 |       && c1 == indexCharArray arr (off + 1)
121 |       && c2 == indexCharArray arr (off + 2)
122 |       && c3 == indexCharArray arr (off + 3)
123 |       && c4 == indexCharArray arr (off + 4)
124 |   _ -> False
125 | 
126 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
127 | a sextupleton whose elements match the characters?
128 | -}
129 | equals6 :: Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool
130 | equals6 !c0 !c1 !c2 !c3 !c4 !c5 (Bytes arr off len) = case len of
131 |   6 ->
132 |     c0 == indexCharArray arr off
133 |       && c1 == indexCharArray arr (off + 1)
134 |       && c2 == indexCharArray arr (off + 2)
135 |       && c3 == indexCharArray arr (off + 3)
136 |       && c4 == indexCharArray arr (off + 4)
137 |       && c5 == indexCharArray arr (off + 5)
138 |   _ -> False
139 | 
140 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
141 | a septupleton whose elements match the characters?
142 | -}
143 | equals7 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool
144 | equals7 !c0 !c1 !c2 !c3 !c4 !c5 !c6 (Bytes arr off len) = case len of
145 |   7 ->
146 |     c0 == indexCharArray arr off
147 |       && c1 == indexCharArray arr (off + 1)
148 |       && c2 == indexCharArray arr (off + 2)
149 |       && c3 == indexCharArray arr (off + 3)
150 |       && c4 == indexCharArray arr (off + 4)
151 |       && c5 == indexCharArray arr (off + 5)
152 |       && c6 == indexCharArray arr (off + 6)
153 |   _ -> False
154 | 
155 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
156 | an octupleton whose elements match the characters?
157 | -}
158 | equals8 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool
159 | equals8 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 (Bytes arr off len) = case len of
160 |   8 ->
161 |     c0 == indexCharArray arr off
162 |       && c1 == indexCharArray arr (off + 1)
163 |       && c2 == indexCharArray arr (off + 2)
164 |       && c3 == indexCharArray arr (off + 3)
165 |       && c4 == indexCharArray arr (off + 4)
166 |       && c5 == indexCharArray arr (off + 5)
167 |       && c6 == indexCharArray arr (off + 6)
168 |       && c7 == indexCharArray arr (off + 7)
169 |   _ -> False
170 | 
171 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
172 | a 9-tuple whose elements match the characters?
173 | -}
174 | equals9 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool
175 | equals9 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 (Bytes arr off len) = case len of
176 |   9 ->
177 |     c0 == indexCharArray arr off
178 |       && c1 == indexCharArray arr (off + 1)
179 |       && c2 == indexCharArray arr (off + 2)
180 |       && c3 == indexCharArray arr (off + 3)
181 |       && c4 == indexCharArray arr (off + 4)
182 |       && c5 == indexCharArray arr (off + 5)
183 |       && c6 == indexCharArray arr (off + 6)
184 |       && c7 == indexCharArray arr (off + 7)
185 |       && c8 == indexCharArray arr (off + 8)
186 |   _ -> False
187 | 
188 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
189 | a 10-tuple whose elements match the characters?
190 | -}
191 | equals10 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool
192 | equals10 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 (Bytes arr off len) = case len of
193 |   10 ->
194 |     c0 == indexCharArray arr off
195 |       && c1 == indexCharArray arr (off + 1)
196 |       && c2 == indexCharArray arr (off + 2)
197 |       && c3 == indexCharArray arr (off + 3)
198 |       && c4 == indexCharArray arr (off + 4)
199 |       && c5 == indexCharArray arr (off + 5)
200 |       && c6 == indexCharArray arr (off + 6)
201 |       && c7 == indexCharArray arr (off + 7)
202 |       && c8 == indexCharArray arr (off + 8)
203 |       && c9 == indexCharArray arr (off + 9)
204 |   _ -> False
205 | 
206 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
207 | a 11-tuple whose elements match the characters?
208 | -}
209 | equals11 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool
210 | equals11 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 (Bytes arr off len) = case len of
211 |   11 ->
212 |     c0 == indexCharArray arr off
213 |       && c1 == indexCharArray arr (off + 1)
214 |       && c2 == indexCharArray arr (off + 2)
215 |       && c3 == indexCharArray arr (off + 3)
216 |       && c4 == indexCharArray arr (off + 4)
217 |       && c5 == indexCharArray arr (off + 5)
218 |       && c6 == indexCharArray arr (off + 6)
219 |       && c7 == indexCharArray arr (off + 7)
220 |       && c8 == indexCharArray arr (off + 8)
221 |       && c9 == indexCharArray arr (off + 9)
222 |       && c10 == indexCharArray arr (off + 10)
223 |   _ -> False
224 | 
225 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
226 | a 12-tuple whose elements match the characters?
227 | -}
228 | equals12 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool
229 | equals12 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 !c11 (Bytes arr off len) = case len of
230 |   12 ->
231 |     c0 == indexCharArray arr off
232 |       && c1 == indexCharArray arr (off + 1)
233 |       && c2 == indexCharArray arr (off + 2)
234 |       && c3 == indexCharArray arr (off + 3)
235 |       && c4 == indexCharArray arr (off + 4)
236 |       && c5 == indexCharArray arr (off + 5)
237 |       && c6 == indexCharArray arr (off + 6)
238 |       && c7 == indexCharArray arr (off + 7)
239 |       && c8 == indexCharArray arr (off + 8)
240 |       && c9 == indexCharArray arr (off + 9)
241 |       && c10 == indexCharArray arr (off + 10)
242 |       && c11 == indexCharArray arr (off + 11)
243 |   _ -> False
244 | 
245 | equals13 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool
246 | equals13 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 !c11 !c12 (Bytes arr off len) = case len of
247 |   13 ->
248 |     c0 == indexCharArray arr off
249 |       && c1 == indexCharArray arr (off + 1)
250 |       && c2 == indexCharArray arr (off + 2)
251 |       && c3 == indexCharArray arr (off + 3)
252 |       && c4 == indexCharArray arr (off + 4)
253 |       && c5 == indexCharArray arr (off + 5)
254 |       && c6 == indexCharArray arr (off + 6)
255 |       && c7 == indexCharArray arr (off + 7)
256 |       && c8 == indexCharArray arr (off + 8)
257 |       && c9 == indexCharArray arr (off + 9)
258 |       && c10 == indexCharArray arr (off + 10)
259 |       && c11 == indexCharArray arr (off + 11)
260 |       && c12 == indexCharArray arr (off + 12)
261 |   _ -> False
262 | 
263 | equals14 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool
264 | equals14 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 !c11 !c12 !c13 (Bytes arr off len) = case len of
265 |   14 ->
266 |     c0 == indexCharArray arr off
267 |       && c1 == indexCharArray arr (off + 1)
268 |       && c2 == indexCharArray arr (off + 2)
269 |       && c3 == indexCharArray arr (off + 3)
270 |       && c4 == indexCharArray arr (off + 4)
271 |       && c5 == indexCharArray arr (off + 5)
272 |       && c6 == indexCharArray arr (off + 6)
273 |       && c7 == indexCharArray arr (off + 7)
274 |       && c8 == indexCharArray arr (off + 8)
275 |       && c9 == indexCharArray arr (off + 9)
276 |       && c10 == indexCharArray arr (off + 10)
277 |       && c11 == indexCharArray arr (off + 11)
278 |       && c12 == indexCharArray arr (off + 12)
279 |       && c13 == indexCharArray arr (off + 13)
280 |   _ -> False
281 | 
282 | equals15 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool
283 | equals15 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 !c11 !c12 !c13 !c14 (Bytes arr off len) = case len of
284 |   15 ->
285 |     c0 == indexCharArray arr off
286 |       && c1 == indexCharArray arr (off + 1)
287 |       && c2 == indexCharArray arr (off + 2)
288 |       && c3 == indexCharArray arr (off + 3)
289 |       && c4 == indexCharArray arr (off + 4)
290 |       && c5 == indexCharArray arr (off + 5)
291 |       && c6 == indexCharArray arr (off + 6)
292 |       && c7 == indexCharArray arr (off + 7)
293 |       && c8 == indexCharArray arr (off + 8)
294 |       && c9 == indexCharArray arr (off + 9)
295 |       && c10 == indexCharArray arr (off + 10)
296 |       && c11 == indexCharArray arr (off + 11)
297 |       && c12 == indexCharArray arr (off + 12)
298 |       && c13 == indexCharArray arr (off + 13)
299 |       && c14 == indexCharArray arr (off + 14)
300 |   _ -> False
301 | 
302 | indexCharArray :: ByteArray -> Int -> Char
303 | indexCharArray (ByteArray arr) (I# off) = C# (Exts.indexCharArray# arr off)
304 | 
305 | {- | Decode machine-sized word from decimal representation. Returns
306 | Nothing on overflow. Allows any number of leading zeros. Trailing
307 | non-digit bytes cause Nothing to be returned.
308 | -}
309 | decodeDecWord :: Bytes -> Maybe Word
310 | {-# INLINE decodeDecWord #-}
311 | decodeDecWord !b = case decWordStart b of
312 |   (# (# #) | #) -> Nothing
313 |   (# | w #) -> Just (W# w)
314 | 
315 | decWordStart ::
316 |   Bytes -> -- Chunk
317 |   (# (# #) | Word# #)
318 | {-# NOINLINE decWordStart #-}
319 | decWordStart !chunk0 =
320 |   if length chunk0 > 0
321 |     then
322 |       let !w =
323 |             fromIntegral @Word8 @Word
324 |               (PM.indexByteArray (array chunk0) (offset chunk0))
325 |               - 48
326 |        in if w < 10
327 |             then decWordMore w (Bytes.unsafeDrop 1 chunk0)
328 |             else (# (# #) | #)
329 |     else (# (# #) | #)
330 |  where
331 |   decWordMore ::
332 |     Word -> -- Accumulator
333 |     Bytes -> -- Chunk
334 |     (# (# #) | Word# #)
335 |   decWordMore !acc !chunk =
336 |     let len = length chunk
337 |      in case len of
338 |           0 -> (# | unW (fromIntegral acc) #)
339 |           _ ->
340 |             let !w =
341 |                   fromIntegral @Word8 @Word
342 |                     (PM.indexByteArray (array chunk) (offset chunk))
343 |                     - 48
344 |              in if w < 10
345 |                   then
346 |                     let (overflow, acc') = unsignedPushBase10 acc w
347 |                      in if overflow
348 |                           then (# (# #) | #)
349 |                           else decWordMore acc' (Bytes.unsafeDrop 1 chunk)
350 |                   else (# (# #) | #)
351 | 
352 | unsignedPushBase10 :: Word -> Word -> (Bool, Word)
353 | {-# INLINE unsignedPushBase10 #-}
354 | unsignedPushBase10 (W# a) (W# b) =
355 |   let !(# ca, r0 #) = Exts.timesWord2# a 10##
356 |       !r1 = Exts.plusWord# r0 b
357 |       !cb = int2Word# (ltWord# r1 r0)
358 |       !c = ca `or#` cb
359 |    in (case c of 0## -> False; _ -> True, W# r1)
360 | 
361 | unW :: Word -> Word#
362 | {-# INLINE unW #-}
363 | unW (W# w) = w
364 | 


--------------------------------------------------------------------------------
/src/Data/Bytes/Pure.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns #-}
  2 | {-# LANGUAGE BlockArguments #-}
  3 | {-# LANGUAGE MagicHash #-}
  4 | {-# LANGUAGE TypeApplications #-}
  5 | 
  6 | module Data.Bytes.Pure
  7 |   ( empty
  8 |   , emptyPinned
  9 |   , emptyPinnedU
 10 |   , pin
 11 |   , contents
 12 |   , unsafeCopy
 13 |   , toByteArray
 14 |   , toByteArrayClone
 15 |   , toPinnedByteArray
 16 |   , toPinnedByteArrayClone
 17 |   , fromByteArray
 18 |   , fromPrimArray
 19 |   , length
 20 |   , foldlM
 21 |   , foldrM
 22 |   , foldl
 23 |   , foldl'
 24 |   , foldr
 25 |   , ifoldl'
 26 |   , foldr'
 27 |   , fnv1a32
 28 |   , fnv1a64
 29 |   , toByteString
 30 |   , pinnedToByteString
 31 |   , fromByteString
 32 |   , fromLazyByteString
 33 |   , unsafeDrop
 34 |   , unsafeTake
 35 |   , unsafeIndex
 36 |   , unsafeHead
 37 |   , map
 38 |   , mapU
 39 |   , null
 40 |   , toShortByteString
 41 |   , replicate
 42 |   , replicateU
 43 |   , splitTetragram1
 44 |   , findTetragramIndex
 45 |   , countWhile
 46 |   , countWhileEnd
 47 |   , any
 48 |   , all
 49 |   ) where
 50 | 
 51 | import Prelude hiding (Foldable (..), map, replicate, any, all)
 52 | 
 53 | import Control.Monad.Primitive (PrimMonad, PrimState)
 54 | import Control.Monad.ST.Run (runByteArrayST)
 55 | import Data.Bits (unsafeShiftL, (.|.))
 56 | import Data.Bits (xor)
 57 | import Data.ByteString (ByteString)
 58 | import Data.ByteString.Short.Internal (ShortByteString (SBS))
 59 | import Data.Bytes.Types (Bytes (Bytes))
 60 | import Data.Primitive (ByteArray (ByteArray), MutableByteArray, PrimArray (PrimArray))
 61 | import Data.Word (Word32, Word64, Word8)
 62 | import Foreign.Ptr (Ptr, plusPtr)
 63 | import GHC.IO (unsafeIOToST)
 64 | 
 65 | import qualified Data.ByteString as ByteString
 66 | import qualified Data.ByteString.Internal as ByteString
 67 | import qualified Data.ByteString.Lazy as LBS
 68 | import qualified Data.ByteString.Lazy.Internal as LBS
 69 | import qualified Data.ByteString.Unsafe as ByteString
 70 | import qualified Data.Primitive as PM
 71 | import qualified GHC.Exts as Exts
 72 | import qualified GHC.ForeignPtr as ForeignPtr
 73 | 
 74 | -- | The empty byte sequence.
 75 | empty :: Bytes
 76 | empty = Bytes mempty 0 0
 77 | 
 78 | -- | The empty pinned byte sequence.
 79 | emptyPinned :: Bytes
 80 | emptyPinned =
 81 |   Bytes
 82 |     ( runByteArrayST
 83 |         (PM.newPinnedByteArray 0 >>= PM.unsafeFreezeByteArray)
 84 |     )
 85 |     0
 86 |     0
 87 | 
 88 | -- | The empty pinned byte sequence.
 89 | emptyPinnedU :: ByteArray
 90 | emptyPinnedU =
 91 |   runByteArrayST
 92 |     (PM.newPinnedByteArray 0 >>= PM.unsafeFreezeByteArray)
 93 | 
 94 | {- | Yields a pinned byte sequence whose contents are identical to those
 95 | of the original byte sequence. If the @ByteArray@ backing the argument
 96 | was already pinned, this simply aliases the argument and does not perform
 97 | any copying.
 98 | -}
 99 | pin :: Bytes -> Bytes
100 | pin b@(Bytes arr _ len) = case PM.isByteArrayPinned arr of
101 |   True -> b
102 |   False ->
103 |     Bytes
104 |       ( runByteArrayST do
105 |           dst <- PM.newPinnedByteArray len
106 |           unsafeCopy dst 0 b
107 |           PM.unsafeFreezeByteArray dst
108 |       )
109 |       0
110 |       len
111 | 
112 | {- | Convert the sliced 'Bytes' to an unsliced 'ByteArray'. This
113 | reuses the array backing the sliced 'Bytes' if the slicing metadata
114 | implies that all of the bytes are used. Otherwise, it makes a copy.
115 | -}
116 | toByteArray :: Bytes -> ByteArray
117 | {-# INLINE toByteArray #-}
118 | toByteArray b@(Bytes arr off len)
119 |   | off == 0, PM.sizeofByteArray arr == len = arr
120 |   | otherwise = toByteArrayClone b
121 | 
122 | {- | Variant of 'toByteArray' that unconditionally makes a copy of
123 | the array backing the sliced 'Bytes' even if the original array
124 | could be reused. Prefer 'toByteArray'.
125 | -}
126 | toByteArrayClone :: Bytes -> ByteArray
127 | {-# INLINE toByteArrayClone #-}
128 | toByteArrayClone (Bytes arr off len) = runByteArrayST $ do
129 |   m <- PM.newByteArray len
130 |   PM.copyByteArray m 0 arr off len
131 |   PM.unsafeFreezeByteArray m
132 | 
133 | {- | Copy the byte sequence into a mutable buffer. The buffer must have
134 | enough space to accomodate the byte sequence, but this this is not
135 | checked.
136 | -}
137 | unsafeCopy ::
138 |   (PrimMonad m) =>
139 |   -- | Destination
140 |   MutableByteArray (PrimState m) ->
141 |   -- | Destination Offset
142 |   Int ->
143 |   -- | Source
144 |   Bytes ->
145 |   m ()
146 | {-# INLINE unsafeCopy #-}
147 | unsafeCopy dst dstIx (Bytes src srcIx len) =
148 |   PM.copyByteArray dst dstIx src srcIx len
149 | 
150 | -- | Create a slice of 'Bytes' that spans the entire argument array.
151 | fromByteArray :: ByteArray -> Bytes
152 | {-# INLINE fromByteArray #-}
153 | fromByteArray b = Bytes b 0 (PM.sizeofByteArray b)
154 | 
155 | -- | Create a slice of 'Bytes' that spans the entire 'PrimArray' of 8-bit words.
156 | fromPrimArray :: PrimArray Word8 -> Bytes
157 | {-# INLINE fromPrimArray #-}
158 | fromPrimArray p@(PrimArray b) = Bytes (ByteArray b) 0 (PM.sizeofPrimArray p)
159 | 
160 | -- | The length of a slice of bytes.
161 | length :: Bytes -> Int
162 | {-# INLINE length #-}
163 | length (Bytes _ _ len) = len
164 | 
165 | -- | Hash byte sequence with 32-bit variant of FNV-1a.
166 | fnv1a32 :: Bytes -> Word32
167 | fnv1a32 !b =
168 |   foldl'
169 |     ( \acc w -> (fromIntegral @Word8 @Word32 w `xor` acc) * 0x01000193
170 |     )
171 |     0x811c9dc5
172 |     b
173 | 
174 | -- | Hash byte sequence with 64-bit variant of FNV-1a.
175 | fnv1a64 :: Bytes -> Word64
176 | fnv1a64 !b =
177 |   foldl'
178 |     ( \acc w -> (fromIntegral @Word8 @Word64 w `xor` acc) * 0x00000100000001B3
179 |     )
180 |     0xcbf29ce484222325
181 |     b
182 | 
183 | -- | Left fold over bytes, non-strict in the accumulator.
184 | foldl :: (a -> Word8 -> a) -> a -> Bytes -> a
185 | {-# INLINE foldl #-}
186 | foldl f a0 (Bytes arr off0 len0) =
187 |   go (off0 + len0 - 1) (len0 - 1)
188 |  where
189 |   go !off !ix = case ix of
190 |     (-1) -> a0
191 |     _ -> f (go (off - 1) (ix - 1)) (PM.indexByteArray arr off)
192 | 
193 | -- | Left fold over bytes, strict in the accumulator.
194 | foldl' :: (a -> Word8 -> a) -> a -> Bytes -> a
195 | {-# INLINE foldl' #-}
196 | foldl' f a0 (Bytes arr off0 len0) = go a0 off0 len0
197 |  where
198 |   go !a !off !len = case len of
199 |     0 -> a
200 |     _ -> go (f a (PM.indexByteArray arr off)) (off + 1) (len - 1)
201 | 
202 | -- | Left monadic fold over bytes, non-strict in the accumulator.
203 | foldlM :: (Monad m) => (a -> Word8 -> m a) -> a -> Bytes -> m a
204 | {-# INLINE foldlM #-}
205 | foldlM f a0 (Bytes arr off0 len0) = go a0 off0 len0
206 |  where
207 |   go a !off !len = case len of
208 |     0 -> pure a
209 |     _ -> do
210 |       a' <- f a (PM.indexByteArray arr off)
211 |       go a' (off + 1) (len - 1)
212 | 
213 | -- | Right fold over bytes, non-strict in the accumulator.
214 | foldr :: (Word8 -> a -> a) -> a -> Bytes -> a
215 | {-# INLINE foldr #-}
216 | foldr f a0 (Bytes arr off0 len0) = go off0 len0
217 |  where
218 |   go !off !len = case len of
219 |     0 -> a0
220 |     _ -> f (PM.indexByteArray arr off) (go (off + 1) (len - 1))
221 | 
222 | {- | Left fold over bytes, strict in the accumulator. The reduction function
223 | is applied to each element along with its index.
224 | -}
225 | ifoldl' :: (a -> Int -> Word8 -> a) -> a -> Bytes -> a
226 | {-# INLINE ifoldl' #-}
227 | ifoldl' f a0 (Bytes arr off0 len0) = go a0 0 off0 len0
228 |  where
229 |   go !a !ix !off !len = case len of
230 |     0 -> a
231 |     _ -> go (f a ix (PM.indexByteArray arr off)) (ix + 1) (off + 1) (len - 1)
232 | 
233 | -- | Right fold over bytes, strict in the accumulator.
234 | foldr' :: (Word8 -> a -> a) -> a -> Bytes -> a
235 | {-# INLINE foldr' #-}
236 | foldr' f a0 (Bytes arr off0 len0) =
237 |   go a0 (off0 + len0 - 1) (len0 - 1)
238 |  where
239 |   go !a !off !ix = case ix of
240 |     (-1) -> a
241 |     _ -> go (f (PM.indexByteArray arr off) a) (off - 1) (ix - 1)
242 | 
243 | -- | Right monadic fold over bytes, non-strict in the accumulator.
244 | foldrM :: (Monad m) => (Word8 -> a -> m a) -> a -> Bytes -> m a
245 | {-# INLINE foldrM #-}
246 | foldrM f a0 (Bytes arr off0 len0) =
247 |   go a0 (off0 + len0 - 1) (len0 - 1)
248 |  where
249 |   go !a !off !ix = case ix of
250 |     (-1) -> pure a
251 |     _ -> do
252 |       a' <- f (PM.indexByteArray arr off) a
253 |       go a' (off - 1) (ix - 1)
254 | 
255 | {- | Yields a pointer to the beginning of the byte sequence. It is only safe
256 | to call this on a 'Bytes' backed by a pinned @ByteArray@.
257 | -}
258 | contents :: Bytes -> Ptr Word8
259 | {-# INLINE contents #-}
260 | contents (Bytes arr off _) = plusPtr (PM.byteArrayContents arr) off
261 | 
262 | {- | Convert the sliced 'Bytes' to an unsliced 'ByteArray'. This
263 | reuses the array backing the sliced 'Bytes' if the slicing metadata
264 | implies that all of the bytes are used and they are already pinned.
265 | Otherwise, it makes a copy.
266 | -}
267 | toPinnedByteArray :: Bytes -> ByteArray
268 | {-# INLINE toPinnedByteArray #-}
269 | toPinnedByteArray b@(Bytes arr off len)
270 |   | off == 0, PM.sizeofByteArray arr == len, PM.isByteArrayPinned arr = arr
271 |   | otherwise = toPinnedByteArrayClone b
272 | 
273 | {- | Variant of 'toPinnedByteArray' that unconditionally makes a copy of
274 | the array backing the sliced 'Bytes' even if the original array
275 | could be reused. Prefer 'toPinnedByteArray'.
276 | -}
277 | toPinnedByteArrayClone :: Bytes -> ByteArray
278 | toPinnedByteArrayClone (Bytes arr off len) = runByteArrayST $ do
279 |   m <- PM.newPinnedByteArray len
280 |   PM.copyByteArray m 0 arr off len
281 |   PM.unsafeFreezeByteArray m
282 | 
283 | {- | /O(n)/ when unpinned, /O(1)/ when pinned. Create a 'ByteString' from
284 | a byte sequence. This only copies the byte sequence if it is not pinned.
285 | -}
286 | toByteString :: Bytes -> ByteString
287 | toByteString !b = pinnedToByteString (pin b)
288 | 
289 | {- | Convert a pinned 'Bytes' to a 'ByteString'
290 | /O(1)/ Precondition: bytes are pinned. Behavior is undefined otherwise.
291 | -}
292 | pinnedToByteString :: Bytes -> ByteString
293 | pinnedToByteString (Bytes y@(PM.ByteArray x) off len) =
294 |   ByteString.PS
295 |     ( ForeignPtr.ForeignPtr
296 |         (case plusPtr (PM.byteArrayContents y) off of Exts.Ptr p -> p)
297 |         (ForeignPtr.PlainPtr (Exts.unsafeCoerce# x))
298 |     )
299 |     0
300 |     len
301 | 
302 | -- | /O(n)/ Copy a 'ByteString' to a byte sequence.
303 | fromByteString :: ByteString -> Bytes
304 | fromByteString !b =
305 |   Bytes
306 |     ( runByteArrayST $ unsafeIOToST $ do
307 |         dst@(PM.MutableByteArray dst#) <- PM.newByteArray len
308 |         ByteString.unsafeUseAsCString b $ \src -> do
309 |           PM.copyPtrToMutablePrimArray (PM.MutablePrimArray dst#) 0 src len
310 |         PM.unsafeFreezeByteArray dst
311 |     )
312 |     0
313 |     len
314 |  where
315 |   !len = ByteString.length b
316 | 
317 | -- | /O(n)/ Copy a lazy bytestring to a byte sequence.
318 | fromLazyByteString :: LBS.ByteString -> Bytes
319 | fromLazyByteString x = case LBS.length x of
320 |   0 -> empty
321 |   n64 ->
322 |     let n = fromIntegral n64 :: Int
323 |      in Bytes
324 |           ( runByteArrayST $ unsafeIOToST $ do
325 |               dst@(PM.MutableByteArray dst#) <- PM.newByteArray n
326 |               let loop chunks !ix = case chunks of
327 |                     LBS.Empty -> PM.unsafeFreezeByteArray dst
328 |                     LBS.Chunk c cs -> do
329 |                       let !len = ByteString.length c
330 |                       ByteString.unsafeUseAsCString c $ \src -> do
331 |                         PM.copyPtrToMutablePrimArray (PM.MutablePrimArray dst#) ix src len
332 |                       loop cs (ix + len)
333 |               loop x 0
334 |           )
335 |           0
336 |           n
337 | 
338 | -- | Drop the first @n@ bytes from the argument. Precondition: @n ≤ len@
339 | unsafeDrop :: Int -> Bytes -> Bytes
340 | {-# INLINE unsafeDrop #-}
341 | unsafeDrop n (Bytes arr off len) =
342 |   Bytes arr (off + n) (len - n)
343 | 
344 | -- | Variant of 'map' that returns unsliced byte sequence.
345 | mapU :: (Word8 -> Word8) -> Bytes -> ByteArray
346 | {-# INLINE mapU #-}
347 | mapU f (Bytes array ix0 len) = runByteArrayST do
348 |   dst <- PM.newByteArray len
349 |   let go !srcIx !dstIx =
350 |         if dstIx < len
351 |           then do
352 |             let w = PM.indexByteArray array srcIx :: Word8
353 |             PM.writeByteArray dst dstIx (f w)
354 |             go (srcIx + 1) (dstIx + 1)
355 |           else PM.unsafeFreezeByteArray dst
356 |   go ix0 0
357 | 
358 | {- | Map over bytes in a sequence. The result has the same length as
359 | the argument.
360 | -}
361 | map :: (Word8 -> Word8) -> Bytes -> Bytes
362 | {-# INLINE map #-}
363 | map f !b = Bytes (mapU f b) 0 (length b)
364 | 
365 | -- | Is the byte sequence empty?
366 | null :: Bytes -> Bool
367 | {-# INLINE null #-}
368 | null (Bytes _ _ len) = len == 0
369 | 
370 | -- | Take the first @n@ bytes from the argument. Precondition: @n ≤ len@
371 | unsafeTake :: Int -> Bytes -> Bytes
372 | {-# INLINE unsafeTake #-}
373 | unsafeTake n (Bytes arr off _) =
374 |   Bytes arr off n
375 | 
376 | {- | Index into the byte sequence at the given position. This index
377 | must be less than the length.
378 | -}
379 | unsafeIndex :: Bytes -> Int -> Word8
380 | {-# INLINE unsafeIndex #-}
381 | unsafeIndex (Bytes arr off _) ix = PM.indexByteArray arr (off + ix)
382 | 
383 | -- | Access the first byte. The given 'Bytes' must be non-empty.
384 | {-# INLINE unsafeHead #-}
385 | unsafeHead :: Bytes -> Word8
386 | unsafeHead bs = unsafeIndex bs 0
387 | 
388 | {- | Convert the sliced 'Bytes' to an unsliced 'ShortByteString'. This
389 | reuses the array backing the sliced 'Bytes' if the slicing metadata
390 | implies that all of the bytes are used. Otherwise, it makes a copy.
391 | -}
392 | toShortByteString :: Bytes -> ShortByteString
393 | {-# INLINE toShortByteString #-}
394 | toShortByteString !b = case toByteArray b of
395 |   PM.ByteArray x -> SBS x
396 | 
397 | -- | Replicate a byte @n@ times.
398 | replicate ::
399 |   -- | Desired length @n@
400 |   Int ->
401 |   -- | Byte to replicate
402 |   Word8 ->
403 |   Bytes
404 | replicate !n !w = Bytes (replicateU n w) 0 n
405 | 
406 | -- | Variant of 'replicate' that returns a unsliced byte array.
407 | replicateU :: Int -> Word8 -> ByteArray
408 | replicateU !n !w = runByteArrayST do
409 |   arr <- PM.newByteArray n
410 |   PM.setByteArray arr 0 n w
411 |   PM.unsafeFreezeByteArray arr
412 | 
413 | splitTetragram1 ::
414 |   Word8 ->
415 |   Word8 ->
416 |   Word8 ->
417 |   Word8 ->
418 |   Bytes ->
419 |   Maybe (Bytes, Bytes)
420 | splitTetragram1 !w0 !w1 !w2 !w3 !b = case findTetragramIndex w0 w1 w2 w3 b of
421 |   Nothing -> Nothing
422 |   Just n -> Just (unsafeTake n b, unsafeDrop (n + 4) b)
423 | 
424 | findTetragramIndex ::
425 |   Word8 ->
426 |   Word8 ->
427 |   Word8 ->
428 |   Word8 ->
429 |   Bytes ->
430 |   Maybe Int
431 | findTetragramIndex !w0 !w1 !w2 !w3 (Bytes arr off len) =
432 |   if len < 4
433 |     then Nothing
434 |     else
435 |       let !target =
436 |             unsafeShiftL (fromIntegral w0 :: Word32) 24
437 |               .|. unsafeShiftL (fromIntegral w1 :: Word32) 16
438 |               .|. unsafeShiftL (fromIntegral w2 :: Word32) 8
439 |               .|. unsafeShiftL (fromIntegral w3 :: Word32) 0
440 |           !end = off + len
441 |           go !ix !acc =
442 |             if acc == target
443 |               then
444 |                 let n = ix - off
445 |                  in Just (n - 4)
446 |               else
447 |                 if ix < end
448 |                   then
449 |                     let !w = PM.indexByteArray arr ix :: Word8
450 |                         acc' =
451 |                           (fromIntegral w :: Word32)
452 |                             .|. unsafeShiftL acc 8
453 |                      in go (ix + 1) acc'
454 |                   else Nothing
455 |           !acc0 =
456 |             unsafeShiftL (fromIntegral (PM.indexByteArray arr 0 :: Word8) :: Word32) 24
457 |               .|. unsafeShiftL (fromIntegral (PM.indexByteArray arr 1 :: Word8) :: Word32) 16
458 |               .|. unsafeShiftL (fromIntegral (PM.indexByteArray arr 2 :: Word8) :: Word32) 8
459 |               .|. unsafeShiftL (fromIntegral (PM.indexByteArray arr 3 :: Word8) :: Word32) 0
460 |        in go 4 acc0
461 | 
462 | -- Internal. The returns the number of bytes that match the
463 | -- predicate until the first non-match occurs. If all bytes
464 | -- match the predicate, this will return the length originally
465 | -- provided.
466 | countWhile :: (Word8 -> Bool) -> Bytes -> Int
467 | {-# INLINE countWhile #-}
468 | countWhile k (Bytes arr off0 len0) = go off0 len0 0
469 |  where
470 |   go !off !len !n =
471 |     if len > 0
472 |       then
473 |         if k (PM.indexByteArray arr off)
474 |           then go (off + 1) (len - 1) (n + 1)
475 |           else n
476 |       else n
477 | 
478 | -- Internal. Variant of countWhile that starts from the end
479 | -- of the string instead of the beginning.
480 | countWhileEnd :: (Word8 -> Bool) -> Bytes -> Int
481 | {-# INLINE countWhileEnd #-}
482 | countWhileEnd k (Bytes arr off0 len0) = go (off0 + len0 - 1) (len0 - 1) 0
483 |  where
484 |   go !off !len !n =
485 |     if len >= 0
486 |       then
487 |         if k (PM.indexByteArray arr off)
488 |           then go (off - 1) (len - 1) (n + 1)
489 |           else n
490 |       else n
491 | 
492 | -- | /O(n)/ Returns true if any byte in the sequence satisfies the predicate.
493 | any :: (Word8 -> Bool) -> Bytes -> Bool
494 | {-# INLINE any #-}
495 | any f = foldr (\b r -> f b || r) False
496 | 
497 | -- | /O(n)/ Returns true if all bytes in the sequence satisfy the predicate.
498 | all :: (Word8 -> Bool) -> Bytes -> Bool
499 | {-# INLINE all #-}
500 | all f = foldr (\b r -> f b && r) True
501 | 
502 | 


--------------------------------------------------------------------------------
/test/Main.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE MultiWayIf #-}
  2 | {-# LANGUAGE BangPatterns #-}
  3 | {-# LANGUAGE OverloadedStrings #-}
  4 | {-# LANGUAGE ScopedTypeVariables #-}
  5 | {-# LANGUAGE TypeApplications #-}
  6 | {-# OPTIONS_GHC -fno-warn-orphans #-}
  7 | 
  8 | import Control.Monad.Trans.Writer (Writer, tell)
  9 | import Data.Bytes.Chunks (Chunks (ChunksCons, ChunksNil))
 10 | import Data.Bytes.Types (Bytes (Bytes))
 11 | import Data.Char (ord)
 12 | import Data.Primitive (ByteArray)
 13 | import Data.Proxy (Proxy (..))
 14 | import Data.Text (Text)
 15 | import Data.Word (Word8)
 16 | import Test.Tasty (TestTree, defaultMain, testGroup)
 17 | import Test.Tasty.HUnit (testCase, (@=?))
 18 | import Test.Tasty.QuickCheck
 19 |   ( ASCIIString (ASCIIString)
 20 |   , Arbitrary
 21 |   , Discard (Discard)
 22 |   , property
 23 |   , testProperty
 24 |   , (===)
 25 |   , (==>)
 26 |   )
 27 | 
 28 | import qualified Data.ByteString as ByteString
 29 | import qualified Data.Bytes as Bytes
 30 | import qualified Data.Bytes.Chunks as Chunks
 31 | import qualified Data.Bytes.Text.Ascii as Ascii
 32 | import qualified Data.Bytes.Text.AsciiExt as AsciiExt
 33 | import qualified Data.Bytes.Text.Latin1 as Latin1
 34 | import qualified Data.Foldable as Foldable
 35 | import qualified Data.List as List
 36 | import qualified Data.Primitive as PM
 37 | import qualified Data.Text as Text
 38 | import qualified Data.Text.Encoding as Text.Encoding
 39 | import qualified GHC.Exts as Exts
 40 | import qualified Test.QuickCheck.Classes as QCC
 41 | import qualified Test.Tasty.HUnit as THU
 42 | import qualified Test.Tasty.QuickCheck as TQC
 43 | import qualified Data.Text.Array as A
 44 | import qualified Data.Text.Internal as I
 45 | 
 46 | main :: IO ()
 47 | main = defaultMain tests
 48 | 
 49 | tests :: TestTree
 50 | tests =
 51 |   testGroup
 52 |     "Bytes"
 53 |     [ lawsToTest (QCC.eqLaws (Proxy :: Proxy Bytes))
 54 |     , lawsToTest (QCC.ordLaws (Proxy :: Proxy Bytes))
 55 |     , lawsToTest (QCC.semigroupLaws (Proxy :: Proxy Bytes))
 56 |     , lawsToTest (QCC.monoidLaws (Proxy :: Proxy Bytes))
 57 |     , testGroup
 58 |         "toLowerAsciiByteArrayClone"
 59 |         [ testCase "A" $
 60 |             THU.assertBool "" $
 61 |               Bytes.isPrefixOf (bytes "hey") (bytes "hey man")
 62 |         ]
 63 |     , testGroup
 64 |         "isPrefixOf"
 65 |         [ testCase "A" $
 66 |             AsciiExt.toLowerU (bytes "FooBar")
 67 |               @=? pack "foobar"
 68 |         , testCase "B" $
 69 |             THU.assertBool "" $
 70 |               not (Bytes.isPrefixOf (bytes "an") (bytes "hey man"))
 71 |         ]
 72 |     , testGroup
 73 |         "isSuffixOf"
 74 |         [ testCase "A" $
 75 |             THU.assertBool "" $
 76 |               Bytes.isSuffixOf (bytes "an") (bytes "hey man")
 77 |         , testCase "B" $
 78 |             THU.assertBool "" $
 79 |               not (Bytes.isSuffixOf (bytes "h") (bytes "hey man"))
 80 |         ]
 81 |     , testGroup
 82 |         "isInfixOf"
 83 |         [ testCase "small pattern A" $
 84 |             THU.assertBool "" $
 85 |               Bytes.isInfixOf (bytes "y m") (bytes "hey man")
 86 |         , testCase "small pattern B" $
 87 |             THU.assertBool "" $
 88 |               Bytes.isInfixOf (bytes "h") (bytes "hey man")
 89 |         , testCase "small pattern C" $
 90 |             THU.assertBool "" $
 91 |               Bytes.isInfixOf (bytes "an") (bytes "hey man")
 92 |         , testCase "small pattern D" $
 93 |             THU.assertBool "" $
 94 |               not (Bytes.isInfixOf (bytes "Y M") (bytes "hey man"))
 95 |         , testCase "large pattern A" $
 96 |             THU.assertBool "" $
 97 |               Bytes.isInfixOf (bytes "Hello, hello!") (bytes "I say hello. Hello, hello! I don't know why you say goodbye, I say hello!")
 98 |         , testCase "large pattern D" $
 99 |             THU.assertBool "" $
100 |               not (Bytes.isInfixOf (bytes "Hello, hello!") (bytes "I don't know why you say goodbye, I say hello!"))
101 |         , testCase "edge: empty pattern" $
102 |             THU.assertBool "" $
103 |               Bytes.isInfixOf (bytes "") (bytes "hello hello!")
104 |         , testCase "edge: empty string" $
105 |             THU.assertBool "" $
106 |               not (Bytes.isInfixOf (bytes "hello hello!") (bytes ""))
107 |         ]
108 |     , testGroup
109 |         "findIndices"
110 |         [ testCase "A" $
111 |             Exts.fromList [4]
112 |               @=? Bytes.findIndices (Bytes.fromByteArray (pack "greatest showman")) (Bytes.fromByteArray (pack "the greatest showman of all"))
113 |         , testCase "B" $
114 |             Exts.fromList [4]
115 |               @=? Bytes.findIndices (bytes "greatest showman") (bytes "the greatest showman of all")
116 |         , testCase "C" $
117 |             Exts.fromList [0, 1, 2, 3, 4, 5]
118 |               @=? Bytes.findIndices (bytes "a") (bytes "aaaaaa")
119 |         , testCase "D" $
120 |             Exts.fromList [0, 2, 4]
121 |               @=? Bytes.findIndices (bytes "aa") (bytes "aaaaaaa")
122 |         ]
123 |     , testGroup
124 |         "replace-spec"
125 |         [ testCase "A" $
126 |             Bytes.empty
127 |               @=? Bytes.replace (Bytes.fromByteArray (pack "hello")) (Bytes.fromByteArray (pack "world")) Bytes.empty
128 |         , testCase "B" $
129 |             bytes "xzybbcbbc"
130 |               @=? Bytes.replace (bytes "a") (bytes "b") (bytes "xzyabcabc")
131 |         , testCase "C" $
132 |             bytes "my favorite month is March!"
133 |               @=? Bytes.replace (bytes "November") (bytes "March") (bytes "my favorite month is November!")
134 |         , testCase "D" $
135 |             bytes "Saturn, Saturn, Mars, Saturn"
136 |               @=? Bytes.replace (bytes "Jupiter") (bytes "Saturn") (bytes "Jupiter, Jupiter, Mars, Jupiter")
137 |         ]
138 |     , testGroup
139 |         "stripOptionalSuffix"
140 |         [ testCase "A" $
141 |             Ascii.fromString "hey m"
142 |               @=? Bytes.stripOptionalSuffix (bytes "an") (bytes "hey man")
143 |         , testCase "B" $
144 |             Ascii.fromString "hey man"
145 |               @=? Bytes.stripOptionalSuffix (bytes "h") (bytes "hey man")
146 |         ]
147 |     , testGroup
148 |         "longestCommonPrefix"
149 |         [ testProperty "finds prefix" $ \(pre :: Bytes) (a :: Bytes) (b :: Bytes) ->
150 |             if
151 |               | Just (wa, _) <- Bytes.uncons a
152 |               , Just (wb, _) <- Bytes.uncons b
153 |               , wa /= wb ->
154 |                   Bytes.longestCommonPrefix (pre <> a) (pre <> b) === pre
155 |               | otherwise -> property Discard
156 |         , testProperty "finds no prefix" $ \(a :: Bytes) (b :: Bytes) ->
157 |             if
158 |               | Just (wa, _) <- Bytes.uncons a
159 |               , Just (wb, _) <- Bytes.uncons b
160 |               , wa /= wb ->
161 |                   Bytes.longestCommonPrefix a b === mempty
162 |               | otherwise -> property Discard
163 |         ]
164 |     , testGroup
165 |         "dropWhileEnd"
166 |         [ testCase "A" $
167 |             Ascii.fromString "aabbcc"
168 |               @=? Bytes.dropWhileEnd (== c2w 'b') (bytes "aabbccbb")
169 |         ]
170 |     , testGroup
171 |         "takeWhileEnd"
172 |         [ testCase "A" $
173 |             Ascii.fromString "bb"
174 |               @=? Bytes.takeWhileEnd (== c2w 'b') (bytes "aabbccbb")
175 |         , testCase "B" $
176 |             Ascii.fromString ""
177 |               @=? Bytes.takeWhileEnd (/= c2w '\n') (bytes "aabbccbb\n")
178 |         , testCase "C" $
179 |             slicedPack [0x1, 0x2, 0x3]
180 |               @=? Bytes.takeWhileEnd (/= 0x0) (slicedPack [0x1, 0x0, 0x1, 0x2, 0x3])
181 |         ]
182 |     , testProperty "decodeDecWord" $ \(w :: Word) ->
183 |         Just w
184 |           === Latin1.decodeDecWord (Bytes.unsafeDrop 1 (Exts.fromList (0xFF : (Exts.toList (Latin1.fromString (show w))))))
185 |     , testProperty "elem" $ \(x :: Word8) (xs :: [Word8]) ->
186 |         List.elem x xs
187 |           === Bytes.elem x (Bytes.unsafeDrop 1 (Exts.fromList (x : xs)))
188 |     , testProperty "foldl" $ \(x :: Word8) (xs :: [Word8]) ->
189 |         List.foldl (-) 0 xs
190 |           === Bytes.foldl (-) 0 (Bytes.unsafeDrop 1 (Exts.fromList (x : xs)))
191 |     , testProperty "foldlM" $ \(x :: Word8) (xs :: [Word8]) ->
192 |         let f acc y = (tell [x] >> pure (acc - y)) :: Writer [Word8] Word8
193 |          in Foldable.foldlM f 0 xs
194 |               === Bytes.foldlM f 0 (Bytes.unsafeDrop 1 (Exts.fromList (x : xs)))
195 |     , testProperty "foldrM" $ \(x :: Word8) (xs :: [Word8]) ->
196 |         let f acc y = (tell [x] >> pure (acc - y)) :: Writer [Word8] Word8
197 |          in Foldable.foldrM f 0 xs
198 |               === Bytes.foldrM f 0 (Bytes.unsafeDrop 1 (Exts.fromList (x : xs)))
199 |     , testProperty "foldl'" $ \(x :: Word8) (xs :: [Word8]) ->
200 |         List.foldl' (-) 0 xs
201 |           === Bytes.foldl' (-) 0 (Bytes.unsafeDrop 1 (Exts.fromList (x : xs)))
202 |     , testProperty "foldr" $ \(x :: Word8) (xs :: [Word8]) ->
203 |         Foldable.foldr (-) 0 xs
204 |           === Bytes.foldr (-) 0 (Bytes.unsafeDrop 1 (Exts.fromList (x : xs)))
205 |     , testProperty "foldr'" $ \(x :: Word8) (xs :: [Word8]) ->
206 |         Foldable.foldr' (-) 0 xs
207 |           === Bytes.foldr' (-) 0 (Bytes.unsafeDrop 1 (Exts.fromList (x : xs)))
208 |     , testProperty "count" $ \(x :: Word8) (xs :: [Word8]) ->
209 |         ByteString.count x (ByteString.pack xs)
210 |           === Bytes.count x (slicedPack xs)
211 |     , testProperty "split" $ \(x :: Word8) (xs :: [Word8]) ->
212 |         not (List.null xs)
213 |           ==> ByteString.split x (ByteString.pack xs)
214 |           === map bytesToByteString (Bytes.split x (slicedPack xs))
215 |     , testProperty "intercalate" $ \(x :: Bytes) (xs :: [Bytes]) ->
216 |         mconcat (List.intersperse x xs)
217 |           === Bytes.intercalate x xs
218 |     , testProperty "concatArray" $ \(xs :: [Bytes]) ->
219 |         mconcat xs
220 |           === Bytes.concatArray (Exts.fromList xs)
221 |     , testProperty "splitNonEmpty" $ \(x :: Word8) (xs :: [Word8]) ->
222 |         Bytes.split x (slicedPack xs)
223 |           === Foldable.toList (Bytes.splitNonEmpty x (slicedPack xs))
224 |     , testProperty "splitInit" $ \(x :: Word8) (xs :: [Word8]) -> case xs of
225 |         [] -> Bytes.splitInit x (slicedPack xs) === []
226 |         _ ->
227 |           List.init (ByteString.split x (ByteString.pack xs))
228 |             === map bytesToByteString (Bytes.splitInit x (slicedPack xs))
229 |     , testProperty "splitU" $ \(x :: Word8) (xs :: [Word8]) ->
230 |         not (List.null xs)
231 |           ==> Bytes.split x (slicedPack xs)
232 |           === map Bytes.fromByteArray (Exts.toList (Bytes.splitU x (slicedPack xs)))
233 |     , testCase "splitInit-A" $
234 |         [Ascii.fromString "hello", Ascii.fromString "world"]
235 |           @=? (Bytes.splitInit 0x0A (Ascii.fromString "hello\nworld\n"))
236 |     , testCase "splitInit-B" $
237 |         [Ascii.fromString "hello", Ascii.fromString "world"]
238 |           @=? (Bytes.splitInit 0x0A (Ascii.fromString "hello\nworld\nthere"))
239 |     , testProperty "replace" $ \(ws :: ASCIIString) (xs :: ASCIIString) (ys :: ASCIIString) (zs :: ASCIIString) ->
240 |         let ws' = asciiStringToBytes ws
241 |             xs' = asciiStringToBytes xs
242 |             ys' = asciiStringToBytes ys
243 |             zs' = asciiStringToBytes zs
244 |             ws'' = asciiStringToText ws
245 |             xs'' = asciiStringToText xs
246 |             ys'' = asciiStringToText ys
247 |             zs'' = asciiStringToText zs
248 |          in case ws of
249 |               ASCIIString [] -> property Discard
250 |               _ ->
251 |                 Bytes.replace ws' xs' (ys' <> ws' <> zs')
252 |                   === Bytes.fromByteString (Text.Encoding.encodeUtf8 (Text.replace ws'' xs'' (ys'' <> ws'' <> zs'')))
253 |     , testProperty "splitEnd1" $ \(x :: Word8) (xs :: [Word8]) ->
254 |         case ByteString.split x (ByteString.pack xs) of
255 |           [] -> Bytes.splitEnd1 x (slicedPack xs) === Nothing
256 |           [_] -> Bytes.splitEnd1 x (slicedPack xs) === Nothing
257 |           [y1, z1] -> case Bytes.splitEnd1 x (slicedPack xs) of
258 |             Nothing -> property False
259 |             Just (y2, z2) -> (y1, z1) === (bytesToByteString y2, bytesToByteString z2)
260 |           _ -> property Discard
261 |     , testProperty "split1" $ \(x :: Word8) (xs :: [Word8]) ->
262 |         case ByteString.split x (ByteString.pack xs) of
263 |           [] -> Bytes.split1 x (slicedPack xs) === Nothing
264 |           [_] -> Bytes.split1 x (slicedPack xs) === Nothing
265 |           [y1, z1] -> case Bytes.split1 x (slicedPack xs) of
266 |             Nothing -> property False
267 |             Just (y2, z2) -> (y1, z1) === (bytesToByteString y2, bytesToByteString z2)
268 |           _ -> property Discard
269 |     , testProperty "splitTetragram1A" $ \(w0 :: Word8) (w1 :: Word8) (w2 :: Word8) (w3 :: Word8) (xs :: [Word8]) ->
270 |         (w0 /= 0xEF && w1 /= 0xEF && w2 /= 0xEF && w3 /= 0xEF)
271 |           ==> case Bytes.splitTetragram1 w0 w1 w2 w3 (slicedPack (0xEF : w0 : w1 : w2 : w3 : xs)) of
272 |             Nothing -> property False
273 |             Just (pre, post) ->
274 |               (pre, post) === (Exts.fromList [0xEF], Exts.fromList xs)
275 |     , testProperty "splitTetragram1B" $ \(w0 :: Word8) (w1 :: Word8) (w2 :: Word8) (w3 :: Word8) (xs :: [Word8]) ->
276 |         (w0 /= 0xEF && w1 /= 0xEF && w2 /= 0xEF && w3 /= 0xEF)
277 |           ==> case Bytes.splitTetragram1 w0 w1 w2 w3 (slicedPack (0xEF : 0xEF : 0xEF : 0xEF : w0 : w1 : w2 : w3 : xs)) of
278 |             Nothing -> property False
279 |             Just (pre, post) ->
280 |               (pre, post) === (Exts.fromList (List.replicate 4 0xEF), Exts.fromList xs)
281 |     , testProperty "splitTetragram1C" $ \(w0 :: Word8) (w1 :: Word8) (w2 :: Word8) (xs :: [Word8]) ->
282 |         (w0 /= 0xEF && w1 /= 0xEF && w2 /= 0xEF)
283 |           ==> case Bytes.splitTetragram1 w0 w1 w2 0xEF (slicedPack (xs ++ [w0, w1, w2, 0xEF])) of
284 |             Nothing -> property False
285 |             Just (pre, post) ->
286 |               (pre, post) === (Exts.fromList xs, mempty)
287 |     , testProperty "splitTetragram1D" $ \(w0 :: Word8) (w1 :: Word8) (w2 :: Word8) (xs :: [Word8]) ->
288 |         (w0 /= 0xEF && w1 /= 0xEF && w2 /= 0xEF)
289 |           ==> case Bytes.splitTetragram1 w0 w1 w2 0xEF (slicedPack (xs ++ [w0, w1, w2, 0xEF, 0xEF])) of
290 |             Nothing -> property False
291 |             Just (pre, post) ->
292 |               (pre, post) === (Exts.fromList xs, Exts.fromList [0xEF])
293 |     , testProperty "split2" $ \(xs :: [Word8]) (ys :: [Word8]) (zs :: [Word8]) ->
294 |         (all (/= 0xEF) xs && all (/= 0xEF) ys && all (/= 0xEF) zs)
295 |           ==> case Bytes.split2 0xEF (slicedPack (xs ++ [0xEF] ++ ys ++ [0xEF] ++ zs)) of
296 |             Just r -> r === (slicedPack xs, slicedPack ys, slicedPack zs)
297 |             Nothing -> property False
298 |     , testProperty "split3" $ \(ws :: [Word8]) (xs :: [Word8]) (ys :: [Word8]) (zs :: [Word8]) ->
299 |         (all (/= 0xEF) ws && all (/= 0xEF) xs && all (/= 0xEF) ys && all (/= 0xEF) zs)
300 |           ==> case Bytes.split3 0xEF (slicedPack (ws ++ [0xEF] ++ xs ++ [0xEF] ++ ys ++ [0xEF] ++ zs)) of
301 |             Just r -> r === (slicedPack ws, slicedPack xs, slicedPack ys, slicedPack zs)
302 |             Nothing -> property False
303 |     , testProperty "split4" $ \(ws :: [Word8]) (xs :: [Word8]) (ys :: [Word8]) (zs :: [Word8]) (ps :: [Word8]) ->
304 |         (all (/= 0xEF) ws && all (/= 0xEF) xs && all (/= 0xEF) ys && all (/= 0xEF) zs && all (/= 0xEF) ps)
305 |           ==> case Bytes.split4 0xEF (slicedPack (ws ++ [0xEF] ++ xs ++ [0xEF] ++ ys ++ [0xEF] ++ zs ++ [0xEF] ++ ps)) of
306 |             Just r -> r === (slicedPack ws, slicedPack xs, slicedPack ys, slicedPack zs, slicedPack ps)
307 |             Nothing -> property False
308 |     , testGroup
309 |         "FNV-1a"
310 |         [ testGroup
311 |             "32-bit"
312 |             [ testCase "empty" (Bytes.fnv1a32 Bytes.empty @=? 0x811c9dc5)
313 |             , testCase "a" (Bytes.fnv1a32 (bytes "a") @=? 0xe40c292c)
314 |             , testCase "foobar" (Bytes.fnv1a32 (bytes "foobar") @=? 0xbf9cf968)
315 |             ]
316 |         , testGroup
317 |             "64-bit"
318 |             [ testCase "empty" (Bytes.fnv1a64 Bytes.empty @=? 0xcbf29ce484222325)
319 |             , testCase "a" (Bytes.fnv1a64 (bytes "a") @=? 0xaf63dc4c8601ec8c)
320 |             , testCase "foobar" (Bytes.fnv1a64 (bytes "foobar") @=? 0x85944171f73967e8)
321 |             , testCase "google.com" (Bytes.fnv1a64 (bytes "google.com") @=? 0xe1a2c1ae38dcdf45)
322 |             ]
323 |         ]
324 |     , testGroup
325 |         "Chunks"
326 |         [ lawsToTest (QCC.eqLaws (Proxy :: Proxy Chunks))
327 |         , lawsToTest (QCC.semigroupLaws (Proxy :: Proxy Chunks))
328 |         , lawsToTest (QCC.monoidLaws (Proxy :: Proxy Chunks))
329 |         , testGroup
330 |             "concatenation"
331 |             [ testProperty "concat=concatU" $ \(c :: Chunks) ->
332 |                 Chunks.concat c === Bytes.fromByteArray (Chunks.concatU c)
333 |             , testProperty "concat-singleton" $ \(b :: Bytes) ->
334 |                 Chunks.concat (ChunksCons b ChunksNil) === b
335 |             , testProperty "concatU-singleton" $ \(b :: Bytes) ->
336 |                 Chunks.concatU (ChunksCons b ChunksNil) === Bytes.toByteArray b
337 |             ]
338 |         ]
339 |     , testGroup
340 |         "ASCII"
341 |         [ testGroup
342 |           "toText=naiveAsciiToText"
343 |           [ testProperty "alpha" $ \(ws :: [Word8]) ->
344 |               let b = slicedPack ws in naiveAsciiToText b === Ascii.toText b
345 |           , testProperty "beta" $ \(ws :: [Word8]) ->
346 |               let b = slicedPackAlt ws in naiveAsciiToText b === Ascii.toText b
347 |           , testProperty "gamma" $ \(ASCIIString cs) ->
348 |               let b = bytes cs in naiveAsciiToText b === Ascii.toText b
349 |           ]
350 |         ]
351 |     ]
352 | 
353 | bytes :: String -> Bytes
354 | bytes s = let b = pack ('x' : s) in Bytes b 1 (PM.sizeofByteArray b - 1)
355 | 
356 | slicedPack :: [Word8] -> Bytes
357 | slicedPack s =
358 |   let b = Exts.fromList ([0x00] ++ s ++ [0x00])
359 |    in Bytes b 1 (PM.sizeofByteArray b - 2)
360 | 
361 | slicedPackAlt :: [Word8] -> Bytes
362 | slicedPackAlt s =
363 |   let b = Exts.fromList ([0xFF] ++ s ++ [0x80])
364 |    in Bytes b 1 (PM.sizeofByteArray b - 2)
365 | 
366 | pack :: String -> ByteArray
367 | pack = Exts.fromList . map (fromIntegral @Int @Word8 . ord)
368 | 
369 | c2w :: Char -> Word8
370 | c2w = fromIntegral . ord
371 | 
372 | bytesToByteString :: Bytes -> ByteString.ByteString
373 | bytesToByteString = ByteString.pack . Bytes.foldr (:) []
374 | 
375 | instance Arbitrary Bytes where
376 |   arbitrary = do
377 |     xs :: [Word8] <- TQC.arbitrary
378 |     front <- TQC.choose (0, 2)
379 |     back <- TQC.choose (0, 2)
380 |     let frontPad = replicate front (254 :: Word8)
381 |     let backPad = replicate back (254 :: Word8)
382 |     let raw = Exts.fromList (frontPad ++ xs ++ backPad)
383 |     pure (Bytes raw front (length xs))
384 | 
385 | instance Arbitrary Chunks where
386 |   arbitrary = do
387 |     xs :: [[Word8]] <- TQC.arbitrary
388 |     let ys =
389 |           map
390 |             (\x -> Exts.fromList ([255] ++ x ++ [255]))
391 |             xs
392 |         zs =
393 |           foldr
394 |             ( \b cs ->
395 |                 ChunksCons (Bytes b 1 (PM.sizeofByteArray b - 2)) cs
396 |             )
397 |             ChunksNil
398 |             ys
399 |     pure zs
400 | 
401 | lawsToTest :: QCC.Laws -> TestTree
402 | lawsToTest (QCC.Laws name pairs) = testGroup name (map (uncurry TQC.testProperty) pairs)
403 | 
404 | asciiStringToBytes :: ASCIIString -> Bytes
405 | asciiStringToBytes (ASCIIString x) = bytes x
406 | 
407 | asciiStringToText :: ASCIIString -> Text
408 | asciiStringToText (ASCIIString x) = Text.pack x
409 | 
410 | naiveAsciiToText :: Bytes -> Maybe Text
411 | {-# noinline naiveAsciiToText #-}
412 | naiveAsciiToText !b@(Bytes (PM.ByteArray arr) off len) = case Bytes.foldr (\w acc -> w < 128 && acc) True b of
413 |   True -> Just (I.Text (A.ByteArray arr) off len)
414 |   False -> Nothing
415 | 


--------------------------------------------------------------------------------
/src/Data/Bytes.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns #-}
  2 | {-# LANGUAGE BlockArguments #-}
  3 | {-# LANGUAGE DataKinds #-}
  4 | {-# LANGUAGE DuplicateRecordFields #-}
  5 | {-# LANGUAGE KindSignatures #-}
  6 | {-# LANGUAGE MagicHash #-}
  7 | {-# LANGUAGE NamedFieldPuns #-}
  8 | {-# LANGUAGE RankNTypes #-}
  9 | {-# LANGUAGE UnboxedTuples #-}
 10 | 
 11 | {- | If you are interested in sub-arrays of 'ByteArray's (e.g. writing a binary
 12 | search), it would be grossly inefficient to make a copy of the sub-array. On
 13 | the other hand, it'd be really annoying to track limit indices by hand.
 14 | 
 15 | This module defines the 'Bytes' type which exposes a standard array interface
 16 | for a sub-arrays without copying and without manual index manipulation. --
 17 | For mutable arrays, see 'Data.Bytes.Mutable'.
 18 | -}
 19 | module Data.Bytes
 20 |   ( -- * Types
 21 |     Bytes
 22 | 
 23 |     -- * Constants
 24 |   , Pure.empty
 25 |   , Pure.emptyPinned
 26 |   , Pure.emptyPinnedU
 27 | 
 28 |     -- * Properties
 29 |   , Pure.null
 30 |   , Pure.length
 31 | 
 32 |     -- * Decompose
 33 |   , uncons
 34 |   , unsnoc
 35 | 
 36 |     -- * Predicates
 37 |   , Pure.any
 38 |   , Pure.all
 39 | 
 40 |     -- * Create
 41 | 
 42 |     -- ** Sliced
 43 |   , singleton
 44 |   , doubleton
 45 |   , tripleton
 46 |   , Pure.replicate
 47 | 
 48 |     -- ** Unsliced
 49 |   , singletonU
 50 |   , doubletonU
 51 |   , tripletonU
 52 |   , Pure.replicateU
 53 | 
 54 |     -- * Filtering
 55 |   , takeWhile
 56 |   , dropWhile
 57 |   , takeWhileEnd
 58 |   , dropWhileEnd
 59 | 
 60 |     -- * Traversals
 61 |   , Pure.map
 62 |   , Pure.mapU
 63 | 
 64 |     -- * Folds
 65 |   , Pure.foldl
 66 |   , Pure.foldl'
 67 |   , Pure.foldr
 68 |   , Pure.foldr'
 69 | 
 70 |     -- * Folds with Indices
 71 |   , Pure.ifoldl'
 72 | 
 73 |     -- * Monadic Folds
 74 |   , Pure.foldlM
 75 |   , Pure.foldrM
 76 | 
 77 |     -- * Common Folds
 78 |   , elem
 79 | 
 80 |     -- * Splitting
 81 | 
 82 |     -- ** Unlimited
 83 |   , Byte.split
 84 |   , Byte.splitU
 85 |   , Byte.splitInit
 86 |   , Byte.splitInitU
 87 |   , Byte.splitNonEmpty
 88 |   , Byte.splitStream
 89 | 
 90 |     -- ** Fixed from Beginning
 91 |   , Byte.split1
 92 |   , Pure.splitTetragram1
 93 |   , Byte.split2
 94 |   , Byte.split3
 95 |   , Byte.split4
 96 | 
 97 |     -- ** Fixed from End
 98 |   , Byte.splitEnd1
 99 | 
100 |     -- * Combining
101 |   , intercalate
102 |   , intercalateByte2
103 |   , concatArray
104 |   , concatArrayU
105 | 
106 |     -- * Searching
107 |   , replace
108 |   , findIndices
109 |   , Pure.findTetragramIndex
110 | 
111 |     -- * Counting
112 |   , Byte.count
113 | 
114 |     -- * Prefix and Suffix
115 | 
116 |     -- ** Byte Sequence
117 |   , isPrefixOf
118 |   , isSuffixOf
119 |   , isInfixOf
120 |   , stripPrefix
121 |   , stripOptionalPrefix
122 |   , stripSuffix
123 |   , stripOptionalSuffix
124 |   , longestCommonPrefix
125 | 
126 |     -- ** C Strings
127 |   , stripCStringPrefix
128 | 
129 |     -- ** Single Byte
130 |   , isBytePrefixOf
131 |   , isByteSuffixOf
132 | 
133 |     -- * Equality
134 | 
135 |     -- ** Fixed Characters
136 |   , equalsLatin1
137 |   , equalsLatin2
138 |   , equalsLatin3
139 |   , equalsLatin4
140 |   , equalsLatin5
141 |   , equalsLatin6
142 |   , equalsLatin7
143 |   , equalsLatin8
144 |   , equalsLatin9
145 |   , equalsLatin10
146 |   , equalsLatin11
147 |   , equalsLatin12
148 | 
149 |     -- ** C Strings
150 |   , equalsCString
151 | 
152 |     -- * Hashing
153 |   , Pure.fnv1a32
154 |   , Pure.fnv1a64
155 | 
156 |     -- * Unsafe Slicing
157 |   , Pure.unsafeTake
158 |   , Pure.unsafeDrop
159 |   , Pure.unsafeIndex
160 |   , Pure.unsafeHead
161 | 
162 |     -- * Copying
163 |   , Pure.unsafeCopy
164 | 
165 |     -- * Pointers
166 |   , Pure.pin
167 |   , Pure.contents
168 |   , touch
169 | 
170 |     -- * Conversion
171 |   , Pure.toByteArray
172 |   , Pure.toByteArrayClone
173 |   , Pure.toPinnedByteArray
174 |   , Pure.toPinnedByteArrayClone
175 |   , fromAsciiString
176 |   , fromLatinString
177 |   , Pure.fromByteArray
178 |   , Pure.fromPrimArray
179 |   , toLatinString
180 |   , fromCString#
181 |   , Pure.toByteString
182 |   , Pure.pinnedToByteString
183 |   , Pure.fromByteString
184 |   , Pure.fromLazyByteString
185 |   , fromShortByteString
186 |   , fromShortText
187 |   , toShortByteString
188 |   , toShortByteStringClone
189 |   , toLowerAsciiByteArrayClone
190 | 
191 |     -- * I\/O with Handles
192 |   , BIO.hGet
193 |   , readFile
194 |   , BIO.hPut
195 | 
196 |     -- * Unlifted Types
197 |   , lift
198 |   , unlift
199 | 
200 |     -- * Length Indexed
201 |   , withLength
202 |   , withLengthU
203 |   ) where
204 | 
205 | import Prelude hiding (all, any, dropWhile, elem, foldl, foldr, length, map, null, readFile, replicate, takeWhile)
206 | 
207 | import Control.Monad.Primitive (PrimMonad, primitive_, unsafeIOToPrim)
208 | import Control.Monad.ST.Run (runByteArrayST)
209 | import Cstrlen (cstringLength#)
210 | import Data.ByteString.Short.Internal (ShortByteString (SBS))
211 | import Data.Bytes.Pure (fromByteArray, length, toShortByteString, unsafeDrop, unsafeIndex)
212 | import Data.Bytes.Search (findIndices, isInfixOf, replace)
213 | import Data.Bytes.Types (ByteArrayN (ByteArrayN), Bytes (Bytes, array, offset), BytesN (BytesN))
214 | import Data.Primitive (Array, ByteArray (ByteArray))
215 | import Data.Text.Short (ShortText)
216 | import Foreign.C.String (CString)
217 | import Foreign.Ptr (Ptr, castPtr, plusPtr)
218 | import GHC.Exts (Addr#, Int (I#), Int#, Ptr (Ptr), Word#)
219 | import GHC.Word (Word8 (W8#))
220 | import Reps (Bytes# (..), word8ToWord#)
221 | 
222 | import qualified Arithmetic.Nat as Nat
223 | import qualified Arithmetic.Types as Arithmetic
224 | import qualified Data.Bytes.Byte as Byte
225 | import qualified Data.Bytes.Chunks as Chunks
226 | import qualified Data.Bytes.IO as BIO
227 | import qualified Data.Bytes.Pure as Pure
228 | import qualified Data.Bytes.Text.Ascii as Ascii
229 | import qualified Data.Bytes.Text.AsciiExt as AsciiExt
230 | import qualified Data.Bytes.Text.Latin1 as Latin1
231 | import qualified Data.Bytes.Types as Types
232 | import qualified Data.Foldable as F
233 | import qualified Data.List as List
234 | import qualified Data.Primitive as PM
235 | import qualified Data.Primitive.Ptr as PM
236 | import qualified Data.Text.Short as TS
237 | import qualified GHC.Exts as Exts
238 | import qualified GHC.TypeNats as GHC
239 | 
240 | {- | Extract the head and tail of the 'Bytes', returning 'Nothing' if
241 | it is empty.
242 | -}
243 | uncons :: Bytes -> Maybe (Word8, Bytes)
244 | {-# INLINE uncons #-}
245 | uncons b = case length b of
246 |   0 -> Nothing
247 |   _ -> Just (unsafeIndex b 0, unsafeDrop 1 b)
248 | 
249 | {- | Extract the @init@ and @last@ of the 'Bytes', returning 'Nothing' if
250 | it is empty.
251 | -}
252 | unsnoc :: Bytes -> Maybe (Bytes, Word8)
253 | {-# INLINE unsnoc #-}
254 | unsnoc b@(Bytes arr off len) = case len of
255 |   0 -> Nothing
256 |   _ ->
257 |     let !len' = len - 1
258 |      in Just (Bytes arr off len', unsafeIndex b len')
259 | 
260 | {- | Does the byte sequence begin with the given byte? False if the
261 | byte sequence is empty.
262 | -}
263 | isBytePrefixOf :: Word8 -> Bytes -> Bool
264 | {-# INLINE isBytePrefixOf #-}
265 | isBytePrefixOf w b = case length b of
266 |   0 -> False
267 |   _ -> unsafeIndex b 0 == w
268 | 
269 | {- | Does the byte sequence end with the given byte? False if the
270 | byte sequence is empty.
271 | -}
272 | isByteSuffixOf :: Word8 -> Bytes -> Bool
273 | isByteSuffixOf w b = case len of
274 |   0 -> False
275 |   _ -> unsafeIndex b (len - 1) == w
276 |  where
277 |   len = length b
278 | 
279 | -- | Is the first argument a prefix of the second argument?
280 | isPrefixOf :: Bytes -> Bytes -> Bool
281 | isPrefixOf (Bytes a aOff aLen) (Bytes b bOff bLen) =
282 |   -- For prefix and suffix testing, we do not use
283 |   -- the sameByteArray optimization that we use in
284 |   -- the Eq instance. Prefix and suffix testing seldom
285 |   -- compares a byte array with the same in-memory
286 |   -- byte array.
287 |   if aLen <= bLen
288 |     then compareByteArrays a aOff b bOff aLen == EQ
289 |     else False
290 | 
291 | -- | Is the first argument a suffix of the second argument?
292 | isSuffixOf :: Bytes -> Bytes -> Bool
293 | isSuffixOf (Bytes a aOff aLen) (Bytes b bOff bLen) =
294 |   if aLen <= bLen
295 |     then compareByteArrays a aOff b (bOff + bLen - aLen) aLen == EQ
296 |     else False
297 | 
298 | -- | Find the longest string which is a prefix of both arguments.
299 | longestCommonPrefix :: Bytes -> Bytes -> Bytes
300 | longestCommonPrefix a b = loop 0
301 |  where
302 |   loop :: Int -> Bytes
303 |   loop !into
304 |     | into < maxLen
305 |         && unsafeIndex a into == unsafeIndex b into =
306 |         loop (into + 1)
307 |     | otherwise = Pure.unsafeTake into a
308 |   maxLen = min (length a) (length b)
309 | 
310 | -- | Create a byte sequence with one byte.
311 | singleton :: Word8 -> Bytes
312 | {-# INLINE singleton #-}
313 | singleton !a = Bytes (singletonU a) 0 1
314 | 
315 | -- | Create a byte sequence with two bytes.
316 | doubleton :: Word8 -> Word8 -> Bytes
317 | {-# INLINE doubleton #-}
318 | doubleton !a !b = Bytes (doubletonU a b) 0 2
319 | 
320 | -- | Create a byte sequence with three bytes.
321 | tripleton :: Word8 -> Word8 -> Word8 -> Bytes
322 | {-# INLINE tripleton #-}
323 | tripleton !a !b !c = Bytes (tripletonU a b c) 0 3
324 | 
325 | -- | Create an unsliced byte sequence with one byte.
326 | singletonU :: Word8 -> ByteArray
327 | {-# INLINE singletonU #-}
328 | singletonU !a = runByteArrayST do
329 |   arr <- PM.newByteArray 1
330 |   PM.writeByteArray arr 0 a
331 |   PM.unsafeFreezeByteArray arr
332 | 
333 | -- | Create an unsliced byte sequence with two bytes.
334 | doubletonU :: Word8 -> Word8 -> ByteArray
335 | {-# INLINE doubletonU #-}
336 | doubletonU !a !b = runByteArrayST do
337 |   arr <- PM.newByteArray 2
338 |   PM.writeByteArray arr 0 a
339 |   PM.writeByteArray arr 1 b
340 |   PM.unsafeFreezeByteArray arr
341 | 
342 | -- | Create an unsliced byte sequence with three bytes.
343 | tripletonU :: Word8 -> Word8 -> Word8 -> ByteArray
344 | {-# INLINE tripletonU #-}
345 | tripletonU !a !b !c = runByteArrayST do
346 |   arr <- PM.newByteArray 3
347 |   PM.writeByteArray arr 0 a
348 |   PM.writeByteArray arr 1 b
349 |   PM.writeByteArray arr 2 c
350 |   PM.unsafeFreezeByteArray arr
351 | 
352 | {- | /O(n)/ Return the suffix of the second string if its prefix
353 | matches the entire first string.
354 | -}
355 | stripPrefix :: Bytes -> Bytes -> Maybe Bytes
356 | stripPrefix !pre !str =
357 |   if pre `isPrefixOf` str
358 |     then Just (Bytes (array str) (offset str + length pre) (length str - length pre))
359 |     else Nothing
360 | 
361 | {- | /O(n)/ Return the suffix of the second string if its prefix
362 | matches the entire first string. Otherwise, return the second
363 | string unchanged.
364 | -}
365 | stripOptionalPrefix :: Bytes -> Bytes -> Bytes
366 | stripOptionalPrefix !pre !str =
367 |   if pre `isPrefixOf` str
368 |     then Bytes (array str) (offset str + length pre) (length str - length pre)
369 |     else str
370 | 
371 | {- | /O(n)/ Return the prefix of the second string if its suffix
372 | matches the entire first string.
373 | -}
374 | stripSuffix :: Bytes -> Bytes -> Maybe Bytes
375 | stripSuffix !suf !str =
376 |   if suf `isSuffixOf` str
377 |     then Just (Bytes (array str) (offset str) (length str - length suf))
378 |     else Nothing
379 | 
380 | {- | /O(n)/ Return the prefix of the second string if its suffix
381 | matches the entire first string. Otherwise, return the second
382 | string unchanged.
383 | -}
384 | stripOptionalSuffix :: Bytes -> Bytes -> Bytes
385 | stripOptionalSuffix !suf !str =
386 |   if suf `isSuffixOf` str
387 |     then Bytes (array str) (offset str) (length str - length suf)
388 |     else str
389 | 
390 | -- | Is the byte a member of the byte sequence?
391 | elem :: Word8 -> Bytes -> Bool
392 | elem (W8# w) b = case elemLoop 0# (word8ToWord# w) b of
393 |   1# -> True
394 |   _ -> False
395 | 
396 | elemLoop :: Int# -> Word# -> Bytes -> Int#
397 | elemLoop !r !w (Bytes arr@(ByteArray arr#) off@(I# off#) len) = case len of
398 |   0 -> r
399 |   _ -> elemLoop (Exts.orI# r (Exts.eqWord# w (word8ToWord# (Exts.indexWord8Array# arr# off#)))) w (Bytes arr (off + 1) (len - 1))
400 | 
401 | -- | Take bytes while the predicate is true.
402 | takeWhile :: (Word8 -> Bool) -> Bytes -> Bytes
403 | {-# INLINE takeWhile #-}
404 | takeWhile k b = Pure.unsafeTake (Pure.countWhile k b) b
405 | 
406 | -- | Drop bytes while the predicate is true.
407 | dropWhile :: (Word8 -> Bool) -> Bytes -> Bytes
408 | {-# INLINE dropWhile #-}
409 | dropWhile k b = Pure.unsafeDrop (Pure.countWhile k b) b
410 | 
411 | {- | /O(n)/ 'dropWhileEnd' @p@ @b@ returns the prefix remaining after
412 | dropping characters that satisfy the predicate @p@ from the end of
413 | @t@.
414 | -}
415 | dropWhileEnd :: (Word8 -> Bool) -> Bytes -> Bytes
416 | {-# INLINE dropWhileEnd #-}
417 | dropWhileEnd k !b = Pure.unsafeTake (length b - Pure.countWhileEnd k b) b
418 | 
419 | {- | /O(n)/ 'takeWhileEnd' @p@ @b@ returns the longest suffix of
420 | elements that satisfy predicate @p@.
421 | -}
422 | takeWhileEnd :: (Word8 -> Bool) -> Bytes -> Bytes
423 | {-# INLINE takeWhileEnd #-}
424 | takeWhileEnd k !b =
425 |   let n = Pure.countWhileEnd k b
426 |    in Bytes (array b) (offset b + length b - n) n
427 | 
428 | {- | Convert a 'String' consisting of only characters in the ASCII block
429 | to a byte sequence. Any character with a codepoint above @U+007F@ is
430 | replaced by @U+0000@.
431 | -}
432 | fromAsciiString :: String -> Bytes
433 | {-# DEPRECATED fromAsciiString "use Data.Bytes.Text.Ascii.fromString instead" #-}
434 | {-# INLINE fromAsciiString #-}
435 | fromAsciiString = Ascii.fromString
436 | 
437 | {- | Convert a 'String' consisting of only characters representable
438 | by ISO-8859-1. These are encoded with ISO-8859-1. Any character
439 | with a codepoint above @U+00FF@ is replaced by an unspecified byte.
440 | -}
441 | fromLatinString :: String -> Bytes
442 | {-# DEPRECATED fromLatinString "use Data.Bytes.Text.Latin1.fromString instead" #-}
443 | {-# INLINE fromLatinString #-}
444 | fromLatinString = Latin1.fromString
445 | 
446 | -- | Interpret a byte sequence as text encoded by ISO-8859-1.
447 | toLatinString :: Bytes -> String
448 | {-# DEPRECATED toLatinString "use Data.Bytes.Text.Latin1.toString instead" #-}
449 | {-# INLINE toLatinString #-}
450 | toLatinString = Latin1.toString
451 | 
452 | -- | Copy a primitive string literal into managed memory.
453 | fromCString# :: Addr# -> Bytes
454 | fromCString# a =
455 |   Bytes
456 |     ( runByteArrayST $ do
457 |         dst@(PM.MutableByteArray dst#) <- PM.newByteArray len
458 |         PM.copyPtrToMutablePrimArray
459 |           (PM.MutablePrimArray dst#)
460 |           0
461 |           (Ptr a :: Ptr Word8)
462 |           len
463 |         PM.unsafeFreezeByteArray dst
464 |     )
465 |     0
466 |     len
467 |  where
468 |   len = I# (cstringLength# a)
469 | 
470 | compareByteArrays :: ByteArray -> Int -> ByteArray -> Int -> Int -> Ordering
471 | {-# INLINE compareByteArrays #-}
472 | compareByteArrays (ByteArray ba1#) (I# off1#) (ByteArray ba2#) (I# off2#) (I# n#) =
473 |   compare (I# (Exts.compareByteArrays# ba1# off1# ba2# off2# n#)) 0
474 | 
475 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
476 | a singleton whose element matches the character?
477 | -}
478 | equalsLatin1 :: Char -> Bytes -> Bool
479 | {-# DEPRECATED equalsLatin1 "use Data.Bytes.Text.Latin1.equals1 instead" #-}
480 | {-# INLINE equalsLatin1 #-}
481 | equalsLatin1 = Latin1.equals1
482 | 
483 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
484 | a doubleton whose elements match the characters?
485 | -}
486 | equalsLatin2 :: Char -> Char -> Bytes -> Bool
487 | {-# DEPRECATED equalsLatin2 "use Data.Bytes.Text.Latin1.equals2 instead" #-}
488 | {-# INLINE equalsLatin2 #-}
489 | equalsLatin2 = Latin1.equals2
490 | 
491 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
492 | a tripleton whose elements match the characters?
493 | -}
494 | equalsLatin3 :: Char -> Char -> Char -> Bytes -> Bool
495 | {-# DEPRECATED equalsLatin3 "use Data.Bytes.Text.Latin1.equals3 instead" #-}
496 | {-# INLINE equalsLatin3 #-}
497 | equalsLatin3 = Latin1.equals3
498 | 
499 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
500 | a quadrupleton whose elements match the characters?
501 | -}
502 | equalsLatin4 :: Char -> Char -> Char -> Char -> Bytes -> Bool
503 | {-# DEPRECATED equalsLatin4 "use Data.Bytes.Text.Latin1.equals4 instead" #-}
504 | {-# INLINE equalsLatin4 #-}
505 | equalsLatin4 = Latin1.equals4
506 | 
507 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
508 | a quintupleton whose elements match the characters?
509 | -}
510 | equalsLatin5 :: Char -> Char -> Char -> Char -> Char -> Bytes -> Bool
511 | {-# DEPRECATED equalsLatin5 "use Data.Bytes.Text.Latin1.equals5 instead" #-}
512 | {-# INLINE equalsLatin5 #-}
513 | equalsLatin5 = Latin1.equals5
514 | 
515 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
516 | a sextupleton whose elements match the characters?
517 | -}
518 | equalsLatin6 :: Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool
519 | {-# DEPRECATED equalsLatin6 "use Data.Bytes.Text.Latin1.equals6 instead" #-}
520 | {-# INLINE equalsLatin6 #-}
521 | equalsLatin6 = Latin1.equals6
522 | 
523 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
524 | a septupleton whose elements match the characters?
525 | -}
526 | equalsLatin7 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool
527 | {-# DEPRECATED equalsLatin7 "use Data.Bytes.Text.Latin1.equals7 instead" #-}
528 | {-# INLINE equalsLatin7 #-}
529 | equalsLatin7 = Latin1.equals7
530 | 
531 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
532 | an octupleton whose elements match the characters?
533 | -}
534 | equalsLatin8 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool
535 | {-# DEPRECATED equalsLatin8 "use Data.Bytes.Text.Latin1.equals8 instead" #-}
536 | {-# INLINE equalsLatin8 #-}
537 | equalsLatin8 = Latin1.equals8
538 | 
539 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
540 | a 9-tuple whose elements match the characters?
541 | -}
542 | equalsLatin9 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool
543 | {-# DEPRECATED equalsLatin9 "use Data.Bytes.Text.Latin1.equals9 instead" #-}
544 | {-# INLINE equalsLatin9 #-}
545 | equalsLatin9 = Latin1.equals9
546 | 
547 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
548 | a 10-tuple whose elements match the characters?
549 | -}
550 | equalsLatin10 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool
551 | {-# DEPRECATED equalsLatin10 "use Data.Bytes.Text.Latin1.equals10 instead" #-}
552 | {-# INLINE equalsLatin10 #-}
553 | equalsLatin10 = Latin1.equals10
554 | 
555 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
556 | a 11-tuple whose elements match the characters?
557 | -}
558 | equalsLatin11 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool
559 | {-# DEPRECATED equalsLatin11 "use Data.Bytes.Text.Latin1.equals11 instead" #-}
560 | {-# INLINE equalsLatin11 #-}
561 | equalsLatin11 = Latin1.equals11
562 | 
563 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text,
564 | a 12-tuple whose elements match the characters?
565 | -}
566 | equalsLatin12 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool
567 | {-# DEPRECATED equalsLatin12 "use Data.Bytes.Text.Latin1.equals12 instead" #-}
568 | {-# INLINE equalsLatin12 #-}
569 | equalsLatin12 = Latin1.equals12
570 | 
571 | {- | Is the byte sequence equal to the @NUL@-terminated C String?
572 | The C string must be a constant.
573 | -}
574 | equalsCString :: CString -> Bytes -> Bool
575 | {-# INLINE equalsCString #-}
576 | equalsCString !ptr0 (Bytes arr off0 len0) = go (castPtr ptr0 :: Ptr Word8) off0 len0
577 |  where
578 |   go !ptr !off !len = case len of
579 |     0 -> PM.indexOffPtr ptr 0 == (0 :: Word8)
580 |     _ -> case PM.indexOffPtr ptr 0 of
581 |       0 -> False
582 |       c -> c == PM.indexByteArray arr off && go (plusPtr ptr 1) (off + 1) (len - 1)
583 | 
584 | {- | /O(n)/ Variant of 'stripPrefix' that takes a @NUL@-terminated C String
585 | as the prefix to test for.
586 | -}
587 | stripCStringPrefix :: CString -> Bytes -> Maybe Bytes
588 | {-# INLINE stripCStringPrefix #-}
589 | stripCStringPrefix !ptr0 (Bytes arr off0 len0) = go (castPtr ptr0 :: Ptr Word8) off0 len0
590 |  where
591 |   go !ptr !off !len = case PM.indexOffPtr ptr 0 of
592 |     0 -> Just (Bytes arr off len)
593 |     c -> case len of
594 |       0 -> Nothing
595 |       _ -> case c == PM.indexByteArray arr off of
596 |         True -> go (plusPtr ptr 1) (off + 1) (len - 1)
597 |         False -> Nothing
598 | 
599 | {- | Touch the byte array backing the byte sequence. This sometimes needed
600 | after calling 'Pure.contents' so that the @ByteArray@ does not get garbage
601 | collected.
602 | -}
603 | touch :: (PrimMonad m) => Bytes -> m ()
604 | touch (Bytes (ByteArray arr) _ _) =
605 |   unsafeIOToPrim
606 |     (primitive_ (\s -> Exts.touch# arr s))
607 | 
608 | -- | Read an entire file strictly into a 'Bytes'.
609 | readFile :: FilePath -> IO Bytes
610 | readFile f = Chunks.concat <$> Chunks.readFile f
611 | 
612 | {- | /O(n)/ The intercalate function takes a separator 'Bytes' and a list of
613 | 'Bytes' and concatenates the list elements by interspersing the separator
614 | between each element.
615 | -}
616 | intercalate ::
617 |   -- | Separator (interspersed into the list)
618 |   Bytes ->
619 |   -- | List
620 |   [Bytes] ->
621 |   Bytes
622 | intercalate !_ [] = mempty
623 | intercalate !_ [x] = x
624 | intercalate (Bytes sarr soff slen) (Bytes arr0 off0 len0 : bs) = Bytes r 0 fullLen
625 |  where
626 |   !fullLen = List.foldl' (\acc (Bytes _ _ len) -> acc + len + slen) 0 bs + len0
627 |   r = runByteArrayST $ do
628 |     marr <- PM.newByteArray fullLen
629 |     PM.copyByteArray marr 0 arr0 off0 len0
630 |     !_ <-
631 |       F.foldlM
632 |         ( \ !currLen (Bytes arr off len) -> do
633 |             PM.copyByteArray marr currLen sarr soff slen
634 |             PM.copyByteArray marr (currLen + slen) arr off len
635 |             pure (currLen + len + slen)
636 |         )
637 |         len0
638 |         bs
639 |     PM.unsafeFreezeByteArray marr
640 | 
641 | {- | Specialization of 'intercalate' where the separator is a single byte and
642 | there are exactly two byte sequences that are being concatenated.
643 | -}
644 | intercalateByte2 ::
645 |   -- | Separator
646 |   Word8 ->
647 |   -- | First byte sequence
648 |   Bytes ->
649 |   -- | Second byte sequence
650 |   Bytes ->
651 |   Bytes
652 | intercalateByte2 !sep !a !b =
653 |   Bytes
654 |     { Types.array = runByteArrayST $ do
655 |         dst <- PM.newByteArray len
656 |         Pure.unsafeCopy dst 0 a
657 |         PM.writeByteArray dst (length a) sep
658 |         Pure.unsafeCopy dst (length a + 1) b
659 |         PM.unsafeFreezeByteArray dst
660 |     , Types.length = len
661 |     , Types.offset = 0
662 |     }
663 |  where
664 |   len = length a + length b + 1
665 | 
666 | {- | Variant of 'toShortByteString' that unconditionally makes a copy of
667 | the array backing the sliced 'Bytes' even if the original array
668 | could be reused. Prefer 'toShortByteString'.
669 | -}
670 | toShortByteStringClone :: Bytes -> ShortByteString
671 | {-# INLINE toShortByteStringClone #-}
672 | toShortByteStringClone !b = case Pure.toByteArrayClone b of
673 |   PM.ByteArray x -> SBS x
674 | 
675 | -- | /O(1)/ Create 'Bytes' from a 'ShortByteString'.
676 | fromShortByteString :: ShortByteString -> Bytes
677 | {-# INLINE fromShortByteString #-}
678 | fromShortByteString (SBS x) = fromByteArray (ByteArray x)
679 | 
680 | {- | /O(1)/ Create 'Bytes' from a 'ShortText'. This encodes the text as UTF-8.
681 | It is a no-op.
682 | -}
683 | fromShortText :: ShortText -> Bytes
684 | {-# INLINE fromShortText #-}
685 | fromShortText t = case TS.toShortByteString t of
686 |   SBS x -> fromByteArray (ByteArray x)
687 | 
688 | {- | /O(n)/ Interpreting the bytes an ASCII-encoded characters, convert
689 | the string to lowercase. This adds @0x20@ to bytes in the range
690 | @[0x41,0x5A]@ and leaves all other bytes alone. Unconditionally
691 | copies the bytes.
692 | -}
693 | toLowerAsciiByteArrayClone :: Bytes -> ByteArray
694 | {-# DEPRECATED toLowerAsciiByteArrayClone "use Data.Bytes/Text/AsciiExt.toLowerU" #-}
695 | {-# INLINE toLowerAsciiByteArrayClone #-}
696 | toLowerAsciiByteArrayClone = AsciiExt.toLowerU
697 | 
698 | lift :: Bytes# -> Bytes
699 | {-# INLINE lift #-}
700 | lift (Bytes# (# arr, off, len #)) = Bytes (ByteArray arr) (I# off) (I# len)
701 | 
702 | unlift :: Bytes -> Bytes#
703 | {-# INLINE unlift #-}
704 | unlift (Bytes (ByteArray arr) (I# off) (I# len)) =
705 |   Bytes# (# arr, off, len #)
706 | 
707 | concatArrayU :: Array Bytes -> ByteArray
708 | {-# NOINLINE concatArrayU #-}
709 | concatArrayU !xs = runByteArrayST $ do
710 |   let !arrLen = PM.sizeofArray xs
711 |   let !totalByteLen = F.foldl' (\acc b -> length b + acc) 0 xs
712 |   dst <- PM.newByteArray totalByteLen
713 |   let go !ix !dstOff =
714 |         if ix < arrLen
715 |           then do
716 |             x <- PM.indexArrayM xs ix
717 |             Pure.unsafeCopy dst dstOff x
718 |             go (ix + 1) (dstOff + length x)
719 |           else PM.unsafeFreezeByteArray dst
720 |   go 0 0
721 | 
722 | concatArray :: Array Bytes -> Bytes
723 | {-# INLINE concatArray #-}
724 | concatArray !xs = Pure.fromByteArray (concatArrayU xs)
725 | 
726 | {- | Convert 'Bytes' to 'BytesN', exposing the length in a type-safe
727 | way in the callback.
728 | -}
729 | withLength ::
730 |   Bytes ->
731 |   (forall (n :: GHC.Nat). Arithmetic.Nat n -> BytesN n -> a) ->
732 |   a
733 | {-# INLINE withLength #-}
734 | withLength Bytes {array, offset, length = len} f =
735 |   Nat.with
736 |     len
737 |     (\n -> f n BytesN {array, offset})
738 | 
739 | withLengthU ::
740 |   ByteArray ->
741 |   (forall (n :: GHC.Nat). Arithmetic.Nat n -> ByteArrayN n -> a) ->
742 |   a
743 | {-# INLINE withLengthU #-}
744 | withLengthU !arr f =
745 |   Nat.with
746 |     (PM.sizeofByteArray arr)
747 |     (\n -> f n (ByteArrayN arr))
748 | 


--------------------------------------------------------------------------------