├── .github ├── CODEOWNERS └── workflows │ ├── release.yaml │ └── build.yaml ├── cabal.project ├── src-ghc-cstrlen └── Cstrlen.hs ├── src └── Data │ ├── Bytes │ ├── Text │ │ ├── Windows1252.hs │ │ ├── Utf8.hs │ │ ├── Ascii.hs │ │ ├── AsciiExt.hs │ │ └── Latin1.hs │ ├── Indexed.hs │ ├── Internal │ │ └── Show.hs │ ├── IO.hs │ ├── Types.hs │ ├── Encode │ │ ├── BigEndian.hs │ │ └── LittleEndian.hs │ ├── Mutable.hs │ ├── Internal.hs │ ├── Search.hs │ ├── Byte.hs │ ├── Chunks.hs │ └── Pure.hs │ └── Bytes.hs ├── include └── bs_custom.h ├── src-no-ghc-cstrlen └── Cstrlen.hs ├── .gitignore ├── src-new-reps └── Reps.hs ├── src-old-reps └── Reps.hs ├── README.md ├── LICENSE ├── cbits └── bs_custom.c ├── fourmolu.yaml ├── bench └── Main.hs ├── byteslice.cabal ├── CHANGELOG.md └── test └── Main.hs /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | @byteverse/l3c 2 | -------------------------------------------------------------------------------- /cabal.project: -------------------------------------------------------------------------------- 1 | -- 2024-02-06: Including this to make "cabal test" work on build machine. 2 | packages: . 3 | tests: True 4 | -------------------------------------------------------------------------------- /src-ghc-cstrlen/Cstrlen.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MagicHash #-} 2 | 3 | module Cstrlen 4 | ( cstringLength# 5 | ) where 6 | 7 | import GHC.Exts (cstringLength#) 8 | -------------------------------------------------------------------------------- /src/Data/Bytes/Text/Windows1252.hs: -------------------------------------------------------------------------------- 1 | {- | Placeholder module in case there is demand for treating 'Bytes' as 2 | Windows-1252-encoded text 3 | -} 4 | module Data.Bytes.Text.Windows1252 () where 5 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: release 2 | on: 3 | push: 4 | tags: 5 | - "*" 6 | 7 | jobs: 8 | call-workflow: 9 | uses: byteverse/.github/.github/workflows/release.yaml@main 10 | secrets: inherit 11 | -------------------------------------------------------------------------------- /.github/workflows/build.yaml: -------------------------------------------------------------------------------- 1 | name: build 2 | on: 3 | pull_request: 4 | branches: 5 | - "*" 6 | 7 | jobs: 8 | call-workflow: 9 | uses: byteverse/.github/.github/workflows/build-matrix.yaml@main 10 | with: 11 | cabal-file: byteslice.cabal 12 | -------------------------------------------------------------------------------- /include/bs_custom.h: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | 3 | #include 4 | #include "Rts.h" 5 | 6 | HsInt memchr_ba_many(unsigned char *p, HsInt off, HsInt len, HsInt *sizes, HsInt sizesLen, unsigned char w); 7 | HsInt count_ba(unsigned char *p, HsInt off, HsInt len, unsigned char w); 8 | -------------------------------------------------------------------------------- /src-no-ghc-cstrlen/Cstrlen.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MagicHash #-} 2 | {-# LANGUAGE TypeFamilies #-} 3 | {-# LANGUAGE UnliftedFFITypes #-} 4 | 5 | module Cstrlen 6 | ( cstringLength# 7 | ) where 8 | 9 | import GHC.Exts (Addr#, Int#) 10 | 11 | foreign import ccall unsafe "strlen" c_strlen :: Addr# -> Int# 12 | 13 | cstringLength# :: Addr# -> Int# 14 | cstringLength# = c_strlen 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | dist 3 | dist-* 4 | cabal-dev 5 | *.o 6 | *.hi 7 | *.chi 8 | *.chs.h 9 | *.dyn_o 10 | *.dyn_hi 11 | .hpc 12 | .hsenv 13 | .cabal-sandbox/ 14 | cabal.sandbox.config 15 | *.prof 16 | *.aux 17 | *.hp 18 | *.eventlog 19 | .stack-work/ 20 | cabal.project.local 21 | cabal.project.local~ 22 | .HTF/ 23 | .ghc.environment.* 24 | stack.yaml 25 | *.swm 26 | *.swo 27 | *.swp 28 | test_results/** 29 | -------------------------------------------------------------------------------- /src-new-reps/Reps.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DataKinds #-} 2 | {-# LANGUAGE GADTSyntax #-} 3 | {-# LANGUAGE KindSignatures #-} 4 | {-# LANGUAGE MagicHash #-} 5 | {-# LANGUAGE UnboxedTuples #-} 6 | {-# LANGUAGE UnliftedNewtypes #-} 7 | 8 | module Reps 9 | ( Bytes# (..) 10 | , word8ToWord# 11 | ) where 12 | 13 | import GHC.Exts (ByteArray#, Int#, Levity (Unlifted), RuntimeRep (..), TYPE, word8ToWord#) 14 | 15 | newtype Bytes# :: TYPE ('TupleRep '[ 'BoxedRep 'Unlifted, 'IntRep, 'IntRep]) where 16 | Bytes# :: (# ByteArray#, Int#, Int# #) -> Bytes# 17 | -------------------------------------------------------------------------------- /src-old-reps/Reps.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTSyntax #-} 2 | {-# LANGUAGE KindSignatures #-} 3 | {-# LANGUAGE MagicHash #-} 4 | {-# LANGUAGE TypeInType #-} 5 | {-# LANGUAGE UnboxedTuples #-} 6 | {-# LANGUAGE UnliftedNewtypes #-} 7 | 8 | module Reps 9 | ( Bytes# (..) 10 | , word8ToWord# 11 | ) where 12 | 13 | import GHC.Exts (ByteArray#, Int#, RuntimeRep (..), TYPE, Word#) 14 | 15 | newtype Bytes# :: TYPE ('TupleRep '[ 'UnliftedRep, 'IntRep, 'IntRep]) where 16 | Bytes# :: (# ByteArray#, Int#, Int# #) -> Bytes# 17 | 18 | -- In GHC 9.2, the lifted Word8 type started being backed by the 19 | -- unlifted Word8# instead of by Word#. This is a compatibility hack. 20 | word8ToWord# :: Word# -> Word# 21 | {-# INLINE word8ToWord# #-} 22 | word8ToWord# w = w 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # byteslice 2 | 3 | ## Purpose 4 | 5 | Types for dealing with slices of 'ByteArray' and 'MutableByteArray'. 6 | These are never supposed to introduce overhead. Rather, they exist 7 | to clarify intent in type signatures. 8 | 9 | receive :: 10 | Resource -- ^ Some scarce resource 11 | -> MutableByteArray RealWorld -- ^ Buffer 12 | -> Int -- ^ Offset 13 | -> Int -- ^ Length 14 | -> IO () 15 | 16 | With this library, we instead write 17 | 18 | receive :: 19 | Resource -- ^ Some scarce resource 20 | -> MutableBytes RealWorld -- ^ Buffer 21 | -> IO () 22 | 23 | The combination of the worker-wrapper transformation and inlining means 24 | that we can expect these two to end up generating the same code in most 25 | situations. 26 | -------------------------------------------------------------------------------- /src/Data/Bytes/Indexed.hs: -------------------------------------------------------------------------------- 1 | {-# language DataKinds #-} 2 | {-# language MagicHash #-} 3 | {-# language TypeOperators #-} 4 | 5 | module Data.Bytes.Indexed 6 | ( ByteArrayN 7 | , append 8 | , length 9 | , length# 10 | ) where 11 | 12 | import Prelude hiding (length) 13 | 14 | import Data.Primitive (ByteArray(ByteArray)) 15 | import Data.Bytes.Types (ByteArrayN(ByteArrayN)) 16 | import GHC.TypeNats (type (+)) 17 | import Arithmetic.Types (Nat, Nat#) 18 | 19 | import qualified Data.Primitive as PM 20 | import qualified Arithmetic.Unsafe as Unsafe 21 | import qualified GHC.Exts as Exts 22 | 23 | append :: ByteArrayN m -> ByteArrayN n -> ByteArrayN (m + n) 24 | {-# inline append #-} 25 | append (ByteArrayN x) (ByteArrayN y) = ByteArrayN (x <> y) 26 | 27 | -- | Recover a witness of the length. 28 | length :: ByteArrayN n -> Nat n 29 | {-# inline length #-} 30 | length (ByteArrayN x) = Unsafe.Nat (PM.sizeofByteArray x) 31 | 32 | -- | Recover an unboxed witness of the length. 33 | length# :: ByteArrayN n -> Nat# n 34 | {-# inline length# #-} 35 | length# (ByteArrayN (ByteArray x)) = Unsafe.Nat# (Exts.sizeofByteArray# x) 36 | -------------------------------------------------------------------------------- /src/Data/Bytes/Internal/Show.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | 3 | module Data.Bytes.Internal.Show 4 | ( showsSlice 5 | ) where 6 | 7 | import Data.Bits (unsafeShiftR, (.&.)) 8 | import Data.Char (ord) 9 | import Data.Primitive (ByteArray) 10 | import Data.Word (Word8) 11 | import GHC.Base (unsafeChr) 12 | 13 | import qualified Data.Primitive as PM 14 | 15 | showsSlice :: ByteArray -> Int -> Int -> String -> String 16 | showsSlice arr off len s = 17 | if len == 0 18 | then showString "[]" s 19 | else 20 | showString "[0x" $ 21 | showHexDigitsWord8 (PM.indexByteArray arr off) $ 22 | showHexLoop (off + 1) (len - 1) arr $ 23 | showChar ']' $ 24 | s 25 | 26 | showHexLoop :: Int -> Int -> ByteArray -> String -> String 27 | showHexLoop !ix !len !arr s = 28 | if len > 0 29 | then ',' : '0' : 'x' : showHexDigitsWord8 (PM.indexByteArray arr ix) (showHexLoop (ix + 1) (len - 1) arr s) 30 | else s 31 | 32 | showHexDigitsWord8 :: Word8 -> String -> String 33 | showHexDigitsWord8 !w s = word4ToChar (unsafeShiftR w 4) : word4ToChar (0x0F .&. w) : s 34 | 35 | word4ToChar :: Word8 -> Char 36 | word4ToChar w = 37 | if w < 10 38 | then unsafeChr (ord '0' + fromIntegral w) 39 | else unsafeChr (ord 'a' + (fromIntegral w) - 10) 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019, Andrew Martin 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Andrew Martin nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /cbits/bs_custom.c: -------------------------------------------------------------------------------- 1 | #define _GNU_SOURCE 2 | 3 | #include 4 | #include 5 | #include "Rts.h" 6 | 7 | // Find all occurrences of a byte, writing the lengths of each piece 8 | // to the sizes buffer. This uses rawmemchr, so the total number of 9 | // occurrences of the delimiter must be computed in advance. This 10 | // returns the total length of all pieces processed. 11 | // 12 | // Portability: On Linux, this uses rawmemchr. On all other platforms, 13 | // it uses memchr. On Linux, the length of the byte sequence is not 14 | // used. On other platforms, this is repeatedly decremented to provide 15 | // an appropriate third argument for memchr. 16 | HsInt memchr_ba_many(unsigned char *p, HsInt off, HsInt len, HsInt *sizes, HsInt sizesLen, unsigned char w) { 17 | HsInt szIx, delta, total; 18 | unsigned char* pos; 19 | p = p + off; 20 | total = 0; 21 | for (szIx = 0; szIx < sizesLen; ++szIx) { 22 | #if defined(__linux__) && !AVOID_RAWMEMCHR 23 | pos = (unsigned char*)(rawmemchr((void*)p,(int)w)); 24 | delta = (HsInt)(pos - p); 25 | #else 26 | pos = (unsigned char*)(memchr((void*)p,(int)w,(size_t)len)); 27 | delta = (HsInt)(pos - p); 28 | len = len - (delta + 1); 29 | #endif 30 | sizes[szIx] = delta; 31 | total = total + delta + 1; 32 | p = pos + 1; 33 | } 34 | return total; 35 | } 36 | 37 | // TODO: Possibly use SIMD in here. Or check so see if gcc optimizes 38 | // this on its own. 39 | HsInt count_ba(unsigned char *p, HsInt off, HsInt len, unsigned char w) { 40 | HsInt c; 41 | p = p + off; 42 | for (c = 0; len-- != 0; ++p) 43 | if (*p == w) 44 | ++c; 45 | return c; 46 | } 47 | -------------------------------------------------------------------------------- /fourmolu.yaml: -------------------------------------------------------------------------------- 1 | # Number of spaces per indentation step 2 | indentation: 2 3 | 4 | # Max line length for automatic line breaking 5 | column-limit: 200 6 | 7 | # Styling of arrows in type signatures (choices: trailing, leading, or leading-args) 8 | function-arrows: trailing 9 | 10 | # How to place commas in multi-line lists, records, etc. (choices: leading or trailing) 11 | comma-style: leading 12 | 13 | # Styling of import/export lists (choices: leading, trailing, or diff-friendly) 14 | import-export-style: leading 15 | 16 | # Whether to full-indent or half-indent 'where' bindings past the preceding body 17 | indent-wheres: false 18 | 19 | # Whether to leave a space before an opening record brace 20 | record-brace-space: true 21 | 22 | # Number of spaces between top-level declarations 23 | newlines-between-decls: 1 24 | 25 | # How to print Haddock comments (choices: single-line, multi-line, or multi-line-compact) 26 | haddock-style: multi-line 27 | 28 | # How to print module docstring 29 | haddock-style-module: null 30 | 31 | # Styling of let blocks (choices: auto, inline, newline, or mixed) 32 | let-style: auto 33 | 34 | # How to align the 'in' keyword with respect to the 'let' keyword (choices: left-align, right-align, or no-space) 35 | in-style: right-align 36 | 37 | # Whether to put parentheses around a single constraint (choices: auto, always, or never) 38 | single-constraint-parens: always 39 | 40 | # Output Unicode syntax (choices: detect, always, or never) 41 | unicode: never 42 | 43 | # Give the programmer more choice on where to insert blank lines 44 | respectful: true 45 | 46 | # Fixity information for operators 47 | fixities: [] 48 | 49 | # Module reexports Fourmolu should know about 50 | reexports: [] 51 | 52 | -------------------------------------------------------------------------------- /src/Data/Bytes/IO.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BlockArguments #-} 2 | {-# LANGUAGE MagicHash #-} 3 | {-# LANGUAGE TypeApplications #-} 4 | {-# LANGUAGE UnboxedTuples #-} 5 | 6 | module Data.Bytes.IO 7 | ( hGet 8 | , hPut 9 | ) where 10 | 11 | import Data.Bytes.Pure (contents, pin) 12 | import Data.Bytes.Types (Bytes (Bytes)) 13 | import Data.Primitive (ByteArray (..), MutableByteArray) 14 | import qualified Data.Primitive as PM 15 | import Data.Word (Word8) 16 | import Foreign.Ptr (Ptr) 17 | import qualified GHC.Exts as Exts 18 | import GHC.IO (IO (IO)) 19 | import System.IO (Handle) 20 | import qualified System.IO as IO 21 | 22 | {- | Read 'Bytes' directly from the specified 'Handle'. The resulting 23 | 'Bytes' are pinned. This is implemented with 'IO.hGetBuf'. 24 | -} 25 | hGet :: Handle -> Int -> IO Bytes 26 | hGet h i = createPinnedAndTrim i (\p -> IO.hGetBuf h p i) 27 | 28 | {- | Outputs 'Bytes' to the specified 'Handle'. This is implemented 29 | with 'IO.hPutBuf'. 30 | -} 31 | hPut :: Handle -> Bytes -> IO () 32 | hPut h b0 = do 33 | let b1@(Bytes arr _ len) = pin b0 34 | IO.hPutBuf h (contents b1) len 35 | touchByteArrayIO arr 36 | 37 | -- Only used internally. 38 | createPinnedAndTrim :: Int -> (Ptr Word8 -> IO Int) -> IO Bytes 39 | {-# INLINE createPinnedAndTrim #-} 40 | createPinnedAndTrim maxSz f = do 41 | arr@(PM.MutableByteArray arr#) <- PM.newPinnedByteArray maxSz 42 | sz <- f (PM.mutableByteArrayContents arr) 43 | touchMutableByteArrayIO arr 44 | PM.shrinkMutablePrimArray (PM.MutablePrimArray @Exts.RealWorld @Word8 arr#) sz 45 | r <- PM.unsafeFreezeByteArray arr 46 | pure (Bytes r 0 sz) 47 | 48 | touchMutableByteArrayIO :: MutableByteArray s -> IO () 49 | touchMutableByteArrayIO (PM.MutableByteArray x) = 50 | IO (\s -> (# Exts.touch# x s, () #)) 51 | 52 | touchByteArrayIO :: ByteArray -> IO () 53 | touchByteArrayIO (ByteArray x) = 54 | IO (\s -> (# Exts.touch# x s, () #)) 55 | -------------------------------------------------------------------------------- /bench/Main.hs: -------------------------------------------------------------------------------- 1 | import Gauge.Main (bench, bgroup, defaultMain, whnf) 2 | 3 | import Data.Bytes.Types 4 | import Data.List (permutations) 5 | 6 | import qualified Data.Bytes as Bytes 7 | import qualified Data.Bytes.Text.Ascii as Ascii 8 | import qualified Data.Bytes.Text.Utf8 as Utf8 9 | 10 | naiveMconcat :: [Bytes] -> Bytes 11 | naiveMconcat = foldr mappend mempty 12 | 13 | main :: IO () 14 | main = 15 | defaultMain 16 | [ bench "mconcat" $ whnf mconcat mconcatBytes 17 | , bench "naiveMconcat" $ whnf naiveMconcat mconcatBytes 18 | , bgroup 19 | "replace" 20 | [ bench "the-dog-and-the-shadow" (whnf replaceMeat theDogAndTheShadow) 21 | ] 22 | , bgroup 23 | "ascii" 24 | [ bench "toText" (whnf Ascii.toText theDogAndTheShadow) 25 | ] 26 | , bgroup 27 | "utf8" 28 | [ bench "toText" (whnf Utf8.toText theDogAndTheShadow) 29 | ] 30 | ] 31 | 32 | mconcatBytes :: [Bytes] 33 | mconcatBytes = fmap Ascii.fromString $ permutations ['a' .. 'g'] 34 | 35 | replaceMeat :: Bytes -> Bytes 36 | {-# NOINLINE replaceMeat #-} 37 | replaceMeat x = Bytes.replace meat gruel x 38 | 39 | meat :: Bytes 40 | meat = Ascii.fromString "meat" 41 | 42 | gruel :: Bytes 43 | gruel = Ascii.fromString "gruel" 44 | 45 | theDogAndTheShadow :: Bytes 46 | theDogAndTheShadow = 47 | Ascii.fromString $ 48 | concat 49 | [ "It happened that a Dog had got a piece of meat and was " 50 | , "carrying it home in his mouth to eat it in peace. " 51 | , "Now on his way home he had to cross a plank lying across " 52 | , "a running brook. As he crossed, he looked down and saw his " 53 | , "own shadow reflected in the water beneath. Thinking it was " 54 | , "another dog with another piece of meat, he made up his mind " 55 | , "to have that also. So he made a snap at the shadow in the water, " 56 | , "but as he opened his mouth the piece of meat fell out, dropped " 57 | , "into the water and was never seen more." 58 | ] 59 | -------------------------------------------------------------------------------- /src/Data/Bytes/Text/Utf8.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | 4 | -- | Convert 'Bytes' to and from 'Text' and 'ShortText'. 5 | module Data.Bytes.Text.Utf8 6 | ( fromShortText 7 | , toShortText 8 | , fromText 9 | , toText 10 | ) where 11 | 12 | import Data.Bytes.Types (Bytes (Bytes)) 13 | import Data.Primitive (ByteArray (ByteArray)) 14 | import Data.Text (Text) 15 | import Data.Text.Short (ShortText) 16 | 17 | import qualified Data.Bytes as Bytes 18 | import qualified Data.Text.Array as A 19 | import qualified Data.Text.Internal as I 20 | import qualified Data.Text.Short as TS 21 | 22 | import qualified Data.Text.Internal.Validate 23 | 24 | {- | Encode 'ShortText' using UTF-8. Since 'ShortText' is backed by a UTF-8 25 | byte sequence, this does not perform a copy. 26 | -} 27 | fromShortText :: ShortText -> Bytes 28 | {-# INLINE fromShortText #-} 29 | fromShortText = Bytes.fromShortByteString . TS.toShortByteString 30 | 31 | {- | Attempt to interpret the byte sequence as UTF-8 encoded text. Returns 32 | 'Nothing' if the bytes are not UTF-8 encoded text. 33 | -} 34 | toShortText :: Bytes -> Maybe ShortText 35 | {-# INLINE toShortText #-} 36 | toShortText !b = TS.fromShortByteString (Bytes.toShortByteString b) 37 | 38 | -- | Encode 'Text' using @UTF-8@. Only available when building with 39 | -- @text-2.0@ and newer. Since 'Text' is backed by a UTF-8 40 | -- byte sequence, this does not perform a copy. 41 | fromText :: Text -> Bytes 42 | {-# inline fromText #-} 43 | fromText (I.Text (A.ByteArray b) off len) = Bytes (ByteArray b) off len 44 | 45 | -- | Attempt to interpret byte sequence as @UTF-8@ encoded 'Text'. 46 | -- Only available when building with @text-2.1@ and newer. Since 47 | -- 'Text' is backed by a UTF-8 byte sequence, this does not perform a 48 | -- copy. 49 | toText :: Bytes -> Maybe Text 50 | {-# inline toText #-} 51 | toText (Bytes b@(ByteArray b0) off len) = 52 | if Data.Text.Internal.Validate.isValidUtf8ByteArray b off len 53 | then Just (I.Text (A.ByteArray b0) off len) 54 | else Nothing 55 | -------------------------------------------------------------------------------- /src/Data/Bytes/Types.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DataKinds #-} 2 | {-# LANGUAGE DuplicateRecordFields #-} 3 | {-# LANGUAGE MagicHash #-} 4 | {-# LANGUAGE PolyKinds #-} 5 | {-# LANGUAGE ScopedTypeVariables #-} 6 | {-# LANGUAGE TypeFamilies #-} 7 | 8 | module Data.Bytes.Types 9 | ( Bytes (..) 10 | , Bytes# (..) 11 | , MutableBytes (..) 12 | , UnmanagedBytes (..) 13 | , BytesN (..) 14 | , ByteArrayN (..) 15 | ) where 16 | 17 | import Data.Bytes.Internal (Bytes (..)) 18 | import Data.Bytes.Internal.Show (showsSlice) 19 | import Data.Primitive (ByteArray (..), MutableByteArray (..)) 20 | import Data.Primitive.Addr (Addr) 21 | import Data.Proxy (Proxy (Proxy)) 22 | import GHC.Natural (naturalToInteger) 23 | import GHC.TypeNats (KnownNat, Nat, natVal) 24 | import Reps (Bytes# (..)) 25 | 26 | {- | A slice of a 'ByteArray' whose compile-time-known length is represented 27 | by a phantom type variable. Consumers of this data constructor must be 28 | careful to preserve the expected invariant. 29 | -} 30 | data BytesN (n :: Nat) = BytesN 31 | { array :: {-# UNPACK #-} !ByteArray 32 | , offset :: {-# UNPACK #-} !Int 33 | } 34 | 35 | instance (KnownNat n) => Show (BytesN n) where 36 | showsPrec _ (BytesN arr off) s = 37 | let len = fromInteger (naturalToInteger (natVal (Proxy :: Proxy n))) 38 | in showsSlice arr off len s 39 | 40 | {- | A 'ByteArray' whose compile-time-known length is represented 41 | by a phantom type variable. Consumers of this data constructor must be 42 | careful to preserve the expected invariant. 43 | -} 44 | newtype ByteArrayN (n :: Nat) = ByteArrayN 45 | { array :: ByteArray 46 | } 47 | 48 | instance (KnownNat n) => Show (ByteArrayN n) where 49 | showsPrec _ (ByteArrayN arr) s = 50 | let len = fromInteger (naturalToInteger (natVal (Proxy :: Proxy n))) 51 | in showsSlice arr 0 len s 52 | 53 | -- | A slice of a 'MutableByteArray'. 54 | data MutableBytes s = MutableBytes 55 | { array :: {-# UNPACK #-} !(MutableByteArray s) 56 | , offset :: {-# UNPACK #-} !Int 57 | , length :: {-# UNPACK #-} !Int 58 | } 59 | 60 | -- | A slice of unmanaged memory. 61 | data UnmanagedBytes = UnmanagedBytes 62 | { address :: {-# UNPACK #-} !Addr 63 | , length :: {-# UNPACK #-} !Int 64 | } 65 | -------------------------------------------------------------------------------- /src/Data/Bytes/Encode/BigEndian.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE TypeApplications #-} 3 | 4 | module Data.Bytes.Encode.BigEndian 5 | ( word16 6 | , word32 7 | , word64 8 | , int16 9 | , int32 10 | , int64 11 | ) where 12 | 13 | import Control.Monad.ST.Run (runByteArrayST) 14 | import Data.Bits (unsafeShiftR) 15 | import Data.Bytes.Types (Bytes) 16 | import Data.Int (Int16, Int32, Int64) 17 | import Data.Primitive (ByteArray) 18 | import Data.Word (Word16, Word32, Word64, Word8) 19 | 20 | import qualified Data.Bytes.Pure as Pure 21 | import qualified Data.Primitive as PM 22 | 23 | -- | Encode a 32-bit signed integer as 4 bytes. 24 | int32 :: Int32 -> Bytes 25 | {-# INLINE int32 #-} 26 | int32 = word32 . fromIntegral @Int32 @Word32 27 | 28 | -- | Encode a 32-bit unsigned integer as 4 bytes. 29 | word32 :: Word32 -> Bytes 30 | word32 !w = Pure.fromByteArray (word32U w) 31 | 32 | word32U :: Word32 -> ByteArray 33 | word32U !w = runByteArrayST $ do 34 | arr <- PM.newByteArray 4 35 | PM.writeByteArray arr 0 (fromIntegral @Word32 @Word8 (unsafeShiftR w 24)) 36 | PM.writeByteArray arr 1 (fromIntegral @Word32 @Word8 (unsafeShiftR w 16)) 37 | PM.writeByteArray arr 2 (fromIntegral @Word32 @Word8 (unsafeShiftR w 8)) 38 | PM.writeByteArray arr 3 (fromIntegral @Word32 @Word8 w) 39 | PM.unsafeFreezeByteArray arr 40 | 41 | -- | Encode a 16-bit signed integer as 4 bytes. 42 | int16 :: Int16 -> Bytes 43 | {-# INLINE int16 #-} 44 | int16 = word16 . fromIntegral @Int16 @Word16 45 | 46 | -- | Encode a 16-bit unsigned integer as 4 bytes. 47 | word16 :: Word16 -> Bytes 48 | word16 !w = Pure.fromByteArray (word16U w) 49 | 50 | word16U :: Word16 -> ByteArray 51 | word16U !w = runByteArrayST $ do 52 | arr <- PM.newByteArray 2 53 | PM.writeByteArray arr 0 (fromIntegral @Word16 @Word8 (unsafeShiftR w 8)) 54 | PM.writeByteArray arr 1 (fromIntegral @Word16 @Word8 w) 55 | PM.unsafeFreezeByteArray arr 56 | 57 | -- | Encode a 16-bit signed integer as 4 bytes. 58 | int64 :: Int64 -> Bytes 59 | {-# INLINE int64 #-} 60 | int64 = word64 . fromIntegral @Int64 @Word64 61 | 62 | -- | Encode a 16-bit unsigned integer as 4 bytes. 63 | word64 :: Word64 -> Bytes 64 | word64 !w = Pure.fromByteArray (word64U w) 65 | 66 | word64U :: Word64 -> ByteArray 67 | word64U !w = runByteArrayST $ do 68 | arr <- PM.newByteArray 8 69 | PM.writeByteArray arr 0 (fromIntegral @Word64 @Word8 (unsafeShiftR w 56)) 70 | PM.writeByteArray arr 1 (fromIntegral @Word64 @Word8 (unsafeShiftR w 48)) 71 | PM.writeByteArray arr 2 (fromIntegral @Word64 @Word8 (unsafeShiftR w 40)) 72 | PM.writeByteArray arr 3 (fromIntegral @Word64 @Word8 (unsafeShiftR w 32)) 73 | PM.writeByteArray arr 4 (fromIntegral @Word64 @Word8 (unsafeShiftR w 24)) 74 | PM.writeByteArray arr 5 (fromIntegral @Word64 @Word8 (unsafeShiftR w 16)) 75 | PM.writeByteArray arr 6 (fromIntegral @Word64 @Word8 (unsafeShiftR w 8)) 76 | PM.writeByteArray arr 7 (fromIntegral @Word64 @Word8 w) 77 | PM.unsafeFreezeByteArray arr 78 | -------------------------------------------------------------------------------- /src/Data/Bytes/Encode/LittleEndian.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE TypeApplications #-} 3 | 4 | module Data.Bytes.Encode.LittleEndian 5 | ( word16 6 | , word32 7 | , word64 8 | , int16 9 | , int32 10 | , int64 11 | ) where 12 | 13 | import Control.Monad.ST.Run (runByteArrayST) 14 | import Data.Bits (unsafeShiftR) 15 | import Data.Bytes.Types (Bytes) 16 | import Data.Int (Int16, Int32, Int64) 17 | import Data.Primitive (ByteArray) 18 | import Data.Word (Word16, Word32, Word64, Word8) 19 | 20 | import qualified Data.Bytes.Pure as Pure 21 | import qualified Data.Primitive as PM 22 | 23 | -- | Encode a 32-bit signed integer as 4 bytes. 24 | int32 :: Int32 -> Bytes 25 | {-# INLINE int32 #-} 26 | int32 = word32 . fromIntegral @Int32 @Word32 27 | 28 | -- | Encode a 32-bit unsigned integer as 4 bytes. 29 | word32 :: Word32 -> Bytes 30 | word32 !w = Pure.fromByteArray (word32U w) 31 | 32 | word32U :: Word32 -> ByteArray 33 | word32U !w = runByteArrayST $ do 34 | arr <- PM.newByteArray 4 35 | PM.writeByteArray arr 3 (fromIntegral @Word32 @Word8 (unsafeShiftR w 24)) 36 | PM.writeByteArray arr 2 (fromIntegral @Word32 @Word8 (unsafeShiftR w 16)) 37 | PM.writeByteArray arr 1 (fromIntegral @Word32 @Word8 (unsafeShiftR w 8)) 38 | PM.writeByteArray arr 0 (fromIntegral @Word32 @Word8 w) 39 | PM.unsafeFreezeByteArray arr 40 | 41 | -- | Encode a 16-bit signed integer as 4 bytes. 42 | int16 :: Int16 -> Bytes 43 | {-# INLINE int16 #-} 44 | int16 = word16 . fromIntegral @Int16 @Word16 45 | 46 | -- | Encode a 16-bit unsigned integer as 4 bytes. 47 | word16 :: Word16 -> Bytes 48 | word16 !w = Pure.fromByteArray (word16U w) 49 | 50 | word16U :: Word16 -> ByteArray 51 | word16U !w = runByteArrayST $ do 52 | arr <- PM.newByteArray 2 53 | PM.writeByteArray arr 1 (fromIntegral @Word16 @Word8 (unsafeShiftR w 8)) 54 | PM.writeByteArray arr 0 (fromIntegral @Word16 @Word8 w) 55 | PM.unsafeFreezeByteArray arr 56 | 57 | -- | Encode a 16-bit signed integer as 4 bytes. 58 | int64 :: Int64 -> Bytes 59 | {-# INLINE int64 #-} 60 | int64 = word64 . fromIntegral @Int64 @Word64 61 | 62 | -- | Encode a 16-bit unsigned integer as 4 bytes. 63 | word64 :: Word64 -> Bytes 64 | word64 !w = Pure.fromByteArray (word64U w) 65 | 66 | word64U :: Word64 -> ByteArray 67 | word64U !w = runByteArrayST $ do 68 | arr <- PM.newByteArray 8 69 | PM.writeByteArray arr 7 (fromIntegral @Word64 @Word8 (unsafeShiftR w 56)) 70 | PM.writeByteArray arr 6 (fromIntegral @Word64 @Word8 (unsafeShiftR w 48)) 71 | PM.writeByteArray arr 5 (fromIntegral @Word64 @Word8 (unsafeShiftR w 40)) 72 | PM.writeByteArray arr 4 (fromIntegral @Word64 @Word8 (unsafeShiftR w 32)) 73 | PM.writeByteArray arr 3 (fromIntegral @Word64 @Word8 (unsafeShiftR w 24)) 74 | PM.writeByteArray arr 2 (fromIntegral @Word64 @Word8 (unsafeShiftR w 16)) 75 | PM.writeByteArray arr 1 (fromIntegral @Word64 @Word8 (unsafeShiftR w 8)) 76 | PM.writeByteArray arr 0 (fromIntegral @Word64 @Word8 w) 77 | PM.unsafeFreezeByteArray arr 78 | -------------------------------------------------------------------------------- /src/Data/Bytes/Mutable.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE LambdaCase #-} 3 | 4 | {- | If you are interested in sub-arrays of 'MutableByteArray's (e.g. writing 5 | quicksort), it would be grossly inefficient to make a copy of the sub-array. 6 | On the other hand, it'd be really annoying to track limit indices by hand. 7 | 8 | This module defines the 'MutableBytes' type which exposes a standard array 9 | interface for a sub-arrays without copying and without manual index 10 | manipulation. For immutable arrays, see 'Data.Bytes'. 11 | -} 12 | module Data.Bytes.Mutable 13 | ( -- * Types 14 | MutableBytes 15 | 16 | -- * Filtering 17 | , takeWhile 18 | , dropWhile 19 | 20 | -- * Unsafe Slicing 21 | , unsafeTake 22 | , unsafeDrop 23 | 24 | -- * Conversion 25 | , fromMutableByteArray 26 | ) where 27 | 28 | import Prelude hiding (dropWhile, takeWhile) 29 | 30 | import Control.Monad.Primitive (PrimMonad, PrimState) 31 | import Data.Bytes.Types (MutableBytes (MutableBytes)) 32 | import Data.Primitive (MutableByteArray) 33 | import Data.Word (Word8) 34 | 35 | import qualified Data.Primitive as PM 36 | 37 | {- | Take bytes while the predicate is true, aliasing the 38 | argument array. 39 | -} 40 | takeWhile :: 41 | (PrimMonad m) => 42 | (Word8 -> m Bool) -> 43 | MutableBytes (PrimState m) -> 44 | m (MutableBytes (PrimState m)) 45 | {-# INLINE takeWhile #-} 46 | takeWhile k b = do 47 | n <- countWhile k b 48 | pure (unsafeTake n b) 49 | 50 | {- | Drop bytes while the predicate is true, aliasing the 51 | argument array. 52 | -} 53 | dropWhile :: 54 | (PrimMonad m) => 55 | (Word8 -> m Bool) -> 56 | MutableBytes (PrimState m) -> 57 | m (MutableBytes (PrimState m)) 58 | {-# INLINE dropWhile #-} 59 | dropWhile k b = do 60 | n <- countWhile k b 61 | pure (unsafeDrop n b) 62 | 63 | -- | Take the first @n@ bytes from the argument, aliasing it. 64 | unsafeTake :: Int -> MutableBytes s -> MutableBytes s 65 | {-# INLINE unsafeTake #-} 66 | unsafeTake n (MutableBytes arr off _) = 67 | MutableBytes arr off n 68 | 69 | {- | Drop the first @n@ bytes from the argument, aliasing it. 70 | The new length will be @len - n@. 71 | -} 72 | unsafeDrop :: Int -> MutableBytes s -> MutableBytes s 73 | {-# INLINE unsafeDrop #-} 74 | unsafeDrop n (MutableBytes arr off len) = 75 | MutableBytes arr (off + n) (len - n) 76 | 77 | {- | Create a slice of 'MutableBytes' that spans the entire 78 | argument array. This aliases the argument. 79 | -} 80 | fromMutableByteArray :: 81 | (PrimMonad m) => 82 | MutableByteArray (PrimState m) -> 83 | m (MutableBytes (PrimState m)) 84 | {-# INLINE fromMutableByteArray #-} 85 | fromMutableByteArray mba = do 86 | sz <- PM.getSizeofMutableByteArray mba 87 | pure (MutableBytes mba 0 sz) 88 | 89 | -- Internal. The returns the number of bytes that match the 90 | -- predicate until the first non-match occurs. If all bytes 91 | -- match the predicate, this will return the length originally 92 | -- provided. 93 | countWhile :: 94 | (PrimMonad m) => 95 | (Word8 -> m Bool) -> 96 | MutableBytes (PrimState m) -> 97 | m Int 98 | {-# INLINE countWhile #-} 99 | countWhile k (MutableBytes arr off0 len0) = go off0 len0 0 100 | where 101 | go !off !len !n = 102 | if len > 0 103 | then 104 | (k =<< PM.readByteArray arr off) >>= \case 105 | True -> go (off + 1) (len - 1) (n + 1) 106 | False -> pure n 107 | else pure n 108 | -------------------------------------------------------------------------------- /src/Data/Bytes/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE DataKinds #-} 3 | {-# LANGUAGE DuplicateRecordFields #-} 4 | {-# LANGUAGE MagicHash #-} 5 | {-# LANGUAGE TypeFamilies #-} 6 | 7 | -- This needs to be in its own module to prevent a cyclic dependency 8 | -- between UnliftedBytes and Data.Bytes.Types 9 | module Data.Bytes.Internal 10 | ( Bytes (..) 11 | ) where 12 | 13 | import Control.Monad.ST (runST) 14 | import Control.Monad.ST.Run (runByteArrayST) 15 | import Data.Bytes.Internal.Show (showsSlice) 16 | import Data.Primitive (ByteArray (..)) 17 | import Data.Word (Word8) 18 | import GHC.Exts (Int (I#), IsList (..), compareByteArrays#, isTrue#, sameMutableByteArray#, unsafeCoerce#) 19 | 20 | import qualified Data.Foldable as F 21 | import qualified Data.List as L 22 | import qualified Data.Primitive as PM 23 | 24 | -- | A slice of a 'ByteArray'. 25 | data Bytes = Bytes 26 | { array :: {-# UNPACK #-} !ByteArray 27 | , offset :: {-# UNPACK #-} !Int 28 | , length :: {-# UNPACK #-} !Int 29 | } 30 | 31 | instance IsList Bytes where 32 | type Item Bytes = Word8 33 | fromListN n xs = Bytes (fromListN n xs) 0 n 34 | fromList xs = fromListN (L.length xs) xs 35 | toList (Bytes arr off len) = toListLoop off len arr 36 | 37 | toListLoop :: Int -> Int -> ByteArray -> [Word8] 38 | toListLoop !off !len !arr = 39 | if len > 0 40 | then PM.indexByteArray arr off : toListLoop (off + 1) (len - 1) arr 41 | else [] 42 | 43 | instance Show Bytes where 44 | showsPrec _ (Bytes arr off len) s = showsSlice arr off len s 45 | 46 | instance Eq Bytes where 47 | Bytes arr1 off1 len1 == Bytes arr2 off2 len2 48 | | len1 /= len2 = False 49 | | sameByteArray arr1 arr2 && off1 == off2 = True 50 | | otherwise = compareByteArrays arr1 off1 arr2 off2 len1 == EQ 51 | 52 | instance Ord Bytes where 53 | compare (Bytes arr1 off1 len1) (Bytes arr2 off2 len2) 54 | | sameByteArray arr1 arr2 && off1 == off2 && len1 == len2 = EQ 55 | | otherwise = compareByteArrays arr1 off1 arr2 off2 (min len1 len2) <> compare len1 len2 56 | 57 | instance Semigroup Bytes where 58 | -- TODO: Do the trick to move the data constructor to the outside 59 | -- of runST. 60 | Bytes arrA offA lenA <> Bytes arrB offB lenB = runST $ do 61 | marr <- PM.newByteArray (lenA + lenB) 62 | PM.copyByteArray marr 0 arrA offA lenA 63 | PM.copyByteArray marr lenA arrB offB lenB 64 | r <- PM.unsafeFreezeByteArray marr 65 | pure (Bytes r 0 (lenA + lenB)) 66 | 67 | instance Monoid Bytes where 68 | mempty = Bytes mempty 0 0 69 | mconcat [] = mempty 70 | mconcat [x] = x 71 | mconcat bs = Bytes r 0 fullLen 72 | where 73 | !fullLen = L.foldl' (\acc (Bytes _ _ len) -> acc + len) 0 bs 74 | r = runByteArrayST $ do 75 | marr <- PM.newByteArray fullLen 76 | !_ <- 77 | F.foldlM 78 | ( \ !currLen (Bytes arr off len) -> do 79 | PM.copyByteArray marr currLen arr off len 80 | pure (currLen + len) 81 | ) 82 | 0 83 | bs 84 | PM.unsafeFreezeByteArray marr 85 | 86 | compareByteArrays :: ByteArray -> Int -> ByteArray -> Int -> Int -> Ordering 87 | {-# INLINE compareByteArrays #-} 88 | compareByteArrays (ByteArray ba1#) (I# off1#) (ByteArray ba2#) (I# off2#) (I# n#) = 89 | compare (I# (compareByteArrays# ba1# off1# ba2# off2# n#)) 0 90 | 91 | sameByteArray :: ByteArray -> ByteArray -> Bool 92 | {-# INLINE sameByteArray #-} 93 | sameByteArray (ByteArray ba1#) (ByteArray ba2#) = 94 | isTrue# (sameMutableByteArray# (unsafeCoerce# ba1#) (unsafeCoerce# ba2#)) 95 | -------------------------------------------------------------------------------- /byteslice.cabal: -------------------------------------------------------------------------------- 1 | cabal-version: 2.4 2 | name: byteslice 3 | version: 0.2.15.0 4 | synopsis: Slicing managed and unmanaged memory 5 | description: 6 | This library provides types that allow the user to talk about a slice of 7 | a ByteArray or a MutableByteArray. It also offers UnmanagedBytes, which 8 | is kind of like a slice into unmanaged memory. However, it is just an 9 | address and a length. 10 | 11 | homepage: https://github.com/byteverse/byteslice 12 | bug-reports: https://github.com/byteverse/byteslice/issues 13 | license: BSD-3-Clause 14 | license-file: LICENSE 15 | author: Andrew Martin 16 | maintainer: amartin@layer3com.com 17 | copyright: 2020 Andrew Martin 18 | category: Data 19 | extra-doc-files: 20 | CHANGELOG.md 21 | README.md 22 | 23 | tested-with: GHC ==9.4.8 || ==9.6.3 || ==9.8.1 24 | 25 | common build-settings 26 | default-language: Haskell2010 27 | ghc-options: -Wall -Wunused-packages 28 | 29 | flag avoid-rawmemchr 30 | default: True 31 | description: Avoid using rawmemchr which is non-portable GNU libc only 32 | 33 | library 34 | import: build-settings 35 | exposed-modules: 36 | Data.Bytes 37 | Data.Bytes.Chunks 38 | Data.Bytes.Indexed 39 | Data.Bytes.Encode.BigEndian 40 | Data.Bytes.Encode.LittleEndian 41 | Data.Bytes.Internal 42 | Data.Bytes.Mutable 43 | Data.Bytes.Text.Ascii 44 | Data.Bytes.Text.AsciiExt 45 | Data.Bytes.Text.Latin1 46 | Data.Bytes.Text.Utf8 47 | Data.Bytes.Text.Windows1252 48 | Data.Bytes.Types 49 | 50 | other-modules: 51 | Cstrlen 52 | Data.Bytes.Byte 53 | Data.Bytes.Internal.Show 54 | Data.Bytes.IO 55 | Data.Bytes.Pure 56 | Data.Bytes.Search 57 | Reps 58 | 59 | build-depends: 60 | , base >=4.14 && <5 61 | , bytestring >=0.10.8 && <0.13 62 | , natural-arithmetic >=0.1.4 63 | , primitive >=0.7.4 && <0.10 64 | , primitive-addr >=0.1 && <0.2 65 | , primitive-unlifted >=0.1.2 && <2.3 66 | , run-st >=0.1.1 && <0.2 67 | , text >=2.1 68 | , text-short >=0.1.3 && <0.2 69 | , tuples >=0.1 && <0.2 70 | , vector >=0.12 && <0.14 71 | 72 | hs-source-dirs: src 73 | ghc-options: -O2 74 | 75 | if impl(ghc >=9.2) 76 | hs-source-dirs: src-new-reps 77 | 78 | else 79 | hs-source-dirs: src-old-reps 80 | 81 | if impl(ghc >=9.0) 82 | hs-source-dirs: src-ghc-cstrlen 83 | 84 | else 85 | hs-source-dirs: src-no-ghc-cstrlen 86 | 87 | include-dirs: include 88 | includes: bs_custom.h 89 | install-includes: bs_custom.h 90 | c-sources: cbits/bs_custom.c 91 | 92 | if flag(avoid-rawmemchr) 93 | cc-options: -DAVOID_RAWMEMCHR=1 94 | 95 | test-suite test 96 | import: build-settings 97 | type: exitcode-stdio-1.0 98 | hs-source-dirs: test 99 | main-is: Main.hs 100 | build-depends: 101 | , base >=4.11.1 && <5 102 | , byteslice 103 | , bytestring 104 | , primitive 105 | , quickcheck-classes >=0.6.4 106 | , tasty 107 | , tasty-hunit 108 | , tasty-quickcheck 109 | , text >=2.1 110 | , transformers 111 | 112 | benchmark bench 113 | import: build-settings 114 | type: exitcode-stdio-1.0 115 | build-depends: 116 | , base 117 | , byteslice 118 | , gauge 119 | 120 | ghc-options: -O2 121 | hs-source-dirs: bench 122 | main-is: Main.hs 123 | 124 | source-repository head 125 | type: git 126 | location: git://github.com/byteverse/byteslice.git 127 | -------------------------------------------------------------------------------- /src/Data/Bytes/Text/Ascii.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE MagicHash #-} 3 | {-# LANGUAGE BinaryLiterals #-} 4 | {-# LANGUAGE CPP #-} 5 | {-# LANGUAGE NumericUnderscores #-} 6 | {-# LANGUAGE TypeApplications #-} 7 | 8 | {- | This module treats 'Bytes' data as holding ASCII text. Providing bytes 9 | outside the ASCII range (@U+0000@ -- @U+007F@) may cause a failure or 10 | unspecified results, but such bytes will never be inspected. 11 | 12 | For functions that can operate on ASCII-compatible encodings, see 13 | 'Data.Bytes.Text.AsciiExt'. 14 | -} 15 | module Data.Bytes.Text.Ascii 16 | ( fromString 17 | , decodeDecWord 18 | , equalsCStringCaseInsensitive 19 | , toShortText 20 | , toShortTextU 21 | , toText 22 | ) where 23 | 24 | import Data.Bits ((.&.)) 25 | import Data.ByteString.Short.Internal (ShortByteString (SBS)) 26 | import Data.Bytes.Text.Latin1 (decodeDecWord) 27 | import Data.Bytes.Types (Bytes (Bytes)) 28 | import Data.Char (ord) 29 | import Data.Primitive (ByteArray) 30 | import Data.Text (Text) 31 | import Data.Text.Short (ShortText) 32 | import Data.Word (Word8) 33 | import Foreign.C.String (CString) 34 | import Foreign.Ptr (Ptr, castPtr, plusPtr) 35 | import GHC.Exts (Int#,Word#,ByteArray#,(+#),(<#)) 36 | import GHC.Int (Int(I#)) 37 | 38 | import qualified Data.Bytes.Pure as Bytes 39 | import qualified Data.Primitive as PM 40 | import qualified Data.Primitive.Ptr as PM 41 | import qualified Data.Text.Array as A 42 | import qualified Data.Text.Internal as I 43 | import qualified Data.Text.Short.Unsafe as TS 44 | import qualified GHC.Exts as Exts 45 | 46 | {- | Convert a 'String' consisting of only characters in the ASCII block 47 | to a byte sequence. Any character with a codepoint above @U+007F@ is 48 | replaced by @U+0000@. 49 | -} 50 | fromString :: String -> Bytes 51 | fromString = 52 | Bytes.fromByteArray 53 | . Exts.fromList 54 | . map (\c -> let i = ord c in if i < 128 then fromIntegral @Int @Word8 i else 0) 55 | 56 | -- TODO presumably also fromText and fromShortText 57 | 58 | toShortText :: Bytes -> Maybe ShortText 59 | {-# INLINE toShortText #-} 60 | toShortText !b = case Bytes.foldr (\w acc -> w < 128 && acc) True b of 61 | True -> Just (TS.fromShortByteStringUnsafe (Bytes.toShortByteString b)) 62 | False -> Nothing 63 | 64 | toShortTextU :: ByteArray -> Maybe ShortText 65 | {-# INLINE toShortTextU #-} 66 | toShortTextU !b = case Bytes.foldr (\w acc -> w < 128 && acc) True (Bytes.fromByteArray b) of 67 | True -> Just (TS.fromShortByteStringUnsafe (case b of PM.ByteArray x -> SBS x)) 68 | False -> Nothing 69 | 70 | -- | Interpret byte sequence as ASCII codepoints. 71 | -- Returns 'Nothing' if any of the bytes are outside of the 72 | -- range @0x00-0x7F@ 73 | -- 74 | -- This does not work on 32-bit architectures. I am not certain if it 75 | -- even compiles on those systems. 76 | toText :: Bytes -> Maybe Text 77 | toText (Bytes (PM.ByteArray arr) off@(I# off# ) len@(I# len# )) = 78 | let !r0 = validateAscii# arr off# len# (off# +# len# ) 79 | !r1 = Exts.and# r0 0b1000_0000_1000_0000_1000_0000_1000_0000_1000_0000_1000_0000_1000_0000_1000_0000## 80 | in case r1 of 81 | 0## -> Just (I.Text (A.ByteArray arr) off len) 82 | _ -> Nothing 83 | 84 | -- returns 0 to mean that the input slice was all ascii text 85 | -- any other number means that a non-ascii byte was encountered 86 | -- Precondition: len and end must agree with one another. 87 | validateAscii# :: ByteArray# -> Int# -> Int# -> Int# -> Word# 88 | {-# noinline validateAscii# #-} 89 | validateAscii# arr off len end 90 | -- This length check at the beginning is not just a performance optimization. 91 | -- If the length of the slice is less than 8, the calculations of "middle start" 92 | -- and "middle end" in the otherwise clause are out of bounds. 93 | | 1# <- len <# 20# = validateRangeSlowly# 0## off end arr 94 | | otherwise = 95 | let !middleStartSwar = Exts.uncheckedIShiftRL# (off +# 7#) 3# 96 | !middleEndSwar = Exts.uncheckedIShiftRL# end 3# 97 | !w0 = validateRangeSwar# 0## middleStartSwar middleEndSwar arr 98 | !w1 = validateRangeSlowly# w0 off (Exts.uncheckedIShiftL# middleStartSwar 3# ) arr 99 | !w2 = validateRangeSlowly# w1 (Exts.uncheckedIShiftL# middleEndSwar 3# ) end arr 100 | in w2 101 | 102 | -- Here, the offset and the end refer to 64-bit word units, not byte units 103 | -- like they do in validateRangeSlowly. 104 | validateRangeSwar# :: Word# -> Int# -> Int# -> ByteArray# -> Word# 105 | {-# inline validateRangeSwar# #-} 106 | validateRangeSwar# acc0 off0 end arr = go off0 acc0 107 | where 108 | go :: Int# -> Word# -> Word# 109 | go off acc = case off <# end of 110 | 1# -> go (off +# 1#) (Exts.or# (Exts.indexWordArray# arr off) acc) 111 | _ -> acc 112 | 113 | -- Accepts a start and end position. The end position is exclusive. 114 | validateRangeSlowly# :: Word# -> Int# -> Int# -> ByteArray# -> Word# 115 | {-# inline validateRangeSlowly# #-} 116 | validateRangeSlowly# acc0 off0 end arr = go off0 acc0 117 | where 118 | go :: Int# -> Word# -> Word# 119 | go off acc = case off <# end of 120 | 1# -> go (off +# 1#) (Exts.or# (Exts.word8ToWord# (Exts.indexWord8Array# arr off)) acc) 121 | _ -> acc 122 | 123 | {- | Is the byte sequence equal to the @NUL@-terminated C String? 124 | The C string must be a constant. 125 | -} 126 | equalsCStringCaseInsensitive :: CString -> Bytes -> Bool 127 | {-# INLINE equalsCStringCaseInsensitive #-} 128 | equalsCStringCaseInsensitive !ptr0 (Bytes arr off0 len0) = go (castPtr ptr0 :: Ptr Word8) off0 len0 129 | where 130 | go !ptr !off !len = case len of 131 | 0 -> PM.indexOffPtr ptr 0 == (0 :: Word8) 132 | _ -> case PM.indexOffPtr ptr 0 of 133 | 0 -> False 134 | c -> 135 | (c .&. 0b1101_1111) == (PM.indexByteArray arr off .&. 0b1101_1111) 136 | && go (plusPtr ptr 1) (off + 1) (len - 1) 137 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Revision history for byteslice 2 | 3 | ## 0.2.15.0 -- 2024-06-12 4 | 5 | * Add functions for recovering length from `ByteArrayN` 6 | * Add `Data.Bytes.Indexed` for functions on `ByteArrayN` 7 | 8 | ## 0.2.14.0 -- 2024-02-26 9 | 10 | * Add functions to `Data.Bytes.Text.AsciiExt`: `split(1|2|3|4)`, 11 | `splitTetragram1`, `anyEq`, `takeWhileNotEq`, `dropWhileNotEq`, 12 | `takeWhileEndNotEq`, dropWhileEndEq`. 13 | * Increment upper bound on `primitive-unlifted` 14 | 15 | ## 0.2.13.2 -- 2024-02-06 16 | 17 | * Restore `Data.Bytes.Text.Utf8.toText`. 18 | 19 | ## 0.2.13.1 -- 2024-02-01 20 | 21 | * Update package metadata. 22 | 23 | ## 0.2.13.0 -- 2024-01-12 24 | 25 | * Add `replicate` and `replicateByte` to `Data.Bytes.Chunks`. 26 | 27 | ## 0.2.12.0 -- 2023-12-14 28 | 29 | * Add `Data.Bytes.Chunks.cons`. 30 | * Add `Data.Bytes.fromLazyByteString`. 31 | * Allow building with GHC 9.8. 32 | 33 | ## 0.2.11.1 -- 2023-07-26 34 | 35 | * Fix regression causing build failure in GHC 9.2. 36 | 37 | ## 0.2.11.0 -- 2023-07-25 38 | 39 | * Add `Data.Bytes.Encode.LittleEndian`. 40 | * Add `splitTetragram1`. 41 | 42 | ## 0.2.10.0 -- 2023-05-01 43 | 44 | * Add `equals13`, `equals14`, `equals15`. 45 | * Add `Show` instances for `BytesN` and `ByteArrayN`. 46 | * Add `equalsCStringCaseInsensitive` for ascii text. 47 | 48 | ## 0.2.9.0 -- 2022-12-08 49 | 50 | * Add `toShortText`, `toShortTextU`, and `toText` to `Data.Byte.Text.Ascii`. 51 | * Add `fromShortText` and `fromText` to `Data.Bytes.Text.Utf8`. 52 | 53 | ## 0.2.8.0 -- 2022-11-21 54 | 55 | * Add `Data.Bytes.replace` and `Data.Bytes.findIndices` 56 | * Add `Data.Bytes.fromShortText` 57 | * Add `Data.Bytes.fromPrimArray` 58 | * Add `decodeDecWord` 59 | * Add `concatArray` and `concatArrayU`. 60 | 61 | ## 0.2.7.0 -- 2022-02-16 62 | 63 | * Add support for GHC 9.2. 64 | * Drop support for GHC 8.8 and earlier. 65 | * Add `foldlM` and `foldrM` for mondic folds over byte sequences. 66 | 67 | ## 0.2.6.0 -- 2021-09-15 68 | 69 | * Add `BytesN` and `ByteArrayN`. 70 | * Add `isInfixOf`. 71 | * Add `hForLines_` and `hFoldLines`. 72 | * Add `lift` and `unlift` for converting between `Bytes` and `Bytes#`. 73 | * Move text-oriented functions from Data.Bytes to `Data.Bytes.Text.*`. 74 | Provide aliases with older names that come with deprecation warning. 75 | 76 | ## 0.2.5.2 -- 2021-02-23 77 | 78 | * Correct compatibility shims. 79 | 80 | ## 0.2.5.1 -- 2021-02-22 (deprecated) 81 | 82 | * Compatibility with GHC 9.0. 83 | 84 | ## 0.2.5.0 -- 2021-01-22 85 | 86 | * Add `Data.Bytes.Chunks.concatByteString`. 87 | * Expose `pinnedToByteString` to end users. 88 | 89 | ## 0.2.4.0 -- 2020-10-15 90 | 91 | * Add `toByteString` and `fromByteString`. 92 | * Add `fromShortByteString`. 93 | * Add `equalsLatin(9|10|11|12)`. 94 | * Add `toPinnedByteArray`, `toPinnedByteArrayClone`, and `concatPinnedU`. 95 | * Add `toLowerAsciiByteArrayClone`. 96 | * Add `intercalateByte2`. 97 | * Add `splitEnd1`. 98 | 99 | ## 0.2.3.0 -- 2020-04-30 100 | 101 | * Add `fnv1a32` and `fnv1a64`, implementations of the 32-bit and 102 | 64-bit variants of the FNV-1a hash algorithm, to both `Data.Bytes` 103 | and `Data.Bytes.Chunks`. 104 | * Add `Data.Bytes.Chunks.null`. 105 | * Add `readFile` to both `Data.Bytes` and `Data.Bytes.Chunks`. 106 | * Add `foldl'` to `Data.Bytes.Chunks`. 107 | * Add `split` to `Data.Bytes.Chunks`. 108 | * Add `splitStream` for splitting as a good stream-fusion producer. 109 | * Add `hPut` and `writeFile` to `Data.Bytes.Chunks`. 110 | * Add `fromCString#`. 111 | * Add `Bytes#` newtype on GHC 8.10 and up. 112 | 113 | ## 0.2.2.0 -- 2020-02-27 114 | 115 | * Add `split4`. 116 | * Add `equalsCString`. 117 | * Add `stripCStringPrefix`. 118 | * Add `equalsLatin8`. 119 | * Add `emptyPinned`. 120 | * Add `concatPinned` to `Data.Bytes.Chunks`. 121 | * Add `any` and `all`. 122 | 123 | ## 0.2.1.0 -- 2020-01-22 124 | 125 | * Add `longestCommonPrefix`. 126 | * Fix broken `Ord` instance of `Bytes`. 127 | 128 | ## 0.2.0.0 -- 2020-01-20 129 | 130 | * Change behavior of `split`. This function previously had a special case 131 | for zero-length byte sequences to mirror the behavior `bytestring`'s 132 | behavior. Now, `split` returns a singleton list with the empty byte 133 | sequence in this case. 134 | * Add `splitNonEmpty` so that users who need to take advantage of the 135 | non-null guarantee `split` provides can do so. 136 | * Add `splitU` and `splitInitU` for users who are going to split a 137 | byte sequence without and consume the results more than once. 138 | * Make the C code compile on platforms that do not have `rawmemchr`. 139 | * Rename `splitOnce` to `split1`. 140 | * Add `split2` and `split3`. 141 | * Add `equalsLatin{1,2,3,4,5,6,7}` 142 | * Add `ifoldl'`. 143 | * Add `hGet` and `hPut`. 144 | * Move `Data.Bytes.Chunks` from `small-bytearray-builder` to `byteslice`. 145 | * Rename `Data.Bytes.Chunks.concat` to `concatU` (the U means unsliced), 146 | and add a new `concat` that returns `Bytes`. 147 | * Add `fromBytes`, `fromByteArray`, and `unsafeCopy` to `Data.Bytes.Chunks`. 148 | * Add `hGetContents` to `Data.Bytes.Chunks`. 149 | * Add `isBytePrefixOf` and `isByteSuffixOf`. 150 | * Add `replicate` and `replicateU`. 151 | * Add `Monoid` instance for `Bytes`. 152 | * Add `singleton`, `doubleton`, `tripleton`, and their unsliced variants. 153 | * Rename `copy` to `unsafeCopy`. 154 | * Add `fromLatinString`. 155 | * Change the behavior of `fromAsciiString` to replace out-of-bounds codepoints 156 | with NUL. 157 | * Add `unsnoc` and `uncons`. 158 | 159 | ## 0.1.4.0 -- 2019-11-12 160 | 161 | * Add `toLatinString`. 162 | * Add `stripPrefix`, `stripSuffix`, `stripOptionalPrefix`, and 163 | `stripOptionalSuffix`. 164 | * Add `takeWhileEnd` and `dropWhileEnd`. 165 | * Add `count`. 166 | * Add an optimized `split` function. 167 | * Add `splitInit`. 168 | * Add `splitFirst`. 169 | * Add `copy`. 170 | * Add `pin`. 171 | * Add `touch`. 172 | * Add `elem`. 173 | * Add `unsafeIndex`. 174 | 175 | ## 0.1.3.0 -- 2019-09-15 176 | 177 | * Add `isPrefixOf` and `isSuffixOf`. 178 | * Add `foldl`, `foldr`, `foldl'`, and `foldr'`. 179 | 180 | ## 0.1.2.0 -- 2019-08-21 181 | 182 | * Add `Data.Bytes.Mutable` module. 183 | * Add `Data.Bytes` module. 184 | 185 | ## 0.1.1.0 -- 2019-07-03 186 | 187 | * Add record labels for Bytes and MutableBytes 188 | * Add UnmanagedBytes. This is just an Addr and a length. 189 | 190 | ## 0.1.0.0 -- 2019-04-30 191 | 192 | * First version. 193 | -------------------------------------------------------------------------------- /src/Data/Bytes/Text/AsciiExt.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE LambdaCase #-} 3 | {-# LANGUAGE RankNTypes #-} 4 | {-# LANGUAGE TypeApplications #-} 5 | 6 | -- For functions that can fail for bytes outside the ASCII range, see 7 | -- 'Data.Bytes.Ascii'. For functions that can inspect bytes outside ASCII, see 8 | -- any of the modules for ASCII-compatible encodings (e.g. 'Data.Bytes.Utf8', 9 | -- 'Data.Bytes.Latin1', and so on). 10 | 11 | {- | This module contains functions which operate on supersets of 'Bytes' containing ASCII-encoded text. 12 | That is, none of the functions here inspect bytes with a value greater than 127, and do not fail due to the presence of such bytes. 13 | -} 14 | module Data.Bytes.Text.AsciiExt 15 | ( -- * Line-Oriented IO 16 | hFoldLines 17 | , hForLines_ 18 | 19 | -- ** Standard Handles 20 | , forLines_ 21 | , foldLines 22 | 23 | -- * Predicates 24 | , anyEq 25 | 26 | -- * Filtering 27 | , takeWhileNotEq 28 | , dropWhileNotEq 29 | , takeWhileEndNotEq 30 | , dropWhileEndEq 31 | 32 | -- * Splitting 33 | -- ** Fixed from Beginning 34 | , split1 35 | , splitTetragram1 36 | , split2 37 | , split3 38 | , split4 39 | 40 | -- * Text Manipulation 41 | , toLowerU 42 | ) where 43 | 44 | import Control.Monad.ST (ST) 45 | import Control.Monad.ST.Run (runByteArrayST) 46 | import Data.Bytes.Types (Bytes (..)) 47 | import Data.Char (ord) 48 | import Data.Primitive (ByteArray) 49 | import Data.Word (Word8) 50 | import System.IO (Handle, hIsEOF, stdin) 51 | 52 | import qualified Data.ByteString.Char8 as BC8 53 | import qualified Data.Bytes.Pure as Bytes 54 | import qualified Data.Bytes.Byte as Byte 55 | import qualified Data.Primitive as PM 56 | 57 | -- | `hForLines_` over `stdin` 58 | forLines_ :: (Bytes -> IO a) -> IO () 59 | {-# INLINEABLE forLines_ #-} 60 | forLines_ = hForLines_ stdin 61 | 62 | -- | `hFoldLines` over `stdin` 63 | foldLines :: a -> (a -> Bytes -> IO a) -> IO a 64 | {-# INLINEABLE foldLines #-} 65 | foldLines = hFoldLines stdin 66 | 67 | {- | Perform an action on each line of the input, discarding results. 68 | To maintain a running state, see 'hFoldLines'. 69 | 70 | Lines are extracted with with 'BC8.hGetLine', which does not document its 71 | detection algorithm. As of writing (bytestring v0.11.1.0), lines are 72 | delimited by a single @\n@ character (UNIX-style, as all things should be). 73 | -} 74 | hForLines_ :: Handle -> (Bytes -> IO a) -> IO () 75 | hForLines_ h body = loop 76 | where 77 | loop = 78 | hIsEOF h >>= \case 79 | False -> do 80 | line <- Bytes.fromByteString <$> BC8.hGetLine h 81 | _ <- body line 82 | loop 83 | True -> pure () 84 | 85 | {- | Perform an action on each line of the input, threading state through the computation. 86 | If you do not need to keep a state, see `hForLines_`. 87 | 88 | Lines are extracted with with 'BC8.hGetLine', which does not document its 89 | detection algorithm. As of writing (bytestring v0.11.1.0), lines are 90 | delimited by a single @\n@ character (UNIX-style, as all things should be). 91 | -} 92 | hFoldLines :: Handle -> a -> (a -> Bytes -> IO a) -> IO a 93 | hFoldLines h z body = loop z 94 | where 95 | loop !x = 96 | hIsEOF h >>= \case 97 | False -> do 98 | line <- Bytes.fromByteString <$> BC8.hGetLine h 99 | x' <- body x line 100 | loop x' 101 | True -> pure x 102 | 103 | {- | /O(n)/ Convert ASCII letters to lowercase. This adds @0x20@ to bytes in the 104 | range @[0x41,0x5A]@ (@A-Z@ ⇒ @a-z@) and leaves all other bytes alone. 105 | Unconditionally copies the bytes. 106 | -} 107 | toLowerU :: Bytes -> ByteArray 108 | toLowerU (Bytes src off0 len0) = 109 | runByteArrayST action 110 | where 111 | action :: forall s. ST s ByteArray 112 | action = do 113 | dst <- PM.newByteArray len0 114 | let go !off !ix !len = 115 | if len == 0 116 | then pure () 117 | else do 118 | let w = PM.indexByteArray src off :: Word8 119 | w' = 120 | if w >= 0x41 && w <= 0x5A 121 | then w + 32 122 | else w 123 | PM.writeByteArray dst ix w' 124 | go (off + 1) (ix + 1) (len - 1) 125 | go off0 0 len0 126 | PM.unsafeFreezeByteArray dst 127 | 128 | -- | Throws an exception the 'Char' argument is non-ascii. 129 | split1 :: Char -> Bytes -> Maybe (Bytes, Bytes) 130 | {-# INLINE split1 #-} 131 | split1 !c !b 132 | | c > '\DEL' = errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.split1: argument not in ASCII range" 133 | | otherwise = Byte.split1 (c2w c) b 134 | 135 | -- | Throws an exception the 'Char' argument is non-ascii. 136 | split2 :: Char -> Bytes -> Maybe (Bytes, Bytes, Bytes) 137 | {-# INLINE split2 #-} 138 | split2 !c !b 139 | | c > '\DEL' = errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.split2: argument not in ASCII range" 140 | | otherwise = Byte.split2 (c2w c) b 141 | 142 | -- | Throws an exception the 'Char' argument is non-ascii. 143 | split3 :: Char -> Bytes -> Maybe (Bytes, Bytes, Bytes, Bytes) 144 | {-# INLINE split3 #-} 145 | split3 !c !b 146 | | c > '\DEL' = errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.split3: argument not in ASCII range" 147 | | otherwise = Byte.split3 (c2w c) b 148 | 149 | -- | Throws an exception the 'Char' argument is non-ascii. 150 | split4 :: Char -> Bytes -> Maybe (Bytes, Bytes, Bytes, Bytes, Bytes) 151 | {-# INLINE split4 #-} 152 | split4 !c !b 153 | | c > '\DEL' = errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.split4: argument not in ASCII range" 154 | | otherwise = Byte.split4 (c2w c) b 155 | 156 | -- | Throws an exception if any of the 'Char' arguments are non-ascii. 157 | splitTetragram1 :: Char -> Char -> Char -> Char -> Bytes -> Maybe (Bytes, Bytes) 158 | {-# inline splitTetragram1 #-} 159 | splitTetragram1 !c0 !c1 !c2 !c3 !b 160 | | c0 > '\DEL' || c1 > '\DEL' || c2 > '\DEL' || c3 > '\DEL' = 161 | errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.splitTetragram1: one of the characters is not in ASCII range" 162 | | otherwise = Bytes.splitTetragram1 (c2w c0) (c2w c1) (c2w c2) (c2w c3) b 163 | 164 | c2w :: Char -> Word8 165 | {-# inline c2w #-} 166 | c2w !c = fromIntegral @Int @Word8 (ord c) 167 | 168 | -- | Throws an exception the 'Char' argument is non-ascii. 169 | dropWhileNotEq :: Char -> Bytes -> Bytes 170 | dropWhileNotEq !c !b 171 | | c > '\DEL' = errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.dropWhileNotEq: argument not in ASCII range" 172 | | otherwise = 173 | let !w = c2w c 174 | in Bytes.unsafeDrop (Bytes.countWhile (/= w) b) b 175 | 176 | -- | Throws an exception the 'Char' argument is non-ascii. 177 | takeWhileNotEq :: Char -> Bytes -> Bytes 178 | takeWhileNotEq !c !b 179 | | c > '\DEL' = errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.takeWhileNotEq: argument not in ASCII range" 180 | | otherwise = 181 | let !w = c2w c 182 | in Bytes.unsafeTake (Bytes.countWhile (/= w) b) b 183 | 184 | -- | Throws an exception the 'Char' argument is non-ascii. 185 | takeWhileEndNotEq :: Char -> Bytes -> Bytes 186 | takeWhileEndNotEq !c !b 187 | | c > '\DEL' = errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.takeWhileEndNotEq: argument not in ASCII range" 188 | | otherwise = 189 | let !w = c2w c 190 | !n = Bytes.countWhileEnd (/=w) b 191 | in Bytes (array b) (offset b + Bytes.length b - n) n 192 | 193 | -- | Throws an exception the 'Char' argument is non-ascii. 194 | dropWhileEndEq :: Char -> Bytes -> Bytes 195 | dropWhileEndEq !c !b 196 | | c > '\DEL' = errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.dropWhileEndEq: argument not in ASCII range" 197 | | otherwise = 198 | let !w = c2w c 199 | !n = Bytes.countWhileEnd (==w) b 200 | in Bytes.unsafeTake (Bytes.length b - n) b 201 | 202 | -- | Throws an exception the 'Char' argument is non-ascii. 203 | anyEq :: Char -> Bytes -> Bool 204 | anyEq !c !b 205 | | c > '\DEL' = errorWithoutStackTrace "Data.Bytes.Text.AsciiExt.takeWhileNotEq: argument not in ASCII range" 206 | | otherwise = 207 | let !w = c2w c 208 | in Bytes.any (==w) b 209 | -------------------------------------------------------------------------------- /src/Data/Bytes/Search.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE BlockArguments #-} 3 | {-# LANGUAGE DuplicateRecordFields #-} 4 | {-# LANGUAGE MagicHash #-} 5 | {-# LANGUAGE NamedFieldPuns #-} 6 | {-# LANGUAGE RankNTypes #-} 7 | 8 | -- This is broken out into a separate module to make it easier 9 | -- to dump core and investigate performance issues. 10 | module Data.Bytes.Search 11 | ( findIndices 12 | , replace 13 | , isInfixOf 14 | ) where 15 | 16 | import Prelude hiding (all, any, dropWhile, elem, foldl, foldr, length, map, null, readFile, replicate, takeWhile) 17 | 18 | import Control.Monad.ST.Run (runByteArrayST, runPrimArrayST) 19 | import Data.Bits (finiteBitSize, shiftL, (.&.), (.|.)) 20 | import Data.Bytes.Pure (length, unsafeHead, unsafeIndex) 21 | import Data.Bytes.Types (Bytes (Bytes, array, offset)) 22 | import Data.Primitive (ByteArray, PrimArray) 23 | import GHC.Exts (Int (I#)) 24 | import GHC.Word (Word32) 25 | 26 | import qualified Data.Bytes.Byte as Byte 27 | import qualified Data.Bytes.Pure as Pure 28 | import qualified Data.Bytes.Types as Types 29 | import qualified Data.Primitive as PM 30 | 31 | -- Implementation Notes 32 | -- ===================== 33 | -- For karp rabin, there are some easy performance improvements 34 | -- left on the table. The main optimization that has been done is making 35 | -- sure that there is no unnecessary boxing of Int, Word32, or Bytes 36 | -- going on. Here are some other things that have not been done: 37 | -- 38 | 39 | -- * The hash is currently a Word32. It would be better to use either 40 | 41 | -- Word or Word64 for this. We would need for hashKey to be different. 42 | 43 | -- * In several places, we track an index into a Bytes. This index gets 44 | 45 | -- repeatedly added to the base offset as we loop over the bytes. We 46 | -- could instead track the true offset instead of repeatedly 47 | -- recalculating it. 48 | 49 | {- | Replace every non-overlapping occurrence of @needle@ in 50 | @haystack@ with @replacement@. 51 | -} 52 | replace :: 53 | -- | needle, must not be empty 54 | Bytes -> 55 | -- | replacement 56 | Bytes -> 57 | -- | haystack 58 | Bytes -> 59 | Bytes 60 | {-# NOINLINE replace #-} 61 | -- Implementation note: there is a lot of room to improve the performance 62 | -- of this function. 63 | replace !needle !replacement !haystack@Bytes {array = haystackArray, offset = haystackIndex, length = haystackLength} 64 | | Pure.length needle == 0 = errorWithoutStackTrace "Data.Bytes.replace: needle of length zero" 65 | | Pure.length haystack == 0 = Pure.empty 66 | | Pure.length needle == 1 67 | , Pure.length replacement == 1 = 68 | let !needle0 = unsafeIndex needle 0 69 | !replacement0 = unsafeIndex replacement 0 70 | in Pure.map (\w -> if w == needle0 then replacement0 else w) haystack 71 | | otherwise = 72 | let !hp = rollingHash needle 73 | !ixs = findIndicesKarpRabin 0 hp needle haystackArray haystackIndex haystackLength 74 | in Pure.fromByteArray (replaceIndices ixs replacement (Pure.length needle) haystackArray haystackIndex haystackLength) 75 | 76 | -- This is an internal function because it deals explicitly with 77 | -- an offset into a byte array. 78 | -- 79 | -- Example: 80 | 81 | -- * haystack len: 39 82 | 83 | -- * ixs: 7, 19, 33 84 | 85 | -- * patLen: 5 86 | 87 | -- * replacment: foo (len 3) 88 | 89 | -- We want to perform these copies: 90 | 91 | -- * src[0,7] -> dst[0,7] 92 | 93 | -- * foo -> dst[7,3] 94 | 95 | -- * src[12,7] -> dst[10,7] 96 | 97 | -- * foo -> dst[17,3] 98 | 99 | -- * src[24,9] -> dst[20,9] 100 | 101 | -- * foo -> dst[29,3] 102 | 103 | -- * src[38,1] -> dst[32,1] 104 | replaceIndices :: PrimArray Int -> Bytes -> Int -> ByteArray -> Int -> Int -> ByteArray 105 | replaceIndices !ixs !replacement !patLen !haystack !ix0 !len0 = runByteArrayST $ do 106 | let !ixsLen = PM.sizeofPrimArray ixs 107 | let !delta = Pure.length replacement - patLen 108 | dst <- PM.newByteArray (len0 + ixsLen * delta) 109 | let applyReplacement !ixIx !prevSrcIx = 110 | if ixIx < ixsLen 111 | then do 112 | let !srcMatchIx = PM.indexPrimArray ixs ixIx 113 | let !offset = ixIx * delta 114 | let !dstIx = srcMatchIx + offset - ix0 115 | Pure.unsafeCopy 116 | dst 117 | (prevSrcIx + offset - ix0) 118 | Bytes {array = haystack, offset = prevSrcIx, length = srcMatchIx - prevSrcIx} 119 | Pure.unsafeCopy dst dstIx replacement 120 | applyReplacement (ixIx + 1) (srcMatchIx + patLen) 121 | else do 122 | let !offset = ixIx * delta 123 | Pure.unsafeCopy 124 | dst 125 | (prevSrcIx + offset - ix0) 126 | Bytes {array = haystack, offset = prevSrcIx, length = (len0 + ix0) - prevSrcIx} 127 | PM.unsafeFreezeByteArray dst 128 | applyReplacement 0 ix0 129 | 130 | -- | Find locations of non-overlapping instances of @needle@ within @haystack@. 131 | findIndices :: 132 | -- | needle 133 | Bytes -> 134 | -- | haystack 135 | Bytes -> 136 | PrimArray Int 137 | findIndices needle Bytes {array, offset = off, length = len} 138 | | needleLen == 0 = errorWithoutStackTrace "Data.Bytes.findIndices: needle with length zero" 139 | | len == 0 = mempty 140 | | otherwise = 141 | let !hp = rollingHash needle 142 | in findIndicesKarpRabin (negate off) hp needle array off len 143 | where 144 | needleLen = Pure.length needle 145 | 146 | -- Precondition: Haystack has non-zero length 147 | -- Precondition: Pattern has non-zero length 148 | -- Uses karp rabin to search. 149 | -- Easy opportunity to improve implementation. Instead of having karpRabin 150 | -- return two slices, we could have it just return a single index. 151 | findIndicesKarpRabin :: 152 | Int -> -- Output index modifier. Set to negated initial index to make slicing invisible in results. 153 | Word32 -> -- Hash to search for (must agree with pattern) 154 | Bytes -> -- Pattern to search for 155 | ByteArray -> 156 | Int -> -- initial index 157 | Int -> -- length 158 | PrimArray Int 159 | findIndicesKarpRabin !ixModifier !hp !pat !haystack !ix0 !len0 = runPrimArrayST $ do 160 | let dstLen = 1 + quot len0 (Pure.length pat) 161 | dst <- PM.newPrimArray dstLen 162 | let go !ix !len !ixIx = case karpRabin hp pat Bytes {array = haystack, offset = ix, length = len} of 163 | (-1) -> do 164 | PM.shrinkMutablePrimArray dst ixIx 165 | PM.unsafeFreezePrimArray dst 166 | skipCount -> do 167 | let !advancement = skipCount - Pure.length pat 168 | let !advancement' = advancement + Pure.length pat 169 | PM.writePrimArray dst ixIx (ix + advancement + ixModifier) 170 | let !ix' = ix + advancement' 171 | go ix' (len - advancement') (ixIx + 1) 172 | go ix0 len0 0 173 | 174 | -- Output: Negative one means match not found. Other negative 175 | -- numbers should not occur. Zero may occur. Positive number 176 | -- means the number of bytes to skip to make it past the match. 177 | breakSubstring :: 178 | -- | String to search for 179 | Bytes -> 180 | -- | String to search in 181 | Bytes -> 182 | Int 183 | breakSubstring !pat !haystack@(Bytes _ off0 _) = 184 | case lp of 185 | 0 -> 0 186 | 1 -> case Byte.elemIndexLoop# (unsafeHead pat) haystack of 187 | (-1#) -> (-1) 188 | off -> 1 + (I# off) - off0 189 | _ -> 190 | if lp * 8 <= finiteBitSize (0 :: Word) 191 | then shift haystack 192 | else karpRabin (rollingHash pat) pat haystack 193 | where 194 | lp = length pat 195 | {-# INLINE shift #-} 196 | shift :: Bytes -> Int 197 | shift !src 198 | | length src < lp = (-1) 199 | | otherwise = search (intoWord $ Pure.unsafeTake lp src) lp 200 | where 201 | intoWord :: Bytes -> Word 202 | intoWord = Pure.foldl' (\w b -> (w `shiftL` 8) .|. fromIntegral b) 0 203 | wp = intoWord pat 204 | mask = (1 `shiftL` (8 * lp)) - 1 205 | search :: Word -> Int -> Int 206 | search !w !i 207 | | w == wp = i 208 | | length src <= i = (-1) 209 | | otherwise = search w' (i + 1) 210 | where 211 | b = fromIntegral (Pure.unsafeIndex src i) 212 | w' = mask .&. ((w `shiftL` 8) .|. b) 213 | 214 | -- Only used for karp rabin 215 | rollingHash :: Bytes -> Word32 216 | {-# INLINE rollingHash #-} 217 | rollingHash = Pure.foldl' (\h b -> h * hashKey + fromIntegral b) 0 218 | 219 | hashKey :: Word32 220 | {-# INLINE hashKey #-} 221 | hashKey = 2891336453 222 | 223 | -- Precondition: Length of bytes is greater than or equal to 1. 224 | -- Precondition: Rolling hash agrees with pattern. 225 | -- Output: Negative one means match not found. Other negative 226 | -- numbers should not occur. Zero should not occur. Positive number 227 | -- means the number of bytes to skip to make it past the match. 228 | karpRabin :: Word32 -> Bytes -> Bytes -> Int 229 | karpRabin !hp !pat !src 230 | | length src < lp = (-1) 231 | | otherwise = search (rollingHash $ Pure.unsafeTake lp src) lp 232 | where 233 | lp :: Int 234 | !lp = Pure.length pat 235 | m :: Word32 236 | !m = hashKey ^ lp 237 | get :: Int -> Word32 238 | get !ix = fromIntegral (Pure.unsafeIndex src ix) 239 | search !hs !i 240 | | hp == hs && eqBytesNoShortCut pat (Pure.unsafeTake lp (Pure.unsafeDrop (i - lp) src)) = i 241 | | length src <= i = (-1) 242 | | otherwise = search hs' (i + 1) 243 | where 244 | hs' = 245 | hs * hashKey 246 | + get i 247 | - m * get (i - lp) 248 | 249 | {- | Is the first argument an infix of the second argument? 250 | 251 | Uses the Rabin-Karp algorithm: expected time @O(n+m)@, worst-case @O(nm)@. 252 | -} 253 | isInfixOf :: 254 | -- | String to search for 255 | Bytes -> 256 | -- | String to search in 257 | Bytes -> 258 | Bool 259 | isInfixOf p s = Pure.null p || breakSubstring p s >= 0 260 | 261 | -- Precondition: both arguments have the same length 262 | -- Skips the pointer equality check and the length check. 263 | eqBytesNoShortCut :: Bytes -> Bytes -> Bool 264 | {-# INLINE eqBytesNoShortCut #-} 265 | eqBytesNoShortCut (Bytes arr1 off1 len1) (Bytes arr2 off2 _) = 266 | PM.compareByteArrays arr1 off1 arr2 off2 len1 == EQ 267 | -------------------------------------------------------------------------------- /src/Data/Bytes/Byte.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE BlockArguments #-} 3 | {-# LANGUAGE MagicHash #-} 4 | {-# LANGUAGE NamedFieldPuns #-} 5 | {-# LANGUAGE ScopedTypeVariables #-} 6 | {-# LANGUAGE UnliftedFFITypes #-} 7 | 8 | -- This internal module has functions for splitting strings 9 | -- on a particular byte and for counting occurences of that 10 | -- byte. 11 | module Data.Bytes.Byte 12 | ( -- Re-exported by Data.Bytes 13 | count 14 | , split 15 | , splitU 16 | , splitNonEmpty 17 | , splitStream 18 | , splitInit 19 | , splitInitU 20 | , split1 21 | , split2 22 | , split3 23 | , split4 24 | , splitEnd1 25 | -- Used by other internal modules 26 | , elemIndexLoop# 27 | ) where 28 | 29 | import Prelude hiding (length) 30 | 31 | import Control.Monad.ST (runST) 32 | import Control.Monad.ST.Run (runPrimArrayST) 33 | import Data.Bytes.Types (Bytes (..)) 34 | import Data.List.NonEmpty (NonEmpty ((:|))) 35 | import Data.Primitive (ByteArray (..), MutablePrimArray (..), PrimArray (..)) 36 | import Data.Primitive.Unlifted.Array (UnliftedArray) 37 | import Data.Tuple.Types (IntPair (IntPair)) 38 | import Data.Vector.Fusion.Stream.Monadic (Step (Done, Yield), Stream (Stream)) 39 | import Data.Word (Word8) 40 | import GHC.Exts (ByteArray#, Int (I#), Int#, MutableByteArray#) 41 | import GHC.IO (unsafeIOToST) 42 | 43 | import qualified Data.Primitive as PM 44 | import qualified Data.Primitive.Unlifted.Array as PM 45 | import qualified GHC.Exts as Exts 46 | 47 | -- | Count the number of times the byte appears in the sequence. 48 | count :: Word8 -> Bytes -> Int 49 | count !b (Bytes {array = ByteArray arr, offset, length}) = 50 | count_ba arr offset length b 51 | 52 | {- | Variant of 'split' that returns an array of unsliced byte sequences. 53 | Unlike 'split', this is not a good producer for list fusion. (It does 54 | not return a list, so it could not be.) Prefer 'split' if the result 55 | is going to be consumed exactly once by a good consumer. Prefer 'splitU' 56 | if the result of the split is going to be around for a while and 57 | inspected multiple times. 58 | -} 59 | splitU :: Word8 -> Bytes -> UnliftedArray ByteArray 60 | splitU !w !bs = 61 | let !lens = splitLengthsAlt w bs 62 | !lensSz = PM.sizeofPrimArray lens 63 | in splitCommonU lens lensSz bs 64 | 65 | {- | Variant of 'splitU' that drops the trailing element. See 'splitInit' 66 | for an explanation of why this may be useful. 67 | -} 68 | splitInitU :: Word8 -> Bytes -> UnliftedArray ByteArray 69 | splitInitU !w !bs = 70 | let !lens = splitLengthsAlt w bs 71 | !lensSz = PM.sizeofPrimArray lens 72 | in splitCommonU lens (lensSz - 1) bs 73 | 74 | -- Internal function 75 | splitCommonU :: 76 | PrimArray Int -> -- array of segment lengths 77 | Int -> -- number of lengths to consider 78 | Bytes -> 79 | UnliftedArray ByteArray 80 | splitCommonU !lens !lensSz Bytes {array, offset = arrIx0} = runST do 81 | dst <- PM.unsafeNewUnliftedArray lensSz 82 | let go !lenIx !arrIx = 83 | if lenIx < lensSz 84 | then do 85 | let !len = PM.indexPrimArray lens lenIx 86 | buf <- PM.newByteArray len 87 | PM.copyByteArray buf 0 array arrIx len 88 | buf' <- PM.unsafeFreezeByteArray buf 89 | PM.writeUnliftedArray dst lenIx buf' 90 | go (lenIx + 1) (arrIx + len + 1) 91 | else pure () 92 | go 0 arrIx0 93 | PM.unsafeFreezeUnliftedArray dst 94 | 95 | {- | Break a byte sequence into pieces separated by the byte argument, 96 | consuming the delimiter. This function is a good producer for list 97 | fusion. It is common to immidiately consume the results of @split@ 98 | with @foldl'@, @traverse_@, @foldlM@, and being a good producer helps 99 | in this situation. 100 | 101 | Note: this function differs from its counterpart in @bytestring@. 102 | If the byte sequence is empty, this returns a singleton list with 103 | the empty byte sequence. 104 | -} 105 | split :: Word8 -> Bytes -> [Bytes] 106 | {-# INLINE split #-} 107 | split !w !bs@Bytes {array, offset = arrIx0} = 108 | Exts.build 109 | ( \g x0 -> 110 | let go !lenIx !arrIx = 111 | if lenIx < lensSz 112 | then 113 | let !len = PM.indexPrimArray lens lenIx 114 | in g (Bytes array arrIx len) (go (lenIx + 1) (arrIx + len + 1)) 115 | else x0 116 | in go 0 arrIx0 117 | ) 118 | where 119 | !lens = splitLengthsAlt w bs 120 | !lensSz = PM.sizeofPrimArray lens 121 | 122 | {- | Variant of 'split' that intended for use with stream fusion rather 123 | than @build@-@foldr@ fusion. 124 | -} 125 | splitStream :: forall m. (Applicative m) => Word8 -> Bytes -> Stream m Bytes 126 | {-# INLINE [1] splitStream #-} 127 | splitStream !w !bs@Bytes {array, offset = arrIx0} = Stream step (IntPair 0 arrIx0) 128 | where 129 | !lens = splitLengthsAlt w bs 130 | !lensSz = PM.sizeofPrimArray lens 131 | {-# INLINE [0] step #-} 132 | step :: IntPair -> m (Step IntPair Bytes) 133 | step (IntPair lenIx arrIx) = 134 | if lenIx < lensSz 135 | then do 136 | let !len = PM.indexPrimArray lens lenIx 137 | !element = Bytes array arrIx len 138 | !acc = IntPair (lenIx + 1) (arrIx + len + 1) 139 | pure (Yield element acc) 140 | else pure Done 141 | 142 | {- | Variant of 'split' that returns the result as a 'NonEmpty' 143 | instead of @[]@. This is also eligible for stream fusion. 144 | -} 145 | splitNonEmpty :: Word8 -> Bytes -> NonEmpty Bytes 146 | {-# INLINE splitNonEmpty #-} 147 | splitNonEmpty !w !bs@Bytes {array, offset = arrIx0} = 148 | Bytes array arrIx0 len0 149 | :| Exts.build 150 | ( \g x0 -> 151 | let go !lenIx !arrIx = 152 | if lenIx < lensSz 153 | then 154 | let !len = PM.indexPrimArray lens lenIx 155 | in g (Bytes array arrIx len) (go (lenIx + 1) (arrIx + len + 1)) 156 | else x0 157 | in go 1 (1 + (arrIx0 + len0)) 158 | ) 159 | where 160 | !lens = splitLengthsAlt w bs 161 | !lensSz = PM.sizeofPrimArray lens 162 | !len0 = PM.indexPrimArray lens 0 :: Int 163 | 164 | {- | Variant of 'split' that drops the trailing element. This behaves 165 | correctly even if the byte sequence is empty. This is a good producer 166 | for list fusion. This is useful when splitting a text file 167 | into lines. 168 | 169 | mandates that text files end with a newline, so the list resulting 170 | from 'split' always has an empty byte sequence as its last element. 171 | With 'splitInit', that unwanted element is discarded. 172 | -} 173 | splitInit :: Word8 -> Bytes -> [Bytes] 174 | {-# INLINE splitInit #-} 175 | splitInit !w !bs@Bytes {array, offset = arrIx0} = 176 | Exts.build 177 | ( \g x0 -> 178 | let go !lenIx !arrIx = 179 | if lenIx < lensSz 180 | then 181 | let !len = PM.indexPrimArray lens lenIx 182 | in g (Bytes array arrIx len) (go (lenIx + 1) (arrIx + len + 1)) 183 | else x0 184 | in go 0 arrIx0 185 | ) 186 | where 187 | -- Remember, the resulting array from splitLengthsAlt always has 188 | -- a length of at least one. 189 | !lens = splitLengthsAlt w bs 190 | !lensSz = PM.sizeofPrimArray lens - 1 191 | 192 | -- Internal function. This is just like splitLengths except that 193 | -- it does not treat the empty byte sequences specially. The result 194 | -- for that byte sequence is a singleton array with the element zero. 195 | splitLengthsAlt :: Word8 -> Bytes -> PrimArray Int 196 | splitLengthsAlt b Bytes {array = ByteArray arr#, offset = off, length = len} = runPrimArrayST do 197 | let !n = count_ba arr# off len b 198 | dst@(MutablePrimArray dst#) :: MutablePrimArray s Int <- PM.newPrimArray (n + 1) 199 | total <- unsafeIOToST (memchr_ba_many arr# off len dst# n b) 200 | PM.writePrimArray dst n (len - total) 201 | PM.unsafeFreezePrimArray dst 202 | 203 | foreign import ccall unsafe "bs_custom.h memchr_ba_many" 204 | memchr_ba_many :: 205 | ByteArray# -> Int -> Int -> MutableByteArray# s -> Int -> Word8 -> IO Int 206 | 207 | foreign import ccall unsafe "bs_custom.h count_ba" 208 | count_ba :: 209 | ByteArray# -> Int -> Int -> Word8 -> Int 210 | 211 | {- | Split a byte sequence on the first occurrence of the target 212 | byte. The target is removed from the result. For example: 213 | 214 | >>> split1 0xA [0x1,0x2,0xA,0xB] 215 | Just ([0x1,0x2],[0xB]) 216 | -} 217 | split1 :: Word8 -> Bytes -> Maybe (Bytes, Bytes) 218 | {-# INLINE split1 #-} 219 | split1 w b@(Bytes arr off len) = case elemIndexLoop# w b of 220 | (-1#) -> Nothing 221 | i# -> 222 | let i = I# i# 223 | in Just (Bytes arr off (i - off), Bytes arr (i + 1) (len - (1 + i - off))) 224 | 225 | {- | Split a byte sequence on the first and second occurrences 226 | of the target byte. The target is removed from the result. 227 | For example: 228 | 229 | >>> split2 0xA [0x1,0x2,0xA,0xB,0xA,0xA,0xA] 230 | Just ([0x1,0x2],[0xB],[0xA,0xA]) 231 | -} 232 | split2 :: Word8 -> Bytes -> Maybe (Bytes, Bytes, Bytes) 233 | {-# INLINE split2 #-} 234 | split2 w b@(Bytes arr off len) = case elemIndexLoop# w b of 235 | (-1#) -> Nothing 236 | i# -> 237 | let i = I# i# 238 | in case elemIndexLoop# w (Bytes arr (i + 1) (len - (1 + i - off))) of 239 | (-1#) -> Nothing 240 | j# -> 241 | let j = I# j# 242 | in Just 243 | ( Bytes arr off (i - off) 244 | , Bytes arr (i + 1) (j - (i + 1)) 245 | , Bytes arr (j + 1) (len - (1 + j - off)) 246 | ) 247 | 248 | {- | Split a byte sequence on the first, second, and third occurrences 249 | of the target byte. The target is removed from the result. 250 | For example: 251 | 252 | >>> split3 0xA [0x1,0x2,0xA,0xB,0xA,0xA,0xA] 253 | Just ([0x1,0x2],[0xB],[],[0xA]) 254 | -} 255 | split3 :: Word8 -> Bytes -> Maybe (Bytes, Bytes, Bytes, Bytes) 256 | {-# INLINE split3 #-} 257 | split3 w b@(Bytes arr off len) = case elemIndexLoop# w b of 258 | (-1#) -> Nothing 259 | i# -> 260 | let i = I# i# 261 | in case elemIndexLoop# w (Bytes arr (i + 1) (len - (1 + i - off))) of 262 | (-1#) -> Nothing 263 | j# -> 264 | let j = I# j# 265 | in case elemIndexLoop# w (Bytes arr (j + 1) (len - (1 + j - off))) of 266 | (-1#) -> Nothing 267 | k# -> 268 | let k = I# k# 269 | in Just 270 | ( Bytes arr off (i - off) 271 | , Bytes arr (i + 1) (j - (i + 1)) 272 | , Bytes arr (j + 1) (k - (j + 1)) 273 | , Bytes arr (k + 1) (len - (1 + k - off)) 274 | ) 275 | 276 | {- | Split a byte sequence on the first, second, third, and fourth 277 | occurrences of the target byte. The target is removed from the result. 278 | For example: 279 | 280 | >>> split4 0xA [0x1,0x2,0xA,0xB,0xA,0xA,0xA] 281 | Just ([0x1,0x2],[0xB],[],[],[]) 282 | -} 283 | split4 :: Word8 -> Bytes -> Maybe (Bytes, Bytes, Bytes, Bytes, Bytes) 284 | {-# INLINE split4 #-} 285 | split4 w b@(Bytes arr off len) = case elemIndexLoop# w b of 286 | (-1#) -> Nothing 287 | i# -> 288 | let i = I# i# 289 | in case elemIndexLoop# w (Bytes arr (i + 1) (len - (1 + i - off))) of 290 | (-1#) -> Nothing 291 | j# -> 292 | let j = I# j# 293 | in case elemIndexLoop# w (Bytes arr (j + 1) (len - (1 + j - off))) of 294 | (-1#) -> Nothing 295 | k# -> 296 | let k = I# k# 297 | in case elemIndexLoop# w (Bytes arr (k + 1) (len - (1 + k - off))) of 298 | (-1#) -> Nothing 299 | m# -> 300 | let m = I# m# 301 | in Just 302 | ( Bytes arr off (i - off) 303 | , Bytes arr (i + 1) (j - (i + 1)) 304 | , Bytes arr (j + 1) (k - (j + 1)) 305 | , Bytes arr (k + 1) (m - (k + 1)) 306 | , Bytes arr (m + 1) (len - (1 + m - off)) 307 | ) 308 | 309 | -- This returns the offset into the byte array. This is not an index 310 | -- that will mean anything to the end user, so it cannot be returned 311 | -- to them. 312 | -- 313 | -- Exported for use in other internal modules because it is needed in 314 | -- Data.Bytes.Search. 315 | elemIndexLoop# :: Word8 -> Bytes -> Int# 316 | {-# INLINE elemIndexLoop# #-} 317 | elemIndexLoop# !w (Bytes arr off@(I# off#) len) = case len of 318 | 0 -> (-1#) 319 | _ -> 320 | if PM.indexByteArray arr off == w 321 | then off# 322 | else elemIndexLoop# w (Bytes arr (off + 1) (len - 1)) 323 | 324 | -- Variant of elemIndexLoop# that starts at the end. Similarly, returns 325 | -- negative one if the element is not found. 326 | elemIndexLoopBackwards# :: Word8 -> ByteArray -> Int -> Int -> Int# 327 | elemIndexLoopBackwards# !w !arr !start !pos@(I# pos#) = 328 | if pos < start 329 | then (-1#) 330 | else 331 | if PM.indexByteArray arr pos == w 332 | then pos# 333 | else elemIndexLoopBackwards# w arr start (pos - 1) 334 | 335 | {- | Split a byte sequence on the last occurrence of the target 336 | byte. The target is removed from the result. For example: 337 | 338 | >>> split1 0xA [0x1,0x2,0xA,0xB,0xA,0xC] 339 | Just ([0x1,0x2,0xA,0xB],[0xC]) 340 | -} 341 | splitEnd1 :: Word8 -> Bytes -> Maybe (Bytes, Bytes) 342 | {-# INLINE splitEnd1 #-} 343 | splitEnd1 !w (Bytes arr off len) = case elemIndexLoopBackwards# w arr off (off + len - 1) of 344 | (-1#) -> Nothing 345 | i# -> 346 | let i = I# i# 347 | in Just (Bytes arr off (i - off), Bytes arr (i + 1) (len - (1 + i - off))) 348 | -------------------------------------------------------------------------------- /src/Data/Bytes/Chunks.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE DerivingStrategies #-} 3 | {-# LANGUAGE DuplicateRecordFields #-} 4 | {-# LANGUAGE MagicHash #-} 5 | {-# LANGUAGE NamedFieldPuns #-} 6 | {-# LANGUAGE RankNTypes #-} 7 | {-# LANGUAGE TypeApplications #-} 8 | {-# LANGUAGE TypeFamilies #-} 9 | {-# LANGUAGE UnboxedTuples #-} 10 | 11 | {- | Chunks of bytes. This is useful as a target for a builder 12 | or as a way to read a large amount of whose size is unknown 13 | in advance. Structurally, this type is similar to 14 | @Data.ByteString.Lazy.ByteString@. However, the type in this 15 | module is strict in its spine. Additionally, none of the 16 | @Handle@ functions perform lazy I\/O. 17 | -} 18 | module Data.Bytes.Chunks 19 | ( -- * Types 20 | Chunks (..) 21 | 22 | -- * Properties 23 | , length 24 | , null 25 | 26 | -- * Manipulate 27 | , cons 28 | , concat 29 | , concatPinned 30 | , concatU 31 | , concatPinnedU 32 | , concatByteString 33 | , reverse 34 | , reverseOnto 35 | , replicate 36 | , replicateByte 37 | 38 | -- * Folds 39 | , foldl' 40 | 41 | -- * Splitting 42 | , split 43 | 44 | -- * Hashing 45 | , fnv1a32 46 | , fnv1a64 47 | 48 | -- * Create 49 | , fromBytes 50 | , fromByteArray 51 | 52 | -- * Copy to buffer 53 | , unsafeCopy 54 | 55 | -- * I\/O with Handles 56 | , hGetContents 57 | , readFile 58 | , hPut 59 | , writeFile 60 | ) where 61 | 62 | import Prelude hiding (Foldable (..), concat, readFile, replicate, reverse, writeFile) 63 | 64 | import Control.Exception (IOException, catch) 65 | import Control.Monad.ST.Run (runIntByteArrayST) 66 | import Data.Bits (xor) 67 | import Data.ByteString (ByteString) 68 | import Data.Bytes.Types (Bytes (Bytes)) 69 | import Data.Primitive (ByteArray (..), MutableByteArray (..)) 70 | import Data.Word (Word32, Word64, Word8) 71 | import GHC.Exts (ByteArray#, Int (I#), Int#, MutableByteArray#, State#, (+#)) 72 | import GHC.ST (ST (..)) 73 | import System.IO (Handle, IOMode (ReadMode, WriteMode), hFileSize, withBinaryFile) 74 | 75 | import qualified Data.Bytes.Byte as Byte 76 | import qualified Data.Bytes.IO as IO 77 | import qualified Data.Bytes.Pure as Bytes 78 | import qualified Data.Bytes.Types as B 79 | import qualified Data.Primitive as PM 80 | import qualified GHC.Exts as Exts 81 | 82 | -- | A cons-list of byte sequences. 83 | data Chunks 84 | = ChunksCons {-# UNPACK #-} !Bytes !Chunks 85 | | ChunksNil 86 | deriving stock (Show) 87 | 88 | instance Semigroup Chunks where 89 | ChunksNil <> a = a 90 | cs@(ChunksCons _ _) <> ChunksNil = cs 91 | as@(ChunksCons _ _) <> bs@(ChunksCons _ _) = 92 | reverseOnto bs (reverse as) 93 | 94 | instance Monoid Chunks where 95 | mempty = ChunksNil 96 | 97 | -- | This uses @concat@ to form an equivalence class. 98 | instance Eq Chunks where 99 | -- TODO: There is a more efficient way to do this, but 100 | -- it is tedious. 101 | a == b = concat a == concat b 102 | 103 | -- | Add a byte sequence to the beginning. 104 | cons :: Bytes -> Chunks -> Chunks 105 | {-# INLINE cons #-} 106 | cons = ChunksCons 107 | 108 | {- | Repeat the byte sequence over and over. Returns empty chunks when given 109 | a negative repetition count. 110 | -} 111 | replicate :: 112 | Bytes -> 113 | -- | Number of times to repeat the sequence. 114 | Int -> 115 | Chunks 116 | replicate !b@(Bytes _ _ len) !n 117 | | n <= 0 = ChunksNil 118 | | len == 0 = ChunksNil 119 | | otherwise = go n ChunksNil 120 | where 121 | -- Implementation note: We do not have to reverse the chunks at the end. 122 | go i !acc = case i of 123 | 0 -> acc 124 | _ -> go (i - 1) (ChunksCons b acc) 125 | 126 | {- | Repeat the byte over and over. This builds a single byte array that 127 | is at most 64KiB and shares that across every @ChunksCons@ cell. 128 | 129 | An as example, creating a 2GiB chunks this way would use 64KiB for the 130 | byte array, and there would be the additional overhead of the 2^15 131 | @ChunksCons@ data constructors. On a 64-bit platform, @ChunksCons@ 132 | takes 40 bytes, so the total memory consumption would be 133 | @2^16 + 40 * 2^15@, which is roughly 1.37MB. The same reasoning 134 | shows that it takes about 83.95MB to represent a 128GiB chunks. 135 | 136 | The size of the shared payload is an implementation detail. Do not 137 | rely on this function producing 64KiB chunks. The implementation might 138 | one day change to something smarter that minimizes the memory footprint 139 | for very large chunks. 140 | -} 141 | replicateByte :: 142 | Word8 -> 143 | -- | Number of times to replicate the byte 144 | Int -> 145 | Chunks 146 | replicateByte !w !n 147 | | n <= 0 = ChunksNil 148 | | n < 65536 = ChunksCons (Bytes.replicate n w) ChunksNil 149 | | otherwise = go (Bytes.replicateU 65536 w) n ChunksNil 150 | where 151 | go !shared !remaining !acc 152 | | remaining == 0 = acc 153 | | remaining < 65536 = ChunksCons (Bytes shared 0 remaining) acc 154 | | otherwise = go shared (remaining - 65536) (ChunksCons (Bytes shared 0 65536) acc) 155 | 156 | -- | Are there any bytes in the chunked byte sequences? 157 | null :: Chunks -> Bool 158 | null = go 159 | where 160 | go ChunksNil = True 161 | go (ChunksCons (Bytes _ _ len) xs) = case len of 162 | 0 -> go xs 163 | _ -> False 164 | 165 | {- | Variant of 'concat' that ensure that the resulting byte 166 | sequence is pinned memory. 167 | -} 168 | concatPinned :: Chunks -> Bytes 169 | concatPinned x = case x of 170 | ChunksNil -> Bytes.emptyPinned 171 | ChunksCons b y -> case y of 172 | ChunksNil -> Bytes.pin b 173 | ChunksCons c z -> case concatPinnedFollowing2 b c z of 174 | (# len, r #) -> Bytes (ByteArray r) 0 (I# len) 175 | 176 | -- | Concatenate chunks into a strict bytestring. 177 | concatByteString :: Chunks -> ByteString 178 | concatByteString c = Bytes.pinnedToByteString (concatPinned c) 179 | 180 | -- | Concatenate chunks into a single contiguous byte sequence. 181 | concat :: Chunks -> Bytes 182 | concat x = case x of 183 | ChunksNil -> Bytes.empty 184 | ChunksCons b y -> case y of 185 | ChunksNil -> b 186 | ChunksCons c z -> case concatFollowing2 b c z of 187 | (# len, r #) -> Bytes (ByteArray r) 0 (I# len) 188 | 189 | -- | Variant of 'concat' that returns an unsliced byte sequence. 190 | concatU :: Chunks -> ByteArray 191 | concatU x = case x of 192 | ChunksNil -> mempty 193 | ChunksCons b y -> case y of 194 | ChunksNil -> Bytes.toByteArray b 195 | ChunksCons c z -> case concatFollowing2 b c z of 196 | (# _, r #) -> ByteArray r 197 | 198 | -- | Variant of 'concatPinned' that returns an unsliced pinned byte sequence. 199 | concatPinnedU :: Chunks -> ByteArray 200 | concatPinnedU x = case x of 201 | ChunksNil -> Bytes.emptyPinnedU 202 | ChunksCons b y -> case y of 203 | ChunksNil -> Bytes.toPinnedByteArray b 204 | ChunksCons c z -> case concatPinnedFollowing2 b c z of 205 | (# _, r #) -> ByteArray r 206 | 207 | concatFollowing2 :: Bytes -> Bytes -> Chunks -> (# Int#, ByteArray# #) 208 | concatFollowing2 = internalConcatFollowing2 PM.newByteArray 209 | 210 | concatPinnedFollowing2 :: Bytes -> Bytes -> Chunks -> (# Int#, ByteArray# #) 211 | concatPinnedFollowing2 = internalConcatFollowing2 PM.newPinnedByteArray 212 | 213 | internalConcatFollowing2 :: 214 | (forall s. Int -> ST s (MutableByteArray s)) -> 215 | Bytes -> 216 | Bytes -> 217 | Chunks -> 218 | (# Int#, ByteArray# #) 219 | {-# INLINE internalConcatFollowing2 #-} 220 | internalConcatFollowing2 221 | allocate 222 | (Bytes {array = c, offset = coff, length = szc}) 223 | (Bytes {array = d, offset = doff, length = szd}) 224 | ds = 225 | let !(I# x, ByteArray y) = runIntByteArrayST $ do 226 | let !szboth = szc + szd 227 | !len = chunksLengthGo szboth ds 228 | dst <- allocate len 229 | PM.copyByteArray dst 0 c coff szc 230 | PM.copyByteArray dst szc d doff szd 231 | -- Note: len2 will always be the same as len. 232 | !len2 <- unsafeCopy dst szboth ds 233 | result <- PM.unsafeFreezeByteArray dst 234 | pure (len2, result) 235 | in (# x, y #) 236 | 237 | -- | The total number of bytes in all the chunks. 238 | length :: Chunks -> Int 239 | length = chunksLengthGo 0 240 | 241 | chunksLengthGo :: Int -> Chunks -> Int 242 | chunksLengthGo !n ChunksNil = n 243 | chunksLengthGo !n (ChunksCons (Bytes {B.length = len}) cs) = 244 | chunksLengthGo (n + len) cs 245 | 246 | {- | Copy the contents of the chunks into a mutable array. 247 | Precondition: The destination must have enough space to 248 | house the contents. This is not checked. 249 | -} 250 | unsafeCopy :: 251 | -- | Destination 252 | MutableByteArray s -> 253 | -- | Destination offset 254 | Int -> 255 | -- | Source 256 | Chunks -> 257 | -- | Returns the next index into the destination after the payload 258 | ST s Int 259 | {-# INLINE unsafeCopy #-} 260 | unsafeCopy (MutableByteArray dst) (I# off) cs = 261 | ST 262 | ( \s0 -> case copy# dst off cs s0 of 263 | (# s1, nextOff #) -> (# s1, I# nextOff #) 264 | ) 265 | 266 | copy# :: MutableByteArray# s -> Int# -> Chunks -> State# s -> (# State# s, Int# #) 267 | copy# _ off ChunksNil s0 = (# s0, off #) 268 | copy# marr off (ChunksCons (Bytes {B.array, B.offset, B.length = len}) cs) s0 = 269 | case Exts.copyByteArray# (unBa array) (unI offset) marr off (unI len) s0 of 270 | s1 -> copy# marr (off +# unI len) cs s1 271 | 272 | -- | Reverse chunks but not the bytes within each chunk. 273 | reverse :: Chunks -> Chunks 274 | reverse = reverseOnto ChunksNil 275 | 276 | {- | Variant of 'reverse' that allows the caller to provide 277 | an initial list of chunks that the reversed chunks will 278 | be pushed onto. 279 | -} 280 | reverseOnto :: Chunks -> Chunks -> Chunks 281 | reverseOnto !x ChunksNil = x 282 | reverseOnto !x (ChunksCons y ys) = 283 | reverseOnto (ChunksCons y x) ys 284 | 285 | unI :: Int -> Int# 286 | {-# INLINE unI #-} 287 | unI (I# i) = i 288 | 289 | unBa :: ByteArray -> ByteArray# 290 | {-# INLINE unBa #-} 291 | unBa (ByteArray x) = x 292 | 293 | -- | Read a handle's entire contents strictly into chunks. 294 | hGetContents :: Handle -> IO Chunks 295 | hGetContents !h = hGetContentsCommon ChunksNil h 296 | 297 | -- | Read a handle's entire contents strictly into chunks. 298 | hGetContentsHint :: Int -> Handle -> IO Chunks 299 | hGetContentsHint !hint !h = do 300 | c <- IO.hGet h hint 301 | let !r = ChunksCons c ChunksNil 302 | if Bytes.length c == hint 303 | then pure r 304 | else hGetContentsCommon r h 305 | 306 | hGetContentsCommon :: 307 | Chunks -> -- reversed chunks 308 | Handle -> 309 | IO Chunks 310 | hGetContentsCommon !acc0 !h = go acc0 311 | where 312 | go !acc = do 313 | c <- IO.hGet h chunkSize 314 | let !r = ChunksCons c acc 315 | if Bytes.length c == chunkSize 316 | then go r 317 | else pure $! reverse r 318 | 319 | {- | Read an entire file strictly into chunks. If reading from a 320 | regular file, this makes an effort read the file into a single 321 | chunk. 322 | -} 323 | readFile :: FilePath -> IO Chunks 324 | readFile f = withBinaryFile f ReadMode $ \h -> do 325 | -- Implementation copied from bytestring. 326 | -- hFileSize fails if file is not regular file (like 327 | -- /dev/null). Catch exception and try reading anyway. 328 | filesz <- catch (hFileSize h) useZeroIfNotRegularFile 329 | let hint = (fromIntegral filesz `max` 255) + 1 330 | hGetContentsHint hint h 331 | where 332 | -- Our initial size is one bigger than the file size so that in the 333 | -- typical case we will read the whole file in one go and not have 334 | -- to allocate any more chunks. We'll still do the right thing if the 335 | -- file size is 0 or is changed before we do the read. 336 | 337 | useZeroIfNotRegularFile :: IOException -> IO Integer 338 | useZeroIfNotRegularFile _ = return 0 339 | 340 | chunkSize :: Int 341 | chunkSize = 16384 - 16 342 | 343 | -- | Create a list of chunks with a single chunk. 344 | fromBytes :: Bytes -> Chunks 345 | fromBytes !b = ChunksCons b ChunksNil 346 | 347 | -- | Variant of 'fromBytes' where the single chunk is unsliced. 348 | fromByteArray :: ByteArray -> Chunks 349 | fromByteArray !b = fromBytes (Bytes.fromByteArray b) 350 | 351 | -- | Left fold over all bytes in the chunks, strict in the accumulator. 352 | foldl' :: (a -> Word8 -> a) -> a -> Chunks -> a 353 | {-# INLINE foldl' #-} 354 | foldl' g = go 355 | where 356 | go !a ChunksNil = a 357 | go !a (ChunksCons c cs) = go (Bytes.foldl' g a c) cs 358 | 359 | -- | Hash byte sequence with 32-bit variant of FNV-1a. 360 | fnv1a32 :: Chunks -> Word32 361 | fnv1a32 !b = 362 | foldl' 363 | ( \acc w -> (fromIntegral @Word8 @Word32 w `xor` acc) * 0x01000193 364 | ) 365 | 0x811c9dc5 366 | b 367 | 368 | -- | Hash byte sequence with 64-bit variant of FNV-1a. 369 | fnv1a64 :: Chunks -> Word64 370 | fnv1a64 !b = 371 | foldl' 372 | ( \acc w -> (fromIntegral @Word8 @Word64 w `xor` acc) * 0x00000100000001B3 373 | ) 374 | 0xcbf29ce484222325 375 | b 376 | 377 | {- | Outputs 'Chunks' to the specified 'Handle'. This is implemented 378 | with 'IO.hPut'. 379 | -} 380 | hPut :: Handle -> Chunks -> IO () 381 | hPut h = go 382 | where 383 | go ChunksNil = pure () 384 | go (ChunksCons c cs) = IO.hPut h c *> go cs 385 | 386 | {- | Write 'Chunks' to a file, replacing the previous contents of 387 | the file. 388 | -} 389 | writeFile :: FilePath -> Chunks -> IO () 390 | writeFile path cs = withBinaryFile path WriteMode (\h -> hPut h cs) 391 | 392 | {- | Break chunks of bytes into contiguous pieces separated by the 393 | byte argument. This is a good producer for list fusion. For this 394 | function to perform well, each chunk should contain multiple separators. 395 | Any piece that spans multiple chunks must be copied. 396 | -} 397 | split :: Word8 -> Chunks -> [Bytes] 398 | {-# INLINE split #-} 399 | split !w !cs0 = 400 | Exts.build 401 | ( \g x0 -> 402 | -- It is possible to optimize for the common case where a 403 | -- piece does not span multiple chunks. However, such an 404 | -- optimization would actually cause this to tail call in 405 | -- two places rather than one and may actually adversely 406 | -- affect performance. It hasn't been benchmarked. 407 | let go !cs = case splitOnto ChunksNil w cs of 408 | (hd, tl) -> 409 | let !x = concat (reverse hd) 410 | in case tl of 411 | ChunksNil -> x0 412 | _ -> g x (go tl) 413 | in go cs0 414 | ) 415 | 416 | splitOnto :: Chunks -> Word8 -> Chunks -> (Chunks, Chunks) 417 | {-# INLINE splitOnto #-} 418 | splitOnto !acc0 !w !cs0 = go acc0 cs0 419 | where 420 | go !acc ChunksNil = (acc, ChunksNil) 421 | go !acc (ChunksCons b bs) = case Byte.split1 w b of 422 | Nothing -> go (ChunksCons b acc) bs 423 | Just (hd, tl) -> 424 | let !r1 = ChunksCons hd acc 425 | !r2 = ChunksCons tl bs 426 | in (r1, r2) 427 | -------------------------------------------------------------------------------- /src/Data/Bytes/Text/Latin1.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE MagicHash #-} 3 | {-# LANGUAGE TypeApplications #-} 4 | {-# LANGUAGE UnboxedTuples #-} 5 | 6 | {- | This module treats 'Bytes' data as holding text encoded in ISO-8859-1. This 7 | encoding can only encode codepoints strictly below @U+0100@, but this allows 8 | each codepoint to be placed directly into a single byte. This range consists 9 | of Unicode Basic Latin, Latin-1 Supplement and C0+C1 Controls, which includes 10 | ASCII. 11 | 12 | Strictly, ISO-8859-1 is not to be confused with ISO/IEC 8859-1 (which was the 13 | default encoding for webpages before HTML5). ISO/IEC 8859-1 lacks encodings 14 | for the C0 and C1 control characters. 15 | 16 | With HTML5, the default encoding of webpages was changed to Windows-1252, 17 | which is _not_ compatible with ISO-8859-1. Windows-1252 uses the C1 Control 18 | range (@U+0080@ -- @U+009F@) mostly to encode a variety of printable 19 | characters. For this encoding, see 'Data.Bytes.Text.Windows1252'. 20 | -} 21 | module Data.Bytes.Text.Latin1 22 | ( toString 23 | , fromString 24 | , decodeDecWord 25 | 26 | -- * Specialized Comparisons 27 | , equals1 28 | , equals2 29 | , equals3 30 | , equals4 31 | , equals5 32 | , equals6 33 | , equals7 34 | , equals8 35 | , equals9 36 | , equals10 37 | , equals11 38 | , equals12 39 | , equals13 40 | , equals14 41 | , equals15 42 | ) where 43 | 44 | import Prelude hiding (length) 45 | 46 | import Data.Bytes.Types (Bytes (..)) 47 | import Data.Char (chr, ord) 48 | import Data.Primitive (ByteArray (ByteArray)) 49 | import Data.Word (Word8) 50 | import GHC.Exts (Char (C#), Int (I#), Word (W#), Word#, int2Word#, ltWord#, or#) 51 | 52 | import qualified Data.Bytes.Pure as Bytes 53 | import qualified Data.Primitive as PM 54 | import qualified GHC.Exts as Exts 55 | 56 | {- | Convert a 'String' consisting of only characters representable 57 | by ISO-8859-1. These are encoded with ISO-8859-1. Any character 58 | with a codepoint above @U+00FF@ is replaced by an unspecified byte. 59 | -} 60 | fromString :: String -> Bytes 61 | fromString = 62 | Bytes.fromByteArray . Exts.fromList . map (fromIntegral @Int @Word8 . ord) 63 | 64 | -- | Interpret a byte sequence as text encoded by ISO-8859-1. 65 | toString :: Bytes -> String 66 | {-# INLINE toString #-} 67 | toString = Bytes.foldr (\w xs -> chr (fromIntegral @Word8 @Int w) : xs) [] 68 | 69 | -- TODO presumably also fromText and fromShortText 70 | 71 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 72 | a singleton whose element matches the character? 73 | -} 74 | equals1 :: Char -> Bytes -> Bool 75 | {-# INLINE equals1 #-} 76 | equals1 !c0 (Bytes arr off len) = case len of 77 | 1 -> c0 == indexCharArray arr off 78 | _ -> False 79 | 80 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 81 | a doubleton whose elements match the characters? 82 | -} 83 | equals2 :: Char -> Char -> Bytes -> Bool 84 | equals2 !c0 !c1 (Bytes arr off len) = case len of 85 | 2 -> 86 | c0 == indexCharArray arr off 87 | && c1 == indexCharArray arr (off + 1) 88 | _ -> False 89 | 90 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 91 | a tripleton whose elements match the characters? 92 | -} 93 | equals3 :: Char -> Char -> Char -> Bytes -> Bool 94 | equals3 !c0 !c1 !c2 (Bytes arr off len) = case len of 95 | 3 -> 96 | c0 == indexCharArray arr off 97 | && c1 == indexCharArray arr (off + 1) 98 | && c2 == indexCharArray arr (off + 2) 99 | _ -> False 100 | 101 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 102 | a quadrupleton whose elements match the characters? 103 | -} 104 | equals4 :: Char -> Char -> Char -> Char -> Bytes -> Bool 105 | equals4 !c0 !c1 !c2 !c3 (Bytes arr off len) = case len of 106 | 4 -> 107 | c0 == indexCharArray arr off 108 | && c1 == indexCharArray arr (off + 1) 109 | && c2 == indexCharArray arr (off + 2) 110 | && c3 == indexCharArray arr (off + 3) 111 | _ -> False 112 | 113 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 114 | a quintupleton whose elements match the characters? 115 | -} 116 | equals5 :: Char -> Char -> Char -> Char -> Char -> Bytes -> Bool 117 | equals5 !c0 !c1 !c2 !c3 !c4 (Bytes arr off len) = case len of 118 | 5 -> 119 | c0 == indexCharArray arr off 120 | && c1 == indexCharArray arr (off + 1) 121 | && c2 == indexCharArray arr (off + 2) 122 | && c3 == indexCharArray arr (off + 3) 123 | && c4 == indexCharArray arr (off + 4) 124 | _ -> False 125 | 126 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 127 | a sextupleton whose elements match the characters? 128 | -} 129 | equals6 :: Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool 130 | equals6 !c0 !c1 !c2 !c3 !c4 !c5 (Bytes arr off len) = case len of 131 | 6 -> 132 | c0 == indexCharArray arr off 133 | && c1 == indexCharArray arr (off + 1) 134 | && c2 == indexCharArray arr (off + 2) 135 | && c3 == indexCharArray arr (off + 3) 136 | && c4 == indexCharArray arr (off + 4) 137 | && c5 == indexCharArray arr (off + 5) 138 | _ -> False 139 | 140 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 141 | a septupleton whose elements match the characters? 142 | -} 143 | equals7 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool 144 | equals7 !c0 !c1 !c2 !c3 !c4 !c5 !c6 (Bytes arr off len) = case len of 145 | 7 -> 146 | c0 == indexCharArray arr off 147 | && c1 == indexCharArray arr (off + 1) 148 | && c2 == indexCharArray arr (off + 2) 149 | && c3 == indexCharArray arr (off + 3) 150 | && c4 == indexCharArray arr (off + 4) 151 | && c5 == indexCharArray arr (off + 5) 152 | && c6 == indexCharArray arr (off + 6) 153 | _ -> False 154 | 155 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 156 | an octupleton whose elements match the characters? 157 | -} 158 | equals8 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool 159 | equals8 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 (Bytes arr off len) = case len of 160 | 8 -> 161 | c0 == indexCharArray arr off 162 | && c1 == indexCharArray arr (off + 1) 163 | && c2 == indexCharArray arr (off + 2) 164 | && c3 == indexCharArray arr (off + 3) 165 | && c4 == indexCharArray arr (off + 4) 166 | && c5 == indexCharArray arr (off + 5) 167 | && c6 == indexCharArray arr (off + 6) 168 | && c7 == indexCharArray arr (off + 7) 169 | _ -> False 170 | 171 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 172 | a 9-tuple whose elements match the characters? 173 | -} 174 | equals9 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool 175 | equals9 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 (Bytes arr off len) = case len of 176 | 9 -> 177 | c0 == indexCharArray arr off 178 | && c1 == indexCharArray arr (off + 1) 179 | && c2 == indexCharArray arr (off + 2) 180 | && c3 == indexCharArray arr (off + 3) 181 | && c4 == indexCharArray arr (off + 4) 182 | && c5 == indexCharArray arr (off + 5) 183 | && c6 == indexCharArray arr (off + 6) 184 | && c7 == indexCharArray arr (off + 7) 185 | && c8 == indexCharArray arr (off + 8) 186 | _ -> False 187 | 188 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 189 | a 10-tuple whose elements match the characters? 190 | -} 191 | equals10 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool 192 | equals10 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 (Bytes arr off len) = case len of 193 | 10 -> 194 | c0 == indexCharArray arr off 195 | && c1 == indexCharArray arr (off + 1) 196 | && c2 == indexCharArray arr (off + 2) 197 | && c3 == indexCharArray arr (off + 3) 198 | && c4 == indexCharArray arr (off + 4) 199 | && c5 == indexCharArray arr (off + 5) 200 | && c6 == indexCharArray arr (off + 6) 201 | && c7 == indexCharArray arr (off + 7) 202 | && c8 == indexCharArray arr (off + 8) 203 | && c9 == indexCharArray arr (off + 9) 204 | _ -> False 205 | 206 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 207 | a 11-tuple whose elements match the characters? 208 | -} 209 | equals11 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool 210 | equals11 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 (Bytes arr off len) = case len of 211 | 11 -> 212 | c0 == indexCharArray arr off 213 | && c1 == indexCharArray arr (off + 1) 214 | && c2 == indexCharArray arr (off + 2) 215 | && c3 == indexCharArray arr (off + 3) 216 | && c4 == indexCharArray arr (off + 4) 217 | && c5 == indexCharArray arr (off + 5) 218 | && c6 == indexCharArray arr (off + 6) 219 | && c7 == indexCharArray arr (off + 7) 220 | && c8 == indexCharArray arr (off + 8) 221 | && c9 == indexCharArray arr (off + 9) 222 | && c10 == indexCharArray arr (off + 10) 223 | _ -> False 224 | 225 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 226 | a 12-tuple whose elements match the characters? 227 | -} 228 | equals12 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool 229 | equals12 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 !c11 (Bytes arr off len) = case len of 230 | 12 -> 231 | c0 == indexCharArray arr off 232 | && c1 == indexCharArray arr (off + 1) 233 | && c2 == indexCharArray arr (off + 2) 234 | && c3 == indexCharArray arr (off + 3) 235 | && c4 == indexCharArray arr (off + 4) 236 | && c5 == indexCharArray arr (off + 5) 237 | && c6 == indexCharArray arr (off + 6) 238 | && c7 == indexCharArray arr (off + 7) 239 | && c8 == indexCharArray arr (off + 8) 240 | && c9 == indexCharArray arr (off + 9) 241 | && c10 == indexCharArray arr (off + 10) 242 | && c11 == indexCharArray arr (off + 11) 243 | _ -> False 244 | 245 | equals13 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool 246 | equals13 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 !c11 !c12 (Bytes arr off len) = case len of 247 | 13 -> 248 | c0 == indexCharArray arr off 249 | && c1 == indexCharArray arr (off + 1) 250 | && c2 == indexCharArray arr (off + 2) 251 | && c3 == indexCharArray arr (off + 3) 252 | && c4 == indexCharArray arr (off + 4) 253 | && c5 == indexCharArray arr (off + 5) 254 | && c6 == indexCharArray arr (off + 6) 255 | && c7 == indexCharArray arr (off + 7) 256 | && c8 == indexCharArray arr (off + 8) 257 | && c9 == indexCharArray arr (off + 9) 258 | && c10 == indexCharArray arr (off + 10) 259 | && c11 == indexCharArray arr (off + 11) 260 | && c12 == indexCharArray arr (off + 12) 261 | _ -> False 262 | 263 | equals14 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool 264 | equals14 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 !c11 !c12 !c13 (Bytes arr off len) = case len of 265 | 14 -> 266 | c0 == indexCharArray arr off 267 | && c1 == indexCharArray arr (off + 1) 268 | && c2 == indexCharArray arr (off + 2) 269 | && c3 == indexCharArray arr (off + 3) 270 | && c4 == indexCharArray arr (off + 4) 271 | && c5 == indexCharArray arr (off + 5) 272 | && c6 == indexCharArray arr (off + 6) 273 | && c7 == indexCharArray arr (off + 7) 274 | && c8 == indexCharArray arr (off + 8) 275 | && c9 == indexCharArray arr (off + 9) 276 | && c10 == indexCharArray arr (off + 10) 277 | && c11 == indexCharArray arr (off + 11) 278 | && c12 == indexCharArray arr (off + 12) 279 | && c13 == indexCharArray arr (off + 13) 280 | _ -> False 281 | 282 | equals15 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool 283 | equals15 !c0 !c1 !c2 !c3 !c4 !c5 !c6 !c7 !c8 !c9 !c10 !c11 !c12 !c13 !c14 (Bytes arr off len) = case len of 284 | 15 -> 285 | c0 == indexCharArray arr off 286 | && c1 == indexCharArray arr (off + 1) 287 | && c2 == indexCharArray arr (off + 2) 288 | && c3 == indexCharArray arr (off + 3) 289 | && c4 == indexCharArray arr (off + 4) 290 | && c5 == indexCharArray arr (off + 5) 291 | && c6 == indexCharArray arr (off + 6) 292 | && c7 == indexCharArray arr (off + 7) 293 | && c8 == indexCharArray arr (off + 8) 294 | && c9 == indexCharArray arr (off + 9) 295 | && c10 == indexCharArray arr (off + 10) 296 | && c11 == indexCharArray arr (off + 11) 297 | && c12 == indexCharArray arr (off + 12) 298 | && c13 == indexCharArray arr (off + 13) 299 | && c14 == indexCharArray arr (off + 14) 300 | _ -> False 301 | 302 | indexCharArray :: ByteArray -> Int -> Char 303 | indexCharArray (ByteArray arr) (I# off) = C# (Exts.indexCharArray# arr off) 304 | 305 | {- | Decode machine-sized word from decimal representation. Returns 306 | Nothing on overflow. Allows any number of leading zeros. Trailing 307 | non-digit bytes cause Nothing to be returned. 308 | -} 309 | decodeDecWord :: Bytes -> Maybe Word 310 | {-# INLINE decodeDecWord #-} 311 | decodeDecWord !b = case decWordStart b of 312 | (# (# #) | #) -> Nothing 313 | (# | w #) -> Just (W# w) 314 | 315 | decWordStart :: 316 | Bytes -> -- Chunk 317 | (# (# #) | Word# #) 318 | {-# NOINLINE decWordStart #-} 319 | decWordStart !chunk0 = 320 | if length chunk0 > 0 321 | then 322 | let !w = 323 | fromIntegral @Word8 @Word 324 | (PM.indexByteArray (array chunk0) (offset chunk0)) 325 | - 48 326 | in if w < 10 327 | then decWordMore w (Bytes.unsafeDrop 1 chunk0) 328 | else (# (# #) | #) 329 | else (# (# #) | #) 330 | where 331 | decWordMore :: 332 | Word -> -- Accumulator 333 | Bytes -> -- Chunk 334 | (# (# #) | Word# #) 335 | decWordMore !acc !chunk = 336 | let len = length chunk 337 | in case len of 338 | 0 -> (# | unW (fromIntegral acc) #) 339 | _ -> 340 | let !w = 341 | fromIntegral @Word8 @Word 342 | (PM.indexByteArray (array chunk) (offset chunk)) 343 | - 48 344 | in if w < 10 345 | then 346 | let (overflow, acc') = unsignedPushBase10 acc w 347 | in if overflow 348 | then (# (# #) | #) 349 | else decWordMore acc' (Bytes.unsafeDrop 1 chunk) 350 | else (# (# #) | #) 351 | 352 | unsignedPushBase10 :: Word -> Word -> (Bool, Word) 353 | {-# INLINE unsignedPushBase10 #-} 354 | unsignedPushBase10 (W# a) (W# b) = 355 | let !(# ca, r0 #) = Exts.timesWord2# a 10## 356 | !r1 = Exts.plusWord# r0 b 357 | !cb = int2Word# (ltWord# r1 r0) 358 | !c = ca `or#` cb 359 | in (case c of 0## -> False; _ -> True, W# r1) 360 | 361 | unW :: Word -> Word# 362 | {-# INLINE unW #-} 363 | unW (W# w) = w 364 | -------------------------------------------------------------------------------- /src/Data/Bytes/Pure.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE BlockArguments #-} 3 | {-# LANGUAGE MagicHash #-} 4 | {-# LANGUAGE TypeApplications #-} 5 | 6 | module Data.Bytes.Pure 7 | ( empty 8 | , emptyPinned 9 | , emptyPinnedU 10 | , pin 11 | , contents 12 | , unsafeCopy 13 | , toByteArray 14 | , toByteArrayClone 15 | , toPinnedByteArray 16 | , toPinnedByteArrayClone 17 | , fromByteArray 18 | , fromPrimArray 19 | , length 20 | , foldlM 21 | , foldrM 22 | , foldl 23 | , foldl' 24 | , foldr 25 | , ifoldl' 26 | , foldr' 27 | , fnv1a32 28 | , fnv1a64 29 | , toByteString 30 | , pinnedToByteString 31 | , fromByteString 32 | , fromLazyByteString 33 | , unsafeDrop 34 | , unsafeTake 35 | , unsafeIndex 36 | , unsafeHead 37 | , map 38 | , mapU 39 | , null 40 | , toShortByteString 41 | , replicate 42 | , replicateU 43 | , splitTetragram1 44 | , findTetragramIndex 45 | , countWhile 46 | , countWhileEnd 47 | , any 48 | , all 49 | ) where 50 | 51 | import Prelude hiding (Foldable (..), map, replicate, any, all) 52 | 53 | import Control.Monad.Primitive (PrimMonad, PrimState) 54 | import Control.Monad.ST.Run (runByteArrayST) 55 | import Data.Bits (unsafeShiftL, (.|.)) 56 | import Data.Bits (xor) 57 | import Data.ByteString (ByteString) 58 | import Data.ByteString.Short.Internal (ShortByteString (SBS)) 59 | import Data.Bytes.Types (Bytes (Bytes)) 60 | import Data.Primitive (ByteArray (ByteArray), MutableByteArray, PrimArray (PrimArray)) 61 | import Data.Word (Word32, Word64, Word8) 62 | import Foreign.Ptr (Ptr, plusPtr) 63 | import GHC.IO (unsafeIOToST) 64 | 65 | import qualified Data.ByteString as ByteString 66 | import qualified Data.ByteString.Internal as ByteString 67 | import qualified Data.ByteString.Lazy as LBS 68 | import qualified Data.ByteString.Lazy.Internal as LBS 69 | import qualified Data.ByteString.Unsafe as ByteString 70 | import qualified Data.Primitive as PM 71 | import qualified GHC.Exts as Exts 72 | import qualified GHC.ForeignPtr as ForeignPtr 73 | 74 | -- | The empty byte sequence. 75 | empty :: Bytes 76 | empty = Bytes mempty 0 0 77 | 78 | -- | The empty pinned byte sequence. 79 | emptyPinned :: Bytes 80 | emptyPinned = 81 | Bytes 82 | ( runByteArrayST 83 | (PM.newPinnedByteArray 0 >>= PM.unsafeFreezeByteArray) 84 | ) 85 | 0 86 | 0 87 | 88 | -- | The empty pinned byte sequence. 89 | emptyPinnedU :: ByteArray 90 | emptyPinnedU = 91 | runByteArrayST 92 | (PM.newPinnedByteArray 0 >>= PM.unsafeFreezeByteArray) 93 | 94 | {- | Yields a pinned byte sequence whose contents are identical to those 95 | of the original byte sequence. If the @ByteArray@ backing the argument 96 | was already pinned, this simply aliases the argument and does not perform 97 | any copying. 98 | -} 99 | pin :: Bytes -> Bytes 100 | pin b@(Bytes arr _ len) = case PM.isByteArrayPinned arr of 101 | True -> b 102 | False -> 103 | Bytes 104 | ( runByteArrayST do 105 | dst <- PM.newPinnedByteArray len 106 | unsafeCopy dst 0 b 107 | PM.unsafeFreezeByteArray dst 108 | ) 109 | 0 110 | len 111 | 112 | {- | Convert the sliced 'Bytes' to an unsliced 'ByteArray'. This 113 | reuses the array backing the sliced 'Bytes' if the slicing metadata 114 | implies that all of the bytes are used. Otherwise, it makes a copy. 115 | -} 116 | toByteArray :: Bytes -> ByteArray 117 | {-# INLINE toByteArray #-} 118 | toByteArray b@(Bytes arr off len) 119 | | off == 0, PM.sizeofByteArray arr == len = arr 120 | | otherwise = toByteArrayClone b 121 | 122 | {- | Variant of 'toByteArray' that unconditionally makes a copy of 123 | the array backing the sliced 'Bytes' even if the original array 124 | could be reused. Prefer 'toByteArray'. 125 | -} 126 | toByteArrayClone :: Bytes -> ByteArray 127 | {-# INLINE toByteArrayClone #-} 128 | toByteArrayClone (Bytes arr off len) = runByteArrayST $ do 129 | m <- PM.newByteArray len 130 | PM.copyByteArray m 0 arr off len 131 | PM.unsafeFreezeByteArray m 132 | 133 | {- | Copy the byte sequence into a mutable buffer. The buffer must have 134 | enough space to accomodate the byte sequence, but this this is not 135 | checked. 136 | -} 137 | unsafeCopy :: 138 | (PrimMonad m) => 139 | -- | Destination 140 | MutableByteArray (PrimState m) -> 141 | -- | Destination Offset 142 | Int -> 143 | -- | Source 144 | Bytes -> 145 | m () 146 | {-# INLINE unsafeCopy #-} 147 | unsafeCopy dst dstIx (Bytes src srcIx len) = 148 | PM.copyByteArray dst dstIx src srcIx len 149 | 150 | -- | Create a slice of 'Bytes' that spans the entire argument array. 151 | fromByteArray :: ByteArray -> Bytes 152 | {-# INLINE fromByteArray #-} 153 | fromByteArray b = Bytes b 0 (PM.sizeofByteArray b) 154 | 155 | -- | Create a slice of 'Bytes' that spans the entire 'PrimArray' of 8-bit words. 156 | fromPrimArray :: PrimArray Word8 -> Bytes 157 | {-# INLINE fromPrimArray #-} 158 | fromPrimArray p@(PrimArray b) = Bytes (ByteArray b) 0 (PM.sizeofPrimArray p) 159 | 160 | -- | The length of a slice of bytes. 161 | length :: Bytes -> Int 162 | {-# INLINE length #-} 163 | length (Bytes _ _ len) = len 164 | 165 | -- | Hash byte sequence with 32-bit variant of FNV-1a. 166 | fnv1a32 :: Bytes -> Word32 167 | fnv1a32 !b = 168 | foldl' 169 | ( \acc w -> (fromIntegral @Word8 @Word32 w `xor` acc) * 0x01000193 170 | ) 171 | 0x811c9dc5 172 | b 173 | 174 | -- | Hash byte sequence with 64-bit variant of FNV-1a. 175 | fnv1a64 :: Bytes -> Word64 176 | fnv1a64 !b = 177 | foldl' 178 | ( \acc w -> (fromIntegral @Word8 @Word64 w `xor` acc) * 0x00000100000001B3 179 | ) 180 | 0xcbf29ce484222325 181 | b 182 | 183 | -- | Left fold over bytes, non-strict in the accumulator. 184 | foldl :: (a -> Word8 -> a) -> a -> Bytes -> a 185 | {-# INLINE foldl #-} 186 | foldl f a0 (Bytes arr off0 len0) = 187 | go (off0 + len0 - 1) (len0 - 1) 188 | where 189 | go !off !ix = case ix of 190 | (-1) -> a0 191 | _ -> f (go (off - 1) (ix - 1)) (PM.indexByteArray arr off) 192 | 193 | -- | Left fold over bytes, strict in the accumulator. 194 | foldl' :: (a -> Word8 -> a) -> a -> Bytes -> a 195 | {-# INLINE foldl' #-} 196 | foldl' f a0 (Bytes arr off0 len0) = go a0 off0 len0 197 | where 198 | go !a !off !len = case len of 199 | 0 -> a 200 | _ -> go (f a (PM.indexByteArray arr off)) (off + 1) (len - 1) 201 | 202 | -- | Left monadic fold over bytes, non-strict in the accumulator. 203 | foldlM :: (Monad m) => (a -> Word8 -> m a) -> a -> Bytes -> m a 204 | {-# INLINE foldlM #-} 205 | foldlM f a0 (Bytes arr off0 len0) = go a0 off0 len0 206 | where 207 | go a !off !len = case len of 208 | 0 -> pure a 209 | _ -> do 210 | a' <- f a (PM.indexByteArray arr off) 211 | go a' (off + 1) (len - 1) 212 | 213 | -- | Right fold over bytes, non-strict in the accumulator. 214 | foldr :: (Word8 -> a -> a) -> a -> Bytes -> a 215 | {-# INLINE foldr #-} 216 | foldr f a0 (Bytes arr off0 len0) = go off0 len0 217 | where 218 | go !off !len = case len of 219 | 0 -> a0 220 | _ -> f (PM.indexByteArray arr off) (go (off + 1) (len - 1)) 221 | 222 | {- | Left fold over bytes, strict in the accumulator. The reduction function 223 | is applied to each element along with its index. 224 | -} 225 | ifoldl' :: (a -> Int -> Word8 -> a) -> a -> Bytes -> a 226 | {-# INLINE ifoldl' #-} 227 | ifoldl' f a0 (Bytes arr off0 len0) = go a0 0 off0 len0 228 | where 229 | go !a !ix !off !len = case len of 230 | 0 -> a 231 | _ -> go (f a ix (PM.indexByteArray arr off)) (ix + 1) (off + 1) (len - 1) 232 | 233 | -- | Right fold over bytes, strict in the accumulator. 234 | foldr' :: (Word8 -> a -> a) -> a -> Bytes -> a 235 | {-# INLINE foldr' #-} 236 | foldr' f a0 (Bytes arr off0 len0) = 237 | go a0 (off0 + len0 - 1) (len0 - 1) 238 | where 239 | go !a !off !ix = case ix of 240 | (-1) -> a 241 | _ -> go (f (PM.indexByteArray arr off) a) (off - 1) (ix - 1) 242 | 243 | -- | Right monadic fold over bytes, non-strict in the accumulator. 244 | foldrM :: (Monad m) => (Word8 -> a -> m a) -> a -> Bytes -> m a 245 | {-# INLINE foldrM #-} 246 | foldrM f a0 (Bytes arr off0 len0) = 247 | go a0 (off0 + len0 - 1) (len0 - 1) 248 | where 249 | go !a !off !ix = case ix of 250 | (-1) -> pure a 251 | _ -> do 252 | a' <- f (PM.indexByteArray arr off) a 253 | go a' (off - 1) (ix - 1) 254 | 255 | {- | Yields a pointer to the beginning of the byte sequence. It is only safe 256 | to call this on a 'Bytes' backed by a pinned @ByteArray@. 257 | -} 258 | contents :: Bytes -> Ptr Word8 259 | {-# INLINE contents #-} 260 | contents (Bytes arr off _) = plusPtr (PM.byteArrayContents arr) off 261 | 262 | {- | Convert the sliced 'Bytes' to an unsliced 'ByteArray'. This 263 | reuses the array backing the sliced 'Bytes' if the slicing metadata 264 | implies that all of the bytes are used and they are already pinned. 265 | Otherwise, it makes a copy. 266 | -} 267 | toPinnedByteArray :: Bytes -> ByteArray 268 | {-# INLINE toPinnedByteArray #-} 269 | toPinnedByteArray b@(Bytes arr off len) 270 | | off == 0, PM.sizeofByteArray arr == len, PM.isByteArrayPinned arr = arr 271 | | otherwise = toPinnedByteArrayClone b 272 | 273 | {- | Variant of 'toPinnedByteArray' that unconditionally makes a copy of 274 | the array backing the sliced 'Bytes' even if the original array 275 | could be reused. Prefer 'toPinnedByteArray'. 276 | -} 277 | toPinnedByteArrayClone :: Bytes -> ByteArray 278 | toPinnedByteArrayClone (Bytes arr off len) = runByteArrayST $ do 279 | m <- PM.newPinnedByteArray len 280 | PM.copyByteArray m 0 arr off len 281 | PM.unsafeFreezeByteArray m 282 | 283 | {- | /O(n)/ when unpinned, /O(1)/ when pinned. Create a 'ByteString' from 284 | a byte sequence. This only copies the byte sequence if it is not pinned. 285 | -} 286 | toByteString :: Bytes -> ByteString 287 | toByteString !b = pinnedToByteString (pin b) 288 | 289 | {- | Convert a pinned 'Bytes' to a 'ByteString' 290 | /O(1)/ Precondition: bytes are pinned. Behavior is undefined otherwise. 291 | -} 292 | pinnedToByteString :: Bytes -> ByteString 293 | pinnedToByteString (Bytes y@(PM.ByteArray x) off len) = 294 | ByteString.PS 295 | ( ForeignPtr.ForeignPtr 296 | (case plusPtr (PM.byteArrayContents y) off of Exts.Ptr p -> p) 297 | (ForeignPtr.PlainPtr (Exts.unsafeCoerce# x)) 298 | ) 299 | 0 300 | len 301 | 302 | -- | /O(n)/ Copy a 'ByteString' to a byte sequence. 303 | fromByteString :: ByteString -> Bytes 304 | fromByteString !b = 305 | Bytes 306 | ( runByteArrayST $ unsafeIOToST $ do 307 | dst@(PM.MutableByteArray dst#) <- PM.newByteArray len 308 | ByteString.unsafeUseAsCString b $ \src -> do 309 | PM.copyPtrToMutablePrimArray (PM.MutablePrimArray dst#) 0 src len 310 | PM.unsafeFreezeByteArray dst 311 | ) 312 | 0 313 | len 314 | where 315 | !len = ByteString.length b 316 | 317 | -- | /O(n)/ Copy a lazy bytestring to a byte sequence. 318 | fromLazyByteString :: LBS.ByteString -> Bytes 319 | fromLazyByteString x = case LBS.length x of 320 | 0 -> empty 321 | n64 -> 322 | let n = fromIntegral n64 :: Int 323 | in Bytes 324 | ( runByteArrayST $ unsafeIOToST $ do 325 | dst@(PM.MutableByteArray dst#) <- PM.newByteArray n 326 | let loop chunks !ix = case chunks of 327 | LBS.Empty -> PM.unsafeFreezeByteArray dst 328 | LBS.Chunk c cs -> do 329 | let !len = ByteString.length c 330 | ByteString.unsafeUseAsCString c $ \src -> do 331 | PM.copyPtrToMutablePrimArray (PM.MutablePrimArray dst#) ix src len 332 | loop cs (ix + len) 333 | loop x 0 334 | ) 335 | 0 336 | n 337 | 338 | -- | Drop the first @n@ bytes from the argument. Precondition: @n ≤ len@ 339 | unsafeDrop :: Int -> Bytes -> Bytes 340 | {-# INLINE unsafeDrop #-} 341 | unsafeDrop n (Bytes arr off len) = 342 | Bytes arr (off + n) (len - n) 343 | 344 | -- | Variant of 'map' that returns unsliced byte sequence. 345 | mapU :: (Word8 -> Word8) -> Bytes -> ByteArray 346 | {-# INLINE mapU #-} 347 | mapU f (Bytes array ix0 len) = runByteArrayST do 348 | dst <- PM.newByteArray len 349 | let go !srcIx !dstIx = 350 | if dstIx < len 351 | then do 352 | let w = PM.indexByteArray array srcIx :: Word8 353 | PM.writeByteArray dst dstIx (f w) 354 | go (srcIx + 1) (dstIx + 1) 355 | else PM.unsafeFreezeByteArray dst 356 | go ix0 0 357 | 358 | {- | Map over bytes in a sequence. The result has the same length as 359 | the argument. 360 | -} 361 | map :: (Word8 -> Word8) -> Bytes -> Bytes 362 | {-# INLINE map #-} 363 | map f !b = Bytes (mapU f b) 0 (length b) 364 | 365 | -- | Is the byte sequence empty? 366 | null :: Bytes -> Bool 367 | {-# INLINE null #-} 368 | null (Bytes _ _ len) = len == 0 369 | 370 | -- | Take the first @n@ bytes from the argument. Precondition: @n ≤ len@ 371 | unsafeTake :: Int -> Bytes -> Bytes 372 | {-# INLINE unsafeTake #-} 373 | unsafeTake n (Bytes arr off _) = 374 | Bytes arr off n 375 | 376 | {- | Index into the byte sequence at the given position. This index 377 | must be less than the length. 378 | -} 379 | unsafeIndex :: Bytes -> Int -> Word8 380 | {-# INLINE unsafeIndex #-} 381 | unsafeIndex (Bytes arr off _) ix = PM.indexByteArray arr (off + ix) 382 | 383 | -- | Access the first byte. The given 'Bytes' must be non-empty. 384 | {-# INLINE unsafeHead #-} 385 | unsafeHead :: Bytes -> Word8 386 | unsafeHead bs = unsafeIndex bs 0 387 | 388 | {- | Convert the sliced 'Bytes' to an unsliced 'ShortByteString'. This 389 | reuses the array backing the sliced 'Bytes' if the slicing metadata 390 | implies that all of the bytes are used. Otherwise, it makes a copy. 391 | -} 392 | toShortByteString :: Bytes -> ShortByteString 393 | {-# INLINE toShortByteString #-} 394 | toShortByteString !b = case toByteArray b of 395 | PM.ByteArray x -> SBS x 396 | 397 | -- | Replicate a byte @n@ times. 398 | replicate :: 399 | -- | Desired length @n@ 400 | Int -> 401 | -- | Byte to replicate 402 | Word8 -> 403 | Bytes 404 | replicate !n !w = Bytes (replicateU n w) 0 n 405 | 406 | -- | Variant of 'replicate' that returns a unsliced byte array. 407 | replicateU :: Int -> Word8 -> ByteArray 408 | replicateU !n !w = runByteArrayST do 409 | arr <- PM.newByteArray n 410 | PM.setByteArray arr 0 n w 411 | PM.unsafeFreezeByteArray arr 412 | 413 | splitTetragram1 :: 414 | Word8 -> 415 | Word8 -> 416 | Word8 -> 417 | Word8 -> 418 | Bytes -> 419 | Maybe (Bytes, Bytes) 420 | splitTetragram1 !w0 !w1 !w2 !w3 !b = case findTetragramIndex w0 w1 w2 w3 b of 421 | Nothing -> Nothing 422 | Just n -> Just (unsafeTake n b, unsafeDrop (n + 4) b) 423 | 424 | findTetragramIndex :: 425 | Word8 -> 426 | Word8 -> 427 | Word8 -> 428 | Word8 -> 429 | Bytes -> 430 | Maybe Int 431 | findTetragramIndex !w0 !w1 !w2 !w3 (Bytes arr off len) = 432 | if len < 4 433 | then Nothing 434 | else 435 | let !target = 436 | unsafeShiftL (fromIntegral w0 :: Word32) 24 437 | .|. unsafeShiftL (fromIntegral w1 :: Word32) 16 438 | .|. unsafeShiftL (fromIntegral w2 :: Word32) 8 439 | .|. unsafeShiftL (fromIntegral w3 :: Word32) 0 440 | !end = off + len 441 | go !ix !acc = 442 | if acc == target 443 | then 444 | let n = ix - off 445 | in Just (n - 4) 446 | else 447 | if ix < end 448 | then 449 | let !w = PM.indexByteArray arr ix :: Word8 450 | acc' = 451 | (fromIntegral w :: Word32) 452 | .|. unsafeShiftL acc 8 453 | in go (ix + 1) acc' 454 | else Nothing 455 | !acc0 = 456 | unsafeShiftL (fromIntegral (PM.indexByteArray arr 0 :: Word8) :: Word32) 24 457 | .|. unsafeShiftL (fromIntegral (PM.indexByteArray arr 1 :: Word8) :: Word32) 16 458 | .|. unsafeShiftL (fromIntegral (PM.indexByteArray arr 2 :: Word8) :: Word32) 8 459 | .|. unsafeShiftL (fromIntegral (PM.indexByteArray arr 3 :: Word8) :: Word32) 0 460 | in go 4 acc0 461 | 462 | -- Internal. The returns the number of bytes that match the 463 | -- predicate until the first non-match occurs. If all bytes 464 | -- match the predicate, this will return the length originally 465 | -- provided. 466 | countWhile :: (Word8 -> Bool) -> Bytes -> Int 467 | {-# INLINE countWhile #-} 468 | countWhile k (Bytes arr off0 len0) = go off0 len0 0 469 | where 470 | go !off !len !n = 471 | if len > 0 472 | then 473 | if k (PM.indexByteArray arr off) 474 | then go (off + 1) (len - 1) (n + 1) 475 | else n 476 | else n 477 | 478 | -- Internal. Variant of countWhile that starts from the end 479 | -- of the string instead of the beginning. 480 | countWhileEnd :: (Word8 -> Bool) -> Bytes -> Int 481 | {-# INLINE countWhileEnd #-} 482 | countWhileEnd k (Bytes arr off0 len0) = go (off0 + len0 - 1) (len0 - 1) 0 483 | where 484 | go !off !len !n = 485 | if len >= 0 486 | then 487 | if k (PM.indexByteArray arr off) 488 | then go (off - 1) (len - 1) (n + 1) 489 | else n 490 | else n 491 | 492 | -- | /O(n)/ Returns true if any byte in the sequence satisfies the predicate. 493 | any :: (Word8 -> Bool) -> Bytes -> Bool 494 | {-# INLINE any #-} 495 | any f = foldr (\b r -> f b || r) False 496 | 497 | -- | /O(n)/ Returns true if all bytes in the sequence satisfy the predicate. 498 | all :: (Word8 -> Bool) -> Bytes -> Bool 499 | {-# INLINE all #-} 500 | all f = foldr (\b r -> f b && r) True 501 | 502 | -------------------------------------------------------------------------------- /test/Main.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MultiWayIf #-} 2 | {-# LANGUAGE BangPatterns #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE ScopedTypeVariables #-} 5 | {-# LANGUAGE TypeApplications #-} 6 | {-# OPTIONS_GHC -fno-warn-orphans #-} 7 | 8 | import Control.Monad.Trans.Writer (Writer, tell) 9 | import Data.Bytes.Chunks (Chunks (ChunksCons, ChunksNil)) 10 | import Data.Bytes.Types (Bytes (Bytes)) 11 | import Data.Char (ord) 12 | import Data.Primitive (ByteArray) 13 | import Data.Proxy (Proxy (..)) 14 | import Data.Text (Text) 15 | import Data.Word (Word8) 16 | import Test.Tasty (TestTree, defaultMain, testGroup) 17 | import Test.Tasty.HUnit (testCase, (@=?)) 18 | import Test.Tasty.QuickCheck 19 | ( ASCIIString (ASCIIString) 20 | , Arbitrary 21 | , Discard (Discard) 22 | , property 23 | , testProperty 24 | , (===) 25 | , (==>) 26 | ) 27 | 28 | import qualified Data.ByteString as ByteString 29 | import qualified Data.Bytes as Bytes 30 | import qualified Data.Bytes.Chunks as Chunks 31 | import qualified Data.Bytes.Text.Ascii as Ascii 32 | import qualified Data.Bytes.Text.AsciiExt as AsciiExt 33 | import qualified Data.Bytes.Text.Latin1 as Latin1 34 | import qualified Data.Foldable as Foldable 35 | import qualified Data.List as List 36 | import qualified Data.Primitive as PM 37 | import qualified Data.Text as Text 38 | import qualified Data.Text.Encoding as Text.Encoding 39 | import qualified GHC.Exts as Exts 40 | import qualified Test.QuickCheck.Classes as QCC 41 | import qualified Test.Tasty.HUnit as THU 42 | import qualified Test.Tasty.QuickCheck as TQC 43 | import qualified Data.Text.Array as A 44 | import qualified Data.Text.Internal as I 45 | 46 | main :: IO () 47 | main = defaultMain tests 48 | 49 | tests :: TestTree 50 | tests = 51 | testGroup 52 | "Bytes" 53 | [ lawsToTest (QCC.eqLaws (Proxy :: Proxy Bytes)) 54 | , lawsToTest (QCC.ordLaws (Proxy :: Proxy Bytes)) 55 | , lawsToTest (QCC.semigroupLaws (Proxy :: Proxy Bytes)) 56 | , lawsToTest (QCC.monoidLaws (Proxy :: Proxy Bytes)) 57 | , testGroup 58 | "toLowerAsciiByteArrayClone" 59 | [ testCase "A" $ 60 | THU.assertBool "" $ 61 | Bytes.isPrefixOf (bytes "hey") (bytes "hey man") 62 | ] 63 | , testGroup 64 | "isPrefixOf" 65 | [ testCase "A" $ 66 | AsciiExt.toLowerU (bytes "FooBar") 67 | @=? pack "foobar" 68 | , testCase "B" $ 69 | THU.assertBool "" $ 70 | not (Bytes.isPrefixOf (bytes "an") (bytes "hey man")) 71 | ] 72 | , testGroup 73 | "isSuffixOf" 74 | [ testCase "A" $ 75 | THU.assertBool "" $ 76 | Bytes.isSuffixOf (bytes "an") (bytes "hey man") 77 | , testCase "B" $ 78 | THU.assertBool "" $ 79 | not (Bytes.isSuffixOf (bytes "h") (bytes "hey man")) 80 | ] 81 | , testGroup 82 | "isInfixOf" 83 | [ testCase "small pattern A" $ 84 | THU.assertBool "" $ 85 | Bytes.isInfixOf (bytes "y m") (bytes "hey man") 86 | , testCase "small pattern B" $ 87 | THU.assertBool "" $ 88 | Bytes.isInfixOf (bytes "h") (bytes "hey man") 89 | , testCase "small pattern C" $ 90 | THU.assertBool "" $ 91 | Bytes.isInfixOf (bytes "an") (bytes "hey man") 92 | , testCase "small pattern D" $ 93 | THU.assertBool "" $ 94 | not (Bytes.isInfixOf (bytes "Y M") (bytes "hey man")) 95 | , testCase "large pattern A" $ 96 | THU.assertBool "" $ 97 | Bytes.isInfixOf (bytes "Hello, hello!") (bytes "I say hello. Hello, hello! I don't know why you say goodbye, I say hello!") 98 | , testCase "large pattern D" $ 99 | THU.assertBool "" $ 100 | not (Bytes.isInfixOf (bytes "Hello, hello!") (bytes "I don't know why you say goodbye, I say hello!")) 101 | , testCase "edge: empty pattern" $ 102 | THU.assertBool "" $ 103 | Bytes.isInfixOf (bytes "") (bytes "hello hello!") 104 | , testCase "edge: empty string" $ 105 | THU.assertBool "" $ 106 | not (Bytes.isInfixOf (bytes "hello hello!") (bytes "")) 107 | ] 108 | , testGroup 109 | "findIndices" 110 | [ testCase "A" $ 111 | Exts.fromList [4] 112 | @=? Bytes.findIndices (Bytes.fromByteArray (pack "greatest showman")) (Bytes.fromByteArray (pack "the greatest showman of all")) 113 | , testCase "B" $ 114 | Exts.fromList [4] 115 | @=? Bytes.findIndices (bytes "greatest showman") (bytes "the greatest showman of all") 116 | , testCase "C" $ 117 | Exts.fromList [0, 1, 2, 3, 4, 5] 118 | @=? Bytes.findIndices (bytes "a") (bytes "aaaaaa") 119 | , testCase "D" $ 120 | Exts.fromList [0, 2, 4] 121 | @=? Bytes.findIndices (bytes "aa") (bytes "aaaaaaa") 122 | ] 123 | , testGroup 124 | "replace-spec" 125 | [ testCase "A" $ 126 | Bytes.empty 127 | @=? Bytes.replace (Bytes.fromByteArray (pack "hello")) (Bytes.fromByteArray (pack "world")) Bytes.empty 128 | , testCase "B" $ 129 | bytes "xzybbcbbc" 130 | @=? Bytes.replace (bytes "a") (bytes "b") (bytes "xzyabcabc") 131 | , testCase "C" $ 132 | bytes "my favorite month is March!" 133 | @=? Bytes.replace (bytes "November") (bytes "March") (bytes "my favorite month is November!") 134 | , testCase "D" $ 135 | bytes "Saturn, Saturn, Mars, Saturn" 136 | @=? Bytes.replace (bytes "Jupiter") (bytes "Saturn") (bytes "Jupiter, Jupiter, Mars, Jupiter") 137 | ] 138 | , testGroup 139 | "stripOptionalSuffix" 140 | [ testCase "A" $ 141 | Ascii.fromString "hey m" 142 | @=? Bytes.stripOptionalSuffix (bytes "an") (bytes "hey man") 143 | , testCase "B" $ 144 | Ascii.fromString "hey man" 145 | @=? Bytes.stripOptionalSuffix (bytes "h") (bytes "hey man") 146 | ] 147 | , testGroup 148 | "longestCommonPrefix" 149 | [ testProperty "finds prefix" $ \(pre :: Bytes) (a :: Bytes) (b :: Bytes) -> 150 | if 151 | | Just (wa, _) <- Bytes.uncons a 152 | , Just (wb, _) <- Bytes.uncons b 153 | , wa /= wb -> 154 | Bytes.longestCommonPrefix (pre <> a) (pre <> b) === pre 155 | | otherwise -> property Discard 156 | , testProperty "finds no prefix" $ \(a :: Bytes) (b :: Bytes) -> 157 | if 158 | | Just (wa, _) <- Bytes.uncons a 159 | , Just (wb, _) <- Bytes.uncons b 160 | , wa /= wb -> 161 | Bytes.longestCommonPrefix a b === mempty 162 | | otherwise -> property Discard 163 | ] 164 | , testGroup 165 | "dropWhileEnd" 166 | [ testCase "A" $ 167 | Ascii.fromString "aabbcc" 168 | @=? Bytes.dropWhileEnd (== c2w 'b') (bytes "aabbccbb") 169 | ] 170 | , testGroup 171 | "takeWhileEnd" 172 | [ testCase "A" $ 173 | Ascii.fromString "bb" 174 | @=? Bytes.takeWhileEnd (== c2w 'b') (bytes "aabbccbb") 175 | , testCase "B" $ 176 | Ascii.fromString "" 177 | @=? Bytes.takeWhileEnd (/= c2w '\n') (bytes "aabbccbb\n") 178 | , testCase "C" $ 179 | slicedPack [0x1, 0x2, 0x3] 180 | @=? Bytes.takeWhileEnd (/= 0x0) (slicedPack [0x1, 0x0, 0x1, 0x2, 0x3]) 181 | ] 182 | , testProperty "decodeDecWord" $ \(w :: Word) -> 183 | Just w 184 | === Latin1.decodeDecWord (Bytes.unsafeDrop 1 (Exts.fromList (0xFF : (Exts.toList (Latin1.fromString (show w)))))) 185 | , testProperty "elem" $ \(x :: Word8) (xs :: [Word8]) -> 186 | List.elem x xs 187 | === Bytes.elem x (Bytes.unsafeDrop 1 (Exts.fromList (x : xs))) 188 | , testProperty "foldl" $ \(x :: Word8) (xs :: [Word8]) -> 189 | List.foldl (-) 0 xs 190 | === Bytes.foldl (-) 0 (Bytes.unsafeDrop 1 (Exts.fromList (x : xs))) 191 | , testProperty "foldlM" $ \(x :: Word8) (xs :: [Word8]) -> 192 | let f acc y = (tell [x] >> pure (acc - y)) :: Writer [Word8] Word8 193 | in Foldable.foldlM f 0 xs 194 | === Bytes.foldlM f 0 (Bytes.unsafeDrop 1 (Exts.fromList (x : xs))) 195 | , testProperty "foldrM" $ \(x :: Word8) (xs :: [Word8]) -> 196 | let f acc y = (tell [x] >> pure (acc - y)) :: Writer [Word8] Word8 197 | in Foldable.foldrM f 0 xs 198 | === Bytes.foldrM f 0 (Bytes.unsafeDrop 1 (Exts.fromList (x : xs))) 199 | , testProperty "foldl'" $ \(x :: Word8) (xs :: [Word8]) -> 200 | List.foldl' (-) 0 xs 201 | === Bytes.foldl' (-) 0 (Bytes.unsafeDrop 1 (Exts.fromList (x : xs))) 202 | , testProperty "foldr" $ \(x :: Word8) (xs :: [Word8]) -> 203 | Foldable.foldr (-) 0 xs 204 | === Bytes.foldr (-) 0 (Bytes.unsafeDrop 1 (Exts.fromList (x : xs))) 205 | , testProperty "foldr'" $ \(x :: Word8) (xs :: [Word8]) -> 206 | Foldable.foldr' (-) 0 xs 207 | === Bytes.foldr' (-) 0 (Bytes.unsafeDrop 1 (Exts.fromList (x : xs))) 208 | , testProperty "count" $ \(x :: Word8) (xs :: [Word8]) -> 209 | ByteString.count x (ByteString.pack xs) 210 | === Bytes.count x (slicedPack xs) 211 | , testProperty "split" $ \(x :: Word8) (xs :: [Word8]) -> 212 | not (List.null xs) 213 | ==> ByteString.split x (ByteString.pack xs) 214 | === map bytesToByteString (Bytes.split x (slicedPack xs)) 215 | , testProperty "intercalate" $ \(x :: Bytes) (xs :: [Bytes]) -> 216 | mconcat (List.intersperse x xs) 217 | === Bytes.intercalate x xs 218 | , testProperty "concatArray" $ \(xs :: [Bytes]) -> 219 | mconcat xs 220 | === Bytes.concatArray (Exts.fromList xs) 221 | , testProperty "splitNonEmpty" $ \(x :: Word8) (xs :: [Word8]) -> 222 | Bytes.split x (slicedPack xs) 223 | === Foldable.toList (Bytes.splitNonEmpty x (slicedPack xs)) 224 | , testProperty "splitInit" $ \(x :: Word8) (xs :: [Word8]) -> case xs of 225 | [] -> Bytes.splitInit x (slicedPack xs) === [] 226 | _ -> 227 | List.init (ByteString.split x (ByteString.pack xs)) 228 | === map bytesToByteString (Bytes.splitInit x (slicedPack xs)) 229 | , testProperty "splitU" $ \(x :: Word8) (xs :: [Word8]) -> 230 | not (List.null xs) 231 | ==> Bytes.split x (slicedPack xs) 232 | === map Bytes.fromByteArray (Exts.toList (Bytes.splitU x (slicedPack xs))) 233 | , testCase "splitInit-A" $ 234 | [Ascii.fromString "hello", Ascii.fromString "world"] 235 | @=? (Bytes.splitInit 0x0A (Ascii.fromString "hello\nworld\n")) 236 | , testCase "splitInit-B" $ 237 | [Ascii.fromString "hello", Ascii.fromString "world"] 238 | @=? (Bytes.splitInit 0x0A (Ascii.fromString "hello\nworld\nthere")) 239 | , testProperty "replace" $ \(ws :: ASCIIString) (xs :: ASCIIString) (ys :: ASCIIString) (zs :: ASCIIString) -> 240 | let ws' = asciiStringToBytes ws 241 | xs' = asciiStringToBytes xs 242 | ys' = asciiStringToBytes ys 243 | zs' = asciiStringToBytes zs 244 | ws'' = asciiStringToText ws 245 | xs'' = asciiStringToText xs 246 | ys'' = asciiStringToText ys 247 | zs'' = asciiStringToText zs 248 | in case ws of 249 | ASCIIString [] -> property Discard 250 | _ -> 251 | Bytes.replace ws' xs' (ys' <> ws' <> zs') 252 | === Bytes.fromByteString (Text.Encoding.encodeUtf8 (Text.replace ws'' xs'' (ys'' <> ws'' <> zs''))) 253 | , testProperty "splitEnd1" $ \(x :: Word8) (xs :: [Word8]) -> 254 | case ByteString.split x (ByteString.pack xs) of 255 | [] -> Bytes.splitEnd1 x (slicedPack xs) === Nothing 256 | [_] -> Bytes.splitEnd1 x (slicedPack xs) === Nothing 257 | [y1, z1] -> case Bytes.splitEnd1 x (slicedPack xs) of 258 | Nothing -> property False 259 | Just (y2, z2) -> (y1, z1) === (bytesToByteString y2, bytesToByteString z2) 260 | _ -> property Discard 261 | , testProperty "split1" $ \(x :: Word8) (xs :: [Word8]) -> 262 | case ByteString.split x (ByteString.pack xs) of 263 | [] -> Bytes.split1 x (slicedPack xs) === Nothing 264 | [_] -> Bytes.split1 x (slicedPack xs) === Nothing 265 | [y1, z1] -> case Bytes.split1 x (slicedPack xs) of 266 | Nothing -> property False 267 | Just (y2, z2) -> (y1, z1) === (bytesToByteString y2, bytesToByteString z2) 268 | _ -> property Discard 269 | , testProperty "splitTetragram1A" $ \(w0 :: Word8) (w1 :: Word8) (w2 :: Word8) (w3 :: Word8) (xs :: [Word8]) -> 270 | (w0 /= 0xEF && w1 /= 0xEF && w2 /= 0xEF && w3 /= 0xEF) 271 | ==> case Bytes.splitTetragram1 w0 w1 w2 w3 (slicedPack (0xEF : w0 : w1 : w2 : w3 : xs)) of 272 | Nothing -> property False 273 | Just (pre, post) -> 274 | (pre, post) === (Exts.fromList [0xEF], Exts.fromList xs) 275 | , testProperty "splitTetragram1B" $ \(w0 :: Word8) (w1 :: Word8) (w2 :: Word8) (w3 :: Word8) (xs :: [Word8]) -> 276 | (w0 /= 0xEF && w1 /= 0xEF && w2 /= 0xEF && w3 /= 0xEF) 277 | ==> case Bytes.splitTetragram1 w0 w1 w2 w3 (slicedPack (0xEF : 0xEF : 0xEF : 0xEF : w0 : w1 : w2 : w3 : xs)) of 278 | Nothing -> property False 279 | Just (pre, post) -> 280 | (pre, post) === (Exts.fromList (List.replicate 4 0xEF), Exts.fromList xs) 281 | , testProperty "splitTetragram1C" $ \(w0 :: Word8) (w1 :: Word8) (w2 :: Word8) (xs :: [Word8]) -> 282 | (w0 /= 0xEF && w1 /= 0xEF && w2 /= 0xEF) 283 | ==> case Bytes.splitTetragram1 w0 w1 w2 0xEF (slicedPack (xs ++ [w0, w1, w2, 0xEF])) of 284 | Nothing -> property False 285 | Just (pre, post) -> 286 | (pre, post) === (Exts.fromList xs, mempty) 287 | , testProperty "splitTetragram1D" $ \(w0 :: Word8) (w1 :: Word8) (w2 :: Word8) (xs :: [Word8]) -> 288 | (w0 /= 0xEF && w1 /= 0xEF && w2 /= 0xEF) 289 | ==> case Bytes.splitTetragram1 w0 w1 w2 0xEF (slicedPack (xs ++ [w0, w1, w2, 0xEF, 0xEF])) of 290 | Nothing -> property False 291 | Just (pre, post) -> 292 | (pre, post) === (Exts.fromList xs, Exts.fromList [0xEF]) 293 | , testProperty "split2" $ \(xs :: [Word8]) (ys :: [Word8]) (zs :: [Word8]) -> 294 | (all (/= 0xEF) xs && all (/= 0xEF) ys && all (/= 0xEF) zs) 295 | ==> case Bytes.split2 0xEF (slicedPack (xs ++ [0xEF] ++ ys ++ [0xEF] ++ zs)) of 296 | Just r -> r === (slicedPack xs, slicedPack ys, slicedPack zs) 297 | Nothing -> property False 298 | , testProperty "split3" $ \(ws :: [Word8]) (xs :: [Word8]) (ys :: [Word8]) (zs :: [Word8]) -> 299 | (all (/= 0xEF) ws && all (/= 0xEF) xs && all (/= 0xEF) ys && all (/= 0xEF) zs) 300 | ==> case Bytes.split3 0xEF (slicedPack (ws ++ [0xEF] ++ xs ++ [0xEF] ++ ys ++ [0xEF] ++ zs)) of 301 | Just r -> r === (slicedPack ws, slicedPack xs, slicedPack ys, slicedPack zs) 302 | Nothing -> property False 303 | , testProperty "split4" $ \(ws :: [Word8]) (xs :: [Word8]) (ys :: [Word8]) (zs :: [Word8]) (ps :: [Word8]) -> 304 | (all (/= 0xEF) ws && all (/= 0xEF) xs && all (/= 0xEF) ys && all (/= 0xEF) zs && all (/= 0xEF) ps) 305 | ==> case Bytes.split4 0xEF (slicedPack (ws ++ [0xEF] ++ xs ++ [0xEF] ++ ys ++ [0xEF] ++ zs ++ [0xEF] ++ ps)) of 306 | Just r -> r === (slicedPack ws, slicedPack xs, slicedPack ys, slicedPack zs, slicedPack ps) 307 | Nothing -> property False 308 | , testGroup 309 | "FNV-1a" 310 | [ testGroup 311 | "32-bit" 312 | [ testCase "empty" (Bytes.fnv1a32 Bytes.empty @=? 0x811c9dc5) 313 | , testCase "a" (Bytes.fnv1a32 (bytes "a") @=? 0xe40c292c) 314 | , testCase "foobar" (Bytes.fnv1a32 (bytes "foobar") @=? 0xbf9cf968) 315 | ] 316 | , testGroup 317 | "64-bit" 318 | [ testCase "empty" (Bytes.fnv1a64 Bytes.empty @=? 0xcbf29ce484222325) 319 | , testCase "a" (Bytes.fnv1a64 (bytes "a") @=? 0xaf63dc4c8601ec8c) 320 | , testCase "foobar" (Bytes.fnv1a64 (bytes "foobar") @=? 0x85944171f73967e8) 321 | , testCase "google.com" (Bytes.fnv1a64 (bytes "google.com") @=? 0xe1a2c1ae38dcdf45) 322 | ] 323 | ] 324 | , testGroup 325 | "Chunks" 326 | [ lawsToTest (QCC.eqLaws (Proxy :: Proxy Chunks)) 327 | , lawsToTest (QCC.semigroupLaws (Proxy :: Proxy Chunks)) 328 | , lawsToTest (QCC.monoidLaws (Proxy :: Proxy Chunks)) 329 | , testGroup 330 | "concatenation" 331 | [ testProperty "concat=concatU" $ \(c :: Chunks) -> 332 | Chunks.concat c === Bytes.fromByteArray (Chunks.concatU c) 333 | , testProperty "concat-singleton" $ \(b :: Bytes) -> 334 | Chunks.concat (ChunksCons b ChunksNil) === b 335 | , testProperty "concatU-singleton" $ \(b :: Bytes) -> 336 | Chunks.concatU (ChunksCons b ChunksNil) === Bytes.toByteArray b 337 | ] 338 | ] 339 | , testGroup 340 | "ASCII" 341 | [ testGroup 342 | "toText=naiveAsciiToText" 343 | [ testProperty "alpha" $ \(ws :: [Word8]) -> 344 | let b = slicedPack ws in naiveAsciiToText b === Ascii.toText b 345 | , testProperty "beta" $ \(ws :: [Word8]) -> 346 | let b = slicedPackAlt ws in naiveAsciiToText b === Ascii.toText b 347 | , testProperty "gamma" $ \(ASCIIString cs) -> 348 | let b = bytes cs in naiveAsciiToText b === Ascii.toText b 349 | ] 350 | ] 351 | ] 352 | 353 | bytes :: String -> Bytes 354 | bytes s = let b = pack ('x' : s) in Bytes b 1 (PM.sizeofByteArray b - 1) 355 | 356 | slicedPack :: [Word8] -> Bytes 357 | slicedPack s = 358 | let b = Exts.fromList ([0x00] ++ s ++ [0x00]) 359 | in Bytes b 1 (PM.sizeofByteArray b - 2) 360 | 361 | slicedPackAlt :: [Word8] -> Bytes 362 | slicedPackAlt s = 363 | let b = Exts.fromList ([0xFF] ++ s ++ [0x80]) 364 | in Bytes b 1 (PM.sizeofByteArray b - 2) 365 | 366 | pack :: String -> ByteArray 367 | pack = Exts.fromList . map (fromIntegral @Int @Word8 . ord) 368 | 369 | c2w :: Char -> Word8 370 | c2w = fromIntegral . ord 371 | 372 | bytesToByteString :: Bytes -> ByteString.ByteString 373 | bytesToByteString = ByteString.pack . Bytes.foldr (:) [] 374 | 375 | instance Arbitrary Bytes where 376 | arbitrary = do 377 | xs :: [Word8] <- TQC.arbitrary 378 | front <- TQC.choose (0, 2) 379 | back <- TQC.choose (0, 2) 380 | let frontPad = replicate front (254 :: Word8) 381 | let backPad = replicate back (254 :: Word8) 382 | let raw = Exts.fromList (frontPad ++ xs ++ backPad) 383 | pure (Bytes raw front (length xs)) 384 | 385 | instance Arbitrary Chunks where 386 | arbitrary = do 387 | xs :: [[Word8]] <- TQC.arbitrary 388 | let ys = 389 | map 390 | (\x -> Exts.fromList ([255] ++ x ++ [255])) 391 | xs 392 | zs = 393 | foldr 394 | ( \b cs -> 395 | ChunksCons (Bytes b 1 (PM.sizeofByteArray b - 2)) cs 396 | ) 397 | ChunksNil 398 | ys 399 | pure zs 400 | 401 | lawsToTest :: QCC.Laws -> TestTree 402 | lawsToTest (QCC.Laws name pairs) = testGroup name (map (uncurry TQC.testProperty) pairs) 403 | 404 | asciiStringToBytes :: ASCIIString -> Bytes 405 | asciiStringToBytes (ASCIIString x) = bytes x 406 | 407 | asciiStringToText :: ASCIIString -> Text 408 | asciiStringToText (ASCIIString x) = Text.pack x 409 | 410 | naiveAsciiToText :: Bytes -> Maybe Text 411 | {-# noinline naiveAsciiToText #-} 412 | naiveAsciiToText !b@(Bytes (PM.ByteArray arr) off len) = case Bytes.foldr (\w acc -> w < 128 && acc) True b of 413 | True -> Just (I.Text (A.ByteArray arr) off len) 414 | False -> Nothing 415 | -------------------------------------------------------------------------------- /src/Data/Bytes.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE BlockArguments #-} 3 | {-# LANGUAGE DataKinds #-} 4 | {-# LANGUAGE DuplicateRecordFields #-} 5 | {-# LANGUAGE KindSignatures #-} 6 | {-# LANGUAGE MagicHash #-} 7 | {-# LANGUAGE NamedFieldPuns #-} 8 | {-# LANGUAGE RankNTypes #-} 9 | {-# LANGUAGE UnboxedTuples #-} 10 | 11 | {- | If you are interested in sub-arrays of 'ByteArray's (e.g. writing a binary 12 | search), it would be grossly inefficient to make a copy of the sub-array. On 13 | the other hand, it'd be really annoying to track limit indices by hand. 14 | 15 | This module defines the 'Bytes' type which exposes a standard array interface 16 | for a sub-arrays without copying and without manual index manipulation. -- 17 | For mutable arrays, see 'Data.Bytes.Mutable'. 18 | -} 19 | module Data.Bytes 20 | ( -- * Types 21 | Bytes 22 | 23 | -- * Constants 24 | , Pure.empty 25 | , Pure.emptyPinned 26 | , Pure.emptyPinnedU 27 | 28 | -- * Properties 29 | , Pure.null 30 | , Pure.length 31 | 32 | -- * Decompose 33 | , uncons 34 | , unsnoc 35 | 36 | -- * Predicates 37 | , Pure.any 38 | , Pure.all 39 | 40 | -- * Create 41 | 42 | -- ** Sliced 43 | , singleton 44 | , doubleton 45 | , tripleton 46 | , Pure.replicate 47 | 48 | -- ** Unsliced 49 | , singletonU 50 | , doubletonU 51 | , tripletonU 52 | , Pure.replicateU 53 | 54 | -- * Filtering 55 | , takeWhile 56 | , dropWhile 57 | , takeWhileEnd 58 | , dropWhileEnd 59 | 60 | -- * Traversals 61 | , Pure.map 62 | , Pure.mapU 63 | 64 | -- * Folds 65 | , Pure.foldl 66 | , Pure.foldl' 67 | , Pure.foldr 68 | , Pure.foldr' 69 | 70 | -- * Folds with Indices 71 | , Pure.ifoldl' 72 | 73 | -- * Monadic Folds 74 | , Pure.foldlM 75 | , Pure.foldrM 76 | 77 | -- * Common Folds 78 | , elem 79 | 80 | -- * Splitting 81 | 82 | -- ** Unlimited 83 | , Byte.split 84 | , Byte.splitU 85 | , Byte.splitInit 86 | , Byte.splitInitU 87 | , Byte.splitNonEmpty 88 | , Byte.splitStream 89 | 90 | -- ** Fixed from Beginning 91 | , Byte.split1 92 | , Pure.splitTetragram1 93 | , Byte.split2 94 | , Byte.split3 95 | , Byte.split4 96 | 97 | -- ** Fixed from End 98 | , Byte.splitEnd1 99 | 100 | -- * Combining 101 | , intercalate 102 | , intercalateByte2 103 | , concatArray 104 | , concatArrayU 105 | 106 | -- * Searching 107 | , replace 108 | , findIndices 109 | , Pure.findTetragramIndex 110 | 111 | -- * Counting 112 | , Byte.count 113 | 114 | -- * Prefix and Suffix 115 | 116 | -- ** Byte Sequence 117 | , isPrefixOf 118 | , isSuffixOf 119 | , isInfixOf 120 | , stripPrefix 121 | , stripOptionalPrefix 122 | , stripSuffix 123 | , stripOptionalSuffix 124 | , longestCommonPrefix 125 | 126 | -- ** C Strings 127 | , stripCStringPrefix 128 | 129 | -- ** Single Byte 130 | , isBytePrefixOf 131 | , isByteSuffixOf 132 | 133 | -- * Equality 134 | 135 | -- ** Fixed Characters 136 | , equalsLatin1 137 | , equalsLatin2 138 | , equalsLatin3 139 | , equalsLatin4 140 | , equalsLatin5 141 | , equalsLatin6 142 | , equalsLatin7 143 | , equalsLatin8 144 | , equalsLatin9 145 | , equalsLatin10 146 | , equalsLatin11 147 | , equalsLatin12 148 | 149 | -- ** C Strings 150 | , equalsCString 151 | 152 | -- * Hashing 153 | , Pure.fnv1a32 154 | , Pure.fnv1a64 155 | 156 | -- * Unsafe Slicing 157 | , Pure.unsafeTake 158 | , Pure.unsafeDrop 159 | , Pure.unsafeIndex 160 | , Pure.unsafeHead 161 | 162 | -- * Copying 163 | , Pure.unsafeCopy 164 | 165 | -- * Pointers 166 | , Pure.pin 167 | , Pure.contents 168 | , touch 169 | 170 | -- * Conversion 171 | , Pure.toByteArray 172 | , Pure.toByteArrayClone 173 | , Pure.toPinnedByteArray 174 | , Pure.toPinnedByteArrayClone 175 | , fromAsciiString 176 | , fromLatinString 177 | , Pure.fromByteArray 178 | , Pure.fromPrimArray 179 | , toLatinString 180 | , fromCString# 181 | , Pure.toByteString 182 | , Pure.pinnedToByteString 183 | , Pure.fromByteString 184 | , Pure.fromLazyByteString 185 | , fromShortByteString 186 | , fromShortText 187 | , toShortByteString 188 | , toShortByteStringClone 189 | , toLowerAsciiByteArrayClone 190 | 191 | -- * I\/O with Handles 192 | , BIO.hGet 193 | , readFile 194 | , BIO.hPut 195 | 196 | -- * Unlifted Types 197 | , lift 198 | , unlift 199 | 200 | -- * Length Indexed 201 | , withLength 202 | , withLengthU 203 | ) where 204 | 205 | import Prelude hiding (all, any, dropWhile, elem, foldl, foldr, length, map, null, readFile, replicate, takeWhile) 206 | 207 | import Control.Monad.Primitive (PrimMonad, primitive_, unsafeIOToPrim) 208 | import Control.Monad.ST.Run (runByteArrayST) 209 | import Cstrlen (cstringLength#) 210 | import Data.ByteString.Short.Internal (ShortByteString (SBS)) 211 | import Data.Bytes.Pure (fromByteArray, length, toShortByteString, unsafeDrop, unsafeIndex) 212 | import Data.Bytes.Search (findIndices, isInfixOf, replace) 213 | import Data.Bytes.Types (ByteArrayN (ByteArrayN), Bytes (Bytes, array, offset), BytesN (BytesN)) 214 | import Data.Primitive (Array, ByteArray (ByteArray)) 215 | import Data.Text.Short (ShortText) 216 | import Foreign.C.String (CString) 217 | import Foreign.Ptr (Ptr, castPtr, plusPtr) 218 | import GHC.Exts (Addr#, Int (I#), Int#, Ptr (Ptr), Word#) 219 | import GHC.Word (Word8 (W8#)) 220 | import Reps (Bytes# (..), word8ToWord#) 221 | 222 | import qualified Arithmetic.Nat as Nat 223 | import qualified Arithmetic.Types as Arithmetic 224 | import qualified Data.Bytes.Byte as Byte 225 | import qualified Data.Bytes.Chunks as Chunks 226 | import qualified Data.Bytes.IO as BIO 227 | import qualified Data.Bytes.Pure as Pure 228 | import qualified Data.Bytes.Text.Ascii as Ascii 229 | import qualified Data.Bytes.Text.AsciiExt as AsciiExt 230 | import qualified Data.Bytes.Text.Latin1 as Latin1 231 | import qualified Data.Bytes.Types as Types 232 | import qualified Data.Foldable as F 233 | import qualified Data.List as List 234 | import qualified Data.Primitive as PM 235 | import qualified Data.Primitive.Ptr as PM 236 | import qualified Data.Text.Short as TS 237 | import qualified GHC.Exts as Exts 238 | import qualified GHC.TypeNats as GHC 239 | 240 | {- | Extract the head and tail of the 'Bytes', returning 'Nothing' if 241 | it is empty. 242 | -} 243 | uncons :: Bytes -> Maybe (Word8, Bytes) 244 | {-# INLINE uncons #-} 245 | uncons b = case length b of 246 | 0 -> Nothing 247 | _ -> Just (unsafeIndex b 0, unsafeDrop 1 b) 248 | 249 | {- | Extract the @init@ and @last@ of the 'Bytes', returning 'Nothing' if 250 | it is empty. 251 | -} 252 | unsnoc :: Bytes -> Maybe (Bytes, Word8) 253 | {-# INLINE unsnoc #-} 254 | unsnoc b@(Bytes arr off len) = case len of 255 | 0 -> Nothing 256 | _ -> 257 | let !len' = len - 1 258 | in Just (Bytes arr off len', unsafeIndex b len') 259 | 260 | {- | Does the byte sequence begin with the given byte? False if the 261 | byte sequence is empty. 262 | -} 263 | isBytePrefixOf :: Word8 -> Bytes -> Bool 264 | {-# INLINE isBytePrefixOf #-} 265 | isBytePrefixOf w b = case length b of 266 | 0 -> False 267 | _ -> unsafeIndex b 0 == w 268 | 269 | {- | Does the byte sequence end with the given byte? False if the 270 | byte sequence is empty. 271 | -} 272 | isByteSuffixOf :: Word8 -> Bytes -> Bool 273 | isByteSuffixOf w b = case len of 274 | 0 -> False 275 | _ -> unsafeIndex b (len - 1) == w 276 | where 277 | len = length b 278 | 279 | -- | Is the first argument a prefix of the second argument? 280 | isPrefixOf :: Bytes -> Bytes -> Bool 281 | isPrefixOf (Bytes a aOff aLen) (Bytes b bOff bLen) = 282 | -- For prefix and suffix testing, we do not use 283 | -- the sameByteArray optimization that we use in 284 | -- the Eq instance. Prefix and suffix testing seldom 285 | -- compares a byte array with the same in-memory 286 | -- byte array. 287 | if aLen <= bLen 288 | then compareByteArrays a aOff b bOff aLen == EQ 289 | else False 290 | 291 | -- | Is the first argument a suffix of the second argument? 292 | isSuffixOf :: Bytes -> Bytes -> Bool 293 | isSuffixOf (Bytes a aOff aLen) (Bytes b bOff bLen) = 294 | if aLen <= bLen 295 | then compareByteArrays a aOff b (bOff + bLen - aLen) aLen == EQ 296 | else False 297 | 298 | -- | Find the longest string which is a prefix of both arguments. 299 | longestCommonPrefix :: Bytes -> Bytes -> Bytes 300 | longestCommonPrefix a b = loop 0 301 | where 302 | loop :: Int -> Bytes 303 | loop !into 304 | | into < maxLen 305 | && unsafeIndex a into == unsafeIndex b into = 306 | loop (into + 1) 307 | | otherwise = Pure.unsafeTake into a 308 | maxLen = min (length a) (length b) 309 | 310 | -- | Create a byte sequence with one byte. 311 | singleton :: Word8 -> Bytes 312 | {-# INLINE singleton #-} 313 | singleton !a = Bytes (singletonU a) 0 1 314 | 315 | -- | Create a byte sequence with two bytes. 316 | doubleton :: Word8 -> Word8 -> Bytes 317 | {-# INLINE doubleton #-} 318 | doubleton !a !b = Bytes (doubletonU a b) 0 2 319 | 320 | -- | Create a byte sequence with three bytes. 321 | tripleton :: Word8 -> Word8 -> Word8 -> Bytes 322 | {-# INLINE tripleton #-} 323 | tripleton !a !b !c = Bytes (tripletonU a b c) 0 3 324 | 325 | -- | Create an unsliced byte sequence with one byte. 326 | singletonU :: Word8 -> ByteArray 327 | {-# INLINE singletonU #-} 328 | singletonU !a = runByteArrayST do 329 | arr <- PM.newByteArray 1 330 | PM.writeByteArray arr 0 a 331 | PM.unsafeFreezeByteArray arr 332 | 333 | -- | Create an unsliced byte sequence with two bytes. 334 | doubletonU :: Word8 -> Word8 -> ByteArray 335 | {-# INLINE doubletonU #-} 336 | doubletonU !a !b = runByteArrayST do 337 | arr <- PM.newByteArray 2 338 | PM.writeByteArray arr 0 a 339 | PM.writeByteArray arr 1 b 340 | PM.unsafeFreezeByteArray arr 341 | 342 | -- | Create an unsliced byte sequence with three bytes. 343 | tripletonU :: Word8 -> Word8 -> Word8 -> ByteArray 344 | {-# INLINE tripletonU #-} 345 | tripletonU !a !b !c = runByteArrayST do 346 | arr <- PM.newByteArray 3 347 | PM.writeByteArray arr 0 a 348 | PM.writeByteArray arr 1 b 349 | PM.writeByteArray arr 2 c 350 | PM.unsafeFreezeByteArray arr 351 | 352 | {- | /O(n)/ Return the suffix of the second string if its prefix 353 | matches the entire first string. 354 | -} 355 | stripPrefix :: Bytes -> Bytes -> Maybe Bytes 356 | stripPrefix !pre !str = 357 | if pre `isPrefixOf` str 358 | then Just (Bytes (array str) (offset str + length pre) (length str - length pre)) 359 | else Nothing 360 | 361 | {- | /O(n)/ Return the suffix of the second string if its prefix 362 | matches the entire first string. Otherwise, return the second 363 | string unchanged. 364 | -} 365 | stripOptionalPrefix :: Bytes -> Bytes -> Bytes 366 | stripOptionalPrefix !pre !str = 367 | if pre `isPrefixOf` str 368 | then Bytes (array str) (offset str + length pre) (length str - length pre) 369 | else str 370 | 371 | {- | /O(n)/ Return the prefix of the second string if its suffix 372 | matches the entire first string. 373 | -} 374 | stripSuffix :: Bytes -> Bytes -> Maybe Bytes 375 | stripSuffix !suf !str = 376 | if suf `isSuffixOf` str 377 | then Just (Bytes (array str) (offset str) (length str - length suf)) 378 | else Nothing 379 | 380 | {- | /O(n)/ Return the prefix of the second string if its suffix 381 | matches the entire first string. Otherwise, return the second 382 | string unchanged. 383 | -} 384 | stripOptionalSuffix :: Bytes -> Bytes -> Bytes 385 | stripOptionalSuffix !suf !str = 386 | if suf `isSuffixOf` str 387 | then Bytes (array str) (offset str) (length str - length suf) 388 | else str 389 | 390 | -- | Is the byte a member of the byte sequence? 391 | elem :: Word8 -> Bytes -> Bool 392 | elem (W8# w) b = case elemLoop 0# (word8ToWord# w) b of 393 | 1# -> True 394 | _ -> False 395 | 396 | elemLoop :: Int# -> Word# -> Bytes -> Int# 397 | elemLoop !r !w (Bytes arr@(ByteArray arr#) off@(I# off#) len) = case len of 398 | 0 -> r 399 | _ -> elemLoop (Exts.orI# r (Exts.eqWord# w (word8ToWord# (Exts.indexWord8Array# arr# off#)))) w (Bytes arr (off + 1) (len - 1)) 400 | 401 | -- | Take bytes while the predicate is true. 402 | takeWhile :: (Word8 -> Bool) -> Bytes -> Bytes 403 | {-# INLINE takeWhile #-} 404 | takeWhile k b = Pure.unsafeTake (Pure.countWhile k b) b 405 | 406 | -- | Drop bytes while the predicate is true. 407 | dropWhile :: (Word8 -> Bool) -> Bytes -> Bytes 408 | {-# INLINE dropWhile #-} 409 | dropWhile k b = Pure.unsafeDrop (Pure.countWhile k b) b 410 | 411 | {- | /O(n)/ 'dropWhileEnd' @p@ @b@ returns the prefix remaining after 412 | dropping characters that satisfy the predicate @p@ from the end of 413 | @t@. 414 | -} 415 | dropWhileEnd :: (Word8 -> Bool) -> Bytes -> Bytes 416 | {-# INLINE dropWhileEnd #-} 417 | dropWhileEnd k !b = Pure.unsafeTake (length b - Pure.countWhileEnd k b) b 418 | 419 | {- | /O(n)/ 'takeWhileEnd' @p@ @b@ returns the longest suffix of 420 | elements that satisfy predicate @p@. 421 | -} 422 | takeWhileEnd :: (Word8 -> Bool) -> Bytes -> Bytes 423 | {-# INLINE takeWhileEnd #-} 424 | takeWhileEnd k !b = 425 | let n = Pure.countWhileEnd k b 426 | in Bytes (array b) (offset b + length b - n) n 427 | 428 | {- | Convert a 'String' consisting of only characters in the ASCII block 429 | to a byte sequence. Any character with a codepoint above @U+007F@ is 430 | replaced by @U+0000@. 431 | -} 432 | fromAsciiString :: String -> Bytes 433 | {-# DEPRECATED fromAsciiString "use Data.Bytes.Text.Ascii.fromString instead" #-} 434 | {-# INLINE fromAsciiString #-} 435 | fromAsciiString = Ascii.fromString 436 | 437 | {- | Convert a 'String' consisting of only characters representable 438 | by ISO-8859-1. These are encoded with ISO-8859-1. Any character 439 | with a codepoint above @U+00FF@ is replaced by an unspecified byte. 440 | -} 441 | fromLatinString :: String -> Bytes 442 | {-# DEPRECATED fromLatinString "use Data.Bytes.Text.Latin1.fromString instead" #-} 443 | {-# INLINE fromLatinString #-} 444 | fromLatinString = Latin1.fromString 445 | 446 | -- | Interpret a byte sequence as text encoded by ISO-8859-1. 447 | toLatinString :: Bytes -> String 448 | {-# DEPRECATED toLatinString "use Data.Bytes.Text.Latin1.toString instead" #-} 449 | {-# INLINE toLatinString #-} 450 | toLatinString = Latin1.toString 451 | 452 | -- | Copy a primitive string literal into managed memory. 453 | fromCString# :: Addr# -> Bytes 454 | fromCString# a = 455 | Bytes 456 | ( runByteArrayST $ do 457 | dst@(PM.MutableByteArray dst#) <- PM.newByteArray len 458 | PM.copyPtrToMutablePrimArray 459 | (PM.MutablePrimArray dst#) 460 | 0 461 | (Ptr a :: Ptr Word8) 462 | len 463 | PM.unsafeFreezeByteArray dst 464 | ) 465 | 0 466 | len 467 | where 468 | len = I# (cstringLength# a) 469 | 470 | compareByteArrays :: ByteArray -> Int -> ByteArray -> Int -> Int -> Ordering 471 | {-# INLINE compareByteArrays #-} 472 | compareByteArrays (ByteArray ba1#) (I# off1#) (ByteArray ba2#) (I# off2#) (I# n#) = 473 | compare (I# (Exts.compareByteArrays# ba1# off1# ba2# off2# n#)) 0 474 | 475 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 476 | a singleton whose element matches the character? 477 | -} 478 | equalsLatin1 :: Char -> Bytes -> Bool 479 | {-# DEPRECATED equalsLatin1 "use Data.Bytes.Text.Latin1.equals1 instead" #-} 480 | {-# INLINE equalsLatin1 #-} 481 | equalsLatin1 = Latin1.equals1 482 | 483 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 484 | a doubleton whose elements match the characters? 485 | -} 486 | equalsLatin2 :: Char -> Char -> Bytes -> Bool 487 | {-# DEPRECATED equalsLatin2 "use Data.Bytes.Text.Latin1.equals2 instead" #-} 488 | {-# INLINE equalsLatin2 #-} 489 | equalsLatin2 = Latin1.equals2 490 | 491 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 492 | a tripleton whose elements match the characters? 493 | -} 494 | equalsLatin3 :: Char -> Char -> Char -> Bytes -> Bool 495 | {-# DEPRECATED equalsLatin3 "use Data.Bytes.Text.Latin1.equals3 instead" #-} 496 | {-# INLINE equalsLatin3 #-} 497 | equalsLatin3 = Latin1.equals3 498 | 499 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 500 | a quadrupleton whose elements match the characters? 501 | -} 502 | equalsLatin4 :: Char -> Char -> Char -> Char -> Bytes -> Bool 503 | {-# DEPRECATED equalsLatin4 "use Data.Bytes.Text.Latin1.equals4 instead" #-} 504 | {-# INLINE equalsLatin4 #-} 505 | equalsLatin4 = Latin1.equals4 506 | 507 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 508 | a quintupleton whose elements match the characters? 509 | -} 510 | equalsLatin5 :: Char -> Char -> Char -> Char -> Char -> Bytes -> Bool 511 | {-# DEPRECATED equalsLatin5 "use Data.Bytes.Text.Latin1.equals5 instead" #-} 512 | {-# INLINE equalsLatin5 #-} 513 | equalsLatin5 = Latin1.equals5 514 | 515 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 516 | a sextupleton whose elements match the characters? 517 | -} 518 | equalsLatin6 :: Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool 519 | {-# DEPRECATED equalsLatin6 "use Data.Bytes.Text.Latin1.equals6 instead" #-} 520 | {-# INLINE equalsLatin6 #-} 521 | equalsLatin6 = Latin1.equals6 522 | 523 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 524 | a septupleton whose elements match the characters? 525 | -} 526 | equalsLatin7 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool 527 | {-# DEPRECATED equalsLatin7 "use Data.Bytes.Text.Latin1.equals7 instead" #-} 528 | {-# INLINE equalsLatin7 #-} 529 | equalsLatin7 = Latin1.equals7 530 | 531 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 532 | an octupleton whose elements match the characters? 533 | -} 534 | equalsLatin8 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool 535 | {-# DEPRECATED equalsLatin8 "use Data.Bytes.Text.Latin1.equals8 instead" #-} 536 | {-# INLINE equalsLatin8 #-} 537 | equalsLatin8 = Latin1.equals8 538 | 539 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 540 | a 9-tuple whose elements match the characters? 541 | -} 542 | equalsLatin9 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool 543 | {-# DEPRECATED equalsLatin9 "use Data.Bytes.Text.Latin1.equals9 instead" #-} 544 | {-# INLINE equalsLatin9 #-} 545 | equalsLatin9 = Latin1.equals9 546 | 547 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 548 | a 10-tuple whose elements match the characters? 549 | -} 550 | equalsLatin10 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool 551 | {-# DEPRECATED equalsLatin10 "use Data.Bytes.Text.Latin1.equals10 instead" #-} 552 | {-# INLINE equalsLatin10 #-} 553 | equalsLatin10 = Latin1.equals10 554 | 555 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 556 | a 11-tuple whose elements match the characters? 557 | -} 558 | equalsLatin11 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool 559 | {-# DEPRECATED equalsLatin11 "use Data.Bytes.Text.Latin1.equals11 instead" #-} 560 | {-# INLINE equalsLatin11 #-} 561 | equalsLatin11 = Latin1.equals11 562 | 563 | {- | Is the byte sequence, when interpreted as ISO-8859-1-encoded text, 564 | a 12-tuple whose elements match the characters? 565 | -} 566 | equalsLatin12 :: Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Char -> Bytes -> Bool 567 | {-# DEPRECATED equalsLatin12 "use Data.Bytes.Text.Latin1.equals12 instead" #-} 568 | {-# INLINE equalsLatin12 #-} 569 | equalsLatin12 = Latin1.equals12 570 | 571 | {- | Is the byte sequence equal to the @NUL@-terminated C String? 572 | The C string must be a constant. 573 | -} 574 | equalsCString :: CString -> Bytes -> Bool 575 | {-# INLINE equalsCString #-} 576 | equalsCString !ptr0 (Bytes arr off0 len0) = go (castPtr ptr0 :: Ptr Word8) off0 len0 577 | where 578 | go !ptr !off !len = case len of 579 | 0 -> PM.indexOffPtr ptr 0 == (0 :: Word8) 580 | _ -> case PM.indexOffPtr ptr 0 of 581 | 0 -> False 582 | c -> c == PM.indexByteArray arr off && go (plusPtr ptr 1) (off + 1) (len - 1) 583 | 584 | {- | /O(n)/ Variant of 'stripPrefix' that takes a @NUL@-terminated C String 585 | as the prefix to test for. 586 | -} 587 | stripCStringPrefix :: CString -> Bytes -> Maybe Bytes 588 | {-# INLINE stripCStringPrefix #-} 589 | stripCStringPrefix !ptr0 (Bytes arr off0 len0) = go (castPtr ptr0 :: Ptr Word8) off0 len0 590 | where 591 | go !ptr !off !len = case PM.indexOffPtr ptr 0 of 592 | 0 -> Just (Bytes arr off len) 593 | c -> case len of 594 | 0 -> Nothing 595 | _ -> case c == PM.indexByteArray arr off of 596 | True -> go (plusPtr ptr 1) (off + 1) (len - 1) 597 | False -> Nothing 598 | 599 | {- | Touch the byte array backing the byte sequence. This sometimes needed 600 | after calling 'Pure.contents' so that the @ByteArray@ does not get garbage 601 | collected. 602 | -} 603 | touch :: (PrimMonad m) => Bytes -> m () 604 | touch (Bytes (ByteArray arr) _ _) = 605 | unsafeIOToPrim 606 | (primitive_ (\s -> Exts.touch# arr s)) 607 | 608 | -- | Read an entire file strictly into a 'Bytes'. 609 | readFile :: FilePath -> IO Bytes 610 | readFile f = Chunks.concat <$> Chunks.readFile f 611 | 612 | {- | /O(n)/ The intercalate function takes a separator 'Bytes' and a list of 613 | 'Bytes' and concatenates the list elements by interspersing the separator 614 | between each element. 615 | -} 616 | intercalate :: 617 | -- | Separator (interspersed into the list) 618 | Bytes -> 619 | -- | List 620 | [Bytes] -> 621 | Bytes 622 | intercalate !_ [] = mempty 623 | intercalate !_ [x] = x 624 | intercalate (Bytes sarr soff slen) (Bytes arr0 off0 len0 : bs) = Bytes r 0 fullLen 625 | where 626 | !fullLen = List.foldl' (\acc (Bytes _ _ len) -> acc + len + slen) 0 bs + len0 627 | r = runByteArrayST $ do 628 | marr <- PM.newByteArray fullLen 629 | PM.copyByteArray marr 0 arr0 off0 len0 630 | !_ <- 631 | F.foldlM 632 | ( \ !currLen (Bytes arr off len) -> do 633 | PM.copyByteArray marr currLen sarr soff slen 634 | PM.copyByteArray marr (currLen + slen) arr off len 635 | pure (currLen + len + slen) 636 | ) 637 | len0 638 | bs 639 | PM.unsafeFreezeByteArray marr 640 | 641 | {- | Specialization of 'intercalate' where the separator is a single byte and 642 | there are exactly two byte sequences that are being concatenated. 643 | -} 644 | intercalateByte2 :: 645 | -- | Separator 646 | Word8 -> 647 | -- | First byte sequence 648 | Bytes -> 649 | -- | Second byte sequence 650 | Bytes -> 651 | Bytes 652 | intercalateByte2 !sep !a !b = 653 | Bytes 654 | { Types.array = runByteArrayST $ do 655 | dst <- PM.newByteArray len 656 | Pure.unsafeCopy dst 0 a 657 | PM.writeByteArray dst (length a) sep 658 | Pure.unsafeCopy dst (length a + 1) b 659 | PM.unsafeFreezeByteArray dst 660 | , Types.length = len 661 | , Types.offset = 0 662 | } 663 | where 664 | len = length a + length b + 1 665 | 666 | {- | Variant of 'toShortByteString' that unconditionally makes a copy of 667 | the array backing the sliced 'Bytes' even if the original array 668 | could be reused. Prefer 'toShortByteString'. 669 | -} 670 | toShortByteStringClone :: Bytes -> ShortByteString 671 | {-# INLINE toShortByteStringClone #-} 672 | toShortByteStringClone !b = case Pure.toByteArrayClone b of 673 | PM.ByteArray x -> SBS x 674 | 675 | -- | /O(1)/ Create 'Bytes' from a 'ShortByteString'. 676 | fromShortByteString :: ShortByteString -> Bytes 677 | {-# INLINE fromShortByteString #-} 678 | fromShortByteString (SBS x) = fromByteArray (ByteArray x) 679 | 680 | {- | /O(1)/ Create 'Bytes' from a 'ShortText'. This encodes the text as UTF-8. 681 | It is a no-op. 682 | -} 683 | fromShortText :: ShortText -> Bytes 684 | {-# INLINE fromShortText #-} 685 | fromShortText t = case TS.toShortByteString t of 686 | SBS x -> fromByteArray (ByteArray x) 687 | 688 | {- | /O(n)/ Interpreting the bytes an ASCII-encoded characters, convert 689 | the string to lowercase. This adds @0x20@ to bytes in the range 690 | @[0x41,0x5A]@ and leaves all other bytes alone. Unconditionally 691 | copies the bytes. 692 | -} 693 | toLowerAsciiByteArrayClone :: Bytes -> ByteArray 694 | {-# DEPRECATED toLowerAsciiByteArrayClone "use Data.Bytes/Text/AsciiExt.toLowerU" #-} 695 | {-# INLINE toLowerAsciiByteArrayClone #-} 696 | toLowerAsciiByteArrayClone = AsciiExt.toLowerU 697 | 698 | lift :: Bytes# -> Bytes 699 | {-# INLINE lift #-} 700 | lift (Bytes# (# arr, off, len #)) = Bytes (ByteArray arr) (I# off) (I# len) 701 | 702 | unlift :: Bytes -> Bytes# 703 | {-# INLINE unlift #-} 704 | unlift (Bytes (ByteArray arr) (I# off) (I# len)) = 705 | Bytes# (# arr, off, len #) 706 | 707 | concatArrayU :: Array Bytes -> ByteArray 708 | {-# NOINLINE concatArrayU #-} 709 | concatArrayU !xs = runByteArrayST $ do 710 | let !arrLen = PM.sizeofArray xs 711 | let !totalByteLen = F.foldl' (\acc b -> length b + acc) 0 xs 712 | dst <- PM.newByteArray totalByteLen 713 | let go !ix !dstOff = 714 | if ix < arrLen 715 | then do 716 | x <- PM.indexArrayM xs ix 717 | Pure.unsafeCopy dst dstOff x 718 | go (ix + 1) (dstOff + length x) 719 | else PM.unsafeFreezeByteArray dst 720 | go 0 0 721 | 722 | concatArray :: Array Bytes -> Bytes 723 | {-# INLINE concatArray #-} 724 | concatArray !xs = Pure.fromByteArray (concatArrayU xs) 725 | 726 | {- | Convert 'Bytes' to 'BytesN', exposing the length in a type-safe 727 | way in the callback. 728 | -} 729 | withLength :: 730 | Bytes -> 731 | (forall (n :: GHC.Nat). Arithmetic.Nat n -> BytesN n -> a) -> 732 | a 733 | {-# INLINE withLength #-} 734 | withLength Bytes {array, offset, length = len} f = 735 | Nat.with 736 | len 737 | (\n -> f n BytesN {array, offset}) 738 | 739 | withLengthU :: 740 | ByteArray -> 741 | (forall (n :: GHC.Nat). Arithmetic.Nat n -> ByteArrayN n -> a) -> 742 | a 743 | {-# INLINE withLengthU #-} 744 | withLengthU !arr f = 745 | Nat.with 746 | (PM.sizeofByteArray arr) 747 | (\n -> f n (ByteArrayN arr)) 748 | --------------------------------------------------------------------------------