├── benchmarks ├── .gitignore ├── python │ ├── .gitignore │ ├── cut.py │ ├── sort.py │ ├── utils.py │ ├── strip_tags.py │ └── multilang.py ├── Setup.hs ├── ruby │ ├── utils.rb │ ├── sort.rb │ ├── cut.rb │ ├── strip_tags.rb │ └── fold.rb ├── cbits │ └── time_iconv.c ├── haskell │ ├── Multilang.hs │ ├── Timer.hs │ ├── Benchmarks │ │ ├── EncodeUtf8.hs │ │ ├── Replace.hs │ │ ├── WordFrequencies.hs │ │ ├── FileRead.hs │ │ ├── Programs │ │ │ ├── BigTable.hs │ │ │ ├── StripTags.hs │ │ │ ├── Throughput.hs │ │ │ ├── Fold.hs │ │ │ ├── Sort.hs │ │ │ └── Cut.hs │ │ ├── Equality.hs │ │ ├── Search.hs │ │ ├── FoldLines.hs │ │ ├── DecodeUtf8.hs │ │ ├── Builder.hs │ │ ├── Stream.hs │ │ └── ReadNumbers.hs │ └── Benchmarks.hs └── text-benchmarks.cabal ├── tests ├── .ghci ├── Tests.hs ├── Makefile ├── Tests │ ├── IO.hs │ ├── SlowFunctions.hs │ ├── Utils.hs │ └── Regressions.hs ├── scripts │ └── cover-stdio.sh └── text-tests.cabal ├── Setup.lhs ├── .hgignore ├── Data └── Text │ ├── Util.hs │ ├── Encoding │ ├── Utf32.hs │ ├── Utf16.hs │ ├── Fusion │ │ └── Common.hs │ ├── Error.hs │ ├── Utf8.hs │ └── Fusion.hs │ ├── Lazy │ ├── Builder │ │ ├── RealFloat │ │ │ └── Functions.hs │ │ ├── Int │ │ │ └── Digits.hs │ │ └── Functions.hs │ ├── Builder.hs │ ├── Internal.hs │ ├── Fusion.hs │ ├── Search.hs │ ├── IO.hs │ └── Read.hs │ ├── Private.hs │ ├── Unsafe │ └── Base.hs │ ├── UnsafeShift.hs │ ├── UnsafeChar.hs │ ├── Search.hs │ ├── Unsafe.hs │ ├── Fusion │ ├── Internal.hs │ └── Size.hs │ ├── Internal.hs │ ├── Foreign.hs │ ├── IO │ └── Internal.hs │ ├── Array.hs │ ├── Fusion.hs │ └── Read.hs ├── scripts ├── ApiCompare.hs ├── CaseMapping.hs ├── CaseFolding.hs ├── Arsec.hs └── SpecialCasing.hs ├── LICENSE ├── README.markdown ├── .hgtags ├── tests-and-benchmarks.markdown ├── cbits └── cbits.c └── text.cabal /benchmarks/.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | -------------------------------------------------------------------------------- /tests/.ghci: -------------------------------------------------------------------------------- 1 | :set -DHAVE_DEEPSEQ -isrc -i../.. 
2 | -------------------------------------------------------------------------------- /benchmarks/python/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | -------------------------------------------------------------------------------- /benchmarks/Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /Setup.lhs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env runhaskell 2 | > import Distribution.Simple 3 | > main = defaultMain 4 | -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | ^(?:dist|benchmarks/dist|tests/coverage|tests/dist)$ 2 | ^benchmarks/.*\.txt$ 3 | ^tests/text-testdata.tar.bz2$ 4 | ^tests/(?:\.hpc|bm|qc|qc-hpc|stdio-hpc|text/test)$ 5 | \.(?:aux|eventlog|h[ip]|log|[oa]|orig|prof|ps|rej|swp|tix)$ 6 | ~$ 7 | 8 | syntax: glob 9 | .\#* 10 | cabal-dev 11 | -------------------------------------------------------------------------------- /benchmarks/ruby/utils.rb: -------------------------------------------------------------------------------- 1 | require 'benchmark' 2 | 3 | def benchmark(&block) 4 | runs = 100 5 | total = 0 6 | 7 | runs.times do |i| 8 | result = Benchmark.measure(&block).total 9 | $stderr.puts "Run #{i}: #{result}" 10 | total += result 11 | end 12 | 13 | total / runs 14 | end 15 | -------------------------------------------------------------------------------- /benchmarks/ruby/sort.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require './utils.rb' 4 | 5 | def sort(filename) 6 | File.open(filename, 'r:utf-8') do |file| 7 | content = file.read 8 | puts content.lines.sort.join 9 | end 10 | end 11 | 12 | ARGV.each do |f| 13 | t = benchmark { sort(f) } 14 | STDERR.puts "#{f}: #{t}" 15 | end 16 | -------------------------------------------------------------------------------- /benchmarks/ruby/cut.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require './utils.rb' 4 | 5 | def cut(filename, l, r) 6 | File.open(filename, 'r:utf-8') do |file| 7 | file.each_line do |line| 8 | puts line[l, r - l] 9 | end 10 | end 11 | end 12 | 13 | ARGV.each do |f| 14 | t = benchmark { cut(f, 20, 40) } 15 | STDERR.puts "#{f}: #{t}" 16 | end 17 | -------------------------------------------------------------------------------- /benchmarks/python/cut.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import utils, sys, codecs 4 | 5 | def cut(filename, l, r): 6 | content = open(filename, encoding='utf-8') 7 | for line in content: 8 | print(line[l:r]) 9 | 10 | for f in sys.argv[1:]: 11 | t = utils.benchmark(lambda: cut(f, 20, 40)) 12 | sys.stderr.write('{0}: {1}\n'.format(f, t)) 13 | -------------------------------------------------------------------------------- /tests/Tests.hs: -------------------------------------------------------------------------------- 1 | -- | Provides a simple main function which runs all the tests 2 | -- 3 | module Main 4 | ( main 5 | ) where 6 | 7 | import Test.Framework (defaultMain) 8 | 9 | import qualified Tests.Properties as Properties 10 | 
import qualified Tests.Regressions as Regressions 11 | 12 | main :: IO () 13 | main = defaultMain [Properties.tests, Regressions.tests] 14 | -------------------------------------------------------------------------------- /benchmarks/python/sort.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import utils, sys, codecs 4 | 5 | def sort(filename): 6 | content = open(filename, encoding='utf-8').read() 7 | lines = content.splitlines() 8 | lines.sort() 9 | print('\n'.join(lines)) 10 | 11 | for f in sys.argv[1:]: 12 | t = utils.benchmark(lambda: sort(f)) 13 | sys.stderr.write('{0}: {1}\n'.format(f, t)) 14 | -------------------------------------------------------------------------------- /benchmarks/python/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, time 4 | 5 | def benchmark_once(f): 6 | start = time.time() 7 | f() 8 | end = time.time() 9 | return end - start 10 | 11 | def benchmark(f): 12 | runs = 100 13 | total = 0.0 14 | for i in range(runs): 15 | result = benchmark_once(f) 16 | sys.stderr.write('Run {0}: {1}\n'.format(i, result)) 17 | total += result 18 | return total / runs 19 | -------------------------------------------------------------------------------- /benchmarks/ruby/strip_tags.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require './utils.rb' 4 | 5 | def strip_tags(filename) 6 | File.open(filename, 'r:utf-8') do |file| 7 | str = file.read 8 | 9 | d = 0 10 | 11 | str.each_char do |c| 12 | d += 1 if c == '<' 13 | putc(if d > 0 then ' ' else c end) 14 | d -= 1 if c == '>' 15 | end 16 | end 17 | end 18 | 19 | ARGV.each do |f| 20 | t = benchmark { strip_tags(f) } 21 | STDERR.puts "#{f}: #{t}" 22 | end 23 | -------------------------------------------------------------------------------- /benchmarks/python/strip_tags.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import utils, sys 4 | 5 | def strip_tags(filename): 6 | string = open(filename, encoding='utf-8').read() 7 | 8 | d = 0 9 | out = [] 10 | 11 | for c in string: 12 | if c == '<': d += 1 13 | 14 | if d > 0: 15 | out += ' ' 16 | else: 17 | out += c 18 | 19 | if c == '>': d -= 1 20 | 21 | print(''.join(out)) 22 | 23 | for f in sys.argv[1:]: 24 | t = utils.benchmark(lambda: strip_tags(f)) 25 | sys.stderr.write('{0}: {1}\n'.format(f, t)) 26 | -------------------------------------------------------------------------------- /Data/Text/Util.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, DeriveDataTypeable #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Util 5 | -- Copyright : 2010 Bryan O'Sullivan 6 | -- 7 | -- License : BSD-style 8 | -- Maintainer : bos@serpentine.com 9 | -- Stability : experimental 10 | -- Portability : GHC 11 | -- 12 | -- Useful functions. 13 | 14 | module Data.Text.Util 15 | ( 16 | intersperse 17 | ) where 18 | 19 | -- | A lazier version of Data.List.intersperse. The other version 20 | -- causes space leaks! 
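-- A small illustrative sketch (not part of the original module): the
-- definition below yields each element before it ever inspects the rest of
-- the list, so
--
-- > take 3 (intersperse 0 (1 : 2 : undefined)) == [1,0,2]
--
-- succeeds without forcing the undefined tail.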
21 | intersperse :: a -> [a] -> [a] 22 | intersperse _ [] = [] 23 | intersperse sep (x:xs) = x : go xs 24 | where 25 | go [] = [] 26 | go (y:ys) = sep : y: go ys 27 | {-# INLINE intersperse #-} 28 | -------------------------------------------------------------------------------- /Data/Text/Encoding/Utf32.hs: -------------------------------------------------------------------------------- 1 | -- | 2 | -- Module : Data.Text.Encoding.Utf32 3 | -- Copyright : (c) 2008, 2009 Tom Harper, 4 | -- (c) 2009, 2010 Bryan O'Sullivan, 5 | -- (c) 2009 Duncan Coutts 6 | -- 7 | -- License : BSD-style 8 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 9 | -- duncan@haskell.org 10 | -- Stability : experimental 11 | -- Portability : portable 12 | -- 13 | -- Basic UTF-32 validation. 14 | module Data.Text.Encoding.Utf32 15 | ( 16 | validate 17 | ) where 18 | 19 | import Data.Word (Word32) 20 | 21 | validate :: Word32 -> Bool 22 | validate x1 = x1 < 0xD800 || (x1 > 0xDFFF && x1 <= 0x10FFFF) 23 | {-# INLINE validate #-} 24 | -------------------------------------------------------------------------------- /Data/Text/Lazy/Builder/RealFloat/Functions.hs: -------------------------------------------------------------------------------- 1 | -- | 2 | -- Module: Data.Text.Lazy.Builder.RealFloat.Functions 3 | -- Copyright: (c) The University of Glasgow 1994-2002 4 | -- License: see libraries/base/LICENSE 5 | 6 | module Data.Text.Lazy.Builder.RealFloat.Functions 7 | ( 8 | roundTo 9 | ) where 10 | 11 | roundTo :: Int -> [Int] -> (Int,[Int]) 12 | roundTo d is = 13 | case f d is of 14 | x@(0,_) -> x 15 | (1,xs) -> (1, 1:xs) 16 | _ -> error "roundTo: bad Value" 17 | where 18 | f n [] = (0, replicate n 0) 19 | f 0 (x:_) = (if x >= 5 then 1 else 0, []) 20 | f n (i:xs) 21 | | i' == 10 = (1,0:ds) 22 | | otherwise = (0,i':ds) 23 | where 24 | (c,ds) = f (n-1) xs 25 | i' = c + i 26 | -------------------------------------------------------------------------------- /Data/Text/Lazy/Builder/Int/Digits.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | -- Module: Data.Text.Lazy.Builder.Int.Digits 4 | -- Copyright: (c) 2013 Bryan O'Sullivan 5 | -- License: BSD3 6 | -- Maintainer: Bryan O'Sullivan 7 | -- Stability: experimental 8 | -- Portability: portable 9 | -- 10 | -- This module exists because the C preprocessor does things that we 11 | -- shall not speak of when confronted with Haskell multiline strings. 12 | 13 | module Data.Text.Lazy.Builder.Int.Digits (digits) where 14 | 15 | import Data.ByteString.Char8 (ByteString) 16 | 17 | digits :: ByteString 18 | digits = "0001020304050607080910111213141516171819\ 19 | \2021222324252627282930313233343536373839\ 20 | \4041424344454647484950515253545556575859\ 21 | \6061626364656667686970717273747576777879\ 22 | \8081828384858687888990919293949596979899" 23 | -------------------------------------------------------------------------------- /Data/Text/Lazy/Builder/Functions.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MagicHash #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Lazy.Builder.Functions 5 | -- Copyright : (c) 2011 MailRank, Inc. 6 | -- 7 | -- License : BSD-style 8 | -- Maintainer : bos@serpentine.com 9 | -- Stability : experimental 10 | -- Portability : GHC 11 | -- 12 | -- Useful functions and combinators. 
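-- For instance (illustrative sketch): i2d 7 evaluates to '7', and the (<>)
-- defined below is simply 'mappend' restricted to 'Builder', declared infixr
-- so that chains of builders associate to the right by default.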
13 | 14 | module Data.Text.Lazy.Builder.Functions 15 | ( 16 | (<>) 17 | , i2d 18 | ) where 19 | 20 | import Data.Monoid (mappend) 21 | import Data.Text.Lazy.Builder (Builder) 22 | import GHC.Base 23 | 24 | -- | Unsafe conversion for decimal digits. 25 | {-# INLINE i2d #-} 26 | i2d :: Int -> Char 27 | i2d (I# i#) = C# (chr# (ord# '0'# +# i#)) 28 | 29 | -- | The normal 'mappend' function with right associativity instead of 30 | -- left. 31 | (<>) :: Builder -> Builder -> Builder 32 | (<>) = mappend 33 | {-# INLINE (<>) #-} 34 | 35 | infixr 4 <> 36 | -------------------------------------------------------------------------------- /benchmarks/cbits/time_iconv.c: -------------------------------------------------------------------------------- 1 | #include <iconv.h> 2 | #include <stddef.h> 3 | #include <stdint.h> 4 | #include <stdlib.h> 5 | 6 | int time_iconv(char *srcbuf, size_t srcbufsize) 7 | { 8 | uint16_t *destbuf = NULL; 9 | size_t destbufsize; 10 | static uint16_t *origdestbuf; 11 | static size_t origdestbufsize; 12 | iconv_t ic = (iconv_t) -1; 13 | int ret = 0; 14 | 15 | if (ic == (iconv_t) -1) { 16 | ic = iconv_open("UTF-16LE", "UTF-8"); 17 | if (ic == (iconv_t) -1) { 18 | ret = -1; 19 | goto done; 20 | } 21 | } 22 | 23 | destbufsize = srcbufsize * sizeof(uint16_t); 24 | if (destbufsize > origdestbufsize) { 25 | free(origdestbuf); 26 | origdestbuf = destbuf = malloc(origdestbufsize = destbufsize); 27 | } else { 28 | destbuf = origdestbuf; 29 | } 30 | 31 | iconv(ic, &srcbuf, &srcbufsize, (char**) &destbuf, &destbufsize); 32 | 33 | done: 34 | return ret; 35 | } 36 | -------------------------------------------------------------------------------- /benchmarks/haskell/Multilang.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, OverloadedStrings, RankNTypes #-} 2 | 3 | module Main ( 4 | main 5 | ) where 6 | 7 | import Control.Monad (forM_) 8 | import qualified Data.ByteString as B 9 | import qualified Data.Text as Text 10 | import Data.Text.Encoding (decodeUtf8) 11 | import Data.Text (Text) 12 | import System.IO (hFlush, stdout) 13 | import Timer (timer) 14 | 15 | type BM = Text -> () 16 | 17 | bm :: forall a.
(Text -> a) -> BM 18 | bm f t = f t `seq` () 19 | 20 | benchmarks :: [(String, Text.Text -> ())] 21 | benchmarks = [ 22 | ("find_first", bm $ Text.isInfixOf "en:Benin") 23 | , ("find_index", bm $ Text.findIndex (=='c')) 24 | ] 25 | 26 | main :: IO () 27 | main = do 28 | !contents <- decodeUtf8 `fmap` B.readFile "../tests/text-test-data/yiwiki.xml" 29 | forM_ benchmarks $ \(name, bmark) -> do 30 | putStr $ name ++ " " 31 | hFlush stdout 32 | putStrLn =<< (timer 100 contents bmark) 33 | -------------------------------------------------------------------------------- /tests/Makefile: -------------------------------------------------------------------------------- 1 | count = 1000 2 | 3 | coverage: build coverage/hpc_index.html 4 | 5 | build: text-test-data 6 | cabal configure -fhpc 7 | cabal build 8 | 9 | text-test-data: 10 | hg clone https://bitbucket.org/bos/text-test-data 11 | $(MAKE) -C text-test-data 12 | 13 | coverage/text-tests.tix: 14 | -mkdir -p coverage 15 | ./dist/build/text-tests/text-tests -a $(count) 16 | mv text-tests.tix $@ 17 | 18 | coverage/text-tests-stdio.tix: 19 | -mkdir -p coverage 20 | ./scripts/cover-stdio.sh ./dist/build/text-tests-stdio/text-tests-stdio 21 | mv text-tests-stdio.tix $@ 22 | 23 | coverage/coverage.tix: coverage/text-tests.tix coverage/text-tests-stdio.tix 24 | hpc combine --output=$@ \ 25 | --exclude=Main \ 26 | coverage/text-tests.tix \ 27 | coverage/text-tests-stdio.tix 28 | 29 | coverage/hpc_index.html: coverage/coverage.tix 30 | hpc markup --destdir=coverage coverage/coverage.tix 31 | 32 | clean: 33 | rm -rf dist coverage .hpc 34 | 35 | .PHONY: build coverage 36 | -------------------------------------------------------------------------------- /scripts/ApiCompare.hs: -------------------------------------------------------------------------------- 1 | -- This script compares the strict and lazy Text APIs to ensure that 2 | -- they're reasonably in sync. 3 | 4 | {-# LANGUAGE OverloadedStrings #-} 5 | 6 | import qualified Data.Set as S 7 | import qualified Data.Text as T 8 | import System.Process 9 | 10 | main = do 11 | let tidy pkg = (S.fromList . filter (T.isInfixOf "::") . T.lines . 12 | T.replace "GHC.Int.Int64" "Int" . 13 | T.replace "\n " "" . 14 | T.replace (T.append (T.pack pkg) ".") "" . T.pack) `fmap` 15 | readProcess "ghci" [] (":browse " ++ pkg) 16 | let diff a b = mapM_ (putStrLn . (" "++) . T.unpack) . 
S.toList $ 17 | S.difference a b 18 | text <- tidy "Data.Text" 19 | lazy <- tidy "Data.Text.Lazy" 20 | list <- tidy "Data.List" 21 | putStrLn "Text \\ List:" 22 | diff text list 23 | putStrLn "" 24 | putStrLn "Text \\ Lazy:" 25 | diff text lazy 26 | putStrLn "" 27 | putStrLn "Lazy \\ Text:" 28 | diff lazy text 29 | -------------------------------------------------------------------------------- /benchmarks/haskell/Timer.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | 3 | module Timer (timer) where 4 | 5 | import Control.Exception (evaluate) 6 | import Data.Time.Clock.POSIX (getPOSIXTime) 7 | import GHC.Float (FFFormat(..), formatRealFloat) 8 | 9 | ickyRound :: Int -> Double -> String 10 | ickyRound k = formatRealFloat FFFixed (Just k) 11 | 12 | timer :: Int -> a -> (a -> b) -> IO String 13 | timer count a0 f = do 14 | let loop !k !fastest 15 | | k <= 0 = return fastest 16 | | otherwise = do 17 | start <- getPOSIXTime 18 | let inner a i 19 | | i <= 0 = return () 20 | | otherwise = evaluate (f a) >> inner a (i-1) 21 | inner a0 count 22 | end <- getPOSIXTime 23 | let elapsed = end - start 24 | loop (k-1) (min fastest (elapsed / fromIntegral count)) 25 | t <- loop (3::Int) 1e300 26 | let log10 x = log x / log 10 27 | ft = realToFrac t 28 | prec = round (log10 (fromIntegral count) - log10 ft) 29 | return $! ickyRound prec ft 30 | {-# NOINLINE timer #-} 31 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/EncodeUtf8.hs: -------------------------------------------------------------------------------- 1 | -- | UTF-8 encode a text 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Replicating a string a number of times 6 | -- 7 | -- * UTF-8 encoding it 8 | -- 9 | module Benchmarks.EncodeUtf8 10 | ( benchmark 11 | ) where 12 | 13 | import Criterion (Benchmark, bgroup, bench, whnf) 14 | import qualified Data.ByteString as B 15 | import qualified Data.ByteString.Lazy as BL 16 | import qualified Data.Text as T 17 | import qualified Data.Text.Encoding as T 18 | import qualified Data.Text.Lazy as TL 19 | import qualified Data.Text.Lazy.Encoding as TL 20 | 21 | benchmark :: String -> IO Benchmark 22 | benchmark string = do 23 | return $ bgroup "EncodeUtf8" 24 | [ bench "Text" $ whnf (B.length . T.encodeUtf8) text 25 | , bench "LazyText" $ whnf (BL.length . 
TL.encodeUtf8) lazyText 26 | ] 27 | where 28 | -- The string in different formats 29 | text = T.replicate k $ T.pack string 30 | lazyText = TL.replicate (fromIntegral k) $ TL.pack string 31 | 32 | -- Amount 33 | k = 100000 34 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Replace.hs: -------------------------------------------------------------------------------- 1 | -- | Replace a string by another string 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Search and replace of a pattern in a text 6 | -- 7 | module Benchmarks.Replace 8 | ( benchmark 9 | ) where 10 | 11 | import Criterion (Benchmark, bgroup, bench, nf) 12 | import qualified Data.ByteString.Char8 as B 13 | import qualified Data.ByteString.Lazy as BL 14 | import qualified Data.ByteString.Lazy.Search as BL 15 | import qualified Data.Text.Lazy as TL 16 | import qualified Data.Text.Lazy.Encoding as TL 17 | import qualified Data.Text.Lazy.IO as TL 18 | 19 | benchmark :: FilePath -> String -> String -> IO Benchmark 20 | benchmark fp pat sub = do 21 | tl <- TL.readFile fp 22 | bl <- BL.readFile fp 23 | return $ bgroup "Replace" 24 | [ bench "LazyText" $ nf (TL.length . TL.replace tpat tsub) tl 25 | , bench "LazyByteString" $ nf (BL.length . BL.replace bpat bsub) bl 26 | ] 27 | where 28 | tpat = TL.pack pat 29 | tsub = TL.pack sub 30 | bpat = B.concat $ BL.toChunks $ TL.encodeUtf8 tpat 31 | bsub = B.concat $ BL.toChunks $ TL.encodeUtf8 tsub 32 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/WordFrequencies.hs: -------------------------------------------------------------------------------- 1 | -- | A word frequency count using the different string types 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Splitting into words 6 | -- 7 | -- * Converting to lowercase 8 | -- 9 | -- * Comparing: Eq/Ord instances 10 | -- 11 | module Benchmarks.WordFrequencies 12 | ( benchmark 13 | ) where 14 | 15 | import Criterion (Benchmark, bench, bgroup, whnf) 16 | import Data.Char (toLower) 17 | import Data.List (foldl') 18 | import Data.Map (Map) 19 | import qualified Data.ByteString.Char8 as B 20 | import qualified Data.Map as M 21 | import qualified Data.Text as T 22 | import qualified Data.Text.IO as T 23 | 24 | benchmark :: FilePath -> IO Benchmark 25 | benchmark fp = do 26 | s <- readFile fp 27 | b <- B.readFile fp 28 | t <- T.readFile fp 29 | return $ bgroup "WordFrequencies" 30 | [ bench "String" $ whnf (frequencies . words . map toLower) s 31 | , bench "ByteString" $ whnf (frequencies . B.words . B.map toLower) b 32 | , bench "Text" $ whnf (frequencies . T.words . 
T.toLower) t 33 | ] 34 | 35 | frequencies :: Ord a => [a] -> Map a Int 36 | frequencies = foldl' (\m k -> M.insertWith (+) k 1 m) M.empty 37 | -------------------------------------------------------------------------------- /scripts/CaseMapping.hs: -------------------------------------------------------------------------------- 1 | import System.Environment 2 | import System.IO 3 | 4 | import Arsec 5 | import CaseFolding 6 | import SpecialCasing 7 | 8 | main = do 9 | args <- getArgs 10 | let oname = case args of 11 | [] -> "../Data/Text/Fusion/CaseMapping.hs" 12 | [o] -> o 13 | psc <- parseSC "SpecialCasing.txt" 14 | pcf <- parseCF "CaseFolding.txt" 15 | scs <- case psc of 16 | Left err -> print err >> return undefined 17 | Right ms -> return ms 18 | cfs <- case pcf of 19 | Left err -> print err >> return undefined 20 | Right ms -> return ms 21 | h <- openFile oname WriteMode 22 | mapM_ (hPutStrLn h) ["{-# LANGUAGE Rank2Types #-}" 23 | ,"-- AUTOMATICALLY GENERATED - DO NOT EDIT" 24 | ,"-- Generated by scripts/SpecialCasing.hs" 25 | ,"module Data.Text.Fusion.CaseMapping where" 26 | ,"import Data.Char" 27 | ,"import Data.Text.Fusion.Internal" 28 | ,""] 29 | mapM_ (hPutStrLn h) (mapSC "upper" upper toUpper scs) 30 | mapM_ (hPutStrLn h) (mapSC "lower" lower toLower scs) 31 | mapM_ (hPutStrLn h) (mapCF cfs) 32 | hClose h 33 | -------------------------------------------------------------------------------- /Data/Text/Private.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, Rank2Types, UnboxedTuples #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Private 5 | -- Copyright : (c) 2011 Bryan O'Sullivan 6 | -- 7 | -- License : BSD-style 8 | -- Maintainer : bos@serpentine.com 9 | -- Stability : experimental 10 | -- Portability : GHC 11 | 12 | module Data.Text.Private 13 | ( 14 | runText 15 | , span_ 16 | ) where 17 | 18 | import Control.Monad.ST (ST, runST) 19 | import Data.Text.Internal (Text(..), textP) 20 | import Data.Text.Unsafe (Iter(..), iter) 21 | import qualified Data.Text.Array as A 22 | 23 | span_ :: (Char -> Bool) -> Text -> (# Text, Text #) 24 | span_ p t@(Text arr off len) = (# hd,tl #) 25 | where hd = textP arr off k 26 | tl = textP arr (off+k) (len-k) 27 | !k = loop 0 28 | loop !i | i < len && p c = loop (i+d) 29 | | otherwise = i 30 | where Iter c d = iter t i 31 | {-# INLINE span_ #-} 32 | 33 | runText :: (forall s. (A.MArray s -> Int -> ST s Text) -> ST s Text) -> Text 34 | runText act = runST (act $ \ !marr !len -> do 35 | arr <- A.unsafeFreeze marr 36 | return $! 
textP arr 0 len) 37 | {-# INLINE runText #-} 38 | -------------------------------------------------------------------------------- /benchmarks/python/multilang.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import math 4 | import sys 5 | import time 6 | 7 | def find_first(): 8 | cf = contents.find 9 | return timer(lambda: cf("en:Benin")) 10 | 11 | def timer(f, count=100): 12 | a = 1e300 13 | def g(): 14 | return 15 | for i in xrange(3): 16 | start = time.time() 17 | for j in xrange(count): 18 | g() 19 | a = min(a, (time.time() - start) / count) 20 | 21 | b = 1e300 22 | for i in xrange(3): 23 | start = time.time() 24 | for j in xrange(count): 25 | f() 26 | b = min(b, (time.time() - start) / count) 27 | 28 | return round(b - a, int(round(math.log(count, 10) - math.log(b - a, 10)))) 29 | 30 | contents = open('../../tests/text-test-data/yiwiki.xml', 'r').read() 31 | contents = contents.decode('utf-8') 32 | 33 | benchmarks = ( 34 | find_first, 35 | ) 36 | 37 | to_run = sys.argv[1:] 38 | bms = [] 39 | if to_run: 40 | for r in to_run: 41 | for b in benchmarks: 42 | if b.__name__.startswith(r): 43 | bms.append(b) 44 | else: 45 | bms = benchmarks 46 | 47 | for b in bms: 48 | sys.stdout.write(b.__name__ + ' ') 49 | sys.stdout.flush() 50 | print b() 51 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/FileRead.hs: -------------------------------------------------------------------------------- 1 | -- | Benchmarks simple file reading 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Reading a file from the disk 6 | -- 7 | module Benchmarks.FileRead 8 | ( benchmark 9 | ) where 10 | 11 | import Control.Exception (evaluate) 12 | import Criterion (Benchmark, bgroup, bench) 13 | import qualified Data.ByteString as SB 14 | import qualified Data.ByteString.Lazy as LB 15 | import qualified Data.Text as T 16 | import qualified Data.Text.Encoding as T 17 | import qualified Data.Text.IO as T 18 | import qualified Data.Text.Lazy as LT 19 | import qualified Data.Text.Lazy.Encoding as LT 20 | import qualified Data.Text.Lazy.IO as LT 21 | 22 | benchmark :: FilePath -> IO Benchmark 23 | benchmark p = return $ bgroup "FileRead" 24 | [ bench "String" $ readFile p >>= evaluate . length 25 | , bench "ByteString" $ SB.readFile p >>= evaluate . SB.length 26 | , bench "LazyByteString" $ LB.readFile p >>= evaluate . LB.length 27 | , bench "Text" $ T.readFile p >>= evaluate . T.length 28 | , bench "LazyText" $ LT.readFile p >>= evaluate . LT.length 29 | , bench "TextByteString" $ 30 | SB.readFile p >>= evaluate . T.length . T.decodeUtf8 31 | , bench "LazyTextByteString" $ 32 | LB.readFile p >>= evaluate . LT.length . LT.decodeUtf8 33 | ] 34 | -------------------------------------------------------------------------------- /Data/Text/Encoding/Utf16.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MagicHash, BangPatterns #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Encoding.Utf16 5 | -- Copyright : (c) 2008, 2009 Tom Harper, 6 | -- (c) 2009 Bryan O'Sullivan, 7 | -- (c) 2009 Duncan Coutts 8 | -- 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 11 | -- duncan@haskell.org 12 | -- Stability : experimental 13 | -- Portability : GHC 14 | -- 15 | -- Basic UTF-16 validation and character manipulation. 
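-- For example (illustrative sketch): the surrogate pair 0xD835 and 0xDD6B
-- satisfies 'validate2', and 'chr2' below reassembles the original code
-- point: ((0xD835 - 0xD800) * 0x400) + (0xDD6B - 0xDC00) + 0x10000 == 0x1D56B.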
16 | module Data.Text.Encoding.Utf16 17 | ( 18 | chr2 19 | , validate1 20 | , validate2 21 | ) where 22 | 23 | import GHC.Exts 24 | import GHC.Word (Word16(..)) 25 | 26 | chr2 :: Word16 -> Word16 -> Char 27 | chr2 (W16# a#) (W16# b#) = C# (chr# (upper# +# lower# +# 0x10000#)) 28 | where 29 | !x# = word2Int# a# 30 | !y# = word2Int# b# 31 | !upper# = uncheckedIShiftL# (x# -# 0xD800#) 10# 32 | !lower# = y# -# 0xDC00# 33 | {-# INLINE chr2 #-} 34 | 35 | validate1 :: Word16 -> Bool 36 | validate1 x1 = x1 < 0xD800 || x1 > 0xDFFF 37 | {-# INLINE validate1 #-} 38 | 39 | validate2 :: Word16 -> Word16 -> Bool 40 | validate2 x1 x2 = x1 >= 0xD800 && x1 <= 0xDBFF && 41 | x2 >= 0xDC00 && x2 <= 0xDFFF 42 | {-# INLINE validate2 #-} 43 | -------------------------------------------------------------------------------- /benchmarks/ruby/fold.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require './utils.rb' 4 | 5 | def fold(filename, max_width) 6 | File.open(filename, 'r:utf-8') do |file| 7 | # Words in this paragraph 8 | paragraph = [] 9 | 10 | file.each_line do |line| 11 | # If we encounter an empty line, we reformat and dump the current 12 | # paragraph 13 | if line.strip.empty? 14 | puts fold_paragraph(paragraph, max_width) 15 | puts 16 | paragraph = [] 17 | # Otherwise, we append the words found in the line to the paragraph 18 | else 19 | paragraph.concat line.split 20 | end 21 | end 22 | 23 | # Last paragraph 24 | puts fold_paragraph(paragraph, max_width) unless paragraph.empty? 25 | end 26 | end 27 | 28 | # Fold a single paragraph to the desired width 29 | def fold_paragraph(paragraph, max_width) 30 | # Gradually build our output 31 | str, *rest = paragraph 32 | width = str.length 33 | 34 | rest.each do |word| 35 | if width + word.length + 1 <= max_width 36 | str << ' ' << word 37 | width += word.length + 1 38 | else 39 | str << "\n" << word 40 | width = word.length 41 | end 42 | end 43 | 44 | str 45 | end 46 | 47 | ARGV.each do |f| 48 | t = benchmark { fold(f, 80) } 49 | STDERR.puts "#{f}: #{t}" 50 | end 51 | -------------------------------------------------------------------------------- /tests/Tests/IO.hs: -------------------------------------------------------------------------------- 1 | -- | Program which exposes some haskell functions as an exutable. The results 2 | -- and coverage of this module is meant to be checked using a shell script. 
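-- For example (sketch; the file name is hypothetical), the cover-stdio.sh
-- script below drives this executable with invocations such as
--
-- > text-tests-stdio T.readFile some-file.txt
-- > text-tests-stdio TL.getLine
--
-- exercising one branch of 'main' per run.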
3 | -- 4 | module Main 5 | ( 6 | main 7 | ) where 8 | 9 | import System.Environment (getArgs) 10 | import System.Exit (exitFailure) 11 | import System.IO (hPutStrLn, stderr) 12 | import qualified Data.Text as T 13 | import qualified Data.Text.IO as T 14 | import qualified Data.Text.Lazy as TL 15 | import qualified Data.Text.Lazy.IO as TL 16 | 17 | main :: IO () 18 | main = do 19 | args <- getArgs 20 | case args of 21 | ["T.readFile", name] -> T.putStr =<< T.readFile name 22 | ["T.writeFile", name, t] -> T.writeFile name (T.pack t) 23 | ["T.appendFile", name, t] -> T.appendFile name (T.pack t) 24 | ["T.interact"] -> T.interact id 25 | ["T.getContents"] -> T.putStr =<< T.getContents 26 | ["T.getLine"] -> T.putStrLn =<< T.getLine 27 | 28 | ["TL.readFile", name] -> TL.putStr =<< TL.readFile name 29 | ["TL.writeFile", name, t] -> TL.writeFile name (TL.pack t) 30 | ["TL.appendFile", name, t] -> TL.appendFile name (TL.pack t) 31 | ["TL.interact"] -> TL.interact id 32 | ["TL.getContents"] -> TL.putStr =<< TL.getContents 33 | ["TL.getLine"] -> TL.putStrLn =<< TL.getLine 34 | _ -> hPutStrLn stderr "invalid directive!" >> exitFailure 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2008-2009, Tom Harper 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /tests/scripts/cover-stdio.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $# < 1 ]]; then 4 | echo "Usage: $0 " 5 | exit 1 6 | fi 7 | 8 | exe=$1 9 | 10 | rm -f $exe.tix 11 | 12 | f=$(mktemp stdio-f.XXXXXX) 13 | g=$(mktemp stdio-g.XXXXXX) 14 | 15 | for t in T TL; do 16 | echo $t.readFile > $f 17 | $exe $t.readFile $f > $g 18 | if ! diff -u $f $g; then 19 | errs=$((errs+1)) 20 | echo FAIL: $t.readFile 1>&2 21 | fi 22 | 23 | $exe $t.writeFile $f $t.writeFile 24 | echo -n $t.writeFile > $g 25 | if ! 
diff -u $f $g; then 26 | errs=$((errs+1)) 27 | echo FAIL: $t.writeFile 1>&2 28 | fi 29 | 30 | echo -n quux > $f 31 | $exe $t.appendFile $f $t.appendFile 32 | echo -n quux$t.appendFile > $g 33 | if ! diff -u $f $g; then 34 | errs=$((errs+1)) 35 | echo FAIL: $t.appendFile 1>&2 36 | fi 37 | 38 | echo $t.interact | $exe $t.interact > $f 39 | echo $t.interact > $g 40 | if ! diff -u $f $g; then 41 | errs=$((errs+1)) 42 | echo FAIL: $t.interact 1>&2 43 | fi 44 | 45 | echo $t.getContents | $exe $t.getContents > $f 46 | echo $t.getContents > $g 47 | if ! diff -u $f $g; then 48 | errs=$((errs+1)) 49 | echo FAIL: $t.getContents 1>&2 50 | fi 51 | 52 | echo $t.getLine | $exe $t.getLine > $f 53 | echo $t.getLine > $g 54 | if ! diff -u $f $g; then 55 | errs=$((errs+1)) 56 | echo FAIL: $t.getLine 1>&2 57 | fi 58 | done 59 | 60 | rm -f $f $g 61 | 62 | exit $errs 63 | -------------------------------------------------------------------------------- /tests/Tests/SlowFunctions.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | module Tests.SlowFunctions 3 | ( 4 | indices 5 | , splitOn 6 | ) where 7 | 8 | import qualified Data.Text as T 9 | import Data.Text.Internal (Text(..)) 10 | import Data.Text.Unsafe (iter_, unsafeHead, unsafeTail) 11 | 12 | indices :: T.Text -- ^ Substring to search for (@needle@) 13 | -> T.Text -- ^ Text to search in (@haystack@) 14 | -> [Int] 15 | indices needle@(Text _narr _noff nlen) haystack@(Text harr hoff hlen) 16 | | T.null needle = [] 17 | | otherwise = scan 0 18 | where 19 | scan i | i >= hlen = [] 20 | | needle `T.isPrefixOf` t = i : scan (i+nlen) 21 | | otherwise = scan (i+d) 22 | where t = Text harr (hoff+i) (hlen-i) 23 | d = iter_ haystack i 24 | 25 | splitOn :: T.Text -- ^ Text to split on 26 | -> T.Text -- ^ Input text 27 | -> [T.Text] 28 | splitOn pat src0 29 | | T.null pat = error "splitOn: empty" 30 | | l == 1 = T.split (== (unsafeHead pat)) src0 31 | | otherwise = go src0 32 | where 33 | l = T.length pat 34 | go src = search 0 src 35 | where 36 | search !n !s 37 | | T.null s = [src] -- not found 38 | | pat `T.isPrefixOf` s = T.take n src : go (T.drop l s) 39 | | otherwise = search (n+1) (unsafeTail s) 40 | -------------------------------------------------------------------------------- /scripts/CaseFolding.hs: -------------------------------------------------------------------------------- 1 | -- This script processes the following source file: 2 | -- 3 | -- http://unicode.org/Public/UNIDATA/CaseFolding.txt 4 | 5 | module CaseFolding 6 | ( 7 | Fold(..) 8 | , parseCF 9 | , mapCF 10 | ) where 11 | 12 | import Arsec 13 | 14 | data Fold = Fold { 15 | code :: Char 16 | , status :: Char 17 | , mapping :: [Char] 18 | , name :: String 19 | } deriving (Eq, Ord, Show) 20 | 21 | entries :: Parser [Fold] 22 | entries = many comment *> many (entry <* many comment) 23 | where 24 | entry = Fold <$> unichar <* semi 25 | <*> oneOf "CFST" <* semi 26 | <*> unichars 27 | <*> (string "# " *> manyTill anyToken (char '\n')) 28 | 29 | parseCF :: FilePath -> IO (Either ParseError [Fold]) 30 | parseCF name = parse entries name <$> readFile name 31 | 32 | mapCF :: [Fold] -> [String] 33 | mapCF ms = typ ++ (map nice . filter p $ ms) ++ [last] 34 | where 35 | typ = ["foldMapping :: forall s. 
Char -> s -> Step (CC s) Char" 36 | ,"{-# INLINE foldMapping #-}"] 37 | last = "foldMapping c s = Yield (toLower c) (CC s '\\0' '\\0')" 38 | nice c = "-- " ++ name c ++ "\n" ++ 39 | "foldMapping " ++ showC (code c) ++ " s = Yield " ++ x ++ " (CC s " ++ y ++ " " ++ z ++ ")" 40 | where [x,y,z] = (map showC . take 3) (mapping c ++ repeat '\0') 41 | p f = status f `elem` "CF" && 42 | mapping f /= [toLower (code f)] 43 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Programs/BigTable.hs: -------------------------------------------------------------------------------- 1 | -- | Create a large HTML table and dump it to a handle 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Creating a large HTML document using a builder 6 | -- 7 | -- * Writing to a handle 8 | -- 9 | {-# LANGUAGE OverloadedStrings #-} 10 | module Benchmarks.Programs.BigTable 11 | ( benchmark 12 | ) where 13 | 14 | import Criterion (Benchmark, bench) 15 | import Data.Monoid (mappend, mconcat) 16 | import Data.Text.Lazy.Builder (Builder, fromText, toLazyText) 17 | import Data.Text.Lazy.IO (hPutStr) 18 | import System.IO (Handle) 19 | import qualified Data.Text as T 20 | 21 | benchmark :: Handle -> IO Benchmark 22 | benchmark sink = return $ bench "BigTable" $ do 23 | hPutStr sink "Content-Type: text/html\n\n" 24 | hPutStr sink . toLazyText . makeTable =<< rows 25 | hPutStr sink "
" 26 | where 27 | -- We provide the number of rows in IO so the builder value isn't shared 28 | -- between the benchmark samples. 29 | rows :: IO Int 30 | rows = return 20000 31 | {-# NOINLINE rows #-} 32 | 33 | makeTable :: Int -> Builder 34 | makeTable n = mconcat $ replicate n $ mconcat $ map makeCol [1 .. 50] 35 | 36 | makeCol :: Int -> Builder 37 | makeCol 1 = fromText "1" 38 | makeCol 50 = fromText "50" 39 | makeCol i = fromText "" `mappend` (fromInt i `mappend` fromText "") 40 | 41 | fromInt :: Int -> Builder 42 | fromInt = fromText . T.pack . show 43 | -------------------------------------------------------------------------------- /scripts/Arsec.hs: -------------------------------------------------------------------------------- 1 | module Arsec 2 | ( 3 | comment 4 | , semi 5 | , showC 6 | , unichar 7 | , unichars 8 | , module Control.Applicative 9 | , module Control.Monad 10 | , module Data.Char 11 | , module Text.ParserCombinators.Parsec.Char 12 | , module Text.ParserCombinators.Parsec.Combinator 13 | , module Text.ParserCombinators.Parsec.Error 14 | , module Text.ParserCombinators.Parsec.Prim 15 | ) where 16 | 17 | import Control.Monad 18 | import Control.Applicative 19 | import Data.Char 20 | import Numeric 21 | import Text.ParserCombinators.Parsec.Char hiding (lower, upper) 22 | import Text.ParserCombinators.Parsec.Combinator hiding (optional) 23 | import Text.ParserCombinators.Parsec.Error 24 | import Text.ParserCombinators.Parsec.Prim hiding ((<|>), many) 25 | 26 | instance Applicative (GenParser s a) where 27 | pure = return 28 | (<*>) = ap 29 | 30 | instance Alternative (GenParser s a) where 31 | empty = mzero 32 | (<|>) = mplus 33 | 34 | unichar :: Parser Char 35 | unichar = chr . fst . head . readHex <$> many1 hexDigit 36 | 37 | unichars :: Parser [Char] 38 | unichars = manyTill (unichar <* spaces) semi 39 | 40 | semi :: Parser () 41 | semi = char ';' *> spaces *> pure () 42 | 43 | comment :: Parser String 44 | comment = (char '#' *> manyTill anyToken (char '\n')) <|> string "\n" 45 | 46 | showC :: Char -> String 47 | showC c = "'\\x" ++ d ++ "'" 48 | where h = showHex (ord c) "" 49 | d = replicate (4 - length h) '0' ++ h 50 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Equality.hs: -------------------------------------------------------------------------------- 1 | -- | Compare a string with a copy of itself that is identical except 2 | -- for the last character. 3 | -- 4 | -- Tested in this benchmark: 5 | -- 6 | -- * Comparison of strings (Eq instance) 7 | -- 8 | module Benchmarks.Equality 9 | ( benchmark 10 | ) where 11 | 12 | import Criterion (Benchmark, bgroup, bench, whnf) 13 | import qualified Data.ByteString.Char8 as B 14 | import qualified Data.ByteString.Lazy.Char8 as BL 15 | import qualified Data.Text as T 16 | import qualified Data.Text.Encoding as T 17 | import qualified Data.Text.Lazy as TL 18 | import qualified Data.Text.Lazy.Encoding as TL 19 | 20 | benchmark :: FilePath -> IO Benchmark 21 | benchmark fp = do 22 | b <- B.readFile fp 23 | bl1 <- BL.readFile fp 24 | -- A lazy bytestring is a list of chunks. When we do not explicitly create two 25 | -- different lazy bytestrings at a different address, the bytestring library 26 | -- will compare the chunk addresses instead of the chunk contents. This is why 27 | -- we read the lazy bytestring twice here. 
28 | bl2 <- BL.readFile fp 29 | l <- readFile fp 30 | let t = T.decodeUtf8 b 31 | tl = TL.decodeUtf8 bl1 32 | return $ bgroup "Equality" 33 | [ bench "Text" $ whnf (== T.init t `T.snoc` '\xfffd') t 34 | , bench "LazyText" $ whnf (== TL.init tl `TL.snoc` '\xfffd') tl 35 | , bench "ByteString" $ whnf (== B.init b `B.snoc` '\xfffd') b 36 | , bench "LazyByteString" $ whnf (== BL.init bl2 `BL.snoc` '\xfffd') bl1 37 | , bench "String" $ whnf (== init l ++ "\xfffd") l 38 | ] 39 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | # Text: Fast, packed Unicode strings, using stream fusion 2 | 3 | This package provides the Data.Text library, a library for the space- 4 | and time-efficient manipulation of Unicode text in Haskell. 5 | 6 | 7 | # Normalization, conversion, and collation, oh my! 8 | 9 | This library intentionally provides a simple API based on the 10 | Haskell prelude's list manipulation functions. For more complicated 11 | real-world tasks, such as Unicode normalization, conversion to and 12 | from a larger variety of encodings, and collation, use the [text-icu 13 | package](http://hackage.haskell.org/cgi-bin/hackage-scripts/package/text-icu). 14 | 15 | That library uses the well-respected and liberally licensed ICU 16 | library to provide these facilities. 17 | 18 | 19 | # Get involved! 20 | 21 | Please report bugs via the 22 | [github issue tracker](https://github.com/bos/text/issues). 23 | 24 | Master [git repository](https://github.com/bos/text): 25 | 26 | * `git clone git://github.com/bos/text.git` 27 | 28 | There's also a [Mercurial mirror](https://bitbucket.org/bos/text): 29 | 30 | * `hg clone https://bitbucket.org/bos/text` 31 | 32 | (You can create and contribute changes using either Mercurial or git.) 33 | 34 | 35 | # Authors 36 | 37 | The base code for this library was originally written by Tom Harper, 38 | based on the stream fusion framework developed by Roman Leshchinskiy, 39 | Duncan Coutts, and Don Stewart. 40 | 41 | The core library was fleshed out, debugged, and tested by Bryan 42 | O'Sullivan , and he is the current maintainer. 43 | -------------------------------------------------------------------------------- /Data/Text/Lazy/Builder.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, CPP, Rank2Types #-} 2 | 3 | ----------------------------------------------------------------------------- 4 | -- | 5 | -- Module : Data.Text.Lazy.Builder 6 | -- Copyright : (c) 2013 Bryan O'Sullivan 7 | -- (c) 2010 Johan Tibell 8 | -- License : BSD3-style (see LICENSE) 9 | -- 10 | -- Maintainer : Johan Tibell 11 | -- Stability : experimental 12 | -- Portability : portable to Hugs and GHC 13 | -- 14 | -- Efficient construction of lazy @Text@ values. The principal 15 | -- operations on a @Builder@ are @singleton@, @fromText@, and 16 | -- @fromLazyText@, which construct new builders, and 'mappend', which 17 | -- concatenates two builders. 18 | -- 19 | -- To get maximum performance when building lazy @Text@ values using a 20 | -- builder, associate @mappend@ calls to the right. For example, 21 | -- prefer 22 | -- 23 | -- > singleton 'a' `mappend` (singleton 'b' `mappend` singleton 'c') 24 | -- 25 | -- to 26 | -- 27 | -- > singleton 'a' `mappend` singleton 'b' `mappend` singleton 'c' 28 | -- 29 | -- as the latter associates @mappend@ to the left. 
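-- Concretely (illustrative sketch): both expressions above build the same
-- result,
--
-- > toLazyText (singleton 'a' `mappend` (singleton 'b' `mappend` singleton 'c'))
--
-- yields the lazy text "abc"; only the association, and hence the
-- performance, differs between the two forms.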
30 | -- 31 | ----------------------------------------------------------------------------- 32 | 33 | module Data.Text.Lazy.Builder 34 | ( -- * The Builder type 35 | Builder 36 | , toLazyText 37 | , toLazyTextWith 38 | 39 | -- * Constructing Builders 40 | , singleton 41 | , fromText 42 | , fromLazyText 43 | , fromString 44 | 45 | -- * Flushing the buffer state 46 | , flush 47 | ) where 48 | 49 | import Data.Text.Lazy.Builder.Internal 50 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Programs/StripTags.hs: -------------------------------------------------------------------------------- 1 | -- | Program to replace HTML tags by whitespace 2 | -- 3 | -- This program was originally contributed by Petr Prokhorenkov. 4 | -- 5 | -- Tested in this benchmark: 6 | -- 7 | -- * Reading the file 8 | -- 9 | -- * Replacing text between HTML tags (<>) with whitespace 10 | -- 11 | -- * Writing back to a handle 12 | -- 13 | {-# OPTIONS_GHC -fspec-constr-count=5 #-} 14 | module Benchmarks.Programs.StripTags 15 | ( benchmark 16 | ) where 17 | 18 | import Criterion (Benchmark, bgroup, bench) 19 | import Data.List (mapAccumL) 20 | import System.IO (Handle, hPutStr) 21 | import qualified Data.ByteString as B 22 | import qualified Data.ByteString.Char8 as BC 23 | import qualified Data.Text as T 24 | import qualified Data.Text.Encoding as T 25 | import qualified Data.Text.IO as T 26 | 27 | benchmark :: FilePath -> Handle -> IO Benchmark 28 | benchmark i o = return $ bgroup "StripTags" 29 | [ bench "String" $ readFile i >>= hPutStr o . string 30 | , bench "ByteString" $ B.readFile i >>= B.hPutStr o . byteString 31 | , bench "Text" $ T.readFile i >>= T.hPutStr o . text 32 | , bench "TextByteString" $ 33 | B.readFile i >>= B.hPutStr o . T.encodeUtf8 . text . T.decodeUtf8 34 | ] 35 | 36 | string :: String -> String 37 | string = snd . mapAccumL step 0 38 | 39 | text :: T.Text -> T.Text 40 | text = snd . T.mapAccumL step 0 41 | 42 | byteString :: B.ByteString -> B.ByteString 43 | byteString = snd . BC.mapAccumL step 0 44 | 45 | step :: Int -> Char -> (Int, Char) 46 | step d c 47 | | d > 0 || d' > 0 = (d', ' ') 48 | | otherwise = (d', c) 49 | where 50 | d' = d + depth c 51 | depth '>' = 1 52 | depth '<' = -1 53 | depth _ = 0 54 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Programs/Throughput.hs: -------------------------------------------------------------------------------- 1 | -- | This benchmark simply reads and writes a file using the various string 2 | -- libraries. The point of it is that we can make better estimations on how 3 | -- much time the other benchmarks spend doing IO. 4 | -- 5 | -- Note that we expect ByteStrings to be a whole lot faster, since they do not 6 | -- do any actual encoding/decoding here, while String and Text do have UTF-8 7 | -- encoding/decoding. 
8 | -- 9 | -- Tested in this benchmark: 10 | -- 11 | -- * Reading the file 12 | -- 13 | -- * Replacing text between HTML tags (<>) with whitespace 14 | -- 15 | -- * Writing back to a handle 16 | -- 17 | module Benchmarks.Programs.Throughput 18 | ( benchmark 19 | ) where 20 | 21 | import Criterion (Benchmark, bgroup, bench) 22 | import System.IO (Handle, hPutStr) 23 | import qualified Data.ByteString as B 24 | import qualified Data.ByteString.Lazy as BL 25 | import qualified Data.Text.Encoding as T 26 | import qualified Data.Text.IO as T 27 | import qualified Data.Text.Lazy.Encoding as TL 28 | import qualified Data.Text.Lazy.IO as TL 29 | 30 | benchmark :: FilePath -> Handle -> IO Benchmark 31 | benchmark fp sink = return $ bgroup "Throughput" 32 | [ bench "String" $ readFile fp >>= hPutStr sink 33 | , bench "ByteString" $ B.readFile fp >>= B.hPutStr sink 34 | , bench "LazyByteString" $ BL.readFile fp >>= BL.hPutStr sink 35 | , bench "Text" $ T.readFile fp >>= T.hPutStr sink 36 | , bench "LazyText" $ TL.readFile fp >>= TL.hPutStr sink 37 | , bench "TextByteString" $ 38 | B.readFile fp >>= B.hPutStr sink . T.encodeUtf8 . T.decodeUtf8 39 | , bench "LazyTextByteString" $ 40 | BL.readFile fp >>= BL.hPutStr sink . TL.encodeUtf8 . TL.decodeUtf8 41 | ] 42 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Search.hs: -------------------------------------------------------------------------------- 1 | -- | Search for a pattern in a file, find the number of occurences 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Searching all occurences of a pattern using library routines 6 | -- 7 | module Benchmarks.Search 8 | ( benchmark 9 | ) where 10 | 11 | import Criterion (Benchmark, bench, bgroup, whnf) 12 | import qualified Data.ByteString as B 13 | import qualified Data.ByteString.Lazy as BL 14 | import qualified Data.ByteString.Lazy.Search as BL 15 | import qualified Data.ByteString.Search as B 16 | import qualified Data.Text as T 17 | import qualified Data.Text.Encoding as T 18 | import qualified Data.Text.IO as T 19 | import qualified Data.Text.Lazy as TL 20 | import qualified Data.Text.Lazy.IO as TL 21 | 22 | benchmark :: FilePath -> T.Text -> IO Benchmark 23 | benchmark fp needleT = do 24 | b <- B.readFile fp 25 | bl <- BL.readFile fp 26 | t <- T.readFile fp 27 | tl <- TL.readFile fp 28 | return $ bgroup "FileIndices" 29 | [ bench "ByteString" $ whnf (byteString needleB) b 30 | , bench "LazyByteString" $ whnf (lazyByteString needleB) bl 31 | , bench "Text" $ whnf (text needleT) t 32 | , bench "LazyText" $ whnf (lazyText needleTL) tl 33 | ] 34 | where 35 | needleB = T.encodeUtf8 needleT 36 | needleTL = TL.fromChunks [needleT] 37 | 38 | byteString :: B.ByteString -> B.ByteString -> Int 39 | byteString needle = length . B.indices needle 40 | 41 | lazyByteString :: B.ByteString -> BL.ByteString -> Int 42 | lazyByteString needle = length . BL.indices needle 43 | 44 | text :: T.Text -> T.Text -> Int 45 | text = T.count 46 | 47 | lazyText :: TL.Text -> TL.Text -> Int 48 | lazyText needle = fromIntegral . TL.count needle 49 | -------------------------------------------------------------------------------- /scripts/SpecialCasing.hs: -------------------------------------------------------------------------------- 1 | -- This script processes the following source file: 2 | -- 3 | -- http://unicode.org/Public/UNIDATA/SpecialCasing.txt 4 | 5 | module SpecialCasing 6 | ( 7 | Case(..) 
8 | , parseSC 9 | , mapSC 10 | ) where 11 | 12 | import Arsec 13 | 14 | data Case = Case { 15 | code :: Char 16 | , lower :: [Char] 17 | , title :: [Char] 18 | , upper :: [Char] 19 | , conditions :: String 20 | , name :: String 21 | } deriving (Eq, Ord, Show) 22 | 23 | entries :: Parser [Case] 24 | entries = many comment *> many (entry <* many comment) 25 | where 26 | entry = Case <$> unichar <* semi 27 | <*> unichars 28 | <*> unichars 29 | <*> unichars 30 | <*> manyTill anyToken (string "# ") 31 | <*> manyTill anyToken (char '\n') 32 | 33 | parseSC :: FilePath -> IO (Either ParseError [Case]) 34 | parseSC name = parse entries name <$> readFile name 35 | 36 | mapSC :: String -> (Case -> String) -> (Char -> Char) -> [Case] -> [String] 37 | mapSC which access twiddle ms = typ ++ (map nice . filter p $ ms) ++ [last] 38 | where 39 | typ = [which ++ "Mapping :: forall s. Char -> s -> Step (CC s) Char" 40 | ,"{-# INLINE " ++ which ++ "Mapping #-}"] 41 | last = which ++ "Mapping c s = Yield (to" ++ ucFirst which ++ " c) (CC s '\\0' '\\0')" 42 | nice c = "-- " ++ name c ++ "\n" ++ 43 | which ++ "Mapping " ++ showC (code c) ++ " s = Yield " ++ x ++ " (CC s " ++ y ++ " " ++ z ++ ")" 44 | where [x,y,z] = (map showC . take 3) (access c ++ repeat '\0') 45 | p c = [k] /= a && a /= [twiddle k] && null (conditions c) 46 | where a = access c 47 | k = code c 48 | 49 | ucFirst (c:cs) = toUpper c : cs 50 | ucFirst [] = [] 51 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/FoldLines.hs: -------------------------------------------------------------------------------- 1 | -- | Read a file line-by-line using handles, and perform a fold over the lines. 2 | -- The fold is used here to calculate the number of lines in the file. 
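-- A usage sketch (the handle 'h' is hypothetical; the benchmark below opens
-- its own):
--
-- > lineCount <- foldLinesT (\n _ -> n + 1) (0 :: Int) h
--
-- counts the lines of 'h' without retaining their contents.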
3 | -- 4 | -- Tested in this benchmark: 5 | -- 6 | -- * Buffered, line-based IO 7 | -- 8 | {-# LANGUAGE BangPatterns #-} 9 | module Benchmarks.FoldLines 10 | ( benchmark 11 | ) where 12 | 13 | import Criterion (Benchmark, bgroup, bench) 14 | import System.IO 15 | import qualified Data.ByteString as B 16 | import qualified Data.Text as T 17 | import qualified Data.Text.IO as T 18 | 19 | benchmark :: FilePath -> IO Benchmark 20 | benchmark fp = return $ bgroup "ReadLines" 21 | [ bench "Text" $ withHandle $ foldLinesT (\n _ -> n + 1) (0 :: Int) 22 | , bench "ByteString" $ withHandle $ foldLinesB (\n _ -> n + 1) (0 :: Int) 23 | ] 24 | where 25 | withHandle f = do 26 | h <- openFile fp ReadMode 27 | hSetBuffering h (BlockBuffering (Just 16384)) 28 | x <- f h 29 | hClose h 30 | return x 31 | 32 | -- | Text line fold 33 | -- 34 | foldLinesT :: (a -> T.Text -> a) -> a -> Handle -> IO a 35 | foldLinesT f z0 h = go z0 36 | where 37 | go !z = do 38 | eof <- hIsEOF h 39 | if eof 40 | then return z 41 | else do 42 | l <- T.hGetLine h 43 | let z' = f z l in go z' 44 | {-# INLINE foldLinesT #-} 45 | 46 | -- | ByteString line fold 47 | -- 48 | foldLinesB :: (a -> B.ByteString -> a) -> a -> Handle -> IO a 49 | foldLinesB f z0 h = go z0 50 | where 51 | go !z = do 52 | eof <- hIsEOF h 53 | if eof 54 | then return z 55 | else do 56 | l <- B.hGetLine h 57 | let z' = f z l in go z' 58 | {-# INLINE foldLinesB #-} 59 | -------------------------------------------------------------------------------- /benchmarks/text-benchmarks.cabal: -------------------------------------------------------------------------------- 1 | name: text-benchmarks 2 | version: 0.0.0.0 3 | synopsis: Benchmarks for the text package 4 | description: Benchmarks for the text package 5 | homepage: https://bitbucket.org/bos/text 6 | license: BSD3 7 | license-file: ../LICENSE 8 | author: Jasper Van der Jeugt , 9 | Bryan O'Sullivan , 10 | Tom Harper , 11 | Duncan Coutts 12 | maintainer: jaspervdj@gmail.com 13 | category: Text 14 | build-type: Simple 15 | 16 | cabal-version: >=1.2 17 | 18 | flag llvm 19 | description: use LLVM 20 | default: False 21 | 22 | executable text-benchmarks 23 | hs-source-dirs: haskell .. 
24 | c-sources: ../cbits/cbits.c 25 | cbits/time_iconv.c 26 | main-is: Benchmarks.hs 27 | ghc-options: -Wall -O2 28 | if flag(llvm) 29 | ghc-options: -fllvm 30 | cpp-options: -DHAVE_DEEPSEQ -DINTEGER_GMP 31 | build-depends: base == 4.*, 32 | binary, 33 | blaze-builder, 34 | bytestring, 35 | bytestring-lexing, 36 | containers, 37 | criterion >= 0.6.0.1, 38 | deepseq, 39 | directory, 40 | filepath, 41 | ghc-prim, 42 | integer-gmp, 43 | stringsearch, 44 | utf8-string 45 | 46 | executable text-multilang 47 | hs-source-dirs: haskell 48 | main-is: Multilang.hs 49 | ghc-options: -Wall -O2 50 | build-depends: base == 4.*, 51 | bytestring, 52 | text, 53 | time 54 | -------------------------------------------------------------------------------- /tests/Tests/Utils.hs: -------------------------------------------------------------------------------- 1 | -- | Miscellaneous testing utilities 2 | -- 3 | {-# LANGUAGE ScopedTypeVariables #-} 4 | module Tests.Utils 5 | ( 6 | (=^=) 7 | , withRedirect 8 | , withTempFile 9 | ) where 10 | 11 | import Control.Exception (SomeException, bracket, bracket_, evaluate, try) 12 | import Control.Monad (when) 13 | import Debug.Trace (trace) 14 | import GHC.IO.Handle.Internals (withHandle) 15 | import System.Directory (removeFile) 16 | import System.IO (Handle, hClose, hFlush, hIsOpen, hIsWritable, openTempFile) 17 | import System.IO.Unsafe (unsafePerformIO) 18 | 19 | -- Ensure that two potentially bottom values (in the sense of crashing 20 | -- for some inputs, not looping infinitely) either both crash, or both 21 | -- give comparable results for some input. 22 | (=^=) :: (Eq a, Show a) => a -> a -> Bool 23 | i =^= j = unsafePerformIO $ do 24 | x <- try (evaluate i) 25 | y <- try (evaluate j) 26 | case (x,y) of 27 | (Left (_ :: SomeException), Left (_ :: SomeException)) 28 | -> return True 29 | (Right a, Right b) -> return (a == b) 30 | e -> trace ("*** Divergence: " ++ show e) return False 31 | infix 4 =^= 32 | {-# NOINLINE (=^=) #-} 33 | 34 | withTempFile :: (FilePath -> Handle -> IO a) -> IO a 35 | withTempFile = bracket (openTempFile "." "crashy.txt") cleanupTemp . uncurry 36 | where 37 | cleanupTemp (path,h) = do 38 | open <- hIsOpen h 39 | when open (hClose h) 40 | removeFile path 41 | 42 | withRedirect :: Handle -> Handle -> IO a -> IO a 43 | withRedirect tmp h = bracket_ swap swap 44 | where 45 | whenM p a = p >>= (`when` a) 46 | swap = do 47 | whenM (hIsOpen tmp) $ whenM (hIsWritable tmp) $ hFlush tmp 48 | whenM (hIsOpen h) $ whenM (hIsWritable h) $ hFlush h 49 | withHandle "spam" tmp $ \tmph -> do 50 | hh <- withHandle "spam" h $ \hh -> 51 | return (tmph,hh) 52 | return (hh,()) 53 | -------------------------------------------------------------------------------- /Data/Text/Unsafe/Base.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, MagicHash, UnboxedTuples #-} 2 | -- | 3 | -- Module : Data.Text.Unsafe.Base 4 | -- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan 5 | -- License : BSD-style 6 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 7 | -- duncan@haskell.org 8 | -- Stability : experimental 9 | -- Portability : portable 10 | -- 11 | -- A module containing unsafe operations, for very very careful use in 12 | -- heavily tested code. 
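-- A usage sketch (editorial; 'ptr' and 'i' are hypothetical and not part of
-- this module): the defensible pattern for 'inlinePerformIO' is a pure read
-- of memory that is already allocated and will not be mutated, e.g.
--
-- > inlinePerformIO (peekElemOff ptr i)
--
-- keeping in mind the warning below that no allocation may happen inside
-- such a block.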
13 | module Data.Text.Unsafe.Base 14 | ( 15 | inlineInterleaveST 16 | , inlinePerformIO 17 | ) where 18 | 19 | import GHC.ST (ST(..)) 20 | #if defined(__GLASGOW_HASKELL__) 21 | # if __GLASGOW_HASKELL__ >= 611 22 | import GHC.IO (IO(IO)) 23 | # else 24 | import GHC.IOBase (IO(IO)) 25 | # endif 26 | import GHC.Base (realWorld#) 27 | #endif 28 | 29 | 30 | -- | Just like unsafePerformIO, but we inline it. Big performance gains as 31 | -- it exposes lots of things to further inlining. /Very unsafe/. In 32 | -- particular, you should do no memory allocation inside an 33 | -- 'inlinePerformIO' block. On Hugs this is just @unsafePerformIO@. 34 | -- 35 | {-# INLINE inlinePerformIO #-} 36 | inlinePerformIO :: IO a -> a 37 | #if defined(__GLASGOW_HASKELL__) 38 | inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r 39 | #else 40 | inlinePerformIO = unsafePerformIO 41 | #endif 42 | 43 | -- | Allow an 'ST' computation to be deferred lazily. When passed an 44 | -- action of type 'ST' @s@ @a@, the action will only be performed when 45 | -- the value of @a@ is demanded. 46 | -- 47 | -- This function is identical to the normal unsafeInterleaveST, but is 48 | -- inlined and hence faster. 49 | -- 50 | -- /Note/: This operation is highly unsafe, as it can introduce 51 | -- externally visible non-determinism into an 'ST' action. 52 | inlineInterleaveST :: ST s a -> ST s a 53 | inlineInterleaveST (ST m) = ST $ \ s -> 54 | let r = case m s of (# _, res #) -> res in (# s, r #) 55 | {-# INLINE inlineInterleaveST #-} 56 | -------------------------------------------------------------------------------- /Data/Text/UnsafeShift.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MagicHash #-} 2 | 3 | -- | 4 | -- Module : Data.Text.UnsafeShift 5 | -- Copyright : (c) Bryan O'Sullivan 2009 6 | -- 7 | -- License : BSD-style 8 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 9 | -- duncan@haskell.org 10 | -- Stability : experimental 11 | -- Portability : GHC 12 | -- 13 | -- Fast, unchecked bit shifting functions. 14 | 15 | module Data.Text.UnsafeShift 16 | ( 17 | UnsafeShift(..) 18 | ) where 19 | 20 | -- import qualified Data.Bits as Bits 21 | import GHC.Base 22 | import GHC.Word 23 | 24 | -- | This is a workaround for poor optimisation in GHC 6.8.2. It 25 | -- fails to notice constant-width shifts, and adds a test and branch 26 | -- to every shift. This imposes about a 10% performance hit. 27 | -- 28 | -- These functions are undefined when the amount being shifted by is 29 | -- greater than the size in bits of a machine Int#. 
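-- A usage sketch (editorial; results worked out by hand, not part of the
-- original module):
--
-- > shiftL (0x00ff :: Word16) 8 == 0xff00
-- > shiftR (0xff00 :: Word16) 8 == 0x00ff
--
-- The caveat above still applies: the shift count must stay below the width
-- of a machine Int#.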
30 | class UnsafeShift a where 31 | shiftL :: a -> Int -> a 32 | shiftR :: a -> Int -> a 33 | 34 | instance UnsafeShift Word16 where 35 | {-# INLINE shiftL #-} 36 | shiftL (W16# x#) (I# i#) = W16# (narrow16Word# (x# `uncheckedShiftL#` i#)) 37 | 38 | {-# INLINE shiftR #-} 39 | shiftR (W16# x#) (I# i#) = W16# (x# `uncheckedShiftRL#` i#) 40 | 41 | instance UnsafeShift Word32 where 42 | {-# INLINE shiftL #-} 43 | shiftL (W32# x#) (I# i#) = W32# (narrow32Word# (x# `uncheckedShiftL#` i#)) 44 | 45 | {-# INLINE shiftR #-} 46 | shiftR (W32# x#) (I# i#) = W32# (x# `uncheckedShiftRL#` i#) 47 | 48 | instance UnsafeShift Word64 where 49 | {-# INLINE shiftL #-} 50 | shiftL (W64# x#) (I# i#) = W64# (x# `uncheckedShiftL64#` i#) 51 | 52 | {-# INLINE shiftR #-} 53 | shiftR (W64# x#) (I# i#) = W64# (x# `uncheckedShiftRL64#` i#) 54 | 55 | instance UnsafeShift Int where 56 | {-# INLINE shiftL #-} 57 | shiftL (I# x#) (I# i#) = I# (x# `iShiftL#` i#) 58 | 59 | {-# INLINE shiftR #-} 60 | shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) 61 | 62 | {- 63 | instance UnsafeShift Integer where 64 | {-# INLINE shiftL #-} 65 | shiftL = Bits.shiftL 66 | 67 | {-# INLINE shiftR #-} 68 | shiftR = Bits.shiftR 69 | -} 70 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Programs/Fold.hs: -------------------------------------------------------------------------------- 1 | -- | Benchmark which reformats paragraphs, like the @fold@ unix utility. 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Reading the file 6 | -- 7 | -- * Splitting into paragraphs 8 | -- 9 | -- * Reformatting the paragraphs to a certain line width 10 | -- 11 | -- * Concatenating the results using the text builder 12 | -- 13 | -- * Writing back to a handle 14 | -- 15 | {-# LANGUAGE OverloadedStrings #-} 16 | module Benchmarks.Programs.Fold 17 | ( benchmark 18 | ) where 19 | 20 | import Data.List (foldl') 21 | import Data.List (intersperse) 22 | import Data.Monoid (mempty, mappend, mconcat) 23 | import System.IO (Handle) 24 | import Criterion (Benchmark, bench) 25 | import qualified Data.Text as T 26 | import qualified Data.Text.IO as T 27 | import qualified Data.Text.Lazy.Builder as TLB 28 | import qualified Data.Text.Lazy as TL 29 | import qualified Data.Text.Lazy.IO as TL 30 | 31 | benchmark :: FilePath -> Handle -> IO Benchmark 32 | benchmark i o = return $ 33 | bench "Fold" $ T.readFile i >>= TL.hPutStr o . fold 80 34 | 35 | -- | We represent a paragraph by a word list 36 | -- 37 | type Paragraph = [T.Text] 38 | 39 | -- | Fold a text 40 | -- 41 | fold :: Int -> T.Text -> TL.Text 42 | fold maxWidth = TLB.toLazyText . mconcat . 43 | intersperse "\n\n" . map (foldParagraph maxWidth) . paragraphs 44 | 45 | -- | Fold a paragraph 46 | -- 47 | foldParagraph :: Int -> Paragraph -> TLB.Builder 48 | foldParagraph _ [] = mempty 49 | foldParagraph max' (w : ws) = fst $ foldl' go (TLB.fromText w, T.length w) ws 50 | where 51 | go (builder, width) word 52 | | width + len + 1 <= max' = 53 | (builder `mappend` " " `mappend` word', width + len + 1) 54 | | otherwise = 55 | (builder `mappend` "\n" `mappend` word', len) 56 | where 57 | word' = TLB.fromText word 58 | len = T.length word 59 | 60 | -- | Divide a text into paragraphs 61 | -- 62 | paragraphs :: T.Text -> [Paragraph] 63 | paragraphs = splitParagraphs . map T.words .
T.lines 64 | where 65 | splitParagraphs ls = case break null ls of 66 | ([], []) -> [] 67 | (p, []) -> [concat p] 68 | (p, lr) -> concat p : splitParagraphs (dropWhile null lr) 69 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/DecodeUtf8.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ForeignFunctionInterface #-} 2 | 3 | -- | Test decoding of UTF-8 4 | -- 5 | -- Tested in this benchmark: 6 | -- 7 | -- * Decoding bytes using UTF-8 8 | -- 9 | -- In some tests: 10 | -- 11 | -- * Taking the length of the result 12 | -- 13 | -- * Taking the init of the result 14 | -- 15 | -- The latter are used for testing stream fusion. 16 | -- 17 | module Benchmarks.DecodeUtf8 18 | ( benchmark 19 | ) where 20 | 21 | import Foreign.C.Types 22 | import Data.ByteString.Internal (ByteString(..)) 23 | import Foreign.Ptr (Ptr, plusPtr) 24 | import Foreign.ForeignPtr (withForeignPtr) 25 | import Data.Word (Word8) 26 | import qualified Criterion as C 27 | import Criterion (Benchmark, bgroup, nf) 28 | import qualified Codec.Binary.UTF8.Generic as U8 29 | import qualified Data.ByteString as B 30 | import qualified Data.ByteString.Lazy as BL 31 | import qualified Data.Text as T 32 | import qualified Data.Text.Encoding as T 33 | import qualified Data.Text.Lazy as TL 34 | import qualified Data.Text.Lazy.Encoding as TL 35 | 36 | benchmark :: String -> FilePath -> IO Benchmark 37 | benchmark kind fp = do 38 | bs <- B.readFile fp 39 | lbs <- BL.readFile fp 40 | let bench name = C.bench (name ++ "+" ++ kind) 41 | return $ bgroup "DecodeUtf8" 42 | [ bench "Strict" $ nf T.decodeUtf8 bs 43 | , bench "IConv" $ iconv bs 44 | , bench "StrictLength" $ nf (T.length . T.decodeUtf8) bs 45 | , bench "StrictInitLength" $ nf (T.length . T.init . T.decodeUtf8) bs 46 | , bench "Lazy" $ nf TL.decodeUtf8 lbs 47 | , bench "LazyLength" $ nf (TL.length . TL.decodeUtf8) lbs 48 | , bench "LazyInitLength" $ nf (TL.length . TL.init . TL.decodeUtf8) lbs 49 | , bench "StrictStringUtf8" $ nf U8.toString bs 50 | , bench "StrictStringUtf8Length" $ nf (length . U8.toString) bs 51 | , bench "LazyStringUtf8" $ nf U8.toString lbs 52 | , bench "LazyStringUtf8Length" $ nf (length . 
U8.toString) lbs 53 | ] 54 | 55 | iconv :: ByteString -> IO CInt 56 | iconv (PS fp off len) = withForeignPtr fp $ \ptr -> 57 | time_iconv (ptr `plusPtr` off) (fromIntegral len) 58 | 59 | foreign import ccall unsafe time_iconv :: Ptr Word8 -> CSize -> IO CInt 60 | -------------------------------------------------------------------------------- /.hgtags: -------------------------------------------------------------------------------- 1 | 0edd5dc96024e6c0f56f175ffa673f9178cece8c 0.9.0.0 2 | 1de0da99d31ba212e179d88957cf5f3b9c4facd7 0.7.0.1 3 | 30575507578cf04b5b1501ee23b1faa83315eb88 0.1 4 | 321e61f17630dbd346c1c9991831572f964b7d5a 0.4 5 | 34cef27f87287cd3c17d78ac99e9de2d6c8a1200 0.6 6 | 3cbff6d9bca135a85e428a7f6c4e15739a562b8b 0.7.1.0 7 | 587c8661393f83cb2b9452f6cc1d6885e52fe544 0.8.1.0 8 | 61c322bd5b6446255870422f854266396169a7e5 0.5 9 | 9adb5edd9b6407335e8772ea86036b938471fc89 0.2 10 | b4fdbef1c1027a70b95c1a81747d968890acc3f6 0.3 11 | de7cb0cb2a72606126c357b860bb0dbf7f145ad2 0.9.0.1 12 | f2c9b1c019e3277926378d7b5a604f1c995f0202 0.7 13 | f3d625f2d0730cb6adf8506a1f37762178c9aea6 0.10.0.0 14 | 99ea36957b1d47145d70f2ee8b13f1ddfd09245c 0.10.0.1 15 | 1b7889216e78b00d69b32c48782111bcd935978d 0.10.0.2 16 | 75157eaec6eb38574ad5c347e75c48dcb7262dee 0.11.0.0 17 | 0d7bc7bd77ec5a8f6d064de59d459e5eaf906b0f 0.11.0.1 18 | a9d582a75dea9a4417883019d5110a35e4c07265 0.11.0.2 19 | 36b2a3aa661892ea25101199220383ba6d579abb 0.11.0.3 20 | 06e407bcdc9a3905abf9551062ecc6f92abcdde5 0.11.0.4 21 | 94bcf16e812f6fb49b87598673e17270f48c2bf1 0.11.0.5 22 | 784a32ffbec0b77adb43e9a64328db0482007309 0.11.0.6 23 | c808123a3d37f25004d2aad8f3a395acfcf9122f 0.11.0.7 24 | 679826e742b239c5dd0c1fa4ebe98aa4bfe02a00 0.11.0.8 25 | b49eebfa5673da89a23939d523c9de9edbd410b0 0.11.1.0 26 | 9c820a2ecaea01bc614d8e6bccf90e57431bdcbb 0.11.1.1 27 | ed3a60ec627af6ba8af321009aa00224b3296e47 0.11.1.2 28 | b75d3041d275e8e76b26605b0d426a572ddf1249 0.11.1.3 29 | 53906ad0c7e64f6c9a5df569e7cc642579aab8fc 0.11.1.5 30 | 9d6d3a9690ade506897c072060eb92868e229d5c 0.11.1.6 31 | 5ac062eace36b333566d1676979a57039a188840 0.11.1.7 32 | 9f01361a73071936b6ab0bdb879f1a45cca6577a 0.11.1.8 33 | 5dce2a934be53e74310fedffb51d7bd3e61fa993 0.11.1.9 34 | 9f47a2cfc9e51fd622d7553f08ad2ac1faad0438 0.11.1.10 35 | 9f47a2cfc9e51fd622d7553f08ad2ac1faad0438 0.11.1.10 36 | 407937739e9e764f1ae0f1f9ca454c42dca38772 0.11.1.10 37 | 8b981edd27befa4c2dd334fcb7db22ac67e22b67 0.11.1.11 38 | 204da16b5098531bdf858c388e2620238ef2aa5e 0.11.1.12 39 | 6a3d847a56a69d0514a79cb212cb218271ad0917 0.11.1.13 40 | 1d2c6fa9092c6a4000b2abdd9d01f3efcd477be5 0.11.2.0 41 | 78219784cf3652cc662805bf2971bd62d80210a9 0.11.2.1 42 | 4297307ebc11ad677cfba6b40319e7e5e2c0cfee 0.11.2.3 43 | 7fa79662b66aade97fe49394977213fe6432942e 0.11.3.0 44 | d99cd091cdf71ce807a4255f6cc509c3154f51ea 0.11.3.1 45 | -------------------------------------------------------------------------------- /tests-and-benchmarks.markdown: -------------------------------------------------------------------------------- 1 | Tests and benchmarks 2 | ==================== 3 | 4 | Prerequisites 5 | ------------- 6 | 7 | To run the tests and benchmarks, you will need the test data, which 8 | you can clone from one of the following locations: 9 | 10 | * Mercurial master repository: 11 | [bitbucket.org/bos/text-test-data](https://bitbucket.org/bos/text-test-data) 12 | 13 | * Git mirror repository: 14 | [github.com/bos/text-test-data](https://github.com/bos/text-test-data) 15 | 16 | You should clone that repository into the `tests` subdirectory (your 17 | 
clone must be named `text-test-data` locally), then run `make -C 18 | tests/text-test-data` to uncompress the test files. Many tests and 19 | benchmarks will fail if the test files are missing. 20 | 21 | Functional tests 22 | ---------------- 23 | 24 | The functional tests are located in the `tests` subdirectory. An overview of 25 | what's in that directory: 26 | 27 | Makefile Has targets for common tasks 28 | Tests Source files of the testing code 29 | scripts Various utility scripts 30 | text-tests.cabal Cabal file that compiles all tests 31 | 32 | The `text-tests.cabal` file builds: 33 | 34 | - A copy of the text library, sharing the source code, but exposing all internal 35 | modules, for testing purposes 36 | - The different test suites 37 | 38 | To compile, run all tests, and generate a coverage report, simply use `make`. 39 | 40 | Benchmarks 41 | ---------- 42 | 43 | The benchmarks are located in the `benchmarks` subdirectory. An overview of 44 | what's in that directory: 45 | 46 | Makefile Has targets for common tasks 47 | haskell Source files of the Haskell benchmarks 48 | python Python implementations of some benchmarks 49 | ruby Ruby implementations of some benchmarks 50 | text-benchmarks.cabal Cabal file which compiles all benchmarks 51 | 52 | To compile the benchmarks, navigate to the `benchmarks` subdirectory and run 53 | `cabal configure && cabal build`. Then, you can run the benchmarks using: 54 | 55 | ./dist/build/text-benchmarks/text-benchmarks 56 | 57 | However, since there are quite a lot of benchmarks, you usually don't want to 58 | run them all. Instead, use the `-l` flag to get a list of benchmarks: 59 | 60 | ./dist/build/text-benchmarks/text-benchmarks -l 61 | 62 | Then run the ones you want to inspect. If you want to configure the benchmarks 63 | further, the exact parameters can be changed in `Benchmarks.hs`. 64 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Programs/Sort.hs: -------------------------------------------------------------------------------- 1 | -- | This benchmark sorts the lines of a file, like the @sort@ unix utility. 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Reading the file 6 | -- 7 | -- * Splitting into lines 8 | -- 9 | -- * Sorting the lines 10 | -- 11 | -- * Joining the lines 12 | -- 13 | -- * Writing back to a handle 14 | -- 15 | {-# LANGUAGE OverloadedStrings #-} 16 | module Benchmarks.Programs.Sort 17 | ( benchmark 18 | ) where 19 | 20 | import Criterion (Benchmark, bgroup, bench) 21 | import Data.Monoid (mconcat) 22 | import System.IO (Handle, hPutStr) 23 | import qualified Data.ByteString as B 24 | import qualified Data.ByteString.Char8 as BC 25 | import qualified Data.ByteString.Lazy as BL 26 | import qualified Data.ByteString.Lazy.Char8 as BLC 27 | import qualified Data.List as L 28 | import qualified Data.Text as T 29 | import qualified Data.Text.Encoding as T 30 | import qualified Data.Text.IO as T 31 | import qualified Data.Text.Lazy as TL 32 | import qualified Data.Text.Lazy.Builder as TLB 33 | import qualified Data.Text.Lazy.Encoding as TL 34 | import qualified Data.Text.Lazy.IO as TL 35 | 36 | benchmark :: FilePath -> Handle -> IO Benchmark 37 | benchmark i o = return $ bgroup "Sort" 38 | [ bench "String" $ readFile i >>= hPutStr o . string 39 | , bench "ByteString" $ B.readFile i >>= B.hPutStr o . byteString 40 | , bench "LazyByteString" $ BL.readFile i >>= BL.hPutStr o . lazyByteString 41 | , bench "Text" $ T.readFile i >>= T.hPutStr o .
text 42 | , bench "LazyText" $ TL.readFile i >>= TL.hPutStr o . lazyText 43 | , bench "TextByteString" $ B.readFile i >>= 44 | B.hPutStr o . T.encodeUtf8 . text . T.decodeUtf8 45 | , bench "LazyTextByteString" $ BL.readFile i >>= 46 | BL.hPutStr o . TL.encodeUtf8 . lazyText . TL.decodeUtf8 47 | , bench "TextBuilder" $ B.readFile i >>= 48 | BL.hPutStr o . TL.encodeUtf8 . textBuilder . T.decodeUtf8 49 | ] 50 | 51 | string :: String -> String 52 | string = unlines . L.sort . lines 53 | 54 | byteString :: B.ByteString -> B.ByteString 55 | byteString = BC.unlines . L.sort . BC.lines 56 | 57 | lazyByteString :: BL.ByteString -> BL.ByteString 58 | lazyByteString = BLC.unlines . L.sort . BLC.lines 59 | 60 | text :: T.Text -> T.Text 61 | text = T.unlines . L.sort . T.lines 62 | 63 | lazyText :: TL.Text -> TL.Text 64 | lazyText = TL.unlines . L.sort . TL.lines 65 | 66 | -- | Text variant using a builder monoid for the final concatenation 67 | -- 68 | textBuilder :: T.Text -> TL.Text 69 | textBuilder = TLB.toLazyText . mconcat . L.intersperse (TLB.singleton '\n') . 70 | map TLB.fromText . L.sort . T.lines 71 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Builder.hs: -------------------------------------------------------------------------------- 1 | -- | Testing the internal builder monoid 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Concatenating many small strings using a builder 6 | -- 7 | {-# LANGUAGE OverloadedStrings #-} 8 | module Benchmarks.Builder 9 | ( benchmark 10 | ) where 11 | 12 | import Criterion (Benchmark, bgroup, bench, nf) 13 | import Data.Binary.Builder as B 14 | import Data.ByteString.Char8 () 15 | import Data.Monoid (mconcat, mempty) 16 | import qualified Blaze.ByteString.Builder as Blaze 17 | import qualified Blaze.ByteString.Builder.Char.Utf8 as Blaze 18 | import qualified Data.ByteString as SB 19 | import qualified Data.ByteString.Lazy as LB 20 | import qualified Data.Text as T 21 | import qualified Data.Text.Lazy as LT 22 | import qualified Data.Text.Lazy.Builder as LTB 23 | import qualified Data.Text.Lazy.Builder.Int as Int 24 | import Data.Int (Int64) 25 | 26 | benchmark :: IO Benchmark 27 | benchmark = return $ bgroup "Builder" 28 | [ bgroup "Comparison" 29 | [ bench "LazyText" $ nf 30 | (LT.length . LTB.toLazyText . mconcat . map LTB.fromText) texts 31 | , bench "Binary" $ nf 32 | (LB.length . B.toLazyByteString . mconcat . map B.fromByteString) 33 | byteStrings 34 | , bench "Blaze" $ nf 35 | (LB.length . Blaze.toLazyByteString . mconcat . map Blaze.fromString) 36 | strings 37 | ] 38 | , bgroup "Int" 39 | [ bgroup "Decimal" 40 | [ bgroup "Positive" . 41 | flip map numbers $ \n -> 42 | (bench (show (length (show n))) $ nf (LTB.toLazyText . Int.decimal) n) 43 | , bgroup "Negative" . 44 | flip map numbers $ \m -> 45 | let n = negate m in 46 | (bench (show (length (show n))) $ nf (LTB.toLazyText . Int.decimal) n) 47 | , bench "Empty" $ nf LTB.toLazyText mempty 48 | , bgroup "Show" . 
49 | flip map numbers $ \n -> 50 | (bench (show (length (show n))) $ nf show n) 51 | ] 52 | ] 53 | ] 54 | where 55 | numbers :: [Int64] 56 | numbers = [ 57 | 6, 14, 500, 9688, 10654, 620735, 5608880, 37010612, 58 | 731223504, 5061580596, 24596952933, 711732309084, 2845910093839, 59 | 54601756118340, 735159434806159, 3619097625502435, 95777227510267124, 60 | 414944309510675693, 8986407456998704019 61 | ] 62 | 63 | texts :: [T.Text] 64 | texts = take 200000 $ cycle ["foo", "λx", "由の"] 65 | {-# NOINLINE texts #-} 66 | 67 | -- Note that the non-ascii characters will be chopped 68 | byteStrings :: [SB.ByteString] 69 | byteStrings = take 200000 $ cycle ["foo", "λx", "由の"] 70 | {-# NOINLINE byteStrings #-} 71 | 72 | -- Note that the non-ascii characters will be chopped 73 | strings :: [String] 74 | strings = take 200000 $ cycle ["foo", "λx", "由の"] 75 | {-# NOINLINE strings #-} 76 | -------------------------------------------------------------------------------- /tests/Tests/Regressions.hs: -------------------------------------------------------------------------------- 1 | -- | Regression tests for specific bugs. 2 | -- 3 | {-# LANGUAGE OverloadedStrings, ScopedTypeVariables #-} 4 | module Tests.Regressions 5 | ( 6 | tests 7 | ) where 8 | 9 | import Control.Exception (SomeException, handle) 10 | import System.IO 11 | import Test.HUnit (assertBool, assertFailure) 12 | import qualified Data.ByteString as B 13 | import Data.ByteString.Char8 () 14 | import qualified Data.ByteString.Lazy as LB 15 | import qualified Data.Text as T 16 | import qualified Data.Text.Encoding as TE 17 | import qualified Data.Text.IO as T 18 | import qualified Data.Text.Lazy as LT 19 | import qualified Data.Text.Lazy.Encoding as LE 20 | import qualified Test.Framework as F 21 | import qualified Test.Framework.Providers.HUnit as F 22 | 23 | import Tests.Utils (withTempFile) 24 | 25 | -- Reported by Michael Snoyman: UTF-8 encoding a large lazy bytestring 26 | -- caused either a segfault or attempt to allocate a negative number 27 | -- of bytes. 28 | lazy_encode_crash :: IO () 29 | lazy_encode_crash = withTempFile $ \ _ h -> 30 | LB.hPut h . LE.encodeUtf8 . LT.pack . replicate 100000 $ 'a' 31 | 32 | -- Reported by Pieter Laeremans: attempting to read an incorrectly 33 | -- encoded file can result in a crash in the RTS (i.e. not merely an 34 | -- exception). 35 | hGetContents_crash :: IO () 36 | hGetContents_crash = withTempFile $ \ path h -> do 37 | B.hPut h (B.pack [0x78, 0xc4 ,0x0a]) >> hClose h 38 | h' <- openFile path ReadMode 39 | hSetEncoding h' utf8 40 | handle (\(_::SomeException) -> return ()) $ 41 | T.hGetContents h' >> assertFailure "T.hGetContents should crash" 42 | 43 | -- Reported by Ian Lynagh: attempting to allocate a sufficiently large 44 | -- string (via either Array.new or Text.replicate) could result in an 45 | -- integer overflow. 46 | replicate_crash :: IO () 47 | replicate_crash = handle (\(_::SomeException) -> return ()) $ 48 | T.replicate (2^power) "0123456789abcdef" `seq` 49 | assertFailure "T.replicate should crash" 50 | where 51 | power | maxBound == (2147483647::Int) = 28 52 | | otherwise = 60 :: Int 53 | 54 | -- Reported by John Millikin: a UTF-8 decode error handler could 55 | -- return a bogus substitution character, which we would write without 56 | -- checking. 
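-- For contrast, a well-behaved handler substitutes U+FFFD; an editorial
-- sketch ('lenientDecode' lives in Data.Text.Encoding.Error, which this
-- module does not import):
--
-- > decodeUtf8With lenientDecode "\x80" == "\xfffd"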
57 | utf8_decode_unsafe :: IO () 58 | utf8_decode_unsafe = do 59 | let t = TE.decodeUtf8With (\_ _ -> Just '\xdc00') "\x80" 60 | assertBool "broken error recovery shouldn't break us" (t == "\xfffd") 61 | 62 | tests :: F.Test 63 | tests = F.testGroup "Regressions" 64 | [ F.testCase "hGetContents_crash" hGetContents_crash 65 | , F.testCase "lazy_encode_crash" lazy_encode_crash 66 | , F.testCase "replicate_crash" replicate_crash 67 | , F.testCase "utf8_decode_unsafe" utf8_decode_unsafe 68 | ] 69 | -------------------------------------------------------------------------------- /Data/Text/UnsafeChar.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, MagicHash #-} 2 | 3 | -- | 4 | -- Module : Data.Text.UnsafeChar 5 | -- Copyright : (c) 2008, 2009 Tom Harper, 6 | -- (c) 2009, 2010 Bryan O'Sullivan, 7 | -- (c) 2009 Duncan Coutts 8 | -- 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 11 | -- duncan@haskell.org 12 | -- Stability : experimental 13 | -- Portability : GHC 14 | -- 15 | -- Fast character manipulation functions. 16 | module Data.Text.UnsafeChar 17 | ( 18 | ord 19 | , unsafeChr 20 | , unsafeChr8 21 | , unsafeChr32 22 | , unsafeWrite 23 | -- , unsafeWriteRev 24 | ) where 25 | 26 | #ifdef ASSERTS 27 | import Control.Exception (assert) 28 | #endif 29 | import Control.Monad.ST (ST) 30 | import Data.Bits ((.&.)) 31 | import Data.Text.UnsafeShift (shiftR) 32 | import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#) 33 | import GHC.Word (Word8(..), Word16(..), Word32(..)) 34 | import qualified Data.Text.Array as A 35 | 36 | ord :: Char -> Int 37 | ord (C# c#) = I# (ord# c#) 38 | {-# INLINE ord #-} 39 | 40 | unsafeChr :: Word16 -> Char 41 | unsafeChr (W16# w#) = C# (chr# (word2Int# w#)) 42 | {-# INLINE unsafeChr #-} 43 | 44 | unsafeChr8 :: Word8 -> Char 45 | unsafeChr8 (W8# w#) = C# (chr# (word2Int# w#)) 46 | {-# INLINE unsafeChr8 #-} 47 | 48 | unsafeChr32 :: Word32 -> Char 49 | unsafeChr32 (W32# w#) = C# (chr# (word2Int# w#)) 50 | {-# INLINE unsafeChr32 #-} 51 | 52 | -- | Write a character into the array at the given offset. Returns 53 | -- the number of 'Word16's written. 54 | unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int 55 | unsafeWrite marr i c 56 | | n < 0x10000 = do 57 | #if defined(ASSERTS) 58 | assert (i >= 0) . assert (i < A.length marr) $ return () 59 | #endif 60 | A.unsafeWrite marr i (fromIntegral n) 61 | return 1 62 | | otherwise = do 63 | #if defined(ASSERTS) 64 | assert (i >= 0) . assert (i < A.length marr - 1) $ return () 65 | #endif 66 | A.unsafeWrite marr i lo 67 | A.unsafeWrite marr (i+1) hi 68 | return 2 69 | where n = ord c 70 | m = n - 0x10000 71 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 72 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 73 | {-# INLINE unsafeWrite #-} 74 | 75 | {- 76 | unsafeWriteRev :: A.MArray s Word16 -> Int -> Char -> ST s Int 77 | unsafeWriteRev marr i c 78 | | n < 0x10000 = do 79 | assert (i >= 0) . assert (i < A.length marr) $ 80 | A.unsafeWrite marr i (fromIntegral n) 81 | return (i-1) 82 | | otherwise = do 83 | assert (i >= 1) . assert (i < A.length marr) $ 84 | A.unsafeWrite marr (i-1) lo 85 | A.unsafeWrite marr i hi 86 | return (i-2) 87 | where n = ord c 88 | m = n - 0x10000 89 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 90 | hi = fromIntegral $ (m .&. 
0x3FF) + 0xDC00 91 | {-# INLINE unsafeWriteRev #-} 92 | -} 93 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks.hs: -------------------------------------------------------------------------------- 1 | -- | Main module to run the micro benchmarks 2 | -- 3 | {-# LANGUAGE OverloadedStrings #-} 4 | module Main 5 | ( main 6 | ) where 7 | 8 | import Criterion.Main (Benchmark, defaultMain, bgroup) 9 | import System.FilePath ((</>)) 10 | import System.IO (IOMode (WriteMode), openFile, hSetEncoding, utf8) 11 | 12 | import qualified Benchmarks.Builder as Builder 13 | import qualified Benchmarks.DecodeUtf8 as DecodeUtf8 14 | import qualified Benchmarks.EncodeUtf8 as EncodeUtf8 15 | import qualified Benchmarks.Equality as Equality 16 | import qualified Benchmarks.FileRead as FileRead 17 | import qualified Benchmarks.FoldLines as FoldLines 18 | import qualified Benchmarks.Pure as Pure 19 | import qualified Benchmarks.ReadNumbers as ReadNumbers 20 | import qualified Benchmarks.Replace as Replace 21 | import qualified Benchmarks.Search as Search 22 | import qualified Benchmarks.Stream as Stream 23 | import qualified Benchmarks.WordFrequencies as WordFrequencies 24 | 25 | import qualified Benchmarks.Programs.BigTable as Programs.BigTable 26 | import qualified Benchmarks.Programs.Cut as Programs.Cut 27 | import qualified Benchmarks.Programs.Fold as Programs.Fold 28 | import qualified Benchmarks.Programs.Sort as Programs.Sort 29 | import qualified Benchmarks.Programs.StripTags as Programs.StripTags 30 | import qualified Benchmarks.Programs.Throughput as Programs.Throughput 31 | 32 | main :: IO () 33 | main = benchmarks >>= defaultMain 34 | 35 | benchmarks :: IO [Benchmark] 36 | benchmarks = do 37 | sink <- openFile "/dev/null" WriteMode 38 | hSetEncoding sink utf8 39 | 40 | -- Traditional benchmarks 41 | bs <- sequence 42 | [ Builder.benchmark 43 | , DecodeUtf8.benchmark "html" (tf "libya-chinese.html") 44 | , DecodeUtf8.benchmark "xml" (tf "yiwiki.xml") 45 | , DecodeUtf8.benchmark "ascii" (tf "ascii.txt") 46 | , DecodeUtf8.benchmark "russian" (tf "russian.txt") 47 | , DecodeUtf8.benchmark "japanese" (tf "japanese.txt") 48 | , EncodeUtf8.benchmark "επανάληψη 竺法蘭共譯" 49 | , Equality.benchmark (tf "japanese.txt") 50 | , FileRead.benchmark (tf "russian.txt") 51 | , FoldLines.benchmark (tf "russian.txt") 52 | , Pure.benchmark "tiny" (tf "tiny.txt") 53 | , Pure.benchmark "japanese" (tf "japanese.txt") 54 | , ReadNumbers.benchmark (tf "numbers.txt") 55 | , Replace.benchmark (tf "russian.txt") "принимая" "своем" 56 | , Search.benchmark (tf "russian.txt") "принимая" 57 | , Stream.benchmark (tf "russian.txt") 58 | , WordFrequencies.benchmark (tf "russian.txt") 59 | ] 60 | 61 | -- Program-like benchmarks 62 | ps <- bgroup "Programs" `fmap` sequence 63 | [ Programs.BigTable.benchmark sink 64 | , Programs.Cut.benchmark (tf "russian.txt") sink 20 40 65 | , Programs.Fold.benchmark (tf "russian.txt") sink 66 | , Programs.Sort.benchmark (tf "russian.txt") sink 67 | , Programs.StripTags.benchmark (tf "yiwiki.xml") sink 68 | , Programs.Throughput.benchmark (tf "russian.txt") sink 69 | ] 70 | 71 | return $ bs ++ [ps] 72 | where 73 | -- Location of a test file 74 | tf = ("../tests/text-test-data" </>) 75 | -------------------------------------------------------------------------------- /tests/text-tests.cabal: -------------------------------------------------------------------------------- 1 | name: text-tests 2 | version: 0.0.0.0 3 | synopsis: Functional tests for the
text package 4 | description: Functional tests for the text package 5 | homepage: https://bitbucket.org/bos/text 6 | license: BSD3 7 | license-file: ../LICENSE 8 | author: Jasper Van der Jeugt , 9 | Bryan O'Sullivan , 10 | Tom Harper , 11 | Duncan Coutts 12 | maintainer: Bryan O'Sullivan 13 | category: Text 14 | build-type: Simple 15 | 16 | cabal-version: >=1.8 17 | 18 | flag hpc 19 | description: Enable HPC to generate coverage reports 20 | default: False 21 | 22 | executable text-tests 23 | main-is: Tests.hs 24 | 25 | ghc-options: 26 | -Wall -threaded -O0 -rtsopts 27 | 28 | if flag(hpc) 29 | ghc-options: 30 | -fhpc 31 | 32 | cpp-options: 33 | -DASSERTS 34 | -DHAVE_DEEPSEQ 35 | 36 | build-depends: 37 | HUnit >= 1.2, 38 | QuickCheck >= 2.4, 39 | base == 4.*, 40 | bytestring, 41 | deepseq, 42 | directory, 43 | random, 44 | test-framework >= 0.4, 45 | test-framework-hunit >= 0.2, 46 | test-framework-quickcheck2 >= 0.2, 47 | text-tests 48 | 49 | executable text-tests-stdio 50 | main-is: Tests/IO.hs 51 | 52 | ghc-options: 53 | -Wall -threaded -rtsopts 54 | 55 | -- Optional HPC support 56 | if flag(hpc) 57 | ghc-options: 58 | -fhpc 59 | 60 | build-depends: 61 | text-tests, 62 | base >= 4 && < 5 63 | 64 | library 65 | hs-source-dirs: .. 66 | c-sources: ../cbits/cbits.c 67 | exposed-modules: 68 | Data.Text 69 | Data.Text.Array 70 | Data.Text.Encoding 71 | Data.Text.Encoding.Error 72 | Data.Text.Encoding.Fusion 73 | Data.Text.Encoding.Fusion.Common 74 | Data.Text.Encoding.Utf16 75 | Data.Text.Encoding.Utf32 76 | Data.Text.Encoding.Utf8 77 | Data.Text.Foreign 78 | Data.Text.Fusion 79 | Data.Text.Fusion.CaseMapping 80 | Data.Text.Fusion.Common 81 | Data.Text.Fusion.Internal 82 | Data.Text.Fusion.Size 83 | Data.Text.IO 84 | Data.Text.IO.Internal 85 | Data.Text.Internal 86 | Data.Text.Lazy 87 | Data.Text.Lazy.Builder 88 | Data.Text.Lazy.Builder.Functions 89 | Data.Text.Lazy.Builder.Int 90 | Data.Text.Lazy.Builder.Int.Digits 91 | Data.Text.Lazy.Builder.Internal 92 | Data.Text.Lazy.Builder.RealFloat 93 | Data.Text.Lazy.Builder.RealFloat.Functions 94 | Data.Text.Lazy.Encoding 95 | Data.Text.Lazy.Encoding.Fusion 96 | Data.Text.Lazy.Fusion 97 | Data.Text.Lazy.IO 98 | Data.Text.Lazy.Internal 99 | Data.Text.Lazy.Read 100 | Data.Text.Lazy.Search 101 | Data.Text.Private 102 | Data.Text.Read 103 | Data.Text.Search 104 | Data.Text.Unsafe 105 | Data.Text.Unsafe.Base 106 | Data.Text.UnsafeChar 107 | Data.Text.UnsafeShift 108 | Data.Text.Util 109 | 110 | if flag(hpc) 111 | ghc-options: 112 | -fhpc 113 | 114 | cpp-options: 115 | -DHAVE_DEEPSEQ 116 | -DASSERTS 117 | -DINTEGER_GMP 118 | 119 | build-depends: 120 | array, 121 | base == 4.*, 122 | bytestring, 123 | deepseq, 124 | ghc-prim, 125 | integer-gmp 126 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Programs/Cut.hs: -------------------------------------------------------------------------------- 1 | -- | Cut into a file, selecting certain columns (e.g. 
columns 10 to 40) 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Reading the file 6 | -- 7 | -- * Splitting into lines 8 | -- 9 | -- * Taking a number of characters from the lines 10 | -- 11 | -- * Joining the lines 12 | -- 13 | -- * Writing back to a handle 14 | -- 15 | module Benchmarks.Programs.Cut 16 | ( benchmark 17 | ) where 18 | 19 | import Criterion (Benchmark, bgroup, bench) 20 | import System.IO (Handle, hPutStr) 21 | import qualified Data.ByteString as B 22 | import qualified Data.ByteString.Char8 as BC 23 | import qualified Data.ByteString.Lazy as BL 24 | import qualified Data.ByteString.Lazy.Char8 as BLC 25 | import qualified Data.Text as T 26 | import qualified Data.Text.Encoding as T 27 | import qualified Data.Text.IO as T 28 | import qualified Data.Text.Lazy as TL 29 | import qualified Data.Text.Lazy.Encoding as TL 30 | import qualified Data.Text.Lazy.IO as TL 31 | 32 | benchmark :: FilePath -> Handle -> Int -> Int -> IO Benchmark 33 | benchmark p sink from to = return $ bgroup "Cut" 34 | [ bench' "String" string 35 | , bench' "ByteString" byteString 36 | , bench' "LazyByteString" lazyByteString 37 | , bench' "Text" text 38 | , bench' "LazyText" lazyText 39 | , bench' "TextByteString" textByteString 40 | , bench' "LazyTextByteString" lazyTextByteString 41 | ] 42 | where 43 | bench' n s = bench n (s p sink from to) 44 | 45 | string :: FilePath -> Handle -> Int -> Int -> IO () 46 | string fp sink from to = do 47 | s <- readFile fp 48 | hPutStr sink $ cut s 49 | where 50 | cut = unlines . map (take (to - from) . drop from) . lines 51 | 52 | byteString :: FilePath -> Handle -> Int -> Int -> IO () 53 | byteString fp sink from to = do 54 | bs <- B.readFile fp 55 | B.hPutStr sink $ cut bs 56 | where 57 | cut = BC.unlines . map (B.take (to - from) . B.drop from) . BC.lines 58 | 59 | lazyByteString :: FilePath -> Handle -> Int -> Int -> IO () 60 | lazyByteString fp sink from to = do 61 | bs <- BL.readFile fp 62 | BL.hPutStr sink $ cut bs 63 | where 64 | cut = BLC.unlines . map (BL.take (to' - from') . BL.drop from') . BLC.lines 65 | from' = fromIntegral from 66 | to' = fromIntegral to 67 | 68 | text :: FilePath -> Handle -> Int -> Int -> IO () 69 | text fp sink from to = do 70 | t <- T.readFile fp 71 | T.hPutStr sink $ cut t 72 | where 73 | cut = T.unlines . map (T.take (to - from) . T.drop from) . T.lines 74 | 75 | lazyText :: FilePath -> Handle -> Int -> Int -> IO () 76 | lazyText fp sink from to = do 77 | t <- TL.readFile fp 78 | TL.hPutStr sink $ cut t 79 | where 80 | cut = TL.unlines . map (TL.take (to' - from') . TL.drop from') . TL.lines 81 | from' = fromIntegral from 82 | to' = fromIntegral to 83 | 84 | textByteString :: FilePath -> Handle -> Int -> Int -> IO () 85 | textByteString fp sink from to = do 86 | t <- T.decodeUtf8 `fmap` B.readFile fp 87 | B.hPutStr sink $ T.encodeUtf8 $ cut t 88 | where 89 | cut = T.unlines . map (T.take (to - from) . T.drop from) . T.lines 90 | 91 | lazyTextByteString :: FilePath -> Handle -> Int -> Int -> IO () 92 | lazyTextByteString fp sink from to = do 93 | t <- TL.decodeUtf8 `fmap` BL.readFile fp 94 | BL.hPutStr sink $ TL.encodeUtf8 $ cut t 95 | where 96 | cut = TL.unlines . map (TL.take (to' - from') . TL.drop from') . 
TL.lines 97 | from' = fromIntegral from 98 | to' = fromIntegral to 99 | -------------------------------------------------------------------------------- /Data/Text/Search.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, ScopedTypeVariables #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Search 5 | -- Copyright : (c) Bryan O'Sullivan 2009 6 | -- 7 | -- License : BSD-style 8 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 9 | -- duncan@haskell.org 10 | -- Stability : experimental 11 | -- Portability : GHC 12 | -- 13 | -- Fast substring search for 'Text', based on work by Boyer, Moore, 14 | -- Horspool, Sunday, and Lundh. 15 | -- 16 | -- References: 17 | -- 18 | -- * R. S. Boyer, J. S. Moore: A Fast String Searching Algorithm. 19 | -- Communications of the ACM, 20, 10, 762-772 (1977) 20 | -- 21 | -- * R. N. Horspool: Practical Fast Searching in Strings. Software - 22 | -- Practice and Experience 10, 501-506 (1980) 23 | -- 24 | -- * D. M. Sunday: A Very Fast Substring Search Algorithm. 25 | -- Communications of the ACM, 33, 8, 132-142 (1990) 26 | -- 27 | -- * F. Lundh: The Fast Search Algorithm. 28 | -- (2006) 29 | 30 | module Data.Text.Search 31 | ( 32 | indices 33 | ) where 34 | 35 | import qualified Data.Text.Array as A 36 | import Data.Word (Word64) 37 | import Data.Text.Internal (Text(..)) 38 | import Data.Bits ((.|.), (.&.)) 39 | import Data.Text.UnsafeShift (shiftL) 40 | 41 | data T = {-# UNPACK #-} !Word64 :* {-# UNPACK #-} !Int 42 | 43 | -- | /O(n+m)/ Find the offsets of all non-overlapping indices of 44 | -- @needle@ within @haystack@. The offsets returned represent 45 | -- locations in the low-level array. 46 | -- 47 | -- In (unlikely) bad cases, this algorithm's complexity degrades 48 | -- towards /O(n*m)/. 49 | indices :: Text -- ^ Substring to search for (@needle@) 50 | -> Text -- ^ Text to search in (@haystack@) 51 | -> [Int] 52 | indices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen) 53 | | nlen == 1 = scanOne (nindex 0) 54 | | nlen <= 0 || ldiff < 0 = [] 55 | | otherwise = scan 0 56 | where 57 | ldiff = hlen - nlen 58 | nlast = nlen - 1 59 | z = nindex nlast 60 | nindex k = A.unsafeIndex narr (noff+k) 61 | hindex k = A.unsafeIndex harr (hoff+k) 62 | hindex' k | k == hlen = 0 63 | | otherwise = A.unsafeIndex harr (hoff+k) 64 | buildTable !i !msk !skp 65 | | i >= nlast = (msk .|. swizzle z) :* skp 66 | | otherwise = buildTable (i+1) (msk .|. swizzle c) skp' 67 | where c = nindex i 68 | skp' | c == z = nlen - i - 2 69 | | otherwise = skp 70 | swizzle k = 1 `shiftL` (fromIntegral k .&. 0x3f) 71 | scan !i 72 | | i > ldiff = [] 73 | | c == z && candidateMatch 0 = i : scan (i + nlen) 74 | | otherwise = scan (i + delta) 75 | where c = hindex (i + nlast) 76 | candidateMatch !j 77 | | j >= nlast = True 78 | | hindex (i+j) /= nindex j = False 79 | | otherwise = candidateMatch (j+1) 80 | delta | nextInPattern = nlen + 1 81 | | c == z = skip + 1 82 | | otherwise = 1 83 | where nextInPattern = mask .&. 
swizzle (hindex' (i+nlen)) == 0 84 | !(mask :* skip) = buildTable 0 0 (nlen-2) 85 | scanOne c = loop 0 86 | where loop !i | i >= hlen = [] 87 | | hindex i == c = i : loop (i+1) 88 | | otherwise = loop (i+1) 89 | {-# INLINE indices #-} 90 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Stream.hs: -------------------------------------------------------------------------------- 1 | -- | This module contains a number of benchmarks for the different streaming 2 | -- functions 3 | -- 4 | -- Tested in this benchmark: 5 | -- 6 | -- * Most streaming functions 7 | -- 8 | {-# LANGUAGE BangPatterns #-} 9 | {-# OPTIONS_GHC -fno-warn-orphans #-} 10 | module Benchmarks.Stream 11 | ( benchmark 12 | ) where 13 | 14 | import Control.DeepSeq (NFData (..)) 15 | import Criterion (Benchmark, bgroup, bench, nf) 16 | import Data.Text.Fusion.Internal (Step (..), Stream (..)) 17 | import qualified Data.Text.Encoding as T 18 | import qualified Data.Text.Encoding.Error as E 19 | import qualified Data.Text.Encoding.Fusion as T 20 | import qualified Data.Text.Encoding.Fusion.Common as F 21 | import qualified Data.Text.Fusion as T 22 | import qualified Data.Text.IO as T 23 | import qualified Data.Text.Lazy.Encoding as TL 24 | import qualified Data.Text.Lazy.Encoding.Fusion as TL 25 | import qualified Data.Text.Lazy.Fusion as TL 26 | import qualified Data.Text.Lazy.IO as TL 27 | 28 | instance NFData a => NFData (Stream a) where 29 | -- Currently, this implementation does not force evaluation of the size hint 30 | rnf (Stream next s0 _) = go s0 31 | where 32 | go !s = case next s of 33 | Done -> () 34 | Skip s' -> go s' 35 | Yield x s' -> rnf x `seq` go s' 36 | 37 | benchmark :: FilePath -> IO Benchmark 38 | benchmark fp = do 39 | -- Different formats 40 | t <- T.readFile fp 41 | let !utf8 = T.encodeUtf8 t 42 | !utf16le = T.encodeUtf16LE t 43 | !utf16be = T.encodeUtf16BE t 44 | !utf32le = T.encodeUtf32LE t 45 | !utf32be = T.encodeUtf32BE t 46 | 47 | -- Once again for the lazy variants 48 | tl <- TL.readFile fp 49 | let !utf8L = TL.encodeUtf8 tl 50 | !utf16leL = TL.encodeUtf16LE tl 51 | !utf16beL = TL.encodeUtf16BE tl 52 | !utf32leL = TL.encodeUtf32LE tl 53 | !utf32beL = TL.encodeUtf32BE tl 54 | 55 | -- For the functions which operate on streams 56 | let !s = T.stream t 57 | 58 | return $ bgroup "Stream" 59 | 60 | -- Fusion 61 | [ bgroup "stream" $ 62 | [ bench "Text" $ nf T.stream t 63 | , bench "LazyText" $ nf TL.stream tl 64 | ] 65 | 66 | -- Encoding.Fusion 67 | , bgroup "streamUtf8" 68 | [ bench "Text" $ nf (T.streamUtf8 E.lenientDecode) utf8 69 | , bench "LazyText" $ nf (TL.streamUtf8 E.lenientDecode) utf8L 70 | ] 71 | , bgroup "streamUtf16LE" 72 | [ bench "Text" $ nf (T.streamUtf16LE E.lenientDecode) utf16le 73 | , bench "LazyText" $ nf (TL.streamUtf16LE E.lenientDecode) utf16leL 74 | ] 75 | , bgroup "streamUtf16BE" 76 | [ bench "Text" $ nf (T.streamUtf16BE E.lenientDecode) utf16be 77 | , bench "LazyText" $ nf (TL.streamUtf16BE E.lenientDecode) utf16beL 78 | ] 79 | , bgroup "streamUtf32LE" 80 | [ bench "Text" $ nf (T.streamUtf32LE E.lenientDecode) utf32le 81 | , bench "LazyText" $ nf (TL.streamUtf32LE E.lenientDecode) utf32leL 82 | ] 83 | , bgroup "streamUtf32BE" 84 | [ bench "Text" $ nf (T.streamUtf32BE E.lenientDecode) utf32be 85 | , bench "LazyText" $ nf (TL.streamUtf32BE E.lenientDecode) utf32beL 86 | ] 87 | 88 | -- Encoding.Fusion.Common 89 | , bench "restreamUtf16LE" $ nf F.restreamUtf16LE s 90 | , bench "restreamUtf16BE" $ nf F.restreamUtf16BE s 
91 | , bench "restreamUtf32LE" $ nf F.restreamUtf32LE s 92 | , bench "restreamUtf32BE" $ nf F.restreamUtf32BE s 93 | ] 94 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/ReadNumbers.hs: -------------------------------------------------------------------------------- 1 | -- | Read numbers from a file with a just a number on each line, find the 2 | -- minimum of those numbers. The file contains different kinds of numbers: 3 | -- 4 | -- * Decimals 5 | -- 6 | -- * Hexadecimals 7 | -- 8 | -- * Floating point numbers 9 | -- 10 | -- * Floating point numbers in scientific notation 11 | -- 12 | -- The different benchmarks will only take into account the values they can 13 | -- parse. 14 | -- 15 | -- Tested in this benchmark: 16 | -- 17 | -- * Lexing/parsing of different numerical types 18 | -- 19 | module Benchmarks.ReadNumbers 20 | ( benchmark 21 | ) where 22 | 23 | import Criterion (Benchmark, bgroup, bench, whnf) 24 | import Data.List (foldl') 25 | import Numeric (readDec, readFloat, readHex) 26 | import qualified Data.ByteString.Char8 as B 27 | import qualified Data.ByteString.Lazy.Char8 as BL 28 | import qualified Data.ByteString.Lex.Double as B 29 | import qualified Data.ByteString.Lex.Lazy.Double as BL 30 | import qualified Data.Text as T 31 | import qualified Data.Text.IO as T 32 | import qualified Data.Text.Lazy as TL 33 | import qualified Data.Text.Lazy.IO as TL 34 | import qualified Data.Text.Lazy.Read as TL 35 | import qualified Data.Text.Read as T 36 | 37 | benchmark :: FilePath -> IO Benchmark 38 | benchmark fp = do 39 | -- Read all files into lines: string, text, lazy text, bytestring, lazy 40 | -- bytestring 41 | s <- lines `fmap` readFile fp 42 | t <- T.lines `fmap` T.readFile fp 43 | tl <- TL.lines `fmap` TL.readFile fp 44 | b <- B.lines `fmap` B.readFile fp 45 | bl <- BL.lines `fmap` BL.readFile fp 46 | return $ bgroup "ReadNumbers" 47 | [ bench "DecimalString" $ whnf (int . string readDec) s 48 | , bench "HexadecimalString" $ whnf (int . string readHex) s 49 | , bench "DoubleString" $ whnf (double . string readFloat) s 50 | 51 | , bench "DecimalText" $ whnf (int . text (T.signed T.decimal)) t 52 | , bench "HexadecimalText" $ whnf (int . text (T.signed T.hexadecimal)) t 53 | , bench "DoubleText" $ whnf (double . text T.double) t 54 | , bench "RationalText" $ whnf (double . text T.rational) t 55 | 56 | , bench "DecimalLazyText" $ 57 | whnf (int . text (TL.signed TL.decimal)) tl 58 | , bench "HexadecimalLazyText" $ 59 | whnf (int . text (TL.signed TL.hexadecimal)) tl 60 | , bench "DoubleLazyText" $ 61 | whnf (double . text TL.double) tl 62 | , bench "RationalLazyText" $ 63 | whnf (double . text TL.rational) tl 64 | 65 | , bench "DecimalByteString" $ whnf (int . byteString B.readInt) b 66 | , bench "DoubleByteString" $ whnf (double . byteString B.readDouble) b 67 | 68 | , bench "DecimalLazyByteString" $ 69 | whnf (int . byteString BL.readInt) bl 70 | , bench "DoubleLazyByteString" $ 71 | whnf (double . 
byteString BL.readDouble) bl 72 | ] 73 | where 74 | -- Used for fixing types 75 | int :: Int -> Int 76 | int = id 77 | double :: Double -> Double 78 | double = id 79 | 80 | string :: (Ord a, Num a) => (t -> [(a, t)]) -> [t] -> a 81 | string reader = foldl' go 1000000 82 | where 83 | go z t = case reader t of [(n, _)] -> min n z 84 | _ -> z 85 | 86 | text :: (Ord a, Num a) => (t -> Either String (a,t)) -> [t] -> a 87 | text reader = foldl' go 1000000 88 | where 89 | go z t = case reader t of Left _ -> z 90 | Right (n, _) -> min n z 91 | 92 | byteString :: (Ord a, Num a) => (t -> Maybe (a,t)) -> [t] -> a 93 | byteString reader = foldl' go 1000000 94 | where 95 | go z t = case reader t of Nothing -> z 96 | Just (n, _) -> min n z 97 | -------------------------------------------------------------------------------- /Data/Text/Lazy/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, DeriveDataTypeable #-} 2 | -- | 3 | -- Module : Data.Text.Lazy.Internal 4 | -- Copyright : (c) 2009, 2010 Bryan O'Sullivan 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 8 | -- duncan@haskell.org 9 | -- Stability : experimental 10 | -- Portability : GHC 11 | -- 12 | -- A module containing private 'Text' internals. This exposes the 13 | -- 'Text' representation and low level construction functions. 14 | -- Modules which extend the 'Text' system may need to use this module. 15 | -- 16 | -- You should not use this module unless you are determined to monkey 17 | -- with the internals, as the functions here do just about nothing to 18 | -- preserve data invariants. You have been warned! 19 | 20 | module Data.Text.Lazy.Internal 21 | ( 22 | Text(..) 23 | , chunk 24 | , empty 25 | , foldrChunks 26 | , foldlChunks 27 | -- * Data type invariant and abstraction functions 28 | 29 | -- $invariant 30 | , strictInvariant 31 | , lazyInvariant 32 | , showStructure 33 | 34 | -- * Chunk allocation sizes 35 | , defaultChunkSize 36 | , smallChunkSize 37 | , chunkOverhead 38 | ) where 39 | 40 | import Data.Text () 41 | import Data.Text.UnsafeShift (shiftL) 42 | import Data.Typeable (Typeable) 43 | import Foreign.Storable (sizeOf) 44 | import qualified Data.Text.Internal as T 45 | 46 | data Text = Empty 47 | | Chunk {-# UNPACK #-} !T.Text Text 48 | deriving (Typeable) 49 | 50 | -- $invariant 51 | -- 52 | -- The data type invariant for lazy 'Text': Every 'Text' is either 'Empty' or 53 | -- consists of non-null 'T.Text's. All functions must preserve this, 54 | -- and the QC properties must check this. 55 | 56 | -- | Check the invariant strictly. 57 | strictInvariant :: Text -> Bool 58 | strictInvariant Empty = True 59 | strictInvariant x@(Chunk (T.Text _ _ len) cs) 60 | | len > 0 = strictInvariant cs 61 | | otherwise = error $ "Data.Text.Lazy: invariant violation: " 62 | ++ showStructure x 63 | 64 | -- | Check the invariant lazily. 65 | lazyInvariant :: Text -> Text 66 | lazyInvariant Empty = Empty 67 | lazyInvariant x@(Chunk c@(T.Text _ _ len) cs) 68 | | len > 0 = Chunk c (lazyInvariant cs) 69 | | otherwise = error $ "Data.Text.Lazy: invariant violation: " 70 | ++ showStructure x 71 | 72 | -- | Display the internal structure of a lazy 'Text'. 73 | showStructure :: Text -> String 74 | showStructure Empty = "Empty" 75 | showStructure (Chunk t Empty) = "Chunk " ++ show t ++ " Empty" 76 | showStructure (Chunk t ts) = 77 | "Chunk " ++ show t ++ " (" ++ showStructure ts ++ ")" 78 | 79 | -- | Smart constructor for 'Chunk'. 
Guarantees the data type invariant. 80 | chunk :: T.Text -> Text -> Text 81 | {-# INLINE chunk #-} 82 | chunk t@(T.Text _ _ len) ts | len == 0 = ts 83 | | otherwise = Chunk t ts 84 | 85 | -- | Smart constructor for 'Empty'. 86 | empty :: Text 87 | {-# INLINE [0] empty #-} 88 | empty = Empty 89 | 90 | -- | Consume the chunks of a lazy 'Text' with a natural right fold. 91 | foldrChunks :: (T.Text -> a -> a) -> a -> Text -> a 92 | foldrChunks f z = go 93 | where go Empty = z 94 | go (Chunk c cs) = f c (go cs) 95 | {-# INLINE foldrChunks #-} 96 | 97 | -- | Consume the chunks of a lazy 'Text' with a strict, tail-recursive, 98 | -- accumulating left fold. 99 | foldlChunks :: (a -> T.Text -> a) -> a -> Text -> a 100 | foldlChunks f z = go z 101 | where go !a Empty = a 102 | go !a (Chunk c cs) = go (f a c) cs 103 | {-# INLINE foldlChunks #-} 104 | 105 | -- | Currently set to 16 KiB, less the memory management overhead. 106 | defaultChunkSize :: Int 107 | defaultChunkSize = 16384 - chunkOverhead 108 | {-# INLINE defaultChunkSize #-} 109 | 110 | -- | Currently set to 128 bytes, less the memory management overhead. 111 | smallChunkSize :: Int 112 | smallChunkSize = 128 - chunkOverhead 113 | {-# INLINE smallChunkSize #-} 114 | 115 | -- | The memory management overhead. Currently this is tuned for GHC only. 116 | chunkOverhead :: Int 117 | chunkOverhead = sizeOf (undefined :: Int) `shiftL` 1 118 | {-# INLINE chunkOverhead #-} 119 | -------------------------------------------------------------------------------- /Data/Text/Unsafe.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, MagicHash, UnboxedTuples #-} 2 | -- | 3 | -- Module : Data.Text.Unsafe 4 | -- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan 5 | -- License : BSD-style 6 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 7 | -- duncan@haskell.org 8 | -- Stability : experimental 9 | -- Portability : portable 10 | -- 11 | -- A module containing unsafe 'Text' operations, for very very careful 12 | -- use in heavily tested code. 13 | module Data.Text.Unsafe 14 | ( 15 | inlineInterleaveST 16 | , inlinePerformIO 17 | , unsafeDupablePerformIO 18 | , Iter(..) 19 | , iter 20 | , iter_ 21 | , reverseIter 22 | , unsafeHead 23 | , unsafeTail 24 | , lengthWord16 25 | , takeWord16 26 | , dropWord16 27 | ) where 28 | 29 | #if defined(ASSERTS) 30 | import Control.Exception (assert) 31 | #endif 32 | import Data.Text.Encoding.Utf16 (chr2) 33 | import Data.Text.Internal (Text(..)) 34 | import Data.Text.Unsafe.Base (inlineInterleaveST, inlinePerformIO) 35 | import Data.Text.UnsafeChar (unsafeChr) 36 | import qualified Data.Text.Array as A 37 | #if __GLASGOW_HASKELL__ >= 611 38 | import GHC.IO (unsafeDupablePerformIO) 39 | #else 40 | import GHC.IOBase (unsafeDupablePerformIO) 41 | #endif 42 | 43 | -- | /O(1)/ A variant of 'head' for non-empty 'Text'. 'unsafeHead' 44 | -- omits the check for the empty case, so there is an obligation on 45 | -- the programmer to provide a proof that the 'Text' is non-empty. 46 | unsafeHead :: Text -> Char 47 | unsafeHead (Text arr off _len) 48 | | m < 0xD800 || m > 0xDBFF = unsafeChr m 49 | | otherwise = chr2 m n 50 | where m = A.unsafeIndex arr off 51 | n = A.unsafeIndex arr (off+1) 52 | {-# INLINE unsafeHead #-} 53 | 54 | -- | /O(1)/ A variant of 'tail' for non-empty 'Text'. 'unsafeHead' 55 | -- omits the check for the empty case, so there is an obligation on 56 | -- the programmer to provide a proof that the 'Text' is non-empty. 
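-- A quick illustration (editorial sketch; assumes 'pack' from Data.Text and
-- a non-empty argument, as the precondition requires):
--
-- > unsafeHead (pack "abc") == 'a'
-- > unsafeTail (pack "abc") == pack "bc"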
57 | unsafeTail :: Text -> Text 58 | unsafeTail t@(Text arr off len) = 59 | #if defined(ASSERTS) 60 | assert (d <= len) $ 61 | #endif 62 | Text arr (off+d) (len-d) 63 | where d = iter_ t 0 64 | {-# INLINE unsafeTail #-} 65 | 66 | data Iter = Iter {-# UNPACK #-} !Char {-# UNPACK #-} !Int 67 | 68 | -- | /O(1)/ Iterate (unsafely) one step forwards through a UTF-16 69 | -- array, returning the current character and the delta to add to give 70 | -- the next offset to iterate at. 71 | iter :: Text -> Int -> Iter 72 | iter (Text arr off _len) i 73 | | m < 0xD800 || m > 0xDBFF = Iter (unsafeChr m) 1 74 | | otherwise = Iter (chr2 m n) 2 75 | where m = A.unsafeIndex arr j 76 | n = A.unsafeIndex arr k 77 | j = off + i 78 | k = j + 1 79 | {-# INLINE iter #-} 80 | 81 | -- | /O(1)/ Iterate one step through a UTF-16 array, returning the 82 | -- delta to add to give the next offset to iterate at. 83 | iter_ :: Text -> Int -> Int 84 | iter_ (Text arr off _len) i | m < 0xD800 || m > 0xDBFF = 1 85 | | otherwise = 2 86 | where m = A.unsafeIndex arr (off+i) 87 | {-# INLINE iter_ #-} 88 | 89 | -- | /O(1)/ Iterate one step backwards through a UTF-16 array, 90 | -- returning the current character and the delta to add (i.e. a 91 | -- negative number) to give the next offset to iterate at. 92 | reverseIter :: Text -> Int -> (Char,Int) 93 | reverseIter (Text arr off _len) i 94 | | m < 0xDC00 || m > 0xDFFF = (unsafeChr m, -1) 95 | | otherwise = (chr2 n m, -2) 96 | where m = A.unsafeIndex arr j 97 | n = A.unsafeIndex arr k 98 | j = off + i 99 | k = j - 1 100 | {-# INLINE reverseIter #-} 101 | 102 | -- | /O(1)/ Return the length of a 'Text' in units of 'Word16'. This 103 | -- is useful for sizing a target array appropriately before using 104 | -- 'unsafeCopyToPtr'. 105 | lengthWord16 :: Text -> Int 106 | lengthWord16 (Text _arr _off len) = len 107 | {-# INLINE lengthWord16 #-} 108 | 109 | -- | /O(1)/ Unchecked take of 'k' 'Word16's from the front of a 'Text'. 110 | takeWord16 :: Int -> Text -> Text 111 | takeWord16 k (Text arr off _len) = Text arr off k 112 | {-# INLINE takeWord16 #-} 113 | 114 | -- | /O(1)/ Unchecked drop of 'k' 'Word16's from the front of a 'Text'. 115 | dropWord16 :: Int -> Text -> Text 116 | dropWord16 k (Text arr off len) = Text arr (off+k) (len-k) 117 | {-# INLINE dropWord16 #-} 118 | -------------------------------------------------------------------------------- /Data/Text/Lazy/Fusion.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | -- | 3 | -- Module : Data.Text.Lazy.Fusion 4 | -- Copyright : (c) 2009, 2010 Bryan O'Sullivan 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 8 | -- duncan@haskell.org 9 | -- Stability : experimental 10 | -- Portability : GHC 11 | -- 12 | -- Core stream fusion functionality for text. 
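-- Converting to a 'Stream Char' and back is the identity on values, which is
-- exactly what the stream/unstream rewrite rule below exploits. A sketch
-- (editorial, assuming Data.Text.Lazy is imported as TL):
--
-- > unstream (stream (TL.pack "fusion")) == TL.pack "fusion"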
13 | 14 | module Data.Text.Lazy.Fusion 15 | ( 16 | stream 17 | , unstream 18 | , unstreamChunks 19 | , length 20 | , unfoldrN 21 | , index 22 | , countChar 23 | ) where 24 | 25 | import Prelude hiding (length) 26 | import qualified Data.Text.Fusion.Common as S 27 | import Control.Monad.ST (runST) 28 | import Data.Text.Fusion.Internal 29 | import Data.Text.Fusion.Size (isEmpty, unknownSize) 30 | import Data.Text.Lazy.Internal 31 | import qualified Data.Text.Internal as I 32 | import qualified Data.Text.Array as A 33 | import Data.Text.UnsafeChar (unsafeWrite) 34 | import Data.Text.UnsafeShift (shiftL) 35 | import Data.Text.Unsafe (Iter(..), iter) 36 | import Data.Int (Int64) 37 | 38 | default(Int64) 39 | 40 | -- | /O(n)/ Convert a 'Text' into a 'Stream Char'. 41 | stream :: Text -> Stream Char 42 | stream text = Stream next (text :*: 0) unknownSize 43 | where 44 | next (Empty :*: _) = Done 45 | next (txt@(Chunk t@(I.Text _ _ len) ts) :*: i) 46 | | i >= len = next (ts :*: 0) 47 | | otherwise = Yield c (txt :*: i+d) 48 | where Iter c d = iter t i 49 | {-# INLINE [0] stream #-} 50 | 51 | -- | /O(n)/ Convert a 'Stream Char' into a 'Text', using the given 52 | -- chunk size. 53 | unstreamChunks :: Int -> Stream Char -> Text 54 | unstreamChunks !chunkSize (Stream next s0 len0) 55 | | isEmpty len0 = Empty 56 | | otherwise = outer s0 57 | where 58 | outer so = {-# SCC "unstreamChunks/outer" #-} 59 | case next so of 60 | Done -> Empty 61 | Skip s' -> outer s' 62 | Yield x s' -> runST $ do 63 | a <- A.new unknownLength 64 | unsafeWrite a 0 x >>= inner a unknownLength s' 65 | where unknownLength = 4 66 | where 67 | inner marr !len s !i 68 | | i + 1 >= chunkSize = finish marr i s 69 | | i + 1 >= len = {-# SCC "unstreamChunks/resize" #-} do 70 | let newLen = min (len `shiftL` 1) chunkSize 71 | marr' <- A.new newLen 72 | A.copyM marr' 0 marr 0 len 73 | inner marr' newLen s i 74 | | otherwise = 75 | {-# SCC "unstreamChunks/inner" #-} 76 | case next s of 77 | Done -> finish marr i s 78 | Skip s' -> inner marr len s' i 79 | Yield x s' -> do d <- unsafeWrite marr i x 80 | inner marr len s' (i+d) 81 | finish marr len s' = do 82 | arr <- A.unsafeFreeze marr 83 | return (I.Text arr 0 len `Chunk` outer s') 84 | {-# INLINE [0] unstreamChunks #-} 85 | 86 | -- | /O(n)/ Convert a 'Stream Char' into a 'Text', using 87 | -- 'defaultChunkSize'. 88 | unstream :: Stream Char -> Text 89 | unstream = unstreamChunks defaultChunkSize 90 | {-# INLINE [0] unstream #-} 91 | 92 | -- | /O(n)/ Returns the number of characters in a text. 93 | length :: Stream Char -> Int64 94 | length = S.lengthI 95 | {-# INLINE[0] length #-} 96 | 97 | {-# RULES "LAZY STREAM stream/unstream fusion" forall s. 98 | stream (unstream s) = s #-} 99 | 100 | -- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a stream from a seed 101 | -- value. However, the length of the result is limited by the 102 | -- first argument to 'unfoldrN'. This function is more efficient than 103 | -- 'unfoldr' when the length of the result is known. 104 | unfoldrN :: Int64 -> (a -> Maybe (Char,a)) -> a -> Stream Char 105 | unfoldrN n = S.unfoldrNI n 106 | {-# INLINE [0] unfoldrN #-} 107 | 108 | -- | /O(n)/ stream index (subscript) operator, starting from 0. 109 | index :: Stream Char -> Int64 -> Char 110 | index = S.indexI 111 | {-# INLINE [0] index #-} 112 | 113 | -- | /O(n)/ The 'count' function returns the number of times the query 114 | -- element appears in the given stream. 
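-- For example (editorial sketch, again assuming Data.Text.Lazy as TL):
--
-- > countChar 'a' (stream (TL.pack "banana")) == 3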
115 | countChar :: Char -> Stream Char -> Int64 116 | countChar = S.countCharI 117 | {-# INLINE [0] countChar #-} 118 | -------------------------------------------------------------------------------- /Data/Text/Fusion/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, ExistentialQuantification #-} 2 | -- | 3 | -- Module : Data.Text.Fusion.Internal 4 | -- Copyright : (c) Tom Harper 2008-2009, 5 | -- (c) Bryan O'Sullivan 2009, 6 | -- (c) Duncan Coutts 2009, 7 | -- (c) Jasper Van der Jeugt 2011 8 | -- 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 11 | -- duncan@haskell.org 12 | -- Stability : experimental 13 | -- Portability : GHC 14 | -- 15 | -- Core stream fusion functionality for text. 16 | 17 | module Data.Text.Fusion.Internal 18 | ( 19 | CC(..) 20 | , M(..) 21 | , M8 22 | , PairS(..) 23 | , RS(..) 24 | , Step(..) 25 | , Stream(..) 26 | , Switch(..) 27 | , empty 28 | ) where 29 | 30 | import Data.Text.Fusion.Size 31 | import Data.Word (Word8) 32 | 33 | -- | Specialised tuple for case conversion. 34 | data CC s = CC !s {-# UNPACK #-} !Char {-# UNPACK #-} !Char 35 | 36 | -- | Specialised, strict Maybe-like type. 37 | data M a = N 38 | | J !a 39 | 40 | type M8 = M Word8 41 | 42 | -- Restreaming state. 43 | data RS s 44 | = RS0 !s 45 | | RS1 !s {-# UNPACK #-} !Word8 46 | | RS2 !s {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 47 | | RS3 !s {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 48 | 49 | infixl 2 :*: 50 | data PairS a b = !a :*: !b 51 | -- deriving (Eq, Ord, Show) 52 | 53 | -- | Allow a function over a stream to switch between two states. 54 | data Switch = S1 | S2 55 | 56 | data Step s a = Done 57 | | Skip !s 58 | | Yield !a !s 59 | 60 | {- 61 | instance (Show a) => Show (Step s a) 62 | where show Done = "Done" 63 | show (Skip _) = "Skip" 64 | show (Yield x _) = "Yield " ++ show x 65 | -} 66 | 67 | instance (Eq a) => Eq (Stream a) where 68 | (==) = eq 69 | 70 | instance (Ord a) => Ord (Stream a) where 71 | compare = cmp 72 | 73 | -- The length hint in a Stream has two roles. If its value is zero, 74 | -- we trust it, and treat the stream as empty. Otherwise, we treat it 75 | -- as a hint: it should usually be accurate, so we use it when 76 | -- unstreaming to decide what size array to allocate. However, the 77 | -- unstreaming functions must be able to cope with the hint being too 78 | -- small or too large. 79 | -- 80 | -- The size hint tries to track the UTF-16 code points in a stream, 81 | -- but often counts the number of characters instead. It can easily 82 | -- undercount if, for instance, a transformed stream contains astral 83 | -- plane characters (those above 0x10000). 84 | 85 | data Stream a = 86 | forall s. Stream 87 | (s -> Step s a) -- stepper function 88 | !s -- current state 89 | !Size -- size hint 90 | 91 | -- | /O(n)/ Determines if two streams are equal. 
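--
-- For exposition: 'eq' only consults the two stepper functions, so it
-- works for any 'Stream', including one built directly from a list.
-- The sketch below is for illustration only and is not a definition in
-- this module.
--
-- > listStream :: [a] -> Stream a
-- > listStream xs0 = Stream next xs0 unknownSize
-- >   where next []     = Done
-- >         next (x:xs) = Yield x xs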
92 | eq :: (Eq a) => Stream a -> Stream a -> Bool 93 | eq (Stream next1 s1 _) (Stream next2 s2 _) = loop (next1 s1) (next2 s2) 94 | where 95 | loop Done Done = True 96 | loop (Skip s1') (Skip s2') = loop (next1 s1') (next2 s2') 97 | loop (Skip s1') x2 = loop (next1 s1') x2 98 | loop x1 (Skip s2') = loop x1 (next2 s2') 99 | loop Done _ = False 100 | loop _ Done = False 101 | loop (Yield x1 s1') (Yield x2 s2') = x1 == x2 && 102 | loop (next1 s1') (next2 s2') 103 | {-# INLINE [0] eq #-} 104 | 105 | cmp :: (Ord a) => Stream a -> Stream a -> Ordering 106 | cmp (Stream next1 s1 _) (Stream next2 s2 _) = loop (next1 s1) (next2 s2) 107 | where 108 | loop Done Done = EQ 109 | loop (Skip s1') (Skip s2') = loop (next1 s1') (next2 s2') 110 | loop (Skip s1') x2 = loop (next1 s1') x2 111 | loop x1 (Skip s2') = loop x1 (next2 s2') 112 | loop Done _ = LT 113 | loop _ Done = GT 114 | loop (Yield x1 s1') (Yield x2 s2') = 115 | case compare x1 x2 of 116 | EQ -> loop (next1 s1') (next2 s2') 117 | other -> other 118 | {-# INLINE [0] cmp #-} 119 | 120 | -- | The empty stream. 121 | empty :: Stream a 122 | empty = Stream next () 0 123 | where next _ = Done 124 | {-# INLINE [0] empty #-} 125 | -------------------------------------------------------------------------------- /Data/Text/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, DeriveDataTypeable #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Internal 5 | -- Copyright : (c) 2008, 2009 Tom Harper, 6 | -- (c) 2009, 2010 Bryan O'Sullivan, 7 | -- (c) 2009 Duncan Coutts 8 | -- 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 11 | -- duncan@haskell.org 12 | -- Stability : experimental 13 | -- Portability : GHC 14 | -- 15 | -- A module containing private 'Text' internals. This exposes the 16 | -- 'Text' representation and low level construction functions. 17 | -- Modules which extend the 'Text' system may need to use this module. 18 | -- 19 | -- You should not use this module unless you are determined to monkey 20 | -- with the internals, as the functions here do just about nothing to 21 | -- preserve data invariants. You have been warned! 22 | 23 | module Data.Text.Internal 24 | ( 25 | -- * Types 26 | -- $internals 27 | Text(..) 28 | -- * Construction 29 | , text 30 | , textP 31 | -- * Safety 32 | , safe 33 | -- * Code that must be here for accessibility 34 | , empty 35 | -- * Utilities 36 | , firstf 37 | -- * Debugging 38 | , showText 39 | ) where 40 | 41 | #if defined(ASSERTS) 42 | import Control.Exception (assert) 43 | #endif 44 | import Data.Bits ((.&.)) 45 | import qualified Data.Text.Array as A 46 | import Data.Text.UnsafeChar (ord) 47 | import Data.Typeable (Typeable) 48 | 49 | -- | A space efficient, packed, unboxed Unicode text type. 50 | data Text = Text 51 | {-# UNPACK #-} !A.Array -- payload (Word16 elements) 52 | {-# UNPACK #-} !Int -- offset (units of Word16, not Char) 53 | {-# UNPACK #-} !Int -- length (units of Word16, not Char) 54 | deriving (Typeable) 55 | 56 | -- | Smart constructor. 57 | text :: A.Array -> Int -> Int -> Text 58 | text arr off len = 59 | #if defined(ASSERTS) 60 | let c = A.unsafeIndex arr off 61 | alen = A.length arr 62 | in assert (len >= 0) . 63 | assert (off >= 0) . 64 | assert (alen == 0 || len == 0 || off < alen) . 65 | assert (len == 0 || c < 0xDC00 || c > 0xDFFF) $ 66 | #endif 67 | Text arr off len 68 | {-# INLINE text #-} 69 | 70 | -- | /O(1)/ The empty 'Text'. 
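--
-- For illustration (a sketch, not an extra definition in this module):
-- 'empty' is the degenerate slice of the empty array, and could equally
-- be written with the smart constructor above, since every assertion is
-- trivially satisfied by a zero-length slice.
--
-- > emptySketch :: Text
-- > emptySketch = text A.empty 0 0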
71 | empty :: Text 72 | empty = Text A.empty 0 0 73 | {-# INLINE [1] empty #-} 74 | 75 | -- | Construct a 'Text' without invisibly pinning its byte array in 76 | -- memory if its length has dwindled to zero. 77 | textP :: A.Array -> Int -> Int -> Text 78 | textP arr off len | len == 0 = empty 79 | | otherwise = text arr off len 80 | {-# INLINE textP #-} 81 | 82 | -- | A useful 'show'-like function for debugging purposes. 83 | showText :: Text -> String 84 | showText (Text arr off len) = 85 | "Text " ++ show (A.toList arr off len) ++ ' ' : 86 | show off ++ ' ' : show len 87 | 88 | -- | Map a 'Char' to a 'Text'-safe value. 89 | -- 90 | -- UTF-16 surrogate code points are not included in the set of Unicode 91 | -- scalar values, but are unfortunately admitted as valid 'Char' 92 | -- values by Haskell. They cannot be represented in a 'Text'. This 93 | -- function remaps those code points to the Unicode replacement 94 | -- character (U+FFFD, \'�\'), and leaves other code points 95 | -- unchanged. 96 | safe :: Char -> Char 97 | safe c 98 | | ord c .&. 0x1ff800 /= 0xd800 = c 99 | | otherwise = '\xfffd' 100 | {-# INLINE safe #-} 101 | 102 | -- | Apply a function to the first element of an optional pair. 103 | firstf :: (a -> c) -> Maybe (a,b) -> Maybe (c,b) 104 | firstf f (Just (a, b)) = Just (f a, b) 105 | firstf _ Nothing = Nothing 106 | 107 | -- $internals 108 | -- 109 | -- Internally, the 'Text' type is represented as an array of 'Word16' 110 | -- UTF-16 code units. The offset and length fields in the constructor 111 | -- are in these units, /not/ units of 'Char'. 112 | -- 113 | -- Invariants that all functions must maintain: 114 | -- 115 | -- * Since the 'Text' type uses UTF-16 internally, it cannot represent 116 | -- characters in the reserved surrogate code point range U+D800 to 117 | -- U+DFFF. To maintain this invariant, the 'safe' function maps 118 | -- 'Char' values in this range to the replacement character (U+FFFD, 119 | -- \'�\'). 120 | -- 121 | -- * A leading (or \"high\") surrogate code unit (0xD800–0xDBFF) must 122 | -- always be followed by a trailing (or \"low\") surrogate code unit 123 | -- (0xDC00-0xDFFF). A trailing surrogate code unit must always be 124 | -- preceded by a leading surrogate code unit. 125 | -------------------------------------------------------------------------------- /Data/Text/Fusion/Size.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | {-# OPTIONS_GHC -fno-warn-missing-methods #-} 3 | -- | 4 | -- Module : Data.Text.Fusion.Internal 5 | -- Copyright : (c) Roman Leshchinskiy 2008, 6 | -- (c) Bryan O'Sullivan 2009 7 | -- 8 | -- License : BSD-style 9 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 10 | -- duncan@haskell.org 11 | -- Stability : experimental 12 | -- Portability : portable 13 | -- 14 | -- Size hints. 15 | 16 | module Data.Text.Fusion.Size 17 | ( 18 | Size 19 | , exactly 20 | , exactSize 21 | , maxSize 22 | , unknownSize 23 | , smaller 24 | , larger 25 | , upperBound 26 | , isEmpty 27 | ) where 28 | 29 | #if defined(ASSERTS) 30 | import Control.Exception (assert) 31 | #endif 32 | 33 | data Size = Exact {-# UNPACK #-} !Int -- ^ Exact size. 34 | | Max {-# UNPACK #-} !Int -- ^ Upper bound on size. 35 | | Unknown -- ^ Unknown size. 
36 | deriving (Eq, Show) 37 | 38 | exactly :: Size -> Maybe Int 39 | exactly (Exact n) = Just n 40 | exactly _ = Nothing 41 | {-# INLINE exactly #-} 42 | 43 | exactSize :: Int -> Size 44 | exactSize n = 45 | #if defined(ASSERTS) 46 | assert (n >= 0) 47 | #endif 48 | Exact n 49 | {-# INLINE exactSize #-} 50 | 51 | maxSize :: Int -> Size 52 | maxSize n = 53 | #if defined(ASSERTS) 54 | assert (n >= 0) 55 | #endif 56 | Max n 57 | {-# INLINE maxSize #-} 58 | 59 | unknownSize :: Size 60 | unknownSize = Unknown 61 | {-# INLINE unknownSize #-} 62 | 63 | instance Num Size where 64 | (+) = addSize 65 | (-) = subtractSize 66 | (*) = mulSize 67 | 68 | fromInteger = f where f = Exact . fromInteger 69 | {-# INLINE f #-} 70 | 71 | add :: Int -> Int -> Int 72 | add m n | mn >= 0 = mn 73 | | otherwise = overflowError 74 | where mn = m + n 75 | {-# INLINE add #-} 76 | 77 | addSize :: Size -> Size -> Size 78 | addSize (Exact m) (Exact n) = Exact (add m n) 79 | addSize (Exact m) (Max n) = Max (add m n) 80 | addSize (Max m) (Exact n) = Max (add m n) 81 | addSize (Max m) (Max n) = Max (add m n) 82 | addSize _ _ = Unknown 83 | {-# INLINE addSize #-} 84 | 85 | subtractSize :: Size -> Size -> Size 86 | subtractSize (Exact m) (Exact n) = Exact (max (m-n) 0) 87 | subtractSize (Exact m) (Max _) = Max m 88 | subtractSize (Max m) (Exact n) = Max (max (m-n) 0) 89 | subtractSize a@(Max _) (Max _) = a 90 | subtractSize a@(Max _) Unknown = a 91 | subtractSize _ _ = Unknown 92 | {-# INLINE subtractSize #-} 93 | 94 | mul :: Int -> Int -> Int 95 | mul m n 96 | | m <= maxBound `quot` n = m * n 97 | | otherwise = overflowError 98 | {-# INLINE mul #-} 99 | 100 | mulSize :: Size -> Size -> Size 101 | mulSize (Exact m) (Exact n) = Exact (mul m n) 102 | mulSize (Exact m) (Max n) = Max (mul m n) 103 | mulSize (Max m) (Exact n) = Max (mul m n) 104 | mulSize (Max m) (Max n) = Max (mul m n) 105 | mulSize _ _ = Unknown 106 | {-# INLINE mulSize #-} 107 | 108 | -- | Minimum of two size hints. 109 | smaller :: Size -> Size -> Size 110 | smaller (Exact m) (Exact n) = Exact (m `min` n) 111 | smaller (Exact m) (Max n) = Max (m `min` n) 112 | smaller (Exact m) Unknown = Max m 113 | smaller (Max m) (Exact n) = Max (m `min` n) 114 | smaller (Max m) (Max n) = Max (m `min` n) 115 | smaller a@(Max _) Unknown = a 116 | smaller Unknown (Exact n) = Max n 117 | smaller Unknown (Max n) = Max n 118 | smaller Unknown Unknown = Unknown 119 | {-# INLINE smaller #-} 120 | 121 | -- | Maximum of two size hints. 122 | larger :: Size -> Size -> Size 123 | larger (Exact m) (Exact n) = Exact (m `max` n) 124 | larger a@(Exact m) b@(Max n) | m >= n = a 125 | | otherwise = b 126 | larger a@(Max m) b@(Exact n) | n >= m = b 127 | | otherwise = a 128 | larger (Max m) (Max n) = Max (m `max` n) 129 | larger _ _ = Unknown 130 | {-# INLINE larger #-} 131 | 132 | -- | Compute the maximum size from a size hint, if possible. 
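--
-- A usage sketch (illustrative only): choose an allocation size from a
-- hint, falling back to a caller-supplied default when the hint carries
-- no information.
--
-- > -- upperBound 4 (exactSize 10) == 10
-- > -- upperBound 4 (maxSize 10)   == 10
-- > -- upperBound 4 unknownSize    == 4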
133 | upperBound :: Int -> Size -> Int 134 | upperBound _ (Exact n) = n 135 | upperBound _ (Max n) = n 136 | upperBound k _ = k 137 | {-# INLINE upperBound #-} 138 | 139 | isEmpty :: Size -> Bool 140 | isEmpty (Exact n) = n <= 0 141 | isEmpty (Max n) = n <= 0 142 | isEmpty _ = False 143 | {-# INLINE isEmpty #-} 144 | 145 | overflowError :: Int 146 | overflowError = error "Data.Text.Fusion.Size: size overflow" 147 | -------------------------------------------------------------------------------- /Data/Text/Encoding/Fusion/Common.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Encoding.Fusion.Common 5 | -- Copyright : (c) Tom Harper 2008-2009, 6 | -- (c) Bryan O'Sullivan 2009, 7 | -- (c) Duncan Coutts 2009, 8 | -- (c) Jasper Van der Jeugt 2011 9 | -- 10 | -- License : BSD-style 11 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 12 | -- duncan@haskell.org 13 | -- Stability : experimental 14 | -- Portability : portable 15 | -- 16 | -- Fusible 'Stream'-oriented functions for converting between 'Text' 17 | -- and several common encodings. 18 | 19 | module Data.Text.Encoding.Fusion.Common 20 | ( 21 | -- * Restreaming 22 | -- Restreaming is the act of converting from one 'Stream' 23 | -- representation to another. 24 | restreamUtf16LE 25 | , restreamUtf16BE 26 | , restreamUtf32LE 27 | , restreamUtf32BE 28 | ) where 29 | 30 | import Data.Bits ((.&.)) 31 | import Data.Text.Fusion (Step(..), Stream(..)) 32 | import Data.Text.Fusion.Internal (RS(..)) 33 | import Data.Text.UnsafeChar (ord) 34 | import Data.Text.UnsafeShift (shiftR) 35 | import Data.Word (Word8) 36 | 37 | restreamUtf16BE :: Stream Char -> Stream Word8 38 | restreamUtf16BE (Stream next0 s0 len) = Stream next (RS0 s0) (len * 2) 39 | where 40 | next (RS0 s) = case next0 s of 41 | Done -> Done 42 | Skip s' -> Skip (RS0 s') 43 | Yield x s' 44 | | n < 0x10000 -> Yield (fromIntegral $ n `shiftR` 8) $ 45 | RS1 s' (fromIntegral n) 46 | | otherwise -> Yield c1 $ RS3 s' c2 c3 c4 47 | where 48 | n = ord x 49 | n1 = n - 0x10000 50 | c1 = fromIntegral (n1 `shiftR` 18 + 0xD8) 51 | c2 = fromIntegral (n1 `shiftR` 10) 52 | n2 = n1 .&. 0x3FF 53 | c3 = fromIntegral (n2 `shiftR` 8 + 0xDC) 54 | c4 = fromIntegral n2 55 | next (RS1 s x2) = Yield x2 (RS0 s) 56 | next (RS2 s x2 x3) = Yield x2 (RS1 s x3) 57 | next (RS3 s x2 x3 x4) = Yield x2 (RS2 s x3 x4) 58 | {-# INLINE next #-} 59 | {-# INLINE restreamUtf16BE #-} 60 | 61 | restreamUtf16LE :: Stream Char -> Stream Word8 62 | restreamUtf16LE (Stream next0 s0 len) = Stream next (RS0 s0) (len * 2) 63 | where 64 | next (RS0 s) = case next0 s of 65 | Done -> Done 66 | Skip s' -> Skip (RS0 s') 67 | Yield x s' 68 | | n < 0x10000 -> Yield (fromIntegral n) $ 69 | RS1 s' (fromIntegral $ shiftR n 8) 70 | | otherwise -> Yield c1 $ RS3 s' c2 c3 c4 71 | where 72 | n = ord x 73 | n1 = n - 0x10000 74 | c2 = fromIntegral (shiftR n1 18 + 0xD8) 75 | c1 = fromIntegral (shiftR n1 10) 76 | n2 = n1 .&. 
0x3FF 77 | c4 = fromIntegral (shiftR n2 8 + 0xDC) 78 | c3 = fromIntegral n2 79 | next (RS1 s x2) = Yield x2 (RS0 s) 80 | next (RS2 s x2 x3) = Yield x2 (RS1 s x3) 81 | next (RS3 s x2 x3 x4) = Yield x2 (RS2 s x3 x4) 82 | {-# INLINE next #-} 83 | {-# INLINE restreamUtf16LE #-} 84 | 85 | restreamUtf32BE :: Stream Char -> Stream Word8 86 | restreamUtf32BE (Stream next0 s0 len) = Stream next (RS0 s0) (len * 2) 87 | where 88 | next (RS0 s) = case next0 s of 89 | Done -> Done 90 | Skip s' -> Skip (RS0 s') 91 | Yield x s' -> Yield c1 (RS3 s' c2 c3 c4) 92 | where 93 | n = ord x 94 | c1 = fromIntegral $ shiftR n 24 95 | c2 = fromIntegral $ shiftR n 16 96 | c3 = fromIntegral $ shiftR n 8 97 | c4 = fromIntegral n 98 | next (RS1 s x2) = Yield x2 (RS0 s) 99 | next (RS2 s x2 x3) = Yield x2 (RS1 s x3) 100 | next (RS3 s x2 x3 x4) = Yield x2 (RS2 s x3 x4) 101 | {-# INLINE next #-} 102 | {-# INLINE restreamUtf32BE #-} 103 | 104 | restreamUtf32LE :: Stream Char -> Stream Word8 105 | restreamUtf32LE (Stream next0 s0 len) = Stream next (RS0 s0) (len * 2) 106 | where 107 | next (RS0 s) = case next0 s of 108 | Done -> Done 109 | Skip s' -> Skip (RS0 s') 110 | Yield x s' -> Yield c1 (RS3 s' c2 c3 c4) 111 | where 112 | n = ord x 113 | c4 = fromIntegral $ shiftR n 24 114 | c3 = fromIntegral $ shiftR n 16 115 | c2 = fromIntegral $ shiftR n 8 116 | c1 = fromIntegral n 117 | next (RS1 s x2) = Yield x2 (RS0 s) 118 | next (RS2 s x2 x3) = Yield x2 (RS1 s x3) 119 | next (RS3 s x2 x3 x4) = Yield x2 (RS2 s x3 x4) 120 | {-# INLINE next #-} 121 | {-# INLINE restreamUtf32LE #-} 122 | -------------------------------------------------------------------------------- /Data/Text/Encoding/Error.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, DeriveDataTypeable #-} 2 | #if __GLASGOW_HASKELL__ >= 702 3 | {-# LANGUAGE Trustworthy #-} 4 | #endif 5 | -- | 6 | -- Module : Data.Text.Encoding.Error 7 | -- Copyright : (c) Bryan O'Sullivan 2009 8 | -- 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 11 | -- duncan@haskell.org 12 | -- Stability : experimental 13 | -- Portability : GHC 14 | -- 15 | -- Types and functions for dealing with encoding and decoding errors 16 | -- in Unicode text. 17 | -- 18 | -- The standard functions for encoding and decoding text are strict, 19 | -- which is to say that they throw exceptions on invalid input. This 20 | -- is often unhelpful on real world input, so alternative functions 21 | -- exist that accept custom handlers for dealing with invalid inputs. 22 | -- These 'OnError' handlers are normal Haskell functions. You can use 23 | -- one of the presupplied functions in this module, or you can write a 24 | -- custom handler of your own. 25 | 26 | module Data.Text.Encoding.Error 27 | ( 28 | -- * Error handling types 29 | UnicodeException(..) 30 | , OnError 31 | , OnDecodeError 32 | , OnEncodeError 33 | -- * Useful error handling functions 34 | , lenientDecode 35 | , strictDecode 36 | , strictEncode 37 | , ignore 38 | , replace 39 | ) where 40 | 41 | import Control.DeepSeq (NFData (..)) 42 | #if __GLASGOW_HASKELL__ >= 610 43 | import Control.Exception (Exception, throw) 44 | #else 45 | import Control.Exception.Extensible (Exception, throw) 46 | #endif 47 | import Data.Typeable (Typeable) 48 | import Data.Word (Word8) 49 | import Numeric (showHex) 50 | 51 | -- | Function type for handling a coding error. It is supplied with 52 | -- two inputs: 53 | -- 54 | -- * A 'String' that describes the error. 
55 | -- 56 | -- * The input value that caused the error. If the error arose 57 | -- because the end of input was reached or could not be identified 58 | -- precisely, this value will be 'Nothing'. 59 | -- 60 | -- If the handler returns a value wrapped with 'Just', that value will 61 | -- be used in the output as the replacement for the invalid input. If 62 | -- it returns 'Nothing', no value will be used in the output. 63 | -- 64 | -- Should the handler need to abort processing, it should use 'error' 65 | -- or 'throw' an exception (preferably a 'UnicodeException'). It may 66 | -- use the description provided to construct a more helpful error 67 | -- report. 68 | type OnError a b = String -> Maybe a -> Maybe b 69 | 70 | -- | A handler for a decoding error. 71 | type OnDecodeError = OnError Word8 Char 72 | 73 | -- | A handler for an encoding error. 74 | type OnEncodeError = OnError Char Word8 75 | 76 | -- | An exception type for representing Unicode encoding errors. 77 | data UnicodeException = 78 | DecodeError String (Maybe Word8) 79 | -- ^ Could not decode a byte sequence because it was invalid under 80 | -- the given encoding, or ran out of input in mid-decode. 81 | | EncodeError String (Maybe Char) 82 | -- ^ Tried to encode a character that could not be represented 83 | -- under the given encoding, or ran out of input in mid-encode. 84 | deriving (Eq, Typeable) 85 | 86 | showUnicodeException :: UnicodeException -> String 87 | showUnicodeException (DecodeError desc (Just w)) 88 | = "Cannot decode byte '\\x" ++ showHex w ("': " ++ desc) 89 | showUnicodeException (DecodeError desc Nothing) 90 | = "Cannot decode input: " ++ desc 91 | showUnicodeException (EncodeError desc (Just c)) 92 | = "Cannot encode character '\\x" ++ showHex (fromEnum c) ("': " ++ desc) 93 | showUnicodeException (EncodeError desc Nothing) 94 | = "Cannot encode input: " ++ desc 95 | 96 | instance Show UnicodeException where 97 | show = showUnicodeException 98 | 99 | instance Exception UnicodeException 100 | 101 | instance NFData UnicodeException where 102 | rnf (DecodeError desc w) = rnf desc `seq` rnf w `seq` () 103 | rnf (EncodeError desc c) = rnf desc `seq` rnf c `seq` () 104 | 105 | -- | Throw a 'UnicodeException' if decoding fails. 106 | strictDecode :: OnDecodeError 107 | strictDecode desc c = throw (DecodeError desc c) 108 | 109 | -- | Replace an invalid input byte with the Unicode replacement 110 | -- character U+FFFD. 111 | lenientDecode :: OnDecodeError 112 | lenientDecode _ _ = Just '\xfffd' 113 | 114 | -- | Throw a 'UnicodeException' if encoding fails. 115 | strictEncode :: OnEncodeError 116 | strictEncode desc c = throw (EncodeError desc c) 117 | 118 | -- | Ignore an invalid input, substituting nothing in the output. 119 | ignore :: OnError a b 120 | ignore _ _ = Nothing 121 | 122 | -- | Replace an invalid input with a valid output. 
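--
-- A usage sketch (assuming 'decodeUtf8With' from "Data.Text.Encoding",
-- which takes an 'OnDecodeError'): substitute a visible marker for each
-- undecodable byte instead of throwing an exception.
--
-- > import Data.ByteString (ByteString)
-- > import Data.Text (Text)
-- > import Data.Text.Encoding (decodeUtf8With)
-- >
-- > decodeWithMarker :: ByteString -> Text
-- > decodeWithMarker = decodeUtf8With (replace '?')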
123 | replace :: b -> OnError a b 124 | replace c _ _ = Just c 125 | -------------------------------------------------------------------------------- /Data/Text/Encoding/Utf8.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, MagicHash, BangPatterns #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Encoding.Utf8 5 | -- Copyright : (c) 2008, 2009 Tom Harper, 6 | -- (c) 2009, 2010 Bryan O'Sullivan, 7 | -- (c) 2009 Duncan Coutts 8 | -- 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 11 | -- duncan@haskell.org 12 | -- Stability : experimental 13 | -- Portability : GHC 14 | -- 15 | -- Basic UTF-8 validation and character manipulation. 16 | module Data.Text.Encoding.Utf8 17 | ( 18 | -- Decomposition 19 | ord2 20 | , ord3 21 | , ord4 22 | -- Construction 23 | , chr2 24 | , chr3 25 | , chr4 26 | -- * Validation 27 | , validate1 28 | , validate2 29 | , validate3 30 | , validate4 31 | ) where 32 | 33 | #if defined(ASSERTS) 34 | import Control.Exception (assert) 35 | #endif 36 | import Data.Bits ((.&.)) 37 | import Data.Text.UnsafeChar (ord) 38 | import Data.Text.UnsafeShift (shiftR) 39 | import GHC.Exts 40 | import GHC.Word (Word8(..)) 41 | 42 | default(Int) 43 | 44 | between :: Word8 -- ^ byte to check 45 | -> Word8 -- ^ lower bound 46 | -> Word8 -- ^ upper bound 47 | -> Bool 48 | between x y z = x >= y && x <= z 49 | {-# INLINE between #-} 50 | 51 | ord2 :: Char -> (Word8,Word8) 52 | ord2 c = 53 | #if defined(ASSERTS) 54 | assert (n >= 0x80 && n <= 0x07ff) 55 | #endif 56 | (x1,x2) 57 | where 58 | n = ord c 59 | x1 = fromIntegral $ (n `shiftR` 6) + 0xC0 60 | x2 = fromIntegral $ (n .&. 0x3F) + 0x80 61 | 62 | ord3 :: Char -> (Word8,Word8,Word8) 63 | ord3 c = 64 | #if defined(ASSERTS) 65 | assert (n >= 0x0800 && n <= 0xffff) 66 | #endif 67 | (x1,x2,x3) 68 | where 69 | n = ord c 70 | x1 = fromIntegral $ (n `shiftR` 12) + 0xE0 71 | x2 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80 72 | x3 = fromIntegral $ (n .&. 0x3F) + 0x80 73 | 74 | ord4 :: Char -> (Word8,Word8,Word8,Word8) 75 | ord4 c = 76 | #if defined(ASSERTS) 77 | assert (n >= 0x10000) 78 | #endif 79 | (x1,x2,x3,x4) 80 | where 81 | n = ord c 82 | x1 = fromIntegral $ (n `shiftR` 18) + 0xF0 83 | x2 = fromIntegral $ ((n `shiftR` 12) .&. 0x3F) + 0x80 84 | x3 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80 85 | x4 = fromIntegral $ (n .&. 
0x3F) + 0x80 86 | 87 | chr2 :: Word8 -> Word8 -> Char 88 | chr2 (W8# x1#) (W8# x2#) = C# (chr# (z1# +# z2#)) 89 | where 90 | !y1# = word2Int# x1# 91 | !y2# = word2Int# x2# 92 | !z1# = uncheckedIShiftL# (y1# -# 0xC0#) 6# 93 | !z2# = y2# -# 0x80# 94 | {-# INLINE chr2 #-} 95 | 96 | chr3 :: Word8 -> Word8 -> Word8 -> Char 97 | chr3 (W8# x1#) (W8# x2#) (W8# x3#) = C# (chr# (z1# +# z2# +# z3#)) 98 | where 99 | !y1# = word2Int# x1# 100 | !y2# = word2Int# x2# 101 | !y3# = word2Int# x3# 102 | !z1# = uncheckedIShiftL# (y1# -# 0xE0#) 12# 103 | !z2# = uncheckedIShiftL# (y2# -# 0x80#) 6# 104 | !z3# = y3# -# 0x80# 105 | {-# INLINE chr3 #-} 106 | 107 | chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char 108 | chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) = 109 | C# (chr# (z1# +# z2# +# z3# +# z4#)) 110 | where 111 | !y1# = word2Int# x1# 112 | !y2# = word2Int# x2# 113 | !y3# = word2Int# x3# 114 | !y4# = word2Int# x4# 115 | !z1# = uncheckedIShiftL# (y1# -# 0xF0#) 18# 116 | !z2# = uncheckedIShiftL# (y2# -# 0x80#) 12# 117 | !z3# = uncheckedIShiftL# (y3# -# 0x80#) 6# 118 | !z4# = y4# -# 0x80# 119 | {-# INLINE chr4 #-} 120 | 121 | validate1 :: Word8 -> Bool 122 | validate1 x1 = x1 <= 0x7F 123 | {-# INLINE validate1 #-} 124 | 125 | validate2 :: Word8 -> Word8 -> Bool 126 | validate2 x1 x2 = between x1 0xC2 0xDF && between x2 0x80 0xBF 127 | {-# INLINE validate2 #-} 128 | 129 | validate3 :: Word8 -> Word8 -> Word8 -> Bool 130 | {-# INLINE validate3 #-} 131 | validate3 x1 x2 x3 = validate3_1 || validate3_2 || validate3_3 || validate3_4 132 | where 133 | validate3_1 = (x1 == 0xE0) && 134 | between x2 0xA0 0xBF && 135 | between x3 0x80 0xBF 136 | validate3_2 = between x1 0xE1 0xEC && 137 | between x2 0x80 0xBF && 138 | between x3 0x80 0xBF 139 | validate3_3 = x1 == 0xED && 140 | between x2 0x80 0x9F && 141 | between x3 0x80 0xBF 142 | validate3_4 = between x1 0xEE 0xEF && 143 | between x2 0x80 0xBF && 144 | between x3 0x80 0xBF 145 | 146 | validate4 :: Word8 -> Word8 -> Word8 -> Word8 -> Bool 147 | {-# INLINE validate4 #-} 148 | validate4 x1 x2 x3 x4 = validate4_1 || validate4_2 || validate4_3 149 | where 150 | validate4_1 = x1 == 0xF0 && 151 | between x2 0x90 0xBF && 152 | between x3 0x80 0xBF && 153 | between x4 0x80 0xBF 154 | validate4_2 = between x1 0xF1 0xF3 && 155 | between x2 0x80 0xBF && 156 | between x3 0x80 0xBF && 157 | between x4 0x80 0xBF 158 | validate4_3 = x1 == 0xF4 && 159 | between x2 0x80 0x8F && 160 | between x3 0x80 0xBF && 161 | between x4 0x80 0xBF 162 | -------------------------------------------------------------------------------- /Data/Text/Lazy/Search.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, ScopedTypeVariables #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Lazy.Search 5 | -- Copyright : (c) 2009, 2010 Bryan O'Sullivan 6 | -- 7 | -- License : BSD-style 8 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 9 | -- duncan@haskell.org 10 | -- Stability : experimental 11 | -- Portability : GHC 12 | -- 13 | -- Fast substring search for lazy 'Text', based on work by Boyer, 14 | -- Moore, Horspool, Sunday, and Lundh. Adapted from the strict 15 | -- implementation. 
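--
-- A usage sketch (for exposition; 'pack' is from "Data.Text.Lazy"):
-- matches are reported as non-overlapping offsets in 'Word16' units.
--
-- > -- indices (pack "ana") (pack "bananas") == [1]
-- > -- the overlapping occurrence at offset 3 is not reported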
16 | 17 | module Data.Text.Lazy.Search 18 | ( 19 | indices 20 | ) where 21 | 22 | import qualified Data.Text.Array as A 23 | import Data.Int (Int64) 24 | import Data.Word (Word16, Word64) 25 | import qualified Data.Text.Internal as T 26 | import Data.Text.Fusion.Internal (PairS(..)) 27 | import Data.Text.Lazy.Internal (Text(..), foldlChunks) 28 | import Data.Bits ((.|.), (.&.)) 29 | import Data.Text.UnsafeShift (shiftL) 30 | 31 | -- | /O(n+m)/ Find the offsets of all non-overlapping indices of 32 | -- @needle@ within @haystack@. 33 | -- 34 | -- This function is strict in @needle@, and lazy (as far as possible) 35 | -- in the chunks of @haystack@. 36 | -- 37 | -- In (unlikely) bad cases, this algorithm's complexity degrades 38 | -- towards /O(n*m)/. 39 | indices :: Text -- ^ Substring to search for (@needle@) 40 | -> Text -- ^ Text to search in (@haystack@) 41 | -> [Int64] 42 | indices needle@(Chunk n ns) _haystack@(Chunk k ks) 43 | | nlen <= 0 = [] 44 | | nlen == 1 = indicesOne (nindex 0) 0 k ks 45 | | otherwise = advance k ks 0 0 46 | where 47 | advance x@(T.Text _ _ l) xs = scan 48 | where 49 | scan !g !i 50 | | i >= m = case xs of 51 | Empty -> [] 52 | Chunk y ys -> advance y ys g (i-m) 53 | | lackingHay (i + nlen) x xs = [] 54 | | c == z && candidateMatch 0 = g : scan (g+nlen) (i+nlen) 55 | | otherwise = scan (g+delta) (i+delta) 56 | where 57 | m = fromIntegral l 58 | c = hindex (i + nlast) 59 | delta | nextInPattern = nlen + 1 60 | | c == z = skip + 1 61 | | otherwise = 1 62 | nextInPattern = mask .&. swizzle (hindex (i+nlen)) == 0 63 | candidateMatch !j 64 | | j >= nlast = True 65 | | hindex (i+j) /= nindex j = False 66 | | otherwise = candidateMatch (j+1) 67 | hindex = index x xs 68 | nlen = wordLength needle 69 | nlast = nlen - 1 70 | nindex = index n ns 71 | z = foldlChunks fin 0 needle 72 | where fin _ (T.Text farr foff flen) = A.unsafeIndex farr (foff+flen-1) 73 | (mask :: Word64) :*: skip = buildTable n ns 0 0 0 (nlen-2) 74 | swizzle w = 1 `shiftL` (fromIntegral w .&. 0x3f) 75 | buildTable (T.Text xarr xoff xlen) xs = go 76 | where 77 | go !(g::Int64) !i !msk !skp 78 | | i >= xlast = case xs of 79 | Empty -> (msk .|. swizzle z) :*: skp 80 | Chunk y ys -> buildTable y ys g 0 msk' skp' 81 | | otherwise = go (g+1) (i+1) msk' skp' 82 | where c = A.unsafeIndex xarr (xoff+i) 83 | msk' = msk .|. swizzle c 84 | skp' | c == z = nlen - g - 2 85 | | otherwise = skp 86 | xlast = xlen - 1 87 | -- | Check whether an attempt to index into the haystack at the 88 | -- given offset would fail. 89 | lackingHay q = go 0 90 | where 91 | go p (T.Text _ _ l) ps = p' < q && case ps of 92 | Empty -> True 93 | Chunk r rs -> go p' r rs 94 | where p' = p + fromIntegral l 95 | indices _ _ = [] 96 | 97 | -- | Fast index into a partly unpacked 'Text'. We take into account 98 | -- the possibility that the caller might try to access one element 99 | -- past the end. 100 | index :: T.Text -> Text -> Int64 -> Word16 101 | index (T.Text arr off len) xs !i 102 | | j < len = A.unsafeIndex arr (off+j) 103 | | otherwise = case xs of 104 | Empty 105 | -- out of bounds, but legal 106 | | j == len -> 0 107 | -- should never happen, due to lackingHay above 108 | | otherwise -> emptyError "index" 109 | Chunk c cs -> index c cs (i-fromIntegral len) 110 | where j = fromIntegral i 111 | 112 | -- | A variant of 'indices' that scans linearly for a single 'Word16'. 
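--
-- For exposition: every matching code unit is reported, since a
-- single-unit needle cannot overlap itself.  For instance, scanning the
-- single chunk of the lazy text @banana@ for the code unit 0x61 (the
-- letter a), starting at offset 0, yields:
--
-- > -- [1,3,5]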
113 | indicesOne :: Word16 -> Int64 -> T.Text -> Text -> [Int64] 114 | indicesOne c = chunk 115 | where 116 | chunk !i (T.Text oarr ooff olen) os = go 0 117 | where 118 | go h | h >= olen = case os of 119 | Empty -> [] 120 | Chunk y ys -> chunk (i+fromIntegral olen) y ys 121 | | on == c = i + fromIntegral h : go (h+1) 122 | | otherwise = go (h+1) 123 | where on = A.unsafeIndex oarr (ooff+h) 124 | 125 | -- | The number of 'Word16' values in a 'Text'. 126 | wordLength :: Text -> Int64 127 | wordLength = foldlChunks sumLength 0 128 | where sumLength i (T.Text _ _ l) = i + fromIntegral l 129 | 130 | emptyError :: String -> a 131 | emptyError fun = error ("Data.Text.Lazy.Search." ++ fun ++ ": empty input") 132 | -------------------------------------------------------------------------------- /cbits/cbits.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011 Bryan O'Sullivan . 3 | * 4 | * Portions copyright (c) 2008-2010 Björn Höhrmann . 5 | * 6 | * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | void _hs_text_memcpy(void *dest, size_t doff, const void *src, size_t soff, 14 | size_t n) 15 | { 16 | memcpy(dest + (doff<<1), src + (soff<<1), n<<1); 17 | } 18 | 19 | int _hs_text_memcmp(const void *a, size_t aoff, const void *b, size_t boff, 20 | size_t n) 21 | { 22 | return memcmp(a + (aoff<<1), b + (boff<<1), n<<1); 23 | } 24 | 25 | #define UTF8_ACCEPT 0 26 | #define UTF8_REJECT 12 27 | 28 | static const uint8_t utf8d[] = { 29 | /* 30 | * The first part of the table maps bytes to character classes that 31 | * to reduce the size of the transition table and create bitmasks. 32 | */ 33 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 34 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 35 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 36 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 37 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 38 | 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 39 | 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 40 | 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, 41 | 42 | /* 43 | * The second part is a transition table that maps a combination of 44 | * a state of the automaton and a character class to a state. 45 | */ 46 | 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, 47 | 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, 48 | 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, 49 | 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, 50 | 12,36,12,12,12,12,12,12,12,12,12,12, 51 | }; 52 | 53 | static inline uint32_t 54 | decode(uint32_t *state, uint32_t* codep, uint32_t byte) { 55 | uint32_t type = utf8d[byte]; 56 | 57 | *codep = (*state != UTF8_ACCEPT) ? 
58 | (byte & 0x3fu) | (*codep << 6) : 59 | (0xff >> type) & (byte); 60 | 61 | return *state = utf8d[256 + *state + type]; 62 | } 63 | 64 | /* 65 | * The ISO 8859-1 (aka latin-1) code points correspond exactly to the first 256 unicode 66 | * code-points, therefore we can trivially convert from a latin-1 encoded bytestring to 67 | * an UTF16 array 68 | */ 69 | void 70 | _hs_text_decode_latin1(uint16_t *dest, const uint8_t const *src, 71 | const uint8_t const *srcend) 72 | { 73 | const uint8_t *p = src; 74 | 75 | #if defined(__i386__) || defined(__x86_64__) 76 | /* This optimization works on a little-endian systems by using 77 | (aligned) 32-bit loads instead of 8-bit loads 78 | */ 79 | 80 | /* consume unaligned prefix */ 81 | while (p != srcend && (uintptr_t)p & 0x3) 82 | *dest++ = *p++; 83 | 84 | /* iterate over 32-bit aligned loads */ 85 | while (p < srcend - 3) { 86 | const uint32_t w = *((const uint32_t *)p); 87 | 88 | *dest++ = w & 0xff; 89 | *dest++ = (w >> 8) & 0xff; 90 | *dest++ = (w >> 16) & 0xff; 91 | *dest++ = (w >> 24) & 0xff; 92 | 93 | p += 4; 94 | } 95 | #endif 96 | 97 | /* handle unaligned suffix */ 98 | while (p != srcend) 99 | *dest++ = *p++; 100 | } 101 | 102 | /* 103 | * A best-effort decoder. Runs until it hits either end of input or 104 | * the start of an invalid byte sequence. 105 | * 106 | * At exit, updates *destoff with the next offset to write to, and 107 | * returns the next source offset to read from. 108 | */ 109 | uint8_t const * 110 | _hs_text_decode_utf8(uint16_t *dest, size_t *destoff, 111 | const uint8_t const *src, const uint8_t const *srcend) 112 | { 113 | uint16_t *d = dest + *destoff; 114 | const uint8_t const *s = src; 115 | uint32_t state = UTF8_ACCEPT; 116 | 117 | while (s < srcend) { 118 | uint32_t codepoint; 119 | 120 | #if defined(__i386__) || defined(__x86_64__) 121 | /* 122 | * This code will only work on a little-endian system that 123 | * supports unaligned loads. 124 | * 125 | * It gives a substantial speed win on data that is purely or 126 | * partly ASCII (e.g. HTML), at only a slight cost on purely 127 | * non-ASCII text. 128 | */ 129 | 130 | if (state == UTF8_ACCEPT) { 131 | while (s < srcend - 4) { 132 | codepoint = *((uint32_t *) s); 133 | if ((codepoint & 0x80808080) != 0) 134 | break; 135 | s += 4; 136 | 137 | /* 138 | * Tried 32-bit stores here, but the extra bit-twiddling 139 | * slowed the code down. 140 | */ 141 | 142 | *d++ = (uint16_t) (codepoint & 0xff); 143 | *d++ = (uint16_t) ((codepoint >> 8) & 0xff); 144 | *d++ = (uint16_t) ((codepoint >> 16) & 0xff); 145 | *d++ = (uint16_t) ((codepoint >> 24) & 0xff); 146 | } 147 | } 148 | #endif 149 | 150 | if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { 151 | if (state != UTF8_REJECT) 152 | continue; 153 | break; 154 | } 155 | 156 | if (codepoint <= 0xffff) 157 | *d++ = (uint16_t) codepoint; 158 | else { 159 | *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); 160 | *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); 161 | } 162 | } 163 | 164 | /* Error recovery - if we're not in a valid finishing state, back up. 
*/ 165 | if (state != UTF8_ACCEPT) 166 | s -= 1; 167 | 168 | *destoff = d - dest; 169 | 170 | return s; 171 | } 172 | -------------------------------------------------------------------------------- /Data/Text/Foreign.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving #-} 2 | -- | 3 | -- Module : Data.Text.Foreign 4 | -- Copyright : (c) 2009, 2010 Bryan O'Sullivan 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 8 | -- duncan@haskell.org 9 | -- Stability : experimental 10 | -- Portability : GHC 11 | -- 12 | -- Support for using 'Text' data with native code via the Haskell 13 | -- foreign function interface. 14 | 15 | module Data.Text.Foreign 16 | ( 17 | -- * Interoperability with native code 18 | -- $interop 19 | I16 20 | -- * Safe conversion functions 21 | , fromPtr 22 | , useAsPtr 23 | , asForeignPtr 24 | -- * Unsafe conversion code 25 | , lengthWord16 26 | , unsafeCopyToPtr 27 | -- * Low-level manipulation 28 | -- $lowlevel 29 | , dropWord16 30 | , takeWord16 31 | ) where 32 | 33 | #if defined(ASSERTS) 34 | import Control.Exception (assert) 35 | #endif 36 | #if __GLASGOW_HASKELL__ >= 702 37 | import Control.Monad.ST.Unsafe (unsafeIOToST) 38 | #else 39 | import Control.Monad.ST (unsafeIOToST) 40 | #endif 41 | import Data.Text.Internal (Text(..), empty) 42 | import Data.Text.Unsafe (lengthWord16) 43 | import qualified Data.Text.Array as A 44 | import Data.Word (Word16) 45 | import Foreign.Marshal.Alloc (allocaBytes) 46 | import Foreign.Ptr (Ptr, castPtr, plusPtr) 47 | import Foreign.ForeignPtr (ForeignPtr, mallocForeignPtrArray, withForeignPtr) 48 | import Foreign.Storable (peek, poke) 49 | 50 | -- $interop 51 | -- 52 | -- The 'Text' type is implemented using arrays that are not guaranteed 53 | -- to have a fixed address in the Haskell heap. All communication with 54 | -- native code must thus occur by copying data back and forth. 55 | -- 56 | -- The 'Text' type's internal representation is UTF-16, using the 57 | -- platform's native endianness. This makes copied data suitable for 58 | -- use with native libraries that use a similar representation, such 59 | -- as ICU. To interoperate with native libraries that use different 60 | -- internal representations, such as UTF-8 or UTF-32, consider using 61 | -- the functions in the 'Data.Text.Encoding' module. 62 | 63 | -- | A type representing a number of UTF-16 code units. 64 | newtype I16 = I16 Int 65 | deriving (Bounded, Enum, Eq, Integral, Num, Ord, Read, Real, Show) 66 | 67 | -- | /O(n)/ Create a new 'Text' from a 'Ptr' 'Word16' by copying the 68 | -- contents of the array. 69 | fromPtr :: Ptr Word16 -- ^ source array 70 | -> I16 -- ^ length of source array (in 'Word16' units) 71 | -> IO Text 72 | fromPtr _ (I16 0) = return empty 73 | fromPtr ptr (I16 len) = 74 | #if defined(ASSERTS) 75 | assert (len > 0) $ 76 | #endif 77 | return $! Text arr 0 len 78 | where 79 | arr = A.run (A.new len >>= copy) 80 | copy marr = loop ptr 0 81 | where 82 | loop !p !i | i == len = return marr 83 | | otherwise = do 84 | A.unsafeWrite marr i =<< unsafeIOToST (peek p) 85 | loop (p `plusPtr` 2) (i + 1) 86 | 87 | -- $lowlevel 88 | -- 89 | -- Foreign functions that use UTF-16 internally may return indices in 90 | -- units of 'Word16' instead of characters. These functions may 91 | -- safely be used with such indices, as they will adjust offsets if 92 | -- necessary to preserve the validity of a Unicode string. 
93 | 94 | -- | /O(1)/ Return the prefix of the 'Text' of @n@ 'Word16' units in 95 | -- length. 96 | -- 97 | -- If @n@ would cause the 'Text' to end inside a surrogate pair, the 98 | -- end of the prefix will be advanced by one additional 'Word16' unit 99 | -- to maintain its validity. 100 | takeWord16 :: I16 -> Text -> Text 101 | takeWord16 (I16 n) t@(Text arr off len) 102 | | n <= 0 = empty 103 | | n >= len || m >= len = t 104 | | otherwise = Text arr off m 105 | where 106 | m | w < 0xDB00 || w > 0xD8FF = n 107 | | otherwise = n+1 108 | w = A.unsafeIndex arr (off+n-1) 109 | 110 | -- | /O(1)/ Return the suffix of the 'Text', with @n@ 'Word16' units 111 | -- dropped from its beginning. 112 | -- 113 | -- If @n@ would cause the 'Text' to begin inside a surrogate pair, the 114 | -- beginning of the suffix will be advanced by one additional 'Word16' 115 | -- unit to maintain its validity. 116 | dropWord16 :: I16 -> Text -> Text 117 | dropWord16 (I16 n) t@(Text arr off len) 118 | | n <= 0 = t 119 | | n >= len || m >= len = empty 120 | | otherwise = Text arr (off+m) (len-m) 121 | where 122 | m | w < 0xD800 || w > 0xDBFF = n 123 | | otherwise = n+1 124 | w = A.unsafeIndex arr (off+n-1) 125 | 126 | -- | /O(n)/ Copy a 'Text' to an array. The array is assumed to be big 127 | -- enough to hold the contents of the entire 'Text'. 128 | unsafeCopyToPtr :: Text -> Ptr Word16 -> IO () 129 | unsafeCopyToPtr (Text arr off len) ptr = loop ptr off 130 | where 131 | end = off + len 132 | loop !p !i | i == end = return () 133 | | otherwise = do 134 | poke p (A.unsafeIndex arr i) 135 | loop (p `plusPtr` 2) (i + 1) 136 | 137 | -- | /O(n)/ Perform an action on a temporary, mutable copy of a 138 | -- 'Text'. The copy is freed as soon as the action returns. 139 | useAsPtr :: Text -> (Ptr Word16 -> I16 -> IO a) -> IO a 140 | useAsPtr t@(Text _arr _off len) action = 141 | allocaBytes (len * 2) $ \buf -> do 142 | unsafeCopyToPtr t buf 143 | action (castPtr buf) (fromIntegral len) 144 | 145 | -- | /O(n)/ Make a mutable copy of a 'Text'. 146 | asForeignPtr :: Text -> IO (ForeignPtr Word16, I16) 147 | asForeignPtr t@(Text _arr _off len) = do 148 | fp <- mallocForeignPtrArray len 149 | withForeignPtr fp $ unsafeCopyToPtr t 150 | return (fp, I16 len) 151 | -------------------------------------------------------------------------------- /text.cabal: -------------------------------------------------------------------------------- 1 | name: text 2 | version: 0.11.3.1 3 | homepage: https://github.com/bos/text 4 | bug-reports: https://github.com/bos/text/issues 5 | synopsis: An efficient packed Unicode text type. 6 | description: 7 | . 8 | An efficient packed, immutable Unicode text type (both strict and 9 | lazy), with a powerful loop fusion optimization framework. 10 | . 11 | The 'Text' type represents Unicode character strings, in a time and 12 | space-efficient manner. This package provides text processing 13 | capabilities that are optimized for performance critical use, both 14 | in terms of large data quantities and high speed. 15 | . 16 | The 'Text' type provides character-encoding, type-safe case 17 | conversion via whole-string case conversion functions. It also 18 | provides a range of functions for converting 'Text' values to and from 19 | 'ByteStrings', using several standard encodings. 20 | . 21 | Efficient locale-sensitive support for text IO is also supported. 22 | . 23 | These modules are intended to be imported qualified, to avoid name 24 | clashes with Prelude functions, e.g. 25 | . 
26 | > import qualified Data.Text as T 27 | . 28 | To use an extended and very rich family of functions for working 29 | with Unicode text (including normalization, regular expressions, 30 | non-standard encodings, text breaking, and locales), see 31 | the @text-icu@ package: 32 | 33 | . 34 | —— RELEASE NOTES —— 35 | . 36 | Changes in 0.11.2.0: 37 | . 38 | * String literals are now converted directly from the format in 39 | which GHC stores them into 'Text', without an intermediate 40 | transformation through 'String', and without inlining of 41 | conversion code at each site where a string literal is declared. 42 | . 43 | license: BSD3 44 | license-file: LICENSE 45 | author: Bryan O'Sullivan 46 | maintainer: Bryan O'Sullivan 47 | copyright: 2009-2011 Bryan O'Sullivan, 2008-2009 Tom Harper 48 | category: Data, Text 49 | build-type: Simple 50 | cabal-version: >= 1.8 51 | extra-source-files: 52 | -- scripts/CaseFolding.txt 53 | -- scripts/SpecialCasing.txt 54 | README.markdown 55 | benchmarks/Setup.hs 56 | benchmarks/cbits/*.c 57 | benchmarks/haskell/*.hs 58 | benchmarks/haskell/Benchmarks/*.hs 59 | benchmarks/python/*.py 60 | benchmarks/ruby/*.rb 61 | benchmarks/text-benchmarks.cabal 62 | scripts/*.hs 63 | tests-and-benchmarks.markdown 64 | tests/*.hs 65 | tests/.ghci 66 | tests/Makefile 67 | tests/Tests/*.hs 68 | tests/scripts/*.sh 69 | tests/text-tests.cabal 70 | 71 | flag developer 72 | description: operate in developer mode 73 | default: False 74 | 75 | flag integer-simple 76 | description: Use the simple integer library instead of GMP 77 | default: False 78 | 79 | library 80 | c-sources: cbits/cbits.c 81 | 82 | exposed-modules: 83 | Data.Text 84 | Data.Text.Array 85 | Data.Text.Encoding 86 | Data.Text.Encoding.Error 87 | Data.Text.Foreign 88 | Data.Text.IO 89 | Data.Text.Internal 90 | Data.Text.Lazy 91 | Data.Text.Lazy.Builder 92 | Data.Text.Lazy.Builder.Int 93 | Data.Text.Lazy.Builder.RealFloat 94 | Data.Text.Lazy.Encoding 95 | Data.Text.Lazy.IO 96 | Data.Text.Lazy.Internal 97 | Data.Text.Lazy.Read 98 | Data.Text.Read 99 | Data.Text.Unsafe 100 | other-modules: 101 | Data.Text.Encoding.Fusion 102 | Data.Text.Encoding.Fusion.Common 103 | Data.Text.Encoding.Utf16 104 | Data.Text.Encoding.Utf32 105 | Data.Text.Encoding.Utf8 106 | Data.Text.Fusion 107 | Data.Text.Fusion.CaseMapping 108 | Data.Text.Fusion.Common 109 | Data.Text.Fusion.Internal 110 | Data.Text.Fusion.Size 111 | Data.Text.IO.Internal 112 | Data.Text.Lazy.Builder.Functions 113 | Data.Text.Lazy.Builder.Int.Digits 114 | Data.Text.Lazy.Builder.Internal 115 | Data.Text.Lazy.Builder.RealFloat.Functions 116 | Data.Text.Lazy.Encoding.Fusion 117 | Data.Text.Lazy.Fusion 118 | Data.Text.Lazy.Search 119 | Data.Text.Private 120 | Data.Text.Search 121 | Data.Text.Unsafe.Base 122 | Data.Text.UnsafeChar 123 | Data.Text.UnsafeShift 124 | Data.Text.Util 125 | 126 | build-depends: 127 | array, 128 | base < 5, 129 | bytestring >= 0.9 130 | if impl(ghc >= 6.10) 131 | build-depends: 132 | ghc-prim, base >= 4, deepseq >= 1.1.0.0 133 | cpp-options: -DHAVE_DEEPSEQ 134 | else 135 | build-depends: extensible-exceptions 136 | extensions: ScopedTypeVariables 137 | 138 | ghc-options: -Wall -funbox-strict-fields -O2 139 | if impl(ghc >= 6.8) 140 | ghc-options: -fwarn-tabs 141 | if flag(developer) 142 | ghc-prof-options: -auto-all 143 | ghc-options: -Werror 144 | cpp-options: -DASSERTS 145 | 146 | if impl(ghc >= 6.11) 147 | if flag(integer-simple) 148 | cpp-options: -DINTEGER_SIMPLE 149 | build-depends: integer-simple >= 0.1 && < 0.5 150 | else 151 | 
cpp-options: -DINTEGER_GMP 152 | build-depends: integer-gmp >= 0.2 153 | 154 | if impl(ghc >= 6.9) && impl(ghc < 6.11) 155 | cpp-options: -DINTEGER_GMP 156 | build-depends: integer >= 0.1 && < 0.2 157 | 158 | test-suite tests 159 | type: exitcode-stdio-1.0 160 | hs-source-dirs: tests . 161 | main-is: Tests.hs 162 | c-sources: cbits/cbits.c 163 | 164 | ghc-options: 165 | -Wall -threaded -O0 -rtsopts 166 | 167 | cpp-options: 168 | -DASSERTS -DHAVE_DEEPSEQ 169 | 170 | build-depends: 171 | HUnit >= 1.2, 172 | QuickCheck >= 2.4, 173 | array, 174 | base, 175 | bytestring, 176 | deepseq, 177 | directory, 178 | ghc-prim, 179 | random, 180 | test-framework >= 0.4, 181 | test-framework-hunit >= 0.2, 182 | test-framework-quickcheck2 >= 0.2 183 | 184 | if impl(ghc >= 6.11) 185 | if flag(integer-simple) 186 | cpp-options: -DINTEGER_SIMPLE 187 | build-depends: integer-simple >= 0.1 && < 0.5 188 | else 189 | cpp-options: -DINTEGER_GMP 190 | build-depends: integer-gmp >= 0.2 191 | 192 | if impl(ghc >= 6.9) && impl(ghc < 6.11) 193 | cpp-options: -DINTEGER_GMP 194 | build-depends: integer >= 0.1 && < 0.2 195 | 196 | source-repository head 197 | type: git 198 | location: https://github.com/bos/text 199 | 200 | source-repository head 201 | type: mercurial 202 | location: https://bitbucket.org/bos/text 203 | -------------------------------------------------------------------------------- /Data/Text/IO/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, CPP, RecordWildCards #-} 2 | -- | 3 | -- Module : Data.Text.IO.Internal 4 | -- Copyright : (c) 2009, 2010 Bryan O'Sullivan, 5 | -- (c) 2009 Simon Marlow 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- Low-level support for text I\/O. 12 | 13 | module Data.Text.IO.Internal 14 | ( 15 | #if __GLASGOW_HASKELL__ >= 612 16 | hGetLineWith 17 | , readChunk 18 | #endif 19 | ) where 20 | 21 | #if __GLASGOW_HASKELL__ >= 612 22 | import qualified Control.Exception as E 23 | import Data.IORef (readIORef, writeIORef) 24 | import Data.Text (Text) 25 | import Data.Text.Fusion (unstream) 26 | import Data.Text.Fusion.Internal (Step(..), Stream(..)) 27 | import Data.Text.Fusion.Size (exactSize, maxSize) 28 | import Data.Text.Unsafe (inlinePerformIO) 29 | import Foreign.Storable (peekElemOff) 30 | import GHC.IO.Buffer (Buffer(..), CharBuffer, RawCharBuffer, bufferAdjustL, 31 | bufferElems, charSize, isEmptyBuffer, readCharBuf, 32 | withRawBuffer, writeCharBuf) 33 | import GHC.IO.Handle.Internals (ioe_EOF, readTextDevice, wantReadableHandle_) 34 | import GHC.IO.Handle.Types (Handle__(..), Newline(..)) 35 | import System.IO (Handle) 36 | import System.IO.Error (isEOFError) 37 | import qualified Data.Text as T 38 | 39 | -- | Read a single line of input from a handle, constructing a list of 40 | -- decoded chunks as we go. When we're done, transform them into the 41 | -- destination type. 42 | hGetLineWith :: ([Text] -> t) -> Handle -> IO t 43 | hGetLineWith f h = wantReadableHandle_ "hGetLine" h go 44 | where 45 | go hh@Handle__{..} = readIORef haCharBuffer >>= fmap f . 
hGetLineLoop hh [] 46 | 47 | hGetLineLoop :: Handle__ -> [Text] -> CharBuffer -> IO [Text] 48 | hGetLineLoop hh@Handle__{..} = go where 49 | go ts buf@Buffer{ bufL=r0, bufR=w, bufRaw=raw0 } = do 50 | let findEOL raw r | r == w = return (False, w) 51 | | otherwise = do 52 | (c,r') <- readCharBuf raw r 53 | if c == '\n' 54 | then return (True, r) 55 | else findEOL raw r' 56 | (eol, off) <- findEOL raw0 r0 57 | (t,r') <- if haInputNL == CRLF 58 | then unpack_nl raw0 r0 off 59 | else do t <- unpack raw0 r0 off 60 | return (t,off) 61 | if eol 62 | then do writeIORef haCharBuffer (bufferAdjustL (off+1) buf) 63 | return $ reverse (t:ts) 64 | else do 65 | let buf1 = bufferAdjustL r' buf 66 | maybe_buf <- maybeFillReadBuffer hh buf1 67 | case maybe_buf of 68 | -- Nothing indicates we caught an EOF, and we may have a 69 | -- partial line to return. 70 | Nothing -> do 71 | -- we reached EOF. There might be a lone \r left 72 | -- in the buffer, so check for that and 73 | -- append it to the line if necessary. 74 | let pre | isEmptyBuffer buf1 = T.empty 75 | | otherwise = T.singleton '\r' 76 | writeIORef haCharBuffer buf1{ bufL=0, bufR=0 } 77 | let str = reverse . filter (not . T.null) $ pre:t:ts 78 | if null str 79 | then ioe_EOF 80 | else return str 81 | Just new_buf -> go (t:ts) new_buf 82 | 83 | -- This function is lifted almost verbatim from GHC.IO.Handle.Text. 84 | maybeFillReadBuffer :: Handle__ -> CharBuffer -> IO (Maybe CharBuffer) 85 | maybeFillReadBuffer handle_ buf 86 | = E.catch (Just `fmap` getSomeCharacters handle_ buf) $ \e -> 87 | if isEOFError e 88 | then return Nothing 89 | else ioError e 90 | 91 | unpack :: RawCharBuffer -> Int -> Int -> IO Text 92 | unpack !buf !r !w 93 | | charSize /= 4 = sizeError "unpack" 94 | | r >= w = return T.empty 95 | | otherwise = withRawBuffer buf go 96 | where 97 | go pbuf = return $! unstream (Stream next r (exactSize (w-r))) 98 | where 99 | next !i | i >= w = Done 100 | | otherwise = Yield (ix i) (i+1) 101 | ix i = inlinePerformIO $ peekElemOff pbuf i 102 | 103 | unpack_nl :: RawCharBuffer -> Int -> Int -> IO (Text, Int) 104 | unpack_nl !buf !r !w 105 | | charSize /= 4 = sizeError "unpack_nl" 106 | | r >= w = return (T.empty, 0) 107 | | otherwise = withRawBuffer buf $ go 108 | where 109 | go pbuf = do 110 | let !t = unstream (Stream next r (maxSize (w-r))) 111 | w' = w - 1 112 | return $ if ix w' == '\r' 113 | then (t,w') 114 | else (t,w) 115 | where 116 | next !i | i >= w = Done 117 | | c == '\r' = let i' = i + 1 118 | in if i' < w 119 | then if ix i' == '\n' 120 | then Yield '\n' (i+2) 121 | else Yield '\n' i' 122 | else Done 123 | | otherwise = Yield c (i+1) 124 | where c = ix i 125 | ix i = inlinePerformIO $ peekElemOff pbuf i 126 | 127 | -- This function is completely lifted from GHC.IO.Handle.Text. 128 | getSomeCharacters :: Handle__ -> CharBuffer -> IO CharBuffer 129 | getSomeCharacters handle_@Handle__{..} buf@Buffer{..} = 130 | case bufferElems buf of 131 | -- buffer empty: read some more 132 | 0 -> {-# SCC "readTextDevice" #-} readTextDevice handle_ buf 133 | 134 | -- if the buffer has a single '\r' in it and we're doing newline 135 | -- translation: read some more 136 | 1 | haInputNL == CRLF -> do 137 | (c,_) <- readCharBuf bufRaw bufL 138 | if c == '\r' 139 | then do -- shuffle the '\r' to the beginning. This is only safe 140 | -- if we're about to call readTextDevice, otherwise it 141 | -- would mess up flushCharBuffer. 
142 | -- See [note Buffer Flushing], GHC.IO.Handle.Types 143 | _ <- writeCharBuf bufRaw 0 '\r' 144 | let buf' = buf{ bufL=0, bufR=1 } 145 | readTextDevice handle_ buf' 146 | else do 147 | return buf 148 | 149 | -- buffer has some chars in it already: just return it 150 | _otherwise -> {-# SCC "otherwise" #-} return buf 151 | 152 | -- | Read a single chunk of strict text from a buffer. Used by both 153 | -- the strict and lazy implementations of hGetContents. 154 | readChunk :: Handle__ -> CharBuffer -> IO Text 155 | readChunk hh@Handle__{..} buf = do 156 | buf'@Buffer{..} <- getSomeCharacters hh buf 157 | (t,r) <- if haInputNL == CRLF 158 | then unpack_nl bufRaw bufL bufR 159 | else do t <- unpack bufRaw bufL bufR 160 | return (t,bufR) 161 | writeIORef haCharBuffer (bufferAdjustL r buf') 162 | return t 163 | 164 | sizeError :: String -> a 165 | sizeError loc = error $ "Data.Text.IO." ++ loc ++ ": bad internal buffer size" 166 | #endif 167 | -------------------------------------------------------------------------------- /Data/Text/Lazy/IO.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, CPP, RecordWildCards #-} 2 | #if __GLASGOW_HASKELL__ >= 702 3 | {-# LANGUAGE Trustworthy #-} 4 | #endif 5 | -- | 6 | -- Module : Data.Text.Lazy.IO 7 | -- Copyright : (c) 2009, 2010 Bryan O'Sullivan, 8 | -- (c) 2009 Simon Marlow 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com 11 | -- Stability : experimental 12 | -- Portability : GHC 13 | -- 14 | -- Efficient locale-sensitive support for lazy text I\/O. 15 | -- 16 | -- Skip past the synopsis for some important notes on performance and 17 | -- portability across different versions of GHC. 18 | 19 | module Data.Text.Lazy.IO 20 | ( 21 | -- * Performance 22 | -- $performance 23 | 24 | -- * Locale support 25 | -- $locale 26 | -- * File-at-a-time operations 27 | readFile 28 | , writeFile 29 | , appendFile 30 | -- * Operations on handles 31 | , hGetContents 32 | , hGetLine 33 | , hPutStr 34 | , hPutStrLn 35 | -- * Special cases for standard input and output 36 | , interact 37 | , getContents 38 | , getLine 39 | , putStr 40 | , putStrLn 41 | ) where 42 | 43 | import Data.Text.Lazy (Text) 44 | import Prelude hiding (appendFile, getContents, getLine, interact, 45 | putStr, putStrLn, readFile, writeFile) 46 | import System.IO (Handle, IOMode(..), hPutChar, openFile, stdin, stdout, 47 | withFile) 48 | import qualified Data.Text.IO as T 49 | import qualified Data.Text.Lazy as L 50 | #if __GLASGOW_HASKELL__ <= 610 51 | import Data.Text.Lazy.Encoding (decodeUtf8) 52 | import qualified Data.ByteString.Char8 as S8 53 | import qualified Data.ByteString.Lazy.Char8 as L8 54 | #else 55 | import qualified Control.Exception as E 56 | import Control.Monad (when) 57 | import Data.IORef (readIORef) 58 | import Data.Text.IO.Internal (hGetLineWith, readChunk) 59 | import Data.Text.Lazy.Internal (chunk, empty) 60 | import GHC.IO.Buffer (isEmptyBuffer) 61 | import GHC.IO.Exception (IOException(..), IOErrorType(..), ioException) 62 | import GHC.IO.Handle.Internals (augmentIOError, hClose_help, 63 | wantReadableHandle, withHandle) 64 | import GHC.IO.Handle.Types (Handle__(..), HandleType(..)) 65 | import System.IO (BufferMode(..), hGetBuffering, hSetBuffering) 66 | import System.IO.Error (isEOFError) 67 | import System.IO.Unsafe (unsafeInterleaveIO) 68 | #endif 69 | 70 | -- $performance 71 | -- 72 | -- The functions in this module obey the runtime system's locale, 73 | -- character set encoding, and line 
ending conversion settings. 74 | -- 75 | -- If you know in advance that you will be working with data that has 76 | -- a specific encoding (e.g. UTF-8), and your application is highly 77 | -- performance sensitive, you may find that it is faster to perform 78 | -- I\/O with bytestrings and to encode and decode yourself than to use 79 | -- the functions in this module. 80 | -- 81 | -- Whether this will hold depends on the version of GHC you are using, 82 | -- the platform you are working on, the data you are working with, and 83 | -- the encodings you are using, so be sure to test for yourself. 84 | 85 | -- | Read a file and return its contents as a string. The file is 86 | -- read lazily, as with 'getContents'. 87 | readFile :: FilePath -> IO Text 88 | readFile name = openFile name ReadMode >>= hGetContents 89 | 90 | -- | Write a string to a file. The file is truncated to zero length 91 | -- before writing begins. 92 | writeFile :: FilePath -> Text -> IO () 93 | writeFile p = withFile p WriteMode . flip hPutStr 94 | 95 | -- | Write a string to the end of a file. 96 | appendFile :: FilePath -> Text -> IO () 97 | appendFile p = withFile p AppendMode . flip hPutStr 98 | 99 | -- | Lazily read the remaining contents of a 'Handle'. The 'Handle' 100 | -- will be closed after the read completes, or on error. 101 | hGetContents :: Handle -> IO Text 102 | #if __GLASGOW_HASKELL__ <= 610 103 | hGetContents = fmap decodeUtf8 . L8.hGetContents 104 | #else 105 | hGetContents h = do 106 | chooseGoodBuffering h 107 | wantReadableHandle "hGetContents" h $ \hh -> do 108 | ts <- lazyRead h 109 | return (hh{haType=SemiClosedHandle}, ts) 110 | 111 | -- | Use a more efficient buffer size if we're reading in 112 | -- block-buffered mode with the default buffer size. 113 | chooseGoodBuffering :: Handle -> IO () 114 | chooseGoodBuffering h = do 115 | bufMode <- hGetBuffering h 116 | when (bufMode == BlockBuffering Nothing) $ 117 | hSetBuffering h (BlockBuffering (Just 16384)) 118 | 119 | lazyRead :: Handle -> IO Text 120 | lazyRead h = unsafeInterleaveIO $ 121 | withHandle "hGetContents" h $ \hh -> do 122 | case haType hh of 123 | ClosedHandle -> return (hh, L.empty) 124 | SemiClosedHandle -> lazyReadBuffered h hh 125 | _ -> ioException 126 | (IOError (Just h) IllegalOperation "hGetContents" 127 | "illegal handle type" Nothing Nothing) 128 | 129 | lazyReadBuffered :: Handle -> Handle__ -> IO (Handle__, Text) 130 | lazyReadBuffered h hh@Handle__{..} = do 131 | buf <- readIORef haCharBuffer 132 | (do t <- readChunk hh buf 133 | ts <- lazyRead h 134 | return (hh, chunk t ts)) `E.catch` \e -> do 135 | (hh', _) <- hClose_help hh 136 | if isEOFError e 137 | then return $ if isEmptyBuffer buf 138 | then (hh', empty) 139 | else (hh', L.singleton '\r') 140 | else E.throwIO (augmentIOError e "hGetContents" h) 141 | #endif 142 | 143 | -- | Read a single line from a handle. 144 | hGetLine :: Handle -> IO Text 145 | #if __GLASGOW_HASKELL__ <= 610 146 | hGetLine = fmap (decodeUtf8 . L8.fromChunks . (:[])) . S8.hGetLine 147 | #else 148 | hGetLine = hGetLineWith L.fromChunks 149 | #endif 150 | 151 | -- | Write a string to a handle. 152 | hPutStr :: Handle -> Text -> IO () 153 | hPutStr h = mapM_ (T.hPutStr h) . L.toChunks 154 | 155 | -- | Write a string to a handle, followed by a newline. 156 | hPutStrLn :: Handle -> Text -> IO () 157 | hPutStrLn h t = hPutStr h t >> hPutChar h '\n' 158 | 159 | -- | The 'interact' function takes a function of type @Text -> Text@ 160 | -- as its argument. 
The entire input from the standard input device is 161 | -- passed (lazily) to this function as its argument, and the resulting 162 | -- string is output on the standard output device. 163 | interact :: (Text -> Text) -> IO () 164 | interact f = putStr . f =<< getContents 165 | 166 | -- | Lazily read all user input on 'stdin' as a single string. 167 | getContents :: IO Text 168 | getContents = hGetContents stdin 169 | 170 | -- | Read a single line of user input from 'stdin'. 171 | getLine :: IO Text 172 | getLine = hGetLine stdin 173 | 174 | -- | Write a string to 'stdout'. 175 | putStr :: Text -> IO () 176 | putStr = hPutStr stdout 177 | 178 | -- | Write a string to 'stdout', followed by a newline. 179 | putStrLn :: Text -> IO () 180 | putStrLn = hPutStrLn stdout 181 | 182 | -- $locale 183 | -- 184 | -- /Note/: The behaviour of functions in this module depends on the 185 | -- version of GHC you are using. 186 | -- 187 | -- Beginning with GHC 6.12, text I\/O is performed using the system or 188 | -- handle's current locale and line ending conventions. 189 | -- 190 | -- Under GHC 6.10 and earlier, the system I\/O libraries /do not 191 | -- support/ locale-sensitive I\/O or line ending conversion. On these 192 | -- versions of GHC, functions in this library all use UTF-8. What 193 | -- does this mean in practice? 194 | -- 195 | -- * All data that is read will be decoded as UTF-8. 196 | -- 197 | -- * Before data is written, it is first encoded as UTF-8. 198 | -- 199 | -- * On both reading and writing, the platform's native newline 200 | -- conversion is performed. 201 | -- 202 | -- If you must use a non-UTF-8 locale on an older version of GHC, you 203 | -- will have to perform the transcoding yourself, e.g. as follows: 204 | -- 205 | -- > import qualified Data.ByteString.Lazy as B 206 | -- > import Data.Text.Lazy (Text) 207 | -- > import Data.Text.Lazy.Encoding (encodeUtf16LE) 208 | -- > 209 | -- > putStr_Utf16LE :: Text -> IO () 210 | -- > putStr_Utf16LE t = B.putStr (encodeUtf16LE t) 211 | -------------------------------------------------------------------------------- /Data/Text/Encoding/Fusion.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, CPP, Rank2Types #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Encoding.Fusion 5 | -- Copyright : (c) Tom Harper 2008-2009, 6 | -- (c) Bryan O'Sullivan 2009, 7 | -- (c) Duncan Coutts 2009 8 | -- 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 11 | -- duncan@haskell.org 12 | -- Stability : experimental 13 | -- Portability : portable 14 | -- 15 | -- Fusible 'Stream'-oriented functions for converting between 'Text' 16 | -- and several common encodings. 
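--
-- As an illustrative sketch only (not part of the original header): a
-- higher-level decoder can be built from these streaming functions by
-- pairing them with 'Data.Text.Fusion.unstream'.  The helper name
-- @myDecodeUtf8With@ below is made up for the example.
--
-- > import qualified Data.Text.Fusion as F
-- > import qualified Data.Text.Encoding.Fusion as E
-- >
-- > myDecodeUtf8With :: OnDecodeError -> ByteString -> Text
-- > myDecodeUtf8With onErr = F.unstream . E.streamUtf8 onErr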
17 | 18 | module Data.Text.Encoding.Fusion 19 | ( 20 | -- * Streaming 21 | streamASCII 22 | , streamUtf8 23 | , streamUtf16LE 24 | , streamUtf16BE 25 | , streamUtf32LE 26 | , streamUtf32BE 27 | 28 | -- * Unstreaming 29 | , unstream 30 | 31 | , module Data.Text.Encoding.Fusion.Common 32 | ) where 33 | 34 | #if defined(ASSERTS) 35 | import Control.Exception (assert) 36 | #endif 37 | import Data.ByteString.Internal (ByteString(..), mallocByteString, memcpy) 38 | import Data.Text.Fusion (Step(..), Stream(..)) 39 | import Data.Text.Fusion.Size 40 | import Data.Text.Encoding.Error 41 | import Data.Text.Encoding.Fusion.Common 42 | import Data.Text.UnsafeChar (unsafeChr, unsafeChr8, unsafeChr32) 43 | import Data.Text.UnsafeShift (shiftL, shiftR) 44 | import Data.Word (Word8, Word16, Word32) 45 | import Foreign.ForeignPtr (withForeignPtr, ForeignPtr) 46 | import Foreign.Storable (pokeByteOff) 47 | import qualified Data.ByteString as B 48 | import qualified Data.ByteString.Unsafe as B 49 | import qualified Data.Text.Encoding.Utf8 as U8 50 | import qualified Data.Text.Encoding.Utf16 as U16 51 | import qualified Data.Text.Encoding.Utf32 as U32 52 | import Data.Text.Unsafe (unsafeDupablePerformIO) 53 | 54 | streamASCII :: ByteString -> Stream Char 55 | streamASCII bs = Stream next 0 (maxSize l) 56 | where 57 | l = B.length bs 58 | {-# INLINE next #-} 59 | next i 60 | | i >= l = Done 61 | | otherwise = Yield (unsafeChr8 x1) (i+1) 62 | where 63 | x1 = B.unsafeIndex bs i 64 | {-# DEPRECATED streamASCII "Do not use this function" #-} 65 | {-# INLINE [0] streamASCII #-} 66 | 67 | -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using UTF-8 68 | -- encoding. 69 | streamUtf8 :: OnDecodeError -> ByteString -> Stream Char 70 | streamUtf8 onErr bs = Stream next 0 (maxSize l) 71 | where 72 | l = B.length bs 73 | next i 74 | | i >= l = Done 75 | | U8.validate1 x1 = Yield (unsafeChr8 x1) (i+1) 76 | | i+1 < l && U8.validate2 x1 x2 = Yield (U8.chr2 x1 x2) (i+2) 77 | | i+2 < l && U8.validate3 x1 x2 x3 = Yield (U8.chr3 x1 x2 x3) (i+3) 78 | | i+3 < l && U8.validate4 x1 x2 x3 x4 = Yield (U8.chr4 x1 x2 x3 x4) (i+4) 79 | | otherwise = decodeError "streamUtf8" "UTF-8" onErr (Just x1) (i+1) 80 | where 81 | x1 = idx i 82 | x2 = idx (i + 1) 83 | x3 = idx (i + 2) 84 | x4 = idx (i + 3) 85 | idx = B.unsafeIndex bs 86 | {-# INLINE [0] streamUtf8 #-} 87 | 88 | -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little 89 | -- endian UTF-16 encoding. 90 | streamUtf16LE :: OnDecodeError -> ByteString -> Stream Char 91 | streamUtf16LE onErr bs = Stream next 0 (maxSize (l `shiftR` 1)) 92 | where 93 | l = B.length bs 94 | {-# INLINE next #-} 95 | next i 96 | | i >= l = Done 97 | | i+1 < l && U16.validate1 x1 = Yield (unsafeChr x1) (i+2) 98 | | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4) 99 | | otherwise = decodeError "streamUtf16LE" "UTF-16LE" onErr Nothing (i+1) 100 | where 101 | x1 = idx i + (idx (i + 1) `shiftL` 8) 102 | x2 = idx (i + 2) + (idx (i + 3) `shiftL` 8) 103 | idx = fromIntegral . B.unsafeIndex bs :: Int -> Word16 104 | {-# INLINE [0] streamUtf16LE #-} 105 | 106 | -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big 107 | -- endian UTF-16 encoding. 
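--
-- For example (a hedged sketch; 'strictDecode' comes from
-- "Data.Text.Encoding.Error" and @B.pack@ builds the input bytes):
--
-- > streamUtf16BE strictDecode (B.pack [0x00, 0x41])  -- a stream yielding 'A'
-- > streamUtf16LE strictDecode (B.pack [0x41, 0x00])  -- the same character, byte-swapped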
108 | streamUtf16BE :: OnDecodeError -> ByteString -> Stream Char 109 | streamUtf16BE onErr bs = Stream next 0 (maxSize (l `shiftR` 1)) 110 | where 111 | l = B.length bs 112 | {-# INLINE next #-} 113 | next i 114 | | i >= l = Done 115 | | i+1 < l && U16.validate1 x1 = Yield (unsafeChr x1) (i+2) 116 | | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4) 117 | | otherwise = decodeError "streamUtf16BE" "UTF-16BE" onErr Nothing (i+1) 118 | where 119 | x1 = (idx i `shiftL` 8) + idx (i + 1) 120 | x2 = (idx (i + 2) `shiftL` 8) + idx (i + 3) 121 | idx = fromIntegral . B.unsafeIndex bs :: Int -> Word16 122 | {-# INLINE [0] streamUtf16BE #-} 123 | 124 | -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big 125 | -- endian UTF-32 encoding. 126 | streamUtf32BE :: OnDecodeError -> ByteString -> Stream Char 127 | streamUtf32BE onErr bs = Stream next 0 (maxSize (l `shiftR` 2)) 128 | where 129 | l = B.length bs 130 | {-# INLINE next #-} 131 | next i 132 | | i >= l = Done 133 | | i+3 < l && U32.validate x = Yield (unsafeChr32 x) (i+4) 134 | | otherwise = decodeError "streamUtf32BE" "UTF-32BE" onErr Nothing (i+1) 135 | where 136 | x = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4 137 | x1 = idx i 138 | x2 = idx (i+1) 139 | x3 = idx (i+2) 140 | x4 = idx (i+3) 141 | idx = fromIntegral . B.unsafeIndex bs :: Int -> Word32 142 | {-# INLINE [0] streamUtf32BE #-} 143 | 144 | -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little 145 | -- endian UTF-32 encoding. 146 | streamUtf32LE :: OnDecodeError -> ByteString -> Stream Char 147 | streamUtf32LE onErr bs = Stream next 0 (maxSize (l `shiftR` 2)) 148 | where 149 | l = B.length bs 150 | {-# INLINE next #-} 151 | next i 152 | | i >= l = Done 153 | | i+3 < l && U32.validate x = Yield (unsafeChr32 x) (i+4) 154 | | otherwise = decodeError "streamUtf32LE" "UTF-32LE" onErr Nothing (i+1) 155 | where 156 | x = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1 157 | x1 = idx i 158 | x2 = idx $ i+1 159 | x3 = idx $ i+2 160 | x4 = idx $ i+3 161 | idx = fromIntegral . B.unsafeIndex bs :: Int -> Word32 162 | {-# INLINE [0] streamUtf32LE #-} 163 | 164 | -- | /O(n)/ Convert a 'Stream' 'Word8' to a 'ByteString'. 165 | unstream :: Stream Word8 -> ByteString 166 | unstream (Stream next s0 len) = unsafeDupablePerformIO $ do 167 | let mlen = upperBound 4 len 168 | mallocByteString mlen >>= loop mlen 0 s0 169 | where 170 | loop !n !off !s fp = case next s of 171 | Done -> trimUp fp n off 172 | Skip s' -> loop n off s' fp 173 | Yield x s' 174 | | off == n -> realloc fp n off s' x 175 | | otherwise -> do 176 | withForeignPtr fp $ \p -> pokeByteOff p off x 177 | loop n (off+1) s' fp 178 | {-# NOINLINE realloc #-} 179 | realloc fp n off s x = do 180 | let n' = n+n 181 | fp' <- copy0 fp n n' 182 | withForeignPtr fp' $ \p -> pokeByteOff p off x 183 | loop n' (off+1) s fp' 184 | {-# NOINLINE trimUp #-} 185 | trimUp fp _ off = return $! PS fp 0 off 186 | copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8) 187 | copy0 !src !srcLen !destLen = 188 | #if defined(ASSERTS) 189 | assert (srcLen <= destLen) $ 190 | #endif 191 | do 192 | dest <- mallocByteString destLen 193 | withForeignPtr src $ \src' -> 194 | withForeignPtr dest $ \dest' -> 195 | memcpy dest' src' (fromIntegral srcLen) 196 | return dest 197 | 198 | decodeError :: forall s. 
String -> String -> OnDecodeError -> Maybe Word8 199 | -> s -> Step s Char 200 | decodeError func kind onErr mb i = 201 | case onErr desc mb of 202 | Nothing -> Skip i 203 | Just c -> Yield c i 204 | where desc = "Data.Text.Encoding.Fusion." ++ func ++ ": Invalid " ++ 205 | kind ++ " stream" 206 | -------------------------------------------------------------------------------- /Data/Text/Array.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, MagicHash, Rank2Types, 2 | RecordWildCards, UnboxedTuples, UnliftedFFITypes #-} 3 | {-# OPTIONS_GHC -fno-warn-unused-matches #-} 4 | -- | 5 | -- Module : Data.Text.Array 6 | -- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan 7 | -- 8 | -- License : BSD-style 9 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 10 | -- duncan@haskell.org 11 | -- Stability : experimental 12 | -- Portability : portable 13 | -- 14 | -- Packed, unboxed, heap-resident arrays. Suitable for performance 15 | -- critical use, both in terms of large data quantities and high 16 | -- speed. 17 | -- 18 | -- This module is intended to be imported @qualified@, to avoid name 19 | -- clashes with "Prelude" functions, e.g. 20 | -- 21 | -- > import qualified Data.Text.Array as A 22 | -- 23 | -- The names in this module resemble those in the 'Data.Array' family 24 | -- of modules, but are shorter due to the assumption of qualified 25 | -- naming. 26 | module Data.Text.Array 27 | ( 28 | -- * Types 29 | Array(aBA) 30 | , MArray(maBA) 31 | 32 | -- * Functions 33 | , copyM 34 | , copyI 35 | , empty 36 | , equal 37 | #if defined(ASSERTS) 38 | , length 39 | #endif 40 | , run 41 | , run2 42 | , toList 43 | , unsafeFreeze 44 | , unsafeIndex 45 | , new 46 | , unsafeWrite 47 | ) where 48 | 49 | #if defined(ASSERTS) 50 | -- This fugly hack is brought by GHC's apparent reluctance to deal 51 | -- with MagicHash and UnboxedTuples when inferring types. Eek! 52 | # define CHECK_BOUNDS(_func_,_len_,_k_) \ 53 | if (_k_) < 0 || (_k_) >= (_len_) then error ("Data.Text.Array." ++ (_func_) ++ ": bounds error, offset " ++ show (_k_) ++ ", length " ++ show (_len_)) else 54 | #else 55 | # define CHECK_BOUNDS(_func_,_len_,_k_) 56 | #endif 57 | 58 | #include "MachDeps.h" 59 | 60 | #if defined(ASSERTS) 61 | import Control.Exception (assert) 62 | #endif 63 | #if __GLASGOW_HASKELL__ >= 702 64 | import Control.Monad.ST.Unsafe (unsafeIOToST) 65 | #else 66 | import Control.Monad.ST (unsafeIOToST) 67 | #endif 68 | import Data.Bits ((.&.), xor) 69 | import Data.Text.Unsafe.Base (inlinePerformIO) 70 | import Data.Text.UnsafeShift (shiftL, shiftR) 71 | #if __GLASGOW_HASKELL__ >= 703 72 | import Foreign.C.Types (CInt(CInt), CSize(CSize)) 73 | #else 74 | import Foreign.C.Types (CInt, CSize) 75 | #endif 76 | import GHC.Base (ByteArray#, MutableByteArray#, Int(..), 77 | indexWord16Array#, newByteArray#, 78 | unsafeCoerce#, writeWord16Array#) 79 | import GHC.ST (ST(..), runST) 80 | import GHC.Word (Word16(..)) 81 | import Prelude hiding (length, read) 82 | 83 | -- | Immutable array type. 84 | data Array = Array { 85 | aBA :: ByteArray# 86 | #if defined(ASSERTS) 87 | , aLen :: {-# UNPACK #-} !Int -- length (in units of Word16, not bytes) 88 | #endif 89 | } 90 | 91 | -- | Mutable array type, for use in the ST monad. 
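--
-- A typical lifecycle, as an illustrative sketch (the helper name
-- @singletonArray@ is made up): allocate with 'new', fill with
-- 'unsafeWrite', then freeze via 'run':
--
-- > singletonArray :: Word16 -> Array
-- > singletonArray w = run $ do
-- >   marr <- new 1
-- >   unsafeWrite marr 0 w
-- >   return marr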
92 | data MArray s = MArray { 93 | maBA :: MutableByteArray# s 94 | #if defined(ASSERTS) 95 | , maLen :: {-# UNPACK #-} !Int -- length (in units of Word16, not bytes) 96 | #endif 97 | } 98 | 99 | #if defined(ASSERTS) 100 | -- | Operations supported by all arrays. 101 | class IArray a where 102 | -- | Return the length of an array. 103 | length :: a -> Int 104 | 105 | instance IArray Array where 106 | length = aLen 107 | {-# INLINE length #-} 108 | 109 | instance IArray (MArray s) where 110 | length = maLen 111 | {-# INLINE length #-} 112 | #endif 113 | 114 | -- | Create an uninitialized mutable array. 115 | new :: forall s. Int -> ST s (MArray s) 116 | new n 117 | | n < 0 || n .&. highBit /= 0 = array_size_error 118 | | otherwise = ST $ \s1# -> 119 | case newByteArray# len# s1# of 120 | (# s2#, marr# #) -> (# s2#, MArray marr# 121 | #if defined(ASSERTS) 122 | n 123 | #endif 124 | #) 125 | where !(I# len#) = bytesInArray n 126 | highBit = maxBound `xor` (maxBound `shiftR` 1) 127 | {-# INLINE new #-} 128 | 129 | array_size_error :: a 130 | array_size_error = error "Data.Text.Array.new: size overflow" 131 | 132 | -- | Freeze a mutable array. Do not mutate the 'MArray' afterwards! 133 | unsafeFreeze :: MArray s -> ST s Array 134 | unsafeFreeze MArray{..} = ST $ \s# -> 135 | (# s#, Array (unsafeCoerce# maBA) 136 | #if defined(ASSERTS) 137 | maLen 138 | #endif 139 | #) 140 | {-# INLINE unsafeFreeze #-} 141 | 142 | -- | Indicate how many bytes would be used for an array of the given 143 | -- size. 144 | bytesInArray :: Int -> Int 145 | bytesInArray n = n `shiftL` 1 146 | {-# INLINE bytesInArray #-} 147 | 148 | -- | Unchecked read of an immutable array. May return garbage or 149 | -- crash on an out-of-bounds access. 150 | unsafeIndex :: Array -> Int -> Word16 151 | unsafeIndex Array{..} i@(I# i#) = 152 | CHECK_BOUNDS("unsafeIndex",aLen,i) 153 | case indexWord16Array# aBA i# of r# -> (W16# r#) 154 | {-# INLINE unsafeIndex #-} 155 | 156 | -- | Unchecked write of a mutable array. May return garbage or crash 157 | -- on an out-of-bounds access. 158 | unsafeWrite :: MArray s -> Int -> Word16 -> ST s () 159 | unsafeWrite MArray{..} i@(I# i#) (W16# e#) = ST $ \s1# -> 160 | CHECK_BOUNDS("unsafeWrite",maLen,i) 161 | case writeWord16Array# maBA i# e# s1# of 162 | s2# -> (# s2#, () #) 163 | {-# INLINE unsafeWrite #-} 164 | 165 | -- | Convert an immutable array to a list. 166 | toList :: Array -> Int -> Int -> [Word16] 167 | toList ary off len = loop 0 168 | where loop i | i < len = unsafeIndex ary (off+i) : loop (i+1) 169 | | otherwise = [] 170 | 171 | -- | An empty immutable array. 172 | empty :: Array 173 | empty = runST (new 0 >>= unsafeFreeze) 174 | 175 | -- | Run an action in the ST monad and return an immutable array of 176 | -- its result. 177 | run :: (forall s. ST s (MArray s)) -> Array 178 | run k = runST (k >>= unsafeFreeze) 179 | 180 | -- | Run an action in the ST monad and return an immutable array of 181 | -- its result paired with whatever else the action returns. 182 | run2 :: (forall s. ST s (MArray s, a)) -> (Array, a) 183 | run2 k = runST (do 184 | (marr,b) <- k 185 | arr <- unsafeFreeze marr 186 | return (arr,b)) 187 | {-# INLINE run2 #-} 188 | 189 | -- | Copy some elements of a mutable array. 
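--
-- For instance (sketch only; @grow@ is a hypothetical helper), enlarging
-- a buffer means copying the old contents into a freshly allocated,
-- larger array:
--
-- > grow :: MArray s -> Int -> Int -> ST s (MArray s)
-- > grow old oldLen newLen = do
-- >   marr <- new newLen
-- >   copyM marr 0 old 0 oldLen
-- >   return marr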
190 | copyM :: MArray s -- ^ Destination 191 | -> Int -- ^ Destination offset 192 | -> MArray s -- ^ Source 193 | -> Int -- ^ Source offset 194 | -> Int -- ^ Count 195 | -> ST s () 196 | copyM dest didx src sidx count 197 | | count <= 0 = return () 198 | | otherwise = 199 | #if defined(ASSERTS) 200 | assert (sidx + count <= length src) . 201 | assert (didx + count <= length dest) . 202 | #endif 203 | unsafeIOToST $ memcpyM (maBA dest) (fromIntegral didx) 204 | (maBA src) (fromIntegral sidx) 205 | (fromIntegral count) 206 | {-# INLINE copyM #-} 207 | 208 | -- | Copy some elements of an immutable array. 209 | copyI :: MArray s -- ^ Destination 210 | -> Int -- ^ Destination offset 211 | -> Array -- ^ Source 212 | -> Int -- ^ Source offset 213 | -> Int -- ^ First offset in destination /not/ to 214 | -- copy (i.e. /not/ length) 215 | -> ST s () 216 | copyI dest i0 src j0 top 217 | | i0 >= top = return () 218 | | otherwise = unsafeIOToST $ 219 | memcpyI (maBA dest) (fromIntegral i0) 220 | (aBA src) (fromIntegral j0) 221 | (fromIntegral (top-i0)) 222 | {-# INLINE copyI #-} 223 | 224 | -- | Compare portions of two arrays for equality. No bounds checking 225 | -- is performed. 226 | equal :: Array -- ^ First 227 | -> Int -- ^ Offset into first 228 | -> Array -- ^ Second 229 | -> Int -- ^ Offset into second 230 | -> Int -- ^ Count 231 | -> Bool 232 | equal arrA offA arrB offB count = inlinePerformIO $ do 233 | i <- memcmp (aBA arrA) (fromIntegral offA) 234 | (aBA arrB) (fromIntegral offB) (fromIntegral count) 235 | return $! i == 0 236 | {-# INLINE equal #-} 237 | 238 | foreign import ccall unsafe "_hs_text_memcpy" memcpyI 239 | :: MutableByteArray# s -> CSize -> ByteArray# -> CSize -> CSize -> IO () 240 | 241 | foreign import ccall unsafe "_hs_text_memcmp" memcmp 242 | :: ByteArray# -> CSize -> ByteArray# -> CSize -> CSize -> IO CInt 243 | 244 | foreign import ccall unsafe "_hs_text_memcpy" memcpyM 245 | :: MutableByteArray# s -> CSize -> MutableByteArray# s -> CSize -> CSize 246 | -> IO () 247 | -------------------------------------------------------------------------------- /Data/Text/Lazy/Read.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings, CPP #-} 2 | #if __GLASGOW_HASKELL__ >= 702 3 | {-# LANGUAGE Trustworthy #-} 4 | #endif 5 | 6 | -- | 7 | -- Module : Data.Text.Lazy.Read 8 | -- Copyright : (c) 2010, 2011 Bryan O'Sullivan 9 | -- 10 | -- License : BSD-style 11 | -- Maintainer : bos@serpentine.com 12 | -- Stability : experimental 13 | -- Portability : GHC 14 | -- 15 | -- Functions used frequently when reading textual data. 16 | module Data.Text.Lazy.Read 17 | ( 18 | Reader 19 | , decimal 20 | , hexadecimal 21 | , signed 22 | , rational 23 | , double 24 | ) where 25 | 26 | import Control.Monad (liftM) 27 | import Data.Char (isDigit, isHexDigit, ord) 28 | import Data.Int (Int8, Int16, Int32, Int64) 29 | import Data.Ratio ((%)) 30 | import Data.Text.Lazy as T 31 | import Data.Word (Word, Word8, Word16, Word32, Word64) 32 | 33 | -- | Read some text. If the read succeeds, return its value and the 34 | -- remaining text, otherwise an error message. 35 | type Reader a = Text -> Either String (a,Text) 36 | 37 | -- | Read a decimal integer. The input must begin with at least one 38 | -- decimal digit, and is consumed until a non-digit or end of string 39 | -- is reached. 40 | -- 41 | -- This function does not handle leading sign characters. If you need 42 | -- to handle signed input, use @'signed' 'decimal'@. 
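--
-- For example (behaviour sketched from the description above):
--
-- >decimal "123abc"       == Right (123, "abc")
-- >signed decimal "-42xy" == Right (-42, "xy")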
43 | -- 44 | -- /Note/: For fixed-width integer types, this function does not 45 | -- attempt to detect overflow, so a sufficiently long input may give 46 | -- incorrect results. If you are worried about overflow, use 47 | -- 'Integer' for your result type. 48 | decimal :: Integral a => Reader a 49 | {-# SPECIALIZE decimal :: Reader Int #-} 50 | {-# SPECIALIZE decimal :: Reader Int8 #-} 51 | {-# SPECIALIZE decimal :: Reader Int16 #-} 52 | {-# SPECIALIZE decimal :: Reader Int32 #-} 53 | {-# SPECIALIZE decimal :: Reader Int64 #-} 54 | {-# SPECIALIZE decimal :: Reader Integer #-} 55 | {-# SPECIALIZE decimal :: Reader Word #-} 56 | {-# SPECIALIZE decimal :: Reader Word8 #-} 57 | {-# SPECIALIZE decimal :: Reader Word16 #-} 58 | {-# SPECIALIZE decimal :: Reader Word32 #-} 59 | {-# SPECIALIZE decimal :: Reader Word64 #-} 60 | decimal txt 61 | | T.null h = Left "input does not start with a digit" 62 | | otherwise = Right (T.foldl' go 0 h, t) 63 | where (h,t) = T.span isDigit txt 64 | go n d = (n * 10 + fromIntegral (digitToInt d)) 65 | 66 | -- | Read a hexadecimal integer, consisting of an optional leading 67 | -- @\"0x\"@ followed by at least one decimal digit. Input is consumed 68 | -- until a non-hex-digit or end of string is reached. This function 69 | -- is case insensitive. 70 | -- 71 | -- This function does not handle leading sign characters. If you need 72 | -- to handle signed input, use @'signed' 'hexadecimal'@. 73 | -- 74 | -- /Note/: For fixed-width integer types, this function does not 75 | -- attempt to detect overflow, so a sufficiently long input may give 76 | -- incorrect results. If you are worried about overflow, use 77 | -- 'Integer' for your result type. 78 | hexadecimal :: Integral a => Reader a 79 | {-# SPECIALIZE hexadecimal :: Reader Int #-} 80 | {-# SPECIALIZE hexadecimal :: Reader Integer #-} 81 | hexadecimal txt 82 | | h == "0x" || h == "0X" = hex t 83 | | otherwise = hex txt 84 | where (h,t) = T.splitAt 2 txt 85 | 86 | hex :: Integral a => Reader a 87 | {-# SPECIALIZE hexadecimal :: Reader Int #-} 88 | {-# SPECIALIZE hexadecimal :: Reader Int8 #-} 89 | {-# SPECIALIZE hexadecimal :: Reader Int16 #-} 90 | {-# SPECIALIZE hexadecimal :: Reader Int32 #-} 91 | {-# SPECIALIZE hexadecimal :: Reader Int64 #-} 92 | {-# SPECIALIZE hexadecimal :: Reader Integer #-} 93 | {-# SPECIALIZE hexadecimal :: Reader Word #-} 94 | {-# SPECIALIZE hexadecimal :: Reader Word8 #-} 95 | {-# SPECIALIZE hexadecimal :: Reader Word16 #-} 96 | {-# SPECIALIZE hexadecimal :: Reader Word32 #-} 97 | {-# SPECIALIZE hexadecimal :: Reader Word64 #-} 98 | hex txt 99 | | T.null h = Left "input does not start with a hexadecimal digit" 100 | | otherwise = Right (T.foldl' go 0 h, t) 101 | where (h,t) = T.span isHexDigit txt 102 | go n d = (n * 16 + fromIntegral (hexDigitToInt d)) 103 | 104 | hexDigitToInt :: Char -> Int 105 | hexDigitToInt c 106 | | c >= '0' && c <= '9' = ord c - ord '0' 107 | | c >= 'a' && c <= 'f' = ord c - (ord 'a' - 10) 108 | | otherwise = ord c - (ord 'A' - 10) 109 | 110 | digitToInt :: Char -> Int 111 | digitToInt c = ord c - ord '0' 112 | 113 | -- | Read an optional leading sign character (@\'-\'@ or @\'+\'@) and 114 | -- apply it to the result of applying the given reader. 115 | signed :: Num a => Reader a -> Reader a 116 | {-# INLINE signed #-} 117 | signed f = runP (signa (P f)) 118 | 119 | -- | Read a rational number. 120 | -- 121 | -- This function accepts an optional leading sign character, followed 122 | -- by at least one decimal digit. 
The syntax is similar to that accepted 123 | -- by the 'read' function, with the exception that a trailing @\'.\'@ 124 | -- or @\'e\'@ /not/ followed by a number is not consumed. 125 | -- 126 | -- Examples: 127 | -- 128 | -- >rational "3" == Right (3.0, "") 129 | -- >rational "3.1" == Right (3.1, "") 130 | -- >rational "3e4" == Right (30000.0, "") 131 | -- >rational "3.1e4" == Right (31000.0, "") 132 | -- >rational ".3" == Left "input does not start with a digit" 133 | -- >rational "e3" == Left "input does not start with a digit" 134 | -- 135 | -- Examples of differences from 'read': 136 | -- 137 | -- >rational "3.foo" == Right (3.0, ".foo") 138 | -- >rational "3e" == Right (3.0, "e") 139 | rational :: Fractional a => Reader a 140 | {-# SPECIALIZE rational :: Reader Double #-} 141 | rational = floaty $ \real frac fracDenom -> fromRational $ 142 | real % 1 + frac % fracDenom 143 | 144 | -- | Read a rational number. 145 | -- 146 | -- The syntax accepted by this function is the same as for 'rational'. 147 | -- 148 | -- /Note/: This function is almost ten times faster than 'rational', 149 | -- but is slightly less accurate. 150 | -- 151 | -- The 'Double' type supports about 16 decimal places of accuracy. 152 | -- For 94.2% of numbers, this function and 'rational' give identical 153 | -- results, but for the remaining 5.8%, this function loses precision 154 | -- around the 15th decimal place. For 0.001% of numbers, this 155 | -- function will lose precision at the 13th or 14th decimal place. 156 | double :: Reader Double 157 | double = floaty $ \real frac fracDenom -> 158 | fromIntegral real + 159 | fromIntegral frac / fromIntegral fracDenom 160 | 161 | signa :: Num a => Parser a -> Parser a 162 | {-# SPECIALIZE signa :: Parser Int -> Parser Int #-} 163 | {-# SPECIALIZE signa :: Parser Int8 -> Parser Int8 #-} 164 | {-# SPECIALIZE signa :: Parser Int16 -> Parser Int16 #-} 165 | {-# SPECIALIZE signa :: Parser Int32 -> Parser Int32 #-} 166 | {-# SPECIALIZE signa :: Parser Int64 -> Parser Int64 #-} 167 | {-# SPECIALIZE signa :: Parser Integer -> Parser Integer #-} 168 | signa p = do 169 | sign <- perhaps '+' $ char (\c -> c == '-' || c == '+') 170 | if sign == '+' then p else negate `liftM` p 171 | 172 | newtype Parser a = P { 173 | runP :: Reader a 174 | } 175 | 176 | instance Monad Parser where 177 | return a = P $ \t -> Right (a,t) 178 | {-# INLINE return #-} 179 | m >>= k = P $ \t -> case runP m t of 180 | Left err -> Left err 181 | Right (a,t') -> runP (k a) t' 182 | {-# INLINE (>>=) #-} 183 | fail msg = P $ \_ -> Left msg 184 | 185 | perhaps :: a -> Parser a -> Parser a 186 | perhaps def m = P $ \t -> case runP m t of 187 | Left _ -> Right (def,t) 188 | r@(Right _) -> r 189 | 190 | char :: (Char -> Bool) -> Parser Char 191 | char p = P $ \t -> case T.uncons t of 192 | Just (c,t') | p c -> Right (c,t') 193 | _ -> Left "character does not match" 194 | 195 | data T = T !Integer !Int 196 | 197 | floaty :: Fractional a => (Integer -> Integer -> Integer -> a) -> Reader a 198 | {-# INLINE floaty #-} 199 | floaty f = runP $ do 200 | sign <- perhaps '+' $ char (\c -> c == '-' || c == '+') 201 | real <- P decimal 202 | T fraction fracDigits <- perhaps (T 0 0) $ do 203 | _ <- char (=='.') 204 | digits <- P $ \t -> Right (fromIntegral . 
T.length $ T.takeWhile isDigit t, t) 205 | n <- P decimal 206 | return $ T n digits 207 | let e c = c == 'e' || c == 'E' 208 | power <- perhaps 0 (char e >> signa (P decimal) :: Parser Int) 209 | let n = if fracDigits == 0 210 | then if power == 0 211 | then fromIntegral real 212 | else fromIntegral real * (10 ^^ power) 213 | else if power == 0 214 | then f real fraction (10 ^ fracDigits) 215 | else f real fraction (10 ^ fracDigits) * (10 ^^ power) 216 | return $! if sign == '+' 217 | then n 218 | else -n 219 | -------------------------------------------------------------------------------- /Data/Text/Fusion.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, MagicHash #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Fusion 5 | -- Copyright : (c) Tom Harper 2008-2009, 6 | -- (c) Bryan O'Sullivan 2009-2010, 7 | -- (c) Duncan Coutts 2009 8 | -- 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 11 | -- duncan@haskell.org 12 | -- Stability : experimental 13 | -- Portability : GHC 14 | -- 15 | -- Text manipulation functions represented as fusible operations over 16 | -- streams. 17 | module Data.Text.Fusion 18 | ( 19 | -- * Types 20 | Stream(..) 21 | , Step(..) 22 | 23 | -- * Creation and elimination 24 | , stream 25 | , unstream 26 | , reverseStream 27 | 28 | , length 29 | 30 | -- * Transformations 31 | , reverse 32 | 33 | -- * Construction 34 | -- ** Scans 35 | , reverseScanr 36 | 37 | -- ** Accumulating maps 38 | , mapAccumL 39 | 40 | -- ** Generation and unfolding 41 | , unfoldrN 42 | 43 | -- * Indexing 44 | , index 45 | , findIndex 46 | , countChar 47 | ) where 48 | 49 | import Prelude (Bool(..), Char, Maybe(..), Monad(..), Int, 50 | Num(..), Ord(..), ($), (&&), 51 | fromIntegral, otherwise) 52 | import Data.Bits ((.&.)) 53 | import Data.Text.Internal (Text(..)) 54 | import Data.Text.Private (runText) 55 | import Data.Text.UnsafeChar (ord, unsafeChr, unsafeWrite) 56 | import Data.Text.UnsafeShift (shiftL, shiftR) 57 | import qualified Data.Text.Array as A 58 | import qualified Data.Text.Fusion.Common as S 59 | import Data.Text.Fusion.Internal 60 | import Data.Text.Fusion.Size 61 | import qualified Data.Text.Internal as I 62 | import qualified Data.Text.Encoding.Utf16 as U16 63 | 64 | default(Int) 65 | 66 | -- | /O(n)/ Convert a 'Text' into a 'Stream Char'. 67 | stream :: Text -> Stream Char 68 | stream (Text arr off len) = Stream next off (maxSize len) 69 | where 70 | !end = off+len 71 | next !i 72 | | i >= end = Done 73 | | n >= 0xD800 && n <= 0xDBFF = Yield (U16.chr2 n n2) (i + 2) 74 | | otherwise = Yield (unsafeChr n) (i + 1) 75 | where 76 | n = A.unsafeIndex arr i 77 | n2 = A.unsafeIndex arr (i + 1) 78 | {-# INLINE [0] stream #-} 79 | 80 | -- | /O(n)/ Convert a 'Text' into a 'Stream Char', but iterate 81 | -- backwards. 82 | reverseStream :: Text -> Stream Char 83 | reverseStream (Text arr off len) = Stream next (off+len-1) (maxSize len) 84 | where 85 | {-# INLINE next #-} 86 | next !i 87 | | i < off = Done 88 | | n >= 0xDC00 && n <= 0xDFFF = Yield (U16.chr2 n2 n) (i - 2) 89 | | otherwise = Yield (unsafeChr n) (i - 1) 90 | where 91 | n = A.unsafeIndex arr i 92 | n2 = A.unsafeIndex arr (i - 1) 93 | {-# INLINE [0] reverseStream #-} 94 | 95 | -- | /O(n)/ Convert a 'Stream Char' into a 'Text'. 
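--
-- A sketch of how this pairs with 'stream' (the name @mapFused@ is made
-- up): most of the fusible pipelines in this library have this shape,
-- with the intermediate 'Stream' eliminated by the rewrite rule below.
--
-- > mapFused :: (Char -> Char) -> Text -> Text
-- > mapFused f = unstream . S.map f . stream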
96 | unstream :: Stream Char -> Text 97 | unstream (Stream next0 s0 len) = runText $ \done -> do 98 | let mlen = upperBound 4 len 99 | arr0 <- A.new mlen 100 | let outer arr top = loop 101 | where 102 | loop !s !i = 103 | case next0 s of 104 | Done -> done arr i 105 | Skip s' -> loop s' i 106 | Yield x s' 107 | | j >= top -> {-# SCC "unstream/resize" #-} do 108 | let top' = (top + 1) `shiftL` 1 109 | arr' <- A.new top' 110 | A.copyM arr' 0 arr 0 top 111 | outer arr' top' s i 112 | | otherwise -> do d <- unsafeWrite arr i x 113 | loop s' (i+d) 114 | where j | ord x < 0x10000 = i 115 | | otherwise = i + 1 116 | outer arr0 mlen s0 0 117 | {-# INLINE [0] unstream #-} 118 | {-# RULES "STREAM stream/unstream fusion" forall s. stream (unstream s) = s #-} 119 | 120 | 121 | -- ---------------------------------------------------------------------------- 122 | -- * Basic stream functions 123 | 124 | length :: Stream Char -> Int 125 | length = S.lengthI 126 | {-# INLINE[0] length #-} 127 | 128 | -- | /O(n)/ Reverse the characters of a string. 129 | reverse :: Stream Char -> Text 130 | reverse (Stream next s len0) 131 | | isEmpty len0 = I.empty 132 | | otherwise = I.textP arr off' len' 133 | where 134 | len0' = upperBound 4 (larger len0 4) 135 | (arr, (off', len')) = A.run2 (A.new len0' >>= loop s (len0'-1) len0') 136 | loop !s0 !i !len marr = 137 | case next s0 of 138 | Done -> return (marr, (j, len-j)) 139 | where j = i + 1 140 | Skip s1 -> loop s1 i len marr 141 | Yield x s1 | i < least -> {-# SCC "reverse/resize" #-} do 142 | let newLen = len `shiftL` 1 143 | marr' <- A.new newLen 144 | A.copyM marr' (newLen-len) marr 0 len 145 | write s1 (len+i) newLen marr' 146 | | otherwise -> write s1 i len marr 147 | where n = ord x 148 | least | n < 0x10000 = 0 149 | | otherwise = 1 150 | m = n - 0x10000 151 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 152 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 153 | write t j l mar 154 | | n < 0x10000 = do 155 | A.unsafeWrite mar j (fromIntegral n) 156 | loop t (j-1) l mar 157 | | otherwise = do 158 | A.unsafeWrite mar (j-1) lo 159 | A.unsafeWrite mar j hi 160 | loop t (j-2) l mar 161 | {-# INLINE [0] reverse #-} 162 | 163 | -- | /O(n)/ Perform the equivalent of 'scanr' over a list, only with 164 | -- the input and result reversed. 165 | reverseScanr :: (Char -> Char -> Char) -> Char -> Stream Char -> Stream Char 166 | reverseScanr f z0 (Stream next0 s0 len) = Stream next (S1 :*: z0 :*: s0) (len+1) -- HINT maybe too low 167 | where 168 | {-# INLINE next #-} 169 | next (S1 :*: z :*: s) = Yield z (S2 :*: z :*: s) 170 | next (S2 :*: z :*: s) = case next0 s of 171 | Yield x s' -> let !x' = f x z 172 | in Yield x' (S2 :*: x' :*: s') 173 | Skip s' -> Skip (S2 :*: z :*: s') 174 | Done -> Done 175 | {-# INLINE reverseScanr #-} 176 | 177 | -- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a stream from a seed 178 | -- value. However, the length of the result is limited by the 179 | -- first argument to 'unfoldrN'. This function is more efficient than 180 | -- 'unfoldr' when the length of the result is known. 181 | unfoldrN :: Int -> (a -> Maybe (Char,a)) -> a -> Stream Char 182 | unfoldrN n = S.unfoldrNI n 183 | {-# INLINE [0] unfoldrN #-} 184 | 185 | ------------------------------------------------------------------------------- 186 | -- ** Indexing streams 187 | 188 | -- | /O(n)/ stream index (subscript) operator, starting from 0. 
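--
-- For example (sketch, assuming 'Data.Text.pack' is in scope):
--
-- > index (stream (pack "bar")) 1 == 'a'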
189 | index :: Stream Char -> Int -> Char 190 | index = S.indexI 191 | {-# INLINE [0] index #-} 192 | 193 | -- | The 'findIndex' function takes a predicate and a stream and 194 | -- returns the index of the first element in the stream 195 | -- satisfying the predicate. 196 | findIndex :: (Char -> Bool) -> Stream Char -> Maybe Int 197 | findIndex = S.findIndexI 198 | {-# INLINE [0] findIndex #-} 199 | 200 | -- | /O(n)/ The 'count' function returns the number of times the query 201 | -- element appears in the given stream. 202 | countChar :: Char -> Stream Char -> Int 203 | countChar = S.countCharI 204 | {-# INLINE [0] countChar #-} 205 | 206 | -- | /O(n)/ Like a combination of 'map' and 'foldl''. Applies a 207 | -- function to each element of a 'Text', passing an accumulating 208 | -- parameter from left to right, and returns a final 'Text'. 209 | mapAccumL :: (a -> Char -> (a,Char)) -> a -> Stream Char -> (a, Text) 210 | mapAccumL f z0 (Stream next0 s0 len) = (nz,I.textP na 0 nl) 211 | where 212 | (na,(nz,nl)) = A.run2 (A.new mlen >>= \arr -> outer arr mlen z0 s0 0) 213 | where mlen = upperBound 4 len 214 | outer arr top = loop 215 | where 216 | loop !z !s !i = 217 | case next0 s of 218 | Done -> return (arr, (z,i)) 219 | Skip s' -> loop z s' i 220 | Yield x s' 221 | | j >= top -> {-# SCC "mapAccumL/resize" #-} do 222 | let top' = (top + 1) `shiftL` 1 223 | arr' <- A.new top' 224 | A.copyM arr' 0 arr 0 top 225 | outer arr' top' z s i 226 | | otherwise -> do let (z',c) = f z x 227 | d <- unsafeWrite arr i c 228 | loop z' s' (i+d) 229 | where j | ord x < 0x10000 = i 230 | | otherwise = i + 1 231 | {-# INLINE [0] mapAccumL #-} 232 | -------------------------------------------------------------------------------- /Data/Text/Read.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings, UnboxedTuples, CPP #-} 2 | #if __GLASGOW_HASKELL__ >= 702 3 | {-# LANGUAGE Trustworthy #-} 4 | #endif 5 | 6 | -- | 7 | -- Module : Data.Text.Read 8 | -- Copyright : (c) 2010, 2011 Bryan O'Sullivan 9 | -- 10 | -- License : BSD-style 11 | -- Maintainer : bos@serpentine.com 12 | -- Stability : experimental 13 | -- Portability : GHC 14 | -- 15 | -- Functions used frequently when reading textual data. 16 | module Data.Text.Read 17 | ( 18 | Reader 19 | , decimal 20 | , hexadecimal 21 | , signed 22 | , rational 23 | , double 24 | ) where 25 | 26 | import Control.Monad (liftM) 27 | import Data.Char (isDigit, isHexDigit, ord) 28 | import Data.Int (Int8, Int16, Int32, Int64) 29 | import Data.Ratio ((%)) 30 | import Data.Text as T 31 | import Data.Text.Private (span_) 32 | import Data.Word (Word, Word8, Word16, Word32, Word64) 33 | 34 | -- | Read some text. If the read succeeds, return its value and the 35 | -- remaining text, otherwise an error message. 36 | type Reader a = Text -> Either String (a,Text) 37 | 38 | -- | Read a decimal integer. The input must begin with at least one 39 | -- decimal digit, and is consumed until a non-digit or end of string 40 | -- is reached. 41 | -- 42 | -- This function does not handle leading sign characters. If you need 43 | -- to handle signed input, use @'signed' 'decimal'@. 44 | -- 45 | -- /Note/: For fixed-width integer types, this function does not 46 | -- attempt to detect overflow, so a sufficiently long input may give 47 | -- incorrect results. If you are worried about overflow, use 48 | -- 'Integer' for your result type. 
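--
-- For example (behaviour sketched from the description above):
--
-- >decimal "123 and more" == Right (123, " and more")
-- >decimal "abc"          == Left "input does not start with a digit"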
49 | decimal :: Integral a => Reader a 50 | {-# SPECIALIZE decimal :: Reader Int #-} 51 | {-# SPECIALIZE decimal :: Reader Int8 #-} 52 | {-# SPECIALIZE decimal :: Reader Int16 #-} 53 | {-# SPECIALIZE decimal :: Reader Int32 #-} 54 | {-# SPECIALIZE decimal :: Reader Int64 #-} 55 | {-# SPECIALIZE decimal :: Reader Integer #-} 56 | {-# SPECIALIZE decimal :: Reader Word #-} 57 | {-# SPECIALIZE decimal :: Reader Word8 #-} 58 | {-# SPECIALIZE decimal :: Reader Word16 #-} 59 | {-# SPECIALIZE decimal :: Reader Word32 #-} 60 | {-# SPECIALIZE decimal :: Reader Word64 #-} 61 | decimal txt 62 | | T.null h = Left "input does not start with a digit" 63 | | otherwise = Right (T.foldl' go 0 h, t) 64 | where (# h,t #) = span_ isDigit txt 65 | go n d = (n * 10 + fromIntegral (digitToInt d)) 66 | 67 | -- | Read a hexadecimal integer, consisting of an optional leading 68 | -- @\"0x\"@ followed by at least one decimal digit. Input is consumed 69 | -- until a non-hex-digit or end of string is reached. This function 70 | -- is case insensitive. 71 | -- 72 | -- This function does not handle leading sign characters. If you need 73 | -- to handle signed input, use @'signed' 'hexadecimal'@. 74 | -- 75 | -- /Note/: For fixed-width integer types, this function does not 76 | -- attempt to detect overflow, so a sufficiently long input may give 77 | -- incorrect results. If you are worried about overflow, use 78 | -- 'Integer' for your result type. 79 | hexadecimal :: Integral a => Reader a 80 | {-# SPECIALIZE hexadecimal :: Reader Int #-} 81 | {-# SPECIALIZE hexadecimal :: Reader Int8 #-} 82 | {-# SPECIALIZE hexadecimal :: Reader Int16 #-} 83 | {-# SPECIALIZE hexadecimal :: Reader Int32 #-} 84 | {-# SPECIALIZE hexadecimal :: Reader Int64 #-} 85 | {-# SPECIALIZE hexadecimal :: Reader Integer #-} 86 | {-# SPECIALIZE hexadecimal :: Reader Word #-} 87 | {-# SPECIALIZE hexadecimal :: Reader Word8 #-} 88 | {-# SPECIALIZE hexadecimal :: Reader Word16 #-} 89 | {-# SPECIALIZE hexadecimal :: Reader Word32 #-} 90 | {-# SPECIALIZE hexadecimal :: Reader Word64 #-} 91 | hexadecimal txt 92 | | h == "0x" || h == "0X" = hex t 93 | | otherwise = hex txt 94 | where (h,t) = T.splitAt 2 txt 95 | 96 | hex :: Integral a => Reader a 97 | {-# SPECIALIZE hex :: Reader Int #-} 98 | {-# SPECIALIZE hex :: Reader Int8 #-} 99 | {-# SPECIALIZE hex :: Reader Int16 #-} 100 | {-# SPECIALIZE hex :: Reader Int32 #-} 101 | {-# SPECIALIZE hex :: Reader Int64 #-} 102 | {-# SPECIALIZE hex :: Reader Integer #-} 103 | {-# SPECIALIZE hex :: Reader Word #-} 104 | {-# SPECIALIZE hex :: Reader Word8 #-} 105 | {-# SPECIALIZE hex :: Reader Word16 #-} 106 | {-# SPECIALIZE hex :: Reader Word32 #-} 107 | {-# SPECIALIZE hex :: Reader Word64 #-} 108 | hex txt 109 | | T.null h = Left "input does not start with a hexadecimal digit" 110 | | otherwise = Right (T.foldl' go 0 h, t) 111 | where (# h,t #) = span_ isHexDigit txt 112 | go n d = (n * 16 + fromIntegral (hexDigitToInt d)) 113 | 114 | hexDigitToInt :: Char -> Int 115 | hexDigitToInt c 116 | | c >= '0' && c <= '9' = ord c - ord '0' 117 | | c >= 'a' && c <= 'f' = ord c - (ord 'a' - 10) 118 | | otherwise = ord c - (ord 'A' - 10) 119 | 120 | digitToInt :: Char -> Int 121 | digitToInt c = ord c - ord '0' 122 | 123 | -- | Read an optional leading sign character (@\'-\'@ or @\'+\'@) and 124 | -- apply it to the result of applying the given reader. 125 | signed :: Num a => Reader a -> Reader a 126 | {-# INLINE signed #-} 127 | signed f = runP (signa (P f)) 128 | 129 | -- | Read a rational number. 
130 | -- 131 | -- This function accepts an optional leading sign character, followed 132 | -- by at least one decimal digit. The syntax is similar to that accepted 133 | -- by the 'read' function, with the exception that a trailing @\'.\'@ 134 | -- or @\'e\'@ /not/ followed by a number is not consumed. 135 | -- 136 | -- Examples (with behaviour identical to 'read'): 137 | -- 138 | -- >rational "3" == Right (3.0, "") 139 | -- >rational "3.1" == Right (3.1, "") 140 | -- >rational "3e4" == Right (30000.0, "") 141 | -- >rational "3.1e4" == Right (31000.0, "") 142 | -- >rational ".3" == Left "input does not start with a digit" 143 | -- >rational "e3" == Left "input does not start with a digit" 144 | -- 145 | -- Examples of differences from 'read': 146 | -- 147 | -- >rational "3.foo" == Right (3.0, ".foo") 148 | -- >rational "3e" == Right (3.0, "e") 149 | rational :: Fractional a => Reader a 150 | {-# SPECIALIZE rational :: Reader Double #-} 151 | rational = floaty $ \real frac fracDenom -> fromRational $ 152 | real % 1 + frac % fracDenom 153 | 154 | -- | Read a rational number. 155 | -- 156 | -- The syntax accepted by this function is the same as for 'rational'. 157 | -- 158 | -- /Note/: This function is almost ten times faster than 'rational', 159 | -- but is slightly less accurate. 160 | -- 161 | -- The 'Double' type supports about 16 decimal places of accuracy. 162 | -- For 94.2% of numbers, this function and 'rational' give identical 163 | -- results, but for the remaining 5.8%, this function loses precision 164 | -- around the 15th decimal place. For 0.001% of numbers, this 165 | -- function will lose precision at the 13th or 14th decimal place. 166 | double :: Reader Double 167 | double = floaty $ \real frac fracDenom -> 168 | fromIntegral real + 169 | fromIntegral frac / fromIntegral fracDenom 170 | 171 | signa :: Num a => Parser a -> Parser a 172 | {-# SPECIALIZE signa :: Parser Int -> Parser Int #-} 173 | {-# SPECIALIZE signa :: Parser Int8 -> Parser Int8 #-} 174 | {-# SPECIALIZE signa :: Parser Int16 -> Parser Int16 #-} 175 | {-# SPECIALIZE signa :: Parser Int32 -> Parser Int32 #-} 176 | {-# SPECIALIZE signa :: Parser Int64 -> Parser Int64 #-} 177 | {-# SPECIALIZE signa :: Parser Integer -> Parser Integer #-} 178 | signa p = do 179 | sign <- perhaps '+' $ char (\c -> c == '-' || c == '+') 180 | if sign == '+' then p else negate `liftM` p 181 | 182 | newtype Parser a = P { 183 | runP :: Reader a 184 | } 185 | 186 | instance Monad Parser where 187 | return a = P $ \t -> Right (a,t) 188 | {-# INLINE return #-} 189 | m >>= k = P $ \t -> case runP m t of 190 | Left err -> Left err 191 | Right (a,t') -> runP (k a) t' 192 | {-# INLINE (>>=) #-} 193 | fail msg = P $ \_ -> Left msg 194 | 195 | perhaps :: a -> Parser a -> Parser a 196 | perhaps def m = P $ \t -> case runP m t of 197 | Left _ -> Right (def,t) 198 | r@(Right _) -> r 199 | 200 | char :: (Char -> Bool) -> Parser Char 201 | char p = P $ \t -> case T.uncons t of 202 | Just (c,t') | p c -> Right (c,t') 203 | _ -> Left "character does not match" 204 | 205 | data T = T !Integer !Int 206 | 207 | floaty :: Fractional a => (Integer -> Integer -> Integer -> a) -> Reader a 208 | {-# INLINE floaty #-} 209 | floaty f = runP $ do 210 | sign <- perhaps '+' $ char (\c -> c == '-' || c == '+') 211 | real <- P decimal 212 | T fraction fracDigits <- perhaps (T 0 0) $ do 213 | _ <- char (=='.') 214 | digits <- P $ \t -> Right (T.length $ T.takeWhile isDigit t, t) 215 | n <- P decimal 216 | return $ T n digits 217 | let e c = c == 'e' || c == 'E' 218 | 
power <- perhaps 0 (char e >> signa (P decimal) :: Parser Int) 219 | let n = if fracDigits == 0 220 | then if power == 0 221 | then fromIntegral real 222 | else fromIntegral real * (10 ^^ power) 223 | else if power == 0 224 | then f real fraction (10 ^ fracDigits) 225 | else f real fraction (10 ^ fracDigits) * (10 ^^ power) 226 | return $! if sign == '+' 227 | then n 228 | else -n 229 | --------------------------------------------------------------------------------