├── benchmarks ├── .gitignore ├── python │ ├── .gitignore │ ├── cut.py │ ├── sort.py │ ├── utils.py │ ├── strip_tags.py │ └── multilang.py ├── Setup.hs ├── ruby │ ├── utils.rb │ ├── sort.rb │ ├── cut.rb │ ├── strip_tags.rb │ └── fold.rb ├── cbits │ └── time_iconv.c ├── haskell │ ├── Multilang.hs │ ├── Timer.hs │ ├── Benchmarks │ │ ├── EncodeUtf8.hs │ │ ├── Replace.hs │ │ ├── WordFrequencies.hs │ │ ├── FileRead.hs │ │ ├── Programs │ │ │ ├── BigTable.hs │ │ │ ├── StripTags.hs │ │ │ ├── Throughput.hs │ │ │ ├── Fold.hs │ │ │ ├── Sort.hs │ │ │ └── Cut.hs │ │ ├── Equality.hs │ │ ├── Search.hs │ │ ├── FoldLines.hs │ │ ├── DecodeUtf8.hs │ │ ├── Builder.hs │ │ ├── Stream.hs │ │ └── ReadNumbers.hs │ └── Benchmarks.hs └── text-benchmarks.cabal ├── tests ├── .ghci ├── Tests.hs ├── Makefile ├── Tests │ ├── IO.hs │ ├── SlowFunctions.hs │ ├── Utils.hs │ └── Regressions.hs ├── scripts │ └── cover-stdio.sh └── text-tests.cabal ├── Setup.lhs ├── .hgignore ├── Data └── Text │ ├── Util.hs │ ├── Encoding │ ├── Utf32.hs │ ├── Utf16.hs │ ├── Fusion │ │ └── Common.hs │ ├── Error.hs │ ├── Utf8.hs │ └── Fusion.hs │ ├── Lazy │ ├── Builder │ │ ├── RealFloat │ │ │ └── Functions.hs │ │ ├── Int │ │ │ └── Digits.hs │ │ └── Functions.hs │ ├── Builder.hs │ ├── Internal.hs │ ├── Fusion.hs │ ├── Search.hs │ ├── IO.hs │ └── Read.hs │ ├── Private.hs │ ├── Unsafe │ └── Base.hs │ ├── UnsafeShift.hs │ ├── UnsafeChar.hs │ ├── Search.hs │ ├── Unsafe.hs │ ├── Fusion │ ├── Internal.hs │ └── Size.hs │ ├── Internal.hs │ ├── Foreign.hs │ ├── IO │ └── Internal.hs │ ├── Array.hs │ ├── Fusion.hs │ └── Read.hs ├── scripts ├── ApiCompare.hs ├── CaseMapping.hs ├── CaseFolding.hs ├── Arsec.hs └── SpecialCasing.hs ├── LICENSE ├── README.markdown ├── .hgtags ├── tests-and-benchmarks.markdown ├── cbits └── cbits.c └── text.cabal /benchmarks/.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | -------------------------------------------------------------------------------- /tests/.ghci: -------------------------------------------------------------------------------- 1 | :set -DHAVE_DEEPSEQ -isrc -i../.. 
2 | -------------------------------------------------------------------------------- /benchmarks/python/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | -------------------------------------------------------------------------------- /benchmarks/Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /Setup.lhs: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env runhaskell 2 | > import Distribution.Simple 3 | > main = defaultMain 4 | -------------------------------------------------------------------------------- /.hgignore: -------------------------------------------------------------------------------- 1 | ^(?:dist|benchmarks/dist|tests/coverage|tests/dist)$ 2 | ^benchmarks/.*\.txt$ 3 | ^tests/text-testdata.tar.bz2$ 4 | ^tests/(?:\.hpc|bm|qc|qc-hpc|stdio-hpc|text/test)$ 5 | \.(?:aux|eventlog|h[ip]|log|[oa]|orig|prof|ps|rej|swp|tix)$ 6 | ~$ 7 | 8 | syntax: glob 9 | .\#* 10 | cabal-dev 11 | -------------------------------------------------------------------------------- /benchmarks/ruby/utils.rb: -------------------------------------------------------------------------------- 1 | require 'benchmark' 2 | 3 | def benchmark(&block) 4 | runs = 100 5 | total = 0 6 | 7 | runs.times do |i| 8 | result = Benchmark.measure(&block).total 9 | $stderr.puts "Run #{i}: #{result}" 10 | total += result 11 | end 12 | 13 | total / runs 14 | end 15 | -------------------------------------------------------------------------------- /benchmarks/ruby/sort.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require './utils.rb' 4 | 5 | def sort(filename) 6 | File.open(filename, 'r:utf-8') do |file| 7 | content = file.read 8 | puts content.lines.sort.join 9 | end 10 | end 11 | 12 | ARGV.each do |f| 13 | t = benchmark { sort(f) } 14 | STDERR.puts "#{f}: #{t}" 15 | end 16 | -------------------------------------------------------------------------------- /benchmarks/ruby/cut.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require './utils.rb' 4 | 5 | def cut(filename, l, r) 6 | File.open(filename, 'r:utf-8') do |file| 7 | file.each_line do |line| 8 | puts line[l, r - l] 9 | end 10 | end 11 | end 12 | 13 | ARGV.each do |f| 14 | t = benchmark { cut(f, 20, 40) } 15 | STDERR.puts "#{f}: #{t}" 16 | end 17 | -------------------------------------------------------------------------------- /benchmarks/python/cut.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import utils, sys, codecs 4 | 5 | def cut(filename, l, r): 6 | content = open(filename, encoding='utf-8') 7 | for line in content: 8 | print(line[l:r]) 9 | 10 | for f in sys.argv[1:]: 11 | t = utils.benchmark(lambda: cut(f, 20, 40)) 12 | sys.stderr.write('{0}: {1}\n'.format(f, t)) 13 | -------------------------------------------------------------------------------- /tests/Tests.hs: -------------------------------------------------------------------------------- 1 | -- | Provides a simple main function which runs all the tests 2 | -- 3 | module Main 4 | ( main 5 | ) where 6 | 7 | import Test.Framework (defaultMain) 8 | 9 | import qualified Tests.Properties as Properties 10 | 
import qualified Tests.Regressions as Regressions 11 | 12 | main :: IO () 13 | main = defaultMain [Properties.tests, Regressions.tests] 14 | -------------------------------------------------------------------------------- /benchmarks/python/sort.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import utils, sys, codecs 4 | 5 | def sort(filename): 6 | content = open(filename, encoding='utf-8').read() 7 | lines = content.splitlines() 8 | lines.sort() 9 | print('\n'.join(lines)) 10 | 11 | for f in sys.argv[1:]: 12 | t = utils.benchmark(lambda: sort(f)) 13 | sys.stderr.write('{0}: {1}\n'.format(f, t)) 14 | -------------------------------------------------------------------------------- /benchmarks/python/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys, time 4 | 5 | def benchmark_once(f): 6 | start = time.time() 7 | f() 8 | end = time.time() 9 | return end - start 10 | 11 | def benchmark(f): 12 | runs = 100 13 | total = 0.0 14 | for i in range(runs): 15 | result = benchmark_once(f) 16 | sys.stderr.write('Run {0}: {1}\n'.format(i, result)) 17 | total += result 18 | return total / runs 19 | -------------------------------------------------------------------------------- /benchmarks/ruby/strip_tags.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require './utils.rb' 4 | 5 | def strip_tags(filename) 6 | File.open(filename, 'r:utf-8') do |file| 7 | str = file.read 8 | 9 | d = 0 10 | 11 | str.each_char do |c| 12 | d += 1 if c == '<' 13 | putc(if d > 0 then ' ' else c end) 14 | d -= 1 if c == '>' 15 | end 16 | end 17 | end 18 | 19 | ARGV.each do |f| 20 | t = benchmark { strip_tags(f) } 21 | STDERR.puts "#{f}: #{t}" 22 | end 23 | -------------------------------------------------------------------------------- /benchmarks/python/strip_tags.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import utils, sys 4 | 5 | def strip_tags(filename): 6 | string = open(filename, encoding='utf-8').read() 7 | 8 | d = 0 9 | out = [] 10 | 11 | for c in string: 12 | if c == '<': d += 1 13 | 14 | if d > 0: 15 | out += ' ' 16 | else: 17 | out += c 18 | 19 | if c == '>': d -= 1 20 | 21 | print(''.join(out)) 22 | 23 | for f in sys.argv[1:]: 24 | t = utils.benchmark(lambda: strip_tags(f)) 25 | sys.stderr.write('{0}: {1}\n'.format(f, t)) 26 | -------------------------------------------------------------------------------- /Data/Text/Util.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, DeriveDataTypeable #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Util 5 | -- Copyright : 2010 Bryan O'Sullivan 6 | -- 7 | -- License : BSD-style 8 | -- Maintainer : bos@serpentine.com 9 | -- Stability : experimental 10 | -- Portability : GHC 11 | -- 12 | -- Useful functions. 13 | 14 | module Data.Text.Util 15 | ( 16 | intersperse 17 | ) where 18 | 19 | -- | A lazier version of Data.List.intersperse. The other version 20 | -- causes space leaks! 
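-- A small illustrative sketch (not part of the original module): the
-- definition below yields each element before it ever inspects the rest of
-- the list, so
--
-- > take 3 (intersperse 0 (1 : 2 : undefined)) == [1,0,2]
--
-- succeeds without forcing the undefined tail.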
21 | intersperse :: a -> [a] -> [a] 22 | intersperse _ [] = [] 23 | intersperse sep (x:xs) = x : go xs 24 | where 25 | go [] = [] 26 | go (y:ys) = sep : y: go ys 27 | {-# INLINE intersperse #-} 28 | -------------------------------------------------------------------------------- /Data/Text/Encoding/Utf32.hs: -------------------------------------------------------------------------------- 1 | -- | 2 | -- Module : Data.Text.Encoding.Utf32 3 | -- Copyright : (c) 2008, 2009 Tom Harper, 4 | -- (c) 2009, 2010 Bryan O'Sullivan, 5 | -- (c) 2009 Duncan Coutts 6 | -- 7 | -- License : BSD-style 8 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 9 | -- duncan@haskell.org 10 | -- Stability : experimental 11 | -- Portability : portable 12 | -- 13 | -- Basic UTF-32 validation. 14 | module Data.Text.Encoding.Utf32 15 | ( 16 | validate 17 | ) where 18 | 19 | import Data.Word (Word32) 20 | 21 | validate :: Word32 -> Bool 22 | validate x1 = x1 < 0xD800 || (x1 > 0xDFFF && x1 <= 0x10FFFF) 23 | {-# INLINE validate #-} 24 | -------------------------------------------------------------------------------- /Data/Text/Lazy/Builder/RealFloat/Functions.hs: -------------------------------------------------------------------------------- 1 | -- | 2 | -- Module: Data.Text.Lazy.Builder.RealFloat.Functions 3 | -- Copyright: (c) The University of Glasgow 1994-2002 4 | -- License: see libraries/base/LICENSE 5 | 6 | module Data.Text.Lazy.Builder.RealFloat.Functions 7 | ( 8 | roundTo 9 | ) where 10 | 11 | roundTo :: Int -> [Int] -> (Int,[Int]) 12 | roundTo d is = 13 | case f d is of 14 | x@(0,_) -> x 15 | (1,xs) -> (1, 1:xs) 16 | _ -> error "roundTo: bad Value" 17 | where 18 | f n [] = (0, replicate n 0) 19 | f 0 (x:_) = (if x >= 5 then 1 else 0, []) 20 | f n (i:xs) 21 | | i' == 10 = (1,0:ds) 22 | | otherwise = (0,i':ds) 23 | where 24 | (c,ds) = f (n-1) xs 25 | i' = c + i 26 | -------------------------------------------------------------------------------- /Data/Text/Lazy/Builder/Int/Digits.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | -- Module: Data.Text.Lazy.Builder.Int.Digits 4 | -- Copyright: (c) 2013 Bryan O'Sullivan 5 | -- License: BSD3 6 | -- Maintainer: Bryan O'Sullivan 7 | -- Stability: experimental 8 | -- Portability: portable 9 | -- 10 | -- This module exists because the C preprocessor does things that we 11 | -- shall not speak of when confronted with Haskell multiline strings. 12 | 13 | module Data.Text.Lazy.Builder.Int.Digits (digits) where 14 | 15 | import Data.ByteString.Char8 (ByteString) 16 | 17 | digits :: ByteString 18 | digits = "0001020304050607080910111213141516171819\ 19 | \2021222324252627282930313233343536373839\ 20 | \4041424344454647484950515253545556575859\ 21 | \6061626364656667686970717273747576777879\ 22 | \8081828384858687888990919293949596979899" 23 | -------------------------------------------------------------------------------- /Data/Text/Lazy/Builder/Functions.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MagicHash #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Lazy.Builder.Functions 5 | -- Copyright : (c) 2011 MailRank, Inc. 6 | -- 7 | -- License : BSD-style 8 | -- Maintainer : bos@serpentine.com 9 | -- Stability : experimental 10 | -- Portability : GHC 11 | -- 12 | -- Useful functions and combinators. 
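-- For instance (illustrative sketch): i2d 7 evaluates to '7', and the (<>)
-- defined below is simply 'mappend' restricted to 'Builder', declared infixr
-- so that chains of builders associate to the right by default.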
13 | 14 | module Data.Text.Lazy.Builder.Functions 15 | ( 16 | (<>) 17 | , i2d 18 | ) where 19 | 20 | import Data.Monoid (mappend) 21 | import Data.Text.Lazy.Builder (Builder) 22 | import GHC.Base 23 | 24 | -- | Unsafe conversion for decimal digits. 25 | {-# INLINE i2d #-} 26 | i2d :: Int -> Char 27 | i2d (I# i#) = C# (chr# (ord# '0'# +# i#)) 28 | 29 | -- | The normal 'mappend' function with right associativity instead of 30 | -- left. 31 | (<>) :: Builder -> Builder -> Builder 32 | (<>) = mappend 33 | {-# INLINE (<>) #-} 34 | 35 | infixr 4 <> 36 | -------------------------------------------------------------------------------- /benchmarks/cbits/time_iconv.c: -------------------------------------------------------------------------------- 1 | #include <iconv.h> 2 | #include <stddef.h> 3 | #include <stdint.h> 4 | #include <stdlib.h> 5 | 6 | int time_iconv(char *srcbuf, size_t srcbufsize) 7 | { 8 | uint16_t *destbuf = NULL; 9 | size_t destbufsize; 10 | static uint16_t *origdestbuf; 11 | static size_t origdestbufsize; 12 | iconv_t ic = (iconv_t) -1; 13 | int ret = 0; 14 | 15 | if (ic == (iconv_t) -1) { 16 | ic = iconv_open("UTF-16LE", "UTF-8"); 17 | if (ic == (iconv_t) -1) { 18 | ret = -1; 19 | goto done; 20 | } 21 | } 22 | 23 | destbufsize = srcbufsize * sizeof(uint16_t); 24 | if (destbufsize > origdestbufsize) { 25 | free(origdestbuf); 26 | origdestbuf = destbuf = malloc(origdestbufsize = destbufsize); 27 | } else { 28 | destbuf = origdestbuf; 29 | } 30 | 31 | iconv(ic, &srcbuf, &srcbufsize, (char**) &destbuf, &destbufsize); 32 | 33 | done: 34 | return ret; 35 | } 36 | -------------------------------------------------------------------------------- /benchmarks/haskell/Multilang.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, OverloadedStrings, RankNTypes #-} 2 | 3 | module Main ( 4 | main 5 | ) where 6 | 7 | import Control.Monad (forM_) 8 | import qualified Data.ByteString as B 9 | import qualified Data.Text as Text 10 | import Data.Text.Encoding (decodeUtf8) 11 | import Data.Text (Text) 12 | import System.IO (hFlush, stdout) 13 | import Timer (timer) 14 | 15 | type BM = Text -> () 16 | 17 | bm :: forall a.
(Text -> a) -> BM 18 | bm f t = f t `seq` () 19 | 20 | benchmarks :: [(String, Text.Text -> ())] 21 | benchmarks = [ 22 | ("find_first", bm $ Text.isInfixOf "en:Benin") 23 | , ("find_index", bm $ Text.findIndex (=='c')) 24 | ] 25 | 26 | main :: IO () 27 | main = do 28 | !contents <- decodeUtf8 `fmap` B.readFile "../tests/text-test-data/yiwiki.xml" 29 | forM_ benchmarks $ \(name, bmark) -> do 30 | putStr $ name ++ " " 31 | hFlush stdout 32 | putStrLn =<< (timer 100 contents bmark) 33 | -------------------------------------------------------------------------------- /tests/Makefile: -------------------------------------------------------------------------------- 1 | count = 1000 2 | 3 | coverage: build coverage/hpc_index.html 4 | 5 | build: text-test-data 6 | cabal configure -fhpc 7 | cabal build 8 | 9 | text-test-data: 10 | hg clone https://bitbucket.org/bos/text-test-data 11 | $(MAKE) -C text-test-data 12 | 13 | coverage/text-tests.tix: 14 | -mkdir -p coverage 15 | ./dist/build/text-tests/text-tests -a $(count) 16 | mv text-tests.tix $@ 17 | 18 | coverage/text-tests-stdio.tix: 19 | -mkdir -p coverage 20 | ./scripts/cover-stdio.sh ./dist/build/text-tests-stdio/text-tests-stdio 21 | mv text-tests-stdio.tix $@ 22 | 23 | coverage/coverage.tix: coverage/text-tests.tix coverage/text-tests-stdio.tix 24 | hpc combine --output=$@ \ 25 | --exclude=Main \ 26 | coverage/text-tests.tix \ 27 | coverage/text-tests-stdio.tix 28 | 29 | coverage/hpc_index.html: coverage/coverage.tix 30 | hpc markup --destdir=coverage coverage/coverage.tix 31 | 32 | clean: 33 | rm -rf dist coverage .hpc 34 | 35 | .PHONY: build coverage 36 | -------------------------------------------------------------------------------- /scripts/ApiCompare.hs: -------------------------------------------------------------------------------- 1 | -- This script compares the strict and lazy Text APIs to ensure that 2 | -- they're reasonably in sync. 3 | 4 | {-# LANGUAGE OverloadedStrings #-} 5 | 6 | import qualified Data.Set as S 7 | import qualified Data.Text as T 8 | import System.Process 9 | 10 | main = do 11 | let tidy pkg = (S.fromList . filter (T.isInfixOf "::") . T.lines . 12 | T.replace "GHC.Int.Int64" "Int" . 13 | T.replace "\n " "" . 14 | T.replace (T.append (T.pack pkg) ".") "" . T.pack) `fmap` 15 | readProcess "ghci" [] (":browse " ++ pkg) 16 | let diff a b = mapM_ (putStrLn . (" "++) . T.unpack) . 
S.toList $ 17 | S.difference a b 18 | text <- tidy "Data.Text" 19 | lazy <- tidy "Data.Text.Lazy" 20 | list <- tidy "Data.List" 21 | putStrLn "Text \\ List:" 22 | diff text list 23 | putStrLn "" 24 | putStrLn "Text \\ Lazy:" 25 | diff text lazy 26 | putStrLn "" 27 | putStrLn "Lazy \\ Text:" 28 | diff lazy text 29 | -------------------------------------------------------------------------------- /benchmarks/haskell/Timer.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | 3 | module Timer (timer) where 4 | 5 | import Control.Exception (evaluate) 6 | import Data.Time.Clock.POSIX (getPOSIXTime) 7 | import GHC.Float (FFFormat(..), formatRealFloat) 8 | 9 | ickyRound :: Int -> Double -> String 10 | ickyRound k = formatRealFloat FFFixed (Just k) 11 | 12 | timer :: Int -> a -> (a -> b) -> IO String 13 | timer count a0 f = do 14 | let loop !k !fastest 15 | | k <= 0 = return fastest 16 | | otherwise = do 17 | start <- getPOSIXTime 18 | let inner a i 19 | | i <= 0 = return () 20 | | otherwise = evaluate (f a) >> inner a (i-1) 21 | inner a0 count 22 | end <- getPOSIXTime 23 | let elapsed = end - start 24 | loop (k-1) (min fastest (elapsed / fromIntegral count)) 25 | t <- loop (3::Int) 1e300 26 | let log10 x = log x / log 10 27 | ft = realToFrac t 28 | prec = round (log10 (fromIntegral count) - log10 ft) 29 | return $! ickyRound prec ft 30 | {-# NOINLINE timer #-} 31 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/EncodeUtf8.hs: -------------------------------------------------------------------------------- 1 | -- | UTF-8 encode a text 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Replicating a string a number of times 6 | -- 7 | -- * UTF-8 encoding it 8 | -- 9 | module Benchmarks.EncodeUtf8 10 | ( benchmark 11 | ) where 12 | 13 | import Criterion (Benchmark, bgroup, bench, whnf) 14 | import qualified Data.ByteString as B 15 | import qualified Data.ByteString.Lazy as BL 16 | import qualified Data.Text as T 17 | import qualified Data.Text.Encoding as T 18 | import qualified Data.Text.Lazy as TL 19 | import qualified Data.Text.Lazy.Encoding as TL 20 | 21 | benchmark :: String -> IO Benchmark 22 | benchmark string = do 23 | return $ bgroup "EncodeUtf8" 24 | [ bench "Text" $ whnf (B.length . T.encodeUtf8) text 25 | , bench "LazyText" $ whnf (BL.length . 
TL.encodeUtf8) lazyText 26 | ] 27 | where 28 | -- The string in different formats 29 | text = T.replicate k $ T.pack string 30 | lazyText = TL.replicate (fromIntegral k) $ TL.pack string 31 | 32 | -- Amount 33 | k = 100000 34 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Replace.hs: -------------------------------------------------------------------------------- 1 | -- | Replace a string by another string 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Search and replace of a pattern in a text 6 | -- 7 | module Benchmarks.Replace 8 | ( benchmark 9 | ) where 10 | 11 | import Criterion (Benchmark, bgroup, bench, nf) 12 | import qualified Data.ByteString.Char8 as B 13 | import qualified Data.ByteString.Lazy as BL 14 | import qualified Data.ByteString.Lazy.Search as BL 15 | import qualified Data.Text.Lazy as TL 16 | import qualified Data.Text.Lazy.Encoding as TL 17 | import qualified Data.Text.Lazy.IO as TL 18 | 19 | benchmark :: FilePath -> String -> String -> IO Benchmark 20 | benchmark fp pat sub = do 21 | tl <- TL.readFile fp 22 | bl <- BL.readFile fp 23 | return $ bgroup "Replace" 24 | [ bench "LazyText" $ nf (TL.length . TL.replace tpat tsub) tl 25 | , bench "LazyByteString" $ nf (BL.length . BL.replace bpat bsub) bl 26 | ] 27 | where 28 | tpat = TL.pack pat 29 | tsub = TL.pack sub 30 | bpat = B.concat $ BL.toChunks $ TL.encodeUtf8 tpat 31 | bsub = B.concat $ BL.toChunks $ TL.encodeUtf8 tsub 32 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/WordFrequencies.hs: -------------------------------------------------------------------------------- 1 | -- | A word frequency count using the different string types 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Splitting into words 6 | -- 7 | -- * Converting to lowercase 8 | -- 9 | -- * Comparing: Eq/Ord instances 10 | -- 11 | module Benchmarks.WordFrequencies 12 | ( benchmark 13 | ) where 14 | 15 | import Criterion (Benchmark, bench, bgroup, whnf) 16 | import Data.Char (toLower) 17 | import Data.List (foldl') 18 | import Data.Map (Map) 19 | import qualified Data.ByteString.Char8 as B 20 | import qualified Data.Map as M 21 | import qualified Data.Text as T 22 | import qualified Data.Text.IO as T 23 | 24 | benchmark :: FilePath -> IO Benchmark 25 | benchmark fp = do 26 | s <- readFile fp 27 | b <- B.readFile fp 28 | t <- T.readFile fp 29 | return $ bgroup "WordFrequencies" 30 | [ bench "String" $ whnf (frequencies . words . map toLower) s 31 | , bench "ByteString" $ whnf (frequencies . B.words . B.map toLower) b 32 | , bench "Text" $ whnf (frequencies . T.words . 
T.toLower) t 33 | ] 34 | 35 | frequencies :: Ord a => [a] -> Map a Int 36 | frequencies = foldl' (\m k -> M.insertWith (+) k 1 m) M.empty 37 | -------------------------------------------------------------------------------- /scripts/CaseMapping.hs: -------------------------------------------------------------------------------- 1 | import System.Environment 2 | import System.IO 3 | 4 | import Arsec 5 | import CaseFolding 6 | import SpecialCasing 7 | 8 | main = do 9 | args <- getArgs 10 | let oname = case args of 11 | [] -> "../Data/Text/Fusion/CaseMapping.hs" 12 | [o] -> o 13 | psc <- parseSC "SpecialCasing.txt" 14 | pcf <- parseCF "CaseFolding.txt" 15 | scs <- case psc of 16 | Left err -> print err >> return undefined 17 | Right ms -> return ms 18 | cfs <- case pcf of 19 | Left err -> print err >> return undefined 20 | Right ms -> return ms 21 | h <- openFile oname WriteMode 22 | mapM_ (hPutStrLn h) ["{-# LANGUAGE Rank2Types #-}" 23 | ,"-- AUTOMATICALLY GENERATED - DO NOT EDIT" 24 | ,"-- Generated by scripts/SpecialCasing.hs" 25 | ,"module Data.Text.Fusion.CaseMapping where" 26 | ,"import Data.Char" 27 | ,"import Data.Text.Fusion.Internal" 28 | ,""] 29 | mapM_ (hPutStrLn h) (mapSC "upper" upper toUpper scs) 30 | mapM_ (hPutStrLn h) (mapSC "lower" lower toLower scs) 31 | mapM_ (hPutStrLn h) (mapCF cfs) 32 | hClose h 33 | -------------------------------------------------------------------------------- /Data/Text/Private.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, Rank2Types, UnboxedTuples #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Private 5 | -- Copyright : (c) 2011 Bryan O'Sullivan 6 | -- 7 | -- License : BSD-style 8 | -- Maintainer : bos@serpentine.com 9 | -- Stability : experimental 10 | -- Portability : GHC 11 | 12 | module Data.Text.Private 13 | ( 14 | runText 15 | , span_ 16 | ) where 17 | 18 | import Control.Monad.ST (ST, runST) 19 | import Data.Text.Internal (Text(..), textP) 20 | import Data.Text.Unsafe (Iter(..), iter) 21 | import qualified Data.Text.Array as A 22 | 23 | span_ :: (Char -> Bool) -> Text -> (# Text, Text #) 24 | span_ p t@(Text arr off len) = (# hd,tl #) 25 | where hd = textP arr off k 26 | tl = textP arr (off+k) (len-k) 27 | !k = loop 0 28 | loop !i | i < len && p c = loop (i+d) 29 | | otherwise = i 30 | where Iter c d = iter t i 31 | {-# INLINE span_ #-} 32 | 33 | runText :: (forall s. (A.MArray s -> Int -> ST s Text) -> ST s Text) -> Text 34 | runText act = runST (act $ \ !marr !len -> do 35 | arr <- A.unsafeFreeze marr 36 | return $! 
textP arr 0 len) 37 | {-# INLINE runText #-} 38 | -------------------------------------------------------------------------------- /benchmarks/python/multilang.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import math 4 | import sys 5 | import time 6 | 7 | def find_first(): 8 | cf = contents.find 9 | return timer(lambda: cf("en:Benin")) 10 | 11 | def timer(f, count=100): 12 | a = 1e300 13 | def g(): 14 | return 15 | for i in xrange(3): 16 | start = time.time() 17 | for j in xrange(count): 18 | g() 19 | a = min(a, (time.time() - start) / count) 20 | 21 | b = 1e300 22 | for i in xrange(3): 23 | start = time.time() 24 | for j in xrange(count): 25 | f() 26 | b = min(b, (time.time() - start) / count) 27 | 28 | return round(b - a, int(round(math.log(count, 10) - math.log(b - a, 10)))) 29 | 30 | contents = open('../../tests/text-test-data/yiwiki.xml', 'r').read() 31 | contents = contents.decode('utf-8') 32 | 33 | benchmarks = ( 34 | find_first, 35 | ) 36 | 37 | to_run = sys.argv[1:] 38 | bms = [] 39 | if to_run: 40 | for r in to_run: 41 | for b in benchmarks: 42 | if b.__name__.startswith(r): 43 | bms.append(b) 44 | else: 45 | bms = benchmarks 46 | 47 | for b in bms: 48 | sys.stdout.write(b.__name__ + ' ') 49 | sys.stdout.flush() 50 | print b() 51 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/FileRead.hs: -------------------------------------------------------------------------------- 1 | -- | Benchmarks simple file reading 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Reading a file from the disk 6 | -- 7 | module Benchmarks.FileRead 8 | ( benchmark 9 | ) where 10 | 11 | import Control.Exception (evaluate) 12 | import Criterion (Benchmark, bgroup, bench) 13 | import qualified Data.ByteString as SB 14 | import qualified Data.ByteString.Lazy as LB 15 | import qualified Data.Text as T 16 | import qualified Data.Text.Encoding as T 17 | import qualified Data.Text.IO as T 18 | import qualified Data.Text.Lazy as LT 19 | import qualified Data.Text.Lazy.Encoding as LT 20 | import qualified Data.Text.Lazy.IO as LT 21 | 22 | benchmark :: FilePath -> IO Benchmark 23 | benchmark p = return $ bgroup "FileRead" 24 | [ bench "String" $ readFile p >>= evaluate . length 25 | , bench "ByteString" $ SB.readFile p >>= evaluate . SB.length 26 | , bench "LazyByteString" $ LB.readFile p >>= evaluate . LB.length 27 | , bench "Text" $ T.readFile p >>= evaluate . T.length 28 | , bench "LazyText" $ LT.readFile p >>= evaluate . LT.length 29 | , bench "TextByteString" $ 30 | SB.readFile p >>= evaluate . T.length . T.decodeUtf8 31 | , bench "LazyTextByteString" $ 32 | LB.readFile p >>= evaluate . LT.length . LT.decodeUtf8 33 | ] 34 | -------------------------------------------------------------------------------- /Data/Text/Encoding/Utf16.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MagicHash, BangPatterns #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Encoding.Utf16 5 | -- Copyright : (c) 2008, 2009 Tom Harper, 6 | -- (c) 2009 Bryan O'Sullivan, 7 | -- (c) 2009 Duncan Coutts 8 | -- 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 11 | -- duncan@haskell.org 12 | -- Stability : experimental 13 | -- Portability : GHC 14 | -- 15 | -- Basic UTF-16 validation and character manipulation. 
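-- For example (illustrative sketch): the surrogate pair 0xD835 and 0xDD6B
-- satisfies 'validate2', and 'chr2' below reassembles the original code
-- point: ((0xD835 - 0xD800) * 0x400) + (0xDD6B - 0xDC00) + 0x10000 == 0x1D56B.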
16 | module Data.Text.Encoding.Utf16 17 | ( 18 | chr2 19 | , validate1 20 | , validate2 21 | ) where 22 | 23 | import GHC.Exts 24 | import GHC.Word (Word16(..)) 25 | 26 | chr2 :: Word16 -> Word16 -> Char 27 | chr2 (W16# a#) (W16# b#) = C# (chr# (upper# +# lower# +# 0x10000#)) 28 | where 29 | !x# = word2Int# a# 30 | !y# = word2Int# b# 31 | !upper# = uncheckedIShiftL# (x# -# 0xD800#) 10# 32 | !lower# = y# -# 0xDC00# 33 | {-# INLINE chr2 #-} 34 | 35 | validate1 :: Word16 -> Bool 36 | validate1 x1 = x1 < 0xD800 || x1 > 0xDFFF 37 | {-# INLINE validate1 #-} 38 | 39 | validate2 :: Word16 -> Word16 -> Bool 40 | validate2 x1 x2 = x1 >= 0xD800 && x1 <= 0xDBFF && 41 | x2 >= 0xDC00 && x2 <= 0xDFFF 42 | {-# INLINE validate2 #-} 43 | -------------------------------------------------------------------------------- /benchmarks/ruby/fold.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require './utils.rb' 4 | 5 | def fold(filename, max_width) 6 | File.open(filename, 'r:utf-8') do |file| 7 | # Words in this paragraph 8 | paragraph = [] 9 | 10 | file.each_line do |line| 11 | # If we encounter an empty line, we reformat and dump the current 12 | # paragraph 13 | if line.strip.empty? 14 | puts fold_paragraph(paragraph, max_width) 15 | puts 16 | paragraph = [] 17 | # Otherwise, we append the words found in the line to the paragraph 18 | else 19 | paragraph.concat line.split 20 | end 21 | end 22 | 23 | # Last paragraph 24 | puts fold_paragraph(paragraph, max_width) unless paragraph.empty? 25 | end 26 | end 27 | 28 | # Fold a single paragraph to the desired width 29 | def fold_paragraph(paragraph, max_width) 30 | # Gradually build our output 31 | str, *rest = paragraph 32 | width = str.length 33 | 34 | rest.each do |word| 35 | if width + word.length + 1 <= max_width 36 | str << ' ' << word 37 | width += word.length + 1 38 | else 39 | str << "\n" << word 40 | width = word.length 41 | end 42 | end 43 | 44 | str 45 | end 46 | 47 | ARGV.each do |f| 48 | t = benchmark { fold(f, 80) } 49 | STDERR.puts "#{f}: #{t}" 50 | end 51 | -------------------------------------------------------------------------------- /tests/Tests/IO.hs: -------------------------------------------------------------------------------- 1 | -- | Program which exposes some haskell functions as an exutable. The results 2 | -- and coverage of this module is meant to be checked using a shell script. 
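-- For example (sketch; the file name is hypothetical), the cover-stdio.sh
-- script below drives this executable with invocations such as
--
-- > text-tests-stdio T.readFile some-file.txt
-- > text-tests-stdio TL.getLine
--
-- exercising one branch of 'main' per run.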
3 | -- 4 | module Main 5 | ( 6 | main 7 | ) where 8 | 9 | import System.Environment (getArgs) 10 | import System.Exit (exitFailure) 11 | import System.IO (hPutStrLn, stderr) 12 | import qualified Data.Text as T 13 | import qualified Data.Text.IO as T 14 | import qualified Data.Text.Lazy as TL 15 | import qualified Data.Text.Lazy.IO as TL 16 | 17 | main :: IO () 18 | main = do 19 | args <- getArgs 20 | case args of 21 | ["T.readFile", name] -> T.putStr =<< T.readFile name 22 | ["T.writeFile", name, t] -> T.writeFile name (T.pack t) 23 | ["T.appendFile", name, t] -> T.appendFile name (T.pack t) 24 | ["T.interact"] -> T.interact id 25 | ["T.getContents"] -> T.putStr =<< T.getContents 26 | ["T.getLine"] -> T.putStrLn =<< T.getLine 27 | 28 | ["TL.readFile", name] -> TL.putStr =<< TL.readFile name 29 | ["TL.writeFile", name, t] -> TL.writeFile name (TL.pack t) 30 | ["TL.appendFile", name, t] -> TL.appendFile name (TL.pack t) 31 | ["TL.interact"] -> TL.interact id 32 | ["TL.getContents"] -> TL.putStr =<< TL.getContents 33 | ["TL.getLine"] -> TL.putStrLn =<< TL.getLine 34 | _ -> hPutStrLn stderr "invalid directive!" >> exitFailure 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2008-2009, Tom Harper 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /tests/scripts/cover-stdio.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $# < 1 ]]; then 4 | echo "Usage: $0 " 5 | exit 1 6 | fi 7 | 8 | exe=$1 9 | 10 | rm -f $exe.tix 11 | 12 | f=$(mktemp stdio-f.XXXXXX) 13 | g=$(mktemp stdio-g.XXXXXX) 14 | 15 | for t in T TL; do 16 | echo $t.readFile > $f 17 | $exe $t.readFile $f > $g 18 | if ! diff -u $f $g; then 19 | errs=$((errs+1)) 20 | echo FAIL: $t.readFile 1>&2 21 | fi 22 | 23 | $exe $t.writeFile $f $t.writeFile 24 | echo -n $t.writeFile > $g 25 | if ! 
diff -u $f $g; then 26 | errs=$((errs+1)) 27 | echo FAIL: $t.writeFile 1>&2 28 | fi 29 | 30 | echo -n quux > $f 31 | $exe $t.appendFile $f $t.appendFile 32 | echo -n quux$t.appendFile > $g 33 | if ! diff -u $f $g; then 34 | errs=$((errs+1)) 35 | echo FAIL: $t.appendFile 1>&2 36 | fi 37 | 38 | echo $t.interact | $exe $t.interact > $f 39 | echo $t.interact > $g 40 | if ! diff -u $f $g; then 41 | errs=$((errs+1)) 42 | echo FAIL: $t.interact 1>&2 43 | fi 44 | 45 | echo $t.getContents | $exe $t.getContents > $f 46 | echo $t.getContents > $g 47 | if ! diff -u $f $g; then 48 | errs=$((errs+1)) 49 | echo FAIL: $t.getContents 1>&2 50 | fi 51 | 52 | echo $t.getLine | $exe $t.getLine > $f 53 | echo $t.getLine > $g 54 | if ! diff -u $f $g; then 55 | errs=$((errs+1)) 56 | echo FAIL: $t.getLine 1>&2 57 | fi 58 | done 59 | 60 | rm -f $f $g 61 | 62 | exit $errs 63 | -------------------------------------------------------------------------------- /tests/Tests/SlowFunctions.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | module Tests.SlowFunctions 3 | ( 4 | indices 5 | , splitOn 6 | ) where 7 | 8 | import qualified Data.Text as T 9 | import Data.Text.Internal (Text(..)) 10 | import Data.Text.Unsafe (iter_, unsafeHead, unsafeTail) 11 | 12 | indices :: T.Text -- ^ Substring to search for (@needle@) 13 | -> T.Text -- ^ Text to search in (@haystack@) 14 | -> [Int] 15 | indices needle@(Text _narr _noff nlen) haystack@(Text harr hoff hlen) 16 | | T.null needle = [] 17 | | otherwise = scan 0 18 | where 19 | scan i | i >= hlen = [] 20 | | needle `T.isPrefixOf` t = i : scan (i+nlen) 21 | | otherwise = scan (i+d) 22 | where t = Text harr (hoff+i) (hlen-i) 23 | d = iter_ haystack i 24 | 25 | splitOn :: T.Text -- ^ Text to split on 26 | -> T.Text -- ^ Input text 27 | -> [T.Text] 28 | splitOn pat src0 29 | | T.null pat = error "splitOn: empty" 30 | | l == 1 = T.split (== (unsafeHead pat)) src0 31 | | otherwise = go src0 32 | where 33 | l = T.length pat 34 | go src = search 0 src 35 | where 36 | search !n !s 37 | | T.null s = [src] -- not found 38 | | pat `T.isPrefixOf` s = T.take n src : go (T.drop l s) 39 | | otherwise = search (n+1) (unsafeTail s) 40 | -------------------------------------------------------------------------------- /scripts/CaseFolding.hs: -------------------------------------------------------------------------------- 1 | -- This script processes the following source file: 2 | -- 3 | -- http://unicode.org/Public/UNIDATA/CaseFolding.txt 4 | 5 | module CaseFolding 6 | ( 7 | Fold(..) 8 | , parseCF 9 | , mapCF 10 | ) where 11 | 12 | import Arsec 13 | 14 | data Fold = Fold { 15 | code :: Char 16 | , status :: Char 17 | , mapping :: [Char] 18 | , name :: String 19 | } deriving (Eq, Ord, Show) 20 | 21 | entries :: Parser [Fold] 22 | entries = many comment *> many (entry <* many comment) 23 | where 24 | entry = Fold <$> unichar <* semi 25 | <*> oneOf "CFST" <* semi 26 | <*> unichars 27 | <*> (string "# " *> manyTill anyToken (char '\n')) 28 | 29 | parseCF :: FilePath -> IO (Either ParseError [Fold]) 30 | parseCF name = parse entries name <$> readFile name 31 | 32 | mapCF :: [Fold] -> [String] 33 | mapCF ms = typ ++ (map nice . filter p $ ms) ++ [last] 34 | where 35 | typ = ["foldMapping :: forall s. 
Char -> s -> Step (CC s) Char" 36 | ,"{-# INLINE foldMapping #-}"] 37 | last = "foldMapping c s = Yield (toLower c) (CC s '\\0' '\\0')" 38 | nice c = "-- " ++ name c ++ "\n" ++ 39 | "foldMapping " ++ showC (code c) ++ " s = Yield " ++ x ++ " (CC s " ++ y ++ " " ++ z ++ ")" 40 | where [x,y,z] = (map showC . take 3) (mapping c ++ repeat '\0') 41 | p f = status f `elem` "CF" && 42 | mapping f /= [toLower (code f)] 43 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Programs/BigTable.hs: -------------------------------------------------------------------------------- 1 | -- | Create a large HTML table and dump it to a handle 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Creating a large HTML document using a builder 6 | -- 7 | -- * Writing to a handle 8 | -- 9 | {-# LANGUAGE OverloadedStrings #-} 10 | module Benchmarks.Programs.BigTable 11 | ( benchmark 12 | ) where 13 | 14 | import Criterion (Benchmark, bench) 15 | import Data.Monoid (mappend, mconcat) 16 | import Data.Text.Lazy.Builder (Builder, fromText, toLazyText) 17 | import Data.Text.Lazy.IO (hPutStr) 18 | import System.IO (Handle) 19 | import qualified Data.Text as T 20 | 21 | benchmark :: Handle -> IO Benchmark 22 | benchmark sink = return $ bench "BigTable" $ do 23 | hPutStr sink "Content-Type: text/html\n\n" 24 | hPutStr sink . toLazyText . makeTable =<< rows 25 | hPutStr sink "
" 26 | where 27 | -- We provide the number of rows in IO so the builder value isn't shared 28 | -- between the benchmark samples. 29 | rows :: IO Int 30 | rows = return 20000 31 | {-# NOINLINE rows #-} 32 | 33 | makeTable :: Int -> Builder 34 | makeTable n = mconcat $ replicate n $ mconcat $ map makeCol [1 .. 50] 35 | 36 | makeCol :: Int -> Builder 37 | makeCol 1 = fromText "1" 38 | makeCol 50 = fromText "50" 39 | makeCol i = fromText "" `mappend` (fromInt i `mappend` fromText "") 40 | 41 | fromInt :: Int -> Builder 42 | fromInt = fromText . T.pack . show 43 | -------------------------------------------------------------------------------- /scripts/Arsec.hs: -------------------------------------------------------------------------------- 1 | module Arsec 2 | ( 3 | comment 4 | , semi 5 | , showC 6 | , unichar 7 | , unichars 8 | , module Control.Applicative 9 | , module Control.Monad 10 | , module Data.Char 11 | , module Text.ParserCombinators.Parsec.Char 12 | , module Text.ParserCombinators.Parsec.Combinator 13 | , module Text.ParserCombinators.Parsec.Error 14 | , module Text.ParserCombinators.Parsec.Prim 15 | ) where 16 | 17 | import Control.Monad 18 | import Control.Applicative 19 | import Data.Char 20 | import Numeric 21 | import Text.ParserCombinators.Parsec.Char hiding (lower, upper) 22 | import Text.ParserCombinators.Parsec.Combinator hiding (optional) 23 | import Text.ParserCombinators.Parsec.Error 24 | import Text.ParserCombinators.Parsec.Prim hiding ((<|>), many) 25 | 26 | instance Applicative (GenParser s a) where 27 | pure = return 28 | (<*>) = ap 29 | 30 | instance Alternative (GenParser s a) where 31 | empty = mzero 32 | (<|>) = mplus 33 | 34 | unichar :: Parser Char 35 | unichar = chr . fst . head . readHex <$> many1 hexDigit 36 | 37 | unichars :: Parser [Char] 38 | unichars = manyTill (unichar <* spaces) semi 39 | 40 | semi :: Parser () 41 | semi = char ';' *> spaces *> pure () 42 | 43 | comment :: Parser String 44 | comment = (char '#' *> manyTill anyToken (char '\n')) <|> string "\n" 45 | 46 | showC :: Char -> String 47 | showC c = "'\\x" ++ d ++ "'" 48 | where h = showHex (ord c) "" 49 | d = replicate (4 - length h) '0' ++ h 50 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Equality.hs: -------------------------------------------------------------------------------- 1 | -- | Compare a string with a copy of itself that is identical except 2 | -- for the last character. 3 | -- 4 | -- Tested in this benchmark: 5 | -- 6 | -- * Comparison of strings (Eq instance) 7 | -- 8 | module Benchmarks.Equality 9 | ( benchmark 10 | ) where 11 | 12 | import Criterion (Benchmark, bgroup, bench, whnf) 13 | import qualified Data.ByteString.Char8 as B 14 | import qualified Data.ByteString.Lazy.Char8 as BL 15 | import qualified Data.Text as T 16 | import qualified Data.Text.Encoding as T 17 | import qualified Data.Text.Lazy as TL 18 | import qualified Data.Text.Lazy.Encoding as TL 19 | 20 | benchmark :: FilePath -> IO Benchmark 21 | benchmark fp = do 22 | b <- B.readFile fp 23 | bl1 <- BL.readFile fp 24 | -- A lazy bytestring is a list of chunks. When we do not explicitly create two 25 | -- different lazy bytestrings at a different address, the bytestring library 26 | -- will compare the chunk addresses instead of the chunk contents. This is why 27 | -- we read the lazy bytestring twice here. 
28 | bl2 <- BL.readFile fp 29 | l <- readFile fp 30 | let t = T.decodeUtf8 b 31 | tl = TL.decodeUtf8 bl1 32 | return $ bgroup "Equality" 33 | [ bench "Text" $ whnf (== T.init t `T.snoc` '\xfffd') t 34 | , bench "LazyText" $ whnf (== TL.init tl `TL.snoc` '\xfffd') tl 35 | , bench "ByteString" $ whnf (== B.init b `B.snoc` '\xfffd') b 36 | , bench "LazyByteString" $ whnf (== BL.init bl2 `BL.snoc` '\xfffd') bl1 37 | , bench "String" $ whnf (== init l ++ "\xfffd") l 38 | ] 39 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | # Text: Fast, packed Unicode strings, using stream fusion 2 | 3 | This package provides the Data.Text library, a library for the space- 4 | and time-efficient manipulation of Unicode text in Haskell. 5 | 6 | 7 | # Normalization, conversion, and collation, oh my! 8 | 9 | This library intentionally provides a simple API based on the 10 | Haskell prelude's list manipulation functions. For more complicated 11 | real-world tasks, such as Unicode normalization, conversion to and 12 | from a larger variety of encodings, and collation, use the [text-icu 13 | package](http://hackage.haskell.org/cgi-bin/hackage-scripts/package/text-icu). 14 | 15 | That library uses the well-respected and liberally licensed ICU 16 | library to provide these facilities. 17 | 18 | 19 | # Get involved! 20 | 21 | Please report bugs via the 22 | [github issue tracker](https://github.com/bos/text/issues). 23 | 24 | Master [git repository](https://github.com/bos/text): 25 | 26 | * `git clone git://github.com/bos/text.git` 27 | 28 | There's also a [Mercurial mirror](https://bitbucket.org/bos/text): 29 | 30 | * `hg clone https://bitbucket.org/bos/text` 31 | 32 | (You can create and contribute changes using either Mercurial or git.) 33 | 34 | 35 | # Authors 36 | 37 | The base code for this library was originally written by Tom Harper, 38 | based on the stream fusion framework developed by Roman Leshchinskiy, 39 | Duncan Coutts, and Don Stewart. 40 | 41 | The core library was fleshed out, debugged, and tested by Bryan 42 | O'Sullivan , and he is the current maintainer. 43 | -------------------------------------------------------------------------------- /Data/Text/Lazy/Builder.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, CPP, Rank2Types #-} 2 | 3 | ----------------------------------------------------------------------------- 4 | -- | 5 | -- Module : Data.Text.Lazy.Builder 6 | -- Copyright : (c) 2013 Bryan O'Sullivan 7 | -- (c) 2010 Johan Tibell 8 | -- License : BSD3-style (see LICENSE) 9 | -- 10 | -- Maintainer : Johan Tibell 11 | -- Stability : experimental 12 | -- Portability : portable to Hugs and GHC 13 | -- 14 | -- Efficient construction of lazy @Text@ values. The principal 15 | -- operations on a @Builder@ are @singleton@, @fromText@, and 16 | -- @fromLazyText@, which construct new builders, and 'mappend', which 17 | -- concatenates two builders. 18 | -- 19 | -- To get maximum performance when building lazy @Text@ values using a 20 | -- builder, associate @mappend@ calls to the right. For example, 21 | -- prefer 22 | -- 23 | -- > singleton 'a' `mappend` (singleton 'b' `mappend` singleton 'c') 24 | -- 25 | -- to 26 | -- 27 | -- > singleton 'a' `mappend` singleton 'b' `mappend` singleton 'c' 28 | -- 29 | -- as the latter associates @mappend@ to the left. 
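-- Concretely (illustrative sketch): both expressions above build the same
-- result,
--
-- > toLazyText (singleton 'a' `mappend` (singleton 'b' `mappend` singleton 'c'))
--
-- yields the lazy text "abc"; only the association, and hence the
-- performance, differs between the two forms.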
30 | -- 31 | ----------------------------------------------------------------------------- 32 | 33 | module Data.Text.Lazy.Builder 34 | ( -- * The Builder type 35 | Builder 36 | , toLazyText 37 | , toLazyTextWith 38 | 39 | -- * Constructing Builders 40 | , singleton 41 | , fromText 42 | , fromLazyText 43 | , fromString 44 | 45 | -- * Flushing the buffer state 46 | , flush 47 | ) where 48 | 49 | import Data.Text.Lazy.Builder.Internal 50 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Programs/StripTags.hs: -------------------------------------------------------------------------------- 1 | -- | Program to replace HTML tags by whitespace 2 | -- 3 | -- This program was originally contributed by Petr Prokhorenkov. 4 | -- 5 | -- Tested in this benchmark: 6 | -- 7 | -- * Reading the file 8 | -- 9 | -- * Replacing text between HTML tags (<>) with whitespace 10 | -- 11 | -- * Writing back to a handle 12 | -- 13 | {-# OPTIONS_GHC -fspec-constr-count=5 #-} 14 | module Benchmarks.Programs.StripTags 15 | ( benchmark 16 | ) where 17 | 18 | import Criterion (Benchmark, bgroup, bench) 19 | import Data.List (mapAccumL) 20 | import System.IO (Handle, hPutStr) 21 | import qualified Data.ByteString as B 22 | import qualified Data.ByteString.Char8 as BC 23 | import qualified Data.Text as T 24 | import qualified Data.Text.Encoding as T 25 | import qualified Data.Text.IO as T 26 | 27 | benchmark :: FilePath -> Handle -> IO Benchmark 28 | benchmark i o = return $ bgroup "StripTags" 29 | [ bench "String" $ readFile i >>= hPutStr o . string 30 | , bench "ByteString" $ B.readFile i >>= B.hPutStr o . byteString 31 | , bench "Text" $ T.readFile i >>= T.hPutStr o . text 32 | , bench "TextByteString" $ 33 | B.readFile i >>= B.hPutStr o . T.encodeUtf8 . text . T.decodeUtf8 34 | ] 35 | 36 | string :: String -> String 37 | string = snd . mapAccumL step 0 38 | 39 | text :: T.Text -> T.Text 40 | text = snd . T.mapAccumL step 0 41 | 42 | byteString :: B.ByteString -> B.ByteString 43 | byteString = snd . BC.mapAccumL step 0 44 | 45 | step :: Int -> Char -> (Int, Char) 46 | step d c 47 | | d > 0 || d' > 0 = (d', ' ') 48 | | otherwise = (d', c) 49 | where 50 | d' = d + depth c 51 | depth '>' = 1 52 | depth '<' = -1 53 | depth _ = 0 54 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Programs/Throughput.hs: -------------------------------------------------------------------------------- 1 | -- | This benchmark simply reads and writes a file using the various string 2 | -- libraries. The point of it is that we can make better estimations on how 3 | -- much time the other benchmarks spend doing IO. 4 | -- 5 | -- Note that we expect ByteStrings to be a whole lot faster, since they do not 6 | -- do any actual encoding/decoding here, while String and Text do have UTF-8 7 | -- encoding/decoding. 
8 | -- 9 | -- Tested in this benchmark: 10 | -- 11 | -- * Reading the file 12 | -- 13 | -- * Replacing text between HTML tags (<>) with whitespace 14 | -- 15 | -- * Writing back to a handle 16 | -- 17 | module Benchmarks.Programs.Throughput 18 | ( benchmark 19 | ) where 20 | 21 | import Criterion (Benchmark, bgroup, bench) 22 | import System.IO (Handle, hPutStr) 23 | import qualified Data.ByteString as B 24 | import qualified Data.ByteString.Lazy as BL 25 | import qualified Data.Text.Encoding as T 26 | import qualified Data.Text.IO as T 27 | import qualified Data.Text.Lazy.Encoding as TL 28 | import qualified Data.Text.Lazy.IO as TL 29 | 30 | benchmark :: FilePath -> Handle -> IO Benchmark 31 | benchmark fp sink = return $ bgroup "Throughput" 32 | [ bench "String" $ readFile fp >>= hPutStr sink 33 | , bench "ByteString" $ B.readFile fp >>= B.hPutStr sink 34 | , bench "LazyByteString" $ BL.readFile fp >>= BL.hPutStr sink 35 | , bench "Text" $ T.readFile fp >>= T.hPutStr sink 36 | , bench "LazyText" $ TL.readFile fp >>= TL.hPutStr sink 37 | , bench "TextByteString" $ 38 | B.readFile fp >>= B.hPutStr sink . T.encodeUtf8 . T.decodeUtf8 39 | , bench "LazyTextByteString" $ 40 | BL.readFile fp >>= BL.hPutStr sink . TL.encodeUtf8 . TL.decodeUtf8 41 | ] 42 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Search.hs: -------------------------------------------------------------------------------- 1 | -- | Search for a pattern in a file, find the number of occurences 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Searching all occurences of a pattern using library routines 6 | -- 7 | module Benchmarks.Search 8 | ( benchmark 9 | ) where 10 | 11 | import Criterion (Benchmark, bench, bgroup, whnf) 12 | import qualified Data.ByteString as B 13 | import qualified Data.ByteString.Lazy as BL 14 | import qualified Data.ByteString.Lazy.Search as BL 15 | import qualified Data.ByteString.Search as B 16 | import qualified Data.Text as T 17 | import qualified Data.Text.Encoding as T 18 | import qualified Data.Text.IO as T 19 | import qualified Data.Text.Lazy as TL 20 | import qualified Data.Text.Lazy.IO as TL 21 | 22 | benchmark :: FilePath -> T.Text -> IO Benchmark 23 | benchmark fp needleT = do 24 | b <- B.readFile fp 25 | bl <- BL.readFile fp 26 | t <- T.readFile fp 27 | tl <- TL.readFile fp 28 | return $ bgroup "FileIndices" 29 | [ bench "ByteString" $ whnf (byteString needleB) b 30 | , bench "LazyByteString" $ whnf (lazyByteString needleB) bl 31 | , bench "Text" $ whnf (text needleT) t 32 | , bench "LazyText" $ whnf (lazyText needleTL) tl 33 | ] 34 | where 35 | needleB = T.encodeUtf8 needleT 36 | needleTL = TL.fromChunks [needleT] 37 | 38 | byteString :: B.ByteString -> B.ByteString -> Int 39 | byteString needle = length . B.indices needle 40 | 41 | lazyByteString :: B.ByteString -> BL.ByteString -> Int 42 | lazyByteString needle = length . BL.indices needle 43 | 44 | text :: T.Text -> T.Text -> Int 45 | text = T.count 46 | 47 | lazyText :: TL.Text -> TL.Text -> Int 48 | lazyText needle = fromIntegral . TL.count needle 49 | -------------------------------------------------------------------------------- /scripts/SpecialCasing.hs: -------------------------------------------------------------------------------- 1 | -- This script processes the following source file: 2 | -- 3 | -- http://unicode.org/Public/UNIDATA/SpecialCasing.txt 4 | 5 | module SpecialCasing 6 | ( 7 | Case(..) 
8 | , parseSC 9 | , mapSC 10 | ) where 11 | 12 | import Arsec 13 | 14 | data Case = Case { 15 | code :: Char 16 | , lower :: [Char] 17 | , title :: [Char] 18 | , upper :: [Char] 19 | , conditions :: String 20 | , name :: String 21 | } deriving (Eq, Ord, Show) 22 | 23 | entries :: Parser [Case] 24 | entries = many comment *> many (entry <* many comment) 25 | where 26 | entry = Case <$> unichar <* semi 27 | <*> unichars 28 | <*> unichars 29 | <*> unichars 30 | <*> manyTill anyToken (string "# ") 31 | <*> manyTill anyToken (char '\n') 32 | 33 | parseSC :: FilePath -> IO (Either ParseError [Case]) 34 | parseSC name = parse entries name <$> readFile name 35 | 36 | mapSC :: String -> (Case -> String) -> (Char -> Char) -> [Case] -> [String] 37 | mapSC which access twiddle ms = typ ++ (map nice . filter p $ ms) ++ [last] 38 | where 39 | typ = [which ++ "Mapping :: forall s. Char -> s -> Step (CC s) Char" 40 | ,"{-# INLINE " ++ which ++ "Mapping #-}"] 41 | last = which ++ "Mapping c s = Yield (to" ++ ucFirst which ++ " c) (CC s '\\0' '\\0')" 42 | nice c = "-- " ++ name c ++ "\n" ++ 43 | which ++ "Mapping " ++ showC (code c) ++ " s = Yield " ++ x ++ " (CC s " ++ y ++ " " ++ z ++ ")" 44 | where [x,y,z] = (map showC . take 3) (access c ++ repeat '\0') 45 | p c = [k] /= a && a /= [twiddle k] && null (conditions c) 46 | where a = access c 47 | k = code c 48 | 49 | ucFirst (c:cs) = toUpper c : cs 50 | ucFirst [] = [] 51 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/FoldLines.hs: -------------------------------------------------------------------------------- 1 | -- | Read a file line-by-line using handles, and perform a fold over the lines. 2 | -- The fold is used here to calculate the number of lines in the file. 
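-- A usage sketch (the handle 'h' is hypothetical; the benchmark below opens
-- its own):
--
-- > lineCount <- foldLinesT (\n _ -> n + 1) (0 :: Int) h
--
-- counts the lines of 'h' without retaining their contents.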
3 | -- 4 | -- Tested in this benchmark: 5 | -- 6 | -- * Buffered, line-based IO 7 | -- 8 | {-# LANGUAGE BangPatterns #-} 9 | module Benchmarks.FoldLines 10 | ( benchmark 11 | ) where 12 | 13 | import Criterion (Benchmark, bgroup, bench) 14 | import System.IO 15 | import qualified Data.ByteString as B 16 | import qualified Data.Text as T 17 | import qualified Data.Text.IO as T 18 | 19 | benchmark :: FilePath -> IO Benchmark 20 | benchmark fp = return $ bgroup "ReadLines" 21 | [ bench "Text" $ withHandle $ foldLinesT (\n _ -> n + 1) (0 :: Int) 22 | , bench "ByteString" $ withHandle $ foldLinesB (\n _ -> n + 1) (0 :: Int) 23 | ] 24 | where 25 | withHandle f = do 26 | h <- openFile fp ReadMode 27 | hSetBuffering h (BlockBuffering (Just 16384)) 28 | x <- f h 29 | hClose h 30 | return x 31 | 32 | -- | Text line fold 33 | -- 34 | foldLinesT :: (a -> T.Text -> a) -> a -> Handle -> IO a 35 | foldLinesT f z0 h = go z0 36 | where 37 | go !z = do 38 | eof <- hIsEOF h 39 | if eof 40 | then return z 41 | else do 42 | l <- T.hGetLine h 43 | let z' = f z l in go z' 44 | {-# INLINE foldLinesT #-} 45 | 46 | -- | ByteString line fold 47 | -- 48 | foldLinesB :: (a -> B.ByteString -> a) -> a -> Handle -> IO a 49 | foldLinesB f z0 h = go z0 50 | where 51 | go !z = do 52 | eof <- hIsEOF h 53 | if eof 54 | then return z 55 | else do 56 | l <- B.hGetLine h 57 | let z' = f z l in go z' 58 | {-# INLINE foldLinesB #-} 59 | -------------------------------------------------------------------------------- /benchmarks/text-benchmarks.cabal: -------------------------------------------------------------------------------- 1 | name: text-benchmarks 2 | version: 0.0.0.0 3 | synopsis: Benchmarks for the text package 4 | description: Benchmarks for the text package 5 | homepage: https://bitbucket.org/bos/text 6 | license: BSD3 7 | license-file: ../LICENSE 8 | author: Jasper Van der Jeugt , 9 | Bryan O'Sullivan , 10 | Tom Harper , 11 | Duncan Coutts 12 | maintainer: jaspervdj@gmail.com 13 | category: Text 14 | build-type: Simple 15 | 16 | cabal-version: >=1.2 17 | 18 | flag llvm 19 | description: use LLVM 20 | default: False 21 | 22 | executable text-benchmarks 23 | hs-source-dirs: haskell .. 
24 | c-sources: ../cbits/cbits.c 25 | cbits/time_iconv.c 26 | main-is: Benchmarks.hs 27 | ghc-options: -Wall -O2 28 | if flag(llvm) 29 | ghc-options: -fllvm 30 | cpp-options: -DHAVE_DEEPSEQ -DINTEGER_GMP 31 | build-depends: base == 4.*, 32 | binary, 33 | blaze-builder, 34 | bytestring, 35 | bytestring-lexing, 36 | containers, 37 | criterion >= 0.6.0.1, 38 | deepseq, 39 | directory, 40 | filepath, 41 | ghc-prim, 42 | integer-gmp, 43 | stringsearch, 44 | utf8-string 45 | 46 | executable text-multilang 47 | hs-source-dirs: haskell 48 | main-is: Multilang.hs 49 | ghc-options: -Wall -O2 50 | build-depends: base == 4.*, 51 | bytestring, 52 | text, 53 | time 54 | -------------------------------------------------------------------------------- /tests/Tests/Utils.hs: -------------------------------------------------------------------------------- 1 | -- | Miscellaneous testing utilities 2 | -- 3 | {-# LANGUAGE ScopedTypeVariables #-} 4 | module Tests.Utils 5 | ( 6 | (=^=) 7 | , withRedirect 8 | , withTempFile 9 | ) where 10 | 11 | import Control.Exception (SomeException, bracket, bracket_, evaluate, try) 12 | import Control.Monad (when) 13 | import Debug.Trace (trace) 14 | import GHC.IO.Handle.Internals (withHandle) 15 | import System.Directory (removeFile) 16 | import System.IO (Handle, hClose, hFlush, hIsOpen, hIsWritable, openTempFile) 17 | import System.IO.Unsafe (unsafePerformIO) 18 | 19 | -- Ensure that two potentially bottom values (in the sense of crashing 20 | -- for some inputs, not looping infinitely) either both crash, or both 21 | -- give comparable results for some input. 22 | (=^=) :: (Eq a, Show a) => a -> a -> Bool 23 | i =^= j = unsafePerformIO $ do 24 | x <- try (evaluate i) 25 | y <- try (evaluate j) 26 | case (x,y) of 27 | (Left (_ :: SomeException), Left (_ :: SomeException)) 28 | -> return True 29 | (Right a, Right b) -> return (a == b) 30 | e -> trace ("*** Divergence: " ++ show e) return False 31 | infix 4 =^= 32 | {-# NOINLINE (=^=) #-} 33 | 34 | withTempFile :: (FilePath -> Handle -> IO a) -> IO a 35 | withTempFile = bracket (openTempFile "." "crashy.txt") cleanupTemp . uncurry 36 | where 37 | cleanupTemp (path,h) = do 38 | open <- hIsOpen h 39 | when open (hClose h) 40 | removeFile path 41 | 42 | withRedirect :: Handle -> Handle -> IO a -> IO a 43 | withRedirect tmp h = bracket_ swap swap 44 | where 45 | whenM p a = p >>= (`when` a) 46 | swap = do 47 | whenM (hIsOpen tmp) $ whenM (hIsWritable tmp) $ hFlush tmp 48 | whenM (hIsOpen h) $ whenM (hIsWritable h) $ hFlush h 49 | withHandle "spam" tmp $ \tmph -> do 50 | hh <- withHandle "spam" h $ \hh -> 51 | return (tmph,hh) 52 | return (hh,()) 53 | -------------------------------------------------------------------------------- /Data/Text/Unsafe/Base.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, MagicHash, UnboxedTuples #-} 2 | -- | 3 | -- Module : Data.Text.Unsafe.Base 4 | -- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan 5 | -- License : BSD-style 6 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 7 | -- duncan@haskell.org 8 | -- Stability : experimental 9 | -- Portability : portable 10 | -- 11 | -- A module containing unsafe operations, for very very careful use in 12 | -- heavily tested code. 
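-- A usage sketch (editorial; 'ptr' and 'i' are hypothetical and not part of
-- this module): the defensible pattern for 'inlinePerformIO' is a pure read
-- of memory that is already allocated and will not be mutated, e.g.
--
-- > inlinePerformIO (peekElemOff ptr i)
--
-- keeping in mind the warning below that no allocation may happen inside
-- such a block.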
13 | module Data.Text.Unsafe.Base 14 | ( 15 | inlineInterleaveST 16 | , inlinePerformIO 17 | ) where 18 | 19 | import GHC.ST (ST(..)) 20 | #if defined(__GLASGOW_HASKELL__) 21 | # if __GLASGOW_HASKELL__ >= 611 22 | import GHC.IO (IO(IO)) 23 | # else 24 | import GHC.IOBase (IO(IO)) 25 | # endif 26 | import GHC.Base (realWorld#) 27 | #endif 28 | 29 | 30 | -- | Just like unsafePerformIO, but we inline it. Big performance gains as 31 | -- it exposes lots of things to further inlining. /Very unsafe/. In 32 | -- particular, you should do no memory allocation inside an 33 | -- 'inlinePerformIO' block. On Hugs this is just @unsafePerformIO@. 34 | -- 35 | {-# INLINE inlinePerformIO #-} 36 | inlinePerformIO :: IO a -> a 37 | #if defined(__GLASGOW_HASKELL__) 38 | inlinePerformIO (IO m) = case m realWorld# of (# _, r #) -> r 39 | #else 40 | inlinePerformIO = unsafePerformIO 41 | #endif 42 | 43 | -- | Allow an 'ST' computation to be deferred lazily. When passed an 44 | -- action of type 'ST' @s@ @a@, the action will only be performed when 45 | -- the value of @a@ is demanded. 46 | -- 47 | -- This function is identical to the normal unsafeInterleaveST, but is 48 | -- inlined and hence faster. 49 | -- 50 | -- /Note/: This operation is highly unsafe, as it can introduce 51 | -- externally visible non-determinism into an 'ST' action. 52 | inlineInterleaveST :: ST s a -> ST s a 53 | inlineInterleaveST (ST m) = ST $ \ s -> 54 | let r = case m s of (# _, res #) -> res in (# s, r #) 55 | {-# INLINE inlineInterleaveST #-} 56 | -------------------------------------------------------------------------------- /Data/Text/UnsafeShift.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE MagicHash #-} 2 | 3 | -- | 4 | -- Module : Data.Text.UnsafeShift 5 | -- Copyright : (c) Bryan O'Sullivan 2009 6 | -- 7 | -- License : BSD-style 8 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 9 | -- duncan@haskell.org 10 | -- Stability : experimental 11 | -- Portability : GHC 12 | -- 13 | -- Fast, unchecked bit shifting functions. 14 | 15 | module Data.Text.UnsafeShift 16 | ( 17 | UnsafeShift(..) 18 | ) where 19 | 20 | -- import qualified Data.Bits as Bits 21 | import GHC.Base 22 | import GHC.Word 23 | 24 | -- | This is a workaround for poor optimisation in GHC 6.8.2. It 25 | -- fails to notice constant-width shifts, and adds a test and branch 26 | -- to every shift. This imposes about a 10% performance hit. 27 | -- 28 | -- These functions are undefined when the amount being shifted by is 29 | -- greater than the size in bits of a machine Int#. 
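-- A usage sketch (editorial; results worked out by hand, not part of the
-- original module):
--
-- > shiftL (0x00ff :: Word16) 8 == 0xff00
-- > shiftR (0xff00 :: Word16) 8 == 0x00ff
--
-- The caveat above still applies: the shift count must stay below the width
-- of a machine Int#.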
30 | class UnsafeShift a where 31 | shiftL :: a -> Int -> a 32 | shiftR :: a -> Int -> a 33 | 34 | instance UnsafeShift Word16 where 35 | {-# INLINE shiftL #-} 36 | shiftL (W16# x#) (I# i#) = W16# (narrow16Word# (x# `uncheckedShiftL#` i#)) 37 | 38 | {-# INLINE shiftR #-} 39 | shiftR (W16# x#) (I# i#) = W16# (x# `uncheckedShiftRL#` i#) 40 | 41 | instance UnsafeShift Word32 where 42 | {-# INLINE shiftL #-} 43 | shiftL (W32# x#) (I# i#) = W32# (narrow32Word# (x# `uncheckedShiftL#` i#)) 44 | 45 | {-# INLINE shiftR #-} 46 | shiftR (W32# x#) (I# i#) = W32# (x# `uncheckedShiftRL#` i#) 47 | 48 | instance UnsafeShift Word64 where 49 | {-# INLINE shiftL #-} 50 | shiftL (W64# x#) (I# i#) = W64# (x# `uncheckedShiftL64#` i#) 51 | 52 | {-# INLINE shiftR #-} 53 | shiftR (W64# x#) (I# i#) = W64# (x# `uncheckedShiftRL64#` i#) 54 | 55 | instance UnsafeShift Int where 56 | {-# INLINE shiftL #-} 57 | shiftL (I# x#) (I# i#) = I# (x# `iShiftL#` i#) 58 | 59 | {-# INLINE shiftR #-} 60 | shiftR (I# x#) (I# i#) = I# (x# `iShiftRA#` i#) 61 | 62 | {- 63 | instance UnsafeShift Integer where 64 | {-# INLINE shiftL #-} 65 | shiftL = Bits.shiftL 66 | 67 | {-# INLINE shiftR #-} 68 | shiftR = Bits.shiftR 69 | -} 70 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Programs/Fold.hs: -------------------------------------------------------------------------------- 1 | -- | Benchmark which reformats paragraphs, like the @fold@ unix utility. 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Reading the file 6 | -- 7 | -- * Splitting into paragraphs 8 | -- 9 | -- * Reformatting the paragraphs to a certain line width 10 | -- 11 | -- * Concatenating the results using the text builder 12 | -- 13 | -- * Writing back to a handle 14 | -- 15 | {-# LANGUAGE OverloadedStrings #-} 16 | module Benchmarks.Programs.Fold 17 | ( benchmark 18 | ) where 19 | 20 | import Data.List (foldl') 21 | import Data.List (intersperse) 22 | import Data.Monoid (mempty, mappend, mconcat) 23 | import System.IO (Handle) 24 | import Criterion (Benchmark, bench) 25 | import qualified Data.Text as T 26 | import qualified Data.Text.IO as T 27 | import qualified Data.Text.Lazy.Builder as TLB 28 | import qualified Data.Text.Lazy as TL 29 | import qualified Data.Text.Lazy.IO as TL 30 | 31 | benchmark :: FilePath -> Handle -> IO Benchmark 32 | benchmark i o = return $ 33 | bench "Fold" $ T.readFile i >>= TL.hPutStr o . fold 80 34 | 35 | -- | We represent a paragraph by a word list 36 | -- 37 | type Paragraph = [T.Text] 38 | 39 | -- | Fold a text 40 | -- 41 | fold :: Int -> T.Text -> TL.Text 42 | fold maxWidth = TLB.toLazyText . mconcat . 43 | intersperse "\n\n" . map (foldParagraph maxWidth) . paragraphs 44 | 45 | -- | Fold a paragraph 46 | -- 47 | foldParagraph :: Int -> Paragraph -> TLB.Builder 48 | foldParagraph _ [] = mempty 49 | foldParagraph max' (w : ws) = fst $ foldl' go (TLB.fromText w, T.length w) ws 50 | where 51 | go (builder, width) word 52 | | width + len + 1 <= max' = 53 | (builder `mappend` " " `mappend` word', width + len + 1) 54 | | otherwise = 55 | (builder `mappend` "\n" `mappend` word', len) 56 | where 57 | word' = TLB.fromText word 58 | len = T.length word 59 | 60 | -- | Divide a text into paragraphs 61 | -- 62 | paragraphs :: T.Text -> [Paragraph] 63 | paragraphs = splitParagraphs . map T.words .
T.lines 64 | where 65 | splitParagraphs ls = case break null ls of 66 | ([], []) -> [] 67 | (p, []) -> [concat p] 68 | (p, lr) -> concat p : splitParagraphs (dropWhile null lr) 69 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/DecodeUtf8.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ForeignFunctionInterface #-} 2 | 3 | -- | Test decoding of UTF-8 4 | -- 5 | -- Tested in this benchmark: 6 | -- 7 | -- * Decoding bytes using UTF-8 8 | -- 9 | -- In some tests: 10 | -- 11 | -- * Taking the length of the result 12 | -- 13 | -- * Taking the init of the result 14 | -- 15 | -- The latter are used for testing stream fusion. 16 | -- 17 | module Benchmarks.DecodeUtf8 18 | ( benchmark 19 | ) where 20 | 21 | import Foreign.C.Types 22 | import Data.ByteString.Internal (ByteString(..)) 23 | import Foreign.Ptr (Ptr, plusPtr) 24 | import Foreign.ForeignPtr (withForeignPtr) 25 | import Data.Word (Word8) 26 | import qualified Criterion as C 27 | import Criterion (Benchmark, bgroup, nf) 28 | import qualified Codec.Binary.UTF8.Generic as U8 29 | import qualified Data.ByteString as B 30 | import qualified Data.ByteString.Lazy as BL 31 | import qualified Data.Text as T 32 | import qualified Data.Text.Encoding as T 33 | import qualified Data.Text.Lazy as TL 34 | import qualified Data.Text.Lazy.Encoding as TL 35 | 36 | benchmark :: String -> FilePath -> IO Benchmark 37 | benchmark kind fp = do 38 | bs <- B.readFile fp 39 | lbs <- BL.readFile fp 40 | let bench name = C.bench (name ++ "+" ++ kind) 41 | return $ bgroup "DecodeUtf8" 42 | [ bench "Strict" $ nf T.decodeUtf8 bs 43 | , bench "IConv" $ iconv bs 44 | , bench "StrictLength" $ nf (T.length . T.decodeUtf8) bs 45 | , bench "StrictInitLength" $ nf (T.length . T.init . T.decodeUtf8) bs 46 | , bench "Lazy" $ nf TL.decodeUtf8 lbs 47 | , bench "LazyLength" $ nf (TL.length . TL.decodeUtf8) lbs 48 | , bench "LazyInitLength" $ nf (TL.length . TL.init . TL.decodeUtf8) lbs 49 | , bench "StrictStringUtf8" $ nf U8.toString bs 50 | , bench "StrictStringUtf8Length" $ nf (length . U8.toString) bs 51 | , bench "LazyStringUtf8" $ nf U8.toString lbs 52 | , bench "LazyStringUtf8Length" $ nf (length . 
U8.toString) lbs 53 | ] 54 | 55 | iconv :: ByteString -> IO CInt 56 | iconv (PS fp off len) = withForeignPtr fp $ \ptr -> 57 | time_iconv (ptr `plusPtr` off) (fromIntegral len) 58 | 59 | foreign import ccall unsafe time_iconv :: Ptr Word8 -> CSize -> IO CInt 60 | -------------------------------------------------------------------------------- /.hgtags: -------------------------------------------------------------------------------- 1 | 0edd5dc96024e6c0f56f175ffa673f9178cece8c 0.9.0.0 2 | 1de0da99d31ba212e179d88957cf5f3b9c4facd7 0.7.0.1 3 | 30575507578cf04b5b1501ee23b1faa83315eb88 0.1 4 | 321e61f17630dbd346c1c9991831572f964b7d5a 0.4 5 | 34cef27f87287cd3c17d78ac99e9de2d6c8a1200 0.6 6 | 3cbff6d9bca135a85e428a7f6c4e15739a562b8b 0.7.1.0 7 | 587c8661393f83cb2b9452f6cc1d6885e52fe544 0.8.1.0 8 | 61c322bd5b6446255870422f854266396169a7e5 0.5 9 | 9adb5edd9b6407335e8772ea86036b938471fc89 0.2 10 | b4fdbef1c1027a70b95c1a81747d968890acc3f6 0.3 11 | de7cb0cb2a72606126c357b860bb0dbf7f145ad2 0.9.0.1 12 | f2c9b1c019e3277926378d7b5a604f1c995f0202 0.7 13 | f3d625f2d0730cb6adf8506a1f37762178c9aea6 0.10.0.0 14 | 99ea36957b1d47145d70f2ee8b13f1ddfd09245c 0.10.0.1 15 | 1b7889216e78b00d69b32c48782111bcd935978d 0.10.0.2 16 | 75157eaec6eb38574ad5c347e75c48dcb7262dee 0.11.0.0 17 | 0d7bc7bd77ec5a8f6d064de59d459e5eaf906b0f 0.11.0.1 18 | a9d582a75dea9a4417883019d5110a35e4c07265 0.11.0.2 19 | 36b2a3aa661892ea25101199220383ba6d579abb 0.11.0.3 20 | 06e407bcdc9a3905abf9551062ecc6f92abcdde5 0.11.0.4 21 | 94bcf16e812f6fb49b87598673e17270f48c2bf1 0.11.0.5 22 | 784a32ffbec0b77adb43e9a64328db0482007309 0.11.0.6 23 | c808123a3d37f25004d2aad8f3a395acfcf9122f 0.11.0.7 24 | 679826e742b239c5dd0c1fa4ebe98aa4bfe02a00 0.11.0.8 25 | b49eebfa5673da89a23939d523c9de9edbd410b0 0.11.1.0 26 | 9c820a2ecaea01bc614d8e6bccf90e57431bdcbb 0.11.1.1 27 | ed3a60ec627af6ba8af321009aa00224b3296e47 0.11.1.2 28 | b75d3041d275e8e76b26605b0d426a572ddf1249 0.11.1.3 29 | 53906ad0c7e64f6c9a5df569e7cc642579aab8fc 0.11.1.5 30 | 9d6d3a9690ade506897c072060eb92868e229d5c 0.11.1.6 31 | 5ac062eace36b333566d1676979a57039a188840 0.11.1.7 32 | 9f01361a73071936b6ab0bdb879f1a45cca6577a 0.11.1.8 33 | 5dce2a934be53e74310fedffb51d7bd3e61fa993 0.11.1.9 34 | 9f47a2cfc9e51fd622d7553f08ad2ac1faad0438 0.11.1.10 35 | 9f47a2cfc9e51fd622d7553f08ad2ac1faad0438 0.11.1.10 36 | 407937739e9e764f1ae0f1f9ca454c42dca38772 0.11.1.10 37 | 8b981edd27befa4c2dd334fcb7db22ac67e22b67 0.11.1.11 38 | 204da16b5098531bdf858c388e2620238ef2aa5e 0.11.1.12 39 | 6a3d847a56a69d0514a79cb212cb218271ad0917 0.11.1.13 40 | 1d2c6fa9092c6a4000b2abdd9d01f3efcd477be5 0.11.2.0 41 | 78219784cf3652cc662805bf2971bd62d80210a9 0.11.2.1 42 | 4297307ebc11ad677cfba6b40319e7e5e2c0cfee 0.11.2.3 43 | 7fa79662b66aade97fe49394977213fe6432942e 0.11.3.0 44 | d99cd091cdf71ce807a4255f6cc509c3154f51ea 0.11.3.1 45 | -------------------------------------------------------------------------------- /tests-and-benchmarks.markdown: -------------------------------------------------------------------------------- 1 | Tests and benchmarks 2 | ==================== 3 | 4 | Prerequisites 5 | ------------- 6 | 7 | To run the tests and benchmarks, you will need the test data, which 8 | you can clone from one of the following locations: 9 | 10 | * Mercurial master repository: 11 | [bitbucket.org/bos/text-test-data](https://bitbucket.org/bos/text-test-data) 12 | 13 | * Git mirror repository: 14 | [github.com/bos/text-test-data](https://github.com/bos/text-test-data) 15 | 16 | You should clone that repository into the `tests` subdirectory (your 17 | 
clone must be named `text-test-data` locally), then run `make -C 18 | tests/text-test-data` to uncompress the test files. Many tests and 19 | benchmarks will fail if the test files are missing. 20 | 21 | Functional tests 22 | ---------------- 23 | 24 | The functional tests are located in the `tests` subdirectory. An overview of 25 | what's in that directory: 26 | 27 | Makefile Has targets for common tasks 28 | Tests Source files of the testing code 29 | scripts Various utility scripts 30 | text-tests.cabal Cabal file that compiles all tests 31 | 32 | The `text-tests.cabal` file builds: 33 | 34 | - A copy of the text library, sharing the source code, but exposing all internal 35 | modules, for testing purposes 36 | - The different test suites 37 | 38 | To compile, run all tests, and generate a coverage report, simply use `make`. 39 | 40 | Benchmarks 41 | ---------- 42 | 43 | The benchmarks are located in the `benchmarks` subdirectory. An overview of 44 | what's in that directory: 45 | 46 | Makefile Has targets for common tasks 47 | haskell Source files of the Haskell benchmarks 48 | python Python implementations of some benchmarks 49 | ruby Ruby implementations of some benchmarks 50 | text-benchmarks.cabal Cabal file which compiles all benchmarks 51 | 52 | To compile the benchmarks, navigate to the `benchmarks` subdirectory and run 53 | `cabal configure && cabal build`. Then, you can run the benchmarks using: 54 | 55 | ./dist/build/text-benchmarks/text-benchmarks 56 | 57 | However, since there are quite a lot of benchmarks, you usually don't want to 58 | run them all. Instead, use the `-l` flag to get a list of benchmarks: 59 | 60 | ./dist/build/text-benchmarks/text-benchmarks -l 61 | 62 | Then run the ones you want to inspect. If you want to configure the benchmarks 63 | further, the exact parameters can be changed in `Benchmarks.hs`. 64 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Programs/Sort.hs: -------------------------------------------------------------------------------- 1 | -- | This benchmark sorts the lines of a file, like the @sort@ unix utility. 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Reading the file 6 | -- 7 | -- * Splitting into lines 8 | -- 9 | -- * Sorting the lines 10 | -- 11 | -- * Joining the lines 12 | -- 13 | -- * Writing back to a handle 14 | -- 15 | {-# LANGUAGE OverloadedStrings #-} 16 | module Benchmarks.Programs.Sort 17 | ( benchmark 18 | ) where 19 | 20 | import Criterion (Benchmark, bgroup, bench) 21 | import Data.Monoid (mconcat) 22 | import System.IO (Handle, hPutStr) 23 | import qualified Data.ByteString as B 24 | import qualified Data.ByteString.Char8 as BC 25 | import qualified Data.ByteString.Lazy as BL 26 | import qualified Data.ByteString.Lazy.Char8 as BLC 27 | import qualified Data.List as L 28 | import qualified Data.Text as T 29 | import qualified Data.Text.Encoding as T 30 | import qualified Data.Text.IO as T 31 | import qualified Data.Text.Lazy as TL 32 | import qualified Data.Text.Lazy.Builder as TLB 33 | import qualified Data.Text.Lazy.Encoding as TL 34 | import qualified Data.Text.Lazy.IO as TL 35 | 36 | benchmark :: FilePath -> Handle -> IO Benchmark 37 | benchmark i o = return $ bgroup "Sort" 38 | [ bench "String" $ readFile i >>= hPutStr o . string 39 | , bench "ByteString" $ B.readFile i >>= B.hPutStr o . byteString 40 | , bench "LazyByteString" $ BL.readFile i >>= BL.hPutStr o . lazyByteString 41 | , bench "Text" $ T.readFile i >>= T.hPutStr o .
text 42 | , bench "LazyText" $ TL.readFile i >>= TL.hPutStr o . lazyText 43 | , bench "TextByteString" $ B.readFile i >>= 44 | B.hPutStr o . T.encodeUtf8 . text . T.decodeUtf8 45 | , bench "LazyTextByteString" $ BL.readFile i >>= 46 | BL.hPutStr o . TL.encodeUtf8 . lazyText . TL.decodeUtf8 47 | , bench "TextBuilder" $ B.readFile i >>= 48 | BL.hPutStr o . TL.encodeUtf8 . textBuilder . T.decodeUtf8 49 | ] 50 | 51 | string :: String -> String 52 | string = unlines . L.sort . lines 53 | 54 | byteString :: B.ByteString -> B.ByteString 55 | byteString = BC.unlines . L.sort . BC.lines 56 | 57 | lazyByteString :: BL.ByteString -> BL.ByteString 58 | lazyByteString = BLC.unlines . L.sort . BLC.lines 59 | 60 | text :: T.Text -> T.Text 61 | text = T.unlines . L.sort . T.lines 62 | 63 | lazyText :: TL.Text -> TL.Text 64 | lazyText = TL.unlines . L.sort . TL.lines 65 | 66 | -- | Text variant using a builder monoid for the final concatenation 67 | -- 68 | textBuilder :: T.Text -> TL.Text 69 | textBuilder = TLB.toLazyText . mconcat . L.intersperse (TLB.singleton '\n') . 70 | map TLB.fromText . L.sort . T.lines 71 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Builder.hs: -------------------------------------------------------------------------------- 1 | -- | Testing the internal builder monoid 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Concatenating many small strings using a builder 6 | -- 7 | {-# LANGUAGE OverloadedStrings #-} 8 | module Benchmarks.Builder 9 | ( benchmark 10 | ) where 11 | 12 | import Criterion (Benchmark, bgroup, bench, nf) 13 | import Data.Binary.Builder as B 14 | import Data.ByteString.Char8 () 15 | import Data.Monoid (mconcat, mempty) 16 | import qualified Blaze.ByteString.Builder as Blaze 17 | import qualified Blaze.ByteString.Builder.Char.Utf8 as Blaze 18 | import qualified Data.ByteString as SB 19 | import qualified Data.ByteString.Lazy as LB 20 | import qualified Data.Text as T 21 | import qualified Data.Text.Lazy as LT 22 | import qualified Data.Text.Lazy.Builder as LTB 23 | import qualified Data.Text.Lazy.Builder.Int as Int 24 | import Data.Int (Int64) 25 | 26 | benchmark :: IO Benchmark 27 | benchmark = return $ bgroup "Builder" 28 | [ bgroup "Comparison" 29 | [ bench "LazyText" $ nf 30 | (LT.length . LTB.toLazyText . mconcat . map LTB.fromText) texts 31 | , bench "Binary" $ nf 32 | (LB.length . B.toLazyByteString . mconcat . map B.fromByteString) 33 | byteStrings 34 | , bench "Blaze" $ nf 35 | (LB.length . Blaze.toLazyByteString . mconcat . map Blaze.fromString) 36 | strings 37 | ] 38 | , bgroup "Int" 39 | [ bgroup "Decimal" 40 | [ bgroup "Positive" . 41 | flip map numbers $ \n -> 42 | (bench (show (length (show n))) $ nf (LTB.toLazyText . Int.decimal) n) 43 | , bgroup "Negative" . 44 | flip map numbers $ \m -> 45 | let n = negate m in 46 | (bench (show (length (show n))) $ nf (LTB.toLazyText . Int.decimal) n) 47 | , bench "Empty" $ nf LTB.toLazyText mempty 48 | , bgroup "Show" . 
49 | flip map numbers $ \n -> 50 | (bench (show (length (show n))) $ nf show n) 51 | ] 52 | ] 53 | ] 54 | where 55 | numbers :: [Int64] 56 | numbers = [ 57 | 6, 14, 500, 9688, 10654, 620735, 5608880, 37010612, 58 | 731223504, 5061580596, 24596952933, 711732309084, 2845910093839, 59 | 54601756118340, 735159434806159, 3619097625502435, 95777227510267124, 60 | 414944309510675693, 8986407456998704019 61 | ] 62 | 63 | texts :: [T.Text] 64 | texts = take 200000 $ cycle ["foo", "λx", "由の"] 65 | {-# NOINLINE texts #-} 66 | 67 | -- Note that the non-ascii characters will be chopped 68 | byteStrings :: [SB.ByteString] 69 | byteStrings = take 200000 $ cycle ["foo", "λx", "由の"] 70 | {-# NOINLINE byteStrings #-} 71 | 72 | -- Note that the non-ascii characters will be chopped 73 | strings :: [String] 74 | strings = take 200000 $ cycle ["foo", "λx", "由の"] 75 | {-# NOINLINE strings #-} 76 | -------------------------------------------------------------------------------- /tests/Tests/Regressions.hs: -------------------------------------------------------------------------------- 1 | -- | Regression tests for specific bugs. 2 | -- 3 | {-# LANGUAGE OverloadedStrings, ScopedTypeVariables #-} 4 | module Tests.Regressions 5 | ( 6 | tests 7 | ) where 8 | 9 | import Control.Exception (SomeException, handle) 10 | import System.IO 11 | import Test.HUnit (assertBool, assertFailure) 12 | import qualified Data.ByteString as B 13 | import Data.ByteString.Char8 () 14 | import qualified Data.ByteString.Lazy as LB 15 | import qualified Data.Text as T 16 | import qualified Data.Text.Encoding as TE 17 | import qualified Data.Text.IO as T 18 | import qualified Data.Text.Lazy as LT 19 | import qualified Data.Text.Lazy.Encoding as LE 20 | import qualified Test.Framework as F 21 | import qualified Test.Framework.Providers.HUnit as F 22 | 23 | import Tests.Utils (withTempFile) 24 | 25 | -- Reported by Michael Snoyman: UTF-8 encoding a large lazy bytestring 26 | -- caused either a segfault or attempt to allocate a negative number 27 | -- of bytes. 28 | lazy_encode_crash :: IO () 29 | lazy_encode_crash = withTempFile $ \ _ h -> 30 | LB.hPut h . LE.encodeUtf8 . LT.pack . replicate 100000 $ 'a' 31 | 32 | -- Reported by Pieter Laeremans: attempting to read an incorrectly 33 | -- encoded file can result in a crash in the RTS (i.e. not merely an 34 | -- exception). 35 | hGetContents_crash :: IO () 36 | hGetContents_crash = withTempFile $ \ path h -> do 37 | B.hPut h (B.pack [0x78, 0xc4 ,0x0a]) >> hClose h 38 | h' <- openFile path ReadMode 39 | hSetEncoding h' utf8 40 | handle (\(_::SomeException) -> return ()) $ 41 | T.hGetContents h' >> assertFailure "T.hGetContents should crash" 42 | 43 | -- Reported by Ian Lynagh: attempting to allocate a sufficiently large 44 | -- string (via either Array.new or Text.replicate) could result in an 45 | -- integer overflow. 46 | replicate_crash :: IO () 47 | replicate_crash = handle (\(_::SomeException) -> return ()) $ 48 | T.replicate (2^power) "0123456789abcdef" `seq` 49 | assertFailure "T.replicate should crash" 50 | where 51 | power | maxBound == (2147483647::Int) = 28 52 | | otherwise = 60 :: Int 53 | 54 | -- Reported by John Millikin: a UTF-8 decode error handler could 55 | -- return a bogus substitution character, which we would write without 56 | -- checking. 
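-- For contrast, a well-behaved handler substitutes U+FFFD; an editorial
-- sketch ('lenientDecode' lives in Data.Text.Encoding.Error, which this
-- module does not import):
--
-- > decodeUtf8With lenientDecode "\x80" == "\xfffd"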
57 | utf8_decode_unsafe :: IO () 58 | utf8_decode_unsafe = do 59 | let t = TE.decodeUtf8With (\_ _ -> Just '\xdc00') "\x80" 60 | assertBool "broken error recovery shouldn't break us" (t == "\xfffd") 61 | 62 | tests :: F.Test 63 | tests = F.testGroup "Regressions" 64 | [ F.testCase "hGetContents_crash" hGetContents_crash 65 | , F.testCase "lazy_encode_crash" lazy_encode_crash 66 | , F.testCase "replicate_crash" replicate_crash 67 | , F.testCase "utf8_decode_unsafe" utf8_decode_unsafe 68 | ] 69 | -------------------------------------------------------------------------------- /Data/Text/UnsafeChar.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, MagicHash #-} 2 | 3 | -- | 4 | -- Module : Data.Text.UnsafeChar 5 | -- Copyright : (c) 2008, 2009 Tom Harper, 6 | -- (c) 2009, 2010 Bryan O'Sullivan, 7 | -- (c) 2009 Duncan Coutts 8 | -- 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 11 | -- duncan@haskell.org 12 | -- Stability : experimental 13 | -- Portability : GHC 14 | -- 15 | -- Fast character manipulation functions. 16 | module Data.Text.UnsafeChar 17 | ( 18 | ord 19 | , unsafeChr 20 | , unsafeChr8 21 | , unsafeChr32 22 | , unsafeWrite 23 | -- , unsafeWriteRev 24 | ) where 25 | 26 | #ifdef ASSERTS 27 | import Control.Exception (assert) 28 | #endif 29 | import Control.Monad.ST (ST) 30 | import Data.Bits ((.&.)) 31 | import Data.Text.UnsafeShift (shiftR) 32 | import GHC.Exts (Char(..), Int(..), chr#, ord#, word2Int#) 33 | import GHC.Word (Word8(..), Word16(..), Word32(..)) 34 | import qualified Data.Text.Array as A 35 | 36 | ord :: Char -> Int 37 | ord (C# c#) = I# (ord# c#) 38 | {-# INLINE ord #-} 39 | 40 | unsafeChr :: Word16 -> Char 41 | unsafeChr (W16# w#) = C# (chr# (word2Int# w#)) 42 | {-# INLINE unsafeChr #-} 43 | 44 | unsafeChr8 :: Word8 -> Char 45 | unsafeChr8 (W8# w#) = C# (chr# (word2Int# w#)) 46 | {-# INLINE unsafeChr8 #-} 47 | 48 | unsafeChr32 :: Word32 -> Char 49 | unsafeChr32 (W32# w#) = C# (chr# (word2Int# w#)) 50 | {-# INLINE unsafeChr32 #-} 51 | 52 | -- | Write a character into the array at the given offset. Returns 53 | -- the number of 'Word16's written. 54 | unsafeWrite :: A.MArray s -> Int -> Char -> ST s Int 55 | unsafeWrite marr i c 56 | | n < 0x10000 = do 57 | #if defined(ASSERTS) 58 | assert (i >= 0) . assert (i < A.length marr) $ return () 59 | #endif 60 | A.unsafeWrite marr i (fromIntegral n) 61 | return 1 62 | | otherwise = do 63 | #if defined(ASSERTS) 64 | assert (i >= 0) . assert (i < A.length marr - 1) $ return () 65 | #endif 66 | A.unsafeWrite marr i lo 67 | A.unsafeWrite marr (i+1) hi 68 | return 2 69 | where n = ord c 70 | m = n - 0x10000 71 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 72 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 73 | {-# INLINE unsafeWrite #-} 74 | 75 | {- 76 | unsafeWriteRev :: A.MArray s Word16 -> Int -> Char -> ST s Int 77 | unsafeWriteRev marr i c 78 | | n < 0x10000 = do 79 | assert (i >= 0) . assert (i < A.length marr) $ 80 | A.unsafeWrite marr i (fromIntegral n) 81 | return (i-1) 82 | | otherwise = do 83 | assert (i >= 1) . assert (i < A.length marr) $ 84 | A.unsafeWrite marr (i-1) lo 85 | A.unsafeWrite marr i hi 86 | return (i-2) 87 | where n = ord c 88 | m = n - 0x10000 89 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 90 | hi = fromIntegral $ (m .&. 
0x3FF) + 0xDC00 91 | {-# INLINE unsafeWriteRev #-} 92 | -} 93 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks.hs: -------------------------------------------------------------------------------- 1 | -- | Main module to run the micro benchmarks 2 | -- 3 | {-# LANGUAGE OverloadedStrings #-} 4 | module Main 5 | ( main 6 | ) where 7 | 8 | import Criterion.Main (Benchmark, defaultMain, bgroup) 9 | import System.FilePath ((</>)) 10 | import System.IO (IOMode (WriteMode), openFile, hSetEncoding, utf8) 11 | 12 | import qualified Benchmarks.Builder as Builder 13 | import qualified Benchmarks.DecodeUtf8 as DecodeUtf8 14 | import qualified Benchmarks.EncodeUtf8 as EncodeUtf8 15 | import qualified Benchmarks.Equality as Equality 16 | import qualified Benchmarks.FileRead as FileRead 17 | import qualified Benchmarks.FoldLines as FoldLines 18 | import qualified Benchmarks.Pure as Pure 19 | import qualified Benchmarks.ReadNumbers as ReadNumbers 20 | import qualified Benchmarks.Replace as Replace 21 | import qualified Benchmarks.Search as Search 22 | import qualified Benchmarks.Stream as Stream 23 | import qualified Benchmarks.WordFrequencies as WordFrequencies 24 | 25 | import qualified Benchmarks.Programs.BigTable as Programs.BigTable 26 | import qualified Benchmarks.Programs.Cut as Programs.Cut 27 | import qualified Benchmarks.Programs.Fold as Programs.Fold 28 | import qualified Benchmarks.Programs.Sort as Programs.Sort 29 | import qualified Benchmarks.Programs.StripTags as Programs.StripTags 30 | import qualified Benchmarks.Programs.Throughput as Programs.Throughput 31 | 32 | main :: IO () 33 | main = benchmarks >>= defaultMain 34 | 35 | benchmarks :: IO [Benchmark] 36 | benchmarks = do 37 | sink <- openFile "/dev/null" WriteMode 38 | hSetEncoding sink utf8 39 | 40 | -- Traditional benchmarks 41 | bs <- sequence 42 | [ Builder.benchmark 43 | , DecodeUtf8.benchmark "html" (tf "libya-chinese.html") 44 | , DecodeUtf8.benchmark "xml" (tf "yiwiki.xml") 45 | , DecodeUtf8.benchmark "ascii" (tf "ascii.txt") 46 | , DecodeUtf8.benchmark "russian" (tf "russian.txt") 47 | , DecodeUtf8.benchmark "japanese" (tf "japanese.txt") 48 | , EncodeUtf8.benchmark "επανάληψη 竺法蘭共譯" 49 | , Equality.benchmark (tf "japanese.txt") 50 | , FileRead.benchmark (tf "russian.txt") 51 | , FoldLines.benchmark (tf "russian.txt") 52 | , Pure.benchmark "tiny" (tf "tiny.txt") 53 | , Pure.benchmark "japanese" (tf "japanese.txt") 54 | , ReadNumbers.benchmark (tf "numbers.txt") 55 | , Replace.benchmark (tf "russian.txt") "принимая" "своем" 56 | , Search.benchmark (tf "russian.txt") "принимая" 57 | , Stream.benchmark (tf "russian.txt") 58 | , WordFrequencies.benchmark (tf "russian.txt") 59 | ] 60 | 61 | -- Program-like benchmarks 62 | ps <- bgroup "Programs" `fmap` sequence 63 | [ Programs.BigTable.benchmark sink 64 | , Programs.Cut.benchmark (tf "russian.txt") sink 20 40 65 | , Programs.Fold.benchmark (tf "russian.txt") sink 66 | , Programs.Sort.benchmark (tf "russian.txt") sink 67 | , Programs.StripTags.benchmark (tf "yiwiki.xml") sink 68 | , Programs.Throughput.benchmark (tf "russian.txt") sink 69 | ] 70 | 71 | return $ bs ++ [ps] 72 | where 73 | -- Location of a test file 74 | tf = ("../tests/text-test-data" </>) 75 | -------------------------------------------------------------------------------- /tests/text-tests.cabal: -------------------------------------------------------------------------------- 1 | name: text-tests 2 | version: 0.0.0.0 3 | synopsis: Functional tests for the
text package 4 | description: Functional tests for the text package 5 | homepage: https://bitbucket.org/bos/text 6 | license: BSD3 7 | license-file: ../LICENSE 8 | author: Jasper Van der Jeugt , 9 | Bryan O'Sullivan , 10 | Tom Harper , 11 | Duncan Coutts 12 | maintainer: Bryan O'Sullivan 13 | category: Text 14 | build-type: Simple 15 | 16 | cabal-version: >=1.8 17 | 18 | flag hpc 19 | description: Enable HPC to generate coverage reports 20 | default: False 21 | 22 | executable text-tests 23 | main-is: Tests.hs 24 | 25 | ghc-options: 26 | -Wall -threaded -O0 -rtsopts 27 | 28 | if flag(hpc) 29 | ghc-options: 30 | -fhpc 31 | 32 | cpp-options: 33 | -DASSERTS 34 | -DHAVE_DEEPSEQ 35 | 36 | build-depends: 37 | HUnit >= 1.2, 38 | QuickCheck >= 2.4, 39 | base == 4.*, 40 | bytestring, 41 | deepseq, 42 | directory, 43 | random, 44 | test-framework >= 0.4, 45 | test-framework-hunit >= 0.2, 46 | test-framework-quickcheck2 >= 0.2, 47 | text-tests 48 | 49 | executable text-tests-stdio 50 | main-is: Tests/IO.hs 51 | 52 | ghc-options: 53 | -Wall -threaded -rtsopts 54 | 55 | -- Optional HPC support 56 | if flag(hpc) 57 | ghc-options: 58 | -fhpc 59 | 60 | build-depends: 61 | text-tests, 62 | base >= 4 && < 5 63 | 64 | library 65 | hs-source-dirs: .. 66 | c-sources: ../cbits/cbits.c 67 | exposed-modules: 68 | Data.Text 69 | Data.Text.Array 70 | Data.Text.Encoding 71 | Data.Text.Encoding.Error 72 | Data.Text.Encoding.Fusion 73 | Data.Text.Encoding.Fusion.Common 74 | Data.Text.Encoding.Utf16 75 | Data.Text.Encoding.Utf32 76 | Data.Text.Encoding.Utf8 77 | Data.Text.Foreign 78 | Data.Text.Fusion 79 | Data.Text.Fusion.CaseMapping 80 | Data.Text.Fusion.Common 81 | Data.Text.Fusion.Internal 82 | Data.Text.Fusion.Size 83 | Data.Text.IO 84 | Data.Text.IO.Internal 85 | Data.Text.Internal 86 | Data.Text.Lazy 87 | Data.Text.Lazy.Builder 88 | Data.Text.Lazy.Builder.Functions 89 | Data.Text.Lazy.Builder.Int 90 | Data.Text.Lazy.Builder.Int.Digits 91 | Data.Text.Lazy.Builder.Internal 92 | Data.Text.Lazy.Builder.RealFloat 93 | Data.Text.Lazy.Builder.RealFloat.Functions 94 | Data.Text.Lazy.Encoding 95 | Data.Text.Lazy.Encoding.Fusion 96 | Data.Text.Lazy.Fusion 97 | Data.Text.Lazy.IO 98 | Data.Text.Lazy.Internal 99 | Data.Text.Lazy.Read 100 | Data.Text.Lazy.Search 101 | Data.Text.Private 102 | Data.Text.Read 103 | Data.Text.Search 104 | Data.Text.Unsafe 105 | Data.Text.Unsafe.Base 106 | Data.Text.UnsafeChar 107 | Data.Text.UnsafeShift 108 | Data.Text.Util 109 | 110 | if flag(hpc) 111 | ghc-options: 112 | -fhpc 113 | 114 | cpp-options: 115 | -DHAVE_DEEPSEQ 116 | -DASSERTS 117 | -DINTEGER_GMP 118 | 119 | build-depends: 120 | array, 121 | base == 4.*, 122 | bytestring, 123 | deepseq, 124 | ghc-prim, 125 | integer-gmp 126 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Programs/Cut.hs: -------------------------------------------------------------------------------- 1 | -- | Cut into a file, selecting certain columns (e.g. 
columns 10 to 40) 2 | -- 3 | -- Tested in this benchmark: 4 | -- 5 | -- * Reading the file 6 | -- 7 | -- * Splitting into lines 8 | -- 9 | -- * Taking a number of characters from the lines 10 | -- 11 | -- * Joining the lines 12 | -- 13 | -- * Writing back to a handle 14 | -- 15 | module Benchmarks.Programs.Cut 16 | ( benchmark 17 | ) where 18 | 19 | import Criterion (Benchmark, bgroup, bench) 20 | import System.IO (Handle, hPutStr) 21 | import qualified Data.ByteString as B 22 | import qualified Data.ByteString.Char8 as BC 23 | import qualified Data.ByteString.Lazy as BL 24 | import qualified Data.ByteString.Lazy.Char8 as BLC 25 | import qualified Data.Text as T 26 | import qualified Data.Text.Encoding as T 27 | import qualified Data.Text.IO as T 28 | import qualified Data.Text.Lazy as TL 29 | import qualified Data.Text.Lazy.Encoding as TL 30 | import qualified Data.Text.Lazy.IO as TL 31 | 32 | benchmark :: FilePath -> Handle -> Int -> Int -> IO Benchmark 33 | benchmark p sink from to = return $ bgroup "Cut" 34 | [ bench' "String" string 35 | , bench' "ByteString" byteString 36 | , bench' "LazyByteString" lazyByteString 37 | , bench' "Text" text 38 | , bench' "LazyText" lazyText 39 | , bench' "TextByteString" textByteString 40 | , bench' "LazyTextByteString" lazyTextByteString 41 | ] 42 | where 43 | bench' n s = bench n (s p sink from to) 44 | 45 | string :: FilePath -> Handle -> Int -> Int -> IO () 46 | string fp sink from to = do 47 | s <- readFile fp 48 | hPutStr sink $ cut s 49 | where 50 | cut = unlines . map (take (to - from) . drop from) . lines 51 | 52 | byteString :: FilePath -> Handle -> Int -> Int -> IO () 53 | byteString fp sink from to = do 54 | bs <- B.readFile fp 55 | B.hPutStr sink $ cut bs 56 | where 57 | cut = BC.unlines . map (B.take (to - from) . B.drop from) . BC.lines 58 | 59 | lazyByteString :: FilePath -> Handle -> Int -> Int -> IO () 60 | lazyByteString fp sink from to = do 61 | bs <- BL.readFile fp 62 | BL.hPutStr sink $ cut bs 63 | where 64 | cut = BLC.unlines . map (BL.take (to' - from') . BL.drop from') . BLC.lines 65 | from' = fromIntegral from 66 | to' = fromIntegral to 67 | 68 | text :: FilePath -> Handle -> Int -> Int -> IO () 69 | text fp sink from to = do 70 | t <- T.readFile fp 71 | T.hPutStr sink $ cut t 72 | where 73 | cut = T.unlines . map (T.take (to - from) . T.drop from) . T.lines 74 | 75 | lazyText :: FilePath -> Handle -> Int -> Int -> IO () 76 | lazyText fp sink from to = do 77 | t <- TL.readFile fp 78 | TL.hPutStr sink $ cut t 79 | where 80 | cut = TL.unlines . map (TL.take (to' - from') . TL.drop from') . TL.lines 81 | from' = fromIntegral from 82 | to' = fromIntegral to 83 | 84 | textByteString :: FilePath -> Handle -> Int -> Int -> IO () 85 | textByteString fp sink from to = do 86 | t <- T.decodeUtf8 `fmap` B.readFile fp 87 | B.hPutStr sink $ T.encodeUtf8 $ cut t 88 | where 89 | cut = T.unlines . map (T.take (to - from) . T.drop from) . T.lines 90 | 91 | lazyTextByteString :: FilePath -> Handle -> Int -> Int -> IO () 92 | lazyTextByteString fp sink from to = do 93 | t <- TL.decodeUtf8 `fmap` BL.readFile fp 94 | BL.hPutStr sink $ TL.encodeUtf8 $ cut t 95 | where 96 | cut = TL.unlines . map (TL.take (to' - from') . TL.drop from') . 
TL.lines 97 | from' = fromIntegral from 98 | to' = fromIntegral to 99 | -------------------------------------------------------------------------------- /Data/Text/Search.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, ScopedTypeVariables #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Search 5 | -- Copyright : (c) Bryan O'Sullivan 2009 6 | -- 7 | -- License : BSD-style 8 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 9 | -- duncan@haskell.org 10 | -- Stability : experimental 11 | -- Portability : GHC 12 | -- 13 | -- Fast substring search for 'Text', based on work by Boyer, Moore, 14 | -- Horspool, Sunday, and Lundh. 15 | -- 16 | -- References: 17 | -- 18 | -- * R. S. Boyer, J. S. Moore: A Fast String Searching Algorithm. 19 | -- Communications of the ACM, 20, 10, 762-772 (1977) 20 | -- 21 | -- * R. N. Horspool: Practical Fast Searching in Strings. Software - 22 | -- Practice and Experience 10, 501-506 (1980) 23 | -- 24 | -- * D. M. Sunday: A Very Fast Substring Search Algorithm. 25 | -- Communications of the ACM, 33, 8, 132-142 (1990) 26 | -- 27 | -- * F. Lundh: The Fast Search Algorithm. 28 | -- (2006) 29 | 30 | module Data.Text.Search 31 | ( 32 | indices 33 | ) where 34 | 35 | import qualified Data.Text.Array as A 36 | import Data.Word (Word64) 37 | import Data.Text.Internal (Text(..)) 38 | import Data.Bits ((.|.), (.&.)) 39 | import Data.Text.UnsafeShift (shiftL) 40 | 41 | data T = {-# UNPACK #-} !Word64 :* {-# UNPACK #-} !Int 42 | 43 | -- | /O(n+m)/ Find the offsets of all non-overlapping indices of 44 | -- @needle@ within @haystack@. The offsets returned represent 45 | -- locations in the low-level array. 46 | -- 47 | -- In (unlikely) bad cases, this algorithm's complexity degrades 48 | -- towards /O(n*m)/. 49 | indices :: Text -- ^ Substring to search for (@needle@) 50 | -> Text -- ^ Text to search in (@haystack@) 51 | -> [Int] 52 | indices _needle@(Text narr noff nlen) _haystack@(Text harr hoff hlen) 53 | | nlen == 1 = scanOne (nindex 0) 54 | | nlen <= 0 || ldiff < 0 = [] 55 | | otherwise = scan 0 56 | where 57 | ldiff = hlen - nlen 58 | nlast = nlen - 1 59 | z = nindex nlast 60 | nindex k = A.unsafeIndex narr (noff+k) 61 | hindex k = A.unsafeIndex harr (hoff+k) 62 | hindex' k | k == hlen = 0 63 | | otherwise = A.unsafeIndex harr (hoff+k) 64 | buildTable !i !msk !skp 65 | | i >= nlast = (msk .|. swizzle z) :* skp 66 | | otherwise = buildTable (i+1) (msk .|. swizzle c) skp' 67 | where c = nindex i 68 | skp' | c == z = nlen - i - 2 69 | | otherwise = skp 70 | swizzle k = 1 `shiftL` (fromIntegral k .&. 0x3f) 71 | scan !i 72 | | i > ldiff = [] 73 | | c == z && candidateMatch 0 = i : scan (i + nlen) 74 | | otherwise = scan (i + delta) 75 | where c = hindex (i + nlast) 76 | candidateMatch !j 77 | | j >= nlast = True 78 | | hindex (i+j) /= nindex j = False 79 | | otherwise = candidateMatch (j+1) 80 | delta | nextInPattern = nlen + 1 81 | | c == z = skip + 1 82 | | otherwise = 1 83 | where nextInPattern = mask .&. 
swizzle (hindex' (i+nlen)) == 0 84 | !(mask :* skip) = buildTable 0 0 (nlen-2) 85 | scanOne c = loop 0 86 | where loop !i | i >= hlen = [] 87 | | hindex i == c = i : loop (i+1) 88 | | otherwise = loop (i+1) 89 | {-# INLINE indices #-} 90 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/Stream.hs: -------------------------------------------------------------------------------- 1 | -- | This module contains a number of benchmarks for the different streaming 2 | -- functions 3 | -- 4 | -- Tested in this benchmark: 5 | -- 6 | -- * Most streaming functions 7 | -- 8 | {-# LANGUAGE BangPatterns #-} 9 | {-# OPTIONS_GHC -fno-warn-orphans #-} 10 | module Benchmarks.Stream 11 | ( benchmark 12 | ) where 13 | 14 | import Control.DeepSeq (NFData (..)) 15 | import Criterion (Benchmark, bgroup, bench, nf) 16 | import Data.Text.Fusion.Internal (Step (..), Stream (..)) 17 | import qualified Data.Text.Encoding as T 18 | import qualified Data.Text.Encoding.Error as E 19 | import qualified Data.Text.Encoding.Fusion as T 20 | import qualified Data.Text.Encoding.Fusion.Common as F 21 | import qualified Data.Text.Fusion as T 22 | import qualified Data.Text.IO as T 23 | import qualified Data.Text.Lazy.Encoding as TL 24 | import qualified Data.Text.Lazy.Encoding.Fusion as TL 25 | import qualified Data.Text.Lazy.Fusion as TL 26 | import qualified Data.Text.Lazy.IO as TL 27 | 28 | instance NFData a => NFData (Stream a) where 29 | -- Currently, this implementation does not force evaluation of the size hint 30 | rnf (Stream next s0 _) = go s0 31 | where 32 | go !s = case next s of 33 | Done -> () 34 | Skip s' -> go s' 35 | Yield x s' -> rnf x `seq` go s' 36 | 37 | benchmark :: FilePath -> IO Benchmark 38 | benchmark fp = do 39 | -- Different formats 40 | t <- T.readFile fp 41 | let !utf8 = T.encodeUtf8 t 42 | !utf16le = T.encodeUtf16LE t 43 | !utf16be = T.encodeUtf16BE t 44 | !utf32le = T.encodeUtf32LE t 45 | !utf32be = T.encodeUtf32BE t 46 | 47 | -- Once again for the lazy variants 48 | tl <- TL.readFile fp 49 | let !utf8L = TL.encodeUtf8 tl 50 | !utf16leL = TL.encodeUtf16LE tl 51 | !utf16beL = TL.encodeUtf16BE tl 52 | !utf32leL = TL.encodeUtf32LE tl 53 | !utf32beL = TL.encodeUtf32BE tl 54 | 55 | -- For the functions which operate on streams 56 | let !s = T.stream t 57 | 58 | return $ bgroup "Stream" 59 | 60 | -- Fusion 61 | [ bgroup "stream" $ 62 | [ bench "Text" $ nf T.stream t 63 | , bench "LazyText" $ nf TL.stream tl 64 | ] 65 | 66 | -- Encoding.Fusion 67 | , bgroup "streamUtf8" 68 | [ bench "Text" $ nf (T.streamUtf8 E.lenientDecode) utf8 69 | , bench "LazyText" $ nf (TL.streamUtf8 E.lenientDecode) utf8L 70 | ] 71 | , bgroup "streamUtf16LE" 72 | [ bench "Text" $ nf (T.streamUtf16LE E.lenientDecode) utf16le 73 | , bench "LazyText" $ nf (TL.streamUtf16LE E.lenientDecode) utf16leL 74 | ] 75 | , bgroup "streamUtf16BE" 76 | [ bench "Text" $ nf (T.streamUtf16BE E.lenientDecode) utf16be 77 | , bench "LazyText" $ nf (TL.streamUtf16BE E.lenientDecode) utf16beL 78 | ] 79 | , bgroup "streamUtf32LE" 80 | [ bench "Text" $ nf (T.streamUtf32LE E.lenientDecode) utf32le 81 | , bench "LazyText" $ nf (TL.streamUtf32LE E.lenientDecode) utf32leL 82 | ] 83 | , bgroup "streamUtf32BE" 84 | [ bench "Text" $ nf (T.streamUtf32BE E.lenientDecode) utf32be 85 | , bench "LazyText" $ nf (TL.streamUtf32BE E.lenientDecode) utf32beL 86 | ] 87 | 88 | -- Encoding.Fusion.Common 89 | , bench "restreamUtf16LE" $ nf F.restreamUtf16LE s 90 | , bench "restreamUtf16BE" $ nf F.restreamUtf16BE s 
91 | , bench "restreamUtf32LE" $ nf F.restreamUtf32LE s 92 | , bench "restreamUtf32BE" $ nf F.restreamUtf32BE s 93 | ] 94 | -------------------------------------------------------------------------------- /benchmarks/haskell/Benchmarks/ReadNumbers.hs: -------------------------------------------------------------------------------- 1 | -- | Read numbers from a file with a just a number on each line, find the 2 | -- minimum of those numbers. The file contains different kinds of numbers: 3 | -- 4 | -- * Decimals 5 | -- 6 | -- * Hexadecimals 7 | -- 8 | -- * Floating point numbers 9 | -- 10 | -- * Floating point numbers in scientific notation 11 | -- 12 | -- The different benchmarks will only take into account the values they can 13 | -- parse. 14 | -- 15 | -- Tested in this benchmark: 16 | -- 17 | -- * Lexing/parsing of different numerical types 18 | -- 19 | module Benchmarks.ReadNumbers 20 | ( benchmark 21 | ) where 22 | 23 | import Criterion (Benchmark, bgroup, bench, whnf) 24 | import Data.List (foldl') 25 | import Numeric (readDec, readFloat, readHex) 26 | import qualified Data.ByteString.Char8 as B 27 | import qualified Data.ByteString.Lazy.Char8 as BL 28 | import qualified Data.ByteString.Lex.Double as B 29 | import qualified Data.ByteString.Lex.Lazy.Double as BL 30 | import qualified Data.Text as T 31 | import qualified Data.Text.IO as T 32 | import qualified Data.Text.Lazy as TL 33 | import qualified Data.Text.Lazy.IO as TL 34 | import qualified Data.Text.Lazy.Read as TL 35 | import qualified Data.Text.Read as T 36 | 37 | benchmark :: FilePath -> IO Benchmark 38 | benchmark fp = do 39 | -- Read all files into lines: string, text, lazy text, bytestring, lazy 40 | -- bytestring 41 | s <- lines `fmap` readFile fp 42 | t <- T.lines `fmap` T.readFile fp 43 | tl <- TL.lines `fmap` TL.readFile fp 44 | b <- B.lines `fmap` B.readFile fp 45 | bl <- BL.lines `fmap` BL.readFile fp 46 | return $ bgroup "ReadNumbers" 47 | [ bench "DecimalString" $ whnf (int . string readDec) s 48 | , bench "HexadecimalString" $ whnf (int . string readHex) s 49 | , bench "DoubleString" $ whnf (double . string readFloat) s 50 | 51 | , bench "DecimalText" $ whnf (int . text (T.signed T.decimal)) t 52 | , bench "HexadecimalText" $ whnf (int . text (T.signed T.hexadecimal)) t 53 | , bench "DoubleText" $ whnf (double . text T.double) t 54 | , bench "RationalText" $ whnf (double . text T.rational) t 55 | 56 | , bench "DecimalLazyText" $ 57 | whnf (int . text (TL.signed TL.decimal)) tl 58 | , bench "HexadecimalLazyText" $ 59 | whnf (int . text (TL.signed TL.hexadecimal)) tl 60 | , bench "DoubleLazyText" $ 61 | whnf (double . text TL.double) tl 62 | , bench "RationalLazyText" $ 63 | whnf (double . text TL.rational) tl 64 | 65 | , bench "DecimalByteString" $ whnf (int . byteString B.readInt) b 66 | , bench "DoubleByteString" $ whnf (double . byteString B.readDouble) b 67 | 68 | , bench "DecimalLazyByteString" $ 69 | whnf (int . byteString BL.readInt) bl 70 | , bench "DoubleLazyByteString" $ 71 | whnf (double . 
byteString BL.readDouble) bl 72 | ] 73 | where 74 | -- Used for fixing types 75 | int :: Int -> Int 76 | int = id 77 | double :: Double -> Double 78 | double = id 79 | 80 | string :: (Ord a, Num a) => (t -> [(a, t)]) -> [t] -> a 81 | string reader = foldl' go 1000000 82 | where 83 | go z t = case reader t of [(n, _)] -> min n z 84 | _ -> z 85 | 86 | text :: (Ord a, Num a) => (t -> Either String (a,t)) -> [t] -> a 87 | text reader = foldl' go 1000000 88 | where 89 | go z t = case reader t of Left _ -> z 90 | Right (n, _) -> min n z 91 | 92 | byteString :: (Ord a, Num a) => (t -> Maybe (a,t)) -> [t] -> a 93 | byteString reader = foldl' go 1000000 94 | where 95 | go z t = case reader t of Nothing -> z 96 | Just (n, _) -> min n z 97 | -------------------------------------------------------------------------------- /Data/Text/Lazy/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, DeriveDataTypeable #-} 2 | -- | 3 | -- Module : Data.Text.Lazy.Internal 4 | -- Copyright : (c) 2009, 2010 Bryan O'Sullivan 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 8 | -- duncan@haskell.org 9 | -- Stability : experimental 10 | -- Portability : GHC 11 | -- 12 | -- A module containing private 'Text' internals. This exposes the 13 | -- 'Text' representation and low level construction functions. 14 | -- Modules which extend the 'Text' system may need to use this module. 15 | -- 16 | -- You should not use this module unless you are determined to monkey 17 | -- with the internals, as the functions here do just about nothing to 18 | -- preserve data invariants. You have been warned! 19 | 20 | module Data.Text.Lazy.Internal 21 | ( 22 | Text(..) 23 | , chunk 24 | , empty 25 | , foldrChunks 26 | , foldlChunks 27 | -- * Data type invariant and abstraction functions 28 | 29 | -- $invariant 30 | , strictInvariant 31 | , lazyInvariant 32 | , showStructure 33 | 34 | -- * Chunk allocation sizes 35 | , defaultChunkSize 36 | , smallChunkSize 37 | , chunkOverhead 38 | ) where 39 | 40 | import Data.Text () 41 | import Data.Text.UnsafeShift (shiftL) 42 | import Data.Typeable (Typeable) 43 | import Foreign.Storable (sizeOf) 44 | import qualified Data.Text.Internal as T 45 | 46 | data Text = Empty 47 | | Chunk {-# UNPACK #-} !T.Text Text 48 | deriving (Typeable) 49 | 50 | -- $invariant 51 | -- 52 | -- The data type invariant for lazy 'Text': Every 'Text' is either 'Empty' or 53 | -- consists of non-null 'T.Text's. All functions must preserve this, 54 | -- and the QC properties must check this. 55 | 56 | -- | Check the invariant strictly. 57 | strictInvariant :: Text -> Bool 58 | strictInvariant Empty = True 59 | strictInvariant x@(Chunk (T.Text _ _ len) cs) 60 | | len > 0 = strictInvariant cs 61 | | otherwise = error $ "Data.Text.Lazy: invariant violation: " 62 | ++ showStructure x 63 | 64 | -- | Check the invariant lazily. 65 | lazyInvariant :: Text -> Text 66 | lazyInvariant Empty = Empty 67 | lazyInvariant x@(Chunk c@(T.Text _ _ len) cs) 68 | | len > 0 = Chunk c (lazyInvariant cs) 69 | | otherwise = error $ "Data.Text.Lazy: invariant violation: " 70 | ++ showStructure x 71 | 72 | -- | Display the internal structure of a lazy 'Text'. 73 | showStructure :: Text -> String 74 | showStructure Empty = "Empty" 75 | showStructure (Chunk t Empty) = "Chunk " ++ show t ++ " Empty" 76 | showStructure (Chunk t ts) = 77 | "Chunk " ++ show t ++ " (" ++ showStructure ts ++ ")" 78 | 79 | -- | Smart constructor for 'Chunk'. 
Guarantees the data type invariant. 80 | chunk :: T.Text -> Text -> Text 81 | {-# INLINE chunk #-} 82 | chunk t@(T.Text _ _ len) ts | len == 0 = ts 83 | | otherwise = Chunk t ts 84 | 85 | -- | Smart constructor for 'Empty'. 86 | empty :: Text 87 | {-# INLINE [0] empty #-} 88 | empty = Empty 89 | 90 | -- | Consume the chunks of a lazy 'Text' with a natural right fold. 91 | foldrChunks :: (T.Text -> a -> a) -> a -> Text -> a 92 | foldrChunks f z = go 93 | where go Empty = z 94 | go (Chunk c cs) = f c (go cs) 95 | {-# INLINE foldrChunks #-} 96 | 97 | -- | Consume the chunks of a lazy 'Text' with a strict, tail-recursive, 98 | -- accumulating left fold. 99 | foldlChunks :: (a -> T.Text -> a) -> a -> Text -> a 100 | foldlChunks f z = go z 101 | where go !a Empty = a 102 | go !a (Chunk c cs) = go (f a c) cs 103 | {-# INLINE foldlChunks #-} 104 | 105 | -- | Currently set to 16 KiB, less the memory management overhead. 106 | defaultChunkSize :: Int 107 | defaultChunkSize = 16384 - chunkOverhead 108 | {-# INLINE defaultChunkSize #-} 109 | 110 | -- | Currently set to 128 bytes, less the memory management overhead. 111 | smallChunkSize :: Int 112 | smallChunkSize = 128 - chunkOverhead 113 | {-# INLINE smallChunkSize #-} 114 | 115 | -- | The memory management overhead. Currently this is tuned for GHC only. 116 | chunkOverhead :: Int 117 | chunkOverhead = sizeOf (undefined :: Int) `shiftL` 1 118 | {-# INLINE chunkOverhead #-} 119 | -------------------------------------------------------------------------------- /Data/Text/Unsafe.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, MagicHash, UnboxedTuples #-} 2 | -- | 3 | -- Module : Data.Text.Unsafe 4 | -- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan 5 | -- License : BSD-style 6 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 7 | -- duncan@haskell.org 8 | -- Stability : experimental 9 | -- Portability : portable 10 | -- 11 | -- A module containing unsafe 'Text' operations, for very very careful 12 | -- use in heavily tested code. 13 | module Data.Text.Unsafe 14 | ( 15 | inlineInterleaveST 16 | , inlinePerformIO 17 | , unsafeDupablePerformIO 18 | , Iter(..) 19 | , iter 20 | , iter_ 21 | , reverseIter 22 | , unsafeHead 23 | , unsafeTail 24 | , lengthWord16 25 | , takeWord16 26 | , dropWord16 27 | ) where 28 | 29 | #if defined(ASSERTS) 30 | import Control.Exception (assert) 31 | #endif 32 | import Data.Text.Encoding.Utf16 (chr2) 33 | import Data.Text.Internal (Text(..)) 34 | import Data.Text.Unsafe.Base (inlineInterleaveST, inlinePerformIO) 35 | import Data.Text.UnsafeChar (unsafeChr) 36 | import qualified Data.Text.Array as A 37 | #if __GLASGOW_HASKELL__ >= 611 38 | import GHC.IO (unsafeDupablePerformIO) 39 | #else 40 | import GHC.IOBase (unsafeDupablePerformIO) 41 | #endif 42 | 43 | -- | /O(1)/ A variant of 'head' for non-empty 'Text'. 'unsafeHead' 44 | -- omits the check for the empty case, so there is an obligation on 45 | -- the programmer to provide a proof that the 'Text' is non-empty. 46 | unsafeHead :: Text -> Char 47 | unsafeHead (Text arr off _len) 48 | | m < 0xD800 || m > 0xDBFF = unsafeChr m 49 | | otherwise = chr2 m n 50 | where m = A.unsafeIndex arr off 51 | n = A.unsafeIndex arr (off+1) 52 | {-# INLINE unsafeHead #-} 53 | 54 | -- | /O(1)/ A variant of 'tail' for non-empty 'Text'. 'unsafeHead' 55 | -- omits the check for the empty case, so there is an obligation on 56 | -- the programmer to provide a proof that the 'Text' is non-empty. 
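-- A quick illustration (editorial sketch; assumes 'pack' from Data.Text and
-- a non-empty argument, as the precondition requires):
--
-- > unsafeHead (pack "abc") == 'a'
-- > unsafeTail (pack "abc") == pack "bc"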
57 | unsafeTail :: Text -> Text 58 | unsafeTail t@(Text arr off len) = 59 | #if defined(ASSERTS) 60 | assert (d <= len) $ 61 | #endif 62 | Text arr (off+d) (len-d) 63 | where d = iter_ t 0 64 | {-# INLINE unsafeTail #-} 65 | 66 | data Iter = Iter {-# UNPACK #-} !Char {-# UNPACK #-} !Int 67 | 68 | -- | /O(1)/ Iterate (unsafely) one step forwards through a UTF-16 69 | -- array, returning the current character and the delta to add to give 70 | -- the next offset to iterate at. 71 | iter :: Text -> Int -> Iter 72 | iter (Text arr off _len) i 73 | | m < 0xD800 || m > 0xDBFF = Iter (unsafeChr m) 1 74 | | otherwise = Iter (chr2 m n) 2 75 | where m = A.unsafeIndex arr j 76 | n = A.unsafeIndex arr k 77 | j = off + i 78 | k = j + 1 79 | {-# INLINE iter #-} 80 | 81 | -- | /O(1)/ Iterate one step through a UTF-16 array, returning the 82 | -- delta to add to give the next offset to iterate at. 83 | iter_ :: Text -> Int -> Int 84 | iter_ (Text arr off _len) i | m < 0xD800 || m > 0xDBFF = 1 85 | | otherwise = 2 86 | where m = A.unsafeIndex arr (off+i) 87 | {-# INLINE iter_ #-} 88 | 89 | -- | /O(1)/ Iterate one step backwards through a UTF-16 array, 90 | -- returning the current character and the delta to add (i.e. a 91 | -- negative number) to give the next offset to iterate at. 92 | reverseIter :: Text -> Int -> (Char,Int) 93 | reverseIter (Text arr off _len) i 94 | | m < 0xDC00 || m > 0xDFFF = (unsafeChr m, -1) 95 | | otherwise = (chr2 n m, -2) 96 | where m = A.unsafeIndex arr j 97 | n = A.unsafeIndex arr k 98 | j = off + i 99 | k = j - 1 100 | {-# INLINE reverseIter #-} 101 | 102 | -- | /O(1)/ Return the length of a 'Text' in units of 'Word16'. This 103 | -- is useful for sizing a target array appropriately before using 104 | -- 'unsafeCopyToPtr'. 105 | lengthWord16 :: Text -> Int 106 | lengthWord16 (Text _arr _off len) = len 107 | {-# INLINE lengthWord16 #-} 108 | 109 | -- | /O(1)/ Unchecked take of 'k' 'Word16's from the front of a 'Text'. 110 | takeWord16 :: Int -> Text -> Text 111 | takeWord16 k (Text arr off _len) = Text arr off k 112 | {-# INLINE takeWord16 #-} 113 | 114 | -- | /O(1)/ Unchecked drop of 'k' 'Word16's from the front of a 'Text'. 115 | dropWord16 :: Int -> Text -> Text 116 | dropWord16 k (Text arr off len) = Text arr (off+k) (len-k) 117 | {-# INLINE dropWord16 #-} 118 | -------------------------------------------------------------------------------- /Data/Text/Lazy/Fusion.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | -- | 3 | -- Module : Data.Text.Lazy.Fusion 4 | -- Copyright : (c) 2009, 2010 Bryan O'Sullivan 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 8 | -- duncan@haskell.org 9 | -- Stability : experimental 10 | -- Portability : GHC 11 | -- 12 | -- Core stream fusion functionality for text. 
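-- Converting to a 'Stream Char' and back is the identity on values, which is
-- exactly what the stream/unstream rewrite rule below exploits. A sketch
-- (editorial, assuming Data.Text.Lazy is imported as TL):
--
-- > unstream (stream (TL.pack "fusion")) == TL.pack "fusion"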
13 | 14 | module Data.Text.Lazy.Fusion 15 | ( 16 | stream 17 | , unstream 18 | , unstreamChunks 19 | , length 20 | , unfoldrN 21 | , index 22 | , countChar 23 | ) where 24 | 25 | import Prelude hiding (length) 26 | import qualified Data.Text.Fusion.Common as S 27 | import Control.Monad.ST (runST) 28 | import Data.Text.Fusion.Internal 29 | import Data.Text.Fusion.Size (isEmpty, unknownSize) 30 | import Data.Text.Lazy.Internal 31 | import qualified Data.Text.Internal as I 32 | import qualified Data.Text.Array as A 33 | import Data.Text.UnsafeChar (unsafeWrite) 34 | import Data.Text.UnsafeShift (shiftL) 35 | import Data.Text.Unsafe (Iter(..), iter) 36 | import Data.Int (Int64) 37 | 38 | default(Int64) 39 | 40 | -- | /O(n)/ Convert a 'Text' into a 'Stream Char'. 41 | stream :: Text -> Stream Char 42 | stream text = Stream next (text :*: 0) unknownSize 43 | where 44 | next (Empty :*: _) = Done 45 | next (txt@(Chunk t@(I.Text _ _ len) ts) :*: i) 46 | | i >= len = next (ts :*: 0) 47 | | otherwise = Yield c (txt :*: i+d) 48 | where Iter c d = iter t i 49 | {-# INLINE [0] stream #-} 50 | 51 | -- | /O(n)/ Convert a 'Stream Char' into a 'Text', using the given 52 | -- chunk size. 53 | unstreamChunks :: Int -> Stream Char -> Text 54 | unstreamChunks !chunkSize (Stream next s0 len0) 55 | | isEmpty len0 = Empty 56 | | otherwise = outer s0 57 | where 58 | outer so = {-# SCC "unstreamChunks/outer" #-} 59 | case next so of 60 | Done -> Empty 61 | Skip s' -> outer s' 62 | Yield x s' -> runST $ do 63 | a <- A.new unknownLength 64 | unsafeWrite a 0 x >>= inner a unknownLength s' 65 | where unknownLength = 4 66 | where 67 | inner marr !len s !i 68 | | i + 1 >= chunkSize = finish marr i s 69 | | i + 1 >= len = {-# SCC "unstreamChunks/resize" #-} do 70 | let newLen = min (len `shiftL` 1) chunkSize 71 | marr' <- A.new newLen 72 | A.copyM marr' 0 marr 0 len 73 | inner marr' newLen s i 74 | | otherwise = 75 | {-# SCC "unstreamChunks/inner" #-} 76 | case next s of 77 | Done -> finish marr i s 78 | Skip s' -> inner marr len s' i 79 | Yield x s' -> do d <- unsafeWrite marr i x 80 | inner marr len s' (i+d) 81 | finish marr len s' = do 82 | arr <- A.unsafeFreeze marr 83 | return (I.Text arr 0 len `Chunk` outer s') 84 | {-# INLINE [0] unstreamChunks #-} 85 | 86 | -- | /O(n)/ Convert a 'Stream Char' into a 'Text', using 87 | -- 'defaultChunkSize'. 88 | unstream :: Stream Char -> Text 89 | unstream = unstreamChunks defaultChunkSize 90 | {-# INLINE [0] unstream #-} 91 | 92 | -- | /O(n)/ Returns the number of characters in a text. 93 | length :: Stream Char -> Int64 94 | length = S.lengthI 95 | {-# INLINE[0] length #-} 96 | 97 | {-# RULES "LAZY STREAM stream/unstream fusion" forall s. 98 | stream (unstream s) = s #-} 99 | 100 | -- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a stream from a seed 101 | -- value. However, the length of the result is limited by the 102 | -- first argument to 'unfoldrN'. This function is more efficient than 103 | -- 'unfoldr' when the length of the result is known. 104 | unfoldrN :: Int64 -> (a -> Maybe (Char,a)) -> a -> Stream Char 105 | unfoldrN n = S.unfoldrNI n 106 | {-# INLINE [0] unfoldrN #-} 107 | 108 | -- | /O(n)/ stream index (subscript) operator, starting from 0. 109 | index :: Stream Char -> Int64 -> Char 110 | index = S.indexI 111 | {-# INLINE [0] index #-} 112 | 113 | -- | /O(n)/ The 'count' function returns the number of times the query 114 | -- element appears in the given stream. 
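-- For example (editorial sketch, again assuming Data.Text.Lazy as TL):
--
-- > countChar 'a' (stream (TL.pack "banana")) == 3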
115 | countChar :: Char -> Stream Char -> Int64 116 | countChar = S.countCharI 117 | {-# INLINE [0] countChar #-} 118 | -------------------------------------------------------------------------------- /Data/Text/Fusion/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, ExistentialQuantification #-} 2 | -- | 3 | -- Module : Data.Text.Fusion.Internal 4 | -- Copyright : (c) Tom Harper 2008-2009, 5 | -- (c) Bryan O'Sullivan 2009, 6 | -- (c) Duncan Coutts 2009, 7 | -- (c) Jasper Van der Jeugt 2011 8 | -- 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 11 | -- duncan@haskell.org 12 | -- Stability : experimental 13 | -- Portability : GHC 14 | -- 15 | -- Core stream fusion functionality for text. 16 | 17 | module Data.Text.Fusion.Internal 18 | ( 19 | CC(..) 20 | , M(..) 21 | , M8 22 | , PairS(..) 23 | , RS(..) 24 | , Step(..) 25 | , Stream(..) 26 | , Switch(..) 27 | , empty 28 | ) where 29 | 30 | import Data.Text.Fusion.Size 31 | import Data.Word (Word8) 32 | 33 | -- | Specialised tuple for case conversion. 34 | data CC s = CC !s {-# UNPACK #-} !Char {-# UNPACK #-} !Char 35 | 36 | -- | Specialised, strict Maybe-like type. 37 | data M a = N 38 | | J !a 39 | 40 | type M8 = M Word8 41 | 42 | -- Restreaming state. 43 | data RS s 44 | = RS0 !s 45 | | RS1 !s {-# UNPACK #-} !Word8 46 | | RS2 !s {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 47 | | RS3 !s {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 48 | 49 | infixl 2 :*: 50 | data PairS a b = !a :*: !b 51 | -- deriving (Eq, Ord, Show) 52 | 53 | -- | Allow a function over a stream to switch between two states. 54 | data Switch = S1 | S2 55 | 56 | data Step s a = Done 57 | | Skip !s 58 | | Yield !a !s 59 | 60 | {- 61 | instance (Show a) => Show (Step s a) 62 | where show Done = "Done" 63 | show (Skip _) = "Skip" 64 | show (Yield x _) = "Yield " ++ show x 65 | -} 66 | 67 | instance (Eq a) => Eq (Stream a) where 68 | (==) = eq 69 | 70 | instance (Ord a) => Ord (Stream a) where 71 | compare = cmp 72 | 73 | -- The length hint in a Stream has two roles. If its value is zero, 74 | -- we trust it, and treat the stream as empty. Otherwise, we treat it 75 | -- as a hint: it should usually be accurate, so we use it when 76 | -- unstreaming to decide what size array to allocate. However, the 77 | -- unstreaming functions must be able to cope with the hint being too 78 | -- small or too large. 79 | -- 80 | -- The size hint tries to track the UTF-16 code points in a stream, 81 | -- but often counts the number of characters instead. It can easily 82 | -- undercount if, for instance, a transformed stream contains astral 83 | -- plane characters (those above 0x10000). 84 | 85 | data Stream a = 86 | forall s. Stream 87 | (s -> Step s a) -- stepper function 88 | !s -- current state 89 | !Size -- size hint 90 | 91 | -- | /O(n)/ Determines if two streams are equal. 
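--
-- For exposition: 'eq' only consults the two stepper functions, so it
-- works for any 'Stream', including one built directly from a list.
-- The sketch below is for illustration only and is not a definition in
-- this module.
--
-- > listStream :: [a] -> Stream a
-- > listStream xs0 = Stream next xs0 unknownSize
-- >   where next []     = Done
-- >         next (x:xs) = Yield x xs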
92 | eq :: (Eq a) => Stream a -> Stream a -> Bool 93 | eq (Stream next1 s1 _) (Stream next2 s2 _) = loop (next1 s1) (next2 s2) 94 | where 95 | loop Done Done = True 96 | loop (Skip s1') (Skip s2') = loop (next1 s1') (next2 s2') 97 | loop (Skip s1') x2 = loop (next1 s1') x2 98 | loop x1 (Skip s2') = loop x1 (next2 s2') 99 | loop Done _ = False 100 | loop _ Done = False 101 | loop (Yield x1 s1') (Yield x2 s2') = x1 == x2 && 102 | loop (next1 s1') (next2 s2') 103 | {-# INLINE [0] eq #-} 104 | 105 | cmp :: (Ord a) => Stream a -> Stream a -> Ordering 106 | cmp (Stream next1 s1 _) (Stream next2 s2 _) = loop (next1 s1) (next2 s2) 107 | where 108 | loop Done Done = EQ 109 | loop (Skip s1') (Skip s2') = loop (next1 s1') (next2 s2') 110 | loop (Skip s1') x2 = loop (next1 s1') x2 111 | loop x1 (Skip s2') = loop x1 (next2 s2') 112 | loop Done _ = LT 113 | loop _ Done = GT 114 | loop (Yield x1 s1') (Yield x2 s2') = 115 | case compare x1 x2 of 116 | EQ -> loop (next1 s1') (next2 s2') 117 | other -> other 118 | {-# INLINE [0] cmp #-} 119 | 120 | -- | The empty stream. 121 | empty :: Stream a 122 | empty = Stream next () 0 123 | where next _ = Done 124 | {-# INLINE [0] empty #-} 125 | -------------------------------------------------------------------------------- /Data/Text/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, DeriveDataTypeable #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Internal 5 | -- Copyright : (c) 2008, 2009 Tom Harper, 6 | -- (c) 2009, 2010 Bryan O'Sullivan, 7 | -- (c) 2009 Duncan Coutts 8 | -- 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 11 | -- duncan@haskell.org 12 | -- Stability : experimental 13 | -- Portability : GHC 14 | -- 15 | -- A module containing private 'Text' internals. This exposes the 16 | -- 'Text' representation and low level construction functions. 17 | -- Modules which extend the 'Text' system may need to use this module. 18 | -- 19 | -- You should not use this module unless you are determined to monkey 20 | -- with the internals, as the functions here do just about nothing to 21 | -- preserve data invariants. You have been warned! 22 | 23 | module Data.Text.Internal 24 | ( 25 | -- * Types 26 | -- $internals 27 | Text(..) 28 | -- * Construction 29 | , text 30 | , textP 31 | -- * Safety 32 | , safe 33 | -- * Code that must be here for accessibility 34 | , empty 35 | -- * Utilities 36 | , firstf 37 | -- * Debugging 38 | , showText 39 | ) where 40 | 41 | #if defined(ASSERTS) 42 | import Control.Exception (assert) 43 | #endif 44 | import Data.Bits ((.&.)) 45 | import qualified Data.Text.Array as A 46 | import Data.Text.UnsafeChar (ord) 47 | import Data.Typeable (Typeable) 48 | 49 | -- | A space efficient, packed, unboxed Unicode text type. 50 | data Text = Text 51 | {-# UNPACK #-} !A.Array -- payload (Word16 elements) 52 | {-# UNPACK #-} !Int -- offset (units of Word16, not Char) 53 | {-# UNPACK #-} !Int -- length (units of Word16, not Char) 54 | deriving (Typeable) 55 | 56 | -- | Smart constructor. 57 | text :: A.Array -> Int -> Int -> Text 58 | text arr off len = 59 | #if defined(ASSERTS) 60 | let c = A.unsafeIndex arr off 61 | alen = A.length arr 62 | in assert (len >= 0) . 63 | assert (off >= 0) . 64 | assert (alen == 0 || len == 0 || off < alen) . 65 | assert (len == 0 || c < 0xDC00 || c > 0xDFFF) $ 66 | #endif 67 | Text arr off len 68 | {-# INLINE text #-} 69 | 70 | -- | /O(1)/ The empty 'Text'. 
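--
-- For illustration (a sketch, not an extra definition in this module):
-- 'empty' is the degenerate slice of the empty array, and could equally
-- be written with the smart constructor above, since every assertion is
-- trivially satisfied by a zero-length slice.
--
-- > emptySketch :: Text
-- > emptySketch = text A.empty 0 0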
71 | empty :: Text 72 | empty = Text A.empty 0 0 73 | {-# INLINE [1] empty #-} 74 | 75 | -- | Construct a 'Text' without invisibly pinning its byte array in 76 | -- memory if its length has dwindled to zero. 77 | textP :: A.Array -> Int -> Int -> Text 78 | textP arr off len | len == 0 = empty 79 | | otherwise = text arr off len 80 | {-# INLINE textP #-} 81 | 82 | -- | A useful 'show'-like function for debugging purposes. 83 | showText :: Text -> String 84 | showText (Text arr off len) = 85 | "Text " ++ show (A.toList arr off len) ++ ' ' : 86 | show off ++ ' ' : show len 87 | 88 | -- | Map a 'Char' to a 'Text'-safe value. 89 | -- 90 | -- UTF-16 surrogate code points are not included in the set of Unicode 91 | -- scalar values, but are unfortunately admitted as valid 'Char' 92 | -- values by Haskell. They cannot be represented in a 'Text'. This 93 | -- function remaps those code points to the Unicode replacement 94 | -- character (U+FFFD, \'�\'), and leaves other code points 95 | -- unchanged. 96 | safe :: Char -> Char 97 | safe c 98 | | ord c .&. 0x1ff800 /= 0xd800 = c 99 | | otherwise = '\xfffd' 100 | {-# INLINE safe #-} 101 | 102 | -- | Apply a function to the first element of an optional pair. 103 | firstf :: (a -> c) -> Maybe (a,b) -> Maybe (c,b) 104 | firstf f (Just (a, b)) = Just (f a, b) 105 | firstf _ Nothing = Nothing 106 | 107 | -- $internals 108 | -- 109 | -- Internally, the 'Text' type is represented as an array of 'Word16' 110 | -- UTF-16 code units. The offset and length fields in the constructor 111 | -- are in these units, /not/ units of 'Char'. 112 | -- 113 | -- Invariants that all functions must maintain: 114 | -- 115 | -- * Since the 'Text' type uses UTF-16 internally, it cannot represent 116 | -- characters in the reserved surrogate code point range U+D800 to 117 | -- U+DFFF. To maintain this invariant, the 'safe' function maps 118 | -- 'Char' values in this range to the replacement character (U+FFFD, 119 | -- \'�\'). 120 | -- 121 | -- * A leading (or \"high\") surrogate code unit (0xD800–0xDBFF) must 122 | -- always be followed by a trailing (or \"low\") surrogate code unit 123 | -- (0xDC00-0xDFFF). A trailing surrogate code unit must always be 124 | -- preceded by a leading surrogate code unit. 125 | -------------------------------------------------------------------------------- /Data/Text/Fusion/Size.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | {-# OPTIONS_GHC -fno-warn-missing-methods #-} 3 | -- | 4 | -- Module : Data.Text.Fusion.Internal 5 | -- Copyright : (c) Roman Leshchinskiy 2008, 6 | -- (c) Bryan O'Sullivan 2009 7 | -- 8 | -- License : BSD-style 9 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 10 | -- duncan@haskell.org 11 | -- Stability : experimental 12 | -- Portability : portable 13 | -- 14 | -- Size hints. 15 | 16 | module Data.Text.Fusion.Size 17 | ( 18 | Size 19 | , exactly 20 | , exactSize 21 | , maxSize 22 | , unknownSize 23 | , smaller 24 | , larger 25 | , upperBound 26 | , isEmpty 27 | ) where 28 | 29 | #if defined(ASSERTS) 30 | import Control.Exception (assert) 31 | #endif 32 | 33 | data Size = Exact {-# UNPACK #-} !Int -- ^ Exact size. 34 | | Max {-# UNPACK #-} !Int -- ^ Upper bound on size. 35 | | Unknown -- ^ Unknown size. 
36 | deriving (Eq, Show) 37 | 38 | exactly :: Size -> Maybe Int 39 | exactly (Exact n) = Just n 40 | exactly _ = Nothing 41 | {-# INLINE exactly #-} 42 | 43 | exactSize :: Int -> Size 44 | exactSize n = 45 | #if defined(ASSERTS) 46 | assert (n >= 0) 47 | #endif 48 | Exact n 49 | {-# INLINE exactSize #-} 50 | 51 | maxSize :: Int -> Size 52 | maxSize n = 53 | #if defined(ASSERTS) 54 | assert (n >= 0) 55 | #endif 56 | Max n 57 | {-# INLINE maxSize #-} 58 | 59 | unknownSize :: Size 60 | unknownSize = Unknown 61 | {-# INLINE unknownSize #-} 62 | 63 | instance Num Size where 64 | (+) = addSize 65 | (-) = subtractSize 66 | (*) = mulSize 67 | 68 | fromInteger = f where f = Exact . fromInteger 69 | {-# INLINE f #-} 70 | 71 | add :: Int -> Int -> Int 72 | add m n | mn >= 0 = mn 73 | | otherwise = overflowError 74 | where mn = m + n 75 | {-# INLINE add #-} 76 | 77 | addSize :: Size -> Size -> Size 78 | addSize (Exact m) (Exact n) = Exact (add m n) 79 | addSize (Exact m) (Max n) = Max (add m n) 80 | addSize (Max m) (Exact n) = Max (add m n) 81 | addSize (Max m) (Max n) = Max (add m n) 82 | addSize _ _ = Unknown 83 | {-# INLINE addSize #-} 84 | 85 | subtractSize :: Size -> Size -> Size 86 | subtractSize (Exact m) (Exact n) = Exact (max (m-n) 0) 87 | subtractSize (Exact m) (Max _) = Max m 88 | subtractSize (Max m) (Exact n) = Max (max (m-n) 0) 89 | subtractSize a@(Max _) (Max _) = a 90 | subtractSize a@(Max _) Unknown = a 91 | subtractSize _ _ = Unknown 92 | {-# INLINE subtractSize #-} 93 | 94 | mul :: Int -> Int -> Int 95 | mul m n 96 | | m <= maxBound `quot` n = m * n 97 | | otherwise = overflowError 98 | {-# INLINE mul #-} 99 | 100 | mulSize :: Size -> Size -> Size 101 | mulSize (Exact m) (Exact n) = Exact (mul m n) 102 | mulSize (Exact m) (Max n) = Max (mul m n) 103 | mulSize (Max m) (Exact n) = Max (mul m n) 104 | mulSize (Max m) (Max n) = Max (mul m n) 105 | mulSize _ _ = Unknown 106 | {-# INLINE mulSize #-} 107 | 108 | -- | Minimum of two size hints. 109 | smaller :: Size -> Size -> Size 110 | smaller (Exact m) (Exact n) = Exact (m `min` n) 111 | smaller (Exact m) (Max n) = Max (m `min` n) 112 | smaller (Exact m) Unknown = Max m 113 | smaller (Max m) (Exact n) = Max (m `min` n) 114 | smaller (Max m) (Max n) = Max (m `min` n) 115 | smaller a@(Max _) Unknown = a 116 | smaller Unknown (Exact n) = Max n 117 | smaller Unknown (Max n) = Max n 118 | smaller Unknown Unknown = Unknown 119 | {-# INLINE smaller #-} 120 | 121 | -- | Maximum of two size hints. 122 | larger :: Size -> Size -> Size 123 | larger (Exact m) (Exact n) = Exact (m `max` n) 124 | larger a@(Exact m) b@(Max n) | m >= n = a 125 | | otherwise = b 126 | larger a@(Max m) b@(Exact n) | n >= m = b 127 | | otherwise = a 128 | larger (Max m) (Max n) = Max (m `max` n) 129 | larger _ _ = Unknown 130 | {-# INLINE larger #-} 131 | 132 | -- | Compute the maximum size from a size hint, if possible. 
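--
-- A usage sketch (illustrative only): choose an allocation size from a
-- hint, falling back to a caller-supplied default when the hint carries
-- no information.
--
-- > -- upperBound 4 (exactSize 10) == 10
-- > -- upperBound 4 (maxSize 10)   == 10
-- > -- upperBound 4 unknownSize    == 4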
133 | upperBound :: Int -> Size -> Int 134 | upperBound _ (Exact n) = n 135 | upperBound _ (Max n) = n 136 | upperBound k _ = k 137 | {-# INLINE upperBound #-} 138 | 139 | isEmpty :: Size -> Bool 140 | isEmpty (Exact n) = n <= 0 141 | isEmpty (Max n) = n <= 0 142 | isEmpty _ = False 143 | {-# INLINE isEmpty #-} 144 | 145 | overflowError :: Int 146 | overflowError = error "Data.Text.Fusion.Size: size overflow" 147 | -------------------------------------------------------------------------------- /Data/Text/Encoding/Fusion/Common.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Encoding.Fusion.Common 5 | -- Copyright : (c) Tom Harper 2008-2009, 6 | -- (c) Bryan O'Sullivan 2009, 7 | -- (c) Duncan Coutts 2009, 8 | -- (c) Jasper Van der Jeugt 2011 9 | -- 10 | -- License : BSD-style 11 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 12 | -- duncan@haskell.org 13 | -- Stability : experimental 14 | -- Portability : portable 15 | -- 16 | -- Fusible 'Stream'-oriented functions for converting between 'Text' 17 | -- and several common encodings. 18 | 19 | module Data.Text.Encoding.Fusion.Common 20 | ( 21 | -- * Restreaming 22 | -- Restreaming is the act of converting from one 'Stream' 23 | -- representation to another. 24 | restreamUtf16LE 25 | , restreamUtf16BE 26 | , restreamUtf32LE 27 | , restreamUtf32BE 28 | ) where 29 | 30 | import Data.Bits ((.&.)) 31 | import Data.Text.Fusion (Step(..), Stream(..)) 32 | import Data.Text.Fusion.Internal (RS(..)) 33 | import Data.Text.UnsafeChar (ord) 34 | import Data.Text.UnsafeShift (shiftR) 35 | import Data.Word (Word8) 36 | 37 | restreamUtf16BE :: Stream Char -> Stream Word8 38 | restreamUtf16BE (Stream next0 s0 len) = Stream next (RS0 s0) (len * 2) 39 | where 40 | next (RS0 s) = case next0 s of 41 | Done -> Done 42 | Skip s' -> Skip (RS0 s') 43 | Yield x s' 44 | | n < 0x10000 -> Yield (fromIntegral $ n `shiftR` 8) $ 45 | RS1 s' (fromIntegral n) 46 | | otherwise -> Yield c1 $ RS3 s' c2 c3 c4 47 | where 48 | n = ord x 49 | n1 = n - 0x10000 50 | c1 = fromIntegral (n1 `shiftR` 18 + 0xD8) 51 | c2 = fromIntegral (n1 `shiftR` 10) 52 | n2 = n1 .&. 0x3FF 53 | c3 = fromIntegral (n2 `shiftR` 8 + 0xDC) 54 | c4 = fromIntegral n2 55 | next (RS1 s x2) = Yield x2 (RS0 s) 56 | next (RS2 s x2 x3) = Yield x2 (RS1 s x3) 57 | next (RS3 s x2 x3 x4) = Yield x2 (RS2 s x3 x4) 58 | {-# INLINE next #-} 59 | {-# INLINE restreamUtf16BE #-} 60 | 61 | restreamUtf16LE :: Stream Char -> Stream Word8 62 | restreamUtf16LE (Stream next0 s0 len) = Stream next (RS0 s0) (len * 2) 63 | where 64 | next (RS0 s) = case next0 s of 65 | Done -> Done 66 | Skip s' -> Skip (RS0 s') 67 | Yield x s' 68 | | n < 0x10000 -> Yield (fromIntegral n) $ 69 | RS1 s' (fromIntegral $ shiftR n 8) 70 | | otherwise -> Yield c1 $ RS3 s' c2 c3 c4 71 | where 72 | n = ord x 73 | n1 = n - 0x10000 74 | c2 = fromIntegral (shiftR n1 18 + 0xD8) 75 | c1 = fromIntegral (shiftR n1 10) 76 | n2 = n1 .&. 
0x3FF 77 | c4 = fromIntegral (shiftR n2 8 + 0xDC) 78 | c3 = fromIntegral n2 79 | next (RS1 s x2) = Yield x2 (RS0 s) 80 | next (RS2 s x2 x3) = Yield x2 (RS1 s x3) 81 | next (RS3 s x2 x3 x4) = Yield x2 (RS2 s x3 x4) 82 | {-# INLINE next #-} 83 | {-# INLINE restreamUtf16LE #-} 84 | 85 | restreamUtf32BE :: Stream Char -> Stream Word8 86 | restreamUtf32BE (Stream next0 s0 len) = Stream next (RS0 s0) (len * 2) 87 | where 88 | next (RS0 s) = case next0 s of 89 | Done -> Done 90 | Skip s' -> Skip (RS0 s') 91 | Yield x s' -> Yield c1 (RS3 s' c2 c3 c4) 92 | where 93 | n = ord x 94 | c1 = fromIntegral $ shiftR n 24 95 | c2 = fromIntegral $ shiftR n 16 96 | c3 = fromIntegral $ shiftR n 8 97 | c4 = fromIntegral n 98 | next (RS1 s x2) = Yield x2 (RS0 s) 99 | next (RS2 s x2 x3) = Yield x2 (RS1 s x3) 100 | next (RS3 s x2 x3 x4) = Yield x2 (RS2 s x3 x4) 101 | {-# INLINE next #-} 102 | {-# INLINE restreamUtf32BE #-} 103 | 104 | restreamUtf32LE :: Stream Char -> Stream Word8 105 | restreamUtf32LE (Stream next0 s0 len) = Stream next (RS0 s0) (len * 2) 106 | where 107 | next (RS0 s) = case next0 s of 108 | Done -> Done 109 | Skip s' -> Skip (RS0 s') 110 | Yield x s' -> Yield c1 (RS3 s' c2 c3 c4) 111 | where 112 | n = ord x 113 | c4 = fromIntegral $ shiftR n 24 114 | c3 = fromIntegral $ shiftR n 16 115 | c2 = fromIntegral $ shiftR n 8 116 | c1 = fromIntegral n 117 | next (RS1 s x2) = Yield x2 (RS0 s) 118 | next (RS2 s x2 x3) = Yield x2 (RS1 s x3) 119 | next (RS3 s x2 x3 x4) = Yield x2 (RS2 s x3 x4) 120 | {-# INLINE next #-} 121 | {-# INLINE restreamUtf32LE #-} 122 | -------------------------------------------------------------------------------- /Data/Text/Encoding/Error.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, DeriveDataTypeable #-} 2 | #if __GLASGOW_HASKELL__ >= 702 3 | {-# LANGUAGE Trustworthy #-} 4 | #endif 5 | -- | 6 | -- Module : Data.Text.Encoding.Error 7 | -- Copyright : (c) Bryan O'Sullivan 2009 8 | -- 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 11 | -- duncan@haskell.org 12 | -- Stability : experimental 13 | -- Portability : GHC 14 | -- 15 | -- Types and functions for dealing with encoding and decoding errors 16 | -- in Unicode text. 17 | -- 18 | -- The standard functions for encoding and decoding text are strict, 19 | -- which is to say that they throw exceptions on invalid input. This 20 | -- is often unhelpful on real world input, so alternative functions 21 | -- exist that accept custom handlers for dealing with invalid inputs. 22 | -- These 'OnError' handlers are normal Haskell functions. You can use 23 | -- one of the presupplied functions in this module, or you can write a 24 | -- custom handler of your own. 25 | 26 | module Data.Text.Encoding.Error 27 | ( 28 | -- * Error handling types 29 | UnicodeException(..) 30 | , OnError 31 | , OnDecodeError 32 | , OnEncodeError 33 | -- * Useful error handling functions 34 | , lenientDecode 35 | , strictDecode 36 | , strictEncode 37 | , ignore 38 | , replace 39 | ) where 40 | 41 | import Control.DeepSeq (NFData (..)) 42 | #if __GLASGOW_HASKELL__ >= 610 43 | import Control.Exception (Exception, throw) 44 | #else 45 | import Control.Exception.Extensible (Exception, throw) 46 | #endif 47 | import Data.Typeable (Typeable) 48 | import Data.Word (Word8) 49 | import Numeric (showHex) 50 | 51 | -- | Function type for handling a coding error. It is supplied with 52 | -- two inputs: 53 | -- 54 | -- * A 'String' that describes the error. 
55 | -- 56 | -- * The input value that caused the error. If the error arose 57 | -- because the end of input was reached or could not be identified 58 | -- precisely, this value will be 'Nothing'. 59 | -- 60 | -- If the handler returns a value wrapped with 'Just', that value will 61 | -- be used in the output as the replacement for the invalid input. If 62 | -- it returns 'Nothing', no value will be used in the output. 63 | -- 64 | -- Should the handler need to abort processing, it should use 'error' 65 | -- or 'throw' an exception (preferably a 'UnicodeException'). It may 66 | -- use the description provided to construct a more helpful error 67 | -- report. 68 | type OnError a b = String -> Maybe a -> Maybe b 69 | 70 | -- | A handler for a decoding error. 71 | type OnDecodeError = OnError Word8 Char 72 | 73 | -- | A handler for an encoding error. 74 | type OnEncodeError = OnError Char Word8 75 | 76 | -- | An exception type for representing Unicode encoding errors. 77 | data UnicodeException = 78 | DecodeError String (Maybe Word8) 79 | -- ^ Could not decode a byte sequence because it was invalid under 80 | -- the given encoding, or ran out of input in mid-decode. 81 | | EncodeError String (Maybe Char) 82 | -- ^ Tried to encode a character that could not be represented 83 | -- under the given encoding, or ran out of input in mid-encode. 84 | deriving (Eq, Typeable) 85 | 86 | showUnicodeException :: UnicodeException -> String 87 | showUnicodeException (DecodeError desc (Just w)) 88 | = "Cannot decode byte '\\x" ++ showHex w ("': " ++ desc) 89 | showUnicodeException (DecodeError desc Nothing) 90 | = "Cannot decode input: " ++ desc 91 | showUnicodeException (EncodeError desc (Just c)) 92 | = "Cannot encode character '\\x" ++ showHex (fromEnum c) ("': " ++ desc) 93 | showUnicodeException (EncodeError desc Nothing) 94 | = "Cannot encode input: " ++ desc 95 | 96 | instance Show UnicodeException where 97 | show = showUnicodeException 98 | 99 | instance Exception UnicodeException 100 | 101 | instance NFData UnicodeException where 102 | rnf (DecodeError desc w) = rnf desc `seq` rnf w `seq` () 103 | rnf (EncodeError desc c) = rnf desc `seq` rnf c `seq` () 104 | 105 | -- | Throw a 'UnicodeException' if decoding fails. 106 | strictDecode :: OnDecodeError 107 | strictDecode desc c = throw (DecodeError desc c) 108 | 109 | -- | Replace an invalid input byte with the Unicode replacement 110 | -- character U+FFFD. 111 | lenientDecode :: OnDecodeError 112 | lenientDecode _ _ = Just '\xfffd' 113 | 114 | -- | Throw a 'UnicodeException' if encoding fails. 115 | strictEncode :: OnEncodeError 116 | strictEncode desc c = throw (EncodeError desc c) 117 | 118 | -- | Ignore an invalid input, substituting nothing in the output. 119 | ignore :: OnError a b 120 | ignore _ _ = Nothing 121 | 122 | -- | Replace an invalid input with a valid output. 
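--
-- A usage sketch (assuming 'decodeUtf8With' from "Data.Text.Encoding",
-- which takes an 'OnDecodeError'): substitute a visible marker for each
-- undecodable byte instead of throwing an exception.
--
-- > import Data.ByteString (ByteString)
-- > import Data.Text (Text)
-- > import Data.Text.Encoding (decodeUtf8With)
-- >
-- > decodeWithMarker :: ByteString -> Text
-- > decodeWithMarker = decodeUtf8With (replace '?')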
123 | replace :: b -> OnError a b 124 | replace c _ _ = Just c 125 | -------------------------------------------------------------------------------- /Data/Text/Encoding/Utf8.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, MagicHash, BangPatterns #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Encoding.Utf8 5 | -- Copyright : (c) 2008, 2009 Tom Harper, 6 | -- (c) 2009, 2010 Bryan O'Sullivan, 7 | -- (c) 2009 Duncan Coutts 8 | -- 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 11 | -- duncan@haskell.org 12 | -- Stability : experimental 13 | -- Portability : GHC 14 | -- 15 | -- Basic UTF-8 validation and character manipulation. 16 | module Data.Text.Encoding.Utf8 17 | ( 18 | -- Decomposition 19 | ord2 20 | , ord3 21 | , ord4 22 | -- Construction 23 | , chr2 24 | , chr3 25 | , chr4 26 | -- * Validation 27 | , validate1 28 | , validate2 29 | , validate3 30 | , validate4 31 | ) where 32 | 33 | #if defined(ASSERTS) 34 | import Control.Exception (assert) 35 | #endif 36 | import Data.Bits ((.&.)) 37 | import Data.Text.UnsafeChar (ord) 38 | import Data.Text.UnsafeShift (shiftR) 39 | import GHC.Exts 40 | import GHC.Word (Word8(..)) 41 | 42 | default(Int) 43 | 44 | between :: Word8 -- ^ byte to check 45 | -> Word8 -- ^ lower bound 46 | -> Word8 -- ^ upper bound 47 | -> Bool 48 | between x y z = x >= y && x <= z 49 | {-# INLINE between #-} 50 | 51 | ord2 :: Char -> (Word8,Word8) 52 | ord2 c = 53 | #if defined(ASSERTS) 54 | assert (n >= 0x80 && n <= 0x07ff) 55 | #endif 56 | (x1,x2) 57 | where 58 | n = ord c 59 | x1 = fromIntegral $ (n `shiftR` 6) + 0xC0 60 | x2 = fromIntegral $ (n .&. 0x3F) + 0x80 61 | 62 | ord3 :: Char -> (Word8,Word8,Word8) 63 | ord3 c = 64 | #if defined(ASSERTS) 65 | assert (n >= 0x0800 && n <= 0xffff) 66 | #endif 67 | (x1,x2,x3) 68 | where 69 | n = ord c 70 | x1 = fromIntegral $ (n `shiftR` 12) + 0xE0 71 | x2 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80 72 | x3 = fromIntegral $ (n .&. 0x3F) + 0x80 73 | 74 | ord4 :: Char -> (Word8,Word8,Word8,Word8) 75 | ord4 c = 76 | #if defined(ASSERTS) 77 | assert (n >= 0x10000) 78 | #endif 79 | (x1,x2,x3,x4) 80 | where 81 | n = ord c 82 | x1 = fromIntegral $ (n `shiftR` 18) + 0xF0 83 | x2 = fromIntegral $ ((n `shiftR` 12) .&. 0x3F) + 0x80 84 | x3 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80 85 | x4 = fromIntegral $ (n .&. 
0x3F) + 0x80 86 | 87 | chr2 :: Word8 -> Word8 -> Char 88 | chr2 (W8# x1#) (W8# x2#) = C# (chr# (z1# +# z2#)) 89 | where 90 | !y1# = word2Int# x1# 91 | !y2# = word2Int# x2# 92 | !z1# = uncheckedIShiftL# (y1# -# 0xC0#) 6# 93 | !z2# = y2# -# 0x80# 94 | {-# INLINE chr2 #-} 95 | 96 | chr3 :: Word8 -> Word8 -> Word8 -> Char 97 | chr3 (W8# x1#) (W8# x2#) (W8# x3#) = C# (chr# (z1# +# z2# +# z3#)) 98 | where 99 | !y1# = word2Int# x1# 100 | !y2# = word2Int# x2# 101 | !y3# = word2Int# x3# 102 | !z1# = uncheckedIShiftL# (y1# -# 0xE0#) 12# 103 | !z2# = uncheckedIShiftL# (y2# -# 0x80#) 6# 104 | !z3# = y3# -# 0x80# 105 | {-# INLINE chr3 #-} 106 | 107 | chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char 108 | chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) = 109 | C# (chr# (z1# +# z2# +# z3# +# z4#)) 110 | where 111 | !y1# = word2Int# x1# 112 | !y2# = word2Int# x2# 113 | !y3# = word2Int# x3# 114 | !y4# = word2Int# x4# 115 | !z1# = uncheckedIShiftL# (y1# -# 0xF0#) 18# 116 | !z2# = uncheckedIShiftL# (y2# -# 0x80#) 12# 117 | !z3# = uncheckedIShiftL# (y3# -# 0x80#) 6# 118 | !z4# = y4# -# 0x80# 119 | {-# INLINE chr4 #-} 120 | 121 | validate1 :: Word8 -> Bool 122 | validate1 x1 = x1 <= 0x7F 123 | {-# INLINE validate1 #-} 124 | 125 | validate2 :: Word8 -> Word8 -> Bool 126 | validate2 x1 x2 = between x1 0xC2 0xDF && between x2 0x80 0xBF 127 | {-# INLINE validate2 #-} 128 | 129 | validate3 :: Word8 -> Word8 -> Word8 -> Bool 130 | {-# INLINE validate3 #-} 131 | validate3 x1 x2 x3 = validate3_1 || validate3_2 || validate3_3 || validate3_4 132 | where 133 | validate3_1 = (x1 == 0xE0) && 134 | between x2 0xA0 0xBF && 135 | between x3 0x80 0xBF 136 | validate3_2 = between x1 0xE1 0xEC && 137 | between x2 0x80 0xBF && 138 | between x3 0x80 0xBF 139 | validate3_3 = x1 == 0xED && 140 | between x2 0x80 0x9F && 141 | between x3 0x80 0xBF 142 | validate3_4 = between x1 0xEE 0xEF && 143 | between x2 0x80 0xBF && 144 | between x3 0x80 0xBF 145 | 146 | validate4 :: Word8 -> Word8 -> Word8 -> Word8 -> Bool 147 | {-# INLINE validate4 #-} 148 | validate4 x1 x2 x3 x4 = validate4_1 || validate4_2 || validate4_3 149 | where 150 | validate4_1 = x1 == 0xF0 && 151 | between x2 0x90 0xBF && 152 | between x3 0x80 0xBF && 153 | between x4 0x80 0xBF 154 | validate4_2 = between x1 0xF1 0xF3 && 155 | between x2 0x80 0xBF && 156 | between x3 0x80 0xBF && 157 | between x4 0x80 0xBF 158 | validate4_3 = x1 == 0xF4 && 159 | between x2 0x80 0x8F && 160 | between x3 0x80 0xBF && 161 | between x4 0x80 0xBF 162 | -------------------------------------------------------------------------------- /Data/Text/Lazy/Search.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, ScopedTypeVariables #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Lazy.Search 5 | -- Copyright : (c) 2009, 2010 Bryan O'Sullivan 6 | -- 7 | -- License : BSD-style 8 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 9 | -- duncan@haskell.org 10 | -- Stability : experimental 11 | -- Portability : GHC 12 | -- 13 | -- Fast substring search for lazy 'Text', based on work by Boyer, 14 | -- Moore, Horspool, Sunday, and Lundh. Adapted from the strict 15 | -- implementation. 
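--
-- A usage sketch (for exposition; 'pack' is from "Data.Text.Lazy"):
-- matches are reported as non-overlapping offsets in 'Word16' units.
--
-- > -- indices (pack "ana") (pack "bananas") == [1]
-- > -- the overlapping occurrence at offset 3 is not reported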
16 | 17 | module Data.Text.Lazy.Search 18 | ( 19 | indices 20 | ) where 21 | 22 | import qualified Data.Text.Array as A 23 | import Data.Int (Int64) 24 | import Data.Word (Word16, Word64) 25 | import qualified Data.Text.Internal as T 26 | import Data.Text.Fusion.Internal (PairS(..)) 27 | import Data.Text.Lazy.Internal (Text(..), foldlChunks) 28 | import Data.Bits ((.|.), (.&.)) 29 | import Data.Text.UnsafeShift (shiftL) 30 | 31 | -- | /O(n+m)/ Find the offsets of all non-overlapping indices of 32 | -- @needle@ within @haystack@. 33 | -- 34 | -- This function is strict in @needle@, and lazy (as far as possible) 35 | -- in the chunks of @haystack@. 36 | -- 37 | -- In (unlikely) bad cases, this algorithm's complexity degrades 38 | -- towards /O(n*m)/. 39 | indices :: Text -- ^ Substring to search for (@needle@) 40 | -> Text -- ^ Text to search in (@haystack@) 41 | -> [Int64] 42 | indices needle@(Chunk n ns) _haystack@(Chunk k ks) 43 | | nlen <= 0 = [] 44 | | nlen == 1 = indicesOne (nindex 0) 0 k ks 45 | | otherwise = advance k ks 0 0 46 | where 47 | advance x@(T.Text _ _ l) xs = scan 48 | where 49 | scan !g !i 50 | | i >= m = case xs of 51 | Empty -> [] 52 | Chunk y ys -> advance y ys g (i-m) 53 | | lackingHay (i + nlen) x xs = [] 54 | | c == z && candidateMatch 0 = g : scan (g+nlen) (i+nlen) 55 | | otherwise = scan (g+delta) (i+delta) 56 | where 57 | m = fromIntegral l 58 | c = hindex (i + nlast) 59 | delta | nextInPattern = nlen + 1 60 | | c == z = skip + 1 61 | | otherwise = 1 62 | nextInPattern = mask .&. swizzle (hindex (i+nlen)) == 0 63 | candidateMatch !j 64 | | j >= nlast = True 65 | | hindex (i+j) /= nindex j = False 66 | | otherwise = candidateMatch (j+1) 67 | hindex = index x xs 68 | nlen = wordLength needle 69 | nlast = nlen - 1 70 | nindex = index n ns 71 | z = foldlChunks fin 0 needle 72 | where fin _ (T.Text farr foff flen) = A.unsafeIndex farr (foff+flen-1) 73 | (mask :: Word64) :*: skip = buildTable n ns 0 0 0 (nlen-2) 74 | swizzle w = 1 `shiftL` (fromIntegral w .&. 0x3f) 75 | buildTable (T.Text xarr xoff xlen) xs = go 76 | where 77 | go !(g::Int64) !i !msk !skp 78 | | i >= xlast = case xs of 79 | Empty -> (msk .|. swizzle z) :*: skp 80 | Chunk y ys -> buildTable y ys g 0 msk' skp' 81 | | otherwise = go (g+1) (i+1) msk' skp' 82 | where c = A.unsafeIndex xarr (xoff+i) 83 | msk' = msk .|. swizzle c 84 | skp' | c == z = nlen - g - 2 85 | | otherwise = skp 86 | xlast = xlen - 1 87 | -- | Check whether an attempt to index into the haystack at the 88 | -- given offset would fail. 89 | lackingHay q = go 0 90 | where 91 | go p (T.Text _ _ l) ps = p' < q && case ps of 92 | Empty -> True 93 | Chunk r rs -> go p' r rs 94 | where p' = p + fromIntegral l 95 | indices _ _ = [] 96 | 97 | -- | Fast index into a partly unpacked 'Text'. We take into account 98 | -- the possibility that the caller might try to access one element 99 | -- past the end. 100 | index :: T.Text -> Text -> Int64 -> Word16 101 | index (T.Text arr off len) xs !i 102 | | j < len = A.unsafeIndex arr (off+j) 103 | | otherwise = case xs of 104 | Empty 105 | -- out of bounds, but legal 106 | | j == len -> 0 107 | -- should never happen, due to lackingHay above 108 | | otherwise -> emptyError "index" 109 | Chunk c cs -> index c cs (i-fromIntegral len) 110 | where j = fromIntegral i 111 | 112 | -- | A variant of 'indices' that scans linearly for a single 'Word16'. 
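--
-- For exposition: every matching code unit is reported, since a
-- single-unit needle cannot overlap itself.  For instance, scanning the
-- single chunk of the lazy text @banana@ for the code unit 0x61 (the
-- letter a), starting at offset 0, yields:
--
-- > -- [1,3,5]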
113 | indicesOne :: Word16 -> Int64 -> T.Text -> Text -> [Int64] 114 | indicesOne c = chunk 115 | where 116 | chunk !i (T.Text oarr ooff olen) os = go 0 117 | where 118 | go h | h >= olen = case os of 119 | Empty -> [] 120 | Chunk y ys -> chunk (i+fromIntegral olen) y ys 121 | | on == c = i + fromIntegral h : go (h+1) 122 | | otherwise = go (h+1) 123 | where on = A.unsafeIndex oarr (ooff+h) 124 | 125 | -- | The number of 'Word16' values in a 'Text'. 126 | wordLength :: Text -> Int64 127 | wordLength = foldlChunks sumLength 0 128 | where sumLength i (T.Text _ _ l) = i + fromIntegral l 129 | 130 | emptyError :: String -> a 131 | emptyError fun = error ("Data.Text.Lazy.Search." ++ fun ++ ": empty input") 132 | -------------------------------------------------------------------------------- /cbits/cbits.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2011 Bryan O'Sullivan . 3 | * 4 | * Portions copyright (c) 2008-2010 Björn Höhrmann . 5 | * 6 | * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | void _hs_text_memcpy(void *dest, size_t doff, const void *src, size_t soff, 14 | size_t n) 15 | { 16 | memcpy(dest + (doff<<1), src + (soff<<1), n<<1); 17 | } 18 | 19 | int _hs_text_memcmp(const void *a, size_t aoff, const void *b, size_t boff, 20 | size_t n) 21 | { 22 | return memcmp(a + (aoff<<1), b + (boff<<1), n<<1); 23 | } 24 | 25 | #define UTF8_ACCEPT 0 26 | #define UTF8_REJECT 12 27 | 28 | static const uint8_t utf8d[] = { 29 | /* 30 | * The first part of the table maps bytes to character classes that 31 | * to reduce the size of the transition table and create bitmasks. 32 | */ 33 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 34 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 35 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 36 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 37 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 38 | 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 39 | 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 40 | 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, 41 | 42 | /* 43 | * The second part is a transition table that maps a combination of 44 | * a state of the automaton and a character class to a state. 45 | */ 46 | 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, 47 | 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, 48 | 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, 49 | 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, 50 | 12,36,12,12,12,12,12,12,12,12,12,12, 51 | }; 52 | 53 | static inline uint32_t 54 | decode(uint32_t *state, uint32_t* codep, uint32_t byte) { 55 | uint32_t type = utf8d[byte]; 56 | 57 | *codep = (*state != UTF8_ACCEPT) ? 
58 | (byte & 0x3fu) | (*codep << 6) : 59 | (0xff >> type) & (byte); 60 | 61 | return *state = utf8d[256 + *state + type]; 62 | } 63 | 64 | /* 65 | * The ISO 8859-1 (aka latin-1) code points correspond exactly to the first 256 unicode 66 | * code-points, therefore we can trivially convert from a latin-1 encoded bytestring to 67 | * an UTF16 array 68 | */ 69 | void 70 | _hs_text_decode_latin1(uint16_t *dest, const uint8_t const *src, 71 | const uint8_t const *srcend) 72 | { 73 | const uint8_t *p = src; 74 | 75 | #if defined(__i386__) || defined(__x86_64__) 76 | /* This optimization works on a little-endian systems by using 77 | (aligned) 32-bit loads instead of 8-bit loads 78 | */ 79 | 80 | /* consume unaligned prefix */ 81 | while (p != srcend && (uintptr_t)p & 0x3) 82 | *dest++ = *p++; 83 | 84 | /* iterate over 32-bit aligned loads */ 85 | while (p < srcend - 3) { 86 | const uint32_t w = *((const uint32_t *)p); 87 | 88 | *dest++ = w & 0xff; 89 | *dest++ = (w >> 8) & 0xff; 90 | *dest++ = (w >> 16) & 0xff; 91 | *dest++ = (w >> 24) & 0xff; 92 | 93 | p += 4; 94 | } 95 | #endif 96 | 97 | /* handle unaligned suffix */ 98 | while (p != srcend) 99 | *dest++ = *p++; 100 | } 101 | 102 | /* 103 | * A best-effort decoder. Runs until it hits either end of input or 104 | * the start of an invalid byte sequence. 105 | * 106 | * At exit, updates *destoff with the next offset to write to, and 107 | * returns the next source offset to read from. 108 | */ 109 | uint8_t const * 110 | _hs_text_decode_utf8(uint16_t *dest, size_t *destoff, 111 | const uint8_t const *src, const uint8_t const *srcend) 112 | { 113 | uint16_t *d = dest + *destoff; 114 | const uint8_t const *s = src; 115 | uint32_t state = UTF8_ACCEPT; 116 | 117 | while (s < srcend) { 118 | uint32_t codepoint; 119 | 120 | #if defined(__i386__) || defined(__x86_64__) 121 | /* 122 | * This code will only work on a little-endian system that 123 | * supports unaligned loads. 124 | * 125 | * It gives a substantial speed win on data that is purely or 126 | * partly ASCII (e.g. HTML), at only a slight cost on purely 127 | * non-ASCII text. 128 | */ 129 | 130 | if (state == UTF8_ACCEPT) { 131 | while (s < srcend - 4) { 132 | codepoint = *((uint32_t *) s); 133 | if ((codepoint & 0x80808080) != 0) 134 | break; 135 | s += 4; 136 | 137 | /* 138 | * Tried 32-bit stores here, but the extra bit-twiddling 139 | * slowed the code down. 140 | */ 141 | 142 | *d++ = (uint16_t) (codepoint & 0xff); 143 | *d++ = (uint16_t) ((codepoint >> 8) & 0xff); 144 | *d++ = (uint16_t) ((codepoint >> 16) & 0xff); 145 | *d++ = (uint16_t) ((codepoint >> 24) & 0xff); 146 | } 147 | } 148 | #endif 149 | 150 | if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { 151 | if (state != UTF8_REJECT) 152 | continue; 153 | break; 154 | } 155 | 156 | if (codepoint <= 0xffff) 157 | *d++ = (uint16_t) codepoint; 158 | else { 159 | *d++ = (uint16_t) (0xD7C0 + (codepoint >> 10)); 160 | *d++ = (uint16_t) (0xDC00 + (codepoint & 0x3FF)); 161 | } 162 | } 163 | 164 | /* Error recovery - if we're not in a valid finishing state, back up. 
*/ 165 | if (state != UTF8_ACCEPT) 166 | s -= 1; 167 | 168 | *destoff = d - dest; 169 | 170 | return s; 171 | } 172 | -------------------------------------------------------------------------------- /Data/Text/Foreign.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, CPP, GeneralizedNewtypeDeriving #-} 2 | -- | 3 | -- Module : Data.Text.Foreign 4 | -- Copyright : (c) 2009, 2010 Bryan O'Sullivan 5 | -- 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 8 | -- duncan@haskell.org 9 | -- Stability : experimental 10 | -- Portability : GHC 11 | -- 12 | -- Support for using 'Text' data with native code via the Haskell 13 | -- foreign function interface. 14 | 15 | module Data.Text.Foreign 16 | ( 17 | -- * Interoperability with native code 18 | -- $interop 19 | I16 20 | -- * Safe conversion functions 21 | , fromPtr 22 | , useAsPtr 23 | , asForeignPtr 24 | -- * Unsafe conversion code 25 | , lengthWord16 26 | , unsafeCopyToPtr 27 | -- * Low-level manipulation 28 | -- $lowlevel 29 | , dropWord16 30 | , takeWord16 31 | ) where 32 | 33 | #if defined(ASSERTS) 34 | import Control.Exception (assert) 35 | #endif 36 | #if __GLASGOW_HASKELL__ >= 702 37 | import Control.Monad.ST.Unsafe (unsafeIOToST) 38 | #else 39 | import Control.Monad.ST (unsafeIOToST) 40 | #endif 41 | import Data.Text.Internal (Text(..), empty) 42 | import Data.Text.Unsafe (lengthWord16) 43 | import qualified Data.Text.Array as A 44 | import Data.Word (Word16) 45 | import Foreign.Marshal.Alloc (allocaBytes) 46 | import Foreign.Ptr (Ptr, castPtr, plusPtr) 47 | import Foreign.ForeignPtr (ForeignPtr, mallocForeignPtrArray, withForeignPtr) 48 | import Foreign.Storable (peek, poke) 49 | 50 | -- $interop 51 | -- 52 | -- The 'Text' type is implemented using arrays that are not guaranteed 53 | -- to have a fixed address in the Haskell heap. All communication with 54 | -- native code must thus occur by copying data back and forth. 55 | -- 56 | -- The 'Text' type's internal representation is UTF-16, using the 57 | -- platform's native endianness. This makes copied data suitable for 58 | -- use with native libraries that use a similar representation, such 59 | -- as ICU. To interoperate with native libraries that use different 60 | -- internal representations, such as UTF-8 or UTF-32, consider using 61 | -- the functions in the 'Data.Text.Encoding' module. 62 | 63 | -- | A type representing a number of UTF-16 code units. 64 | newtype I16 = I16 Int 65 | deriving (Bounded, Enum, Eq, Integral, Num, Ord, Read, Real, Show) 66 | 67 | -- | /O(n)/ Create a new 'Text' from a 'Ptr' 'Word16' by copying the 68 | -- contents of the array. 69 | fromPtr :: Ptr Word16 -- ^ source array 70 | -> I16 -- ^ length of source array (in 'Word16' units) 71 | -> IO Text 72 | fromPtr _ (I16 0) = return empty 73 | fromPtr ptr (I16 len) = 74 | #if defined(ASSERTS) 75 | assert (len > 0) $ 76 | #endif 77 | return $! Text arr 0 len 78 | where 79 | arr = A.run (A.new len >>= copy) 80 | copy marr = loop ptr 0 81 | where 82 | loop !p !i | i == len = return marr 83 | | otherwise = do 84 | A.unsafeWrite marr i =<< unsafeIOToST (peek p) 85 | loop (p `plusPtr` 2) (i + 1) 86 | 87 | -- $lowlevel 88 | -- 89 | -- Foreign functions that use UTF-16 internally may return indices in 90 | -- units of 'Word16' instead of characters. These functions may 91 | -- safely be used with such indices, as they will adjust offsets if 92 | -- necessary to preserve the validity of a Unicode string. 
93 | 94 | -- | /O(1)/ Return the prefix of the 'Text' of @n@ 'Word16' units in 95 | -- length. 96 | -- 97 | -- If @n@ would cause the 'Text' to end inside a surrogate pair, the 98 | -- end of the prefix will be advanced by one additional 'Word16' unit 99 | -- to maintain its validity. 100 | takeWord16 :: I16 -> Text -> Text 101 | takeWord16 (I16 n) t@(Text arr off len) 102 | | n <= 0 = empty 103 | | n >= len || m >= len = t 104 | | otherwise = Text arr off m 105 | where 106 | m | w < 0xDB00 || w > 0xD8FF = n 107 | | otherwise = n+1 108 | w = A.unsafeIndex arr (off+n-1) 109 | 110 | -- | /O(1)/ Return the suffix of the 'Text', with @n@ 'Word16' units 111 | -- dropped from its beginning. 112 | -- 113 | -- If @n@ would cause the 'Text' to begin inside a surrogate pair, the 114 | -- beginning of the suffix will be advanced by one additional 'Word16' 115 | -- unit to maintain its validity. 116 | dropWord16 :: I16 -> Text -> Text 117 | dropWord16 (I16 n) t@(Text arr off len) 118 | | n <= 0 = t 119 | | n >= len || m >= len = empty 120 | | otherwise = Text arr (off+m) (len-m) 121 | where 122 | m | w < 0xD800 || w > 0xDBFF = n 123 | | otherwise = n+1 124 | w = A.unsafeIndex arr (off+n-1) 125 | 126 | -- | /O(n)/ Copy a 'Text' to an array. The array is assumed to be big 127 | -- enough to hold the contents of the entire 'Text'. 128 | unsafeCopyToPtr :: Text -> Ptr Word16 -> IO () 129 | unsafeCopyToPtr (Text arr off len) ptr = loop ptr off 130 | where 131 | end = off + len 132 | loop !p !i | i == end = return () 133 | | otherwise = do 134 | poke p (A.unsafeIndex arr i) 135 | loop (p `plusPtr` 2) (i + 1) 136 | 137 | -- | /O(n)/ Perform an action on a temporary, mutable copy of a 138 | -- 'Text'. The copy is freed as soon as the action returns. 139 | useAsPtr :: Text -> (Ptr Word16 -> I16 -> IO a) -> IO a 140 | useAsPtr t@(Text _arr _off len) action = 141 | allocaBytes (len * 2) $ \buf -> do 142 | unsafeCopyToPtr t buf 143 | action (castPtr buf) (fromIntegral len) 144 | 145 | -- | /O(n)/ Make a mutable copy of a 'Text'. 146 | asForeignPtr :: Text -> IO (ForeignPtr Word16, I16) 147 | asForeignPtr t@(Text _arr _off len) = do 148 | fp <- mallocForeignPtrArray len 149 | withForeignPtr fp $ unsafeCopyToPtr t 150 | return (fp, I16 len) 151 | -------------------------------------------------------------------------------- /text.cabal: -------------------------------------------------------------------------------- 1 | name: text 2 | version: 0.11.3.1 3 | homepage: https://github.com/bos/text 4 | bug-reports: https://github.com/bos/text/issues 5 | synopsis: An efficient packed Unicode text type. 6 | description: 7 | . 8 | An efficient packed, immutable Unicode text type (both strict and 9 | lazy), with a powerful loop fusion optimization framework. 10 | . 11 | The 'Text' type represents Unicode character strings, in a time and 12 | space-efficient manner. This package provides text processing 13 | capabilities that are optimized for performance critical use, both 14 | in terms of large data quantities and high speed. 15 | . 16 | The 'Text' type provides character-encoding, type-safe case 17 | conversion via whole-string case conversion functions. It also 18 | provides a range of functions for converting 'Text' values to and from 19 | 'ByteStrings', using several standard encodings. 20 | . 21 | Efficient locale-sensitive support for text IO is also supported. 22 | . 23 | These modules are intended to be imported qualified, to avoid name 24 | clashes with Prelude functions, e.g. 25 | . 
26 | > import qualified Data.Text as T 27 | . 28 | To use an extended and very rich family of functions for working 29 | with Unicode text (including normalization, regular expressions, 30 | non-standard encodings, text breaking, and locales), see 31 | the @text-icu@ package: 32 | 33 | . 34 | —— RELEASE NOTES —— 35 | . 36 | Changes in 0.11.2.0: 37 | . 38 | * String literals are now converted directly from the format in 39 | which GHC stores them into 'Text', without an intermediate 40 | transformation through 'String', and without inlining of 41 | conversion code at each site where a string literal is declared. 42 | . 43 | license: BSD3 44 | license-file: LICENSE 45 | author: Bryan O'Sullivan 46 | maintainer: Bryan O'Sullivan 47 | copyright: 2009-2011 Bryan O'Sullivan, 2008-2009 Tom Harper 48 | category: Data, Text 49 | build-type: Simple 50 | cabal-version: >= 1.8 51 | extra-source-files: 52 | -- scripts/CaseFolding.txt 53 | -- scripts/SpecialCasing.txt 54 | README.markdown 55 | benchmarks/Setup.hs 56 | benchmarks/cbits/*.c 57 | benchmarks/haskell/*.hs 58 | benchmarks/haskell/Benchmarks/*.hs 59 | benchmarks/python/*.py 60 | benchmarks/ruby/*.rb 61 | benchmarks/text-benchmarks.cabal 62 | scripts/*.hs 63 | tests-and-benchmarks.markdown 64 | tests/*.hs 65 | tests/.ghci 66 | tests/Makefile 67 | tests/Tests/*.hs 68 | tests/scripts/*.sh 69 | tests/text-tests.cabal 70 | 71 | flag developer 72 | description: operate in developer mode 73 | default: False 74 | 75 | flag integer-simple 76 | description: Use the simple integer library instead of GMP 77 | default: False 78 | 79 | library 80 | c-sources: cbits/cbits.c 81 | 82 | exposed-modules: 83 | Data.Text 84 | Data.Text.Array 85 | Data.Text.Encoding 86 | Data.Text.Encoding.Error 87 | Data.Text.Foreign 88 | Data.Text.IO 89 | Data.Text.Internal 90 | Data.Text.Lazy 91 | Data.Text.Lazy.Builder 92 | Data.Text.Lazy.Builder.Int 93 | Data.Text.Lazy.Builder.RealFloat 94 | Data.Text.Lazy.Encoding 95 | Data.Text.Lazy.IO 96 | Data.Text.Lazy.Internal 97 | Data.Text.Lazy.Read 98 | Data.Text.Read 99 | Data.Text.Unsafe 100 | other-modules: 101 | Data.Text.Encoding.Fusion 102 | Data.Text.Encoding.Fusion.Common 103 | Data.Text.Encoding.Utf16 104 | Data.Text.Encoding.Utf32 105 | Data.Text.Encoding.Utf8 106 | Data.Text.Fusion 107 | Data.Text.Fusion.CaseMapping 108 | Data.Text.Fusion.Common 109 | Data.Text.Fusion.Internal 110 | Data.Text.Fusion.Size 111 | Data.Text.IO.Internal 112 | Data.Text.Lazy.Builder.Functions 113 | Data.Text.Lazy.Builder.Int.Digits 114 | Data.Text.Lazy.Builder.Internal 115 | Data.Text.Lazy.Builder.RealFloat.Functions 116 | Data.Text.Lazy.Encoding.Fusion 117 | Data.Text.Lazy.Fusion 118 | Data.Text.Lazy.Search 119 | Data.Text.Private 120 | Data.Text.Search 121 | Data.Text.Unsafe.Base 122 | Data.Text.UnsafeChar 123 | Data.Text.UnsafeShift 124 | Data.Text.Util 125 | 126 | build-depends: 127 | array, 128 | base < 5, 129 | bytestring >= 0.9 130 | if impl(ghc >= 6.10) 131 | build-depends: 132 | ghc-prim, base >= 4, deepseq >= 1.1.0.0 133 | cpp-options: -DHAVE_DEEPSEQ 134 | else 135 | build-depends: extensible-exceptions 136 | extensions: ScopedTypeVariables 137 | 138 | ghc-options: -Wall -funbox-strict-fields -O2 139 | if impl(ghc >= 6.8) 140 | ghc-options: -fwarn-tabs 141 | if flag(developer) 142 | ghc-prof-options: -auto-all 143 | ghc-options: -Werror 144 | cpp-options: -DASSERTS 145 | 146 | if impl(ghc >= 6.11) 147 | if flag(integer-simple) 148 | cpp-options: -DINTEGER_SIMPLE 149 | build-depends: integer-simple >= 0.1 && < 0.5 150 | else 151 | 
cpp-options: -DINTEGER_GMP 152 | build-depends: integer-gmp >= 0.2 153 | 154 | if impl(ghc >= 6.9) && impl(ghc < 6.11) 155 | cpp-options: -DINTEGER_GMP 156 | build-depends: integer >= 0.1 && < 0.2 157 | 158 | test-suite tests 159 | type: exitcode-stdio-1.0 160 | hs-source-dirs: tests . 161 | main-is: Tests.hs 162 | c-sources: cbits/cbits.c 163 | 164 | ghc-options: 165 | -Wall -threaded -O0 -rtsopts 166 | 167 | cpp-options: 168 | -DASSERTS -DHAVE_DEEPSEQ 169 | 170 | build-depends: 171 | HUnit >= 1.2, 172 | QuickCheck >= 2.4, 173 | array, 174 | base, 175 | bytestring, 176 | deepseq, 177 | directory, 178 | ghc-prim, 179 | random, 180 | test-framework >= 0.4, 181 | test-framework-hunit >= 0.2, 182 | test-framework-quickcheck2 >= 0.2 183 | 184 | if impl(ghc >= 6.11) 185 | if flag(integer-simple) 186 | cpp-options: -DINTEGER_SIMPLE 187 | build-depends: integer-simple >= 0.1 && < 0.5 188 | else 189 | cpp-options: -DINTEGER_GMP 190 | build-depends: integer-gmp >= 0.2 191 | 192 | if impl(ghc >= 6.9) && impl(ghc < 6.11) 193 | cpp-options: -DINTEGER_GMP 194 | build-depends: integer >= 0.1 && < 0.2 195 | 196 | source-repository head 197 | type: git 198 | location: https://github.com/bos/text 199 | 200 | source-repository head 201 | type: mercurial 202 | location: https://bitbucket.org/bos/text 203 | -------------------------------------------------------------------------------- /Data/Text/IO/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, CPP, RecordWildCards #-} 2 | -- | 3 | -- Module : Data.Text.IO.Internal 4 | -- Copyright : (c) 2009, 2010 Bryan O'Sullivan, 5 | -- (c) 2009 Simon Marlow 6 | -- License : BSD-style 7 | -- Maintainer : bos@serpentine.com 8 | -- Stability : experimental 9 | -- Portability : GHC 10 | -- 11 | -- Low-level support for text I\/O. 12 | 13 | module Data.Text.IO.Internal 14 | ( 15 | #if __GLASGOW_HASKELL__ >= 612 16 | hGetLineWith 17 | , readChunk 18 | #endif 19 | ) where 20 | 21 | #if __GLASGOW_HASKELL__ >= 612 22 | import qualified Control.Exception as E 23 | import Data.IORef (readIORef, writeIORef) 24 | import Data.Text (Text) 25 | import Data.Text.Fusion (unstream) 26 | import Data.Text.Fusion.Internal (Step(..), Stream(..)) 27 | import Data.Text.Fusion.Size (exactSize, maxSize) 28 | import Data.Text.Unsafe (inlinePerformIO) 29 | import Foreign.Storable (peekElemOff) 30 | import GHC.IO.Buffer (Buffer(..), CharBuffer, RawCharBuffer, bufferAdjustL, 31 | bufferElems, charSize, isEmptyBuffer, readCharBuf, 32 | withRawBuffer, writeCharBuf) 33 | import GHC.IO.Handle.Internals (ioe_EOF, readTextDevice, wantReadableHandle_) 34 | import GHC.IO.Handle.Types (Handle__(..), Newline(..)) 35 | import System.IO (Handle) 36 | import System.IO.Error (isEOFError) 37 | import qualified Data.Text as T 38 | 39 | -- | Read a single line of input from a handle, constructing a list of 40 | -- decoded chunks as we go. When we're done, transform them into the 41 | -- destination type. 42 | hGetLineWith :: ([Text] -> t) -> Handle -> IO t 43 | hGetLineWith f h = wantReadableHandle_ "hGetLine" h go 44 | where 45 | go hh@Handle__{..} = readIORef haCharBuffer >>= fmap f . 
hGetLineLoop hh [] 46 | 47 | hGetLineLoop :: Handle__ -> [Text] -> CharBuffer -> IO [Text] 48 | hGetLineLoop hh@Handle__{..} = go where 49 | go ts buf@Buffer{ bufL=r0, bufR=w, bufRaw=raw0 } = do 50 | let findEOL raw r | r == w = return (False, w) 51 | | otherwise = do 52 | (c,r') <- readCharBuf raw r 53 | if c == '\n' 54 | then return (True, r) 55 | else findEOL raw r' 56 | (eol, off) <- findEOL raw0 r0 57 | (t,r') <- if haInputNL == CRLF 58 | then unpack_nl raw0 r0 off 59 | else do t <- unpack raw0 r0 off 60 | return (t,off) 61 | if eol 62 | then do writeIORef haCharBuffer (bufferAdjustL (off+1) buf) 63 | return $ reverse (t:ts) 64 | else do 65 | let buf1 = bufferAdjustL r' buf 66 | maybe_buf <- maybeFillReadBuffer hh buf1 67 | case maybe_buf of 68 | -- Nothing indicates we caught an EOF, and we may have a 69 | -- partial line to return. 70 | Nothing -> do 71 | -- we reached EOF. There might be a lone \r left 72 | -- in the buffer, so check for that and 73 | -- append it to the line if necessary. 74 | let pre | isEmptyBuffer buf1 = T.empty 75 | | otherwise = T.singleton '\r' 76 | writeIORef haCharBuffer buf1{ bufL=0, bufR=0 } 77 | let str = reverse . filter (not . T.null) $ pre:t:ts 78 | if null str 79 | then ioe_EOF 80 | else return str 81 | Just new_buf -> go (t:ts) new_buf 82 | 83 | -- This function is lifted almost verbatim from GHC.IO.Handle.Text. 84 | maybeFillReadBuffer :: Handle__ -> CharBuffer -> IO (Maybe CharBuffer) 85 | maybeFillReadBuffer handle_ buf 86 | = E.catch (Just `fmap` getSomeCharacters handle_ buf) $ \e -> 87 | if isEOFError e 88 | then return Nothing 89 | else ioError e 90 | 91 | unpack :: RawCharBuffer -> Int -> Int -> IO Text 92 | unpack !buf !r !w 93 | | charSize /= 4 = sizeError "unpack" 94 | | r >= w = return T.empty 95 | | otherwise = withRawBuffer buf go 96 | where 97 | go pbuf = return $! unstream (Stream next r (exactSize (w-r))) 98 | where 99 | next !i | i >= w = Done 100 | | otherwise = Yield (ix i) (i+1) 101 | ix i = inlinePerformIO $ peekElemOff pbuf i 102 | 103 | unpack_nl :: RawCharBuffer -> Int -> Int -> IO (Text, Int) 104 | unpack_nl !buf !r !w 105 | | charSize /= 4 = sizeError "unpack_nl" 106 | | r >= w = return (T.empty, 0) 107 | | otherwise = withRawBuffer buf $ go 108 | where 109 | go pbuf = do 110 | let !t = unstream (Stream next r (maxSize (w-r))) 111 | w' = w - 1 112 | return $ if ix w' == '\r' 113 | then (t,w') 114 | else (t,w) 115 | where 116 | next !i | i >= w = Done 117 | | c == '\r' = let i' = i + 1 118 | in if i' < w 119 | then if ix i' == '\n' 120 | then Yield '\n' (i+2) 121 | else Yield '\n' i' 122 | else Done 123 | | otherwise = Yield c (i+1) 124 | where c = ix i 125 | ix i = inlinePerformIO $ peekElemOff pbuf i 126 | 127 | -- This function is completely lifted from GHC.IO.Handle.Text. 128 | getSomeCharacters :: Handle__ -> CharBuffer -> IO CharBuffer 129 | getSomeCharacters handle_@Handle__{..} buf@Buffer{..} = 130 | case bufferElems buf of 131 | -- buffer empty: read some more 132 | 0 -> {-# SCC "readTextDevice" #-} readTextDevice handle_ buf 133 | 134 | -- if the buffer has a single '\r' in it and we're doing newline 135 | -- translation: read some more 136 | 1 | haInputNL == CRLF -> do 137 | (c,_) <- readCharBuf bufRaw bufL 138 | if c == '\r' 139 | then do -- shuffle the '\r' to the beginning. This is only safe 140 | -- if we're about to call readTextDevice, otherwise it 141 | -- would mess up flushCharBuffer. 
142 | -- See [note Buffer Flushing], GHC.IO.Handle.Types 143 | _ <- writeCharBuf bufRaw 0 '\r' 144 | let buf' = buf{ bufL=0, bufR=1 } 145 | readTextDevice handle_ buf' 146 | else do 147 | return buf 148 | 149 | -- buffer has some chars in it already: just return it 150 | _otherwise -> {-# SCC "otherwise" #-} return buf 151 | 152 | -- | Read a single chunk of strict text from a buffer. Used by both 153 | -- the strict and lazy implementations of hGetContents. 154 | readChunk :: Handle__ -> CharBuffer -> IO Text 155 | readChunk hh@Handle__{..} buf = do 156 | buf'@Buffer{..} <- getSomeCharacters hh buf 157 | (t,r) <- if haInputNL == CRLF 158 | then unpack_nl bufRaw bufL bufR 159 | else do t <- unpack bufRaw bufL bufR 160 | return (t,bufR) 161 | writeIORef haCharBuffer (bufferAdjustL r buf') 162 | return t 163 | 164 | sizeError :: String -> a 165 | sizeError loc = error $ "Data.Text.IO." ++ loc ++ ": bad internal buffer size" 166 | #endif 167 | -------------------------------------------------------------------------------- /Data/Text/Lazy/IO.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, CPP, RecordWildCards #-} 2 | #if __GLASGOW_HASKELL__ >= 702 3 | {-# LANGUAGE Trustworthy #-} 4 | #endif 5 | -- | 6 | -- Module : Data.Text.Lazy.IO 7 | -- Copyright : (c) 2009, 2010 Bryan O'Sullivan, 8 | -- (c) 2009 Simon Marlow 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com 11 | -- Stability : experimental 12 | -- Portability : GHC 13 | -- 14 | -- Efficient locale-sensitive support for lazy text I\/O. 15 | -- 16 | -- Skip past the synopsis for some important notes on performance and 17 | -- portability across different versions of GHC. 18 | 19 | module Data.Text.Lazy.IO 20 | ( 21 | -- * Performance 22 | -- $performance 23 | 24 | -- * Locale support 25 | -- $locale 26 | -- * File-at-a-time operations 27 | readFile 28 | , writeFile 29 | , appendFile 30 | -- * Operations on handles 31 | , hGetContents 32 | , hGetLine 33 | , hPutStr 34 | , hPutStrLn 35 | -- * Special cases for standard input and output 36 | , interact 37 | , getContents 38 | , getLine 39 | , putStr 40 | , putStrLn 41 | ) where 42 | 43 | import Data.Text.Lazy (Text) 44 | import Prelude hiding (appendFile, getContents, getLine, interact, 45 | putStr, putStrLn, readFile, writeFile) 46 | import System.IO (Handle, IOMode(..), hPutChar, openFile, stdin, stdout, 47 | withFile) 48 | import qualified Data.Text.IO as T 49 | import qualified Data.Text.Lazy as L 50 | #if __GLASGOW_HASKELL__ <= 610 51 | import Data.Text.Lazy.Encoding (decodeUtf8) 52 | import qualified Data.ByteString.Char8 as S8 53 | import qualified Data.ByteString.Lazy.Char8 as L8 54 | #else 55 | import qualified Control.Exception as E 56 | import Control.Monad (when) 57 | import Data.IORef (readIORef) 58 | import Data.Text.IO.Internal (hGetLineWith, readChunk) 59 | import Data.Text.Lazy.Internal (chunk, empty) 60 | import GHC.IO.Buffer (isEmptyBuffer) 61 | import GHC.IO.Exception (IOException(..), IOErrorType(..), ioException) 62 | import GHC.IO.Handle.Internals (augmentIOError, hClose_help, 63 | wantReadableHandle, withHandle) 64 | import GHC.IO.Handle.Types (Handle__(..), HandleType(..)) 65 | import System.IO (BufferMode(..), hGetBuffering, hSetBuffering) 66 | import System.IO.Error (isEOFError) 67 | import System.IO.Unsafe (unsafeInterleaveIO) 68 | #endif 69 | 70 | -- $performance 71 | -- 72 | -- The functions in this module obey the runtime system's locale, 73 | -- character set encoding, and line 
ending conversion settings. 74 | -- 75 | -- If you know in advance that you will be working with data that has 76 | -- a specific encoding (e.g. UTF-8), and your application is highly 77 | -- performance sensitive, you may find that it is faster to perform 78 | -- I\/O with bytestrings and to encode and decode yourself than to use 79 | -- the functions in this module. 80 | -- 81 | -- Whether this will hold depends on the version of GHC you are using, 82 | -- the platform you are working on, the data you are working with, and 83 | -- the encodings you are using, so be sure to test for yourself. 84 | 85 | -- | Read a file and return its contents as a string. The file is 86 | -- read lazily, as with 'getContents'. 87 | readFile :: FilePath -> IO Text 88 | readFile name = openFile name ReadMode >>= hGetContents 89 | 90 | -- | Write a string to a file. The file is truncated to zero length 91 | -- before writing begins. 92 | writeFile :: FilePath -> Text -> IO () 93 | writeFile p = withFile p WriteMode . flip hPutStr 94 | 95 | -- | Write a string to the end of a file. 96 | appendFile :: FilePath -> Text -> IO () 97 | appendFile p = withFile p AppendMode . flip hPutStr 98 | 99 | -- | Lazily read the remaining contents of a 'Handle'. The 'Handle' 100 | -- will be closed after the read completes, or on error. 101 | hGetContents :: Handle -> IO Text 102 | #if __GLASGOW_HASKELL__ <= 610 103 | hGetContents = fmap decodeUtf8 . L8.hGetContents 104 | #else 105 | hGetContents h = do 106 | chooseGoodBuffering h 107 | wantReadableHandle "hGetContents" h $ \hh -> do 108 | ts <- lazyRead h 109 | return (hh{haType=SemiClosedHandle}, ts) 110 | 111 | -- | Use a more efficient buffer size if we're reading in 112 | -- block-buffered mode with the default buffer size. 113 | chooseGoodBuffering :: Handle -> IO () 114 | chooseGoodBuffering h = do 115 | bufMode <- hGetBuffering h 116 | when (bufMode == BlockBuffering Nothing) $ 117 | hSetBuffering h (BlockBuffering (Just 16384)) 118 | 119 | lazyRead :: Handle -> IO Text 120 | lazyRead h = unsafeInterleaveIO $ 121 | withHandle "hGetContents" h $ \hh -> do 122 | case haType hh of 123 | ClosedHandle -> return (hh, L.empty) 124 | SemiClosedHandle -> lazyReadBuffered h hh 125 | _ -> ioException 126 | (IOError (Just h) IllegalOperation "hGetContents" 127 | "illegal handle type" Nothing Nothing) 128 | 129 | lazyReadBuffered :: Handle -> Handle__ -> IO (Handle__, Text) 130 | lazyReadBuffered h hh@Handle__{..} = do 131 | buf <- readIORef haCharBuffer 132 | (do t <- readChunk hh buf 133 | ts <- lazyRead h 134 | return (hh, chunk t ts)) `E.catch` \e -> do 135 | (hh', _) <- hClose_help hh 136 | if isEOFError e 137 | then return $ if isEmptyBuffer buf 138 | then (hh', empty) 139 | else (hh', L.singleton '\r') 140 | else E.throwIO (augmentIOError e "hGetContents" h) 141 | #endif 142 | 143 | -- | Read a single line from a handle. 144 | hGetLine :: Handle -> IO Text 145 | #if __GLASGOW_HASKELL__ <= 610 146 | hGetLine = fmap (decodeUtf8 . L8.fromChunks . (:[])) . S8.hGetLine 147 | #else 148 | hGetLine = hGetLineWith L.fromChunks 149 | #endif 150 | 151 | -- | Write a string to a handle. 152 | hPutStr :: Handle -> Text -> IO () 153 | hPutStr h = mapM_ (T.hPutStr h) . L.toChunks 154 | 155 | -- | Write a string to a handle, followed by a newline. 156 | hPutStrLn :: Handle -> Text -> IO () 157 | hPutStrLn h t = hPutStr h t >> hPutChar h '\n' 158 | 159 | -- | The 'interact' function takes a function of type @Text -> Text@ 160 | -- as its argument. 
The entire input from the standard input device is 161 | -- passed (lazily) to this function as its argument, and the resulting 162 | -- string is output on the standard output device. 163 | interact :: (Text -> Text) -> IO () 164 | interact f = putStr . f =<< getContents 165 | 166 | -- | Lazily read all user input on 'stdin' as a single string. 167 | getContents :: IO Text 168 | getContents = hGetContents stdin 169 | 170 | -- | Read a single line of user input from 'stdin'. 171 | getLine :: IO Text 172 | getLine = hGetLine stdin 173 | 174 | -- | Write a string to 'stdout'. 175 | putStr :: Text -> IO () 176 | putStr = hPutStr stdout 177 | 178 | -- | Write a string to 'stdout', followed by a newline. 179 | putStrLn :: Text -> IO () 180 | putStrLn = hPutStrLn stdout 181 | 182 | -- $locale 183 | -- 184 | -- /Note/: The behaviour of functions in this module depends on the 185 | -- version of GHC you are using. 186 | -- 187 | -- Beginning with GHC 6.12, text I\/O is performed using the system or 188 | -- handle's current locale and line ending conventions. 189 | -- 190 | -- Under GHC 6.10 and earlier, the system I\/O libraries /do not 191 | -- support/ locale-sensitive I\/O or line ending conversion. On these 192 | -- versions of GHC, functions in this library all use UTF-8. What 193 | -- does this mean in practice? 194 | -- 195 | -- * All data that is read will be decoded as UTF-8. 196 | -- 197 | -- * Before data is written, it is first encoded as UTF-8. 198 | -- 199 | -- * On both reading and writing, the platform's native newline 200 | -- conversion is performed. 201 | -- 202 | -- If you must use a non-UTF-8 locale on an older version of GHC, you 203 | -- will have to perform the transcoding yourself, e.g. as follows: 204 | -- 205 | -- > import qualified Data.ByteString.Lazy as B 206 | -- > import Data.Text.Lazy (Text) 207 | -- > import Data.Text.Lazy.Encoding (encodeUtf16LE) 208 | -- > 209 | -- > putStr_Utf16LE :: Text -> IO () 210 | -- > putStr_Utf16LE t = B.putStr (encodeUtf16LE t) 211 | -------------------------------------------------------------------------------- /Data/Text/Encoding/Fusion.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, CPP, Rank2Types #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Encoding.Fusion 5 | -- Copyright : (c) Tom Harper 2008-2009, 6 | -- (c) Bryan O'Sullivan 2009, 7 | -- (c) Duncan Coutts 2009 8 | -- 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 11 | -- duncan@haskell.org 12 | -- Stability : experimental 13 | -- Portability : portable 14 | -- 15 | -- Fusible 'Stream'-oriented functions for converting between 'Text' 16 | -- and several common encodings. 
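--
-- As an illustrative sketch only (not part of the original header): a
-- higher-level decoder can be built from these streaming functions by
-- pairing them with 'Data.Text.Fusion.unstream'.  The helper name
-- @myDecodeUtf8With@ below is made up for the example.
--
-- > import qualified Data.Text.Fusion as F
-- > import qualified Data.Text.Encoding.Fusion as E
-- >
-- > myDecodeUtf8With :: OnDecodeError -> ByteString -> Text
-- > myDecodeUtf8With onErr = F.unstream . E.streamUtf8 onErr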
17 | 18 | module Data.Text.Encoding.Fusion 19 | ( 20 | -- * Streaming 21 | streamASCII 22 | , streamUtf8 23 | , streamUtf16LE 24 | , streamUtf16BE 25 | , streamUtf32LE 26 | , streamUtf32BE 27 | 28 | -- * Unstreaming 29 | , unstream 30 | 31 | , module Data.Text.Encoding.Fusion.Common 32 | ) where 33 | 34 | #if defined(ASSERTS) 35 | import Control.Exception (assert) 36 | #endif 37 | import Data.ByteString.Internal (ByteString(..), mallocByteString, memcpy) 38 | import Data.Text.Fusion (Step(..), Stream(..)) 39 | import Data.Text.Fusion.Size 40 | import Data.Text.Encoding.Error 41 | import Data.Text.Encoding.Fusion.Common 42 | import Data.Text.UnsafeChar (unsafeChr, unsafeChr8, unsafeChr32) 43 | import Data.Text.UnsafeShift (shiftL, shiftR) 44 | import Data.Word (Word8, Word16, Word32) 45 | import Foreign.ForeignPtr (withForeignPtr, ForeignPtr) 46 | import Foreign.Storable (pokeByteOff) 47 | import qualified Data.ByteString as B 48 | import qualified Data.ByteString.Unsafe as B 49 | import qualified Data.Text.Encoding.Utf8 as U8 50 | import qualified Data.Text.Encoding.Utf16 as U16 51 | import qualified Data.Text.Encoding.Utf32 as U32 52 | import Data.Text.Unsafe (unsafeDupablePerformIO) 53 | 54 | streamASCII :: ByteString -> Stream Char 55 | streamASCII bs = Stream next 0 (maxSize l) 56 | where 57 | l = B.length bs 58 | {-# INLINE next #-} 59 | next i 60 | | i >= l = Done 61 | | otherwise = Yield (unsafeChr8 x1) (i+1) 62 | where 63 | x1 = B.unsafeIndex bs i 64 | {-# DEPRECATED streamASCII "Do not use this function" #-} 65 | {-# INLINE [0] streamASCII #-} 66 | 67 | -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using UTF-8 68 | -- encoding. 69 | streamUtf8 :: OnDecodeError -> ByteString -> Stream Char 70 | streamUtf8 onErr bs = Stream next 0 (maxSize l) 71 | where 72 | l = B.length bs 73 | next i 74 | | i >= l = Done 75 | | U8.validate1 x1 = Yield (unsafeChr8 x1) (i+1) 76 | | i+1 < l && U8.validate2 x1 x2 = Yield (U8.chr2 x1 x2) (i+2) 77 | | i+2 < l && U8.validate3 x1 x2 x3 = Yield (U8.chr3 x1 x2 x3) (i+3) 78 | | i+3 < l && U8.validate4 x1 x2 x3 x4 = Yield (U8.chr4 x1 x2 x3 x4) (i+4) 79 | | otherwise = decodeError "streamUtf8" "UTF-8" onErr (Just x1) (i+1) 80 | where 81 | x1 = idx i 82 | x2 = idx (i + 1) 83 | x3 = idx (i + 2) 84 | x4 = idx (i + 3) 85 | idx = B.unsafeIndex bs 86 | {-# INLINE [0] streamUtf8 #-} 87 | 88 | -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little 89 | -- endian UTF-16 encoding. 90 | streamUtf16LE :: OnDecodeError -> ByteString -> Stream Char 91 | streamUtf16LE onErr bs = Stream next 0 (maxSize (l `shiftR` 1)) 92 | where 93 | l = B.length bs 94 | {-# INLINE next #-} 95 | next i 96 | | i >= l = Done 97 | | i+1 < l && U16.validate1 x1 = Yield (unsafeChr x1) (i+2) 98 | | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4) 99 | | otherwise = decodeError "streamUtf16LE" "UTF-16LE" onErr Nothing (i+1) 100 | where 101 | x1 = idx i + (idx (i + 1) `shiftL` 8) 102 | x2 = idx (i + 2) + (idx (i + 3) `shiftL` 8) 103 | idx = fromIntegral . B.unsafeIndex bs :: Int -> Word16 104 | {-# INLINE [0] streamUtf16LE #-} 105 | 106 | -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big 107 | -- endian UTF-16 encoding. 
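--
-- For example (a hedged sketch; 'strictDecode' comes from
-- "Data.Text.Encoding.Error" and @B.pack@ builds the input bytes):
--
-- > streamUtf16BE strictDecode (B.pack [0x00, 0x41])  -- a stream yielding 'A'
-- > streamUtf16LE strictDecode (B.pack [0x41, 0x00])  -- the same character, byte-swapped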
108 | streamUtf16BE :: OnDecodeError -> ByteString -> Stream Char 109 | streamUtf16BE onErr bs = Stream next 0 (maxSize (l `shiftR` 1)) 110 | where 111 | l = B.length bs 112 | {-# INLINE next #-} 113 | next i 114 | | i >= l = Done 115 | | i+1 < l && U16.validate1 x1 = Yield (unsafeChr x1) (i+2) 116 | | i+3 < l && U16.validate2 x1 x2 = Yield (U16.chr2 x1 x2) (i+4) 117 | | otherwise = decodeError "streamUtf16BE" "UTF-16BE" onErr Nothing (i+1) 118 | where 119 | x1 = (idx i `shiftL` 8) + idx (i + 1) 120 | x2 = (idx (i + 2) `shiftL` 8) + idx (i + 3) 121 | idx = fromIntegral . B.unsafeIndex bs :: Int -> Word16 122 | {-# INLINE [0] streamUtf16BE #-} 123 | 124 | -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using big 125 | -- endian UTF-32 encoding. 126 | streamUtf32BE :: OnDecodeError -> ByteString -> Stream Char 127 | streamUtf32BE onErr bs = Stream next 0 (maxSize (l `shiftR` 2)) 128 | where 129 | l = B.length bs 130 | {-# INLINE next #-} 131 | next i 132 | | i >= l = Done 133 | | i+3 < l && U32.validate x = Yield (unsafeChr32 x) (i+4) 134 | | otherwise = decodeError "streamUtf32BE" "UTF-32BE" onErr Nothing (i+1) 135 | where 136 | x = shiftL x1 24 + shiftL x2 16 + shiftL x3 8 + x4 137 | x1 = idx i 138 | x2 = idx (i+1) 139 | x3 = idx (i+2) 140 | x4 = idx (i+3) 141 | idx = fromIntegral . B.unsafeIndex bs :: Int -> Word32 142 | {-# INLINE [0] streamUtf32BE #-} 143 | 144 | -- | /O(n)/ Convert a 'ByteString' into a 'Stream Char', using little 145 | -- endian UTF-32 encoding. 146 | streamUtf32LE :: OnDecodeError -> ByteString -> Stream Char 147 | streamUtf32LE onErr bs = Stream next 0 (maxSize (l `shiftR` 2)) 148 | where 149 | l = B.length bs 150 | {-# INLINE next #-} 151 | next i 152 | | i >= l = Done 153 | | i+3 < l && U32.validate x = Yield (unsafeChr32 x) (i+4) 154 | | otherwise = decodeError "streamUtf32LE" "UTF-32LE" onErr Nothing (i+1) 155 | where 156 | x = shiftL x4 24 + shiftL x3 16 + shiftL x2 8 + x1 157 | x1 = idx i 158 | x2 = idx $ i+1 159 | x3 = idx $ i+2 160 | x4 = idx $ i+3 161 | idx = fromIntegral . B.unsafeIndex bs :: Int -> Word32 162 | {-# INLINE [0] streamUtf32LE #-} 163 | 164 | -- | /O(n)/ Convert a 'Stream' 'Word8' to a 'ByteString'. 165 | unstream :: Stream Word8 -> ByteString 166 | unstream (Stream next s0 len) = unsafeDupablePerformIO $ do 167 | let mlen = upperBound 4 len 168 | mallocByteString mlen >>= loop mlen 0 s0 169 | where 170 | loop !n !off !s fp = case next s of 171 | Done -> trimUp fp n off 172 | Skip s' -> loop n off s' fp 173 | Yield x s' 174 | | off == n -> realloc fp n off s' x 175 | | otherwise -> do 176 | withForeignPtr fp $ \p -> pokeByteOff p off x 177 | loop n (off+1) s' fp 178 | {-# NOINLINE realloc #-} 179 | realloc fp n off s x = do 180 | let n' = n+n 181 | fp' <- copy0 fp n n' 182 | withForeignPtr fp' $ \p -> pokeByteOff p off x 183 | loop n' (off+1) s fp' 184 | {-# NOINLINE trimUp #-} 185 | trimUp fp _ off = return $! PS fp 0 off 186 | copy0 :: ForeignPtr Word8 -> Int -> Int -> IO (ForeignPtr Word8) 187 | copy0 !src !srcLen !destLen = 188 | #if defined(ASSERTS) 189 | assert (srcLen <= destLen) $ 190 | #endif 191 | do 192 | dest <- mallocByteString destLen 193 | withForeignPtr src $ \src' -> 194 | withForeignPtr dest $ \dest' -> 195 | memcpy dest' src' (fromIntegral srcLen) 196 | return dest 197 | 198 | decodeError :: forall s. 
String -> String -> OnDecodeError -> Maybe Word8 199 | -> s -> Step s Char 200 | decodeError func kind onErr mb i = 201 | case onErr desc mb of 202 | Nothing -> Skip i 203 | Just c -> Yield c i 204 | where desc = "Data.Text.Encoding.Fusion." ++ func ++ ": Invalid " ++ 205 | kind ++ " stream" 206 | -------------------------------------------------------------------------------- /Data/Text/Array.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, CPP, ForeignFunctionInterface, MagicHash, Rank2Types, 2 | RecordWildCards, UnboxedTuples, UnliftedFFITypes #-} 3 | {-# OPTIONS_GHC -fno-warn-unused-matches #-} 4 | -- | 5 | -- Module : Data.Text.Array 6 | -- Copyright : (c) 2009, 2010, 2011 Bryan O'Sullivan 7 | -- 8 | -- License : BSD-style 9 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 10 | -- duncan@haskell.org 11 | -- Stability : experimental 12 | -- Portability : portable 13 | -- 14 | -- Packed, unboxed, heap-resident arrays. Suitable for performance 15 | -- critical use, both in terms of large data quantities and high 16 | -- speed. 17 | -- 18 | -- This module is intended to be imported @qualified@, to avoid name 19 | -- clashes with "Prelude" functions, e.g. 20 | -- 21 | -- > import qualified Data.Text.Array as A 22 | -- 23 | -- The names in this module resemble those in the 'Data.Array' family 24 | -- of modules, but are shorter due to the assumption of qualified 25 | -- naming. 26 | module Data.Text.Array 27 | ( 28 | -- * Types 29 | Array(aBA) 30 | , MArray(maBA) 31 | 32 | -- * Functions 33 | , copyM 34 | , copyI 35 | , empty 36 | , equal 37 | #if defined(ASSERTS) 38 | , length 39 | #endif 40 | , run 41 | , run2 42 | , toList 43 | , unsafeFreeze 44 | , unsafeIndex 45 | , new 46 | , unsafeWrite 47 | ) where 48 | 49 | #if defined(ASSERTS) 50 | -- This fugly hack is brought by GHC's apparent reluctance to deal 51 | -- with MagicHash and UnboxedTuples when inferring types. Eek! 52 | # define CHECK_BOUNDS(_func_,_len_,_k_) \ 53 | if (_k_) < 0 || (_k_) >= (_len_) then error ("Data.Text.Array." ++ (_func_) ++ ": bounds error, offset " ++ show (_k_) ++ ", length " ++ show (_len_)) else 54 | #else 55 | # define CHECK_BOUNDS(_func_,_len_,_k_) 56 | #endif 57 | 58 | #include "MachDeps.h" 59 | 60 | #if defined(ASSERTS) 61 | import Control.Exception (assert) 62 | #endif 63 | #if __GLASGOW_HASKELL__ >= 702 64 | import Control.Monad.ST.Unsafe (unsafeIOToST) 65 | #else 66 | import Control.Monad.ST (unsafeIOToST) 67 | #endif 68 | import Data.Bits ((.&.), xor) 69 | import Data.Text.Unsafe.Base (inlinePerformIO) 70 | import Data.Text.UnsafeShift (shiftL, shiftR) 71 | #if __GLASGOW_HASKELL__ >= 703 72 | import Foreign.C.Types (CInt(CInt), CSize(CSize)) 73 | #else 74 | import Foreign.C.Types (CInt, CSize) 75 | #endif 76 | import GHC.Base (ByteArray#, MutableByteArray#, Int(..), 77 | indexWord16Array#, newByteArray#, 78 | unsafeCoerce#, writeWord16Array#) 79 | import GHC.ST (ST(..), runST) 80 | import GHC.Word (Word16(..)) 81 | import Prelude hiding (length, read) 82 | 83 | -- | Immutable array type. 84 | data Array = Array { 85 | aBA :: ByteArray# 86 | #if defined(ASSERTS) 87 | , aLen :: {-# UNPACK #-} !Int -- length (in units of Word16, not bytes) 88 | #endif 89 | } 90 | 91 | -- | Mutable array type, for use in the ST monad. 
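--
-- A typical lifecycle, as an illustrative sketch (the helper name
-- @singletonArray@ is made up): allocate with 'new', fill with
-- 'unsafeWrite', then freeze via 'run':
--
-- > singletonArray :: Word16 -> Array
-- > singletonArray w = run $ do
-- >   marr <- new 1
-- >   unsafeWrite marr 0 w
-- >   return marr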
92 | data MArray s = MArray { 93 | maBA :: MutableByteArray# s 94 | #if defined(ASSERTS) 95 | , maLen :: {-# UNPACK #-} !Int -- length (in units of Word16, not bytes) 96 | #endif 97 | } 98 | 99 | #if defined(ASSERTS) 100 | -- | Operations supported by all arrays. 101 | class IArray a where 102 | -- | Return the length of an array. 103 | length :: a -> Int 104 | 105 | instance IArray Array where 106 | length = aLen 107 | {-# INLINE length #-} 108 | 109 | instance IArray (MArray s) where 110 | length = maLen 111 | {-# INLINE length #-} 112 | #endif 113 | 114 | -- | Create an uninitialized mutable array. 115 | new :: forall s. Int -> ST s (MArray s) 116 | new n 117 | | n < 0 || n .&. highBit /= 0 = array_size_error 118 | | otherwise = ST $ \s1# -> 119 | case newByteArray# len# s1# of 120 | (# s2#, marr# #) -> (# s2#, MArray marr# 121 | #if defined(ASSERTS) 122 | n 123 | #endif 124 | #) 125 | where !(I# len#) = bytesInArray n 126 | highBit = maxBound `xor` (maxBound `shiftR` 1) 127 | {-# INLINE new #-} 128 | 129 | array_size_error :: a 130 | array_size_error = error "Data.Text.Array.new: size overflow" 131 | 132 | -- | Freeze a mutable array. Do not mutate the 'MArray' afterwards! 133 | unsafeFreeze :: MArray s -> ST s Array 134 | unsafeFreeze MArray{..} = ST $ \s# -> 135 | (# s#, Array (unsafeCoerce# maBA) 136 | #if defined(ASSERTS) 137 | maLen 138 | #endif 139 | #) 140 | {-# INLINE unsafeFreeze #-} 141 | 142 | -- | Indicate how many bytes would be used for an array of the given 143 | -- size. 144 | bytesInArray :: Int -> Int 145 | bytesInArray n = n `shiftL` 1 146 | {-# INLINE bytesInArray #-} 147 | 148 | -- | Unchecked read of an immutable array. May return garbage or 149 | -- crash on an out-of-bounds access. 150 | unsafeIndex :: Array -> Int -> Word16 151 | unsafeIndex Array{..} i@(I# i#) = 152 | CHECK_BOUNDS("unsafeIndex",aLen,i) 153 | case indexWord16Array# aBA i# of r# -> (W16# r#) 154 | {-# INLINE unsafeIndex #-} 155 | 156 | -- | Unchecked write of a mutable array. May return garbage or crash 157 | -- on an out-of-bounds access. 158 | unsafeWrite :: MArray s -> Int -> Word16 -> ST s () 159 | unsafeWrite MArray{..} i@(I# i#) (W16# e#) = ST $ \s1# -> 160 | CHECK_BOUNDS("unsafeWrite",maLen,i) 161 | case writeWord16Array# maBA i# e# s1# of 162 | s2# -> (# s2#, () #) 163 | {-# INLINE unsafeWrite #-} 164 | 165 | -- | Convert an immutable array to a list. 166 | toList :: Array -> Int -> Int -> [Word16] 167 | toList ary off len = loop 0 168 | where loop i | i < len = unsafeIndex ary (off+i) : loop (i+1) 169 | | otherwise = [] 170 | 171 | -- | An empty immutable array. 172 | empty :: Array 173 | empty = runST (new 0 >>= unsafeFreeze) 174 | 175 | -- | Run an action in the ST monad and return an immutable array of 176 | -- its result. 177 | run :: (forall s. ST s (MArray s)) -> Array 178 | run k = runST (k >>= unsafeFreeze) 179 | 180 | -- | Run an action in the ST monad and return an immutable array of 181 | -- its result paired with whatever else the action returns. 182 | run2 :: (forall s. ST s (MArray s, a)) -> (Array, a) 183 | run2 k = runST (do 184 | (marr,b) <- k 185 | arr <- unsafeFreeze marr 186 | return (arr,b)) 187 | {-# INLINE run2 #-} 188 | 189 | -- | Copy some elements of a mutable array. 
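--
-- For instance (sketch only; @grow@ is a hypothetical helper), enlarging
-- a buffer means copying the old contents into a freshly allocated,
-- larger array:
--
-- > grow :: MArray s -> Int -> Int -> ST s (MArray s)
-- > grow old oldLen newLen = do
-- >   marr <- new newLen
-- >   copyM marr 0 old 0 oldLen
-- >   return marr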
190 | copyM :: MArray s -- ^ Destination 191 | -> Int -- ^ Destination offset 192 | -> MArray s -- ^ Source 193 | -> Int -- ^ Source offset 194 | -> Int -- ^ Count 195 | -> ST s () 196 | copyM dest didx src sidx count 197 | | count <= 0 = return () 198 | | otherwise = 199 | #if defined(ASSERTS) 200 | assert (sidx + count <= length src) . 201 | assert (didx + count <= length dest) . 202 | #endif 203 | unsafeIOToST $ memcpyM (maBA dest) (fromIntegral didx) 204 | (maBA src) (fromIntegral sidx) 205 | (fromIntegral count) 206 | {-# INLINE copyM #-} 207 | 208 | -- | Copy some elements of an immutable array. 209 | copyI :: MArray s -- ^ Destination 210 | -> Int -- ^ Destination offset 211 | -> Array -- ^ Source 212 | -> Int -- ^ Source offset 213 | -> Int -- ^ First offset in destination /not/ to 214 | -- copy (i.e. /not/ length) 215 | -> ST s () 216 | copyI dest i0 src j0 top 217 | | i0 >= top = return () 218 | | otherwise = unsafeIOToST $ 219 | memcpyI (maBA dest) (fromIntegral i0) 220 | (aBA src) (fromIntegral j0) 221 | (fromIntegral (top-i0)) 222 | {-# INLINE copyI #-} 223 | 224 | -- | Compare portions of two arrays for equality. No bounds checking 225 | -- is performed. 226 | equal :: Array -- ^ First 227 | -> Int -- ^ Offset into first 228 | -> Array -- ^ Second 229 | -> Int -- ^ Offset into second 230 | -> Int -- ^ Count 231 | -> Bool 232 | equal arrA offA arrB offB count = inlinePerformIO $ do 233 | i <- memcmp (aBA arrA) (fromIntegral offA) 234 | (aBA arrB) (fromIntegral offB) (fromIntegral count) 235 | return $! i == 0 236 | {-# INLINE equal #-} 237 | 238 | foreign import ccall unsafe "_hs_text_memcpy" memcpyI 239 | :: MutableByteArray# s -> CSize -> ByteArray# -> CSize -> CSize -> IO () 240 | 241 | foreign import ccall unsafe "_hs_text_memcmp" memcmp 242 | :: ByteArray# -> CSize -> ByteArray# -> CSize -> CSize -> IO CInt 243 | 244 | foreign import ccall unsafe "_hs_text_memcpy" memcpyM 245 | :: MutableByteArray# s -> CSize -> MutableByteArray# s -> CSize -> CSize 246 | -> IO () 247 | -------------------------------------------------------------------------------- /Data/Text/Lazy/Read.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings, CPP #-} 2 | #if __GLASGOW_HASKELL__ >= 702 3 | {-# LANGUAGE Trustworthy #-} 4 | #endif 5 | 6 | -- | 7 | -- Module : Data.Text.Lazy.Read 8 | -- Copyright : (c) 2010, 2011 Bryan O'Sullivan 9 | -- 10 | -- License : BSD-style 11 | -- Maintainer : bos@serpentine.com 12 | -- Stability : experimental 13 | -- Portability : GHC 14 | -- 15 | -- Functions used frequently when reading textual data. 16 | module Data.Text.Lazy.Read 17 | ( 18 | Reader 19 | , decimal 20 | , hexadecimal 21 | , signed 22 | , rational 23 | , double 24 | ) where 25 | 26 | import Control.Monad (liftM) 27 | import Data.Char (isDigit, isHexDigit, ord) 28 | import Data.Int (Int8, Int16, Int32, Int64) 29 | import Data.Ratio ((%)) 30 | import Data.Text.Lazy as T 31 | import Data.Word (Word, Word8, Word16, Word32, Word64) 32 | 33 | -- | Read some text. If the read succeeds, return its value and the 34 | -- remaining text, otherwise an error message. 35 | type Reader a = Text -> Either String (a,Text) 36 | 37 | -- | Read a decimal integer. The input must begin with at least one 38 | -- decimal digit, and is consumed until a non-digit or end of string 39 | -- is reached. 40 | -- 41 | -- This function does not handle leading sign characters. If you need 42 | -- to handle signed input, use @'signed' 'decimal'@. 
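--
-- For example (behaviour sketched from the description above):
--
-- >decimal "123abc"       == Right (123, "abc")
-- >signed decimal "-42xy" == Right (-42, "xy")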
43 | -- 44 | -- /Note/: For fixed-width integer types, this function does not 45 | -- attempt to detect overflow, so a sufficiently long input may give 46 | -- incorrect results. If you are worried about overflow, use 47 | -- 'Integer' for your result type. 48 | decimal :: Integral a => Reader a 49 | {-# SPECIALIZE decimal :: Reader Int #-} 50 | {-# SPECIALIZE decimal :: Reader Int8 #-} 51 | {-# SPECIALIZE decimal :: Reader Int16 #-} 52 | {-# SPECIALIZE decimal :: Reader Int32 #-} 53 | {-# SPECIALIZE decimal :: Reader Int64 #-} 54 | {-# SPECIALIZE decimal :: Reader Integer #-} 55 | {-# SPECIALIZE decimal :: Reader Word #-} 56 | {-# SPECIALIZE decimal :: Reader Word8 #-} 57 | {-# SPECIALIZE decimal :: Reader Word16 #-} 58 | {-# SPECIALIZE decimal :: Reader Word32 #-} 59 | {-# SPECIALIZE decimal :: Reader Word64 #-} 60 | decimal txt 61 | | T.null h = Left "input does not start with a digit" 62 | | otherwise = Right (T.foldl' go 0 h, t) 63 | where (h,t) = T.span isDigit txt 64 | go n d = (n * 10 + fromIntegral (digitToInt d)) 65 | 66 | -- | Read a hexadecimal integer, consisting of an optional leading 67 | -- @\"0x\"@ followed by at least one decimal digit. Input is consumed 68 | -- until a non-hex-digit or end of string is reached. This function 69 | -- is case insensitive. 70 | -- 71 | -- This function does not handle leading sign characters. If you need 72 | -- to handle signed input, use @'signed' 'hexadecimal'@. 73 | -- 74 | -- /Note/: For fixed-width integer types, this function does not 75 | -- attempt to detect overflow, so a sufficiently long input may give 76 | -- incorrect results. If you are worried about overflow, use 77 | -- 'Integer' for your result type. 78 | hexadecimal :: Integral a => Reader a 79 | {-# SPECIALIZE hexadecimal :: Reader Int #-} 80 | {-# SPECIALIZE hexadecimal :: Reader Integer #-} 81 | hexadecimal txt 82 | | h == "0x" || h == "0X" = hex t 83 | | otherwise = hex txt 84 | where (h,t) = T.splitAt 2 txt 85 | 86 | hex :: Integral a => Reader a 87 | {-# SPECIALIZE hexadecimal :: Reader Int #-} 88 | {-# SPECIALIZE hexadecimal :: Reader Int8 #-} 89 | {-# SPECIALIZE hexadecimal :: Reader Int16 #-} 90 | {-# SPECIALIZE hexadecimal :: Reader Int32 #-} 91 | {-# SPECIALIZE hexadecimal :: Reader Int64 #-} 92 | {-# SPECIALIZE hexadecimal :: Reader Integer #-} 93 | {-# SPECIALIZE hexadecimal :: Reader Word #-} 94 | {-# SPECIALIZE hexadecimal :: Reader Word8 #-} 95 | {-# SPECIALIZE hexadecimal :: Reader Word16 #-} 96 | {-# SPECIALIZE hexadecimal :: Reader Word32 #-} 97 | {-# SPECIALIZE hexadecimal :: Reader Word64 #-} 98 | hex txt 99 | | T.null h = Left "input does not start with a hexadecimal digit" 100 | | otherwise = Right (T.foldl' go 0 h, t) 101 | where (h,t) = T.span isHexDigit txt 102 | go n d = (n * 16 + fromIntegral (hexDigitToInt d)) 103 | 104 | hexDigitToInt :: Char -> Int 105 | hexDigitToInt c 106 | | c >= '0' && c <= '9' = ord c - ord '0' 107 | | c >= 'a' && c <= 'f' = ord c - (ord 'a' - 10) 108 | | otherwise = ord c - (ord 'A' - 10) 109 | 110 | digitToInt :: Char -> Int 111 | digitToInt c = ord c - ord '0' 112 | 113 | -- | Read an optional leading sign character (@\'-\'@ or @\'+\'@) and 114 | -- apply it to the result of applying the given reader. 115 | signed :: Num a => Reader a -> Reader a 116 | {-# INLINE signed #-} 117 | signed f = runP (signa (P f)) 118 | 119 | -- | Read a rational number. 120 | -- 121 | -- This function accepts an optional leading sign character, followed 122 | -- by at least one decimal digit. 
The syntax is similar to that accepted 123 | -- by the 'read' function, with the exception that a trailing @\'.\'@ 124 | -- or @\'e\'@ /not/ followed by a number is not consumed. 125 | -- 126 | -- Examples: 127 | -- 128 | -- >rational "3" == Right (3.0, "") 129 | -- >rational "3.1" == Right (3.1, "") 130 | -- >rational "3e4" == Right (30000.0, "") 131 | -- >rational "3.1e4" == Right (31000.0, "") 132 | -- >rational ".3" == Left "input does not start with a digit" 133 | -- >rational "e3" == Left "input does not start with a digit" 134 | -- 135 | -- Examples of differences from 'read': 136 | -- 137 | -- >rational "3.foo" == Right (3.0, ".foo") 138 | -- >rational "3e" == Right (3.0, "e") 139 | rational :: Fractional a => Reader a 140 | {-# SPECIALIZE rational :: Reader Double #-} 141 | rational = floaty $ \real frac fracDenom -> fromRational $ 142 | real % 1 + frac % fracDenom 143 | 144 | -- | Read a rational number. 145 | -- 146 | -- The syntax accepted by this function is the same as for 'rational'. 147 | -- 148 | -- /Note/: This function is almost ten times faster than 'rational', 149 | -- but is slightly less accurate. 150 | -- 151 | -- The 'Double' type supports about 16 decimal places of accuracy. 152 | -- For 94.2% of numbers, this function and 'rational' give identical 153 | -- results, but for the remaining 5.8%, this function loses precision 154 | -- around the 15th decimal place. For 0.001% of numbers, this 155 | -- function will lose precision at the 13th or 14th decimal place. 156 | double :: Reader Double 157 | double = floaty $ \real frac fracDenom -> 158 | fromIntegral real + 159 | fromIntegral frac / fromIntegral fracDenom 160 | 161 | signa :: Num a => Parser a -> Parser a 162 | {-# SPECIALIZE signa :: Parser Int -> Parser Int #-} 163 | {-# SPECIALIZE signa :: Parser Int8 -> Parser Int8 #-} 164 | {-# SPECIALIZE signa :: Parser Int16 -> Parser Int16 #-} 165 | {-# SPECIALIZE signa :: Parser Int32 -> Parser Int32 #-} 166 | {-# SPECIALIZE signa :: Parser Int64 -> Parser Int64 #-} 167 | {-# SPECIALIZE signa :: Parser Integer -> Parser Integer #-} 168 | signa p = do 169 | sign <- perhaps '+' $ char (\c -> c == '-' || c == '+') 170 | if sign == '+' then p else negate `liftM` p 171 | 172 | newtype Parser a = P { 173 | runP :: Reader a 174 | } 175 | 176 | instance Monad Parser where 177 | return a = P $ \t -> Right (a,t) 178 | {-# INLINE return #-} 179 | m >>= k = P $ \t -> case runP m t of 180 | Left err -> Left err 181 | Right (a,t') -> runP (k a) t' 182 | {-# INLINE (>>=) #-} 183 | fail msg = P $ \_ -> Left msg 184 | 185 | perhaps :: a -> Parser a -> Parser a 186 | perhaps def m = P $ \t -> case runP m t of 187 | Left _ -> Right (def,t) 188 | r@(Right _) -> r 189 | 190 | char :: (Char -> Bool) -> Parser Char 191 | char p = P $ \t -> case T.uncons t of 192 | Just (c,t') | p c -> Right (c,t') 193 | _ -> Left "character does not match" 194 | 195 | data T = T !Integer !Int 196 | 197 | floaty :: Fractional a => (Integer -> Integer -> Integer -> a) -> Reader a 198 | {-# INLINE floaty #-} 199 | floaty f = runP $ do 200 | sign <- perhaps '+' $ char (\c -> c == '-' || c == '+') 201 | real <- P decimal 202 | T fraction fracDigits <- perhaps (T 0 0) $ do 203 | _ <- char (=='.') 204 | digits <- P $ \t -> Right (fromIntegral . 
T.length $ T.takeWhile isDigit t, t) 205 | n <- P decimal 206 | return $ T n digits 207 | let e c = c == 'e' || c == 'E' 208 | power <- perhaps 0 (char e >> signa (P decimal) :: Parser Int) 209 | let n = if fracDigits == 0 210 | then if power == 0 211 | then fromIntegral real 212 | else fromIntegral real * (10 ^^ power) 213 | else if power == 0 214 | then f real fraction (10 ^ fracDigits) 215 | else f real fraction (10 ^ fracDigits) * (10 ^^ power) 216 | return $! if sign == '+' 217 | then n 218 | else -n 219 | -------------------------------------------------------------------------------- /Data/Text/Fusion.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, MagicHash #-} 2 | 3 | -- | 4 | -- Module : Data.Text.Fusion 5 | -- Copyright : (c) Tom Harper 2008-2009, 6 | -- (c) Bryan O'Sullivan 2009-2010, 7 | -- (c) Duncan Coutts 2009 8 | -- 9 | -- License : BSD-style 10 | -- Maintainer : bos@serpentine.com, rtomharper@googlemail.com, 11 | -- duncan@haskell.org 12 | -- Stability : experimental 13 | -- Portability : GHC 14 | -- 15 | -- Text manipulation functions represented as fusible operations over 16 | -- streams. 17 | module Data.Text.Fusion 18 | ( 19 | -- * Types 20 | Stream(..) 21 | , Step(..) 22 | 23 | -- * Creation and elimination 24 | , stream 25 | , unstream 26 | , reverseStream 27 | 28 | , length 29 | 30 | -- * Transformations 31 | , reverse 32 | 33 | -- * Construction 34 | -- ** Scans 35 | , reverseScanr 36 | 37 | -- ** Accumulating maps 38 | , mapAccumL 39 | 40 | -- ** Generation and unfolding 41 | , unfoldrN 42 | 43 | -- * Indexing 44 | , index 45 | , findIndex 46 | , countChar 47 | ) where 48 | 49 | import Prelude (Bool(..), Char, Maybe(..), Monad(..), Int, 50 | Num(..), Ord(..), ($), (&&), 51 | fromIntegral, otherwise) 52 | import Data.Bits ((.&.)) 53 | import Data.Text.Internal (Text(..)) 54 | import Data.Text.Private (runText) 55 | import Data.Text.UnsafeChar (ord, unsafeChr, unsafeWrite) 56 | import Data.Text.UnsafeShift (shiftL, shiftR) 57 | import qualified Data.Text.Array as A 58 | import qualified Data.Text.Fusion.Common as S 59 | import Data.Text.Fusion.Internal 60 | import Data.Text.Fusion.Size 61 | import qualified Data.Text.Internal as I 62 | import qualified Data.Text.Encoding.Utf16 as U16 63 | 64 | default(Int) 65 | 66 | -- | /O(n)/ Convert a 'Text' into a 'Stream Char'. 67 | stream :: Text -> Stream Char 68 | stream (Text arr off len) = Stream next off (maxSize len) 69 | where 70 | !end = off+len 71 | next !i 72 | | i >= end = Done 73 | | n >= 0xD800 && n <= 0xDBFF = Yield (U16.chr2 n n2) (i + 2) 74 | | otherwise = Yield (unsafeChr n) (i + 1) 75 | where 76 | n = A.unsafeIndex arr i 77 | n2 = A.unsafeIndex arr (i + 1) 78 | {-# INLINE [0] stream #-} 79 | 80 | -- | /O(n)/ Convert a 'Text' into a 'Stream Char', but iterate 81 | -- backwards. 82 | reverseStream :: Text -> Stream Char 83 | reverseStream (Text arr off len) = Stream next (off+len-1) (maxSize len) 84 | where 85 | {-# INLINE next #-} 86 | next !i 87 | | i < off = Done 88 | | n >= 0xDC00 && n <= 0xDFFF = Yield (U16.chr2 n2 n) (i - 2) 89 | | otherwise = Yield (unsafeChr n) (i - 1) 90 | where 91 | n = A.unsafeIndex arr i 92 | n2 = A.unsafeIndex arr (i - 1) 93 | {-# INLINE [0] reverseStream #-} 94 | 95 | -- | /O(n)/ Convert a 'Stream Char' into a 'Text'. 
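--
-- A sketch of how this pairs with 'stream' (the name @mapFused@ is made
-- up): most of the fusible pipelines in this library have this shape,
-- with the intermediate 'Stream' eliminated by the rewrite rule below.
--
-- > mapFused :: (Char -> Char) -> Text -> Text
-- > mapFused f = unstream . S.map f . stream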
96 | unstream :: Stream Char -> Text 97 | unstream (Stream next0 s0 len) = runText $ \done -> do 98 | let mlen = upperBound 4 len 99 | arr0 <- A.new mlen 100 | let outer arr top = loop 101 | where 102 | loop !s !i = 103 | case next0 s of 104 | Done -> done arr i 105 | Skip s' -> loop s' i 106 | Yield x s' 107 | | j >= top -> {-# SCC "unstream/resize" #-} do 108 | let top' = (top + 1) `shiftL` 1 109 | arr' <- A.new top' 110 | A.copyM arr' 0 arr 0 top 111 | outer arr' top' s i 112 | | otherwise -> do d <- unsafeWrite arr i x 113 | loop s' (i+d) 114 | where j | ord x < 0x10000 = i 115 | | otherwise = i + 1 116 | outer arr0 mlen s0 0 117 | {-# INLINE [0] unstream #-} 118 | {-# RULES "STREAM stream/unstream fusion" forall s. stream (unstream s) = s #-} 119 | 120 | 121 | -- ---------------------------------------------------------------------------- 122 | -- * Basic stream functions 123 | 124 | length :: Stream Char -> Int 125 | length = S.lengthI 126 | {-# INLINE[0] length #-} 127 | 128 | -- | /O(n)/ Reverse the characters of a string. 129 | reverse :: Stream Char -> Text 130 | reverse (Stream next s len0) 131 | | isEmpty len0 = I.empty 132 | | otherwise = I.textP arr off' len' 133 | where 134 | len0' = upperBound 4 (larger len0 4) 135 | (arr, (off', len')) = A.run2 (A.new len0' >>= loop s (len0'-1) len0') 136 | loop !s0 !i !len marr = 137 | case next s0 of 138 | Done -> return (marr, (j, len-j)) 139 | where j = i + 1 140 | Skip s1 -> loop s1 i len marr 141 | Yield x s1 | i < least -> {-# SCC "reverse/resize" #-} do 142 | let newLen = len `shiftL` 1 143 | marr' <- A.new newLen 144 | A.copyM marr' (newLen-len) marr 0 len 145 | write s1 (len+i) newLen marr' 146 | | otherwise -> write s1 i len marr 147 | where n = ord x 148 | least | n < 0x10000 = 0 149 | | otherwise = 1 150 | m = n - 0x10000 151 | lo = fromIntegral $ (m `shiftR` 10) + 0xD800 152 | hi = fromIntegral $ (m .&. 0x3FF) + 0xDC00 153 | write t j l mar 154 | | n < 0x10000 = do 155 | A.unsafeWrite mar j (fromIntegral n) 156 | loop t (j-1) l mar 157 | | otherwise = do 158 | A.unsafeWrite mar (j-1) lo 159 | A.unsafeWrite mar j hi 160 | loop t (j-2) l mar 161 | {-# INLINE [0] reverse #-} 162 | 163 | -- | /O(n)/ Perform the equivalent of 'scanr' over a list, only with 164 | -- the input and result reversed. 165 | reverseScanr :: (Char -> Char -> Char) -> Char -> Stream Char -> Stream Char 166 | reverseScanr f z0 (Stream next0 s0 len) = Stream next (S1 :*: z0 :*: s0) (len+1) -- HINT maybe too low 167 | where 168 | {-# INLINE next #-} 169 | next (S1 :*: z :*: s) = Yield z (S2 :*: z :*: s) 170 | next (S2 :*: z :*: s) = case next0 s of 171 | Yield x s' -> let !x' = f x z 172 | in Yield x' (S2 :*: x' :*: s') 173 | Skip s' -> Skip (S2 :*: z :*: s') 174 | Done -> Done 175 | {-# INLINE reverseScanr #-} 176 | 177 | -- | /O(n)/ Like 'unfoldr', 'unfoldrN' builds a stream from a seed 178 | -- value. However, the length of the result is limited by the 179 | -- first argument to 'unfoldrN'. This function is more efficient than 180 | -- 'unfoldr' when the length of the result is known. 181 | unfoldrN :: Int -> (a -> Maybe (Char,a)) -> a -> Stream Char 182 | unfoldrN n = S.unfoldrNI n 183 | {-# INLINE [0] unfoldrN #-} 184 | 185 | ------------------------------------------------------------------------------- 186 | -- ** Indexing streams 187 | 188 | -- | /O(n)/ stream index (subscript) operator, starting from 0. 
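--
-- For example (sketch, assuming 'Data.Text.pack' is in scope):
--
-- > index (stream (pack "bar")) 1 == 'a'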
189 | index :: Stream Char -> Int -> Char 190 | index = S.indexI 191 | {-# INLINE [0] index #-} 192 | 193 | -- | The 'findIndex' function takes a predicate and a stream and 194 | -- returns the index of the first element in the stream 195 | -- satisfying the predicate. 196 | findIndex :: (Char -> Bool) -> Stream Char -> Maybe Int 197 | findIndex = S.findIndexI 198 | {-# INLINE [0] findIndex #-} 199 | 200 | -- | /O(n)/ The 'count' function returns the number of times the query 201 | -- element appears in the given stream. 202 | countChar :: Char -> Stream Char -> Int 203 | countChar = S.countCharI 204 | {-# INLINE [0] countChar #-} 205 | 206 | -- | /O(n)/ Like a combination of 'map' and 'foldl''. Applies a 207 | -- function to each element of a 'Text', passing an accumulating 208 | -- parameter from left to right, and returns a final 'Text'. 209 | mapAccumL :: (a -> Char -> (a,Char)) -> a -> Stream Char -> (a, Text) 210 | mapAccumL f z0 (Stream next0 s0 len) = (nz,I.textP na 0 nl) 211 | where 212 | (na,(nz,nl)) = A.run2 (A.new mlen >>= \arr -> outer arr mlen z0 s0 0) 213 | where mlen = upperBound 4 len 214 | outer arr top = loop 215 | where 216 | loop !z !s !i = 217 | case next0 s of 218 | Done -> return (arr, (z,i)) 219 | Skip s' -> loop z s' i 220 | Yield x s' 221 | | j >= top -> {-# SCC "mapAccumL/resize" #-} do 222 | let top' = (top + 1) `shiftL` 1 223 | arr' <- A.new top' 224 | A.copyM arr' 0 arr 0 top 225 | outer arr' top' z s i 226 | | otherwise -> do let (z',c) = f z x 227 | d <- unsafeWrite arr i c 228 | loop z' s' (i+d) 229 | where j | ord x < 0x10000 = i 230 | | otherwise = i + 1 231 | {-# INLINE [0] mapAccumL #-} 232 | -------------------------------------------------------------------------------- /Data/Text/Read.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings, UnboxedTuples, CPP #-} 2 | #if __GLASGOW_HASKELL__ >= 702 3 | {-# LANGUAGE Trustworthy #-} 4 | #endif 5 | 6 | -- | 7 | -- Module : Data.Text.Read 8 | -- Copyright : (c) 2010, 2011 Bryan O'Sullivan 9 | -- 10 | -- License : BSD-style 11 | -- Maintainer : bos@serpentine.com 12 | -- Stability : experimental 13 | -- Portability : GHC 14 | -- 15 | -- Functions used frequently when reading textual data. 16 | module Data.Text.Read 17 | ( 18 | Reader 19 | , decimal 20 | , hexadecimal 21 | , signed 22 | , rational 23 | , double 24 | ) where 25 | 26 | import Control.Monad (liftM) 27 | import Data.Char (isDigit, isHexDigit, ord) 28 | import Data.Int (Int8, Int16, Int32, Int64) 29 | import Data.Ratio ((%)) 30 | import Data.Text as T 31 | import Data.Text.Private (span_) 32 | import Data.Word (Word, Word8, Word16, Word32, Word64) 33 | 34 | -- | Read some text. If the read succeeds, return its value and the 35 | -- remaining text, otherwise an error message. 36 | type Reader a = Text -> Either String (a,Text) 37 | 38 | -- | Read a decimal integer. The input must begin with at least one 39 | -- decimal digit, and is consumed until a non-digit or end of string 40 | -- is reached. 41 | -- 42 | -- This function does not handle leading sign characters. If you need 43 | -- to handle signed input, use @'signed' 'decimal'@. 44 | -- 45 | -- /Note/: For fixed-width integer types, this function does not 46 | -- attempt to detect overflow, so a sufficiently long input may give 47 | -- incorrect results. If you are worried about overflow, use 48 | -- 'Integer' for your result type. 
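--
-- For example (behaviour sketched from the description above):
--
-- >decimal "123 and more" == Right (123, " and more")
-- >decimal "abc"          == Left "input does not start with a digit"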
49 | decimal :: Integral a => Reader a 50 | {-# SPECIALIZE decimal :: Reader Int #-} 51 | {-# SPECIALIZE decimal :: Reader Int8 #-} 52 | {-# SPECIALIZE decimal :: Reader Int16 #-} 53 | {-# SPECIALIZE decimal :: Reader Int32 #-} 54 | {-# SPECIALIZE decimal :: Reader Int64 #-} 55 | {-# SPECIALIZE decimal :: Reader Integer #-} 56 | {-# SPECIALIZE decimal :: Reader Word #-} 57 | {-# SPECIALIZE decimal :: Reader Word8 #-} 58 | {-# SPECIALIZE decimal :: Reader Word16 #-} 59 | {-# SPECIALIZE decimal :: Reader Word32 #-} 60 | {-# SPECIALIZE decimal :: Reader Word64 #-} 61 | decimal txt 62 | | T.null h = Left "input does not start with a digit" 63 | | otherwise = Right (T.foldl' go 0 h, t) 64 | where (# h,t #) = span_ isDigit txt 65 | go n d = (n * 10 + fromIntegral (digitToInt d)) 66 | 67 | -- | Read a hexadecimal integer, consisting of an optional leading 68 | -- @\"0x\"@ followed by at least one decimal digit. Input is consumed 69 | -- until a non-hex-digit or end of string is reached. This function 70 | -- is case insensitive. 71 | -- 72 | -- This function does not handle leading sign characters. If you need 73 | -- to handle signed input, use @'signed' 'hexadecimal'@. 74 | -- 75 | -- /Note/: For fixed-width integer types, this function does not 76 | -- attempt to detect overflow, so a sufficiently long input may give 77 | -- incorrect results. If you are worried about overflow, use 78 | -- 'Integer' for your result type. 79 | hexadecimal :: Integral a => Reader a 80 | {-# SPECIALIZE hexadecimal :: Reader Int #-} 81 | {-# SPECIALIZE hexadecimal :: Reader Int8 #-} 82 | {-# SPECIALIZE hexadecimal :: Reader Int16 #-} 83 | {-# SPECIALIZE hexadecimal :: Reader Int32 #-} 84 | {-# SPECIALIZE hexadecimal :: Reader Int64 #-} 85 | {-# SPECIALIZE hexadecimal :: Reader Integer #-} 86 | {-# SPECIALIZE hexadecimal :: Reader Word #-} 87 | {-# SPECIALIZE hexadecimal :: Reader Word8 #-} 88 | {-# SPECIALIZE hexadecimal :: Reader Word16 #-} 89 | {-# SPECIALIZE hexadecimal :: Reader Word32 #-} 90 | {-# SPECIALIZE hexadecimal :: Reader Word64 #-} 91 | hexadecimal txt 92 | | h == "0x" || h == "0X" = hex t 93 | | otherwise = hex txt 94 | where (h,t) = T.splitAt 2 txt 95 | 96 | hex :: Integral a => Reader a 97 | {-# SPECIALIZE hex :: Reader Int #-} 98 | {-# SPECIALIZE hex :: Reader Int8 #-} 99 | {-# SPECIALIZE hex :: Reader Int16 #-} 100 | {-# SPECIALIZE hex :: Reader Int32 #-} 101 | {-# SPECIALIZE hex :: Reader Int64 #-} 102 | {-# SPECIALIZE hex :: Reader Integer #-} 103 | {-# SPECIALIZE hex :: Reader Word #-} 104 | {-# SPECIALIZE hex :: Reader Word8 #-} 105 | {-# SPECIALIZE hex :: Reader Word16 #-} 106 | {-# SPECIALIZE hex :: Reader Word32 #-} 107 | {-# SPECIALIZE hex :: Reader Word64 #-} 108 | hex txt 109 | | T.null h = Left "input does not start with a hexadecimal digit" 110 | | otherwise = Right (T.foldl' go 0 h, t) 111 | where (# h,t #) = span_ isHexDigit txt 112 | go n d = (n * 16 + fromIntegral (hexDigitToInt d)) 113 | 114 | hexDigitToInt :: Char -> Int 115 | hexDigitToInt c 116 | | c >= '0' && c <= '9' = ord c - ord '0' 117 | | c >= 'a' && c <= 'f' = ord c - (ord 'a' - 10) 118 | | otherwise = ord c - (ord 'A' - 10) 119 | 120 | digitToInt :: Char -> Int 121 | digitToInt c = ord c - ord '0' 122 | 123 | -- | Read an optional leading sign character (@\'-\'@ or @\'+\'@) and 124 | -- apply it to the result of applying the given reader. 125 | signed :: Num a => Reader a -> Reader a 126 | {-# INLINE signed #-} 127 | signed f = runP (signa (P f)) 128 | 129 | -- | Read a rational number. 
130 | -- 131 | -- This function accepts an optional leading sign character, followed 132 | -- by at least one decimal digit. The syntax is similar to that accepted 133 | -- by the 'read' function, with the exception that a trailing @\'.\'@ 134 | -- or @\'e\'@ /not/ followed by a number is not consumed. 135 | -- 136 | -- Examples (with behaviour identical to 'read'): 137 | -- 138 | -- >rational "3" == Right (3.0, "") 139 | -- >rational "3.1" == Right (3.1, "") 140 | -- >rational "3e4" == Right (30000.0, "") 141 | -- >rational "3.1e4" == Right (31000.0, "") 142 | -- >rational ".3" == Left "input does not start with a digit" 143 | -- >rational "e3" == Left "input does not start with a digit" 144 | -- 145 | -- Examples of differences from 'read': 146 | -- 147 | -- >rational "3.foo" == Right (3.0, ".foo") 148 | -- >rational "3e" == Right (3.0, "e") 149 | rational :: Fractional a => Reader a 150 | {-# SPECIALIZE rational :: Reader Double #-} 151 | rational = floaty $ \real frac fracDenom -> fromRational $ 152 | real % 1 + frac % fracDenom 153 | 154 | -- | Read a rational number. 155 | -- 156 | -- The syntax accepted by this function is the same as for 'rational'. 157 | -- 158 | -- /Note/: This function is almost ten times faster than 'rational', 159 | -- but is slightly less accurate. 160 | -- 161 | -- The 'Double' type supports about 16 decimal places of accuracy. 162 | -- For 94.2% of numbers, this function and 'rational' give identical 163 | -- results, but for the remaining 5.8%, this function loses precision 164 | -- around the 15th decimal place. For 0.001% of numbers, this 165 | -- function will lose precision at the 13th or 14th decimal place. 166 | double :: Reader Double 167 | double = floaty $ \real frac fracDenom -> 168 | fromIntegral real + 169 | fromIntegral frac / fromIntegral fracDenom 170 | 171 | signa :: Num a => Parser a -> Parser a 172 | {-# SPECIALIZE signa :: Parser Int -> Parser Int #-} 173 | {-# SPECIALIZE signa :: Parser Int8 -> Parser Int8 #-} 174 | {-# SPECIALIZE signa :: Parser Int16 -> Parser Int16 #-} 175 | {-# SPECIALIZE signa :: Parser Int32 -> Parser Int32 #-} 176 | {-# SPECIALIZE signa :: Parser Int64 -> Parser Int64 #-} 177 | {-# SPECIALIZE signa :: Parser Integer -> Parser Integer #-} 178 | signa p = do 179 | sign <- perhaps '+' $ char (\c -> c == '-' || c == '+') 180 | if sign == '+' then p else negate `liftM` p 181 | 182 | newtype Parser a = P { 183 | runP :: Reader a 184 | } 185 | 186 | instance Monad Parser where 187 | return a = P $ \t -> Right (a,t) 188 | {-# INLINE return #-} 189 | m >>= k = P $ \t -> case runP m t of 190 | Left err -> Left err 191 | Right (a,t') -> runP (k a) t' 192 | {-# INLINE (>>=) #-} 193 | fail msg = P $ \_ -> Left msg 194 | 195 | perhaps :: a -> Parser a -> Parser a 196 | perhaps def m = P $ \t -> case runP m t of 197 | Left _ -> Right (def,t) 198 | r@(Right _) -> r 199 | 200 | char :: (Char -> Bool) -> Parser Char 201 | char p = P $ \t -> case T.uncons t of 202 | Just (c,t') | p c -> Right (c,t') 203 | _ -> Left "character does not match" 204 | 205 | data T = T !Integer !Int 206 | 207 | floaty :: Fractional a => (Integer -> Integer -> Integer -> a) -> Reader a 208 | {-# INLINE floaty #-} 209 | floaty f = runP $ do 210 | sign <- perhaps '+' $ char (\c -> c == '-' || c == '+') 211 | real <- P decimal 212 | T fraction fracDigits <- perhaps (T 0 0) $ do 213 | _ <- char (=='.') 214 | digits <- P $ \t -> Right (T.length $ T.takeWhile isDigit t, t) 215 | n <- P decimal 216 | return $ T n digits 217 | let e c = c == 'e' || c == 'E' 218 | 
power <- perhaps 0 (char e >> signa (P decimal) :: Parser Int) 219 | let n = if fracDigits == 0 220 | then if power == 0 221 | then fromIntegral real 222 | else fromIntegral real * (10 ^^ power) 223 | else if power == 0 224 | then f real fraction (10 ^ fracDigits) 225 | else f real fraction (10 ^ fracDigits) * (10 ^^ power) 226 | return $! if sign == '+' 227 | then n 228 | else -n 229 | --------------------------------------------------------------------------------