├── .gitignore
├── Setup.hs
├── ChangeLog.md
├── stack.yaml
├── src
    └── Data
    │   ├── Dispatch.hs
    │   └── WordCount.hs
├── README.md
├── package.yaml
├── LICENSE
├── test
    └── Spec.hs
└── app
    └── Main.hs


/.gitignore:
--------------------------------------------------------------------------------
1 | .stack-work/
2 | hwc.cabal
3 | *~


--------------------------------------------------------------------------------
/Setup.hs:
--------------------------------------------------------------------------------
1 | import Distribution.Simple
2 | main = defaultMain
3 | 


--------------------------------------------------------------------------------
/ChangeLog.md:
--------------------------------------------------------------------------------
1 | # Changelog for hwc
2 | 
3 | ## Unreleased changes
4 | 


--------------------------------------------------------------------------------
/stack.yaml:
--------------------------------------------------------------------------------
1 | resolver: lts-15.2
2 | packages:
3 | - .
4 | extra-deps:
5 | - bytestring-mmap-0.2.2
6 | 


--------------------------------------------------------------------------------
/src/Data/Dispatch.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE LambdaCase #-}
 2 | {-# LANGUAGE TemplateHaskell #-}
 3 | 
 4 | module Data.Dispatch where
 5 | 
 6 | import Control.Monad
 7 | import Language.Haskell.TH
 8 | 
 9 | import Data.WordCount
10 | 
11 | dispatch :: Name -> Name -> Q Exp
12 | dispatch fun bs = reify ''Statistics >>= \case
13 |   TyConI (DataD _ _ _ _ cons _) -> do
14 |     let consNames = [ name | NormalC name _ <- cons ]
15 |     let powerset = filterM (const [True, False]) consNames
16 |     let matches = buildMatch fun bs <$> filter (not . null) powerset
17 |     fallbackMatch <- (\body -> Match WildP (NormalB body) []) <$> [e| error "Unexpected input" |]
18 |     pure $ LamCaseE $ matches <> [fallbackMatch]
19 |   _ -> fail "unsupported type"
20 | 
21 | buildMatch :: Name -> Name -> [Name] -> Match
22 | buildMatch fun bs consNames = Match (ListP $ (`ConP` []) <$> consNames) (NormalB $ VarE 'prettyPrint `AppE` (wcCall `AppE` VarE bs)) []
23 |   where
24 |     wcCall = VarE fun `AppTypeE` foldr1 f (PromotedT <$> consNames)
25 |     f accTy promotedTy = PromotedT '(:::) `AppT` accTy `AppT` promotedTy
26 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # hwc
 2 | 
 3 | This is the common `wc` command, implemented in Haskell.
 4 | Turns out that it's about 4-6 times faster than GNU coreutils' `wc`,
 5 | depending on the exact subset of statistics being computed.
 6 | 
 7 | `hwc` supports counting bytes, words and lines, as well as UTF-8 characters and maximum line length.
 8 | It does so in a composable and _efficient_ manner, so that only those statistics requested by user
 9 | are actually calculated.
10 | 
11 | It all started from [this](https://chrispenner.ca/posts/wc) blog post.
12 | The author of that post manages to beat the C version by resorting to parallel input file handling
13 | (and that's a good demonstration, no critique here — go figure how to parallelize the C version correctly!),
14 | but I was curious if it's possible to beat the (single-threaded) C version by a single-threaded Haskell version.
15 | 
16 | Turns out the answer is yes, and with some very minor modifications to the original author's implementation.
17 | 
18 | For more details about the implementation,
19 | please refer to [this](https://0xd34df00d.me/posts/2020/03/the-joys-and-perils.html) post.
20 | 


--------------------------------------------------------------------------------
/package.yaml:
--------------------------------------------------------------------------------
 1 | name:                hwc
 2 | version:             0.1.0.0
 3 | github:              "0xd34df00d/hwc"
 4 | license:             BSD3
 5 | author:              "Georg Rudoy"
 6 | maintainer:          "0xd34df00d@gmail.com"
 7 | copyright:           "2020 Georg Rudoy"
 8 | 
 9 | extra-source-files:
10 | - README.md
11 | - ChangeLog.md
12 | 
13 | synopsis:            The common Unix wc command, in Haskell, faster than GNU coreutils'
14 | category:            Text
15 | 
16 | # To avoid duplicated efforts in documentation and dealing with the
17 | # complications of embedding Haddock markup inside cabal files, it is
18 | # common to point users to the README.md file.
19 | description:         Please see the README on GitHub at <https://github.com/0xd34df00d/hwc#readme>
20 | 
21 | dependencies:
22 | - base >= 4.7 && < 5
23 | - bytestring
24 | - bytestring-mmap
25 | - optparse-applicative
26 | - template-haskell
27 | - unix
28 | 
29 | ghc-options:
30 | - -fllvm
31 | - -O2
32 | - -Wall
33 | 
34 | library:
35 |   source-dirs: src
36 | 
37 | executables:
38 |   hwc-exe:
39 |     main:                Main.hs
40 |     source-dirs:         app
41 |     ghc-options:
42 |     - -rtsopts
43 |     - -with-rtsopts=-A2m
44 |     dependencies:
45 |     - hwc
46 | 
47 | tests:
48 |   hwc-test:
49 |     main:                Spec.hs
50 |     source-dirs:         test
51 |     ghc-options:
52 |     - -threaded
53 |     - -rtsopts
54 |     - -with-rtsopts=-N
55 |     dependencies:
56 |     - hwc
57 |     - hspec
58 |     - hspec-core
59 |     - QuickCheck
60 |     - text
61 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright Georg Rudoy (c) 2020
 2 | 
 3 | All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions are met:
 7 | 
 8 |     * Redistributions of source code must retain the above copyright
 9 |       notice, this list of conditions and the following disclaimer.
10 | 
11 |     * Redistributions in binary form must reproduce the above
12 |       copyright notice, this list of conditions and the following
13 |       disclaimer in the documentation and/or other materials provided
14 |       with the distribution.
15 | 
16 |     * Neither the name of Georg Rudoy nor the names of other
17 |       contributors may be used to endorse or promote products derived
18 |       from this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | 


--------------------------------------------------------------------------------
/test/Spec.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE OverloadedStrings, ViewPatterns #-}
 2 | {-# LANGUAGE DataKinds, TypeApplications #-}
 3 | 
 4 | import qualified Data.ByteString.Char8 as BS
 5 | import qualified Data.Text as T
 6 | import qualified Data.Text.Encoding as T
 7 | import Data.Char
 8 | import Data.List
 9 | import Test.Hspec
10 | import Test.Hspec.Core.QuickCheck
11 | import Test.QuickCheck
12 | 
13 | import Data.WordCount
14 | 
15 | wrapUnicode :: UnicodeString -> (BS.ByteString, T.Text)
16 | wrapUnicode ustr = (T.encodeUtf8 txt, txt)
17 |   where
18 |     txt = T.pack $ getUnicodeString ustr
19 | 
20 | replaceNonAsciiSpaces :: Char -> Char
21 | replaceNonAsciiSpaces ch | ch >= chr 127 && isSpace ch = '_'
22 |                          | otherwise = ch
23 | 
24 | main :: IO ()
25 | main = hspec $ parallel $ modifyMaxSuccess (const 10000) $ modifyMaxSize (const 1000) $ do
26 |   describe "ASCII support" $ do
27 |     it "Counts bytes correctly" $ property $
28 |       \(getASCIIString -> str) -> wc @'Bytes (BS.pack str) `shouldBe` genericLength str
29 |     it "Counts chars correctly" $ property $
30 |       \(getASCIIString -> str) -> wc @'Chars (BS.pack str) `shouldBe` genericLength str
31 |     it "Counts words correctly" $ property $
32 |       \(getASCIIString -> str) -> wc @'Words (BS.pack str) `shouldBe` genericLength (words str)
33 |     it "Counts lines correctly" $ property $
34 |       \(getASCIIString -> str) -> wc @'Lines (BS.pack str) `shouldBe` genericLength (filter (== '\n') str)
35 |   describe "UTF8 support" $ do
36 |     it "Counts bytes correctly" $ property $
37 |       \(wrapUnicode -> (bs, _))   -> wc @'Bytes bs `shouldBe` fromIntegral (BS.length bs)
38 |     it "Counts chars correctly" $ property $
39 |       \(wrapUnicode -> (bs, txt)) -> wc @'Chars bs `shouldBe` fromIntegral (T.length txt)
40 |     it "Counts words correctly" $ property $
41 |       \(wrapUnicode -> (bs, txt)) -> wc @'Words bs `shouldBe` genericLength (T.words $ T.map replaceNonAsciiSpaces txt)
42 |     it "Counts lines correctly" $ property $
43 |       \(wrapUnicode -> (bs, txt)) -> wc @'Lines bs `shouldBe` fromIntegral (T.count "\n" txt)
44 | 


--------------------------------------------------------------------------------
/app/Main.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE RecordWildCards #-}
 2 | {-# LANGUAGE TypeApplications, DataKinds #-}
 3 | {-# LANGUAGE TemplateHaskell #-}
 4 | {-# LANGUAGE Strict #-}
 5 | 
 6 | module Main where
 7 | 
 8 | import qualified Data.ByteString.Lazy as BSL
 9 | import Control.Monad
10 | import Options.Applicative
11 | import System.Posix.Files
12 | import System.IO.Posix.MMap
13 | 
14 | import Data.Dispatch
15 | import Data.WordCount
16 | 
17 | data Options = Options
18 |   { countBytes :: Bool
19 |   , countChars :: Bool
20 |   , countLines :: Bool
21 |   , countMaxLineLength :: Bool
22 |   , countWords :: Bool
23 |   , files :: [FilePath]
24 |   }
25 | 
26 | options :: Parser Options
27 | options = Options
28 |   <$> switch (long "bytes" <> short 'c' <> help "print the byte counts")
29 |   <*> switch (long "chars" <> short 'm' <> help "print the character counts")
30 |   <*> switch (long "lines" <> short 'l' <> help "print the newline counts")
31 |   <*> switch (long "max-line-length" <> short 'L' <> help "print the maximum display width")
32 |   <*> switch (long "words" <> short 'w' <> help "print the word counts")
33 |   <*> many (argument str (metavar "FILES..."))
34 | 
35 | main :: IO ()
36 | main = do
37 |   Options { .. } <- execParser $ info (options <**> helper) (fullDesc <> progDesc "Print newline, word, and byte counts for each file")
38 |   let selectedStats = map snd $ filter fst [(countBytes, Bytes), (countChars, Chars), (countWords, Words), (countMaxLineLength, MaxLL), (countLines, Lines)]
39 |   let stats | null selectedStats = [Bytes, Words, Lines]
40 |             | otherwise = selectedStats
41 |   forM_ files $ \path -> do
42 |     stat <- getFileStatus path
43 |     if isRegularFile stat || isSymbolicLink stat
44 |       then countStrict stats $ unsafeMMapFile path
45 |       else countLazy stats $ BSL.readFile path
46 | 
47 |   when (null files) $ countLazy stats BSL.getContents
48 |   where
49 |     countStrict stats act = do
50 |       contents <- act
51 |       putStrLn $ $(dispatch 'wc 'contents) stats
52 |     countLazy stats act = do
53 |       contents <- act
54 |       putStrLn $ $(dispatch 'wcLazy 'contents) stats
55 | 


--------------------------------------------------------------------------------
/src/Data/WordCount.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE Strict, RecordWildCards, BinaryLiterals #-}
  2 | {-# LANGUAGE GeneralizedNewtypeDeriving #-}
  3 | {-# LANGUAGE MultiParamTypeClasses, UndecidableInstances, FlexibleInstances #-}
  4 | {-# LANGUAGE TypeFamilyDependencies, FunctionalDependencies, PolyKinds, DataKinds, GADTs, TypeOperators #-}
  5 | {-# LANGUAGE ScopedTypeVariables #-}
  6 | 
  7 | module Data.WordCount where
  8 | 
  9 | import qualified Data.ByteString as BS
 10 | import qualified Data.ByteString.Lazy as BSL
 11 | import Data.Bits
 12 | import Data.Word
 13 | 
 14 | data Statistics = Bytes | Chars | Words | MaxLL | Lines deriving (Eq, Ord)
 15 | data StatCompTyOf = Chunked | ByteOnly
 16 | 
 17 | type family CombineCompTy a b where
 18 |   CombineCompTy 'Chunked 'Chunked = 'Chunked
 19 |   CombineCompTy _ _ = 'ByteOnly
 20 | 
 21 | data StatComputation st compTy where
 22 |   ChunkedComputation :: (st -> Word8 -> st)
 23 |                      -> (st -> BS.ByteString -> st)
 24 |                      -> StatComputation st 'Chunked
 25 |   ByteOnlyComputation :: (st -> Word8 -> st)
 26 |                       -> StatComputation st 'ByteOnly
 27 | 
 28 | class Statistic s res st comp | res -> s, st -> s, s -> res, s -> st, s -> comp where
 29 |   initState :: st
 30 |   extractState :: st -> res
 31 |   prettyPrint :: res -> String
 32 |   computation :: StatComputation st comp
 33 | 
 34 | newtype Tagged a = Tagged Word64 deriving (Eq, Show, Num)
 35 | 
 36 | instance Statistic 'Bytes (Tagged 'Bytes) (Tagged 'Bytes) 'Chunked where
 37 |   initState = 0
 38 |   extractState = id
 39 |   prettyPrint (Tagged n) = show n <> " bytes"
 40 |   computation = ChunkedComputation (\st _ -> st + 1) (\st str -> st + fromIntegral (BS.length str))
 41 | 
 42 | instance Statistic 'Chars (Tagged 'Chars) (Tagged 'Chars) 'ByteOnly where
 43 |   initState = 0
 44 |   extractState = id
 45 |   prettyPrint (Tagged n) = show n <> " characters"
 46 |   computation = ByteOnlyComputation $ \cnt c -> cnt + 1 - fromIntegral (   ((c .&. 0b10000000) `shiftR` 7)
 47 |                                                                        .&. (1 - ((c .&. 0b01000000) `shiftR` 6))
 48 |                                                                        )
 49 | 
 50 | data WordsState = WordsState { ws :: Word64, wasSpace :: Word64 }
 51 | 
 52 | instance Statistic 'Words (Tagged 'Words) WordsState 'ByteOnly where
 53 |   initState = WordsState 0 1
 54 |   extractState WordsState { .. } = Tagged (ws + 1 - wasSpace)
 55 |   prettyPrint (Tagged n) = show n <> " words"
 56 |   computation = ByteOnlyComputation step
 57 |     where
 58 |       step WordsState { .. } c = WordsState (ws + (1 - wasSpace) * isSp) isSp
 59 |         where
 60 |           isSp | c == 32 || c - 9 <= 4 = 1
 61 |                | otherwise = 0
 62 | 
 63 | data MaxLLState = MaxLLState { maxLen :: Word64, curLen :: Word64 }
 64 | 
 65 | instance Statistic 'MaxLL (Tagged 'MaxLL) MaxLLState 'ByteOnly where
 66 |   initState = MaxLLState 0 0
 67 |   extractState MaxLLState { .. } = Tagged $ max maxLen curLen
 68 |   prettyPrint (Tagged n) = show n <> " max line length"
 69 |   computation = ByteOnlyComputation step
 70 |     where
 71 |       step MaxLLState { .. } 9 = MaxLLState maxLen $ curLen + 8 - (curLen `rem` 8)
 72 |       step MaxLLState { .. } 8 = MaxLLState maxLen $ max 0 (curLen - 1)
 73 |       step MaxLLState { .. } c | c == 10
 74 |                               || c == 12
 75 |                               || c == 13 = MaxLLState (max maxLen curLen) 0
 76 |                                | c < 32 = MaxLLState maxLen curLen
 77 |       step MaxLLState { .. } _ = MaxLLState maxLen (curLen + 1)
 78 | 
 79 | instance Statistic 'Lines (Tagged 'Lines) (Tagged 'Lines) 'Chunked where
 80 |   initState = 0
 81 |   extractState = id
 82 |   prettyPrint (Tagged n) = show n <> " lines"
 83 |   computation = ChunkedComputation (\st c -> st + if c == 10 then 1 else 0) (\st str -> st + fromIntegral (BS.count 10 str))
 84 | 
 85 | infixr 5 :::
 86 | data a ::: b = a ::: b deriving (Show)
 87 | 
 88 | instance (Statistic sa resa sta compa, Statistic sb resb stb compb, comp ~ CombineCompTy compa compb)
 89 |        => Statistic (sa '::: sb) (resa ::: resb) (sta ::: stb) comp where
 90 |   initState = initState ::: initState
 91 |   extractState (a ::: b) = extractState a ::: extractState b
 92 |   prettyPrint (a ::: b) = prettyPrint a <> "\n" <> prettyPrint b
 93 |   computation = case (computation :: StatComputation sta compa, computation :: StatComputation stb compb) of
 94 |                      (ByteOnlyComputation a, ChunkedComputation b _) -> ByteOnlyComputation $ combine a b
 95 |                      (ChunkedComputation a _, ByteOnlyComputation b) -> ByteOnlyComputation $ combine a b
 96 |                      (ByteOnlyComputation a, ByteOnlyComputation b)  -> ByteOnlyComputation $ combine a b
 97 |                      (ChunkedComputation stepA chunkA, ChunkedComputation stepB chunkB)
 98 |                                                                      -> ChunkedComputation (combine stepA stepB) (combine chunkA chunkB)
 99 |     where
100 |       combine fa fb = \(a ::: b) w -> fa a w ::: fb b w
101 | 
102 | wc :: forall s res st comp. Statistic s res st comp => BS.ByteString -> res
103 | wc s = extractState $! runCompute computation
104 |   where
105 |     runCompute :: StatComputation st comp -> st
106 |     runCompute (ByteOnlyComputation step) = BS.foldl' step initState s
107 |     runCompute (ChunkedComputation _ chunker) = chunker initState s
108 | 
109 | wcLazy :: forall s res st comp. Statistic s res st comp => BSL.ByteString -> res
110 | wcLazy s = extractState $! runCompute computation
111 |   where
112 |     runCompute :: StatComputation st comp -> st
113 |     runCompute (ByteOnlyComputation step) = BSL.foldl' step initState s
114 |     runCompute (ChunkedComputation _ chunker) = BSL.foldlChunks chunker initState s
115 | 


--------------------------------------------------------------------------------