├── .gitignore ├── Setup.hs ├── ChangeLog.md ├── stack.yaml ├── src └── Data │ ├── Dispatch.hs │ └── WordCount.hs ├── README.md ├── package.yaml ├── LICENSE ├── test └── Spec.hs └── app └── Main.hs /.gitignore: -------------------------------------------------------------------------------- 1 | .stack-work/ 2 | hwc.cabal 3 | *~ -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /ChangeLog.md: -------------------------------------------------------------------------------- 1 | # Changelog for hwc 2 | 3 | ## Unreleased changes 4 | -------------------------------------------------------------------------------- /stack.yaml: -------------------------------------------------------------------------------- 1 | resolver: lts-15.2 2 | packages: 3 | - . 4 | extra-deps: 5 | - bytestring-mmap-0.2.2 6 | -------------------------------------------------------------------------------- /src/Data/Dispatch.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE LambdaCase #-} 2 | {-# LANGUAGE TemplateHaskell #-} 3 | 4 | module Data.Dispatch where 5 | 6 | import Control.Monad 7 | import Language.Haskell.TH 8 | 9 | import Data.WordCount 10 | 11 | dispatch :: Name -> Name -> Q Exp 12 | dispatch fun bs = reify ''Statistics >>= \case 13 | TyConI (DataD _ _ _ _ cons _) -> do 14 | let consNames = [ name | NormalC name _ <- cons ] 15 | let powerset = filterM (const [True, False]) consNames 16 | let matches = buildMatch fun bs <$> filter (not . null) powerset 17 | fallbackMatch <- (\body -> Match WildP (NormalB body) []) <$> [e| error "Unexpected input" |] 18 | pure $ LamCaseE $ matches <> [fallbackMatch] 19 | _ -> fail "unsupported type" 20 | 21 | buildMatch :: Name -> Name -> [Name] -> Match 22 | buildMatch fun bs consNames = Match (ListP $ (`ConP` []) <$> consNames) (NormalB $ VarE 'prettyPrint `AppE` (wcCall `AppE` VarE bs)) [] 23 | where 24 | wcCall = VarE fun `AppTypeE` foldr1 f (PromotedT <$> consNames) 25 | f accTy promotedTy = PromotedT '(:::) `AppT` accTy `AppT` promotedTy 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hwc 2 | 3 | This is the common `wc` command, implemented in Haskell. 4 | Turns out that it's about 4-6 times faster than GNU coreutils' `wc`, 5 | depending on the exact subset of statistics being computed. 6 | 7 | `hwc` supports counting bytes, words and lines, as well as UTF-8 characters and maximum line length. 8 | It does so in a composable and _efficient_ manner, so that only those statistics requested by user 9 | are actually calculated. 10 | 11 | It all started from [this](https://chrispenner.ca/posts/wc) blog post. 12 | The author of that post manages to beat the C version by resorting to parallel input file handling 13 | (and that's a good demonstration, no critique here — go figure how to parallelize the C version correctly!), 14 | but I was curious if it's possible to beat the (single-threaded) C version by a single-threaded Haskell version. 15 | 16 | Turns out the answer is yes, and with some very minor modifications to the original author's implementation. 17 | 18 | For more details about the implementation, 19 | please refer to [this](https://0xd34df00d.me/posts/2020/03/the-joys-and-perils.html) post. 20 | -------------------------------------------------------------------------------- /package.yaml: -------------------------------------------------------------------------------- 1 | name: hwc 2 | version: 0.1.0.0 3 | github: "0xd34df00d/hwc" 4 | license: BSD3 5 | author: "Georg Rudoy" 6 | maintainer: "0xd34df00d@gmail.com" 7 | copyright: "2020 Georg Rudoy" 8 | 9 | extra-source-files: 10 | - README.md 11 | - ChangeLog.md 12 | 13 | synopsis: The common Unix wc command, in Haskell, faster than GNU coreutils' 14 | category: Text 15 | 16 | # To avoid duplicated efforts in documentation and dealing with the 17 | # complications of embedding Haddock markup inside cabal files, it is 18 | # common to point users to the README.md file. 19 | description: Please see the README on GitHub at 20 | 21 | dependencies: 22 | - base >= 4.7 && < 5 23 | - bytestring 24 | - bytestring-mmap 25 | - optparse-applicative 26 | - template-haskell 27 | - unix 28 | 29 | ghc-options: 30 | - -fllvm 31 | - -O2 32 | - -Wall 33 | 34 | library: 35 | source-dirs: src 36 | 37 | executables: 38 | hwc-exe: 39 | main: Main.hs 40 | source-dirs: app 41 | ghc-options: 42 | - -rtsopts 43 | - -with-rtsopts=-A2m 44 | dependencies: 45 | - hwc 46 | 47 | tests: 48 | hwc-test: 49 | main: Spec.hs 50 | source-dirs: test 51 | ghc-options: 52 | - -threaded 53 | - -rtsopts 54 | - -with-rtsopts=-N 55 | dependencies: 56 | - hwc 57 | - hspec 58 | - hspec-core 59 | - QuickCheck 60 | - text 61 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Georg Rudoy (c) 2020 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Georg Rudoy nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /test/Spec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings, ViewPatterns #-} 2 | {-# LANGUAGE DataKinds, TypeApplications #-} 3 | 4 | import qualified Data.ByteString.Char8 as BS 5 | import qualified Data.Text as T 6 | import qualified Data.Text.Encoding as T 7 | import Data.Char 8 | import Data.List 9 | import Test.Hspec 10 | import Test.Hspec.Core.QuickCheck 11 | import Test.QuickCheck 12 | 13 | import Data.WordCount 14 | 15 | wrapUnicode :: UnicodeString -> (BS.ByteString, T.Text) 16 | wrapUnicode ustr = (T.encodeUtf8 txt, txt) 17 | where 18 | txt = T.pack $ getUnicodeString ustr 19 | 20 | replaceNonAsciiSpaces :: Char -> Char 21 | replaceNonAsciiSpaces ch | ch >= chr 127 && isSpace ch = '_' 22 | | otherwise = ch 23 | 24 | main :: IO () 25 | main = hspec $ parallel $ modifyMaxSuccess (const 10000) $ modifyMaxSize (const 1000) $ do 26 | describe "ASCII support" $ do 27 | it "Counts bytes correctly" $ property $ 28 | \(getASCIIString -> str) -> wc @'Bytes (BS.pack str) `shouldBe` genericLength str 29 | it "Counts chars correctly" $ property $ 30 | \(getASCIIString -> str) -> wc @'Chars (BS.pack str) `shouldBe` genericLength str 31 | it "Counts words correctly" $ property $ 32 | \(getASCIIString -> str) -> wc @'Words (BS.pack str) `shouldBe` genericLength (words str) 33 | it "Counts lines correctly" $ property $ 34 | \(getASCIIString -> str) -> wc @'Lines (BS.pack str) `shouldBe` genericLength (filter (== '\n') str) 35 | describe "UTF8 support" $ do 36 | it "Counts bytes correctly" $ property $ 37 | \(wrapUnicode -> (bs, _)) -> wc @'Bytes bs `shouldBe` fromIntegral (BS.length bs) 38 | it "Counts chars correctly" $ property $ 39 | \(wrapUnicode -> (bs, txt)) -> wc @'Chars bs `shouldBe` fromIntegral (T.length txt) 40 | it "Counts words correctly" $ property $ 41 | \(wrapUnicode -> (bs, txt)) -> wc @'Words bs `shouldBe` genericLength (T.words $ T.map replaceNonAsciiSpaces txt) 42 | it "Counts lines correctly" $ property $ 43 | \(wrapUnicode -> (bs, txt)) -> wc @'Lines bs `shouldBe` fromIntegral (T.count "\n" txt) 44 | -------------------------------------------------------------------------------- /app/Main.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE RecordWildCards #-} 2 | {-# LANGUAGE TypeApplications, DataKinds #-} 3 | {-# LANGUAGE TemplateHaskell #-} 4 | {-# LANGUAGE Strict #-} 5 | 6 | module Main where 7 | 8 | import qualified Data.ByteString.Lazy as BSL 9 | import Control.Monad 10 | import Options.Applicative 11 | import System.Posix.Files 12 | import System.IO.Posix.MMap 13 | 14 | import Data.Dispatch 15 | import Data.WordCount 16 | 17 | data Options = Options 18 | { countBytes :: Bool 19 | , countChars :: Bool 20 | , countLines :: Bool 21 | , countMaxLineLength :: Bool 22 | , countWords :: Bool 23 | , files :: [FilePath] 24 | } 25 | 26 | options :: Parser Options 27 | options = Options 28 | <$> switch (long "bytes" <> short 'c' <> help "print the byte counts") 29 | <*> switch (long "chars" <> short 'm' <> help "print the character counts") 30 | <*> switch (long "lines" <> short 'l' <> help "print the newline counts") 31 | <*> switch (long "max-line-length" <> short 'L' <> help "print the maximum display width") 32 | <*> switch (long "words" <> short 'w' <> help "print the word counts") 33 | <*> many (argument str (metavar "FILES...")) 34 | 35 | main :: IO () 36 | main = do 37 | Options { .. } <- execParser $ info (options <**> helper) (fullDesc <> progDesc "Print newline, word, and byte counts for each file") 38 | let selectedStats = map snd $ filter fst [(countBytes, Bytes), (countChars, Chars), (countWords, Words), (countMaxLineLength, MaxLL), (countLines, Lines)] 39 | let stats | null selectedStats = [Bytes, Words, Lines] 40 | | otherwise = selectedStats 41 | forM_ files $ \path -> do 42 | stat <- getFileStatus path 43 | if isRegularFile stat || isSymbolicLink stat 44 | then countStrict stats $ unsafeMMapFile path 45 | else countLazy stats $ BSL.readFile path 46 | 47 | when (null files) $ countLazy stats BSL.getContents 48 | where 49 | countStrict stats act = do 50 | contents <- act 51 | putStrLn $ $(dispatch 'wc 'contents) stats 52 | countLazy stats act = do 53 | contents <- act 54 | putStrLn $ $(dispatch 'wcLazy 'contents) stats 55 | -------------------------------------------------------------------------------- /src/Data/WordCount.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE Strict, RecordWildCards, BinaryLiterals #-} 2 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 3 | {-# LANGUAGE MultiParamTypeClasses, UndecidableInstances, FlexibleInstances #-} 4 | {-# LANGUAGE TypeFamilyDependencies, FunctionalDependencies, PolyKinds, DataKinds, GADTs, TypeOperators #-} 5 | {-# LANGUAGE ScopedTypeVariables #-} 6 | 7 | module Data.WordCount where 8 | 9 | import qualified Data.ByteString as BS 10 | import qualified Data.ByteString.Lazy as BSL 11 | import Data.Bits 12 | import Data.Word 13 | 14 | data Statistics = Bytes | Chars | Words | MaxLL | Lines deriving (Eq, Ord) 15 | data StatCompTyOf = Chunked | ByteOnly 16 | 17 | type family CombineCompTy a b where 18 | CombineCompTy 'Chunked 'Chunked = 'Chunked 19 | CombineCompTy _ _ = 'ByteOnly 20 | 21 | data StatComputation st compTy where 22 | ChunkedComputation :: (st -> Word8 -> st) 23 | -> (st -> BS.ByteString -> st) 24 | -> StatComputation st 'Chunked 25 | ByteOnlyComputation :: (st -> Word8 -> st) 26 | -> StatComputation st 'ByteOnly 27 | 28 | class Statistic s res st comp | res -> s, st -> s, s -> res, s -> st, s -> comp where 29 | initState :: st 30 | extractState :: st -> res 31 | prettyPrint :: res -> String 32 | computation :: StatComputation st comp 33 | 34 | newtype Tagged a = Tagged Word64 deriving (Eq, Show, Num) 35 | 36 | instance Statistic 'Bytes (Tagged 'Bytes) (Tagged 'Bytes) 'Chunked where 37 | initState = 0 38 | extractState = id 39 | prettyPrint (Tagged n) = show n <> " bytes" 40 | computation = ChunkedComputation (\st _ -> st + 1) (\st str -> st + fromIntegral (BS.length str)) 41 | 42 | instance Statistic 'Chars (Tagged 'Chars) (Tagged 'Chars) 'ByteOnly where 43 | initState = 0 44 | extractState = id 45 | prettyPrint (Tagged n) = show n <> " characters" 46 | computation = ByteOnlyComputation $ \cnt c -> cnt + 1 - fromIntegral ( ((c .&. 0b10000000) `shiftR` 7) 47 | .&. (1 - ((c .&. 0b01000000) `shiftR` 6)) 48 | ) 49 | 50 | data WordsState = WordsState { ws :: Word64, wasSpace :: Word64 } 51 | 52 | instance Statistic 'Words (Tagged 'Words) WordsState 'ByteOnly where 53 | initState = WordsState 0 1 54 | extractState WordsState { .. } = Tagged (ws + 1 - wasSpace) 55 | prettyPrint (Tagged n) = show n <> " words" 56 | computation = ByteOnlyComputation step 57 | where 58 | step WordsState { .. } c = WordsState (ws + (1 - wasSpace) * isSp) isSp 59 | where 60 | isSp | c == 32 || c - 9 <= 4 = 1 61 | | otherwise = 0 62 | 63 | data MaxLLState = MaxLLState { maxLen :: Word64, curLen :: Word64 } 64 | 65 | instance Statistic 'MaxLL (Tagged 'MaxLL) MaxLLState 'ByteOnly where 66 | initState = MaxLLState 0 0 67 | extractState MaxLLState { .. } = Tagged $ max maxLen curLen 68 | prettyPrint (Tagged n) = show n <> " max line length" 69 | computation = ByteOnlyComputation step 70 | where 71 | step MaxLLState { .. } 9 = MaxLLState maxLen $ curLen + 8 - (curLen `rem` 8) 72 | step MaxLLState { .. } 8 = MaxLLState maxLen $ max 0 (curLen - 1) 73 | step MaxLLState { .. } c | c == 10 74 | || c == 12 75 | || c == 13 = MaxLLState (max maxLen curLen) 0 76 | | c < 32 = MaxLLState maxLen curLen 77 | step MaxLLState { .. } _ = MaxLLState maxLen (curLen + 1) 78 | 79 | instance Statistic 'Lines (Tagged 'Lines) (Tagged 'Lines) 'Chunked where 80 | initState = 0 81 | extractState = id 82 | prettyPrint (Tagged n) = show n <> " lines" 83 | computation = ChunkedComputation (\st c -> st + if c == 10 then 1 else 0) (\st str -> st + fromIntegral (BS.count 10 str)) 84 | 85 | infixr 5 ::: 86 | data a ::: b = a ::: b deriving (Show) 87 | 88 | instance (Statistic sa resa sta compa, Statistic sb resb stb compb, comp ~ CombineCompTy compa compb) 89 | => Statistic (sa '::: sb) (resa ::: resb) (sta ::: stb) comp where 90 | initState = initState ::: initState 91 | extractState (a ::: b) = extractState a ::: extractState b 92 | prettyPrint (a ::: b) = prettyPrint a <> "\n" <> prettyPrint b 93 | computation = case (computation :: StatComputation sta compa, computation :: StatComputation stb compb) of 94 | (ByteOnlyComputation a, ChunkedComputation b _) -> ByteOnlyComputation $ combine a b 95 | (ChunkedComputation a _, ByteOnlyComputation b) -> ByteOnlyComputation $ combine a b 96 | (ByteOnlyComputation a, ByteOnlyComputation b) -> ByteOnlyComputation $ combine a b 97 | (ChunkedComputation stepA chunkA, ChunkedComputation stepB chunkB) 98 | -> ChunkedComputation (combine stepA stepB) (combine chunkA chunkB) 99 | where 100 | combine fa fb = \(a ::: b) w -> fa a w ::: fb b w 101 | 102 | wc :: forall s res st comp. Statistic s res st comp => BS.ByteString -> res 103 | wc s = extractState $! runCompute computation 104 | where 105 | runCompute :: StatComputation st comp -> st 106 | runCompute (ByteOnlyComputation step) = BS.foldl' step initState s 107 | runCompute (ChunkedComputation _ chunker) = chunker initState s 108 | 109 | wcLazy :: forall s res st comp. Statistic s res st comp => BSL.ByteString -> res 110 | wcLazy s = extractState $! runCompute computation 111 | where 112 | runCompute :: StatComputation st comp -> st 113 | runCompute (ByteOnlyComputation step) = BSL.foldl' step initState s 114 | runCompute (ChunkedComputation _ chunker) = BSL.foldlChunks chunker initState s 115 | --------------------------------------------------------------------------------