├── app
├── App
│ ├── Main.hs
│ ├── Show.hs
│ ├── XPath
│ │ ├── Types.hs
│ │ └── Parser.hs
│ ├── Options.hs
│ ├── Commands.hs
│ ├── Commands
│ │ ├── Types.hs
│ │ ├── CreateBlankedXml.hs
│ │ ├── CreateIbIndex.hs
│ │ ├── CreateBpIndex.hs
│ │ ├── CreateIndex.hs
│ │ ├── Demo.hs
│ │ └── Count.hs
│ └── Naive.hs
└── Main.hs
├── Setup.hs
├── test
├── Spec.hs
└── HaskellWorks
│ └── Data
│ └── Xml
│ ├── Succinct
│ ├── Cursor
│ │ ├── BlankedXmlSpec.hs
│ │ ├── InterestBitsSpec.hs
│ │ └── BalancedParensSpec.hs
│ ├── CursorSpec.hs
│ └── CursorSpec
│ │ └── Make.hs
│ ├── Token
│ └── TokenizeSpec.hs
│ ├── TypeSpec.hs
│ ├── RawValueSpec.hs
│ └── Internal
│ └── BlankSpec.hs
├── cabal.project
├── src
└── HaskellWorks
│ └── Data
│ ├── Xml
│ ├── Succinct.hs
│ ├── Token.hs
│ ├── Succinct
│ │ ├── Cursor.hs
│ │ ├── Cursor
│ │ │ ├── Types.hs
│ │ │ ├── Load.hs
│ │ │ ├── BlankedXml.hs
│ │ │ ├── Token.hs
│ │ │ ├── Create.hs
│ │ │ ├── MMap.hs
│ │ │ ├── BalancedParens.hs
│ │ │ ├── InterestBits.hs
│ │ │ └── Internal.hs
│ │ └── Index.hs
│ ├── Internal
│ │ ├── Show.hs
│ │ ├── ByteString.hs
│ │ ├── ToIbBp64.hs
│ │ ├── Tables.hs
│ │ ├── Words.hs
│ │ ├── BalancedParens.hs
│ │ ├── List.hs
│ │ └── Blank.hs
│ ├── RawDecode.hs
│ ├── DecodeError.hs
│ ├── Token
│ │ ├── Types.hs
│ │ └── Tokenize.hs
│ ├── Lens.hs
│ ├── CharLike.hs
│ ├── Index.hs
│ ├── DecodeResult.hs
│ ├── Decode.hs
│ ├── Grammar.hs
│ ├── Type.hs
│ ├── Value.hs
│ ├── RawValue.hs
│ └── Blank.hs
│ └── Xml.hs
├── doctest
└── DoctestDriver.hs
├── .gitignore
├── README.md
├── .github
├── FUNDING.yml
└── workflows
│ └── haskell.yml
├── Criteria
├── project.sh
├── LICENSE
├── .vscode
└── tasks.json
├── bench
└── Main.hs
├── .stylish-haskell.yaml
├── data
└── catalog.xml
└── hw-xml.cabal
/app/App/Main.hs:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/Setup.hs:
--------------------------------------------------------------------------------
1 | import Distribution.Simple
2 | main = defaultMain
3 |
--------------------------------------------------------------------------------
/test/Spec.hs:
--------------------------------------------------------------------------------
1 | {-# OPTIONS_GHC -F -pgmF hspec-discover #-}
2 |
--------------------------------------------------------------------------------
/cabal.project:
--------------------------------------------------------------------------------
1 | packages: .
2 |
3 | allow-newer:
4 | hedgehog:template-haskell,
5 | doctest:ghc,
6 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Succinct.hs:
--------------------------------------------------------------------------------
1 | module HaskellWorks.Data.Xml.Succinct
2 | ( module X
3 | ) where
4 |
5 | import HaskellWorks.Data.Xml.Succinct.Cursor as X
6 |
--------------------------------------------------------------------------------
/app/App/Show.hs:
--------------------------------------------------------------------------------
1 | module App.Show
2 | ( tshow
3 | ) where
4 |
5 | import Data.Text (Text)
6 |
7 | import qualified Data.Text as T
8 |
9 | tshow :: Show a => a -> Text
10 | tshow = T.pack . show
11 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Token.hs:
--------------------------------------------------------------------------------
1 | module HaskellWorks.Data.Xml.Token
2 | ( module X
3 | ) where
4 |
5 | import HaskellWorks.Data.Xml.Token.Types as X
6 | import HaskellWorks.Data.Xml.Token.Tokenize as X
7 |
--------------------------------------------------------------------------------
/app/App/XPath/Types.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric #-}
2 |
3 | module App.XPath.Types where
4 |
5 | import Data.Text (Text)
6 | import GHC.Generics
7 |
8 | newtype XPath = XPath
9 | { path :: [Text]
10 | } deriving (Eq, Show, Generic)
11 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Succinct/Cursor.hs:
--------------------------------------------------------------------------------
1 |
2 | module HaskellWorks.Data.Xml.Succinct.Cursor
3 | ( module X
4 | ) where
5 |
6 | import HaskellWorks.Data.Xml.Succinct.Cursor.Internal as X
7 | import HaskellWorks.Data.Xml.Succinct.Cursor.Token as X
8 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Internal/Show.hs:
--------------------------------------------------------------------------------
1 | module HaskellWorks.Data.Xml.Internal.Show
2 | ( tshow
3 | ) where
4 |
5 | import Data.Text (Text)
6 |
7 | import qualified Data.Text as T
8 |
9 | tshow :: Show a => a -> Text
10 | tshow = T.pack . show
11 |
--------------------------------------------------------------------------------
/app/Main.hs:
--------------------------------------------------------------------------------
1 | module Main where
2 |
3 | import App.Commands
4 | import Control.Monad
5 | import Options.Applicative
6 |
7 | main :: IO ()
8 | main = join $ customExecParser
9 | (prefs $ showHelpOnEmpty <> showHelpOnError)
10 | (info (commands <**> helper) idm)
11 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml.hs:
--------------------------------------------------------------------------------
1 | module HaskellWorks.Data.Xml
2 | ( module X
3 | ) where
4 |
5 | import HaskellWorks.Data.Xml.Decode as X
6 | import HaskellWorks.Data.Xml.DecodeError as X
7 | import HaskellWorks.Data.Xml.Succinct as X
8 | import HaskellWorks.Data.Xml.Token as X
9 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/RawDecode.hs:
--------------------------------------------------------------------------------
1 | module HaskellWorks.Data.Xml.RawDecode where
2 |
3 | import HaskellWorks.Data.Xml.RawValue
4 |
5 | class RawDecode a where
6 | rawDecode :: RawValue -> a
7 |
8 | instance RawDecode RawValue where
9 | rawDecode = id
10 | {-# INLINE rawDecode #-}
11 |
--------------------------------------------------------------------------------
/doctest/DoctestDriver.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE CPP #-}
2 |
3 | #if MIN_VERSION_GLASGOW_HASKELL(8,4,4,0)
4 | {-# OPTIONS_GHC -F -pgmF doctest-discover #-}
5 | #else
6 | module Main where
7 |
8 | import qualified System.IO as IO
9 |
10 | main :: IO ()
11 | main = IO.putStrLn "WARNING: doctest will not run on GHC versions earlier than 8.4.4"
12 | #endif
13 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/DecodeError.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric #-}
2 | {-# LANGUAGE GeneralizedNewtypeDeriving #-}
3 |
4 | module HaskellWorks.Data.Xml.DecodeError where
5 |
6 | import Control.DeepSeq
7 | import Data.Text (Text)
8 | import GHC.Generics
9 |
10 | newtype DecodeError = DecodeError Text deriving (Eq, Show, Generic, NFData)
11 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Token/Types.hs:
--------------------------------------------------------------------------------
1 | module HaskellWorks.Data.Xml.Token.Types (XmlToken(..)) where
2 |
3 | data XmlToken s d
4 | = XmlTokenBraceL
5 | | XmlTokenBraceR
6 | | XmlTokenBracketL
7 | | XmlTokenBracketR
8 | | XmlTokenComma
9 | | XmlTokenColon
10 | | XmlTokenWhitespace
11 | | XmlTokenString s
12 | | XmlTokenBoolean Bool
13 | | XmlTokenNumber d
14 | | XmlTokenNull
15 | deriving (Eq, Show)
16 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | dist
2 | dist-newstyle
3 | cabal-dev
4 | data/bench/
5 | *.o
6 | *.hi
7 | *.chi
8 | *.chs.h
9 | *.dyn_o
10 | *.dyn_hi
11 | .hpc
12 | .hsenv
13 | .cabal-sandbox/
14 | cabal.sandbox.config
15 | *.prof
16 | *.aux
17 | *.hp
18 | .ghc.environment*
19 |
20 | /*.submodules
21 | /.stack-work/
22 | /result
23 | /deps
24 | /*.xml
25 | /*.xml.ib
26 | /*.xml.bp
27 | /snapshot.yaml
28 | /stack-ci.yaml
29 | /*.gz
30 | /*.idx
31 |
32 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Lens.hs:
--------------------------------------------------------------------------------
1 | module HaskellWorks.Data.Xml.Lens where
2 |
3 | import Control.Lens
4 | import Data.Text (Text)
5 | import HaskellWorks.Data.Xml.Value
6 |
7 | isTagNamed :: Text -> Value -> Bool
8 | isTagNamed a (XmlElement b _ _) | a == b = True
9 | isTagNamed _ _ = False
10 |
11 | tagNamed :: (Applicative f, Choice p) => Text -> Optic' p f Value Value
12 | tagNamed = filtered . isTagNamed
13 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Internal/ByteString.hs:
--------------------------------------------------------------------------------
1 | module HaskellWorks.Data.Xml.Internal.ByteString
2 | ( repartitionMod8
3 | ) where
4 |
5 | import Data.ByteString (ByteString)
6 |
7 | import qualified Data.ByteString as BS
8 |
9 | repartitionMod8 :: ByteString -> ByteString -> (ByteString, ByteString)
10 | repartitionMod8 aBS bBS = (BS.take cLen abBS, BS.drop cLen abBS)
11 | where abBS = BS.concat [aBS, bBS]
12 | abLen = BS.length abBS
13 | cLen = (abLen `div` 8) * 8
14 | {-# INLINE repartitionMod8 #-}
15 |
--------------------------------------------------------------------------------
/app/App/XPath/Parser.hs:
--------------------------------------------------------------------------------
1 | module App.XPath.Parser
2 | ( path
3 | ) where
4 |
5 | import Control.Applicative
6 | import Data.Attoparsec.Text
7 | import Data.Text (Text)
8 |
9 | import qualified App.XPath.Types as XP
10 | import qualified Data.Text as T
11 |
12 | tag :: Parser Text
13 | tag = T.cons <$> letter <*> tagTail
14 |
15 | tagTail :: Parser Text
16 | tagTail = T.pack <$> many (letter <|> digit <|> char '-' <|> char '_')
17 |
18 | path :: Parser XP.XPath
19 | path = XP.XPath <$> sepBy1 tag (char '/')
20 |
--------------------------------------------------------------------------------
/app/App/Options.hs:
--------------------------------------------------------------------------------
1 | module App.Options
2 | ( optionParser
3 | , textOption
4 | ) where
5 |
6 | import Data.Text (Text)
7 |
8 | import qualified Data.Attoparsec.Text as AT
9 | import qualified Data.Text as T
10 | import qualified Options.Applicative as OA
11 |
12 | optionParser :: AT.Parser a -> OA.Mod OA.OptionFields a -> OA.Parser a
13 | optionParser p = OA.option (OA.eitherReader (AT.parseOnly p . T.pack))
14 |
15 | textOption :: OA.Mod OA.OptionFields String -> OA.Parser Text
16 | textOption = fmap T.pack . OA.strOption
17 |
--------------------------------------------------------------------------------
/app/App/Commands.hs:
--------------------------------------------------------------------------------
1 | module App.Commands where
2 |
3 | import App.Commands.Count
4 | import App.Commands.CreateBlankedXml
5 | import App.Commands.CreateBpIndex
6 | import App.Commands.CreateIbIndex
7 | import App.Commands.CreateIndex
8 | import App.Commands.Demo
9 | import Options.Applicative
10 |
11 | commands :: Parser (IO ())
12 | commands = commandsGeneral
13 |
14 | commandsGeneral :: Parser (IO ())
15 | commandsGeneral = subparser $ mempty
16 | <> commandGroup "Commands:"
17 | <> cmdCount
18 | <> cmdCreateIndex
19 | <> cmdCreateBlankedXml
20 | <> cmdCreateIbIndex
21 | <> cmdCreateBpIndex
22 | <> cmdDemo
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # hw-xml
2 | [](https://circleci.com/gh/haskell-works/hw-xml)
3 |
4 | `hw-xml` is a high performance XML parsing library. It uses
5 | succinct data-structures to allow traversal of large XML
6 | strings with minimal memory overhead.
7 |
8 | For an example, see [app/Main.hs](../master/app/Main.hs)
9 |
10 | # Notes
11 | * [Semi-Indexing Semi-Structured Data in Tiny Space](http://www.di.unipi.it/~ottavian/files/semi_index_cikm.pdf)
12 | * [Space-Efficient, High-Performance Rank & Select Structures on Uncompressed Bit Sequences](https://www.cs.cmu.edu/~dga/papers/zhou-sea2013.pdf)
13 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/CharLike.hs:
--------------------------------------------------------------------------------
1 | module HaskellWorks.Data.Xml.CharLike where
2 |
3 | import Data.Word
4 | import Data.Word8 as W
5 |
6 | class XmlCharLike c where
7 | isElementStart :: c -> Bool
8 | isExclam :: c -> Bool
9 | isHyphen :: c -> Bool
10 | isOpenBracket :: c -> Bool
11 | isQuestion :: c -> Bool
12 | isQuote :: c -> Bool
13 | isSpace :: c -> Bool
14 |
15 | instance XmlCharLike Word8 where
16 | isElementStart = (== _less)
17 | isExclam = (== _exclam)
18 | isHyphen = (== _hyphen)
19 | isOpenBracket = (== _bracketleft)
20 | isQuestion = (== _question)
21 | isQuote c = c == _quotedbl || c == _quotesingle
22 | isSpace = W.isSpace
23 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Succinct/Cursor/Types.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE ScopedTypeVariables #-}
2 |
3 | module HaskellWorks.Data.Xml.Succinct.Cursor.Types
4 | ( SlowCursor
5 | , FastCursor
6 | ) where
7 |
8 | import Data.Word
9 | import HaskellWorks.Data.BalancedParens.RangeMin2
10 | import HaskellWorks.Data.BalancedParens.Simple
11 | import HaskellWorks.Data.Bits.BitShown
12 | import HaskellWorks.Data.RankSelect.CsPoppy1
13 | import HaskellWorks.Data.Xml.Succinct.Cursor
14 |
15 | import qualified Data.ByteString as BS
16 | import qualified Data.Vector.Storable as DVS
17 |
18 | type SlowCursor = XmlCursor BS.ByteString (BitShown (DVS.Vector Word64)) (SimpleBalancedParens (DVS.Vector Word64))
19 |
20 | type FastCursor = XmlCursor BS.ByteString CsPoppy1 (RangeMin2 CsPoppy1)
21 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: [newhoggy, araga] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
4 | patreon: # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: # Replace with a single Ko-fi username
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Succinct/Cursor/Load.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE ScopedTypeVariables #-}
2 |
3 | module HaskellWorks.Data.Xml.Succinct.Cursor.Load
4 | ( loadSlowCursor
5 | , loadFastCursor
6 | ) where
7 |
8 | import HaskellWorks.Data.Xml.Succinct.Cursor.Create
9 | import HaskellWorks.Data.Xml.Succinct.Cursor.Types
10 |
11 | import qualified Data.ByteString as BS
12 |
13 | -- | Load an XML file into memory and return a raw cursor initialised to the
14 | -- start of the XML document.
15 | loadSlowCursor :: FilePath -> IO SlowCursor
16 | loadSlowCursor = fmap byteStringAsSlowCursor . BS.readFile
17 |
18 | -- | Load an XML file into memory and return a query-optimised cursor initialised
19 | -- to the start of the XML document.
20 | loadFastCursor :: FilePath -> IO FastCursor
21 | loadFastCursor = fmap byteStringAsFastCursor . BS.readFile
22 |
--------------------------------------------------------------------------------
/Criteria:
--------------------------------------------------------------------------------
1 | # Criteria
2 |
3 | ## Data ingestion
4 | * with various options
5 | * over 1G chunks of data (by unzipped)
6 | * and measure ingestion time (in seconds)
7 |
8 | ## API support
9 | * Which languages?
10 | * Is the API convenient?
11 |
12 | ## Resiliance against bad/slow queries
13 | * What is the impact of a bad query on the database
14 | * How easy is it to recover from a bad query
15 | * What happens on node failure, etc.
16 |
17 | ## Query Capabilities
18 | * Is it possible to filter by time range?
19 | * Is it possible to have dynamic querying?
20 |
21 | ## Query performance
22 | * Performance of querying all data for specific attacks
23 | * Performance of metadata for specific attacks
24 | * Performance of time range query
25 | * Performance of dynamic filter query
26 | * Performance of aggressive (duration filter) query for fast-forward
27 |
28 | ## Cost
29 | * How does it cost to handle today's data?
30 | * How does it cost to scale?
31 |
32 |
33 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Internal/ToIbBp64.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE FlexibleContexts #-}
2 | {-# LANGUAGE FlexibleInstances #-}
3 | {-# LANGUAGE InstanceSigs #-}
4 | {-# LANGUAGE MultiParamTypeClasses #-}
5 |
6 | module HaskellWorks.Data.Xml.Internal.ToIbBp64
7 | ( toBalancedParens64
8 | , toInterestBits64
9 | , toIbBp64
10 | ) where
11 |
12 | import Data.ByteString (ByteString)
13 | import HaskellWorks.Data.Xml.Internal.BalancedParens
14 | import HaskellWorks.Data.Xml.Internal.List
15 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml (BlankedXml (..))
16 |
17 | toBalancedParens64 :: BlankedXml -> [ByteString]
18 | toBalancedParens64 (BlankedXml bj) = compressWordAsBit (blankedXmlToBalancedParens bj)
19 |
20 | toInterestBits64 :: BlankedXml -> [ByteString]
21 | toInterestBits64 (BlankedXml bj) = blankedXmlToInterestBits bj
22 |
23 | toIbBp64 :: BlankedXml -> [(ByteString, ByteString)]
24 | toIbBp64 bj = zip (toInterestBits64 bj) (toBalancedParens64 bj)
25 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Succinct/Cursor/BlankedXml.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric #-}
2 | {-# LANGUAGE GeneralizedNewtypeDeriving #-}
3 |
4 | module HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml
5 | ( BlankedXml(..)
6 | , FromBlankedXml(..)
7 | , getBlankedXml
8 | , bsToBlankedXml
9 | , lbsToBlankedXml
10 | ) where
11 |
12 | import Control.DeepSeq
13 | import GHC.Generics
14 | import HaskellWorks.Data.Xml.Internal.Blank
15 |
16 | import qualified Data.ByteString as BS
17 | import qualified Data.ByteString.Lazy as LBS
18 |
19 | newtype BlankedXml = BlankedXml
20 | { unblankedXml :: [BS.ByteString]
21 | } deriving (Eq, Show, Generic, NFData)
22 |
23 | getBlankedXml :: BlankedXml -> [BS.ByteString]
24 | getBlankedXml (BlankedXml bs) = bs
25 |
26 | class FromBlankedXml a where
27 | fromBlankedXml :: BlankedXml -> a
28 |
29 | bsToBlankedXml :: BS.ByteString -> BlankedXml
30 | bsToBlankedXml bs = BlankedXml (blankXml [bs])
31 |
32 | lbsToBlankedXml :: LBS.ByteString -> BlankedXml
33 | lbsToBlankedXml lbs = BlankedXml (blankXml (LBS.toChunks lbs))
34 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Internal/Tables.hs:
--------------------------------------------------------------------------------
1 | module HaskellWorks.Data.Xml.Internal.Tables
2 | ( interestingWord8s
3 | , isInterestingWord8
4 | ) where
5 |
6 | import Data.Word
7 | import Data.Word8
8 | import HaskellWorks.Data.AtIndex ((!!!))
9 | import Prelude as P
10 |
11 | import qualified Data.Vector.Storable as DVS
12 |
13 | interestingWord8s :: DVS.Vector Word8
14 | interestingWord8s = DVS.constructN 256 go
15 | where go :: DVS.Vector Word8 -> Word8
16 | go v = if w == _bracketleft
17 | || w == _braceleft
18 | || w == _parenleft
19 | || w == _bracketleft
20 | || w == _less
21 | || w == _a
22 | || w == _v
23 | || w == _t
24 | then 1
25 | else 0
26 | where w :: Word8
27 | w = fromIntegral (DVS.length v)
28 | {-# NOINLINE interestingWord8s #-}
29 |
30 | isInterestingWord8 :: Word8 -> Word8
31 | isInterestingWord8 b = fromIntegral (interestingWord8s !!! fromIntegral b)
32 | {-# INLINABLE isInterestingWord8 #-}
33 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Succinct/Cursor/Token.hs:
--------------------------------------------------------------------------------
1 |
2 | module HaskellWorks.Data.Xml.Succinct.Cursor.Token
3 | ( xmlTokenAt
4 | ) where
5 |
6 | import Data.ByteString (ByteString)
7 | import HaskellWorks.Data.Bits.BitWise
8 | import HaskellWorks.Data.Drop
9 | import HaskellWorks.Data.Positioning
10 | import HaskellWorks.Data.RankSelect.Base.Rank1
11 | import HaskellWorks.Data.RankSelect.Base.Select1
12 | import HaskellWorks.Data.Xml.Succinct.Cursor.Internal
13 | import HaskellWorks.Data.Xml.Token.Tokenize
14 | import Prelude hiding (drop)
15 |
16 | import qualified Data.Attoparsec.ByteString.Char8 as ABC
17 |
18 | xmlTokenAt :: (Rank1 w, Select1 v, TestBit w) => XmlCursor ByteString v w -> Maybe (XmlToken String Double)
19 | xmlTokenAt k = if balancedParens k .?. lastPositionOf (cursorRank k)
20 | then case ABC.parse parseXmlToken (drop (toCount (xmlCursorPos k)) (cursorText k)) of
21 | ABC.Fail {} -> error "Failed to parse token in cursor"
22 | ABC.Partial _ -> error "Failed to parse token in cursor"
23 | ABC.Done _ r -> Just r
24 | else Nothing
25 |
--------------------------------------------------------------------------------
/test/HaskellWorks/Data/Xml/Succinct/Cursor/BlankedXmlSpec.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE FlexibleContexts #-}
2 | {-# LANGUAGE OverloadedStrings #-}
3 |
4 | module HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXmlSpec
5 | ( spec
6 | ) where
7 |
8 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml
9 | import HaskellWorks.Hspec.Hedgehog
10 | import Hedgehog
11 | import Test.Hspec
12 |
13 | {- HLINT ignore "Redundant do" -}
14 |
15 | spec :: Spec
16 | spec = describe "HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXmlSpec" $ do
17 | describe "Blanking XML should work" $ do
18 | it "on strict bytestrings" $ requireTest $ do
19 | let input = ""
20 | let expected = "< < > >"
21 | let blankedXml = bsToBlankedXml input
22 |
23 | mconcat (unblankedXml blankedXml) === expected
24 |
25 | it "on lazy bytestrings" $ requireTest $ do
26 | let input = ""
27 | let expected = "< < > >"
28 | let blankedXml = lbsToBlankedXml input
29 |
30 | mconcat (unblankedXml blankedXml) === expected
31 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Internal/Words.hs:
--------------------------------------------------------------------------------
1 | module HaskellWorks.Data.Xml.Internal.Words where
2 |
3 | import Data.Word
4 | import Data.Word8
5 |
6 | isLeadingDigit :: Word8 -> Bool
7 | isLeadingDigit w = w == _hyphen || (w >= _0 && w <= _9)
8 |
9 | isTrailingDigit :: Word8 -> Bool
10 | isTrailingDigit w = w == _plus || w == _hyphen || (w >= _0 && w <= _9) || w == _period || w == _E || w == _e
11 |
12 | isAlphabetic :: Word8 -> Bool
13 | isAlphabetic w = (w >= _A && w <= _Z) || (w >= _a && w <= _z)
14 |
15 | isQuote :: Word8 -> Bool
16 | isQuote w = w == _quotedbl || w == _quotesingle
17 |
18 | isNameStartChar :: Word8 -> Bool
19 | isNameStartChar w = w == _underscore || w == _colon || isAlphabetic w
20 | || w `isIn` (0xc0, 0xd6)
21 | || w `isIn` (0xd8, 0xf6)
22 | || w `isIn` (0xf8, 0xff)
23 |
24 | isNameChar :: Word8 -> Bool
25 | isNameChar w = isNameStartChar w || w == _hyphen || w == _period
26 | || w == 0xb7 || w `isIn` (0, 9)
27 |
28 | isXml :: Word8 -> Bool
29 | isXml w = w == _less || w == _greater
30 |
31 | isTextStart :: Word8 -> Bool
32 | isTextStart w = not (isSpace w) && w /= _less && w /= _greater
33 |
34 | isIn :: Word8 -> (Word8, Word8) -> Bool
35 | isIn w (s, e) = w >= s && w <= e
36 | {-# INLINE isIn #-}
37 |
--------------------------------------------------------------------------------
/app/App/Commands/Types.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric #-}
2 | {-# LANGUAGE DuplicateRecordFields #-}
3 |
4 | module App.Commands.Types
5 | ( CountOptions(..)
6 | , CreateBlankedXmlOptions(..)
7 | , CreateBpIndexOptions(..)
8 | , CreateIbIndexOptions(..)
9 | , CreateIndexOptions(..)
10 | , DemoOptions(..)
11 | ) where
12 |
13 | import App.XPath.Types (XPath)
14 | import Data.Text (Text)
15 | import GHC.Generics
16 |
17 | data DemoOptions = DemoOptions deriving (Eq, Show, Generic)
18 |
19 | data CountOptions = CountOptions
20 | { input :: FilePath
21 | , xpath :: XPath
22 | , method :: Text
23 | } deriving (Eq, Show, Generic)
24 |
25 | data CreateIndexOptions = CreateIndexOptions
26 | { input :: FilePath
27 | , ibOutput :: FilePath
28 | , bpOutput :: FilePath
29 | , method :: Text
30 | } deriving (Eq, Show, Generic)
31 |
32 | data CreateBlankedXmlOptions = CreateBlankedXmlOptions
33 | { input :: FilePath
34 | , output :: FilePath
35 | } deriving (Eq, Show, Generic)
36 |
37 | data CreateIbIndexOptions = CreateIbIndexOptions
38 | { input :: FilePath
39 | , output :: FilePath
40 | } deriving (Eq, Show, Generic)
41 |
42 | data CreateBpIndexOptions = CreateBpIndexOptions
43 | { input :: FilePath
44 | , output :: FilePath
45 | } deriving (Eq, Show, Generic)
46 |
--------------------------------------------------------------------------------
/project.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | CABAL_FLAGS="-j8"
4 |
5 | cmd="$1"
6 |
7 | shift
8 |
9 | cabal-install() {
10 | cabal v2-install \
11 | -j8 \
12 | --installdir="$HOME/.local/bin" \
13 | --overwrite-policy=always \
14 | --disable-documentation \
15 | $CABAL_FLAGS "$@"
16 | }
17 |
18 | cabal-build() {
19 | cabal v2-build \
20 | --enable-tests \
21 | --write-ghc-environment-files=ghc8.4.4+ \
22 | $CABAL_FLAGS "$@"
23 | }
24 |
25 | cabal-test() {
26 | cabal v2-test \
27 | --enable-tests \
28 | --test-show-details=direct \
29 | --test-options='+RTS -g1' \
30 | $CABAL_FLAGS "$@"
31 | }
32 |
33 | cabal-exec() {
34 | cabal v2-exec "$(echo *.cabal | cut -d . -f 1)" "$@"
35 | }
36 |
37 | cabal-bench() {
38 | cabal v2-bench -j8 \
39 | $CABAL_FLAGS "$@"
40 | }
41 |
42 | cabal-repl() {
43 | cabal v2-repl \
44 | $CABAL_FLAGS "$@"
45 | }
46 |
47 | cabal-clean() {
48 | cabal v2-clean
49 | }
50 |
51 | case "$cmd" in
52 | install)
53 | cabal-install
54 | ;;
55 |
56 | build)
57 | cabal-build
58 | ;;
59 |
60 | exec)
61 | cabal-exec
62 | ;;
63 |
64 | test)
65 | cabal-build
66 | cabal-test
67 | ;;
68 |
69 | bench)
70 | cabal-bench
71 | ;;
72 |
73 | repl)
74 | cabal-repl
75 | ;;
76 |
77 | clean)
78 | cabal-clean
79 | ;;
80 |
81 | *)
82 | echo "Unrecognised command: $cmd"
83 | exit 1
84 | esac
85 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Succinct/Cursor/Create.hs:
--------------------------------------------------------------------------------
1 | module HaskellWorks.Data.Xml.Succinct.Cursor.Create
2 | ( byteStringAsFastCursor
3 | , byteStringAsSlowCursor
4 | ) where
5 |
6 | import Data.Coerce
7 | import HaskellWorks.Data.BalancedParens.RangeMin2
8 | import HaskellWorks.Data.BalancedParens.Simple
9 | import HaskellWorks.Data.Bits.BitShown
10 | import HaskellWorks.Data.RankSelect.CsPoppy1
11 | import HaskellWorks.Data.Vector.Storable
12 | import HaskellWorks.Data.Xml.Succinct.Cursor
13 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml
14 | import HaskellWorks.Data.Xml.Succinct.Cursor.Types
15 |
16 | import qualified Data.ByteString as BS
17 | import qualified HaskellWorks.Data.Xml.Internal.ToIbBp64 as I
18 |
19 | byteStringAsSlowCursor :: BS.ByteString -> SlowCursor
20 | byteStringAsSlowCursor bs = XmlCursor
21 | { cursorText = bs
22 | , interests = BitShown ib
23 | , balancedParens = SimpleBalancedParens bp
24 | , cursorRank = 1
25 | }
26 | where blankedXml = bsToBlankedXml bs
27 | bsLen = BS.length bs
28 | idxLen = (bsLen + 7) `div` 8
29 | (ib, bp) = construct64UnzipN idxLen (I.toIbBp64 blankedXml)
30 |
31 | byteStringAsFastCursor :: BS.ByteString -> FastCursor
32 | byteStringAsFastCursor bs = XmlCursor bs ibCsPoppy rangeMinMax r
33 | where XmlCursor _ ib bp r = byteStringAsSlowCursor bs
34 | bpCsPoppy = makeCsPoppy (coerce bp)
35 | rangeMinMax = mkRangeMin2 bpCsPoppy
36 | ibCsPoppy = makeCsPoppy (coerce ib)
37 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Index.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveAnyClass #-}
2 | {-# LANGUAGE DeriveGeneric #-}
3 | {-# LANGUAGE FlexibleInstances #-}
4 |
5 | module HaskellWorks.Data.Xml.Index
6 | ( Index(..)
7 | , indexVersion
8 | ) where
9 |
10 | import Control.DeepSeq
11 | import Data.Serialize
12 | import Data.Word
13 | import GHC.Generics
14 | import HaskellWorks.Data.Bits.BitShown
15 |
16 | import qualified Data.Vector.Storable as DVS
17 |
18 | indexVersion :: String
19 | indexVersion = "1.0"
20 |
21 | data Index = Index
22 | { xiVersion :: String
23 | , xiInterests :: BitShown (DVS.Vector Word64)
24 | , xiBalancedParens :: BitShown (DVS.Vector Word64)
25 | } deriving (Eq, Show, Generic, NFData)
26 |
27 | putBitShownVector :: Putter (BitShown (DVS.Vector Word64))
28 | putBitShownVector = putVector . bitShown
29 |
30 | getBitShownVector :: Get (BitShown (DVS.Vector Word64))
31 | getBitShownVector = BitShown <$> getVector
32 |
33 | putVector :: DVS.Vector Word64 -> Put
34 | putVector v = do
35 | let len = DVS.length v
36 | put len
37 | DVS.forM_ v put
38 |
39 | getVector :: Get (DVS.Vector Word64)
40 | getVector = do
41 | len <- get
42 | DVS.generateM len (const get)
43 |
44 | instance Serialize Index where
45 | put xi = do
46 | put $ xiVersion xi
47 | putBitShownVector $ xiInterests xi
48 | putBitShownVector $ xiBalancedParens xi
49 |
50 | get = do
51 | version <- get
52 | ib <- getBitShownVector
53 | bp <- getBitShownVector
54 | return $ Index version ib bp
55 |
--------------------------------------------------------------------------------
/app/App/Commands/CreateBlankedXml.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DataKinds #-}
2 | {-# LANGUAGE DeriveGeneric #-}
3 | {-# LANGUAGE FlexibleInstances #-}
4 | {-# LANGUAGE OverloadedStrings #-}
5 | {-# LANGUAGE ScopedTypeVariables #-}
6 | {-# LANGUAGE TypeApplications #-}
7 | {-# LANGUAGE TypeSynonymInstances #-}
8 |
9 | module App.Commands.CreateBlankedXml
10 | ( cmdCreateBlankedXml
11 | ) where
12 |
13 | import Control.Lens
14 | import Data.Generics.Product.Any
15 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml
16 | import Options.Applicative hiding (columns)
17 |
18 | import qualified App.Commands.Types as Z
19 | import qualified Data.ByteString.Lazy as LBS
20 |
21 | runCreateBlankedXml :: Z.CreateBlankedXmlOptions -> IO ()
22 | runCreateBlankedXml opt = do
23 | let input = opt ^. the @"input"
24 | let output = opt ^. the @"output"
25 |
26 | lbs <- LBS.readFile input
27 | let blankedXml = lbsToBlankedXml lbs
28 | LBS.writeFile output (LBS.fromChunks (blankedXml ^. the @1))
29 |
30 | return ()
31 |
32 | optsCreateBlankedXml :: Parser Z.CreateBlankedXmlOptions
33 | optsCreateBlankedXml = Z.CreateBlankedXmlOptions
34 | <$> strOption
35 | ( long "input"
36 | <> help "Input file"
37 | <> metavar "FILE"
38 | )
39 | <*> strOption
40 | ( long "output"
41 | <> help "Blanked XML output"
42 | <> metavar "FILE"
43 | )
44 |
45 | cmdCreateBlankedXml :: Mod CommandFields (IO ())
46 | cmdCreateBlankedXml = command "create-blanked-xml" $ flip info idm $ runCreateBlankedXml <$> optsCreateBlankedXml
47 |
--------------------------------------------------------------------------------
/app/App/Commands/CreateIbIndex.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DataKinds #-}
2 | {-# LANGUAGE DeriveGeneric #-}
3 | {-# LANGUAGE FlexibleInstances #-}
4 | {-# LANGUAGE OverloadedStrings #-}
5 | {-# LANGUAGE ScopedTypeVariables #-}
6 | {-# LANGUAGE TypeApplications #-}
7 | {-# LANGUAGE TypeSynonymInstances #-}
8 |
9 | module App.Commands.CreateIbIndex
10 | ( cmdCreateIbIndex
11 | ) where
12 |
13 | import Control.Lens
14 | import Data.Generics.Product.Any
15 | import HaskellWorks.Data.Xml.Internal.ToIbBp64
16 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml
17 | import Options.Applicative hiding (columns)
18 |
19 | import qualified App.Commands.Types as Z
20 | import qualified Data.ByteString.Lazy as LBS
21 |
22 | runCreateIbIndex :: Z.CreateIbIndexOptions -> IO ()
23 | runCreateIbIndex opt = do
24 | let input = opt ^. the @"input"
25 | let output = opt ^. the @"output"
26 |
27 | lbs <- LBS.readFile input
28 | let blankedXml = lbsToBlankedXml lbs
29 | let ib = toInterestBits64 blankedXml
30 | LBS.writeFile output (LBS.fromChunks ib)
31 |
32 | optsCreateIbIndex :: Parser Z.CreateIbIndexOptions
33 | optsCreateIbIndex = Z.CreateIbIndexOptions
34 | <$> strOption
35 | ( long "input"
36 | <> help "Input file"
37 | <> metavar "FILE"
38 | )
39 | <*> strOption
40 | ( long "output"
41 | <> help "Interest Bits output"
42 | <> metavar "FILE"
43 | )
44 |
45 | cmdCreateIbIndex :: Mod CommandFields (IO ())
46 | cmdCreateIbIndex = command "create-ib-index" $ flip info idm $ runCreateIbIndex <$> optsCreateIbIndex
47 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright John Ky, Alexey Raga (c) 2016-2017
2 |
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | * Redistributions of source code must retain the above copyright
9 | notice, this list of conditions and the following disclaimer.
10 |
11 | * Redistributions in binary form must reproduce the above
12 | copyright notice, this list of conditions and the following
13 | disclaimer in the documentation and/or other materials provided
14 | with the distribution.
15 |
16 | * Neither the name of Author name here nor the names of other
17 | contributors may be used to endorse or promote products derived
18 | from this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 |
--------------------------------------------------------------------------------
/app/App/Commands/CreateBpIndex.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DataKinds #-}
2 | {-# LANGUAGE DeriveGeneric #-}
3 | {-# LANGUAGE FlexibleInstances #-}
4 | {-# LANGUAGE OverloadedStrings #-}
5 | {-# LANGUAGE ScopedTypeVariables #-}
6 | {-# LANGUAGE TypeApplications #-}
7 | {-# LANGUAGE TypeSynonymInstances #-}
8 |
9 | module App.Commands.CreateBpIndex
10 | ( cmdCreateBpIndex
11 | ) where
12 |
13 | import Control.Lens
14 | import Data.Generics.Product.Any
15 | import HaskellWorks.Data.Xml.Internal.ToIbBp64
16 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml
17 | import Options.Applicative hiding (columns)
18 |
19 | import qualified App.Commands.Types as Z
20 | import qualified Data.ByteString.Lazy as LBS
21 |
22 | runCreateBpIndex :: Z.CreateBpIndexOptions -> IO ()
23 | runCreateBpIndex opt = do
24 | let input = opt ^. the @"input"
25 | let output = opt ^. the @"output"
26 |
27 | lbs <- LBS.readFile input
28 | let blankedXml = lbsToBlankedXml lbs
29 | let ib = toBalancedParens64 blankedXml
30 | LBS.writeFile output (LBS.fromChunks ib)
31 |
32 | return ()
33 |
34 | optsCreateBpIndex :: Parser Z.CreateBpIndexOptions
35 | optsCreateBpIndex = Z.CreateBpIndexOptions
36 | <$> strOption
37 | ( long "input"
38 | <> help "Input file"
39 | <> metavar "FILE"
40 | )
41 | <*> strOption
42 | ( long "output"
43 | <> help "Balanced parens output"
44 | <> metavar "FILE"
45 | )
46 |
47 | cmdCreateBpIndex :: Mod CommandFields (IO ())
48 | cmdCreateBpIndex = command "create-bp-index" $ flip info idm $ runCreateBpIndex <$> optsCreateBpIndex
49 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Internal/BalancedParens.hs:
--------------------------------------------------------------------------------
1 | module HaskellWorks.Data.Xml.Internal.BalancedParens
2 | ( blankedXmlToBalancedParens
3 | ) where
4 |
5 | import Data.ByteString (ByteString)
6 | import Data.Word
7 | import Data.Word8
8 |
9 | import qualified Data.ByteString as BS
10 |
11 | data MiniBP = MiniN | MiniT | MiniF | MiniTF
12 |
13 | blankedXmlToBalancedParens :: [ByteString] -> [ByteString]
14 | blankedXmlToBalancedParens is = case is of
15 | (bs:bss) -> do
16 | let (cs, _) = BS.unfoldrN (BS.length bs * 2) gen (Nothing, bs)
17 | cs:blankedXmlToBalancedParens bss
18 | [] -> []
19 | where gen :: (Maybe Bool, ByteString) -> Maybe (Word8, (Maybe Bool, ByteString))
20 | gen (Just True , bs) = Just (0xFF, (Nothing, bs))
21 | gen (Just False , bs) = Just (0x00, (Nothing, bs))
22 | gen (Nothing , bs) = case BS.uncons bs of
23 | Just (c, cs) -> case balancedParensOf c of
24 | MiniN -> gen (Nothing , cs)
25 | MiniT -> Just (0xFF, (Nothing , cs))
26 | MiniF -> Just (0x00, (Nothing , cs))
27 | MiniTF -> Just (0xFF, (Just False , cs))
28 | Nothing -> Nothing
29 |
30 | balancedParensOf :: Word8 -> MiniBP
31 | balancedParensOf c = case c of
32 | d | d == _less -> MiniT
33 | d | d == _greater -> MiniF
34 | d | d == _bracketleft -> MiniT
35 | d | d == _bracketright -> MiniF
36 | d | d == _parenleft -> MiniT
37 | d | d == _parenright -> MiniF
38 | d | d == _t -> MiniTF
39 | d | d == _a -> MiniTF
40 | d | d == _v -> MiniTF
41 | _ -> MiniN
42 |
43 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/DecodeResult.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveFunctor #-}
2 | {-# LANGUAGE OverloadedStrings #-}
3 |
4 | module HaskellWorks.Data.Xml.DecodeResult where
5 |
6 | import Control.Applicative
7 | import HaskellWorks.Data.Xml.DecodeError
8 |
9 | data DecodeResult a
10 | = DecodeOk a
11 | | DecodeFailed DecodeError
12 | deriving (Eq, Show, Functor)
13 |
14 | instance Applicative DecodeResult where
15 | pure = DecodeOk
16 | {-# INLINE pure #-}
17 |
18 | (<*>) (DecodeOk f ) (DecodeOk a) = DecodeOk (f a)
19 | (<*>) (DecodeOk _ ) (DecodeFailed e) = DecodeFailed e
20 | (<*>) (DecodeFailed e) _ = DecodeFailed e
21 | {-# INLINE (<*>) #-}
22 |
23 | instance Monad DecodeResult where
24 | (>>=) (DecodeOk a) f = f a
25 | (>>=) (DecodeFailed e) _ = DecodeFailed e
26 | {-# INLINE (>>=) #-}
27 |
28 | instance Alternative DecodeResult where
29 | empty = DecodeFailed (DecodeError "Failed decode")
30 | (<|>) (DecodeOk a) _ = DecodeOk a
31 | (<|>) _ (DecodeOk b) = DecodeOk b
32 | (<|>) _ (DecodeFailed e) = DecodeFailed e
33 | {-# INLINE (<|>) #-}
34 |
35 | instance Foldable DecodeResult where
36 | foldr f z (DecodeOk a) = f a z
37 | foldr _ z (DecodeFailed _) = z
38 |
39 | instance Traversable DecodeResult where
40 | traverse _ (DecodeFailed e) = pure (DecodeFailed e)
41 | traverse f (DecodeOk x) = DecodeOk <$> f x
42 |
43 | toEither :: DecodeResult a -> Either DecodeError a
44 | toEither (DecodeOk a) = Right a
45 | toEither (DecodeFailed e) = Left e
46 |
47 | isOk :: DecodeResult a -> Bool
48 | isOk (DecodeOk _) = True
49 | isOk _ = False
50 |
51 | isFailed :: DecodeResult a -> Bool
52 | isFailed (DecodeFailed _) = True
53 | isFailed _ = False
54 |
--------------------------------------------------------------------------------
/app/App/Naive.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE BangPatterns #-}
2 | {-# LANGUAGE DataKinds #-}
3 | {-# LANGUAGE FlexibleInstances #-}
4 | {-# LANGUAGE OverloadedStrings #-}
5 | {-# LANGUAGE ScopedTypeVariables #-}
6 | {-# LANGUAGE TypeApplications #-}
7 | {-# LANGUAGE TypeSynonymInstances #-}
8 |
9 | module App.Naive
10 | ( loadSlowCursor
11 | , loadFastCursor
12 | ) where
13 |
14 | import HaskellWorks.Data.BalancedParens.RangeMin2
15 | import HaskellWorks.Data.BalancedParens.Simple
16 | import HaskellWorks.Data.Bits.BitShown
17 | import HaskellWorks.Data.FromByteString
18 | import HaskellWorks.Data.RankSelect.CsPoppy1
19 | import HaskellWorks.Data.Xml.Succinct.Cursor
20 | import HaskellWorks.Data.Xml.Succinct.Cursor.MMap
21 |
22 | import qualified Data.ByteString as BS
23 |
24 | -- | Load an XML file into memory and return a raw cursor initialised to the
25 | -- start of the XML document.
26 | loadSlowCursor :: FilePath -> IO SlowCursor
27 | loadSlowCursor path = do
28 | !bs <- BS.readFile path
29 | let !cursor = fromByteString bs :: SlowCursor
30 | return cursor
31 |
32 | -- | Load an XML file into memory and return a query-optimised cursor initialised
33 | -- to the start of the XML document.
34 | loadFastCursor :: FilePath -> IO FastCursor
35 | loadFastCursor filename = do
36 | -- Load the XML file into memory as a raw cursor.
37 | -- The raw XML data is `text`, and `ib` and `bp` are the indexes.
38 | -- `ib` and `bp` can be persisted to an index file for later use to avoid
39 | -- re-parsing the file.
40 | XmlCursor !text (BitShown !ib) (SimpleBalancedParens !bp) _ <- loadSlowCursor filename
41 | let !bpCsPoppy = makeCsPoppy bp
42 | let !rangeMinMax = mkRangeMin2 bpCsPoppy
43 | let !ibCsPoppy = makeCsPoppy ib
44 | return $ XmlCursor text ibCsPoppy rangeMinMax 1
45 |
--------------------------------------------------------------------------------
/.vscode/tasks.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": "2.0.0",
3 | "tasks": [
4 | {
5 | "label": "Build",
6 | "type": "shell",
7 | "command": "bash",
8 | "args": ["-lc", "cabal build all --enable-tests && echo 'Done'"],
9 | "group": {
10 | "kind": "build",
11 | "isDefault": true
12 | },
13 | "problemMatcher": {
14 | "owner": "haskell",
15 | "fileLocation": "relative",
16 | "pattern": [
17 | {
18 | "regexp": "^(.+?):(\\d+):(\\d+):\\s+(error|warning|info):.*$",
19 | "file": 1, "line": 2, "column": 3, "severity": 4
20 | },
21 | {
22 | "regexp": "\\s*(.*)$",
23 | "message": 1
24 | }
25 | ]
26 | },
27 | "presentation": {
28 | "echo": false,
29 | "reveal": "always",
30 | "focus": false,
31 | "panel": "shared",
32 | "showReuseMessage": false,
33 | "clear": true
34 | }
35 | },
36 | {
37 | "label": "Test",
38 | "type": "shell",
39 | "command": "bash",
40 | "args": ["-lc", "cabal test all --enable-tests && echo 'Done'"],
41 | "group": {
42 | "kind": "test",
43 | "isDefault": true
44 | },
45 | "problemMatcher": {
46 | "owner": "haskell",
47 | "fileLocation": "relative",
48 | "pattern": [
49 | {
50 | "regexp": "^(.+?):(\\d+):(\\d+):.*$",
51 | "file": 1, "line": 2, "column": 3, "severity": 4
52 | },
53 | {
54 | "regexp": "\\s*(\\d\\)\\s)?(.*)$",
55 | "message": 2
56 | }
57 | ]
58 | },
59 | "presentation": {
60 | "echo": false,
61 | "reveal": "always",
62 | "focus": false,
63 | "panel": "shared",
64 | "showReuseMessage": false,
65 | "clear": true
66 | }
67 | }
68 | ]
69 | }
70 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Succinct/Cursor/MMap.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE BangPatterns #-}
2 | {-# LANGUAGE ScopedTypeVariables #-}
3 |
4 | module HaskellWorks.Data.Xml.Succinct.Cursor.MMap
5 | ( SlowCursor
6 | , FastCursor
7 | , mmapSlowCursor
8 | , mmapFastCursor
9 | ) where
10 |
11 | import Data.Word
12 | import Foreign.ForeignPtr
13 | import HaskellWorks.Data.BalancedParens.RangeMin2
14 | import HaskellWorks.Data.BalancedParens.Simple
15 | import HaskellWorks.Data.Bits.BitShown
16 | import HaskellWorks.Data.RankSelect.CsPoppy1
17 | import HaskellWorks.Data.Vector.Storable
18 | import HaskellWorks.Data.Xml.Succinct.Cursor
19 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml
20 | import HaskellWorks.Data.Xml.Succinct.Cursor.Types
21 |
22 | import qualified Data.ByteString.Internal as BSI
23 | import qualified HaskellWorks.Data.Xml.Internal.ToIbBp64 as I
24 | import qualified System.IO.MMap as IO
25 |
26 | mmapSlowCursor :: FilePath -> IO SlowCursor
27 | mmapSlowCursor filePath = do
28 | (fptr :: ForeignPtr Word8, offset, size) <- IO.mmapFileForeignPtr filePath IO.ReadOnly Nothing
29 | let !bs = BSI.fromForeignPtr (castForeignPtr fptr) offset size
30 | let blankedXml = bsToBlankedXml bs
31 | let (ib, bp) = construct64UnzipN (fromIntegral size) (I.toIbBp64 blankedXml)
32 | let !cursor = XmlCursor
33 | { cursorText = bs
34 | , interests = BitShown ib
35 | , balancedParens = SimpleBalancedParens bp
36 | , cursorRank = 1
37 | }
38 |
39 | return cursor
40 |
41 | mmapFastCursor :: FilePath -> IO FastCursor
42 | mmapFastCursor filename = do
43 | -- Load the XML file into memory as a raw cursor.
44 | -- The raw XML data is `text`, and `ib` and `bp` are the indexes.
45 | -- `ib` and `bp` can be persisted to an index file for later use to avoid
46 | -- re-parsing the file.
47 | XmlCursor !text (BitShown !ib) (SimpleBalancedParens !bp) _ <- mmapSlowCursor filename
48 | let !bpCsPoppy = makeCsPoppy bp
49 | let !rangeMinMax = mkRangeMin2 bpCsPoppy
50 | let !ibCsPoppy = makeCsPoppy ib
51 | return $ XmlCursor text ibCsPoppy rangeMinMax 1
52 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Decode.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE OverloadedStrings #-}
2 |
3 | module HaskellWorks.Data.Xml.Decode where
4 |
5 | import Control.Applicative
6 | import Control.Lens
7 | import Control.Monad
8 | import Data.Foldable
9 | import Data.Text (Text)
10 | import HaskellWorks.Data.Xml.DecodeError
11 | import HaskellWorks.Data.Xml.DecodeResult
12 | import HaskellWorks.Data.Xml.Internal.Show
13 | import HaskellWorks.Data.Xml.Value
14 |
15 | class Decode a where
16 | decode :: Value -> DecodeResult a
17 |
18 | instance Decode Value where
19 | decode = DecodeOk
20 | {-# INLINE decode #-}
21 |
22 | failDecode :: Text -> DecodeResult a
23 | failDecode = DecodeFailed . DecodeError
24 |
25 | (@>) :: Value -> Text -> DecodeResult Text
26 | (@>) (XmlElement _ as _) n = case find (\v -> fst v == n) as of
27 | Just (_, text) -> DecodeOk text
28 | Nothing -> failDecode $ "No such attribute " <> tshow n
29 | (@>) _ n = failDecode $ "Not an element whilst looking up attribute " <> tshow n
30 |
31 | (/>) :: Value -> Text -> DecodeResult Value
32 | (/>) (XmlElement _ _ cs) n = go cs
33 | where go [] = failDecode $ "Unable to find element " <> tshow n
34 | go (r:rs) = case r of
35 | e@(XmlElement n' _ _) | n' == n -> DecodeOk e
36 | _ -> go rs
37 | (/>) _ n = failDecode $ "Expecting parent of element " <> tshow n
38 |
39 | (?>) :: Value -> (Value -> DecodeResult Value) -> DecodeResult Value
40 | (?>) v f = f v <|> pure v
41 |
42 | (~>) :: Value -> Text -> DecodeResult Value
43 | (~>) e@(XmlElement n' _ _) n | n' == n = DecodeOk e
44 | (~>) _ n = failDecode $ "Expecting parent of element " <> tshow n
45 |
46 | (/>>) :: Value -> Text -> DecodeResult [Value]
47 | (/>>) v n = v ^. childNodes <&> (~> n) <&> toList & join & pure
48 |
49 | -- Contextful
50 |
51 | (>) :: DecodeResult Value -> Text -> DecodeResult Value
52 | (>) ma n = ma >>= (/> n)
53 |
54 | (<@>) :: DecodeResult Value -> Text -> DecodeResult Text
55 | (<@>) ma n = ma >>= (@> n)
56 |
57 | (>) :: DecodeResult Value -> (Value -> DecodeResult Value) -> DecodeResult Value
58 | (>) ma f = ma >>= (?> f)
59 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Grammar.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE FlexibleContexts #-}
2 | {-# LANGUAGE FlexibleInstances #-}
3 | {-# LANGUAGE InstanceSigs #-}
4 | {-# LANGUAGE MultiParamTypeClasses #-}
5 | {-# LANGUAGE OverloadedStrings #-}
6 | {-# LANGUAGE ScopedTypeVariables #-}
7 |
8 | module HaskellWorks.Data.Xml.Grammar where
9 |
10 | import Control.Applicative
11 | import Data.Char
12 | import Data.String
13 | import Data.Text (Text)
14 | import Data.Word
15 | import HaskellWorks.Data.Parser
16 |
17 | import qualified Data.Attoparsec.Types as T
18 | import qualified Data.Text as T
19 | import qualified HaskellWorks.Data.Parser as P
20 |
21 | data XmlElementType
22 | = XmlElementTypeDocument
23 | | XmlElementTypeElement Text
24 | | XmlElementTypeComment
25 | | XmlElementTypeCData
26 | | XmlElementTypeMeta Text
27 |
28 | parseXmlString :: (P.Parser t Word8) => T.Parser t Text
29 | parseXmlString = do
30 | q <- satisfyChar (=='"') <|> satisfyChar (=='\'')
31 | T.pack <$> many (satisfyChar (/= q))
32 |
33 | parseXmlElement :: (P.Parser t Word8, IsString t) => T.Parser t XmlElementType
34 | parseXmlElement = comment <|> cdata <|> doc <|> meta <|> element
35 | where
36 | comment = const XmlElementTypeComment <$> string "!--"
37 | cdata = const XmlElementTypeCData <$> string "![CDATA["
38 | meta = XmlElementTypeMeta <$> (string "!" >> parseXmlToken)
39 | doc = const XmlElementTypeDocument <$> string "?xml"
40 | element = XmlElementTypeElement <$> parseXmlToken
41 |
42 | parseXmlToken :: (P.Parser t Word8) => T.Parser t Text
43 | parseXmlToken = T.pack <$> many (satisfyChar isNameChar > "invalid string character")
44 |
45 | parseXmlAttributeName :: (P.Parser t Word8) => T.Parser t Text
46 | parseXmlAttributeName = parseXmlToken
47 |
48 | isNameStartChar :: Char -> Bool
49 | isNameStartChar w =
50 | let iw = ord w
51 | in w == '_' || w == ':' || isAlpha w
52 | || (iw >= 0xc0 && iw <= 0xd6)
53 | || (iw >= 0xd8 && iw <= 0xf6)
54 | || (iw >= 0xf8 && iw <= 0xff)
55 |
56 | isNameChar :: Char -> Bool
57 | isNameChar w = isNameStartChar w || w == '-' || w == '.'
58 | || ord w == 0xb7 || isNumber w
59 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Type.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE FlexibleInstances #-}
2 | {-# LANGUAGE InstanceSigs #-}
3 | {-# LANGUAGE MultiParamTypeClasses #-}
4 |
5 | module HaskellWorks.Data.Xml.Type where
6 |
7 | import Data.Char
8 | import Data.Word8 as W8
9 | import HaskellWorks.Data.Bits.BitWise
10 | import HaskellWorks.Data.Drop
11 | import HaskellWorks.Data.Positioning
12 | import HaskellWorks.Data.RankSelect.Base.Rank0
13 | import HaskellWorks.Data.RankSelect.Base.Rank1
14 | import HaskellWorks.Data.RankSelect.Base.Select1
15 | import HaskellWorks.Data.Xml.Succinct
16 | import Prelude hiding (drop)
17 |
18 | import qualified Data.ByteString as BS
19 | import qualified HaskellWorks.Data.BalancedParens as BP
20 |
21 | {- HLINT ignore "Reduce duplication" -}
22 |
23 | data XmlType
24 | = XmlTypeElement
25 | | XmlTypeAttrList
26 | | XmlTypeToken
27 | deriving (Eq, Show)
28 |
29 | class XmlTypeAt a where
30 | xmlTypeAtPosition :: Position -> a -> Maybe XmlType
31 | xmlTypeAt :: a -> Maybe XmlType
32 |
33 | instance (BP.BalancedParens w, Rank0 w, Rank1 w, Select1 v, TestBit w) => XmlTypeAt (XmlCursor String v w) where
34 | xmlTypeAtPosition p k = case drop (toCount p) (cursorText k) of
35 | c:_ | fromIntegral (ord c) == _less -> Just XmlTypeElement
36 | c:_ | W8.isSpace $ fromIntegral (ord c) -> Just XmlTypeAttrList
37 | _ -> Just XmlTypeToken
38 |
39 | xmlTypeAt k = xmlTypeAtPosition p k
40 | where p = lastPositionOf (select1 ik (rank1 bpk (cursorRank k)))
41 | ik = interests k
42 | bpk = balancedParens k
43 |
44 | instance (BP.BalancedParens w, Rank0 w, Rank1 w, Select1 v, TestBit w) => XmlTypeAt (XmlCursor BS.ByteString v w) where
45 | xmlTypeAtPosition p k = case BS.uncons (drop (toCount p) (cursorText k)) of
46 | Just (c, _) | c == _less -> Just XmlTypeElement
47 | Just (c, _) | W8.isSpace c -> Just XmlTypeAttrList
48 | _ -> Just XmlTypeToken
49 |
50 | xmlTypeAt k = xmlTypeAtPosition p k
51 | where p = lastPositionOf (select1 ik (rank1 bpk (cursorRank k)))
52 | ik = interests k
53 | bpk = balancedParens k
54 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Internal/List.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE OverloadedStrings #-}
2 | {-# LANGUAGE RankNTypes #-}
3 |
4 | module HaskellWorks.Data.Xml.Internal.List
5 | ( blankedXmlToInterestBits
6 | , compressWordAsBit
7 | ) where
8 |
9 | import Data.ByteString (ByteString)
10 | import Data.Word
11 | import HaskellWorks.Data.Bits.BitWise
12 | import HaskellWorks.Data.Xml.Internal.ByteString
13 | import HaskellWorks.Data.Xml.Internal.Tables
14 | import Prelude
15 |
16 | import qualified Data.ByteString as BS
17 |
18 | blankedXmlToInterestBits :: [ByteString] -> [ByteString]
19 | blankedXmlToInterestBits = blankedXmlToInterestBits' ""
20 |
21 | blankedXmlToInterestBits' :: ByteString -> [ByteString] -> [ByteString]
22 | blankedXmlToInterestBits' rs is = case is of
23 | (bs:bss) -> do
24 | let cs = if BS.length rs /= 0 then BS.concat [rs, bs] else bs
25 | let lencs = BS.length cs
26 | let q = lencs `quot` 8
27 | let (ds, es) = BS.splitAt (q * 8) cs
28 | let (fs, _) = BS.unfoldrN q gen ds
29 | fs:blankedXmlToInterestBits' es bss
30 | [] -> do
31 | let lenrs = BS.length rs
32 | let q = lenrs + 7 `quot` 8
33 | [fst (BS.unfoldrN q gen rs)]
34 | where gen :: ByteString -> Maybe (Word8, ByteString)
35 | gen as = if BS.length as == 0
36 | then Nothing
37 | else Just ( BS.foldr' (\b m -> isInterestingWord8 b .|. (m .<. 1)) 0 (BS.take 8 as)
38 | , BS.drop 8 as
39 | )
40 |
41 | compressWordAsBit :: [ByteString] -> [ByteString]
42 | compressWordAsBit = compressWordAsBit' BS.empty
43 |
44 | compressWordAsBit' :: ByteString -> [ByteString] -> [ByteString]
45 | compressWordAsBit' aBS iBS = case iBS of
46 | (bBS:bBSs) -> do
47 | let (cBS, dBS) = repartitionMod8 aBS bBS
48 | let (cs, _) = BS.unfoldrN (BS.length cBS + 7 `div` 8) gen cBS
49 | cs:compressWordAsBit' dBS bBSs
50 | [] -> do
51 | let (cs, _) = BS.unfoldrN (BS.length aBS + 7 `div` 8) gen aBS
52 | [cs]
53 | where gen :: ByteString -> Maybe (Word8, ByteString)
54 | gen xs = if BS.length xs == 0
55 | then Nothing
56 | else Just ( BS.foldr' (\b m -> ((b .&. 1) .|. (m .<. 1))) 0 (BS.take 8 xs)
57 | , BS.drop 8 xs
58 | )
59 |
--------------------------------------------------------------------------------
/bench/Main.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE BangPatterns #-}
2 | {-# LANGUAGE ScopedTypeVariables #-}
3 |
4 | module Main where
5 |
6 | import Criterion.Main
7 | import Data.ByteString (ByteString)
8 | import Data.Word
9 | import Foreign
10 | import HaskellWorks.Data.BalancedParens.Simple
11 | import HaskellWorks.Data.Bits.BitShown
12 | import HaskellWorks.Data.FromByteString
13 | import HaskellWorks.Data.Xml.Internal.Blank
14 | import HaskellWorks.Data.Xml.Internal.List
15 | import HaskellWorks.Data.Xml.Internal.Tables
16 | import HaskellWorks.Data.Xml.Succinct.Cursor
17 | import System.IO.MMap
18 |
19 | import qualified Data.ByteString as BS
20 | import qualified Data.ByteString.Internal as BSI
21 | import qualified Data.Vector.Storable as DVS
22 |
23 | setupEnvXml :: FilePath -> IO ByteString
24 | setupEnvXml filepath = do
25 | (fptr :: ForeignPtr Word8, offset, size) <- mmapFileForeignPtr filepath ReadOnly Nothing
26 | let !bs = BSI.fromForeignPtr (castForeignPtr fptr) offset size
27 | return bs
28 |
29 | loadXml :: ByteString -> XmlCursor ByteString (BitShown (DVS.Vector Word64)) (SimpleBalancedParens (DVS.Vector Word64))
30 | loadXml bs = fromByteString bs :: XmlCursor ByteString (BitShown (DVS.Vector Word64)) (SimpleBalancedParens (DVS.Vector Word64))
31 |
32 | xmlToInterestBits3 :: [ByteString] -> [ByteString]
33 | xmlToInterestBits3 = blankedXmlToInterestBits . blankXml
34 |
35 | runCon :: ([i] -> [ByteString]) -> i -> ByteString
36 | runCon con bs = BS.concat $ con [bs]
37 |
38 | benchRankXmlCatalogLists :: [Benchmark]
39 | benchRankXmlCatalogLists =
40 | [ env (setupEnvXml "data/catalog.xml") $ \bs -> bgroup "catalog.xml"
41 | [ bench "Run blankXml" (whnf (runCon blankXml ) bs)
42 | , bench "Run xmlToInterestBits3" (whnf (runCon xmlToInterestBits3) bs)
43 | , bench "loadXml" (whnf loadXml bs)
44 | ]
45 | ]
46 |
47 | setupInterestingWord8s :: IO ()
48 | setupInterestingWord8s = do
49 | let !_ = interestingWord8s
50 | return ()
51 |
52 | benchIsInterestingWord8 :: [Benchmark]
53 | benchIsInterestingWord8 =
54 | [ env setupInterestingWord8s $ \_ -> bgroup "Interesting Word8 lookup"
55 | [ bench "isInterestingWord8" (whnf isInterestingWord8 0)
56 | ]
57 | ]
58 |
59 | main :: IO ()
60 | main = defaultMain $ concat
61 | [ benchIsInterestingWord8
62 | , benchRankXmlCatalogLists
63 | ]
64 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Value.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE FlexibleInstances #-}
2 | {-# LANGUAGE InstanceSigs #-}
3 | {-# LANGUAGE MultiParamTypeClasses #-}
4 | {-# LANGUAGE OverloadedStrings #-}
5 | {-# LANGUAGE ScopedTypeVariables #-}
6 | {-# LANGUAGE TemplateHaskell #-}
7 | {-# LANGUAGE TupleSections #-}
8 |
9 | module HaskellWorks.Data.Xml.Value
10 | ( Value(..)
11 | , HasValue(..)
12 | , _XmlDocument
13 | , _XmlText
14 | , _XmlElement
15 | , _XmlCData
16 | , _XmlComment
17 | , _XmlMeta
18 | , _XmlError
19 | ) where
20 |
21 | import Control.Lens
22 | import Data.Text (Text)
23 | import HaskellWorks.Data.Xml.Internal.Show
24 | import HaskellWorks.Data.Xml.RawDecode
25 | import HaskellWorks.Data.Xml.RawValue
26 |
27 | data Value
28 | = XmlDocument
29 | { _childNodes :: [Value]
30 | }
31 | | XmlText
32 | { _textValue :: Text
33 | }
34 | | XmlElement
35 | { _name :: Text
36 | , _attributes :: [(Text, Text)]
37 | , _childNodes :: [Value]
38 | }
39 | | XmlCData
40 | { _cdata :: Text
41 | }
42 | | XmlComment
43 | { _comment :: Text
44 | }
45 | | XmlMeta
46 | { _name :: Text
47 | , _childNodes :: [Value]
48 | }
49 | | XmlError
50 | { _errorMessage :: Text
51 | }
52 | deriving (Eq, Show)
53 |
54 | makeClassy ''Value
55 | makePrisms ''Value
56 |
57 | instance RawDecode Value where
58 | rawDecode (RawDocument rvs ) = XmlDocument (rawDecode <$> rvs)
59 | rawDecode (RawText text ) = XmlText text
60 | rawDecode (RawElement n cs ) = mkXmlElement n cs
61 | rawDecode (RawCData text ) = XmlCData text
62 | rawDecode (RawComment text ) = XmlComment text
63 | rawDecode (RawMeta n cs ) = XmlMeta n (rawDecode <$> cs)
64 | rawDecode (RawAttrName nameValue ) = XmlError ("Can't decode attribute name: " <> nameValue)
65 | rawDecode (RawAttrValue attrValue ) = XmlError ("Can't decode attribute value: " <> attrValue)
66 | rawDecode (RawAttrList as ) = XmlError ("Can't decode attribute list: " <> tshow as)
67 | rawDecode (RawError msg ) = XmlError msg
68 |
69 | mkXmlElement :: Text -> [RawValue] -> Value
70 | mkXmlElement n (RawAttrList as:cs) = XmlElement n (mkAttrs as) (rawDecode <$> cs)
71 | mkXmlElement n cs = XmlElement n [] (rawDecode <$> cs)
72 |
73 | mkAttrs :: [RawValue] -> [(Text, Text)]
74 | mkAttrs (RawAttrName n:RawAttrValue v:cs) = (n, v):mkAttrs cs
75 | mkAttrs (_:cs) = mkAttrs cs
76 | mkAttrs [] = []
77 |
--------------------------------------------------------------------------------
/test/HaskellWorks/Data/Xml/Succinct/Cursor/InterestBitsSpec.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE FlexibleContexts #-}
2 | {-# LANGUAGE OverloadedStrings #-}
3 |
4 | module HaskellWorks.Data.Xml.Succinct.Cursor.InterestBitsSpec(spec) where
5 |
6 | import Data.String
7 | import Data.Word
8 | import HaskellWorks.Data.Bits.BitShown
9 | import HaskellWorks.Data.FromByteString
10 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml
11 | import HaskellWorks.Data.Xml.Succinct.Cursor.InterestBits
12 | import HaskellWorks.Hspec.Hedgehog
13 | import Hedgehog
14 | import Test.Hspec
15 |
16 | import qualified Data.ByteString as BS
17 | import qualified Data.Vector.Storable as DVS
18 |
19 | {- HLINT ignore "Redundant do" -}
20 |
21 | interestBitsOf :: FromBlankedXml (XmlInterestBits a) => BS.ByteString -> a
22 | interestBitsOf = getXmlInterestBits . fromBlankedXml . bsToBlankedXml
23 |
24 | spec :: Spec
25 | spec = describe "HaskellWorks.Data.Xml.Succinct.Cursor.InterestBitsSpec" $ do
26 | it "Evaluating interest bits" $ requireTest $ do
27 | (interestBitsOf "" :: BitShown (DVS.Vector Word8)) === fromString ""
28 | (interestBitsOf " \n \r \t " :: BitShown (DVS.Vector Word8)) === fromString "00000000"
29 | (interestBitsOf "" :: BitShown (DVS.Vector Word8)) === fromString "10010000 00000000"
33 | (interestBitsOf " " :: BitShown (DVS.Vector Word8)) === fromString "01011010 00000000"
34 | (interestBitsOf " " :: BitShown (DVS.Vector Word8)) === fromString "01000000 00000000"
35 | (interestBitsOf " "
43 | , "< < "
44 | , "> >"
45 | ]
46 | annotate $ "Blanked: " <> show blanked
47 | let ib :: XmlInterestBits (BitShown (DVS.Vector Word8))
48 | ib = XmlInterestBits (getXmlInterestBits (fromBlankedXml (BlankedXml blanked)))
49 | let moo :: [BS.ByteString]
50 | moo = blankedXmlToInterestBits blanked -- :: XmlInterestBits (BitShown (DVS.Vector Word8))
51 | annotate $ "Moo: " <> show (BitShown . BS.unpack <$> moo)
52 | let actual = getXmlInterestBits ib :: BitShown (DVS.Vector Word8)
53 | let expected = fromString "10000110 00000010 00001000 00000100 00000001 00100000 00000000"
54 |
55 | actual === expected
56 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Succinct/Cursor/BalancedParens.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric #-}
2 | {-# LANGUAGE FlexibleContexts #-}
3 | {-# LANGUAGE FlexibleInstances #-}
4 | {-# LANGUAGE GeneralizedNewtypeDeriving #-}
5 | {-# LANGUAGE InstanceSigs #-}
6 | {-# LANGUAGE MultiParamTypeClasses #-}
7 |
8 | module HaskellWorks.Data.Xml.Succinct.Cursor.BalancedParens
9 | ( XmlBalancedParens(..)
10 | , getXmlBalancedParens
11 | ) where
12 |
13 | import Control.Applicative
14 | import Control.DeepSeq
15 | import Data.Word
16 | import GHC.Generics
17 | import HaskellWorks.Data.BalancedParens
18 | import HaskellWorks.Data.Xml.Internal.BalancedParens
19 | import HaskellWorks.Data.Xml.Internal.List
20 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml
21 |
22 | import qualified Data.ByteString as BS
23 | import qualified Data.Vector.Storable as DVS
24 |
25 | newtype XmlBalancedParens a = XmlBalancedParens a deriving (Eq, Show, Generic, NFData)
26 |
27 | getXmlBalancedParens :: XmlBalancedParens a -> a
28 | getXmlBalancedParens (XmlBalancedParens a) = a
29 |
30 | genBitWordsForever :: BS.ByteString -> Maybe (Word8, BS.ByteString)
31 | genBitWordsForever bs = BS.uncons bs <|> Just (0, bs)
32 | {-# INLINABLE genBitWordsForever #-}
33 |
34 | instance FromBlankedXml (XmlBalancedParens (SimpleBalancedParens (DVS.Vector Word8))) where
35 | fromBlankedXml bj = XmlBalancedParens (SimpleBalancedParens (DVS.unsafeCast (DVS.unfoldrN newLen genBitWordsForever interestBS)))
36 | where interestBS = BS.concat (compressWordAsBit (blankedXmlToBalancedParens (getBlankedXml bj)))
37 | newLen = (BS.length interestBS + 7) `div` 8 * 8
38 |
39 | instance FromBlankedXml (XmlBalancedParens (SimpleBalancedParens (DVS.Vector Word16))) where
40 | fromBlankedXml bj = XmlBalancedParens (SimpleBalancedParens (DVS.unsafeCast (DVS.unfoldrN newLen genBitWordsForever interestBS)))
41 | where interestBS = BS.concat (compressWordAsBit (blankedXmlToBalancedParens (getBlankedXml bj)))
42 | newLen = (BS.length interestBS + 7) `div` 8 * 8
43 |
44 | instance FromBlankedXml (XmlBalancedParens (SimpleBalancedParens (DVS.Vector Word32))) where
45 | fromBlankedXml bj = XmlBalancedParens (SimpleBalancedParens (DVS.unsafeCast (DVS.unfoldrN newLen genBitWordsForever interestBS)))
46 | where interestBS = BS.concat (compressWordAsBit (blankedXmlToBalancedParens (getBlankedXml bj)))
47 | newLen = (BS.length interestBS + 7) `div` 8 * 8
48 |
49 | instance FromBlankedXml (XmlBalancedParens (SimpleBalancedParens (DVS.Vector Word64))) where
50 | fromBlankedXml bj = XmlBalancedParens (SimpleBalancedParens (DVS.unsafeCast (DVS.unfoldrN newLen genBitWordsForever interestBS)))
51 | where interestBS = BS.concat (compressWordAsBit (blankedXmlToBalancedParens (getBlankedXml bj)))
52 | newLen = (BS.length interestBS + 7) `div` 8 * 8
53 |
--------------------------------------------------------------------------------
/test/HaskellWorks/Data/Xml/Token/TokenizeSpec.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE OverloadedStrings #-}
2 |
3 | module HaskellWorks.Data.Xml.Token.TokenizeSpec (spec) where
4 |
5 | import Data.ByteString (ByteString)
6 | import HaskellWorks.Data.Xml.Token.Tokenize
7 | import HaskellWorks.Hspec.Hedgehog
8 | import Hedgehog
9 | import Test.Hspec
10 |
11 | import qualified Data.Attoparsec.ByteString.Char8 as BC
12 | import qualified Data.ByteString as BS
13 |
14 | {- HLINT ignore "Redundant do" -}
15 |
16 | parseXmlToken' :: ByteString -> Either String (XmlToken String Double)
17 | parseXmlToken' = BC.parseOnly parseXmlToken
18 |
19 | spec :: Spec
20 | spec = describe "HaskellWorks.Data.Xml.Token.TokenizeSpec" $ do
21 | describe "When parsing single token at beginning of text" $ do
22 | it "Empty Xml should produce no bits" $ requireTest $
23 | parseXmlToken' "" === Left "not enough input"
24 | it "Xml with one space should produce whitespace token" $ requireTest $
25 | parseXmlToken' " " === Right XmlTokenWhitespace
26 | it "Xml with two spaces should produce whitespace token" $ requireTest $
27 | parseXmlToken' " " === Right XmlTokenWhitespace
28 | it "Spaces and newlines should produce no bits" $ requireTest $
29 | parseXmlToken' " \n \r \t " === Right XmlTokenWhitespace
30 | it "`null` at beginning should produce one bit" $ requireTest $
31 | parseXmlToken' "null " === Right XmlTokenNull
32 | it "number at beginning should produce one bit" $ requireTest $
33 | parseXmlToken' "1234 " === Right (XmlTokenNumber 1234)
34 | it "false at beginning should produce one bit" $ requireTest $
35 | parseXmlToken' "false " === Right (XmlTokenBoolean False)
36 | it "true at beginning should produce one bit" $ requireTest $
37 | parseXmlToken' "true " === Right (XmlTokenBoolean True)
38 | it "string at beginning should produce one bit" $ requireTest $
39 | parseXmlToken' "\"hello\" " === Right (XmlTokenString "hello")
40 | it "quoted string should parse" $ requireTest $
41 | parseXmlToken' "\"\\\"\" " === Right (XmlTokenString "\"")
42 | it "left brace at beginning should produce one bit" $ requireTest $
43 | parseXmlToken' "{ " === Right XmlTokenBraceL
44 | it "right brace at beginning should produce one bit" $ requireTest $
45 | parseXmlToken' "} " === Right XmlTokenBraceR
46 | it "left bracket at beginning should produce one bit" $ requireTest $
47 | parseXmlToken' "[ " === Right XmlTokenBracketL
48 | it "right bracket at beginning should produce one bit" $ requireTest $
49 | parseXmlToken' "] " === Right XmlTokenBracketR
50 | it "right bracket at beginning should produce one bit" $ requireTest $
51 | parseXmlToken' ": " === Right XmlTokenColon
52 | it "right bracket at beginning should produce one bit" $ requireTest $
53 | parseXmlToken' ", " === Right XmlTokenComma
54 |
--------------------------------------------------------------------------------
/app/App/Commands/CreateIndex.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DataKinds #-}
2 | {-# LANGUAGE DeriveGeneric #-}
3 | {-# LANGUAGE FlexibleInstances #-}
4 | {-# LANGUAGE OverloadedStrings #-}
5 | {-# LANGUAGE ScopedTypeVariables #-}
6 | {-# LANGUAGE TypeApplications #-}
7 | {-# LANGUAGE TypeSynonymInstances #-}
8 |
9 | module App.Commands.CreateIndex
10 | ( cmdCreateIndex
11 | ) where
12 |
13 | import App.Options
14 | import Control.Lens
15 | import Control.Monad
16 | import Data.Generics.Product.Any
17 | import HaskellWorks.Data.Xml.Internal.ToIbBp64
18 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml
19 | import HaskellWorks.Data.Xml.Succinct.Cursor.MMap
20 | import Options.Applicative hiding (columns)
21 |
22 | import qualified App.Commands.Types as Z
23 | import qualified Data.ByteString as BS
24 | import qualified Data.ByteString.Lazy as LBS
25 | import qualified Data.Text.IO as TIO
26 | import qualified HaskellWorks.Data.ByteString.Lazy as LBS
27 | import qualified Options.Applicative as OA
28 | import qualified System.Exit as IO
29 | import qualified System.IO as IO
30 |
31 | runCreateIndex :: Z.CreateIndexOptions -> IO ()
32 | runCreateIndex opt = do
33 | let input = opt ^. the @"input"
34 | let ibOutput = opt ^. the @"ibOutput"
35 | let bpOutput = opt ^. the @"bpOutput"
36 | let method = opt ^. the @"method"
37 |
38 | case method of
39 | "memory" -> do
40 | cursor <- mmapSlowCursor input
41 |
42 | LBS.writeFile ibOutput (LBS.toLazyByteString (cursor ^. the @"interests" . the @1))
43 | LBS.writeFile bpOutput (LBS.toLazyByteString (cursor ^. the @"balancedParens" . the @1))
44 | "stream" -> do
45 | lbs <- LBS.readFile input
46 | let blankedXml = lbsToBlankedXml lbs
47 | let ibBp = toIbBp64 blankedXml
48 |
49 | hIbOutput <- IO.openFile ibOutput IO.WriteMode
50 | hBpOutput <- IO.openFile bpOutput IO.WriteMode
51 |
52 | forM_ ibBp $ \(ib, bp) -> do
53 | BS.hPut hIbOutput ib
54 | BS.hPut hBpOutput bp
55 |
56 | IO.hClose hIbOutput
57 | IO.hClose hBpOutput
58 |
59 | return ()
60 | unknown -> do
61 | TIO.hPutStrLn IO.stderr $ "Unsupported method: " <> unknown
62 | IO.exitFailure
63 |
64 | optsCreateIndex :: Parser Z.CreateIndexOptions
65 | optsCreateIndex = Z.CreateIndexOptions
66 | <$> strOption
67 | ( long "input"
68 | <> help "Input file"
69 | <> metavar "FILE"
70 | )
71 | <*> strOption
72 | ( long "ib-output"
73 | <> help "Interest Bits output"
74 | <> metavar "FILE"
75 | )
76 | <*> strOption
77 | ( long "bp-output"
78 | <> help "Balanced Parens output"
79 | <> metavar "FILE"
80 | )
81 | <*> textOption
82 | ( long "method"
83 | <> help "Method"
84 | <> metavar "METHOD"
85 | <> OA.value "memory"
86 | )
87 |
88 | cmdCreateIndex :: Mod CommandFields (IO ())
89 | cmdCreateIndex = command "create-index" $ flip info idm $ runCreateIndex <$> optsCreateIndex
90 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Succinct/Cursor/InterestBits.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric #-}
2 | {-# LANGUAGE FlexibleContexts #-}
3 | {-# LANGUAGE FlexibleInstances #-}
4 | {-# LANGUAGE GeneralizedNewtypeDeriving #-}
5 | {-# LANGUAGE InstanceSigs #-}
6 | {-# LANGUAGE MultiParamTypeClasses #-}
7 |
8 | module HaskellWorks.Data.Xml.Succinct.Cursor.InterestBits
9 | ( XmlInterestBits(..)
10 | , getXmlInterestBits
11 | , blankedXmlToInterestBits
12 | , blankedXmlBssToInterestBitsBs
13 | , genInterestForever
14 | ) where
15 |
16 | import Control.Applicative
17 | import Control.DeepSeq
18 | import Data.ByteString.Internal
19 | import Data.Word
20 | import GHC.Generics
21 | import HaskellWorks.Data.Bits.BitShown
22 | import HaskellWorks.Data.FromByteString
23 | import HaskellWorks.Data.RankSelect.Poppy512
24 | import HaskellWorks.Data.Xml.Internal.List
25 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml
26 |
27 | import qualified Data.ByteString as BS
28 | import qualified Data.Vector.Storable as DVS
29 |
30 | newtype XmlInterestBits a = XmlInterestBits a deriving (Eq, Show, Generic, NFData)
31 |
32 | getXmlInterestBits :: XmlInterestBits a -> a
33 | getXmlInterestBits (XmlInterestBits a) = a
34 |
35 | blankedXmlBssToInterestBitsBs :: [ByteString] -> ByteString
36 | blankedXmlBssToInterestBitsBs bss = BS.concat $ blankedXmlToInterestBits bss
37 |
38 | genInterest :: ByteString -> Maybe (Word8, ByteString)
39 | genInterest = BS.uncons
40 |
41 | genInterestForever :: ByteString -> Maybe (Word8, ByteString)
42 | genInterestForever bs = BS.uncons bs <|> Just (0, bs)
43 |
44 | instance FromBlankedXml (XmlInterestBits (BitShown [Bool])) where
45 | fromBlankedXml = XmlInterestBits . fromByteString . BS.concat . blankedXmlToInterestBits . getBlankedXml
46 |
47 | instance FromBlankedXml (XmlInterestBits (BitShown BS.ByteString)) where
48 | fromBlankedXml = XmlInterestBits . BitShown . BS.unfoldr genInterest . blankedXmlBssToInterestBitsBs . getBlankedXml
49 |
50 | instance FromBlankedXml (XmlInterestBits (BitShown (DVS.Vector Word8))) where
51 | fromBlankedXml = XmlInterestBits . BitShown . DVS.unfoldr genInterest . blankedXmlBssToInterestBitsBs . getBlankedXml
52 |
53 | instance FromBlankedXml (XmlInterestBits (BitShown (DVS.Vector Word16))) where
54 | fromBlankedXml bj = XmlInterestBits (BitShown (DVS.unsafeCast (DVS.unfoldrN newLen genInterestForever interestBS)))
55 | where interestBS = blankedXmlBssToInterestBitsBs (getBlankedXml bj)
56 | newLen = (BS.length interestBS + 1) `div` 2 * 2
57 |
58 | instance FromBlankedXml (XmlInterestBits (BitShown (DVS.Vector Word32))) where
59 | fromBlankedXml bj = XmlInterestBits (BitShown (DVS.unsafeCast (DVS.unfoldrN newLen genInterestForever interestBS)))
60 | where interestBS = blankedXmlBssToInterestBitsBs (getBlankedXml bj)
61 | newLen = (BS.length interestBS + 3) `div` 4 * 4
62 |
63 | instance FromBlankedXml (XmlInterestBits (BitShown (DVS.Vector Word64))) where
64 | fromBlankedXml bj = XmlInterestBits (BitShown (DVS.unsafeCast (DVS.unfoldrN newLen genInterestForever interestBS)))
65 | where interestBS = blankedXmlBssToInterestBitsBs (getBlankedXml bj)
66 | newLen = (BS.length interestBS + 7) `div` 8 * 8
67 |
68 | instance FromBlankedXml (XmlInterestBits Poppy512) where
69 | fromBlankedXml = XmlInterestBits . makePoppy512 . bitShown . getXmlInterestBits . fromBlankedXml
70 |
--------------------------------------------------------------------------------
/test/HaskellWorks/Data/Xml/Succinct/Cursor/BalancedParensSpec.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE FlexibleContexts #-}
2 | {-# LANGUAGE OverloadedStrings #-}
3 |
4 | module HaskellWorks.Data.Xml.Succinct.Cursor.BalancedParensSpec
5 | ( spec
6 | ) where
7 |
8 | import Data.String
9 | import HaskellWorks.Data.Bits.BitShown
10 | import HaskellWorks.Data.ByteString
11 | import HaskellWorks.Data.Xml.Internal.BalancedParens
12 | import HaskellWorks.Data.Xml.Internal.Blank
13 | import HaskellWorks.Data.Xml.Internal.List
14 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml
15 | import HaskellWorks.Hspec.Hedgehog
16 | import Hedgehog
17 | import Test.Hspec
18 |
19 | import qualified Data.ByteString as BS
20 |
21 | {- HLINT ignore "Redundant do" -}
22 |
23 | spec :: Spec
24 | spec = describe "HaskellWorks.Data.Xml.Succinct.Cursor.BalancedParensSpec" $ do
25 | it "Blanking XML should work 1" $ requireTest $ do
26 | let blankedXml = BlankedXml [">"]
27 | let bp = BitShown $ BS.concat (compressWordAsBit (blankedXmlToBalancedParens (getBlankedXml blankedXml)))
28 | bp === fromString "11011000"
29 | it "Blanking XML should work 2" $ requireTest $ do
30 | let blankedXml = BlankedXml
31 | [ "<><><><><><><><>"
32 | , "<><><><><><><><>"
33 | ]
34 | let bp = BitShown $ BS.concat (compressWordAsBit (blankedXmlToBalancedParens (getBlankedXml blankedXml)))
35 | bp === fromString
36 | "1010101010101010\
37 | \1010101010101010"
38 |
39 | let unchunkedInput = "\n\n \n \n \n \n \n \n\n"
40 | let chunkedInput = chunkedBy 15 unchunkedInput
41 | let chunkedBlank = blankXml chunkedInput
42 |
43 | let unchunkedBadInput = "\n\n \n \n \n \n \n \n\n"
44 | let chunkedBadInput = chunkedBy 15 unchunkedBadInput
45 | let chunkedBadBlank = blankXml chunkedBadInput
46 |
47 | it "Same input" $ requireTest $ do
48 | unchunkedInput === BS.concat chunkedInput
49 |
50 | it "Blanking XML should work 3" $ requireTest $ do
51 | let bp = BitShown $ BS.concat (compressWordAsBit (blankedXmlToBalancedParens chunkedBlank))
52 | annotate $ "Good: " <> show chunkedBlank
53 | bp === fromString "11101010 10001101 01010100"
54 |
55 | it "Blanking XML should work 3" $ requireTest $ do
56 | let bp = BitShown $ BS.concat (compressWordAsBit (blankedXmlToBalancedParens chunkedBadBlank))
57 | annotate $ "Bad: " <> show chunkedBadBlank
58 | bp === fromString "11101010 10001101 01010100"
59 |
60 | describe "Chunking works" $ do
61 | let document = "free"
62 | let whole = mkBlank 4096 document
63 | let chunked = mkBlank 15 document
64 |
65 | it "should BP the same with chanks" $ requireTest $ do
66 | BS.concat chunked === BS.concat whole
67 |
68 | it "should produce same bits" $ requireTest $ do
69 | BS.concat (mkBits chunked) === BS.concat (mkBits whole)
70 |
71 |
72 | mkBlank :: Int -> BS.ByteString -> [BS.ByteString]
73 | mkBlank csize bs = blankXml (chunkedBy csize bs)
74 |
75 | mkBits :: [BS.ByteString] -> [BS.ByteString]
76 | mkBits = compressWordAsBit . blankedXmlToBalancedParens
77 |
--------------------------------------------------------------------------------
/app/App/Commands/Demo.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE BangPatterns #-}
2 | {-# LANGUAGE DataKinds #-}
3 | {-# LANGUAGE FlexibleInstances #-}
4 | {-# LANGUAGE OverloadedStrings #-}
5 | {-# LANGUAGE ScopedTypeVariables #-}
6 | {-# LANGUAGE TypeApplications #-}
7 | {-# LANGUAGE TypeSynonymInstances #-}
8 |
9 | module App.Commands.Demo
10 | ( cmdDemo
11 | ) where
12 |
13 | import Data.Foldable
14 | import Data.Maybe
15 | import Data.Text (Text)
16 | import HaskellWorks.Data.TreeCursor
17 | import HaskellWorks.Data.Xml.Decode
18 | import HaskellWorks.Data.Xml.DecodeResult
19 | import HaskellWorks.Data.Xml.RawDecode
20 | import HaskellWorks.Data.Xml.RawValue
21 | import HaskellWorks.Data.Xml.Succinct.Cursor.Load
22 | import HaskellWorks.Data.Xml.Succinct.Index
23 | import HaskellWorks.Data.Xml.Value
24 | import Options.Applicative hiding (columns)
25 |
26 | import qualified App.Commands.Types as Z
27 |
28 | -- | Parse the text of an XML node.
29 | class ParseText a where
30 | parseText :: Value -> DecodeResult a
31 |
32 | instance ParseText Text where
33 | parseText (XmlText text) = DecodeOk text
34 | parseText (XmlCData text) = DecodeOk text
35 | parseText (XmlElement _ _ cs) = DecodeOk $ mconcat $ mconcat $ toList . parseText <$> cs
36 | parseText _ = DecodeOk ""
37 |
38 | -- | Convert a decode result to a maybe
39 | decodeResultToMaybe :: DecodeResult a -> Maybe a
40 | decodeResultToMaybe (DecodeOk a) = Just a
41 | decodeResultToMaybe _ = Nothing
42 |
43 | -- | Document model. This does not need to be able to completely represent all
44 | -- the data in the XML document. In fact, having a smaller model may improve
45 | -- query performance.
46 | data Plant = Plant
47 | { common :: Text
48 | , price :: Text
49 | } deriving (Eq, Show)
50 |
51 | newtype Catalog = Catalog
52 | { plants :: [Plant]
53 | } deriving (Eq, Show)
54 |
55 | -- | Decode plant element
56 | decodePlant :: Value -> DecodeResult Plant
57 | decodePlant xml = do
58 | aCommon <- xml /> "common" >>= parseText
59 | aPrice <- xml /> "price" >>= parseText
60 | return $ Plant aCommon aPrice
61 |
62 | -- | Decode catalog element
63 | decodeCatalog :: Value -> DecodeResult Catalog
64 | decodeCatalog xml = do
65 | aPlantXmls <- xml />> "plant"
66 | let aPlants = catMaybes (decodeResultToMaybe . decodePlant <$> aPlantXmls)
67 | return $ Catalog aPlants
68 |
69 | runDemo :: Z.DemoOptions -> IO ()
70 | runDemo _ = do
71 | -- Read XML into memory as a query-optimised cursor
72 | !cursor <- loadFastCursor "data/catalog.xml"
73 | -- Skip the XML declaration to get to the root element cursor
74 | case nextSibling cursor of
75 | Just rootCursor -> do
76 | -- Get the root raw XML value at the root element cursor
77 | let rootValue = rawValueAt (xmlIndexAt rootCursor)
78 | -- Show what we have at this cursor
79 | putStrLn $ "Raw value: " <> take 100 (show rootValue)
80 | -- Decode the raw XML value
81 | case decodeCatalog (rawDecode rootValue) of
82 | DecodeOk catalog -> putStrLn $ "Catalog: " <> show catalog
83 | DecodeFailed msg -> putStrLn $ "Error: " <> show msg
84 | Nothing -> do
85 | putStrLn "Could not read XML"
86 | return ()
87 |
88 | optsDemo :: Parser Z.DemoOptions
89 | optsDemo = pure Z.DemoOptions
90 |
91 | cmdDemo :: Mod CommandFields (IO ())
92 | cmdDemo = command "demo" $ flip info idm $ runDemo <$> optsDemo
93 |
--------------------------------------------------------------------------------
/app/App/Commands/Count.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE BangPatterns #-}
2 | {-# LANGUAGE DataKinds #-}
3 | {-# LANGUAGE DeriveGeneric #-}
4 | {-# LANGUAGE FlexibleInstances #-}
5 | {-# LANGUAGE OverloadedStrings #-}
6 | {-# LANGUAGE ScopedTypeVariables #-}
7 | {-# LANGUAGE TypeApplications #-}
8 | {-# LANGUAGE TypeSynonymInstances #-}
9 |
10 | module App.Commands.Count
11 | ( cmdCount
12 | ) where
13 |
14 | import App.Options
15 | import Control.Lens
16 | import Control.Monad
17 | import Data.Generics.Product.Any
18 | import Data.Text (Text)
19 | import GHC.Generics
20 | import HaskellWorks.Data.TreeCursor
21 | import HaskellWorks.Data.Xml.DecodeResult
22 | import HaskellWorks.Data.Xml.RawDecode
23 | import HaskellWorks.Data.Xml.RawValue
24 | import HaskellWorks.Data.Xml.Succinct.Cursor.Load
25 | import HaskellWorks.Data.Xml.Succinct.Cursor.MMap
26 | import HaskellWorks.Data.Xml.Succinct.Index
27 | import HaskellWorks.Data.Xml.Value
28 | import Options.Applicative hiding (columns)
29 |
30 | import qualified App.Commands.Types as Z
31 | import qualified App.Naive as NAIVE
32 | import qualified App.XPath.Parser as XPP
33 | import qualified System.Exit as IO
34 | import qualified System.IO as IO
35 |
36 | -- | Document model. This does not need to be able to completely represent all
37 | -- the data in the XML document. In fact, having a smaller model may improve
38 | -- Count performance.
39 | data Plant = Plant
40 | { common :: String
41 | , price :: String
42 | } deriving (Eq, Show, Generic)
43 |
44 | newtype Catalog = Catalog
45 | { plants :: [Plant]
46 | } deriving (Eq, Show, Generic)
47 |
48 | tags :: Value -> Text -> [Value]
49 | tags xml@(XmlElement n _ _) elemName = if n == elemName
50 | then [xml]
51 | else []
52 | tags _ _ = []
53 |
54 | kids :: Value -> [Value]
55 | kids (XmlElement _ _ cs) = cs
56 | kids _ = []
57 |
58 | countAtPath :: [Text] -> Value -> DecodeResult Int
59 | countAtPath [] _ = return 0
60 | countAtPath [t] xml = return (length (tags xml t))
61 | countAtPath (t:ts) xml = do
62 | counts <- forM (tags xml t >>= kids) $ countAtPath ts
63 | return (sum counts)
64 |
65 | runCount :: Z.CountOptions -> IO ()
66 | runCount opt = do
67 | let input = opt ^. the @"input"
68 | let xpath = opt ^. the @"xpath"
69 | let method = opt ^. the @"method"
70 |
71 | IO.putStrLn $ "XPath: " <> show xpath
72 |
73 | cursorResult <- case method of
74 | "mmap" -> Right <$> mmapFastCursor input
75 | "memory" -> Right <$> loadFastCursor input
76 | "naive" -> Right <$> NAIVE.loadFastCursor input
77 | unknown -> return (Left ("Unknown method " <> show unknown))
78 |
79 | case cursorResult of
80 | Right !cursor -> do
81 | -- Skip the XML declaration to get to the root element cursor
82 | case nextSibling cursor of
83 | Just rootCursor -> do
84 | -- Get the root raw XML value at the root element cursor
85 | let rootValue = rawValueAt (xmlIndexAt rootCursor)
86 | -- Show what we have at this cursor
87 | putStrLn $ "Raw value: " <> take 100 (show rootValue)
88 | -- Decode the raw XML value
89 | case countAtPath (xpath ^. the @"path") (rawDecode rootValue) of
90 | DecodeOk count -> putStrLn $ "Count: " <> show count
91 | DecodeFailed msg -> putStrLn $ "Error: " <> show msg
92 | Nothing -> do
93 | putStrLn "Could not read XML"
94 | return ()
95 | Left msg -> do
96 | IO.putStrLn $ "Error: " <> msg
97 | IO.exitFailure
98 |
99 | optsCount :: Parser Z.CountOptions
100 | optsCount = Z.CountOptions
101 | <$> strOption
102 | ( long "input"
103 | <> help "Input file"
104 | <> metavar "FILE"
105 | )
106 | <*> optionParser XPP.path
107 | ( long "xpath"
108 | <> help "XPath expression"
109 | <> metavar "XPATH"
110 | )
111 | <*> textOption
112 | ( long "method"
113 | <> help "Read method"
114 | <> metavar "METHOD"
115 | )
116 |
117 | cmdCount :: Mod CommandFields (IO ())
118 | cmdCount = command "count" $ flip info idm $ runCount <$> optsCount
119 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Succinct/Index.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE BangPatterns #-}
2 | {-# LANGUAGE FlexibleContexts #-}
3 | {-# LANGUAGE FlexibleInstances #-}
4 | {-# LANGUAGE InstanceSigs #-}
5 | {-# LANGUAGE MultiParamTypeClasses #-}
6 | {-# LANGUAGE OverloadedStrings #-}
7 |
8 | module HaskellWorks.Data.Xml.Succinct.Index
9 | ( XmlIndex(..)
10 | , XmlIndexAt(..)
11 | )
12 | where
13 |
14 | import Control.Arrow
15 | import Data.Text (Text)
16 | import HaskellWorks.Data.Bits.BitWise
17 | import HaskellWorks.Data.Drop
18 | import HaskellWorks.Data.Positioning
19 | import HaskellWorks.Data.RankSelect.Base.Rank0
20 | import HaskellWorks.Data.RankSelect.Base.Rank1
21 | import HaskellWorks.Data.RankSelect.Base.Select1
22 | import HaskellWorks.Data.TreeCursor
23 | import HaskellWorks.Data.Uncons
24 | import HaskellWorks.Data.Xml.CharLike
25 | import HaskellWorks.Data.Xml.Grammar
26 | import HaskellWorks.Data.Xml.Succinct
27 | import Prelude hiding (drop)
28 |
29 | import qualified Data.Attoparsec.ByteString.Char8 as ABC
30 | import qualified Data.ByteString as BS
31 | import qualified Data.List as L
32 | import qualified Data.Text as T
33 | import qualified HaskellWorks.Data.BalancedParens as BP
34 |
35 | data XmlIndex
36 | = XmlIndexDocument [XmlIndex]
37 | | XmlIndexElement Text [XmlIndex]
38 | | XmlIndexCData BS.ByteString
39 | | XmlIndexComment BS.ByteString
40 | | XmlIndexMeta Text [XmlIndex]
41 | | XmlIndexAttrList [XmlIndex]
42 | | XmlIndexValue BS.ByteString
43 | | XmlIndexAttrName BS.ByteString
44 | | XmlIndexAttrValue BS.ByteString
45 | | XmlIndexError Text
46 | deriving (Eq, Show)
47 |
48 | data XmlIndexState
49 | = InAttrList
50 | | InElement
51 | | Unknown
52 | deriving (Eq, Show)
53 |
54 | class XmlIndexAt a where
55 | xmlIndexAt :: a -> XmlIndex
56 |
57 | pos :: (Select1 v, Rank1 w) => XmlCursor t v w -> Position
58 | pos c = lastPositionOf (select1 (interests c) (rank1 (balancedParens c) (cursorRank c)))
59 |
60 | remText :: (Drop v, Select1 v1, Rank1 w) => XmlCursor v v1 w -> v
61 | remText c = drop (toCount (pos c)) (cursorText c)
62 |
63 | instance (BP.BalancedParens w, Rank0 w, Rank1 w, Select1 v, TestBit w) => XmlIndexAt (XmlCursor BS.ByteString v w) where
64 | xmlIndexAt :: XmlCursor BS.ByteString v w -> XmlIndex
65 | xmlIndexAt = getIndexAt Unknown
66 |
67 |
68 | getIndexAt :: (BP.BalancedParens w, Rank0 w, Rank1 w, Select1 v, TestBit w) => XmlIndexState -> XmlCursor BS.ByteString v w -> XmlIndex
69 | getIndexAt state k = case uncons remainder of
70 | Just (!c, cs) | isElementStart c -> parseElem cs
71 | Just (!c, _ ) | isSpace c -> XmlIndexAttrList $ mapValuesFrom InAttrList (firstChild k)
72 | Just (!c, _ ) | isAttribute && isQuote c -> XmlIndexAttrValue remainder
73 | Just _ | isAttribute -> XmlIndexAttrName remainder
74 | Just _ -> XmlIndexValue remainder
75 | Nothing -> XmlIndexError "End of data"
76 | where remainder = remText k
77 | mapValuesFrom s = L.unfoldr (fmap (getIndexAt s &&& nextSibling))
78 | isAttribute = case state of
79 | InAttrList -> True
80 | InElement -> False
81 | Unknown -> case remText <$> parent k >>= uncons of
82 | Just (!c, _) | isSpace c -> True
83 | _ -> False
84 |
85 | parseElem bs =
86 | case ABC.parse parseXmlElement bs of
87 | ABC.Fail {} -> decodeErr "Unable to parse element name" bs
88 | ABC.Partial _ -> decodeErr "Unexpected end of string" bs
89 | ABC.Done i r -> case r of
90 | XmlElementTypeCData -> XmlIndexCData i
91 | XmlElementTypeComment -> XmlIndexComment i
92 | XmlElementTypeMeta s -> XmlIndexMeta s (mapValuesFrom InElement $ firstChild k)
93 | XmlElementTypeElement s -> XmlIndexElement s (mapValuesFrom InElement $ firstChild k)
94 | XmlElementTypeDocument -> XmlIndexDocument (mapValuesFrom InElement (firstChild k) <> mapValuesFrom InElement (nextSibling k))
95 |
96 | decodeErr :: String -> BS.ByteString -> XmlIndex
97 | decodeErr reason bs = XmlIndexError . T.pack $ reason <>": " <> show (BS.take 20 bs) <> "...'"
98 |
--------------------------------------------------------------------------------
/test/HaskellWorks/Data/Xml/Succinct/CursorSpec.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE ExplicitForAll #-}
2 | {-# LANGUAGE FlexibleContexts #-}
3 | {-# LANGUAGE FlexibleInstances #-}
4 | {-# LANGUAGE InstanceSigs #-}
5 | {-# LANGUAGE MultiParamTypeClasses #-}
6 | {-# LANGUAGE NoMonomorphismRestriction #-}
7 | {-# LANGUAGE OverloadedStrings #-}
8 | {-# LANGUAGE ScopedTypeVariables #-}
9 | {-# LANGUAGE TypeApplications #-}
10 |
11 | {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
12 |
13 | module HaskellWorks.Data.Xml.Succinct.CursorSpec(spec) where
14 |
15 | import Control.Monad
16 | import Data.Word
17 | import HaskellWorks.Data.BalancedParens.BalancedParens
18 | import HaskellWorks.Data.BalancedParens.Simple
19 | import HaskellWorks.Data.Bits.BitShow
20 | import HaskellWorks.Data.Bits.BitShown
21 | import HaskellWorks.Data.Bits.BitWise
22 | import HaskellWorks.Data.RankSelect.Base.Rank0
23 | import HaskellWorks.Data.RankSelect.Base.Rank1
24 | import HaskellWorks.Data.RankSelect.Base.Select1
25 | import HaskellWorks.Data.RankSelect.Poppy512
26 | import HaskellWorks.Data.Xml.Succinct.Cursor as C
27 | import HaskellWorks.Data.Xml.Succinct.CursorSpec.Make
28 | import HaskellWorks.Data.Xml.Token
29 | import HaskellWorks.Hspec.Hedgehog
30 | import Hedgehog
31 | import Test.Hspec
32 |
33 | import qualified Data.ByteString as BS
34 | import qualified Data.Text as T
35 | import qualified Data.Text.Encoding as T
36 | import qualified Data.Vector.Storable as DVS
37 | import qualified HaskellWorks.Data.FromByteString as BS
38 | import qualified HaskellWorks.Data.TreeCursor as TC
39 | import qualified HaskellWorks.Data.Xml.Succinct.Cursor.Create as CC
40 |
41 | {- HLINT ignore "Redundant do" -}
42 | {- HLINT ignore "Redundant bracket" -}
43 | {- HLINT ignore "Reduce duplication" -}
44 |
45 | fc = TC.firstChild
46 | ns = TC.nextSibling
47 | pn = TC.parent
48 | cd = TC.depth
49 | ss = TC.subtreeSize
50 |
51 | spec :: Spec
52 | spec = describe "HaskellWorks.Data.Xml.Succinct.CursorSpec" $ do
53 | make "DVS.Vector Word8" (BS.fromByteString :: BS.ByteString -> XmlCursor BS.ByteString (BitShown (DVS.Vector Word8 )) (SimpleBalancedParens (DVS.Vector Word8 )))
54 | make "DVS.Vector Word16" (BS.fromByteString :: BS.ByteString -> XmlCursor BS.ByteString (BitShown (DVS.Vector Word16)) (SimpleBalancedParens (DVS.Vector Word16)))
55 | make "DVS.Vector Word32" (BS.fromByteString :: BS.ByteString -> XmlCursor BS.ByteString (BitShown (DVS.Vector Word32)) (SimpleBalancedParens (DVS.Vector Word32)))
56 | make "DVS.Vector Word64" (BS.fromByteString :: BS.ByteString -> XmlCursor BS.ByteString (BitShown (DVS.Vector Word64)) (SimpleBalancedParens (DVS.Vector Word64)))
57 | make "Poppy512" (BS.fromByteString :: BS.ByteString -> XmlCursor BS.ByteString Poppy512 (SimpleBalancedParens (DVS.Vector Word64)))
58 | make "DVS.Vector Word8" CC.byteStringAsFastCursor
59 | make "DVS.Vector Word16" CC.byteStringAsFastCursor
60 | make "DVS.Vector Word32" CC.byteStringAsFastCursor
61 | make "DVS.Vector Word64" CC.byteStringAsFastCursor
62 | make "Poppy512" CC.byteStringAsFastCursor
63 | it "Loads same Xml consistentally from different backing vectors" $ requireTest $ do
64 | let cursor8 = "{\n \"widget\": {\n \"debug\": \"on\" } }" :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word8)) (SimpleBalancedParens (DVS.Vector Word8))
65 | let cursor16 = "{\n \"widget\": {\n \"debug\": \"on\" } }" :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word16)) (SimpleBalancedParens (DVS.Vector Word16))
66 | let cursor32 = "{\n \"widget\": {\n \"debug\": \"on\" } }" :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word32)) (SimpleBalancedParens (DVS.Vector Word32))
67 | let cursor64 = "{\n \"widget\": {\n \"debug\": \"on\" } }" :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word64)) (SimpleBalancedParens (DVS.Vector Word64))
68 | cursorText cursor8 === cursorText cursor16
69 | cursorText cursor8 === cursorText cursor32
70 | cursorText cursor8 === cursorText cursor64
71 | let ic8 = bitShow $ interests cursor8
72 | let ic16 = bitShow $ interests cursor16
73 | let ic32 = bitShow $ interests cursor32
74 | let ic64 = bitShow $ interests cursor64
75 | ic16 `shouldBeginWith` ic8
76 | ic32 `shouldBeginWith` ic16
77 | ic64 `shouldBeginWith` ic32
78 |
79 | shouldBeginWith :: (Eq a, Show a) => [a] -> [a] -> PropertyT IO ()
80 | shouldBeginWith as bs = take (length bs) as === bs
81 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Succinct/Cursor/Internal.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric #-}
2 | {-# LANGUAGE FlexibleContexts #-}
3 | {-# LANGUAGE FlexibleInstances #-}
4 | {-# LANGUAGE InstanceSigs #-}
5 | {-# LANGUAGE MultiParamTypeClasses #-}
6 | {-# LANGUAGE ScopedTypeVariables #-}
7 |
8 | module HaskellWorks.Data.Xml.Succinct.Cursor.Internal
9 | ( XmlCursor(..)
10 | , xmlCursorPos
11 | ) where
12 |
13 | import Control.DeepSeq (NFData (..))
14 | import Data.String
15 | import Data.Word
16 | import Foreign.ForeignPtr
17 | import GHC.Generics
18 | import HaskellWorks.Data.Bits.BitShown
19 | import HaskellWorks.Data.FromByteString
20 | import HaskellWorks.Data.FromForeignRegion
21 | import HaskellWorks.Data.Positioning
22 | import HaskellWorks.Data.RankSelect.Base.Rank0
23 | import HaskellWorks.Data.RankSelect.Base.Rank1
24 | import HaskellWorks.Data.RankSelect.Base.Select1
25 | import HaskellWorks.Data.RankSelect.Poppy512
26 | import HaskellWorks.Data.TreeCursor
27 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml
28 | import HaskellWorks.Data.Xml.Succinct.Cursor.InterestBits
29 |
30 | import qualified Data.ByteString as BS
31 | import qualified Data.ByteString.Char8 as BSC
32 | import qualified Data.ByteString.Internal as BSI
33 | import qualified Data.Vector.Storable as DVS
34 | import qualified HaskellWorks.Data.BalancedParens as BP
35 | import qualified HaskellWorks.Data.Xml.Succinct.Cursor.BalancedParens as CBP
36 |
37 | data XmlCursor t v w = XmlCursor
38 | { cursorText :: !t
39 | , interests :: !v
40 | , balancedParens :: !w
41 | , cursorRank :: !Count
42 | }
43 | deriving (Eq, Show, Generic)
44 |
45 | instance (NFData t, NFData v, NFData w) => NFData (XmlCursor t v w) where
46 | rnf (XmlCursor a b c d) = rnf (a, b, c, d)
47 |
48 | instance (FromBlankedXml (XmlInterestBits a), FromBlankedXml (CBP.XmlBalancedParens b))
49 | => FromByteString (XmlCursor BS.ByteString a b) where
50 | fromByteString bs = XmlCursor
51 | { cursorText = bs
52 | , interests = getXmlInterestBits (fromBlankedXml blankedXml)
53 | , balancedParens = CBP.getXmlBalancedParens (fromBlankedXml blankedXml)
54 | , cursorRank = 1
55 | }
56 | where blankedXml :: BlankedXml
57 | blankedXml = bsToBlankedXml bs
58 |
59 | instance IsString (XmlCursor BS.ByteString (BitShown (DVS.Vector Word8)) (BP.SimpleBalancedParens (DVS.Vector Word8))) where
60 | fromString = fromByteString . BSC.pack
61 |
62 | instance IsString (XmlCursor BS.ByteString (BitShown (DVS.Vector Word16)) (BP.SimpleBalancedParens (DVS.Vector Word16))) where
63 | fromString = fromByteString . BSC.pack
64 |
65 | instance IsString (XmlCursor BS.ByteString (BitShown (DVS.Vector Word32)) (BP.SimpleBalancedParens (DVS.Vector Word32))) where
66 | fromString = fromByteString . BSC.pack
67 |
68 | instance IsString (XmlCursor BS.ByteString (BitShown (DVS.Vector Word64)) (BP.SimpleBalancedParens (DVS.Vector Word64))) where
69 | fromString = fromByteString . BSC.pack
70 |
71 | instance IsString (XmlCursor BS.ByteString Poppy512 (BP.SimpleBalancedParens (DVS.Vector Word64))) where
72 | fromString = fromByteString . BSC.pack
73 |
74 | instance FromForeignRegion (XmlCursor BS.ByteString (BitShown (DVS.Vector Word8)) (BP.SimpleBalancedParens (DVS.Vector Word8))) where
75 | fromForeignRegion (fptr, offset, size) = fromByteString (BSI.fromForeignPtr (castForeignPtr fptr) offset size)
76 |
77 | instance FromForeignRegion (XmlCursor BS.ByteString (BitShown (DVS.Vector Word16)) (BP.SimpleBalancedParens (DVS.Vector Word16))) where
78 | fromForeignRegion (fptr, offset, size) = fromByteString (BSI.fromForeignPtr (castForeignPtr fptr) offset size)
79 |
80 | instance FromForeignRegion (XmlCursor BS.ByteString (BitShown (DVS.Vector Word32)) (BP.SimpleBalancedParens (DVS.Vector Word32))) where
81 | fromForeignRegion (fptr, offset, size) = fromByteString (BSI.fromForeignPtr (castForeignPtr fptr) offset size)
82 |
83 | instance FromForeignRegion (XmlCursor BS.ByteString (BitShown (DVS.Vector Word64)) (BP.SimpleBalancedParens (DVS.Vector Word64))) where
84 | fromForeignRegion (fptr, offset, size) = fromByteString (BSI.fromForeignPtr (castForeignPtr fptr) offset size)
85 |
86 | instance FromForeignRegion (XmlCursor BS.ByteString Poppy512 (BP.SimpleBalancedParens (DVS.Vector Word64))) where
87 | fromForeignRegion (fptr, offset, size) = fromByteString (BSI.fromForeignPtr (castForeignPtr fptr) offset size)
88 |
89 | instance (BP.BalancedParens u, Rank1 u, Rank0 u) => TreeCursor (XmlCursor t v u) where
90 | firstChild :: XmlCursor t v u -> Maybe (XmlCursor t v u)
91 | firstChild k = let mq = BP.firstChild (balancedParens k) (cursorRank k) in (\q -> k { cursorRank = q }) <$> mq
92 |
93 | nextSibling :: XmlCursor t v u -> Maybe (XmlCursor t v u)
94 | nextSibling k = (\q -> k { cursorRank = q }) <$> BP.nextSibling (balancedParens k) (cursorRank k)
95 |
96 | parent :: XmlCursor t v u -> Maybe (XmlCursor t v u)
97 | parent k = let mq = BP.parent (balancedParens k) (cursorRank k) in (\q -> k { cursorRank = q }) <$> mq
98 |
99 | depth :: XmlCursor t v u -> Maybe Count
100 | depth k = BP.depth (balancedParens k) (cursorRank k)
101 |
102 | subtreeSize :: XmlCursor t v u -> Maybe Count
103 | subtreeSize k = BP.subtreeSize (balancedParens k) (cursorRank k)
104 |
105 | xmlCursorPos :: (Rank1 w, Select1 v) => XmlCursor s v w -> Position
106 | xmlCursorPos k = toPosition (select1 ik (rank1 bpk (cursorRank k)) - 1)
107 | where ik = interests k
108 | bpk = balancedParens k
109 |
--------------------------------------------------------------------------------
/test/HaskellWorks/Data/Xml/TypeSpec.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE ExplicitForAll #-}
2 | {-# LANGUAGE FlexibleContexts #-}
3 | {-# LANGUAGE FlexibleInstances #-}
4 | {-# LANGUAGE InstanceSigs #-}
5 | {-# LANGUAGE MultiParamTypeClasses #-}
6 | {-# LANGUAGE NoMonomorphismRestriction #-}
7 | {-# LANGUAGE OverloadedStrings #-}
8 | {-# LANGUAGE ScopedTypeVariables #-}
9 |
10 | {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
11 | {-# OPTIONS_GHC -fno-warn-simplifiable-class-constraints #-}
12 |
13 | module HaskellWorks.Data.Xml.TypeSpec (spec) where
14 |
15 | import Control.Monad
16 | import Data.String
17 | import Data.Word
18 | import HaskellWorks.Data.BalancedParens.BalancedParens
19 | import HaskellWorks.Data.BalancedParens.Simple
20 | import HaskellWorks.Data.Bits.BitShown
21 | import HaskellWorks.Data.Bits.BitWise
22 | import HaskellWorks.Data.RankSelect.Base.Rank0
23 | import HaskellWorks.Data.RankSelect.Base.Rank1
24 | import HaskellWorks.Data.RankSelect.Base.Select1
25 | import HaskellWorks.Data.RankSelect.Poppy512
26 | import HaskellWorks.Data.Xml.Succinct.Cursor as C
27 | import HaskellWorks.Data.Xml.Type
28 | import HaskellWorks.Hspec.Hedgehog
29 | import Hedgehog
30 | import Test.Hspec
31 |
32 | import qualified Data.ByteString as BS
33 | import qualified Data.Vector.Storable as DVS
34 | import qualified HaskellWorks.Data.TreeCursor as TC
35 |
36 | {- HLINT ignore "Redundant do" -}
37 | {- HLINT ignore "Redundant bracket" -}
38 | {- HLINT ignore "Reduce duplication" -}
39 |
40 | fc = TC.firstChild
41 | ns = TC.nextSibling
42 |
43 | spec :: Spec
44 | spec = describe "HaskellWorks.Data.Xml.TypeSpec" $ do
45 | genSpec "DVS.Vector Word8" (undefined :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word8 )) (SimpleBalancedParens (DVS.Vector Word8 )))
46 | genSpec "DVS.Vector Word16" (undefined :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word16)) (SimpleBalancedParens (DVS.Vector Word16)))
47 | genSpec "DVS.Vector Word32" (undefined :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word32)) (SimpleBalancedParens (DVS.Vector Word32)))
48 | genSpec "DVS.Vector Word64" (undefined :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word64)) (SimpleBalancedParens (DVS.Vector Word64)))
49 | genSpec "Poppy512" (undefined :: XmlCursor BS.ByteString Poppy512 (SimpleBalancedParens (DVS.Vector Word64)))
50 |
51 | genSpec :: forall t u.
52 | ( Show t
53 | , Select1 t
54 | , Show u
55 | , Rank0 u
56 | , Rank1 u
57 | , BalancedParens u
58 | , TestBit u
59 | , IsString (XmlCursor BS.ByteString t u)
60 | )
61 | => String -> (XmlCursor BS.ByteString t u) -> SpecWith ()
62 | genSpec t _ = do
63 | describe ("XML cursor of type " ++ t) $ do
64 | let forXml (cursor :: XmlCursor BS.ByteString t u) f = describe ("of value " ++ show cursor) (f cursor)
65 | forXml "" $ \cursor -> do
66 | it "should have correct type" . requireTest $ xmlTypeAt cursor === Just XmlTypeElement
67 | forXml " " $ \cursor -> do
68 | it "should have correct type" . requireTest $ xmlTypeAt cursor === Just XmlTypeElement
69 | forXml "" $ \cursor -> do
70 | it "cursor can navigate to second attribute" $ requireTest $ do
71 | (fc >=> fc >=> ns >=> ns >=> xmlTypeAt) cursor === Just XmlTypeToken
72 | it "cursor can navigate to first attribute of an inner element" $ requireTest $ do
73 | (fc >=> ns >=> fc >=> fc >=> xmlTypeAt) cursor === Just XmlTypeToken
74 | it "cursor can navigate to first atrribute value of an inner element" $ requireTest $ do
75 | (fc >=> ns >=> fc >=> fc >=> ns >=> xmlTypeAt) cursor === Just XmlTypeToken
76 | describe "For a single element" $ do
77 | let cursor = "text" :: XmlCursor BS.ByteString t u
78 | it "can navigate down and forwards" $ requireTest $ do
79 | ( xmlTypeAt) cursor === Just XmlTypeElement
80 | (fc >=> xmlTypeAt) cursor === Just XmlTypeToken
81 | (fc >=> ns >=> xmlTypeAt) cursor === Nothing
82 | (fc >=> ns >=> ns >=> xmlTypeAt) cursor === Nothing
83 | describe "For sample Xml" $ do
84 | let cursor = " \
85 | \ \
86 | \ 500 \
87 | \ 600.01e-02 \
88 | \ false \
89 | \ \
90 | \" :: XmlCursor BS.ByteString t u
91 | it "can navigate down and forwards" $ requireTest $ do
92 | ( xmlTypeAt) cursor === Just XmlTypeElement --widget
93 | (fc >=> xmlTypeAt) cursor === Just XmlTypeAttrList --widget attrs
94 | (fc >=> ns >=> xmlTypeAt) cursor === Just XmlTypeElement --window
95 | (fc >=> ns >=> fc >=> xmlTypeAt) cursor === Just XmlTypeAttrList --window attrs
96 | (fc >=> ns >=> fc >=> ns >=> xmlTypeAt) cursor === Just XmlTypeElement --dimension 500
97 | (fc >=> ns >=> fc >=> ns >=> ns >=> xmlTypeAt) cursor === Just XmlTypeElement --dimension 600
98 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> xmlTypeAt) cursor === Just XmlTypeElement --dimension false
99 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> xmlTypeAt) cursor === Just XmlTypeToken --false
100 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/RawValue.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE FlexibleInstances #-}
2 | {-# LANGUAGE InstanceSigs #-}
3 | {-# LANGUAGE MultiParamTypeClasses #-}
4 | {-# LANGUAGE OverloadedStrings #-}
5 | {-# LANGUAGE ScopedTypeVariables #-}
6 | {-# LANGUAGE TypeApplications #-}
7 |
8 | module HaskellWorks.Data.Xml.RawValue
9 | ( RawValue(..)
10 | , RawValueAt(..)
11 | ) where
12 |
13 | import Data.ByteString (ByteString)
14 | import Data.List
15 | import Data.Text (Text)
16 | import HaskellWorks.Data.Xml.Grammar
17 | import HaskellWorks.Data.Xml.Internal.Show
18 | import HaskellWorks.Data.Xml.Succinct.Index
19 | import Prettyprinter
20 |
21 | import qualified Data.Attoparsec.ByteString.Char8 as ABC
22 | import qualified Data.ByteString as BS
23 | import qualified Data.Text as T
24 |
25 | data RawValue
26 | = RawDocument [RawValue]
27 | | RawText Text
28 | | RawElement Text [RawValue]
29 | | RawCData Text
30 | | RawComment Text
31 | | RawMeta Text [RawValue]
32 | | RawAttrName Text
33 | | RawAttrValue Text
34 | | RawAttrList [RawValue]
35 | | RawError Text
36 | deriving (Eq, Show)
37 |
38 | -- TODO use colors and styles
39 |
40 | red :: Doc ann -> Doc ann
41 | red = id
42 |
43 | dullwhite :: Doc ann -> Doc ann
44 | dullwhite = id
45 |
46 | bold :: Doc ann -> Doc ann
47 | bold = id
48 |
49 | dullgreen :: Doc ann -> Doc ann
50 | dullgreen = id
51 |
52 | instance Pretty RawValue where
53 | pretty mjpv = case mjpv of
54 | RawText s -> ctext $ pretty (T.unpack s)
55 | RawAttrName s -> pretty (T.unpack s)
56 | RawAttrValue s -> (ctext . dquotes . pretty) (T.unpack s)
57 | RawAttrList ats -> formatAttrs ats
58 | RawComment s -> pretty $ ""
59 | RawElement s xs -> formatElem (T.unpack s) xs
60 | RawDocument xs -> formatMeta "?" "xml" xs
61 | RawError s -> red $ "[error " <> pretty (T.unpack s) <> "]"
62 | RawCData s -> cangle " ctag "[CDATA[" <> pretty (T.unpack s) <> cangle "]]>"
63 | RawMeta s xs -> formatMeta "!" (T.unpack s) xs
64 | where
65 | formatAttr at = case at of
66 | RawAttrName a -> " " <> pretty (RawAttrName a)
67 | RawAttrValue a -> "=" <> pretty (RawAttrValue a)
68 | RawAttrList _ -> red "ATTRS"
69 | _ -> red "booo"
70 | formatAttrs ats = hcat (formatAttr <$> ats)
71 | formatElem s xs =
72 | let (ats, es) = partition isAttrL xs
73 | in cangle langle <> ctag (pretty s)
74 | <> hcat (pretty <$> ats)
75 | <> cangle rangle
76 | <> hcat (pretty <$> es)
77 | <> cangle "" <> ctag (pretty s) <> cangle rangle
78 | formatMeta b s xs =
79 | let (ats, es) = partition isAttr xs
80 | in cangle (langle <> pretty @String b) <> ctag (pretty @String s)
81 | <> hcat (pretty <$> ats)
82 | <> cangle rangle
83 | <> hcat (pretty <$> es)
84 |
85 | class RawValueAt a where
86 | rawValueAt :: a -> RawValue
87 |
88 | instance RawValueAt XmlIndex where
89 | rawValueAt i = case i of
90 | XmlIndexCData s -> parseTextUntil "]]>" s `as` (RawCData . T.pack)
91 | XmlIndexComment s -> parseTextUntil "-->" s `as` (RawComment . T.pack)
92 | XmlIndexMeta s cs -> RawMeta s (rawValueAt <$> cs)
93 | XmlIndexElement s cs -> RawElement s (rawValueAt <$> cs)
94 | XmlIndexDocument cs -> RawDocument (rawValueAt <$> cs)
95 | XmlIndexAttrName cs -> parseAttrName cs `as` RawAttrName
96 | XmlIndexAttrValue cs -> parseString cs `as` RawAttrValue
97 | XmlIndexAttrList cs -> RawAttrList (rawValueAt <$> cs)
98 | XmlIndexValue s -> parseTextUntil "<" s `as` (RawText . T.pack)
99 | XmlIndexError s -> RawError s
100 | --unknown -> XmlError ("Not yet supported: " <> show unknown)
101 | where
102 | parseUntil s = ABC.manyTill ABC.anyChar (ABC.string s)
103 |
104 | parseTextUntil :: ByteString -> ByteString -> Either Text [Char]
105 | parseTextUntil s bs = case ABC.parse (parseUntil s) bs of
106 | ABC.Fail {} -> decodeErr ("Unable to find " <> tshow s <> ".") bs
107 | ABC.Partial _ -> decodeErr ("Unexpected end, expected " <> tshow s <> ".") bs
108 | ABC.Done _ r -> Right r
109 | parseString :: ByteString -> Either Text Text
110 | parseString bs = case ABC.parse parseXmlString bs of
111 | ABC.Fail {} -> decodeErr "Unable to parse string" bs
112 | ABC.Partial _ -> decodeErr "Unexpected end of string, expected" bs
113 | ABC.Done _ r -> Right r
114 | parseAttrName :: ByteString -> Either Text Text
115 | parseAttrName bs = case ABC.parse parseXmlAttributeName bs of
116 | ABC.Fail {} -> decodeErr "Unable to parse attribute name" bs
117 | ABC.Partial _ -> decodeErr "Unexpected end of attr name, expected" bs
118 | ABC.Done _ r -> Right r
119 |
120 | cangle :: Doc ann -> Doc ann
121 | cangle = dullwhite
122 |
123 | ctag :: Doc ann -> Doc ann
124 | ctag = bold
125 |
126 | ctext :: Doc ann -> Doc ann
127 | ctext = dullgreen
128 |
129 | isAttrL :: RawValue -> Bool
130 | isAttrL (RawAttrList _) = True
131 | isAttrL _ = False
132 |
133 | isAttr :: RawValue -> Bool
134 | isAttr v = case v of
135 | RawAttrName _ -> True
136 | RawAttrValue _ -> True
137 | RawAttrList _ -> True
138 | _ -> False
139 |
140 | as :: Either Text a -> (a -> RawValue) -> RawValue
141 | as = flip $ either RawError
142 |
143 | decodeErr :: Text -> BS.ByteString -> Either Text a
144 | decodeErr reason bs = Left $ reason <> " (" <> tshow (BS.take 20 bs) <> "...)"
145 |
--------------------------------------------------------------------------------
/test/HaskellWorks/Data/Xml/RawValueSpec.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE ExplicitForAll #-}
2 | {-# LANGUAGE FlexibleContexts #-}
3 | {-# LANGUAGE FlexibleInstances #-}
4 | {-# LANGUAGE InstanceSigs #-}
5 | {-# LANGUAGE MonoLocalBinds #-}
6 | {-# LANGUAGE MultiParamTypeClasses #-}
7 | {-# LANGUAGE NoMonomorphismRestriction #-}
8 | {-# LANGUAGE OverloadedStrings #-}
9 | {-# LANGUAGE ScopedTypeVariables #-}
10 |
11 | {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
12 |
13 | module HaskellWorks.Data.Xml.RawValueSpec (spec) where
14 |
15 | import Control.Monad
16 | import Data.String
17 | import Data.Text (Text)
18 | import Data.Word
19 | import HaskellWorks.Data.BalancedParens.BalancedParens
20 | import HaskellWorks.Data.BalancedParens.Simple
21 | import HaskellWorks.Data.Bits.BitShown
22 | import HaskellWorks.Data.Bits.BitWise
23 | import HaskellWorks.Data.RankSelect.Base.Rank0
24 | import HaskellWorks.Data.RankSelect.Base.Rank1
25 | import HaskellWorks.Data.RankSelect.Base.Select1
26 | import HaskellWorks.Data.RankSelect.Poppy512
27 | import HaskellWorks.Data.Xml.RawValue
28 | import HaskellWorks.Data.Xml.Succinct.Cursor as C
29 | import HaskellWorks.Data.Xml.Succinct.Index
30 | import HaskellWorks.Hspec.Hedgehog
31 | import Hedgehog
32 | import Test.Hspec
33 |
34 | import qualified Data.ByteString as BS
35 | import qualified Data.Vector.Storable as DVS
36 | import qualified HaskellWorks.Data.TreeCursor as TC
37 |
38 | {- HLINT ignore "Redundant do" -}
39 | {- HLINT ignore "Redundant return" -}
40 | {- HLINT ignore "Reduce duplication" -}
41 |
42 | fc = TC.firstChild
43 | ns = TC.nextSibling
44 |
45 | attrs :: [(Text, Text)] -> RawValue
46 | attrs as = RawAttrList $ as >>= (\(k, v) -> [RawAttrName k, RawAttrValue v])
47 |
48 | spec :: Spec
49 | spec = describe "HaskellWorks.Data.Xml.ValueSpec" $ do
50 | genSpec "DVS.Vector Word8" (undefined :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word8)) (SimpleBalancedParens (DVS.Vector Word8)))
51 | genSpec "DVS.Vector Word16" (undefined :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word16)) (SimpleBalancedParens (DVS.Vector Word16)))
52 | genSpec "DVS.Vector Word32" (undefined :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word32)) (SimpleBalancedParens (DVS.Vector Word32)))
53 | genSpec "DVS.Vector Word64" (undefined :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word64)) (SimpleBalancedParens (DVS.Vector Word64)))
54 | genSpec "Poppy512" (undefined :: XmlCursor BS.ByteString Poppy512 (SimpleBalancedParens (DVS.Vector Word64)))
55 |
56 | rawValueVia :: XmlIndexAt (XmlCursor BS.ByteString t u)
57 | => Maybe (XmlCursor BS.ByteString t u) -> RawValue
58 | rawValueVia mk = case mk of
59 | Just k -> rawValueAt (xmlIndexAt k) --either (\(DecodeError e) -> XmlError e) id (rawValueAt <$> xmlIndexAt k)
60 | Nothing -> RawError "No such element"
61 |
62 | genSpec :: forall t u.
63 | ( Show t
64 | , Select1 t
65 | , Show u
66 | , Rank0 u
67 | , Rank1 u
68 | , BalancedParens u
69 | , TestBit u
70 | , IsString (XmlCursor BS.ByteString t u)
71 | )
72 | => String -> XmlCursor BS.ByteString t u -> SpecWith ()
73 | genSpec t _ = do
74 | describe ("XML cursor of type " <> t) $ do
75 | let forXml (cursor :: XmlCursor BS.ByteString t u) f = describe ("of value " <> show cursor) (f cursor)
76 |
77 | forXml "" $ \cursor -> do
78 | it "should have correct value" $ requireTest $ rawValueVia (Just cursor) === RawElement "a" []
79 |
80 | forXml "" $ \cursor -> do
81 | it "should have correct value" $ requireTest $ rawValueVia (Just cursor) ===
82 | RawElement "a" [attrs [("attr", "value")]]
83 |
84 | forXml "" $ \cursor -> do
85 | it "should have correct value" $ requireTest $ rawValueVia (Just cursor) ===
86 | RawElement "a" [attrs [("attr", "value")],
87 | RawElement "b" [attrs [("attr", "value")]]]
88 |
89 | forXml "value text" $ \cursor -> do
90 | it "should have correct value" $ requireTest $ rawValueVia (Just cursor) ===
91 | RawElement "a" [RawText "value text"]
92 |
93 | forXml "" $ \cursor -> do
94 | it "should parse space separared comment" $ requireTest $ rawValueVia (Just cursor) ===
95 | RawComment " some comment "
96 |
97 | forXml "" $ \cursor -> do
98 | it "should parse space separared comment" $ requireTest $ rawValueVia (Just cursor) ===
99 | RawComment "some comment ->"
100 |
101 | forXml " tag]]>" $ \cursor -> do
102 | it "should parse cdata data" $ requireTest $ rawValueVia (Just cursor) ===
103 | RawCData "a
tag"
104 |
105 | forXml "]>" $ \cursor -> do
106 | it "should parse metas" $ requireTest $ rawValueVia (Just cursor) ===
107 | RawMeta "DOCTYPE" [RawMeta "ELEMENT" []]
108 |
109 | forXml "free" $ \cursor -> do
110 | it "should parse xml header" $ requireTest $ rawValueVia (Just cursor) ===
111 | RawDocument [
112 | attrs [("version", "1.0"), ("encoding", "UTF-8")],
113 | RawElement "a" [attrs [("text", "value")],
114 | RawText "free"]]
115 |
116 | it "navigate around" $ requireTest $ do
117 | rawValueVia (ns cursor) === RawElement "a" [attrs [("text", "value")], RawText "free"]
118 | rawValueVia ((ns >=> fc) cursor) === attrs [("text", "value")]
119 | rawValueVia ((ns >=> fc >=> fc) cursor) === RawAttrName "text"
120 | rawValueVia ((ns >=> fc >=> fc >=> ns) cursor) === RawAttrValue "value"
121 | rawValueVia ((ns >=> fc >=> ns) cursor) === RawText "free"
122 |
--------------------------------------------------------------------------------
/.github/workflows/haskell.yml:
--------------------------------------------------------------------------------
1 | name: Binaries
2 |
3 | defaults:
4 | run:
5 | shell: bash
6 |
7 | on:
8 | push:
9 | branches:
10 | - main
11 | pull_request:
12 |
13 | jobs:
14 | build:
15 | runs-on: ${{ matrix.os }}
16 |
17 | strategy:
18 | fail-fast: false
19 | matrix:
20 | ghc: ["9.12.2", "9.10.2", "9.8.4", "9.6.7"]
21 | os: [ubuntu-latest]
22 |
23 | env:
24 | # Modify this value to "invalidate" the cabal cache.
25 | CABAL_CACHE_VERSION: "2024-01-05"
26 |
27 | steps:
28 | - uses: actions/checkout@v2
29 |
30 | - uses: haskell-actions/setup@v2
31 | id: setup-haskell
32 | with:
33 | ghc-version: ${{ matrix.ghc }}
34 | cabal-version: '3.10.2.1'
35 |
36 | - name: Set some window specific things
37 | if: matrix.os == 'windows-latest'
38 | run: echo 'EXE_EXT=.exe' >> $GITHUB_ENV
39 |
40 | - name: Configure project
41 | run: |
42 | cabal configure --enable-tests --enable-benchmarks --write-ghc-environment-files=ghc8.4.4+
43 | cabal build all --enable-tests --enable-benchmarks --dry-run
44 |
45 | - name: Cabal cache over S3
46 | uses: action-works/cabal-cache-s3@v1
47 | env:
48 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
49 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
50 | with:
51 | region: us-west-2
52 | dist-dir: dist-newstyle
53 | store-path: ${{ steps.setup-haskell.outputs.cabal-store }}
54 | threads: 16
55 | archive-uri: ${{ secrets.BINARY_CACHE_URI }}/${{ env.CABAL_CACHE_VERSION }}/${{ runner.os }}/${{ matrix.cabal }}/${{ matrix.ghc }}
56 | skip: "${{ secrets.BINARY_CACHE_URI == '' }}"
57 |
58 | - name: Cabal cache over HTTPS
59 | uses: action-works/cabal-cache-s3@v1
60 | with:
61 | dist-dir: dist-newstyle
62 | store-path: ${{ steps.setup-haskell.outputs.cabal-store }}
63 | threads: 16
64 | archive-uri: https://cache.haskellworks.io/${{ env.CABAL_CACHE_VERSION }}/${{ runner.os }}/${{ matrix.cabal }}/${{ matrix.ghc }}
65 | skip: "${{ secrets.BINARY_CACHE_URI != '' }}"
66 |
67 | - name: Build
68 | run: cabal build all --enable-tests --enable-benchmarks
69 |
70 | - name: Test
71 | run: cabal test all --enable-tests --enable-benchmarks
72 |
73 | check:
74 | needs: build
75 | runs-on: ubuntu-latest
76 | outputs:
77 | tag: ${{ steps.tag.outputs.tag }}
78 |
79 | steps:
80 | - uses: actions/checkout@v2
81 |
82 | - name: Check if cabal project is sane
83 | run: |
84 | PROJECT_DIR=$PWD
85 | mkdir -p $PROJECT_DIR/build/sdist
86 | for i in $(git ls-files | grep '\.cabal'); do
87 | cd $PROJECT_DIR && cd `dirname $i`
88 | cabal check
89 | done
90 |
91 | - name: Tag new version
92 | id: tag
93 | if: ${{ github.ref == 'refs/heads/main' }}
94 | env:
95 | server: http://hackage.haskell.org
96 | username: ${{ secrets.HACKAGE_USER }}
97 | password: ${{ secrets.HACKAGE_PASS }}
98 | run: |
99 | package_version="$(cat *.cabal | grep '^version:' | cut -d : -f 2 | xargs)"
100 |
101 | echo "Package version is v$package_version"
102 |
103 | git fetch --unshallow origin
104 |
105 | if git tag "v$package_version"; then
106 | echo "Tagging with new version "v$package_version""
107 |
108 | if git push origin "v$package_version"; then
109 | echo "Tagged with new version "v$package_version""
110 |
111 | echo "::set-output name=tag::v$package_version"
112 | fi
113 | fi
114 |
115 | release:
116 | needs: [build, check]
117 | runs-on: ubuntu-latest
118 | if: ${{ needs.check.outputs.tag != '' }}
119 | outputs:
120 | upload_url: ${{ steps.create_release.outputs.upload_url }}
121 |
122 | steps:
123 | - uses: actions/checkout@v2
124 |
125 | - name: Create source distribution
126 | run: |
127 | PROJECT_DIR=$PWD
128 | mkdir -p $PROJECT_DIR/build/sdist
129 | for i in $(git ls-files | grep '\.cabal'); do
130 | cd $PROJECT_DIR && cd `dirname $i`
131 | cabal v2-sdist -o $PROJECT_DIR/build/sdist
132 | done;
133 |
134 | - name: Publish to hackage
135 | env:
136 | server: http://hackage.haskell.org
137 | username: ${{ secrets.HACKAGE_USER }}
138 | password: ${{ secrets.HACKAGE_PASS }}
139 | candidate: false
140 | run: |
141 | package_version="$(cat *.cabal | grep '^version:' | cut -d : -f 2 | xargs)"
142 |
143 | for PACKAGE_TARBALL in $(find ./build/sdist/ -name "*.tar.gz"); do
144 | PACKAGE_NAME=$(basename ${PACKAGE_TARBALL%.*.*})
145 |
146 | if ${{ env.candidate }}; then
147 | TARGET_URL="${{ env.server }}/packages/candidates";
148 | DOCS_URL="${{ env.server }}/package/$PACKAGE_NAME/candidate/docs"
149 | else
150 | TARGET_URL="${{ env.server }}/packages/upload";
151 | DOCS_URL="${{ env.server }}/package/$PACKAGE_NAME/docs"
152 | fi
153 |
154 | HACKAGE_STATUS=$(curl --silent --head -w %{http_code} -XGET --anyauth --user "${{ env.username }}:${{ env.password }}" ${{ env.server }}/package/$PACKAGE_NAME -o /dev/null)
155 |
156 | if [ "$HACKAGE_STATUS" = "404" ]; then
157 | echo "Uploading $PACKAGE_NAME to $TARGET_URL"
158 |
159 | curl -X POST -f --user "${{ env.username }}:${{ env.password }}" $TARGET_URL -F "package=@$PACKAGE_TARBALL"
160 | echo "Uploaded $PACKAGE_NAME"
161 | else
162 | echo "Package $PACKAGE_NAME" already exists on Hackage.
163 | fi
164 | done
165 |
166 | - name: Create Release
167 | id: create_release
168 | uses: actions/create-release@v1
169 | env:
170 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token
171 | with:
172 | tag_name: ${{ github.ref }}
173 | release_name: Release ${{ github.ref }}
174 | body: Undocumented
175 | draft: true
176 | prerelease: false
177 |
--------------------------------------------------------------------------------
/test/HaskellWorks/Data/Xml/Internal/BlankSpec.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE OverloadedStrings #-}
2 | {-# LANGUAGE ScopedTypeVariables #-}
3 |
4 | module HaskellWorks.Data.Xml.Internal.BlankSpec (spec) where
5 |
6 | import Data.Char
7 | import HaskellWorks.Data.ByteString
8 | import HaskellWorks.Data.Xml.Internal.Blank
9 | import HaskellWorks.Hspec.Hedgehog
10 | import Hedgehog
11 | import Test.Hspec
12 |
13 | import qualified Data.ByteString as BS
14 | import qualified Hedgehog.Gen as G
15 | import qualified Hedgehog.Range as R
16 |
17 | {- HLINT ignore "Redundant do" -}
18 | {- HLINT ignore "Reduce duplication" -}
19 |
20 | whenBlankedXmlShouldBe :: BS.ByteString -> BS.ByteString -> Spec
21 | whenBlankedXmlShouldBe original expected = do
22 | it (show original <> " when blanked xml should be " <> show expected) $ requireTest $ do
23 | BS.concat (blankXml [original]) === expected
24 |
25 | repeatBS :: Int -> BS.ByteString -> BS.ByteString
26 | repeatBS n bs | n > 0 = bs <> repeatBS (n - 1) bs
27 | repeatBS _ _ = BS.empty
28 |
29 | noSpaces :: BS.ByteString -> BS.ByteString
30 | noSpaces = BS.filter (/= fromIntegral (ord ' '))
31 |
32 | data Annotated a b = Annotated a b deriving Show
33 |
34 | instance Eq a => Eq (Annotated a b) where
35 | (Annotated a _) == (Annotated b _) = a == b
36 |
37 | spec :: Spec
38 | spec = describe "HaskellWorks.Data.Xml.Internal.BlankSpec" $ do
39 | describe "Can blank XML" $ do
40 | "" `whenBlankedXmlShouldBe` "< >"
41 | "" `whenBlankedXmlShouldBe` "< >"
42 | "text" `whenBlankedXmlShouldBe` "< t >"
43 | " text " `whenBlankedXmlShouldBe` "< t >"
44 | "" `whenBlankedXmlShouldBe` "< ()>"
45 | "" `whenBlankedXmlShouldBe` "< (a v )>"
46 | "" `whenBlankedXmlShouldBe` "< (a v )>"
47 | "" `whenBlankedXmlShouldBe` "< (a v )>"
48 | "" `whenBlankedXmlShouldBe` "< (a v a v )>"
49 | "text" `whenBlankedXmlShouldBe` "< (a v a v )t >"
50 | "" `whenBlankedXmlShouldBe` "< (a v a v )>"
51 | "" `whenBlankedXmlShouldBe` "< (a v )< > >"
52 | "test" `whenBlankedXmlShouldBe` "< (a v )< t > >"
53 | " text bold " `whenBlankedXmlShouldBe` "< t < t > >"
54 | " text bold uuu" `whenBlankedXmlShouldBe` "< t < t > t >"
55 | "" `whenBlankedXmlShouldBe` "< (a v )>"
56 | " " `whenBlankedXmlShouldBe` "< [ ] >"
57 | " " `whenBlankedXmlShouldBe` "< [ ] >"
58 |
59 | " " `whenBlankedXmlShouldBe` "< [ ] >"
60 | "" `whenBlankedXmlShouldBe` "< (a v a v )>"
61 |
62 | "]>" `whenBlankedXmlShouldBe` "[ [ ] ]"
64 |
65 | "Hello,\
66 | \ world!]]>" `whenBlankedXmlShouldBe` "< [ ] >"
67 |
68 | "" `whenBlankedXmlShouldBe` "< [ ] >"
69 | "00" `whenBlankedXmlShouldBe` "< < t >< > >"
70 | "0" `whenBlankedXmlShouldBe` "< < t >< > >"
71 |
72 | it "Can blank across chunk boundaries with basic tags" $ requireTest $ do
73 | let inputOriginalPrefix = "\n\n "
74 | let inputOriginalSuffix = "\n \n \n \n \n \n\n"
75 | let inputOriginal = inputOriginalPrefix <> inputOriginalSuffix
76 | let inputOriginalChunked = chunkedBy 16 inputOriginal
77 | let inputOriginalBlanked = blankXml inputOriginalChunked
78 |
79 | n <- forAll $ G.int (R.linear 0 16)
80 |
81 | let inputShifted = inputOriginalPrefix <> repeatBS n " " <> inputOriginalSuffix
82 | let inputShiftedChunked = chunkedBy 16 inputShifted
83 | let inputShiftedBlanked = blankXml inputShiftedChunked
84 |
85 | noSpaces (BS.concat inputShiftedBlanked) === noSpaces (BS.concat inputOriginalBlanked)
86 | it "Can blank across chunk boundaries with auto-close tags" $ requireTest $ do
87 | let inputOriginalPrefix = ""
88 | let inputOriginalSuffix = "\n"
89 | let inputOriginal = inputOriginalPrefix <> inputOriginalSuffix
90 | let inputOriginalChunked = chunkedBy 16 inputOriginal
91 | let inputOriginalBlanked = blankXml inputOriginalChunked
92 |
93 | n <- forAll $ G.int (R.linear 0 16)
94 |
95 | let inputShifted = inputOriginalPrefix <> repeatBS n " " <> inputOriginalSuffix
96 | let inputShiftedChunked = chunkedBy 16 inputShifted
97 | let inputShiftedBlanked = blankXml inputShiftedChunked
98 |
99 | -- putStrLn $ show (BS.concat inputShiftedBlanked) <> " vs " <> show (BS.concat inputOriginalBlanked)
100 | let actual = Annotated (noSpaces (BS.concat inputShiftedBlanked )) (inputShiftedBlanked, n)
101 | let expected = Annotated (noSpaces (BS.concat inputOriginalBlanked)) (inputOriginalBlanked, n)
102 |
103 | actual === expected
104 | it "Can blank across chunk boundaries with auto-close tags" $ requireTest $ do
105 | let inputOriginalPrefix = ""
106 | let inputOriginalSuffix = "\n"
107 | let inputOriginal = inputOriginalPrefix <> inputOriginalSuffix
108 | let inputOriginalChunked = chunkedBy 16 inputOriginal
109 | let inputOriginalBlanked = blankXml inputOriginalChunked
110 |
111 | let n = 15
112 | let inputShifted = inputOriginalPrefix <> repeatBS n " " <> inputOriginalSuffix
113 | let inputShiftedChunked = chunkedBy 16 inputShifted
114 | let inputShiftedBlanked = blankXml inputShiftedChunked
115 |
116 | -- putStrLn $ show (BS.concat inputShiftedBlanked) <> " vs " <> show (BS.concat inputOriginalBlanked)
117 | let actual = Annotated (noSpaces (BS.concat inputShiftedBlanked )) (inputShiftedBlanked, n)
118 | let expected = Annotated (noSpaces (BS.concat inputOriginalBlanked)) (inputOriginalBlanked, n)
119 |
120 | actual === expected
121 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Token/Tokenize.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE FlexibleContexts #-}
2 | {-# LANGUAGE FlexibleInstances #-}
3 | {-# LANGUAGE MultiParamTypeClasses #-}
4 | {-# LANGUAGE OverloadedStrings #-}
5 | {-# LANGUAGE TypeSynonymInstances #-}
6 |
7 | module HaskellWorks.Data.Xml.Token.Tokenize
8 | ( IsChar(..)
9 | , XmlToken(..)
10 | , ParseXml(..)
11 | ) where
12 |
13 | import Control.Applicative
14 | import Data.Bits
15 | import Data.Char
16 | import Data.Word
17 | import Data.Word8
18 | import HaskellWorks.Data.Char.IsChar
19 | import HaskellWorks.Data.Parser as P
20 | import HaskellWorks.Data.Xml.Token.Types
21 |
22 | import qualified Data.Attoparsec.ByteString.Char8 as BC
23 | import qualified Data.Attoparsec.Combinator as AC
24 | import qualified Data.Attoparsec.Types as T
25 | import qualified Data.ByteString as BS
26 |
27 | hexDigitNumeric :: P.Parser t Word8 => T.Parser t Int
28 | hexDigitNumeric = do
29 | c <- satisfyChar (\c -> '0' <= c && c <= '9')
30 | return $ ord c - ord '0'
31 |
32 | hexDigitAlphaLower :: P.Parser t Word8 => T.Parser t Int
33 | hexDigitAlphaLower = do
34 | c <- satisfyChar (\c -> 'a' <= c && c <= 'z')
35 | return $ ord c - ord 'a' + 10
36 |
37 | hexDigitAlphaUpper :: P.Parser t Word8 => T.Parser t Int
38 | hexDigitAlphaUpper = do
39 | c <- satisfyChar (\c -> 'A' <= c && c <= 'Z')
40 | return $ ord c - ord 'A' + 10
41 |
42 | hexDigit :: P.Parser t Word8 => T.Parser t Int
43 | hexDigit = hexDigitNumeric <|> hexDigitAlphaLower <|> hexDigitAlphaUpper
44 |
45 | class ParseXml t s d where
46 | parseXmlTokenString :: T.Parser t (XmlToken s d)
47 | parseXmlToken :: T.Parser t (XmlToken s d)
48 | parseXmlTokenBraceL :: T.Parser t (XmlToken s d)
49 | parseXmlTokenBraceR :: T.Parser t (XmlToken s d)
50 | parseXmlTokenBracketL :: T.Parser t (XmlToken s d)
51 | parseXmlTokenBracketR :: T.Parser t (XmlToken s d)
52 | parseXmlTokenComma :: T.Parser t (XmlToken s d)
53 | parseXmlTokenColon :: T.Parser t (XmlToken s d)
54 | parseXmlTokenWhitespace :: T.Parser t (XmlToken s d)
55 | parseXmlTokenNull :: T.Parser t (XmlToken s d)
56 | parseXmlTokenBoolean :: T.Parser t (XmlToken s d)
57 | parseXmlTokenDouble :: T.Parser t (XmlToken s d)
58 |
59 | parseXmlToken =
60 | parseXmlTokenString <|>
61 | parseXmlTokenBraceL <|>
62 | parseXmlTokenBraceR <|>
63 | parseXmlTokenBracketL <|>
64 | parseXmlTokenBracketR <|>
65 | parseXmlTokenComma <|>
66 | parseXmlTokenColon <|>
67 | parseXmlTokenWhitespace <|>
68 | parseXmlTokenNull <|>
69 | parseXmlTokenBoolean <|>
70 | parseXmlTokenDouble
71 |
72 | instance ParseXml BS.ByteString String Double where
73 | parseXmlTokenBraceL = string "{" >> return XmlTokenBraceL
74 | parseXmlTokenBraceR = string "}" >> return XmlTokenBraceR
75 | parseXmlTokenBracketL = string "[" >> return XmlTokenBracketL
76 | parseXmlTokenBracketR = string "]" >> return XmlTokenBracketR
77 | parseXmlTokenComma = string "," >> return XmlTokenComma
78 | parseXmlTokenColon = string ":" >> return XmlTokenColon
79 | parseXmlTokenNull = string "null" >> return XmlTokenNull
80 | parseXmlTokenDouble = XmlTokenNumber <$> rational
81 |
82 | parseXmlTokenString = do
83 | _ <- string "\""
84 | value <- many (verbatimChar <|> escapedChar <|> escapedCode)
85 | _ <- string "\""
86 | return $ XmlTokenString value
87 | where
88 | verbatimChar = satisfyChar (BC.notInClass "\"\\") > "invalid string character"
89 | escapedChar = do
90 | _ <- string "\\"
91 | ( char '"' >> return '"' ) <|>
92 | ( char 'b' >> return '\b' ) <|>
93 | ( char 'n' >> return '\n' ) <|>
94 | ( char 'f' >> return '\f' ) <|>
95 | ( char 'r' >> return '\r' ) <|>
96 | ( char 't' >> return '\t' ) <|>
97 | ( char '\\' >> return '\\' ) <|>
98 | ( char '\'' >> return '\'' ) <|>
99 | ( char '/' >> return '/' )
100 | escapedCode :: T.Parser BS.ByteString Char
101 | escapedCode = do
102 | _ <- string "\\u"
103 | a <- hexDigit
104 | b <- hexDigit
105 | c <- hexDigit
106 | d <- hexDigit
107 | return . chr $ a `shift` 24 .|. b `shift` 16 .|. c `shift` 8 .|. d
108 |
109 | parseXmlTokenWhitespace = do
110 | _ <- AC.many1' $ BC.choice [string " ", string "\t", string "\n", string "\r"]
111 | return XmlTokenWhitespace
112 |
113 | parseXmlTokenBoolean = true <|> false
114 | where
115 | true = string "true" >> return (XmlTokenBoolean True)
116 | false = string "false" >> return (XmlTokenBoolean False)
117 |
118 | instance ParseXml BS.ByteString BS.ByteString Double where
119 | parseXmlTokenBraceL = string "{" >> return XmlTokenBraceL
120 | parseXmlTokenBraceR = string "}" >> return XmlTokenBraceR
121 | parseXmlTokenBracketL = string "[" >> return XmlTokenBracketL
122 | parseXmlTokenBracketR = string "]" >> return XmlTokenBracketR
123 | parseXmlTokenComma = string "," >> return XmlTokenComma
124 | parseXmlTokenColon = string ":" >> return XmlTokenColon
125 | parseXmlTokenNull = string "null" >> return XmlTokenNull
126 | parseXmlTokenDouble = XmlTokenNumber <$> rational
127 |
128 | parseXmlTokenString = do
129 | _ <- string "\""
130 | value <- many (verbatimChar <|> escapedChar <|> escapedCode)
131 | _ <- string "\""
132 | return . XmlTokenString $ BS.pack value
133 | where
134 | word :: Word8 -> T.Parser BS.ByteString Word8
135 | word w = satisfy (== w)
136 | verbatimChar :: T.Parser BS.ByteString Word8
137 | verbatimChar = satisfy (\w -> w /= _quotedbl && w /= _backslash) -- > "invalid string character"
138 | escapedChar :: T.Parser BS.ByteString Word8
139 | escapedChar = do
140 | _ <- string "\\"
141 | ( word _quotedbl >> return _quotedbl ) <|>
142 | ( word _b >> return 0x08 ) <|>
143 | ( word _n >> return _lf ) <|>
144 | ( word _f >> return _np ) <|>
145 | ( word _r >> return _cr ) <|>
146 | ( word _t >> return _tab ) <|>
147 | ( word _backslash >> return _backslash ) <|>
148 | ( word _quotesingle >> return _quotesingle ) <|>
149 | ( word _slash >> return _slash )
150 | escapedCode :: T.Parser BS.ByteString Word8
151 | escapedCode = do
152 | _ <- string "\\u"
153 | a <- hexDigit
154 | b <- hexDigit
155 | c <- hexDigit
156 | d <- hexDigit
157 | return . fromIntegral $ a `shift` 24 .|. b `shift` 16 .|. c `shift` 8 .|. d
158 |
159 | parseXmlTokenWhitespace = do
160 | _ <- AC.many1' $ BC.choice [string " ", string "\t", string "\n", string "\r"]
161 | return XmlTokenWhitespace
162 |
163 | parseXmlTokenBoolean = true <|> false
164 | where
165 | true = string "true" >> return (XmlTokenBoolean True)
166 | false = string "false" >> return (XmlTokenBoolean False)
167 |
--------------------------------------------------------------------------------
/.stylish-haskell.yaml:
--------------------------------------------------------------------------------
1 | # stylish-haskell configuration file
2 | # ==================================
3 |
4 | # The stylish-haskell tool is mainly configured by specifying steps. These steps
5 | # are a list, so they have an order, and one specific step may appear more than
6 | # once (if needed). Each file is processed by these steps in the given order.
7 | steps:
8 | # Convert some ASCII sequences to their Unicode equivalents. This is disabled
9 | # by default.
10 | # - unicode_syntax:
11 | # # In order to make this work, we also need to insert the UnicodeSyntax
12 | # # language pragma. If this flag is set to true, we insert it when it's
13 | # # not already present. You may want to disable it if you configure
14 | # # language extensions using some other method than pragmas. Default:
15 | # # true.
16 | # add_language_pragma: true
17 |
18 | # Align the right hand side of some elements. This is quite conservative
19 | # and only applies to statements where each element occupies a single
20 | # line.
21 | - simple_align:
22 | cases: true
23 | top_level_patterns: true
24 | records: true
25 |
26 | # Import cleanup
27 | - imports:
28 | # There are different ways we can align names and lists.
29 | #
30 | # - global: Align the import names and import list throughout the entire
31 | # file.
32 | #
33 | # - file: Like global, but don't add padding when there are no qualified
34 | # imports in the file.
35 | #
36 | # - group: Only align the imports per group (a group is formed by adjacent
37 | # import lines).
38 | #
39 | # - none: Do not perform any alignment.
40 | #
41 | # Default: global.
42 | align: group
43 |
44 | # Folowing options affect only import list alignment.
45 | #
46 | # List align has following options:
47 | #
48 | # - after_alias: Import list is aligned with end of import including
49 | # 'as' and 'hiding' keywords.
50 | #
51 | # > import qualified Data.List as List (concat, foldl, foldr, head,
52 | # > init, last, length)
53 | #
54 | # - with_alias: Import list is aligned with start of alias or hiding.
55 | #
56 | # > import qualified Data.List as List (concat, foldl, foldr, head,
57 | # > init, last, length)
58 | #
59 | # - new_line: Import list starts always on new line.
60 | #
61 | # > import qualified Data.List as List
62 | # > (concat, foldl, foldr, head, init, last, length)
63 | #
64 | # Default: after_alias
65 | list_align: after_alias
66 |
67 | # Long list align style takes effect when import is too long. This is
68 | # determined by 'columns' setting.
69 | #
70 | # - inline: This option will put as much specs on same line as possible.
71 | #
72 | # - new_line: Import list will start on new line.
73 | #
74 | # - new_line_multiline: Import list will start on new line when it's
75 | # short enough to fit to single line. Otherwise it'll be multiline.
76 | #
77 | # - multiline: One line per import list entry.
78 | # Type with contructor list acts like single import.
79 | #
80 | # > import qualified Data.Map as M
81 | # > ( empty
82 | # > , singleton
83 | # > , ...
84 | # > , delete
85 | # > )
86 | #
87 | # Default: inline
88 | long_list_align: inline
89 |
90 | # Align empty list (importing instances)
91 | #
92 | # Empty list align has following options
93 | #
94 | # - inherit: inherit list_align setting
95 | #
96 | # - right_after: () is right after the module name:
97 | #
98 | # > import Vector.Instances ()
99 | #
100 | # Default: inherit
101 | empty_list_align: inherit
102 |
103 | # List padding determines indentation of import list on lines after import.
104 | # This option affects 'long_list_align'.
105 | #
106 | # - : constant value
107 | #
108 | # - module_name: align under start of module name.
109 | # Useful for 'file' and 'group' align settings.
110 | list_padding: 4
111 |
112 | # Separate lists option affects formating of import list for type
113 | # or class. The only difference is single space between type and list
114 | # of constructors, selectors and class functions.
115 | #
116 | # - true: There is single space between Foldable type and list of it's
117 | # functions.
118 | #
119 | # > import Data.Foldable (Foldable (fold, foldl, foldMap))
120 | #
121 | # - false: There is no space between Foldable type and list of it's
122 | # functions.
123 | #
124 | # > import Data.Foldable (Foldable(fold, foldl, foldMap))
125 | #
126 | # Default: true
127 | separate_lists: true
128 |
129 | # Language pragmas
130 | - language_pragmas:
131 | # We can generate different styles of language pragma lists.
132 | #
133 | # - vertical: Vertical-spaced language pragmas, one per line.
134 | #
135 | # - compact: A more compact style.
136 | #
137 | # - compact_line: Similar to compact, but wrap each line with
138 | # `{-#LANGUAGE #-}'.
139 | #
140 | # Default: vertical.
141 | style: vertical
142 |
143 | # Align affects alignment of closing pragma brackets.
144 | #
145 | # - true: Brackets are aligned in same collumn.
146 | #
147 | # - false: Brackets are not aligned together. There is only one space
148 | # between actual import and closing bracket.
149 | #
150 | # Default: true
151 | align: true
152 |
153 | # stylish-haskell can detect redundancy of some language pragmas. If this
154 | # is set to true, it will remove those redundant pragmas. Default: true.
155 | remove_redundant: true
156 |
157 | # Replace tabs by spaces. This is disabled by default.
158 | # - tabs:
159 | # # Number of spaces to use for each tab. Default: 8, as specified by the
160 | # # Haskell report.
161 | # spaces: 8
162 |
163 | # Remove trailing whitespace
164 | - trailing_whitespace: {}
165 |
166 | # A common setting is the number of columns (parts of) code will be wrapped
167 | # to. Different steps take this into account. Default: 80.
168 | columns: 800
169 |
170 | # By default, line endings are converted according to the OS. You can override
171 | # preferred format here.
172 | #
173 | # - native: Native newline format. CRLF on Windows, LF on other OSes.
174 | #
175 | # - lf: Convert to LF ("\n").
176 | #
177 | # - crlf: Convert to CRLF ("\r\n").
178 | #
179 | # Default: native.
180 | newline: native
181 |
182 | # Sometimes, language extensions are specified in a cabal file or from the
183 | # command line instead of using language pragmas in the file. stylish-haskell
184 | # needs to be aware of these, so it can parse the file correctly.
185 | #
186 | # No language extensions are enabled by default.
187 | # language_extensions:
188 | # - TemplateHaskell
189 | # - QuasiQuotes
190 |
--------------------------------------------------------------------------------
/test/HaskellWorks/Data/Xml/Succinct/CursorSpec/Make.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE FlexibleContexts #-}
2 | {-# LANGUAGE FlexibleInstances #-}
3 | {-# LANGUAGE InstanceSigs #-}
4 | {-# LANGUAGE MultiParamTypeClasses #-}
5 | {-# LANGUAGE NoMonomorphismRestriction #-}
6 | {-# LANGUAGE OverloadedStrings #-}
7 | {-# LANGUAGE ScopedTypeVariables #-}
8 | {-# LANGUAGE TypeApplications #-}
9 |
10 | {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
11 |
12 | module HaskellWorks.Data.Xml.Succinct.CursorSpec.Make
13 | ( make
14 | ) where
15 |
16 | import Control.Monad
17 | import Data.Word
18 | import HaskellWorks.Data.BalancedParens.BalancedParens
19 | import HaskellWorks.Data.BalancedParens.Simple
20 | import HaskellWorks.Data.Bits.BitShow
21 | import HaskellWorks.Data.Bits.BitShown
22 | import HaskellWorks.Data.Bits.BitWise
23 | import HaskellWorks.Data.RankSelect.Base.Rank0
24 | import HaskellWorks.Data.RankSelect.Base.Rank1
25 | import HaskellWorks.Data.RankSelect.Base.Select1
26 | import HaskellWorks.Data.RankSelect.Poppy512
27 | import HaskellWorks.Data.Xml.Succinct.Cursor as C
28 | import HaskellWorks.Data.Xml.Token
29 | import HaskellWorks.Hspec.Hedgehog
30 | import Hedgehog
31 | import Test.Hspec
32 |
33 | import qualified Data.ByteString as BS
34 | import qualified Data.Text as T
35 | import qualified Data.Text.Encoding as T
36 | import qualified Data.Vector.Storable as DVS
37 | import qualified HaskellWorks.Data.FromByteString as BS
38 | import qualified HaskellWorks.Data.TreeCursor as TC
39 | import qualified HaskellWorks.Data.Xml.Succinct.Cursor.Create as CC
40 |
41 | {- HLINT ignore "Redundant do" -}
42 | {- HLINT ignore "Redundant bracket" -}
43 | {- HLINT ignore "Reduce duplication" -}
44 |
45 | fc = TC.firstChild
46 | ns = TC.nextSibling
47 | pn = TC.parent
48 | cd = TC.depth
49 | ss = TC.subtreeSize
50 |
51 | make :: forall t u.
52 | ( Select1 t
53 | , Rank0 u
54 | , Rank1 u
55 | , BalancedParens u
56 | , TestBit u
57 | )
58 | => String -> (BS.ByteString -> XmlCursor BS.ByteString t u) -> SpecWith ()
59 | make t mkCursor = do
60 | describe ("Cursor for (" ++ t ++ ")") $ do
61 | let forXml bs f = let cursor = mkCursor bs in describe (T.unpack ("of value " <> T.decodeUtf8 bs)) (f cursor)
62 | forXml "[null]" $ \cursor -> do
63 | xit "depth at top" $ requireTest $ cd cursor === Just 1
64 | xit "depth at first child of array" $ requireTest $ (fc >=> cd) cursor === Just 2
65 | forXml "[null, {\"field\": 1}]" $ \cursor -> do
66 | xit "depth at second child of array" $ requireTest $ do
67 | (fc >=> ns >=> cd) cursor === Just 2
68 | xit "depth at first child of object at second child of array" $ requireTest $ do
69 | (fc >=> ns >=> fc >=> cd) cursor === Just 3
70 | xit "depth at first child of object at second child of array" $ requireTest $ do
71 | (fc >=> ns >=> fc >=> ns >=> cd) cursor === Just 3
72 |
73 | describe "For sample XML" $ do
74 | let cursor = mkCursor " \
75 | \ \
76 | \ 500 \
77 | \ 600.01e-02 \
78 | \ false \
79 | \ \
80 | \" :: XmlCursor BS.ByteString t u
81 | xit "can get token at cursor" $ requireTest $ do
82 | (xmlTokenAt ) cursor === Just (XmlTokenBraceL )
83 | (fc >=> xmlTokenAt) cursor === Just (XmlTokenString "widget" )
84 | (fc >=> ns >=> xmlTokenAt) cursor === Just (XmlTokenBraceL )
85 | (fc >=> ns >=> fc >=> xmlTokenAt) cursor === Just (XmlTokenString "debug" )
86 | (fc >=> ns >=> fc >=> ns >=> xmlTokenAt) cursor === Just (XmlTokenString "on" )
87 | (fc >=> ns >=> fc >=> ns >=> ns >=> xmlTokenAt) cursor === Just (XmlTokenString "window" )
88 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> xmlTokenAt) cursor === Just (XmlTokenBraceL )
89 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> xmlTokenAt) cursor === Just (XmlTokenString "name" )
90 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ns >=> xmlTokenAt) cursor === Just (XmlTokenString "main_window" )
91 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ns >=> ns >=> xmlTokenAt) cursor === Just (XmlTokenString "dimensions" )
92 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ns >=> ns >=> ns >=> xmlTokenAt) cursor === Just (XmlTokenBracketL )
93 | -- xit "can navigate up" $ requireTest $ do
94 | -- ( pn) cursor === Nothing
95 | -- (fc >=> pn) cursor === Just cursor
96 | -- (fc >=> ns >=> pn) cursor === Just cursor
97 | -- (fc >=> ns >=> fc >=> pn) cursor === (fc >=> ns ) cursor
98 | -- (fc >=> ns >=> fc >=> ns >=> pn) cursor === (fc >=> ns ) cursor
99 | -- (fc >=> ns >=> fc >=> ns >=> ns >=> pn) cursor === (fc >=> ns ) cursor
100 | -- (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> pn) cursor === (fc >=> ns ) cursor
101 | -- (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> pn) cursor === (fc >=> ns >=> fc >=> ns >=> ns >=> ns) cursor
102 | -- (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ns >=> pn) cursor === (fc >=> ns >=> fc >=> ns >=> ns >=> ns) cursor
103 | -- (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ns >=> ns >=> pn) cursor === (fc >=> ns >=> fc >=> ns >=> ns >=> ns) cursor
104 | -- (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ns >=> ns >=> ns >=> pn) cursor === (fc >=> ns >=> fc >=> ns >=> ns >=> ns) cursor
105 | xit "can get subtree size" $ requireTest $ do
106 | ( ss) cursor === Just 16
107 | (fc >=> ss) cursor === Just 1
108 | (fc >=> ns >=> ss) cursor === Just 14
109 | (fc >=> ns >=> fc >=> ss) cursor === Just 1
110 | (fc >=> ns >=> fc >=> ns >=> ss) cursor === Just 1
111 | (fc >=> ns >=> fc >=> ns >=> ns >=> ss) cursor === Just 1
112 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> ss) cursor === Just 10
113 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ss) cursor === Just 1
114 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ns >=> ss) cursor === Just 1
115 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ns >=> ns >=> ss) cursor === Just 1
116 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ns >=> ns >=> ns >=> ss) cursor === Just 6
117 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Blank.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE BangPatterns #-}
2 | {-# LANGUAGE OverloadedStrings #-}
3 |
4 | module HaskellWorks.Data.Xml.Blank
5 | ( blankXml
6 | ) where
7 |
8 | import Data.ByteString (ByteString)
9 | import Data.Word
10 | import Data.Word8
11 | import HaskellWorks.Data.Xml.Internal.Words
12 | import Prelude
13 |
14 | import qualified Data.ByteString as BS
15 |
16 | type ExpectedChar = Word8
17 |
18 | data BlankState
19 | = InXml
20 | | InTag
21 | | InAttrList
22 | | InCloseTag
23 | | InClose
24 | | InBang Int
25 | | InString ExpectedChar
26 | | InText
27 | | InMeta
28 | | InCdataTag
29 | | InCdata Int
30 | | InRem Int
31 | | InIdent
32 | deriving (Eq, Show)
33 |
34 | data ByteStringP = BSP Word8 ByteString | EmptyBSP deriving Show
35 |
36 | blankXml :: BS.ByteString -> BS.ByteString
37 | blankXml as = fst (BS.unfoldrN (BS.length as) go (InXml, as))
38 | where go :: (BlankState, ByteString) -> Maybe (Word8, (BlankState, ByteString))
39 | go (InXml, bs) = case BS.uncons bs of
40 | Just (!c, !cs) | isMetaStart c cs -> Just (_bracketleft , (InMeta , cs))
41 | Just (!c, !cs) | isEndTag c cs -> Just (_space , (InCloseTag , cs))
42 | Just (!c, !cs) | isTextStart c -> Just (_t , (InText , cs))
43 | Just (!c, !cs) | c == _less -> Just (_less , (InTag , cs))
44 | Just (!c, !cs) | isSpace c -> Just (c , (InXml , cs))
45 | Just ( _, !cs) -> Just (_space , (InXml , cs))
46 | Nothing -> Nothing
47 | go (InTag, bs) = case BS.uncons bs of
48 | Just (!c, !cs) | isSpace c -> Just (_parenleft , (InAttrList , cs))
49 | Just (!c, !cs) | isTagClose c cs -> Just (_space , (InClose , cs))
50 | Just (!c, !cs) | c == _greater -> Just (_space , (InXml , cs))
51 | Just (!c, !cs) | isSpace c -> Just (c , (InTag , cs))
52 | Just ( _, !cs) -> Just (_space , (InTag , cs))
53 | Nothing -> Nothing
54 | go (InCloseTag, bs) = case BS.uncons bs of
55 | Just (!c, !cs) | c == _greater -> Just (_greater , (InXml , cs))
56 | Just ( _, !cs) -> Just (_space , (InCloseTag , cs))
57 | Nothing -> Nothing
58 | go (InAttrList, bs) = case BS.uncons bs of
59 | Just (!c, !cs) | c == _greater -> Just (_parenright , (InXml , cs))
60 | Just (!c, !cs) | isTagClose c cs -> Just (_parenright , (InClose , cs))
61 | Just (!c, !cs) | isNameStartChar c -> Just (_a , (InIdent , cs))
62 | Just (!c, !cs) | isQuote c -> Just (_v , (InString c , cs))
63 | Just (!c, !cs) | isSpace c -> Just (c , (InAttrList , cs))
64 | Just ( _, !cs) -> Just (_space , (InAttrList , cs))
65 | Nothing -> Nothing
66 | go (InClose, bs) = case BS.uncons bs of
67 | Just (_, !cs) -> Just (_greater , (InXml , cs))
68 | Nothing -> Nothing
69 | go (InIdent, bs) = case BS.uncons bs of
70 | Just (!c, !cs) | isNameChar c -> Just (_space , (InIdent , cs))
71 | Just (!c, !cs) | isSpace c -> Just (_space , (InAttrList , cs))
72 | Just (!c, !cs) | c == _equal -> Just (_space , (InAttrList , cs))
73 | Just (!c, !cs) | isSpace c -> Just (c , (InAttrList , cs))
74 | Just ( _, !cs) -> Just (_space , (InAttrList , cs))
75 | Nothing -> Nothing
76 | go (InString q, bs) = case BS.uncons bs of
77 | Just (!c, !cs) | c == q -> Just (_space , (InAttrList , cs))
78 | Just (!c, !cs) | isSpace c -> Just (c , (InString q , cs))
79 | Just ( _, !cs) -> Just (_space , (InString q , cs))
80 | Nothing -> Nothing
81 | go (InText, bs) = case BS.uncons bs of
82 | Just (!c, !cs) | isEndTag c cs -> Just (_space , (InCloseTag , cs))
83 | Just ( _, !cs) | headIs (== _less) cs -> Just (_space , (InXml , cs))
84 | Just (!c, !cs) | isSpace c -> Just (c , (InText , cs))
85 | Just ( _, !cs) -> Just (_space , (InText , cs))
86 | Nothing -> Nothing
87 | go (InMeta, bs) = case BS.uncons bs of
88 | Just (!c, !cs) | c == _exclam -> Just (_space , (InMeta , cs))
89 | Just (!c, !cs) | c == _hyphen -> Just (_space , (InRem 0 , cs))
90 | Just (!c, !cs) | c == _bracketleft -> Just (_space , (InCdataTag , cs))
91 | Just (!c, !cs) | c == _greater -> Just (_bracketright, (InXml , cs))
92 | Just (!c, !cs) | isSpace c -> Just (c , (InBang 1 , cs))
93 | Just ( _, !cs) -> Just (_space , (InBang 1 , cs))
94 | Nothing -> Nothing
95 | go (InCdataTag, bs) = case BS.uncons bs of
96 | Just (!c, !cs) | c == _bracketleft -> Just (_space , (InCdata 0 , cs))
97 | Just (!c, !cs) | isSpace c -> Just (c , (InCdataTag , cs))
98 | Just ( _, !cs) -> Just (_space , (InCdataTag , cs))
99 | Nothing -> Nothing
100 | go (InCdata n, bs) = case BS.uncons bs of
101 | Just (!c, !cs) | c == _greater && n >= 2 -> Just (_bracketright, (InXml , cs))
102 | Just (!c, !cs) | isCdataEnd c cs && n > 0 -> Just (_space , (InCdata (n+1), cs))
103 | Just (!c, !cs) | c == _bracketright -> Just (_space , (InCdata (n+1), cs))
104 | Just (!c, !cs) | isSpace c -> Just (c , (InCdata 0 , cs))
105 | Just ( _, !cs) -> Just (_space , (InCdata 0 , cs))
106 | Nothing -> Nothing
107 | go (InRem n, bs) = case BS.uncons bs of
108 | Just (!c, !cs) | c == _greater && n >= 2 -> Just (_bracketright, (InXml , cs))
109 | Just (!c, !cs) | c == _hyphen -> Just (_space , (InRem (n+1) , cs))
110 | Just (!c, !cs) | isSpace c -> Just (c , (InRem 0 , cs))
111 | Just ( _, !cs) -> Just (_space , (InRem 0 , cs))
112 | Nothing -> Nothing
113 | go (InBang n, bs) = case BS.uncons bs of
114 | Just (!c, !cs) | c == _less -> Just (_bracketleft , (InBang (n+1) , cs))
115 | Just (!c, !cs) | c == _greater && n == 1 -> Just (_bracketright, (InXml , cs))
116 | Just (!c, !cs) | c == _greater -> Just (_bracketright, (InBang (n-1) , cs))
117 | Just (!c, !cs) | isSpace c -> Just (c , (InBang n , cs))
118 | Just ( _, !cs) -> Just (_space , (InBang n , cs))
119 | Nothing -> Nothing
120 |
121 | isEndTag :: Word8 -> ByteString -> Bool
122 | isEndTag c cs = c == _less && headIs (== _slash) cs
123 | {-# INLINE isEndTag #-}
124 |
125 | isTagClose :: Word8 -> ByteString -> Bool
126 | isTagClose c cs = (c == _slash) || ((c == _slash || c == _question) && headIs (== _greater) cs)
127 | {-# INLINE isTagClose #-}
128 |
129 | isMetaStart :: Word8 -> ByteString -> Bool
130 | isMetaStart c cs = c == _less && headIs (== _exclam) cs
131 | {-# INLINE isMetaStart #-}
132 |
133 | isCdataEnd :: Word8 -> ByteString -> Bool
134 | isCdataEnd c cs = c == _bracketright && headIs (== _greater) cs
135 | {-# INLINE isCdataEnd #-}
136 |
137 | headIs :: (Word8 -> Bool) -> ByteString -> Bool
138 | headIs p bs = case BS.uncons bs of
139 | Just (!c, _) -> p c
140 | Nothing -> False
141 | {-# INLINE headIs #-}
142 |
--------------------------------------------------------------------------------
/data/catalog.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Bloodroot
5 | Sanguinaria canadensis
6 | 4
7 | Mostly Shady
8 | $2.44
9 | 031599
10 |
11 |
12 |
13 | Columbine
14 | Aquilegia canadensis
15 | 3
16 | Mostly Shady
17 | $9.37
18 | 030699
19 |
20 |
21 |
22 | Marsh Marigold
23 | Caltha palustris
24 | 4
25 | Mostly Sunny
26 | $6.81
27 | 051799
28 |
29 |
30 |
31 | Cowslip
32 | Caltha palustris
33 | 4
34 | Mostly Shady
35 | $9.90
36 | 030699
37 |
38 |
39 |
40 | Dutchman's-Breeches
41 | Diecentra cucullaria
42 | 3
43 | Mostly Shady
44 | $6.44
45 | 012099
46 |
47 |
48 |
49 | Ginger, Wild
50 | Asarum canadense
51 | 3
52 | Mostly Shady
53 | $9.03
54 | 041899
55 |
56 |
57 |
58 | Hepatica
59 | Hepatica americana
60 | 4
61 | Mostly Shady
62 | $4.45
63 | 012699
64 |
65 |
66 |
67 | Liverleaf
68 | Hepatica americana
69 | 4
70 | Mostly Shady
71 | $3.99
72 | 010299
73 |
74 |
75 |
76 | Jack-In-The-Pulpit
77 | Arisaema triphyllum
78 | 4
79 | Mostly Shady
80 | $3.23
81 | 020199
82 |
83 |
84 |
85 | Mayapple
86 | Podophyllum peltatum
87 | 3
88 | Mostly Shady
89 | $2.98
90 | 060599
91 |
92 |
93 |
94 | Phlox, Woodland
95 | Phlox divaricata
96 | 3
97 | Sun or Shade
98 | $2.80
99 | 012299
100 |
101 |
102 |
103 | Phlox, Blue
104 | Phlox divaricata
105 | 3
106 | Sun or Shade
107 | $5.59
108 | 021699
109 |
110 |
111 |
112 | Spring-Beauty
113 | Claytonia Virginica
114 | 7
115 | Mostly Shady
116 | $6.59
117 | 020199
118 |
119 |
120 |
121 | Trillium
122 | Trillium grandiflorum
123 | 5
124 | Sun or Shade
125 | $3.90
126 | 042999
127 |
128 |
129 |
130 | Wake Robin
131 | Trillium grandiflorum
132 | 5
133 | Sun or Shade
134 | $3.20
135 | 022199
136 |
137 |
138 |
139 | Violet, Dog-Tooth
140 | Erythronium americanum
141 | 4
142 | Shade
143 | $9.04
144 | 020199
145 |
146 |
147 |
148 | Trout Lily
149 | Erythronium americanum
150 | 4
151 | Shade
152 | $6.94
153 | 032499
154 |
155 |
156 |
157 | Adder's-Tongue
158 | Erythronium americanum
159 | 4
160 | Shade
161 | $9.58
162 | 041399
163 |
164 |
165 |
166 | Anemone
167 | Anemone blanda
168 | 6
169 | Mostly Shady
170 | $8.86
171 | 122698
172 |
173 |
174 |
175 | Grecian Windflower
176 | Anemone blanda
177 | 6
178 | Mostly Shady
179 | $9.16
180 | 071099
181 |
182 |
183 |
184 | Bee Balm
185 | Monarda didyma
186 | 4
187 | Shade
188 | $4.59
189 | 050399
190 |
191 |
192 |
193 | Bergamont
194 | Monarda didyma
195 | 4
196 | Shade
197 | $7.16
198 | 042799
199 |
200 |
201 |
202 | Black-Eyed Susan
203 | Rudbeckia hirta
204 | Annual
205 | Sunny
206 | $9.80
207 | 061899
208 |
209 |
210 |
211 | Buttercup
212 | Ranunculus
213 | 4
214 | Shade
215 | $2.57
216 | 061099
217 |
218 |
219 |
220 | Crowfoot
221 | Ranunculus
222 | 4
223 | Shade
224 | $9.34
225 | 040399
226 |
227 |
228 |
229 | Butterfly Weed
230 | Asclepias tuberosa
231 | Annual
232 | Sunny
233 | $2.78
234 | 063099
235 |
236 |
237 |
238 | Cinquefoil
239 | Potentilla
240 | Annual
241 | Shade
242 | $7.06
243 | 052599
244 |
245 |
246 |
247 | Primrose
248 | Oenothera
249 | 3 - 5
250 | Sunny
251 | $6.56
252 | 013099
253 |
254 |
255 |
256 | Gentian
257 | Gentiana
258 | 4
259 | Sun or Shade
260 | $7.81
261 | 051899
262 |
263 |
264 |
265 | Blue Gentian
266 | Gentiana
267 | 4
268 | Sun or Shade
269 | $8.56
270 | 050299
271 |
272 |
273 |
274 | Jacob's Ladder
275 | Polemonium caeruleum
276 | Annual
277 | Shade
278 | $9.26
279 | 022199
280 |
281 |
282 |
283 | Greek Valerian
284 | Polemonium caeruleum
285 | Annual
286 | Shade
287 | $4.36
288 | 071499
289 |
290 |
291 |
292 | California Poppy
293 | Eschscholzia californica
294 | Annual
295 | Sun
296 | $7.89
297 | 032799
298 |
299 |
300 |
301 | Shooting Star
302 | Dodecatheon
303 | Annual
304 | Mostly Shady
305 | $8.60
306 | 051399
307 |
308 |
309 |
310 | Snakeroot
311 | Cimicifuga
312 | Annual
313 | Shade
314 | $5.63
315 | 071199
316 |
317 |
318 |
319 | Cardinal Flower
320 | Lobelia cardinalis
321 | 2
322 | Shade
323 | $3.02
324 | 022299
325 |
326 |
327 |
--------------------------------------------------------------------------------
/src/HaskellWorks/Data/Xml/Internal/Blank.hs:
--------------------------------------------------------------------------------
1 | {-# OPTIONS_GHC-funbox-strict-fields #-}
2 | {-# LANGUAGE BangPatterns #-}
3 | {-# LANGUAGE OverloadedStrings #-}
4 |
5 | module HaskellWorks.Data.Xml.Internal.Blank
6 | ( blankXml
7 | , BlankData(..)
8 | ) where
9 |
10 | import Data.ByteString (ByteString)
11 | import Data.Word
12 | import Data.Word8
13 | import HaskellWorks.Data.Xml.Internal.Words
14 | import Prelude
15 |
16 | import qualified Data.ByteString as BS
17 |
18 | type ExpectedChar = Word8
19 |
20 | data BlankState
21 | = InXml
22 | | InTag
23 | | InAttrList
24 | | InCloseTag
25 | | InClose
26 | | InBang !Int
27 | | InString !ExpectedChar
28 | | InText
29 | | InMeta
30 | | InCdataTag
31 | | InCdata !Int
32 | | InRem !Int
33 | | InIdent
34 |
35 | data BlankData = BlankData
36 | { blankState :: !BlankState
37 | , blankA :: !Word8
38 | , blankB :: !Word8
39 | , blankC :: !ByteString
40 | }
41 |
42 | blankXml :: [ByteString] -> [ByteString]
43 | blankXml = blankXmlPlan1 BS.empty InXml
44 |
45 | blankXmlPlan1 :: ByteString -> BlankState -> [ByteString] -> [ByteString]
46 | blankXmlPlan1 as lastState is = case is of
47 | (bs:bss) -> do
48 | let cs = as <> bs
49 | case BS.uncons cs of
50 | Just (d, ds) -> case BS.uncons ds of
51 | Just (e, es) -> blankXmlRun False d e es lastState bss
52 | Nothing -> blankXmlPlan1 cs lastState bss
53 | Nothing -> blankXmlPlan1 cs lastState bss
54 | [] -> [BS.map (const _space) as]
55 |
56 | blankXmlPlan2 :: Word8 -> Word8 -> BlankState -> [ByteString] -> [ByteString]
57 | blankXmlPlan2 a b lastState is = case is of
58 | (cs:css) -> blankXmlRun False a b cs lastState css
59 | [] -> blankXmlRun True a b (BS.pack [_space, _space]) lastState []
60 |
61 | blankXmlRun :: Bool -> Word8 -> Word8 -> ByteString -> BlankState -> [ByteString] -> [ByteString]
62 | blankXmlRun done a b cs lastState is = do
63 | let (!ds, mState) = BS.unfoldrN (BS.length cs) blankByteString (BlankData lastState a b cs)
64 | case mState of
65 | Just (BlankData !nextState _ _ _) -> do
66 | let (yy, zz) = case BS.unsnoc cs of
67 | Just (ys, z) -> case BS.unsnoc ys of
68 | Just (_, y) -> (y, z)
69 | Nothing -> (b, z)
70 | Nothing -> (a, b)
71 | if done
72 | then [ds]
73 | else ds:blankXmlPlan2 yy zz nextState is
74 | Nothing -> error "No state: blankXmlRun"
75 |
76 | mkNext :: Word8 -> BlankState -> Word8 -> ByteString -> Maybe (Word8, BlankData)
77 | mkNext w s a bs = case BS.uncons bs of
78 | Just (b, cs) -> Just (w, BlankData s a b cs)
79 | Nothing -> error "This should never happen"
80 | {-# INLINE mkNext #-}
81 |
82 | blankByteString :: BlankData -> Maybe (Word8, BlankData)
83 | blankByteString (BlankData InXml a b cs) | isMetaStart a b = mkNext _bracketleft InMeta b cs
84 | blankByteString (BlankData InXml a b cs) | isEndTag a b = mkNext _space InCloseTag b cs
85 | blankByteString (BlankData InXml a b cs) | isTextStart a = mkNext _t InText b cs
86 | blankByteString (BlankData InXml a b cs) | a == _less = mkNext _less InTag b cs
87 | blankByteString (BlankData InXml a b cs) | isSpace a = mkNext a InXml b cs
88 | blankByteString (BlankData InXml _ b cs) = mkNext _space InXml b cs
89 | blankByteString (BlankData InTag a b cs) | isSpace a = mkNext _parenleft InAttrList b cs
90 | blankByteString (BlankData InTag a b cs) | isTagClose a b = mkNext _space InClose b cs
91 | blankByteString (BlankData InTag a b cs) | a == _greater = mkNext _space InXml b cs
92 | blankByteString (BlankData InTag a b cs) | isSpace a = mkNext a InTag b cs
93 | blankByteString (BlankData InTag _ b cs) = mkNext _space InTag b cs
94 | blankByteString (BlankData InCloseTag a b cs) | a == _greater = mkNext _greater InXml b cs
95 | blankByteString (BlankData InCloseTag a b cs) | isSpace a = mkNext a InCloseTag b cs
96 | blankByteString (BlankData InCloseTag _ b cs) = mkNext _space InCloseTag b cs
97 | blankByteString (BlankData InAttrList a b cs) | a == _greater = mkNext _parenright InXml b cs
98 | blankByteString (BlankData InAttrList a b cs) | isTagClose a b = mkNext _parenright InClose b cs
99 | blankByteString (BlankData InAttrList a b cs) | isNameStartChar a = mkNext _a InIdent b cs
100 | blankByteString (BlankData InAttrList a b cs) | isQuote a = mkNext _v (InString a) b cs
101 | blankByteString (BlankData InAttrList a b cs) | isSpace a = mkNext a InAttrList b cs
102 | blankByteString (BlankData InAttrList _ b cs) = mkNext _space InAttrList b cs
103 | blankByteString (BlankData InClose _ b cs) = mkNext _greater InXml b cs
104 | blankByteString (BlankData InIdent a b cs) | isNameChar a = mkNext _space InIdent b cs
105 | blankByteString (BlankData InIdent a b cs) | isSpace a = mkNext _space InAttrList b cs
106 | blankByteString (BlankData InIdent a b cs) | a == _equal = mkNext _space InAttrList b cs
107 | blankByteString (BlankData InIdent a b cs) | isSpace a = mkNext a InAttrList b cs
108 | blankByteString (BlankData InIdent _ b cs) = mkNext _space InAttrList b cs
109 | blankByteString (BlankData (InString q ) a b cs) | a == q = mkNext _space InAttrList b cs
110 | blankByteString (BlankData (InString q ) a b cs) | isSpace a = mkNext a (InString q) b cs
111 | blankByteString (BlankData (InString q ) _ b cs) = mkNext _space (InString q) b cs
112 | blankByteString (BlankData InText a b cs) | isEndTag a b = mkNext _space InCloseTag b cs
113 | blankByteString (BlankData InText _ b cs) | b == _less = mkNext _space InXml b cs
114 | blankByteString (BlankData InText a b cs) | isSpace a = mkNext a InText b cs
115 | blankByteString (BlankData InText _ b cs) = mkNext _space InText b cs
116 | blankByteString (BlankData InMeta a b cs) | a == _exclam = mkNext _space InMeta b cs
117 | blankByteString (BlankData InMeta a b cs) | a == _hyphen = mkNext _space (InRem 0) b cs
118 | blankByteString (BlankData InMeta a b cs) | a == _bracketleft = mkNext _space InCdataTag b cs
119 | blankByteString (BlankData InMeta a b cs) | a == _greater = mkNext _bracketright InXml b cs
120 | blankByteString (BlankData InMeta a b cs) | isSpace a = mkNext a (InBang 1) b cs
121 | blankByteString (BlankData InMeta _ b cs) = mkNext _space (InBang 1) b cs
122 | blankByteString (BlankData InCdataTag a b cs) | a == _bracketleft = mkNext _space (InCdata 0) b cs
123 | blankByteString (BlankData InCdataTag a b cs) | isSpace a = mkNext a InCdataTag b cs
124 | blankByteString (BlankData InCdataTag _ b cs) = mkNext _space InCdataTag b cs
125 | blankByteString (BlankData (InCdata n ) a b cs) | a == _greater && n >= 2 = mkNext _bracketright InXml b cs
126 | blankByteString (BlankData (InCdata n ) a b cs) | isCdataEnd a b && n > 0 = mkNext _space (InCdata (n+1)) b cs
127 | blankByteString (BlankData (InCdata n ) a b cs) | a == _bracketright = mkNext _space (InCdata (n+1)) b cs
128 | blankByteString (BlankData (InCdata _ ) a b cs) | isSpace a = mkNext a (InCdata 0) b cs
129 | blankByteString (BlankData (InCdata _ ) _ b cs) = mkNext _space (InCdata 0) b cs
130 | blankByteString (BlankData (InRem n ) a b cs) | a == _greater && n >= 2 = mkNext _bracketright InXml b cs
131 | blankByteString (BlankData (InRem n ) a b cs) | a == _hyphen = mkNext _space (InRem (n+1)) b cs
132 | blankByteString (BlankData (InRem _ ) a b cs) | isSpace a = mkNext a (InRem 0) b cs
133 | blankByteString (BlankData (InRem _ ) _ b cs) = mkNext _space (InRem 0) b cs
134 | blankByteString (BlankData (InBang n ) a b cs) | a == _less = mkNext _bracketleft (InBang (n+1)) b cs
135 | blankByteString (BlankData (InBang n ) a b cs) | a == _greater && n == 1 = mkNext _bracketright InXml b cs
136 | blankByteString (BlankData (InBang n ) a b cs) | a == _greater = mkNext _bracketright (InBang (n-1)) b cs
137 | blankByteString (BlankData (InBang n ) a b cs) | isSpace a = mkNext a (InBang n) b cs
138 | blankByteString (BlankData (InBang n ) _ b cs) = mkNext _space (InBang n) b cs
139 | {-# INLINE blankByteString #-}
140 |
141 | isEndTag :: Word8 -> Word8 -> Bool
142 | isEndTag a b = a == _less && b == _slash
143 | {-# INLINE isEndTag #-}
144 |
145 | isTagClose :: Word8 -> Word8 -> Bool
146 | isTagClose a b = a == _slash || ((a == _slash || a == _question) && b == _greater)
147 | {-# INLINE isTagClose #-}
148 |
149 | isMetaStart :: Word8 -> Word8 -> Bool
150 | isMetaStart a b = a == _less && b == _exclam
151 | {-# INLINE isMetaStart #-}
152 |
153 | isCdataEnd :: Word8 -> Word8 -> Bool
154 | isCdataEnd a b = a == _bracketright && b == _greater
155 | {-# INLINE isCdataEnd #-}
156 |
--------------------------------------------------------------------------------
/hw-xml.cabal:
--------------------------------------------------------------------------------
1 | cabal-version: 2.2
2 |
3 | name: hw-xml
4 | version: 0.5.1.2
5 | synopsis: XML parser based on succinct data structures.
6 | description: XML parser based on succinct data structures. Please see README.md
7 | category: Data, XML, Succinct Data Structures, Data Structures
8 | homepage: http://github.com/haskell-works/hw-xml#readme
9 | bug-reports: https://github.com/haskell-works/hw-xml/issues
10 | author: John Ky,
11 | Alexey Raga
12 | maintainer: alexey.raga@gmail.com
13 | copyright: 2016-2025 John Ky
14 | , 2016-2019 Alexey Raga
15 | license: BSD-3-Clause
16 | license-file: LICENSE
17 | tested-with: GHC == 9.12.2, GHC == 9.10.2, GHC == 9.8.4, GHC == 9.6.7, GHC == 9.4.4, GHC == 9.0.2, GHC == 8.10.7, GHC == 8.8.4, GHC == 8.6.5
18 | build-type: Simple
19 | extra-source-files: README.md
20 | data-files: data/catalog.xml
21 |
22 | source-repository head
23 | type: git
24 | location: https://github.com/haskell-works/hw-xml
25 |
26 | common base { build-depends: base >= 4.11 && < 5 }
27 |
28 | common prettyprinter { build-depends: prettyprinter >= 0.6.9 && < 2 }
29 | common array { build-depends: array >= 0.5.2.0 && < 0.6 }
30 | common attoparsec { build-depends: attoparsec >= 0.13.2.2 && < 0.15 }
31 | common bytestring { build-depends: bytestring >= 0.10.8.2 && < 0.13 }
32 | common cereal { build-depends: cereal >= 0.5.8.1 && < 0.6 }
33 | common containers { build-depends: containers >= 0.5.10.2 && < 0.8 }
34 | common criterion { build-depends: criterion >= 1.5.5.0 && < 1.7 }
35 | common deepseq { build-depends: deepseq >= 1.4.3.0 && < 1.6 }
36 | common doctest { build-depends: doctest >= 0.16.2 && < 0.23 }
37 | common doctest-discover { build-depends: doctest-discover >= 0.2 && < 0.3 }
38 | common generic-lens { build-depends: generic-lens >= 2.2 && < 2.3 }
39 | common ghc-prim { build-depends: ghc-prim >= 0.5 && < 1 }
40 | common hedgehog { build-depends: hedgehog >= 1.0 && < 2 }
41 | common hspec { build-depends: hspec >= 2.5 && < 3 }
42 | common hw-balancedparens { build-depends: hw-balancedparens >= 0.3.0.1 && < 0.5 }
43 | common hw-bits { build-depends: hw-bits >= 0.7.0.9 && < 0.8 }
44 | common hw-hspec-hedgehog { build-depends: hw-hspec-hedgehog >= 0.1 && < 0.2 }
45 | common hw-parser { build-depends: hw-parser >= 0.1.0.1 && < 0.2 }
46 | common hw-prim { build-depends: hw-prim >= 0.6.2.39 && < 0.7 }
47 | common hw-rankselect { build-depends: hw-rankselect >= 0.13.2.0 && < 0.14 }
48 | common hw-rankselect-base { build-depends: hw-rankselect-base >= 0.3.2.1 && < 0.4 }
49 | common lens { build-depends: lens >= 4.17.1 && < 6 }
50 | common mmap { build-depends: mmap >= 0.5.9 && < 0.6 }
51 | common mtl { build-depends: mtl >= 2.2.2 && < 3 }
52 | common optparse-applicative { build-depends: optparse-applicative >= 0.15.1.0 && < 0.19 }
53 | common resourcet { build-depends: resourcet >= 1.2.2 && < 2 }
54 | common text { build-depends: text >= 1.2.3.2 && < 3 }
55 | common transformers { build-depends: transformers >= 0.5 && < 0.7 }
56 | common vector { build-depends: vector >= 0.12.0.3 && < 0.14 }
57 | common word8 { build-depends: word8 >= 0.1.3 && < 0.2 }
58 |
59 | common config
60 | default-language: Haskell2010
61 |
62 | common hw-xml
63 | build-depends: hw-xml
64 |
65 | library
66 | import: base, config
67 | , array
68 | , attoparsec
69 | , base
70 | , bytestring
71 | , cereal
72 | , containers
73 | , deepseq
74 | , ghc-prim
75 | , hw-balancedparens
76 | , hw-bits
77 | , hw-parser
78 | , hw-prim
79 | , hw-rankselect
80 | , hw-rankselect-base
81 | , lens
82 | , mmap
83 | , mtl
84 | , prettyprinter
85 | , resourcet
86 | , text
87 | , transformers
88 | , vector
89 | , word8
90 | exposed-modules: HaskellWorks.Data.Xml
91 | HaskellWorks.Data.Xml.Blank
92 | HaskellWorks.Data.Xml.CharLike
93 | HaskellWorks.Data.Xml.Decode
94 | HaskellWorks.Data.Xml.DecodeError
95 | HaskellWorks.Data.Xml.DecodeResult
96 | HaskellWorks.Data.Xml.Grammar
97 | HaskellWorks.Data.Xml.Index
98 | HaskellWorks.Data.Xml.Internal.BalancedParens
99 | HaskellWorks.Data.Xml.Internal.ByteString
100 | HaskellWorks.Data.Xml.Internal.Blank
101 | HaskellWorks.Data.Xml.Internal.List
102 | HaskellWorks.Data.Xml.Internal.Show
103 | HaskellWorks.Data.Xml.Internal.Tables
104 | HaskellWorks.Data.Xml.Internal.ToIbBp64
105 | HaskellWorks.Data.Xml.Internal.Words
106 | HaskellWorks.Data.Xml.Lens
107 | HaskellWorks.Data.Xml.Succinct
108 | HaskellWorks.Data.Xml.Succinct.Cursor
109 | HaskellWorks.Data.Xml.Succinct.Cursor.BalancedParens
110 | HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml
111 | HaskellWorks.Data.Xml.Succinct.Cursor.Create
112 | HaskellWorks.Data.Xml.Succinct.Cursor.InterestBits
113 | HaskellWorks.Data.Xml.Succinct.Cursor.Internal
114 | HaskellWorks.Data.Xml.Succinct.Cursor.Load
115 | HaskellWorks.Data.Xml.Succinct.Cursor.Types
116 | HaskellWorks.Data.Xml.Succinct.Cursor.MMap
117 | HaskellWorks.Data.Xml.Succinct.Cursor.Token
118 | HaskellWorks.Data.Xml.Succinct.Index
119 | HaskellWorks.Data.Xml.RawDecode
120 | HaskellWorks.Data.Xml.RawValue
121 | HaskellWorks.Data.Xml.Token.Tokenize
122 | HaskellWorks.Data.Xml.Token.Types
123 | HaskellWorks.Data.Xml.Token
124 | HaskellWorks.Data.Xml.Type
125 | HaskellWorks.Data.Xml.Value
126 | other-modules: Paths_hw_xml
127 | autogen-modules: Paths_hw_xml
128 | hs-source-dirs: src
129 | ghc-options: -Wall -O2 -msse4.2
130 |
131 | executable hw-xml
132 | import: base, config
133 | , attoparsec
134 | , bytestring
135 | , deepseq
136 | , generic-lens
137 | , hw-balancedparens
138 | , hw-bits
139 | , hw-prim
140 | , hw-rankselect
141 | , hw-xml
142 | , lens
143 | , mmap
144 | , mtl
145 | , optparse-applicative
146 | , resourcet
147 | , text
148 | , vector
149 | main-is: Main.hs
150 | other-modules: App.Commands
151 | App.Commands.Count
152 | App.Commands.CreateBlankedXml
153 | App.Commands.CreateBpIndex
154 | App.Commands.CreateIbIndex
155 | App.Commands.CreateIndex
156 | App.Commands.Demo
157 | App.Commands.Types
158 | App.Options
159 | App.XPath.Parser
160 | App.XPath.Types
161 | App.Show
162 | App.Naive
163 | hs-source-dirs: app
164 | ghc-options: -threaded -rtsopts -with-rtsopts=-N -O2 -Wall -msse4.2
165 |
166 | test-suite hw-xml-test
167 | import: base, config
168 | , attoparsec
169 | , base
170 | , bytestring
171 | , hedgehog
172 | , hspec
173 | , hw-balancedparens
174 | , hw-bits
175 | , hw-hspec-hedgehog
176 | , hw-prim
177 | , hw-xml
178 | , hw-rankselect
179 | , hw-rankselect-base
180 | , text
181 | , vector
182 | type: exitcode-stdio-1.0
183 | main-is: Spec.hs
184 | hs-source-dirs: test
185 | ghc-options: -threaded -rtsopts -with-rtsopts=-N
186 | default-language: Haskell2010
187 | build-tool-depends: hspec-discover:hspec-discover
188 | other-modules: HaskellWorks.Data.Xml.Internal.BlankSpec
189 | HaskellWorks.Data.Xml.RawValueSpec
190 | HaskellWorks.Data.Xml.Succinct.Cursor.BalancedParensSpec
191 | HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXmlSpec
192 | HaskellWorks.Data.Xml.Succinct.Cursor.InterestBitsSpec
193 | HaskellWorks.Data.Xml.Succinct.CursorSpec.Make
194 | HaskellWorks.Data.Xml.Succinct.CursorSpec
195 | HaskellWorks.Data.Xml.Token.TokenizeSpec
196 | HaskellWorks.Data.Xml.TypeSpec
197 |
198 | benchmark bench
199 | import: base, config
200 | , bytestring
201 | , criterion
202 | , hw-balancedparens
203 | , hw-bits
204 | , hw-prim
205 | , mmap
206 | , resourcet
207 | , vector
208 | type: exitcode-stdio-1.0
209 | main-is: Main.hs
210 | build-depends: hw-xml
211 | hs-source-dirs: bench
212 | ghc-options: -O2 -Wall -msse4.2
213 |
214 | test-suite doctest
215 | import: base, config
216 | , doctest
217 | , doctest-discover
218 | , hw-xml
219 | type: exitcode-stdio-1.0
220 | ghc-options: -threaded -rtsopts -with-rtsopts=-N
221 | main-is: DoctestDriver.hs
222 | HS-Source-Dirs: doctest
223 | build-tool-depends: doctest-discover:doctest-discover
224 |
--------------------------------------------------------------------------------