├── .github ├── FUNDING.yml └── workflows │ └── haskell.yml ├── .gitignore ├── .stylish-haskell.yaml ├── .vscode └── tasks.json ├── Criteria ├── LICENSE ├── README.md ├── Setup.hs ├── app ├── App │ ├── Commands.hs │ ├── Commands │ │ ├── Count.hs │ │ ├── CreateBlankedXml.hs │ │ ├── CreateBpIndex.hs │ │ ├── CreateIbIndex.hs │ │ ├── CreateIndex.hs │ │ ├── Demo.hs │ │ └── Types.hs │ ├── Main.hs │ ├── Naive.hs │ ├── Options.hs │ ├── Show.hs │ └── XPath │ │ ├── Parser.hs │ │ └── Types.hs └── Main.hs ├── bench └── Main.hs ├── cabal.project ├── data └── catalog.xml ├── doctest └── DoctestDriver.hs ├── hw-xml.cabal ├── project.sh ├── src └── HaskellWorks │ └── Data │ ├── Xml.hs │ └── Xml │ ├── Blank.hs │ ├── CharLike.hs │ ├── Decode.hs │ ├── DecodeError.hs │ ├── DecodeResult.hs │ ├── Grammar.hs │ ├── Index.hs │ ├── Internal │ ├── BalancedParens.hs │ ├── Blank.hs │ ├── ByteString.hs │ ├── List.hs │ ├── Show.hs │ ├── Tables.hs │ ├── ToIbBp64.hs │ └── Words.hs │ ├── Lens.hs │ ├── RawDecode.hs │ ├── RawValue.hs │ ├── Succinct.hs │ ├── Succinct │ ├── Cursor.hs │ ├── Cursor │ │ ├── BalancedParens.hs │ │ ├── BlankedXml.hs │ │ ├── Create.hs │ │ ├── InterestBits.hs │ │ ├── Internal.hs │ │ ├── Load.hs │ │ ├── MMap.hs │ │ ├── Token.hs │ │ └── Types.hs │ └── Index.hs │ ├── Token.hs │ ├── Token │ ├── Tokenize.hs │ └── Types.hs │ ├── Type.hs │ └── Value.hs └── test ├── HaskellWorks └── Data │ └── Xml │ ├── Internal │ └── BlankSpec.hs │ ├── RawValueSpec.hs │ ├── Succinct │ ├── Cursor │ │ ├── BalancedParensSpec.hs │ │ ├── BlankedXmlSpec.hs │ │ └── InterestBitsSpec.hs │ ├── CursorSpec.hs │ └── CursorSpec │ │ └── Make.hs │ ├── Token │ └── TokenizeSpec.hs │ └── TypeSpec.hs └── Spec.hs /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [newhoggy, araga] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/workflows/haskell.yml: -------------------------------------------------------------------------------- 1 | name: Binaries 2 | 3 | defaults: 4 | run: 5 | shell: bash 6 | 7 | on: 8 | push: 9 | branches: 10 | - main 11 | pull_request: 12 | 13 | jobs: 14 | build: 15 | runs-on: ${{ matrix.os }} 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | ghc: ["9.8.1", "9.6.3", "9.4.8", "9.2.8", "9.0.2", "8.10.7"] 21 | os: [ubuntu-latest, macOS-latest, windows-latest] 22 | 23 | env: 24 | # Modify this value to "invalidate" the cabal cache. 25 | CABAL_CACHE_VERSION: "2024-01-05" 26 | 27 | steps: 28 | - uses: actions/checkout@v2 29 | 30 | - uses: haskell-actions/setup@v2 31 | id: setup-haskell 32 | with: 33 | ghc-version: ${{ matrix.ghc }} 34 | cabal-version: '3.10.2.1' 35 | 36 | - name: Set some window specific things 37 | if: matrix.os == 'windows-latest' 38 | run: echo 'EXE_EXT=.exe' >> $GITHUB_ENV 39 | 40 | - name: Configure project 41 | run: | 42 | cabal configure --enable-tests --enable-benchmarks --write-ghc-environment-files=ghc8.4.4+ 43 | cabal build all --enable-tests --enable-benchmarks --dry-run 44 | 45 | - name: Cabal cache over S3 46 | uses: action-works/cabal-cache-s3@v1 47 | env: 48 | AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} 49 | AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 50 | with: 51 | region: us-west-2 52 | dist-dir: dist-newstyle 53 | store-path: ${{ steps.setup-haskell.outputs.cabal-store }} 54 | threads: 16 55 | archive-uri: ${{ secrets.BINARY_CACHE_URI }}/${{ env.CABAL_CACHE_VERSION }}/${{ runner.os }}/${{ matrix.cabal }}/${{ matrix.ghc }} 56 | skip: "${{ secrets.BINARY_CACHE_URI == '' }}" 57 | 58 | - name: Cabal cache over HTTPS 59 | uses: action-works/cabal-cache-s3@v1 60 | with: 61 | dist-dir: dist-newstyle 62 | store-path: ${{ steps.setup-haskell.outputs.cabal-store }} 63 | threads: 16 64 | archive-uri: https://cache.haskellworks.io/${{ env.CABAL_CACHE_VERSION }}/${{ runner.os }}/${{ matrix.cabal }}/${{ matrix.ghc }} 65 | skip: "${{ secrets.BINARY_CACHE_URI != '' }}" 66 | 67 | - name: Build 68 | run: cabal build all --enable-tests --enable-benchmarks 69 | 70 | - name: Test 71 | run: cabal test all --enable-tests --enable-benchmarks 72 | 73 | check: 74 | needs: build 75 | runs-on: ubuntu-latest 76 | outputs: 77 | tag: ${{ steps.tag.outputs.tag }} 78 | 79 | steps: 80 | - uses: actions/checkout@v2 81 | 82 | - name: Check if cabal project is sane 83 | run: | 84 | PROJECT_DIR=$PWD 85 | mkdir -p $PROJECT_DIR/build/sdist 86 | for i in $(git ls-files | grep '\.cabal'); do 87 | cd $PROJECT_DIR && cd `dirname $i` 88 | cabal check 89 | done 90 | 91 | - name: Tag new version 92 | id: tag 93 | if: ${{ github.ref == 'refs/heads/main' }} 94 | env: 95 | server: http://hackage.haskell.org 96 | username: ${{ secrets.HACKAGE_USER }} 97 | password: ${{ secrets.HACKAGE_PASS }} 98 | run: | 99 | package_version="$(cat *.cabal | grep '^version:' | cut -d : -f 2 | xargs)" 100 | 101 | echo "Package version is v$package_version" 102 | 103 | git fetch --unshallow origin 104 | 105 | if git tag "v$package_version"; then 106 | echo "Tagging with new version "v$package_version"" 107 | 108 | if git push origin "v$package_version"; then 109 | echo "Tagged with new version "v$package_version"" 110 | 111 | echo "::set-output name=tag::v$package_version" 112 | fi 113 | fi 114 | 115 | release: 116 | needs: [build, check] 117 | runs-on: ubuntu-latest 118 | if: ${{ needs.check.outputs.tag != '' }} 119 | outputs: 120 | upload_url: ${{ steps.create_release.outputs.upload_url }} 121 | 122 | steps: 123 | - uses: actions/checkout@v2 124 | 125 | - name: Create source distribution 126 | run: | 127 | PROJECT_DIR=$PWD 128 | mkdir -p $PROJECT_DIR/build/sdist 129 | for i in $(git ls-files | grep '\.cabal'); do 130 | cd $PROJECT_DIR && cd `dirname $i` 131 | cabal v2-sdist -o $PROJECT_DIR/build/sdist 132 | done; 133 | 134 | - name: Publish to hackage 135 | env: 136 | server: http://hackage.haskell.org 137 | username: ${{ secrets.HACKAGE_USER }} 138 | password: ${{ secrets.HACKAGE_PASS }} 139 | candidate: false 140 | run: | 141 | package_version="$(cat *.cabal | grep '^version:' | cut -d : -f 2 | xargs)" 142 | 143 | for PACKAGE_TARBALL in $(find ./build/sdist/ -name "*.tar.gz"); do 144 | PACKAGE_NAME=$(basename ${PACKAGE_TARBALL%.*.*}) 145 | 146 | if ${{ env.candidate }}; then 147 | TARGET_URL="${{ env.server }}/packages/candidates"; 148 | DOCS_URL="${{ env.server }}/package/$PACKAGE_NAME/candidate/docs" 149 | else 150 | TARGET_URL="${{ env.server }}/packages/upload"; 151 | DOCS_URL="${{ env.server }}/package/$PACKAGE_NAME/docs" 152 | fi 153 | 154 | HACKAGE_STATUS=$(curl --silent --head -w %{http_code} -XGET --anyauth --user "${{ env.username }}:${{ env.password }}" ${{ env.server }}/package/$PACKAGE_NAME -o /dev/null) 155 | 156 | if [ "$HACKAGE_STATUS" = "404" ]; then 157 | echo "Uploading $PACKAGE_NAME to $TARGET_URL" 158 | 159 | curl -X POST -f --user "${{ env.username }}:${{ env.password }}" $TARGET_URL -F "package=@$PACKAGE_TARBALL" 160 | echo "Uploaded $PACKAGE_NAME" 161 | else 162 | echo "Package $PACKAGE_NAME" already exists on Hackage. 163 | fi 164 | done 165 | 166 | - name: Create Release 167 | id: create_release 168 | uses: actions/create-release@v1 169 | env: 170 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token 171 | with: 172 | tag_name: ${{ github.ref }} 173 | release_name: Release ${{ github.ref }} 174 | body: Undocumented 175 | draft: true 176 | prerelease: false 177 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | dist-newstyle 3 | cabal-dev 4 | data/bench/ 5 | *.o 6 | *.hi 7 | *.chi 8 | *.chs.h 9 | *.dyn_o 10 | *.dyn_hi 11 | .hpc 12 | .hsenv 13 | .cabal-sandbox/ 14 | cabal.sandbox.config 15 | *.prof 16 | *.aux 17 | *.hp 18 | .ghc.environment* 19 | 20 | /*.submodules 21 | /.stack-work/ 22 | /result 23 | /deps 24 | /*.xml 25 | /*.xml.ib 26 | /*.xml.bp 27 | /snapshot.yaml 28 | /stack-ci.yaml 29 | /*.gz 30 | /*.idx 31 | 32 | -------------------------------------------------------------------------------- /.stylish-haskell.yaml: -------------------------------------------------------------------------------- 1 | # stylish-haskell configuration file 2 | # ================================== 3 | 4 | # The stylish-haskell tool is mainly configured by specifying steps. These steps 5 | # are a list, so they have an order, and one specific step may appear more than 6 | # once (if needed). Each file is processed by these steps in the given order. 7 | steps: 8 | # Convert some ASCII sequences to their Unicode equivalents. This is disabled 9 | # by default. 10 | # - unicode_syntax: 11 | # # In order to make this work, we also need to insert the UnicodeSyntax 12 | # # language pragma. If this flag is set to true, we insert it when it's 13 | # # not already present. You may want to disable it if you configure 14 | # # language extensions using some other method than pragmas. Default: 15 | # # true. 16 | # add_language_pragma: true 17 | 18 | # Align the right hand side of some elements. This is quite conservative 19 | # and only applies to statements where each element occupies a single 20 | # line. 21 | - simple_align: 22 | cases: true 23 | top_level_patterns: true 24 | records: true 25 | 26 | # Import cleanup 27 | - imports: 28 | # There are different ways we can align names and lists. 29 | # 30 | # - global: Align the import names and import list throughout the entire 31 | # file. 32 | # 33 | # - file: Like global, but don't add padding when there are no qualified 34 | # imports in the file. 35 | # 36 | # - group: Only align the imports per group (a group is formed by adjacent 37 | # import lines). 38 | # 39 | # - none: Do not perform any alignment. 40 | # 41 | # Default: global. 42 | align: group 43 | 44 | # Folowing options affect only import list alignment. 45 | # 46 | # List align has following options: 47 | # 48 | # - after_alias: Import list is aligned with end of import including 49 | # 'as' and 'hiding' keywords. 50 | # 51 | # > import qualified Data.List as List (concat, foldl, foldr, head, 52 | # > init, last, length) 53 | # 54 | # - with_alias: Import list is aligned with start of alias or hiding. 55 | # 56 | # > import qualified Data.List as List (concat, foldl, foldr, head, 57 | # > init, last, length) 58 | # 59 | # - new_line: Import list starts always on new line. 60 | # 61 | # > import qualified Data.List as List 62 | # > (concat, foldl, foldr, head, init, last, length) 63 | # 64 | # Default: after_alias 65 | list_align: after_alias 66 | 67 | # Long list align style takes effect when import is too long. This is 68 | # determined by 'columns' setting. 69 | # 70 | # - inline: This option will put as much specs on same line as possible. 71 | # 72 | # - new_line: Import list will start on new line. 73 | # 74 | # - new_line_multiline: Import list will start on new line when it's 75 | # short enough to fit to single line. Otherwise it'll be multiline. 76 | # 77 | # - multiline: One line per import list entry. 78 | # Type with contructor list acts like single import. 79 | # 80 | # > import qualified Data.Map as M 81 | # > ( empty 82 | # > , singleton 83 | # > , ... 84 | # > , delete 85 | # > ) 86 | # 87 | # Default: inline 88 | long_list_align: inline 89 | 90 | # Align empty list (importing instances) 91 | # 92 | # Empty list align has following options 93 | # 94 | # - inherit: inherit list_align setting 95 | # 96 | # - right_after: () is right after the module name: 97 | # 98 | # > import Vector.Instances () 99 | # 100 | # Default: inherit 101 | empty_list_align: inherit 102 | 103 | # List padding determines indentation of import list on lines after import. 104 | # This option affects 'long_list_align'. 105 | # 106 | # - : constant value 107 | # 108 | # - module_name: align under start of module name. 109 | # Useful for 'file' and 'group' align settings. 110 | list_padding: 4 111 | 112 | # Separate lists option affects formating of import list for type 113 | # or class. The only difference is single space between type and list 114 | # of constructors, selectors and class functions. 115 | # 116 | # - true: There is single space between Foldable type and list of it's 117 | # functions. 118 | # 119 | # > import Data.Foldable (Foldable (fold, foldl, foldMap)) 120 | # 121 | # - false: There is no space between Foldable type and list of it's 122 | # functions. 123 | # 124 | # > import Data.Foldable (Foldable(fold, foldl, foldMap)) 125 | # 126 | # Default: true 127 | separate_lists: true 128 | 129 | # Language pragmas 130 | - language_pragmas: 131 | # We can generate different styles of language pragma lists. 132 | # 133 | # - vertical: Vertical-spaced language pragmas, one per line. 134 | # 135 | # - compact: A more compact style. 136 | # 137 | # - compact_line: Similar to compact, but wrap each line with 138 | # `{-#LANGUAGE #-}'. 139 | # 140 | # Default: vertical. 141 | style: vertical 142 | 143 | # Align affects alignment of closing pragma brackets. 144 | # 145 | # - true: Brackets are aligned in same collumn. 146 | # 147 | # - false: Brackets are not aligned together. There is only one space 148 | # between actual import and closing bracket. 149 | # 150 | # Default: true 151 | align: true 152 | 153 | # stylish-haskell can detect redundancy of some language pragmas. If this 154 | # is set to true, it will remove those redundant pragmas. Default: true. 155 | remove_redundant: true 156 | 157 | # Replace tabs by spaces. This is disabled by default. 158 | # - tabs: 159 | # # Number of spaces to use for each tab. Default: 8, as specified by the 160 | # # Haskell report. 161 | # spaces: 8 162 | 163 | # Remove trailing whitespace 164 | - trailing_whitespace: {} 165 | 166 | # A common setting is the number of columns (parts of) code will be wrapped 167 | # to. Different steps take this into account. Default: 80. 168 | columns: 800 169 | 170 | # By default, line endings are converted according to the OS. You can override 171 | # preferred format here. 172 | # 173 | # - native: Native newline format. CRLF on Windows, LF on other OSes. 174 | # 175 | # - lf: Convert to LF ("\n"). 176 | # 177 | # - crlf: Convert to CRLF ("\r\n"). 178 | # 179 | # Default: native. 180 | newline: native 181 | 182 | # Sometimes, language extensions are specified in a cabal file or from the 183 | # command line instead of using language pragmas in the file. stylish-haskell 184 | # needs to be aware of these, so it can parse the file correctly. 185 | # 186 | # No language extensions are enabled by default. 187 | # language_extensions: 188 | # - TemplateHaskell 189 | # - QuasiQuotes 190 | -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.0", 3 | "tasks": [ 4 | { 5 | "label": "Build", 6 | "type": "shell", 7 | "command": "bash", 8 | "args": ["-lc", "cabal build all --enable-tests && echo 'Done'"], 9 | "group": { 10 | "kind": "build", 11 | "isDefault": true 12 | }, 13 | "problemMatcher": { 14 | "owner": "haskell", 15 | "fileLocation": "relative", 16 | "pattern": [ 17 | { 18 | "regexp": "^(.+?):(\\d+):(\\d+):\\s+(error|warning|info):.*$", 19 | "file": 1, "line": 2, "column": 3, "severity": 4 20 | }, 21 | { 22 | "regexp": "\\s*(.*)$", 23 | "message": 1 24 | } 25 | ] 26 | }, 27 | "presentation": { 28 | "echo": false, 29 | "reveal": "always", 30 | "focus": false, 31 | "panel": "shared", 32 | "showReuseMessage": false, 33 | "clear": true 34 | } 35 | }, 36 | { 37 | "label": "Test", 38 | "type": "shell", 39 | "command": "bash", 40 | "args": ["-lc", "cabal test all --enable-tests && echo 'Done'"], 41 | "group": { 42 | "kind": "test", 43 | "isDefault": true 44 | }, 45 | "problemMatcher": { 46 | "owner": "haskell", 47 | "fileLocation": "relative", 48 | "pattern": [ 49 | { 50 | "regexp": "^(.+?):(\\d+):(\\d+):.*$", 51 | "file": 1, "line": 2, "column": 3, "severity": 4 52 | }, 53 | { 54 | "regexp": "\\s*(\\d\\)\\s)?(.*)$", 55 | "message": 2 56 | } 57 | ] 58 | }, 59 | "presentation": { 60 | "echo": false, 61 | "reveal": "always", 62 | "focus": false, 63 | "panel": "shared", 64 | "showReuseMessage": false, 65 | "clear": true 66 | } 67 | } 68 | ] 69 | } 70 | -------------------------------------------------------------------------------- /Criteria: -------------------------------------------------------------------------------- 1 | # Criteria 2 | 3 | ## Data ingestion 4 | * with various options 5 | * over 1G chunks of data (by unzipped) 6 | * and measure ingestion time (in seconds) 7 | 8 | ## API support 9 | * Which languages? 10 | * Is the API convenient? 11 | 12 | ## Resiliance against bad/slow queries 13 | * What is the impact of a bad query on the database 14 | * How easy is it to recover from a bad query 15 | * What happens on node failure, etc. 16 | 17 | ## Query Capabilities 18 | * Is it possible to filter by time range? 19 | * Is it possible to have dynamic querying? 20 | 21 | ## Query performance 22 | * Performance of querying all data for specific attacks 23 | * Performance of metadata for specific attacks 24 | * Performance of time range query 25 | * Performance of dynamic filter query 26 | * Performance of aggressive (duration filter) query for fast-forward 27 | 28 | ## Cost 29 | * How does it cost to handle today's data? 30 | * How does it cost to scale? 31 | 32 | 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright John Ky, Alexey Raga (c) 2016-2017 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Author name here nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hw-xml 2 | [![CircleCI](https://circleci.com/gh/haskell-works/hw-xml.svg?style=svg)](https://circleci.com/gh/haskell-works/hw-xml) 3 | 4 | `hw-xml` is a high performance XML parsing library. It uses 5 | succinct data-structures to allow traversal of large XML 6 | strings with minimal memory overhead. 7 | 8 | For an example, see [app/Main.hs](../master/app/Main.hs) 9 | 10 | # Notes 11 | * [Semi-Indexing Semi-Structured Data in Tiny Space](http://www.di.unipi.it/~ottavian/files/semi_index_cikm.pdf) 12 | * [Space-Efficient, High-Performance Rank & Select Structures on Uncompressed Bit Sequences](https://www.cs.cmu.edu/~dga/papers/zhou-sea2013.pdf) 13 | -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /app/App/Commands.hs: -------------------------------------------------------------------------------- 1 | module App.Commands where 2 | 3 | import App.Commands.Count 4 | import App.Commands.CreateBlankedXml 5 | import App.Commands.CreateBpIndex 6 | import App.Commands.CreateIbIndex 7 | import App.Commands.CreateIndex 8 | import App.Commands.Demo 9 | import Options.Applicative 10 | 11 | commands :: Parser (IO ()) 12 | commands = commandsGeneral 13 | 14 | commandsGeneral :: Parser (IO ()) 15 | commandsGeneral = subparser $ mempty 16 | <> commandGroup "Commands:" 17 | <> cmdCount 18 | <> cmdCreateIndex 19 | <> cmdCreateBlankedXml 20 | <> cmdCreateIbIndex 21 | <> cmdCreateBpIndex 22 | <> cmdDemo 23 | -------------------------------------------------------------------------------- /app/App/Commands/Count.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE DataKinds #-} 3 | {-# LANGUAGE DeriveGeneric #-} 4 | {-# LANGUAGE FlexibleInstances #-} 5 | {-# LANGUAGE OverloadedStrings #-} 6 | {-# LANGUAGE ScopedTypeVariables #-} 7 | {-# LANGUAGE TypeApplications #-} 8 | {-# LANGUAGE TypeSynonymInstances #-} 9 | 10 | module App.Commands.Count 11 | ( cmdCount 12 | ) where 13 | 14 | import App.Options 15 | import Control.Lens 16 | import Control.Monad 17 | import Data.Generics.Product.Any 18 | import Data.Text (Text) 19 | import GHC.Generics 20 | import HaskellWorks.Data.TreeCursor 21 | import HaskellWorks.Data.Xml.DecodeResult 22 | import HaskellWorks.Data.Xml.RawDecode 23 | import HaskellWorks.Data.Xml.RawValue 24 | import HaskellWorks.Data.Xml.Succinct.Cursor.Load 25 | import HaskellWorks.Data.Xml.Succinct.Cursor.MMap 26 | import HaskellWorks.Data.Xml.Succinct.Index 27 | import HaskellWorks.Data.Xml.Value 28 | import Options.Applicative hiding (columns) 29 | 30 | import qualified App.Commands.Types as Z 31 | import qualified App.Naive as NAIVE 32 | import qualified App.XPath.Parser as XPP 33 | import qualified System.Exit as IO 34 | import qualified System.IO as IO 35 | 36 | -- | Document model. This does not need to be able to completely represent all 37 | -- the data in the XML document. In fact, having a smaller model may improve 38 | -- Count performance. 39 | data Plant = Plant 40 | { common :: String 41 | , price :: String 42 | } deriving (Eq, Show, Generic) 43 | 44 | newtype Catalog = Catalog 45 | { plants :: [Plant] 46 | } deriving (Eq, Show, Generic) 47 | 48 | tags :: Value -> Text -> [Value] 49 | tags xml@(XmlElement n _ _) elemName = if n == elemName 50 | then [xml] 51 | else [] 52 | tags _ _ = [] 53 | 54 | kids :: Value -> [Value] 55 | kids (XmlElement _ _ cs) = cs 56 | kids _ = [] 57 | 58 | countAtPath :: [Text] -> Value -> DecodeResult Int 59 | countAtPath [] _ = return 0 60 | countAtPath [t] xml = return (length (tags xml t)) 61 | countAtPath (t:ts) xml = do 62 | counts <- forM (tags xml t >>= kids) $ countAtPath ts 63 | return (sum counts) 64 | 65 | runCount :: Z.CountOptions -> IO () 66 | runCount opt = do 67 | let input = opt ^. the @"input" 68 | let xpath = opt ^. the @"xpath" 69 | let method = opt ^. the @"method" 70 | 71 | IO.putStrLn $ "XPath: " <> show xpath 72 | 73 | cursorResult <- case method of 74 | "mmap" -> Right <$> mmapFastCursor input 75 | "memory" -> Right <$> loadFastCursor input 76 | "naive" -> Right <$> NAIVE.loadFastCursor input 77 | unknown -> return (Left ("Unknown method " <> show unknown)) 78 | 79 | case cursorResult of 80 | Right !cursor -> do 81 | -- Skip the XML declaration to get to the root element cursor 82 | case nextSibling cursor of 83 | Just rootCursor -> do 84 | -- Get the root raw XML value at the root element cursor 85 | let rootValue = rawValueAt (xmlIndexAt rootCursor) 86 | -- Show what we have at this cursor 87 | putStrLn $ "Raw value: " <> take 100 (show rootValue) 88 | -- Decode the raw XML value 89 | case countAtPath (xpath ^. the @"path") (rawDecode rootValue) of 90 | DecodeOk count -> putStrLn $ "Count: " <> show count 91 | DecodeFailed msg -> putStrLn $ "Error: " <> show msg 92 | Nothing -> do 93 | putStrLn "Could not read XML" 94 | return () 95 | Left msg -> do 96 | IO.putStrLn $ "Error: " <> msg 97 | IO.exitFailure 98 | 99 | optsCount :: Parser Z.CountOptions 100 | optsCount = Z.CountOptions 101 | <$> strOption 102 | ( long "input" 103 | <> help "Input file" 104 | <> metavar "FILE" 105 | ) 106 | <*> optionParser XPP.path 107 | ( long "xpath" 108 | <> help "XPath expression" 109 | <> metavar "XPATH" 110 | ) 111 | <*> textOption 112 | ( long "method" 113 | <> help "Read method" 114 | <> metavar "METHOD" 115 | ) 116 | 117 | cmdCount :: Mod CommandFields (IO ()) 118 | cmdCount = command "count" $ flip info idm $ runCount <$> optsCount 119 | -------------------------------------------------------------------------------- /app/App/Commands/CreateBlankedXml.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DataKinds #-} 2 | {-# LANGUAGE DeriveGeneric #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | {-# LANGUAGE OverloadedStrings #-} 5 | {-# LANGUAGE ScopedTypeVariables #-} 6 | {-# LANGUAGE TypeApplications #-} 7 | {-# LANGUAGE TypeSynonymInstances #-} 8 | 9 | module App.Commands.CreateBlankedXml 10 | ( cmdCreateBlankedXml 11 | ) where 12 | 13 | import Control.Lens 14 | import Data.Generics.Product.Any 15 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml 16 | import Options.Applicative hiding (columns) 17 | 18 | import qualified App.Commands.Types as Z 19 | import qualified Data.ByteString.Lazy as LBS 20 | 21 | runCreateBlankedXml :: Z.CreateBlankedXmlOptions -> IO () 22 | runCreateBlankedXml opt = do 23 | let input = opt ^. the @"input" 24 | let output = opt ^. the @"output" 25 | 26 | lbs <- LBS.readFile input 27 | let blankedXml = lbsToBlankedXml lbs 28 | LBS.writeFile output (LBS.fromChunks (blankedXml ^. the @1)) 29 | 30 | return () 31 | 32 | optsCreateBlankedXml :: Parser Z.CreateBlankedXmlOptions 33 | optsCreateBlankedXml = Z.CreateBlankedXmlOptions 34 | <$> strOption 35 | ( long "input" 36 | <> help "Input file" 37 | <> metavar "FILE" 38 | ) 39 | <*> strOption 40 | ( long "output" 41 | <> help "Blanked XML output" 42 | <> metavar "FILE" 43 | ) 44 | 45 | cmdCreateBlankedXml :: Mod CommandFields (IO ()) 46 | cmdCreateBlankedXml = command "create-blanked-xml" $ flip info idm $ runCreateBlankedXml <$> optsCreateBlankedXml 47 | -------------------------------------------------------------------------------- /app/App/Commands/CreateBpIndex.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DataKinds #-} 2 | {-# LANGUAGE DeriveGeneric #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | {-# LANGUAGE OverloadedStrings #-} 5 | {-# LANGUAGE ScopedTypeVariables #-} 6 | {-# LANGUAGE TypeApplications #-} 7 | {-# LANGUAGE TypeSynonymInstances #-} 8 | 9 | module App.Commands.CreateBpIndex 10 | ( cmdCreateBpIndex 11 | ) where 12 | 13 | import Control.Lens 14 | import Data.Generics.Product.Any 15 | import HaskellWorks.Data.Xml.Internal.ToIbBp64 16 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml 17 | import Options.Applicative hiding (columns) 18 | 19 | import qualified App.Commands.Types as Z 20 | import qualified Data.ByteString.Lazy as LBS 21 | 22 | runCreateBpIndex :: Z.CreateBpIndexOptions -> IO () 23 | runCreateBpIndex opt = do 24 | let input = opt ^. the @"input" 25 | let output = opt ^. the @"output" 26 | 27 | lbs <- LBS.readFile input 28 | let blankedXml = lbsToBlankedXml lbs 29 | let ib = toBalancedParens64 blankedXml 30 | LBS.writeFile output (LBS.fromChunks ib) 31 | 32 | return () 33 | 34 | optsCreateBpIndex :: Parser Z.CreateBpIndexOptions 35 | optsCreateBpIndex = Z.CreateBpIndexOptions 36 | <$> strOption 37 | ( long "input" 38 | <> help "Input file" 39 | <> metavar "FILE" 40 | ) 41 | <*> strOption 42 | ( long "output" 43 | <> help "Balanced parens output" 44 | <> metavar "FILE" 45 | ) 46 | 47 | cmdCreateBpIndex :: Mod CommandFields (IO ()) 48 | cmdCreateBpIndex = command "create-bp-index" $ flip info idm $ runCreateBpIndex <$> optsCreateBpIndex 49 | -------------------------------------------------------------------------------- /app/App/Commands/CreateIbIndex.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DataKinds #-} 2 | {-# LANGUAGE DeriveGeneric #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | {-# LANGUAGE OverloadedStrings #-} 5 | {-# LANGUAGE ScopedTypeVariables #-} 6 | {-# LANGUAGE TypeApplications #-} 7 | {-# LANGUAGE TypeSynonymInstances #-} 8 | 9 | module App.Commands.CreateIbIndex 10 | ( cmdCreateIbIndex 11 | ) where 12 | 13 | import Control.Lens 14 | import Data.Generics.Product.Any 15 | import HaskellWorks.Data.Xml.Internal.ToIbBp64 16 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml 17 | import Options.Applicative hiding (columns) 18 | 19 | import qualified App.Commands.Types as Z 20 | import qualified Data.ByteString.Lazy as LBS 21 | 22 | runCreateIbIndex :: Z.CreateIbIndexOptions -> IO () 23 | runCreateIbIndex opt = do 24 | let input = opt ^. the @"input" 25 | let output = opt ^. the @"output" 26 | 27 | lbs <- LBS.readFile input 28 | let blankedXml = lbsToBlankedXml lbs 29 | let ib = toInterestBits64 blankedXml 30 | LBS.writeFile output (LBS.fromChunks ib) 31 | 32 | optsCreateIbIndex :: Parser Z.CreateIbIndexOptions 33 | optsCreateIbIndex = Z.CreateIbIndexOptions 34 | <$> strOption 35 | ( long "input" 36 | <> help "Input file" 37 | <> metavar "FILE" 38 | ) 39 | <*> strOption 40 | ( long "output" 41 | <> help "Interest Bits output" 42 | <> metavar "FILE" 43 | ) 44 | 45 | cmdCreateIbIndex :: Mod CommandFields (IO ()) 46 | cmdCreateIbIndex = command "create-ib-index" $ flip info idm $ runCreateIbIndex <$> optsCreateIbIndex 47 | -------------------------------------------------------------------------------- /app/App/Commands/CreateIndex.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DataKinds #-} 2 | {-# LANGUAGE DeriveGeneric #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | {-# LANGUAGE OverloadedStrings #-} 5 | {-# LANGUAGE ScopedTypeVariables #-} 6 | {-# LANGUAGE TypeApplications #-} 7 | {-# LANGUAGE TypeSynonymInstances #-} 8 | 9 | module App.Commands.CreateIndex 10 | ( cmdCreateIndex 11 | ) where 12 | 13 | import App.Options 14 | import Control.Lens 15 | import Control.Monad 16 | import Data.Generics.Product.Any 17 | import HaskellWorks.Data.Xml.Internal.ToIbBp64 18 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml 19 | import HaskellWorks.Data.Xml.Succinct.Cursor.MMap 20 | import Options.Applicative hiding (columns) 21 | 22 | import qualified App.Commands.Types as Z 23 | import qualified Data.ByteString as BS 24 | import qualified Data.ByteString.Lazy as LBS 25 | import qualified Data.Text.IO as TIO 26 | import qualified HaskellWorks.Data.ByteString.Lazy as LBS 27 | import qualified Options.Applicative as OA 28 | import qualified System.Exit as IO 29 | import qualified System.IO as IO 30 | 31 | runCreateIndex :: Z.CreateIndexOptions -> IO () 32 | runCreateIndex opt = do 33 | let input = opt ^. the @"input" 34 | let ibOutput = opt ^. the @"ibOutput" 35 | let bpOutput = opt ^. the @"bpOutput" 36 | let method = opt ^. the @"method" 37 | 38 | case method of 39 | "memory" -> do 40 | cursor <- mmapSlowCursor input 41 | 42 | LBS.writeFile ibOutput (LBS.toLazyByteString (cursor ^. the @"interests" . the @1)) 43 | LBS.writeFile bpOutput (LBS.toLazyByteString (cursor ^. the @"balancedParens" . the @1)) 44 | "stream" -> do 45 | lbs <- LBS.readFile input 46 | let blankedXml = lbsToBlankedXml lbs 47 | let ibBp = toIbBp64 blankedXml 48 | 49 | hIbOutput <- IO.openFile ibOutput IO.WriteMode 50 | hBpOutput <- IO.openFile bpOutput IO.WriteMode 51 | 52 | forM_ ibBp $ \(ib, bp) -> do 53 | BS.hPut hIbOutput ib 54 | BS.hPut hBpOutput bp 55 | 56 | IO.hClose hIbOutput 57 | IO.hClose hBpOutput 58 | 59 | return () 60 | unknown -> do 61 | TIO.hPutStrLn IO.stderr $ "Unsupported method: " <> unknown 62 | IO.exitFailure 63 | 64 | optsCreateIndex :: Parser Z.CreateIndexOptions 65 | optsCreateIndex = Z.CreateIndexOptions 66 | <$> strOption 67 | ( long "input" 68 | <> help "Input file" 69 | <> metavar "FILE" 70 | ) 71 | <*> strOption 72 | ( long "ib-output" 73 | <> help "Interest Bits output" 74 | <> metavar "FILE" 75 | ) 76 | <*> strOption 77 | ( long "bp-output" 78 | <> help "Balanced Parens output" 79 | <> metavar "FILE" 80 | ) 81 | <*> textOption 82 | ( long "method" 83 | <> help "Method" 84 | <> metavar "METHOD" 85 | <> OA.value "memory" 86 | ) 87 | 88 | cmdCreateIndex :: Mod CommandFields (IO ()) 89 | cmdCreateIndex = command "create-index" $ flip info idm $ runCreateIndex <$> optsCreateIndex 90 | -------------------------------------------------------------------------------- /app/App/Commands/Demo.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE DataKinds #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | {-# LANGUAGE OverloadedStrings #-} 5 | {-# LANGUAGE ScopedTypeVariables #-} 6 | {-# LANGUAGE TypeApplications #-} 7 | {-# LANGUAGE TypeSynonymInstances #-} 8 | 9 | module App.Commands.Demo 10 | ( cmdDemo 11 | ) where 12 | 13 | import Data.Foldable 14 | import Data.Maybe 15 | import Data.Text (Text) 16 | import HaskellWorks.Data.TreeCursor 17 | import HaskellWorks.Data.Xml.Decode 18 | import HaskellWorks.Data.Xml.DecodeResult 19 | import HaskellWorks.Data.Xml.RawDecode 20 | import HaskellWorks.Data.Xml.RawValue 21 | import HaskellWorks.Data.Xml.Succinct.Cursor.Load 22 | import HaskellWorks.Data.Xml.Succinct.Index 23 | import HaskellWorks.Data.Xml.Value 24 | import Options.Applicative hiding (columns) 25 | 26 | import qualified App.Commands.Types as Z 27 | 28 | -- | Parse the text of an XML node. 29 | class ParseText a where 30 | parseText :: Value -> DecodeResult a 31 | 32 | instance ParseText Text where 33 | parseText (XmlText text) = DecodeOk text 34 | parseText (XmlCData text) = DecodeOk text 35 | parseText (XmlElement _ _ cs) = DecodeOk $ mconcat $ mconcat $ toList . parseText <$> cs 36 | parseText _ = DecodeOk "" 37 | 38 | -- | Convert a decode result to a maybe 39 | decodeResultToMaybe :: DecodeResult a -> Maybe a 40 | decodeResultToMaybe (DecodeOk a) = Just a 41 | decodeResultToMaybe _ = Nothing 42 | 43 | -- | Document model. This does not need to be able to completely represent all 44 | -- the data in the XML document. In fact, having a smaller model may improve 45 | -- query performance. 46 | data Plant = Plant 47 | { common :: Text 48 | , price :: Text 49 | } deriving (Eq, Show) 50 | 51 | newtype Catalog = Catalog 52 | { plants :: [Plant] 53 | } deriving (Eq, Show) 54 | 55 | -- | Decode plant element 56 | decodePlant :: Value -> DecodeResult Plant 57 | decodePlant xml = do 58 | aCommon <- xml /> "common" >>= parseText 59 | aPrice <- xml /> "price" >>= parseText 60 | return $ Plant aCommon aPrice 61 | 62 | -- | Decode catalog element 63 | decodeCatalog :: Value -> DecodeResult Catalog 64 | decodeCatalog xml = do 65 | aPlantXmls <- xml />> "plant" 66 | let aPlants = catMaybes (decodeResultToMaybe . decodePlant <$> aPlantXmls) 67 | return $ Catalog aPlants 68 | 69 | runDemo :: Z.DemoOptions -> IO () 70 | runDemo _ = do 71 | -- Read XML into memory as a query-optimised cursor 72 | !cursor <- loadFastCursor "data/catalog.xml" 73 | -- Skip the XML declaration to get to the root element cursor 74 | case nextSibling cursor of 75 | Just rootCursor -> do 76 | -- Get the root raw XML value at the root element cursor 77 | let rootValue = rawValueAt (xmlIndexAt rootCursor) 78 | -- Show what we have at this cursor 79 | putStrLn $ "Raw value: " <> take 100 (show rootValue) 80 | -- Decode the raw XML value 81 | case decodeCatalog (rawDecode rootValue) of 82 | DecodeOk catalog -> putStrLn $ "Catalog: " <> show catalog 83 | DecodeFailed msg -> putStrLn $ "Error: " <> show msg 84 | Nothing -> do 85 | putStrLn "Could not read XML" 86 | return () 87 | 88 | optsDemo :: Parser Z.DemoOptions 89 | optsDemo = pure Z.DemoOptions 90 | 91 | cmdDemo :: Mod CommandFields (IO ()) 92 | cmdDemo = command "demo" $ flip info idm $ runDemo <$> optsDemo 93 | -------------------------------------------------------------------------------- /app/App/Commands/Types.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | {-# LANGUAGE DuplicateRecordFields #-} 3 | 4 | module App.Commands.Types 5 | ( CountOptions(..) 6 | , CreateBlankedXmlOptions(..) 7 | , CreateBpIndexOptions(..) 8 | , CreateIbIndexOptions(..) 9 | , CreateIndexOptions(..) 10 | , DemoOptions(..) 11 | ) where 12 | 13 | import App.XPath.Types (XPath) 14 | import Data.Text (Text) 15 | import GHC.Generics 16 | 17 | data DemoOptions = DemoOptions deriving (Eq, Show, Generic) 18 | 19 | data CountOptions = CountOptions 20 | { input :: FilePath 21 | , xpath :: XPath 22 | , method :: Text 23 | } deriving (Eq, Show, Generic) 24 | 25 | data CreateIndexOptions = CreateIndexOptions 26 | { input :: FilePath 27 | , ibOutput :: FilePath 28 | , bpOutput :: FilePath 29 | , method :: Text 30 | } deriving (Eq, Show, Generic) 31 | 32 | data CreateBlankedXmlOptions = CreateBlankedXmlOptions 33 | { input :: FilePath 34 | , output :: FilePath 35 | } deriving (Eq, Show, Generic) 36 | 37 | data CreateIbIndexOptions = CreateIbIndexOptions 38 | { input :: FilePath 39 | , output :: FilePath 40 | } deriving (Eq, Show, Generic) 41 | 42 | data CreateBpIndexOptions = CreateBpIndexOptions 43 | { input :: FilePath 44 | , output :: FilePath 45 | } deriving (Eq, Show, Generic) 46 | -------------------------------------------------------------------------------- /app/App/Main.hs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/haskell-works/hw-xml/987e92490b2ca428fa382beb0c284ee712147f27/app/App/Main.hs -------------------------------------------------------------------------------- /app/App/Naive.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE DataKinds #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | {-# LANGUAGE OverloadedStrings #-} 5 | {-# LANGUAGE ScopedTypeVariables #-} 6 | {-# LANGUAGE TypeApplications #-} 7 | {-# LANGUAGE TypeSynonymInstances #-} 8 | 9 | module App.Naive 10 | ( loadSlowCursor 11 | , loadFastCursor 12 | ) where 13 | 14 | import HaskellWorks.Data.BalancedParens.RangeMin2 15 | import HaskellWorks.Data.BalancedParens.Simple 16 | import HaskellWorks.Data.Bits.BitShown 17 | import HaskellWorks.Data.FromByteString 18 | import HaskellWorks.Data.RankSelect.CsPoppy1 19 | import HaskellWorks.Data.Xml.Succinct.Cursor 20 | import HaskellWorks.Data.Xml.Succinct.Cursor.MMap 21 | 22 | import qualified Data.ByteString as BS 23 | 24 | -- | Load an XML file into memory and return a raw cursor initialised to the 25 | -- start of the XML document. 26 | loadSlowCursor :: FilePath -> IO SlowCursor 27 | loadSlowCursor path = do 28 | !bs <- BS.readFile path 29 | let !cursor = fromByteString bs :: SlowCursor 30 | return cursor 31 | 32 | -- | Load an XML file into memory and return a query-optimised cursor initialised 33 | -- to the start of the XML document. 34 | loadFastCursor :: FilePath -> IO FastCursor 35 | loadFastCursor filename = do 36 | -- Load the XML file into memory as a raw cursor. 37 | -- The raw XML data is `text`, and `ib` and `bp` are the indexes. 38 | -- `ib` and `bp` can be persisted to an index file for later use to avoid 39 | -- re-parsing the file. 40 | XmlCursor !text (BitShown !ib) (SimpleBalancedParens !bp) _ <- loadSlowCursor filename 41 | let !bpCsPoppy = makeCsPoppy bp 42 | let !rangeMinMax = mkRangeMin2 bpCsPoppy 43 | let !ibCsPoppy = makeCsPoppy ib 44 | return $ XmlCursor text ibCsPoppy rangeMinMax 1 45 | -------------------------------------------------------------------------------- /app/App/Options.hs: -------------------------------------------------------------------------------- 1 | module App.Options 2 | ( optionParser 3 | , textOption 4 | ) where 5 | 6 | import Data.Text (Text) 7 | 8 | import qualified Data.Attoparsec.Text as AT 9 | import qualified Data.Text as T 10 | import qualified Options.Applicative as OA 11 | 12 | optionParser :: AT.Parser a -> OA.Mod OA.OptionFields a -> OA.Parser a 13 | optionParser p = OA.option (OA.eitherReader (AT.parseOnly p . T.pack)) 14 | 15 | textOption :: OA.Mod OA.OptionFields String -> OA.Parser Text 16 | textOption = fmap T.pack . OA.strOption 17 | -------------------------------------------------------------------------------- /app/App/Show.hs: -------------------------------------------------------------------------------- 1 | module App.Show 2 | ( tshow 3 | ) where 4 | 5 | import Data.Text (Text) 6 | 7 | import qualified Data.Text as T 8 | 9 | tshow :: Show a => a -> Text 10 | tshow = T.pack . show 11 | -------------------------------------------------------------------------------- /app/App/XPath/Parser.hs: -------------------------------------------------------------------------------- 1 | module App.XPath.Parser 2 | ( path 3 | ) where 4 | 5 | import Control.Applicative 6 | import Data.Attoparsec.Text 7 | import Data.Text (Text) 8 | 9 | import qualified App.XPath.Types as XP 10 | import qualified Data.Text as T 11 | 12 | tag :: Parser Text 13 | tag = T.cons <$> letter <*> tagTail 14 | 15 | tagTail :: Parser Text 16 | tagTail = T.pack <$> many (letter <|> digit <|> char '-' <|> char '_') 17 | 18 | path :: Parser XP.XPath 19 | path = XP.XPath <$> sepBy1 tag (char '/') 20 | -------------------------------------------------------------------------------- /app/App/XPath/Types.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | 3 | module App.XPath.Types where 4 | 5 | import Data.Text (Text) 6 | import GHC.Generics 7 | 8 | newtype XPath = XPath 9 | { path :: [Text] 10 | } deriving (Eq, Show, Generic) 11 | -------------------------------------------------------------------------------- /app/Main.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | 3 | import App.Commands 4 | import Control.Monad 5 | import Options.Applicative 6 | 7 | main :: IO () 8 | main = join $ customExecParser 9 | (prefs $ showHelpOnEmpty <> showHelpOnError) 10 | (info (commands <**> helper) idm) 11 | -------------------------------------------------------------------------------- /bench/Main.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE ScopedTypeVariables #-} 3 | 4 | module Main where 5 | 6 | import Criterion.Main 7 | import Data.ByteString (ByteString) 8 | import Data.Word 9 | import Foreign 10 | import HaskellWorks.Data.BalancedParens.Simple 11 | import HaskellWorks.Data.Bits.BitShown 12 | import HaskellWorks.Data.FromByteString 13 | import HaskellWorks.Data.Xml.Internal.Blank 14 | import HaskellWorks.Data.Xml.Internal.List 15 | import HaskellWorks.Data.Xml.Internal.Tables 16 | import HaskellWorks.Data.Xml.Succinct.Cursor 17 | import System.IO.MMap 18 | 19 | import qualified Data.ByteString as BS 20 | import qualified Data.ByteString.Internal as BSI 21 | import qualified Data.Vector.Storable as DVS 22 | 23 | setupEnvXml :: FilePath -> IO ByteString 24 | setupEnvXml filepath = do 25 | (fptr :: ForeignPtr Word8, offset, size) <- mmapFileForeignPtr filepath ReadOnly Nothing 26 | let !bs = BSI.fromForeignPtr (castForeignPtr fptr) offset size 27 | return bs 28 | 29 | loadXml :: ByteString -> XmlCursor ByteString (BitShown (DVS.Vector Word64)) (SimpleBalancedParens (DVS.Vector Word64)) 30 | loadXml bs = fromByteString bs :: XmlCursor ByteString (BitShown (DVS.Vector Word64)) (SimpleBalancedParens (DVS.Vector Word64)) 31 | 32 | xmlToInterestBits3 :: [ByteString] -> [ByteString] 33 | xmlToInterestBits3 = blankedXmlToInterestBits . blankXml 34 | 35 | runCon :: ([i] -> [ByteString]) -> i -> ByteString 36 | runCon con bs = BS.concat $ con [bs] 37 | 38 | benchRankXmlCatalogLists :: [Benchmark] 39 | benchRankXmlCatalogLists = 40 | [ env (setupEnvXml "data/catalog.xml") $ \bs -> bgroup "catalog.xml" 41 | [ bench "Run blankXml" (whnf (runCon blankXml ) bs) 42 | , bench "Run xmlToInterestBits3" (whnf (runCon xmlToInterestBits3) bs) 43 | , bench "loadXml" (whnf loadXml bs) 44 | ] 45 | ] 46 | 47 | setupInterestingWord8s :: IO () 48 | setupInterestingWord8s = do 49 | let !_ = interestingWord8s 50 | return () 51 | 52 | benchIsInterestingWord8 :: [Benchmark] 53 | benchIsInterestingWord8 = 54 | [ env setupInterestingWord8s $ \_ -> bgroup "Interesting Word8 lookup" 55 | [ bench "isInterestingWord8" (whnf isInterestingWord8 0) 56 | ] 57 | ] 58 | 59 | main :: IO () 60 | main = defaultMain $ concat 61 | [ benchIsInterestingWord8 62 | , benchRankXmlCatalogLists 63 | ] 64 | -------------------------------------------------------------------------------- /cabal.project: -------------------------------------------------------------------------------- 1 | packages: . 2 | -------------------------------------------------------------------------------- /data/catalog.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Bloodroot 5 | Sanguinaria canadensis 6 | 4 7 | Mostly Shady 8 | $2.44 9 | 031599 10 | 11 | 12 | 13 | Columbine 14 | Aquilegia canadensis 15 | 3 16 | Mostly Shady 17 | $9.37 18 | 030699 19 | 20 | 21 | 22 | Marsh Marigold 23 | Caltha palustris 24 | 4 25 | Mostly Sunny 26 | $6.81 27 | 051799 28 | 29 | 30 | 31 | Cowslip 32 | Caltha palustris 33 | 4 34 | Mostly Shady 35 | $9.90 36 | 030699 37 | 38 | 39 | 40 | Dutchman's-Breeches 41 | Diecentra cucullaria 42 | 3 43 | Mostly Shady 44 | $6.44 45 | 012099 46 | 47 | 48 | 49 | Ginger, Wild 50 | Asarum canadense 51 | 3 52 | Mostly Shady 53 | $9.03 54 | 041899 55 | 56 | 57 | 58 | Hepatica 59 | Hepatica americana 60 | 4 61 | Mostly Shady 62 | $4.45 63 | 012699 64 | 65 | 66 | 67 | Liverleaf 68 | Hepatica americana 69 | 4 70 | Mostly Shady 71 | $3.99 72 | 010299 73 | 74 | 75 | 76 | Jack-In-The-Pulpit 77 | Arisaema triphyllum 78 | 4 79 | Mostly Shady 80 | $3.23 81 | 020199 82 | 83 | 84 | 85 | Mayapple 86 | Podophyllum peltatum 87 | 3 88 | Mostly Shady 89 | $2.98 90 | 060599 91 | 92 | 93 | 94 | Phlox, Woodland 95 | Phlox divaricata 96 | 3 97 | Sun or Shade 98 | $2.80 99 | 012299 100 | 101 | 102 | 103 | Phlox, Blue 104 | Phlox divaricata 105 | 3 106 | Sun or Shade 107 | $5.59 108 | 021699 109 | 110 | 111 | 112 | Spring-Beauty 113 | Claytonia Virginica 114 | 7 115 | Mostly Shady 116 | $6.59 117 | 020199 118 | 119 | 120 | 121 | Trillium 122 | Trillium grandiflorum 123 | 5 124 | Sun or Shade 125 | $3.90 126 | 042999 127 | 128 | 129 | 130 | Wake Robin 131 | Trillium grandiflorum 132 | 5 133 | Sun or Shade 134 | $3.20 135 | 022199 136 | 137 | 138 | 139 | Violet, Dog-Tooth 140 | Erythronium americanum 141 | 4 142 | Shade 143 | $9.04 144 | 020199 145 | 146 | 147 | 148 | Trout Lily 149 | Erythronium americanum 150 | 4 151 | Shade 152 | $6.94 153 | 032499 154 | 155 | 156 | 157 | Adder's-Tongue 158 | Erythronium americanum 159 | 4 160 | Shade 161 | $9.58 162 | 041399 163 | 164 | 165 | 166 | Anemone 167 | Anemone blanda 168 | 6 169 | Mostly Shady 170 | $8.86 171 | 122698 172 | 173 | 174 | 175 | Grecian Windflower 176 | Anemone blanda 177 | 6 178 | Mostly Shady 179 | $9.16 180 | 071099 181 | 182 | 183 | 184 | Bee Balm 185 | Monarda didyma 186 | 4 187 | Shade 188 | $4.59 189 | 050399 190 | 191 | 192 | 193 | Bergamont 194 | Monarda didyma 195 | 4 196 | Shade 197 | $7.16 198 | 042799 199 | 200 | 201 | 202 | Black-Eyed Susan 203 | Rudbeckia hirta 204 | Annual 205 | Sunny 206 | $9.80 207 | 061899 208 | 209 | 210 | 211 | Buttercup 212 | Ranunculus 213 | 4 214 | Shade 215 | $2.57 216 | 061099 217 | 218 | 219 | 220 | Crowfoot 221 | Ranunculus 222 | 4 223 | Shade 224 | $9.34 225 | 040399 226 | 227 | 228 | 229 | Butterfly Weed 230 | Asclepias tuberosa 231 | Annual 232 | Sunny 233 | $2.78 234 | 063099 235 | 236 | 237 | 238 | Cinquefoil 239 | Potentilla 240 | Annual 241 | Shade 242 | $7.06 243 | 052599 244 | 245 | 246 | 247 | Primrose 248 | Oenothera 249 | 3 - 5 250 | Sunny 251 | $6.56 252 | 013099 253 | 254 | 255 | 256 | Gentian 257 | Gentiana 258 | 4 259 | Sun or Shade 260 | $7.81 261 | 051899 262 | 263 | 264 | 265 | Blue Gentian 266 | Gentiana 267 | 4 268 | Sun or Shade 269 | $8.56 270 | 050299 271 | 272 | 273 | 274 | Jacob's Ladder 275 | Polemonium caeruleum 276 | Annual 277 | Shade 278 | $9.26 279 | 022199 280 | 281 | 282 | 283 | Greek Valerian 284 | Polemonium caeruleum 285 | Annual 286 | Shade 287 | $4.36 288 | 071499 289 | 290 | 291 | 292 | California Poppy 293 | Eschscholzia californica 294 | Annual 295 | Sun 296 | $7.89 297 | 032799 298 | 299 | 300 | 301 | Shooting Star 302 | Dodecatheon 303 | Annual 304 | Mostly Shady 305 | $8.60 306 | 051399 307 | 308 | 309 | 310 | Snakeroot 311 | Cimicifuga 312 | Annual 313 | Shade 314 | $5.63 315 | 071199 316 | 317 | 318 | 319 | Cardinal Flower 320 | Lobelia cardinalis 321 | 2 322 | Shade 323 | $3.02 324 | 022299 325 | 326 | 327 | -------------------------------------------------------------------------------- /doctest/DoctestDriver.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | 3 | #if MIN_VERSION_GLASGOW_HASKELL(8,4,4,0) 4 | {-# OPTIONS_GHC -F -pgmF doctest-discover #-} 5 | #else 6 | module Main where 7 | 8 | import qualified System.IO as IO 9 | 10 | main :: IO () 11 | main = IO.putStrLn "WARNING: doctest will not run on GHC versions earlier than 8.4.4" 12 | #endif 13 | -------------------------------------------------------------------------------- /hw-xml.cabal: -------------------------------------------------------------------------------- 1 | cabal-version: 2.2 2 | 3 | name: hw-xml 4 | version: 0.5.1.2 5 | synopsis: XML parser based on succinct data structures. 6 | description: XML parser based on succinct data structures. Please see README.md 7 | category: Data, XML, Succinct Data Structures, Data Structures 8 | homepage: http://github.com/haskell-works/hw-xml#readme 9 | bug-reports: https://github.com/haskell-works/hw-xml/issues 10 | author: John Ky, 11 | Alexey Raga 12 | maintainer: alexey.raga@gmail.com 13 | copyright: 2016-2021 John Ky 14 | , 2016-2019 Alexey Raga 15 | license: BSD-3-Clause 16 | license-file: LICENSE 17 | tested-with: GHC == 9.2.2, GHC == 9.0.2, GHC == 8.10.7, GHC == 8.8.4, GHC == 8.6.5 18 | build-type: Simple 19 | extra-source-files: README.md 20 | data-files: data/catalog.xml 21 | 22 | source-repository head 23 | type: git 24 | location: https://github.com/haskell-works/hw-xml 25 | 26 | common base { build-depends: base >= 4.11 && < 5 } 27 | 28 | common prettyprinter { build-depends: prettyprinter >= 0.6.9 && < 2 } 29 | common array { build-depends: array >= 0.5.2.0 && < 0.6 } 30 | common attoparsec { build-depends: attoparsec >= 0.13.2.2 && < 0.15 } 31 | common bytestring { build-depends: bytestring >= 0.10.8.2 && < 0.13 } 32 | common cereal { build-depends: cereal >= 0.5.8.1 && < 0.6 } 33 | common containers { build-depends: containers >= 0.5.10.2 && < 0.8 } 34 | common criterion { build-depends: criterion >= 1.5.5.0 && < 1.7 } 35 | common deepseq { build-depends: deepseq >= 1.4.3.0 && < 1.6 } 36 | common doctest { build-depends: doctest >= 0.16.2 && < 0.23 } 37 | common doctest-discover { build-depends: doctest-discover >= 0.2 && < 0.3 } 38 | common generic-lens { build-depends: generic-lens >= 2.2 && < 2.3 } 39 | common ghc-prim { build-depends: ghc-prim >= 0.5 && < 0.12 } 40 | common hedgehog { build-depends: hedgehog >= 1.0 && < 1.5 } 41 | common hspec { build-depends: hspec >= 2.5 && < 3 } 42 | common hw-balancedparens { build-depends: hw-balancedparens >= 0.3.0.1 && < 0.5 } 43 | common hw-bits { build-depends: hw-bits >= 0.7.0.9 && < 0.8 } 44 | common hw-hspec-hedgehog { build-depends: hw-hspec-hedgehog >= 0.1 && < 0.2 } 45 | common hw-parser { build-depends: hw-parser >= 0.1.0.1 && < 0.2 } 46 | common hw-prim { build-depends: hw-prim >= 0.6.2.39 && < 0.7 } 47 | common hw-rankselect { build-depends: hw-rankselect >= 0.13.2.0 && < 0.14 } 48 | common hw-rankselect-base { build-depends: hw-rankselect-base >= 0.3.2.1 && < 0.4 } 49 | common lens { build-depends: lens >= 4.17.1 && < 6 } 50 | common mmap { build-depends: mmap >= 0.5.9 && < 0.6 } 51 | common mtl { build-depends: mtl >= 2.2.2 && < 3 } 52 | common optparse-applicative { build-depends: optparse-applicative >= 0.15.1.0 && < 0.19 } 53 | common resourcet { build-depends: resourcet >= 1.2.2 && < 2 } 54 | common text { build-depends: text >= 1.2.3.2 && < 3 } 55 | common transformers { build-depends: transformers >= 0.5 && < 0.7 } 56 | common vector { build-depends: vector >= 0.12.0.3 && < 0.14 } 57 | common word8 { build-depends: word8 >= 0.1.3 && < 0.2 } 58 | 59 | common config 60 | default-language: Haskell2010 61 | 62 | common hw-xml 63 | build-depends: hw-xml 64 | 65 | library 66 | import: base, config 67 | , array 68 | , attoparsec 69 | , base 70 | , bytestring 71 | , cereal 72 | , containers 73 | , deepseq 74 | , ghc-prim 75 | , hw-balancedparens 76 | , hw-bits 77 | , hw-parser 78 | , hw-prim 79 | , hw-rankselect 80 | , hw-rankselect-base 81 | , lens 82 | , mmap 83 | , mtl 84 | , prettyprinter 85 | , resourcet 86 | , text 87 | , transformers 88 | , vector 89 | , word8 90 | exposed-modules: HaskellWorks.Data.Xml 91 | HaskellWorks.Data.Xml.Blank 92 | HaskellWorks.Data.Xml.CharLike 93 | HaskellWorks.Data.Xml.Decode 94 | HaskellWorks.Data.Xml.DecodeError 95 | HaskellWorks.Data.Xml.DecodeResult 96 | HaskellWorks.Data.Xml.Grammar 97 | HaskellWorks.Data.Xml.Index 98 | HaskellWorks.Data.Xml.Internal.BalancedParens 99 | HaskellWorks.Data.Xml.Internal.ByteString 100 | HaskellWorks.Data.Xml.Internal.Blank 101 | HaskellWorks.Data.Xml.Internal.List 102 | HaskellWorks.Data.Xml.Internal.Show 103 | HaskellWorks.Data.Xml.Internal.Tables 104 | HaskellWorks.Data.Xml.Internal.ToIbBp64 105 | HaskellWorks.Data.Xml.Internal.Words 106 | HaskellWorks.Data.Xml.Lens 107 | HaskellWorks.Data.Xml.Succinct 108 | HaskellWorks.Data.Xml.Succinct.Cursor 109 | HaskellWorks.Data.Xml.Succinct.Cursor.BalancedParens 110 | HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml 111 | HaskellWorks.Data.Xml.Succinct.Cursor.Create 112 | HaskellWorks.Data.Xml.Succinct.Cursor.InterestBits 113 | HaskellWorks.Data.Xml.Succinct.Cursor.Internal 114 | HaskellWorks.Data.Xml.Succinct.Cursor.Load 115 | HaskellWorks.Data.Xml.Succinct.Cursor.Types 116 | HaskellWorks.Data.Xml.Succinct.Cursor.MMap 117 | HaskellWorks.Data.Xml.Succinct.Cursor.Token 118 | HaskellWorks.Data.Xml.Succinct.Index 119 | HaskellWorks.Data.Xml.RawDecode 120 | HaskellWorks.Data.Xml.RawValue 121 | HaskellWorks.Data.Xml.Token.Tokenize 122 | HaskellWorks.Data.Xml.Token.Types 123 | HaskellWorks.Data.Xml.Token 124 | HaskellWorks.Data.Xml.Type 125 | HaskellWorks.Data.Xml.Value 126 | other-modules: Paths_hw_xml 127 | autogen-modules: Paths_hw_xml 128 | hs-source-dirs: src 129 | ghc-options: -Wall -O2 -msse4.2 130 | 131 | executable hw-xml 132 | import: base, config 133 | , attoparsec 134 | , bytestring 135 | , deepseq 136 | , generic-lens 137 | , hw-balancedparens 138 | , hw-bits 139 | , hw-prim 140 | , hw-rankselect 141 | , hw-xml 142 | , lens 143 | , mmap 144 | , mtl 145 | , optparse-applicative 146 | , resourcet 147 | , text 148 | , vector 149 | main-is: Main.hs 150 | other-modules: App.Commands 151 | App.Commands.Count 152 | App.Commands.CreateBlankedXml 153 | App.Commands.CreateBpIndex 154 | App.Commands.CreateIbIndex 155 | App.Commands.CreateIndex 156 | App.Commands.Demo 157 | App.Commands.Types 158 | App.Options 159 | App.XPath.Parser 160 | App.XPath.Types 161 | App.Show 162 | App.Naive 163 | hs-source-dirs: app 164 | ghc-options: -threaded -rtsopts -with-rtsopts=-N -O2 -Wall -msse4.2 165 | 166 | test-suite hw-xml-test 167 | import: base, config 168 | , attoparsec 169 | , base 170 | , bytestring 171 | , hedgehog 172 | , hspec 173 | , hw-balancedparens 174 | , hw-bits 175 | , hw-hspec-hedgehog 176 | , hw-prim 177 | , hw-xml 178 | , hw-rankselect 179 | , hw-rankselect-base 180 | , text 181 | , vector 182 | type: exitcode-stdio-1.0 183 | main-is: Spec.hs 184 | hs-source-dirs: test 185 | ghc-options: -threaded -rtsopts -with-rtsopts=-N 186 | default-language: Haskell2010 187 | build-tool-depends: hspec-discover:hspec-discover 188 | other-modules: HaskellWorks.Data.Xml.Internal.BlankSpec 189 | HaskellWorks.Data.Xml.RawValueSpec 190 | HaskellWorks.Data.Xml.Succinct.Cursor.BalancedParensSpec 191 | HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXmlSpec 192 | HaskellWorks.Data.Xml.Succinct.Cursor.InterestBitsSpec 193 | HaskellWorks.Data.Xml.Succinct.CursorSpec.Make 194 | HaskellWorks.Data.Xml.Succinct.CursorSpec 195 | HaskellWorks.Data.Xml.Token.TokenizeSpec 196 | HaskellWorks.Data.Xml.TypeSpec 197 | 198 | benchmark bench 199 | import: base, config 200 | , bytestring 201 | , criterion 202 | , hw-balancedparens 203 | , hw-bits 204 | , hw-prim 205 | , mmap 206 | , resourcet 207 | , vector 208 | type: exitcode-stdio-1.0 209 | main-is: Main.hs 210 | build-depends: hw-xml 211 | hs-source-dirs: bench 212 | ghc-options: -O2 -Wall -msse4.2 213 | 214 | test-suite doctest 215 | import: base, config 216 | , doctest 217 | , doctest-discover 218 | , hw-xml 219 | type: exitcode-stdio-1.0 220 | ghc-options: -threaded -rtsopts -with-rtsopts=-N 221 | main-is: DoctestDriver.hs 222 | HS-Source-Dirs: doctest 223 | build-tool-depends: doctest-discover:doctest-discover 224 | -------------------------------------------------------------------------------- /project.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | CABAL_FLAGS="-j8" 4 | 5 | cmd="$1" 6 | 7 | shift 8 | 9 | cabal-install() { 10 | cabal v2-install \ 11 | -j8 \ 12 | --installdir="$HOME/.local/bin" \ 13 | --overwrite-policy=always \ 14 | --disable-documentation \ 15 | $CABAL_FLAGS "$@" 16 | } 17 | 18 | cabal-build() { 19 | cabal v2-build \ 20 | --enable-tests \ 21 | --write-ghc-environment-files=ghc8.4.4+ \ 22 | $CABAL_FLAGS "$@" 23 | } 24 | 25 | cabal-test() { 26 | cabal v2-test \ 27 | --enable-tests \ 28 | --test-show-details=direct \ 29 | --test-options='+RTS -g1' \ 30 | $CABAL_FLAGS "$@" 31 | } 32 | 33 | cabal-exec() { 34 | cabal v2-exec "$(echo *.cabal | cut -d . -f 1)" "$@" 35 | } 36 | 37 | cabal-bench() { 38 | cabal v2-bench -j8 \ 39 | $CABAL_FLAGS "$@" 40 | } 41 | 42 | cabal-repl() { 43 | cabal v2-repl \ 44 | $CABAL_FLAGS "$@" 45 | } 46 | 47 | cabal-clean() { 48 | cabal v2-clean 49 | } 50 | 51 | case "$cmd" in 52 | install) 53 | cabal-install 54 | ;; 55 | 56 | build) 57 | cabal-build 58 | ;; 59 | 60 | exec) 61 | cabal-exec 62 | ;; 63 | 64 | test) 65 | cabal-build 66 | cabal-test 67 | ;; 68 | 69 | bench) 70 | cabal-bench 71 | ;; 72 | 73 | repl) 74 | cabal-repl 75 | ;; 76 | 77 | clean) 78 | cabal-clean 79 | ;; 80 | 81 | *) 82 | echo "Unrecognised command: $cmd" 83 | exit 1 84 | esac 85 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml.hs: -------------------------------------------------------------------------------- 1 | module HaskellWorks.Data.Xml 2 | ( module X 3 | ) where 4 | 5 | import HaskellWorks.Data.Xml.Decode as X 6 | import HaskellWorks.Data.Xml.DecodeError as X 7 | import HaskellWorks.Data.Xml.Succinct as X 8 | import HaskellWorks.Data.Xml.Token as X 9 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Blank.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module HaskellWorks.Data.Xml.Blank 5 | ( blankXml 6 | ) where 7 | 8 | import Data.ByteString (ByteString) 9 | import Data.Word 10 | import Data.Word8 11 | import HaskellWorks.Data.Xml.Internal.Words 12 | import Prelude 13 | 14 | import qualified Data.ByteString as BS 15 | 16 | type ExpectedChar = Word8 17 | 18 | data BlankState 19 | = InXml 20 | | InTag 21 | | InAttrList 22 | | InCloseTag 23 | | InClose 24 | | InBang Int 25 | | InString ExpectedChar 26 | | InText 27 | | InMeta 28 | | InCdataTag 29 | | InCdata Int 30 | | InRem Int 31 | | InIdent 32 | deriving (Eq, Show) 33 | 34 | data ByteStringP = BSP Word8 ByteString | EmptyBSP deriving Show 35 | 36 | blankXml :: BS.ByteString -> BS.ByteString 37 | blankXml as = fst (BS.unfoldrN (BS.length as) go (InXml, as)) 38 | where go :: (BlankState, ByteString) -> Maybe (Word8, (BlankState, ByteString)) 39 | go (InXml, bs) = case BS.uncons bs of 40 | Just (!c, !cs) | isMetaStart c cs -> Just (_bracketleft , (InMeta , cs)) 41 | Just (!c, !cs) | isEndTag c cs -> Just (_space , (InCloseTag , cs)) 42 | Just (!c, !cs) | isTextStart c -> Just (_t , (InText , cs)) 43 | Just (!c, !cs) | c == _less -> Just (_less , (InTag , cs)) 44 | Just (!c, !cs) | isSpace c -> Just (c , (InXml , cs)) 45 | Just ( _, !cs) -> Just (_space , (InXml , cs)) 46 | Nothing -> Nothing 47 | go (InTag, bs) = case BS.uncons bs of 48 | Just (!c, !cs) | isSpace c -> Just (_parenleft , (InAttrList , cs)) 49 | Just (!c, !cs) | isTagClose c cs -> Just (_space , (InClose , cs)) 50 | Just (!c, !cs) | c == _greater -> Just (_space , (InXml , cs)) 51 | Just (!c, !cs) | isSpace c -> Just (c , (InTag , cs)) 52 | Just ( _, !cs) -> Just (_space , (InTag , cs)) 53 | Nothing -> Nothing 54 | go (InCloseTag, bs) = case BS.uncons bs of 55 | Just (!c, !cs) | c == _greater -> Just (_greater , (InXml , cs)) 56 | Just ( _, !cs) -> Just (_space , (InCloseTag , cs)) 57 | Nothing -> Nothing 58 | go (InAttrList, bs) = case BS.uncons bs of 59 | Just (!c, !cs) | c == _greater -> Just (_parenright , (InXml , cs)) 60 | Just (!c, !cs) | isTagClose c cs -> Just (_parenright , (InClose , cs)) 61 | Just (!c, !cs) | isNameStartChar c -> Just (_a , (InIdent , cs)) 62 | Just (!c, !cs) | isQuote c -> Just (_v , (InString c , cs)) 63 | Just (!c, !cs) | isSpace c -> Just (c , (InAttrList , cs)) 64 | Just ( _, !cs) -> Just (_space , (InAttrList , cs)) 65 | Nothing -> Nothing 66 | go (InClose, bs) = case BS.uncons bs of 67 | Just (_, !cs) -> Just (_greater , (InXml , cs)) 68 | Nothing -> Nothing 69 | go (InIdent, bs) = case BS.uncons bs of 70 | Just (!c, !cs) | isNameChar c -> Just (_space , (InIdent , cs)) 71 | Just (!c, !cs) | isSpace c -> Just (_space , (InAttrList , cs)) 72 | Just (!c, !cs) | c == _equal -> Just (_space , (InAttrList , cs)) 73 | Just (!c, !cs) | isSpace c -> Just (c , (InAttrList , cs)) 74 | Just ( _, !cs) -> Just (_space , (InAttrList , cs)) 75 | Nothing -> Nothing 76 | go (InString q, bs) = case BS.uncons bs of 77 | Just (!c, !cs) | c == q -> Just (_space , (InAttrList , cs)) 78 | Just (!c, !cs) | isSpace c -> Just (c , (InString q , cs)) 79 | Just ( _, !cs) -> Just (_space , (InString q , cs)) 80 | Nothing -> Nothing 81 | go (InText, bs) = case BS.uncons bs of 82 | Just (!c, !cs) | isEndTag c cs -> Just (_space , (InCloseTag , cs)) 83 | Just ( _, !cs) | headIs (== _less) cs -> Just (_space , (InXml , cs)) 84 | Just (!c, !cs) | isSpace c -> Just (c , (InText , cs)) 85 | Just ( _, !cs) -> Just (_space , (InText , cs)) 86 | Nothing -> Nothing 87 | go (InMeta, bs) = case BS.uncons bs of 88 | Just (!c, !cs) | c == _exclam -> Just (_space , (InMeta , cs)) 89 | Just (!c, !cs) | c == _hyphen -> Just (_space , (InRem 0 , cs)) 90 | Just (!c, !cs) | c == _bracketleft -> Just (_space , (InCdataTag , cs)) 91 | Just (!c, !cs) | c == _greater -> Just (_bracketright, (InXml , cs)) 92 | Just (!c, !cs) | isSpace c -> Just (c , (InBang 1 , cs)) 93 | Just ( _, !cs) -> Just (_space , (InBang 1 , cs)) 94 | Nothing -> Nothing 95 | go (InCdataTag, bs) = case BS.uncons bs of 96 | Just (!c, !cs) | c == _bracketleft -> Just (_space , (InCdata 0 , cs)) 97 | Just (!c, !cs) | isSpace c -> Just (c , (InCdataTag , cs)) 98 | Just ( _, !cs) -> Just (_space , (InCdataTag , cs)) 99 | Nothing -> Nothing 100 | go (InCdata n, bs) = case BS.uncons bs of 101 | Just (!c, !cs) | c == _greater && n >= 2 -> Just (_bracketright, (InXml , cs)) 102 | Just (!c, !cs) | isCdataEnd c cs && n > 0 -> Just (_space , (InCdata (n+1), cs)) 103 | Just (!c, !cs) | c == _bracketright -> Just (_space , (InCdata (n+1), cs)) 104 | Just (!c, !cs) | isSpace c -> Just (c , (InCdata 0 , cs)) 105 | Just ( _, !cs) -> Just (_space , (InCdata 0 , cs)) 106 | Nothing -> Nothing 107 | go (InRem n, bs) = case BS.uncons bs of 108 | Just (!c, !cs) | c == _greater && n >= 2 -> Just (_bracketright, (InXml , cs)) 109 | Just (!c, !cs) | c == _hyphen -> Just (_space , (InRem (n+1) , cs)) 110 | Just (!c, !cs) | isSpace c -> Just (c , (InRem 0 , cs)) 111 | Just ( _, !cs) -> Just (_space , (InRem 0 , cs)) 112 | Nothing -> Nothing 113 | go (InBang n, bs) = case BS.uncons bs of 114 | Just (!c, !cs) | c == _less -> Just (_bracketleft , (InBang (n+1) , cs)) 115 | Just (!c, !cs) | c == _greater && n == 1 -> Just (_bracketright, (InXml , cs)) 116 | Just (!c, !cs) | c == _greater -> Just (_bracketright, (InBang (n-1) , cs)) 117 | Just (!c, !cs) | isSpace c -> Just (c , (InBang n , cs)) 118 | Just ( _, !cs) -> Just (_space , (InBang n , cs)) 119 | Nothing -> Nothing 120 | 121 | isEndTag :: Word8 -> ByteString -> Bool 122 | isEndTag c cs = c == _less && headIs (== _slash) cs 123 | {-# INLINE isEndTag #-} 124 | 125 | isTagClose :: Word8 -> ByteString -> Bool 126 | isTagClose c cs = (c == _slash) || ((c == _slash || c == _question) && headIs (== _greater) cs) 127 | {-# INLINE isTagClose #-} 128 | 129 | isMetaStart :: Word8 -> ByteString -> Bool 130 | isMetaStart c cs = c == _less && headIs (== _exclam) cs 131 | {-# INLINE isMetaStart #-} 132 | 133 | isCdataEnd :: Word8 -> ByteString -> Bool 134 | isCdataEnd c cs = c == _bracketright && headIs (== _greater) cs 135 | {-# INLINE isCdataEnd #-} 136 | 137 | headIs :: (Word8 -> Bool) -> ByteString -> Bool 138 | headIs p bs = case BS.uncons bs of 139 | Just (!c, _) -> p c 140 | Nothing -> False 141 | {-# INLINE headIs #-} 142 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/CharLike.hs: -------------------------------------------------------------------------------- 1 | module HaskellWorks.Data.Xml.CharLike where 2 | 3 | import Data.Word 4 | import Data.Word8 as W 5 | 6 | class XmlCharLike c where 7 | isElementStart :: c -> Bool 8 | isExclam :: c -> Bool 9 | isHyphen :: c -> Bool 10 | isOpenBracket :: c -> Bool 11 | isQuestion :: c -> Bool 12 | isQuote :: c -> Bool 13 | isSpace :: c -> Bool 14 | 15 | instance XmlCharLike Word8 where 16 | isElementStart = (== _less) 17 | isExclam = (== _exclam) 18 | isHyphen = (== _hyphen) 19 | isOpenBracket = (== _bracketleft) 20 | isQuestion = (== _question) 21 | isQuote c = c == _quotedbl || c == _quotesingle 22 | isSpace = W.isSpace 23 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Decode.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module HaskellWorks.Data.Xml.Decode where 4 | 5 | import Control.Applicative 6 | import Control.Lens 7 | import Control.Monad 8 | import Data.Foldable 9 | import Data.Text (Text) 10 | import HaskellWorks.Data.Xml.DecodeError 11 | import HaskellWorks.Data.Xml.DecodeResult 12 | import HaskellWorks.Data.Xml.Internal.Show 13 | import HaskellWorks.Data.Xml.Value 14 | 15 | class Decode a where 16 | decode :: Value -> DecodeResult a 17 | 18 | instance Decode Value where 19 | decode = DecodeOk 20 | {-# INLINE decode #-} 21 | 22 | failDecode :: Text -> DecodeResult a 23 | failDecode = DecodeFailed . DecodeError 24 | 25 | (@>) :: Value -> Text -> DecodeResult Text 26 | (@>) (XmlElement _ as _) n = case find (\v -> fst v == n) as of 27 | Just (_, text) -> DecodeOk text 28 | Nothing -> failDecode $ "No such attribute " <> tshow n 29 | (@>) _ n = failDecode $ "Not an element whilst looking up attribute " <> tshow n 30 | 31 | (/>) :: Value -> Text -> DecodeResult Value 32 | (/>) (XmlElement _ _ cs) n = go cs 33 | where go [] = failDecode $ "Unable to find element " <> tshow n 34 | go (r:rs) = case r of 35 | e@(XmlElement n' _ _) | n' == n -> DecodeOk e 36 | _ -> go rs 37 | (/>) _ n = failDecode $ "Expecting parent of element " <> tshow n 38 | 39 | (?>) :: Value -> (Value -> DecodeResult Value) -> DecodeResult Value 40 | (?>) v f = f v <|> pure v 41 | 42 | (~>) :: Value -> Text -> DecodeResult Value 43 | (~>) e@(XmlElement n' _ _) n | n' == n = DecodeOk e 44 | (~>) _ n = failDecode $ "Expecting parent of element " <> tshow n 45 | 46 | (/>>) :: Value -> Text -> DecodeResult [Value] 47 | (/>>) v n = v ^. childNodes <&> (~> n) <&> toList & join & pure 48 | 49 | -- Contextful 50 | 51 | () :: DecodeResult Value -> Text -> DecodeResult Value 52 | () ma n = ma >>= (/> n) 53 | 54 | (<@>) :: DecodeResult Value -> Text -> DecodeResult Text 55 | (<@>) ma n = ma >>= (@> n) 56 | 57 | () :: DecodeResult Value -> (Value -> DecodeResult Value) -> DecodeResult Value 58 | () ma f = ma >>= (?> f) 59 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/DecodeError.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 3 | 4 | module HaskellWorks.Data.Xml.DecodeError where 5 | 6 | import Control.DeepSeq 7 | import Data.Text (Text) 8 | import GHC.Generics 9 | 10 | newtype DecodeError = DecodeError Text deriving (Eq, Show, Generic, NFData) 11 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/DecodeResult.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveFunctor #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module HaskellWorks.Data.Xml.DecodeResult where 5 | 6 | import Control.Applicative 7 | import HaskellWorks.Data.Xml.DecodeError 8 | 9 | data DecodeResult a 10 | = DecodeOk a 11 | | DecodeFailed DecodeError 12 | deriving (Eq, Show, Functor) 13 | 14 | instance Applicative DecodeResult where 15 | pure = DecodeOk 16 | {-# INLINE pure #-} 17 | 18 | (<*>) (DecodeOk f ) (DecodeOk a) = DecodeOk (f a) 19 | (<*>) (DecodeOk _ ) (DecodeFailed e) = DecodeFailed e 20 | (<*>) (DecodeFailed e) _ = DecodeFailed e 21 | {-# INLINE (<*>) #-} 22 | 23 | instance Monad DecodeResult where 24 | (>>=) (DecodeOk a) f = f a 25 | (>>=) (DecodeFailed e) _ = DecodeFailed e 26 | {-# INLINE (>>=) #-} 27 | 28 | instance Alternative DecodeResult where 29 | empty = DecodeFailed (DecodeError "Failed decode") 30 | (<|>) (DecodeOk a) _ = DecodeOk a 31 | (<|>) _ (DecodeOk b) = DecodeOk b 32 | (<|>) _ (DecodeFailed e) = DecodeFailed e 33 | {-# INLINE (<|>) #-} 34 | 35 | instance Foldable DecodeResult where 36 | foldr f z (DecodeOk a) = f a z 37 | foldr _ z (DecodeFailed _) = z 38 | 39 | instance Traversable DecodeResult where 40 | traverse _ (DecodeFailed e) = pure (DecodeFailed e) 41 | traverse f (DecodeOk x) = DecodeOk <$> f x 42 | 43 | toEither :: DecodeResult a -> Either DecodeError a 44 | toEither (DecodeOk a) = Right a 45 | toEither (DecodeFailed e) = Left e 46 | 47 | isOk :: DecodeResult a -> Bool 48 | isOk (DecodeOk _) = True 49 | isOk _ = False 50 | 51 | isFailed :: DecodeResult a -> Bool 52 | isFailed (DecodeFailed _) = True 53 | isFailed _ = False 54 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Grammar.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | {-# LANGUAGE FlexibleInstances #-} 3 | {-# LANGUAGE InstanceSigs #-} 4 | {-# LANGUAGE MultiParamTypeClasses #-} 5 | {-# LANGUAGE OverloadedStrings #-} 6 | {-# LANGUAGE ScopedTypeVariables #-} 7 | 8 | module HaskellWorks.Data.Xml.Grammar where 9 | 10 | import Control.Applicative 11 | import Data.Char 12 | import Data.String 13 | import Data.Text (Text) 14 | import Data.Word 15 | import HaskellWorks.Data.Parser 16 | 17 | import qualified Data.Attoparsec.Types as T 18 | import qualified Data.Text as T 19 | import qualified HaskellWorks.Data.Parser as P 20 | 21 | data XmlElementType 22 | = XmlElementTypeDocument 23 | | XmlElementTypeElement Text 24 | | XmlElementTypeComment 25 | | XmlElementTypeCData 26 | | XmlElementTypeMeta Text 27 | 28 | parseXmlString :: (P.Parser t Word8) => T.Parser t Text 29 | parseXmlString = do 30 | q <- satisfyChar (=='"') <|> satisfyChar (=='\'') 31 | T.pack <$> many (satisfyChar (/= q)) 32 | 33 | parseXmlElement :: (P.Parser t Word8, IsString t) => T.Parser t XmlElementType 34 | parseXmlElement = comment <|> cdata <|> doc <|> meta <|> element 35 | where 36 | comment = const XmlElementTypeComment <$> string "!--" 37 | cdata = const XmlElementTypeCData <$> string "![CDATA[" 38 | meta = XmlElementTypeMeta <$> (string "!" >> parseXmlToken) 39 | doc = const XmlElementTypeDocument <$> string "?xml" 40 | element = XmlElementTypeElement <$> parseXmlToken 41 | 42 | parseXmlToken :: (P.Parser t Word8) => T.Parser t Text 43 | parseXmlToken = T.pack <$> many (satisfyChar isNameChar "invalid string character") 44 | 45 | parseXmlAttributeName :: (P.Parser t Word8) => T.Parser t Text 46 | parseXmlAttributeName = parseXmlToken 47 | 48 | isNameStartChar :: Char -> Bool 49 | isNameStartChar w = 50 | let iw = ord w 51 | in w == '_' || w == ':' || isAlpha w 52 | || (iw >= 0xc0 && iw <= 0xd6) 53 | || (iw >= 0xd8 && iw <= 0xf6) 54 | || (iw >= 0xf8 && iw <= 0xff) 55 | 56 | isNameChar :: Char -> Bool 57 | isNameChar w = isNameStartChar w || w == '-' || w == '.' 58 | || ord w == 0xb7 || isNumber w 59 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Index.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveAnyClass #-} 2 | {-# LANGUAGE DeriveGeneric #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | 5 | module HaskellWorks.Data.Xml.Index 6 | ( Index(..) 7 | , indexVersion 8 | ) where 9 | 10 | import Control.DeepSeq 11 | import Data.Serialize 12 | import Data.Word 13 | import GHC.Generics 14 | import HaskellWorks.Data.Bits.BitShown 15 | 16 | import qualified Data.Vector.Storable as DVS 17 | 18 | indexVersion :: String 19 | indexVersion = "1.0" 20 | 21 | data Index = Index 22 | { xiVersion :: String 23 | , xiInterests :: BitShown (DVS.Vector Word64) 24 | , xiBalancedParens :: BitShown (DVS.Vector Word64) 25 | } deriving (Eq, Show, Generic, NFData) 26 | 27 | putBitShownVector :: Putter (BitShown (DVS.Vector Word64)) 28 | putBitShownVector = putVector . bitShown 29 | 30 | getBitShownVector :: Get (BitShown (DVS.Vector Word64)) 31 | getBitShownVector = BitShown <$> getVector 32 | 33 | putVector :: DVS.Vector Word64 -> Put 34 | putVector v = do 35 | let len = DVS.length v 36 | put len 37 | DVS.forM_ v put 38 | 39 | getVector :: Get (DVS.Vector Word64) 40 | getVector = do 41 | len <- get 42 | DVS.generateM len (const get) 43 | 44 | instance Serialize Index where 45 | put xi = do 46 | put $ xiVersion xi 47 | putBitShownVector $ xiInterests xi 48 | putBitShownVector $ xiBalancedParens xi 49 | 50 | get = do 51 | version <- get 52 | ib <- getBitShownVector 53 | bp <- getBitShownVector 54 | return $ Index version ib bp 55 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Internal/BalancedParens.hs: -------------------------------------------------------------------------------- 1 | module HaskellWorks.Data.Xml.Internal.BalancedParens 2 | ( blankedXmlToBalancedParens 3 | ) where 4 | 5 | import Data.ByteString (ByteString) 6 | import Data.Word 7 | import Data.Word8 8 | 9 | import qualified Data.ByteString as BS 10 | 11 | data MiniBP = MiniN | MiniT | MiniF | MiniTF 12 | 13 | blankedXmlToBalancedParens :: [ByteString] -> [ByteString] 14 | blankedXmlToBalancedParens is = case is of 15 | (bs:bss) -> do 16 | let (cs, _) = BS.unfoldrN (BS.length bs * 2) gen (Nothing, bs) 17 | cs:blankedXmlToBalancedParens bss 18 | [] -> [] 19 | where gen :: (Maybe Bool, ByteString) -> Maybe (Word8, (Maybe Bool, ByteString)) 20 | gen (Just True , bs) = Just (0xFF, (Nothing, bs)) 21 | gen (Just False , bs) = Just (0x00, (Nothing, bs)) 22 | gen (Nothing , bs) = case BS.uncons bs of 23 | Just (c, cs) -> case balancedParensOf c of 24 | MiniN -> gen (Nothing , cs) 25 | MiniT -> Just (0xFF, (Nothing , cs)) 26 | MiniF -> Just (0x00, (Nothing , cs)) 27 | MiniTF -> Just (0xFF, (Just False , cs)) 28 | Nothing -> Nothing 29 | 30 | balancedParensOf :: Word8 -> MiniBP 31 | balancedParensOf c = case c of 32 | d | d == _less -> MiniT 33 | d | d == _greater -> MiniF 34 | d | d == _bracketleft -> MiniT 35 | d | d == _bracketright -> MiniF 36 | d | d == _parenleft -> MiniT 37 | d | d == _parenright -> MiniF 38 | d | d == _t -> MiniTF 39 | d | d == _a -> MiniTF 40 | d | d == _v -> MiniTF 41 | _ -> MiniN 42 | 43 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Internal/Blank.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC-funbox-strict-fields #-} 2 | {-# LANGUAGE BangPatterns #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | 5 | module HaskellWorks.Data.Xml.Internal.Blank 6 | ( blankXml 7 | , BlankData(..) 8 | ) where 9 | 10 | import Data.ByteString (ByteString) 11 | import Data.Word 12 | import Data.Word8 13 | import HaskellWorks.Data.Xml.Internal.Words 14 | import Prelude 15 | 16 | import qualified Data.ByteString as BS 17 | 18 | type ExpectedChar = Word8 19 | 20 | data BlankState 21 | = InXml 22 | | InTag 23 | | InAttrList 24 | | InCloseTag 25 | | InClose 26 | | InBang !Int 27 | | InString !ExpectedChar 28 | | InText 29 | | InMeta 30 | | InCdataTag 31 | | InCdata !Int 32 | | InRem !Int 33 | | InIdent 34 | 35 | data BlankData = BlankData 36 | { blankState :: !BlankState 37 | , blankA :: !Word8 38 | , blankB :: !Word8 39 | , blankC :: !ByteString 40 | } 41 | 42 | blankXml :: [ByteString] -> [ByteString] 43 | blankXml = blankXmlPlan1 BS.empty InXml 44 | 45 | blankXmlPlan1 :: ByteString -> BlankState -> [ByteString] -> [ByteString] 46 | blankXmlPlan1 as lastState is = case is of 47 | (bs:bss) -> do 48 | let cs = as <> bs 49 | case BS.uncons cs of 50 | Just (d, ds) -> case BS.uncons ds of 51 | Just (e, es) -> blankXmlRun False d e es lastState bss 52 | Nothing -> blankXmlPlan1 cs lastState bss 53 | Nothing -> blankXmlPlan1 cs lastState bss 54 | [] -> [BS.map (const _space) as] 55 | 56 | blankXmlPlan2 :: Word8 -> Word8 -> BlankState -> [ByteString] -> [ByteString] 57 | blankXmlPlan2 a b lastState is = case is of 58 | (cs:css) -> blankXmlRun False a b cs lastState css 59 | [] -> blankXmlRun True a b (BS.pack [_space, _space]) lastState [] 60 | 61 | blankXmlRun :: Bool -> Word8 -> Word8 -> ByteString -> BlankState -> [ByteString] -> [ByteString] 62 | blankXmlRun done a b cs lastState is = do 63 | let (!ds, mState) = BS.unfoldrN (BS.length cs) blankByteString (BlankData lastState a b cs) 64 | case mState of 65 | Just (BlankData !nextState _ _ _) -> do 66 | let (yy, zz) = case BS.unsnoc cs of 67 | Just (ys, z) -> case BS.unsnoc ys of 68 | Just (_, y) -> (y, z) 69 | Nothing -> (b, z) 70 | Nothing -> (a, b) 71 | if done 72 | then [ds] 73 | else ds:blankXmlPlan2 yy zz nextState is 74 | Nothing -> error "No state: blankXmlRun" 75 | 76 | mkNext :: Word8 -> BlankState -> Word8 -> ByteString -> Maybe (Word8, BlankData) 77 | mkNext w s a bs = case BS.uncons bs of 78 | Just (b, cs) -> Just (w, BlankData s a b cs) 79 | Nothing -> error "This should never happen" 80 | {-# INLINE mkNext #-} 81 | 82 | blankByteString :: BlankData -> Maybe (Word8, BlankData) 83 | blankByteString (BlankData InXml a b cs) | isMetaStart a b = mkNext _bracketleft InMeta b cs 84 | blankByteString (BlankData InXml a b cs) | isEndTag a b = mkNext _space InCloseTag b cs 85 | blankByteString (BlankData InXml a b cs) | isTextStart a = mkNext _t InText b cs 86 | blankByteString (BlankData InXml a b cs) | a == _less = mkNext _less InTag b cs 87 | blankByteString (BlankData InXml a b cs) | isSpace a = mkNext a InXml b cs 88 | blankByteString (BlankData InXml _ b cs) = mkNext _space InXml b cs 89 | blankByteString (BlankData InTag a b cs) | isSpace a = mkNext _parenleft InAttrList b cs 90 | blankByteString (BlankData InTag a b cs) | isTagClose a b = mkNext _space InClose b cs 91 | blankByteString (BlankData InTag a b cs) | a == _greater = mkNext _space InXml b cs 92 | blankByteString (BlankData InTag a b cs) | isSpace a = mkNext a InTag b cs 93 | blankByteString (BlankData InTag _ b cs) = mkNext _space InTag b cs 94 | blankByteString (BlankData InCloseTag a b cs) | a == _greater = mkNext _greater InXml b cs 95 | blankByteString (BlankData InCloseTag a b cs) | isSpace a = mkNext a InCloseTag b cs 96 | blankByteString (BlankData InCloseTag _ b cs) = mkNext _space InCloseTag b cs 97 | blankByteString (BlankData InAttrList a b cs) | a == _greater = mkNext _parenright InXml b cs 98 | blankByteString (BlankData InAttrList a b cs) | isTagClose a b = mkNext _parenright InClose b cs 99 | blankByteString (BlankData InAttrList a b cs) | isNameStartChar a = mkNext _a InIdent b cs 100 | blankByteString (BlankData InAttrList a b cs) | isQuote a = mkNext _v (InString a) b cs 101 | blankByteString (BlankData InAttrList a b cs) | isSpace a = mkNext a InAttrList b cs 102 | blankByteString (BlankData InAttrList _ b cs) = mkNext _space InAttrList b cs 103 | blankByteString (BlankData InClose _ b cs) = mkNext _greater InXml b cs 104 | blankByteString (BlankData InIdent a b cs) | isNameChar a = mkNext _space InIdent b cs 105 | blankByteString (BlankData InIdent a b cs) | isSpace a = mkNext _space InAttrList b cs 106 | blankByteString (BlankData InIdent a b cs) | a == _equal = mkNext _space InAttrList b cs 107 | blankByteString (BlankData InIdent a b cs) | isSpace a = mkNext a InAttrList b cs 108 | blankByteString (BlankData InIdent _ b cs) = mkNext _space InAttrList b cs 109 | blankByteString (BlankData (InString q ) a b cs) | a == q = mkNext _space InAttrList b cs 110 | blankByteString (BlankData (InString q ) a b cs) | isSpace a = mkNext a (InString q) b cs 111 | blankByteString (BlankData (InString q ) _ b cs) = mkNext _space (InString q) b cs 112 | blankByteString (BlankData InText a b cs) | isEndTag a b = mkNext _space InCloseTag b cs 113 | blankByteString (BlankData InText _ b cs) | b == _less = mkNext _space InXml b cs 114 | blankByteString (BlankData InText a b cs) | isSpace a = mkNext a InText b cs 115 | blankByteString (BlankData InText _ b cs) = mkNext _space InText b cs 116 | blankByteString (BlankData InMeta a b cs) | a == _exclam = mkNext _space InMeta b cs 117 | blankByteString (BlankData InMeta a b cs) | a == _hyphen = mkNext _space (InRem 0) b cs 118 | blankByteString (BlankData InMeta a b cs) | a == _bracketleft = mkNext _space InCdataTag b cs 119 | blankByteString (BlankData InMeta a b cs) | a == _greater = mkNext _bracketright InXml b cs 120 | blankByteString (BlankData InMeta a b cs) | isSpace a = mkNext a (InBang 1) b cs 121 | blankByteString (BlankData InMeta _ b cs) = mkNext _space (InBang 1) b cs 122 | blankByteString (BlankData InCdataTag a b cs) | a == _bracketleft = mkNext _space (InCdata 0) b cs 123 | blankByteString (BlankData InCdataTag a b cs) | isSpace a = mkNext a InCdataTag b cs 124 | blankByteString (BlankData InCdataTag _ b cs) = mkNext _space InCdataTag b cs 125 | blankByteString (BlankData (InCdata n ) a b cs) | a == _greater && n >= 2 = mkNext _bracketright InXml b cs 126 | blankByteString (BlankData (InCdata n ) a b cs) | isCdataEnd a b && n > 0 = mkNext _space (InCdata (n+1)) b cs 127 | blankByteString (BlankData (InCdata n ) a b cs) | a == _bracketright = mkNext _space (InCdata (n+1)) b cs 128 | blankByteString (BlankData (InCdata _ ) a b cs) | isSpace a = mkNext a (InCdata 0) b cs 129 | blankByteString (BlankData (InCdata _ ) _ b cs) = mkNext _space (InCdata 0) b cs 130 | blankByteString (BlankData (InRem n ) a b cs) | a == _greater && n >= 2 = mkNext _bracketright InXml b cs 131 | blankByteString (BlankData (InRem n ) a b cs) | a == _hyphen = mkNext _space (InRem (n+1)) b cs 132 | blankByteString (BlankData (InRem _ ) a b cs) | isSpace a = mkNext a (InRem 0) b cs 133 | blankByteString (BlankData (InRem _ ) _ b cs) = mkNext _space (InRem 0) b cs 134 | blankByteString (BlankData (InBang n ) a b cs) | a == _less = mkNext _bracketleft (InBang (n+1)) b cs 135 | blankByteString (BlankData (InBang n ) a b cs) | a == _greater && n == 1 = mkNext _bracketright InXml b cs 136 | blankByteString (BlankData (InBang n ) a b cs) | a == _greater = mkNext _bracketright (InBang (n-1)) b cs 137 | blankByteString (BlankData (InBang n ) a b cs) | isSpace a = mkNext a (InBang n) b cs 138 | blankByteString (BlankData (InBang n ) _ b cs) = mkNext _space (InBang n) b cs 139 | {-# INLINE blankByteString #-} 140 | 141 | isEndTag :: Word8 -> Word8 -> Bool 142 | isEndTag a b = a == _less && b == _slash 143 | {-# INLINE isEndTag #-} 144 | 145 | isTagClose :: Word8 -> Word8 -> Bool 146 | isTagClose a b = a == _slash || ((a == _slash || a == _question) && b == _greater) 147 | {-# INLINE isTagClose #-} 148 | 149 | isMetaStart :: Word8 -> Word8 -> Bool 150 | isMetaStart a b = a == _less && b == _exclam 151 | {-# INLINE isMetaStart #-} 152 | 153 | isCdataEnd :: Word8 -> Word8 -> Bool 154 | isCdataEnd a b = a == _bracketright && b == _greater 155 | {-# INLINE isCdataEnd #-} 156 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Internal/ByteString.hs: -------------------------------------------------------------------------------- 1 | module HaskellWorks.Data.Xml.Internal.ByteString 2 | ( repartitionMod8 3 | ) where 4 | 5 | import Data.ByteString (ByteString) 6 | 7 | import qualified Data.ByteString as BS 8 | 9 | repartitionMod8 :: ByteString -> ByteString -> (ByteString, ByteString) 10 | repartitionMod8 aBS bBS = (BS.take cLen abBS, BS.drop cLen abBS) 11 | where abBS = BS.concat [aBS, bBS] 12 | abLen = BS.length abBS 13 | cLen = (abLen `div` 8) * 8 14 | {-# INLINE repartitionMod8 #-} 15 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Internal/List.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE RankNTypes #-} 3 | 4 | module HaskellWorks.Data.Xml.Internal.List 5 | ( blankedXmlToInterestBits 6 | , compressWordAsBit 7 | ) where 8 | 9 | import Data.ByteString (ByteString) 10 | import Data.Word 11 | import HaskellWorks.Data.Bits.BitWise 12 | import HaskellWorks.Data.Xml.Internal.ByteString 13 | import HaskellWorks.Data.Xml.Internal.Tables 14 | import Prelude 15 | 16 | import qualified Data.ByteString as BS 17 | 18 | blankedXmlToInterestBits :: [ByteString] -> [ByteString] 19 | blankedXmlToInterestBits = blankedXmlToInterestBits' "" 20 | 21 | blankedXmlToInterestBits' :: ByteString -> [ByteString] -> [ByteString] 22 | blankedXmlToInterestBits' rs is = case is of 23 | (bs:bss) -> do 24 | let cs = if BS.length rs /= 0 then BS.concat [rs, bs] else bs 25 | let lencs = BS.length cs 26 | let q = lencs `quot` 8 27 | let (ds, es) = BS.splitAt (q * 8) cs 28 | let (fs, _) = BS.unfoldrN q gen ds 29 | fs:blankedXmlToInterestBits' es bss 30 | [] -> do 31 | let lenrs = BS.length rs 32 | let q = lenrs + 7 `quot` 8 33 | [fst (BS.unfoldrN q gen rs)] 34 | where gen :: ByteString -> Maybe (Word8, ByteString) 35 | gen as = if BS.length as == 0 36 | then Nothing 37 | else Just ( BS.foldr' (\b m -> isInterestingWord8 b .|. (m .<. 1)) 0 (BS.take 8 as) 38 | , BS.drop 8 as 39 | ) 40 | 41 | compressWordAsBit :: [ByteString] -> [ByteString] 42 | compressWordAsBit = compressWordAsBit' BS.empty 43 | 44 | compressWordAsBit' :: ByteString -> [ByteString] -> [ByteString] 45 | compressWordAsBit' aBS iBS = case iBS of 46 | (bBS:bBSs) -> do 47 | let (cBS, dBS) = repartitionMod8 aBS bBS 48 | let (cs, _) = BS.unfoldrN (BS.length cBS + 7 `div` 8) gen cBS 49 | cs:compressWordAsBit' dBS bBSs 50 | [] -> do 51 | let (cs, _) = BS.unfoldrN (BS.length aBS + 7 `div` 8) gen aBS 52 | [cs] 53 | where gen :: ByteString -> Maybe (Word8, ByteString) 54 | gen xs = if BS.length xs == 0 55 | then Nothing 56 | else Just ( BS.foldr' (\b m -> ((b .&. 1) .|. (m .<. 1))) 0 (BS.take 8 xs) 57 | , BS.drop 8 xs 58 | ) 59 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Internal/Show.hs: -------------------------------------------------------------------------------- 1 | module HaskellWorks.Data.Xml.Internal.Show 2 | ( tshow 3 | ) where 4 | 5 | import Data.Text (Text) 6 | 7 | import qualified Data.Text as T 8 | 9 | tshow :: Show a => a -> Text 10 | tshow = T.pack . show 11 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Internal/Tables.hs: -------------------------------------------------------------------------------- 1 | module HaskellWorks.Data.Xml.Internal.Tables 2 | ( interestingWord8s 3 | , isInterestingWord8 4 | ) where 5 | 6 | import Data.Word 7 | import Data.Word8 8 | import HaskellWorks.Data.AtIndex ((!!!)) 9 | import Prelude as P 10 | 11 | import qualified Data.Vector.Storable as DVS 12 | 13 | interestingWord8s :: DVS.Vector Word8 14 | interestingWord8s = DVS.constructN 256 go 15 | where go :: DVS.Vector Word8 -> Word8 16 | go v = if w == _bracketleft 17 | || w == _braceleft 18 | || w == _parenleft 19 | || w == _bracketleft 20 | || w == _less 21 | || w == _a 22 | || w == _v 23 | || w == _t 24 | then 1 25 | else 0 26 | where w :: Word8 27 | w = fromIntegral (DVS.length v) 28 | {-# NOINLINE interestingWord8s #-} 29 | 30 | isInterestingWord8 :: Word8 -> Word8 31 | isInterestingWord8 b = fromIntegral (interestingWord8s !!! fromIntegral b) 32 | {-# INLINABLE isInterestingWord8 #-} 33 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Internal/ToIbBp64.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | {-# LANGUAGE FlexibleInstances #-} 3 | {-# LANGUAGE InstanceSigs #-} 4 | {-# LANGUAGE MultiParamTypeClasses #-} 5 | 6 | module HaskellWorks.Data.Xml.Internal.ToIbBp64 7 | ( toBalancedParens64 8 | , toInterestBits64 9 | , toIbBp64 10 | ) where 11 | 12 | import Data.ByteString (ByteString) 13 | import HaskellWorks.Data.Xml.Internal.BalancedParens 14 | import HaskellWorks.Data.Xml.Internal.List 15 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml (BlankedXml (..)) 16 | 17 | toBalancedParens64 :: BlankedXml -> [ByteString] 18 | toBalancedParens64 (BlankedXml bj) = compressWordAsBit (blankedXmlToBalancedParens bj) 19 | 20 | toInterestBits64 :: BlankedXml -> [ByteString] 21 | toInterestBits64 (BlankedXml bj) = blankedXmlToInterestBits bj 22 | 23 | toIbBp64 :: BlankedXml -> [(ByteString, ByteString)] 24 | toIbBp64 bj = zip (toInterestBits64 bj) (toBalancedParens64 bj) 25 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Internal/Words.hs: -------------------------------------------------------------------------------- 1 | module HaskellWorks.Data.Xml.Internal.Words where 2 | 3 | import Data.Word 4 | import Data.Word8 5 | 6 | isLeadingDigit :: Word8 -> Bool 7 | isLeadingDigit w = w == _hyphen || (w >= _0 && w <= _9) 8 | 9 | isTrailingDigit :: Word8 -> Bool 10 | isTrailingDigit w = w == _plus || w == _hyphen || (w >= _0 && w <= _9) || w == _period || w == _E || w == _e 11 | 12 | isAlphabetic :: Word8 -> Bool 13 | isAlphabetic w = (w >= _A && w <= _Z) || (w >= _a && w <= _z) 14 | 15 | isQuote :: Word8 -> Bool 16 | isQuote w = w == _quotedbl || w == _quotesingle 17 | 18 | isNameStartChar :: Word8 -> Bool 19 | isNameStartChar w = w == _underscore || w == _colon || isAlphabetic w 20 | || w `isIn` (0xc0, 0xd6) 21 | || w `isIn` (0xd8, 0xf6) 22 | || w `isIn` (0xf8, 0xff) 23 | 24 | isNameChar :: Word8 -> Bool 25 | isNameChar w = isNameStartChar w || w == _hyphen || w == _period 26 | || w == 0xb7 || w `isIn` (0, 9) 27 | 28 | isXml :: Word8 -> Bool 29 | isXml w = w == _less || w == _greater 30 | 31 | isTextStart :: Word8 -> Bool 32 | isTextStart w = not (isSpace w) && w /= _less && w /= _greater 33 | 34 | isIn :: Word8 -> (Word8, Word8) -> Bool 35 | isIn w (s, e) = w >= s && w <= e 36 | {-# INLINE isIn #-} 37 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Lens.hs: -------------------------------------------------------------------------------- 1 | module HaskellWorks.Data.Xml.Lens where 2 | 3 | import Control.Lens 4 | import Data.Text (Text) 5 | import HaskellWorks.Data.Xml.Value 6 | 7 | isTagNamed :: Text -> Value -> Bool 8 | isTagNamed a (XmlElement b _ _) | a == b = True 9 | isTagNamed _ _ = False 10 | 11 | tagNamed :: (Applicative f, Choice p) => Text -> Optic' p f Value Value 12 | tagNamed = filtered . isTagNamed 13 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/RawDecode.hs: -------------------------------------------------------------------------------- 1 | module HaskellWorks.Data.Xml.RawDecode where 2 | 3 | import HaskellWorks.Data.Xml.RawValue 4 | 5 | class RawDecode a where 6 | rawDecode :: RawValue -> a 7 | 8 | instance RawDecode RawValue where 9 | rawDecode = id 10 | {-# INLINE rawDecode #-} 11 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/RawValue.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleInstances #-} 2 | {-# LANGUAGE InstanceSigs #-} 3 | {-# LANGUAGE MultiParamTypeClasses #-} 4 | {-# LANGUAGE OverloadedStrings #-} 5 | {-# LANGUAGE ScopedTypeVariables #-} 6 | {-# LANGUAGE TypeApplications #-} 7 | 8 | module HaskellWorks.Data.Xml.RawValue 9 | ( RawValue(..) 10 | , RawValueAt(..) 11 | ) where 12 | 13 | import Data.ByteString (ByteString) 14 | import Data.List 15 | import Data.Text (Text) 16 | import HaskellWorks.Data.Xml.Grammar 17 | import HaskellWorks.Data.Xml.Internal.Show 18 | import HaskellWorks.Data.Xml.Succinct.Index 19 | import Prettyprinter 20 | 21 | import qualified Data.Attoparsec.ByteString.Char8 as ABC 22 | import qualified Data.ByteString as BS 23 | import qualified Data.Text as T 24 | 25 | data RawValue 26 | = RawDocument [RawValue] 27 | | RawText Text 28 | | RawElement Text [RawValue] 29 | | RawCData Text 30 | | RawComment Text 31 | | RawMeta Text [RawValue] 32 | | RawAttrName Text 33 | | RawAttrValue Text 34 | | RawAttrList [RawValue] 35 | | RawError Text 36 | deriving (Eq, Show) 37 | 38 | -- TODO use colors and styles 39 | 40 | red :: Doc ann -> Doc ann 41 | red = id 42 | 43 | dullwhite :: Doc ann -> Doc ann 44 | dullwhite = id 45 | 46 | bold :: Doc ann -> Doc ann 47 | bold = id 48 | 49 | dullgreen :: Doc ann -> Doc ann 50 | dullgreen = id 51 | 52 | instance Pretty RawValue where 53 | pretty mjpv = case mjpv of 54 | RawText s -> ctext $ pretty (T.unpack s) 55 | RawAttrName s -> pretty (T.unpack s) 56 | RawAttrValue s -> (ctext . dquotes . pretty) (T.unpack s) 57 | RawAttrList ats -> formatAttrs ats 58 | RawComment s -> pretty $ "" 59 | RawElement s xs -> formatElem (T.unpack s) xs 60 | RawDocument xs -> formatMeta "?" "xml" xs 61 | RawError s -> red $ "[error " <> pretty (T.unpack s) <> "]" 62 | RawCData s -> cangle " ctag "[CDATA[" <> pretty (T.unpack s) <> cangle "]]>" 63 | RawMeta s xs -> formatMeta "!" (T.unpack s) xs 64 | where 65 | formatAttr at = case at of 66 | RawAttrName a -> " " <> pretty (RawAttrName a) 67 | RawAttrValue a -> "=" <> pretty (RawAttrValue a) 68 | RawAttrList _ -> red "ATTRS" 69 | _ -> red "booo" 70 | formatAttrs ats = hcat (formatAttr <$> ats) 71 | formatElem s xs = 72 | let (ats, es) = partition isAttrL xs 73 | in cangle langle <> ctag (pretty s) 74 | <> hcat (pretty <$> ats) 75 | <> cangle rangle 76 | <> hcat (pretty <$> es) 77 | <> cangle " ctag (pretty s) <> cangle rangle 78 | formatMeta b s xs = 79 | let (ats, es) = partition isAttr xs 80 | in cangle (langle <> pretty @String b) <> ctag (pretty @String s) 81 | <> hcat (pretty <$> ats) 82 | <> cangle rangle 83 | <> hcat (pretty <$> es) 84 | 85 | class RawValueAt a where 86 | rawValueAt :: a -> RawValue 87 | 88 | instance RawValueAt XmlIndex where 89 | rawValueAt i = case i of 90 | XmlIndexCData s -> parseTextUntil "]]>" s `as` (RawCData . T.pack) 91 | XmlIndexComment s -> parseTextUntil "-->" s `as` (RawComment . T.pack) 92 | XmlIndexMeta s cs -> RawMeta s (rawValueAt <$> cs) 93 | XmlIndexElement s cs -> RawElement s (rawValueAt <$> cs) 94 | XmlIndexDocument cs -> RawDocument (rawValueAt <$> cs) 95 | XmlIndexAttrName cs -> parseAttrName cs `as` RawAttrName 96 | XmlIndexAttrValue cs -> parseString cs `as` RawAttrValue 97 | XmlIndexAttrList cs -> RawAttrList (rawValueAt <$> cs) 98 | XmlIndexValue s -> parseTextUntil "<" s `as` (RawText . T.pack) 99 | XmlIndexError s -> RawError s 100 | --unknown -> XmlError ("Not yet supported: " <> show unknown) 101 | where 102 | parseUntil s = ABC.manyTill ABC.anyChar (ABC.string s) 103 | 104 | parseTextUntil :: ByteString -> ByteString -> Either Text [Char] 105 | parseTextUntil s bs = case ABC.parse (parseUntil s) bs of 106 | ABC.Fail {} -> decodeErr ("Unable to find " <> tshow s <> ".") bs 107 | ABC.Partial _ -> decodeErr ("Unexpected end, expected " <> tshow s <> ".") bs 108 | ABC.Done _ r -> Right r 109 | parseString :: ByteString -> Either Text Text 110 | parseString bs = case ABC.parse parseXmlString bs of 111 | ABC.Fail {} -> decodeErr "Unable to parse string" bs 112 | ABC.Partial _ -> decodeErr "Unexpected end of string, expected" bs 113 | ABC.Done _ r -> Right r 114 | parseAttrName :: ByteString -> Either Text Text 115 | parseAttrName bs = case ABC.parse parseXmlAttributeName bs of 116 | ABC.Fail {} -> decodeErr "Unable to parse attribute name" bs 117 | ABC.Partial _ -> decodeErr "Unexpected end of attr name, expected" bs 118 | ABC.Done _ r -> Right r 119 | 120 | cangle :: Doc ann -> Doc ann 121 | cangle = dullwhite 122 | 123 | ctag :: Doc ann -> Doc ann 124 | ctag = bold 125 | 126 | ctext :: Doc ann -> Doc ann 127 | ctext = dullgreen 128 | 129 | isAttrL :: RawValue -> Bool 130 | isAttrL (RawAttrList _) = True 131 | isAttrL _ = False 132 | 133 | isAttr :: RawValue -> Bool 134 | isAttr v = case v of 135 | RawAttrName _ -> True 136 | RawAttrValue _ -> True 137 | RawAttrList _ -> True 138 | _ -> False 139 | 140 | as :: Either Text a -> (a -> RawValue) -> RawValue 141 | as = flip $ either RawError 142 | 143 | decodeErr :: Text -> BS.ByteString -> Either Text a 144 | decodeErr reason bs = Left $ reason <> " (" <> tshow (BS.take 20 bs) <> "...)" 145 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Succinct.hs: -------------------------------------------------------------------------------- 1 | module HaskellWorks.Data.Xml.Succinct 2 | ( module X 3 | ) where 4 | 5 | import HaskellWorks.Data.Xml.Succinct.Cursor as X 6 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Succinct/Cursor.hs: -------------------------------------------------------------------------------- 1 | 2 | module HaskellWorks.Data.Xml.Succinct.Cursor 3 | ( module X 4 | ) where 5 | 6 | import HaskellWorks.Data.Xml.Succinct.Cursor.Internal as X 7 | import HaskellWorks.Data.Xml.Succinct.Cursor.Token as X 8 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Succinct/Cursor/BalancedParens.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 5 | {-# LANGUAGE InstanceSigs #-} 6 | {-# LANGUAGE MultiParamTypeClasses #-} 7 | 8 | module HaskellWorks.Data.Xml.Succinct.Cursor.BalancedParens 9 | ( XmlBalancedParens(..) 10 | , getXmlBalancedParens 11 | ) where 12 | 13 | import Control.Applicative 14 | import Control.DeepSeq 15 | import Data.Word 16 | import GHC.Generics 17 | import HaskellWorks.Data.BalancedParens 18 | import HaskellWorks.Data.Xml.Internal.BalancedParens 19 | import HaskellWorks.Data.Xml.Internal.List 20 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml 21 | 22 | import qualified Data.ByteString as BS 23 | import qualified Data.Vector.Storable as DVS 24 | 25 | newtype XmlBalancedParens a = XmlBalancedParens a deriving (Eq, Show, Generic, NFData) 26 | 27 | getXmlBalancedParens :: XmlBalancedParens a -> a 28 | getXmlBalancedParens (XmlBalancedParens a) = a 29 | 30 | genBitWordsForever :: BS.ByteString -> Maybe (Word8, BS.ByteString) 31 | genBitWordsForever bs = BS.uncons bs <|> Just (0, bs) 32 | {-# INLINABLE genBitWordsForever #-} 33 | 34 | instance FromBlankedXml (XmlBalancedParens (SimpleBalancedParens (DVS.Vector Word8))) where 35 | fromBlankedXml bj = XmlBalancedParens (SimpleBalancedParens (DVS.unsafeCast (DVS.unfoldrN newLen genBitWordsForever interestBS))) 36 | where interestBS = BS.concat (compressWordAsBit (blankedXmlToBalancedParens (getBlankedXml bj))) 37 | newLen = (BS.length interestBS + 7) `div` 8 * 8 38 | 39 | instance FromBlankedXml (XmlBalancedParens (SimpleBalancedParens (DVS.Vector Word16))) where 40 | fromBlankedXml bj = XmlBalancedParens (SimpleBalancedParens (DVS.unsafeCast (DVS.unfoldrN newLen genBitWordsForever interestBS))) 41 | where interestBS = BS.concat (compressWordAsBit (blankedXmlToBalancedParens (getBlankedXml bj))) 42 | newLen = (BS.length interestBS + 7) `div` 8 * 8 43 | 44 | instance FromBlankedXml (XmlBalancedParens (SimpleBalancedParens (DVS.Vector Word32))) where 45 | fromBlankedXml bj = XmlBalancedParens (SimpleBalancedParens (DVS.unsafeCast (DVS.unfoldrN newLen genBitWordsForever interestBS))) 46 | where interestBS = BS.concat (compressWordAsBit (blankedXmlToBalancedParens (getBlankedXml bj))) 47 | newLen = (BS.length interestBS + 7) `div` 8 * 8 48 | 49 | instance FromBlankedXml (XmlBalancedParens (SimpleBalancedParens (DVS.Vector Word64))) where 50 | fromBlankedXml bj = XmlBalancedParens (SimpleBalancedParens (DVS.unsafeCast (DVS.unfoldrN newLen genBitWordsForever interestBS))) 51 | where interestBS = BS.concat (compressWordAsBit (blankedXmlToBalancedParens (getBlankedXml bj))) 52 | newLen = (BS.length interestBS + 7) `div` 8 * 8 53 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Succinct/Cursor/BlankedXml.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 3 | 4 | module HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml 5 | ( BlankedXml(..) 6 | , FromBlankedXml(..) 7 | , getBlankedXml 8 | , bsToBlankedXml 9 | , lbsToBlankedXml 10 | ) where 11 | 12 | import Control.DeepSeq 13 | import GHC.Generics 14 | import HaskellWorks.Data.Xml.Internal.Blank 15 | 16 | import qualified Data.ByteString as BS 17 | import qualified Data.ByteString.Lazy as LBS 18 | 19 | newtype BlankedXml = BlankedXml 20 | { unblankedXml :: [BS.ByteString] 21 | } deriving (Eq, Show, Generic, NFData) 22 | 23 | getBlankedXml :: BlankedXml -> [BS.ByteString] 24 | getBlankedXml (BlankedXml bs) = bs 25 | 26 | class FromBlankedXml a where 27 | fromBlankedXml :: BlankedXml -> a 28 | 29 | bsToBlankedXml :: BS.ByteString -> BlankedXml 30 | bsToBlankedXml bs = BlankedXml (blankXml [bs]) 31 | 32 | lbsToBlankedXml :: LBS.ByteString -> BlankedXml 33 | lbsToBlankedXml lbs = BlankedXml (blankXml (LBS.toChunks lbs)) 34 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Succinct/Cursor/Create.hs: -------------------------------------------------------------------------------- 1 | module HaskellWorks.Data.Xml.Succinct.Cursor.Create 2 | ( byteStringAsFastCursor 3 | , byteStringAsSlowCursor 4 | ) where 5 | 6 | import Data.Coerce 7 | import HaskellWorks.Data.BalancedParens.RangeMin2 8 | import HaskellWorks.Data.BalancedParens.Simple 9 | import HaskellWorks.Data.Bits.BitShown 10 | import HaskellWorks.Data.RankSelect.CsPoppy1 11 | import HaskellWorks.Data.Vector.Storable 12 | import HaskellWorks.Data.Xml.Succinct.Cursor 13 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml 14 | import HaskellWorks.Data.Xml.Succinct.Cursor.Types 15 | 16 | import qualified Data.ByteString as BS 17 | import qualified HaskellWorks.Data.Xml.Internal.ToIbBp64 as I 18 | 19 | byteStringAsSlowCursor :: BS.ByteString -> SlowCursor 20 | byteStringAsSlowCursor bs = XmlCursor 21 | { cursorText = bs 22 | , interests = BitShown ib 23 | , balancedParens = SimpleBalancedParens bp 24 | , cursorRank = 1 25 | } 26 | where blankedXml = bsToBlankedXml bs 27 | bsLen = BS.length bs 28 | idxLen = (bsLen + 7) `div` 8 29 | (ib, bp) = construct64UnzipN idxLen (I.toIbBp64 blankedXml) 30 | 31 | byteStringAsFastCursor :: BS.ByteString -> FastCursor 32 | byteStringAsFastCursor bs = XmlCursor bs ibCsPoppy rangeMinMax r 33 | where XmlCursor _ ib bp r = byteStringAsSlowCursor bs 34 | bpCsPoppy = makeCsPoppy (coerce bp) 35 | rangeMinMax = mkRangeMin2 bpCsPoppy 36 | ibCsPoppy = makeCsPoppy (coerce ib) 37 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Succinct/Cursor/InterestBits.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 5 | {-# LANGUAGE InstanceSigs #-} 6 | {-# LANGUAGE MultiParamTypeClasses #-} 7 | 8 | module HaskellWorks.Data.Xml.Succinct.Cursor.InterestBits 9 | ( XmlInterestBits(..) 10 | , getXmlInterestBits 11 | , blankedXmlToInterestBits 12 | , blankedXmlBssToInterestBitsBs 13 | , genInterestForever 14 | ) where 15 | 16 | import Control.Applicative 17 | import Control.DeepSeq 18 | import Data.ByteString.Internal 19 | import Data.Word 20 | import GHC.Generics 21 | import HaskellWorks.Data.Bits.BitShown 22 | import HaskellWorks.Data.FromByteString 23 | import HaskellWorks.Data.RankSelect.Poppy512 24 | import HaskellWorks.Data.Xml.Internal.List 25 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml 26 | 27 | import qualified Data.ByteString as BS 28 | import qualified Data.Vector.Storable as DVS 29 | 30 | newtype XmlInterestBits a = XmlInterestBits a deriving (Eq, Show, Generic, NFData) 31 | 32 | getXmlInterestBits :: XmlInterestBits a -> a 33 | getXmlInterestBits (XmlInterestBits a) = a 34 | 35 | blankedXmlBssToInterestBitsBs :: [ByteString] -> ByteString 36 | blankedXmlBssToInterestBitsBs bss = BS.concat $ blankedXmlToInterestBits bss 37 | 38 | genInterest :: ByteString -> Maybe (Word8, ByteString) 39 | genInterest = BS.uncons 40 | 41 | genInterestForever :: ByteString -> Maybe (Word8, ByteString) 42 | genInterestForever bs = BS.uncons bs <|> Just (0, bs) 43 | 44 | instance FromBlankedXml (XmlInterestBits (BitShown [Bool])) where 45 | fromBlankedXml = XmlInterestBits . fromByteString . BS.concat . blankedXmlToInterestBits . getBlankedXml 46 | 47 | instance FromBlankedXml (XmlInterestBits (BitShown BS.ByteString)) where 48 | fromBlankedXml = XmlInterestBits . BitShown . BS.unfoldr genInterest . blankedXmlBssToInterestBitsBs . getBlankedXml 49 | 50 | instance FromBlankedXml (XmlInterestBits (BitShown (DVS.Vector Word8))) where 51 | fromBlankedXml = XmlInterestBits . BitShown . DVS.unfoldr genInterest . blankedXmlBssToInterestBitsBs . getBlankedXml 52 | 53 | instance FromBlankedXml (XmlInterestBits (BitShown (DVS.Vector Word16))) where 54 | fromBlankedXml bj = XmlInterestBits (BitShown (DVS.unsafeCast (DVS.unfoldrN newLen genInterestForever interestBS))) 55 | where interestBS = blankedXmlBssToInterestBitsBs (getBlankedXml bj) 56 | newLen = (BS.length interestBS + 1) `div` 2 * 2 57 | 58 | instance FromBlankedXml (XmlInterestBits (BitShown (DVS.Vector Word32))) where 59 | fromBlankedXml bj = XmlInterestBits (BitShown (DVS.unsafeCast (DVS.unfoldrN newLen genInterestForever interestBS))) 60 | where interestBS = blankedXmlBssToInterestBitsBs (getBlankedXml bj) 61 | newLen = (BS.length interestBS + 3) `div` 4 * 4 62 | 63 | instance FromBlankedXml (XmlInterestBits (BitShown (DVS.Vector Word64))) where 64 | fromBlankedXml bj = XmlInterestBits (BitShown (DVS.unsafeCast (DVS.unfoldrN newLen genInterestForever interestBS))) 65 | where interestBS = blankedXmlBssToInterestBitsBs (getBlankedXml bj) 66 | newLen = (BS.length interestBS + 7) `div` 8 * 8 67 | 68 | instance FromBlankedXml (XmlInterestBits Poppy512) where 69 | fromBlankedXml = XmlInterestBits . makePoppy512 . bitShown . getXmlInterestBits . fromBlankedXml 70 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Succinct/Cursor/Internal.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | {-# LANGUAGE InstanceSigs #-} 5 | {-# LANGUAGE MultiParamTypeClasses #-} 6 | {-# LANGUAGE ScopedTypeVariables #-} 7 | 8 | module HaskellWorks.Data.Xml.Succinct.Cursor.Internal 9 | ( XmlCursor(..) 10 | , xmlCursorPos 11 | ) where 12 | 13 | import Control.DeepSeq (NFData (..)) 14 | import Data.String 15 | import Data.Word 16 | import Foreign.ForeignPtr 17 | import GHC.Generics 18 | import HaskellWorks.Data.Bits.BitShown 19 | import HaskellWorks.Data.FromByteString 20 | import HaskellWorks.Data.FromForeignRegion 21 | import HaskellWorks.Data.Positioning 22 | import HaskellWorks.Data.RankSelect.Base.Rank0 23 | import HaskellWorks.Data.RankSelect.Base.Rank1 24 | import HaskellWorks.Data.RankSelect.Base.Select1 25 | import HaskellWorks.Data.RankSelect.Poppy512 26 | import HaskellWorks.Data.TreeCursor 27 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml 28 | import HaskellWorks.Data.Xml.Succinct.Cursor.InterestBits 29 | 30 | import qualified Data.ByteString as BS 31 | import qualified Data.ByteString.Char8 as BSC 32 | import qualified Data.ByteString.Internal as BSI 33 | import qualified Data.Vector.Storable as DVS 34 | import qualified HaskellWorks.Data.BalancedParens as BP 35 | import qualified HaskellWorks.Data.Xml.Succinct.Cursor.BalancedParens as CBP 36 | 37 | data XmlCursor t v w = XmlCursor 38 | { cursorText :: !t 39 | , interests :: !v 40 | , balancedParens :: !w 41 | , cursorRank :: !Count 42 | } 43 | deriving (Eq, Show, Generic) 44 | 45 | instance (NFData t, NFData v, NFData w) => NFData (XmlCursor t v w) where 46 | rnf (XmlCursor a b c d) = rnf (a, b, c, d) 47 | 48 | instance (FromBlankedXml (XmlInterestBits a), FromBlankedXml (CBP.XmlBalancedParens b)) 49 | => FromByteString (XmlCursor BS.ByteString a b) where 50 | fromByteString bs = XmlCursor 51 | { cursorText = bs 52 | , interests = getXmlInterestBits (fromBlankedXml blankedXml) 53 | , balancedParens = CBP.getXmlBalancedParens (fromBlankedXml blankedXml) 54 | , cursorRank = 1 55 | } 56 | where blankedXml :: BlankedXml 57 | blankedXml = bsToBlankedXml bs 58 | 59 | instance IsString (XmlCursor BS.ByteString (BitShown (DVS.Vector Word8)) (BP.SimpleBalancedParens (DVS.Vector Word8))) where 60 | fromString = fromByteString . BSC.pack 61 | 62 | instance IsString (XmlCursor BS.ByteString (BitShown (DVS.Vector Word16)) (BP.SimpleBalancedParens (DVS.Vector Word16))) where 63 | fromString = fromByteString . BSC.pack 64 | 65 | instance IsString (XmlCursor BS.ByteString (BitShown (DVS.Vector Word32)) (BP.SimpleBalancedParens (DVS.Vector Word32))) where 66 | fromString = fromByteString . BSC.pack 67 | 68 | instance IsString (XmlCursor BS.ByteString (BitShown (DVS.Vector Word64)) (BP.SimpleBalancedParens (DVS.Vector Word64))) where 69 | fromString = fromByteString . BSC.pack 70 | 71 | instance IsString (XmlCursor BS.ByteString Poppy512 (BP.SimpleBalancedParens (DVS.Vector Word64))) where 72 | fromString = fromByteString . BSC.pack 73 | 74 | instance FromForeignRegion (XmlCursor BS.ByteString (BitShown (DVS.Vector Word8)) (BP.SimpleBalancedParens (DVS.Vector Word8))) where 75 | fromForeignRegion (fptr, offset, size) = fromByteString (BSI.fromForeignPtr (castForeignPtr fptr) offset size) 76 | 77 | instance FromForeignRegion (XmlCursor BS.ByteString (BitShown (DVS.Vector Word16)) (BP.SimpleBalancedParens (DVS.Vector Word16))) where 78 | fromForeignRegion (fptr, offset, size) = fromByteString (BSI.fromForeignPtr (castForeignPtr fptr) offset size) 79 | 80 | instance FromForeignRegion (XmlCursor BS.ByteString (BitShown (DVS.Vector Word32)) (BP.SimpleBalancedParens (DVS.Vector Word32))) where 81 | fromForeignRegion (fptr, offset, size) = fromByteString (BSI.fromForeignPtr (castForeignPtr fptr) offset size) 82 | 83 | instance FromForeignRegion (XmlCursor BS.ByteString (BitShown (DVS.Vector Word64)) (BP.SimpleBalancedParens (DVS.Vector Word64))) where 84 | fromForeignRegion (fptr, offset, size) = fromByteString (BSI.fromForeignPtr (castForeignPtr fptr) offset size) 85 | 86 | instance FromForeignRegion (XmlCursor BS.ByteString Poppy512 (BP.SimpleBalancedParens (DVS.Vector Word64))) where 87 | fromForeignRegion (fptr, offset, size) = fromByteString (BSI.fromForeignPtr (castForeignPtr fptr) offset size) 88 | 89 | instance (BP.BalancedParens u, Rank1 u, Rank0 u) => TreeCursor (XmlCursor t v u) where 90 | firstChild :: XmlCursor t v u -> Maybe (XmlCursor t v u) 91 | firstChild k = let mq = BP.firstChild (balancedParens k) (cursorRank k) in (\q -> k { cursorRank = q }) <$> mq 92 | 93 | nextSibling :: XmlCursor t v u -> Maybe (XmlCursor t v u) 94 | nextSibling k = (\q -> k { cursorRank = q }) <$> BP.nextSibling (balancedParens k) (cursorRank k) 95 | 96 | parent :: XmlCursor t v u -> Maybe (XmlCursor t v u) 97 | parent k = let mq = BP.parent (balancedParens k) (cursorRank k) in (\q -> k { cursorRank = q }) <$> mq 98 | 99 | depth :: XmlCursor t v u -> Maybe Count 100 | depth k = BP.depth (balancedParens k) (cursorRank k) 101 | 102 | subtreeSize :: XmlCursor t v u -> Maybe Count 103 | subtreeSize k = BP.subtreeSize (balancedParens k) (cursorRank k) 104 | 105 | xmlCursorPos :: (Rank1 w, Select1 v) => XmlCursor s v w -> Position 106 | xmlCursorPos k = toPosition (select1 ik (rank1 bpk (cursorRank k)) - 1) 107 | where ik = interests k 108 | bpk = balancedParens k 109 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Succinct/Cursor/Load.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables #-} 2 | 3 | module HaskellWorks.Data.Xml.Succinct.Cursor.Load 4 | ( loadSlowCursor 5 | , loadFastCursor 6 | ) where 7 | 8 | import HaskellWorks.Data.Xml.Succinct.Cursor.Create 9 | import HaskellWorks.Data.Xml.Succinct.Cursor.Types 10 | 11 | import qualified Data.ByteString as BS 12 | 13 | -- | Load an XML file into memory and return a raw cursor initialised to the 14 | -- start of the XML document. 15 | loadSlowCursor :: FilePath -> IO SlowCursor 16 | loadSlowCursor = fmap byteStringAsSlowCursor . BS.readFile 17 | 18 | -- | Load an XML file into memory and return a query-optimised cursor initialised 19 | -- to the start of the XML document. 20 | loadFastCursor :: FilePath -> IO FastCursor 21 | loadFastCursor = fmap byteStringAsFastCursor . BS.readFile 22 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Succinct/Cursor/MMap.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE ScopedTypeVariables #-} 3 | 4 | module HaskellWorks.Data.Xml.Succinct.Cursor.MMap 5 | ( SlowCursor 6 | , FastCursor 7 | , mmapSlowCursor 8 | , mmapFastCursor 9 | ) where 10 | 11 | import Data.Word 12 | import Foreign.ForeignPtr 13 | import HaskellWorks.Data.BalancedParens.RangeMin2 14 | import HaskellWorks.Data.BalancedParens.Simple 15 | import HaskellWorks.Data.Bits.BitShown 16 | import HaskellWorks.Data.RankSelect.CsPoppy1 17 | import HaskellWorks.Data.Vector.Storable 18 | import HaskellWorks.Data.Xml.Succinct.Cursor 19 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml 20 | import HaskellWorks.Data.Xml.Succinct.Cursor.Types 21 | 22 | import qualified Data.ByteString.Internal as BSI 23 | import qualified HaskellWorks.Data.Xml.Internal.ToIbBp64 as I 24 | import qualified System.IO.MMap as IO 25 | 26 | mmapSlowCursor :: FilePath -> IO SlowCursor 27 | mmapSlowCursor filePath = do 28 | (fptr :: ForeignPtr Word8, offset, size) <- IO.mmapFileForeignPtr filePath IO.ReadOnly Nothing 29 | let !bs = BSI.fromForeignPtr (castForeignPtr fptr) offset size 30 | let blankedXml = bsToBlankedXml bs 31 | let (ib, bp) = construct64UnzipN (fromIntegral size) (I.toIbBp64 blankedXml) 32 | let !cursor = XmlCursor 33 | { cursorText = bs 34 | , interests = BitShown ib 35 | , balancedParens = SimpleBalancedParens bp 36 | , cursorRank = 1 37 | } 38 | 39 | return cursor 40 | 41 | mmapFastCursor :: FilePath -> IO FastCursor 42 | mmapFastCursor filename = do 43 | -- Load the XML file into memory as a raw cursor. 44 | -- The raw XML data is `text`, and `ib` and `bp` are the indexes. 45 | -- `ib` and `bp` can be persisted to an index file for later use to avoid 46 | -- re-parsing the file. 47 | XmlCursor !text (BitShown !ib) (SimpleBalancedParens !bp) _ <- mmapSlowCursor filename 48 | let !bpCsPoppy = makeCsPoppy bp 49 | let !rangeMinMax = mkRangeMin2 bpCsPoppy 50 | let !ibCsPoppy = makeCsPoppy ib 51 | return $ XmlCursor text ibCsPoppy rangeMinMax 1 52 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Succinct/Cursor/Token.hs: -------------------------------------------------------------------------------- 1 | 2 | module HaskellWorks.Data.Xml.Succinct.Cursor.Token 3 | ( xmlTokenAt 4 | ) where 5 | 6 | import Data.ByteString (ByteString) 7 | import HaskellWorks.Data.Bits.BitWise 8 | import HaskellWorks.Data.Drop 9 | import HaskellWorks.Data.Positioning 10 | import HaskellWorks.Data.RankSelect.Base.Rank1 11 | import HaskellWorks.Data.RankSelect.Base.Select1 12 | import HaskellWorks.Data.Xml.Succinct.Cursor.Internal 13 | import HaskellWorks.Data.Xml.Token.Tokenize 14 | import Prelude hiding (drop) 15 | 16 | import qualified Data.Attoparsec.ByteString.Char8 as ABC 17 | 18 | xmlTokenAt :: (Rank1 w, Select1 v, TestBit w) => XmlCursor ByteString v w -> Maybe (XmlToken String Double) 19 | xmlTokenAt k = if balancedParens k .?. lastPositionOf (cursorRank k) 20 | then case ABC.parse parseXmlToken (drop (toCount (xmlCursorPos k)) (cursorText k)) of 21 | ABC.Fail {} -> error "Failed to parse token in cursor" 22 | ABC.Partial _ -> error "Failed to parse token in cursor" 23 | ABC.Done _ r -> Just r 24 | else Nothing 25 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Succinct/Cursor/Types.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ScopedTypeVariables #-} 2 | 3 | module HaskellWorks.Data.Xml.Succinct.Cursor.Types 4 | ( SlowCursor 5 | , FastCursor 6 | ) where 7 | 8 | import Data.Word 9 | import HaskellWorks.Data.BalancedParens.RangeMin2 10 | import HaskellWorks.Data.BalancedParens.Simple 11 | import HaskellWorks.Data.Bits.BitShown 12 | import HaskellWorks.Data.RankSelect.CsPoppy1 13 | import HaskellWorks.Data.Xml.Succinct.Cursor 14 | 15 | import qualified Data.ByteString as BS 16 | import qualified Data.Vector.Storable as DVS 17 | 18 | type SlowCursor = XmlCursor BS.ByteString (BitShown (DVS.Vector Word64)) (SimpleBalancedParens (DVS.Vector Word64)) 19 | 20 | type FastCursor = XmlCursor BS.ByteString CsPoppy1 (RangeMin2 CsPoppy1) 21 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Succinct/Index.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | {-# LANGUAGE InstanceSigs #-} 5 | {-# LANGUAGE MultiParamTypeClasses #-} 6 | {-# LANGUAGE OverloadedStrings #-} 7 | 8 | module HaskellWorks.Data.Xml.Succinct.Index 9 | ( XmlIndex(..) 10 | , XmlIndexAt(..) 11 | ) 12 | where 13 | 14 | import Control.Arrow 15 | import Data.Text (Text) 16 | import HaskellWorks.Data.Bits.BitWise 17 | import HaskellWorks.Data.Drop 18 | import HaskellWorks.Data.Positioning 19 | import HaskellWorks.Data.RankSelect.Base.Rank0 20 | import HaskellWorks.Data.RankSelect.Base.Rank1 21 | import HaskellWorks.Data.RankSelect.Base.Select1 22 | import HaskellWorks.Data.TreeCursor 23 | import HaskellWorks.Data.Uncons 24 | import HaskellWorks.Data.Xml.CharLike 25 | import HaskellWorks.Data.Xml.Grammar 26 | import HaskellWorks.Data.Xml.Succinct 27 | import Prelude hiding (drop) 28 | 29 | import qualified Data.Attoparsec.ByteString.Char8 as ABC 30 | import qualified Data.ByteString as BS 31 | import qualified Data.List as L 32 | import qualified Data.Text as T 33 | import qualified HaskellWorks.Data.BalancedParens as BP 34 | 35 | data XmlIndex 36 | = XmlIndexDocument [XmlIndex] 37 | | XmlIndexElement Text [XmlIndex] 38 | | XmlIndexCData BS.ByteString 39 | | XmlIndexComment BS.ByteString 40 | | XmlIndexMeta Text [XmlIndex] 41 | | XmlIndexAttrList [XmlIndex] 42 | | XmlIndexValue BS.ByteString 43 | | XmlIndexAttrName BS.ByteString 44 | | XmlIndexAttrValue BS.ByteString 45 | | XmlIndexError Text 46 | deriving (Eq, Show) 47 | 48 | data XmlIndexState 49 | = InAttrList 50 | | InElement 51 | | Unknown 52 | deriving (Eq, Show) 53 | 54 | class XmlIndexAt a where 55 | xmlIndexAt :: a -> XmlIndex 56 | 57 | pos :: (Select1 v, Rank1 w) => XmlCursor t v w -> Position 58 | pos c = lastPositionOf (select1 (interests c) (rank1 (balancedParens c) (cursorRank c))) 59 | 60 | remText :: (Drop v, Select1 v1, Rank1 w) => XmlCursor v v1 w -> v 61 | remText c = drop (toCount (pos c)) (cursorText c) 62 | 63 | instance (BP.BalancedParens w, Rank0 w, Rank1 w, Select1 v, TestBit w) => XmlIndexAt (XmlCursor BS.ByteString v w) where 64 | xmlIndexAt :: XmlCursor BS.ByteString v w -> XmlIndex 65 | xmlIndexAt = getIndexAt Unknown 66 | 67 | 68 | getIndexAt :: (BP.BalancedParens w, Rank0 w, Rank1 w, Select1 v, TestBit w) => XmlIndexState -> XmlCursor BS.ByteString v w -> XmlIndex 69 | getIndexAt state k = case uncons remainder of 70 | Just (!c, cs) | isElementStart c -> parseElem cs 71 | Just (!c, _ ) | isSpace c -> XmlIndexAttrList $ mapValuesFrom InAttrList (firstChild k) 72 | Just (!c, _ ) | isAttribute && isQuote c -> XmlIndexAttrValue remainder 73 | Just _ | isAttribute -> XmlIndexAttrName remainder 74 | Just _ -> XmlIndexValue remainder 75 | Nothing -> XmlIndexError "End of data" 76 | where remainder = remText k 77 | mapValuesFrom s = L.unfoldr (fmap (getIndexAt s &&& nextSibling)) 78 | isAttribute = case state of 79 | InAttrList -> True 80 | InElement -> False 81 | Unknown -> case remText <$> parent k >>= uncons of 82 | Just (!c, _) | isSpace c -> True 83 | _ -> False 84 | 85 | parseElem bs = 86 | case ABC.parse parseXmlElement bs of 87 | ABC.Fail {} -> decodeErr "Unable to parse element name" bs 88 | ABC.Partial _ -> decodeErr "Unexpected end of string" bs 89 | ABC.Done i r -> case r of 90 | XmlElementTypeCData -> XmlIndexCData i 91 | XmlElementTypeComment -> XmlIndexComment i 92 | XmlElementTypeMeta s -> XmlIndexMeta s (mapValuesFrom InElement $ firstChild k) 93 | XmlElementTypeElement s -> XmlIndexElement s (mapValuesFrom InElement $ firstChild k) 94 | XmlElementTypeDocument -> XmlIndexDocument (mapValuesFrom InElement (firstChild k) <> mapValuesFrom InElement (nextSibling k)) 95 | 96 | decodeErr :: String -> BS.ByteString -> XmlIndex 97 | decodeErr reason bs = XmlIndexError . T.pack $ reason <>": " <> show (BS.take 20 bs) <> "...'" 98 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Token.hs: -------------------------------------------------------------------------------- 1 | module HaskellWorks.Data.Xml.Token 2 | ( module X 3 | ) where 4 | 5 | import HaskellWorks.Data.Xml.Token.Types as X 6 | import HaskellWorks.Data.Xml.Token.Tokenize as X 7 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Token/Tokenize.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | {-# LANGUAGE FlexibleInstances #-} 3 | {-# LANGUAGE MultiParamTypeClasses #-} 4 | {-# LANGUAGE OverloadedStrings #-} 5 | {-# LANGUAGE TypeSynonymInstances #-} 6 | 7 | module HaskellWorks.Data.Xml.Token.Tokenize 8 | ( IsChar(..) 9 | , XmlToken(..) 10 | , ParseXml(..) 11 | ) where 12 | 13 | import Control.Applicative 14 | import Data.Bits 15 | import Data.Char 16 | import Data.Word 17 | import Data.Word8 18 | import HaskellWorks.Data.Char.IsChar 19 | import HaskellWorks.Data.Parser as P 20 | import HaskellWorks.Data.Xml.Token.Types 21 | 22 | import qualified Data.Attoparsec.ByteString.Char8 as BC 23 | import qualified Data.Attoparsec.Combinator as AC 24 | import qualified Data.Attoparsec.Types as T 25 | import qualified Data.ByteString as BS 26 | 27 | hexDigitNumeric :: P.Parser t Word8 => T.Parser t Int 28 | hexDigitNumeric = do 29 | c <- satisfyChar (\c -> '0' <= c && c <= '9') 30 | return $ ord c - ord '0' 31 | 32 | hexDigitAlphaLower :: P.Parser t Word8 => T.Parser t Int 33 | hexDigitAlphaLower = do 34 | c <- satisfyChar (\c -> 'a' <= c && c <= 'z') 35 | return $ ord c - ord 'a' + 10 36 | 37 | hexDigitAlphaUpper :: P.Parser t Word8 => T.Parser t Int 38 | hexDigitAlphaUpper = do 39 | c <- satisfyChar (\c -> 'A' <= c && c <= 'Z') 40 | return $ ord c - ord 'A' + 10 41 | 42 | hexDigit :: P.Parser t Word8 => T.Parser t Int 43 | hexDigit = hexDigitNumeric <|> hexDigitAlphaLower <|> hexDigitAlphaUpper 44 | 45 | class ParseXml t s d where 46 | parseXmlTokenString :: T.Parser t (XmlToken s d) 47 | parseXmlToken :: T.Parser t (XmlToken s d) 48 | parseXmlTokenBraceL :: T.Parser t (XmlToken s d) 49 | parseXmlTokenBraceR :: T.Parser t (XmlToken s d) 50 | parseXmlTokenBracketL :: T.Parser t (XmlToken s d) 51 | parseXmlTokenBracketR :: T.Parser t (XmlToken s d) 52 | parseXmlTokenComma :: T.Parser t (XmlToken s d) 53 | parseXmlTokenColon :: T.Parser t (XmlToken s d) 54 | parseXmlTokenWhitespace :: T.Parser t (XmlToken s d) 55 | parseXmlTokenNull :: T.Parser t (XmlToken s d) 56 | parseXmlTokenBoolean :: T.Parser t (XmlToken s d) 57 | parseXmlTokenDouble :: T.Parser t (XmlToken s d) 58 | 59 | parseXmlToken = 60 | parseXmlTokenString <|> 61 | parseXmlTokenBraceL <|> 62 | parseXmlTokenBraceR <|> 63 | parseXmlTokenBracketL <|> 64 | parseXmlTokenBracketR <|> 65 | parseXmlTokenComma <|> 66 | parseXmlTokenColon <|> 67 | parseXmlTokenWhitespace <|> 68 | parseXmlTokenNull <|> 69 | parseXmlTokenBoolean <|> 70 | parseXmlTokenDouble 71 | 72 | instance ParseXml BS.ByteString String Double where 73 | parseXmlTokenBraceL = string "{" >> return XmlTokenBraceL 74 | parseXmlTokenBraceR = string "}" >> return XmlTokenBraceR 75 | parseXmlTokenBracketL = string "[" >> return XmlTokenBracketL 76 | parseXmlTokenBracketR = string "]" >> return XmlTokenBracketR 77 | parseXmlTokenComma = string "," >> return XmlTokenComma 78 | parseXmlTokenColon = string ":" >> return XmlTokenColon 79 | parseXmlTokenNull = string "null" >> return XmlTokenNull 80 | parseXmlTokenDouble = XmlTokenNumber <$> rational 81 | 82 | parseXmlTokenString = do 83 | _ <- string "\"" 84 | value <- many (verbatimChar <|> escapedChar <|> escapedCode) 85 | _ <- string "\"" 86 | return $ XmlTokenString value 87 | where 88 | verbatimChar = satisfyChar (BC.notInClass "\"\\") "invalid string character" 89 | escapedChar = do 90 | _ <- string "\\" 91 | ( char '"' >> return '"' ) <|> 92 | ( char 'b' >> return '\b' ) <|> 93 | ( char 'n' >> return '\n' ) <|> 94 | ( char 'f' >> return '\f' ) <|> 95 | ( char 'r' >> return '\r' ) <|> 96 | ( char 't' >> return '\t' ) <|> 97 | ( char '\\' >> return '\\' ) <|> 98 | ( char '\'' >> return '\'' ) <|> 99 | ( char '/' >> return '/' ) 100 | escapedCode :: T.Parser BS.ByteString Char 101 | escapedCode = do 102 | _ <- string "\\u" 103 | a <- hexDigit 104 | b <- hexDigit 105 | c <- hexDigit 106 | d <- hexDigit 107 | return . chr $ a `shift` 24 .|. b `shift` 16 .|. c `shift` 8 .|. d 108 | 109 | parseXmlTokenWhitespace = do 110 | _ <- AC.many1' $ BC.choice [string " ", string "\t", string "\n", string "\r"] 111 | return XmlTokenWhitespace 112 | 113 | parseXmlTokenBoolean = true <|> false 114 | where 115 | true = string "true" >> return (XmlTokenBoolean True) 116 | false = string "false" >> return (XmlTokenBoolean False) 117 | 118 | instance ParseXml BS.ByteString BS.ByteString Double where 119 | parseXmlTokenBraceL = string "{" >> return XmlTokenBraceL 120 | parseXmlTokenBraceR = string "}" >> return XmlTokenBraceR 121 | parseXmlTokenBracketL = string "[" >> return XmlTokenBracketL 122 | parseXmlTokenBracketR = string "]" >> return XmlTokenBracketR 123 | parseXmlTokenComma = string "," >> return XmlTokenComma 124 | parseXmlTokenColon = string ":" >> return XmlTokenColon 125 | parseXmlTokenNull = string "null" >> return XmlTokenNull 126 | parseXmlTokenDouble = XmlTokenNumber <$> rational 127 | 128 | parseXmlTokenString = do 129 | _ <- string "\"" 130 | value <- many (verbatimChar <|> escapedChar <|> escapedCode) 131 | _ <- string "\"" 132 | return . XmlTokenString $ BS.pack value 133 | where 134 | word :: Word8 -> T.Parser BS.ByteString Word8 135 | word w = satisfy (== w) 136 | verbatimChar :: T.Parser BS.ByteString Word8 137 | verbatimChar = satisfy (\w -> w /= _quotedbl && w /= _backslash) -- "invalid string character" 138 | escapedChar :: T.Parser BS.ByteString Word8 139 | escapedChar = do 140 | _ <- string "\\" 141 | ( word _quotedbl >> return _quotedbl ) <|> 142 | ( word _b >> return 0x08 ) <|> 143 | ( word _n >> return _lf ) <|> 144 | ( word _f >> return _np ) <|> 145 | ( word _r >> return _cr ) <|> 146 | ( word _t >> return _tab ) <|> 147 | ( word _backslash >> return _backslash ) <|> 148 | ( word _quotesingle >> return _quotesingle ) <|> 149 | ( word _slash >> return _slash ) 150 | escapedCode :: T.Parser BS.ByteString Word8 151 | escapedCode = do 152 | _ <- string "\\u" 153 | a <- hexDigit 154 | b <- hexDigit 155 | c <- hexDigit 156 | d <- hexDigit 157 | return . fromIntegral $ a `shift` 24 .|. b `shift` 16 .|. c `shift` 8 .|. d 158 | 159 | parseXmlTokenWhitespace = do 160 | _ <- AC.many1' $ BC.choice [string " ", string "\t", string "\n", string "\r"] 161 | return XmlTokenWhitespace 162 | 163 | parseXmlTokenBoolean = true <|> false 164 | where 165 | true = string "true" >> return (XmlTokenBoolean True) 166 | false = string "false" >> return (XmlTokenBoolean False) 167 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Token/Types.hs: -------------------------------------------------------------------------------- 1 | module HaskellWorks.Data.Xml.Token.Types (XmlToken(..)) where 2 | 3 | data XmlToken s d 4 | = XmlTokenBraceL 5 | | XmlTokenBraceR 6 | | XmlTokenBracketL 7 | | XmlTokenBracketR 8 | | XmlTokenComma 9 | | XmlTokenColon 10 | | XmlTokenWhitespace 11 | | XmlTokenString s 12 | | XmlTokenBoolean Bool 13 | | XmlTokenNumber d 14 | | XmlTokenNull 15 | deriving (Eq, Show) 16 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Type.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleInstances #-} 2 | {-# LANGUAGE InstanceSigs #-} 3 | {-# LANGUAGE MultiParamTypeClasses #-} 4 | 5 | module HaskellWorks.Data.Xml.Type where 6 | 7 | import Data.Char 8 | import Data.Word8 as W8 9 | import HaskellWorks.Data.Bits.BitWise 10 | import HaskellWorks.Data.Drop 11 | import HaskellWorks.Data.Positioning 12 | import HaskellWorks.Data.RankSelect.Base.Rank0 13 | import HaskellWorks.Data.RankSelect.Base.Rank1 14 | import HaskellWorks.Data.RankSelect.Base.Select1 15 | import HaskellWorks.Data.Xml.Succinct 16 | import Prelude hiding (drop) 17 | 18 | import qualified Data.ByteString as BS 19 | import qualified HaskellWorks.Data.BalancedParens as BP 20 | 21 | {- HLINT ignore "Reduce duplication" -} 22 | 23 | data XmlType 24 | = XmlTypeElement 25 | | XmlTypeAttrList 26 | | XmlTypeToken 27 | deriving (Eq, Show) 28 | 29 | class XmlTypeAt a where 30 | xmlTypeAtPosition :: Position -> a -> Maybe XmlType 31 | xmlTypeAt :: a -> Maybe XmlType 32 | 33 | instance (BP.BalancedParens w, Rank0 w, Rank1 w, Select1 v, TestBit w) => XmlTypeAt (XmlCursor String v w) where 34 | xmlTypeAtPosition p k = case drop (toCount p) (cursorText k) of 35 | c:_ | fromIntegral (ord c) == _less -> Just XmlTypeElement 36 | c:_ | W8.isSpace $ fromIntegral (ord c) -> Just XmlTypeAttrList 37 | _ -> Just XmlTypeToken 38 | 39 | xmlTypeAt k = xmlTypeAtPosition p k 40 | where p = lastPositionOf (select1 ik (rank1 bpk (cursorRank k))) 41 | ik = interests k 42 | bpk = balancedParens k 43 | 44 | instance (BP.BalancedParens w, Rank0 w, Rank1 w, Select1 v, TestBit w) => XmlTypeAt (XmlCursor BS.ByteString v w) where 45 | xmlTypeAtPosition p k = case BS.uncons (drop (toCount p) (cursorText k)) of 46 | Just (c, _) | c == _less -> Just XmlTypeElement 47 | Just (c, _) | W8.isSpace c -> Just XmlTypeAttrList 48 | _ -> Just XmlTypeToken 49 | 50 | xmlTypeAt k = xmlTypeAtPosition p k 51 | where p = lastPositionOf (select1 ik (rank1 bpk (cursorRank k))) 52 | ik = interests k 53 | bpk = balancedParens k 54 | -------------------------------------------------------------------------------- /src/HaskellWorks/Data/Xml/Value.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleInstances #-} 2 | {-# LANGUAGE InstanceSigs #-} 3 | {-# LANGUAGE MultiParamTypeClasses #-} 4 | {-# LANGUAGE OverloadedStrings #-} 5 | {-# LANGUAGE ScopedTypeVariables #-} 6 | {-# LANGUAGE TemplateHaskell #-} 7 | {-# LANGUAGE TupleSections #-} 8 | 9 | module HaskellWorks.Data.Xml.Value 10 | ( Value(..) 11 | , HasValue(..) 12 | , _XmlDocument 13 | , _XmlText 14 | , _XmlElement 15 | , _XmlCData 16 | , _XmlComment 17 | , _XmlMeta 18 | , _XmlError 19 | ) where 20 | 21 | import Control.Lens 22 | import Data.Text (Text) 23 | import HaskellWorks.Data.Xml.Internal.Show 24 | import HaskellWorks.Data.Xml.RawDecode 25 | import HaskellWorks.Data.Xml.RawValue 26 | 27 | data Value 28 | = XmlDocument 29 | { _childNodes :: [Value] 30 | } 31 | | XmlText 32 | { _textValue :: Text 33 | } 34 | | XmlElement 35 | { _name :: Text 36 | , _attributes :: [(Text, Text)] 37 | , _childNodes :: [Value] 38 | } 39 | | XmlCData 40 | { _cdata :: Text 41 | } 42 | | XmlComment 43 | { _comment :: Text 44 | } 45 | | XmlMeta 46 | { _name :: Text 47 | , _childNodes :: [Value] 48 | } 49 | | XmlError 50 | { _errorMessage :: Text 51 | } 52 | deriving (Eq, Show) 53 | 54 | makeClassy ''Value 55 | makePrisms ''Value 56 | 57 | instance RawDecode Value where 58 | rawDecode (RawDocument rvs ) = XmlDocument (rawDecode <$> rvs) 59 | rawDecode (RawText text ) = XmlText text 60 | rawDecode (RawElement n cs ) = mkXmlElement n cs 61 | rawDecode (RawCData text ) = XmlCData text 62 | rawDecode (RawComment text ) = XmlComment text 63 | rawDecode (RawMeta n cs ) = XmlMeta n (rawDecode <$> cs) 64 | rawDecode (RawAttrName nameValue ) = XmlError ("Can't decode attribute name: " <> nameValue) 65 | rawDecode (RawAttrValue attrValue ) = XmlError ("Can't decode attribute value: " <> attrValue) 66 | rawDecode (RawAttrList as ) = XmlError ("Can't decode attribute list: " <> tshow as) 67 | rawDecode (RawError msg ) = XmlError msg 68 | 69 | mkXmlElement :: Text -> [RawValue] -> Value 70 | mkXmlElement n (RawAttrList as:cs) = XmlElement n (mkAttrs as) (rawDecode <$> cs) 71 | mkXmlElement n cs = XmlElement n [] (rawDecode <$> cs) 72 | 73 | mkAttrs :: [RawValue] -> [(Text, Text)] 74 | mkAttrs (RawAttrName n:RawAttrValue v:cs) = (n, v):mkAttrs cs 75 | mkAttrs (_:cs) = mkAttrs cs 76 | mkAttrs [] = [] 77 | -------------------------------------------------------------------------------- /test/HaskellWorks/Data/Xml/Internal/BlankSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE ScopedTypeVariables #-} 3 | 4 | module HaskellWorks.Data.Xml.Internal.BlankSpec (spec) where 5 | 6 | import Data.Char 7 | import HaskellWorks.Data.ByteString 8 | import HaskellWorks.Data.Xml.Internal.Blank 9 | import HaskellWorks.Hspec.Hedgehog 10 | import Hedgehog 11 | import Test.Hspec 12 | 13 | import qualified Data.ByteString as BS 14 | import qualified Hedgehog.Gen as G 15 | import qualified Hedgehog.Range as R 16 | 17 | {- HLINT ignore "Redundant do" -} 18 | {- HLINT ignore "Reduce duplication" -} 19 | 20 | whenBlankedXmlShouldBe :: BS.ByteString -> BS.ByteString -> Spec 21 | whenBlankedXmlShouldBe original expected = do 22 | it (show original <> " when blanked xml should be " <> show expected) $ requireTest $ do 23 | BS.concat (blankXml [original]) === expected 24 | 25 | repeatBS :: Int -> BS.ByteString -> BS.ByteString 26 | repeatBS n bs | n > 0 = bs <> repeatBS (n - 1) bs 27 | repeatBS _ _ = BS.empty 28 | 29 | noSpaces :: BS.ByteString -> BS.ByteString 30 | noSpaces = BS.filter (/= fromIntegral (ord ' ')) 31 | 32 | data Annotated a b = Annotated a b deriving Show 33 | 34 | instance Eq a => Eq (Annotated a b) where 35 | (Annotated a _) == (Annotated b _) = a == b 36 | 37 | spec :: Spec 38 | spec = describe "HaskellWorks.Data.Xml.Internal.BlankSpec" $ do 39 | describe "Can blank XML" $ do 40 | "" `whenBlankedXmlShouldBe` "< >" 41 | "" `whenBlankedXmlShouldBe` "< >" 42 | "text" `whenBlankedXmlShouldBe` "< t >" 43 | " text " `whenBlankedXmlShouldBe` "< t >" 44 | "" `whenBlankedXmlShouldBe` "< ()>" 45 | "" `whenBlankedXmlShouldBe` "< (a v )>" 46 | "" `whenBlankedXmlShouldBe` "< (a v )>" 47 | "" `whenBlankedXmlShouldBe` "< (a v )>" 48 | "" `whenBlankedXmlShouldBe` "< (a v a v )>" 49 | "text" `whenBlankedXmlShouldBe` "< (a v a v )t >" 50 | "" `whenBlankedXmlShouldBe` "< (a v a v )>" 51 | "" `whenBlankedXmlShouldBe` "< (a v )< > >" 52 | "test" `whenBlankedXmlShouldBe` "< (a v )< t > >" 53 | " text bold " `whenBlankedXmlShouldBe` "< t < t > >" 54 | " text bold uuu" `whenBlankedXmlShouldBe` "< t < t > t >" 55 | "" `whenBlankedXmlShouldBe` "< (a v )>" 56 | " " `whenBlankedXmlShouldBe` "< [ ] >" 57 | " " `whenBlankedXmlShouldBe` "< [ ] >" 58 | 59 | " " `whenBlankedXmlShouldBe` "< [ ] >" 60 | "" `whenBlankedXmlShouldBe` "< (a v a v )>" 61 | 62 | "]>" `whenBlankedXmlShouldBe` "[ [ ] ]" 64 | 65 | "Hello,\ 66 | \ world!]]>" `whenBlankedXmlShouldBe` "< [ ] >" 67 | 68 | "" `whenBlankedXmlShouldBe` "< [ ] >" 69 | "00" `whenBlankedXmlShouldBe` "< < t >< > >" 70 | "0" `whenBlankedXmlShouldBe` "< < t >< > >" 71 | 72 | it "Can blank across chunk boundaries with basic tags" $ requireTest $ do 73 | let inputOriginalPrefix = "\n\n " 74 | let inputOriginalSuffix = "\n \n \n \n \n \n\n" 75 | let inputOriginal = inputOriginalPrefix <> inputOriginalSuffix 76 | let inputOriginalChunked = chunkedBy 16 inputOriginal 77 | let inputOriginalBlanked = blankXml inputOriginalChunked 78 | 79 | n <- forAll $ G.int (R.linear 0 16) 80 | 81 | let inputShifted = inputOriginalPrefix <> repeatBS n " " <> inputOriginalSuffix 82 | let inputShiftedChunked = chunkedBy 16 inputShifted 83 | let inputShiftedBlanked = blankXml inputShiftedChunked 84 | 85 | noSpaces (BS.concat inputShiftedBlanked) === noSpaces (BS.concat inputOriginalBlanked) 86 | it "Can blank across chunk boundaries with auto-close tags" $ requireTest $ do 87 | let inputOriginalPrefix = "" 88 | let inputOriginalSuffix = "\n" 89 | let inputOriginal = inputOriginalPrefix <> inputOriginalSuffix 90 | let inputOriginalChunked = chunkedBy 16 inputOriginal 91 | let inputOriginalBlanked = blankXml inputOriginalChunked 92 | 93 | n <- forAll $ G.int (R.linear 0 16) 94 | 95 | let inputShifted = inputOriginalPrefix <> repeatBS n " " <> inputOriginalSuffix 96 | let inputShiftedChunked = chunkedBy 16 inputShifted 97 | let inputShiftedBlanked = blankXml inputShiftedChunked 98 | 99 | -- putStrLn $ show (BS.concat inputShiftedBlanked) <> " vs " <> show (BS.concat inputOriginalBlanked) 100 | let actual = Annotated (noSpaces (BS.concat inputShiftedBlanked )) (inputShiftedBlanked, n) 101 | let expected = Annotated (noSpaces (BS.concat inputOriginalBlanked)) (inputOriginalBlanked, n) 102 | 103 | actual === expected 104 | it "Can blank across chunk boundaries with auto-close tags" $ requireTest $ do 105 | let inputOriginalPrefix = "" 106 | let inputOriginalSuffix = "\n" 107 | let inputOriginal = inputOriginalPrefix <> inputOriginalSuffix 108 | let inputOriginalChunked = chunkedBy 16 inputOriginal 109 | let inputOriginalBlanked = blankXml inputOriginalChunked 110 | 111 | let n = 15 112 | let inputShifted = inputOriginalPrefix <> repeatBS n " " <> inputOriginalSuffix 113 | let inputShiftedChunked = chunkedBy 16 inputShifted 114 | let inputShiftedBlanked = blankXml inputShiftedChunked 115 | 116 | -- putStrLn $ show (BS.concat inputShiftedBlanked) <> " vs " <> show (BS.concat inputOriginalBlanked) 117 | let actual = Annotated (noSpaces (BS.concat inputShiftedBlanked )) (inputShiftedBlanked, n) 118 | let expected = Annotated (noSpaces (BS.concat inputOriginalBlanked)) (inputOriginalBlanked, n) 119 | 120 | actual === expected 121 | -------------------------------------------------------------------------------- /test/HaskellWorks/Data/Xml/RawValueSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ExplicitForAll #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | {-# LANGUAGE InstanceSigs #-} 5 | {-# LANGUAGE MonoLocalBinds #-} 6 | {-# LANGUAGE MultiParamTypeClasses #-} 7 | {-# LANGUAGE NoMonomorphismRestriction #-} 8 | {-# LANGUAGE OverloadedStrings #-} 9 | {-# LANGUAGE ScopedTypeVariables #-} 10 | 11 | {-# OPTIONS_GHC -fno-warn-missing-signatures #-} 12 | 13 | module HaskellWorks.Data.Xml.RawValueSpec (spec) where 14 | 15 | import Control.Monad 16 | import Data.String 17 | import Data.Text (Text) 18 | import Data.Word 19 | import HaskellWorks.Data.BalancedParens.BalancedParens 20 | import HaskellWorks.Data.BalancedParens.Simple 21 | import HaskellWorks.Data.Bits.BitShown 22 | import HaskellWorks.Data.Bits.BitWise 23 | import HaskellWorks.Data.RankSelect.Base.Rank0 24 | import HaskellWorks.Data.RankSelect.Base.Rank1 25 | import HaskellWorks.Data.RankSelect.Base.Select1 26 | import HaskellWorks.Data.RankSelect.Poppy512 27 | import HaskellWorks.Data.Xml.RawValue 28 | import HaskellWorks.Data.Xml.Succinct.Cursor as C 29 | import HaskellWorks.Data.Xml.Succinct.Index 30 | import HaskellWorks.Hspec.Hedgehog 31 | import Hedgehog 32 | import Test.Hspec 33 | 34 | import qualified Data.ByteString as BS 35 | import qualified Data.Vector.Storable as DVS 36 | import qualified HaskellWorks.Data.TreeCursor as TC 37 | 38 | {- HLINT ignore "Redundant do" -} 39 | {- HLINT ignore "Redundant return" -} 40 | {- HLINT ignore "Reduce duplication" -} 41 | 42 | fc = TC.firstChild 43 | ns = TC.nextSibling 44 | 45 | attrs :: [(Text, Text)] -> RawValue 46 | attrs as = RawAttrList $ as >>= (\(k, v) -> [RawAttrName k, RawAttrValue v]) 47 | 48 | spec :: Spec 49 | spec = describe "HaskellWorks.Data.Xml.ValueSpec" $ do 50 | genSpec "DVS.Vector Word8" (undefined :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word8)) (SimpleBalancedParens (DVS.Vector Word8))) 51 | genSpec "DVS.Vector Word16" (undefined :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word16)) (SimpleBalancedParens (DVS.Vector Word16))) 52 | genSpec "DVS.Vector Word32" (undefined :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word32)) (SimpleBalancedParens (DVS.Vector Word32))) 53 | genSpec "DVS.Vector Word64" (undefined :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word64)) (SimpleBalancedParens (DVS.Vector Word64))) 54 | genSpec "Poppy512" (undefined :: XmlCursor BS.ByteString Poppy512 (SimpleBalancedParens (DVS.Vector Word64))) 55 | 56 | rawValueVia :: XmlIndexAt (XmlCursor BS.ByteString t u) 57 | => Maybe (XmlCursor BS.ByteString t u) -> RawValue 58 | rawValueVia mk = case mk of 59 | Just k -> rawValueAt (xmlIndexAt k) --either (\(DecodeError e) -> XmlError e) id (rawValueAt <$> xmlIndexAt k) 60 | Nothing -> RawError "No such element" 61 | 62 | genSpec :: forall t u. 63 | ( Show t 64 | , Select1 t 65 | , Show u 66 | , Rank0 u 67 | , Rank1 u 68 | , BalancedParens u 69 | , TestBit u 70 | , IsString (XmlCursor BS.ByteString t u) 71 | ) 72 | => String -> XmlCursor BS.ByteString t u -> SpecWith () 73 | genSpec t _ = do 74 | describe ("XML cursor of type " <> t) $ do 75 | let forXml (cursor :: XmlCursor BS.ByteString t u) f = describe ("of value " <> show cursor) (f cursor) 76 | 77 | forXml "" $ \cursor -> do 78 | it "should have correct value" $ requireTest $ rawValueVia (Just cursor) === RawElement "a" [] 79 | 80 | forXml "" $ \cursor -> do 81 | it "should have correct value" $ requireTest $ rawValueVia (Just cursor) === 82 | RawElement "a" [attrs [("attr", "value")]] 83 | 84 | forXml "" $ \cursor -> do 85 | it "should have correct value" $ requireTest $ rawValueVia (Just cursor) === 86 | RawElement "a" [attrs [("attr", "value")], 87 | RawElement "b" [attrs [("attr", "value")]]] 88 | 89 | forXml "value text" $ \cursor -> do 90 | it "should have correct value" $ requireTest $ rawValueVia (Just cursor) === 91 | RawElement "a" [RawText "value text"] 92 | 93 | forXml "" $ \cursor -> do 94 | it "should parse space separared comment" $ requireTest $ rawValueVia (Just cursor) === 95 | RawComment " some comment " 96 | 97 | forXml "" $ \cursor -> do 98 | it "should parse space separared comment" $ requireTest $ rawValueVia (Just cursor) === 99 | RawComment "some comment ->" 100 | 101 | forXml " tag]]>" $ \cursor -> do 102 | it "should parse cdata data" $ requireTest $ rawValueVia (Just cursor) === 103 | RawCData "a
tag" 104 | 105 | forXml "]>" $ \cursor -> do 106 | it "should parse metas" $ requireTest $ rawValueVia (Just cursor) === 107 | RawMeta "DOCTYPE" [RawMeta "ELEMENT" []] 108 | 109 | forXml "free" $ \cursor -> do 110 | it "should parse xml header" $ requireTest $ rawValueVia (Just cursor) === 111 | RawDocument [ 112 | attrs [("version", "1.0"), ("encoding", "UTF-8")], 113 | RawElement "a" [attrs [("text", "value")], 114 | RawText "free"]] 115 | 116 | it "navigate around" $ requireTest $ do 117 | rawValueVia (ns cursor) === RawElement "a" [attrs [("text", "value")], RawText "free"] 118 | rawValueVia ((ns >=> fc) cursor) === attrs [("text", "value")] 119 | rawValueVia ((ns >=> fc >=> fc) cursor) === RawAttrName "text" 120 | rawValueVia ((ns >=> fc >=> fc >=> ns) cursor) === RawAttrValue "value" 121 | rawValueVia ((ns >=> fc >=> ns) cursor) === RawText "free" 122 | -------------------------------------------------------------------------------- /test/HaskellWorks/Data/Xml/Succinct/Cursor/BalancedParensSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module HaskellWorks.Data.Xml.Succinct.Cursor.BalancedParensSpec 5 | ( spec 6 | ) where 7 | 8 | import Data.String 9 | import HaskellWorks.Data.Bits.BitShown 10 | import HaskellWorks.Data.ByteString 11 | import HaskellWorks.Data.Xml.Internal.BalancedParens 12 | import HaskellWorks.Data.Xml.Internal.Blank 13 | import HaskellWorks.Data.Xml.Internal.List 14 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml 15 | import HaskellWorks.Hspec.Hedgehog 16 | import Hedgehog 17 | import Test.Hspec 18 | 19 | import qualified Data.ByteString as BS 20 | 21 | {- HLINT ignore "Redundant do" -} 22 | 23 | spec :: Spec 24 | spec = describe "HaskellWorks.Data.Xml.Succinct.Cursor.BalancedParensSpec" $ do 25 | it "Blanking XML should work 1" $ requireTest $ do 26 | let blankedXml = BlankedXml [">"] 27 | let bp = BitShown $ BS.concat (compressWordAsBit (blankedXmlToBalancedParens (getBlankedXml blankedXml))) 28 | bp === fromString "11011000" 29 | it "Blanking XML should work 2" $ requireTest $ do 30 | let blankedXml = BlankedXml 31 | [ "<><><><><><><><>" 32 | , "<><><><><><><><>" 33 | ] 34 | let bp = BitShown $ BS.concat (compressWordAsBit (blankedXmlToBalancedParens (getBlankedXml blankedXml))) 35 | bp === fromString 36 | "1010101010101010\ 37 | \1010101010101010" 38 | 39 | let unchunkedInput = "\n\n \n \n \n \n \n \n\n" 40 | let chunkedInput = chunkedBy 15 unchunkedInput 41 | let chunkedBlank = blankXml chunkedInput 42 | 43 | let unchunkedBadInput = "\n\n \n \n \n \n \n \n\n" 44 | let chunkedBadInput = chunkedBy 15 unchunkedBadInput 45 | let chunkedBadBlank = blankXml chunkedBadInput 46 | 47 | it "Same input" $ requireTest $ do 48 | unchunkedInput === BS.concat chunkedInput 49 | 50 | it "Blanking XML should work 3" $ requireTest $ do 51 | let bp = BitShown $ BS.concat (compressWordAsBit (blankedXmlToBalancedParens chunkedBlank)) 52 | annotate $ "Good: " <> show chunkedBlank 53 | bp === fromString "11101010 10001101 01010100" 54 | 55 | it "Blanking XML should work 3" $ requireTest $ do 56 | let bp = BitShown $ BS.concat (compressWordAsBit (blankedXmlToBalancedParens chunkedBadBlank)) 57 | annotate $ "Bad: " <> show chunkedBadBlank 58 | bp === fromString "11101010 10001101 01010100" 59 | 60 | describe "Chunking works" $ do 61 | let document = "free" 62 | let whole = mkBlank 4096 document 63 | let chunked = mkBlank 15 document 64 | 65 | it "should BP the same with chanks" $ requireTest $ do 66 | BS.concat chunked === BS.concat whole 67 | 68 | it "should produce same bits" $ requireTest $ do 69 | BS.concat (mkBits chunked) === BS.concat (mkBits whole) 70 | 71 | 72 | mkBlank :: Int -> BS.ByteString -> [BS.ByteString] 73 | mkBlank csize bs = blankXml (chunkedBy csize bs) 74 | 75 | mkBits :: [BS.ByteString] -> [BS.ByteString] 76 | mkBits = compressWordAsBit . blankedXmlToBalancedParens 77 | -------------------------------------------------------------------------------- /test/HaskellWorks/Data/Xml/Succinct/Cursor/BlankedXmlSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXmlSpec 5 | ( spec 6 | ) where 7 | 8 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml 9 | import HaskellWorks.Hspec.Hedgehog 10 | import Hedgehog 11 | import Test.Hspec 12 | 13 | {- HLINT ignore "Redundant do" -} 14 | 15 | spec :: Spec 16 | spec = describe "HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXmlSpec" $ do 17 | describe "Blanking XML should work" $ do 18 | it "on strict bytestrings" $ requireTest $ do 19 | let input = "" 20 | let expected = "< < > >" 21 | let blankedXml = bsToBlankedXml input 22 | 23 | mconcat (unblankedXml blankedXml) === expected 24 | 25 | it "on lazy bytestrings" $ requireTest $ do 26 | let input = "" 27 | let expected = "< < > >" 28 | let blankedXml = lbsToBlankedXml input 29 | 30 | mconcat (unblankedXml blankedXml) === expected 31 | -------------------------------------------------------------------------------- /test/HaskellWorks/Data/Xml/Succinct/Cursor/InterestBitsSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module HaskellWorks.Data.Xml.Succinct.Cursor.InterestBitsSpec(spec) where 5 | 6 | import Data.String 7 | import Data.Word 8 | import HaskellWorks.Data.Bits.BitShown 9 | import HaskellWorks.Data.FromByteString 10 | import HaskellWorks.Data.Xml.Succinct.Cursor.BlankedXml 11 | import HaskellWorks.Data.Xml.Succinct.Cursor.InterestBits 12 | import HaskellWorks.Hspec.Hedgehog 13 | import Hedgehog 14 | import Test.Hspec 15 | 16 | import qualified Data.ByteString as BS 17 | import qualified Data.Vector.Storable as DVS 18 | 19 | {- HLINT ignore "Redundant do" -} 20 | 21 | interestBitsOf :: FromBlankedXml (XmlInterestBits a) => BS.ByteString -> a 22 | interestBitsOf = getXmlInterestBits . fromBlankedXml . bsToBlankedXml 23 | 24 | spec :: Spec 25 | spec = describe "HaskellWorks.Data.Xml.Succinct.Cursor.InterestBitsSpec" $ do 26 | it "Evaluating interest bits" $ requireTest $ do 27 | (interestBitsOf "" :: BitShown (DVS.Vector Word8)) === fromString "" 28 | (interestBitsOf " \n \r \t " :: BitShown (DVS.Vector Word8)) === fromString "00000000" 29 | (interestBitsOf "" :: BitShown (DVS.Vector Word8)) === fromString "10010000 00000000" 33 | (interestBitsOf " " :: BitShown (DVS.Vector Word8)) === fromString "01011010 00000000" 34 | (interestBitsOf " " :: BitShown (DVS.Vector Word8)) === fromString "01000000 00000000" 35 | (interestBitsOf " " 43 | , "< < " 44 | , "> >" 45 | ] 46 | annotate $ "Blanked: " <> show blanked 47 | let ib :: XmlInterestBits (BitShown (DVS.Vector Word8)) 48 | ib = XmlInterestBits (getXmlInterestBits (fromBlankedXml (BlankedXml blanked))) 49 | let moo :: [BS.ByteString] 50 | moo = blankedXmlToInterestBits blanked -- :: XmlInterestBits (BitShown (DVS.Vector Word8)) 51 | annotate $ "Moo: " <> show (BitShown . BS.unpack <$> moo) 52 | let actual = getXmlInterestBits ib :: BitShown (DVS.Vector Word8) 53 | let expected = fromString "10000110 00000010 00001000 00000100 00000001 00100000 00000000" 54 | 55 | actual === expected 56 | -------------------------------------------------------------------------------- /test/HaskellWorks/Data/Xml/Succinct/CursorSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ExplicitForAll #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | {-# LANGUAGE InstanceSigs #-} 5 | {-# LANGUAGE MultiParamTypeClasses #-} 6 | {-# LANGUAGE NoMonomorphismRestriction #-} 7 | {-# LANGUAGE OverloadedStrings #-} 8 | {-# LANGUAGE ScopedTypeVariables #-} 9 | {-# LANGUAGE TypeApplications #-} 10 | 11 | {-# OPTIONS_GHC -fno-warn-missing-signatures #-} 12 | 13 | module HaskellWorks.Data.Xml.Succinct.CursorSpec(spec) where 14 | 15 | import Control.Monad 16 | import Data.Word 17 | import HaskellWorks.Data.BalancedParens.BalancedParens 18 | import HaskellWorks.Data.BalancedParens.Simple 19 | import HaskellWorks.Data.Bits.BitShow 20 | import HaskellWorks.Data.Bits.BitShown 21 | import HaskellWorks.Data.Bits.BitWise 22 | import HaskellWorks.Data.RankSelect.Base.Rank0 23 | import HaskellWorks.Data.RankSelect.Base.Rank1 24 | import HaskellWorks.Data.RankSelect.Base.Select1 25 | import HaskellWorks.Data.RankSelect.Poppy512 26 | import HaskellWorks.Data.Xml.Succinct.Cursor as C 27 | import HaskellWorks.Data.Xml.Succinct.CursorSpec.Make 28 | import HaskellWorks.Data.Xml.Token 29 | import HaskellWorks.Hspec.Hedgehog 30 | import Hedgehog 31 | import Test.Hspec 32 | 33 | import qualified Data.ByteString as BS 34 | import qualified Data.Text as T 35 | import qualified Data.Text.Encoding as T 36 | import qualified Data.Vector.Storable as DVS 37 | import qualified HaskellWorks.Data.FromByteString as BS 38 | import qualified HaskellWorks.Data.TreeCursor as TC 39 | import qualified HaskellWorks.Data.Xml.Succinct.Cursor.Create as CC 40 | 41 | {- HLINT ignore "Redundant do" -} 42 | {- HLINT ignore "Redundant bracket" -} 43 | {- HLINT ignore "Reduce duplication" -} 44 | 45 | fc = TC.firstChild 46 | ns = TC.nextSibling 47 | pn = TC.parent 48 | cd = TC.depth 49 | ss = TC.subtreeSize 50 | 51 | spec :: Spec 52 | spec = describe "HaskellWorks.Data.Xml.Succinct.CursorSpec" $ do 53 | make "DVS.Vector Word8" (BS.fromByteString :: BS.ByteString -> XmlCursor BS.ByteString (BitShown (DVS.Vector Word8 )) (SimpleBalancedParens (DVS.Vector Word8 ))) 54 | make "DVS.Vector Word16" (BS.fromByteString :: BS.ByteString -> XmlCursor BS.ByteString (BitShown (DVS.Vector Word16)) (SimpleBalancedParens (DVS.Vector Word16))) 55 | make "DVS.Vector Word32" (BS.fromByteString :: BS.ByteString -> XmlCursor BS.ByteString (BitShown (DVS.Vector Word32)) (SimpleBalancedParens (DVS.Vector Word32))) 56 | make "DVS.Vector Word64" (BS.fromByteString :: BS.ByteString -> XmlCursor BS.ByteString (BitShown (DVS.Vector Word64)) (SimpleBalancedParens (DVS.Vector Word64))) 57 | make "Poppy512" (BS.fromByteString :: BS.ByteString -> XmlCursor BS.ByteString Poppy512 (SimpleBalancedParens (DVS.Vector Word64))) 58 | make "DVS.Vector Word8" CC.byteStringAsFastCursor 59 | make "DVS.Vector Word16" CC.byteStringAsFastCursor 60 | make "DVS.Vector Word32" CC.byteStringAsFastCursor 61 | make "DVS.Vector Word64" CC.byteStringAsFastCursor 62 | make "Poppy512" CC.byteStringAsFastCursor 63 | it "Loads same Xml consistentally from different backing vectors" $ requireTest $ do 64 | let cursor8 = "{\n \"widget\": {\n \"debug\": \"on\" } }" :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word8)) (SimpleBalancedParens (DVS.Vector Word8)) 65 | let cursor16 = "{\n \"widget\": {\n \"debug\": \"on\" } }" :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word16)) (SimpleBalancedParens (DVS.Vector Word16)) 66 | let cursor32 = "{\n \"widget\": {\n \"debug\": \"on\" } }" :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word32)) (SimpleBalancedParens (DVS.Vector Word32)) 67 | let cursor64 = "{\n \"widget\": {\n \"debug\": \"on\" } }" :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word64)) (SimpleBalancedParens (DVS.Vector Word64)) 68 | cursorText cursor8 === cursorText cursor16 69 | cursorText cursor8 === cursorText cursor32 70 | cursorText cursor8 === cursorText cursor64 71 | let ic8 = bitShow $ interests cursor8 72 | let ic16 = bitShow $ interests cursor16 73 | let ic32 = bitShow $ interests cursor32 74 | let ic64 = bitShow $ interests cursor64 75 | ic16 `shouldBeginWith` ic8 76 | ic32 `shouldBeginWith` ic16 77 | ic64 `shouldBeginWith` ic32 78 | 79 | shouldBeginWith :: (Eq a, Show a) => [a] -> [a] -> PropertyT IO () 80 | shouldBeginWith as bs = take (length bs) as === bs 81 | -------------------------------------------------------------------------------- /test/HaskellWorks/Data/Xml/Succinct/CursorSpec/Make.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | {-# LANGUAGE FlexibleInstances #-} 3 | {-# LANGUAGE InstanceSigs #-} 4 | {-# LANGUAGE MultiParamTypeClasses #-} 5 | {-# LANGUAGE NoMonomorphismRestriction #-} 6 | {-# LANGUAGE OverloadedStrings #-} 7 | {-# LANGUAGE ScopedTypeVariables #-} 8 | {-# LANGUAGE TypeApplications #-} 9 | 10 | {-# OPTIONS_GHC -fno-warn-missing-signatures #-} 11 | 12 | module HaskellWorks.Data.Xml.Succinct.CursorSpec.Make 13 | ( make 14 | ) where 15 | 16 | import Control.Monad 17 | import Data.Word 18 | import HaskellWorks.Data.BalancedParens.BalancedParens 19 | import HaskellWorks.Data.BalancedParens.Simple 20 | import HaskellWorks.Data.Bits.BitShow 21 | import HaskellWorks.Data.Bits.BitShown 22 | import HaskellWorks.Data.Bits.BitWise 23 | import HaskellWorks.Data.RankSelect.Base.Rank0 24 | import HaskellWorks.Data.RankSelect.Base.Rank1 25 | import HaskellWorks.Data.RankSelect.Base.Select1 26 | import HaskellWorks.Data.RankSelect.Poppy512 27 | import HaskellWorks.Data.Xml.Succinct.Cursor as C 28 | import HaskellWorks.Data.Xml.Token 29 | import HaskellWorks.Hspec.Hedgehog 30 | import Hedgehog 31 | import Test.Hspec 32 | 33 | import qualified Data.ByteString as BS 34 | import qualified Data.Text as T 35 | import qualified Data.Text.Encoding as T 36 | import qualified Data.Vector.Storable as DVS 37 | import qualified HaskellWorks.Data.FromByteString as BS 38 | import qualified HaskellWorks.Data.TreeCursor as TC 39 | import qualified HaskellWorks.Data.Xml.Succinct.Cursor.Create as CC 40 | 41 | {- HLINT ignore "Redundant do" -} 42 | {- HLINT ignore "Redundant bracket" -} 43 | {- HLINT ignore "Reduce duplication" -} 44 | 45 | fc = TC.firstChild 46 | ns = TC.nextSibling 47 | pn = TC.parent 48 | cd = TC.depth 49 | ss = TC.subtreeSize 50 | 51 | make :: forall t u. 52 | ( Select1 t 53 | , Rank0 u 54 | , Rank1 u 55 | , BalancedParens u 56 | , TestBit u 57 | ) 58 | => String -> (BS.ByteString -> XmlCursor BS.ByteString t u) -> SpecWith () 59 | make t mkCursor = do 60 | describe ("Cursor for (" ++ t ++ ")") $ do 61 | let forXml bs f = let cursor = mkCursor bs in describe (T.unpack ("of value " <> T.decodeUtf8 bs)) (f cursor) 62 | forXml "[null]" $ \cursor -> do 63 | xit "depth at top" $ requireTest $ cd cursor === Just 1 64 | xit "depth at first child of array" $ requireTest $ (fc >=> cd) cursor === Just 2 65 | forXml "[null, {\"field\": 1}]" $ \cursor -> do 66 | xit "depth at second child of array" $ requireTest $ do 67 | (fc >=> ns >=> cd) cursor === Just 2 68 | xit "depth at first child of object at second child of array" $ requireTest $ do 69 | (fc >=> ns >=> fc >=> cd) cursor === Just 3 70 | xit "depth at first child of object at second child of array" $ requireTest $ do 71 | (fc >=> ns >=> fc >=> ns >=> cd) cursor === Just 3 72 | 73 | describe "For sample XML" $ do 74 | let cursor = mkCursor " \ 75 | \ \ 76 | \ 500 \ 77 | \ 600.01e-02 \ 78 | \ false \ 79 | \ \ 80 | \" :: XmlCursor BS.ByteString t u 81 | xit "can get token at cursor" $ requireTest $ do 82 | (xmlTokenAt ) cursor === Just (XmlTokenBraceL ) 83 | (fc >=> xmlTokenAt) cursor === Just (XmlTokenString "widget" ) 84 | (fc >=> ns >=> xmlTokenAt) cursor === Just (XmlTokenBraceL ) 85 | (fc >=> ns >=> fc >=> xmlTokenAt) cursor === Just (XmlTokenString "debug" ) 86 | (fc >=> ns >=> fc >=> ns >=> xmlTokenAt) cursor === Just (XmlTokenString "on" ) 87 | (fc >=> ns >=> fc >=> ns >=> ns >=> xmlTokenAt) cursor === Just (XmlTokenString "window" ) 88 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> xmlTokenAt) cursor === Just (XmlTokenBraceL ) 89 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> xmlTokenAt) cursor === Just (XmlTokenString "name" ) 90 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ns >=> xmlTokenAt) cursor === Just (XmlTokenString "main_window" ) 91 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ns >=> ns >=> xmlTokenAt) cursor === Just (XmlTokenString "dimensions" ) 92 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ns >=> ns >=> ns >=> xmlTokenAt) cursor === Just (XmlTokenBracketL ) 93 | -- xit "can navigate up" $ requireTest $ do 94 | -- ( pn) cursor === Nothing 95 | -- (fc >=> pn) cursor === Just cursor 96 | -- (fc >=> ns >=> pn) cursor === Just cursor 97 | -- (fc >=> ns >=> fc >=> pn) cursor === (fc >=> ns ) cursor 98 | -- (fc >=> ns >=> fc >=> ns >=> pn) cursor === (fc >=> ns ) cursor 99 | -- (fc >=> ns >=> fc >=> ns >=> ns >=> pn) cursor === (fc >=> ns ) cursor 100 | -- (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> pn) cursor === (fc >=> ns ) cursor 101 | -- (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> pn) cursor === (fc >=> ns >=> fc >=> ns >=> ns >=> ns) cursor 102 | -- (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ns >=> pn) cursor === (fc >=> ns >=> fc >=> ns >=> ns >=> ns) cursor 103 | -- (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ns >=> ns >=> pn) cursor === (fc >=> ns >=> fc >=> ns >=> ns >=> ns) cursor 104 | -- (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ns >=> ns >=> ns >=> pn) cursor === (fc >=> ns >=> fc >=> ns >=> ns >=> ns) cursor 105 | xit "can get subtree size" $ requireTest $ do 106 | ( ss) cursor === Just 16 107 | (fc >=> ss) cursor === Just 1 108 | (fc >=> ns >=> ss) cursor === Just 14 109 | (fc >=> ns >=> fc >=> ss) cursor === Just 1 110 | (fc >=> ns >=> fc >=> ns >=> ss) cursor === Just 1 111 | (fc >=> ns >=> fc >=> ns >=> ns >=> ss) cursor === Just 1 112 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> ss) cursor === Just 10 113 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ss) cursor === Just 1 114 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ns >=> ss) cursor === Just 1 115 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ns >=> ns >=> ss) cursor === Just 1 116 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> ns >=> ns >=> ns >=> ss) cursor === Just 6 117 | -------------------------------------------------------------------------------- /test/HaskellWorks/Data/Xml/Token/TokenizeSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module HaskellWorks.Data.Xml.Token.TokenizeSpec (spec) where 4 | 5 | import Data.ByteString (ByteString) 6 | import HaskellWorks.Data.Xml.Token.Tokenize 7 | import HaskellWorks.Hspec.Hedgehog 8 | import Hedgehog 9 | import Test.Hspec 10 | 11 | import qualified Data.Attoparsec.ByteString.Char8 as BC 12 | import qualified Data.ByteString as BS 13 | 14 | {- HLINT ignore "Redundant do" -} 15 | 16 | parseXmlToken' :: ByteString -> Either String (XmlToken String Double) 17 | parseXmlToken' = BC.parseOnly parseXmlToken 18 | 19 | spec :: Spec 20 | spec = describe "HaskellWorks.Data.Xml.Token.TokenizeSpec" $ do 21 | describe "When parsing single token at beginning of text" $ do 22 | it "Empty Xml should produce no bits" $ requireTest $ 23 | parseXmlToken' "" === Left "not enough input" 24 | it "Xml with one space should produce whitespace token" $ requireTest $ 25 | parseXmlToken' " " === Right XmlTokenWhitespace 26 | it "Xml with two spaces should produce whitespace token" $ requireTest $ 27 | parseXmlToken' " " === Right XmlTokenWhitespace 28 | it "Spaces and newlines should produce no bits" $ requireTest $ 29 | parseXmlToken' " \n \r \t " === Right XmlTokenWhitespace 30 | it "`null` at beginning should produce one bit" $ requireTest $ 31 | parseXmlToken' "null " === Right XmlTokenNull 32 | it "number at beginning should produce one bit" $ requireTest $ 33 | parseXmlToken' "1234 " === Right (XmlTokenNumber 1234) 34 | it "false at beginning should produce one bit" $ requireTest $ 35 | parseXmlToken' "false " === Right (XmlTokenBoolean False) 36 | it "true at beginning should produce one bit" $ requireTest $ 37 | parseXmlToken' "true " === Right (XmlTokenBoolean True) 38 | it "string at beginning should produce one bit" $ requireTest $ 39 | parseXmlToken' "\"hello\" " === Right (XmlTokenString "hello") 40 | it "quoted string should parse" $ requireTest $ 41 | parseXmlToken' "\"\\\"\" " === Right (XmlTokenString "\"") 42 | it "left brace at beginning should produce one bit" $ requireTest $ 43 | parseXmlToken' "{ " === Right XmlTokenBraceL 44 | it "right brace at beginning should produce one bit" $ requireTest $ 45 | parseXmlToken' "} " === Right XmlTokenBraceR 46 | it "left bracket at beginning should produce one bit" $ requireTest $ 47 | parseXmlToken' "[ " === Right XmlTokenBracketL 48 | it "right bracket at beginning should produce one bit" $ requireTest $ 49 | parseXmlToken' "] " === Right XmlTokenBracketR 50 | it "right bracket at beginning should produce one bit" $ requireTest $ 51 | parseXmlToken' ": " === Right XmlTokenColon 52 | it "right bracket at beginning should produce one bit" $ requireTest $ 53 | parseXmlToken' ", " === Right XmlTokenComma 54 | -------------------------------------------------------------------------------- /test/HaskellWorks/Data/Xml/TypeSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ExplicitForAll #-} 2 | {-# LANGUAGE FlexibleContexts #-} 3 | {-# LANGUAGE FlexibleInstances #-} 4 | {-# LANGUAGE InstanceSigs #-} 5 | {-# LANGUAGE MultiParamTypeClasses #-} 6 | {-# LANGUAGE NoMonomorphismRestriction #-} 7 | {-# LANGUAGE OverloadedStrings #-} 8 | {-# LANGUAGE ScopedTypeVariables #-} 9 | 10 | {-# OPTIONS_GHC -fno-warn-missing-signatures #-} 11 | {-# OPTIONS_GHC -fno-warn-simplifiable-class-constraints #-} 12 | 13 | module HaskellWorks.Data.Xml.TypeSpec (spec) where 14 | 15 | import Control.Monad 16 | import Data.String 17 | import Data.Word 18 | import HaskellWorks.Data.BalancedParens.BalancedParens 19 | import HaskellWorks.Data.BalancedParens.Simple 20 | import HaskellWorks.Data.Bits.BitShown 21 | import HaskellWorks.Data.Bits.BitWise 22 | import HaskellWorks.Data.RankSelect.Base.Rank0 23 | import HaskellWorks.Data.RankSelect.Base.Rank1 24 | import HaskellWorks.Data.RankSelect.Base.Select1 25 | import HaskellWorks.Data.RankSelect.Poppy512 26 | import HaskellWorks.Data.Xml.Succinct.Cursor as C 27 | import HaskellWorks.Data.Xml.Type 28 | import HaskellWorks.Hspec.Hedgehog 29 | import Hedgehog 30 | import Test.Hspec 31 | 32 | import qualified Data.ByteString as BS 33 | import qualified Data.Vector.Storable as DVS 34 | import qualified HaskellWorks.Data.TreeCursor as TC 35 | 36 | {- HLINT ignore "Redundant do" -} 37 | {- HLINT ignore "Redundant bracket" -} 38 | {- HLINT ignore "Reduce duplication" -} 39 | 40 | fc = TC.firstChild 41 | ns = TC.nextSibling 42 | 43 | spec :: Spec 44 | spec = describe "HaskellWorks.Data.Xml.TypeSpec" $ do 45 | genSpec "DVS.Vector Word8" (undefined :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word8 )) (SimpleBalancedParens (DVS.Vector Word8 ))) 46 | genSpec "DVS.Vector Word16" (undefined :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word16)) (SimpleBalancedParens (DVS.Vector Word16))) 47 | genSpec "DVS.Vector Word32" (undefined :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word32)) (SimpleBalancedParens (DVS.Vector Word32))) 48 | genSpec "DVS.Vector Word64" (undefined :: XmlCursor BS.ByteString (BitShown (DVS.Vector Word64)) (SimpleBalancedParens (DVS.Vector Word64))) 49 | genSpec "Poppy512" (undefined :: XmlCursor BS.ByteString Poppy512 (SimpleBalancedParens (DVS.Vector Word64))) 50 | 51 | genSpec :: forall t u. 52 | ( Show t 53 | , Select1 t 54 | , Show u 55 | , Rank0 u 56 | , Rank1 u 57 | , BalancedParens u 58 | , TestBit u 59 | , IsString (XmlCursor BS.ByteString t u) 60 | ) 61 | => String -> (XmlCursor BS.ByteString t u) -> SpecWith () 62 | genSpec t _ = do 63 | describe ("XML cursor of type " ++ t) $ do 64 | let forXml (cursor :: XmlCursor BS.ByteString t u) f = describe ("of value " ++ show cursor) (f cursor) 65 | forXml "" $ \cursor -> do 66 | it "should have correct type" . requireTest $ xmlTypeAt cursor === Just XmlTypeElement 67 | forXml " " $ \cursor -> do 68 | it "should have correct type" . requireTest $ xmlTypeAt cursor === Just XmlTypeElement 69 | forXml "" $ \cursor -> do 70 | it "cursor can navigate to second attribute" $ requireTest $ do 71 | (fc >=> fc >=> ns >=> ns >=> xmlTypeAt) cursor === Just XmlTypeToken 72 | it "cursor can navigate to first attribute of an inner element" $ requireTest $ do 73 | (fc >=> ns >=> fc >=> fc >=> xmlTypeAt) cursor === Just XmlTypeToken 74 | it "cursor can navigate to first atrribute value of an inner element" $ requireTest $ do 75 | (fc >=> ns >=> fc >=> fc >=> ns >=> xmlTypeAt) cursor === Just XmlTypeToken 76 | describe "For a single element" $ do 77 | let cursor = "text" :: XmlCursor BS.ByteString t u 78 | it "can navigate down and forwards" $ requireTest $ do 79 | ( xmlTypeAt) cursor === Just XmlTypeElement 80 | (fc >=> xmlTypeAt) cursor === Just XmlTypeToken 81 | (fc >=> ns >=> xmlTypeAt) cursor === Nothing 82 | (fc >=> ns >=> ns >=> xmlTypeAt) cursor === Nothing 83 | describe "For sample Xml" $ do 84 | let cursor = " \ 85 | \ \ 86 | \ 500 \ 87 | \ 600.01e-02 \ 88 | \ false \ 89 | \ \ 90 | \" :: XmlCursor BS.ByteString t u 91 | it "can navigate down and forwards" $ requireTest $ do 92 | ( xmlTypeAt) cursor === Just XmlTypeElement --widget 93 | (fc >=> xmlTypeAt) cursor === Just XmlTypeAttrList --widget attrs 94 | (fc >=> ns >=> xmlTypeAt) cursor === Just XmlTypeElement --window 95 | (fc >=> ns >=> fc >=> xmlTypeAt) cursor === Just XmlTypeAttrList --window attrs 96 | (fc >=> ns >=> fc >=> ns >=> xmlTypeAt) cursor === Just XmlTypeElement --dimension 500 97 | (fc >=> ns >=> fc >=> ns >=> ns >=> xmlTypeAt) cursor === Just XmlTypeElement --dimension 600 98 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> xmlTypeAt) cursor === Just XmlTypeElement --dimension false 99 | (fc >=> ns >=> fc >=> ns >=> ns >=> ns >=> fc >=> xmlTypeAt) cursor === Just XmlTypeToken --false 100 | -------------------------------------------------------------------------------- /test/Spec.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -F -pgmF hspec-discover #-} 2 | --------------------------------------------------------------------------------