├── .gitignore ├── Setup.hs ├── .ghci ├── tests ├── unit │ ├── Spec.hs │ └── Text │ │ └── Taggy │ │ ├── EntitiesSpec.hs │ │ ├── RendererSpec.hs │ │ ├── DOMSpec.hs │ │ └── ParserSpec.hs └── integration │ └── Main.hs ├── .travis.yml ├── default.nix ├── html_files ├── haskell.html ├── links_50.html ├── alpmestan.html ├── haskell.org.html ├── links_500.html └── agda.html ├── example └── taggy.hs ├── LICENSE ├── bench └── vs-tagsoup.hs ├── src └── Text │ ├── Taggy.hs │ └── Taggy │ ├── Renderer.hs │ ├── Types.hs │ ├── Parser.hs │ ├── Entities.hs │ └── DOM.hs ├── stack.yaml ├── README.md └── taggy.cabal /.gitignore: -------------------------------------------------------------------------------- 1 | .stack-work 2 | .ghc.environment.* 3 | -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /.ghci: -------------------------------------------------------------------------------- 1 | :set -isrc 2 | :set -itests 3 | :set -XOverloadedStrings 4 | -------------------------------------------------------------------------------- /tests/unit/Spec.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -F -pgmF hspec-discover #-} 2 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: haskell 2 | 3 | script: 4 | - cabal configure --enable-tests && cabal build && cabal test 5 | 6 | before_install: 7 | - cabal update 8 | 9 | install: 10 | - cabal install --only-dependencies --enable-tests 11 | 12 | notifications: 13 | email: 14 | - alpmestan@gmail.com 15 | irc: "irc.freenode.org#haskell-math" 16 | -------------------------------------------------------------------------------- /default.nix: -------------------------------------------------------------------------------- 1 | let 2 | nixpkgs = (import {}).fetchgit { 3 | url = "git@github.com:nixos/nixpkgs.git"; 4 | rev = "718106e958cbd872ecf3e08a451b80f68f950dae"; 5 | sha256 = "72ef1a4b66312676d0b7e1684d3d68a5e82fdff1725d8816a9dac7eff4ee81e8"; 6 | }; 7 | in 8 | { system ? builtins.currentSystem 9 | , pkgs ? (import nixpkgs { inherit system; }) 10 | , haskellPackages ? pkgs.haskellPackages_ghc783 11 | , src ? ./. 12 | , name ? "taggy" 13 | }: 14 | haskellPackages.buildLocalCabal src name 15 | -------------------------------------------------------------------------------- /tests/unit/Text/Taggy/EntitiesSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | module Text.Taggy.EntitiesSpec where 3 | 4 | import Test.Hspec 5 | import Text.Taggy.Entities 6 | 7 | spec :: Spec 8 | spec = do 9 | describe "convertEntities" $ do 10 | it "converts " to \"" $ 11 | convertEntities """ 12 | `shouldBe` "\"" 13 | 14 | it "converts å to å" $ 15 | convertEntities "å" 16 | `shouldBe` "å" 17 | 18 | it "converts å to å" $ 19 | convertEntities "å" 20 | `shouldBe` "å" 21 | 22 | it "leaves alone "" $ 23 | convertEntities """ 24 | `shouldBe` """ 25 | 26 | it "leaves alone å" $ 27 | convertEntities "å" 28 | `shouldBe` "å" 29 | 30 | it "leaves alone å" $ 31 | convertEntities "å" 32 | `shouldBe` "å" 33 | -------------------------------------------------------------------------------- /tests/integration/Main.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TupleSections #-} 2 | 3 | module Main (main) where 4 | 5 | import Prelude hiding (readFile) 6 | import Data.Functor ((<$>)) 7 | import Data.List (isSuffixOf) 8 | import Data.Text.Lazy (Text) 9 | import Data.Text.Lazy.IO (readFile) 10 | import Paths_taggy (getDataFileName) 11 | import System.Directory (getDirectoryContents, setCurrentDirectory) 12 | import Test.Hspec (hspec, runIO, describe, it, shouldSatisfy) 13 | import Text.Taggy (taggyWith) 14 | 15 | getHTMLFiles :: IO [(FilePath, Text)] 16 | getHTMLFiles = getDataFileName "html_files" 17 | >>= setCurrentDirectory 18 | >> filter (isSuffixOf ".html") <$> getDirectoryContents "." 19 | >>= mapM (\name -> fmap (name,) $ readFile name) 20 | 21 | main :: IO () 22 | main = hspec . (runIO getHTMLFiles >>=) . mapM_ $ \(name, content) -> 23 | describe name . it "Should parse without error." $ 24 | taggyWith True content `shouldSatisfy` not . null 25 | -------------------------------------------------------------------------------- /tests/unit/Text/Taggy/RendererSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Text.Taggy.RendererSpec where 4 | 5 | import Test.Hspec 6 | import Text.Taggy 7 | 8 | spec :: Spec 9 | spec = do 10 | describe "render" $ do 11 | let doc = "foobaz" 12 | node = head . domify $ taggyWith False doc 13 | elmt = (\(NodeElement e) -> e) $ node 14 | it "Should render a given node." $ do 15 | render node `shouldBe` doc 16 | it "Should render a given element." $ do 17 | render elmt `shouldBe` doc 18 | describe "renderWith" $ do 19 | let document = "I ♥ you!" 20 | element = (\(NodeElement e) -> e) . head . domify $ taggyWith False document 21 | it "Should escape HTML entities when the first argument is True." $ do 22 | renderWith True element `shouldBe` "I &hearts; you!" 23 | it "Shouldn't escape HTML entities when the first argument is False." $ do 24 | renderWith False element `shouldBe` document 25 | -------------------------------------------------------------------------------- /html_files/haskell.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | HaskellWiki 11 | -------------------------------------------------------------------------------- /example/taggy.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | 3 | import Data.Attoparsec.Text.Lazy (eitherResult) 4 | import System.Environment 5 | import Text.Taggy 6 | 7 | import qualified Data.Text.Lazy.IO as T 8 | 9 | main :: IO () 10 | main = do 11 | args <- getArgs 12 | case args of 13 | ["--help"] -> usage 14 | (opt : filenames) | opt == "-d" || opt == "--dom" -> mapM_ dom filenames 15 | (opt : filenames) | opt == "-t" || opt == "--t" -> mapM_ taggy filenames 16 | _ -> usage 17 | 18 | usage :: IO () 19 | usage = do 20 | putStrLn "taggy - simple and fast HTML parser" 21 | putStrLn "" 22 | putStrLn "Usage:\t taggy file1.html file2.html file3.html ..." 23 | putStrLn "\n" 24 | putStrLn "Formats are:" 25 | putStrLn "\t -d/--dom\t Parse as a DOM tree." 26 | putStrLn "\t -t/--tags\t Parse as a list of opening/closing/text/comment/script/style tags" 27 | 28 | taggy :: FilePath -> IO () 29 | taggy fp = do 30 | content <- T.readFile fp 31 | either (\s -> putStrLn $ "couldn't parse: " ++ s) 32 | (mapM_ print) 33 | (eitherResult $ run True content) 34 | 35 | dom :: FilePath -> IO () 36 | dom fp = do 37 | content <- T.readFile fp 38 | mapM_ print . domify $ taggyWith True content 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Alp Mestanogullari 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Alp Mestanogullari nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /bench/vs-tagsoup.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Main where 4 | 5 | import Criterion.Main 6 | import qualified Text.Taggy as Taggy 7 | import qualified Text.HTML.TagSoup as Tagsoup 8 | import qualified Data.Text.Lazy as T 9 | import qualified Data.Text.Lazy.IO as T 10 | import qualified Data.Vector as V 11 | 12 | sizes :: [Int] 13 | sizes = [50, 500, 5000] 14 | 15 | sizes' :: [Int] 16 | sizes' = [50000, 500000, 5000000] 17 | 18 | benchOnFile :: String -> FilePath -> Benchmark 19 | benchOnFile label fp = 20 | bgroup label 21 | [ bench "tagsoup" $ whnfIO (tagsoup fp) 22 | , bench "taggy" $ whnfIO (taggy fp) 23 | , bench "taggy-entities" $ whnfIO (taggyEntities fp) 24 | ] 25 | 26 | tagsoup :: FilePath -> IO (V.Vector (Tagsoup.Tag T.Text)) 27 | tagsoup fp = T.readFile ("html_files/" ++ fp) 28 | >>= return . V.fromList . Tagsoup.parseTags 29 | 30 | taggy :: FilePath -> IO (V.Vector Taggy.Tag) 31 | taggy fp = T.readFile ("html_files/" ++ fp) 32 | >>= return . V.fromList . Taggy.taggyWith False 33 | 34 | taggyEntities :: FilePath -> IO (V.Vector Taggy.Tag) 35 | taggyEntities fp = T.readFile ("html_files/" ++ fp) 36 | >>= return . V.fromList . Taggy.taggyWith True 37 | 38 | linkBench :: Int -> Benchmark 39 | linkBench size = benchOnFile (show size) fp 40 | 41 | where fp = "links_" ++ show size ++ ".html" 42 | 43 | main :: IO () 44 | main = 45 | defaultMain 46 | [ benchOnFile "alpmestan.com index" 47 | "alpmestan.html" 48 | 49 | , benchOnFile "google search" 50 | "googling-haskell.html" 51 | 52 | , benchOnFile "youtube" 53 | "youtube.html" 54 | 55 | , benchOnFile "wikipedia - history of mathematics" 56 | "wikipedia_history_of_mathematics.html" 57 | 58 | , benchOnFile "worldslongestwebsite.com" 59 | "worldslongestwebsite.com.html" 60 | 61 | , benchOnFile "/r/haskell" 62 | "haskell_reddit.html" 63 | 64 | -- , bgroup "links" $ map linkBench sizes 65 | ] 66 | -------------------------------------------------------------------------------- /src/Text/Taggy.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | -- | 3 | -- Module : Text.Taggy 4 | -- Copyright : (c) 2014 Alp Mestanogullari, Vikram Verma 5 | -- License : BSD3 6 | -- Maintainer : alpmestan@gmail.com 7 | -- Stability : experimental 8 | -- 9 | -- /taggy/ is a simple package for parsing HTML (and should work with XML) 10 | -- written on top of the 11 | -- library, which makes it one of the most efficient (space and time consumption wise) 12 | -- on hackage. 13 | -- 14 | -- This is the root module of /taggy/. It reexports everything 15 | -- from the package. See each module's docs for details about 16 | -- the functions and types involved in /taggy/. 17 | -- 18 | -- While we've been testing the parser on /many/ pages, it may still 19 | -- be a bit rough around the edges. Let us know on 20 | -- if you have any problem. 21 | -- 22 | -- If you like to look at your HTML through 23 | -- various optical instruments, feel free to take a look at 24 | -- the companion 25 | -- package we've put up together. 26 | -- 27 | -- * If you want to parse a document as list of tags 28 | -- and go through it as some kind of stream by just picking 29 | -- what you need, head to "Text.Taggy.Parser" and take 30 | -- a look at 'Text.Taggy.Parser.taggyWith' and 31 | -- 'Text.Taggy.Parser.run'. 32 | -- * If you want to parse the document as a DOM tree and 33 | -- traverse it to find the information you need, 34 | -- use 'Text.Taggy.DOM.parseDOM'. This is especially useful 35 | -- when used in conjunction with . 36 | -- * If you build some HTML manually 37 | -- or just transform some existing DOM tree 38 | -- and want to turn it into a 'Data.Text.Lazy.Text' 39 | -- head to "Text.Taggy.Renderer" and look at 'Text.Taggy.Renderer.render'. 40 | module Text.Taggy 41 | ( -- * Exported modules 42 | module Text.Taggy.Types 43 | , module Text.Taggy.Parser 44 | , module Text.Taggy.DOM 45 | , module Text.Taggy.Renderer 46 | ) where 47 | 48 | import Text.Taggy.Types 49 | import Text.Taggy.Parser 50 | import Text.Taggy.DOM 51 | import Text.Taggy.Renderer 52 | -------------------------------------------------------------------------------- /stack.yaml: -------------------------------------------------------------------------------- 1 | # This file was automatically generated by 'stack init' 2 | # 3 | # Some commonly used options have been documented as comments in this file. 4 | # For advanced use and comprehensive documentation of the format, please see: 5 | # http://docs.haskellstack.org/en/stable/yaml_configuration/ 6 | 7 | # Resolver to choose a 'specific' stackage snapshot or a compiler version. 8 | # A snapshot resolver dictates the compiler version and the set of packages 9 | # to be used for project dependencies. For example: 10 | # 11 | # resolver: lts-3.5 12 | # resolver: nightly-2015-09-21 13 | # resolver: ghc-7.10.2 14 | # resolver: ghcjs-0.1.0_ghc-7.10.2 15 | # resolver: 16 | # name: custom-snapshot 17 | # location: "./custom-snapshot.yaml" 18 | resolver: lts-9.0 19 | 20 | # User packages to be built. 21 | # Various formats can be used as shown in the example below. 22 | # 23 | # packages: 24 | # - some-directory 25 | # - https://example.com/foo/bar/baz-0.0.2.tar.gz 26 | # - location: 27 | # git: https://github.com/commercialhaskell/stack.git 28 | # commit: e7b331f14bcffb8367cd58fbfc8b40ec7642100a 29 | # - location: https://github.com/commercialhaskell/stack/commit/e7b331f14bcffb8367cd58fbfc8b40ec7642100a 30 | # extra-dep: true 31 | # subdirs: 32 | # - auto-update 33 | # - wai 34 | # 35 | # A package marked 'extra-dep: true' will only be built if demanded by a 36 | # non-dependency (i.e. a user package), and its test suites and benchmarks 37 | # will not be run. This is useful for tweaking upstream packages. 38 | packages: 39 | - '.' 40 | # Dependency packages to be pulled from upstream that are not in the resolver 41 | # (e.g., acme-missiles-0.3) 42 | extra-deps: [] 43 | 44 | # Override default flag values for local packages and extra-deps 45 | flags: {} 46 | 47 | # Extra package databases containing global packages 48 | extra-package-dbs: [] 49 | 50 | # Control whether we use the GHC we find on the path 51 | # system-ghc: true 52 | # 53 | # Require a specific version of stack, using version ranges 54 | # require-stack-version: -any # Default 55 | # require-stack-version: ">=1.2" 56 | # 57 | # Override the architecture used by stack, especially useful on Windows 58 | # arch: i386 59 | # arch: x86_64 60 | # 61 | # Extra directories used by stack for building 62 | # extra-include-dirs: [/path/to/dir] 63 | # extra-lib-dirs: [/path/to/dir] 64 | # 65 | # Allow a newer minor version of GHC than the snapshot specifies 66 | # compiler-check: newer-minor -------------------------------------------------------------------------------- /html_files/links_50.html: -------------------------------------------------------------------------------- 1 | Best web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site everBest web site ever -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | taggy 2 | ===== 3 | 4 | An attoparsec based html parser. [![Build Status](https://secure.travis-ci.org/alpmestan/taggy.png?branch=master)](http://travis-ci.org/alpmestan/taggy) 5 | 6 | Currently very WIP but already supports a fairly decent range of common websites. I haven't managed to find a website with which it chokes, using the current parser. The performance is quite promising. 7 | 8 | Using `taggy` 9 | ============= 10 | 11 | _taggy_ has a `taggyWith` function to work on HTML à la _tagsoup_. 12 | 13 | ``` haskell 14 | taggyWith :: Bool -> LT.Text -> [Tag] 15 | ``` 16 | 17 | The `Bool` there just lets you specify whether you want to convert the special HTML entities to their corresponding unicode character. `True` means "yes convert them please". This function takes lazy `Text` as input. 18 | 19 | Or you can use the raw `run` function, which returns a good old `Result` from _attoparsec_. 20 | 21 | ``` haskell 22 | run :: Bool -> LT.Text -> AttoLT.Result [Tag] 23 | ``` 24 | 25 | For example, if you want to read the html code from a file, and print one tag per line, you could do: 26 | 27 | ``` haskell 28 | import Data.Attoparsec.Text.Lazy (eitherResult) 29 | import qualified Data.Text.Lazy.IO as T 30 | import Text.Taggy (run) 31 | 32 | taggy :: FilePath -> IO () 33 | taggy fp = do 34 | content <- T.readFile fp 35 | either (\s -> putStrLn $ "couldn't parse: " ++ s) 36 | (mapM_ print) 37 | (eitherResult $ run True content) 38 | ``` 39 | 40 | But _taggy_ also started providing support for DOM-syle documents. This is computed from the list of tags gained by using `taggyWith`. 41 | 42 | If you fire up ghci with _taggy_ loaded: 43 | 44 | ``` bash 45 | $ cabal repl # if working with a copy of this repo 46 | ``` 47 | 48 | You can see this `domify` in action. 49 | 50 | ``` haskell 51 | λ> :set -XOverloadedStrings 52 | λ> head . domify . taggyWith False $ "yo" 53 | NodeElement (Element {eltName = "html", eltAttrs = fromList [], eltChildren = [NodeElement (Element {eltName = "head", eltAttrs = fromList [], eltChildren = []}),NodeElement (Element {eltName = "body", eltAttrs = fromList [], eltChildren = [NodeContent "yo"]})]}) 54 | ``` 55 | 56 | Note that the `Text.Taggy.DOM` module contains a function 57 | that composes `domify` and `taggyWith` for you: `parseDOM`. 58 | 59 | Lenses for taggy 60 | ================ 61 | 62 | We (well, mostly Vikram Virma to be honest) have 63 | put up a companion [taggy-lens](http://github.com/alpmestan/taggy-lens) 64 | library. 65 | 66 | Haddocks 67 | ======== 68 | 69 | I try to keep an up-to-date copy of the docs on my server: 70 | 71 | - [taggy](https://hackage.haskell.org/package/taggy) 72 | - [taggy-lens](https://hackage.haskell.org/package/taggy-lens) 73 | 74 | -------------------------------------------------------------------------------- /tests/unit/Text/Taggy/DOMSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Text.Taggy.DOMSpec where 4 | 5 | import Test.Hspec 6 | import Text.Taggy.DOM 7 | import Text.Taggy.Types 8 | import Data.Monoid (mempty) 9 | import Data.Text (Text) 10 | 11 | spec :: Spec 12 | spec = do 13 | describe "domify" $ do 14 | it "domifies self-closing tags" $ 15 | [ TagOpen "br" [] True 16 | , TagText "pizza" 17 | ] 18 | `domifiesTo` 19 | [ e "br" [] 20 | , text "pizza" 21 | ] 22 | it "domifies paired tags" $ 23 | [ TagOpen "p" [] False 24 | , TagText "pizza" 25 | , TagClose "p" 26 | , TagText "olives" 27 | ] 28 | `domifiesTo` 29 | [ e "p" [ text "pizza" ] 30 | , text "olives" 31 | ] 32 | it "domifies omitted closing tags at end" $ 33 | [ TagOpen "p" [] False 34 | , TagText "pizza" 35 | ] 36 | `domifiesTo` 37 | [ e "p" [text "pizza"] 38 | ] 39 | it "closes tags when parent closes" $ 40 | [ TagOpen "p" [] False 41 | , TagOpen "em" [] False 42 | , TagText "pizza" 43 | , TagClose "p" 44 | ] 45 | `domifiesTo` 46 | [ e "p" [ e "em" [ text "pizza" ] ] 47 | ] 48 | it "domifies implicitly closed

" $ 49 | [ TagOpen "p" [] False 50 | , TagText "pizza" 51 | , TagOpen "p" [] False 52 | , TagText "olives" 53 | ] 54 | `domifiesTo` 55 | [ e "p" [text "pizza"] 56 | , e "p" [text "olives"] 57 | ] 58 | it "domifies implicitly closed

  • " $ 59 | [ TagOpen "ul" [] False 60 | , TagOpen "li" [] False 61 | , TagText "item 1" 62 | , TagOpen "li" [] False 63 | , TagText "item 2" 64 | , TagClose "ul" 65 | ] 66 | `domifiesTo` 67 | [ e "ul" 68 | [ e "li" 69 | [ text "item 1" ] 70 | , e "li" 71 | [ text "item 2" ] 72 | ] 73 | ] 74 | it "domifies implicitly closed , , " $ 75 | [ TagOpen "table" [] False 76 | , TagOpen "tr" [] False 77 | , TagOpen "th" [] False 78 | , TagOpen "td" [] False 79 | , TagOpen "td" [] False 80 | , TagOpen "tr" [] False 81 | , TagClose "table" 82 | ] 83 | `domifiesTo` 84 | [ e "table" 85 | [ e "tr" 86 | [ e "th" [] 87 | , e "td" [] 88 | , e "td" [] 89 | ] 90 | , e "tr" [] 91 | ] 92 | ] 93 | 94 | domifiesTo :: [Tag] -> [Node] -> Bool 95 | domifiesTo tags expected = 96 | domify tags == expected 97 | 98 | e :: Text -> [Node] -> Node 99 | e tag children = NodeElement $ Element tag mempty children 100 | 101 | text :: Text -> Node 102 | text txt = NodeContent txt 103 | -------------------------------------------------------------------------------- /src/Text/Taggy/Renderer.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP, LambdaCase, RecordWildCards, FlexibleInstances, UndecidableInstances, OverloadedStrings #-} 2 | -- | 3 | -- Module : Text.Taggy.Renderer 4 | -- Copyright : (c) 2014 Alp Mestanogullari, Vikram Verma 5 | -- License : BSD3 6 | -- Maintainer : alpmestan@gmail.com 7 | -- Stability : experimental 8 | -- 9 | -- Render a DOM tree (from "Text.Taggy.DOM") 10 | -- using the excellent blaze markup rendering library. 11 | module Text.Taggy.Renderer where 12 | 13 | import Data.Foldable (foldMap) 14 | import Data.HashMap.Strict (HashMap, foldlWithKey') 15 | import Data.Monoid ((<>)) 16 | import Data.Text (Text, unpack) 17 | import Data.Text.Encoding (encodeUtf8) 18 | import Data.Set as Set (member) 19 | import GHC.Exts 20 | import Text.Blaze (Markup) 21 | import Text.Blaze.Renderer.Text (renderMarkup) 22 | import Text.Taggy.DOM (Element(..), Node(..)) 23 | import qualified Data.Text.Lazy as Lazy (Text) 24 | import Text.Blaze.Internal (ChoiceString(..), StaticString(..), MarkupM(..)) 25 | 26 | -- renderMarkup does entity conversion implicitly, and an override at the 27 | -- constructor level is needed to control this; `PreEscaped (Text s)` is not 28 | -- escaped, but a naked `Text s` is. 29 | 30 | class AsMarkup a where 31 | -- | If the first parameter is true, we align the constructors for entity 32 | -- conversion. 33 | toMarkup :: Bool -> a -> Markup 34 | 35 | -- | A 'Node' is convertible to 'Markup' 36 | instance AsMarkup Node where 37 | toMarkup convertEntities = \case 38 | #if MIN_VERSION_blaze_markup(0,8,0) 39 | NodeContent text -> flip Content () $ 40 | #else 41 | NodeContent text -> Content $ 42 | #endif 43 | if convertEntities then Text text else PreEscaped (Text text) 44 | 45 | NodeElement elmt -> toMarkup convertEntities elmt 46 | 47 | -- | An 'Element' is convertible to 'Markup' 48 | instance AsMarkup Element where 49 | toMarkup convertEntities Element{..} = eltAttrs `toAttribute` Parent tag begin end kids 50 | where tag = toStatic eltName 51 | begin = toStatic $ "<" <> eltName 52 | end = case voidElement eltName of 53 | True -> toStatic "" 54 | False -> toStatic $ " eltName <> ">" 55 | kids = foldMap (toMarkup convertEntities) eltChildren 56 | 57 | voidElement :: (IsString s, Ord s) => s -> Bool 58 | voidElement e = Set.member e (fromList ["area", "base", "br", "col", "command", "embed", "hr", 59 | "img", "input", "keygen", "link", "meta", "param", 60 | "source", "track", "wbr"]) 61 | 62 | class Renderable a where 63 | render :: a -> Lazy.Text 64 | render = renderWith True 65 | renderWith :: Bool -> a -> Lazy.Text 66 | 67 | -- | Any value convertible to 'Markup' can be rendered as HTML, by way of 68 | -- 'render' and 'renderWith'. 69 | 70 | instance AsMarkup a => Renderable a where 71 | renderWith = fmap renderMarkup . toMarkup 72 | 73 | toAttribute :: HashMap Text Text -> (Markup -> Markup) 74 | toAttribute = flip $ foldlWithKey' toAttribute' 75 | where toAttribute' html attr value = AddCustomAttribute (Text attr) (Text value) html 76 | 77 | toStatic :: Text -> StaticString 78 | toStatic text = StaticString (unpack text ++) (encodeUtf8 text) text 79 | -------------------------------------------------------------------------------- /src/Text/Taggy/Types.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | -- | 3 | -- Module : Text.Taggy.Types 4 | -- Copyright : (c) 2014 Alp Mestanogullari 5 | -- License : BSD3 6 | -- Maintainer : alpmestan@gmail.com 7 | -- Stability : experimental 8 | -- 9 | -- Core types of /taggy/. 10 | module Text.Taggy.Types 11 | ( -- * 'Tag' type 12 | Tag(..) 13 | , tname 14 | , isTagOpen 15 | , isTagClose 16 | , isTagText 17 | , isTagComment 18 | , isTagScript 19 | , isTagStyle 20 | , tagsNamed 21 | 22 | , -- * 'Attribute's 23 | Attribute(..) 24 | , attrs 25 | , attrKey 26 | , attrValue 27 | 28 | , -- * A small difference list implementation 29 | L 30 | , emptyL 31 | , appL 32 | , insertL 33 | , singletonL 34 | , toListL 35 | ) where 36 | 37 | import Data.Text (Text, toCaseFold) 38 | 39 | -- | An attribute is just an attribute name 40 | -- and an attribute value. 41 | data Attribute = Attribute !Text !Text 42 | deriving (Show, Eq) 43 | 44 | -- | Get the attributes of a 'Tag'. 45 | attrs :: Tag -> [Attribute] 46 | attrs (TagOpen _ as _) = as 47 | attrs _ = [] 48 | 49 | -- | Get the name of an 'Attribute'. 50 | attrKey :: Attribute -> Text 51 | attrKey (Attribute k _) = k 52 | 53 | -- | Get the value of an 'Attribute'. 54 | attrValue :: Attribute -> Text 55 | attrValue (Attribute _ v) = v 56 | 57 | -- | A 'Tag' can be one of the following types of tags: 58 | -- 59 | -- * an opening tag that has a name, a list of attributes, and whether 60 | -- it is a self-closing tag or not 61 | -- * a closing tag with the name of the tag 62 | -- * some raw 'Text' 63 | -- * an HTML comment tag 64 | -- * a @@ tag 65 | -- * a @@ tag 66 | -- 67 | -- The latter two are useful to be considered 68 | -- separately in the parser and also lets you 69 | -- collect these bits quite easily. 70 | data Tag = TagOpen !Text [Attribute] !Bool -- is it a self-closing tag? 71 | | TagClose !Text 72 | | TagText !Text 73 | | TagComment !Text 74 | | TagScript !Tag !Text !Tag 75 | | TagStyle !Tag !Text !Tag 76 | deriving (Show, Eq) 77 | 78 | -- | Name of a 'Tag'. 79 | -- 80 | -- > tname (TagClose "a") == "a" 81 | tname :: Tag -> Text 82 | tname (TagOpen n _ _) = n 83 | tname (TagClose n) = n 84 | tname (TagText _) = "" 85 | tname (TagComment _) = "" 86 | tname (TagScript _ _ _) = "script" 87 | tname (TagStyle _ _ _) = "style" 88 | 89 | -- | Is this 'Tag' an opening tag? 90 | isTagOpen :: Tag -> Bool 91 | isTagOpen (TagOpen _ _ _) = True 92 | isTagOpen _ = False 93 | 94 | -- | Is this 'Tag' a closing tag? 95 | isTagClose :: Tag -> Bool 96 | isTagClose (TagClose _) = True 97 | isTagClose _ = False 98 | 99 | -- | Is this 'Tag' just some flat text? 100 | isTagText :: Tag -> Bool 101 | isTagText (TagText _) = True 102 | isTagText _ = False 103 | 104 | -- | Is this 'Tag' an HTML comment tag? 105 | isTagComment :: Tag -> Bool 106 | isTagComment (TagComment _) = True 107 | isTagComment _ = False 108 | 109 | -- | Is this 'Tag' a @@ tag? 110 | isTagScript :: Tag -> Bool 111 | isTagScript (TagScript _ _ _) = True 112 | isTagScript _ = False 113 | 114 | -- | Is this 'Tag' a @@ tag? 115 | isTagStyle :: Tag -> Bool 116 | isTagStyle (TagStyle _ _ _) = True 117 | isTagStyle _ = False 118 | 119 | -- | Get all the (opening) tags with the given name 120 | tagsNamed :: Text -> [Tag] -> [Tag] 121 | tagsNamed nam = filter (named nam) 122 | 123 | where named n (TagOpen t _ _) = toCaseFold n == toCaseFold t 124 | named _ _ = False 125 | 126 | newtype L a = L ([a] -> [a]) 127 | 128 | emptyL :: L a 129 | emptyL = L $ const [] 130 | 131 | appL :: L a -> L a -> L a 132 | appL (L l1) (L l2) = L $ l1 . l2 133 | 134 | singletonL :: a -> L a 135 | singletonL x = L (x:) 136 | 137 | toListL :: L a -> [a] 138 | toListL (L f) = f [] 139 | 140 | insertL :: a -> L a -> L a 141 | insertL x (L f) = L $ (x:) . f 142 | -------------------------------------------------------------------------------- /taggy.cabal: -------------------------------------------------------------------------------- 1 | name: taggy 2 | version: 0.2.1 3 | synopsis: Efficient and simple HTML/XML parsing library 4 | description: 5 | /taggy/ is a simple package for parsing HTML (and should work with XML) 6 | written on top of the 7 | library, which makes it one of the most efficient (space and time consumption wise) 8 | on hackage. 9 | . 10 | This is the root module of /taggy/. It reexports everything 11 | from the package. See each module's docs for details about 12 | the functions and types involved in /taggy/. 13 | . 14 | While we've been testing the parser on /many/ pages, it may still 15 | be a bit rough around the edges. Let us know on 16 | if you have any problem. 17 | . 18 | If you like to look at your HTML through 19 | various optical instruments, feel free to take a look at 20 | the companion 21 | package we've put up together. It makes HTML parsing a piece of cake. 22 | . 23 | If you want to parse a document as list of tags 24 | and go through it as some kind of stream by just picking 25 | what you need, head to "Text.Taggy.Parser" and take 26 | a look at 'Text.Taggy.Parser.taggyWith' and 27 | 'Text.Taggy.Parser.run'. 28 | . 29 | If you want to parse the document as a DOM tree and 30 | traverse it to find the information you need, 31 | use 'Text.Taggy.DOM.parseDOM'. This is especially useful 32 | when used in conjunction with . 33 | . 34 | If you build some HTML manually 35 | or just transform some existing DOM tree 36 | and want to turn it into a 'Data.Text.Lazy.Text' 37 | head to "Text.Taggy.Renderer" and look at 'Text.Taggy.Renderer.render'. 38 | homepage: http://github.com/alpmestan/taggy 39 | license: BSD3 40 | license-file: LICENSE 41 | author: Alp Mestanogullari, Vikram Verma 42 | maintainer: alpmestan@gmail.com 43 | copyright: 2014 Alp Mestanogullari, Vikram Verma 44 | category: Text, Web 45 | build-type: Simple 46 | extra-source-files: html_files/*.html 47 | data-files: html_files/*.html 48 | cabal-version: >=1.10 49 | 50 | library 51 | exposed-modules: Text.Taggy, 52 | Text.Taggy.DOM, 53 | Text.Taggy.Entities, 54 | Text.Taggy.Parser, 55 | Text.Taggy.Renderer 56 | Text.Taggy.Types 57 | build-depends: base >=4.6 && <5, 58 | blaze-html >= 0.7, 59 | blaze-markup >= 0.6, 60 | text >= 1, 61 | attoparsec >=0.11, 62 | vector >=0.7, 63 | containers, 64 | unordered-containers >= 0.2 65 | hs-source-dirs: src 66 | default-language: Haskell2010 67 | ghc-options: -Wall -O2 -fno-warn-unused-do-bind -funbox-strict-fields 68 | -- ghc-prof-options: -Wall -O2 -fno-warn-unused-do-bind -funbox-strict-fields -prof -auto-all 69 | 70 | executable taggy 71 | main-is: taggy.hs 72 | hs-source-dirs: example 73 | build-depends: base >=4.5 && <5, 74 | text >= 1, 75 | attoparsec >=0.12, 76 | taggy 77 | ghc-options: -Wall -O2 -fno-warn-unused-do-bind 78 | -- ghc-prof-options: -Wall -prof -auto-all -O2 -fno-warn-unused-do-bind -rtsopts "-with-rtsopts=-sstderr -p" 79 | default-language: Haskell2010 80 | 81 | benchmark taggytagsoup 82 | main-is: vs-tagsoup.hs 83 | hs-source-dirs: bench 84 | ghc-options: -O2 -funbox-strict-fields 85 | type: exitcode-stdio-1.0 86 | build-depends: base >= 4 && < 5, 87 | text >=1, 88 | attoparsec >=0.12, 89 | taggy, 90 | tagsoup, 91 | criterion, 92 | vector 93 | default-language: Haskell2010 94 | 95 | test-suite unit 96 | type: 97 | exitcode-stdio-1.0 98 | ghc-options: 99 | -Wall -fno-warn-unused-do-bind 100 | hs-source-dirs: 101 | tests/unit 102 | main-is: 103 | Spec.hs 104 | other-modules: 105 | Text.Taggy.DOMSpec 106 | , Text.Taggy.EntitiesSpec 107 | , Text.Taggy.ParserSpec 108 | , Text.Taggy.RendererSpec 109 | build-depends: 110 | base == 4.* 111 | , taggy 112 | , blaze-html 113 | , blaze-markup 114 | , text 115 | , hspec 116 | , hspec-attoparsec 117 | , vector 118 | , attoparsec 119 | , unordered-containers 120 | default-language: 121 | Haskell2010 122 | 123 | test-suite integration 124 | type: 125 | exitcode-stdio-1.0 126 | ghc-options: 127 | -Wall -fno-warn-unused-do-bind 128 | hs-source-dirs: 129 | tests/integration 130 | main-is: 131 | Main.hs 132 | other-modules: 133 | Paths_taggy 134 | build-depends: 135 | base == 4.* 136 | , taggy 137 | , blaze-html 138 | , blaze-markup 139 | , directory 140 | , text 141 | , hspec >= 1.11 142 | , hspec-attoparsec 143 | , vector 144 | , attoparsec 145 | , unordered-containers 146 | default-language: 147 | Haskell2010 148 | 149 | -- vim:ts=2 sw=2 150 | -------------------------------------------------------------------------------- /src/Text/Taggy/Parser.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | -- | 3 | -- Module : Text.Taggy.Parser 4 | -- Copyright : (c) 2014 Alp Mestanogullari 5 | -- License : BSD3 6 | -- Maintainer : alpmestan@gmail.com 7 | -- Stability : experimental 8 | -- 9 | -- Parse an HTML or XML document as a list of 'Tag's 10 | -- with 'taggyWith' or 'run'. 11 | module Text.Taggy.Parser 12 | ( taggyWith 13 | , run 14 | , -- * Internal parsers 15 | tagopen 16 | , tagclose 17 | , tagcomment 18 | , tagstyle 19 | , tagscript 20 | , tagtext 21 | , tag 22 | , htmlWith 23 | ) where 24 | 25 | import Control.Applicative 26 | import Data.Attoparsec.Combinator as Atto 27 | import Data.Attoparsec.Text as Atto 28 | import qualified Data.Attoparsec.Text.Lazy as AttoLT 29 | import Data.Char 30 | import Data.Monoid 31 | import Text.Taggy.Entities 32 | import Text.Taggy.Types 33 | 34 | import qualified Data.Text as T 35 | import qualified Data.Text.Lazy as LT 36 | import qualified Data.Vector as V 37 | 38 | scannerFor :: T.Text -> Int -> Char -> Maybe Int 39 | scannerFor ending = go 40 | 41 | where metadata :: V.Vector Char 42 | metadata = V.fromList . T.unpack $ ending 43 | 44 | go i c | i == V.length metadata = Nothing 45 | | metadata `V.unsafeIndex` i == c = Just (i+1) 46 | | otherwise = Just 0 47 | 48 | matchUntil :: T.Text -> Parser T.Text 49 | matchUntil endStr = 50 | T.dropEnd (T.length endStr) 51 | `fmap` scan 0 (scannerFor endStr) 52 | 53 | delimitedBy :: T.Text -> T.Text -> Parser (T.Text, T.Text, T.Text) 54 | delimitedBy begStr endStr = do 55 | string begStr 56 | mid <- matchUntil endStr 57 | return (begStr, mid, endStr) 58 | 59 | delimitedByTag :: T.Text -> Bool -> Parser (Tag, T.Text, Tag) 60 | delimitedByTag t cventities = do 61 | char '<' 62 | string t 63 | (as, _) <- attributes cventities 64 | inside <- matchUntil $ " t <> ">" 65 | return (TagOpen t as False, inside, TagClose t) 66 | 67 | tagcomment :: Parser Tag 68 | tagcomment = do 69 | (_, comm, _) <- delimitedBy "" 70 | return $ TagComment comm 71 | 72 | tagscript :: Bool -> Parser Tag 73 | tagscript cventities = do 74 | (open, scr, close) <- delimitedByTag "script" cventities 75 | return $ TagScript open scr close 76 | 77 | tagstyle :: Bool -> Parser Tag 78 | tagstyle cventities = do 79 | (open, st, close) <- delimitedByTag "style" cventities 80 | return $ TagStyle open st close 81 | 82 | possibly :: Char -> Parser () 83 | possibly c = (char c *> return ()) 84 | <|> return () 85 | 86 | ident :: Parser T.Text 87 | ident = 88 | takeWhile1 (\c -> isAlphaNum c || c `elem` ("-_:." :: String)) 89 | 90 | attribute_ident :: Parser T.Text 91 | attribute_ident = 92 | takeWhile1 (`notElem` (">=" :: String)) 93 | 94 | tagopen :: Bool -> Parser Tag 95 | tagopen cventities = do 96 | char '<' 97 | possibly '<' 98 | possibly '!' 99 | possibly '?' 100 | skipSpace 101 | i <- ident 102 | (as, autoclose) <- attributes cventities 103 | return $ TagOpen i as autoclose 104 | 105 | tagclose :: Parser Tag 106 | tagclose = do 107 | char '<' 108 | char '/' 109 | skipSpace 110 | i <- ident 111 | skipSpace 112 | possibly '>' 113 | return $ TagClose i 114 | 115 | tagtext :: Bool -> Parser Tag 116 | tagtext b = (TagText . if b then convertEntities else id) `fmap` takeWhile1 (/='<') 117 | 118 | attributes :: Bool -> Parser ([Attribute], Bool) 119 | attributes cventities = postProcess `fmap` go emptyL 120 | where 121 | go l = (do autoclose <- tagends 122 | return (l, autoclose) 123 | ) 124 | <|> ( do attr <- attribute cventities 125 | go (insertL attr l) 126 | ) 127 | 128 | tagends = skipSpace >> parseEnd 129 | 130 | parseEnd = autoClosing 131 | <|> ("?>" *> return False) 132 | <|> (">" *> return False) 133 | 134 | autoClosing = do 135 | char '/' 136 | skipSpace 137 | char '>' 138 | return True 139 | 140 | postProcess (l, b) = (toListL l, b) 141 | 142 | attribute :: Bool -> Parser Attribute 143 | attribute cventities = do 144 | skipSpace 145 | key <- quoted <|> attribute_ident 146 | value <- option "" $ fmap (if cventities then convertEntities else id) $ do 147 | possibly ' ' 148 | "=" 149 | possibly ' ' 150 | quoted <|> singlequoted <|> unquoted 151 | return $ Attribute key value 152 | 153 | where quoted = do 154 | "\"" 155 | val <- Atto.takeWhile (/='"') 156 | "\"" 157 | return val 158 | 159 | singlequoted = do 160 | "'" 161 | val <- Atto.takeWhile (/='\'') 162 | "'" 163 | return val 164 | 165 | unquoted = Atto.takeTill (\c -> isSpace c || c == '>') 166 | 167 | htmlWith :: Bool -> Parser [Tag] 168 | htmlWith cventities = go 169 | 170 | where go = do 171 | finished <- atEnd 172 | if finished 173 | then return [] 174 | else do t <- tag cventities 175 | (t:) `fmap` go 176 | 177 | tag :: Bool -> Parser Tag 178 | tag cventities = tagStructured cventities <|> tagtext cventities 179 | 180 | tagStructured :: Bool -> Parser Tag 181 | tagStructured b = 182 | tagcomment 183 | <|> tagscript b 184 | <|> tagstyle b 185 | <|> tagopen b 186 | <|> tagclose 187 | 188 | -- | Get a list of tags from an HTML document 189 | -- represented as a 'LT.Text' value. 190 | -- 191 | -- The 'Bool' lets you specify whether you want 192 | -- to convert HTML entities to their corresponding 193 | -- unicode character. ('True' means "yes convert") 194 | taggyWith :: Bool -> LT.Text -> [Tag] 195 | taggyWith cventities = 196 | either (const []) id 197 | . AttoLT.eitherResult 198 | . AttoLT.parse (htmlWith cventities) 199 | 200 | -- | Same as 'taggyWith' but hands you back a 201 | -- 'AttoLT.Result' from @attoparsec@ 202 | run :: Bool -> LT.Text -> AttoLT.Result [Tag] 203 | run cventities = AttoLT.parse (htmlWith cventities) 204 | -------------------------------------------------------------------------------- /tests/unit/Text/Taggy/ParserSpec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | module Text.Taggy.ParserSpec where 4 | 5 | import Data.Attoparsec.Text.Lazy 6 | import Data.Text.Lazy 7 | import Test.Hspec 8 | import Test.Hspec.Attoparsec (shouldParse) 9 | import qualified Test.Hspec.Attoparsec.Source (Source((~>))) 10 | import Text.Taggy.Parser 11 | import Text.Taggy.Types 12 | 13 | spec :: Spec 14 | spec = do 15 | describe "tagopen parser" $ do 16 | it "successfully parses " $ 17 | "" ~> tagopen False 18 | `shouldParse` TagOpen "b" [] False 19 | 20 | it "successfully parses " $ 21 | "" ~> tagopen False 22 | `shouldParse` TagOpen "a" [Attribute "href" "/home"] False 23 | 24 | it "successfully parses