├── .github └── workflows │ └── test.yaml ├── .gitignore ├── LICENSE ├── README.md ├── display.go ├── go.mod ├── go.sum ├── parse.go ├── parse_test.go ├── pup.go ├── pup.rb ├── selector.go └── tests ├── README.md ├── cmds.txt ├── expected_output.txt ├── index.html ├── run.py └── test /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: test 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | branches: 8 | - master 9 | 10 | jobs: 11 | test: 12 | strategy: 13 | matrix: 14 | os: [ubuntu-latest] 15 | go-version: [1.17.x, 1.16.x] 16 | runs-on: ${{ matrix.os }} 17 | steps: 18 | - name: Install Go 19 | uses: actions/setup-go@v2 20 | with: 21 | go-version: ${{ matrix.go-version }} 22 | - name: Checkout code 23 | uses: actions/checkout@v2 24 | - name: Build 25 | run: go build ./... 26 | - name: Test 27 | run: go test -v ./... 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist/ 2 | testpages/* 3 | tests/test_results.txt 4 | robots.html 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014: Eric Chiang 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 
12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pup 2 | 3 | pup is a command line tool for processing HTML. It reads from stdin, 4 | prints to stdout, and allows the user to filter parts of the page using 5 | [CSS selectors](https://developer.mozilla.org/en-US/docs/Web/Guide/CSS/Getting_started/Selectors). 6 | 7 | Inspired by [jq](http://stedolan.github.io/jq/), pup aims to be a 8 | fast and flexible way of exploring HTML from the terminal. 9 | 10 | ## Install 11 | 12 | Direct downloads are available through the [releases page](https://github.com/EricChiang/pup/releases/latest). 13 | 14 | If you have Go installed on your computer just run `go get`. 15 | 16 | go get github.com/ericchiang/pup 17 | 18 | If you're on OS X, use [Homebrew](http://brew.sh/) to install (no Go required). 19 | 20 | brew install https://raw.githubusercontent.com/EricChiang/pup/master/pup.rb 21 | 22 | ## Quick start 23 | 24 | ```bash 25 | $ curl -s https://news.ycombinator.com/ 26 | ``` 27 | 28 | Ew, HTML. Let's run that through some pup selectors: 29 | 30 | ```bash 31 | $ curl -s https://news.ycombinator.com/ | pup 'table table tr:nth-last-of-type(n+2) td.title a' 32 | ``` 33 | 34 | Okay, how about only the links? 
35 | 36 | ```bash 37 | $ curl -s https://news.ycombinator.com/ | pup 'table table tr:nth-last-of-type(n+2) td.title a attr{href}' 38 | ``` 39 | 40 | Even better, let's grab the titles too: 41 | 42 | ```bash 43 | $ curl -s https://news.ycombinator.com/ | pup 'table table tr:nth-last-of-type(n+2) td.title a json{}' 44 | ``` 45 | 46 | ## Basic Usage 47 | 48 | ```bash 49 | $ cat index.html | pup [flags] '[selectors] [display function]' 50 | ``` 51 | 52 | ## Examples 53 | 54 | Download a webpage with wget. 55 | 56 | ```bash 57 | $ wget http://en.wikipedia.org/wiki/Robots_exclusion_standard -O robots.html 58 | ``` 59 | 60 | #### Clean and indent 61 | 62 | By default pup will fill in missing tags and properly indent the page. 63 | 64 | ```bash 65 | $ cat robots.html 66 | # nasty looking HTML 67 | $ cat robots.html | pup --color 68 | # cleaned, indented, and colorful HTML 69 | ``` 70 | 71 | #### Filter by tag 72 | 73 | ```bash 74 | $ cat robots.html | pup 'title' 75 |
tag indicates that the text within it should always be formatted 70 | // as is. See https://github.com/ericchiang/pup/issues/33 71 | func (t TreeDisplayer) printPre(n *html.Node) { 72 | switch n.Type { 73 | case html.TextNode: 74 | s := n.Data 75 | if pupEscapeHTML { 76 | // don't escape javascript 77 | if n.Parent == nil || n.Parent.DataAtom != atom.Script { 78 | s = html.EscapeString(s) 79 | } 80 | } 81 | fmt.Print(s) 82 | for c := n.FirstChild; c != nil; c = c.NextSibling { 83 | t.printPre(c) 84 | } 85 | case html.ElementNode: 86 | fmt.Printf("<%s", n.Data) 87 | for _, a := range n.Attr { 88 | val := a.Val 89 | if pupEscapeHTML { 90 | val = html.EscapeString(val) 91 | } 92 | fmt.Printf(` %s="%s"`, a.Key, val) 93 | } 94 | fmt.Print(">") 95 | if !isVoidElement(n) { 96 | for c := n.FirstChild; c != nil; c = c.NextSibling { 97 | t.printPre(c) 98 | } 99 | fmt.Printf("</%s>", n.Data) // closing tag; void elements never get one 100 | } 101 | case html.CommentNode: 102 | data := n.Data 103 | if pupEscapeHTML { 104 | data = html.EscapeString(data) 105 | } 106 | fmt.Printf("<!--%s-->\n", data) // emit the comment text inside HTML comment delimiters 107 | for c := n.FirstChild; c != nil; c = c.NextSibling { 108 | t.printPre(c) 109 | } 110 | case html.DoctypeNode, html.DocumentNode: 111 | for c := n.FirstChild; c != nil; c = c.NextSibling { 112 | t.printPre(c) 113 | } 114 | } 115 | } 116 | 117 | // Print a node and all of its children, limited in depth by pupMaxPrintLevel.
118 | func (t TreeDisplayer) printNode(n *html.Node, level int) { 119 | switch n.Type { 120 | case html.TextNode: 121 | s := n.Data 122 | if pupEscapeHTML { 123 | // don't escape javascript 124 | if n.Parent == nil || n.Parent.DataAtom != atom.Script { 125 | s = html.EscapeString(s) 126 | } 127 | } 128 | s = strings.TrimSpace(s) 129 | if s != "" { 130 | t.printIndent(level) 131 | fmt.Println(s) 132 | } 133 | case html.ElementNode: 134 | t.printIndent(level) 135 | // TODO: allow pre with color 136 | if n.DataAtom == atom.Pre && !pupPrintColor && pupPreformatted { 137 | t.printPre(n) 138 | fmt.Println() 139 | return 140 | } 141 | if pupPrintColor { 142 | tokenColor.Print("<") 143 | tagColor.Printf("%s", n.Data) 144 | } else { 145 | fmt.Printf("<%s", n.Data) 146 | } 147 | for _, a := range n.Attr { 148 | val := a.Val 149 | if pupEscapeHTML { 150 | val = html.EscapeString(val) 151 | } 152 | if pupPrintColor { 153 | fmt.Print(" ") 154 | attrKeyColor.Printf("%s", a.Key) 155 | tokenColor.Print("=") 156 | quoteColor.Printf(`"%s"`, val) 157 | } else { 158 | fmt.Printf(` %s="%s"`, a.Key, val) 159 | } 160 | } 161 | if pupPrintColor { 162 | tokenColor.Println(">") 163 | } else { 164 | fmt.Println(">") 165 | } 166 | if !isVoidElement(n) { 167 | t.printChildren(n, level+1) 168 | t.printIndent(level) 169 | if pupPrintColor { 170 | tokenColor.Print("</") 171 | tagColor.Printf("%s", n.Data) 172 | tokenColor.Println(">") 173 | } else { 174 | fmt.Printf("</%s>\n", n.Data) // closing tag; void elements never get one 175 | } 176 | } 177 | case html.CommentNode: 178 | t.printIndent(level) 179 | data := n.Data 180 | if pupEscapeHTML { 181 | data = html.EscapeString(data) 182 | } 183 | if pupPrintColor { 184 | commentColor.Printf("<!--%s-->\n", data) 185 | } else { 186 | fmt.Printf("<!--%s-->\n", data) 187 | } 188 | t.printChildren(n, level) 189 | case html.DoctypeNode, html.DocumentNode: 190 | t.printChildren(n, level) 191 | } 192 | } 193 | 194 | func (t TreeDisplayer) printChildren(n *html.Node, level int) { 195 | if pupMaxPrintLevel > -1 { 196 | if
level >= pupMaxPrintLevel { 197 | t.printIndent(level) 198 | fmt.Println("...") 199 | return 200 | } 201 | } 202 | child := n.FirstChild 203 | for child != nil { 204 | t.printNode(child, level) 205 | child = child.NextSibling 206 | } 207 | } 208 | 209 | func (t TreeDisplayer) printIndent(level int) { 210 | for ; level > 0; level-- { 211 | fmt.Print(pupIndentString) 212 | } 213 | } 214 | 215 | // Print the text of a node 216 | type TextDisplayer struct{} 217 | 218 | func (t TextDisplayer) Display(nodes []*html.Node) { 219 | for _, node := range nodes { 220 | if node.Type == html.TextNode { 221 | data := node.Data 222 | if pupEscapeHTML { 223 | // don't escape javascript 224 | if node.Parent == nil || node.Parent.DataAtom != atom.Script { 225 | data = html.EscapeString(data) 226 | } 227 | } 228 | fmt.Println(data) 229 | } 230 | children := []*html.Node{} 231 | child := node.FirstChild 232 | for child != nil { 233 | children = append(children, child) 234 | child = child.NextSibling 235 | } 236 | t.Display(children) 237 | } 238 | } 239 | 240 | // Print the attribute of a node 241 | type AttrDisplayer struct { 242 | Attr string 243 | } 244 | 245 | func (a AttrDisplayer) Display(nodes []*html.Node) { 246 | for _, node := range nodes { 247 | attributes := node.Attr 248 | for _, attr := range attributes { 249 | if attr.Key == a.Attr { 250 | val := attr.Val 251 | if pupEscapeHTML { 252 | val = html.EscapeString(val) 253 | } 254 | fmt.Printf("%s\n", val) 255 | } 256 | } 257 | } 258 | } 259 | 260 | // Print nodes as a JSON list 261 | type JSONDisplayer struct{} 262 | 263 | // returns a jsonifiable struct 264 | func jsonify(node *html.Node) map[string]interface{} { 265 | vals := map[string]interface{}{} 266 | if len(node.Attr) > 0 { 267 | for _, attr := range node.Attr { 268 | if pupEscapeHTML { 269 | vals[attr.Key] = html.EscapeString(attr.Val) 270 | } else { 271 | vals[attr.Key] = attr.Val 272 | } 273 | } 274 | } 275 | vals["tag"] = node.DataAtom.String() 276 | children := 
[]interface{}{} 277 | for child := node.FirstChild; child != nil; child = child.NextSibling { 278 | switch child.Type { 279 | case html.ElementNode: 280 | children = append(children, jsonify(child)) 281 | case html.TextNode: 282 | text := strings.TrimSpace(child.Data) 283 | if text != "" { 284 | if pupEscapeHTML { 285 | // don't escape javascript 286 | if node.DataAtom != atom.Script { 287 | text = html.EscapeString(text) 288 | } 289 | } 290 | // if there is already text we'll append it 291 | currText, ok := vals["text"] 292 | if ok { 293 | text = fmt.Sprintf("%s %s", currText, text) 294 | } 295 | vals["text"] = text 296 | } 297 | case html.CommentNode: 298 | comment := strings.TrimSpace(child.Data) 299 | if pupEscapeHTML { 300 | comment = html.EscapeString(comment) 301 | } 302 | currComment, ok := vals["comment"] 303 | if ok { 304 | comment = fmt.Sprintf("%s %s", currComment, comment) 305 | } 306 | vals["comment"] = comment 307 | } 308 | } 309 | if len(children) > 0 { 310 | vals["children"] = children 311 | } 312 | return vals 313 | } 314 | 315 | func (j JSONDisplayer) Display(nodes []*html.Node) { 316 | var data []byte 317 | var err error 318 | jsonNodes := []map[string]interface{}{} 319 | for _, node := range nodes { 320 | jsonNodes = append(jsonNodes, jsonify(node)) 321 | } 322 | data, err = json.MarshalIndent(&jsonNodes, "", pupIndentString) 323 | if err != nil { 324 | panic("Could not jsonify nodes") 325 | } 326 | fmt.Printf("%s\n", data) 327 | } 328 | 329 | // Print the number of features returned 330 | type NumDisplayer struct{} 331 | 332 | func (d NumDisplayer) Display(nodes []*html.Node) { 333 | fmt.Println(len(nodes)) 334 | } 335 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/ericchiang/pup 2 | 3 | go 1.13 4 | 5 | require ( 6 | github.com/fatih/color v1.0.0 7 | github.com/mattn/go-colorable v0.0.5 8 | 
github.com/mattn/go-isatty v0.0.0-20151211000621-56b76bdf51f7 // indirect 9 | golang.org/x/net v0.0.0-20160720084139-4d38db76854b 10 | golang.org/x/sys v0.0.0-20160717071931-a646d33e2ee3 // indirect 11 | golang.org/x/text v0.0.0-20160719205907-0a5a09ee4409 12 | ) 13 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/fatih/color v1.0.0 h1:4zdNjpoprR9fed2QRCPb2VTPU4UFXEtJc9Vc+sgXkaQ= 2 | github.com/fatih/color v1.0.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= 3 | github.com/mattn/go-colorable v0.0.5 h1:X1IeP+MaFWC+vpbhw3y426rQftzXSj+N7eJFnBEMBfE= 4 | github.com/mattn/go-colorable v0.0.5/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= 5 | github.com/mattn/go-isatty v0.0.0-20151211000621-56b76bdf51f7 h1:owMyzMR4QR+jSdlfkX9jPU3rsby4++j99BfbtgVr6ZY= 6 | github.com/mattn/go-isatty v0.0.0-20151211000621-56b76bdf51f7/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= 7 | golang.org/x/net v0.0.0-20160720084139-4d38db76854b h1:2lHDZItrxmjk3OXnITVKcHWo6qQYJSm4q2pmvciVkxo= 8 | golang.org/x/net v0.0.0-20160720084139-4d38db76854b/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 9 | golang.org/x/sys v0.0.0-20160717071931-a646d33e2ee3 h1:ZLExsLvnoqWSw6JB6k6RjWobIHGR3NG9dzVANJ7SVKc= 10 | golang.org/x/sys v0.0.0-20160717071931-a646d33e2ee3/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 11 | golang.org/x/text v0.0.0-20160719205907-0a5a09ee4409 h1:ImTDOALQ1AOSGXgapb9Q1tOcHlxpQXZCPSIMKLce0JU= 12 | golang.org/x/text v0.0.0-20160719205907-0a5a09ee4409/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 13 | -------------------------------------------------------------------------------- /parse.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "os" 7 | "strconv" 8 | "strings" 9 | 10 | "golang.org/x/net/html" 11 | 
"golang.org/x/net/html/charset" 12 | "golang.org/x/text/transform" 13 | ) 14 | 15 | var ( 16 | pupIn io.ReadCloser = os.Stdin 17 | pupCharset string = "" 18 | pupMaxPrintLevel int = -1 19 | pupPreformatted bool = false 20 | pupPrintColor bool = false 21 | pupEscapeHTML bool = true 22 | pupIndentString string = " " 23 | pupDisplayer Displayer = TreeDisplayer{} 24 | ) 25 | 26 | // Parse the html while handling the charset 27 | func ParseHTML(r io.Reader, cs string) (*html.Node, error) { 28 | var err error 29 | if cs == "" { 30 | // attempt to guess the charset of the HTML document 31 | r, err = charset.NewReader(r, "") 32 | if err != nil { 33 | return nil, err 34 | } 35 | } else { 36 | // let the user specify the charset 37 | e, name := charset.Lookup(cs) 38 | if name == "" { 39 | return nil, fmt.Errorf("'%s' is not a valid charset", cs) 40 | } 41 | r = transform.NewReader(r, e.NewDecoder()) 42 | } 43 | return html.Parse(r) 44 | } 45 | 46 | func PrintHelp(w io.Writer, exitCode int) { 47 | helpString := `Usage 48 | pup [flags] [selectors] [optional display function] 49 | Version 50 | %s 51 | Flags 52 | -c --color print result with color 53 | -f --file file to read from 54 | -h --help display this help 55 | -i --indent number of spaces to use for indent or character 56 | -n --number print number of elements selected 57 | -l --limit restrict number of levels printed 58 | -p --plain don't escape html 59 | --pre preserve preformatted text 60 | --charset specify the charset for pup to use 61 | --version display version 62 | ` 63 | fmt.Fprintf(w, helpString, VERSION) 64 | os.Exit(exitCode) 65 | } 66 | 67 | func ParseArgs() ([]string, error) { 68 | cmds, err := ProcessFlags(os.Args[1:]) 69 | if err != nil { 70 | return []string{}, err 71 | } 72 | return ParseCommands(strings.Join(cmds, " ")) 73 | } 74 | 75 | // Process command arguments and return all non-flags. 
76 | func ProcessFlags(cmds []string) (nonFlagCmds []string, err error) { 77 | var i int 78 | defer func() { 79 | if r := recover(); r != nil { 80 | err = fmt.Errorf("Option '%s' requires an argument", cmds[i]) 81 | } 82 | }() 83 | nonFlagCmds = make([]string, len(cmds)) 84 | n := 0 85 | for i = 0; i < len(cmds); i++ { 86 | cmd := cmds[i] 87 | switch cmd { 88 | case "-c", "--color": 89 | pupPrintColor = true 90 | case "-p", "--plain": 91 | pupEscapeHTML = false 92 | case "--pre": 93 | pupPreformatted = true 94 | case "-f", "--file": 95 | filename := cmds[i+1] 96 | pupIn, err = os.Open(filename) 97 | if err != nil { 98 | fmt.Fprintf(os.Stderr, "%s\n", err.Error()) 99 | os.Exit(2) 100 | } 101 | i++ 102 | case "-h", "--help": 103 | PrintHelp(os.Stdout, 0) 104 | case "-i", "--indent": 105 | indentLevel, err := strconv.Atoi(cmds[i+1]) 106 | if err == nil { 107 | pupIndentString = strings.Repeat(" ", indentLevel) 108 | } else { 109 | pupIndentString = cmds[i+1] 110 | } 111 | i++ 112 | case "-l", "--limit": 113 | pupMaxPrintLevel, err = strconv.Atoi(cmds[i+1]) 114 | if err != nil { 115 | return []string{}, fmt.Errorf("Argument for '%s' must be numeric", cmd) 116 | } 117 | i++ 118 | case "--charset": 119 | pupCharset = cmds[i+1] 120 | i++ 121 | case "--version": 122 | fmt.Println(VERSION) 123 | os.Exit(0) 124 | case "-n", "--number": 125 | pupDisplayer = NumDisplayer{} 126 | default: // HasPrefix is safe on an empty argument, which cmd[0] was not (it panicked into the recover above) 127 | if strings.HasPrefix(cmd, "-") { 128 | return []string{}, fmt.Errorf("Unrecognized flag '%s'", cmd) 129 | } 130 | nonFlagCmds[n] = cmds[i] 131 | n++ 132 | } 133 | } 134 | return nonFlagCmds[:n], nil 135 | } 136 | 137 | // Split a string with awareness for quoted text and commas 138 | func ParseCommands(cmdString string) ([]string, error) { 139 | cmds := []string{} 140 | last, next, max := 0, 0, len(cmdString) 141 | for { 142 | // if we're at the end of the string, return 143 | if next == max { 144 | if next > last { 145 | cmds = append(cmds, cmdString[last:next]) 146 | } 147 | return cmds, nil 148 | }
149 | // evaluate a rune 150 | c := cmdString[next] 151 | switch c { 152 | case ' ': 153 | if next > last { 154 | cmds = append(cmds, cmdString[last:next]) 155 | } 156 | last = next + 1 157 | case ',': 158 | if next > last { 159 | cmds = append(cmds, cmdString[last:next]) 160 | } 161 | cmds = append(cmds, ",") 162 | last = next + 1 163 | case '\'', '"': 164 | // for quotes, consume runes until the quote has ended 165 | quoteChar := c 166 | for { 167 | next++ 168 | if next == max { 169 | return []string{}, fmt.Errorf("Unmatched open quote (%c)", quoteChar) 170 | } 171 | if cmdString[next] == '\\' { 172 | next++ 173 | if next == max { 174 | return []string{}, fmt.Errorf("Unmatched open quote (%c)", quoteChar) 175 | } 176 | } else if cmdString[next] == quoteChar { 177 | break 178 | } 179 | } 180 | } 181 | next++ 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /parse_test.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | type parseCmdTest struct { 8 | input string 9 | split []string 10 | ok bool 11 | } 12 | 13 | var parseCmdTests = []parseCmdTest{ 14 | parseCmdTest{`w1 w2`, []string{`w1`, `w2`}, true}, 15 | parseCmdTest{`w1 w2 w3`, []string{`w1`, `w2`, `w3`}, true}, 16 | parseCmdTest{`w1 'w2 w3'`, []string{`w1`, `'w2 w3'`}, true}, 17 | parseCmdTest{`w1 "w2 w3"`, []string{`w1`, `"w2 w3"`}, true}, 18 | parseCmdTest{`w1 "w2 w3"`, []string{`w1`, `"w2 w3"`}, true}, 19 | parseCmdTest{`w1 'w2 w3'`, []string{`w1`, `'w2 w3'`}, true}, 20 | parseCmdTest{`w1"w2 w3"`, []string{`w1"w2 w3"`}, true}, 21 | parseCmdTest{`w1'w2 w3'`, []string{`w1'w2 w3'`}, true}, 22 | parseCmdTest{`w1"w2 'w3"`, []string{`w1"w2 'w3"`}, true}, 23 | parseCmdTest{`w1'w2 "w3'`, []string{`w1'w2 "w3'`}, true}, 24 | parseCmdTest{`"w1 w2" "w3"`, []string{`"w1 w2"`, `"w3"`}, true}, 25 | parseCmdTest{`'w1 w2' "w3"`, []string{`'w1 w2'`, `"w3"`}, true}, 26 | 
parseCmdTest{`'w1 \'w2' "w3"`, []string{`'w1 \'w2'`, `"w3"`}, true}, 27 | parseCmdTest{`'w1 \'w2 "w3"`, []string{}, false}, 28 | parseCmdTest{`w1 'w2 w3'"`, []string{}, false}, 29 | parseCmdTest{`w1 "w2 w3"'`, []string{}, false}, 30 | parseCmdTest{`w1 ' "w2 w3"`, []string{}, false}, 31 | parseCmdTest{`w1 " 'w2 w3'`, []string{}, false}, 32 | parseCmdTest{`w1"w2 w3""`, []string{}, false}, 33 | parseCmdTest{`w1'w2 w3''`, []string{}, false}, 34 | parseCmdTest{`w1"w2 'w3""`, []string{}, false}, 35 | parseCmdTest{`w1'w2 "w3''`, []string{}, false}, 36 | parseCmdTest{`"w1 w2" "w3"'`, []string{}, false}, 37 | parseCmdTest{`'w1 w2' "w3"'`, []string{}, false}, 38 | parseCmdTest{`w1,"w2 w3"`, []string{`w1`, `,`, `"w2 w3"`}, true}, 39 | parseCmdTest{`w1,'w2 w3'`, []string{`w1`, `,`, `'w2 w3'`}, true}, 40 | parseCmdTest{`w1 , "w2 w3"`, []string{`w1`, `,`, `"w2 w3"`}, true}, 41 | parseCmdTest{`w1 , 'w2 w3'`, []string{`w1`, `,`, `'w2 w3'`}, true}, 42 | parseCmdTest{`w1, "w2 w3"`, []string{`w1`, `,`, `"w2 w3"`}, true}, 43 | parseCmdTest{`w1, 'w2 w3'`, []string{`w1`, `,`, `'w2 w3'`}, true}, 44 | parseCmdTest{`w1 ,"w2 w3"`, []string{`w1`, `,`, `"w2 w3"`}, true}, 45 | parseCmdTest{`w1 ,'w2 w3'`, []string{`w1`, `,`, `'w2 w3'`}, true}, 46 | parseCmdTest{`w1"w2, w3"`, []string{`w1"w2, w3"`}, true}, 47 | parseCmdTest{`w1'w2, w3'`, []string{`w1'w2, w3'`}, true}, 48 | parseCmdTest{`w1"w2, 'w3"`, []string{`w1"w2, 'w3"`}, true}, 49 | parseCmdTest{`w1'w2, "w3'`, []string{`w1'w2, "w3'`}, true}, 50 | parseCmdTest{`"w1, w2" "w3"`, []string{`"w1, w2"`, `"w3"`}, true}, 51 | parseCmdTest{`'w1, w2' "w3"`, []string{`'w1, w2'`, `"w3"`}, true}, 52 | parseCmdTest{`'w1, \'w2' "w3"`, []string{`'w1, \'w2'`, `"w3"`}, true}, 53 | parseCmdTest{`h1, .article-teaser, .article-content`, []string{ 54 | `h1`, `,`, `.article-teaser`, `,`, `.article-content`, 55 | }, true}, 56 | parseCmdTest{`h1 ,.article-teaser ,.article-content`, []string{ 57 | `h1`, `,`, `.article-teaser`, `,`, `.article-content`, 58 | }, true}, 
59 | parseCmdTest{`h1 , .article-teaser , .article-content`, []string{ 60 | `h1`, `,`, `.article-teaser`, `,`, `.article-content`, 61 | }, true}, 62 | } 63 | 64 | func sliceEq(s1, s2 []string) bool { 65 | if len(s1) != len(s2) { 66 | return false 67 | } 68 | for i := range s1 { 69 | if s1[i] != s2[i] { 70 | return false 71 | } 72 | } 73 | return true 74 | } 75 | 76 | func TestParseCommands(t *testing.T) { 77 | for _, test := range parseCmdTests { 78 | parsed, err := ParseCommands(test.input) 79 | if test.ok != (err == nil) { 80 | t.Errorf("`%s`: should have cause error? %v", test.input, !test.ok) 81 | } else if !sliceEq(test.split, parsed) { 82 | t.Errorf("`%s`: `%s`: `%s`", test.input, test.split, parsed) 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /pup.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | 7 | "golang.org/x/net/html" 8 | ) 9 | 10 | // _=,_ 11 | // o_/6 /#\ 12 | // \__ |##/ 13 | // ='|--\ 14 | // / #'-. 15 | // \#|_ _'-. 
/ 16 | // |/ \_( # |" 17 | // C/ ,--___/ 18 | 19 | var VERSION string = "0.4.0" 20 | 21 | func main() { 22 | // process flags and arguments 23 | cmds, err := ParseArgs() 24 | if err != nil { 25 | fmt.Fprintf(os.Stderr, "%s\n", err.Error()) 26 | os.Exit(2) 27 | } 28 | 29 | // Parse the input and get the root node 30 | root, err := ParseHTML(pupIn, pupCharset) 31 | if err != nil { 32 | fmt.Fprintf(os.Stderr, "%s\n", err.Error()) 33 | os.Exit(2) 34 | } 35 | pupIn.Close() 36 | 37 | // Parse the selectors 38 | selectorFuncs := []SelectorFunc{} 39 | funcGenerator := Select 40 | var cmd string 41 | for len(cmds) > 0 { 42 | cmd, cmds = cmds[0], cmds[1:] 43 | if len(cmds) == 0 { 44 | if err := ParseDisplayer(cmd); err == nil { 45 | continue 46 | } 47 | } 48 | switch cmd { 49 | case "*": // select all 50 | continue 51 | case ">": 52 | funcGenerator = SelectFromChildren 53 | case "+": 54 | funcGenerator = SelectNextSibling 55 | case ",": // nil will signify a comma 56 | selectorFuncs = append(selectorFuncs, nil) 57 | default: 58 | selector, err := ParseSelector(cmd) 59 | if err != nil { 60 | fmt.Fprintf(os.Stderr, "Selector parsing error: %s\n", err.Error()) 61 | os.Exit(2) 62 | } 63 | selectorFuncs = append(selectorFuncs, funcGenerator(selector)) 64 | funcGenerator = Select 65 | } 66 | } 67 | 68 | selectedNodes := []*html.Node{} 69 | currNodes := []*html.Node{root} 70 | for _, selectorFunc := range selectorFuncs { 71 | if selectorFunc == nil { // hit a comma 72 | selectedNodes = append(selectedNodes, currNodes...) 73 | currNodes = []*html.Node{root} 74 | } else { 75 | currNodes = selectorFunc(currNodes) 76 | } 77 | } 78 | selectedNodes = append(selectedNodes, currNodes...) 
79 | pupDisplayer.Display(selectedNodes) 80 | } 81 | -------------------------------------------------------------------------------- /pup.rb: -------------------------------------------------------------------------------- 1 | # This file was generated by release.sh 2 | require 'formula' 3 | class Pup < Formula 4 | homepage 'https://github.com/ericchiang/pup' 5 | version '0.4.0' 6 | 7 | if Hardware::CPU.is_64_bit? 8 | url 'https://github.com/ericchiang/pup/releases/download/v0.4.0/pup_v0.4.0_darwin_amd64.zip' 9 | sha256 'c539a697efee2f8e56614a54cb3b215338e00de1f6a7c2fa93144ab6e1db8ebe' 10 | else 11 | url 'https://github.com/ericchiang/pup/releases/download/v0.4.0/pup_v0.4.0_darwin_386.zip' 12 | sha256 '75c27caa0008a9cc639beb7506077ad9f32facbffcc4e815e999eaf9588a527e' 13 | end 14 | 15 | def install 16 | bin.install 'pup' 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /selector.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "regexp" 7 | "strconv" 8 | "strings" 9 | "text/scanner" 10 | 11 | "golang.org/x/net/html" 12 | ) 13 | 14 | type Selector interface { 15 | Match(node *html.Node) bool 16 | } 17 | 18 | type SelectorFunc func(nodes []*html.Node) []*html.Node 19 | 20 | func Select(s Selector) SelectorFunc { 21 | // have to define first to be able to do recursion 22 | var selectChildren func(node *html.Node) []*html.Node 23 | selectChildren = func(node *html.Node) []*html.Node { 24 | selected := []*html.Node{} 25 | for child := node.FirstChild; child != nil; child = child.NextSibling { 26 | if s.Match(child) { 27 | selected = append(selected, child) 28 | } else { 29 | selected = append(selected, selectChildren(child)...) 
30 | } 31 | } 32 | return selected 33 | } 34 | return func(nodes []*html.Node) []*html.Node { 35 | selected := []*html.Node{} 36 | for _, node := range nodes { 37 | selected = append(selected, selectChildren(node)...) 38 | } 39 | return selected 40 | } 41 | } 42 | 43 | // Defined for the '+' selector: keeps the first element sibling after each node if it matches 44 | func SelectNextSibling(s Selector) SelectorFunc { 45 | return func(nodes []*html.Node) []*html.Node { 46 | selected := []*html.Node{} 47 | for _, node := range nodes { 48 | for ns := node.NextSibling; ns != nil; ns = ns.NextSibling { 49 | if ns.Type == html.ElementNode { 50 | if s.Match(ns) { 51 | selected = append(selected, ns) 52 | } 53 | break 54 | } 55 | } 56 | } 57 | return selected 58 | } 59 | } 60 | 61 | // Defined for the '>' selector: keeps the direct children of each node that match 62 | func SelectFromChildren(s Selector) SelectorFunc { 63 | return func(nodes []*html.Node) []*html.Node { 64 | selected := []*html.Node{} 65 | for _, node := range nodes { 66 | for c := node.FirstChild; c != nil; c = c.NextSibling { 67 | if s.Match(c) { 68 | selected = append(selected, c) 69 | } 70 | } 71 | } 72 | return selected 73 | } 74 | } 75 | 76 | type PseudoClass func(*html.Node) bool 77 | 78 | type CSSSelector struct { 79 | Tag string 80 | Attrs map[string]*regexp.Regexp 81 | Pseudo PseudoClass 82 | } 83 | 84 | func (s CSSSelector) Match(node *html.Node) bool { 85 | if node.Type != html.ElementNode { 86 | return false 87 | } 88 | if s.Tag != "" { 89 | if s.Tag != node.DataAtom.String() { 90 | return false 91 | } 92 | } 93 | for attrKey, matcher := range s.Attrs { 94 | matched := false 95 | for _, attr := range node.Attr { 96 | if attrKey == attr.Key { 97 | if !matcher.MatchString(attr.Val) { 98 | return false 99 | } 100 | matched = true 101 | break 102 | } 103 | } 104 | if !matched { 105 | return false 106 | } 107 | } 108 | if s.Pseudo == nil { 109 | return true 110 | } 111 | return s.Pseudo(node) 112 | } 113 | 114 | // Parse a selector 115 | // e.g.
`div#my-button.btn[href^="http"]` 116 | func ParseSelector(cmd string) (selector CSSSelector, err error) { 117 | selector = CSSSelector{ 118 | Tag: "", 119 | Attrs: map[string]*regexp.Regexp{}, 120 | Pseudo: nil, 121 | } 122 | var s scanner.Scanner 123 | s.Init(strings.NewReader(cmd)) 124 | err = ParseTagMatcher(&selector, s) 125 | return 126 | } 127 | 128 | // Parse the initial tag 129 | // e.g. `div` 130 | func ParseTagMatcher(selector *CSSSelector, s scanner.Scanner) error { 131 | tag := bytes.NewBuffer([]byte{}) 132 | defer func() { 133 | selector.Tag = tag.String() 134 | }() 135 | for { 136 | c := s.Next() 137 | switch c { 138 | case scanner.EOF: 139 | return nil 140 | case '.': 141 | return ParseClassMatcher(selector, s) 142 | case '#': 143 | return ParseIdMatcher(selector, s) 144 | case '[': 145 | return ParseAttrMatcher(selector, s) 146 | case ':': 147 | return ParsePseudo(selector, s) 148 | default: 149 | if _, err := tag.WriteRune(c); err != nil { 150 | return err 151 | } 152 | } 153 | } 154 | } 155 | 156 | // Parse a class matcher 157 | // e.g. `.btn` 158 | func ParseClassMatcher(selector *CSSSelector, s scanner.Scanner) error { 159 | var class bytes.Buffer 160 | defer func() { 161 | regexpStr := `(\A|\s)` + regexp.QuoteMeta(class.String()) + `(\s|\z)` 162 | selector.Attrs["class"] = regexp.MustCompile(regexpStr) 163 | }() 164 | for { 165 | c := s.Next() 166 | switch c { 167 | case scanner.EOF: 168 | return nil 169 | case '.': 170 | return ParseClassMatcher(selector, s) 171 | case '#': 172 | return ParseIdMatcher(selector, s) 173 | case '[': 174 | return ParseAttrMatcher(selector, s) 175 | case ':': 176 | return ParsePseudo(selector, s) 177 | default: 178 | if _, err := class.WriteRune(c); err != nil { 179 | return err 180 | } 181 | } 182 | } 183 | } 184 | 185 | // Parse an id matcher 186 | // e.g. 
`#my-picture` 187 | func ParseIdMatcher(selector *CSSSelector, s scanner.Scanner) error { 188 | var id bytes.Buffer 189 | defer func() { 190 | regexpStr := `^` + regexp.QuoteMeta(id.String()) + `$` 191 | selector.Attrs["id"] = regexp.MustCompile(regexpStr) 192 | }() 193 | for { 194 | c := s.Next() 195 | switch c { 196 | case scanner.EOF: 197 | return nil 198 | case '.': 199 | return ParseClassMatcher(selector, s) 200 | case '#': 201 | return ParseIdMatcher(selector, s) 202 | case '[': 203 | return ParseAttrMatcher(selector, s) 204 | case ':': 205 | return ParsePseudo(selector, s) 206 | default: 207 | if _, err := id.WriteRune(c); err != nil { 208 | return err 209 | } 210 | } 211 | } 212 | } 213 | 214 | // Parse an attribute matcher 215 | // e.g. `[attr^="http"]` 216 | func ParseAttrMatcher(selector *CSSSelector, s scanner.Scanner) error { 217 | var attrKey bytes.Buffer 218 | var attrVal bytes.Buffer 219 | hasMatchVal := false 220 | matchType := '=' 221 | defer func() { 222 | if hasMatchVal { 223 | var regexpStr string 224 | switch matchType { 225 | case '=': 226 | regexpStr = `^` + regexp.QuoteMeta(attrVal.String()) + `$` 227 | case '*': 228 | regexpStr = regexp.QuoteMeta(attrVal.String()) 229 | case '$': 230 | regexpStr = regexp.QuoteMeta(attrVal.String()) + `$` 231 | case '^': 232 | regexpStr = `^` + regexp.QuoteMeta(attrVal.String()) 233 | case '~': 234 | regexpStr = `(\A|\s)` + regexp.QuoteMeta(attrVal.String()) + `(\s|\z)` 235 | } 236 | selector.Attrs[attrKey.String()] = regexp.MustCompile(regexpStr) 237 | } else { 238 | selector.Attrs[attrKey.String()] = regexp.MustCompile(`^.*$`) 239 | } 240 | }() 241 | // After reaching ']' proceed 242 | proceed := func() error { 243 | switch s.Next() { 244 | case scanner.EOF: 245 | return nil 246 | case '.': 247 | return ParseClassMatcher(selector, s) 248 | case '#': 249 | return ParseIdMatcher(selector, s) 250 | case '[': 251 | return ParseAttrMatcher(selector, s) 252 | case ':': 253 | return ParsePseudo(selector, s) 254 
| default: 255 | return fmt.Errorf("Expected selector indicator after ']'") 256 | } 257 | } 258 | // Parse the attribute key matcher 259 | for !hasMatchVal { 260 | c := s.Next() 261 | switch c { 262 | case scanner.EOF: 263 | return fmt.Errorf("Unmatched open brace '['") 264 | case ']': 265 | // No attribute value matcher, proceed! 266 | return proceed() 267 | case '$', '^', '~', '*': 268 | matchType = c 269 | hasMatchVal = true 270 | if s.Next() != '=' { 271 | return fmt.Errorf("'%c' must be followed by a '='", matchType) 272 | } 273 | case '=': 274 | matchType = c 275 | hasMatchVal = true 276 | default: 277 | if _, err := attrKey.WriteRune(c); err != nil { 278 | return err 279 | } 280 | } 281 | } 282 | // figure out if the value is quoted 283 | c := s.Next() 284 | inQuote := false 285 | switch c { 286 | case scanner.EOF: 287 | return fmt.Errorf("Unmatched open brace '['") 288 | case ']': 289 | return proceed() 290 | case '"': 291 | inQuote = true 292 | default: 293 | if _, err := attrVal.WriteRune(c); err != nil { 294 | return err 295 | } 296 | } 297 | if inQuote { 298 | for { 299 | c := s.Next() 300 | switch c { 301 | case '\\': 302 | // consume another character 303 | if c = s.Next(); c == scanner.EOF { 304 | return fmt.Errorf("Unmatched open brace '['") 305 | } 306 | case '"': 307 | switch s.Next() { 308 | case ']': 309 | return proceed() 310 | default: 311 | return fmt.Errorf("Quote must end at ']'") 312 | } 313 | } 314 | if _, err := attrVal.WriteRune(c); err != nil { 315 | return err 316 | } 317 | } 318 | } else { 319 | for { 320 | c := s.Next() 321 | switch c { 322 | case scanner.EOF: 323 | return fmt.Errorf("Unmatched open brace '['") 324 | case ']': 325 | // No attribute value matcher, proceed! 
326 | return proceed() 327 | } 328 | if _, err := attrVal.WriteRune(c); err != nil { 329 | return err 330 | } 331 | } 332 | } 333 | } 334 | 335 | // Parse the selector after ':' 336 | func ParsePseudo(selector *CSSSelector, s scanner.Scanner) error { 337 | if selector.Pseudo != nil { 338 | return fmt.Errorf("Combined multiple pseudo classes") 339 | } 340 | var b bytes.Buffer 341 | for s.Peek() != scanner.EOF { 342 | if _, err := b.WriteRune(s.Next()); err != nil { 343 | return err 344 | } 345 | } 346 | cmd := b.String() 347 | var err error 348 | switch { 349 | case cmd == "empty": 350 | selector.Pseudo = func(n *html.Node) bool { 351 | return n.FirstChild == nil 352 | } 353 | case cmd == "first-child": 354 | selector.Pseudo = firstChildPseudo 355 | case cmd == "last-child": 356 | selector.Pseudo = lastChildPseudo 357 | case cmd == "only-child": 358 | selector.Pseudo = func(n *html.Node) bool { 359 | return firstChildPseudo(n) && lastChildPseudo(n) 360 | } 361 | case cmd == "first-of-type": 362 | selector.Pseudo = firstOfTypePseudo 363 | case cmd == "last-of-type": 364 | selector.Pseudo = lastOfTypePseudo 365 | case cmd == "only-of-type": 366 | selector.Pseudo = func(n *html.Node) bool { 367 | return firstOfTypePseudo(n) && lastOfTypePseudo(n) 368 | } 369 | case strings.HasPrefix(cmd, "contains("): 370 | selector.Pseudo, err = parseContainsPseudo(cmd[len("contains("):]) 371 | if err != nil { 372 | return err 373 | } 374 | case strings.HasPrefix(cmd, "nth-child("), 375 | strings.HasPrefix(cmd, "nth-last-child("), 376 | strings.HasPrefix(cmd, "nth-last-of-type("), 377 | strings.HasPrefix(cmd, "nth-of-type("): 378 | if selector.Pseudo, err = parseNthPseudo(cmd); err != nil { 379 | return err 380 | } 381 | case strings.HasPrefix(cmd, "not("): 382 | if selector.Pseudo, err = parseNotPseudo(cmd[len("not("):]); err != nil { 383 | return err 384 | } 385 | case strings.HasPrefix(cmd, "parent-of("): 386 | if selector.Pseudo, err = parseParentOfPseudo(cmd[len("parent-of("):]); 
err != nil { 387 | return err 388 | } 389 | default: 390 | return fmt.Errorf("%s not a valid pseudo class", cmd) 391 | } 392 | return nil 393 | } 394 | 395 | // :first-of-child 396 | func firstChildPseudo(n *html.Node) bool { 397 | for c := n.PrevSibling; c != nil; c = c.PrevSibling { 398 | if c.Type == html.ElementNode { 399 | return false 400 | } 401 | } 402 | return true 403 | } 404 | 405 | // :last-of-child 406 | func lastChildPseudo(n *html.Node) bool { 407 | for c := n.NextSibling; c != nil; c = c.NextSibling { 408 | if c.Type == html.ElementNode { 409 | return false 410 | } 411 | } 412 | return true 413 | } 414 | 415 | // :first-of-type 416 | func firstOfTypePseudo(node *html.Node) bool { 417 | if node.Type != html.ElementNode { 418 | return false 419 | } 420 | for n := node.PrevSibling; n != nil; n = n.PrevSibling { 421 | if n.DataAtom == node.DataAtom { 422 | return false 423 | } 424 | } 425 | return true 426 | } 427 | 428 | // :last-of-type 429 | func lastOfTypePseudo(node *html.Node) bool { 430 | if node.Type != html.ElementNode { 431 | return false 432 | } 433 | for n := node.NextSibling; n != nil; n = n.NextSibling { 434 | if n.DataAtom == node.DataAtom { 435 | return false 436 | } 437 | } 438 | return true 439 | } 440 | 441 | func parseNthPseudo(cmd string) (PseudoClass, error) { 442 | i := strings.IndexRune(cmd, '(') 443 | if i < 0 { 444 | // really, we should never get here 445 | return nil, fmt.Errorf("Fatal error, '%s' does not contain a '('", cmd) 446 | } 447 | pseudoName := cmd[:i] 448 | // Figure out how the counting function works 449 | var countNth func(*html.Node) int 450 | switch pseudoName { 451 | case "nth-child": 452 | countNth = func(n *html.Node) int { 453 | nth := 1 454 | for sib := n.PrevSibling; sib != nil; sib = sib.PrevSibling { 455 | if sib.Type == html.ElementNode { 456 | nth++ 457 | } 458 | } 459 | return nth 460 | } 461 | case "nth-of-type": 462 | countNth = func(n *html.Node) int { 463 | nth := 1 464 | for sib := 
n.PrevSibling; sib != nil; sib = sib.PrevSibling { 465 | if sib.Type == html.ElementNode && sib.DataAtom == n.DataAtom { 466 | nth++ 467 | } 468 | } 469 | return nth 470 | } 471 | case "nth-last-child": 472 | countNth = func(n *html.Node) int { 473 | nth := 1 474 | for sib := n.NextSibling; sib != nil; sib = sib.NextSibling { 475 | if sib.Type == html.ElementNode { 476 | nth++ 477 | } 478 | } 479 | return nth 480 | } 481 | case "nth-last-of-type": 482 | countNth = func(n *html.Node) int { 483 | nth := 1 484 | for sib := n.NextSibling; sib != nil; sib = sib.NextSibling { 485 | if sib.Type == html.ElementNode && sib.DataAtom == n.DataAtom { 486 | nth++ 487 | } 488 | } 489 | return nth 490 | } 491 | default: 492 | return nil, fmt.Errorf("Unrecognized pseudo '%s'", pseudoName) 493 | } 494 | 495 | nthString := cmd[i+1:] 496 | i = strings.IndexRune(nthString, ')') 497 | if i < 0 { 498 | return nil, fmt.Errorf("Unmatched '(' for pseudo class %s", pseudoName) 499 | } else if i != len(nthString)-1 { 500 | return nil, fmt.Errorf("%s(n) must end selector", pseudoName) 501 | } 502 | number := nthString[:i] 503 | 504 | // Check if the number is 'odd' or 'even' 505 | oddOrEven := -1 506 | switch number { 507 | case "odd": 508 | oddOrEven = 1 509 | case "even": 510 | oddOrEven = 0 511 | } 512 | if oddOrEven > -1 { 513 | return func(n *html.Node) bool { 514 | return n.Type == html.ElementNode && countNth(n)%2 == oddOrEven 515 | }, nil 516 | } 517 | // Check against '3n+4' pattern 518 | r := regexp.MustCompile(`([0-9]+)n[ ]?\+[ ]?([0-9])`) 519 | subMatch := r.FindAllStringSubmatch(number, -1) 520 | if len(subMatch) == 1 && len(subMatch[0]) == 3 { 521 | cycle, _ := strconv.Atoi(subMatch[0][1]) 522 | offset, _ := strconv.Atoi(subMatch[0][2]) 523 | return func(n *html.Node) bool { 524 | return n.Type == html.ElementNode && countNth(n)%cycle == offset 525 | }, nil 526 | } 527 | // check against 'n+2' pattern 528 | r = regexp.MustCompile(`n[ ]?\+[ ]?([0-9])`) 529 | subMatch = 
r.FindAllStringSubmatch(number, -1) 530 | if len(subMatch) == 1 && len(subMatch[0]) == 2 { 531 | offset, _ := strconv.Atoi(subMatch[0][1]) 532 | return func(n *html.Node) bool { 533 | return n.Type == html.ElementNode && countNth(n) >= offset 534 | }, nil 535 | } 536 | // the only other option is a numeric value 537 | nth, err := strconv.Atoi(nthString[:i]) 538 | if err != nil { 539 | return nil, err 540 | } else if nth <= 0 { 541 | return nil, fmt.Errorf("Argument to '%s' must be greater than 0", pseudoName) 542 | } 543 | return func(n *html.Node) bool { 544 | return n.Type == html.ElementNode && countNth(n) == nth 545 | }, nil 546 | } 547 | 548 | // Parse a :contains("") selector 549 | // expects the input to be everything after the open parenthesis 550 | // e.g. for `contains("Help")` the argument would be `"Help")` 551 | func parseContainsPseudo(cmd string) (PseudoClass, error) { 552 | var s scanner.Scanner 553 | s.Init(strings.NewReader(cmd)) 554 | switch s.Next() { 555 | case '"': 556 | default: 557 | return nil, fmt.Errorf("Malformed 'contains(\"\")' selector") 558 | } 559 | textToContain := bytes.NewBuffer([]byte{}) 560 | for { 561 | r := s.Next() 562 | switch r { 563 | case '"': 564 | // ')' then EOF must follow '"' 565 | if s.Next() != ')' { 566 | return nil, fmt.Errorf("Malformed 'contains(\"\")' selector") 567 | } 568 | if s.Next() != scanner.EOF { 569 | return nil, fmt.Errorf("'contains(\"\")' must end selector") 570 | } 571 | text := textToContain.String() 572 | contains := func(node *html.Node) bool { 573 | for c := node.FirstChild; c != nil; c = c.NextSibling { 574 | if c.Type == html.TextNode { 575 | if strings.Contains(c.Data, text) { 576 | return true 577 | } 578 | } 579 | } 580 | return false 581 | } 582 | return contains, nil 583 | case '\\': 584 | s.Next() 585 | case scanner.EOF: 586 | return nil, fmt.Errorf("Malformed 'contains(\"\")' selector") 587 | default: 588 | if _, err := textToContain.WriteRune(r); err != nil { 589 | return nil, err 
590 | } 591 | } 592 | } 593 | } 594 | 595 | // Parse a :not(selector) selector 596 | // expects the input to be everything after the open parenthesis 597 | // e.g. for `not(div#id)` the argument would be `div#id)` 598 | func parseNotPseudo(cmd string) (PseudoClass, error) { 599 | if len(cmd) < 2 { 600 | return nil, fmt.Errorf("malformed ':not' selector") 601 | } 602 | endQuote, cmd := cmd[len(cmd)-1], cmd[:len(cmd)-1] 603 | selector, err := ParseSelector(cmd) 604 | if err != nil { 605 | return nil, err 606 | } 607 | if endQuote != ')' { 608 | return nil, fmt.Errorf("unmatched '('") 609 | } 610 | return func(n *html.Node) bool { 611 | return !selector.Match(n) 612 | }, nil 613 | } 614 | 615 | // Parse a :parent-of(selector) selector 616 | // expects the input to be everything after the open parenthesis 617 | // e.g. for `parent-of(div#id)` the argument would be `div#id)` 618 | func parseParentOfPseudo(cmd string) (PseudoClass, error) { 619 | if len(cmd) < 2 { 620 | return nil, fmt.Errorf("malformed ':parent-of' selector") 621 | } 622 | endQuote, cmd := cmd[len(cmd)-1], cmd[:len(cmd)-1] 623 | selector, err := ParseSelector(cmd) 624 | if err != nil { 625 | return nil, err 626 | } 627 | if endQuote != ')' { 628 | return nil, fmt.Errorf("unmatched '('") 629 | } 630 | return func(n *html.Node) bool { 631 | for c := n.FirstChild; c != nil; c = c.NextSibling { 632 | if c.Type == html.ElementNode && selector.Match(c) { 633 | return true 634 | } 635 | } 636 | return false 637 | }, nil 638 | } 639 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Tests 2 | 3 | A simple set of tests to help maintain sanity. 4 | 5 | These tests don't actually test functionality they only make sure pup behaves 6 | the same after code changes. 7 | 8 | `cmds.txt` holds a list of commands to perform on `index.html`. 
9 | 10 | The output of each of these commands produces a specific sha1sum. The expected 11 | sha1sum of each command is in `expected_output.txt`. 12 | 13 | Running the `test` file (just a bash script) will run the tests and diff the 14 | output. If pup has changed at all since the last version, you'll see the sha1sums 15 | that changed and the commands that produced that change. 16 | 17 | To overwrite the current sha1sums, just run `python run.py > expected_output.txt` 18 | 19 | -------------------------------------------------------------------------------- /tests/cmds.txt: -------------------------------------------------------------------------------- 1 | #footer 2 | #footer li 3 | #footer li + a 4 | #footer li + a attr{title} 5 | #footer li > li 6 | table li 7 | table li:first-child 8 | table li:first-of-type 9 | table li:last-child 10 | table li:last-of-type 11 | table a[title="The Practice of Programming"] 12 | table a[title="The Practice of Programming"] text{} 13 | json{} 14 | text{} 15 | .after-portlet 16 | .after 17 | :empty 18 | td:empty 19 | .navbox-list li:nth-child(1) 20 | .navbox-list li:nth-child(2) 21 | .navbox-list li:nth-child(3) 22 | .navbox-list li:nth-last-child(1) 23 | .navbox-list li:nth-last-child(2) 24 | .navbox-list li:nth-last-child(3) 25 | .navbox-list li:nth-child(n+1) 26 | .navbox-list li:nth-child(3n+1) 27 | .navbox-list li:nth-last-child(n+1) 28 | .navbox-list li:nth-last-child(3n+1) 29 | :only-child 30 | .navbox-list li:only-child 31 | .summary 32 | [class=summary] 33 | [class="summary"] 34 | #toc 35 | #toc li + a 36 | #toc li + a text{} 37 | #toc li + a json{} 38 | #toc li + a + span 39 | #toc li + span 40 | #toc li > li 41 | li a:not([rel]) 42 | link, a 43 | link ,a 44 | link , a 45 | link , a sup 46 | link , a:parent-of(sup) 47 | link , a:parent-of(sup) sup 48 | li --number 49 | li -n 50 | -------------------------------------------------------------------------------- /tests/expected_output.txt: 
-------------------------------------------------------------------------------- 1 | c00fef10d36c1166cb5ac886f9d25201b720e37e #footer 2 | a7bb8dbfdd638bacad0aa9dc3674126d396b74e2 #footer li 3 | da39a3ee5e6b4b0d3255bfef95601890afd80709 #footer li + a 4 | da39a3ee5e6b4b0d3255bfef95601890afd80709 #footer li + a attr{title} 5 | da39a3ee5e6b4b0d3255bfef95601890afd80709 #footer li > li 6 | a92e50c09cd56970625ac3b74efbddb83b2731bb table li 7 | 505c04a42e0084cd95560c233bd3a81b2c59352d table li:first-child 8 | 505c04a42e0084cd95560c233bd3a81b2c59352d table li:first-of-type 9 | 66950e746590d7f4e9cfe3d1adef42cd0addcf1d table li:last-child 10 | 66950e746590d7f4e9cfe3d1adef42cd0addcf1d table li:last-of-type 11 | 0a37d612cd4c67a42bd147b1edc5a1128456b017 table a[title="The Practice of Programming"] 12 | 0d3918d54f868f13110262ffbb88cbb0b083057d table a[title="The Practice of Programming"] text{} 13 | ecb542a30fc75c71a0c6380692cbbc4266ccbce4 json{} 14 | 95ef88ded9dab22ee3206cca47b9c3a376274bda text{} 15 | e4f7358fbb7bb1748a296fa2a7e815fa7de0a08b .after-portlet 16 | da39a3ee5e6b4b0d3255bfef95601890afd80709 .after 17 | 5b3020ba03fb43f7cdbcb3924546532b6ec9bd71 :empty 18 | 3406ca0f548d66a7351af5411ce945cf67a2f849 td:empty 19 | 30fff0af0b1209f216d6e9124e7396c0adfa0758 .navbox-list li:nth-child(1) 20 | a38e26949f047faab5ea7ba2acabff899349ce03 .navbox-list li:nth-child(2) 21 | d954831229a76b888e85149564727776e5a2b37a .navbox-list li:nth-child(3) 22 | d314e83b059bb876b0e5ee76aa92d54987961f9a .navbox-list li:nth-last-child(1) 23 | 1f19496e239bca61a1109dbbb8b5e0ab3e302b50 .navbox-list li:nth-last-child(2) 24 | 1ec9ebf14fc28c7d2b13e81241a6d2e1608589e8 .navbox-list li:nth-last-child(3) 25 | 52e726f0993d2660f0fb3ea85156f6fbcc1cfeee .navbox-list li:nth-child(n+1) 26 | 0b20c98650efa5df39d380fea8d5b43f3a08cb66 .navbox-list li:nth-child(3n+1) 27 | 52e726f0993d2660f0fb3ea85156f6fbcc1cfeee .navbox-list li:nth-last-child(n+1) 28 | 972973fe1e8f63e4481c8641d6169c638a528a6e .navbox-list 
li:nth-last-child(3n+1) 29 | 6c45ee6bca361b8a9baee50a15f575fc6ac73adc :only-child 30 | 44c99f6ad37b65dc0893cdcb1c60235d827ee73e .navbox-list li:only-child 31 | 641037814e358487d1938fc080e08f72a3846ef8 .summary 32 | 641037814e358487d1938fc080e08f72a3846ef8 [class=summary] 33 | 641037814e358487d1938fc080e08f72a3846ef8 [class="summary"] 34 | 613bf65ac4042b6ee0a7a47f08732fdbe1b5b06b #toc 35 | da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a 36 | da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a text{} 37 | 97d170e1550eee4afc0af065b78cda302a97674c #toc li + a json{} 38 | da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + a + span 39 | da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li + span 40 | da39a3ee5e6b4b0d3255bfef95601890afd80709 #toc li > li 41 | 87eee1189dd5296d6c010a1ad329fc53c6099d72 li a:not([rel]) 42 | 055f3c98e9160beb13f72f1009ad66b6252a9bba link, a 43 | 055f3c98e9160beb13f72f1009ad66b6252a9bba link ,a 44 | 055f3c98e9160beb13f72f1009ad66b6252a9bba link , a 45 | 0d1f66765d1632c70f8608947890524e78459362 link , a sup 46 | b6a3d6cccd305fcc3e8bf2743c443743bdaaa02b link , a:parent-of(sup) 47 | 0d1f66765d1632c70f8608947890524e78459362 link , a:parent-of(sup) sup 48 | da39a3ee5e6b4b0d3255bfef95601890afd80709 li --number 49 | da39a3ee5e6b4b0d3255bfef95601890afd80709 li -n 50 | -------------------------------------------------------------------------------- /tests/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import print_function 4 | from hashlib import sha1 5 | from subprocess import Popen, PIPE, STDOUT 6 | 7 | data = open("index.html", "r").read() 8 | 9 | for line in open("cmds.txt", "r"): 10 | line = line.strip() 11 | p = Popen(['pup', line], stdout=PIPE, stdin=PIPE, stderr=PIPE) 12 | h = sha1() 13 | h.update(p.communicate(input=data)[0]) 14 | print("%s %s" % (h.hexdigest(), line)) 15 | -------------------------------------------------------------------------------- 
/tests/test: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python run.py > test_results.txt 4 | diff expected_output.txt test_results.txt 5 | --------------------------------------------------------------------------------