├── .gitignore
├── LICENSE
├── README.md
├── enums.go
├── generic.go
├── generic_test.go
├── htmlelement.go
├── htmlelementinfo.go
├── htmlelementinfo_init.go
├── htmlparser.go
├── htmlparser_test.go
└── testcontent.go
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled Object files, Static and Dynamic libs (Shared Objects)
2 | *.o
3 | *.a
4 | *.so
5 |
6 | # Folders
7 | _obj
8 | _test
9 |
10 | # Architecture specific extensions/prefixes
11 | *.[568vq]
12 | [568vq].out
13 |
14 | *.cgo1.go
15 | *.cgo2.c
16 | _cgo_defun.c
17 | _cgo_gotypes.go
18 | _cgo_export.*
19 |
20 | _testmain.go
21 |
22 | *.exe
23 | *.test
24 | *.prof
25 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Marcelo Calbucci
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # go-htmlparser
2 | Event-based, HTML 5.0-compliant parser for Go (SAX-style parsing)
3 |
4 | ## Typical Scenarios
5 | - Use it to scrape pieces of HTML
6 | - Detect META / LINK tags (e.g. Open Graph tags)
7 | - Optimize the output HTML (remove whitespace, clear empty tags)
8 | - Detect HTML syntax errors and notify developers
9 | - Extract text from the HTML
10 |
11 |
12 | ## Sample
13 |
14 | ### Get the RSS Feed of a website
15 |
16 | ```go
17 | rssFeed := ""
18 | parser := NewParser(htmlContent)
19 |
20 | parser.Parse(nil, func(e *HtmlElement, isEmpty bool) {
21 | if e.TagName == "link" {
22 |
23 | if ty,_ := e.GetAttributeValue("type"); ty == "application/rss+xml" {
24 | t.Logf("rss-e: %v %v\n", e.TagName, e.Attributes)
25 | rssFeed,_ = e.GetAttributeValue("href")
26 | parser.Stop()
27 | }
28 | }
29 | }, nil)
30 |
31 | fmt.Println(rssFeed)
32 | ```
33 |
34 | ### Remove whitespace
35 |
36 | ```go
37 | parser := NewParser(origHtml)
38 |
39 | parser.PreserveCRLFTab = false
40 |
41 | n := bytes.NewBufferString("")
42 |
43 | parser.Parse(func(text string, parent *HtmlElement) {
44 | escaped := html.EscapeString(text)
45 | n.WriteString(escaped)
46 | }, func(parent *HtmlElement, isEmptyTag bool) {
47 | n.WriteString(parent.GetOpenTag(false, false))
48 | }, func(closeTag string) {
49 | n.WriteString("</" + closeTag + ">")
50 | })
51 |
52 | newHtml := n.String()
53 | ```
54 |
55 |
56 |
57 | ## Questions
58 |
59 |
60 |
61 | ## Contributors
62 |
63 | - HtmlParser was originally created by *Marcelo Calbucci* ([blog.calbucci.com](http://blog.calbucci.com) | [@calbucci](http://twitter.com/calbucci))
64 |
65 |
--------------------------------------------------------------------------------
/enums.go:
--------------------------------------------------------------------------------
1 | package htmlparser
2 |
3 | // AttrStatus indicates the status of an attribute.
4 | type AttrStatus uint8
5 |
6 | const ( // AttrStatus values, numbered consecutively from 0 by iota.
7 | ASValid AttrStatus = iota // 0: valid attribute
8 | ASDeprecated // 1: deprecated attribute
9 | ASUnknown // 2: unknown attribute
10 | )
11 |
12 | // HtmlElementType is the type of an HTML element according to the HTML 5.0 spec. The basic values are single bits, so they can be OR-ed together (see HETAnyContent and HETTransparent).
13 | type HtmlElementType uint8
14 |
15 | const ( // NOTE(review): only HETPhrasing has an explicit HtmlElementType; under Go's const rules the entries written as a bare "= value" (HETFlow, HETMeta, HETText, HETNRCharData, HETNone) are untyped constants - confirm this is intended.
16 | HETPhrasing HtmlElementType = 0x1 // former "inline element"
17 | HETFlow = 0x2 // former "block element"
18 | HETMeta = 0x4 // control elements
19 | HETText = 0x8 // text block
20 | HETNRCharData = 0x10 // Non-Replaceable Char Data
21 |
22 | HETAnyContent = HETPhrasing | HETFlow | HETText // phrasing, flow and text bits combined (0xB)
23 | HETTransparent = HETPhrasing | HETFlow // phrasing and flow bits combined (0x3)
24 | HETNone = 0 // no bits set
25 | )
26 |
27 | type HtmlTagFormatting uint8 // HtmlTagFormatting classifies how an element's tag is closed (e.g. no closing tag, optional closing tag).
28 |
29 | const (
30 | HTFSingle HtmlTagFormatting = iota // Has no closing tag, e.g.
31 | HTFOptionalClosing // has an optional closing tag, e.g.
", 28 | "a
", 29 | "a
b", 30 | "a
b
c", 31 | "ab
c
b
c",
33 | "a
b
c
", 34 | "ab
c", 35 | "a
b
c", 36 | "a
bcd
e
", 37 | "abcd
e", 38 | "a
bcd
e", 39 | }) 40 | } 41 | 42 | func Test_OptionalClosingWithBlockElement(t *testing.T) { 43 | testSegments(t, true, []string{ 44 | "
a |
a |
a |
e"

	n := bytes.NewBufferString("")

	parser := NewParser(segment)

	parser.Parse(func(text string, he *HtmlElement) {
		n.WriteString(text)
	}, nil, nil)

	if n.String() != "abcde" {
		t.Error()
	}

}

// Test_UrlAttribute parses a single segment and, for the "link" element it
// reports, checks that exactly four attributes were collected and that the
// title and href values come back as expected. The test fails if no "link"
// element is reported at all.
func Test_UrlAttribute(t *testing.T) {
	// NOTE(review): the markup that belongs inside this literal appears to have
	// been lost when the file was extracted; restore it from the repository
	// before relying on this test.
	segment := ` `

	foundIt := false
	parser := NewParser(segment)

	parser.Parse(nil, func(e *HtmlElement, isEmpty bool) {
		t.Logf("E: %v (attr=%v)\n", e.TagName, e.Attributes)
		if e.TagName != "link" {
			return
		}

		if len(e.Attributes) != 4 {
			t.Error()
		}
		if title, _ := e.GetAttributeValue("title"); title != "M-Shaped Brain » Feed" {
			t.Error()
		}
		if href, _ := e.GetAttributeValue("href"); href != "http://blog.calbucci.com/feed/" {
			t.Error()
		}

		foundIt = true
	}, nil)

	if !foundIt {
		t.Error()
	}
}

// Test_FindRSSFeed scans blogPost for a "link" element whose type attribute is
// "application/rss+xml", records its href, stops the parser, and checks the
// recorded URL.
func Test_FindRSSFeed(t *testing.T) {
	rssFeed := ""
	parser := NewParser(blogPost)

	parser.Parse(nil, func(e *HtmlElement, isEmpty bool) {
		if e.TagName != "link" {
			return
		}

		if ty, _ := e.GetAttributeValue("type"); ty == "application/rss+xml" {
			t.Logf("rss-e: %v %v\n", e.TagName, e.Attributes)
			rssFeed, _ = e.GetAttributeValue("href")
			parser.Stop()
		}
	}, nil)

	t.Logf("rssFeed=%v\n", rssFeed)
	if rssFeed != "http://blog.calbucci.com/feed/" {
		t.Error()
	}
}

// Test_Idempotent runs blogPost through parseAndSerialize three times and
// requires the first, second and third outputs to be identical. When the first
// two differ, a short window around the first differing byte is logged.
func Test_Idempotent(t *testing.T) {
	baseHtml := blogPost
	html1 := parseAndSerialize(baseHtml)
	html2 := parseAndSerialize(html1)
	html3 := parseAndSerialize(html2)

	if html1 != html2 {
		// Only the common prefix length can be compared byte by byte.
		limit := len(html1)
		if limit > len(html2) {
			limit = len(html2)
		}

		for i := 0; i < limit; i++ {
			if html1[i] == html2[i] {
				continue
			}

			// Show up to 20 bytes of context before the mismatch, 30 bytes in total.
			start := i - 20
			if start < 0 {
				start = 0
			}
			end := start + 30
			if end > limit {
				end = limit
			}
			t.Logf("Mismatch1: %v\n", html1[start:end])
			t.Logf("Mismatch2: %v\n", html2[start:end])
			break
		}

		t.Error()
	}

	if html2 != html3 {
		t.Error()
	}
}

// parseAndSerialize parses origHtml with PreserveCRLFTab disabled and rebuilds
// a document from the parser callbacks: text is HTML-escaped, open tags are
// rendered with GetOpenTag(false, false), and each close tag is written as
// "</name>". It returns the rebuilt document.
func parseAndSerialize(origHtml string) string {
	parser := NewParser(origHtml)

	parser.PreserveCRLFTab = false

	n := bytes.NewBufferString("")

	parser.Parse(func(text string, parent *HtmlElement) {
		n.WriteString(html.EscapeString(text))
	}, func(parent *HtmlElement, isEmptyTag bool) {
		n.WriteString(parent.GetOpenTag(false, false))
	}, func(closeTag string) {
		// The leading "</" is required; without it close tags are emitted as
		// plain text ("p>") and the output is no longer valid markup.
		n.WriteString("</" + closeTag + ">")
	})

	return n.String()
}

// Test_FindOpenGraphTags collects every "meta" element of blogPost whose
// property attribute starts with "og:" into a map keyed by that property, then
// checks the parser's HasValidSyntax flag and the og:type and og:url entries.
func Test_FindOpenGraphTags(t *testing.T) {
	parser := NewParser(blogPost)

	tags := make(map[string]string)

	parser.Parse(nil, func(element *HtmlElement, isEmptyTag bool) {
		if element.TagName != "meta" {
			return
		}

		// An empty property never has the "og:" prefix, so one check covers both cases.
		ogName, _ := element.GetAttributeValue("property")
		if !strings.HasPrefix(ogName, "og:") {
			return
		}

		ogValue, _ := element.GetAttributeValue("content")
		tags[ogName] = ogValue
	}, nil)

	if !parser.HasValidSyntax {
		t.Error()
	}

	if tags["og:type"] != "article" {
		t.Error()
	}

	if tags["og:url"] != "http://blog.calbucci.com/2015/01/27/attention-cannibalization/" {
		t.Error()
	}
}

// testSegments parses every segment with nil callbacks and reports a test
// error for each one whose Parse result differs from result.
func testSegments(t *testing.T, result bool, segments []string) {
	for _, segment := range segments {
		t.Logf("Processing: %v\n", segment)
		parser := NewParser(segment)
		if parser.Parse(nil, nil, nil) != result {
			// Constant format string: segments are arbitrary markup and may contain '%'.
			t.Errorf("Failed to parse segment: %s", segment)
		}
	}
}
--------------------------------------------------------------------------------