├── .gitignore
├── LICENSE
├── README.md
├── enums.go
├── generic.go
├── generic_test.go
├── htmlelement.go
├── htmlelementinfo.go
├── htmlelementinfo_init.go
├── htmlparser.go
├── htmlparser_test.go
└── testcontent.go


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects)
 2 | *.o
 3 | *.a
 4 | *.so
 5 | 
 6 | # Folders
 7 | _obj
 8 | _test
 9 | 
10 | # Architecture specific extensions/prefixes
11 | *.[568vq]
12 | [568vq].out
13 | 
14 | *.cgo1.go
15 | *.cgo2.c
16 | _cgo_defun.c
17 | _cgo_gotypes.go
18 | _cgo_export.*
19 | 
20 | _testmain.go
21 | 
22 | *.exe
23 | *.test
24 | *.prof
25 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Marcelo Calbucci
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # go-htmlparser
 2 | Event-based, HTML 5.0 compliant parser in Go (SAX-style parsing)
 3 | 
 4 | ## Typical Scenarios
 5 | - Use it to scrape pieces of HTML
 6 | - Detect META / LINK tags (e.g. Open Graph tags)
 7 | - Optimize the output HTML (remove whitespace, clear empty tags)
 8 | - Detect HTML syntax errors and notify developers
 9 | - Extract text from the HTML
10 | 
11 | 
12 | ## Sample
13 | 
14 | ### Get the RSS Feed of a website
15 | 
16 | ```go
17 | 	rssFeed := ""
18 | 	parser := NewParser(htmlContent)
19 | 
20 | 	parser.Parse(nil, func(e *HtmlElement, isEmpty bool) {
21 | 		if e.TagName == "link" {
22 | 
23 | 			if ty,_ := e.GetAttributeValue("type"); ty == "application/rss+xml" {
24 | 				t.Logf("rss-e: %v %v\n", e.TagName, e.Attributes)
25 | 				rssFeed,_ = e.GetAttributeValue("href")
26 | 				parser.Stop()
27 | 			}
28 | 		}
29 | 	}, nil)
30 | 	
31 | 	fmt.Println(rssFeed)
32 | ```
33 | 
34 | ### Remove whitespace
35 | 
36 | ```go
37 | 	parser := NewParser(origHtml)
38 | 
39 | 	parser.PreserveCRLFTab = false
40 | 
41 | 	n := bytes.NewBufferString("")
42 | 
43 | 	parser.Parse(func(text string, parent *HtmlElement) {
44 | 		escaped := html.EscapeString(text)
45 | 		n.WriteString(escaped)
46 | 	}, func(parent *HtmlElement, isEmptyTag bool) {
47 | 		n.WriteString(parent.GetOpenTag(false, false))
48 | 	}, func(closeTag string) {
49 | 		n.WriteString("</" + closeTag + ">")
50 | 	})
51 | 
52 | 	newHtml := n.String()
53 | ```
54 | 
55 | 
56 | 
57 | ## Questions
58 | 
59 | 
60 | 
61 | ## Contributors
62 | 
63 | - HtmlParser was originally created by *Marcelo Calbucci* ([blog.calbucci.com](http://blog.calbucci.com) | [@calbucci](http://twitter.com/calbucci))
64 | 
65 | 


--------------------------------------------------------------------------------
/enums.go:
--------------------------------------------------------------------------------
 1 | package htmlparser
 2 | 
// AttrStatus indicates the status of an attribute with respect to an element,
// as reported by HtmlElementInfo.GetAttributeStatus.
type AttrStatus uint8

const (
	// ASValid: the attribute is listed in the element's Attributes.
	ASValid AttrStatus = iota
	// ASDeprecated: the attribute is listed in the element's ObsoleteAttributes.
	ASDeprecated
	// ASUnknown: the attribute is in neither list (or its name is empty).
	ASUnknown
)
11 | 
// HtmlElementType is the type of an HTML element according to the HTML 5.0
// spec. The values are bit flags so that several content types can be combined
// with | and tested with &.
type HtmlElementType uint8

// NOTE(review): only HETPhrasing (and the constants derived from it) carry the
// HtmlElementType type explicitly; HETFlow through HETNRCharData and HETNone
// are untyped constants that convert implicitly where they are used.
const (
	HETPhrasing   HtmlElementType = 0x1  // former "inline element"
	HETFlow                       = 0x2  // former "block element"
	HETMeta                       = 0x4  // control elements
	HETText                       = 0x8  // text block
	HETNRCharData                 = 0x10 // Non-Replaceable Char Data

	// Convenience combinations of the flags above.
	HETAnyContent  = HETPhrasing | HETFlow | HETText
	HETTransparent = HETPhrasing | HETFlow
	HETNone        = 0
)
26 | 
// HtmlTagFormatting describes whether an element requires, permits or forbids
// a closing tag.
type HtmlTagFormatting uint8

const (
	HTFSingle          HtmlTagFormatting = iota // Has no closing tag, e.g. <br>
	HTFOptionalClosing                          // has an optional closing tag, e.g. <li>
	HTFComplete                                 // must have a closing tag
)
34 | 


--------------------------------------------------------------------------------
/generic.go:
--------------------------------------------------------------------------------
  1 | package htmlparser
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"sort"
  6 | 	"strings"
  7 | 	"unicode"
  8 | )
  9 | 
 10 | // union add all the elements from slice2 to slice1 if not present
 11 | func union(slice1, slice2 []string) []string {
 12 | 	if slice1 == nil {
 13 | 		slice1 = make([]string, len(slice2))
 14 | 		copy(slice1, slice2)
 15 | 		sort.Strings(slice1)
 16 | 		return slice1
 17 | 	}
 18 | 	for _, e2 := range slice2 {
 19 | 		found := false
 20 | 		for _, e1 := range slice1 {
 21 | 			if e1 == e2 {
 22 | 				found = true
 23 | 				break
 24 | 			}
 25 | 		}
 26 | 		if !found {
 27 | 			slice1 = append(slice1, e2)
 28 | 		}
 29 | 	}
 30 | 	sort.Strings(slice1)
 31 | 	return slice1
 32 | }
 33 | 
 34 | // sorted_contains check if a string is present in a slice using binary search
 35 | func sorted_contains(slice []string, element string) bool {
 36 | 	if slice == nil || len(slice) == 0 {
 37 | 		return false
 38 | 	}
 39 | 	pos := sort.SearchStrings(slice, element)
 40 | 	return pos != len(slice) && slice[pos] == element
 41 | }
 42 | 
 43 | // contains check if a string is present in a slice
 44 | func contains(s []string, e string) bool {
 45 | 	for _, a := range s {
 46 | 		if a == e {
 47 | 			return true
 48 | 		}
 49 | 	}
 50 | 	return false
 51 | }
 52 | 
 53 | // convertSemicolonDelimited converts a semi-colon delimited string into a slice of strings and sort them
 54 | func convertSemicolonDelimited(text string) []string {
 55 | 	if len(text) > 0 {
 56 | 		strList := strings.Split(text, ";")
 57 | 		if len(strList) > 0 {
 58 | 			for i, s := range strList {
 59 | 				strList[i] = strings.ToLower(s)
 60 | 			}
 61 | 			sort.Strings(strList)
 62 | 			return strList
 63 | 		}
 64 | 	}
 65 | 	return nil
 66 | }
 67 | 
 68 | // runesLastIndex finds the last occurrance of a rune r in the sequence runes
 69 | func runesLastIndex(runes []rune, r rune) int {
 70 | 
 71 | 	for i := len(runes) - 1; i >= 0; i-- {
 72 | 		if runes[i] == r {
 73 | 			return i
 74 | 		}
 75 | 	}
 76 | 	return -1
 77 | }
 78 | 
 79 | // runesIndex finds the first occurance of rune r in the sequence runes
 80 | func runesIndex(runes []rune, r rune) int {
 81 | 	for i, v := range runes {
 82 | 		if v == r {
 83 | 			return i
 84 | 		}
 85 | 	}
 86 | 	return -1
 87 | }
 88 | 
 89 | // runesIndexRunesStart finds the first occurrance of the sequence sub inside of runes start at position start
 90 | func runesIndexRunesStart(runes []rune, sub []rune, start int) int {
 91 | 
 92 | 	max := len(runes) - len(sub)
 93 | 	if len(sub) == 0 || max < 0 {
 94 | 		return -1
 95 | 	}
 96 | 
 97 | 	for ; start <= max; start++ {
 98 | 
 99 | 		match := true
100 | 		for i := 0; i < len(sub); i++ {
101 | 			if runes[start+i] != sub[i] {
102 | 				match = false
103 | 				break
104 | 			}
105 | 		}
106 | 		if match {
107 | 			return start
108 | 		}
109 | 
110 | 	}
111 | 	return -1
112 | }
113 | 
// trimInBetween replaces every run of whitespace and/or control characters with
// a single space. Leading and trailing runs are collapsed too, not removed.
func trimInBetween(str string) string {
	var out bytes.Buffer
	prevBlank := false

	for _, ch := range str {
		blank := unicode.IsSpace(ch) || unicode.IsControl(ch)
		switch {
		case !blank:
			out.WriteRune(ch)
		case !prevBlank:
			// First blank of a run: emit the single replacement space.
			out.WriteRune(' ')
		}
		prevBlank = blank
	}
	return out.String()
}
138 | 
// hasContent reports whether text holds at least one character that is neither
// whitespace nor a control character.
func hasContent(text string) bool {
	visible := func(r rune) bool {
		return !unicode.IsSpace(r) && !unicode.IsControl(r)
	}
	return strings.IndexFunc(text, visible) >= 0
}
152 | 


--------------------------------------------------------------------------------
/generic_test.go:
--------------------------------------------------------------------------------
  1 | package htmlparser
  2 | 
  3 | import (
  4 | 	"strconv"
  5 | 	"testing"
  6 | )
  7 | 
  8 | func Test_Union(t *testing.T) {
  9 | 	a := []string{"a", "b"}
 10 | 	b := []string{"a", "c"}
 11 | 
 12 | 	c := union(a, b)
 13 | 
 14 | 	if len(c) != 3 {
 15 | 		t.Error()
 16 | 	}
 17 | }
 18 | 
 19 | func Test_Sorted_contains(t *testing.T) {
 20 | 	a := []string{"a", "b", "c"}
 21 | 
 22 | 	for _, v := range a {
 23 | 		if !sorted_contains(a, v) {
 24 | 			t.Error(v)
 25 | 		}
 26 | 	}
 27 | 
 28 | 	if sorted_contains(a, "d") {
 29 | 		t.Error("d")
 30 | 	}
 31 | }
 32 | 
 33 | func Test_Contains(t *testing.T) {
 34 | 	a := []string{"a", "b", "c"}
 35 | 
 36 | 	for _, v := range a {
 37 | 		if !sorted_contains(a, v) {
 38 | 			t.Error(v)
 39 | 		}
 40 | 	}
 41 | 
 42 | 	if sorted_contains(a, "d") {
 43 | 		t.Error("d")
 44 | 	}
 45 | }
 46 | 
 47 | func Test_convertSemicolonDelimited(t *testing.T) {
 48 | 	r := convertSemicolonDelimited("")
 49 | 	if r != nil {
 50 | 		t.Error()
 51 | 	}
 52 | 
 53 | 	r = convertSemicolonDelimited("a")
 54 | 	if len(r) != 1 || r[0] != "a" {
 55 | 		t.Error()
 56 | 	}
 57 | 
 58 | 	r = convertSemicolonDelimited("a;a")
 59 | 	if len(r) != 2 || r[0] != "a" {
 60 | 		t.Error()
 61 | 	}
 62 | 
 63 | 	r = convertSemicolonDelimited("b;a")
 64 | 	if len(r) != 2 || r[0] != "a" || r[1] != "b" {
 65 | 		t.Error()
 66 | 	}
 67 | 
 68 | }
 69 | 
 70 | func Test_runesLastIndex(t *testing.T) {
 71 | 	if runesLastIndex([]rune(""), 'a') != -1 {
 72 | 		t.Error()
 73 | 	}
 74 | 
 75 | 	if runesLastIndex([]rune("a"), 'a') != 0 {
 76 | 		t.Error()
 77 | 	}
 78 | 
 79 | 	if runesLastIndex([]rune("bac"), 'a') != 1 {
 80 | 		t.Error()
 81 | 	}
 82 | 
 83 | 	if runesLastIndex([]rune("bba"), 'a') != 2 {
 84 | 		t.Error()
 85 | 	}
 86 | 
 87 | 	if runesLastIndex([]rune("abab"), 'a') != 2 {
 88 | 		t.Error()
 89 | 	}
 90 | 
 91 | 	if runesLastIndex([]rune("defg"), 'a') != -1 {
 92 | 		t.Error()
 93 | 	}
 94 | }
 95 | 
 96 | func Test_runesIndex(t *testing.T) {
 97 | 	if runesIndex([]rune(""), 'a') != -1 {
 98 | 		t.Error()
 99 | 	}
100 | 
101 | 	if runesIndex([]rune("a"), 'a') != 0 {
102 | 		t.Error()
103 | 	}
104 | 
105 | 	if runesIndex([]rune("bac"), 'a') != 1 {
106 | 		t.Error()
107 | 	}
108 | 
109 | 	if runesIndex([]rune("bba"), 'a') != 2 {
110 | 		t.Error()
111 | 	}
112 | 
113 | 	if runesIndex([]rune("abab"), 'a') != 0 {
114 | 		t.Error()
115 | 	}
116 | 
117 | 	if runesIndex([]rune("defg"), 'a') != -1 {
118 | 		t.Error()
119 | 	}
120 | }
121 | 
122 | func Test_runesIndexRunesStart(t *testing.T) {
123 | 	if runesIndexRunesStart([]rune(""), []rune(""), 0) != -1 {
124 | 		t.Error()
125 | 	}
126 | 
127 | 	if runesIndexRunesStart([]rune("abc"), []rune(""), 0) != -1 {
128 | 		t.Error()
129 | 	}
130 | 
131 | 	if runesIndexRunesStart([]rune("abc"), []rune("d"), 0) != -1 {
132 | 		t.Error()
133 | 	}
134 | 
135 | 	if runesIndexRunesStart([]rune("abc"), []rune("def"), 0) != -1 {
136 | 		t.Error()
137 | 	}
138 | 
139 | 	if runesIndexRunesStart([]rune("abc"), []rune("abd"), 0) != -1 {
140 | 		t.Error()
141 | 	}
142 | 
143 | 	if runesIndexRunesStart([]rune("abc"), []rune("a"), 0) != 0 {
144 | 		t.Error()
145 | 	}
146 | 
147 | 	if runesIndexRunesStart([]rune("abc"), []rune("c"), 0) != 2 {
148 | 		t.Error()
149 | 	}
150 | 
151 | 	if runesIndexRunesStart([]rune("abc"), []rune("abc"), 0) != 0 {
152 | 		t.Error()
153 | 	}
154 | 
155 | 	if runesIndexRunesStart([]rune("abab"), []rune("ab"), 0) != 0 {
156 | 		t.Error()
157 | 	}
158 | 
159 | 	if r := runesIndexRunesStart([]rune("abab"), []rune("ab"), 1); r != 2 {
160 | 		t.Error(strconv.Itoa(r))
161 | 	}
162 | 
163 | }
164 | 
165 | func Test_trimInBetween(t *testing.T) {
166 | 
167 | 	if trimInBetween("") != "" {
168 | 		t.Error()
169 | 	}
170 | 
171 | 	if trimInBetween("abc") != "abc" {
172 | 		t.Error()
173 | 	}
174 | 
175 | 	if trimInBetween(" abc ") != " abc " {
176 | 		t.Error()
177 | 	}
178 | 
179 | 	if trimInBetween("a b c") != "a b c" {
180 | 		t.Error()
181 | 	}
182 | 
183 | 	if trimInBetween("a  b  c") != "a b c" {
184 | 		t.Error()
185 | 	}
186 | 
187 | 	if trimInBetween("a\nb\nc") != "a b c" {
188 | 		t.Error()
189 | 	}
190 | 
191 | 	if r := trimInBetween("a\n\nb \n c"); r != "a b c" {
192 | 		t.Error(r)
193 | 	}
194 | }
195 | 
196 | func Test_hasContent(t *testing.T) {
197 | 	if hasContent("") {
198 | 		t.Error()
199 | 	}
200 | 
201 | 	if hasContent(" ") {
202 | 		t.Error()
203 | 	}
204 | 
205 | 	if hasContent("\r") {
206 | 		t.Error()
207 | 	}
208 | 
209 | 	if hasContent("\r\n\t ") {
210 | 		t.Error()
211 | 	}
212 | 
213 | 	if !hasContent("a") {
214 | 		t.Error()
215 | 	}
216 | 
217 | 	if !hasContent(" a") {
218 | 		t.Error()
219 | 	}
220 | 
221 | 	if !hasContent("a ") {
222 | 		t.Error()
223 | 	}
224 | 
225 | 	if !hasContent("\t \n a") {
226 | 		t.Error()
227 | 	}
228 | 
229 | }
230 | 


--------------------------------------------------------------------------------
/htmlelement.go:
--------------------------------------------------------------------------------
  1 | package htmlparser
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"html"
  6 | 	"strings"
  7 | 	"unicode"
  8 | )
  9 | 
// QuoteType is the result type of NeedQuotesForAttr: the kind of quoting an
// attribute value calls for when it is written out.
type QuoteType uint8

const (
	// QTNone: no quotes (presumably the value is safe to emit bare).
	QTNone QuoteType = iota
	// QTSingle: single quotes (NeedQuotesForAttr selects this on a '"' in the value).
	QTSingle
	// QTDouble: double quotes (also returned for an empty value).
	QTDouble
)
 17 | 
// attributeInfo is a single name/value pair of an element's attribute list.
type attributeInfo struct {
	Name  string // attribute name; the parser stores it lower-cased
	Value string // attribute value; empty for valueless attributes such as "checked"
}
 22 | 
// HtmlElement is the parsed form of one opening (or self-closing) tag.
type HtmlElement struct {
	// Sinks that addError / addWarning append diagnostics to; supplied by the
	// caller of NewHtmlElement.
	errors   *[]string
	warnings *[]string

	TagName                 string           // lower-cased tag name with any "ns:" prefix removed
	TagNameNS               string           // lower-cased tag name including the namespace prefix
	Id                      string           // value of the "id" attribute, set by AddAttribute
	Attributes              []attributeInfo  // attributes in the order they were added
	ElementInfo             *HtmlElementInfo // metadata from GetElementInfo; nil for unknown tags
	Namespace               string           // text before the ':' in the tag name, if any
	HasNamespace            bool
	XmlEmptyTag             bool         // tag was written XML-style, ending in "/>"
	Parent                  *HtmlElement // enclosing element passed to NewHtmlElement; may be nil
	HasDeprecatedAttributes bool         // an attribute with status ASDeprecated was added
	HasOnlyKnownAttributes  bool         // false once a non-namespaced ASUnknown attribute was added
	SyntaxError             bool         // the tag text was malformed
	FatalSyntaxError        bool         // malformed badly enough that attribute parsing was abandoned
	OriginalOpenTag         string       // the raw tag text passed to NewHtmlElement
}
 42 | 
 43 | func NewHtmlElement(openElement string, parent *HtmlElement, errors, warnings *[]string) *HtmlElement {
 44 | 
 45 | 	he := new(HtmlElement)
 46 | 	he.OriginalOpenTag = openElement
 47 | 
 48 | 	he.Parent = parent
 49 | 
 50 | 	he.errors = errors
 51 | 	he.warnings = warnings
 52 | 
 53 | 	he.HasOnlyKnownAttributes = true
 54 | 	he.HasDeprecatedAttributes = false
 55 | 
 56 | 	// openElement contains any type of open tag/single tag
 57 | 	// Examples:
 58 | 	//	<br>
 59 | 	//  <br/>
 60 | 	//	<br clear=left>
 61 | 	//	<div style="color:#fff">
 62 | 	//	<img src='/a/b/c'>
 63 | 
 64 | 	he.Attributes = make([]attributeInfo, 0)
 65 | 
 66 | 	runes := []rune(openElement)
 67 | 	l := len(runes)
 68 | 
 69 | 	pos := 1 // skip the <
 70 | 	for ; pos < l; pos++ {
 71 | 		c := runes[pos]
 72 | 		if !unicode.IsSpace(c) {
 73 | 			break
 74 | 		}
 75 | 	}
 76 | 
 77 | 	if pos == l {
 78 | 		// Error: Empty tag with whitespaces only: "<   >";
 79 | 		he.addError("Invalid tag (whitespaces only).")
 80 | 		he.SyntaxError = true
 81 | 		return he
 82 | 	}
 83 | 
 84 | 	for ; pos < l; pos++ {
 85 | 		c := runes[pos]
 86 | 		if c == '>' {
 87 | 			if pos == 1 {
 88 | 				// Error: Empty tag like "<>"
 89 | 				he.addError("Empty tag <>")
 90 | 				he.SyntaxError = true
 91 | 				return he
 92 | 			}
 93 | 			// This is it
 94 | 			he.TagName = strings.ToLower(strings.TrimSpace(string(runes[1:pos])))
 95 | 			he.checkTag()
 96 | 			return he
 97 | 		}
 98 | 
 99 | 		if unicode.IsSpace(c) {
100 | 			he.TagName = strings.ToLower(strings.TrimSpace(string(runes[1:pos])))
101 | 			he.checkTag()
102 | 			break
103 | 		}
104 | 	}
105 | 
106 | 	pos++ // skip the whitespace
107 | 
108 | 	end := runesLastIndex(runes, '>')
109 | 	if end == -1 {
110 | 		he.addError("Missing closing >")
111 | 		he.SyntaxError = true
112 | 		he.FatalSyntaxError = true
113 | 		return he
114 | 	}
115 | 	end--
116 | 	for end >= pos {
117 | 		if runes[end] == '/' {
118 | 			he.XmlEmptyTag = true
119 | 			end--
120 | 			break
121 | 		}
122 | 		if !unicode.IsSpace(runes[end]) {
123 | 			break
124 | 		}
125 | 		end--
126 | 	}
127 | 
128 | 	if end > pos {
129 | 		he.parseAttributes(squeezeSpaces(string(runes[pos : end+1])))
130 | 	}
131 | 
132 | 	return he
133 | }
134 | 
// GetOpenTag rebuilds the opening tag from the parsed tag name and attributes.
// With noEvents set, attributes whose name starts with "on" are omitted; with
// noUnknownAttributes set, attributes the element metadata reports as
// ASUnknown are omitted. An XML-style empty tag is closed with " />".
func (he *HtmlElement) GetOpenTag(noEvents, noUnknownAttributes bool) string {
	return internalBuildOpenTag(he.ElementInfo, he.TagNameNS, he.Attributes, noEvents, noUnknownAttributes, he.XmlEmptyTag)
}
138 | 
// GetCloseTag returns the closing tag for this element, using the tag name
// with its namespace prefix (TagNameNS).
func (he *HtmlElement) GetCloseTag() string {
	return "</" + he.TagNameNS + ">"
}
142 | 
143 | func (he *HtmlElement) GetAttributeValue(attrName string) (string, bool) {
144 | 
145 | 	i := he.FindAttributeIndex(attrName)
146 | 	if i >= 0 {
147 | 		return he.Attributes[i].Value, true
148 | 	}
149 | 	return "", false
150 | 
151 | }
152 | 
153 | func (he *HtmlElement) SetAttribute(attrName, attrValue string) bool {
154 | 	if attrName == "" {
155 | 		return true
156 | 	}
157 | 
158 | 	if strings.IndexAny(attrValue, "\r\n\t") >= 0 {
159 | 		//throw new ArgumentException("attrValue cannot contain control characters")
160 | 		return false
161 | 	}
162 | 
163 | 	i := he.FindAttributeIndex(attrName)
164 | 	if i >= 0 {
165 | 		he.Attributes[i].Value = attrValue
166 | 	} else {
167 | 		he.AddAttribute(attrName, attrValue)
168 | 	}
169 | 	return true
170 | }
171 | 
172 | func (he *HtmlElement) RemoveAttribute(attrName string) {
173 | 	i := he.FindAttributeIndex(attrName)
174 | 	if i >= 0 {
175 | 		he.Attributes = append(he.Attributes[:i], he.Attributes[i+1:]...)
176 | 	}
177 | }
178 | 
// HasAttribute reports whether the element has an attribute named attrName
// (matched case-insensitively, see FindAttributeIndex).
func (he *HtmlElement) HasAttribute(attrName string) bool {
	return he.FindAttributeIndex(attrName) >= 0
}
182 | 
183 | func (he *HtmlElement) FindAttributeIndex(attrName string) int {
184 | 	if len(he.Attributes) == 0 || attrName == "" {
185 | 		return -1
186 | 	}
187 | 
188 | 	attrName = strings.ToLower(attrName)
189 | 
190 | 	for i, a := range he.Attributes {
191 | 		if a.Name == attrName {
192 | 			return i
193 | 		}
194 | 	}
195 | 	return -1
196 | }
197 | 
198 | func (he *HtmlElement) checkTag() {
199 | 	if strings.HasSuffix(he.TagName, "/") {
200 | 		he.TagName = he.TagName[0 : len(he.TagName)-1]
201 | 	}
202 | 	he.TagNameNS = he.TagName
203 | 
204 | 	he.ElementInfo = GetElementInfo(he.TagNameNS)
205 | 
206 | 	pos := strings.Index(he.TagName, ":")
207 | 	if pos != -1 {
208 | 		he.Namespace = he.TagName[:pos]
209 | 		he.TagName = he.TagName[pos+1:]
210 | 	}
211 | 	if he.ElementInfo == nil {
212 | 		if he.Namespace == "" {
213 | 			he.addWarning("Unknown tag: " + he.TagName)
214 | 		}
215 | 	} else {
216 | 		if he.Parent != nil {
217 | 			if !he.ElementInfo.IsValidParent(he.Parent.TagName) {
218 | 				he.addWarning("Invalid parent for " + he.TagName + " (parent: " + he.Parent.TagName + ")")
219 | 			}
220 | 		}
221 | 	}
222 | }
223 | 
224 | func (he *HtmlElement) addWarning(warning string) {
225 | 	*he.warnings = append(*he.warnings, warning)
226 | }
227 | 
228 | func (he *HtmlElement) addError(error string) {
229 | 	*he.errors = append(*he.errors, error)
230 | }
231 | 
232 | func (he *HtmlElement) AddAttribute(attrName, attrVal string) {
233 | 	if attrName == "" {
234 | 		return
235 | 	}
236 | 
237 | 	if attrName == "style" {
238 | 		attrVal = cleanStyleAttr(attrVal)
239 | 	} else if attrName == "id" {
240 | 		he.Id = attrVal
241 | 	}
242 | 
243 | 	if he.ElementInfo != nil {
244 | 		//bool useUrl;
245 | 		ast := he.ElementInfo.GetAttributeStatus(attrName)
246 | 		if ast == ASUnknown {
247 | 			if strings.Index(attrName, ":") > 0 {
248 | 			} else {
249 | 				he.HasOnlyKnownAttributes = false
250 | 				he.addWarning("Unknown attribute: " + attrName + " (tag: " + he.TagNameNS + ")")
251 | 			}
252 | 		} else if ast == ASDeprecated {
253 | 			he.HasDeprecatedAttributes = true
254 | 			he.addWarning("Deprecated attribute: " + attrName + " (tag: " + he.TagNameNS + ")")
255 | 		}
256 | 	}
257 | 
258 | 	if len(attrVal) > 0 {
259 | 		attrVal = html.UnescapeString(attrVal)
260 | 	}
261 | 
262 | 	he.Attributes = append(he.Attributes, attributeInfo{attrName, attrVal})
263 | }
264 | 
// squeezeSpaces normalises the attribute section of a tag: outside of quoted
// values, runs of whitespace become one space and any whitespace around '='
// is removed (consecutive '=' signs collapse into one). Text inside single or
// double quotes is copied unchanged. Trailing whitespace is dropped; a
// trailing '=' is kept.
func squeezeSpaces(s string) string {
	var out bytes.Buffer

	pendingSpace := false // whitespace seen but not yet written
	pendingEqual := false // '=' seen but not yet written
	quoted := false       // currently inside a quoted value
	closer := rune('-')   // the quote character that ends the current value

	for _, r := range s {
		switch {
		case quoted:
			if r == closer {
				quoted = false
			}
			out.WriteRune(r)

		case unicode.IsSpace(r):
			pendingSpace = true

		case r == '=':
			pendingEqual = true

		default:
			// An ordinary character flushes whatever separator was pending;
			// '=' wins over a space so "a = b" becomes "a=b".
			if pendingEqual {
				out.WriteRune('=')
				pendingEqual, pendingSpace = false, false
			}
			if pendingSpace {
				out.WriteRune(' ')
				pendingSpace = false
			}
			if r == '"' || r == '\'' {
				quoted, closer = true, r
			}
			out.WriteRune(r)
		}
	}

	if pendingEqual {
		out.WriteRune('=')
	}
	return out.String()
}
311 | 
312 | func (he *HtmlElement) parseAttributes(openElement string) {
313 | 	runes := []rune(openElement)
314 | 	l := len(runes)
315 | 	var attrName, attrVal string
316 | 	p := 0
317 | 	var c rune
318 | 	var found bool
319 | 	// Parse all the attributes now
320 | 	for ; p < l; p++ {
321 | 		// skip all the whitespaces
322 | 		for unicode.IsSpace(runes[p]) {
323 | 			p++
324 | 			if p == l {
325 | 				return
326 | 			}
327 | 		}
328 | 
329 | 		// now, search for the attribute name by either finding a whitespace or the "=" sign
330 | 		found = false
331 | 		startAttrName := p
332 | 		for {
333 | 			c = runes[p]
334 | 			if unicode.IsSpace(c) || c == '>' {
335 | 				// This is an empty attribute like "checked" in "<input type=checkbox checked>"
336 | 				attrName = strings.ToLower(strings.TrimSpace(string(runes[startAttrName:p])))
337 | 				he.AddAttribute(attrName, "")
338 | 				if c == '>' {
339 | 					return
340 | 				}
341 | 				found = true
342 | 				break
343 | 			}
344 | 			if c == '=' {
345 | 				break
346 | 			}
347 | 			p++
348 | 			if p >= l {
349 | 
350 | 				attrName = strings.ToLower(strings.TrimSpace(string(runes[startAttrName:p])))
351 | 				he.AddAttribute(attrName, "")
352 | 				return
353 | 			}
354 | 		}
355 | 		if found {
356 | 			continue
357 | 		}
358 | 
359 | 		if startAttrName == p {
360 | 			he.addError("Attribute name starts with the '=' sign.")
361 | 			he.SyntaxError = true
362 | 			// Invalid attribute, starts with an '=' sign
363 | 			// Skip it to the next whitespace
364 | 			p++
365 | 			c = runes[p]
366 | 			if c == '\'' {
367 | 				p = p + 1 + runesIndex(runes[p+1:], '\'')
368 | 			} else if c == '"' {
369 | 
370 | 				p = p + 1 + runesIndex(runes[p+1:], '"')
371 | 			}
372 | 			continue
373 | 		}
374 | 
375 | 		attrName = strings.ToLower(strings.TrimSpace(string(runes[startAttrName:p])))
376 | 		p++ // skipt the equal sign
377 | 		if p == l {
378 | 			he.addError("Attribute ends with equal sign.")
379 | 			he.SyntaxError = true
380 | 			he.FatalSyntaxError = true
381 | 			return
382 | 		}
383 | 
384 | 		startAttrVal := p
385 | 		c = runes[p]
386 | 
387 | 		if unicode.IsSpace(c) || c == '>' {
388 | 			// This is a malformed attribute since it has a whitespace after the '=' sign,
389 | 			// like <a class= abc> or <a class=>
390 | 			he.addError("Attribute is missing value: " + attrName)
391 | 			he.SyntaxError = true
392 | 			he.AddAttribute(attrName, "")
393 | 			continue
394 | 		}
395 | 
396 | 		if c == '\'' || c == '"' {
397 | 			startAttrVal++
398 | 			np := runesIndex(runes[p+1:], c)
399 | 			if np == -1 {
400 | 				// Argh, this attribute is missing the end quote, stop parsing
401 | 				he.addError("Attribute is missing end quote: " + attrName)
402 | 				he.SyntaxError = true
403 | 				he.FatalSyntaxError = true
404 | 				return
405 | 			}
406 | 			p = np + p + 1
407 | 
408 | 			if p == startAttrVal {
409 | 				attrVal = ""
410 | 			} else {
411 | 				attrVal = string(runes[startAttrVal:p])
412 | 			}
413 | 			he.AddAttribute(attrName, attrVal)
414 | 			continue
415 | 		}
416 | 
417 | 		// This is an attribute without a quote. Find the first whitespace or >
418 | 		for ; p < l; p++ {
419 | 			c = runes[p]
420 | 			if unicode.IsSpace(c) || c == '>' || p == l-1 {
421 | 
422 | 				attrVal = string(runes[startAttrVal : p+1])
423 | 				he.AddAttribute(attrName, attrVal)
424 | 				break
425 | 			}
426 | 		}
427 | 	}
428 | 
429 | }
430 | 
// parseClosingTag extracts the lower-cased tag name from a closing tag such as
// "</div>" or "</DIV >". It returns "" when elem does not start with "</".
//
// The name ends at the first '>' or whitespace. When neither is present (a
// truncated tag such as "</div") the rest of the text after "</" is the name.
func parseClosingTag(elem string) string {
	if !strings.HasPrefix(elem, "</") {
		return ""
	}

	name := elem[2:]
	terminator := func(r rune) bool {
		return r == '>' || unicode.IsSpace(r)
	}
	if end := strings.IndexFunc(name, terminator); end >= 0 {
		name = name[:end]
	}
	return strings.ToLower(strings.TrimSpace(name))
}
445 | 
// BuildOpenTagHEI builds an opening tag for the element described by ei, using
// ei.TagName as the tag name. ei must not be nil (its TagName is read
// unconditionally). See internalBuildOpenTag for the meaning of noEvents and
// noUnknownAttributes. The tag is never written in XML empty-tag form.
func BuildOpenTagHEI(ei *HtmlElementInfo, attributes []attributeInfo, noEvents, noUnknownAttributes bool) string {
	return internalBuildOpenTag(ei, ei.TagName, attributes, noEvents, noUnknownAttributes, false)
}
449 | 
// BuildOpenTag builds an opening tag for tagName with the given attributes.
// Element metadata is looked up only when noUnknownAttributes is set, since
// that is the only case in which it is consulted. The tag is never written in
// XML empty-tag form.
func BuildOpenTag(tagName string, attributes []attributeInfo, noEvents, noUnknownAttributes bool) string {
	var ei *HtmlElementInfo
	if noUnknownAttributes {
		ei = GetElementInfo(tagName)
	}
	return internalBuildOpenTag(ei, tagName, attributes, noEvents, noUnknownAttributes, false)
}
457 | 
// HtmlAttributeEncode escapes attributeValue for use inside a double-quoted
// attribute: '&' becomes "&amp;" and '"' becomes "&quot;". Values containing
// neither character are returned unchanged.
func HtmlAttributeEncode(attributeValue string) string {
	if !strings.ContainsAny(attributeValue, `&"`) {
		// Nothing to escape (this also covers the empty string).
		return attributeValue
	}

	var out bytes.Buffer
	for _, ch := range attributeValue {
		if ch == '&' {
			out.WriteString("&amp;")
		} else if ch == '"' {
			out.WriteString("&quot;")
		} else {
			out.WriteRune(ch)
		}
	}
	return out.String()
}
483 | 
484 | func NeedQuotesForAttr(val string) QuoteType {
485 | 	if val == "" {
486 | 		return QTDouble
487 | 	}
488 | 
489 | 	qt := QTNone
490 | 	runes := []rune(val)
491 | 	for c := range runes {
492 | 		switch {
493 | 		case c >= 'a' && c <= 'z':
494 | 			continue
495 | 		case c >= 'A' && c <= 'Z':
496 | 			continue
497 | 		case c >= '0' && c <= '9':
498 | 			continue
499 | 		case c == '_' || c == '-' || c == '.' || c == ',': // According to http://www.w3.org/TR/html401/intro/sgmltut.html#h-3.2.2
500 | 			continue
501 | 		}
502 | 		qt = QTDouble
503 | 		if c == '"' {
504 | 			qt = QTSingle
505 | 		}
506 | 	}
507 | 	return qt
508 | }
509 | 
// cleanStyleAttr normalises the value of a style attribute. Each
// "name: value" declaration is trimmed, the property name is lower-cased, and
// declarations without a name, a ':' or a value are dropped. The result is the
// remaining declarations joined as "name:value;name:value".
//
// Declaration order is preserved because later declarations override earlier
// ones in CSS, and values keep their case because URLs, font names and strings
// are case-sensitive.
//
// NOTE: the split is a plain split on ';', so a semicolon inside a value (for
// example in a data: URL) still ends the declaration.
func cleanStyleAttr(style string) string {
	if style == "" {
		return style
	}

	var out bytes.Buffer

	for _, decl := range strings.Split(style, ";") {
		colon := strings.IndexRune(decl, ':')
		if colon == -1 {
			continue
		}

		name := strings.ToLower(strings.TrimSpace(decl[:colon]))
		value := strings.TrimSpace(decl[colon+1:])
		if name == "" || value == "" {
			continue
		}

		if out.Len() > 0 {
			out.WriteRune(';')
		}
		out.WriteString(name)
		out.WriteRune(':')
		out.WriteString(value)
	}

	return out.String()
}
546 | 
547 | func internalBuildOpenTag(ei *HtmlElementInfo, tagName string, attributes []attributeInfo, noEvents, noUnknownAttributes, xmlEmptyTag bool) string {
548 | 	if !noUnknownAttributes {
549 | 		ei = nil
550 | 	}
551 | 
552 | 	n := bytes.NewBufferString("")
553 | 
554 | 	n.WriteRune('<')
555 | 	n.WriteString(tagName)
556 | 
557 | 	for _, a := range attributes {
558 | 
559 | 		if a.Name == "" || noEvents && strings.HasPrefix(a.Name, "on") {
560 | 			continue
561 | 		}
562 | 
563 | 		if ei != nil && ei.GetAttributeStatus(a.Name) == ASUnknown {
564 | 			continue
565 | 		}
566 | 
567 | 		n.WriteRune(' ')
568 | 		n.WriteString(a.Name)
569 | 		if a.Value == "" {
570 | 
571 | 			continue // Empty attribute (valid on HTML5 and above)
572 | 		}
573 | 
574 | 		n.WriteRune('=')
575 | 
576 | 		if len(a.Value) > 0 {
577 | 			encoded := html.EscapeString(a.Value)
578 | 			n.WriteRune('"')
579 | 			n.WriteString(encoded)
580 | 			n.WriteRune('"')
581 | 		}
582 | 	}
583 | 	if xmlEmptyTag {
584 | 		n.WriteString(" />")
585 | 	} else {
586 | 		n.WriteRune('>')
587 | 	}
588 | 	return n.String()
589 | 
590 | }
591 | 


--------------------------------------------------------------------------------
/htmlelementinfo.go:
--------------------------------------------------------------------------------
  1 | package htmlparser
  2 | 
  3 | import (
  4 | 	"strings"
  5 | )
  6 | 
// globalAttributes is the sorted, lower-cased list built in init from the
// base attribute string.
var globalAttributes []string

// allElements is the element table; it is expected to be filled by
// initElements, which init calls before reading it.
var allElements []HtmlElementInfo

// elemsInfo indexes the entries of allElements by tag name; built in init.
var elemsInfo map[string]HtmlElementInfo
 10 | 
// init builds the package-level lookup tables: the global attribute list and
// the tag-name -> element-info map.
func init() {
	// NOTE(review): the ";;" after "translate" yields one empty entry in
	// globalAttributes once the string is split.
	baseAttributes := "accesskey;class;contenteditable;contextmenu;dir;draggable;dropzone;hidden;id;lang;spellcheck;style;tabindex;title;translate;;onabort;onblur;oncanplay;oncanplaythrough;onchange;onclick;oncontextmenu;ondblclick;ondrag;ondragend;ondragenter;ondragleave;ondragover;ondragstart;ondrop;ondurationchange;onemptied;onended;onerror;onfocus;oninput;oninvalid;onkeydown;onkeypress;onkeyup;onload;onloaddata;onloadeddata;onloadedmetadata;onloadstart;onmousedown;onmousemove;onmouseout;onmouseover;onmouseup;onmousewheel;onpause;onplay;onplaying;onprogress;onratechange;onreadystatechange;onreset;onscroll;onseekend;onseeking;onselect;onshow;onstalled;onsubmit;onsuspended;ontimeupdate;onvolumechange;onwaiting;xml:base;xml:lang;xml:space"
	globalAttributes = convertSemicolonDelimited(baseAttributes)

	// Must run before the loop below, which reads allElements.
	initElements()

	elemsInfo = make(map[string]HtmlElementInfo, len(allElements))
	for _, hei := range allElements {
		// hei is a copy of the slice element: setAttributes is applied to the
		// copy that is then stored in the map, not to allElements itself.
		hei.setAttributes(hei.attributesString)
		elemsInfo[hei.TagName] = hei
	}
}
 23 | 
// HtmlElementInfo is the static description of one HTML element: its tag name,
// content categories, permitted attributes and nesting rules.
//
// The string slices consulted through sorted_contains (Attributes,
// ObsoleteAttributes, ParentTags, ExcludeParentTags) must be sorted, because
// that lookup is a binary search.
type HtmlElementInfo struct {
	TagName                string
	HtmlVersion            int  // HTML version that introduced this tag
	Obsolete               bool // Indicates if this element is obsolete
	TagFormatting          HtmlTagFormatting
	ElementType            HtmlElementType
	PermittedChildrenTypes HtmlElementType // Valid types of elements that can be nested inside this tag
	PermittedChildrenTags  []string        // Valid children for this tag
	Attributes             []string        // attributes reported as ASValid by GetAttributeStatus
	attributesString       []string        // This is temporary to be merged with globalAttributes
	ObsoleteAttributes     []string        // attributes reported as ASDeprecated by GetAttributeStatus
	ParentContentTypes     HtmlElementType // matched against the parent's PermittedChildrenTypes in IsValidParent
	ParentTags             []string        // parents that IsValidParent always accepts
	ExcludeParentTags      []string        // parents that IsValidParent always rejects
}
 39 | 
 40 | func (hei *HtmlElementInfo) GetAttributeStatus(attrName string) AttrStatus {
 41 | 	if attrName == "" {
 42 | 		return ASUnknown
 43 | 	}
 44 | 
 45 | 	attrNameLower := strings.ToLower(attrName)
 46 | 
 47 | 	if sorted_contains(hei.ObsoleteAttributes, attrNameLower) {
 48 | 		return ASDeprecated
 49 | 	}
 50 | 
 51 | 	if sorted_contains(hei.Attributes, attrNameLower) {
 52 | 		return ASValid
 53 | 	}
 54 | 
 55 | 	return ASUnknown
 56 | }
 57 | 
 58 | func (hei *HtmlElementInfo) IsValidParent(parentTagName string) bool {
 59 | 	if parentTagName == "" {
 60 | 		return true // no parent is always valid here
 61 | 	}
 62 | 
 63 | 	parentTagNameLower := strings.ToLower(parentTagName)
 64 | 
 65 | 	// Check if the parent is in the not-allowed list
 66 | 	if sorted_contains(hei.ExcludeParentTags, parentTagNameLower) {
 67 | 		return false
 68 | 	}
 69 | 
 70 | 	// Check if the parent is in the white list
 71 | 	if sorted_contains(hei.ParentTags, parentTagNameLower) {
 72 | 		return true
 73 | 	}
 74 | 
 75 | 	// Finally, check if the content type is allowed
 76 | 	if hei.ParentContentTypes == HETNone {
 77 | 		return false
 78 | 	}
 79 | 
 80 | 	parentInfo := GetElementInfo(parentTagNameLower)
 81 | 	if parentInfo == nil {
 82 | 		if strings.Contains(parentTagName, ":") {
 83 | 			return true // assume it's a custom defined element
 84 | 		}
 85 | 
 86 | 		return false
 87 | 	}
 88 | 
 89 | 	if (hei.ParentContentTypes & parentInfo.PermittedChildrenTypes) != 0 {
 90 | 		return true
 91 | 	}
 92 | 
 93 | 	return false
 94 | }
 95 | 
 96 | func (hei *HtmlElementInfo) setPermittedChildrenTags(tags string) {
 97 | 	hei.PermittedChildrenTags = convertSemicolonDelimited(tags)
 98 | }
 99 | 
100 | func (hei *HtmlElementInfo) setObsoleteAttributes(attrs string) {
101 | 	hei.ObsoleteAttributes = convertSemicolonDelimited(attrs)
102 | }
103 | 
104 | func (hei *HtmlElementInfo) setParentTags(tags string) {
105 | 	hei.ParentTags = convertSemicolonDelimited(tags)
106 | }
107 | 
108 | func (hei *HtmlElementInfo) setExcludeParentTags(tags string) {
109 | 	hei.ParentTags = convertSemicolonDelimited(tags)
110 | }
111 | 
112 | func (hei *HtmlElementInfo) setAttributes(attrs []string) {
113 | 	if len(attrs) == 0 {
114 | 		hei.Attributes = globalAttributes
115 | 	} else {
116 | 		hei.Attributes = union(attrs, globalAttributes)
117 | 
118 | 	}
119 | }
120 | 
121 | // GetElementInfo returns the HtmlElementInfo for this tag
122 | func GetElementInfo(tagName string) *HtmlElementInfo {
123 | 	if tagName == "" {
124 | 		return nil
125 | 	}
126 | 
127 | 	elem, exist := elemsInfo[tagName]
128 | 	if exist {
129 | 		return &elem
130 | 	}
131 | 	return nil
132 | }
133 | 


--------------------------------------------------------------------------------
/htmlelementinfo_init.go:
--------------------------------------------------------------------------------
   1 | package htmlparser
   2 | 
   3 | func initElements() {
   4 | 	allElements = []HtmlElementInfo{
   5 | 		HtmlElementInfo{
   6 | 			TagName:                "a",
   7 | 			HtmlVersion:            3,
   8 | 			Obsolete:               false,
   9 | 			ElementType:            HETFlow,
  10 | 			PermittedChildrenTypes: HETAnyContent,
  11 | 			PermittedChildrenTags:  []string{},
  12 | 			attributesString:       []string{"href", "target", "rel", "hreflang", "media", "type"},
  13 | 			TagFormatting:          HTFComplete,
  14 | 			ParentTags:             []string{},
  15 | 			ExcludeParentTags:      []string{"a", "button"},
  16 | 			ParentContentTypes:     HETFlow | HETPhrasing,
  17 | 			ObsoleteAttributes:     []string{"coords", "shape", "urn", "charset", "methods", "rev", "name"},
  18 | 		},
  19 | 		HtmlElementInfo{
  20 | 			TagName:                "abbr",
  21 | 			HtmlVersion:            3,
  22 | 			Obsolete:               false,
  23 | 			ElementType:            HETPhrasing,
  24 | 			PermittedChildrenTypes: HETPhrasing | HETText,
  25 | 			PermittedChildrenTags:  []string{},
  26 | 			attributesString:       []string{},
  27 | 			TagFormatting:          HTFComplete,
  28 | 			ParentTags:             []string{},
  29 | 			ExcludeParentTags:      []string{},
  30 | 			ParentContentTypes:     HETPhrasing,
  31 | 			ObsoleteAttributes:     []string{},
  32 | 		},
  33 | 		HtmlElementInfo{
  34 | 
  35 | 			TagName:                "acronym",
  36 | 			HtmlVersion:            4,
  37 | 			Obsolete:               true,
  38 | 			ElementType:            HETPhrasing,
  39 | 			PermittedChildrenTypes: HETAnyContent,
  40 | 			PermittedChildrenTags:  []string{},
  41 | 			attributesString:       []string{},
  42 | 			TagFormatting:          HTFComplete,
  43 | 			ParentTags:             []string{},
  44 | 			ExcludeParentTags:      []string{},
  45 | 			ParentContentTypes:     HETPhrasing,
  46 | 			ObsoleteAttributes:     []string{},
  47 | 		},
  48 | 		HtmlElementInfo{
  49 | 
  50 | 			TagName:                "address",
  51 | 			HtmlVersion:            3,
  52 | 			Obsolete:               false,
  53 | 			ElementType:            HETFlow,
  54 | 			PermittedChildrenTypes: HETAnyContent,
  55 | 			PermittedChildrenTags:  []string{},
  56 | 			attributesString:       []string{},
  57 | 			TagFormatting:          HTFComplete,
  58 | 			ParentTags:             []string{},
  59 | 			ExcludeParentTags:      []string{"address"},
  60 | 			ParentContentTypes:     HETFlow,
  61 | 			ObsoleteAttributes:     []string{},
  62 | 		},
  63 | 		HtmlElementInfo{
  64 | 
  65 | 			TagName:                "applet",
  66 | 			HtmlVersion:            3,
  67 | 			Obsolete:               true,
  68 | 			ElementType:            HETPhrasing,
  69 | 			PermittedChildrenTypes: HETAnyContent,
  70 | 			PermittedChildrenTags:  []string{},
  71 | 			attributesString:       []string{},
  72 | 			TagFormatting:          HTFComplete,
  73 | 			ParentTags:             []string{},
  74 | 			ExcludeParentTags:      []string{},
  75 | 			ParentContentTypes:     HETPhrasing,
  76 | 			ObsoleteAttributes:     []string{},
  77 | 		},
  78 | 		HtmlElementInfo{
  79 | 
  80 | 			TagName:                "area",
  81 | 			HtmlVersion:            3,
  82 | 			Obsolete:               false,
  83 | 			ElementType:            HETPhrasing,
  84 | 			PermittedChildrenTypes: HETNone,
  85 | 			PermittedChildrenTags:  []string{},
  86 | 			attributesString:       []string{"alt", "href", "target", "rel", "media", "hreflang", "type", "shape", "coords"},
  87 | 			TagFormatting:          HTFSingle,
  88 | 			ParentTags:             []string{"map"},
  89 | 			ExcludeParentTags:      []string{},
  90 | 			ParentContentTypes:     HETNone,
  91 | 			ObsoleteAttributes:     []string{"nohref"},
  92 | 		},
  93 | 		HtmlElementInfo{
  94 | 
  95 | 			TagName:                "article",
  96 | 			HtmlVersion:            5,
  97 | 			Obsolete:               false,
  98 | 			ElementType:            HETFlow,
  99 | 			PermittedChildrenTypes: HETAnyContent,
 100 | 			PermittedChildrenTags:  []string{},
 101 | 			attributesString:       []string{},
 102 | 			TagFormatting:          HTFComplete,
 103 | 			ParentTags:             []string{},
 104 | 			ExcludeParentTags:      []string{},
 105 | 			ParentContentTypes:     HETFlow,
 106 | 			ObsoleteAttributes:     []string{},
 107 | 		},
 108 | 		HtmlElementInfo{
 109 | 
 110 | 			TagName:                "aside",
 111 | 			HtmlVersion:            5,
 112 | 			Obsolete:               false,
 113 | 			ElementType:            HETFlow,
 114 | 			PermittedChildrenTypes: HETAnyContent,
 115 | 			PermittedChildrenTags:  []string{},
 116 | 			attributesString:       []string{},
 117 | 			TagFormatting:          HTFComplete,
 118 | 			ParentTags:             []string{},
 119 | 			ExcludeParentTags:      []string{"address"},
 120 | 			ParentContentTypes:     HETFlow,
 121 | 			ObsoleteAttributes:     []string{},
 122 | 		},
 123 | 		HtmlElementInfo{
 124 | 
 125 | 			TagName:                "audio",
 126 | 			HtmlVersion:            5,
 127 | 			Obsolete:               false,
 128 | 			ElementType:            HETPhrasing,
 129 | 			PermittedChildrenTypes: HETFlow | HETPhrasing,
 130 | 			PermittedChildrenTags:  []string{},
 131 | 			attributesString:       []string{"autoplay", "preload", "controls", "loop", "mediagroup", "muted", "src"},
 132 | 			TagFormatting:          HTFComplete,
 133 | 			ParentTags:             []string{},
 134 | 			ExcludeParentTags:      []string{"a", "button"},
 135 | 			ParentContentTypes:     HETFlow | HETPhrasing,
 136 | 			ObsoleteAttributes:     []string{},
 137 | 		},
 138 | 		HtmlElementInfo{
 139 | 
 140 | 			TagName:                "b",
 141 | 			HtmlVersion:            3,
 142 | 			Obsolete:               false,
 143 | 			ElementType:            HETPhrasing,
 144 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 145 | 			PermittedChildrenTags:  []string{},
 146 | 			attributesString:       []string{},
 147 | 			TagFormatting:          HTFComplete,
 148 | 			ParentTags:             []string{},
 149 | 			ExcludeParentTags:      []string{},
 150 | 			ParentContentTypes:     HETPhrasing,
 151 | 			ObsoleteAttributes:     []string{},
 152 | 		},
 153 | 		HtmlElementInfo{
 154 | 
 155 | 			TagName:                "base",
 156 | 			HtmlVersion:            3,
 157 | 			Obsolete:               false,
 158 | 			ElementType:            HETPhrasing,
 159 | 			PermittedChildrenTypes: HETNone,
 160 | 			PermittedChildrenTags:  []string{},
 161 | 			attributesString:       []string{"href", "target"},
 162 | 			TagFormatting:          HTFSingle,
 163 | 			ParentTags:             []string{"head"},
 164 | 			ExcludeParentTags:      []string{},
 165 | 			ParentContentTypes:     HETNone,
 166 | 			ObsoleteAttributes:     []string{},
 167 | 		},
 168 | 		HtmlElementInfo{
 169 | 
 170 | 			TagName:                "basefont",
 171 | 			HtmlVersion:            3,
 172 | 			Obsolete:               true,
 173 | 			ElementType:            HETPhrasing,
 174 | 			PermittedChildrenTypes: HETNone,
 175 | 			PermittedChildrenTags:  []string{},
 176 | 			attributesString:       []string{},
 177 | 			TagFormatting:          HTFSingle,
 178 | 			ParentTags:             []string{},
 179 | 			ExcludeParentTags:      []string{},
 180 | 			ParentContentTypes:     HETPhrasing,
 181 | 			ObsoleteAttributes:     []string{},
 182 | 		},
 183 | 		HtmlElementInfo{
 184 | 
 185 | 			TagName:                "bdi",
 186 | 			HtmlVersion:            5,
 187 | 			Obsolete:               false,
 188 | 			ElementType:            HETPhrasing,
 189 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 190 | 			PermittedChildrenTags:  []string{},
 191 | 			attributesString:       []string{},
 192 | 			TagFormatting:          HTFComplete,
 193 | 			ParentTags:             []string{},
 194 | 			ExcludeParentTags:      []string{},
 195 | 			ParentContentTypes:     HETPhrasing,
 196 | 			ObsoleteAttributes:     []string{},
 197 | 		},
 198 | 		HtmlElementInfo{
 199 | 
 200 | 			TagName:                "bdo",
 201 | 			HtmlVersion:            3,
 202 | 			Obsolete:               false,
 203 | 			ElementType:            HETPhrasing,
 204 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 205 | 			PermittedChildrenTags:  []string{},
 206 | 			attributesString:       []string{},
 207 | 			TagFormatting:          HTFComplete,
 208 | 			ParentTags:             []string{},
 209 | 			ExcludeParentTags:      []string{},
 210 | 			ParentContentTypes:     HETPhrasing,
 211 | 			ObsoleteAttributes:     []string{},
 212 | 		},
 213 | 		HtmlElementInfo{
 214 | 
 215 | 			TagName:                "big",
 216 | 			HtmlVersion:            3,
 217 | 			Obsolete:               true,
 218 | 			ElementType:            HETPhrasing,
 219 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 220 | 			PermittedChildrenTags:  []string{},
 221 | 			attributesString:       []string{},
 222 | 			TagFormatting:          HTFComplete,
 223 | 			ParentTags:             []string{},
 224 | 			ExcludeParentTags:      []string{},
 225 | 			ParentContentTypes:     HETPhrasing,
 226 | 			ObsoleteAttributes:     []string{},
 227 | 		},
 228 | 		HtmlElementInfo{
 229 | 
 230 | 			TagName:                "blockquote",
 231 | 			HtmlVersion:            3,
 232 | 			Obsolete:               false,
 233 | 			ElementType:            HETFlow,
 234 | 			PermittedChildrenTypes: HETAnyContent,
 235 | 			PermittedChildrenTags:  []string{},
 236 | 			attributesString:       []string{"cite"},
 237 | 			TagFormatting:          HTFComplete,
 238 | 			ParentTags:             []string{},
 239 | 			ExcludeParentTags:      []string{},
 240 | 			ParentContentTypes:     HETFlow,
 241 | 			ObsoleteAttributes:     []string{},
 242 | 		},
 243 | 		HtmlElementInfo{
 244 | 
 245 | 			TagName:                "body",
 246 | 			HtmlVersion:            3,
 247 | 			Obsolete:               false,
 248 | 			ElementType:            HETFlow,
 249 | 			PermittedChildrenTypes: HETAnyContent,
 250 | 			PermittedChildrenTags:  []string{"script", "style"},
 251 | 			attributesString:       []string{"onafterprint", "onbeforeprint", "onbeforeunload", "onblur", "onerror", "onfocus", "onhaschange", "onload", "onmessage", "onoffline", "ononline", "onpagehide", "onpageshow", "onpopstate", "onresize", "onstoragte", "onunload"},
 252 | 			TagFormatting:          HTFOptionalClosing,
 253 | 			ParentTags:             []string{"html"},
 254 | 			ExcludeParentTags:      []string{},
 255 | 			ParentContentTypes:     HETNone,
 256 | 			ObsoleteAttributes:     []string{"alink", "background", "bgcolor", "link", "marginbottom", "marginheight", "marginleft", "marginright", "margintop", "marginwidth", "text", "vlink"},
 257 | 		},
 258 | 		HtmlElementInfo{
 259 | 
 260 | 			TagName:                "br",
 261 | 			HtmlVersion:            3,
 262 | 			Obsolete:               false,
 263 | 			ElementType:            HETPhrasing,
 264 | 			PermittedChildrenTypes: HETNone,
 265 | 			PermittedChildrenTags:  []string{},
 266 | 			attributesString:       []string{},
 267 | 			TagFormatting:          HTFSingle,
 268 | 			ParentTags:             []string{},
 269 | 			ExcludeParentTags:      []string{},
 270 | 			ParentContentTypes:     HETFlow | HETPhrasing,
 271 | 			ObsoleteAttributes:     []string{"clear"},
 272 | 		},
 273 | 		HtmlElementInfo{
 274 | 
 275 | 			TagName:                "button",
 276 | 			HtmlVersion:            4,
 277 | 			Obsolete:               false,
 278 | 			ElementType:            HETPhrasing,
 279 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 280 | 			PermittedChildrenTags:  []string{},
 281 | 			attributesString:       []string{"name", "disabled", "form", "type", "value", "formaction", "autofocus", "formenctype", "formmethod", "formtarget", "formnovalidate"},
 282 | 			TagFormatting:          HTFComplete,
 283 | 			ParentTags:             []string{},
 284 | 			ExcludeParentTags:      []string{"a", "button"},
 285 | 			ParentContentTypes:     HETPhrasing,
 286 | 			ObsoleteAttributes:     []string{},
 287 | 		},
 288 | 		HtmlElementInfo{
 289 | 
 290 | 			TagName:                "canvas",
 291 | 			HtmlVersion:            5,
 292 | 			Obsolete:               false,
 293 | 			ElementType:            HETFlow,
 294 | 			PermittedChildrenTypes: HETAnyContent,
 295 | 			PermittedChildrenTags:  []string{},
 296 | 			attributesString:       []string{"height", "width"},
 297 | 			TagFormatting:          HTFComplete,
 298 | 			ParentTags:             []string{},
 299 | 			ExcludeParentTags:      []string{},
 300 | 			ParentContentTypes:     HETFlow | HETPhrasing,
 301 | 			ObsoleteAttributes:     []string{},
 302 | 		},
 303 | 		HtmlElementInfo{
 304 | 
 305 | 			TagName:                "caption",
 306 | 			HtmlVersion:            3,
 307 | 			Obsolete:               false,
 308 | 			ElementType:            HETPhrasing,
 309 | 			PermittedChildrenTypes: HETAnyContent,
 310 | 			PermittedChildrenTags:  []string{},
 311 | 			attributesString:       []string{},
 312 | 			TagFormatting:          HTFComplete,
 313 | 			ParentTags:             []string{"table"},
 314 | 			ExcludeParentTags:      []string{},
 315 | 			ParentContentTypes:     HETNone,
 316 | 			ObsoleteAttributes:     []string{"align"},
 317 | 		},
 318 | 		HtmlElementInfo{
 319 | 
 320 | 			TagName:                "center",
 321 | 			HtmlVersion:            3,
 322 | 			Obsolete:               true,
 323 | 			ElementType:            HETFlow,
 324 | 			PermittedChildrenTypes: HETAnyContent,
 325 | 			PermittedChildrenTags:  []string{},
 326 | 			attributesString:       []string{},
 327 | 			TagFormatting:          HTFComplete,
 328 | 			ParentTags:             []string{},
 329 | 			ExcludeParentTags:      []string{},
 330 | 			ParentContentTypes:     HETFlow,
 331 | 			ObsoleteAttributes:     []string{},
 332 | 		},
 333 | 		HtmlElementInfo{
 334 | 
 335 | 			TagName:                "cite",
 336 | 			HtmlVersion:            3,
 337 | 			Obsolete:               false,
 338 | 			ElementType:            HETPhrasing,
 339 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 340 | 			PermittedChildrenTags:  []string{},
 341 | 			attributesString:       []string{},
 342 | 			TagFormatting:          HTFComplete,
 343 | 			ParentTags:             []string{},
 344 | 			ExcludeParentTags:      []string{},
 345 | 			ParentContentTypes:     HETPhrasing,
 346 | 			ObsoleteAttributes:     []string{},
 347 | 		},
 348 | 		HtmlElementInfo{
 349 | 
 350 | 			TagName:                "code",
 351 | 			HtmlVersion:            3,
 352 | 			Obsolete:               false,
 353 | 			ElementType:            HETPhrasing,
 354 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 355 | 			PermittedChildrenTags:  []string{},
 356 | 			attributesString:       []string{},
 357 | 			TagFormatting:          HTFComplete,
 358 | 			ParentTags:             []string{},
 359 | 			ExcludeParentTags:      []string{},
 360 | 			ParentContentTypes:     HETPhrasing,
 361 | 			ObsoleteAttributes:     []string{},
 362 | 		},
 363 | 		HtmlElementInfo{
 364 | 
 365 | 			TagName:                "col",
 366 | 			HtmlVersion:            3,
 367 | 			Obsolete:               false,
 368 | 			ElementType:            HETPhrasing,
 369 | 			PermittedChildrenTypes: HETNone,
 370 | 			PermittedChildrenTags:  []string{},
 371 | 			attributesString:       []string{"span"},
 372 | 			TagFormatting:          HTFSingle,
 373 | 			ParentTags:             []string{"colgroup"},
 374 | 			ExcludeParentTags:      []string{},
 375 | 			ParentContentTypes:     HETNone,
 376 | 			ObsoleteAttributes:     []string{"align", "width", "char", "charoff", "valign"},
 377 | 		},
 378 | 		HtmlElementInfo{
 379 | 
 380 | 			TagName:                "colgroup",
 381 | 			HtmlVersion:            4,
 382 | 			Obsolete:               false,
 383 | 			ElementType:            HETPhrasing,
 384 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 385 | 			PermittedChildrenTags:  []string{},
 386 | 			attributesString:       []string{"span"},
 387 | 			TagFormatting:          HTFOptionalClosing,
 388 | 			ParentTags:             []string{"table"},
 389 | 			ExcludeParentTags:      []string{},
 390 | 			ParentContentTypes:     HETNone,
 391 | 			ObsoleteAttributes:     []string{"width", "char", "charoff", "valign"},
 392 | 		},
 393 | 		HtmlElementInfo{
 394 | 
 395 | 			TagName:                "command",
 396 | 			HtmlVersion:            5,
 397 | 			Obsolete:               false,
 398 | 			ElementType:            HETMeta,
 399 | 			PermittedChildrenTypes: HETNone,
 400 | 			PermittedChildrenTags:  []string{},
 401 | 			attributesString:       []string{"type", "label", "icon", "disabled", "radiogroup", "checked"},
 402 | 			TagFormatting:          HTFSingle,
 403 | 			ParentTags:             []string{},
 404 | 			ExcludeParentTags:      []string{},
 405 | 			ParentContentTypes:     HETPhrasing | HETMeta,
 406 | 			ObsoleteAttributes:     []string{},
 407 | 		},
 408 | 		HtmlElementInfo{
 409 | 
 410 | 			TagName:                "datalist",
 411 | 			HtmlVersion:            5,
 412 | 			Obsolete:               false,
 413 | 			ElementType:            HETPhrasing,
 414 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 415 | 			PermittedChildrenTags:  []string{},
 416 | 			attributesString:       []string{},
 417 | 			TagFormatting:          HTFComplete,
 418 | 			ParentTags:             []string{},
 419 | 			ExcludeParentTags:      []string{},
 420 | 			ParentContentTypes:     HETPhrasing,
 421 | 			ObsoleteAttributes:     []string{},
 422 | 		},
 423 | 		HtmlElementInfo{
 424 | 
 425 | 			TagName:                "dd",
 426 | 			HtmlVersion:            3,
 427 | 			Obsolete:               false,
 428 | 			ElementType:            HETFlow,
 429 | 			PermittedChildrenTypes: HETAnyContent,
 430 | 			PermittedChildrenTags:  []string{},
 431 | 			attributesString:       []string{},
 432 | 			TagFormatting:          HTFOptionalClosing,
 433 | 			ParentTags:             []string{"dl"},
 434 | 			ExcludeParentTags:      []string{},
 435 | 			ParentContentTypes:     HETNone,
 436 | 			ObsoleteAttributes:     []string{},
 437 | 		},
 438 | 		HtmlElementInfo{
 439 | 
 440 | 			TagName:                "del",
 441 | 			HtmlVersion:            4,
 442 | 			Obsolete:               false,
 443 | 			ElementType:            HETPhrasing,
 444 | 			PermittedChildrenTypes: HETAnyContent,
 445 | 			PermittedChildrenTags:  []string{},
 446 | 			attributesString:       []string{"cite", "datetime"},
 447 | 			TagFormatting:          HTFComplete,
 448 | 			ParentTags:             []string{},
 449 | 			ExcludeParentTags:      []string{},
 450 | 			ParentContentTypes:     HETFlow | HETPhrasing,
 451 | 			ObsoleteAttributes:     []string{},
 452 | 		},
 453 | 		HtmlElementInfo{
 454 | 
 455 | 			TagName:                "details",
 456 | 			HtmlVersion:            5,
 457 | 			Obsolete:               false,
 458 | 			ElementType:            HETFlow,
 459 | 			PermittedChildrenTypes: HETAnyContent,
 460 | 			PermittedChildrenTags:  []string{},
 461 | 			attributesString:       []string{"open"},
 462 | 			TagFormatting:          HTFComplete,
 463 | 			ParentTags:             []string{},
 464 | 			ExcludeParentTags:      []string{"a", "button"},
 465 | 			ParentContentTypes:     HETFlow,
 466 | 			ObsoleteAttributes:     []string{},
 467 | 		},
 468 | 		HtmlElementInfo{
 469 | 
 470 | 			TagName:                "dfn",
 471 | 			HtmlVersion:            3,
 472 | 			Obsolete:               false,
 473 | 			ElementType:            HETPhrasing,
 474 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 475 | 			PermittedChildrenTags:  []string{},
 476 | 			attributesString:       []string{},
 477 | 			TagFormatting:          HTFComplete,
 478 | 			ParentTags:             []string{},
 479 | 			ExcludeParentTags:      []string{},
 480 | 			ParentContentTypes:     HETPhrasing,
 481 | 			ObsoleteAttributes:     []string{},
 482 | 		},
 483 | 		HtmlElementInfo{
 484 | 
 485 | 			TagName:                "dir",
 486 | 			HtmlVersion:            3,
 487 | 			Obsolete:               true,
 488 | 			ElementType:            HETFlow,
 489 | 			PermittedChildrenTypes: HETAnyContent,
 490 | 			PermittedChildrenTags:  []string{},
 491 | 			attributesString:       []string{},
 492 | 			TagFormatting:          HTFComplete,
 493 | 			ParentTags:             []string{},
 494 | 			ExcludeParentTags:      []string{},
 495 | 			ParentContentTypes:     HETFlow,
 496 | 			ObsoleteAttributes:     []string{},
 497 | 		},
 498 | 		HtmlElementInfo{
 499 | 
 500 | 			TagName:                "div",
 501 | 			HtmlVersion:            3,
 502 | 			Obsolete:               false,
 503 | 			ElementType:            HETFlow,
 504 | 			PermittedChildrenTypes: HETAnyContent,
 505 | 			PermittedChildrenTags:  []string{},
 506 | 			attributesString:       []string{},
 507 | 			TagFormatting:          HTFComplete,
 508 | 			ParentTags:             []string{},
 509 | 			ExcludeParentTags:      []string{},
 510 | 			ParentContentTypes:     HETFlow,
 511 | 			ObsoleteAttributes:     []string{},
 512 | 		},
 513 | 		HtmlElementInfo{
 514 | 
 515 | 			TagName:                "dl",
 516 | 			HtmlVersion:            3,
 517 | 			Obsolete:               false,
 518 | 			ElementType:            HETFlow,
 519 | 			PermittedChildrenTypes: HETAnyContent,
 520 | 			PermittedChildrenTags:  []string{},
 521 | 			attributesString:       []string{},
 522 | 			TagFormatting:          HTFComplete,
 523 | 			ParentTags:             []string{},
 524 | 			ExcludeParentTags:      []string{},
 525 | 			ParentContentTypes:     HETFlow,
 526 | 			ObsoleteAttributes:     []string{"compact"},
 527 | 		},
 528 | 		HtmlElementInfo{
 529 | 
 530 | 			TagName:                "dt",
 531 | 			HtmlVersion:            3,
 532 | 			Obsolete:               false,
 533 | 			ElementType:            HETFlow,
 534 | 			PermittedChildrenTypes: HETAnyContent,
 535 | 			PermittedChildrenTags:  []string{},
 536 | 			attributesString:       []string{},
 537 | 			TagFormatting:          HTFOptionalClosing,
 538 | 			ParentTags:             []string{"dl"},
 539 | 			ExcludeParentTags:      []string{},
 540 | 			ParentContentTypes:     HETNone,
 541 | 			ObsoleteAttributes:     []string{},
 542 | 		},
 543 | 		HtmlElementInfo{
 544 | 
 545 | 			TagName:                "em",
 546 | 			HtmlVersion:            3,
 547 | 			Obsolete:               false,
 548 | 			ElementType:            HETPhrasing,
 549 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 550 | 			PermittedChildrenTags:  []string{},
 551 | 			attributesString:       []string{},
 552 | 			TagFormatting:          HTFComplete,
 553 | 			ParentTags:             []string{},
 554 | 			ExcludeParentTags:      []string{},
 555 | 			ParentContentTypes:     HETPhrasing,
 556 | 			ObsoleteAttributes:     []string{},
 557 | 		},
 558 | 		HtmlElementInfo{
 559 | 
 560 | 			TagName:                "embed",
 561 | 			HtmlVersion:            3,
 562 | 			Obsolete:               true,
 563 | 			ElementType:            HETPhrasing,
 564 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 565 | 			PermittedChildrenTags:  []string{},
 566 | 			attributesString:       []string{},
 567 | 			TagFormatting:          HTFSingle,
 568 | 			ParentTags:             []string{},
 569 | 			ExcludeParentTags:      []string{},
 570 | 			ParentContentTypes:     HETPhrasing,
 571 | 			ObsoleteAttributes:     []string{},
 572 | 		},
 573 | 		HtmlElementInfo{
 574 | 
 575 | 			TagName:                "fieldset",
 576 | 			HtmlVersion:            4,
 577 | 			Obsolete:               false,
 578 | 			ElementType:            HETFlow,
 579 | 			PermittedChildrenTypes: HETAnyContent,
 580 | 			PermittedChildrenTags:  []string{},
 581 | 			attributesString:       []string{"name", "disabled", "form"},
 582 | 			TagFormatting:          HTFComplete,
 583 | 			ParentTags:             []string{},
 584 | 			ExcludeParentTags:      []string{},
 585 | 			ParentContentTypes:     HETFlow,
 586 | 			ObsoleteAttributes:     []string{},
 587 | 		},
 588 | 		HtmlElementInfo{
 589 | 
 590 | 			TagName:                "figcaption",
 591 | 			HtmlVersion:            5,
 592 | 			Obsolete:               false,
 593 | 			ElementType:            HETFlow,
 594 | 			PermittedChildrenTypes: HETAnyContent,
 595 | 			PermittedChildrenTags:  []string{},
 596 | 			attributesString:       []string{},
 597 | 			TagFormatting:          HTFComplete,
 598 | 			ParentTags:             []string{"figure"},
 599 | 			ExcludeParentTags:      []string{},
 600 | 			ParentContentTypes:     HETNone,
 601 | 			ObsoleteAttributes:     []string{},
 602 | 		},
 603 | 		HtmlElementInfo{
 604 | 
 605 | 			TagName:                "figure",
 606 | 			HtmlVersion:            5,
 607 | 			Obsolete:               false,
 608 | 			ElementType:            HETFlow,
 609 | 			PermittedChildrenTypes: HETAnyContent,
 610 | 			PermittedChildrenTags:  []string{},
 611 | 			attributesString:       []string{},
 612 | 			TagFormatting:          HTFComplete,
 613 | 			ParentTags:             []string{},
 614 | 			ExcludeParentTags:      []string{},
 615 | 			ParentContentTypes:     HETFlow,
 616 | 			ObsoleteAttributes:     []string{},
 617 | 		},
 618 | 		HtmlElementInfo{
 619 | 
 620 | 			TagName:                "font",
 621 | 			HtmlVersion:            3,
 622 | 			Obsolete:               true,
 623 | 			ElementType:            HETPhrasing,
 624 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 625 | 			PermittedChildrenTags:  []string{},
 626 | 			attributesString:       []string{},
 627 | 			TagFormatting:          HTFComplete,
 628 | 			ParentTags:             []string{},
 629 | 			ExcludeParentTags:      []string{},
 630 | 			ParentContentTypes:     HETPhrasing,
 631 | 			ObsoleteAttributes:     []string{},
 632 | 		},
 633 | 		HtmlElementInfo{
 634 | 
 635 | 			TagName:                "footer",
 636 | 			HtmlVersion:            5,
 637 | 			Obsolete:               false,
 638 | 			ElementType:            HETFlow,
 639 | 			PermittedChildrenTypes: HETAnyContent,
 640 | 			PermittedChildrenTags:  []string{},
 641 | 			attributesString:       []string{},
 642 | 			TagFormatting:          HTFComplete,
 643 | 			ParentTags:             []string{},
 644 | 			ExcludeParentTags:      []string{"header", "footer", "address"},
 645 | 			ParentContentTypes:     HETFlow,
 646 | 			ObsoleteAttributes:     []string{},
 647 | 		},
 648 | 		HtmlElementInfo{
 649 | 
 650 | 			TagName:                "form",
 651 | 			HtmlVersion:            3,
 652 | 			Obsolete:               false,
 653 | 			ElementType:            HETFlow,
 654 | 			PermittedChildrenTypes: HETAnyContent,
 655 | 			PermittedChildrenTags:  []string{},
 656 | 			attributesString:       []string{"action", "method", "enctype", "name", "accept-charset", "novalidate", "target", "autocomplete"},
 657 | 			TagFormatting:          HTFComplete,
 658 | 			ParentTags:             []string{},
 659 | 			ExcludeParentTags:      []string{"form"},
 660 | 			ParentContentTypes:     HETFlow,
 661 | 			ObsoleteAttributes:     []string{},
 662 | 		},
 663 | 		HtmlElementInfo{
 664 | 
 665 | 			TagName:                "frame",
 666 | 			HtmlVersion:            3,
 667 | 			Obsolete:               true,
 668 | 			ElementType:            HETFlow,
 669 | 			PermittedChildrenTypes: HETAnyContent,
 670 | 			PermittedChildrenTags:  []string{},
 671 | 			attributesString:       []string{},
 672 | 			TagFormatting:          HTFSingle,
 673 | 			ParentTags:             []string{},
 674 | 			ExcludeParentTags:      []string{},
 675 | 			ParentContentTypes:     HETFlow,
 676 | 			ObsoleteAttributes:     []string{},
 677 | 		},
 678 | 		HtmlElementInfo{
 679 | 
 680 | 			TagName:                "frameset",
 681 | 			HtmlVersion:            3,
 682 | 			Obsolete:               true,
 683 | 			ElementType:            HETFlow,
 684 | 			PermittedChildrenTypes: HETAnyContent,
 685 | 			PermittedChildrenTags:  []string{},
 686 | 			attributesString:       []string{},
 687 | 			TagFormatting:          HTFComplete,
 688 | 			ParentTags:             []string{},
 689 | 			ExcludeParentTags:      []string{},
 690 | 			ParentContentTypes:     HETFlow,
 691 | 			ObsoleteAttributes:     []string{},
 692 | 		},
 693 | 		HtmlElementInfo{
 694 | 
 695 | 			TagName:                "h1",
 696 | 			HtmlVersion:            3,
 697 | 			Obsolete:               false,
 698 | 			ElementType:            HETFlow,
 699 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 700 | 			PermittedChildrenTags:  []string{},
 701 | 			attributesString:       []string{},
 702 | 			TagFormatting:          HTFComplete,
 703 | 			ParentTags:             []string{"hgroup"},
 704 | 			ExcludeParentTags:      []string{"address"},
 705 | 			ParentContentTypes:     HETFlow,
 706 | 			ObsoleteAttributes:     []string{"align"},
 707 | 		},
 708 | 		HtmlElementInfo{
 709 | 
 710 | 			TagName:                "h2",
 711 | 			HtmlVersion:            3,
 712 | 			Obsolete:               false,
 713 | 			ElementType:            HETFlow,
 714 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 715 | 			PermittedChildrenTags:  []string{},
 716 | 			attributesString:       []string{},
 717 | 			TagFormatting:          HTFComplete,
 718 | 			ParentTags:             []string{"hgroup"},
 719 | 			ExcludeParentTags:      []string{"address"},
 720 | 			ParentContentTypes:     HETFlow,
 721 | 			ObsoleteAttributes:     []string{"align"},
 722 | 		},
 723 | 		HtmlElementInfo{
 724 | 
 725 | 			TagName:                "h3",
 726 | 			HtmlVersion:            3,
 727 | 			Obsolete:               false,
 728 | 			ElementType:            HETFlow,
 729 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 730 | 			PermittedChildrenTags:  []string{},
 731 | 			attributesString:       []string{},
 732 | 			TagFormatting:          HTFComplete,
 733 | 			ParentTags:             []string{"hgroup"},
 734 | 			ExcludeParentTags:      []string{"address"},
 735 | 			ParentContentTypes:     HETFlow,
 736 | 			ObsoleteAttributes:     []string{"align"},
 737 | 		},
 738 | 		HtmlElementInfo{
 739 | 
 740 | 			TagName:                "h4",
 741 | 			HtmlVersion:            3,
 742 | 			Obsolete:               false,
 743 | 			ElementType:            HETFlow,
 744 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 745 | 			PermittedChildrenTags:  []string{},
 746 | 			attributesString:       []string{},
 747 | 			TagFormatting:          HTFComplete,
 748 | 			ParentTags:             []string{"hgroup"},
 749 | 			ExcludeParentTags:      []string{"address"},
 750 | 			ParentContentTypes:     HETFlow,
 751 | 			ObsoleteAttributes:     []string{"align"},
 752 | 		},
 753 | 		HtmlElementInfo{
 754 | 
 755 | 			TagName:                "h5",
 756 | 			HtmlVersion:            3,
 757 | 			Obsolete:               false,
 758 | 			ElementType:            HETFlow,
 759 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 760 | 			PermittedChildrenTags:  []string{},
 761 | 			attributesString:       []string{},
 762 | 			TagFormatting:          HTFComplete,
 763 | 			ParentTags:             []string{"hgroup"},
 764 | 			ExcludeParentTags:      []string{"address"},
 765 | 			ParentContentTypes:     HETFlow,
 766 | 			ObsoleteAttributes:     []string{"align"},
 767 | 		},
 768 | 		HtmlElementInfo{
 769 | 
 770 | 			TagName:                "h6",
 771 | 			HtmlVersion:            3,
 772 | 			Obsolete:               false,
 773 | 			ElementType:            HETFlow,
 774 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 775 | 			PermittedChildrenTags:  []string{},
 776 | 			attributesString:       []string{},
 777 | 			TagFormatting:          HTFComplete,
 778 | 			ParentTags:             []string{"hgroup"},
 779 | 			ExcludeParentTags:      []string{"address"},
 780 | 			ParentContentTypes:     HETFlow,
 781 | 			ObsoleteAttributes:     []string{"align"},
 782 | 		},
 783 | 		HtmlElementInfo{
 784 | 
 785 | 			TagName:                "head",
 786 | 			HtmlVersion:            3,
 787 | 			Obsolete:               false,
 788 | 			ElementType:            HETMeta,
 789 | 			PermittedChildrenTypes: HETMeta,
 790 | 			PermittedChildrenTags:  []string{},
 791 | 			attributesString:       []string{},
 792 | 			TagFormatting:          HTFOptionalClosing,
 793 | 			ParentTags:             []string{"html"},
 794 | 			ExcludeParentTags:      []string{},
 795 | 			ParentContentTypes:     HETNone,
 796 | 			ObsoleteAttributes:     []string{"profile"},
 797 | 		},
 798 | 		HtmlElementInfo{
 799 | 
 800 | 			TagName:                "header",
 801 | 			HtmlVersion:            5,
 802 | 			Obsolete:               false,
 803 | 			ElementType:            HETFlow,
 804 | 			PermittedChildrenTypes: HETAnyContent,
 805 | 			PermittedChildrenTags:  []string{},
 806 | 			attributesString:       []string{},
 807 | 			TagFormatting:          HTFComplete,
 808 | 			ParentTags:             []string{},
 809 | 			ExcludeParentTags:      []string{"footer", "address", "header"},
 810 | 			ParentContentTypes:     HETFlow,
 811 | 			ObsoleteAttributes:     []string{},
 812 | 		},
 813 | 		HtmlElementInfo{
 814 | 
 815 | 			TagName:                "hgroup",
 816 | 			HtmlVersion:            5,
 817 | 			Obsolete:               false,
 818 | 			ElementType:            HETFlow,
 819 | 			PermittedChildrenTypes: HETNone,
 820 | 			PermittedChildrenTags:  []string{"h1", "h2", "h3", "h4", "h5", "h6"},
 821 | 			attributesString:       []string{},
 822 | 			TagFormatting:          HTFComplete,
 823 | 			ParentTags:             []string{},
 824 | 			ExcludeParentTags:      []string{},
 825 | 			ParentContentTypes:     HETFlow,
 826 | 			ObsoleteAttributes:     []string{},
 827 | 		},
 828 | 		HtmlElementInfo{
 829 | 
 830 | 			TagName:                "hr",
 831 | 			HtmlVersion:            3,
 832 | 			Obsolete:               false,
 833 | 			ElementType:            HETFlow,
 834 | 			PermittedChildrenTypes: HETNone,
 835 | 			PermittedChildrenTags:  []string{},
 836 | 			attributesString:       []string{},
 837 | 			TagFormatting:          HTFSingle,
 838 | 			ParentTags:             []string{},
 839 | 			ExcludeParentTags:      []string{},
 840 | 			ParentContentTypes:     HETFlow,
 841 | 			ObsoleteAttributes:     []string{"align", "width", "noshade", "size", "color"},
 842 | 		},
 843 | 		HtmlElementInfo{
 844 | 
 845 | 			TagName:                "html",
 846 | 			HtmlVersion:            3,
 847 | 			Obsolete:               false,
 848 | 			ElementType:            HETPhrasing,
 849 | 			PermittedChildrenTypes: HETNone,
 850 | 			PermittedChildrenTags:  []string{"head", "body"},
 851 | 			attributesString:       []string{"manifest"},
 852 | 			TagFormatting:          HTFOptionalClosing,
 853 | 			ParentTags:             []string{},
 854 | 			ExcludeParentTags:      []string{},
 855 | 			ParentContentTypes:     HETNone,
 856 | 			ObsoleteAttributes:     []string{"version"},
 857 | 		},
 858 | 		HtmlElementInfo{
 859 | 
 860 | 			TagName:                "i",
 861 | 			HtmlVersion:            3,
 862 | 			Obsolete:               false,
 863 | 			ElementType:            HETPhrasing,
 864 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 865 | 			PermittedChildrenTags:  []string{},
 866 | 			attributesString:       []string{},
 867 | 			TagFormatting:          HTFComplete,
 868 | 			ParentTags:             []string{},
 869 | 			ExcludeParentTags:      []string{},
 870 | 			ParentContentTypes:     HETPhrasing,
 871 | 			ObsoleteAttributes:     []string{},
 872 | 		},
 873 | 		HtmlElementInfo{
 874 | 
 875 | 			TagName:                "iframe",
 876 | 			HtmlVersion:            3,
 877 | 			Obsolete:               false,
 878 | 			ElementType:            HETFlow,
 879 | 			PermittedChildrenTypes: HETText,
 880 | 			PermittedChildrenTags:  []string{},
 881 | 			attributesString:       []string{"src", "srcdoc", "name", "width", "height", "sandbox", "seamless"},
 882 | 			TagFormatting:          HTFComplete,
 883 | 			ParentTags:             []string{},
 884 | 			ExcludeParentTags:      []string{"a", "button"},
 885 | 			ParentContentTypes:     HETPhrasing,
 886 | 			ObsoleteAttributes:     []string{"longdesc", "align", "allowtransparency", "frameborder", "marginheight", "marginwidth", "scrolling"},
 887 | 		},
 888 | 		HtmlElementInfo{
 889 | 
 890 | 			TagName:                "img",
 891 | 			HtmlVersion:            3,
 892 | 			Obsolete:               false,
 893 | 			ElementType:            HETPhrasing,
 894 | 			PermittedChildrenTypes: HETNone,
 895 | 			PermittedChildrenTags:  []string{},
 896 | 			attributesString:       []string{"src", "alt", "height", "width", "usemap", "ismap", "border"},
 897 | 			TagFormatting:          HTFSingle,
 898 | 			ParentTags:             []string{},
 899 | 			ExcludeParentTags:      []string{},
 900 | 			ParentContentTypes:     HETPhrasing,
 901 | 			ObsoleteAttributes:     []string{"longdesc", "name", "align", "hspace", "vspace", "border"},
 902 | 		},
 903 | 		HtmlElementInfo{
 904 | 			// input: single-tag (HTFSingle) phrasing element with no children; each supported attribute is listed exactly once.
 905 | 			TagName:                "input",
 906 | 			HtmlVersion:            3,
 907 | 			Obsolete:               false,
 908 | 			ElementType:            HETPhrasing,
 909 | 			PermittedChildrenTypes: HETNone,
 910 | 			PermittedChildrenTags:  []string{},
 911 | 			attributesString:       []string{"name", "disabled", "form", "type", "maxlength", "readonly", "size", "value", "autocomplete", "autofocus", "list", "pattern", "required", "placeholder", "dirname", "checked", "formaction", "formenctype", "formmethod", "formtarget", "formnovalidate", "accept", "multiple", "alt", "src", "height", "width", "min", "max", "step"},
 912 | 			TagFormatting:          HTFSingle,
 913 | 			ParentTags:             []string{},
 914 | 			ExcludeParentTags:      []string{},
 915 | 			ParentContentTypes:     HETPhrasing,
 916 | 			ObsoleteAttributes:     []string{"usemap", "align"},
 917 | 		},
 918 | 		HtmlElementInfo{
 919 | 
 920 | 			TagName:                "ins",
 921 | 			HtmlVersion:            4,
 922 | 			Obsolete:               false,
 923 | 			ElementType:            HETPhrasing,
 924 | 			PermittedChildrenTypes: HETAnyContent,
 925 | 			PermittedChildrenTags:  []string{},
 926 | 			attributesString:       []string{"cite", "datetime"},
 927 | 			TagFormatting:          HTFComplete,
 928 | 			ParentTags:             []string{},
 929 | 			ExcludeParentTags:      []string{},
 930 | 			ParentContentTypes:     HETFlow | HETPhrasing,
 931 | 			ObsoleteAttributes:     []string{},
 932 | 		},
 933 | 		HtmlElementInfo{
 934 | 
 935 | 			TagName:                "isindex",
 936 | 			HtmlVersion:            3,
 937 | 			Obsolete:               true,
 938 | 			ElementType:            HETFlow,
 939 | 			PermittedChildrenTypes: HETAnyContent,
 940 | 			PermittedChildrenTags:  []string{},
 941 | 			attributesString:       []string{},
 942 | 			TagFormatting:          HTFSingle,
 943 | 			ParentTags:             []string{},
 944 | 			ExcludeParentTags:      []string{},
 945 | 			ParentContentTypes:     HETFlow,
 946 | 			ObsoleteAttributes:     []string{},
 947 | 		},
 948 | 		HtmlElementInfo{
 949 | 
 950 | 			TagName:                "kbd",
 951 | 			HtmlVersion:            3,
 952 | 			Obsolete:               false,
 953 | 			ElementType:            HETPhrasing,
 954 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 955 | 			PermittedChildrenTags:  []string{},
 956 | 			attributesString:       []string{},
 957 | 			TagFormatting:          HTFComplete,
 958 | 			ParentTags:             []string{},
 959 | 			ExcludeParentTags:      []string{},
 960 | 			ParentContentTypes:     HETPhrasing,
 961 | 			ObsoleteAttributes:     []string{},
 962 | 		},
 963 | 		HtmlElementInfo{
 964 | 			// keygen: single-tag (HTFSingle) phrasing element; it has no end tag, so no child content is permitted.
 965 | 			TagName:                "keygen",
 966 | 			HtmlVersion:            5,
 967 | 			Obsolete:               false,
 968 | 			ElementType:            HETPhrasing,
 969 | 			PermittedChildrenTypes: HETNone,
 970 | 			PermittedChildrenTags:  []string{},
 971 | 			attributesString:       []string{"challenge", "keytype", "autofocus", "name", "disabled", "form"},
 972 | 			TagFormatting:          HTFSingle,
 973 | 			ParentTags:             []string{},
 974 | 			ExcludeParentTags:      []string{},
 975 | 			ParentContentTypes:     HETPhrasing,
 976 | 			ObsoleteAttributes:     []string{},
 977 | 		},
 978 | 		HtmlElementInfo{
 979 | 
 980 | 			TagName:                "label",
 981 | 			HtmlVersion:            4,
 982 | 			Obsolete:               false,
 983 | 			ElementType:            HETPhrasing,
 984 | 			PermittedChildrenTypes: HETPhrasing | HETText,
 985 | 			PermittedChildrenTags:  []string{},
 986 | 			attributesString:       []string{"for", "form"},
 987 | 			TagFormatting:          HTFComplete,
 988 | 			ParentTags:             []string{},
 989 | 			ExcludeParentTags:      []string{},
 990 | 			ParentContentTypes:     HETPhrasing,
 991 | 			ObsoleteAttributes:     []string{},
 992 | 		},
 993 | 		HtmlElementInfo{
 994 | 
 995 | 			TagName:                "legend",
 996 | 			HtmlVersion:            3,
 997 | 			Obsolete:               false,
 998 | 			ElementType:            HETPhrasing,
 999 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1000 | 			PermittedChildrenTags:  []string{},
1001 | 			attributesString:       []string{},
1002 | 			TagFormatting:          HTFComplete,
1003 | 			ParentTags:             []string{"fieldset"},
1004 | 			ExcludeParentTags:      []string{},
1005 | 			ParentContentTypes:     HETNone,
1006 | 			ObsoleteAttributes:     []string{},
1007 | 		},
1008 | 		HtmlElementInfo{
1009 | 
1010 | 			TagName:                "li",
1011 | 			HtmlVersion:            3,
1012 | 			Obsolete:               false,
1013 | 			ElementType:            HETFlow,
1014 | 			PermittedChildrenTypes: HETAnyContent,
1015 | 			PermittedChildrenTags:  []string{},
1016 | 			attributesString:       []string{"value"},
1017 | 			TagFormatting:          HTFOptionalClosing,
1018 | 			ParentTags:             []string{"ul", "ol", "menu"},
1019 | 			ExcludeParentTags:      []string{},
1020 | 			ParentContentTypes:     HETNone,
1021 | 			ObsoleteAttributes:     []string{},
1022 | 		},
1023 | 		HtmlElementInfo{
1024 | 
1025 | 			TagName:                "link",
1026 | 			HtmlVersion:            3,
1027 | 			Obsolete:               false,
1028 | 			ElementType:            HETMeta,
1029 | 			PermittedChildrenTypes: HETNone,
1030 | 			PermittedChildrenTags:  []string{},
1031 | 			attributesString:       []string{"href", "rel", "hreflang", "media", "type", "sizes"},
1032 | 			TagFormatting:          HTFSingle,
1033 | 			ParentTags:             []string{"noscript"},
1034 | 			ExcludeParentTags:      []string{},
1035 | 			ParentContentTypes:     HETMeta,
1036 | 			ObsoleteAttributes:     []string{"target", "urn", "charset", "methods", "rev"},
1037 | 		},
1038 | 		HtmlElementInfo{
1039 | 
1040 | 			TagName:                "map",
1041 | 			HtmlVersion:            3,
1042 | 			Obsolete:               false,
1043 | 			ElementType:            HETPhrasing,
1044 | 			PermittedChildrenTypes: HETAnyContent,
1045 | 			PermittedChildrenTags:  []string{},
1046 | 			attributesString:       []string{"name"},
1047 | 			TagFormatting:          HTFComplete,
1048 | 			ParentTags:             []string{},
1049 | 			ExcludeParentTags:      []string{},
1050 | 			ParentContentTypes:     HETFlow | HETPhrasing,
1051 | 			ObsoleteAttributes:     []string{},
1052 | 		},
1053 | 		HtmlElementInfo{
1054 | 
1055 | 			TagName:                "mark",
1056 | 			HtmlVersion:            5,
1057 | 			Obsolete:               false,
1058 | 			ElementType:            HETPhrasing,
1059 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1060 | 			PermittedChildrenTags:  []string{},
1061 | 			attributesString:       []string{},
1062 | 			TagFormatting:          HTFComplete,
1063 | 			ParentTags:             []string{},
1064 | 			ExcludeParentTags:      []string{},
1065 | 			ParentContentTypes:     HETPhrasing,
1066 | 			ObsoleteAttributes:     []string{},
1067 | 		},
1068 | 		HtmlElementInfo{
1069 | 
1070 | 			TagName:                "menu",
1071 | 			HtmlVersion:            3,
1072 | 			Obsolete:               false,
1073 | 			ElementType:            HETFlow,
1074 | 			PermittedChildrenTypes: HETAnyContent,
1075 | 			PermittedChildrenTags:  []string{},
1076 | 			attributesString:       []string{"type", "label"},
1077 | 			TagFormatting:          HTFComplete,
1078 | 			ParentTags:             []string{},
1079 | 			ExcludeParentTags:      []string{},
1080 | 			ParentContentTypes:     HETFlow,
1081 | 			ObsoleteAttributes:     []string{"compact"},
1082 | 		},
1083 | 		HtmlElementInfo{
1084 | 
1085 | 			TagName:                "meta",
1086 | 			HtmlVersion:            3,
1087 | 			Obsolete:               false,
1088 | 			ElementType:            HETMeta,
1089 | 			PermittedChildrenTypes: HETNone,
1090 | 			PermittedChildrenTags:  []string{},
1091 | 			attributesString:       []string{"name", "content", "http-equiv", "charset"},
1092 | 			TagFormatting:          HTFSingle,
1093 | 			ParentTags:             []string{},
1094 | 			ExcludeParentTags:      []string{},
1095 | 			ParentContentTypes:     HETMeta,
1096 | 			ObsoleteAttributes:     []string{},
1097 | 		},
1098 | 		HtmlElementInfo{
1099 | 
1100 | 			TagName:                "meter",
1101 | 			HtmlVersion:            5,
1102 | 			Obsolete:               false,
1103 | 			ElementType:            HETPhrasing,
1104 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1105 | 			PermittedChildrenTags:  []string{},
1106 | 			attributesString:       []string{"value", "min", "low", "high", "max", "optimum"},
1107 | 			TagFormatting:          HTFComplete,
1108 | 			ParentTags:             []string{},
1109 | 			ExcludeParentTags:      []string{},
1110 | 			ParentContentTypes:     HETPhrasing,
1111 | 			ObsoleteAttributes:     []string{},
1112 | 		},
1113 | 		HtmlElementInfo{
1114 | 
1115 | 			TagName:                "nav",
1116 | 			HtmlVersion:            5,
1117 | 			Obsolete:               false,
1118 | 			ElementType:            HETFlow,
1119 | 			PermittedChildrenTypes: HETAnyContent,
1120 | 			PermittedChildrenTags:  []string{},
1121 | 			attributesString:       []string{},
1122 | 			TagFormatting:          HTFComplete,
1123 | 			ParentTags:             []string{},
1124 | 			ExcludeParentTags:      []string{"address"},
1125 | 			ParentContentTypes:     HETFlow,
1126 | 			ObsoleteAttributes:     []string{},
1127 | 		},
1128 | 		HtmlElementInfo{
1129 | 
1130 | 			TagName:                "nobr",
1131 | 			HtmlVersion:            3,
1132 | 			Obsolete:               true,
1133 | 			ElementType:            HETPhrasing,
1134 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1135 | 			PermittedChildrenTags:  []string{},
1136 | 			attributesString:       []string{},
1137 | 			TagFormatting:          HTFComplete,
1138 | 			ParentTags:             []string{},
1139 | 			ExcludeParentTags:      []string{},
1140 | 			ParentContentTypes:     HETPhrasing,
1141 | 			ObsoleteAttributes:     []string{},
1142 | 		},
1143 | 		HtmlElementInfo{
1144 | 
1145 | 			TagName:                "noframes",
1146 | 			HtmlVersion:            3,
1147 | 			Obsolete:               true,
1148 | 			ElementType:            HETFlow,
1149 | 			PermittedChildrenTypes: HETAnyContent,
1150 | 			PermittedChildrenTags:  []string{},
1151 | 			attributesString:       []string{},
1152 | 			TagFormatting:          HTFComplete,
1153 | 			ParentTags:             []string{},
1154 | 			ExcludeParentTags:      []string{},
1155 | 			ParentContentTypes:     HETFlow,
1156 | 			ObsoleteAttributes:     []string{},
1157 | 		},
1158 | 		HtmlElementInfo{
1159 | 
1160 | 			TagName:                "noscript",
1161 | 			HtmlVersion:            3,
1162 | 			Obsolete:               false,
1163 | 			ElementType:            HETMeta,
1164 | 			PermittedChildrenTypes: HETMeta | HETAnyContent,
1165 | 			PermittedChildrenTags:  []string{},
1166 | 			attributesString:       []string{},
1167 | 			TagFormatting:          HTFComplete,
1168 | 			ParentTags:             []string{},
1169 | 			ExcludeParentTags:      []string{"noscript"},
1170 | 			ParentContentTypes:     HETMeta | HETFlow | HETPhrasing,
1171 | 			ObsoleteAttributes:     []string{},
1172 | 		},
1173 | 		HtmlElementInfo{
1174 | 
1175 | 			TagName:                "object",
1176 | 			HtmlVersion:            3,
1177 | 			Obsolete:               false,
1178 | 			ElementType:            HETPhrasing,
1179 | 			PermittedChildrenTypes: HETAnyContent,
1180 | 			PermittedChildrenTags:  []string{},
1181 | 			attributesString:       []string{"data", "type", "height", "width", "usemap", "name", "form"},
1182 | 			TagFormatting:          HTFComplete,
1183 | 			ParentTags:             []string{},
1184 | 			ExcludeParentTags:      []string{"a", "button"},
1185 | 			ParentContentTypes:     HETFlow | HETPhrasing,
1186 | 			ObsoleteAttributes:     []string{"archive", "classid", "code", "codebase", "codetype", "declare", "standby", "align", "hspace", "vspace", "border"},
1187 | 		},
1188 | 		HtmlElementInfo{
1189 | 
1190 | 			TagName:                "ol",
1191 | 			HtmlVersion:            3,
1192 | 			Obsolete:               false,
1193 | 			ElementType:            HETFlow,
1194 | 			PermittedChildrenTypes: HETNone,
1195 | 			PermittedChildrenTags:  []string{"li"},
1196 | 			attributesString:       []string{"start", "reversed", "type"},
1197 | 			TagFormatting:          HTFComplete,
1198 | 			ParentTags:             []string{},
1199 | 			ExcludeParentTags:      []string{},
1200 | 			ParentContentTypes:     HETFlow,
1201 | 			ObsoleteAttributes:     []string{"compact"},
1202 | 		},
1203 | 		HtmlElementInfo{
1204 | 
1205 | 			TagName:                "optgroup",
1206 | 			HtmlVersion:            3,
1207 | 			Obsolete:               false,
1208 | 			ElementType:            HETPhrasing,
1209 | 			PermittedChildrenTypes: HETNone,
1210 | 			PermittedChildrenTags:  []string{"option"},
1211 | 			attributesString:       []string{"label", "disabled"},
1212 | 			TagFormatting:          HTFOptionalClosing,
1213 | 			ParentTags:             []string{"select"},
1214 | 			ExcludeParentTags:      []string{},
1215 | 			ParentContentTypes:     HETNone,
1216 | 			ObsoleteAttributes:     []string{},
1217 | 		},
1218 | 		HtmlElementInfo{
1219 | 
1220 | 			TagName:                "option",
1221 | 			HtmlVersion:            3,
1222 | 			Obsolete:               false,
1223 | 			ElementType:            HETPhrasing,
1224 | 			PermittedChildrenTypes: HETText,
1225 | 			PermittedChildrenTags:  []string{},
1226 | 			attributesString:       []string{"disabled", "selected", "label", "value"},
1227 | 			TagFormatting:          HTFOptionalClosing,
1228 | 			ParentTags:             []string{"optgroup", "select", "datalist"},
1229 | 			ExcludeParentTags:      []string{},
1230 | 			ParentContentTypes:     HETNone,
1231 | 			ObsoleteAttributes:     []string{"name"},
1232 | 		},
1233 | 		HtmlElementInfo{
1234 | 
1235 | 			TagName:                "output",
1236 | 			HtmlVersion:            5,
1237 | 			Obsolete:               false,
1238 | 			ElementType:            HETPhrasing,
1239 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1240 | 			PermittedChildrenTags:  []string{},
1241 | 			attributesString:       []string{"name", "form", "for"},
1242 | 			TagFormatting:          HTFComplete,
1243 | 			ParentTags:             []string{},
1244 | 			ExcludeParentTags:      []string{},
1245 | 			ParentContentTypes:     HETPhrasing,
1246 | 			ObsoleteAttributes:     []string{},
1247 | 		},
1248 | 		HtmlElementInfo{
1249 | 
1250 | 			TagName:                "p",
1251 | 			HtmlVersion:            3,
1252 | 			Obsolete:               false,
1253 | 			ElementType:            HETFlow,
1254 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1255 | 			PermittedChildrenTags:  []string{},
1256 | 			attributesString:       []string{},
1257 | 			TagFormatting:          HTFOptionalClosing,
1258 | 			ParentTags:             []string{},
1259 | 			ExcludeParentTags:      []string{},
1260 | 			ParentContentTypes:     HETFlow,
1261 | 			ObsoleteAttributes:     []string{"align"},
1262 | 		},
1263 | 		HtmlElementInfo{
1264 | 
1265 | 			TagName:                "param",
1266 | 			HtmlVersion:            3,
1267 | 			Obsolete:               false,
1268 | 			ElementType:            HETMeta,
1269 | 			PermittedChildrenTypes: HETNone,
1270 | 			PermittedChildrenTags:  []string{},
1271 | 			attributesString:       []string{"name", "value"},
1272 | 			TagFormatting:          HTFSingle,
1273 | 			ParentTags:             []string{"object"},
1274 | 			ExcludeParentTags:      []string{},
1275 | 			ParentContentTypes:     HETNone,
1276 | 			ObsoleteAttributes:     []string{"type", "valuetype"},
1277 | 		},
1278 | 		HtmlElementInfo{
1279 | 
1280 | 			TagName:                "pre",
1281 | 			HtmlVersion:            3,
1282 | 			Obsolete:               false,
1283 | 			ElementType:            HETFlow,
1284 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1285 | 			PermittedChildrenTags:  []string{},
1286 | 			attributesString:       []string{},
1287 | 			TagFormatting:          HTFComplete,
1288 | 			ParentTags:             []string{},
1289 | 			ExcludeParentTags:      []string{},
1290 | 			ParentContentTypes:     HETFlow,
1291 | 			ObsoleteAttributes:     []string{},
1292 | 		},
1293 | 		HtmlElementInfo{
1294 | 
1295 | 			TagName:                "progress",
1296 | 			HtmlVersion:            5,
1297 | 			Obsolete:               false,
1298 | 			ElementType:            HETPhrasing,
1299 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1300 | 			PermittedChildrenTags:  []string{},
1301 | 			attributesString:       []string{"value", "max"},
1302 | 			TagFormatting:          HTFComplete,
1303 | 			ParentTags:             []string{},
1304 | 			ExcludeParentTags:      []string{},
1305 | 			ParentContentTypes:     HETPhrasing,
1306 | 			ObsoleteAttributes:     []string{},
1307 | 		},
1308 | 		HtmlElementInfo{
1309 | 
1310 | 			TagName:                "q",
1311 | 			HtmlVersion:            4,
1312 | 			Obsolete:               false,
1313 | 			ElementType:            HETPhrasing,
1314 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1315 | 			PermittedChildrenTags:  []string{},
1316 | 			attributesString:       []string{"cite"},
1317 | 			TagFormatting:          HTFComplete,
1318 | 			ParentTags:             []string{},
1319 | 			ExcludeParentTags:      []string{},
1320 | 			ParentContentTypes:     HETPhrasing,
1321 | 			ObsoleteAttributes:     []string{},
1322 | 		},
1323 | 		HtmlElementInfo{
1324 | 
1325 | 			TagName:                "rp",
1326 | 			HtmlVersion:            5,
1327 | 			Obsolete:               false,
1328 | 			ElementType:            HETPhrasing,
1329 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1330 | 			PermittedChildrenTags:  []string{},
1331 | 			attributesString:       []string{},
1332 | 			TagFormatting:          HTFComplete,
1333 | 			ParentTags:             []string{"ruby"},
1334 | 			ExcludeParentTags:      []string{},
1335 | 			ParentContentTypes:     HETNone,
1336 | 			ObsoleteAttributes:     []string{},
1337 | 		},
1338 | 		HtmlElementInfo{
1339 | 
1340 | 			TagName:                "rt",
1341 | 			HtmlVersion:            5,
1342 | 			Obsolete:               false,
1343 | 			ElementType:            HETPhrasing,
1344 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1345 | 			PermittedChildrenTags:  []string{},
1346 | 			attributesString:       []string{},
1347 | 			TagFormatting:          HTFComplete,
1348 | 			ParentTags:             []string{"ruby"},
1349 | 			ExcludeParentTags:      []string{},
1350 | 			ParentContentTypes:     HETNone,
1351 | 			ObsoleteAttributes:     []string{},
1352 | 		},
1353 | 		HtmlElementInfo{
1354 | 
1355 | 			TagName:                "ruby",
1356 | 			HtmlVersion:            5,
1357 | 			Obsolete:               false,
1358 | 			ElementType:            HETPhrasing,
1359 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1360 | 			PermittedChildrenTags:  []string{},
1361 | 			attributesString:       []string{},
1362 | 			TagFormatting:          HTFComplete,
1363 | 			ParentTags:             []string{},
1364 | 			ExcludeParentTags:      []string{},
1365 | 			ParentContentTypes:     HETPhrasing,
1366 | 			ObsoleteAttributes:     []string{},
1367 | 		},
1368 | 		HtmlElementInfo{
1369 | 			// s: phrasing element for content that is no longer accurate; conforming in HTML5 (only "strike" is obsolete).
1370 | 			TagName:                "s",
1371 | 			HtmlVersion:            3,
1372 | 			Obsolete:               false,
1373 | 			ElementType:            HETPhrasing,
1374 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1375 | 			PermittedChildrenTags:  []string{},
1376 | 			attributesString:       []string{},
1377 | 			TagFormatting:          HTFComplete,
1378 | 			ParentTags:             []string{},
1379 | 			ExcludeParentTags:      []string{},
1380 | 			ParentContentTypes:     HETPhrasing,
1381 | 			ObsoleteAttributes:     []string{},
1382 | 		},
1383 | 		HtmlElementInfo{
1384 | 
1385 | 			TagName:                "samp",
1386 | 			HtmlVersion:            3,
1387 | 			Obsolete:               false,
1388 | 			ElementType:            HETPhrasing,
1389 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1390 | 			PermittedChildrenTags:  []string{},
1391 | 			attributesString:       []string{},
1392 | 			TagFormatting:          HTFComplete,
1393 | 			ParentTags:             []string{},
1394 | 			ExcludeParentTags:      []string{},
1395 | 			ParentContentTypes:     HETPhrasing,
1396 | 			ObsoleteAttributes:     []string{},
1397 | 		},
1398 | 		HtmlElementInfo{
1399 | 
1400 | 			TagName:                "script",
1401 | 			HtmlVersion:            3,
1402 | 			Obsolete:               false,
1403 | 			ElementType:            HETMeta,
1404 | 			PermittedChildrenTypes: HETNRCharData,
1405 | 			PermittedChildrenTags:  []string{},
1406 | 			attributesString:       []string{"type", "src", "defer", "async", "charset"},
1407 | 			TagFormatting:          HTFComplete,
1408 | 			ParentTags:             []string{},
1409 | 			ExcludeParentTags:      []string{},
1410 | 			ParentContentTypes:     HETMeta | HETPhrasing | HETFlow,
1411 | 			ObsoleteAttributes:     []string{"language"},
1412 | 		},
1413 | 		HtmlElementInfo{
1414 | 
1415 | 			TagName:                "section",
1416 | 			HtmlVersion:            5,
1417 | 			Obsolete:               false,
1418 | 			ElementType:            HETFlow,
1419 | 			PermittedChildrenTypes: HETAnyContent,
1420 | 			PermittedChildrenTags:  []string{"style"},
1421 | 			attributesString:       []string{},
1422 | 			TagFormatting:          HTFComplete,
1423 | 			ParentTags:             []string{},
1424 | 			ExcludeParentTags:      []string{},
1425 | 			ParentContentTypes:     HETFlow,
1426 | 			ObsoleteAttributes:     []string{},
1427 | 		},
1428 | 		HtmlElementInfo{
1429 | 
1430 | 			TagName:                "select",
1431 | 			HtmlVersion:            3,
1432 | 			Obsolete:               false,
1433 | 			ElementType:            HETPhrasing,
1434 | 			PermittedChildrenTypes: HETNone,
1435 | 			PermittedChildrenTags:  []string{"optgroup", "option"},
1436 | 			attributesString:       []string{"name", "disabled", "form", "size", "multiple", "autofocus", "required"},
1437 | 			TagFormatting:          HTFComplete,
1438 | 			ParentTags:             []string{},
1439 | 			ExcludeParentTags:      []string{"a", "button"},
1440 | 			ParentContentTypes:     HETPhrasing,
1441 | 			ObsoleteAttributes:     []string{},
1442 | 		},
1443 | 		HtmlElementInfo{
1444 | 
1445 | 			TagName:                "small",
1446 | 			HtmlVersion:            3,
1447 | 			Obsolete:               false,
1448 | 			ElementType:            HETPhrasing,
1449 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1450 | 			PermittedChildrenTags:  []string{},
1451 | 			attributesString:       []string{},
1452 | 			TagFormatting:          HTFComplete,
1453 | 			ParentTags:             []string{},
1454 | 			ExcludeParentTags:      []string{},
1455 | 			ParentContentTypes:     HETPhrasing,
1456 | 			ObsoleteAttributes:     []string{},
1457 | 		},
1458 | 		HtmlElementInfo{
1459 | 			// source: media resource child of audio/video; a void element, so it is a single tag with no children (like param).
1460 | 			TagName:                "source",
1461 | 			HtmlVersion:            5,
1462 | 			Obsolete:               false,
1463 | 			ElementType:            HETMeta,
1464 | 			PermittedChildrenTypes: HETNone,
1465 | 			PermittedChildrenTags:  []string{},
1466 | 			attributesString:       []string{"src", "type", "media"},
1467 | 			TagFormatting:          HTFSingle,
1468 | 			ParentTags:             []string{"audio", "video"},
1469 | 			ExcludeParentTags:      []string{},
1470 | 			ParentContentTypes:     HETNone,
1471 | 			ObsoleteAttributes:     []string{},
1472 | 		},
1473 | 		HtmlElementInfo{
1474 | 
1475 | 			TagName:                "span",
1476 | 			HtmlVersion:            3,
1477 | 			Obsolete:               false,
1478 | 			ElementType:            HETPhrasing,
1479 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1480 | 			PermittedChildrenTags:  []string{},
1481 | 			attributesString:       []string{},
1482 | 			TagFormatting:          HTFComplete,
1483 | 			ParentTags:             []string{},
1484 | 			ExcludeParentTags:      []string{},
1485 | 			ParentContentTypes:     HETPhrasing,
1486 | 			ObsoleteAttributes:     []string{},
1487 | 		},
1488 | 		HtmlElementInfo{
1489 | 
1490 | 			TagName:                "strike",
1491 | 			HtmlVersion:            3,
1492 | 			Obsolete:               true,
1493 | 			ElementType:            HETPhrasing,
1494 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1495 | 			PermittedChildrenTags:  []string{},
1496 | 			attributesString:       []string{},
1497 | 			TagFormatting:          HTFComplete,
1498 | 			ParentTags:             []string{},
1499 | 			ExcludeParentTags:      []string{},
1500 | 			ParentContentTypes:     HETPhrasing,
1501 | 			ObsoleteAttributes:     []string{},
1502 | 		},
1503 | 		HtmlElementInfo{
1504 | 
1505 | 			TagName:                "strong",
1506 | 			HtmlVersion:            3,
1507 | 			Obsolete:               false,
1508 | 			ElementType:            HETPhrasing,
1509 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1510 | 			PermittedChildrenTags:  []string{},
1511 | 			attributesString:       []string{},
1512 | 			TagFormatting:          HTFComplete,
1513 | 			ParentTags:             []string{},
1514 | 			ExcludeParentTags:      []string{},
1515 | 			ParentContentTypes:     HETPhrasing,
1516 | 			ObsoleteAttributes:     []string{},
1517 | 		},
1518 | 		HtmlElementInfo{
1519 | 
1520 | 			TagName:                "style",
1521 | 			HtmlVersion:            3,
1522 | 			Obsolete:               false,
1523 | 			ElementType:            HETMeta,
1524 | 			PermittedChildrenTypes: HETNRCharData,
1525 | 			PermittedChildrenTags:  []string{},
1526 | 			attributesString:       []string{"type", "media", "scoped"},
1527 | 			TagFormatting:          HTFComplete,
1528 | 			ParentTags:             []string{"div", "noscript", "section", "article", "aside"},
1529 | 			ExcludeParentTags:      []string{},
1530 | 			ParentContentTypes:     HETMeta,
1531 | 			ObsoleteAttributes:     []string{},
1532 | 		},
1533 | 		HtmlElementInfo{
1534 | 
1535 | 			TagName:                "sub",
1536 | 			HtmlVersion:            3,
1537 | 			Obsolete:               false,
1538 | 			ElementType:            HETPhrasing,
1539 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1540 | 			PermittedChildrenTags:  []string{},
1541 | 			attributesString:       []string{},
1542 | 			TagFormatting:          HTFComplete,
1543 | 			ParentTags:             []string{},
1544 | 			ExcludeParentTags:      []string{},
1545 | 			ParentContentTypes:     HETPhrasing,
1546 | 			ObsoleteAttributes:     []string{},
1547 | 		},
1548 | 		HtmlElementInfo{
1549 | 
1550 | 			TagName:                "summary",
1551 | 			HtmlVersion:            5,
1552 | 			Obsolete:               false,
1553 | 			ElementType:            HETFlow,
1554 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1555 | 			PermittedChildrenTags:  []string{},
1556 | 			attributesString:       []string{},
1557 | 			TagFormatting:          HTFComplete,
1558 | 			ParentTags:             []string{"details"},
1559 | 			ExcludeParentTags:      []string{},
1560 | 			ParentContentTypes:     HETNone,
1561 | 			ObsoleteAttributes:     []string{},
1562 | 		},
1563 | 		HtmlElementInfo{
1564 | 
1565 | 			TagName:                "sup",
1566 | 			HtmlVersion:            3,
1567 | 			Obsolete:               false,
1568 | 			ElementType:            HETPhrasing,
1569 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1570 | 			PermittedChildrenTags:  []string{},
1571 | 			attributesString:       []string{},
1572 | 			TagFormatting:          HTFComplete,
1573 | 			ParentTags:             []string{},
1574 | 			ExcludeParentTags:      []string{},
1575 | 			ParentContentTypes:     HETPhrasing,
1576 | 			ObsoleteAttributes:     []string{},
1577 | 		},
1578 | 		HtmlElementInfo{
1579 | 
1580 | 			TagName:                "table",
1581 | 			HtmlVersion:            3,
1582 | 			Obsolete:               false,
1583 | 			ElementType:            HETFlow,
1584 | 			PermittedChildrenTypes: HETNone,
1585 | 			PermittedChildrenTags:  []string{"capition", "colgroup", "thead", "tfoot", "tbody", "tr"},
1586 | 			attributesString:       []string{"border"},
1587 | 			TagFormatting:          HTFComplete,
1588 | 			ParentTags:             []string{},
1589 | 			ExcludeParentTags:      []string{},
1590 | 			ParentContentTypes:     HETFlow,
1591 | 			ObsoleteAttributes:     []string{"summary", "align", "width", "bgcolor", "cellpadding", "cellspacing", "frame", "rules"},
1592 | 		},
1593 | 		HtmlElementInfo{
1594 | 
1595 | 			TagName:                "tbody",
1596 | 			HtmlVersion:            3,
1597 | 			Obsolete:               false,
1598 | 			ElementType:            HETPhrasing,
1599 | 			PermittedChildrenTypes: HETNone,
1600 | 			PermittedChildrenTags:  []string{"tr"},
1601 | 			attributesString:       []string{},
1602 | 			TagFormatting:          HTFOptionalClosing,
1603 | 			ParentTags:             []string{"table"},
1604 | 			ExcludeParentTags:      []string{},
1605 | 			ParentContentTypes:     HETNone,
1606 | 			ObsoleteAttributes:     []string{"align", "char", "charoff", "valign"},
1607 | 		},
1608 | 		HtmlElementInfo{
1609 | 
1610 | 			TagName:                "td",
1611 | 			HtmlVersion:            3,
1612 | 			Obsolete:               false,
1613 | 			ElementType:            HETPhrasing,
1614 | 			PermittedChildrenTypes: HETAnyContent,
1615 | 			PermittedChildrenTags:  []string{},
1616 | 			attributesString:       []string{"colspan", "rowspan", "headers"},
1617 | 			TagFormatting:          HTFOptionalClosing,
1618 | 			ParentTags:             []string{"tr"},
1619 | 			ExcludeParentTags:      []string{},
1620 | 			ParentContentTypes:     HETNone,
1621 | 			ObsoleteAttributes:     []string{"scope", "abbr", "axis", "align", "width", "char", "charoff", "valign", "bgcolor", "height", "nowrap"},
1622 | 		},
1623 | 		HtmlElementInfo{
1624 | 
1625 | 			TagName:                "textarea",
1626 | 			HtmlVersion:            3,
1627 | 			Obsolete:               false,
1628 | 			ElementType:            HETPhrasing,
1629 | 			PermittedChildrenTypes: HETText,
1630 | 			PermittedChildrenTags:  []string{},
1631 | 			attributesString:       []string{"name", "disabled", "form", "readonly", "maxlength", "autofocus", "required", "placeholder", "dirname", "rows", "wrap", "cols"},
1632 | 			TagFormatting:          HTFComplete,
1633 | 			ParentTags:             []string{},
1634 | 			ExcludeParentTags:      []string{},
1635 | 			ParentContentTypes:     HETPhrasing,
1636 | 			ObsoleteAttributes:     []string{},
1637 | 		},
1638 | 		HtmlElementInfo{
1639 | 
1640 | 			TagName:                "tfoot",
1641 | 			HtmlVersion:            3,
1642 | 			Obsolete:               false,
1643 | 			ElementType:            HETPhrasing,
1644 | 			PermittedChildrenTypes: HETNone,
1645 | 			PermittedChildrenTags:  []string{"tr"},
1646 | 			attributesString:       []string{},
1647 | 			TagFormatting:          HTFOptionalClosing,
1648 | 			ParentTags:             []string{"table"},
1649 | 			ExcludeParentTags:      []string{},
1650 | 			ParentContentTypes:     HETNone,
1651 | 			ObsoleteAttributes:     []string{"align", "char", "charoff", "valign"},
1652 | 		},
1653 | 		HtmlElementInfo{
1654 | 
1655 | 			TagName:                "th",
1656 | 			HtmlVersion:            3,
1657 | 			Obsolete:               false,
1658 | 			ElementType:            HETFlow,
1659 | 			PermittedChildrenTypes: HETAnyContent,
1660 | 			PermittedChildrenTags:  []string{},
1661 | 			attributesString:       []string{"scope", "scolspan", "rowspan", "headers"},
1662 | 			TagFormatting:          HTFOptionalClosing,
1663 | 			ParentTags:             []string{"tr"},
1664 | 			ExcludeParentTags:      []string{},
1665 | 			ParentContentTypes:     HETNone,
1666 | 			ObsoleteAttributes:     []string{"scope", "abbr", "axis", "align", "width", "char", "charoff", "valign", "bgcolor", "height", "nowrap"},
1667 | 		},
1668 | 		HtmlElementInfo{
1669 | 
1670 | 			TagName:                "thead",
1671 | 			HtmlVersion:            3,
1672 | 			Obsolete:               false,
1673 | 			ElementType:            HETPhrasing,
1674 | 			PermittedChildrenTypes: HETNone,
1675 | 			PermittedChildrenTags:  []string{"tr"},
1676 | 			attributesString:       []string{},
1677 | 			TagFormatting:          HTFOptionalClosing,
1678 | 			ParentTags:             []string{"table"},
1679 | 			ExcludeParentTags:      []string{},
1680 | 			ParentContentTypes:     HETNone,
1681 | 			ObsoleteAttributes:     []string{"align", "char", "charoff", "valign"},
1682 | 		},
1683 | 		HtmlElementInfo{
1684 | 
1685 | 			TagName:                "time",
1686 | 			HtmlVersion:            5,
1687 | 			Obsolete:               false,
1688 | 			ElementType:            HETPhrasing,
1689 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1690 | 			PermittedChildrenTags:  []string{},
1691 | 			attributesString:       []string{"datetime"},
1692 | 			TagFormatting:          HTFComplete,
1693 | 			ParentTags:             []string{},
1694 | 			ExcludeParentTags:      []string{"time"},
1695 | 			ParentContentTypes:     HETPhrasing,
1696 | 			ObsoleteAttributes:     []string{},
1697 | 		},
1698 | 		HtmlElementInfo{
1699 | 
1700 | 			TagName:                "title",
1701 | 			HtmlVersion:            3,
1702 | 			Obsolete:               false,
1703 | 			ElementType:            HETMeta,
1704 | 			PermittedChildrenTypes: HETText,
1705 | 			PermittedChildrenTags:  []string{},
1706 | 			attributesString:       []string{},
1707 | 			TagFormatting:          HTFComplete,
1708 | 			ParentTags:             []string{"head"},
1709 | 			ExcludeParentTags:      []string{},
1710 | 			ParentContentTypes:     HETNone,
1711 | 			ObsoleteAttributes:     []string{},
1712 | 		},
1713 | 		HtmlElementInfo{
1714 | 
1715 | 			TagName:                "tr",
1716 | 			HtmlVersion:            3,
1717 | 			Obsolete:               false,
1718 | 			ElementType:            HETPhrasing,
1719 | 			PermittedChildrenTypes: HETNone,
1720 | 			PermittedChildrenTags:  []string{"td", "th"},
1721 | 			attributesString:       []string{},
1722 | 			TagFormatting:          HTFOptionalClosing,
1723 | 			ParentTags:             []string{"table", "thead", "tfoot", "tbody"},
1724 | 			ExcludeParentTags:      []string{},
1725 | 			ParentContentTypes:     HETNone,
1726 | 			ObsoleteAttributes:     []string{"align", "char", "charoff", "valign", "bgcolor"},
1727 | 		},
1728 | 		HtmlElementInfo{
1729 | 
1730 | 			TagName:                "track",
1731 | 			HtmlVersion:            5,
1732 | 			Obsolete:               false,
1733 | 			ElementType:            HETMeta,
1734 | 			PermittedChildrenTypes: HETNone,
1735 | 			PermittedChildrenTags:  []string{},
1736 | 			attributesString:       []string{"kind", "src", "srclang", "label", "default"},
1737 | 			TagFormatting:          HTFSingle,
1738 | 			ParentTags:             []string{"audio", "video"},
1739 | 			ExcludeParentTags:      []string{},
1740 | 			ParentContentTypes:     HETNone,
1741 | 			ObsoleteAttributes:     []string{},
1742 | 		},
1743 | 		HtmlElementInfo{
1744 | 
1745 | 			TagName:                "tt",
1746 | 			HtmlVersion:            3,
1747 | 			Obsolete:               true,
1748 | 			ElementType:            HETPhrasing,
1749 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1750 | 			PermittedChildrenTags:  []string{},
1751 | 			attributesString:       []string{},
1752 | 			TagFormatting:          HTFComplete,
1753 | 			ParentTags:             []string{},
1754 | 			ExcludeParentTags:      []string{},
1755 | 			ParentContentTypes:     HETPhrasing,
1756 | 			ObsoleteAttributes:     []string{},
1757 | 		},
1758 | 		HtmlElementInfo{
1759 | 
1760 | 			TagName:                "u",
1761 | 			HtmlVersion:            3,
1762 | 			Obsolete:               true,
1763 | 			ElementType:            HETPhrasing,
1764 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1765 | 			PermittedChildrenTags:  []string{},
1766 | 			attributesString:       []string{},
1767 | 			TagFormatting:          HTFComplete,
1768 | 			ParentTags:             []string{},
1769 | 			ExcludeParentTags:      []string{},
1770 | 			ParentContentTypes:     HETPhrasing,
1771 | 			ObsoleteAttributes:     []string{},
1772 | 		},
1773 | 		HtmlElementInfo{
1774 | 
1775 | 			TagName:                "ul",
1776 | 			HtmlVersion:            3,
1777 | 			Obsolete:               false,
1778 | 			ElementType:            HETFlow,
1779 | 			PermittedChildrenTypes: HETNone,
1780 | 			PermittedChildrenTags:  []string{"li"},
1781 | 			attributesString:       []string{},
1782 | 			TagFormatting:          HTFComplete,
1783 | 			ParentTags:             []string{},
1784 | 			ExcludeParentTags:      []string{},
1785 | 			ParentContentTypes:     HETFlow,
1786 | 			ObsoleteAttributes:     []string{"type", "compact"},
1787 | 		},
1788 | 		HtmlElementInfo{
1789 | 
1790 | 			TagName:                "var",
1791 | 			HtmlVersion:            3,
1792 | 			Obsolete:               false,
1793 | 			ElementType:            HETPhrasing,
1794 | 			PermittedChildrenTypes: HETPhrasing | HETText,
1795 | 			PermittedChildrenTags:  []string{},
1796 | 			attributesString:       []string{},
1797 | 			TagFormatting:          HTFComplete,
1798 | 			ParentTags:             []string{},
1799 | 			ExcludeParentTags:      []string{},
1800 | 			ParentContentTypes:     HETPhrasing,
1801 | 			ObsoleteAttributes:     []string{},
1802 | 		},
1803 | 		HtmlElementInfo{
1804 | 
1805 | 			TagName:                "video",
1806 | 			HtmlVersion:            5,
1807 | 			Obsolete:               false,
1808 | 			ElementType:            HETFlow,
1809 | 			PermittedChildrenTypes: HETAnyContent,
1810 | 			PermittedChildrenTags:  []string{},
1811 | 			attributesString:       []string{"autoplay", "preload", "controls", "loop", "poster", "height", "width", "mediagroup", "muted", "src"},
1812 | 			TagFormatting:          HTFComplete,
1813 | 			ParentTags:             []string{},
1814 | 			ExcludeParentTags:      []string{"a", "button"},
1815 | 			ParentContentTypes:     HETFlow | HETPhrasing,
1816 | 			ObsoleteAttributes:     []string{},
1817 | 		},
1818 | 		HtmlElementInfo{
1819 | 
1820 | 			TagName:                "wbr",
1821 | 			HtmlVersion:            3,
1822 | 			Obsolete:               false,
1823 | 			ElementType:            HETPhrasing,
1824 | 			PermittedChildrenTypes: HETNone,
1825 | 			PermittedChildrenTags:  []string{},
1826 | 			attributesString:       []string{},
1827 | 			TagFormatting:          HTFSingle,
1828 | 			ParentTags:             []string{},
1829 | 			ExcludeParentTags:      []string{},
1830 | 			ParentContentTypes:     HETPhrasing,
1831 | 			ObsoleteAttributes:     []string{},
1832 | 		},
1833 | 	}
1834 | 
1835 | }
1836 | 


--------------------------------------------------------------------------------
/htmlparser.go:
--------------------------------------------------------------------------------
  1 | package htmlparser
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"html"
  6 | 	"strconv"
  7 | 	"strings"
  8 | 	"unicode/utf8"
  9 | 	//"fmt"
 10 | )
 11 | 
 12 | const maxInnerTextLengthStored = 65500
 13 | 
 14 | var ignoreTextInsideTag map[string]bool
 15 | 
 16 | type TextCallback func(string, *HtmlElement)
 17 | type ElementCallback func(*HtmlElement, bool)
 18 | type EndElementCallback func(string)
 19 | 
 20 | type HtmlParser struct {
 21 | 	OrigHtml  string
 22 | 	origRunes []rune
 23 | 	stop      bool
 24 | 
 25 | 	textCallback       TextCallback
 26 | 	elementCallback    ElementCallback
 27 | 	endElementCallback EndElementCallback
 28 | 
 29 | 	Errors   []string
 30 | 	Warnings []string
 31 | 
 32 | 	Ids map[string]bool
 33 | 
 34 | 	innerTextBuilder *bytes.Buffer
 35 | 	InnerText        string
 36 | 
 37 | 	HasValidSyntax          bool
 38 | 	HasOnlyValidTags        bool
 39 | 	HasOnlyValidAttributes  bool
 40 | 	HasOnlyKnownTags        bool
 41 | 	HasDeprecatedAttributes bool
 42 | 	HasDeprecatedTags       bool
 43 | 
 44 | 	SkipComments    bool
 45 | 	PreserveCRLFTab bool
 46 | }
 47 | 
 48 | func init() {
 49 | 	ignoreTextInsideTag = map[string]bool{
 50 | 		"head":   true,
 51 | 		"html":   true,
 52 | 		"ol":     true,
 53 | 		"select": true,
 54 | 		"table":  true,
 55 | 		"tbody":  true,
 56 | 		"thead":  true,
 57 | 		"tfoot":  true,
 58 | 		"tr":     true,
 59 | 	}
 60 | }
 61 | 
 62 | func NewParser(html string) HtmlParser {
 63 | 	var parser HtmlParser
 64 | 
 65 | 	parser.OrigHtml = html
 66 | 
 67 | 	parser.SkipComments = true
 68 | 	parser.PreserveCRLFTab = true
 69 | 
 70 | 	parser.innerTextBuilder = bytes.NewBufferString("")
 71 | 
 72 | 	return parser
 73 | }
 74 | 
 75 | func (parser *HtmlParser) Parse(textCallback TextCallback, elementCallback ElementCallback, endElementCallback EndElementCallback) bool {
 76 | 	if parser.stop {
 77 | 		return false
 78 | 	}
 79 | 
 80 | 	parser.textCallback = textCallback
 81 | 	parser.elementCallback = elementCallback
 82 | 	parser.endElementCallback = endElementCallback
 83 | 
 84 | 	parser.HasValidSyntax = true
 85 | 	parser.HasOnlyValidTags = true
 86 | 	parser.HasOnlyValidAttributes = true
 87 | 	parser.HasDeprecatedAttributes = false
 88 | 	parser.HasDeprecatedTags = false
 89 | 	parser.HasOnlyKnownTags = true
 90 | 
 91 | 	parser.Errors = make([]string, 1)
 92 | 	parser.Warnings = make([]string, 1)
 93 | 	parser.Ids = make(map[string]bool, 1)
 94 | 
 95 | 	if parser.OrigHtml == "" {
 96 | 		return true
 97 | 	}
 98 | 
 99 | 	if strings.Index(parser.OrigHtml, "<") < 0 {
100 | 		parser.callText(parser.OrigHtml, nil)
101 | 	} else {
102 | 		parser.internalParse()
103 | 	}
104 | 
105 | 	parser.InnerText = html.UnescapeString(parser.innerTextBuilder.String())
106 | 	parser.stop = true
107 | 
108 | 	return parser.HasValidSyntax
109 | }
110 | 
111 | func (parser *HtmlParser) IsValidStrictHTML401() bool {
112 | 	return parser.HasValidSyntax && parser.HasOnlyValidTags && parser.HasOnlyValidAttributes
113 | }
114 | 
115 | func (parser *HtmlParser) IsValidStrictHTMLNoDeprecated() bool {
116 | 	return parser.HasValidSyntax && parser.HasOnlyValidTags && parser.HasOnlyValidAttributes && !parser.HasDeprecatedAttributes && !parser.HasDeprecatedTags
117 | }
118 | 
119 | func (parser *HtmlParser) IsValidHTML401() bool {
120 | 	return parser.HasValidSyntax && parser.HasOnlyValidTags && parser.HasOnlyValidAttributes
121 | }
122 | 
123 | func (parser *HtmlParser) Stop() {
124 | 	parser.stop = true
125 | }
126 | 
127 | func (p *HtmlParser) callText(text string, parent *HtmlElement) {
128 | 
129 | 	if text == "" {
130 | 		return
131 | 	}
132 | 
133 | 	if !p.PreserveCRLFTab {
134 | 		if !hasContent(text) {
135 | 			return
136 | 		}
137 | 	}
138 | 
139 | 	if parent != nil && parent.ElementInfo != nil {
140 | 		var childrenTypes = parent.ElementInfo.PermittedChildrenTypes
141 | 		if (childrenTypes & (HETText | HETNRCharData)) == 0 {
142 | 			p.addWarning("Text node inside a " + parent.TagName + " element is not valid")
143 | 		}
144 | 	}
145 | 
146 | 	if parent != nil {
147 | 		_, present := ignoreTextInsideTag[parent.TagNameNS]
148 | 		if present {
149 | 			return
150 | 		}
151 | 	}
152 | 
153 | 	clearText := !p.PreserveCRLFTab
154 | 
155 | 	if parent != nil {
156 | 		switch parent.TagNameNS {
157 | 		case "pre":
158 | 			clearText = false
159 | 		case "script":
160 | 			clearText = false
161 | 		case "style":
162 | 			clearText = false
163 | 		}
164 | 	}
165 | 
166 | 	if clearText && strings.HasPrefix(text, "<!--") {
167 | 		clearText = false
168 | 	}
169 | 
170 | 	if clearText {
171 | 		// Remove all tabs, CR, LF
172 | 		n := bytes.NewBufferString("")
173 | 		for _, r := range text {
174 | 			if r == '\n' {
175 | 				n.WriteRune(' ')
176 | 			} else if r >= rune(32) {
177 | 				n.WriteRune(r)
178 | 			}
179 | 		}
180 | 		text = n.String()
181 | 	}
182 | 
183 | 	text = html.UnescapeString(text)
184 | 
185 | 	if p.textCallback != nil {
186 | 		p.textCallback(text, parent)
187 | 		if p.stop {
188 | 			return
189 | 		}
190 | 	}
191 | 
192 | 	useBlock := false
193 | 	if parent != nil && parent.ElementInfo != nil {
194 | 		if parent.ElementInfo.ElementType == HETFlow {
195 | 			useBlock = true
196 | 		} else {
197 | 			// This is whacky. We messed up on the definition of block-level elements for TR/TD
198 | 			// because of the optional tags.
199 | 			switch parent.TagNameNS {
200 | 			case "tr":
201 | 				useBlock = true
202 | 			case "td":
203 | 				useBlock = true
204 | 			}
205 | 		}
206 | 	}
207 | 
208 | 	l := p.innerTextBuilder.Len()
209 | 	if l < maxInnerTextLengthStored {
210 | 		if useBlock {
211 | 			prevChar := '\n'
212 | 			if l > 0 {
213 | 				prevChar, _ = utf8.DecodeLastRune(p.innerTextBuilder.Bytes())
214 | 			}
215 | 
216 | 			if prevChar != '\n' && prevChar != '\r' {
217 | 				p.innerTextBuilder.WriteRune('\n')
218 | 			}
219 | 			p.innerTextBuilder.WriteString(text)
220 | 			p.innerTextBuilder.WriteRune('\n')
221 | 		} else {
222 | 			p.innerTextBuilder.WriteString(text)
223 | 		}
224 | 	}
225 | }
226 | 
227 | func (hp *HtmlParser) internalParse() {
228 | 	openedTags := make([]*HtmlElement, 0)
229 | 	openedBlocks := make([]*HtmlElement, 0)
230 | 
231 | 	anyContent := false
232 | 
233 | 	var parent, blockParent *HtmlElement
234 | 
235 | 	fatal := false
236 | 
237 | 	var text string
238 | 	last := 0
239 | 
240 | 	var c rune
241 | 	hp.origRunes = []rune(hp.OrigHtml)
242 | 	l := len(hp.origRunes)
243 | 	//fmt.Printf("Len: %v\n", l)
244 | 	len1 := l - 1
245 | 
246 | 	for p := 0; p < l; p++ {
247 | 		//fmt.Printf("p=%v | last=%v\n", p, last)
248 | 		if hp.stop {
249 | 			return
250 | 		}
251 | 		c = hp.origRunes[p]
252 | 
253 | 		if c != '<' {
254 | 			continue
255 | 		}
256 | 
257 | 		diff := p - last
258 | 		if diff >= 1 {
259 | 			parent = nil
260 | 			if hp.HasValidSyntax && len(openedTags) > 0 {
261 | 				parent = openedTags[len(openedTags)-1]
262 | 			}
263 | 			//fmt.Printf("1-[%v:%v]\n", last, diff + last)
264 | 			text2 := string(hp.origRunes[last : diff+last])
265 | 			if hasContent(text2) {
266 | 				anyContent = true
267 | 			}
268 | 			hp.callText(text2, parent)
269 | 			if hp.stop {
270 | 				return
271 | 			}
272 | 		}
273 | 		// Yes, this is an open (or closing) tag
274 | 		if p >= len1 {
275 | 			hp.HasValidSyntax = false
276 | 			fatal = true
277 | 			hp.addError("HTML ends with < character")
278 | 			break // the html ends with this open tag
279 | 		}
280 | 
281 | 		startElem := p
282 | 		elem := hp.getElementString(startElem)
283 | 		if elem == "" {
284 | 			fatal = true
285 | 			break // fatal syntax error
286 | 		}
287 | 		ecl := utf8.RuneCountInString(elem)
288 | 		//fmt.Printf("ecl=%v | elem='%v'\n", ecl, elem)
289 | 		p += ecl - 1
290 | 		if ecl <= 2 {
291 | 			// bad HTML, like "<>"
292 | 			hp.addError("Element is empty <>")
293 | 			hp.HasValidSyntax = false
294 | 			continue
295 | 		}
296 | 		last = p + 1
297 | 		parent, blockParent = nil, nil
298 | 		if hp.HasValidSyntax {
299 | 			if len(openedTags) > 0 {
300 | 				parent = openedTags[len(openedTags)-1]
301 | 			}
302 | 			if len(openedBlocks) > 0 {
303 | 				blockParent = openedBlocks[len(openedBlocks)-1]
304 | 			}
305 | 		}
306 | 
307 | 		if elem[1] == '/' {
308 | 			anyContent = true
309 | 			// This is a closing tag
310 | 			tag := parseClosingTag(elem)
311 | 			//fmt.Printf("309tag: %v\n", tag)
312 | 
313 | 			if hp.HasValidSyntax {
314 | 				hp.unwindForClose(tag, &openedTags, &openedBlocks)
315 | 				//fmt.Printf("313-openTags: %v | openedBlocks: %v\n", openedTags, openedBlocks)
316 | 				if hp.stop {
317 | 					return
318 | 				}
319 | 			}
320 | 			if hp.endElementCallback != nil {
321 | 				hp.endElementCallback(tag)
322 | 				if hp.stop {
323 | 					return
324 | 				}
325 | 			}
326 | 		} else {
327 | 			if strings.HasPrefix(elem, "<!") {
328 | 				if strings.HasPrefix(elem, "<!--") {
329 | 					p = p - utf8.RuneCountInString(elem) + 1
330 | 					// It's a comment
331 | 					ec := runesIndexRunesStart(hp.origRunes, []rune("-->"), p+4)
332 | 					if ec == -1 {
333 | 						hp.HasValidSyntax = false
334 | 						fatal = true
335 | 						hp.addError("Missing end comment -->")
336 | 						break
337 | 					}
338 | 					//fmt.Printf("2-[%v:%v]\n", p, ec + 3)
339 | 					text = string(hp.origRunes[p : ec+3])
340 | 					if !hp.SkipComments {
341 | 						hp.callText(text, nil)
342 | 						if hp.stop {
343 | 							return
344 | 						}
345 | 					}
346 | 					p += utf8.RuneCountInString(text)
347 | 					last = p
348 | 					p--
349 | 					continue
350 | 				}
351 | 				// Looks like a doctype
352 | 				e2 := strings.ToLower(elem)
353 | 				//fmt.Printf("351-e2=%v\n", e2)
354 | 				if strings.HasPrefix(e2, "<!doctype ") {
355 | 					if anyContent {
356 | 						hp.HasValidSyntax = false
357 | 						hp.addError("The doctype declaration must be the first element of the HTML:" + e2)
358 | 					}
359 | 					anyContent = true
360 | 					hp.callText(elem, nil)
361 | 					if hp.stop {
362 | 						return
363 | 					}
364 | 					continue
365 | 				}
366 | 			}
367 | 			anyContent = true
368 | 			he := NewHtmlElement(elem, parent, &hp.Errors, &hp.Warnings)
369 | 			if he.HasDeprecatedAttributes {
370 | 				hp.HasDeprecatedAttributes = true
371 | 			}
372 | 			if !he.HasOnlyKnownAttributes {
373 | 				hp.HasOnlyValidAttributes = false
374 | 			}
375 | 
376 | 			if he.Id != "" {
377 | 				_, present := hp.Ids[he.Id]
378 | 				if present {
379 | 					hp.addWarning("Duplicate id: " + he.Id + " - Element: " + he.OriginalOpenTag)
380 | 				}
381 | 				hp.Ids[he.Id] = true
382 | 			}
383 | 
384 | 			if he.SyntaxError {
385 | 				hp.HasValidSyntax = false
386 | 				if he.FatalSyntaxError {
387 | 					fatal = true
388 | 					break
389 | 				}
390 | 				hp.addError("Element syntax error for " + he.OriginalOpenTag)
391 | 				continue
392 | 			}
393 | 
394 | 			// Special cases for script and style
395 | 			if he.TagNameNS == "script" || he.TagNameNS == "style" {
396 | 				startScript := p + 1
397 | 				endScript := 0
398 | 				endTag := ""
399 | 				cp := startScript
400 | 				for {
401 | 					cp = runesIndexRunesStart(hp.origRunes, []rune("</"), cp)
402 | 					if cp == -1 {
403 | 						hp.HasValidSyntax = false
404 | 						fatal = true
405 | 						hp.addError("Missing close tag: " + he.OriginalOpenTag)
406 | 						break
407 | 					}
408 | 					endScript = cp
409 | 					elem = hp.getElementString(cp)
410 | 					if elem == "" {
411 | 						hp.HasValidSyntax = false
412 | 						fatal = true
413 | 						hp.addError("Missing close > for closing tag: " + he.OriginalOpenTag)
414 | 						break
415 | 					}
416 | 					cp += utf8.RuneCountInString(elem)
417 | 					last = cp
418 | 					endTag = parseClosingTag(elem)
419 | 					if endTag == he.TagNameNS {
420 | 						break
421 | 					}
422 | 					endTag = ""
423 | 				}
424 | 				if endTag == "" {
425 | 					p = l
426 | 					break
427 | 				}
428 | 				p = cp - 1
429 | 
430 | 				if hp.elementCallback != nil {
431 | 					hp.elementCallback(he, false)
432 | 					if hp.stop {
433 | 						return
434 | 					}
435 | 				}
436 | 
437 | 				//fmt.Printf("3-[%v:%v]\n", startScript, endScript)
438 | 				hp.callText(string(hp.origRunes[startScript:endScript]), he)
439 | 				if hp.stop {
440 | 					return
441 | 				}
442 | 
443 | 				if hp.endElementCallback != nil {
444 | 					hp.endElementCallback(he.TagNameNS)
445 | 					if hp.stop {
446 | 						return
447 | 					}
448 | 				}
449 | 				continue
450 | 			}
451 | 
452 | 			// We consider this a single element if
453 | 			// 1) the ElementInfo.Single is flagged
454 | 			// 2) It is an unknown element (but not in any namespace)
455 | 			if he.ElementInfo == nil {
456 | 				hp.HasOnlyKnownTags = false
457 | 				// Unknown HTML 4.01 tag
458 | 				if !he.HasNamespace {
459 | 					hp.addWarning("Unknown tag: " + he.TagNameNS)
460 | 					// Really unknown and invalid tag
461 | 					if he.XmlEmptyTag {
462 | 						if hp.elementCallback != nil {
463 | 							hp.elementCallback(he, true)
464 | 							if hp.stop {
465 | 								return
466 | 							}
467 | 						}
468 | 					} else {
469 | 						if hp.elementCallback != nil {
470 | 							hp.elementCallback(he, false)
471 | 							if hp.stop {
472 | 								return
473 | 							}
474 | 						}
475 | 						if hp.HasValidSyntax {
476 | 							openedTags = append(openedTags, he)
477 | 						}
478 | 					}
479 | 				} else {
480 | 					// it is unknown, but correctly declared in an XML namespace
481 | 					if he.XmlEmptyTag {
482 | 						if hp.elementCallback != nil {
483 | 							hp.elementCallback(he, true)
484 | 							if hp.stop {
485 | 								return
486 | 							}
487 | 						}
488 | 					} else {
489 | 						if hp.elementCallback != nil {
490 | 							hp.elementCallback(he, false)
491 | 							if hp.stop {
492 | 								return
493 | 							}
494 | 						}
495 | 						if hp.HasValidSyntax {
496 | 							openedTags = append(openedTags, he)
497 | 						}
498 | 					}
499 | 				}
500 | 			} else {
501 | 				if he.ElementInfo.Obsolete {
502 | 					hp.addWarning("Deprecated Tag: " + he.TagNameNS)
503 | 					hp.HasDeprecatedTags = true
504 | 				}
505 | 
506 | 				// It's  known tag
507 | 				if he.ElementInfo.TagFormatting == HTFSingle || he.XmlEmptyTag {
508 | 					if hp.elementCallback != nil {
509 | 						hp.elementCallback(he, true)
510 | 						if hp.stop {
511 | 							return
512 | 						}
513 | 					}
514 | 				} else {
515 | 					if hp.HasValidSyntax {
516 | 						if he.ElementInfo.ElementType == HETFlow {
517 | 							// Some Tags have optional closing (like LI or TD or P)
518 | 							// We assume an automatic closing for these tags on the following situation:
519 | 							// 1) Current element is block-level, and
520 | 							// 2) Parent node is also a block-level and supports optional closing
521 | 							// 3) Current element is the same class as parent element
522 | 							//    or Current element is the closing tag of the parent element
523 | 							if blockParent != nil && blockParent.ElementInfo.TagFormatting == HTFOptionalClosing {
524 | 								if parent != blockParent {
525 | 									hp.addWarning("Invalid parent for " + blockParent.TagName + " (inside of " + parent.TagName + ")")
526 | 								} else {
527 | 									if he.TagName == blockParent.TagName {
528 | 										if hp.endElementCallback != nil {
529 | 											hp.endElementCallback(parent.TagNameNS)
530 | 											if hp.stop {
531 | 												return
532 | 											}
533 | 										}
534 | 										openedTags = openedTags[:len(openedTags)-1]
535 | 										openedBlocks = openedBlocks[:len(openedBlocks)-1]
536 | 									}
537 | 								}
538 | 							}
539 | 							if hp.HasValidSyntax {
540 | 								openedBlocks = append(openedBlocks, he)
541 | 							}
542 | 						}
543 | 						if hp.HasValidSyntax {
544 | 							openedTags = append(openedTags, he)
545 | 						}
546 | 					}
547 | 					if hp.elementCallback != nil {
548 | 						hp.elementCallback(he, false)
549 | 					}
550 | 
551 | 				}
552 | 			}
553 | 
554 | 		}
555 | 	} // for loop
556 | 
557 | 	//fmt.Printf("554-Out\n")
558 | 
559 | 	if !fatal {
560 | 		// commit the last piece of text
561 | 		parent = nil
562 | 		//fmt.Printf("559\n")
563 | 		if hp.HasValidSyntax && len(openedTags) > 0 {
564 | 			parent = openedTags[len(openedTags)-1]
565 | 		}
566 | 
567 | 		//fmt.Printf("564\n")
568 | 		if last < l {
569 | 			//fmt.Printf("564-[%v:]\n", last)
570 | 			text = string(hp.origRunes[last:])
571 | 			hp.callText(text, parent)
572 | 			if hp.stop {
573 | 				return
574 | 			}
575 | 		}
576 | 
577 | 		//fmt.Printf("574\n")
578 | 		if hp.HasValidSyntax {
579 | 			//fmt.Printf("576-openedTags: %v %v\n", len(openedTags), openedTags)
580 | 			for len(openedTags) > 0 {
581 | 				parent = openedTags[len(openedTags)-1]
582 | 				if parent.ElementInfo == nil || parent.ElementInfo.TagFormatting != HTFOptionalClosing {
583 | 					break
584 | 				}
585 | 				if hp.endElementCallback != nil {
586 | 					hp.endElementCallback(parent.TagNameNS)
587 | 					if hp.stop {
588 | 						return
589 | 					}
590 | 				}
591 | 				openedTags = openedTags[:len(openedTags)-1]
592 | 				blockParent = nil
593 | 				if len(openedBlocks) > 0 {
594 | 					blockParent = openedBlocks[len(openedBlocks)-1]
595 | 				}
596 | 				if parent == blockParent {
597 | 					openedBlocks = openedBlocks[:len(openedBlocks)-1]
598 | 				}
599 | 			}
600 | 		}
601 | 		//fmt.Printf("598\n")
602 | 	}
603 | 
604 | 	//fmt.Printf("596-OUt\n")
605 | 
606 | 	if hp.HasValidSyntax {
607 | 		if len(openedBlocks) > 0 {
608 | 			if len(openedTags) != len(openedBlocks) {
609 | 				hp.HasValidSyntax = false
610 | 				hp.addError("Missing " + strconv.Itoa(len(openedTags)) + " tag(s) closing.")
611 | 			} else {
612 | 				for len(openedBlocks) > 0 {
613 | 					blockParent = openedBlocks[len(openedBlocks)-1]
614 | 					openedBlocks = openedBlocks[:len(openedBlocks)-1]
615 | 					parent = openedTags[len(openedTags)-1]
616 | 					openedTags = openedTags[:len(openedTags)-1]
617 | 					if parent != blockParent {
618 | 						hp.HasValidSyntax = false
619 | 						hp.addError("Missing a close tag for a block-element. Opened Tag: " + parent.TagNameNS)
620 | 						break
621 | 					}
622 | 					if hp.endElementCallback != nil {
623 | 						hp.endElementCallback(parent.TagNameNS)
624 | 						if hp.stop {
625 | 							return
626 | 						}
627 | 					}
628 | 				}
629 | 
630 | 			}
631 | 		} else if len(openedTags) > 0 {
632 | 			hp.addError("Missing " + strconv.Itoa(len(openedTags)) + " tag(s) closing.")
633 | 			hp.HasValidSyntax = false
634 | 		}
635 | 	}
636 | }
637 | 
638 | func (hp *HtmlParser) unwindForClose(tag string, openedTags, openedBlocks *[]*HtmlElement) {
639 | 	var parent, blockParent *HtmlElement
640 | 	if len(*openedTags) > 0 {
641 | 		parent = (*openedTags)[len(*openedTags)-1]
642 | 	}
643 | 
644 | 	if parent == nil {
645 | 		hp.HasValidSyntax = false
646 | 		hp.addError("Closing tag without opening: " + tag)
647 | 		return
648 | 	}
649 | 	//fmt.Printf("637-Parent:%v\n", parent)
650 | 
651 | 	firstParent := parent.TagNameNS
652 | 
653 | 	if len(*openedBlocks) > 0 {
654 | 		blockParent = (*openedBlocks)[len(*openedBlocks)-1]
655 | 	}
656 | 
657 | 	//fmt.Printf("645-openTags: %v | openedBlocks: %v\n", *openedTags, *openedBlocks)
658 | 
659 | 	for parent != nil {
660 | 		if parent.TagNameNS == tag {
661 | 			*openedTags = (*openedTags)[:len(*openedTags)-1]
662 | 			//fmt.Printf("648-openTags: %v | openedBlocks: %v\n", *openedTags, *openedBlocks)
663 | 			if blockParent != nil && blockParent.TagNameNS == tag {
664 | 				*openedBlocks = (*openedBlocks)[:len(*openedBlocks)-1]
665 | 			}
666 | 			return
667 | 		}
668 | 
669 | 		if parent.ElementInfo == nil {
670 | 			break // mismatch
671 | 		}
672 | 
673 | 		// This could be either a tag mismatch, or an optional element missing
674 | 		if parent.ElementInfo.TagFormatting != HTFOptionalClosing {
675 | 			break // mismatch
676 | 		}
677 | 
678 | 		// inject the optional closing tag
679 | 		if hp.endElementCallback != nil {
680 | 			hp.endElementCallback(parent.TagNameNS)
681 | 			if hp.stop {
682 | 				return
683 | 			}
684 | 		}
685 | 
686 | 		if len(*openedTags) == 0 {
687 | 			break
688 | 		}
689 | 		*openedTags = (*openedTags)[:len(*openedTags)-1]
690 | 		if blockParent == parent {
691 | 			*openedBlocks = (*openedBlocks)[:len(*openedBlocks)-1]
692 | 			blockParent = nil
693 | 			if len(*openedBlocks) > 0 {
694 | 				blockParent = (*openedBlocks)[len(*openedBlocks)-1]
695 | 			}
696 | 		}
697 | 		parent = parent.Parent
698 | 	}
699 | 
700 | 	hp.addError("Tag mismatch. Open tag: " + firstParent + " / Closing tag: " + tag)
701 | 	hp.HasValidSyntax = false
702 | }
703 | 
704 | func (hp *HtmlParser) addError(error string) {
705 | 	hp.Errors = append(hp.Errors, error)
706 | }
707 | 
708 | func (hp *HtmlParser) addWarning(wrn string) {
709 | 	hp.Warnings = append(hp.Warnings, wrn)
710 | }
711 | 
712 | func (hp *HtmlParser) getElementString(startPos int) string {
713 | 	var c rune
714 | 	endElem := 0
715 | 	l := len(hp.origRunes)
716 | 	p := startPos
717 | 	for ; p < l; p++ {
718 | 		c = hp.origRunes[p]
719 | 		if c == '>' {
720 | 			endElem = p
721 | 			break
722 | 		}
723 | 		if c == '"' || c == '\'' {
724 | 			p = runesIndexRunesStart(hp.origRunes, []rune{c}, p+1)
725 | 			if p == -1 {
726 | 				// Not well formed HTML: <div attr="value>   (missing quote)
727 | 				logString := hp.origRunes[startPos:]
728 | 
729 | 				if len(logString) > 40 {
730 | 					logString = logString[:40]
731 | 				}
732 | 				hp.addError("Attribute start quote but doesn't end with quote: " + string(logString))
733 | 				hp.HasValidSyntax = false
734 | 				return ""
735 | 			}
736 | 		}
737 | 	}
738 | 
739 | 	if endElem == 0 {
740 | 		hp.HasValidSyntax = false
741 | 		logString := hp.origRunes[startPos:]
742 | 		if len(logString) > 40 {
743 | 			logString = logString[0:40]
744 | 		}
745 | 		hp.addError("Can't find > for tag: " + string(logString))
746 | 		return ""
747 | 	}
748 | 
749 | 	return string(hp.origRunes[startPos : endElem+1])
750 | 
751 | }
752 | 


--------------------------------------------------------------------------------
/htmlparser_test.go:
--------------------------------------------------------------------------------
  1 | package htmlparser
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"html"
  6 | 	"strings"
  7 | 	//"fmt"
  8 | 	"testing"
  9 | )
 10 | 
 11 | func Test__SimpleSegments(t *testing.T) {
 12 | 	testSegments(t, true, []string{
 13 | 		"",
 14 | 		"text only",
 15 | 		"text only with &gt; entities",
 16 | 		"<b></b>",
 17 | 		"<b>bold</b>",
 18 | 		"a<b>bold</b>",
 19 | 		"a<b>bold</b>b",
 20 | 		"<b>bold<i>italic-bold</i>bold</b>",
 21 | 	})
 22 | }
 23 | 
 24 | func Test_OptionalClosing(t *testing.T) {
 25 | 	testSegments(t, true,
 26 | 		[]string{
 27 | 			"<p>",
 28 | 			"a<p>",
 29 | 			"a<p>b",
 30 | 			"a<p>b</p>c",
 31 | 			"a<p>b<br>c</p>",
 32 | 			"a<p>b<br>c",
 33 | 			"a<p>b</p><p>c</p>",
 34 | 			"a<p>b</p><p>c",
 35 | 			"a<p>b<p>c",
 36 | 			"a<p>b<b>c</b>d</p><p>e</p>",
 37 | 			"a<p>b<b>c</b>d</p><p>e",
 38 | 			"a<p>b<b>c</b>d<p>e",
 39 | 		})
 40 | }
 41 | 
 42 | func Test_OptionalClosingWithBlockElement(t *testing.T) {
 43 | 	testSegments(t, true, []string{
 44 | 		"<ul><li><div>a</div></li></ul>",
 45 | 		"<ul><li>a<div>b</div></li></ul>",
 46 | 		"<ul><li><div>a</div>b</li></ul>",
 47 | 		"<ul><li><div>a</div></ul>",
 48 | 		"<ul><li>a<div>b</div></ul>",
 49 | 		"<ul><li><div>a</div>b</ul>",
 50 | 	})
 51 | }
 52 | 
 53 | func Test_Scripts(t *testing.T) {
 54 | 	testSegments(t, true, []string{
 55 | 		"<script src=abc></script>",
 56 | 		"<script>if(a<b) c;</script>",
 57 | 		"abc<script>if(a<b) c;</script>def",
 58 | 	})
 59 | }
 60 | 
 61 | func Test_Comments(t *testing.T) {
 62 | 	testSegments(t, true, []string{
 63 | 		"<!-- comment -->",
 64 | 		"<!-- this is <b>a long comment</b> -->",
 65 | 		"abcd<!-- comments <b>bold</bold> -->ghij",
 66 | 	})
 67 | }
 68 | 
 69 | func Test_Table(t *testing.T) {
 70 | 	testSegments(t, true, []string{
 71 | 		"<table> <tr> <td>a</td> </table>",
 72 | 		"<table> <tr> <td>a</table>",
 73 | 		"<table> <tr> <td><p>a</table>",
 74 | 	})
 75 | }
 76 | 
 77 | func Test_CompleteHtml(t *testing.T) {
 78 | 	testSegments(t, true, []string{
 79 | 		"<html><head><title>title</title></head><body>body</body></html>",
 80 | 		"<html><head><title>hello</title><body>body",
 81 | 		`<!doctype html public "-//w3c//dtd html 4.0 strict//en"><html><head><title>title</title></head><body>body</body></html>`,
 82 | 	})
 83 | }
 84 | 
 85 | func Test_SingleTags(t *testing.T) {
 86 | 	testSegments(t, true, []string{
 87 | 		"<br>",
 88 | 		"<hr>",
 89 | 		"<br/>",
 90 | 		"<br />",
 91 | 		"< br />",
 92 | 		"< br / >",
 93 | 		"<hr size=1>",
 94 | 		"<hr size=\"1\">",
 95 | 		"<hr size=1/>",
 96 | 		"<hr size=\"1\"/>",
 97 | 	})
 98 | }
 99 | 
100 | func Test_Attributes(t *testing.T) {
101 | 	testSegments(t, true, []string{
102 | 		"<a href></a>",
103 | 		"<font size=1 face=verdana>a</font>",
104 | 		"<font size=\"1\" face=verdana>a</font>",
105 | 		"<font size=1 face=\"verdana\">a</font>",
106 | 		"<font size=\"1\" face=\"verdana\">a</font>",
107 | 	})
108 | 
109 | }
110 | 
111 | func Test_StyleTag(t *testing.T) {
112 | 	testSegments(t, true, []string{
113 | 		"<head><style>.a { background-url: 'ab<>c.jpg'; }</style></head>",
114 | 	})
115 | }
116 | 
117 | func Test_InvalidSegments(t *testing.T) {
118 | 	testSegments(t, false, []string{
119 | 		"<",
120 | 		"<b",
121 | 		"<hr",
122 | 		"<>",
123 | 		"< >",
124 | 		" < > ",
125 | 		"<b>",
126 | 		"a<b>b<i>c</b>d</i>e",
127 | 		"<a href=></a>",
128 | 		"<a href=\"abc></a>",
129 | 		"<!-- missing closing",
130 | 	})
131 | }
132 | 
133 | func Test_InnerText(t *testing.T) {
134 | 	segments := []struct {
135 | 		Item1 string
136 | 		Item2 string
137 | 	}{
138 | 		{"", "<b></b>"},
139 | 		{"a", "<b>a</b>"},
140 | 		{"a", "a<b></b>"},
141 | 		{"a", "<b></b>a"},
142 | 		{"abc", "a<b>b</b>c"},
143 | 		{"ac", "a<!--b-->c"},
144 | 	}
145 | 
146 | 	for _, segment := range segments {
147 | 
148 | 		var parser = NewParser(segment.Item2)
149 | 		if !parser.Parse(nil, nil, nil) {
150 | 			t.Error()
151 | 		}
152 | 		if segment.Item1 != parser.InnerText {
153 | 			t.Error(segment.Item1)
154 | 		}
155 | 
156 | 	}
157 | }
158 | 
159 | func Test_PreserveComments(t *testing.T) {
160 | 	segment := "a<b>b<!--comment-->c</b>d"
161 | 	parser := NewParser(segment)
162 | 	parser.SkipComments = false
163 | 	if !parser.Parse(nil, nil, nil) {
164 | 		t.Error()
165 | 	}
166 | 	if parser.InnerText != "ab<!--comment-->cd" {
167 | 		t.Error()
168 | 	}
169 | 
170 | }
171 | 
172 | func Test_ComplexHtml(t *testing.T) {
173 | 	parser := NewParser(googleHomepage)
174 | 	parser.Parse(nil, nil, nil)
175 | 	if !parser.HasValidSyntax {
176 | 		t.Error()
177 | 	}
178 | }
179 | 
180 | func Test_CustomInnerText(t *testing.T) {
181 | 	segment := "a<b>b</b>c<!--comment-->d<p>e"
182 | 
183 | 	n := bytes.NewBufferString("")
184 | 
185 | 	parser := NewParser(segment)
186 | 
187 | 	parser.Parse(func(text string, he *HtmlElement) {
188 | 		n.WriteString(text)
189 | 	}, nil, nil)
190 | 
191 | 	if n.String() != "abcde" {
192 | 		t.Error()
193 | 	}
194 | 
195 | }
196 | 
197 | func Test_UrlAttribute(t *testing.T) {
198 | 	segment := `<b> <link rel="alternate" type="application/rss+xml" title="M-Shaped Brain &raquo; Feed" href="http://blog.calbucci.com/feed/" /> </b>`
199 | 
200 | 	foundIt := false
201 | 	parser := NewParser(segment)
202 | 
203 | 	parser.Parse(nil, func(e *HtmlElement, isEmpty bool) {
204 | 		t.Logf("E: %v (attr=%v)\n", e.TagName, e.Attributes)
205 | 		if e.TagName == "link" {
206 | 			if len(e.Attributes) != 4 {
207 | 				t.Error()
208 | 			}
209 | 			if title, _ := e.GetAttributeValue("title"); title != "M-Shaped Brain » Feed" {
210 | 				t.Error()
211 | 			}
212 | 			if href, _ := e.GetAttributeValue("href"); href != "http://blog.calbucci.com/feed/" {
213 | 				t.Error()
214 | 			}
215 | 
216 | 			foundIt = true
217 | 		}
218 | 	}, nil)
219 | 
220 | 	if !foundIt {
221 | 		t.Error()
222 | 	}
223 | 
224 | }
225 | 
226 | func Test_FindRSSFeed(t *testing.T) {
227 | 	rssFeed := ""
228 | 	parser := NewParser(blogPost)
229 | 
230 | 	parser.Parse(nil, func(e *HtmlElement, isEmpty bool) {
231 | 		if e.TagName == "link" {
232 | 
233 | 			if ty, _ := e.GetAttributeValue("type"); ty == "application/rss+xml" {
234 | 				t.Logf("rss-e: %v %v\n", e.TagName, e.Attributes)
235 | 				rssFeed, _ = e.GetAttributeValue("href")
236 | 				parser.Stop()
237 | 			}
238 | 		}
239 | 	}, nil)
240 | 
241 | 	t.Logf("rssFeed=%v\n", rssFeed)
242 | 	if rssFeed != "http://blog.calbucci.com/feed/" {
243 | 		t.Error()
244 | 	}
245 | 
246 | }
247 | 
248 | func Test_Idempotent(t *testing.T) {
249 | 	baseHtml := blogPost
250 | 	html1 := parseAndSerialize(baseHtml)
251 | 	html2 := parseAndSerialize(html1)
252 | 	html3 := parseAndSerialize(html2)
253 | 
254 | 	if html1 != html2 {
255 | 
256 | 		max := len(html1)
257 | 		if max > len(html2) {
258 | 			max = len(html2)
259 | 		}
260 | 		for i := 0; i < max; i++ {
261 | 			if html1[i] != html2[i] {
262 | 				i -= 20
263 | 				if i < 0 {
264 | 					i = 0
265 | 				}
266 | 				e := i + 30
267 | 				if e > max {
268 | 					e = max
269 | 				}
270 | 				t.Logf("Mismatch1: %v\n", html1[i:e])
271 | 				t.Logf("Mismatch2: %v\n", html2[i:e])
272 | 				break
273 | 			}
274 | 		}
275 | 
276 | 		t.Error()
277 | 	}
278 | 	if html2 != html3 {
279 | 		t.Error()
280 | 	}
281 | }
282 | 
283 | func parseAndSerialize(origHtml string) string {
284 | 	parser := NewParser(origHtml)
285 | 
286 | 	parser.PreserveCRLFTab = false
287 | 
288 | 	n := bytes.NewBufferString("")
289 | 
290 | 	parser.Parse(func(text string, parent *HtmlElement) {
291 | 		escaped := html.EscapeString(text)
292 | 		n.WriteString(escaped)
293 | 	}, func(parent *HtmlElement, isEmptyTag bool) {
294 | 		n.WriteString(parent.GetOpenTag(false, false))
295 | 	}, func(closeTag string) {
296 | 		n.WriteString("</" + closeTag + ">")
297 | 	})
298 | 
299 | 	return n.String()
300 | }
301 | 
302 | func Test_FindOpenGraphTags(t *testing.T) {
303 | 	parser := NewParser(blogPost)
304 | 
305 | 	tags := make(map[string]string)
306 | 
307 | 	parser.Parse(nil, func(element *HtmlElement, isEmptyTag bool) {
308 | 		if element.TagName == "meta" {
309 | 			ogName, _ := element.GetAttributeValue("property")
310 | 			if ogName == "" || !strings.HasPrefix(ogName, "og:") {
311 | 				return
312 | 			}
313 | 			ogValue, _ := element.GetAttributeValue("content")
314 | 			tags[ogName] = ogValue
315 | 		}
316 | 	}, nil)
317 | 
318 | 	if !parser.HasValidSyntax {
319 | 		t.Error()
320 | 	}
321 | 
322 | 	if v, _ := tags["og:type"]; v != "article" {
323 | 		t.Error()
324 | 	}
325 | 
326 | 	if v, _ := tags["og:url"]; v != "http://blog.calbucci.com/2015/01/27/attention-cannibalization/" {
327 | 		t.Error()
328 | 	}
329 | 
330 | }
331 | 
332 | func testSegments(t *testing.T, result bool, segments []string) {
333 | 	for _, segment := range segments {
334 | 		t.Logf("Processing: %v\n", segment)
335 | 		parser := NewParser(segment)
336 | 		if parser.Parse(nil, nil, nil) != result {
337 | 			t.Errorf("Failed to parse segment: " + segment)
338 | 		}
339 | 	}
340 | }
341 | 


--------------------------------------------------------------------------------