├── .gitignore ├── LICENSE ├── README.md ├── enums.go ├── generic.go ├── generic_test.go ├── htmlelement.go ├── htmlelementinfo.go ├── htmlelementinfo_init.go ├── htmlparser.go ├── htmlparser_test.go └── testcontent.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | *.test 24 | *.prof 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Marcelo Calbucci 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # go-htmlparser 2 | Event-based, HTML 5.0 compliant parser in Go (SAX-style parsing) 3 | 4 | ## Typical Scenarios 5 | - Use it to scrape pieces of HTML 6 | - Detect META / LINK tags (e.g. Open Graph tags) 7 | - Optimize the output HTML (remove whitespace, clear empty tags) 8 | - Detect HTML syntax errors and notify developers 9 | - Extract text from the HTML 10 | 11 | 12 | ## Sample 13 | 14 | ### Get the RSS Feed of a website 15 | 16 | ```go 17 | rssFeed := "" 18 | parser := NewParser(htmlContent) 19 | 20 | parser.Parse(nil, func(e *HtmlElement, isEmpty bool) { 21 | if e.TagName == "link" { 22 | 23 | if ty,_ := e.GetAttributeValue("type"); ty == "application/rss+xml" { 24 | fmt.Printf("rss-e: %v %v\n", e.TagName, e.Attributes) 25 | rssFeed,_ = e.GetAttributeValue("href") 26 | parser.Stop() 27 | } 28 | } 29 | }, nil) 30 | 31 | fmt.Println(rssFeed) 32 | ``` 33 | 34 | ### Remove whitespace 35 | 36 | ```go 37 | parser := NewParser(origHtml) 38 | 39 | parser.PreserveCRLFTab = false 40 | 41 | n := bytes.NewBufferString("") 42 | 43 | parser.Parse(func(text string, parent *HtmlElement) { 44 | escaped := html.EscapeString(text) 45 | n.WriteString(escaped) 46 | }, func(parent *HtmlElement, isEmptyTag bool) { 47 | n.WriteString(parent.GetOpenTag(false, false)) 48 | }, func(closeTag string) { 49 | n.WriteString("</" + closeTag + ">") 50 | }) 51 | 52 | newHtml := n.String() 53 | ``` 54 | 55 | 56 | 57 | ## Questions 58 | 59 | 60 | 61 | ## Contributors 62 | 63 | - HtmlParser was originally created by *Marcelo Calbucci* 
// AttrStatus indicates the status of an attribute on a given element.
type AttrStatus uint8

const (
	ASValid      AttrStatus = iota // the attribute is listed as valid for the element
	ASDeprecated                   // the attribute is listed as obsolete for the element
	ASUnknown                      // the attribute is not listed for the element
)

// HtmlElementType is a bit set describing the content category of an HTML
// element according to the HTML 5.0 spec. Values can be combined with |.
type HtmlElementType uint8

// Every constant carries the HtmlElementType type explicitly. Previously only
// HETPhrasing was typed and the remaining single-bit values were untyped
// integer constants, which defeated type checking at call sites.
const (
	HETPhrasing   HtmlElementType = 0x1  // former "inline element"
	HETFlow       HtmlElementType = 0x2  // former "block element"
	HETMeta       HtmlElementType = 0x4  // control elements
	HETText       HtmlElementType = 0x8  // text block
	HETNRCharData HtmlElementType = 0x10 // Non-Replaceable Char Data

	HETAnyContent  HtmlElementType = HETPhrasing | HETFlow | HETText
	HETTransparent HtmlElementType = HETPhrasing | HETFlow
	HETNone        HtmlElementType = 0
)

// HtmlTagFormatting describes whether an element takes a closing tag.
type HtmlTagFormatting uint8

const (
	// HTFSingle: has no closing tag, e.g. <area>.
	HTFSingle HtmlTagFormatting = iota
	// HTFOptionalClosing: has an optional closing tag, e.g. <li>
	// (NOTE(review): the original example was lost in extraction; <li> is presumed).
	HTFOptionalClosing
	// HTFComplete: must have a closing tag.
	HTFComplete
)
  • 32 | HTFComplete // must have a closing tag 33 | ) 34 | -------------------------------------------------------------------------------- /generic.go: -------------------------------------------------------------------------------- 1 | package htmlparser 2 | 3 | import ( 4 | "bytes" 5 | "sort" 6 | "strings" 7 | "unicode" 8 | ) 9 | 10 | // union add all the elements from slice2 to slice1 if not present 11 | func union(slice1, slice2 []string) []string { 12 | if slice1 == nil { 13 | slice1 = make([]string, len(slice2)) 14 | copy(slice1, slice2) 15 | sort.Strings(slice1) 16 | return slice1 17 | } 18 | for _, e2 := range slice2 { 19 | found := false 20 | for _, e1 := range slice1 { 21 | if e1 == e2 { 22 | found = true 23 | break 24 | } 25 | } 26 | if !found { 27 | slice1 = append(slice1, e2) 28 | } 29 | } 30 | sort.Strings(slice1) 31 | return slice1 32 | } 33 | 34 | // sorted_contains check if a string is present in a slice using binary search 35 | func sorted_contains(slice []string, element string) bool { 36 | if slice == nil || len(slice) == 0 { 37 | return false 38 | } 39 | pos := sort.SearchStrings(slice, element) 40 | return pos != len(slice) && slice[pos] == element 41 | } 42 | 43 | // contains check if a string is present in a slice 44 | func contains(s []string, e string) bool { 45 | for _, a := range s { 46 | if a == e { 47 | return true 48 | } 49 | } 50 | return false 51 | } 52 | 53 | // convertSemicolonDelimited converts a semi-colon delimited string into a slice of strings and sort them 54 | func convertSemicolonDelimited(text string) []string { 55 | if len(text) > 0 { 56 | strList := strings.Split(text, ";") 57 | if len(strList) > 0 { 58 | for i, s := range strList { 59 | strList[i] = strings.ToLower(s) 60 | } 61 | sort.Strings(strList) 62 | return strList 63 | } 64 | } 65 | return nil 66 | } 67 | 68 | // runesLastIndex finds the last occurrance of a rune r in the sequence runes 69 | func runesLastIndex(runes []rune, r rune) int { 70 | 71 | for i := 
// runesLastIndex returns the index of the last occurrence of r in runes, or
// -1 if r does not occur.
func runesLastIndex(runes []rune, r rune) int {
	for i := len(runes) - 1; i >= 0; i-- {
		if runes[i] == r {
			return i
		}
	}
	return -1
}

// runesIndex returns the index of the first occurrence of r in runes, or -1
// if r does not occur.
func runesIndex(runes []rune, r rune) int {
	for i, v := range runes {
		if v == r {
			return i
		}
	}
	return -1
}

// runesIndexRunesStart returns the index of the first occurrence of sub in
// runes at or after position start, or -1 if there is none. An empty sub
// never matches. A negative start is treated as 0; it used to cause an
// index-out-of-range panic.
func runesIndexRunesStart(runes []rune, sub []rune, start int) int {
	if start < 0 {
		start = 0
	}

	// last is the highest index at which sub could still fit.
	last := len(runes) - len(sub)
	if len(sub) == 0 || last < 0 {
		return -1
	}

	for ; start <= last; start++ {
		match := true
		for i, s := range sub {
			if runes[start+i] != s {
				match = false
				break
			}
		}
		if match {
			return start
		}
	}
	return -1
}

// trimInBetween replaces every run of whitespace and/or control characters
// with a single space. Leading and trailing runs are collapsed, not removed.
func trimInBetween(str string) string {
	if str == "" {
		return str
	}

	var buf bytes.Buffer
	lastSpace := false
	for _, r := range str {
		if unicode.IsSpace(r) || unicode.IsControl(r) {
			if !lastSpace {
				buf.WriteRune(' ')
				lastSpace = true
			}
			continue
		}
		buf.WriteRune(r)
		lastSpace = false
	}
	return buf.String()
}

// hasContent reports whether text contains at least one character that is
// neither whitespace nor a control character.
func hasContent(text string) bool {
	for _, r := range text {
		if !unicode.IsSpace(r) && !unicode.IsControl(r) {
			return true
		}
	}
	return false
}
| package htmlparser 2 | 3 | import ( 4 | "strconv" 5 | "testing" 6 | ) 7 | 8 | func Test_Union(t *testing.T) { 9 | a := []string{"a", "b"} 10 | b := []string{"a", "c"} 11 | 12 | c := union(a, b) 13 | 14 | if len(c) != 3 { 15 | t.Error() 16 | } 17 | } 18 | 19 | func Test_Sorted_contains(t *testing.T) { 20 | a := []string{"a", "b", "c"} 21 | 22 | for _, v := range a { 23 | if !sorted_contains(a, v) { 24 | t.Error(v) 25 | } 26 | } 27 | 28 | if sorted_contains(a, "d") { 29 | t.Error("d") 30 | } 31 | } 32 | 33 | func Test_Contains(t *testing.T) { 34 | a := []string{"a", "b", "c"} 35 | 36 | for _, v := range a { 37 | if !sorted_contains(a, v) { 38 | t.Error(v) 39 | } 40 | } 41 | 42 | if sorted_contains(a, "d") { 43 | t.Error("d") 44 | } 45 | } 46 | 47 | func Test_convertSemicolonDelimited(t *testing.T) { 48 | r := convertSemicolonDelimited("") 49 | if r != nil { 50 | t.Error() 51 | } 52 | 53 | r = convertSemicolonDelimited("a") 54 | if len(r) != 1 || r[0] != "a" { 55 | t.Error() 56 | } 57 | 58 | r = convertSemicolonDelimited("a;a") 59 | if len(r) != 2 || r[0] != "a" { 60 | t.Error() 61 | } 62 | 63 | r = convertSemicolonDelimited("b;a") 64 | if len(r) != 2 || r[0] != "a" || r[1] != "b" { 65 | t.Error() 66 | } 67 | 68 | } 69 | 70 | func Test_runesLastIndex(t *testing.T) { 71 | if runesLastIndex([]rune(""), 'a') != -1 { 72 | t.Error() 73 | } 74 | 75 | if runesLastIndex([]rune("a"), 'a') != 0 { 76 | t.Error() 77 | } 78 | 79 | if runesLastIndex([]rune("bac"), 'a') != 1 { 80 | t.Error() 81 | } 82 | 83 | if runesLastIndex([]rune("bba"), 'a') != 2 { 84 | t.Error() 85 | } 86 | 87 | if runesLastIndex([]rune("abab"), 'a') != 2 { 88 | t.Error() 89 | } 90 | 91 | if runesLastIndex([]rune("defg"), 'a') != -1 { 92 | t.Error() 93 | } 94 | } 95 | 96 | func Test_runesIndex(t *testing.T) { 97 | if runesIndex([]rune(""), 'a') != -1 { 98 | t.Error() 99 | } 100 | 101 | if runesIndex([]rune("a"), 'a') != 0 { 102 | t.Error() 103 | } 104 | 105 | if runesIndex([]rune("bac"), 'a') != 1 { 106 | 
t.Error() 107 | } 108 | 109 | if runesIndex([]rune("bba"), 'a') != 2 { 110 | t.Error() 111 | } 112 | 113 | if runesIndex([]rune("abab"), 'a') != 0 { 114 | t.Error() 115 | } 116 | 117 | if runesIndex([]rune("defg"), 'a') != -1 { 118 | t.Error() 119 | } 120 | } 121 | 122 | func Test_runesIndexRunesStart(t *testing.T) { 123 | if runesIndexRunesStart([]rune(""), []rune(""), 0) != -1 { 124 | t.Error() 125 | } 126 | 127 | if runesIndexRunesStart([]rune("abc"), []rune(""), 0) != -1 { 128 | t.Error() 129 | } 130 | 131 | if runesIndexRunesStart([]rune("abc"), []rune("d"), 0) != -1 { 132 | t.Error() 133 | } 134 | 135 | if runesIndexRunesStart([]rune("abc"), []rune("def"), 0) != -1 { 136 | t.Error() 137 | } 138 | 139 | if runesIndexRunesStart([]rune("abc"), []rune("abd"), 0) != -1 { 140 | t.Error() 141 | } 142 | 143 | if runesIndexRunesStart([]rune("abc"), []rune("a"), 0) != 0 { 144 | t.Error() 145 | } 146 | 147 | if runesIndexRunesStart([]rune("abc"), []rune("c"), 0) != 2 { 148 | t.Error() 149 | } 150 | 151 | if runesIndexRunesStart([]rune("abc"), []rune("abc"), 0) != 0 { 152 | t.Error() 153 | } 154 | 155 | if runesIndexRunesStart([]rune("abab"), []rune("ab"), 0) != 0 { 156 | t.Error() 157 | } 158 | 159 | if r := runesIndexRunesStart([]rune("abab"), []rune("ab"), 1); r != 2 { 160 | t.Error(strconv.Itoa(r)) 161 | } 162 | 163 | } 164 | 165 | func Test_trimInBetween(t *testing.T) { 166 | 167 | if trimInBetween("") != "" { 168 | t.Error() 169 | } 170 | 171 | if trimInBetween("abc") != "abc" { 172 | t.Error() 173 | } 174 | 175 | if trimInBetween(" abc ") != " abc " { 176 | t.Error() 177 | } 178 | 179 | if trimInBetween("a b c") != "a b c" { 180 | t.Error() 181 | } 182 | 183 | if trimInBetween("a b c") != "a b c" { 184 | t.Error() 185 | } 186 | 187 | if trimInBetween("a\nb\nc") != "a b c" { 188 | t.Error() 189 | } 190 | 191 | if r := trimInBetween("a\n\nb \n c"); r != "a b c" { 192 | t.Error(r) 193 | } 194 | } 195 | 196 | func Test_hasContent(t *testing.T) { 197 | if 
hasContent("") { 198 | t.Error() 199 | } 200 | 201 | if hasContent(" ") { 202 | t.Error() 203 | } 204 | 205 | if hasContent("\r") { 206 | t.Error() 207 | } 208 | 209 | if hasContent("\r\n\t ") { 210 | t.Error() 211 | } 212 | 213 | if !hasContent("a") { 214 | t.Error() 215 | } 216 | 217 | if !hasContent(" a") { 218 | t.Error() 219 | } 220 | 221 | if !hasContent("a ") { 222 | t.Error() 223 | } 224 | 225 | if !hasContent("\t \n a") { 226 | t.Error() 227 | } 228 | 229 | } 230 | -------------------------------------------------------------------------------- /htmlelement.go: -------------------------------------------------------------------------------- 1 | package htmlparser 2 | 3 | import ( 4 | "bytes" 5 | "html" 6 | "strings" 7 | "unicode" 8 | ) 9 | 10 | type QuoteType uint8 11 | 12 | const ( 13 | QTNone QuoteType = iota 14 | QTSingle 15 | QTDouble 16 | ) 17 | 18 | type attributeInfo struct { 19 | Name string 20 | Value string 21 | } 22 | 23 | type HtmlElement struct { 24 | errors *[]string 25 | warnings *[]string 26 | 27 | TagName string 28 | TagNameNS string 29 | Id string 30 | Attributes []attributeInfo 31 | ElementInfo *HtmlElementInfo 32 | Namespace string 33 | HasNamespace bool 34 | XmlEmptyTag bool 35 | Parent *HtmlElement 36 | HasDeprecatedAttributes bool 37 | HasOnlyKnownAttributes bool 38 | SyntaxError bool 39 | FatalSyntaxError bool 40 | OriginalOpenTag string 41 | } 42 | 43 | func NewHtmlElement(openElement string, parent *HtmlElement, errors, warnings *[]string) *HtmlElement { 44 | 45 | he := new(HtmlElement) 46 | he.OriginalOpenTag = openElement 47 | 48 | he.Parent = parent 49 | 50 | he.errors = errors 51 | he.warnings = warnings 52 | 53 | he.HasOnlyKnownAttributes = true 54 | he.HasDeprecatedAttributes = false 55 | 56 | // openElement contains any type of open tag/single tag 57 | // Examples: 58 | //
    59 | //
    60 | //
    61 | //
    62 | // 63 | 64 | he.Attributes = make([]attributeInfo, 0) 65 | 66 | runes := []rune(openElement) 67 | l := len(runes) 68 | 69 | pos := 1 // skip the < 70 | for ; pos < l; pos++ { 71 | c := runes[pos] 72 | if !unicode.IsSpace(c) { 73 | break 74 | } 75 | } 76 | 77 | if pos == l { 78 | // Error: Empty tag with whitespaces only: "< >"; 79 | he.addError("Invalid tag (whitespaces only).") 80 | he.SyntaxError = true 81 | return he 82 | } 83 | 84 | for ; pos < l; pos++ { 85 | c := runes[pos] 86 | if c == '>' { 87 | if pos == 1 { 88 | // Error: Empty tag like "<>" 89 | he.addError("Empty tag <>") 90 | he.SyntaxError = true 91 | return he 92 | } 93 | // This is it 94 | he.TagName = strings.ToLower(strings.TrimSpace(string(runes[1:pos]))) 95 | he.checkTag() 96 | return he 97 | } 98 | 99 | if unicode.IsSpace(c) { 100 | he.TagName = strings.ToLower(strings.TrimSpace(string(runes[1:pos]))) 101 | he.checkTag() 102 | break 103 | } 104 | } 105 | 106 | pos++ // skip the whitespace 107 | 108 | end := runesLastIndex(runes, '>') 109 | if end == -1 { 110 | he.addError("Missing closing >") 111 | he.SyntaxError = true 112 | he.FatalSyntaxError = true 113 | return he 114 | } 115 | end-- 116 | for end >= pos { 117 | if runes[end] == '/' { 118 | he.XmlEmptyTag = true 119 | end-- 120 | break 121 | } 122 | if !unicode.IsSpace(runes[end]) { 123 | break 124 | } 125 | end-- 126 | } 127 | 128 | if end > pos { 129 | he.parseAttributes(squeezeSpaces(string(runes[pos : end+1]))) 130 | } 131 | 132 | return he 133 | } 134 | 135 | func (he *HtmlElement) GetOpenTag(noEvents, noUnknownAttributes bool) string { 136 | return internalBuildOpenTag(he.ElementInfo, he.TagNameNS, he.Attributes, noEvents, noUnknownAttributes, he.XmlEmptyTag) 137 | } 138 | 139 | func (he *HtmlElement) GetCloseTag() string { 140 | return "" 141 | } 142 | 143 | func (he *HtmlElement) GetAttributeValue(attrName string) (string, bool) { 144 | 145 | i := he.FindAttributeIndex(attrName) 146 | if i >= 0 { 147 | return 
he.Attributes[i].Value, true 148 | } 149 | return "", false 150 | 151 | } 152 | 153 | func (he *HtmlElement) SetAttribute(attrName, attrValue string) bool { 154 | if attrName == "" { 155 | return true 156 | } 157 | 158 | if strings.IndexAny(attrValue, "\r\n\t") >= 0 { 159 | //throw new ArgumentException("attrValue cannot contain control characters") 160 | return false 161 | } 162 | 163 | i := he.FindAttributeIndex(attrName) 164 | if i >= 0 { 165 | he.Attributes[i].Value = attrValue 166 | } else { 167 | he.AddAttribute(attrName, attrValue) 168 | } 169 | return true 170 | } 171 | 172 | func (he *HtmlElement) RemoveAttribute(attrName string) { 173 | i := he.FindAttributeIndex(attrName) 174 | if i >= 0 { 175 | he.Attributes = append(he.Attributes[:i], he.Attributes[i+1:]...) 176 | } 177 | } 178 | 179 | func (he *HtmlElement) HasAttribute(attrName string) bool { 180 | return he.FindAttributeIndex(attrName) >= 0 181 | } 182 | 183 | func (he *HtmlElement) FindAttributeIndex(attrName string) int { 184 | if len(he.Attributes) == 0 || attrName == "" { 185 | return -1 186 | } 187 | 188 | attrName = strings.ToLower(attrName) 189 | 190 | for i, a := range he.Attributes { 191 | if a.Name == attrName { 192 | return i 193 | } 194 | } 195 | return -1 196 | } 197 | 198 | func (he *HtmlElement) checkTag() { 199 | if strings.HasSuffix(he.TagName, "/") { 200 | he.TagName = he.TagName[0 : len(he.TagName)-1] 201 | } 202 | he.TagNameNS = he.TagName 203 | 204 | he.ElementInfo = GetElementInfo(he.TagNameNS) 205 | 206 | pos := strings.Index(he.TagName, ":") 207 | if pos != -1 { 208 | he.Namespace = he.TagName[:pos] 209 | he.TagName = he.TagName[pos+1:] 210 | } 211 | if he.ElementInfo == nil { 212 | if he.Namespace == "" { 213 | he.addWarning("Unknown tag: " + he.TagName) 214 | } 215 | } else { 216 | if he.Parent != nil { 217 | if !he.ElementInfo.IsValidParent(he.Parent.TagName) { 218 | he.addWarning("Invalid parent for " + he.TagName + " (parent: " + he.Parent.TagName + ")") 219 | } 220 | } 
221 | } 222 | } 223 | 224 | func (he *HtmlElement) addWarning(warning string) { 225 | *he.warnings = append(*he.warnings, warning) 226 | } 227 | 228 | func (he *HtmlElement) addError(error string) { 229 | *he.errors = append(*he.errors, error) 230 | } 231 | 232 | func (he *HtmlElement) AddAttribute(attrName, attrVal string) { 233 | if attrName == "" { 234 | return 235 | } 236 | 237 | if attrName == "style" { 238 | attrVal = cleanStyleAttr(attrVal) 239 | } else if attrName == "id" { 240 | he.Id = attrVal 241 | } 242 | 243 | if he.ElementInfo != nil { 244 | //bool useUrl; 245 | ast := he.ElementInfo.GetAttributeStatus(attrName) 246 | if ast == ASUnknown { 247 | if strings.Index(attrName, ":") > 0 { 248 | } else { 249 | he.HasOnlyKnownAttributes = false 250 | he.addWarning("Unknown attribute: " + attrName + " (tag: " + he.TagNameNS + ")") 251 | } 252 | } else if ast == ASDeprecated { 253 | he.HasDeprecatedAttributes = true 254 | he.addWarning("Deprecated attribute: " + attrName + " (tag: " + he.TagNameNS + ")") 255 | } 256 | } 257 | 258 | if len(attrVal) > 0 { 259 | attrVal = html.UnescapeString(attrVal) 260 | } 261 | 262 | he.Attributes = append(he.Attributes, attributeInfo{attrName, attrVal}) 263 | } 264 | 265 | func squeezeSpaces(s string) string { 266 | n := bytes.NewBufferString("") 267 | atSpace := false 268 | atEqual := false 269 | inQuote := false 270 | quote := rune('-') 271 | 272 | for _, c := range s { 273 | 274 | if inQuote { 275 | if c == quote { 276 | inQuote = false 277 | } 278 | n.WriteRune(c) 279 | continue 280 | } 281 | if unicode.IsSpace(c) { 282 | atSpace = true 283 | continue 284 | } 285 | if c == '=' { 286 | atEqual = true 287 | continue 288 | } 289 | // At this point, we know the char is not white or '='. 
290 | if atEqual { 291 | n.WriteRune('=') 292 | atEqual = false 293 | atSpace = false 294 | } 295 | if atSpace { 296 | n.WriteRune(' ') 297 | atSpace = false 298 | } 299 | if c == '"' || c == '\'' { 300 | inQuote = true 301 | quote = c 302 | } 303 | n.WriteRune(c) 304 | } 305 | 306 | if atEqual { 307 | n.WriteRune('=') 308 | } 309 | return n.String() 310 | } 311 | 312 | func (he *HtmlElement) parseAttributes(openElement string) { 313 | runes := []rune(openElement) 314 | l := len(runes) 315 | var attrName, attrVal string 316 | p := 0 317 | var c rune 318 | var found bool 319 | // Parse all the attributes now 320 | for ; p < l; p++ { 321 | // skip all the whitespaces 322 | for unicode.IsSpace(runes[p]) { 323 | p++ 324 | if p == l { 325 | return 326 | } 327 | } 328 | 329 | // now, search for the attribute name by either finding a whitespace or the "=" sign 330 | found = false 331 | startAttrName := p 332 | for { 333 | c = runes[p] 334 | if unicode.IsSpace(c) || c == '>' { 335 | // This is an empty attribute like "checked" in "" 336 | attrName = strings.ToLower(strings.TrimSpace(string(runes[startAttrName:p]))) 337 | he.AddAttribute(attrName, "") 338 | if c == '>' { 339 | return 340 | } 341 | found = true 342 | break 343 | } 344 | if c == '=' { 345 | break 346 | } 347 | p++ 348 | if p >= l { 349 | 350 | attrName = strings.ToLower(strings.TrimSpace(string(runes[startAttrName:p]))) 351 | he.AddAttribute(attrName, "") 352 | return 353 | } 354 | } 355 | if found { 356 | continue 357 | } 358 | 359 | if startAttrName == p { 360 | he.addError("Attribute name starts with the '=' sign.") 361 | he.SyntaxError = true 362 | // Invalid attribute, starts with an '=' sign 363 | // Skip it to the next whitespace 364 | p++ 365 | c = runes[p] 366 | if c == '\'' { 367 | p = p + 1 + runesIndex(runes[p+1:], '\'') 368 | } else if c == '"' { 369 | 370 | p = p + 1 + runesIndex(runes[p+1:], '"') 371 | } 372 | continue 373 | } 374 | 375 | attrName = 
strings.ToLower(strings.TrimSpace(string(runes[startAttrName:p]))) 376 | p++ // skipt the equal sign 377 | if p == l { 378 | he.addError("Attribute ends with equal sign.") 379 | he.SyntaxError = true 380 | he.FatalSyntaxError = true 381 | return 382 | } 383 | 384 | startAttrVal := p 385 | c = runes[p] 386 | 387 | if unicode.IsSpace(c) || c == '>' { 388 | // This is a malformed attribute since it has a whitespace after the '=' sign, 389 | // like or 390 | he.addError("Attribute is missing value: " + attrName) 391 | he.SyntaxError = true 392 | he.AddAttribute(attrName, "") 393 | continue 394 | } 395 | 396 | if c == '\'' || c == '"' { 397 | startAttrVal++ 398 | np := runesIndex(runes[p+1:], c) 399 | if np == -1 { 400 | // Argh, this attribute is missing the end quote, stop parsing 401 | he.addError("Attribute is missing end quote: " + attrName) 402 | he.SyntaxError = true 403 | he.FatalSyntaxError = true 404 | return 405 | } 406 | p = np + p + 1 407 | 408 | if p == startAttrVal { 409 | attrVal = "" 410 | } else { 411 | attrVal = string(runes[startAttrVal:p]) 412 | } 413 | he.AddAttribute(attrName, attrVal) 414 | continue 415 | } 416 | 417 | // This is an attribute without a quote. 
Find the first whitespace or > 418 | for ; p < l; p++ { 419 | c = runes[p] 420 | if unicode.IsSpace(c) || c == '>' || p == l-1 { 421 | 422 | attrVal = string(runes[startAttrVal : p+1]) 423 | he.AddAttribute(attrName, attrVal) 424 | break 425 | } 426 | } 427 | } 428 | 429 | } 430 | 431 | func parseClosingTag(elem string) string { 432 | if !strings.HasPrefix(elem, "' || unicode.IsSpace(c) { 439 | 440 | return strings.ToLower(strings.TrimSpace(elem[2:p])) 441 | } 442 | } 443 | return strings.ToLower(strings.TrimSpace(elem)) 444 | } 445 | 446 | func BuildOpenTagHEI(ei *HtmlElementInfo, attributes []attributeInfo, noEvents, noUnknownAttributes bool) string { 447 | return internalBuildOpenTag(ei, ei.TagName, attributes, noEvents, noUnknownAttributes, false) 448 | } 449 | 450 | func BuildOpenTag(tagName string, attributes []attributeInfo, noEvents, noUnknownAttributes bool) string { 451 | var ei *HtmlElementInfo 452 | if noUnknownAttributes { 453 | ei = GetElementInfo(tagName) 454 | } 455 | return internalBuildOpenTag(ei, tagName, attributes, noEvents, noUnknownAttributes, false) 456 | } 457 | 458 | func HtmlAttributeEncode(attributeValue string) string { 459 | 460 | if attributeValue == "" { 461 | return "" 462 | } 463 | 464 | if strings.IndexAny(attributeValue, `&"`) == -1 { 465 | return attributeValue 466 | } 467 | 468 | n := bytes.NewBufferString("") 469 | for _, c := range attributeValue { 470 | switch c { 471 | case '&': 472 | n.WriteString("&") 473 | case '"': 474 | n.WriteString(""") 475 | default: 476 | n.WriteRune(c) 477 | 478 | } 479 | } 480 | 481 | return n.String() 482 | } 483 | 484 | func NeedQuotesForAttr(val string) QuoteType { 485 | if val == "" { 486 | return QTDouble 487 | } 488 | 489 | qt := QTNone 490 | runes := []rune(val) 491 | for c := range runes { 492 | switch { 493 | case c >= 'a' && c <= 'z': 494 | continue 495 | case c >= 'A' && c <= 'Z': 496 | continue 497 | case c >= '0' && c <= '9': 498 | continue 499 | case c == '_' || c == '-' || c == '.' 
|| c == ',': // According to http://www.w3.org/TR/html401/intro/sgmltut.html#h-3.2.2 500 | continue 501 | } 502 | qt = QTDouble 503 | if c == '"' { 504 | qt = QTSingle 505 | } 506 | } 507 | return qt 508 | } 509 | 510 | func cleanStyleAttr(style string) string { 511 | if style == "" { 512 | return style 513 | } 514 | 515 | parts := convertSemicolonDelimited(style) 516 | 517 | n := bytes.NewBufferString("") 518 | 519 | for _, part := range parts { 520 | p2 := strings.TrimSpace(part) 521 | if len(p2) == 0 { 522 | continue 523 | } 524 | pos := strings.IndexRune(p2, ':') 525 | if pos == -1 { 526 | continue 527 | } 528 | styleName := strings.ToLower(p2[:pos]) 529 | styleValue := strings.TrimSpace(p2[pos+1:]) 530 | 531 | if len(styleValue) == 0 { 532 | continue 533 | } 534 | 535 | if n.Len() > 0 { 536 | n.WriteRune(';') 537 | } 538 | n.WriteString(styleName) 539 | n.WriteRune(':') 540 | n.WriteString(styleValue) 541 | } 542 | 543 | return n.String() 544 | 545 | } 546 | 547 | func internalBuildOpenTag(ei *HtmlElementInfo, tagName string, attributes []attributeInfo, noEvents, noUnknownAttributes, xmlEmptyTag bool) string { 548 | if !noUnknownAttributes { 549 | ei = nil 550 | } 551 | 552 | n := bytes.NewBufferString("") 553 | 554 | n.WriteRune('<') 555 | n.WriteString(tagName) 556 | 557 | for _, a := range attributes { 558 | 559 | if a.Name == "" || noEvents && strings.HasPrefix(a.Name, "on") { 560 | continue 561 | } 562 | 563 | if ei != nil && ei.GetAttributeStatus(a.Name) == ASUnknown { 564 | continue 565 | } 566 | 567 | n.WriteRune(' ') 568 | n.WriteString(a.Name) 569 | if a.Value == "" { 570 | 571 | continue // Empty attribute (valid on HTML5 and above) 572 | } 573 | 574 | n.WriteRune('=') 575 | 576 | if len(a.Value) > 0 { 577 | encoded := html.EscapeString(a.Value) 578 | n.WriteRune('"') 579 | n.WriteString(encoded) 580 | n.WriteRune('"') 581 | } 582 | } 583 | if xmlEmptyTag { 584 | n.WriteString(" />") 585 | } else { 586 | n.WriteRune('>') 587 | } 588 | return 
n.String() 589 | 590 | } 591 | -------------------------------------------------------------------------------- /htmlelementinfo.go: -------------------------------------------------------------------------------- 1 | package htmlparser 2 | 3 | import ( 4 | "strings" 5 | ) 6 | 7 | var globalAttributes []string 8 | var allElements []HtmlElementInfo 9 | var elemsInfo map[string]HtmlElementInfo 10 | 11 | func init() { 12 | baseAttributes := "accesskey;class;contenteditable;contextmenu;dir;draggable;dropzone;hidden;id;lang;spellcheck;style;tabindex;title;translate;;onabort;onblur;oncanplay;oncanplaythrough;onchange;onclick;oncontextmenu;ondblclick;ondrag;ondragend;ondragenter;ondragleave;ondragover;ondragstart;ondrop;ondurationchange;onemptied;onended;onerror;onfocus;oninput;oninvalid;onkeydown;onkeypress;onkeyup;onload;onloaddata;onloadeddata;onloadedmetadata;onloadstart;onmousedown;onmousemove;onmouseout;onmouseover;onmouseup;onmousewheel;onpause;onplay;onplaying;onprogress;onratechange;onreadystatechange;onreset;onscroll;onseekend;onseeking;onselect;onshow;onstalled;onsubmit;onsuspended;ontimeupdate;onvolumechange;onwaiting;xml:base;xml:lang;xml:space" 13 | globalAttributes = convertSemicolonDelimited(baseAttributes) 14 | 15 | initElements() 16 | 17 | elemsInfo = make(map[string]HtmlElementInfo, len(allElements)) 18 | for _, hei := range allElements { 19 | hei.setAttributes(hei.attributesString) 20 | elemsInfo[hei.TagName] = hei 21 | } 22 | } 23 | 24 | type HtmlElementInfo struct { 25 | TagName string 26 | HtmlVersion int // HTML version that introduced this tag 27 | Obsolete bool // Indicates if this element is obsolete 28 | TagFormatting HtmlTagFormatting 29 | ElementType HtmlElementType 30 | PermittedChildrenTypes HtmlElementType // Valid types of elements that can be nested inside this tag 31 | PermittedChildrenTags []string // Valid children for this tag 32 | Attributes []string 33 | attributesString []string // This is temporary to be merged with 
globalAttributes 34 | ObsoleteAttributes []string 35 | ParentContentTypes HtmlElementType 36 | ParentTags []string 37 | ExcludeParentTags []string 38 | } 39 | 40 | func (hei *HtmlElementInfo) GetAttributeStatus(attrName string) AttrStatus { 41 | if attrName == "" { 42 | return ASUnknown 43 | } 44 | 45 | attrNameLower := strings.ToLower(attrName) 46 | 47 | if sorted_contains(hei.ObsoleteAttributes, attrNameLower) { 48 | return ASDeprecated 49 | } 50 | 51 | if sorted_contains(hei.Attributes, attrNameLower) { 52 | return ASValid 53 | } 54 | 55 | return ASUnknown 56 | } 57 | 58 | func (hei *HtmlElementInfo) IsValidParent(parentTagName string) bool { 59 | if parentTagName == "" { 60 | return true // no parent is always valid here 61 | } 62 | 63 | parentTagNameLower := strings.ToLower(parentTagName) 64 | 65 | // Check if the parent is in the not-allowed list 66 | if sorted_contains(hei.ExcludeParentTags, parentTagNameLower) { 67 | return false 68 | } 69 | 70 | // Check if the parent is in the white list 71 | if sorted_contains(hei.ParentTags, parentTagNameLower) { 72 | return true 73 | } 74 | 75 | // Finally, check if the content type is allowed 76 | if hei.ParentContentTypes == HETNone { 77 | return false 78 | } 79 | 80 | parentInfo := GetElementInfo(parentTagNameLower) 81 | if parentInfo == nil { 82 | if strings.Contains(parentTagName, ":") { 83 | return true // assume it's a custom defined element 84 | } 85 | 86 | return false 87 | } 88 | 89 | if (hei.ParentContentTypes & parentInfo.PermittedChildrenTypes) != 0 { 90 | return true 91 | } 92 | 93 | return false 94 | } 95 | 96 | func (hei *HtmlElementInfo) setPermittedChildrenTags(tags string) { 97 | hei.PermittedChildrenTags = convertSemicolonDelimited(tags) 98 | } 99 | 100 | func (hei *HtmlElementInfo) setObsoleteAttributes(attrs string) { 101 | hei.ObsoleteAttributes = convertSemicolonDelimited(attrs) 102 | } 103 | 104 | func (hei *HtmlElementInfo) setParentTags(tags string) { 105 | hei.ParentTags = 
convertSemicolonDelimited(tags) 106 | } 107 | 108 | func (hei *HtmlElementInfo) setExcludeParentTags(tags string) { 109 | hei.ParentTags = convertSemicolonDelimited(tags) 110 | } 111 | 112 | func (hei *HtmlElementInfo) setAttributes(attrs []string) { 113 | if len(attrs) == 0 { 114 | hei.Attributes = globalAttributes 115 | } else { 116 | hei.Attributes = union(attrs, globalAttributes) 117 | 118 | } 119 | } 120 | 121 | // GetElementInfo returns the HtmlElementInfo for this tag 122 | func GetElementInfo(tagName string) *HtmlElementInfo { 123 | if tagName == "" { 124 | return nil 125 | } 126 | 127 | elem, exist := elemsInfo[tagName] 128 | if exist { 129 | return &elem 130 | } 131 | return nil 132 | } 133 | -------------------------------------------------------------------------------- /htmlelementinfo_init.go: -------------------------------------------------------------------------------- 1 | package htmlparser 2 | 3 | func initElements() { 4 | allElements = []HtmlElementInfo{ 5 | HtmlElementInfo{ 6 | TagName: "a", 7 | HtmlVersion: 3, 8 | Obsolete: false, 9 | ElementType: HETFlow, 10 | PermittedChildrenTypes: HETAnyContent, 11 | PermittedChildrenTags: []string{}, 12 | attributesString: []string{"href", "target", "rel", "hreflang", "media", "type"}, 13 | TagFormatting: HTFComplete, 14 | ParentTags: []string{}, 15 | ExcludeParentTags: []string{"a", "button"}, 16 | ParentContentTypes: HETFlow | HETPhrasing, 17 | ObsoleteAttributes: []string{"coords", "shape", "urn", "charset", "methods", "rev", "name"}, 18 | }, 19 | HtmlElementInfo{ 20 | TagName: "abbr", 21 | HtmlVersion: 3, 22 | Obsolete: false, 23 | ElementType: HETPhrasing, 24 | PermittedChildrenTypes: HETPhrasing | HETText, 25 | PermittedChildrenTags: []string{}, 26 | attributesString: []string{}, 27 | TagFormatting: HTFComplete, 28 | ParentTags: []string{}, 29 | ExcludeParentTags: []string{}, 30 | ParentContentTypes: HETPhrasing, 31 | ObsoleteAttributes: []string{}, 32 | }, 33 | HtmlElementInfo{ 34 | 35 | TagName: 
"acronym", 36 | HtmlVersion: 4, 37 | Obsolete: true, 38 | ElementType: HETPhrasing, 39 | PermittedChildrenTypes: HETAnyContent, 40 | PermittedChildrenTags: []string{}, 41 | attributesString: []string{}, 42 | TagFormatting: HTFComplete, 43 | ParentTags: []string{}, 44 | ExcludeParentTags: []string{}, 45 | ParentContentTypes: HETPhrasing, 46 | ObsoleteAttributes: []string{}, 47 | }, 48 | HtmlElementInfo{ 49 | 50 | TagName: "address", 51 | HtmlVersion: 3, 52 | Obsolete: false, 53 | ElementType: HETFlow, 54 | PermittedChildrenTypes: HETAnyContent, 55 | PermittedChildrenTags: []string{}, 56 | attributesString: []string{}, 57 | TagFormatting: HTFComplete, 58 | ParentTags: []string{}, 59 | ExcludeParentTags: []string{"address"}, 60 | ParentContentTypes: HETFlow, 61 | ObsoleteAttributes: []string{}, 62 | }, 63 | HtmlElementInfo{ 64 | 65 | TagName: "applet", 66 | HtmlVersion: 3, 67 | Obsolete: true, 68 | ElementType: HETPhrasing, 69 | PermittedChildrenTypes: HETAnyContent, 70 | PermittedChildrenTags: []string{}, 71 | attributesString: []string{}, 72 | TagFormatting: HTFComplete, 73 | ParentTags: []string{}, 74 | ExcludeParentTags: []string{}, 75 | ParentContentTypes: HETPhrasing, 76 | ObsoleteAttributes: []string{}, 77 | }, 78 | HtmlElementInfo{ 79 | 80 | TagName: "area", 81 | HtmlVersion: 3, 82 | Obsolete: false, 83 | ElementType: HETPhrasing, 84 | PermittedChildrenTypes: HETNone, 85 | PermittedChildrenTags: []string{}, 86 | attributesString: []string{"alt", "href", "target", "rel", "media", "hreflang", "type", "shape", "coords"}, 87 | TagFormatting: HTFSingle, 88 | ParentTags: []string{"map"}, 89 | ExcludeParentTags: []string{}, 90 | ParentContentTypes: HETNone, 91 | ObsoleteAttributes: []string{"nohref"}, 92 | }, 93 | HtmlElementInfo{ 94 | 95 | TagName: "article", 96 | HtmlVersion: 5, 97 | Obsolete: false, 98 | ElementType: HETFlow, 99 | PermittedChildrenTypes: HETAnyContent, 100 | PermittedChildrenTags: []string{}, 101 | attributesString: []string{}, 102 | 
TagFormatting: HTFComplete, 103 | ParentTags: []string{}, 104 | ExcludeParentTags: []string{}, 105 | ParentContentTypes: HETFlow, 106 | ObsoleteAttributes: []string{}, 107 | }, 108 | HtmlElementInfo{ 109 | 110 | TagName: "aside", 111 | HtmlVersion: 5, 112 | Obsolete: false, 113 | ElementType: HETFlow, 114 | PermittedChildrenTypes: HETAnyContent, 115 | PermittedChildrenTags: []string{}, 116 | attributesString: []string{}, 117 | TagFormatting: HTFComplete, 118 | ParentTags: []string{}, 119 | ExcludeParentTags: []string{"address"}, 120 | ParentContentTypes: HETFlow, 121 | ObsoleteAttributes: []string{}, 122 | }, 123 | HtmlElementInfo{ 124 | 125 | TagName: "audio", 126 | HtmlVersion: 5, 127 | Obsolete: false, 128 | ElementType: HETPhrasing, 129 | PermittedChildrenTypes: HETFlow | HETPhrasing, 130 | PermittedChildrenTags: []string{}, 131 | attributesString: []string{"autoplay", "preload", "controls", "loop", "mediagroup", "muted", "src"}, 132 | TagFormatting: HTFComplete, 133 | ParentTags: []string{}, 134 | ExcludeParentTags: []string{"a", "button"}, 135 | ParentContentTypes: HETFlow | HETPhrasing, 136 | ObsoleteAttributes: []string{}, 137 | }, 138 | HtmlElementInfo{ 139 | 140 | TagName: "b", 141 | HtmlVersion: 3, 142 | Obsolete: false, 143 | ElementType: HETPhrasing, 144 | PermittedChildrenTypes: HETPhrasing | HETText, 145 | PermittedChildrenTags: []string{}, 146 | attributesString: []string{}, 147 | TagFormatting: HTFComplete, 148 | ParentTags: []string{}, 149 | ExcludeParentTags: []string{}, 150 | ParentContentTypes: HETPhrasing, 151 | ObsoleteAttributes: []string{}, 152 | }, 153 | HtmlElementInfo{ 154 | 155 | TagName: "base", 156 | HtmlVersion: 3, 157 | Obsolete: false, 158 | ElementType: HETPhrasing, 159 | PermittedChildrenTypes: HETNone, 160 | PermittedChildrenTags: []string{}, 161 | attributesString: []string{"href", "target"}, 162 | TagFormatting: HTFSingle, 163 | ParentTags: []string{"head"}, 164 | ExcludeParentTags: []string{}, 165 | ParentContentTypes: 
HETNone, 166 | ObsoleteAttributes: []string{}, 167 | }, 168 | HtmlElementInfo{ 169 | 170 | TagName: "basefont", 171 | HtmlVersion: 3, 172 | Obsolete: true, 173 | ElementType: HETPhrasing, 174 | PermittedChildrenTypes: HETNone, 175 | PermittedChildrenTags: []string{}, 176 | attributesString: []string{}, 177 | TagFormatting: HTFSingle, 178 | ParentTags: []string{}, 179 | ExcludeParentTags: []string{}, 180 | ParentContentTypes: HETPhrasing, 181 | ObsoleteAttributes: []string{}, 182 | }, 183 | HtmlElementInfo{ 184 | 185 | TagName: "bdi", 186 | HtmlVersion: 5, 187 | Obsolete: false, 188 | ElementType: HETPhrasing, 189 | PermittedChildrenTypes: HETPhrasing | HETText, 190 | PermittedChildrenTags: []string{}, 191 | attributesString: []string{}, 192 | TagFormatting: HTFComplete, 193 | ParentTags: []string{}, 194 | ExcludeParentTags: []string{}, 195 | ParentContentTypes: HETPhrasing, 196 | ObsoleteAttributes: []string{}, 197 | }, 198 | HtmlElementInfo{ 199 | 200 | TagName: "bdo", 201 | HtmlVersion: 3, 202 | Obsolete: false, 203 | ElementType: HETPhrasing, 204 | PermittedChildrenTypes: HETPhrasing | HETText, 205 | PermittedChildrenTags: []string{}, 206 | attributesString: []string{}, 207 | TagFormatting: HTFComplete, 208 | ParentTags: []string{}, 209 | ExcludeParentTags: []string{}, 210 | ParentContentTypes: HETPhrasing, 211 | ObsoleteAttributes: []string{}, 212 | }, 213 | HtmlElementInfo{ 214 | 215 | TagName: "big", 216 | HtmlVersion: 3, 217 | Obsolete: true, 218 | ElementType: HETPhrasing, 219 | PermittedChildrenTypes: HETPhrasing | HETText, 220 | PermittedChildrenTags: []string{}, 221 | attributesString: []string{}, 222 | TagFormatting: HTFComplete, 223 | ParentTags: []string{}, 224 | ExcludeParentTags: []string{}, 225 | ParentContentTypes: HETPhrasing, 226 | ObsoleteAttributes: []string{}, 227 | }, 228 | HtmlElementInfo{ 229 | 230 | TagName: "blockquote", 231 | HtmlVersion: 3, 232 | Obsolete: false, 233 | ElementType: HETFlow, 234 | PermittedChildrenTypes: HETAnyContent, 
235 | PermittedChildrenTags: []string{}, 236 | attributesString: []string{"cite"}, 237 | TagFormatting: HTFComplete, 238 | ParentTags: []string{}, 239 | ExcludeParentTags: []string{}, 240 | ParentContentTypes: HETFlow, 241 | ObsoleteAttributes: []string{}, 242 | }, 243 | HtmlElementInfo{ 244 | 245 | TagName: "body", 246 | HtmlVersion: 3, 247 | Obsolete: false, 248 | ElementType: HETFlow, 249 | PermittedChildrenTypes: HETAnyContent, 250 | PermittedChildrenTags: []string{"script", "style"}, 251 | attributesString: []string{"onafterprint", "onbeforeprint", "onbeforeunload", "onblur", "onerror", "onfocus", "onhaschange", "onload", "onmessage", "onoffline", "ononline", "onpagehide", "onpageshow", "onpopstate", "onresize", "onstoragte", "onunload"}, 252 | TagFormatting: HTFOptionalClosing, 253 | ParentTags: []string{"html"}, 254 | ExcludeParentTags: []string{}, 255 | ParentContentTypes: HETNone, 256 | ObsoleteAttributes: []string{"alink", "background", "bgcolor", "link", "marginbottom", "marginheight", "marginleft", "marginright", "margintop", "marginwidth", "text", "vlink"}, 257 | }, 258 | HtmlElementInfo{ 259 | 260 | TagName: "br", 261 | HtmlVersion: 3, 262 | Obsolete: false, 263 | ElementType: HETPhrasing, 264 | PermittedChildrenTypes: HETNone, 265 | PermittedChildrenTags: []string{}, 266 | attributesString: []string{}, 267 | TagFormatting: HTFSingle, 268 | ParentTags: []string{}, 269 | ExcludeParentTags: []string{}, 270 | ParentContentTypes: HETFlow | HETPhrasing, 271 | ObsoleteAttributes: []string{"clear"}, 272 | }, 273 | HtmlElementInfo{ 274 | 275 | TagName: "button", 276 | HtmlVersion: 4, 277 | Obsolete: false, 278 | ElementType: HETPhrasing, 279 | PermittedChildrenTypes: HETPhrasing | HETText, 280 | PermittedChildrenTags: []string{}, 281 | attributesString: []string{"name", "disabled", "form", "type", "value", "formaction", "autofocus", "formenctype", "formmethod", "formtarget", "formnovalidate"}, 282 | TagFormatting: HTFComplete, 283 | ParentTags: []string{}, 
284 | ExcludeParentTags: []string{"a", "button"}, 285 | ParentContentTypes: HETPhrasing, 286 | ObsoleteAttributes: []string{}, 287 | }, 288 | HtmlElementInfo{ 289 | 290 | TagName: "canvas", 291 | HtmlVersion: 5, 292 | Obsolete: false, 293 | ElementType: HETFlow, 294 | PermittedChildrenTypes: HETAnyContent, 295 | PermittedChildrenTags: []string{}, 296 | attributesString: []string{"height", "width"}, 297 | TagFormatting: HTFComplete, 298 | ParentTags: []string{}, 299 | ExcludeParentTags: []string{}, 300 | ParentContentTypes: HETFlow | HETPhrasing, 301 | ObsoleteAttributes: []string{}, 302 | }, 303 | HtmlElementInfo{ 304 | 305 | TagName: "caption", 306 | HtmlVersion: 3, 307 | Obsolete: false, 308 | ElementType: HETPhrasing, 309 | PermittedChildrenTypes: HETAnyContent, 310 | PermittedChildrenTags: []string{}, 311 | attributesString: []string{}, 312 | TagFormatting: HTFComplete, 313 | ParentTags: []string{"table"}, 314 | ExcludeParentTags: []string{}, 315 | ParentContentTypes: HETNone, 316 | ObsoleteAttributes: []string{"align"}, 317 | }, 318 | HtmlElementInfo{ 319 | 320 | TagName: "center", 321 | HtmlVersion: 3, 322 | Obsolete: true, 323 | ElementType: HETFlow, 324 | PermittedChildrenTypes: HETAnyContent, 325 | PermittedChildrenTags: []string{}, 326 | attributesString: []string{}, 327 | TagFormatting: HTFComplete, 328 | ParentTags: []string{}, 329 | ExcludeParentTags: []string{}, 330 | ParentContentTypes: HETFlow, 331 | ObsoleteAttributes: []string{}, 332 | }, 333 | HtmlElementInfo{ 334 | 335 | TagName: "cite", 336 | HtmlVersion: 3, 337 | Obsolete: false, 338 | ElementType: HETPhrasing, 339 | PermittedChildrenTypes: HETPhrasing | HETText, 340 | PermittedChildrenTags: []string{}, 341 | attributesString: []string{}, 342 | TagFormatting: HTFComplete, 343 | ParentTags: []string{}, 344 | ExcludeParentTags: []string{}, 345 | ParentContentTypes: HETPhrasing, 346 | ObsoleteAttributes: []string{}, 347 | }, 348 | HtmlElementInfo{ 349 | 350 | TagName: "code", 351 | HtmlVersion: 
3, 352 | Obsolete: false, 353 | ElementType: HETPhrasing, 354 | PermittedChildrenTypes: HETPhrasing | HETText, 355 | PermittedChildrenTags: []string{}, 356 | attributesString: []string{}, 357 | TagFormatting: HTFComplete, 358 | ParentTags: []string{}, 359 | ExcludeParentTags: []string{}, 360 | ParentContentTypes: HETPhrasing, 361 | ObsoleteAttributes: []string{}, 362 | }, 363 | HtmlElementInfo{ 364 | 365 | TagName: "col", 366 | HtmlVersion: 3, 367 | Obsolete: false, 368 | ElementType: HETPhrasing, 369 | PermittedChildrenTypes: HETNone, 370 | PermittedChildrenTags: []string{}, 371 | attributesString: []string{"span"}, 372 | TagFormatting: HTFSingle, 373 | ParentTags: []string{"colgroup"}, 374 | ExcludeParentTags: []string{}, 375 | ParentContentTypes: HETNone, 376 | ObsoleteAttributes: []string{"align", "width", "char", "charoff", "valign"}, 377 | }, 378 | HtmlElementInfo{ 379 | 380 | TagName: "colgroup", 381 | HtmlVersion: 4, 382 | Obsolete: false, 383 | ElementType: HETPhrasing, 384 | PermittedChildrenTypes: HETPhrasing | HETText, 385 | PermittedChildrenTags: []string{}, 386 | attributesString: []string{"span"}, 387 | TagFormatting: HTFOptionalClosing, 388 | ParentTags: []string{"table"}, 389 | ExcludeParentTags: []string{}, 390 | ParentContentTypes: HETNone, 391 | ObsoleteAttributes: []string{"width", "char", "charoff", "valign"}, 392 | }, 393 | HtmlElementInfo{ 394 | 395 | TagName: "command", 396 | HtmlVersion: 5, 397 | Obsolete: false, 398 | ElementType: HETMeta, 399 | PermittedChildrenTypes: HETNone, 400 | PermittedChildrenTags: []string{}, 401 | attributesString: []string{"type", "label", "icon", "disabled", "radiogroup", "checked"}, 402 | TagFormatting: HTFSingle, 403 | ParentTags: []string{}, 404 | ExcludeParentTags: []string{}, 405 | ParentContentTypes: HETPhrasing | HETMeta, 406 | ObsoleteAttributes: []string{}, 407 | }, 408 | HtmlElementInfo{ 409 | 410 | TagName: "datalist", 411 | HtmlVersion: 5, 412 | Obsolete: false, 413 | ElementType: HETPhrasing, 414 
| PermittedChildrenTypes: HETPhrasing | HETText, 415 | PermittedChildrenTags: []string{}, 416 | attributesString: []string{}, 417 | TagFormatting: HTFComplete, 418 | ParentTags: []string{}, 419 | ExcludeParentTags: []string{}, 420 | ParentContentTypes: HETPhrasing, 421 | ObsoleteAttributes: []string{}, 422 | }, 423 | HtmlElementInfo{ 424 | 425 | TagName: "dd", 426 | HtmlVersion: 3, 427 | Obsolete: false, 428 | ElementType: HETFlow, 429 | PermittedChildrenTypes: HETAnyContent, 430 | PermittedChildrenTags: []string{}, 431 | attributesString: []string{}, 432 | TagFormatting: HTFOptionalClosing, 433 | ParentTags: []string{"dl"}, 434 | ExcludeParentTags: []string{}, 435 | ParentContentTypes: HETNone, 436 | ObsoleteAttributes: []string{}, 437 | }, 438 | HtmlElementInfo{ 439 | 440 | TagName: "del", 441 | HtmlVersion: 4, 442 | Obsolete: false, 443 | ElementType: HETPhrasing, 444 | PermittedChildrenTypes: HETAnyContent, 445 | PermittedChildrenTags: []string{}, 446 | attributesString: []string{"cite", "datetime"}, 447 | TagFormatting: HTFComplete, 448 | ParentTags: []string{}, 449 | ExcludeParentTags: []string{}, 450 | ParentContentTypes: HETFlow | HETPhrasing, 451 | ObsoleteAttributes: []string{}, 452 | }, 453 | HtmlElementInfo{ 454 | 455 | TagName: "details", 456 | HtmlVersion: 5, 457 | Obsolete: false, 458 | ElementType: HETFlow, 459 | PermittedChildrenTypes: HETAnyContent, 460 | PermittedChildrenTags: []string{}, 461 | attributesString: []string{"open"}, 462 | TagFormatting: HTFComplete, 463 | ParentTags: []string{}, 464 | ExcludeParentTags: []string{"a", "button"}, 465 | ParentContentTypes: HETFlow, 466 | ObsoleteAttributes: []string{}, 467 | }, 468 | HtmlElementInfo{ 469 | 470 | TagName: "dfn", 471 | HtmlVersion: 3, 472 | Obsolete: false, 473 | ElementType: HETPhrasing, 474 | PermittedChildrenTypes: HETPhrasing | HETText, 475 | PermittedChildrenTags: []string{}, 476 | attributesString: []string{}, 477 | TagFormatting: HTFComplete, 478 | ParentTags: []string{}, 479 | 
ExcludeParentTags: []string{}, 480 | ParentContentTypes: HETPhrasing, 481 | ObsoleteAttributes: []string{}, 482 | }, 483 | HtmlElementInfo{ 484 | 485 | TagName: "dir", 486 | HtmlVersion: 3, 487 | Obsolete: true, 488 | ElementType: HETFlow, 489 | PermittedChildrenTypes: HETAnyContent, 490 | PermittedChildrenTags: []string{}, 491 | attributesString: []string{}, 492 | TagFormatting: HTFComplete, 493 | ParentTags: []string{}, 494 | ExcludeParentTags: []string{}, 495 | ParentContentTypes: HETFlow, 496 | ObsoleteAttributes: []string{}, 497 | }, 498 | HtmlElementInfo{ 499 | 500 | TagName: "div", 501 | HtmlVersion: 3, 502 | Obsolete: false, 503 | ElementType: HETFlow, 504 | PermittedChildrenTypes: HETAnyContent, 505 | PermittedChildrenTags: []string{}, 506 | attributesString: []string{}, 507 | TagFormatting: HTFComplete, 508 | ParentTags: []string{}, 509 | ExcludeParentTags: []string{}, 510 | ParentContentTypes: HETFlow, 511 | ObsoleteAttributes: []string{}, 512 | }, 513 | HtmlElementInfo{ 514 | 515 | TagName: "dl", 516 | HtmlVersion: 3, 517 | Obsolete: false, 518 | ElementType: HETFlow, 519 | PermittedChildrenTypes: HETAnyContent, 520 | PermittedChildrenTags: []string{}, 521 | attributesString: []string{}, 522 | TagFormatting: HTFComplete, 523 | ParentTags: []string{}, 524 | ExcludeParentTags: []string{}, 525 | ParentContentTypes: HETFlow, 526 | ObsoleteAttributes: []string{"compact"}, 527 | }, 528 | HtmlElementInfo{ 529 | 530 | TagName: "dt", 531 | HtmlVersion: 3, 532 | Obsolete: false, 533 | ElementType: HETFlow, 534 | PermittedChildrenTypes: HETAnyContent, 535 | PermittedChildrenTags: []string{}, 536 | attributesString: []string{}, 537 | TagFormatting: HTFOptionalClosing, 538 | ParentTags: []string{"dl"}, 539 | ExcludeParentTags: []string{}, 540 | ParentContentTypes: HETNone, 541 | ObsoleteAttributes: []string{}, 542 | }, 543 | HtmlElementInfo{ 544 | 545 | TagName: "em", 546 | HtmlVersion: 3, 547 | Obsolete: false, 548 | ElementType: HETPhrasing, 549 | 
PermittedChildrenTypes: HETPhrasing | HETText, 550 | PermittedChildrenTags: []string{}, 551 | attributesString: []string{}, 552 | TagFormatting: HTFComplete, 553 | ParentTags: []string{}, 554 | ExcludeParentTags: []string{}, 555 | ParentContentTypes: HETPhrasing, 556 | ObsoleteAttributes: []string{}, 557 | }, 558 | HtmlElementInfo{ 559 | 560 | TagName: "embed", 561 | HtmlVersion: 3, 562 | Obsolete: true, 563 | ElementType: HETPhrasing, 564 | PermittedChildrenTypes: HETPhrasing | HETText, 565 | PermittedChildrenTags: []string{}, 566 | attributesString: []string{}, 567 | TagFormatting: HTFSingle, 568 | ParentTags: []string{}, 569 | ExcludeParentTags: []string{}, 570 | ParentContentTypes: HETPhrasing, 571 | ObsoleteAttributes: []string{}, 572 | }, 573 | HtmlElementInfo{ 574 | 575 | TagName: "fieldset", 576 | HtmlVersion: 4, 577 | Obsolete: false, 578 | ElementType: HETFlow, 579 | PermittedChildrenTypes: HETAnyContent, 580 | PermittedChildrenTags: []string{}, 581 | attributesString: []string{"name", "disabled", "form"}, 582 | TagFormatting: HTFComplete, 583 | ParentTags: []string{}, 584 | ExcludeParentTags: []string{}, 585 | ParentContentTypes: HETFlow, 586 | ObsoleteAttributes: []string{}, 587 | }, 588 | HtmlElementInfo{ 589 | 590 | TagName: "figcaption", 591 | HtmlVersion: 5, 592 | Obsolete: false, 593 | ElementType: HETFlow, 594 | PermittedChildrenTypes: HETAnyContent, 595 | PermittedChildrenTags: []string{}, 596 | attributesString: []string{}, 597 | TagFormatting: HTFComplete, 598 | ParentTags: []string{"figure"}, 599 | ExcludeParentTags: []string{}, 600 | ParentContentTypes: HETNone, 601 | ObsoleteAttributes: []string{}, 602 | }, 603 | HtmlElementInfo{ 604 | 605 | TagName: "figure", 606 | HtmlVersion: 5, 607 | Obsolete: false, 608 | ElementType: HETFlow, 609 | PermittedChildrenTypes: HETAnyContent, 610 | PermittedChildrenTags: []string{}, 611 | attributesString: []string{}, 612 | TagFormatting: HTFComplete, 613 | ParentTags: []string{}, 614 | ExcludeParentTags: 
[]string{}, 615 | ParentContentTypes: HETFlow, 616 | ObsoleteAttributes: []string{}, 617 | }, 618 | HtmlElementInfo{ 619 | 620 | TagName: "font", 621 | HtmlVersion: 3, 622 | Obsolete: true, 623 | ElementType: HETPhrasing, 624 | PermittedChildrenTypes: HETPhrasing | HETText, 625 | PermittedChildrenTags: []string{}, 626 | attributesString: []string{}, 627 | TagFormatting: HTFComplete, 628 | ParentTags: []string{}, 629 | ExcludeParentTags: []string{}, 630 | ParentContentTypes: HETPhrasing, 631 | ObsoleteAttributes: []string{}, 632 | }, 633 | HtmlElementInfo{ 634 | 635 | TagName: "footer", 636 | HtmlVersion: 5, 637 | Obsolete: false, 638 | ElementType: HETFlow, 639 | PermittedChildrenTypes: HETAnyContent, 640 | PermittedChildrenTags: []string{}, 641 | attributesString: []string{}, 642 | TagFormatting: HTFComplete, 643 | ParentTags: []string{}, 644 | ExcludeParentTags: []string{"header", "footer", "address"}, 645 | ParentContentTypes: HETFlow, 646 | ObsoleteAttributes: []string{}, 647 | }, 648 | HtmlElementInfo{ 649 | 650 | TagName: "form", 651 | HtmlVersion: 3, 652 | Obsolete: false, 653 | ElementType: HETFlow, 654 | PermittedChildrenTypes: HETAnyContent, 655 | PermittedChildrenTags: []string{}, 656 | attributesString: []string{"action", "method", "enctype", "name", "accept-charset", "novalidate", "target", "autocomplete"}, 657 | TagFormatting: HTFComplete, 658 | ParentTags: []string{}, 659 | ExcludeParentTags: []string{"form"}, 660 | ParentContentTypes: HETFlow, 661 | ObsoleteAttributes: []string{}, 662 | }, 663 | HtmlElementInfo{ 664 | 665 | TagName: "frame", 666 | HtmlVersion: 3, 667 | Obsolete: true, 668 | ElementType: HETFlow, 669 | PermittedChildrenTypes: HETAnyContent, 670 | PermittedChildrenTags: []string{}, 671 | attributesString: []string{}, 672 | TagFormatting: HTFSingle, 673 | ParentTags: []string{}, 674 | ExcludeParentTags: []string{}, 675 | ParentContentTypes: HETFlow, 676 | ObsoleteAttributes: []string{}, 677 | }, 678 | HtmlElementInfo{ 679 | 680 | 
TagName: "frameset", 681 | HtmlVersion: 3, 682 | Obsolete: true, 683 | ElementType: HETFlow, 684 | PermittedChildrenTypes: HETAnyContent, 685 | PermittedChildrenTags: []string{}, 686 | attributesString: []string{}, 687 | TagFormatting: HTFComplete, 688 | ParentTags: []string{}, 689 | ExcludeParentTags: []string{}, 690 | ParentContentTypes: HETFlow, 691 | ObsoleteAttributes: []string{}, 692 | }, 693 | HtmlElementInfo{ 694 | 695 | TagName: "h1", 696 | HtmlVersion: 3, 697 | Obsolete: false, 698 | ElementType: HETFlow, 699 | PermittedChildrenTypes: HETPhrasing | HETText, 700 | PermittedChildrenTags: []string{}, 701 | attributesString: []string{}, 702 | TagFormatting: HTFComplete, 703 | ParentTags: []string{"hgroup"}, 704 | ExcludeParentTags: []string{"address"}, 705 | ParentContentTypes: HETFlow, 706 | ObsoleteAttributes: []string{"align"}, 707 | }, 708 | HtmlElementInfo{ 709 | 710 | TagName: "h2", 711 | HtmlVersion: 3, 712 | Obsolete: false, 713 | ElementType: HETFlow, 714 | PermittedChildrenTypes: HETPhrasing | HETText, 715 | PermittedChildrenTags: []string{}, 716 | attributesString: []string{}, 717 | TagFormatting: HTFComplete, 718 | ParentTags: []string{"hgroup"}, 719 | ExcludeParentTags: []string{"address"}, 720 | ParentContentTypes: HETFlow, 721 | ObsoleteAttributes: []string{"align"}, 722 | }, 723 | HtmlElementInfo{ 724 | 725 | TagName: "h3", 726 | HtmlVersion: 3, 727 | Obsolete: false, 728 | ElementType: HETFlow, 729 | PermittedChildrenTypes: HETPhrasing | HETText, 730 | PermittedChildrenTags: []string{}, 731 | attributesString: []string{}, 732 | TagFormatting: HTFComplete, 733 | ParentTags: []string{"hgroup"}, 734 | ExcludeParentTags: []string{"address"}, 735 | ParentContentTypes: HETFlow, 736 | ObsoleteAttributes: []string{"align"}, 737 | }, 738 | HtmlElementInfo{ 739 | 740 | TagName: "h4", 741 | HtmlVersion: 3, 742 | Obsolete: false, 743 | ElementType: HETFlow, 744 | PermittedChildrenTypes: HETPhrasing | HETText, 745 | PermittedChildrenTags: []string{}, 746 
| attributesString: []string{}, 747 | TagFormatting: HTFComplete, 748 | ParentTags: []string{"hgroup"}, 749 | ExcludeParentTags: []string{"address"}, 750 | ParentContentTypes: HETFlow, 751 | ObsoleteAttributes: []string{"align"}, 752 | }, 753 | HtmlElementInfo{ 754 | 755 | TagName: "h5", 756 | HtmlVersion: 3, 757 | Obsolete: false, 758 | ElementType: HETFlow, 759 | PermittedChildrenTypes: HETPhrasing | HETText, 760 | PermittedChildrenTags: []string{}, 761 | attributesString: []string{}, 762 | TagFormatting: HTFComplete, 763 | ParentTags: []string{"hgroup"}, 764 | ExcludeParentTags: []string{"address"}, 765 | ParentContentTypes: HETFlow, 766 | ObsoleteAttributes: []string{"align"}, 767 | }, 768 | HtmlElementInfo{ 769 | 770 | TagName: "h6", 771 | HtmlVersion: 3, 772 | Obsolete: false, 773 | ElementType: HETFlow, 774 | PermittedChildrenTypes: HETPhrasing | HETText, 775 | PermittedChildrenTags: []string{}, 776 | attributesString: []string{}, 777 | TagFormatting: HTFComplete, 778 | ParentTags: []string{"hgroup"}, 779 | ExcludeParentTags: []string{"address"}, 780 | ParentContentTypes: HETFlow, 781 | ObsoleteAttributes: []string{"align"}, 782 | }, 783 | HtmlElementInfo{ 784 | 785 | TagName: "head", 786 | HtmlVersion: 3, 787 | Obsolete: false, 788 | ElementType: HETMeta, 789 | PermittedChildrenTypes: HETMeta, 790 | PermittedChildrenTags: []string{}, 791 | attributesString: []string{}, 792 | TagFormatting: HTFOptionalClosing, 793 | ParentTags: []string{"html"}, 794 | ExcludeParentTags: []string{}, 795 | ParentContentTypes: HETNone, 796 | ObsoleteAttributes: []string{"profile"}, 797 | }, 798 | HtmlElementInfo{ 799 | 800 | TagName: "header", 801 | HtmlVersion: 5, 802 | Obsolete: false, 803 | ElementType: HETFlow, 804 | PermittedChildrenTypes: HETAnyContent, 805 | PermittedChildrenTags: []string{}, 806 | attributesString: []string{}, 807 | TagFormatting: HTFComplete, 808 | ParentTags: []string{}, 809 | ExcludeParentTags: []string{"footer", "address", "header"}, 810 | 
ParentContentTypes: HETFlow, 811 | ObsoleteAttributes: []string{}, 812 | }, 813 | HtmlElementInfo{ 814 | 815 | TagName: "hgroup", 816 | HtmlVersion: 5, 817 | Obsolete: false, 818 | ElementType: HETFlow, 819 | PermittedChildrenTypes: HETNone, 820 | PermittedChildrenTags: []string{"h1", "h2", "h3", "h4", "h5", "h6"}, 821 | attributesString: []string{}, 822 | TagFormatting: HTFComplete, 823 | ParentTags: []string{}, 824 | ExcludeParentTags: []string{}, 825 | ParentContentTypes: HETFlow, 826 | ObsoleteAttributes: []string{}, 827 | }, 828 | HtmlElementInfo{ 829 | 830 | TagName: "hr", 831 | HtmlVersion: 3, 832 | Obsolete: false, 833 | ElementType: HETFlow, 834 | PermittedChildrenTypes: HETNone, 835 | PermittedChildrenTags: []string{}, 836 | attributesString: []string{}, 837 | TagFormatting: HTFSingle, 838 | ParentTags: []string{}, 839 | ExcludeParentTags: []string{}, 840 | ParentContentTypes: HETFlow, 841 | ObsoleteAttributes: []string{"align", "width", "noshade", "size", "color"}, 842 | }, 843 | HtmlElementInfo{ 844 | 845 | TagName: "html", 846 | HtmlVersion: 3, 847 | Obsolete: false, 848 | ElementType: HETPhrasing, 849 | PermittedChildrenTypes: HETNone, 850 | PermittedChildrenTags: []string{"head", "body"}, 851 | attributesString: []string{"manifest"}, 852 | TagFormatting: HTFOptionalClosing, 853 | ParentTags: []string{}, 854 | ExcludeParentTags: []string{}, 855 | ParentContentTypes: HETNone, 856 | ObsoleteAttributes: []string{"version"}, 857 | }, 858 | HtmlElementInfo{ 859 | 860 | TagName: "i", 861 | HtmlVersion: 3, 862 | Obsolete: false, 863 | ElementType: HETPhrasing, 864 | PermittedChildrenTypes: HETPhrasing | HETText, 865 | PermittedChildrenTags: []string{}, 866 | attributesString: []string{}, 867 | TagFormatting: HTFComplete, 868 | ParentTags: []string{}, 869 | ExcludeParentTags: []string{}, 870 | ParentContentTypes: HETPhrasing, 871 | ObsoleteAttributes: []string{}, 872 | }, 873 | HtmlElementInfo{ 874 | 875 | TagName: "iframe", 876 | HtmlVersion: 3, 877 | 
Obsolete: false, 878 | ElementType: HETFlow, 879 | PermittedChildrenTypes: HETText, 880 | PermittedChildrenTags: []string{}, 881 | attributesString: []string{"src", "srcdoc", "name", "width", "height", "sandbox", "seamless"}, 882 | TagFormatting: HTFComplete, 883 | ParentTags: []string{}, 884 | ExcludeParentTags: []string{"a", "button"}, 885 | ParentContentTypes: HETPhrasing, 886 | ObsoleteAttributes: []string{"longdesc", "align", "allowtransparency", "frameborder", "marginheight", "marginwidth", "scrolling"}, 887 | }, 888 | HtmlElementInfo{ 889 | 890 | TagName: "img", 891 | HtmlVersion: 3, 892 | Obsolete: false, 893 | ElementType: HETPhrasing, 894 | PermittedChildrenTypes: HETNone, 895 | PermittedChildrenTags: []string{}, 896 | attributesString: []string{"src", "alt", "height", "width", "usemap", "ismap", "border"}, 897 | TagFormatting: HTFSingle, 898 | ParentTags: []string{}, 899 | ExcludeParentTags: []string{}, 900 | ParentContentTypes: HETPhrasing, 901 | ObsoleteAttributes: []string{"longdesc", "name", "align", "hspace", "vspace", "border"}, 902 | }, 903 | HtmlElementInfo{ 904 | 905 | TagName: "input", 906 | HtmlVersion: 3, 907 | Obsolete: false, 908 | ElementType: HETPhrasing, 909 | PermittedChildrenTypes: HETNone, 910 | PermittedChildrenTags: []string{}, 911 | attributesString: []string{"name", "disabled", "form", "type", "maxlength", "readonly", "size", "value", "autocomplete", "autofocus", "list", "pattern", "required", "placeholder", "dirname", "checked", "formaction", "formenctype", "formmethod", "formtarget", "formnovalidate", "accept", "multiple", "alt", "src", "height", "width", "list", "min", "max", "step"}, 912 | TagFormatting: HTFSingle, 913 | ParentTags: []string{}, 914 | ExcludeParentTags: []string{}, 915 | ParentContentTypes: HETPhrasing, 916 | ObsoleteAttributes: []string{"usemap", "align"}, 917 | }, 918 | HtmlElementInfo{ 919 | 920 | TagName: "ins", 921 | HtmlVersion: 4, 922 | Obsolete: false, 923 | ElementType: HETPhrasing, 924 | 
PermittedChildrenTypes: HETAnyContent, 925 | PermittedChildrenTags: []string{}, 926 | attributesString: []string{"cite", "datetime"}, 927 | TagFormatting: HTFComplete, 928 | ParentTags: []string{}, 929 | ExcludeParentTags: []string{}, 930 | ParentContentTypes: HETFlow | HETPhrasing, 931 | ObsoleteAttributes: []string{}, 932 | }, 933 | HtmlElementInfo{ 934 | 935 | TagName: "isindex", 936 | HtmlVersion: 3, 937 | Obsolete: true, 938 | ElementType: HETFlow, 939 | PermittedChildrenTypes: HETAnyContent, 940 | PermittedChildrenTags: []string{}, 941 | attributesString: []string{}, 942 | TagFormatting: HTFSingle, 943 | ParentTags: []string{}, 944 | ExcludeParentTags: []string{}, 945 | ParentContentTypes: HETFlow, 946 | ObsoleteAttributes: []string{}, 947 | }, 948 | HtmlElementInfo{ 949 | 950 | TagName: "kbd", 951 | HtmlVersion: 3, 952 | Obsolete: false, 953 | ElementType: HETPhrasing, 954 | PermittedChildrenTypes: HETPhrasing | HETText, 955 | PermittedChildrenTags: []string{}, 956 | attributesString: []string{}, 957 | TagFormatting: HTFComplete, 958 | ParentTags: []string{}, 959 | ExcludeParentTags: []string{}, 960 | ParentContentTypes: HETPhrasing, 961 | ObsoleteAttributes: []string{}, 962 | }, 963 | HtmlElementInfo{ 964 | 965 | TagName: "keygen", 966 | HtmlVersion: 5, 967 | Obsolete: false, 968 | ElementType: HETPhrasing, 969 | PermittedChildrenTypes: HETPhrasing | HETText, 970 | PermittedChildrenTags: []string{}, 971 | attributesString: []string{"challenge", "keytype", "autofocus", "name", "disabled", "form"}, 972 | TagFormatting: HTFSingle, 973 | ParentTags: []string{}, 974 | ExcludeParentTags: []string{}, 975 | ParentContentTypes: HETPhrasing, 976 | ObsoleteAttributes: []string{}, 977 | }, 978 | HtmlElementInfo{ 979 | 980 | TagName: "label", 981 | HtmlVersion: 4, 982 | Obsolete: false, 983 | ElementType: HETPhrasing, 984 | PermittedChildrenTypes: HETPhrasing | HETText, 985 | PermittedChildrenTags: []string{}, 986 | attributesString: []string{"for", "form"}, 987 | 
TagFormatting: HTFComplete, 988 | ParentTags: []string{}, 989 | ExcludeParentTags: []string{}, 990 | ParentContentTypes: HETPhrasing, 991 | ObsoleteAttributes: []string{}, 992 | }, 993 | HtmlElementInfo{ 994 | 995 | TagName: "legend", 996 | HtmlVersion: 3, 997 | Obsolete: false, 998 | ElementType: HETPhrasing, 999 | PermittedChildrenTypes: HETPhrasing | HETText, 1000 | PermittedChildrenTags: []string{}, 1001 | attributesString: []string{}, 1002 | TagFormatting: HTFComplete, 1003 | ParentTags: []string{"fieldset"}, 1004 | ExcludeParentTags: []string{}, 1005 | ParentContentTypes: HETNone, 1006 | ObsoleteAttributes: []string{}, 1007 | }, 1008 | HtmlElementInfo{ 1009 | 1010 | TagName: "li", 1011 | HtmlVersion: 3, 1012 | Obsolete: false, 1013 | ElementType: HETFlow, 1014 | PermittedChildrenTypes: HETAnyContent, 1015 | PermittedChildrenTags: []string{}, 1016 | attributesString: []string{"value"}, 1017 | TagFormatting: HTFOptionalClosing, 1018 | ParentTags: []string{"ul", "ol", "menu"}, 1019 | ExcludeParentTags: []string{}, 1020 | ParentContentTypes: HETNone, 1021 | ObsoleteAttributes: []string{}, 1022 | }, 1023 | HtmlElementInfo{ 1024 | 1025 | TagName: "link", 1026 | HtmlVersion: 3, 1027 | Obsolete: false, 1028 | ElementType: HETMeta, 1029 | PermittedChildrenTypes: HETNone, 1030 | PermittedChildrenTags: []string{}, 1031 | attributesString: []string{"href", "rel", "hreflang", "media", "type", "sizes"}, 1032 | TagFormatting: HTFSingle, 1033 | ParentTags: []string{"noscript"}, 1034 | ExcludeParentTags: []string{}, 1035 | ParentContentTypes: HETMeta, 1036 | ObsoleteAttributes: []string{"target", "urn", "charset", "methods", "rev"}, 1037 | }, 1038 | HtmlElementInfo{ 1039 | 1040 | TagName: "map", 1041 | HtmlVersion: 3, 1042 | Obsolete: false, 1043 | ElementType: HETPhrasing, 1044 | PermittedChildrenTypes: HETAnyContent, 1045 | PermittedChildrenTags: []string{}, 1046 | attributesString: []string{"name"}, 1047 | TagFormatting: HTFComplete, 1048 | ParentTags: []string{}, 1049 | 
ExcludeParentTags: []string{}, 1050 | ParentContentTypes: HETFlow | HETPhrasing, 1051 | ObsoleteAttributes: []string{}, 1052 | }, 1053 | HtmlElementInfo{ 1054 | 1055 | TagName: "mark", 1056 | HtmlVersion: 5, 1057 | Obsolete: false, 1058 | ElementType: HETPhrasing, 1059 | PermittedChildrenTypes: HETPhrasing | HETText, 1060 | PermittedChildrenTags: []string{}, 1061 | attributesString: []string{}, 1062 | TagFormatting: HTFComplete, 1063 | ParentTags: []string{}, 1064 | ExcludeParentTags: []string{}, 1065 | ParentContentTypes: HETPhrasing, 1066 | ObsoleteAttributes: []string{}, 1067 | }, 1068 | HtmlElementInfo{ 1069 | 1070 | TagName: "menu", 1071 | HtmlVersion: 3, 1072 | Obsolete: false, 1073 | ElementType: HETFlow, 1074 | PermittedChildrenTypes: HETAnyContent, 1075 | PermittedChildrenTags: []string{}, 1076 | attributesString: []string{"type", "label"}, 1077 | TagFormatting: HTFComplete, 1078 | ParentTags: []string{}, 1079 | ExcludeParentTags: []string{}, 1080 | ParentContentTypes: HETFlow, 1081 | ObsoleteAttributes: []string{"compact"}, 1082 | }, 1083 | HtmlElementInfo{ 1084 | 1085 | TagName: "meta", 1086 | HtmlVersion: 3, 1087 | Obsolete: false, 1088 | ElementType: HETMeta, 1089 | PermittedChildrenTypes: HETNone, 1090 | PermittedChildrenTags: []string{}, 1091 | attributesString: []string{"name", "content", "http-equiv", "charset"}, 1092 | TagFormatting: HTFSingle, 1093 | ParentTags: []string{}, 1094 | ExcludeParentTags: []string{}, 1095 | ParentContentTypes: HETMeta, 1096 | ObsoleteAttributes: []string{}, 1097 | }, 1098 | HtmlElementInfo{ 1099 | 1100 | TagName: "meter", 1101 | HtmlVersion: 5, 1102 | Obsolete: false, 1103 | ElementType: HETPhrasing, 1104 | PermittedChildrenTypes: HETPhrasing | HETText, 1105 | PermittedChildrenTags: []string{}, 1106 | attributesString: []string{"value", "min", "low", "high", "max", "optimum"}, 1107 | TagFormatting: HTFComplete, 1108 | ParentTags: []string{}, 1109 | ExcludeParentTags: []string{}, 1110 | ParentContentTypes: HETPhrasing, 
1111 | ObsoleteAttributes: []string{}, 1112 | }, 1113 | HtmlElementInfo{ 1114 | 1115 | TagName: "nav", 1116 | HtmlVersion: 5, 1117 | Obsolete: false, 1118 | ElementType: HETFlow, 1119 | PermittedChildrenTypes: HETAnyContent, 1120 | PermittedChildrenTags: []string{}, 1121 | attributesString: []string{}, 1122 | TagFormatting: HTFComplete, 1123 | ParentTags: []string{}, 1124 | ExcludeParentTags: []string{"address"}, 1125 | ParentContentTypes: HETFlow, 1126 | ObsoleteAttributes: []string{}, 1127 | }, 1128 | HtmlElementInfo{ 1129 | 1130 | TagName: "nobr", 1131 | HtmlVersion: 3, 1132 | Obsolete: true, 1133 | ElementType: HETPhrasing, 1134 | PermittedChildrenTypes: HETPhrasing | HETText, 1135 | PermittedChildrenTags: []string{}, 1136 | attributesString: []string{}, 1137 | TagFormatting: HTFComplete, 1138 | ParentTags: []string{}, 1139 | ExcludeParentTags: []string{}, 1140 | ParentContentTypes: HETPhrasing, 1141 | ObsoleteAttributes: []string{}, 1142 | }, 1143 | HtmlElementInfo{ 1144 | 1145 | TagName: "noframes", 1146 | HtmlVersion: 3, 1147 | Obsolete: true, 1148 | ElementType: HETFlow, 1149 | PermittedChildrenTypes: HETAnyContent, 1150 | PermittedChildrenTags: []string{}, 1151 | attributesString: []string{}, 1152 | TagFormatting: HTFComplete, 1153 | ParentTags: []string{}, 1154 | ExcludeParentTags: []string{}, 1155 | ParentContentTypes: HETFlow, 1156 | ObsoleteAttributes: []string{}, 1157 | }, 1158 | HtmlElementInfo{ 1159 | 1160 | TagName: "noscript", 1161 | HtmlVersion: 3, 1162 | Obsolete: false, 1163 | ElementType: HETMeta, 1164 | PermittedChildrenTypes: HETMeta | HETAnyContent, 1165 | PermittedChildrenTags: []string{}, 1166 | attributesString: []string{}, 1167 | TagFormatting: HTFComplete, 1168 | ParentTags: []string{}, 1169 | ExcludeParentTags: []string{"noscript"}, 1170 | ParentContentTypes: HETMeta | HETFlow | HETPhrasing, 1171 | ObsoleteAttributes: []string{}, 1172 | }, 1173 | HtmlElementInfo{ 1174 | 1175 | TagName: "object", 1176 | HtmlVersion: 3, 1177 | Obsolete: 
false, 1178 | ElementType: HETPhrasing, 1179 | PermittedChildrenTypes: HETAnyContent, 1180 | PermittedChildrenTags: []string{}, 1181 | attributesString: []string{"data", "type", "height", "width", "usemap", "name", "form"}, 1182 | TagFormatting: HTFComplete, 1183 | ParentTags: []string{}, 1184 | ExcludeParentTags: []string{"a", "button"}, 1185 | ParentContentTypes: HETFlow | HETPhrasing, 1186 | ObsoleteAttributes: []string{"archive", "classid", "code", "codebase", "codetype", "declare", "standby", "align", "hspace", "vspace", "border"}, 1187 | }, 1188 | HtmlElementInfo{ 1189 | 1190 | TagName: "ol", 1191 | HtmlVersion: 3, 1192 | Obsolete: false, 1193 | ElementType: HETFlow, 1194 | PermittedChildrenTypes: HETNone, 1195 | PermittedChildrenTags: []string{"li"}, 1196 | attributesString: []string{"start", "reversed", "type"}, 1197 | TagFormatting: HTFComplete, 1198 | ParentTags: []string{}, 1199 | ExcludeParentTags: []string{}, 1200 | ParentContentTypes: HETFlow, 1201 | ObsoleteAttributes: []string{"compact"}, 1202 | }, 1203 | HtmlElementInfo{ 1204 | 1205 | TagName: "optgroup", 1206 | HtmlVersion: 3, 1207 | Obsolete: false, 1208 | ElementType: HETPhrasing, 1209 | PermittedChildrenTypes: HETNone, 1210 | PermittedChildrenTags: []string{"option"}, 1211 | attributesString: []string{"label", "disabled"}, 1212 | TagFormatting: HTFOptionalClosing, 1213 | ParentTags: []string{"select"}, 1214 | ExcludeParentTags: []string{}, 1215 | ParentContentTypes: HETNone, 1216 | ObsoleteAttributes: []string{}, 1217 | }, 1218 | HtmlElementInfo{ 1219 | 1220 | TagName: "option", 1221 | HtmlVersion: 3, 1222 | Obsolete: false, 1223 | ElementType: HETPhrasing, 1224 | PermittedChildrenTypes: HETText, 1225 | PermittedChildrenTags: []string{}, 1226 | attributesString: []string{"disabled", "selected", "label", "value"}, 1227 | TagFormatting: HTFOptionalClosing, 1228 | ParentTags: []string{"optgroup", "select", "datalist"}, 1229 | ExcludeParentTags: []string{}, 1230 | ParentContentTypes: HETNone, 1231 
| ObsoleteAttributes: []string{"name"}, 1232 | }, 1233 | HtmlElementInfo{ 1234 | 1235 | TagName: "output", 1236 | HtmlVersion: 5, 1237 | Obsolete: false, 1238 | ElementType: HETPhrasing, 1239 | PermittedChildrenTypes: HETPhrasing | HETText, 1240 | PermittedChildrenTags: []string{}, 1241 | attributesString: []string{"name", "form", "for"}, 1242 | TagFormatting: HTFComplete, 1243 | ParentTags: []string{}, 1244 | ExcludeParentTags: []string{}, 1245 | ParentContentTypes: HETPhrasing, 1246 | ObsoleteAttributes: []string{}, 1247 | }, 1248 | HtmlElementInfo{ 1249 | 1250 | TagName: "p", 1251 | HtmlVersion: 3, 1252 | Obsolete: false, 1253 | ElementType: HETFlow, 1254 | PermittedChildrenTypes: HETPhrasing | HETText, 1255 | PermittedChildrenTags: []string{}, 1256 | attributesString: []string{}, 1257 | TagFormatting: HTFOptionalClosing, 1258 | ParentTags: []string{}, 1259 | ExcludeParentTags: []string{}, 1260 | ParentContentTypes: HETFlow, 1261 | ObsoleteAttributes: []string{"align"}, 1262 | }, 1263 | HtmlElementInfo{ 1264 | 1265 | TagName: "param", 1266 | HtmlVersion: 3, 1267 | Obsolete: false, 1268 | ElementType: HETMeta, 1269 | PermittedChildrenTypes: HETNone, 1270 | PermittedChildrenTags: []string{}, 1271 | attributesString: []string{"name", "value"}, 1272 | TagFormatting: HTFSingle, 1273 | ParentTags: []string{"object"}, 1274 | ExcludeParentTags: []string{}, 1275 | ParentContentTypes: HETNone, 1276 | ObsoleteAttributes: []string{"type", "valuetype"}, 1277 | }, 1278 | HtmlElementInfo{ 1279 | 1280 | TagName: "pre", 1281 | HtmlVersion: 3, 1282 | Obsolete: false, 1283 | ElementType: HETFlow, 1284 | PermittedChildrenTypes: HETPhrasing | HETText, 1285 | PermittedChildrenTags: []string{}, 1286 | attributesString: []string{}, 1287 | TagFormatting: HTFComplete, 1288 | ParentTags: []string{}, 1289 | ExcludeParentTags: []string{}, 1290 | ParentContentTypes: HETFlow, 1291 | ObsoleteAttributes: []string{}, 1292 | }, 1293 | HtmlElementInfo{ 1294 | 1295 | TagName: "progress", 1296 | 
HtmlVersion: 5, 1297 | Obsolete: false, 1298 | ElementType: HETPhrasing, 1299 | PermittedChildrenTypes: HETPhrasing | HETText, 1300 | PermittedChildrenTags: []string{}, 1301 | attributesString: []string{"value", "max"}, 1302 | TagFormatting: HTFComplete, 1303 | ParentTags: []string{}, 1304 | ExcludeParentTags: []string{}, 1305 | ParentContentTypes: HETPhrasing, 1306 | ObsoleteAttributes: []string{}, 1307 | }, 1308 | HtmlElementInfo{ 1309 | 1310 | TagName: "q", 1311 | HtmlVersion: 4, 1312 | Obsolete: false, 1313 | ElementType: HETPhrasing, 1314 | PermittedChildrenTypes: HETPhrasing | HETText, 1315 | PermittedChildrenTags: []string{}, 1316 | attributesString: []string{"cite"}, 1317 | TagFormatting: HTFComplete, 1318 | ParentTags: []string{}, 1319 | ExcludeParentTags: []string{}, 1320 | ParentContentTypes: HETPhrasing, 1321 | ObsoleteAttributes: []string{}, 1322 | }, 1323 | HtmlElementInfo{ 1324 | 1325 | TagName: "rp", 1326 | HtmlVersion: 5, 1327 | Obsolete: false, 1328 | ElementType: HETPhrasing, 1329 | PermittedChildrenTypes: HETPhrasing | HETText, 1330 | PermittedChildrenTags: []string{}, 1331 | attributesString: []string{}, 1332 | TagFormatting: HTFComplete, 1333 | ParentTags: []string{"ruby"}, 1334 | ExcludeParentTags: []string{}, 1335 | ParentContentTypes: HETNone, 1336 | ObsoleteAttributes: []string{}, 1337 | }, 1338 | HtmlElementInfo{ 1339 | 1340 | TagName: "rt", 1341 | HtmlVersion: 5, 1342 | Obsolete: false, 1343 | ElementType: HETPhrasing, 1344 | PermittedChildrenTypes: HETPhrasing | HETText, 1345 | PermittedChildrenTags: []string{}, 1346 | attributesString: []string{}, 1347 | TagFormatting: HTFComplete, 1348 | ParentTags: []string{"ruby"}, 1349 | ExcludeParentTags: []string{}, 1350 | ParentContentTypes: HETNone, 1351 | ObsoleteAttributes: []string{}, 1352 | }, 1353 | HtmlElementInfo{ 1354 | 1355 | TagName: "ruby", 1356 | HtmlVersion: 5, 1357 | Obsolete: false, 1358 | ElementType: HETPhrasing, 1359 | PermittedChildrenTypes: HETPhrasing | HETText, 1360 | 
PermittedChildrenTags: []string{}, 1361 | attributesString: []string{}, 1362 | TagFormatting: HTFComplete, 1363 | ParentTags: []string{}, 1364 | ExcludeParentTags: []string{}, 1365 | ParentContentTypes: HETPhrasing, 1366 | ObsoleteAttributes: []string{}, 1367 | }, 1368 | HtmlElementInfo{ 1369 | 1370 | TagName: "s", 1371 | HtmlVersion: 3, 1372 | Obsolete: true, 1373 | ElementType: HETPhrasing, 1374 | PermittedChildrenTypes: HETPhrasing | HETText, 1375 | PermittedChildrenTags: []string{}, 1376 | attributesString: []string{}, 1377 | TagFormatting: HTFComplete, 1378 | ParentTags: []string{}, 1379 | ExcludeParentTags: []string{}, 1380 | ParentContentTypes: HETPhrasing, 1381 | ObsoleteAttributes: []string{}, 1382 | }, 1383 | HtmlElementInfo{ 1384 | 1385 | TagName: "samp", 1386 | HtmlVersion: 3, 1387 | Obsolete: false, 1388 | ElementType: HETPhrasing, 1389 | PermittedChildrenTypes: HETPhrasing | HETText, 1390 | PermittedChildrenTags: []string{}, 1391 | attributesString: []string{}, 1392 | TagFormatting: HTFComplete, 1393 | ParentTags: []string{}, 1394 | ExcludeParentTags: []string{}, 1395 | ParentContentTypes: HETPhrasing, 1396 | ObsoleteAttributes: []string{}, 1397 | }, 1398 | HtmlElementInfo{ 1399 | 1400 | TagName: "script", 1401 | HtmlVersion: 3, 1402 | Obsolete: false, 1403 | ElementType: HETMeta, 1404 | PermittedChildrenTypes: HETNRCharData, 1405 | PermittedChildrenTags: []string{}, 1406 | attributesString: []string{"type", "src", "defer", "async", "charset"}, 1407 | TagFormatting: HTFComplete, 1408 | ParentTags: []string{}, 1409 | ExcludeParentTags: []string{}, 1410 | ParentContentTypes: HETMeta | HETPhrasing | HETFlow, 1411 | ObsoleteAttributes: []string{"language"}, 1412 | }, 1413 | HtmlElementInfo{ 1414 | 1415 | TagName: "section", 1416 | HtmlVersion: 5, 1417 | Obsolete: false, 1418 | ElementType: HETFlow, 1419 | PermittedChildrenTypes: HETAnyContent, 1420 | PermittedChildrenTags: []string{"style"}, 1421 | attributesString: []string{}, 1422 | TagFormatting: 
HTFComplete, 1423 | ParentTags: []string{}, 1424 | ExcludeParentTags: []string{}, 1425 | ParentContentTypes: HETFlow, 1426 | ObsoleteAttributes: []string{}, 1427 | }, 1428 | HtmlElementInfo{ 1429 | 1430 | TagName: "select", 1431 | HtmlVersion: 3, 1432 | Obsolete: false, 1433 | ElementType: HETPhrasing, 1434 | PermittedChildrenTypes: HETNone, 1435 | PermittedChildrenTags: []string{"optgroup", "option"}, 1436 | attributesString: []string{"name", "disabled", "form", "size", "multiple", "autofocus", "required"}, 1437 | TagFormatting: HTFComplete, 1438 | ParentTags: []string{}, 1439 | ExcludeParentTags: []string{"a", "button"}, 1440 | ParentContentTypes: HETPhrasing, 1441 | ObsoleteAttributes: []string{}, 1442 | }, 1443 | HtmlElementInfo{ 1444 | 1445 | TagName: "small", 1446 | HtmlVersion: 3, 1447 | Obsolete: false, 1448 | ElementType: HETPhrasing, 1449 | PermittedChildrenTypes: HETPhrasing | HETText, 1450 | PermittedChildrenTags: []string{}, 1451 | attributesString: []string{}, 1452 | TagFormatting: HTFComplete, 1453 | ParentTags: []string{}, 1454 | ExcludeParentTags: []string{}, 1455 | ParentContentTypes: HETPhrasing, 1456 | ObsoleteAttributes: []string{}, 1457 | }, 1458 | HtmlElementInfo{ 1459 | 1460 | TagName: "source", 1461 | HtmlVersion: 5, 1462 | Obsolete: false, 1463 | ElementType: HETMeta, 1464 | PermittedChildrenTypes: HETNone, 1465 | PermittedChildrenTags: []string{}, 1466 | attributesString: []string{"src", "type", "media"}, 1467 | TagFormatting: HTFSingle, 1468 | ParentTags: []string{"audio", "video"}, 1469 | ExcludeParentTags: []string{}, 1470 | ParentContentTypes: HETNone, 1471 | ObsoleteAttributes: []string{}, 1472 | }, 1473 | HtmlElementInfo{ 1474 | 1475 | TagName: "span", 1476 | HtmlVersion: 3, 1477 | Obsolete: false, 1478 | ElementType: HETPhrasing, 1479 | PermittedChildrenTypes: HETPhrasing | HETText, 1480 | PermittedChildrenTags: []string{}, 1481 | attributesString: []string{}, 1482 | TagFormatting: HTFComplete, 1483 | ParentTags: []string{}, 1484
| ExcludeParentTags: []string{}, 1485 | ParentContentTypes: HETPhrasing, 1486 | ObsoleteAttributes: []string{}, 1487 | }, 1488 | HtmlElementInfo{ 1489 | 1490 | TagName: "strike", 1491 | HtmlVersion: 3, 1492 | Obsolete: true, 1493 | ElementType: HETPhrasing, 1494 | PermittedChildrenTypes: HETPhrasing | HETText, 1495 | PermittedChildrenTags: []string{}, 1496 | attributesString: []string{}, 1497 | TagFormatting: HTFComplete, 1498 | ParentTags: []string{}, 1499 | ExcludeParentTags: []string{}, 1500 | ParentContentTypes: HETPhrasing, 1501 | ObsoleteAttributes: []string{}, 1502 | }, 1503 | HtmlElementInfo{ 1504 | 1505 | TagName: "strong", 1506 | HtmlVersion: 3, 1507 | Obsolete: false, 1508 | ElementType: HETPhrasing, 1509 | PermittedChildrenTypes: HETPhrasing | HETText, 1510 | PermittedChildrenTags: []string{}, 1511 | attributesString: []string{}, 1512 | TagFormatting: HTFComplete, 1513 | ParentTags: []string{}, 1514 | ExcludeParentTags: []string{}, 1515 | ParentContentTypes: HETPhrasing, 1516 | ObsoleteAttributes: []string{}, 1517 | }, 1518 | HtmlElementInfo{ 1519 | 1520 | TagName: "style", 1521 | HtmlVersion: 3, 1522 | Obsolete: false, 1523 | ElementType: HETMeta, 1524 | PermittedChildrenTypes: HETNRCharData, 1525 | PermittedChildrenTags: []string{}, 1526 | attributesString: []string{"type", "media", "scoped"}, 1527 | TagFormatting: HTFComplete, 1528 | ParentTags: []string{"div", "noscript", "section", "article", "aside"}, 1529 | ExcludeParentTags: []string{}, 1530 | ParentContentTypes: HETMeta, 1531 | ObsoleteAttributes: []string{}, 1532 | }, 1533 | HtmlElementInfo{ 1534 | 1535 | TagName: "sub", 1536 | HtmlVersion: 3, 1537 | Obsolete: false, 1538 | ElementType: HETPhrasing, 1539 | PermittedChildrenTypes: HETPhrasing | HETText, 1540 | PermittedChildrenTags: []string{}, 1541 | attributesString: []string{}, 1542 | TagFormatting: HTFComplete, 1543 | ParentTags: []string{}, 1544 | ExcludeParentTags: []string{}, 1545 | ParentContentTypes: HETPhrasing, 1546 | 
ObsoleteAttributes: []string{}, 1547 | }, 1548 | HtmlElementInfo{ 1549 | 1550 | TagName: "summary", 1551 | HtmlVersion: 5, 1552 | Obsolete: false, 1553 | ElementType: HETFlow, 1554 | PermittedChildrenTypes: HETPhrasing | HETText, 1555 | PermittedChildrenTags: []string{}, 1556 | attributesString: []string{}, 1557 | TagFormatting: HTFComplete, 1558 | ParentTags: []string{"details"}, 1559 | ExcludeParentTags: []string{}, 1560 | ParentContentTypes: HETNone, 1561 | ObsoleteAttributes: []string{}, 1562 | }, 1563 | HtmlElementInfo{ 1564 | 1565 | TagName: "sup", 1566 | HtmlVersion: 3, 1567 | Obsolete: false, 1568 | ElementType: HETPhrasing, 1569 | PermittedChildrenTypes: HETPhrasing | HETText, 1570 | PermittedChildrenTags: []string{}, 1571 | attributesString: []string{}, 1572 | TagFormatting: HTFComplete, 1573 | ParentTags: []string{}, 1574 | ExcludeParentTags: []string{}, 1575 | ParentContentTypes: HETPhrasing, 1576 | ObsoleteAttributes: []string{}, 1577 | }, 1578 | HtmlElementInfo{ 1579 | 1580 | TagName: "table", 1581 | HtmlVersion: 3, 1582 | Obsolete: false, 1583 | ElementType: HETFlow, 1584 | PermittedChildrenTypes: HETNone, 1585 | PermittedChildrenTags: []string{"caption", "colgroup", "thead", "tfoot", "tbody", "tr"}, 1586 | attributesString: []string{"border"}, 1587 | TagFormatting: HTFComplete, 1588 | ParentTags: []string{}, 1589 | ExcludeParentTags: []string{}, 1590 | ParentContentTypes: HETFlow, 1591 | ObsoleteAttributes: []string{"summary", "align", "width", "bgcolor", "cellpadding", "cellspacing", "frame", "rules"}, 1592 | }, 1593 | HtmlElementInfo{ 1594 | 1595 | TagName: "tbody", 1596 | HtmlVersion: 3, 1597 | Obsolete: false, 1598 | ElementType: HETPhrasing, 1599 | PermittedChildrenTypes: HETNone, 1600 | PermittedChildrenTags: []string{"tr"}, 1601 | attributesString: []string{}, 1602 | TagFormatting: HTFOptionalClosing, 1603 | ParentTags: []string{"table"}, 1604 | ExcludeParentTags: []string{}, 1605 | ParentContentTypes: HETNone, 1606 | ObsoleteAttributes: 
[]string{"align", "char", "charoff", "valign"}, 1607 | }, 1608 | HtmlElementInfo{ 1609 | 1610 | TagName: "td", 1611 | HtmlVersion: 3, 1612 | Obsolete: false, 1613 | ElementType: HETPhrasing, 1614 | PermittedChildrenTypes: HETAnyContent, 1615 | PermittedChildrenTags: []string{}, 1616 | attributesString: []string{"colspan", "rowspan", "headers"}, 1617 | TagFormatting: HTFOptionalClosing, 1618 | ParentTags: []string{"tr"}, 1619 | ExcludeParentTags: []string{}, 1620 | ParentContentTypes: HETNone, 1621 | ObsoleteAttributes: []string{"scope", "abbr", "axis", "align", "width", "char", "charoff", "valign", "bgcolor", "height", "nowrap"}, 1622 | }, 1623 | HtmlElementInfo{ 1624 | 1625 | TagName: "textarea", 1626 | HtmlVersion: 3, 1627 | Obsolete: false, 1628 | ElementType: HETPhrasing, 1629 | PermittedChildrenTypes: HETText, 1630 | PermittedChildrenTags: []string{}, 1631 | attributesString: []string{"name", "disabled", "form", "readonly", "maxlength", "autofocus", "required", "placeholder", "dirname", "rows", "wrap", "cols"}, 1632 | TagFormatting: HTFComplete, 1633 | ParentTags: []string{}, 1634 | ExcludeParentTags: []string{}, 1635 | ParentContentTypes: HETPhrasing, 1636 | ObsoleteAttributes: []string{}, 1637 | }, 1638 | HtmlElementInfo{ 1639 | 1640 | TagName: "tfoot", 1641 | HtmlVersion: 3, 1642 | Obsolete: false, 1643 | ElementType: HETPhrasing, 1644 | PermittedChildrenTypes: HETNone, 1645 | PermittedChildrenTags: []string{"tr"}, 1646 | attributesString: []string{}, 1647 | TagFormatting: HTFOptionalClosing, 1648 | ParentTags: []string{"table"}, 1649 | ExcludeParentTags: []string{}, 1650 | ParentContentTypes: HETNone, 1651 | ObsoleteAttributes: []string{"align", "char", "charoff", "valign"}, 1652 | }, 1653 | HtmlElementInfo{ 1654 | 1655 | TagName: "th", 1656 | HtmlVersion: 3, 1657 | Obsolete: false, 1658 | ElementType: HETFlow, 1659 | PermittedChildrenTypes: HETAnyContent, 1660 | PermittedChildrenTags: []string{}, 1661 | attributesString: []string{"scope", "colspan", 
"rowspan", "headers"}, 1662 | TagFormatting: HTFOptionalClosing, 1663 | ParentTags: []string{"tr"}, 1664 | ExcludeParentTags: []string{}, 1665 | ParentContentTypes: HETNone, 1666 | ObsoleteAttributes: []string{"scope", "abbr", "axis", "align", "width", "char", "charoff", "valign", "bgcolor", "height", "nowrap"}, 1667 | }, 1668 | HtmlElementInfo{ 1669 | 1670 | TagName: "thead", 1671 | HtmlVersion: 3, 1672 | Obsolete: false, 1673 | ElementType: HETPhrasing, 1674 | PermittedChildrenTypes: HETNone, 1675 | PermittedChildrenTags: []string{"tr"}, 1676 | attributesString: []string{}, 1677 | TagFormatting: HTFOptionalClosing, 1678 | ParentTags: []string{"table"}, 1679 | ExcludeParentTags: []string{}, 1680 | ParentContentTypes: HETNone, 1681 | ObsoleteAttributes: []string{"align", "char", "charoff", "valign"}, 1682 | }, 1683 | HtmlElementInfo{ 1684 | 1685 | TagName: "time", 1686 | HtmlVersion: 5, 1687 | Obsolete: false, 1688 | ElementType: HETPhrasing, 1689 | PermittedChildrenTypes: HETPhrasing | HETText, 1690 | PermittedChildrenTags: []string{}, 1691 | attributesString: []string{"datetime"}, 1692 | TagFormatting: HTFComplete, 1693 | ParentTags: []string{}, 1694 | ExcludeParentTags: []string{"time"}, 1695 | ParentContentTypes: HETPhrasing, 1696 | ObsoleteAttributes: []string{}, 1697 | }, 1698 | HtmlElementInfo{ 1699 | 1700 | TagName: "title", 1701 | HtmlVersion: 3, 1702 | Obsolete: false, 1703 | ElementType: HETMeta, 1704 | PermittedChildrenTypes: HETText, 1705 | PermittedChildrenTags: []string{}, 1706 | attributesString: []string{}, 1707 | TagFormatting: HTFComplete, 1708 | ParentTags: []string{"head"}, 1709 | ExcludeParentTags: []string{}, 1710 | ParentContentTypes: HETNone, 1711 | ObsoleteAttributes: []string{}, 1712 | }, 1713 | HtmlElementInfo{ 1714 | 1715 | TagName: "tr", 1716 | HtmlVersion: 3, 1717 | Obsolete: false, 1718 | ElementType: HETPhrasing, 1719 | PermittedChildrenTypes: HETNone, 1720 | PermittedChildrenTags: []string{"td", "th"}, 1721 | attributesString: 
[]string{}, 1722 | TagFormatting: HTFOptionalClosing, 1723 | ParentTags: []string{"table", "thead", "tfoot", "tbody"}, 1724 | ExcludeParentTags: []string{}, 1725 | ParentContentTypes: HETNone, 1726 | ObsoleteAttributes: []string{"align", "char", "charoff", "valign", "bgcolor"}, 1727 | }, 1728 | HtmlElementInfo{ 1729 | 1730 | TagName: "track", 1731 | HtmlVersion: 5, 1732 | Obsolete: false, 1733 | ElementType: HETMeta, 1734 | PermittedChildrenTypes: HETNone, 1735 | PermittedChildrenTags: []string{}, 1736 | attributesString: []string{"kind", "src", "srclang", "label", "default"}, 1737 | TagFormatting: HTFSingle, 1738 | ParentTags: []string{"audio", "video"}, 1739 | ExcludeParentTags: []string{}, 1740 | ParentContentTypes: HETNone, 1741 | ObsoleteAttributes: []string{}, 1742 | }, 1743 | HtmlElementInfo{ 1744 | 1745 | TagName: "tt", 1746 | HtmlVersion: 3, 1747 | Obsolete: true, 1748 | ElementType: HETPhrasing, 1749 | PermittedChildrenTypes: HETPhrasing | HETText, 1750 | PermittedChildrenTags: []string{}, 1751 | attributesString: []string{}, 1752 | TagFormatting: HTFComplete, 1753 | ParentTags: []string{}, 1754 | ExcludeParentTags: []string{}, 1755 | ParentContentTypes: HETPhrasing, 1756 | ObsoleteAttributes: []string{}, 1757 | }, 1758 | HtmlElementInfo{ 1759 | 1760 | TagName: "u", 1761 | HtmlVersion: 3, 1762 | Obsolete: true, 1763 | ElementType: HETPhrasing, 1764 | PermittedChildrenTypes: HETPhrasing | HETText, 1765 | PermittedChildrenTags: []string{}, 1766 | attributesString: []string{}, 1767 | TagFormatting: HTFComplete, 1768 | ParentTags: []string{}, 1769 | ExcludeParentTags: []string{}, 1770 | ParentContentTypes: HETPhrasing, 1771 | ObsoleteAttributes: []string{}, 1772 | }, 1773 | HtmlElementInfo{ 1774 | 1775 | TagName: "ul", 1776 | HtmlVersion: 3, 1777 | Obsolete: false, 1778 | ElementType: HETFlow, 1779 | PermittedChildrenTypes: HETNone, 1780 | PermittedChildrenTags: []string{"li"}, 1781 | attributesString: []string{}, 1782 | TagFormatting: HTFComplete, 1783 | 
ParentTags: []string{}, 1784 | ExcludeParentTags: []string{}, 1785 | ParentContentTypes: HETFlow, 1786 | ObsoleteAttributes: []string{"type", "compact"}, 1787 | }, 1788 | HtmlElementInfo{ 1789 | 1790 | TagName: "var", 1791 | HtmlVersion: 3, 1792 | Obsolete: false, 1793 | ElementType: HETPhrasing, 1794 | PermittedChildrenTypes: HETPhrasing | HETText, 1795 | PermittedChildrenTags: []string{}, 1796 | attributesString: []string{}, 1797 | TagFormatting: HTFComplete, 1798 | ParentTags: []string{}, 1799 | ExcludeParentTags: []string{}, 1800 | ParentContentTypes: HETPhrasing, 1801 | ObsoleteAttributes: []string{}, 1802 | }, 1803 | HtmlElementInfo{ 1804 | 1805 | TagName: "video", 1806 | HtmlVersion: 5, 1807 | Obsolete: false, 1808 | ElementType: HETFlow, 1809 | PermittedChildrenTypes: HETAnyContent, 1810 | PermittedChildrenTags: []string{}, 1811 | attributesString: []string{"autoplay", "preload", "controls", "loop", "poster", "height", "width", "mediagroup", "muted", "src"}, 1812 | TagFormatting: HTFComplete, 1813 | ParentTags: []string{}, 1814 | ExcludeParentTags: []string{"a", "button"}, 1815 | ParentContentTypes: HETFlow | HETPhrasing, 1816 | ObsoleteAttributes: []string{}, 1817 | }, 1818 | HtmlElementInfo{ 1819 | 1820 | TagName: "wbr", 1821 | HtmlVersion: 3, 1822 | Obsolete: false, 1823 | ElementType: HETPhrasing, 1824 | PermittedChildrenTypes: HETNone, 1825 | PermittedChildrenTags: []string{}, 1826 | attributesString: []string{}, 1827 | TagFormatting: HTFSingle, 1828 | ParentTags: []string{}, 1829 | ExcludeParentTags: []string{}, 1830 | ParentContentTypes: HETPhrasing, 1831 | ObsoleteAttributes: []string{}, 1832 | }, 1833 | } 1834 | 1835 | } 1836 | -------------------------------------------------------------------------------- /htmlparser.go: -------------------------------------------------------------------------------- 1 | package htmlparser 2 | 3 | import ( 4 | "bytes" 5 | "html" 6 | "strconv" 7 | "strings" 8 | "unicode/utf8" 9 | //"fmt" 10 | ) 11 | 12 | const 
maxInnerTextLengthStored = 65500 13 | 14 | var ignoreTextInsideTag map[string]bool 15 | 16 | type TextCallback func(string, *HtmlElement) 17 | type ElementCallback func(*HtmlElement, bool) 18 | type EndElementCallback func(string) 19 | 20 | type HtmlParser struct { 21 | OrigHtml string 22 | origRunes []rune 23 | stop bool 24 | 25 | textCallback TextCallback 26 | elementCallback ElementCallback 27 | endElementCallback EndElementCallback 28 | 29 | Errors []string 30 | Warnings []string 31 | 32 | Ids map[string]bool 33 | 34 | innerTextBuilder *bytes.Buffer 35 | InnerText string 36 | 37 | HasValidSyntax bool 38 | HasOnlyValidTags bool 39 | HasOnlyValidAttributes bool 40 | HasOnlyKnownTags bool 41 | HasDeprecatedAttributes bool 42 | HasDeprecatedTags bool 43 | 44 | SkipComments bool 45 | PreserveCRLFTab bool 46 | } 47 | 48 | func init() { 49 | ignoreTextInsideTag = map[string]bool{ 50 | "head": true, 51 | "html": true, 52 | "ol": true, 53 | "select": true, 54 | "table": true, 55 | "tbody": true, 56 | "thead": true, 57 | "tfoot": true, 58 | "tr": true, 59 | } 60 | } 61 | 62 | func NewParser(html string) HtmlParser { 63 | var parser HtmlParser 64 | 65 | parser.OrigHtml = html 66 | 67 | parser.SkipComments = true 68 | parser.PreserveCRLFTab = true 69 | 70 | parser.innerTextBuilder = bytes.NewBufferString("") 71 | 72 | return parser 73 | } 74 | 75 | func (parser *HtmlParser) Parse(textCallback TextCallback, elementCallback ElementCallback, endElementCallback EndElementCallback) bool { 76 | if parser.stop { 77 | return false 78 | } 79 | 80 | parser.textCallback = textCallback 81 | parser.elementCallback = elementCallback 82 | parser.endElementCallback = endElementCallback 83 | 84 | parser.HasValidSyntax = true 85 | parser.HasOnlyValidTags = true 86 | parser.HasOnlyValidAttributes = true 87 | parser.HasDeprecatedAttributes = false 88 | parser.HasDeprecatedTags = false 89 | parser.HasOnlyKnownTags = true 90 | 91 | parser.Errors = make([]string, 0) 92 | parser.Warnings = 
make([]string, 0) 93 | parser.Ids = make(map[string]bool, 1) 94 | 95 | if parser.OrigHtml == "" { 96 | return true 97 | } 98 | 99 | if strings.Index(parser.OrigHtml, "<") < 0 { 100 | parser.callText(parser.OrigHtml, nil) 101 | } else { 102 | parser.internalParse() 103 | } 104 | 105 | parser.InnerText = html.UnescapeString(parser.innerTextBuilder.String()) 106 | parser.stop = true 107 | 108 | return parser.HasValidSyntax 109 | } 110 | 111 | func (parser *HtmlParser) IsValidStrictHTML401() bool { 112 | return parser.HasValidSyntax && parser.HasOnlyValidTags && parser.HasOnlyValidAttributes 113 | } 114 | 115 | func (parser *HtmlParser) IsValidStrictHTMLNoDeprecated() bool { 116 | return parser.HasValidSyntax && parser.HasOnlyValidTags && parser.HasOnlyValidAttributes && !parser.HasDeprecatedAttributes && !parser.HasDeprecatedTags 117 | } 118 | 119 | func (parser *HtmlParser) IsValidHTML401() bool { 120 | return parser.HasValidSyntax && parser.HasOnlyValidTags && parser.HasOnlyValidAttributes 121 | } 122 | 123 | func (parser *HtmlParser) Stop() { 124 | parser.stop = true 125 | } 126 | 127 | func (p *HtmlParser) callText(text string, parent *HtmlElement) { 128 | 129 | if text == "" { 130 | return 131 | } 132 | 133 | if !p.PreserveCRLFTab { 134 | if !hasContent(text) { 135 | return 136 | } 137 | } 138 | 139 | if parent != nil && parent.ElementInfo != nil { 140 | var childrenTypes = parent.ElementInfo.PermittedChildrenTypes 141 | if (childrenTypes & (HETText | HETNRCharData)) == 0 { 142 | p.addWarning("Text node inside a " + parent.TagName + " element is not valid") 143 | } 144 | } 145 | 146 | if parent != nil { 147 | _, present := ignoreTextInsideTag[parent.TagNameNS] 148 | if present { 149 | return 150 | } 151 | } 152 | 153 | clearText := !p.PreserveCRLFTab 154 | 155 | if parent != nil { 156 | switch parent.TagNameNS { 157 | case "pre": 158 | clearText = false 159 | case "script": 160 | clearText = false 161 | case "style": 162 | clearText = false 163 | } 164 | } 165 | 
166 | if clearText && strings.HasPrefix(text, ""), p+4) 332 | if ec == -1 { 333 | hp.HasValidSyntax = false 334 | fatal = true 335 | hp.addError("Missing end comment -->") 336 | break 337 | } 338 | //fmt.Printf("2-[%v:%v]\n", p, ec + 3) 339 | text = string(hp.origRunes[p : ec+3]) 340 | if !hp.SkipComments { 341 | hp.callText(text, nil) 342 | if hp.stop { 343 | return 344 | } 345 | } 346 | p += utf8.RuneCountInString(text) 347 | last = p 348 | p-- 349 | continue 350 | } 351 | // Looks like a doctype 352 | e2 := strings.ToLower(elem) 353 | //fmt.Printf("351-e2=%v\n", e2) 354 | if strings.HasPrefix(e2, " for closing tag: " + he.OriginalOpenTag) 414 | break 415 | } 416 | cp += utf8.RuneCountInString(elem) 417 | last = cp 418 | endTag = parseClosingTag(elem) 419 | if endTag == he.TagNameNS { 420 | break 421 | } 422 | endTag = "" 423 | } 424 | if endTag == "" { 425 | p = l 426 | break 427 | } 428 | p = cp - 1 429 | 430 | if hp.elementCallback != nil { 431 | hp.elementCallback(he, false) 432 | if hp.stop { 433 | return 434 | } 435 | } 436 | 437 | //fmt.Printf("3-[%v:%v]\n", startScript, endScript) 438 | hp.callText(string(hp.origRunes[startScript:endScript]), he) 439 | if hp.stop { 440 | return 441 | } 442 | 443 | if hp.endElementCallback != nil { 444 | hp.endElementCallback(he.TagNameNS) 445 | if hp.stop { 446 | return 447 | } 448 | } 449 | continue 450 | } 451 | 452 | // We consider this a single element if 453 | // 1) the ElementInfo.Single is flagged 454 | // 2) It is an unknown element (but not in any namespace) 455 | if he.ElementInfo == nil { 456 | hp.HasOnlyKnownTags = false 457 | // Unknown HTML 4.01 tag 458 | if !he.HasNamespace { 459 | hp.addWarning("Unknown tag: " + he.TagNameNS) 460 | // Really unknown and invalid tag 461 | if he.XmlEmptyTag { 462 | if hp.elementCallback != nil { 463 | hp.elementCallback(he, true) 464 | if hp.stop { 465 | return 466 | } 467 | } 468 | } else { 469 | if hp.elementCallback != nil { 470 | hp.elementCallback(he, false) 471 | if 
hp.stop { 472 | return 473 | } 474 | } 475 | if hp.HasValidSyntax { 476 | openedTags = append(openedTags, he) 477 | } 478 | } 479 | } else { 480 | // it is unknown, but correctly declared in an XML namespace 481 | if he.XmlEmptyTag { 482 | if hp.elementCallback != nil { 483 | hp.elementCallback(he, true) 484 | if hp.stop { 485 | return 486 | } 487 | } 488 | } else { 489 | if hp.elementCallback != nil { 490 | hp.elementCallback(he, false) 491 | if hp.stop { 492 | return 493 | } 494 | } 495 | if hp.HasValidSyntax { 496 | openedTags = append(openedTags, he) 497 | } 498 | } 499 | } 500 | } else { 501 | if he.ElementInfo.Obsolete { 502 | hp.addWarning("Deprecated Tag: " + he.TagNameNS) 503 | hp.HasDeprecatedTags = true 504 | } 505 | 506 | // It's known tag 507 | if he.ElementInfo.TagFormatting == HTFSingle || he.XmlEmptyTag { 508 | if hp.elementCallback != nil { 509 | hp.elementCallback(he, true) 510 | if hp.stop { 511 | return 512 | } 513 | } 514 | } else { 515 | if hp.HasValidSyntax { 516 | if he.ElementInfo.ElementType == HETFlow { 517 | // Some Tags have optional closing (like LI or TD or P) 518 | // We assume an automatic closing for these tags on the following situation: 519 | // 1) Current element is block-level, and 520 | // 2) Parent node is also a block-level and supports optional closing 521 | // 3) Current element is the same class as parent element 522 | // or Current element is the closing tag of the parent element 523 | if blockParent != nil && blockParent.ElementInfo.TagFormatting == HTFOptionalClosing { 524 | if parent != blockParent { 525 | hp.addWarning("Invalid parent for " + blockParent.TagName + " (inside of " + parent.TagName + ")") 526 | } else { 527 | if he.TagName == blockParent.TagName { 528 | if hp.endElementCallback != nil { 529 | hp.endElementCallback(parent.TagNameNS) 530 | if hp.stop { 531 | return 532 | } 533 | } 534 | openedTags = openedTags[:len(openedTags)-1] 535 | openedBlocks = openedBlocks[:len(openedBlocks)-1] 536 | } 537 | } 538 
| } 539 | if hp.HasValidSyntax { 540 | openedBlocks = append(openedBlocks, he) 541 | } 542 | } 543 | if hp.HasValidSyntax { 544 | openedTags = append(openedTags, he) 545 | } 546 | } 547 | if hp.elementCallback != nil { 548 | hp.elementCallback(he, false) 549 | } 550 | 551 | } 552 | } 553 | 554 | } 555 | } // for loop 556 | 557 | //fmt.Printf("554-Out\n") 558 | 559 | if !fatal { 560 | // commit the last piece of text 561 | parent = nil 562 | //fmt.Printf("559\n") 563 | if hp.HasValidSyntax && len(openedTags) > 0 { 564 | parent = openedTags[len(openedTags)-1] 565 | } 566 | 567 | //fmt.Printf("564\n") 568 | if last < l { 569 | //fmt.Printf("564-[%v:]\n", last) 570 | text = string(hp.origRunes[last:]) 571 | hp.callText(text, parent) 572 | if hp.stop { 573 | return 574 | } 575 | } 576 | 577 | //fmt.Printf("574\n") 578 | if hp.HasValidSyntax { 579 | //fmt.Printf("576-openedTags: %v %v\n", len(openedTags), openedTags) 580 | for len(openedTags) > 0 { 581 | parent = openedTags[len(openedTags)-1] 582 | if parent.ElementInfo == nil || parent.ElementInfo.TagFormatting != HTFOptionalClosing { 583 | break 584 | } 585 | if hp.endElementCallback != nil { 586 | hp.endElementCallback(parent.TagNameNS) 587 | if hp.stop { 588 | return 589 | } 590 | } 591 | openedTags = openedTags[:len(openedTags)-1] 592 | blockParent = nil 593 | if len(openedBlocks) > 0 { 594 | blockParent = openedBlocks[len(openedBlocks)-1] 595 | } 596 | if parent == blockParent { 597 | openedBlocks = openedBlocks[:len(openedBlocks)-1] 598 | } 599 | } 600 | } 601 | //fmt.Printf("598\n") 602 | } 603 | 604 | //fmt.Printf("596-OUt\n") 605 | 606 | if hp.HasValidSyntax { 607 | if len(openedBlocks) > 0 { 608 | if len(openedTags) != len(openedBlocks) { 609 | hp.HasValidSyntax = false 610 | hp.addError("Missing " + strconv.Itoa(len(openedTags)) + " tag(s) closing.") 611 | } else { 612 | for len(openedBlocks) > 0 { 613 | blockParent = openedBlocks[len(openedBlocks)-1] 614 | openedBlocks = openedBlocks[:len(openedBlocks)-1] 
615 | parent = openedTags[len(openedTags)-1] 616 | openedTags = openedTags[:len(openedTags)-1] 617 | if parent != blockParent { 618 | hp.HasValidSyntax = false 619 | hp.addError("Missing a close tag for a block-element. Opened Tag: " + parent.TagNameNS) 620 | break 621 | } 622 | if hp.endElementCallback != nil { 623 | hp.endElementCallback(parent.TagNameNS) 624 | if hp.stop { 625 | return 626 | } 627 | } 628 | } 629 | 630 | } 631 | } else if len(openedTags) > 0 { 632 | hp.addError("Missing " + strconv.Itoa(len(openedTags)) + " tag(s) closing.") 633 | hp.HasValidSyntax = false 634 | } 635 | } 636 | } 637 | 638 | func (hp *HtmlParser) unwindForClose(tag string, openedTags, openedBlocks *[]*HtmlElement) { 639 | var parent, blockParent *HtmlElement 640 | if len(*openedTags) > 0 { 641 | parent = (*openedTags)[len(*openedTags)-1] 642 | } 643 | 644 | if parent == nil { 645 | hp.HasValidSyntax = false 646 | hp.addError("Closing tag without opening: " + tag) 647 | return 648 | } 649 | //fmt.Printf("637-Parent:%v\n", parent) 650 | 651 | firstParent := parent.TagNameNS 652 | 653 | if len(*openedBlocks) > 0 { 654 | blockParent = (*openedBlocks)[len(*openedBlocks)-1] 655 | } 656 | 657 | //fmt.Printf("645-openTags: %v | openedBlocks: %v\n", *openedTags, *openedBlocks) 658 | 659 | for parent != nil { 660 | if parent.TagNameNS == tag { 661 | *openedTags = (*openedTags)[:len(*openedTags)-1] 662 | //fmt.Printf("648-openTags: %v | openedBlocks: %v\n", *openedTags, *openedBlocks) 663 | if blockParent != nil && blockParent.TagNameNS == tag { 664 | *openedBlocks = (*openedBlocks)[:len(*openedBlocks)-1] 665 | } 666 | return 667 | } 668 | 669 | if parent.ElementInfo == nil { 670 | break // mismatch 671 | } 672 | 673 | // This could be either a tag mismatch, or an optional element missing 674 | if parent.ElementInfo.TagFormatting != HTFOptionalClosing { 675 | break // mismatch 676 | } 677 | 678 | // inject the optional closing tag 679 | if hp.endElementCallback != nil { 680 | 
hp.endElementCallback(parent.TagNameNS) 681 | if hp.stop { 682 | return 683 | } 684 | } 685 | 686 | if len(*openedTags) == 0 { 687 | break 688 | } 689 | *openedTags = (*openedTags)[:len(*openedTags)-1] 690 | if blockParent == parent { 691 | *openedBlocks = (*openedBlocks)[:len(*openedBlocks)-1] 692 | blockParent = nil 693 | if len(*openedBlocks) > 0 { 694 | blockParent = (*openedBlocks)[len(*openedBlocks)-1] 695 | } 696 | } 697 | parent = parent.Parent 698 | } 699 | 700 | hp.addError("Tag mismatch. Open tag: " + firstParent + " / Closing tag: " + tag) 701 | hp.HasValidSyntax = false 702 | } 703 | 704 | func (hp *HtmlParser) addError(error string) { 705 | hp.Errors = append(hp.Errors, error) 706 | } 707 | 708 | func (hp *HtmlParser) addWarning(wrn string) { 709 | hp.Warnings = append(hp.Warnings, wrn) 710 | } 711 | 712 | func (hp *HtmlParser) getElementString(startPos int) string { 713 | var c rune 714 | endElem := 0 715 | l := len(hp.origRunes) 716 | p := startPos 717 | for ; p < l; p++ { 718 | c = hp.origRunes[p] 719 | if c == '>' { 720 | endElem = p 721 | break 722 | } 723 | if c == '"' || c == '\'' { 724 | p = runesIndexRunesStart(hp.origRunes, []rune{c}, p+1) 725 | if p == -1 { 726 | // Not well formed HTML:
    40 { 743 | logString = logString[0:40] 744 | } 745 | hp.addError("Can't find > for tag: " + string(logString)) 746 | return "" 747 | } 748 | 749 | return string(hp.origRunes[startPos : endElem+1]) 750 | 751 | } 752 | -------------------------------------------------------------------------------- /htmlparser_test.go: -------------------------------------------------------------------------------- 1 | package htmlparser 2 | 3 | import ( 4 | "bytes" 5 | "html" 6 | "strings" 7 | //"fmt" 8 | "testing" 9 | ) 10 | 11 | func Test__SimpleSegments(t *testing.T) { 12 | testSegments(t, true, []string{ 13 | "", 14 | "text only", 15 | "text only with > entities", 16 | "", 17 | "bold", 18 | "abold", 19 | "aboldb", 20 | "bolditalic-boldbold", 21 | }) 22 | } 23 | 24 | func Test_OptionalClosing(t *testing.T) { 25 | testSegments(t, true, 26 | []string{ 27 | "

    ", 28 | "a

    ", 29 | "a

    b", 30 | "a

    b

    c", 31 | "a

    b
    c

    ", 32 | "a

    b
    c", 33 | "a

    b

    c

    ", 34 | "a

    b

    c", 35 | "a

    b

    c", 36 | "a

    bcd

    e

    ", 37 | "a

    bcd

    e", 38 | "a

    bcd

    e", 39 | }) 40 | } 41 | 42 | func Test_OptionalClosingWithBlockElement(t *testing.T) { 43 | testSegments(t, true, []string{ 44 | "

    • a
    ", 45 | "
    • a
      b
    ", 46 | "
    • a
      b
    ", 47 | "
    • a
    ", 48 | "
    • a
      b
    ", 49 | "
    • a
      b
    ", 50 | }) 51 | } 52 | 53 | func Test_Scripts(t *testing.T) { 54 | testSegments(t, true, []string{ 55 | "", 56 | "", 57 | "abcdef", 58 | }) 59 | } 60 | 61 | func Test_Comments(t *testing.T) { 62 | testSegments(t, true, []string{ 63 | "", 64 | "", 65 | "abcdghij", 66 | }) 67 | } 68 | 69 | func Test_Table(t *testing.T) { 70 | testSegments(t, true, []string{ 71 | "
    a
    ", 72 | "
    a
    ", 73 | "

    a

    ", 74 | }) 75 | } 76 | 77 | func Test_CompleteHtml(t *testing.T) { 78 | testSegments(t, true, []string{ 79 | "titlebody", 80 | "hellobody", 81 | `titlebody`, 82 | }) 83 | } 84 | 85 | func Test_SingleTags(t *testing.T) { 86 | testSegments(t, true, []string{ 87 | "
    ", 88 | "
    ", 89 | "
    ", 90 | "
    ", 91 | "< br />", 92 | "< br / >", 93 | "
    ", 94 | "
    ", 95 | "
    ", 96 | "
    ", 97 | }) 98 | } 99 | 100 | func Test_Attributes(t *testing.T) { 101 | testSegments(t, true, []string{ 102 | "
    ", 103 | "a", 104 | "a", 105 | "a", 106 | "a", 107 | }) 108 | 109 | } 110 | 111 | func Test_StyleTag(t *testing.T) { 112 | testSegments(t, true, []string{ 113 | "", 114 | }) 115 | } 116 | 117 | func Test_InvalidSegments(t *testing.T) { 118 | testSegments(t, false, []string{ 119 | "<", 120 | "", 123 | "< >", 124 | " < > ", 125 | "", 126 | "abcde", 127 | "", 128 | "", 129 | "c"}, 144 | } 145 | 146 | for _, segment := range segments { 147 | 148 | var parser = NewParser(segment.Item2) 149 | if !parser.Parse(nil, nil, nil) { 150 | t.Error() 151 | } 152 | if segment.Item1 != parser.InnerText { 153 | t.Error(segment.Item1) 154 | } 155 | 156 | } 157 | } 158 | 159 | func Test_PreserveComments(t *testing.T) { 160 | segment := "abcd" 161 | parser := NewParser(segment) 162 | parser.SkipComments = false 163 | if !parser.Parse(nil, nil, nil) { 164 | t.Error() 165 | } 166 | if parser.InnerText != "abcd" { 167 | t.Error() 168 | } 169 | 170 | } 171 | 172 | func Test_ComplexHtml(t *testing.T) { 173 | parser := NewParser(googleHomepage) 174 | parser.Parse(nil, nil, nil) 175 | if !parser.HasValidSyntax { 176 | t.Error() 177 | } 178 | } 179 | 180 | func Test_CustomInnerText(t *testing.T) { 181 | segment := "abcd

    e" 182 | 183 | n := bytes.NewBufferString("") 184 | 185 | parser := NewParser(segment) 186 | 187 | parser.Parse(func(text string, he *HtmlElement) { 188 | n.WriteString(text) 189 | }, nil, nil) 190 | 191 | if n.String() != "abcde" { 192 | t.Error() 193 | } 194 | 195 | } 196 | 197 | func Test_UrlAttribute(t *testing.T) { 198 | segment := ` ` 199 | 200 | foundIt := false 201 | parser := NewParser(segment) 202 | 203 | parser.Parse(nil, func(e *HtmlElement, isEmpty bool) { 204 | t.Logf("E: %v (attr=%v)\n", e.TagName, e.Attributes) 205 | if e.TagName == "link" { 206 | if len(e.Attributes) != 4 { 207 | t.Error() 208 | } 209 | if title, _ := e.GetAttributeValue("title"); title != "M-Shaped Brain » Feed" { 210 | t.Error() 211 | } 212 | if href, _ := e.GetAttributeValue("href"); href != "http://blog.calbucci.com/feed/" { 213 | t.Error() 214 | } 215 | 216 | foundIt = true 217 | } 218 | }, nil) 219 | 220 | if !foundIt { 221 | t.Error() 222 | } 223 | 224 | } 225 | 226 | func Test_FindRSSFeed(t *testing.T) { 227 | rssFeed := "" 228 | parser := NewParser(blogPost) 229 | 230 | parser.Parse(nil, func(e *HtmlElement, isEmpty bool) { 231 | if e.TagName == "link" { 232 | 233 | if ty, _ := e.GetAttributeValue("type"); ty == "application/rss+xml" { 234 | t.Logf("rss-e: %v %v\n", e.TagName, e.Attributes) 235 | rssFeed, _ = e.GetAttributeValue("href") 236 | parser.Stop() 237 | } 238 | } 239 | }, nil) 240 | 241 | t.Logf("rssFeed=%v\n", rssFeed) 242 | if rssFeed != "http://blog.calbucci.com/feed/" { 243 | t.Error() 244 | } 245 | 246 | } 247 | 248 | func Test_Idempotent(t *testing.T) { 249 | baseHtml := blogPost 250 | html1 := parseAndSerialize(baseHtml) 251 | html2 := parseAndSerialize(html1) 252 | html3 := parseAndSerialize(html2) 253 | 254 | if html1 != html2 { 255 | 256 | max := len(html1) 257 | if max > len(html2) { 258 | max = len(html2) 259 | } 260 | for i := 0; i < max; i++ { 261 | if html1[i] != html2[i] { 262 | i -= 20 263 | if i < 0 { 264 | i = 0 265 | } 266 | e := i + 30 267 
| if e > max { 268 | e = max 269 | } 270 | t.Logf("Mismatch1: %v\n", html1[i:e]) 271 | t.Logf("Mismatch2: %v\n", html2[i:e]) 272 | break 273 | } 274 | } 275 | 276 | t.Error() 277 | } 278 | if html2 != html3 { 279 | t.Error() 280 | } 281 | } 282 | 283 | func parseAndSerialize(origHtml string) string { 284 | parser := NewParser(origHtml) 285 | 286 | parser.PreserveCRLFTab = false 287 | 288 | n := bytes.NewBufferString("") 289 | 290 | parser.Parse(func(text string, parent *HtmlElement) { 291 | escaped := html.EscapeString(text) 292 | n.WriteString(escaped) 293 | }, func(parent *HtmlElement, isEmptyTag bool) { 294 | n.WriteString(parent.GetOpenTag(false, false)) 295 | }, func(closeTag string) { 296 | n.WriteString("") 297 | }) 298 | 299 | return n.String() 300 | } 301 | 302 | func Test_FindOpenGraphTags(t *testing.T) { 303 | parser := NewParser(blogPost) 304 | 305 | tags := make(map[string]string) 306 | 307 | parser.Parse(nil, func(element *HtmlElement, isEmptyTag bool) { 308 | if element.TagName == "meta" { 309 | ogName, _ := element.GetAttributeValue("property") 310 | if ogName == "" || !strings.HasPrefix(ogName, "og:") { 311 | return 312 | } 313 | ogValue, _ := element.GetAttributeValue("content") 314 | tags[ogName] = ogValue 315 | } 316 | }, nil) 317 | 318 | if !parser.HasValidSyntax { 319 | t.Error() 320 | } 321 | 322 | if v, _ := tags["og:type"]; v != "article" { 323 | t.Error() 324 | } 325 | 326 | if v, _ := tags["og:url"]; v != "http://blog.calbucci.com/2015/01/27/attention-cannibalization/" { 327 | t.Error() 328 | } 329 | 330 | } 331 | 332 | func testSegments(t *testing.T, result bool, segments []string) { 333 | for _, segment := range segments { 334 | t.Logf("Processing: %v\n", segment) 335 | parser := NewParser(segment) 336 | if parser.Parse(nil, nil, nil) != result { 337 | t.Errorf("Failed to parse segment: " + segment) 338 | } 339 | } 340 | } 341 | --------------------------------------------------------------------------------