\n | <\/td>\n | $text<\/td>\n <\/tr>\n <\/tbody>\n<\/table>","\n \n <\/tr>\n \n $text\n <\/tr>\n <\/tbody>\n<\/table>","\n \n \n \n <\/colgroup>$text\n<\/table>","$text","$text","$text","$text","$text","$text","$text<\/a>","$text"," $text","$text<\/h1>","<\/h1>"]
--------------------------------------------------------------------------------
/hui/results_parsers/JS_HTMLPARSER2.json:
--------------------------------------------------------------------------------
1 | ["\">","\">","","\">","\">","\">","","$text","$text","$text","$text","$text","","","","","","","$text | ","$text | ","$text","$text","$text","$text ","$text","$text"," $text","$text",""]
--------------------------------------------------------------------------------
/hui/generated_payloads.json:
--------------------------------------------------------------------------------
1 | ["\">", "", "", "\">", "\">", "\">", "", "$text", "$text", "$text", "$text", "$text", "", "", "$text", " | $text | ", "|
$text ", "$text", "| $text | ", "$text | ", "$text", "$text", "$text", "$text ", "$text", "$text", " $text", "$text", ""]
--------------------------------------------------------------------------------
/hui/results_parsers/PYTHON_HTML.json:
--------------------------------------------------------------------------------
1 | ["", "", "", "", "\">", "", "", "$text", "$text", "$text", "$text", "$text", "", "", "", "", "", "", "$text | ", "$text | ", "$text", "$text", "$text", "$text ", "$text", "$text", " $text", "$text", ""]
--------------------------------------------------------------------------------
/hui/results_parsers/JS_JSXSS.json:
--------------------------------------------------------------------------------
1 | ["<xmp></xmp>","<textarea></textarea>","<noscript></noscript>","<noembed></noembed>","<style></style>","<plaintext></plaintext>","<select></select>","$text","$text","$text","$text","$text","<form><form>$text</form></form>","","$text"," | $text | ","|
$text ","$text","| $text | ","$text | ","$text","$text","$text","$text ","$text","<wbr>$text</wbr>"," $text","$text",""]
--------------------------------------------------------------------------------
/hui.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | LICENSE.txt
2 | README.md
3 | pyproject.toml
4 | setup.cfg
5 | setup.py
6 | hui/ALLOWED_ATTRS.py
7 | hui/ALLOWED_TAGS.py
8 | hui/CustomParser.py
9 | hui/Generator.py
10 | hui/ParserBase.py
11 | hui/ParserPayload.py
12 | hui/__init__.py
13 | hui/generated_payloads.json
14 | hui/identify.py
15 | hui.egg-info/PKG-INFO
16 | hui.egg-info/SOURCES.txt
17 | hui.egg-info/dependency_links.txt
18 | hui.egg-info/top_level.txt
19 | hui/parsers/GO_HTML.py
20 | hui/parsers/GO_bluemonday.py
21 | hui/parsers/JAVA_JSOUP.py
22 | hui/parsers/JS_DOM.py
23 | hui/parsers/JS_DOMPURIFY.py
24 | hui/parsers/JS_HTMLPARSER2.py
25 | hui/parsers/JS_JSXSS.py
26 | hui/parsers/JS_SANITIZE_HTML.py
27 | hui/parsers/PYTHON_HTML.py
28 | hui/parsers/PYTHON_HTML_SANITIZER.py
29 | hui/parsers/PYTHON_LXML_HTML.py
30 | hui/parsers/__init__.py
31 | hui/parsers/simple_parser.py
32 | hui/results_parsers/GO_BLUEMONDAY.json
33 | hui/results_parsers/GO_HTML.json
34 | hui/results_parsers/JAVA_JSOUP.json
35 | hui/results_parsers/JSDOM_HTML.json
36 | hui/results_parsers/JS_DOMPURIFY.json
37 | hui/results_parsers/JS_HTMLPARSER2.json
38 | hui/results_parsers/JS_JSXSS.json
39 | hui/results_parsers/JS_SANITIZE_HTML.json
40 | hui/results_parsers/PYTHON_HTML.json
41 | hui/results_parsers/PYTHON_HTML_SANITIZE.json
42 | hui/results_parsers/PYTHON_LXML_HTML.json
--------------------------------------------------------------------------------
/hui/parsers/generators/JSOUP/src/main/java/com/example/Main.java:
--------------------------------------------------------------------------------
1 | package com.example;
2 |
3 | import org.jsoup.Jsoup;
4 | import org.jsoup.nodes.Document;
5 | import org.json.simple.JSONArray;
6 | import org.json.simple.parser.JSONParser;
7 | import java.nio.file.Files;
8 | import java.nio.file.Paths;
9 | import java.io.IOException;
10 | import java.io.FileReader;
11 |
12 |
13 | public class Main {
14 | public static void generate() {
15 | try {
16 | System.out.println("Starting generation process...");
17 | JSONParser parser = new JSONParser();
18 | JSONArray arr = (JSONArray) parser.parse(new FileReader("generated_payloads.json"));
19 | JSONArray res = new JSONArray();
20 |
21 | for (int i = 0; i < arr.size(); i++) {
22 | String htmlContent = (String) arr.get(i);
23 | Document doc = Jsoup.parse(htmlContent);
24 | String bodyInnerHtml = doc.body().html();
25 | res.add(bodyInnerHtml);
26 | }
27 | Files.write(Paths.get("results_parsers/JAVA_JSOUP.json"), res.toJSONString().getBytes());
28 | } catch (Exception e) {
29 | e.printStackTrace();
30 | }
31 | }
32 |
33 | public static void main(String[] args) {
34 | generate();
35 | }
36 | }
--------------------------------------------------------------------------------
/hui/parsers/generators/JSOUP/dependency-reduced-pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | com.example
5 | java-jsoup
6 | 1.0-SNAPSHOT
7 |
8 |
9 |
10 | maven-jar-plugin
11 | 3.2.0
12 |
13 |
14 |
15 | true
16 | true
17 | com.example.Main
18 |
19 |
20 |
21 |
22 |
23 | maven-shade-plugin
24 | 3.6.0
25 |
26 |
27 | package
28 |
29 | shade
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
--------------------------------------------------------------------------------
/hui/results_parsers/PYTHON_LXML_HTML.json:
--------------------------------------------------------------------------------
1 | ["", "", "", "", "\">", "", "", "$text", "$text", "$text", "$text", "$text", "", "", "", "", "", "", "$text | ", "$text | ", "$text", "$text", "$text", "$text ", "$text", "$text", " $text", "$text", ""]
--------------------------------------------------------------------------------
/hui/parsers/generators/go_html.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/json"
5 | "fmt"
6 | "os"
7 | "strings"
8 |
9 | "golang.org/x/net/html"
10 | )
11 |
12 | func generate() {
13 | file, err := os.Open("../../generated_payloads.json")
14 | if err != nil {
15 | fmt.Println("Error opening file:", err)
16 | return
17 | }
18 | defer file.Close()
19 |
20 | var arr []string
21 | if err := json.NewDecoder(file).Decode(&arr); err != nil {
22 | fmt.Println("Error decoding JSON:", err)
23 | return
24 | }
25 |
26 | var res []string
27 | for _, payload := range arr {
28 | htmlContent := fmt.Sprintf("%s", payload)
29 |
30 | doc, err := html.Parse(strings.NewReader(htmlContent))
31 | if err != nil {
32 | fmt.Println("Error parsing HTML:", err)
33 | continue
34 | }
35 |
36 | var bodyInnerHTML string
37 | var f func(*html.Node)
38 | f = func(n *html.Node) {
39 | if n.Type == html.ElementNode && n.Data == "body" {
40 | var buf strings.Builder
41 | html.Render(&buf, n)
42 | bodyInnerHTML = buf.String()
43 | }
44 | for c := n.FirstChild; c != nil; c = c.NextSibling {
45 | f(c)
46 | }
47 | }
48 | f(doc)
49 |
50 | res = append(res, strings.TrimSuffix(strings.TrimPrefix(bodyInnerHTML, ""), ""))
51 | }
52 |
53 | outputFile, err := os.Create("../../results_parsers/GO_HTML.json")
54 | if err != nil {
55 | fmt.Println("Error creating output file:", err)
56 | return
57 | }
58 | defer outputFile.Close()
59 |
60 | if err := json.NewEncoder(outputFile).Encode(res); err != nil {
61 | fmt.Println("Error encoding JSON:", err)
62 | }
63 | }
64 |
// main is the program entry point; it runs generate once. generate opens its
// input and output through relative paths, so the working directory matters.
func main() {
	generate()
}
--------------------------------------------------------------------------------
/hui/results_parsers/GO_BLUEMONDAY.json:
--------------------------------------------------------------------------------
1 | ["\u0026lt;a href=\u0026#34;https://github.com/Slonser/hui/\u0026#34;\u0026gt;\u003c/a\u003e","\u0026lt;a href=\u0026#34;https://github.com/Slonser/hui/\u0026#34;\u0026gt;\u003c/a\u003e","\u0026#34;\u0026gt;\u003c/a\u003e","\u0026#34;\u0026gt;\u003c/a\u003e","\u0026#34;\u0026gt;\u003c/a\u003e","\u0026lt;a href=\u0026#34;https://github.com/Slonser/hui/\u0026lt;/plaintext\u0026gt;\u0026#34;\u0026gt;\u0026lt;/a\u0026gt;\u0026lt;/plaintext\u0026gt;","\u003ch1\u003e\u003c/h1\u003e","\u003ch1\u003e\u003ch2\u003e$text\u003c/h2\u003e\u003c/h1\u003e","\u003ch2\u003e\u003ch3\u003e$text\u003c/h3\u003e\u003c/h2\u003e","\u003ch3\u003e\u003ch4\u003e$text\u003c/h4\u003e\u003c/h3\u003e","\u003ch4\u003e\u003ch5\u003e$text\u003c/h5\u003e\u003c/h4\u003e","\u003ch5\u003e\u003ch6\u003e$text\u003c/h6\u003e\u003c/h5\u003e","$text","\u003ctable\u003e\u003ctable\u003e$text\u003c/table\u003e\u003c/table\u003e","\u003ctable\u003e\u003ccaption\u003e\u003ccaption\u003e$text\u003c/caption\u003e\u003c/caption\u003e\u003c/caption\u003e","\u003ctable\u003e\u003ctd\u003e\u003ctd\u003e$text\u003c/td\u003e\u003c/td\u003e\u003c/td\u003e","\u003ctable\u003e\u003ctr\u003e\u003ctr\u003e$text\u003c/tr\u003e\u003c/tr\u003e\u003c/tr\u003e","\u003ctable\u003e\u003ccol\u003e\u003ccol\u003e$text\u003c/col\u003e\u003c/col\u003e\u003c/col\u003e","\u003cth\u003e$text\u003c/th\u003e","\u003ctd\u003e$text\u003c/td\u003e","\u003ctfoot\u003e$text\u003c/tfoot\u003e","\u003cthead\u003e$text\u003c/thead\u003e","\u003ctbody\u003e$text\u003c/tbody\u003e","\u003ctr\u003e$text\u003c/tr\u003e","\u003ca href=\"$href\" rel=\"nofollow\"\u003e$text\u003c/a\u003e","\u003cwbr\u003e$text\u003c/wbr\u003e","\u003chr\u003e$text\u003c/hr\u003e","\u003ch1\u003e$text\u003c/h1\u003e\u003c/h1\u003e","\u003ch1\u003e"]
2 |
--------------------------------------------------------------------------------
/hui/results_parsers/GO_HTML.json:
--------------------------------------------------------------------------------
1 | ["\u003cxmp\u003e\u003ca href=\"https://github.com/Slonser/hui/\u003c/xmp\u003e\u0026#34;\u0026gt;","\u003ctextarea\u003e\u0026lt;a href=\u0026#34;https://github.com/Slonser/hui/\u003c/textarea\u003e\u0026#34;\u0026gt;","\u003cnoscript\u003e\u003ca href=\"https://github.com/Slonser/hui/\u003c/noscript\u003e\u0026#34;\u0026gt;","\u003cnoembed\u003e\u003ca href=\"https://github.com/Slonser/hui/\u003c/noembed\u003e\u0026#34;\u0026gt;","\u003cstyle\u003e\u003ca href=\"https://github.com/Slonser/hui/\u003c/style\u003e\u0026#34;\u0026gt;","\u003cplaintext\u003e\u003ca href=\"https://github.com/Slonser/hui/\u003c/plaintext\u003e\"\u003e\u003c/a\u003e\u003c/plaintext\u003e\u003c/body\u003e\u003c/html\u003e","\u003cselect\u003e\u003c/select\u003e","\u003ch1\u003e\u003c/h1\u003e\u003ch2\u003e$text\u003c/h2\u003e","\u003ch2\u003e\u003c/h2\u003e\u003ch3\u003e$text\u003c/h3\u003e","\u003ch3\u003e\u003c/h3\u003e\u003ch4\u003e$text\u003c/h4\u003e","\u003ch4\u003e\u003c/h4\u003e\u003ch5\u003e$text\u003c/h5\u003e","\u003ch5\u003e\u003c/h5\u003e\u003ch6\u003e$text\u003c/h6\u003e","\u003cform\u003e$text\u003c/form\u003e","\u003ctable\u003e\u003c/table\u003e$text\u003ctable\u003e\u003c/table\u003e","\u003ctable\u003e\u003ccaption\u003e\u003c/caption\u003e\u003ccaption\u003e$text\u003c/caption\u003e\u003c/table\u003e","\u003ctable\u003e\u003ctbody\u003e\u003ctr\u003e\u003ctd\u003e\u003c/td\u003e\u003ctd\u003e$text\u003c/td\u003e\u003c/tr\u003e\u003c/tbody\u003e\u003c/table\u003e","$text\u003ctable\u003e\u003ctbody\u003e\u003ctr\u003e\u003c/tr\u003e\u003ctr\u003e\u003c/tr\u003e\u003c/tbody\u003e\u003c/table\u003e","$text\u003ctable\u003e\u003ccolgroup\u003e\u003ccol/\u003e\u003ccol/\u003e\u003c/colgroup\u003e\u003c/table\u003e","$text","$text","$text","$text","$text","$text","\u003ca $attribute_prefix-K=\"1\" href=\"$href\"\u003e$text\u003c/a\u003e","\u003cwbr/\u003e$text","\u003chr/\u003e$text","\u003ch1\u003e$text\u003c/h1\u003e","\u003ch1\u003e\u003c/h1\u003e"]
2 |
--------------------------------------------------------------------------------
/hui/parsers/generators/JSOUP/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | 4.0.0
6 |
7 | com.example
8 | java-jsoup
9 | 1.0-SNAPSHOT
10 |
11 |
12 |
13 |
14 | org.apache.maven.plugins
15 | maven-jar-plugin
16 | 3.2.0
17 |
18 |
19 |
20 | true
21 | true
22 | com.example.Main
23 |
24 |
25 |
26 |
27 |
28 | org.apache.maven.plugins
29 | maven-shade-plugin
30 | 3.6.0
31 |
32 |
33 | package
34 |
35 | shade
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 | org.jsoup
46 | jsoup
47 | 1.14.3
48 |
49 |
50 | com.googlecode.json-simple
51 | json-simple
52 | 1.1.1
53 |
54 |
55 |
--------------------------------------------------------------------------------
/hui/ParserPayload.py:
--------------------------------------------------------------------------------
1 | from string import Template
2 |
class ParserPayload:
    """
    A single probe for a parser, bundled with the result it is compared against.

    Attributes:
        payload (str): The actual payload to be parsed.
        expected_output (str, optional): The expected output of the parsing process.
            Defaults to None; check() passes it to string.Template, so it must be
            a string by the time check() is called.
        version (str, optional): The version of the parser or the payload. Defaults to None.
        sanitizer (str, optional): The sanitizer to be used for the payload. Defaults to None.
        parametrs (str, optional): Parameters for the parsing process. Defaults to None.
        tags (list): Tags associated with the payload.

    Methods:
        check(output, TEMPLATE_VARS): Tells whether an output is contained in the
            expected output, with and without whitespace.
        remove_whitespace(string): Removes whitespace from a given string.
    """

    def __init__(
        self,
        payload: str,
        tags,
        expected_output: str = None,
        version: str = None,
        sanitizer: str = None,
        parametrs: str = None,
    ) -> None:
        self.tags = tags
        self.payload = payload
        self.expected_output = expected_output
        self.version = version
        self.sanitizer = sanitizer
        self.parametrs = parametrs

    def check(self, output, TEMPLATE_VARS):
        """
        Compare a parser's output with the expected output.

        Both strings are first run through Template.safe_substitute, so any
        $placeholder with a value in TEMPLATE_VARS is filled in and unknown
        placeholders are left untouched. The comparison is a containment test
        (output inside expected output), tried as-is and then again with all
        whitespace removed from both sides.

        Args:
            output (str): The output to be checked against the expected output.
            TEMPLATE_VARS (dict): Values for the $placeholders.

        Returns:
            bool: True if either containment test succeeds, False otherwise.
        """
        rendered = Template(output).safe_substitute(TEMPLATE_VARS)
        reference = Template(self.expected_output).safe_substitute(TEMPLATE_VARS)
        if rendered in reference:
            return True
        return self.remove_whitespace(rendered) in self.remove_whitespace(reference)

    def remove_whitespace(self, string):
        """
        Strip every run of whitespace out of a string.

        Args:
            string (str): The string from which to remove whitespace.

        Returns:
            str: The string with all whitespace removed.
        """
        pieces = string.split()
        return "".join(pieces)
--------------------------------------------------------------------------------
/hui/ALLOWED_TAGS.py:
--------------------------------------------------------------------------------
# HTML element names, in alphabetical order, followed by one made-up name
# ("customtag") at the end.
html_tags = [
    "a", "abbr", "acronym", "address", "area", "article", "aside", "audio",
    "b", "base", "bdi", "bdo", "big", "blockquote", "body", "br", "button",
    "canvas", "caption", "center", "cite", "code", "col", "colgroup",
    "data", "datalist", "dd", "del", "details", "dfn", "dialog", "dir", "div",
    "dl", "dt",
    "em", "embed",
    "fencedframe", "fieldset", "figcaption", "figure", "font", "footer",
    "form", "frame", "frameset",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "head", "header", "hgroup", "hr", "html",
    "i", "iframe", "img", "input", "ins",
    "kbd",
    "label", "legend", "li", "link",
    "main", "map", "mark", "marquee", "menu", "meta", "meter",
    "nav", "nobr", "noembed", "noframes", "noscript",
    "object", "ol", "optgroup", "option", "output",
    "p", "param", "picture", "plaintext", "portal", "pre", "progress",
    "q",
    "rb", "rp", "rt", "rtc", "ruby",
    "s", "samp", "script", "search", "section", "select", "slot", "small",
    "source", "span", "strike", "strong", "style", "sub", "summary", "sup",
    "table", "tbody", "td", "template", "textarea", "tfoot", "th", "thead",
    "time", "title", "tr", "track", "tt",
    "u", "ul",
    "var", "video",
    "wbr",
    "xmp",
    "customtag",
]
136 |
# Some tags may only work when they are placed inside a table tag.
html_table_tags = [
    "caption", "thead", "colgroup", "col", "th", "tbody", "tr", "td", "tfoot",
]
149 |
# Copied from https://github.com/cure53/DOMPurify/blob/f1106aae5a861d1096cb57ad9a6f518b4279ea8c/src/tags.ts#L226
# NOTE: 'mprescripts' appears twice in this list; the duplicate is kept so the
# list's length and order stay exactly as they were.
mathml_tags = [
    "math", "menclose", "merror", "mfenced", "mfrac", "mglyph", "mi",
    "mlabeledtr", "mmultiscripts", "mn", "mo", "mover", "mpadded", "mphantom",
    "mroot", "mrow", "ms", "mspace", "msqrt", "mstyle", "msub", "msup",
    "msubsup", "mtable", "mtd", "mtext", "mtr", "munder", "munderover",
    "mprescripts",
    "maction", "maligngroup", "malignmark", "mlongdiv", "mscarries",
    "mscarry", "msgroup", "mstack", "msline", "msrow",
    "semantics", "annotation", "annotation-xml",
    "mprescripts",
    "none",
]
198 |
# Copied from https://github.com/cure53/DOMPurify/blob/f1106aae5a861d1096cb57ad9a6f518b4279ea8c/src/tags.ts#L123
svg_tags = [
    "animate", "color-profile", "cursor", "discard",
    "font-face", "font-face-format", "font-face-name", "font-face-src",
    "font-face-uri",
    "foreignobject", "hatch", "hatchpath",
    "mesh", "meshgradient", "meshpatch", "meshrow",
    "missing-glyph", "script", "set", "solidcolor", "unknown", "use",
    # Names starting with "fe" (these keep their mixed-case spelling).
    "feBlend", "feColorMatrix", "feComponentTransfer", "feComposite",
    "feConvolveMatrix", "feDiffuseLighting", "feDisplacementMap",
    "feDistantLight", "feDropShadow", "feFlood",
    "feFuncA", "feFuncB", "feFuncG", "feFuncR",
    "feGaussianBlur", "feImage", "feMerge", "feMergeNode", "feMorphology",
    "feOffset", "fePointLight", "feSpecularLighting", "feSpotLight",
    "feTile", "feTurbulence",
    # Remaining names, alphabetical.
    "a", "altglyph", "altglyphdef", "altglyphitem",
    "animatecolor", "animatemotion", "animatetransform",
    "circle", "clippath", "defs", "desc", "ellipse", "filter", "font",
    "g", "glyph", "glyphref", "hkern", "image",
    "line", "lineargradient", "marker", "mask", "metadata", "mpath",
    "path", "pattern", "polygon", "polyline", "radialgradient", "rect",
    "stop", "style", "switch", "symbol",
    "text", "textpath", "title", "tref", "tspan",
    "view", "vkern",
]
--------------------------------------------------------------------------------
/hui/ParserBase.py:
--------------------------------------------------------------------------------
1 | from .ParserPayload import ParserPayload
2 | import json
3 | import os
4 |
class ParserBase:
    """
    Holds the list of parsing checks used to probe an HTML parser.

    Attributes:
        parser_name (str): The name of the parser.
        checks (list): ParserPayload objects, in the order they were added.
    """

    def __init__(self, parser_name: str, attribute_prefix='data-') -> None:
        """
        Store the parser name and register the built-in checks.

        Args:
            parser_name (str): The name of the parser.
            attribute_prefix (str): The prefix for attributes (default is 'data-').
                It is not referenced anywhere in the body as it appears here.
        """
        # NOTE(review): several payload / expected_output literals below are
        # empty or contain unbalanced fragments such as '{tag}>'. That looks like
        # markup lost when this listing was extracted, so the literals are kept
        # exactly as they appear here - confirm them against the upstream
        # repository before relying on them.
        self.parser_name = parser_name
        self.checks = []

        # Some HTML parsers don't properly resolve raw text tags
        incorrect_parsing_state_tags = ['xmp','textarea','noscript','noembed','style','plaintext']
        for tag in incorrect_parsing_state_tags:
            self.add(
                ParserPayload(f'<{tag}>{tag}>',
                              [tag,'a'],
                              expected_output=f'<{tag}><a href="https://github.com/Slonser/hui/{tag}>">')
            )

        # Some HTML parsers incorrectly handle the select tag:
        # browsers remove the nested tag, these parsers do not
        self.add(
            ParserPayload('',
                          ['select','h1'],
                          expected_output="")
        )

        # Some HTML parsers don't flatten nested header tags
        for i in range(1,6):
            self.add(
                ParserPayload(f'$text',
                              [f'h{i}',f'h{i+1}'],
                              expected_output=f"$text")
            )

        # Some HTML parsers don't resolve nested forms
        self.add(
            ParserPayload('',
                          ['form'],
                          expected_output=f"")
        )

        # Some HTML parsers don't resolve nested tables
        self.add(
            ParserPayload('',
                          ['table'],
                          expected_output=f"$text")
        )

        # Some parsers don't resolve nested table elements
        table_nested_tags = ['caption','td','tr','col']
        for tag in table_nested_tags:
            self.add(
                ParserPayload(f'<{tag}><{tag}>$text{tag}>{tag}>{tag}>',
                              ['table',tag],
                              expected_output=f"<{tag}>{tag}><{tag}>$text{tag}> ")
            )

        # Some HTML parsers don't implement the "in row" insertion mode correctly
        row_insertion_mode = ['th','td','tfoot','thead','tbody','tr']
        for tag in row_insertion_mode:
            self.add(
                ParserPayload(f'<{tag}>$text{tag}>',
                              [tag],
                              expected_output='$text')
            )

        # Python HTML parsers lowercase HTML attribute names incorrectly.
        # By default in browsers, only ASCII chars would be lowercased.
        # In python \u212a -> 0x6b
        self.add(
            ParserPayload('$text',
                          ['a'],
                          expected_output=f'$text')
        )

        # Some parsers incorrectly parse self-closing tags
        self_closing_tags = ['wbr','hr']
        for tag in self_closing_tags:
            self.add(
                ParserPayload(f'<{tag}>$text{tag}>',
                              [tag],
                              expected_output=f'<{tag}>$text')
            )

        # Some parsers handle this case incorrectly
        # TODO: should check with other tags, because h1 may be banned
        self.add(
            ParserPayload(f'$text',
                          ['h1'],
                          expected_output=f'123')
        )

        # Some sanitizers do not close tags
        self.add(
            ParserPayload(f'',
                          ['h1'],
                          expected_output=f'')
        )

    def add(self, payload):
        """
        Adds a ParserPayload to the checks list.

        Args:
            payload (ParserPayload): The payload to be added to the checks.
        """
        self.checks.append(payload)

    def add_all(self, arr):
        """
        Adds multiple ParserPayloads to the checks list, preserving their order.

        Args:
            arr (list): A list of ParserPayload objects to be added.
        """
        for x in arr:
            self.add(x)

    def generate_payloads(self):
        """
        Write the payload strings of all checks to ./generated_payloads.json.

        The file is a JSON array in the same order as self.checks. If the file
        already exists it is left untouched. The path is relative to the
        current working directory.

        Returns:
            None
        """
        if os.path.exists("./generated_payloads.json"):
            return

        payloads = [check.payload for check in self.checks]

        # The with-block guarantees the handle is flushed and closed, instead
        # of relying on the garbage collector to do it.
        with open('./generated_payloads.json', "w") as out_file:
            json.dump(payloads, out_file)

    def get_results(self):
        """
        Placeholder method for getting results.
        """
        pass
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Html Universal Identifier
2 |
3 | Html Universal Identifier is an alpha version of an application designed for identifying server-side HTML parsers. This package provides a way to determine which HTML, SVG, and MathML tags are allowed, helps to find parser features (incorrectly implemented tags), and can also help to guess which parser is used on the backend.
4 |
5 | Primarily, this library relies on the incorrectness of HTML parsing, for example, here are some classic examples:
6 | - `<select><h1></h1></select>` should be transformed to `<select></select>`
7 | - `<form><form>text</form></form>` should be transformed to `<form>text</form>`
8 |
9 | There are several reasons why you don't want to rely entirely on allowed tags:
10 | - It won't help you determine which parser your custom sanitization is based on
11 | - Allowed tags can be changed
12 |
13 | ## Features
14 |
15 | - Identify allowed HTML, SVG, and MathML tags.
16 | - Identify allowed attributes.
17 | - Identify incorrect parsing
18 | - Use a customizable handler function to process HTML payloads.
19 | - Load and compare results against predefined Parser outputs.
20 |
21 | ## Installation
22 |
23 | To install the package, use pip:
24 |
25 | ```
26 | pip install hui
27 | ```
28 |
29 | ## Usage
30 |
31 | Here is a basic example of how to use the `Identifier` class from the package:
32 |
33 | ```python
34 | from hui.identify import Identifier
35 | import requests
36 |
37 | def handler(payload):
38 | return requests.get("http://localhost:3005/sanitize",params={"html":payload}).text
39 |
40 | a = Identifier(handler=handler, buffer_enabled=False, buffer_limit=64, debug_mode=False)
41 | print(a.identify())
42 | # run all
43 | # Example output
44 | # [[1.0, 27, 'JS_SANITIZE_HTML'], [0.8148148148148148, 22, 'PYTHON_HTML_SANITIZE'], ...
45 |
46 | print(a.check_attr_allowed("href",tag="a"))
47 | # True or False
48 | print(a.INCORRECT_PARSED)
49 | # Example output
50 | # [{'output': 'govnoed', 'expected': '$text'}, .. ]
51 | print(a.ALLOWED_TAGS)
52 | # print allowed tags
53 | print(a.ATTRIBUTES)
54 | # Prints ATTRIBUTES info
55 | print(a.DEPTH_LIMITS)
56 | # Example Outputs:
57 | # (514, 'No max tags limit')
58 | # (512, 'Flattening')
59 | # (255, 'Removing')
60 | ```
61 |
62 | ## Identifier Class
63 |
64 | The `Identifier` class is the core of this package. It is responsible for identifying allowed HTML, SVG, and MathML tags based on a handler function that processes HTML payloads.
65 |
66 | The class also maintains an `INCORRECT_PARSED` list, which contains payloads that were incorrectly parsed by the handler. For example, this may include cases where the parser fails to remove nested forms and similar issues.
67 |
68 | ## Current Parsers
69 |
70 | The following parsers are currently supported in the project:
71 |
72 | - **DOMpurify with JSDOM (JS)**
73 | - **JSDOM (JS)**
74 | - **sanitize_html (JS)**
75 | - **htmlparser2 (JS)**
76 | - **JSXSS (JS)**
77 | - **html (python)**
78 | - **lxml (python)**
79 | - **html_sanitizer (python)**
80 | - **net/html (go)**
81 | - **bluemonday (go)**
82 |
83 | If you believe a new parser/sanitizer should be added, please create an issue, and I will be happy to include it.
84 | ### Constructor Parameters
85 |
86 | - **`handler`**: A function that takes a payload and returns an HTML response. Example:
87 | ```python
88 | lambda payload: requests.get(f"http://localhost:3000?payload={payload}").text
89 | ```
90 |
91 | - **`buffer_enabled`** (optional, default=False): A boolean flag to enable or disable buffering of payloads before sending them to the handler. By default, buffering is disabled, as it can sometimes lead to incorrect results. For example, some sanitizers may simply remove all input if it contains a dangerous tag. Use buffering only if the server you are interacting with has strict rate limits.
92 |
93 | - **`buffer_delimeter`** (optional, default=`TEXTTEXT `): A string used to delimit buffered payloads when sending them to the handler.
94 |
95 | - **`buffer_limit`** (optional, default=32): An integer that specifies the maximum number of payloads to buffer before sending them to the handler.
96 |
97 | - **`template_vars`** (optional, default=None): A dictionary of template variables to use for substitution in payloads.
98 |
99 | - **`debug_mode`** (optional, default=False): A boolean flag to enable or disable debug logging.
100 |
101 | ### Methods
102 |
103 | - **`check_allowed_tags()`**: Checks and populates the `ALLOWED_TAGS` dictionary with allowed tags for HTML, SVG, and MathML.
104 | - **`call_handler(template_payloads: list[str])`**: Calls the handler function with a list of template payloads and returns the processed results.
105 | - **`check_namespace(namespace: str)`**: Checks for allowed tags in the specified namespace (SVG or MathML).
106 | - **`identify()`**: Identifies the best matching Parser based on generated payloads and returns a list of matches.
107 | - **`check_allowed_attrs()`**: Checks and validates allowed attributes for HTML tags.
108 |
109 | ### identify() Method
110 |
111 | The `identify()` method checks if allowed tags have been determined. If not, it calls `check_allowed_tags()` to populate the `ALLOWED_TAGS`. It then loads a list of generated payloads from a JSON file and calls the handler for each payload. Finally, it compares the results against all JSON files in the `results_parsers` directory to count matches and returns a sorted list of results.
112 |
113 | - **Returns**: A list of tuples, each containing:
114 | - The match ratio (float)
115 | - The number of matches (int)
116 | - The name of the Parser (str)
117 |
118 | ### Attributes
119 |
120 | - **`ATTRIBUTES`**: A dictionary that holds information about allowed attributes for HTML tags, including:
121 | - `custom_attribute`: Indicates if custom attributes are allowed.
122 | - `event_attributes_blocked`: Indicates if event attributes are directly blocked.
123 | - `data_attributes`: Indicates if data attributes are allowed.
124 | - `attrs_allowed`: A nested dictionary categorizing allowed attributes into global, event and specific tags attributes.
125 |
126 | ### Allowed Tags
127 |
128 | - **`ALLOWED_TAGS`**: A dictionary that holds information about allowed tags for HTML, SVG, and MathML, including:
129 | - `html`: A list of allowed HTML tags.
130 | - `svg`: A list of allowed SVG tags.
131 | - `math`: A list of allowed MathML tags.
132 |
133 | ### Incorrectly Parsed Tags
134 |
135 | - **`INCORRECT_PARSED`**: A dictionary that holds information about incorrectly parsed tags for HTML, SVG, and MathML, including:
136 | - `html`: A list of incorrectly parsed HTML tags.
137 | - `svg`: A list of incorrectly parsed SVG tags.
138 | - `math`: A list of incorrectly parsed MathML tags.
139 |
140 | ### DEPTH_LIMITS
141 | **DEPTH_LIMITS**: A tuple that holds information about the depth limits of HTML tags, including:
142 | - `max_depth`: The maximum depth of HTML tags.
143 | - `limit_strategy`: The strategy used to handle tags exceeding the depth limit, which can be 'No max tags limit', 'Flattening', or 'Removing'.
144 |
--------------------------------------------------------------------------------
/hui.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
1 | Metadata-Version: 2.1
2 | Name: hui
3 | Version: 0.2.2
4 | Home-page: https://github.com/slonser/hui
5 | Download-URL: https://github.com/Slonser/hui/archive/v_01.tar.gz
6 | Author: Slonser
7 | Author-email: slonser@slonser.info
8 | License: MIT
9 | Keywords: HTML,hui,HTML GUESSER,HTML identifier,XSS,bugbounty
10 | Classifier: Development Status :: 3 - Alpha
11 | Classifier: Intended Audience :: Developers
12 | Classifier: Topic :: Software Development :: Build Tools
13 | Classifier: License :: OSI Approved :: MIT License
14 | Classifier: Programming Language :: Python :: 3
15 | Description-Content-Type: text/markdown
16 | License-File: LICENSE.txt
17 |
18 | # Html Universal Identifier
19 |
20 | Html Universal Identifier is an alpha version of an application designed for identifying server-side HTML parsers. This package provides a way to determine which HTML, SVG, and MathML tags are allowed, helps to find parser features (incorrectly implemented tags), and can also help to guess which parser is used on the backend.
21 |
22 | Primarily, this library relies on the incorrectness of HTML parsing, for example, here are some classic examples:
23 | - `<form><form>text</form></form>` should be transformed to `<form>text</form>`
24 | - `<h1><h2>text</h2></h1>` should be transformed to `<h1></h1><h2>text</h2>`
25 |
26 | There are several reasons why you don't want to rely entirely on allowed tags:
27 | - It won't help you determine which parser your custom sanitization is based on
28 | - Allowed tags can be changed
29 |
30 | ## Features
31 |
32 | - Identify allowed HTML, SVG, and MathML tags.
33 | - Identify allowed attributes.
34 | - Identify incorrect parsing
35 | - Use a customizable handler function to process HTML payloads.
36 | - Load and compare results against predefined Parser outputs.
37 |
38 | ## Installation
39 |
40 | To install the package, use pip:
41 |
42 | ```
43 | pip install hui
44 | ```
45 |
46 | ## Usage
47 |
48 | Here is a basic example of how to use the `Identifier` class from the package:
49 |
50 | ```python
51 | from hui.identify import Identifier
52 | import requests
53 |
54 | def handler(payload):
55 | return requests.get("http://localhost:3005/sanitize",params={"html":payload}).text
56 |
57 | a = Identifier(handler=handler, buffer_enabled=False, buffer_limit=64, debug_mode=False)
58 | print(a.identify())
59 | # run all
60 | # Example output
61 | # [[1.0, 27, 'JS_SANITIZE_HTML'], [0.8148148148148148, 22, 'PYTHON_HTML_SANITIZE'], ...
62 |
63 | print(a.check_attr_allowed("href",tag="a"))
64 | # True or False
65 | print(a.INCORRECT_PARSED)
66 | # Example output
67 | # [{'output': 'govnoed', 'expected': '$text'}, .. ]
68 | print(a.ALLOWED_TAGS)
69 | # print allowed tags
70 | print(a.ATTRIBUTES)
71 | # Prints ATTRIBUTES info
72 | print(a.DEPTH_LIMITS)
73 | # Example Outputs:
74 | # (514, 'No max tags limit')
75 | # (512, 'Flattening')
76 | # (255, 'Removing')
77 | ```
78 |
79 | ## Identifier Class
80 |
81 | The `Identifier` class is the core of this package. It is responsible for identifying allowed HTML, SVG, and MathML tags based on a handler function that processes HTML payloads.
82 |
83 | The class also maintains an `INCORRECT_PARSED` list, which contains payloads that were incorrectly parsed by the handler. For example, this may include cases where the parser fails to remove nested forms and similar issues.
84 |
85 | ## Current Parsers
86 |
87 | The following parsers are currently supported in the project:
88 |
89 | - **DOMpurify with JSDOM (JS)**
90 | - **JSDOM (JS)**
91 | - **sanitize_html (JS)**
92 | - **htmlparser2 (JS)**
93 | - **JSXSS (JS)**
94 | - **html (python)**
95 | - **lxml (python)**
96 | - **html_sanitizer (python)**
97 | - **net/html (go)**
98 | - **bluemonday (go)**
99 |
100 | If you believe a new parser/sanitizer should be added, please create an issue, and I will be happy to include it.
101 | ### Constructor Parameters
102 |
103 | - **`handler`**: A function that takes a payload and returns an HTML response. Example:
104 | ```python
105 | lambda payload: requests.get(f"http://localhost:3000?payload={payload}").text
106 | ```
107 |
108 | - **`buffer_enabled`** (optional, default=False): A boolean flag to enable or disable buffering of payloads before sending them to the handler. By default, buffering is disabled, as it can sometimes lead to incorrect results. For example, some sanitizers may simply remove all input if it contains a dangerous tag. Use buffering only if the server you are interacting with has strict rate limits.
109 |
110 | - **`buffer_delimeter`** (optional, default=`TEXTTEXT `): A string used to delimit buffered payloads when sending them to the handler.
111 |
112 | - **`buffer_limit`** (optional, default=32): An integer that specifies the maximum number of payloads to buffer before sending them to the handler.
113 |
114 | - **`template_vars`** (optional, default=None): A dictionary of template variables to use for substitution in payloads.
115 |
116 | - **`debug_mode`** (optional, default=False): A boolean flag to enable or disable debug logging.
117 |
118 | ### Methods
119 |
120 | - **`check_allowed_tags()`**: Checks and populates the `ALLOWED_TAGS` dictionary with allowed tags for HTML, SVG, and MathML.
121 | - **`call_handler(template_payloads: list[str])`**: Calls the handler function with a list of template payloads and returns the processed results.
122 | - **`check_namespace(namespace: str)`**: Checks for allowed tags in the specified namespace (SVG or MathML).
123 | - **`identify()`**: Identifies the best matching Parser based on generated payloads and returns a list of matches.
124 | - **`check_allowed_attrs()`**: Checks and validates allowed attributes for HTML tags.
125 |
126 | ### identify() Method
127 |
128 | The `identify()` method checks if allowed tags have been determined. If not, it calls `check_allowed_tags()` to populate the `ALLOWED_TAGS`. It then loads a list of generated payloads from a JSON file and calls the handler for each payload. Finally, it compares the results against all JSON files in the `results_parsers` directory to count matches and returns a sorted list of results.
129 |
130 | - **Returns**: A list of `[ratio, matches, name]` lists, sorted from best to worst match, each containing:
131 | - The match ratio (float)
132 | - The number of matches (int)
133 | - The name of the Parser (str)
134 |
135 | ### Attributes
136 |
137 | - **`ATTRIBUTES`**: A dictionary that holds information about allowed attributes for HTML tags, including:
138 | - `custom_attribute`: Indicates if custom attributes are allowed.
139 | - `event_attributes_blocked`: Indicates if event attributes are directly blocked.
140 | - `data_attributes`: Indicates if data attributes are allowed.
141 | - `attrs_allowed`: A nested dictionary categorizing allowed attributes into global, event and specific tags attributes.
142 |
143 | ### Allowed Tags
144 |
145 | - **`ALLOWED_TAGS`**: A dictionary that holds information about allowed tags for HTML, SVG, and MathML, including:
146 | - `html`: A list of allowed HTML tags.
147 | - `svg`: A list of allowed SVG tags.
148 | - `math`: A list of allowed MathML tags.
149 |
150 | ### Incorrectly Parsed Tags
151 |
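152 | - **`INCORRECT_PARSED`**: A list of dictionaries filled in by `identify()`, one for each payload whose handler output did not match the expected parse (only payloads whose tags are all allowed are checked), including:
153 | - `output`: The output actually returned by the handler, with surrounding whitespace stripped.
154 | - `expected`: The output a correct parser is expected to produce.
155 | - Example entry: `{'output': 'govnoed', 'expected': '$text'}`.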
156 |
157 | ### DEPTH_LIMITS
158 | - **`DEPTH_LIMITS`**: A tuple `(max_depth, limit_strategy)` that holds information about the depth limits of HTML tags (empty until `check_depth()` or `identify()` has run), including:
159 | - `max_depth`: The maximum depth of HTML tags.
160 | - `limit_strategy`: The strategy used to handle tags exceeding the depth limit, which can be 'No max tags limit', 'Flattening', or 'Removing'.
161 |
--------------------------------------------------------------------------------
/hui/ALLOWED_ATTRS.py:
--------------------------------------------------------------------------------
# Attribute names that may appear on any HTML element, in alphabetical order.
# NOTE(review): "data-*" is a wildcard pattern rather than a concrete attribute
# name - confirm that consumers expect it verbatim.
GLOBAL_ATTRS = [
    "accesskey", "anchor", "autocapitalize", "autocorrect", "autofocus",
    "class", "contenteditable",
    "data-*", "dir", "draggable",
    "enterkeyhint", "exportparts",
    "hidden",
    "id", "inert", "inputmode", "is",
    "itemid", "itemprop", "itemref", "itemscope", "itemtype",
    "lang",
    "nonce",
    "part", "popover",
    "slot", "spellcheck", "style",
    "tabindex", "title", "translate",
    "virtualkeyboardpolicy",
    "writingsuggestions",
]
37 |
# Event-handler attribute names ("on*"), in the original order.
# NOTE(review): "onfocus(autofocus)" and "ontoggle(popover)" look like
# descriptive labels rather than real attribute names (plain "onfocus" and
# "ontoggle" are listed too) - confirm they are meant to be kept as-is.
EVENT_ATTRS = [
    "onafterprint", "onafterscriptexecute",
    "onanimationcancel", "onanimationend", "onanimationiteration", "onanimationstart",
    "onauxclick",
    "onbeforecopy", "onbeforecut", "onbeforeinput", "onbeforeprint",
    "onbeforescriptexecute", "onbeforetoggle", "onbeforeunload",
    "onbegin", "onblur",
    "oncancel", "oncanplay", "oncanplaythrough", "onchange", "onclick", "onclose",
    "oncontentvisibilityautostatechange", "oncontextmenu",
    "oncopy", "oncuechange", "oncut",
    "ondblclick",
    "ondrag", "ondragend", "ondragenter", "ondragexit", "ondragleave",
    "ondragover", "ondragstart", "ondrop",
    "ondurationchange",
    "onend", "onended", "onerror",
    "onfocus", "onfocus(autofocus)", "onfocusin", "onfocusout",
    "onformdata", "onfullscreenchange",
    "onhashchange",
    "oninput", "oninvalid",
    "onkeydown", "onkeypress", "onkeyup",
    "onload", "onloadeddata", "onloadedmetadata", "onloadstart",
    "onmessage",
    "onmousedown", "onmouseenter", "onmouseleave", "onmousemove",
    "onmouseout", "onmouseover", "onmouseup", "onmousewheel",
    "onmozfullscreenchange",
    "onpagehide", "onpageshow", "onpaste", "onpause", "onplay", "onplaying",
    "onpointercancel", "onpointerdown", "onpointerenter", "onpointerleave",
    "onpointermove", "onpointerout", "onpointerover", "onpointerrawupdate",
    "onpointerup",
    "onpopstate", "onprogress",
    "onratechange", "onrepeat", "onreset", "onresize",
    "onscroll", "onscrollend", "onscrollsnapchange",
    "onsearch", "onseeked", "onseeking",
    "onselect", "onselectionchange", "onselectstart",
    "onshow", "onsubmit", "onsuspend",
    "ontimeupdate",
    "ontoggle", "ontoggle(popover)",
    "ontouchend", "ontouchmove", "ontouchstart",
    "ontransitioncancel", "ontransitionend", "ontransitionrun", "ontransitionstart",
    "onunhandledrejection", "onunload",
    "onvolumechange",
    "onwaiting",
    "onwebkitanimationend", "onwebkitanimationiteration", "onwebkitanimationstart",
    "onwebkitfullscreenchange",
    "onwebkitmouseforcechanged", "onwebkitmouseforcedown",
    "onwebkitmouseforceup", "onwebkitmouseforcewillbegin",
    "onwebkitplaybacktargetavailabilitychanged",
    "onwebkitpresentationmodechanged",
    "onwebkittransitionend",
    "onwebkitwillrevealbottom",
    "onwheel",
]
166 |
167 | DEFAULT_ATTRS = {
168 | "form": [
169 | "accept", "accept-charset", "action", "autocomplete", "enctype",
170 | "method", "name", "novalidate", "target"
171 | ],
172 | "input": [
173 | "accept", "alt", "autocomplete", "capture", "checked", "dirname",
174 | "disabled", "form", "formaction", "formenctype", "formmethod",
175 | "formnovalidate", "formtarget", "list", "max", "maxlength",
176 | "minlength", "min", "multiple", "name", "pattern", "placeholder",
177 | "readonly", "required", "size", "src", "step", "type", "usemap",
178 | "value", "width"
179 | ],
180 | "col": ["span"],
181 | "colgroup": ["span"],
182 | "iframe": [
183 | "allow", "csp", "name", "referrerpolicy", "sandbox", "src",
184 | "srcdoc", "width"
185 | ],
186 | "img": [
187 | "alt", "crossorigin", "decoding", "intrinsicsize", "ismap",
188 | "referrerpolicy", "sizes", "src", "srcset", "usemap", "width"
189 | ],
190 | "table": ["summary"],
191 | "td": ["colspan", "headers", "rowspan"],
192 | "th": ["colspan", "headers", "rowspan", "scope"],
193 | "area": [
194 | "alt", "coords", "download", "href", "media", "ping",
195 | "referrerpolicy", "rel", "shape", "target"
196 | ],
197 | "link": [
198 | "as", "crossorigin", "href", "hreflang", "integrity",
199 | "media", "referrerpolicy", "rel", "sizes", "type"
200 | ],
201 | "script": [
202 | "async", "crossorigin", "defer", "integrity", "language",
203 | "referrerpolicy", "src", "type"
204 | ],
205 | "select": [
206 | "autocomplete", "disabled", "form", "multiple", "name",
207 | "required", "size"
208 | ],
209 | "textarea": [
210 | "autocomplete", "cols", "dirname", "disabled", "enterkeyhint",
211 | "form", "inputmode", "maxlength", "minlength", "name",
212 | "placeholder", "readonly", "required", "rows", "wrap"
213 | ],
214 | "audio": [
215 | "autoplay", "controls", "crossorigin", "loop", "muted",
216 | "preload", "src"
217 | ],
218 | "video": [
219 | "autoplay", "controls", "crossorigin", "loop", "muted",
220 | "playsinline", "poster", "preload", "src", "width"
221 | ],
222 | "marquee": ["loop"],
223 | "object": [
224 | "data", "form", "name", "type", "usemap", "width"
225 | ],
226 | "meta": ["charset", "content", "http-equiv", "name"],
227 | "blockquote": ["cite"],
228 | "del": ["cite", "datetime"],
229 | "ins": ["cite", "datetime"],
230 | "q": ["cite"],
231 | "time": ["datetime"],
232 | "track": ["default", "kind", "label", "src", "srclang"],
233 | "button": [
234 | "disabled", "form", "formaction", "formenctype", "formmethod",
235 | "formnovalidate", "formtarget", "name", "type", "value"
236 | ],
237 | "fieldset": ["disabled", "form", "name"],
238 | "optgroup": ["disabled", "label"],
239 | "option": ["disabled", "label", "selected", "value"],
240 | "a": [
241 | "download", "href", "hreflang", "media", "ping",
242 | "referrerpolicy", "rel", "shape", "target"
243 | ],
244 | "label": ["for", "form"],
245 | "output": ["for", "form", "name"],
246 | "meter": ["form", "high", "low", "max", "min", "optimum", "value"],
247 | "progress": ["form", "max", "value"],
248 | "canvas": ["width"],
249 | "embed": ["src", "type", "width"],
250 | "base": ["href", "target"],
251 | "source": ["media", "sizes", "src", "srcset", "type"],
252 | "style": ["media", "scoped", "type"],
253 | "map": ["name"],
254 | "param": ["name", "value"],
255 | "details": ["open"],
256 | "dialog": ["open"],
257 | "ol": ["reversed", "start", "type"],
258 | "menu": ["type"],
259 | "data": ["value"],
260 | "li": ["value"]
261 | }
--------------------------------------------------------------------------------
/hui/identify.py:
--------------------------------------------------------------------------------
1 | from .ALLOWED_TAGS import *
2 | from .ALLOWED_ATTRS import *
3 | from string import Template
4 | import json
5 | import os
6 | import importlib.resources
7 | from importlib.resources import files
8 | from .parsers.simple_parser import SANITIZE_HTML
9 | import logging
10 | from .CustomParser import CustomParser
11 |
12 | class Identifier:
13 | def __init__(self, handler, buffer_enabled=False, buffer_delimeter="TEXTTEXT ", buffer_limit=32, template_vars=None, debug_mode=False) -> None:
14 | """
15 | Initializes the Identifier class with a handler function and optional parameters for buffer management, template variables, and logging.
16 |
17 | :param handler: handler function that must return text with an HTML response.
18 | Example of a handler function:
19 | lambda payload: requests.get(f"http://localhost:3000?payload={payload}").text
20 | :param buffer_enabled: Boolean indicating whether to enable buffering of payloads before sending to the handler.
21 | :param buffer_delimeter: String used to delimit payloads in the buffer.
22 | :param buffer_limit: Integer specifying the maximum number of payloads to buffer before sending to the handler.
23 | :param template_vars: Optional dictionary of template variables to use for substitution in payloads.
24 | :param debug_mode: Boolean indicating whether to enable debug logging.
25 | :return: returns nothing
26 | """
27 | self.handler = handler
28 | self.ALLOWED_TAGS = {
29 | "html": [],
30 | "svg": [],
31 | "math": [],
32 | }
33 | self.TEMPLATE_VARS = template_vars if template_vars is not None else {
34 | 'text': 'govnoed',
35 | 'href': 'https://github.com',
36 | 'attribute_prefix': 'data'
37 | }
38 |
39 | self.ALLOWED_TAGS_CHECKED = False
40 | self.DEFAULT_SANITIZER = SANITIZE_HTML()
41 |
42 | self.BUFFER = ""
43 | self.BUFFER_LIMIT = buffer_limit
44 | self.BUFFER_ENABLED = buffer_enabled
45 | self.BUFFER_DELIMETER = buffer_delimeter
46 |
47 | self.INCORRECT_PARSED = []
48 |
49 | self.DEPTH_LIMITS = ()
50 |
51 | self.ATTRIBUTES = {
52 | "custom_attribute" : None, # is custom attributes allowed
53 | "event_attributes_blocked": None, # is event attributes directly blocked
54 | "data_attributes": None, # is data attributes allowed
55 | "attrs_allowed":{
56 | "global":[], # global attributes
57 | "events":[] # events attributes
58 | }
59 | }
60 |
61 | # Configure logging based on debug_mode
62 | if debug_mode:
63 | logging.basicConfig(level=logging.DEBUG)
64 | else:
65 | logging.basicConfig(level=logging.INFO)
66 |
67 | self.logger = logging.getLogger(__name__)
68 | self.parser = CustomParser()
69 |
70 | def check_allowed_tags(self) -> dict:
71 | """
72 | Check and validate allowed HTML, SVG, and MathML tags.
73 |
74 | :return: A dictionary of allowed tags.
75 | """
76 | self.logger.debug("Checking allowed tags...")
77 | self.ALLOWED_TAGS_CHECKED = True
78 | self.check_html_namespace()
79 | self.check_namespace("math")
80 | self.check_namespace("svg")
81 |
82 | self.logger.debug("Allowed tags checked: %s", self.ALLOWED_TAGS)
83 | return self.ALLOWED_TAGS
84 |
85 | def call_handler(self, template_payloads: list[str]) -> list[str]:
86 | """
87 | Call the handler function with the provided template payloads.
88 |
89 | :param template_payloads: List of template strings to process.
90 | :return: List of processed results from the handler.
91 | """
92 | self.logger.debug("Calling handler with payloads: %s", template_payloads)
93 | for i in range(len(template_payloads)):
94 | template_payloads[i] = Template(template_payloads[i]).safe_substitute(self.TEMPLATE_VARS)
95 |
96 | if self.BUFFER_ENABLED:
97 | res = []
98 | buffer = []
99 | for payload in template_payloads:
100 | buffer.append(payload)
101 | if len(buffer) >= self.BUFFER_LIMIT:
102 | res.extend(self.handler(self.BUFFER_DELIMETER.join(buffer)).split(self.BUFFER_DELIMETER))
103 | buffer = []
104 | if buffer:
105 | res.extend(self.handler(self.BUFFER_DELIMETER.join(buffer)).split(self.BUFFER_DELIMETER))
106 | self.logger.debug("Handler results: %s", res)
107 | return res
108 |
109 | res = [self.handler(payload) for payload in template_payloads]
110 | self.logger.debug("Handler results: %s", res)
111 | return res
112 |
113 | def check_html_namespace(self) -> None:
114 | """
115 | Check and validate allowed HTML tags.
116 |
117 | :return: None
118 | """
119 | self.logger.debug("Checking HTML namespace...")
120 | arr = []
121 | for tag in html_tags:
122 | arr.append([f'<{tag}>$text{tag}>', tag])
123 |
124 | for tag in html_table_tags:
125 | arr.append([f'', tag])
126 |
127 | handler_results = self.call_handler([x[0] for x in arr])
128 | for i in range(len(handler_results)):
129 | res = handler_results[i]
130 | if f'<{arr[i][1]}' in res:
131 | self.ALLOWED_TAGS["html"].append(arr[i][1])
132 |
133 | self.logger.debug("Allowed HTML tags: %s", self.ALLOWED_TAGS["html"])
134 |
135 | def check_namespace(self, namespace: str) -> None:
136 | """
137 | Check and validate tags in the specified namespace (math or svg).
138 |
139 | :param namespace: The namespace to check (math or svg).
140 | :raises Exception: If the namespace is not supported.
141 | :return: None
142 | """
143 | self.logger.debug("Checking namespace: %s", namespace)
144 | if namespace not in self.ALLOWED_TAGS:
145 | raise Exception(f'{namespace} namespace is not supported')
146 |
147 | tag_arr = []
148 | namespace_tags = []
149 | if namespace == "math":
150 | namespace_tags = mathml_tags
151 | elif namespace == "svg":
152 | namespace_tags = svg_tags
153 |
154 | for tag in namespace_tags:
155 | tag_arr.append([f'<{namespace}><{tag}>$text{tag}>{namespace}>', tag])
156 |
157 | handler_results = self.call_handler([x[0] for x in tag_arr])
158 |
159 | for i in range(len(handler_results)):
160 | res = handler_results[i]
161 | if f'<{tag_arr[i][1]}' in res:
162 | self.ALLOWED_TAGS[namespace].append(tag_arr[i][1])
163 |
164 | self.logger.debug("Allowed tags for namespace '%s': %s", namespace, self.ALLOWED_TAGS[namespace])
165 |
166 | def check_tag_allowed(self, tag: str) -> bool:
167 | """
168 | Check if a tag is allowed.
169 |
170 | :param tag: The tag to check.
171 | :return: True if the tag is allowed, False otherwise.
172 | """
173 | return any([(tag in self.ALLOWED_TAGS[namespace]) for namespace in self.ALLOWED_TAGS])
174 |
175 | def identify(self) -> list[list[float | int | str]]:
176 | """
177 | Identify and validate tags against expected outputs.
178 |
179 | :return: A sorted list of results with match ratios and file names.
180 | """
181 | self.logger.debug("Identifying tags...")
182 | if len(self.ALLOWED_TAGS['html']) == 0:
183 | self.check_allowed_tags()
184 | self.check_allowed_attrs()
185 | self.check_depth()
186 | arr = self.DEFAULT_SANITIZER.checks
187 | res = self.call_handler([tag.payload for tag in arr])
188 | for i in range(len(res)):
189 | all_tags_allowed = all([self.check_tag_allowed(tag) for tag in arr[i].tags])
190 | if all_tags_allowed and not(arr[i].check(res[i],self.TEMPLATE_VARS)):
191 | self.logger.debug("Found incorrect parsing logic: %s, but %s is expected", res[i], arr[i].expected_output)
192 | self.INCORRECT_PARSED.append({"output": res[i].strip(), "expected": arr[i].expected_output})
193 |
194 |
195 | json_files = [f for f in importlib.resources.files('hui.results_parsers').iterdir() if f.name.endswith('.json')]
196 |
197 | result = []
198 | for json_file in json_files:
199 | with open(json_file) as f:
200 | data = json.load(f)
201 |
202 | # Count the number of matches in the JSON file
203 | matches = sum([1 for i in range(len(res)) if Template(data[i]).substitute(self.TEMPLATE_VARS).strip() in res[i].strip()])
204 | result.append([matches / len(data), matches, json_file.name.split('.')[0]])
205 |
206 | result = sorted(result, reverse=True)
207 | self.logger.debug("Identification results: %s", result)
208 | return result
209 |
210 | def check_namespace_supported(self, namespace: str) -> bool:
211 | """
212 | Check if the specified namespace is supported.
213 |
214 | :param namespace: The namespace to check.
215 | :raises Exception: If the namespace is invalid or not supported.
216 | :return: True if the namespace is supported, False otherwise.
217 | """
218 | if not self.ALLOWED_TAGS_CHECKED:
219 | self.check_allowed_tags()
220 | if namespace not in self.ALLOWED_TAGS:
221 | raise Exception('Invalid namespace name')
222 | return len(self.ALLOWED_TAGS[namespace]) > 0
223 |
224 | def check_attr_allowed(self, attr: str, tag: str = None, attr_value: str = "https://github.com/Slonser/hui") -> bool:
225 | """
226 | Checks if a given attribute is allowed for a specified tag.
227 |
228 | This method checks if a given attribute is allowed for a specified tag by simulating the parsing of HTML elements with the attribute and then checking if the attribute is present in the parsed attributes.
229 |
230 | :param attr: The attribute to check.
231 | :param tag: The tag to check the attribute for. Defaults to None, which means the first allowed HTML tag will be used.
232 | :param attr_value: The value to assign to the attribute for testing. Defaults to "https://github.com/Slonser/hui".
233 | :return: True if the attribute is allowed, False otherwise.
234 | """
235 | if tag is None:
236 | assert self.check_namespace_supported("html"), "No tags allowed"
237 | tag = self.ALLOWED_TAGS['html'][0]
238 |
239 | # Simulate parsing of HTML elements with the attribute to check
240 | res = self.call_handler([f'<{tag} {attr}="{attr_value}">{tag}>',
241 | f'<{tag}/{attr}="{attr_value}">{tag}>']) # In some situations, the attribute might only be parsed with a / symbol
242 | self.parser.check(res[0]+res[1])
243 | # Check if the attribute is present in the parsed attributes
244 | return attr in [attr_parsed[0] for attr_parsed in self.parser.found_attrs]
245 |
246 | def check_allowed_attrs(self):
247 | """
248 | Check and validate allowed attributes for HTML tags.
249 |
250 | This method checks if global attributes, event attributes, and default attributes are allowed.
251 | It updates the ATTRIBUTES dictionary with the allowed attributes and logs the results.
252 |
253 | :return: A dictionary containing the allowed attributes categorized by global, event, and specific tags.
254 | """
255 | for attr in GLOBAL_ATTRS:
256 | is_allowed = self.check_attr_allowed(attr)
257 | if is_allowed:
258 | self.ATTRIBUTES["attrs_allowed"]["global"].append(attr)
259 |
260 |
261 | for attr in EVENT_ATTRS:
262 | is_allowed = self.check_attr_allowed(attr)
263 | if is_allowed:
264 | self.ATTRIBUTES["events"]["events"].append(attr)
265 |
266 | for tag in DEFAULT_ATTRS:
267 | self.ATTRIBUTES["attrs_allowed"][tag] = []
268 | for attr in DEFAULT_ATTRS[tag]:
269 | is_allowed = self.check_attr_allowed(attr,tag=tag)
270 | if is_allowed:
271 | self.ATTRIBUTES["attrs_allowed"][tag].append(attr)
272 |
273 | self.ATTRIBUTES["data_attributes"] = self.check_attr_allowed("data-hui")
274 | if self.ATTRIBUTES["data_attributes"]:
275 | self.logger.debug("data attributes allowed")
276 |
277 | self.ATTRIBUTES["custom_attribute"] = self.check_attr_allowed("custom")
278 |
279 | if self.ATTRIBUTES["custom_attribute"]:
280 | self.logger.debug("Custom attributes allowed")
281 |
282 | self.ATTRIBUTES["event_attributes_blocked"] = not(self.check_attr_allowed("onhui"))
283 |
284 | if self.ATTRIBUTES["custom_attribute"] and self.ATTRIBUTES["event_attributes_blocked"]:
285 | self.logger.debug("Event attributes directly blocked")
286 |
287 | return self.ATTRIBUTES
288 |
289 | def check_depth(self):
290 | """
291 | Check and validate the depth of HTML tags.
292 |
293 | This method checks if the depth of HTML tags exceeds the limit and updates the DEPTH_LIMITS accordingly.
294 |
295 | :return: DEPTH_LIMITS
296 | """
297 | assert self.check_namespace_supported("html"), "No tags allowed"
298 | tag = self.ALLOWED_TAGS['html'][0]
299 | res = self.call_handler([f''*514+f' '])
300 | self.parser.check(res[0])
301 | if self.parser.max_depth > 512:
302 | self.DEPTH_LIMITS = (self.parser.max_depth, 'No max tags limit')
303 | elif self.parser.start_tags > 512:
304 | self.DEPTH_LIMITS = (self.parser.max_depth, 'Flattening')
305 | else:
306 | self.DEPTH_LIMITS = (self.parser.max_depth, 'Removing')
307 | return self.DEPTH_LIMITS
308 |
--------------------------------------------------------------------------------
|