├── hui
    ├── parsers
    │   ├── __init__.py
    │   ├── generators
    │   │   ├── JSOUP
    │   │   │   ├── target
    │   │   │   │   ├── maven-status
    │   │   │   │   │   └── maven-compiler-plugin
    │   │   │   │   │   │   └── compile
    │   │   │   │   │   │       └── default-compile
    │   │   │   │   │   │           ├── createdFiles.lst
    │   │   │   │   │   │           └── inputFiles.lst
    │   │   │   │   ├── maven-archiver
    │   │   │   │   │   └── pom.properties
    │   │   │   │   ├── java-jsoup-1.0-SNAPSHOT.jar
    │   │   │   │   ├── classes
    │   │   │   │   │   └── com
    │   │   │   │   │   │   └── example
    │   │   │   │   │   │       └── Main.class
    │   │   │   │   └── original-java-jsoup-1.0-SNAPSHOT.jar
    │   │   │   ├── src
    │   │   │   │   └── main
    │   │   │   │   │   └── java
    │   │   │   │   │       └── com
    │   │   │   │   │           └── example
    │   │   │   │   │               └── Main.java
    │   │   │   ├── dependency-reduced-pom.xml
    │   │   │   └── pom.xml
    │   │   ├── go.mod
    │   │   ├── python-html_sanitizer.py
    │   │   ├── python-html.py
    │   │   ├── python-lxml-html.py
    │   │   ├── js_jsxss.js
    │   │   ├── js_sanitize-html.js
    │   │   ├── js_htmlparser2.js
    │   │   ├── js_jsdom.js
    │   │   ├── js_dompurify.js
    │   │   ├── go.sum
    │   │   ├── go_bluemonday.go
    │   │   └── go_html.go
    │   ├── JS_DOM.py
    │   ├── GO_HTML.py
    │   ├── JS_JSXSS.py
    │   ├── JS_DOMPURIFY.py
    │   ├── JS_HTMLPARSER2.py
    │   ├── simple_parser.py
    │   ├── JS_SANITIZE_HTML.py
    │   ├── JAVA_JSOUP.py
    │   ├── PYTHON_HTML.py
    │   ├── GO_bluemonday.py
    │   ├── PYTHON_HTML_SANITIZER.py
    │   └── PYTHON_LXML_HTML.py
    ├── __init__.py
    ├── results_parsers
    │   ├── PYTHON_HTML_SANITIZE.json
    │   ├── JS_DOMPURIFY.json
    │   ├── JS_SANITIZE_HTML.json
    │   ├── JSDOM_HTML.json
    │   ├── JAVA_JSOUP.json
    │   ├── JS_HTMLPARSER2.json
    │   ├── PYTHON_HTML.json
    │   ├── JS_JSXSS.json
    │   ├── PYTHON_LXML_HTML.json
    │   ├── GO_BLUEMONDAY.json
    │   └── GO_HTML.json
    ├── Generator.py
    ├── CustomParser.py
    ├── generated_payloads.json
    ├── ParserPayload.py
    ├── ALLOWED_TAGS.py
    ├── ParserBase.py
    ├── ALLOWED_ATTRS.py
    └── identify.py
├── hui.egg-info
    ├── dependency_links.txt
    ├── top_level.txt
    ├── SOURCES.txt
    └── PKG-INFO
├── MANIFEST
├── .gitignore
├── dist
    ├── hui-0.2.2.tar.gz
    └── hui-0.2.2-py3-none-any.whl
├── pyproject.toml
├── setup.cfg
├── examples
    └── example.py
├── setup.py
├── LICENSE.txt
└── README.md


/hui/parsers/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/hui.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/hui.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | hui
2 | 


--------------------------------------------------------------------------------
/MANIFEST:
--------------------------------------------------------------------------------
1 | # file GENERATED by distutils, do NOT edit
2 | setup.py
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | node_modules
3 | package-lock.json
4 | hui.egg-info


--------------------------------------------------------------------------------
/dist/hui-0.2.2.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Slonser/hui/HEAD/dist/hui-0.2.2.tar.gz


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ['setuptools>=42']
3 | build-backend = 'setuptools.build_meta'


--------------------------------------------------------------------------------
/dist/hui-0.2.2-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Slonser/hui/HEAD/dist/hui-0.2.2-py3-none-any.whl


--------------------------------------------------------------------------------
/hui/parsers/generators/JSOUP/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst:
--------------------------------------------------------------------------------
1 | com/example/Main.class
2 | 


--------------------------------------------------------------------------------
/hui/parsers/generators/JSOUP/target/maven-archiver/pom.properties:
--------------------------------------------------------------------------------
1 | artifactId=java-jsoup
2 | groupId=com.example
3 | version=1.0-SNAPSHOT
4 | 


--------------------------------------------------------------------------------
/hui/parsers/generators/JSOUP/target/java-jsoup-1.0-SNAPSHOT.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Slonser/hui/HEAD/hui/parsers/generators/JSOUP/target/java-jsoup-1.0-SNAPSHOT.jar


--------------------------------------------------------------------------------
/hui/parsers/generators/JSOUP/target/classes/com/example/Main.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Slonser/hui/HEAD/hui/parsers/generators/JSOUP/target/classes/com/example/Main.class


--------------------------------------------------------------------------------
/hui/parsers/generators/JSOUP/target/original-java-jsoup-1.0-SNAPSHOT.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Slonser/hui/HEAD/hui/parsers/generators/JSOUP/target/original-java-jsoup-1.0-SNAPSHOT.jar


--------------------------------------------------------------------------------
/hui/parsers/generators/JSOUP/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst:
--------------------------------------------------------------------------------
1 | /Users/slonser/parser_identifier/src/parsers/generators/JSOUP/src/main/java/com/example/Main.java
2 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | include_package_data = True
4 | long_description = file: README.md
5 | long_description_content_type = text/markdown
6 | name = HTML Universal Identifier
7 | 


--------------------------------------------------------------------------------
/hui/__init__.py:
--------------------------------------------------------------------------------
1 | from .identify import Identifier
2 | from .parsers import *
3 | from .ParserPayload import ParserPayload
4 | from .ParserBase import ParserBase
5 | from .ALLOWED_TAGS import *
6 | from .Generator import generate
7 | from .CustomParser import CustomParser
8 | from .ALLOWED_ATTRS import *


--------------------------------------------------------------------------------
/hui/parsers/generators/go.mod:
--------------------------------------------------------------------------------
 1 | module generator
 2 | 
 3 | go 1.23.4
 4 | 
 5 | require (
 6 | 	github.com/aymerick/douceur v0.2.0 // indirect
 7 | 	github.com/gorilla/css v1.0.1 // indirect
 8 | 	github.com/microcosm-cc/bluemonday v1.0.27 // indirect
 9 | 	golang.org/x/net v0.26.0 // indirect
10 | )
11 | 


--------------------------------------------------------------------------------
/hui/parsers/JS_DOM.py:
--------------------------------------------------------------------------------
 1 | from ..ParserBase import ParserBase
 2 | import os
 3 | 
 4 | class SANITIZE_HTML(ParserBase):
 5 | 
 6 |     def __init__(self) -> None:
 7 |         super().__init__("JS_DOM")
 8 | 
 9 |     def get_results(self):
10 |         self.generate_payloads()
11 |         os.system("node ./parsers/generators/js_jsdom.js")
12 | 


--------------------------------------------------------------------------------
/hui/parsers/GO_HTML.py:
--------------------------------------------------------------------------------
 1 | from ..ParserBase import ParserBase
 2 | import os
 3 | 
 4 | class SANITIZE_HTML(ParserBase):
 5 | 
 6 |     def __init__(self) -> None:
 7 |         super().__init__("GO_HTML")
 8 | 
 9 |     def get_results(self):
10 |         self.generate_payloads()
11 |         os.system("cd ./parsers/generators/;go run go_html.go")
12 | 


--------------------------------------------------------------------------------
/hui/parsers/JS_JSXSS.py:
--------------------------------------------------------------------------------
 1 | from ..ParserBase import ParserBase
 2 | import os
 3 | 
 4 | class SANITIZE_HTML(ParserBase):
 5 | 
 6 |     def __init__(self) -> None:
 7 |         super().__init__("JS_DOMPURIFY")
 8 | 
 9 |     def get_results(self):
10 |         self.generate_payloads()
11 |         os.system("node ./parsers/generators/js_jsxss.js")
12 | 


--------------------------------------------------------------------------------
/hui/parsers/JS_DOMPURIFY.py:
--------------------------------------------------------------------------------
 1 | from ..ParserBase import ParserBase
 2 | import os
 3 | 
 4 | class SANITIZE_HTML(ParserBase):
 5 | 
 6 |     def __init__(self) -> None:
 7 |         super().__init__("JS_DOMPURIFY")
 8 | 
 9 |     def get_results(self):
10 |         self.generate_payloads()
11 |         os.system("node ./parsers/generators/js_dompurify.js")
12 | 


--------------------------------------------------------------------------------
/hui/parsers/JS_HTMLPARSER2.py:
--------------------------------------------------------------------------------
 1 | from ..ParserBase import ParserBase
 2 | import os
 3 | 
 4 | class SANITIZE_HTML(ParserBase):
 5 | 
 6 |     def __init__(self) -> None:
 7 |         super().__init__("JS_HTMLPARSER2")
 8 | 
 9 |     def get_results(self):
10 |         self.generate_payloads()
11 |         os.system("node ./parsers/generators/js_htmlparser2.js")
12 | 


--------------------------------------------------------------------------------
/hui/parsers/simple_parser.py:
--------------------------------------------------------------------------------
 1 | from ..ParserBase import ParserBase
 2 | import os
 3 | 
 4 | # Cheat for include basic checks into identify
 5 | # TODO: replace
 6 | class SANITIZE_HTML(ParserBase):
 7 | 
 8 |     def __init__(self) -> None:
 9 |         super().__init__("SIMPLE_PARSER")
10 |     
11 |     def get_results(self):
12 |         self.generate_payloads()


--------------------------------------------------------------------------------
/hui/parsers/JS_SANITIZE_HTML.py:
--------------------------------------------------------------------------------
 1 | from ..ParserBase import ParserBase
 2 | import os
 3 | 
 4 | class SANITIZE_HTML(ParserBase):
 5 | 
 6 |     def __init__(self) -> None:
 7 |         super().__init__("JS_SANITIZE_HTML")
 8 | 
 9 |     def get_results(self):
10 |         self.generate_payloads()
11 |         os.system("node ./parsers/generators/js_sanitize-html.js")
12 | 


--------------------------------------------------------------------------------
/hui/parsers/JAVA_JSOUP.py:
--------------------------------------------------------------------------------
 1 | from ..ParserBase import ParserBase
 2 | import os
 3 | 
 4 | class SANITIZE_HTML(ParserBase):
 5 | 
 6 |     def __init__(self) -> None:
 7 |         super().__init__("JAVA_JSOUP")
 8 | 
 9 |     def get_results(self):
10 |         self.generate_payloads()
11 |         os.system("java -jar ./parsers/generators/JSOUP/target/java-jsoup-1.0-SNAPSHOT.jar")
12 | 


--------------------------------------------------------------------------------
/hui/parsers/PYTHON_HTML.py:
--------------------------------------------------------------------------------
 1 | from ..ParserBase import ParserBase
 2 | import os
 3 | # Python HTML parser
 4 | class SANITIZE_HTML(ParserBase):
 5 | 
 6 |     def __init__(self) -> None:
 7 |         super().__init__("PYTHON_HTML")
 8 |     
 9 |     def get_results(self):
10 |         self.generate_payloads()
11 |         os.system("python ./parsers/generators/python-html.py")
12 | 


--------------------------------------------------------------------------------
/hui/parsers/GO_bluemonday.py:
--------------------------------------------------------------------------------
 1 | from ..ParserBase import ParserBase
 2 | from ..ParserPayload import ParserPayload
 3 | import os
 4 | 
 5 | class SANITIZE_HTML(ParserBase):
 6 | 
 7 |     def __init__(self) -> None:
 8 |         super().__init__("GO_bluemonday")
 9 | 
10 |     def get_results(self):
11 |         self.generate_payloads()
12 |         os.system("cd ./parsers/generators/;go run go_bluemonday.go")
13 | 


--------------------------------------------------------------------------------
/hui/parsers/PYTHON_HTML_SANITIZER.py:
--------------------------------------------------------------------------------
 1 | from ..ParserBase import ParserBase
 2 | import os
 3 | # Python HTML parser
 4 | class SANITIZE_HTML(ParserBase):
 5 | 
 6 |     def __init__(self) -> None:
 7 |         super().__init__("PYTHON_HTML_SANITIZER")
 8 |     
 9 |     def get_results(self):
10 |         self.generate_payloads()
11 |         os.system("python ./parsers/generators/python-html_sanitizer.py")
12 | 


--------------------------------------------------------------------------------
/hui/parsers/PYTHON_LXML_HTML.py:
--------------------------------------------------------------------------------
 1 | from ..ParserBase import ParserBase
 2 | import os
 3 | 
 4 | # Python LXML_HTML parser
 5 | class SANITIZE_HTML(ParserBase):
 6 | 
 7 |     def __init__(self) -> None:
 8 |         super().__init__("PYTHON_LXML_HTML")
 9 |     
10 |     def get_results(self):
11 |         self.generate_payloads()
12 |         os.system("python ./parsers/generators/python-lxml-html.py")
13 |         


--------------------------------------------------------------------------------
/hui/parsers/generators/python-html_sanitizer.py:
--------------------------------------------------------------------------------
 1 | from html_sanitizer import Sanitizer
 2 | import json 
 3 | 
 4 | def generate():
 5 |     arr = json.load(open("generated_payloads.json"))
 6 |     res = []
 7 |     sanitizer = Sanitizer()
 8 |     for payload in arr:
 9 |         html_content = f"{payload}"
10 |         sanitized_content = sanitizer.sanitize(html_content)
11 |         res.append(sanitized_content)
12 |     json.dump(res,open("results_parsers/PYTHON_HTML_SANITIZE.json","w"))
13 | 
14 | if __name__ == "__main__":
15 |     generate()


--------------------------------------------------------------------------------
/hui/parsers/generators/python-html.py:
--------------------------------------------------------------------------------
 1 | from bs4 import BeautifulSoup
 2 | import json
 3 | 
 4 | def generate():
 5 |     arr = json.load(open("generated_payloads.json"))
 6 |     res = []
 7 |     for payload in arr:
 8 |         html_content = f"<html><body>{payload}</body></html>"
 9 | 
10 |         soup = BeautifulSoup(html_content, 'html.parser')
11 | 
12 |         body_inner_html = str(soup.body)
13 |         res.append(body_inner_html[6:-7])
14 |     json.dump(res,open("results_parsers/PYTHON_HTML.json","w"))
15 | 
16 | if __name__ == "__main__":
17 |     generate()


--------------------------------------------------------------------------------
/hui/parsers/generators/python-lxml-html.py:
--------------------------------------------------------------------------------
 1 | from lxml import etree
 2 | import json
 3 | 
 4 | def generate():
 5 |     arr = json.load(open("generated_payloads.json"))
 6 |     res = []
 7 |     for payload in arr:
 8 |         html_content = f"<html><body>{payload}</body></html>"
 9 | 
10 |         parser = etree.HTMLParser()
11 |         tree = etree.fromstring(html_content, parser)
12 | 
13 |         body_inner_html = etree.tostring(tree.find('.//body'), encoding='unicode')
14 |         res.append(body_inner_html)
15 |     json.dump(res, open("results_parsers/PYTHON_LXML_HTML.json", "w"))
16 | 
17 | if __name__ == "__main__":
18 |     generate()


--------------------------------------------------------------------------------
/hui/results_parsers/PYTHON_HTML_SANITIZE.json:
--------------------------------------------------------------------------------
1 | ["<a href=\"https://github.com/Slonser/hui/&lt;/xmp&gt;\"></a>", "<a href=\"https://github.com/Slonser/hui/&lt;/textarea&gt;\"></a>", "<a href=\"https://github.com/Slonser/hui/&lt;/noscript&gt;\"></a>", "<a href=\"https://github.com/Slonser/hui/&lt;/noembed&gt;\"></a>", "\"&gt;", "<a href=\"https://github.com/Slonser/hui/&lt;/plaintext&gt;\"></a>", "", "<h1><h2>$text</h2></h1>", "<h2><h3>$text</h3></h2>", "<h3>$text</h3>", "$text", "$text", "<p>$text</p>", "$text", "$text", "$text", "$text", "$text", "$text", "$text", "$text", "$text", "$text", "$text", "<a href=\"#\">$text</a>", "$text", "<hr>$text", "<h1>$text</h1>", ""]


--------------------------------------------------------------------------------
/hui/parsers/generators/js_jsxss.js:
--------------------------------------------------------------------------------
 1 | const fs = require('fs');
 2 | const xss = require('xss');
 3 | 
 4 | function generate() {
 5 |     const arr = JSON.parse(fs.readFileSync("generated_payloads.json"));
 6 |     const res = [];
 7 |     for(let payload of arr){
 8 |         try{
 9 |             const html_content = `${payload}`;
10 |             console.log(html_content)
11 |             const sanitized_html = xss(html_content);
12 |             res.push(sanitized_html);
13 |         }catch(e){
14 |             res.push("");
15 |         }
16 |     }
17 |     fs.writeFileSync("results_parsers/JS_JSXSS.json", JSON.stringify(res));
18 | }
19 | 
20 | if (require.main === module) {
21 |     generate();
22 | }


--------------------------------------------------------------------------------
/hui/parsers/generators/js_sanitize-html.js:
--------------------------------------------------------------------------------
 1 | const fs = require('fs');
 2 | const sanitizeHtml = require('sanitize-html');
 3 | 
 4 | function generate() {
 5 |     const arr = JSON.parse(fs.readFileSync("generated_payloads.json"));
 6 |     const res = [];
 7 |     arr.forEach(payload => {
 8 |         try{
 9 |             const html_content = `${payload}`;
10 |             const sanitized_html = sanitizeHtml(html_content);
11 |             res.push(sanitized_html);
12 |         }catch{
13 |             res.push("");
14 |         }
15 |     });
16 |     fs.writeFileSync("results_parsers/JS_SANITIZE_HTML.json", JSON.stringify(res));
17 | }
18 | 
19 | if (require.main === module) {
20 |     generate();
21 | }


--------------------------------------------------------------------------------
/hui/parsers/generators/js_htmlparser2.js:
--------------------------------------------------------------------------------
 1 | const fs = require('fs');
 2 | const { parseDocument } = require('htmlparser2');
 3 | const cheerio = require('cheerio');
 4 | 
 5 | function generate() {
 6 |     const arr = JSON.parse(fs.readFileSync("generated_payloads.json"));
 7 |     const res = [];
 8 |     arr.forEach(payload => {
 9 |         const html_content = `<html><body>${payload}</body></html>`;
10 |         const dom = parseDocument(html_content);
11 |         const $ = cheerio.load(dom);
12 |         const body_inner_html = $('body').html();
13 |         res.push(body_inner_html);
14 |     });
15 |     fs.writeFileSync("results_parsers/JS_HTMLPARSER2.json", JSON.stringify(res));
16 | }
17 | 
18 | if (require.main === module) {
19 |     generate();
20 | }


--------------------------------------------------------------------------------
/hui/results_parsers/JS_DOMPURIFY.json:
--------------------------------------------------------------------------------
1 | ["\"&gt;","<textarea>&lt;a href=\"https://github.com/Slonser/hui/</textarea>\"&gt;","<a href=\"https://github.com/Slonser/hui/</noscript>\"></a>","\"&gt;","\"&gt;","","<select></select>","<h1></h1><h2>$text</h2>","<h2></h2><h3>$text</h3>","<h3></h3><h4>$text</h4>","<h4></h4><h5>$text</h5>","<h5></h5><h6>$text</h6>","<form>$text</form>","<table></table><table></table>$text","<table><caption></caption><caption>$text</caption></table>","<table><tbody><tr><td></td><td>$text</td></tr></tbody></table>","<table><tbody><tr></tr><tr></tr></tbody></table>$text","<table><colgroup><col><col></colgroup></table>$text","$text","$text","$text","$text","$text","$text","<a href=\"$href\">$text</a>","<wbr>$text","<hr>$text","<h1>$text</h1>","<h1></h1>"]


--------------------------------------------------------------------------------
/examples/example.py:
--------------------------------------------------------------------------------
 1 | from hui.identify import Identifier
 2 | import requests
 3 | 
 4 | def handler(payload):
 5 |     return requests.get("http://localhost:3005/sanitize",params={"html":payload}).text
 6 | 
 7 | a = Identifier(handler=handler, buffer_enabled=False, buffer_limit=64, debug_mode=False)
 8 | print(a.identify())
 9 | # run all
10 | print(a.check_attr_allowed("href",tag="a"))
11 | # True or False
12 | print(a.INCORRECT_PARSED)
13 | # Example output
14 | # [{'output': '<h5><h6>govnoed</h6></h5>', 'expected': '<h5></h5><h6>$text</h6>'}, .. ]
15 | print(a.ALLOWED_TAGS)
16 | # print allowed tags
17 | print(a.ATTRIBUTES)
18 | # Prints ATTRIBUTES info
19 | print(a.DEPTH_LIMITS)
20 | # Example Outputs:
21 | # (514, 'No max tags limit')
22 | # (512, 'Flattening')
23 | # (255, 'Removing')


--------------------------------------------------------------------------------
/hui/parsers/generators/js_jsdom.js:
--------------------------------------------------------------------------------
 1 | const fs = require('fs');
 2 | const { JSDOM } = require('jsdom');
 3 | 
 4 | function generate() {
 5 |     const arr = JSON.parse(fs.readFileSync("generated_payloads.json"));
 6 |     const res = [];
 7 |     for(let payload of arr){
 8 |         try{
 9 |             const html_content = `${payload}`;
10 |             const dom = new JSDOM();
11 |             dom.window.document.body.innerHTML = html_content;
12 |             const body_inner_html = dom.window.document.body.innerHTML;
13 |             res.push(body_inner_html);
14 |         }catch(e){
15 |             res.push("");
16 |         }
17 |     }
18 |     fs.writeFileSync("results_parsers/JSDOM_HTML.json", JSON.stringify(res));
19 | }
20 | 
21 | if (require.main === module) {
22 |     generate();
23 | }


--------------------------------------------------------------------------------
/hui/Generator.py:
--------------------------------------------------------------------------------
 1 | from .parsers import JS_DOM, JS_DOMPURIFY, JS_HTMLPARSER2, PYTHON_HTML, PYTHON_LXML_HTML,PYTHON_HTML_SANITIZER ,GO_HTML, JS_SANITIZE_HTML, GO_bluemonday, JS_JSXSS
 2 | 
 3 | def generate():
 4 |     parsers_list = [
 5 |         JS_DOMPURIFY.SANITIZE_HTML(),
 6 |         JS_DOM.SANITIZE_HTML(),
 7 |         JS_HTMLPARSER2.SANITIZE_HTML(),
 8 |         PYTHON_HTML.SANITIZE_HTML(),
 9 |         PYTHON_LXML_HTML.SANITIZE_HTML(),
10 |         GO_HTML.SANITIZE_HTML(),
11 |         JS_SANITIZE_HTML.SANITIZE_HTML(),
12 |         PYTHON_HTML_SANITIZER.SANITIZE_HTML(),
13 |         GO_bluemonday.SANITIZE_HTML(),
14 |         JS_JSXSS.SANITIZE_HTML()
15 |     ]
16 |     
17 |     for parser in parsers_list:
18 |         parser.get_results()
19 | 
20 | if __name__ == "__main__":
21 |     generate()


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from setuptools import setup, find_packages
 3 | 
 4 | setup(
 5 |   name = 'hui',
 6 |   packages= find_packages(),
 7 |   version = '0.2.2',
 8 |   license='MIT', 
 9 |   author = 'Slonser',
10 |   author_email = 'slonser@slonser.info',
11 |   url = 'https://github.com/slonser/hui',
12 |   download_url = 'https://github.com/Slonser/hui/archive/v_01.tar.gz',
13 |   keywords = ['HTML', 'hui', 'HTML GUESSER', "HTML identifier", "XSS", "bugbounty"],
14 |   classifiers=[
15 |     'Development Status :: 3 - Alpha',
16 |     'Intended Audience :: Developers',
17 |     'Topic :: Software Development :: Build Tools',
18 |     'License :: OSI Approved :: MIT License',
19 |     'Programming Language :: Python :: 3'
20 |   ],
21 |   package_data={'': ['generated_payloads.json','results_parsers/*.json']}
22 |   )


--------------------------------------------------------------------------------
/hui/parsers/generators/js_dompurify.js:
--------------------------------------------------------------------------------
 1 | const fs = require('fs');
 2 | const { JSDOM } = require('jsdom');
 3 | const DOMPurify = require('dompurify');
 4 | 
 5 | function generate() {
 6 |     const arr = JSON.parse(fs.readFileSync("generated_payloads.json"));
 7 |     const res = [];
 8 |     arr.forEach(payload => {
 9 |         try{
10 |             const html_content = `${payload}`;
11 |             const window = new JSDOM('').window;
12 |             const purify = DOMPurify(window);
13 |             const sanitized_html = purify.sanitize(html_content);
14 |             res.push(sanitized_html);
15 |         }catch{
16 |             res.push("");
17 |         }
18 |     });
19 |     fs.writeFileSync("results_parsers/JS_DOMPURIFY.json", JSON.stringify(res));
20 | }
21 | 
22 | if (require.main === module) {
23 |     generate();
24 | }


--------------------------------------------------------------------------------
/hui/parsers/generators/go.sum:
--------------------------------------------------------------------------------
 1 | github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk=
 2 | github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4=
 3 | github.com/gorilla/css v1.0.1 h1:ntNaBIghp6JmvWnxbZKANoLyuXTPZ4cAMlo6RyhlbO8=
 4 | github.com/gorilla/css v1.0.1/go.mod h1:BvnYkspnSzMmwRK+b8/xgNPLiIuNZr6vbZBTPQ2A3b0=
 5 | github.com/microcosm-cc/bluemonday v1.0.27 h1:MpEUotklkwCSLeH+Qdx1VJgNqLlpY2KXwXFM08ygZfk=
 6 | github.com/microcosm-cc/bluemonday v1.0.27/go.mod h1:jFi9vgW+H7c3V0lb6nR74Ib/DIB5OBs92Dimizgw2cA=
 7 | golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
 8 | golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
 9 | golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ=
10 | golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE=
11 | 


--------------------------------------------------------------------------------
/hui/results_parsers/JS_SANITIZE_HTML.json:
--------------------------------------------------------------------------------
1 | ["<a href=\"https://github.com/Slonser/hui/&lt;/xmp&gt;\"></a>","","<a href=\"https://github.com/Slonser/hui/&lt;/noscript&gt;\"></a>","<a href=\"https://github.com/Slonser/hui/&lt;/noembed&gt;\"></a>","\"&gt;","<a href=\"https://github.com/Slonser/hui/&lt;/plaintext&gt;\"></a>","<h1></h1>","<h1><h2>$text</h2></h1>","<h2><h3>$text</h3></h2>","<h3><h4>$text</h4></h3>","<h4><h5>$text</h5></h4>","<h5><h6>$text</h6></h5>","$text","<table><table>$text</table></table>","<table><caption><caption>$text</caption></caption></table>","<table><td></td><td>$text</td></table>","<table><tr></tr><tr>$text</tr></table>","<table><col></col><col></col>$text</table>","<th>$text</th>","<td>$text</td>","<tfoot>$text</tfoot>","<thead>$text</thead>","<tbody>$text</tbody>","<tr>$text</tr>","<a href=\"$href\">$text</a>","<wbr></wbr>$text","<hr />$text","<h1>$text</h1>","<h1></h1>"]


--------------------------------------------------------------------------------
/hui/results_parsers/JSDOM_HTML.json:
--------------------------------------------------------------------------------
1 | ["<xmp><a href=\"https://github.com/Slonser/hui/</xmp>\"&gt;","<textarea>&lt;a href=\"https://github.com/Slonser/hui/</textarea>\"&gt;","<noscript><a href=\"https://github.com/Slonser/hui/</noscript>\"></a></noscript>","<noembed><a href=\"https://github.com/Slonser/hui/</noembed>\"&gt;","<style><a href=\"https://github.com/Slonser/hui/</style>\"&gt;","<plaintext><a href=\"https://github.com/Slonser/hui/</plaintext>\"></a></plaintext></plaintext>","<select></select>","<h1></h1><h2>$text</h2>","<h2></h2><h3>$text</h3>","<h3></h3><h4>$text</h4>","<h4></h4><h5>$text</h5>","<h5></h5><h6>$text</h6>","<form>$text</form>","<table></table><table></table>$text","<table><caption></caption><caption>$text</caption></table>","<table><tbody><tr><td></td><td>$text</td></tr></tbody></table>","<table><tbody><tr></tr><tr></tr></tbody></table>$text","<table><colgroup><col><col></colgroup></table>$text","$text","$text","$text","$text","$text","$text","<a $attribute_prefix-K=\"1\" href=\"$href\">$text</a>","<wbr>$text","<hr>$text","<h1>$text</h1>","<h1></h1>"]


--------------------------------------------------------------------------------
/hui/parsers/generators/go_bluemonday.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"encoding/json"
 5 | 	"fmt"
 6 | 	"os"
 7 | 	"github.com/microcosm-cc/bluemonday"
 8 | )
 9 | 
10 | func generate() {
11 | 	file, err := os.Open("../../generated_payloads.json")
12 | 	if err != nil {
13 | 		fmt.Println("Error opening file:", err)
14 | 		return
15 | 	}
16 | 	defer file.Close()
17 | 
18 | 	var arr []string
19 | 	if err := json.NewDecoder(file).Decode(&arr); err != nil {
20 | 		fmt.Println("Error decoding JSON:", err)
21 | 		return
22 | 	}
23 | 
24 | 	var res []string
25 | 	for _, payload := range arr {
26 | 		p := bluemonday.UGCPolicy()
27 | 		sanitizedHTML := p.Sanitize(payload)
28 | 		res = append(res, sanitizedHTML)
29 | 	}
30 | 
31 | 	outputFile, err := os.Create("../../results_parsers/GO_BLUEMONDAY.json")
32 | 	if err != nil {
33 | 		fmt.Println("Error creating output file:", err)
34 | 		return
35 | 	}
36 | 	defer outputFile.Close()
37 | 
38 | 	if err := json.NewEncoder(outputFile).Encode(res); err != nil {
39 | 		fmt.Println("Error encoding JSON:", err)
40 | 	}
41 | }
42 | 
43 | func main() {
44 | 	generate()
45 | }


--------------------------------------------------------------------------------
/hui/CustomParser.py:
--------------------------------------------------------------------------------
 1 | from html.parser import HTMLParser
 2 | 
 3 | class CustomParser(HTMLParser):
 4 |     def __init__(self):
 5 |         super().__init__()
 6 |         self.customattr_found = False
 7 |         self.found_attrs = []
 8 |         self.found_tags = []
 9 |         self.current_depth = 0
10 |         self.max_depth = 0
11 |         self.start_tags = 0
12 | 
13 |     def handle_starttag(self, tag, attrs):
14 |         self.found_tags.append(tag)
15 |         self.found_attrs.extend(attrs)
16 |         self.current_depth += 1
17 |         self.start_tags += 1
18 |         self.max_depth = max(self.max_depth, self.current_depth)
19 | 
20 |     def handle_endtag(self, tag):
21 |         self.current_depth -= 1
22 | 
23 |     def check(self, payload):
24 |         self.found_attrs = []
25 |         self.found_tags = []
26 |         self.current_depth = 0
27 |         self.max_depth = 0
28 |         self.start_tags = 0
29 | 
30 |         self.feed(payload)
31 |          # Need to close parser to clear buffer
32 |         # TODO: Is this best solution?
33 |         self.close()
34 |         return self.max_depth


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | Copyright (c) 2024 Slonser
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | The above copyright notice and this permission notice shall be included in all
10 | copies or substantial portions of the Software.
11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
17 | SOFTWARE.
18 | 


--------------------------------------------------------------------------------
/hui/results_parsers/JAVA_JSOUP.json:
--------------------------------------------------------------------------------
1 | ["<xmp>\n &lt;a href=\"https:\/\/github.com\/Slonser\/hui\/\n<\/xmp>\"&gt;","<textarea>&lt;a href=\"https:\/\/github.com\/Slonser\/hui\/<\/textarea>\"&gt;","","<noembed>\n &lt;a href=\"https:\/\/github.com\/Slonser\/hui\/\n<\/noembed>\"&gt;","\"&gt;","<plaintext>\n &lt;a href=\"https:\/\/github.com\/Slonser\/hui\/&lt;\/plaintext&gt;\"&gt;&lt;\/a&gt;&lt;\/plaintext&gt;\n<\/plaintext>","<select><\/select>","<h1><\/h1>\n<h2>$text<\/h2>","<h2><\/h2>\n<h3>$text<\/h3>","<h3><\/h3>\n<h4>$text<\/h4>","<h4><\/h4>\n<h5>$text<\/h5>","<h5><\/h5>\n<h6>$text<\/h6>","<form>\n $text\n<\/form>","<table><\/table>\n<table>\n $text\n<\/table>","<table>\n <caption><\/caption>\n <caption>\n  $text\n <\/caption>\n<\/table>","<table>\n <tbody>\n  <tr>\n   <td><\/td>\n   <td>$text<\/td>\n  <\/tr>\n <\/tbody>\n<\/table>","<table>\n <tbody>\n  <tr><\/tr>\n  <tr>\n   $text\n  <\/tr>\n <\/tbody>\n<\/table>","<table>\n <colgroup>\n  <col>\n  <col>\n <\/colgroup>$text\n<\/table>","$text","$text","$text","$text","$text","$text","<a $attribute_prefix-k=\"1\" href=\"$href\">$text<\/a>","<wbr>$text","<hr>$text","<h1>$text<\/h1>","<h1><\/h1>"]


--------------------------------------------------------------------------------
/hui/results_parsers/JS_HTMLPARSER2.json:
--------------------------------------------------------------------------------
1 | ["<xmp><a href=\"https://github.com/Slonser/hui/</xmp>\"></a></xmp>","<textarea>&lt;a href=\"https://github.com/Slonser/hui/</textarea>\"&gt;","<noscript><a href=\"https://github.com/Slonser/hui/</noscript>\"></a></noscript>","<noembed><a href=\"https://github.com/Slonser/hui/</noembed>\"></a></noembed>","<style>&lt;a href=\"https://github.com/Slonser/hui/</style>\"&gt;","<plaintext><a href=\"https://github.com/Slonser/hui/</plaintext>\"></a></plaintext>","<select><h1></h1></select>","<h1><h2>$text</h2></h1>","<h2><h3>$text</h3></h2>","<h3><h4>$text</h4></h3>","<h4><h5>$text</h5></h4>","<h5><h6>$text</h6></h5>","<form><form>$text</form></form>","<table><table>$text</table></table>","<table><caption><caption>$text</caption></caption></table>","<table><td></td><td>$text</td></table>","<table><tr></tr><tr>$text</tr></table>","<table><col></col><col></col>$text</table>","<th>$text</th>","<td>$text</td>","<tfoot>$text</tfoot>","<thead>$text</thead>","<tbody>$text</tbody>","<tr>$text</tr>","<a $attribute_prefix-k=\"1\" href=\"$href\">$text</a>","<wbr></wbr>$text","<hr></hr>$text","<h1>$text</h1>","<h1></h1>"]


--------------------------------------------------------------------------------
/hui/generated_payloads.json:
--------------------------------------------------------------------------------
1 | ["<xmp><a href=\"https://github.com/Slonser/hui/</xmp>\"></a></xmp>", "<textarea><a href=\"https://github.com/Slonser/hui/</textarea>\"></a></textarea>", "<noscript><a href=\"https://github.com/Slonser/hui/</noscript>\"></a></noscript>", "<noembed><a href=\"https://github.com/Slonser/hui/</noembed>\"></a></noembed>", "<style><a href=\"https://github.com/Slonser/hui/</style>\"></a></style>", "<plaintext><a href=\"https://github.com/Slonser/hui/</plaintext>\"></a></plaintext>", "<select><h1></h1></select>", "<h1><h2>$text</h2></h1>", "<h2><h3>$text</h3></h2>", "<h3><h4>$text</h4></h3>", "<h4><h5>$text</h5></h4>", "<h5><h6>$text</h6></h5>", "<form><form>$text</form></form>", "<table><table>$text</table></table>", "<table><caption><caption>$text</caption></caption></caption>", "<table><td><td>$text</td></td></td>", "<table><tr><tr>$text</tr></tr></tr>", "<table><col><col>$text</col></col></col>", "<th>$text</th>", "<td>$text</td>", "<tfoot>$text</tfoot>", "<thead>$text</thead>", "<tbody>$text</tbody>", "<tr>$text</tr>", "<a $attribute_prefix-\u212a=\"1\" href=\"$href\">$text</a>", "<wbr>$text</wbr>", "<hr>$text</hr>", "<h1>$text</h1></h1>", "<h1>"]


--------------------------------------------------------------------------------
/hui/results_parsers/PYTHON_HTML.json:
--------------------------------------------------------------------------------
1 | ["<xmp><a href=\"https://github.com/Slonser/hui/&lt;/xmp&gt;\"></a></xmp>", "<textarea><a href=\"https://github.com/Slonser/hui/&lt;/textarea&gt;\"></a></textarea>", "<noscript><a href=\"https://github.com/Slonser/hui/&lt;/noscript&gt;\"></a></noscript>", "<noembed><a href=\"https://github.com/Slonser/hui/&lt;/noembed&gt;\"></a></noembed>", "<style><a href=\"https://github.com/Slonser/hui/</style>\"&gt;", "<plaintext><a href=\"https://github.com/Slonser/hui/&lt;/plaintext&gt;\"></a></plaintext>", "<select><h1></h1></select>", "<h1><h2>$text</h2></h1>", "<h2><h3>$text</h3></h2>", "<h3><h4>$text</h4></h3>", "<h4><h5>$text</h5></h4>", "<h5><h6>$text</h6></h5>", "<form><form>$text</form></form>", "<table><table>$text</table></table>", "<table><caption><caption>$text</caption></caption></table>", "<table><td><td>$text</td></td></table>", "<table><tr><tr>$text</tr></tr></table>", "<table><col/><col/>$text</table>", "<th>$text</th>", "<td>$text</td>", "<tfoot>$text</tfoot>", "<thead>$text</thead>", "<tbody>$text</tbody>", "<tr>$text</tr>", "<a $attribute_prefix-k=\"1\" href=\"$href\">$text</a>", "<wbr/>$text", "<hr/>$text", "<h1>$text</h1>", "<h1></h1>"]


--------------------------------------------------------------------------------
/hui/results_parsers/JS_JSXSS.json:
--------------------------------------------------------------------------------
1 | ["&lt;xmp&gt;<a href=\"https://github.com/Slonser/hui/&lt;/xmp&gt;\"></a>&lt;/xmp&gt;","&lt;textarea&gt;<a href=\"https://github.com/Slonser/hui/&lt;/textarea&gt;\"></a>&lt;/textarea&gt;","&lt;noscript&gt;<a href=\"https://github.com/Slonser/hui/&lt;/noscript&gt;\"></a>&lt;/noscript&gt;","&lt;noembed&gt;<a href=\"https://github.com/Slonser/hui/&lt;/noembed&gt;\"></a>&lt;/noembed&gt;","&lt;style&gt;<a href=\"https://github.com/Slonser/hui/&lt;/style&gt;\"></a>&lt;/style&gt;","&lt;plaintext&gt;<a href=\"https://github.com/Slonser/hui/&lt;/plaintext&gt;\"></a>&lt;/plaintext&gt;","&lt;select&gt;<h1></h1>&lt;/select&gt;","<h1><h2>$text</h2></h1>","<h2><h3>$text</h3></h2>","<h3><h4>$text</h4></h3>","<h4><h5>$text</h5></h4>","<h5><h6>$text</h6></h5>","&lt;form&gt;&lt;form&gt;$text&lt;/form&gt;&lt;/form&gt;","<table><table>$text</table></table>","<table><caption><caption>$text</caption></caption></caption>","<table><td><td>$text</td></td></td>","<table><tr><tr>$text</tr></tr></tr>","<table><col><col>$text</col></col></col>","<th>$text</th>","<td>$text</td>","<tfoot>$text</tfoot>","<thead>$text</thead>","<tbody>$text</tbody>","<tr>$text</tr>","<a href>$text</a>","&lt;wbr&gt;$text&lt;/wbr&gt;","<hr>$text</hr>","<h1>$text</h1></h1>","<h1>"]


--------------------------------------------------------------------------------
/hui.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
 1 | LICENSE.txt
 2 | README.md
 3 | pyproject.toml
 4 | setup.cfg
 5 | setup.py
 6 | hui/ALLOWED_ATTRS.py
 7 | hui/ALLOWED_TAGS.py
 8 | hui/CustomParser.py
 9 | hui/Generator.py
10 | hui/ParserBase.py
11 | hui/ParserPayload.py
12 | hui/__init__.py
13 | hui/generated_payloads.json
14 | hui/identify.py
15 | hui.egg-info/PKG-INFO
16 | hui.egg-info/SOURCES.txt
17 | hui.egg-info/dependency_links.txt
18 | hui.egg-info/top_level.txt
19 | hui/parsers/GO_HTML.py
20 | hui/parsers/GO_bluemonday.py
21 | hui/parsers/JAVA_JSOUP.py
22 | hui/parsers/JS_DOM.py
23 | hui/parsers/JS_DOMPURIFY.py
24 | hui/parsers/JS_HTMLPARSER2.py
25 | hui/parsers/JS_JSXSS.py
26 | hui/parsers/JS_SANITIZE_HTML.py
27 | hui/parsers/PYTHON_HTML.py
28 | hui/parsers/PYTHON_HTML_SANITIZER.py
29 | hui/parsers/PYTHON_LXML_HTML.py
30 | hui/parsers/__init__.py
31 | hui/parsers/simple_parser.py
32 | hui/results_parsers/GO_BLUEMONDAY.json
33 | hui/results_parsers/GO_HTML.json
34 | hui/results_parsers/JAVA_JSOUP.json
35 | hui/results_parsers/JSDOM_HTML.json
36 | hui/results_parsers/JS_DOMPURIFY.json
37 | hui/results_parsers/JS_HTMLPARSER2.json
38 | hui/results_parsers/JS_JSXSS.json
39 | hui/results_parsers/JS_SANITIZE_HTML.json
40 | hui/results_parsers/PYTHON_HTML.json
41 | hui/results_parsers/PYTHON_HTML_SANITIZE.json
42 | hui/results_parsers/PYTHON_LXML_HTML.json


--------------------------------------------------------------------------------
/hui/parsers/generators/JSOUP/src/main/java/com/example/Main.java:
--------------------------------------------------------------------------------
 1 | package com.example;
 2 | 
 3 | import org.jsoup.Jsoup;
 4 | import org.jsoup.nodes.Document;
 5 | import org.json.simple.JSONArray;
 6 | import org.json.simple.parser.JSONParser;
 7 | import java.nio.file.Files;
 8 | import java.nio.file.Paths;
 9 | import java.io.IOException;
10 | import java.io.FileReader;
11 | 
12 | 
13 | public class Main {
14 |     public static void generate() {
15 |         try {
16 |             System.out.println("Starting generation process...");
17 |             JSONParser parser = new JSONParser();
18 |             JSONArray arr = (JSONArray) parser.parse(new FileReader("generated_payloads.json"));
19 |             JSONArray res = new JSONArray();
20 | 
21 |             for (int i = 0; i < arr.size(); i++) {
22 |                     String htmlContent = (String) arr.get(i);
23 |                     Document doc = Jsoup.parse(htmlContent);
24 |                     String bodyInnerHtml = doc.body().html();
25 |                     res.add(bodyInnerHtml);
26 |             }
27 |             Files.write(Paths.get("results_parsers/JAVA_JSOUP.json"), res.toJSONString().getBytes());
28 |         } catch (Exception e) {
29 |             e.printStackTrace();
30 |         }
31 |     }
32 | 
33 |     public static void main(String[] args) {
34 |         generate();
35 |     }
36 | }


--------------------------------------------------------------------------------
/hui/parsers/generators/JSOUP/dependency-reduced-pom.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
 3 |   <modelVersion>4.0.0</modelVersion>
 4 |   <groupId>com.example</groupId>
 5 |   <artifactId>java-jsoup</artifactId>
 6 |   <version>1.0-SNAPSHOT</version>
 7 |   <build>
 8 |     <plugins>
 9 |       <plugin>
10 |         <artifactId>maven-jar-plugin</artifactId>
11 |         <version>3.2.0</version>
12 |         <configuration>
13 |           <archive>
14 |             <manifest>
15 |               <addDefaultImplementationEntries>true</addDefaultImplementationEntries>
16 |               <addDefaultSpecificationEntries>true</addDefaultSpecificationEntries>
17 |               <mainClass>com.example.Main</mainClass>
18 |             </manifest>
19 |           </archive>
20 |         </configuration>
21 |       </plugin>
22 |       <plugin>
23 |         <artifactId>maven-shade-plugin</artifactId>
24 |         <version>3.6.0</version>
25 |         <executions>
26 |           <execution>
27 |             <phase>package</phase>
28 |             <goals>
29 |               <goal>shade</goal>
30 |             </goals>
31 |           </execution>
32 |         </executions>
33 |       </plugin>
34 |     </plugins>
35 |   </build>
36 | </project>
37 | 


--------------------------------------------------------------------------------
/hui/results_parsers/PYTHON_LXML_HTML.json:
--------------------------------------------------------------------------------
1 | ["<body><xmp><a href=\"https://github.com/Slonser/hui/&lt;/xmp&gt;\"/></xmp></body>", "<body><textarea><a href=\"https://github.com/Slonser/hui/&lt;/textarea&gt;\"/></textarea></body>", "<body><noscript><a href=\"https://github.com/Slonser/hui/&lt;/noscript&gt;\"/></noscript></body>", "<body><noembed><a href=\"https://github.com/Slonser/hui/&lt;/noembed&gt;\"/></noembed></body>", "<body><style>&lt;a href=\"https://github.com/Slonser/hui/</style>\"&gt;</body>", "<body><plaintext><a href=\"https://github.com/Slonser/hui/&lt;/plaintext&gt;\"/></plaintext></body>", "<body><select><h1/></select></body>", "<body><h1><h2>$text</h2></h1></body>", "<body><h2><h3>$text</h3></h2></body>", "<body><h3><h4>$text</h4></h3></body>", "<body><h4><h5>$text</h5></h4></body>", "<body><h5><h6>$text</h6></h5></body>", "<body><form/><form>$text</form></body>", "<body><table><table>$text</table></table></body>", "<body><table><caption><caption>$text</caption></caption></table></body>", "<body><table><td/><td>$text</td></table></body>", "<body><table><tr/><tr>$text</tr></table></body>", "<body><table><col/><col/>$text</table></body>", "<body><th>$text</th></body>", "<body><td>$text</td></body>", "<body><tfoot>$text</tfoot></body>", "<body><thead>$text</thead></body>", "<body><tbody>$text</tbody></body>", "<body><tr>$text</tr></body>", "<body><a href=\"$href\">$text</a></body>", "<body><wbr>$text</wbr></body>", "<body><hr/>$text</body>", "<body><h1>$text</h1></body>", "<body><h1/></body>"]


--------------------------------------------------------------------------------
/hui/parsers/generators/go_html.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"encoding/json"
 5 | 	"fmt"
 6 | 	"os"
 7 | 	"strings"
 8 | 
 9 | 	"golang.org/x/net/html"
10 | )
11 | 
12 | func generate() {
13 | 	file, err := os.Open("../../generated_payloads.json")
14 | 	if err != nil {
15 | 		fmt.Println("Error opening file:", err)
16 | 		return
17 | 	}
18 | 	defer file.Close()
19 | 
20 | 	var arr []string
21 | 	if err := json.NewDecoder(file).Decode(&arr); err != nil {
22 | 		fmt.Println("Error decoding JSON:", err)
23 | 		return
24 | 	}
25 | 
26 | 	var res []string
27 | 	for _, payload := range arr {
28 | 		htmlContent := fmt.Sprintf("<html><body>%s</body></html>", payload)
29 | 
30 | 		doc, err := html.Parse(strings.NewReader(htmlContent))
31 | 		if err != nil {
32 | 			fmt.Println("Error parsing HTML:", err)
33 | 			continue
34 | 		}
35 | 
36 | 		var bodyInnerHTML string
37 | 		var f func(*html.Node)
38 | 		f = func(n *html.Node) {
39 | 			if n.Type == html.ElementNode && n.Data == "body" {
40 | 				var buf strings.Builder
41 | 				html.Render(&buf, n)
42 | 				bodyInnerHTML = buf.String()
43 | 			}
44 | 			for c := n.FirstChild; c != nil; c = c.NextSibling {
45 | 				f(c)
46 | 			}
47 | 		}
48 | 		f(doc)
49 | 
50 | 		res = append(res, strings.TrimSuffix(strings.TrimPrefix(bodyInnerHTML, "<body>"), "</body>"))
51 | 	}
52 | 
53 | 	outputFile, err := os.Create("../../results_parsers/GO_HTML.json")
54 | 	if err != nil {
55 | 		fmt.Println("Error creating output file:", err)
56 | 		return
57 | 	}
58 | 	defer outputFile.Close()
59 | 
60 | 	if err := json.NewEncoder(outputFile).Encode(res); err != nil {
61 | 		fmt.Println("Error encoding JSON:", err)
62 | 	}
63 | }
64 | 
// main runs the generator once; generate prints its own errors to stdout.
func main() {
	generate()
}


--------------------------------------------------------------------------------
/hui/results_parsers/GO_BLUEMONDAY.json:
--------------------------------------------------------------------------------
1 | ["\u0026lt;a href=\u0026#34;https://github.com/Slonser/hui/\u0026#34;\u0026gt;\u003c/a\u003e","\u0026lt;a href=\u0026#34;https://github.com/Slonser/hui/\u0026#34;\u0026gt;\u003c/a\u003e","\u0026#34;\u0026gt;\u003c/a\u003e","\u0026#34;\u0026gt;\u003c/a\u003e","\u0026#34;\u0026gt;\u003c/a\u003e","\u0026lt;a href=\u0026#34;https://github.com/Slonser/hui/\u0026lt;/plaintext\u0026gt;\u0026#34;\u0026gt;\u0026lt;/a\u0026gt;\u0026lt;/plaintext\u0026gt;","\u003ch1\u003e\u003c/h1\u003e","\u003ch1\u003e\u003ch2\u003e$text\u003c/h2\u003e\u003c/h1\u003e","\u003ch2\u003e\u003ch3\u003e$text\u003c/h3\u003e\u003c/h2\u003e","\u003ch3\u003e\u003ch4\u003e$text\u003c/h4\u003e\u003c/h3\u003e","\u003ch4\u003e\u003ch5\u003e$text\u003c/h5\u003e\u003c/h4\u003e","\u003ch5\u003e\u003ch6\u003e$text\u003c/h6\u003e\u003c/h5\u003e","$text","\u003ctable\u003e\u003ctable\u003e$text\u003c/table\u003e\u003c/table\u003e","\u003ctable\u003e\u003ccaption\u003e\u003ccaption\u003e$text\u003c/caption\u003e\u003c/caption\u003e\u003c/caption\u003e","\u003ctable\u003e\u003ctd\u003e\u003ctd\u003e$text\u003c/td\u003e\u003c/td\u003e\u003c/td\u003e","\u003ctable\u003e\u003ctr\u003e\u003ctr\u003e$text\u003c/tr\u003e\u003c/tr\u003e\u003c/tr\u003e","\u003ctable\u003e\u003ccol\u003e\u003ccol\u003e$text\u003c/col\u003e\u003c/col\u003e\u003c/col\u003e","\u003cth\u003e$text\u003c/th\u003e","\u003ctd\u003e$text\u003c/td\u003e","\u003ctfoot\u003e$text\u003c/tfoot\u003e","\u003cthead\u003e$text\u003c/thead\u003e","\u003ctbody\u003e$text\u003c/tbody\u003e","\u003ctr\u003e$text\u003c/tr\u003e","\u003ca href=\"$href\" rel=\"nofollow\"\u003e$text\u003c/a\u003e","\u003cwbr\u003e$text\u003c/wbr\u003e","\u003chr\u003e$text\u003c/hr\u003e","\u003ch1\u003e$text\u003c/h1\u003e\u003c/h1\u003e","\u003ch1\u003e"]
2 | 


--------------------------------------------------------------------------------
/hui/results_parsers/GO_HTML.json:
--------------------------------------------------------------------------------
1 | ["\u003cxmp\u003e\u003ca href=\"https://github.com/Slonser/hui/\u003c/xmp\u003e\u0026#34;\u0026gt;","\u003ctextarea\u003e\u0026lt;a href=\u0026#34;https://github.com/Slonser/hui/\u003c/textarea\u003e\u0026#34;\u0026gt;","\u003cnoscript\u003e\u003ca href=\"https://github.com/Slonser/hui/\u003c/noscript\u003e\u0026#34;\u0026gt;","\u003cnoembed\u003e\u003ca href=\"https://github.com/Slonser/hui/\u003c/noembed\u003e\u0026#34;\u0026gt;","\u003cstyle\u003e\u003ca href=\"https://github.com/Slonser/hui/\u003c/style\u003e\u0026#34;\u0026gt;","\u003cplaintext\u003e\u003ca href=\"https://github.com/Slonser/hui/\u003c/plaintext\u003e\"\u003e\u003c/a\u003e\u003c/plaintext\u003e\u003c/body\u003e\u003c/html\u003e","\u003cselect\u003e\u003c/select\u003e","\u003ch1\u003e\u003c/h1\u003e\u003ch2\u003e$text\u003c/h2\u003e","\u003ch2\u003e\u003c/h2\u003e\u003ch3\u003e$text\u003c/h3\u003e","\u003ch3\u003e\u003c/h3\u003e\u003ch4\u003e$text\u003c/h4\u003e","\u003ch4\u003e\u003c/h4\u003e\u003ch5\u003e$text\u003c/h5\u003e","\u003ch5\u003e\u003c/h5\u003e\u003ch6\u003e$text\u003c/h6\u003e","\u003cform\u003e$text\u003c/form\u003e","\u003ctable\u003e\u003c/table\u003e$text\u003ctable\u003e\u003c/table\u003e","\u003ctable\u003e\u003ccaption\u003e\u003c/caption\u003e\u003ccaption\u003e$text\u003c/caption\u003e\u003c/table\u003e","\u003ctable\u003e\u003ctbody\u003e\u003ctr\u003e\u003ctd\u003e\u003c/td\u003e\u003ctd\u003e$text\u003c/td\u003e\u003c/tr\u003e\u003c/tbody\u003e\u003c/table\u003e","$text\u003ctable\u003e\u003ctbody\u003e\u003ctr\u003e\u003c/tr\u003e\u003ctr\u003e\u003c/tr\u003e\u003c/tbody\u003e\u003c/table\u003e","$text\u003ctable\u003e\u003ccolgroup\u003e\u003ccol/\u003e\u003ccol/\u003e\u003c/colgroup\u003e\u003c/table\u003e","$text","$text","$text","$text","$text","$text","\u003ca $attribute_prefix-K=\"1\" href=\"$href\"\u003e$text\u003c/a\u003e","\u003cwbr/\u003e$text","\u003chr/\u003e$text","\u003ch1\u003e$text\u003c/h1\u003e","\u003ch1\u003e\u003c/h1\u003e"]
2 | 


--------------------------------------------------------------------------------
/hui/parsers/generators/JSOUP/pom.xml:
--------------------------------------------------------------------------------
 1 | <!-- pom.xml -->
 2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
 3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 5 |     <modelVersion>4.0.0</modelVersion>
 6 | 
 7 |     <groupId>com.example</groupId>
 8 |     <artifactId>java-jsoup</artifactId>
 9 |     <version>1.0-SNAPSHOT</version>
10 | 
11 |     <build>
12 |         <plugins>
13 |             <plugin>
14 |                 <groupId>org.apache.maven.plugins</groupId>
15 |                 <artifactId>maven-jar-plugin</artifactId>
16 |                 <version>3.2.0</version>
17 |                 <configuration>
18 |                     <archive>
19 |                         <manifest>
20 |                             <addDefaultImplementationEntries>true</addDefaultImplementationEntries>
21 |                             <addDefaultSpecificationEntries>true</addDefaultSpecificationEntries>
22 |                             <mainClass>com.example.Main</mainClass>
23 |                         </manifest>
24 |                     </archive>
25 |                 </configuration>
26 |             </plugin>
27 |                   <plugin>
28 |         <groupId>org.apache.maven.plugins</groupId>
29 |         <artifactId>maven-shade-plugin</artifactId>
30 |         <version>3.6.0</version>
31 |         <executions>
32 |           <execution>
33 |             <phase>package</phase>
34 |             <goals>
35 |               <goal>shade</goal>
36 |             </goals>
37 |           </execution>
38 |         </executions>
39 |       </plugin>
40 |         </plugins>
41 |     </build>
42 | 
43 |     <dependencies>
44 |         <dependency>
45 |             <groupId>org.jsoup</groupId>
46 |             <artifactId>jsoup</artifactId>
47 |             <version>1.14.3</version>
48 |         </dependency>
49 |         <dependency>
50 |             <groupId>com.googlecode.json-simple</groupId>
51 |             <artifactId>json-simple</artifactId>
52 |             <version>1.1.1</version>
53 |         </dependency>
54 |     </dependencies>
55 | </project>


--------------------------------------------------------------------------------
/hui/ParserPayload.py:
--------------------------------------------------------------------------------
 1 | from string import Template
 2 | 
 3 | class ParserPayload:
 4 |     """
 5 |     Represents a payload for parsing with additional metadata and methods for validation.
 6 | 
 7 |     Attributes:
 8 |         payload (str): The actual payload to be parsed.
 9 |         expected_output (str, optional): The expected output of the parsing process. Defaults to None.
10 |         version (str, optional): The version of the parser or the payload. Defaults to None.
11 |         sanitizer (str, optional): The sanitizer to be used for the payload. Defaults to None.
12 |         parametrs (str, optional): Parameters for the parsing process. Defaults to None.
13 |         tags (list): Tags associated with the payload.
14 | 
15 |     Methods:
16 |         check(output): Checks if the output matches the expected output, considering whitespace.
17 |         remove_whitespace(string): Removes whitespace from a given string.
18 |     """
19 | 
20 |     def __init__(self, payload: str, tags, expected_output: str = None, version: str = None, sanitizer: str = None, parametrs: str = None) -> None:
21 |         self.payload = payload
22 |         self.expected_output = expected_output
23 |         self.version = version
24 |         self.sanitizer = sanitizer
25 |         self.parametrs = parametrs
26 |         self.tags = tags
27 | 
28 |     def check(self, output, TEMPLATE_VARS):
29 |         """
30 |         Checks if the output matches the expected output, considering whitespace.
31 | 
32 |         Args:
33 |             output (str): The output to be checked against the expected output.
34 | 
35 |         Returns:
36 |             bool: True if the output matches the expected output, False otherwise.
37 |         """
38 |         output = Template(output).safe_substitute(TEMPLATE_VARS)
39 |         expected_output_template = Template(self.expected_output).safe_substitute(TEMPLATE_VARS)
40 |         if output in expected_output_template:
41 |             return True
42 |         if self.remove_whitespace(output) in self.remove_whitespace(expected_output_template):
43 |             return True
44 |         return False
45 | 
46 |     
47 |     def remove_whitespace(self, string):
48 |         """
49 |         Removes whitespace from a given string.
50 | 
51 |         Args:
52 |             string (str): The string from which to remove whitespace.
53 | 
54 |         Returns:
55 |             str: The string with all whitespace removed.
56 |         """
57 |         return "".join(string.split())


--------------------------------------------------------------------------------
/hui/ALLOWED_TAGS.py:
--------------------------------------------------------------------------------
# HTML element names in alphabetical order, followed by 'customtag'.
# NOTE(review): 'customtag' is not a standard HTML element; presumably it
# stands in for custom/unknown elements — confirm against the code that
# consumes this list.
html_tags = [
    'a',
    'abbr',
    'acronym',
    'address',
    'area',
    'article',
    'aside',
    'audio',
    'b',
    'base',
    'bdi',
    'bdo',
    'big',
    'blockquote',
    'body',
    'br',
    'button',
    'canvas',
    'caption',
    'center',
    'cite',
    'code',
    'col',
    'colgroup',
    'data',
    'datalist',
    'dd',
    'del',
    'details',
    'dfn',
    'dialog',
    'dir',
    'div',
    'dl',
    'dt',
    'em',
    'embed',
    'fencedframe',
    'fieldset',
    'figcaption',
    'figure',
    'font',
    'footer',
    'form',
    'frame',
    'frameset',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'head',
    'header',
    'hgroup',
    'hr',
    'html',
    'i',
    'iframe',
    'img',
    'input',
    'ins',
    'kbd',
    'label',
    'legend',
    'li',
    'link',
    'main',
    'map',
    'mark',
    'marquee',
    'menu',
    'meta',
    'meter',
    'nav',
    'nobr',
    'noembed',
    'noframes',
    'noscript',
    'object',
    'ol',
    'optgroup',
    'option',
    'output',
    'p',
    'param',
    'picture',
    'plaintext',
    'portal',
    'pre',
    'progress',
    'q',
    'rb',
    'rp',
    'rt',
    'rtc',
    'ruby',
    's',
    'samp',
    'script',
    'search',
    'section',
    'select',
    'slot',
    'small',
    'source',
    'span',
    'strike',
    'strong',
    'style',
    'sub',
    'summary',
    'sup',
    'table',
    'tbody',
    'td',
    'template',
    'textarea',
    'tfoot',
    'th',
    'thead',
    'time',
    'title',
    'tr',
    'track',
    'tt',
    'u',
    'ul',
    'var',
    'video',
    'wbr',
    'xmp',
    'customtag'
]
136 | 
# Table-related tags; some of them may only work inside a <table> element.
html_table_tags = [
    'caption',
    'thead',
    'colgroup',
    'col',
    'th',
    'tbody',
    'tr',
    'td',
    'tfoot',
]
149 | 
# Copied from https://github.com/cure53/DOMPurify/blob/f1106aae5a861d1096cb57ad9a6f518b4279ea8c/src/tags.ts#L226
# MathML element names. Each name appears exactly once: the second
# 'mprescripts' entry that used to follow 'annotation-xml' was a duplicate.
mathml_tags = [
  'math',
  'menclose',
  'merror',
  'mfenced',
  'mfrac',
  'mglyph',
  'mi',
  'mlabeledtr',
  'mmultiscripts',
  'mn',
  'mo',
  'mover',
  'mpadded',
  'mphantom',
  'mroot',
  'mrow',
  'ms',
  'mspace',
  'msqrt',
  'mstyle',
  'msub',
  'msup',
  'msubsup',
  'mtable',
  'mtd',
  'mtext',
  'mtr',
  'munder',
  'munderover',
  'mprescripts',
  'maction',
  'maligngroup',
  'malignmark',
  'mlongdiv',
  'mscarries',
  'mscarry',
  'msgroup',
  'mstack',
  'msline',
  'msrow',
  'semantics',
  'annotation',
  'annotation-xml',
  'none',
]
198 | 
# Copied from https://github.com/cure53/DOMPurify/blob/f1106aae5a861d1096cb57ad9a6f518b4279ea8c/src/tags.ts#L123
# SVG element names.
# NOTE(review): the fe* filter entries keep their camelCase spelling while the
# other multi-word names are all lowercase (e.g. 'foreignobject',
# 'lineargradient') — confirm whether consumers compare case-sensitively.
svg_tags = [
  'animate',
  'color-profile',
  'cursor',
  'discard',
  'font-face',
  'font-face-format',
  'font-face-name',
  'font-face-src',
  'font-face-uri',
  'foreignobject',
  'hatch',
  'hatchpath',
  'mesh',
  'meshgradient',
  'meshpatch',
  'meshrow',
  'missing-glyph',
  'script',
  'set',
  'solidcolor',
  'unknown',
  'use',
    'feBlend',
  'feColorMatrix',
  'feComponentTransfer',
  'feComposite',
  'feConvolveMatrix',
  'feDiffuseLighting',
  'feDisplacementMap',
  'feDistantLight',
  'feDropShadow',
  'feFlood',
  'feFuncA',
  'feFuncB',
  'feFuncG',
  'feFuncR',
  'feGaussianBlur',
  'feImage',
  'feMerge',
  'feMergeNode',
  'feMorphology',
  'feOffset',
  'fePointLight',
  'feSpecularLighting',
  'feSpotLight',
  'feTile',
  'feTurbulence',
  'a',
  'altglyph',
  'altglyphdef',
  'altglyphitem',
  'animatecolor',
  'animatemotion',
  'animatetransform',
  'circle',
  'clippath',
  'defs',
  'desc',
  'ellipse',
  'filter',
  'font',
  'g',
  'glyph',
  'glyphref',
  'hkern',
  'image',
  'line',
  'lineargradient',
  'marker',
  'mask',
  'metadata',
  'mpath',
  'path',
  'pattern',
  'polygon',
  'polyline',
  'radialgradient',
  'rect',
  'stop',
  'style',
  'switch',
  'symbol',
  'text',
  'textpath',
  'title',
  'tref',
  'tspan',
  'view',
  'vkern'
]


--------------------------------------------------------------------------------
/hui/ParserBase.py:
--------------------------------------------------------------------------------
  1 | from .ParserPayload import ParserPayload
  2 | import json
  3 | import os
  4 | 
  5 | class ParserBase:
  6 |     """
  7 |     A class to handle parsing of HTML content with various checks for incorrect parsing states.
  8 |     
  9 |     Attributes:
 10 |         parser_name (str): The name of the parser.
 11 |         checks (list): A list of ParserPayload objects that define parsing checks.
 12 |     """
 13 |     
 14 |     def __init__(self, parser_name: str, attribute_prefix='data-') -> None:
 15 |         """
 16 |         Initializes the ParserBase with a parser name and an optional attribute prefix.
 17 | 
 18 |         Args:
 19 |             parser_name (str): The name of the parser.
 20 |             attribute_prefix (str): The prefix for attributes (default is 'data-').
 21 |         """
 22 |         self.parser_name = parser_name
 23 |         self.checks = []
 24 | 
 25 |         # Some HTML parser don't properly resolve raw text tags
 26 |         incorrect_parsing_state_tags = ['xmp','textarea','noscript','noembed','style','plaintext']
 27 |         for tag in incorrect_parsing_state_tags:
 28 |             self.add(
 29 |                 ParserPayload(f'<{tag}><a href="https://github.com/Slonser/hui/</{tag}>"></a></{tag}>',
 30 |                               [tag,'a'],
 31 |                               expected_output=f'<{tag}>&lt;a href="https://github.com/Slonser/hui/</{tag}>"&gt;')
 32 |             )
 33 | 
 34 |         # Some HTML parsers incorrectly handle select tag
 35 |         # Browsers will remove <img/> tag, They - don't do this
 36 |         self.add(
 37 |             ParserPayload('<select><h1></h1></select>',
 38 |                           ['select','h1'],
 39 |                           expected_output="<select></select>")
 40 |         )
 41 |         
 42 |         # Some HTML parsers don't use flattening with headers tags
 43 |         for i in range(1,6):
 44 |             self.add(
 45 |                 ParserPayload(f'<h{i}><h{i+1}>$text</h{i+1}></h{i}>',
 46 |                               [f'h{i}',f'h{i+1}'],
 47 |                               expected_output=f"<h{i}></h{i}><h{i+1}>$text</h{i+1}>")
 48 |             )
 49 | 
 50 |         # Some HTML parsers don't resolve nested forms
 51 |         self.add(
 52 |             ParserPayload('<form><form>$text</form></form>',
 53 |                           ['form'],
 54 |                           expected_output=f"<form>$text</form>")
 55 |         )
 56 | 
 57 |         # Some HTML parser don't resolve nested tables
 58 |         self.add(
 59 |             ParserPayload('<table><table>$text</table></table>',
 60 |                           ['table'],
 61 |                           expected_output=f"<table></table>$text<table></table>")
 62 |         )
 63 | 
 64 |         # Some parser don't resolve nested table elemenents
 65 |         table_nested_tags = ['caption','td','tr','col']
 66 |         for tag in table_nested_tags:
 67 |             self.add(
 68 |                 ParserPayload(f'<table><{tag}><{tag}>$text</{tag}></{tag}></{tag}>',
 69 |                               ['table',tag],
 70 |                               expected_output=f"<table><{tag}></{tag}><{tag}>$text</{tag}></table>")
 71 |             )
 72 | 
 73 |         # Some HTML parsers don't implement "in row" insertion mode correctly
 74 |         row_insertion_mode = ['th','td','tfoot','thead','tbody','tr']
 75 |         for tag in row_insertion_mode:
 76 |             self.add(
 77 |                 ParserPayload(f'<{tag}>$text</{tag}>',
 78 |                               [tag],
 79 |                               expected_output='$text')
 80 |             )
 81 |         # Python HTML parsers incorrectly handle lower on html attribute names
 82 |         # By default in browsers, only ascii chars would be lowercased 
 83 |         # In python \u212a -> 0x6b
 84 |         self.add(
 85 |             ParserPayload('<a $attribute_prefix-\u212a="1" href="$href">$text</a>',
 86 |                           ['a'],
 87 |                           expected_output=f'<a href="$href" data-K="1">$text</a>')
 88 |         )
 89 | 
 90 |         #Some parsers incorrectrly parse self closing tags
 91 |         self_closing_tags = ['wbr','hr']
 92 |         for tag in self_closing_tags:
 93 |             self.add(
 94 |                 ParserPayload(f'<{tag}>$text</{tag}>',
 95 |                               [tag],
 96 |                               expected_output=f'<{tag}>$text')
 97 |             )
 98 | 
 99 |         # Some parsers incorrect handle <tag></tag></tag>
100 |         # TODO: should check with another tags, because h1 maybe banned
101 |         self.add(
102 |             ParserPayload(f'<h1>$text</h1></h1>',
103 |                             ['h1'],
104 |                             expected_output=f'<h1>123</h1></h1>')
105 |         )
106 | 
107 |         #Some sanitizers not close tags
108 |         self.add(
109 |             ParserPayload(f'<h1>',
110 |                             ['h1'],
111 |                             expected_output=f'<h1></h1>')
112 |         )
113 | 
114 |     def add(self, payload):
115 |         """
116 |         Adds a ParserPayload to the checks list.
117 | 
118 |         Args:
119 |             payload (ParserPayload): The payload to be added to the checks.
120 |         """
121 |         self.checks.append(payload)
122 |     
123 |     def add_all(self, arr):
124 |         """
125 |         Adds multiple ParserPayloads to the checks list.
126 | 
127 |         Args:
128 |             arr (list): A list of ParserPayload objects to be added.
129 |         """
130 |         for x in arr:
131 |             self.add(x)
132 |     
133 |     def generate_payloads(self):
134 |         """
135 |         Generates payloads and saves them to a JSON file if it does not already exist.
136 |         """
137 |         if os.path.exists("./generated_payloads.json"):
138 |             return
139 |         
140 |         tag_arr = []
141 |         for tag in self.checks:
142 |             tag_arr.append(tag.payload)
143 |         
144 |         return json.dump(tag_arr, open('./generated_payloads.json',"w"))
145 | 
146 |     def get_results(self):
147 |         """
148 |         Placeholder method for getting results.
149 |         """
150 |         pass


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Html Universal Identifier
  2 | 
  3 | Html Universal Identifier is an alpha version of an application designed for identifying server-side HTML parsers. This package provides a way to determine which HTML, SVG, and MathML tags are allowed, helps to find parser features (incorrectly implemented tags), and can also help to guess which parser is used on the backend.
  4 | 
  5 | Primarily, this library relies on the incorrectness of HTML parsing, for example, here are some classic examples:
  6 | - `<form><form>text</form></form>` should be transformed to `<form>text</form>`
  7 | - `<h1><h2>text</h2></h1>` should be transformed to `<h1></h1><h2>text</h2>`
  8 | 
  9 | There are several reasons why you don't want to rely entirely on allowed tags:
 10 | - It won't help you determine which parser your custom sanitization is based on
 11 | - Allowed tags can be changed
 12 |   
 13 | ## Features
 14 | 
 15 | - Identify allowed HTML, SVG, and MathML tags.
 16 | - Identify allowed attributes.
 17 | - Identify incorrect parsing
 18 | - Use a customizable handler function to process HTML payloads.
 19 | - Load and compare results against predefined Parser outputs.
 20 | 
 21 | ## Installation
 22 | 
 23 | To install the package, use pip:
 24 | 
 25 | ```
 26 | pip install hui
 27 | ```
 28 | 
 29 | ## Usage
 30 | 
 31 | Here is a basic example of how to use the `Identifier` class from the package:
 32 | 
 33 | ```python
 34 | from hui.identify import Identifier
 35 | import requests
 36 | 
 37 | def handler(payload):
 38 |     return requests.get("http://localhost:3005/sanitize",params={"html":payload}).text
 39 | 
 40 | a = Identifier(handler=handler, buffer_enabled=False, buffer_limit=64, debug_mode=False)
 41 | print(a.identify())
 42 | # run all
 43 | # Example output 
 44 | # [[1.0, 27, 'JS_SANITIZE_HTML'], [0.8148148148148148, 22, 'PYTHON_HTML_SANITIZE'], ...
 45 | 
 46 | print(a.check_attr_allowed("href",tag="a"))
 47 | # True or False
 48 | print(a.INCORRECT_PARSED)
 49 | # Example output
 50 | # [{'output': '<h5><h6>govnoed</h6></h5>', 'expected': '<h5></h5><h6>$text</h6>'}, .. ]
 51 | print(a.ALLOWED_TAGS)
 52 | # print allowed tags
 53 | print(a.ATTRIBUTES)
 54 | # Prints ATTRIBUTES info
 55 | print(a.DEPTH_LIMITS)
 56 | # Example Outputs:
 57 | # (514, 'No max tags limit')
 58 | # (512, 'Flattening')
 59 | # (255, 'Removing')
 60 | ```
 61 | 
 62 | ## Identifier Class
 63 | 
 64 | The `Identifier` class is the core of this package. It is responsible for identifying allowed HTML, SVG, and MathML tags based on a handler function that processes HTML payloads.
 65 | 
 66 | The class also maintains an `INCORRECT_PARSED` list, which contains payloads that were incorrectly parsed by the handler. For example, this may include cases where the parser fails to remove nested forms and similar issues.
 67 | 
 68 | ## Current Parsers
 69 | 
 70 | The following parsers are currently supported in the project:
 71 | 
 72 | - **DOMPurify with JSDOM (JS)**
 73 | - **JSDOM (JS)**
 74 | - **sanitize_html (JS)**
 75 | - **htmlparser2 (JS)**
 76 | - **JSXSS (JS)**
 77 | - **html (python)**
 78 | - **lxml (python)**
 79 | - **html_sanitizer (python)**
 80 | - **net/html (go)**
 81 | - **bluemonday (go)**
 82 | 
 83 | If you believe a new parser/sanitizer should be added, please create an issue, and I will be happy to include it.
 84 | ### Constructor Parameters
 85 | 
 86 | - **`handler`**: A function that takes a payload and returns an HTML response. Example:
 87 |   ```python
 88 |   lambda payload: requests.get(f"http://localhost:3000?payload={payload}").text
 89 |   ```
 90 | 
 91 | - **`buffer_enabled`** (optional, default=False): A boolean flag to enable or disable buffering of payloads before sending them to the handler. By default, buffering is disabled, as it can sometimes lead to incorrect results. For example, some sanitizers may simply remove all input if it contains a dangerous tag. Use buffering only if the server you are interacting with has strict rate limits.
 92 | 
 93 | - **`buffer_delimeter`** (optional, default=`<div>TEXTTEXT</div>`): A string used to delimit buffered payloads when sending them to the handler.
 94 | 
 95 | - **`buffer_limit`** (optional, default=32): An integer that specifies the maximum number of payloads to buffer before sending them to the handler.
 96 | 
 97 | - **`template_vars`** (optional, default=None): A dictionary of template variables to use for substitution in payloads.
 98 | 
 99 | - **`debug_mode`** (optional, default=False): A boolean flag to enable or disable debug logging.
100 | 
101 | ### Methods
102 | 
103 | - **`check_allowed_tags()`**: Checks and populates the `ALLOWED_TAGS` dictionary with allowed tags for HTML, SVG, and MathML.
104 | - **`call_handler(template_payloads: list[str])`**: Calls the handler function with a list of template payloads and returns the processed results.
105 | - **`check_namespace(namespace: str)`**: Checks for allowed tags in the specified namespace (SVG or MathML).
106 | - **`identify()`**: Identifies the best matching Parser based on generated payloads and returns a list of matches.
107 | - **`check_allowed_attrs()`**: Checks and validates allowed attributes for HTML tags.
108 | 
109 | ### identify() Method
110 | 
111 | The `identify()` method checks if allowed tags have been determined. If not, it calls `check_allowed_tags()` to populate the `ALLOWED_TAGS`. It then loads a list of generated payloads from a JSON file and calls the handler for each payload. Finally, it compares the results against all JSON files in the `results_parsers` directory to count matches and returns a sorted list of results.
112 | 
113 | - **Returns**: A list of tuples, each containing:
114 |   - The match ratio (float)
115 |   - The number of matches (int)
116 |   - The name of the Parser (str)
117 | 
118 | ### Attributes
119 | 
120 | - **`ATTRIBUTES`**: A dictionary that holds information about allowed attributes for HTML tags, including:
121 |   - `custom_attribute`: Indicates if custom attributes are allowed.
122 |   - `event_attributes_blocked`: Indicates if event attributes are directly blocked.
123 |   - `data_attributes`: Indicates if data attributes are allowed.
124 |   - `attrs_allowed`: A nested dictionary categorizing allowed attributes into global, event and specific tags attributes.
125 | 
126 | ### Allowed Tags
127 | 
128 | - **`ALLOWED_TAGS`**: A dictionary that holds information about allowed tags for HTML, SVG, and MathML, including:
129 |   - `html`: A list of allowed HTML tags.
130 |   - `svg`: A list of allowed SVG tags.
131 |   - `math`: A list of allowed MathML tags.
132 | 
133 | ### Incorrectly Parsed Tags
134 | 
135 | - **`INCORRECT_PARSED`**: A list that holds one dictionary for every payload that the handler parsed incorrectly. Each entry includes:
136 |   - `output`: The output actually returned by the handler.
137 |   - `expected`: The output that a correct parser is expected to produce.
138 |   - Example entry: `{'output': '<h5><h6>govnoed</h6></h5>', 'expected': '<h5></h5><h6>$text</h6>'}`.
139 | 
140 | ### DEPTH_LIMITS
141 | **DEPTH_LIMITS**: A tuple that holds information about the depth limits of HTML tags, including:
142 |   - `max_depth`: The maximum depth of HTML tags.
143 |   - `limit_strategy`: The strategy used to handle tags exceeding the depth limit, which can be 'No max tags limit', 'Flattening', or 'Removing'.
144 | 


--------------------------------------------------------------------------------
/hui.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
  1 | Metadata-Version: 2.1
  2 | Name: hui
  3 | Version: 0.2.2
  4 | Home-page: https://github.com/slonser/hui
  5 | Download-URL: https://github.com/Slonser/hui/archive/v_01.tar.gz
  6 | Author: Slonser
  7 | Author-email: slonser@slonser.info
  8 | License: MIT
  9 | Keywords: HTML,hui,HTML GUESSER,HTML identifier,XSS,bugbounty
 10 | Classifier: Development Status :: 3 - Alpha
 11 | Classifier: Intended Audience :: Developers
 12 | Classifier: Topic :: Software Development :: Build Tools
 13 | Classifier: License :: OSI Approved :: MIT License
 14 | Classifier: Programming Language :: Python :: 3
 15 | Description-Content-Type: text/markdown
 16 | License-File: LICENSE.txt
 17 | 
 18 | # Html Universal Identifier
 19 | 
 20 | Html Universal Identifier is an alpha version of an application designed for identifying server-side HTML parsers. This package provides a way to determine which HTML, SVG, and MathML tags are allowed, helps to find parser features (incorrectly implemented tags), and can also help to guess which parser is used on the backend.
 21 | 
 22 | Primarily, this library relies on the incorrectness of HTML parsing, for example, here are some classic examples:
 23 | - `<form><form>text</form></form>` should be transformed to `<form>text</form>`
 24 | - `<h1><h2>text</h2></h1>` should be transformed to `<h1></h1><h2>text</h2>`
 25 | 
 26 | There are several reasons why you don't want to rely entirely on allowed tags:
 27 | - It won't help you determine which parser your custom sanitization is based on
 28 | - Allowed tags can be changed
 29 |   
 30 | ## Features
 31 | 
 32 | - Identify allowed HTML, SVG, and MathML tags.
 33 | - Identify allowed attributes.
 34 | - Identify incorrect parsing
 35 | - Use a customizable handler function to process HTML payloads.
 36 | - Load and compare results against predefined Parser outputs.
 37 | 
 38 | ## Installation
 39 | 
 40 | To install the package, use pip:
 41 | 
 42 | ```
 43 | pip install hui
 44 | ```
 45 | 
 46 | ## Usage
 47 | 
 48 | Here is a basic example of how to use the `Identifier` class from the package:
 49 | 
 50 | ```python
 51 | from hui.identify import Identifier
 52 | import requests
 53 | 
 54 | def handler(payload):
 55 |     return requests.get("http://localhost:3005/sanitize",params={"html":payload}).text
 56 | 
 57 | a = Identifier(handler=handler, buffer_enabled=False, buffer_limit=64, debug_mode=False)
 58 | print(a.identify())
 59 | # run all
 60 | # Example output 
 61 | # [[1.0, 27, 'JS_SANITIZE_HTML'], [0.8148148148148148, 22, 'PYTHON_HTML_SANITIZE'], ...
 62 | 
 63 | print(a.check_attr_allowed("href",tag="a"))
 64 | # True or False
 65 | print(a.INCORRECT_PARSED)
 66 | # Example output
 67 | # [{'output': '<h5><h6>govnoed</h6></h5>', 'expected': '<h5></h5><h6>$text</h6>'}, .. ]
 68 | print(a.ALLOWED_TAGS)
 69 | # print allowed tags
 70 | print(a.ATTRIBUTES)
 71 | # Prints ATTRIBUTES info
 72 | print(a.DEPTH_LIMITS)
 73 | # Example Outputs:
 74 | # (514, 'No max tags limit')
 75 | # (512, 'Flattening')
 76 | # (255, 'Removing')
 77 | ```
 78 | 
 79 | ## Identifier Class
 80 | 
 81 | The `Identifier` class is the core of this package. It is responsible for identifying allowed HTML, SVG, and MathML tags based on a handler function that processes HTML payloads.
 82 | 
 83 | The class also maintains an `INCORRECT_PARSED` list, which contains payloads that were incorrectly parsed by the handler. For example, this may include cases where the parser fails to remove nested forms and similar issues.
 84 | 
 85 | ## Current Parsers
 86 | 
 87 | The following parsers are currently supported in the project:
 88 | 
 89 | - **DOMPurify with JSDOM (JS)**
 90 | - **JSDOM (JS)**
 91 | - **sanitize_html (JS)**
 92 | - **htmlparser2 (JS)**
 93 | - **JSXSS (JS)**
 94 | - **html (python)**
 95 | - **lxml (python)**
 96 | - **html_sanitizer (python)**
 97 | - **net/html (go)**
 98 | - **bluemonday (go)**
 99 | 
100 | If you believe a new parser/sanitizer should be added, please create an issue, and I will be happy to include it.
101 | ### Constructor Parameters
102 | 
103 | - **`handler`**: A function that takes a payload and returns an HTML response. Example:
104 |   ```python
105 |   lambda payload: requests.get(f"http://localhost:3000?payload={payload}").text
106 |   ```
107 | 
108 | - **`buffer_enabled`** (optional, default=False): A boolean flag to enable or disable buffering of payloads before sending them to the handler. By default, buffering is disabled, as it can sometimes lead to incorrect results. For example, some sanitizers may simply remove all input if it contains a dangerous tag. Use buffering only if the server you are interacting with has strict rate limits.
109 | 
110 | - **`buffer_delimeter`** (optional, default=`<div>TEXTTEXT</div>`): A string used to delimit buffered payloads when sending them to the handler.
111 | 
112 | - **`buffer_limit`** (optional, default=32): An integer that specifies the maximum number of payloads to buffer before sending them to the handler.
113 | 
114 | - **`template_vars`** (optional, default=None): A dictionary of template variables to use for substitution in payloads.
115 | 
116 | - **`debug_mode`** (optional, default=False): A boolean flag to enable or disable debug logging.
117 | 
118 | ### Methods
119 | 
120 | - **`check_allowed_tags()`**: Checks and populates the `ALLOWED_TAGS` dictionary with allowed tags for HTML, SVG, and MathML.
121 | - **`call_handler(template_payloads: list[str])`**: Calls the handler function with a list of template payloads and returns the processed results.
122 | - **`check_namespace(namespace: str)`**: Checks for allowed tags in the specified namespace (SVG or MathML).
123 | - **`identify()`**: Identifies the best matching Parser based on generated payloads and returns a list of matches.
124 | - **`check_allowed_attrs()`**: Checks and validates allowed attributes for HTML tags.
125 | 
126 | ### identify() Method
127 | 
128 | The `identify()` method checks if allowed tags have been determined. If not, it calls `check_allowed_tags()` to populate the `ALLOWED_TAGS`. It then loads a list of generated payloads from a JSON file and calls the handler for each payload. Finally, it compares the results against all JSON files in the `results_parsers` directory to count matches and returns a sorted list of results.
129 | 
130 | - **Returns**: A list of tuples, each containing:
131 |   - The match ratio (float)
132 |   - The number of matches (int)
133 |   - The name of the Parser (str)
134 | 
135 | ### Attributes
136 | 
137 | - **`ATTRIBUTES`**: A dictionary that holds information about allowed attributes for HTML tags, including:
138 |   - `custom_attribute`: Indicates if custom attributes are allowed.
139 |   - `event_attributes_blocked`: Indicates if event attributes are directly blocked.
140 |   - `data_attributes`: Indicates if data attributes are allowed.
141 |   - `attrs_allowed`: A nested dictionary categorizing allowed attributes into global, event and specific tags attributes.
142 | 
143 | ### Allowed Tags
144 | 
145 | - **`ALLOWED_TAGS`**: A dictionary that holds information about allowed tags for HTML, SVG, and MathML, including:
146 |   - `html`: A list of allowed HTML tags.
147 |   - `svg`: A list of allowed SVG tags.
148 |   - `math`: A list of allowed MathML tags.
149 | 
150 | ### Incorrectly Parsed Tags
151 | 
152 | - **`INCORRECT_PARSED`**: A list that holds one dictionary for every payload that the handler parsed incorrectly. Each entry includes:
153 |   - `output`: The output actually returned by the handler.
154 |   - `expected`: The output that a correct parser is expected to produce.
155 |   - Example entry: `{'output': '<h5><h6>govnoed</h6></h5>', 'expected': '<h5></h5><h6>$text</h6>'}`.
156 | 
157 | ### DEPTH_LIMITS
158 | **DEPTH_LIMITS**: A tuple that holds information about the depth limits of HTML tags, including:
159 |   - `max_depth`: The maximum depth of HTML tags.
160 |   - `limit_strategy`: The strategy used to handle tags exceeding the depth limit, which can be 'No max tags limit', 'Flattening', or 'Removing'.
161 | 


--------------------------------------------------------------------------------
/hui/ALLOWED_ATTRS.py:
--------------------------------------------------------------------------------
  1 | GLOBAL_ATTRS = [
  2 |     "accesskey",
  3 |     "anchor",
  4 |     "autocapitalize",
  5 |     "autocorrect",
  6 |     "autofocus",
  7 |     "class",
  8 |     "contenteditable",
  9 |     "data-*",
 10 |     "dir",
 11 |     "draggable",
 12 |     "enterkeyhint",
 13 |     "exportparts",
 14 |     "hidden",
 15 |     "id",
 16 |     "inert",
 17 |     "inputmode",
 18 |     "is",
 19 |     "itemid",
 20 |     "itemprop",
 21 |     "itemref",
 22 |     "itemscope",
 23 |     "itemtype",
 24 |     "lang",
 25 |     "nonce",
 26 |     "part",
 27 |     "popover",
 28 |     "slot",
 29 |     "spellcheck",
 30 |     "style",
 31 |     "tabindex",
 32 |     "title",
 33 |     "translate",
 34 |     "virtualkeyboardpolicy",
 35 |     "writingsuggestions"
 36 | ]
 37 | 
 38 | EVENT_ATTRS = [
 39 |     "onafterprint",
 40 |     "onafterscriptexecute",
 41 |     "onanimationcancel",
 42 |     "onanimationend",
 43 |     "onanimationiteration",
 44 |     "onanimationstart",
 45 |     "onauxclick",
 46 |     "onbeforecopy",
 47 |     "onbeforecut",
 48 |     "onbeforeinput",
 49 |     "onbeforeprint",
 50 |     "onbeforescriptexecute",
 51 |     "onbeforetoggle",
 52 |     "onbeforeunload",
 53 |     "onbegin",
 54 |     "onblur",
 55 |     "oncancel",
 56 |     "oncanplay",
 57 |     "oncanplaythrough",
 58 |     "onchange",
 59 |     "onclick",
 60 |     "onclose",
 61 |     "oncontentvisibilityautostatechange",
 62 |     "oncontextmenu",
 63 |     "oncopy",
 64 |     "oncuechange",
 65 |     "oncut",
 66 |     "ondblclick",
 67 |     "ondrag",
 68 |     "ondragend",
 69 |     "ondragenter",
 70 |     "ondragexit",
 71 |     "ondragleave",
 72 |     "ondragover",
 73 |     "ondragstart",
 74 |     "ondrop",
 75 |     "ondurationchange",
 76 |     "onend",
 77 |     "onended",
 78 |     "onerror",
 79 |     "onfocus",
 80 |     "onfocus(autofocus)",
 81 |     "onfocusin",
 82 |     "onfocusout",
 83 |     "onformdata",
 84 |     "onfullscreenchange",
 85 |     "onhashchange",
 86 |     "oninput",
 87 |     "oninvalid",
 88 |     "onkeydown",
 89 |     "onkeypress",
 90 |     "onkeyup",
 91 |     "onload",
 92 |     "onloadeddata",
 93 |     "onloadedmetadata",
 94 |     "onloadstart",
 95 |     "onmessage",
 96 |     "onmousedown",
 97 |     "onmouseenter",
 98 |     "onmouseleave",
 99 |     "onmousemove",
100 |     "onmouseout",
101 |     "onmouseover",
102 |     "onmouseup",
103 |     "onmousewheel",
104 |     "onmozfullscreenchange",
105 |     "onpagehide",
106 |     "onpageshow",
107 |     "onpaste",
108 |     "onpause",
109 |     "onplay",
110 |     "onplaying",
111 |     "onpointercancel",
112 |     "onpointerdown",
113 |     "onpointerenter",
114 |     "onpointerleave",
115 |     "onpointermove",
116 |     "onpointerout",
117 |     "onpointerover",
118 |     "onpointerrawupdate",
119 |     "onpointerup",
120 |     "onpopstate",
121 |     "onprogress",
122 |     "onratechange",
123 |     "onrepeat",
124 |     "onreset",
125 |     "onresize",
126 |     "onscroll",
127 |     "onscrollend",
128 |     "onscrollsnapchange",
129 |     "onsearch",
130 |     "onseeked",
131 |     "onseeking",
132 |     "onselect",
133 |     "onselectionchange",
134 |     "onselectstart",
135 |     "onshow",
136 |     "onsubmit",
137 |     "onsuspend",
138 |     "ontimeupdate",
139 |     "ontoggle",
140 |     "ontoggle(popover)",
141 |     "ontouchend",
142 |     "ontouchmove",
143 |     "ontouchstart",
144 |     "ontransitioncancel",
145 |     "ontransitionend",
146 |     "ontransitionrun",
147 |     "ontransitionstart",
148 |     "onunhandledrejection",
149 |     "onunload",
150 |     "onvolumechange",
151 |     "onwaiting",
152 |     "onwebkitanimationend",
153 |     "onwebkitanimationiteration",
154 |     "onwebkitanimationstart",
155 |     "onwebkitfullscreenchange",
156 |     "onwebkitmouseforcechanged",
157 |     "onwebkitmouseforcedown",
158 |     "onwebkitmouseforceup",
159 |     "onwebkitmouseforcewillbegin",
160 |     "onwebkitplaybacktargetavailabilitychanged",
161 |     "onwebkitpresentationmodechanged",
162 |     "onwebkittransitionend",
163 |     "onwebkitwillrevealbottom",
164 |     "onwheel"
165 | ]
166 | 
# Maps an HTML tag name to the list of attribute names associated with that
# tag. Key order and per-tag attribute order are unchanged.
DEFAULT_ATTRS = {
    "form": [
        "accept", "accept-charset", "action", "autocomplete", "enctype",
        "method", "name", "novalidate", "target",
    ],
    "input": [
        "accept", "alt", "autocomplete", "capture", "checked", "dirname",
        "disabled", "form",
        "formaction", "formenctype", "formmethod", "formnovalidate",
        "formtarget",
        "list", "max", "maxlength", "minlength", "min", "multiple", "name",
        "pattern", "placeholder", "readonly", "required", "size", "src",
        "step", "type", "usemap", "value", "width",
    ],
    "col": ["span"],
    "colgroup": ["span"],
    "iframe": [
        "allow", "csp", "name", "referrerpolicy", "sandbox", "src",
        "srcdoc", "width",
    ],
    "img": [
        "alt", "crossorigin", "decoding", "intrinsicsize", "ismap",
        "referrerpolicy", "sizes", "src", "srcset", "usemap", "width",
    ],
    "table": ["summary"],
    "td": ["colspan", "headers", "rowspan"],
    "th": ["colspan", "headers", "rowspan", "scope"],
    "area": [
        "alt", "coords", "download", "href", "media", "ping",
        "referrerpolicy", "rel", "shape", "target",
    ],
    "link": [
        "as", "crossorigin", "href", "hreflang", "integrity",
        "media", "referrerpolicy", "rel", "sizes", "type",
    ],
    "script": [
        "async", "crossorigin", "defer", "integrity", "language",
        "referrerpolicy", "src", "type",
    ],
    "select": [
        "autocomplete", "disabled", "form", "multiple", "name",
        "required", "size",
    ],
    "textarea": [
        "autocomplete", "cols", "dirname", "disabled", "enterkeyhint",
        "form", "inputmode", "maxlength", "minlength", "name",
        "placeholder", "readonly", "required", "rows", "wrap",
    ],
    "audio": [
        "autoplay", "controls", "crossorigin", "loop", "muted",
        "preload", "src",
    ],
    "video": [
        "autoplay", "controls", "crossorigin", "loop", "muted",
        "playsinline", "poster", "preload", "src", "width",
    ],
    "marquee": ["loop"],
    "object": ["data", "form", "name", "type", "usemap", "width"],
    "meta": ["charset", "content", "http-equiv", "name"],
    "blockquote": ["cite"],
    "del": ["cite", "datetime"],
    "ins": ["cite", "datetime"],
    "q": ["cite"],
    "time": ["datetime"],
    "track": ["default", "kind", "label", "src", "srclang"],
    "button": [
        "disabled", "form", "formaction", "formenctype", "formmethod",
        "formnovalidate", "formtarget", "name", "type", "value",
    ],
    "fieldset": ["disabled", "form", "name"],
    "optgroup": ["disabled", "label"],
    "option": ["disabled", "label", "selected", "value"],
    "a": [
        "download", "href", "hreflang", "media", "ping",
        "referrerpolicy", "rel", "shape", "target",
    ],
    "label": ["for", "form"],
    "output": ["for", "form", "name"],
    "meter": ["form", "high", "low", "max", "min", "optimum", "value"],
    "progress": ["form", "max", "value"],
    "canvas": ["width"],
    "embed": ["src", "type", "width"],
    "base": ["href", "target"],
    "source": ["media", "sizes", "src", "srcset", "type"],
    "style": ["media", "scoped", "type"],
    "map": ["name"],
    "param": ["name", "value"],
    "details": ["open"],
    "dialog": ["open"],
    "ol": ["reversed", "start", "type"],
    "menu": ["type"],
    "data": ["value"],
    "li": ["value"],
}


--------------------------------------------------------------------------------
/hui/identify.py:
--------------------------------------------------------------------------------
  1 | from .ALLOWED_TAGS import *
  2 | from .ALLOWED_ATTRS import *
  3 | from string import Template
  4 | import json
  5 | import os
  6 | import importlib.resources
  7 | from importlib.resources import files
  8 | from .parsers.simple_parser import SANITIZE_HTML
  9 | import logging
 10 | from .CustomParser import CustomParser
 11 | 
 12 | class Identifier:
 13 |     def __init__(self, handler, buffer_enabled=False, buffer_delimeter="<div>TEXTTEXT</div>", buffer_limit=32, template_vars=None, debug_mode=False) -> None:
 14 |         """
 15 |         Initializes the Identifier class with a handler function and optional parameters for buffer management, template variables, and logging.
 16 | 
 17 |         :param handler: handler function that must return text with an HTML response.
 18 |             Example of a handler function:
 19 |                 lambda payload: requests.get(f"http://localhost:3000?payload={payload}").text
 20 |         :param buffer_enabled: Boolean indicating whether to enable buffering of payloads before sending to the handler.
 21 |         :param buffer_delimeter: String used to delimit payloads in the buffer.
 22 |         :param buffer_limit: Integer specifying the maximum number of payloads to buffer before sending to the handler.
 23 |         :param template_vars: Optional dictionary of template variables to use for substitution in payloads.
 24 |         :param debug_mode: Boolean indicating whether to enable debug logging.
 25 |         :return: returns nothing
 26 |         """
 27 |         self.handler = handler
 28 |         self.ALLOWED_TAGS = {
 29 |             "html": [],
 30 |             "svg": [],
 31 |             "math": [],
 32 |         }
 33 |         self.TEMPLATE_VARS = template_vars if template_vars is not None else {
 34 |             'text': 'govnoed',
 35 |             'href': 'https://github.com',
 36 |             'attribute_prefix': 'data'
 37 |         }
 38 | 
 39 |         self.ALLOWED_TAGS_CHECKED = False
 40 |         self.DEFAULT_SANITIZER = SANITIZE_HTML()
 41 | 
 42 |         self.BUFFER = ""
 43 |         self.BUFFER_LIMIT = buffer_limit
 44 |         self.BUFFER_ENABLED = buffer_enabled
 45 |         self.BUFFER_DELIMETER = buffer_delimeter
 46 |         
 47 |         self.INCORRECT_PARSED = []
 48 |         
 49 |         self.DEPTH_LIMITS = ()
 50 | 
 51 |         self.ATTRIBUTES = {
 52 |             "custom_attribute" : None, # is custom attributes allowed
 53 |             "event_attributes_blocked": None, # is event attributes directly blocked
 54 |             "data_attributes": None, # is data attributes allowed
 55 |             "attrs_allowed":{
 56 |                 "global":[], # global attributes
 57 |                 "events":[] # events attributes
 58 |             }
 59 |         }
 60 | 
 61 |         # Configure logging based on debug_mode
 62 |         if debug_mode:
 63 |             logging.basicConfig(level=logging.DEBUG)
 64 |         else:
 65 |             logging.basicConfig(level=logging.INFO)
 66 | 
 67 |         self.logger = logging.getLogger(__name__)
 68 |         self.parser = CustomParser()
 69 | 
 70 |     def check_allowed_tags(self) -> dict:
 71 |         """
 72 |         Check and validate allowed HTML, SVG, and MathML tags.
 73 | 
 74 |         :return: A dictionary of allowed tags.
 75 |         """
 76 |         self.logger.debug("Checking allowed tags...")
 77 |         self.ALLOWED_TAGS_CHECKED = True
 78 |         self.check_html_namespace()
 79 |         self.check_namespace("math")
 80 |         self.check_namespace("svg")
 81 | 
 82 |         self.logger.debug("Allowed tags checked: %s", self.ALLOWED_TAGS)
 83 |         return self.ALLOWED_TAGS
 84 | 
 85 |     def call_handler(self, template_payloads: list[str]) -> list[str]:
 86 |         """
 87 |         Call the handler function with the provided template payloads.
 88 | 
 89 |         :param template_payloads: List of template strings to process.
 90 |         :return: List of processed results from the handler.
 91 |         """
 92 |         self.logger.debug("Calling handler with payloads: %s", template_payloads)
 93 |         for i in range(len(template_payloads)):
 94 |             template_payloads[i] = Template(template_payloads[i]).safe_substitute(self.TEMPLATE_VARS)
 95 | 
 96 |         if self.BUFFER_ENABLED:
 97 |             res = []
 98 |             buffer = []
 99 |             for payload in template_payloads:
100 |                 buffer.append(payload)
101 |                 if len(buffer) >= self.BUFFER_LIMIT:
102 |                     res.extend(self.handler(self.BUFFER_DELIMETER.join(buffer)).split(self.BUFFER_DELIMETER))
103 |                     buffer = []
104 |             if buffer:
105 |                 res.extend(self.handler(self.BUFFER_DELIMETER.join(buffer)).split(self.BUFFER_DELIMETER))
106 |             self.logger.debug("Handler results: %s", res)
107 |             return res
108 | 
109 |         res = [self.handler(payload) for payload in template_payloads]
110 |         self.logger.debug("Handler results: %s", res)
111 |         return res
112 | 
113 |     def check_html_namespace(self) -> None:
114 |         """
115 |         Check and validate allowed HTML tags.
116 | 
117 |         :return: None
118 |         """
119 |         self.logger.debug("Checking HTML namespace...")
120 |         arr = []
121 |         for tag in html_tags:
122 |             arr.append([f'<{tag}>$text</{tag}>', tag])
123 | 
124 |         for tag in html_table_tags:
125 |             arr.append([f'<table><{tag}>$text</{tag}></table>', tag])
126 | 
127 |         handler_results = self.call_handler([x[0] for x in arr])
128 |         for i in range(len(handler_results)):
129 |             res = handler_results[i]
130 |             if f'<{arr[i][1]}' in res:
131 |                 self.ALLOWED_TAGS["html"].append(arr[i][1])
132 | 
133 |         self.logger.debug("Allowed HTML tags: %s", self.ALLOWED_TAGS["html"])
134 | 
135 |     def check_namespace(self, namespace: str) -> None:
136 |         """
137 |         Check and validate tags in the specified namespace (math or svg).
138 | 
139 |         :param namespace: The namespace to check (math or svg).
140 |         :raises Exception: If the namespace is not supported.
141 |         :return: None
142 |         """
143 |         self.logger.debug("Checking namespace: %s", namespace)
144 |         if namespace not in self.ALLOWED_TAGS:
145 |             raise Exception(f'{namespace} namespace is not supported')
146 | 
147 |         tag_arr = []
148 |         namespace_tags = []
149 |         if namespace == "math":
150 |             namespace_tags = mathml_tags
151 |         elif namespace == "svg":
152 |             namespace_tags = svg_tags
153 | 
154 |         for tag in namespace_tags:
155 |             tag_arr.append([f'<{namespace}><{tag}>$text</{tag}></{namespace}>', tag])
156 | 
157 |         handler_results = self.call_handler([x[0] for x in tag_arr])
158 | 
159 |         for i in range(len(handler_results)):
160 |             res = handler_results[i]
161 |             if f'<{tag_arr[i][1]}' in res:
162 |                 self.ALLOWED_TAGS[namespace].append(tag_arr[i][1])
163 | 
164 |         self.logger.debug("Allowed tags for namespace '%s': %s", namespace, self.ALLOWED_TAGS[namespace])
165 | 
166 |     def check_tag_allowed(self, tag: str) -> bool:
167 |         """
168 |         Check if a tag is allowed.
169 | 
170 |         :param tag: The tag to check.
171 |         :return: True if the tag is allowed, False otherwise.
172 |         """
173 |         return any([(tag in self.ALLOWED_TAGS[namespace]) for namespace in self.ALLOWED_TAGS])
174 | 
175 |     def identify(self) -> list[list[float | int | str]]:
176 |         """
177 |         Identify and validate tags against expected outputs.
178 | 
179 |         :return: A sorted list of results with match ratios and file names.
180 |         """
181 |         self.logger.debug("Identifying tags...")
182 |         if len(self.ALLOWED_TAGS['html']) == 0:
183 |             self.check_allowed_tags()
184 |         self.check_allowed_attrs()
185 |         self.check_depth()
186 |         arr = self.DEFAULT_SANITIZER.checks
187 |         res = self.call_handler([tag.payload for tag in arr])
188 |         for i in range(len(res)):
189 |             all_tags_allowed = all([self.check_tag_allowed(tag) for tag in arr[i].tags])
190 |             if  all_tags_allowed and  not(arr[i].check(res[i],self.TEMPLATE_VARS)):
191 |                 self.logger.debug("Found incorrect parsing logic: %s, but %s is expected", res[i], arr[i].expected_output)
192 |                 self.INCORRECT_PARSED.append({"output": res[i].strip(), "expected": arr[i].expected_output})
193 |                 
194 | 
195 |         json_files = [f for f in importlib.resources.files('hui.results_parsers').iterdir() if f.name.endswith('.json')]
196 | 
197 |         result = []
198 |         for json_file in json_files:
199 |             with open(json_file) as f:
200 |                 data = json.load(f)
201 | 
202 |             # Count the number of matches in the JSON file
203 |             matches = sum([1 for i in range(len(res)) if Template(data[i]).substitute(self.TEMPLATE_VARS).strip() in res[i].strip()])
204 |             result.append([matches / len(data), matches, json_file.name.split('.')[0]])
205 | 
206 |         result = sorted(result, reverse=True)
207 |         self.logger.debug("Identification results: %s", result)
208 |         return result
209 | 
210 |     def check_namespace_supported(self, namespace: str) -> bool:
211 |         """
212 |         Check if the specified namespace is supported.
213 | 
214 |         :param namespace: The namespace to check.
215 |         :raises Exception: If the namespace is invalid or not supported.
216 |         :return: True if the namespace is supported, False otherwise.
217 |         """
218 |         if not self.ALLOWED_TAGS_CHECKED:
219 |             self.check_allowed_tags()
220 |         if namespace not in self.ALLOWED_TAGS:
221 |             raise Exception('Invalid namespace name')
222 |         return len(self.ALLOWED_TAGS[namespace]) > 0
223 |     
224 |     def check_attr_allowed(self, attr: str, tag: str = None, attr_value: str = "https://github.com/Slonser/hui") -> bool:
225 |         """
226 |         Checks if a given attribute is allowed for a specified tag.
227 | 
228 |         This method checks if a given attribute is allowed for a specified tag by simulating the parsing of HTML elements with the attribute and then checking if the attribute is present in the parsed attributes.
229 | 
230 |         :param attr: The attribute to check.
231 |         :param tag: The tag to check the attribute for. Defaults to None, which means the first allowed HTML tag will be used.
232 |         :param attr_value: The value to assign to the attribute for testing. Defaults to "https://github.com/Slonser/hui".
233 |         :return: True if the attribute is allowed, False otherwise.
234 |         """
235 |         if tag is None:
236 |             assert self.check_namespace_supported("html"), "No tags allowed"
237 |             tag = self.ALLOWED_TAGS['html'][0]
238 |         
239 |         # Simulate parsing of HTML elements with the attribute to check
240 |         res = self.call_handler([f'<{tag} {attr}="{attr_value}"></{tag}>',
241 |                                  f'<{tag}/{attr}="{attr_value}"></{tag}>']) # In some situations, the attribute might only be parsed with a / symbol
242 |         self.parser.check(res[0]+res[1])
243 |         # Check if the attribute is present in the parsed attributes
244 |         return attr in [attr_parsed[0] for attr_parsed in self.parser.found_attrs]
245 |     
246 |     def check_allowed_attrs(self):
247 |         """
248 |         Check and validate allowed attributes for HTML tags.
249 | 
250 |         This method checks if global attributes, event attributes, and default attributes are allowed.
251 |         It updates the ATTRIBUTES dictionary with the allowed attributes and logs the results.
252 | 
253 |         :return: A dictionary containing the allowed attributes categorized by global, event, and specific tags.
254 |         """
255 |         for attr in GLOBAL_ATTRS:
256 |             is_allowed = self.check_attr_allowed(attr)
257 |             if is_allowed:
258 |                 self.ATTRIBUTES["attrs_allowed"]["global"].append(attr)
259 |         
260 | 
261 |         for attr in EVENT_ATTRS:
262 |             is_allowed = self.check_attr_allowed(attr)
263 |             if is_allowed:
264 |                 self.ATTRIBUTES["events"]["events"].append(attr)
265 | 
266 |         for tag in DEFAULT_ATTRS:
267 |             self.ATTRIBUTES["attrs_allowed"][tag] = []
268 |             for attr in DEFAULT_ATTRS[tag]:
269 |                 is_allowed = self.check_attr_allowed(attr,tag=tag)
270 |                 if is_allowed:
271 |                     self.ATTRIBUTES["attrs_allowed"][tag].append(attr)
272 |  
273 |         self.ATTRIBUTES["data_attributes"] = self.check_attr_allowed("data-hui")
274 |         if self.ATTRIBUTES["data_attributes"]:
275 |             self.logger.debug("data attributes allowed")
276 |         
277 |         self.ATTRIBUTES["custom_attribute"] = self.check_attr_allowed("custom")
278 | 
279 |         if self.ATTRIBUTES["custom_attribute"]:
280 |             self.logger.debug("Custom attributes allowed")
281 |         
282 |         self.ATTRIBUTES["event_attributes_blocked"] = not(self.check_attr_allowed("onhui"))
283 |         
284 |         if self.ATTRIBUTES["custom_attribute"] and self.ATTRIBUTES["event_attributes_blocked"]:
285 |             self.logger.debug("Event attributes directly blocked")
286 |         
287 |         return self.ATTRIBUTES
288 |     
289 |     def check_depth(self):
290 |         """
291 |         Check and validate the depth of HTML tags.
292 | 
293 |         This method checks if the depth of HTML tags exceeds the limit and updates the DEPTH_LIMITS accordingly.
294 | 
295 |         :return: DEPTH_LIMITS
296 |         """
297 |         assert self.check_namespace_supported("html"), "No tags allowed"
298 |         tag = self.ALLOWED_TAGS['html'][0]
299 |         res = self.call_handler([f'<div>'*514+f'</div>'])
300 |         self.parser.check(res[0])
301 |         if self.parser.max_depth > 512:
302 |             self.DEPTH_LIMITS = (self.parser.max_depth, 'No max tags limit')
303 |         elif self.parser.start_tags > 512:
304 |             self.DEPTH_LIMITS = (self.parser.max_depth, 'Flattening')
305 |         else:
306 |             self.DEPTH_LIMITS = (self.parser.max_depth, 'Removing')
307 |         return self.DEPTH_LIMITS
308 | 


--------------------------------------------------------------------------------