├── hui ├── parsers │ ├── __init__.py │ ├── generators │ │ ├── JSOUP │ │ │ ├── target │ │ │ │ ├── maven-status │ │ │ │ │ └── maven-compiler-plugin │ │ │ │ │ │ └── compile │ │ │ │ │ │ └── default-compile │ │ │ │ │ │ ├── createdFiles.lst │ │ │ │ │ │ └── inputFiles.lst │ │ │ │ ├── maven-archiver │ │ │ │ │ └── pom.properties │ │ │ │ ├── java-jsoup-1.0-SNAPSHOT.jar │ │ │ │ ├── classes │ │ │ │ │ └── com │ │ │ │ │ │ └── example │ │ │ │ │ │ └── Main.class │ │ │ │ └── original-java-jsoup-1.0-SNAPSHOT.jar │ │ │ ├── src │ │ │ │ └── main │ │ │ │ │ └── java │ │ │ │ │ └── com │ │ │ │ │ └── example │ │ │ │ │ └── Main.java │ │ │ ├── dependency-reduced-pom.xml │ │ │ └── pom.xml │ │ ├── go.mod │ │ ├── python-html_sanitizer.py │ │ ├── python-html.py │ │ ├── python-lxml-html.py │ │ ├── js_jsxss.js │ │ ├── js_sanitize-html.js │ │ ├── js_htmlparser2.js │ │ ├── js_jsdom.js │ │ ├── js_dompurify.js │ │ ├── go.sum │ │ ├── go_bluemonday.go │ │ └── go_html.go │ ├── JS_DOM.py │ ├── GO_HTML.py │ ├── JS_JSXSS.py │ ├── JS_DOMPURIFY.py │ ├── JS_HTMLPARSER2.py │ ├── simple_parser.py │ ├── JS_SANITIZE_HTML.py │ ├── JAVA_JSOUP.py │ ├── PYTHON_HTML.py │ ├── GO_bluemonday.py │ ├── PYTHON_HTML_SANITIZER.py │ └── PYTHON_LXML_HTML.py ├── __init__.py ├── results_parsers │ ├── PYTHON_HTML_SANITIZE.json │ ├── JS_DOMPURIFY.json │ ├── JS_SANITIZE_HTML.json │ ├── JSDOM_HTML.json │ ├── JAVA_JSOUP.json │ ├── JS_HTMLPARSER2.json │ ├── PYTHON_HTML.json │ ├── JS_JSXSS.json │ ├── PYTHON_LXML_HTML.json │ ├── GO_BLUEMONDAY.json │ └── GO_HTML.json ├── Generator.py ├── CustomParser.py ├── generated_payloads.json ├── ParserPayload.py ├── ALLOWED_TAGS.py ├── ParserBase.py ├── ALLOWED_ATTRS.py └── identify.py ├── hui.egg-info ├── dependency_links.txt ├── top_level.txt ├── SOURCES.txt └── PKG-INFO ├── MANIFEST ├── .gitignore ├── dist ├── hui-0.2.2.tar.gz └── hui-0.2.2-py3-none-any.whl ├── pyproject.toml ├── setup.cfg ├── examples └── example.py ├── setup.py ├── LICENSE.txt └── README.md /hui/parsers/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hui.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /hui.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | hui 2 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | setup.py 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | node_modules 3 | package-lock.json 4 | hui.egg-info -------------------------------------------------------------------------------- /dist/hui-0.2.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Slonser/hui/HEAD/dist/hui-0.2.2.tar.gz -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ['setuptools>=42'] 3 | build-backend = 'setuptools.build_meta' -------------------------------------------------------------------------------- /dist/hui-0.2.2-py3-none-any.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Slonser/hui/HEAD/dist/hui-0.2.2-py3-none-any.whl -------------------------------------------------------------------------------- /hui/parsers/generators/JSOUP/target/maven-status/maven-compiler-plugin/compile/default-compile/createdFiles.lst: 
-------------------------------------------------------------------------------- 1 | com/example/Main.class 2 | -------------------------------------------------------------------------------- /hui/parsers/generators/JSOUP/target/maven-archiver/pom.properties: -------------------------------------------------------------------------------- 1 | artifactId=java-jsoup 2 | groupId=com.example 3 | version=1.0-SNAPSHOT 4 | -------------------------------------------------------------------------------- /hui/parsers/generators/JSOUP/target/java-jsoup-1.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Slonser/hui/HEAD/hui/parsers/generators/JSOUP/target/java-jsoup-1.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /hui/parsers/generators/JSOUP/target/classes/com/example/Main.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Slonser/hui/HEAD/hui/parsers/generators/JSOUP/target/classes/com/example/Main.class -------------------------------------------------------------------------------- /hui/parsers/generators/JSOUP/target/original-java-jsoup-1.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Slonser/hui/HEAD/hui/parsers/generators/JSOUP/target/original-java-jsoup-1.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /hui/parsers/generators/JSOUP/target/maven-status/maven-compiler-plugin/compile/default-compile/inputFiles.lst: -------------------------------------------------------------------------------- 1 | /Users/slonser/parser_identifier/src/parsers/generators/JSOUP/src/main/java/com/example/Main.java 2 | -------------------------------------------------------------------------------- /setup.cfg: 
-------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | include_package_data = True 4 | long_description = file: README.md 5 | long_description_content_type = text/markdown 6 | name = HTML Universal Identifier 7 | -------------------------------------------------------------------------------- /hui/__init__.py: -------------------------------------------------------------------------------- 1 | from .identify import Identifier 2 | from .parsers import * 3 | from .ParserPayload import ParserPayload 4 | from .ParserBase import ParserBase 5 | from .ALLOWED_TAGS import * 6 | from .Generator import generate 7 | from .CustomParser import CustomParser 8 | from .ALLOWED_ATTRS import * -------------------------------------------------------------------------------- /hui/parsers/generators/go.mod: -------------------------------------------------------------------------------- 1 | module generator 2 | 3 | go 1.23.4 4 | 5 | require ( 6 | github.com/aymerick/douceur v0.2.0 // indirect 7 | github.com/gorilla/css v1.0.1 // indirect 8 | github.com/microcosm-cc/bluemonday v1.0.27 // indirect 9 | golang.org/x/net v0.26.0 // indirect 10 | ) 11 | -------------------------------------------------------------------------------- /hui/parsers/JS_DOM.py: -------------------------------------------------------------------------------- 1 | from ..ParserBase import ParserBase 2 | import os 3 | 4 | class SANITIZE_HTML(ParserBase): 5 | 6 | def __init__(self) -> None: 7 | super().__init__("JS_DOM") 8 | 9 | def get_results(self): 10 | self.generate_payloads() 11 | os.system("node ./parsers/generators/js_jsdom.js") 12 | -------------------------------------------------------------------------------- /hui/parsers/GO_HTML.py: -------------------------------------------------------------------------------- 1 | from ..ParserBase import ParserBase 2 | import os 3 | 4 | class SANITIZE_HTML(ParserBase): 5 | 6 | def 
__init__(self) -> None: 7 | super().__init__("GO_HTML") 8 | 9 | def get_results(self): 10 | self.generate_payloads() 11 | os.system("cd ./parsers/generators/;go run go_html.go") 12 | -------------------------------------------------------------------------------- /hui/parsers/JS_JSXSS.py: -------------------------------------------------------------------------------- 1 | from ..ParserBase import ParserBase 2 | import os 3 | 4 | class SANITIZE_HTML(ParserBase): 5 | 6 | def __init__(self) -> None: 7 | super().__init__("JS_DOMPURIFY") 8 | 9 | def get_results(self): 10 | self.generate_payloads() 11 | os.system("node ./parsers/generators/js_jsxss.js") 12 | -------------------------------------------------------------------------------- /hui/parsers/JS_DOMPURIFY.py: -------------------------------------------------------------------------------- 1 | from ..ParserBase import ParserBase 2 | import os 3 | 4 | class SANITIZE_HTML(ParserBase): 5 | 6 | def __init__(self) -> None: 7 | super().__init__("JS_DOMPURIFY") 8 | 9 | def get_results(self): 10 | self.generate_payloads() 11 | os.system("node ./parsers/generators/js_dompurify.js") 12 | -------------------------------------------------------------------------------- /hui/parsers/JS_HTMLPARSER2.py: -------------------------------------------------------------------------------- 1 | from ..ParserBase import ParserBase 2 | import os 3 | 4 | class SANITIZE_HTML(ParserBase): 5 | 6 | def __init__(self) -> None: 7 | super().__init__("JS_HTMLPARSER2") 8 | 9 | def get_results(self): 10 | self.generate_payloads() 11 | os.system("node ./parsers/generators/js_htmlparser2.js") 12 | -------------------------------------------------------------------------------- /hui/parsers/simple_parser.py: -------------------------------------------------------------------------------- 1 | from ..ParserBase import ParserBase 2 | import os 3 | 4 | # Cheat for include basic checks into identify 5 | # TODO: replace 6 | class SANITIZE_HTML(ParserBase): 7 
| 8 | def __init__(self) -> None: 9 | super().__init__("SIMPLE_PARSER") 10 | 11 | def get_results(self): 12 | self.generate_payloads() -------------------------------------------------------------------------------- /hui/parsers/JS_SANITIZE_HTML.py: -------------------------------------------------------------------------------- 1 | from ..ParserBase import ParserBase 2 | import os 3 | 4 | class SANITIZE_HTML(ParserBase): 5 | 6 | def __init__(self) -> None: 7 | super().__init__("JS_SANITIZE_HTML") 8 | 9 | def get_results(self): 10 | self.generate_payloads() 11 | os.system("node ./parsers/generators/js_sanitize-html.js") 12 | -------------------------------------------------------------------------------- /hui/parsers/JAVA_JSOUP.py: -------------------------------------------------------------------------------- 1 | from ..ParserBase import ParserBase 2 | import os 3 | 4 | class SANITIZE_HTML(ParserBase): 5 | 6 | def __init__(self) -> None: 7 | super().__init__("JAVA_JSOUP") 8 | 9 | def get_results(self): 10 | self.generate_payloads() 11 | os.system("java -jar ./parsers/generators/JSOUP/target/java-jsoup-1.0-SNAPSHOT.jar") 12 | -------------------------------------------------------------------------------- /hui/parsers/PYTHON_HTML.py: -------------------------------------------------------------------------------- 1 | from ..ParserBase import ParserBase 2 | import os 3 | # Python HTML parser 4 | class SANITIZE_HTML(ParserBase): 5 | 6 | def __init__(self) -> None: 7 | super().__init__("PYTHON_HTML") 8 | 9 | def get_results(self): 10 | self.generate_payloads() 11 | os.system("python ./parsers/generators/python-html.py") 12 | -------------------------------------------------------------------------------- /hui/parsers/GO_bluemonday.py: -------------------------------------------------------------------------------- 1 | from ..ParserBase import ParserBase 2 | from ..ParserPayload import ParserPayload 3 | import os 4 | 5 | class SANITIZE_HTML(ParserBase): 6 | 7 | def 
__init__(self) -> None: 8 | super().__init__("GO_bluemonday") 9 | 10 | def get_results(self): 11 | self.generate_payloads() 12 | os.system("cd ./parsers/generators/;go run go_bluemonday.go") 13 | -------------------------------------------------------------------------------- /hui/parsers/PYTHON_HTML_SANITIZER.py: -------------------------------------------------------------------------------- 1 | from ..ParserBase import ParserBase 2 | import os 3 | # Python HTML parser 4 | class SANITIZE_HTML(ParserBase): 5 | 6 | def __init__(self) -> None: 7 | super().__init__("PYTHON_HTML_SANITIZER") 8 | 9 | def get_results(self): 10 | self.generate_payloads() 11 | os.system("python ./parsers/generators/python-html_sanitizer.py") 12 | -------------------------------------------------------------------------------- /hui/parsers/PYTHON_LXML_HTML.py: -------------------------------------------------------------------------------- 1 | from ..ParserBase import ParserBase 2 | import os 3 | 4 | # Python LXML_HTML parser 5 | class SANITIZE_HTML(ParserBase): 6 | 7 | def __init__(self) -> None: 8 | super().__init__("PYTHON_LXML_HTML") 9 | 10 | def get_results(self): 11 | self.generate_payloads() 12 | os.system("python ./parsers/generators/python-lxml-html.py") 13 | -------------------------------------------------------------------------------- /hui/parsers/generators/python-html_sanitizer.py: -------------------------------------------------------------------------------- 1 | from html_sanitizer import Sanitizer 2 | import json 3 | 4 | def generate(): 5 | arr = json.load(open("generated_payloads.json")) 6 | res = [] 7 | sanitizer = Sanitizer() 8 | for payload in arr: 9 | html_content = f"{payload}" 10 | sanitized_content = sanitizer.sanitize(html_content) 11 | res.append(sanitized_content) 12 | json.dump(res,open("results_parsers/PYTHON_HTML_SANITIZE.json","w")) 13 | 14 | if __name__ == "__main__": 15 | generate() 
-------------------------------------------------------------------------------- /hui/parsers/generators/python-html.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import json 3 | 4 | def generate(): 5 | arr = json.load(open("generated_payloads.json")) 6 | res = [] 7 | for payload in arr: 8 | html_content = f"{payload}" 9 | 10 | soup = BeautifulSoup(html_content, 'html.parser') 11 | 12 | body_inner_html = str(soup.body) 13 | res.append(body_inner_html[6:-7]) 14 | json.dump(res,open("results_parsers/PYTHON_HTML.json","w")) 15 | 16 | if __name__ == "__main__": 17 | generate() -------------------------------------------------------------------------------- /hui/parsers/generators/python-lxml-html.py: -------------------------------------------------------------------------------- 1 | from lxml import etree 2 | import json 3 | 4 | def generate(): 5 | arr = json.load(open("generated_payloads.json")) 6 | res = [] 7 | for payload in arr: 8 | html_content = f"{payload}" 9 | 10 | parser = etree.HTMLParser() 11 | tree = etree.fromstring(html_content, parser) 12 | 13 | body_inner_html = etree.tostring(tree.find('.//body'), encoding='unicode') 14 | res.append(body_inner_html) 15 | json.dump(res, open("results_parsers/PYTHON_LXML_HTML.json", "w")) 16 | 17 | if __name__ == "__main__": 18 | generate() -------------------------------------------------------------------------------- /hui/results_parsers/PYTHON_HTML_SANITIZE.json: -------------------------------------------------------------------------------- 1 | ["", "", "", "", "\">", "", "", "

$text

", "

$text

", "

$text

", "$text", "$text", "

$text

", "$text", "$text", "$text", "$text", "$text", "$text", "$text", "$text", "$text", "$text", "$text", "$text", "$text", "
$text", "

$text

", ""] -------------------------------------------------------------------------------- /hui/parsers/generators/js_jsxss.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | const xss = require('xss'); 3 | 4 | function generate() { 5 | const arr = JSON.parse(fs.readFileSync("generated_payloads.json")); 6 | const res = []; 7 | for(let payload of arr){ 8 | try{ 9 | const html_content = `${payload}`; 10 | console.log(html_content) 11 | const sanitized_html = xss(html_content); 12 | res.push(sanitized_html); 13 | }catch(e){ 14 | res.push(""); 15 | } 16 | } 17 | fs.writeFileSync("results_parsers/JS_JSXSS.json", JSON.stringify(res)); 18 | } 19 | 20 | if (require.main === module) { 21 | generate(); 22 | } -------------------------------------------------------------------------------- /hui/parsers/generators/js_sanitize-html.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | const sanitizeHtml = require('sanitize-html'); 3 | 4 | function generate() { 5 | const arr = JSON.parse(fs.readFileSync("generated_payloads.json")); 6 | const res = []; 7 | arr.forEach(payload => { 8 | try{ 9 | const html_content = `${payload}`; 10 | const sanitized_html = sanitizeHtml(html_content); 11 | res.push(sanitized_html); 12 | }catch{ 13 | res.push(""); 14 | } 15 | }); 16 | fs.writeFileSync("results_parsers/JS_SANITIZE_HTML.json", JSON.stringify(res)); 17 | } 18 | 19 | if (require.main === module) { 20 | generate(); 21 | } -------------------------------------------------------------------------------- /hui/parsers/generators/js_htmlparser2.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | const { parseDocument } = require('htmlparser2'); 3 | const cheerio = require('cheerio'); 4 | 5 | function generate() { 6 | const arr = JSON.parse(fs.readFileSync("generated_payloads.json")); 7 | const res = 
[]; 8 | arr.forEach(payload => { 9 | const html_content = `${payload}`; 10 | const dom = parseDocument(html_content); 11 | const $ = cheerio.load(dom); 12 | const body_inner_html = $('body').html(); 13 | res.push(body_inner_html); 14 | }); 15 | fs.writeFileSync("results_parsers/JS_HTMLPARSER2.json", JSON.stringify(res)); 16 | } 17 | 18 | if (require.main === module) { 19 | generate(); 20 | } -------------------------------------------------------------------------------- /hui/results_parsers/JS_DOMPURIFY.json: -------------------------------------------------------------------------------- 1 | ["\">","\">","\">","\">","\">","","","

$text

","

$text

","

$text

","

$text
","
$text
","
$text
","
$text","
$text
","
$text
","
$text","
$text","$text","$text","$text","$text","$text","$text","$text","$text","
$text","

$text

","

"] -------------------------------------------------------------------------------- /examples/example.py: -------------------------------------------------------------------------------- 1 | from hui.identify import Identifier 2 | import requests 3 | 4 | def handler(payload): 5 | return requests.get("http://localhost:3005/sanitize",params={"html":payload}).text 6 | 7 | a = Identifier(handler=handler, buffer_enabled=False, buffer_limit=64, debug_mode=False) 8 | print(a.identify()) 9 | # run all 10 | print(a.check_attr_allowed("href",tag="a")) 11 | # True or False 12 | print(a.INCORRECT_PARSED) 13 | # Example output 14 | # [{'output': '
govnoed
', 'expected': '
$text
'}, .. ] 15 | print(a.ALLOWED_TAGS) 16 | # print allowed tags 17 | print(a.ATTRIBUTES) 18 | # Prints ATTRIBUTES info 19 | print(a.DEPTH_LIMITS) 20 | # Example Outputs: 21 | # (514, 'No max tags limit') 22 | # (512, 'Flattening') 23 | # (255, 'Removing') -------------------------------------------------------------------------------- /hui/parsers/generators/js_jsdom.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | const { JSDOM } = require('jsdom'); 3 | 4 | function generate() { 5 | const arr = JSON.parse(fs.readFileSync("generated_payloads.json")); 6 | const res = []; 7 | for(let payload of arr){ 8 | try{ 9 | const html_content = `${payload}`; 10 | const dom = new JSDOM(); 11 | dom.window.document.body.innerHTML = html_content; 12 | const body_inner_html = dom.window.document.body.innerHTML; 13 | res.push(body_inner_html); 14 | }catch(e){ 15 | res.push(""); 16 | } 17 | } 18 | fs.writeFileSync("results_parsers/JSDOM_HTML.json", JSON.stringify(res)); 19 | } 20 | 21 | if (require.main === module) { 22 | generate(); 23 | } -------------------------------------------------------------------------------- /hui/Generator.py: -------------------------------------------------------------------------------- 1 | from .parsers import JS_DOM, JS_DOMPURIFY, JS_HTMLPARSER2, PYTHON_HTML, PYTHON_LXML_HTML,PYTHON_HTML_SANITIZER ,GO_HTML, JS_SANITIZE_HTML, GO_bluemonday, JS_JSXSS 2 | 3 | def generate(): 4 | parsers_list = [ 5 | JS_DOMPURIFY.SANITIZE_HTML(), 6 | JS_DOM.SANITIZE_HTML(), 7 | JS_HTMLPARSER2.SANITIZE_HTML(), 8 | PYTHON_HTML.SANITIZE_HTML(), 9 | PYTHON_LXML_HTML.SANITIZE_HTML(), 10 | GO_HTML.SANITIZE_HTML(), 11 | JS_SANITIZE_HTML.SANITIZE_HTML(), 12 | PYTHON_HTML_SANITIZER.SANITIZE_HTML(), 13 | GO_bluemonday.SANITIZE_HTML(), 14 | JS_JSXSS.SANITIZE_HTML() 15 | ] 16 | 17 | for parser in parsers_list: 18 | parser.get_results() 19 | 20 | if __name__ == "__main__": 21 | generate() 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | 2 | from setuptools import setup, find_packages 3 | 4 | setup( 5 | name = 'hui', 6 | packages= find_packages(), 7 | version = '0.2.2', 8 | license='MIT', 9 | author = 'Slonser', 10 | author_email = 'slonser@slonser.info', 11 | url = 'https://github.com/slonser/hui', 12 | download_url = 'https://github.com/Slonser/hui/archive/v_01.tar.gz', 13 | keywords = ['HTML', 'hui', 'HTML GUESSER', "HTML identifier", "XSS", "bugbounty"], 14 | classifiers=[ 15 | 'Development Status :: 3 - Alpha', 16 | 'Intended Audience :: Developers', 17 | 'Topic :: Software Development :: Build Tools', 18 | 'License :: OSI Approved :: MIT License', 19 | 'Programming Language :: Python :: 3' 20 | ], 21 | package_data={'': ['generated_payloads.json','results_parsers/*.json']} 22 | ) -------------------------------------------------------------------------------- /hui/parsers/generators/js_dompurify.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | const { JSDOM } = require('jsdom'); 3 | const DOMPurify = require('dompurify'); 4 | 5 | function generate() { 6 | const arr = JSON.parse(fs.readFileSync("generated_payloads.json")); 7 | const res = []; 8 | arr.forEach(payload => { 9 | try{ 10 | const html_content = `${payload}`; 11 | const window = new JSDOM('').window; 12 | const purify = DOMPurify(window); 13 | const sanitized_html = purify.sanitize(html_content); 14 | res.push(sanitized_html); 15 | }catch{ 16 | res.push(""); 17 | } 18 | }); 19 | fs.writeFileSync("results_parsers/JS_DOMPURIFY.json", JSON.stringify(res)); 20 | } 21 | 22 | if (require.main === module) { 23 | generate(); 24 | } -------------------------------------------------------------------------------- /hui/parsers/generators/go.sum: 
-------------------------------------------------------------------------------- 1 | github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk= 2 | github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4= 3 | github.com/gorilla/css v1.0.1 h1:ntNaBIghp6JmvWnxbZKANoLyuXTPZ4cAMlo6RyhlbO8= 4 | github.com/gorilla/css v1.0.1/go.mod h1:BvnYkspnSzMmwRK+b8/xgNPLiIuNZr6vbZBTPQ2A3b0= 5 | github.com/microcosm-cc/bluemonday v1.0.27 h1:MpEUotklkwCSLeH+Qdx1VJgNqLlpY2KXwXFM08ygZfk= 6 | github.com/microcosm-cc/bluemonday v1.0.27/go.mod h1:jFi9vgW+H7c3V0lb6nR74Ib/DIB5OBs92Dimizgw2cA= 7 | golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM= 8 | golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= 9 | golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= 10 | golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= 11 | -------------------------------------------------------------------------------- /hui/results_parsers/JS_SANITIZE_HTML.json: -------------------------------------------------------------------------------- 1 | ["","","","","\">","","

","

$text

","

$text

","

$text

","

$text
","
$text
","$text","
$text
","
$text
","
$text
","$text
","$text
","$text","$text","$text","$text","$text","$text","$text","$text","
$text","

$text

","

"] -------------------------------------------------------------------------------- /hui/results_parsers/JSDOM_HTML.json: -------------------------------------------------------------------------------- 1 | ["<a href=\"https://github.com/Slonser/hui/\">","\">","","<a href=\"https://github.com/Slonser/hui/\">","\">","<a href=\"https://github.com/Slonser/hui/</plaintext>\"></a></plaintext></plaintext>","<select></select>","<h1></h1><h2>$text</h2>","<h2></h2><h3>$text</h3>","<h3></h3><h4>$text</h4>","<h4></h4><h5>$text</h5>","<h5></h5><h6>$text</h6>","<form>$text</form>","<table></table><table></table>$text","<table><caption></caption><caption>$text</caption></table>","<table><tbody><tr><td></td><td>$text</td></tr></tbody></table>","<table><tbody><tr></tr><tr></tr></tbody></table>$text","<table><colgroup><col><col></colgroup></table>$text","$text","$text","$text","$text","$text","$text","<a $attribute_prefix-K=\"1\" href=\"$href\">$text</a>","<wbr>$text","<hr>$text","<h1>$text</h1>","<h1></h1>"] -------------------------------------------------------------------------------- /hui/parsers/generators/go_bluemonday.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "os" 7 | "github.com/microcosm-cc/bluemonday" 8 | ) 9 | 10 | func generate() { 11 | file, err := os.Open("../../generated_payloads.json") 12 | if err != nil { 13 | fmt.Println("Error opening file:", err) 14 | return 15 | } 16 | defer file.Close() 17 | 18 | var arr []string 19 | if err := json.NewDecoder(file).Decode(&arr); err != nil { 20 | fmt.Println("Error decoding JSON:", err) 21 | return 22 | } 23 | 24 | var res []string 25 | for _, payload := range arr { 26 | p := bluemonday.UGCPolicy() 27 | sanitizedHTML := p.Sanitize(payload) 28 | res = append(res, sanitizedHTML) 29 | } 30 | 31 | outputFile, err := os.Create("../../results_parsers/GO_BLUEMONDAY.json") 32 | if err != nil { 33 | fmt.Println("Error 
creating output file:", err) 34 | return 35 | } 36 | defer outputFile.Close() 37 | 38 | if err := json.NewEncoder(outputFile).Encode(res); err != nil { 39 | fmt.Println("Error encoding JSON:", err) 40 | } 41 | } 42 | 43 | func main() { 44 | generate() 45 | } -------------------------------------------------------------------------------- /hui/CustomParser.py: -------------------------------------------------------------------------------- 1 | from html.parser import HTMLParser 2 | 3 | class CustomParser(HTMLParser): 4 | def __init__(self): 5 | super().__init__() 6 | self.customattr_found = False 7 | self.found_attrs = [] 8 | self.found_tags = [] 9 | self.current_depth = 0 10 | self.max_depth = 0 11 | self.start_tags = 0 12 | 13 | def handle_starttag(self, tag, attrs): 14 | self.found_tags.append(tag) 15 | self.found_attrs.extend(attrs) 16 | self.current_depth += 1 17 | self.start_tags += 1 18 | self.max_depth = max(self.max_depth, self.current_depth) 19 | 20 | def handle_endtag(self, tag): 21 | self.current_depth -= 1 22 | 23 | def check(self, payload): 24 | self.found_attrs = [] 25 | self.found_tags = [] 26 | self.current_depth = 0 27 | self.max_depth = 0 28 | self.start_tags = 0 29 | 30 | self.feed(payload) 31 | # Need to close parser to clear buffer 32 | # TODO: Is this best solution? 
33 | self.close() 34 | return self.max_depth -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | Copyright (c) 2024 Slonser 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | The above copyright notice and this permission notice shall be included in all 10 | copies or substantial portions of the Software. 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 17 | SOFTWARE. 
18 | -------------------------------------------------------------------------------- /hui/results_parsers/JAVA_JSOUP.json: -------------------------------------------------------------------------------- 1 | ["<xmp>\n &lt;a href=\"https:\/\/github.com\/Slonser\/hui\/\n<\/xmp>\"&gt;","<textarea>&lt;a href=\"https:\/\/github.com\/Slonser\/hui\/<\/textarea>\"&gt;","","<noembed>\n &lt;a href=\"https:\/\/github.com\/Slonser\/hui\/\n<\/noembed>\"&gt;","\"&gt;","<plaintext>\n &lt;a href=\"https:\/\/github.com\/Slonser\/hui\/&lt;\/plaintext&gt;\"&gt;&lt;\/a&gt;&lt;\/plaintext&gt;\n<\/plaintext>","<select><\/select>","<h1><\/h1>\n<h2>$text<\/h2>","<h2><\/h2>\n<h3>$text<\/h3>","<h3><\/h3>\n<h4>$text<\/h4>","<h4><\/h4>\n<h5>$text<\/h5>","<h5><\/h5>\n<h6>$text<\/h6>","<form>\n $text\n<\/form>","<table><\/table>\n<table>\n $text\n<\/table>","<table>\n <caption><\/caption>\n <caption>\n $text\n <\/caption>\n<\/table>","<table>\n <tbody>\n <tr>\n <td><\/td>\n <td>$text<\/td>\n <\/tr>\n <\/tbody>\n<\/table>","<table>\n <tbody>\n <tr><\/tr>\n <tr>\n $text\n <\/tr>\n <\/tbody>\n<\/table>","<table>\n <colgroup>\n <col>\n <col>\n <\/colgroup>$text\n<\/table>","$text","$text","$text","$text","$text","$text","<a $attribute_prefix-k=\"1\" href=\"$href\">$text<\/a>","<wbr>$text","<hr>$text","<h1>$text<\/h1>","<h1><\/h1>"] -------------------------------------------------------------------------------- /hui/results_parsers/JS_HTMLPARSER2.json: -------------------------------------------------------------------------------- 1 | ["<xmp><a href=\"https://github.com/Slonser/hui/</xmp>\"></a></xmp>","<textarea>&lt;a href=\"https://github.com/Slonser/hui/</textarea>\"&gt;","<noscript><a href=\"https://github.com/Slonser/hui/</noscript>\"></a></noscript>","<noembed><a href=\"https://github.com/Slonser/hui/</noembed>\"></a></noembed>","<style>&lt;a href=\"https://github.com/Slonser/hui/</style>\"&gt;","<plaintext><a 
href=\"https://github.com/Slonser/hui/</plaintext>\"></a></plaintext>","<select><h1></h1></select>","<h1><h2>$text</h2></h1>","<h2><h3>$text</h3></h2>","<h3><h4>$text</h4></h3>","<h4><h5>$text</h5></h4>","<h5><h6>$text</h6></h5>","<form><form>$text</form></form>","<table><table>$text</table></table>","<table><caption><caption>$text</caption></caption></table>","<table><td></td><td>$text</td></table>","<table><tr></tr><tr>$text</tr></table>","<table><col></col><col></col>$text</table>","<th>$text</th>","<td>$text</td>","<tfoot>$text</tfoot>","<thead>$text</thead>","<tbody>$text</tbody>","<tr>$text</tr>","<a $attribute_prefix-k=\"1\" href=\"$href\">$text</a>","<wbr></wbr>$text","<hr></hr>$text","<h1>$text</h1>","<h1></h1>"] -------------------------------------------------------------------------------- /hui/generated_payloads.json: -------------------------------------------------------------------------------- 1 | ["<xmp><a href=\"https://github.com/Slonser/hui/</xmp>\"></a></xmp>", "<textarea><a href=\"https://github.com/Slonser/hui/</textarea>\"></a></textarea>", "<noscript><a href=\"https://github.com/Slonser/hui/</noscript>\"></a></noscript>", "<noembed><a href=\"https://github.com/Slonser/hui/</noembed>\"></a></noembed>", "<style><a href=\"https://github.com/Slonser/hui/</style>\"></a></style>", "<plaintext><a href=\"https://github.com/Slonser/hui/</plaintext>\"></a></plaintext>", "<select><h1></h1></select>", "<h1><h2>$text</h2></h1>", "<h2><h3>$text</h3></h2>", "<h3><h4>$text</h4></h3>", "<h4><h5>$text</h5></h4>", "<h5><h6>$text</h6></h5>", "<form><form>$text</form></form>", "<table><table>$text</table></table>", "<table><caption><caption>$text</caption></caption></caption>", "<table><td><td>$text</td></td></td>", "<table><tr><tr>$text</tr></tr></tr>", "<table><col><col>$text</col></col></col>", "<th>$text</th>", "<td>$text</td>", "<tfoot>$text</tfoot>", "<thead>$text</thead>", "<tbody>$text</tbody>", "<tr>$text</tr>", "<a $attribute_prefix-\u212a=\"1\" 
href=\"$href\">$text</a>", "<wbr>$text</wbr>", "<hr>$text</hr>", "<h1>$text</h1></h1>", "<h1>"] -------------------------------------------------------------------------------- /hui/results_parsers/PYTHON_HTML.json: -------------------------------------------------------------------------------- 1 | ["<xmp><a href=\"https://github.com/Slonser/hui/&lt;/xmp&gt;\"></a></xmp>", "<textarea><a href=\"https://github.com/Slonser/hui/&lt;/textarea&gt;\"></a></textarea>", "<noscript><a href=\"https://github.com/Slonser/hui/&lt;/noscript&gt;\"></a></noscript>", "<noembed><a href=\"https://github.com/Slonser/hui/&lt;/noembed&gt;\"></a></noembed>", "<style><a href=\"https://github.com/Slonser/hui/</style>\"&gt;", "<plaintext><a href=\"https://github.com/Slonser/hui/&lt;/plaintext&gt;\"></a></plaintext>", "<select><h1></h1></select>", "<h1><h2>$text</h2></h1>", "<h2><h3>$text</h3></h2>", "<h3><h4>$text</h4></h3>", "<h4><h5>$text</h5></h4>", "<h5><h6>$text</h6></h5>", "<form><form>$text</form></form>", "<table><table>$text</table></table>", "<table><caption><caption>$text</caption></caption></table>", "<table><td><td>$text</td></td></table>", "<table><tr><tr>$text</tr></tr></table>", "<table><col/><col/>$text</table>", "<th>$text</th>", "<td>$text</td>", "<tfoot>$text</tfoot>", "<thead>$text</thead>", "<tbody>$text</tbody>", "<tr>$text</tr>", "<a $attribute_prefix-k=\"1\" href=\"$href\">$text</a>", "<wbr/>$text", "<hr/>$text", "<h1>$text</h1>", "<h1></h1>"] -------------------------------------------------------------------------------- /hui/results_parsers/JS_JSXSS.json: -------------------------------------------------------------------------------- 1 | ["&lt;xmp&gt;<a href=\"https://github.com/Slonser/hui/&lt;/xmp&gt;\"></a>&lt;/xmp&gt;","&lt;textarea&gt;<a href=\"https://github.com/Slonser/hui/&lt;/textarea&gt;\"></a>&lt;/textarea&gt;","&lt;noscript&gt;<a href=\"https://github.com/Slonser/hui/&lt;/noscript&gt;\"></a>&lt;/noscript&gt;","&lt;noembed&gt;<a 
href=\"https://github.com/Slonser/hui/&lt;/noembed&gt;\"></a>&lt;/noembed&gt;","&lt;style&gt;<a href=\"https://github.com/Slonser/hui/&lt;/style&gt;\"></a>&lt;/style&gt;","&lt;plaintext&gt;<a href=\"https://github.com/Slonser/hui/&lt;/plaintext&gt;\"></a>&lt;/plaintext&gt;","&lt;select&gt;<h1></h1>&lt;/select&gt;","<h1><h2>$text</h2></h1>","<h2><h3>$text</h3></h2>","<h3><h4>$text</h4></h3>","<h4><h5>$text</h5></h4>","<h5><h6>$text</h6></h5>","&lt;form&gt;&lt;form&gt;$text&lt;/form&gt;&lt;/form&gt;","<table><table>$text</table></table>","<table><caption><caption>$text</caption></caption></caption>","<table><td><td>$text</td></td></td>","<table><tr><tr>$text</tr></tr></tr>","<table><col><col>$text</col></col></col>","<th>$text</th>","<td>$text</td>","<tfoot>$text</tfoot>","<thead>$text</thead>","<tbody>$text</tbody>","<tr>$text</tr>","<a href>$text</a>","&lt;wbr&gt;$text&lt;/wbr&gt;","<hr>$text</hr>","<h1>$text</h1></h1>","<h1>"] -------------------------------------------------------------------------------- /hui.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | LICENSE.txt 2 | README.md 3 | pyproject.toml 4 | setup.cfg 5 | setup.py 6 | hui/ALLOWED_ATTRS.py 7 | hui/ALLOWED_TAGS.py 8 | hui/CustomParser.py 9 | hui/Generator.py 10 | hui/ParserBase.py 11 | hui/ParserPayload.py 12 | hui/__init__.py 13 | hui/generated_payloads.json 14 | hui/identify.py 15 | hui.egg-info/PKG-INFO 16 | hui.egg-info/SOURCES.txt 17 | hui.egg-info/dependency_links.txt 18 | hui.egg-info/top_level.txt 19 | hui/parsers/GO_HTML.py 20 | hui/parsers/GO_bluemonday.py 21 | hui/parsers/JAVA_JSOUP.py 22 | hui/parsers/JS_DOM.py 23 | hui/parsers/JS_DOMPURIFY.py 24 | hui/parsers/JS_HTMLPARSER2.py 25 | hui/parsers/JS_JSXSS.py 26 | hui/parsers/JS_SANITIZE_HTML.py 27 | hui/parsers/PYTHON_HTML.py 28 | hui/parsers/PYTHON_HTML_SANITIZER.py 29 | hui/parsers/PYTHON_LXML_HTML.py 30 | hui/parsers/__init__.py 31 | hui/parsers/simple_parser.py 32 | 
hui/results_parsers/GO_BLUEMONDAY.json 33 | hui/results_parsers/GO_HTML.json 34 | hui/results_parsers/JAVA_JSOUP.json 35 | hui/results_parsers/JSDOM_HTML.json 36 | hui/results_parsers/JS_DOMPURIFY.json 37 | hui/results_parsers/JS_HTMLPARSER2.json 38 | hui/results_parsers/JS_JSXSS.json 39 | hui/results_parsers/JS_SANITIZE_HTML.json 40 | hui/results_parsers/PYTHON_HTML.json 41 | hui/results_parsers/PYTHON_HTML_SANITIZE.json 42 | hui/results_parsers/PYTHON_LXML_HTML.json -------------------------------------------------------------------------------- /hui/parsers/generators/JSOUP/src/main/java/com/example/Main.java: -------------------------------------------------------------------------------- 1 | package com.example; 2 | 3 | import org.jsoup.Jsoup; 4 | import org.jsoup.nodes.Document; 5 | import org.json.simple.JSONArray; 6 | import org.json.simple.parser.JSONParser; 7 | import java.nio.file.Files; 8 | import java.nio.file.Paths; 9 | import java.io.IOException; 10 | import java.io.FileReader; 11 | 12 | 13 | public class Main { 14 | public static void generate() { 15 | try { 16 | System.out.println("Starting generation process..."); 17 | JSONParser parser = new JSONParser(); 18 | JSONArray arr = (JSONArray) parser.parse(new FileReader("generated_payloads.json")); 19 | JSONArray res = new JSONArray(); 20 | 21 | for (int i = 0; i < arr.size(); i++) { 22 | String htmlContent = (String) arr.get(i); 23 | Document doc = Jsoup.parse(htmlContent); 24 | String bodyInnerHtml = doc.body().html(); 25 | res.add(bodyInnerHtml); 26 | } 27 | Files.write(Paths.get("results_parsers/JAVA_JSOUP.json"), res.toJSONString().getBytes()); 28 | } catch (Exception e) { 29 | e.printStackTrace(); 30 | } 31 | } 32 | 33 | public static void main(String[] args) { 34 | generate(); 35 | } 36 | } -------------------------------------------------------------------------------- /hui/parsers/generators/JSOUP/dependency-reduced-pom.xml: 
-------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> 3 | <modelVersion>4.0.0</modelVersion> 4 | <groupId>com.example</groupId> 5 | <artifactId>java-jsoup</artifactId> 6 | <version>1.0-SNAPSHOT</version> 7 | <build> 8 | <plugins> 9 | <plugin> 10 | <artifactId>maven-jar-plugin</artifactId> 11 | <version>3.2.0</version> 12 | <configuration> 13 | <archive> 14 | <manifest> 15 | <addDefaultImplementationEntries>true</addDefaultImplementationEntries> 16 | <addDefaultSpecificationEntries>true</addDefaultSpecificationEntries> 17 | <mainClass>com.example.Main</mainClass> 18 | </manifest> 19 | </archive> 20 | </configuration> 21 | </plugin> 22 | <plugin> 23 | <artifactId>maven-shade-plugin</artifactId> 24 | <version>3.6.0</version> 25 | <executions> 26 | <execution> 27 | <phase>package</phase> 28 | <goals> 29 | <goal>shade</goal> 30 | </goals> 31 | </execution> 32 | </executions> 33 | </plugin> 34 | </plugins> 35 | </build> 36 | </project> 37 | -------------------------------------------------------------------------------- /hui/results_parsers/PYTHON_LXML_HTML.json: -------------------------------------------------------------------------------- 1 | ["<body><xmp><a href=\"https://github.com/Slonser/hui/&lt;/xmp&gt;\"/></xmp></body>", "<body><textarea><a href=\"https://github.com/Slonser/hui/&lt;/textarea&gt;\"/></textarea></body>", "<body><noscript><a href=\"https://github.com/Slonser/hui/&lt;/noscript&gt;\"/></noscript></body>", "<body><noembed><a href=\"https://github.com/Slonser/hui/&lt;/noembed&gt;\"/></noembed></body>", "<body><style>&lt;a href=\"https://github.com/Slonser/hui/</style>\"&gt;</body>", "<body><plaintext><a 
href=\"https://github.com/Slonser/hui/&lt;/plaintext&gt;\"/></plaintext></body>", "<body><select><h1/></select></body>", "<body><h1><h2>$text</h2></h1></body>", "<body><h2><h3>$text</h3></h2></body>", "<body><h3><h4>$text</h4></h3></body>", "<body><h4><h5>$text</h5></h4></body>", "<body><h5><h6>$text</h6></h5></body>", "<body><form/><form>$text</form></body>", "<body><table><table>$text</table></table></body>", "<body><table><caption><caption>$text</caption></caption></table></body>", "<body><table><td/><td>$text</td></table></body>", "<body><table><tr/><tr>$text</tr></table></body>", "<body><table><col/><col/>$text</table></body>", "<body><th>$text</th></body>", "<body><td>$text</td></body>", "<body><tfoot>$text</tfoot></body>", "<body><thead>$text</thead></body>", "<body><tbody>$text</tbody></body>", "<body><tr>$text</tr></body>", "<body><a href=\"$href\">$text</a></body>", "<body><wbr>$text</wbr></body>", "<body><hr/>$text</body>", "<body><h1>$text</h1></body>", "<body><h1/></body>"] -------------------------------------------------------------------------------- /hui/parsers/generators/go_html.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "os" 7 | "strings" 8 | 9 | "golang.org/x/net/html" 10 | ) 11 | 12 | func generate() { 13 | file, err := os.Open("../../generated_payloads.json") 14 | if err != nil { 15 | fmt.Println("Error opening file:", err) 16 | return 17 | } 18 | defer file.Close() 19 | 20 | var arr []string 21 | if err := json.NewDecoder(file).Decode(&arr); err != nil { 22 | fmt.Println("Error decoding JSON:", err) 23 | return 24 | } 25 | 26 | var res []string 27 | for _, payload := range arr { 28 | htmlContent := fmt.Sprintf("<html><body>%s</body></html>", payload) 29 | 30 | doc, err := html.Parse(strings.NewReader(htmlContent)) 31 | if err != nil { 32 | fmt.Println("Error parsing HTML:", err) 33 | continue 34 | } 35 | 36 | var bodyInnerHTML string 
37 | var f func(*html.Node) 38 | f = func(n *html.Node) { 39 | if n.Type == html.ElementNode && n.Data == "body" { 40 | var buf strings.Builder 41 | html.Render(&buf, n) 42 | bodyInnerHTML = buf.String() 43 | } 44 | for c := n.FirstChild; c != nil; c = c.NextSibling { 45 | f(c) 46 | } 47 | } 48 | f(doc) 49 | 50 | res = append(res, strings.TrimSuffix(strings.TrimPrefix(bodyInnerHTML, "<body>"), "</body>")) 51 | } 52 | 53 | outputFile, err := os.Create("../../results_parsers/GO_HTML.json") 54 | if err != nil { 55 | fmt.Println("Error creating output file:", err) 56 | return 57 | } 58 | defer outputFile.Close() 59 | 60 | if err := json.NewEncoder(outputFile).Encode(res); err != nil { 61 | fmt.Println("Error encoding JSON:", err) 62 | } 63 | } 64 | 65 | func main() { 66 | generate() 67 | } -------------------------------------------------------------------------------- /hui/results_parsers/GO_BLUEMONDAY.json: -------------------------------------------------------------------------------- 1 | ["\u0026lt;a href=\u0026#34;https://github.com/Slonser/hui/\u0026#34;\u0026gt;\u003c/a\u003e","\u0026lt;a href=\u0026#34;https://github.com/Slonser/hui/\u0026#34;\u0026gt;\u003c/a\u003e","\u0026#34;\u0026gt;\u003c/a\u003e","\u0026#34;\u0026gt;\u003c/a\u003e","\u0026#34;\u0026gt;\u003c/a\u003e","\u0026lt;a 
href=\u0026#34;https://github.com/Slonser/hui/\u0026lt;/plaintext\u0026gt;\u0026#34;\u0026gt;\u0026lt;/a\u0026gt;\u0026lt;/plaintext\u0026gt;","\u003ch1\u003e\u003c/h1\u003e","\u003ch1\u003e\u003ch2\u003e$text\u003c/h2\u003e\u003c/h1\u003e","\u003ch2\u003e\u003ch3\u003e$text\u003c/h3\u003e\u003c/h2\u003e","\u003ch3\u003e\u003ch4\u003e$text\u003c/h4\u003e\u003c/h3\u003e","\u003ch4\u003e\u003ch5\u003e$text\u003c/h5\u003e\u003c/h4\u003e","\u003ch5\u003e\u003ch6\u003e$text\u003c/h6\u003e\u003c/h5\u003e","$text","\u003ctable\u003e\u003ctable\u003e$text\u003c/table\u003e\u003c/table\u003e","\u003ctable\u003e\u003ccaption\u003e\u003ccaption\u003e$text\u003c/caption\u003e\u003c/caption\u003e\u003c/caption\u003e","\u003ctable\u003e\u003ctd\u003e\u003ctd\u003e$text\u003c/td\u003e\u003c/td\u003e\u003c/td\u003e","\u003ctable\u003e\u003ctr\u003e\u003ctr\u003e$text\u003c/tr\u003e\u003c/tr\u003e\u003c/tr\u003e","\u003ctable\u003e\u003ccol\u003e\u003ccol\u003e$text\u003c/col\u003e\u003c/col\u003e\u003c/col\u003e","\u003cth\u003e$text\u003c/th\u003e","\u003ctd\u003e$text\u003c/td\u003e","\u003ctfoot\u003e$text\u003c/tfoot\u003e","\u003cthead\u003e$text\u003c/thead\u003e","\u003ctbody\u003e$text\u003c/tbody\u003e","\u003ctr\u003e$text\u003c/tr\u003e","\u003ca href=\"$href\" rel=\"nofollow\"\u003e$text\u003c/a\u003e","\u003cwbr\u003e$text\u003c/wbr\u003e","\u003chr\u003e$text\u003c/hr\u003e","\u003ch1\u003e$text\u003c/h1\u003e\u003c/h1\u003e","\u003ch1\u003e"] 2 | -------------------------------------------------------------------------------- /hui/results_parsers/GO_HTML.json: -------------------------------------------------------------------------------- 1 | ["\u003cxmp\u003e\u003ca href=\"https://github.com/Slonser/hui/\u003c/xmp\u003e\u0026#34;\u0026gt;","\u003ctextarea\u003e\u0026lt;a href=\u0026#34;https://github.com/Slonser/hui/\u003c/textarea\u003e\u0026#34;\u0026gt;","\u003cnoscript\u003e\u003ca 
href=\"https://github.com/Slonser/hui/\u003c/noscript\u003e\u0026#34;\u0026gt;","\u003cnoembed\u003e\u003ca href=\"https://github.com/Slonser/hui/\u003c/noembed\u003e\u0026#34;\u0026gt;","\u003cstyle\u003e\u003ca href=\"https://github.com/Slonser/hui/\u003c/style\u003e\u0026#34;\u0026gt;","\u003cplaintext\u003e\u003ca href=\"https://github.com/Slonser/hui/\u003c/plaintext\u003e\"\u003e\u003c/a\u003e\u003c/plaintext\u003e\u003c/body\u003e\u003c/html\u003e","\u003cselect\u003e\u003c/select\u003e","\u003ch1\u003e\u003c/h1\u003e\u003ch2\u003e$text\u003c/h2\u003e","\u003ch2\u003e\u003c/h2\u003e\u003ch3\u003e$text\u003c/h3\u003e","\u003ch3\u003e\u003c/h3\u003e\u003ch4\u003e$text\u003c/h4\u003e","\u003ch4\u003e\u003c/h4\u003e\u003ch5\u003e$text\u003c/h5\u003e","\u003ch5\u003e\u003c/h5\u003e\u003ch6\u003e$text\u003c/h6\u003e","\u003cform\u003e$text\u003c/form\u003e","\u003ctable\u003e\u003c/table\u003e$text\u003ctable\u003e\u003c/table\u003e","\u003ctable\u003e\u003ccaption\u003e\u003c/caption\u003e\u003ccaption\u003e$text\u003c/caption\u003e\u003c/table\u003e","\u003ctable\u003e\u003ctbody\u003e\u003ctr\u003e\u003ctd\u003e\u003c/td\u003e\u003ctd\u003e$text\u003c/td\u003e\u003c/tr\u003e\u003c/tbody\u003e\u003c/table\u003e","$text\u003ctable\u003e\u003ctbody\u003e\u003ctr\u003e\u003c/tr\u003e\u003ctr\u003e\u003c/tr\u003e\u003c/tbody\u003e\u003c/table\u003e","$text\u003ctable\u003e\u003ccolgroup\u003e\u003ccol/\u003e\u003ccol/\u003e\u003c/colgroup\u003e\u003c/table\u003e","$text","$text","$text","$text","$text","$text","\u003ca $attribute_prefix-K=\"1\" href=\"$href\"\u003e$text\u003c/a\u003e","\u003cwbr/\u003e$text","\u003chr/\u003e$text","\u003ch1\u003e$text\u003c/h1\u003e","\u003ch1\u003e\u003c/h1\u003e"] 2 | -------------------------------------------------------------------------------- /hui/parsers/generators/JSOUP/pom.xml: -------------------------------------------------------------------------------- 1 | <!-- pom.xml --> 2 | <project 
class ParserPayload:
    """
    Represents a payload for parsing with additional metadata and methods for validation.

    Attributes:
        payload (str): The actual payload to be parsed.
        expected_output (str, optional): The expected output of the parsing process. Defaults to None.
        version (str, optional): The version of the parser or the payload. Defaults to None.
        sanitizer (str, optional): The sanitizer to be used for the payload. Defaults to None.
        parametrs (str, optional): Parameters for the parsing process. Defaults to None.
        tags (list): Tags associated with the payload.

    Methods:
        check(output, TEMPLATE_VARS): Checks if the output matches the expected output, considering whitespace.
        remove_whitespace(string): Removes whitespace from a given string.
    """

    def __init__(self, payload: str, tags, expected_output: str = None, version: str = None, sanitizer: str = None, parametrs: str = None) -> None:
        self.payload = payload
        self.expected_output = expected_output
        self.version = version
        self.sanitizer = sanitizer
        self.parametrs = parametrs
        self.tags = tags

    def check(self, output, TEMPLATE_VARS):
        """
        Checks if the output matches the expected output, considering whitespace.

        Both strings are rendered with ``string.Template.safe_substitute`` first,
        so placeholders missing from TEMPLATE_VARS are left untouched. The
        comparison is a containment test: the rendered output matches when it
        occurs inside the rendered expected output, either verbatim or after
        all whitespace has been removed from both.

        Args:
            output (str): The output to be checked against the expected output.
            TEMPLATE_VARS (dict): Values substituted for ``$name`` placeholders
                in both the output and the expected output.

        Returns:
            bool: True if the output matches the expected output, False otherwise
            (including when no expected output was configured).
        """
        # Without an expected output there is nothing to compare against;
        # Template(None) would otherwise fail with a TypeError.
        if self.expected_output is None:
            return False

        rendered_output = Template(output).safe_substitute(TEMPLATE_VARS)
        rendered_expected = Template(self.expected_output).safe_substitute(TEMPLATE_VARS)

        if rendered_output in rendered_expected:
            return True
        # Pretty-printing parsers insert newlines/indentation; ignore them.
        return self.remove_whitespace(rendered_output) in self.remove_whitespace(rendered_expected)

    def remove_whitespace(self, string):
        """
        Removes whitespace from a given string.

        Args:
            string (str): The string from which to remove whitespace.

        Returns:
            str: The string with all whitespace removed.
        """
        return "".join(string.split())
'ul', 130 | 'var', 131 | 'video', 132 | 'wbr', 133 | 'xmp', 134 | 'customtag' 135 | ] 136 | 137 | # Some tags may be only work inside table tag 138 | html_table_tags = [ 139 | 'caption', 140 | 'thead', 141 | 'colgroup', 142 | 'col', 143 | 'th', 144 | 'tbody', 145 | 'tr', 146 | 'td', 147 | 'tfoot', 148 | ] 149 | 150 | # Copyied from https://github.com/cure53/DOMPurify/blob/f1106aae5a861d1096cb57ad9a6f518b4279ea8c/src/tags.ts#L226 151 | mathml_tags = [ 152 | 'math', 153 | 'menclose', 154 | 'merror', 155 | 'mfenced', 156 | 'mfrac', 157 | 'mglyph', 158 | 'mi', 159 | 'mlabeledtr', 160 | 'mmultiscripts', 161 | 'mn', 162 | 'mo', 163 | 'mover', 164 | 'mpadded', 165 | 'mphantom', 166 | 'mroot', 167 | 'mrow', 168 | 'ms', 169 | 'mspace', 170 | 'msqrt', 171 | 'mstyle', 172 | 'msub', 173 | 'msup', 174 | 'msubsup', 175 | 'mtable', 176 | 'mtd', 177 | 'mtext', 178 | 'mtr', 179 | 'munder', 180 | 'munderover', 181 | 'mprescripts', 182 | 'maction', 183 | 'maligngroup', 184 | 'malignmark', 185 | 'mlongdiv', 186 | 'mscarries', 187 | 'mscarry', 188 | 'msgroup', 189 | 'mstack', 190 | 'msline', 191 | 'msrow', 192 | 'semantics', 193 | 'annotation', 194 | 'annotation-xml', 195 | 'mprescripts', 196 | 'none', 197 | ] 198 | 199 | # Copyied from https://github.com/cure53/DOMPurify/blob/f1106aae5a861d1096cb57ad9a6f518b4279ea8c/src/tags.ts#L123 200 | svg_tags = [ 201 | 'animate', 202 | 'color-profile', 203 | 'cursor', 204 | 'discard', 205 | 'font-face', 206 | 'font-face-format', 207 | 'font-face-name', 208 | 'font-face-src', 209 | 'font-face-uri', 210 | 'foreignobject', 211 | 'hatch', 212 | 'hatchpath', 213 | 'mesh', 214 | 'meshgradient', 215 | 'meshpatch', 216 | 'meshrow', 217 | 'missing-glyph', 218 | 'script', 219 | 'set', 220 | 'solidcolor', 221 | 'unknown', 222 | 'use', 223 | 'feBlend', 224 | 'feColorMatrix', 225 | 'feComponentTransfer', 226 | 'feComposite', 227 | 'feConvolveMatrix', 228 | 'feDiffuseLighting', 229 | 'feDisplacementMap', 230 | 'feDistantLight', 231 | 'feDropShadow', 232 | 
class ParserBase:
    """
    A class to handle parsing of HTML content with various checks for incorrect parsing states.

    Attributes:
        parser_name (str): The name of the parser.
        checks (list): A list of ParserPayload objects that define parsing checks.
    """

    def __init__(self, parser_name: str, attribute_prefix='data-') -> None:
        """
        Initializes the ParserBase with a parser name and registers the built-in checks.

        Args:
            parser_name (str): The name of the parser.
            attribute_prefix (str): The prefix for attributes (default is 'data-').
        """
        # NOTE(review): attribute_prefix is accepted but neither stored nor used
        # below; the payloads rely on the $attribute_prefix template variable
        # instead. Confirm whether it should be wired in.
        self.parser_name = parser_name
        self.checks = []

        # Some HTML parsers don't properly resolve raw text tags
        incorrect_parsing_state_tags = ['xmp', 'textarea', 'noscript', 'noembed', 'style', 'plaintext']
        for tag in incorrect_parsing_state_tags:
            self.add(
                ParserPayload(f'<{tag}><a href="https://github.com/Slonser/hui/</{tag}>"></a></{tag}>',
                              [tag, 'a'],
                              expected_output=f'<{tag}>&lt;a href="https://github.com/Slonser/hui/</{tag}>"&gt;')
            )

        # Some HTML parsers incorrectly handle the select tag:
        # the expected output drops the nested <h1>, these parsers keep it.
        self.add(
            ParserPayload('<select><h1></h1></select>',
                          ['select', 'h1'],
                          expected_output="<select></select>")
        )

        # Some HTML parsers don't use flattening with header tags
        for i in range(1, 6):
            self.add(
                ParserPayload(f'<h{i}><h{i+1}>$text</h{i+1}></h{i}>',
                              [f'h{i}', f'h{i+1}'],
                              expected_output=f"<h{i}></h{i}><h{i+1}>$text</h{i+1}>")
            )

        # Some HTML parsers don't resolve nested forms
        self.add(
            ParserPayload('<form><form>$text</form></form>',
                          ['form'],
                          expected_output="<form>$text</form>")
        )

        # Some HTML parsers don't resolve nested tables
        self.add(
            ParserPayload('<table><table>$text</table></table>',
                          ['table'],
                          expected_output="<table></table>$text<table></table>")
        )

        # Some parsers don't resolve nested table elements
        table_nested_tags = ['caption', 'td', 'tr', 'col']
        for tag in table_nested_tags:
            self.add(
                ParserPayload(f'<table><{tag}><{tag}>$text</{tag}></{tag}></{tag}>',
                              ['table', tag],
                              expected_output=f"<table><{tag}></{tag}><{tag}>$text</{tag}></table>")
            )

        # Some HTML parsers don't implement "in row" insertion mode correctly
        row_insertion_mode = ['th', 'td', 'tfoot', 'thead', 'tbody', 'tr']
        for tag in row_insertion_mode:
            self.add(
                ParserPayload(f'<{tag}>$text</{tag}>',
                              [tag],
                              expected_output='$text')
            )

        # Python HTML parsers incorrectly lowercase HTML attribute names:
        # browsers only lowercase ASCII characters, while Python's str.lower()
        # maps \u212a (KELVIN SIGN) to 0x6b ('k').
        self.add(
            ParserPayload('<a $attribute_prefix-\u212a="1" href="$href">$text</a>',
                          ['a'],
                          expected_output='<a href="$href" data-K="1">$text</a>')
        )

        # Some parsers incorrectly parse self-closing (void) tags
        self_closing_tags = ['wbr', 'hr']
        for tag in self_closing_tags:
            self.add(
                ParserPayload(f'<{tag}>$text</{tag}>',
                              [tag],
                              expected_output=f'<{tag}>$text')
            )

        # Some parsers incorrectly handle <tag></tag></tag>
        # TODO: should check with other tags, because h1 may be banned
        # NOTE(review): the expected output hard-codes "123" where the payload
        # uses $text - confirm this is intentional.
        self.add(
            ParserPayload('<h1>$text</h1></h1>',
                          ['h1'],
                          expected_output='<h1>123</h1></h1>')
        )

        # Some sanitizers do not close tags
        self.add(
            ParserPayload('<h1>',
                          ['h1'],
                          expected_output='<h1></h1>')
        )

    def add(self, payload):
        """
        Adds a ParserPayload to the checks list.

        Args:
            payload (ParserPayload): The payload to be added to the checks.
        """
        self.checks.append(payload)

    def add_all(self, arr):
        """
        Adds multiple ParserPayloads to the checks list.

        Args:
            arr (list): A list of ParserPayload objects to be added.
        """
        for x in arr:
            self.add(x)

    def generate_payloads(self):
        """
        Generates payloads and saves them to a JSON file if it does not already exist.

        The file is ``./generated_payloads.json`` relative to the current
        working directory and contains the ``payload`` string of every
        registered check, in registration order. An existing file is left
        untouched.

        Returns:
            None
        """
        if os.path.exists("./generated_payloads.json"):
            return

        payloads = [check.payload for check in self.checks]

        # The context manager guarantees the file is flushed and closed;
        # json.dump escapes non-ASCII characters by default, so the content
        # itself is plain ASCII.
        with open('./generated_payloads.json', "w", encoding="utf-8") as out_file:
            json.dump(payloads, out_file)

    def get_results(self):
        """
        Placeholder method for getting results.
        """
        pass
20 | 21 | ## Installation 22 | 23 | To install the package, use pip: 24 | 25 | ``` 26 | pip install hui 27 | ``` 28 | 29 | ## Usage 30 | 31 | Here is a basic example of how to use the `Identifier` class from the package: 32 | 33 | ```python 34 | from hui.identify import Identifier 35 | import requests 36 | 37 | def handler(payload): 38 | return requests.get("http://localhost:3005/sanitize",params={"html":payload}).text 39 | 40 | a = Identifier(handler=handler, buffer_enabled=False, buffer_limit=64, debug_mode=False) 41 | print(a.identify()) 42 | # run all 43 | # Example output 44 | # [[1.0, 27, 'JS_SANITIZE_HTML'], [0.8148148148148148, 22, 'PYTHON_HTML_SANITIZE'], ... 45 | 46 | print(a.check_attr_allowed("href",tag="a")) 47 | # True or False 48 | print(a.INCORRECT_PARSED) 49 | # Example output 50 | # [{'output': '<h5><h6>govnoed</h6></h5>', 'expected': '<h5></h5><h6>$text</h6>'}, .. ] 51 | print(a.ALLOWED_TAGS) 52 | # print allowed tags 53 | print(a.ATTRIBUTES) 54 | # Prints ATTRIBUTES info 55 | print(a.DEPTH_LIMITS) 56 | # Example Outputs: 57 | # (514, 'No max tags limit') 58 | # (512, 'Flattening') 59 | # (255, 'Removing') 60 | ``` 61 | 62 | ## Identifier Class 63 | 64 | The `Identifier` class is the core of this package. It is responsible for identifying allowed HTML, SVG, and MathML tags based on a handler function that processes HTML payloads. 65 | 66 | The class also maintains an `INCORRECT_PARSED` list, which contains payloads that were incorrectly parsed by the handler. For example, this may include cases where the parser fails to remove nested forms and similar issues. 
67 | 68 | ## Current Parsers 69 | 70 | The following parsers are currently supported in the project: 71 | 72 | - **DOMpurify with JSDOM (JS)** 73 | - **JSDOM (JS)** 74 | - **sanitize_html (JS)** 75 | - **htmlparser2 (JS)** 76 | - **JSXSS (JS)** 77 | - **html (python)** 78 | - **lxml (python)** 79 | - **html_sanitizer (python)** 80 | - **net/html (go)** 81 | - **bluemonday (go)** 82 | 83 | If you believe a new parser/sanitizer should be added, please create an issue, and I will be happy to include it. 84 | ### Constructor Parameters 85 | 86 | - **`handler`**: A function that takes a payload and returns an HTML response. Example: 87 | ```python 88 | lambda payload: requests.get(f"http://localhost:3000?payload={payload}").text 89 | ``` 90 | 91 | - **`buffer_enabled`** (optional, default=False): A boolean flag to enable or disable buffering of payloads before sending them to the handler. By default, buffering is disabled, as it can sometimes lead to incorrect results. For example, some sanitizers may simply remove all input if it contains a dangerous tag. Use buffering only if the server you are interacting with has strict rate limits. 92 | 93 | - **`buffer_delimeter`** (optional, default=`<div>TEXTTEXT</div>`): A string used to delimit buffered payloads when sending them to the handler. 94 | 95 | - **`buffer_limit`** (optional, default=32): An integer that specifies the maximum number of payloads to buffer before sending them to the handler. 96 | 97 | - **`template_vars`** (optional, default=None): A dictionary of template variables to use for substitution in payloads. 98 | 99 | - **`debug_mode`** (optional, default=False): A boolean flag to enable or disable debug logging. 100 | 101 | ### Methods 102 | 103 | - **`check_allowed_tags()`**: Checks and populates the `ALLOWED_TAGS` dictionary with allowed tags for HTML, SVG, and MathML. 
104 | - **`call_handler(template_payloads: list[str])`**: Calls the handler function with a list of template payloads and returns the processed results. 105 | - **`check_namespace(namespace: str)`**: Checks for allowed tags in the specified namespace (SVG or MathML). 106 | - **`identify()`**: Identifies the best matching Parser based on generated payloads and returns a list of matches. 107 | - **`check_allowed_attrs()`**: Checks and validates allowed attributes for HTML tags. 108 | 109 | ### identify() Method 110 | 111 | The `identify()` method checks if allowed tags have been determined. If not, it calls `check_allowed_tags()` to populate the `ALLOWED_TAGS`. It then loads a list of generated payloads from a JSON file and calls the handler for each payload. Finally, it compares the results against all JSON files in the `results_parsers` directory to count matches and returns a sorted list of results. 112 | 113 | - **Returns**: A list of tuples, each containing: 114 | - The match ratio (float) 115 | - The number of matches (int) 116 | - The name of the Parser (str) 117 | 118 | ### Attributes 119 | 120 | - **`ATTRIBUTES`**: A dictionary that holds information about allowed attributes for HTML tags, including: 121 | - `custom_attribute`: Indicates if custom attributes are allowed. 122 | - `event_attributes_blocked`: Indicates if event attributes are directly blocked. 123 | - `data_attributes`: Indicates if data attributes are allowed. 124 | - `attrs_allowed`: A nested dictionary categorizing allowed attributes into global, event and specific tags attributes. 125 | 126 | ### Allowed Tags 127 | 128 | - **`ALLOWED_TAGS`**: A dictionary that holds information about allowed tags for HTML, SVG, and MathML, including: 129 | - `html`: A list of allowed HTML tags. 130 | - `svg`: A list of allowed SVG tags. 131 | - `math`: A list of allowed MathML tags. 
132 | 133 | ### Incorrectly Parsed Tags 134 | 135 | - **`INCORRECT_PARSED`**: A list of dictionaries, one for each payload whose tags are all allowed but whose handler output differs from the expected result. Each entry contains: 136 | - `output`: The output actually returned by the handler for the payload. 137 | - `expected`: The output that was expected for the payload. 138 | - Entries are added by `identify()`; the list is empty until that method has been called. 139 | 140 | ### DEPTH_LIMITS 141 | **DEPTH_LIMITS**: A tuple that holds information about the depth limits of HTML tags, including: 142 | - `max_depth`: The maximum depth of HTML tags. 143 | - `limit_strategy`: The strategy used to handle tags exceeding the depth limit, which can be 'No max tags limit', 'Flattening', or 'Removing'. 144 | -------------------------------------------------------------------------------- /hui.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: hui 3 | Version: 0.2.2 4 | Home-page: https://github.com/slonser/hui 5 | Download-URL: https://github.com/Slonser/hui/archive/v_01.tar.gz 6 | Author: Slonser 7 | Author-email: slonser@slonser.info 8 | License: MIT 9 | Keywords: HTML,hui,HTML GUESSER,HTML identifier,XSS,bugbounty 10 | Classifier: Development Status :: 3 - Alpha 11 | Classifier: Intended Audience :: Developers 12 | Classifier: Topic :: Software Development :: Build Tools 13 | Classifier: License :: OSI Approved :: MIT License 14 | Classifier: Programming Language :: Python :: 3 15 | Description-Content-Type: text/markdown 16 | License-File: LICENSE.txt 17 | 18 | # Html Universal Identifier 19 | 20 | Html Universal Identifier is an alpha version of an application designed for identifying server-side HTML parsers. This package provides a way to determine which HTML, SVG, and MathML tags are allowed, helps to find parser features (incorrectly implemented tags), and can also help to guess which parser is used on the backend. 
21 | 22 | This library primarily relies on the fact that many parsers handle HTML incorrectly. Here are some classic examples of what a spec-compliant parser does: 23 | - `<form><form>text</form></form>` should be transformed to `<form>text</form>` 24 | - `<h1><h2>text</h2></h1>` should be transformed to `<h1></h1><h2>text</h2>` 25 | 26 | There are several reasons why you don't want to rely entirely on allowed tags: 27 | - It won't help you determine which parser your custom sanitization is based on 28 | - Allowed tags can be changed 29 | 30 | ## Features 31 | 32 | - Identify allowed HTML, SVG, and MathML tags. 33 | - Identify allowed attributes. 34 | - Identify incorrect parsing. 35 | - Use a customizable handler function to process HTML payloads. 36 | - Load and compare results against predefined Parser outputs. 37 | 38 | ## Installation 39 | 40 | To install the package, use pip: 41 | 42 | ``` 43 | pip install hui 44 | ``` 45 | 46 | ## Usage 47 | 48 | Here is a basic example of how to use the `Identifier` class from the package: 49 | 50 | ```python 51 | from hui.identify import Identifier 52 | import requests 53 | 54 | def handler(payload): 55 | return requests.get("http://localhost:3005/sanitize",params={"html":payload}).text 56 | 57 | a = Identifier(handler=handler, buffer_enabled=False, buffer_limit=64, debug_mode=False) 58 | print(a.identify()) 59 | # run all 60 | # Example output 61 | # [[1.0, 27, 'JS_SANITIZE_HTML'], [0.8148148148148148, 22, 'PYTHON_HTML_SANITIZE'], ... 62 | 63 | print(a.check_attr_allowed("href",tag="a")) 64 | # True or False 65 | print(a.INCORRECT_PARSED) 66 | # Example output 67 | # [{'output': '<h5><h6>govnoed</h6></h5>', 'expected': '<h5></h5><h6>$text</h6>'}, .. 
] 68 | print(a.ALLOWED_TAGS) 69 | # print allowed tags 70 | print(a.ATTRIBUTES) 71 | # Prints ATTRIBUTES info 72 | print(a.DEPTH_LIMITS) 73 | # Example Outputs: 74 | # (514, 'No max tags limit') 75 | # (512, 'Flattening') 76 | # (255, 'Removing') 77 | ``` 78 | 79 | ## Identifier Class 80 | 81 | The `Identifier` class is the core of this package. It is responsible for identifying allowed HTML, SVG, and MathML tags based on a handler function that processes HTML payloads. 82 | 83 | The class also maintains an `INCORRECT_PARSED` list, which contains payloads that were incorrectly parsed by the handler. For example, this may include cases where the parser fails to remove nested forms and similar issues. 84 | 85 | ## Current Parsers 86 | 87 | The following parsers are currently supported in the project: 88 | 89 | - **DOMpurify with JSDOM (JS)** 90 | - **JSDOM (JS)** 91 | - **sanitize_html (JS)** 92 | - **htmlparser2 (JS)** 93 | - **JSXSS (JS)** 94 | - **html (python)** 95 | - **lxml (python)** 96 | - **html_sanitizer (python)** 97 | - **net/html (go)** 98 | - **bluemonday (go)** 99 | 100 | If you believe a new parser/sanitizer should be added, please create an issue, and I will be happy to include it. 101 | ### Constructor Parameters 102 | 103 | - **`handler`**: A function that takes a payload and returns an HTML response. Example: 104 | ```python 105 | lambda payload: requests.get(f"http://localhost:3000?payload={payload}").text 106 | ``` 107 | 108 | - **`buffer_enabled`** (optional, default=False): A boolean flag to enable or disable buffering of payloads before sending them to the handler. By default, buffering is disabled, as it can sometimes lead to incorrect results. For example, some sanitizers may simply remove all input if it contains a dangerous tag. Use buffering only if the server you are interacting with has strict rate limits. 
109 | 110 | - **`buffer_delimeter`** (optional, default=`<div>TEXTTEXT</div>`): A string used to delimit buffered payloads when sending them to the handler. 111 | 112 | - **`buffer_limit`** (optional, default=32): An integer that specifies the maximum number of payloads to buffer before sending them to the handler. 113 | 114 | - **`template_vars`** (optional, default=None): A dictionary of template variables to use for substitution in payloads. 115 | 116 | - **`debug_mode`** (optional, default=False): A boolean flag to enable or disable debug logging. 117 | 118 | ### Methods 119 | 120 | - **`check_allowed_tags()`**: Checks and populates the `ALLOWED_TAGS` dictionary with allowed tags for HTML, SVG, and MathML. 121 | - **`call_handler(template_payloads: list[str])`**: Calls the handler function with a list of template payloads and returns the processed results. 122 | - **`check_namespace(namespace: str)`**: Checks for allowed tags in the specified namespace (SVG or MathML). 123 | - **`identify()`**: Identifies the best matching Parser based on generated payloads and returns a list of matches. 124 | - **`check_allowed_attrs()`**: Checks and validates allowed attributes for HTML tags. 125 | 126 | ### identify() Method 127 | 128 | The `identify()` method checks if allowed tags have been determined. If not, it calls `check_allowed_tags()` to populate the `ALLOWED_TAGS`. It then loads a list of generated payloads from a JSON file and calls the handler for each payload. Finally, it compares the results against all JSON files in the `results_parsers` directory to count matches and returns a sorted list of results. 
129 | 130 | - **Returns**: A list of tuples, each containing: 131 | - The match ratio (float) 132 | - The number of matches (int) 133 | - The name of the Parser (str) 134 | 135 | ### Attributes 136 | 137 | - **`ATTRIBUTES`**: A dictionary that holds information about allowed attributes for HTML tags, including: 138 | - `custom_attribute`: Indicates if custom attributes are allowed. 139 | - `event_attributes_blocked`: Indicates if event attributes are directly blocked. 140 | - `data_attributes`: Indicates if data attributes are allowed. 141 | - `attrs_allowed`: A nested dictionary categorizing allowed attributes into global, event and specific tags attributes. 142 | 143 | ### Allowed Tags 144 | 145 | - **`ALLOWED_TAGS`**: A dictionary that holds information about allowed tags for HTML, SVG, and MathML, including: 146 | - `html`: A list of allowed HTML tags. 147 | - `svg`: A list of allowed SVG tags. 148 | - `math`: A list of allowed MathML tags. 149 | 150 | ### Incorrectly Parsed Tags 151 | 152 | - **`INCORRECT_PARSED`**: A dictionary that holds information about incorrectly parsed tags for HTML, SVG, and MathML, including: 153 | - `html`: A list of incorrectly parsed HTML tags. 154 | - `svg`: A list of incorrectly parsed SVG tags. 155 | - `math`: A list of incorrectly parsed MathML tags. 156 | 157 | ### DEPTH_LIMITS 158 | **DEPTH_LIMITS**: A tuple that holds information about the depth limits of HTML tags, including: 159 | - `max_depth`: The maximum depth of HTML tags. 160 | - `limit_strategy`: The strategy used to handle tags exceeding the depth limit, which can be 'No max tags limit', 'Flattening', or 'Removing'. 
161 | -------------------------------------------------------------------------------- /hui/ALLOWED_ATTRS.py: -------------------------------------------------------------------------------- 1 | GLOBAL_ATTRS = [ 2 | "accesskey", 3 | "anchor", 4 | "autocapitalize", 5 | "autocorrect", 6 | "autofocus", 7 | "class", 8 | "contenteditable", 9 | "data-*", 10 | "dir", 11 | "draggable", 12 | "enterkeyhint", 13 | "exportparts", 14 | "hidden", 15 | "id", 16 | "inert", 17 | "inputmode", 18 | "is", 19 | "itemid", 20 | "itemprop", 21 | "itemref", 22 | "itemscope", 23 | "itemtype", 24 | "lang", 25 | "nonce", 26 | "part", 27 | "popover", 28 | "slot", 29 | "spellcheck", 30 | "style", 31 | "tabindex", 32 | "title", 33 | "translate", 34 | "virtualkeyboardpolicy", 35 | "writingsuggestions" 36 | ] 37 | 38 | EVENT_ATTRS = [ 39 | "onafterprint", 40 | "onafterscriptexecute", 41 | "onanimationcancel", 42 | "onanimationend", 43 | "onanimationiteration", 44 | "onanimationstart", 45 | "onauxclick", 46 | "onbeforecopy", 47 | "onbeforecut", 48 | "onbeforeinput", 49 | "onbeforeprint", 50 | "onbeforescriptexecute", 51 | "onbeforetoggle", 52 | "onbeforeunload", 53 | "onbegin", 54 | "onblur", 55 | "oncancel", 56 | "oncanplay", 57 | "oncanplaythrough", 58 | "onchange", 59 | "onclick", 60 | "onclose", 61 | "oncontentvisibilityautostatechange", 62 | "oncontextmenu", 63 | "oncopy", 64 | "oncuechange", 65 | "oncut", 66 | "ondblclick", 67 | "ondrag", 68 | "ondragend", 69 | "ondragenter", 70 | "ondragexit", 71 | "ondragleave", 72 | "ondragover", 73 | "ondragstart", 74 | "ondrop", 75 | "ondurationchange", 76 | "onend", 77 | "onended", 78 | "onerror", 79 | "onfocus", 80 | "onfocus(autofocus)", 81 | "onfocusin", 82 | "onfocusout", 83 | "onformdata", 84 | "onfullscreenchange", 85 | "onhashchange", 86 | "oninput", 87 | "oninvalid", 88 | "onkeydown", 89 | "onkeypress", 90 | "onkeyup", 91 | "onload", 92 | "onloadeddata", 93 | "onloadedmetadata", 94 | "onloadstart", 95 | "onmessage", 96 | "onmousedown", 97 | 
"onmouseenter", 98 | "onmouseleave", 99 | "onmousemove", 100 | "onmouseout", 101 | "onmouseover", 102 | "onmouseup", 103 | "onmousewheel", 104 | "onmozfullscreenchange", 105 | "onpagehide", 106 | "onpageshow", 107 | "onpaste", 108 | "onpause", 109 | "onplay", 110 | "onplaying", 111 | "onpointercancel", 112 | "onpointerdown", 113 | "onpointerenter", 114 | "onpointerleave", 115 | "onpointermove", 116 | "onpointerout", 117 | "onpointerover", 118 | "onpointerrawupdate", 119 | "onpointerup", 120 | "onpopstate", 121 | "onprogress", 122 | "onratechange", 123 | "onrepeat", 124 | "onreset", 125 | "onresize", 126 | "onscroll", 127 | "onscrollend", 128 | "onscrollsnapchange", 129 | "onsearch", 130 | "onseeked", 131 | "onseeking", 132 | "onselect", 133 | "onselectionchange", 134 | "onselectstart", 135 | "onshow", 136 | "onsubmit", 137 | "onsuspend", 138 | "ontimeupdate", 139 | "ontoggle", 140 | "ontoggle(popover)", 141 | "ontouchend", 142 | "ontouchmove", 143 | "ontouchstart", 144 | "ontransitioncancel", 145 | "ontransitionend", 146 | "ontransitionrun", 147 | "ontransitionstart", 148 | "onunhandledrejection", 149 | "onunload", 150 | "onvolumechange", 151 | "onwaiting", 152 | "onwebkitanimationend", 153 | "onwebkitanimationiteration", 154 | "onwebkitanimationstart", 155 | "onwebkitfullscreenchange", 156 | "onwebkitmouseforcechanged", 157 | "onwebkitmouseforcedown", 158 | "onwebkitmouseforceup", 159 | "onwebkitmouseforcewillbegin", 160 | "onwebkitplaybacktargetavailabilitychanged", 161 | "onwebkitpresentationmodechanged", 162 | "onwebkittransitionend", 163 | "onwebkitwillrevealbottom", 164 | "onwheel" 165 | ] 166 | 167 | DEFAULT_ATTRS = { 168 | "form": [ 169 | "accept", "accept-charset", "action", "autocomplete", "enctype", 170 | "method", "name", "novalidate", "target" 171 | ], 172 | "input": [ 173 | "accept", "alt", "autocomplete", "capture", "checked", "dirname", 174 | "disabled", "form", "formaction", "formenctype", "formmethod", 175 | "formnovalidate", "formtarget", "list", 
"max", "maxlength", 176 | "minlength", "min", "multiple", "name", "pattern", "placeholder", 177 | "readonly", "required", "size", "src", "step", "type", "usemap", 178 | "value", "width" 179 | ], 180 | "col": ["span"], 181 | "colgroup": ["span"], 182 | "iframe": [ 183 | "allow", "csp", "name", "referrerpolicy", "sandbox", "src", 184 | "srcdoc", "width" 185 | ], 186 | "img": [ 187 | "alt", "crossorigin", "decoding", "intrinsicsize", "ismap", 188 | "referrerpolicy", "sizes", "src", "srcset", "usemap", "width" 189 | ], 190 | "table": ["summary"], 191 | "td": ["colspan", "headers", "rowspan"], 192 | "th": ["colspan", "headers", "rowspan", "scope"], 193 | "area": [ 194 | "alt", "coords", "download", "href", "media", "ping", 195 | "referrerpolicy", "rel", "shape", "target" 196 | ], 197 | "link": [ 198 | "as", "crossorigin", "href", "hreflang", "integrity", 199 | "media", "referrerpolicy", "rel", "sizes", "type" 200 | ], 201 | "script": [ 202 | "async", "crossorigin", "defer", "integrity", "language", 203 | "referrerpolicy", "src", "type" 204 | ], 205 | "select": [ 206 | "autocomplete", "disabled", "form", "multiple", "name", 207 | "required", "size" 208 | ], 209 | "textarea": [ 210 | "autocomplete", "cols", "dirname", "disabled", "enterkeyhint", 211 | "form", "inputmode", "maxlength", "minlength", "name", 212 | "placeholder", "readonly", "required", "rows", "wrap" 213 | ], 214 | "audio": [ 215 | "autoplay", "controls", "crossorigin", "loop", "muted", 216 | "preload", "src" 217 | ], 218 | "video": [ 219 | "autoplay", "controls", "crossorigin", "loop", "muted", 220 | "playsinline", "poster", "preload", "src", "width" 221 | ], 222 | "marquee": ["loop"], 223 | "object": [ 224 | "data", "form", "name", "type", "usemap", "width" 225 | ], 226 | "meta": ["charset", "content", "http-equiv", "name"], 227 | "blockquote": ["cite"], 228 | "del": ["cite", "datetime"], 229 | "ins": ["cite", "datetime"], 230 | "q": ["cite"], 231 | "time": ["datetime"], 232 | "track": ["default", "kind", 
"label", "src", "srclang"], 233 | "button": [ 234 | "disabled", "form", "formaction", "formenctype", "formmethod", 235 | "formnovalidate", "formtarget", "name", "type", "value" 236 | ], 237 | "fieldset": ["disabled", "form", "name"], 238 | "optgroup": ["disabled", "label"], 239 | "option": ["disabled", "label", "selected", "value"], 240 | "a": [ 241 | "download", "href", "hreflang", "media", "ping", 242 | "referrerpolicy", "rel", "shape", "target" 243 | ], 244 | "label": ["for", "form"], 245 | "output": ["for", "form", "name"], 246 | "meter": ["form", "high", "low", "max", "min", "optimum", "value"], 247 | "progress": ["form", "max", "value"], 248 | "canvas": ["width"], 249 | "embed": ["src", "type", "width"], 250 | "base": ["href", "target"], 251 | "source": ["media", "sizes", "src", "srcset", "type"], 252 | "style": ["media", "scoped", "type"], 253 | "map": ["name"], 254 | "param": ["name", "value"], 255 | "details": ["open"], 256 | "dialog": ["open"], 257 | "ol": ["reversed", "start", "type"], 258 | "menu": ["type"], 259 | "data": ["value"], 260 | "li": ["value"] 261 | } -------------------------------------------------------------------------------- /hui/identify.py: -------------------------------------------------------------------------------- 1 | from .ALLOWED_TAGS import * 2 | from .ALLOWED_ATTRS import * 3 | from string import Template 4 | import json 5 | import os 6 | import importlib.resources 7 | from importlib.resources import files 8 | from .parsers.simple_parser import SANITIZE_HTML 9 | import logging 10 | from .CustomParser import CustomParser 11 | 12 | class Identifier: 13 | def __init__(self, handler, buffer_enabled=False, buffer_delimeter="<div>TEXTTEXT</div>", buffer_limit=32, template_vars=None, debug_mode=False) -> None: 14 | """ 15 | Initializes the Identifier class with a handler function and optional parameters for buffer management, template variables, and logging. 
16 | 17 | :param handler: handler function that must return text with an HTML response. 18 | Example of a handler function: 19 | lambda payload: requests.get(f"http://localhost:3000?payload={payload}").text 20 | :param buffer_enabled: Boolean indicating whether to enable buffering of payloads before sending to the handler. 21 | :param buffer_delimeter: String used to delimit payloads in the buffer. 22 | :param buffer_limit: Integer specifying the maximum number of payloads to buffer before sending to the handler. 23 | :param template_vars: Optional dictionary of template variables to use for substitution in payloads. 24 | :param debug_mode: Boolean indicating whether to enable debug logging. 25 | :return: returns nothing 26 | """ 27 | self.handler = handler 28 | self.ALLOWED_TAGS = { 29 | "html": [], 30 | "svg": [], 31 | "math": [], 32 | } 33 | self.TEMPLATE_VARS = template_vars if template_vars is not None else { 34 | 'text': 'govnoed', 35 | 'href': 'https://github.com', 36 | 'attribute_prefix': 'data' 37 | } 38 | 39 | self.ALLOWED_TAGS_CHECKED = False 40 | self.DEFAULT_SANITIZER = SANITIZE_HTML() 41 | 42 | self.BUFFER = "" 43 | self.BUFFER_LIMIT = buffer_limit 44 | self.BUFFER_ENABLED = buffer_enabled 45 | self.BUFFER_DELIMETER = buffer_delimeter 46 | 47 | self.INCORRECT_PARSED = [] 48 | 49 | self.DEPTH_LIMITS = () 50 | 51 | self.ATTRIBUTES = { 52 | "custom_attribute" : None, # is custom attributes allowed 53 | "event_attributes_blocked": None, # is event attributes directly blocked 54 | "data_attributes": None, # is data attributes allowed 55 | "attrs_allowed":{ 56 | "global":[], # global attributes 57 | "events":[] # events attributes 58 | } 59 | } 60 | 61 | # Configure logging based on debug_mode 62 | if debug_mode: 63 | logging.basicConfig(level=logging.DEBUG) 64 | else: 65 | logging.basicConfig(level=logging.INFO) 66 | 67 | self.logger = logging.getLogger(__name__) 68 | self.parser = CustomParser() 69 | 70 | def check_allowed_tags(self) -> dict: 71 | """ 72 
| Check and validate allowed HTML, SVG, and MathML tags. 73 | 74 | :return: A dictionary of allowed tags. 75 | """ 76 | self.logger.debug("Checking allowed tags...") 77 | self.ALLOWED_TAGS_CHECKED = True 78 | self.check_html_namespace() 79 | self.check_namespace("math") 80 | self.check_namespace("svg") 81 | 82 | self.logger.debug("Allowed tags checked: %s", self.ALLOWED_TAGS) 83 | return self.ALLOWED_TAGS 84 | 85 | def call_handler(self, template_payloads: list[str]) -> list[str]: 86 | """ 87 | Call the handler function with the provided template payloads. 88 | 89 | :param template_payloads: List of template strings to process. 90 | :return: List of processed results from the handler. 91 | """ 92 | self.logger.debug("Calling handler with payloads: %s", template_payloads) 93 | for i in range(len(template_payloads)): 94 | template_payloads[i] = Template(template_payloads[i]).safe_substitute(self.TEMPLATE_VARS) 95 | 96 | if self.BUFFER_ENABLED: 97 | res = [] 98 | buffer = [] 99 | for payload in template_payloads: 100 | buffer.append(payload) 101 | if len(buffer) >= self.BUFFER_LIMIT: 102 | res.extend(self.handler(self.BUFFER_DELIMETER.join(buffer)).split(self.BUFFER_DELIMETER)) 103 | buffer = [] 104 | if buffer: 105 | res.extend(self.handler(self.BUFFER_DELIMETER.join(buffer)).split(self.BUFFER_DELIMETER)) 106 | self.logger.debug("Handler results: %s", res) 107 | return res 108 | 109 | res = [self.handler(payload) for payload in template_payloads] 110 | self.logger.debug("Handler results: %s", res) 111 | return res 112 | 113 | def check_html_namespace(self) -> None: 114 | """ 115 | Check and validate allowed HTML tags. 
116 | 117 | :return: None 118 | """ 119 | self.logger.debug("Checking HTML namespace...") 120 | arr = [] 121 | for tag in html_tags: 122 | arr.append([f'<{tag}>$text</{tag}>', tag]) 123 | 124 | for tag in html_table_tags: 125 | arr.append([f'<table><{tag}>$text</{tag}></table>', tag]) 126 | 127 | handler_results = self.call_handler([x[0] for x in arr]) 128 | for i in range(len(handler_results)): 129 | res = handler_results[i] 130 | if f'<{arr[i][1]}' in res: 131 | self.ALLOWED_TAGS["html"].append(arr[i][1]) 132 | 133 | self.logger.debug("Allowed HTML tags: %s", self.ALLOWED_TAGS["html"]) 134 | 135 | def check_namespace(self, namespace: str) -> None: 136 | """ 137 | Check and validate tags in the specified namespace (math or svg). 138 | 139 | :param namespace: The namespace to check (math or svg). 140 | :raises Exception: If the namespace is not supported. 141 | :return: None 142 | """ 143 | self.logger.debug("Checking namespace: %s", namespace) 144 | if namespace not in self.ALLOWED_TAGS: 145 | raise Exception(f'{namespace} namespace is not supported') 146 | 147 | tag_arr = [] 148 | namespace_tags = [] 149 | if namespace == "math": 150 | namespace_tags = mathml_tags 151 | elif namespace == "svg": 152 | namespace_tags = svg_tags 153 | 154 | for tag in namespace_tags: 155 | tag_arr.append([f'<{namespace}><{tag}>$text</{tag}></{namespace}>', tag]) 156 | 157 | handler_results = self.call_handler([x[0] for x in tag_arr]) 158 | 159 | for i in range(len(handler_results)): 160 | res = handler_results[i] 161 | if f'<{tag_arr[i][1]}' in res: 162 | self.ALLOWED_TAGS[namespace].append(tag_arr[i][1]) 163 | 164 | self.logger.debug("Allowed tags for namespace '%s': %s", namespace, self.ALLOWED_TAGS[namespace]) 165 | 166 | def check_tag_allowed(self, tag: str) -> bool: 167 | """ 168 | Check if a tag is allowed. 169 | 170 | :param tag: The tag to check. 171 | :return: True if the tag is allowed, False otherwise. 
172 | """ 173 | return any([(tag in self.ALLOWED_TAGS[namespace]) for namespace in self.ALLOWED_TAGS]) 174 | 175 | def identify(self) -> list[list[float | int | str]]: 176 | """ 177 | Identify and validate tags against expected outputs. 178 | 179 | :return: A sorted list of results with match ratios and file names. 180 | """ 181 | self.logger.debug("Identifying tags...") 182 | if len(self.ALLOWED_TAGS['html']) == 0: 183 | self.check_allowed_tags() 184 | self.check_allowed_attrs() 185 | self.check_depth() 186 | arr = self.DEFAULT_SANITIZER.checks 187 | res = self.call_handler([tag.payload for tag in arr]) 188 | for i in range(len(res)): 189 | all_tags_allowed = all([self.check_tag_allowed(tag) for tag in arr[i].tags]) 190 | if all_tags_allowed and not(arr[i].check(res[i],self.TEMPLATE_VARS)): 191 | self.logger.debug("Found incorrect parsing logic: %s, but %s is expected", res[i], arr[i].expected_output) 192 | self.INCORRECT_PARSED.append({"output": res[i].strip(), "expected": arr[i].expected_output}) 193 | 194 | 195 | json_files = [f for f in importlib.resources.files('hui.results_parsers').iterdir() if f.name.endswith('.json')] 196 | 197 | result = [] 198 | for json_file in json_files: 199 | with open(json_file) as f: 200 | data = json.load(f) 201 | 202 | # Count the number of matches in the JSON file 203 | matches = sum([1 for i in range(len(res)) if Template(data[i]).substitute(self.TEMPLATE_VARS).strip() in res[i].strip()]) 204 | result.append([matches / len(data), matches, json_file.name.split('.')[0]]) 205 | 206 | result = sorted(result, reverse=True) 207 | self.logger.debug("Identification results: %s", result) 208 | return result 209 | 210 | def check_namespace_supported(self, namespace: str) -> bool: 211 | """ 212 | Check if the specified namespace is supported. 213 | 214 | :param namespace: The namespace to check. 215 | :raises Exception: If the namespace is invalid or not supported. 216 | :return: True if the namespace is supported, False otherwise. 
217 | """ 218 | if not self.ALLOWED_TAGS_CHECKED: 219 | self.check_allowed_tags() 220 | if namespace not in self.ALLOWED_TAGS: 221 | raise Exception('Invalid namespace name') 222 | return len(self.ALLOWED_TAGS[namespace]) > 0 223 | 224 | def check_attr_allowed(self, attr: str, tag: str = None, attr_value: str = "https://github.com/Slonser/hui") -> bool: 225 | """ 226 | Checks if a given attribute is allowed for a specified tag. 227 | 228 | This method checks if a given attribute is allowed for a specified tag by simulating the parsing of HTML elements with the attribute and then checking if the attribute is present in the parsed attributes. 229 | 230 | :param attr: The attribute to check. 231 | :param tag: The tag to check the attribute for. Defaults to None, which means the first allowed HTML tag will be used. 232 | :param attr_value: The value to assign to the attribute for testing. Defaults to "https://github.com/Slonser/hui". 233 | :return: True if the attribute is allowed, False otherwise. 234 | """ 235 | if tag is None: 236 | assert self.check_namespace_supported("html"), "No tags allowed" 237 | tag = self.ALLOWED_TAGS['html'][0] 238 | 239 | # Simulate parsing of HTML elements with the attribute to check 240 | res = self.call_handler([f'<{tag} {attr}="{attr_value}"></{tag}>', 241 | f'<{tag}/{attr}="{attr_value}"></{tag}>']) # In some situations, the attribute might only be parsed with a / symbol 242 | self.parser.check(res[0]+res[1]) 243 | # Check if the attribute is present in the parsed attributes 244 | return attr in [attr_parsed[0] for attr_parsed in self.parser.found_attrs] 245 | 246 | def check_allowed_attrs(self): 247 | """ 248 | Check and validate allowed attributes for HTML tags. 249 | 250 | This method checks if global attributes, event attributes, and default attributes are allowed. 251 | It updates the ATTRIBUTES dictionary with the allowed attributes and logs the results. 
252 | 253 | :return: A dictionary containing the allowed attributes categorized by global, event, and specific tags. 254 | """ 255 | for attr in GLOBAL_ATTRS: 256 | is_allowed = self.check_attr_allowed(attr) 257 | if is_allowed: 258 | self.ATTRIBUTES["attrs_allowed"]["global"].append(attr) 259 | 260 | 261 | for attr in EVENT_ATTRS: 262 | is_allowed = self.check_attr_allowed(attr) 263 | if is_allowed: 264 | self.ATTRIBUTES["events"]["events"].append(attr) 265 | 266 | for tag in DEFAULT_ATTRS: 267 | self.ATTRIBUTES["attrs_allowed"][tag] = [] 268 | for attr in DEFAULT_ATTRS[tag]: 269 | is_allowed = self.check_attr_allowed(attr,tag=tag) 270 | if is_allowed: 271 | self.ATTRIBUTES["attrs_allowed"][tag].append(attr) 272 | 273 | self.ATTRIBUTES["data_attributes"] = self.check_attr_allowed("data-hui") 274 | if self.ATTRIBUTES["data_attributes"]: 275 | self.logger.debug("data attributes allowed") 276 | 277 | self.ATTRIBUTES["custom_attribute"] = self.check_attr_allowed("custom") 278 | 279 | if self.ATTRIBUTES["custom_attribute"]: 280 | self.logger.debug("Custom attributes allowed") 281 | 282 | self.ATTRIBUTES["event_attributes_blocked"] = not(self.check_attr_allowed("onhui")) 283 | 284 | if self.ATTRIBUTES["custom_attribute"] and self.ATTRIBUTES["event_attributes_blocked"]: 285 | self.logger.debug("Event attributes directly blocked") 286 | 287 | return self.ATTRIBUTES 288 | 289 | def check_depth(self): 290 | """ 291 | Check and validate the depth of HTML tags. 292 | 293 | This method checks if the depth of HTML tags exceeds the limit and updates the DEPTH_LIMITS accordingly. 
294 | 295 | :return: DEPTH_LIMITS 296 | """ 297 | assert self.check_namespace_supported("html"), "No tags allowed" 298 | tag = self.ALLOWED_TAGS['html'][0] 299 | res = self.call_handler([f'<div>'*514+f'</div>']) 300 | self.parser.check(res[0]) 301 | if self.parser.max_depth > 512: 302 | self.DEPTH_LIMITS = (self.parser.max_depth, 'No max tags limit') 303 | elif self.parser.start_tags > 512: 304 | self.DEPTH_LIMITS = (self.parser.max_depth, 'Flattening') 305 | else: 306 | self.DEPTH_LIMITS = (self.parser.max_depth, 'Removing') 307 | return self.DEPTH_LIMITS 308 | --------------------------------------------------------------------------------