├── tests
│   ├── __init__.py
│   ├── test_req_res.py
│   ├── test_common.py
│   └── test_main.py
├── requirements.txt
├── pathbuster
│   ├── __init__.py
│   ├── utils
│   │   └── common.py
│   ├── classes
│   │   ├── config.py
│   │   └── response.py
│   └── pathbuster.py
├── pyproject.toml
├── .github
│   └── workflows
│       └── publish-to-pypi.yml
├── LICENSE
├── .gitignore
└── README.md
/tests/__init__.py:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
requests
pytest
--------------------------------------------------------------------------------
/pathbuster/__init__.py:
--------------------------------------------------------------------------------
import sys
import os

# Put the package directory itself on sys.path so that the flat imports used
# inside the package (e.g. `from utils.common import ...`) keep resolving
# when the package is imported as `pathbuster`.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
--------------------------------------------------------------------------------
/tests/test_req_res.py:
--------------------------------------------------------------------------------
from pathbuster.pathbuster import Response


def test_init():
    bodyb = b'one two\nthree four'
    headers = {'Cookie': 'test=1234;'}
    parent_url = 'http://example.com'
    res = Response('http://example.com/admin', 200, 'OK', bodyb, headers, parent_url, 'meta1')
    assert len(res.headers) == 1
    assert res.headers['Cookie'] == 'test=1234;'
    assert res.strbody == 'one two\nthree four'
    assert res.bodylen == len(bodyb)
    assert res.scheme == 'http'
    assert res.parent_url == parent_url
    assert res.bodylines == 2
    assert res.bodywords == 3
--------------------------------------------------------------------------------
/pathbuster/utils/common.py:
--------------------------------------------------------------------------------
import string
import random
from hashlib import md5


def random_str(length=30):
    """Generate a random alphanumeric string of the given length."""
    letters = string.ascii_letters + string.digits
    return ''.join(random.choice(letters) for _ in range(length))


def count_lines(text: str):
    """Number of lines in text; an empty string has zero lines."""
    if len(text) > 0:
        return text.count("\n") + 1
    else:
        return 0


def count_words(text: str):
    """Number of space-separated words; newlines are not treated as
    separators (see the TODOs in tests/test_common.py)."""
    if len(text) > 0:
        return text.count(" ") + 1
    else:
        return 0


def md5str(s):
    return md5(s.encode()).hexdigest()
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "pathbuster"
version = "0.3.0"
authors = [
  { name="Vladimir Sopernikov" },
]
description = "PathBuster - multiple hosts Web path scanner"
readme = "README.md"
requires-python = ">=3.7"
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]
dependencies = [
    "requests"
]

[project.urls]
"Homepage" = "https://github.com/rivalsec/pathbuster"
"Bug Tracker" = "https://github.com/rivalsec/pathbuster/issues"

[project.scripts]
pathbuster = "pathbuster.pathbuster:main"
--------------------------------------------------------------------------------
/tests/test_common.py:
--------------------------------------------------------------------------------
from pathbuster.utils.common import count_lines, count_words


def test_count_lines():
    assert count_lines("") == 0  # empty string has zero lines
    assert count_lines("Hello\nWorld") == 2  # two lines separated by a newline
    assert count_lines("Hello\n\nWorld") == 3  # three lines, one of them empty
    assert count_lines("Hello World") == 1  # single line without newlines


def test_count_words():
    assert count_words("") == 0  # empty string has zero words
    assert count_words("Hello World") == 2  # two words separated by a space
    assert count_words("Hello\nWorld") == 1  # TODO: fix? two words separated by a newline
    assert count_words("Hello\n\nWorld") == 1  # TODO: fix? two words with an empty line in between
--------------------------------------------------------------------------------
/pathbuster/classes/config.py:
--------------------------------------------------------------------------------
class Config:
    """Global settings shared across worker threads."""
    __slots__ = [
        "proxies", "timeout", "headers", "max_errors", "http_method",
        "max_response_size", "store_response", "filter_regex",
        "json_print", "follow_redirects", "max_redirects", "exclude_codes",
        "extensions", "stats", "res_dir", "stats_interval"
    ]

    def __init__(self):
        self.proxies = None
        self.timeout = 30
        self.headers = dict()
        self.max_errors = 5
        self.http_method = 'GET'
        self.max_response_size = 250000
        self.store_response = False
        self.filter_regex = None
        self.json_print = False
        self.follow_redirects = False
        self.max_redirects = 3
        self.exclude_codes = []
        self.extensions = ['']
        self.stats = None
        self.res_dir = None
        self.stats_interval = 60
--------------------------------------------------------------------------------
/.github/workflows/publish-to-pypi.yml:
--------------------------------------------------------------------------------
name: Publish to PyPI and TestPyPI
on: push
jobs:
  build-n-publish:
    name: Build and publish to PyPI and TestPyPI
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v3
        with:
          python-version: '3.x'
      - name: Install pypa/build
        run: |
          python -m pip install --upgrade pip
          pip install build
      - name: Build package
        run: python -m build
      # - name: Publish distribution to Test PyPI
      #   uses: pypa/gh-action-pypi-publish@release/v1
      #   with:
      #     password: ${{ secrets.TEST_PYPI_API_TOKEN }}
      #     repository-url: https://test.pypi.org/legacy/
      #     skip-existing: true
      - name: Publish distribution to PyPI (only on tags)
        if: startsWith(github.ref, 'refs/tags')
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Vladimir Sopernikov

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/pathbuster/classes/response.py:
--------------------------------------------------------------------------------
import json
from utils.common import count_lines, count_words, md5str
import urllib.parse


class Response:
    __slots__ = [
        "base_url", "url", "status", "reason", "headers",
        "parent_url", "bodylen", "strbody",
        "bodywords", "bodylines", "meta", "location",
        "scheme", "host", "path_hash", "body"
    ]

    def __init__(self, url, status, reason, body, headers, parent_url, meta=None):
        self.base_url = None
        self.url = url
        self.status = status
        self.reason = reason
        self.headers = headers
        self.parent_url = parent_url
        # prefer the declared Content-Length: the body may have been truncated
        if "Content-Length" in headers:
            self.bodylen = int(headers["Content-Length"])
        else:
            self.bodylen = len(body)
        self.strbody = body.decode('utf-8', errors='ignore')
        self.bodywords = count_words(self.strbody)
        self.bodylines = count_lines(self.strbody)
        self.meta = []
        if meta:
            self.meta.append(meta)
        if 'location' in headers:
            self.location = headers['location']
        else:
            self.location = None
        up = urllib.parse.urlparse(url)
        self.scheme = up[0]
        self.host = up[1]
        self.path_hash = md5str(up[2])
        self.body = body

    def add_meta(self, s):
        self.meta.append(s)

    def __str__(self):
        s = f"{self.url}\t{self.status}\tBytes:{self.bodylen}/Lines:{self.bodylines}/Words:{self.bodywords}"
        if self.location:
            s += f"\t-> {self.location}"
        if self.meta:
            meta = ', '.join(self.meta)
            s += f"\t{meta}"
        return s

    def is_similar(self, other: 'Response'):
        """Two responses count as the same page when status code,
        word count and line count all match."""
        return (self.status == other.status
                and self.bodywords == other.bodywords
                and self.bodylines == other.bodylines)

    def to_json(self, store_response=False):
        jkeys = ['url', 'status', 'reason', 'parent_url', 'meta', 'scheme', 'host']
        if store_response:
            jkeys.append('strbody')
        jres = {k: getattr(self, k) for k in jkeys}
        return json.dumps(jres)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/


test/
pathbuster-res/
.vscode/
.DS_Store
--------------------------------------------------------------------------------
/tests/test_main.py:
--------------------------------------------------------------------------------
import pathbuster.pathbuster as pathbuster
from pathbuster.classes.response import Response

required_args = [
    '-u', './test/testurls',
    '-p', './test/testwordlist',
]


def test_filter_regex():
    pathbuster.conf.filter_regex = 'Admin'
    pathbuster.conf.exclude_codes = [301]

    res1 = Response(url='http://example.com/test', status=200, reason='OK', body=b'Member etc', headers={}, parent_url=None)
    assert pathbuster.result_valid(res1) == False

    res2 = Response(url='http://example.com/test', status=404, reason='OK', body=b'bla \n Admin Panel ', headers={}, parent_url=None)
    assert pathbuster.result_valid(res2) == True

    # status code is excluded
    res3 = Response(url='http://example.com/test', status=301, reason='OK', body=b'bla \n Admin Panel ', headers={}, parent_url=None)
    assert pathbuster.result_valid(res3) == False


def test_not_ac():
    # only the status code filter is active
    args = required_args.copy()
    args.extend(['-e', '401,404,400'])
    pathbuster.parse_args(args)

    res1 = Response(url='http://example.com/test', status=200, reason='OK', body=b'', headers={}, parent_url=None)
    assert pathbuster.result_valid(res1) == True

    res2 = Response(url='http://example.com/test', status=400, reason='NOT OK', body=b'', headers={}, parent_url=None)
    assert pathbuster.result_valid(res2) == False

    res3 = Response(url='http://example.com/test', status=401, reason='NOT OK', body=b'', headers={}, parent_url=None)
    assert pathbuster.result_valid(res3) == False


def test_empty_e():
    pathbuster.conf.exclude_codes = []

    res1 = Response(url='http://example.com/test', status=200, reason='OK', body=b'', headers={}, parent_url=None)
    assert pathbuster.result_valid(res1) == True

    res2 = Response(url='http://example.com/test', status=400, reason='NOT OK', body=b'', headers={}, parent_url=None)
    assert pathbuster.result_valid(res2) == True


def test_ac():
    args = required_args.copy()
    args.extend(['-ac'])
    pathbuster.parse_args(args)
    pf_res1 = Response(url='http://example.com/test', status=200, reason='OK', body=b'1 2 3\n4 5', headers={}, parent_url='http://example.com')
    pf_res2 = Response(url='http://example.com/redirect', status=301, reason='OK', body=b'1 2 3\n4 5\n6 7', headers={}, parent_url='http://example.com')
    pathbuster.preflight_samples = {
        'http://example.com': [pf_res1, pf_res2],
    }
    # responses matching a preflight sample are filtered out
    assert pathbuster.result_valid(pf_res1) == False
    assert pathbuster.result_valid(pf_res2) == False
    # responses differing from all samples pass
    res = Response(url='http://example.com/test2', status=200, reason='OK', body=b'1 2 3\n4 5 6', headers={}, parent_url='http://example.com')
    assert pathbuster.result_valid(res) == True
    res2 = Response(url='http://example.com/test22', status=301, reason='OK', body=b'1 2 3\n4 5 6', headers={}, parent_url='http://example.com')
    assert pathbuster.result_valid(res2) == True
    res3 = Response(url='http://example2.com/test222', status=200, reason='OK', body=b'1 2 3\n4 5 6', headers={}, parent_url='http://example2.com')
    assert pathbuster.result_valid(res3) == True
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PathBuster - multiple hosts Web path scanner

This scanner is designed to check paths on multiple hosts at the same time:
one path is taken and checked across all hosts in parallel, then the next path is taken, and so on (see the sketch after the list below).

This gives us the following benefits:
- no heavy load on any single host (as opposed to checking many paths in several threads against one host);
- a lower chance of the scanner being banned by a WAF;
- time saved, since there is no need to run the scanner for each host separately;
- a large number of results at once.
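
A minimal sketch of this scan order (a simplified view of `work_prod` in `pathbuster/pathbuster.py`; hostnames are hypothetical):

```python
urls = ["http://a.example", "http://b.example"]
paths = ["admin", "backup"]

# path-major order: every host sees a given path before the next path starts
for path in paths:
    for url in urls:
        print(f"{url.rstrip('/')}/{path}")
# http://a.example/admin
# http://b.example/admin
# http://a.example/backup
# http://b.example/backup
```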

For convenience, the results are written to two kinds of files at once in the pathbuster-res folder:
- a per-host file (all paths found for that host);
- a per-status-code file (all 200 responses, all 301 responses, etc.).
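
With `-srd pathbuster-res`, the output directory looks roughly like this (hypothetical hosts; the naming follows `save_res` in `pathbuster/pathbuster.py`):

```
pathbuster-res/
├── http_example.com.txt   # all findings for http://example.com
├── _200.txt               # all responses with status 200, across hosts
├── _301.txt
├── _index.txt             # response file -> URL map (with --store_response)
└── responses/             # raw HTTP responses (with --store_response)
    └── http_example.com/
        └── <md5 of path>.txt
```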

Before scanning starts (with `-ac` enabled), the program requests random paths on each server and, if the response status code is not excluded by the settings, records a sample of the response (status code, line and word counts) for later comparison.
This filters out a large number of false positives (for example, when a server answers 200 OK to every request),
while still surfacing responses that differ from the recorded samples even when the status code is the same.
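
The comparison itself is cheap: two responses count as "the same page" when status code, line count and word count all match. A minimal sketch, mirroring `Response.is_similar` in `pathbuster/classes/response.py`:

```python
def is_similar(a, b):
    return (a.status, a.bodylines, a.bodywords) == (b.status, b.bodylines, b.bodywords)
```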

## Installation:
```
pip3 install -U pathbuster
```

## Basic usage:
```
pathbuster -u /path/to/URLS_FILE -p /path/to/wordlist -srd pathbuster-res
```
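
`URLS_FILE` holds one base URL per line, for example (hypothetical hosts):

```
https://example.com
https://sub.example.org:8443
```

and the wordlist one path per line:

```
admin
backup.zip
.git/config
```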

## Passive check with Nuclei
```
pathbuster -u /path/to/URLS_FILE -p /path/to/wordlist --store_response -srd pathbuster-res
```
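
The responses stored under `pathbuster-res/responses` can then be scanned offline with nuclei's passive mode. A possible invocation (the exact flags are an assumption, check `nuclei -h` for your version):

```
nuclei -passive -target pathbuster-res/responses
```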

## Options:
```
  -h, --help            show this help message and exit
  -m HTTP_METHOD, --http_method HTTP_METHOD
                        HTTP method to use (default: GET)
  -u URLS_FILE, --urls_file URLS_FILE
                        urls file (base url) (default: None)
  -p PATHS_FILE, --paths_file PATHS_FILE
                        paths wordlist (default: None)
  -e EXCLUDE_CODES, --exclude_codes EXCLUDE_CODES
                        Exclude status codes, separated by commas (Example: 404,403) (default: 404)
  -x EXTENSIONS, --extensions EXTENSIONS
                        Extension list separated by commas (Example: php,asp) (default: )
  -ac                   Automatically calibrate filtering options (default: False)
  -sr, --store_response
                        Store found HTTP responses (default: False)
  -srd STORE_RESPONSE_DIR, --store_response_dir STORE_RESPONSE_DIR
                        Output directory (default: None)
  -fe FILTER_REGEX, --filter-regex FILTER_REGEX
                        filter response with specified regex (-fe admin) (default: None)
  -json                 store output in JSONL(ines) format (default: False)
  -f, --follow_redirects
                        Follow HTTP redirects (same host only) (default: False)
  -H HEADER, --header HEADER
                        Add custom HTTP request header, support multiple flags (Example: -H "Referer: example.com" -H "Accept: */*") (default: None)
  --proxy PROXY         proxy ip:port (default: None)
  --max_response_size MAX_RESPONSE_SIZE
                        Maximum response size in bytes (default: 250000)
  --max_errors MAX_ERRORS
                        Maximum errors before url exclude (default: 5)
  -t THREADS, --threads THREADS
                        Number of threads (keep number of threads less than the number of hosts) (default: 10)
  -ua USER_AGENT, --user_agent USER_AGENT
                        User agent (default: Mozilla/5.0 (compatible; pathbuster/0.1; +https://github.com/rivalsec/pathbuster))
  --stats_interval STATS_INTERVAL
                        number of seconds to wait between showing a statistics update (default: 60)
  -maxr MAX_REDIRECTS, --max_redirects MAX_REDIRECTS
                        Max number of redirects to follow (default: 5)
```
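
With `-json`, each finding is printed as one JSON line whose keys mirror `Response.to_json` (the values below are hypothetical):

```
{"url": "http://example.com/admin", "status": 200, "reason": "OK", "parent_url": "http://example.com", "meta": ["(preflight differ)"], "scheme": "http", "host": "example.com"}
```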
--------------------------------------------------------------------------------
/pathbuster/pathbuster.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

import requests
import argparse
import threading
from requests.packages import urllib3
from io import BytesIO
import os
import urllib.parse
import sys
import time
import re
from classes.config import Config
from classes.response import Response
from utils.common import random_str


urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

get_work_locker = threading.Lock()
print_locker = threading.Lock()
preflight_iter = None
task_iter = None
preflight_samples = {}  # auto-calibration samples, keyed by base url
err_table = dict()
uniq_locs = set()


# global settings
conf = Config()


def work_prod(urls, paths, extensions=[''], update_stats=False):
    """Yield (url, path) pairs path-major: each path (with each extension)
    is fanned out across all hosts before the next path is taken."""
    for path in paths:
        for ext in extensions:
            p = path.lstrip('/')
            if ext:
                p += f".{ext.lstrip('.')}"
            if update_stats:
                stats['path'] = p
            for url in urls:
                if update_stats:
                    stats['reqs_done'] += 1
                if url in err_table and err_table[url] >= conf.max_errors:
                    continue
                yield (url.rstrip('/'), p)


def truncated_stream_res(s: requests.Response, max_size: int):
    """Read a streamed response body, stopping once max_size bytes are read."""
    bytes_read = 0
    with BytesIO() as buf:
        for chunk in s.iter_content(None, False):
            bytes_read += buf.write(chunk)
            if bytes_read > max_size:
                break
        r = buf.getvalue()
    return r


def process_url(url, parent=None):
    with requests.request(conf.http_method, url, headers=conf.headers, timeout=conf.timeout, verify=False, stream=True, allow_redirects=False, proxies=conf.proxies) as s:
        body = truncated_stream_res(s, conf.max_response_size)
        return Response(url, s.status_code, s.reason, body, s.headers, parent_url=parent)


def lprint(s, **kwargs):
    with print_locker:
        print(s, **kwargs)


def save_res(s: Response):
    fn = f'{conf.res_dir}/{s.scheme}_{s.host}.txt'
    with print_locker:
        with open(fn, "a") as f:
            f.write(str(s) + "\n")
        with open(f'{conf.res_dir}/_{s.status}.txt', 'a') as f:
            f.write(str(s) + '\n')
    if conf.store_response and s.bodylen:
        site_dir = f'{conf.res_dir}/responses/{s.scheme}_{s.host}'
        res_fn = f'{site_dir}/{s.path_hash}.txt'
        # exist_ok avoids a race between worker threads creating the same dir
        os.makedirs(site_dir, exist_ok=True)
        with print_locker:
            with open(f'{conf.res_dir}/_index.txt', 'a') as f:
                f.write(f'{res_fn}\t{s.url}\n')
        with open(res_fn, 'wb') as f:
            f.write(f'HTTP/2 {s.status} {s.reason}\n'.encode())
            for k, v in s.headers.items():
                # removed because of a nuclei parse error in passive mode
                if k.title() == 'Transfer-Encoding':
                    continue
                f.write(f'{k.title()}: {v}\n'.encode())
            f.write('\n'.encode())
            f.write(s.body)


def preflight_worker():
    while True:
        with get_work_locker:
            try:
                url, path = next(preflight_iter)
            except StopIteration:
                return

        try:
            res = process_url(f'{url}/{path}', url)
        except Exception as e:
            err_table[url] = err_table.get(url, 0) + 1
            # lprint(str(e), file=sys.stderr)
            continue

        # collect samples (status code, line and word counts) for future
        # comparison if the random url's response status is not excluded by settings
        if res.status not in conf.exclude_codes:
            if url not in preflight_samples:
                preflight_samples[url] = []

            if len(preflight_samples[url]) == 0 or samples_diff(res, url):
                lprint(f"{res} status code not excluded, add to preflight samples", file=sys.stderr)
                preflight_samples[url].append(res)


def samples_diff(res: Response, url: str):
    """Does res differ from ALL recorded samples for this url?"""
    for sample in preflight_samples.get(url, []):
        if res.is_similar(sample):
            return False
    return True


def result_valid(res: Response):
    if res.status in conf.exclude_codes:
        return False

    if conf.filter_regex:
        if re.search(conf.filter_regex, res.body.decode('utf-8', 'ignore')):
            res.add_meta(f"{conf.filter_regex} match")
        else:
            return False

    # only populated when auto-calibration (-ac) is active
    if len(preflight_samples) > 0:
        if samples_diff(res, res.parent_url):
            res.add_meta('(preflight differ)')
        else:
            return False

    # passed all filters
    return True


def worker_process(url, parent, redirect_count=0):
    try:
        res = process_url(url, parent)
    except requests.exceptions.RequestException as e:
        err_table[url] = err_table.get(url, 0) + 1
        # lprint(str(e))
        return

    if result_valid(res):
        if conf.json_print:
            lprint(res.to_json(conf.store_response))
        else:
            lprint(f"{res}")
        if conf.res_dir:
            save_res(res)
        # follow same-host redirects on valid results
        if res.location and conf.follow_redirects and redirect_count < conf.max_redirects:
            if res.location.startswith('http://') or res.location.startswith('https://'):
                location = res.location
            else:
                location = urllib.parse.urljoin(res.url, res.location)

            loc_p = urllib.parse.urlparse(location)
            loc_wo_query = f'{loc_p.scheme}://{loc_p.netloc}{loc_p.path}'
            if loc_p.netloc == res.host and loc_wo_query not in uniq_locs:
                redirect_count += 1
                uniq_locs.add(loc_wo_query)
                worker_process(location, parent, redirect_count)


def worker():
    while True:
        with get_work_locker:
            try:
                url, path = next(task_iter)
            except StopIteration:
                return
        urlpath = f"{url}/{path}"
        worker_process(urlpath, url)


def statworker(looptime=5):
    while True:
        time.sleep(looptime)
        time_passed = time.time() - stats['starttime']
        req_left = stats['allreqs'] - stats['reqs_done']
        vel = int(stats["reqs_done"] / time_passed * 60)
        try:
            timeleft = req_left // vel
        except ZeroDivisionError:
            timeleft = 0
        lprint(f'[Statistics] path: {stats["path"]}, {stats["reqs_done"]}/{stats["allreqs"]} requests, speed {vel} req/min (about {timeleft} min left)', file=sys.stderr)


def start_thread_pool(threads, worker_fn):
    workers = []
    for i in range(threads):
        t = threading.Thread(target=worker_fn, name=f'worker {i}', args=())
        t.start()
        workers.append(t)

    for w in workers:
        w.join()


def parse_args(sys_args):
    global conf

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description='multiple hosts web path scanner')
    parser.add_argument('-m', '--http_method', type=str, help='HTTP method to use', default='GET')
    parser.add_argument('-u', '--urls_file', type=argparse.FileType(mode='r', encoding='UTF-8'), help='urls file (base url)', required=True)
    parser.add_argument('-p', '--paths_file', type=argparse.FileType(mode='r', encoding='UTF-8'), help='paths wordlist', required=True)
    parser.add_argument('-e', '--exclude_codes', type=str, help="Exclude status codes, separated by commas (Example: 404,403)", default="404")
    parser.add_argument('-x', '--extensions', type=str, help="Extension list separated by commas (Example: php,asp)", default="")
    parser.add_argument('-ac', action='store_true', help='Automatically calibrate filtering options')
    parser.add_argument('-sr', '--store_response', action='store_true', help='Store found HTTP responses')
    parser.add_argument('-srd', '--store_response_dir', type=str, help='Output directory')
    parser.add_argument('-fe', '--filter-regex', type=str, help='filter response with specified regex (-fe admin)', default=None)
    parser.add_argument('-json', action='store_true', help='store output in JSONL(ines) format')
    parser.add_argument('-f', '--follow_redirects', action='store_true', help='Follow HTTP redirects (same host only)')
    parser.add_argument('-H', '--header', action='append', help="Add custom HTTP request header, support multiple flags (Example: -H \"Referer: example.com\" -H \"Accept: */*\")")
    parser.add_argument('--proxy', type=str, help='proxy ip:port', default=None)
    parser.add_argument('--max_response_size', type=int, help='Maximum response size in bytes', default=250000)
    parser.add_argument('--max_errors', type=int, help='Maximum errors before url exclude', default=5)
    parser.add_argument('-t', '--threads', type=int, help='Number of threads (keep number of threads less than the number of hosts)', default=10)
    parser.add_argument('-ua', '--user_agent', type=str, help="User agent", default="Mozilla/5.0 (compatible; pathbuster/0.1; +https://github.com/rivalsec/pathbuster)")
    parser.add_argument('--stats_interval', type=int, help="number of seconds to wait between showing a statistics update", default=60)
    parser.add_argument('-maxr', '--max_redirects', type=int, help='Max number of redirects to follow', default=5)

    args = parser.parse_args(sys_args)

    if args.proxy:
        conf.proxies = {
            'http': 'http://' + args.proxy,
            'https': 'http://' + args.proxy
        }

    conf.headers["User-Agent"] = args.user_agent
    if args.header:
        for h in args.header:
            k, v = [x.strip() for x in h.split(':', maxsplit=1)]
            conf.headers[k] = v

    if args.exclude_codes:
        conf.exclude_codes = [int(x.strip()) for x in args.exclude_codes.strip(',').split(',')]

    if args.extensions:
        conf.extensions.extend([x.strip() for x in args.extensions.strip().strip(',').split(',')])

    conf.max_errors = args.max_errors
    conf.http_method = args.http_method
    conf.max_response_size = args.max_response_size
    conf.store_response = args.store_response
    conf.filter_regex = args.filter_regex
    conf.json_print = args.json
    conf.follow_redirects = args.follow_redirects
    conf.max_redirects = args.max_redirects
    conf.res_dir = args.store_response_dir
    conf.stats_interval = args.stats_interval
    return args


def auto_calibration(urls, threads):
    global preflight_iter
    print("Collecting auto-calibration samples...", file=sys.stderr)
    # auto calibration like in ffuf
    acStrings = [
        random_str(16),
        random_str(16) + '/',
        '.' + random_str(16) + '/',
        '.htaccess' + random_str(16),
        'admin' + random_str(16) + '/'
    ]
    acStrings.extend([random_str(16) + '.' + ext for ext in conf.extensions if ext])
    preflight_iter = work_prod(urls, acStrings)
    start_thread_pool(threads, preflight_worker)


def fuzz(urls, paths, extensions, threads, ac=False):
    global task_iter, stats

    if conf.res_dir:
        if not os.path.exists(conf.res_dir):
            os.mkdir(conf.res_dir)
        if conf.store_response and not os.path.exists(conf.res_dir + "/responses"):
            os.mkdir(conf.res_dir + "/responses")

    if ac:
        auto_calibration(urls, threads)

    # stats
    stats = {
        "allreqs": len(urls) * len(paths) * len(conf.extensions),
        "reqs_done": 0,
        "path": "",
        "starttime": time.time(),
    }
    st = threading.Thread(target=statworker, daemon=True, name='StatThread', args=(conf.stats_interval,))
    st.start()

    task_iter = work_prod(urls, paths, extensions, True)
    start_thread_pool(threads, worker)


def main():
    args = parse_args(sys.argv[1:])

    # skip empty lines so a trailing newline doesn't produce an empty target
    urls = [l.strip() for l in args.urls_file if l.strip()]
    args.urls_file.close()

    paths = [l.strip() for l in args.paths_file if l.strip()]
    args.paths_file.close()

    fuzz(urls, paths, conf.extensions, args.threads, args.ac)

    print('THE END', file=sys.stderr)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------