├── .github ├── FUNDING.yml └── workflows │ ├── publish-to-pypi.yml │ └── test.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE.md ├── README.md ├── demo-result.txt ├── demo.txt ├── requirements.txt ├── setup.py ├── tests ├── __init__.py └── uddup_test.py └── uddup ├── __init__.py └── main.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | custom: ["https://www.buymeacoffee.com/2RS3C"] 2 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distributions 📦 to PyPI 2 | on: 3 | release: 4 | types: 5 | - published 6 | jobs: 7 | run-tests: 8 | name: Run unit-tests 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@master 12 | - name: Set up Python 3.x 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: 3.x 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install -r requirements.txt 20 | - name: Test with pytest 21 | run: | 22 | pytest 23 | build-n-publish: 24 | name: Build and publish Python 🐍 distributions 📦 to PyPI 25 | runs-on: ubuntu-latest 26 | steps: 27 | - uses: actions/checkout@master 28 | - name: Set up Python 3.x 29 | uses: actions/setup-python@v2 30 | with: 31 | python-version: 3.x 32 | - name: Install pypa/build 33 | run: >- 34 | python -m 35 | pip install 36 | build 37 | --user 38 | - name: Build a binary wheel and a source tarball 39 | run: >- 40 | python -m 41 | build 42 | --sdist 43 | --wheel 44 | --outdir dist/ 45 | . 
46 | - name: Publish distribution 📦 to PyPI 47 | if: startsWith(github.ref, 'refs/tags') 48 | uses: pypa/gh-action-pypi-publish@master 49 | with: 50 | password: ${{ secrets.PYPI_API_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Run Tests 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | branches: 8 | - master 9 | jobs: 10 | run-tests: 11 | name: Run unit-tests 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@master 15 | - name: Set up Python 3.x 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: 3.x 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install -r requirements.txt 23 | - name: Test with pytest 24 | run: | 25 | pytest 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea 3 | __pycache__ 4 | .pytest_cache 5 | build 6 | dist 7 | uddup.egg-info 8 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## v0.9.3 (28/02/2021) 4 | ### Bug Fixes: 5 | 6 | - [#5](https://github.com/rotemreiss/uddup/pull/5) Fix a bug with unicode char in urls (UTF-8 support) 7 | 8 | ## v0.9.2 (08/02/2021) 9 | 10 | #### Enhancements: 11 | 12 | - [#3](https://github.com/rotemreiss/uddup/issues/3) [feature request] Support paths filtering by Regex 13 | 14 | #### Bug Fixes: 15 | 16 | - [#2](https://github.com/rotemreiss/uddup/issues/2) Multiple hostnames (domains) which shares the same patterns conflicts 17 | 18 | --- 19 | 20 | ## v0.9.1.1 (06/02/2021) 21 | 22 | First stable release. 
23 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Rotem Reiss 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # UDdup - URLs Deduplication Tool 2 | 3 | The tool gets a list of URLs, and removes "duplicate" pages in the sense 4 | of URL patterns that are probably repetitive and point to the same web template. 
5 | 6 | For example: 7 | ``` 8 | https://www.example.com/product/123 9 | https://www.example.com/product/456 10 | https://www.example.com/product/123?is_prod=false 11 | https://www.example.com/product/222?is_debug=true 12 | ``` 13 | All of the above probably point to the same product "template". 14 | Therefore, it should be enough to scan only some of these URLs by our various scanners. 15 | 16 | The result of the above after UDdup should be: 17 | ``` 18 | https://www.example.com/product/123?is_prod=false 19 | https://www.example.com/product/222?is_debug=true 20 | ``` 21 | 22 | ## Why do I need it? 23 | Mostly for a better (automated) reconnaissance process, 24 | with less noise (for both the tester and the target). 25 | 26 | ## Examples 27 | Take a look at `demo.txt`, the raw URLs file, which results in `demo-result.txt`. 28 | 29 | --- 30 | 31 | ## Installation 32 | ### With pip (Recommended) 33 | ```bash 34 | pip install uddup 35 | ``` 36 | 37 | ### Manual (from code) 38 | ```bash 39 | # Clone the repository. 40 | git clone https://github.com/rotemreiss/uddup.git 41 | 42 | # Install the Python requirements. 43 | cd uddup 44 | pip install -r requirements.txt 45 | ``` 46 | 47 | --- 48 | ## Usage 49 | 50 | `uddup -u demo.txt -o ./demo-result.txt` 51 | 52 | ### More Usage Options 53 | `uddup -h` 54 | 55 | Short Form | Long Form | Description 56 | ------------- | -------------------- |------------- 57 | -h | --help | Show this help message and exit 58 | -u | --urls | File with a list of urls 59 | -o | --output | Save results to a file 60 | -s | --silent | Print only the result URLs 61 | -fp | --filter-path | Filter paths by a given Regex 62 | 63 | ### Filter Paths by Regex 64 | Allows filtering out paths that match a custom pattern. 
65 | For example, if we would like to filter out all paths that start with `/product`, we will need to run: 66 | ```bash 67 | # Single Regex 68 | uddup -u demo.txt -fp "^product" 69 | ``` 70 | 71 | **Input:** 72 | ```bash 73 | https://www.example.com/ 74 | https://www.example.com/privacy-policy 75 | https://www.example.com/product/1 76 | https://www.example2.com/product/2 77 | https://www.example3.com/product/4 78 | ``` 79 | 80 | **Output:** 81 | ```bash 82 | https://www.example.com/ 83 | https://www.example.com/privacy-policy 84 | ``` 85 | 86 | ### Advanced Regex with multiple path filters 87 | ```bash 88 | uddup -u demo.txt -fp "(^product)|(^category)" 89 | ``` 90 | --- 91 | ## Contributing 92 | Feel free to fork the repository and submit pull-requests. 93 | 94 | --- 95 | 96 | ## Support 97 | 98 | [Create new GitHub issue][newissue] 99 | 100 | Want to say thanks? :) Message me on LinkedIn 101 | 102 | 103 | --- 104 | 105 | ## License 106 | 107 | [![License](http://img.shields.io/:license-mit-blue.svg?style=flat-square)](http://badges.mit-license.org) 108 | 109 | - **[MIT license](http://opensource.org/licenses/mit-license.php)** 110 | 111 | 112 | [newissue]: https://github.com/rotemreiss/uddup/issues/new 113 | -------------------------------------------------------------------------------- /demo-result.txt: -------------------------------------------------------------------------------- 1 | http://www.example.com/ 2 | https://www.example.com/ 3 | https://www.example.com/about 4 | https://www.example.com/category/hidden.html 5 | https://www.example.com/category/index.php 6 | https://www.example.com/category/watches?paramkeynoval 7 | https://www.example.com/privacy-policy 8 | https://www.example.com/product/123?is_prod=false 9 | https://www.example.com/product/456?foo=bar&main=true 10 | https://www.example.com/product/456?is_debug=true&main=true&baz=2 11 | https://www.example.com/utf8/is/supported/בדיקה 12 | https://www.example2.com/product/2?is_prod=true 13 | 
"""Packaging script for the ``uddup`` distribution."""
from setuptools import setup, find_packages

# Read the long description as UTF-8 explicitly, so that building the package
# does not depend on the platform's default locale encoding.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setup(
    name="uddup",
    version="0.9.3",
    author="Rotem Reiss",
    author_email="reiss.r@gmail.com",
    description="URLs Deduplication Tool.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/rotemreiss/uddup",
    # The test package is not shipped with the distribution.
    packages=find_packages(exclude=['tests*']),
    install_requires=[],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    entry_points={
        # Installs the `uddup` command, which runs uddup.main.interactive().
        'console_scripts': [
            'uddup=uddup.main:interactive',
        ],
    },
)
#!/usr/bin/python
# coding=utf-8
"""UDdup - drop URLs that repeat an already-seen URL pattern."""
import argparse
import sys
import os
import re
from urllib.parse import urlparse

# Check if we are running this on windows platform
is_windows = sys.platform.startswith('win')

# Console Colors
G = '\033[92m'  # green
Y = '\033[93m'  # yellow
W = '\033[0m'  # white
if is_windows:
    # Windows deserves coloring too :D
    try:
        import win_unicode_console
        import colorama
        win_unicode_console.enable()
        colorama.init()
    except Exception:
        # Best effort: if the optional helpers are missing or fail to
        # initialise, fall back to uncolored output instead of crashing.
        G = Y = W = ''


def banner():
    """Print the colored ASCII-art banner to stdout."""
    # Raw string: the art contains backslashes that are not escape sequences.
    print(r"""%s
  _   _ ____      _
 | | | |  _ \  __| |_   _ _ __
 | | | | | | |/ _` | | | | '_ \
 | |_| | |_| | (_| | |_| | |_) |
  \___/|____/ \__,_|\__,_| .__/
                         |_|

              %s# Coded By @2RS3C
    %s""" % (Y, G, W))


def file_arg(path):
    """Argparse ``type=`` callback: return *path* if it is an existing file.

    Raises:
        ValueError: if *path* is not a regular file. argparse reports a
            ValueError raised by a type callback as a usage error.
    """
    if not os.path.isfile(path):
        raise ValueError("not an existing file: %r" % (path,))
    return path


def get_ignored_suffixes():
    """Return the file extensions (without dot) of URLs that are dropped."""
    return (
        'css',
        'js',
        'gif',
        'jpg',
        'png',
        'jpeg',
        'svg',
        'xml',
        'txt',
        'json',
        'ico',
        'webp',
        'otf',
        'ttf',
        'woff',
        'woff2',
        'eot',
        'swf',
        'zip',
        'pdf',
        'doc',
        'ppt',
        'docx',
        'xls',
        'xlsx',
        'ogg',
        'mp4',
        'mp3',
        'mov',
    )


def get_web_suffixes():
    """Return the page extensions (without dot) of URLs that are kept as-is."""
    return (
        'htm',
        'html',
        'xhtml',
        'shtml',
        'jhtml',
        'cfm',
        'jsp',
        'jspx',
        'wss',
        'action',
        'php',
        'php4',
        'php5',
        'py',
        'rb',
        'pl',
        'do',
        'xml',
        'rss',
        'cgi',
        'axd',
        'asx',
        'asmx',
        'ashx',
        'asp',
        'aspx',
        'dll',
    )


def get_existing_pattern_urls(purl, uurls):
    """Return the URLs in *uurls* that share *purl*'s path pattern.

    The pattern is *purl*'s path without its last segment. A known URL
    matches when it has the same scheme and hostname and its path is the
    pattern itself or lies below it.

    Args:
        purl: parsed URL (``urlparse`` result) being examined.
        uurls: iterable of parsed URLs collected so far.

    Returns:
        A list of matching parsed URLs; empty when *purl*'s path has a
        single segment.
    """
    results = []

    path_parts = get_url_path(purl).split('/')

    # If there is only one path, return empty list.
    if len(path_parts) == 1:
        return results

    url_pattern = '/'.join(path_parts[:-1])
    # Match on a segment boundary so that "product" does not also claim
    # sibling paths such as "products/..." that merely share a prefix.
    pattern_prefix = url_pattern + '/'

    for uurl in uurls:
        # Skip different hostname and schemes (they can't be a match).
        if uurl.scheme != purl.scheme or uurl.hostname != purl.hostname:
            continue

        uurl_path = get_url_path(uurl)
        if uurl_path == url_pattern or uurl_path.startswith(pattern_prefix):
            results.append(uurl)

    return results


def get_query_params_keys(parsed_url_query):
    """Return the parameter names of a raw query string, in order.

    A parameter without ``=`` contributes its whole text as the key.
    """
    return [pair.split('=')[0] for pair in parsed_url_query.split('&')]


def is_all_params_exists(old_pattern, new_pattern):
    """Return True if every query key of *old_pattern* is in *new_pattern*."""
    new_params_keys = set(get_query_params_keys(new_pattern.query))
    return all(
        key in new_params_keys
        for key in get_query_params_keys(old_pattern.query)
    )


def has_more_params(old_pattern, new_pattern):
    """Return True if *new_pattern* has more query keys than *old_pattern*."""
    old_params_keys = get_query_params_keys(old_pattern.query)
    new_params_keys = get_query_params_keys(new_pattern.query)
    return len(new_params_keys) > len(old_params_keys)


def get_url_path(purl):
    """Return the URL path with leading and trailing slashes removed."""
    return purl.path.strip('/')


def _path_extension(url_path):
    """Return the lower-cased extension of the last path segment, or ''."""
    last_segment = url_path.rsplit('/', 1)[-1]
    if '.' not in last_segment:
        return ''
    return last_segment.rsplit('.', 1)[-1].lower()


def _add_deduplicated(parsed_url, unique_urls):
    """Merge *parsed_url* into *unique_urls* according to its path pattern."""
    existing_pattern_urls = get_existing_pattern_urls(parsed_url, unique_urls)
    if not existing_pattern_urls:
        unique_urls.add(parsed_url)
        return

    # A URL without params adds nothing to an already known pattern.
    if not parsed_url.query:
        return

    for known in existing_pattern_urls:
        if not known.query:
            # Favor URL patterns with params over those without params.
            unique_urls.discard(known)
            unique_urls.add(parsed_url)
        elif not is_all_params_exists(known, parsed_url):
            # Different set of params: keep both URLs.
            unique_urls.add(parsed_url)
        elif has_more_params(known, parsed_url):
            # Same params plus extra ones: the new URL supersedes the old.
            unique_urls.discard(known)
            unique_urls.add(parsed_url)


def main(urls_file, output, silent, filter_path):
    """Deduplicate the URLs listed in *urls_file* and print the result.

    Args:
        urls_file: path of a UTF-8 text file with one URL per line.
        output: path of a file to save the results to; falsy to skip.
        silent: when true, the banner is not printed.
        filter_path: regex; URLs whose path matches it are dropped.
            Falsy to disable.

    Returns:
        The set of kept URLs as ``urlparse`` results.
    """
    unique_urls = set()

    # Every tool needs a banner.
    if not silent:
        banner()

    web_suffixes = frozenset(get_web_suffixes())
    ignored_suffixes = frozenset(get_ignored_suffixes())
    # Compile once instead of on every line.
    path_filter = re.compile(filter_path) if filter_path else None

    # Iterate over the given URLs
    with open(urls_file, 'r', encoding="utf-8") as f:
        for line in f:
            url = line.rstrip()
            if not url:
                continue

            parsed_url = urlparse(url)

            # @todo Reconsider the strip, since it can remove some interesting urls
            url_path = get_url_path(parsed_url)

            # If the URL doesn't have a path, just add it as is.
            # @todo Some dups can still occur, handle it
            if not url_path:
                unique_urls.add(parsed_url)
                continue

            # Compare the real file extension rather than the raw tail of
            # the path, so "/blog/nodejs" is not mistaken for a ".js" file.
            extension = _path_extension(url_path)

            # Do not add paths to common files.
            if extension in ignored_suffixes:
                continue

            # Filter paths by custom Regex if set.
            if path_filter is not None and path_filter.search(url_path):
                continue

            # Add as-is paths that points to a specific web extension (e.g. html).
            if extension in web_suffixes:
                unique_urls.add(parsed_url)
                continue

            # Do the more complicated ddup work.
            _add_deduplicated(parsed_url, unique_urls)

    print_results(unique_urls, output)
    return unique_urls


def print_results(uurls, output):
    """Print the URLs in sorted order and optionally save them to *output*.

    The URLs are always printed; a failure to write the file is reported
    on stdout and does not raise.
    """
    lines = [url.geturl() for url in sorted(uurls)]
    for line in lines:
        print(line)

    if not output:
        return

    try:
        # Explicit UTF-8 so non-ASCII URLs can be saved on every platform.
        with open(output, "w", encoding="utf-8") as f:
            f.writelines(line + "\n" for line in lines)
    except (OSError, UnicodeError):
        print('[X] Failed to save the output to a file.')


def interactive():
    """Command-line entry point: parse the arguments and run main()."""
    parser = argparse.ArgumentParser(description='Remove URL pattern duplications..')

    # Add the arguments
    parser.add_argument('-u', '--urls', help='File with a list of urls.', type=file_arg, dest='urls_file', required=True)
    parser.add_argument('-o', '--output', help='Save results to a file.', dest='output')
    parser.add_argument('-s', '--silent', help='Print only the result URLs.', action='store_true', dest='silent')
    parser.add_argument('-fp', '--filter-path', help='Filter paths by a given Regex.', dest='filter_path')
    args = parser.parse_args()

    main(args.urls_file, args.output, args.silent, args.filter_path)


if __name__ == "__main__":
    interactive()