├── .github
    ├── FUNDING.yml
    └── workflows
    │   ├── publish-to-pypi.yml
    │   └── test.yml
├── .gitignore
├── CHANGELOG.md
├── LICENSE.md
├── README.md
├── demo-result.txt
├── demo.txt
├── requirements.txt
├── setup.py
├── tests
    ├── __init__.py
    └── uddup_test.py
└── uddup
    ├── __init__.py
    └── main.py


/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | custom: ["https://www.buymeacoffee.com/2RS3C"]
2 | 


--------------------------------------------------------------------------------
/.github/workflows/publish-to-pypi.yml:
--------------------------------------------------------------------------------
 1 | name: Publish Python 🐍 distributions 📦 to PyPI
 2 | on:
 3 |   release:
 4 |     types:
 5 |       - published
 6 | jobs:
 7 |   run-tests:
 8 |     name: Run unit-tests
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |       - uses: actions/checkout@master
12 |       - name: Set up Python 3.x
13 |         uses: actions/setup-python@v2
14 |         with:
15 |           python-version: 3.x
16 |       - name: Install dependencies
17 |         run: |
18 |           python -m pip install --upgrade pip
19 |           pip install -r requirements.txt
20 |       - name: Test with pytest
21 |         run: |
22 |           pytest
23 |   build-n-publish:
24 |     name: Build and publish Python 🐍 distributions 📦 to PyPI
25 |     runs-on: ubuntu-latest
26 |     steps:
27 |       - uses: actions/checkout@master
28 |       - name: Set up Python 3.x
29 |         uses: actions/setup-python@v2
30 |         with:
31 |           python-version: 3.x
32 |       - name: Install pypa/build
33 |         run: >-
34 |           python -m
35 |           pip install
36 |           build
37 |           --user
38 |       - name: Build a binary wheel and a source tarball
39 |         run: >-
40 |           python -m
41 |           build
42 |           --sdist
43 |           --wheel
44 |           --outdir dist/
45 |           .
46 |       - name: Publish distribution 📦 to PyPI
47 |         if: startsWith(github.ref, 'refs/tags')
48 |         uses: pypa/gh-action-pypi-publish@master
49 |         with:
50 |           password: ${{ secrets.PYPI_API_TOKEN }}


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Run Tests
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - master
 6 |   pull_request:
 7 |     branches:
 8 |       - master
 9 | jobs:
10 |   run-tests:
11 |     name: Run unit-tests
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - uses: actions/checkout@master
15 |       - name: Set up Python 3.x
16 |         uses: actions/setup-python@v2
17 |         with:
18 |           python-version: 3.x
19 |       - name: Install dependencies
20 |         run: |
21 |           python -m pip install --upgrade pip
22 |           pip install -r requirements.txt
23 |       - name: Test with pytest
24 |         run: |
25 |           pytest
26 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .idea
3 | __pycache__
4 | .pytest_cache
5 | build
6 | dist
7 | uddup.egg-info
8 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | ## v0.9.3 (28/02/2021)
 4 | #### Bug Fixes:
 5 | 
 6 | - [#5](https://github.com/rotemreiss/uddup/pull/5) Fix a bug with unicode char in urls (UTF-8 support)
 7 | 
 8 | ## v0.9.2 (08/02/2021)
 9 | 
10 | #### Enhancements:
11 | 
12 | - [#3](https://github.com/rotemreiss/uddup/issues/3) [feature request] Support paths filtering by Regex
13 | 
14 | #### Bug Fixes:
15 | 
16 | - [#2](https://github.com/rotemreiss/uddup/issues/2) Multiple hostnames (domains) which shares the same patterns conflicts
17 | 
18 | ---
19 | 
20 | ## v0.9.1.1 (06/02/2021)
21 | 
22 | First stable release.
23 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Rotem Reiss
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # UDdup - URLs Deduplication Tool
  2 | 
  3 | The tool takes a list of URLs and removes "duplicate" pages, in the sense
  4 | of URL patterns that are probably repetitive and point to the same web template.
  5 | 
  6 | For example:
  7 | ```
  8 | https://www.example.com/product/123
  9 | https://www.example.com/product/456
 10 | https://www.example.com/product/123?is_prod=false
 11 | https://www.example.com/product/222?is_debug=true
 12 | ```
 13 | All of the above probably point to the same product "template".
 14 | Therefore, it should be enough to scan only some of these URLs with our various scanners.
 15 | 
 16 | The result of the above after UDdup should be:
 17 | ```
 18 | https://www.example.com/product/123?is_prod=false
 19 | https://www.example.com/product/222?is_debug=true
 20 | ```
 21 | 
 22 | ## Why do I need it?
 23 | Mostly for a better (automated) reconnaissance process,
 24 | with less noise (for both the tester and the target).
 25 | 
 26 | ## Examples
 27 | Take a look at `demo.txt`, the raw URLs file, which results in `demo-result.txt`.
 28 | 
 29 | ---
 30 | 
 31 | ## Installation
 32 | ### With pip (Recommended)
 33 | ```bash
 34 | pip install uddup
 35 | ```
 36 | 
 37 | ### Manual (from code)
 38 | ```bash
 39 | # Clone the repository.
 40 | git clone https://github.com/rotemreiss/uddup.git
 41 | 
 42 | # Install the Python requirements.
 43 | cd uddup
 44 | pip install -r requirements.txt
 45 | ```
 46 | 
 47 | ---
 48 | ## Usage
 49 | 
 50 | `uddup -u demo.txt -o ./demo-result.txt`
 51 | 
 52 | ### More Usage Options
 53 | `uddup -h`
 54 | 
 55 | Short Form    | Long Form            | Description
 56 | ------------- | -------------------- |-------------
 57 | -h            | --help               | Show this help message and exit
 58 | -u            | --urls               | File with a list of urls
 59 | -o            | --output             | Save results to a file
 60 | -s            | --silent             | Print only the result URLs
 61 | -fp           | --filter-path        | Filter paths by a given Regex
 62 | 
 63 | ### Filter Paths by Regex
 64 | Allows filtering out paths that match a custom pattern.
 65 | For example, if we would like to filter all paths that start with `/product` we will need to run:
 66 | ```bash
 67 | # Single Regex
 68 | uddup -u demo.txt -fp "^product"
 69 | ```
 70 | 
 71 | **Input:**
 72 | ```bash
 73 | https://www.example.com/
 74 | https://www.example.com/privacy-policy
 75 | https://www.example.com/product/1
 76 | https://www.example2.com/product/2
 77 | https://www.example3.com/product/4
 78 | ```
 79 | 
 80 | **Output:**
 81 | ```bash
 82 | https://www.example.com/
 83 | https://www.example.com/privacy-policy
 84 | ```
 85 | 
 86 | ### Advanced Regex with multiple path filters
 87 | ```bash
 88 | uddup -u demo.txt -fp "(^product)|(^category)"
 89 | ```
 90 | ---
 91 | ## Contributing
 92 | Feel free to fork the repository and submit pull-requests.
 93 | 
 94 | ---
 95 | 
 96 | ## Support
 97 | 
 98 | [Create new GitHub issue][newissue]
 99 | 
100 | Want to say thanks? :) Message me on <a href="https://www.linkedin.com/in/reissr" target="_blank">LinkedIn</a>
101 | 
102 | 
103 | ---
104 | 
105 | ## License
106 | 
107 | [![License](http://img.shields.io/:license-mit-blue.svg?style=flat-square)](http://badges.mit-license.org)
108 | 
109 | - **[MIT license](http://opensource.org/licenses/mit-license.php)**
110 | 
111 | <!-- Markdown helper -->
112 | [newissue]: https://github.com/rotemreiss/uddup/issues/new
113 | 


--------------------------------------------------------------------------------
/demo-result.txt:
--------------------------------------------------------------------------------
 1 | http://www.example.com/
 2 | https://www.example.com/
 3 | https://www.example.com/about
 4 | https://www.example.com/category/hidden.html
 5 | https://www.example.com/category/index.php
 6 | https://www.example.com/category/watches?paramkeynoval
 7 | https://www.example.com/privacy-policy
 8 | https://www.example.com/product/123?is_prod=false
 9 | https://www.example.com/product/456?foo=bar&main=true
10 | https://www.example.com/product/456?is_debug=true&main=true&baz=2
11 | https://www.example.com/utf8/is/supported/בדיקה
12 | https://www.example2.com/product/2?is_prod=true
13 | 


--------------------------------------------------------------------------------
/demo.txt:
--------------------------------------------------------------------------------
 1 | http://www.example.com/
 2 | https://www.example.com/
 3 | https://www.example.com/privacy-policy
 4 | https://www.example.com/about
 5 | https://www.example.com/product/123
 6 | https://www.example.com/product/123?is_prod=false
 7 | https://www.example.com/product/123?is_debug=true
 8 | https://www.example.com/product/456?is_debug=true
 9 | https://www.example.com/product/5?is_debug=true&main=true
10 | https://www.example.com/product/51?is_debug=true&main=true
11 | https://www.example.com/product/456
12 | https://www.example.com/product/456?is_debug=true&main=true&baz=2
13 | https://www.example.com/product/456?foo=bar&main=true
14 | https://www.example.com/category/watches
15 | https://www.example.com/category/watches?paramkeynoval
16 | https://www.example.com/category/shirts
17 | https://www.example.com/category/hidden.html
18 | https://www.example.com/category/image.jpg
19 | https://www.example.com/category/picture.gif
20 | https://www.example.com/category/index.php
21 | https://www.example.com/utf8/is/supported/בדיקה
22 | https://www.example2.com/product/123
23 | https://www.example2.com/product/2?is_prod=true
24 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | colorama==0.4.3
2 | pytest==6.2.2
3 | win_unicode_console==0.5
4 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | with open("README.md", "r") as fh:
 4 |     long_description = fh.read()
 5 | 
 6 | setup(
 7 |     name="uddup",
 8 |     version="0.9.3",
 9 |     author="Rotem Reiss",
10 |     author_email="reiss.r@gmail.com",
11 |     description="URLs Deduplication Tool.",
12 |     long_description=long_description,
13 |     long_description_content_type="text/markdown",
14 |     url="https://github.com/rotemreiss/uddup",
15 |     packages=find_packages(exclude=['tests*']),
16 |     install_requires=[],
17 |     classifiers=[
18 |         "Programming Language :: Python :: 3",
19 |         "License :: OSI Approved :: MIT License",
20 |         "Operating System :: OS Independent",
21 |     ],
22 |     entry_points={
23 |         'console_scripts': [
24 |             'uddup=uddup.main:interactive',
25 |         ],
26 |     },
27 | )
28 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rotemreiss/uddup/c3e19ed364f84ebfcc83bfb25616f7fac34ab372/tests/__init__.py


--------------------------------------------------------------------------------
/tests/uddup_test.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | # coding=utf-8
 3 | from urllib.parse import urlparse
 4 | import uddup.main
 5 | import pytest
 6 | 
 7 | def test_uddup_main():
 8 |     expected_result_raw = (
 9 |         "http://www.example.com/",
10 |         "https://www.example.com/",
11 |         "https://www.example.com/about",
12 |         "https://www.example.com/category/hidden.html",
13 |         "https://www.example.com/category/index.php",
14 |         "https://www.example.com/category/watches?paramkeynoval",
15 |         "https://www.example.com/privacy-policy",
16 |         "https://www.example.com/product/123?is_prod=false",
17 |         "https://www.example.com/product/456?foo=bar&main=true",
18 |         "https://www.example.com/product/456?is_debug=true&main=true&baz=2",
19 |         "https://www.example.com/utf8/is/supported/בדיקה",
20 |         "https://www.example2.com/product/2?is_prod=true"
21 |     )
22 | 
23 |     expected_result = set()
24 |     for url in expected_result_raw:
25 |         expected_result.add(urlparse(url.rstrip()))
26 | 
27 |     existing_urls = uddup.main.main("./demo.txt", "", True, None)
28 |     assert existing_urls == expected_result
29 | 
30 | 
def test_uddup_filter_path():
    """With the "^product" path filter, every /product URL is dropped from the result."""
    expected_urls = [
        "http://www.example.com/",
        "https://www.example.com/",
        "https://www.example.com/about",
        "https://www.example.com/category/hidden.html",
        "https://www.example.com/category/index.php",
        "https://www.example.com/category/watches?paramkeynoval",
        "https://www.example.com/privacy-policy",
        "https://www.example.com/utf8/is/supported/בדיקה",
    ]

    # main() returns a set of parsed URLs, so compare against parsed values.
    expected_result = {urlparse(raw.rstrip()) for raw in expected_urls}

    # Arguments: input file, no output file, silent mode, path-filter regex.
    actual_result = uddup.main.main("./demo.txt", "", True, "^product")
    assert actual_result == expected_result
49 | 


--------------------------------------------------------------------------------
/uddup/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rotemreiss/uddup/c3e19ed364f84ebfcc83bfb25616f7fac34ab372/uddup/__init__.py


--------------------------------------------------------------------------------
/uddup/main.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # coding=utf-8
  3 | import argparse
  4 | import sys
  5 | import os
  6 | import re
  7 | from urllib.parse import urlparse
  8 | 
  9 | # Check if we are running this on windows platform
 10 | is_windows = sys.platform.startswith('win')
 11 | 
 12 | # Console Colors
 13 | if is_windows:
 14 |     # Windows deserves coloring too :D
 15 |     G = '\033[92m'  # green
 16 |     Y = '\033[93m'  # yellow
 17 |     W = '\033[0m'   # white
 18 |     try:
 19 |         import win_unicode_console, colorama
 20 |         win_unicode_console.enable()
 21 |         colorama.init()
 22 |     except:
 23 |         G = Y = W = ''
 24 | else:
 25 |     G = '\033[92m'  # green
 26 |     Y = '\033[93m'  # yellow
 27 |     W = '\033[0m'   # white
 28 | 
 29 | 
 30 | def banner():
 31 |     print("""%s
 32 |   _   _ ____      _             
 33 |  | | | |  _ \  __| |_   _ _ __  
 34 |  | | | | | | |/ _` | | | | '_ \ 
 35 |  | |_| | |_| | (_| | |_| | |_) |
 36 |   \___/|____/ \__,_|\__,_| .__/ 
 37 |                          |_|    
 38 | 
 39 |               %s# Coded By @2RS3C
 40 |     %s""" % (Y, G, W))
 41 | 
 42 | 
 43 | def file_arg(path):
 44 |     # from os.path import exists
 45 |     if not os.path.isfile(path):
 46 |         raise ValueError  # or TypeError, or `argparse.ArgumentTypeError
 47 |     return path
 48 | 
 49 | 
 50 | def get_ignored_suffixes():
 51 |     return (
 52 |         'css',
 53 |         'js',
 54 |         'gif',
 55 |         'jpg',
 56 |         'png',
 57 |         'jpeg',
 58 |         'svg',
 59 |         'xml',
 60 |         'txt',
 61 |         'json',
 62 |         'ico',
 63 |         'webp',
 64 |         'otf',
 65 |         'ttf',
 66 |         'woff',
 67 |         'woff2',
 68 |         'eot',
 69 |         'swf',
 70 |         'zip',
 71 |         'pdf',
 72 |         'doc',
 73 |         'ppt',
 74 |         'docx',
 75 |         'xls',
 76 |         'xlsx',
 77 |         'ogg',
 78 |         'mp4',
 79 |         'mp3',
 80 |         'mov'
 81 |     )
 82 | 
 83 | 
 84 | def get_web_suffixes():
 85 |     return (
 86 |         'htm',
 87 |         'html',
 88 |         'xhtml',
 89 |         'shtml',
 90 |         'jhtml',
 91 |         'cfm',
 92 |         'jsp',
 93 |         'jspx',
 94 |         'wss',
 95 |         'action',
 96 |         'php',
 97 |         'php4',
 98 |         'php5',
 99 |         'py',
100 |         'rb',
101 |         'pl',
102 |         'do',
103 |         'xml',
104 |         'rss',
105 |         'cgi',
106 |         'axd',
107 |         'asx',
108 |         'asmx',
109 |         'ashx',
110 |         'asp',
111 |         'aspx',
112 |         'dll'
113 |     )
114 | 
115 | 
def get_existing_pattern_urls(purl, uurls):
    """Return the already-collected URLs that share ``purl``'s URL pattern.

    The pattern of a URL is its path without the last segment (for
    ``/product/123`` the pattern is ``product``). A collected URL matches when
    it has the same scheme and hostname and its path is the pattern itself or
    lies below it.

    :param purl: parsed URL (``urlparse`` result) being examined.
    :param uurls: iterable of parsed URLs collected so far.
    :return: list of the entries of ``uurls`` that match; empty when
        ``purl``'s path has a single segment.
    """
    path_parts = get_url_path(purl).split('/')

    # If there is only one path, return empty list.
    if len(path_parts) == 1:
        return []

    url_pattern = '/'.join(path_parts[:-1])
    # Compare whole path segments: a plain startswith(url_pattern) would let
    # the pattern "product" also match "products/9" or "production/x".
    pattern_prefix = url_pattern + '/'

    results = []
    for uurl in uurls:
        # Skip different hostname and schemes (they can't be a match).
        if uurl.scheme != purl.scheme or uurl.hostname != purl.hostname:
            continue

        uurl_path = get_url_path(uurl)
        if uurl_path == url_pattern or uurl_path.startswith(pattern_prefix):
            results.append(uurl)

    return results
141 | 
142 | 
def get_query_params_keys(parsed_url_query):
    """Return the parameter names found in a URL query string.

    :param parsed_url_query: query part of a URL without the leading ``?``,
        e.g. ``"is_debug=true&main=true"``.
    :return: list of keys in order of appearance. A parameter without a value
        (``"flag"``) contributes its whole text as the key. Empty segments,
        as produced by an empty query, ``"&&"`` or a trailing ``"&"``, are
        skipped instead of being reported as an empty key.
    """
    return [
        pair.split('=', 1)[0]
        for pair in parsed_url_query.split('&')
        if pair
    ]
150 | 
151 | 
def is_all_params_exists(old_pattern, new_pattern):
    """Tell whether every query key of ``old_pattern`` also appears in ``new_pattern``.

    :param old_pattern: parsed URL that is already part of the result.
    :param new_pattern: parsed URL that is being examined.
    :return: True when no key of the old URL is missing from the new one.
    """
    required_keys = get_query_params_keys(old_pattern.query)
    available_keys = get_query_params_keys(new_pattern.query)
    return all(key in available_keys for key in required_keys)
161 | 
162 | 
def has_more_params(old_pattern, new_pattern):
    """Tell whether ``new_pattern`` has more query parameters than ``old_pattern``.

    :param old_pattern: parsed URL that is already part of the result.
    :param new_pattern: parsed URL that is being examined.
    :return: True when the new URL has strictly more query keys.
    """
    old_count = len(get_query_params_keys(old_pattern.query))
    new_count = len(get_query_params_keys(new_pattern.query))
    return old_count < new_count
167 | 
168 | 
def get_url_path(purl):
    """Return the path of a parsed URL without leading and trailing slashes.

    :param purl: parsed URL (``urlparse`` result).
    :return: the path, e.g. ``"product/123"``; an empty string for the root.
    """
    raw_path = purl.path
    return raw_path.strip('/')
171 | 
172 | 
def main(urls_file, output, silent, filter_path):
    """Deduplicate the URLs listed in a file by their URL pattern.

    The file is read as UTF-8, one URL per line, and processed in file order.
    The surviving URLs are printed (and saved when ``output`` is set) through
    print_results().

    :param urls_file: path of the file holding the URLs.
    :param output: path of the file to save the result to; falsy to only print.
    :param silent: when true, the banner is not printed.
    :param filter_path: optional regular expression; URLs whose
        slash-stripped path matches it (``re.search``) are dropped.
    :return: set of the surviving URLs as ``urlparse`` results.
    """
    unique_urls = set()

    # Every tool needs a banner.
    if not silent:
        banner()

    # Compare against real file extensions (".js"), not bare endings ("js"):
    # otherwise paths such as "/nodejs" or "/docs/css" would be mistaken for
    # static files and dropped.
    web_suffixes = tuple('.' + suffix for suffix in get_web_suffixes())
    ignored_suffixes = tuple('.' + suffix for suffix in get_ignored_suffixes())

    # Iterate over the given URLs.
    with open(urls_file, 'r', encoding="utf-8") as f:
        for url in f:
            url = url.rstrip()
            if not url:
                continue

            parsed_url = urlparse(url)

            # @todo Reconsider the strip, since it can remove some interesting urls
            url_path = get_url_path(parsed_url)

            # If the URL doesn't have a path, just add it as is.
            # @todo Some dups can still occur, handle it
            if not url_path:
                unique_urls.add(parsed_url)
                continue

            # Do not add paths to common files.
            if url_path.endswith(ignored_suffixes):
                continue

            # Filter paths by custom Regex if set.
            if filter_path and re.search(filter_path, url_path):
                continue

            # Add as-is paths that points to a specific web extension (e.g. html).
            if url_path.endswith(web_suffixes):
                unique_urls.add(parsed_url)
                continue

            # Do the more complicated ddup work.
            # Get existing URL patterns from our unique patterns.
            existing_pattern_urls = get_existing_pattern_urls(parsed_url, unique_urls)
            if not existing_pattern_urls:
                unique_urls.add(parsed_url)
            elif parsed_url.query:
                # A URL without a query whose pattern is already known adds
                # nothing, so only URLs carrying a query reach this point.
                for u in existing_pattern_urls:
                    if not u.query:
                        # Favor URL patterns with params over those without params.
                        unique_urls.remove(u)
                        unique_urls.add(parsed_url)
                    elif is_all_params_exists(u, parsed_url):
                        # The new URL covers every param of the known one;
                        # it replaces it only when it brings extra params.
                        if has_more_params(u, parsed_url):
                            unique_urls.remove(u)
                            unique_urls.add(parsed_url)
                    else:
                        # The known URL has a param the new one lacks: keep both.
                        unique_urls.add(parsed_url)

    print_results(unique_urls, output)
    return unique_urls
238 | 
239 | 
def print_results(uurls, output):
    """Print the resulting URLs in sorted order and optionally save them to a file.

    :param uurls: iterable of parsed URLs (``urlparse`` results).
    :param output: path of the file to write, one URL per line; falsy to
        only print.

    The URLs are always printed, even when the file cannot be written; in
    that case an error line is printed as well and no exception is raised.
    """
    urls = [url.geturl() for url in sorted(uurls)]

    for u in urls:
        print(u)

    if not output:
        return

    try:
        # UTF-8 matches the encoding the URLs were read with, so non-ASCII
        # URLs can be saved whatever the platform's default encoding is. The
        # `with` block closes the file even when a write fails.
        with open(output, "w", encoding="utf-8") as f:
            for u in urls:
                f.write(u + "\n")
    except (OSError, UnicodeError):
        # Saving is best effort: report the failure instead of crashing.
        print('[X] Failed to save the output to a file.')
257 | 
258 | 
def interactive():
    """Command-line entry point: parse the arguments and hand them to main()."""
    parser = argparse.ArgumentParser(description='Remove URL pattern duplications..')

    # Input file; file_arg rejects paths that are not existing files.
    parser.add_argument(
        '-u', '--urls',
        dest='urls_file',
        type=file_arg,
        required=True,
        help='File with a list of urls.',
    )
    # Optional file to save the result to.
    parser.add_argument(
        '-o', '--output',
        dest='output',
        help='Save results to a file.',
    )
    # Flag that suppresses the banner.
    parser.add_argument(
        '-s', '--silent',
        dest='silent',
        action='store_true',
        help='Print only the result URLs.',
    )
    # Optional regular expression for dropping URLs by path.
    parser.add_argument(
        '-fp', '--filter-path',
        dest='filter_path',
        help='Filter paths by a given Regex.',
    )

    options = parser.parse_args()
    main(
        options.urls_file,
        options.output,
        options.silent,
        options.filter_path,
    )


# Allow running the module directly as a script.
if __name__ == "__main__":
    interactive()
274 | 


--------------------------------------------------------------------------------