├── .gitattributes ├── .github └── workflows │ ├── codeql.yaml │ └── python-app.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── img └── scraping.png ├── pyproject.toml └── src └── scrapy_scraper ├── main.py └── utils ├── array.py ├── config.py ├── cookie.py ├── directory.py ├── file.py ├── general.py ├── header.py ├── scrape.py ├── stopwatch.py ├── url.py └── validate.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yaml: -------------------------------------------------------------------------------- 1 | name: CodeQL 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | paths: 9 | - 'src/**' 10 | 11 | jobs: 12 | analyze: 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | include: 17 | - language: python 18 | build-mode: none 19 | 20 | runs-on: ubuntu-latest 21 | 22 | permissions: 23 | security-events: write 24 | packages: read 25 | actions: read 26 | contents: read 27 | 28 | steps: 29 | - name: Checkout Repository 30 | uses: actions/checkout@v4 31 | 32 | - name: Initialize CodeQL 33 | uses: github/codeql-action/init@v3 34 | with: 35 | languages: ${{ matrix.language }} 36 | build-mode: ${{ matrix.build-mode }} 37 | 38 | - name: Perform CodeQL Analysis 39 | uses: github/codeql-action/analyze@v3 40 | with: 41 | category: "/language:${{matrix.language}}" 42 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | name: Test Python Application 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | permissions: 10 | contents: read 11 | 12 | jobs: 13 | build: 14 | strategy: 15 | matrix: 16 | os: [ ubuntu-latest, macos-latest, windows-latest ] 17 | python-version: [ "3.10", "3.x" ] 18 | 19 | runs-on: ${{ matrix.os }} 20 | 21 | steps: 22 | - name: Checkout Repository 23 | uses: actions/checkout@v4 24 | 25 | - name: Set Up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | 30 | - name: Install Dependencies and Build 31 | run: | 32 | python -m pip install --upgrade pip setuptools build wheel 33 | python -m build 34 | pip install . 35 | 36 | - name: Run Python Application 37 | run: | 38 | scrapy-scraper --help 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 105 | __pypackages__/ 106 | 107 | # Celery stuff 108 | celerybeat-schedule 109 | celerybeat.pid 110 | 111 | # SageMath parsed files 112 | *.sage.py 113 | 114 | # Environments 115 | .env 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | 133 | # mypy 134 | .mypy_cache/ 135 | .dmypy.json 136 | dmypy.json 137 | 138 | # Pyre type checker 139 | .pyre/ 140 | 141 | # pytype static type analyzer 142 | .pytype/ 143 | 144 | # Cython debug symbols 145 | cython_debug/ 146 | 147 | # PyCharm 148 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 149 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 150 | # and can be added to the global gitignore or merged into this file. For a more nuclear 151 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
152 | #.idea/
153 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 Ivan Šincek
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include src/scrapy_scraper/*.py
2 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Scrapy Scraper
2 | 
3 | Web crawler and scraper based on Scrapy and Playwright's headless browser.
4 | 
5 | To use the headless browser, specify the `-p` option. Unlike standard web request libraries, a headless browser can render JavaScript-generated HTML content.
6 | 
7 | To automatically download and beautify all JavaScript files, including minified ones, specify the `-dir downloads` option, where `downloads` is your desired output directory.
8 | 
9 | Future plans:
10 | 
11 | * check if Playwright's Chromium headless browser is installed or not,
12 | * add an option to stop on rate limiting.
13 | 
14 | Resources:
15 | 
16 | * [scrapy.org](https://scrapy.org) - official
17 | * [playwright.dev](https://playwright.dev/python/docs/intro) - official
18 | * [scrapy/scrapy](https://github.com/scrapy/scrapy) - GitHub
19 | * [scrapy-plugins/scrapy-playwright](https://github.com/scrapy-plugins/scrapy-playwright) - GitHub
20 | 
21 | Tested on Kali Linux v2024.2 (64-bit).
22 | 
23 | Made for educational purposes. I hope it will help!
24 | 
25 | ## Table of Contents
26 | 
27 | * [How to Install](#how-to-install)
28 | 	* [Install Playwright and Chromium](#install-playwright-and-chromium)
29 | 	* [Standard Install](#standard-install)
30 | 	* [Build and Install From the Source](#build-and-install-from-the-source)
31 | * [How to Run](#how-to-run)
32 | * [Usage](#usage)
33 | * [Images](#images)
34 | 
35 | ## How to Install
36 | 
37 | ### Install Playwright and Chromium
38 | 
39 | ```bash
40 | pip3 install --upgrade playwright
41 | 
42 | playwright install chromium
43 | ```
44 | 
45 | Each time you upgrade the Playwright dependency, make sure to re-install Chromium; otherwise, you might get an error when using the headless browser.
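If you are not sure whether Playwright's Chromium is actually installed, a quick way to check is to try launching it. The snippet below is only a minimal sketch (not part of this project) and assumes the `playwright` Python package installed in the step above:

```python
from playwright.sync_api import sync_playwright

# Minimal check: try to launch Playwright's Chromium headless browser.
# If the browser binary is missing, Playwright raises an error telling you
# to run "playwright install chromium".
try:
	with sync_playwright() as p:
		browser = p.chromium.launch(headless = True)
		browser.close()
	print("Chromium is installed and working.")
except Exception as ex:
	print(f"Chromium is not ready: {ex}")
```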
46 | 
47 | ### Standard Install
48 | 
49 | ```bash
50 | pip3 install --upgrade scrapy-scraper
51 | ```
52 | 
53 | ### Build and Install From the Source
54 | 
55 | ```bash
56 | git clone https://github.com/ivan-sincek/scrapy-scraper && cd scrapy-scraper
57 | 
58 | python3 -m pip install --upgrade build
59 | 
60 | python3 -m build
61 | 
62 | python3 -m pip install dist/scrapy-scraper-3.6-py3-none-any.whl
63 | ```
64 | 
65 | ## How to Run
66 | 
67 | Restricted, crawl only `example.com`, and include only links to `example.com`:
68 | 
69 | ```fundamental
70 | scrapy-scraper -u https://example.com/home -o results.txt -a random -s 2 -rs -dir js
71 | ```
72 | 
73 | Restricted, crawl only `example.com`, and include both links to `example.com` and 3rd party links:
74 | 
75 | ```fundamental
76 | scrapy-scraper -u https://example.com/home -o results.txt -a random -s 2 -rs -dir js -l
77 | ```
78 | 
79 | Unrestricted, crawl everywhere, and include all the links:
80 | 
81 | ```fundamental
82 | scrapy-scraper -u https://example.com/home -o results.txt -a random -s 2 -rs -dir js -w off
83 | ```
84 | 
85 | ## Usage
86 | 
87 | ```fundamental
88 | Scrapy Scraper v3.6 ( github.com/ivan-sincek/scrapy-scraper )
89 | 
90 | Usage: scrapy-scraper -u urls -o out [-dir directory]
91 | Example: scrapy-scraper -u https://example.com/home -o results.txt [-dir downloads]
92 | 
93 | DESCRIPTION
94 |     Crawl and scrape websites
95 | URLS
96 |     File containing URLs or a single URL to start crawling and scraping from
97 |     -u, --urls = urls.txt | https://example.com/home | etc.
98 | WHITELIST
99 |     File containing whitelisted domain names to limit the scope
100 |     Specify 'off' to disable domain whitelisting
101 |     Default: limit the scope to domain names extracted from the URLs
102 |     -w, --whitelist = whitelist.txt | off | etc.
103 | LINKS
104 |     Include all 3rd party links and sources in the output file
105 |     -l, --links
106 | PLAYWRIGHT
107 |     Use Playwright's headless browser
108 |     -p, --playwright
109 | PLAYWRIGHT WAIT
110 |     Wait time in seconds before fetching the page content
111 |     -pw, --playwright-wait = 0.5 | 2 | 4 | etc.
112 | CONCURRENT REQUESTS
113 |     Number of concurrent requests
114 |     Default: 30
115 |     -cr, --concurrent-requests = 30 | 45 | etc.
116 | CONCURRENT REQUESTS PER DOMAIN
117 |     Number of concurrent requests per domain
118 |     Default: 10
119 |     -crd, --concurrent-requests-domain = 10 | 15 | etc.
120 | SLEEP
121 |     Sleep time in seconds between two consecutive requests to the same domain
122 |     -s, --sleep = 1.5 | 3 | etc.
123 | RANDOM SLEEP
124 |     Randomize the sleep time between requests to vary between '0.5 * sleep' and '1.5 * sleep'
125 |     -rs, --random-sleep
126 | AUTO THROTTLE
127 |     Auto throttle concurrent requests based on the load and latency
128 |     Sleep time is still respected
129 |     -at, --auto-throttle = 0.5 | 10 | 15 | 45 | etc.
130 | RETRIES
131 |     Number of retries per URL
132 |     Default: 2
133 |     -rt, --retries = 0 | 4 | etc.
134 | RECURSION
135 |     Recursion depth limit
136 |     Specify '0' for no limit
137 |     Default: 1
138 |     -r, --recursion = 0 | 2 | etc.
139 | REQUEST TIMEOUT
140 |     Request timeout in seconds
141 |     Default: 60
142 |     -t, --request-timeout = 30 | 90 | etc.
143 | HEADER
144 |     Specify any number of extra HTTP request headers
145 |     -H, --header = "Authorization: Bearer ey..." | etc.
146 | COOKIE
147 |     Specify any number of extra HTTP cookies
148 |     -b, --cookie = PHPSESSIONID=3301 | etc.
149 | USER AGENT 150 | User agent to use 151 | Default: Scrapy Scraper/3.6 152 | -a, --user-agent = random[-all] | curl/3.30.1 | etc. 153 | PROXY 154 | Web proxy to use 155 | -x, --proxy = http://127.0.0.1:8080 | etc. 156 | DIRECTORY 157 | Output directory 158 | All extracted JavaScript files will be saved in this directory 159 | -dir, --directory = downloads | etc. 160 | OUT 161 | Output file 162 | -o, --out = results.txt | etc. 163 | DEBUG 164 | Enable debug output 165 | -dbg, --debug 166 | ``` 167 | 168 | ## Images 169 | 170 |
Figure 1 - Scraping
173 | -------------------------------------------------------------------------------- /img/scraping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivan-sincek/scrapy-scraper/8d009f7fe4ff21442da832e4237274a5ed10ab94/img/scraping.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=75.3.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "scrapy-scraper" 7 | version = "3.6" 8 | authors = [{ name = "Ivan Sincek" }] 9 | description = "Web crawler and scraper based on Scrapy and Playwright's headless browser." 10 | readme = "README.md" 11 | requires-python = ">=3.10" 12 | classifiers = [ 13 | "Programming Language :: Python :: 3", 14 | "License :: OSI Approved :: MIT License", 15 | "Operating System :: OS Independent" 16 | ] 17 | dependencies = [ 18 | "beautifulsoup4>=4.12.3", 19 | "bot-safe-agents>=1.0", 20 | "colorama>=0.4.6", 21 | "jsbeautifier>=1.14.11", 22 | "playwright>=1.49.0", 23 | "scrapy>=2.12.0", 24 | "scrapy-playwright>=0.0.42", 25 | "termcolor>=2.4.0", 26 | "tldextract>=3.6.0" 27 | ] 28 | 29 | [project.urls] 30 | "Homepage" = "https://github.com/ivan-sincek/scrapy-scraper" 31 | 32 | [project.scripts] 33 | scrapy-scraper = "scrapy_scraper.main:main" 34 | 35 | [tool.setuptools.packages.find] 36 | where = ["src"] 37 | -------------------------------------------------------------------------------- /src/scrapy_scraper/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from .utils import config, scrape, validate 4 | 5 | def main(): 6 | success, args = validate.Validate().validate_args() 7 | if success: 8 | config.banner() 9 | scrapy_scraper = scrape.ScrapyScraper( 10 | args.urls, 11 | args.whitelist, 12 | args.links, 13 | args.playwright, 14 | args.playwright_wait, 15 | args.concurrent_requests, 16 | args.concurrent_requests_domain, 17 | args.sleep, 18 | args.random_sleep, 19 | args.auto_throttle, 20 | args.retries, 21 | args.recursion, 22 | args.request_timeout, 23 | args.header, 24 | args.cookie, 25 | args.user_agent, 26 | args.proxy, 27 | args.directory, 28 | args.out, 29 | args.debug 30 | ) 31 | scrapy_scraper.run() 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/array.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | def unique(array: list): 4 | """ 5 | Remove duplicates from a list. 6 | """ 7 | seen = set() 8 | return [x for x in array if not (x in seen or seen.add(x))] 9 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | APP_VERSION = "v3.6" 4 | 5 | USER_AGENT = "Scrapy Scraper/3.6" 6 | 7 | def banner(): 8 | """ 9 | Display the banner. 10 | """ 11 | print("#########################################################################") 12 | print("# #") 13 | print("# Scrapy Scraper v3.6 #") 14 | print("# by Ivan Sincek #") 15 | print("# #") 16 | print("# Crawl and scrape websites. #") 17 | print("# GitHub repository at github.com/ivan-sincek/scrapy-scraper. 
#") 18 | print("# #") 19 | print("#########################################################################") 20 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/cookie.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import re 4 | 5 | def get_key_value(cookie: str): 6 | """ 7 | Get a key-value pair from an HTTP cookie.\n 8 | Returns an empty key-value pair on failure. 9 | """ 10 | key = ""; value = "" 11 | if re.search(r"^[^\=\;]+\=[^\=\;]+$", cookie): 12 | key, value = cookie.split("=", 1) 13 | return key.strip(), value.strip() 14 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/directory.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | 5 | def is_directory(directory: str): 6 | """ 7 | Returns 'True' if the 'directory' exists and is a regular directory. 8 | """ 9 | return os.path.isdir(directory) 10 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from . import array 4 | 5 | import os 6 | 7 | __ENCODING = "ISO-8859-1" 8 | 9 | def is_file(file: str): 10 | """ 11 | Returns 'True' if the 'file' exists and is a regular file. 12 | """ 13 | return os.path.isfile(file) 14 | 15 | def validate(file: str): 16 | """ 17 | Validate a file.\n 18 | Success flag is 'True' if the file has a read permission and is not empty. 19 | """ 20 | success = False 21 | message = "" 22 | if not os.access(file, os.R_OK): 23 | message = f"\"{file}\" does not have a read permission" 24 | elif not os.stat(file).st_size > 0: 25 | message = f"\"{file}\" is empty" 26 | else: 27 | success = True 28 | return success, message 29 | 30 | def read_array(file: str) -> list[str]: 31 | """ 32 | Read a file line by line, and append the lines to a list.\n 33 | Whitespace will be stripped from each line, and empty lines will be removed.\n 34 | Returns a unique list. 35 | """ 36 | tmp = [] 37 | with open(file, "r", encoding = __ENCODING) as stream: 38 | for line in stream: 39 | line = line.strip() 40 | if line: 41 | tmp.append(line) 42 | return array.unique(tmp) 43 | 44 | def write_array(array: list[str], out: str): 45 | """ 46 | Write a list to an output file.\n 47 | Whitespace will be stripped from each string in the list, and empty strings will be removed. 48 | """ 49 | with open(out, "w", encoding = __ENCODING) as stream: 50 | for entry in array: 51 | entry = entry.strip() 52 | if entry: 53 | stream.write(f"{entry}\n") 54 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/general.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import colorama, datetime, termcolor 4 | 5 | colorama.init(autoreset = True) 6 | 7 | def to_float(value: str): 8 | """ 9 | Returns 'None' on failure. 10 | """ 11 | tmp = None 12 | try: 13 | tmp = float(value) 14 | except ValueError: 15 | pass 16 | return tmp 17 | 18 | # ---------------------------------------- 19 | 20 | def get_timestamp(message: str): 21 | """ 22 | Get the current timestamp. 
23 | """ 24 | return f"{datetime.datetime.now().strftime('%H:%M:%S')} - {message}" 25 | 26 | def print_error(message: str): 27 | """ 28 | Print an error message. 29 | """ 30 | print(f"ERROR: {message}") 31 | 32 | def print_cyan(message: str): 33 | """ 34 | Print a message in cyan color. 35 | """ 36 | termcolor.cprint(message, "cyan") 37 | 38 | def print_green(message: str): 39 | """ 40 | Print a message in green color. 41 | """ 42 | termcolor.cprint(message, "green") 43 | 44 | def print_yellow(message: str): 45 | """ 46 | Print a message in yellow color. 47 | """ 48 | termcolor.cprint(message, "yellow") 49 | 50 | def print_red(message: str): 51 | """ 52 | Print a message in red color. 53 | """ 54 | termcolor.cprint(message, "red") 55 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/header.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import re 4 | 5 | def get_key_value(header: str): 6 | """ 7 | Get a key-value pair from an HTTP request header.\n 8 | Returns an empty key-value pair on failure. 9 | """ 10 | key = ""; value = "" 11 | if re.search(r"^[^\:]+\:.+$", header): 12 | key, value = header.split(":", 1) 13 | return key.strip(), value.strip() 14 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/scrape.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from . import array, file, general, stopwatch 4 | 5 | from bs4 import BeautifulSoup 6 | 7 | import asyncio, jsbeautifier, os, random, scrapy, scrapy.crawler, scrapy.utils.project, typing, urllib.parse 8 | 9 | class ScrapyScraperSpider(scrapy.Spider): 10 | 11 | def __init__( 12 | self, 13 | urls : list[str], 14 | whitelist : list[str], 15 | links : bool, 16 | playwright : bool, 17 | playwright_wait: float, 18 | headers : dict[str, str], 19 | cookies : dict[str, str], 20 | user_agents : list[str], 21 | proxy : str, 22 | directory : str, 23 | out : str, 24 | debug : bool 25 | ): 26 | """ 27 | Class for managing Scrapy's spider. 28 | """ 29 | self.name = "ScrapyScraperSpider" 30 | self.start_urls = urls 31 | self.allowed_domains = whitelist 32 | self.__links = links 33 | self.__playwright = playwright 34 | self.__playwright_wait = playwright_wait 35 | self.__headers = headers 36 | self.__cookies = cookies 37 | self.__user_agents = user_agents 38 | self.__user_agents_len = len(self.__user_agents) 39 | self.__proxy = proxy 40 | self.__directory = directory 41 | self.__out = out 42 | self.__debug = debug 43 | self.__context = 0 44 | self.__crawled = [] 45 | self.__collected = [] 46 | 47 | def __print_start_urls(self): 48 | """ 49 | Print start URLS. 50 | """ 51 | general.print_green("Start URLs:") 52 | for url in self.start_urls: 53 | print(url) 54 | 55 | def __print_allowed_domains(self): 56 | """ 57 | Print allowed domains/subdomains. 58 | """ 59 | if self.allowed_domains: 60 | general.print_cyan("Allowed domains/subdomains:") 61 | for domain in self.allowed_domains: 62 | print(f"*.{domain}") 63 | else: 64 | general.print_red("Domain whitelisting is off!") 65 | 66 | def start_requests(self): 67 | """ 68 | Main method. 
69 | """ 70 | self.__print_start_urls() 71 | self.__print_allowed_domains() 72 | print(general.get_timestamp("Crawling and scraping...")) 73 | print("Press CTRL + C to exit early - results will be saved, be patient") 74 | for url in self.start_urls: 75 | yield scrapy.Request( 76 | url = url, 77 | headers = self.__get_headers(), 78 | cookies = self.__cookies, 79 | meta = self.__get_metadata(), 80 | errback = self.__error, 81 | callback = self.__success, 82 | dont_filter = False 83 | ) 84 | 85 | def __get_headers(self) -> dict[str, str]: 86 | """ 87 | Get default HTTP request headers. 88 | """ 89 | default_headers = { 90 | "User-Agent" : self.__get_user_agent(), 91 | "Accept-Language" : "en-US, *", 92 | "Accept" : "*/*", 93 | "Referer" : "https://www.google.com/", 94 | "Upgrade-Insecure-Requests": "1" 95 | } 96 | headers = {} 97 | for name, value in default_headers.items(): 98 | if value: 99 | headers[name.lower()] = value 100 | for name, value in self.__headers.items(): # override 101 | headers[name.lower()] = value 102 | return headers 103 | 104 | def __get_user_agent(self): 105 | """ 106 | Get a [random] user agent.\n 107 | Returns an empty string if there are no user agents. 108 | """ 109 | user_agent = "" 110 | if self.__user_agents_len > 0: 111 | user_agent = self.__user_agents[random.randint(0, self.__user_agents_len - 1)] 112 | return user_agent 113 | 114 | def __get_metadata(self) -> dict[str, typing.Any]: 115 | """ 116 | Get Scrapy's request metadata. 117 | """ 118 | self.__context += 1 119 | tmp = {} 120 | tmp["playwright" ] = self.__playwright 121 | tmp["playwright_context" ] = str(self.__context) 122 | tmp["playwright_include_page" ] = self.__playwright 123 | tmp["playwright_context_kwargs" ] = {} 124 | tmp["playwright_context_kwargs" ]["ignore_https_errors"] = True 125 | tmp["playwright_context_kwargs" ]["java_script_enabled"] = True 126 | tmp["playwright_context_kwargs" ]["accept_downloads" ] = False 127 | tmp["playwright_context_kwargs" ]["bypass_csp" ] = False 128 | tmp["playwright_page_goto_kwargs"] = {"wait_until": "load"} 129 | tmp["proxy" ] = self.__proxy 130 | tmp["cookiejar" ] = 1 131 | tmp["dont_merge_cookies" ] = False 132 | return tmp 133 | 134 | # ------------------------------------ 135 | 136 | def closed(self, reason: typing.Any): 137 | """ 138 | On close callback. 139 | """ 140 | self.__crawled = array.unique(self.__crawled) 141 | print(f"Total unique URLs crawled: {len(self.__crawled)}") 142 | self.__collected = array.unique(self.__collected + self.__crawled) 143 | print(f"Total unique URLs collected: {len(self.__collected)}") 144 | stopwatch.stopwatch.stop() 145 | if self.__collected: 146 | file.write_array(sorted(self.__collected, key = str.casefold), self.__out) 147 | 148 | # ------------------------------------ 149 | 150 | async def __error(self, failure: typing.Any): 151 | """ 152 | Error callback. 153 | """ 154 | status = failure.value.response.status if failure.check(scrapy.spidermiddlewares.httperror.HttpError) else 0 155 | url = failure.request.url 156 | error = str(failure.value).splitlines()[0] 157 | if self.__playwright: 158 | page = failure.request.meta["playwright_page"] 159 | await page.close() 160 | await page.context.close() 161 | self.__print_error(status, url, error) 162 | 163 | def __print_error(self, status: int, url: str, message: str): 164 | """ 165 | Print error. 
166 | """ 167 | if self.__debug: 168 | if status: 169 | url = f"{status} {url}" 170 | general.print_red(f"[ ERROR ] {url} -> {message}") 171 | 172 | # ------------------------------------ 173 | 174 | async def __success(self, response: typing.Any): 175 | """ 176 | Success callback. 177 | """ 178 | status = response.status 179 | url = response.url 180 | content = "" 181 | if self.__playwright: 182 | page = response.request.meta["playwright_page"] 183 | if self.__playwright_wait > 0: 184 | await asyncio.sleep(self.__playwright_wait) 185 | content = await page.content() 186 | await page.close() 187 | await page.context.close() 188 | else: 189 | content = response.body 190 | # -------------------------------- 191 | self.__crawled.append(url) 192 | self.__print_success(status, url) 193 | # -------------------------------- 194 | if self.__directory: 195 | self.__download_js(url, content) 196 | # -------------------------------- 197 | links = self.__extract_links(url, scrapy.http.HtmlResponse(url = url, body = content, encoding = "UTF-8") if self.__playwright else response) 198 | self.__collected.extend(links) 199 | for link in links: 200 | yield response.follow( 201 | url = link, 202 | headers = self.__get_headers(), 203 | cookies = self.__cookies, 204 | meta = self.__get_metadata(), 205 | errback = self.__error, 206 | callback = self.__success, 207 | dont_filter = False 208 | ) 209 | 210 | def __print_success(self, status: int, url: str): 211 | """ 212 | Print success. 213 | """ 214 | if self.__debug: 215 | general.print_green(f"[ OK ] {status} {url}") 216 | 217 | def __download_js(self, url: str, content: str | bytes): 218 | """ 219 | Download JavaScript files. 220 | """ 221 | filename = urllib.parse.urlsplit(url).path.rsplit("/", 1)[-1] 222 | if filename.lower().endswith(".js"): 223 | filename = os.path.join(self.__directory, filename) 224 | if not os.path.exists(filename): 225 | try: 226 | soup = BeautifulSoup(content, "html.parser") 227 | open(filename, "w").write(jsbeautifier.beautify(soup.get_text())) 228 | except Exception as ex: 229 | self.__print_exception(url, str(ex)) 230 | 231 | def __extract_links(self, url: str, response: typing.Any): 232 | """ 233 | Extract links.\n 234 | Returns a unique list. 235 | """ 236 | tmp = [] 237 | try: 238 | tmp.extend(self.__extract_links_xpath(url, response, "script", "src" )) 239 | tmp.extend(self.__extract_links_xpath(url, response, "a" , "href")) 240 | tmp.extend(self.__extract_links_xpath(url, response, "link" , "href")) 241 | except (UnicodeEncodeError, AttributeError) as ex: 242 | self.__print_exception(url, str(ex)) 243 | return array.unique(tmp) 244 | 245 | def __extract_links_xpath(self, url: str, response: typing.Any, tag: str, attr: str): 246 | """ 247 | Extract links based on the specified XPath. 248 | """ 249 | tmp = [] 250 | for link in response.xpath(f"//{tag}[@{attr}]"): 251 | link = link.xpath(f"@{attr}").get() 252 | obj = urllib.parse.urlsplit(link) 253 | scheme = obj.scheme 254 | domain = obj.netloc.lower() 255 | if scheme and scheme not in ["http", "https"]: 256 | continue 257 | elif not self.__links and domain and not self.__is_allowed(domain): 258 | continue 259 | tmp.append(urllib.parse.urljoin(url, link)) 260 | return tmp 261 | 262 | def __is_allowed(self, domain: str): 263 | """ 264 | Check if a domain name is in the scope. 
265 | """ 266 | return not self.allowed_domains or any(domain == allowed or domain.endswith(f".{allowed}") for allowed in self.allowed_domains) 267 | 268 | def __print_exception(self, url: str, message: str): 269 | """ 270 | Print exception. 271 | """ 272 | if self.__debug: 273 | general.print_red(f"[ EXCEPTION ] {url} -> {message}") 274 | 275 | # ---------------------------------------- 276 | 277 | class ScrapyScraper: 278 | 279 | def __init__( 280 | self, 281 | urls : list[str], 282 | whitelist : list[str], 283 | links : bool, 284 | playwright : bool, 285 | playwright_wait : float, 286 | concurrent_requests : int, 287 | concurrent_requests_domain: int, 288 | sleep : float, 289 | random_sleep : bool, 290 | auto_throttle : float, 291 | retries : int, 292 | recursion : int, 293 | request_timeout : float, 294 | headers : dict[str, str], 295 | cookies : dict[str, str], 296 | user_agents : list[str], 297 | proxy : str, 298 | directory : str, 299 | out : str, 300 | debug : bool 301 | ): 302 | """ 303 | Class for managing Scrapy's runner. 304 | """ 305 | self.__urls = urls 306 | self.__whitelist = whitelist 307 | self.__links = links 308 | self.__playwright = playwright 309 | self.__playwright_wait = playwright_wait 310 | self.__concurrent_requests = concurrent_requests 311 | self.__concurrent_requests_domain = concurrent_requests_domain 312 | self.__sleep = sleep 313 | self.__random_sleep = random_sleep 314 | self.__auto_throttle = auto_throttle 315 | self.__retries = retries 316 | self.__recursion = recursion 317 | self.__request_timeout = request_timeout # all timeouts 318 | self.__headers = headers 319 | self.__cookies = cookies 320 | self.__user_agents = user_agents 321 | self.__proxy = proxy 322 | self.__directory = directory 323 | self.__out = out 324 | self.__debug = debug 325 | self.__headless_browser = True 326 | self.__browser_type = "chromium" # Playwright's headless browser 327 | self.__handle_sigint = False 328 | self.__max_redirects = 10 329 | 330 | def __page_block(self, request: typing.Any): 331 | """ 332 | Types of content to block while using Playwright's headless browser. 333 | """ 334 | return request.resource_type in ["fetch", "stylesheet", "image", "ping", "font", "media", "imageset", "beacon", "csp_report", "object", "texttrack", "manifest"] 335 | 336 | def run(self): 337 | """ 338 | Configure the settings and run the Chad Extractor spider. 
339 | """ 340 | settings = scrapy.utils.project.get_project_settings() 341 | # -------------------------------- 342 | settings["COOKIES_ENABLED" ] = True 343 | settings["DOWNLOAD_TIMEOUT" ] = self.__request_timeout # connect / read timeout 344 | settings["DOWNLOAD_DELAY" ] = self.__sleep 345 | settings["RANDOMIZE_DOWNLOAD_DELAY"] = self.__random_sleep 346 | settings["HTTPPROXY_ENABLED" ] = bool(self.__proxy) 347 | # -------------------------------- 348 | settings["EXTENSIONS"]["scrapy.extensions.throttle.AutoThrottle"] = 100 349 | # -------------------------------- 350 | settings["AUTOTHROTTLE_ENABLED" ] = self.__auto_throttle > 0 351 | settings["AUTOTHROTTLE_DEBUG" ] = False 352 | settings["AUTOTHROTTLE_START_DELAY" ] = self.__sleep 353 | settings["AUTOTHROTTLE_MAX_DELAY" ] = settings["AUTOTHROTTLE_START_DELAY"] + 30 354 | settings["AUTOTHROTTLE_TARGET_CONCURRENCY"] = self.__auto_throttle 355 | # -------------------------------- 356 | settings["CONCURRENT_REQUESTS" ] = self.__concurrent_requests 357 | settings["CONCURRENT_REQUESTS_PER_DOMAIN"] = self.__concurrent_requests_domain 358 | settings["RETRY_ENABLED" ] = self.__retries > 0 359 | settings["RETRY_TIMES" ] = self.__retries 360 | settings["REDIRECT_ENABLED" ] = self.__max_redirects > 0 361 | settings["REDIRECT_MAX_TIMES" ] = self.__max_redirects 362 | settings["DEPTH_LIMIT" ] = self.__recursion 363 | # -------------------------------- 364 | settings["ROBOTSTXT_OBEY" ] = False 365 | settings["TELNETCONSOLE_ENABLED" ] = False 366 | settings["LOG_ENABLED" ] = False 367 | settings["REQUEST_FINGERPRINTER_IMPLEMENTATION"] = "2.7" 368 | # -------------------------------- 369 | if self.__playwright: 370 | settings["DOWNLOAD_HANDLERS"]["https"] = "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler" 371 | settings["DOWNLOAD_HANDLERS"]["http" ] = "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler" 372 | settings["TWISTED_REACTOR" ] = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" 373 | settings["PLAYWRIGHT_LAUNCH_OPTIONS" ] = { 374 | "headless" : self.__headless_browser, 375 | "handle_sigint": self.__handle_sigint, 376 | "proxy" : {"server": self.__proxy} if self.__proxy else None 377 | } 378 | settings["PLAYWRIGHT_BROWSER_TYPE" ] = self.__browser_type 379 | settings["PLAYWRIGHT_ABORT_REQUEST" ] = self.__page_block 380 | settings["PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT"] = self.__request_timeout * 1000 381 | # -------------------------------- 382 | scrapy_scraper_spider = scrapy.crawler.CrawlerProcess(settings) 383 | scrapy_scraper_spider.crawl(ScrapyScraperSpider, self.__urls, self.__whitelist, self.__links, self.__playwright, self.__playwright_wait, self.__headers, self.__cookies, self.__user_agents, self.__proxy, self.__directory, self.__out, self.__debug) 384 | scrapy_scraper_spider.start() 385 | scrapy_scraper_spider.join() 386 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/stopwatch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import datetime 4 | 5 | class Stopwatch: 6 | 7 | def __init__(self): 8 | self.__start = datetime.datetime.now() 9 | 10 | def stop(self): 11 | self.__end = datetime.datetime.now() 12 | print(f"Script has finished in {self.__end - self.__start}") 13 | 14 | stopwatch = Stopwatch() 15 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/url.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import tldextract, urllib.parse 4 | 5 | __URL_SCHEME_WHITELIST = ["http", "https", "socks4", "socks4h", "socks5", "socks5h"] 6 | __MIN_PORT_NUM = 1 7 | __MAX_PORT_NUM = 65535 8 | 9 | def is_http(url: str): 10 | return urllib.parse.urlsplit(url).scheme.lower() in ["http", "https"] 11 | 12 | def validate(url: str): 13 | """ 14 | Validate a URL. 15 | """ 16 | success = False 17 | message = "" 18 | tmp = urllib.parse.urlsplit(url) 19 | if not tmp.scheme: 20 | message = f"URL scheme is required: {url}" 21 | elif tmp.scheme not in __URL_SCHEME_WHITELIST: 22 | message = f"Supported URL schemes are 'http[s]', 'socks4[h]', and 'socks5[h]': {url}" 23 | elif not tmp.netloc: 24 | message = f"Invalid domain name: {url}" 25 | elif tmp.port and (tmp.port < __MIN_PORT_NUM or tmp.port > __MAX_PORT_NUM): 26 | message = f"Port number is out of range: {url}" 27 | else: 28 | success = True 29 | return success, message 30 | 31 | def validate_multiple(urls: list[str]): 32 | """ 33 | Validate multiple URLs. 34 | """ 35 | success = True 36 | message = "" 37 | for url in urls: 38 | success, message = validate(url) 39 | if not success: 40 | break 41 | return success, message 42 | 43 | def extract_fqdn(url: str) -> str: 44 | """ 45 | Extract the fully qualified domain name from a URL.\n 46 | Returns an empty string on failure. 47 | """ 48 | tmp = "" 49 | obj = tldextract.extract(url) 50 | if obj.fqdn: 51 | tmp = obj.fqdn.lower() 52 | return tmp 53 | 54 | def extract_fqdn_multiple(urls: list[str]) -> list[str]: 55 | """ 56 | Extract the fully qualified domain names from a list of URLs.\n 57 | Returns an empty list on failure. 58 | """ 59 | tmp = [] 60 | for url in urls: 61 | url = extract_fqdn(url) 62 | if url: 63 | tmp.append(url) 64 | return tmp 65 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/validate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from . 
import config, cookie, directory, file, general, header, url 4 | 5 | import argparse, bot_safe_agents, sys 6 | 7 | class MyArgParser(argparse.ArgumentParser): 8 | 9 | def print_help(self): 10 | print(f"Scrapy Scraper {config.APP_VERSION} ( github.com/ivan-sincek/scrapy-scraper )") 11 | print("") 12 | print("Usage: scrapy-scraper -u urls -o out [-dir directory]") 13 | print("Example: scrapy-scraper -u https://example.com/home -o results.txt [-dir downloads]") 14 | print("") 15 | print("DESCRIPTION") 16 | print(" Crawl and scrape websites") 17 | print("URLS") 18 | print(" File containing URLs or a single URL to start crawling and scraping from") 19 | print(" -u, --urls = urls.txt | https://example.com/home | etc.") 20 | print("WHITELIST") 21 | print(" File containing whitelisted domain names to limit the scope") 22 | print(" Specify 'off' to disable domain whitelisting") 23 | print(" Default: limit the scope to domain names extracted from the URLs") 24 | print(" -w, --whitelist = whitelist.txt | off | etc.") 25 | print("LINKS") 26 | print(" Include all 3rd party links and sources in the output file") 27 | print(" -l, --links") 28 | print("PLAYWRIGHT") 29 | print(" Use Playwright's headless browser") 30 | print(" -p, --playwright") 31 | print("PLAYWRIGHT WAIT") 32 | print(" Wait time in seconds before fetching the page content") 33 | print(" -pw, --playwright-wait = 0.5 | 2 | 4 | etc.") 34 | print("CONCURRENT REQUESTS") 35 | print(" Number of concurrent requests") 36 | print(" Default: 30") 37 | print(" -cr, --concurrent-requests = 30 | 45 | etc.") 38 | print("CONCURRENT REQUESTS PER DOMAIN") 39 | print(" Number of concurrent requests per domain") 40 | print(" Default: 10") 41 | print(" -crd, --concurrent-requests-domain = 10 | 15 | etc.") 42 | print("SLEEP") 43 | print(" Sleep time in seconds between two consecutive requests to the same domain") 44 | print(" -s, --sleep = 1.5 | 3 | etc.") 45 | print("RANDOM SLEEP") 46 | print(" Randomize the sleep time between requests to vary between '0.5 * sleep' and '1.5 * sleep'") 47 | print(" -rs, --random-sleep") 48 | print("AUTO THROTTLE") 49 | print(" Auto throttle concurrent requests based on the load and latency") 50 | print(" Sleep time is still respected") 51 | print(" -at, --auto-throttle = 0.5 | 10 | 15 | 45 | etc.") 52 | print("RETRIES") 53 | print(" Number of retries per URL") 54 | print(" Default: 2") 55 | print(" -rt, --retries = 0 | 4 | etc.") 56 | print("RECURSION") 57 | print(" Recursion depth limit") 58 | print(" Specify '0' for no limit") 59 | print(" Default: 1") 60 | print(" -r, --recursion = 0 | 2 | etc.") 61 | print("REQUEST TIMEOUT") 62 | print(" Request timeout in seconds") 63 | print(" Default: 60") 64 | print(" -t, --request-timeout = 30 | 90 | etc.") 65 | print("HEADER") 66 | print(" Specify any number of extra HTTP request headers") 67 | print(" -H, --header = \"Authorization: Bearer ey...\" | etc.") 68 | print("COOKIE") 69 | print(" Specify any number of extra HTTP cookies") 70 | print(" -b, --cookie = PHPSESSIONID=3301 | etc.") 71 | print("USER AGENT") 72 | print(" User agent to use") 73 | print(f" Default: {config.USER_AGENT}") 74 | print(" -a, --user-agent = random[-all] | curl/3.30.1 | etc.") 75 | print("PROXY") 76 | print(" Web proxy to use") 77 | print(" -x, --proxy = http://127.0.0.1:8080 | etc.") 78 | print("DIRECTORY") 79 | print(" Output directory") 80 | print(" All extracted JavaScript files will be saved in this directory") 81 | print(" -dir, --directory = downloads | etc.") 82 | print("OUT") 83 | print(" Output 
file") 84 | print(" -o, --out = results.txt | etc.") 85 | print("DEBUG") 86 | print(" Enable debug output") 87 | print(" -dbg, --debug") 88 | 89 | def error(self, message): 90 | if len(sys.argv) > 1: 91 | print("Missing a mandatory option (-u, -o) and/or optional (-w, -l, -p, -pw, -cr, -crd, -s, -rs, -at, -rt, -r, -t, -H, -b, -a, -x, -dir, -dbg)") 92 | print("Use -h or --help for more info") 93 | else: 94 | self.print_help() 95 | exit() 96 | 97 | class Validate: 98 | 99 | def __init__(self): 100 | """ 101 | Class for validating and managing CLI arguments. 102 | """ 103 | self.__parser = MyArgParser() 104 | self.__parser.add_argument("-u" , "--urls" , required = True , type = str , default = "" ) 105 | self.__parser.add_argument("-w" , "--whitelist" , required = False, type = str , default = "" ) 106 | self.__parser.add_argument("-l" , "--links" , required = False, action = "store_true", default = False) 107 | self.__parser.add_argument("-p" , "--playwright" , required = False, action = "store_true", default = False) 108 | self.__parser.add_argument("-pw" , "--playwright-wait" , required = False, type = str , default = "" ) 109 | self.__parser.add_argument("-cr" , "--concurrent-requests" , required = False, type = str , default = "" ) 110 | self.__parser.add_argument("-crd", "--concurrent-requests-domain", required = False, type = str , default = "" ) 111 | self.__parser.add_argument("-s" , "--sleep" , required = False, type = str , default = "" ) 112 | self.__parser.add_argument("-rs" , "--random-sleep" , required = False, action = "store_true", default = False) 113 | self.__parser.add_argument("-at" , "--auto-throttle" , required = False, type = str , default = "" ) 114 | self.__parser.add_argument("-rt" , "--retries" , required = False, type = str , default = "" ) 115 | self.__parser.add_argument("-r" , "--recursion" , required = False, type = str , default = "" ) 116 | self.__parser.add_argument("-t" , "--request-timeout" , required = False, type = str , default = "" ) 117 | self.__parser.add_argument("-H" , "--header" , required = False, action = "append" , nargs = "+" ) 118 | self.__parser.add_argument("-b" , "--cookie" , required = False, action = "append" , nargs = "+" ) 119 | self.__parser.add_argument("-a" , "--user-agent" , required = False, type = str , default = "" ) 120 | self.__parser.add_argument("-x" , "--proxy" , required = False, type = str , default = "" ) 121 | self.__parser.add_argument("-dir", "--directory" , required = False, type = str , default = "" ) 122 | self.__parser.add_argument("-o" , "--out" , required = True , type = str , default = "" ) 123 | self.__parser.add_argument("-dbg", "--debug" , required = False, action = "store_true", default = False) 124 | 125 | def validate_args(self): 126 | """ 127 | Validate and return the CLI arguments. 
128 | """ 129 | self.__success = True 130 | self.__args = self.__parser.parse_args() 131 | self.__validate_urls() 132 | self.__validate_whitelist() 133 | self.__validate_playwright_wait() 134 | self.__validate_concurrent_requests() 135 | self.__validate_concurrent_requests_domain() 136 | self.__validate_sleep() 137 | self.__validate_auto_throttle() 138 | self.__validate_retries() 139 | self.__validate_recursion() 140 | self.__validate_request_timeout() 141 | self.__validate_header() 142 | self.__validate_cookie() 143 | self.__validate_user_agent() 144 | self.__validate_proxy() 145 | self.__validate_directory() 146 | return self.__success, self.__args 147 | 148 | def __error(self, message: str): 149 | """ 150 | Set the success flag to 'False' to prevent the main task from executing, and print an error message. 151 | """ 152 | self.__success = False 153 | general.print_error(message) 154 | 155 | # ------------------------------------ 156 | 157 | def __validate_urls(self): 158 | tmp = [] 159 | if file.is_file(self.__args.urls): 160 | success, message = file.validate(self.__args.urls) 161 | if not success: 162 | self.__error(message) 163 | else: 164 | tmp = file.read_array(self.__args.urls) 165 | if not tmp: 166 | self.__error(f"No URLs were found in \"{self.__args.urls}\"") 167 | else: 168 | success, message = url.validate_multiple(tmp) 169 | if not success: 170 | self.__error(message) 171 | else: 172 | success, message = url.validate(self.__args.urls) 173 | if not success: 174 | self.__error(message) 175 | else: 176 | tmp = [self.__args.urls] 177 | self.__args.urls = tmp 178 | 179 | def __validate_whitelist(self): 180 | tmp = [] 181 | if self.__args.whitelist.lower() == "off": 182 | pass 183 | elif self.__args.whitelist: 184 | if not file.is_file(self.__args.whitelist): 185 | self.__error(f"\"{self.__args.whitelist}\" does not exist") 186 | else: 187 | success, message = file.validate(self.__args.whitelist) 188 | if not success: 189 | self.__error(message) 190 | else: 191 | tmp = url.extract_fqdn_multiple(file.read_array(self.__args.whitelist)) 192 | if not tmp: 193 | self.__error(f"No valid whitelisted domain names were found in \"{self.__args.whitelist}\"") 194 | elif self.__success: 195 | tmp = url.extract_fqdn_multiple(self.__args.urls) 196 | if not tmp: 197 | self.__error("No domain names could be extracted from the provided URLs for domain whitelisting") 198 | self.__args.whitelist = tmp 199 | 200 | def __validate_playwright_wait(self): 201 | tmp = 0 202 | if self.__args.playwright_wait: 203 | tmp = general.to_float(self.__args.playwright_wait) 204 | if tmp is None: 205 | self.__error("Playwright's wait time must be numeric") 206 | elif tmp <= 0: 207 | self.__error("Playwright's wait time must be greater than zero") 208 | self.__args.playwright_wait = tmp 209 | 210 | def __validate_concurrent_requests(self): 211 | tmp = 30 212 | if self.__args.concurrent_requests: 213 | if not self.__args.concurrent_requests.isdigit(): 214 | self.__error("Number of concurrent requests must be numeric") 215 | else: 216 | tmp = int(self.__args.concurrent_requests) 217 | if tmp <= 0: 218 | self.__error("Number of concurrent requests must be greater than zero") 219 | self.__args.concurrent_requests = tmp 220 | 221 | def __validate_concurrent_requests_domain(self): 222 | tmp = 10 223 | if self.__args.concurrent_requests_domain: 224 | if not self.__args.concurrent_requests_domain.isdigit(): 225 | self.__error("Number of concurrent requests per domain must be numeric") 226 | else: 227 | tmp = 
int(self.__args.concurrent_requests_domain) 228 | if tmp <= 0: 229 | self.__error("Number of concurrent requests per domain must be greater than zero") 230 | self.__args.concurrent_requests_domain = tmp 231 | 232 | def __validate_sleep(self,): 233 | tmp = 0 234 | if self.__args.sleep: 235 | tmp = general.to_float(self.__args.sleep) 236 | if tmp is None: 237 | self.__error("Sleep time between two consecutive requests must be numeric") 238 | elif tmp <= 0: 239 | self.__error("Sleep time between two consecutive requests must be greater than zero") 240 | self.__args.sleep = tmp 241 | 242 | def __validate_auto_throttle(self): 243 | tmp = 0 244 | if self.__args.auto_throttle: 245 | tmp = general.to_float(self.__args.auto_throttle) 246 | if tmp is None: 247 | self.__error("Auto throttle must be numeric") 248 | elif tmp <= 0: 249 | self.__error("Auto throttle must be greater than zero") 250 | self.__args.auto_throttle = tmp 251 | 252 | def __validate_retries(self): 253 | tmp = 2 254 | if self.__args.retries: 255 | if not self.__args.retries.isdigit(): 256 | self.__error("Number of retries must be numeric") 257 | else: 258 | tmp = int(self.__args.retries) 259 | if tmp <= 0: 260 | self.__error("Number of retries must be greater than zero") 261 | self.__args.retries = tmp 262 | 263 | def __validate_recursion(self): 264 | tmp = 1 265 | if self.__args.recursion: 266 | if not self.__args.recursion.isdigit(): 267 | self.__error("Recursion depth must be numeric") 268 | else: 269 | tmp = int(self.__args.recursion) 270 | if tmp < 0: 271 | self.__error("Recursion depth must be greater than or equal to zero") 272 | self.__args.recursion = tmp 273 | 274 | def __validate_request_timeout(self): 275 | tmp = 60 276 | if self.__args.request_timeout: 277 | tmp = general.to_float(self.__args.request_timeout) 278 | if tmp is None: 279 | self.__error("Request timeout must be numeric") 280 | elif tmp <= 0: 281 | self.__error("Request timeout must be greater than zero") 282 | self.__args.request_timeout = tmp 283 | 284 | def __validate_header(self): 285 | tmp = {} 286 | if self.__args.header: 287 | for entry in self.__args.header: 288 | key, value = header.get_key_value(entry[0]) 289 | if not key: 290 | self.__error(f"Invalid HTTP request header: {entry[0]}") 291 | continue 292 | tmp[key] = value 293 | self.__args.header = tmp 294 | 295 | def __validate_cookie(self): 296 | tmp = {} 297 | if self.__args.cookie: 298 | for entry in self.__args.cookie: 299 | key, value = cookie.get_key_value(entry[0]) 300 | if not key: 301 | self.__error(f"Invalid HTTP cookie: {entry[0]}") 302 | continue 303 | tmp[key] = value 304 | self.__args.cookie = tmp 305 | 306 | def __validate_user_agent(self): 307 | tmp = [config.USER_AGENT] 308 | if self.__args.user_agent: 309 | lower = self.__args.user_agent.lower() 310 | if lower == "random-all": 311 | tmp = bot_safe_agents.get_all() 312 | elif lower == "random": 313 | tmp = [bot_safe_agents.get_random()] 314 | else: 315 | tmp = [self.__args.user_agent] 316 | self.__args.user_agent = tmp 317 | 318 | def __validate_proxy(self): 319 | if self.__args.proxy: 320 | success, message = url.validate(self.__args.proxy) 321 | if not success: 322 | self.__error(message) 323 | 324 | def __validate_directory(self): 325 | if self.__args.directory: 326 | if not directory.is_directory(self.__args.directory): 327 | self.__error(f"\"{self.__args.directory}\" does not exist or is not a directory") 328 | --------------------------------------------------------------------------------