├── .gitattributes ├── .github └── workflows │ ├── codeql.yaml │ └── python-app.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── img └── scraping.png ├── pyproject.toml └── src └── scrapy_scraper ├── main.py └── utils ├── array.py ├── config.py ├── cookie.py ├── directory.py ├── file.py ├── general.py ├── header.py ├── scrape.py ├── stopwatch.py ├── url.py └── validate.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yaml: -------------------------------------------------------------------------------- 1 | name: CodeQL 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | paths: 9 | - 'src/**' 10 | 11 | jobs: 12 | analyze: 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | include: 17 | - language: python 18 | build-mode: none 19 | 20 | runs-on: ubuntu-latest 21 | 22 | permissions: 23 | security-events: write 24 | packages: read 25 | actions: read 26 | contents: read 27 | 28 | steps: 29 | - name: Checkout Repository 30 | uses: actions/checkout@v4 31 | 32 | - name: Initialize CodeQL 33 | uses: github/codeql-action/init@v3 34 | with: 35 | languages: ${{ matrix.language }} 36 | build-mode: ${{ matrix.build-mode }} 37 | 38 | - name: Perform CodeQL Analysis 39 | uses: github/codeql-action/analyze@v3 40 | with: 41 | category: "/language:${{matrix.language}}" 42 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | name: Test Python Application 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | permissions: 10 | contents: read 11 | 12 | jobs: 13 | build: 14 | strategy: 15 | matrix: 16 | os: [ ubuntu-latest, macos-latest, windows-latest ] 17 | python-version: [ "3.10", "3.x" ] 18 | 19 | runs-on: ${{ matrix.os }} 20 | 21 | steps: 22 | - name: Checkout Repository 23 | uses: actions/checkout@v4 24 | 25 | - name: Set Up Python ${{ matrix.python-version }} 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | 30 | - name: Install Dependencies and Build 31 | run: | 32 | python -m pip install --upgrade pip setuptools build wheel 33 | python -m build 34 | pip install . 35 | 36 | - name: Run Python Application 37 | run: | 38 | scrapy-scraper --help 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 105 | __pypackages__/ 106 | 107 | # Celery stuff 108 | celerybeat-schedule 109 | celerybeat.pid 110 | 111 | # SageMath parsed files 112 | *.sage.py 113 | 114 | # Environments 115 | .env 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | 133 | # mypy 134 | .mypy_cache/ 135 | .dmypy.json 136 | dmypy.json 137 | 138 | # Pyre type checker 139 | .pyre/ 140 | 141 | # pytype static type analyzer 142 | .pytype/ 143 | 144 | # Cython debug symbols 145 | cython_debug/ 146 | 147 | # PyCharm 148 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 149 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 150 | # and can be added to the global gitignore or merged into this file. For a more nuclear 151 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
152 | #.idea/
153 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 Ivan Šincek
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include src/scrapy_scraper/*.py
2 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Scrapy Scraper
2 | 
3 | Web crawler and scraper based on Scrapy and Playwright's headless browser.
4 | 
5 | To use the headless browser, specify the `-p` option. Unlike standard web request libraries, a headless browser can render JavaScript-generated HTML content.
6 | 
7 | To automatically download and beautify all JavaScript files, including minified ones, specify the `-dir downloads` option, where `downloads` is your desired output directory.
8 | 
9 | Future plans:
10 | 
11 | * check if Playwright's Chromium headless browser is installed or not,
12 | * add an option to stop on rate limiting.
13 | 
14 | Resources:
15 | 
16 | * [scrapy.org](https://scrapy.org) - official
17 | * [playwright.dev](https://playwright.dev/python/docs/intro) - official
18 | * [scrapy/scrapy](https://github.com/scrapy/scrapy) - GitHub
19 | * [scrapy-plugins/scrapy-playwright](https://github.com/scrapy-plugins/scrapy-playwright) - GitHub
20 | 
21 | Tested on Kali Linux v2024.2 (64-bit).
22 | 
23 | Made for educational purposes. I hope it will help!
24 | 
25 | ## Table of Contents
26 | 
27 | * [How to Install](#how-to-install)
28 | 	* [Install Playwright and Chromium](#install-playwright-and-chromium)
29 | 	* [Standard Install](#standard-install)
30 | 	* [Build and Install From the Source](#build-and-install-from-the-source)
31 | * [How to Run](#how-to-run)
32 | * [Usage](#usage)
33 | * [Images](#images)
34 | 
35 | ## How to Install
36 | 
37 | ### Install Playwright and Chromium
38 | 
39 | ```bash
40 | pip3 install --upgrade playwright
41 | 
42 | playwright install chromium
43 | ```
44 | 
45 | Each time you upgrade the Playwright dependency, make sure to re-install Chromium; otherwise, you might get an error when using the headless browser.
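If you are not sure whether Playwright's Chromium is actually installed, a quick way to check is to try launching it. The snippet below is only a minimal sketch (not part of this project) and assumes the `playwright` Python package installed in the step above:

```python
from playwright.sync_api import sync_playwright

# Minimal check: try to launch Playwright's Chromium headless browser.
# If the browser binary is missing, Playwright raises an error telling you
# to run "playwright install chromium".
try:
	with sync_playwright() as p:
		browser = p.chromium.launch(headless = True)
		browser.close()
	print("Chromium is installed and working.")
except Exception as ex:
	print(f"Chromium is not ready: {ex}")
```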
46 | 
47 | ### Standard Install
48 | 
49 | ```bash
50 | pip3 install --upgrade scrapy-scraper
51 | ```
52 | 
53 | ### Build and Install From the Source
54 | 
55 | ```bash
56 | git clone https://github.com/ivan-sincek/scrapy-scraper && cd scrapy-scraper
57 | 
58 | python3 -m pip install --upgrade build
59 | 
60 | python3 -m build
61 | 
62 | python3 -m pip install dist/scrapy-scraper-3.6-py3-none-any.whl
63 | ```
64 | 
65 | ## How to Run
66 | 
67 | Restricted, crawl only `example.com`, and include only links to `example.com`:
68 | 
69 | ```fundamental
70 | scrapy-scraper -u https://example.com/home -o results.txt -a random -s 2 -rs -dir js
71 | ```
72 | 
73 | Restricted, crawl only `example.com`, and include both links to `example.com` and 3rd party links:
74 | 
75 | ```fundamental
76 | scrapy-scraper -u https://example.com/home -o results.txt -a random -s 2 -rs -dir js -l
77 | ```
78 | 
79 | Unrestricted, crawl everywhere, and include all the links:
80 | 
81 | ```fundamental
82 | scrapy-scraper -u https://example.com/home -o results.txt -a random -s 2 -rs -dir js -w off
83 | ```
84 | 
85 | ## Usage
86 | 
87 | ```fundamental
88 | Scrapy Scraper v3.6 ( github.com/ivan-sincek/scrapy-scraper )
89 | 
90 | Usage: scrapy-scraper -u urls -o out [-dir directory]
91 | Example: scrapy-scraper -u https://example.com/home -o results.txt [-dir downloads]
92 | 
93 | DESCRIPTION
94 |     Crawl and scrape websites
95 | URLS
96 |     File containing URLs or a single URL to start crawling and scraping from
97 |     -u, --urls = urls.txt | https://example.com/home | etc.
98 | WHITELIST
99 |     File containing whitelisted domain names to limit the scope
100 |     Specify 'off' to disable domain whitelisting
101 |     Default: limit the scope to domain names extracted from the URLs
102 |     -w, --whitelist = whitelist.txt | off | etc.
103 | LINKS
104 |     Include all 3rd party links and sources in the output file
105 |     -l, --links
106 | PLAYWRIGHT
107 |     Use Playwright's headless browser
108 |     -p, --playwright
109 | PLAYWRIGHT WAIT
110 |     Wait time in seconds before fetching the page content
111 |     -pw, --playwright-wait = 0.5 | 2 | 4 | etc.
112 | CONCURRENT REQUESTS
113 |     Number of concurrent requests
114 |     Default: 30
115 |     -cr, --concurrent-requests = 30 | 45 | etc.
116 | CONCURRENT REQUESTS PER DOMAIN
117 |     Number of concurrent requests per domain
118 |     Default: 10
119 |     -crd, --concurrent-requests-domain = 10 | 15 | etc.
120 | SLEEP
121 |     Sleep time in seconds between two consecutive requests to the same domain
122 |     -s, --sleep = 1.5 | 3 | etc.
123 | RANDOM SLEEP
124 |     Randomize the sleep time between requests to vary between '0.5 * sleep' and '1.5 * sleep'
125 |     -rs, --random-sleep
126 | AUTO THROTTLE
127 |     Auto throttle concurrent requests based on the load and latency
128 |     Sleep time is still respected
129 |     -at, --auto-throttle = 0.5 | 10 | 15 | 45 | etc.
130 | RETRIES
131 |     Number of retries per URL
132 |     Default: 2
133 |     -rt, --retries = 0 | 4 | etc.
134 | RECURSION
135 |     Recursion depth limit
136 |     Specify '0' for no limit
137 |     Default: 1
138 |     -r, --recursion = 0 | 2 | etc.
139 | REQUEST TIMEOUT
140 |     Request timeout in seconds
141 |     Default: 60
142 |     -t, --request-timeout = 30 | 90 | etc.
143 | HEADER
144 |     Specify any number of extra HTTP request headers
145 |     -H, --header = "Authorization: Bearer ey..." | etc.
146 | COOKIE
147 |     Specify any number of extra HTTP cookies
148 |     -b, --cookie = PHPSESSIONID=3301 | etc.
149 | USER AGENT 150 | User agent to use 151 | Default: Scrapy Scraper/3.6 152 | -a, --user-agent = random[-all] | curl/3.30.1 | etc. 153 | PROXY 154 | Web proxy to use 155 | -x, --proxy = http://127.0.0.1:8080 | etc. 156 | DIRECTORY 157 | Output directory 158 | All extracted JavaScript files will be saved in this directory 159 | -dir, --directory = downloads | etc. 160 | OUT 161 | Output file 162 | -o, --out = results.txt | etc. 163 | DEBUG 164 | Enable debug output 165 | -dbg, --debug 166 | ``` 167 | 168 | ## Images 169 | 170 |
Figure 1 - Scraping
173 | -------------------------------------------------------------------------------- /img/scraping.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ivan-sincek/scrapy-scraper/8d009f7fe4ff21442da832e4237274a5ed10ab94/img/scraping.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=75.3.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "scrapy-scraper" 7 | version = "3.6" 8 | authors = [{ name = "Ivan Sincek" }] 9 | description = "Web crawler and scraper based on Scrapy and Playwright's headless browser." 10 | readme = "README.md" 11 | requires-python = ">=3.10" 12 | classifiers = [ 13 | "Programming Language :: Python :: 3", 14 | "License :: OSI Approved :: MIT License", 15 | "Operating System :: OS Independent" 16 | ] 17 | dependencies = [ 18 | "beautifulsoup4>=4.12.3", 19 | "bot-safe-agents>=1.0", 20 | "colorama>=0.4.6", 21 | "jsbeautifier>=1.14.11", 22 | "playwright>=1.49.0", 23 | "scrapy>=2.12.0", 24 | "scrapy-playwright>=0.0.42", 25 | "termcolor>=2.4.0", 26 | "tldextract>=3.6.0" 27 | ] 28 | 29 | [project.urls] 30 | "Homepage" = "https://github.com/ivan-sincek/scrapy-scraper" 31 | 32 | [project.scripts] 33 | scrapy-scraper = "scrapy_scraper.main:main" 34 | 35 | [tool.setuptools.packages.find] 36 | where = ["src"] 37 | -------------------------------------------------------------------------------- /src/scrapy_scraper/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from .utils import config, scrape, validate 4 | 5 | def main(): 6 | success, args = validate.Validate().validate_args() 7 | if success: 8 | config.banner() 9 | scrapy_scraper = scrape.ScrapyScraper( 10 | args.urls, 11 | args.whitelist, 12 | args.links, 13 | args.playwright, 14 | args.playwright_wait, 15 | args.concurrent_requests, 16 | args.concurrent_requests_domain, 17 | args.sleep, 18 | args.random_sleep, 19 | args.auto_throttle, 20 | args.retries, 21 | args.recursion, 22 | args.request_timeout, 23 | args.header, 24 | args.cookie, 25 | args.user_agent, 26 | args.proxy, 27 | args.directory, 28 | args.out, 29 | args.debug 30 | ) 31 | scrapy_scraper.run() 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/array.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | def unique(array: list): 4 | """ 5 | Remove duplicates from a list. 6 | """ 7 | seen = set() 8 | return [x for x in array if not (x in seen or seen.add(x))] 9 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | APP_VERSION = "v3.6" 4 | 5 | USER_AGENT = "Scrapy Scraper/3.6" 6 | 7 | def banner(): 8 | """ 9 | Display the banner. 10 | """ 11 | print("#########################################################################") 12 | print("# #") 13 | print("# Scrapy Scraper v3.6 #") 14 | print("# by Ivan Sincek #") 15 | print("# #") 16 | print("# Crawl and scrape websites. #") 17 | print("# GitHub repository at github.com/ivan-sincek/scrapy-scraper. 
#") 18 | print("# #") 19 | print("#########################################################################") 20 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/cookie.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import re 4 | 5 | def get_key_value(cookie: str): 6 | """ 7 | Get a key-value pair from an HTTP cookie.\n 8 | Returns an empty key-value pair on failure. 9 | """ 10 | key = ""; value = "" 11 | if re.search(r"^[^\=\;]+\=[^\=\;]+$", cookie): 12 | key, value = cookie.split("=", 1) 13 | return key.strip(), value.strip() 14 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/directory.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | 5 | def is_directory(directory: str): 6 | """ 7 | Returns 'True' if the 'directory' exists and is a regular directory. 8 | """ 9 | return os.path.isdir(directory) 10 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from . import array 4 | 5 | import os 6 | 7 | __ENCODING = "ISO-8859-1" 8 | 9 | def is_file(file: str): 10 | """ 11 | Returns 'True' if the 'file' exists and is a regular file. 12 | """ 13 | return os.path.isfile(file) 14 | 15 | def validate(file: str): 16 | """ 17 | Validate a file.\n 18 | Success flag is 'True' if the file has a read permission and is not empty. 19 | """ 20 | success = False 21 | message = "" 22 | if not os.access(file, os.R_OK): 23 | message = f"\"{file}\" does not have a read permission" 24 | elif not os.stat(file).st_size > 0: 25 | message = f"\"{file}\" is empty" 26 | else: 27 | success = True 28 | return success, message 29 | 30 | def read_array(file: str) -> list[str]: 31 | """ 32 | Read a file line by line, and append the lines to a list.\n 33 | Whitespace will be stripped from each line, and empty lines will be removed.\n 34 | Returns a unique list. 35 | """ 36 | tmp = [] 37 | with open(file, "r", encoding = __ENCODING) as stream: 38 | for line in stream: 39 | line = line.strip() 40 | if line: 41 | tmp.append(line) 42 | return array.unique(tmp) 43 | 44 | def write_array(array: list[str], out: str): 45 | """ 46 | Write a list to an output file.\n 47 | Whitespace will be stripped from each string in the list, and empty strings will be removed. 48 | """ 49 | with open(out, "w", encoding = __ENCODING) as stream: 50 | for entry in array: 51 | entry = entry.strip() 52 | if entry: 53 | stream.write(f"{entry}\n") 54 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/general.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import colorama, datetime, termcolor 4 | 5 | colorama.init(autoreset = True) 6 | 7 | def to_float(value: str): 8 | """ 9 | Returns 'None' on failure. 10 | """ 11 | tmp = None 12 | try: 13 | tmp = float(value) 14 | except ValueError: 15 | pass 16 | return tmp 17 | 18 | # ---------------------------------------- 19 | 20 | def get_timestamp(message: str): 21 | """ 22 | Get the current timestamp. 
23 | """ 24 | return f"{datetime.datetime.now().strftime('%H:%M:%S')} - {message}" 25 | 26 | def print_error(message: str): 27 | """ 28 | Print an error message. 29 | """ 30 | print(f"ERROR: {message}") 31 | 32 | def print_cyan(message: str): 33 | """ 34 | Print a message in cyan color. 35 | """ 36 | termcolor.cprint(message, "cyan") 37 | 38 | def print_green(message: str): 39 | """ 40 | Print a message in green color. 41 | """ 42 | termcolor.cprint(message, "green") 43 | 44 | def print_yellow(message: str): 45 | """ 46 | Print a message in yellow color. 47 | """ 48 | termcolor.cprint(message, "yellow") 49 | 50 | def print_red(message: str): 51 | """ 52 | Print a message in red color. 53 | """ 54 | termcolor.cprint(message, "red") 55 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/header.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import re 4 | 5 | def get_key_value(header: str): 6 | """ 7 | Get a key-value pair from an HTTP request header.\n 8 | Returns an empty key-value pair on failure. 9 | """ 10 | key = ""; value = "" 11 | if re.search(r"^[^\:]+\:.+$", header): 12 | key, value = header.split(":", 1) 13 | return key.strip(), value.strip() 14 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/scrape.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from . import array, file, general, stopwatch 4 | 5 | from bs4 import BeautifulSoup 6 | 7 | import asyncio, jsbeautifier, os, random, scrapy, scrapy.crawler, scrapy.utils.project, typing, urllib.parse 8 | 9 | class ScrapyScraperSpider(scrapy.Spider): 10 | 11 | def __init__( 12 | self, 13 | urls : list[str], 14 | whitelist : list[str], 15 | links : bool, 16 | playwright : bool, 17 | playwright_wait: float, 18 | headers : dict[str, str], 19 | cookies : dict[str, str], 20 | user_agents : list[str], 21 | proxy : str, 22 | directory : str, 23 | out : str, 24 | debug : bool 25 | ): 26 | """ 27 | Class for managing Scrapy's spider. 28 | """ 29 | self.name = "ScrapyScraperSpider" 30 | self.start_urls = urls 31 | self.allowed_domains = whitelist 32 | self.__links = links 33 | self.__playwright = playwright 34 | self.__playwright_wait = playwright_wait 35 | self.__headers = headers 36 | self.__cookies = cookies 37 | self.__user_agents = user_agents 38 | self.__user_agents_len = len(self.__user_agents) 39 | self.__proxy = proxy 40 | self.__directory = directory 41 | self.__out = out 42 | self.__debug = debug 43 | self.__context = 0 44 | self.__crawled = [] 45 | self.__collected = [] 46 | 47 | def __print_start_urls(self): 48 | """ 49 | Print start URLS. 50 | """ 51 | general.print_green("Start URLs:") 52 | for url in self.start_urls: 53 | print(url) 54 | 55 | def __print_allowed_domains(self): 56 | """ 57 | Print allowed domains/subdomains. 58 | """ 59 | if self.allowed_domains: 60 | general.print_cyan("Allowed domains/subdomains:") 61 | for domain in self.allowed_domains: 62 | print(f"*.{domain}") 63 | else: 64 | general.print_red("Domain whitelisting is off!") 65 | 66 | def start_requests(self): 67 | """ 68 | Main method. 
69 | """ 70 | self.__print_start_urls() 71 | self.__print_allowed_domains() 72 | print(general.get_timestamp("Crawling and scraping...")) 73 | print("Press CTRL + C to exit early - results will be saved, be patient") 74 | for url in self.start_urls: 75 | yield scrapy.Request( 76 | url = url, 77 | headers = self.__get_headers(), 78 | cookies = self.__cookies, 79 | meta = self.__get_metadata(), 80 | errback = self.__error, 81 | callback = self.__success, 82 | dont_filter = False 83 | ) 84 | 85 | def __get_headers(self) -> dict[str, str]: 86 | """ 87 | Get default HTTP request headers. 88 | """ 89 | default_headers = { 90 | "User-Agent" : self.__get_user_agent(), 91 | "Accept-Language" : "en-US, *", 92 | "Accept" : "*/*", 93 | "Referer" : "https://www.google.com/", 94 | "Upgrade-Insecure-Requests": "1" 95 | } 96 | headers = {} 97 | for name, value in default_headers.items(): 98 | if value: 99 | headers[name.lower()] = value 100 | for name, value in self.__headers.items(): # override 101 | headers[name.lower()] = value 102 | return headers 103 | 104 | def __get_user_agent(self): 105 | """ 106 | Get a [random] user agent.\n 107 | Returns an empty string if there are no user agents. 108 | """ 109 | user_agent = "" 110 | if self.__user_agents_len > 0: 111 | user_agent = self.__user_agents[random.randint(0, self.__user_agents_len - 1)] 112 | return user_agent 113 | 114 | def __get_metadata(self) -> dict[str, typing.Any]: 115 | """ 116 | Get Scrapy's request metadata. 117 | """ 118 | self.__context += 1 119 | tmp = {} 120 | tmp["playwright" ] = self.__playwright 121 | tmp["playwright_context" ] = str(self.__context) 122 | tmp["playwright_include_page" ] = self.__playwright 123 | tmp["playwright_context_kwargs" ] = {} 124 | tmp["playwright_context_kwargs" ]["ignore_https_errors"] = True 125 | tmp["playwright_context_kwargs" ]["java_script_enabled"] = True 126 | tmp["playwright_context_kwargs" ]["accept_downloads" ] = False 127 | tmp["playwright_context_kwargs" ]["bypass_csp" ] = False 128 | tmp["playwright_page_goto_kwargs"] = {"wait_until": "load"} 129 | tmp["proxy" ] = self.__proxy 130 | tmp["cookiejar" ] = 1 131 | tmp["dont_merge_cookies" ] = False 132 | return tmp 133 | 134 | # ------------------------------------ 135 | 136 | def closed(self, reason: typing.Any): 137 | """ 138 | On close callback. 139 | """ 140 | self.__crawled = array.unique(self.__crawled) 141 | print(f"Total unique URLs crawled: {len(self.__crawled)}") 142 | self.__collected = array.unique(self.__collected + self.__crawled) 143 | print(f"Total unique URLs collected: {len(self.__collected)}") 144 | stopwatch.stopwatch.stop() 145 | if self.__collected: 146 | file.write_array(sorted(self.__collected, key = str.casefold), self.__out) 147 | 148 | # ------------------------------------ 149 | 150 | async def __error(self, failure: typing.Any): 151 | """ 152 | Error callback. 153 | """ 154 | status = failure.value.response.status if failure.check(scrapy.spidermiddlewares.httperror.HttpError) else 0 155 | url = failure.request.url 156 | error = str(failure.value).splitlines()[0] 157 | if self.__playwright: 158 | page = failure.request.meta["playwright_page"] 159 | await page.close() 160 | await page.context.close() 161 | self.__print_error(status, url, error) 162 | 163 | def __print_error(self, status: int, url: str, message: str): 164 | """ 165 | Print error. 
166 | """ 167 | if self.__debug: 168 | if status: 169 | url = f"{status} {url}" 170 | general.print_red(f"[ ERROR ] {url} -> {message}") 171 | 172 | # ------------------------------------ 173 | 174 | async def __success(self, response: typing.Any): 175 | """ 176 | Success callback. 177 | """ 178 | status = response.status 179 | url = response.url 180 | content = "" 181 | if self.__playwright: 182 | page = response.request.meta["playwright_page"] 183 | if self.__playwright_wait > 0: 184 | await asyncio.sleep(self.__playwright_wait) 185 | content = await page.content() 186 | await page.close() 187 | await page.context.close() 188 | else: 189 | content = response.body 190 | # -------------------------------- 191 | self.__crawled.append(url) 192 | self.__print_success(status, url) 193 | # -------------------------------- 194 | if self.__directory: 195 | self.__download_js(url, content) 196 | # -------------------------------- 197 | links = self.__extract_links(url, scrapy.http.HtmlResponse(url = url, body = content, encoding = "UTF-8") if self.__playwright else response) 198 | self.__collected.extend(links) 199 | for link in links: 200 | yield response.follow( 201 | url = link, 202 | headers = self.__get_headers(), 203 | cookies = self.__cookies, 204 | meta = self.__get_metadata(), 205 | errback = self.__error, 206 | callback = self.__success, 207 | dont_filter = False 208 | ) 209 | 210 | def __print_success(self, status: int, url: str): 211 | """ 212 | Print success. 213 | """ 214 | if self.__debug: 215 | general.print_green(f"[ OK ] {status} {url}") 216 | 217 | def __download_js(self, url: str, content: str | bytes): 218 | """ 219 | Download JavaScript files. 220 | """ 221 | filename = urllib.parse.urlsplit(url).path.rsplit("/", 1)[-1] 222 | if filename.lower().endswith(".js"): 223 | filename = os.path.join(self.__directory, filename) 224 | if not os.path.exists(filename): 225 | try: 226 | soup = BeautifulSoup(content, "html.parser") 227 | open(filename, "w").write(jsbeautifier.beautify(soup.get_text())) 228 | except Exception as ex: 229 | self.__print_exception(url, str(ex)) 230 | 231 | def __extract_links(self, url: str, response: typing.Any): 232 | """ 233 | Extract links.\n 234 | Returns a unique list. 235 | """ 236 | tmp = [] 237 | try: 238 | tmp.extend(self.__extract_links_xpath(url, response, "script", "src" )) 239 | tmp.extend(self.__extract_links_xpath(url, response, "a" , "href")) 240 | tmp.extend(self.__extract_links_xpath(url, response, "link" , "href")) 241 | except (UnicodeEncodeError, AttributeError) as ex: 242 | self.__print_exception(url, str(ex)) 243 | return array.unique(tmp) 244 | 245 | def __extract_links_xpath(self, url: str, response: typing.Any, tag: str, attr: str): 246 | """ 247 | Extract links based on the specified XPath. 248 | """ 249 | tmp = [] 250 | for link in response.xpath(f"//{tag}[@{attr}]"): 251 | link = link.xpath(f"@{attr}").get() 252 | obj = urllib.parse.urlsplit(link) 253 | scheme = obj.scheme 254 | domain = obj.netloc.lower() 255 | if scheme and scheme not in ["http", "https"]: 256 | continue 257 | elif not self.__links and domain and not self.__is_allowed(domain): 258 | continue 259 | tmp.append(urllib.parse.urljoin(url, link)) 260 | return tmp 261 | 262 | def __is_allowed(self, domain: str): 263 | """ 264 | Check if a domain name is in the scope. 
265 | """ 266 | return not self.allowed_domains or any(domain == allowed or domain.endswith(f".{allowed}") for allowed in self.allowed_domains) 267 | 268 | def __print_exception(self, url: str, message: str): 269 | """ 270 | Print exception. 271 | """ 272 | if self.__debug: 273 | general.print_red(f"[ EXCEPTION ] {url} -> {message}") 274 | 275 | # ---------------------------------------- 276 | 277 | class ScrapyScraper: 278 | 279 | def __init__( 280 | self, 281 | urls : list[str], 282 | whitelist : list[str], 283 | links : bool, 284 | playwright : bool, 285 | playwright_wait : float, 286 | concurrent_requests : int, 287 | concurrent_requests_domain: int, 288 | sleep : float, 289 | random_sleep : bool, 290 | auto_throttle : float, 291 | retries : int, 292 | recursion : int, 293 | request_timeout : float, 294 | headers : dict[str, str], 295 | cookies : dict[str, str], 296 | user_agents : list[str], 297 | proxy : str, 298 | directory : str, 299 | out : str, 300 | debug : bool 301 | ): 302 | """ 303 | Class for managing Scrapy's runner. 304 | """ 305 | self.__urls = urls 306 | self.__whitelist = whitelist 307 | self.__links = links 308 | self.__playwright = playwright 309 | self.__playwright_wait = playwright_wait 310 | self.__concurrent_requests = concurrent_requests 311 | self.__concurrent_requests_domain = concurrent_requests_domain 312 | self.__sleep = sleep 313 | self.__random_sleep = random_sleep 314 | self.__auto_throttle = auto_throttle 315 | self.__retries = retries 316 | self.__recursion = recursion 317 | self.__request_timeout = request_timeout # all timeouts 318 | self.__headers = headers 319 | self.__cookies = cookies 320 | self.__user_agents = user_agents 321 | self.__proxy = proxy 322 | self.__directory = directory 323 | self.__out = out 324 | self.__debug = debug 325 | self.__headless_browser = True 326 | self.__browser_type = "chromium" # Playwright's headless browser 327 | self.__handle_sigint = False 328 | self.__max_redirects = 10 329 | 330 | def __page_block(self, request: typing.Any): 331 | """ 332 | Types of content to block while using Playwright's headless browser. 333 | """ 334 | return request.resource_type in ["fetch", "stylesheet", "image", "ping", "font", "media", "imageset", "beacon", "csp_report", "object", "texttrack", "manifest"] 335 | 336 | def run(self): 337 | """ 338 | Configure the settings and run the Chad Extractor spider. 
339 | """ 340 | settings = scrapy.utils.project.get_project_settings() 341 | # -------------------------------- 342 | settings["COOKIES_ENABLED" ] = True 343 | settings["DOWNLOAD_TIMEOUT" ] = self.__request_timeout # connect / read timeout 344 | settings["DOWNLOAD_DELAY" ] = self.__sleep 345 | settings["RANDOMIZE_DOWNLOAD_DELAY"] = self.__random_sleep 346 | settings["HTTPPROXY_ENABLED" ] = bool(self.__proxy) 347 | # -------------------------------- 348 | settings["EXTENSIONS"]["scrapy.extensions.throttle.AutoThrottle"] = 100 349 | # -------------------------------- 350 | settings["AUTOTHROTTLE_ENABLED" ] = self.__auto_throttle > 0 351 | settings["AUTOTHROTTLE_DEBUG" ] = False 352 | settings["AUTOTHROTTLE_START_DELAY" ] = self.__sleep 353 | settings["AUTOTHROTTLE_MAX_DELAY" ] = settings["AUTOTHROTTLE_START_DELAY"] + 30 354 | settings["AUTOTHROTTLE_TARGET_CONCURRENCY"] = self.__auto_throttle 355 | # -------------------------------- 356 | settings["CONCURRENT_REQUESTS" ] = self.__concurrent_requests 357 | settings["CONCURRENT_REQUESTS_PER_DOMAIN"] = self.__concurrent_requests_domain 358 | settings["RETRY_ENABLED" ] = self.__retries > 0 359 | settings["RETRY_TIMES" ] = self.__retries 360 | settings["REDIRECT_ENABLED" ] = self.__max_redirects > 0 361 | settings["REDIRECT_MAX_TIMES" ] = self.__max_redirects 362 | settings["DEPTH_LIMIT" ] = self.__recursion 363 | # -------------------------------- 364 | settings["ROBOTSTXT_OBEY" ] = False 365 | settings["TELNETCONSOLE_ENABLED" ] = False 366 | settings["LOG_ENABLED" ] = False 367 | settings["REQUEST_FINGERPRINTER_IMPLEMENTATION"] = "2.7" 368 | # -------------------------------- 369 | if self.__playwright: 370 | settings["DOWNLOAD_HANDLERS"]["https"] = "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler" 371 | settings["DOWNLOAD_HANDLERS"]["http" ] = "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler" 372 | settings["TWISTED_REACTOR" ] = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" 373 | settings["PLAYWRIGHT_LAUNCH_OPTIONS" ] = { 374 | "headless" : self.__headless_browser, 375 | "handle_sigint": self.__handle_sigint, 376 | "proxy" : {"server": self.__proxy} if self.__proxy else None 377 | } 378 | settings["PLAYWRIGHT_BROWSER_TYPE" ] = self.__browser_type 379 | settings["PLAYWRIGHT_ABORT_REQUEST" ] = self.__page_block 380 | settings["PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT"] = self.__request_timeout * 1000 381 | # -------------------------------- 382 | scrapy_scraper_spider = scrapy.crawler.CrawlerProcess(settings) 383 | scrapy_scraper_spider.crawl(ScrapyScraperSpider, self.__urls, self.__whitelist, self.__links, self.__playwright, self.__playwright_wait, self.__headers, self.__cookies, self.__user_agents, self.__proxy, self.__directory, self.__out, self.__debug) 384 | scrapy_scraper_spider.start() 385 | scrapy_scraper_spider.join() 386 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/stopwatch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import datetime 4 | 5 | class Stopwatch: 6 | 7 | def __init__(self): 8 | self.__start = datetime.datetime.now() 9 | 10 | def stop(self): 11 | self.__end = datetime.datetime.now() 12 | print(f"Script has finished in {self.__end - self.__start}") 13 | 14 | stopwatch = Stopwatch() 15 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/url.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import tldextract, urllib.parse 4 | 5 | __URL_SCHEME_WHITELIST = ["http", "https", "socks4", "socks4h", "socks5", "socks5h"] 6 | __MIN_PORT_NUM = 1 7 | __MAX_PORT_NUM = 65535 8 | 9 | def is_http(url: str): 10 | return urllib.parse.urlsplit(url).scheme.lower() in ["http", "https"] 11 | 12 | def validate(url: str): 13 | """ 14 | Validate a URL. 15 | """ 16 | success = False 17 | message = "" 18 | tmp = urllib.parse.urlsplit(url) 19 | if not tmp.scheme: 20 | message = f"URL scheme is required: {url}" 21 | elif tmp.scheme not in __URL_SCHEME_WHITELIST: 22 | message = f"Supported URL schemes are 'http[s]', 'socks4[h]', and 'socks5[h]': {url}" 23 | elif not tmp.netloc: 24 | message = f"Invalid domain name: {url}" 25 | elif tmp.port and (tmp.port < __MIN_PORT_NUM or tmp.port > __MAX_PORT_NUM): 26 | message = f"Port number is out of range: {url}" 27 | else: 28 | success = True 29 | return success, message 30 | 31 | def validate_multiple(urls: list[str]): 32 | """ 33 | Validate multiple URLs. 34 | """ 35 | success = True 36 | message = "" 37 | for url in urls: 38 | success, message = validate(url) 39 | if not success: 40 | break 41 | return success, message 42 | 43 | def extract_fqdn(url: str) -> str: 44 | """ 45 | Extract the fully qualified domain name from a URL.\n 46 | Returns an empty string on failure. 47 | """ 48 | tmp = "" 49 | obj = tldextract.extract(url) 50 | if obj.fqdn: 51 | tmp = obj.fqdn.lower() 52 | return tmp 53 | 54 | def extract_fqdn_multiple(urls: list[str]) -> list[str]: 55 | """ 56 | Extract the fully qualified domain names from a list of URLs.\n 57 | Returns an empty list on failure. 58 | """ 59 | tmp = [] 60 | for url in urls: 61 | url = extract_fqdn(url) 62 | if url: 63 | tmp.append(url) 64 | return tmp 65 | -------------------------------------------------------------------------------- /src/scrapy_scraper/utils/validate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from . 
import config, cookie, directory, file, general, header, url 4 | 5 | import argparse, bot_safe_agents, sys 6 | 7 | class MyArgParser(argparse.ArgumentParser): 8 | 9 | def print_help(self): 10 | print(f"Scrapy Scraper {config.APP_VERSION} ( github.com/ivan-sincek/scrapy-scraper )") 11 | print("") 12 | print("Usage: scrapy-scraper -u urls -o out [-dir directory]") 13 | print("Example: scrapy-scraper -u https://example.com/home -o results.txt [-dir downloads]") 14 | print("") 15 | print("DESCRIPTION") 16 | print(" Crawl and scrape websites") 17 | print("URLS") 18 | print(" File containing URLs or a single URL to start crawling and scraping from") 19 | print(" -u, --urls = urls.txt | https://example.com/home | etc.") 20 | print("WHITELIST") 21 | print(" File containing whitelisted domain names to limit the scope") 22 | print(" Specify 'off' to disable domain whitelisting") 23 | print(" Default: limit the scope to domain names extracted from the URLs") 24 | print(" -w, --whitelist = whitelist.txt | off | etc.") 25 | print("LINKS") 26 | print(" Include all 3rd party links and sources in the output file") 27 | print(" -l, --links") 28 | print("PLAYWRIGHT") 29 | print(" Use Playwright's headless browser") 30 | print(" -p, --playwright") 31 | print("PLAYWRIGHT WAIT") 32 | print(" Wait time in seconds before fetching the page content") 33 | print(" -pw, --playwright-wait = 0.5 | 2 | 4 | etc.") 34 | print("CONCURRENT REQUESTS") 35 | print(" Number of concurrent requests") 36 | print(" Default: 30") 37 | print(" -cr, --concurrent-requests = 30 | 45 | etc.") 38 | print("CONCURRENT REQUESTS PER DOMAIN") 39 | print(" Number of concurrent requests per domain") 40 | print(" Default: 10") 41 | print(" -crd, --concurrent-requests-domain = 10 | 15 | etc.") 42 | print("SLEEP") 43 | print(" Sleep time in seconds between two consecutive requests to the same domain") 44 | print(" -s, --sleep = 1.5 | 3 | etc.") 45 | print("RANDOM SLEEP") 46 | print(" Randomize the sleep time between requests to vary between '0.5 * sleep' and '1.5 * sleep'") 47 | print(" -rs, --random-sleep") 48 | print("AUTO THROTTLE") 49 | print(" Auto throttle concurrent requests based on the load and latency") 50 | print(" Sleep time is still respected") 51 | print(" -at, --auto-throttle = 0.5 | 10 | 15 | 45 | etc.") 52 | print("RETRIES") 53 | print(" Number of retries per URL") 54 | print(" Default: 2") 55 | print(" -rt, --retries = 0 | 4 | etc.") 56 | print("RECURSION") 57 | print(" Recursion depth limit") 58 | print(" Specify '0' for no limit") 59 | print(" Default: 1") 60 | print(" -r, --recursion = 0 | 2 | etc.") 61 | print("REQUEST TIMEOUT") 62 | print(" Request timeout in seconds") 63 | print(" Default: 60") 64 | print(" -t, --request-timeout = 30 | 90 | etc.") 65 | print("HEADER") 66 | print(" Specify any number of extra HTTP request headers") 67 | print(" -H, --header = \"Authorization: Bearer ey...\" | etc.") 68 | print("COOKIE") 69 | print(" Specify any number of extra HTTP cookies") 70 | print(" -b, --cookie = PHPSESSIONID=3301 | etc.") 71 | print("USER AGENT") 72 | print(" User agent to use") 73 | print(f" Default: {config.USER_AGENT}") 74 | print(" -a, --user-agent = random[-all] | curl/3.30.1 | etc.") 75 | print("PROXY") 76 | print(" Web proxy to use") 77 | print(" -x, --proxy = http://127.0.0.1:8080 | etc.") 78 | print("DIRECTORY") 79 | print(" Output directory") 80 | print(" All extracted JavaScript files will be saved in this directory") 81 | print(" -dir, --directory = downloads | etc.") 82 | print("OUT") 83 | print(" Output 
file") 84 | print(" -o, --out = results.txt | etc.") 85 | print("DEBUG") 86 | print(" Enable debug output") 87 | print(" -dbg, --debug") 88 | 89 | def error(self, message): 90 | if len(sys.argv) > 1: 91 | print("Missing a mandatory option (-u, -o) and/or optional (-w, -l, -p, -pw, -cr, -crd, -s, -rs, -at, -rt, -r, -t, -H, -b, -a, -x, -dir, -dbg)") 92 | print("Use -h or --help for more info") 93 | else: 94 | self.print_help() 95 | exit() 96 | 97 | class Validate: 98 | 99 | def __init__(self): 100 | """ 101 | Class for validating and managing CLI arguments. 102 | """ 103 | self.__parser = MyArgParser() 104 | self.__parser.add_argument("-u" , "--urls" , required = True , type = str , default = "" ) 105 | self.__parser.add_argument("-w" , "--whitelist" , required = False, type = str , default = "" ) 106 | self.__parser.add_argument("-l" , "--links" , required = False, action = "store_true", default = False) 107 | self.__parser.add_argument("-p" , "--playwright" , required = False, action = "store_true", default = False) 108 | self.__parser.add_argument("-pw" , "--playwright-wait" , required = False, type = str , default = "" ) 109 | self.__parser.add_argument("-cr" , "--concurrent-requests" , required = False, type = str , default = "" ) 110 | self.__parser.add_argument("-crd", "--concurrent-requests-domain", required = False, type = str , default = "" ) 111 | self.__parser.add_argument("-s" , "--sleep" , required = False, type = str , default = "" ) 112 | self.__parser.add_argument("-rs" , "--random-sleep" , required = False, action = "store_true", default = False) 113 | self.__parser.add_argument("-at" , "--auto-throttle" , required = False, type = str , default = "" ) 114 | self.__parser.add_argument("-rt" , "--retries" , required = False, type = str , default = "" ) 115 | self.__parser.add_argument("-r" , "--recursion" , required = False, type = str , default = "" ) 116 | self.__parser.add_argument("-t" , "--request-timeout" , required = False, type = str , default = "" ) 117 | self.__parser.add_argument("-H" , "--header" , required = False, action = "append" , nargs = "+" ) 118 | self.__parser.add_argument("-b" , "--cookie" , required = False, action = "append" , nargs = "+" ) 119 | self.__parser.add_argument("-a" , "--user-agent" , required = False, type = str , default = "" ) 120 | self.__parser.add_argument("-x" , "--proxy" , required = False, type = str , default = "" ) 121 | self.__parser.add_argument("-dir", "--directory" , required = False, type = str , default = "" ) 122 | self.__parser.add_argument("-o" , "--out" , required = True , type = str , default = "" ) 123 | self.__parser.add_argument("-dbg", "--debug" , required = False, action = "store_true", default = False) 124 | 125 | def validate_args(self): 126 | """ 127 | Validate and return the CLI arguments. 
128 | """ 129 | self.__success = True 130 | self.__args = self.__parser.parse_args() 131 | self.__validate_urls() 132 | self.__validate_whitelist() 133 | self.__validate_playwright_wait() 134 | self.__validate_concurrent_requests() 135 | self.__validate_concurrent_requests_domain() 136 | self.__validate_sleep() 137 | self.__validate_auto_throttle() 138 | self.__validate_retries() 139 | self.__validate_recursion() 140 | self.__validate_request_timeout() 141 | self.__validate_header() 142 | self.__validate_cookie() 143 | self.__validate_user_agent() 144 | self.__validate_proxy() 145 | self.__validate_directory() 146 | return self.__success, self.__args 147 | 148 | def __error(self, message: str): 149 | """ 150 | Set the success flag to 'False' to prevent the main task from executing, and print an error message. 151 | """ 152 | self.__success = False 153 | general.print_error(message) 154 | 155 | # ------------------------------------ 156 | 157 | def __validate_urls(self): 158 | tmp = [] 159 | if file.is_file(self.__args.urls): 160 | success, message = file.validate(self.__args.urls) 161 | if not success: 162 | self.__error(message) 163 | else: 164 | tmp = file.read_array(self.__args.urls) 165 | if not tmp: 166 | self.__error(f"No URLs were found in \"{self.__args.urls}\"") 167 | else: 168 | success, message = url.validate_multiple(tmp) 169 | if not success: 170 | self.__error(message) 171 | else: 172 | success, message = url.validate(self.__args.urls) 173 | if not success: 174 | self.__error(message) 175 | else: 176 | tmp = [self.__args.urls] 177 | self.__args.urls = tmp 178 | 179 | def __validate_whitelist(self): 180 | tmp = [] 181 | if self.__args.whitelist.lower() == "off": 182 | pass 183 | elif self.__args.whitelist: 184 | if not file.is_file(self.__args.whitelist): 185 | self.__error(f"\"{self.__args.whitelist}\" does not exist") 186 | else: 187 | success, message = file.validate(self.__args.whitelist) 188 | if not success: 189 | self.__error(message) 190 | else: 191 | tmp = url.extract_fqdn_multiple(file.read_array(self.__args.whitelist)) 192 | if not tmp: 193 | self.__error(f"No valid whitelisted domain names were found in \"{self.__args.whitelist}\"") 194 | elif self.__success: 195 | tmp = url.extract_fqdn_multiple(self.__args.urls) 196 | if not tmp: 197 | self.__error("No domain names could be extracted from the provided URLs for domain whitelisting") 198 | self.__args.whitelist = tmp 199 | 200 | def __validate_playwright_wait(self): 201 | tmp = 0 202 | if self.__args.playwright_wait: 203 | tmp = general.to_float(self.__args.playwright_wait) 204 | if tmp is None: 205 | self.__error("Playwright's wait time must be numeric") 206 | elif tmp <= 0: 207 | self.__error("Playwright's wait time must be greater than zero") 208 | self.__args.playwright_wait = tmp 209 | 210 | def __validate_concurrent_requests(self): 211 | tmp = 30 212 | if self.__args.concurrent_requests: 213 | if not self.__args.concurrent_requests.isdigit(): 214 | self.__error("Number of concurrent requests must be numeric") 215 | else: 216 | tmp = int(self.__args.concurrent_requests) 217 | if tmp <= 0: 218 | self.__error("Number of concurrent requests must be greater than zero") 219 | self.__args.concurrent_requests = tmp 220 | 221 | def __validate_concurrent_requests_domain(self): 222 | tmp = 10 223 | if self.__args.concurrent_requests_domain: 224 | if not self.__args.concurrent_requests_domain.isdigit(): 225 | self.__error("Number of concurrent requests per domain must be numeric") 226 | else: 227 | tmp = 
int(self.__args.concurrent_requests_domain) 228 | if tmp <= 0: 229 | self.__error("Number of concurrent requests per domain must be greater than zero") 230 | self.__args.concurrent_requests_domain = tmp 231 | 232 | def __validate_sleep(self,): 233 | tmp = 0 234 | if self.__args.sleep: 235 | tmp = general.to_float(self.__args.sleep) 236 | if tmp is None: 237 | self.__error("Sleep time between two consecutive requests must be numeric") 238 | elif tmp <= 0: 239 | self.__error("Sleep time between two consecutive requests must be greater than zero") 240 | self.__args.sleep = tmp 241 | 242 | def __validate_auto_throttle(self): 243 | tmp = 0 244 | if self.__args.auto_throttle: 245 | tmp = general.to_float(self.__args.auto_throttle) 246 | if tmp is None: 247 | self.__error("Auto throttle must be numeric") 248 | elif tmp <= 0: 249 | self.__error("Auto throttle must be greater than zero") 250 | self.__args.auto_throttle = tmp 251 | 252 | def __validate_retries(self): 253 | tmp = 2 254 | if self.__args.retries: 255 | if not self.__args.retries.isdigit(): 256 | self.__error("Number of retries must be numeric") 257 | else: 258 | tmp = int(self.__args.retries) 259 | if tmp <= 0: 260 | self.__error("Number of retries must be greater than zero") 261 | self.__args.retries = tmp 262 | 263 | def __validate_recursion(self): 264 | tmp = 1 265 | if self.__args.recursion: 266 | if not self.__args.recursion.isdigit(): 267 | self.__error("Recursion depth must be numeric") 268 | else: 269 | tmp = int(self.__args.recursion) 270 | if tmp < 0: 271 | self.__error("Recursion depth must be greater than or equal to zero") 272 | self.__args.recursion = tmp 273 | 274 | def __validate_request_timeout(self): 275 | tmp = 60 276 | if self.__args.request_timeout: 277 | tmp = general.to_float(self.__args.request_timeout) 278 | if tmp is None: 279 | self.__error("Request timeout must be numeric") 280 | elif tmp <= 0: 281 | self.__error("Request timeout must be greater than zero") 282 | self.__args.request_timeout = tmp 283 | 284 | def __validate_header(self): 285 | tmp = {} 286 | if self.__args.header: 287 | for entry in self.__args.header: 288 | key, value = header.get_key_value(entry[0]) 289 | if not key: 290 | self.__error(f"Invalid HTTP request header: {entry[0]}") 291 | continue 292 | tmp[key] = value 293 | self.__args.header = tmp 294 | 295 | def __validate_cookie(self): 296 | tmp = {} 297 | if self.__args.cookie: 298 | for entry in self.__args.cookie: 299 | key, value = cookie.get_key_value(entry[0]) 300 | if not key: 301 | self.__error(f"Invalid HTTP cookie: {entry[0]}") 302 | continue 303 | tmp[key] = value 304 | self.__args.cookie = tmp 305 | 306 | def __validate_user_agent(self): 307 | tmp = [config.USER_AGENT] 308 | if self.__args.user_agent: 309 | lower = self.__args.user_agent.lower() 310 | if lower == "random-all": 311 | tmp = bot_safe_agents.get_all() 312 | elif lower == "random": 313 | tmp = [bot_safe_agents.get_random()] 314 | else: 315 | tmp = [self.__args.user_agent] 316 | self.__args.user_agent = tmp 317 | 318 | def __validate_proxy(self): 319 | if self.__args.proxy: 320 | success, message = url.validate(self.__args.proxy) 321 | if not success: 322 | self.__error(message) 323 | 324 | def __validate_directory(self): 325 | if self.__args.directory: 326 | if not directory.is_directory(self.__args.directory): 327 | self.__error(f"\"{self.__args.directory}\" does not exist or is not a directory") 328 | --------------------------------------------------------------------------------