├── tor2tor ├── settings │ └── settings.json ├── __init__.py ├── main.py ├── coreutils.py └── tor2tor.py ├── uninstall.sh ├── Dockerfile ├── install.sh ├── update.sh ├── update.ps1 ├── .github ├── FUNDING.yml └── workflows │ ├── codeql.yml │ └── docker-publish.yml ├── uninstall.ps1 ├── LICENSE ├── pyproject.toml ├── install.ps1 ├── .gitignore └── README.md /tor2tor/settings/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "proxy": { 3 | "socks5": { 4 | "host": "localhost", 5 | "port": 9050, 6 | "type": 1, 7 | "version": 5 8 | } 9 | }, 10 | "tor.exe": "Tor\\tor\\tor.exe" 11 | } 12 | -------------------------------------------------------------------------------- /tor2tor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = "Richard Mwewa" 2 | __about__ = "https://rly0nheart.github.io" 3 | __version__ = "0.15.0" 4 | __description__ = f"**Tor2Tor** - by [{__author__}]({__about__})" 5 | __epilog__ = "**Tor2Tor** scrapes a given onion link and captures screenshots of all links available on it." 6 | -------------------------------------------------------------------------------- /uninstall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Remove the geckodriver binary from /usr/bin 4 | rm /usr/bin/geckodriver -v 5 | 6 | # Uninstall tor and its configuration files including unused packages (I might be doing you a favour by removing unused packages haha) 7 | apt remove tor --autoremove --purge -y 8 | 9 | # Uninstall tor2tor 10 | pip3 uninstall tor2tor -y -v 11 | echo "Cleanup complete." 12 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:latest 2 | 3 | WORKDIR /app 4 | 5 | COPY . . 
6 | 7 | RUN apt-get update && \ 8 | apt-get install -y --no-install-recommends \ 9 | ca-certificates \ 10 | curl \ 11 | firefox-esr \ 12 | tor && \ 13 | rm -fr /var/lib/apt/lists/* 14 | 15 | RUN curl -L https://github.com/mozilla/geckodriver/releases/download/v0.34.0/geckodriver-v0.34.0-linux64.tar.gz | \ 16 | tar xz -C /usr/bin && \ 17 | apt-get purge -y \ 18 | ca-certificates \ 19 | curl 
# NOTE(review): ca-certificates is purged above, before `pip install .` runs below. pip ships its own CA bundle (certifi), so the install may still succeed, but anything else in the final image that relies on system TLS certificates (e.g. curl/HTTPS at runtime) will fail verification — confirm this purge is intended and that no runtime component needs system certs.
20 | 21 | RUN pip install --upgrade pip && pip install . 22 | 23 | ENTRYPOINT ["tor2tor"] 24 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install the Tor package without installing recommended packages. 4 | apt-get update && apt-get install -y --no-install-recommends tor 5 | 6 | # Download geckodriver .tar.gz file and pipe it to 'tar' to extract the geckodriver binary directly into /usr/bin. 7 | curl -L https://github.com/mozilla/geckodriver/releases/download/v0.34.0/geckodriver-v0.34.0-linux64.tar.gz | \ 8 | tar xz -C /usr/bin 9 | 10 | # Install Python packages defined in the current directory's setup.py/pyproject.toml file. (pyproject.toml in this case) 11 | pip3 install . 12 | echo "Setup complete." 13 | -------------------------------------------------------------------------------- /update.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the directory has a .git folder 4 | if [ -d ".git" ]; then 5 | # Fetch the latest updates from the https://github.com/rly0nheart/tor2tor 6 | git fetch origin 7 | 8 | # Compare local and remote branches 9 | LOCAL=$(git rev-parse @) 10 | REMOTE=$(git rev-parse @{u}) 11 | 12 | # Check if the local repository is up-to-date 13 | if [ $LOCAL = $REMOTE ]; then 14 | echo "Tor2Tor is up-to-date." 15 | else 16 | echo "Pulling the latest changes. Please wait..." 
17 | git pull 18 | 19 | # Install tor2tor after pulling the updates 20 | pip3 install . 21 | echo "Update complete." 22 | fi 23 | else 24 | echo "Current directory is not a Git repository." 25 | fi 26 | -------------------------------------------------------------------------------- /update.ps1: -------------------------------------------------------------------------------- 1 | # Check if the directory has a .git folder 2 | if (Test-Path ".git") { 3 | 4 | # Fetch the latest updates 5 | git fetch origin 6 | 7 | # Compare local and remote branches 8 | $LOCAL = git rev-parse @ 9 | $REMOTE = git rev-parse "@{u}" 10 | 11 | # Check if the local repository is up-to-date 12 | if ($LOCAL -eq $REMOTE) { 13 | Write-Host "Tor2Tor is up-to-date." 14 | } else { 15 | Write-Host "Pulling the latest changes. Please wait..." 16 | git pull 17 | 18 | # Install tor2tor after pulling the updates 19 | pip3 install . 20 | Write-Host "Update complete." 21 | } 22 | } else { 23 | Write-Host "Current directory is not a Git repository." 24 | } 25 | -------------------------------------------------------------------------------- /tor2tor/main.py: -------------------------------------------------------------------------------- 1 | from .tor2tor import log, args, Tor2Tor 2 | from .coreutils import ( 3 | path_finder, 4 | is_valid_onion, 5 | ) 6 | 7 | 8 | def execute_tor2tor(): 9 | target_onion = args.onion 10 | tor2tor = Tor2Tor() 11 | 12 | if is_valid_onion(url=target_onion): 13 | print(""" 14 | ┏┳┓ ┏┳┓ 15 | ┃ ┏┓┏┓┓ ┃ ┏┓┏┓ 16 | ┻ ┗┛┛ ┗ ┻ ┗┛┛ """ 17 | ) 18 | path_finder( 19 | url=target_onion 20 | ) # Create a directory with the onion link as the name. 
21 | 22 | tor2tor.execute_scraper( 23 | target_onion=target_onion, 24 | pool_size=args.pool, 25 | worker_threads=args.threads, 26 | ) 27 | 28 | else: 29 | log.warning(f"{target_onion} does not seem to be a valid onion.") 30 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 13 | buy_me_a_coffee: rly0nheart -------------------------------------------------------------------------------- /uninstall.ps1: -------------------------------------------------------------------------------- 1 | # Define target directories for removal 2 | $torDir = "$env:USERPROFILE\tor2tor\Tor" 3 | $geckoDir = "$env:USERPROFILE\tor2tor\GeckoDriver" 4 | 5 | # Function to remove directory 6 | function RemoveDir([string]$dirPath) { 7 | if (Test-Path $dirPath) { 8 | Remove-Item -Path $dirPath -Recurse -Force 9 | Write-Host "Removed directory: $dirPath" 10 | } else { 11 | Write-Host "Directory $dirPath does not exist." 
12 | } 13 | } 14 | 15 | # Remove Tor directory 16 | RemoveDir $torDir 17 | 18 | # Remove GeckoDriver directory 19 | RemoveDir $geckoDir 20 | 21 | # Remove the geckodriver directory from PATH 22 | $pathEnv = [Environment]::GetEnvironmentVariable("PATH", [EnvironmentVariableTarget]::User) 23 | $newPath = ($pathEnv -split ";" | Where-Object { $_ -ne $geckoDir }) -join ";" 24 | [Environment]::SetEnvironmentVariable("PATH", $newPath, [EnvironmentVariableTarget]::User) 25 | Write-Host "Removed GeckoDriver directory from PATH." 26 | 27 | # Uninstall tor2tor Python package 28 | pip uninstall tor2tor -y 29 | 30 | Write-Host "Cleanup complete." 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Richard Mwewa 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["poetry-core"] 3 | build-backend = "poetry.core.masonry.api" 4 | 5 | [tool.poetry] 6 | name = "tor2tor" 7 | version = "0.15.0" 8 | description = "Capture screenshots of onion services on an onion service." 9 | authors = ["Richard Mwewa "] 10 | readme = "README.md" 11 | license = "MIT License" 12 | homepage = "https://hub.docker.com/r/rly0nheart/tor2tor" 13 | repository = "https://github.com/rly0nheart/tor2tor" 14 | documentation = "https://github.com/rly0nheart/tor2tor/blob/latest/README.md" 15 | classifiers = [ 16 | "License :: OSI Approved :: MIT License", 17 | "Development Status :: 5 - Production/Stable", 18 | "Intended Audience :: Information Technology", 19 | "Natural Language :: English", 20 | "Operating System :: POSIX :: Linux", 21 | "Programming Language :: Python :: 3" 22 | ] 23 | 24 | [tool.poetry.dependencies] 25 | python = "^3.10" 26 | rich = "*" 27 | requests = "*" 28 | rich-argparse = "*" 29 | selenium = "*" 30 | BeautifulSoup4 = "*" 31 | 32 | [tool.poetry.scripts] 33 | t2t = "tor2tor.main:execute_tor2tor" 34 | tor2tor = "tor2tor.main:execute_tor2tor" 35 | -------------------------------------------------------------------------------- /install.ps1: -------------------------------------------------------------------------------- 1 | # Define URLs for Tor Expert Bundle and GeckoDriver 2 | $torURL = "https://archive.torproject.org/tor-package-archive/torbrowser/13.0/tor-expert-bundle-13.0-windows-x86_64.tar.gz" 3 | $geckoURL = "https://github.com/mozilla/geckodriver/releases/download/v0.34.0/geckodriver-v0.34.0-win64.zip" 4 | 5 | # Define target directories for installation 6 | $torDir = "$env:USERPROFILE\tor2tor\Tor" 7 | $geckoDir = "$env:USERPROFILE\tor2tor\GeckoDriver" 8 | 9 | # Function to download a file 10 | function 
DownloadFile([string]$url, [string]$path) { 11 | Invoke-WebRequest -Uri $url -OutFile $path 12 | } 13 | 14 | # Check if Tor directory exists, if not create and download 15 | if (-Not (Test-Path $torDir)) { 16 | New-Item -Path $torDir -ItemType Directory 17 | Write-Host "Downloading Tor..." 18 | DownloadFile $torURL "$torDir\tor.tar.gz" 19 | 20 | # Unpacking the Tor archive 21 | tar -xf "$torDir\tor.tar.gz" -C $torDir 22 | Remove-Item "$torDir\tor.tar.gz" 23 | } 24 | 25 | # Check if GeckoDriver directory exists, if not create and download 26 | if (-Not (Test-Path $geckoDir)) { 27 | New-Item -Path $geckoDir -ItemType Directory 28 | Write-Host "Downloading GeckoDriver..." 29 | DownloadFile $geckoURL "$geckoDir\geckodriver.zip" 30 | 31 | # Unzipping the GeckoDriver 32 | Expand-Archive -Path "$geckoDir\geckodriver.zip" -DestinationPath $geckoDir 33 | Remove-Item "$geckoDir\geckodriver.zip" 34 | } 35 | 36 | # Add the geckodriver directory to PATH 37 | [Environment]::SetEnvironmentVariable("PATH", [Environment]::GetEnvironmentVariable("PATH", [EnvironmentVariableTarget]::User) + ";$geckoDir", [EnvironmentVariableTarget]::User) 38 | 39 | pip install . 40 | Write-Host "Setup complete." 41 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "latest" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "latest" ] 20 | schedule: 21 | - cron: '0 0 * * *' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | # Runner size impacts CodeQL analysis time. To learn more, please see: 27 | # - https://gh.io/recommended-hardware-resources-for-running-codeql 28 | # - https://gh.io/supported-runners-and-hardware-resources 29 | # - https://gh.io/using-larger-runners 30 | # Consider using larger runners for possible analysis time improvements. 31 | runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} 32 | timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }} 33 | permissions: 34 | actions: read 35 | contents: read 36 | security-events: write 37 | 38 | strategy: 39 | fail-fast: false 40 | matrix: 41 | language: [ 'python' ] 42 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby', 'swift' ] 43 | # Use only 'java' to analyze code written in Java, Kotlin or both 44 | # Use only 'javascript' to analyze code written in JavaScript, TypeScript or both 45 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 46 | 47 | steps: 48 | - name: Checkout repository 49 | uses: actions/checkout@v3 50 | 51 | # Initializes the CodeQL tools for scanning. 52 | - name: Initialize CodeQL 53 | uses: github/codeql-action/init@v2 54 | with: 55 | languages: ${{ matrix.language }} 56 | # If you wish to specify custom queries, you can do so here or in a config file. 57 | # By default, queries listed here will override any specified in a config file. 58 | # Prefix the list here with "+" to use these queries and those in the config file. 
59 | 60 | # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 61 | # queries: security-extended,security-and-quality 62 | 63 | 64 | # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). 65 | # If this step fails, then you should remove it and run the build manually (see below) 66 | - name: Autobuild 67 | uses: github/codeql-action/autobuild@v2 68 | 69 | # ℹ️ Command-line programs to run using the OS shell. 70 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 71 | 72 | # If the Autobuild fails above, remove it and uncomment the following three lines. 73 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 74 | 75 | # - run: | 76 | # echo "Run, Build Application using script" 77 | # ./location_of_script_within_repo/buildscript.sh 78 | 79 | - name: Perform CodeQL Analysis 80 | uses: github/codeql-action/analyze@v2 81 | with: 82 | category: "/language:${{matrix.language}}" 83 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /.github/workflows/docker-publish.yml: -------------------------------------------------------------------------------- 1 | name: Docker 2 | 3 | # This workflow uses actions that are not certified by GitHub. 4 | # They are provided by a third-party and are governed by 5 | # separate terms of service, privacy policy, and support 6 | # documentation. 7 | 8 | on: 9 | schedule: 10 | - cron: '0 0 * * *' 11 | push: 12 | branches: [ "latest" ] 13 | # Publish semver tags as releases. 
14 | tags: [ 'v*.*.*' ] 15 | pull_request: 16 | branches: [ "latest" ] 17 | 18 | env: 19 | # Use docker.io for Docker Hub if empty 20 | REGISTRY: docker.io 21 | # github.repository as / 22 | IMAGE_NAME: ${{ github.repository }} 23 | 24 | 25 | jobs: 26 | build: 27 | 28 | runs-on: ubuntu-latest 29 | permissions: 30 | contents: read 31 | packages: write 32 | # This is used to complete the identity challenge 33 | # with sigstore/fulcio when running outside of PRs. 34 | id-token: write 35 | 36 | steps: 37 | - name: Checkout repository 38 | uses: actions/checkout@v3 39 | 40 | # Install the cosign tool except on PR 41 | # https://github.com/sigstore/cosign-installer 42 | - name: Install cosign 43 | if: github.event_name != 'pull_request' 44 | uses: sigstore/cosign-installer@v3.4.0 45 | with: 46 | cosign-release: 'v2.2.3' 47 | 48 | # Set up BuildKit Docker container builder to be able to build 49 | # multi-platform images and export cache 50 | # https://github.com/docker/setup-buildx-action 51 | - name: Set up Docker Buildx 52 | uses: docker/setup-buildx-action@f95db51fddba0c2d1ec667646a06c2ce06100226 # v3.0.0 53 | 54 | # Login against a Docker registry except on PR 55 | # https://github.com/docker/login-action 56 | - name: Log into registry ${{ env.REGISTRY }} 57 | if: github.event_name != 'pull_request' 58 | uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0 59 | with: 60 | registry: ${{ env.REGISTRY }} 61 | username: ${{ github.actor }} 62 | password: ${{ secrets.DOCKER_ACCESS_TOKEN }} 63 | 64 | # Extract metadata (tags, labels) for Docker 65 | # https://github.com/docker/metadata-action 66 | - name: Extract Docker metadata 67 | id: meta 68 | uses: docker/metadata-action@96383f45573cb7f253c731d3b3ab81c87ef81934 # v5.0.0 69 | with: 70 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} 71 | 72 | # Build and push Docker image with Buildx (don't push on PR) 73 | # https://github.com/docker/build-push-action 74 | - name: Build and push Docker image 
75 | id: build-and-push 76 | uses: docker/build-push-action@0565240e2d4ab88bba5387d719585280857ece09 # v5.0.0 77 | with: 78 | context: . 79 | push: ${{ github.event_name != 'pull_request' }} 80 | tags: ${{ steps.meta.outputs.tags }} 81 | labels: ${{ steps.meta.outputs.labels }} 82 | cache-from: type=gha 83 | cache-to: type=gha,mode=max 84 | 85 | # Sign the resulting Docker image digest except on PRs. 86 | # This will only write to the public Rekor transparency log when the Docker 87 | # repository is public to avoid leaking data. If you would like to publish 88 | # transparency data even for private images, pass --force to cosign below. 89 | # https://github.com/sigstore/cosign 90 | - name: Sign the published Docker image 91 | if: ${{ github.event_name != 'pull_request' }} 92 | env: 93 | # https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-environment-variable 94 | TAGS: ${{ steps.meta.outputs.tags }} 95 | DIGEST: ${{ steps.build-and-push.outputs.digest }} 96 | # This step uses the identity token to provision an ephemeral certificate 97 | # against the sigstore community Fulcio instance. 98 | run: echo "${TAGS}" | xargs -I {} cosign sign --yes {}@${DIGEST} 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![banner](https://github.com/rly0nheart/tor2tor/assets/74001397/3ce19824-9414-4828-a770-081b0b0ae857) 2 | 3 | **Tor2Tor** scrapes a given onion link and captures screenshots of all links available on it. 
4 | 5 | ![Python](https://img.shields.io/badge/Python-14354C?style=flat&logo=python) 6 | ![Powershell](https://img.shields.io/badge/PowerShell-000000?style=flat&logo=powershell) 7 | ![Shell](https://img.shields.io/badge/Shell-121011?style=flat&logo=gnu-bash) 8 | ![Dockerfile](https://img.shields.io/badge/Dockerfile-grey.svg?style=flat&logo=docker) 9 | [![Docker](https://github.com/rly0nheart/tor2tor/actions/workflows/docker-publish.yml/badge.svg)](https://github.com/rly0nheart/tor2tor/actions/workflows/docker-publish.yml) 10 | [![CodeQL](https://github.com/rly0nheart/tor2tor/actions/workflows/codeql.yml/badge.svg)](https://github.com/rly0nheart/tor2tor/actions/workflows/codeql.yml) 11 | *** 12 | 13 | # Installation ⬇️ 14 | 15 | ## Note ⚠️ 16 | 17 | > This assumes the Firefox browser is installed on the user's machine. 18 | 19 | **1.** Clone the repository 20 | 21 | ```commandline 22 | git clone https://github.com/rly0nheart/tor2tor 23 | ``` 24 | 25 | **2.** Move to the tor2tor directory 26 | 27 | ```commandline 28 | cd tor2tor 29 | ``` 30 | 31 |
32 | 🐧 Linux 33 | 34 | Run the installation script 35 | > Assuming it has already been made executable with `sudo chmod +x install.sh` 36 | 37 | ```commandline 38 | sudo ./install.sh 39 | ``` 40 | 41 | The installation script will install `tor` then download and setup the latest version of `geckodriver`, and 42 | install `tor2tor` together with its dependencies (because we're all too lazy to manually do it) 43 | *** 44 |
45 | 46 |
47 | 🪟 Windows 48 | 49 | Run the powershell installation script 50 | 51 | ```powershell 52 | .\install.ps1 53 | ``` 54 | 55 | The installation script will download the `tor` bundle, `geckodriver`, and install `tor2tor` together with its 56 | dependencies. The downloads will be stored in the `tor2tor` directory. 57 |
58 | 59 |
60 | 🐋 Docker Image 61 | 62 | ## Note ⚠️ 63 | 64 | > This assumes you have docker installed and running 65 | 66 | You can just pull the docker image from [DockerHub](https://hub.docker.com/r/rly0nheart/tor2tor) by running: 67 | 68 | ```commandline 69 | docker pull rly0nheart/tor2tor 70 | ``` 71 | 72 | *** 73 |
74 | 75 | # Usage ⌨️ 76 | 77 |
78 | 🐧 Linux 79 | 80 | To see available options/usage, call *Tor2Tor* with the `-h/--help` flag 81 | 82 | ```commandline 83 | tor2tor --help 84 | ``` 85 | 86 | or 87 | 88 | ```commandline 89 | t2t --help 90 | ``` 91 | 92 | Calling it with an onion url should look like the following 93 | 94 | ```commandline 95 | sudo tor2tor http://example.onion 96 | ``` 97 | 98 | *** 99 | 100 |
101 | 102 |
103 | 🪟 Windows 104 | 105 | To see available options/usage, call *Tor2Tor* with the `-h/--help` flag 106 | 107 | ```commandline 108 | tor2tor --help 109 | ``` 110 | 111 | or 112 | 113 | ```commandline 114 | t2t --help 115 | ``` 116 | 117 | Calling it with an onion url should look like the following 118 | 119 | ```commandline 120 | tor2tor http://example.onion 121 | ``` 122 | 123 | *** 124 | 125 |
126 | 127 |
128 | 🐋 Docker Container 129 | 130 | The *Tor2Tor* container can be called with `docker run` like so: 131 | 132 | ```commandline 133 | docker run rly0nheart/tor2tor --help 134 | ``` 135 | 136 | Calling the container with an onion url should look like the following 137 | 138 | ```commandline 139 | docker run --tty --volume $PWD/tor2tor:/root/tor2tor rly0nheart/tor2tor http://example.onion 140 | ``` 141 | 142 | ## Note ⚠️ 143 | 144 | > --tty Allocates a pseudo-TTY, use it to enable the container to display colours (trust me, you will need this) 145 | >> --volume $PWD/tor2tor:/root/tor2tor Will mount the *tor2tor* directory from the container to your host machine's 146 | *tor2tor* directory. 147 | 148 | *** 149 |
150 | 151 | # Updating ⬆️ 152 | 153 |
154 | 🐧 Linux 155 | 156 | [*update.sh*](https://github.com/rly0nheart/tor2tor/blob/latest/update.sh) 157 | > Assuming it has already been made executable with `sudo chmod +x update.sh` 158 | 159 | Navigate to the `tor2tor` directory that you cloned and find the `update.sh` file. 160 | 161 | and run it 162 | 163 | ```commandline 164 | sudo ./update.sh 165 | ``` 166 | 167 | The script will pull the latest changes (if any are available) then rebuild and install the package. 168 | 169 | *** 170 | 171 |
172 | 173 |
174 | 🪟 Windows 175 | 176 | Navigate to the `tor2tor` directory that you cloned and find the `update.ps1` file. 177 | 178 | ```powershell 179 | .\update.ps1 180 | ``` 181 | 182 | The script will pull the latest changes (if any are available) then rebuild and install the package. 183 | 184 | *** 185 | 186 |
187 | 188 |
189 | 🐋 Docker Container 190 | 191 | As for the docker container, just run the docker pull command again. 192 | 193 | ```commandline 194 | docker pull rly0nheart/tor2tor 195 | ``` 196 | 197 | Calling the container with an onion url should look like the following 198 | 199 | ```commandline 200 | docker run --tty --volume $PWD/tor2tor:/root/tor2tor rly0nheart/tor2tor http://example.onion 201 | ``` 202 | 203 | ## Note ⚠️ 204 | 205 | > --tty Allocates a pseudo-TTY, use it to enable the container to display colours (trust me, you will need this) 206 | >> --volume $PWD/tor2tor:/root/tor2tor Will mount the *tor2tor* directory from the container to your host machine's 207 | *tor2tor* directory. 208 | 209 | *** 210 | 
211 | 212 | # Uninstalling ❌ 213 | 214 |
215 | 🐧 Linux 216 | 217 | ## Note ⚠️ 218 | 219 | > Assuming it has already been made executable with `sudo chmod +x uninstall.sh` 220 | 221 | Navigate to the `tor2tor` directory that you cloned and find the `uninstall.sh` file. 222 | 223 | Run it! 224 | 225 | ```commandline 226 | sudo ./uninstall.sh 227 | ``` 228 | 229 | This will uninstall `tor`, delete the `geckodriver` binary and uninstall `tor2tor` 230 | *** 231 |
232 | 233 |
234 | 🪟 Windows 235 | 236 | Navigate to the `tor2tor` directory that you cloned and find the `uninstall.ps1` file. 237 | 238 | Run it! 239 | 240 | ```powershell 241 | .\uninstall.ps1 242 | ``` 243 | 244 | This will delete the `geckodriver` and tor binaries then uninstall `tor2tor` 245 | *** 246 | 
247 | 248 |
249 | 🐋 Docker Container 250 | 251 | You can stop and remove any running container, then remove the downloaded image by running: 252 | 253 | ```commandline 254 | docker rmi rly0nheart/tor2tor 255 | ``` 256 | 257 | *** 258 | 
259 | 260 | # Important 🚧 261 | 262 | As you probably already know,Tor routes data via three relays (servers) for your privacy. 263 | As a result, connections become slower than an ordinary connection. 264 | 265 | ## Point ⚠️ 266 | 267 | Once you start **Tor2Tor**, give it at least 2 minutes tops to query the specified onion url and extract links from it. 268 | 269 | If you want to work around this, you can always just use a cloud shell service. 270 | 271 | # Screenshots 272 | 273 | ![tor2tor-archive](https://github.com/rly0nheart/tor2tor-archive/assets/74001397/759082c5-f5ea-4b25-80da-a756d182ae86) 274 | 275 | There's a dedicated repository of onion screenshots captured with **Tor2Tor** 276 | at [Tor2Tor Archive](https://github.com/rly0nheart/tor2tor-archive) 277 | 278 | ## CI/CD Workflow 🌊 279 | 280 | ### Docker Image Building 🐳 281 | 282 | - Pushing to or merging into the `latest` branch triggers an automatic build of the Docker image. 283 | - This image is tagged as `latest` on Docker Hub, indicating it's the most stable release. 284 | 285 | *** 286 | ![me](https://github.com/rly0nheart/tor2tor/assets/74001397/97bf7845-db43-4fd0-87bd-04e8b6b02e74) 287 | 288 | 289 | 290 | -------------------------------------------------------------------------------- /tor2tor/coreutils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import re 6 | import subprocess 7 | import time 8 | from datetime import datetime, date 9 | from urllib.parse import urlparse 10 | 11 | import requests 12 | from rich import print 13 | from rich.logging import RichHandler 14 | from rich.markdown import Markdown 15 | from rich.table import Table 16 | from rich_argparse import RichHelpFormatter 17 | 18 | from . 
import __author__, __version__ 19 | 20 | # Construct path to the user's home directory 21 | PROGRAM_DIRECTORY = os.path.expanduser(os.path.join("~", "tor2tor")) 22 | 23 | 24 | def load_settings() -> dict: 25 | """ 26 | Loads settings from /settings/settings.json 27 | 28 | :return: Dictionary (JSON) containing settings 29 | """ 30 | # Get the absolute path of the current file 31 | current_dir = os.path.dirname(os.path.abspath(__file__)) 32 | 33 | # Construct the path to the settings.json file 34 | settings_path = os.path.join(current_dir, "settings", "settings.json") 35 | 36 | # Load the settings from the file 37 | with open(settings_path) as file: 38 | data = json.load(file) 39 | 40 | return data 41 | 42 | 43 | def create_parser() -> argparse.ArgumentParser: 44 | from . import __version__, __epilog__, __description__ 45 | 46 | parser = argparse.ArgumentParser( 47 | description=Markdown(__description__, "argparse.text"), 48 | epilog=Markdown(__epilog__), 49 | formatter_class=RichHelpFormatter, 50 | ) 51 | parser.add_argument("onion", help="onion url to scrape") 52 | parser.add_argument( 53 | "--headless", 54 | help="run Firefox WebDriver instances in headless mode", 55 | action="store_true", 56 | ) 57 | parser.add_argument( 58 | "-l", "--limit", help="number of onion links to capture", type=int, default=10 59 | ) 60 | parser.add_argument( 61 | "-p", 62 | "--pool", 63 | help="size of the Firefox WebDriver instance pool (default: %(default)s)", 64 | type=int, 65 | default=3, 66 | ) 67 | parser.add_argument( 68 | "-t", 69 | "--threads", 70 | help="number of worker threads to run (default: %(default)s)", 71 | type=int, 72 | default=3, 73 | ) 74 | parser.add_argument( 75 | "--log-skipped", 76 | help="log skipped onions on output", 77 | dest="log_skipped", 78 | action="store_true", 79 | ) 80 | parser.add_argument( 81 | "-d", "--debug", help="run program in debug mode", action="store_true" 82 | ) 83 | parser.add_argument( 84 | "-v", 85 | "--version", 86 | version=f"Tor2Tor 
v{__version__} Copyright (c) 2023-{date.today().year} {__author__}", 87 | action="version", 88 | ) 89 | return parser 90 | 91 | 92 | def set_loglevel(debug_mode: bool) -> logging.getLogger: 93 | """ 94 | Configure and return a logging object with the specified log level. 95 | 96 | :param debug_mode: If True, the log level is set to "NOTSET". Otherwise, it is set to "INFO". 97 | :return: A logging object configured with the specified log level. 98 | """ 99 | logging.basicConfig( 100 | level="NOTSET" if debug_mode else "INFO", 101 | format="%(message)s", 102 | handlers=[ 103 | RichHandler(markup=True, log_time_format="%H:%M:%S", show_level=debug_mode) 104 | ], 105 | ) 106 | return logging.getLogger("Tor2Tor") 107 | 108 | 109 | def add_http_to_link(link: str) -> str: 110 | """ 111 | Adds 'http://' to the URL if it doesn't already start with 'http://' or 'https://'. 112 | 113 | :param link: The link to modify. 114 | :return: The modified URL. 115 | """ 116 | if not link.startswith(("http://", "https://")): 117 | return f"http://{link}" 118 | return link 119 | 120 | 121 | def is_valid_onion(url: str) -> bool: 122 | """ 123 | Uses a regex pattern to determine whether a given url is an onion service or not. 124 | 125 | :param url: The url to check. 126 | :return: True if the url matches the strict pattern criterion. False if it doesn't 127 | 128 | Regex Explanation 129 | ----------------- 130 | - ^ - Asserts the start of a string. 131 | - (http://|https://)? - Matches HTTP or HTTPS protocol in the string (optional). 132 | - (www\\.)? - Optionally matches the www. subdomain. 133 | - ([a-z2-7]{54,}d) - Matches 55 or more characters, where each can be a lowercase letter or a digit from 2 to 7, 134 | and ends with 'd'. 135 | - \\.onion - Matches .onion. 136 | - (/|$) - Matches either a forward slash or the end of the string. 
137 | """ 138 | if re.search(r"^(http://|https://)?(www\.)?([a-z2-7]{54,}d)\.onion(/|$)", url): 139 | return True 140 | else: 141 | return False 142 | 143 | 144 | def create_table(table_headers: list, table_title: str = "") -> Table: 145 | """ 146 | Creates a rich table with the given column headers. 147 | 148 | :param table_headers: The column headers to add to the Table. 149 | :param table_title: The title of the table (an empty string is the default tile). 150 | :returns: A table with added column headers. 151 | """ 152 | table = Table( 153 | title=table_title, 154 | title_style="italic", 155 | caption=f"{time.asctime()}", 156 | caption_style="italic", 157 | show_header=True, 158 | header_style="bold", 159 | highlight=True, 160 | ) 161 | for header in table_headers: 162 | table.add_column(header, style="dim" if header == "#" else "") 163 | return table 164 | 165 | 166 | def construct_output_name(url: str) -> str: 167 | """ 168 | Constructs an output name based on the network location part (netloc) of a given URL. 169 | 170 | :param url: The URL to parse. 171 | :return: The network location part (netloc) of the URL. 172 | """ 173 | parsed_url = urlparse(url) 174 | output_name = parsed_url.netloc 175 | return output_name 176 | 177 | 178 | def path_finder(url: str): 179 | """ 180 | Checks if the specified directories exist. 181 | If not, it creates them. 182 | """ 183 | os.makedirs( 184 | os.path.join(PROGRAM_DIRECTORY, construct_output_name(url=url)), exist_ok=True 185 | ) 186 | 187 | 188 | def convert_timestamp_to_datetime(timestamp: float) -> datetime: 189 | """ 190 | Converts a Unix timestamp to a datetime object. 191 | 192 | :param timestamp: The Unix timestamp to be converted, given as a float. 193 | :return: A datetime object. 194 | """ 195 | datetime_from_timestamp = datetime.fromtimestamp(timestamp) 196 | return datetime_from_timestamp 197 | 198 | 199 | def get_file_info(filename: str) -> tuple: 200 | """ 201 | Gets a given file's information. 
202 | 203 | :param filename: File to get info for. 204 | :return: A tuple containing the file's size and created time. 205 | """ 206 | file_size = os.path.getsize(filename=filename) 207 | 208 | created_time = convert_timestamp_to_datetime( 209 | timestamp=os.path.getmtime(filename=filename) 210 | ) 211 | 212 | return file_size, created_time 213 | 214 | 215 | def check_updates(): 216 | """ 217 | Checks the program's updates by comparing the current program version tag with the remote version tag from GitHub. 218 | """ 219 | response = requests.get( 220 | "https://api.github.com/repos/rly0nheart/tor2tor/releases/latest" 221 | ).json() 222 | remote_version = response.get("tag_name") 223 | 224 | if remote_version != __version__: 225 | log.info( 226 | f"Tor2Tor version {remote_version} published at {response.get('published_at')} " 227 | f"is available. Run the 'update.sh' " 228 | f"script (for local installation) or re-pull the image (for docker container) " 229 | f"with 'docker pull rly0nheart/tor2tor' to get the updates. " 230 | ) 231 | release_notes = Markdown(response.get("body")) 232 | print(release_notes) 233 | print("\n") 234 | 235 | 236 | def tor_service(command: str): 237 | """ 238 | Starts/Stops the Tor service based on the provided command and operating system. 239 | 240 | This function can start or stop the Tor service on Windows and Unix-like 241 | systems. On Windows, it looks for Tor\\tor\\tor.exe in the user's home directory. 242 | 243 | :param command: The command to manage the Tor service. Acceptable values are "start" or "stop". 244 | :raise: subprocess.CalledProcessError If the subprocess fails to execute. 
245 | """ 246 | 247 | if command not in ["start", "stop"]: 248 | log.warning("Command must be either 'start' or 'stop'") 249 | 250 | try: 251 | if os.name == "nt": 252 | tor_path = os.path.join(PROGRAM_DIRECTORY, load_settings().get("tor.exe")) 253 | 254 | if command == "start": 255 | log.info(f"Starting {tor_path}...") 256 | subprocess.Popen(tor_path) 257 | else: 258 | subprocess.Popen("taskkill /IM tor.exe /F") 259 | 260 | else: 261 | subprocess.run(["service", "tor", command]) 262 | 263 | except subprocess.CalledProcessError as e: 264 | print(f"Failed to {command} the Tor service: {e}") 265 | 266 | 267 | args = create_parser().parse_args() 268 | log = set_loglevel(debug_mode=args.debug) 269 | -------------------------------------------------------------------------------- /tor2tor/tor2tor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import time 5 | from datetime import datetime 6 | from queue import Queue 7 | from threading import Lock, Thread 8 | 9 | import requests 10 | from rich import print 11 | from rich.table import Table 12 | from bs4 import BeautifulSoup 13 | from selenium import webdriver 14 | from selenium.webdriver.firefox.options import Options 15 | 16 | from . 
import __version__
from .coreutils import (
    log,
    args,
    tor_service,
    create_table,
    load_settings,
    get_file_info,
    is_valid_onion,
    PROGRAM_DIRECTORY,
    add_http_to_link,
    construct_output_name,
    convert_timestamp_to_datetime,
    check_updates,
)


class Tor2Tor:
    """Scrapes a given onion service and captures screenshots of the onions it links to."""

    def __init__(self):
        # Initialise locks for logging and table updates
        self.log_lock = Lock()
        self.table_lock = Lock()

        # Initialise queues for storing captured and skipped onions
        self.captured_onions_queue = Queue()
        self.skipped_onions_queue = Queue()

        # Initialise tor proxy settings.
        # FIX: read the settings file once instead of re-loading it for every key.
        socks_settings = load_settings().get("proxy").get("socks5")
        self.socks_host = socks_settings.get("host")
        self.socks_port = socks_settings.get("port")
        self.socks_type = socks_settings.get("type")
        self.socks_version = socks_settings.get("version")

    def firefox_options(self, instance_index: int) -> Options:
        """
        Configure Firefox options for web scraping with a headless browser and Tor network settings.

        :param instance_index: Index of the opened WebDriver instance in the firefox_pool.
        :returns: A Selenium WebDriver Options object with preset configurations.
        """
        options = Options()
        options.add_argument("--incognito")
        if args.headless:
            options.add_argument("--headless")
            log.info(f"Running headless on WebDriver instance {instance_index}...")
        options.set_preference("network.proxy.type", self.socks_type)
        options.set_preference("network.proxy.socks", self.socks_host)  # e.g. "127.0.0.1"
        options.set_preference("network.proxy.socks_port", self.socks_port)
        options.set_preference("network.proxy.socks_version", self.socks_version)
        # Resolve DNS through the proxy and allow .onion lookups.
        options.set_preference("network.proxy.socks_remote_dns", True)
        options.set_preference("network.dns.blockDotOnion", False)
        return options

    def open_firefox_pool(self, pool_size: int) -> Queue:
        """
        Initializes a queue of Firefox WebDriver instances for future use.

        :param pool_size: The number of Firefox instances to create.
        :return: A queue containing the created Firefox instances.
        """
        # Initialize a new queue to hold the Firefox instances.
        pool = Queue()

        log.info(f"Opening WebDriver pool with {pool_size} instances...")

        # Populate the pool with Firefox instances. (The unused loop variable
        # from enumerate(range(...)) has been removed.)
        for instance_index in range(1, pool_size + 1):
            driver = webdriver.Firefox(
                options=self.firefox_options(instance_index=instance_index),
            )
            pool.put(driver)

        return pool

    @staticmethod
    def close_firefox_pool(pool: Queue):
        """
        Closes all the Firefox instances in the pool.

        :param pool: The pool containing Firefox WebDriver instances to close.
        """
        log.info("Closing WebDriver pool...")
        while not pool.empty():
            driver = pool.get()
            driver.quit()

    def worker(self, tasks_queue: Queue, screenshots_table: Table, firefox_pool: Queue):
        """
        Worker function to capture screenshots of websites.

        This function is intended to be used as a target for a Thread. It captures screenshots
        of websites as tasks are fed via the queue. The function borrows a Firefox instance from
        the pool for each task and returns it after the task is complete.

        :param tasks_queue: The queue containing tasks (websites to capture).
        :param screenshots_table: A table where captured screenshot metadata is stored.
        :param firefox_pool: The pool of Firefox WebDriver instances.
        """
        # Continue working as long as the queue is not empty
        while not tasks_queue.empty():
            onion_index = None
            driver = None
            onion = None
            got_task = False
            try:
                # Get a new task from the queue
                onion_index, onion = tasks_queue.get()
                got_task = True

                # Borrow a Firefox instance from the pool
                driver = firefox_pool.get()

                # Capture the screenshot
                self.capture_onion(
                    onion_url=onion,
                    onion_index=onion_index,
                    driver=driver,
                    screenshots_table=screenshots_table,
                )
                self.captured_onions_queue.put(
                    (
                        onion_index,
                        onion,
                        convert_timestamp_to_datetime(timestamp=time.time()),
                    )
                )

            except KeyboardInterrupt:
                log.warning("User interruption detected ([yellow]Ctrl+C[/])")
                sys.exit()
            except Exception as e:
                if args.log_skipped:
                    log.error(f"{onion_index} [yellow]{e}[/]")

                # Add the skipped onion index, the onion itself, the reason it
                # was skipped, and the time it was skipped
                self.skipped_onions_queue.put(
                    (
                        onion_index,
                        onion,
                        f"[yellow]{e}[/]",
                        convert_timestamp_to_datetime(timestamp=time.time()),
                    )
                )
            finally:
                # FIX: return the Firefox instance only if one was actually
                # borrowed (the previous code could push a stale/None driver
                # back into the pool), and only mark the task done if it was
                # actually dequeued.
                if driver is not None:
                    firefox_pool.put(driver)
                if got_task:
                    tasks_queue.task_done()

    def execute_worker(
        self,
        worker_threads: int,
        tasks_queue: Queue,
        screenshots_table: Table,
        firefox_pool: Queue,
    ):
        """
        Executes the worker method.

        :param worker_threads: Number of threads to execute the worker with.
        :param tasks_queue: The queue containing tasks (websites to capture).
        :param screenshots_table: The table where captured screenshots will be added.
        :param firefox_pool: A pool containing n number of firefox instances.
        """
        # Initialize threads
        threads = []
        for _ in range(worker_threads):  # create 3 (default) worker threads
            t = Thread(
                target=self.worker, args=(tasks_queue, screenshots_table, firefox_pool)
            )
            t.start()
            threads.append(t)

        # Wait for all threads to finish
        for thread in threads:
            thread.join()

    def get_onion_response(self, onion_url: str) -> BeautifulSoup:
        """
        Fetches the HTML content of a given onion link using a SOCKS5 proxy.

        :param onion_url: The onion URL to fetch the content from.
        :return: A BeautifulSoup object containing the parsed HTML content.
        """

        # Define the SOCKS5 proxy settings (socks5h resolves DNS via the proxy).
        proxies = {
            "http": f"socks5h://{self.socks_host}:{self.socks_port}",
            "https": f"socks5h://{self.socks_host}:{self.socks_port}",
        }

        # Perform the HTTP GET request.
        # FIX: Tor is slow so the timeout is generous, but without one a dead
        # onion would hang the scraper forever.
        response = requests.get(onion_url, proxies=proxies, timeout=120)

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")

        return soup

    def get_onions_on_page(self, onion_url: str) -> list:
        """
        Scrapes a given onion URL and extracts all valid URLs found in tags.

        :param onion_url: The onion URL to scrape.
        :return: A list of valid URLs found on the page.

        Regex Explanation:
        -----------------
        - `https?`: Matches either 'http' or 'https'.
        - `://`: Matches the '://' that follows the protocol.
        - `\\S+`: Matches one or more non-whitespace characters.
        """

        # Initialize an empty list to store valid URLs
        valid_onions = []

        # Fetch the page content
        page_content = self.get_onion_response(onion_url=onion_url)

        # Define the regex pattern to match URLs
        url_pattern = re.compile(r"https?://\S+")

        # Find all anchor tags in the HTML content
        found_onions = page_content.find_all("a")

        # Loop through each tag and extract the href attribute
        for onion_index, onion in enumerate(found_onions, start=1):
            href = onion.get("href")
            # Check if the 'href' attribute exists and is not None
            if href:
                # Find all URLs in the 'href' attribute using the regex pattern
                urls = url_pattern.findall(href)
                # Loop through each URL found in the 'href' attribute
                for url in urls:
                    # Check if the URL is a valid Onion URL
                    if is_valid_onion(url):
                        # Append the valid Onion URL to the list of valid_onions
                        valid_onions.append(url)

        log.info(f"Found {len(valid_onions)} links on {onion_url}")
        return valid_onions

    def capture_onion(
        self, onion_url: str, onion_index, driver: webdriver, screenshots_table: Table
    ):
        """
        Captures a screenshot of a given onion link using a webdriver.

        :param onion_url: The onion URL to capture.
        :param onion_index: The index of the onion link in a list or sequence.
        :param driver: The webdriver instance to use for capturing the screenshot.
        :param screenshots_table: Table to add captured screenshots to.
        """

        # Construct the directory name based on the URL
        directory_name = construct_output_name(url=args.onion)

        # Add HTTP to the URL if it's not already there
        validated_onion_link = add_http_to_link(link=onion_url)

        # Construct the filename for the screenshot from the onion link
        filename = construct_output_name(url=validated_onion_link) + ".png"

        # Construct the full file path
        file_path = os.path.join(PROGRAM_DIRECTORY, directory_name, filename)

        # Log the onion link being captured
        log.info(f"{onion_index} Capturing... {validated_onion_link}")

        # Navigate to the URL
        driver.get(validated_onion_link)

        if os.path.exists(path=file_path):
            # FIX: log the actual filename instead of the broken "(unknown)"
            # placeholder text.
            log.info(f"{onion_index} [yellow][italic]{filename}[/][/] already exists.")
        else:
            # Take a full screenshot of the onion and save it to the given file path
            driver.save_full_page_screenshot(file_path)

            with self.log_lock:
                # Log the successful capture with a clickable file link
                log.info(
                    f"{onion_index} [dim]{driver.title}[/] - "
                    f"[yellow][italic][link file://{file_path}]{filename}[/][/]"
                )

            with self.table_lock:
                # Add screenshot info to the Table
                file_size, created_time = get_file_info(filename=file_path)
                screenshots_table.add_row(
                    str(onion_index),
                    filename,
                    str(file_size),
                    str(created_time),
                )

    def execute_scraper(
        self,
        target_onion: str,
        pool_size: int,
        worker_threads: int,
    ):
        """
        Executes the scraper code.

        :param target_onion: The onion to scrape.
        :param pool_size: Size of the WebDriver instance pool (default is 3).
        :param worker_threads: Number of threads.
        """
        firefox_pool = None

        start_time = datetime.now()
        log.info(f"Starting 🧅Tor2Tor {__version__} {start_time}...")

        try:
            check_updates()

            tor_service(command="start")  # Start the Tor service.

            # Fetch onion URLs from the provided URL
            onions = self.get_onions_on_page(
                onion_url=add_http_to_link(link=target_onion)
            )

            firefox_pool = self.open_firefox_pool(pool_size=pool_size)

            # Create a table where captured screenshots will be displayed
            screenshots_table = create_table(
                table_title="Screenshots",
                table_headers=["#", "filename", "size (bytes)", "timestamp"],
            )

            # Initialize Queue and add tasks
            tasks_queue = Queue()

            for onion_index, onion in enumerate(onions, start=1):
                tasks_queue.put((onion_index, onion))

                if onion_index == args.limit:
                    # If onion index is equal to the limit set in -l/--limit, break the loop.
                    break

            self.execute_worker(
                worker_threads=worker_threads,
                tasks_queue=tasks_queue,
                screenshots_table=screenshots_table,
                firefox_pool=firefox_pool,
            )

            log.info("DONE!\n")

            # Print table showing captured screenshots
            print(screenshots_table)
            print("\n")

            # Print the summary tables for captured and skipped onions
            captured_onions, skipped_onions = self.onion_summary_tables(
                captured_onions=list(self.captured_onions_queue.queue),
                skipped_onions=list(self.skipped_onions_queue.queue),
            )

            log.info(f"{len(self.captured_onions_queue.queue)} onions captured.")
            print(captured_onions)

            log.info(f"{len(self.skipped_onions_queue.queue)} onions skipped.")
            print(skipped_onions)

        except KeyboardInterrupt:
            log.warning("User Interruption detected ([yellow]Ctrl+C[/])")
            sys.exit()
        except Exception as e:
            log.error(f"An error occurred: [red]{e}[/]")
            sys.exit()
        finally:
            if firefox_pool is not None:
                self.close_firefox_pool(pool=firefox_pool)

            tor_service(command="stop")  # Stop the Tor service.
            log.info(f"Stopped in {datetime.now() - start_time} seconds.")

    @staticmethod
    def onion_summary_tables(
        captured_onions: list,
        skipped_onions: list,
    ) -> tuple:
        """
        Creates tables showing a summary of captured and skipped onions.

        Note
        ----
        - The index value in the loops, holds the index of the onion in the captured/skipped onions lists
        - And the *_onion[0] holds the index of the onion from the scraper task.

        :param captured_onions: A list of tuples, each containing the captured onion url and its index
            from the scraper task.
        :param skipped_onions: A list of tuples,
            each containing the skipped onion url and its index from the scraper task.
        :returns: A tuple containing the captured and skipped onions tables:
            (captured_onions_table, skipped_onions_table).
        """

        # Create a table of captured onions
        captured_onions_table = create_table(
            table_headers=["#", "index", "onion", "timestamp"],
        )
        for index, captured_onion in enumerate(captured_onions, start=1):
            captured_onions_table.add_row(
                str(index),  # Index of the onion from the captured_onions list
                str(captured_onion[0]),  # Index of the onion from the scraping task
                str(captured_onion[1]),  # Onion url
                str(captured_onion[2]),  # Time the onion was captured
            )

        # Create a table of skipped onions
        skipped_onions_table = create_table(
            table_headers=["#", "index", "onion", "reason", "timestamp"],
        )
        for index, skipped_onion in enumerate(skipped_onions, start=1):
            skipped_onions_table.add_row(
                str(index),  # Index of the onion from the skipped_onions list
                str(skipped_onion[0]),  # Index of the onion from the scraping task
                str(skipped_onion[1]),  # Onion url
                str(skipped_onion[2]),  # Reason the onion was skipped
                str(skipped_onion[3]),  # Time the onion was skipped
            )

        return captured_onions_table, skipped_onions_table