├── tor2tor ├── settings │ └── settings.json ├── __init__.py ├── main.py ├── coreutils.py └── tor2tor.py ├── uninstall.sh ├── Dockerfile ├── install.sh ├── update.sh ├── update.ps1 ├── .github ├── FUNDING.yml └── workflows │ ├── codeql.yml │ └── docker-publish.yml ├── uninstall.ps1 ├── LICENSE ├── pyproject.toml ├── install.ps1 ├── .gitignore └── README.md /tor2tor/settings/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "proxy": { 3 | "socks5": { 4 | "host": "localhost", 5 | "port": 9050, 6 | "type": 1, 7 | "version": 5 8 | } 9 | }, 10 | "tor.exe": "Tor\\tor\\tor.exe" 11 | } 12 | -------------------------------------------------------------------------------- /tor2tor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = "Richard Mwewa" 2 | __about__ = "https://rly0nheart.github.io" 3 | __version__ = "0.15.0" 4 | __description__ = f"**Tor2Tor** - by [{__author__}]({__about__})" 5 | __epilog__ = "**Tor2Tor** scrapes a given onion link and captures screenshots of all links available on it." 6 | -------------------------------------------------------------------------------- /uninstall.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Remove the geckodriver binary from /usr/bin 4 | rm /usr/bin/geckodriver -v 5 | 6 | # Uninstall tor and its configuration files including unused packages (I might be doing you a favour by removing unused packages haha) 7 | apt remove tor --autoremove --purge -y 8 | 9 | # Uninstall tor2tor 10 | pip3 uninstall tor2tor -y -v 11 | echo "Cleanup complete." 12 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:latest 2 | 3 | WORKDIR /app 4 | 5 | COPY . . 
6 | 7 | RUN apt-get update && \ 8 | apt-get install -y --no-install-recommends \ 9 | ca-certificates \ 10 | curl \ 11 | firefox-esr \ 12 | tor && \ 13 | rm -fr /var/lib/apt/lists/* 14 | 15 | RUN curl -L https://github.com/mozilla/geckodriver/releases/download/v0.34.0/geckodriver-v0.34.0-linux64.tar.gz | \ 16 | tar xz -C /usr/bin && \ 17 | apt-get purge -y \ 18 | ca-certificates \ 19 | curl 
# NOTE(review): ca-certificates is purged above, before `pip install .` runs below. pip ships its own CA bundle (certifi), so the install may still succeed, but anything else in the final image that relies on system TLS certificates (e.g. curl/HTTPS at runtime) will fail verification — confirm this purge is intended and that no runtime component needs system certs.
20 | 21 | RUN pip install --upgrade pip && pip install . 22 | 23 | ENTRYPOINT ["tor2tor"] 24 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install the Tor package without installing recommended packages. 4 | apt-get update && apt-get install -y --no-install-recommends tor 5 | 6 | # Download geckodriver .tar.gz file and pipe it to 'tar' to extract the geckodriver binary directly into /usr/bin. 7 | curl -L https://github.com/mozilla/geckodriver/releases/download/v0.34.0/geckodriver-v0.34.0-linux64.tar.gz | \ 8 | tar xz -C /usr/bin 9 | 10 | # Install Python packages defined in the current directory's setup.py/pyproject.toml file. (pyproject.toml in this case) 11 | pip3 install . 12 | echo "Setup complete." 13 | -------------------------------------------------------------------------------- /update.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if the directory has a .git folder 4 | if [ -d ".git" ]; then 5 | # Fetch the latest updates from the https://github.com/rly0nheart/tor2tor 6 | git fetch origin 7 | 8 | # Compare local and remote branches 9 | LOCAL=$(git rev-parse @) 10 | REMOTE=$(git rev-parse @{u}) 11 | 12 | # Check if the local repository is up-to-date 13 | if [ $LOCAL = $REMOTE ]; then 14 | echo "Tor2Tor is up-to-date." 15 | else 16 | echo "Pulling the latest changes. Please wait..." 
17 | git pull 18 | 19 | # Install tor2tor after pulling the updates 20 | pip3 install . 21 | echo "Update complete." 22 | fi 23 | else 24 | echo "Current directory is not a Git repository." 25 | fi 26 | -------------------------------------------------------------------------------- /update.ps1: -------------------------------------------------------------------------------- 1 | # Check if the directory has a .git folder 2 | if (Test-Path ".git") { 3 | 4 | # Fetch the latest updates 5 | git fetch origin 6 | 7 | # Compare local and remote branches 8 | $LOCAL = git rev-parse @ 9 | $REMOTE = git rev-parse "@{u}" 10 | 11 | # Check if the local repository is up-to-date 12 | if ($LOCAL -eq $REMOTE) { 13 | Write-Host "Tor2Tor is up-to-date." 14 | } else { 15 | Write-Host "Pulling the latest changes. Please wait..." 16 | git pull 17 | 18 | # Install tor2tor after pulling the updates 19 | pip3 install . 20 | Write-Host "Update complete." 21 | } 22 | } else { 23 | Write-Host "Current directory is not a Git repository." 24 | } 25 | -------------------------------------------------------------------------------- /tor2tor/main.py: -------------------------------------------------------------------------------- 1 | from .tor2tor import log, args, Tor2Tor 2 | from .coreutils import ( 3 | path_finder, 4 | is_valid_onion, 5 | ) 6 | 7 | 8 | def execute_tor2tor(): 9 | target_onion = args.onion 10 | tor2tor = Tor2Tor() 11 | 12 | if is_valid_onion(url=target_onion): 13 | print(""" 14 | ┏┳┓ ┏┳┓ 15 | ┃ ┏┓┏┓┓ ┃ ┏┓┏┓ 16 | ┻ ┗┛┛ ┗ ┻ ┗┛┛ """ 17 | ) 18 | path_finder( 19 | url=target_onion 20 | ) # Create a directory with the onion link as the name. 
21 | 22 | tor2tor.execute_scraper( 23 | target_onion=target_onion, 24 | pool_size=args.pool, 25 | worker_threads=args.threads, 26 | ) 27 | 28 | else: 29 | log.warning(f"{target_onion} does not seem to be a valid onion.") 30 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 13 | buy_me_a_coffee: rly0nheart -------------------------------------------------------------------------------- /uninstall.ps1: -------------------------------------------------------------------------------- 1 | # Define target directories for removal 2 | $torDir = "$env:USERPROFILE\tor2tor\Tor" 3 | $geckoDir = "$env:USERPROFILE\tor2tor\GeckoDriver" 4 | 5 | # Function to remove directory 6 | function RemoveDir([string]$dirPath) { 7 | if (Test-Path $dirPath) { 8 | Remove-Item -Path $dirPath -Recurse -Force 9 | Write-Host "Removed directory: $dirPath" 10 | } else { 11 | Write-Host "Directory $dirPath does not exist." 
12 | } 13 | } 14 | 15 | # Remove Tor directory 16 | RemoveDir $torDir 17 | 18 | # Remove GeckoDriver directory 19 | RemoveDir $geckoDir 20 | 21 | # Remove the geckodriver directory from PATH 22 | $pathEnv = [Environment]::GetEnvironmentVariable("PATH", [EnvironmentVariableTarget]::User) 23 | $newPath = ($pathEnv -split ";" | Where-Object { $_ -ne $geckoDir }) -join ";" 24 | [Environment]::SetEnvironmentVariable("PATH", $newPath, [EnvironmentVariableTarget]::User) 25 | Write-Host "Removed GeckoDriver directory from PATH." 26 | 27 | # Uninstall tor2tor Python package 28 | pip uninstall tor2tor -y 29 | 30 | Write-Host "Cleanup complete." 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Richard Mwewa 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["poetry-core"] 3 | build-backend = "poetry.core.masonry.api" 4 | 5 | [tool.poetry] 6 | name = "tor2tor" 7 | version = "0.15.0" 8 | description = "Capture screenshots of onion services on an onion service." 9 | authors = ["Richard Mwewa "] 10 | readme = "README.md" 11 | license = "MIT License" 12 | homepage = "https://hub.docker.com/r/rly0nheart/tor2tor" 13 | repository = "https://github.com/rly0nheart/tor2tor" 14 | documentation = "https://github.com/rly0nheart/tor2tor/blob/latest/README.md" 15 | classifiers = [ 16 | "License :: OSI Approved :: MIT License", 17 | "Development Status :: 5 - Production/Stable", 18 | "Intended Audience :: Information Technology", 19 | "Natural Language :: English", 20 | "Operating System :: POSIX :: Linux", 21 | "Programming Language :: Python :: 3" 22 | ] 23 | 24 | [tool.poetry.dependencies] 25 | python = "^3.10" 26 | rich = "*" 27 | requests = "*" 28 | rich-argparse = "*" 29 | selenium = "*" 30 | BeautifulSoup4 = "*" 31 | 32 | [tool.poetry.scripts] 33 | t2t = "tor2tor.main:execute_tor2tor" 34 | tor2tor = "tor2tor.main:execute_tor2tor" 35 | -------------------------------------------------------------------------------- /install.ps1: -------------------------------------------------------------------------------- 1 | # Define URLs for Tor Expert Bundle and GeckoDriver 2 | $torURL = "https://archive.torproject.org/tor-package-archive/torbrowser/13.0/tor-expert-bundle-13.0-windows-x86_64.tar.gz" 3 | $geckoURL = "https://github.com/mozilla/geckodriver/releases/download/v0.34.0/geckodriver-v0.34.0-win64.zip" 4 | 5 | # Define target directories for installation 6 | $torDir = "$env:USERPROFILE\tor2tor\Tor" 7 | $geckoDir = "$env:USERPROFILE\tor2tor\GeckoDriver" 8 | 9 | # Function to download a file 10 | function 
DownloadFile([string]$url, [string]$path) { 11 | Invoke-WebRequest -Uri $url -OutFile $path 12 | } 13 | 14 | # Check if Tor directory exists, if not create and download 15 | if (-Not (Test-Path $torDir)) { 16 | New-Item -Path $torDir -ItemType Directory 17 | Write-Host "Downloading Tor..." 18 | DownloadFile $torURL "$torDir\tor.tar.gz" 19 | 20 | # Unpacking the Tor archive 21 | tar -xf "$torDir\tor.tar.gz" -C $torDir 22 | Remove-Item "$torDir\tor.tar.gz" 23 | } 24 | 25 | # Check if GeckoDriver directory exists, if not create and download 26 | if (-Not (Test-Path $geckoDir)) { 27 | New-Item -Path $geckoDir -ItemType Directory 28 | Write-Host "Downloading GeckoDriver..." 29 | DownloadFile $geckoURL "$geckoDir\geckodriver.zip" 30 | 31 | # Unzipping the GeckoDriver 32 | Expand-Archive -Path "$geckoDir\geckodriver.zip" -DestinationPath $geckoDir 33 | Remove-Item "$geckoDir\geckodriver.zip" 34 | } 35 | 36 | # Add the geckodriver directory to PATH 37 | [Environment]::SetEnvironmentVariable("PATH", [Environment]::GetEnvironmentVariable("PATH", [EnvironmentVariableTarget]::User) + ";$geckoDir", [EnvironmentVariableTarget]::User) 38 | 39 | pip install . 40 | Write-Host "Setup complete." 41 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "latest" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "latest" ] 20 | schedule: 21 | - cron: '0 0 * * *' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | # Runner size impacts CodeQL analysis time. To learn more, please see: 27 | # - https://gh.io/recommended-hardware-resources-for-running-codeql 28 | # - https://gh.io/supported-runners-and-hardware-resources 29 | # - https://gh.io/using-larger-runners 30 | # Consider using larger runners for possible analysis time improvements. 31 | runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} 32 | timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }} 33 | permissions: 34 | actions: read 35 | contents: read 36 | security-events: write 37 | 38 | strategy: 39 | fail-fast: false 40 | matrix: 41 | language: [ 'python' ] 42 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby', 'swift' ] 43 | # Use only 'java' to analyze code written in Java, Kotlin or both 44 | # Use only 'javascript' to analyze code written in JavaScript, TypeScript or both 45 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 46 | 47 | steps: 48 | - name: Checkout repository 49 | uses: actions/checkout@v3 50 | 51 | # Initializes the CodeQL tools for scanning. 52 | - name: Initialize CodeQL 53 | uses: github/codeql-action/init@v2 54 | with: 55 | languages: ${{ matrix.language }} 56 | # If you wish to specify custom queries, you can do so here or in a config file. 57 | # By default, queries listed here will override any specified in a config file. 58 | # Prefix the list here with "+" to use these queries and those in the config file. 
59 | 60 | # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 61 | # queries: security-extended,security-and-quality 62 | 63 | 64 | # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift). 65 | # If this step fails, then you should remove it and run the build manually (see below) 66 | - name: Autobuild 67 | uses: github/codeql-action/autobuild@v2 68 | 69 | # ℹ️ Command-line programs to run using the OS shell. 70 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 71 | 72 | # If the Autobuild fails above, remove it and uncomment the following three lines. 73 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 74 | 75 | # - run: | 76 | # echo "Run, Build Application using script" 77 | # ./location_of_script_within_repo/buildscript.sh 78 | 79 | - name: Perform CodeQL Analysis 80 | uses: github/codeql-action/analyze@v2 81 | with: 82 | category: "/language:${{matrix.language}}" 83 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /.github/workflows/docker-publish.yml: -------------------------------------------------------------------------------- 1 | name: Docker 2 | 3 | # This workflow uses actions that are not certified by GitHub. 4 | # They are provided by a third-party and are governed by 5 | # separate terms of service, privacy policy, and support 6 | # documentation. 7 | 8 | on: 9 | schedule: 10 | - cron: '0 0 * * *' 11 | push: 12 | branches: [ "latest" ] 13 | # Publish semver tags as releases. 
14 | tags: [ 'v*.*.*' ] 15 | pull_request: 16 | branches: [ "latest" ] 17 | 18 | env: 19 | # Use docker.io for Docker Hub if empty 20 | REGISTRY: docker.io 21 | # github.repository as / 22 | IMAGE_NAME: ${{ github.repository }} 23 | 24 | 25 | jobs: 26 | build: 27 | 28 | runs-on: ubuntu-latest 29 | permissions: 30 | contents: read 31 | packages: write 32 | # This is used to complete the identity challenge 33 | # with sigstore/fulcio when running outside of PRs. 34 | id-token: write 35 | 36 | steps: 37 | - name: Checkout repository 38 | uses: actions/checkout@v3 39 | 40 | # Install the cosign tool except on PR 41 | # https://github.com/sigstore/cosign-installer 42 | - name: Install cosign 43 | if: github.event_name != 'pull_request' 44 | uses: sigstore/cosign-installer@v3.4.0 45 | with: 46 | cosign-release: 'v2.2.3' 47 | 48 | # Set up BuildKit Docker container builder to be able to build 49 | # multi-platform images and export cache 50 | # https://github.com/docker/setup-buildx-action 51 | - name: Set up Docker Buildx 52 | uses: docker/setup-buildx-action@f95db51fddba0c2d1ec667646a06c2ce06100226 # v3.0.0 53 | 54 | # Login against a Docker registry except on PR 55 | # https://github.com/docker/login-action 56 | - name: Log into registry ${{ env.REGISTRY }} 57 | if: github.event_name != 'pull_request' 58 | uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0 59 | with: 60 | registry: ${{ env.REGISTRY }} 61 | username: ${{ github.actor }} 62 | password: ${{ secrets.DOCKER_ACCESS_TOKEN }} 63 | 64 | # Extract metadata (tags, labels) for Docker 65 | # https://github.com/docker/metadata-action 66 | - name: Extract Docker metadata 67 | id: meta 68 | uses: docker/metadata-action@96383f45573cb7f253c731d3b3ab81c87ef81934 # v5.0.0 69 | with: 70 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} 71 | 72 | # Build and push Docker image with Buildx (don't push on PR) 73 | # https://github.com/docker/build-push-action 74 | - name: Build and push Docker image 
75 | id: build-and-push 76 | uses: docker/build-push-action@0565240e2d4ab88bba5387d719585280857ece09 # v5.0.0 77 | with: 78 | context: . 79 | push: ${{ github.event_name != 'pull_request' }} 80 | tags: ${{ steps.meta.outputs.tags }} 81 | labels: ${{ steps.meta.outputs.labels }} 82 | cache-from: type=gha 83 | cache-to: type=gha,mode=max 84 | 85 | # Sign the resulting Docker image digest except on PRs. 86 | # This will only write to the public Rekor transparency log when the Docker 87 | # repository is public to avoid leaking data. If you would like to publish 88 | # transparency data even for private images, pass --force to cosign below. 89 | # https://github.com/sigstore/cosign 90 | - name: Sign the published Docker image 91 | if: ${{ github.event_name != 'pull_request' }} 92 | env: 93 | # https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-environment-variable 94 | TAGS: ${{ steps.meta.outputs.tags }} 95 | DIGEST: ${{ steps.build-and-push.outputs.digest }} 96 | # This step uses the identity token to provision an ephemeral certificate 97 | # against the sigstore community Fulcio instance. 98 | run: echo "${TAGS}" | xargs -I {} cosign sign --yes {}@${DIGEST} 99 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![banner](https://github.com/rly0nheart/tor2tor/assets/74001397/3ce19824-9414-4828-a770-081b0b0ae857) 2 | 3 | **Tor2Tor** scrapes a given onion link and captures screenshots of all links available on it. 
4 | 5 | ![Python](https://img.shields.io/badge/Python-14354C?style=flat&logo=python) 6 | ![Powershell](https://img.shields.io/badge/PowerShell-000000?style=flat&logo=powershell) 7 | ![Shell](https://img.shields.io/badge/Shell-121011?style=flat&logo=gnu-bash) 8 | ![Dockerfile](https://img.shields.io/badge/Dockerfile-grey.svg?style=flat&logo=docker) 9 | [![Docker](https://github.com/rly0nheart/tor2tor/actions/workflows/docker-publish.yml/badge.svg)](https://github.com/rly0nheart/tor2tor/actions/workflows/docker-publish.yml) 10 | [![CodeQL](https://github.com/rly0nheart/tor2tor/actions/workflows/codeql.yml/badge.svg)](https://github.com/rly0nheart/tor2tor/actions/workflows/codeql.yml) 11 | *** 12 | 13 | # Installation ⬇️ 14 | 15 | ## Note ⚠️ 16 | 17 | > This assumes the Firefox browser is installed on the user's machine. 18 | 19 | **1.** Clone the repository 20 | 21 | ```commandline 22 | git clone https://github.com/rly0nheart/tor2tor 23 | ``` 24 | 25 | **2.** Move to the tor2tor directory 26 | 27 | ```commandline 28 | cd tor2tor 29 | ``` 30 | 31 |
32 | 🐧 Linux 33 | 34 | Run the installation script 35 | > Assuming it has already been made executable with `sudo chmod +x install.sh` 36 | 37 | ```commandline 38 | sudo ./install.sh 39 | ``` 40 | 41 | The installation script will install `tor` then download and setup the latest version of `geckodriver`, and 42 | install `tor2tor` together with its dependencies (because we're all too lazy to manually do it) 43 | *** 44 |
45 | 46 |
47 | 🪟 Windows 48 | 49 | Run the powershell installation script 50 | 51 | ```powershell 52 | .\install.ps1 53 | ``` 54 | 55 | The installation script will download the `tor` bundle, `geckodriver`, and install `tor2tor` together with its 56 | dependencies. The downloads will be stored in the `tor2tor` directory. 57 |
58 | 59 |
60 | 🐋 Docker Image 61 | 62 | ## Note ⚠️ 63 | 64 | > This assumes you have docker installed and running 65 | 66 | You can just pull the docker image from [DockerHub](https://hub.docker.com/r/rly0nheart/tor2tor) by running: 67 | 68 | ```commandline 69 | docker pull rly0nheart/tor2tor 70 | ``` 71 | 72 | *** 73 |
74 | 75 | # Usage ⌨️ 76 | 77 |
78 | 🐧 Linux 79 | 80 | To see available options/usage, call *Tor2Tor* with the `-h/--help` flag 81 | 82 | ```commandline 83 | tor2tor --help 84 | ``` 85 | 86 | or 87 | 88 | ```commandline 89 | t2t --help 90 | ``` 91 | 92 | Calling it with an onion url should look like the following 93 | 94 | ```commandline 95 | sudo tor2tor http://example.onion 96 | ``` 97 | 98 | *** 99 | 100 |
101 | 102 |
103 | 🪟 Windows 104 | 105 | To see available options/usage, call *Tor2Tor* with the `-h/--help` flag 106 | 107 | ```commandline 108 | tor2tor --help 109 | ``` 110 | 111 | or 112 | 113 | ```commandline 114 | t2t --help 115 | ``` 116 | 117 | Calling it with an onion url should look like the following 118 | 119 | ```commandline 120 | tor2tor http://example.onion 121 | ``` 122 | 123 | *** 124 | 125 |
126 | 127 |
128 | 🐋 Docker Container 129 | 130 | The *Tor2Tor* container can be called with `docker run` like so: 131 | 132 | ```commandline 133 | docker run rly0nheart/tor2tor --help 134 | ``` 135 | 136 | Calling the container with an onion url should look like the following 137 | 138 | ```commandline 139 | docker run --tty --volume $PWD/tor2tor:/root/tor2tor rly0nheart/tor2tor http://example.onion 140 | ``` 141 | 142 | ## Note ⚠️ 143 | 144 | > --tty Allocates a pseudo-TTY, use it to enable the container to display colours (trust me, you will need this) 145 | >> --volume $PWD/tor2tor:/root/tor2tor Will mount the *tor2tor* directory from the container to your host machine's 146 | *tor2tor* directory. 147 | 148 | *** 149 |
150 | 151 | # Updating ⬆️ 152 | 153 |
154 | 🐧 Linux 155 | 156 | [*update.sh*](https://github.com/rly0nheart/tor2tor/blob/latest/update.sh) 157 | > Assuming it has already been made executable with `sudo chmod +x update.sh` 158 | 159 | Navigate to the `tor2tor` directory that you cloned and find the `update.sh` file. 160 | 161 | and run it 162 | 163 | ```commandline 164 | sudo ./update.sh 165 | ``` 166 | 167 | The script will pull the latest changes (if any are available) then rebuild and install the package. 168 | 169 | *** 170 | 171 |
172 | 173 |
174 | 🪟 Windows 175 | 176 | Navigate to the `tor2tor` directory that you cloned and find the `update.ps1` file. 177 | 178 | ```powershell 179 | .\update.ps1 180 | ``` 181 | 182 | The script will pull the latest changes (if any are available) then rebuild and install the package. 183 | 184 | *** 185 | 186 |
187 | 188 |
189 | 🐋 Docker Container 190 | 191 | As for the docker container, just run the docker pull command again. 192 | 193 | ```commandline 194 | docker pull rly0nheart/tor2tor 195 | ``` 196 | 197 | Calling the container with an onion url should look like the following 198 | 199 | ```commandline 200 | docker run --tty --volume $PWD/tor2tor:/root/tor2tor rly0nheart/tor2tor http://example.onion 201 | ``` 202 | 203 | ## Note ⚠️ 204 | 205 | > --tty Allocates a pseudo-TTY, use it to enable the container to display colours (trust me, you will need this) 206 | >> --volume $PWD/tor2tor:/root/tor2tor Will mount the *tor2tor* directory from the container to your host machine's 207 | *tor2tor* directory. 208 | 209 | *** 210 | 
211 | 212 | # Uninstalling ❌ 213 | 214 |
215 | 🐧 Linux 216 | 217 | ## Note ⚠️ 218 | 219 | > Assuming it has already been made executable with `sudo chmod +x uninstall.sh` 220 | 221 | Navigate to the `tor2tor` directory that you cloned and find the `uninstall.sh` file. 222 | 223 | Run it! 224 | 225 | ```commandline 226 | sudo ./uninstall.sh 227 | ``` 228 | 229 | This will uninstall `tor`, delete the `geckodriver` binary and uninstall `tor2tor` 230 | *** 231 |
232 | 233 |
234 | 🪟 Windows 235 | 236 | Navigate to the `tor2tor` directory that you cloned and find the `uninstall.ps1` file. 237 | 238 | Run it! 239 | 240 | ```powershell 241 | .\uninstall.ps1 242 | ``` 243 | 244 | This will delete the `geckodriver` and tor binaries then uninstall `tor2tor` 245 | *** 246 | 
247 | 248 |
249 | 🐋 Docker Container 250 | 251 | You can stop and remove any running container, then remove the downloaded image by running: 252 | 253 | ```commandline 254 | docker rmi rly0nheart/tor2tor 255 | ``` 256 | 257 | *** 258 | 
259 | 260 | # Important 🚧 261 | 262 | As you probably already know,Tor routes data via three relays (servers) for your privacy. 263 | As a result, connections become slower than an ordinary connection. 264 | 265 | ## Point ⚠️ 266 | 267 | Once you start **Tor2Tor**, give it at least 2 minutes tops to query the specified onion url and extract links from it. 268 | 269 | If you want to work around this, you can always just use a cloud shell service. 270 | 271 | # Screenshots 272 | 273 | ![tor2tor-archive](https://github.com/rly0nheart/tor2tor-archive/assets/74001397/759082c5-f5ea-4b25-80da-a756d182ae86) 274 | 275 | There's a dedicated repository of onion screenshots captured with **Tor2Tor** 276 | at [Tor2Tor Archive](https://github.com/rly0nheart/tor2tor-archive) 277 | 278 | ## CI/CD Workflow 🌊 279 | 280 | ### Docker Image Building 🐳 281 | 282 | - Pushing to or merging into the `latest` branch triggers an automatic build of the Docker image. 283 | - This image is tagged as `latest` on Docker Hub, indicating it's the most stable release. 284 | 285 | *** 286 | ![me](https://github.com/rly0nheart/tor2tor/assets/74001397/97bf7845-db43-4fd0-87bd-04e8b6b02e74) 287 | 288 | 289 | 290 | -------------------------------------------------------------------------------- /tor2tor/coreutils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import re 6 | import subprocess 7 | import time 8 | from datetime import datetime, date 9 | from urllib.parse import urlparse 10 | 11 | import requests 12 | from rich import print 13 | from rich.logging import RichHandler 14 | from rich.markdown import Markdown 15 | from rich.table import Table 16 | from rich_argparse import RichHelpFormatter 17 | 18 | from . 
import __author__, __version__ 19 | 20 | # Construct path to the user's home directory 21 | PROGRAM_DIRECTORY = os.path.expanduser(os.path.join("~", "tor2tor")) 22 | 23 | 24 | def load_settings() -> dict: 25 | """ 26 | Loads settings from /settings/settings.json 27 | 28 | :return: Dictionary (JSON) containing settings 29 | """ 30 | # Get the absolute path of the current file 31 | current_dir = os.path.dirname(os.path.abspath(__file__)) 32 | 33 | # Construct the path to the settings.json file 34 | settings_path = os.path.join(current_dir, "settings", "settings.json") 35 | 36 | # Load the settings from the file 37 | with open(settings_path) as file: 38 | data = json.load(file) 39 | 40 | return data 41 | 42 | 43 | def create_parser() -> argparse.ArgumentParser: 44 | from . import __version__, __epilog__, __description__ 45 | 46 | parser = argparse.ArgumentParser( 47 | description=Markdown(__description__, "argparse.text"), 48 | epilog=Markdown(__epilog__), 49 | formatter_class=RichHelpFormatter, 50 | ) 51 | parser.add_argument("onion", help="onion url to scrape") 52 | parser.add_argument( 53 | "--headless", 54 | help="run Firefox WebDriver instances in headless mode", 55 | action="store_true", 56 | ) 57 | parser.add_argument( 58 | "-l", "--limit", help="number of onion links to capture", type=int, default=10 59 | ) 60 | parser.add_argument( 61 | "-p", 62 | "--pool", 63 | help="size of the Firefox WebDriver instance pool (default: %(default)s)", 64 | type=int, 65 | default=3, 66 | ) 67 | parser.add_argument( 68 | "-t", 69 | "--threads", 70 | help="number of worker threads to run (default: %(default)s)", 71 | type=int, 72 | default=3, 73 | ) 74 | parser.add_argument( 75 | "--log-skipped", 76 | help="log skipped onions on output", 77 | dest="log_skipped", 78 | action="store_true", 79 | ) 80 | parser.add_argument( 81 | "-d", "--debug", help="run program in debug mode", action="store_true" 82 | ) 83 | parser.add_argument( 84 | "-v", 85 | "--version", 86 | version=f"Tor2Tor 
v{__version__} Copyright (c) 2023-{date.today().year} {__author__}", 87 | action="version", 88 | ) 89 | return parser 90 | 91 | 92 | def set_loglevel(debug_mode: bool) -> logging.getLogger: 93 | """ 94 | Configure and return a logging object with the specified log level. 95 | 96 | :param debug_mode: If True, the log level is set to "NOTSET". Otherwise, it is set to "INFO". 97 | :return: A logging object configured with the specified log level. 98 | """ 99 | logging.basicConfig( 100 | level="NOTSET" if debug_mode else "INFO", 101 | format="%(message)s", 102 | handlers=[ 103 | RichHandler(markup=True, log_time_format="%H:%M:%S", show_level=debug_mode) 104 | ], 105 | ) 106 | return logging.getLogger("Tor2Tor") 107 | 108 | 109 | def add_http_to_link(link: str) -> str: 110 | """ 111 | Adds 'http://' to the URL if it doesn't already start with 'http://' or 'https://'. 112 | 113 | :param link: The link to modify. 114 | :return: The modified URL. 115 | """ 116 | if not link.startswith(("http://", "https://")): 117 | return f"http://{link}" 118 | return link 119 | 120 | 121 | def is_valid_onion(url: str) -> bool: 122 | """ 123 | Uses a regex pattern to determine whether a given url is an onion service or not. 124 | 125 | :param url: The url to check. 126 | :return: True if the url matches the strict pattern criterion. False if it doesn't 127 | 128 | Regex Explanation 129 | ----------------- 130 | - ^ - Asserts the start of a string. 131 | - (http://|https://)? - Matches HTTP or HTTPS protocol in the string (optional). 132 | - (www\\.)? - Optionally matches the www. subdomain. 133 | - ([a-z2-7]{54,}d) - Matches 55 or more characters, where each can be a lowercase letter or a digit from 2 to 7, 134 | and ends with 'd'. 135 | - \\.onion - Matches .onion. 136 | - (/|$) - Matches either a forward slash or the end of the string. 
137 | """ 138 | if re.search(r"^(http://|https://)?(www\.)?([a-z2-7]{54,}d)\.onion(/|$)", url): 139 | return True 140 | else: 141 | return False 142 | 143 | 144 | def create_table(table_headers: list, table_title: str = "") -> Table: 145 | """ 146 | Creates a rich table with the given column headers. 147 | 148 | :param table_headers: The column headers to add to the Table. 149 | :param table_title: The title of the table (an empty string is the default tile). 150 | :returns: A table with added column headers. 151 | """ 152 | table = Table( 153 | title=table_title, 154 | title_style="italic", 155 | caption=f"{time.asctime()}", 156 | caption_style="italic", 157 | show_header=True, 158 | header_style="bold", 159 | highlight=True, 160 | ) 161 | for header in table_headers: 162 | table.add_column(header, style="dim" if header == "#" else "") 163 | return table 164 | 165 | 166 | def construct_output_name(url: str) -> str: 167 | """ 168 | Constructs an output name based on the network location part (netloc) of a given URL. 169 | 170 | :param url: The URL to parse. 171 | :return: The network location part (netloc) of the URL. 172 | """ 173 | parsed_url = urlparse(url) 174 | output_name = parsed_url.netloc 175 | return output_name 176 | 177 | 178 | def path_finder(url: str): 179 | """ 180 | Checks if the specified directories exist. 181 | If not, it creates them. 182 | """ 183 | os.makedirs( 184 | os.path.join(PROGRAM_DIRECTORY, construct_output_name(url=url)), exist_ok=True 185 | ) 186 | 187 | 188 | def convert_timestamp_to_datetime(timestamp: float) -> datetime: 189 | """ 190 | Converts a Unix timestamp to a datetime object. 191 | 192 | :param timestamp: The Unix timestamp to be converted, given as a float. 193 | :return: A datetime object. 194 | """ 195 | datetime_from_timestamp = datetime.fromtimestamp(timestamp) 196 | return datetime_from_timestamp 197 | 198 | 199 | def get_file_info(filename: str) -> tuple: 200 | """ 201 | Gets a given file's information. 
202 | 203 | :param filename: File to get info for. 204 | :return: A tuple containing the file's size and created time. 205 | """ 206 | file_size = os.path.getsize(filename=filename) 207 | 208 | created_time = convert_timestamp_to_datetime( 209 | timestamp=os.path.getmtime(filename=filename) 210 | ) 211 | 212 | return file_size, created_time 213 | 214 | 215 | def check_updates(): 216 | """ 217 | Checks the program's updates by comparing the current program version tag with the remote version tag from GitHub. 218 | """ 219 | response = requests.get( 220 | "https://api.github.com/repos/rly0nheart/tor2tor/releases/latest" 221 | ).json() 222 | remote_version = response.get("tag_name") 223 | 224 | if remote_version != __version__: 225 | log.info( 226 | f"Tor2Tor version {remote_version} published at {response.get('published_at')} " 227 | f"is available. Run the 'update.sh' " 228 | f"script (for local installation) or re-pull the image (for docker container) " 229 | f"with 'docker pull rly0nheart/tor2tor' to get the updates. " 230 | ) 231 | release_notes = Markdown(response.get("body")) 232 | print(release_notes) 233 | print("\n") 234 | 235 | 236 | def tor_service(command: str): 237 | """ 238 | Starts/Stops the Tor service based on the provided command and operating system. 239 | 240 | This function can start or stop the Tor service on Windows and Unix-like 241 | systems. On Windows, it looks for Tor\\tor\\tor.exe in the user's home directory. 242 | 243 | :param command: The command to manage the Tor service. Acceptable values are "start" or "stop". 244 | :raise: subprocess.CalledProcessError If the subprocess fails to execute. 
245 | """ 246 | 247 | if command not in ["start", "stop"]: 248 | log.warning("Command must be either 'start' or 'stop'") 249 | 250 | try: 251 | if os.name == "nt": 252 | tor_path = os.path.join(PROGRAM_DIRECTORY, load_settings().get("tor.exe")) 253 | 254 | if command == "start": 255 | log.info(f"Starting {tor_path}...") 256 | subprocess.Popen(tor_path) 257 | else: 258 | subprocess.Popen("taskkill /IM tor.exe /F") 259 | 260 | else: 261 | subprocess.run(["service", "tor", command]) 262 | 263 | except subprocess.CalledProcessError as e: 264 | print(f"Failed to {command} the Tor service: {e}") 265 | 266 | 267 | args = create_parser().parse_args() 268 | log = set_loglevel(debug_mode=args.debug) 269 | -------------------------------------------------------------------------------- /tor2tor/tor2tor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import time 5 | from datetime import datetime 6 | from queue import Queue 7 | from threading import Lock, Thread 8 | 9 | import requests 10 | from rich import print 11 | from rich.table import Table 12 | from bs4 import BeautifulSoup 13 | from selenium import webdriver 14 | from selenium.webdriver.firefox.options import Options 15 | 16 | from . 
import __version__
from .coreutils import (
    log,
    args,
    tor_service,
    create_table,
    load_settings,
    get_file_info,
    is_valid_onion,
    PROGRAM_DIRECTORY,
    add_http_to_link,
    construct_output_name,
    convert_timestamp_to_datetime,
    check_updates,
)


class Tor2Tor:
    """Scrapes a given onion service and captures screenshots of the onions it links to."""

    def __init__(self):
        # Initialise locks for logging and table updates
        self.log_lock = Lock()
        self.table_lock = Lock()

        # Initialise queues for storing captured and skipped onions
        self.captured_onions_queue = Queue()
        self.skipped_onions_queue = Queue()

        # Initialise tor proxy settings.
        # FIX: read the settings file once instead of re-loading it for every key.
        socks_settings = load_settings().get("proxy").get("socks5")
        self.socks_host = socks_settings.get("host")
        self.socks_port = socks_settings.get("port")
        self.socks_type = socks_settings.get("type")
        self.socks_version = socks_settings.get("version")

    def firefox_options(self, instance_index: int) -> Options:
        """
        Configure Firefox options for web scraping with a headless browser and Tor network settings.

        :param instance_index: Index of the opened WebDriver instance in the firefox_pool.
        :returns: A Selenium WebDriver Options object with preset configurations.
        """
        options = Options()
        options.add_argument("--incognito")
        if args.headless:
            options.add_argument("--headless")
            log.info(f"Running headless on WebDriver instance {instance_index}...")
        options.set_preference("network.proxy.type", self.socks_type)
        options.set_preference("network.proxy.socks", self.socks_host)  # e.g. "127.0.0.1"
        options.set_preference("network.proxy.socks_port", self.socks_port)
        options.set_preference("network.proxy.socks_version", self.socks_version)
        # Resolve DNS through the proxy and allow .onion lookups.
        options.set_preference("network.proxy.socks_remote_dns", True)
        options.set_preference("network.dns.blockDotOnion", False)
        return options

    def open_firefox_pool(self, pool_size: int) -> Queue:
        """
        Initializes a queue of Firefox WebDriver instances for future use.

        :param pool_size: The number of Firefox instances to create.
        :return: A queue containing the created Firefox instances.
        """
        # Initialize a new queue to hold the Firefox instances.
        pool = Queue()

        log.info(f"Opening WebDriver pool with {pool_size} instances...")

        # Populate the pool with Firefox instances. (The unused loop variable
        # from enumerate(range(...)) has been removed.)
        for instance_index in range(1, pool_size + 1):
            driver = webdriver.Firefox(
                options=self.firefox_options(instance_index=instance_index),
            )
            pool.put(driver)

        return pool

    @staticmethod
    def close_firefox_pool(pool: Queue):
        """
        Closes all the Firefox instances in the pool.

        :param pool: The pool containing Firefox WebDriver instances to close.
        """
        log.info("Closing WebDriver pool...")
        while not pool.empty():
            driver = pool.get()
            driver.quit()

    def worker(self, tasks_queue: Queue, screenshots_table: Table, firefox_pool: Queue):
        """
        Worker function to capture screenshots of websites.

        This function is intended to be used as a target for a Thread. It captures screenshots
        of websites as tasks are fed via the queue. The function borrows a Firefox instance from
        the pool for each task and returns it after the task is complete.

        :param tasks_queue: The queue containing tasks (websites to capture).
        :param screenshots_table: A table where captured screenshot metadata is stored.
        :param firefox_pool: The pool of Firefox WebDriver instances.
        """
        # Continue working as long as the queue is not empty
        while not tasks_queue.empty():
            onion_index = None
            driver = None
            onion = None
            got_task = False
            try:
                # Get a new task from the queue
                onion_index, onion = tasks_queue.get()
                got_task = True

                # Borrow a Firefox instance from the pool
                driver = firefox_pool.get()

                # Capture the screenshot
                self.capture_onion(
                    onion_url=onion,
                    onion_index=onion_index,
                    driver=driver,
                    screenshots_table=screenshots_table,
                )
                self.captured_onions_queue.put(
                    (
                        onion_index,
                        onion,
                        convert_timestamp_to_datetime(timestamp=time.time()),
                    )
                )

            except KeyboardInterrupt:
                log.warning("User interruption detected ([yellow]Ctrl+C[/])")
                sys.exit()
            except Exception as e:
                if args.log_skipped:
                    log.error(f"{onion_index} [yellow]{e}[/]")

                # Add the skipped onion index, the onion itself, the reason it
                # was skipped, and the time it was skipped
                self.skipped_onions_queue.put(
                    (
                        onion_index,
                        onion,
                        f"[yellow]{e}[/]",
                        convert_timestamp_to_datetime(timestamp=time.time()),
                    )
                )
            finally:
                # FIX: return the Firefox instance only if one was actually
                # borrowed (the previous code could push a stale/None driver
                # back into the pool), and only mark the task done if it was
                # actually dequeued.
                if driver is not None:
                    firefox_pool.put(driver)
                if got_task:
                    tasks_queue.task_done()

    def execute_worker(
        self,
        worker_threads: int,
        tasks_queue: Queue,
        screenshots_table: Table,
        firefox_pool: Queue,
    ):
        """
        Executes the worker method.

        :param worker_threads: Number of threads to execute the worker with.
        :param tasks_queue: The queue containing tasks (websites to capture).
        :param screenshots_table: The table where captured screenshots will be added.
        :param firefox_pool: A pool containing n number of firefox instances.
        """
        # Initialize threads
        threads = []
        for _ in range(worker_threads):  # create 3 (default) worker threads
            t = Thread(
                target=self.worker, args=(tasks_queue, screenshots_table, firefox_pool)
            )
            t.start()
            threads.append(t)

        # Wait for all threads to finish
        for thread in threads:
            thread.join()

    def get_onion_response(self, onion_url: str) -> BeautifulSoup:
        """
        Fetches the HTML content of a given onion link using a SOCKS5 proxy.

        :param onion_url: The onion URL to fetch the content from.
        :return: A BeautifulSoup object containing the parsed HTML content.
        """

        # Define the SOCKS5 proxy settings (socks5h resolves DNS via the proxy).
        proxies = {
            "http": f"socks5h://{self.socks_host}:{self.socks_port}",
            "https": f"socks5h://{self.socks_host}:{self.socks_port}",
        }

        # Perform the HTTP GET request.
        # FIX: Tor is slow so the timeout is generous, but without one a dead
        # onion would hang the scraper forever.
        response = requests.get(onion_url, proxies=proxies, timeout=120)

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, "html.parser")

        return soup

    def get_onions_on_page(self, onion_url: str) -> list:
        """
        Scrapes a given onion URL and extracts all valid URLs found in tags.

        :param onion_url: The onion URL to scrape.
        :return: A list of valid URLs found on the page.

        Regex Explanation:
        -----------------
        - `https?`: Matches either 'http' or 'https'.
        - `://`: Matches the '://' that follows the protocol.
        - `\\S+`: Matches one or more non-whitespace characters.
        """

        # Initialize an empty list to store valid URLs
        valid_onions = []

        # Fetch the page content
        page_content = self.get_onion_response(onion_url=onion_url)

        # Define the regex pattern to match URLs
        url_pattern = re.compile(r"https?://\S+")

        # Find all anchor tags in the HTML content
        found_onions = page_content.find_all("a")

        # Loop through each tag and extract the href attribute
        for onion_index, onion in enumerate(found_onions, start=1):
            href = onion.get("href")
            # Check if the 'href' attribute exists and is not None
            if href:
                # Find all URLs in the 'href' attribute using the regex pattern
                urls = url_pattern.findall(href)
                # Loop through each URL found in the 'href' attribute
                for url in urls:
                    # Check if the URL is a valid Onion URL
                    if is_valid_onion(url):
                        # Append the valid Onion URL to the list of valid_onions
                        valid_onions.append(url)

        log.info(f"Found {len(valid_onions)} links on {onion_url}")
        return valid_onions

    def capture_onion(
        self, onion_url: str, onion_index, driver: webdriver, screenshots_table: Table
    ):
        """
        Captures a screenshot of a given onion link using a webdriver.

        :param onion_url: The onion URL to capture.
        :param onion_index: The index of the onion link in a list or sequence.
        :param driver: The webdriver instance to use for capturing the screenshot.
        :param screenshots_table: Table to add captured screenshots to.
        """

        # Construct the directory name based on the URL
        directory_name = construct_output_name(url=args.onion)

        # Add HTTP to the URL if it's not already there
        validated_onion_link = add_http_to_link(link=onion_url)

        # Construct the filename for the screenshot from the onion link
        filename = construct_output_name(url=validated_onion_link) + ".png"

        # Construct the full file path
        file_path = os.path.join(PROGRAM_DIRECTORY, directory_name, filename)

        # Log the onion link being captured
        log.info(f"{onion_index} Capturing... {validated_onion_link}")

        # Navigate to the URL
        driver.get(validated_onion_link)

        if os.path.exists(path=file_path):
            # FIX: log the actual filename instead of the broken "(unknown)"
            # placeholder text.
            log.info(f"{onion_index} [yellow][italic]{filename}[/][/] already exists.")
        else:
            # Take a full screenshot of the onion and save it to the given file path
            driver.save_full_page_screenshot(file_path)

            with self.log_lock:
                # Log the successful capture with a clickable file link
                log.info(
                    f"{onion_index} [dim]{driver.title}[/] - "
                    f"[yellow][italic][link file://{file_path}]{filename}[/][/]"
                )

            with self.table_lock:
                # Add screenshot info to the Table
                file_size, created_time = get_file_info(filename=file_path)
                screenshots_table.add_row(
                    str(onion_index),
                    filename,
                    str(file_size),
                    str(created_time),
                )

    def execute_scraper(
        self,
        target_onion: str,
        pool_size: int,
        worker_threads: int,
    ):
        """
        Executes the scraper code.

        :param target_onion: The onion to scrape.
        :param pool_size: Size of the WebDriver instance pool (default is 3).
        :param worker_threads: Number of threads.
        """
        firefox_pool = None

        start_time = datetime.now()
        log.info(f"Starting 🧅Tor2Tor {__version__} {start_time}...")

        try:
            check_updates()

            tor_service(command="start")  # Start the Tor service.

            # Fetch onion URLs from the provided URL
            onions = self.get_onions_on_page(
                onion_url=add_http_to_link(link=target_onion)
            )

            firefox_pool = self.open_firefox_pool(pool_size=pool_size)

            # Create a table where captured screenshots will be displayed
            screenshots_table = create_table(
                table_title="Screenshots",
                table_headers=["#", "filename", "size (bytes)", "timestamp"],
            )

            # Initialize Queue and add tasks
            tasks_queue = Queue()

            for onion_index, onion in enumerate(onions, start=1):
                tasks_queue.put((onion_index, onion))

                if onion_index == args.limit:
                    # If onion index is equal to the limit set in -l/--limit, break the loop.
                    break

            self.execute_worker(
                worker_threads=worker_threads,
                tasks_queue=tasks_queue,
                screenshots_table=screenshots_table,
                firefox_pool=firefox_pool,
            )

            log.info("DONE!\n")

            # Print table showing captured screenshots
            print(screenshots_table)
            print("\n")

            # Print the summary tables for captured and skipped onions
            captured_onions, skipped_onions = self.onion_summary_tables(
                captured_onions=list(self.captured_onions_queue.queue),
                skipped_onions=list(self.skipped_onions_queue.queue),
            )

            log.info(f"{len(self.captured_onions_queue.queue)} onions captured.")
            print(captured_onions)

            log.info(f"{len(self.skipped_onions_queue.queue)} onions skipped.")
            print(skipped_onions)

        except KeyboardInterrupt:
            log.warning("User Interruption detected ([yellow]Ctrl+C[/])")
            sys.exit()
        except Exception as e:
            log.error(f"An error occurred: [red]{e}[/]")
            sys.exit()
        finally:
            if firefox_pool is not None:
                self.close_firefox_pool(pool=firefox_pool)

            tor_service(command="stop")  # Stop the Tor service.
            log.info(f"Stopped in {datetime.now() - start_time} seconds.")

    @staticmethod
    def onion_summary_tables(
        captured_onions: list,
        skipped_onions: list,
    ) -> tuple:
        """
        Creates tables showing a summary of captured and skipped onions.

        Note
        ----
        - The index value in the loops, holds the index of the onion in the captured/skipped onions lists
        - And the *_onion[0] holds the index of the onion from the scraper task.

        :param captured_onions: A list of tuples, each containing the captured onion url and its index
            from the scraper task.
        :param skipped_onions: A list of tuples,
            each containing the skipped onion url and its index from the scraper task.
        :returns: A tuple containing the captured and skipped onions tables:
            (captured_onions_table, skipped_onions_table).
        """

        # Create a table of captured onions
        captured_onions_table = create_table(
            table_headers=["#", "index", "onion", "timestamp"],
        )
        for index, captured_onion in enumerate(captured_onions, start=1):
            captured_onions_table.add_row(
                str(index),  # Index of the onion from the captured_onions list
                str(captured_onion[0]),  # Index of the onion from the scraping task
                str(captured_onion[1]),  # Onion url
                str(captured_onion[2]),  # Time the onion was captured
            )

        # Create a table of skipped onions
        skipped_onions_table = create_table(
            table_headers=["#", "index", "onion", "reason", "timestamp"],
        )
        for index, skipped_onion in enumerate(skipped_onions, start=1):
            skipped_onions_table.add_row(
                str(index),  # Index of the onion from the skipped_onions list
                str(skipped_onion[0]),  # Index of the onion from the scraping task
                str(skipped_onion[1]),  # Onion url
                str(skipped_onion[2]),  # Reason the onion was skipped
                str(skipped_onion[3]),  # Time the onion was skipped
            )

        return captured_onions_table, skipped_onions_table