├── .dockerignore ├── .git-blame-ignore-revs ├── .gitattributes ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ └── issue-template.md ├── dependabot.yml └── workflows │ ├── codeql-analysis.yml │ ├── docker-build-push.yml │ ├── dockerci.yml │ └── theHarvester.yml ├── .gitignore ├── Dockerfile ├── README.md ├── README ├── CONTRIBUTING.md ├── COPYING └── LICENSES ├── bin ├── restfulHarvest └── theHarvester ├── docker-compose.yml ├── pyproject.toml ├── requirements.txt ├── restfulHarvest.py ├── tests ├── __init__.py ├── discovery │ ├── __init__.py │ ├── test_certspotter.py │ ├── test_githubcode.py │ └── test_otx.py ├── lib │ └── test_core.py └── test_myparser.py ├── theHarvester-logo.png ├── theHarvester-logo.webp ├── theHarvester.py └── theHarvester ├── __init__.py ├── __main__.py ├── data ├── api-keys.yaml ├── proxies.yaml └── wordlists │ ├── api_endpoints.txt │ ├── dns-big.txt │ ├── dns-names.txt │ ├── dorks.txt │ ├── general │ └── common.txt │ └── names_small.txt ├── discovery ├── __init__.py ├── api_endpoints.py ├── baidusearch.py ├── bevigil.py ├── bingsearch.py ├── bravesearch.py ├── bufferoverun.py ├── builtwith.py ├── censysearch.py ├── certspottersearch.py ├── constants.py ├── criminalip.py ├── crtsh.py ├── dnssearch.py ├── duckduckgosearch.py ├── fullhuntsearch.py ├── githubcode.py ├── hackertarget.py ├── haveibeenpwned.py ├── huntersearch.py ├── intelxsearch.py ├── leaklookup.py ├── netlas.py ├── onyphe.py ├── otxsearch.py ├── pentesttools.py ├── projectdiscovery.py ├── rapiddns.py ├── rocketreach.py ├── search_dehashed.py ├── search_dnsdumpster.py ├── searchhunterhow.py ├── securityscorecard.py ├── securitytrailssearch.py ├── shodansearch.py ├── sitedossier.py ├── subdomaincenter.py ├── subdomainfinderc99.py ├── takeover.py ├── threatminer.py ├── tombasearch.py ├── urlscan.py ├── venacussearch.py ├── virustotal.py ├── whoisxml.py ├── yahoosearch.py └── zoomeyesearch.py ├── lib ├── __init__.py ├── api │ ├── __init__.py │ ├── additional_endpoints.py │ ├── api.py │ ├── api_example.py │ └── static │ │ └── .gitkeep ├── core.py ├── hostchecker.py ├── ip-ranges.json ├── resolvers.txt ├── stash.py └── version.py ├── parsers ├── __init__.py ├── intelxparser.py ├── myparser.py ├── securitytrailsparser.py └── venacusparser.py ├── restfulHarvest.py ├── screenshot ├── __init__.py └── screenshot.py └── theHarvester.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .github/* 2 | .gitattributes 3 | .git-blame-ignore-revs 4 | .idea/ 5 | .pytest_cache 6 | .mypy_cache 7 | tests/* 8 | README/ 9 | bin/ 10 | theHarvester-logo.png 11 | theHarvester-logo.webp -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # #1492 run `black .` and `isort .` 2 | c13843ec0d513ac7f9c35b7bd0501fa46e356415 -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, which is to have git automatically determine 2 | # whether a file is a text or binary, unless otherwise specified. 3 | 4 | * text=auto 5 | 6 | # Basic .gitattributes for a python repo. 
7 | 8 | # Source files 9 | # ============ 10 | *.pxd text diff=python 11 | *.py text diff=python 12 | *.py3 text diff=python 13 | *.pyw text diff=python 14 | *.pyx text diff=python 15 | 16 | # Binary files 17 | # ============ 18 | *.db binary 19 | *.p binary 20 | *.pkl binary 21 | *.pyc binary 22 | *.pyd binary 23 | *.pyo binary 24 | 25 | # Note: .db, .p, and .pkl files are associated with the python modules 26 | # ``pickle``, ``dbm.*``, # ``shelve``, ``marshal``, ``anydbm``, & ``bsddb`` 27 | # (among others). 28 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [L1ghtn1ng, NotoriousRebel] 4 | open_collective: # Replace with a single Open Collective username 5 | ko_fi: # 6 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 7 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 8 | liberapay: # Replace with a single Liberapay username 9 | issuehunt: # Replace with a single IssueHunt username 10 | otechie: # Replace with a single Otechie username 11 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 12 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/issue-template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Issue Template 3 | about: A template for new issues. 4 | title: "[Bug|Feature Request|Other] Short Description of Issue" 5 | labels: '' 6 | 7 | --- 8 | 9 | ## Note we do not support installing theHarvester on android 10 | 11 | **Feature Request or Bug or Another** 12 | Feature Request | Bug | Other 13 | 14 | **Describe the feature request or bug or other** 15 | A clear and concise description of what the bug, feature request, 16 | or other request is. 17 | 18 | **To Reproduce** 19 | Steps to reproduce the behaviour: 20 | 1. Run tool like this: '...' 21 | 2. See error 22 | 23 | **Expected behaviour** 24 | A clear and concise description of what you expected to happen. 25 | 26 | **Screenshots** 27 | If possible please add screenshots to help explain your problem. 28 | 29 | **System Information (System that tool is running on):** 30 | - OS: [e.g. Windows10] 31 | - Version [e.g. 2.7] 32 | 33 | **Additional context** 34 | Add any other context about the problem here. 35 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: github-actions 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | timezone: Europe/London 8 | - package-ecosystem: uv 9 | directory: "/" 10 | schedule: 11 | interval: daily 12 | timezone: Europe/London 13 | open-pull-requests-limit: 10 14 | target-branch: master 15 | allow: 16 | - dependency-type: direct 17 | - dependency-type: indirect 18 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 
3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master, dev ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master, dev ] 20 | schedule: 21 | - cron: '19 11 * * 4' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | 28 | strategy: 29 | fail-fast: false 30 | matrix: 31 | language: [ 'python' ] 32 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 33 | # Learn more: 34 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 35 | 36 | steps: 37 | - name: Checkout repository 38 | uses: actions/checkout@v4 39 | 40 | # Initializes the CodeQL tools for scanning. 41 | - name: Initialize CodeQL 42 | uses: github/codeql-action/init@v3 43 | with: 44 | languages: ${{ matrix.language }} 45 | # If you wish to specify custom queries, you can do so here or in a config file. 46 | # By default, queries listed here will override any specified in a config file. 47 | # Prefix the list here with "+" to use these queries and those in the config file. 48 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 49 | 50 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 51 | # If this step fails, then you should remove it and run the build manually (see below) 52 | - name: Autobuild 53 | uses: github/codeql-action/autobuild@v3 54 | 55 | # ℹ️ Command-line programs to run using the OS shell. 56 | # 📚 https://git.io/JvXDl 57 | 58 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 59 | # and modify them (or add more) to build your code if your project 60 | # uses a compiled language 61 | 62 | #- run: | 63 | # make bootstrap 64 | # make release 65 | 66 | - name: Perform CodeQL Analysis 67 | uses: github/codeql-action/analyze@v3 68 | -------------------------------------------------------------------------------- /.github/workflows/docker-build-push.yml: -------------------------------------------------------------------------------- 1 | name: Build and Push Docker Image 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | permissions: 9 | contents: read 10 | packages: write 11 | 12 | jobs: 13 | build-and-push: 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - name: Checkout repository 18 | uses: actions/checkout@v4 19 | 20 | - name: Set up Docker Buildx 21 | uses: docker/setup-buildx-action@v3 22 | 23 | - name: Log in to GitHub Container Registry 24 | uses: docker/login-action@v3 25 | with: 26 | registry: ghcr.io 27 | username: ${{ github.actor }} 28 | password: ${{ secrets.GITHUB_TOKEN }} 29 | 30 | - name: Extract metadata for Docker 31 | id: meta 32 | uses: docker/metadata-action@v5 33 | with: 34 | images: ghcr.io/${{ github.repository_owner }}/theharvester 35 | tags: | 36 | latest 37 | type=ref,event=branch 38 | type=sha 39 | 40 | - name: Build and push Docker image 41 | uses: docker/build-push-action@v6 42 | with: 43 | context: . 
44 | file: Dockerfile 45 | push: true 46 | platforms: linux/amd64,linux/arm64 47 | tags: ${{ steps.meta.outputs.tags }} 48 | labels: ${{ steps.meta.outputs.labels }} 49 | -------------------------------------------------------------------------------- /.github/workflows/dockerci.yml: -------------------------------------------------------------------------------- 1 | name: TheHarvester Docker Image CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - name: Build the Docker image 11 | run: docker build . --file Dockerfile --tag theharvester:$(date +%s) -------------------------------------------------------------------------------- /.github/workflows/theHarvester.yml: -------------------------------------------------------------------------------- 1 | name: TheHarvester Python CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - '*' 7 | 8 | pull_request: 9 | branches: 10 | - '*' 11 | 12 | jobs: 13 | Python: 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | max-parallel: 10 17 | matrix: 18 | os: [ ubuntu-latest ] 19 | python-version: [ '3.12', '3.13', '3.14.0-beta.1' ] 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | sudo mkdir -p /usr/local/etc/theHarvester 30 | sudo cp theHarvester/data/*.yaml /usr/local/etc/theHarvester/ 31 | sudo chown -R runner:runner /usr/local/etc/theHarvester/ 32 | pip install --upgrade pip 33 | pip install .[dev] 34 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 35 | 36 | - name: Lint with ruff 37 | run: | 38 | ruff check --fix 39 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 40 | 41 | - name: Format with ruff 42 | run: | 43 | ruff format 44 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 45 | 46 | - name: Commit changes for ruff formating and linting 47 | run: | 48 | git config user.name github-actions 49 | git config user.email github-actions@github.com 50 | git add . 
51 | git commit -m "Apply ruff fixes and formatting" || true # Use || true to prevent failure if no changes 52 | git push origin $GITHUB_REF 53 | env: 54 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 55 | 56 | - name: Test with pytest 57 | run: | 58 | pytest tests/** 59 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 60 | 61 | - name: Run theHarvester module Baidu 62 | run: | 63 | theHarvester -d yale.edu -b baidu 64 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 65 | 66 | - name: Run theHarvester module Bing 67 | run: | 68 | theHarvester -d yale.edu -b bing 69 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 70 | 71 | - name: Run theHarvester module CertSpotter 72 | run: | 73 | theHarvester -d yale.edu -b certspotter 74 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 75 | 76 | - name: Run theHarvester module Crtsh 77 | run: | 78 | theHarvester -d hcl.com -b crtsh 79 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 80 | 81 | - name: Run theHarvester module DuckDuckGo 82 | run: | 83 | theHarvester -d yale.edu -b duckduckgo 84 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 85 | 86 | - name: Run theHarvester module HackerTarget 87 | run: | 88 | theHarvester -d yale.edu -b hackertarget 89 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 90 | 91 | - name: Run theHarvester module Otx 92 | run: | 93 | theHarvester -d yale.edu -b otx 94 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 95 | 96 | - name: Run theHarvester module RapidDns 97 | run: | 98 | theHarvester -d yale.edu -b rapiddns 99 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 100 | 101 | - name: Run theHarvester module Threatminer 102 | run: | 103 | theHarvester -d yale.edu -b threatminer 104 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 105 | 106 | - name: Run theHarvester module Urlscan 107 | run: | 108 | theHarvester -d yale.edu -b urlscan 109 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 110 | 111 | - name: Run theHarvester module Yahoo 112 | run: | 113 | theHarvester -d yale.edu -b yahoo 114 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 115 | 116 | - name: Run theHarvester module DNS brute force 117 | run: | 118 | theHarvester -d yale.edu -c 119 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.idea 2 | *.pyc 3 | *.sqlite 4 | *.html 5 | *.htm 6 | *.vscode 7 | *.xml 8 | *.json 9 | debug_results.txt 10 | venv 11 | .mypy_cache 12 | .pytest_cache 13 | build/ 14 | dist/ 15 | theHarvester.egg-info 16 | api-keys.yaml 17 | .DS_Store 18 | .venv 19 | .pyre 20 | uv.lock 21 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:testing-slim 2 | 3 | LABEL maintainer="@jay_townsend1 & @NotoriousRebel1" 4 | 5 | # Install dependencies for building Python from source 6 | RUN apt update && apt install -y \ 7 | curl \ 8 | build-essential \ 9 | libssl-dev \ 10 | zlib1g-dev \ 11 | libbz2-dev \ 12 | libreadline-dev \ 13 | libsqlite3-dev \ 14 | wget \ 15 | curl \ 16 | llvm \ 17 | libncurses5-dev \ 18 | libncursesw5-dev \ 19 | xz-utils \ 20 | tk-dev \ 21 | libffi-dev \ 22 | 
liblzma-dev \ 23 | python3-dev \ 24 | git \ 25 | gcc \ 26 | && rm -rf /var/lib/apt/lists/* 27 | 28 | # Install Python 3.11 from source 29 | RUN curl -fsSL https://www.python.org/ftp/python/3.11.6/Python-3.11.6.tgz -o Python-3.11.6.tgz \ 30 | && tar -xvf Python-3.11.6.tgz \ 31 | && cd Python-3.11.6 \ 32 | && ./configure --enable-optimizations \ 33 | && make -j 2 \ 34 | && make altinstall \ 35 | && rm -rf /Python-3.11.6 /Python-3.11.6.tgz 36 | 37 | # Install pip for Python 3.11 38 | RUN curl https://bootstrap.pypa.io/get-pip.py | python3.11 39 | 40 | # Install pipx for Python 3.11 41 | RUN python3.11 -m pip install --user pipx 42 | 43 | # Add pipx to PATH 44 | ENV PATH=/root/.local/bin:$PATH 45 | 46 | # Install theHarvester via pipx 47 | RUN pipx install --python python3.11 git+https://github.com/laramies/theHarvester.git 48 | 49 | # Ensure pipx path 50 | RUN pipx ensurepath 51 | 52 | # Set the entrypoint 53 | ENTRYPOINT ["/root/.local/bin/restfulHarvest", "-H", "0.0.0.0", "-p", "80"] 54 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![theHarvester](https://github.com/laramies/theHarvester/blob/master/theHarvester-logo.webp) 2 | 3 | ![TheHarvester CI](https://github.com/laramies/theHarvester/workflows/TheHarvester%20Python%20CI/badge.svg) ![TheHarvester Docker Image CI](https://github.com/laramies/theHarvester/workflows/TheHarvester%20Docker%20Image%20CI/badge.svg) 4 | [![Rawsec's CyberSecurity Inventory](https://inventory.raw.pm/img/badges/Rawsec-inventoried-FF5050_flat_without_logo.svg)](https://inventory.raw.pm/) 5 | 6 | What is this? 7 | ------------- 8 | theHarvester is a simple to use, yet powerful tool designed to be used during the reconnaissance stage of a red
9 | team assessment or penetration test. It performs open source intelligence (OSINT) gathering to help determine
10 | a domain's external threat landscape. The tool gathers names, emails, IPs, subdomains, and URLs by using
11 | multiple public resources, all of which are listed in the module sections below.
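A quick usage sketch (the flags shown here are the ones exercised by this repository's CI workflow and mentioned in the module notes below; the domain and sources are placeholders):

```bash
# Passive enumeration of a domain against a single source
theHarvester -d example.com -b crtsh

# Same domain via AlienVault OTX, capping the number of results
theHarvester -d example.com -b otx -l 100

# Active DNS brute force using the bundled wordlists
theHarvester -d example.com -c
```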
12 | 13 | Passive modules: 14 | ---------------- 15 | 16 | * baidu: Baidu search engine - www.baidu.com 17 | 18 | * bevigil: CloudSEK BeVigil scans mobile applications for OSINT assets (Requires an API key, see below.) - https://bevigil.com/osint-api 19 | 20 | * bing: Microsoft search engine - https://www.bing.com 21 | 22 | * bingapi: Microsoft search engine, through the API (Requires an API key, see below.) 23 | 24 | * brave: Brave search engine - https://search.brave.com/ 25 | 26 | * bufferoverun: (Requires an API key, see below.) https://tls.bufferover.run 27 | 28 | * censys: [Censys search engine](https://search.censys.io/) will use certificate searches to enumerate subdomains and gather emails
29 | (Requires an API key, see below.) https://censys.io 30 | 31 | * certspotter: Cert Spotter monitors Certificate Transparency logs - https://sslmate.com/certspotter/ 32 | 33 | * criminalip: Specialized Cyber Threat Intelligence (CTI) search engine (Requires an API key, see below.) - https://www.criminalip.io 34 | 35 | * crtsh: Comodo Certificate search - https://crt.sh 36 | 37 | * duckduckgo: DuckDuckGo search engine - https://duckduckgo.com 38 | 39 | * fullhunt: Next-generation attack surface security platform (Requires an API key, see below.) - https://fullhunt.io 40 | 41 | * github-code: GitHub code search engine (Requires a GitHub Personal Access Token, see below.) - www.github.com 42 | 43 | * hackertarget: Online vulnerability scanners and network intelligence to help organizations - https://hackertarget.com 44 | 45 | * hunter: Hunter search engine (Requires an API key, see below.) - https://hunter.io 46 | 47 | * hunterhow: Internet search engines for security researchers (Requires an API key, see below.) - https://hunter.how 48 | 49 | * intelx: Intelx search engine (Requires an API key, see below.) - http://intelx.io 50 | 51 | * netlas: A Shodan or Censys competitor (Requires an API key, see below.) - https://app.netlas.io 52 | 53 | * onyphe: Cyber defense search engine (Requires an API key, see below.) - https://www.onyphe.io/ 54 | 55 | * otx: AlienVault open threat exchange - https://otx.alienvault.com 56 | 57 | * pentestTools: Cloud-based toolkit for offensive security testing, focused on web applications and network penetration
58 | testing (Requires an API key, see below.) - https://pentest-tools.com/ 59 | 60 | * projectDiscovery: We actively collect and maintain internet-wide asset data to enhance research and analyse changes around
61 | DNS for better insights (Requires an API key, see below.) - https://chaos.projectdiscovery.io 62 | 63 | * rapiddns: DNS query tool which makes querying subdomains or sites of the same IP easy! https://rapiddns.io 64 | 65 | * rocketreach: Access real-time verified personal/professional emails, phone numbers, and social media links (Requires an API key,
66 | see below.) - https://rocketreach.co 67 | 68 | * securityTrails: Security Trails search engine, the world's largest repository of historical DNS data (Requires an API key, see
69 | below.) - https://securitytrails.com 70 | 71 | * -s, --shodan: Shodan search engine will search for ports and banners from discovered hosts (Requires an API key, see below.)
72 | https://shodan.io 73 | 74 | * sitedossier: Find available information on a site - http://www.sitedossier.com 75 | 76 | * subdomaincenter: A subdomain finder tool used to find subdomains of a given domain - https://www.subdomain.center/ 77 | 78 | * subdomainfinderc99: A subdomain finder tool used to find the subdomains of a given domain - https://subdomainfinder.c99.nl 79 | 80 | * threatminer: Data mining for threat intelligence - https://www.threatminer.org/ 81 | 82 | * tomba: Tomba search engine (Requires an API key, see below.) - https://tomba.io 83 | 84 | * urlscan: A sandbox for the web that is a URL and website scanner - https://urlscan.io 85 | 86 | * venacus: Venacus search engine (Requires an API key, see below.) - https://venacus.com 87 | 88 | * vhost: Bing virtual hosts search 89 | 90 | * virustotal: Domain search (Requires an API key, see below.) - https://www.virustotal.com 91 | 92 | * whoisxml: Subdomain search (Requires an API key, see below.) - https://subdomains.whoisxmlapi.com/api/pricing 93 | 94 | * yahoo: Yahoo search engine 95 | 96 | * zoomeye: China's version of Shodan (Requires an API key, see below.) - https://www.zoomeye.org 97 | 98 | Active modules: 99 | --------------- 100 | * DNS brute force: dictionary brute force enumeration 101 | * Screenshots: Take screenshots of subdomains that were found 102 | 103 | Modules that require an API key: 104 | -------------------------------- 105 | Documentation to set up API keys can be found at - https://github.com/laramies/theHarvester/wiki/Installation#api-keys 106 | 107 | * bevigil - Free up to 50 queries. Pricing can be found here: https://bevigil.com/pricing/osint 108 | * bing 109 | * bufferoverun - uses the free binaAPI 110 | * censys - API keys are required and can be retrieved from your [Censys account](https://search.censys.io/account/api). 111 | * criminalip 112 | * fullhunt 113 | * github 114 | * hunter - limited to 10 results on the free plan, so you will need to use the -l 10 switch 115 | * hunterhow 116 | * intelx 117 | * netlas - $ 118 | * onyphe - $ 119 | * pentestTools - $ 120 | * projectDiscovery - invite only for now 121 | * rocketreach - $ 122 | * securityTrails 123 | * shodan - $ 124 | * tomba - Free up to 50 searches.
125 | * venacus - $ 126 | * whoisxml 127 | * zoomeye 128 | 129 | Install and dependencies: 130 | ------------------------- 131 | * Python 3.11+ 132 | * https://github.com/laramies/theHarvester/wiki/Installation 133 | 134 | Comments, bugs, and requests: 135 | ----------------------------- 136 | * [![Twitter Follow](https://img.shields.io/twitter/follow/laramies.svg?style=social&label=Follow)](https://twitter.com/laramies) Christian Martorella @laramies 137 | cmartorella@edge-security.com 138 | * [![Twitter Follow](https://img.shields.io/twitter/follow/NotoriousRebel1.svg?style=social&label=Follow)](https://twitter.com/NotoriousRebel1) Matthew Brown @NotoriousRebel1 139 | * [![Twitter Follow](https://img.shields.io/twitter/follow/jay_townsend1.svg?style=social&label=Follow)](https://twitter.com/jay_townsend1) Jay "L1ghtn1ng" Townsend @jay_townsend1 140 | 141 | Main contributors: 142 | ------------------ 143 | * [![Twitter Follow](https://img.shields.io/twitter/follow/NotoriousRebel1.svg?style=social&label=Follow)](https://twitter.com/NotoriousRebel1) Matthew Brown @NotoriousRebel1 144 | * [![Twitter Follow](https://img.shields.io/twitter/follow/jay_townsend1.svg?style=social&label=Follow)](https://twitter.com/jay_townsend1) Jay "L1ghtn1ng" Townsend @jay_townsend1 145 | * [![Twitter Follow](https://img.shields.io/twitter/follow/discoverscripts.svg?style=social&label=Follow)](https://twitter.com/discoverscripts) Lee Baird @discoverscripts 146 | 147 | 148 | Thanks: 149 | ------- 150 | * John Matherly - Shodan project 151 | * Ahmed Aboul Ela - subdomain names dictionaries (big and small) 152 | -------------------------------------------------------------------------------- /README/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to theHarvester Project 2 | Welcome to theHarvester project, so you would like to contribute. 3 | The following below must be met to get accepted. 4 | 5 | # CI 6 | Make sure all CI passes and you do not introduce any alerts from ruff 7 | 8 | # Unit Tests 9 | For new modules a unit test for that module is required and we use pytest. 10 | 11 | # Coding Standards 12 | * No single letter variables and variable names must represent the action that it is performing 13 | * Have static typing on functions etc 14 | * Make sure no errors are reported from mypy 15 | * No issues reported with ruff 16 | 17 | # Submitting Bugs 18 | If you find a bug in a module that you want to submit an issue for and know how to write python code. 19 | Please create a unit test for that bug(If possible) and submit a fix for it as it would be a big help to the project. 20 | -------------------------------------------------------------------------------- /README/LICENSES: -------------------------------------------------------------------------------- 1 | Released under the GPL v 2.0. 2 | 3 | If you did not receive a copy of the GPL, try http://www.gnu.org/. 4 | 5 | Copyright 2011 Christian Martorella 6 | 7 | theHarvester is free software; you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation version 2 of the License. 10 | 11 | theHarvester is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software
15 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 | -------------------------------------------------------------------------------- /bin/restfulHarvest: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | 4 | import uvicorn 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument( 8 | "-H", 9 | "--host", 10 | default="127.0.0.1", 11 | help="IP address to listen on default is 127.0.0.1", 12 | ) 13 | parser.add_argument( 14 | "-p", 15 | "--port", 16 | default=5000, 17 | help="Port to bind the web server to, default is 5000", 18 | type=int, 19 | ) 20 | parser.add_argument( 21 | "-l", 22 | "--log-level", 23 | default="info", 24 | help="Set logging level, default is info but [critical|error|warning|info|debug|trace] can be set", 25 | ) 26 | parser.add_argument( 27 | "-r", 28 | "--reload", 29 | default=False, 30 | help="Enable automatic reload used during development of the api", 31 | action="store_true", 32 | ) 33 | 34 | args = parser.parse_args() 35 | 36 | if __name__ == "__main__": 37 | uvicorn.run( 38 | "theHarvester.lib.api.api:app", 39 | host=args.host, 40 | port=args.port, 41 | log_level=args.log_level, 42 | reload=args.reload, 43 | ) 44 | -------------------------------------------------------------------------------- /bin/theHarvester: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Note: This script runs theHarvester 3 | import asyncio 4 | import sys 5 | 6 | from theHarvester import __main__ 7 | 8 | if sys.version_info.major < 3 or sys.version_info.minor < 11: 9 | print( 10 | "\033[93m[!] Make sure you have Python 3.11+ installed, quitting.\n\n \033[0m" 11 | ) 12 | sys.exit(1) 13 | 14 | if __name__ == "__main__": 15 | platform = sys.platform 16 | if platform == "win32": 17 | # Required or things will break if trying to take screenshots 18 | import multiprocessing 19 | 20 | multiprocessing.freeze_support() 21 | try: 22 | # See if we have winloop as a performance enhancement on windows 23 | import winloop 24 | 25 | asyncio.DefaultEventLoopPolicy = winloop.EventLoopPolicy 26 | except ModuleNotFoundError: 27 | asyncio.DefaultEventLoopPolicy = asyncio.WindowsSelectorEventLoopPolicy 28 | else: 29 | import uvloop 30 | 31 | uvloop.install() 32 | 33 | if "linux" in platform: 34 | import aiomultiprocess 35 | 36 | # As we are not using Windows, we can change the spawn method to fork for greater performance 37 | aiomultiprocess.set_context("fork") 38 | asyncio.run(__main__.entry_point()) 39 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | theharvester.svc.local: 3 | container_name: theHarvester 4 | volumes: 5 | - ./theHarvester/data/api-keys.yaml:/root/.theHarvester/api-keys.yaml 6 | - ./theHarvester/data/api-keys.yaml:/etc/theHarvester/api-keys.yaml 7 | - ./theHarvester/data/proxies.yaml:/etc/theHarvester/proxies.yaml 8 | - ./theHarvester/data/proxies.yaml:/root/.theHarvester/proxies.yaml 9 | build: . 
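    # A typical way to bring this service up (assumed usage, not prescribed by this file):
    #   docker compose up -d --build
    # The restfulHarvest API launched by the Dockerfile ENTRYPOINT is then reachable at
    # http://localhost:8080 via the 8080:80 mapping below.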
10 | ports: 11 | - "8080:80" 12 | 13 | networks: 14 | default: 15 | name: app_theHarvester_network 16 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "theHarvester" 3 | description = "theHarvester is a very simple, yet effective tool designed to be used in the early stages of a penetration test" 4 | readme = "README.md" 5 | authors = [ 6 | { name = "Christian Martorella", email = "cmartorella@edge-security.com" }, 7 | { name = "Jay Townsend", email = "jay@cybermon.uk" }, 8 | { name = "Matthew Brown", email = "36310667+NotoriousRebel@users.noreply.github.com" }, 9 | ] 10 | requires-python = ">=3.11" 11 | urls.Homepage = "https://github.com/laramies/theHarvester" 12 | classifiers = [ 13 | "Programming Language :: Python :: 3", 14 | "Programming Language :: Python :: 3.11", 15 | "Programming Language :: Python :: 3.12", 16 | "Programming Language :: Python :: 3.13", 17 | "License :: OSI Approved :: GNU General Public License v2 (GPLv2)", 18 | "Operating System :: OS Independent", 19 | ] 20 | dynamic = ["version"] 21 | dependencies = [ 22 | "aiodns==3.4.0", 23 | "aiofiles==24.1.0", 24 | "aiohttp==3.12.2", 25 | "aiomultiprocess==0.9.1", 26 | "aiosqlite==0.21.0", 27 | "beautifulsoup4==4.13.4", 28 | "censys==2.2.17", 29 | "certifi==2025.4.26", 30 | "dnspython==2.7.0", 31 | "fastapi==0.115.12", 32 | "lxml==5.4.0", 33 | "netaddr==1.3.0", 34 | "playwright==1.52.0", 35 | "PyYAML==6.0.2", 36 | "python-dateutil==2.9.0.post0", 37 | "requests==2.32.3", 38 | "retrying==1.3.4", 39 | "shodan==1.31.0", 40 | "slowapi==0.1.9", 41 | "ujson==5.10.0", 42 | "uvicorn==0.34.2", 43 | "uvloop==0.21.0; platform_system != 'Windows'", 44 | "winloop==0.1.8; platform_system == 'Windows'", 45 | ] 46 | 47 | [project.optional-dependencies] 48 | dev = [ 49 | "mypy==1.15.0", 50 | "mypy-extensions==1.1.0", 51 | "pytest==8.3.5", 52 | "pytest-asyncio==0.26.0", 53 | "types-certifi==2021.10.8.3", 54 | "types-chardet==5.0.4.6", 55 | "types-python-dateutil==2.9.0.20250516", 56 | "types-PyYAML==6.0.12.20250516", 57 | "types-requests==2.32.0.20250515", 58 | "ruff==0.11.11", 59 | "types-ujson==5.10.0.20250326", 60 | "wheel==0.45.1", 61 | ] 62 | 63 | [project.scripts] 64 | theHarvester = "theHarvester.theHarvester:main" 65 | restfulHarvest = "theHarvester.restfulHarvest:main" 66 | 67 | [tool.setuptools.dynamic] 68 | version = { attr = "theHarvester.lib.version.VERSION" } 69 | 70 | [tool.setuptools.packages.find] 71 | include = ["theHarvester*"] 72 | 73 | [tool.setuptools.package-data] 74 | "*" = ["*.txt", "*.yaml"] 75 | 76 | [tool.pytest.ini_options] 77 | minversion = "8.3.3" 78 | asyncio_mode = "auto" 79 | asyncio_default_fixture_loop_scope = "function" 80 | addopts = "--no-header" 81 | testpaths = [ 82 | "tests", 83 | "tests/discovery/", 84 | ] 85 | 86 | [build-system] 87 | requires = ["setuptools>=68"] 88 | build-backend = "setuptools.build_meta" 89 | 90 | [tool.mypy] 91 | python_version = "3.11" 92 | warn_unused_configs = true 93 | ignore_missing_imports = true 94 | show_traceback = true 95 | show_error_codes = true 96 | namespace_packages = true 97 | 98 | [tool.uv] 99 | python-preference = "managed" 100 | 101 | [tool.uv.pip] 102 | python-version = "3.11" 103 | 104 | [tool.ruff] 105 | # Exclude a variety of commonly ignored directories. 
106 | exclude = [ 107 | "tests", 108 | ".eggs", 109 | ".git", 110 | ".git-rewrite", 111 | ".mypy_cache", 112 | ".pyenv", 113 | ".pytest_cache", 114 | ".pytype", 115 | ".ruff_cache", 116 | ".github", 117 | ".venv", 118 | ".vscode", 119 | ".idea", 120 | "__pypackages__", 121 | "build", 122 | "dist", 123 | "site-packages", 124 | "venv", 125 | ] 126 | 127 | line-length = 130 128 | target-version = "py311" 129 | show-fixes = true 130 | 131 | [tool.ruff.lint] 132 | select = ["E4", 133 | "E7", 134 | "E9", 135 | "F", 136 | "I", 137 | "UP", 138 | "TCH", 139 | "FA", 140 | "RUF", 141 | "PT", 142 | ] 143 | ignore = ["S311", "RUF021", "RUF029", "F841"] 144 | 145 | # Allow fix for all enabled rules (when `--fix`) is provided. 146 | fixable = ["ALL"] 147 | unfixable = [] 148 | 149 | # Allow unused variables when underscore-prefixed. 150 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 151 | 152 | [tool.ruff.format] 153 | # Like Black, use double quotes for strings. 154 | quote-style = "single" 155 | indent-style = "space" 156 | 157 | # Like Black, respect magic trailing commas. 158 | skip-magic-trailing-comma = false 159 | 160 | # Like Black, automatically detect the appropriate line ending. 161 | line-ending = "auto" 162 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file is deprecated. All dependencies are now defined in pyproject.toml 2 | -------------------------------------------------------------------------------- /restfulHarvest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from theHarvester.restfulHarvest import main 3 | 4 | if __name__ == '__main__': 5 | main() 6 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/tests/__init__.py -------------------------------------------------------------------------------- /tests/discovery/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/tests/discovery/__init__.py -------------------------------------------------------------------------------- /tests/discovery/test_certspotter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | import os 4 | from typing import Optional 5 | 6 | import pytest 7 | import requests 8 | from _pytest.mark.structures import MarkDecorator 9 | 10 | from theHarvester.discovery import certspottersearch 11 | from theHarvester.lib.core import * 12 | 13 | pytestmark: MarkDecorator = pytest.mark.asyncio 14 | github_ci: Optional[str] = os.getenv( 15 | "GITHUB_ACTIONS" 16 | ) # Github set this to be the following: true instead of True 17 | 18 | 19 | class TestCertspotter(object): 20 | @staticmethod 21 | def domain() -> str: 22 | return "metasploit.com" 23 | 24 | 25 | @pytest.mark.skipif(github_ci == 'true', reason="Skipping this test for now") 26 | class TestCertspotterSearch(object): 27 | async def test_api(self) -> None: 28 | base_url = f"https://api.certspotter.com/v1/issuances?domain={TestCertspotter.domain()}&expand=dns_names" 29 | headers = {"User-Agent": 
Core.get_user_agent()} 30 | request = requests.get(base_url, headers=headers) 31 | assert request.status_code == 200 32 | 33 | async def test_search(self) -> None: 34 | search = certspottersearch.SearchCertspoter(TestCertspotter.domain()) 35 | await search.process() 36 | assert isinstance(await search.get_hostnames(), set) 37 | 38 | 39 | if __name__ == "__main__": 40 | pytest.main() 41 | -------------------------------------------------------------------------------- /tests/discovery/test_githubcode.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | import pytest 3 | from _pytest.mark.structures import MarkDecorator 4 | from requests import Response 5 | from theHarvester.discovery import githubcode 6 | from theHarvester.discovery.constants import MissingKey 7 | from theHarvester.lib.core import Core 8 | 9 | pytestmark: MarkDecorator = pytest.mark.asyncio 10 | 11 | 12 | class TestSearchGithubCode: 13 | class OkResponse: 14 | response = Response() 15 | 16 | # Mocking the json method properly 17 | def __init__(self): 18 | self.response = Response() 19 | self.response.status_code = 200 20 | self.response.json = MagicMock( 21 | return_value={ 22 | "items": [ 23 | {"text_matches": [{"fragment": "test1"}]}, 24 | {"text_matches": [{"fragment": "test2"}]}, 25 | ] 26 | } 27 | ) 28 | 29 | class FailureResponse: 30 | response = Response() 31 | 32 | def __init__(self): 33 | self.response = Response() 34 | self.response.status_code = 401 35 | self.response.json = MagicMock(return_value={}) 36 | 37 | class RetryResponse: 38 | def __init__(self): 39 | self.response = Response() 40 | self.response.status_code = 403 41 | self.response.json = MagicMock(return_value={}) 42 | 43 | class MalformedResponse: 44 | response = Response() 45 | 46 | def __init__(self): 47 | self.response = Response() 48 | self.response.status_code = 200 49 | self.response.json = MagicMock( 50 | return_value={ 51 | "items": [ 52 | {"fail": True}, 53 | {"text_matches": []}, 54 | {"text_matches": [{"weird": "result"}]}, 55 | ] 56 | } 57 | ) 58 | 59 | async def test_missing_key(self): 60 | with pytest.raises(MissingKey): 61 | Core.github_key = MagicMock(return_value=None) 62 | githubcode.SearchGithubCode(word="test", limit=500) 63 | 64 | async def test_fragments_from_response(self): 65 | Core.github_key = MagicMock(return_value="test_key") 66 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 67 | test_result = await test_class_instance.fragments_from_response( 68 | self.OkResponse().response.json() 69 | ) 70 | print("test_result: ", test_result) 71 | assert test_result == ["test1", "test2"] 72 | 73 | async def test_invalid_fragments_from_response(self): 74 | Core.github_key = MagicMock(return_value="test_key") 75 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 76 | test_result = await test_class_instance.fragments_from_response( 77 | self.MalformedResponse().response.json() 78 | ) 79 | assert test_result == [] 80 | 81 | async def test_next_page(self): 82 | Core.github_key = MagicMock(return_value="test_key") 83 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 84 | test_result = githubcode.SuccessResult(list(), next_page=2, last_page=4) 85 | assert 2 == await test_class_instance.next_page_or_end(test_result) 86 | 87 | async def test_last_page(self): 88 | Core.github_key = MagicMock(return_value="test_key") 89 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 
90 | test_result = githubcode.SuccessResult(list(), 0, 0) 91 | assert await test_class_instance.next_page_or_end(test_result) is 0 92 | 93 | 94 | if __name__ == "__main__": 95 | pytest.main() 96 | -------------------------------------------------------------------------------- /tests/discovery/test_otx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | import os 4 | from typing import Optional 5 | 6 | import pytest 7 | import requests 8 | from _pytest.mark.structures import MarkDecorator 9 | 10 | from theHarvester.discovery import otxsearch 11 | from theHarvester.lib.core import * 12 | 13 | pytestmark: MarkDecorator = pytest.mark.asyncio 14 | github_ci: Optional[str] = os.getenv( 15 | "GITHUB_ACTIONS" 16 | ) # Github set this to be the following: true instead of True 17 | 18 | 19 | class TestOtx(object): 20 | @staticmethod 21 | def domain() -> str: 22 | return "cybermon.uk" 23 | 24 | async def test_api(self) -> None: 25 | base_url = f"https://otx.alienvault.com/api/v1/indicators/domain/{TestOtx.domain()}/passive_dns" 26 | headers = {"User-Agent": Core.get_user_agent()} 27 | request = requests.get(base_url, headers=headers) 28 | assert request.status_code == 200 29 | 30 | async def test_search(self) -> None: 31 | search = otxsearch.SearchOtx(TestOtx.domain()) 32 | await search.process() 33 | assert isinstance(await search.get_hostnames(), set) 34 | assert isinstance(await search.get_ips(), set) 35 | 36 | 37 | if __name__ == "__main__": 38 | pytest.main() 39 | -------------------------------------------------------------------------------- /tests/lib/test_core.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | from typing import Any 5 | from unittest import mock 6 | 7 | import pytest 8 | import yaml 9 | 10 | from theHarvester.lib.core import CONFIG_DIRS, DATA_DIR, Core 11 | 12 | 13 | @pytest.fixture(autouse=True) 14 | def mock_environ(monkeypatch, tmp_path: Path): 15 | monkeypatch.setenv("HOME", str(tmp_path)) 16 | 17 | 18 | def mock_read_text(mocked: dict[Path, str | Exception]): 19 | read_text = Path.read_text 20 | 21 | def _read_text(self: Path, *args, **kwargs): 22 | if result := mocked.get(self): 23 | if isinstance(result, Exception): 24 | raise result 25 | return result 26 | return read_text(self, *args, **kwargs) 27 | 28 | return _read_text 29 | 30 | 31 | @pytest.mark.parametrize( 32 | ("name", "contents", "expected"), 33 | [ 34 | ("api-keys", "apikeys: {}", {}), 35 | ("proxies", "http: [localhost:8080]", ["http://localhost:8080"]), 36 | ], 37 | ) 38 | @pytest.mark.parametrize("dir", CONFIG_DIRS) 39 | def test_read_config_searches_config_dirs( 40 | name: str, contents: str, expected: Any, dir: Path, capsys 41 | ): 42 | file = dir.expanduser() / f"{name}.yaml" 43 | config_files = [d.expanduser() / file.name for d in CONFIG_DIRS] 44 | side_effect = mock_read_text( 45 | {f: contents if f == file else FileNotFoundError() for f in config_files} 46 | ) 47 | 48 | with mock.patch("pathlib.Path.read_text", autospec=True, side_effect=side_effect): 49 | got = Core.api_keys() if name == "api-keys" else Core.proxy_list() 50 | 51 | assert got == expected 52 | assert f"Read {file.name} from {file}" in capsys.readouterr().out 53 | 54 | 55 | @pytest.mark.parametrize("name", ("api-keys", "proxies")) 56 | def test_read_config_copies_default_to_home(name: str, capsys): 57 | file = 
Path(f"~/.theHarvester/{name}.yaml").expanduser() 58 | config_files = [d.expanduser() / file.name for d in CONFIG_DIRS] 59 | side_effect = mock_read_text({f: FileNotFoundError() for f in config_files}) 60 | 61 | with mock.patch("pathlib.Path.read_text", autospec=True, side_effect=side_effect): 62 | got = Core.api_keys() if name == "api-keys" else Core.proxy_list() 63 | 64 | default = yaml.safe_load((DATA_DIR / file.name).read_text()) 65 | expected = ( 66 | default["apikeys"] 67 | if name == "api-keys" 68 | else [f"http://{h}" for h in default["http"]] 69 | ) 70 | assert got == expected 71 | assert f"Created default {file.name} at {file}" in capsys.readouterr().out 72 | assert file.exists() 73 | -------------------------------------------------------------------------------- /tests/test_myparser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | 4 | import pytest 5 | 6 | from theHarvester.parsers import myparser 7 | 8 | 9 | class TestMyParser(object): 10 | @pytest.mark.asyncio 11 | async def test_emails(self) -> None: 12 | word = "domain.com" 13 | results = "@domain.com***a@domain***banotherdomain.com***c@domain.com***d@sub.domain.com***" 14 | parse = myparser.Parser(results, word) 15 | emails = sorted(await parse.emails()) 16 | assert emails, ["c@domain.com", "d@sub.domain.com"] 17 | 18 | 19 | if __name__ == "__main__": 20 | pytest.main() 21 | -------------------------------------------------------------------------------- /theHarvester-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester-logo.png -------------------------------------------------------------------------------- /theHarvester-logo.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester-logo.webp -------------------------------------------------------------------------------- /theHarvester.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Note: This script runs theHarvester 3 | import sys 4 | 5 | from theHarvester.theHarvester import main 6 | 7 | if sys.version_info.major < 3 or sys.version_info.minor < 10: 8 | print('\033[93m[!] 
Make sure you have Python 3.10+ installed, quitting.\n\n \033[0m') 9 | sys.exit(1) 10 | 11 | if __name__ == '__main__': 12 | main() 13 | -------------------------------------------------------------------------------- /theHarvester/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester/__init__.py -------------------------------------------------------------------------------- /theHarvester/data/api-keys.yaml: -------------------------------------------------------------------------------- 1 | apikeys: 2 | 3 | bevigil: 4 | key: 5 | 6 | bing: 7 | key: 8 | 9 | bufferoverun: 10 | key: 11 | 12 | censys: 13 | id: 14 | secret: 15 | 16 | criminalip: 17 | key: 18 | 19 | dehashed: 20 | key: 21 | 22 | dnsdumpster: 23 | key: 24 | 25 | fullhunt: 26 | key: 27 | 28 | github: 29 | key: 30 | 31 | hunter: 32 | key: 33 | 34 | hunterhow: 35 | key: 36 | 37 | intelx: 38 | key: 39 | 40 | netlas: 41 | key: 42 | 43 | onyphe: 44 | key: 45 | 46 | pentestTools: 47 | key: 48 | 49 | projectDiscovery: 50 | key: 51 | 52 | rocketreach: 53 | key: 54 | 55 | securityTrails: 56 | key: 57 | 58 | shodan: 59 | key: 60 | 61 | tomba: 62 | key: 63 | secret: 64 | 65 | venacus: 66 | key: 67 | 68 | virustotal: 69 | key: 70 | 71 | whoisxml: 72 | key: 73 | 74 | zoomeye: 75 | key: 76 | -------------------------------------------------------------------------------- /theHarvester/data/proxies.yaml: -------------------------------------------------------------------------------- 1 | http: 2 | - ip:port 3 | -------------------------------------------------------------------------------- /theHarvester/data/wordlists/dorks.txt: -------------------------------------------------------------------------------- 1 | inurl:"contact" 2 | intext:email filetype:log 3 | "Index of /mail" 4 | "admin account info" filetype:log 5 | intext:@ 6 | administrator accounts/ 7 | intitle:"Index of" .bash_history 8 | intitle:"index of" members OR accounts 9 | inurl:/shared/help.php 10 | inurl:public 11 | intitle:index.of inbox 12 | intitle:"Server Administration" 13 | inurl:passwd.txt 14 | robots.txt 15 | php-addressbook "This is the addressbook for *" -warning -------------------------------------------------------------------------------- /theHarvester/data/wordlists/general/common.txt: -------------------------------------------------------------------------------- 1 | admin 2 | test 3 | hello 4 | uk 5 | login 6 | book 7 | robots.txt 8 | -------------------------------------------------------------------------------- /theHarvester/discovery/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester/discovery/__init__.py -------------------------------------------------------------------------------- /theHarvester/discovery/baidusearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher, Core 2 | from theHarvester.parsers import myparser 3 | 4 | 5 | class SearchBaidu: 6 | def __init__(self, word, limit) -> None: 7 | self.word = word 8 | self.total_results = '' 9 | self.server = 'www.baidu.com' 10 | self.hostname = 'www.baidu.com' 11 | self.limit = limit 12 | self.proxy = False 13 | 14 | async def do_search(self) -> None: 15 | headers = {'Host': self.hostname, 'User-agent': 
Core.get_user_agent()} 16 | base_url = f'https://{self.server}/s?wd=%40{self.word}&pn=xx&oq={self.word}' 17 | urls = [base_url.replace('xx', str(num)) for num in range(0, self.limit, 10) if num <= self.limit] 18 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 19 | for response in responses: 20 | self.total_results += response 21 | 22 | async def process(self, proxy: bool = False) -> None: 23 | self.proxy = proxy 24 | await self.do_search() 25 | 26 | async def get_emails(self): 27 | rawres = myparser.Parser(self.total_results, self.word) 28 | return await rawres.emails() 29 | 30 | async def get_hostnames(self): 31 | rawres = myparser.Parser(self.total_results, self.word) 32 | return await rawres.hostnames() 33 | -------------------------------------------------------------------------------- /theHarvester/discovery/bevigil.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import MissingKey 2 | from theHarvester.lib.core import AsyncFetcher, Core 3 | 4 | 5 | class SearchBeVigil: 6 | def __init__(self, word) -> None: 7 | self.word = word 8 | self.totalhosts: set = set() 9 | self.interestingurls: set = set() 10 | self.key = Core.bevigil_key() 11 | if self.key is None: 12 | self.key = '' 13 | raise MissingKey('bevigil') 14 | self.proxy = False 15 | 16 | async def do_search(self) -> None: 17 | subdomain_endpoint = f'https://osint.bevigil.com/api/{self.word}/subdomains/' 18 | url_endpoint = f'https://osint.bevigil.com/api/{self.word}/urls/' 19 | headers = {'X-Access-Token': self.key} 20 | 21 | responses = await AsyncFetcher.fetch_all([subdomain_endpoint], json=True, proxy=self.proxy, headers=headers) 22 | response = responses[0] 23 | for subdomain in response['subdomains']: 24 | self.totalhosts.add(subdomain) 25 | 26 | responses = await AsyncFetcher.fetch_all([url_endpoint], json=True, proxy=self.proxy, headers=headers) 27 | response = responses[0] 28 | for url in response['urls']: 29 | self.interestingurls.add(url) 30 | 31 | async def get_hostnames(self) -> set: 32 | return self.totalhosts 33 | 34 | async def get_interestingurls(self) -> set: 35 | return self.interestingurls 36 | 37 | async def process(self, proxy: bool = False) -> None: 38 | self.proxy = proxy 39 | await self.do_search() 40 | -------------------------------------------------------------------------------- /theHarvester/discovery/bingsearch.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | from theHarvester.parsers import myparser 6 | 7 | 8 | class SearchBing: 9 | def __init__(self, word, limit, start) -> None: 10 | self.word = word.replace(' ', '%20') 11 | self.results: list[Any] = [] 12 | self.total_results = '' 13 | self.server = 'www.bing.com' 14 | self.apiserver = 'api.search.live.net' 15 | self.hostname = 'www.bing.com' 16 | self.limit = int(limit) 17 | self.bingApi = Core.bing_key() 18 | self.counter = start 19 | self.proxy = False 20 | 21 | async def do_search(self) -> None: 22 | headers = { 23 | 'Host': self.hostname, 24 | 'Cookie': 'SRCHHPGUSR=ADLT=DEMOTE&NRSLT=50', 25 | 'Accept-Language': 'en-us,en', 26 | 'User-agent': Core.get_user_agent(), 27 | } 28 | base_url = f'https://{self.server}/search?q=%40"{self.word}"&count=50&first=xx' 29 | urls = [base_url.replace('xx', str(num)) for num in range(0, self.limit, 50) if num <= 
self.limit] 30 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 31 | for response in responses: 32 | self.total_results += response 33 | 34 | async def do_search_api(self) -> None: 35 | url = 'https://api.bing.microsoft.com/v7.0/search?' 36 | params = { 37 | 'q': self.word, 38 | 'count': str(self.limit), 39 | 'offset': '0', 40 | 'mkt': 'en-us', 41 | 'safesearch': 'Off', 42 | } 43 | headers = { 44 | 'User-Agent': Core.get_user_agent(), 45 | 'Ocp-Apim-Subscription-Key': self.bingApi, 46 | } 47 | self.results = await AsyncFetcher.fetch_all([url], headers=headers, params=params, proxy=self.proxy) 48 | for res in self.results: 49 | self.total_results += res 50 | 51 | async def do_search_vhost(self) -> None: 52 | headers = { 53 | 'Host': self.hostname, 54 | 'Cookie': 'mkt=en-US;ui=en-US;SRCHHPGUSR=NEWWND=0&ADLT=DEMOTE&NRSLT=50', 55 | 'Accept-Language': 'en-us,en', 56 | 'User-agent': Core.get_user_agent(), 57 | } 58 | base_url = f'http://{self.server}/search?q=ip:{self.word}&go=&count=50&FORM=QBHL&qs=n&first=xx' 59 | urls = [base_url.replace('xx', str(num)) for num in range(0, self.limit, 50) if num <= self.limit] 60 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 61 | for response in responses: 62 | self.total_results += response 63 | 64 | async def get_emails(self): 65 | rawres = myparser.Parser(self.total_results, self.word) 66 | return await rawres.emails() 67 | 68 | async def get_hostnames(self): 69 | rawres = myparser.Parser(self.total_results, self.word) 70 | return await rawres.hostnames() 71 | 72 | async def get_allhostnames(self): 73 | rawres = myparser.Parser(self.total_results, self.word) 74 | return await rawres.hostnames_all() 75 | 76 | async def process(self, api, proxy: bool = False) -> None: 77 | self.proxy = proxy 78 | if api == 'yes': 79 | if self.bingApi is None: 80 | raise MissingKey('BingAPI') 81 | await self.do_search_api() 82 | else: 83 | await self.do_search() 84 | print(f'\tSearching {self.counter} results.') 85 | 86 | async def process_vhost(self) -> None: 87 | await self.do_search_vhost() 88 | -------------------------------------------------------------------------------- /theHarvester/discovery/bravesearch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from theHarvester.discovery.constants import get_delay 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | from theHarvester.parsers import myparser 6 | 7 | 8 | class SearchBrave: 9 | def __init__(self, word, limit): 10 | self.word = word 11 | self.results = '' 12 | self.totalresults = '' 13 | self.server = 'https://search.brave.com/search?q=' 14 | self.limit = limit 15 | self.proxy = False 16 | 17 | async def do_search(self): 18 | headers = {'User-Agent': Core.get_user_agent()} 19 | for query in [f'"{self.word}"', f'site:{self.word}']: 20 | try: 21 | for offset in range(0, 50): 22 | # To reduce the total number of requests, only two queries are made "self.word" and site:self.word 23 | current_url = f'{self.server}{query}&offset={offset}&source=web&show_local=0&spellcheck=0' 24 | resp = await AsyncFetcher.fetch_all([current_url], headers=headers, proxy=self.proxy) 25 | self.results = resp[0] 26 | self.totalresults += self.results 27 | # if 'Results from Microsoft Bing.' 
in resp[0] \ 28 | if ( 29 | 'Not many great matches came back for your search' in resp[0] 30 | or 'Your request has been flagged as being suspicious and Brave Search' in resp[0] 31 | or 'Prove' in resp[0] 32 | and 'robot' in resp[0] 33 | or 'Robot' in resp[0] 34 | ): 35 | break 36 | await asyncio.sleep(get_delay() + 15) 37 | except Exception as e: 38 | print(f'An exception has occurred in bravesearch: {e}') 39 | await asyncio.sleep(get_delay() + 80) 40 | continue 41 | 42 | async def get_emails(self): 43 | rawres = myparser.Parser(self.totalresults, self.word) 44 | return await rawres.emails() 45 | 46 | async def get_hostnames(self): 47 | rawres = myparser.Parser(self.totalresults, self.word) 48 | return await rawres.hostnames() 49 | 50 | async def process(self, proxy=False): 51 | self.proxy = proxy 52 | await self.do_search() 53 | -------------------------------------------------------------------------------- /theHarvester/discovery/bufferoverun.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchBufferover: 8 | def __init__(self, word) -> None: 9 | self.word = word 10 | self.totalhosts: set = set() 11 | self.totalips: set = set() 12 | self.key = Core.bufferoverun_key() 13 | if self.key is None: 14 | raise MissingKey('bufferoverun') 15 | self.proxy = False 16 | 17 | async def do_search(self) -> None: 18 | url = f'https://tls.bufferover.run/dns?q={self.word}' 19 | response = await AsyncFetcher.fetch_all( 20 | [url], 21 | json=True, 22 | headers={'User-Agent': Core.get_user_agent(), 'x-api-key': f'{self.key}'}, 23 | proxy=self.proxy, 24 | ) 25 | dct = response[0] 26 | if dct['Results']: 27 | self.totalhosts = { 28 | ( 29 | host.split(',') 30 | if ',' in host and self.word.replace('www.', '') in host.split(',')[0] in host 31 | else host.split(',')[4] 32 | ) 33 | for host in dct['Results'] 34 | } 35 | 36 | self.totalips = { 37 | ip.split(',')[0] for ip in dct['Results'] if re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', ip.split(',')[0]) 38 | } 39 | 40 | async def get_hostnames(self) -> set: 41 | return self.totalhosts 42 | 43 | async def get_ips(self) -> set: 44 | return self.totalips 45 | 46 | async def process(self, proxy: bool = False) -> None: 47 | self.proxy = proxy 48 | await self.do_search() 49 | -------------------------------------------------------------------------------- /theHarvester/discovery/builtwith.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchBuiltWith: 8 | def __init__(self, word: str): 9 | self.word = word 10 | self.api_key = Core.builtwith_key() 11 | self.base_url = 'https://api.builtwith.com/v21/api.json' 12 | self.headers = {'Authorization': f'Bearer {self.api_key}', 'Content-Type': 'application/json'} 13 | self.hosts = set() 14 | self.tech_stack = {} 15 | self.interesting_urls = set() 16 | self.frameworks = set() 17 | self.languages = set() 18 | self.servers = set() 19 | self.cms = set() 20 | self.analytics = set() 21 | 22 | async def process(self, proxy: bool = False) -> None: 23 | """Get technology stack information for a domain.""" 24 | try: 25 | if proxy: 26 | response = await AsyncFetcher.fetch( 27 | session=None, 
url=f'{self.base_url}?KEY={self.api_key}&LOOKUP={self.word}', headers=self.headers, proxy=proxy 28 | ) 29 | if response: 30 | self.tech_stack = response 31 | self._extract_data() 32 | else: 33 | async with aiohttp.ClientSession(headers=self.headers) as session: 34 | async with session.get(f'{self.base_url}?KEY={self.api_key}&LOOKUP={self.word}') as response: 35 | if response.status == 200: 36 | data = await response.json() 37 | self.tech_stack = data 38 | self._extract_data() 39 | elif response.status == 401: 40 | print('[!] Missing API key for BuiltWith.') 41 | raise MissingKey('BuiltWith') 42 | except Exception as e: 43 | print(f'Error in BuiltWith search: {e}') 44 | 45 | def _extract_data(self) -> None: 46 | """Extract and categorize technology information.""" 47 | if 'domains' in self.tech_stack: 48 | self.hosts.update(self.tech_stack['domains']) 49 | if 'paths' in self.tech_stack: 50 | self.interesting_urls.update(self.tech_stack['paths']) 51 | if 'technologies' in self.tech_stack: 52 | for tech in self.tech_stack['technologies']: 53 | category = tech.get('category', '').lower() 54 | name = tech.get('name', '') 55 | 56 | if 'framework' in category: 57 | self.frameworks.add(name) 58 | elif 'language' in category: 59 | self.languages.add(name) 60 | elif 'server' in category: 61 | self.servers.add(name) 62 | elif 'cms' in category: 63 | self.cms.add(name) 64 | elif 'analytics' in category: 65 | self.analytics.add(name) 66 | 67 | async def get_hostnames(self) -> set[str]: 68 | return self.hosts 69 | 70 | async def get_tech_stack(self) -> dict: 71 | return self.tech_stack 72 | 73 | async def get_interesting_urls(self) -> set[str]: 74 | return self.interesting_urls 75 | 76 | async def get_frameworks(self) -> set[str]: 77 | return self.frameworks 78 | 79 | async def get_languages(self) -> set[str]: 80 | return self.languages 81 | 82 | async def get_servers(self) -> set[str]: 83 | return self.servers 84 | 85 | async def get_cms(self) -> set[str]: 86 | return self.cms 87 | 88 | async def get_analytics(self) -> set[str]: 89 | return self.analytics 90 | -------------------------------------------------------------------------------- /theHarvester/discovery/censysearch.py: -------------------------------------------------------------------------------- 1 | from censys.common import __version__ 2 | from censys.common.exceptions import ( 3 | CensysRateLimitExceededException, 4 | CensysUnauthorizedException, 5 | ) 6 | from censys.search import CensysCerts 7 | 8 | from theHarvester.discovery.constants import MissingKey 9 | from theHarvester.lib.core import Core 10 | from theHarvester.lib.version import version as thehavester_version 11 | 12 | 13 | class SearchCensys: 14 | def __init__(self, domain, limit: int = 500) -> None: 15 | self.word = domain 16 | self.key = Core.censys_key() 17 | if self.key[0] is None or self.key[1] is None: 18 | raise MissingKey('Censys ID and/or Secret') 19 | self.totalhosts: set = set() 20 | self.emails: set = set() 21 | self.limit = limit 22 | self.proxy = False 23 | 24 | async def do_search(self) -> None: 25 | try: 26 | cert_search = CensysCerts( 27 | api_id=self.key[0], 28 | api_secret=self.key[1], 29 | user_agent=f'censys-python/{__version__} (theHarvester/{thehavester_version}); +https://github.com/laramies/theHarvester)', 30 | ) 31 | except CensysUnauthorizedException: 32 | raise MissingKey('Censys ID and/or Secret') 33 | 34 | query = f'names: {self.word}' 35 | try: 36 | response = cert_search.search( 37 | query=query, 38 | fields=['names', 
'parsed.subject.email_address'], 39 | max_records=self.limit, 40 | ) 41 | for cert in response(): 42 | self.totalhosts.update(cert.get('names', [])) 43 | email_address = cert.get('parsed', {}).get('subject', {}).get('email_address', []) 44 | self.emails.update(email_address) 45 | except CensysRateLimitExceededException: 46 | print('Censys rate limit exceeded') 47 | 48 | async def get_hostnames(self) -> set: 49 | return self.totalhosts 50 | 51 | async def get_emails(self) -> set: 52 | return self.emails 53 | 54 | async def process(self, proxy: bool = False) -> None: 55 | self.proxy = proxy 56 | await self.do_search() 57 | -------------------------------------------------------------------------------- /theHarvester/discovery/certspottersearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher 2 | 3 | 4 | class SearchCertspoter: 5 | def __init__(self, word) -> None: 6 | self.word = word 7 | self.totalhosts: set = set() 8 | self.proxy = False 9 | 10 | async def do_search(self) -> None: 11 | base_url = f'https://api.certspotter.com/v1/issuances?domain={self.word}&expand=dns_names' 12 | try: 13 | response = await AsyncFetcher.fetch_all([base_url], json=True, proxy=self.proxy) 14 | response = response[0] 15 | if isinstance(response, list): 16 | for dct in response: 17 | for key, value in dct.items(): 18 | if key == 'dns_names': 19 | self.totalhosts.update({name for name in value if name}) 20 | elif isinstance(response, dict): 21 | self.totalhosts.update({response['dns_names'] if 'dns_names' in response.keys() else ''}) # type: ignore 22 | else: 23 | self.totalhosts.update({''}) 24 | except Exception as e: 25 | print(e) 26 | 27 | async def get_hostnames(self) -> set: 28 | return self.totalhosts 29 | 30 | async def process(self, proxy: bool = False) -> None: 31 | self.proxy = proxy 32 | await self.do_search() 33 | print('\tSearching results.') 34 | -------------------------------------------------------------------------------- /theHarvester/discovery/constants.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from theHarvester.lib.core import AsyncFetcher, Core 4 | 5 | 6 | async def splitter(links): 7 | """ 8 | Method that tries to remove duplicates 9 | LinkedinLists pulls a lot of profiles with the same name. 10 | This method tries to remove duplicates from the list. 11 | :param links: list of links to remove duplicates from 12 | :return: a unique-ish list 13 | """ 14 | unique_list = [] 15 | name_check = [] 16 | for url in links: 17 | tail = url.split('/')[-1] 18 | if len(tail) == 2 or tail == 'zh-cn': 19 | tail = url.split('/')[-2] 20 | name = tail.split('-') 21 | if len(name) > 1: 22 | joined_name = name[0] + name[1] 23 | else: 24 | joined_name = name[0] 25 | if joined_name not in name_check: 26 | unique_list.append(url) 27 | name_check.append(joined_name) 28 | return unique_list 29 | 30 | 31 | def filter(lst): 32 | """ 33 | Method that filters list 34 | :param lst: list to be filtered 35 | :return: new filtered list 36 | """ 37 | if lst is None: 38 | return [] 39 | if not isinstance(lst, set): 40 | lst = set(lst) # Remove duplicates. 41 | new_lst = [] 42 | for item in lst: 43 | item = str(item) 44 | if (item[0].isalpha() or item[0].isdigit()) and ('xxx' not in item) and ('..' 
not in item): 45 | item = item.replace('252f', '').replace('2F', '').replace('2f', '') 46 | new_lst.append(item.lower()) 47 | return new_lst 48 | 49 | 50 | def get_delay() -> float: 51 | """Method that is used to generate a random delay""" 52 | return random.randint(1, 3) - 0.5 53 | 54 | 55 | async def search(text: str) -> bool: 56 | """Helper function to check if Google has blocked traffic. 57 | :param text: See if specific text is returned, which means Google is blocking us 58 | :return bool: 59 | """ 60 | for line in text.strip().splitlines(): 61 | if ( 62 | 'This page appears when Google automatically detects requests coming from your computer network' in line 63 | or 'http://www.google.com/sorry/index' in line 64 | or 'https://www.google.com/sorry/index' in line 65 | ): 66 | # print('\tGoogle is blocking your IP due to too many automated requests, wait or change your IP') 67 | return True 68 | return False 69 | 70 | 71 | async def google_workaround(visit_url: str) -> bool | str: 72 | """ 73 | Function that makes a request on our behalf if Google starts to block us 74 | :param visit_url: Url to scrape 75 | :return: Correct html that can be parsed by BS4 76 | """ 77 | url = 'https://websniffer.cc/' 78 | data = { 79 | 'Cookie': '', 80 | 'url': visit_url, 81 | 'submit': 'Submit', 82 | 'type': 'GET&http=1.1', 83 | 'uak': str(random.randint(4, 8)), # select random UA to send to Google 84 | } 85 | returned_html = await AsyncFetcher.post_fetch(url, headers={'User-Agent': Core.get_user_agent()}, data=data) 86 | returned_html = ( 87 | 'This page appears when Google automatically detects requests coming from your computer network' 88 | if returned_html == '' 89 | else returned_html[0] 90 | ) 91 | 92 | returned_html = '' if 'Please Wait... | Cloudflare' in returned_html else returned_html 93 | 94 | if len(returned_html) == 0 or await search(returned_html) or '<html' not in returned_html: 95 | # indicates that google is serving workaround a captcha 96 | # That means we will try out second option which will utilize proxies 97 | return True 98 | # the html we get is malformed for BS4 as there are no greater than or less than signs 99 | if '<html>' in returned_html: 100 | start_index = returned_html.index('<html>') 101 | else: 102 | start_index = returned_html.index('<html') 103 | 104 | end_index = returned_html.index('</html>') + 1 105 | correct_html = returned_html[start_index:end_index] 106 | # Slice list to get the response's html 107 | correct_html = ''.join([ch.strip().replace('<', '<').replace('>', '>') for ch in correct_html]) 108 | return correct_html 109 | 110 | 111 | class MissingKey(Exception): 112 | """ 113 | :raise: When there is a module that has not been provided its API key 114 | """ 115 | 116 | def __init__(self, source: str | None) -> None: 117 | if source: 118 | self.message = f'\n\033[93m[!] Missing API key for {source}. \033[0m' 119 | else: 120 | self.message = '\n\033[93m[!] Missing CSE id. 
\033[0m' 121 | 122 | def __str__(self) -> str: 123 | return self.message 124 | -------------------------------------------------------------------------------- /theHarvester/discovery/criminalip.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from urllib.parse import urlparse 3 | 4 | from theHarvester.discovery.constants import MissingKey, get_delay 5 | from theHarvester.lib.core import AsyncFetcher, Core 6 | 7 | 8 | class SearchCriminalIP: 9 | def __init__(self, word) -> None: 10 | self.word = word 11 | self.totalhosts: set = set() 12 | self.totalips: set = set() 13 | self.asns: set = set() 14 | self.key = Core.criminalip_key() 15 | if self.key is None: 16 | raise MissingKey('criminalip') 17 | self.proxy = False 18 | 19 | async def do_search(self) -> None: 20 | # https://www.criminalip.io/developer/api/post-domain-scan 21 | # https://www.criminalip.io/developer/api/get-domain-status-id 22 | # https://www.criminalip.io/developer/api/get-domain-report-id 23 | url = 'https://api.criminalip.io/v1/domain/scan' 24 | data = f'{{"query": "{self.word}"}}' 25 | # print(f'Current key: {self.key}') 26 | user_agent = Core.get_user_agent() 27 | response = await AsyncFetcher.post_fetch( 28 | url, 29 | json=True, 30 | headers={'User-Agent': user_agent, 'x-api-key': f'{self.key}'}, 31 | data=data, 32 | proxy=self.proxy, 33 | ) 34 | # print(f'My response: {response}') 35 | # Expected response format: 36 | # {'data': {'scan_id': scan_id}, 'message': 'api success', 'status': 200} 37 | if 'status' in response.keys(): 38 | status = response['status'] 39 | if status != 200: 40 | print(f'An error has occurred searching criminalip dumping response: {response}') 41 | else: 42 | scan_id = response['data']['scan_id'] 43 | scan_percentage = 0 44 | counter = 0 45 | while scan_percentage != 100: 46 | status_url = f'https://api.criminalip.io/v1/domain/status/{scan_id}' 47 | status_response = await AsyncFetcher.fetch_all( 48 | [status_url], 49 | json=True, 50 | headers={'User-Agent': user_agent, 'x-api-key': f'{self.key}'}, 51 | proxy=self.proxy, 52 | ) 53 | status = status_response[0] 54 | # print(f'Status response: {status}') 55 | # Expected format: 56 | # {"data": {"scan_percentage": 100}, "message": "api success", "status": 200} 57 | scan_percentage = status['data']['scan_percentage'] 58 | if scan_percentage == 100: 59 | break 60 | if scan_percentage == -2: 61 | print(f'CriminalIP failed to scan: {self.word} does not exist, verify manually') 62 | print(f'Dumping data: scan_response: {response} status_response: {status}') 63 | return 64 | if scan_percentage == -1: 65 | print(f'CriminalIP scan failed dumping data: scan_response: {response} status_response: {status}') 66 | return 67 | # Wait for scan to finish 68 | if counter >= 5: 69 | await asyncio.sleep(20 * get_delay()) 70 | else: 71 | await asyncio.sleep(10 * get_delay()) 72 | counter += 1 73 | if counter == 10: 74 | print( 75 | 'Ten iterations have occurred in CriminalIP waiting for scan to finish, returning to prevent infinite loop.' 
76 | ) 77 | print( 78 | f'Verify results manually on CriminalIP dumping data: scan_response: {response} status_response: {status}' 79 | ) 80 | return 81 | 82 | report_url = f'https://api.criminalip.io/v1/domain/report/{scan_id}' 83 | scan_response = await AsyncFetcher.fetch_all( 84 | [report_url], 85 | json=True, 86 | headers={'User-Agent': user_agent, 'x-api-key': f'{self.key}'}, 87 | proxy=self.proxy, 88 | ) 89 | scan = scan_response[0] 90 | # json_formatted_str = json.dumps(scan, indent=2) 91 | # print(json_formatted_str) 92 | try: 93 | await self.parser(scan) 94 | except Exception as e: 95 | print(f'An exception occurred while parsing criminalip result: {e}') 96 | print('Dumping json: ') 97 | print(scan) 98 | 99 | async def parser(self, jlines): 100 | # TODO when new scope field is added to parse lines for potential new scope! 101 | # TODO map as_name to asn for asn data 102 | # TODO determine if worth storing interesting urls 103 | if 'data' not in jlines.keys(): 104 | print(f'Error with criminalip data, dumping: {jlines}') 105 | return 106 | data = jlines['data'] 107 | for cert in data['certificates']: 108 | # print(f'Current cert: {cert}') 109 | if cert['subject'].endswith('.' + self.word): 110 | self.totalhosts.add(cert['subject']) 111 | 112 | for connected_domain in data['connected_domain_subdomain']: 113 | try: 114 | main_domain = connected_domain['main_domain']['domain'] 115 | subdomains = [sub['domain'] for sub in connected_domain['subdomains']] 116 | if main_domain.endswith('.' + self.word): 117 | self.totalhosts.add(main_domain) 118 | for sub in subdomains: 119 | # print(f'Current sub: {sub}') 120 | if sub.endswith('.' + self.word): 121 | self.totalhosts.add(sub) 122 | except Exception as e: 123 | print(f'An exception has occurred: {e}') 124 | print(f'Main line: {connected_domain}') 125 | 126 | for ip_info in data['connected_ip_info']: 127 | self.asns.add(str(ip_info['asn'])) 128 | domains = [sub['domain'] for sub in ip_info['domain_list']] 129 | for sub in domains: 130 | if sub.endswith('.' + self.word): 131 | self.totalhosts.add(sub) 132 | self.totalips.add(ip_info['ip']) 133 | 134 | for cookie in data['cookies']: 135 | if cookie['domain'] != '.' + self.word and cookie['domain'].endswith('.' + self.word): 136 | self.totalhosts.add(cookie['domain']) 137 | 138 | for country in data['country']: 139 | if country['domain'].endswith('.' + self.word): 140 | self.totalhosts.add(country['domain']) 141 | for ip in country['mapped_ips']: 142 | self.totalips.add(ip['ip']) 143 | 144 | for k, v in data['dns_record'].items(): 145 | if k == 'dns_record_type_a': 146 | for ip in data['dns_record'][k]['ipv4']: 147 | self.totalips.add(ip['ip']) 148 | else: 149 | if isinstance(v, list): 150 | for item in v: 151 | if isinstance(item, list): 152 | for subitem in item: 153 | if subitem.endswith('.' + self.word): 154 | self.totalhosts.add(subitem) 155 | else: 156 | if item.endswith('.' + self.word): 157 | self.totalhosts.add(item) 158 | 159 | for domain_list in data['domain_list']: 160 | self.asns.add(str(domain_list['asn'])) 161 | domains = [sub['domain'] for sub in domain_list['domain_list']] 162 | for sub in domains: 163 | if sub.endswith('.' + self.word): 164 | self.totalhosts.add(sub) 165 | self.totalips.add(domain_list['ip']) 166 | 167 | for html_page_links in data['html_page_link_domains']: 168 | domain = html_page_links['domain'] 169 | if domain.endswith('.' 
+ self.word): 170 | self.totalhosts.add(domain) 171 | for ip in html_page_links['mapped_ips']: 172 | self.totalips.add(ip['ip']) 173 | 174 | # TODO combine data['links'] and data['network_logs'] urls into one list for one run through 175 | for link in data['links']: 176 | url = link['url'] 177 | parsed_url = urlparse(url) 178 | netloc = parsed_url.netloc 179 | if self.word in netloc: 180 | if (':' in netloc and netloc.split(':')[0].endswith(self.word)) or netloc.endswith(self.word): 181 | self.totalhosts.add(netloc) 182 | 183 | for log in data['network_logs']: 184 | url = log['url'] 185 | parsed_url = urlparse(url) 186 | netloc = parsed_url.netloc 187 | if self.word in netloc: 188 | if (':' in netloc and netloc.split(':')[0].endswith(self.word)) or netloc.endswith(self.word): 189 | self.totalhosts.add(netloc) 190 | self.asns.add(str(log['as_number'])) 191 | 192 | for redirects in data['page_redirections']: 193 | for redirect in redirects: 194 | url = redirect['url'] 195 | parsed_url = urlparse(url) 196 | netloc = parsed_url.netloc 197 | if self.word in netloc: 198 | if (':' in netloc and netloc.split(':')[0].endswith(self.word)) or netloc.endswith(self.word): 199 | self.totalhosts.add(netloc) 200 | 201 | self.totalhosts = {host.replace('www.', '') for host in self.totalhosts if '*.' + self.word != host} 202 | 203 | # print(f'hostnames: {self.totalhosts}') 204 | # print(f'asns: {self.asns}') 205 | # print(f'ips: {self.totalips}') 206 | 207 | async def get_asns(self) -> set: 208 | return self.asns 209 | 210 | async def get_hostnames(self) -> set: 211 | return self.totalhosts 212 | 213 | async def get_ips(self) -> set: 214 | return self.totalips 215 | 216 | async def process(self, proxy: bool = False) -> None: 217 | self.proxy = proxy 218 | await self.do_search() 219 | -------------------------------------------------------------------------------- /theHarvester/discovery/crtsh.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher 2 | 3 | 4 | class SearchCrtsh: 5 | def __init__(self, word) -> None: 6 | self.word = word 7 | self.data: list = [] 8 | self.proxy = False 9 | 10 | async def do_search(self) -> list: 11 | data: set = set() 12 | try: 13 | url = f'https://crt.sh/?q=%25.{self.word}&exclude=expired&deduplicate=Y&output=json' 14 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 15 | response = response[0] 16 | data = set([(dct['name_value'][2:] if dct['name_value'][:2] == '*.' else dct['name_value']) for dct in response]) 17 | data = {domain for domain in data if (domain[0] != '*' and str(domain[0:4]).isnumeric() is False)} 18 | except Exception as e: 19 | print(e) 20 | clean: list = [] 21 | for x in data: 22 | pre = x.split() 23 | for y in pre: 24 | clean.append(y) 25 | return clean 26 | 27 | async def process(self, proxy: bool = False) -> None: 28 | self.proxy = proxy 29 | data = await self.do_search() 30 | self.data = data 31 | 32 | async def get_hostnames(self) -> list: 33 | return self.data 34 | -------------------------------------------------------------------------------- /theHarvester/discovery/dnssearch.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============ 3 | DNS Browsing 4 | ============ 5 | 6 | Explore the space around known hosts & ips for extra catches. 
7 | """ 8 | 9 | import asyncio 10 | import re 11 | import sys 12 | from collections.abc import Callable 13 | from ipaddress import IPv4Network 14 | 15 | from aiodns import DNSResolver 16 | 17 | from theHarvester.lib import hostchecker 18 | from theHarvester.lib.core import DATA_DIR 19 | 20 | ##################################################################### 21 | # DNS FORCE 22 | ##################################################################### 23 | 24 | DNS_NAMES = DATA_DIR / 'wordlists' / 'dns-names.txt' 25 | 26 | 27 | class DnsForce: 28 | def __init__(self, domain, dnsserver, verbose: bool = False) -> None: 29 | self.domain = domain 30 | self.subdo = False 31 | self.verbose = verbose 32 | # self.dnsserver = [dnsserver] if isinstance(dnsserver, str) else dnsserver 33 | # self.dnsserver = list(map(str, dnsserver.split(','))) if isinstance(dnsserver, str) else dnsserver 34 | self.dnsserver = dnsserver 35 | with DNS_NAMES.open('r') as file: 36 | self.list = file.readlines() 37 | self.domain = domain.replace('www.', '') 38 | self.list = [f'{word.strip()}.{self.domain}' for word in self.list] 39 | 40 | async def run(self): 41 | print(f'Starting DNS brute forcing with {len(self.list)} words') 42 | checker = hostchecker.Checker(self.list, nameservers=self.dnsserver) 43 | resolved_pair, hosts, ips = await checker.check() 44 | return resolved_pair, hosts, ips 45 | 46 | 47 | ##################################################################### 48 | # DNS REVERSE 49 | ##################################################################### 50 | 51 | 52 | IP_REGEX = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}' 53 | PORT_REGEX = r'\d{1,5}' 54 | NETMASK_REGEX: str = r'\d{1,2}|' + IP_REGEX 55 | NETWORK_REGEX: str = rf'\b({IP_REGEX})(?:\:({PORT_REGEX}))?(?:\/({NETMASK_REGEX}))?\b' 56 | 57 | 58 | def serialize_ip_range(ip: str, netmask: str = '24') -> str: 59 | """ 60 | Serialize a network range in a constant format, 'x.x.x.x/y'. 61 | 62 | Parameters 63 | ---------- 64 | ip: str. 65 | A serialized ip in the format 'x.x.x.x'. 66 | Extra information like port (':z') or subnet ('/n') 67 | will be ignored. 68 | netmask: str. 69 | The subnet subdivision, represented by a 2 digit netmask. 70 | 71 | Returns 72 | ------- 73 | out: str. 74 | The network OSI address, like '192.168.0.0/24'. 75 | """ 76 | __ip_matches = re.search(NETWORK_REGEX, ip, re.IGNORECASE) 77 | if __ip_matches and __ip_matches.groups(): 78 | __ip = __ip_matches.group(1) 79 | __netmask = netmask if netmask else __ip_matches.group(3) 80 | if __ip and __netmask: 81 | return str(IPv4Network(f'{__ip}/{__netmask}', strict=False)) 82 | elif __ip: 83 | return str(IPv4Network('{}/{}'.format(__ip, '24'), strict=False)) 84 | 85 | # invalid input ip 86 | return '' 87 | 88 | 89 | def list_ips_in_network_range(iprange: str) -> list[str]: 90 | """ 91 | List all the IPs in the range. 92 | 93 | Parameters 94 | ---------- 95 | iprange: str. 96 | A serialized ip range, like '1.2.3.0/24'. 97 | The last digit can be set to anything, it will be ignored. 98 | 99 | Returns 100 | ------- 101 | out: list. 102 | The list of IPs in the range. 103 | """ 104 | try: 105 | __network = IPv4Network(iprange, strict=False) 106 | return [__address.exploded for __address in __network.hosts()] 107 | except Exception: 108 | return [] 109 | 110 | 111 | async def reverse_single_ip(ip: str, resolver: DNSResolver) -> str: 112 | """ 113 | Reverse a single IP and output the linked CNAME, if it exists. 
114 | Parameters 115 | ---------- 116 | :param ip: IP address to reverse 117 | :param resolver: DNS server to use 118 | 119 | Returns 120 | ------- 121 | :return str: with the corresponding CNAME or None 122 | """ 123 | try: 124 | __host = await resolver.gethostbyaddr(ip) 125 | return __host.name if __host else '' 126 | except Exception: 127 | return '' 128 | 129 | 130 | async def reverse_all_ips_in_range(iprange: str, callback: Callable, nameservers: list[str] | None = None) -> None: 131 | """ 132 | Reverse all the IPs stored in a network range. 133 | All the queries are made concurrently. 134 | 135 | Parameters 136 | ---------- 137 | iprange: str. 138 | An IPv4 range formatted as 'x.x.x.x/y'. 139 | The last 2 digits of the ip can be set to anything, 140 | they will be ignored. 141 | callback: Callable. 142 | Arbitrary postprocessing function. 143 | nameservers: List[str]. 144 | Optional list of DNS servers. 145 | 146 | Returns 147 | ------- 148 | out: None. 149 | """ 150 | loop = asyncio.get_event_loop() 151 | __resolver = DNSResolver(loop=loop, timeout=8, nameservers=nameservers) 152 | for __ip in list_ips_in_network_range(iprange): 153 | log_query(__ip) 154 | __host = await reverse_single_ip(ip=__ip, resolver=__resolver) 155 | callback(__host) 156 | log_result(__host) 157 | 158 | 159 | ##################################################################### 160 | # IO 161 | ##################################################################### 162 | 163 | 164 | def log_query(ip: str) -> None: 165 | """ 166 | Display the current query in the console. 167 | 168 | Parameters 169 | ---------- 170 | ip: str. 171 | Queried ip. 172 | 173 | Results 174 | ------- 175 | out: None. 176 | """ 177 | sys.stdout.write(chr(27) + '[2K' + chr(27) + '[G') 178 | sys.stdout.write('\r' + ip + ' - ') 179 | sys.stdout.flush() 180 | 181 | 182 | def log_result(host: str) -> None: 183 | """ 184 | Display the query result in the console. 185 | 186 | Parameters 187 | ---------- 188 | host: str. 189 | Host name returned by the DNS query. 190 | 191 | Results 192 | ------- 193 | out: None. 194 | """ 195 | if host: 196 | print(host) 197 | 198 | 199 | def generate_postprocessing_callback(target: str, **allhosts: list[str]) -> Callable: 200 | """ 201 | Postprocess the query results asynchronously too, instead of waiting for 202 | the querying stage to be completely finished. 203 | 204 | Parameters 205 | ---------- 206 | target: str. 207 | The domain wanted as TLD. 208 | allhosts: List. 209 | A collection of all the subdomains -of target- found so far. 210 | 211 | Returns 212 | ------- 213 | out: Callable. 214 | A function that will update the collection of target subdomains 215 | when the query result is satisfying. 
216 | """ 217 | 218 | def append_matching_hosts(host: str) -> None: 219 | if host and target in host: 220 | for __name, __hosts in allhosts.items(): 221 | if host not in __hosts: 222 | __hosts.append(host) 223 | 224 | return append_matching_hosts 225 | -------------------------------------------------------------------------------- /theHarvester/discovery/duckduckgosearch.py: -------------------------------------------------------------------------------- 1 | import ujson 2 | 3 | from theHarvester.lib.core import AsyncFetcher, Core 4 | from theHarvester.parsers import myparser 5 | 6 | 7 | class SearchDuckDuckGo: 8 | def __init__(self, word, limit) -> None: 9 | self.word = word 10 | self.results = '' 11 | self.totalresults = '' 12 | self.dorks: list = [] 13 | self.links: list = [] 14 | self.database = 'https://duckduckgo.com/?q=' 15 | self.api = 'https://api.duckduckgo.com/?q=x&format=json&pretty=1' # Currently using API. 16 | self.quantity = '100' 17 | self.limit = limit 18 | self.proxy = False 19 | 20 | async def do_search(self) -> None: 21 | # Do normal scraping. 22 | url = self.api.replace('x', self.word) 23 | headers = {'User-Agent': Core.get_user_agent()} 24 | first_resp = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy) 25 | self.results = first_resp[0] 26 | self.totalresults += self.results 27 | urls = await self.crawl(self.results) 28 | urls = {url for url in urls if len(url) > 5} 29 | all_resps = await AsyncFetcher.fetch_all(urls) 30 | self.totalresults += ''.join(all_resps) 31 | 32 | async def crawl(self, text): 33 | """ 34 | Function parses json and returns URLs. 35 | :param text: formatted json 36 | :return: set of URLs 37 | """ 38 | urls = set() 39 | try: 40 | load = ujson.loads(text) 41 | for keys in load.keys(): # Iterate through keys of dict. 42 | val = load.get(keys) 43 | 44 | if isinstance(val, int) or isinstance(val, dict) or val is None: 45 | continue 46 | 47 | if isinstance(val, list): 48 | if len(val) == 0: # Make sure not indexing an empty list. 49 | continue 50 | val = val[0] # The First value should be dict. 51 | 52 | if isinstance(val, dict): # Validation check. 53 | for key in val.keys(): 54 | value = val.get(key) 55 | if isinstance(value, str) and value != '' and 'https://' in value or 'http://' in value: 56 | urls.add(value) 57 | 58 | if isinstance(val, str) and val != '' and 'https://' in val or 'http://' in val: 59 | urls.add(val) 60 | tmp = set() 61 | for url in urls: 62 | if '<' in url and 'href=' in url: # Format is 63 | equal_index = url.index('=') 64 | true_url = '' 65 | for ch in url[equal_index + 1 :]: 66 | if ch == '"': 67 | tmp.add(true_url) 68 | break 69 | true_url += ch 70 | else: 71 | if url != '': 72 | tmp.add(url) 73 | return tmp 74 | except Exception as e: 75 | print(f'Exception occurred: {e}') 76 | return [] 77 | 78 | async def get_emails(self): 79 | rawres = myparser.Parser(self.totalresults, self.word) 80 | return await rawres.emails() 81 | 82 | async def get_hostnames(self): 83 | rawres = myparser.Parser(self.totalresults, self.word) 84 | return await rawres.hostnames() 85 | 86 | async def process(self, proxy: bool = False) -> None: 87 | self.proxy = proxy 88 | await self.do_search() # Only need to search once since using API. 
89 | -------------------------------------------------------------------------------- /theHarvester/discovery/githubcode.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import random 3 | import urllib.parse as urlparse 4 | from typing import Any, NamedTuple 5 | 6 | import aiohttp 7 | 8 | from theHarvester.discovery.constants import MissingKey, get_delay 9 | from theHarvester.lib.core import Core 10 | from theHarvester.parsers import myparser 11 | 12 | 13 | class RetryResult(NamedTuple): 14 | time: float 15 | 16 | 17 | class SuccessResult(NamedTuple): 18 | fragments: list[str] 19 | next_page: int 20 | last_page: int 21 | 22 | 23 | class ErrorResult(NamedTuple): 24 | status_code: int 25 | body: Any 26 | 27 | 28 | class SearchGithubCode: 29 | def __init__(self, word, limit) -> None: 30 | try: 31 | self.word = word 32 | self.total_results = '' 33 | self.server = 'api.github.com' 34 | self.limit = limit 35 | self.counter = 0 36 | self.page = 1 37 | self.key = Core.github_key() 38 | if self.key is None: 39 | raise MissingKey('Github') 40 | self.proxy = False 41 | self.base_url = f'https://{self.server}/search/code?q="{self.word}"' 42 | self.headers = { 43 | 'Host': self.server, 44 | 'User-agent': Core.get_user_agent(), 45 | 'Accept': 'application/vnd.github.v3.text-match+json', 46 | 'Authorization': f'token {self.key}', 47 | } 48 | except Exception as e: 49 | print(f'Error initializing SearchGithubCode: {e}') 50 | raise 51 | 52 | @staticmethod 53 | async def fragments_from_response(json_data: dict) -> list[str]: 54 | try: 55 | return [ 56 | match['fragment'] 57 | for item in json_data.get('items', []) 58 | for match in item.get('text_matches', []) 59 | if match.get('fragment') is not None 60 | ] 61 | except Exception as e: 62 | print(f'Error extracting fragments: {e}') 63 | return [] 64 | 65 | @staticmethod 66 | async def page_from_response(page: str, links) -> int | None: 67 | try: 68 | if page_link := links.get(page): 69 | parsed = urlparse.urlparse(str(page_link.get('url'))) 70 | if page_param := urlparse.parse_qs(parsed.query).get('page', [None])[0]: 71 | return int(page_param) 72 | return 0 73 | except Exception as e: 74 | print(f'Error parsing page response: {e}') 75 | return None 76 | 77 | async def handle_response(self, response: tuple[str, dict, int, Any]) -> ErrorResult | RetryResult | SuccessResult: 78 | try: 79 | text, json_data, status, links = response 80 | if status == 200: 81 | results = await self.fragments_from_response(json_data) 82 | # Ensure next_page and last_page default to 0 if None 83 | next_page = await self.page_from_response('next', links) or 0 84 | last_page = await self.page_from_response('last', links) or 0 85 | return SuccessResult(results, next_page, last_page) 86 | if status in (429, 403): 87 | return RetryResult(60) 88 | return ErrorResult(status, json_data if isinstance(json_data, dict) else text) 89 | except Exception as e: 90 | print(f'Error handling response: {e}') 91 | return ErrorResult(500, str(e)) 92 | 93 | @staticmethod 94 | async def next_page_or_end(result: SuccessResult) -> int | None: 95 | if result.next_page is not None: 96 | return result.next_page 97 | else: 98 | return result.last_page 99 | 100 | async def do_search(self, page: int) -> tuple[str, dict, int, Any]: 101 | try: 102 | url = f'{self.base_url}&page={page}' if page else self.base_url 103 | async with aiohttp.ClientSession(headers=self.headers) as sess: 104 | async with sess.get(url, proxy=random.choice(Core.proxy_list()) if 
self.proxy else None) as resp: 105 | return await resp.text(), await resp.json(), resp.status, resp.links 106 | except Exception as e: 107 | print(f'Error performing search: {e}') 108 | return '', {}, 500, {} 109 | 110 | async def process(self, proxy: bool = False) -> None: 111 | try: 112 | self.proxy = proxy 113 | while self.counter <= self.limit and self.page is not None: 114 | try: 115 | api_response = await self.do_search(self.page) 116 | result = await self.handle_response(api_response) 117 | 118 | if isinstance(result, SuccessResult): 119 | print(f'\tSearching {self.counter} results.') 120 | self.total_results += ''.join(result.fragments) 121 | self.counter += len(result.fragments) 122 | self.page = result.next_page or result.last_page 123 | await asyncio.sleep(get_delay()) 124 | elif isinstance(result, RetryResult): 125 | sleepy_time = get_delay() + result.time 126 | print(f'\tRetrying page in {sleepy_time} seconds...') 127 | await asyncio.sleep(sleepy_time) 128 | else: 129 | print(f'\tException occurred: status_code: {result.status_code} reason: {result.body}') 130 | except Exception as e: 131 | print(f'Error processing page: {e}') 132 | await asyncio.sleep(get_delay()) 133 | except Exception as e: 134 | print(f'An exception has occurred in githubcode process: {e}') 135 | 136 | async def get_emails(self): 137 | try: 138 | rawres = myparser.Parser(self.total_results, self.word) 139 | return await rawres.emails() 140 | except Exception as e: 141 | print(f'Error getting emails: {e}') 142 | return [] 143 | 144 | async def get_hostnames(self): 145 | try: 146 | rawres = myparser.Parser(self.total_results, self.word) 147 | return await rawres.hostnames() 148 | except Exception as e: 149 | print(f'Error getting hostnames: {e}') 150 | return [] 151 | -------------------------------------------------------------------------------- /theHarvester/discovery/hackertarget.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher, Core 2 | 3 | 4 | class SearchHackerTarget: 5 | """ 6 | Class uses the HackerTarget api to gather subdomains and ips 7 | """ 8 | 9 | def __init__(self, word) -> None: 10 | self.word = word 11 | self.total_results = '' 12 | self.hostname = 'https://api.hackertarget.com' 13 | self.proxy = False 14 | self.results = None 15 | 16 | async def do_search(self) -> None: 17 | headers = {'User-agent': Core.get_user_agent()} 18 | urls = [ 19 | f'{self.hostname}/hostsearch/?q={self.word}', 20 | f'{self.hostname}/reversedns/?q={self.word}', 21 | ] 22 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 23 | for response in responses: 24 | self.total_results += response.replace(',', ':') 25 | 26 | async def process(self, proxy: bool = False) -> None: 27 | self.proxy = proxy 28 | await self.do_search() 29 | 30 | async def get_hostnames(self) -> list: 31 | return [result for result in self.total_results.splitlines() if 'No PTR records found' not in result] 32 | -------------------------------------------------------------------------------- /theHarvester/discovery/haveibeenpwned.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchHaveIBeenPwned: 8 | def __init__(self, word: str): 9 | self.word = word 10 | self.api_key = Core.haveibeenpwned_key() 11 | self.base_url = 
'https://haveibeenpwned.com/api/v3' 12 | self.headers = {'hibp-api-key': self.api_key, 'user-agent': 'theHarvester', 'Content-Type': 'application/json'} 13 | self.hosts = set() 14 | self.emails = set() 15 | self.breaches = [] 16 | self.pastes = [] 17 | self.breach_dates = set() 18 | self.breach_types = set() 19 | self.affected_data = set() 20 | 21 | async def process(self, proxy: bool = False) -> None: 22 | """Search for breaches associated with a domain or email.""" 23 | try: 24 | if proxy: 25 | response = await AsyncFetcher.fetch( 26 | session=None, url=f'{self.base_url}/breaches?domain={self.word}', headers=self.headers, proxy=proxy 27 | ) 28 | if response: 29 | self.breaches = response 30 | self._extract_data() 31 | else: 32 | async with aiohttp.ClientSession(headers=self.headers) as session: 33 | async with session.get(f'{self.base_url}/breaches?domain={self.word}') as response: 34 | if response.status == 200: 35 | self.breaches = await response.json() 36 | self._extract_data() 37 | elif response.status == 401: 38 | print('[!] Missing API key for HaveIBeenPwned.') 39 | raise MissingKey('HaveIBeenPwned') 40 | except Exception as e: 41 | print(f'Error in HaveIBeenPwned search: {e}') 42 | 43 | def _extract_data(self) -> None: 44 | """Extract and categorize breach information.""" 45 | for breach in self.breaches: 46 | if 'Domain' in breach: 47 | self.hosts.add(breach['Domain']) 48 | if 'BreachDate' in breach: 49 | self.breach_dates.add(breach['BreachDate']) 50 | if 'BreachType' in breach: 51 | self.breach_types.add(breach['BreachType']) 52 | if 'DataClasses' in breach: 53 | self.affected_data.update(breach['DataClasses']) 54 | 55 | async def get_hostnames(self) -> set[str]: 56 | return self.hosts 57 | 58 | async def get_emails(self) -> set[str]: 59 | return self.emails 60 | 61 | async def get_breaches(self) -> list[dict]: 62 | return self.breaches 63 | 64 | async def get_pastes(self) -> list[dict]: 65 | return self.pastes 66 | 67 | async def get_breach_dates(self) -> set[str]: 68 | return self.breach_dates 69 | 70 | async def get_breach_types(self) -> set[str]: 71 | return self.breach_types 72 | 73 | async def get_affected_data(self) -> set[str]: 74 | return self.affected_data 75 | -------------------------------------------------------------------------------- /theHarvester/discovery/huntersearch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchHunter: 8 | def __init__(self, word, limit, start) -> None: 9 | self.word = word 10 | self.limit = limit 11 | self.limit = 10 if limit > 10 else limit 12 | self.start = start 13 | self.key = Core.hunter_key() 14 | if self.key is None: 15 | raise MissingKey('Hunter') 16 | self.total_results = '' 17 | self.counter = start 18 | self.database = f'https://api.hunter.io/v2/domain-search?domain={self.word}&api_key={self.key}&limit=10' 19 | self.proxy = False 20 | self.hostnames: list = [] 21 | self.emails: list = [] 22 | 23 | async def do_search(self) -> None: 24 | # First determine if a user account is not a free account, this call is free 25 | is_free = True 26 | headers = {'User-Agent': Core.get_user_agent()} 27 | acc_info_url = f'https://api.hunter.io/v2/account?api_key={self.key}' 28 | response = await AsyncFetcher.fetch_all([acc_info_url], headers=headers, json=True) 29 | is_free = ( 30 | is_free if 'plan_name' in response[0]['data'].keys() and 
response[0]['data']['plan_name'].lower() == 'free' else False 31 | ) 32 | # Extract the total number of requests that are available for an account 33 | 34 | total_requests_avail = ( 35 | response[0]['data']['requests']['searches']['available'] - response[0]['data']['requests']['searches']['used'] 36 | ) 37 | if is_free: 38 | response = await AsyncFetcher.fetch_all([self.database], headers=headers, proxy=self.proxy, json=True) 39 | self.emails, self.hostnames = await self.parse_resp(json_resp=response[0]) 40 | else: 41 | # Determine the total number of emails that are available 42 | # As the most emails you can get within one query are 100 43 | # This is only done where paid accounts are in play 44 | hunter_dinfo_url = f'https://api.hunter.io/v2/email-count?domain={self.word}' 45 | response = await AsyncFetcher.fetch_all([hunter_dinfo_url], headers=headers, proxy=self.proxy, json=True) 46 | total_number_reqs = response[0]['data']['total'] // 100 47 | # Parse out meta field within initial JSON response to determine the total number of results 48 | if total_requests_avail < total_number_reqs: 49 | print('WARNING: account does not have enough requests to gather all emails') 50 | print(f'Total requests available: {total_requests_avail}, total requests needed to be made: {total_number_reqs}') 51 | print('RETURNING current results, if you would still like to run this module comment out the if request') 52 | return 53 | self.limit = 100 54 | # max number of emails you can get per request is 100 55 | # increments of 100 with offset determining where to start 56 | # See docs for more details: https://hunter.io/api-documentation/v2#domain-search 57 | for offset in range(0, 100 * total_number_reqs, 100): 58 | req_url = f'https://api.hunter.io/v2/domain-search?domain={self.word}&api_key={self.key}&limit{self.limit}&offset={offset}' 59 | response = await AsyncFetcher.fetch_all([req_url], headers=headers, proxy=self.proxy, json=True) 60 | temp_emails, temp_hostnames = await self.parse_resp(response[0]) 61 | self.emails.extend(temp_emails) 62 | self.hostnames.extend(temp_hostnames) 63 | await asyncio.sleep(1) 64 | 65 | async def parse_resp(self, json_resp): 66 | emails = list(sorted({email['value'] for email in json_resp['data']['emails']})) 67 | domains = list( 68 | sorted( 69 | { 70 | source['domain'] 71 | for email in json_resp['data']['emails'] 72 | for source in email['sources'] 73 | if self.word in source['domain'] 74 | } 75 | ) 76 | ) 77 | return emails, domains 78 | 79 | async def process(self, proxy: bool = False) -> None: 80 | self.proxy = proxy 81 | await self.do_search() # Only need to do it once. 
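# Worked example of the paid-plan pagination in do_search() above (illustrative figures,
# not taken from the API): if the email-count endpoint reports 250 addresses, then
# total_number_reqs = 250 // 100 = 2, so the loop runs range(0, 200, 100) and requests
# offset=0 and offset=100, up to self.limit (100) results per call, provided
# total_requests_avail is at least 2.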
82 | 83 | async def get_emails(self): 84 | return self.emails 85 | 86 | async def get_hostnames(self): 87 | return self.hostnames 88 | -------------------------------------------------------------------------------- /theHarvester/discovery/intelxsearch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Any 3 | from urllib.parse import urlparse 4 | 5 | import aiohttp 6 | 7 | from theHarvester.discovery.constants import MissingKey 8 | from theHarvester.lib.core import Core 9 | from theHarvester.parsers import intelxparser 10 | 11 | 12 | class SearchIntelx: 13 | def __init__(self, word) -> None: 14 | self.word = word 15 | self.key = Core.intelx_key() 16 | if self.key is None: 17 | raise MissingKey('Intelx') 18 | self.database = 'https://2.intelx.io' 19 | self.results: dict[str, Any] = {} 20 | self.info: tuple[list[str], list[str], list[str]] = ([], [], []) 21 | self.limit: int = 10000 22 | self.proxy = False 23 | self.offset = 0 24 | 25 | async def do_search(self) -> None: 26 | try: 27 | headers = { 28 | 'x-key': self.key, 29 | 'User-Agent': f'{Core.get_user_agent()}-theHarvester', 30 | 'Content-Type': 'application/json', 31 | } 32 | data = { 33 | 'term': self.word, 34 | 'buckets': [], 35 | 'lookuplevel': 0, 36 | 'maxresults': self.limit, 37 | 'timeout': 5, 38 | 'datefrom': '', 39 | 'dateto': '', 40 | 'sort': 4, # Sort by date descending for faster relevant results 41 | 'media': 0, 42 | 'terminate': [], 43 | 'target': 0, 44 | } 45 | async with aiohttp.ClientSession() as session: 46 | async with session.post(f'{self.database}/phonebook/search', headers=headers, json=data) as total_resp: 47 | search_data = await total_resp.json() 48 | if not search_data['success']: 49 | print(f'Error: {search_data["message"]}') 50 | return 51 | phonebook_id = search_data['id'] 52 | 53 | await asyncio.sleep(2) # Reduced sleep time as 5s is excessive 54 | 55 | async with session.get( 56 | f'{self.database}/phonebook/search/result?id={phonebook_id}&limit={self.limit}&offset={self.offset}', 57 | headers=headers, 58 | ) as resp: 59 | self.results = await resp.json() 60 | 61 | except Exception as e: 62 | print(f'An exception has occurred in Intelx: {e}') 63 | 64 | async def process(self, proxy: bool = False): 65 | self.proxy = proxy 66 | await self.do_search() 67 | intelx_parser = intelxparser.Parser() 68 | self.info = await intelx_parser.parse_dictionaries(self.results) 69 | 70 | async def get_emails(self) -> list[str]: 71 | return self.info[0] 72 | 73 | async def get_interestingurls(self) -> tuple[list[str], list[str]]: 74 | urls = self.info[1] 75 | subdomains = [] 76 | 77 | for url in urls: 78 | try: 79 | parsed = urlparse(url) 80 | domain = parsed.netloc 81 | if domain.count('.') > 1 and self.word in domain: 82 | subdomains.append(domain) 83 | except Exception: 84 | continue 85 | 86 | return urls, list(set(subdomains)) 87 | -------------------------------------------------------------------------------- /theHarvester/discovery/leaklookup.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchLeakLookup: 8 | def __init__(self, word: str): 9 | self.word = word 10 | self.api_key = Core.leaklookup_key() 11 | self.base_url = 'https://leak-lookup.com/api' 12 | self.headers = {'Authorization': f'Bearer {self.api_key}', 'Content-Type': 'application/json'} 
13 | self.hosts = set() 14 | self.emails = set() 15 | self.leaks = [] 16 | self.passwords = set() 17 | self.sources = set() 18 | self.leak_dates = set() 19 | 20 | async def process(self, proxy: bool = False) -> None: 21 | """Search for leaked credentials associated with an email.""" 22 | try: 23 | if proxy: 24 | response = await AsyncFetcher.fetch( 25 | session=None, 26 | url=f'{self.base_url}/search?key={self.api_key}&type=email&query={self.word}', 27 | headers=self.headers, 28 | proxy=proxy, 29 | ) 30 | if response: 31 | self.leaks = response 32 | self._extract_data() 33 | else: 34 | async with aiohttp.ClientSession(headers=self.headers) as session: 35 | async with session.get(f'{self.base_url}/search?key={self.api_key}&type=email&query={self.word}') as response: 36 | if response.status == 200: 37 | self.leaks = await response.json() 38 | self._extract_data() 39 | elif response.status == 401: 40 | print('[!] Missing API key for Leak-Lookup.') 41 | raise MissingKey('Leak-Lookup') 42 | except Exception as e: 43 | print(f'Error in Leak-Lookup search: {e}') 44 | 45 | def _extract_data(self) -> None: 46 | """Extract and categorize leak information.""" 47 | for leak in self.leaks: 48 | if 'domain' in leak: 49 | self.hosts.add(leak['domain']) 50 | if 'email' in leak: 51 | self.emails.add(leak['email']) 52 | if 'password' in leak: 53 | self.passwords.add(leak['password']) 54 | if 'source' in leak: 55 | self.sources.add(leak['source']) 56 | if 'date' in leak: 57 | self.leak_dates.add(leak['date']) 58 | 59 | async def get_hostnames(self) -> set[str]: 60 | return self.hosts 61 | 62 | async def get_emails(self) -> set[str]: 63 | return self.emails 64 | 65 | async def get_leaks(self) -> list[dict]: 66 | return self.leaks 67 | 68 | async def get_passwords(self) -> set[str]: 69 | return self.passwords 70 | 71 | async def get_sources(self) -> set[str]: 72 | return self.sources 73 | 74 | async def get_leak_dates(self) -> set[str]: 75 | return self.leak_dates 76 | -------------------------------------------------------------------------------- /theHarvester/discovery/netlas.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchNetlas: 8 | def __init__(self, word, limit: int) -> None: 9 | self.word = word 10 | self.totalhosts: list = [] 11 | self.totalips: list = [] 12 | self.key = Core.netlas_key() 13 | self.limit = limit 14 | if self.key is None: 15 | raise MissingKey('netlas') 16 | self.proxy = False 17 | 18 | async def do_count(self) -> None: 19 | """Counts the total number of subdomains 20 | 21 | :return: None 22 | """ 23 | api = f'https://app.netlas.io/api/domains_count/?q=*.{self.word}' 24 | headers = {'X-API-Key': self.key} 25 | response = await AsyncFetcher.fetch_all([api], json=True, headers=headers, proxy=self.proxy) 26 | amount_size = response[0]['count'] 27 | self.limit = amount_size if amount_size < self.limit else self.limit 28 | 29 | async def do_search(self) -> None: 30 | """Download domains for query 'q' size of 'limit' 31 | 32 | :return: None 33 | """ 34 | user_agent = Core.get_user_agent() 35 | url = 'https://app.netlas.io/api/domains/download/' 36 | 37 | payload = { 38 | 'q': f'*.{self.word}', 39 | 'fields': json.dumps(['domain']), # Convert the list to a JSON string 40 | 'source_type': 'include', 41 | 'size': str(self.limit), # Convert integer to string 42 | 'type': 'json', 43 | 'indice': 
json.dumps([0]), # Convert the list to a JSON string 44 | } 45 | 46 | headers = { 47 | 'X-API-Key': self.key, 48 | 'User-Agent': user_agent, 49 | } 50 | response = await AsyncFetcher.post_fetch(url, data=payload, headers=headers, proxy=self.proxy) 51 | resp_json = json.loads(response) 52 | 53 | for data in resp_json: 54 | domain = data['data']['domain'] 55 | self.totalhosts.append(domain) 56 | 57 | async def get_hostnames(self) -> list: 58 | return self.totalhosts 59 | 60 | async def process(self, proxy: bool = False) -> None: 61 | self.proxy = proxy 62 | await self.do_count() 63 | await self.do_search() 64 | -------------------------------------------------------------------------------- /theHarvester/discovery/onyphe.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | # from theHarvester.parsers import myparser 7 | 8 | 9 | class SearchOnyphe: 10 | def __init__(self, word) -> None: 11 | self.word = word 12 | self.response = '' 13 | self.totalhosts: set = set() 14 | self.totalips: set = set() 15 | self.asns: set = set() 16 | self.key = Core.onyphe_key() 17 | if self.key is None: 18 | raise MissingKey('onyphe') 19 | self.proxy = False 20 | 21 | async def do_search(self) -> None: 22 | # https://www.onyphe.io/docs/apis/search 23 | # https://www.onyphe.io/search?q=domain%3Acharter.com&captcharesponse=j5cGT 24 | # base_url = f'https://www.onyphe.io/api/v2/search/?q=domain:domain:{self.word}' 25 | base_url = f'https://www.onyphe.io/api/v2/search/?q=domain:{self.word}' 26 | headers = { 27 | 'User-Agent': Core.get_user_agent(), 28 | 'Content-Type': 'application/json', 29 | 'Authorization': f'bearer {self.key}', 30 | } 31 | response = await AsyncFetcher.fetch_all([base_url], json=True, headers=headers, proxy=self.proxy) 32 | self.response = response[0] 33 | await self.parse_onyphe_resp_json() 34 | 35 | async def parse_onyphe_resp_json(self): 36 | if isinstance(self.response, list): 37 | self.response = self.response[0] 38 | if not isinstance(self.response, dict): 39 | raise Exception(f'An exception has occurred {self.response} is not a dict') 40 | if self.response['text'] == 'Success': 41 | if 'results' in self.response.keys(): 42 | for result in self.response['results']: 43 | try: 44 | if 'alternativeip' in result.keys(): 45 | self.totalips.update({altip for altip in result['alternativeip']}) 46 | if 'url' in result.keys() and isinstance(result['url'], list): 47 | self.totalhosts.update( 48 | urlparse(url).netloc for url in result['url'] if urlparse(url).netloc.endswith(self.word) 49 | ) 50 | self.asns.add(result['asn']) 51 | self.asns.add(result['geolocus']['asn']) 52 | self.totalips.add(result['geolocus']['subnet']) 53 | self.totalips.add(result['ip']) 54 | self.totalips.add(result['subnet']) 55 | # Shouldn't be needed as API autoparses urls from html raw data 56 | # rawres = myparser.Parser(result['data'], self.word) 57 | # if await rawres.hostnames(): 58 | # self.totalhosts.update(set(await rawres.hostnames())) 59 | for subdomain_key in [ 60 | 'domain', 61 | 'hostname', 62 | 'subdomains', 63 | 'subject', 64 | 'reverse', 65 | 'geolocus', 66 | ]: 67 | if subdomain_key in result.keys(): 68 | if subdomain_key == 'subject': 69 | self.totalhosts.update( 70 | {domain for domain in result[subdomain_key]['altname'] if domain.endswith(self.word)} 71 | ) 72 | elif subdomain_key == 'geolocus': 73 | 
self.totalhosts.update( 74 | {domain for domain in result[subdomain_key]['domain'] if domain.endswith(self.word)} 75 | ) 76 | else: 77 | self.totalhosts.update( 78 | {domain for domain in result[subdomain_key] if domain.endswith(self.word)} 79 | ) 80 | except Exception: 81 | continue 82 | else: 83 | print(f'Onhyphe API query did not succeed dumping current response: {self.response}') 84 | 85 | async def get_asns(self) -> set: 86 | return self.asns 87 | 88 | async def get_hostnames(self) -> set: 89 | return self.totalhosts 90 | 91 | async def get_ips(self) -> set: 92 | return self.totalips 93 | 94 | async def process(self, proxy: bool = False) -> None: 95 | self.proxy = proxy 96 | await self.do_search() 97 | -------------------------------------------------------------------------------- /theHarvester/discovery/otxsearch.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from theHarvester.lib.core import AsyncFetcher 4 | 5 | 6 | class SearchOtx: 7 | def __init__(self, word) -> None: 8 | self.word = word 9 | self.totalhosts: set = set() 10 | self.totalips: set = set() 11 | self.proxy = False 12 | 13 | async def do_search(self) -> None: 14 | url = f'https://otx.alienvault.com/api/v1/indicators/domain/{self.word}/passive_dns' 15 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 16 | responses = response[0] 17 | dct = responses 18 | self.totalhosts = {host['hostname'] for host in dct['passive_dns']} 19 | # filter out ips that are just called NXDOMAIN 20 | self.totalips = { 21 | ip['address'] for ip in dct['passive_dns'] if re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', ip['address']) 22 | } 23 | 24 | async def get_hostnames(self) -> set: 25 | return self.totalhosts 26 | 27 | async def get_ips(self) -> set: 28 | return self.totalips 29 | 30 | async def process(self, proxy: bool = False) -> None: 31 | self.proxy = proxy 32 | await self.do_search() 33 | -------------------------------------------------------------------------------- /theHarvester/discovery/pentesttools.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import ujson 4 | 5 | from theHarvester.discovery.constants import MissingKey 6 | from theHarvester.lib.core import AsyncFetcher, Core 7 | 8 | 9 | class SearchPentestTools: 10 | def __init__(self, word) -> None: 11 | # Script is largely based off https://pentest-tools.com/public/api_client.py.txt 12 | self.word = word 13 | self.key = Core.pentest_tools_key() 14 | if self.key is None: 15 | raise MissingKey('PentestTools') 16 | self.total_results: list = [] 17 | self.api = f'https://pentest-tools.com/api?key={self.key}' 18 | self.proxy = False 19 | 20 | async def poll(self, scan_id): 21 | while True: 22 | time.sleep(3) 23 | # Get the status of our scan 24 | scan_status_data = {'op': 'get_scan_status', 'scan_id': scan_id} 25 | responses = await AsyncFetcher.post_fetch(url=self.api, data=ujson.dumps(scan_status_data), proxy=self.proxy) 26 | res_json = ujson.loads(responses.strip()) 27 | if res_json['op_status'] == 'success': 28 | if res_json['scan_status'] != 'waiting' and res_json['scan_status'] != 'running': 29 | getoutput_data = { 30 | 'op': 'get_output', 31 | 'scan_id': scan_id, 32 | 'output_format': 'json', 33 | } 34 | responses = await AsyncFetcher.post_fetch(url=self.api, data=ujson.dumps(getoutput_data), proxy=self.proxy) 35 | 36 | res_json = ujson.loads(responses.strip('\n')) 37 | self.total_results = await self.parse_json(res_json) 38 
| break 39 | else: 40 | print(f'Operation get_scan_status failed because: {res_json["error"]}. {res_json["details"]}') 41 | break 42 | 43 | @staticmethod 44 | async def parse_json(json_results): 45 | status = json_results['op_status'] 46 | if status == 'success': 47 | scan_tests = json_results['scan_output']['output_json'] 48 | output_data = scan_tests[0]['output_data'] 49 | host_to_ip = [f'{subdomain[0]}:{subdomain[1]}' for subdomain in output_data if len(subdomain) > 0] 50 | return host_to_ip 51 | return [] 52 | 53 | async def get_hostnames(self) -> list: 54 | return self.total_results 55 | 56 | async def do_search(self) -> None: 57 | subdomain_payload = { 58 | 'op': 'start_scan', 59 | 'tool_id': 20, 60 | 'tool_params': { 61 | 'target': f'{self.word}', 62 | 'web_details': 'off', 63 | 'do_smart_search': 'off', 64 | }, 65 | } 66 | responses = await AsyncFetcher.post_fetch(url=self.api, data=ujson.dumps(subdomain_payload), proxy=self.proxy) 67 | res_json = ujson.loads(responses.strip()) 68 | if res_json['op_status'] == 'success': 69 | scan_id = res_json['scan_id'] 70 | await self.poll(scan_id) 71 | 72 | async def process(self, proxy: bool = False) -> None: 73 | self.proxy = proxy 74 | await self.do_search() # Only need to do it once. 75 | -------------------------------------------------------------------------------- /theHarvester/discovery/projectdiscovery.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import MissingKey 2 | from theHarvester.lib.core import AsyncFetcher, Core 3 | 4 | 5 | class SearchDiscovery: 6 | def __init__(self, word) -> None: 7 | self.word = word 8 | self.key = Core.projectdiscovery_key() 9 | if self.key is None: 10 | raise MissingKey('ProjectDiscovery') 11 | self.total_results = None 12 | self.proxy = False 13 | 14 | async def do_search(self): 15 | url = f'https://dns.projectdiscovery.io/dns/{self.word}/subdomains' 16 | response = await AsyncFetcher.fetch_all( 17 | [url], 18 | json=True, 19 | headers={'User-Agent': Core.get_user_agent(), 'Authorization': self.key}, 20 | proxy=self.proxy, 21 | ) 22 | self.total_results = [f'{domains}.{self.word}' for domains in response[0]['subdomains']] 23 | 24 | async def get_hostnames(self): 25 | return self.total_results 26 | 27 | async def process(self, proxy: bool = False) -> None: 28 | self.proxy = proxy 29 | await self.do_search() 30 | -------------------------------------------------------------------------------- /theHarvester/discovery/rapiddns.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | from theHarvester.lib.core import AsyncFetcher, Core 4 | 5 | 6 | class SearchRapidDns: 7 | def __init__(self, word) -> None: 8 | self.word = word 9 | self.total_results: list = [] 10 | self.proxy = False 11 | 12 | async def do_search(self): 13 | try: 14 | headers = {'User-agent': Core.get_user_agent()} 15 | # TODO see if it's worth adding sameip searches 16 | # f'{self.hostname}/sameip/{self.word}?full=1#result' 17 | urls = [f'https://rapiddns.io/subdomain/{self.word}?full=1#result'] 18 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 19 | if len(responses[0]) <= 1: 20 | return self.total_results 21 | soup = BeautifulSoup(responses[0], 'html.parser') 22 | rows = soup.find('table').find('tbody').find_all('tr') 23 | if rows: 24 | # Validation check 25 | for row in rows: 26 | cells = row.find_all('td') 27 | if len(cells) > 0: 28 | # sanity check 29 
| subdomain = str(cells[0].get_text()) 30 | if cells[-1].get_text() == 'CNAME': 31 | self.total_results.append(f'{subdomain}') 32 | else: 33 | self.total_results.append(f'{subdomain}:{str(cells[1].get_text()).strip()}') 34 | self.total_results = list({domain for domain in self.total_results}) 35 | except Exception as e: 36 | print(f'An exception has occurred: {e!s}') 37 | 38 | async def process(self, proxy: bool = False) -> None: 39 | self.proxy = proxy 40 | await self.do_search() 41 | 42 | async def get_hostnames(self): 43 | return self.total_results 44 | -------------------------------------------------------------------------------- /theHarvester/discovery/rocketreach.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from theHarvester.discovery.constants import MissingKey, get_delay 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchRocketReach: 8 | def __init__(self, word, limit) -> None: 9 | self.ips: set = set() 10 | self.word = word 11 | self.key = Core.rocketreach_key() 12 | if self.key is None: 13 | raise MissingKey('RocketReach') 14 | self.hosts: set = set() 15 | self.proxy = False 16 | self.baseurl = 'https://rocketreach.co/api/v2/person/search' 17 | self.links: set = set() 18 | self.emails: set = set() 19 | self.limit = limit 20 | 21 | async def do_search(self) -> None: 22 | try: 23 | headers = { 24 | 'Api-Key': self.key, 25 | 'Content-Type': 'application/json', 26 | 'User-Agent': Core.get_user_agent(), 27 | } 28 | 29 | next_page = 1 # track pagination 30 | for count in range(1, self.limit): 31 | data = f'{{"query":{{"current_employer_domain": ["{self.word}"]}}, "page": {next_page}, "page_size": 100}}' 32 | result = await AsyncFetcher.post_fetch(self.baseurl, headers=headers, data=data, json=True) 33 | if 'detail' in result.keys() and 'error' in result.keys() and 'Subscribe to a plan to access' in result['detail']: 34 | # No more results can be fetched 35 | break 36 | if 'detail' in result.keys() and 'Request was throttled.' 
in result['detail']: 37 | # Rate limit has been triggered need to sleep extra 38 | print( 39 | f'RocketReach requests have been throttled; ' 40 | f'{result["detail"].split(" ", 3)[-1].replace("available", "availability")}' 41 | ) 42 | break 43 | if 'profiles' in dict(result).keys(): 44 | if len(result['profiles']) == 0: 45 | break 46 | for profile in result['profiles']: 47 | if 'linkedin_url' in dict(profile).keys(): 48 | self.links.add(profile['linkedin_url']) 49 | if 'emails' in dict(profile).keys() and profile['emails']: 50 | for email in profile['emails']: 51 | if email.get('email'): 52 | self.emails.add(email['email']) 53 | if 'pagination' in dict(result).keys(): 54 | next_page = result['pagination']['page'] + 1 55 | if next_page > result['pagination']['total_pages']: 56 | break 57 | 58 | await asyncio.sleep(get_delay() + 5) 59 | 60 | except Exception as e: 61 | print(f'An exception has occurred rocketreach: {e}') 62 | 63 | async def get_links(self): 64 | return self.links 65 | 66 | async def get_emails(self): 67 | return self.emails 68 | 69 | async def process(self, proxy: bool = False) -> None: 70 | self.proxy = proxy 71 | await self.do_search() 72 | -------------------------------------------------------------------------------- /theHarvester/discovery/search_dehashed.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import requests 4 | 5 | from theHarvester.discovery.constants import MissingKey 6 | from theHarvester.lib.core import Core 7 | 8 | 9 | class SearchDehashed: 10 | def __init__(self, word) -> None: 11 | self.word = word 12 | self.key = Core.dehashed_key() 13 | if self.key is None: 14 | raise MissingKey('Dehashed') 15 | 16 | self.api = 'https://api.dehashed.com/v2/search' 17 | self.headers = {'Content-Type': 'application/json', 'Dehashed-Api-Key': self.key} 18 | self.results = '' 19 | self.data: list[dict] = [] 20 | 21 | async def do_search(self) -> None: 22 | print(f'\t[+] Performing Dehashed search for: {self.word}') 23 | page = 1 24 | size = 100 25 | while True: 26 | payload = {'query': self.word, 'page': page, 'size': size, 'wildcard': False, 'regex': False, 'de_dupe': False} 27 | 28 | try: 29 | response = requests.post(self.api, json=payload, headers=self.headers) 30 | if response.status_code == 401: 31 | raise Exception('Unauthorized. Check Dehashed API key.') 32 | if response.status_code == 403: 33 | raise Exception('Forbidden. API key is not allowed.') 34 | 35 | data = response.json() 36 | entries = data.get('entries', []) 37 | if not entries: 38 | break 39 | 40 | self.data.extend(entries) 41 | print(f'\t[+] Page {page} - Retrieved {len(entries)} entries.') 42 | 43 | if len(entries) < size: 44 | break 45 | page += 1 46 | time.sleep(0.5) 47 | except Exception as e: 48 | print(f'\t[!] Dehashed error: {e}') 49 | break 50 | 51 | async def print_csv_results(self) -> None: 52 | if not self.data: 53 | print('\t[!] 
No data found.') 54 | return 55 | 56 | print('\n[Dehashed Results]') 57 | print('Email,Username,Password,Phone,IP,Source') 58 | 59 | for entry in self.data: 60 | email = entry.get('email', '') 61 | username = entry.get('username', '') 62 | password = entry.get('password', '') 63 | phone = entry.get('phone', '') 64 | ip = entry.get('ip_address', '') 65 | source = entry.get('database_name', '') 66 | 67 | csv_line = f'"{email}","{username}","{password}","{phone}","{ip}","{source}"' 68 | print(csv_line) 69 | 70 | async def process(self, proxy: bool = False) -> None: 71 | await self.do_search() 72 | await self.print_csv_results() 73 | 74 | async def get_emails(self) -> set: 75 | emails = set() 76 | for entry in self.data: 77 | if entry.get('email'): 78 | emails.add(entry['email']) 79 | return emails 80 | 81 | async def get_hostnames(self) -> set: 82 | return set() 83 | 84 | async def get_ips(self) -> set: 85 | ips = set() 86 | for entry in self.data: 87 | if entry.get('ip_address'): 88 | ips.add(entry['ip_address']) 89 | return ips 90 | -------------------------------------------------------------------------------- /theHarvester/discovery/search_dnsdumpster.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from theHarvester.discovery.constants import MissingKey 3 | from theHarvester.lib.core import AsyncFetcher, Core 4 | 5 | 6 | class SearchDNSDumpster: 7 | def __init__(self, word) -> None: 8 | self.word = word 9 | self.key = Core.dnsdumpster_key() 10 | if not self.key: 11 | raise MissingKey('DNSDumpster') 12 | self.hosts: set = set() 13 | self.ips: set = set() 14 | self.base_url = 'https://api.dnsdumpster.com' 15 | 16 | async def do_search(self) -> None: 17 | try: 18 | url = f'{self.base_url}/domain/{self.word}' 19 | headers = {'User-Agent': 'Mozilla/5.0 (theHarvester)', 'X-API-Key': self.key} 20 | 21 | response = await AsyncFetcher.fetch_all([url], headers=headers, json=True) 22 | data = response[0] 23 | 24 | if isinstance(data, dict): 25 | # Process A records 26 | for record in data.get('a', []): 27 | host = record['host'] 28 | if host.endswith(self.word): 29 | self.hosts.add(host) 30 | for ip_info in record['ips']: 31 | self.ips.add(ip_info['ip']) 32 | 33 | # Process NS records 34 | for record in data.get('ns', []): 35 | host = record['host'] 36 | if host.endswith(self.word): 37 | self.hosts.add(host) 38 | for ip_info in record['ips']: 39 | self.ips.add(ip_info['ip']) 40 | 41 | except Exception as e: 42 | print(f'Error occurred in DNSDumpster search: {e}') 43 | 44 | async def process(self, proxy: bool = False) -> None: 45 | await self.do_search() 46 | 47 | async def get_hostnames(self) -> set: 48 | return self.hosts 49 | 50 | async def get_ips(self) -> set: 51 | return self.ips 52 | -------------------------------------------------------------------------------- /theHarvester/discovery/searchhunterhow.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from datetime import datetime 3 | 4 | from dateutil.relativedelta import relativedelta 5 | 6 | from theHarvester.discovery.constants import MissingKey 7 | from theHarvester.lib.core import AsyncFetcher, Core 8 | 9 | 10 | class SearchHunterHow: 11 | def __init__(self, word) -> None: 12 | self.word = word 13 | self.total_hostnames: set = set() 14 | self.key = Core.hunterhow_key() 15 | if self.key is None: 16 | raise MissingKey('hunterhow') 17 | self.proxy = False 18 | 19 | async def do_search(self) -> None: 20 | # 
https://hunter.how/search-api 21 | query = f'domain.suffix="{self.word}"' 22 | # second_query = f'domain="{self.word}"' 23 | encoded_query = base64.urlsafe_b64encode(query.encode('utf-8')).decode('ascii') 24 | page = 1 25 | page_size = 100 # can be either: 10,20,50,100) 26 | # The interval between the start time and the end time cannot exceed one year 27 | # Can not exceed one year, but years=1 does not work due to their backend, 364 will suffice 28 | today = datetime.today() 29 | one_year_ago = today - relativedelta(days=364) 30 | start_time = one_year_ago.strftime('%Y-%m-%d') 31 | end_time = today.strftime('%Y-%m-%d') 32 | # two_years_ago = one_year_ago - relativedelta(days=364) 33 | # start_time = two_years_ago.strftime('%Y-%m-%d') 34 | # end_time = one_year_ago.strftime('%Y-%m-%d') 35 | url = f'https://api.hunter.how/search?api-key={self.key}&query={encoded_query}&page={page}&page_size={page_size}&start_time={start_time}&end_time={end_time}' 36 | response = await AsyncFetcher.fetch_all( 37 | [url], 38 | json=True, 39 | headers={'User-Agent': Core.get_user_agent(), 'x-api-key': f'{self.key}'}, 40 | proxy=self.proxy, 41 | ) 42 | dct = response[0] 43 | # print(f'json response: ') 44 | # print(dct) 45 | if 'code' in dct.keys(): 46 | if dct['code'] == 40001: 47 | print(f'Code 40001 indicates for searchhunterhow: {dct["message"]}') 48 | return 49 | # total = dct['data']['total'] 50 | # TODO determine if total is ever 100 how to get more subdomains? 51 | for sub in dct['data']['list']: 52 | self.total_hostnames.add(sub['domain']) 53 | 54 | async def get_hostnames(self) -> set: 55 | return self.total_hostnames 56 | 57 | async def process(self, proxy: bool = False) -> None: 58 | self.proxy = proxy 59 | await self.do_search() 60 | -------------------------------------------------------------------------------- /theHarvester/discovery/securityscorecard.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchSecurityScorecard: 8 | def __init__(self, word: str): 9 | self.word = word 10 | self.api_key = Core.securityscorecard_key() 11 | self.base_url = 'https://api.securityscorecard.io' 12 | self.headers = {'Authorization': f'Token {self.api_key}', 'Content-Type': 'application/json'} 13 | self.hosts = set() 14 | self.score = 0 15 | self.grades = {} 16 | self.issues = [] 17 | self.recommendations = [] 18 | self.history = [] 19 | 20 | async def process(self, proxy: bool = False) -> None: 21 | """Get security scorecard information for a domain.""" 22 | try: 23 | if proxy: 24 | response = await AsyncFetcher.fetch( 25 | session=None, url=f'{self.base_url}/companies/{self.word}', headers=self.headers, proxy=proxy 26 | ) 27 | if response: 28 | self._extract_data(response) 29 | else: 30 | async with aiohttp.ClientSession(headers=self.headers) as session: 31 | async with session.get(f'{self.base_url}/companies/{self.word}') as response: 32 | if response.status == 200: 33 | data = await response.json() 34 | self._extract_data(data) 35 | elif response.status == 401: 36 | print('[!] 
Missing API key for SecurityScorecard.') 37 | raise MissingKey('SecurityScorecard') 38 | except Exception as e: 39 | print(f'Error in SecurityScorecard search: {e}') 40 | 41 | def _extract_data(self, data: dict) -> None: 42 | """Extract and categorize security scorecard information.""" 43 | if 'grade' in data: 44 | self.score = data.get('grade', 0) 45 | 46 | if 'factor_grades' in data: 47 | self.grades = data['factor_grades'] 48 | 49 | if 'issues' in data: 50 | self.issues = data['issues'] 51 | 52 | if 'recommendations' in data: 53 | self.recommendations = data['recommendations'] 54 | 55 | if 'history' in data: 56 | self.history = data['history'] 57 | 58 | if 'domains' in data: 59 | self.hosts.update(data['domains']) 60 | 61 | async def get_hostnames(self) -> set[str]: 62 | return self.hosts 63 | 64 | async def get_score(self) -> int: 65 | return self.score 66 | 67 | async def get_grades(self) -> dict: 68 | return self.grades 69 | 70 | async def get_issues(self) -> list[dict]: 71 | return self.issues 72 | 73 | async def get_recommendations(self) -> list[dict]: 74 | return self.recommendations 75 | 76 | async def get_history(self) -> list[dict]: 77 | return self.history 78 | -------------------------------------------------------------------------------- /theHarvester/discovery/securitytrailssearch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | from theHarvester.parsers import securitytrailsparser 6 | 7 | 8 | class SearchSecuritytrail: 9 | def __init__(self, word) -> None: 10 | self.word = word 11 | self.key = Core.security_trails_key() 12 | if self.key is None: 13 | raise MissingKey('Securitytrail') 14 | self.results = '' 15 | self.totalresults = '' 16 | self.api = 'https://api.securitytrails.com/v1/' 17 | self.info: tuple[set, set] = (set(), set()) 18 | self.proxy = False 19 | 20 | async def authenticate(self) -> None: 21 | # Method to authenticate API key before sending requests. 22 | headers = {'APIKEY': self.key} 23 | url = f'{self.api}ping' 24 | auth_responses = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy) 25 | auth_responses = auth_responses[0] 26 | if 'False' in auth_responses or 'Invalid authentication' in auth_responses: 27 | print('\tKey could not be authenticated exiting program.') 28 | await asyncio.sleep(5) 29 | 30 | async def do_search(self) -> None: 31 | # https://api.securitytrails.com/v1/domain/domain.com 32 | url = f'{self.api}domain/{self.word}' 33 | headers = {'APIKEY': self.key} 34 | response = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy) 35 | await asyncio.sleep(5) # Not random delay because 2 seconds is required due to rate limit. 36 | self.results = response[0] 37 | self.totalresults += self.results 38 | url += '/subdomains' # Get subdomains now. 39 | subdomain_response = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy) 40 | await asyncio.sleep(5) 41 | self.results = subdomain_response[0] 42 | self.totalresults += self.results 43 | 44 | async def process(self, proxy: bool = False) -> None: 45 | self.proxy = proxy 46 | await self.authenticate() 47 | await self.do_search() 48 | parser = securitytrailsparser.Parser(word=self.word, text=self.totalresults) 49 | self.info = await parser.parse_text() 50 | # Create parser and set self.info to tuple returned from parsing text. 
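# Note on the line above: parse_text() fills self.info, a tuple[set, set] where index 0 carries the IP addresses and index 1 the hostnames, which is exactly what get_ips() and get_hostnames() below return as self.info[0] and self.info[1].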
51 | print('\tDone Searching Results') 52 | 53 | async def get_ips(self) -> set: 54 | return self.info[0] 55 | 56 | async def get_hostnames(self) -> set: 57 | return self.info[1] 58 | -------------------------------------------------------------------------------- /theHarvester/discovery/shodansearch.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | from shodan import Shodan, exception 4 | 5 | from theHarvester.discovery.constants import MissingKey 6 | from theHarvester.lib.core import Core 7 | 8 | 9 | class SearchShodan: 10 | def __init__(self) -> None: 11 | self.key = Core.shodan_key() 12 | if self.key is None: 13 | raise MissingKey('Shodan') 14 | self.api = Shodan(self.key) 15 | self.hostdatarow: list = [] 16 | self.tracker: OrderedDict = OrderedDict() 17 | 18 | async def search_ip(self, ip) -> OrderedDict: 19 | try: 20 | ipaddress = ip 21 | results = self.api.host(ipaddress) 22 | asn = '' 23 | domains: list = list() 24 | hostnames: list = list() 25 | ip_str = '' 26 | isp = '' 27 | org = '' 28 | ports: list = list() 29 | title = '' 30 | server = '' 31 | product = '' 32 | technologies: list = list() 33 | 34 | data_first_dict = dict(results['data'][0]) 35 | 36 | if 'ip_str' in data_first_dict: 37 | ip_str += data_first_dict['ip_str'] 38 | 39 | if 'http' in data_first_dict: 40 | http_results_dict = dict(data_first_dict['http']) 41 | if 'title' in http_results_dict: 42 | title_val = str(http_results_dict['title']).strip() 43 | if title_val != 'None': 44 | title += title_val 45 | if 'components' in http_results_dict: 46 | for key in http_results_dict['components'].keys(): 47 | technologies.append(key) 48 | if 'server' in http_results_dict: 49 | server_val = str(http_results_dict['server']).strip() 50 | if server_val != 'None': 51 | server += server_val 52 | 53 | for key, value in results.items(): 54 | if key == 'asn': 55 | asn += value 56 | if key == 'domains': 57 | value = list(value) 58 | value.sort() 59 | domains.extend(value) 60 | if key == 'hostnames': 61 | value = [host.strip() for host in list(value)] 62 | value.sort() 63 | hostnames.extend(value) 64 | if key == 'isp': 65 | isp += value 66 | if key == 'org': 67 | org += str(value) 68 | if key == 'ports': 69 | value = list(value) 70 | value.sort() 71 | ports.extend(value) 72 | if key == 'product': 73 | product += value 74 | 75 | technologies = list(set(technologies)) 76 | 77 | self.tracker[ip] = { 78 | 'asn': asn.strip(), 79 | 'domains': domains, 80 | 'hostnames': hostnames, 81 | 'ip_str': ip_str.strip(), 82 | 'isp': isp.strip(), 83 | 'org': org.strip(), 84 | 'ports': ports, 85 | 'product': product.strip(), 86 | 'server': server.strip(), 87 | 'technologies': technologies, 88 | 'title': title.strip(), 89 | } 90 | 91 | return self.tracker 92 | except exception.APIError: 93 | print(f'{ip}: Not in Shodan') 94 | self.tracker[ip] = 'Not in Shodan' 95 | except Exception as e: 96 | # print(f'Error occurred in the Shodan IP search module: {e}') 97 | self.tracker[ip] = f'Error occurred in the Shodan IP search module: {e}' 98 | finally: 99 | return self.tracker 100 | -------------------------------------------------------------------------------- /theHarvester/discovery/sitedossier.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from theHarvester.discovery.constants import get_delay 6 | from theHarvester.lib.core import AsyncFetcher, Core 7 | 8 | 9 | class 
SearchSitedossier: 10 | def __init__(self, word): 11 | self.word = word 12 | self.totalhosts = set() 13 | self.server = 'www.sitedossier.com' 14 | self.proxy = False 15 | 16 | async def do_search(self): 17 | # 2023 but this site doesn't support https... 18 | # This site seems to yield a lot of results but is a bit annoying to scrape 19 | # Hence the need for delays after each request to get the most results 20 | # Feel free to tweak the delays as needed 21 | url = f'http://{self.server}/parentdomain/{self.word}' 22 | headers = {'User-Agent': Core.get_user_agent()} 23 | response = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy) 24 | base_response = response[0] 25 | soup = BeautifulSoup(base_response, 'html.parser') 26 | # iter_counter = 1 27 | # iterations_needed = total_number // 100 28 | # iterations_needed += 1 29 | flagged_counter = 0 30 | stop_conditions = ['End of list.', 'No data currently available.'] 31 | bot_string = ( 32 | 'Our web servers have detected unusual or excessive requests ' 33 | 'from your computer or network. Please enter the unique "word"' 34 | ' below to confirm that you are a human interactively using this site.' 35 | ) 36 | if ( 37 | stop_conditions[0] not in base_response and stop_conditions[1] not in base_response 38 | ) and bot_string not in base_response: 39 | total_number = soup.find('i') 40 | total_number = int(total_number.text.strip().split(' ')[-1].replace(',', '')) 41 | hrefs = soup.find_all('a', href=True) 42 | for a in hrefs: 43 | unparsed = a['href'] 44 | if '/site/' in unparsed: 45 | subdomain = str(unparsed.split('/')[-1]).lower() 46 | self.totalhosts.add(subdomain) 47 | await asyncio.sleep(get_delay() + 15 + get_delay()) 48 | for i in range(101, total_number, 100): 49 | headers = {'User-Agent': Core.get_user_agent()} 50 | iter_url = f'http://{self.server}/parentdomain/{self.word}/{i}' 51 | print(f'My current iter_url: {iter_url}') 52 | response = await AsyncFetcher.fetch_all([iter_url], headers=headers, proxy=self.proxy) 53 | response = response[0] 54 | if stop_conditions[0] in response or stop_conditions[1] in response or flagged_counter >= 3: 55 | break 56 | if bot_string in response: 57 | new_sleep_time = get_delay() * 30 58 | print(f'Triggered a captcha for sitedossier sleeping for: {new_sleep_time} seconds') 59 | flagged_counter += 1 60 | await asyncio.sleep(new_sleep_time) 61 | response = await AsyncFetcher.fetch_all( 62 | [iter_url], 63 | headers={'User-Agent': Core.get_user_agent()}, 64 | proxy=self.proxy, 65 | ) 66 | response = response[0] 67 | if bot_string in response: 68 | new_sleep_time = get_delay() * 30 * get_delay() 69 | print( 70 | f'Still triggering a captcha, sleeping longer for: {new_sleep_time}' 71 | f' and skipping this batch: {iter_url}' 72 | ) 73 | await asyncio.sleep(new_sleep_time) 74 | flagged_counter += 1 75 | if flagged_counter >= 3: 76 | break 77 | soup = BeautifulSoup(response, 'html.parser') 78 | hrefs = soup.find_all('a', href=True) 79 | for a in hrefs: 80 | unparsed = a['href'] 81 | if '/site/' in unparsed: 82 | subdomain = str(unparsed.split('/')[-1]).lower() 83 | self.totalhosts.add(subdomain) 84 | await asyncio.sleep(get_delay() + 15 + get_delay()) 85 | print(f'In total found: {len(self.totalhosts)}') 86 | print(self.totalhosts) 87 | else: 88 | print('Sitedossier module has triggered a captcha on first iteration, no results can be found.') 89 | print('Change IPs, manually solve the captcha, or wait before rerunning Sitedossier module') 90 | 91 | async def get_hostnames(self): 92 | return 
self.totalhosts 93 | 94 | async def process(self, proxy: bool = False) -> None: 95 | self.proxy = proxy 96 | await self.do_search() 97 | -------------------------------------------------------------------------------- /theHarvester/discovery/subdomaincenter.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher, Core 2 | 3 | 4 | class SubdomainCenter: 5 | def __init__(self, word): 6 | self.word = word 7 | self.results = set() 8 | self.server = 'https://api.subdomain.center/?domain=' 9 | self.proxy = False 10 | 11 | async def do_search(self): 12 | headers = {'User-Agent': Core.get_user_agent()} 13 | try: 14 | current_url = f'{self.server}{self.word}' 15 | resp = await AsyncFetcher.fetch_all([current_url], headers=headers, proxy=self.proxy, json=True) 16 | self.results = resp[0] 17 | self.results = {sub[4:] if sub[:4] == 'www.' and sub[4:] else sub for sub in self.results} 18 | except Exception as e: 19 | print(f'An exception has occurred in SubdomainCenter on : {e}') 20 | 21 | async def get_hostnames(self): 22 | return self.results 23 | 24 | async def process(self, proxy=False): 25 | self.proxy = proxy 26 | await self.do_search() 27 | -------------------------------------------------------------------------------- /theHarvester/discovery/subdomainfinderc99.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | import ujson 4 | from bs4 import BeautifulSoup 5 | 6 | from theHarvester.discovery.constants import get_delay 7 | from theHarvester.lib.core import AsyncFetcher, Core 8 | from theHarvester.parsers import myparser 9 | 10 | 11 | class SearchSubdomainfinderc99: 12 | def __init__(self, word) -> None: 13 | self.word = word 14 | self.total_results: set = set() 15 | self.proxy = False 16 | # TODO add api support 17 | self.server = 'https://subdomainfinder.c99.nl/' 18 | self.totalresults = '' 19 | 20 | async def do_search(self) -> None: 21 | # Based on https://gist.github.com/th3gundy/bc83580cbe04031e9164362b33600962 22 | headers = {'User-Agent': Core.get_user_agent()} 23 | resp = await AsyncFetcher.fetch_all([self.server], headers=headers, proxy=self.proxy) 24 | data = await self.get_csrf_params(resp[0]) 25 | 26 | data['scan_subdomains'] = '' 27 | data['domain'] = self.word 28 | data['privatequery'] = 'on' 29 | await asyncio.sleep(get_delay()) 30 | second_resp = await AsyncFetcher.post_fetch(self.server, headers=headers, proxy=self.proxy, data=ujson.dumps(data)) 31 | 32 | # print(second_resp) 33 | self.totalresults += second_resp 34 | # y = await self.get_hostnames() 35 | # print(list(sorted(y))) 36 | # print(f'Found: {len(y)} subdomains') 37 | 38 | # regex = r"value='(https://subdomainfinder\.c99\.nl/scans/\d{4}-\d{2}-\d{2}/" + self.word + r")'" 39 | # match = re.search(regex, second_resp) 40 | # if match: 41 | # print(match.group(1)) 42 | 43 | async def get_hostnames(self): 44 | rawres = myparser.Parser(self.totalresults, self.word) 45 | return await rawres.hostnames() 46 | 47 | async def process(self, proxy: bool = False) -> None: 48 | self.proxy = proxy 49 | await self.do_search() 50 | 51 | @staticmethod 52 | async def get_csrf_params(data): 53 | csrf_params = {} 54 | html = BeautifulSoup(data, 'html.parser').find('div', {'class': 'input-group'}) 55 | for c in html.find_all('input'): 56 | try: 57 | csrf_params[c.get('name')] = c.get('value') 58 | except Exception: 59 | continue 60 | 61 | return csrf_params 62 | 
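The discovery modules above share a small asynchronous contract: instantiate the class with the target word, await process() (optionally with proxy=True), then collect results from get_hostnames() and, where a module provides them, get_ips() or get_emails(). The snippet below is a minimal standalone sketch of driving the two keyless modules shown above from a plain script; the example domain and the run_modules wrapper are illustrative additions, not part of theHarvester itself.

import asyncio

from theHarvester.discovery.subdomaincenter import SubdomainCenter
from theHarvester.discovery.subdomainfinderc99 import SearchSubdomainfinderc99


async def run_modules(word: str) -> set:
    # Each module performs its own fetching inside process(); results come back via the getters.
    hosts: set = set()
    for module in (SubdomainCenter(word), SearchSubdomainfinderc99(word)):
        await module.process(proxy=False)
        hosts.update(await module.get_hostnames())
    return hosts


if __name__ == '__main__':
    # 'example.com' is a placeholder target; substitute the domain being assessed.
    print(sorted(asyncio.run(run_modules('example.com'))))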
-------------------------------------------------------------------------------- /theHarvester/discovery/takeover.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import defaultdict 3 | from random import shuffle 4 | 5 | import ujson 6 | 7 | from theHarvester.lib.core import AsyncFetcher, Core 8 | 9 | 10 | class TakeOver: 11 | def __init__(self, hosts) -> None: 12 | # NOTE THIS MODULE IS ACTIVE RECON 13 | self.hosts = hosts 14 | self.proxy = False 15 | self.fingerprints: dict[str, str] = dict() 16 | # https://stackoverflow.com/questions/33080869/python-how-to-create-a-dict-of-dict-of-list-with-defaultdict 17 | self.results: defaultdict[str, list] = defaultdict() 18 | 19 | async def populate_fingerprints(self): 20 | # Thank you to https://github.com/EdOverflow/can-i-take-over-xyz for these fingerprints 21 | populate_url = 'https://raw.githubusercontent.com/EdOverflow/can-i-take-over-xyz/master/fingerprints.json' 22 | headers = {'User-Agent': Core.get_user_agent()} 23 | response = await AsyncFetcher.fetch_all([populate_url], headers=headers) 24 | try: 25 | resp = response[0] 26 | unparsed_json = ujson.loads(resp) 27 | for unparsed_fingerprint in unparsed_json: 28 | if unparsed_fingerprint['service'] in ['Smugsmug']: 29 | # Subdomain must be in format domain.smugsmug.com 30 | # This will never happen as subdomains are parsed and filtered to be in format of *.word.com 31 | continue 32 | if unparsed_fingerprint['status'] == 'Vulnerable' or unparsed_fingerprint['status'] == 'Edge case': 33 | self.fingerprints[unparsed_fingerprint['fingerprint']] = unparsed_fingerprint['service'] 34 | except Exception as e: 35 | print(f'An exception has occurred populating takeover fingerprints: {e}, defaulting to static list') 36 | self.fingerprints = { 37 | "'Trying to access your account?'": 'Campaign Monitor', 38 | '404 Not Found': 'Fly.io', 39 | '404 error unknown site!': 'Pantheon', 40 | 'Do you want to register *.wordpress.com?': 'Wordpress', 41 | 'Domain uses DO name serves with no records in DO.': 'Digital Ocean', 42 | "It looks like you may have taken a wrong turn somewhere. Don't worry...it happens to all of us.": 'LaunchRock', 43 | 'No Site For Domain': 'Kinsta', 44 | 'No settings were found for this company:': 'Help Scout', 45 | 'Project doesnt exist... yet!': 'Readme.io', 46 | 'Repository not found': 'Bitbucket', 47 | 'The feed has not been found.': 'Feedpress', 48 | 'No such app': 'Heroku', 49 | 'The specified bucket does not exist': 'AWS/S3', 50 | 'The thing you were looking for is no longer here, or never was': 'Ghost', 51 | "There isn't a Github Pages site here.": 'Github', 52 | 'This UserVoice subdomain is currently available!': 'UserVoice', 53 | "Uh oh. 
That page doesn't exist.": 'Intercom', 54 | "We could not find what you're looking for.": 'Help Juice', 55 | "Whatever you were looking for doesn't currently exist at this address": 'Tumblr', 56 | 'is not a registered InCloud YouTrack': 'JetBrains', 57 | 'page not found': 'Uptimerobot', 58 | 'project not found': 'Surge.sh', 59 | } 60 | 61 | async def check(self, url, resp) -> None: 62 | # Simple function that takes response and checks if any fingerprints exist 63 | # If a fingerprint exists figures out which one and prints it out 64 | regex = re.compile('(?=(' + '|'.join(map(re.escape, list(self.fingerprints.keys()))) + '))') 65 | # Sanitize fingerprints 66 | matches = re.findall(regex, resp) 67 | matches = list(set(matches)) 68 | for match in matches: 69 | print(f'\t\033[91m Takeover detected: {url}\033[1;32;40m') 70 | if match in self.fingerprints.keys(): 71 | # Validation check as to not error out 72 | service = self.fingerprints[match] 73 | print(f'\t\033[91m Type of takeover is: {service} with match: {match}\033[1;32;40m') 74 | self.results[url].append({match: service}) 75 | 76 | async def do_take(self) -> None: 77 | try: 78 | if len(self.hosts) > 0: 79 | # Returns a list of tuples in this format: (url, response) 80 | # Filter out responses whose responses are empty strings (indicates errored) 81 | https_hosts = [f'https://{host}' for host in self.hosts] 82 | http_hosts = [f'http://{host}' for host in self.hosts] 83 | all_hosts = https_hosts + http_hosts 84 | shuffle(all_hosts) 85 | resps: list = await AsyncFetcher.fetch_all(all_hosts, takeover=True, proxy=self.proxy) 86 | for url, resp in tuple(resp for resp in resps if len(resp[1]) >= 1): 87 | await self.check(url, resp) 88 | else: 89 | return 90 | except Exception as e: 91 | print(e) 92 | 93 | async def process(self, proxy: bool = False) -> None: 94 | self.proxy = proxy 95 | await self.do_take() 96 | 97 | async def get_takeover_results(self): 98 | return self.results 99 | -------------------------------------------------------------------------------- /theHarvester/discovery/threatminer.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher 2 | 3 | 4 | class SearchThreatminer: 5 | def __init__(self, word) -> None: 6 | self.word = word 7 | self.totalhosts: set = set() 8 | self.totalips: set = set() 9 | self.proxy = False 10 | 11 | async def do_search(self) -> None: 12 | url = f'https://api.threatminer.org/v2/domain.php?q={self.word}&rt=5' 13 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 14 | self.totalhosts = {host for host in response[0]['results']} 15 | second_url = f'https://api.threatminer.org/v2/domain.php?q={self.word}&rt=2' 16 | secondresp = await AsyncFetcher.fetch_all([second_url], json=True, proxy=self.proxy) 17 | try: 18 | self.totalips = {resp['ip'] for resp in secondresp[0]['results']} 19 | except TypeError: 20 | pass 21 | 22 | async def get_hostnames(self) -> set: 23 | return self.totalhosts 24 | 25 | async def get_ips(self) -> set: 26 | return self.totalips 27 | 28 | async def process(self, proxy: bool = False) -> None: 29 | self.proxy = proxy 30 | await self.do_search() 31 | -------------------------------------------------------------------------------- /theHarvester/discovery/tombasearch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, 
Core 5 | 6 | 7 | class SearchTomba: 8 | def __init__(self, word, limit, start) -> None: 9 | self.word = word 10 | self.limit = limit 11 | self.limit = 10 if limit > 10 else limit 12 | self.start = start 13 | self.key = Core.tomba_key() 14 | if self.key[0] is None or self.key[1] is None: 15 | raise MissingKey('Tomba Key and/or Secret') 16 | self.total_results = '' 17 | self.counter = start 18 | self.database = f'https://api.tomba.io/v1/domain-search?domain={self.word}&limit=10' 19 | self.proxy = False 20 | self.hostnames: list = [] 21 | self.emails: list = [] 22 | 23 | async def do_search(self) -> None: 24 | # First determine if a user account is not a free account, this call is free 25 | is_free = True 26 | headers = { 27 | 'User-Agent': Core.get_user_agent(), 28 | 'X-Tomba-Key': self.key[0], 29 | 'X-Tomba-Secret': self.key[1], 30 | } 31 | acc_info_url = 'https://api.tomba.io/v1/me' 32 | response = await AsyncFetcher.fetch_all([acc_info_url], headers=headers, json=True) 33 | is_free = ( 34 | is_free 35 | if 'name' in response[0]['data']['pricing'].keys() and response[0]['data']['pricing']['name'].lower() == 'free' 36 | else False 37 | ) 38 | # Extract the total number of requests that are available for an account 39 | 40 | total_requests_avail = ( 41 | response[0]['data']['requests']['domains']['available'] - response[0]['data']['requests']['domains']['used'] 42 | ) 43 | 44 | if is_free: 45 | response = await AsyncFetcher.fetch_all([self.database], headers=headers, proxy=self.proxy, json=True) 46 | self.emails, self.hostnames = await self.parse_resp(json_resp=response[0]) 47 | else: 48 | # Determine the total number of emails that are available 49 | # As the most emails you can get within one query are 100 50 | # This is only done where paid accounts are in play 51 | tomba_counter = f'https://api.tomba.io/v1/email-count?domain={self.word}' 52 | response = await AsyncFetcher.fetch_all([tomba_counter], headers=headers, proxy=self.proxy, json=True) 53 | total_number_reqs = response[0]['data']['total'] // 100 54 | # Parse out meta field within initial JSON response to determine the total number of results 55 | if total_requests_avail < total_number_reqs: 56 | print('WARNING: The account does not have enough requests to gather all the emails.') 57 | print(f'Total requests available: {total_requests_avail}, total requests needed to be made: {total_number_reqs}') 58 | print( 59 | 'RETURNING current results, If you still wish to run this module despite the current results, please comment out the "if request" line.' 
60 | ) 61 | return 62 | self.limit = 100 63 | # max number of emails you can get per request 64 | # increments of max number with page determining where to start 65 | # See docs for more details: https://developer.tomba.io/#domain-search 66 | for page in range(0, total_number_reqs + 1): 67 | req_url = f'https://api.tomba.io/v1/domain-search?domain={self.word}&limit={self.limit}&page={page}' 68 | response = await AsyncFetcher.fetch_all([req_url], headers=headers, proxy=self.proxy, json=True) 69 | temp_emails, temp_hostnames = await self.parse_resp(response[0]) 70 | self.emails.extend(temp_emails) 71 | self.hostnames.extend(temp_hostnames) 72 | await asyncio.sleep(1) 73 | 74 | async def parse_resp(self, json_resp): 75 | emails = list(sorted({email['email'] for email in json_resp['data']['emails']})) 76 | domains = list( 77 | sorted( 78 | { 79 | source['website_url'] 80 | for email in json_resp['data']['emails'] 81 | for source in email['sources'] 82 | if self.word in source['website_url'] 83 | } 84 | ) 85 | ) 86 | return emails, domains 87 | 88 | async def process(self, proxy: bool = False) -> None: 89 | self.proxy = proxy 90 | await self.do_search() # Only need to do it once. 91 | 92 | async def get_emails(self): 93 | return self.emails 94 | 95 | async def get_hostnames(self): 96 | return self.hostnames 97 | -------------------------------------------------------------------------------- /theHarvester/discovery/urlscan.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher 2 | 3 | 4 | class SearchUrlscan: 5 | def __init__(self, word) -> None: 6 | self.word = word 7 | self.totalhosts: set = set() 8 | self.totalips: set = set() 9 | self.interestingurls: set = set() 10 | self.totalasns: set = set() 11 | self.proxy = False 12 | 13 | async def do_search(self) -> None: 14 | url = f'https://urlscan.io/api/v1/search/?q=domain:{self.word}' 15 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 16 | resp = response[0] 17 | self.totalhosts = {f'{page["page"]["domain"]}' for page in resp['results']} 18 | self.totalips = {f'{page["page"]["ip"]}' for page in resp['results'] if 'ip' in page['page'].keys()} 19 | self.interestingurls = { 20 | f'{page["page"]["url"]}' 21 | for page in resp['results'] 22 | if self.word in page['page']['url'] and 'url' in page['page'].keys() 23 | } 24 | self.totalasns = {f'{page["page"]["asn"]}' for page in resp['results'] if 'asn' in page['page'].keys()} 25 | 26 | async def get_hostnames(self) -> set: 27 | return self.totalhosts 28 | 29 | async def get_ips(self) -> set: 30 | return self.totalips 31 | 32 | async def get_interestingurls(self) -> set: 33 | return self.interestingurls 34 | 35 | async def get_asns(self) -> set: 36 | return self.totalasns 37 | 38 | async def process(self, proxy: bool = False) -> None: 39 | self.proxy = proxy 40 | await self.do_search() 41 | -------------------------------------------------------------------------------- /theHarvester/discovery/venacussearch.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import aiohttp 4 | 5 | from theHarvester.discovery.constants import MissingKey 6 | from theHarvester.lib.core import Core 7 | from theHarvester.parsers import venacusparser 8 | 9 | 10 | class SearchVenacus: 11 | def __init__(self, word: str, limit=1000, offset_doc=0) -> None: 12 | self.word = word 13 | self.key = Core.venacus_key() 14 | if self.key is None: 15 | raise 
MissingKey('Venacus') 16 | self.base_url = 'https://api.venacus.com' 17 | self.results: list[dict[str, Any]] = [] 18 | self.parsed: dict[str, Any] = {} 19 | self.proxy = False 20 | self.offset_doc = offset_doc 21 | self.offset_in_doc = 0 22 | self.ai = False 23 | self.more = True 24 | self.limit = limit 25 | 26 | async def do_search(self) -> None: 27 | total_results = [] 28 | result_count = 0 29 | 30 | try: 31 | headers = { 32 | 'Authorization': f'Bearer {self.key}', 33 | 'User-Agent': f'{Core.get_user_agent()}-theHarvester', 34 | } 35 | 36 | async with aiohttp.ClientSession() as session: 37 | while self.more and result_count < self.limit: 38 | query = { 39 | 'q': self.word, 40 | 'offset_doc': self.offset_doc, 41 | 'offset_in_doc': self.offset_in_doc, 42 | 'limit': 100, 43 | 'ai': 'true' if self.ai else 'false', 44 | } 45 | 46 | async with session.get(f'{self.base_url}/v1/search/', headers=headers, params=query) as total_resp: 47 | search_data = await total_resp.json() 48 | current_results = search_data.get('data', []) 49 | 50 | if not current_results: 51 | print('No more results found.') 52 | break 53 | 54 | total_results.extend(current_results) 55 | result_count += len(current_results) 56 | 57 | self.offset_doc = search_data.get('offset_doc', 0) 58 | self.offset_in_doc = search_data.get('offset_in_doc', 0) 59 | 60 | self.more = search_data.get('more', False) 61 | 62 | self.results = total_results 63 | if not self.results: 64 | print('No results found.') 65 | 66 | except Exception as e: 67 | print(f'An exception has occurred in Venacus: {e}') 68 | 69 | async def process(self, proxy: bool = False): 70 | self.proxy = proxy 71 | await self.do_search() 72 | parser = venacusparser.Parser() 73 | self.parsed = await parser.parse_text_tokens(self.results) # type: ignore 74 | 75 | async def get_people(self) -> list[dict[str, str]]: 76 | if 'people' not in self.parsed: 77 | return [] 78 | return self.parsed['people'] 79 | 80 | async def get_emails(self) -> set[str]: 81 | if 'emails' not in self.parsed: 82 | return set() 83 | return self.parsed['emails'] 84 | 85 | async def get_ips(self) -> set[str]: 86 | if 'ips' not in self.parsed: 87 | return set() 88 | return self.parsed['ips'] 89 | 90 | async def get_interestingurls(self) -> set[str]: 91 | if 'urls' not in self.parsed: 92 | return set() 93 | return self.parsed['urls'] 94 | -------------------------------------------------------------------------------- /theHarvester/discovery/virustotal.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchVirustotal: 8 | def __init__(self, word) -> None: 9 | self.key = Core.virustotal_key() 10 | if self.key is None: 11 | raise MissingKey('virustotal') 12 | self.word = word 13 | self.proxy = False 14 | self.hostnames: list = [] 15 | 16 | async def do_search(self) -> None: 17 | # TODO determine if more endpoints can yield useful info given a domain 18 | # based on: https://developers.virustotal.com/reference/domains-relationships 19 | # base_url = "https://www.virustotal.com/api/v3/domains/domain/subdomains?limit=40" 20 | headers = { 21 | 'User-Agent': Core.get_user_agent(), 22 | 'Accept': 'application/json', 23 | 'x-apikey': self.key, 24 | } 25 | base_url = f'https://www.virustotal.com/api/v3/domains/{self.word}/subdomains?limit=40' 26 | cursor = '' 27 | count = 0 28 | fail_counter = 0 29 | counter = 0 30 | breakcon = 
False 31 | while True: 32 | if breakcon: 33 | break 34 | # rate limit is 4 per minute 35 | # TODO add timer logic if proven to be needed 36 | # in the meantime sleeping 16 seconds should eliminate hitting the rate limit 37 | # in case rate limit is hit, fail counter exists and sleep for 65 seconds 38 | send_url = base_url + '&cursor=' + cursor if cursor != '' and len(cursor) > 2 else base_url 39 | responses = await AsyncFetcher.fetch_all([send_url], headers=headers, proxy=self.proxy, json=True) 40 | jdata = responses[0] 41 | if 'data' not in jdata.keys(): 42 | await asyncio.sleep(60 + 5) 43 | fail_counter += 1 44 | if 'meta' in jdata.keys(): 45 | cursor = jdata['meta']['cursor'] if 'cursor' in jdata['meta'].keys() else '' 46 | if len(cursor) == 0 and 'data' in jdata.keys(): 47 | # if cursor no longer is within the meta field have hit last entry 48 | breakcon = True 49 | count += jdata['meta']['count'] 50 | if count == 0 or fail_counter >= 2: 51 | break 52 | if 'data' in jdata.keys(): 53 | data = jdata['data'] 54 | self.hostnames.extend(await self.parse_hostnames(data, self.word)) 55 | counter += 1 56 | await asyncio.sleep(16) 57 | self.hostnames = list(sorted(set(self.hostnames))) 58 | # verify domains such as x.x.com.multicdn.x.com are parsed properly 59 | self.hostnames = [ 60 | host for host in self.hostnames if ((len(host.split('.')) >= 3) and host.split('.')[-2] == self.word.split('.')[-2]) 61 | ] 62 | 63 | async def get_hostnames(self) -> list: 64 | return self.hostnames 65 | 66 | @staticmethod 67 | async def parse_hostnames(data, word): 68 | total_subdomains = set() 69 | for attribute in data: 70 | total_subdomains.add(attribute['id'].replace('"', '').replace('www.', '')) 71 | attributes = attribute['attributes'] 72 | total_subdomains.update( 73 | { 74 | value['value'].replace('"', '').replace('www.', '') 75 | for value in attributes['last_dns_records'] 76 | if word in value['value'] 77 | } 78 | ) 79 | if 'last_https_certificate' in attributes.keys(): 80 | total_subdomains.update( 81 | { 82 | value.replace('"', '').replace('www.', '') 83 | for value in attributes['last_https_certificate']['extensions']['subject_alternative_name'] 84 | if word in value 85 | } 86 | ) 87 | total_subdomains = list(sorted(total_subdomains)) 88 | # Other false positives may occur over time and yes there are other ways to parse this, feel free to implement 89 | # them and submit a PR or raise an issue if you run into this filtering not being enough 90 | # TODO determine if parsing 'v=spf1 include:_spf-x.acme.com include:_spf-x.acme.com' is worth parsing 91 | total_subdomains = [ 92 | x 93 | for x in total_subdomains 94 | if 'edgekey.net' not in str(x) and 'akadns.net' not in str(x) and 'include:_spf' not in str(x) 95 | ] 96 | total_subdomains.sort() 97 | return total_subdomains 98 | 99 | async def process(self, proxy: bool = False) -> None: 100 | self.proxy = proxy 101 | await self.do_search() 102 | -------------------------------------------------------------------------------- /theHarvester/discovery/whoisxml.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import MissingKey 2 | from theHarvester.lib.core import AsyncFetcher, Core 3 | 4 | 5 | class SearchWhoisXML: 6 | def __init__(self, word) -> None: 7 | self.word = word 8 | self.key = Core.whoisxml_key() 9 | if self.key is None: 10 | raise MissingKey('whoisxml') 11 | self.total_results = None 12 | self.proxy = False 13 | 14 | async def do_search(self): 15 | # 
https://subdomains.whoisxmlapi.com/api/documentation/making-requests 16 | url = 'https://subdomains.whoisxmlapi.com/api/v1' 17 | params = {'apiKey': self.key, 'domainName': self.word} 18 | response = await AsyncFetcher.fetch_all( 19 | [url], 20 | json=True, 21 | params=params, 22 | headers={'User-Agent': Core.get_user_agent()}, 23 | proxy=self.proxy, 24 | ) 25 | # Parse the response according to the example JSON structure: 26 | # {"search":"example.com.com","result":{"count":10000,"records":[{"domain":"test.example.com","firstSeen":1678169400,"lastSeen":1678169400}]}} 27 | self.total_results = [] 28 | print(response[0]) 29 | if response and response[0]: 30 | # Extract domains from the records array 31 | if 'result' in response[0] and 'records' in response[0]['result']: 32 | self.total_results = [record['domain'] for record in response[0]['result']['records']] 33 | 34 | async def get_hostnames(self): 35 | return self.total_results 36 | 37 | async def process(self, proxy: bool = False) -> None: 38 | self.proxy = proxy 39 | await self.do_search() 40 | -------------------------------------------------------------------------------- /theHarvester/discovery/yahoosearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher, Core 2 | from theHarvester.parsers import myparser 3 | 4 | 5 | class SearchYahoo: 6 | def __init__(self, word, limit) -> None: 7 | self.word = word 8 | self.total_results = '' 9 | self.server = 'search.yahoo.com' 10 | self.limit = limit 11 | self.proxy = False 12 | 13 | async def do_search(self) -> None: 14 | base_url = f'https://{self.server}/search?p=%40{self.word}&b=xx&pz=10' 15 | headers = {'Host': self.server, 'User-agent': Core.get_user_agent()} 16 | urls = [base_url.replace('xx', str(num)) for num in range(0, self.limit, 10) if num <= self.limit] 17 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 18 | for response in responses: 19 | self.total_results += response 20 | 21 | async def process(self, proxy: bool = False) -> None: 22 | self.proxy = proxy 23 | await self.do_search() 24 | 25 | async def get_emails(self): 26 | rawres = myparser.Parser(self.total_results, self.word) 27 | toparse_emails = await rawres.emails() 28 | emails = set() 29 | # strip out numbers and dashes for emails that look like xxx-xxx-xxxemail@host.tld 30 | for email in toparse_emails: 31 | email = str(email) 32 | if '-' in email and email[0].isdigit() and email.index('-') <= 9: 33 | while email[0] == '-' or email[0].isdigit(): 34 | email = email[1:] 35 | emails.add(email) 36 | return list(emails) 37 | 38 | async def get_hostnames(self, proxy: bool = False): 39 | self.proxy = proxy 40 | rawres = myparser.Parser(self.total_results, self.word) 41 | return await rawres.hostnames() 42 | -------------------------------------------------------------------------------- /theHarvester/lib/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['hostchecker'] 2 | -------------------------------------------------------------------------------- /theHarvester/lib/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester/lib/api/__init__.py -------------------------------------------------------------------------------- /theHarvester/lib/api/additional_endpoints.py: 
-------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Depends, HTTPException 2 | from pydantic import BaseModel 3 | 4 | from theHarvester.discovery.additional_apis import AdditionalAPIs 5 | from theHarvester.lib.api.auth import get_api_key 6 | 7 | router = APIRouter() 8 | 9 | 10 | class DomainRequest(BaseModel): 11 | domain: str 12 | api_keys: dict[str, str] | None = None 13 | 14 | 15 | @router.post('/breaches') 16 | async def get_breaches(request: DomainRequest, api_key: str = Depends(get_api_key)): 17 | """Get breach information for a domain using HaveIBeenPwned.""" 18 | try: 19 | apis = AdditionalAPIs(request.domain, request.api_keys or {}) 20 | results = await apis.haveibeenpwned.search_breaches(request.domain) 21 | return {'status': 'success', 'data': results} 22 | except Exception as e: 23 | raise HTTPException(status_code=500, detail=str(e)) 24 | 25 | 26 | @router.post('/leaks') 27 | async def get_leaks(request: DomainRequest, api_key: str = Depends(get_api_key)): 28 | """Get leaked credentials for a domain using Leak-Lookup.""" 29 | try: 30 | apis = AdditionalAPIs(request.domain, request.api_keys or {}) 31 | results = await apis.leaklookup.search_leaks(request.domain) 32 | return {'status': 'success', 'data': results} 33 | except Exception as e: 34 | raise HTTPException(status_code=500, detail=str(e)) 35 | 36 | 37 | @router.post('/security-score') 38 | async def get_security_score(request: DomainRequest, api_key: str = Depends(get_api_key)): 39 | """Get security scorecard for a domain.""" 40 | try: 41 | apis = AdditionalAPIs(request.domain, request.api_keys or {}) 42 | results = await apis.securityscorecard.get_domain_score(request.domain) 43 | return {'status': 'success', 'data': results} 44 | except Exception as e: 45 | raise HTTPException(status_code=500, detail=str(e)) 46 | 47 | 48 | @router.post('/tech-stack') 49 | async def get_tech_stack(request: DomainRequest, api_key: str = Depends(get_api_key)): 50 | """Get technology stack information for a domain using BuiltWith.""" 51 | try: 52 | apis = AdditionalAPIs(request.domain, request.api_keys or {}) 53 | results = await apis.builtwith.get_tech_stack(request.domain) 54 | return {'status': 'success', 'data': results} 55 | except Exception as e: 56 | raise HTTPException(status_code=500, detail=str(e)) 57 | 58 | 59 | @router.post('/all') 60 | async def get_all_info(request: DomainRequest, api_key: str = Depends(get_api_key)): 61 | """Get all additional information for a domain.""" 62 | try: 63 | apis = AdditionalAPIs(request.domain, request.api_keys or {}) 64 | results = await apis.process() 65 | return {'status': 'success', 'data': results} 66 | except Exception as e: 67 | raise HTTPException(status_code=500, detail=str(e)) 68 | -------------------------------------------------------------------------------- /theHarvester/lib/api/api.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from fastapi import FastAPI, Header, Query, Request 5 | from fastapi.middleware.cors import CORSMiddleware 6 | from fastapi.responses import HTMLResponse, RedirectResponse, Response, UJSONResponse 7 | from slowapi import Limiter, _rate_limit_exceeded_handler 8 | from slowapi.errors import RateLimitExceeded 9 | from slowapi.util import get_remote_address 10 | from starlette.staticfiles import StaticFiles 11 | 12 | from theHarvester import __main__ 13 | from theHarvester.lib.api.additional_endpoints import router as 
additional_router 14 | 15 | limiter = Limiter(key_func=get_remote_address) 16 | app = FastAPI( 17 | title='Restful Harvest', 18 | description='Rest API for theHarvester powered by FastAPI', 19 | version='0.0.2', 20 | ) 21 | app.state.limiter = limiter 22 | app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler) # type: ignore 23 | 24 | # Add CORS middleware 25 | app.add_middleware( 26 | CORSMiddleware, 27 | allow_origins=['*'], 28 | allow_credentials=True, 29 | allow_methods=['*'], 30 | allow_headers=['*'], 31 | ) 32 | 33 | # Include additional endpoints 34 | app.include_router(additional_router, prefix='/additional', tags=['Additional APIs']) 35 | 36 | # This is where we will host files that arise if the user specifies a filename 37 | try: 38 | app.mount('/static', StaticFiles(directory='theHarvester/lib/api/static/'), name='static') 39 | except RuntimeError: 40 | static_path = os.path.expanduser('~/.local/share/theHarvester/static/') 41 | if not os.path.isdir(static_path): 42 | os.makedirs(static_path) 43 | app.mount( 44 | '/static', 45 | StaticFiles(directory=static_path), 46 | name='static', 47 | ) 48 | 49 | 50 | @app.get('/') 51 | async def root(*, user_agent: str = Header(None)) -> Response: 52 | # very basic user agent filtering 53 | if user_agent and ('gobuster' in user_agent or 'sqlmap' in user_agent or 'rustbuster' in user_agent): 54 | response = RedirectResponse(app.url_path_for('bot')) 55 | return response 56 | 57 | return HTMLResponse( 58 | """ 59 | 60 | 61 | 62 | theHarvester API 63 | 69 | 70 | 71 |
72 | 73 | 74 | theHarvester logo 75 | 76 | 77 | 78 | 79 | """ 80 | ) 81 | 82 | 83 | @app.get('/nicebot') 84 | async def bot() -> dict[str, str]: 85 | # nice bot 86 | string = {'bot': 'These are not the droids you are looking for'} 87 | return string 88 | 89 | 90 | @app.get('/sources', response_class=UJSONResponse) 91 | @limiter.limit('5/minute') 92 | async def getsources(request: Request): 93 | # Endpoint for user to query for available sources theHarvester supports 94 | # Rate limit of 5 requests per minute 95 | sources = __main__.Core.get_supportedengines() 96 | return {'sources': sources} 97 | 98 | 99 | @app.get('/dnsbrute') 100 | @limiter.limit('5/minute') 101 | async def dnsbrute( 102 | request: Request, 103 | user_agent: str = Header(None), 104 | domain: str = Query(..., description='Domain to be brute forced'), 105 | ) -> Response: 106 | # Endpoint for user to signal to do DNS brute forcing 107 | # Rate limit of 5 requests per minute 108 | # basic user agent filtering 109 | if user_agent and ('gobuster' in user_agent or 'sqlmap' in user_agent or 'rustbuster' in user_agent): 110 | response = RedirectResponse(app.url_path_for('bot')) 111 | return response 112 | dns_bruteforce = await __main__.start( 113 | argparse.Namespace( 114 | dns_brute=True, 115 | dns_lookup=False, 116 | dns_server=False, 117 | dns_tld=False, 118 | domain=domain, 119 | filename='', 120 | google_dork=False, 121 | limit=500, 122 | proxies=False, 123 | shodan=False, 124 | source=','.join([]), 125 | start=0, 126 | take_over=False, 127 | virtual_host=False, 128 | ) 129 | ) 130 | return UJSONResponse({'dns_bruteforce': dns_bruteforce}) 131 | 132 | 133 | @app.get('/query') 134 | @limiter.limit('2/minute') 135 | async def query( 136 | request: Request, 137 | dns_server: str = Query(''), 138 | user_agent: str = Header(None), 139 | dns_brute: bool = Query(False), 140 | dns_lookup: bool = Query(False), 141 | dns_tld: bool = Query(False), 142 | filename: str = Query(''), 143 | google_dork: bool = Query(False), 144 | proxies: bool = Query(False), 145 | shodan: bool = Query(False), 146 | take_over: bool = Query(False), 147 | virtual_host: bool = Query(False), 148 | source: list[str] = Query(..., description='Data sources to query comma separated with no space'), 149 | limit: int = Query(500), 150 | start: int = Query(0), 151 | domain: str = Query(..., description='Domain to be harvested'), 152 | ) -> Response: 153 | # Query function that allows user to query theHarvester rest API 154 | # Rate limit of 2 requests per minute 155 | # basic user agent filtering 156 | if user_agent and ('gobuster' in user_agent or 'sqlmap' in user_agent or 'rustbuster' in user_agent): 157 | response = RedirectResponse(app.url_path_for('bot')) 158 | return response 159 | try: 160 | ( 161 | asns, 162 | iurls, 163 | twitter_people_list, 164 | linkedin_people_list, 165 | linkedin_links, 166 | aurls, 167 | aips, 168 | aemails, 169 | ahosts, 170 | ) = await __main__.start( 171 | argparse.Namespace( 172 | dns_brute=dns_brute, 173 | dns_lookup=dns_lookup, 174 | dns_server=dns_server, 175 | dns_tld=dns_tld, 176 | domain=domain, 177 | filename=filename, 178 | google_dork=google_dork, 179 | limit=limit, 180 | proxies=proxies, 181 | shodan=shodan, 182 | source=','.join(source), 183 | start=start, 184 | take_over=take_over, 185 | virtual_host=virtual_host, 186 | ) 187 | ) 188 | 189 | return UJSONResponse( 190 | { 191 | 'asns': asns, 192 | 'interesting_urls': iurls, 193 | 'twitter_people': twitter_people_list, 194 | 'linkedin_people': linkedin_people_list, 195 | 
'linkedin_links': linkedin_links, 196 | 'trello_urls': aurls, 197 | 'ips': aips, 198 | 'emails': aemails, 199 | 'hosts': ahosts, 200 | } 201 | ) 202 | except Exception: 203 | return UJSONResponse({'exception': 'Please contact the server administrator to check the issue'}) 204 | -------------------------------------------------------------------------------- /theHarvester/lib/api/api_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example script to query theHarvester rest API, obtain results, and write out to stdout as well as an html 3 | """ 4 | 5 | import asyncio 6 | 7 | import aiohttp 8 | import netaddr 9 | 10 | 11 | async def fetch_json(session, url): 12 | async with session.get(url) as response: 13 | return await response.json() 14 | 15 | 16 | async def fetch(session, url): 17 | async with session.get(url) as response: 18 | return await response.text() 19 | 20 | 21 | async def main() -> None: 22 | """ 23 | Just a simple example of how to interact with the rest api 24 | you can easily use requests instead of aiohttp or whatever you best see fit 25 | """ 26 | url = 'http://127.0.0.1:5000' 27 | domain = 'netflix.com' 28 | query_url = f'{url}/query?limit=300&source=bing,baidu,duckduckgo,dogpile&domain={domain}' 29 | async with aiohttp.ClientSession() as session: 30 | fetched_json = await fetch_json(session, query_url) 31 | total_asns = fetched_json['asns'] 32 | interesting_urls = fetched_json['interesting_urls'] 33 | twitter_people_list_tracker = fetched_json['twitter_people'] 34 | linkedin_people_list_tracker = fetched_json['linkedin_people'] 35 | linkedin_links_tracker = fetched_json['linkedin_links'] 36 | trello_urls = fetched_json['trello_urls'] 37 | ips = fetched_json['ips'] 38 | emails = fetched_json['emails'] 39 | hosts = fetched_json['hosts'] 40 | 41 | if len(total_asns) > 0: 42 | print(f'\n[*] ASNS found: {len(total_asns)}') 43 | print('--------------------') 44 | total_asns = list(sorted(set(total_asns))) 45 | for asn in total_asns: 46 | print(asn) 47 | 48 | if len(interesting_urls) > 0: 49 | print(f'\n[*] Interesting Urls found: {len(interesting_urls)}') 50 | print('--------------------') 51 | interesting_urls = list(sorted(set(interesting_urls))) 52 | for iurl in interesting_urls: 53 | print(iurl) 54 | 55 | if len(twitter_people_list_tracker) == 0: 56 | print('\n[*] No Twitter users found.\n\n') 57 | else: 58 | if len(twitter_people_list_tracker) >= 1: 59 | print('\n[*] Twitter Users found: ' + str(len(twitter_people_list_tracker))) 60 | print('---------------------') 61 | twitter_people_list_tracker = list(sorted(set(twitter_people_list_tracker))) 62 | for usr in twitter_people_list_tracker: 63 | print(usr) 64 | 65 | if len(linkedin_people_list_tracker) == 0: 66 | print('\n[*] No LinkedIn users found.\n\n') 67 | else: 68 | if len(linkedin_people_list_tracker) >= 1: 69 | print('\n[*] LinkedIn Users found: ' + str(len(linkedin_people_list_tracker))) 70 | print('---------------------') 71 | linkedin_people_list_tracker = list(sorted(set(linkedin_people_list_tracker))) 72 | for usr in linkedin_people_list_tracker: 73 | print(usr) 74 | 75 | if len(linkedin_links_tracker) == 0: 76 | print(f'\n[*] LinkedIn Links found: {len(linkedin_links_tracker)}') 77 | linkedin_links_tracker = list(sorted(set(linkedin_links_tracker))) 78 | print('---------------------') 79 | for link in linkedin_links_tracker: 80 | print(link) 81 | 82 | length_urls = len(trello_urls) 83 | total = length_urls 84 | print('\n[*] Trello URLs found: ' + 
str(total)) 85 | print('--------------------') 86 | all_urls = list(sorted(set(trello_urls))) 87 | for url in sorted(all_urls): 88 | print(url) 89 | 90 | if len(ips) == 0: 91 | print('\n[*] No IPs found.') 92 | else: 93 | print('\n[*] IPs found: ' + str(len(ips))) 94 | print('-------------------') 95 | # use netaddr as the list may contain ipv4 and ipv6 addresses 96 | ip_list = sorted([netaddr.IPAddress(ip.strip()) for ip in set(ips)]) 97 | print('\n'.join(map(str, ip_list))) 98 | 99 | if len(emails) == 0: 100 | print('\n[*] No emails found.') 101 | else: 102 | print('\n[*] Emails found: ' + str(len(emails))) 103 | print('----------------------') 104 | all_emails = sorted(list(set(emails))) 105 | print('\n'.join(all_emails)) 106 | 107 | if len(hosts) == 0: 108 | print('\n[*] No hosts found.\n\n') 109 | else: 110 | print('\n[*] Hosts found: ' + str(len(hosts))) 111 | print('---------------------') 112 | print('\n'.join(hosts)) 113 | 114 | 115 | if __name__ == '__main__': 116 | asyncio.run(main()) 117 | -------------------------------------------------------------------------------- /theHarvester/lib/api/static/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester/lib/api/static/.gitkeep -------------------------------------------------------------------------------- /theHarvester/lib/hostchecker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Created by laramies on 2008-08-21. 4 | Revised to use aiodns & asyncio on 2019-09-23 5 | """ 6 | 7 | # Support for Python3.9 8 | from __future__ import annotations 9 | 10 | import asyncio 11 | import socket 12 | from typing import Any 13 | 14 | import aiodns 15 | 16 | 17 | class Checker: 18 | def __init__(self, hosts: list, nameservers: list) -> None: 19 | self.hosts = hosts 20 | self.realhosts: list = [] 21 | self.addresses: set = set() 22 | self.nameservers = nameservers 23 | 24 | # @staticmethod 25 | # async def query(host, resolver) -> Tuple[str, Any]: 26 | # try: 27 | # result = await resolver.gethostbyname(host, socket.AF_INET) 28 | # addresses = result.addresses 29 | # if addresses == [] or addresses is None or result is None: 30 | # return f"{host}:", tuple() 31 | # else: 32 | # return f"{host}:{', '.join(map(str, addresses))}", addresses 33 | # except Exception: 34 | # return f"{host}", tuple() 35 | 36 | @staticmethod 37 | async def resolve_host(host, resolver) -> str: 38 | try: 39 | # TODO add check for ipv6 addrs as well 40 | result = await resolver.gethostbyname(host, socket.AF_INET) 41 | addresses = result.addresses 42 | if addresses == [] or addresses is None or result is None: 43 | return f'{host}:' 44 | else: 45 | addresses = ','.join(map(str, list(sorted(set(addresses))))) 46 | # addresses = list(sorted(addresses)) 47 | return f'{host}:{addresses}' 48 | except Exception: 49 | return f'{host}:' 50 | 51 | # https://stackoverflow.com/questions/312443/how-do-i-split-a-list-into-equally-sized-chunks 52 | @staticmethod 53 | def chunks(lst, n): 54 | """Yield successive n-sized chunks from lst.""" 55 | for i in range(0, len(lst), n): 56 | yield lst[i : i + n] 57 | 58 | async def query_all(self, resolver, hosts) -> list[Any]: 59 | # TODO chunk list into 50 pieces regardless of IPs and subnets 60 | results = await asyncio.gather(*[asyncio.create_task(self.resolve_host(host, resolver)) for host in hosts]) 61 | return results 62 
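    # Illustrative usage (a minimal sketch; the host list and empty nameserver
    # list below are placeholder values): the class is normally driven through
    # check(), defined just below, which builds an aiodns resolver, fans
    # resolve_host() out over 50-host chunks via query_all(), and returns
    # (all_results, realhosts, addresses).
    #
    #     import asyncio
    #     from theHarvester.lib.hostchecker import Checker
    #
    #     checker = Checker(['example.com', 'www.example.com'], nameservers=[])
    #     resolved, hosts, ips = asyncio.run(checker.check())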
| 63 | async def check(self): 64 | loop = asyncio.get_event_loop() 65 | resolver = ( 66 | aiodns.DNSResolver(loop=loop, timeout=8) 67 | if len(self.nameservers) == 0 68 | else aiodns.DNSResolver(loop=loop, timeout=8, nameservers=self.nameservers) 69 | ) 70 | all_results = set() 71 | for chunk in self.chunks(self.hosts, 50): 72 | # TODO split this to get IPs added total ips 73 | results = await self.query_all(resolver, chunk) 74 | all_results.update(results) 75 | for pair in results: 76 | host, addresses = pair.split(':') 77 | self.realhosts.append(host) 78 | self.addresses.update({addr for addr in addresses.split(',')}) 79 | # address may be a list of ips 80 | # and do a set comprehension to remove duplicates 81 | self.realhosts.sort() 82 | self.addresses = list(self.addresses) 83 | all_results = list(sorted(all_results)) 84 | return all_results, self.realhosts, self.addresses 85 | -------------------------------------------------------------------------------- /theHarvester/lib/version.py: -------------------------------------------------------------------------------- 1 | VERSION = '4.8.0' 2 | 3 | 4 | def version() -> str: 5 | return VERSION 6 | -------------------------------------------------------------------------------- /theHarvester/parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester/parsers/__init__.py -------------------------------------------------------------------------------- /theHarvester/parsers/intelxparser.py: -------------------------------------------------------------------------------- 1 | class Parser: 2 | def __init__(self) -> None: 3 | self.emails: set = set() 4 | self.hosts: set = set() 5 | 6 | async def parse_dictionaries(self, results: dict) -> tuple: 7 | """ 8 | Parse method to parse json results 9 | :param results: Dictionary containing a list of dictionaries known as selectors 10 | :return: tuple of emails and hosts 11 | """ 12 | if results is not None: 13 | for dictionary in results['selectors']: 14 | field = dictionary['selectorvalue'] 15 | if '@' in field: 16 | self.emails.add(field) 17 | else: 18 | field = str(field) 19 | if 'http' in field or 'https' in field: 20 | if field[:5] == 'https': 21 | field = field[8:] 22 | else: 23 | field = field[7:] 24 | self.hosts.add(field.replace(')', '').replace(',', '')) 25 | return self.emails, self.hosts 26 | return None, None 27 | -------------------------------------------------------------------------------- /theHarvester/parsers/myparser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections.abc import Set 3 | 4 | 5 | class Parser: 6 | def __init__(self, results, word) -> None: 7 | self.results = results 8 | self.word = word 9 | self.temp: list = [] 10 | 11 | async def genericClean(self) -> None: 12 | self.results = ( 13 | self.results.replace('', '') 14 | .replace('', '') 15 | .replace('', '') 16 | .replace('', '') 17 | .replace('%3a', '') 18 | .replace('', '') 19 | .replace('', '') 20 | .replace('', '') 21 | .replace('', '') 22 | ) 23 | 24 | for search in ( 25 | '<', 26 | '>', 27 | ':', 28 | '=', 29 | ';', 30 | '&', 31 | '%3A', 32 | '%3D', 33 | '%3C', 34 | '%2f', 35 | '/', 36 | '\\', 37 | ): 38 | self.results = self.results.replace(search, ' ') 39 | 40 | async def urlClean(self) -> None: 41 | self.results = self.results.replace('', '').replace('', '').replace('%2f', '').replace('%3a', '') 
42 | for search in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C'): 43 | self.results = self.results.replace(search, ' ') 44 | 45 | async def emails(self): 46 | await self.genericClean() 47 | # Local part is required, charset is flexible. 48 | # https://tools.ietf.org/html/rfc6531 (removed * and () as they provide FP mostly) 49 | reg_emails = re.compile(r'[a-zA-Z0-9.\-_+#~!$&\',;=:]+' + '@' + '[a-zA-Z0-9.-]*' + self.word.replace('www.', '')) 50 | self.temp = reg_emails.findall(self.results) 51 | emails = await self.unique() 52 | true_emails = { 53 | ( 54 | str(email)[1:].lower().strip() 55 | if len(str(email)) > 1 and str(email)[0] == '.' 56 | else len(str(email)) > 1 and str(email).lower().strip() 57 | ) 58 | for email in emails 59 | } 60 | # if email starts with dot shift email string and make sure all emails are lowercase 61 | return true_emails 62 | 63 | async def fileurls(self, file) -> list: 64 | urls: list = [] 65 | reg_urls = re.compile('(.*?)') 92 | temp = reg_hosts.findall(self.results) 93 | for iteration in temp: 94 | if iteration.count(':'): 95 | res = iteration.split(':')[1].split('/')[2] 96 | else: 97 | res = iteration.split('/')[0] 98 | self.temp.append(res) 99 | hostnames = await self.unique() 100 | return hostnames 101 | 102 | async def set(self): 103 | reg_sets = re.compile(r'>[a-zA-Z\d]*') 104 | self.temp = reg_sets.findall(self.results) 105 | sets = [] 106 | for iteration in self.temp: 107 | delete = iteration.replace('>', '') 108 | delete = delete.replace(' Set[str]: 113 | found = re.finditer(r'(http|https)://(www\.)?trello.com/([a-zA-Z\d\-_\.]+/?)*', self.results) 114 | urls = {match.group().strip() for match in found} 115 | return urls 116 | 117 | async def unique(self) -> list: 118 | return list(set(self.temp)) 119 | -------------------------------------------------------------------------------- /theHarvester/parsers/securitytrailsparser.py: -------------------------------------------------------------------------------- 1 | class Parser: 2 | def __init__(self, word, text) -> None: 3 | self.word = word 4 | self.text = text 5 | self.hostnames: set = set() 6 | self.ips: set = set() 7 | 8 | async def parse_text(self) -> tuple[set, set]: 9 | sub_domain_flag = 0 10 | self.text = str(self.text).splitlines() 11 | # Split lines to get a list of lines. 12 | for index in range(0, len(self.text)): 13 | line = self.text[index].strip() 14 | if '"ip":' in line: 15 | # Extract IP. 16 | ip = '' 17 | for ch in line[7:]: 18 | if ch == '"': 19 | break 20 | else: 21 | ip += ch 22 | self.ips.add(ip) 23 | elif '"subdomains":' in line: 24 | # subdomains start here so set flag to 1 25 | sub_domain_flag = 1 26 | continue 27 | elif sub_domain_flag > 0: 28 | if ']' in line: 29 | sub_domain_flag = 0 30 | else: 31 | if 'www' in self.word: 32 | self.word = str(self.word).replace('www.', '').replace('www', '') 33 | # Remove www from word if entered 34 | self.hostnames.add(str(line).replace('"', '').replace(',', '') + '.' 
+ self.word) 35 | else: 36 | continue 37 | return self.ips, self.hostnames 38 | -------------------------------------------------------------------------------- /theHarvester/parsers/venacusparser.py: -------------------------------------------------------------------------------- 1 | import enum 2 | from collections.abc import Mapping 3 | from typing import Any 4 | 5 | 6 | class TokenTypesEnum(str, enum.Enum): 7 | ID = 'id' 8 | FIRSTNAME = 'firstname' 9 | LASTNAME = 'lastname' 10 | EMAIL = 'email' 11 | DOB = 'dob' 12 | URL = 'url' 13 | PHONE = 'phone' 14 | DATE = 'date' 15 | TIME = 'time' 16 | IP = 'ip_address' 17 | HASH = 'hash' 18 | PASSWORD = 'password' 19 | ADDRESS = 'address' 20 | COMPANY = 'company' 21 | JOB_TITLE = 'job_title' 22 | USERNAME = 'username' 23 | COUNTRY = 'country' 24 | CITY = 'city' 25 | STATE = 'state' 26 | ZIP_CODE = 'zip_code' 27 | CURRENCY = 'currency' 28 | INDUSTRY = 'industry' 29 | DEPARTMENT = 'department' 30 | ROLE = 'role' 31 | 32 | 33 | class Parser: 34 | def __init__(self) -> None: 35 | self.parsed_data: dict[str, set[str]] = {} 36 | self.people: list[dict[str, str]] = [] 37 | 38 | async def parse_text_tokens(self, results: list[dict[str, Any]]) -> Mapping[str, set[str] | list[dict[str, str]]]: 39 | """ 40 | Extracts different types of information from the recognized text tokens 41 | """ 42 | if not results: 43 | return {'people': set(), 'emails': set(), 'ips': set(), 'urls': set()} 44 | 45 | for res in results: 46 | person: dict[str, str] | None = None 47 | for token in res['tokens']: 48 | if token['type'] == TokenTypesEnum.EMAIL: 49 | if 'emails' not in self.parsed_data: 50 | self.parsed_data['emails'] = set() 51 | self.parsed_data['emails'].add(token['value']) 52 | person = person or {} 53 | person['email'] = token['value'] 54 | elif token['type'] == TokenTypesEnum.IP: 55 | if 'ips' not in self.parsed_data: 56 | self.parsed_data['ips'] = set() 57 | self.parsed_data['ips'].add(token['value']) 58 | elif token['type'] == TokenTypesEnum.URL: 59 | if 'urls' not in self.parsed_data: 60 | self.parsed_data['urls'] = set() 61 | self.parsed_data['urls'].add(token['value']) 62 | elif token['type'] == TokenTypesEnum.FIRSTNAME: 63 | person = person or {} 64 | person['firstname'] = token['value'] 65 | elif token['type'] == TokenTypesEnum.LASTNAME: 66 | person = person or {} 67 | person['lastname'] = token['value'] 68 | elif token['type'] == TokenTypesEnum.COMPANY: 69 | person = person or {} 70 | person['company'] = token['value'] 71 | elif token['type'] == TokenTypesEnum.CITY: 72 | person = person or {} 73 | person['city'] = token['value'] 74 | elif token['type'] == TokenTypesEnum.STATE: 75 | person = person or {} 76 | person['state'] = token['value'] 77 | elif token['type'] == TokenTypesEnum.COUNTRY: 78 | person = person or {} 79 | person['country'] = token['value'] 80 | elif token['type'] == TokenTypesEnum.ZIP_CODE: 81 | person = person or {} 82 | person['zip_code'] = token['value'] 83 | elif token['type'] == TokenTypesEnum.PHONE: 84 | person = person or {} 85 | person['phone'] = token['value'] 86 | elif token['type'] == TokenTypesEnum.ADDRESS: 87 | person = person or {} 88 | person['address'] = token['value'] 89 | elif token['type'] == TokenTypesEnum.ROLE: 90 | person = person or {} 91 | person['role'] = token['value'] 92 | elif token['type'] == TokenTypesEnum.DOB: 93 | person = person or {} 94 | person['dob'] = token['value'] 95 | elif token['type'] == TokenTypesEnum.JOB_TITLE: 96 | person = person or {} 97 | person['job_title'] = token['value'] 98 | elif 
token['type'] == TokenTypesEnum.INDUSTRY: 99 | person = person or {} 100 | person['industry'] = token['value'] 101 | elif token['type'] == TokenTypesEnum.DEPARTMENT: 102 | person = person or {} 103 | person['department'] = token['value'] 104 | 105 | if person: 106 | for key in person: 107 | if key != 'email': 108 | self.people.append(person) 109 | break 110 | 111 | if self.people: 112 | self.parsed_data['people'] = self.people # type: ignore 113 | 114 | return self.parsed_data 115 | -------------------------------------------------------------------------------- /theHarvester/restfulHarvest.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import uvicorn 4 | 5 | 6 | def main(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument( 9 | '-H', 10 | '--host', 11 | default='127.0.0.1', 12 | help='IP address to listen on default is 127.0.0.1', 13 | ) 14 | parser.add_argument( 15 | '-p', 16 | '--port', 17 | default=5000, 18 | help='Port to bind the web server to, default is 5000', 19 | type=int, 20 | ) 21 | parser.add_argument( 22 | '-l', 23 | '--log-level', 24 | default='info', 25 | help='Set logging level, default is info but [critical|error|warning|info|debug|trace] can be set', 26 | ) 27 | parser.add_argument( 28 | '-r', 29 | '--reload', 30 | default=False, 31 | help='Enable automatic reload used during development of the api', 32 | action='store_true', 33 | ) 34 | 35 | args: argparse.Namespace = parser.parse_args() 36 | uvicorn.run( 37 | 'theHarvester.lib.api.api:app', 38 | host=args.host, 39 | port=args.port, 40 | log_level=args.log_level, 41 | reload=args.reload, 42 | ) 43 | 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /theHarvester/screenshot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester/screenshot/__init__.py -------------------------------------------------------------------------------- /theHarvester/screenshot/screenshot.py: -------------------------------------------------------------------------------- 1 | """ 2 | Screenshot module that utilizes playwright to asynchronously 3 | take screenshots 4 | """ 5 | 6 | import os 7 | import ssl 8 | import sys 9 | from collections.abc import Collection 10 | from datetime import datetime 11 | 12 | import aiohttp 13 | import certifi 14 | from playwright.async_api import async_playwright 15 | 16 | 17 | class ScreenShotter: 18 | def __init__(self, output) -> None: 19 | self.output = output 20 | self.slash = '\\' if 'win' in sys.platform else '/' 21 | self.slash = '' if (self.output[-1] == '\\' or self.output[-1] == '/') else self.slash 22 | 23 | def verify_path(self) -> bool: 24 | try: 25 | if not os.path.isdir(self.output): 26 | answer = input('[+] The output path you have entered does not exist would you like to create it (y/n): ') 27 | if answer.lower() == 'yes' or answer.lower() == 'y': 28 | os.makedirs(self.output) 29 | return True 30 | else: 31 | return False 32 | return True 33 | except Exception as e: 34 | print(f"An exception has occurred while attempting to verify output path's existence: {e}") 35 | return False 36 | 37 | @staticmethod 38 | async def verify_installation() -> None: 39 | # Helper function that verifies playwright & chromium is installed 40 | try: 41 | async with async_playwright() as p: 42 | browser = await 
p.chromium.launch() 43 | await browser.close() 44 | print('Playwright and Chromium are successfully installed.') 45 | except Exception as e: 46 | print(f'An exception has occurred while attempting to verify installation: {e}') 47 | 48 | @staticmethod 49 | def chunk_list(items: Collection, chunk_size: int) -> list: 50 | # Based off of: https://github.com/apache/incubator-sdap-ingester 51 | return [list(items)[i : i + chunk_size] for i in range(0, len(items), chunk_size)] 52 | 53 | @staticmethod 54 | async def visit(url: str) -> tuple[str, str]: 55 | try: 56 | timeout = aiohttp.ClientTimeout(total=35) 57 | headers = { 58 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' 59 | 'Chrome/122.0.0.0 Safari/537.36' 60 | } 61 | url = f'http://{url}' if not url.startswith('http') else url 62 | url = url.replace('www.', '') 63 | sslcontext = ssl.create_default_context(cafile=certifi.where()) 64 | async with aiohttp.ClientSession( 65 | timeout=timeout, 66 | headers=headers, 67 | connector=aiohttp.TCPConnector(ssl=sslcontext), 68 | ) as session: 69 | async with session.get(url, ssl=False) as resp: 70 | text = await resp.text('UTF-8') 71 | return f'http://{url}' if not url.startswith('http') else url, text 72 | except Exception as e: 73 | print(f'An exception has occurred while attempting to visit {url} : {e}') 74 | return '', '' 75 | 76 | async def take_screenshot(self, url: str) -> tuple[str, ...]: 77 | url = f'http://{url}' if not url.startswith('http') else url 78 | url = url.replace('www.', '') 79 | print(f'Attempting to take a screenshot of: {url}') 80 | async with async_playwright() as p: 81 | browser = await p.chromium.launch(headless=True) 82 | # New browser context 83 | context = await browser.new_context() 84 | page = await context.new_page() 85 | path = rf'{self.output}{self.slash}{url.replace("http://", "").replace("https://", "")}.png' 86 | date = str(datetime.utcnow()) 87 | try: 88 | # Will fail if network idle or load event doesn't fire after 89 | # 35s which should be handled 90 | await page.goto(url, timeout=35000) 91 | await page.screenshot(path=path) 92 | except Exception as e: 93 | print(f'An exception has occurred attempting to screenshot: {url} : {e}') 94 | path = '' 95 | finally: 96 | await page.close() 97 | await context.close() 98 | await browser.close() 99 | return date, url, path 100 | -------------------------------------------------------------------------------- /theHarvester/theHarvester.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import sys 3 | 4 | from theHarvester import __main__ 5 | 6 | 7 | def main(): 8 | platform = sys.platform 9 | if platform == 'win32': 10 | # Required or things will break if trying to take screenshots 11 | import multiprocessing 12 | 13 | multiprocessing.freeze_support() 14 | try: 15 | # See if we have winloop as a performance enhancement on windows 16 | import winloop 17 | 18 | asyncio.DefaultEventLoopPolicy = winloop.EventLoopPolicy 19 | except ModuleNotFoundError: 20 | asyncio.DefaultEventLoopPolicy = asyncio.WindowsSelectorEventLoopPolicy 21 | else: 22 | import uvloop 23 | 24 | uvloop.install() 25 | 26 | if 'linux' in platform: 27 | import aiomultiprocess 28 | 29 | # As we are not using Windows, we can change the spawn method to fork for greater performance 30 | aiomultiprocess.set_context('fork') 31 | asyncio.run(__main__.entry_point()) 32 | --------------------------------------------------------------------------------
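The REST endpoints above can be exercised with a small client in the same style as api_example.py. The sketch below is illustrative only: it assumes the server is running with the restfulHarvest defaults (http://127.0.0.1:5000) and that the get_api_key dependency (imported from theHarvester.lib.api.auth, whose implementation is not shown here) reads an X-API-Key header, so both the header name and the key value are placeholders to adjust for your deployment.

import asyncio

import aiohttp


async def main() -> None:
    base = 'http://127.0.0.1:5000'  # restfulHarvest defaults
    payload = {'domain': 'example.com', 'api_keys': None}  # shape of the DomainRequest model
    headers = {'X-API-Key': 'changeme'}  # placeholder; match whatever get_api_key expects
    async with aiohttp.ClientSession(headers=headers) as session:
        # The router is mounted under the /additional prefix, so the available routes are
        # /additional/breaches, /additional/leaks, /additional/security-score,
        # /additional/tech-stack and /additional/all
        async with session.post(f'{base}/additional/all', json=payload) as resp:
            result = await resp.json()
            # Successful handlers answer with {'status': 'success', 'data': ...}
            print(result)


if __name__ == '__main__':
    asyncio.run(main())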
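The generic search-result parser can also be run on its own, which helps show its expected inputs: Parser takes the raw response text plus the target word, and the async helpers strip markup before applying the regexes. The snippet below is a hedged sketch with made-up sample text; the same pattern applies to the other extractors, whose results depend on what the queried engine actually returns.

import asyncio

from theHarvester.parsers.myparser import Parser


async def main() -> None:
    # Made-up sample data: a scraped fragment containing addresses on the target domain.
    sample = 'Contact admin@example.com or support@example.com for help.'
    parser = Parser(sample, 'example.com')
    emails = await parser.emails()
    print(emails)  # e.g. {'admin@example.com', 'support@example.com'}


if __name__ == '__main__':
    asyncio.run(main())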
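The screenshot module is likewise straightforward to drive directly. The sketch below is illustrative: the output directory, host list, and chunk size of 5 are placeholder choices, and it assumes Playwright's Chromium build is already installed, which verify_installation() confirms before any captures are attempted.

import asyncio

from theHarvester.screenshot.screenshot import ScreenShotter


async def main() -> None:
    shooter = ScreenShotter('screenshots')      # placeholder output directory
    if not shooter.verify_path():               # offers to create the directory if it is missing
        return
    await shooter.verify_installation()         # checks Playwright and Chromium are usable
    hosts = ['example.com', 'www.example.org']  # placeholder targets
    for chunk in ScreenShotter.chunk_list(hosts, 5):
        shots = await asyncio.gather(*(shooter.take_screenshot(host) for host in chunk))
        for date, url, path in shots:
            print(date, url, path if path else 'screenshot failed')


if __name__ == '__main__':
    asyncio.run(main())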