├── .dockerignore ├── .git-blame-ignore-revs ├── .gitattributes ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ └── issue-template.md ├── dependabot.yml └── workflows │ ├── codeql-analysis.yml │ ├── docker-build-push.yml │ ├── dockerci.yml │ └── theHarvester.yml ├── .gitignore ├── Dockerfile ├── README.md ├── README ├── CONTRIBUTING.md ├── COPYING └── LICENSES ├── bin ├── restfulHarvest └── theHarvester ├── docker-compose.yml ├── pyproject.toml ├── requirements.txt ├── restfulHarvest.py ├── tests ├── __init__.py ├── discovery │ ├── __init__.py │ ├── test_certspotter.py │ ├── test_githubcode.py │ └── test_otx.py ├── lib │ └── test_core.py └── test_myparser.py ├── theHarvester-logo.png ├── theHarvester-logo.webp ├── theHarvester.py └── theHarvester ├── __init__.py ├── __main__.py ├── data ├── api-keys.yaml ├── proxies.yaml └── wordlists │ ├── api_endpoints.txt │ ├── dns-big.txt │ ├── dns-names.txt │ ├── dorks.txt │ ├── general │ └── common.txt │ └── names_small.txt ├── discovery ├── __init__.py ├── api_endpoints.py ├── baidusearch.py ├── bevigil.py ├── bingsearch.py ├── bravesearch.py ├── bufferoverun.py ├── builtwith.py ├── censysearch.py ├── certspottersearch.py ├── constants.py ├── criminalip.py ├── crtsh.py ├── dnssearch.py ├── duckduckgosearch.py ├── fullhuntsearch.py ├── githubcode.py ├── hackertarget.py ├── haveibeenpwned.py ├── huntersearch.py ├── intelxsearch.py ├── leaklookup.py ├── netlas.py ├── onyphe.py ├── otxsearch.py ├── pentesttools.py ├── projectdiscovery.py ├── rapiddns.py ├── rocketreach.py ├── search_dehashed.py ├── search_dnsdumpster.py ├── searchhunterhow.py ├── securityscorecard.py ├── securitytrailssearch.py ├── shodansearch.py ├── sitedossier.py ├── subdomaincenter.py ├── subdomainfinderc99.py ├── takeover.py ├── threatminer.py ├── tombasearch.py ├── urlscan.py ├── venacussearch.py ├── virustotal.py ├── whoisxml.py ├── yahoosearch.py └── zoomeyesearch.py ├── lib ├── __init__.py ├── api │ ├── __init__.py │ ├── additional_endpoints.py │ ├── api.py │ ├── api_example.py │ └── static │ │ └── .gitkeep ├── core.py ├── hostchecker.py ├── ip-ranges.json ├── resolvers.txt ├── stash.py └── version.py ├── parsers ├── __init__.py ├── intelxparser.py ├── myparser.py ├── securitytrailsparser.py └── venacusparser.py ├── restfulHarvest.py ├── screenshot ├── __init__.py └── screenshot.py └── theHarvester.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .github/* 2 | .gitattributes 3 | .git-blame-ignore-revs 4 | .idea/ 5 | .pytest_cache 6 | .mypy_cache 7 | tests/* 8 | README/ 9 | bin/ 10 | theHarvester-logo.png 11 | theHarvester-logo.webp -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # #1492 run `black .` and `isort .` 2 | c13843ec0d513ac7f9c35b7bd0501fa46e356415 -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, which is to have git automatically determine 2 | # whether a file is a text or binary, unless otherwise specified. 3 | 4 | * text=auto 5 | 6 | # Basic .gitattributes for a python repo. 
7 | 8 | # Source files 9 | # ============ 10 | *.pxd text diff=python 11 | *.py text diff=python 12 | *.py3 text diff=python 13 | *.pyw text diff=python 14 | *.pyx text diff=python 15 | 16 | # Binary files 17 | # ============ 18 | *.db binary 19 | *.p binary 20 | *.pkl binary 21 | *.pyc binary 22 | *.pyd binary 23 | *.pyo binary 24 | 25 | # Note: .db, .p, and .pkl files are associated with the python modules 26 | # ``pickle``, ``dbm.*``, # ``shelve``, ``marshal``, ``anydbm``, & ``bsddb`` 27 | # (among others). 28 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [L1ghtn1ng, NotoriousRebel] 4 | open_collective: # Replace with a single Open Collective username 5 | ko_fi: # 6 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 7 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 8 | liberapay: # Replace with a single Liberapay username 9 | issuehunt: # Replace with a single IssueHunt username 10 | otechie: # Replace with a single Otechie username 11 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 12 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/issue-template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Issue Template 3 | about: A template for new issues. 4 | title: "[Bug|Feature Request|Other] Short Description of Issue" 5 | labels: '' 6 | 7 | --- 8 | 9 | ## Note we do not support installing theHarvester on android 10 | 11 | **Feature Request or Bug or Another** 12 | Feature Request | Bug | Other 13 | 14 | **Describe the feature request or bug or other** 15 | A clear and concise description of what the bug, feature request, 16 | or other request is. 17 | 18 | **To Reproduce** 19 | Steps to reproduce the behaviour: 20 | 1. Run tool like this: '...' 21 | 2. See error 22 | 23 | **Expected behaviour** 24 | A clear and concise description of what you expected to happen. 25 | 26 | **Screenshots** 27 | If possible please add screenshots to help explain your problem. 28 | 29 | **System Information (System that tool is running on):** 30 | - OS: [e.g. Windows10] 31 | - Version [e.g. 2.7] 32 | 33 | **Additional context** 34 | Add any other context about the problem here. 35 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: github-actions 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | timezone: Europe/London 8 | - package-ecosystem: uv 9 | directory: "/" 10 | schedule: 11 | interval: daily 12 | timezone: Europe/London 13 | open-pull-requests-limit: 10 14 | target-branch: master 15 | allow: 16 | - dependency-type: direct 17 | - dependency-type: indirect 18 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 
3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master, dev ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master, dev ] 20 | schedule: 21 | - cron: '19 11 * * 4' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | 28 | strategy: 29 | fail-fast: false 30 | matrix: 31 | language: [ 'python' ] 32 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 33 | # Learn more: 34 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 35 | 36 | steps: 37 | - name: Checkout repository 38 | uses: actions/checkout@v4 39 | 40 | # Initializes the CodeQL tools for scanning. 41 | - name: Initialize CodeQL 42 | uses: github/codeql-action/init@v3 43 | with: 44 | languages: ${{ matrix.language }} 45 | # If you wish to specify custom queries, you can do so here or in a config file. 46 | # By default, queries listed here will override any specified in a config file. 47 | # Prefix the list here with "+" to use these queries and those in the config file. 48 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 49 | 50 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 51 | # If this step fails, then you should remove it and run the build manually (see below) 52 | - name: Autobuild 53 | uses: github/codeql-action/autobuild@v3 54 | 55 | # ℹ️ Command-line programs to run using the OS shell. 56 | # 📚 https://git.io/JvXDl 57 | 58 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 59 | # and modify them (or add more) to build your code if your project 60 | # uses a compiled language 61 | 62 | #- run: | 63 | # make bootstrap 64 | # make release 65 | 66 | - name: Perform CodeQL Analysis 67 | uses: github/codeql-action/analyze@v3 68 | -------------------------------------------------------------------------------- /.github/workflows/docker-build-push.yml: -------------------------------------------------------------------------------- 1 | name: Build and Push Docker Image 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | permissions: 9 | contents: read 10 | packages: write 11 | 12 | jobs: 13 | build-and-push: 14 | runs-on: ubuntu-latest 15 | 16 | steps: 17 | - name: Checkout repository 18 | uses: actions/checkout@v4 19 | 20 | - name: Set up Docker Buildx 21 | uses: docker/setup-buildx-action@v3 22 | 23 | - name: Log in to GitHub Container Registry 24 | uses: docker/login-action@v3 25 | with: 26 | registry: ghcr.io 27 | username: ${{ github.actor }} 28 | password: ${{ secrets.GITHUB_TOKEN }} 29 | 30 | - name: Extract metadata for Docker 31 | id: meta 32 | uses: docker/metadata-action@v5 33 | with: 34 | images: ghcr.io/${{ github.repository_owner }}/theharvester 35 | tags: | 36 | latest 37 | type=ref,event=branch 38 | type=sha 39 | 40 | - name: Build and push Docker image 41 | uses: docker/build-push-action@v6 42 | with: 43 | context: . 
44 | file: Dockerfile 45 | push: true 46 | platforms: linux/amd64,linux/arm64 47 | tags: ${{ steps.meta.outputs.tags }} 48 | labels: ${{ steps.meta.outputs.labels }} 49 | -------------------------------------------------------------------------------- /.github/workflows/dockerci.yml: -------------------------------------------------------------------------------- 1 | name: TheHarvester Docker Image CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - name: Build the Docker image 11 | run: docker build . --file Dockerfile --tag theharvester:$(date +%s) -------------------------------------------------------------------------------- /.github/workflows/theHarvester.yml: -------------------------------------------------------------------------------- 1 | name: TheHarvester Python CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - '*' 7 | 8 | pull_request: 9 | branches: 10 | - '*' 11 | 12 | jobs: 13 | Python: 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | max-parallel: 10 17 | matrix: 18 | os: [ ubuntu-latest ] 19 | python-version: [ '3.12', '3.13', '3.14.0-beta.1' ] 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | sudo mkdir -p /usr/local/etc/theHarvester 30 | sudo cp theHarvester/data/*.yaml /usr/local/etc/theHarvester/ 31 | sudo chown -R runner:runner /usr/local/etc/theHarvester/ 32 | pip install --upgrade pip 33 | pip install .[dev] 34 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 35 | 36 | - name: Lint with ruff 37 | run: | 38 | ruff check --fix 39 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 40 | 41 | - name: Format with ruff 42 | run: | 43 | ruff format 44 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 45 | 46 | - name: Commit changes for ruff formating and linting 47 | run: | 48 | git config user.name github-actions 49 | git config user.email github-actions@github.com 50 | git add . 
51 | git commit -m "Apply ruff fixes and formatting" || true # Use || true to prevent failure if no changes 52 | git push origin $GITHUB_REF 53 | env: 54 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 55 | 56 | - name: Test with pytest 57 | run: | 58 | pytest tests/** 59 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 60 | 61 | - name: Run theHarvester module Baidu 62 | run: | 63 | theHarvester -d yale.edu -b baidu 64 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 65 | 66 | - name: Run theHarvester module Bing 67 | run: | 68 | theHarvester -d yale.edu -b bing 69 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 70 | 71 | - name: Run theHarvester module CertSpotter 72 | run: | 73 | theHarvester -d yale.edu -b certspotter 74 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 75 | 76 | - name: Run theHarvester module Crtsh 77 | run: | 78 | theHarvester -d hcl.com -b crtsh 79 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 80 | 81 | - name: Run theHarvester module DuckDuckGo 82 | run: | 83 | theHarvester -d yale.edu -b duckduckgo 84 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 85 | 86 | - name: Run theHarvester module HackerTarget 87 | run: | 88 | theHarvester -d yale.edu -b hackertarget 89 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 90 | 91 | - name: Run theHarvester module Otx 92 | run: | 93 | theHarvester -d yale.edu -b otx 94 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 95 | 96 | - name: Run theHarvester module RapidDns 97 | run: | 98 | theHarvester -d yale.edu -b rapiddns 99 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 100 | 101 | - name: Run theHarvester module Threatminer 102 | run: | 103 | theHarvester -d yale.edu -b threatminer 104 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 105 | 106 | - name: Run theHarvester module Urlscan 107 | run: | 108 | theHarvester -d yale.edu -b urlscan 109 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 110 | 111 | - name: Run theHarvester module Yahoo 112 | run: | 113 | theHarvester -d yale.edu -b yahoo 114 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} 115 | 116 | - name: Run theHarvester module DNS brute force 117 | run: | 118 | theHarvester -d yale.edu -c 119 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.idea 2 | *.pyc 3 | *.sqlite 4 | *.html 5 | *.htm 6 | *.vscode 7 | *.xml 8 | *.json 9 | debug_results.txt 10 | venv 11 | .mypy_cache 12 | .pytest_cache 13 | build/ 14 | dist/ 15 | theHarvester.egg-info 16 | api-keys.yaml 17 | .DS_Store 18 | .venv 19 | .pyre 20 | uv.lock 21 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:testing-slim 2 | 3 | LABEL maintainer="@jay_townsend1 & @NotoriousRebel1" 4 | 5 | # Install dependencies for building Python from source 6 | RUN apt update && apt install -y \ 7 | curl \ 8 | build-essential \ 9 | libssl-dev \ 10 | zlib1g-dev \ 11 | libbz2-dev \ 12 | libreadline-dev \ 13 | libsqlite3-dev \ 14 | wget \ 15 | curl \ 16 | llvm \ 17 | libncurses5-dev \ 18 | libncursesw5-dev \ 19 | xz-utils \ 20 | tk-dev \ 21 | libffi-dev \ 22 | 
liblzma-dev \ 23 | python3-dev \ 24 | git \ 25 | gcc \ 26 | && rm -rf /var/lib/apt/lists/* 27 | 28 | # Install Python 3.11 from source 29 | RUN curl -fsSL https://www.python.org/ftp/python/3.11.6/Python-3.11.6.tgz -o Python-3.11.6.tgz \ 30 | && tar -xvf Python-3.11.6.tgz \ 31 | && cd Python-3.11.6 \ 32 | && ./configure --enable-optimizations \ 33 | && make -j 2 \ 34 | && make altinstall \ 35 | && rm -rf /Python-3.11.6 /Python-3.11.6.tgz 36 | 37 | # Install pip for Python 3.11 38 | RUN curl https://bootstrap.pypa.io/get-pip.py | python3.11 39 | 40 | # Install pipx for Python 3.11 41 | RUN python3.11 -m pip install --user pipx 42 | 43 | # Add pipx to PATH 44 | ENV PATH=/root/.local/bin:$PATH 45 | 46 | # Install theHarvester via pipx 47 | RUN pipx install --python python3.11 git+https://github.com/laramies/theHarvester.git 48 | 49 | # Ensure pipx path 50 | RUN pipx ensurepath 51 | 52 | # Set the entrypoint 53 | ENTRYPOINT ["/root/.local/bin/restfulHarvest", "-H", "0.0.0.0", "-p", "80"] 54 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![theHarvester](https://github.com/laramies/theHarvester/blob/master/theHarvester-logo.webp) 2 | 3 | ![TheHarvester CI](https://github.com/laramies/theHarvester/workflows/TheHarvester%20Python%20CI/badge.svg) ![TheHarvester Docker Image CI](https://github.com/laramies/theHarvester/workflows/TheHarvester%20Docker%20Image%20CI/badge.svg) 4 | [![Rawsec's CyberSecurity Inventory](https://inventory.raw.pm/img/badges/Rawsec-inventoried-FF5050_flat_without_logo.svg)](https://inventory.raw.pm/) 5 | 6 | What is this? 7 | ------------- 8 | theHarvester is a simple to use, yet powerful tool designed to be used during the reconnaissance stage of a red
9 | team assessment or penetration test. It performs open source intelligence (OSINT) gathering to help determine
10 | a domain's external threat landscape. The tool gathers names, emails, IPs, subdomains, and URLs by using
11 | multiple public resources, all of which are listed in the module sections below.
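A quick usage sketch (the flags shown here are the ones exercised by this repository's CI workflow and mentioned in the module notes below; the domain and sources are placeholders):

```bash
# Passive enumeration of a domain against a single source
theHarvester -d example.com -b crtsh

# Same domain via AlienVault OTX, capping the number of results
theHarvester -d example.com -b otx -l 100

# Active DNS brute force using the bundled wordlists
theHarvester -d example.com -c
```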
12 | 13 | Passive modules: 14 | ---------------- 15 | 16 | * baidu: Baidu search engine - www.baidu.com 17 | 18 | * bevigil: CloudSEK BeVigil scans mobile applications for OSINT assets (Requires an API key, see below.) - https://bevigil.com/osint-api 19 | 20 | * bing: Microsoft search engine - https://www.bing.com 21 | 22 | * bingapi: Microsoft search engine, through the API (Requires an API key, see below.) 23 | 24 | * brave: Brave search engine - https://search.brave.com/ 25 | 26 | * bufferoverun: (Requires an API key, see below.) https://tls.bufferover.run 27 | 28 | * censys: [Censys search engine](https://search.censys.io/) will use certificate searches to enumerate subdomains and gather emails
29 | (Requires an API key, see below.) https://censys.io 30 | 31 | * certspotter: Cert Spotter monitors Certificate Transparency logs - https://sslmate.com/certspotter/ 32 | 33 | * criminalip: Specialized Cyber Threat Intelligence (CTI) search engine (Requires an API key, see below.) - https://www.criminalip.io 34 | 35 | * crtsh: Comodo Certificate search - https://crt.sh 36 | 37 | * duckduckgo: DuckDuckGo search engine - https://duckduckgo.com 38 | 39 | * fullhunt: Next-generation attack surface security platform (Requires an API key, see below.) - https://fullhunt.io 40 | 41 | * github-code: GitHub code search engine (Requires a GitHub Personal Access Token, see below.) - www.github.com 42 | 43 | * hackertarget: Online vulnerability scanners and network intelligence to help organizations - https://hackertarget.com 44 | 45 | * hunter: Hunter search engine (Requires an API key, see below.) - https://hunter.io 46 | 47 | * hunterhow: Internet search engines for security researchers (Requires an API key, see below.) - https://hunter.how 48 | 49 | * intelx: Intelx search engine (Requires an API key, see below.) - http://intelx.io 50 | 51 | * netlas: A Shodan or Censys competitor (Requires an API key, see below.) - https://app.netlas.io 52 | 53 | * onyphe: Cyber defense search engine (Requires an API key, see below.) - https://www.onyphe.io/ 54 | 55 | * otx: AlienVault open threat exchange - https://otx.alienvault.com 56 | 57 | * pentestTools: Cloud-based toolkit for offensive security testing, focused on web applications and network penetration
58 | testing (Requires an API key, see below.) - https://pentest-tools.com/ 59 | 60 | * projectDiscovery: We actively collect and maintain internet-wide asset data to enhance research and analyse changes around
61 | DNS for better insights (Requires an API key, see below.) - https://chaos.projectdiscovery.io 62 | 63 | * rapiddns: DNS query tool which makes querying subdomains or sites of the same IP easy! https://rapiddns.io 64 | 65 | * rocketreach: Access real-time verified personal/professional emails, phone numbers, and social media links (Requires an API key,
66 | see below.) - https://rocketreach.co 67 | 68 | * securityTrails: Security Trails search engine, the world's largest repository of historical DNS data (Requires an API key, see
69 | below.) - https://securitytrails.com 70 | 71 | * -s, --shodan: Shodan search engine will search for ports and banners from discovered hosts (Requires an API key, see below.)
72 | https://shodan.io 73 | 74 | * sitedossier: Find available information on a site - http://www.sitedossier.com 75 | 76 | * subdomaincenter: A subdomain finder tool used to find subdomains of a given domain - https://www.subdomain.center/ 77 | 78 | * subdomainfinderc99: A subdomain finder tool used to find the subdomains of a given domain - https://subdomainfinder.c99.nl 79 | 80 | * threatminer: Data mining for threat intelligence - https://www.threatminer.org/ 81 | 82 | * tomba: Tomba search engine (Requires an API key, see below.) - https://tomba.io 83 | 84 | * urlscan: A sandbox for the web that is a URL and website scanner - https://urlscan.io 85 | 86 | * venacus: Venacus search engine (Requires an API key, see below.) - https://venacus.com 87 | 88 | * vhost: Bing virtual hosts search 89 | 90 | * virustotal: Domain search (Requires an API key, see below.) - https://www.virustotal.com 91 | 92 | * whoisxml: Subdomain search (Requires an API key, see below.) - https://subdomains.whoisxmlapi.com/api/pricing 93 | 94 | * yahoo: Yahoo search engine 95 | 96 | * zoomeye: China's version of Shodan (Requires an API key, see below.) - https://www.zoomeye.org 97 | 98 | Active modules: 99 | --------------- 100 | * DNS brute force: dictionary brute force enumeration 101 | * Screenshots: Take screenshots of subdomains that were found 102 | 103 | Modules that require an API key: 104 | -------------------------------- 105 | Documentation to set up API keys can be found at - https://github.com/laramies/theHarvester/wiki/Installation#api-keys 106 | 107 | * bevigil - Free up to 50 queries. Pricing can be found here: https://bevigil.com/pricing/osint 108 | * bing 109 | * bufferoverun - uses the free binaAPI 110 | * censys - API keys are required and can be retrieved from your [Censys account](https://search.censys.io/account/api). 111 | * criminalip 112 | * fullhunt 113 | * github 114 | * hunter - limited to 10 results on the free plan, so you will need to use the -l 10 switch 115 | * hunterhow 116 | * intelx 117 | * netlas - $ 118 | * onyphe - $ 119 | * pentestTools - $ 120 | * projectDiscovery - invite only for now 121 | * rocketreach - $ 122 | * securityTrails 123 | * shodan - $ 124 | * tomba - Free up to 50 searches.
125 | * venacus - $ 126 | * whoisxml 127 | * zoomeye 128 | 129 | Install and dependencies: 130 | ------------------------- 131 | * Python 3.11+ 132 | * https://github.com/laramies/theHarvester/wiki/Installation 133 | 134 | Comments, bugs, and requests: 135 | ----------------------------- 136 | * [![Twitter Follow](https://img.shields.io/twitter/follow/laramies.svg?style=social&label=Follow)](https://twitter.com/laramies) Christian Martorella @laramies 137 | cmartorella@edge-security.com 138 | * [![Twitter Follow](https://img.shields.io/twitter/follow/NotoriousRebel1.svg?style=social&label=Follow)](https://twitter.com/NotoriousRebel1) Matthew Brown @NotoriousRebel1 139 | * [![Twitter Follow](https://img.shields.io/twitter/follow/jay_townsend1.svg?style=social&label=Follow)](https://twitter.com/jay_townsend1) Jay "L1ghtn1ng" Townsend @jay_townsend1 140 | 141 | Main contributors: 142 | ------------------ 143 | * [![Twitter Follow](https://img.shields.io/twitter/follow/NotoriousRebel1.svg?style=social&label=Follow)](https://twitter.com/NotoriousRebel1) Matthew Brown @NotoriousRebel1 144 | * [![Twitter Follow](https://img.shields.io/twitter/follow/jay_townsend1.svg?style=social&label=Follow)](https://twitter.com/jay_townsend1) Jay "L1ghtn1ng" Townsend @jay_townsend1 145 | * [![Twitter Follow](https://img.shields.io/twitter/follow/discoverscripts.svg?style=social&label=Follow)](https://twitter.com/discoverscripts) Lee Baird @discoverscripts 146 | 147 | 148 | Thanks: 149 | ------- 150 | * John Matherly - Shodan project 151 | * Ahmed Aboul Ela - subdomain names dictionaries (big and small) 152 | -------------------------------------------------------------------------------- /README/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to theHarvester Project 2 | Welcome to theHarvester project, so you would like to contribute. 3 | The following below must be met to get accepted. 4 | 5 | # CI 6 | Make sure all CI passes and you do not introduce any alerts from ruff 7 | 8 | # Unit Tests 9 | For new modules a unit test for that module is required and we use pytest. 10 | 11 | # Coding Standards 12 | * No single letter variables and variable names must represent the action that it is performing 13 | * Have static typing on functions etc 14 | * Make sure no errors are reported from mypy 15 | * No issues reported with ruff 16 | 17 | # Submitting Bugs 18 | If you find a bug in a module that you want to submit an issue for and know how to write python code. 19 | Please create a unit test for that bug(If possible) and submit a fix for it as it would be a big help to the project. 20 | -------------------------------------------------------------------------------- /README/LICENSES: -------------------------------------------------------------------------------- 1 | Released under the GPL v 2.0. 2 | 3 | If you did not receive a copy of the GPL, try http://www.gnu.org/. 4 | 5 | Copyright 2011 Christian Martorella 6 | 7 | theHarvester is free software; you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation version 2 of the License. 10 | 11 | theHarvester is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 
You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software
15 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 | -------------------------------------------------------------------------------- /bin/restfulHarvest: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | 4 | import uvicorn 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument( 8 | "-H", 9 | "--host", 10 | default="127.0.0.1", 11 | help="IP address to listen on default is 127.0.0.1", 12 | ) 13 | parser.add_argument( 14 | "-p", 15 | "--port", 16 | default=5000, 17 | help="Port to bind the web server to, default is 5000", 18 | type=int, 19 | ) 20 | parser.add_argument( 21 | "-l", 22 | "--log-level", 23 | default="info", 24 | help="Set logging level, default is info but [critical|error|warning|info|debug|trace] can be set", 25 | ) 26 | parser.add_argument( 27 | "-r", 28 | "--reload", 29 | default=False, 30 | help="Enable automatic reload used during development of the api", 31 | action="store_true", 32 | ) 33 | 34 | args = parser.parse_args() 35 | 36 | if __name__ == "__main__": 37 | uvicorn.run( 38 | "theHarvester.lib.api.api:app", 39 | host=args.host, 40 | port=args.port, 41 | log_level=args.log_level, 42 | reload=args.reload, 43 | ) 44 | -------------------------------------------------------------------------------- /bin/theHarvester: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Note: This script runs theHarvester 3 | import asyncio 4 | import sys 5 | 6 | from theHarvester import __main__ 7 | 8 | if sys.version_info.major < 3 or sys.version_info.minor < 11: 9 | print( 10 | "\033[93m[!] Make sure you have Python 3.11+ installed, quitting.\n\n \033[0m" 11 | ) 12 | sys.exit(1) 13 | 14 | if __name__ == "__main__": 15 | platform = sys.platform 16 | if platform == "win32": 17 | # Required or things will break if trying to take screenshots 18 | import multiprocessing 19 | 20 | multiprocessing.freeze_support() 21 | try: 22 | # See if we have winloop as a performance enhancement on windows 23 | import winloop 24 | 25 | asyncio.DefaultEventLoopPolicy = winloop.EventLoopPolicy 26 | except ModuleNotFoundError: 27 | asyncio.DefaultEventLoopPolicy = asyncio.WindowsSelectorEventLoopPolicy 28 | else: 29 | import uvloop 30 | 31 | uvloop.install() 32 | 33 | if "linux" in platform: 34 | import aiomultiprocess 35 | 36 | # As we are not using Windows, we can change the spawn method to fork for greater performance 37 | aiomultiprocess.set_context("fork") 38 | asyncio.run(__main__.entry_point()) 39 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | theharvester.svc.local: 3 | container_name: theHarvester 4 | volumes: 5 | - ./theHarvester/data/api-keys.yaml:/root/.theHarvester/api-keys.yaml 6 | - ./theHarvester/data/api-keys.yaml:/etc/theHarvester/api-keys.yaml 7 | - ./theHarvester/data/proxies.yaml:/etc/theHarvester/proxies.yaml 8 | - ./theHarvester/data/proxies.yaml:/root/.theHarvester/proxies.yaml 9 | build: . 
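    # A typical way to bring this service up (assumed usage, not prescribed by this file):
    #   docker compose up -d --build
    # The restfulHarvest API launched by the Dockerfile ENTRYPOINT is then reachable at
    # http://localhost:8080 via the 8080:80 mapping below.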
10 | ports: 11 | - "8080:80" 12 | 13 | networks: 14 | default: 15 | name: app_theHarvester_network 16 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "theHarvester" 3 | description = "theHarvester is a very simple, yet effective tool designed to be used in the early stages of a penetration test" 4 | readme = "README.md" 5 | authors = [ 6 | { name = "Christian Martorella", email = "cmartorella@edge-security.com" }, 7 | { name = "Jay Townsend", email = "jay@cybermon.uk" }, 8 | { name = "Matthew Brown", email = "36310667+NotoriousRebel@users.noreply.github.com" }, 9 | ] 10 | requires-python = ">=3.11" 11 | urls.Homepage = "https://github.com/laramies/theHarvester" 12 | classifiers = [ 13 | "Programming Language :: Python :: 3", 14 | "Programming Language :: Python :: 3.11", 15 | "Programming Language :: Python :: 3.12", 16 | "Programming Language :: Python :: 3.13", 17 | "License :: OSI Approved :: GNU General Public License v2 (GPLv2)", 18 | "Operating System :: OS Independent", 19 | ] 20 | dynamic = ["version"] 21 | dependencies = [ 22 | "aiodns==3.4.0", 23 | "aiofiles==24.1.0", 24 | "aiohttp==3.12.2", 25 | "aiomultiprocess==0.9.1", 26 | "aiosqlite==0.21.0", 27 | "beautifulsoup4==4.13.4", 28 | "censys==2.2.17", 29 | "certifi==2025.4.26", 30 | "dnspython==2.7.0", 31 | "fastapi==0.115.12", 32 | "lxml==5.4.0", 33 | "netaddr==1.3.0", 34 | "playwright==1.52.0", 35 | "PyYAML==6.0.2", 36 | "python-dateutil==2.9.0.post0", 37 | "requests==2.32.3", 38 | "retrying==1.3.4", 39 | "shodan==1.31.0", 40 | "slowapi==0.1.9", 41 | "ujson==5.10.0", 42 | "uvicorn==0.34.2", 43 | "uvloop==0.21.0; platform_system != 'Windows'", 44 | "winloop==0.1.8; platform_system == 'Windows'", 45 | ] 46 | 47 | [project.optional-dependencies] 48 | dev = [ 49 | "mypy==1.15.0", 50 | "mypy-extensions==1.1.0", 51 | "pytest==8.3.5", 52 | "pytest-asyncio==0.26.0", 53 | "types-certifi==2021.10.8.3", 54 | "types-chardet==5.0.4.6", 55 | "types-python-dateutil==2.9.0.20250516", 56 | "types-PyYAML==6.0.12.20250516", 57 | "types-requests==2.32.0.20250515", 58 | "ruff==0.11.11", 59 | "types-ujson==5.10.0.20250326", 60 | "wheel==0.45.1", 61 | ] 62 | 63 | [project.scripts] 64 | theHarvester = "theHarvester.theHarvester:main" 65 | restfulHarvest = "theHarvester.restfulHarvest:main" 66 | 67 | [tool.setuptools.dynamic] 68 | version = { attr = "theHarvester.lib.version.VERSION" } 69 | 70 | [tool.setuptools.packages.find] 71 | include = ["theHarvester*"] 72 | 73 | [tool.setuptools.package-data] 74 | "*" = ["*.txt", "*.yaml"] 75 | 76 | [tool.pytest.ini_options] 77 | minversion = "8.3.3" 78 | asyncio_mode = "auto" 79 | asyncio_default_fixture_loop_scope = "function" 80 | addopts = "--no-header" 81 | testpaths = [ 82 | "tests", 83 | "tests/discovery/", 84 | ] 85 | 86 | [build-system] 87 | requires = ["setuptools>=68"] 88 | build-backend = "setuptools.build_meta" 89 | 90 | [tool.mypy] 91 | python_version = "3.11" 92 | warn_unused_configs = true 93 | ignore_missing_imports = true 94 | show_traceback = true 95 | show_error_codes = true 96 | namespace_packages = true 97 | 98 | [tool.uv] 99 | python-preference = "managed" 100 | 101 | [tool.uv.pip] 102 | python-version = "3.11" 103 | 104 | [tool.ruff] 105 | # Exclude a variety of commonly ignored directories. 
106 | exclude = [ 107 | "tests", 108 | ".eggs", 109 | ".git", 110 | ".git-rewrite", 111 | ".mypy_cache", 112 | ".pyenv", 113 | ".pytest_cache", 114 | ".pytype", 115 | ".ruff_cache", 116 | ".github", 117 | ".venv", 118 | ".vscode", 119 | ".idea", 120 | "__pypackages__", 121 | "build", 122 | "dist", 123 | "site-packages", 124 | "venv", 125 | ] 126 | 127 | line-length = 130 128 | target-version = "py311" 129 | show-fixes = true 130 | 131 | [tool.ruff.lint] 132 | select = ["E4", 133 | "E7", 134 | "E9", 135 | "F", 136 | "I", 137 | "UP", 138 | "TCH", 139 | "FA", 140 | "RUF", 141 | "PT", 142 | ] 143 | ignore = ["S311", "RUF021", "RUF029", "F841"] 144 | 145 | # Allow fix for all enabled rules (when `--fix`) is provided. 146 | fixable = ["ALL"] 147 | unfixable = [] 148 | 149 | # Allow unused variables when underscore-prefixed. 150 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 151 | 152 | [tool.ruff.format] 153 | # Like Black, use double quotes for strings. 154 | quote-style = "single" 155 | indent-style = "space" 156 | 157 | # Like Black, respect magic trailing commas. 158 | skip-magic-trailing-comma = false 159 | 160 | # Like Black, automatically detect the appropriate line ending. 161 | line-ending = "auto" 162 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file is deprecated. All dependencies are now defined in pyproject.toml 2 | -------------------------------------------------------------------------------- /restfulHarvest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from theHarvester.restfulHarvest import main 3 | 4 | if __name__ == '__main__': 5 | main() 6 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/tests/__init__.py -------------------------------------------------------------------------------- /tests/discovery/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/tests/discovery/__init__.py -------------------------------------------------------------------------------- /tests/discovery/test_certspotter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | import os 4 | from typing import Optional 5 | 6 | import pytest 7 | import requests 8 | from _pytest.mark.structures import MarkDecorator 9 | 10 | from theHarvester.discovery import certspottersearch 11 | from theHarvester.lib.core import * 12 | 13 | pytestmark: MarkDecorator = pytest.mark.asyncio 14 | github_ci: Optional[str] = os.getenv( 15 | "GITHUB_ACTIONS" 16 | ) # Github set this to be the following: true instead of True 17 | 18 | 19 | class TestCertspotter(object): 20 | @staticmethod 21 | def domain() -> str: 22 | return "metasploit.com" 23 | 24 | 25 | @pytest.mark.skipif(github_ci == 'true', reason="Skipping this test for now") 26 | class TestCertspotterSearch(object): 27 | async def test_api(self) -> None: 28 | base_url = f"https://api.certspotter.com/v1/issuances?domain={TestCertspotter.domain()}&expand=dns_names" 29 | headers = {"User-Agent": 
Core.get_user_agent()} 30 | request = requests.get(base_url, headers=headers) 31 | assert request.status_code == 200 32 | 33 | async def test_search(self) -> None: 34 | search = certspottersearch.SearchCertspoter(TestCertspotter.domain()) 35 | await search.process() 36 | assert isinstance(await search.get_hostnames(), set) 37 | 38 | 39 | if __name__ == "__main__": 40 | pytest.main() 41 | -------------------------------------------------------------------------------- /tests/discovery/test_githubcode.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | import pytest 3 | from _pytest.mark.structures import MarkDecorator 4 | from requests import Response 5 | from theHarvester.discovery import githubcode 6 | from theHarvester.discovery.constants import MissingKey 7 | from theHarvester.lib.core import Core 8 | 9 | pytestmark: MarkDecorator = pytest.mark.asyncio 10 | 11 | 12 | class TestSearchGithubCode: 13 | class OkResponse: 14 | response = Response() 15 | 16 | # Mocking the json method properly 17 | def __init__(self): 18 | self.response = Response() 19 | self.response.status_code = 200 20 | self.response.json = MagicMock( 21 | return_value={ 22 | "items": [ 23 | {"text_matches": [{"fragment": "test1"}]}, 24 | {"text_matches": [{"fragment": "test2"}]}, 25 | ] 26 | } 27 | ) 28 | 29 | class FailureResponse: 30 | response = Response() 31 | 32 | def __init__(self): 33 | self.response = Response() 34 | self.response.status_code = 401 35 | self.response.json = MagicMock(return_value={}) 36 | 37 | class RetryResponse: 38 | def __init__(self): 39 | self.response = Response() 40 | self.response.status_code = 403 41 | self.response.json = MagicMock(return_value={}) 42 | 43 | class MalformedResponse: 44 | response = Response() 45 | 46 | def __init__(self): 47 | self.response = Response() 48 | self.response.status_code = 200 49 | self.response.json = MagicMock( 50 | return_value={ 51 | "items": [ 52 | {"fail": True}, 53 | {"text_matches": []}, 54 | {"text_matches": [{"weird": "result"}]}, 55 | ] 56 | } 57 | ) 58 | 59 | async def test_missing_key(self): 60 | with pytest.raises(MissingKey): 61 | Core.github_key = MagicMock(return_value=None) 62 | githubcode.SearchGithubCode(word="test", limit=500) 63 | 64 | async def test_fragments_from_response(self): 65 | Core.github_key = MagicMock(return_value="test_key") 66 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 67 | test_result = await test_class_instance.fragments_from_response( 68 | self.OkResponse().response.json() 69 | ) 70 | print("test_result: ", test_result) 71 | assert test_result == ["test1", "test2"] 72 | 73 | async def test_invalid_fragments_from_response(self): 74 | Core.github_key = MagicMock(return_value="test_key") 75 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 76 | test_result = await test_class_instance.fragments_from_response( 77 | self.MalformedResponse().response.json() 78 | ) 79 | assert test_result == [] 80 | 81 | async def test_next_page(self): 82 | Core.github_key = MagicMock(return_value="test_key") 83 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 84 | test_result = githubcode.SuccessResult(list(), next_page=2, last_page=4) 85 | assert 2 == await test_class_instance.next_page_or_end(test_result) 86 | 87 | async def test_last_page(self): 88 | Core.github_key = MagicMock(return_value="test_key") 89 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 
90 | test_result = githubcode.SuccessResult(list(), 0, 0) 91 | assert await test_class_instance.next_page_or_end(test_result) is 0 92 | 93 | 94 | if __name__ == "__main__": 95 | pytest.main() 96 | -------------------------------------------------------------------------------- /tests/discovery/test_otx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | import os 4 | from typing import Optional 5 | 6 | import pytest 7 | import requests 8 | from _pytest.mark.structures import MarkDecorator 9 | 10 | from theHarvester.discovery import otxsearch 11 | from theHarvester.lib.core import * 12 | 13 | pytestmark: MarkDecorator = pytest.mark.asyncio 14 | github_ci: Optional[str] = os.getenv( 15 | "GITHUB_ACTIONS" 16 | ) # Github set this to be the following: true instead of True 17 | 18 | 19 | class TestOtx(object): 20 | @staticmethod 21 | def domain() -> str: 22 | return "cybermon.uk" 23 | 24 | async def test_api(self) -> None: 25 | base_url = f"https://otx.alienvault.com/api/v1/indicators/domain/{TestOtx.domain()}/passive_dns" 26 | headers = {"User-Agent": Core.get_user_agent()} 27 | request = requests.get(base_url, headers=headers) 28 | assert request.status_code == 200 29 | 30 | async def test_search(self) -> None: 31 | search = otxsearch.SearchOtx(TestOtx.domain()) 32 | await search.process() 33 | assert isinstance(await search.get_hostnames(), set) 34 | assert isinstance(await search.get_ips(), set) 35 | 36 | 37 | if __name__ == "__main__": 38 | pytest.main() 39 | -------------------------------------------------------------------------------- /tests/lib/test_core.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | from typing import Any 5 | from unittest import mock 6 | 7 | import pytest 8 | import yaml 9 | 10 | from theHarvester.lib.core import CONFIG_DIRS, DATA_DIR, Core 11 | 12 | 13 | @pytest.fixture(autouse=True) 14 | def mock_environ(monkeypatch, tmp_path: Path): 15 | monkeypatch.setenv("HOME", str(tmp_path)) 16 | 17 | 18 | def mock_read_text(mocked: dict[Path, str | Exception]): 19 | read_text = Path.read_text 20 | 21 | def _read_text(self: Path, *args, **kwargs): 22 | if result := mocked.get(self): 23 | if isinstance(result, Exception): 24 | raise result 25 | return result 26 | return read_text(self, *args, **kwargs) 27 | 28 | return _read_text 29 | 30 | 31 | @pytest.mark.parametrize( 32 | ("name", "contents", "expected"), 33 | [ 34 | ("api-keys", "apikeys: {}", {}), 35 | ("proxies", "http: [localhost:8080]", ["http://localhost:8080"]), 36 | ], 37 | ) 38 | @pytest.mark.parametrize("dir", CONFIG_DIRS) 39 | def test_read_config_searches_config_dirs( 40 | name: str, contents: str, expected: Any, dir: Path, capsys 41 | ): 42 | file = dir.expanduser() / f"{name}.yaml" 43 | config_files = [d.expanduser() / file.name for d in CONFIG_DIRS] 44 | side_effect = mock_read_text( 45 | {f: contents if f == file else FileNotFoundError() for f in config_files} 46 | ) 47 | 48 | with mock.patch("pathlib.Path.read_text", autospec=True, side_effect=side_effect): 49 | got = Core.api_keys() if name == "api-keys" else Core.proxy_list() 50 | 51 | assert got == expected 52 | assert f"Read {file.name} from {file}" in capsys.readouterr().out 53 | 54 | 55 | @pytest.mark.parametrize("name", ("api-keys", "proxies")) 56 | def test_read_config_copies_default_to_home(name: str, capsys): 57 | file = 
Path(f"~/.theHarvester/{name}.yaml").expanduser() 58 | config_files = [d.expanduser() / file.name for d in CONFIG_DIRS] 59 | side_effect = mock_read_text({f: FileNotFoundError() for f in config_files}) 60 | 61 | with mock.patch("pathlib.Path.read_text", autospec=True, side_effect=side_effect): 62 | got = Core.api_keys() if name == "api-keys" else Core.proxy_list() 63 | 64 | default = yaml.safe_load((DATA_DIR / file.name).read_text()) 65 | expected = ( 66 | default["apikeys"] 67 | if name == "api-keys" 68 | else [f"http://{h}" for h in default["http"]] 69 | ) 70 | assert got == expected 71 | assert f"Created default {file.name} at {file}" in capsys.readouterr().out 72 | assert file.exists() 73 | -------------------------------------------------------------------------------- /tests/test_myparser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | 4 | import pytest 5 | 6 | from theHarvester.parsers import myparser 7 | 8 | 9 | class TestMyParser(object): 10 | @pytest.mark.asyncio 11 | async def test_emails(self) -> None: 12 | word = "domain.com" 13 | results = "@domain.com***a@domain***banotherdomain.com***c@domain.com***d@sub.domain.com***" 14 | parse = myparser.Parser(results, word) 15 | emails = sorted(await parse.emails()) 16 | assert emails, ["c@domain.com", "d@sub.domain.com"] 17 | 18 | 19 | if __name__ == "__main__": 20 | pytest.main() 21 | -------------------------------------------------------------------------------- /theHarvester-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester-logo.png -------------------------------------------------------------------------------- /theHarvester-logo.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester-logo.webp -------------------------------------------------------------------------------- /theHarvester.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Note: This script runs theHarvester 3 | import sys 4 | 5 | from theHarvester.theHarvester import main 6 | 7 | if sys.version_info.major < 3 or sys.version_info.minor < 10: 8 | print('\033[93m[!] 
Make sure you have Python 3.10+ installed, quitting.\n\n \033[0m') 9 | sys.exit(1) 10 | 11 | if __name__ == '__main__': 12 | main() 13 | -------------------------------------------------------------------------------- /theHarvester/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester/__init__.py -------------------------------------------------------------------------------- /theHarvester/data/api-keys.yaml: -------------------------------------------------------------------------------- 1 | apikeys: 2 | 3 | bevigil: 4 | key: 5 | 6 | bing: 7 | key: 8 | 9 | bufferoverun: 10 | key: 11 | 12 | censys: 13 | id: 14 | secret: 15 | 16 | criminalip: 17 | key: 18 | 19 | dehashed: 20 | key: 21 | 22 | dnsdumpster: 23 | key: 24 | 25 | fullhunt: 26 | key: 27 | 28 | github: 29 | key: 30 | 31 | hunter: 32 | key: 33 | 34 | hunterhow: 35 | key: 36 | 37 | intelx: 38 | key: 39 | 40 | netlas: 41 | key: 42 | 43 | onyphe: 44 | key: 45 | 46 | pentestTools: 47 | key: 48 | 49 | projectDiscovery: 50 | key: 51 | 52 | rocketreach: 53 | key: 54 | 55 | securityTrails: 56 | key: 57 | 58 | shodan: 59 | key: 60 | 61 | tomba: 62 | key: 63 | secret: 64 | 65 | venacus: 66 | key: 67 | 68 | virustotal: 69 | key: 70 | 71 | whoisxml: 72 | key: 73 | 74 | zoomeye: 75 | key: 76 | -------------------------------------------------------------------------------- /theHarvester/data/proxies.yaml: -------------------------------------------------------------------------------- 1 | http: 2 | - ip:port 3 | -------------------------------------------------------------------------------- /theHarvester/data/wordlists/dorks.txt: -------------------------------------------------------------------------------- 1 | inurl:"contact" 2 | intext:email filetype:log 3 | "Index of /mail" 4 | "admin account info" filetype:log 5 | intext:@ 6 | administrator accounts/ 7 | intitle:"Index of" .bash_history 8 | intitle:"index of" members OR accounts 9 | inurl:/shared/help.php 10 | inurl:public 11 | intitle:index.of inbox 12 | intitle:"Server Administration" 13 | inurl:passwd.txt 14 | robots.txt 15 | php-addressbook "This is the addressbook for *" -warning -------------------------------------------------------------------------------- /theHarvester/data/wordlists/general/common.txt: -------------------------------------------------------------------------------- 1 | admin 2 | test 3 | hello 4 | uk 5 | login 6 | book 7 | robots.txt 8 | -------------------------------------------------------------------------------- /theHarvester/discovery/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester/discovery/__init__.py -------------------------------------------------------------------------------- /theHarvester/discovery/baidusearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher, Core 2 | from theHarvester.parsers import myparser 3 | 4 | 5 | class SearchBaidu: 6 | def __init__(self, word, limit) -> None: 7 | self.word = word 8 | self.total_results = '' 9 | self.server = 'www.baidu.com' 10 | self.hostname = 'www.baidu.com' 11 | self.limit = limit 12 | self.proxy = False 13 | 14 | async def do_search(self) -> None: 15 | headers = {'Host': self.hostname, 'User-agent': 
Core.get_user_agent()} 16 | base_url = f'https://{self.server}/s?wd=%40{self.word}&pn=xx&oq={self.word}' 17 | urls = [base_url.replace('xx', str(num)) for num in range(0, self.limit, 10) if num <= self.limit] 18 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 19 | for response in responses: 20 | self.total_results += response 21 | 22 | async def process(self, proxy: bool = False) -> None: 23 | self.proxy = proxy 24 | await self.do_search() 25 | 26 | async def get_emails(self): 27 | rawres = myparser.Parser(self.total_results, self.word) 28 | return await rawres.emails() 29 | 30 | async def get_hostnames(self): 31 | rawres = myparser.Parser(self.total_results, self.word) 32 | return await rawres.hostnames() 33 | -------------------------------------------------------------------------------- /theHarvester/discovery/bevigil.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import MissingKey 2 | from theHarvester.lib.core import AsyncFetcher, Core 3 | 4 | 5 | class SearchBeVigil: 6 | def __init__(self, word) -> None: 7 | self.word = word 8 | self.totalhosts: set = set() 9 | self.interestingurls: set = set() 10 | self.key = Core.bevigil_key() 11 | if self.key is None: 12 | self.key = '' 13 | raise MissingKey('bevigil') 14 | self.proxy = False 15 | 16 | async def do_search(self) -> None: 17 | subdomain_endpoint = f'https://osint.bevigil.com/api/{self.word}/subdomains/' 18 | url_endpoint = f'https://osint.bevigil.com/api/{self.word}/urls/' 19 | headers = {'X-Access-Token': self.key} 20 | 21 | responses = await AsyncFetcher.fetch_all([subdomain_endpoint], json=True, proxy=self.proxy, headers=headers) 22 | response = responses[0] 23 | for subdomain in response['subdomains']: 24 | self.totalhosts.add(subdomain) 25 | 26 | responses = await AsyncFetcher.fetch_all([url_endpoint], json=True, proxy=self.proxy, headers=headers) 27 | response = responses[0] 28 | for url in response['urls']: 29 | self.interestingurls.add(url) 30 | 31 | async def get_hostnames(self) -> set: 32 | return self.totalhosts 33 | 34 | async def get_interestingurls(self) -> set: 35 | return self.interestingurls 36 | 37 | async def process(self, proxy: bool = False) -> None: 38 | self.proxy = proxy 39 | await self.do_search() 40 | -------------------------------------------------------------------------------- /theHarvester/discovery/bingsearch.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | from theHarvester.parsers import myparser 6 | 7 | 8 | class SearchBing: 9 | def __init__(self, word, limit, start) -> None: 10 | self.word = word.replace(' ', '%20') 11 | self.results: list[Any] = [] 12 | self.total_results = '' 13 | self.server = 'www.bing.com' 14 | self.apiserver = 'api.search.live.net' 15 | self.hostname = 'www.bing.com' 16 | self.limit = int(limit) 17 | self.bingApi = Core.bing_key() 18 | self.counter = start 19 | self.proxy = False 20 | 21 | async def do_search(self) -> None: 22 | headers = { 23 | 'Host': self.hostname, 24 | 'Cookie': 'SRCHHPGUSR=ADLT=DEMOTE&NRSLT=50', 25 | 'Accept-Language': 'en-us,en', 26 | 'User-agent': Core.get_user_agent(), 27 | } 28 | base_url = f'https://{self.server}/search?q=%40"{self.word}"&count=50&first=xx' 29 | urls = [base_url.replace('xx', str(num)) for num in range(0, self.limit, 50) if num <= 
self.limit] 30 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 31 | for response in responses: 32 | self.total_results += response 33 | 34 | async def do_search_api(self) -> None: 35 | url = 'https://api.bing.microsoft.com/v7.0/search?' 36 | params = { 37 | 'q': self.word, 38 | 'count': str(self.limit), 39 | 'offset': '0', 40 | 'mkt': 'en-us', 41 | 'safesearch': 'Off', 42 | } 43 | headers = { 44 | 'User-Agent': Core.get_user_agent(), 45 | 'Ocp-Apim-Subscription-Key': self.bingApi, 46 | } 47 | self.results = await AsyncFetcher.fetch_all([url], headers=headers, params=params, proxy=self.proxy) 48 | for res in self.results: 49 | self.total_results += res 50 | 51 | async def do_search_vhost(self) -> None: 52 | headers = { 53 | 'Host': self.hostname, 54 | 'Cookie': 'mkt=en-US;ui=en-US;SRCHHPGUSR=NEWWND=0&ADLT=DEMOTE&NRSLT=50', 55 | 'Accept-Language': 'en-us,en', 56 | 'User-agent': Core.get_user_agent(), 57 | } 58 | base_url = f'http://{self.server}/search?q=ip:{self.word}&go=&count=50&FORM=QBHL&qs=n&first=xx' 59 | urls = [base_url.replace('xx', str(num)) for num in range(0, self.limit, 50) if num <= self.limit] 60 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 61 | for response in responses: 62 | self.total_results += response 63 | 64 | async def get_emails(self): 65 | rawres = myparser.Parser(self.total_results, self.word) 66 | return await rawres.emails() 67 | 68 | async def get_hostnames(self): 69 | rawres = myparser.Parser(self.total_results, self.word) 70 | return await rawres.hostnames() 71 | 72 | async def get_allhostnames(self): 73 | rawres = myparser.Parser(self.total_results, self.word) 74 | return await rawres.hostnames_all() 75 | 76 | async def process(self, api, proxy: bool = False) -> None: 77 | self.proxy = proxy 78 | if api == 'yes': 79 | if self.bingApi is None: 80 | raise MissingKey('BingAPI') 81 | await self.do_search_api() 82 | else: 83 | await self.do_search() 84 | print(f'\tSearching {self.counter} results.') 85 | 86 | async def process_vhost(self) -> None: 87 | await self.do_search_vhost() 88 | -------------------------------------------------------------------------------- /theHarvester/discovery/bravesearch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from theHarvester.discovery.constants import get_delay 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | from theHarvester.parsers import myparser 6 | 7 | 8 | class SearchBrave: 9 | def __init__(self, word, limit): 10 | self.word = word 11 | self.results = '' 12 | self.totalresults = '' 13 | self.server = 'https://search.brave.com/search?q=' 14 | self.limit = limit 15 | self.proxy = False 16 | 17 | async def do_search(self): 18 | headers = {'User-Agent': Core.get_user_agent()} 19 | for query in [f'"{self.word}"', f'site:{self.word}']: 20 | try: 21 | for offset in range(0, 50): 22 | # To reduce the total number of requests, only two queries are made "self.word" and site:self.word 23 | current_url = f'{self.server}{query}&offset={offset}&source=web&show_local=0&spellcheck=0' 24 | resp = await AsyncFetcher.fetch_all([current_url], headers=headers, proxy=self.proxy) 25 | self.results = resp[0] 26 | self.totalresults += self.results 27 | # if 'Results from Microsoft Bing.' 
in resp[0] \ 28 | if ( 29 | 'Not many great matches came back for your search' in resp[0] 30 | or 'Your request has been flagged as being suspicious and Brave Search' in resp[0] 31 | or 'Prove' in resp[0] 32 | and 'robot' in resp[0] 33 | or 'Robot' in resp[0] 34 | ): 35 | break 36 | await asyncio.sleep(get_delay() + 15) 37 | except Exception as e: 38 | print(f'An exception has occurred in bravesearch: {e}') 39 | await asyncio.sleep(get_delay() + 80) 40 | continue 41 | 42 | async def get_emails(self): 43 | rawres = myparser.Parser(self.totalresults, self.word) 44 | return await rawres.emails() 45 | 46 | async def get_hostnames(self): 47 | rawres = myparser.Parser(self.totalresults, self.word) 48 | return await rawres.hostnames() 49 | 50 | async def process(self, proxy=False): 51 | self.proxy = proxy 52 | await self.do_search() 53 | -------------------------------------------------------------------------------- /theHarvester/discovery/bufferoverun.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchBufferover: 8 | def __init__(self, word) -> None: 9 | self.word = word 10 | self.totalhosts: set = set() 11 | self.totalips: set = set() 12 | self.key = Core.bufferoverun_key() 13 | if self.key is None: 14 | raise MissingKey('bufferoverun') 15 | self.proxy = False 16 | 17 | async def do_search(self) -> None: 18 | url = f'https://tls.bufferover.run/dns?q={self.word}' 19 | response = await AsyncFetcher.fetch_all( 20 | [url], 21 | json=True, 22 | headers={'User-Agent': Core.get_user_agent(), 'x-api-key': f'{self.key}'}, 23 | proxy=self.proxy, 24 | ) 25 | dct = response[0] 26 | if dct['Results']: 27 | self.totalhosts = { 28 | ( 29 | host.split(',') 30 | if ',' in host and self.word.replace('www.', '') in host.split(',')[0] in host 31 | else host.split(',')[4] 32 | ) 33 | for host in dct['Results'] 34 | } 35 | 36 | self.totalips = { 37 | ip.split(',')[0] for ip in dct['Results'] if re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', ip.split(',')[0]) 38 | } 39 | 40 | async def get_hostnames(self) -> set: 41 | return self.totalhosts 42 | 43 | async def get_ips(self) -> set: 44 | return self.totalips 45 | 46 | async def process(self, proxy: bool = False) -> None: 47 | self.proxy = proxy 48 | await self.do_search() 49 | -------------------------------------------------------------------------------- /theHarvester/discovery/builtwith.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchBuiltWith: 8 | def __init__(self, word: str): 9 | self.word = word 10 | self.api_key = Core.builtwith_key() 11 | self.base_url = 'https://api.builtwith.com/v21/api.json' 12 | self.headers = {'Authorization': f'Bearer {self.api_key}', 'Content-Type': 'application/json'} 13 | self.hosts = set() 14 | self.tech_stack = {} 15 | self.interesting_urls = set() 16 | self.frameworks = set() 17 | self.languages = set() 18 | self.servers = set() 19 | self.cms = set() 20 | self.analytics = set() 21 | 22 | async def process(self, proxy: bool = False) -> None: 23 | """Get technology stack information for a domain.""" 24 | try: 25 | if proxy: 26 | response = await AsyncFetcher.fetch( 27 | session=None, 
url=f'{self.base_url}?KEY={self.api_key}&LOOKUP={self.word}', headers=self.headers, proxy=proxy 28 | ) 29 | if response: 30 | self.tech_stack = response 31 | self._extract_data() 32 | else: 33 | async with aiohttp.ClientSession(headers=self.headers) as session: 34 | async with session.get(f'{self.base_url}?KEY={self.api_key}&LOOKUP={self.word}') as response: 35 | if response.status == 200: 36 | data = await response.json() 37 | self.tech_stack = data 38 | self._extract_data() 39 | elif response.status == 401: 40 | print('[!] Missing API key for BuiltWith.') 41 | raise MissingKey('BuiltWith') 42 | except Exception as e: 43 | print(f'Error in BuiltWith search: {e}') 44 | 45 | def _extract_data(self) -> None: 46 | """Extract and categorize technology information.""" 47 | if 'domains' in self.tech_stack: 48 | self.hosts.update(self.tech_stack['domains']) 49 | if 'paths' in self.tech_stack: 50 | self.interesting_urls.update(self.tech_stack['paths']) 51 | if 'technologies' in self.tech_stack: 52 | for tech in self.tech_stack['technologies']: 53 | category = tech.get('category', '').lower() 54 | name = tech.get('name', '') 55 | 56 | if 'framework' in category: 57 | self.frameworks.add(name) 58 | elif 'language' in category: 59 | self.languages.add(name) 60 | elif 'server' in category: 61 | self.servers.add(name) 62 | elif 'cms' in category: 63 | self.cms.add(name) 64 | elif 'analytics' in category: 65 | self.analytics.add(name) 66 | 67 | async def get_hostnames(self) -> set[str]: 68 | return self.hosts 69 | 70 | async def get_tech_stack(self) -> dict: 71 | return self.tech_stack 72 | 73 | async def get_interesting_urls(self) -> set[str]: 74 | return self.interesting_urls 75 | 76 | async def get_frameworks(self) -> set[str]: 77 | return self.frameworks 78 | 79 | async def get_languages(self) -> set[str]: 80 | return self.languages 81 | 82 | async def get_servers(self) -> set[str]: 83 | return self.servers 84 | 85 | async def get_cms(self) -> set[str]: 86 | return self.cms 87 | 88 | async def get_analytics(self) -> set[str]: 89 | return self.analytics 90 | -------------------------------------------------------------------------------- /theHarvester/discovery/censysearch.py: -------------------------------------------------------------------------------- 1 | from censys.common import __version__ 2 | from censys.common.exceptions import ( 3 | CensysRateLimitExceededException, 4 | CensysUnauthorizedException, 5 | ) 6 | from censys.search import CensysCerts 7 | 8 | from theHarvester.discovery.constants import MissingKey 9 | from theHarvester.lib.core import Core 10 | from theHarvester.lib.version import version as thehavester_version 11 | 12 | 13 | class SearchCensys: 14 | def __init__(self, domain, limit: int = 500) -> None: 15 | self.word = domain 16 | self.key = Core.censys_key() 17 | if self.key[0] is None or self.key[1] is None: 18 | raise MissingKey('Censys ID and/or Secret') 19 | self.totalhosts: set = set() 20 | self.emails: set = set() 21 | self.limit = limit 22 | self.proxy = False 23 | 24 | async def do_search(self) -> None: 25 | try: 26 | cert_search = CensysCerts( 27 | api_id=self.key[0], 28 | api_secret=self.key[1], 29 | user_agent=f'censys-python/{__version__} (theHarvester/{thehavester_version}); +https://github.com/laramies/theHarvester)', 30 | ) 31 | except CensysUnauthorizedException: 32 | raise MissingKey('Censys ID and/or Secret') 33 | 34 | query = f'names: {self.word}' 35 | try: 36 | response = cert_search.search( 37 | query=query, 38 | fields=['names', 
'parsed.subject.email_address'], 39 | max_records=self.limit, 40 | ) 41 | for cert in response(): 42 | self.totalhosts.update(cert.get('names', [])) 43 | email_address = cert.get('parsed', {}).get('subject', {}).get('email_address', []) 44 | self.emails.update(email_address) 45 | except CensysRateLimitExceededException: 46 | print('Censys rate limit exceeded') 47 | 48 | async def get_hostnames(self) -> set: 49 | return self.totalhosts 50 | 51 | async def get_emails(self) -> set: 52 | return self.emails 53 | 54 | async def process(self, proxy: bool = False) -> None: 55 | self.proxy = proxy 56 | await self.do_search() 57 | -------------------------------------------------------------------------------- /theHarvester/discovery/certspottersearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher 2 | 3 | 4 | class SearchCertspoter: 5 | def __init__(self, word) -> None: 6 | self.word = word 7 | self.totalhosts: set = set() 8 | self.proxy = False 9 | 10 | async def do_search(self) -> None: 11 | base_url = f'https://api.certspotter.com/v1/issuances?domain={self.word}&expand=dns_names' 12 | try: 13 | response = await AsyncFetcher.fetch_all([base_url], json=True, proxy=self.proxy) 14 | response = response[0] 15 | if isinstance(response, list): 16 | for dct in response: 17 | for key, value in dct.items(): 18 | if key == 'dns_names': 19 | self.totalhosts.update({name for name in value if name}) 20 | elif isinstance(response, dict): 21 | self.totalhosts.update({response['dns_names'] if 'dns_names' in response.keys() else ''}) # type: ignore 22 | else: 23 | self.totalhosts.update({''}) 24 | except Exception as e: 25 | print(e) 26 | 27 | async def get_hostnames(self) -> set: 28 | return self.totalhosts 29 | 30 | async def process(self, proxy: bool = False) -> None: 31 | self.proxy = proxy 32 | await self.do_search() 33 | print('\tSearching results.') 34 | -------------------------------------------------------------------------------- /theHarvester/discovery/constants.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | from theHarvester.lib.core import AsyncFetcher, Core 4 | 5 | 6 | async def splitter(links): 7 | """ 8 | Method that tries to remove duplicates 9 | LinkedinLists pulls a lot of profiles with the same name. 10 | This method tries to remove duplicates from the list. 11 | :param links: list of links to remove duplicates from 12 | :return: a unique-ish list 13 | """ 14 | unique_list = [] 15 | name_check = [] 16 | for url in links: 17 | tail = url.split('/')[-1] 18 | if len(tail) == 2 or tail == 'zh-cn': 19 | tail = url.split('/')[-2] 20 | name = tail.split('-') 21 | if len(name) > 1: 22 | joined_name = name[0] + name[1] 23 | else: 24 | joined_name = name[0] 25 | if joined_name not in name_check: 26 | unique_list.append(url) 27 | name_check.append(joined_name) 28 | return unique_list 29 | 30 | 31 | def filter(lst): 32 | """ 33 | Method that filters list 34 | :param lst: list to be filtered 35 | :return: new filtered list 36 | """ 37 | if lst is None: 38 | return [] 39 | if not isinstance(lst, set): 40 | lst = set(lst) # Remove duplicates. 41 | new_lst = [] 42 | for item in lst: 43 | item = str(item) 44 | if (item[0].isalpha() or item[0].isdigit()) and ('xxx' not in item) and ('..' 
not in item): 45 | item = item.replace('252f', '').replace('2F', '').replace('2f', '') 46 | new_lst.append(item.lower()) 47 | return new_lst 48 | 49 | 50 | def get_delay() -> float: 51 | """Method that is used to generate a random delay""" 52 | return random.randint(1, 3) - 0.5 53 | 54 | 55 | async def search(text: str) -> bool: 56 | """Helper function to check if Google has blocked traffic. 57 | :param text: See if specific text is returned, which means Google is blocking us 58 | :return bool: 59 | """ 60 | for line in text.strip().splitlines(): 61 | if ( 62 | 'This page appears when Google automatically detects requests coming from your computer network' in line 63 | or 'http://www.google.com/sorry/index' in line 64 | or 'https://www.google.com/sorry/index' in line 65 | ): 66 | # print('\tGoogle is blocking your IP due to too many automated requests, wait or change your IP') 67 | return True 68 | return False 69 | 70 | 71 | async def google_workaround(visit_url: str) -> bool | str: 72 | """ 73 | Function that makes a request on our behalf if Google starts to block us 74 | :param visit_url: Url to scrape 75 | :return: Correct html that can be parsed by BS4 76 | """ 77 | url = 'https://websniffer.cc/' 78 | data = { 79 | 'Cookie': '', 80 | 'url': visit_url, 81 | 'submit': 'Submit', 82 | 'type': 'GET&http=1.1', 83 | 'uak': str(random.randint(4, 8)), # select random UA to send to Google 84 | } 85 | returned_html = await AsyncFetcher.post_fetch(url, headers={'User-Agent': Core.get_user_agent()}, data=data) 86 | returned_html = ( 87 | 'This page appears when Google automatically detects requests coming from your computer network' 88 | if returned_html == '' 89 | else returned_html[0] 90 | ) 91 | 92 | returned_html = '' if 'Please Wait... | Cloudflare' in returned_html else returned_html 93 | 94 | if len(returned_html) == 0 or await search(returned_html) or '<html' not in returned_html: 95 | # indicates that google is serving workaround a captcha 96 | # That means we will try out second option which will utilize proxies 97 | return True 98 | # the html we get is malformed for BS4 as there are no greater than or less than signs 99 | if '<html>' in returned_html: 100 | start_index = returned_html.index('<html>') 101 | else: 102 | start_index = returned_html.index('<html') 103 | 104 | end_index = returned_html.index('</html>') + 1 105 | correct_html = returned_html[start_index:end_index] 106 | # Slice list to get the response's html 107 | correct_html = ''.join([ch.strip().replace('<', '<').replace('>', '>') for ch in correct_html]) 108 | return correct_html 109 | 110 | 111 | class MissingKey(Exception): 112 | """ 113 | :raise: When there is a module that has not been provided its API key 114 | """ 115 | 116 | def __init__(self, source: str | None) -> None: 117 | if source: 118 | self.message = f'\n\033[93m[!] Missing API key for {source}. \033[0m' 119 | else: 120 | self.message = '\n\033[93m[!] Missing CSE id. 
\033[0m' 121 | 122 | def __str__(self) -> str: 123 | return self.message 124 | -------------------------------------------------------------------------------- /theHarvester/discovery/criminalip.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from urllib.parse import urlparse 3 | 4 | from theHarvester.discovery.constants import MissingKey, get_delay 5 | from theHarvester.lib.core import AsyncFetcher, Core 6 | 7 | 8 | class SearchCriminalIP: 9 | def __init__(self, word) -> None: 10 | self.word = word 11 | self.totalhosts: set = set() 12 | self.totalips: set = set() 13 | self.asns: set = set() 14 | self.key = Core.criminalip_key() 15 | if self.key is None: 16 | raise MissingKey('criminalip') 17 | self.proxy = False 18 | 19 | async def do_search(self) -> None: 20 | # https://www.criminalip.io/developer/api/post-domain-scan 21 | # https://www.criminalip.io/developer/api/get-domain-status-id 22 | # https://www.criminalip.io/developer/api/get-domain-report-id 23 | url = 'https://api.criminalip.io/v1/domain/scan' 24 | data = f'{{"query": "{self.word}"}}' 25 | # print(f'Current key: {self.key}') 26 | user_agent = Core.get_user_agent() 27 | response = await AsyncFetcher.post_fetch( 28 | url, 29 | json=True, 30 | headers={'User-Agent': user_agent, 'x-api-key': f'{self.key}'}, 31 | data=data, 32 | proxy=self.proxy, 33 | ) 34 | # print(f'My response: {response}') 35 | # Expected response format: 36 | # {'data': {'scan_id': scan_id}, 'message': 'api success', 'status': 200} 37 | if 'status' in response.keys(): 38 | status = response['status'] 39 | if status != 200: 40 | print(f'An error has occurred searching criminalip dumping response: {response}') 41 | else: 42 | scan_id = response['data']['scan_id'] 43 | scan_percentage = 0 44 | counter = 0 45 | while scan_percentage != 100: 46 | status_url = f'https://api.criminalip.io/v1/domain/status/{scan_id}' 47 | status_response = await AsyncFetcher.fetch_all( 48 | [status_url], 49 | json=True, 50 | headers={'User-Agent': user_agent, 'x-api-key': f'{self.key}'}, 51 | proxy=self.proxy, 52 | ) 53 | status = status_response[0] 54 | # print(f'Status response: {status}') 55 | # Expected format: 56 | # {"data": {"scan_percentage": 100}, "message": "api success", "status": 200} 57 | scan_percentage = status['data']['scan_percentage'] 58 | if scan_percentage == 100: 59 | break 60 | if scan_percentage == -2: 61 | print(f'CriminalIP failed to scan: {self.word} does not exist, verify manually') 62 | print(f'Dumping data: scan_response: {response} status_response: {status}') 63 | return 64 | if scan_percentage == -1: 65 | print(f'CriminalIP scan failed dumping data: scan_response: {response} status_response: {status}') 66 | return 67 | # Wait for scan to finish 68 | if counter >= 5: 69 | await asyncio.sleep(20 * get_delay()) 70 | else: 71 | await asyncio.sleep(10 * get_delay()) 72 | counter += 1 73 | if counter == 10: 74 | print( 75 | 'Ten iterations have occurred in CriminalIP waiting for scan to finish, returning to prevent infinite loop.' 
76 | ) 77 | print( 78 | f'Verify results manually on CriminalIP dumping data: scan_response: {response} status_response: {status}' 79 | ) 80 | return 81 | 82 | report_url = f'https://api.criminalip.io/v1/domain/report/{scan_id}' 83 | scan_response = await AsyncFetcher.fetch_all( 84 | [report_url], 85 | json=True, 86 | headers={'User-Agent': user_agent, 'x-api-key': f'{self.key}'}, 87 | proxy=self.proxy, 88 | ) 89 | scan = scan_response[0] 90 | # json_formatted_str = json.dumps(scan, indent=2) 91 | # print(json_formatted_str) 92 | try: 93 | await self.parser(scan) 94 | except Exception as e: 95 | print(f'An exception occurred while parsing criminalip result: {e}') 96 | print('Dumping json: ') 97 | print(scan) 98 | 99 | async def parser(self, jlines): 100 | # TODO when new scope field is added to parse lines for potential new scope! 101 | # TODO map as_name to asn for asn data 102 | # TODO determine if worth storing interesting urls 103 | if 'data' not in jlines.keys(): 104 | print(f'Error with criminalip data, dumping: {jlines}') 105 | return 106 | data = jlines['data'] 107 | for cert in data['certificates']: 108 | # print(f'Current cert: {cert}') 109 | if cert['subject'].endswith('.' + self.word): 110 | self.totalhosts.add(cert['subject']) 111 | 112 | for connected_domain in data['connected_domain_subdomain']: 113 | try: 114 | main_domain = connected_domain['main_domain']['domain'] 115 | subdomains = [sub['domain'] for sub in connected_domain['subdomains']] 116 | if main_domain.endswith('.' + self.word): 117 | self.totalhosts.add(main_domain) 118 | for sub in subdomains: 119 | # print(f'Current sub: {sub}') 120 | if sub.endswith('.' + self.word): 121 | self.totalhosts.add(sub) 122 | except Exception as e: 123 | print(f'An exception has occurred: {e}') 124 | print(f'Main line: {connected_domain}') 125 | 126 | for ip_info in data['connected_ip_info']: 127 | self.asns.add(str(ip_info['asn'])) 128 | domains = [sub['domain'] for sub in ip_info['domain_list']] 129 | for sub in domains: 130 | if sub.endswith('.' + self.word): 131 | self.totalhosts.add(sub) 132 | self.totalips.add(ip_info['ip']) 133 | 134 | for cookie in data['cookies']: 135 | if cookie['domain'] != '.' + self.word and cookie['domain'].endswith('.' + self.word): 136 | self.totalhosts.add(cookie['domain']) 137 | 138 | for country in data['country']: 139 | if country['domain'].endswith('.' + self.word): 140 | self.totalhosts.add(country['domain']) 141 | for ip in country['mapped_ips']: 142 | self.totalips.add(ip['ip']) 143 | 144 | for k, v in data['dns_record'].items(): 145 | if k == 'dns_record_type_a': 146 | for ip in data['dns_record'][k]['ipv4']: 147 | self.totalips.add(ip['ip']) 148 | else: 149 | if isinstance(v, list): 150 | for item in v: 151 | if isinstance(item, list): 152 | for subitem in item: 153 | if subitem.endswith('.' + self.word): 154 | self.totalhosts.add(subitem) 155 | else: 156 | if item.endswith('.' + self.word): 157 | self.totalhosts.add(item) 158 | 159 | for domain_list in data['domain_list']: 160 | self.asns.add(str(domain_list['asn'])) 161 | domains = [sub['domain'] for sub in domain_list['domain_list']] 162 | for sub in domains: 163 | if sub.endswith('.' + self.word): 164 | self.totalhosts.add(sub) 165 | self.totalips.add(domain_list['ip']) 166 | 167 | for html_page_links in data['html_page_link_domains']: 168 | domain = html_page_links['domain'] 169 | if domain.endswith('.' 
+ self.word): 170 | self.totalhosts.add(domain) 171 | for ip in html_page_links['mapped_ips']: 172 | self.totalips.add(ip['ip']) 173 | 174 | # TODO combine data['links'] and data['network_logs'] urls into one list for one run through 175 | for link in data['links']: 176 | url = link['url'] 177 | parsed_url = urlparse(url) 178 | netloc = parsed_url.netloc 179 | if self.word in netloc: 180 | if (':' in netloc and netloc.split(':')[0].endswith(self.word)) or netloc.endswith(self.word): 181 | self.totalhosts.add(netloc) 182 | 183 | for log in data['network_logs']: 184 | url = log['url'] 185 | parsed_url = urlparse(url) 186 | netloc = parsed_url.netloc 187 | if self.word in netloc: 188 | if (':' in netloc and netloc.split(':')[0].endswith(self.word)) or netloc.endswith(self.word): 189 | self.totalhosts.add(netloc) 190 | self.asns.add(str(log['as_number'])) 191 | 192 | for redirects in data['page_redirections']: 193 | for redirect in redirects: 194 | url = redirect['url'] 195 | parsed_url = urlparse(url) 196 | netloc = parsed_url.netloc 197 | if self.word in netloc: 198 | if (':' in netloc and netloc.split(':')[0].endswith(self.word)) or netloc.endswith(self.word): 199 | self.totalhosts.add(netloc) 200 | 201 | self.totalhosts = {host.replace('www.', '') for host in self.totalhosts if '*.' + self.word != host} 202 | 203 | # print(f'hostnames: {self.totalhosts}') 204 | # print(f'asns: {self.asns}') 205 | # print(f'ips: {self.totalips}') 206 | 207 | async def get_asns(self) -> set: 208 | return self.asns 209 | 210 | async def get_hostnames(self) -> set: 211 | return self.totalhosts 212 | 213 | async def get_ips(self) -> set: 214 | return self.totalips 215 | 216 | async def process(self, proxy: bool = False) -> None: 217 | self.proxy = proxy 218 | await self.do_search() 219 | -------------------------------------------------------------------------------- /theHarvester/discovery/crtsh.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher 2 | 3 | 4 | class SearchCrtsh: 5 | def __init__(self, word) -> None: 6 | self.word = word 7 | self.data: list = [] 8 | self.proxy = False 9 | 10 | async def do_search(self) -> list: 11 | data: set = set() 12 | try: 13 | url = f'https://crt.sh/?q=%25.{self.word}&exclude=expired&deduplicate=Y&output=json' 14 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 15 | response = response[0] 16 | data = set([(dct['name_value'][2:] if dct['name_value'][:2] == '*.' else dct['name_value']) for dct in response]) 17 | data = {domain for domain in data if (domain[0] != '*' and str(domain[0:4]).isnumeric() is False)} 18 | except Exception as e: 19 | print(e) 20 | clean: list = [] 21 | for x in data: 22 | pre = x.split() 23 | for y in pre: 24 | clean.append(y) 25 | return clean 26 | 27 | async def process(self, proxy: bool = False) -> None: 28 | self.proxy = proxy 29 | data = await self.do_search() 30 | self.data = data 31 | 32 | async def get_hostnames(self) -> list: 33 | return self.data 34 | -------------------------------------------------------------------------------- /theHarvester/discovery/dnssearch.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============ 3 | DNS Browsing 4 | ============ 5 | 6 | Explore the space around known hosts & ips for extra catches. 
7 | """ 8 | 9 | import asyncio 10 | import re 11 | import sys 12 | from collections.abc import Callable 13 | from ipaddress import IPv4Network 14 | 15 | from aiodns import DNSResolver 16 | 17 | from theHarvester.lib import hostchecker 18 | from theHarvester.lib.core import DATA_DIR 19 | 20 | ##################################################################### 21 | # DNS FORCE 22 | ##################################################################### 23 | 24 | DNS_NAMES = DATA_DIR / 'wordlists' / 'dns-names.txt' 25 | 26 | 27 | class DnsForce: 28 | def __init__(self, domain, dnsserver, verbose: bool = False) -> None: 29 | self.domain = domain 30 | self.subdo = False 31 | self.verbose = verbose 32 | # self.dnsserver = [dnsserver] if isinstance(dnsserver, str) else dnsserver 33 | # self.dnsserver = list(map(str, dnsserver.split(','))) if isinstance(dnsserver, str) else dnsserver 34 | self.dnsserver = dnsserver 35 | with DNS_NAMES.open('r') as file: 36 | self.list = file.readlines() 37 | self.domain = domain.replace('www.', '') 38 | self.list = [f'{word.strip()}.{self.domain}' for word in self.list] 39 | 40 | async def run(self): 41 | print(f'Starting DNS brute forcing with {len(self.list)} words') 42 | checker = hostchecker.Checker(self.list, nameservers=self.dnsserver) 43 | resolved_pair, hosts, ips = await checker.check() 44 | return resolved_pair, hosts, ips 45 | 46 | 47 | ##################################################################### 48 | # DNS REVERSE 49 | ##################################################################### 50 | 51 | 52 | IP_REGEX = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}' 53 | PORT_REGEX = r'\d{1,5}' 54 | NETMASK_REGEX: str = r'\d{1,2}|' + IP_REGEX 55 | NETWORK_REGEX: str = rf'\b({IP_REGEX})(?:\:({PORT_REGEX}))?(?:\/({NETMASK_REGEX}))?\b' 56 | 57 | 58 | def serialize_ip_range(ip: str, netmask: str = '24') -> str: 59 | """ 60 | Serialize a network range in a constant format, 'x.x.x.x/y'. 61 | 62 | Parameters 63 | ---------- 64 | ip: str. 65 | A serialized ip in the format 'x.x.x.x'. 66 | Extra information like port (':z') or subnet ('/n') 67 | will be ignored. 68 | netmask: str. 69 | The subnet subdivision, represented by a 2 digit netmask. 70 | 71 | Returns 72 | ------- 73 | out: str. 74 | The network OSI address, like '192.168.0.0/24'. 75 | """ 76 | __ip_matches = re.search(NETWORK_REGEX, ip, re.IGNORECASE) 77 | if __ip_matches and __ip_matches.groups(): 78 | __ip = __ip_matches.group(1) 79 | __netmask = netmask if netmask else __ip_matches.group(3) 80 | if __ip and __netmask: 81 | return str(IPv4Network(f'{__ip}/{__netmask}', strict=False)) 82 | elif __ip: 83 | return str(IPv4Network('{}/{}'.format(__ip, '24'), strict=False)) 84 | 85 | # invalid input ip 86 | return '' 87 | 88 | 89 | def list_ips_in_network_range(iprange: str) -> list[str]: 90 | """ 91 | List all the IPs in the range. 92 | 93 | Parameters 94 | ---------- 95 | iprange: str. 96 | A serialized ip range, like '1.2.3.0/24'. 97 | The last digit can be set to anything, it will be ignored. 98 | 99 | Returns 100 | ------- 101 | out: list. 102 | The list of IPs in the range. 103 | """ 104 | try: 105 | __network = IPv4Network(iprange, strict=False) 106 | return [__address.exploded for __address in __network.hosts()] 107 | except Exception: 108 | return [] 109 | 110 | 111 | async def reverse_single_ip(ip: str, resolver: DNSResolver) -> str: 112 | """ 113 | Reverse a single IP and output the linked CNAME, if it exists. 
114 | Parameters 115 | ---------- 116 | :param ip: IP address to reverse 117 | :param resolver: DNS server to use 118 | 119 | Returns 120 | ------- 121 | :return str: with the corresponding CNAME or None 122 | """ 123 | try: 124 | __host = await resolver.gethostbyaddr(ip) 125 | return __host.name if __host else '' 126 | except Exception: 127 | return '' 128 | 129 | 130 | async def reverse_all_ips_in_range(iprange: str, callback: Callable, nameservers: list[str] | None = None) -> None: 131 | """ 132 | Reverse all the IPs stored in a network range. 133 | All the queries are made concurrently. 134 | 135 | Parameters 136 | ---------- 137 | iprange: str. 138 | An IPv4 range formatted as 'x.x.x.x/y'. 139 | The last 2 digits of the ip can be set to anything, 140 | they will be ignored. 141 | callback: Callable. 142 | Arbitrary postprocessing function. 143 | nameservers: List[str]. 144 | Optional list of DNS servers. 145 | 146 | Returns 147 | ------- 148 | out: None. 149 | """ 150 | loop = asyncio.get_event_loop() 151 | __resolver = DNSResolver(loop=loop, timeout=8, nameservers=nameservers) 152 | for __ip in list_ips_in_network_range(iprange): 153 | log_query(__ip) 154 | __host = await reverse_single_ip(ip=__ip, resolver=__resolver) 155 | callback(__host) 156 | log_result(__host) 157 | 158 | 159 | ##################################################################### 160 | # IO 161 | ##################################################################### 162 | 163 | 164 | def log_query(ip: str) -> None: 165 | """ 166 | Display the current query in the console. 167 | 168 | Parameters 169 | ---------- 170 | ip: str. 171 | Queried ip. 172 | 173 | Results 174 | ------- 175 | out: None. 176 | """ 177 | sys.stdout.write(chr(27) + '[2K' + chr(27) + '[G') 178 | sys.stdout.write('\r' + ip + ' - ') 179 | sys.stdout.flush() 180 | 181 | 182 | def log_result(host: str) -> None: 183 | """ 184 | Display the query result in the console. 185 | 186 | Parameters 187 | ---------- 188 | host: str. 189 | Host name returned by the DNS query. 190 | 191 | Results 192 | ------- 193 | out: None. 194 | """ 195 | if host: 196 | print(host) 197 | 198 | 199 | def generate_postprocessing_callback(target: str, **allhosts: list[str]) -> Callable: 200 | """ 201 | Postprocess the query results asynchronously too, instead of waiting for 202 | the querying stage to be completely finished. 203 | 204 | Parameters 205 | ---------- 206 | target: str. 207 | The domain wanted as TLD. 208 | allhosts: List. 209 | A collection of all the subdomains -of target- found so far. 210 | 211 | Returns 212 | ------- 213 | out: Callable. 214 | A function that will update the collection of target subdomains 215 | when the query result is satisfying. 
216 | """ 217 | 218 | def append_matching_hosts(host: str) -> None: 219 | if host and target in host: 220 | for __name, __hosts in allhosts.items(): 221 | if host not in __hosts: 222 | __hosts.append(host) 223 | 224 | return append_matching_hosts 225 | -------------------------------------------------------------------------------- /theHarvester/discovery/duckduckgosearch.py: -------------------------------------------------------------------------------- 1 | import ujson 2 | 3 | from theHarvester.lib.core import AsyncFetcher, Core 4 | from theHarvester.parsers import myparser 5 | 6 | 7 | class SearchDuckDuckGo: 8 | def __init__(self, word, limit) -> None: 9 | self.word = word 10 | self.results = '' 11 | self.totalresults = '' 12 | self.dorks: list = [] 13 | self.links: list = [] 14 | self.database = 'https://duckduckgo.com/?q=' 15 | self.api = 'https://api.duckduckgo.com/?q=x&format=json&pretty=1' # Currently using API. 16 | self.quantity = '100' 17 | self.limit = limit 18 | self.proxy = False 19 | 20 | async def do_search(self) -> None: 21 | # Do normal scraping. 22 | url = self.api.replace('x', self.word) 23 | headers = {'User-Agent': Core.get_user_agent()} 24 | first_resp = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy) 25 | self.results = first_resp[0] 26 | self.totalresults += self.results 27 | urls = await self.crawl(self.results) 28 | urls = {url for url in urls if len(url) > 5} 29 | all_resps = await AsyncFetcher.fetch_all(urls) 30 | self.totalresults += ''.join(all_resps) 31 | 32 | async def crawl(self, text): 33 | """ 34 | Function parses json and returns URLs. 35 | :param text: formatted json 36 | :return: set of URLs 37 | """ 38 | urls = set() 39 | try: 40 | load = ujson.loads(text) 41 | for keys in load.keys(): # Iterate through keys of dict. 42 | val = load.get(keys) 43 | 44 | if isinstance(val, int) or isinstance(val, dict) or val is None: 45 | continue 46 | 47 | if isinstance(val, list): 48 | if len(val) == 0: # Make sure not indexing an empty list. 49 | continue 50 | val = val[0] # The First value should be dict. 51 | 52 | if isinstance(val, dict): # Validation check. 53 | for key in val.keys(): 54 | value = val.get(key) 55 | if isinstance(value, str) and value != '' and 'https://' in value or 'http://' in value: 56 | urls.add(value) 57 | 58 | if isinstance(val, str) and val != '' and 'https://' in val or 'http://' in val: 59 | urls.add(val) 60 | tmp = set() 61 | for url in urls: 62 | if '<' in url and 'href=' in url: # Format is 63 | equal_index = url.index('=') 64 | true_url = '' 65 | for ch in url[equal_index + 1 :]: 66 | if ch == '"': 67 | tmp.add(true_url) 68 | break 69 | true_url += ch 70 | else: 71 | if url != '': 72 | tmp.add(url) 73 | return tmp 74 | except Exception as e: 75 | print(f'Exception occurred: {e}') 76 | return [] 77 | 78 | async def get_emails(self): 79 | rawres = myparser.Parser(self.totalresults, self.word) 80 | return await rawres.emails() 81 | 82 | async def get_hostnames(self): 83 | rawres = myparser.Parser(self.totalresults, self.word) 84 | return await rawres.hostnames() 85 | 86 | async def process(self, proxy: bool = False) -> None: 87 | self.proxy = proxy 88 | await self.do_search() # Only need to search once since using API. 
89 | -------------------------------------------------------------------------------- /theHarvester/discovery/githubcode.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import random 3 | import urllib.parse as urlparse 4 | from typing import Any, NamedTuple 5 | 6 | import aiohttp 7 | 8 | from theHarvester.discovery.constants import MissingKey, get_delay 9 | from theHarvester.lib.core import Core 10 | from theHarvester.parsers import myparser 11 | 12 | 13 | class RetryResult(NamedTuple): 14 | time: float 15 | 16 | 17 | class SuccessResult(NamedTuple): 18 | fragments: list[str] 19 | next_page: int 20 | last_page: int 21 | 22 | 23 | class ErrorResult(NamedTuple): 24 | status_code: int 25 | body: Any 26 | 27 | 28 | class SearchGithubCode: 29 | def __init__(self, word, limit) -> None: 30 | try: 31 | self.word = word 32 | self.total_results = '' 33 | self.server = 'api.github.com' 34 | self.limit = limit 35 | self.counter = 0 36 | self.page = 1 37 | self.key = Core.github_key() 38 | if self.key is None: 39 | raise MissingKey('Github') 40 | self.proxy = False 41 | self.base_url = f'https://{self.server}/search/code?q="{self.word}"' 42 | self.headers = { 43 | 'Host': self.server, 44 | 'User-agent': Core.get_user_agent(), 45 | 'Accept': 'application/vnd.github.v3.text-match+json', 46 | 'Authorization': f'token {self.key}', 47 | } 48 | except Exception as e: 49 | print(f'Error initializing SearchGithubCode: {e}') 50 | raise 51 | 52 | @staticmethod 53 | async def fragments_from_response(json_data: dict) -> list[str]: 54 | try: 55 | return [ 56 | match['fragment'] 57 | for item in json_data.get('items', []) 58 | for match in item.get('text_matches', []) 59 | if match.get('fragment') is not None 60 | ] 61 | except Exception as e: 62 | print(f'Error extracting fragments: {e}') 63 | return [] 64 | 65 | @staticmethod 66 | async def page_from_response(page: str, links) -> int | None: 67 | try: 68 | if page_link := links.get(page): 69 | parsed = urlparse.urlparse(str(page_link.get('url'))) 70 | if page_param := urlparse.parse_qs(parsed.query).get('page', [None])[0]: 71 | return int(page_param) 72 | return 0 73 | except Exception as e: 74 | print(f'Error parsing page response: {e}') 75 | return None 76 | 77 | async def handle_response(self, response: tuple[str, dict, int, Any]) -> ErrorResult | RetryResult | SuccessResult: 78 | try: 79 | text, json_data, status, links = response 80 | if status == 200: 81 | results = await self.fragments_from_response(json_data) 82 | # Ensure next_page and last_page default to 0 if None 83 | next_page = await self.page_from_response('next', links) or 0 84 | last_page = await self.page_from_response('last', links) or 0 85 | return SuccessResult(results, next_page, last_page) 86 | if status in (429, 403): 87 | return RetryResult(60) 88 | return ErrorResult(status, json_data if isinstance(json_data, dict) else text) 89 | except Exception as e: 90 | print(f'Error handling response: {e}') 91 | return ErrorResult(500, str(e)) 92 | 93 | @staticmethod 94 | async def next_page_or_end(result: SuccessResult) -> int | None: 95 | if result.next_page is not None: 96 | return result.next_page 97 | else: 98 | return result.last_page 99 | 100 | async def do_search(self, page: int) -> tuple[str, dict, int, Any]: 101 | try: 102 | url = f'{self.base_url}&page={page}' if page else self.base_url 103 | async with aiohttp.ClientSession(headers=self.headers) as sess: 104 | async with sess.get(url, proxy=random.choice(Core.proxy_list()) if 
self.proxy else None) as resp: 105 | return await resp.text(), await resp.json(), resp.status, resp.links 106 | except Exception as e: 107 | print(f'Error performing search: {e}') 108 | return '', {}, 500, {} 109 | 110 | async def process(self, proxy: bool = False) -> None: 111 | try: 112 | self.proxy = proxy 113 | while self.counter <= self.limit and self.page is not None: 114 | try: 115 | api_response = await self.do_search(self.page) 116 | result = await self.handle_response(api_response) 117 | 118 | if isinstance(result, SuccessResult): 119 | print(f'\tSearching {self.counter} results.') 120 | self.total_results += ''.join(result.fragments) 121 | self.counter += len(result.fragments) 122 | self.page = result.next_page or result.last_page 123 | await asyncio.sleep(get_delay()) 124 | elif isinstance(result, RetryResult): 125 | sleepy_time = get_delay() + result.time 126 | print(f'\tRetrying page in {sleepy_time} seconds...') 127 | await asyncio.sleep(sleepy_time) 128 | else: 129 | print(f'\tException occurred: status_code: {result.status_code} reason: {result.body}') 130 | except Exception as e: 131 | print(f'Error processing page: {e}') 132 | await asyncio.sleep(get_delay()) 133 | except Exception as e: 134 | print(f'An exception has occurred in githubcode process: {e}') 135 | 136 | async def get_emails(self): 137 | try: 138 | rawres = myparser.Parser(self.total_results, self.word) 139 | return await rawres.emails() 140 | except Exception as e: 141 | print(f'Error getting emails: {e}') 142 | return [] 143 | 144 | async def get_hostnames(self): 145 | try: 146 | rawres = myparser.Parser(self.total_results, self.word) 147 | return await rawres.hostnames() 148 | except Exception as e: 149 | print(f'Error getting hostnames: {e}') 150 | return [] 151 | -------------------------------------------------------------------------------- /theHarvester/discovery/hackertarget.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher, Core 2 | 3 | 4 | class SearchHackerTarget: 5 | """ 6 | Class uses the HackerTarget api to gather subdomains and ips 7 | """ 8 | 9 | def __init__(self, word) -> None: 10 | self.word = word 11 | self.total_results = '' 12 | self.hostname = 'https://api.hackertarget.com' 13 | self.proxy = False 14 | self.results = None 15 | 16 | async def do_search(self) -> None: 17 | headers = {'User-agent': Core.get_user_agent()} 18 | urls = [ 19 | f'{self.hostname}/hostsearch/?q={self.word}', 20 | f'{self.hostname}/reversedns/?q={self.word}', 21 | ] 22 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 23 | for response in responses: 24 | self.total_results += response.replace(',', ':') 25 | 26 | async def process(self, proxy: bool = False) -> None: 27 | self.proxy = proxy 28 | await self.do_search() 29 | 30 | async def get_hostnames(self) -> list: 31 | return [result for result in self.total_results.splitlines() if 'No PTR records found' not in result] 32 | -------------------------------------------------------------------------------- /theHarvester/discovery/haveibeenpwned.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchHaveIBeenPwned: 8 | def __init__(self, word: str): 9 | self.word = word 10 | self.api_key = Core.haveibeenpwned_key() 11 | self.base_url = 
'https://haveibeenpwned.com/api/v3' 12 | self.headers = {'hibp-api-key': self.api_key, 'user-agent': 'theHarvester', 'Content-Type': 'application/json'} 13 | self.hosts = set() 14 | self.emails = set() 15 | self.breaches = [] 16 | self.pastes = [] 17 | self.breach_dates = set() 18 | self.breach_types = set() 19 | self.affected_data = set() 20 | 21 | async def process(self, proxy: bool = False) -> None: 22 | """Search for breaches associated with a domain or email.""" 23 | try: 24 | if proxy: 25 | response = await AsyncFetcher.fetch( 26 | session=None, url=f'{self.base_url}/breaches?domain={self.word}', headers=self.headers, proxy=proxy 27 | ) 28 | if response: 29 | self.breaches = response 30 | self._extract_data() 31 | else: 32 | async with aiohttp.ClientSession(headers=self.headers) as session: 33 | async with session.get(f'{self.base_url}/breaches?domain={self.word}') as response: 34 | if response.status == 200: 35 | self.breaches = await response.json() 36 | self._extract_data() 37 | elif response.status == 401: 38 | print('[!] Missing API key for HaveIBeenPwned.') 39 | raise MissingKey('HaveIBeenPwned') 40 | except Exception as e: 41 | print(f'Error in HaveIBeenPwned search: {e}') 42 | 43 | def _extract_data(self) -> None: 44 | """Extract and categorize breach information.""" 45 | for breach in self.breaches: 46 | if 'Domain' in breach: 47 | self.hosts.add(breach['Domain']) 48 | if 'BreachDate' in breach: 49 | self.breach_dates.add(breach['BreachDate']) 50 | if 'BreachType' in breach: 51 | self.breach_types.add(breach['BreachType']) 52 | if 'DataClasses' in breach: 53 | self.affected_data.update(breach['DataClasses']) 54 | 55 | async def get_hostnames(self) -> set[str]: 56 | return self.hosts 57 | 58 | async def get_emails(self) -> set[str]: 59 | return self.emails 60 | 61 | async def get_breaches(self) -> list[dict]: 62 | return self.breaches 63 | 64 | async def get_pastes(self) -> list[dict]: 65 | return self.pastes 66 | 67 | async def get_breach_dates(self) -> set[str]: 68 | return self.breach_dates 69 | 70 | async def get_breach_types(self) -> set[str]: 71 | return self.breach_types 72 | 73 | async def get_affected_data(self) -> set[str]: 74 | return self.affected_data 75 | -------------------------------------------------------------------------------- /theHarvester/discovery/huntersearch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchHunter: 8 | def __init__(self, word, limit, start) -> None: 9 | self.word = word 10 | self.limit = limit 11 | self.limit = 10 if limit > 10 else limit 12 | self.start = start 13 | self.key = Core.hunter_key() 14 | if self.key is None: 15 | raise MissingKey('Hunter') 16 | self.total_results = '' 17 | self.counter = start 18 | self.database = f'https://api.hunter.io/v2/domain-search?domain={self.word}&api_key={self.key}&limit=10' 19 | self.proxy = False 20 | self.hostnames: list = [] 21 | self.emails: list = [] 22 | 23 | async def do_search(self) -> None: 24 | # First determine if a user account is not a free account, this call is free 25 | is_free = True 26 | headers = {'User-Agent': Core.get_user_agent()} 27 | acc_info_url = f'https://api.hunter.io/v2/account?api_key={self.key}' 28 | response = await AsyncFetcher.fetch_all([acc_info_url], headers=headers, json=True) 29 | is_free = ( 30 | is_free if 'plan_name' in response[0]['data'].keys() and 
response[0]['data']['plan_name'].lower() == 'free' else False 31 | ) 32 | # Extract the total number of requests that are available for an account 33 | 34 | total_requests_avail = ( 35 | response[0]['data']['requests']['searches']['available'] - response[0]['data']['requests']['searches']['used'] 36 | ) 37 | if is_free: 38 | response = await AsyncFetcher.fetch_all([self.database], headers=headers, proxy=self.proxy, json=True) 39 | self.emails, self.hostnames = await self.parse_resp(json_resp=response[0]) 40 | else: 41 | # Determine the total number of emails that are available 42 | # As the most emails you can get within one query are 100 43 | # This is only done where paid accounts are in play 44 | hunter_dinfo_url = f'https://api.hunter.io/v2/email-count?domain={self.word}' 45 | response = await AsyncFetcher.fetch_all([hunter_dinfo_url], headers=headers, proxy=self.proxy, json=True) 46 | total_number_reqs = response[0]['data']['total'] // 100 47 | # Parse out meta field within initial JSON response to determine the total number of results 48 | if total_requests_avail < total_number_reqs: 49 | print('WARNING: account does not have enough requests to gather all emails') 50 | print(f'Total requests available: {total_requests_avail}, total requests needed to be made: {total_number_reqs}') 51 | print('RETURNING current results, if you would still like to run this module comment out the if request') 52 | return 53 | self.limit = 100 54 | # max number of emails you can get per request is 100 55 | # increments of 100 with offset determining where to start 56 | # See docs for more details: https://hunter.io/api-documentation/v2#domain-search 57 | for offset in range(0, 100 * total_number_reqs, 100): 58 | req_url = f'https://api.hunter.io/v2/domain-search?domain={self.word}&api_key={self.key}&limit{self.limit}&offset={offset}' 59 | response = await AsyncFetcher.fetch_all([req_url], headers=headers, proxy=self.proxy, json=True) 60 | temp_emails, temp_hostnames = await self.parse_resp(response[0]) 61 | self.emails.extend(temp_emails) 62 | self.hostnames.extend(temp_hostnames) 63 | await asyncio.sleep(1) 64 | 65 | async def parse_resp(self, json_resp): 66 | emails = list(sorted({email['value'] for email in json_resp['data']['emails']})) 67 | domains = list( 68 | sorted( 69 | { 70 | source['domain'] 71 | for email in json_resp['data']['emails'] 72 | for source in email['sources'] 73 | if self.word in source['domain'] 74 | } 75 | ) 76 | ) 77 | return emails, domains 78 | 79 | async def process(self, proxy: bool = False) -> None: 80 | self.proxy = proxy 81 | await self.do_search() # Only need to do it once. 
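# Worked example of the paid-plan pagination in do_search() above (illustrative figures,
# not taken from the API): if the email-count endpoint reports 250 addresses, then
# total_number_reqs = 250 // 100 = 2, so the loop runs range(0, 200, 100) and requests
# offset=0 and offset=100, up to self.limit (100) results per call, provided
# total_requests_avail is at least 2.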
82 | 83 | async def get_emails(self): 84 | return self.emails 85 | 86 | async def get_hostnames(self): 87 | return self.hostnames 88 | -------------------------------------------------------------------------------- /theHarvester/discovery/intelxsearch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Any 3 | from urllib.parse import urlparse 4 | 5 | import aiohttp 6 | 7 | from theHarvester.discovery.constants import MissingKey 8 | from theHarvester.lib.core import Core 9 | from theHarvester.parsers import intelxparser 10 | 11 | 12 | class SearchIntelx: 13 | def __init__(self, word) -> None: 14 | self.word = word 15 | self.key = Core.intelx_key() 16 | if self.key is None: 17 | raise MissingKey('Intelx') 18 | self.database = 'https://2.intelx.io' 19 | self.results: dict[str, Any] = {} 20 | self.info: tuple[list[str], list[str], list[str]] = ([], [], []) 21 | self.limit: int = 10000 22 | self.proxy = False 23 | self.offset = 0 24 | 25 | async def do_search(self) -> None: 26 | try: 27 | headers = { 28 | 'x-key': self.key, 29 | 'User-Agent': f'{Core.get_user_agent()}-theHarvester', 30 | 'Content-Type': 'application/json', 31 | } 32 | data = { 33 | 'term': self.word, 34 | 'buckets': [], 35 | 'lookuplevel': 0, 36 | 'maxresults': self.limit, 37 | 'timeout': 5, 38 | 'datefrom': '', 39 | 'dateto': '', 40 | 'sort': 4, # Sort by date descending for faster relevant results 41 | 'media': 0, 42 | 'terminate': [], 43 | 'target': 0, 44 | } 45 | async with aiohttp.ClientSession() as session: 46 | async with session.post(f'{self.database}/phonebook/search', headers=headers, json=data) as total_resp: 47 | search_data = await total_resp.json() 48 | if not search_data['success']: 49 | print(f'Error: {search_data["message"]}') 50 | return 51 | phonebook_id = search_data['id'] 52 | 53 | await asyncio.sleep(2) # Reduced sleep time as 5s is excessive 54 | 55 | async with session.get( 56 | f'{self.database}/phonebook/search/result?id={phonebook_id}&limit={self.limit}&offset={self.offset}', 57 | headers=headers, 58 | ) as resp: 59 | self.results = await resp.json() 60 | 61 | except Exception as e: 62 | print(f'An exception has occurred in Intelx: {e}') 63 | 64 | async def process(self, proxy: bool = False): 65 | self.proxy = proxy 66 | await self.do_search() 67 | intelx_parser = intelxparser.Parser() 68 | self.info = await intelx_parser.parse_dictionaries(self.results) 69 | 70 | async def get_emails(self) -> list[str]: 71 | return self.info[0] 72 | 73 | async def get_interestingurls(self) -> tuple[list[str], list[str]]: 74 | urls = self.info[1] 75 | subdomains = [] 76 | 77 | for url in urls: 78 | try: 79 | parsed = urlparse(url) 80 | domain = parsed.netloc 81 | if domain.count('.') > 1 and self.word in domain: 82 | subdomains.append(domain) 83 | except Exception: 84 | continue 85 | 86 | return urls, list(set(subdomains)) 87 | -------------------------------------------------------------------------------- /theHarvester/discovery/leaklookup.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchLeakLookup: 8 | def __init__(self, word: str): 9 | self.word = word 10 | self.api_key = Core.leaklookup_key() 11 | self.base_url = 'https://leak-lookup.com/api' 12 | self.headers = {'Authorization': f'Bearer {self.api_key}', 'Content-Type': 'application/json'} 
13 | self.hosts = set() 14 | self.emails = set() 15 | self.leaks = [] 16 | self.passwords = set() 17 | self.sources = set() 18 | self.leak_dates = set() 19 | 20 | async def process(self, proxy: bool = False) -> None: 21 | """Search for leaked credentials associated with an email.""" 22 | try: 23 | if proxy: 24 | response = await AsyncFetcher.fetch( 25 | session=None, 26 | url=f'{self.base_url}/search?key={self.api_key}&type=email&query={self.word}', 27 | headers=self.headers, 28 | proxy=proxy, 29 | ) 30 | if response: 31 | self.leaks = response 32 | self._extract_data() 33 | else: 34 | async with aiohttp.ClientSession(headers=self.headers) as session: 35 | async with session.get(f'{self.base_url}/search?key={self.api_key}&type=email&query={self.word}') as response: 36 | if response.status == 200: 37 | self.leaks = await response.json() 38 | self._extract_data() 39 | elif response.status == 401: 40 | print('[!] Missing API key for Leak-Lookup.') 41 | raise MissingKey('Leak-Lookup') 42 | except Exception as e: 43 | print(f'Error in Leak-Lookup search: {e}') 44 | 45 | def _extract_data(self) -> None: 46 | """Extract and categorize leak information.""" 47 | for leak in self.leaks: 48 | if 'domain' in leak: 49 | self.hosts.add(leak['domain']) 50 | if 'email' in leak: 51 | self.emails.add(leak['email']) 52 | if 'password' in leak: 53 | self.passwords.add(leak['password']) 54 | if 'source' in leak: 55 | self.sources.add(leak['source']) 56 | if 'date' in leak: 57 | self.leak_dates.add(leak['date']) 58 | 59 | async def get_hostnames(self) -> set[str]: 60 | return self.hosts 61 | 62 | async def get_emails(self) -> set[str]: 63 | return self.emails 64 | 65 | async def get_leaks(self) -> list[dict]: 66 | return self.leaks 67 | 68 | async def get_passwords(self) -> set[str]: 69 | return self.passwords 70 | 71 | async def get_sources(self) -> set[str]: 72 | return self.sources 73 | 74 | async def get_leak_dates(self) -> set[str]: 75 | return self.leak_dates 76 | -------------------------------------------------------------------------------- /theHarvester/discovery/netlas.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchNetlas: 8 | def __init__(self, word, limit: int) -> None: 9 | self.word = word 10 | self.totalhosts: list = [] 11 | self.totalips: list = [] 12 | self.key = Core.netlas_key() 13 | self.limit = limit 14 | if self.key is None: 15 | raise MissingKey('netlas') 16 | self.proxy = False 17 | 18 | async def do_count(self) -> None: 19 | """Counts the total number of subdomains 20 | 21 | :return: None 22 | """ 23 | api = f'https://app.netlas.io/api/domains_count/?q=*.{self.word}' 24 | headers = {'X-API-Key': self.key} 25 | response = await AsyncFetcher.fetch_all([api], json=True, headers=headers, proxy=self.proxy) 26 | amount_size = response[0]['count'] 27 | self.limit = amount_size if amount_size < self.limit else self.limit 28 | 29 | async def do_search(self) -> None: 30 | """Download domains for query 'q' size of 'limit' 31 | 32 | :return: None 33 | """ 34 | user_agent = Core.get_user_agent() 35 | url = 'https://app.netlas.io/api/domains/download/' 36 | 37 | payload = { 38 | 'q': f'*.{self.word}', 39 | 'fields': json.dumps(['domain']), # Convert the list to a JSON string 40 | 'source_type': 'include', 41 | 'size': str(self.limit), # Convert integer to string 42 | 'type': 'json', 43 | 'indice': 
json.dumps([0]), # Convert the list to a JSON string 44 | } 45 | 46 | headers = { 47 | 'X-API-Key': self.key, 48 | 'User-Agent': user_agent, 49 | } 50 | response = await AsyncFetcher.post_fetch(url, data=payload, headers=headers, proxy=self.proxy) 51 | resp_json = json.loads(response) 52 | 53 | for data in resp_json: 54 | domain = data['data']['domain'] 55 | self.totalhosts.append(domain) 56 | 57 | async def get_hostnames(self) -> list: 58 | return self.totalhosts 59 | 60 | async def process(self, proxy: bool = False) -> None: 61 | self.proxy = proxy 62 | await self.do_count() 63 | await self.do_search() 64 | -------------------------------------------------------------------------------- /theHarvester/discovery/onyphe.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | # from theHarvester.parsers import myparser 7 | 8 | 9 | class SearchOnyphe: 10 | def __init__(self, word) -> None: 11 | self.word = word 12 | self.response = '' 13 | self.totalhosts: set = set() 14 | self.totalips: set = set() 15 | self.asns: set = set() 16 | self.key = Core.onyphe_key() 17 | if self.key is None: 18 | raise MissingKey('onyphe') 19 | self.proxy = False 20 | 21 | async def do_search(self) -> None: 22 | # https://www.onyphe.io/docs/apis/search 23 | # https://www.onyphe.io/search?q=domain%3Acharter.com&captcharesponse=j5cGT 24 | # base_url = f'https://www.onyphe.io/api/v2/search/?q=domain:domain:{self.word}' 25 | base_url = f'https://www.onyphe.io/api/v2/search/?q=domain:{self.word}' 26 | headers = { 27 | 'User-Agent': Core.get_user_agent(), 28 | 'Content-Type': 'application/json', 29 | 'Authorization': f'bearer {self.key}', 30 | } 31 | response = await AsyncFetcher.fetch_all([base_url], json=True, headers=headers, proxy=self.proxy) 32 | self.response = response[0] 33 | await self.parse_onyphe_resp_json() 34 | 35 | async def parse_onyphe_resp_json(self): 36 | if isinstance(self.response, list): 37 | self.response = self.response[0] 38 | if not isinstance(self.response, dict): 39 | raise Exception(f'An exception has occurred {self.response} is not a dict') 40 | if self.response['text'] == 'Success': 41 | if 'results' in self.response.keys(): 42 | for result in self.response['results']: 43 | try: 44 | if 'alternativeip' in result.keys(): 45 | self.totalips.update({altip for altip in result['alternativeip']}) 46 | if 'url' in result.keys() and isinstance(result['url'], list): 47 | self.totalhosts.update( 48 | urlparse(url).netloc for url in result['url'] if urlparse(url).netloc.endswith(self.word) 49 | ) 50 | self.asns.add(result['asn']) 51 | self.asns.add(result['geolocus']['asn']) 52 | self.totalips.add(result['geolocus']['subnet']) 53 | self.totalips.add(result['ip']) 54 | self.totalips.add(result['subnet']) 55 | # Shouldn't be needed as API autoparses urls from html raw data 56 | # rawres = myparser.Parser(result['data'], self.word) 57 | # if await rawres.hostnames(): 58 | # self.totalhosts.update(set(await rawres.hostnames())) 59 | for subdomain_key in [ 60 | 'domain', 61 | 'hostname', 62 | 'subdomains', 63 | 'subject', 64 | 'reverse', 65 | 'geolocus', 66 | ]: 67 | if subdomain_key in result.keys(): 68 | if subdomain_key == 'subject': 69 | self.totalhosts.update( 70 | {domain for domain in result[subdomain_key]['altname'] if domain.endswith(self.word)} 71 | ) 72 | elif subdomain_key == 'geolocus': 73 | 
self.totalhosts.update( 74 | {domain for domain in result[subdomain_key]['domain'] if domain.endswith(self.word)} 75 | ) 76 | else: 77 | self.totalhosts.update( 78 | {domain for domain in result[subdomain_key] if domain.endswith(self.word)} 79 | ) 80 | except Exception: 81 | continue 82 | else: 83 | print(f'Onhyphe API query did not succeed dumping current response: {self.response}') 84 | 85 | async def get_asns(self) -> set: 86 | return self.asns 87 | 88 | async def get_hostnames(self) -> set: 89 | return self.totalhosts 90 | 91 | async def get_ips(self) -> set: 92 | return self.totalips 93 | 94 | async def process(self, proxy: bool = False) -> None: 95 | self.proxy = proxy 96 | await self.do_search() 97 | -------------------------------------------------------------------------------- /theHarvester/discovery/otxsearch.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from theHarvester.lib.core import AsyncFetcher 4 | 5 | 6 | class SearchOtx: 7 | def __init__(self, word) -> None: 8 | self.word = word 9 | self.totalhosts: set = set() 10 | self.totalips: set = set() 11 | self.proxy = False 12 | 13 | async def do_search(self) -> None: 14 | url = f'https://otx.alienvault.com/api/v1/indicators/domain/{self.word}/passive_dns' 15 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 16 | responses = response[0] 17 | dct = responses 18 | self.totalhosts = {host['hostname'] for host in dct['passive_dns']} 19 | # filter out ips that are just called NXDOMAIN 20 | self.totalips = { 21 | ip['address'] for ip in dct['passive_dns'] if re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', ip['address']) 22 | } 23 | 24 | async def get_hostnames(self) -> set: 25 | return self.totalhosts 26 | 27 | async def get_ips(self) -> set: 28 | return self.totalips 29 | 30 | async def process(self, proxy: bool = False) -> None: 31 | self.proxy = proxy 32 | await self.do_search() 33 | -------------------------------------------------------------------------------- /theHarvester/discovery/pentesttools.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import ujson 4 | 5 | from theHarvester.discovery.constants import MissingKey 6 | from theHarvester.lib.core import AsyncFetcher, Core 7 | 8 | 9 | class SearchPentestTools: 10 | def __init__(self, word) -> None: 11 | # Script is largely based off https://pentest-tools.com/public/api_client.py.txt 12 | self.word = word 13 | self.key = Core.pentest_tools_key() 14 | if self.key is None: 15 | raise MissingKey('PentestTools') 16 | self.total_results: list = [] 17 | self.api = f'https://pentest-tools.com/api?key={self.key}' 18 | self.proxy = False 19 | 20 | async def poll(self, scan_id): 21 | while True: 22 | time.sleep(3) 23 | # Get the status of our scan 24 | scan_status_data = {'op': 'get_scan_status', 'scan_id': scan_id} 25 | responses = await AsyncFetcher.post_fetch(url=self.api, data=ujson.dumps(scan_status_data), proxy=self.proxy) 26 | res_json = ujson.loads(responses.strip()) 27 | if res_json['op_status'] == 'success': 28 | if res_json['scan_status'] != 'waiting' and res_json['scan_status'] != 'running': 29 | getoutput_data = { 30 | 'op': 'get_output', 31 | 'scan_id': scan_id, 32 | 'output_format': 'json', 33 | } 34 | responses = await AsyncFetcher.post_fetch(url=self.api, data=ujson.dumps(getoutput_data), proxy=self.proxy) 35 | 36 | res_json = ujson.loads(responses.strip('\n')) 37 | self.total_results = await self.parse_json(res_json) 38 
| break 39 | else: 40 | print(f'Operation get_scan_status failed because: {res_json["error"]}. {res_json["details"]}') 41 | break 42 | 43 | @staticmethod 44 | async def parse_json(json_results): 45 | status = json_results['op_status'] 46 | if status == 'success': 47 | scan_tests = json_results['scan_output']['output_json'] 48 | output_data = scan_tests[0]['output_data'] 49 | host_to_ip = [f'{subdomain[0]}:{subdomain[1]}' for subdomain in output_data if len(subdomain) > 0] 50 | return host_to_ip 51 | return [] 52 | 53 | async def get_hostnames(self) -> list: 54 | return self.total_results 55 | 56 | async def do_search(self) -> None: 57 | subdomain_payload = { 58 | 'op': 'start_scan', 59 | 'tool_id': 20, 60 | 'tool_params': { 61 | 'target': f'{self.word}', 62 | 'web_details': 'off', 63 | 'do_smart_search': 'off', 64 | }, 65 | } 66 | responses = await AsyncFetcher.post_fetch(url=self.api, data=ujson.dumps(subdomain_payload), proxy=self.proxy) 67 | res_json = ujson.loads(responses.strip()) 68 | if res_json['op_status'] == 'success': 69 | scan_id = res_json['scan_id'] 70 | await self.poll(scan_id) 71 | 72 | async def process(self, proxy: bool = False) -> None: 73 | self.proxy = proxy 74 | await self.do_search() # Only need to do it once. 75 | -------------------------------------------------------------------------------- /theHarvester/discovery/projectdiscovery.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import MissingKey 2 | from theHarvester.lib.core import AsyncFetcher, Core 3 | 4 | 5 | class SearchDiscovery: 6 | def __init__(self, word) -> None: 7 | self.word = word 8 | self.key = Core.projectdiscovery_key() 9 | if self.key is None: 10 | raise MissingKey('ProjectDiscovery') 11 | self.total_results = None 12 | self.proxy = False 13 | 14 | async def do_search(self): 15 | url = f'https://dns.projectdiscovery.io/dns/{self.word}/subdomains' 16 | response = await AsyncFetcher.fetch_all( 17 | [url], 18 | json=True, 19 | headers={'User-Agent': Core.get_user_agent(), 'Authorization': self.key}, 20 | proxy=self.proxy, 21 | ) 22 | self.total_results = [f'{domains}.{self.word}' for domains in response[0]['subdomains']] 23 | 24 | async def get_hostnames(self): 25 | return self.total_results 26 | 27 | async def process(self, proxy: bool = False) -> None: 28 | self.proxy = proxy 29 | await self.do_search() 30 | -------------------------------------------------------------------------------- /theHarvester/discovery/rapiddns.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | from theHarvester.lib.core import AsyncFetcher, Core 4 | 5 | 6 | class SearchRapidDns: 7 | def __init__(self, word) -> None: 8 | self.word = word 9 | self.total_results: list = [] 10 | self.proxy = False 11 | 12 | async def do_search(self): 13 | try: 14 | headers = {'User-agent': Core.get_user_agent()} 15 | # TODO see if it's worth adding sameip searches 16 | # f'{self.hostname}/sameip/{self.word}?full=1#result' 17 | urls = [f'https://rapiddns.io/subdomain/{self.word}?full=1#result'] 18 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 19 | if len(responses[0]) <= 1: 20 | return self.total_results 21 | soup = BeautifulSoup(responses[0], 'html.parser') 22 | rows = soup.find('table').find('tbody').find_all('tr') 23 | if rows: 24 | # Validation check 25 | for row in rows: 26 | cells = row.find_all('td') 27 | if len(cells) > 0: 28 | # sanity check 29 
| subdomain = str(cells[0].get_text()) 30 | if cells[-1].get_text() == 'CNAME': 31 | self.total_results.append(f'{subdomain}') 32 | else: 33 | self.total_results.append(f'{subdomain}:{str(cells[1].get_text()).strip()}') 34 | self.total_results = list({domain for domain in self.total_results}) 35 | except Exception as e: 36 | print(f'An exception has occurred: {e!s}') 37 | 38 | async def process(self, proxy: bool = False) -> None: 39 | self.proxy = proxy 40 | await self.do_search() 41 | 42 | async def get_hostnames(self): 43 | return self.total_results 44 | -------------------------------------------------------------------------------- /theHarvester/discovery/rocketreach.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from theHarvester.discovery.constants import MissingKey, get_delay 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchRocketReach: 8 | def __init__(self, word, limit) -> None: 9 | self.ips: set = set() 10 | self.word = word 11 | self.key = Core.rocketreach_key() 12 | if self.key is None: 13 | raise MissingKey('RocketReach') 14 | self.hosts: set = set() 15 | self.proxy = False 16 | self.baseurl = 'https://rocketreach.co/api/v2/person/search' 17 | self.links: set = set() 18 | self.emails: set = set() 19 | self.limit = limit 20 | 21 | async def do_search(self) -> None: 22 | try: 23 | headers = { 24 | 'Api-Key': self.key, 25 | 'Content-Type': 'application/json', 26 | 'User-Agent': Core.get_user_agent(), 27 | } 28 | 29 | next_page = 1 # track pagination 30 | for count in range(1, self.limit): 31 | data = f'{{"query":{{"current_employer_domain": ["{self.word}"]}}, "page": {next_page}, "page_size": 100}}' 32 | result = await AsyncFetcher.post_fetch(self.baseurl, headers=headers, data=data, json=True) 33 | if 'detail' in result.keys() and 'error' in result.keys() and 'Subscribe to a plan to access' in result['detail']: 34 | # No more results can be fetched 35 | break 36 | if 'detail' in result.keys() and 'Request was throttled.' 
in result['detail']: 37 | # Rate limit has been triggered need to sleep extra 38 | print( 39 | f'RocketReach requests have been throttled; ' 40 | f'{result["detail"].split(" ", 3)[-1].replace("available", "availability")}' 41 | ) 42 | break 43 | if 'profiles' in dict(result).keys(): 44 | if len(result['profiles']) == 0: 45 | break 46 | for profile in result['profiles']: 47 | if 'linkedin_url' in dict(profile).keys(): 48 | self.links.add(profile['linkedin_url']) 49 | if 'emails' in dict(profile).keys() and profile['emails']: 50 | for email in profile['emails']: 51 | if email.get('email'): 52 | self.emails.add(email['email']) 53 | if 'pagination' in dict(result).keys(): 54 | next_page = result['pagination']['page'] + 1 55 | if next_page > result['pagination']['total_pages']: 56 | break 57 | 58 | await asyncio.sleep(get_delay() + 5) 59 | 60 | except Exception as e: 61 | print(f'An exception has occurred rocketreach: {e}') 62 | 63 | async def get_links(self): 64 | return self.links 65 | 66 | async def get_emails(self): 67 | return self.emails 68 | 69 | async def process(self, proxy: bool = False) -> None: 70 | self.proxy = proxy 71 | await self.do_search() 72 | -------------------------------------------------------------------------------- /theHarvester/discovery/search_dehashed.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import requests 4 | 5 | from theHarvester.discovery.constants import MissingKey 6 | from theHarvester.lib.core import Core 7 | 8 | 9 | class SearchDehashed: 10 | def __init__(self, word) -> None: 11 | self.word = word 12 | self.key = Core.dehashed_key() 13 | if self.key is None: 14 | raise MissingKey('Dehashed') 15 | 16 | self.api = 'https://api.dehashed.com/v2/search' 17 | self.headers = {'Content-Type': 'application/json', 'Dehashed-Api-Key': self.key} 18 | self.results = '' 19 | self.data: list[dict] = [] 20 | 21 | async def do_search(self) -> None: 22 | print(f'\t[+] Performing Dehashed search for: {self.word}') 23 | page = 1 24 | size = 100 25 | while True: 26 | payload = {'query': self.word, 'page': page, 'size': size, 'wildcard': False, 'regex': False, 'de_dupe': False} 27 | 28 | try: 29 | response = requests.post(self.api, json=payload, headers=self.headers) 30 | if response.status_code == 401: 31 | raise Exception('Unauthorized. Check Dehashed API key.') 32 | if response.status_code == 403: 33 | raise Exception('Forbidden. API key is not allowed.') 34 | 35 | data = response.json() 36 | entries = data.get('entries', []) 37 | if not entries: 38 | break 39 | 40 | self.data.extend(entries) 41 | print(f'\t[+] Page {page} - Retrieved {len(entries)} entries.') 42 | 43 | if len(entries) < size: 44 | break 45 | page += 1 46 | time.sleep(0.5) 47 | except Exception as e: 48 | print(f'\t[!] Dehashed error: {e}') 49 | break 50 | 51 | async def print_csv_results(self) -> None: 52 | if not self.data: 53 | print('\t[!] 
No data found.') 54 | return 55 | 56 | print('\n[Dehashed Results]') 57 | print('Email,Username,Password,Phone,IP,Source') 58 | 59 | for entry in self.data: 60 | email = entry.get('email', '') 61 | username = entry.get('username', '') 62 | password = entry.get('password', '') 63 | phone = entry.get('phone', '') 64 | ip = entry.get('ip_address', '') 65 | source = entry.get('database_name', '') 66 | 67 | csv_line = f'"{email}","{username}","{password}","{phone}","{ip}","{source}"' 68 | print(csv_line) 69 | 70 | async def process(self, proxy: bool = False) -> None: 71 | await self.do_search() 72 | await self.print_csv_results() 73 | 74 | async def get_emails(self) -> set: 75 | emails = set() 76 | for entry in self.data: 77 | if entry.get('email'): 78 | emails.add(entry['email']) 79 | return emails 80 | 81 | async def get_hostnames(self) -> set: 82 | return set() 83 | 84 | async def get_ips(self) -> set: 85 | ips = set() 86 | for entry in self.data: 87 | if entry.get('ip_address'): 88 | ips.add(entry['ip_address']) 89 | return ips 90 | -------------------------------------------------------------------------------- /theHarvester/discovery/search_dnsdumpster.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from theHarvester.discovery.constants import MissingKey 3 | from theHarvester.lib.core import AsyncFetcher, Core 4 | 5 | 6 | class SearchDNSDumpster: 7 | def __init__(self, word) -> None: 8 | self.word = word 9 | self.key = Core.dnsdumpster_key() 10 | if not self.key: 11 | raise MissingKey('DNSDumpster') 12 | self.hosts: set = set() 13 | self.ips: set = set() 14 | self.base_url = 'https://api.dnsdumpster.com' 15 | 16 | async def do_search(self) -> None: 17 | try: 18 | url = f'{self.base_url}/domain/{self.word}' 19 | headers = {'User-Agent': 'Mozilla/5.0 (theHarvester)', 'X-API-Key': self.key} 20 | 21 | response = await AsyncFetcher.fetch_all([url], headers=headers, json=True) 22 | data = response[0] 23 | 24 | if isinstance(data, dict): 25 | # Process A records 26 | for record in data.get('a', []): 27 | host = record['host'] 28 | if host.endswith(self.word): 29 | self.hosts.add(host) 30 | for ip_info in record['ips']: 31 | self.ips.add(ip_info['ip']) 32 | 33 | # Process NS records 34 | for record in data.get('ns', []): 35 | host = record['host'] 36 | if host.endswith(self.word): 37 | self.hosts.add(host) 38 | for ip_info in record['ips']: 39 | self.ips.add(ip_info['ip']) 40 | 41 | except Exception as e: 42 | print(f'Error occurred in DNSDumpster search: {e}') 43 | 44 | async def process(self, proxy: bool = False) -> None: 45 | await self.do_search() 46 | 47 | async def get_hostnames(self) -> set: 48 | return self.hosts 49 | 50 | async def get_ips(self) -> set: 51 | return self.ips 52 | -------------------------------------------------------------------------------- /theHarvester/discovery/searchhunterhow.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from datetime import datetime 3 | 4 | from dateutil.relativedelta import relativedelta 5 | 6 | from theHarvester.discovery.constants import MissingKey 7 | from theHarvester.lib.core import AsyncFetcher, Core 8 | 9 | 10 | class SearchHunterHow: 11 | def __init__(self, word) -> None: 12 | self.word = word 13 | self.total_hostnames: set = set() 14 | self.key = Core.hunterhow_key() 15 | if self.key is None: 16 | raise MissingKey('hunterhow') 17 | self.proxy = False 18 | 19 | async def do_search(self) -> None: 20 | # 
https://hunter.how/search-api 21 | query = f'domain.suffix="{self.word}"' 22 | # second_query = f'domain="{self.word}"' 23 | encoded_query = base64.urlsafe_b64encode(query.encode('utf-8')).decode('ascii') 24 | page = 1 25 | page_size = 100 # can be either: 10,20,50,100) 26 | # The interval between the start time and the end time cannot exceed one year 27 | # Can not exceed one year, but years=1 does not work due to their backend, 364 will suffice 28 | today = datetime.today() 29 | one_year_ago = today - relativedelta(days=364) 30 | start_time = one_year_ago.strftime('%Y-%m-%d') 31 | end_time = today.strftime('%Y-%m-%d') 32 | # two_years_ago = one_year_ago - relativedelta(days=364) 33 | # start_time = two_years_ago.strftime('%Y-%m-%d') 34 | # end_time = one_year_ago.strftime('%Y-%m-%d') 35 | url = f'https://api.hunter.how/search?api-key={self.key}&query={encoded_query}&page={page}&page_size={page_size}&start_time={start_time}&end_time={end_time}' 36 | response = await AsyncFetcher.fetch_all( 37 | [url], 38 | json=True, 39 | headers={'User-Agent': Core.get_user_agent(), 'x-api-key': f'{self.key}'}, 40 | proxy=self.proxy, 41 | ) 42 | dct = response[0] 43 | # print(f'json response: ') 44 | # print(dct) 45 | if 'code' in dct.keys(): 46 | if dct['code'] == 40001: 47 | print(f'Code 40001 indicates for searchhunterhow: {dct["message"]}') 48 | return 49 | # total = dct['data']['total'] 50 | # TODO determine if total is ever 100 how to get more subdomains? 51 | for sub in dct['data']['list']: 52 | self.total_hostnames.add(sub['domain']) 53 | 54 | async def get_hostnames(self) -> set: 55 | return self.total_hostnames 56 | 57 | async def process(self, proxy: bool = False) -> None: 58 | self.proxy = proxy 59 | await self.do_search() 60 | -------------------------------------------------------------------------------- /theHarvester/discovery/securityscorecard.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchSecurityScorecard: 8 | def __init__(self, word: str): 9 | self.word = word 10 | self.api_key = Core.securityscorecard_key() 11 | self.base_url = 'https://api.securityscorecard.io' 12 | self.headers = {'Authorization': f'Token {self.api_key}', 'Content-Type': 'application/json'} 13 | self.hosts = set() 14 | self.score = 0 15 | self.grades = {} 16 | self.issues = [] 17 | self.recommendations = [] 18 | self.history = [] 19 | 20 | async def process(self, proxy: bool = False) -> None: 21 | """Get security scorecard information for a domain.""" 22 | try: 23 | if proxy: 24 | response = await AsyncFetcher.fetch( 25 | session=None, url=f'{self.base_url}/companies/{self.word}', headers=self.headers, proxy=proxy 26 | ) 27 | if response: 28 | self._extract_data(response) 29 | else: 30 | async with aiohttp.ClientSession(headers=self.headers) as session: 31 | async with session.get(f'{self.base_url}/companies/{self.word}') as response: 32 | if response.status == 200: 33 | data = await response.json() 34 | self._extract_data(data) 35 | elif response.status == 401: 36 | print('[!] 
Missing API key for SecurityScorecard.') 37 | raise MissingKey('SecurityScorecard') 38 | except Exception as e: 39 | print(f'Error in SecurityScorecard search: {e}') 40 | 41 | def _extract_data(self, data: dict) -> None: 42 | """Extract and categorize security scorecard information.""" 43 | if 'grade' in data: 44 | self.score = data.get('grade', 0) 45 | 46 | if 'factor_grades' in data: 47 | self.grades = data['factor_grades'] 48 | 49 | if 'issues' in data: 50 | self.issues = data['issues'] 51 | 52 | if 'recommendations' in data: 53 | self.recommendations = data['recommendations'] 54 | 55 | if 'history' in data: 56 | self.history = data['history'] 57 | 58 | if 'domains' in data: 59 | self.hosts.update(data['domains']) 60 | 61 | async def get_hostnames(self) -> set[str]: 62 | return self.hosts 63 | 64 | async def get_score(self) -> int: 65 | return self.score 66 | 67 | async def get_grades(self) -> dict: 68 | return self.grades 69 | 70 | async def get_issues(self) -> list[dict]: 71 | return self.issues 72 | 73 | async def get_recommendations(self) -> list[dict]: 74 | return self.recommendations 75 | 76 | async def get_history(self) -> list[dict]: 77 | return self.history 78 | -------------------------------------------------------------------------------- /theHarvester/discovery/securitytrailssearch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | from theHarvester.parsers import securitytrailsparser 6 | 7 | 8 | class SearchSecuritytrail: 9 | def __init__(self, word) -> None: 10 | self.word = word 11 | self.key = Core.security_trails_key() 12 | if self.key is None: 13 | raise MissingKey('Securitytrail') 14 | self.results = '' 15 | self.totalresults = '' 16 | self.api = 'https://api.securitytrails.com/v1/' 17 | self.info: tuple[set, set] = (set(), set()) 18 | self.proxy = False 19 | 20 | async def authenticate(self) -> None: 21 | # Method to authenticate API key before sending requests. 22 | headers = {'APIKEY': self.key} 23 | url = f'{self.api}ping' 24 | auth_responses = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy) 25 | auth_responses = auth_responses[0] 26 | if 'False' in auth_responses or 'Invalid authentication' in auth_responses: 27 | print('\tKey could not be authenticated exiting program.') 28 | await asyncio.sleep(5) 29 | 30 | async def do_search(self) -> None: 31 | # https://api.securitytrails.com/v1/domain/domain.com 32 | url = f'{self.api}domain/{self.word}' 33 | headers = {'APIKEY': self.key} 34 | response = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy) 35 | await asyncio.sleep(5) # Not random delay because 2 seconds is required due to rate limit. 36 | self.results = response[0] 37 | self.totalresults += self.results 38 | url += '/subdomains' # Get subdomains now. 39 | subdomain_response = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy) 40 | await asyncio.sleep(5) 41 | self.results = subdomain_response[0] 42 | self.totalresults += self.results 43 | 44 | async def process(self, proxy: bool = False) -> None: 45 | self.proxy = proxy 46 | await self.authenticate() 47 | await self.do_search() 48 | parser = securitytrailsparser.Parser(word=self.word, text=self.totalresults) 49 | self.info = await parser.parse_text() 50 | # Create parser and set self.info to tuple returned from parsing text. 
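# Note on the line above: parse_text() fills self.info, a tuple[set, set] where index 0 carries the IP addresses and index 1 the hostnames, which is exactly what get_ips() and get_hostnames() below return as self.info[0] and self.info[1].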
51 | print('\tDone Searching Results') 52 | 53 | async def get_ips(self) -> set: 54 | return self.info[0] 55 | 56 | async def get_hostnames(self) -> set: 57 | return self.info[1] 58 | -------------------------------------------------------------------------------- /theHarvester/discovery/shodansearch.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | from shodan import Shodan, exception 4 | 5 | from theHarvester.discovery.constants import MissingKey 6 | from theHarvester.lib.core import Core 7 | 8 | 9 | class SearchShodan: 10 | def __init__(self) -> None: 11 | self.key = Core.shodan_key() 12 | if self.key is None: 13 | raise MissingKey('Shodan') 14 | self.api = Shodan(self.key) 15 | self.hostdatarow: list = [] 16 | self.tracker: OrderedDict = OrderedDict() 17 | 18 | async def search_ip(self, ip) -> OrderedDict: 19 | try: 20 | ipaddress = ip 21 | results = self.api.host(ipaddress) 22 | asn = '' 23 | domains: list = list() 24 | hostnames: list = list() 25 | ip_str = '' 26 | isp = '' 27 | org = '' 28 | ports: list = list() 29 | title = '' 30 | server = '' 31 | product = '' 32 | technologies: list = list() 33 | 34 | data_first_dict = dict(results['data'][0]) 35 | 36 | if 'ip_str' in data_first_dict: 37 | ip_str += data_first_dict['ip_str'] 38 | 39 | if 'http' in data_first_dict: 40 | http_results_dict = dict(data_first_dict['http']) 41 | if 'title' in http_results_dict: 42 | title_val = str(http_results_dict['title']).strip() 43 | if title_val != 'None': 44 | title += title_val 45 | if 'components' in http_results_dict: 46 | for key in http_results_dict['components'].keys(): 47 | technologies.append(key) 48 | if 'server' in http_results_dict: 49 | server_val = str(http_results_dict['server']).strip() 50 | if server_val != 'None': 51 | server += server_val 52 | 53 | for key, value in results.items(): 54 | if key == 'asn': 55 | asn += value 56 | if key == 'domains': 57 | value = list(value) 58 | value.sort() 59 | domains.extend(value) 60 | if key == 'hostnames': 61 | value = [host.strip() for host in list(value)] 62 | value.sort() 63 | hostnames.extend(value) 64 | if key == 'isp': 65 | isp += value 66 | if key == 'org': 67 | org += str(value) 68 | if key == 'ports': 69 | value = list(value) 70 | value.sort() 71 | ports.extend(value) 72 | if key == 'product': 73 | product += value 74 | 75 | technologies = list(set(technologies)) 76 | 77 | self.tracker[ip] = { 78 | 'asn': asn.strip(), 79 | 'domains': domains, 80 | 'hostnames': hostnames, 81 | 'ip_str': ip_str.strip(), 82 | 'isp': isp.strip(), 83 | 'org': org.strip(), 84 | 'ports': ports, 85 | 'product': product.strip(), 86 | 'server': server.strip(), 87 | 'technologies': technologies, 88 | 'title': title.strip(), 89 | } 90 | 91 | return self.tracker 92 | except exception.APIError: 93 | print(f'{ip}: Not in Shodan') 94 | self.tracker[ip] = 'Not in Shodan' 95 | except Exception as e: 96 | # print(f'Error occurred in the Shodan IP search module: {e}') 97 | self.tracker[ip] = f'Error occurred in the Shodan IP search module: {e}' 98 | finally: 99 | return self.tracker 100 | -------------------------------------------------------------------------------- /theHarvester/discovery/sitedossier.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from theHarvester.discovery.constants import get_delay 6 | from theHarvester.lib.core import AsyncFetcher, Core 7 | 8 | 9 | class 
SearchSitedossier: 10 | def __init__(self, word): 11 | self.word = word 12 | self.totalhosts = set() 13 | self.server = 'www.sitedossier.com' 14 | self.proxy = False 15 | 16 | async def do_search(self): 17 | # 2023 but this site doesn't support https... 18 | # This site seems to yield a lot of results but is a bit annoying to scrape 19 | # Hence the need for delays after each request to get the most results 20 | # Feel free to tweak the delays as needed 21 | url = f'http://{self.server}/parentdomain/{self.word}' 22 | headers = {'User-Agent': Core.get_user_agent()} 23 | response = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy) 24 | base_response = response[0] 25 | soup = BeautifulSoup(base_response, 'html.parser') 26 | # iter_counter = 1 27 | # iterations_needed = total_number // 100 28 | # iterations_needed += 1 29 | flagged_counter = 0 30 | stop_conditions = ['End of list.', 'No data currently available.'] 31 | bot_string = ( 32 | 'Our web servers have detected unusual or excessive requests ' 33 | 'from your computer or network. Please enter the unique "word"' 34 | ' below to confirm that you are a human interactively using this site.' 35 | ) 36 | if ( 37 | stop_conditions[0] not in base_response and stop_conditions[1] not in base_response 38 | ) and bot_string not in base_response: 39 | total_number = soup.find('i') 40 | total_number = int(total_number.text.strip().split(' ')[-1].replace(',', '')) 41 | hrefs = soup.find_all('a', href=True) 42 | for a in hrefs: 43 | unparsed = a['href'] 44 | if '/site/' in unparsed: 45 | subdomain = str(unparsed.split('/')[-1]).lower() 46 | self.totalhosts.add(subdomain) 47 | await asyncio.sleep(get_delay() + 15 + get_delay()) 48 | for i in range(101, total_number, 100): 49 | headers = {'User-Agent': Core.get_user_agent()} 50 | iter_url = f'http://{self.server}/parentdomain/{self.word}/{i}' 51 | print(f'My current iter_url: {iter_url}') 52 | response = await AsyncFetcher.fetch_all([iter_url], headers=headers, proxy=self.proxy) 53 | response = response[0] 54 | if stop_conditions[0] in response or stop_conditions[1] in response or flagged_counter >= 3: 55 | break 56 | if bot_string in response: 57 | new_sleep_time = get_delay() * 30 58 | print(f'Triggered a captcha for sitedossier sleeping for: {new_sleep_time} seconds') 59 | flagged_counter += 1 60 | await asyncio.sleep(new_sleep_time) 61 | response = await AsyncFetcher.fetch_all( 62 | [iter_url], 63 | headers={'User-Agent': Core.get_user_agent()}, 64 | proxy=self.proxy, 65 | ) 66 | response = response[0] 67 | if bot_string in response: 68 | new_sleep_time = get_delay() * 30 * get_delay() 69 | print( 70 | f'Still triggering a captcha, sleeping longer for: {new_sleep_time}' 71 | f' and skipping this batch: {iter_url}' 72 | ) 73 | await asyncio.sleep(new_sleep_time) 74 | flagged_counter += 1 75 | if flagged_counter >= 3: 76 | break 77 | soup = BeautifulSoup(response, 'html.parser') 78 | hrefs = soup.find_all('a', href=True) 79 | for a in hrefs: 80 | unparsed = a['href'] 81 | if '/site/' in unparsed: 82 | subdomain = str(unparsed.split('/')[-1]).lower() 83 | self.totalhosts.add(subdomain) 84 | await asyncio.sleep(get_delay() + 15 + get_delay()) 85 | print(f'In total found: {len(self.totalhosts)}') 86 | print(self.totalhosts) 87 | else: 88 | print('Sitedossier module has triggered a captcha on first iteration, no results can be found.') 89 | print('Change IPs, manually solve the captcha, or wait before rerunning Sitedossier module') 90 | 91 | async def get_hostnames(self): 92 | return 
self.totalhosts 93 | 94 | async def process(self, proxy: bool = False) -> None: 95 | self.proxy = proxy 96 | await self.do_search() 97 | -------------------------------------------------------------------------------- /theHarvester/discovery/subdomaincenter.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher, Core 2 | 3 | 4 | class SubdomainCenter: 5 | def __init__(self, word): 6 | self.word = word 7 | self.results = set() 8 | self.server = 'https://api.subdomain.center/?domain=' 9 | self.proxy = False 10 | 11 | async def do_search(self): 12 | headers = {'User-Agent': Core.get_user_agent()} 13 | try: 14 | current_url = f'{self.server}{self.word}' 15 | resp = await AsyncFetcher.fetch_all([current_url], headers=headers, proxy=self.proxy, json=True) 16 | self.results = resp[0] 17 | self.results = {sub[4:] if sub[:4] == 'www.' and sub[4:] else sub for sub in self.results} 18 | except Exception as e: 19 | print(f'An exception has occurred in SubdomainCenter on : {e}') 20 | 21 | async def get_hostnames(self): 22 | return self.results 23 | 24 | async def process(self, proxy=False): 25 | self.proxy = proxy 26 | await self.do_search() 27 | -------------------------------------------------------------------------------- /theHarvester/discovery/subdomainfinderc99.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | import ujson 4 | from bs4 import BeautifulSoup 5 | 6 | from theHarvester.discovery.constants import get_delay 7 | from theHarvester.lib.core import AsyncFetcher, Core 8 | from theHarvester.parsers import myparser 9 | 10 | 11 | class SearchSubdomainfinderc99: 12 | def __init__(self, word) -> None: 13 | self.word = word 14 | self.total_results: set = set() 15 | self.proxy = False 16 | # TODO add api support 17 | self.server = 'https://subdomainfinder.c99.nl/' 18 | self.totalresults = '' 19 | 20 | async def do_search(self) -> None: 21 | # Based on https://gist.github.com/th3gundy/bc83580cbe04031e9164362b33600962 22 | headers = {'User-Agent': Core.get_user_agent()} 23 | resp = await AsyncFetcher.fetch_all([self.server], headers=headers, proxy=self.proxy) 24 | data = await self.get_csrf_params(resp[0]) 25 | 26 | data['scan_subdomains'] = '' 27 | data['domain'] = self.word 28 | data['privatequery'] = 'on' 29 | await asyncio.sleep(get_delay()) 30 | second_resp = await AsyncFetcher.post_fetch(self.server, headers=headers, proxy=self.proxy, data=ujson.dumps(data)) 31 | 32 | # print(second_resp) 33 | self.totalresults += second_resp 34 | # y = await self.get_hostnames() 35 | # print(list(sorted(y))) 36 | # print(f'Found: {len(y)} subdomains') 37 | 38 | # regex = r"value='(https://subdomainfinder\.c99\.nl/scans/\d{4}-\d{2}-\d{2}/" + self.word + r")'" 39 | # match = re.search(regex, second_resp) 40 | # if match: 41 | # print(match.group(1)) 42 | 43 | async def get_hostnames(self): 44 | rawres = myparser.Parser(self.totalresults, self.word) 45 | return await rawres.hostnames() 46 | 47 | async def process(self, proxy: bool = False) -> None: 48 | self.proxy = proxy 49 | await self.do_search() 50 | 51 | @staticmethod 52 | async def get_csrf_params(data): 53 | csrf_params = {} 54 | html = BeautifulSoup(data, 'html.parser').find('div', {'class': 'input-group'}) 55 | for c in html.find_all('input'): 56 | try: 57 | csrf_params[c.get('name')] = c.get('value') 58 | except Exception: 59 | continue 60 | 61 | return csrf_params 62 | 
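The discovery modules above share a small asynchronous contract: instantiate the class with the target word, await process() (optionally with proxy=True), then collect results from get_hostnames() and, where a module provides them, get_ips() or get_emails(). The snippet below is a minimal standalone sketch of driving the two keyless modules shown above from a plain script; the example domain and the run_modules wrapper are illustrative additions, not part of theHarvester itself.

import asyncio

from theHarvester.discovery.subdomaincenter import SubdomainCenter
from theHarvester.discovery.subdomainfinderc99 import SearchSubdomainfinderc99


async def run_modules(word: str) -> set:
    # Each module performs its own fetching inside process(); results come back via the getters.
    hosts: set = set()
    for module in (SubdomainCenter(word), SearchSubdomainfinderc99(word)):
        await module.process(proxy=False)
        hosts.update(await module.get_hostnames())
    return hosts


if __name__ == '__main__':
    # 'example.com' is a placeholder target; substitute the domain being assessed.
    print(sorted(asyncio.run(run_modules('example.com'))))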
-------------------------------------------------------------------------------- /theHarvester/discovery/takeover.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import defaultdict 3 | from random import shuffle 4 | 5 | import ujson 6 | 7 | from theHarvester.lib.core import AsyncFetcher, Core 8 | 9 | 10 | class TakeOver: 11 | def __init__(self, hosts) -> None: 12 | # NOTE THIS MODULE IS ACTIVE RECON 13 | self.hosts = hosts 14 | self.proxy = False 15 | self.fingerprints: dict[str, str] = dict() 16 | # https://stackoverflow.com/questions/33080869/python-how-to-create-a-dict-of-dict-of-list-with-defaultdict 17 | self.results: defaultdict[str, list] = defaultdict() 18 | 19 | async def populate_fingerprints(self): 20 | # Thank you to https://github.com/EdOverflow/can-i-take-over-xyz for these fingerprints 21 | populate_url = 'https://raw.githubusercontent.com/EdOverflow/can-i-take-over-xyz/master/fingerprints.json' 22 | headers = {'User-Agent': Core.get_user_agent()} 23 | response = await AsyncFetcher.fetch_all([populate_url], headers=headers) 24 | try: 25 | resp = response[0] 26 | unparsed_json = ujson.loads(resp) 27 | for unparsed_fingerprint in unparsed_json: 28 | if unparsed_fingerprint['service'] in ['Smugsmug']: 29 | # Subdomain must be in format domain.smugsmug.com 30 | # This will never happen as subdomains are parsed and filtered to be in format of *.word.com 31 | continue 32 | if unparsed_fingerprint['status'] == 'Vulnerable' or unparsed_fingerprint['status'] == 'Edge case': 33 | self.fingerprints[unparsed_fingerprint['fingerprint']] = unparsed_fingerprint['service'] 34 | except Exception as e: 35 | print(f'An exception has occurred populating takeover fingerprints: {e}, defaulting to static list') 36 | self.fingerprints = { 37 | "'Trying to access your account?'": 'Campaign Monitor', 38 | '404 Not Found': 'Fly.io', 39 | '404 error unknown site!': 'Pantheon', 40 | 'Do you want to register *.wordpress.com?': 'Wordpress', 41 | 'Domain uses DO name serves with no records in DO.': 'Digital Ocean', 42 | "It looks like you may have taken a wrong turn somewhere. Don't worry...it happens to all of us.": 'LaunchRock', 43 | 'No Site For Domain': 'Kinsta', 44 | 'No settings were found for this company:': 'Help Scout', 45 | 'Project doesnt exist... yet!': 'Readme.io', 46 | 'Repository not found': 'Bitbucket', 47 | 'The feed has not been found.': 'Feedpress', 48 | 'No such app': 'Heroku', 49 | 'The specified bucket does not exist': 'AWS/S3', 50 | 'The thing you were looking for is no longer here, or never was': 'Ghost', 51 | "There isn't a Github Pages site here.": 'Github', 52 | 'This UserVoice subdomain is currently available!': 'UserVoice', 53 | "Uh oh. 
That page doesn't exist.": 'Intercom', 54 | "We could not find what you're looking for.": 'Help Juice', 55 | "Whatever you were looking for doesn't currently exist at this address": 'Tumblr', 56 | 'is not a registered InCloud YouTrack': 'JetBrains', 57 | 'page not found': 'Uptimerobot', 58 | 'project not found': 'Surge.sh', 59 | } 60 | 61 | async def check(self, url, resp) -> None: 62 | # Simple function that takes response and checks if any fingerprints exist 63 | # If a fingerprint exists figures out which one and prints it out 64 | regex = re.compile('(?=(' + '|'.join(map(re.escape, list(self.fingerprints.keys()))) + '))') 65 | # Sanitize fingerprints 66 | matches = re.findall(regex, resp) 67 | matches = list(set(matches)) 68 | for match in matches: 69 | print(f'\t\033[91m Takeover detected: {url}\033[1;32;40m') 70 | if match in self.fingerprints.keys(): 71 | # Validation check as to not error out 72 | service = self.fingerprints[match] 73 | print(f'\t\033[91m Type of takeover is: {service} with match: {match}\033[1;32;40m') 74 | self.results[url].append({match: service}) 75 | 76 | async def do_take(self) -> None: 77 | try: 78 | if len(self.hosts) > 0: 79 | # Returns a list of tuples in this format: (url, response) 80 | # Filter out responses whose responses are empty strings (indicates errored) 81 | https_hosts = [f'https://{host}' for host in self.hosts] 82 | http_hosts = [f'http://{host}' for host in self.hosts] 83 | all_hosts = https_hosts + http_hosts 84 | shuffle(all_hosts) 85 | resps: list = await AsyncFetcher.fetch_all(all_hosts, takeover=True, proxy=self.proxy) 86 | for url, resp in tuple(resp for resp in resps if len(resp[1]) >= 1): 87 | await self.check(url, resp) 88 | else: 89 | return 90 | except Exception as e: 91 | print(e) 92 | 93 | async def process(self, proxy: bool = False) -> None: 94 | self.proxy = proxy 95 | await self.do_take() 96 | 97 | async def get_takeover_results(self): 98 | return self.results 99 | -------------------------------------------------------------------------------- /theHarvester/discovery/threatminer.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher 2 | 3 | 4 | class SearchThreatminer: 5 | def __init__(self, word) -> None: 6 | self.word = word 7 | self.totalhosts: set = set() 8 | self.totalips: set = set() 9 | self.proxy = False 10 | 11 | async def do_search(self) -> None: 12 | url = f'https://api.threatminer.org/v2/domain.php?q={self.word}&rt=5' 13 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 14 | self.totalhosts = {host for host in response[0]['results']} 15 | second_url = f'https://api.threatminer.org/v2/domain.php?q={self.word}&rt=2' 16 | secondresp = await AsyncFetcher.fetch_all([second_url], json=True, proxy=self.proxy) 17 | try: 18 | self.totalips = {resp['ip'] for resp in secondresp[0]['results']} 19 | except TypeError: 20 | pass 21 | 22 | async def get_hostnames(self) -> set: 23 | return self.totalhosts 24 | 25 | async def get_ips(self) -> set: 26 | return self.totalips 27 | 28 | async def process(self, proxy: bool = False) -> None: 29 | self.proxy = proxy 30 | await self.do_search() 31 | -------------------------------------------------------------------------------- /theHarvester/discovery/tombasearch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, 
Core 5 | 6 | 7 | class SearchTomba: 8 | def __init__(self, word, limit, start) -> None: 9 | self.word = word 10 | self.limit = limit 11 | self.limit = 10 if limit > 10 else limit 12 | self.start = start 13 | self.key = Core.tomba_key() 14 | if self.key[0] is None or self.key[1] is None: 15 | raise MissingKey('Tomba Key and/or Secret') 16 | self.total_results = '' 17 | self.counter = start 18 | self.database = f'https://api.tomba.io/v1/domain-search?domain={self.word}&limit=10' 19 | self.proxy = False 20 | self.hostnames: list = [] 21 | self.emails: list = [] 22 | 23 | async def do_search(self) -> None: 24 | # First determine if a user account is not a free account, this call is free 25 | is_free = True 26 | headers = { 27 | 'User-Agent': Core.get_user_agent(), 28 | 'X-Tomba-Key': self.key[0], 29 | 'X-Tomba-Secret': self.key[1], 30 | } 31 | acc_info_url = 'https://api.tomba.io/v1/me' 32 | response = await AsyncFetcher.fetch_all([acc_info_url], headers=headers, json=True) 33 | is_free = ( 34 | is_free 35 | if 'name' in response[0]['data']['pricing'].keys() and response[0]['data']['pricing']['name'].lower() == 'free' 36 | else False 37 | ) 38 | # Extract the total number of requests that are available for an account 39 | 40 | total_requests_avail = ( 41 | response[0]['data']['requests']['domains']['available'] - response[0]['data']['requests']['domains']['used'] 42 | ) 43 | 44 | if is_free: 45 | response = await AsyncFetcher.fetch_all([self.database], headers=headers, proxy=self.proxy, json=True) 46 | self.emails, self.hostnames = await self.parse_resp(json_resp=response[0]) 47 | else: 48 | # Determine the total number of emails that are available 49 | # As the most emails you can get within one query are 100 50 | # This is only done where paid accounts are in play 51 | tomba_counter = f'https://api.tomba.io/v1/email-count?domain={self.word}' 52 | response = await AsyncFetcher.fetch_all([tomba_counter], headers=headers, proxy=self.proxy, json=True) 53 | total_number_reqs = response[0]['data']['total'] // 100 54 | # Parse out meta field within initial JSON response to determine the total number of results 55 | if total_requests_avail < total_number_reqs: 56 | print('WARNING: The account does not have enough requests to gather all the emails.') 57 | print(f'Total requests available: {total_requests_avail}, total requests needed to be made: {total_number_reqs}') 58 | print( 59 | 'RETURNING current results, If you still wish to run this module despite the current results, please comment out the "if request" line.' 
60 | ) 61 | return 62 | self.limit = 100 63 | # max number of emails you can get per request 64 | # increments of max number with page determining where to start 65 | # See docs for more details: https://developer.tomba.io/#domain-search 66 | for page in range(0, total_number_reqs + 1): 67 | req_url = f'https://api.tomba.io/v1/domain-search?domain={self.word}&limit={self.limit}&page={page}' 68 | response = await AsyncFetcher.fetch_all([req_url], headers=headers, proxy=self.proxy, json=True) 69 | temp_emails, temp_hostnames = await self.parse_resp(response[0]) 70 | self.emails.extend(temp_emails) 71 | self.hostnames.extend(temp_hostnames) 72 | await asyncio.sleep(1) 73 | 74 | async def parse_resp(self, json_resp): 75 | emails = list(sorted({email['email'] for email in json_resp['data']['emails']})) 76 | domains = list( 77 | sorted( 78 | { 79 | source['website_url'] 80 | for email in json_resp['data']['emails'] 81 | for source in email['sources'] 82 | if self.word in source['website_url'] 83 | } 84 | ) 85 | ) 86 | return emails, domains 87 | 88 | async def process(self, proxy: bool = False) -> None: 89 | self.proxy = proxy 90 | await self.do_search() # Only need to do it once. 91 | 92 | async def get_emails(self): 93 | return self.emails 94 | 95 | async def get_hostnames(self): 96 | return self.hostnames 97 | -------------------------------------------------------------------------------- /theHarvester/discovery/urlscan.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher 2 | 3 | 4 | class SearchUrlscan: 5 | def __init__(self, word) -> None: 6 | self.word = word 7 | self.totalhosts: set = set() 8 | self.totalips: set = set() 9 | self.interestingurls: set = set() 10 | self.totalasns: set = set() 11 | self.proxy = False 12 | 13 | async def do_search(self) -> None: 14 | url = f'https://urlscan.io/api/v1/search/?q=domain:{self.word}' 15 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 16 | resp = response[0] 17 | self.totalhosts = {f'{page["page"]["domain"]}' for page in resp['results']} 18 | self.totalips = {f'{page["page"]["ip"]}' for page in resp['results'] if 'ip' in page['page'].keys()} 19 | self.interestingurls = { 20 | f'{page["page"]["url"]}' 21 | for page in resp['results'] 22 | if self.word in page['page']['url'] and 'url' in page['page'].keys() 23 | } 24 | self.totalasns = {f'{page["page"]["asn"]}' for page in resp['results'] if 'asn' in page['page'].keys()} 25 | 26 | async def get_hostnames(self) -> set: 27 | return self.totalhosts 28 | 29 | async def get_ips(self) -> set: 30 | return self.totalips 31 | 32 | async def get_interestingurls(self) -> set: 33 | return self.interestingurls 34 | 35 | async def get_asns(self) -> set: 36 | return self.totalasns 37 | 38 | async def process(self, proxy: bool = False) -> None: 39 | self.proxy = proxy 40 | await self.do_search() 41 | -------------------------------------------------------------------------------- /theHarvester/discovery/venacussearch.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import aiohttp 4 | 5 | from theHarvester.discovery.constants import MissingKey 6 | from theHarvester.lib.core import Core 7 | from theHarvester.parsers import venacusparser 8 | 9 | 10 | class SearchVenacus: 11 | def __init__(self, word: str, limit=1000, offset_doc=0) -> None: 12 | self.word = word 13 | self.key = Core.venacus_key() 14 | if self.key is None: 15 | raise 
MissingKey('Venacus') 16 | self.base_url = 'https://api.venacus.com' 17 | self.results: list[dict[str, Any]] = [] 18 | self.parsed: dict[str, Any] = {} 19 | self.proxy = False 20 | self.offset_doc = offset_doc 21 | self.offset_in_doc = 0 22 | self.ai = False 23 | self.more = True 24 | self.limit = limit 25 | 26 | async def do_search(self) -> None: 27 | total_results = [] 28 | result_count = 0 29 | 30 | try: 31 | headers = { 32 | 'Authorization': f'Bearer {self.key}', 33 | 'User-Agent': f'{Core.get_user_agent()}-theHarvester', 34 | } 35 | 36 | async with aiohttp.ClientSession() as session: 37 | while self.more and result_count < self.limit: 38 | query = { 39 | 'q': self.word, 40 | 'offset_doc': self.offset_doc, 41 | 'offset_in_doc': self.offset_in_doc, 42 | 'limit': 100, 43 | 'ai': 'true' if self.ai else 'false', 44 | } 45 | 46 | async with session.get(f'{self.base_url}/v1/search/', headers=headers, params=query) as total_resp: 47 | search_data = await total_resp.json() 48 | current_results = search_data.get('data', []) 49 | 50 | if not current_results: 51 | print('No more results found.') 52 | break 53 | 54 | total_results.extend(current_results) 55 | result_count += len(current_results) 56 | 57 | self.offset_doc = search_data.get('offset_doc', 0) 58 | self.offset_in_doc = search_data.get('offset_in_doc', 0) 59 | 60 | self.more = search_data.get('more', False) 61 | 62 | self.results = total_results 63 | if not self.results: 64 | print('No results found.') 65 | 66 | except Exception as e: 67 | print(f'An exception has occurred in Venacus: {e}') 68 | 69 | async def process(self, proxy: bool = False): 70 | self.proxy = proxy 71 | await self.do_search() 72 | parser = venacusparser.Parser() 73 | self.parsed = await parser.parse_text_tokens(self.results) # type: ignore 74 | 75 | async def get_people(self) -> list[dict[str, str]]: 76 | if 'people' not in self.parsed: 77 | return [] 78 | return self.parsed['people'] 79 | 80 | async def get_emails(self) -> set[str]: 81 | if 'emails' not in self.parsed: 82 | return set() 83 | return self.parsed['emails'] 84 | 85 | async def get_ips(self) -> set[str]: 86 | if 'ips' not in self.parsed: 87 | return set() 88 | return self.parsed['ips'] 89 | 90 | async def get_interestingurls(self) -> set[str]: 91 | if 'urls' not in self.parsed: 92 | return set() 93 | return self.parsed['urls'] 94 | -------------------------------------------------------------------------------- /theHarvester/discovery/virustotal.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchVirustotal: 8 | def __init__(self, word) -> None: 9 | self.key = Core.virustotal_key() 10 | if self.key is None: 11 | raise MissingKey('virustotal') 12 | self.word = word 13 | self.proxy = False 14 | self.hostnames: list = [] 15 | 16 | async def do_search(self) -> None: 17 | # TODO determine if more endpoints can yield useful info given a domain 18 | # based on: https://developers.virustotal.com/reference/domains-relationships 19 | # base_url = "https://www.virustotal.com/api/v3/domains/domain/subdomains?limit=40" 20 | headers = { 21 | 'User-Agent': Core.get_user_agent(), 22 | 'Accept': 'application/json', 23 | 'x-apikey': self.key, 24 | } 25 | base_url = f'https://www.virustotal.com/api/v3/domains/{self.word}/subdomains?limit=40' 26 | cursor = '' 27 | count = 0 28 | fail_counter = 0 29 | counter = 0 30 | breakcon = 
False 31 | while True: 32 | if breakcon: 33 | break 34 | # rate limit is 4 per minute 35 | # TODO add timer logic if proven to be needed 36 | # in the meantime sleeping 16 seconds should eliminate hitting the rate limit 37 | # in case rate limit is hit, fail counter exists and sleep for 65 seconds 38 | send_url = base_url + '&cursor=' + cursor if cursor != '' and len(cursor) > 2 else base_url 39 | responses = await AsyncFetcher.fetch_all([send_url], headers=headers, proxy=self.proxy, json=True) 40 | jdata = responses[0] 41 | if 'data' not in jdata.keys(): 42 | await asyncio.sleep(60 + 5) 43 | fail_counter += 1 44 | if 'meta' in jdata.keys(): 45 | cursor = jdata['meta']['cursor'] if 'cursor' in jdata['meta'].keys() else '' 46 | if len(cursor) == 0 and 'data' in jdata.keys(): 47 | # if cursor no longer is within the meta field have hit last entry 48 | breakcon = True 49 | count += jdata['meta']['count'] 50 | if count == 0 or fail_counter >= 2: 51 | break 52 | if 'data' in jdata.keys(): 53 | data = jdata['data'] 54 | self.hostnames.extend(await self.parse_hostnames(data, self.word)) 55 | counter += 1 56 | await asyncio.sleep(16) 57 | self.hostnames = list(sorted(set(self.hostnames))) 58 | # verify domains such as x.x.com.multicdn.x.com are parsed properly 59 | self.hostnames = [ 60 | host for host in self.hostnames if ((len(host.split('.')) >= 3) and host.split('.')[-2] == self.word.split('.')[-2]) 61 | ] 62 | 63 | async def get_hostnames(self) -> list: 64 | return self.hostnames 65 | 66 | @staticmethod 67 | async def parse_hostnames(data, word): 68 | total_subdomains = set() 69 | for attribute in data: 70 | total_subdomains.add(attribute['id'].replace('"', '').replace('www.', '')) 71 | attributes = attribute['attributes'] 72 | total_subdomains.update( 73 | { 74 | value['value'].replace('"', '').replace('www.', '') 75 | for value in attributes['last_dns_records'] 76 | if word in value['value'] 77 | } 78 | ) 79 | if 'last_https_certificate' in attributes.keys(): 80 | total_subdomains.update( 81 | { 82 | value.replace('"', '').replace('www.', '') 83 | for value in attributes['last_https_certificate']['extensions']['subject_alternative_name'] 84 | if word in value 85 | } 86 | ) 87 | total_subdomains = list(sorted(total_subdomains)) 88 | # Other false positives may occur over time and yes there are other ways to parse this, feel free to implement 89 | # them and submit a PR or raise an issue if you run into this filtering not being enough 90 | # TODO determine if parsing 'v=spf1 include:_spf-x.acme.com include:_spf-x.acme.com' is worth parsing 91 | total_subdomains = [ 92 | x 93 | for x in total_subdomains 94 | if 'edgekey.net' not in str(x) and 'akadns.net' not in str(x) and 'include:_spf' not in str(x) 95 | ] 96 | total_subdomains.sort() 97 | return total_subdomains 98 | 99 | async def process(self, proxy: bool = False) -> None: 100 | self.proxy = proxy 101 | await self.do_search() 102 | -------------------------------------------------------------------------------- /theHarvester/discovery/whoisxml.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import MissingKey 2 | from theHarvester.lib.core import AsyncFetcher, Core 3 | 4 | 5 | class SearchWhoisXML: 6 | def __init__(self, word) -> None: 7 | self.word = word 8 | self.key = Core.whoisxml_key() 9 | if self.key is None: 10 | raise MissingKey('whoisxml') 11 | self.total_results = None 12 | self.proxy = False 13 | 14 | async def do_search(self): 15 | # 
https://subdomains.whoisxmlapi.com/api/documentation/making-requests 16 | url = 'https://subdomains.whoisxmlapi.com/api/v1' 17 | params = {'apiKey': self.key, 'domainName': self.word} 18 | response = await AsyncFetcher.fetch_all( 19 | [url], 20 | json=True, 21 | params=params, 22 | headers={'User-Agent': Core.get_user_agent()}, 23 | proxy=self.proxy, 24 | ) 25 | # Parse the response according to the example JSON structure: 26 | # {"search":"example.com.com","result":{"count":10000,"records":[{"domain":"test.example.com","firstSeen":1678169400,"lastSeen":1678169400}]}} 27 | self.total_results = [] 28 | print(response[0]) 29 | if response and response[0]: 30 | # Extract domains from the records array 31 | if 'result' in response[0] and 'records' in response[0]['result']: 32 | self.total_results = [record['domain'] for record in response[0]['result']['records']] 33 | 34 | async def get_hostnames(self): 35 | return self.total_results 36 | 37 | async def process(self, proxy: bool = False) -> None: 38 | self.proxy = proxy 39 | await self.do_search() 40 | -------------------------------------------------------------------------------- /theHarvester/discovery/yahoosearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher, Core 2 | from theHarvester.parsers import myparser 3 | 4 | 5 | class SearchYahoo: 6 | def __init__(self, word, limit) -> None: 7 | self.word = word 8 | self.total_results = '' 9 | self.server = 'search.yahoo.com' 10 | self.limit = limit 11 | self.proxy = False 12 | 13 | async def do_search(self) -> None: 14 | base_url = f'https://{self.server}/search?p=%40{self.word}&b=xx&pz=10' 15 | headers = {'Host': self.server, 'User-agent': Core.get_user_agent()} 16 | urls = [base_url.replace('xx', str(num)) for num in range(0, self.limit, 10) if num <= self.limit] 17 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 18 | for response in responses: 19 | self.total_results += response 20 | 21 | async def process(self, proxy: bool = False) -> None: 22 | self.proxy = proxy 23 | await self.do_search() 24 | 25 | async def get_emails(self): 26 | rawres = myparser.Parser(self.total_results, self.word) 27 | toparse_emails = await rawres.emails() 28 | emails = set() 29 | # strip out numbers and dashes for emails that look like xxx-xxx-xxxemail@host.tld 30 | for email in toparse_emails: 31 | email = str(email) 32 | if '-' in email and email[0].isdigit() and email.index('-') <= 9: 33 | while email[0] == '-' or email[0].isdigit(): 34 | email = email[1:] 35 | emails.add(email) 36 | return list(emails) 37 | 38 | async def get_hostnames(self, proxy: bool = False): 39 | self.proxy = proxy 40 | rawres = myparser.Parser(self.total_results, self.word) 41 | return await rawres.hostnames() 42 | -------------------------------------------------------------------------------- /theHarvester/lib/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['hostchecker'] 2 | -------------------------------------------------------------------------------- /theHarvester/lib/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester/lib/api/__init__.py -------------------------------------------------------------------------------- /theHarvester/lib/api/additional_endpoints.py: 
-------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Depends, HTTPException 2 | from pydantic import BaseModel 3 | 4 | from theHarvester.discovery.additional_apis import AdditionalAPIs 5 | from theHarvester.lib.api.auth import get_api_key 6 | 7 | router = APIRouter() 8 | 9 | 10 | class DomainRequest(BaseModel): 11 | domain: str 12 | api_keys: dict[str, str] | None = None 13 | 14 | 15 | @router.post('/breaches') 16 | async def get_breaches(request: DomainRequest, api_key: str = Depends(get_api_key)): 17 | """Get breach information for a domain using HaveIBeenPwned.""" 18 | try: 19 | apis = AdditionalAPIs(request.domain, request.api_keys or {}) 20 | results = await apis.haveibeenpwned.search_breaches(request.domain) 21 | return {'status': 'success', 'data': results} 22 | except Exception as e: 23 | raise HTTPException(status_code=500, detail=str(e)) 24 | 25 | 26 | @router.post('/leaks') 27 | async def get_leaks(request: DomainRequest, api_key: str = Depends(get_api_key)): 28 | """Get leaked credentials for a domain using Leak-Lookup.""" 29 | try: 30 | apis = AdditionalAPIs(request.domain, request.api_keys or {}) 31 | results = await apis.leaklookup.search_leaks(request.domain) 32 | return {'status': 'success', 'data': results} 33 | except Exception as e: 34 | raise HTTPException(status_code=500, detail=str(e)) 35 | 36 | 37 | @router.post('/security-score') 38 | async def get_security_score(request: DomainRequest, api_key: str = Depends(get_api_key)): 39 | """Get security scorecard for a domain.""" 40 | try: 41 | apis = AdditionalAPIs(request.domain, request.api_keys or {}) 42 | results = await apis.securityscorecard.get_domain_score(request.domain) 43 | return {'status': 'success', 'data': results} 44 | except Exception as e: 45 | raise HTTPException(status_code=500, detail=str(e)) 46 | 47 | 48 | @router.post('/tech-stack') 49 | async def get_tech_stack(request: DomainRequest, api_key: str = Depends(get_api_key)): 50 | """Get technology stack information for a domain using BuiltWith.""" 51 | try: 52 | apis = AdditionalAPIs(request.domain, request.api_keys or {}) 53 | results = await apis.builtwith.get_tech_stack(request.domain) 54 | return {'status': 'success', 'data': results} 55 | except Exception as e: 56 | raise HTTPException(status_code=500, detail=str(e)) 57 | 58 | 59 | @router.post('/all') 60 | async def get_all_info(request: DomainRequest, api_key: str = Depends(get_api_key)): 61 | """Get all additional information for a domain.""" 62 | try: 63 | apis = AdditionalAPIs(request.domain, request.api_keys or {}) 64 | results = await apis.process() 65 | return {'status': 'success', 'data': results} 66 | except Exception as e: 67 | raise HTTPException(status_code=500, detail=str(e)) 68 | -------------------------------------------------------------------------------- /theHarvester/lib/api/api.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | from fastapi import FastAPI, Header, Query, Request 5 | from fastapi.middleware.cors import CORSMiddleware 6 | from fastapi.responses import HTMLResponse, RedirectResponse, Response, UJSONResponse 7 | from slowapi import Limiter, _rate_limit_exceeded_handler 8 | from slowapi.errors import RateLimitExceeded 9 | from slowapi.util import get_remote_address 10 | from starlette.staticfiles import StaticFiles 11 | 12 | from theHarvester import __main__ 13 | from theHarvester.lib.api.additional_endpoints import router as 
additional_router 14 | 15 | limiter = Limiter(key_func=get_remote_address) 16 | app = FastAPI( 17 | title='Restful Harvest', 18 | description='Rest API for theHarvester powered by FastAPI', 19 | version='0.0.2', 20 | ) 21 | app.state.limiter = limiter 22 | app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler) # type: ignore 23 | 24 | # Add CORS middleware 25 | app.add_middleware( 26 | CORSMiddleware, 27 | allow_origins=['*'], 28 | allow_credentials=True, 29 | allow_methods=['*'], 30 | allow_headers=['*'], 31 | ) 32 | 33 | # Include additional endpoints 34 | app.include_router(additional_router, prefix='/additional', tags=['Additional APIs']) 35 | 36 | # This is where we will host files that arise if the user specifies a filename 37 | try: 38 | app.mount('/static', StaticFiles(directory='theHarvester/lib/api/static/'), name='static') 39 | except RuntimeError: 40 | static_path = os.path.expanduser('~/.local/share/theHarvester/static/') 41 | if not os.path.isdir(static_path): 42 | os.makedirs(static_path) 43 | app.mount( 44 | '/static', 45 | StaticFiles(directory=static_path), 46 | name='static', 47 | ) 48 | 49 | 50 | @app.get('/') 51 | async def root(*, user_agent: str = Header(None)) -> Response: 52 | # very basic user agent filtering 53 | if user_agent and ('gobuster' in user_agent or 'sqlmap' in user_agent or 'rustbuster' in user_agent): 54 | response = RedirectResponse(app.url_path_for('bot')) 55 | return response 56 | 57 | return HTMLResponse( 58 | """ 59 | 60 | 61 | 62 | theHarvester API 63 | 69 | 70 | 71 |
72 | 73 | 74 | theHarvester logo 75 | 76 | 77 | 78 | 79 | """ 80 | ) 81 | 82 | 83 | @app.get('/nicebot') 84 | async def bot() -> dict[str, str]: 85 | # nice bot 86 | string = {'bot': 'These are not the droids you are looking for'} 87 | return string 88 | 89 | 90 | @app.get('/sources', response_class=UJSONResponse) 91 | @limiter.limit('5/minute') 92 | async def getsources(request: Request): 93 | # Endpoint for user to query for available sources theHarvester supports 94 | # Rate limit of 5 requests per minute 95 | sources = __main__.Core.get_supportedengines() 96 | return {'sources': sources} 97 | 98 | 99 | @app.get('/dnsbrute') 100 | @limiter.limit('5/minute') 101 | async def dnsbrute( 102 | request: Request, 103 | user_agent: str = Header(None), 104 | domain: str = Query(..., description='Domain to be brute forced'), 105 | ) -> Response: 106 | # Endpoint for user to signal to do DNS brute forcing 107 | # Rate limit of 5 requests per minute 108 | # basic user agent filtering 109 | if user_agent and ('gobuster' in user_agent or 'sqlmap' in user_agent or 'rustbuster' in user_agent): 110 | response = RedirectResponse(app.url_path_for('bot')) 111 | return response 112 | dns_bruteforce = await __main__.start( 113 | argparse.Namespace( 114 | dns_brute=True, 115 | dns_lookup=False, 116 | dns_server=False, 117 | dns_tld=False, 118 | domain=domain, 119 | filename='', 120 | google_dork=False, 121 | limit=500, 122 | proxies=False, 123 | shodan=False, 124 | source=','.join([]), 125 | start=0, 126 | take_over=False, 127 | virtual_host=False, 128 | ) 129 | ) 130 | return UJSONResponse({'dns_bruteforce': dns_bruteforce}) 131 | 132 | 133 | @app.get('/query') 134 | @limiter.limit('2/minute') 135 | async def query( 136 | request: Request, 137 | dns_server: str = Query(''), 138 | user_agent: str = Header(None), 139 | dns_brute: bool = Query(False), 140 | dns_lookup: bool = Query(False), 141 | dns_tld: bool = Query(False), 142 | filename: str = Query(''), 143 | google_dork: bool = Query(False), 144 | proxies: bool = Query(False), 145 | shodan: bool = Query(False), 146 | take_over: bool = Query(False), 147 | virtual_host: bool = Query(False), 148 | source: list[str] = Query(..., description='Data sources to query comma separated with no space'), 149 | limit: int = Query(500), 150 | start: int = Query(0), 151 | domain: str = Query(..., description='Domain to be harvested'), 152 | ) -> Response: 153 | # Query function that allows user to query theHarvester rest API 154 | # Rate limit of 2 requests per minute 155 | # basic user agent filtering 156 | if user_agent and ('gobuster' in user_agent or 'sqlmap' in user_agent or 'rustbuster' in user_agent): 157 | response = RedirectResponse(app.url_path_for('bot')) 158 | return response 159 | try: 160 | ( 161 | asns, 162 | iurls, 163 | twitter_people_list, 164 | linkedin_people_list, 165 | linkedin_links, 166 | aurls, 167 | aips, 168 | aemails, 169 | ahosts, 170 | ) = await __main__.start( 171 | argparse.Namespace( 172 | dns_brute=dns_brute, 173 | dns_lookup=dns_lookup, 174 | dns_server=dns_server, 175 | dns_tld=dns_tld, 176 | domain=domain, 177 | filename=filename, 178 | google_dork=google_dork, 179 | limit=limit, 180 | proxies=proxies, 181 | shodan=shodan, 182 | source=','.join(source), 183 | start=start, 184 | take_over=take_over, 185 | virtual_host=virtual_host, 186 | ) 187 | ) 188 | 189 | return UJSONResponse( 190 | { 191 | 'asns': asns, 192 | 'interesting_urls': iurls, 193 | 'twitter_people': twitter_people_list, 194 | 'linkedin_people': linkedin_people_list, 195 | 
'linkedin_links': linkedin_links, 196 | 'trello_urls': aurls, 197 | 'ips': aips, 198 | 'emails': aemails, 199 | 'hosts': ahosts, 200 | } 201 | ) 202 | except Exception: 203 | return UJSONResponse({'exception': 'Please contact the server administrator to check the issue'}) 204 | -------------------------------------------------------------------------------- /theHarvester/lib/api/api_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example script to query theHarvester rest API, obtain results, and write out to stdout as well as an html 3 | """ 4 | 5 | import asyncio 6 | 7 | import aiohttp 8 | import netaddr 9 | 10 | 11 | async def fetch_json(session, url): 12 | async with session.get(url) as response: 13 | return await response.json() 14 | 15 | 16 | async def fetch(session, url): 17 | async with session.get(url) as response: 18 | return await response.text() 19 | 20 | 21 | async def main() -> None: 22 | """ 23 | Just a simple example of how to interact with the rest api 24 | you can easily use requests instead of aiohttp or whatever you best see fit 25 | """ 26 | url = 'http://127.0.0.1:5000' 27 | domain = 'netflix.com' 28 | query_url = f'{url}/query?limit=300&source=bing,baidu,duckduckgo,dogpile&domain={domain}' 29 | async with aiohttp.ClientSession() as session: 30 | fetched_json = await fetch_json(session, query_url) 31 | total_asns = fetched_json['asns'] 32 | interesting_urls = fetched_json['interesting_urls'] 33 | twitter_people_list_tracker = fetched_json['twitter_people'] 34 | linkedin_people_list_tracker = fetched_json['linkedin_people'] 35 | linkedin_links_tracker = fetched_json['linkedin_links'] 36 | trello_urls = fetched_json['trello_urls'] 37 | ips = fetched_json['ips'] 38 | emails = fetched_json['emails'] 39 | hosts = fetched_json['hosts'] 40 | 41 | if len(total_asns) > 0: 42 | print(f'\n[*] ASNS found: {len(total_asns)}') 43 | print('--------------------') 44 | total_asns = list(sorted(set(total_asns))) 45 | for asn in total_asns: 46 | print(asn) 47 | 48 | if len(interesting_urls) > 0: 49 | print(f'\n[*] Interesting Urls found: {len(interesting_urls)}') 50 | print('--------------------') 51 | interesting_urls = list(sorted(set(interesting_urls))) 52 | for iurl in interesting_urls: 53 | print(iurl) 54 | 55 | if len(twitter_people_list_tracker) == 0: 56 | print('\n[*] No Twitter users found.\n\n') 57 | else: 58 | if len(twitter_people_list_tracker) >= 1: 59 | print('\n[*] Twitter Users found: ' + str(len(twitter_people_list_tracker))) 60 | print('---------------------') 61 | twitter_people_list_tracker = list(sorted(set(twitter_people_list_tracker))) 62 | for usr in twitter_people_list_tracker: 63 | print(usr) 64 | 65 | if len(linkedin_people_list_tracker) == 0: 66 | print('\n[*] No LinkedIn users found.\n\n') 67 | else: 68 | if len(linkedin_people_list_tracker) >= 1: 69 | print('\n[*] LinkedIn Users found: ' + str(len(linkedin_people_list_tracker))) 70 | print('---------------------') 71 | linkedin_people_list_tracker = list(sorted(set(linkedin_people_list_tracker))) 72 | for usr in linkedin_people_list_tracker: 73 | print(usr) 74 | 75 | if len(linkedin_links_tracker) == 0: 76 | print(f'\n[*] LinkedIn Links found: {len(linkedin_links_tracker)}') 77 | linkedin_links_tracker = list(sorted(set(linkedin_links_tracker))) 78 | print('---------------------') 79 | for link in linkedin_links_tracker: 80 | print(link) 81 | 82 | length_urls = len(trello_urls) 83 | total = length_urls 84 | print('\n[*] Trello URLs found: ' + 
str(total)) 85 | print('--------------------') 86 | all_urls = list(sorted(set(trello_urls))) 87 | for url in sorted(all_urls): 88 | print(url) 89 | 90 | if len(ips) == 0: 91 | print('\n[*] No IPs found.') 92 | else: 93 | print('\n[*] IPs found: ' + str(len(ips))) 94 | print('-------------------') 95 | # use netaddr as the list may contain ipv4 and ipv6 addresses 96 | ip_list = sorted([netaddr.IPAddress(ip.strip()) for ip in set(ips)]) 97 | print('\n'.join(map(str, ip_list))) 98 | 99 | if len(emails) == 0: 100 | print('\n[*] No emails found.') 101 | else: 102 | print('\n[*] Emails found: ' + str(len(emails))) 103 | print('----------------------') 104 | all_emails = sorted(list(set(emails))) 105 | print('\n'.join(all_emails)) 106 | 107 | if len(hosts) == 0: 108 | print('\n[*] No hosts found.\n\n') 109 | else: 110 | print('\n[*] Hosts found: ' + str(len(hosts))) 111 | print('---------------------') 112 | print('\n'.join(hosts)) 113 | 114 | 115 | if __name__ == '__main__': 116 | asyncio.run(main()) 117 | -------------------------------------------------------------------------------- /theHarvester/lib/api/static/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester/lib/api/static/.gitkeep -------------------------------------------------------------------------------- /theHarvester/lib/hostchecker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Created by laramies on 2008-08-21. 4 | Revised to use aiodns & asyncio on 2019-09-23 5 | """ 6 | 7 | # Support for Python3.9 8 | from __future__ import annotations 9 | 10 | import asyncio 11 | import socket 12 | from typing import Any 13 | 14 | import aiodns 15 | 16 | 17 | class Checker: 18 | def __init__(self, hosts: list, nameservers: list) -> None: 19 | self.hosts = hosts 20 | self.realhosts: list = [] 21 | self.addresses: set = set() 22 | self.nameservers = nameservers 23 | 24 | # @staticmethod 25 | # async def query(host, resolver) -> Tuple[str, Any]: 26 | # try: 27 | # result = await resolver.gethostbyname(host, socket.AF_INET) 28 | # addresses = result.addresses 29 | # if addresses == [] or addresses is None or result is None: 30 | # return f"{host}:", tuple() 31 | # else: 32 | # return f"{host}:{', '.join(map(str, addresses))}", addresses 33 | # except Exception: 34 | # return f"{host}", tuple() 35 | 36 | @staticmethod 37 | async def resolve_host(host, resolver) -> str: 38 | try: 39 | # TODO add check for ipv6 addrs as well 40 | result = await resolver.gethostbyname(host, socket.AF_INET) 41 | addresses = result.addresses 42 | if addresses == [] or addresses is None or result is None: 43 | return f'{host}:' 44 | else: 45 | addresses = ','.join(map(str, list(sorted(set(addresses))))) 46 | # addresses = list(sorted(addresses)) 47 | return f'{host}:{addresses}' 48 | except Exception: 49 | return f'{host}:' 50 | 51 | # https://stackoverflow.com/questions/312443/how-do-i-split-a-list-into-equally-sized-chunks 52 | @staticmethod 53 | def chunks(lst, n): 54 | """Yield successive n-sized chunks from lst.""" 55 | for i in range(0, len(lst), n): 56 | yield lst[i : i + n] 57 | 58 | async def query_all(self, resolver, hosts) -> list[Any]: 59 | # TODO chunk list into 50 pieces regardless of IPs and subnets 60 | results = await asyncio.gather(*[asyncio.create_task(self.resolve_host(host, resolver)) for host in hosts]) 61 | return results 62 
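    # Illustrative usage (a minimal sketch; the host list and empty nameserver
    # list below are placeholder values): the class is normally driven through
    # check(), defined just below, which builds an aiodns resolver, fans
    # resolve_host() out over 50-host chunks via query_all(), and returns
    # (all_results, realhosts, addresses).
    #
    #     import asyncio
    #     from theHarvester.lib.hostchecker import Checker
    #
    #     checker = Checker(['example.com', 'www.example.com'], nameservers=[])
    #     resolved, hosts, ips = asyncio.run(checker.check())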
| 63 | async def check(self): 64 | loop = asyncio.get_event_loop() 65 | resolver = ( 66 | aiodns.DNSResolver(loop=loop, timeout=8) 67 | if len(self.nameservers) == 0 68 | else aiodns.DNSResolver(loop=loop, timeout=8, nameservers=self.nameservers) 69 | ) 70 | all_results = set() 71 | for chunk in self.chunks(self.hosts, 50): 72 | # TODO split this to get IPs added total ips 73 | results = await self.query_all(resolver, chunk) 74 | all_results.update(results) 75 | for pair in results: 76 | host, addresses = pair.split(':') 77 | self.realhosts.append(host) 78 | self.addresses.update({addr for addr in addresses.split(',')}) 79 | # address may be a list of ips 80 | # and do a set comprehension to remove duplicates 81 | self.realhosts.sort() 82 | self.addresses = list(self.addresses) 83 | all_results = list(sorted(all_results)) 84 | return all_results, self.realhosts, self.addresses 85 | -------------------------------------------------------------------------------- /theHarvester/lib/version.py: -------------------------------------------------------------------------------- 1 | VERSION = '4.8.0' 2 | 3 | 4 | def version() -> str: 5 | return VERSION 6 | -------------------------------------------------------------------------------- /theHarvester/parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester/parsers/__init__.py -------------------------------------------------------------------------------- /theHarvester/parsers/intelxparser.py: -------------------------------------------------------------------------------- 1 | class Parser: 2 | def __init__(self) -> None: 3 | self.emails: set = set() 4 | self.hosts: set = set() 5 | 6 | async def parse_dictionaries(self, results: dict) -> tuple: 7 | """ 8 | Parse method to parse json results 9 | :param results: Dictionary containing a list of dictionaries known as selectors 10 | :return: tuple of emails and hosts 11 | """ 12 | if results is not None: 13 | for dictionary in results['selectors']: 14 | field = dictionary['selectorvalue'] 15 | if '@' in field: 16 | self.emails.add(field) 17 | else: 18 | field = str(field) 19 | if 'http' in field or 'https' in field: 20 | if field[:5] == 'https': 21 | field = field[8:] 22 | else: 23 | field = field[7:] 24 | self.hosts.add(field.replace(')', '').replace(',', '')) 25 | return self.emails, self.hosts 26 | return None, None 27 | -------------------------------------------------------------------------------- /theHarvester/parsers/myparser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections.abc import Set 3 | 4 | 5 | class Parser: 6 | def __init__(self, results, word) -> None: 7 | self.results = results 8 | self.word = word 9 | self.temp: list = [] 10 | 11 | async def genericClean(self) -> None: 12 | self.results = ( 13 | self.results.replace('', '') 14 | .replace('', '') 15 | .replace('', '') 16 | .replace('', '') 17 | .replace('%3a', '') 18 | .replace('', '') 19 | .replace('', '') 20 | .replace('', '') 21 | .replace('', '') 22 | ) 23 | 24 | for search in ( 25 | '<', 26 | '>', 27 | ':', 28 | '=', 29 | ';', 30 | '&', 31 | '%3A', 32 | '%3D', 33 | '%3C', 34 | '%2f', 35 | '/', 36 | '\\', 37 | ): 38 | self.results = self.results.replace(search, ' ') 39 | 40 | async def urlClean(self) -> None: 41 | self.results = self.results.replace('', '').replace('', '').replace('%2f', '').replace('%3a', '') 
42 | for search in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C'): 43 | self.results = self.results.replace(search, ' ') 44 | 45 | async def emails(self): 46 | await self.genericClean() 47 | # Local part is required, charset is flexible. 48 | # https://tools.ietf.org/html/rfc6531 (removed * and () as they provide FP mostly) 49 | reg_emails = re.compile(r'[a-zA-Z0-9.\-_+#~!$&\',;=:]+' + '@' + '[a-zA-Z0-9.-]*' + self.word.replace('www.', '')) 50 | self.temp = reg_emails.findall(self.results) 51 | emails = await self.unique() 52 | true_emails = { 53 | ( 54 | str(email)[1:].lower().strip() 55 | if len(str(email)) > 1 and str(email)[0] == '.' 56 | else len(str(email)) > 1 and str(email).lower().strip() 57 | ) 58 | for email in emails 59 | } 60 | # if email starts with dot shift email string and make sure all emails are lowercase 61 | return true_emails 62 | 63 | async def fileurls(self, file) -> list: 64 | urls: list = [] 65 | reg_urls = re.compile('(.*?)') 92 | temp = reg_hosts.findall(self.results) 93 | for iteration in temp: 94 | if iteration.count(':'): 95 | res = iteration.split(':')[1].split('/')[2] 96 | else: 97 | res = iteration.split('/')[0] 98 | self.temp.append(res) 99 | hostnames = await self.unique() 100 | return hostnames 101 | 102 | async def set(self): 103 | reg_sets = re.compile(r'>[a-zA-Z\d]*') 104 | self.temp = reg_sets.findall(self.results) 105 | sets = [] 106 | for iteration in self.temp: 107 | delete = iteration.replace('>', '') 108 | delete = delete.replace(' Set[str]: 113 | found = re.finditer(r'(http|https)://(www\.)?trello.com/([a-zA-Z\d\-_\.]+/?)*', self.results) 114 | urls = {match.group().strip() for match in found} 115 | return urls 116 | 117 | async def unique(self) -> list: 118 | return list(set(self.temp)) 119 | -------------------------------------------------------------------------------- /theHarvester/parsers/securitytrailsparser.py: -------------------------------------------------------------------------------- 1 | class Parser: 2 | def __init__(self, word, text) -> None: 3 | self.word = word 4 | self.text = text 5 | self.hostnames: set = set() 6 | self.ips: set = set() 7 | 8 | async def parse_text(self) -> tuple[set, set]: 9 | sub_domain_flag = 0 10 | self.text = str(self.text).splitlines() 11 | # Split lines to get a list of lines. 12 | for index in range(0, len(self.text)): 13 | line = self.text[index].strip() 14 | if '"ip":' in line: 15 | # Extract IP. 16 | ip = '' 17 | for ch in line[7:]: 18 | if ch == '"': 19 | break 20 | else: 21 | ip += ch 22 | self.ips.add(ip) 23 | elif '"subdomains":' in line: 24 | # subdomains start here so set flag to 1 25 | sub_domain_flag = 1 26 | continue 27 | elif sub_domain_flag > 0: 28 | if ']' in line: 29 | sub_domain_flag = 0 30 | else: 31 | if 'www' in self.word: 32 | self.word = str(self.word).replace('www.', '').replace('www', '') 33 | # Remove www from word if entered 34 | self.hostnames.add(str(line).replace('"', '').replace(',', '') + '.' 
+ self.word) 35 | else: 36 | continue 37 | return self.ips, self.hostnames 38 | -------------------------------------------------------------------------------- /theHarvester/parsers/venacusparser.py: -------------------------------------------------------------------------------- 1 | import enum 2 | from collections.abc import Mapping 3 | from typing import Any 4 | 5 | 6 | class TokenTypesEnum(str, enum.Enum): 7 | ID = 'id' 8 | FIRSTNAME = 'firstname' 9 | LASTNAME = 'lastname' 10 | EMAIL = 'email' 11 | DOB = 'dob' 12 | URL = 'url' 13 | PHONE = 'phone' 14 | DATE = 'date' 15 | TIME = 'time' 16 | IP = 'ip_address' 17 | HASH = 'hash' 18 | PASSWORD = 'password' 19 | ADDRESS = 'address' 20 | COMPANY = 'company' 21 | JOB_TITLE = 'job_title' 22 | USERNAME = 'username' 23 | COUNTRY = 'country' 24 | CITY = 'city' 25 | STATE = 'state' 26 | ZIP_CODE = 'zip_code' 27 | CURRENCY = 'currency' 28 | INDUSTRY = 'industry' 29 | DEPARTMENT = 'department' 30 | ROLE = 'role' 31 | 32 | 33 | class Parser: 34 | def __init__(self) -> None: 35 | self.parsed_data: dict[str, set[str]] = {} 36 | self.people: list[dict[str, str]] = [] 37 | 38 | async def parse_text_tokens(self, results: list[dict[str, Any]]) -> Mapping[str, set[str] | list[dict[str, str]]]: 39 | """ 40 | Extracts different types of information from the recognized text tokens 41 | """ 42 | if not results: 43 | return {'people': set(), 'emails': set(), 'ips': set(), 'urls': set()} 44 | 45 | for res in results: 46 | person: dict[str, str] | None = None 47 | for token in res['tokens']: 48 | if token['type'] == TokenTypesEnum.EMAIL: 49 | if 'emails' not in self.parsed_data: 50 | self.parsed_data['emails'] = set() 51 | self.parsed_data['emails'].add(token['value']) 52 | person = person or {} 53 | person['email'] = token['value'] 54 | elif token['type'] == TokenTypesEnum.IP: 55 | if 'ips' not in self.parsed_data: 56 | self.parsed_data['ips'] = set() 57 | self.parsed_data['ips'].add(token['value']) 58 | elif token['type'] == TokenTypesEnum.URL: 59 | if 'urls' not in self.parsed_data: 60 | self.parsed_data['urls'] = set() 61 | self.parsed_data['urls'].add(token['value']) 62 | elif token['type'] == TokenTypesEnum.FIRSTNAME: 63 | person = person or {} 64 | person['firstname'] = token['value'] 65 | elif token['type'] == TokenTypesEnum.LASTNAME: 66 | person = person or {} 67 | person['lastname'] = token['value'] 68 | elif token['type'] == TokenTypesEnum.COMPANY: 69 | person = person or {} 70 | person['company'] = token['value'] 71 | elif token['type'] == TokenTypesEnum.CITY: 72 | person = person or {} 73 | person['city'] = token['value'] 74 | elif token['type'] == TokenTypesEnum.STATE: 75 | person = person or {} 76 | person['state'] = token['value'] 77 | elif token['type'] == TokenTypesEnum.COUNTRY: 78 | person = person or {} 79 | person['country'] = token['value'] 80 | elif token['type'] == TokenTypesEnum.ZIP_CODE: 81 | person = person or {} 82 | person['zip_code'] = token['value'] 83 | elif token['type'] == TokenTypesEnum.PHONE: 84 | person = person or {} 85 | person['phone'] = token['value'] 86 | elif token['type'] == TokenTypesEnum.ADDRESS: 87 | person = person or {} 88 | person['address'] = token['value'] 89 | elif token['type'] == TokenTypesEnum.ROLE: 90 | person = person or {} 91 | person['role'] = token['value'] 92 | elif token['type'] == TokenTypesEnum.DOB: 93 | person = person or {} 94 | person['dob'] = token['value'] 95 | elif token['type'] == TokenTypesEnum.JOB_TITLE: 96 | person = person or {} 97 | person['job_title'] = token['value'] 98 | elif 
token['type'] == TokenTypesEnum.INDUSTRY: 99 | person = person or {} 100 | person['industry'] = token['value'] 101 | elif token['type'] == TokenTypesEnum.DEPARTMENT: 102 | person = person or {} 103 | person['department'] = token['value'] 104 | 105 | if person: 106 | for key in person: 107 | if key != 'email': 108 | self.people.append(person) 109 | break 110 | 111 | if self.people: 112 | self.parsed_data['people'] = self.people # type: ignore 113 | 114 | return self.parsed_data 115 | -------------------------------------------------------------------------------- /theHarvester/restfulHarvest.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import uvicorn 4 | 5 | 6 | def main(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument( 9 | '-H', 10 | '--host', 11 | default='127.0.0.1', 12 | help='IP address to listen on default is 127.0.0.1', 13 | ) 14 | parser.add_argument( 15 | '-p', 16 | '--port', 17 | default=5000, 18 | help='Port to bind the web server to, default is 5000', 19 | type=int, 20 | ) 21 | parser.add_argument( 22 | '-l', 23 | '--log-level', 24 | default='info', 25 | help='Set logging level, default is info but [critical|error|warning|info|debug|trace] can be set', 26 | ) 27 | parser.add_argument( 28 | '-r', 29 | '--reload', 30 | default=False, 31 | help='Enable automatic reload used during development of the api', 32 | action='store_true', 33 | ) 34 | 35 | args: argparse.Namespace = parser.parse_args() 36 | uvicorn.run( 37 | 'theHarvester.lib.api.api:app', 38 | host=args.host, 39 | port=args.port, 40 | log_level=args.log_level, 41 | reload=args.reload, 42 | ) 43 | 44 | 45 | if __name__ == '__main__': 46 | main() 47 | -------------------------------------------------------------------------------- /theHarvester/screenshot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester/screenshot/__init__.py -------------------------------------------------------------------------------- /theHarvester/screenshot/screenshot.py: -------------------------------------------------------------------------------- 1 | """ 2 | Screenshot module that utilizes playwright to asynchronously 3 | take screenshots 4 | """ 5 | 6 | import os 7 | import ssl 8 | import sys 9 | from collections.abc import Collection 10 | from datetime import datetime 11 | 12 | import aiohttp 13 | import certifi 14 | from playwright.async_api import async_playwright 15 | 16 | 17 | class ScreenShotter: 18 | def __init__(self, output) -> None: 19 | self.output = output 20 | self.slash = '\\' if 'win' in sys.platform else '/' 21 | self.slash = '' if (self.output[-1] == '\\' or self.output[-1] == '/') else self.slash 22 | 23 | def verify_path(self) -> bool: 24 | try: 25 | if not os.path.isdir(self.output): 26 | answer = input('[+] The output path you have entered does not exist would you like to create it (y/n): ') 27 | if answer.lower() == 'yes' or answer.lower() == 'y': 28 | os.makedirs(self.output) 29 | return True 30 | else: 31 | return False 32 | return True 33 | except Exception as e: 34 | print(f"An exception has occurred while attempting to verify output path's existence: {e}") 35 | return False 36 | 37 | @staticmethod 38 | async def verify_installation() -> None: 39 | # Helper function that verifies playwright & chromium is installed 40 | try: 41 | async with async_playwright() as p: 42 | browser = await 
p.chromium.launch() 43 | await browser.close() 44 | print('Playwright and Chromium are successfully installed.') 45 | except Exception as e: 46 | print(f'An exception has occurred while attempting to verify installation: {e}') 47 | 48 | @staticmethod 49 | def chunk_list(items: Collection, chunk_size: int) -> list: 50 | # Based off of: https://github.com/apache/incubator-sdap-ingester 51 | return [list(items)[i : i + chunk_size] for i in range(0, len(items), chunk_size)] 52 | 53 | @staticmethod 54 | async def visit(url: str) -> tuple[str, str]: 55 | try: 56 | timeout = aiohttp.ClientTimeout(total=35) 57 | headers = { 58 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) ' 59 | 'Chrome/122.0.0.0 Safari/537.36' 60 | } 61 | url = f'http://{url}' if not url.startswith('http') else url 62 | url = url.replace('www.', '') 63 | sslcontext = ssl.create_default_context(cafile=certifi.where()) 64 | async with aiohttp.ClientSession( 65 | timeout=timeout, 66 | headers=headers, 67 | connector=aiohttp.TCPConnector(ssl=sslcontext), 68 | ) as session: 69 | async with session.get(url, ssl=False) as resp: 70 | text = await resp.text('UTF-8') 71 | return f'http://{url}' if not url.startswith('http') else url, text 72 | except Exception as e: 73 | print(f'An exception has occurred while attempting to visit {url} : {e}') 74 | return '', '' 75 | 76 | async def take_screenshot(self, url: str) -> tuple[str, ...]: 77 | url = f'http://{url}' if not url.startswith('http') else url 78 | url = url.replace('www.', '') 79 | print(f'Attempting to take a screenshot of: {url}') 80 | async with async_playwright() as p: 81 | browser = await p.chromium.launch(headless=True) 82 | # New browser context 83 | context = await browser.new_context() 84 | page = await context.new_page() 85 | path = rf'{self.output}{self.slash}{url.replace("http://", "").replace("https://", "")}.png' 86 | date = str(datetime.utcnow()) 87 | try: 88 | # Will fail if network idle or load event doesn't fire after 89 | # 35s which should be handled 90 | await page.goto(url, timeout=35000) 91 | await page.screenshot(path=path) 92 | except Exception as e: 93 | print(f'An exception has occurred attempting to screenshot: {url} : {e}') 94 | path = '' 95 | finally: 96 | await page.close() 97 | await context.close() 98 | await browser.close() 99 | return date, url, path 100 | -------------------------------------------------------------------------------- /theHarvester/theHarvester.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import sys 3 | 4 | from theHarvester import __main__ 5 | 6 | 7 | def main(): 8 | platform = sys.platform 9 | if platform == 'win32': 10 | # Required or things will break if trying to take screenshots 11 | import multiprocessing 12 | 13 | multiprocessing.freeze_support() 14 | try: 15 | # See if we have winloop as a performance enhancement on windows 16 | import winloop 17 | 18 | asyncio.DefaultEventLoopPolicy = winloop.EventLoopPolicy 19 | except ModuleNotFoundError: 20 | asyncio.DefaultEventLoopPolicy = asyncio.WindowsSelectorEventLoopPolicy 21 | else: 22 | import uvloop 23 | 24 | uvloop.install() 25 | 26 | if 'linux' in platform: 27 | import aiomultiprocess 28 | 29 | # As we are not using Windows, we can change the spawn method to fork for greater performance 30 | aiomultiprocess.set_context('fork') 31 | asyncio.run(__main__.entry_point()) 32 | --------------------------------------------------------------------------------
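The REST endpoints above can be exercised with a small client in the same style as api_example.py. The sketch below is illustrative only: it assumes the server is running with the restfulHarvest defaults (http://127.0.0.1:5000) and that the get_api_key dependency (imported from theHarvester.lib.api.auth, whose implementation is not shown here) reads an X-API-Key header, so both the header name and the key value are placeholders to adjust for your deployment.

import asyncio

import aiohttp


async def main() -> None:
    base = 'http://127.0.0.1:5000'  # restfulHarvest defaults
    payload = {'domain': 'example.com', 'api_keys': None}  # shape of the DomainRequest model
    headers = {'X-API-Key': 'changeme'}  # placeholder; match whatever get_api_key expects
    async with aiohttp.ClientSession(headers=headers) as session:
        # The router is mounted under the /additional prefix, so the available routes are
        # /additional/breaches, /additional/leaks, /additional/security-score,
        # /additional/tech-stack and /additional/all
        async with session.post(f'{base}/additional/all', json=payload) as resp:
            result = await resp.json()
            # Successful handlers answer with {'status': 'success', 'data': ...}
            print(result)


if __name__ == '__main__':
    asyncio.run(main())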
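The generic search-result parser can also be run on its own, which helps show its expected inputs: Parser takes the raw response text plus the target word, and the async helpers strip markup before applying the regexes. The snippet below is a hedged sketch with made-up sample text; the same pattern applies to the other extractors, whose results depend on what the queried engine actually returns.

import asyncio

from theHarvester.parsers.myparser import Parser


async def main() -> None:
    # Made-up sample data: a scraped fragment containing addresses on the target domain.
    sample = 'Contact admin@example.com or support@example.com for help.'
    parser = Parser(sample, 'example.com')
    emails = await parser.emails()
    print(emails)  # e.g. {'admin@example.com', 'support@example.com'}


if __name__ == '__main__':
    asyncio.run(main())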
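The screenshot module is likewise straightforward to drive directly. The sketch below is illustrative: the output directory, host list, and chunk size of 5 are placeholder choices, and it assumes Playwright's Chromium build is already installed, which verify_installation() confirms before any captures are attempted.

import asyncio

from theHarvester.screenshot.screenshot import ScreenShotter


async def main() -> None:
    shooter = ScreenShotter('screenshots')      # placeholder output directory
    if not shooter.verify_path():               # offers to create the directory if it is missing
        return
    await shooter.verify_installation()         # checks Playwright and Chromium are usable
    hosts = ['example.com', 'www.example.org']  # placeholder targets
    for chunk in ScreenShotter.chunk_list(hosts, 5):
        shots = await asyncio.gather(*(shooter.take_screenshot(host) for host in chunk))
        for date, url, path in shots:
            print(date, url, path if path else 'screenshot failed')


if __name__ == '__main__':
    asyncio.run(main())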