├── .dockerignore
├── .git-blame-ignore-revs
├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE
│   │   └── issue-template.md
│   ├── dependabot.yml
│   └── workflows
│       ├── codeql-analysis.yml
│       ├── docker-build-push.yml
│       ├── dockerci.yml
│       └── theHarvester.yml
├── .gitignore
├── Dockerfile
├── README.md
├── README
│   ├── CONTRIBUTING.md
│   ├── COPYING
│   └── LICENSES
├── bin
│   ├── restfulHarvest
│   └── theHarvester
├── docker-compose.yml
├── pyproject.toml
├── requirements.txt
├── restfulHarvest.py
├── tests
│   ├── __init__.py
│   ├── discovery
│   │   ├── __init__.py
│   │   ├── test_certspotter.py
│   │   ├── test_githubcode.py
│   │   └── test_otx.py
│   ├── lib
│   │   └── test_core.py
│   └── test_myparser.py
├── theHarvester-logo.png
├── theHarvester-logo.webp
├── theHarvester.py
└── theHarvester
    ├── __init__.py
    ├── __main__.py
    ├── data
    │   ├── api-keys.yaml
    │   ├── proxies.yaml
    │   └── wordlists
    │       ├── api_endpoints.txt
    │       ├── dns-big.txt
    │       ├── dns-names.txt
    │       ├── dorks.txt
    │       ├── general
    │       │   └── common.txt
    │       └── names_small.txt
    ├── discovery
    │   ├── __init__.py
    │   ├── api_endpoints.py
    │   ├── baidusearch.py
    │   ├── bevigil.py
    │   ├── bingsearch.py
    │   ├── bravesearch.py
    │   ├── bufferoverun.py
    │   ├── builtwith.py
    │   ├── censysearch.py
    │   ├── certspottersearch.py
    │   ├── constants.py
    │   ├── criminalip.py
    │   ├── crtsh.py
    │   ├── dnssearch.py
    │   ├── duckduckgosearch.py
    │   ├── fullhuntsearch.py
    │   ├── githubcode.py
    │   ├── hackertarget.py
    │   ├── haveibeenpwned.py
    │   ├── huntersearch.py
    │   ├── intelxsearch.py
    │   ├── leaklookup.py
    │   ├── netlas.py
    │   ├── onyphe.py
    │   ├── otxsearch.py
    │   ├── pentesttools.py
    │   ├── projectdiscovery.py
    │   ├── rapiddns.py
    │   ├── rocketreach.py
    │   ├── search_dehashed.py
    │   ├── search_dnsdumpster.py
    │   ├── searchhunterhow.py
    │   ├── securityscorecard.py
    │   ├── securitytrailssearch.py
    │   ├── shodansearch.py
    │   ├── sitedossier.py
    │   ├── subdomaincenter.py
    │   ├── subdomainfinderc99.py
    │   ├── takeover.py
    │   ├── threatminer.py
    │   ├── tombasearch.py
    │   ├── urlscan.py
    │   ├── venacussearch.py
    │   ├── virustotal.py
    │   ├── whoisxml.py
    │   ├── yahoosearch.py
    │   └── zoomeyesearch.py
    ├── lib
    │   ├── __init__.py
    │   ├── api
    │   │   ├── __init__.py
    │   │   ├── additional_endpoints.py
    │   │   ├── api.py
    │   │   ├── api_example.py
    │   │   └── static
    │   │       └── .gitkeep
    │   ├── core.py
    │   ├── hostchecker.py
    │   ├── ip-ranges.json
    │   ├── resolvers.txt
    │   ├── stash.py
    │   └── version.py
    ├── parsers
    │   ├── __init__.py
    │   ├── intelxparser.py
    │   ├── myparser.py
    │   ├── securitytrailsparser.py
    │   └── venacusparser.py
    ├── restfulHarvest.py
    ├── screenshot
    │   ├── __init__.py
    │   └── screenshot.py
    └── theHarvester.py
/.dockerignore:
--------------------------------------------------------------------------------
1 | .github/*
2 | .gitattributes
3 | .git-blame-ignore-revs
4 | .idea/
5 | .pytest_cache
6 | .mypy_cache
7 | tests/*
8 | README/
9 | bin/
10 | theHarvester-logo.png
11 | theHarvester-logo.webp
--------------------------------------------------------------------------------
/.git-blame-ignore-revs:
--------------------------------------------------------------------------------
1 | # #1492 run `black .` and `isort .`
2 | c13843ec0d513ac7f9c35b7bd0501fa46e356415
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Set the default behavior, which is to have git automatically determine
2 | # whether a file is a text or binary, unless otherwise specified.
3 |
4 | * text=auto
5 |
6 | # Basic .gitattributes for a python repo.
7 |
8 | # Source files
9 | # ============
10 | *.pxd text diff=python
11 | *.py text diff=python
12 | *.py3 text diff=python
13 | *.pyw text diff=python
14 | *.pyx text diff=python
15 |
16 | # Binary files
17 | # ============
18 | *.db binary
19 | *.p binary
20 | *.pkl binary
21 | *.pyc binary
22 | *.pyd binary
23 | *.pyo binary
24 |
25 | # Note: .db, .p, and .pkl files are associated with the python modules
26 | # ``pickle``, ``dbm.*``, ``shelve``, ``marshal``, ``anydbm``, & ``bsddb``
27 | # (among others).
28 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: [L1ghtn1ng, NotoriousRebel]
4 | open_collective: # Replace with a single Open Collective username
5 | ko_fi: #
6 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
7 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
8 | liberapay: # Replace with a single Liberapay username
9 | issuehunt: # Replace with a single IssueHunt username
10 | otechie: # Replace with a single Otechie username
11 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
12 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/issue-template.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Issue Template
3 | about: A template for new issues.
4 | title: "[Bug|Feature Request|Other] Short Description of Issue"
5 | labels: ''
6 |
7 | ---
8 |
9 | ## Note: we do not support installing theHarvester on Android
10 |
11 | **Feature Request, Bug, or Other**
12 | Feature Request | Bug | Other
13 |
14 | **Describe the feature request or bug or other**
15 | A clear and concise description of what the bug, feature request,
16 | or other request is.
17 |
18 | **To Reproduce**
19 | Steps to reproduce the behaviour:
20 | 1. Run tool like this: '...'
21 | 2. See error
22 |
23 | **Expected behaviour**
24 | A clear and concise description of what you expected to happen.
25 |
26 | **Screenshots**
27 | If possible please add screenshots to help explain your problem.
28 |
29 | **System Information (System that tool is running on):**
30 | - OS: [e.g. Windows 10]
31 | - Version [e.g. 2.7]
32 |
33 | **Additional context**
34 | Add any other context about the problem here.
35 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: github-actions
4 | directory: "/"
5 | schedule:
6 | interval: daily
7 | timezone: Europe/London
8 | - package-ecosystem: uv
9 | directory: "/"
10 | schedule:
11 | interval: daily
12 | timezone: Europe/London
13 | open-pull-requests-limit: 10
14 | target-branch: master
15 | allow:
16 | - dependency-type: direct
17 | - dependency-type: indirect
18 |
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 |
14 | on:
15 | push:
16 | branches: [ master, dev ]
17 | pull_request:
18 | # The branches below must be a subset of the branches above
19 | branches: [ master, dev ]
20 | schedule:
21 | - cron: '19 11 * * 4'
22 |
23 | jobs:
24 | analyze:
25 | name: Analyze
26 | runs-on: ubuntu-latest
27 |
28 | strategy:
29 | fail-fast: false
30 | matrix:
31 | language: [ 'python' ]
32 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
33 | # Learn more:
34 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed
35 |
36 | steps:
37 | - name: Checkout repository
38 | uses: actions/checkout@v4
39 |
40 | # Initializes the CodeQL tools for scanning.
41 | - name: Initialize CodeQL
42 | uses: github/codeql-action/init@v3
43 | with:
44 | languages: ${{ matrix.language }}
45 | # If you wish to specify custom queries, you can do so here or in a config file.
46 | # By default, queries listed here will override any specified in a config file.
47 | # Prefix the list here with "+" to use these queries and those in the config file.
48 | # queries: ./path/to/local/query, your-org/your-repo/queries@main
49 |
50 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
51 | # If this step fails, then you should remove it and run the build manually (see below)
52 | - name: Autobuild
53 | uses: github/codeql-action/autobuild@v3
54 |
55 | # ℹ️ Command-line programs to run using the OS shell.
56 | # 📚 https://git.io/JvXDl
57 |
58 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
59 | # and modify them (or add more) to build your code if your project
60 | # uses a compiled language
61 |
62 | #- run: |
63 | # make bootstrap
64 | # make release
65 |
66 | - name: Perform CodeQL Analysis
67 | uses: github/codeql-action/analyze@v3
68 |
--------------------------------------------------------------------------------
/.github/workflows/docker-build-push.yml:
--------------------------------------------------------------------------------
1 | name: Build and Push Docker Image
2 |
3 | on:
4 | push:
5 | branches:
6 | - master
7 |
8 | permissions:
9 | contents: read
10 | packages: write
11 |
12 | jobs:
13 | build-and-push:
14 | runs-on: ubuntu-latest
15 |
16 | steps:
17 | - name: Checkout repository
18 | uses: actions/checkout@v4
19 |
20 | - name: Set up Docker Buildx
21 | uses: docker/setup-buildx-action@v3
22 |
23 | - name: Log in to GitHub Container Registry
24 | uses: docker/login-action@v3
25 | with:
26 | registry: ghcr.io
27 | username: ${{ github.actor }}
28 | password: ${{ secrets.GITHUB_TOKEN }}
29 |
30 | - name: Extract metadata for Docker
31 | id: meta
32 | uses: docker/metadata-action@v5
33 | with:
34 | images: ghcr.io/${{ github.repository_owner }}/theharvester
35 | tags: |
36 | latest
37 | type=ref,event=branch
38 | type=sha
39 |
40 | - name: Build and push Docker image
41 | uses: docker/build-push-action@v6
42 | with:
43 | context: .
44 | file: Dockerfile
45 | push: true
46 | platforms: linux/amd64,linux/arm64
47 | tags: ${{ steps.meta.outputs.tags }}
48 | labels: ${{ steps.meta.outputs.labels }}
49 |
--------------------------------------------------------------------------------
/.github/workflows/dockerci.yml:
--------------------------------------------------------------------------------
1 | name: TheHarvester Docker Image CI
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - uses: actions/checkout@v4
10 | - name: Build the Docker image
11 | run: docker build . --file Dockerfile --tag theharvester:$(date +%s)
--------------------------------------------------------------------------------
/.github/workflows/theHarvester.yml:
--------------------------------------------------------------------------------
1 | name: TheHarvester Python CI
2 |
3 | on:
4 | push:
5 | branches:
6 | - '*'
7 |
8 | pull_request:
9 | branches:
10 | - '*'
11 |
12 | jobs:
13 | Python:
14 | runs-on: ${{ matrix.os }}
15 | strategy:
16 | max-parallel: 10
17 | matrix:
18 | os: [ ubuntu-latest ]
19 | python-version: [ '3.12', '3.13', '3.14.0-beta.1' ]
20 |
21 | steps:
22 | - uses: actions/checkout@v4
23 | - name: Python ${{ matrix.python-version }}
24 | uses: actions/setup-python@v5
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 | - name: Install dependencies
28 | run: |
29 | sudo mkdir -p /usr/local/etc/theHarvester
30 | sudo cp theHarvester/data/*.yaml /usr/local/etc/theHarvester/
31 | sudo chown -R runner:runner /usr/local/etc/theHarvester/
32 | pip install --upgrade pip
33 | pip install .[dev]
34 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }}
35 |
36 | - name: Lint with ruff
37 | run: |
38 | ruff check --fix
39 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }}
40 |
41 | - name: Format with ruff
42 | run: |
43 | ruff format
44 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }}
45 |
46 | - name: Commit changes for ruff formating and linting
47 | run: |
48 | git config user.name github-actions
49 | git config user.email github-actions@github.com
50 | git add .
51 | git commit -m "Apply ruff fixes and formatting" || true # Use || true to prevent failure if no changes
52 | git push origin $GITHUB_REF
53 | env:
54 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
55 |
56 | - name: Test with pytest
57 | run: |
58 | pytest tests/**
59 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }}
60 |
61 | - name: Run theHarvester module Baidu
62 | run: |
63 | theHarvester -d yale.edu -b baidu
64 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }}
65 |
66 | - name: Run theHarvester module Bing
67 | run: |
68 | theHarvester -d yale.edu -b bing
69 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }}
70 |
71 | - name: Run theHarvester module CertSpotter
72 | run: |
73 | theHarvester -d yale.edu -b certspotter
74 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }}
75 |
76 | - name: Run theHarvester module Crtsh
77 | run: |
78 | theHarvester -d hcl.com -b crtsh
79 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }}
80 |
81 | - name: Run theHarvester module DuckDuckGo
82 | run: |
83 | theHarvester -d yale.edu -b duckduckgo
84 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }}
85 |
86 | - name: Run theHarvester module HackerTarget
87 | run: |
88 | theHarvester -d yale.edu -b hackertarget
89 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }}
90 |
91 | - name: Run theHarvester module Otx
92 | run: |
93 | theHarvester -d yale.edu -b otx
94 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }}
95 |
96 | - name: Run theHarvester module RapidDns
97 | run: |
98 | theHarvester -d yale.edu -b rapiddns
99 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }}
100 |
101 | - name: Run theHarvester module Threatminer
102 | run: |
103 | theHarvester -d yale.edu -b threatminer
104 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }}
105 |
106 | - name: Run theHarvester module Urlscan
107 | run: |
108 | theHarvester -d yale.edu -b urlscan
109 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }}
110 |
111 | - name: Run theHarvester module Yahoo
112 | run: |
113 | theHarvester -d yale.edu -b yahoo
114 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }}
115 |
116 | - name: Run theHarvester module DNS brute force
117 | run: |
118 | theHarvester -d yale.edu -c
119 | continue-on-error: ${{ matrix.python-version == '3.14.0-beta.1' }}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.idea
2 | *.pyc
3 | *.sqlite
4 | *.html
5 | *.htm
6 | *.vscode
7 | *.xml
8 | *.json
9 | debug_results.txt
10 | venv
11 | .mypy_cache
12 | .pytest_cache
13 | build/
14 | dist/
15 | theHarvester.egg-info
16 | api-keys.yaml
17 | .DS_Store
18 | .venv
19 | .pyre
20 | uv.lock
21 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM debian:testing-slim
2 |
3 | LABEL maintainer="@jay_townsend1 & @NotoriousRebel1"
4 |
5 | # Install dependencies for building Python from source
6 | RUN apt update && apt install -y \
7 | curl \
8 | build-essential \
9 | libssl-dev \
10 | zlib1g-dev \
11 | libbz2-dev \
12 | libreadline-dev \
13 | libsqlite3-dev \
14 | wget \
16 | llvm \
17 | libncurses5-dev \
18 | libncursesw5-dev \
19 | xz-utils \
20 | tk-dev \
21 | libffi-dev \
22 | liblzma-dev \
23 | python3-dev \
24 | git \
25 | gcc \
26 | && rm -rf /var/lib/apt/lists/*
27 |
28 | # Install Python 3.11 from source
29 | RUN curl -fsSL https://www.python.org/ftp/python/3.11.6/Python-3.11.6.tgz -o Python-3.11.6.tgz \
30 | && tar -xvf Python-3.11.6.tgz \
31 | && cd Python-3.11.6 \
32 | && ./configure --enable-optimizations \
33 | && make -j 2 \
34 | && make altinstall \
35 | && rm -rf /Python-3.11.6 /Python-3.11.6.tgz
36 |
37 | # Install pip for Python 3.11
38 | RUN curl https://bootstrap.pypa.io/get-pip.py | python3.11
39 |
40 | # Install pipx for Python 3.11
41 | RUN python3.11 -m pip install --user pipx
42 |
43 | # Add pipx to PATH
44 | ENV PATH=/root/.local/bin:$PATH
45 |
46 | # Install theHarvester via pipx
47 | RUN pipx install --python python3.11 git+https://github.com/laramies/theHarvester.git
48 |
49 | # Ensure pipx path
50 | RUN pipx ensurepath
51 |
52 | # Set the entrypoint
53 | ENTRYPOINT ["/root/.local/bin/restfulHarvest", "-H", "0.0.0.0", "-p", "80"]
54 |
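55 | # Build/run sketch (the tag name is illustrative; the dockerci workflow uses a
56 | # timestamped tag, and the 8080:80 mapping mirrors docker-compose.yml):
57 | #   docker build -t theharvester .
58 | #   docker run -p 8080:80 theharvester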
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 |  
4 | [](https://inventory.raw.pm/)
5 |
6 | What is this?
7 | -------------
8 | theHarvester is a simple-to-use, yet powerful tool designed to be used during the reconnaissance stage of a red
9 | team assessment or penetration test. It performs open source intelligence (OSINT) gathering to help determine
10 | a domain's external threat landscape. The tool gathers names, emails, IPs, subdomains, and URLs by using
11 | multiple public resources that include:
12 |
13 | Passive modules:
14 | ----------------
15 |
16 | * baidu: Baidu search engine - www.baidu.com
17 |
18 | * bevigil: CloudSEK BeVigil scans mobile applications for OSINT assets (Requires an API key, see below.) - https://bevigil.com/osint-api
19 |
20 | * bing: Microsoft search engine - https://www.bing.com
21 |
22 | * bingapi: Microsoft search engine, through the API (Requires an API key, see below.)
23 |
24 | * brave: Brave search engine - https://search.brave.com/
25 |
26 | * bufferoverun: (Requires an API key, see below.) https://tls.bufferover.run
27 |
28 | * censys: [Censys search engine](https://search.censys.io/) will use certificates searches to enumerate subdomains and gather emails
29 | (Requires an API key, see below.) https://censys.io
30 |
31 | * certspotter: Cert Spotter monitors Certificate Transparency logs - https://sslmate.com/certspotter/
32 |
33 | * criminalip: Specialized Cyber Threat Intelligence (CTI) search engine (Requires an API key, see below.) - https://www.criminalip.io
34 |
35 | * crtsh: Comodo Certificate search - https://crt.sh
36 |
37 | * duckduckgo: DuckDuckGo search engine - https://duckduckgo.com
38 |
39 | * fullhunt: Next-generation attack surface security platform (Requires an API key, see below.) - https://fullhunt.io
40 |
41 | * github-code: GitHub code search engine (Requires a GitHub Personal Access Token, see below.) - www.github.com
42 |
43 | * hackertarget: Online vulnerability scanners and network intelligence to help organizations - https://hackertarget.com
44 |
45 | * hunter: Hunter search engine (Requires an API key, see below.) - https://hunter.io
46 |
47 | * hunterhow: Internet search engines for security researchers (Requires an API key, see below.) - https://hunter.how
48 |
49 | * intelx: Intelx search engine (Requires an API key, see below.) - http://intelx.io
50 |
51 | * netlas: A Shodan or Censys competitor (Requires an API key, see below.) - https://app.netlas.io
52 |
53 | * onyphe: Cyber defense search engine (Requires an API key, see below.) - https://www.onyphe.io/
54 |
55 | * otx: AlienVault open threat exchange - https://otx.alienvault.com
56 |
57 | * pentestTools: Cloud-based toolkit for offensive security testing, focused on web applications and network penetration
58 | testing (Requires an API key, see below.) - https://pentest-tools.com/
59 |
60 | * projectDiscovery: Actively collects and maintains internet-wide asset data to enhance research and analyse changes around
61 | DNS for better insights (Requires an API key, see below.) - https://chaos.projectdiscovery.io
62 |
63 | * rapiddns: DNS query tool which makes querying subdomains or sites on the same IP easy! https://rapiddns.io
64 |
65 | * rocketreach: Access real-time verified personal/professional emails, phone numbers, and social media links (Requires an API key,
66 | see below.) - https://rocketreach.co
67 |
68 | * securityTrails: Security Trails search engine, the world's largest repository of historical DNS data (Requires an API key, see
69 | below.) - https://securitytrails.com
70 |
71 | * -s, --shodan: Shodan search engine will search for ports and banners from discovered hosts (Requires an API key, see below.)
72 | https://shodan.io
73 |
74 | * sitedossier: Find available information on a site - http://www.sitedossier.com
75 |
76 | * subdomaincenter: A subdomain finder tool used to find subdomains of a given domain - https://www.subdomain.center/
77 |
78 | * subdomainfinderc99: A subdomain finder used to find the subdomains of a given domain - https://subdomainfinder.c99.nl
79 |
80 | * threatminer: Data mining for threat intelligence - https://www.threatminer.org/
81 |
82 | * tomba: Tomba search engine (Requires an API key, see below.) - https://tomba.io
83 |
84 | * urlscan: A sandbox for the web that is a URL and website scanner - https://urlscan.io
85 |
86 | * venacus: Venacus search engine (Requires an API key, see below.) - https://venacus.com
87 |
88 | * vhost: Bing virtual hosts search
89 |
90 | * virustotal: Domain search (Requires an API key, see below.) - https://www.virustotal.com
91 |
92 | * whoisxml: Subdomain search (Requires an API key, see below.) - https://subdomains.whoisxmlapi.com/api/pricing
93 |
94 | * yahoo: Yahoo search engine
95 |
96 | * zoomeye: China's version of Shodan (Requires an API key, see below.) - https://www.zoomeye.org
97 |
98 | Active modules:
99 | ---------------
100 | * DNS brute force: dictionary brute force enumeration
101 | * Screenshots: Take screenshots of subdomains that were found
102 |
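For example, to run a single passive module or the active DNS brute force against a domain (these invocations mirror the project's CI workflow; the domain is illustrative):

```shell
theHarvester -d example.com -b baidu   # passive: query one data source
theHarvester -d example.com -c         # active: DNS brute force
```
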
103 | Modules that require an API key:
104 | --------------------------------
105 | Documentation to setup API keys can be found at - https://github.com/laramies/theHarvester/wiki/Installation#api-keys
106 |
107 | * bevigil - Free up to 50 queries. Pricing can be found here: https://bevigil.com/pricing/osint
108 | * bing
109 | * bufferoverun - uses the free binaAPI
110 | * censys - API keys are required and can be retrieved from your [Censys account](https://search.censys.io/account/api).
111 | * criminalip
112 | * fullhunt
113 | * github
114 | * hunter - limited to 10 results on the free plan, so you will need to use the -l 10 switch
115 | * hunterhow
116 | * intelx
117 | * netlas - $
118 | * onyphe - $
119 | * pentestTools - $
120 | * projectDiscovery - invite only for now
121 | * rocketreach - $
122 | * securityTrails
123 | * shodan - $
124 | * tomba - Free up to 50 searches.
125 | * venacus - $
126 | * whoisxml
127 | * zoomeye
128 |
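Keys are read from `api-keys.yaml` (the template ships as `theHarvester/data/api-keys.yaml`; docker-compose mounts it at `/root/.theHarvester/` and `/etc/theHarvester/`). A minimal sketch of a populated entry, with a placeholder value:

```yaml
apikeys:
  shodan:
    key: <your-shodan-api-key>
```
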
129 | Install and dependencies:
130 | -------------------------
131 | * Python 3.11+
132 | * https://github.com/laramies/theHarvester/wiki/Installation
133 |
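As a sketch, the same pipx-based install that the Dockerfile uses also works directly (assumes Python 3.11+ and pipx are available):

```shell
pipx install git+https://github.com/laramies/theHarvester.git
theHarvester --help
```
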
134 | Comments, bugs, and requests:
135 | -----------------------------
136 | * [](https://twitter.com/laramies) Christian Martorella @laramies
137 | cmartorella@edge-security.com
138 | * [](https://twitter.com/NotoriousRebel1) Matthew Brown @NotoriousRebel1
139 | * [](https://twitter.com/jay_townsend1) Jay "L1ghtn1ng" Townsend @jay_townsend1
140 |
141 | Main contributors:
142 | ------------------
143 | * [](https://twitter.com/NotoriousRebel1) Matthew Brown @NotoriousRebel1
144 | * [](https://twitter.com/jay_townsend1) Jay "L1ghtn1ng" Townsend @jay_townsend1
145 | * [](https://twitter.com/discoverscripts) Lee Baird @discoverscripts
146 |
147 |
148 | Thanks:
149 | -------
150 | * John Matherly - Shodan project
151 | * Ahmed Aboul Ela - subdomain names dictionaries (big and small)
152 |
--------------------------------------------------------------------------------
/README/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to theHarvester Project
2 | Welcome to theHarvester project! If you would like to contribute, the following requirements must be met for your
3 | contribution to be accepted.
4 |
5 | # CI
6 | Make sure all CI checks pass and that you do not introduce any alerts from ruff.
7 |
8 | # Unit Tests
9 | For new modules, a unit test for that module is required; we use pytest.
10 |
11 | # Coding Standards
12 | * No single-letter variables; variable names must describe the action they perform (see the sketch below)
13 | * Use static typing on functions and methods
14 | * Make sure no errors are reported by mypy
15 | * No issues reported by ruff
16 |
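A minimal sketch of these standards in practice (an illustrative helper, not code from the project):

```python
def normalize_hostname(hostname: str) -> str:
    """Lowercase a hostname and strip any trailing dot."""
    normalized_hostname = hostname.lower().rstrip('.')
    return normalized_hostname
```
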
17 | # Submitting Bugs
18 | If you find a bug in a module that you want to submit an issue for and you know how to write Python code,
19 | please create a unit test for that bug (if possible) and submit a fix for it, as it would be a big help to the project.
20 |
--------------------------------------------------------------------------------
/README/LICENSES:
--------------------------------------------------------------------------------
1 | Released under the GPL v 2.0.
2 |
3 | If you did not receive a copy of the GPL, try http://www.gnu.org/.
4 |
5 | Copyright 2011 Christian Martorella
6 |
7 | theHarvester is free software; you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation version 2 of the License.
10 |
11 | theHarvester is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 | You should have received a copy of the GNU General Public License along with theHarvester; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 |
--------------------------------------------------------------------------------
/bin/restfulHarvest:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 |
4 | import uvicorn
5 |
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument(
8 | "-H",
9 | "--host",
10 | default="127.0.0.1",
11 | help="IP address to listen on default is 127.0.0.1",
12 | )
13 | parser.add_argument(
14 | "-p",
15 | "--port",
16 | default=5000,
17 | help="Port to bind the web server to, default is 5000",
18 | type=int,
19 | )
20 | parser.add_argument(
21 | "-l",
22 | "--log-level",
23 | default="info",
24 | help="Set logging level, default is info but [critical|error|warning|info|debug|trace] can be set",
25 | )
26 | parser.add_argument(
27 | "-r",
28 | "--reload",
29 | default=False,
30 | help="Enable automatic reload used during development of the api",
31 | action="store_true",
32 | )
33 |
34 | args = parser.parse_args()
35 |
36 | if __name__ == "__main__":
37 | uvicorn.run(
38 | "theHarvester.lib.api.api:app",
39 | host=args.host,
40 | port=args.port,
41 | log_level=args.log_level,
42 | reload=args.reload,
43 | )
44 |
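45 | # Usage sketch (flags as defined by the argparse options above):
46 | #   ./bin/restfulHarvest -H 0.0.0.0 -p 8000 -l debug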
--------------------------------------------------------------------------------
/bin/theHarvester:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Note: This script runs theHarvester
3 | import asyncio
4 | import sys
5 |
6 | from theHarvester import __main__
7 |
8 | if sys.version_info < (3, 11):
9 | print(
10 | "\033[93m[!] Make sure you have Python 3.11+ installed, quitting.\n\n \033[0m"
11 | )
12 | sys.exit(1)
13 |
14 | if __name__ == "__main__":
15 | platform = sys.platform
16 | if platform == "win32":
17 | # Required or things will break if trying to take screenshots
18 | import multiprocessing
19 |
20 | multiprocessing.freeze_support()
21 | try:
22 | # See if we have winloop as a performance enhancement on windows
23 | import winloop
24 |
25 | asyncio.DefaultEventLoopPolicy = winloop.EventLoopPolicy
26 | except ModuleNotFoundError:
27 | asyncio.DefaultEventLoopPolicy = asyncio.WindowsSelectorEventLoopPolicy
28 | else:
29 | import uvloop
30 |
31 | uvloop.install()
32 |
33 | if "linux" in platform:
34 | import aiomultiprocess
35 |
36 | # As we are not using Windows, we can change the spawn method to fork for greater performance
37 | aiomultiprocess.set_context("fork")
38 | asyncio.run(__main__.entry_point())
39 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 | theharvester.svc.local:
3 | container_name: theHarvester
4 | volumes:
5 | - ./theHarvester/data/api-keys.yaml:/root/.theHarvester/api-keys.yaml
6 | - ./theHarvester/data/api-keys.yaml:/etc/theHarvester/api-keys.yaml
7 | - ./theHarvester/data/proxies.yaml:/etc/theHarvester/proxies.yaml
8 | - ./theHarvester/data/proxies.yaml:/root/.theHarvester/proxies.yaml
9 | build: .
10 | ports:
11 | - "8080:80"
12 |
13 | networks:
14 | default:
15 | name: app_theHarvester_network
16 |
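17 | # Usage sketch: `docker compose up --build` starts the container; with the
18 | # 8080:80 port mapping above, the REST API (restfulHarvest, started by the
19 | # Dockerfile ENTRYPOINT) is reachable at http://localhost:8080.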
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "theHarvester"
3 | description = "theHarvester is a very simple, yet effective tool designed to be used in the early stages of a penetration test"
4 | readme = "README.md"
5 | authors = [
6 | { name = "Christian Martorella", email = "cmartorella@edge-security.com" },
7 | { name = "Jay Townsend", email = "jay@cybermon.uk" },
8 | { name = "Matthew Brown", email = "36310667+NotoriousRebel@users.noreply.github.com" },
9 | ]
10 | requires-python = ">=3.11"
11 | urls.Homepage = "https://github.com/laramies/theHarvester"
12 | classifiers = [
13 | "Programming Language :: Python :: 3",
14 | "Programming Language :: Python :: 3.11",
15 | "Programming Language :: Python :: 3.12",
16 | "Programming Language :: Python :: 3.13",
17 | "License :: OSI Approved :: GNU General Public License v2 (GPLv2)",
18 | "Operating System :: OS Independent",
19 | ]
20 | dynamic = ["version"]
21 | dependencies = [
22 | "aiodns==3.4.0",
23 | "aiofiles==24.1.0",
24 | "aiohttp==3.12.2",
25 | "aiomultiprocess==0.9.1",
26 | "aiosqlite==0.21.0",
27 | "beautifulsoup4==4.13.4",
28 | "censys==2.2.17",
29 | "certifi==2025.4.26",
30 | "dnspython==2.7.0",
31 | "fastapi==0.115.12",
32 | "lxml==5.4.0",
33 | "netaddr==1.3.0",
34 | "playwright==1.52.0",
35 | "PyYAML==6.0.2",
36 | "python-dateutil==2.9.0.post0",
37 | "requests==2.32.3",
38 | "retrying==1.3.4",
39 | "shodan==1.31.0",
40 | "slowapi==0.1.9",
41 | "ujson==5.10.0",
42 | "uvicorn==0.34.2",
43 | "uvloop==0.21.0; platform_system != 'Windows'",
44 | "winloop==0.1.8; platform_system == 'Windows'",
45 | ]
46 |
47 | [project.optional-dependencies]
48 | dev = [
49 | "mypy==1.15.0",
50 | "mypy-extensions==1.1.0",
51 | "pytest==8.3.5",
52 | "pytest-asyncio==0.26.0",
53 | "types-certifi==2021.10.8.3",
54 | "types-chardet==5.0.4.6",
55 | "types-python-dateutil==2.9.0.20250516",
56 | "types-PyYAML==6.0.12.20250516",
57 | "types-requests==2.32.0.20250515",
58 | "ruff==0.11.11",
59 | "types-ujson==5.10.0.20250326",
60 | "wheel==0.45.1",
61 | ]
62 |
63 | [project.scripts]
64 | theHarvester = "theHarvester.theHarvester:main"
65 | restfulHarvest = "theHarvester.restfulHarvest:main"
66 |
67 | [tool.setuptools.dynamic]
68 | version = { attr = "theHarvester.lib.version.VERSION" }
69 |
70 | [tool.setuptools.packages.find]
71 | include = ["theHarvester*"]
72 |
73 | [tool.setuptools.package-data]
74 | "*" = ["*.txt", "*.yaml"]
75 |
76 | [tool.pytest.ini_options]
77 | minversion = "8.3.3"
78 | asyncio_mode = "auto"
79 | asyncio_default_fixture_loop_scope = "function"
80 | addopts = "--no-header"
81 | testpaths = [
82 | "tests",
83 | "tests/discovery/",
84 | ]
85 |
86 | [build-system]
87 | requires = ["setuptools>=68"]
88 | build-backend = "setuptools.build_meta"
89 |
90 | [tool.mypy]
91 | python_version = "3.11"
92 | warn_unused_configs = true
93 | ignore_missing_imports = true
94 | show_traceback = true
95 | show_error_codes = true
96 | namespace_packages = true
97 |
98 | [tool.uv]
99 | python-preference = "managed"
100 |
101 | [tool.uv.pip]
102 | python-version = "3.11"
103 |
104 | [tool.ruff]
105 | # Exclude a variety of commonly ignored directories.
106 | exclude = [
107 | "tests",
108 | ".eggs",
109 | ".git",
110 | ".git-rewrite",
111 | ".mypy_cache",
112 | ".pyenv",
113 | ".pytest_cache",
114 | ".pytype",
115 | ".ruff_cache",
116 | ".github",
117 | ".venv",
118 | ".vscode",
119 | ".idea",
120 | "__pypackages__",
121 | "build",
122 | "dist",
123 | "site-packages",
124 | "venv",
125 | ]
126 |
127 | line-length = 130
128 | target-version = "py311"
129 | show-fixes = true
130 |
131 | [tool.ruff.lint]
132 | select = ["E4",
133 | "E7",
134 | "E9",
135 | "F",
136 | "I",
137 | "UP",
138 | "TCH",
139 | "FA",
140 | "RUF",
141 | "PT",
142 | ]
143 | ignore = ["S311", "RUF021", "RUF029", "F841"]
144 |
145 | # Allow fix for all enabled rules (when `--fix` is provided).
146 | fixable = ["ALL"]
147 | unfixable = []
148 |
149 | # Allow unused variables when underscore-prefixed.
150 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
151 |
152 | [tool.ruff.format]
153 | # Use single quotes for strings (this deviates from Black, which uses double quotes).
154 | quote-style = "single"
155 | indent-style = "space"
156 |
157 | # Like Black, respect magic trailing commas.
158 | skip-magic-trailing-comma = false
159 |
160 | # Like Black, automatically detect the appropriate line ending.
161 | line-ending = "auto"
162 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # This file is deprecated. All dependencies are now defined in pyproject.toml
2 |
--------------------------------------------------------------------------------
/restfulHarvest.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from theHarvester.restfulHarvest import main
3 |
4 | if __name__ == '__main__':
5 | main()
6 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/tests/__init__.py
--------------------------------------------------------------------------------
/tests/discovery/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/tests/discovery/__init__.py
--------------------------------------------------------------------------------
/tests/discovery/test_certspotter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf-8
3 | import os
4 | from typing import Optional
5 |
6 | import pytest
7 | import requests
8 | from _pytest.mark.structures import MarkDecorator
9 |
10 | from theHarvester.discovery import certspottersearch
11 | from theHarvester.lib.core import *
12 |
13 | pytestmark: MarkDecorator = pytest.mark.asyncio
14 | github_ci: Optional[str] = os.getenv(
15 |     "GITHUB_ACTIONS"
16 | )  # GitHub sets this to the lowercase string 'true' rather than 'True'
17 |
18 |
19 | class TestCertspotter(object):
20 | @staticmethod
21 | def domain() -> str:
22 | return "metasploit.com"
23 |
24 |
25 | @pytest.mark.skipif(github_ci == 'true', reason="Skipping this test for now")
26 | class TestCertspotterSearch(object):
27 | async def test_api(self) -> None:
28 | base_url = f"https://api.certspotter.com/v1/issuances?domain={TestCertspotter.domain()}&expand=dns_names"
29 | headers = {"User-Agent": Core.get_user_agent()}
30 | request = requests.get(base_url, headers=headers)
31 | assert request.status_code == 200
32 |
33 | async def test_search(self) -> None:
34 | search = certspottersearch.SearchCertspoter(TestCertspotter.domain())
35 | await search.process()
36 | assert isinstance(await search.get_hostnames(), set)
37 |
38 |
39 | if __name__ == "__main__":
40 | pytest.main()
41 |
--------------------------------------------------------------------------------
/tests/discovery/test_githubcode.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import MagicMock
2 | import pytest
3 | from _pytest.mark.structures import MarkDecorator
4 | from requests import Response
5 | from theHarvester.discovery import githubcode
6 | from theHarvester.discovery.constants import MissingKey
7 | from theHarvester.lib.core import Core
8 |
9 | pytestmark: MarkDecorator = pytest.mark.asyncio
10 |
11 |
12 | class TestSearchGithubCode:
13 | class OkResponse:
14 | response = Response()
15 |
16 | # Mocking the json method properly
17 | def __init__(self):
18 | self.response = Response()
19 | self.response.status_code = 200
20 | self.response.json = MagicMock(
21 | return_value={
22 | "items": [
23 | {"text_matches": [{"fragment": "test1"}]},
24 | {"text_matches": [{"fragment": "test2"}]},
25 | ]
26 | }
27 | )
28 |
29 | class FailureResponse:
30 | response = Response()
31 |
32 | def __init__(self):
33 | self.response = Response()
34 | self.response.status_code = 401
35 | self.response.json = MagicMock(return_value={})
36 |
37 | class RetryResponse:
38 | def __init__(self):
39 | self.response = Response()
40 | self.response.status_code = 403
41 | self.response.json = MagicMock(return_value={})
42 |
43 | class MalformedResponse:
44 | response = Response()
45 |
46 | def __init__(self):
47 | self.response = Response()
48 | self.response.status_code = 200
49 | self.response.json = MagicMock(
50 | return_value={
51 | "items": [
52 | {"fail": True},
53 | {"text_matches": []},
54 | {"text_matches": [{"weird": "result"}]},
55 | ]
56 | }
57 | )
58 |
59 | async def test_missing_key(self):
60 | with pytest.raises(MissingKey):
61 | Core.github_key = MagicMock(return_value=None)
62 | githubcode.SearchGithubCode(word="test", limit=500)
63 |
64 | async def test_fragments_from_response(self):
65 | Core.github_key = MagicMock(return_value="test_key")
66 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
67 | test_result = await test_class_instance.fragments_from_response(
68 | self.OkResponse().response.json()
69 | )
70 | print("test_result: ", test_result)
71 | assert test_result == ["test1", "test2"]
72 |
73 | async def test_invalid_fragments_from_response(self):
74 | Core.github_key = MagicMock(return_value="test_key")
75 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
76 | test_result = await test_class_instance.fragments_from_response(
77 | self.MalformedResponse().response.json()
78 | )
79 | assert test_result == []
80 |
81 | async def test_next_page(self):
82 | Core.github_key = MagicMock(return_value="test_key")
83 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
84 | test_result = githubcode.SuccessResult(list(), next_page=2, last_page=4)
85 | assert 2 == await test_class_instance.next_page_or_end(test_result)
86 |
87 | async def test_last_page(self):
88 | Core.github_key = MagicMock(return_value="test_key")
89 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
90 | test_result = githubcode.SuccessResult(list(), 0, 0)
91 |         assert await test_class_instance.next_page_or_end(test_result) == 0
92 |
93 |
94 | if __name__ == "__main__":
95 | pytest.main()
96 |
--------------------------------------------------------------------------------
/tests/discovery/test_otx.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf-8
3 | import os
4 | from typing import Optional
5 |
6 | import pytest
7 | import requests
8 | from _pytest.mark.structures import MarkDecorator
9 |
10 | from theHarvester.discovery import otxsearch
11 | from theHarvester.lib.core import *
12 |
13 | pytestmark: MarkDecorator = pytest.mark.asyncio
14 | github_ci: Optional[str] = os.getenv(
15 |     "GITHUB_ACTIONS"
16 | )  # GitHub sets this to the lowercase string 'true' rather than 'True'
17 |
18 |
19 | class TestOtx(object):
20 | @staticmethod
21 | def domain() -> str:
22 | return "cybermon.uk"
23 |
24 | async def test_api(self) -> None:
25 | base_url = f"https://otx.alienvault.com/api/v1/indicators/domain/{TestOtx.domain()}/passive_dns"
26 | headers = {"User-Agent": Core.get_user_agent()}
27 | request = requests.get(base_url, headers=headers)
28 | assert request.status_code == 200
29 |
30 | async def test_search(self) -> None:
31 | search = otxsearch.SearchOtx(TestOtx.domain())
32 | await search.process()
33 | assert isinstance(await search.get_hostnames(), set)
34 | assert isinstance(await search.get_ips(), set)
35 |
36 |
37 | if __name__ == "__main__":
38 | pytest.main()
39 |
--------------------------------------------------------------------------------
/tests/lib/test_core.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from pathlib import Path
4 | from typing import Any
5 | from unittest import mock
6 |
7 | import pytest
8 | import yaml
9 |
10 | from theHarvester.lib.core import CONFIG_DIRS, DATA_DIR, Core
11 |
12 |
13 | @pytest.fixture(autouse=True)
14 | def mock_environ(monkeypatch, tmp_path: Path):
15 | monkeypatch.setenv("HOME", str(tmp_path))
16 |
17 |
18 | def mock_read_text(mocked: dict[Path, str | Exception]):
19 | read_text = Path.read_text
20 |
21 | def _read_text(self: Path, *args, **kwargs):
22 | if result := mocked.get(self):
23 | if isinstance(result, Exception):
24 | raise result
25 | return result
26 | return read_text(self, *args, **kwargs)
27 |
28 | return _read_text
29 |
30 |
31 | @pytest.mark.parametrize(
32 | ("name", "contents", "expected"),
33 | [
34 | ("api-keys", "apikeys: {}", {}),
35 | ("proxies", "http: [localhost:8080]", ["http://localhost:8080"]),
36 | ],
37 | )
38 | @pytest.mark.parametrize("dir", CONFIG_DIRS)
39 | def test_read_config_searches_config_dirs(
40 | name: str, contents: str, expected: Any, dir: Path, capsys
41 | ):
42 | file = dir.expanduser() / f"{name}.yaml"
43 | config_files = [d.expanduser() / file.name for d in CONFIG_DIRS]
44 | side_effect = mock_read_text(
45 | {f: contents if f == file else FileNotFoundError() for f in config_files}
46 | )
47 |
48 | with mock.patch("pathlib.Path.read_text", autospec=True, side_effect=side_effect):
49 | got = Core.api_keys() if name == "api-keys" else Core.proxy_list()
50 |
51 | assert got == expected
52 | assert f"Read {file.name} from {file}" in capsys.readouterr().out
53 |
54 |
55 | @pytest.mark.parametrize("name", ("api-keys", "proxies"))
56 | def test_read_config_copies_default_to_home(name: str, capsys):
57 | file = Path(f"~/.theHarvester/{name}.yaml").expanduser()
58 | config_files = [d.expanduser() / file.name for d in CONFIG_DIRS]
59 | side_effect = mock_read_text({f: FileNotFoundError() for f in config_files})
60 |
61 | with mock.patch("pathlib.Path.read_text", autospec=True, side_effect=side_effect):
62 | got = Core.api_keys() if name == "api-keys" else Core.proxy_list()
63 |
64 | default = yaml.safe_load((DATA_DIR / file.name).read_text())
65 | expected = (
66 | default["apikeys"]
67 | if name == "api-keys"
68 | else [f"http://{h}" for h in default["http"]]
69 | )
70 | assert got == expected
71 | assert f"Created default {file.name} at {file}" in capsys.readouterr().out
72 | assert file.exists()
73 |
--------------------------------------------------------------------------------
/tests/test_myparser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf-8
3 |
4 | import pytest
5 |
6 | from theHarvester.parsers import myparser
7 |
8 |
9 | class TestMyParser(object):
10 | @pytest.mark.asyncio
11 | async def test_emails(self) -> None:
12 | word = "domain.com"
13 | results = "@domain.com***a@domain***banotherdomain.com***c@domain.com***d@sub.domain.com***"
14 | parse = myparser.Parser(results, word)
15 | emails = sorted(await parse.emails())
16 |         assert emails == ["c@domain.com", "d@sub.domain.com"]
17 |
18 |
19 | if __name__ == "__main__":
20 | pytest.main()
21 |
--------------------------------------------------------------------------------
/theHarvester-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester-logo.png
--------------------------------------------------------------------------------
/theHarvester-logo.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester-logo.webp
--------------------------------------------------------------------------------
/theHarvester.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Note: This script runs theHarvester
3 | import sys
4 |
5 | from theHarvester.theHarvester import main
6 |
7 | if sys.version_info < (3, 11):
8 |     print('\033[93m[!] Make sure you have Python 3.11+ installed, quitting.\n\n \033[0m')
9 | sys.exit(1)
10 |
11 | if __name__ == '__main__':
12 | main()
13 |
--------------------------------------------------------------------------------
/theHarvester/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester/__init__.py
--------------------------------------------------------------------------------
/theHarvester/data/api-keys.yaml:
--------------------------------------------------------------------------------
1 | apikeys:
2 |
3 | bevigil:
4 | key:
5 |
6 | bing:
7 | key:
8 |
9 | bufferoverun:
10 | key:
11 |
12 | censys:
13 | id:
14 | secret:
15 |
16 | criminalip:
17 | key:
18 |
19 | dehashed:
20 | key:
21 |
22 | dnsdumpster:
23 | key:
24 |
25 | fullhunt:
26 | key:
27 |
28 | github:
29 | key:
30 |
31 | hunter:
32 | key:
33 |
34 | hunterhow:
35 | key:
36 |
37 | intelx:
38 | key:
39 |
40 | netlas:
41 | key:
42 |
43 | onyphe:
44 | key:
45 |
46 | pentestTools:
47 | key:
48 |
49 | projectDiscovery:
50 | key:
51 |
52 | rocketreach:
53 | key:
54 |
55 | securityTrails:
56 | key:
57 |
58 | shodan:
59 | key:
60 |
61 | tomba:
62 | key:
63 | secret:
64 |
65 | venacus:
66 | key:
67 |
68 | virustotal:
69 | key:
70 |
71 | whoisxml:
72 | key:
73 |
74 | zoomeye:
75 | key:
76 |
--------------------------------------------------------------------------------
/theHarvester/data/proxies.yaml:
--------------------------------------------------------------------------------
1 | http:
2 | - ip:port
3 |
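4 | # Example entry (hypothetical proxy; entries are given as host:port and the
5 | # http:// scheme is prepended when the list is loaded, cf. tests/lib/test_core.py):
6 | # - 127.0.0.1:8080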
--------------------------------------------------------------------------------
/theHarvester/data/wordlists/dorks.txt:
--------------------------------------------------------------------------------
1 | inurl:"contact"
2 | intext:email filetype:log
3 | "Index of /mail"
4 | "admin account info" filetype:log
5 | intext:@
6 | administrator accounts/
7 | intitle:"Index of" .bash_history
8 | intitle:"index of" members OR accounts
9 | inurl:/shared/help.php
10 | inurl:public
11 | intitle:index.of inbox
12 | intitle:"Server Administration"
13 | inurl:passwd.txt
14 | robots.txt
15 | php-addressbook "This is the addressbook for *" -warning
--------------------------------------------------------------------------------
/theHarvester/data/wordlists/general/common.txt:
--------------------------------------------------------------------------------
1 | admin
2 | test
3 | hello
4 | uk
5 | login
6 | book
7 | robots.txt
8 |
--------------------------------------------------------------------------------
/theHarvester/discovery/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/laramies/theHarvester/da97b0fc366f216fbaa287e8e419dcdd85e79273/theHarvester/discovery/__init__.py
--------------------------------------------------------------------------------
/theHarvester/discovery/baidusearch.py:
--------------------------------------------------------------------------------
1 | from theHarvester.lib.core import AsyncFetcher, Core
2 | from theHarvester.parsers import myparser
3 |
4 |
5 | class SearchBaidu:
6 | def __init__(self, word, limit) -> None:
7 | self.word = word
8 | self.total_results = ''
9 | self.server = 'www.baidu.com'
10 | self.hostname = 'www.baidu.com'
11 | self.limit = limit
12 | self.proxy = False
13 |
14 | async def do_search(self) -> None:
15 | headers = {'Host': self.hostname, 'User-agent': Core.get_user_agent()}
16 | base_url = f'https://{self.server}/s?wd=%40{self.word}&pn=xx&oq={self.word}'
17 | urls = [base_url.replace('xx', str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
18 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy)
19 | for response in responses:
20 | self.total_results += response
21 |
22 | async def process(self, proxy: bool = False) -> None:
23 | self.proxy = proxy
24 | await self.do_search()
25 |
26 | async def get_emails(self):
27 | rawres = myparser.Parser(self.total_results, self.word)
28 | return await rawres.emails()
29 |
30 | async def get_hostnames(self):
31 | rawres = myparser.Parser(self.total_results, self.word)
32 | return await rawres.hostnames()
33 |
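34 | # Usage sketch (run inside an async function; the same process()/get_*()
35 | # pattern applies to the other discovery modules):
36 | #   search = SearchBaidu('example.com', 100)
37 | #   await search.process()
38 | #   hostnames = await search.get_hostnames()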
--------------------------------------------------------------------------------
/theHarvester/discovery/bevigil.py:
--------------------------------------------------------------------------------
1 | from theHarvester.discovery.constants import MissingKey
2 | from theHarvester.lib.core import AsyncFetcher, Core
3 |
4 |
5 | class SearchBeVigil:
6 | def __init__(self, word) -> None:
7 | self.word = word
8 | self.totalhosts: set = set()
9 | self.interestingurls: set = set()
10 | self.key = Core.bevigil_key()
11 | if self.key is None:
12 | self.key = ''
13 | raise MissingKey('bevigil')
14 | self.proxy = False
15 |
16 | async def do_search(self) -> None:
17 | subdomain_endpoint = f'https://osint.bevigil.com/api/{self.word}/subdomains/'
18 | url_endpoint = f'https://osint.bevigil.com/api/{self.word}/urls/'
19 | headers = {'X-Access-Token': self.key}
20 |
21 | responses = await AsyncFetcher.fetch_all([subdomain_endpoint], json=True, proxy=self.proxy, headers=headers)
22 | response = responses[0]
23 | for subdomain in response['subdomains']:
24 | self.totalhosts.add(subdomain)
25 |
26 | responses = await AsyncFetcher.fetch_all([url_endpoint], json=True, proxy=self.proxy, headers=headers)
27 | response = responses[0]
28 | for url in response['urls']:
29 | self.interestingurls.add(url)
30 |
31 | async def get_hostnames(self) -> set:
32 | return self.totalhosts
33 |
34 | async def get_interestingurls(self) -> set:
35 | return self.interestingurls
36 |
37 | async def process(self, proxy: bool = False) -> None:
38 | self.proxy = proxy
39 | await self.do_search()
40 |
--------------------------------------------------------------------------------
/theHarvester/discovery/bingsearch.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | from theHarvester.discovery.constants import MissingKey
4 | from theHarvester.lib.core import AsyncFetcher, Core
5 | from theHarvester.parsers import myparser
6 |
7 |
8 | class SearchBing:
9 | def __init__(self, word, limit, start) -> None:
10 | self.word = word.replace(' ', '%20')
11 | self.results: list[Any] = []
12 | self.total_results = ''
13 | self.server = 'www.bing.com'
14 | self.apiserver = 'api.search.live.net'
15 | self.hostname = 'www.bing.com'
16 | self.limit = int(limit)
17 | self.bingApi = Core.bing_key()
18 | self.counter = start
19 | self.proxy = False
20 |
21 | async def do_search(self) -> None:
22 | headers = {
23 | 'Host': self.hostname,
24 | 'Cookie': 'SRCHHPGUSR=ADLT=DEMOTE&NRSLT=50',
25 | 'Accept-Language': 'en-us,en',
26 | 'User-agent': Core.get_user_agent(),
27 | }
28 | base_url = f'https://{self.server}/search?q=%40"{self.word}"&count=50&first=xx'
29 | urls = [base_url.replace('xx', str(num)) for num in range(0, self.limit, 50) if num <= self.limit]
30 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy)
31 | for response in responses:
32 | self.total_results += response
33 |
34 | async def do_search_api(self) -> None:
35 | url = 'https://api.bing.microsoft.com/v7.0/search?'
36 | params = {
37 | 'q': self.word,
38 | 'count': str(self.limit),
39 | 'offset': '0',
40 | 'mkt': 'en-us',
41 | 'safesearch': 'Off',
42 | }
43 | headers = {
44 | 'User-Agent': Core.get_user_agent(),
45 | 'Ocp-Apim-Subscription-Key': self.bingApi,
46 | }
47 | self.results = await AsyncFetcher.fetch_all([url], headers=headers, params=params, proxy=self.proxy)
48 | for res in self.results:
49 | self.total_results += res
50 |
51 | async def do_search_vhost(self) -> None:
52 | headers = {
53 | 'Host': self.hostname,
54 | 'Cookie': 'mkt=en-US;ui=en-US;SRCHHPGUSR=NEWWND=0&ADLT=DEMOTE&NRSLT=50',
55 | 'Accept-Language': 'en-us,en',
56 | 'User-agent': Core.get_user_agent(),
57 | }
58 | base_url = f'http://{self.server}/search?q=ip:{self.word}&go=&count=50&FORM=QBHL&qs=n&first=xx'
59 | urls = [base_url.replace('xx', str(num)) for num in range(0, self.limit, 50) if num <= self.limit]
60 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy)
61 | for response in responses:
62 | self.total_results += response
63 |
64 | async def get_emails(self):
65 | rawres = myparser.Parser(self.total_results, self.word)
66 | return await rawres.emails()
67 |
68 | async def get_hostnames(self):
69 | rawres = myparser.Parser(self.total_results, self.word)
70 | return await rawres.hostnames()
71 |
72 | async def get_allhostnames(self):
73 | rawres = myparser.Parser(self.total_results, self.word)
74 | return await rawres.hostnames_all()
75 |
76 | async def process(self, api, proxy: bool = False) -> None:
77 | self.proxy = proxy
78 | if api == 'yes':
79 | if self.bingApi is None:
80 | raise MissingKey('BingAPI')
81 | await self.do_search_api()
82 | else:
83 | await self.do_search()
84 | print(f'\tSearching {self.counter} results.')
85 |
86 | async def process_vhost(self) -> None:
87 | await self.do_search_vhost()
88 |
--------------------------------------------------------------------------------
/theHarvester/discovery/bravesearch.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 |
3 | from theHarvester.discovery.constants import get_delay
4 | from theHarvester.lib.core import AsyncFetcher, Core
5 | from theHarvester.parsers import myparser
6 |
7 |
8 | class SearchBrave:
9 | def __init__(self, word, limit):
10 | self.word = word
11 | self.results = ''
12 | self.totalresults = ''
13 | self.server = 'https://search.brave.com/search?q='
14 | self.limit = limit
15 | self.proxy = False
16 |
17 | async def do_search(self):
18 | headers = {'User-Agent': Core.get_user_agent()}
19 | for query in [f'"{self.word}"', f'site:{self.word}']:
20 | try:
21 | for offset in range(0, 50):
22 | # To reduce the total number of requests, only two queries are made "self.word" and site:self.word
23 | current_url = f'{self.server}{query}&offset={offset}&source=web&show_local=0&spellcheck=0'
24 | resp = await AsyncFetcher.fetch_all([current_url], headers=headers, proxy=self.proxy)
25 | self.results = resp[0]
26 | self.totalresults += self.results
27 | # if 'Results from Microsoft Bing.' in resp[0] \
28 |                     if (
29 |                         'Not many great matches came back for your search' in resp[0]
30 |                         or 'Your request has been flagged as being suspicious and Brave Search' in resp[0]
31 |                         or ('Prove' in resp[0]
32 |                         and 'robot' in resp[0])
33 |                         or 'Robot' in resp[0]
34 |                     ):
35 | break
36 | await asyncio.sleep(get_delay() + 15)
37 | except Exception as e:
38 | print(f'An exception has occurred in bravesearch: {e}')
39 | await asyncio.sleep(get_delay() + 80)
40 | continue
41 |
42 | async def get_emails(self):
43 | rawres = myparser.Parser(self.totalresults, self.word)
44 | return await rawres.emails()
45 |
46 | async def get_hostnames(self):
47 | rawres = myparser.Parser(self.totalresults, self.word)
48 | return await rawres.hostnames()
49 |
50 | async def process(self, proxy=False):
51 | self.proxy = proxy
52 | await self.do_search()
53 |
--------------------------------------------------------------------------------
/theHarvester/discovery/bufferoverun.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from theHarvester.discovery.constants import MissingKey
4 | from theHarvester.lib.core import AsyncFetcher, Core
5 |
6 |
7 | class SearchBufferover:
8 | def __init__(self, word) -> None:
9 | self.word = word
10 | self.totalhosts: set = set()
11 | self.totalips: set = set()
12 | self.key = Core.bufferoverun_key()
13 | if self.key is None:
14 | raise MissingKey('bufferoverun')
15 | self.proxy = False
16 |
17 | async def do_search(self) -> None:
18 | url = f'https://tls.bufferover.run/dns?q={self.word}'
19 | response = await AsyncFetcher.fetch_all(
20 | [url],
21 | json=True,
22 | headers={'User-Agent': Core.get_user_agent(), 'x-api-key': f'{self.key}'},
23 | proxy=self.proxy,
24 | )
25 | dct = response[0]
26 |         if dct['Results']:
27 |             # Results appear to be comma-separated records. Keep the first field
28 |             # when it already contains the target domain; otherwise take the
29 |             # hostname field (index 4). Records without a comma are skipped.
30 |             self.totalhosts = {
31 |                 (host.split(',')[0] if self.word.replace('www.', '') in host.split(',')[0] else host.split(',')[4])
32 |                 for host in dct['Results']
33 |                 if ',' in host
34 |             }
35 |
36 | self.totalips = {
37 | ip.split(',')[0] for ip in dct['Results'] if re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', ip.split(',')[0])
38 | }
39 |
40 | async def get_hostnames(self) -> set:
41 | return self.totalhosts
42 |
43 | async def get_ips(self) -> set:
44 | return self.totalips
45 |
46 | async def process(self, proxy: bool = False) -> None:
47 | self.proxy = proxy
48 | await self.do_search()
49 |
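50 | 
51 | if __name__ == '__main__':
52 |     import asyncio
53 | 
54 |     # Minimal usage sketch, not part of the original module; it requires a
55 |     # bufferoverun API key in api-keys.yaml and performs a live query against
56 |     # tls.bufferover.run for the placeholder domain.
57 |     async def demo() -> None:
58 |         search = SearchBufferover('example.com')
59 |         await search.process()
60 |         print(await search.get_hostnames())
61 |         print(await search.get_ips())
62 | 
63 |     asyncio.run(demo())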
--------------------------------------------------------------------------------
/theHarvester/discovery/builtwith.py:
--------------------------------------------------------------------------------
1 | import aiohttp
2 |
3 | from theHarvester.discovery.constants import MissingKey
4 | from theHarvester.lib.core import AsyncFetcher, Core
5 |
6 |
7 | class SearchBuiltWith:
8 | def __init__(self, word: str):
9 | self.word = word
10 | self.api_key = Core.builtwith_key()
11 | self.base_url = 'https://api.builtwith.com/v21/api.json'
12 | self.headers = {'Authorization': f'Bearer {self.api_key}', 'Content-Type': 'application/json'}
13 | self.hosts = set()
14 | self.tech_stack = {}
15 | self.interesting_urls = set()
16 | self.frameworks = set()
17 | self.languages = set()
18 | self.servers = set()
19 | self.cms = set()
20 | self.analytics = set()
21 |
22 | async def process(self, proxy: bool = False) -> None:
23 | """Get technology stack information for a domain."""
24 | try:
25 | if proxy:
26 | response = await AsyncFetcher.fetch(
27 | session=None, url=f'{self.base_url}?KEY={self.api_key}&LOOKUP={self.word}', headers=self.headers, proxy=proxy
28 | )
29 | if response:
30 | self.tech_stack = response
31 | self._extract_data()
32 | else:
33 | async with aiohttp.ClientSession(headers=self.headers) as session:
34 | async with session.get(f'{self.base_url}?KEY={self.api_key}&LOOKUP={self.word}') as response:
35 | if response.status == 200:
36 | data = await response.json()
37 | self.tech_stack = data
38 | self._extract_data()
39 | elif response.status == 401:
40 | print('[!] Missing API key for BuiltWith.')
41 | raise MissingKey('BuiltWith')
42 | except Exception as e:
43 | print(f'Error in BuiltWith search: {e}')
44 |
45 | def _extract_data(self) -> None:
46 | """Extract and categorize technology information."""
47 | if 'domains' in self.tech_stack:
48 | self.hosts.update(self.tech_stack['domains'])
49 | if 'paths' in self.tech_stack:
50 | self.interesting_urls.update(self.tech_stack['paths'])
51 | if 'technologies' in self.tech_stack:
52 | for tech in self.tech_stack['technologies']:
53 | category = tech.get('category', '').lower()
54 | name = tech.get('name', '')
55 |
56 | if 'framework' in category:
57 | self.frameworks.add(name)
58 | elif 'language' in category:
59 | self.languages.add(name)
60 | elif 'server' in category:
61 | self.servers.add(name)
62 | elif 'cms' in category:
63 | self.cms.add(name)
64 | elif 'analytics' in category:
65 | self.analytics.add(name)
66 |
67 | async def get_hostnames(self) -> set[str]:
68 | return self.hosts
69 |
70 | async def get_tech_stack(self) -> dict:
71 | return self.tech_stack
72 |
73 | async def get_interesting_urls(self) -> set[str]:
74 | return self.interesting_urls
75 |
76 | async def get_frameworks(self) -> set[str]:
77 | return self.frameworks
78 |
79 | async def get_languages(self) -> set[str]:
80 | return self.languages
81 |
82 | async def get_servers(self) -> set[str]:
83 | return self.servers
84 |
85 | async def get_cms(self) -> set[str]:
86 | return self.cms
87 |
88 | async def get_analytics(self) -> set[str]:
89 | return self.analytics
90 |
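91 | 
92 | if __name__ == '__main__':
93 |     import asyncio
94 | 
95 |     # Minimal usage sketch, not part of the original module; it requires a
96 |     # BuiltWith API key in api-keys.yaml and performs a live lookup for the
97 |     # placeholder domain.
98 |     async def demo() -> None:
99 |         bw = SearchBuiltWith('example.com')
100 |         await bw.process()
101 |         print(await bw.get_frameworks())
102 |         print(await bw.get_servers())
103 |         print(await bw.get_hostnames())
104 | 
105 |     asyncio.run(demo())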
--------------------------------------------------------------------------------
/theHarvester/discovery/censysearch.py:
--------------------------------------------------------------------------------
1 | from censys.common import __version__
2 | from censys.common.exceptions import (
3 | CensysRateLimitExceededException,
4 | CensysUnauthorizedException,
5 | )
6 | from censys.search import CensysCerts
7 |
8 | from theHarvester.discovery.constants import MissingKey
9 | from theHarvester.lib.core import Core
10 | from theHarvester.lib.version import version as theharvester_version
11 |
12 |
13 | class SearchCensys:
14 | def __init__(self, domain, limit: int = 500) -> None:
15 | self.word = domain
16 | self.key = Core.censys_key()
17 | if self.key[0] is None or self.key[1] is None:
18 | raise MissingKey('Censys ID and/or Secret')
19 | self.totalhosts: set = set()
20 | self.emails: set = set()
21 | self.limit = limit
22 | self.proxy = False
23 |
24 | async def do_search(self) -> None:
25 | try:
26 | cert_search = CensysCerts(
27 | api_id=self.key[0],
28 | api_secret=self.key[1],
29 |                 user_agent=f'censys-python/{__version__} (theHarvester/{theharvester_version}); +https://github.com/laramies/theHarvester',
30 | )
31 | except CensysUnauthorizedException:
32 | raise MissingKey('Censys ID and/or Secret')
33 |
34 | query = f'names: {self.word}'
35 | try:
36 | response = cert_search.search(
37 | query=query,
38 | fields=['names', 'parsed.subject.email_address'],
39 | max_records=self.limit,
40 | )
41 | for cert in response():
42 | self.totalhosts.update(cert.get('names', []))
43 | email_address = cert.get('parsed', {}).get('subject', {}).get('email_address', [])
44 | self.emails.update(email_address)
45 | except CensysRateLimitExceededException:
46 | print('Censys rate limit exceeded')
47 |
48 | async def get_hostnames(self) -> set:
49 | return self.totalhosts
50 |
51 | async def get_emails(self) -> set:
52 | return self.emails
53 |
54 | async def process(self, proxy: bool = False) -> None:
55 | self.proxy = proxy
56 | await self.do_search()
57 |
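58 | 
59 | if __name__ == '__main__':
60 |     import asyncio
61 | 
62 |     # Minimal usage sketch, not part of the original module; it requires a
63 |     # Censys API ID and secret in api-keys.yaml and queries the live
64 |     # certificate index for the placeholder domain.
65 |     async def demo() -> None:
66 |         censys = SearchCensys('example.com', limit=100)
67 |         await censys.process()
68 |         print(await censys.get_hostnames())
69 |         print(await censys.get_emails())
70 | 
71 |     asyncio.run(demo())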
--------------------------------------------------------------------------------
/theHarvester/discovery/certspottersearch.py:
--------------------------------------------------------------------------------
1 | from theHarvester.lib.core import AsyncFetcher
2 |
3 |
4 | class SearchCertspoter:
5 | def __init__(self, word) -> None:
6 | self.word = word
7 | self.totalhosts: set = set()
8 | self.proxy = False
9 |
10 | async def do_search(self) -> None:
11 | base_url = f'https://api.certspotter.com/v1/issuances?domain={self.word}&expand=dns_names'
12 | try:
13 | response = await AsyncFetcher.fetch_all([base_url], json=True, proxy=self.proxy)
14 | response = response[0]
15 | if isinstance(response, list):
16 | for dct in response:
17 | for key, value in dct.items():
18 | if key == 'dns_names':
19 | self.totalhosts.update({name for name in value if name})
20 |             elif isinstance(response, dict):
21 |                 self.totalhosts.update({name for name in response.get('dns_names', []) if name})
22 |             # Any other response shape (e.g. an error payload) carries no
23 |             # hostnames, so it is ignored.
24 | except Exception as e:
25 | print(e)
26 |
27 | async def get_hostnames(self) -> set:
28 | return self.totalhosts
29 |
30 | async def process(self, proxy: bool = False) -> None:
31 | self.proxy = proxy
32 | await self.do_search()
33 | print('\tSearching results.')
34 |
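35 | 
36 | if __name__ == '__main__':
37 |     import asyncio
38 | 
39 |     # Minimal usage sketch, not part of the original module. Cert Spotter's
40 |     # issuance endpoint can be queried without an API key at low volumes;
41 |     # 'example.com' is a placeholder target.
42 |     async def demo() -> None:
43 |         certs = SearchCertspoter('example.com')
44 |         await certs.process()
45 |         print(await certs.get_hostnames())
46 | 
47 |     asyncio.run(demo())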
--------------------------------------------------------------------------------
/theHarvester/discovery/constants.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | from theHarvester.lib.core import AsyncFetcher, Core
4 |
5 |
6 | async def splitter(links):
7 | """
8 |     Tries to remove duplicate LinkedIn profile links.
9 |     LinkedIn lists pull a lot of profiles with the same name, so this keeps
10 |     only the first link whose embedded profile name has not been seen yet.
11 | :param links: list of links to remove duplicates from
12 | :return: a unique-ish list
13 | """
14 | unique_list = []
15 | name_check = []
16 | for url in links:
17 | tail = url.split('/')[-1]
18 | if len(tail) == 2 or tail == 'zh-cn':
19 | tail = url.split('/')[-2]
20 | name = tail.split('-')
21 | if len(name) > 1:
22 | joined_name = name[0] + name[1]
23 | else:
24 | joined_name = name[0]
25 | if joined_name not in name_check:
26 | unique_list.append(url)
27 | name_check.append(joined_name)
28 | return unique_list
29 |
30 |
31 | def filter(lst):
32 | """
33 | Method that filters list
34 | :param lst: list to be filtered
35 | :return: new filtered list
36 | """
37 | if lst is None:
38 | return []
39 | if not isinstance(lst, set):
40 | lst = set(lst) # Remove duplicates.
41 | new_lst = []
42 | for item in lst:
43 | item = str(item)
44 | if (item[0].isalpha() or item[0].isdigit()) and ('xxx' not in item) and ('..' not in item):
45 | item = item.replace('252f', '').replace('2F', '').replace('2f', '')
46 | new_lst.append(item.lower())
47 | return new_lst
48 |
49 |
50 | def get_delay() -> float:
51 | """Method that is used to generate a random delay"""
52 | return random.randint(1, 3) - 0.5
53 |
54 |
55 | async def search(text: str) -> bool:
56 | """Helper function to check if Google has blocked traffic.
57 | :param text: See if specific text is returned, which means Google is blocking us
58 | :return bool:
59 | """
60 | for line in text.strip().splitlines():
61 | if (
62 | 'This page appears when Google automatically detects requests coming from your computer network' in line
63 | or 'http://www.google.com/sorry/index' in line
64 | or 'https://www.google.com/sorry/index' in line
65 | ):
66 | # print('\tGoogle is blocking your IP due to too many automated requests, wait or change your IP')
67 | return True
68 | return False
69 |
70 |
71 | async def google_workaround(visit_url: str) -> bool | str:
72 | """
73 | Function that makes a request on our behalf if Google starts to block us
74 | :param visit_url: Url to scrape
75 | :return: Correct html that can be parsed by BS4
76 | """
77 | url = 'https://websniffer.cc/'
78 | data = {
79 | 'Cookie': '',
80 | 'url': visit_url,
81 | 'submit': 'Submit',
82 | 'type': 'GET&http=1.1',
83 | 'uak': str(random.randint(4, 8)), # select random UA to send to Google
84 | }
85 | returned_html = await AsyncFetcher.post_fetch(url, headers={'User-Agent': Core.get_user_agent()}, data=data)
86 | returned_html = (
87 | 'This page appears when Google automatically detects requests coming from your computer network'
88 | if returned_html == ''
89 | else returned_html[0]
90 | )
91 |
92 | returned_html = '' if 'Please Wait... | Cloudflare' in returned_html else returned_html
93 |
94 |     if len(returned_html) == 0 or await search(returned_html) or '&lt;html' not in returned_html:
95 |         # indicates that google is serving the workaround a captcha
96 |         # that means we will try our second option, which utilizes proxies
97 |         return True
98 |     # the html we get is malformed for BS4 as the angle brackets are entity-encoded
99 |     if '&lt;html&gt;' in returned_html:
100 |         start_index = returned_html.index('&lt;html&gt;')
101 |     else:
102 |         start_index = returned_html.index('&lt;html')
103 | 
104 |     end_index = returned_html.index('&lt;/html&gt;') + 1
105 |     correct_html = returned_html[start_index:end_index]
106 |     # Decode the entity-encoded tags so BS4 can parse the response's html
107 |     correct_html = ''.join(line.strip().replace('&lt;', '<').replace('&gt;', '>') for line in correct_html.splitlines())
108 | return correct_html
109 |
110 |
111 | class MissingKey(Exception):
112 | """
113 |     Raised when a module has not been provided its API key.
114 | """
115 |
116 | def __init__(self, source: str | None) -> None:
117 | if source:
118 | self.message = f'\n\033[93m[!] Missing API key for {source}. \033[0m'
119 | else:
120 | self.message = '\n\033[93m[!] Missing CSE id. \033[0m'
121 |
122 | def __str__(self) -> str:
123 | return self.message
124 |
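125 | 
126 | if __name__ == '__main__':
127 |     import asyncio
128 | 
129 |     # Minimal demo of the pure helpers above, not part of the original module.
130 |     print(get_delay())  # one of 0.5, 1.5 or 2.5
131 |     print(sorted(filter(['Example.COM', 'xxx.invalid', '..bad', 'sub.example.com'])))
132 |     # Both links embed the name "jane doe", so splitter keeps only the first.
133 |     links = ['https://www.linkedin.com/in/jane-doe', 'https://www.linkedin.com/in/jane-doe-1b2c3d']
134 |     print(asyncio.run(splitter(links)))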
--------------------------------------------------------------------------------
/theHarvester/discovery/criminalip.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from urllib.parse import urlparse
3 |
4 | from theHarvester.discovery.constants import MissingKey, get_delay
5 | from theHarvester.lib.core import AsyncFetcher, Core
6 |
7 |
8 | class SearchCriminalIP:
9 | def __init__(self, word) -> None:
10 | self.word = word
11 | self.totalhosts: set = set()
12 | self.totalips: set = set()
13 | self.asns: set = set()
14 | self.key = Core.criminalip_key()
15 | if self.key is None:
16 | raise MissingKey('criminalip')
17 | self.proxy = False
18 |
19 | async def do_search(self) -> None:
20 | # https://www.criminalip.io/developer/api/post-domain-scan
21 | # https://www.criminalip.io/developer/api/get-domain-status-id
22 | # https://www.criminalip.io/developer/api/get-domain-report-id
23 | url = 'https://api.criminalip.io/v1/domain/scan'
24 | data = f'{{"query": "{self.word}"}}'
25 | # print(f'Current key: {self.key}')
26 | user_agent = Core.get_user_agent()
27 | response = await AsyncFetcher.post_fetch(
28 | url,
29 | json=True,
30 | headers={'User-Agent': user_agent, 'x-api-key': f'{self.key}'},
31 | data=data,
32 | proxy=self.proxy,
33 | )
34 | # print(f'My response: {response}')
35 | # Expected response format:
36 | # {'data': {'scan_id': scan_id}, 'message': 'api success', 'status': 200}
37 | if 'status' in response.keys():
38 | status = response['status']
39 | if status != 200:
40 |                 print(f'An error occurred while searching CriminalIP; dumping response: {response}')
41 | else:
42 | scan_id = response['data']['scan_id']
43 | scan_percentage = 0
44 | counter = 0
45 | while scan_percentage != 100:
46 | status_url = f'https://api.criminalip.io/v1/domain/status/{scan_id}'
47 | status_response = await AsyncFetcher.fetch_all(
48 | [status_url],
49 | json=True,
50 | headers={'User-Agent': user_agent, 'x-api-key': f'{self.key}'},
51 | proxy=self.proxy,
52 | )
53 | status = status_response[0]
54 | # print(f'Status response: {status}')
55 | # Expected format:
56 | # {"data": {"scan_percentage": 100}, "message": "api success", "status": 200}
57 | scan_percentage = status['data']['scan_percentage']
58 | if scan_percentage == 100:
59 | break
60 | if scan_percentage == -2:
61 | print(f'CriminalIP failed to scan: {self.word} does not exist, verify manually')
62 | print(f'Dumping data: scan_response: {response} status_response: {status}')
63 | return
64 | if scan_percentage == -1:
65 | print(f'CriminalIP scan failed dumping data: scan_response: {response} status_response: {status}')
66 | return
67 | # Wait for scan to finish
68 | if counter >= 5:
69 | await asyncio.sleep(20 * get_delay())
70 | else:
71 | await asyncio.sleep(10 * get_delay())
72 | counter += 1
73 | if counter == 10:
74 | print(
75 | 'Ten iterations have occurred in CriminalIP waiting for scan to finish, returning to prevent infinite loop.'
76 | )
77 | print(
78 | f'Verify results manually on CriminalIP dumping data: scan_response: {response} status_response: {status}'
79 | )
80 | return
81 |
82 | report_url = f'https://api.criminalip.io/v1/domain/report/{scan_id}'
83 | scan_response = await AsyncFetcher.fetch_all(
84 | [report_url],
85 | json=True,
86 | headers={'User-Agent': user_agent, 'x-api-key': f'{self.key}'},
87 | proxy=self.proxy,
88 | )
89 | scan = scan_response[0]
90 | # json_formatted_str = json.dumps(scan, indent=2)
91 | # print(json_formatted_str)
92 | try:
93 | await self.parser(scan)
94 | except Exception as e:
95 | print(f'An exception occurred while parsing criminalip result: {e}')
96 | print('Dumping json: ')
97 | print(scan)
98 |
99 | async def parser(self, jlines):
100 | # TODO when new scope field is added to parse lines for potential new scope!
101 | # TODO map as_name to asn for asn data
102 | # TODO determine if worth storing interesting urls
103 | if 'data' not in jlines.keys():
104 | print(f'Error with criminalip data, dumping: {jlines}')
105 | return
106 | data = jlines['data']
107 | for cert in data['certificates']:
108 | # print(f'Current cert: {cert}')
109 | if cert['subject'].endswith('.' + self.word):
110 | self.totalhosts.add(cert['subject'])
111 |
112 | for connected_domain in data['connected_domain_subdomain']:
113 | try:
114 | main_domain = connected_domain['main_domain']['domain']
115 | subdomains = [sub['domain'] for sub in connected_domain['subdomains']]
116 | if main_domain.endswith('.' + self.word):
117 | self.totalhosts.add(main_domain)
118 | for sub in subdomains:
119 | # print(f'Current sub: {sub}')
120 | if sub.endswith('.' + self.word):
121 | self.totalhosts.add(sub)
122 | except Exception as e:
123 | print(f'An exception has occurred: {e}')
124 | print(f'Main line: {connected_domain}')
125 |
126 | for ip_info in data['connected_ip_info']:
127 | self.asns.add(str(ip_info['asn']))
128 | domains = [sub['domain'] for sub in ip_info['domain_list']]
129 | for sub in domains:
130 | if sub.endswith('.' + self.word):
131 | self.totalhosts.add(sub)
132 | self.totalips.add(ip_info['ip'])
133 |
134 | for cookie in data['cookies']:
135 | if cookie['domain'] != '.' + self.word and cookie['domain'].endswith('.' + self.word):
136 | self.totalhosts.add(cookie['domain'])
137 |
138 | for country in data['country']:
139 | if country['domain'].endswith('.' + self.word):
140 | self.totalhosts.add(country['domain'])
141 | for ip in country['mapped_ips']:
142 | self.totalips.add(ip['ip'])
143 |
144 | for k, v in data['dns_record'].items():
145 | if k == 'dns_record_type_a':
146 | for ip in data['dns_record'][k]['ipv4']:
147 | self.totalips.add(ip['ip'])
148 | else:
149 | if isinstance(v, list):
150 | for item in v:
151 | if isinstance(item, list):
152 | for subitem in item:
153 | if subitem.endswith('.' + self.word):
154 | self.totalhosts.add(subitem)
155 | else:
156 | if item.endswith('.' + self.word):
157 | self.totalhosts.add(item)
158 |
159 | for domain_list in data['domain_list']:
160 | self.asns.add(str(domain_list['asn']))
161 | domains = [sub['domain'] for sub in domain_list['domain_list']]
162 | for sub in domains:
163 | if sub.endswith('.' + self.word):
164 | self.totalhosts.add(sub)
165 | self.totalips.add(domain_list['ip'])
166 |
167 | for html_page_links in data['html_page_link_domains']:
168 | domain = html_page_links['domain']
169 | if domain.endswith('.' + self.word):
170 | self.totalhosts.add(domain)
171 | for ip in html_page_links['mapped_ips']:
172 | self.totalips.add(ip['ip'])
173 |
174 | # TODO combine data['links'] and data['network_logs'] urls into one list for one run through
175 | for link in data['links']:
176 | url = link['url']
177 | parsed_url = urlparse(url)
178 | netloc = parsed_url.netloc
179 | if self.word in netloc:
180 | if (':' in netloc and netloc.split(':')[0].endswith(self.word)) or netloc.endswith(self.word):
181 | self.totalhosts.add(netloc)
182 |
183 | for log in data['network_logs']:
184 | url = log['url']
185 | parsed_url = urlparse(url)
186 | netloc = parsed_url.netloc
187 | if self.word in netloc:
188 | if (':' in netloc and netloc.split(':')[0].endswith(self.word)) or netloc.endswith(self.word):
189 | self.totalhosts.add(netloc)
190 | self.asns.add(str(log['as_number']))
191 |
192 | for redirects in data['page_redirections']:
193 | for redirect in redirects:
194 | url = redirect['url']
195 | parsed_url = urlparse(url)
196 | netloc = parsed_url.netloc
197 | if self.word in netloc:
198 | if (':' in netloc and netloc.split(':')[0].endswith(self.word)) or netloc.endswith(self.word):
199 | self.totalhosts.add(netloc)
200 |
201 | self.totalhosts = {host.replace('www.', '') for host in self.totalhosts if '*.' + self.word != host}
202 |
203 | # print(f'hostnames: {self.totalhosts}')
204 | # print(f'asns: {self.asns}')
205 | # print(f'ips: {self.totalips}')
206 |
207 | async def get_asns(self) -> set:
208 | return self.asns
209 |
210 | async def get_hostnames(self) -> set:
211 | return self.totalhosts
212 |
213 | async def get_ips(self) -> set:
214 | return self.totalips
215 |
216 | async def process(self, proxy: bool = False) -> None:
217 | self.proxy = proxy
218 | await self.do_search()
219 |
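220 | 
221 | if __name__ == '__main__':
222 |     # Minimal usage sketch, not part of the original module; it requires a
223 |     # CriminalIP API key in api-keys.yaml. A full domain scan of the
224 |     # placeholder target can take minutes, since do_search() polls the
225 |     # status endpoint until the scan completes.
226 |     async def demo() -> None:
227 |         cip = SearchCriminalIP('example.com')
228 |         await cip.process()
229 |         print(await cip.get_hostnames())
230 |         print(await cip.get_ips())
231 |         print(await cip.get_asns())
232 | 
233 |     asyncio.run(demo())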
--------------------------------------------------------------------------------
/theHarvester/discovery/crtsh.py:
--------------------------------------------------------------------------------
1 | from theHarvester.lib.core import AsyncFetcher
2 |
3 |
4 | class SearchCrtsh:
5 | def __init__(self, word) -> None:
6 | self.word = word
7 | self.data: list = []
8 | self.proxy = False
9 |
10 | async def do_search(self) -> list:
11 | data: set = set()
12 | try:
13 | url = f'https://crt.sh/?q=%25.{self.word}&exclude=expired&deduplicate=Y&output=json'
14 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy)
15 | response = response[0]
16 |             data = {(dct['name_value'][2:] if dct['name_value'][:2] == '*.' else dct['name_value']) for dct in response}
17 |             data = {domain for domain in data if (domain[0] != '*' and not domain[0:4].isnumeric())}
18 | except Exception as e:
19 | print(e)
20 |         # Entries may contain several space-separated names; split them so each
21 |         # hostname becomes its own list element.
22 |         clean: list = []
23 |         for entry in data:
24 |             clean.extend(entry.split())
25 |         return clean
26 |
27 | async def process(self, proxy: bool = False) -> None:
28 | self.proxy = proxy
29 | data = await self.do_search()
30 | self.data = data
31 |
32 | async def get_hostnames(self) -> list:
33 | return self.data
34 |
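35 | 
36 | if __name__ == '__main__':
37 |     import asyncio
38 | 
39 |     # Minimal usage sketch, not part of the original module; crt.sh requires no
40 |     # API key, so this performs a live certificate-transparency lookup for the
41 |     # placeholder domain.
42 |     async def demo() -> None:
43 |         crt = SearchCrtsh('example.com')
44 |         await crt.process()
45 |         print(await crt.get_hostnames())
46 | 
47 |     asyncio.run(demo())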
--------------------------------------------------------------------------------
/theHarvester/discovery/dnssearch.py:
--------------------------------------------------------------------------------
1 | """
2 | ============
3 | DNS Browsing
4 | ============
5 |
6 | Explore the space around known hosts & ips for extra catches.
7 | """
8 |
9 | import asyncio
10 | import re
11 | import sys
12 | from collections.abc import Callable
13 | from ipaddress import IPv4Network
14 |
15 | from aiodns import DNSResolver
16 |
17 | from theHarvester.lib import hostchecker
18 | from theHarvester.lib.core import DATA_DIR
19 |
20 | #####################################################################
21 | # DNS FORCE
22 | #####################################################################
23 |
24 | DNS_NAMES = DATA_DIR / 'wordlists' / 'dns-names.txt'
25 |
26 |
27 | class DnsForce:
28 | def __init__(self, domain, dnsserver, verbose: bool = False) -> None:
29 | self.domain = domain
30 | self.subdo = False
31 | self.verbose = verbose
32 | # self.dnsserver = [dnsserver] if isinstance(dnsserver, str) else dnsserver
33 | # self.dnsserver = list(map(str, dnsserver.split(','))) if isinstance(dnsserver, str) else dnsserver
34 | self.dnsserver = dnsserver
35 | with DNS_NAMES.open('r') as file:
36 | self.list = file.readlines()
37 | self.domain = domain.replace('www.', '')
38 | self.list = [f'{word.strip()}.{self.domain}' for word in self.list]
39 |
40 | async def run(self):
41 | print(f'Starting DNS brute forcing with {len(self.list)} words')
42 | checker = hostchecker.Checker(self.list, nameservers=self.dnsserver)
43 | resolved_pair, hosts, ips = await checker.check()
44 | return resolved_pair, hosts, ips
45 |
46 |
47 | #####################################################################
48 | # DNS REVERSE
49 | #####################################################################
50 |
51 |
52 | IP_REGEX = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
53 | PORT_REGEX = r'\d{1,5}'
54 | NETMASK_REGEX: str = r'\d{1,2}|' + IP_REGEX
55 | NETWORK_REGEX: str = rf'\b({IP_REGEX})(?:\:({PORT_REGEX}))?(?:\/({NETMASK_REGEX}))?\b'
56 |
57 |
58 | def serialize_ip_range(ip: str, netmask: str = '24') -> str:
59 | """
60 | Serialize a network range in a constant format, 'x.x.x.x/y'.
61 |
62 | Parameters
63 | ----------
64 | ip: str.
65 | A serialized ip in the format 'x.x.x.x'.
66 | Extra information like port (':z') or subnet ('/n')
67 | will be ignored.
68 | netmask: str.
69 | The subnet subdivision, represented by a 2 digit netmask.
70 |
71 | Returns
72 | -------
73 | out: str.
74 |         The network address in CIDR notation, like '192.168.0.0/24'.
75 | """
76 | __ip_matches = re.search(NETWORK_REGEX, ip, re.IGNORECASE)
77 | if __ip_matches and __ip_matches.groups():
78 | __ip = __ip_matches.group(1)
79 | __netmask = netmask if netmask else __ip_matches.group(3)
80 | if __ip and __netmask:
81 | return str(IPv4Network(f'{__ip}/{__netmask}', strict=False))
82 | elif __ip:
83 | return str(IPv4Network('{}/{}'.format(__ip, '24'), strict=False))
84 |
85 | # invalid input ip
86 | return ''
87 |
88 |
89 | def list_ips_in_network_range(iprange: str) -> list[str]:
90 | """
91 | List all the IPs in the range.
92 |
93 | Parameters
94 | ----------
95 | iprange: str.
96 | A serialized ip range, like '1.2.3.0/24'.
97 |         The host portion can be set to anything; it will be ignored.
98 |
99 | Returns
100 | -------
101 | out: list.
102 | The list of IPs in the range.
103 | """
104 | try:
105 | __network = IPv4Network(iprange, strict=False)
106 | return [__address.exploded for __address in __network.hosts()]
107 | except Exception:
108 | return []
109 |
110 |
111 | async def reverse_single_ip(ip: str, resolver: DNSResolver) -> str:
112 | """
113 |     Reverse-resolve a single IP and return the associated hostname, if any.
114 | Parameters
115 | ----------
116 | :param ip: IP address to reverse
117 | :param resolver: DNS server to use
118 |
119 | Returns
120 | -------
121 |     :return str: the corresponding hostname, or an empty string if none exists
122 | """
123 | try:
124 | __host = await resolver.gethostbyaddr(ip)
125 | return __host.name if __host else ''
126 | except Exception:
127 | return ''
128 |
129 |
130 | async def reverse_all_ips_in_range(iprange: str, callback: Callable, nameservers: list[str] | None = None) -> None:
131 | """
132 | Reverse all the IPs stored in a network range.
133 | All the queries are made concurrently.
134 |
135 | Parameters
136 | ----------
137 | iprange: str.
138 | An IPv4 range formatted as 'x.x.x.x/y'.
139 |         The host bits of the ip can be set to anything;
140 |         they will be ignored.
141 | callback: Callable.
142 | Arbitrary postprocessing function.
143 | nameservers: List[str].
144 | Optional list of DNS servers.
145 |
146 | Returns
147 | -------
148 | out: None.
149 | """
150 | loop = asyncio.get_event_loop()
151 | __resolver = DNSResolver(loop=loop, timeout=8, nameservers=nameservers)
152 | for __ip in list_ips_in_network_range(iprange):
153 | log_query(__ip)
154 | __host = await reverse_single_ip(ip=__ip, resolver=__resolver)
155 | callback(__host)
156 | log_result(__host)
157 |
158 |
159 | #####################################################################
160 | # IO
161 | #####################################################################
162 |
163 |
164 | def log_query(ip: str) -> None:
165 | """
166 | Display the current query in the console.
167 |
168 | Parameters
169 | ----------
170 | ip: str.
171 | Queried ip.
172 |
173 |     Returns
174 | -------
175 | out: None.
176 | """
177 | sys.stdout.write(chr(27) + '[2K' + chr(27) + '[G')
178 | sys.stdout.write('\r' + ip + ' - ')
179 | sys.stdout.flush()
180 |
181 |
182 | def log_result(host: str) -> None:
183 | """
184 | Display the query result in the console.
185 |
186 | Parameters
187 | ----------
188 | host: str.
189 | Host name returned by the DNS query.
190 |
191 |     Returns
192 | -------
193 | out: None.
194 | """
195 | if host:
196 | print(host)
197 |
198 |
199 | def generate_postprocessing_callback(target: str, **allhosts: list[str]) -> Callable:
200 | """
201 | Postprocess the query results asynchronously too, instead of waiting for
202 | the querying stage to be completely finished.
203 |
204 | Parameters
205 | ----------
206 | target: str.
207 |         The target domain to match candidate hosts against.
208 | allhosts: List.
209 | A collection of all the subdomains -of target- found so far.
210 |
211 | Returns
212 | -------
213 | out: Callable.
214 | A function that will update the collection of target subdomains
215 | when the query result is satisfying.
216 | """
217 |
218 | def append_matching_hosts(host: str) -> None:
219 | if host and target in host:
220 | for __name, __hosts in allhosts.items():
221 | if host not in __hosts:
222 | __hosts.append(host)
223 |
224 | return append_matching_hosts
225 |
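226 | 
227 | if __name__ == '__main__':
228 |     # Minimal demo of the pure helpers above, not part of the original module.
229 |     print(serialize_ip_range('192.168.0.17'))  # -> '192.168.0.0/24'
230 |     print(serialize_ip_range('10.0.0.1:8080', '16'))  # port ignored -> '10.0.0.0/16'
231 |     print(len(list_ips_in_network_range('10.0.0.0/30')))  # -> 2 usable hosts
232 |     found: list[str] = []
233 |     callback = generate_postprocessing_callback('example.com', hosts=found)
234 |     callback('sub.example.com')  # appended because it contains the target
235 |     print(found)  # -> ['sub.example.com']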
--------------------------------------------------------------------------------
/theHarvester/discovery/duckduckgosearch.py:
--------------------------------------------------------------------------------
1 | import ujson
2 |
3 | from theHarvester.lib.core import AsyncFetcher, Core
4 | from theHarvester.parsers import myparser
5 |
6 |
7 | class SearchDuckDuckGo:
8 | def __init__(self, word, limit) -> None:
9 | self.word = word
10 | self.results = ''
11 | self.totalresults = ''
12 | self.dorks: list = []
13 | self.links: list = []
14 | self.database = 'https://duckduckgo.com/?q='
15 | self.api = 'https://api.duckduckgo.com/?q=x&format=json&pretty=1' # Currently using API.
16 | self.quantity = '100'
17 | self.limit = limit
18 | self.proxy = False
19 |
20 | async def do_search(self) -> None:
21 | # Do normal scraping.
22 | url = self.api.replace('x', self.word)
23 | headers = {'User-Agent': Core.get_user_agent()}
24 | first_resp = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy)
25 | self.results = first_resp[0]
26 | self.totalresults += self.results
27 | urls = await self.crawl(self.results)
28 | urls = {url for url in urls if len(url) > 5}
29 | all_resps = await AsyncFetcher.fetch_all(urls)
30 | self.totalresults += ''.join(all_resps)
31 |
32 | async def crawl(self, text):
33 | """
34 | Function parses json and returns URLs.
35 | :param text: formatted json
36 | :return: set of URLs
37 | """
38 | urls = set()
39 | try:
40 | load = ujson.loads(text)
41 | for keys in load.keys(): # Iterate through keys of dict.
42 | val = load.get(keys)
43 |
44 |                 if isinstance(val, (int, dict)) or val is None:
45 | continue
46 |
47 | if isinstance(val, list):
48 | if len(val) == 0: # Make sure not indexing an empty list.
49 | continue
50 |                     val = val[0]  # The first value should be a dict.
51 |
52 |                 if isinstance(val, dict):  # Validation check.
53 |                     for key in val.keys():
54 |                         value = val.get(key)
55 |                         if isinstance(value, str) and value != '' and ('https://' in value or 'http://' in value):
56 |                             urls.add(value)
57 | 
58 |                 if isinstance(val, str) and val != '' and ('https://' in val or 'http://' in val):
59 |                     urls.add(val)
60 | tmp = set()
61 | for url in urls:
62 | if '<' in url and 'href=' in url: # Format is