├── .dockerignore ├── .flake8 ├── .gitattributes ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ └── issue-template.md ├── dependabot.yml └── workflows │ ├── codeql-analysis.yml │ ├── dockerci.yml │ └── theHarvester.yml ├── .gitignore ├── .lgtm.yml ├── Dockerfile ├── README.md ├── README ├── CONTRIBUTING.md ├── COPYING └── LICENSES ├── api-keys.yaml ├── bin ├── restfulHarvest └── theHarvester ├── debian ├── changelog ├── control ├── copyright ├── dirs ├── docs ├── gbp.conf ├── helper-script │ └── theharvester ├── patches │ ├── Disable-a-failing-test-unstable-site.patch │ ├── Improve-data-installation.patch │ └── series ├── rules ├── source │ └── format ├── tests │ └── control ├── theharvester.install ├── theharvester.links ├── upstream │ └── metadata └── watch ├── mypy.ini ├── proxies.yaml ├── pyproject.toml ├── pytest.ini ├── requirements.txt ├── requirements ├── base.txt └── dev.txt ├── restfulHarvest.py ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── discovery │ ├── __init__.py │ ├── test_anubis.py │ ├── test_certspotter.py │ ├── test_githubcode.py │ ├── test_omnisint.py │ ├── test_otx.py │ ├── test_qwantsearch.py │ ├── test_sublist3r.py │ └── test_threatminer.py └── test_myparser.py ├── theHarvester-logo.png ├── theHarvester.py ├── theHarvester ├── __init__.py ├── __main__.py ├── discovery │ ├── __init__.py │ ├── anubis.py │ ├── baidusearch.py │ ├── bevigil.py │ ├── binaryedgesearch.py │ ├── bingsearch.py │ ├── bufferoverun.py │ ├── censysearch.py │ ├── certspottersearch.py │ ├── constants.py │ ├── crtsh.py │ ├── dnsdumpster.py │ ├── dnssearch.py │ ├── duckduckgosearch.py │ ├── fullhuntsearch.py │ ├── githubcode.py │ ├── hackertarget.py │ ├── huntersearch.py │ ├── intelxsearch.py │ ├── omnisint.py │ ├── otxsearch.py │ ├── pentesttools.py │ ├── projectdiscovery.py │ ├── qwantsearch.py │ ├── rapiddns.py │ ├── rocketreach.py │ ├── securitytrailssearch.py │ ├── shodansearch.py │ ├── sublist3r.py │ ├── takeover.py │ ├── threatcrowd.py │ ├── threatminer.py │ ├── urlscan.py │ ├── virustotal.py │ ├── yahoosearch.py │ └── zoomeyesearch.py ├── lib │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── api.py │ │ ├── api_example.py │ │ └── static │ │ │ └── .gitkeep │ ├── core.py │ ├── hostchecker.py │ ├── ip-ranges.json │ ├── resolvers.txt │ └── stash.py ├── parsers │ ├── __init__.py │ ├── intelxparser.py │ ├── myparser.py │ └── securitytrailsparser.py └── screenshot │ └── screenshot.py └── wordlists ├── dns-big.txt ├── dns-names.txt ├── dorks.txt ├── general └── common.txt └── names_small.txt /.dockerignore: -------------------------------------------------------------------------------- 1 | .github/* 2 | .gitattributes 3 | .idea/ 4 | .lgtm.yml 5 | mypy.ini 6 | .pytest_cache 7 | .mypy_cache 8 | tests/* 9 | README/ 10 | bin/ -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501, F405, F403, F401, E402 -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, which is to have git automatically determine 2 | # whether a file is a text or binary, unless otherwise specified. 3 | 4 | * text=auto 5 | 6 | # Basic .gitattributes for a python repo. 
7 | 8 | # Source files 9 | # ============ 10 | *.pxd text diff=python 11 | *.py text diff=python 12 | *.py3 text diff=python 13 | *.pyw text diff=python 14 | *.pyx text diff=python 15 | 16 | # Binary files 17 | # ============ 18 | *.db binary 19 | *.p binary 20 | *.pkl binary 21 | *.pyc binary 22 | *.pyd binary 23 | *.pyo binary 24 | 25 | # Note: .db, .p, and .pkl files are associated with the python modules 26 | # ``pickle``, ``dbm.*``, # ``shelve``, ``marshal``, ``anydbm``, & ``bsddb`` 27 | # (among others). 28 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [L1ghtn1ng, NotoriousRebel] 4 | open_collective: # Replace with a single Open Collective username 5 | ko_fi: # 6 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 7 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 8 | liberapay: # Replace with a single Liberapay username 9 | issuehunt: # Replace with a single IssueHunt username 10 | otechie: # Replace with a single Otechie username 11 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 12 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/issue-template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Issue Template 3 | about: A template for new issues. 4 | title: "[Bug|Feature Request|Other] Short Description of Issue" 5 | labels: '' 6 | 7 | --- 8 | 9 | ## Note we do not support installing theHarvester on andriod 10 | 11 | **Feature Request or Bug or Other** 12 | Feature Request | Bug | Other 13 | 14 | **Describe the feature request or bug or other** 15 | A clear and concise description of what the bug, feature request, 16 | or other request is. 17 | 18 | **To Reproduce** 19 | Steps to reproduce the behaviour: 20 | 1. Run tool like this: '...' 21 | 2. See error 22 | 23 | **Expected behaviour** 24 | A clear and concise description of what you expected to happen. 25 | 26 | **Screenshots** 27 | If possible please add screenshots to help explain your problem. 28 | 29 | **System Information (System that tool is running on):** 30 | - OS: [e.g. Windows10] 31 | - Version [e.g. 2.7] 32 | 33 | **Additional context** 34 | Add any other context about the problem here. 35 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: github-actions 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | timezone: Europe/London 8 | - package-ecosystem: pip 9 | directory: "/" 10 | schedule: 11 | interval: daily 12 | timezone: Europe/London 13 | open-pull-requests-limit: 10 14 | target-branch: master 15 | allow: 16 | - dependency-type: direct 17 | - dependency-type: indirect 18 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 
3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master, dev ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master, dev ] 20 | schedule: 21 | - cron: '19 11 * * 4' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | 28 | strategy: 29 | fail-fast: false 30 | matrix: 31 | language: [ 'python' ] 32 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 33 | # Learn more: 34 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 35 | 36 | steps: 37 | - name: Checkout repository 38 | uses: actions/checkout@v3 39 | 40 | # Initializes the CodeQL tools for scanning. 41 | - name: Initialize CodeQL 42 | uses: github/codeql-action/init@v2 43 | with: 44 | languages: ${{ matrix.language }} 45 | # If you wish to specify custom queries, you can do so here or in a config file. 46 | # By default, queries listed here will override any specified in a config file. 47 | # Prefix the list here with "+" to use these queries and those in the config file. 48 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 49 | 50 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 51 | # If this step fails, then you should remove it and run the build manually (see below) 52 | - name: Autobuild 53 | uses: github/codeql-action/autobuild@v2 54 | 55 | # ℹ️ Command-line programs to run using the OS shell. 56 | # 📚 https://git.io/JvXDl 57 | 58 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 59 | # and modify them (or add more) to build your code if your project 60 | # uses a compiled language 61 | 62 | #- run: | 63 | # make bootstrap 64 | # make release 65 | 66 | - name: Perform CodeQL Analysis 67 | uses: github/codeql-action/analyze@v2 68 | -------------------------------------------------------------------------------- /.github/workflows/dockerci.yml: -------------------------------------------------------------------------------- 1 | name: TheHarvester Docker Image CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | - name: Build the Docker image 11 | run: docker build . 
--file Dockerfile --tag theharvester:$(date +%s) -------------------------------------------------------------------------------- /.github/workflows/theHarvester.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: TheHarvester Python CI 3 | 4 | on: 5 | push: 6 | branches: 7 | - '*' 8 | 9 | pull_request: 10 | branches: 11 | - '*' 12 | 13 | jobs: 14 | Python: 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | max-parallel: 8 18 | matrix: 19 | os: [ ubuntu-latest, macos-latest ] 20 | python-version: [ 3.8, 3.9, 3.10.0 ] 21 | 22 | steps: 23 | - uses: actions/checkout@v3 24 | - name: Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install dependencies 29 | run: | 30 | pip install --upgrade pip 31 | pip install wheel 32 | pip install -r requirements/dev.txt 33 | 34 | - name: Lint with flake8 35 | run: | 36 | # stop the build if there are Python syntax errors or undefined names 37 | flake8 . --count --show-source --statistics 38 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 39 | flake8 . --count --exit-zero --max-line-length=127 --statistics 40 | 41 | - name: Test with pytest 42 | run: | 43 | pytest 44 | 45 | - name: Static type checking with mypy 46 | run: | 47 | mypy --pretty theHarvester/*/*.py 48 | mypy --pretty theHarvester/*/*/*.py 49 | 50 | - name: Run theHarvester module Anubis 51 | run: | 52 | python theHarvester.py -d apple.com -b anubis 53 | 54 | - name: Run theHarvester module Baidu 55 | run: | 56 | python theHarvester.py -d yale.edu -b baidu 57 | 58 | - name: Run theHarvester module Bing 59 | run: | 60 | python theHarvester.py -d yale.edu -b bing 61 | 62 | - name: Run theHarvester module CertSpotter 63 | run: | 64 | python theHarvester.py -d yale.edu -b certspotter 65 | 66 | - name: Run theHarvester module Crtsh 67 | run: | 68 | python theHarvester.py -d hcl.com -b crtsh 69 | 70 | - name: Run theHarvester module DnsDumpster 71 | run: | 72 | python theHarvester.py -d yale.edu -b dnsdumpster 73 | 74 | - name: Run theHarvester module DuckDuckGo 75 | run: | 76 | python theHarvester.py -d yale.edu -b duckduckgo 77 | 78 | - name: Run theHarvester module HackerTarget 79 | run: | 80 | python theHarvester.py -d yale.edu -b hackertarget 81 | 82 | - name: Run theHarvester module Intelx 83 | run: | 84 | python theHarvester.py -d yale.edu -b intelx 85 | 86 | - name: Run theHarvester module Omnisint 87 | run: | 88 | python theHarvester.py -d yale.edu -b omnisint 89 | 90 | - name: Run theHarvester module Otx 91 | run: | 92 | python theHarvester.py -d yale.edu -b otx 93 | 94 | - name: Run theHarvester module Qwant 95 | run: | 96 | python theHarvester.py -d yale.edu -b qwant 97 | 98 | - name: Run theHarvester module RapidDns 99 | run: | 100 | python theHarvester.py -d yale.edu -b rapiddns 101 | 102 | - name: Run theHarvester module Sublist3r 103 | run: | 104 | python theHarvester.py -d yale.edu -b sublist3r 105 | 106 | - name: Run theHarvester module Threatcrowd 107 | run: | 108 | python theHarvester.py -d yale.edu -b threatcrowd 109 | 110 | - name: Run theHarvester module Threatminer 111 | run: | 112 | python theHarvester.py -d yale.edu -b threatminer 113 | 114 | - name: Run theHarvester module Urlscan 115 | run: | 116 | python theHarvester.py -d yale.edu -b urlscan 117 | 118 | - name: Run theHarvester module Yahoo 119 | run: | 120 | python theHarvester.py -d yale.edu -b yahoo 121 | 122 | - name: Run theHarvester 
module DNS brute force 123 | run: | 124 | python theHarvester.py -d yale.edu -c 125 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.idea 2 | *.pyc 3 | *.sqlite 4 | *.html 5 | *.htm 6 | *.vscode 7 | *.xml 8 | *.json 9 | debug_results.txt 10 | venv 11 | .mypy_cache 12 | .pytest_cache 13 | build/ 14 | dist/ 15 | theHarvester.egg-info 16 | api-keys.yaml 17 | .DS_Store 18 | .venv 19 | -------------------------------------------------------------------------------- /.lgtm.yml: -------------------------------------------------------------------------------- 1 | queries: 2 | - exclude: py/import-and-import-from 3 | - exclude: py/polluting-import 4 | - exclude: py/member-test-non-container 5 | 6 | extraction: 7 | python: 8 | python_setup: 9 | version: 3 -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:kinetic 2 | LABEL maintainer="@jay_townsend1 & @NotoriousRebel1" 3 | RUN mkdir /app 4 | WORKDIR /app 5 | COPY . /app 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | RUN apt update && apt dist-upgrade -qy && apt install -qy git python3 python3-pip libffi-dev libxml2-dev libxslt1-dev && /usr/bin/python3 -m pip install --upgrade pip && apt autoremove -qy 8 | RUN /usr/bin/python3 --version && pip3 install --no-cache-dir -r requirements.txt && chmod +x ./*.py 9 | ENTRYPOINT ["/app/theHarvester.py"] 10 | ENTRYPOINT ["/app/restfulHarvest.py", "-H", "0.0.0.0", "-p", "80"] 11 | EXPOSE 80 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![theHarvester](https://github.com/laramies/theHarvester/blob/master/theHarvester-logo.png) 2 | 3 | ![TheHarvester CI](https://github.com/laramies/theHarvester/workflows/TheHarvester%20Python%20CI/badge.svg) ![TheHarvester Docker Image CI](https://github.com/laramies/theHarvester/workflows/TheHarvester%20Docker%20Image%20CI/badge.svg) [![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/laramies/theHarvester.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/laramies/theHarvester/context:python) 4 | [![Rawsec's CyberSecurity Inventory](https://inventory.rawsec.ml/img/badges/Rawsec-inventoried-FF5050_flat_without_logo.svg)](https://inventory.rawsec.ml/) 5 | 6 | What is this? 7 | ------------- 8 | theHarvester is a simple to use, yet powerful tool designed to be used during the reconnaissance stage of a red
9 | team assessment or penetration test. It performs open source intelligence (OSINT) gathering to help determine
10 | a domain's external threat landscape. The tool gathers names, emails, IPs, subdomains, and URLs by using
11 | multiple public resources. A short usage sketch follows, and the full list of passive and active sources comes after it.
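The sketch below shows one way to drive a single passive module directly from Python; it mirrors the call pattern exercised in `tests/discovery/test_sublist3r.py` (construct the module with a domain, await `process()`, then read the results). Class names differ between discovery modules, so treat this as an illustration rather than a documented API.

```python
# Minimal sketch: run one passive discovery module directly from Python.
# Mirrors the pattern used in tests/discovery/test_sublist3r.py; other
# modules use different class names, so check the module source first.
import asyncio

from theHarvester.discovery import sublist3r


async def enumerate_subdomains(domain: str) -> list:
    search = sublist3r.SearchSublist3r(domain)
    await search.process()               # perform the passive lookup
    return await search.get_hostnames()  # hostnames gathered for the domain


if __name__ == '__main__':
    for hostname in asyncio.run(enumerate_subdomains('example.com')):
        print(hostname)
```

The equivalent command-line invocation, as used in the CI workflow, would be `python theHarvester.py -d example.com -b sublist3r`.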
12 | 13 | Passive: 14 | -------- 15 | * anubis: Anubis-DB - https://github.com/jonluca/anubis 16 | 17 | * bevigil: CloudSEK BeVigil scans mobile applications for OSINT assets and makes them available through an API - https://bevigil.com/osint-api 18 | 19 | * baidu: Baidu search engine - www.baidu.com 20 | 21 | * binaryedge: List of known subdomains from www.binaryedge.io 22 | 23 | * bing: Microsoft search engine - www.bing.com 24 | 25 | * bingapi: Microsoft search engine, through the API (Requires an API key, see below.) 26 | 27 | * bufferoverun: Uses data from Rapid7's Project Sonar - www.rapid7.com/research/project-sonar/ 28 | 29 | * censys: [Censys search engine](https://search.censys.io/), uses certificate searches to enumerate subdomains and gather emails (Requires an API key, see below.) - [censys.io](https://censys.io/) 30 | 31 | * certspotter: Cert Spotter monitors Certificate Transparency logs - https://sslmate.com/certspotter/ 32 | 33 | * crtsh: Comodo Certificate search - https://crt.sh 34 | 35 | * dnsdumpster: DNSdumpster search engine - https://dnsdumpster.com 36 | 37 | * duckduckgo: DuckDuckGo search engine - www.duckduckgo.com 38 | 39 | * fullhunt: The Next-Generation Attack Surface Security Platform - https://fullhunt.io 40 | 41 | * github-code: GitHub code search engine (Requires a GitHub Personal Access Token, see below.) - www.github.com 42 | 43 | * hackertarget: Online vulnerability scanners and network intelligence to help organizations - https://hackertarget.com 44 | 45 | * hunter: Hunter search engine (Requires an API key, see below.) - www.hunter.io 46 | 47 | * intelx: Intelx search engine (Requires an API key, see below.) - www.intelx.io 48 | 49 | * omnisint: Project Crobat, a centralised searchable open-source Project Sonar DNS database - https://github.com/Cgboal/SonarSearch 50 | 51 | * otx: AlienVault Open Threat Exchange - https://otx.alienvault.com 52 | 53 | * pentesttools: Powerful Penetration Testing Tools, Easy to Use (Requires an API key, see below.) - https://pentest-tools.com/home 54 | 55 | * projectdiscovery: We actively collect and maintain internet-wide assets data, 56 | to enhance research and analyse changes around DNS for better insights (Requires an API key, see below.) - https://chaos.projectdiscovery.io 57 | 58 | * qwant: Qwant search engine - www.qwant.com 59 | 60 | * rapiddns: DNS query tool which makes querying subdomains or sites on the same IP easy - https://rapiddns.io 61 | 62 | * rocketreach: Access real-time verified personal/professional emails, phone numbers, and social media links - https://rocketreach.co 63 | 64 | * securityTrails: Security Trails search engine, the world's largest repository of historical DNS data
65 | (Requires an API key, see below.) - www.securitytrails.com 66 | 67 | * shodan: Shodan search engine, searches for ports and banners from discovered hosts (Requires an API key, see below.) - www.shodanhq.com 68 | 69 | * sublist3r: Fast subdomains enumeration tool for penetration testers - https://api.sublist3r.com/search.php?domain=example.com 70 | 71 | * threatcrowd: Open source threat intelligence - www.threatcrowd.org 72 | 73 | * threatminer: Data mining for threat intelligence - https://www.threatminer.org/ 74 | 75 | * urlscan: A sandbox for the web that is a URL and website scanner - https://urlscan.io 76 | 77 | * vhost: Bing virtual hosts search 78 | 79 | * virustotal: virustotal.com domain search 80 | 81 | * yahoo: Yahoo search engine 82 | 83 | * zoomeye: Chinese version of Shodan - https://www.zoomeye.org 84 | 85 | 86 | Active: 87 | ------- 88 | * DNS brute force: dictionary brute force enumeration 89 | * Screenshots: Take screenshots of subdomains that were found 90 | 91 | Modules that require an API key: 92 | -------------------------------- 93 | Documentation to set up API keys can be found at - https://github.com/laramies/theHarvester/wiki/Installation#api-keys 94 | 95 | * bevigil - Free up to 50 queries. Pricing can be found here: https://bevigil.com/pricing/osint 96 | * binaryedge - $10/month 97 | * bing 98 | * censys - API keys are required and can be retrieved from your [Censys account](https://search.censys.io/account/api). 99 | * fullhunt 100 | * github 101 | * hunter - limited to 10 results on the free plan, so you will need to use the -l 10 switch 102 | * intelx 103 | * pentesttools - $ 104 | * projectdiscovery - invite only for now 105 | * rocketreach - $ 106 | * securityTrails 107 | * shodan - $ 108 | * zoomeye 109 | 110 | Install and dependencies: 111 | ------------------------- 112 | * Python 3.7+ 113 | * https://github.com/laramies/theHarvester/wiki/Installation 114 | 115 | 116 | Comments, bugs, and requests: 117 | ----------------------------- 118 | * [![Twitter Follow](https://img.shields.io/twitter/follow/laramies.svg?style=social&label=Follow)](https://twitter.com/laramies) Christian Martorella @laramies 119 | cmartorella@edge-security.com 120 | * [![Twitter Follow](https://img.shields.io/twitter/follow/NotoriousRebel1.svg?style=social&label=Follow)](https://twitter.com/NotoriousRebel1) Matthew Brown @NotoriousRebel1 121 | * [![Twitter Follow](https://img.shields.io/twitter/follow/jay_townsend1.svg?style=social&label=Follow)](https://twitter.com/jay_townsend1) Jay "L1ghtn1ng" Townsend @jay_townsend1 122 | 123 | 124 | Main contributors: 125 | ------------------ 126 | * [![Twitter Follow](https://img.shields.io/twitter/follow/NotoriousRebel1.svg?style=social&label=Follow)](https://twitter.com/NotoriousRebel1) Matthew Brown @NotoriousRebel1 127 | * [![Twitter Follow](https://img.shields.io/twitter/follow/jay_townsend1.svg?style=social&label=Follow)](https://twitter.com/jay_townsend1) Jay "L1ghtn1ng" Townsend @jay_townsend1 128 | * [![Twitter Follow](https://img.shields.io/twitter/follow/discoverscripts.svg?style=social&label=Follow)](https://twitter.com/discoverscripts) Lee Baird @discoverscripts 129 | 130 | 131 | Thanks: 132 | ------- 133 | * John Matherly - Shodan project 134 | * Ahmed Aboul Ela - subdomain names dictionaries (big and small) 135 | -------------------------------------------------------------------------------- /README/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to theHarvester Project
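The Unit Tests section below asks every new module to ship with a pytest test; as a point of reference, a minimal test modelled on the existing tests in `tests/discovery/` (for example `test_sublist3r.py`) could look like the sketch below. `newmodule` and `SearchNewModule` are placeholders for the module being contributed, not names that exist in this repository.

```python
# Hypothetical unit test for a new discovery module, following the shape of
# the existing tests in tests/discovery/ (see test_sublist3r.py).
# "newmodule" and "SearchNewModule" are placeholders for the contribution.
import pytest

from theHarvester.discovery import newmodule  # placeholder import


class TestNewModule:
    @staticmethod
    def domain() -> str:
        return 'example.com'

    @pytest.mark.asyncio
    async def test_do_search(self) -> None:
        search = newmodule.SearchNewModule(TestNewModule.domain())
        await search.process()
        assert isinstance(await search.get_hostnames(), list)
```

Under the project's pytest configuration (`--asyncio-mode=auto`), the `pytest.mark.asyncio` marker is optional, but keeping it makes the test usable with stricter asyncio modes as well.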
2 | Welcome to theHarvester project, so you would like to contribute. 3 | The following below must be met to get accepted. 4 | 5 | # CI 6 | Make sure all CI passes and you do not introduce any alerts from lgtm. 7 | 8 | # Unit Tests 9 | For new modules a unit test for that module is required and we use pytest. 10 | 11 | # Coding Standards 12 | * No single letter variables and variable names must represent the action that it is performing 13 | * Have static typing on functions etc 14 | * Make sure no errors are reported from mypy 15 | * No issues reported with flake8 16 | 17 | # Submitting Bugs 18 | If you find a bug in a module that you want to submit an issue for and know how to write python code. 19 | Please create a unit test for that bug(If possible) and submit a fix for it as it would be a big help to the project. 20 | -------------------------------------------------------------------------------- /README/COPYING: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc. 5 | 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Library General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. 
If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 
102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. 
However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 
214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | -------------------------------------------------------------------------------- /README/LICENSES: -------------------------------------------------------------------------------- 1 | Released under the GPL v 2.0. 2 | 3 | If you did not receive a copy of the GPL, try http://www.gnu.org/. 4 | 5 | Copyright 2011 Christian Martorella 6 | 7 | theHarvester is free software; you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation version 2 of the License. 10 | 11 | theHarvester is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 | -------------------------------------------------------------------------------- /api-keys.yaml: -------------------------------------------------------------------------------- 1 | apikeys: 2 | bevigil: 3 | key: 4 | 5 | binaryedge: 6 | key: 7 | 8 | bing: 9 | key: 10 | 11 | censys: 12 | id: 13 | secret: 14 | 15 | fullhunt: 16 | key: 17 | 18 | github: 19 | key: 20 | 21 | hunter: 22 | key: 23 | 24 | intelx: 25 | key: 26 | 27 | pentestTools: 28 | key: 29 | 30 | projectDiscovery: 31 | key: 32 | 33 | rocketreach: 34 | key: 35 | 36 | securityTrails: 37 | key: 38 | 39 | shodan: 40 | key: 41 | 42 | virustotal: 43 | key: 44 | 45 | zoomeye: 46 | key: 47 | -------------------------------------------------------------------------------- /bin/restfulHarvest: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import uvicorn 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('-H', '--host', default='127.0.0.1', help='IP address to listen on default is 127.0.0.1') 7 | parser.add_argument('-p', '--port', default=5000, help='Port to bind the web server to, default is 5000', type=int) 8 | parser.add_argument('-l', '--log-level', default='info', help='Set logging level, default is info but [critical|error|warning|info|debug|trace] can be set') 9 | parser.add_argument('-r', '--reload', default=False, help='Enable automatic reload used during development of the api', action='store_true') 10 | 11 | args = parser.parse_args() 12 | 13 | if __name__ == '__main__': 14 | uvicorn.run('theHarvester.lib.api.api:app', host=args.host, port=args.port, log_level=args.log_level, reload=args.reload) 15 | -------------------------------------------------------------------------------- /bin/theHarvester: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Note: This script runs theHarvester 3 | import sys 4 | import 
asyncio 5 | from theHarvester import __main__ 6 | 7 | if sys.version_info.major < 3 or sys.version_info.minor < 7: 8 | print('\033[93m[!] Make sure you have Python 3.7+ installed, quitting.\n\n \033[0m') 9 | sys.exit(1) 10 | 11 | if __name__ == '__main__': 12 | platform = sys.platform 13 | if platform == 'win32': 14 | # Required or things will break if trying to take screenshots 15 | import multiprocessing 16 | 17 | multiprocessing.freeze_support() 18 | asyncio.DefaultEventLoopPolicy = asyncio.WindowsSelectorEventLoopPolicy 19 | else: 20 | import uvloop 21 | uvloop.install() 22 | 23 | if "linux" in platform: 24 | import aiomultiprocess 25 | 26 | # As we are not using Windows we can change the spawn method to fork for greater performance 27 | aiomultiprocess.set_context("fork") 28 | asyncio.run(__main__.entry_point()) 29 | -------------------------------------------------------------------------------- /debian/changelog: -------------------------------------------------------------------------------- 1 | theharvester (4.2.0-0parrot3) parrot-updates; urgency=medium 2 | 3 | * Update package dependencies. 4 | * Update helper script. 5 | 6 | -- Lorenzo "Palinuro" Faletra Wed, 21 Dec 2022 15:11:49 +0100 7 | 8 | theharvester (4.2.0-0parrot2) parrot-updates; urgency=medium 9 | 10 | * Rebuild package. 11 | 12 | -- Lorenzo "Palinuro" Faletra Wed, 21 Dec 2022 15:06:12 +0100 13 | 14 | theharvester (4.2.0-0parrot1) parrot-updates; urgency=medium 15 | 16 | * Import new Upstream release. 17 | 18 | -- Lorenzo "Palinuro" Faletra Wed, 21 Dec 2022 12:59:39 +0100 19 | 20 | theharvester (3.2.3-parrot0) rolling-testing; urgency=medium 21 | 22 | * Remove Kali ci scripts 23 | * Init Parrot team info 24 | * Remove old command 25 | * Add launcher 26 | * Edit launcher command 27 | 28 | -- Nong Hoang Tu Thu, 04 Mar 2021 00:39:41 +0700 29 | 30 | theharvester (3.2.3-0kali1) kali-dev; urgency=medium 31 | 32 | * New upstream version 3.2.3 33 | * Add Restrictions: superficial to autopkgtest 34 | 35 | -- Sophie Brun Mon, 08 Feb 2021 11:45:55 +0100 36 | 37 | theharvester (3.2.2-0kali2) kali-dev; urgency=medium 38 | 39 | * Fix installation of the wordlists 40 | 41 | -- Sophie Brun Thu, 07 Jan 2021 10:47:15 +0100 42 | 43 | theharvester (3.2.2-0kali1) kali-dev; urgency=medium 44 | 45 | [ Ben Wilson ] 46 | * Fix email address 47 | 48 | [ Sophie Brun ] 49 | * New upstream version 3.2.2 50 | 51 | -- Sophie Brun Thu, 17 Dec 2020 10:08:13 +0100 52 | 53 | theharvester (3.2.0-0kali1) kali-dev; urgency=medium 54 | 55 | * New upstream version 3.2.0 56 | * Remove merged patches 57 | * Update build-deps and deps 58 | * Update installation to use usptream setup.py 59 | * Add lintian-overrides for breakout-link 60 | 61 | -- Sophie Brun Fri, 11 Sep 2020 09:30:08 +0200 62 | 63 | theharvester (3.1-0kali4) kali-dev; urgency=medium 64 | 65 | * Fix for issue 6450: 66 | - Add a link to the wordlists 67 | - Use an helper-script to change the run directory 68 | - Add a patch to change directory of sqlite db 69 | * Bump Standards-Version to 4.5.0 70 | 71 | -- Sophie Brun Wed, 08 Jul 2020 12:06:35 +0200 72 | 73 | theharvester (3.1-0kali3) kali-dev; urgency=medium 74 | 75 | [ Sven Höper ] 76 | * Add missing depends: python3-yaml 77 | * Packaging: Fix test command 78 | 79 | [ Sophie Brun ] 80 | * Add a script to mention that theharvester command is deprecated 81 | 82 | -- Sophie Brun Wed, 18 Dec 2019 08:44:38 +0100 83 | 84 | theharvester (3.1-0kali2) kali-dev; urgency=medium 85 | 86 | * Add missing depends: python3-dnspython 87 | 88 | -- Sophie Brun 
Tue, 15 Oct 2019 18:20:09 +0200 89 | 90 | theharvester (3.1-0kali1) kali-dev; urgency=medium 91 | 92 | [ Raphaël Hertzog ] 93 | * Update Vcs-* fields for the move to gitlab.com 94 | * Add GitLab's CI configuration file 95 | * Configure git-buildpackage for Kali 96 | * Update URL in GitLab's CI configuration file 97 | 98 | [ g0tmi1k ] 99 | * New format 100 | 101 | [ Sophie Brun ] 102 | * Update debian/watch 103 | * New upstream version 3.1 104 | * Remove obsolete patches 105 | * Use debhelper-compat 12 106 | * Update packaging to use setup.py 107 | * Bump Standards-Version to 4.4.1 108 | 109 | -- Sophie Brun Tue, 15 Oct 2019 08:59:27 +0200 110 | 111 | theharvester (3.0.6-0kali1) kali-dev; urgency=medium 112 | 113 | * New upstream version 3.0.6 114 | 115 | -- Sophie Brun Thu, 20 Dec 2018 09:27:33 +0100 116 | 117 | theharvester (3.0.5-0kali1) kali-dev; urgency=medium 118 | 119 | * New upstream version 3.0.5 120 | * Add dependency: python3-plotly 121 | * Refresh patch 122 | * Add minimal autopkgtest 123 | 124 | -- Sophie Brun Wed, 19 Dec 2018 11:00:29 +0100 125 | 126 | theharvester (3.0.4-0kali1) kali-dev; urgency=medium 127 | 128 | * New upstream version 3.0.4 129 | * Switch to Python 3 130 | * Add a minimal required version of wfuzz: this is the first version in 131 | Python 3 132 | 133 | -- Sophie Brun Thu, 13 Dec 2018 11:11:59 +0100 134 | 135 | theharvester (3.0.1-0kali1) kali-dev; urgency=medium 136 | 137 | * New upstream version 3.0.1 138 | * Bump Standards-Version to 4.2.1 139 | * Update debian/copyright 140 | * Add missing dependency: python-bs4 141 | * Refresh patch 142 | 143 | -- Sophie Brun Thu, 29 Nov 2018 14:40:11 +0100 144 | 145 | theharvester (3.0-0kali1) kali-dev; urgency=medium 146 | 147 | * Upstream update 148 | 149 | -- Ben Wilson Tue, 09 Oct 2018 12:19:07 +0100 150 | 151 | theharvester (2.7.2~20180322-0kali1) kali-dev; urgency=medium 152 | 153 | * Import new upstream version (Closes: 0004685) 154 | * Bump Standards-Version and use debhelper 11 155 | * Update debian/control and debian/theharvester.install 156 | * Refresh patches 157 | * Add wfuzz as dependency 158 | 159 | -- Sophie Brun Thu, 19 Apr 2018 09:11:32 +0200 160 | 161 | theharvester (2.7-0kali1) kali-dev; urgency=medium 162 | 163 | * Import new upstream release 164 | 165 | -- Sophie Brun Tue, 19 Apr 2016 09:32:54 +0200 166 | 167 | theharvester (2.6-0kali1) kali-dev; urgency=medium 168 | 169 | * Update watch file 170 | 171 | -- Sophie Brun Wed, 27 Jan 2016 10:26:13 +0100 172 | 173 | theharvester (2.6-0kali0) kali; urgency=low 174 | 175 | * Imported new upstream release (Closes: 0002291) 176 | 177 | -- Devon Kearns Tue, 26 May 2015 12:37:58 -0600 178 | 179 | theharvester (2.5+git20150109-0kali0) kali; urgency=medium 180 | 181 | * Imported new upstream release (Closes: 0001961) 182 | 183 | -- Devon Kearns Fri, 09 Jan 2015 12:24:36 -0700 184 | 185 | theharvester (2.2a-1kali2) kali; urgency=low 186 | 187 | * Patched usage output (Closes: 0001251) 188 | 189 | -- Devon Kearns Tue, 20 May 2014 10:49:13 -0600 190 | 191 | theharvester (2.2a-1kali1) kali; urgency=low 192 | 193 | * Updated watch file 194 | 195 | -- Mati Aharoni Sun, 12 Jan 2014 15:40:59 -0500 196 | 197 | theharvester (2.2a-1kali0) kali; urgency=low 198 | 199 | * New upstream version 200 | 201 | -- Devon Kearns Sat, 09 Feb 2013 14:47:29 -0700 202 | 203 | theharvester (2.2-1kali3) kali; urgency=low 204 | 205 | * Cleaned up debian files 206 | 207 | -- balding_parrot Tue, 18 Dec 2012 22:49:17 +0000 208 | 209 | theharvester (2.2-1kali2) kali; urgency=low 210 | 211 
| * Removed desktop file 212 | 213 | -- balding_parrot Tue, 18 Dec 2012 22:32:47 +0000 214 | 215 | theharvester (2.2-1kali1) kali; urgency=low 216 | 217 | * Initial release 218 | 219 | -- balding_parrot Tue, 18 Dec 2012 08:21:47 +0000 220 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: theharvester 2 | Section: utils 3 | Priority: optional 4 | Maintainer: Parrot Dev Team 5 | Uploaders: Lorenzo "Palinuro" Faletra 6 | Build-Depends: debhelper-compat (= 12), 7 | dh-python, 8 | python3-aiohttp, 9 | python3-all, 10 | python3-certifi, 11 | python3-requests, 12 | python3-setuptools, 13 | python3-yaml 14 | Standards-Version: 4.6.1 15 | Homepage: https://github.com/laramies/theHarvester 16 | 17 | Package: theharvester 18 | Architecture: all 19 | Depends: python3, 20 | python3-aiodns (>= 2.0.0), 21 | python3-aiohttp (>= 3.6.2), 22 | python3-aiofiles, 23 | python3-aiomultiprocess (>= 0.8.0), 24 | python3-aiosqlite (>= 0.15.0), 25 | python3-bs4 (>= 4.9.1), 26 | python3-censys (>= 2.1.7), 27 | python3-certifi (>= 2022.6.15), 28 | python3-dnspython (>= 2.0.0), 29 | # python3-fastapi: upstream went from 0.70.0 to 0.79.0 30 | # packaged in Debian, rev deps: theharvester, witnessme 31 | python3-fastapi (>= 0.74.0), 32 | python3-lxml (>= 4.5.2), 33 | python3-netaddr (>= 0.7.19), 34 | python3-ujson, 35 | python3-pyppeteer (>= 1.0.2), 36 | python3-requests (>= 2.23.0), 37 | python3-retrying (>= 1.3.3), 38 | python3-shodan (>= 1.23.0), 39 | python3-slowapi, 40 | python3-starlette, 41 | python3-texttable (>= 1.6.2), 42 | python3-uvicorn, 43 | python3-uvloop (>= 0.14.0), 44 | python3-yaml (>= 5.3.1), 45 | ${misc:Depends}, 46 | ${python3:Depends}, 47 | Description: tool for gathering e-mail accounts and subdomain names from public sources 48 | The package contains a tool for gathering subdomain names, e-mail addresses, 49 | virtual hosts, open ports/ banners, and employee names from different public 50 | sources (search engines, pgp key servers). 51 | -------------------------------------------------------------------------------- /debian/copyright: -------------------------------------------------------------------------------- 1 | Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ 2 | Upstream-Name: theharvester 3 | Source: https://github.com/laramies/theHarvester 4 | 5 | Files: * 6 | Copyright: 2011 Christian Martorella 7 | License: GPL-2 8 | This package is free software; you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License version 2 as published by 10 | the Free Software Foundation. 11 | . 12 | This package is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | . 17 | You should have received a copy of the GNU General Public License 18 | along with this program. If not, see 19 | . 20 | On Debian systems, the complete text of the GNU General 21 | Public License version 2 can be found in "/usr/share/common-licenses/GPL-2". 
22 | 23 | Files: debian/* 24 | Copyright: 2012 balding_parrot 25 | 2018 Sophie Brun 26 | License: GPL-2+ 27 | This package is free software; you can redistribute it and/or modify 28 | it under the terms of the GNU General Public License as published by 29 | the Free Software Foundation; either version 2 of the License, or 30 | (at your option) any later version. 31 | . 32 | This package is distributed in the hope that it will be useful, 33 | but WITHOUT ANY WARRANTY; without even the implied warranty of 34 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 35 | GNU General Public License for more details. 36 | . 37 | You should have received a copy of the GNU General Public License 38 | along with this program. If not, see 39 | . 40 | On Debian systems, the complete text of the GNU General 41 | Public License version 2 can be found in "/usr/share/common-licenses/GPL-2". 42 | -------------------------------------------------------------------------------- /debian/dirs: -------------------------------------------------------------------------------- 1 | usr/bin 2 | -------------------------------------------------------------------------------- /debian/docs: -------------------------------------------------------------------------------- 1 | README.md 2 | -------------------------------------------------------------------------------- /debian/gbp.conf: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | pristine-tar = True 3 | 4 | [pq] 5 | patch-numbers = False 6 | 7 | [dch] 8 | multimaint-merge = True 9 | -------------------------------------------------------------------------------- /debian/helper-script/theharvester: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | set -e 4 | 5 | echo -e "this command is deprecated, use theHarvester instead" 6 | /usr/bin/theHarvester $@ 7 | -------------------------------------------------------------------------------- /debian/patches/Disable-a-failing-test-unstable-site.patch: -------------------------------------------------------------------------------- 1 | From: Sophie Brun 2 | Date: Tue, 30 Aug 2022 15:37:52 +0200 3 | Subject: Disable a failing test (unstable site) 4 | 5 | --- 6 | tests/discovery/test_sublist3r.py | 9 +++++---- 7 | 1 file changed, 5 insertions(+), 4 deletions(-) 8 | 9 | diff --git a/tests/discovery/test_sublist3r.py b/tests/discovery/test_sublist3r.py 10 | index 374095e..1d5fdd1 100644 11 | --- a/tests/discovery/test_sublist3r.py 12 | +++ b/tests/discovery/test_sublist3r.py 13 | @@ -21,10 +21,11 @@ async def test_api(self): 14 | request = requests.get(base_url, headers=headers) 15 | assert request.status_code == 200 16 | 17 | - async def test_do_search(self): 18 | - search = sublist3r.SearchSublist3r(TestSublist3r.domain()) 19 | - await search.process() 20 | - assert isinstance(await search.get_hostnames(), list) 21 | +# disable as it fails (unstable site?) 
22 | +# async def test_do_search(self): 23 | +# search = sublist3r.SearchSublist3r(TestSublist3r.domain()) 24 | +# await search.process() 25 | +# assert isinstance(await search.get_hostnames(), list) 26 | 27 | 28 | if __name__ == '__main__': 29 | -------------------------------------------------------------------------------- /debian/patches/Improve-data-installation.patch: -------------------------------------------------------------------------------- 1 | From: Sophie Brun 2 | Date: Thu, 7 Jan 2021 10:19:09 +0100 3 | Subject: Improve data installation 4 | 5 | Bug-Kali: https://gitlab.com/kalilinux/packages/theharvester/-/issues/6 6 | 7 | By default the wordlists were installed directly in /etc/theHarvester 8 | instead of /etc/theHarvester/wordlists 9 | --- 10 | setup.py | 12 ++++++++---- 11 | 1 file changed, 8 insertions(+), 4 deletions(-) 12 | 13 | diff --git a/setup.py b/setup.py 14 | index 128bd89..34a3ef0 100755 15 | --- a/setup.py 16 | +++ b/setup.py 17 | @@ -26,13 +26,17 @@ 18 | ], 19 | data_files=[ 20 | ('/etc/theHarvester', [ 21 | - 'wordlists/general/common.txt', 22 | + 'api-keys.yaml', 23 | + 'proxies.yaml' 24 | + ]), 25 | + ('/etc/theHarvester/wordlists', [ 26 | 'wordlists/dns-big.txt', 27 | 'wordlists/dns-names.txt', 28 | 'wordlists/dorks.txt', 29 | - 'wordlists/names_small.txt', 30 | - 'api-keys.yaml', 31 | - 'proxies.yaml' 32 | + 'wordlists/names_small.txt' 33 | + ]), 34 | + ('/etc/theHarvester/wordlists/general', [ 35 | + 'wordlists/general/common.txt' 36 | ] 37 | ) 38 | ], 39 | -------------------------------------------------------------------------------- /debian/patches/series: -------------------------------------------------------------------------------- 1 | Improve-data-installation.patch 2 | Disable-a-failing-test-unstable-site.patch 3 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | 3 | # output every command that modifies files on the build system. 
4 | #export DH_VERBOSE = 1 5 | 6 | %: 7 | dh $@ --with python3 --buildsystem=pybuild 8 | 9 | override_dh_auto_test: 10 | # do not run tests during the build: most of the tests require 11 | # network 12 | -------------------------------------------------------------------------------- /debian/source/format: -------------------------------------------------------------------------------- 1 | 3.0 (quilt) 2 | -------------------------------------------------------------------------------- /debian/tests/control: -------------------------------------------------------------------------------- 1 | Test-Command: python3 -m pytest tests 2 | Depends: @, python3-pytest, python3-pytest-asyncio 3 | 4 | Test-Command: theHarvester -h 5 | Restrictions: superficial, allow-stderr 6 | -------------------------------------------------------------------------------- /debian/theharvester.install: -------------------------------------------------------------------------------- 1 | debian/helper-script/* usr/bin 2 | -------------------------------------------------------------------------------- /debian/theharvester.links: -------------------------------------------------------------------------------- 1 | etc/theHarvester/api-keys.yaml usr/lib/python3/dist-packages/theHarvester/api-keys.yaml 2 | etc/theHarvester/wordlists usr/lib/python3/dist-packages/theHarvester/wordlists 3 | -------------------------------------------------------------------------------- /debian/upstream/metadata: -------------------------------------------------------------------------------- 1 | --- 2 | Bug-Database: https://github.com/laramies/theHarvester/issues 3 | Bug-Submit: https://github.com/laramies/theHarvester/issues/new 4 | Repository: https://github.com/laramies/theHarvester.git 5 | Repository-Browse: https://github.com/laramies/theHarvester 6 | -------------------------------------------------------------------------------- /debian/watch: -------------------------------------------------------------------------------- 1 | version=4 2 | opts="filenamemangle=s/.*\/v?V?(.*)\.tar\.gz/theharvester-$1.tar.gz/" \ 3 | https://github.com/laramies/theHarvester/tags .*/v?V?(.*)\.tar\.gz 4 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True 3 | show_traceback = True 4 | show_error_codes = True 5 | namespace_packages = True 6 | -------------------------------------------------------------------------------- /proxies.yaml: -------------------------------------------------------------------------------- 1 | http: 2 | - ip:port 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pytest.ini_options] 2 | minversion = "7.1" 3 | addopts = "--no-header --asyncio-mode=auto" 4 | testpaths = [ 5 | "tests", 6 | "tests/discovery/", 7 | ] -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | minversion = 7.1.1 3 | testpaths = tests 4 | asyncio_mode=auto -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements/base.txt 2 | 
-------------------------------------------------------------------------------- /requirements/base.txt: -------------------------------------------------------------------------------- 1 | aiodns==3.0.0 2 | aiofiles==0.8.0 3 | aiohttp==3.8.1 4 | aiomultiprocess==0.9.0 5 | aiosqlite==0.17.0 6 | beautifulsoup4==4.11.1 7 | censys==2.1.7 8 | certifi==2022.6.15 9 | dnspython==2.2.1 10 | fastapi==0.79.0 11 | lxml==4.9.1 12 | netaddr==0.8.0 13 | ujson==5.4.0 14 | pyppeteer==1.0.2 15 | PyYAML==6.0 16 | requests==2.28.1 17 | retrying==1.3.3 18 | setuptools==64.0.3 19 | shodan==1.28.0 20 | slowapi==0.1.5 21 | uvicorn==0.18.2 22 | uvloop==0.16.0; platform_system != "Windows" -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | -r base.txt 2 | flake8==5.0.4 3 | mypy==0.971 4 | mypy-extensions==0.4.3 5 | pyflakes==2.5.0 6 | pytest==7.1.2 7 | pytest-asyncio==0.19.0 8 | types-certifi==2021.10.8.3 9 | types-chardet==5.0.4 10 | types-ujson==5.4.0 11 | types-PyYAML==6.0.11 12 | types-requests==2.28.8 13 | wheel==0.37.1 -------------------------------------------------------------------------------- /restfulHarvest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import uvicorn 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('-H', '--host', default='127.0.0.1', help='IP address to listen on default is 127.0.0.1') 7 | parser.add_argument('-p', '--port', default=5000, help='Port to bind the web server to, default is 5000', type=int) 8 | parser.add_argument('-l', '--log-level', default='info', help='Set logging level, default is info but [critical|error|warning|info|debug|trace] can be set') 9 | parser.add_argument('-r', '--reload', default=False, help='Enable automatic reload used during development of the api', action='store_true') 10 | 11 | args = parser.parse_args() 12 | 13 | if __name__ == '__main__': 14 | uvicorn.run('theHarvester.lib.api.api:app', host=args.host, port=args.port, log_level=args.log_level, reload=args.reload) 15 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501, F405, F403, E402, F401 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from theHarvester.lib.core import Core 3 | 4 | with open('README.md', 'r') as fh: 5 | long_description = fh.read() 6 | 7 | setup( 8 | name='theHarvester', 9 | version=Core.version(), 10 | author="Christian Martorella", 11 | author_email="cmartorella@edge-security.com", 12 | description="theHarvester is a very simple, yet effective tool designed to be used in the early stages of a penetration test", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | url="https://github.com/laramies/theHarvester", 16 | packages=find_packages(exclude=['tests']), 17 | python_requires='>=3.7', 18 | scripts=['bin/theHarvester', 19 | 'bin/restfulHarvest'], 20 | 21 | classifiers=[ 22 | "Programming Language :: Python :: 3", 23 | "Programming Language :: Python :: 3.8", 24 | "Programming Language :: Python :: 3.9", 25 | "Programming Language :: Python :: 3.10", 26 | "License 
:: OSI Approved :: GNU General Public License v2 (GPLv2)", 27 | "Operating System :: OS Independent", 28 | ], 29 | data_files=[ 30 | ('/etc/theHarvester', [ 31 | 'wordlists/general/common.txt', 32 | 'wordlists/dns-big.txt', 33 | 'wordlists/dns-names.txt', 34 | 'wordlists/dorks.txt', 35 | 'wordlists/names_small.txt', 36 | 'api-keys.yaml', 37 | 'proxies.yaml' 38 | ] 39 | ) 40 | ], 41 | ) 42 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/tests/__init__.py -------------------------------------------------------------------------------- /tests/discovery/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/tests/discovery/__init__.py -------------------------------------------------------------------------------- /tests/discovery/test_anubis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | import requests 4 | from theHarvester.lib.core import * 5 | from theHarvester.discovery import anubis 6 | import os 7 | import pytest 8 | 9 | pytestmark = pytest.mark.asyncio 10 | github_ci = os.getenv('GITHUB_ACTIONS') # Github set this to be the following: true instead of True 11 | 12 | 13 | class TestAnubis: 14 | @staticmethod 15 | def domain() -> str: 16 | return 'apple.com' 17 | 18 | async def test_api(self): 19 | base_url = f'https://jldc.me/anubis/subdomains/{TestAnubis.domain()}' 20 | headers = {'User-Agent': Core.get_user_agent()} 21 | request = requests.get(base_url, headers=headers) 22 | assert request.status_code == 200 23 | 24 | async def test_do_search(self): 25 | search = anubis.SearchAnubis(word=TestAnubis.domain()) 26 | await search.do_search() 27 | return await search.get_hostnames() 28 | 29 | async def test_process(self): 30 | await self.test_do_search() 31 | assert len(await self.test_do_search()) > 0 32 | -------------------------------------------------------------------------------- /tests/discovery/test_certspotter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | from theHarvester.lib.core import * 4 | from theHarvester.discovery import certspottersearch 5 | import os 6 | import requests 7 | import pytest 8 | 9 | pytestmark = pytest.mark.asyncio 10 | github_ci = os.getenv('GITHUB_ACTIONS') # Github set this to be the following: true instead of True 11 | 12 | 13 | class TestCertspotter(object): 14 | @staticmethod 15 | def domain() -> str: 16 | return 'metasploit.com' 17 | 18 | async def test_api(self): 19 | base_url = f'https://api.certspotter.com/v1/issuances?domain={TestCertspotter.domain()}&expand=dns_names' 20 | headers = {'User-Agent': Core.get_user_agent()} 21 | request = requests.get(base_url, headers=headers) 22 | assert request.status_code == 200 23 | 24 | async def test_search(self): 25 | search = certspottersearch.SearchCertspoter(TestCertspotter.domain()) 26 | await search.process() 27 | assert isinstance(await search.get_hostnames(), set) 28 | 29 | async def test_search_no_results(self): 30 | search = certspottersearch.SearchCertspoter('radiant.eu') 31 | await search.process() 32 | assert len(await search.get_hostnames()) == 0 33 | 34 | 35 
| if __name__ == '__main__': 36 | pytest.main() 37 | -------------------------------------------------------------------------------- /tests/discovery/test_githubcode.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery import githubcode 2 | from theHarvester.discovery.constants import MissingKey 3 | from theHarvester.lib.core import Core 4 | from unittest.mock import MagicMock 5 | from requests import Response 6 | import pytest 7 | 8 | pytestmark = pytest.mark.asyncio 9 | 10 | 11 | class TestSearchGithubCode: 12 | 13 | class OkResponse: 14 | response = Response() 15 | json = { 16 | "items": [ 17 | { 18 | "text_matches": [ 19 | { 20 | "fragment": "test1" 21 | } 22 | ] 23 | }, 24 | { 25 | "text_matches": [ 26 | { 27 | "fragment": "test2" 28 | } 29 | ] 30 | } 31 | ] 32 | } 33 | response.status_code = 200 34 | response.json = MagicMock(return_value=json) 35 | 36 | class FailureResponse: 37 | response = Response() 38 | response.json = MagicMock(return_value={}) 39 | response.status_code = 401 40 | 41 | class RetryResponse: 42 | response = Response() 43 | response.json = MagicMock(return_value={}) 44 | response.status_code = 403 45 | 46 | class MalformedResponse: 47 | response = Response() 48 | json = { 49 | "items": [ 50 | { 51 | "fail": True 52 | }, 53 | { 54 | "text_matches": [] 55 | }, 56 | { 57 | "text_matches": [ 58 | { 59 | "weird": "result" 60 | } 61 | ] 62 | } 63 | ] 64 | } 65 | response.json = MagicMock(return_value=json) 66 | response.status_code = 200 67 | 68 | async def test_missing_key(self): 69 | with pytest.raises(MissingKey): 70 | Core.github_key = MagicMock(return_value=None) 71 | githubcode.SearchGithubCode(word="test", limit=500) 72 | 73 | async def test_fragments_from_response(self): 74 | Core.github_key = MagicMock(return_value="lol") 75 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 76 | test_result = await test_class_instance.fragments_from_response(self.OkResponse.response.json()) 77 | print('test_result: ', test_result) 78 | assert test_result == ["test1", "test2"] 79 | 80 | async def test_invalid_fragments_from_response(self): 81 | Core.github_key = MagicMock(return_value="lol") 82 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 83 | test_result = await test_class_instance.fragments_from_response(self.MalformedResponse.response.json()) 84 | assert test_result == [] 85 | 86 | async def test_next_page(self): 87 | Core.github_key = MagicMock(return_value="lol") 88 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 89 | test_result = githubcode.SuccessResult(list(), next_page=2, last_page=4) 90 | assert (2 == await test_class_instance.next_page_or_end(test_result)) 91 | 92 | async def test_last_page(self): 93 | Core.github_key = MagicMock(return_value="lol") 94 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 95 | test_result = githubcode.SuccessResult(list(), None, None) 96 | assert (None is await test_class_instance.next_page_or_end(test_result)) 97 | 98 | if __name__ == '__main__': 99 | pytest.main() 100 | -------------------------------------------------------------------------------- /tests/discovery/test_omnisint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | from theHarvester.lib.core import * 4 | from theHarvester.discovery import omnisint 5 | import os 6 | import requests 7 | import pytest 8 | 9 | 
pytestmark = pytest.mark.asyncio 10 | github_ci = os.getenv('GITHUB_ACTIONS') # Github set this to be the following: true instead of True 11 | 12 | 13 | class TestOmnisint(object): 14 | @staticmethod 15 | def domain() -> str: 16 | return 'uber.com' 17 | 18 | @pytest.mark.skipif(github_ci == 'true', reason='Skipping on Github CI due to unstable status code from site') 19 | async def test_api(self): 20 | base_url = f'https://sonar.omnisint.io/all/{TestOmnisint.domain()}' 21 | headers = {'User-Agent': Core.get_user_agent()} 22 | request = requests.get(base_url, headers=headers) 23 | assert request.status_code == 200 24 | 25 | async def test_search(self): 26 | search = omnisint.SearchOmnisint(TestOmnisint.domain()) 27 | await search.process() 28 | assert isinstance(await search.get_hostnames(), list) 29 | 30 | 31 | if __name__ == '__main__': 32 | pytest.main() 33 | -------------------------------------------------------------------------------- /tests/discovery/test_otx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | from theHarvester.lib.core import * 4 | from theHarvester.discovery import otxsearch 5 | import os 6 | import requests 7 | import pytest 8 | 9 | pytestmark = pytest.mark.asyncio 10 | github_ci = os.getenv('GITHUB_ACTIONS') # Github set this to be the following: true instead of True 11 | 12 | 13 | class TestOtx(object): 14 | @staticmethod 15 | def domain() -> str: 16 | return 'metasploit.com' 17 | 18 | async def test_api(self): 19 | base_url = f'https://otx.alienvault.com/api/v1/indicators/domain/{TestOtx.domain()}/passive_dns' 20 | headers = {'User-Agent': Core.get_user_agent()} 21 | request = requests.get(base_url, headers=headers) 22 | assert request.status_code == 200 23 | 24 | async def test_search(self): 25 | search = otxsearch.SearchOtx(TestOtx.domain()) 26 | await search.process() 27 | assert isinstance(await search.get_hostnames(), set) 28 | assert isinstance(await search.get_ips(), set) 29 | 30 | 31 | if __name__ == '__main__': 32 | pytest.main() 33 | -------------------------------------------------------------------------------- /tests/discovery/test_qwantsearch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | from theHarvester.discovery import qwantsearch 4 | import os 5 | import pytest 6 | 7 | pytestmark = pytest.mark.asyncio 8 | github_ci = os.getenv('GITHUB_ACTIONS') # Github set this to be the following: true instead of True 9 | 10 | 11 | class TestSearchQwant(object): 12 | 13 | @staticmethod 14 | def domain() -> str: 15 | return 'example.com' 16 | 17 | async def test_get_start_offset_return_0(self): 18 | search = qwantsearch.SearchQwant(TestSearchQwant.domain(), 0, 200) 19 | assert search.get_start_offset() == 0 20 | 21 | async def test_get_start_offset_return_50(self): 22 | search = qwantsearch.SearchQwant(TestSearchQwant.domain(), 55, 200) 23 | assert search.get_start_offset() == 50 24 | 25 | async def test_get_start_offset_return_100(self): 26 | search = qwantsearch.SearchQwant(TestSearchQwant.domain(), 100, 200) 27 | assert search.get_start_offset() == 100 28 | 29 | async def test_get_emails(self): 30 | search = qwantsearch.SearchQwant(TestSearchQwant.domain(), 0, 200) 31 | await search.process() 32 | assert isinstance(await search.get_emails(), set) 33 | 34 | async def test_get_hostnames(self): 35 | search = qwantsearch.SearchQwant(TestSearchQwant.domain(), 0, 200) 36 | await 
search.process() 37 | assert isinstance(await search.get_hostnames(), list) 38 | -------------------------------------------------------------------------------- /tests/discovery/test_sublist3r.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | import requests 4 | from theHarvester.lib.core import * 5 | from theHarvester.discovery import sublist3r 6 | import os 7 | import pytest 8 | 9 | pytestmark = pytest.mark.asyncio 10 | github_ci = os.getenv('GITHUB_ACTIONS') # Github set this to be the following: true instead of True 11 | 12 | 13 | class TestSublist3r(object): 14 | @staticmethod 15 | def domain() -> str: 16 | return 'target.com' 17 | 18 | async def test_api(self): 19 | base_url = f'https://api.sublist3r.com/search.php?domain={TestSublist3r.domain()}' 20 | headers = {'User-Agent': Core.get_user_agent()} 21 | request = requests.get(base_url, headers=headers) 22 | assert request.status_code == 200 23 | 24 | async def test_do_search(self): 25 | search = sublist3r.SearchSublist3r(TestSublist3r.domain()) 26 | await search.process() 27 | assert isinstance(await search.get_hostnames(), list) 28 | 29 | 30 | if __name__ == '__main__': 31 | pytest.main() 32 | -------------------------------------------------------------------------------- /tests/discovery/test_threatminer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | import requests 4 | from theHarvester.lib.core import * 5 | from theHarvester.discovery import threatminer 6 | import os 7 | import pytest 8 | 9 | pytestmark = pytest.mark.asyncio 10 | github_ci = os.getenv('GITHUB_ACTIONS') # Github set this to be the following: true instead of True 11 | 12 | 13 | class TestThreatminer(object): 14 | @staticmethod 15 | def domain() -> str: 16 | return 'target.com' 17 | 18 | async def test_api(self): 19 | base_url = f'https://api.threatminer.org/v2/domain.php?q={TestThreatminer.domain()}&rt=5' 20 | headers = {'User-Agent': Core.get_user_agent()} 21 | request = requests.get(base_url, headers=headers) 22 | assert request.status_code == 200 23 | 24 | async def test_search(self): 25 | search = threatminer.SearchThreatminer(TestThreatminer.domain()) 26 | await search.process() 27 | assert isinstance(await search.get_hostnames(), set) 28 | assert isinstance(await search.get_ips(), set) 29 | 30 | 31 | if __name__ == '__main__': 32 | pytest.main() 33 | -------------------------------------------------------------------------------- /tests/test_myparser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | 4 | from theHarvester.parsers import myparser 5 | import pytest 6 | 7 | 8 | class TestMyParser(object): 9 | 10 | @pytest.mark.asyncio 11 | async def test_emails(self): 12 | word = 'domain.com' 13 | results = '@domain.com***a@domain***banotherdomain.com***c@domain.com***d@sub.domain.com***' 14 | parse = myparser.Parser(results, word) 15 | emails = sorted(await parse.emails()) 16 | assert emails, ['c@domain.com', 'd@sub.domain.com'] 17 | 18 | 19 | if __name__ == '__main__': 20 | pytest.main() 21 | -------------------------------------------------------------------------------- /theHarvester-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/theHarvester-logo.png 
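The discovery tests above all drive a module the same way: construct it with the target domain, await process(), then read the get_* coroutines. A minimal sketch of that pattern outside pytest, reusing the otxsearch module and the metasploit.com domain from test_otx.py; the domain is only illustrative.

import asyncio
from theHarvester.discovery import otxsearch


async def main() -> None:
    search = otxsearch.SearchOtx('metasploit.com')  # any target domain works here
    await search.process()                          # performs the passive DNS query
    print(sorted(await search.get_hostnames()))     # hostnames found for the domain
    print(sorted(await search.get_ips()))           # IP addresses found for the domain


if __name__ == '__main__':
    asyncio.run(main())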
-------------------------------------------------------------------------------- /theHarvester.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Note: This script runs theHarvester 3 | import sys 4 | import asyncio 5 | from theHarvester import __main__ 6 | 7 | if sys.version_info.major < 3 or sys.version_info.minor < 7: 8 | print('\033[93m[!] Make sure you have Python 3.7+ installed, quitting.\n\n \033[0m') 9 | sys.exit(1) 10 | 11 | if __name__ == '__main__': 12 | platform = sys.platform 13 | if platform == 'win32': 14 | # Required or things will break if trying to take screenshots 15 | import multiprocessing 16 | 17 | multiprocessing.freeze_support() 18 | asyncio.DefaultEventLoopPolicy = asyncio.WindowsSelectorEventLoopPolicy 19 | else: 20 | import uvloop 21 | uvloop.install() 22 | 23 | if "linux" in platform: 24 | import aiomultiprocess 25 | 26 | # As we are not using Windows we can change the spawn method to fork for greater performance 27 | aiomultiprocess.set_context("fork") 28 | asyncio.run(__main__.entry_point()) 29 | -------------------------------------------------------------------------------- /theHarvester/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/theHarvester/__init__.py -------------------------------------------------------------------------------- /theHarvester/discovery/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/theHarvester/discovery/__init__.py -------------------------------------------------------------------------------- /theHarvester/discovery/anubis.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchAnubis: 6 | 7 | def __init__(self, word): 8 | self.word = word 9 | self.totalhosts = list 10 | self.proxy = False 11 | 12 | async def do_search(self): 13 | url = f'https://jldc.me/anubis/subdomains/{self.word}' 14 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 15 | self.totalhosts: list = response[0] 16 | 17 | async def get_hostnames(self) -> Type[list]: 18 | return self.totalhosts 19 | 20 | async def process(self, proxy=False): 21 | self.proxy = proxy 22 | await self.do_search() 23 | -------------------------------------------------------------------------------- /theHarvester/discovery/baidusearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | from theHarvester.parsers import myparser 3 | 4 | 5 | class SearchBaidu: 6 | 7 | def __init__(self, word, limit): 8 | self.word = word 9 | self.total_results = "" 10 | self.server = 'www.baidu.com' 11 | self.hostname = 'www.baidu.com' 12 | self.limit = limit 13 | self.proxy = False 14 | 15 | async def do_search(self): 16 | headers = { 17 | 'Host': self.hostname, 18 | 'User-agent': Core.get_user_agent() 19 | } 20 | base_url = f'https://{self.server}/s?wd=%40{self.word}&pn=xx&oq={self.word}' 21 | urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit] 22 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 23 | for response in responses: 24 | self.total_results += 
response 25 | 26 | async def process(self, proxy=False): 27 | self.proxy = proxy 28 | await self.do_search() 29 | 30 | async def get_emails(self): 31 | rawres = myparser.Parser(self.total_results, self.word) 32 | return await rawres.emails() 33 | 34 | async def get_hostnames(self): 35 | rawres = myparser.Parser(self.total_results, self.word) 36 | return await rawres.hostnames() 37 | -------------------------------------------------------------------------------- /theHarvester/discovery/bevigil.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | 3 | 4 | class SearchBeVigil: 5 | 6 | def __init__(self, word): 7 | self.word = word 8 | self.totalhosts = set() 9 | self.interestingurls = set() 10 | self.key = Core.bevigil_key() 11 | self.proxy = False 12 | 13 | async def do_search(self): 14 | subdomain_endpoint = f"https://osint.bevigil.com/api/{self.word}/subdomains/" 15 | url_endpoint = f"https://osint.bevigil.com/api/{self.word}/urls/" 16 | headers = {'X-Access-Token': self.key} 17 | 18 | responses = await AsyncFetcher.fetch_all([subdomain_endpoint], json=True, proxy=self.proxy, headers=headers) 19 | response = responses[0] 20 | for subdomain in response["subdomains"]: 21 | self.totalhosts.add(subdomain) 22 | 23 | responses = await AsyncFetcher.fetch_all([url_endpoint], json=True, proxy=self.proxy, headers=headers) 24 | response = responses[0] 25 | for url in response["urls"]: 26 | self.interestingurls.add(url) 27 | 28 | async def get_hostnames(self) -> set: 29 | return self.totalhosts 30 | 31 | async def get_interestingurls(self) -> set: 32 | return self.interestingurls 33 | 34 | async def process(self, proxy=False): 35 | self.proxy = proxy 36 | await self.do_search() 37 | -------------------------------------------------------------------------------- /theHarvester/discovery/binaryedgesearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | import asyncio 3 | 4 | 5 | class SearchBinaryEdge: 6 | 7 | def __init__(self, word, limit): 8 | self.word = word 9 | self.totalhosts = set() 10 | self.proxy = False 11 | self.key = Core.binaryedge_key() 12 | self.limit = 501 if limit >= 501 else limit 13 | self.limit = 2 if self.limit == 1 else self.limit 14 | if self.key is None: 15 | raise MissingKey('binaryedge') 16 | 17 | async def do_search(self): 18 | base_url = f'https://api.binaryedge.io/v2/query/domains/subdomain/{self.word}' 19 | headers = {'X-KEY': self.key, 'User-Agent': Core.get_user_agent()} 20 | for page in range(1, self.limit): 21 | params = {'page': page} 22 | response = await AsyncFetcher.fetch_all([base_url], json=True, proxy=self.proxy, params=params, headers=headers) 23 | responses = response[0] 24 | dct = responses 25 | if ('status' in dct.keys() and 'message' in dct.keys()) and \ 26 | (dct['status'] == 400 or 'Bad Parameter' in dct['message'] or 'Error' in dct['message']): 27 | # 400 status code means no more results 28 | break 29 | if 'events' in dct.keys(): 30 | if len(dct['events']) == 0: 31 | break 32 | self.totalhosts.update({host for host in dct['events']}) 33 | await asyncio.sleep(get_delay()) 34 | 35 | async def get_hostnames(self) -> set: 36 | return self.totalhosts 37 | 38 | async def process(self, proxy=False): 39 | self.proxy = proxy 40 | await self.do_search() 41 | -------------------------------------------------------------------------------- /theHarvester/discovery/bingsearch.py: 
-------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | from theHarvester.parsers import myparser 4 | 5 | 6 | class SearchBing: 7 | 8 | def __init__(self, word, limit, start): 9 | self.word = word.replace(' ', '%20') 10 | self.results = "" 11 | self.total_results = "" 12 | self.server = 'www.bing.com' 13 | self.apiserver = 'api.search.live.net' 14 | self.hostname = 'www.bing.com' 15 | self.limit = int(limit) 16 | self.bingApi = Core.bing_key() 17 | self.counter = start 18 | self.proxy = False 19 | 20 | async def do_search(self): 21 | headers = { 22 | 'Host': self.hostname, 23 | 'Cookie': 'SRCHHPGUSR=ADLT=DEMOTE&NRSLT=50', 24 | 'Accept-Language': 'en-us,en', 25 | 'User-agent': Core.get_user_agent() 26 | } 27 | base_url = f'https://{self.server}/search?q=%40"{self.word}"&count=50&first=xx' 28 | urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 50) if num <= self.limit] 29 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 30 | for response in responses: 31 | self.total_results += response 32 | 33 | async def do_search_api(self): 34 | url = 'https://api.cognitive.microsoft.com/bing/v7.0/search?' 35 | params = { 36 | 'q': self.word, 37 | 'count': str(self.limit), 38 | 'offset': '0', 39 | 'mkt': 'en-us', 40 | 'safesearch': 'Off' 41 | } 42 | headers = {'User-Agent': Core.get_user_agent(), 'Ocp-Apim-Subscription-Key': self.bingApi} 43 | self.results = await AsyncFetcher.fetch_all([url], headers=headers, params=params, proxy=self.proxy) 44 | self.total_results += self.results 45 | 46 | async def do_search_vhost(self): 47 | headers = { 48 | 'Host': self.hostname, 49 | 'Cookie': 'mkt=en-US;ui=en-US;SRCHHPGUSR=NEWWND=0&ADLT=DEMOTE&NRSLT=50', 50 | 'Accept-Language': 'en-us,en', 51 | 'User-agent': Core.get_user_agent() 52 | } 53 | base_url = f'http://{self.server}/search?q=ip:{self.word}&go=&count=50&FORM=QBHL&qs=n&first=xx' 54 | urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 50) if num <= self.limit] 55 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 56 | for response in responses: 57 | self.total_results += response 58 | 59 | async def get_emails(self): 60 | rawres = myparser.Parser(self.total_results, self.word) 61 | return await rawres.emails() 62 | 63 | async def get_hostnames(self): 64 | rawres = myparser.Parser(self.total_results, self.word) 65 | return await rawres.hostnames() 66 | 67 | async def get_allhostnames(self): 68 | rawres = myparser.Parser(self.total_results, self.word) 69 | return await rawres.hostnames_all() 70 | 71 | async def process(self, api, proxy=False): 72 | self.proxy = proxy 73 | if api == 'yes': 74 | if self.bingApi is None: 75 | raise MissingKey('BingAPI') 76 | else: 77 | if api == 'yes': 78 | await self.do_search_api() 79 | else: 80 | await self.do_search() 81 | print(f'\tSearching {self.counter} results.') 82 | 83 | async def process_vhost(self): 84 | await self.do_search_vhost() 85 | -------------------------------------------------------------------------------- /theHarvester/discovery/bufferoverun.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | import re 3 | 4 | 5 | class SearchBufferover: 6 | def __init__(self, word): 7 | self.word = word 8 | self.totalhosts = set() 9 | self.totalips = set() 10 | self.proxy = False 11 | 12 | async def do_search(self): 
13 | url = f'https://dns.bufferover.run/dns?q={self.word}' 14 | responses = await AsyncFetcher.fetch_all(urls=[url], json=True, proxy=self.proxy) 15 | responses = responses[0] 16 | dct = responses 17 | 18 | if dct['FDNS_A']: 19 | self.totalhosts: set = { 20 | host.split(',')[0].replace('www.', '') if ',' in host and self.word.replace('www.', '') in host.split(',')[ 21 | 0] in host else 22 | host.split(',')[1] for host in dct['FDNS_A']} 23 | 24 | self.totalips: set = {ip.split(',')[0] for ip in dct['FDNS_A'] if 25 | re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip.split(',')[0])} 26 | 27 | async def get_hostnames(self) -> set: 28 | return self.totalhosts 29 | 30 | async def get_ips(self) -> set: 31 | return self.totalips 32 | 33 | async def process(self, proxy=False): 34 | self.proxy = proxy 35 | await self.do_search() 36 | -------------------------------------------------------------------------------- /theHarvester/discovery/censysearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import MissingKey 2 | from theHarvester.lib.core import Core 3 | from censys.search import CensysCertificates 4 | from censys.common import __version__ 5 | from censys.common.exceptions import ( 6 | CensysRateLimitExceededException, 7 | CensysUnauthorizedException, 8 | ) 9 | 10 | 11 | class SearchCensys: 12 | def __init__(self, domain, limit=500): 13 | self.word = domain 14 | self.key = Core.censys_key() 15 | if self.key[0] is None or self.key[1] is None: 16 | raise MissingKey("Censys ID and/or Secret") 17 | self.totalhosts = set() 18 | self.emails = set() 19 | self.limit = limit 20 | self.proxy = False 21 | 22 | async def do_search(self): 23 | try: 24 | cert_search = CensysCertificates( 25 | api_id=self.key[0], 26 | api_secret=self.key[1], 27 | user_agent=f"censys/{__version__} (theHarvester/{Core.version()}; +https://github.com/laramies/theHarvester)", 28 | ) 29 | except CensysUnauthorizedException: 30 | raise MissingKey('Censys ID and/or Secret') 31 | 32 | query = f"parsed.names: {self.word}" 33 | try: 34 | response = cert_search.search( 35 | query=query, 36 | fields=["parsed.names", "metadata", "parsed.subject.email_address"], 37 | max_records=self.limit, 38 | ) 39 | for cert in response: 40 | self.totalhosts.update(cert.get("parsed.names", [])) 41 | self.emails.update(cert.get("parsed.subject.email_address", [])) 42 | except CensysRateLimitExceededException: 43 | print("Censys rate limit exceeded") 44 | 45 | async def get_hostnames(self) -> set: 46 | return self.totalhosts 47 | 48 | async def get_emails(self) -> set: 49 | return self.emails 50 | 51 | async def process(self, proxy=False): 52 | self.proxy = proxy 53 | await self.do_search() 54 | -------------------------------------------------------------------------------- /theHarvester/discovery/certspottersearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | 3 | 4 | class SearchCertspoter: 5 | 6 | def __init__(self, word): 7 | self.word = word 8 | self.totalhosts = set() 9 | self.proxy = False 10 | 11 | async def do_search(self) -> None: 12 | base_url = f'https://api.certspotter.com/v1/issuances?domain={self.word}&expand=dns_names' 13 | try: 14 | response = await AsyncFetcher.fetch_all([base_url], json=True, proxy=self.proxy) 15 | response = response[0] 16 | if isinstance(response, list): 17 | for dct in response: 18 | for key, value in dct.items(): 19 | if key == 'dns_names': 20 | 
self.totalhosts.update({name for name in value if name}) 21 | elif isinstance(response, dict): 22 | self.totalhosts.update({response['dns_names'] if 'dns_names' in response.keys() else ''}) # type: ignore 23 | else: 24 | self.totalhosts.update({''}) 25 | except Exception as e: 26 | print(e) 27 | 28 | async def get_hostnames(self) -> set: 29 | return self.totalhosts 30 | 31 | async def process(self, proxy=False): 32 | self.proxy = proxy 33 | await self.do_search() 34 | print('\tSearching results.') 35 | -------------------------------------------------------------------------------- /theHarvester/discovery/constants.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | from typing import Union, Optional 3 | import random 4 | 5 | googleUA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 ' \ 6 | 'Safari/537.36 ' 7 | 8 | 9 | async def splitter(links): 10 | """ 11 | Method that tries to remove duplicates 12 | LinkedinLists pulls a lot of profiles with the same name. 13 | This method tries to remove duplicates from the list. 14 | :param links: list of links to remove duplicates from 15 | :return: unique-ish list 16 | """ 17 | unique_list = [] 18 | name_check = [] 19 | for url in links: 20 | tail = url.split("/")[-1] 21 | if len(tail) == 2 or tail == "zh-cn": 22 | tail = url.split("/")[-2] 23 | name = tail.split("-") 24 | if len(name) > 1: 25 | joined_name = name[0] + name[1] 26 | else: 27 | joined_name = name[0] 28 | if joined_name not in name_check: 29 | unique_list.append(url) 30 | name_check.append(joined_name) 31 | return unique_list 32 | 33 | 34 | def filter(lst): 35 | """ 36 | Method that filters list 37 | :param lst: list to be filtered 38 | :return: new filtered list 39 | """ 40 | if lst is None: 41 | return [] 42 | if not isinstance(lst, set): 43 | lst = set(lst) # Remove duplicates. 44 | new_lst = [] 45 | for item in lst: 46 | item = str(item) 47 | if (item[0].isalpha() or item[0].isdigit()) and ('xxx' not in item) and ('..' not in item): 48 | item = item.replace('252f', '').replace('2F', '').replace('2f', '') 49 | new_lst.append(item.lower()) 50 | return new_lst 51 | 52 | 53 | def get_delay() -> float: 54 | """Method that is used to generate a random delay""" 55 | return random.randint(1, 3) - .5 56 | 57 | 58 | async def search(text: str) -> bool: 59 | """Helper function to check if Google has blocked traffic. 
60 | :param text: See if certain text is returned which means Google is blocking us 61 | :return bool: 62 | """ 63 | for line in text.strip().splitlines(): 64 | if 'This page appears when Google automatically detects requests coming from your computer network' in line \ 65 | or 'http://www.google.com/sorry/index' in line or 'https://www.google.com/sorry/index' in line: 66 | # print('\tGoogle is blocking your IP due to too many automated requests, wait or change your IP') 67 | return True 68 | return False 69 | 70 | 71 | async def google_workaround(visit_url: str) -> Union[bool, str]: 72 | """ 73 | Function that makes a request on our behalf, if Google starts to block us 74 | :param visit_url: Url to scrape 75 | :return: Correct html that can be parsed by BS4 76 | """ 77 | url = 'https://websniffer.cc/' 78 | data = { 79 | 'Cookie': '', 80 | 'url': visit_url, 81 | 'submit': 'Submit', 82 | 'type': 'GET&http=1.1', 83 | 'uak': str(random.randint(4, 8)) # select random UA to send to Google 84 | } 85 | returned_html = await AsyncFetcher.post_fetch(url, headers={'User-Agent': Core.get_user_agent()}, data=data) 86 | returned_html = "This page appears when Google automatically detects requests coming from your computer network" \ 87 | if returned_html == "" else returned_html[0] 88 | 89 | returned_html = "" if 'Please Wait... | Cloudflare' in returned_html else returned_html 90 | 91 | if len(returned_html) == 0 or await search(returned_html) or '<html' not in returned_html: 92 | # indicates that google is serving workaround a captcha 93 | # That means we will try out second option which will utilize proxies 94 | return True 95 | # the html we get is malformed for BS4 as there are no greater than or less than signs 96 | if '<html>' in returned_html: 97 | start_index = returned_html.index('<html>') 98 | else: 99 | start_index = returned_html.index('<html') 100 | 101 | end_index = returned_html.index('</html>') + 1 102 | correct_html = returned_html[start_index:end_index] 103 | # Slice list to get the response's html 104 | correct_html = ''.join([ch.strip().replace('&lt;', '<').replace('&gt;', '>') for ch in correct_html]) 105 | return correct_html 106 | 107 | 108 | class MissingKey(Exception): 109 | """ 110 | :raise: When there is a module that has not been provided its API key 111 | """ 112 | def __init__(self, source: Optional[str]): 113 | if source: 114 | self.message = f'\n\033[93m[!] Missing API key for {source}. \033[0m' 115 | else: 116 | self.message = '\n\033[93m[!] Missing CSE id. \033[0m' 117 | 118 | def __str__(self) -> str: 119 | return self.message 120 | -------------------------------------------------------------------------------- /theHarvester/discovery/crtsh.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | from typing import List, Set 3 | 4 | 5 | class SearchCrtsh: 6 | 7 | def __init__(self, word): 8 | self.word = word 9 | self.data = set() 10 | self.proxy = False 11 | 12 | async def do_search(self) -> List: 13 | data: set = set() 14 | try: 15 | url = f'https://crt.sh/?q=%25.{self.word}&output=json' 16 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 17 | response = response[0] 18 | data = set( 19 | [dct['name_value'][2:] if '*.'
== dct['name_value'][:2] else dct['name_value'] 20 | for dct in response]) 21 | data = {domain for domain in data if (domain[0] != '*' and str(domain[0:4]).isnumeric() is False)} 22 | except Exception as e: 23 | print(e) 24 | clean = [] 25 | for x in data: 26 | pre = x.split() 27 | for y in pre: 28 | clean.append(y) 29 | return clean 30 | 31 | async def process(self, proxy=False) -> None: 32 | self.proxy = proxy 33 | data = await self.do_search() 34 | self.data = data 35 | 36 | async def get_hostnames(self) -> Set: 37 | return self.data 38 | -------------------------------------------------------------------------------- /theHarvester/discovery/dnsdumpster.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | from theHarvester.parsers import myparser 3 | import aiohttp 4 | import asyncio 5 | 6 | 7 | class SearchDnsDumpster: 8 | 9 | def __init__(self, word): 10 | self.word = word.replace(' ', '%20') 11 | self.results = "" 12 | self.totalresults = "" 13 | self.server = 'dnsdumpster.com' 14 | self.proxy = False 15 | 16 | async def do_search(self): 17 | try: 18 | agent = Core.get_user_agent() 19 | headers = {'User-Agent': agent} 20 | session = aiohttp.ClientSession(headers=headers) 21 | # create a session to properly verify 22 | url = f'https://{self.server}' 23 | csrftoken = '' 24 | if self.proxy is False: 25 | async with session.get(url, headers=headers) as resp: 26 | cookies = str(resp.cookies) 27 | cookies = cookies.split('csrftoken=') 28 | csrftoken += cookies[1][:cookies[1].find(';')] 29 | else: 30 | async with session.get(url, headers=headers, proxy=self.proxy) as resp: 31 | cookies = str(resp.cookies) 32 | cookies = cookies.split('csrftoken=') 33 | csrftoken += cookies[1][:cookies[1].find(';')] 34 | await asyncio.sleep(2) 35 | 36 | # extract csrftoken from cookies 37 | data = { 38 | 'Cookie': f'csfrtoken={csrftoken}', 'csrfmiddlewaretoken': csrftoken, 39 | 'targetip': self.word, 'user': 'free'} 40 | headers['Referer'] = url 41 | if self.proxy is False: 42 | async with session.post(url, headers=headers, data=data) as resp: 43 | self.results = await resp.text() 44 | else: 45 | async with session.post(url, headers=headers, data=data, proxy=self.proxy) as resp: 46 | self.results = await resp.text() 47 | await session.close() 48 | except Exception as e: 49 | print(f'An exception occurred: {e}') 50 | self.totalresults += self.results 51 | 52 | async def get_hostnames(self): 53 | rawres = myparser.Parser(self.totalresults, self.word) 54 | return await rawres.hostnames() 55 | 56 | async def process(self, proxy=False): 57 | self.proxy = proxy 58 | await self.do_search() # Only need to do it once. 59 | -------------------------------------------------------------------------------- /theHarvester/discovery/dnssearch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | ============ 5 | DNS Browsing 6 | ============ 7 | 8 | Explore the space around known hosts & ips for extra catches. 
9 | """ 10 | 11 | import re 12 | import sys 13 | 14 | from aiodns import DNSResolver 15 | from ipaddress import IPv4Network 16 | from typing import Callable, List, Optional 17 | from theHarvester.lib import hostchecker 18 | 19 | 20 | ##################################################################### 21 | # DNS FORCE 22 | ##################################################################### 23 | 24 | 25 | class DnsForce: 26 | 27 | def __init__(self, domain, dnsserver, verbose=False): 28 | self.domain = domain 29 | self.subdo = False 30 | self.verbose = verbose 31 | # self.dnsserver = [dnsserver] if isinstance(dnsserver, str) else dnsserver 32 | self.dnsserver = list(map(str, dnsserver.split(','))) if isinstance(dnsserver, str) else dnsserver 33 | try: 34 | with open('/etc/theHarvester/wordlists/dns-names.txt', 'r') as file: 35 | self.list = file.readlines() 36 | except FileNotFoundError: 37 | try: 38 | with open('/usr/local/etc/theHarvester/wordlists/dns-names.txt', 'r') as file: 39 | self.list = file.readlines() 40 | except FileNotFoundError: 41 | with open('wordlists/dns-names.txt', 'r') as file: 42 | self.list = file.readlines() 43 | self.domain = domain.replace('www.', '') 44 | self.list = [f'{word.strip()}.{self.domain}' for word in self.list] 45 | 46 | async def run(self): 47 | print(f'Starting DNS brute forcing with {len(self.list)} words') 48 | checker = hostchecker.Checker( 49 | self.list) if self.dnsserver == [] or self.dnsserver == "" or self.dnsserver is None \ 50 | else hostchecker.Checker(self.list, nameserver=self.dnsserver) 51 | hosts, ips = await checker.check() 52 | return hosts, ips 53 | 54 | 55 | ##################################################################### 56 | # DNS REVERSE 57 | ##################################################################### 58 | 59 | 60 | IP_REGEX = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}' 61 | PORT_REGEX = r'\d{1,5}' 62 | NETMASK_REGEX = r'\d{1,2}|' + IP_REGEX 63 | NETWORK_REGEX = r'\b({})(?:\:({}))?(?:\/({}))?\b'.format( 64 | IP_REGEX, 65 | PORT_REGEX, 66 | NETMASK_REGEX) 67 | 68 | 69 | def serialize_ip_range(ip: str, netmask: str = '24') -> str: 70 | """ 71 | Serialize a network range in a constant format, 'x.x.x.x/y'. 72 | 73 | Parameters 74 | ---------- 75 | ip: str. 76 | A serialized ip in the format 'x.x.x.x'. 77 | Extra information like port (':z') or subnet ('/n') 78 | will be ignored. 79 | netmask: str. 80 | The subnet subdivision, represented by a 2 digit netmask. 81 | 82 | Returns 83 | ------- 84 | out: str. 85 | The network OSI address, like '192.168.0.0/24'. 86 | """ 87 | __ip_matches = re.search(NETWORK_REGEX, ip, re.IGNORECASE) 88 | if __ip_matches and __ip_matches.groups(): 89 | __ip = __ip_matches.group(1) 90 | __netmask = netmask if netmask else __ip_matches.group(3) 91 | if __ip and __netmask: 92 | return str(IPv4Network('{}/{}'.format(__ip, __netmask), strict=False)) 93 | elif __ip: 94 | return str(IPv4Network('{}/{}'.format(__ip, '24'), strict=False)) 95 | 96 | # invalid input ip 97 | return '' 98 | 99 | 100 | def list_ips_in_network_range(iprange: str) -> List[str]: 101 | """ 102 | List all the IPs in the range. 103 | 104 | Parameters 105 | ---------- 106 | iprange: str. 107 | A serialized ip range, like '1.2.3.0/24'. 108 | The last digit can be set to anything, it will be ignored. 109 | 110 | Returns 111 | ------- 112 | out: list. 113 | The list of IPs in the range. 
114 | """ 115 | try: 116 | __network = IPv4Network(iprange, strict=False) 117 | return [__address.exploded for __address in __network.hosts()] 118 | except Exception: 119 | return [] 120 | 121 | 122 | async def reverse_single_ip(ip: str, resolver: DNSResolver) -> str: 123 | """ 124 | Reverse a single IP and output the linked CNAME, if it exists. 125 | Parameters 126 | ---------- 127 | :param ip: IP address to reverse 128 | :param resolver: DNS server to use 129 | 130 | Returns 131 | ------- 132 | :return str: with the corresponding CNAME or None 133 | """ 134 | try: 135 | __host = await resolver.gethostbyaddr(ip) 136 | return __host.name if __host else '' 137 | except Exception: 138 | return '' 139 | 140 | 141 | async def reverse_all_ips_in_range(iprange: str, callback: Callable, nameservers: Optional[List[str]] = None) -> None: 142 | """ 143 | Reverse all the IPs stored in a network range. 144 | All the queries are made concurrently. 145 | 146 | Parameters 147 | ---------- 148 | iprange: str. 149 | An IPv4 range formatted as 'x.x.x.x/y'. 150 | The last 2 digits of the ip can be set to anything, 151 | they will be ignored. 152 | callback: Callable. 153 | Arbitrary postprocessing function. 154 | nameservers: List[str]. 155 | Optional list of DNS servers. 156 | 157 | Returns 158 | ------- 159 | out: None. 160 | """ 161 | __resolver = DNSResolver(timeout=4, nameservers=nameservers) 162 | for __ip in list_ips_in_network_range(iprange): 163 | log_query(__ip) 164 | __host = await reverse_single_ip(ip=__ip, resolver=__resolver) 165 | callback(__host) 166 | log_result(__host) 167 | 168 | 169 | ##################################################################### 170 | # IO 171 | ##################################################################### 172 | 173 | 174 | def log_query(ip: str) -> None: 175 | """ 176 | Display the current query in the console. 177 | 178 | Parameters 179 | ---------- 180 | ip: str. 181 | Queried ip. 182 | 183 | Results 184 | ------- 185 | out: None. 186 | """ 187 | sys.stdout.write(chr(27) + '[2K' + chr(27) + '[G') 188 | sys.stdout.write('\r' + ip + ' - ') 189 | sys.stdout.flush() 190 | 191 | 192 | def log_result(host: str) -> None: 193 | """ 194 | Display the query result in the console. 195 | 196 | Parameters 197 | ---------- 198 | host: str. 199 | Host name returned by the DNS query. 200 | 201 | Results 202 | ------- 203 | out: None. 204 | """ 205 | if host: 206 | print(host) 207 | 208 | 209 | def generate_postprocessing_callback(target: str, **allhosts: List[str]) -> Callable: 210 | """ 211 | Postprocess the query results asynchronously too, instead of waiting for 212 | the querying stage to be completely finished. 213 | 214 | Parameters 215 | ---------- 216 | target: str. 217 | The domain wanted as TLD. 218 | allhosts: List. 219 | A collection of all the subdomains -of target- found so far. 220 | 221 | Returns 222 | ------- 223 | out: Callable. 224 | A function that will update the collection of target subdomains 225 | when the query result is satisfying. 
226 | """ 227 | 228 | def append_matching_hosts(host: str) -> None: 229 | if host and target in host: 230 | for __name, __hosts in allhosts.items(): 231 | if host not in __hosts: 232 | __hosts.append(host) 233 | 234 | return append_matching_hosts 235 | -------------------------------------------------------------------------------- /theHarvester/discovery/duckduckgosearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | from theHarvester.parsers import myparser 4 | import json 5 | 6 | 7 | class SearchDuckDuckGo: 8 | 9 | def __init__(self, word, limit): 10 | self.word = word 11 | self.results = "" 12 | self.totalresults = "" 13 | self.dorks = [] 14 | self.links = [] 15 | self.database = 'https://duckduckgo.com/?q=' 16 | self.api = 'https://api.duckduckgo.com/?q=x&format=json&pretty=1' # Currently using API. 17 | self.quantity = '100' 18 | self.limit = limit 19 | self.proxy = False 20 | 21 | async def do_search(self): 22 | # Do normal scraping. 23 | url = self.api.replace('x', self.word) 24 | headers = {'User-Agent': googleUA} 25 | first_resp = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy) 26 | self.results = first_resp[0] 27 | self.totalresults += self.results 28 | urls = await self.crawl(self.results) 29 | urls = {url for url in urls if len(url) > 5} 30 | all_resps = await AsyncFetcher.fetch_all(urls) 31 | self.totalresults += ''.join(all_resps) 32 | 33 | async def crawl(self, text): 34 | """ 35 | Function parses json and returns URLs. 36 | :param text: formatted json 37 | :return: set of URLs 38 | """ 39 | urls = set() 40 | try: 41 | load = json.loads(text) 42 | for keys in load.keys(): # Iterate through keys of dict. 43 | val = load.get(keys) 44 | if isinstance(val, int) or isinstance(val, dict) or val is None: 45 | continue 46 | if isinstance(val, list): 47 | if len(val) == 0: # Make sure not indexing an empty list. 48 | continue 49 | val = val[0] # First value should be dict. 50 | if isinstance(val, dict): # Sanity check. 51 | for key in val.keys(): 52 | value = val.get(key) 53 | if isinstance(value, str) and value != '' and 'https://' in value or 'http://' in value: 54 | urls.add(value) 55 | if isinstance(val, str) and val != '' and 'https://' in val or 'http://' in val: 56 | urls.add(val) 57 | tmp = set() 58 | for url in urls: 59 | if '<' in url and 'href=' in url: # Format is 60 | equal_index = url.index('=') 61 | true_url = '' 62 | for ch in url[equal_index + 1:]: 63 | if ch == '"': 64 | tmp.add(true_url) 65 | break 66 | true_url += ch 67 | else: 68 | if url != '': 69 | tmp.add(url) 70 | return tmp 71 | except Exception as e: 72 | print(f'Exception occurred: {e}') 73 | return [] 74 | 75 | async def get_emails(self): 76 | rawres = myparser.Parser(self.totalresults, self.word) 77 | return await rawres.emails() 78 | 79 | async def get_hostnames(self): 80 | rawres = myparser.Parser(self.totalresults, self.word) 81 | return await rawres.hostnames() 82 | 83 | async def process(self, proxy=False): 84 | self.proxy = proxy 85 | await self.do_search() # Only need to search once since using API. 
86 | -------------------------------------------------------------------------------- /theHarvester/discovery/fullhuntsearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchFullHunt: 6 | 7 | def __init__(self, word): 8 | self.word = word 9 | self.key = Core.fullhunt_key() 10 | if self.key is None: 11 | raise MissingKey('fullhunt') 12 | self.total_results = None 13 | self.proxy = False 14 | 15 | async def do_search(self): 16 | url = f'https://fullhunt.io/api/v1/domain/{self.word}/subdomains' 17 | response = await AsyncFetcher.fetch_all([url], json=True, headers={'User-Agent': Core.get_user_agent(), 18 | 'X-API-KEY': self.key}, 19 | proxy=self.proxy) 20 | self.total_results = response[0]['hosts'] 21 | 22 | async def get_hostnames(self) -> set: 23 | return self.total_results 24 | 25 | async def process(self, proxy=False): 26 | self.proxy = proxy 27 | await self.do_search() 28 | -------------------------------------------------------------------------------- /theHarvester/discovery/githubcode.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | from theHarvester.parsers import myparser 4 | from typing import List, Dict, Any, Optional, NamedTuple, Tuple 5 | import asyncio 6 | import aiohttp 7 | import urllib.parse as urlparse 8 | import random 9 | 10 | 11 | class RetryResult(NamedTuple): 12 | time: float 13 | 14 | 15 | class SuccessResult(NamedTuple): 16 | fragments: List[str] 17 | next_page: Optional[int] 18 | last_page: Optional[int] 19 | 20 | 21 | class ErrorResult(NamedTuple): 22 | status_code: int 23 | body: Any 24 | 25 | 26 | class SearchGithubCode: 27 | 28 | def __init__(self, word, limit): 29 | self.word = word 30 | self.total_results = "" 31 | self.server = 'api.github.com' 32 | self.limit = limit 33 | self.counter = 0 34 | self.page = 1 35 | self.key = Core.github_key() 36 | # If you don't have a personal access token, github narrows your search capabilities significantly 37 | # rate limits you more severely 38 | # https://developer.github.com/v3/search/#rate-limit 39 | if self.key is None: 40 | raise MissingKey('Github') 41 | self.proxy = False 42 | 43 | @staticmethod 44 | async def fragments_from_response(json_data: dict) -> List[str]: 45 | items: List[Dict[str, Any]] = json_data.get('items') or list() 46 | fragments: List[str] = list() 47 | for item in items: 48 | matches = item.get("text_matches") or list() 49 | for match in matches: 50 | fragments.append(match.get("fragment")) 51 | 52 | return [fragment for fragment in fragments if fragment is not None] 53 | 54 | @staticmethod 55 | async def page_from_response(page: str, links) -> Optional[Any]: 56 | page_link = links.get(page) 57 | if page_link: 58 | parsed = urlparse.urlparse(str(page_link.get("url"))) 59 | params = urlparse.parse_qs(parsed.query) 60 | pages: List[Any] = params.get('page', [None]) 61 | page_number = pages[0] and int(pages[0]) 62 | return page_number 63 | else: 64 | return None 65 | 66 | async def handle_response(self, response: Tuple[str, dict, int, Any]): 67 | text, json_data, status, links = response 68 | if status == 200: 69 | results = await self.fragments_from_response(json_data) 70 | next_page = await self.page_from_response("next", links) 71 | last_page = await self.page_from_response("last", links) 72 | return 
SuccessResult(results, next_page, last_page) 73 | elif status == 429 or status == 403: 74 | return RetryResult(60) 75 | else: 76 | try: 77 | return ErrorResult(status, json_data) 78 | except ValueError: 79 | return ErrorResult(status, text) 80 | 81 | async def do_search(self, page: Optional[int]) -> Tuple[str, dict, int, Any]: 82 | if page is None: 83 | url = f'https://{self.server}/search/code?q="{self.word}"' 84 | else: 85 | url = f'https://{self.server}/search/code?q="{self.word}"&page={page}' 86 | headers = { 87 | 'Host': self.server, 88 | 'User-agent': Core.get_user_agent(), 89 | 'Accept': "application/vnd.github.v3.text-match+json", 90 | 'Authorization': f'token {self.key}' 91 | } 92 | 93 | async with aiohttp.ClientSession(headers=headers) as sess: 94 | if self.proxy: 95 | async with sess.get(url, proxy=random.choice(Core.proxy_list())) as resp: 96 | return await resp.text(), await resp.json(), resp.status, resp.links 97 | else: 98 | async with sess.get(url) as resp: 99 | return await resp.text(), await resp.json(), resp.status, resp.links 100 | 101 | @staticmethod 102 | async def next_page_or_end(result: SuccessResult) -> Optional[int]: 103 | if result.next_page is not None: 104 | return result.next_page 105 | else: 106 | return result.last_page 107 | 108 | async def process(self, proxy=False): 109 | self.proxy = proxy 110 | try: 111 | while self.counter <= self.limit and self.page is not None: 112 | api_response = await self.do_search(self.page) 113 | result = await self.handle_response(api_response) 114 | if isinstance(result, SuccessResult): 115 | print(f'\tSearching {self.counter} results.') 116 | for fragment in result.fragments: 117 | self.total_results += fragment 118 | self.counter = self.counter + 1 119 | self.page = await self.next_page_or_end(result) 120 | await asyncio.sleep(get_delay()) 121 | elif isinstance(result, RetryResult): 122 | sleepy_time = get_delay() + result.time 123 | print(f'\tRetrying page in {sleepy_time} seconds...') 124 | await asyncio.sleep(sleepy_time) 125 | elif isinstance(result, ErrorResult): 126 | raise Exception(f"\tException occurred: status_code: {result.status_code} reason: {result.body}") 127 | else: 128 | raise Exception("\tUnknown exception occurred") 129 | except Exception as e: 130 | print(f'An exception has occurred: {e}') 131 | 132 | async def get_emails(self): 133 | rawres = myparser.Parser(self.total_results, self.word) 134 | return await rawres.emails() 135 | 136 | async def get_hostnames(self): 137 | rawres = myparser.Parser(self.total_results, self.word) 138 | return await rawres.hostnames() 139 | -------------------------------------------------------------------------------- /theHarvester/discovery/hackertarget.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | 3 | 4 | class SearchHackerTarget: 5 | """ 6 | Class uses the HackerTarget api to gather subdomains and ips 7 | """ 8 | 9 | def __init__(self, word): 10 | self.word = word 11 | self.total_results = "" 12 | self.hostname = 'https://api.hackertarget.com' 13 | self.proxy = False 14 | self.results = None 15 | 16 | async def do_search(self): 17 | headers = {'User-agent': Core.get_user_agent()} 18 | urls = [f'{self.hostname}/hostsearch/?q={self.word}', f'{self.hostname}/reversedns/?q={self.word}'] 19 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 20 | for response in responses: 21 | self.total_results += response.replace(",", ":") 22 | 23 | async def process(self, 
proxy=False): 24 | self.proxy = proxy 25 | await self.do_search() 26 | 27 | async def get_hostnames(self) -> list: 28 | return [result for result in self.total_results.splitlines() if 'No PTR records found' not in result] 29 | -------------------------------------------------------------------------------- /theHarvester/discovery/huntersearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchHunter: 6 | 7 | def __init__(self, word, limit, start): 8 | self.word = word 9 | self.limit = limit 10 | self.limit = 10 if limit > 10 else limit 11 | self.start = start 12 | self.key = Core.hunter_key() 13 | if self.key is None: 14 | raise MissingKey('Hunter') 15 | self.total_results = "" 16 | self.counter = start 17 | self.database = f'https://api.hunter.io/v2/domain-search?domain={self.word}&api_key={self.key}&limit=10' 18 | self.proxy = False 19 | self.hostnames = [] 20 | self.emails = [] 21 | 22 | async def do_search(self): 23 | # First determine if user account is not a free account, this call is free 24 | is_free = True 25 | headers = {'User-Agent': Core.get_user_agent()} 26 | acc_info_url = f'https://api.hunter.io/v2/account?api_key={self.key}' 27 | response = await AsyncFetcher.fetch_all([acc_info_url], headers=headers, json=True) 28 | is_free = is_free if 'plan_name' in response[0]['data'].keys() and response[0]['data']['plan_name'].lower() \ 29 | == 'free' else False 30 | # Extract total number of requests that are available for account 31 | 32 | total_requests_avail = response[0]['data']['requests']['searches']['available'] - response[0]['data']['requests']['searches']['used'] 33 | if is_free: 34 | response = await AsyncFetcher.fetch_all([self.database], headers=headers, proxy=self.proxy, json=True) 35 | self.emails, self.hostnames = await self.parse_resp(json_resp=response[0]) 36 | else: 37 | # Determine total number of emails that are available 38 | # As the most emails you can get within one query is 100 39 | # This is only done where paid accounts are in play 40 | hunter_dinfo_url = f'https://api.hunter.io/v2/email-count?domain={self.word}' 41 | response = await AsyncFetcher.fetch_all([hunter_dinfo_url], headers=headers, proxy=self.proxy, json=True) 42 | total_number_reqs = response[0]['data']['total'] // 100 43 | # Parse out meta field within initial JSON response to determine total number of results 44 | if total_requests_avail < total_number_reqs: 45 | print('WARNING: account does not have enough requests to gather all emails') 46 | print(f'Total requests available: {total_requests_avail}, total requests ' 47 | f'needed to be made: {total_number_reqs}') 48 | print('RETURNING current results, if you would still like to ' 49 | 'run this module comment out the if request') 50 | return 51 | self.limit = 100 52 | # max number of emails you can get per request is 100 53 | # increments of 100 with offset determining where to start 54 | # See docs for more details: https://hunter.io/api-documentation/v2#domain-search 55 | for offset in range(0, 100 * total_number_reqs, 100): 56 | req_url = f'https://api.hunter.io/v2/domain-search?domain={self.word}&api_key={self.key}&limit{self.limit}&offset={offset}' 57 | response = await AsyncFetcher.fetch_all([req_url], headers=headers, proxy=self.proxy, json=True) 58 | temp_emails, temp_hostnames = await self.parse_resp(response[0]) 59 | self.emails.extend(temp_emails) 60 | 
self.hostnames.extend(temp_hostnames) 61 | await asyncio.sleep(1) 62 | 63 | async def parse_resp(self, json_resp): 64 | emails = list(sorted({email['value'] for email in json_resp['data']['emails']})) 65 | domains = list(sorted({source['domain'] for email in json_resp['data']['emails'] for source in email['sources'] 66 | if self.word in source['domain']})) 67 | return emails, domains 68 | 69 | async def process(self, proxy=False): 70 | self.proxy = proxy 71 | await self.do_search() # Only need to do it once. 72 | 73 | async def get_emails(self): 74 | return self.emails 75 | 76 | async def get_hostnames(self): 77 | return self.hostnames 78 | -------------------------------------------------------------------------------- /theHarvester/discovery/intelxsearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | from theHarvester.parsers import intelxparser 4 | import asyncio 5 | import json 6 | import requests 7 | 8 | 9 | class SearchIntelx: 10 | 11 | def __init__(self, word): 12 | self.word = word 13 | self.key = Core.intelx_key() 14 | if self.key is None: 15 | raise MissingKey('Intelx') 16 | self.database = 'https://2.intelx.io' 17 | self.results = None 18 | self.info = () 19 | self.limit = 10000 20 | self.proxy = False 21 | self.offset = -1 22 | 23 | async def do_search(self): 24 | try: 25 | # Based on: https://github.com/IntelligenceX/SDK/blob/master/Python/intelxapi.py 26 | # API requests self identification 27 | # https://intelx.io/integrations 28 | headers = {'x-key': self.key, 'User-Agent': f'{Core.get_user_agent()}-theHarvester'} 29 | data = { 30 | "term": self.word, 31 | "buckets": [], 32 | "lookuplevel": 0, 33 | "maxresults": self.limit, 34 | "timeout": 5, 35 | "datefrom": "", 36 | "dateto": "", 37 | "sort": 2, 38 | "media": 0, 39 | "terminate": [], 40 | "target": 0 41 | } 42 | 43 | total_resp = requests.post(f'{self.database}/phonebook/search', headers=headers, json=data) 44 | phonebook_id = json.loads(total_resp.text)['id'] 45 | await asyncio.sleep(2) 46 | 47 | # Fetch results from phonebook based on ID 48 | resp = await AsyncFetcher.fetch_all( 49 | [f'{self.database}/phonebook/search/result?id={phonebook_id}&limit={self.limit}&offset={self.offset}'], 50 | headers=headers, json=True, proxy=self.proxy) 51 | resp = resp[0] 52 | self.results = resp 53 | except Exception as e: 54 | print(f'An exception has occurred in Intelx: {e}') 55 | 56 | async def process(self, proxy=False): 57 | self.proxy = proxy 58 | await self.do_search() 59 | intelx_parser = intelxparser.Parser() 60 | self.info = await intelx_parser.parse_dictionaries(self.results) 61 | 62 | async def get_emails(self): 63 | return self.info[0] 64 | 65 | async def get_interestingurls(self): 66 | return self.info[1] 67 | -------------------------------------------------------------------------------- /theHarvester/discovery/omnisint.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | 3 | 4 | class SearchOmnisint: 5 | def __init__(self, word): 6 | self.word = word 7 | self.totalhosts = set() 8 | self.totalips = set() 9 | self.proxy = False 10 | 11 | async def do_search(self): 12 | base_url = f'https://sonar.omnisint.io/all/{self.word}?page=1' 13 | responses = await AsyncFetcher.fetch_all([base_url], json=True, headers={'User-Agent': Core.get_user_agent()}, 14 | proxy=self.proxy) 15 | self.totalhosts = list({host for host in 
responses[0]}) 16 | 17 | async def get_hostnames(self) -> set: 18 | return self.totalhosts 19 | 20 | async def get_ips(self) -> set: 21 | return self.totalips 22 | 23 | async def process(self, proxy=False): 24 | self.proxy = proxy 25 | await self.do_search() 26 | -------------------------------------------------------------------------------- /theHarvester/discovery/otxsearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | import re 3 | 4 | 5 | class SearchOtx: 6 | 7 | def __init__(self, word): 8 | self.word = word 9 | self.totalhosts = set() 10 | self.totalips = set() 11 | self.proxy = False 12 | 13 | async def do_search(self): 14 | url = f'https://otx.alienvault.com/api/v1/indicators/domain/{self.word}/passive_dns' 15 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 16 | responses = response[0] 17 | dct = responses 18 | self.totalhosts: set = {host['hostname'] for host in dct['passive_dns']} 19 | # filter out ips that are just called NXDOMAIN 20 | self.totalips: set = {ip['address'] for ip in dct['passive_dns'] 21 | if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip['address'])} 22 | 23 | async def get_hostnames(self) -> set: 24 | return self.totalhosts 25 | 26 | async def get_ips(self) -> set: 27 | return self.totalips 28 | 29 | async def process(self, proxy=False): 30 | self.proxy = proxy 31 | await self.do_search() 32 | -------------------------------------------------------------------------------- /theHarvester/discovery/pentesttools.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | import json 4 | import time 5 | 6 | 7 | class SearchPentestTools: 8 | 9 | def __init__(self, word): 10 | # Script is largely based off https://pentest-tools.com/public/api_client.py.txt 11 | self.word = word 12 | self.key = Core.pentest_tools_key() 13 | if self.key is None: 14 | raise MissingKey('PentestTools') 15 | self.total_results = [] 16 | self.api = f'https://pentest-tools.com/api?key={self.key}' 17 | self.proxy = False 18 | 19 | async def poll(self, scan_id): 20 | while True: 21 | time.sleep(3) 22 | # Get the status of our scan 23 | scan_status_data = { 24 | 'op': 'get_scan_status', 25 | 'scan_id': scan_id 26 | } 27 | responses = await AsyncFetcher.post_fetch(url=self.api, data=json.dumps(scan_status_data), proxy=self.proxy) 28 | res_json = json.loads(responses.strip()) 29 | if res_json['op_status'] == 'success': 30 | if res_json['scan_status'] != 'waiting' and res_json['scan_status'] != 'running': 31 | getoutput_data = { 32 | 'op': 'get_output', 33 | 'scan_id': scan_id, 34 | 'output_format': 'json' 35 | } 36 | responses = await AsyncFetcher.post_fetch(url=self.api, 37 | data=json.dumps(getoutput_data), 38 | proxy=self.proxy) 39 | 40 | res_json = json.loads(responses.strip('\n')) 41 | self.total_results = await self.parse_json(res_json) 42 | break 43 | else: 44 | print(f"Operation get_scan_status failed because: {res_json['error']}. 
{res_json['details']}") 45 | break 46 | 47 | @staticmethod 48 | async def parse_json(json_results): 49 | status = json_results['op_status'] 50 | if status == 'success': 51 | scan_tests = json_results['scan_output']['output_json'] 52 | output_data = scan_tests[0]['output_data'] 53 | host_to_ip = [f'{subdomain[0]}:{subdomain[1]}' for subdomain in output_data if len(subdomain) > 0] 54 | return host_to_ip 55 | return [] 56 | 57 | async def get_hostnames(self) -> list: 58 | return self.total_results 59 | 60 | async def do_search(self): 61 | subdomain_payload = { 62 | 'op': 'start_scan', 63 | 'tool_id': 20, 64 | 'tool_params': { 65 | 'target': f'{self.word}', 66 | 'web_details': 'off', 67 | 'do_smart_search': 'off' 68 | } 69 | } 70 | responses = await AsyncFetcher.post_fetch(url=self.api, data=json.dumps(subdomain_payload), proxy=self.proxy) 71 | res_json = json.loads(responses.strip()) 72 | if res_json['op_status'] == 'success': 73 | scan_id = res_json['scan_id'] 74 | await self.poll(scan_id) 75 | 76 | async def process(self, proxy=False): 77 | self.proxy = proxy 78 | await self.do_search() # Only need to do it once. 79 | -------------------------------------------------------------------------------- /theHarvester/discovery/projectdiscovery.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchDiscovery: 6 | 7 | def __init__(self, word): 8 | self.word = word 9 | self.key = Core.projectdiscovery_key() 10 | if self.key is None: 11 | raise MissingKey('ProjectDiscovery') 12 | self.total_results = None 13 | self.proxy = False 14 | 15 | async def do_search(self): 16 | url = f'https://dns.projectdiscovery.io/dns/{self.word}/subdomains' 17 | response = await AsyncFetcher.fetch_all([url], json=True, headers={'User-Agent': Core.get_user_agent(), 18 | 'Authorization': self.key}, 19 | proxy=self.proxy) 20 | self.total_results = [f'{domains}.{self.word}' for domains in response[0]['subdomains']] 21 | 22 | async def get_hostnames(self) -> set: 23 | return self.total_results 24 | 25 | async def process(self, proxy=False): 26 | self.proxy = proxy 27 | await self.do_search() 28 | -------------------------------------------------------------------------------- /theHarvester/discovery/qwantsearch.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | from json.decoder import JSONDecodeError 4 | 5 | from theHarvester.lib.core import * 6 | from theHarvester.parsers import myparser 7 | 8 | 9 | class SearchQwant: 10 | def __init__(self, word, start, limit): 11 | self.word = word 12 | self.total_results = "" 13 | self.limit = int(limit) 14 | self.start = int(start) 15 | self.proxy = False 16 | 17 | def get_start_offset(self) -> int: 18 | """ 19 | print(get_start_offset(0)) 20 | >>> 0 21 | print(get_start_offset(7)) 22 | >>> 0 23 | print(get_start_offset(25)) 24 | >>> 20 25 | print(get_start_offset(42)) 26 | >>> 40 27 | """ 28 | start = int(math.floor(self.start / 10.0)) * 10 29 | return max(start, 0) 30 | 31 | async def do_search(self) -> None: 32 | headers = {'User-agent': Core.get_user_agent()} 33 | 34 | start = self.get_start_offset() 35 | limit = self.limit + start 36 | step = 10 37 | 38 | api_urls = [ 39 | f"https://api.qwant.com/api/search/web?count=10&offset={str(offset)}&q={self.word}&t=web&r=US&device=desktop&safesearch=0&locale=en_US&uiv=4" 40 | for offset in range(start, limit, step) 41 | ] 42 | 43 
| responses = await AsyncFetcher.fetch_all(api_urls, headers=headers, proxy=self.proxy) 44 | 45 | for response in responses: 46 | try: 47 | json_response = json.loads(response) 48 | except JSONDecodeError: 49 | # sometimes error 502 from server 50 | continue 51 | 52 | try: 53 | response_items = json_response['data']['result']['items'] 54 | except KeyError: 55 | if json_response.get("status", None) \ 56 | and json_response.get("error", None) == 24: 57 | # https://www.qwant.com/anti_robot 58 | print("Rate limit reached - IP Blocked until captcha is solved") 59 | break 60 | continue 61 | 62 | for response_item in response_items: 63 | desc = response_item.get('desc', '') 64 | """ 65 | response_item[0]['desc'] = "end of previous description." 66 | response_item[1]['desc'] = "john.doo@company.com start the next description" 67 | total_results = "end of first description.john.doo@company.com" 68 | get_emails() = "description.john.doo@company.com" 69 | """ 70 | self.total_results += " " 71 | self.total_results += desc 72 | 73 | async def get_emails(self) -> set: 74 | parser = myparser.Parser(self.total_results, self.word) 75 | return await parser.emails() 76 | 77 | async def get_hostnames(self) -> list: 78 | parser = myparser.Parser(self.total_results, self.word) 79 | return await parser.hostnames() 80 | 81 | async def process(self, proxy=False) -> None: 82 | self.proxy = proxy 83 | await self.do_search() 84 | -------------------------------------------------------------------------------- /theHarvester/discovery/rapiddns.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchRapidDns: 6 | 7 | def __init__(self, word): 8 | self.word = word 9 | self.total_results = [] 10 | self.proxy = False 11 | 12 | async def do_search(self): 13 | try: 14 | headers = {'User-agent': Core.get_user_agent()} 15 | # TODO see if it's worth adding sameip searches 16 | # f'{self.hostname}/sameip/{self.word}?full=1#result' 17 | urls = [f'https://rapiddns.io/subdomain/{self.word}?full=1#result'] 18 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 19 | if len(responses[0]) <= 1: 20 | return self.total_results 21 | soup = BeautifulSoup(responses[0], 'html.parser') 22 | rows = soup.find("table").find("tbody").find_all("tr") 23 | if rows: 24 | # Sanity check 25 | for row in rows: 26 | cells = row.find_all("td") 27 | if len(cells) >= 0: 28 | # sanity check 29 | subdomain = str(cells[0].get_text()) 30 | if cells[-1].get_text() == 'CNAME': 31 | self.total_results.append(f'{subdomain}') 32 | else: 33 | self.total_results.append(f'{subdomain}:{str(cells[1].get_text()).strip()}') 34 | self.total_results = list({domain for domain in self.total_results}) 35 | except Exception as e: 36 | print(f'An exception has occurred: {str(e)}') 37 | 38 | async def process(self, proxy=False): 39 | self.proxy = proxy 40 | await self.do_search() 41 | 42 | async def get_hostnames(self): 43 | return self.total_results 44 | -------------------------------------------------------------------------------- /theHarvester/discovery/rocketreach.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | import asyncio 4 | 5 | 6 | class SearchRocketReach: 7 | 8 | def __init__(self, word, limit): 9 | self.ips = set() 10 | self.word = word 11 | self.key = Core.rocketreach_key() 12 | 
if self.key is None: 13 | raise MissingKey('RocketReach') 14 | self.hosts = set() 15 | self.proxy = False 16 | self.baseurl = 'https://api.rocketreach.co/v2/api/search' 17 | self.links = set() 18 | self.limit = limit 19 | 20 | async def do_search(self): 21 | try: 22 | headers = { 23 | 'Api-Key': self.key, 24 | 'Content-Type': 'application/json', 25 | 'User-Agent': Core.get_user_agent() 26 | } 27 | 28 | next_page = 1 # track pagniation 29 | for count in range(1, self.limit): 30 | data = f'{{"query":{{"company_domain": ["{self.word}"]}}, "start": {next_page}, "page_size": 100}}' 31 | result = await AsyncFetcher.post_fetch(self.baseurl, headers=headers, data=data, json=True) 32 | if 'detail' in result.keys() and 'error' in result.keys() and 'Subscribe to a plan to access' in result['detail']: 33 | # No more results can be fetched 34 | break 35 | if 'detail' in result.keys() and 'Request was throttled.' in result['detail']: 36 | # Rate limit has been triggered need to sleep extra 37 | print(f'RocketReach requests have been throttled; ' 38 | f'{result["detail"].split(" ", 3)[-1].replace("available", "availability")}') 39 | break 40 | if 'profiles' in dict(result).keys(): 41 | if len(result['profiles']) == 0: 42 | break 43 | for profile in result['profiles']: 44 | if 'linkedin_url' in dict(profile).keys(): 45 | self.links.add(profile['linkedin_url']) 46 | if 'pagination' in dict(result).keys(): 47 | next_page = int(result['pagination']['next']) 48 | if next_page > int(result['pagination']['total']): 49 | break 50 | 51 | await asyncio.sleep(get_delay() + 2) 52 | 53 | except Exception as e: 54 | print(f'An exception has occurred: {e}') 55 | 56 | async def get_links(self): 57 | return self.links 58 | 59 | async def process(self, proxy=False): 60 | self.proxy = proxy 61 | await self.do_search() 62 | -------------------------------------------------------------------------------- /theHarvester/discovery/securitytrailssearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | from theHarvester.parsers import securitytrailsparser 4 | import asyncio 5 | 6 | 7 | class SearchSecuritytrail: 8 | 9 | def __init__(self, word): 10 | self.word = word 11 | self.key = Core.security_trails_key() 12 | if self.key is None: 13 | raise MissingKey('Securitytrail') 14 | self.results = "" 15 | self.totalresults = "" 16 | self.api = 'https://api.securitytrails.com/v1/' 17 | self.info = () 18 | self.proxy = False 19 | 20 | async def authenticate(self) -> None: 21 | # Method to authenticate API key before sending requests. 22 | headers = {'APIKEY': self.key} 23 | url = f'{self.api}ping' 24 | auth_responses = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy) 25 | auth_responses = auth_responses[0] 26 | if 'False' in auth_responses or 'Invalid authentication' in auth_responses: 27 | print('\tKey could not be authenticated exiting program.') 28 | await asyncio.sleep(2) 29 | 30 | async def do_search(self) -> None: 31 | # https://api.securitytrails.com/v1/domain/domain.com 32 | url = f'{self.api}domain/{self.word}' 33 | headers = {'APIKEY': self.key} 34 | response = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy) 35 | await asyncio.sleep(2) # Not random delay because 2 seconds is required due to rate limit. 36 | self.results = response[0] 37 | self.totalresults += self.results 38 | url += '/subdomains' # Get subdomains now. 
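        # SecurityTrails serves the subdomain list from a separate endpoint
        # (/v1/domain/<domain>/subdomains), so a second request is made below and its
        # raw body is appended to self.totalresults for the parser to consume.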
39 | subdomain_response = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy) 40 | await asyncio.sleep(2) 41 | self.results = subdomain_response[0] 42 | self.totalresults += self.results 43 | 44 | async def process(self, proxy=False) -> None: 45 | self.proxy = proxy 46 | await self.authenticate() 47 | await self.do_search() 48 | parser = securitytrailsparser.Parser(word=self.word, text=self.totalresults) 49 | self.info = await parser.parse_text() 50 | # Create parser and set self.info to tuple returned from parsing text. 51 | print('\tDone Searching Results') 52 | 53 | async def get_ips(self) -> set: 54 | return self.info[0] 55 | 56 | async def get_hostnames(self) -> set: 57 | return self.info[1] 58 | -------------------------------------------------------------------------------- /theHarvester/discovery/shodansearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | from shodan import exception 4 | from shodan import Shodan 5 | from collections import OrderedDict 6 | 7 | 8 | class SearchShodan: 9 | 10 | def __init__(self): 11 | self.key = Core.shodan_key() 12 | if self.key is None: 13 | raise MissingKey('Shodan') 14 | self.api = Shodan(self.key) 15 | self.hostdatarow = [] 16 | self.tracker: OrderedDict = OrderedDict() 17 | 18 | async def search_ip(self, ip): 19 | try: 20 | ipaddress = ip 21 | results = self.api.host(ipaddress) 22 | asn = '' 23 | domains = list() 24 | hostnames = list() 25 | ip_str = '' 26 | isp = '' 27 | org = '' 28 | ports = list() 29 | title = '' 30 | server = '' 31 | product = '' 32 | technologies = list() 33 | 34 | data_first_dict = dict(results['data'][0]) 35 | 36 | if 'ip_str' in data_first_dict.keys(): 37 | ip_str += data_first_dict['ip_str'] 38 | 39 | if 'http' in data_first_dict.keys(): 40 | http_results_dict = dict(data_first_dict['http']) 41 | if 'title' in http_results_dict.keys(): 42 | title_val = str(http_results_dict['title']).strip() 43 | if title_val != 'None': 44 | title += title_val 45 | if 'components' in http_results_dict.keys(): 46 | for key in http_results_dict['components'].keys(): 47 | technologies.append(key) 48 | if 'server' in http_results_dict.keys(): 49 | server_val = str(http_results_dict['server']).strip() 50 | if server_val != 'None': 51 | server += server_val 52 | 53 | for key, value in results.items(): 54 | if key == 'asn': 55 | asn += value 56 | if key == 'domains': 57 | value = list(value) 58 | value.sort() 59 | domains.extend(value) 60 | if key == 'hostnames': 61 | value = [host.strip() for host in list(value)] 62 | value.sort() 63 | hostnames.extend(value) 64 | if key == 'isp': 65 | isp += value 66 | if key == 'org': 67 | org += str(value) 68 | if key == 'ports': 69 | value = list(value) 70 | value.sort() 71 | ports.extend(value) 72 | if key == 'product': 73 | product += value 74 | 75 | technologies = list(set(technologies)) 76 | 77 | self.tracker[ip] = {'asn': asn.strip(), 'domains': domains, 'hostnames': hostnames, 78 | 'ip_str': ip_str.strip(), 'isp': isp.strip(), 'org': org.strip(), 79 | 'ports': ports, 'product': product.strip(), 80 | 'server': server.strip(), 'technologies': technologies, 'title': title.strip()} 81 | 82 | return self.tracker 83 | except exception.APIError: 84 | print(f'{ip}: Not in Shodan') 85 | self.tracker[ip] = 'Not in Shodan' 86 | except Exception as e: 87 | # print(f'Error occurred in the Shodan IP search module: {e}') 88 | self.tracker[ip] = f'Error occurred 
in the Shodan IP search module: {e}' 89 | finally: 90 | return self.tracker 91 | -------------------------------------------------------------------------------- /theHarvester/discovery/sublist3r.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchSublist3r: 6 | 7 | def __init__(self, word): 8 | self.word = word 9 | self.totalhosts = list 10 | self.proxy = False 11 | 12 | async def do_search(self): 13 | url = f'https://api.sublist3r.com/search.php?domain={self.word}' 14 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 15 | self.totalhosts: list = response[0] 16 | 17 | async def get_hostnames(self) -> Type[list]: 18 | return self.totalhosts 19 | 20 | async def process(self, proxy=False): 21 | self.proxy = proxy 22 | await self.do_search() 23 | -------------------------------------------------------------------------------- /theHarvester/discovery/takeover.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | import re 3 | 4 | 5 | class TakeOver: 6 | 7 | def __init__(self, hosts): 8 | # NOTE THIS MODULE IS ACTIVE RECON 9 | self.hosts = hosts 10 | self.results = "" 11 | self.totalresults = "" 12 | self.proxy = False 13 | # Thank you to https://github.com/EdOverflow/can-i-take-over-xyz for these fingerprints 14 | self.fingerprints = {"'Trying to access your account?'": 'Campaign Monitor', 15 | '404 Not Found': 'Fly.io', 16 | '404 error unknown site!': 'Pantheon', 17 | 'Do you want to register *.wordpress.com?': 'Wordpress', 18 | 'Domain uses DO name serves with no records in DO.': 'Digital Ocean', 19 | "It looks like you may have taken a wrong turn somewhere. Don't worry...it happens to all of us.": 'LaunchRock', 20 | 'No Site For Domain': 'Kinsta', 21 | 'No settings were found for this company:': 'Help Scout', 22 | 'Project doesnt exist... yet!': 'Readme.io', 23 | 'Repository not found': 'Bitbucket', 24 | 'The feed has not been found.': 'Feedpress', 25 | 'No such app': 'Heroku', 26 | 'The specified bucket does not exist': 'AWS/S3', 27 | 'The thing you were looking for is no longer here, or never was': 'Ghost', 28 | "There isn't a Github Pages site here.": 'Github', 29 | 'This UserVoice subdomain is currently available!': 'UserVoice', 30 | "Uh oh. 
That page doesn't exist.": 'Intercom', 31 | "We could not find what you're looking for.": 'Help Juice', 32 | "Whatever you were looking for doesn't currently exist at this address": 'Tumblr', 33 | 'is not a registered InCloud YouTrack': 'JetBrains', 34 | 'page not found': 'Uptimerobot', 35 | 'project not found': 'Surge.sh'} 36 | 37 | async def check(self, url, resp): 38 | # Simple function that takes response and checks if any fingerprints exists 39 | # If a fingerprint exists figures out which one and prints it out 40 | regex = re.compile("(?=(" + "|".join(map(re.escape, list(self.fingerprints.keys()))) + "))") 41 | # Sanitize fingerprints 42 | matches = re.findall(regex, resp) 43 | for match in matches: 44 | print(f'\t\033[91m Takeover detected: {url}\033[1;32;40m') 45 | if match in self.fingerprints.keys(): 46 | # Sanity check as to not error out 47 | print(f'\t\033[91m Type of takeover is: {self.fingerprints[match]}\033[1;32;40m') 48 | 49 | async def do_take(self): 50 | try: 51 | if len(self.hosts) > 0: 52 | tup_resps: list = await AsyncFetcher.fetch_all(self.hosts, takeover=True, proxy=self.proxy) 53 | # Returns a list of tuples in this format: (url, response) 54 | tup_resps = [tup for tup in tup_resps if tup[1] != ''] 55 | # Filter out responses whose responses are empty strings (indicates errored) 56 | for url, resp in tup_resps: 57 | await self.check(url, resp) 58 | else: 59 | return 60 | except Exception as e: 61 | print(e) 62 | 63 | async def process(self, proxy=False): 64 | self.proxy = proxy 65 | await self.do_take() 66 | -------------------------------------------------------------------------------- /theHarvester/discovery/threatcrowd.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchThreatcrowd: 6 | 7 | def __init__(self, word): 8 | self.word = word.replace(' ', '%20') 9 | self.hostnames = list() 10 | self.ips = list() 11 | self.proxy = False 12 | 13 | async def do_search(self): 14 | base_url = f'https://www.threatcrowd.org/searchApi/v2/domain/report/?domain={self.word}' 15 | headers = {'User-Agent': Core.get_user_agent()} 16 | try: 17 | responses = await AsyncFetcher.fetch_all([base_url], headers=headers, proxy=self.proxy, json=True) 18 | resp = responses[0] 19 | self.ips = {ip['ip_address'] for ip in resp['resolutions'] if len(ip['ip_address']) > 4} 20 | self.hostnames = set(list(resp['subdomains'])) 21 | except Exception as e: 22 | print(e) 23 | 24 | async def get_ips(self) -> List: 25 | return self.ips 26 | 27 | async def get_hostnames(self) -> List: 28 | return self.hostnames 29 | 30 | async def process(self, proxy=False): 31 | self.proxy = proxy 32 | await self.do_search() 33 | await self.get_hostnames() 34 | -------------------------------------------------------------------------------- /theHarvester/discovery/threatminer.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchThreatminer: 6 | 7 | def __init__(self, word): 8 | self.word = word 9 | self.totalhosts = list 10 | self.totalips = list 11 | self.proxy = False 12 | 13 | async def do_search(self): 14 | url = f'https://api.threatminer.org/v2/domain.php?q={self.word}&rt=5' 15 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 16 | self.totalhosts: set = {host for host in response[0]['results']} 17 | second_url = 
f'https://api.threatminer.org/v2/domain.php?q={self.word}&rt=2' 18 | secondresp = await AsyncFetcher.fetch_all([second_url], json=True, proxy=self.proxy) 19 | try: 20 | self.totalips: set = {resp['ip'] for resp in secondresp[0]['results']} 21 | except TypeError: 22 | pass 23 | 24 | async def get_hostnames(self) -> Type[list]: 25 | return self.totalhosts 26 | 27 | async def get_ips(self) -> Type[list]: 28 | return self.totalips 29 | 30 | async def process(self, proxy=False): 31 | self.proxy = proxy 32 | await self.do_search() 33 | -------------------------------------------------------------------------------- /theHarvester/discovery/urlscan.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchUrlscan: 6 | def __init__(self, word): 7 | self.word = word 8 | self.totalhosts = list() 9 | self.totalips = list() 10 | self.interestingurls = list() 11 | self.totalasns = list() 12 | self.proxy = False 13 | 14 | async def do_search(self): 15 | url = f'https://urlscan.io/api/v1/search/?q=domain:{self.word}' 16 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 17 | resp = response[0] 18 | self.totalhosts = {f"{page['page']['domain']}" for page in resp['results']} 19 | self.totalips = {f"{page['page']['ip']}" for page in resp['results'] if 'ip' in page['page'].keys()} 20 | self.interestingurls = {f"{page['page']['url']}" for page in resp['results'] if self.word in page['page']['url'] and 'url' in page['page'].keys()} 21 | self.totalasns = {f"{page['page']['asn']}" for page in resp['results'] if 'asn' in page['page'].keys()} 22 | 23 | async def get_hostnames(self) -> List: 24 | return self.totalhosts 25 | 26 | async def get_ips(self) -> List: 27 | return self.totalips 28 | 29 | async def get_interestingurls(self) -> List: 30 | return self.interestingurls 31 | 32 | async def get_asns(self) -> List: 33 | return self.totalasns 34 | 35 | async def process(self, proxy=False): 36 | self.proxy = proxy 37 | await self.do_search() 38 | -------------------------------------------------------------------------------- /theHarvester/discovery/virustotal.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchVirustotal: 6 | 7 | def __init__(self, word): 8 | self.key = Core.virustotal_key() 9 | if self.key is None: 10 | raise MissingKey('virustotal') 11 | self.word = word 12 | self.proxy = False 13 | self.hostnames = [] 14 | 15 | async def do_search(self): 16 | # TODO determine if more endpoints can yield useful info given a domain 17 | # based on: https://developers.virustotal.com/reference/domains-relationships 18 | # base_url = "https://www.virustotal.com/api/v3/domains/domain/subdomains?limit=40" 19 | headers = { 20 | 'User-Agent': Core.get_user_agent(), 21 | "Accept": "application/json", 22 | "x-apikey": self.key 23 | } 24 | base_url = f"https://www.virustotal.com/api/v3/domains/{self.word}/subdomains?limit=40" 25 | cursor = '' 26 | count = 0 27 | fail_counter = 0 28 | counter = 0 29 | breakcon = False 30 | while True: 31 | if breakcon: 32 | break 33 | # rate limit is 4 per minute 34 | # TODO add timer logic if proven to be needed 35 | # in the meantime sleeping 16 seconds should eliminate hitting the rate limit 36 | # in case rate limit is hit, fail counter exists and sleep for 65 seconds 37 | send_url = base_url + 
"&cursor=" + cursor if cursor != '' and len(cursor) > 2 else base_url 38 | responses = await AsyncFetcher.fetch_all([send_url], headers=headers, proxy=self.proxy, json=True) 39 | jdata = responses[0] 40 | if 'data' not in jdata.keys(): 41 | await asyncio.sleep(60 + 5) 42 | fail_counter += 1 43 | if 'meta' in jdata.keys(): 44 | cursor = jdata['meta']['cursor'] if 'cursor' in jdata['meta'].keys() else '' 45 | if len(cursor) == 0 and 'data' in jdata.keys(): 46 | # if cursor no longer is within the meta field have hit last entry 47 | breakcon = True 48 | count += jdata['meta']['count'] 49 | if count == 0 or fail_counter >= 2: 50 | break 51 | if 'data' in jdata.keys(): 52 | data = jdata['data'] 53 | self.hostnames.extend(await self.parse_hostnames(data, self.word)) 54 | counter += 1 55 | await asyncio.sleep(16) 56 | self.hostnames = list(sorted(set(self.hostnames))) 57 | # verify domains such as x.x.com.multicdn.x.com are parsed properly 58 | self.hostnames = [host for host in self.hostnames if ((len(host.split('.')) >= 3) and host.split('.')[-2] == self.word.split('.')[-2])] 59 | 60 | async def get_hostnames(self) -> list: 61 | return self.hostnames 62 | 63 | @staticmethod 64 | async def parse_hostnames(data, word): 65 | total_subdomains = set() 66 | for attribute in data: 67 | total_subdomains.add(attribute['id'].replace('"', '').replace('www.', '')) 68 | attributes = attribute['attributes'] 69 | total_subdomains.update( 70 | {value['value'].replace('"', '').replace('www.', '') for value in attributes['last_dns_records'] if 71 | word in value['value']}) 72 | if 'last_https_certificate' in attributes.keys(): 73 | total_subdomains.update({value.replace('"', '').replace('www.', '') for value in 74 | attributes['last_https_certificate']['extensions']['subject_alternative_name'] 75 | if word in value}) 76 | total_subdomains = list(sorted(total_subdomains)) 77 | # Other false positives may occur over time and yes there are other ways to parse this, feel free to implement 78 | # them and submit a PR or raise an issue if you run into this filtering not being enough 79 | # TODO determine if parsing 'v=spf1 include:_spf-x.acme.com include:_spf-x.acme.com' is worth parsing 80 | total_subdomains = [x for x in total_subdomains if not str(x).endswith('edgekey.net') and not str(x).endswith('akadns.net') and 'include:_spf' not in str(x)] 81 | total_subdomains.sort() 82 | return total_subdomains 83 | 84 | async def process(self, proxy=False): 85 | self.proxy = proxy 86 | await self.do_search() 87 | -------------------------------------------------------------------------------- /theHarvester/discovery/yahoosearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | from theHarvester.parsers import myparser 3 | 4 | 5 | class SearchYahoo: 6 | 7 | def __init__(self, word, limit): 8 | self.word = word 9 | self.total_results = "" 10 | self.server = 'search.yahoo.com' 11 | self.limit = limit 12 | self.proxy = False 13 | 14 | async def do_search(self): 15 | base_url = f'https://{self.server}/search?p=%40{self.word}&b=xx&pz=10' 16 | headers = { 17 | 'Host': self.server, 18 | 'User-agent': Core.get_user_agent() 19 | } 20 | urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit] 21 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 22 | for response in responses: 23 | self.total_results += response 24 | 25 | async def process(self): 26 | await self.do_search() 27 | 28 | 
async def get_emails(self): 29 | rawres = myparser.Parser(self.total_results, self.word) 30 | toparse_emails = await rawres.emails() 31 | emails = set() 32 | # strip out numbers and dashes for emails that look like xxx-xxx-xxxemail@host.tld 33 | for email in toparse_emails: 34 | email = str(email) 35 | if '-' in email and email[0].isdigit() and email.index('-') <= 9: 36 | while email[0] == '-' or email[0].isdigit(): 37 | email = email[1:] 38 | emails.add(email) 39 | return list(emails) 40 | 41 | async def get_hostnames(self, proxy=False): 42 | self.proxy = proxy 43 | rawres = myparser.Parser(self.total_results, self.word) 44 | return await rawres.hostnames() 45 | -------------------------------------------------------------------------------- /theHarvester/discovery/zoomeyesearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | from theHarvester.parsers import myparser 4 | import asyncio 5 | import re 6 | 7 | 8 | class SearchZoomEye: 9 | 10 | def __init__(self, word, limit): 11 | self.word = word 12 | self.limit = limit 13 | self.key = Core.zoomeye_key() 14 | # NOTE for ZoomEye you get a system recharge on the 1st of every month 15 | # Which resets your balance to 10000 requests 16 | # If you wish to extract as many subdomains as possible visit the fetch_subdomains 17 | # To see how 18 | if self.key is None: 19 | raise MissingKey('zoomeye') 20 | self.baseurl = 'https://api.zoomeye.org/host/search' 21 | self.proxy = False 22 | self.totalasns = list() 23 | self.totalhosts = list() 24 | self.interestingurls = list() 25 | self.totalips = list() 26 | self.totalemails = list() 27 | # Regex used is directly from: https://github.com/GerbenJavado/LinkFinder/blob/master/linkfinder.py#L29 28 | # Maybe one day it will be a pip package 29 | # Regardless LinkFinder is an amazing tool! 30 | self.iurl_regex = r""" 31 | (?:"|') # Start newline delimiter 32 | ( 33 | ((?:[a-zA-Z]{1,10}://|//) # Match a scheme [a-Z]*1-10 or // 34 | [^"'/]{1,}\. # Match a domainname (any character + dot) 35 | [a-zA-Z]{2,}[^"']{0,}) # The domainextension and/or path 36 | | 37 | ((?:/|\.\./|\./) # Start with /,../,./ 38 | [^"'><,;| *()(%%$^/\\\[\]] # Next character can't be... 39 | [^"'><,;|()]{1,}) # Rest of the characters can't be 40 | | 41 | ([a-zA-Z0-9_\-/]{1,}/ # Relative endpoint with / 42 | [a-zA-Z0-9_\-/]{1,} # Resource name 43 | \.(?:[a-zA-Z]{1,4}|action) # Rest + extension (length 1-4 or action) 44 | (?:[\?|#][^"|']{0,}|)) # ? or # mark with parameters 45 | | 46 | ([a-zA-Z0-9_\-/]{1,}/ # REST API (no extension) with / 47 | [a-zA-Z0-9_\-/]{3,} # Proper REST endpoints usually have 3+ chars 48 | (?:[\?|#][^"|']{0,}|)) # ? or # mark with parameters 49 | | 50 | ([a-zA-Z0-9_\-]{1,} # filename 51 | \.(?:php|asp|aspx|jsp|json| 52 | action|html|js|txt|xml) # . + extension 53 | (?:[\?|#][^"|']{0,}|)) # ? 
or # mark with parameters 54 | ) 55 | (?:"|') # End newline delimiter 56 | """ 57 | self.iurl_regex = re.compile(self.iurl_regex, re.VERBOSE) 58 | 59 | async def fetch_subdomains(self): 60 | # Based on docs from: https://www.zoomeye.org/doc#search-sub-domain-ip 61 | headers = { 62 | 'API-KEY': self.key, 63 | 'User-Agent': Core.get_user_agent() 64 | } 65 | 66 | subdomain_search_endpoint = f'https://api.zoomeye.org/domain/search?q={self.word}&type=0&' 67 | 68 | response = await AsyncFetcher.fetch_all([subdomain_search_endpoint + 'page=1'], 69 | json=True, proxy=self.proxy, headers=headers) 70 | # Make initial request to determine total number of subdomains 71 | resp = response[0] 72 | if resp['status'] != 200: 73 | return 74 | total = resp['total'] 75 | # max number of results per request seems to be 30 76 | # NOTE: If you wish to get as many subdomains as possible 77 | # Change the line below to: 78 | # self.limit = (total // 30) + 1 79 | self.limit = self.limit if total > self.limit else (total // 30) + 1 80 | self.totalhosts.extend([item["name"] for item in resp["list"]]) 81 | for i in range(2, self.limit): 82 | response = await AsyncFetcher.fetch_all([subdomain_search_endpoint + f'page={i}'], 83 | json=True, proxy=self.proxy, headers=headers) 84 | resp = response[0] 85 | if resp['status'] != 200: 86 | return 87 | found_subdomains = [item["name"] for item in resp["list"]] 88 | if len(found_subdomains) == 0: 89 | break 90 | self.totalhosts.extend(found_subdomains) 91 | if i % 10 == 0: 92 | await asyncio.sleep(get_delay() + 1) 93 | 94 | async def do_search(self): 95 | headers = { 96 | 'API-KEY': self.key, 97 | 'User-Agent': Core.get_user_agent() 98 | } 99 | # Fetch subdomains first 100 | await self.fetch_subdomains() 101 | params = ( 102 | ('query', f'site:{self.word}'), 103 | ('page', '1'), 104 | ) 105 | response = await AsyncFetcher.fetch_all([self.baseurl], json=True, proxy=self.proxy, headers=headers, 106 | params=params) 107 | # First request determines how many pages there in total 108 | resp = response[0] 109 | total_pages = int(resp['available']) 110 | self.limit = self.limit if total_pages > self.limit else total_pages 111 | self.limit = 3 if self.limit == 2 else self.limit 112 | cur_page = 2 if self.limit >= 2 else -1 113 | # Means there is only one page 114 | # hostnames, emails, ips, asns, iurls 115 | nomatches_counter = 0 116 | # cur_page = -1 117 | if cur_page == -1: 118 | # No need to do loop just parse and leave 119 | if 'matches' in resp.keys(): 120 | hostnames, emails, ips, asns, iurls = await self.parse_matches(resp['matches']) 121 | self.totalhosts.extend(hostnames) 122 | self.totalemails.extend(emails) 123 | self.totalips.extend(ips) 124 | self.totalasns.extend(asns) 125 | self.interestingurls.extend(iurls) 126 | else: 127 | if 'matches' in resp.keys(): 128 | # Parse out initial results and then continue to loop 129 | hostnames, emails, ips, asns, iurls = await self.parse_matches(resp['matches']) 130 | self.totalhosts.extend(hostnames) 131 | self.totalemails.extend(emails) 132 | self.totalips.extend(ips) 133 | self.totalasns.extend(asns) 134 | self.interestingurls.extend(iurls) 135 | 136 | for num in range(2, self.limit): 137 | # print(f'Currently on page: {num}') 138 | params = ( 139 | ('query', f'site:{self.word}'), 140 | ('page', f'{num}'), 141 | ) 142 | response = await AsyncFetcher.fetch_all([self.baseurl], json=True, proxy=self.proxy, headers=headers, 143 | params=params) 144 | resp = response[0] 145 | if 'matches' not in resp.keys(): 146 | print(f'Your resp: 
{resp}') 147 | print('Match not found in keys') 148 | break 149 | 150 | hostnames, emails, ips, asns, iurls = await self.parse_matches(resp['matches']) 151 | 152 | if len(hostnames) == 0 and len(emails) == 0 and len(ips) == 0 \ 153 | and len(asns) == 0 and len(iurls) == 0: 154 | nomatches_counter += 1 155 | 156 | if nomatches_counter >= 5: 157 | break 158 | 159 | self.totalhosts.extend(hostnames) 160 | self.totalemails.extend(emails) 161 | self.totalips.extend(ips) 162 | self.totalasns.extend(asns) 163 | self.interestingurls.extend(iurls) 164 | 165 | if num % 10 == 0: 166 | await asyncio.sleep(get_delay() + 1) 167 | 168 | async def parse_matches(self, matches): 169 | # Helper function to parse items from match json 170 | # ips = {match["ip"] for match in matches} 171 | ips = set() 172 | iurls = set() 173 | hostnames = set() 174 | asns = set() 175 | emails = set() 176 | for match in matches: 177 | try: 178 | ips.add(match['ip']) 179 | 180 | if 'geoinfo' in match.keys(): 181 | asns.add(int(match['geoinfo']['asn'])) 182 | 183 | if 'rdns_new' in match.keys(): 184 | rdns_new = match['rdns_new'] 185 | 186 | if ',' in rdns_new: 187 | parts = str(rdns_new).split(',') 188 | rdns_new = parts[0] 189 | if len(parts) == 2: 190 | hostnames.add(parts[1]) 191 | rdns_new = rdns_new[:-1] if rdns_new[-1] == '.' else rdns_new 192 | hostnames.add(rdns_new) 193 | else: 194 | rdns_new = rdns_new[:-1] if rdns_new[-1] == '.' else rdns_new 195 | hostnames.add(rdns_new) 196 | 197 | if 'rdns' in match.keys(): 198 | rdns = match['rdns'] 199 | rdns = rdns[:-1] if rdns[-1] == '.' else rdns 200 | hostnames.add(rdns) 201 | 202 | if 'portinfo' in match.keys(): 203 | # re. 204 | temp_emails = set(await self.parse_emails(match['portinfo']['banner'])) 205 | emails.update(temp_emails) 206 | hostnames.update(set(await self.parse_hostnames(match['portinfo']['banner']))) 207 | iurls = {str(iurl.group(1)).replace('"', '') for iurl 208 | in re.finditer(self.iurl_regex, match['portinfo']['banner']) 209 | if self.word in str(iurl.group(1))} 210 | except Exception as e: 211 | print(f'An exception has occurred: {e}') 212 | return hostnames, emails, ips, asns, iurls 213 | 214 | async def process(self, proxy=False): 215 | self.proxy = proxy 216 | await self.do_search() # Only need to do it once. 
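    # The get_* coroutines below return de-duplicated views of the lists populated
    # by fetch_subdomains() and the paged host search performed in do_search().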
217 | 218 | async def parse_emails(self, content): 219 | rawres = myparser.Parser(content, self.word) 220 | return await rawres.emails() 221 | 222 | async def parse_hostnames(self, content): 223 | rawres = myparser.Parser(content, self.word) 224 | return await rawres.hostnames() 225 | 226 | async def get_hostnames(self): 227 | return set(self.totalhosts) 228 | 229 | async def get_emails(self): 230 | return set(self.totalemails) 231 | 232 | async def get_ips(self): 233 | return set(self.totalips) 234 | 235 | async def get_asns(self): 236 | return set(self.totalasns) 237 | 238 | async def get_interestingurls(self): 239 | return set(self.interestingurls) 240 | -------------------------------------------------------------------------------- /theHarvester/lib/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['hostchecker'] 2 | -------------------------------------------------------------------------------- /theHarvester/lib/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/theHarvester/lib/api/__init__.py -------------------------------------------------------------------------------- /theHarvester/lib/api/api.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import List 3 | import os 4 | from fastapi import FastAPI, Header, Query, Request 5 | from fastapi.responses import HTMLResponse, UJSONResponse 6 | from slowapi import Limiter, _rate_limit_exceeded_handler 7 | from slowapi.errors import RateLimitExceeded 8 | from slowapi.util import get_remote_address 9 | from starlette.responses import RedirectResponse 10 | from starlette.staticfiles import StaticFiles 11 | 12 | from theHarvester import __main__ 13 | 14 | limiter = Limiter(key_func=get_remote_address) 15 | app = FastAPI(title='Restful Harvest', description='Rest API for theHarvester powered by FastAPI', version='0.0.2') 16 | app.state.limiter = limiter 17 | app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler) 18 | 19 | # This is where we will host files that arise if the user specifies a filename 20 | try: 21 | app.mount('/static', StaticFiles(directory='theHarvester/lib/api/static/'), name='static') 22 | except RuntimeError: 23 | static_path = os.path.expanduser('~/.local/share/theHarvester/static/') 24 | if not os.path.isdir(static_path): 25 | os.makedirs(static_path) 26 | app.mount('/static', StaticFiles(directory='~/.local/share/theHarvester/static/'), name='static') 27 | 28 | 29 | @app.get('/', response_class=HTMLResponse) 30 | async def root(*, user_agent: str = Header(None)): 31 | # very basic user agent filtering 32 | if user_agent and ('gobuster' in user_agent or 'sqlmap' in user_agent or 'rustbuster' in user_agent): 33 | response = RedirectResponse(app.url_path_for('bot')) 34 | return response 35 | 36 | html = """ 37 | 38 | 39 | 40 | theHarvester API 41 | 47 | 48 | 49 |
50 | 51 | 52 | theHarvester logo 53 | 54 | 55 | 56 | 57 | """ 58 | return html 59 | 60 | 61 | @app.get('/nicebot') 62 | async def bot(): 63 | # nice bot 64 | string = {'bot': 'These are not the droids you are looking for'} 65 | return string 66 | 67 | 68 | @app.get('/sources', response_class=UJSONResponse) 69 | @limiter.limit('5/minute') 70 | async def getsources(request: Request): 71 | # Endpoint for user to query for available sources theHarvester supports 72 | # Rate limit of 5 requests per minute 73 | sources = __main__.Core.get_supportedengines() 74 | return {'sources': sources} 75 | 76 | 77 | @app.get('/dnsbrute', response_class=UJSONResponse) 78 | @limiter.limit('5/minute') 79 | async def dnsbrute(request: Request, user_agent: str = Header(None), 80 | domain: str = Query(..., description='Domain to be brute forced')): 81 | # Endpoint for user to signal to do DNS brute forcing 82 | # Rate limit of 5 requests per minute 83 | # basic user agent filtering 84 | if user_agent and ('gobuster' in user_agent or 'sqlmap' in user_agent or 'rustbuster' in user_agent): 85 | response = RedirectResponse(app.url_path_for('bot')) 86 | return response 87 | dns_bruteforce = await __main__.start(argparse.Namespace(dns_brute=True, 88 | dns_lookup=False, 89 | dns_server=False, 90 | dns_tld=False, 91 | domain=domain, 92 | filename='', 93 | google_dork=False, 94 | limit=500, 95 | proxies=False, 96 | shodan=False, 97 | source=','.join([]), 98 | start=0, 99 | take_over=False, 100 | virtual_host=False)) 101 | return {'dns_bruteforce': dns_bruteforce} 102 | 103 | 104 | @app.get('/query', response_class=UJSONResponse) 105 | @limiter.limit('2/minute') 106 | async def query(request: Request, dns_server: str = Query(""), user_agent: str = Header(None), 107 | dns_brute: bool = Query(False), 108 | dns_lookup: bool = Query(False), 109 | dns_tld: bool = Query(False), 110 | filename: str = Query(""), 111 | google_dork: bool = Query(False), proxies: bool = Query(False), shodan: bool = Query(False), 112 | take_over: bool = Query(False), virtual_host: bool = Query(False), 113 | source: List[str] = Query(..., description='Data sources to query comma separated with no space'), 114 | limit: int = Query(500), start: int = Query(0), 115 | domain: str = Query(..., description='Domain to be harvested')): 116 | 117 | # Query function that allows user to query theHarvester rest API 118 | # Rate limit of 2 requests per minute 119 | # basic user agent filtering 120 | if user_agent and ('gobuster' in user_agent or 'sqlmap' in user_agent or 'rustbuster' in user_agent): 121 | response = RedirectResponse(app.url_path_for('bot')) 122 | return response 123 | try: 124 | asns, iurls, twitter_people_list, \ 125 | linkedin_people_list, linkedin_links, \ 126 | aurls, aips, aemails, ahosts = await __main__.start(argparse.Namespace(dns_brute=dns_brute, 127 | dns_lookup=dns_lookup, 128 | dns_server=dns_server, 129 | dns_tld=dns_tld, 130 | domain=domain, 131 | filename=filename, 132 | google_dork=google_dork, 133 | limit=limit, 134 | proxies=proxies, 135 | shodan=shodan, 136 | source=','.join(source), 137 | start=start, 138 | take_over=take_over, 139 | virtual_host=virtual_host)) 140 | 141 | return {'asns': asns, 'interesting_urls': iurls, 142 | 'twitter_people': twitter_people_list, 143 | 'linkedin_people': linkedin_people_list, 144 | 'linkedin_links': linkedin_links, 145 | 'trello_urls': aurls, 146 | 'ips': aips, 147 | 'emails': aemails, 148 | 'hosts': ahosts} 149 | except Exception: 150 | return {'exception': 'Please contact the server 
administrator to check the issue'} 151 | -------------------------------------------------------------------------------- /theHarvester/lib/api/api_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example script to query theHarvester rest API, obtain results, and write out to stdout as well as an html 3 | """ 4 | 5 | import asyncio 6 | import aiohttp 7 | import netaddr 8 | 9 | 10 | async def fetch_json(session, url): 11 | async with session.get(url) as response: 12 | return await response.json() 13 | 14 | 15 | async def fetch(session, url): 16 | async with session.get(url) as response: 17 | return await response.text() 18 | 19 | 20 | async def main(): 21 | """ 22 | Just a simple example of how to interact with the rest api 23 | you can easily use requests instead of aiohttp or whatever you best see fit 24 | """ 25 | url = "http://127.0.0.1:5000" 26 | domain = "netflix.com" 27 | query_url = f'{url}/query?limit=300&source=bing,baidu,duckduckgo,dogpile&domain={domain}' 28 | async with aiohttp.ClientSession() as session: 29 | fetched_json = await fetch_json(session, query_url) 30 | total_asns = fetched_json['asns'] 31 | interesting_urls = fetched_json['interesting_urls'] 32 | twitter_people_list_tracker = fetched_json['twitter_people'] 33 | linkedin_people_list_tracker = fetched_json['linkedin_people'] 34 | linkedin_links_tracker = fetched_json['linkedin_links'] 35 | trello_urls = fetched_json['trello_urls'] 36 | ips = fetched_json['ips'] 37 | emails = fetched_json['emails'] 38 | hosts = fetched_json['hosts'] 39 | 40 | if len(total_asns) > 0: 41 | print(f'\n[*] ASNS found: {len(total_asns)}') 42 | print('--------------------') 43 | total_asns = list(sorted(set(total_asns))) 44 | for asn in total_asns: 45 | print(asn) 46 | 47 | if len(interesting_urls) > 0: 48 | print(f'\n[*] Interesting Urls found: {len(interesting_urls)}') 49 | print('--------------------') 50 | interesting_urls = list(sorted(set(interesting_urls))) 51 | for iurl in interesting_urls: 52 | print(iurl) 53 | 54 | if len(twitter_people_list_tracker) == 0: 55 | print('\n[*] No Twitter users found.\n\n') 56 | else: 57 | if len(twitter_people_list_tracker) >= 1: 58 | print('\n[*] Twitter Users found: ' + str(len(twitter_people_list_tracker))) 59 | print('---------------------') 60 | twitter_people_list_tracker = list(sorted(set(twitter_people_list_tracker))) 61 | for usr in twitter_people_list_tracker: 62 | print(usr) 63 | 64 | if len(linkedin_people_list_tracker) == 0: 65 | print('\n[*] No LinkedIn users found.\n\n') 66 | else: 67 | if len(linkedin_people_list_tracker) >= 1: 68 | print('\n[*] LinkedIn Users found: ' + str(len(linkedin_people_list_tracker))) 69 | print('---------------------') 70 | linkedin_people_list_tracker = list(sorted(set(linkedin_people_list_tracker))) 71 | for usr in linkedin_people_list_tracker: 72 | print(usr) 73 | 74 | if len(linkedin_links_tracker) == 0: 75 | print(f'\n[*] LinkedIn Links found: {len(linkedin_links_tracker)}') 76 | linkedin_links_tracker = list(sorted(set(linkedin_links_tracker))) 77 | print('---------------------') 78 | for link in linkedin_links_tracker: 79 | print(link) 80 | 81 | length_urls = len(trello_urls) 82 | total = length_urls 83 | print('\n[*] Trello URLs found: ' + str(total)) 84 | print('--------------------') 85 | all_urls = list(sorted(set(trello_urls))) 86 | for url in sorted(all_urls): 87 | print(url) 88 | 89 | if len(ips) == 0: 90 | print('\n[*] No IPs found.') 91 | else: 92 | print('\n[*] IPs found: ' + 
str(len(ips))) 93 | print('-------------------') 94 | # use netaddr as the list may contain ipv4 and ipv6 addresses 95 | ip_list = sorted([netaddr.IPAddress(ip.strip()) for ip in set(ips)]) 96 | print('\n'.join(map(str, ip_list))) 97 | 98 | if len(emails) == 0: 99 | print('\n[*] No emails found.') 100 | else: 101 | print('\n[*] Emails found: ' + str(len(emails))) 102 | print('----------------------') 103 | all_emails = sorted(list(set(emails))) 104 | print(('\n'.join(all_emails))) 105 | 106 | if len(hosts) == 0: 107 | print('\n[*] No hosts found.\n\n') 108 | else: 109 | print('\n[*] Hosts found: ' + str(len(hosts))) 110 | print('---------------------') 111 | print('\n'.join(hosts)) 112 | 113 | 114 | if __name__ == '__main__': 115 | asyncio.run(main()) 116 | -------------------------------------------------------------------------------- /theHarvester/lib/api/static/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/theHarvester/lib/api/static/.gitkeep -------------------------------------------------------------------------------- /theHarvester/lib/hostchecker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Created by laramies on 2008-08-21. 5 | Revised to use aiodns & asyncio on 2019-09-23 6 | """ 7 | 8 | import aiodns 9 | import asyncio 10 | import socket 11 | from typing import Tuple, Any 12 | 13 | 14 | class Checker: 15 | 16 | def __init__(self, hosts: list, nameserver=False): 17 | self.hosts = hosts 18 | self.realhosts: list = [] 19 | self.addresses: set = set() 20 | self.nameserver = [] 21 | if nameserver: 22 | self.nameserver = nameserver 23 | 24 | @staticmethod 25 | async def query(host, resolver) -> Tuple[str, Any]: 26 | try: 27 | result = await resolver.gethostbyname(host, socket.AF_INET) 28 | addresses = result.addresses 29 | if addresses == [] or addresses is None or result is None: 30 | return f"{host}:", tuple() 31 | else: 32 | return f"{host}:{', '.join(map(str, addresses))}", addresses 33 | except Exception: 34 | return f"{host}", tuple() 35 | 36 | async def query_all(self, resolver) -> list: 37 | results = await asyncio.gather(*[asyncio.create_task(self.query(host, resolver)) 38 | for host in self.hosts]) 39 | return results 40 | 41 | async def check(self): 42 | loop = asyncio.get_event_loop() 43 | resolver = aiodns.DNSResolver(loop=loop, timeout=4) if len(self.nameserver) == 0\ 44 | else aiodns.DNSResolver(loop=loop, timeout=4, nameservers=self.nameserver) 45 | results = await self.query_all(resolver) 46 | for host, address in results: 47 | self.realhosts.append(host) 48 | self.addresses.update({addr for addr in address}) 49 | # address may be a list of ips 50 | # and do a set comprehension to remove duplicates 51 | self.realhosts.sort() 52 | self.addresses = list(self.addresses) 53 | return self.realhosts, self.addresses 54 | -------------------------------------------------------------------------------- /theHarvester/lib/stash.py: -------------------------------------------------------------------------------- 1 | import aiosqlite 2 | import datetime 3 | import os 4 | 5 | db_path = os.path.expanduser('~/.local/share/theHarvester') 6 | 7 | if not os.path.isdir(db_path): 8 | os.makedirs(db_path) 9 | 10 | 11 | class StashManager: 12 | 13 | def __init__(self): 14 | self.db = os.path.join(db_path, 'stash.sqlite') 15 | self.results = "" 16 | 
self.totalresults = "" 17 | self.latestscandomain = {} 18 | self.domainscanhistory = [] 19 | self.scanboarddata = {} 20 | self.scanstats = [] 21 | self.latestscanresults = [] 22 | self.previousscanresults = [] 23 | 24 | async def do_init(self): 25 | async with aiosqlite.connect(self.db) as db: 26 | await db.execute( 27 | 'CREATE TABLE IF NOT EXISTS results (domain text, resource text, type text, find_date date, source text)') 28 | await db.commit() 29 | 30 | async def store(self, domain, resource, res_type, source): 31 | self.domain = domain 32 | self.resource = resource 33 | self.type = res_type 34 | self.source = source 35 | self.date = datetime.date.today() 36 | try: 37 | async with aiosqlite.connect(self.db, timeout=30) as db: 38 | await db.execute('INSERT INTO results (domain,resource, type, find_date, source) VALUES (?,?,?,?,?)', 39 | (self.domain, self.resource, self.type, self.date, self.source)) 40 | await db.commit() 41 | except Exception as e: 42 | print(e) 43 | 44 | async def store_all(self, domain, all, res_type, source): 45 | self.domain = domain 46 | self.all = all 47 | self.type = res_type 48 | self.source = source 49 | self.date = datetime.date.today() 50 | master_list = [(self.domain, x, self.type, self.date, self.source) for x in self.all] 51 | async with aiosqlite.connect(self.db, timeout=30) as db: 52 | try: 53 | await db.executemany('INSERT INTO results (domain,resource, type, find_date, source) VALUES (?,?,?,?,?)', 54 | master_list) 55 | await db.commit() 56 | except Exception as e: 57 | print(e) 58 | 59 | async def generatedashboardcode(self, domain): 60 | try: 61 | # TODO refactor into generic method 62 | self.latestscandomain["domain"] = domain 63 | async with aiosqlite.connect(self.db, timeout=30) as conn: 64 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="host"''', 65 | (domain,)) 66 | data = await cursor.fetchone() 67 | self.latestscandomain["host"] = data[0] 68 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="email"''', 69 | (domain,)) 70 | data = await cursor.fetchone() 71 | self.latestscandomain["email"] = data[0] 72 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="ip"''', (domain,)) 73 | data = await cursor.fetchone() 74 | self.latestscandomain["ip"] = data[0] 75 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="vhost"''', 76 | (domain,)) 77 | data = await cursor.fetchone() 78 | self.latestscandomain["vhost"] = data[0] 79 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="shodan"''', 80 | (domain,)) 81 | data = await cursor.fetchone() 82 | self.latestscandomain["shodan"] = data[0] 83 | cursor = await conn.execute('''SELECT MAX(find_date) FROM results WHERE domain=?''', (domain,)) 84 | data = await cursor.fetchone() 85 | self.latestscandomain["latestdate"] = data[0] 86 | latestdate = data[0] 87 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="host"''', 88 | (domain, latestdate,)) 89 | scandetailshost = await cursor.fetchall() 90 | self.latestscandomain["scandetailshost"] = scandetailshost 91 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="email"''', 92 | (domain, latestdate,)) 93 | scandetailsemail = await cursor.fetchall() 94 | self.latestscandomain["scandetailsemail"] = scandetailsemail 95 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? 
AND type="ip"''', 96 | (domain, latestdate,)) 97 | scandetailsip = await cursor.fetchall() 98 | self.latestscandomain["scandetailsip"] = scandetailsip 99 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="vhost"''', 100 | (domain, latestdate,)) 101 | scandetailsvhost = await cursor.fetchall() 102 | self.latestscandomain["scandetailsvhost"] = scandetailsvhost 103 | cursor = await conn.execute( 104 | '''SELECT * FROM results WHERE domain=? AND find_date=? AND type="shodan"''', 105 | (domain, latestdate,)) 106 | scandetailsshodan = await cursor.fetchall() 107 | self.latestscandomain["scandetailsshodan"] = scandetailsshodan 108 | return self.latestscandomain 109 | except Exception as e: 110 | print(e) 111 | 112 | async def getlatestscanresults(self, domain, previousday=False): 113 | try: 114 | async with aiosqlite.connect(self.db, timeout=30) as conn: 115 | if previousday: 116 | try: 117 | cursor = await conn.execute(''' 118 | SELECT DISTINCT(find_date) 119 | FROM results 120 | WHERE find_date=date('now', '-1 day') and domain=?''', (domain,)) 121 | previousscandate = await cursor.fetchone() 122 | if not previousscandate: # When theHarvester runs first time/day this query will return. 123 | self.previousscanresults = ["No results", "No results", "No results", "No results", 124 | "No results"] 125 | else: 126 | cursor = await conn.execute(''' 127 | SELECT find_date, domain, source, type, resource 128 | FROM results 129 | WHERE find_date=? and domain=? 130 | ORDER BY source,type 131 | ''', (previousscandate[0], domain,)) 132 | results = await cursor.fetchall() 133 | self.previousscanresults = results 134 | return self.previousscanresults 135 | except Exception as e: 136 | print(f'Error in getting the previous scan results from the database: {e}') 137 | else: 138 | try: 139 | cursor = await conn.execute('''SELECT MAX(find_date) FROM results WHERE domain=?''', (domain,)) 140 | latestscandate = await cursor.fetchone() 141 | cursor = await conn.execute(''' 142 | SELECT find_date, domain, source, type, resource 143 | FROM results 144 | WHERE find_date=? and domain=? 
145 | ORDER BY source,type 146 | ''', (latestscandate[0], domain,)) 147 | results = await cursor.fetchall() 148 | self.latestscanresults = results 149 | return self.latestscanresults 150 | except Exception as e: 151 | print(f'Error in getting the latest scan results from the database: {e}') 152 | except Exception as e: 153 | print(f'Error connecting to theHarvester database: {e}') 154 | 155 | async def getscanboarddata(self): 156 | try: 157 | async with aiosqlite.connect(self.db, timeout=30) as conn: 158 | 159 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE type="host"''') 160 | data = await cursor.fetchone() 161 | self.scanboarddata["host"] = data[0] 162 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE type="email"''') 163 | data = await cursor.fetchone() 164 | self.scanboarddata["email"] = data[0] 165 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE type="ip"''') 166 | data = await cursor.fetchone() 167 | self.scanboarddata["ip"] = data[0] 168 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE type="vhost"''') 169 | data = await cursor.fetchone() 170 | self.scanboarddata["vhost"] = data[0] 171 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE type="shodan"''') 172 | data = await cursor.fetchone() 173 | self.scanboarddata["shodan"] = data[0] 174 | cursor = await conn.execute('''SELECT COUNT(DISTINCT(domain)) FROM results ''') 175 | data = await cursor.fetchone() 176 | self.scanboarddata["domains"] = data[0] 177 | return self.scanboarddata 178 | except Exception as e: 179 | print(e) 180 | 181 | async def getscanhistorydomain(self, domain): 182 | try: 183 | async with aiosqlite.connect(self.db, timeout=30) as conn: 184 | cursor = await conn.execute('''SELECT DISTINCT(find_date) FROM results WHERE domain=?''', (domain,)) 185 | dates = await cursor.fetchall() 186 | for date in dates: 187 | cursor = await conn.execute( 188 | '''SELECT COUNT(*) from results WHERE domain=? AND type="host" AND find_date=?''', 189 | (domain, date[0])) 190 | counthost = await cursor.fetchone() 191 | cursor = await conn.execute( 192 | '''SELECT COUNT(*) from results WHERE domain=? AND type="email" AND find_date=?''', 193 | (domain, date[0])) 194 | countemail = await cursor.fetchone() 195 | cursor = await conn.execute( 196 | '''SELECT COUNT(*) from results WHERE domain=? AND type="ip" AND find_date=?''', 197 | (domain, date[0])) 198 | countip = await cursor.fetchone() 199 | cursor = await conn.execute( 200 | '''SELECT COUNT(*) from results WHERE domain=? AND type="vhost" AND find_date=?''', 201 | (domain, date[0])) 202 | countvhost = await cursor.fetchone() 203 | cursor = await conn.execute( 204 | '''SELECT COUNT(*) from results WHERE domain=? 
AND type="shodan" AND find_date=?''', 205 | (domain, date[0])) 206 | countshodan = await cursor.fetchone() 207 | results = { 208 | "date": str(date[0]), 209 | "hosts": str(counthost[0]), 210 | "email": str(countemail[0]), 211 | "ip": str(countip[0]), 212 | "vhost": str(countvhost[0]), 213 | "shodan": str(countshodan[0]) 214 | } 215 | self.domainscanhistory.append(results) 216 | return self.domainscanhistory 217 | except Exception as e: 218 | print(e) 219 | 220 | async def getpluginscanstatistics(self): 221 | try: 222 | async with aiosqlite.connect(self.db, timeout=30) as conn: 223 | cursor = await conn.execute(''' 224 | SELECT domain,find_date, type, source, count(*) 225 | FROM results 226 | GROUP BY domain, find_date, type, source 227 | ''') 228 | results = await cursor.fetchall() 229 | self.scanstats = results 230 | return self.scanstats 231 | except Exception as e: 232 | print(e) 233 | 234 | async def latestscanchartdata(self, domain): 235 | try: 236 | async with aiosqlite.connect(self.db, timeout=30) as conn: 237 | self.latestscandomain["domain"] = domain 238 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="host"''', (domain,)) 239 | data = await cursor.fetchone() 240 | self.latestscandomain["host"] = data[0] 241 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="email"''', (domain,)) 242 | data = await cursor.fetchone() 243 | self.latestscandomain["email"] = data[0] 244 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="ip"''', (domain,)) 245 | data = await cursor.fetchone() 246 | self.latestscandomain["ip"] = data[0] 247 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="vhost"''', (domain,)) 248 | data = await cursor.fetchone() 249 | self.latestscandomain["vhost"] = data[0] 250 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="shodan"''', (domain,)) 251 | data = await cursor.fetchone() 252 | self.latestscandomain["shodan"] = data[0] 253 | cursor = await conn.execute('''SELECT MAX(find_date) FROM results WHERE domain=?''', (domain,)) 254 | data = await cursor.fetchone() 255 | self.latestscandomain["latestdate"] = data[0] 256 | latestdate = data[0] 257 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="host"''', (domain, latestdate,)) 258 | scandetailshost = await cursor.fetchall() 259 | self.latestscandomain["scandetailshost"] = scandetailshost 260 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="email"''', (domain, latestdate,)) 261 | scandetailsemail = await cursor.fetchall() 262 | self.latestscandomain["scandetailsemail"] = scandetailsemail 263 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="ip"''', (domain, latestdate,)) 264 | scandetailsip = await cursor.fetchall() 265 | self.latestscandomain["scandetailsip"] = scandetailsip 266 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="vhost"''', (domain, latestdate,)) 267 | scandetailsvhost = await cursor.fetchall() 268 | self.latestscandomain["scandetailsvhost"] = scandetailsvhost 269 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? 
AND type="shodan"''', (domain, latestdate,)) 270 | scandetailsshodan = await cursor.fetchall() 271 | self.latestscandomain["scandetailsshodan"] = scandetailsshodan 272 | return self.latestscandomain 273 | except Exception as e: 274 | print(e) 275 | -------------------------------------------------------------------------------- /theHarvester/parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/theHarvester/parsers/__init__.py -------------------------------------------------------------------------------- /theHarvester/parsers/intelxparser.py: -------------------------------------------------------------------------------- 1 | class Parser: 2 | 3 | def __init__(self): 4 | self.emails = set() 5 | self.hosts = set() 6 | 7 | async def parse_dictionaries(self, results: dict) -> tuple: 8 | """ 9 | Parse method to parse json results 10 | :param results: Dictionary containing a list of dictionaries known as selectors 11 | :return: tuple of emails and hosts 12 | """ 13 | if results is not None: 14 | for dictionary in results["selectors"]: 15 | field = dictionary['selectorvalue'] 16 | if '@' in field: 17 | self.emails.add(field) 18 | else: 19 | field = str(field) 20 | if 'http' in field or 'https' in field: 21 | if field[:5] == 'https': 22 | field = field[8:] 23 | else: 24 | field = field[7:] 25 | self.hosts.add(field.replace(')', '').replace(',', '')) 26 | return self.emails, self.hosts 27 | return None, None 28 | -------------------------------------------------------------------------------- /theHarvester/parsers/myparser.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | class Parser: 5 | 6 | def __init__(self, results, word): 7 | self.results = results 8 | self.word = word 9 | self.temp = [] 10 | 11 | async def genericClean(self): 12 | self.results = self.results.replace('', '').replace('', '').replace('', '').replace('', '') \ 13 | .replace('%3a', '').replace('', '').replace('', '') \ 14 | .replace('', '').replace('', '') 15 | 16 | for search in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C', '%2f', '/', '\\'): 17 | self.results = self.results.replace(search, ' ') 18 | 19 | async def urlClean(self): 20 | self.results = self.results.replace('', '').replace('', '').replace('%2f', '').replace('%3a', '') 21 | for search in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C'): 22 | self.results = self.results.replace(search, ' ') 23 | 24 | async def emails(self): 25 | await self.genericClean() 26 | # Local part is required, charset is flexible. 27 | # https://tools.ietf.org/html/rfc6531 (removed * and () as they provide FP mostly) 28 | reg_emails = re.compile(r'[a-zA-Z0-9.\-_+#~!$&\',;=:]+' + '@' + '[a-zA-Z0-9.-]*' + self.word.replace('www.', '')) 29 | self.temp = reg_emails.findall(self.results) 30 | emails = await self.unique() 31 | true_emails = {str(email)[1:].lower().strip() if len(str(email)) > 1 and str(email)[0] == '.' 
32 | else len(str(email)) > 1 and str(email).lower().strip() for email in emails} 33 | # if email starts with dot shift email string and make sure all emails are lowercase 34 | return true_emails 35 | 36 | async def fileurls(self, file): 37 | urls = [] 38 | reg_urls = re.compile('(.*?)') 60 | temp = reg_hosts.findall(self.results) 61 | for iteration in temp: 62 | if iteration.count(':'): 63 | res = iteration.split(':')[1].split('/')[2] 64 | else: 65 | res = iteration.split('/')[0] 66 | self.temp.append(res) 67 | hostnames = await self.unique() 68 | return hostnames 69 | 70 | async def set(self): 71 | reg_sets = re.compile(r'>[a-zA-Z\d]*') 72 | self.temp = reg_sets.findall(self.results) 73 | sets = [] 74 | for iteration in self.temp: 75 | delete = iteration.replace('>', '') 76 | delete = delete.replace(' list: 86 | return list(set(self.temp)) 87 | -------------------------------------------------------------------------------- /theHarvester/parsers/securitytrailsparser.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List 2 | 3 | 4 | class Parser: 5 | 6 | def __init__(self, word, text): 7 | self.word = word 8 | self.text = text 9 | self.hostnames = set() 10 | self.ips = set() 11 | 12 | async def parse_text(self) -> Union[List, Tuple]: 13 | sub_domain_flag = 0 14 | self.text = str(self.text).splitlines() 15 | # Split lines to get a list of lines. 16 | for index in range(0, len(self.text)): 17 | line = self.text[index].strip() 18 | if '"ip":' in line: 19 | # Extract IP. 20 | ip = '' 21 | for ch in line[7:]: 22 | if ch == '"': 23 | break 24 | else: 25 | ip += ch 26 | self.ips.add(ip) 27 | elif '"subdomains":' in line: 28 | # subdomains start here so set flag to 1 29 | sub_domain_flag = 1 30 | continue 31 | elif sub_domain_flag > 0: 32 | if ']' in line: 33 | sub_domain_flag = 0 34 | else: 35 | if 'www' in self.word: 36 | self.word = str(self.word).replace('www.', '').replace('www', '') 37 | # Remove www from word if entered 38 | self.hostnames.add(str(line).replace('"', '').replace(',', '') + '.' 
+ self.word) 39 | else: 40 | continue 41 | return list(self.ips), list(self.hostnames) 42 | -------------------------------------------------------------------------------- /theHarvester/screenshot/screenshot.py: -------------------------------------------------------------------------------- 1 | """ 2 | Screenshot module that utilizes pyppeteer to asynchronously 3 | take screenshots 4 | """ 5 | 6 | from pyppeteer import launch 7 | import aiohttp 8 | import asyncio 9 | import certifi 10 | from datetime import datetime 11 | import os 12 | import ssl 13 | import sys 14 | 15 | 16 | class ScreenShotter: 17 | 18 | def __init__(self, output): 19 | self.output = output 20 | self.slash = "\\" if 'win' in sys.platform else '/' 21 | self.slash = "" if (self.output[-1] == "\\" or self.output[-1] == "/") else self.slash 22 | 23 | def verify_path(self): 24 | try: 25 | if not os.path.isdir(self.output): 26 | answer = input( 27 | '[+] The output path you have entered does not exist would you like to create it (y/n): ') 28 | if answer.lower() == 'yes' or answer.lower() == 'y': 29 | os.mkdir(self.output) 30 | return True 31 | else: 32 | return False 33 | return True 34 | except Exception as e: 35 | print(f"An exception has occurred while attempting to verify output path's existence: {e}") 36 | return False 37 | 38 | @staticmethod 39 | async def verify_installation(): 40 | # Helper function that verifies pyppeteer & chromium are installed 41 | # If chromium is not installed pyppeteer will prompt user to install it 42 | browser = await launch(headless=True, ignoreHTTPSErrors=True, args=["--no-sandbox"]) 43 | await browser.close() 44 | 45 | @staticmethod 46 | def chunk_list(items, chunk_size): 47 | # Based off of: https://github.com/apache/incubator-sdap-ingester 48 | return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)] 49 | 50 | @staticmethod 51 | async def visit(url): 52 | try: 53 | # print(f'attempting to visit: {url}') 54 | timeout = aiohttp.ClientTimeout(total=35) 55 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 56 | 'Chrome/83.0.4103.106 Safari/537.36'} 57 | url = f'http://{url}' if not url.startswith('http') else url 58 | url = url.replace('www.', '') 59 | sslcontext = ssl.create_default_context(cafile=certifi.where()) 60 | async with aiohttp.ClientSession(timeout=timeout, headers=headers, 61 | connector=aiohttp.TCPConnector(ssl=sslcontext)) as session: 62 | async with session.get(url, verify_ssl=False) as resp: 63 | # TODO fix with origin url, should be there somewhere 64 | text = await resp.text("UTF-8") 65 | return f'http://{url}' if not url.startswith('http') else url, text 66 | except Exception as e: 67 | print(f'An exception has occurred while attempting to visit {url} : {e}') 68 | return "", "" 69 | 70 | async def take_screenshot(self, url): 71 | url = f'http://{url}' if not url.startswith('http') else url 72 | url = url.replace('www.', '') 73 | print(f'Attempting to take a screenshot of: {url}') 74 | browser = await launch(headless=True, ignoreHTTPSErrors=True, args=["--no-sandbox"]) 75 | context = await browser.createIncognitoBrowserContext() 76 | # Create a new page in a pristine context. 
77 | page = await context.newPage() 78 | path = fr'{self.output}{self.slash}{url.replace("http://", "").replace("https://", "")}.png' 79 | date = str(datetime.utcnow()) 80 | try: 81 | # change default timeout from 30 to 35 seconds 82 | page.setDefaultNavigationTimeout(35000) 83 | await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 84 | 'Chrome/83.0.4103.106 Safari/537.36') 85 | await page.goto(url) 86 | await page.screenshot({'path': path}) 87 | except Exception as e: 88 | print(f'An exception has occurred attempting to screenshot: {url} : {e}') 89 | path = "" 90 | finally: 91 | # Clean up everything whether screenshot is taken or not 92 | await asyncio.sleep(2) 93 | await page.close() 94 | await context.close() 95 | await browser.close() 96 | return date, url, path 97 | -------------------------------------------------------------------------------- /wordlists/dorks.txt: -------------------------------------------------------------------------------- 1 | inurl:"contact" 2 | intext:email filetype:log 3 | "Index of /mail" 4 | "admin account info" filetype:log 5 | intext:@ 6 | administrator accounts/ 7 | intitle:"Index of" .bash_history 8 | intitle:"index of" members OR accounts 9 | inurl:/shared/help.php 10 | inurl:public 11 | intitle:index.of inbox 12 | intitle:"Server Administration" 13 | inurl:passwd.txt 14 | robots.txt 15 | php-addressbook "This is the addressbook for *" -warning -------------------------------------------------------------------------------- /wordlists/general/common.txt: -------------------------------------------------------------------------------- 1 | admin 2 | test 3 | hello 4 | uk 5 | login 6 | book 7 | robots.txt 8 | --------------------------------------------------------------------------------
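Usage sketch (not a file from the repository): the Checker class defined in theHarvester/lib/hostchecker.py above can also be driven on its own for ad-hoc resolution. The hostnames below are placeholders, and the import path assumes the theHarvester package is installed or on PYTHONPATH.

import asyncio
from theHarvester.lib.hostchecker import Checker

async def resolve_hosts():
    # Checker takes a list of hostnames and, optionally, a list of nameservers.
    checker = Checker(['www.example.com', 'mail.example.com'])
    resolved, ips = await checker.check()
    print(resolved)  # entries look like 'host:ip1, ip2' (or just the bare host when resolution fails)
    print(ips)       # de-duplicated addresses collected across all hosts

asyncio.run(resolve_hosts())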