├── .dockerignore ├── .flake8 ├── .gitattributes ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ └── issue-template.md ├── dependabot.yml └── workflows │ ├── codeql-analysis.yml │ ├── dockerci.yml │ └── theHarvester.yml ├── .gitignore ├── .lgtm.yml ├── Dockerfile ├── README.md ├── README ├── CONTRIBUTING.md ├── COPYING └── LICENSES ├── api-keys.yaml ├── bin ├── restfulHarvest └── theHarvester ├── debian ├── changelog ├── control ├── copyright ├── dirs ├── docs ├── gbp.conf ├── helper-script │ └── theharvester ├── patches │ ├── Disable-a-failing-test-unstable-site.patch │ ├── Improve-data-installation.patch │ └── series ├── rules ├── source │ └── format ├── tests │ └── control ├── theharvester.install ├── theharvester.links ├── upstream │ └── metadata └── watch ├── mypy.ini ├── proxies.yaml ├── pyproject.toml ├── pytest.ini ├── requirements.txt ├── requirements ├── base.txt └── dev.txt ├── restfulHarvest.py ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── discovery │ ├── __init__.py │ ├── test_anubis.py │ ├── test_certspotter.py │ ├── test_githubcode.py │ ├── test_omnisint.py │ ├── test_otx.py │ ├── test_qwantsearch.py │ ├── test_sublist3r.py │ └── test_threatminer.py └── test_myparser.py ├── theHarvester-logo.png ├── theHarvester.py ├── theHarvester ├── __init__.py ├── __main__.py ├── discovery │ ├── __init__.py │ ├── anubis.py │ ├── baidusearch.py │ ├── bevigil.py │ ├── binaryedgesearch.py │ ├── bingsearch.py │ ├── bufferoverun.py │ ├── censysearch.py │ ├── certspottersearch.py │ ├── constants.py │ ├── crtsh.py │ ├── dnsdumpster.py │ ├── dnssearch.py │ ├── duckduckgosearch.py │ ├── fullhuntsearch.py │ ├── githubcode.py │ ├── hackertarget.py │ ├── huntersearch.py │ ├── intelxsearch.py │ ├── omnisint.py │ ├── otxsearch.py │ ├── pentesttools.py │ ├── projectdiscovery.py │ ├── qwantsearch.py │ ├── rapiddns.py │ ├── rocketreach.py │ ├── securitytrailssearch.py │ ├── shodansearch.py │ ├── sublist3r.py │ ├── takeover.py │ ├── threatcrowd.py │ ├── threatminer.py │ ├── urlscan.py │ ├── virustotal.py │ ├── yahoosearch.py │ └── zoomeyesearch.py ├── lib │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ ├── api.py │ │ ├── api_example.py │ │ └── static │ │ │ └── .gitkeep │ ├── core.py │ ├── hostchecker.py │ ├── ip-ranges.json │ ├── resolvers.txt │ └── stash.py ├── parsers │ ├── __init__.py │ ├── intelxparser.py │ ├── myparser.py │ └── securitytrailsparser.py └── screenshot │ └── screenshot.py └── wordlists ├── dns-big.txt ├── dns-names.txt ├── dorks.txt ├── general └── common.txt └── names_small.txt /.dockerignore: -------------------------------------------------------------------------------- 1 | .github/* 2 | .gitattributes 3 | .idea/ 4 | .lgtm.yml 5 | mypy.ini 6 | .pytest_cache 7 | .mypy_cache 8 | tests/* 9 | README/ 10 | bin/ -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501, F405, F403, F401, E402 -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, which is to have git automatically determine 2 | # whether a file is a text or binary, unless otherwise specified. 3 | 4 | * text=auto 5 | 6 | # Basic .gitattributes for a python repo. 
7 | 8 | # Source files 9 | # ============ 10 | *.pxd text diff=python 11 | *.py text diff=python 12 | *.py3 text diff=python 13 | *.pyw text diff=python 14 | *.pyx text diff=python 15 | 16 | # Binary files 17 | # ============ 18 | *.db binary 19 | *.p binary 20 | *.pkl binary 21 | *.pyc binary 22 | *.pyd binary 23 | *.pyo binary 24 | 25 | # Note: .db, .p, and .pkl files are associated with the python modules 26 | # ``pickle``, ``dbm.*``, # ``shelve``, ``marshal``, ``anydbm``, & ``bsddb`` 27 | # (among others). 28 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [L1ghtn1ng, NotoriousRebel] 4 | open_collective: # Replace with a single Open Collective username 5 | ko_fi: # 6 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 7 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 8 | liberapay: # Replace with a single Liberapay username 9 | issuehunt: # Replace with a single IssueHunt username 10 | otechie: # Replace with a single Otechie username 11 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 12 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/issue-template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Issue Template 3 | about: A template for new issues. 4 | title: "[Bug|Feature Request|Other] Short Description of Issue" 5 | labels: '' 6 | 7 | --- 8 | 9 | ## Note we do not support installing theHarvester on andriod 10 | 11 | **Feature Request or Bug or Other** 12 | Feature Request | Bug | Other 13 | 14 | **Describe the feature request or bug or other** 15 | A clear and concise description of what the bug, feature request, 16 | or other request is. 17 | 18 | **To Reproduce** 19 | Steps to reproduce the behaviour: 20 | 1. Run tool like this: '...' 21 | 2. See error 22 | 23 | **Expected behaviour** 24 | A clear and concise description of what you expected to happen. 25 | 26 | **Screenshots** 27 | If possible please add screenshots to help explain your problem. 28 | 29 | **System Information (System that tool is running on):** 30 | - OS: [e.g. Windows10] 31 | - Version [e.g. 2.7] 32 | 33 | **Additional context** 34 | Add any other context about the problem here. 35 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: github-actions 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | timezone: Europe/London 8 | - package-ecosystem: pip 9 | directory: "/" 10 | schedule: 11 | interval: daily 12 | timezone: Europe/London 13 | open-pull-requests-limit: 10 14 | target-branch: master 15 | allow: 16 | - dependency-type: direct 17 | - dependency-type: indirect 18 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 
3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master, dev ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master, dev ] 20 | schedule: 21 | - cron: '19 11 * * 4' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | 28 | strategy: 29 | fail-fast: false 30 | matrix: 31 | language: [ 'python' ] 32 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 33 | # Learn more: 34 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 35 | 36 | steps: 37 | - name: Checkout repository 38 | uses: actions/checkout@v3 39 | 40 | # Initializes the CodeQL tools for scanning. 41 | - name: Initialize CodeQL 42 | uses: github/codeql-action/init@v2 43 | with: 44 | languages: ${{ matrix.language }} 45 | # If you wish to specify custom queries, you can do so here or in a config file. 46 | # By default, queries listed here will override any specified in a config file. 47 | # Prefix the list here with "+" to use these queries and those in the config file. 48 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 49 | 50 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 51 | # If this step fails, then you should remove it and run the build manually (see below) 52 | - name: Autobuild 53 | uses: github/codeql-action/autobuild@v2 54 | 55 | # ℹ️ Command-line programs to run using the OS shell. 56 | # 📚 https://git.io/JvXDl 57 | 58 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 59 | # and modify them (or add more) to build your code if your project 60 | # uses a compiled language 61 | 62 | #- run: | 63 | # make bootstrap 64 | # make release 65 | 66 | - name: Perform CodeQL Analysis 67 | uses: github/codeql-action/analyze@v2 68 | -------------------------------------------------------------------------------- /.github/workflows/dockerci.yml: -------------------------------------------------------------------------------- 1 | name: TheHarvester Docker Image CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | - name: Build the Docker image 11 | run: docker build . 
--file Dockerfile --tag theharvester:$(date +%s) -------------------------------------------------------------------------------- /.github/workflows/theHarvester.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: TheHarvester Python CI 3 | 4 | on: 5 | push: 6 | branches: 7 | - '*' 8 | 9 | pull_request: 10 | branches: 11 | - '*' 12 | 13 | jobs: 14 | Python: 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | max-parallel: 8 18 | matrix: 19 | os: [ ubuntu-latest, macos-latest ] 20 | python-version: [ 3.8, 3.9, 3.10.0 ] 21 | 22 | steps: 23 | - uses: actions/checkout@v3 24 | - name: Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install dependencies 29 | run: | 30 | pip install --upgrade pip 31 | pip install wheel 32 | pip install -r requirements/dev.txt 33 | 34 | - name: Lint with flake8 35 | run: | 36 | # stop the build if there are Python syntax errors or undefined names 37 | flake8 . --count --show-source --statistics 38 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 39 | flake8 . --count --exit-zero --max-line-length=127 --statistics 40 | 41 | - name: Test with pytest 42 | run: | 43 | pytest 44 | 45 | - name: Static type checking with mypy 46 | run: | 47 | mypy --pretty theHarvester/*/*.py 48 | mypy --pretty theHarvester/*/*/*.py 49 | 50 | - name: Run theHarvester module Anubis 51 | run: | 52 | python theHarvester.py -d apple.com -b anubis 53 | 54 | - name: Run theHarvester module Baidu 55 | run: | 56 | python theHarvester.py -d yale.edu -b baidu 57 | 58 | - name: Run theHarvester module Bing 59 | run: | 60 | python theHarvester.py -d yale.edu -b bing 61 | 62 | - name: Run theHarvester module CertSpotter 63 | run: | 64 | python theHarvester.py -d yale.edu -b certspotter 65 | 66 | - name: Run theHarvester module Crtsh 67 | run: | 68 | python theHarvester.py -d hcl.com -b crtsh 69 | 70 | - name: Run theHarvester module DnsDumpster 71 | run: | 72 | python theHarvester.py -d yale.edu -b dnsdumpster 73 | 74 | - name: Run theHarvester module DuckDuckGo 75 | run: | 76 | python theHarvester.py -d yale.edu -b duckduckgo 77 | 78 | - name: Run theHarvester module HackerTarget 79 | run: | 80 | python theHarvester.py -d yale.edu -b hackertarget 81 | 82 | - name: Run theHarvester module Intelx 83 | run: | 84 | python theHarvester.py -d yale.edu -b intelx 85 | 86 | - name: Run theHarvester module Omnisint 87 | run: | 88 | python theHarvester.py -d yale.edu -b omnisint 89 | 90 | - name: Run theHarvester module Otx 91 | run: | 92 | python theHarvester.py -d yale.edu -b otx 93 | 94 | - name: Run theHarvester module Qwant 95 | run: | 96 | python theHarvester.py -d yale.edu -b qwant 97 | 98 | - name: Run theHarvester module RapidDns 99 | run: | 100 | python theHarvester.py -d yale.edu -b rapiddns 101 | 102 | - name: Run theHarvester module Sublist3r 103 | run: | 104 | python theHarvester.py -d yale.edu -b sublist3r 105 | 106 | - name: Run theHarvester module Threatcrowd 107 | run: | 108 | python theHarvester.py -d yale.edu -b threatcrowd 109 | 110 | - name: Run theHarvester module Threatminer 111 | run: | 112 | python theHarvester.py -d yale.edu -b threatminer 113 | 114 | - name: Run theHarvester module Urlscan 115 | run: | 116 | python theHarvester.py -d yale.edu -b urlscan 117 | 118 | - name: Run theHarvester module Yahoo 119 | run: | 120 | python theHarvester.py -d yale.edu -b yahoo 121 | 122 | - name: Run theHarvester 
module DNS brute force 123 | run: | 124 | python theHarvester.py -d yale.edu -c 125 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.idea 2 | *.pyc 3 | *.sqlite 4 | *.html 5 | *.htm 6 | *.vscode 7 | *.xml 8 | *.json 9 | debug_results.txt 10 | venv 11 | .mypy_cache 12 | .pytest_cache 13 | build/ 14 | dist/ 15 | theHarvester.egg-info 16 | api-keys.yaml 17 | .DS_Store 18 | .venv 19 | -------------------------------------------------------------------------------- /.lgtm.yml: -------------------------------------------------------------------------------- 1 | queries: 2 | - exclude: py/import-and-import-from 3 | - exclude: py/polluting-import 4 | - exclude: py/member-test-non-container 5 | 6 | extraction: 7 | python: 8 | python_setup: 9 | version: 3 -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:kinetic 2 | LABEL maintainer="@jay_townsend1 & @NotoriousRebel1" 3 | RUN mkdir /app 4 | WORKDIR /app 5 | COPY . /app 6 | ENV DEBIAN_FRONTEND=noninteractive 7 | RUN apt update && apt dist-upgrade -qy && apt install -qy git python3 python3-pip libffi-dev libxml2-dev libxslt1-dev && /usr/bin/python3 -m pip install --upgrade pip && apt autoremove -qy 8 | RUN /usr/bin/python3 --version && pip3 install --no-cache-dir -r requirements.txt && chmod +x ./*.py 9 | ENTRYPOINT ["/app/theHarvester.py"] 10 | ENTRYPOINT ["/app/restfulHarvest.py", "-H", "0.0.0.0", "-p", "80"] 11 | EXPOSE 80 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![theHarvester](https://github.com/laramies/theHarvester/blob/master/theHarvester-logo.png) 2 | 3 | ![TheHarvester CI](https://github.com/laramies/theHarvester/workflows/TheHarvester%20Python%20CI/badge.svg) ![TheHarvester Docker Image CI](https://github.com/laramies/theHarvester/workflows/TheHarvester%20Docker%20Image%20CI/badge.svg) [![Language grade: Python](https://img.shields.io/lgtm/grade/python/g/laramies/theHarvester.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/laramies/theHarvester/context:python) 4 | [![Rawsec's CyberSecurity Inventory](https://inventory.rawsec.ml/img/badges/Rawsec-inventoried-FF5050_flat_without_logo.svg)](https://inventory.rawsec.ml/) 5 | 6 | What is this? 7 | ------------- 8 | theHarvester is a simple to use, yet powerful tool designed to be used during the reconnaissance stage of a red
9 | team assessment or penetration test. It performs open source intelligence (OSINT) gathering to help determine
10 | a domain's external threat landscape. The tool gathers names, emails, IPs, subdomains, and URLs by using
11 | multiple public resources. A short usage sketch follows, and the full list of passive and active sources comes after it.
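The sketch below shows one way to drive a single passive module directly from Python; it mirrors the call pattern exercised in `tests/discovery/test_sublist3r.py` (construct the module with a domain, await `process()`, then read the results). Class names differ between discovery modules, so treat this as an illustration rather than a documented API.

```python
# Minimal sketch: run one passive discovery module directly from Python.
# Mirrors the pattern used in tests/discovery/test_sublist3r.py; other
# modules use different class names, so check the module source first.
import asyncio

from theHarvester.discovery import sublist3r


async def enumerate_subdomains(domain: str) -> list:
    search = sublist3r.SearchSublist3r(domain)
    await search.process()               # perform the passive lookup
    return await search.get_hostnames()  # hostnames gathered for the domain


if __name__ == '__main__':
    for hostname in asyncio.run(enumerate_subdomains('example.com')):
        print(hostname)
```

The equivalent command-line invocation, as used in the CI workflow, would be `python theHarvester.py -d example.com -b sublist3r`.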
12 | 13 | Passive: 14 | -------- 15 | * anubis: Anubis-DB - https://github.com/jonluca/anubis 16 | 17 | * bevigil: CloudSEK BeVigil scans mobile applications for OSINT assets and makes them available through an API - https://bevigil.com/osint-api 18 | 19 | * baidu: Baidu search engine - www.baidu.com 20 | 21 | * binaryedge: List of known subdomains from www.binaryedge.io 22 | 23 | * bing: Microsoft search engine - www.bing.com 24 | 25 | * bingapi: Microsoft search engine, through the API (Requires an API key, see below.) 26 | 27 | * bufferoverun: Uses data from Rapid7's Project Sonar - www.rapid7.com/research/project-sonar/ 28 | 29 | * censys: [Censys search engine](https://search.censys.io/), uses certificate searches to enumerate subdomains and gather emails (Requires an API key, see below.) - [censys.io](https://censys.io/) 30 | 31 | * certspotter: Cert Spotter monitors Certificate Transparency logs - https://sslmate.com/certspotter/ 32 | 33 | * crtsh: Comodo Certificate search - https://crt.sh 34 | 35 | * dnsdumpster: DNSdumpster search engine - https://dnsdumpster.com 36 | 37 | * duckduckgo: DuckDuckGo search engine - www.duckduckgo.com 38 | 39 | * fullhunt: The Next-Generation Attack Surface Security Platform - https://fullhunt.io 40 | 41 | * github-code: GitHub code search engine (Requires a GitHub Personal Access Token, see below.) - www.github.com 42 | 43 | * hackertarget: Online vulnerability scanners and network intelligence to help organizations - https://hackertarget.com 44 | 45 | * hunter: Hunter search engine (Requires an API key, see below.) - www.hunter.io 46 | 47 | * intelx: Intelx search engine (Requires an API key, see below.) - www.intelx.io 48 | 49 | * omnisint: Project Crobat, a centralised searchable open-source Project Sonar DNS database - https://github.com/Cgboal/SonarSearch 50 | 51 | * otx: AlienVault Open Threat Exchange - https://otx.alienvault.com 52 | 53 | * pentesttools: Powerful Penetration Testing Tools, Easy to Use (Requires an API key, see below.) - https://pentest-tools.com/home 54 | 55 | * projectdiscovery: We actively collect and maintain internet-wide assets data, 56 | to enhance research and analyse changes around DNS for better insights (Requires an API key, see below.) - https://chaos.projectdiscovery.io 57 | 58 | * qwant: Qwant search engine - www.qwant.com 59 | 60 | * rapiddns: DNS query tool which makes querying subdomains or sites on the same IP easy - https://rapiddns.io 61 | 62 | * rocketreach: Access real-time verified personal/professional emails, phone numbers, and social media links - https://rocketreach.co 63 | 64 | * securityTrails: Security Trails search engine, the world's largest repository of historical DNS data
65 | (Requires an API key, see below.) - www.securitytrails.com 66 | 67 | * shodan: Shodan search engine, searches for ports and banners from discovered hosts (Requires an API key, see below.) - www.shodanhq.com 68 | 69 | * sublist3r: Fast subdomains enumeration tool for penetration testers - https://api.sublist3r.com/search.php?domain=example.com 70 | 71 | * threatcrowd: Open source threat intelligence - www.threatcrowd.org 72 | 73 | * threatminer: Data mining for threat intelligence - https://www.threatminer.org/ 74 | 75 | * urlscan: A sandbox for the web that is a URL and website scanner - https://urlscan.io 76 | 77 | * vhost: Bing virtual hosts search 78 | 79 | * virustotal: virustotal.com domain search 80 | 81 | * yahoo: Yahoo search engine 82 | 83 | * zoomeye: Chinese version of Shodan - https://www.zoomeye.org 84 | 85 | 86 | Active: 87 | ------- 88 | * DNS brute force: dictionary brute force enumeration 89 | * Screenshots: Take screenshots of subdomains that were found 90 | 91 | Modules that require an API key: 92 | -------------------------------- 93 | Documentation to set up API keys can be found at - https://github.com/laramies/theHarvester/wiki/Installation#api-keys 94 | 95 | * bevigil - Free up to 50 queries. Pricing can be found here: https://bevigil.com/pricing/osint 96 | * binaryedge - $10/month 97 | * bing 98 | * censys - API keys are required and can be retrieved from your [Censys account](https://search.censys.io/account/api). 99 | * fullhunt 100 | * github 101 | * hunter - limited to 10 results on the free plan, so you will need to use the -l 10 switch 102 | * intelx 103 | * pentesttools - $ 104 | * projectdiscovery - invite only for now 105 | * rocketreach - $ 106 | * securityTrails 107 | * shodan - $ 108 | * zoomeye 109 | 110 | Install and dependencies: 111 | ------------------------- 112 | * Python 3.7+ 113 | * https://github.com/laramies/theHarvester/wiki/Installation 114 | 115 | 116 | Comments, bugs, and requests: 117 | ----------------------------- 118 | * [![Twitter Follow](https://img.shields.io/twitter/follow/laramies.svg?style=social&label=Follow)](https://twitter.com/laramies) Christian Martorella @laramies 119 | cmartorella@edge-security.com 120 | * [![Twitter Follow](https://img.shields.io/twitter/follow/NotoriousRebel1.svg?style=social&label=Follow)](https://twitter.com/NotoriousRebel1) Matthew Brown @NotoriousRebel1 121 | * [![Twitter Follow](https://img.shields.io/twitter/follow/jay_townsend1.svg?style=social&label=Follow)](https://twitter.com/jay_townsend1) Jay "L1ghtn1ng" Townsend @jay_townsend1 122 | 123 | 124 | Main contributors: 125 | ------------------ 126 | * [![Twitter Follow](https://img.shields.io/twitter/follow/NotoriousRebel1.svg?style=social&label=Follow)](https://twitter.com/NotoriousRebel1) Matthew Brown @NotoriousRebel1 127 | * [![Twitter Follow](https://img.shields.io/twitter/follow/jay_townsend1.svg?style=social&label=Follow)](https://twitter.com/jay_townsend1) Jay "L1ghtn1ng" Townsend @jay_townsend1 128 | * [![Twitter Follow](https://img.shields.io/twitter/follow/discoverscripts.svg?style=social&label=Follow)](https://twitter.com/discoverscripts) Lee Baird @discoverscripts 129 | 130 | 131 | Thanks: 132 | ------- 133 | * John Matherly - Shodan project 134 | * Ahmed Aboul Ela - subdomain names dictionaries (big and small) 135 | -------------------------------------------------------------------------------- /README/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to theHarvester Project
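The Unit Tests section below asks every new module to ship with a pytest test; as a point of reference, a minimal test modelled on the existing tests in `tests/discovery/` (for example `test_sublist3r.py`) could look like the sketch below. `newmodule` and `SearchNewModule` are placeholders for the module being contributed, not names that exist in this repository.

```python
# Hypothetical unit test for a new discovery module, following the shape of
# the existing tests in tests/discovery/ (see test_sublist3r.py).
# "newmodule" and "SearchNewModule" are placeholders for the contribution.
import pytest

from theHarvester.discovery import newmodule  # placeholder import


class TestNewModule:
    @staticmethod
    def domain() -> str:
        return 'example.com'

    @pytest.mark.asyncio
    async def test_do_search(self) -> None:
        search = newmodule.SearchNewModule(TestNewModule.domain())
        await search.process()
        assert isinstance(await search.get_hostnames(), list)
```

Under the project's pytest configuration (`--asyncio-mode=auto`), the `pytest.mark.asyncio` marker is optional, but keeping it makes the test usable with stricter asyncio modes as well.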
2 | Welcome to theHarvester project, so you would like to contribute. 3 | The following below must be met to get accepted. 4 | 5 | # CI 6 | Make sure all CI passes and you do not introduce any alerts from lgtm. 7 | 8 | # Unit Tests 9 | For new modules a unit test for that module is required and we use pytest. 10 | 11 | # Coding Standards 12 | * No single letter variables and variable names must represent the action that it is performing 13 | * Have static typing on functions etc 14 | * Make sure no errors are reported from mypy 15 | * No issues reported with flake8 16 | 17 | # Submitting Bugs 18 | If you find a bug in a module that you want to submit an issue for and know how to write python code. 19 | Please create a unit test for that bug(If possible) and submit a fix for it as it would be a big help to the project. 20 | -------------------------------------------------------------------------------- /README/COPYING: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc. 5 | 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Library General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. 
If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 
102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. 
However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 
214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | -------------------------------------------------------------------------------- /README/LICENSES: -------------------------------------------------------------------------------- 1 | Released under the GPL v 2.0. 2 | 3 | If you did not receive a copy of the GPL, try http://www.gnu.org/. 4 | 5 | Copyright 2011 Christian Martorella 6 | 7 | theHarvester is free software; you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation version 2 of the License. 10 | 11 | theHarvester is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 | -------------------------------------------------------------------------------- /api-keys.yaml: -------------------------------------------------------------------------------- 1 | apikeys: 2 | bevigil: 3 | key: 4 | 5 | binaryedge: 6 | key: 7 | 8 | bing: 9 | key: 10 | 11 | censys: 12 | id: 13 | secret: 14 | 15 | fullhunt: 16 | key: 17 | 18 | github: 19 | key: 20 | 21 | hunter: 22 | key: 23 | 24 | intelx: 25 | key: 26 | 27 | pentestTools: 28 | key: 29 | 30 | projectDiscovery: 31 | key: 32 | 33 | rocketreach: 34 | key: 35 | 36 | securityTrails: 37 | key: 38 | 39 | shodan: 40 | key: 41 | 42 | virustotal: 43 | key: 44 | 45 | zoomeye: 46 | key: 47 | -------------------------------------------------------------------------------- /bin/restfulHarvest: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import uvicorn 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('-H', '--host', default='127.0.0.1', help='IP address to listen on default is 127.0.0.1') 7 | parser.add_argument('-p', '--port', default=5000, help='Port to bind the web server to, default is 5000', type=int) 8 | parser.add_argument('-l', '--log-level', default='info', help='Set logging level, default is info but [critical|error|warning|info|debug|trace] can be set') 9 | parser.add_argument('-r', '--reload', default=False, help='Enable automatic reload used during development of the api', action='store_true') 10 | 11 | args = parser.parse_args() 12 | 13 | if __name__ == '__main__': 14 | uvicorn.run('theHarvester.lib.api.api:app', host=args.host, port=args.port, log_level=args.log_level, reload=args.reload) 15 | -------------------------------------------------------------------------------- /bin/theHarvester: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Note: This script runs theHarvester 3 | import sys 4 | import 
asyncio 5 | from theHarvester import __main__ 6 | 7 | if sys.version_info.major < 3 or sys.version_info.minor < 7: 8 | print('\033[93m[!] Make sure you have Python 3.7+ installed, quitting.\n\n \033[0m') 9 | sys.exit(1) 10 | 11 | if __name__ == '__main__': 12 | platform = sys.platform 13 | if platform == 'win32': 14 | # Required or things will break if trying to take screenshots 15 | import multiprocessing 16 | 17 | multiprocessing.freeze_support() 18 | asyncio.DefaultEventLoopPolicy = asyncio.WindowsSelectorEventLoopPolicy 19 | else: 20 | import uvloop 21 | uvloop.install() 22 | 23 | if "linux" in platform: 24 | import aiomultiprocess 25 | 26 | # As we are not using Windows we can change the spawn method to fork for greater performance 27 | aiomultiprocess.set_context("fork") 28 | asyncio.run(__main__.entry_point()) 29 | -------------------------------------------------------------------------------- /debian/changelog: -------------------------------------------------------------------------------- 1 | theharvester (4.2.0-0parrot3) parrot-updates; urgency=medium 2 | 3 | * Update package dependencies. 4 | * Update helper script. 5 | 6 | -- Lorenzo "Palinuro" Faletra Wed, 21 Dec 2022 15:11:49 +0100 7 | 8 | theharvester (4.2.0-0parrot2) parrot-updates; urgency=medium 9 | 10 | * Rebuild package. 11 | 12 | -- Lorenzo "Palinuro" Faletra Wed, 21 Dec 2022 15:06:12 +0100 13 | 14 | theharvester (4.2.0-0parrot1) parrot-updates; urgency=medium 15 | 16 | * Import new Upstream release. 17 | 18 | -- Lorenzo "Palinuro" Faletra Wed, 21 Dec 2022 12:59:39 +0100 19 | 20 | theharvester (3.2.3-parrot0) rolling-testing; urgency=medium 21 | 22 | * Remove Kali ci scripts 23 | * Init Parrot team info 24 | * Remove old command 25 | * Add launcher 26 | * Edit launcher command 27 | 28 | -- Nong Hoang Tu Thu, 04 Mar 2021 00:39:41 +0700 29 | 30 | theharvester (3.2.3-0kali1) kali-dev; urgency=medium 31 | 32 | * New upstream version 3.2.3 33 | * Add Restrictions: superficial to autopkgtest 34 | 35 | -- Sophie Brun Mon, 08 Feb 2021 11:45:55 +0100 36 | 37 | theharvester (3.2.2-0kali2) kali-dev; urgency=medium 38 | 39 | * Fix installation of the wordlists 40 | 41 | -- Sophie Brun Thu, 07 Jan 2021 10:47:15 +0100 42 | 43 | theharvester (3.2.2-0kali1) kali-dev; urgency=medium 44 | 45 | [ Ben Wilson ] 46 | * Fix email address 47 | 48 | [ Sophie Brun ] 49 | * New upstream version 3.2.2 50 | 51 | -- Sophie Brun Thu, 17 Dec 2020 10:08:13 +0100 52 | 53 | theharvester (3.2.0-0kali1) kali-dev; urgency=medium 54 | 55 | * New upstream version 3.2.0 56 | * Remove merged patches 57 | * Update build-deps and deps 58 | * Update installation to use usptream setup.py 59 | * Add lintian-overrides for breakout-link 60 | 61 | -- Sophie Brun Fri, 11 Sep 2020 09:30:08 +0200 62 | 63 | theharvester (3.1-0kali4) kali-dev; urgency=medium 64 | 65 | * Fix for issue 6450: 66 | - Add a link to the wordlists 67 | - Use an helper-script to change the run directory 68 | - Add a patch to change directory of sqlite db 69 | * Bump Standards-Version to 4.5.0 70 | 71 | -- Sophie Brun Wed, 08 Jul 2020 12:06:35 +0200 72 | 73 | theharvester (3.1-0kali3) kali-dev; urgency=medium 74 | 75 | [ Sven Höper ] 76 | * Add missing depends: python3-yaml 77 | * Packaging: Fix test command 78 | 79 | [ Sophie Brun ] 80 | * Add a script to mention that theharvester command is deprecated 81 | 82 | -- Sophie Brun Wed, 18 Dec 2019 08:44:38 +0100 83 | 84 | theharvester (3.1-0kali2) kali-dev; urgency=medium 85 | 86 | * Add missing depends: python3-dnspython 87 | 88 | -- Sophie Brun 
Tue, 15 Oct 2019 18:20:09 +0200 89 | 90 | theharvester (3.1-0kali1) kali-dev; urgency=medium 91 | 92 | [ Raphaël Hertzog ] 93 | * Update Vcs-* fields for the move to gitlab.com 94 | * Add GitLab's CI configuration file 95 | * Configure git-buildpackage for Kali 96 | * Update URL in GitLab's CI configuration file 97 | 98 | [ g0tmi1k ] 99 | * New format 100 | 101 | [ Sophie Brun ] 102 | * Update debian/watch 103 | * New upstream version 3.1 104 | * Remove obsolete patches 105 | * Use debhelper-compat 12 106 | * Update packaging to use setup.py 107 | * Bump Standards-Version to 4.4.1 108 | 109 | -- Sophie Brun Tue, 15 Oct 2019 08:59:27 +0200 110 | 111 | theharvester (3.0.6-0kali1) kali-dev; urgency=medium 112 | 113 | * New upstream version 3.0.6 114 | 115 | -- Sophie Brun Thu, 20 Dec 2018 09:27:33 +0100 116 | 117 | theharvester (3.0.5-0kali1) kali-dev; urgency=medium 118 | 119 | * New upstream version 3.0.5 120 | * Add dependency: python3-plotly 121 | * Refresh patch 122 | * Add minimal autopkgtest 123 | 124 | -- Sophie Brun Wed, 19 Dec 2018 11:00:29 +0100 125 | 126 | theharvester (3.0.4-0kali1) kali-dev; urgency=medium 127 | 128 | * New upstream version 3.0.4 129 | * Switch to Python 3 130 | * Add a minimal required version of wfuzz: this is the first version in 131 | Python 3 132 | 133 | -- Sophie Brun Thu, 13 Dec 2018 11:11:59 +0100 134 | 135 | theharvester (3.0.1-0kali1) kali-dev; urgency=medium 136 | 137 | * New upstream version 3.0.1 138 | * Bump Standards-Version to 4.2.1 139 | * Update debian/copyright 140 | * Add missing dependency: python-bs4 141 | * Refresh patch 142 | 143 | -- Sophie Brun Thu, 29 Nov 2018 14:40:11 +0100 144 | 145 | theharvester (3.0-0kali1) kali-dev; urgency=medium 146 | 147 | * Upstream update 148 | 149 | -- Ben Wilson Tue, 09 Oct 2018 12:19:07 +0100 150 | 151 | theharvester (2.7.2~20180322-0kali1) kali-dev; urgency=medium 152 | 153 | * Import new upstream version (Closes: 0004685) 154 | * Bump Standards-Version and use debhelper 11 155 | * Update debian/control and debian/theharvester.install 156 | * Refresh patches 157 | * Add wfuzz as dependency 158 | 159 | -- Sophie Brun Thu, 19 Apr 2018 09:11:32 +0200 160 | 161 | theharvester (2.7-0kali1) kali-dev; urgency=medium 162 | 163 | * Import new upstream release 164 | 165 | -- Sophie Brun Tue, 19 Apr 2016 09:32:54 +0200 166 | 167 | theharvester (2.6-0kali1) kali-dev; urgency=medium 168 | 169 | * Update watch file 170 | 171 | -- Sophie Brun Wed, 27 Jan 2016 10:26:13 +0100 172 | 173 | theharvester (2.6-0kali0) kali; urgency=low 174 | 175 | * Imported new upstream release (Closes: 0002291) 176 | 177 | -- Devon Kearns Tue, 26 May 2015 12:37:58 -0600 178 | 179 | theharvester (2.5+git20150109-0kali0) kali; urgency=medium 180 | 181 | * Imported new upstream release (Closes: 0001961) 182 | 183 | -- Devon Kearns Fri, 09 Jan 2015 12:24:36 -0700 184 | 185 | theharvester (2.2a-1kali2) kali; urgency=low 186 | 187 | * Patched usage output (Closes: 0001251) 188 | 189 | -- Devon Kearns Tue, 20 May 2014 10:49:13 -0600 190 | 191 | theharvester (2.2a-1kali1) kali; urgency=low 192 | 193 | * Updated watch file 194 | 195 | -- Mati Aharoni Sun, 12 Jan 2014 15:40:59 -0500 196 | 197 | theharvester (2.2a-1kali0) kali; urgency=low 198 | 199 | * New upstream version 200 | 201 | -- Devon Kearns Sat, 09 Feb 2013 14:47:29 -0700 202 | 203 | theharvester (2.2-1kali3) kali; urgency=low 204 | 205 | * Cleaned up debian files 206 | 207 | -- balding_parrot Tue, 18 Dec 2012 22:49:17 +0000 208 | 209 | theharvester (2.2-1kali2) kali; urgency=low 210 | 211 
| * Removed desktop file 212 | 213 | -- balding_parrot Tue, 18 Dec 2012 22:32:47 +0000 214 | 215 | theharvester (2.2-1kali1) kali; urgency=low 216 | 217 | * Initial release 218 | 219 | -- balding_parrot Tue, 18 Dec 2012 08:21:47 +0000 220 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: theharvester 2 | Section: utils 3 | Priority: optional 4 | Maintainer: Parrot Dev Team 5 | Uploaders: Lorenzo "Palinuro" Faletra 6 | Build-Depends: debhelper-compat (= 12), 7 | dh-python, 8 | python3-aiohttp, 9 | python3-all, 10 | python3-certifi, 11 | python3-requests, 12 | python3-setuptools, 13 | python3-yaml 14 | Standards-Version: 4.6.1 15 | Homepage: https://github.com/laramies/theHarvester 16 | 17 | Package: theharvester 18 | Architecture: all 19 | Depends: python3, 20 | python3-aiodns (>= 2.0.0), 21 | python3-aiohttp (>= 3.6.2), 22 | python3-aiofiles, 23 | python3-aiomultiprocess (>= 0.8.0), 24 | python3-aiosqlite (>= 0.15.0), 25 | python3-bs4 (>= 4.9.1), 26 | python3-censys (>= 2.1.7), 27 | python3-certifi (>= 2022.6.15), 28 | python3-dnspython (>= 2.0.0), 29 | # python3-fastapi: upstream went from 0.70.0 to 0.79.0 30 | # packaged in Debian, rev deps: theharvester, witnessme 31 | python3-fastapi (>= 0.74.0), 32 | python3-lxml (>= 4.5.2), 33 | python3-netaddr (>= 0.7.19), 34 | python3-ujson, 35 | python3-pyppeteer (>= 1.0.2), 36 | python3-requests (>= 2.23.0), 37 | python3-retrying (>= 1.3.3), 38 | python3-shodan (>= 1.23.0), 39 | python3-slowapi, 40 | python3-starlette, 41 | python3-texttable (>= 1.6.2), 42 | python3-uvicorn, 43 | python3-uvloop (>= 0.14.0), 44 | python3-yaml (>= 5.3.1), 45 | ${misc:Depends}, 46 | ${python3:Depends}, 47 | Description: tool for gathering e-mail accounts and subdomain names from public sources 48 | The package contains a tool for gathering subdomain names, e-mail addresses, 49 | virtual hosts, open ports/ banners, and employee names from different public 50 | sources (search engines, pgp key servers). 51 | -------------------------------------------------------------------------------- /debian/copyright: -------------------------------------------------------------------------------- 1 | Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ 2 | Upstream-Name: theharvester 3 | Source: https://github.com/laramies/theHarvester 4 | 5 | Files: * 6 | Copyright: 2011 Christian Martorella 7 | License: GPL-2 8 | This package is free software; you can redistribute it and/or modify 9 | it under the terms of the GNU General Public License version 2 as published by 10 | the Free Software Foundation. 11 | . 12 | This package is distributed in the hope that it will be useful, 13 | but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | GNU General Public License for more details. 16 | . 17 | You should have received a copy of the GNU General Public License 18 | along with this program. If not, see 19 | . 20 | On Debian systems, the complete text of the GNU General 21 | Public License version 2 can be found in "/usr/share/common-licenses/GPL-2". 
22 | 23 | Files: debian/* 24 | Copyright: 2012 balding_parrot 25 | 2018 Sophie Brun 26 | License: GPL-2+ 27 | This package is free software; you can redistribute it and/or modify 28 | it under the terms of the GNU General Public License as published by 29 | the Free Software Foundation; either version 2 of the License, or 30 | (at your option) any later version. 31 | . 32 | This package is distributed in the hope that it will be useful, 33 | but WITHOUT ANY WARRANTY; without even the implied warranty of 34 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 35 | GNU General Public License for more details. 36 | . 37 | You should have received a copy of the GNU General Public License 38 | along with this program. If not, see 39 | . 40 | On Debian systems, the complete text of the GNU General 41 | Public License version 2 can be found in "/usr/share/common-licenses/GPL-2". 42 | -------------------------------------------------------------------------------- /debian/dirs: -------------------------------------------------------------------------------- 1 | usr/bin 2 | -------------------------------------------------------------------------------- /debian/docs: -------------------------------------------------------------------------------- 1 | README.md 2 | -------------------------------------------------------------------------------- /debian/gbp.conf: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | pristine-tar = True 3 | 4 | [pq] 5 | patch-numbers = False 6 | 7 | [dch] 8 | multimaint-merge = True 9 | -------------------------------------------------------------------------------- /debian/helper-script/theharvester: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | set -e 4 | 5 | echo -e "this command is deprecated, use theHarvester instead" 6 | /usr/bin/theHarvester $@ 7 | -------------------------------------------------------------------------------- /debian/patches/Disable-a-failing-test-unstable-site.patch: -------------------------------------------------------------------------------- 1 | From: Sophie Brun 2 | Date: Tue, 30 Aug 2022 15:37:52 +0200 3 | Subject: Disable a failing test (unstable site) 4 | 5 | --- 6 | tests/discovery/test_sublist3r.py | 9 +++++---- 7 | 1 file changed, 5 insertions(+), 4 deletions(-) 8 | 9 | diff --git a/tests/discovery/test_sublist3r.py b/tests/discovery/test_sublist3r.py 10 | index 374095e..1d5fdd1 100644 11 | --- a/tests/discovery/test_sublist3r.py 12 | +++ b/tests/discovery/test_sublist3r.py 13 | @@ -21,10 +21,11 @@ async def test_api(self): 14 | request = requests.get(base_url, headers=headers) 15 | assert request.status_code == 200 16 | 17 | - async def test_do_search(self): 18 | - search = sublist3r.SearchSublist3r(TestSublist3r.domain()) 19 | - await search.process() 20 | - assert isinstance(await search.get_hostnames(), list) 21 | +# disable as it fails (unstable site?) 
22 | +# async def test_do_search(self): 23 | +# search = sublist3r.SearchSublist3r(TestSublist3r.domain()) 24 | +# await search.process() 25 | +# assert isinstance(await search.get_hostnames(), list) 26 | 27 | 28 | if __name__ == '__main__': 29 | -------------------------------------------------------------------------------- /debian/patches/Improve-data-installation.patch: -------------------------------------------------------------------------------- 1 | From: Sophie Brun 2 | Date: Thu, 7 Jan 2021 10:19:09 +0100 3 | Subject: Improve data installation 4 | 5 | Bug-Kali: https://gitlab.com/kalilinux/packages/theharvester/-/issues/6 6 | 7 | By default the wordlists were installed directly in /etc/theHarvester 8 | instead of /etc/theHarvester/wordlists 9 | --- 10 | setup.py | 12 ++++++++---- 11 | 1 file changed, 8 insertions(+), 4 deletions(-) 12 | 13 | diff --git a/setup.py b/setup.py 14 | index 128bd89..34a3ef0 100755 15 | --- a/setup.py 16 | +++ b/setup.py 17 | @@ -26,13 +26,17 @@ 18 | ], 19 | data_files=[ 20 | ('/etc/theHarvester', [ 21 | - 'wordlists/general/common.txt', 22 | + 'api-keys.yaml', 23 | + 'proxies.yaml' 24 | + ]), 25 | + ('/etc/theHarvester/wordlists', [ 26 | 'wordlists/dns-big.txt', 27 | 'wordlists/dns-names.txt', 28 | 'wordlists/dorks.txt', 29 | - 'wordlists/names_small.txt', 30 | - 'api-keys.yaml', 31 | - 'proxies.yaml' 32 | + 'wordlists/names_small.txt' 33 | + ]), 34 | + ('/etc/theHarvester/wordlists/general', [ 35 | + 'wordlists/general/common.txt' 36 | ] 37 | ) 38 | ], 39 | -------------------------------------------------------------------------------- /debian/patches/series: -------------------------------------------------------------------------------- 1 | Improve-data-installation.patch 2 | Disable-a-failing-test-unstable-site.patch 3 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | 3 | # output every command that modifies files on the build system. 
4 | #export DH_VERBOSE = 1 5 | 6 | %: 7 | dh $@ --with python3 --buildsystem=pybuild 8 | 9 | override_dh_auto_test: 10 | # do not run tests during the build: most of the tests require 11 | # network 12 | -------------------------------------------------------------------------------- /debian/source/format: -------------------------------------------------------------------------------- 1 | 3.0 (quilt) 2 | -------------------------------------------------------------------------------- /debian/tests/control: -------------------------------------------------------------------------------- 1 | Test-Command: python3 -m pytest tests 2 | Depends: @, python3-pytest, python3-pytest-asyncio 3 | 4 | Test-Command: theHarvester -h 5 | Restrictions: superficial, allow-stderr 6 | -------------------------------------------------------------------------------- /debian/theharvester.install: -------------------------------------------------------------------------------- 1 | debian/helper-script/* usr/bin 2 | -------------------------------------------------------------------------------- /debian/theharvester.links: -------------------------------------------------------------------------------- 1 | etc/theHarvester/api-keys.yaml usr/lib/python3/dist-packages/theHarvester/api-keys.yaml 2 | etc/theHarvester/wordlists usr/lib/python3/dist-packages/theHarvester/wordlists 3 | -------------------------------------------------------------------------------- /debian/upstream/metadata: -------------------------------------------------------------------------------- 1 | --- 2 | Bug-Database: https://github.com/laramies/theHarvester/issues 3 | Bug-Submit: https://github.com/laramies/theHarvester/issues/new 4 | Repository: https://github.com/laramies/theHarvester.git 5 | Repository-Browse: https://github.com/laramies/theHarvester 6 | -------------------------------------------------------------------------------- /debian/watch: -------------------------------------------------------------------------------- 1 | version=4 2 | opts="filenamemangle=s/.*\/v?V?(.*)\.tar\.gz/theharvester-$1.tar.gz/" \ 3 | https://github.com/laramies/theHarvester/tags .*/v?V?(.*)\.tar\.gz 4 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True 3 | show_traceback = True 4 | show_error_codes = True 5 | namespace_packages = True 6 | -------------------------------------------------------------------------------- /proxies.yaml: -------------------------------------------------------------------------------- 1 | http: 2 | - ip:port 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pytest.ini_options] 2 | minversion = "7.1" 3 | addopts = "--no-header --asyncio-mode=auto" 4 | testpaths = [ 5 | "tests", 6 | "tests/discovery/", 7 | ] -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | minversion = 7.1.1 3 | testpaths = tests 4 | asyncio_mode=auto -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements/base.txt 2 | 
-------------------------------------------------------------------------------- /requirements/base.txt: -------------------------------------------------------------------------------- 1 | aiodns==3.0.0 2 | aiofiles==0.8.0 3 | aiohttp==3.8.1 4 | aiomultiprocess==0.9.0 5 | aiosqlite==0.17.0 6 | beautifulsoup4==4.11.1 7 | censys==2.1.7 8 | certifi==2022.6.15 9 | dnspython==2.2.1 10 | fastapi==0.79.0 11 | lxml==4.9.1 12 | netaddr==0.8.0 13 | ujson==5.4.0 14 | pyppeteer==1.0.2 15 | PyYAML==6.0 16 | requests==2.28.1 17 | retrying==1.3.3 18 | setuptools==64.0.3 19 | shodan==1.28.0 20 | slowapi==0.1.5 21 | uvicorn==0.18.2 22 | uvloop==0.16.0; platform_system != "Windows" -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | -r base.txt 2 | flake8==5.0.4 3 | mypy==0.971 4 | mypy-extensions==0.4.3 5 | pyflakes==2.5.0 6 | pytest==7.1.2 7 | pytest-asyncio==0.19.0 8 | types-certifi==2021.10.8.3 9 | types-chardet==5.0.4 10 | types-ujson==5.4.0 11 | types-PyYAML==6.0.11 12 | types-requests==2.28.8 13 | wheel==0.37.1 -------------------------------------------------------------------------------- /restfulHarvest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import uvicorn 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('-H', '--host', default='127.0.0.1', help='IP address to listen on default is 127.0.0.1') 7 | parser.add_argument('-p', '--port', default=5000, help='Port to bind the web server to, default is 5000', type=int) 8 | parser.add_argument('-l', '--log-level', default='info', help='Set logging level, default is info but [critical|error|warning|info|debug|trace] can be set') 9 | parser.add_argument('-r', '--reload', default=False, help='Enable automatic reload used during development of the api', action='store_true') 10 | 11 | args = parser.parse_args() 12 | 13 | if __name__ == '__main__': 14 | uvicorn.run('theHarvester.lib.api.api:app', host=args.host, port=args.port, log_level=args.log_level, reload=args.reload) 15 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501, F405, F403, E402, F401 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from theHarvester.lib.core import Core 3 | 4 | with open('README.md', 'r') as fh: 5 | long_description = fh.read() 6 | 7 | setup( 8 | name='theHarvester', 9 | version=Core.version(), 10 | author="Christian Martorella", 11 | author_email="cmartorella@edge-security.com", 12 | description="theHarvester is a very simple, yet effective tool designed to be used in the early stages of a penetration test", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | url="https://github.com/laramies/theHarvester", 16 | packages=find_packages(exclude=['tests']), 17 | python_requires='>=3.7', 18 | scripts=['bin/theHarvester', 19 | 'bin/restfulHarvest'], 20 | 21 | classifiers=[ 22 | "Programming Language :: Python :: 3", 23 | "Programming Language :: Python :: 3.8", 24 | "Programming Language :: Python :: 3.9", 25 | "Programming Language :: Python :: 3.10", 26 | "License 
:: OSI Approved :: GNU General Public License v2 (GPLv2)", 27 | "Operating System :: OS Independent", 28 | ], 29 | data_files=[ 30 | ('/etc/theHarvester', [ 31 | 'wordlists/general/common.txt', 32 | 'wordlists/dns-big.txt', 33 | 'wordlists/dns-names.txt', 34 | 'wordlists/dorks.txt', 35 | 'wordlists/names_small.txt', 36 | 'api-keys.yaml', 37 | 'proxies.yaml' 38 | ] 39 | ) 40 | ], 41 | ) 42 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/tests/__init__.py -------------------------------------------------------------------------------- /tests/discovery/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/tests/discovery/__init__.py -------------------------------------------------------------------------------- /tests/discovery/test_anubis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | import requests 4 | from theHarvester.lib.core import * 5 | from theHarvester.discovery import anubis 6 | import os 7 | import pytest 8 | 9 | pytestmark = pytest.mark.asyncio 10 | github_ci = os.getenv('GITHUB_ACTIONS') # Github set this to be the following: true instead of True 11 | 12 | 13 | class TestAnubis: 14 | @staticmethod 15 | def domain() -> str: 16 | return 'apple.com' 17 | 18 | async def test_api(self): 19 | base_url = f'https://jldc.me/anubis/subdomains/{TestAnubis.domain()}' 20 | headers = {'User-Agent': Core.get_user_agent()} 21 | request = requests.get(base_url, headers=headers) 22 | assert request.status_code == 200 23 | 24 | async def test_do_search(self): 25 | search = anubis.SearchAnubis(word=TestAnubis.domain()) 26 | await search.do_search() 27 | return await search.get_hostnames() 28 | 29 | async def test_process(self): 30 | await self.test_do_search() 31 | assert len(await self.test_do_search()) > 0 32 | -------------------------------------------------------------------------------- /tests/discovery/test_certspotter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | from theHarvester.lib.core import * 4 | from theHarvester.discovery import certspottersearch 5 | import os 6 | import requests 7 | import pytest 8 | 9 | pytestmark = pytest.mark.asyncio 10 | github_ci = os.getenv('GITHUB_ACTIONS') # Github set this to be the following: true instead of True 11 | 12 | 13 | class TestCertspotter(object): 14 | @staticmethod 15 | def domain() -> str: 16 | return 'metasploit.com' 17 | 18 | async def test_api(self): 19 | base_url = f'https://api.certspotter.com/v1/issuances?domain={TestCertspotter.domain()}&expand=dns_names' 20 | headers = {'User-Agent': Core.get_user_agent()} 21 | request = requests.get(base_url, headers=headers) 22 | assert request.status_code == 200 23 | 24 | async def test_search(self): 25 | search = certspottersearch.SearchCertspoter(TestCertspotter.domain()) 26 | await search.process() 27 | assert isinstance(await search.get_hostnames(), set) 28 | 29 | async def test_search_no_results(self): 30 | search = certspottersearch.SearchCertspoter('radiant.eu') 31 | await search.process() 32 | assert len(await search.get_hostnames()) == 0 33 | 34 | 35 
| if __name__ == '__main__': 36 | pytest.main() 37 | -------------------------------------------------------------------------------- /tests/discovery/test_githubcode.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery import githubcode 2 | from theHarvester.discovery.constants import MissingKey 3 | from theHarvester.lib.core import Core 4 | from unittest.mock import MagicMock 5 | from requests import Response 6 | import pytest 7 | 8 | pytestmark = pytest.mark.asyncio 9 | 10 | 11 | class TestSearchGithubCode: 12 | 13 | class OkResponse: 14 | response = Response() 15 | json = { 16 | "items": [ 17 | { 18 | "text_matches": [ 19 | { 20 | "fragment": "test1" 21 | } 22 | ] 23 | }, 24 | { 25 | "text_matches": [ 26 | { 27 | "fragment": "test2" 28 | } 29 | ] 30 | } 31 | ] 32 | } 33 | response.status_code = 200 34 | response.json = MagicMock(return_value=json) 35 | 36 | class FailureResponse: 37 | response = Response() 38 | response.json = MagicMock(return_value={}) 39 | response.status_code = 401 40 | 41 | class RetryResponse: 42 | response = Response() 43 | response.json = MagicMock(return_value={}) 44 | response.status_code = 403 45 | 46 | class MalformedResponse: 47 | response = Response() 48 | json = { 49 | "items": [ 50 | { 51 | "fail": True 52 | }, 53 | { 54 | "text_matches": [] 55 | }, 56 | { 57 | "text_matches": [ 58 | { 59 | "weird": "result" 60 | } 61 | ] 62 | } 63 | ] 64 | } 65 | response.json = MagicMock(return_value=json) 66 | response.status_code = 200 67 | 68 | async def test_missing_key(self): 69 | with pytest.raises(MissingKey): 70 | Core.github_key = MagicMock(return_value=None) 71 | githubcode.SearchGithubCode(word="test", limit=500) 72 | 73 | async def test_fragments_from_response(self): 74 | Core.github_key = MagicMock(return_value="lol") 75 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 76 | test_result = await test_class_instance.fragments_from_response(self.OkResponse.response.json()) 77 | print('test_result: ', test_result) 78 | assert test_result == ["test1", "test2"] 79 | 80 | async def test_invalid_fragments_from_response(self): 81 | Core.github_key = MagicMock(return_value="lol") 82 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 83 | test_result = await test_class_instance.fragments_from_response(self.MalformedResponse.response.json()) 84 | assert test_result == [] 85 | 86 | async def test_next_page(self): 87 | Core.github_key = MagicMock(return_value="lol") 88 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 89 | test_result = githubcode.SuccessResult(list(), next_page=2, last_page=4) 90 | assert (2 == await test_class_instance.next_page_or_end(test_result)) 91 | 92 | async def test_last_page(self): 93 | Core.github_key = MagicMock(return_value="lol") 94 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 95 | test_result = githubcode.SuccessResult(list(), None, None) 96 | assert (None is await test_class_instance.next_page_or_end(test_result)) 97 | 98 | if __name__ == '__main__': 99 | pytest.main() 100 | -------------------------------------------------------------------------------- /tests/discovery/test_omnisint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | from theHarvester.lib.core import * 4 | from theHarvester.discovery import omnisint 5 | import os 6 | import requests 7 | import pytest 8 | 9 | 
pytestmark = pytest.mark.asyncio 10 | github_ci = os.getenv('GITHUB_ACTIONS') # Github set this to be the following: true instead of True 11 | 12 | 13 | class TestOmnisint(object): 14 | @staticmethod 15 | def domain() -> str: 16 | return 'uber.com' 17 | 18 | @pytest.mark.skipif(github_ci == 'true', reason='Skipping on Github CI due to unstable status code from site') 19 | async def test_api(self): 20 | base_url = f'https://sonar.omnisint.io/all/{TestOmnisint.domain()}' 21 | headers = {'User-Agent': Core.get_user_agent()} 22 | request = requests.get(base_url, headers=headers) 23 | assert request.status_code == 200 24 | 25 | async def test_search(self): 26 | search = omnisint.SearchOmnisint(TestOmnisint.domain()) 27 | await search.process() 28 | assert isinstance(await search.get_hostnames(), list) 29 | 30 | 31 | if __name__ == '__main__': 32 | pytest.main() 33 | -------------------------------------------------------------------------------- /tests/discovery/test_otx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | from theHarvester.lib.core import * 4 | from theHarvester.discovery import otxsearch 5 | import os 6 | import requests 7 | import pytest 8 | 9 | pytestmark = pytest.mark.asyncio 10 | github_ci = os.getenv('GITHUB_ACTIONS') # Github set this to be the following: true instead of True 11 | 12 | 13 | class TestOtx(object): 14 | @staticmethod 15 | def domain() -> str: 16 | return 'metasploit.com' 17 | 18 | async def test_api(self): 19 | base_url = f'https://otx.alienvault.com/api/v1/indicators/domain/{TestOtx.domain()}/passive_dns' 20 | headers = {'User-Agent': Core.get_user_agent()} 21 | request = requests.get(base_url, headers=headers) 22 | assert request.status_code == 200 23 | 24 | async def test_search(self): 25 | search = otxsearch.SearchOtx(TestOtx.domain()) 26 | await search.process() 27 | assert isinstance(await search.get_hostnames(), set) 28 | assert isinstance(await search.get_ips(), set) 29 | 30 | 31 | if __name__ == '__main__': 32 | pytest.main() 33 | -------------------------------------------------------------------------------- /tests/discovery/test_qwantsearch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | from theHarvester.discovery import qwantsearch 4 | import os 5 | import pytest 6 | 7 | pytestmark = pytest.mark.asyncio 8 | github_ci = os.getenv('GITHUB_ACTIONS') # Github set this to be the following: true instead of True 9 | 10 | 11 | class TestSearchQwant(object): 12 | 13 | @staticmethod 14 | def domain() -> str: 15 | return 'example.com' 16 | 17 | async def test_get_start_offset_return_0(self): 18 | search = qwantsearch.SearchQwant(TestSearchQwant.domain(), 0, 200) 19 | assert search.get_start_offset() == 0 20 | 21 | async def test_get_start_offset_return_50(self): 22 | search = qwantsearch.SearchQwant(TestSearchQwant.domain(), 55, 200) 23 | assert search.get_start_offset() == 50 24 | 25 | async def test_get_start_offset_return_100(self): 26 | search = qwantsearch.SearchQwant(TestSearchQwant.domain(), 100, 200) 27 | assert search.get_start_offset() == 100 28 | 29 | async def test_get_emails(self): 30 | search = qwantsearch.SearchQwant(TestSearchQwant.domain(), 0, 200) 31 | await search.process() 32 | assert isinstance(await search.get_emails(), set) 33 | 34 | async def test_get_hostnames(self): 35 | search = qwantsearch.SearchQwant(TestSearchQwant.domain(), 0, 200) 36 | await 
search.process() 37 | assert isinstance(await search.get_hostnames(), list) 38 | -------------------------------------------------------------------------------- /tests/discovery/test_sublist3r.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | import requests 4 | from theHarvester.lib.core import * 5 | from theHarvester.discovery import sublist3r 6 | import os 7 | import pytest 8 | 9 | pytestmark = pytest.mark.asyncio 10 | github_ci = os.getenv('GITHUB_ACTIONS') # Github set this to be the following: true instead of True 11 | 12 | 13 | class TestSublist3r(object): 14 | @staticmethod 15 | def domain() -> str: 16 | return 'target.com' 17 | 18 | async def test_api(self): 19 | base_url = f'https://api.sublist3r.com/search.php?domain={TestSublist3r.domain()}' 20 | headers = {'User-Agent': Core.get_user_agent()} 21 | request = requests.get(base_url, headers=headers) 22 | assert request.status_code == 200 23 | 24 | async def test_do_search(self): 25 | search = sublist3r.SearchSublist3r(TestSublist3r.domain()) 26 | await search.process() 27 | assert isinstance(await search.get_hostnames(), list) 28 | 29 | 30 | if __name__ == '__main__': 31 | pytest.main() 32 | -------------------------------------------------------------------------------- /tests/discovery/test_threatminer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | import requests 4 | from theHarvester.lib.core import * 5 | from theHarvester.discovery import threatminer 6 | import os 7 | import pytest 8 | 9 | pytestmark = pytest.mark.asyncio 10 | github_ci = os.getenv('GITHUB_ACTIONS') # Github set this to be the following: true instead of True 11 | 12 | 13 | class TestThreatminer(object): 14 | @staticmethod 15 | def domain() -> str: 16 | return 'target.com' 17 | 18 | async def test_api(self): 19 | base_url = f'https://api.threatminer.org/v2/domain.php?q={TestThreatminer.domain()}&rt=5' 20 | headers = {'User-Agent': Core.get_user_agent()} 21 | request = requests.get(base_url, headers=headers) 22 | assert request.status_code == 200 23 | 24 | async def test_search(self): 25 | search = threatminer.SearchThreatminer(TestThreatminer.domain()) 26 | await search.process() 27 | assert isinstance(await search.get_hostnames(), set) 28 | assert isinstance(await search.get_ips(), set) 29 | 30 | 31 | if __name__ == '__main__': 32 | pytest.main() 33 | -------------------------------------------------------------------------------- /tests/test_myparser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | 4 | from theHarvester.parsers import myparser 5 | import pytest 6 | 7 | 8 | class TestMyParser(object): 9 | 10 | @pytest.mark.asyncio 11 | async def test_emails(self): 12 | word = 'domain.com' 13 | results = '@domain.com***a@domain***banotherdomain.com***c@domain.com***d@sub.domain.com***' 14 | parse = myparser.Parser(results, word) 15 | emails = sorted(await parse.emails()) 16 | assert emails, ['c@domain.com', 'd@sub.domain.com'] 17 | 18 | 19 | if __name__ == '__main__': 20 | pytest.main() 21 | -------------------------------------------------------------------------------- /theHarvester-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/theHarvester-logo.png 
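The discovery tests above all drive a module the same way: construct it with the target domain, await process(), then read the get_* coroutines. A minimal sketch of that pattern outside pytest, reusing the otxsearch module and the metasploit.com domain from test_otx.py; the domain is only illustrative.

import asyncio
from theHarvester.discovery import otxsearch


async def main() -> None:
    search = otxsearch.SearchOtx('metasploit.com')  # any target domain works here
    await search.process()                          # performs the passive DNS query
    print(sorted(await search.get_hostnames()))     # hostnames found for the domain
    print(sorted(await search.get_ips()))           # IP addresses found for the domain


if __name__ == '__main__':
    asyncio.run(main())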
-------------------------------------------------------------------------------- /theHarvester.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Note: This script runs theHarvester 3 | import sys 4 | import asyncio 5 | from theHarvester import __main__ 6 | 7 | if sys.version_info.major < 3 or sys.version_info.minor < 7: 8 | print('\033[93m[!] Make sure you have Python 3.7+ installed, quitting.\n\n \033[0m') 9 | sys.exit(1) 10 | 11 | if __name__ == '__main__': 12 | platform = sys.platform 13 | if platform == 'win32': 14 | # Required or things will break if trying to take screenshots 15 | import multiprocessing 16 | 17 | multiprocessing.freeze_support() 18 | asyncio.DefaultEventLoopPolicy = asyncio.WindowsSelectorEventLoopPolicy 19 | else: 20 | import uvloop 21 | uvloop.install() 22 | 23 | if "linux" in platform: 24 | import aiomultiprocess 25 | 26 | # As we are not using Windows we can change the spawn method to fork for greater performance 27 | aiomultiprocess.set_context("fork") 28 | asyncio.run(__main__.entry_point()) 29 | -------------------------------------------------------------------------------- /theHarvester/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/theHarvester/__init__.py -------------------------------------------------------------------------------- /theHarvester/discovery/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/theHarvester/discovery/__init__.py -------------------------------------------------------------------------------- /theHarvester/discovery/anubis.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchAnubis: 6 | 7 | def __init__(self, word): 8 | self.word = word 9 | self.totalhosts = list 10 | self.proxy = False 11 | 12 | async def do_search(self): 13 | url = f'https://jldc.me/anubis/subdomains/{self.word}' 14 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 15 | self.totalhosts: list = response[0] 16 | 17 | async def get_hostnames(self) -> Type[list]: 18 | return self.totalhosts 19 | 20 | async def process(self, proxy=False): 21 | self.proxy = proxy 22 | await self.do_search() 23 | -------------------------------------------------------------------------------- /theHarvester/discovery/baidusearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | from theHarvester.parsers import myparser 3 | 4 | 5 | class SearchBaidu: 6 | 7 | def __init__(self, word, limit): 8 | self.word = word 9 | self.total_results = "" 10 | self.server = 'www.baidu.com' 11 | self.hostname = 'www.baidu.com' 12 | self.limit = limit 13 | self.proxy = False 14 | 15 | async def do_search(self): 16 | headers = { 17 | 'Host': self.hostname, 18 | 'User-agent': Core.get_user_agent() 19 | } 20 | base_url = f'https://{self.server}/s?wd=%40{self.word}&pn=xx&oq={self.word}' 21 | urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit] 22 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 23 | for response in responses: 24 | self.total_results += 
response 25 | 26 | async def process(self, proxy=False): 27 | self.proxy = proxy 28 | await self.do_search() 29 | 30 | async def get_emails(self): 31 | rawres = myparser.Parser(self.total_results, self.word) 32 | return await rawres.emails() 33 | 34 | async def get_hostnames(self): 35 | rawres = myparser.Parser(self.total_results, self.word) 36 | return await rawres.hostnames() 37 | -------------------------------------------------------------------------------- /theHarvester/discovery/bevigil.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | 3 | 4 | class SearchBeVigil: 5 | 6 | def __init__(self, word): 7 | self.word = word 8 | self.totalhosts = set() 9 | self.interestingurls = set() 10 | self.key = Core.bevigil_key() 11 | self.proxy = False 12 | 13 | async def do_search(self): 14 | subdomain_endpoint = f"https://osint.bevigil.com/api/{self.word}/subdomains/" 15 | url_endpoint = f"https://osint.bevigil.com/api/{self.word}/urls/" 16 | headers = {'X-Access-Token': self.key} 17 | 18 | responses = await AsyncFetcher.fetch_all([subdomain_endpoint], json=True, proxy=self.proxy, headers=headers) 19 | response = responses[0] 20 | for subdomain in response["subdomains"]: 21 | self.totalhosts.add(subdomain) 22 | 23 | responses = await AsyncFetcher.fetch_all([url_endpoint], json=True, proxy=self.proxy, headers=headers) 24 | response = responses[0] 25 | for url in response["urls"]: 26 | self.interestingurls.add(url) 27 | 28 | async def get_hostnames(self) -> set: 29 | return self.totalhosts 30 | 31 | async def get_interestingurls(self) -> set: 32 | return self.interestingurls 33 | 34 | async def process(self, proxy=False): 35 | self.proxy = proxy 36 | await self.do_search() 37 | -------------------------------------------------------------------------------- /theHarvester/discovery/binaryedgesearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | import asyncio 3 | 4 | 5 | class SearchBinaryEdge: 6 | 7 | def __init__(self, word, limit): 8 | self.word = word 9 | self.totalhosts = set() 10 | self.proxy = False 11 | self.key = Core.binaryedge_key() 12 | self.limit = 501 if limit >= 501 else limit 13 | self.limit = 2 if self.limit == 1 else self.limit 14 | if self.key is None: 15 | raise MissingKey('binaryedge') 16 | 17 | async def do_search(self): 18 | base_url = f'https://api.binaryedge.io/v2/query/domains/subdomain/{self.word}' 19 | headers = {'X-KEY': self.key, 'User-Agent': Core.get_user_agent()} 20 | for page in range(1, self.limit): 21 | params = {'page': page} 22 | response = await AsyncFetcher.fetch_all([base_url], json=True, proxy=self.proxy, params=params, headers=headers) 23 | responses = response[0] 24 | dct = responses 25 | if ('status' in dct.keys() and 'message' in dct.keys()) and \ 26 | (dct['status'] == 400 or 'Bad Parameter' in dct['message'] or 'Error' in dct['message']): 27 | # 400 status code means no more results 28 | break 29 | if 'events' in dct.keys(): 30 | if len(dct['events']) == 0: 31 | break 32 | self.totalhosts.update({host for host in dct['events']}) 33 | await asyncio.sleep(get_delay()) 34 | 35 | async def get_hostnames(self) -> set: 36 | return self.totalhosts 37 | 38 | async def process(self, proxy=False): 39 | self.proxy = proxy 40 | await self.do_search() 41 | -------------------------------------------------------------------------------- /theHarvester/discovery/bingsearch.py: 
-------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | from theHarvester.parsers import myparser 4 | 5 | 6 | class SearchBing: 7 | 8 | def __init__(self, word, limit, start): 9 | self.word = word.replace(' ', '%20') 10 | self.results = "" 11 | self.total_results = "" 12 | self.server = 'www.bing.com' 13 | self.apiserver = 'api.search.live.net' 14 | self.hostname = 'www.bing.com' 15 | self.limit = int(limit) 16 | self.bingApi = Core.bing_key() 17 | self.counter = start 18 | self.proxy = False 19 | 20 | async def do_search(self): 21 | headers = { 22 | 'Host': self.hostname, 23 | 'Cookie': 'SRCHHPGUSR=ADLT=DEMOTE&NRSLT=50', 24 | 'Accept-Language': 'en-us,en', 25 | 'User-agent': Core.get_user_agent() 26 | } 27 | base_url = f'https://{self.server}/search?q=%40"{self.word}"&count=50&first=xx' 28 | urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 50) if num <= self.limit] 29 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 30 | for response in responses: 31 | self.total_results += response 32 | 33 | async def do_search_api(self): 34 | url = 'https://api.cognitive.microsoft.com/bing/v7.0/search?' 35 | params = { 36 | 'q': self.word, 37 | 'count': str(self.limit), 38 | 'offset': '0', 39 | 'mkt': 'en-us', 40 | 'safesearch': 'Off' 41 | } 42 | headers = {'User-Agent': Core.get_user_agent(), 'Ocp-Apim-Subscription-Key': self.bingApi} 43 | self.results = await AsyncFetcher.fetch_all([url], headers=headers, params=params, proxy=self.proxy) 44 | self.total_results += self.results 45 | 46 | async def do_search_vhost(self): 47 | headers = { 48 | 'Host': self.hostname, 49 | 'Cookie': 'mkt=en-US;ui=en-US;SRCHHPGUSR=NEWWND=0&ADLT=DEMOTE&NRSLT=50', 50 | 'Accept-Language': 'en-us,en', 51 | 'User-agent': Core.get_user_agent() 52 | } 53 | base_url = f'http://{self.server}/search?q=ip:{self.word}&go=&count=50&FORM=QBHL&qs=n&first=xx' 54 | urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 50) if num <= self.limit] 55 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 56 | for response in responses: 57 | self.total_results += response 58 | 59 | async def get_emails(self): 60 | rawres = myparser.Parser(self.total_results, self.word) 61 | return await rawres.emails() 62 | 63 | async def get_hostnames(self): 64 | rawres = myparser.Parser(self.total_results, self.word) 65 | return await rawres.hostnames() 66 | 67 | async def get_allhostnames(self): 68 | rawres = myparser.Parser(self.total_results, self.word) 69 | return await rawres.hostnames_all() 70 | 71 | async def process(self, api, proxy=False): 72 | self.proxy = proxy 73 | if api == 'yes': 74 | if self.bingApi is None: 75 | raise MissingKey('BingAPI') 76 | else: 77 | if api == 'yes': 78 | await self.do_search_api() 79 | else: 80 | await self.do_search() 81 | print(f'\tSearching {self.counter} results.') 82 | 83 | async def process_vhost(self): 84 | await self.do_search_vhost() 85 | -------------------------------------------------------------------------------- /theHarvester/discovery/bufferoverun.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | import re 3 | 4 | 5 | class SearchBufferover: 6 | def __init__(self, word): 7 | self.word = word 8 | self.totalhosts = set() 9 | self.totalips = set() 10 | self.proxy = False 11 | 12 | async def do_search(self): 
13 | url = f'https://dns.bufferover.run/dns?q={self.word}' 14 | responses = await AsyncFetcher.fetch_all(urls=[url], json=True, proxy=self.proxy) 15 | responses = responses[0] 16 | dct = responses 17 | 18 | if dct['FDNS_A']: 19 | self.totalhosts: set = { 20 | host.split(',')[0].replace('www.', '') if ',' in host and self.word.replace('www.', '') in host.split(',')[ 21 | 0] in host else 22 | host.split(',')[1] for host in dct['FDNS_A']} 23 | 24 | self.totalips: set = {ip.split(',')[0] for ip in dct['FDNS_A'] if 25 | re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip.split(',')[0])} 26 | 27 | async def get_hostnames(self) -> set: 28 | return self.totalhosts 29 | 30 | async def get_ips(self) -> set: 31 | return self.totalips 32 | 33 | async def process(self, proxy=False): 34 | self.proxy = proxy 35 | await self.do_search() 36 | -------------------------------------------------------------------------------- /theHarvester/discovery/censysearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import MissingKey 2 | from theHarvester.lib.core import Core 3 | from censys.search import CensysCertificates 4 | from censys.common import __version__ 5 | from censys.common.exceptions import ( 6 | CensysRateLimitExceededException, 7 | CensysUnauthorizedException, 8 | ) 9 | 10 | 11 | class SearchCensys: 12 | def __init__(self, domain, limit=500): 13 | self.word = domain 14 | self.key = Core.censys_key() 15 | if self.key[0] is None or self.key[1] is None: 16 | raise MissingKey("Censys ID and/or Secret") 17 | self.totalhosts = set() 18 | self.emails = set() 19 | self.limit = limit 20 | self.proxy = False 21 | 22 | async def do_search(self): 23 | try: 24 | cert_search = CensysCertificates( 25 | api_id=self.key[0], 26 | api_secret=self.key[1], 27 | user_agent=f"censys/{__version__} (theHarvester/{Core.version()}; +https://github.com/laramies/theHarvester)", 28 | ) 29 | except CensysUnauthorizedException: 30 | raise MissingKey('Censys ID and/or Secret') 31 | 32 | query = f"parsed.names: {self.word}" 33 | try: 34 | response = cert_search.search( 35 | query=query, 36 | fields=["parsed.names", "metadata", "parsed.subject.email_address"], 37 | max_records=self.limit, 38 | ) 39 | for cert in response: 40 | self.totalhosts.update(cert.get("parsed.names", [])) 41 | self.emails.update(cert.get("parsed.subject.email_address", [])) 42 | except CensysRateLimitExceededException: 43 | print("Censys rate limit exceeded") 44 | 45 | async def get_hostnames(self) -> set: 46 | return self.totalhosts 47 | 48 | async def get_emails(self) -> set: 49 | return self.emails 50 | 51 | async def process(self, proxy=False): 52 | self.proxy = proxy 53 | await self.do_search() 54 | -------------------------------------------------------------------------------- /theHarvester/discovery/certspottersearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | 3 | 4 | class SearchCertspoter: 5 | 6 | def __init__(self, word): 7 | self.word = word 8 | self.totalhosts = set() 9 | self.proxy = False 10 | 11 | async def do_search(self) -> None: 12 | base_url = f'https://api.certspotter.com/v1/issuances?domain={self.word}&expand=dns_names' 13 | try: 14 | response = await AsyncFetcher.fetch_all([base_url], json=True, proxy=self.proxy) 15 | response = response[0] 16 | if isinstance(response, list): 17 | for dct in response: 18 | for key, value in dct.items(): 19 | if key == 'dns_names': 20 | 
self.totalhosts.update({name for name in value if name}) 21 | elif isinstance(response, dict): 22 | self.totalhosts.update({response['dns_names'] if 'dns_names' in response.keys() else ''}) # type: ignore 23 | else: 24 | self.totalhosts.update({''}) 25 | except Exception as e: 26 | print(e) 27 | 28 | async def get_hostnames(self) -> set: 29 | return self.totalhosts 30 | 31 | async def process(self, proxy=False): 32 | self.proxy = proxy 33 | await self.do_search() 34 | print('\tSearching results.') 35 | -------------------------------------------------------------------------------- /theHarvester/discovery/constants.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | from typing import Union, Optional 3 | import random 4 | 5 | googleUA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 ' \ 6 | 'Safari/537.36 ' 7 | 8 | 9 | async def splitter(links): 10 | """ 11 | Method that tries to remove duplicates 12 | LinkedinLists pulls a lot of profiles with the same name. 13 | This method tries to remove duplicates from the list. 14 | :param links: list of links to remove duplicates from 15 | :return: unique-ish list 16 | """ 17 | unique_list = [] 18 | name_check = [] 19 | for url in links: 20 | tail = url.split("/")[-1] 21 | if len(tail) == 2 or tail == "zh-cn": 22 | tail = url.split("/")[-2] 23 | name = tail.split("-") 24 | if len(name) > 1: 25 | joined_name = name[0] + name[1] 26 | else: 27 | joined_name = name[0] 28 | if joined_name not in name_check: 29 | unique_list.append(url) 30 | name_check.append(joined_name) 31 | return unique_list 32 | 33 | 34 | def filter(lst): 35 | """ 36 | Method that filters list 37 | :param lst: list to be filtered 38 | :return: new filtered list 39 | """ 40 | if lst is None: 41 | return [] 42 | if not isinstance(lst, set): 43 | lst = set(lst) # Remove duplicates. 44 | new_lst = [] 45 | for item in lst: 46 | item = str(item) 47 | if (item[0].isalpha() or item[0].isdigit()) and ('xxx' not in item) and ('..' not in item): 48 | item = item.replace('252f', '').replace('2F', '').replace('2f', '') 49 | new_lst.append(item.lower()) 50 | return new_lst 51 | 52 | 53 | def get_delay() -> float: 54 | """Method that is used to generate a random delay""" 55 | return random.randint(1, 3) - .5 56 | 57 | 58 | async def search(text: str) -> bool: 59 | """Helper function to check if Google has blocked traffic. 
60 | :param text: See if certain text is returned which means Google is blocking us 61 | :return bool: 62 | """ 63 | for line in text.strip().splitlines(): 64 | if 'This page appears when Google automatically detects requests coming from your computer network' in line \ 65 | or 'http://www.google.com/sorry/index' in line or 'https://www.google.com/sorry/index' in line: 66 | # print('\tGoogle is blocking your IP due to too many automated requests, wait or change your IP') 67 | return True 68 | return False 69 | 70 | 71 | async def google_workaround(visit_url: str) -> Union[bool, str]: 72 | """ 73 | Function that makes a request on our behalf, if Google starts to block us 74 | :param visit_url: Url to scrape 75 | :return: Correct html that can be parsed by BS4 76 | """ 77 | url = 'https://websniffer.cc/' 78 | data = { 79 | 'Cookie': '', 80 | 'url': visit_url, 81 | 'submit': 'Submit', 82 | 'type': 'GET&http=1.1', 83 | 'uak': str(random.randint(4, 8)) # select random UA to send to Google 84 | } 85 | returned_html = await AsyncFetcher.post_fetch(url, headers={'User-Agent': Core.get_user_agent()}, data=data) 86 | returned_html = "This page appears when Google automatically detects requests coming from your computer network" \ 87 | if returned_html == "" else returned_html[0] 88 | 89 | returned_html = "" if 'Please Wait... | Cloudflare' in returned_html else returned_html 90 | 91 | if len(returned_html) == 0 or await search(returned_html) or '<html' not in returned_html: 92 | # indicates that google is serving workaround a captcha 93 | # That means we will try out second option which will utilize proxies 94 | return True 95 | # the html we get is malformed for BS4 as there are no greater than or less than signs 96 | if '<html>' in returned_html: 97 | start_index = returned_html.index('<html>') 98 | else: 99 | start_index = returned_html.index('<html') 100 | 101 | end_index = returned_html.index('</html>') + 1 102 | correct_html = returned_html[start_index:end_index] 103 | # Slice list to get the response's html 104 | correct_html = ''.join([ch.strip().replace('&lt;', '<').replace('&gt;', '>') for ch in correct_html]) 105 | return correct_html 106 | 107 | 108 | class MissingKey(Exception): 109 | """ 110 | :raise: When there is a module that has not been provided its API key 111 | """ 112 | def __init__(self, source: Optional[str]): 113 | if source: 114 | self.message = f'\n\033[93m[!] Missing API key for {source}. \033[0m' 115 | else: 116 | self.message = '\n\033[93m[!] Missing CSE id. \033[0m' 117 | 118 | def __str__(self) -> str: 119 | return self.message 120 | -------------------------------------------------------------------------------- /theHarvester/discovery/crtsh.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | from typing import List, Set 3 | 4 | 5 | class SearchCrtsh: 6 | 7 | def __init__(self, word): 8 | self.word = word 9 | self.data = set() 10 | self.proxy = False 11 | 12 | async def do_search(self) -> List: 13 | data: set = set() 14 | try: 15 | url = f'https://crt.sh/?q=%25.{self.word}&output=json' 16 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 17 | response = response[0] 18 | data = set( 19 | [dct['name_value'][2:] if '*.'
== dct['name_value'][:2] else dct['name_value'] 20 | for dct in response]) 21 | data = {domain for domain in data if (domain[0] != '*' and str(domain[0:4]).isnumeric() is False)} 22 | except Exception as e: 23 | print(e) 24 | clean = [] 25 | for x in data: 26 | pre = x.split() 27 | for y in pre: 28 | clean.append(y) 29 | return clean 30 | 31 | async def process(self, proxy=False) -> None: 32 | self.proxy = proxy 33 | data = await self.do_search() 34 | self.data = data 35 | 36 | async def get_hostnames(self) -> Set: 37 | return self.data 38 | -------------------------------------------------------------------------------- /theHarvester/discovery/dnsdumpster.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | from theHarvester.parsers import myparser 3 | import aiohttp 4 | import asyncio 5 | 6 | 7 | class SearchDnsDumpster: 8 | 9 | def __init__(self, word): 10 | self.word = word.replace(' ', '%20') 11 | self.results = "" 12 | self.totalresults = "" 13 | self.server = 'dnsdumpster.com' 14 | self.proxy = False 15 | 16 | async def do_search(self): 17 | try: 18 | agent = Core.get_user_agent() 19 | headers = {'User-Agent': agent} 20 | session = aiohttp.ClientSession(headers=headers) 21 | # create a session to properly verify 22 | url = f'https://{self.server}' 23 | csrftoken = '' 24 | if self.proxy is False: 25 | async with session.get(url, headers=headers) as resp: 26 | cookies = str(resp.cookies) 27 | cookies = cookies.split('csrftoken=') 28 | csrftoken += cookies[1][:cookies[1].find(';')] 29 | else: 30 | async with session.get(url, headers=headers, proxy=self.proxy) as resp: 31 | cookies = str(resp.cookies) 32 | cookies = cookies.split('csrftoken=') 33 | csrftoken += cookies[1][:cookies[1].find(';')] 34 | await asyncio.sleep(2) 35 | 36 | # extract csrftoken from cookies 37 | data = { 38 | 'Cookie': f'csfrtoken={csrftoken}', 'csrfmiddlewaretoken': csrftoken, 39 | 'targetip': self.word, 'user': 'free'} 40 | headers['Referer'] = url 41 | if self.proxy is False: 42 | async with session.post(url, headers=headers, data=data) as resp: 43 | self.results = await resp.text() 44 | else: 45 | async with session.post(url, headers=headers, data=data, proxy=self.proxy) as resp: 46 | self.results = await resp.text() 47 | await session.close() 48 | except Exception as e: 49 | print(f'An exception occurred: {e}') 50 | self.totalresults += self.results 51 | 52 | async def get_hostnames(self): 53 | rawres = myparser.Parser(self.totalresults, self.word) 54 | return await rawres.hostnames() 55 | 56 | async def process(self, proxy=False): 57 | self.proxy = proxy 58 | await self.do_search() # Only need to do it once. 59 | -------------------------------------------------------------------------------- /theHarvester/discovery/dnssearch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | ============ 5 | DNS Browsing 6 | ============ 7 | 8 | Explore the space around known hosts & ips for extra catches. 
9 | """ 10 | 11 | import re 12 | import sys 13 | 14 | from aiodns import DNSResolver 15 | from ipaddress import IPv4Network 16 | from typing import Callable, List, Optional 17 | from theHarvester.lib import hostchecker 18 | 19 | 20 | ##################################################################### 21 | # DNS FORCE 22 | ##################################################################### 23 | 24 | 25 | class DnsForce: 26 | 27 | def __init__(self, domain, dnsserver, verbose=False): 28 | self.domain = domain 29 | self.subdo = False 30 | self.verbose = verbose 31 | # self.dnsserver = [dnsserver] if isinstance(dnsserver, str) else dnsserver 32 | self.dnsserver = list(map(str, dnsserver.split(','))) if isinstance(dnsserver, str) else dnsserver 33 | try: 34 | with open('/etc/theHarvester/wordlists/dns-names.txt', 'r') as file: 35 | self.list = file.readlines() 36 | except FileNotFoundError: 37 | try: 38 | with open('/usr/local/etc/theHarvester/wordlists/dns-names.txt', 'r') as file: 39 | self.list = file.readlines() 40 | except FileNotFoundError: 41 | with open('wordlists/dns-names.txt', 'r') as file: 42 | self.list = file.readlines() 43 | self.domain = domain.replace('www.', '') 44 | self.list = [f'{word.strip()}.{self.domain}' for word in self.list] 45 | 46 | async def run(self): 47 | print(f'Starting DNS brute forcing with {len(self.list)} words') 48 | checker = hostchecker.Checker( 49 | self.list) if self.dnsserver == [] or self.dnsserver == "" or self.dnsserver is None \ 50 | else hostchecker.Checker(self.list, nameserver=self.dnsserver) 51 | hosts, ips = await checker.check() 52 | return hosts, ips 53 | 54 | 55 | ##################################################################### 56 | # DNS REVERSE 57 | ##################################################################### 58 | 59 | 60 | IP_REGEX = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}' 61 | PORT_REGEX = r'\d{1,5}' 62 | NETMASK_REGEX = r'\d{1,2}|' + IP_REGEX 63 | NETWORK_REGEX = r'\b({})(?:\:({}))?(?:\/({}))?\b'.format( 64 | IP_REGEX, 65 | PORT_REGEX, 66 | NETMASK_REGEX) 67 | 68 | 69 | def serialize_ip_range(ip: str, netmask: str = '24') -> str: 70 | """ 71 | Serialize a network range in a constant format, 'x.x.x.x/y'. 72 | 73 | Parameters 74 | ---------- 75 | ip: str. 76 | A serialized ip in the format 'x.x.x.x'. 77 | Extra information like port (':z') or subnet ('/n') 78 | will be ignored. 79 | netmask: str. 80 | The subnet subdivision, represented by a 2 digit netmask. 81 | 82 | Returns 83 | ------- 84 | out: str. 85 | The network OSI address, like '192.168.0.0/24'. 86 | """ 87 | __ip_matches = re.search(NETWORK_REGEX, ip, re.IGNORECASE) 88 | if __ip_matches and __ip_matches.groups(): 89 | __ip = __ip_matches.group(1) 90 | __netmask = netmask if netmask else __ip_matches.group(3) 91 | if __ip and __netmask: 92 | return str(IPv4Network('{}/{}'.format(__ip, __netmask), strict=False)) 93 | elif __ip: 94 | return str(IPv4Network('{}/{}'.format(__ip, '24'), strict=False)) 95 | 96 | # invalid input ip 97 | return '' 98 | 99 | 100 | def list_ips_in_network_range(iprange: str) -> List[str]: 101 | """ 102 | List all the IPs in the range. 103 | 104 | Parameters 105 | ---------- 106 | iprange: str. 107 | A serialized ip range, like '1.2.3.0/24'. 108 | The last digit can be set to anything, it will be ignored. 109 | 110 | Returns 111 | ------- 112 | out: list. 113 | The list of IPs in the range. 
114 | """ 115 | try: 116 | __network = IPv4Network(iprange, strict=False) 117 | return [__address.exploded for __address in __network.hosts()] 118 | except Exception: 119 | return [] 120 | 121 | 122 | async def reverse_single_ip(ip: str, resolver: DNSResolver) -> str: 123 | """ 124 | Reverse a single IP and output the linked CNAME, if it exists. 125 | Parameters 126 | ---------- 127 | :param ip: IP address to reverse 128 | :param resolver: DNS server to use 129 | 130 | Returns 131 | ------- 132 | :return str: with the corresponding CNAME or None 133 | """ 134 | try: 135 | __host = await resolver.gethostbyaddr(ip) 136 | return __host.name if __host else '' 137 | except Exception: 138 | return '' 139 | 140 | 141 | async def reverse_all_ips_in_range(iprange: str, callback: Callable, nameservers: Optional[List[str]] = None) -> None: 142 | """ 143 | Reverse all the IPs stored in a network range. 144 | All the queries are made concurrently. 145 | 146 | Parameters 147 | ---------- 148 | iprange: str. 149 | An IPv4 range formatted as 'x.x.x.x/y'. 150 | The last 2 digits of the ip can be set to anything, 151 | they will be ignored. 152 | callback: Callable. 153 | Arbitrary postprocessing function. 154 | nameservers: List[str]. 155 | Optional list of DNS servers. 156 | 157 | Returns 158 | ------- 159 | out: None. 160 | """ 161 | __resolver = DNSResolver(timeout=4, nameservers=nameservers) 162 | for __ip in list_ips_in_network_range(iprange): 163 | log_query(__ip) 164 | __host = await reverse_single_ip(ip=__ip, resolver=__resolver) 165 | callback(__host) 166 | log_result(__host) 167 | 168 | 169 | ##################################################################### 170 | # IO 171 | ##################################################################### 172 | 173 | 174 | def log_query(ip: str) -> None: 175 | """ 176 | Display the current query in the console. 177 | 178 | Parameters 179 | ---------- 180 | ip: str. 181 | Queried ip. 182 | 183 | Results 184 | ------- 185 | out: None. 186 | """ 187 | sys.stdout.write(chr(27) + '[2K' + chr(27) + '[G') 188 | sys.stdout.write('\r' + ip + ' - ') 189 | sys.stdout.flush() 190 | 191 | 192 | def log_result(host: str) -> None: 193 | """ 194 | Display the query result in the console. 195 | 196 | Parameters 197 | ---------- 198 | host: str. 199 | Host name returned by the DNS query. 200 | 201 | Results 202 | ------- 203 | out: None. 204 | """ 205 | if host: 206 | print(host) 207 | 208 | 209 | def generate_postprocessing_callback(target: str, **allhosts: List[str]) -> Callable: 210 | """ 211 | Postprocess the query results asynchronously too, instead of waiting for 212 | the querying stage to be completely finished. 213 | 214 | Parameters 215 | ---------- 216 | target: str. 217 | The domain wanted as TLD. 218 | allhosts: List. 219 | A collection of all the subdomains -of target- found so far. 220 | 221 | Returns 222 | ------- 223 | out: Callable. 224 | A function that will update the collection of target subdomains 225 | when the query result is satisfying. 
226 | """ 227 | 228 | def append_matching_hosts(host: str) -> None: 229 | if host and target in host: 230 | for __name, __hosts in allhosts.items(): 231 | if host not in __hosts: 232 | __hosts.append(host) 233 | 234 | return append_matching_hosts 235 | -------------------------------------------------------------------------------- /theHarvester/discovery/duckduckgosearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | from theHarvester.parsers import myparser 4 | import json 5 | 6 | 7 | class SearchDuckDuckGo: 8 | 9 | def __init__(self, word, limit): 10 | self.word = word 11 | self.results = "" 12 | self.totalresults = "" 13 | self.dorks = [] 14 | self.links = [] 15 | self.database = 'https://duckduckgo.com/?q=' 16 | self.api = 'https://api.duckduckgo.com/?q=x&format=json&pretty=1' # Currently using API. 17 | self.quantity = '100' 18 | self.limit = limit 19 | self.proxy = False 20 | 21 | async def do_search(self): 22 | # Do normal scraping. 23 | url = self.api.replace('x', self.word) 24 | headers = {'User-Agent': googleUA} 25 | first_resp = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy) 26 | self.results = first_resp[0] 27 | self.totalresults += self.results 28 | urls = await self.crawl(self.results) 29 | urls = {url for url in urls if len(url) > 5} 30 | all_resps = await AsyncFetcher.fetch_all(urls) 31 | self.totalresults += ''.join(all_resps) 32 | 33 | async def crawl(self, text): 34 | """ 35 | Function parses json and returns URLs. 36 | :param text: formatted json 37 | :return: set of URLs 38 | """ 39 | urls = set() 40 | try: 41 | load = json.loads(text) 42 | for keys in load.keys(): # Iterate through keys of dict. 43 | val = load.get(keys) 44 | if isinstance(val, int) or isinstance(val, dict) or val is None: 45 | continue 46 | if isinstance(val, list): 47 | if len(val) == 0: # Make sure not indexing an empty list. 48 | continue 49 | val = val[0] # First value should be dict. 50 | if isinstance(val, dict): # Sanity check. 51 | for key in val.keys(): 52 | value = val.get(key) 53 | if isinstance(value, str) and value != '' and 'https://' in value or 'http://' in value: 54 | urls.add(value) 55 | if isinstance(val, str) and val != '' and 'https://' in val or 'http://' in val: 56 | urls.add(val) 57 | tmp = set() 58 | for url in urls: 59 | if '<' in url and 'href=' in url: # Format is 60 | equal_index = url.index('=') 61 | true_url = '' 62 | for ch in url[equal_index + 1:]: 63 | if ch == '"': 64 | tmp.add(true_url) 65 | break 66 | true_url += ch 67 | else: 68 | if url != '': 69 | tmp.add(url) 70 | return tmp 71 | except Exception as e: 72 | print(f'Exception occurred: {e}') 73 | return [] 74 | 75 | async def get_emails(self): 76 | rawres = myparser.Parser(self.totalresults, self.word) 77 | return await rawres.emails() 78 | 79 | async def get_hostnames(self): 80 | rawres = myparser.Parser(self.totalresults, self.word) 81 | return await rawres.hostnames() 82 | 83 | async def process(self, proxy=False): 84 | self.proxy = proxy 85 | await self.do_search() # Only need to search once since using API. 
86 | -------------------------------------------------------------------------------- /theHarvester/discovery/fullhuntsearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchFullHunt: 6 | 7 | def __init__(self, word): 8 | self.word = word 9 | self.key = Core.fullhunt_key() 10 | if self.key is None: 11 | raise MissingKey('fullhunt') 12 | self.total_results = None 13 | self.proxy = False 14 | 15 | async def do_search(self): 16 | url = f'https://fullhunt.io/api/v1/domain/{self.word}/subdomains' 17 | response = await AsyncFetcher.fetch_all([url], json=True, headers={'User-Agent': Core.get_user_agent(), 18 | 'X-API-KEY': self.key}, 19 | proxy=self.proxy) 20 | self.total_results = response[0]['hosts'] 21 | 22 | async def get_hostnames(self) -> set: 23 | return self.total_results 24 | 25 | async def process(self, proxy=False): 26 | self.proxy = proxy 27 | await self.do_search() 28 | -------------------------------------------------------------------------------- /theHarvester/discovery/githubcode.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | from theHarvester.parsers import myparser 4 | from typing import List, Dict, Any, Optional, NamedTuple, Tuple 5 | import asyncio 6 | import aiohttp 7 | import urllib.parse as urlparse 8 | import random 9 | 10 | 11 | class RetryResult(NamedTuple): 12 | time: float 13 | 14 | 15 | class SuccessResult(NamedTuple): 16 | fragments: List[str] 17 | next_page: Optional[int] 18 | last_page: Optional[int] 19 | 20 | 21 | class ErrorResult(NamedTuple): 22 | status_code: int 23 | body: Any 24 | 25 | 26 | class SearchGithubCode: 27 | 28 | def __init__(self, word, limit): 29 | self.word = word 30 | self.total_results = "" 31 | self.server = 'api.github.com' 32 | self.limit = limit 33 | self.counter = 0 34 | self.page = 1 35 | self.key = Core.github_key() 36 | # If you don't have a personal access token, github narrows your search capabilities significantly 37 | # rate limits you more severely 38 | # https://developer.github.com/v3/search/#rate-limit 39 | if self.key is None: 40 | raise MissingKey('Github') 41 | self.proxy = False 42 | 43 | @staticmethod 44 | async def fragments_from_response(json_data: dict) -> List[str]: 45 | items: List[Dict[str, Any]] = json_data.get('items') or list() 46 | fragments: List[str] = list() 47 | for item in items: 48 | matches = item.get("text_matches") or list() 49 | for match in matches: 50 | fragments.append(match.get("fragment")) 51 | 52 | return [fragment for fragment in fragments if fragment is not None] 53 | 54 | @staticmethod 55 | async def page_from_response(page: str, links) -> Optional[Any]: 56 | page_link = links.get(page) 57 | if page_link: 58 | parsed = urlparse.urlparse(str(page_link.get("url"))) 59 | params = urlparse.parse_qs(parsed.query) 60 | pages: List[Any] = params.get('page', [None]) 61 | page_number = pages[0] and int(pages[0]) 62 | return page_number 63 | else: 64 | return None 65 | 66 | async def handle_response(self, response: Tuple[str, dict, int, Any]): 67 | text, json_data, status, links = response 68 | if status == 200: 69 | results = await self.fragments_from_response(json_data) 70 | next_page = await self.page_from_response("next", links) 71 | last_page = await self.page_from_response("last", links) 72 | return 
SuccessResult(results, next_page, last_page) 73 | elif status == 429 or status == 403: 74 | return RetryResult(60) 75 | else: 76 | try: 77 | return ErrorResult(status, json_data) 78 | except ValueError: 79 | return ErrorResult(status, text) 80 | 81 | async def do_search(self, page: Optional[int]) -> Tuple[str, dict, int, Any]: 82 | if page is None: 83 | url = f'https://{self.server}/search/code?q="{self.word}"' 84 | else: 85 | url = f'https://{self.server}/search/code?q="{self.word}"&page={page}' 86 | headers = { 87 | 'Host': self.server, 88 | 'User-agent': Core.get_user_agent(), 89 | 'Accept': "application/vnd.github.v3.text-match+json", 90 | 'Authorization': f'token {self.key}' 91 | } 92 | 93 | async with aiohttp.ClientSession(headers=headers) as sess: 94 | if self.proxy: 95 | async with sess.get(url, proxy=random.choice(Core.proxy_list())) as resp: 96 | return await resp.text(), await resp.json(), resp.status, resp.links 97 | else: 98 | async with sess.get(url) as resp: 99 | return await resp.text(), await resp.json(), resp.status, resp.links 100 | 101 | @staticmethod 102 | async def next_page_or_end(result: SuccessResult) -> Optional[int]: 103 | if result.next_page is not None: 104 | return result.next_page 105 | else: 106 | return result.last_page 107 | 108 | async def process(self, proxy=False): 109 | self.proxy = proxy 110 | try: 111 | while self.counter <= self.limit and self.page is not None: 112 | api_response = await self.do_search(self.page) 113 | result = await self.handle_response(api_response) 114 | if isinstance(result, SuccessResult): 115 | print(f'\tSearching {self.counter} results.') 116 | for fragment in result.fragments: 117 | self.total_results += fragment 118 | self.counter = self.counter + 1 119 | self.page = await self.next_page_or_end(result) 120 | await asyncio.sleep(get_delay()) 121 | elif isinstance(result, RetryResult): 122 | sleepy_time = get_delay() + result.time 123 | print(f'\tRetrying page in {sleepy_time} seconds...') 124 | await asyncio.sleep(sleepy_time) 125 | elif isinstance(result, ErrorResult): 126 | raise Exception(f"\tException occurred: status_code: {result.status_code} reason: {result.body}") 127 | else: 128 | raise Exception("\tUnknown exception occurred") 129 | except Exception as e: 130 | print(f'An exception has occurred: {e}') 131 | 132 | async def get_emails(self): 133 | rawres = myparser.Parser(self.total_results, self.word) 134 | return await rawres.emails() 135 | 136 | async def get_hostnames(self): 137 | rawres = myparser.Parser(self.total_results, self.word) 138 | return await rawres.hostnames() 139 | -------------------------------------------------------------------------------- /theHarvester/discovery/hackertarget.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | 3 | 4 | class SearchHackerTarget: 5 | """ 6 | Class uses the HackerTarget api to gather subdomains and ips 7 | """ 8 | 9 | def __init__(self, word): 10 | self.word = word 11 | self.total_results = "" 12 | self.hostname = 'https://api.hackertarget.com' 13 | self.proxy = False 14 | self.results = None 15 | 16 | async def do_search(self): 17 | headers = {'User-agent': Core.get_user_agent()} 18 | urls = [f'{self.hostname}/hostsearch/?q={self.word}', f'{self.hostname}/reversedns/?q={self.word}'] 19 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 20 | for response in responses: 21 | self.total_results += response.replace(",", ":") 22 | 23 | async def process(self, 
proxy=False): 24 | self.proxy = proxy 25 | await self.do_search() 26 | 27 | async def get_hostnames(self) -> list: 28 | return [result for result in self.total_results.splitlines() if 'No PTR records found' not in result] 29 | -------------------------------------------------------------------------------- /theHarvester/discovery/huntersearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchHunter: 6 | 7 | def __init__(self, word, limit, start): 8 | self.word = word 9 | self.limit = limit 10 | self.limit = 10 if limit > 10 else limit 11 | self.start = start 12 | self.key = Core.hunter_key() 13 | if self.key is None: 14 | raise MissingKey('Hunter') 15 | self.total_results = "" 16 | self.counter = start 17 | self.database = f'https://api.hunter.io/v2/domain-search?domain={self.word}&api_key={self.key}&limit=10' 18 | self.proxy = False 19 | self.hostnames = [] 20 | self.emails = [] 21 | 22 | async def do_search(self): 23 | # First determine if user account is not a free account, this call is free 24 | is_free = True 25 | headers = {'User-Agent': Core.get_user_agent()} 26 | acc_info_url = f'https://api.hunter.io/v2/account?api_key={self.key}' 27 | response = await AsyncFetcher.fetch_all([acc_info_url], headers=headers, json=True) 28 | is_free = is_free if 'plan_name' in response[0]['data'].keys() and response[0]['data']['plan_name'].lower() \ 29 | == 'free' else False 30 | # Extract total number of requests that are available for account 31 | 32 | total_requests_avail = response[0]['data']['requests']['searches']['available'] - response[0]['data']['requests']['searches']['used'] 33 | if is_free: 34 | response = await AsyncFetcher.fetch_all([self.database], headers=headers, proxy=self.proxy, json=True) 35 | self.emails, self.hostnames = await self.parse_resp(json_resp=response[0]) 36 | else: 37 | # Determine total number of emails that are available 38 | # As the most emails you can get within one query is 100 39 | # This is only done where paid accounts are in play 40 | hunter_dinfo_url = f'https://api.hunter.io/v2/email-count?domain={self.word}' 41 | response = await AsyncFetcher.fetch_all([hunter_dinfo_url], headers=headers, proxy=self.proxy, json=True) 42 | total_number_reqs = response[0]['data']['total'] // 100 43 | # Parse out meta field within initial JSON response to determine total number of results 44 | if total_requests_avail < total_number_reqs: 45 | print('WARNING: account does not have enough requests to gather all emails') 46 | print(f'Total requests available: {total_requests_avail}, total requests ' 47 | f'needed to be made: {total_number_reqs}') 48 | print('RETURNING current results, if you would still like to ' 49 | 'run this module comment out the if request') 50 | return 51 | self.limit = 100 52 | # max number of emails you can get per request is 100 53 | # increments of 100 with offset determining where to start 54 | # See docs for more details: https://hunter.io/api-documentation/v2#domain-search 55 | for offset in range(0, 100 * total_number_reqs, 100): 56 | req_url = f'https://api.hunter.io/v2/domain-search?domain={self.word}&api_key={self.key}&limit{self.limit}&offset={offset}' 57 | response = await AsyncFetcher.fetch_all([req_url], headers=headers, proxy=self.proxy, json=True) 58 | temp_emails, temp_hostnames = await self.parse_resp(response[0]) 59 | self.emails.extend(temp_emails) 60 | 
self.hostnames.extend(temp_hostnames) 61 | await asyncio.sleep(1) 62 | 63 | async def parse_resp(self, json_resp): 64 | emails = list(sorted({email['value'] for email in json_resp['data']['emails']})) 65 | domains = list(sorted({source['domain'] for email in json_resp['data']['emails'] for source in email['sources'] 66 | if self.word in source['domain']})) 67 | return emails, domains 68 | 69 | async def process(self, proxy=False): 70 | self.proxy = proxy 71 | await self.do_search() # Only need to do it once. 72 | 73 | async def get_emails(self): 74 | return self.emails 75 | 76 | async def get_hostnames(self): 77 | return self.hostnames 78 | -------------------------------------------------------------------------------- /theHarvester/discovery/intelxsearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | from theHarvester.parsers import intelxparser 4 | import asyncio 5 | import json 6 | import requests 7 | 8 | 9 | class SearchIntelx: 10 | 11 | def __init__(self, word): 12 | self.word = word 13 | self.key = Core.intelx_key() 14 | if self.key is None: 15 | raise MissingKey('Intelx') 16 | self.database = 'https://2.intelx.io' 17 | self.results = None 18 | self.info = () 19 | self.limit = 10000 20 | self.proxy = False 21 | self.offset = -1 22 | 23 | async def do_search(self): 24 | try: 25 | # Based on: https://github.com/IntelligenceX/SDK/blob/master/Python/intelxapi.py 26 | # API requests self identification 27 | # https://intelx.io/integrations 28 | headers = {'x-key': self.key, 'User-Agent': f'{Core.get_user_agent()}-theHarvester'} 29 | data = { 30 | "term": self.word, 31 | "buckets": [], 32 | "lookuplevel": 0, 33 | "maxresults": self.limit, 34 | "timeout": 5, 35 | "datefrom": "", 36 | "dateto": "", 37 | "sort": 2, 38 | "media": 0, 39 | "terminate": [], 40 | "target": 0 41 | } 42 | 43 | total_resp = requests.post(f'{self.database}/phonebook/search', headers=headers, json=data) 44 | phonebook_id = json.loads(total_resp.text)['id'] 45 | await asyncio.sleep(2) 46 | 47 | # Fetch results from phonebook based on ID 48 | resp = await AsyncFetcher.fetch_all( 49 | [f'{self.database}/phonebook/search/result?id={phonebook_id}&limit={self.limit}&offset={self.offset}'], 50 | headers=headers, json=True, proxy=self.proxy) 51 | resp = resp[0] 52 | self.results = resp 53 | except Exception as e: 54 | print(f'An exception has occurred in Intelx: {e}') 55 | 56 | async def process(self, proxy=False): 57 | self.proxy = proxy 58 | await self.do_search() 59 | intelx_parser = intelxparser.Parser() 60 | self.info = await intelx_parser.parse_dictionaries(self.results) 61 | 62 | async def get_emails(self): 63 | return self.info[0] 64 | 65 | async def get_interestingurls(self): 66 | return self.info[1] 67 | -------------------------------------------------------------------------------- /theHarvester/discovery/omnisint.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | 3 | 4 | class SearchOmnisint: 5 | def __init__(self, word): 6 | self.word = word 7 | self.totalhosts = set() 8 | self.totalips = set() 9 | self.proxy = False 10 | 11 | async def do_search(self): 12 | base_url = f'https://sonar.omnisint.io/all/{self.word}?page=1' 13 | responses = await AsyncFetcher.fetch_all([base_url], json=True, headers={'User-Agent': Core.get_user_agent()}, 14 | proxy=self.proxy) 15 | self.totalhosts = list({host for host in 
responses[0]}) 16 | 17 | async def get_hostnames(self) -> set: 18 | return self.totalhosts 19 | 20 | async def get_ips(self) -> set: 21 | return self.totalips 22 | 23 | async def process(self, proxy=False): 24 | self.proxy = proxy 25 | await self.do_search() 26 | -------------------------------------------------------------------------------- /theHarvester/discovery/otxsearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | import re 3 | 4 | 5 | class SearchOtx: 6 | 7 | def __init__(self, word): 8 | self.word = word 9 | self.totalhosts = set() 10 | self.totalips = set() 11 | self.proxy = False 12 | 13 | async def do_search(self): 14 | url = f'https://otx.alienvault.com/api/v1/indicators/domain/{self.word}/passive_dns' 15 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 16 | responses = response[0] 17 | dct = responses 18 | self.totalhosts: set = {host['hostname'] for host in dct['passive_dns']} 19 | # filter out ips that are just called NXDOMAIN 20 | self.totalips: set = {ip['address'] for ip in dct['passive_dns'] 21 | if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip['address'])} 22 | 23 | async def get_hostnames(self) -> set: 24 | return self.totalhosts 25 | 26 | async def get_ips(self) -> set: 27 | return self.totalips 28 | 29 | async def process(self, proxy=False): 30 | self.proxy = proxy 31 | await self.do_search() 32 | -------------------------------------------------------------------------------- /theHarvester/discovery/pentesttools.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | import json 4 | import time 5 | 6 | 7 | class SearchPentestTools: 8 | 9 | def __init__(self, word): 10 | # Script is largely based off https://pentest-tools.com/public/api_client.py.txt 11 | self.word = word 12 | self.key = Core.pentest_tools_key() 13 | if self.key is None: 14 | raise MissingKey('PentestTools') 15 | self.total_results = [] 16 | self.api = f'https://pentest-tools.com/api?key={self.key}' 17 | self.proxy = False 18 | 19 | async def poll(self, scan_id): 20 | while True: 21 | time.sleep(3) 22 | # Get the status of our scan 23 | scan_status_data = { 24 | 'op': 'get_scan_status', 25 | 'scan_id': scan_id 26 | } 27 | responses = await AsyncFetcher.post_fetch(url=self.api, data=json.dumps(scan_status_data), proxy=self.proxy) 28 | res_json = json.loads(responses.strip()) 29 | if res_json['op_status'] == 'success': 30 | if res_json['scan_status'] != 'waiting' and res_json['scan_status'] != 'running': 31 | getoutput_data = { 32 | 'op': 'get_output', 33 | 'scan_id': scan_id, 34 | 'output_format': 'json' 35 | } 36 | responses = await AsyncFetcher.post_fetch(url=self.api, 37 | data=json.dumps(getoutput_data), 38 | proxy=self.proxy) 39 | 40 | res_json = json.loads(responses.strip('\n')) 41 | self.total_results = await self.parse_json(res_json) 42 | break 43 | else: 44 | print(f"Operation get_scan_status failed because: {res_json['error']}. 
{res_json['details']}") 45 | break 46 | 47 | @staticmethod 48 | async def parse_json(json_results): 49 | status = json_results['op_status'] 50 | if status == 'success': 51 | scan_tests = json_results['scan_output']['output_json'] 52 | output_data = scan_tests[0]['output_data'] 53 | host_to_ip = [f'{subdomain[0]}:{subdomain[1]}' for subdomain in output_data if len(subdomain) > 0] 54 | return host_to_ip 55 | return [] 56 | 57 | async def get_hostnames(self) -> list: 58 | return self.total_results 59 | 60 | async def do_search(self): 61 | subdomain_payload = { 62 | 'op': 'start_scan', 63 | 'tool_id': 20, 64 | 'tool_params': { 65 | 'target': f'{self.word}', 66 | 'web_details': 'off', 67 | 'do_smart_search': 'off' 68 | } 69 | } 70 | responses = await AsyncFetcher.post_fetch(url=self.api, data=json.dumps(subdomain_payload), proxy=self.proxy) 71 | res_json = json.loads(responses.strip()) 72 | if res_json['op_status'] == 'success': 73 | scan_id = res_json['scan_id'] 74 | await self.poll(scan_id) 75 | 76 | async def process(self, proxy=False): 77 | self.proxy = proxy 78 | await self.do_search() # Only need to do it once. 79 | -------------------------------------------------------------------------------- /theHarvester/discovery/projectdiscovery.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchDiscovery: 6 | 7 | def __init__(self, word): 8 | self.word = word 9 | self.key = Core.projectdiscovery_key() 10 | if self.key is None: 11 | raise MissingKey('ProjectDiscovery') 12 | self.total_results = None 13 | self.proxy = False 14 | 15 | async def do_search(self): 16 | url = f'https://dns.projectdiscovery.io/dns/{self.word}/subdomains' 17 | response = await AsyncFetcher.fetch_all([url], json=True, headers={'User-Agent': Core.get_user_agent(), 18 | 'Authorization': self.key}, 19 | proxy=self.proxy) 20 | self.total_results = [f'{domains}.{self.word}' for domains in response[0]['subdomains']] 21 | 22 | async def get_hostnames(self) -> set: 23 | return self.total_results 24 | 25 | async def process(self, proxy=False): 26 | self.proxy = proxy 27 | await self.do_search() 28 | -------------------------------------------------------------------------------- /theHarvester/discovery/qwantsearch.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | from json.decoder import JSONDecodeError 4 | 5 | from theHarvester.lib.core import * 6 | from theHarvester.parsers import myparser 7 | 8 | 9 | class SearchQwant: 10 | def __init__(self, word, start, limit): 11 | self.word = word 12 | self.total_results = "" 13 | self.limit = int(limit) 14 | self.start = int(start) 15 | self.proxy = False 16 | 17 | def get_start_offset(self) -> int: 18 | """ 19 | print(get_start_offset(0)) 20 | >>> 0 21 | print(get_start_offset(7)) 22 | >>> 0 23 | print(get_start_offset(25)) 24 | >>> 20 25 | print(get_start_offset(42)) 26 | >>> 40 27 | """ 28 | start = int(math.floor(self.start / 10.0)) * 10 29 | return max(start, 0) 30 | 31 | async def do_search(self) -> None: 32 | headers = {'User-agent': Core.get_user_agent()} 33 | 34 | start = self.get_start_offset() 35 | limit = self.limit + start 36 | step = 10 37 | 38 | api_urls = [ 39 | f"https://api.qwant.com/api/search/web?count=10&offset={str(offset)}&q={self.word}&t=web&r=US&device=desktop&safesearch=0&locale=en_US&uiv=4" 40 | for offset in range(start, limit, step) 41 | ] 42 | 43 
| responses = await AsyncFetcher.fetch_all(api_urls, headers=headers, proxy=self.proxy) 44 | 45 | for response in responses: 46 | try: 47 | json_response = json.loads(response) 48 | except JSONDecodeError: 49 | # sometimes error 502 from server 50 | continue 51 | 52 | try: 53 | response_items = json_response['data']['result']['items'] 54 | except KeyError: 55 | if json_response.get("status", None) \ 56 | and json_response.get("error", None) == 24: 57 | # https://www.qwant.com/anti_robot 58 | print("Rate limit reached - IP Blocked until captcha is solved") 59 | break 60 | continue 61 | 62 | for response_item in response_items: 63 | desc = response_item.get('desc', '') 64 | """ 65 | response_item[0]['desc'] = "end of previous description." 66 | response_item[1]['desc'] = "john.doo@company.com start the next description" 67 | total_results = "end of first description.john.doo@company.com" 68 | get_emails() = "description.john.doo@company.com" 69 | """ 70 | self.total_results += " " 71 | self.total_results += desc 72 | 73 | async def get_emails(self) -> set: 74 | parser = myparser.Parser(self.total_results, self.word) 75 | return await parser.emails() 76 | 77 | async def get_hostnames(self) -> list: 78 | parser = myparser.Parser(self.total_results, self.word) 79 | return await parser.hostnames() 80 | 81 | async def process(self, proxy=False) -> None: 82 | self.proxy = proxy 83 | await self.do_search() 84 | -------------------------------------------------------------------------------- /theHarvester/discovery/rapiddns.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchRapidDns: 6 | 7 | def __init__(self, word): 8 | self.word = word 9 | self.total_results = [] 10 | self.proxy = False 11 | 12 | async def do_search(self): 13 | try: 14 | headers = {'User-agent': Core.get_user_agent()} 15 | # TODO see if it's worth adding sameip searches 16 | # f'{self.hostname}/sameip/{self.word}?full=1#result' 17 | urls = [f'https://rapiddns.io/subdomain/{self.word}?full=1#result'] 18 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 19 | if len(responses[0]) <= 1: 20 | return self.total_results 21 | soup = BeautifulSoup(responses[0], 'html.parser') 22 | rows = soup.find("table").find("tbody").find_all("tr") 23 | if rows: 24 | # Sanity check 25 | for row in rows: 26 | cells = row.find_all("td") 27 | if len(cells) >= 0: 28 | # sanity check 29 | subdomain = str(cells[0].get_text()) 30 | if cells[-1].get_text() == 'CNAME': 31 | self.total_results.append(f'{subdomain}') 32 | else: 33 | self.total_results.append(f'{subdomain}:{str(cells[1].get_text()).strip()}') 34 | self.total_results = list({domain for domain in self.total_results}) 35 | except Exception as e: 36 | print(f'An exception has occurred: {str(e)}') 37 | 38 | async def process(self, proxy=False): 39 | self.proxy = proxy 40 | await self.do_search() 41 | 42 | async def get_hostnames(self): 43 | return self.total_results 44 | -------------------------------------------------------------------------------- /theHarvester/discovery/rocketreach.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | import asyncio 4 | 5 | 6 | class SearchRocketReach: 7 | 8 | def __init__(self, word, limit): 9 | self.ips = set() 10 | self.word = word 11 | self.key = Core.rocketreach_key() 12 | 
if self.key is None: 13 | raise MissingKey('RocketReach') 14 | self.hosts = set() 15 | self.proxy = False 16 | self.baseurl = 'https://api.rocketreach.co/v2/api/search' 17 | self.links = set() 18 | self.limit = limit 19 | 20 | async def do_search(self): 21 | try: 22 | headers = { 23 | 'Api-Key': self.key, 24 | 'Content-Type': 'application/json', 25 | 'User-Agent': Core.get_user_agent() 26 | } 27 | 28 | next_page = 1 # track pagniation 29 | for count in range(1, self.limit): 30 | data = f'{{"query":{{"company_domain": ["{self.word}"]}}, "start": {next_page}, "page_size": 100}}' 31 | result = await AsyncFetcher.post_fetch(self.baseurl, headers=headers, data=data, json=True) 32 | if 'detail' in result.keys() and 'error' in result.keys() and 'Subscribe to a plan to access' in result['detail']: 33 | # No more results can be fetched 34 | break 35 | if 'detail' in result.keys() and 'Request was throttled.' in result['detail']: 36 | # Rate limit has been triggered need to sleep extra 37 | print(f'RocketReach requests have been throttled; ' 38 | f'{result["detail"].split(" ", 3)[-1].replace("available", "availability")}') 39 | break 40 | if 'profiles' in dict(result).keys(): 41 | if len(result['profiles']) == 0: 42 | break 43 | for profile in result['profiles']: 44 | if 'linkedin_url' in dict(profile).keys(): 45 | self.links.add(profile['linkedin_url']) 46 | if 'pagination' in dict(result).keys(): 47 | next_page = int(result['pagination']['next']) 48 | if next_page > int(result['pagination']['total']): 49 | break 50 | 51 | await asyncio.sleep(get_delay() + 2) 52 | 53 | except Exception as e: 54 | print(f'An exception has occurred: {e}') 55 | 56 | async def get_links(self): 57 | return self.links 58 | 59 | async def process(self, proxy=False): 60 | self.proxy = proxy 61 | await self.do_search() 62 | -------------------------------------------------------------------------------- /theHarvester/discovery/securitytrailssearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | from theHarvester.parsers import securitytrailsparser 4 | import asyncio 5 | 6 | 7 | class SearchSecuritytrail: 8 | 9 | def __init__(self, word): 10 | self.word = word 11 | self.key = Core.security_trails_key() 12 | if self.key is None: 13 | raise MissingKey('Securitytrail') 14 | self.results = "" 15 | self.totalresults = "" 16 | self.api = 'https://api.securitytrails.com/v1/' 17 | self.info = () 18 | self.proxy = False 19 | 20 | async def authenticate(self) -> None: 21 | # Method to authenticate API key before sending requests. 22 | headers = {'APIKEY': self.key} 23 | url = f'{self.api}ping' 24 | auth_responses = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy) 25 | auth_responses = auth_responses[0] 26 | if 'False' in auth_responses or 'Invalid authentication' in auth_responses: 27 | print('\tKey could not be authenticated exiting program.') 28 | await asyncio.sleep(2) 29 | 30 | async def do_search(self) -> None: 31 | # https://api.securitytrails.com/v1/domain/domain.com 32 | url = f'{self.api}domain/{self.word}' 33 | headers = {'APIKEY': self.key} 34 | response = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy) 35 | await asyncio.sleep(2) # Not random delay because 2 seconds is required due to rate limit. 36 | self.results = response[0] 37 | self.totalresults += self.results 38 | url += '/subdomains' # Get subdomains now. 
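        # SecurityTrails serves the subdomain list from a separate endpoint
        # (/v1/domain/<domain>/subdomains), so a second request is made below and its
        # raw body is appended to self.totalresults for the parser to consume.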
39 | subdomain_response = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy) 40 | await asyncio.sleep(2) 41 | self.results = subdomain_response[0] 42 | self.totalresults += self.results 43 | 44 | async def process(self, proxy=False) -> None: 45 | self.proxy = proxy 46 | await self.authenticate() 47 | await self.do_search() 48 | parser = securitytrailsparser.Parser(word=self.word, text=self.totalresults) 49 | self.info = await parser.parse_text() 50 | # Create parser and set self.info to tuple returned from parsing text. 51 | print('\tDone Searching Results') 52 | 53 | async def get_ips(self) -> set: 54 | return self.info[0] 55 | 56 | async def get_hostnames(self) -> set: 57 | return self.info[1] 58 | -------------------------------------------------------------------------------- /theHarvester/discovery/shodansearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | from shodan import exception 4 | from shodan import Shodan 5 | from collections import OrderedDict 6 | 7 | 8 | class SearchShodan: 9 | 10 | def __init__(self): 11 | self.key = Core.shodan_key() 12 | if self.key is None: 13 | raise MissingKey('Shodan') 14 | self.api = Shodan(self.key) 15 | self.hostdatarow = [] 16 | self.tracker: OrderedDict = OrderedDict() 17 | 18 | async def search_ip(self, ip): 19 | try: 20 | ipaddress = ip 21 | results = self.api.host(ipaddress) 22 | asn = '' 23 | domains = list() 24 | hostnames = list() 25 | ip_str = '' 26 | isp = '' 27 | org = '' 28 | ports = list() 29 | title = '' 30 | server = '' 31 | product = '' 32 | technologies = list() 33 | 34 | data_first_dict = dict(results['data'][0]) 35 | 36 | if 'ip_str' in data_first_dict.keys(): 37 | ip_str += data_first_dict['ip_str'] 38 | 39 | if 'http' in data_first_dict.keys(): 40 | http_results_dict = dict(data_first_dict['http']) 41 | if 'title' in http_results_dict.keys(): 42 | title_val = str(http_results_dict['title']).strip() 43 | if title_val != 'None': 44 | title += title_val 45 | if 'components' in http_results_dict.keys(): 46 | for key in http_results_dict['components'].keys(): 47 | technologies.append(key) 48 | if 'server' in http_results_dict.keys(): 49 | server_val = str(http_results_dict['server']).strip() 50 | if server_val != 'None': 51 | server += server_val 52 | 53 | for key, value in results.items(): 54 | if key == 'asn': 55 | asn += value 56 | if key == 'domains': 57 | value = list(value) 58 | value.sort() 59 | domains.extend(value) 60 | if key == 'hostnames': 61 | value = [host.strip() for host in list(value)] 62 | value.sort() 63 | hostnames.extend(value) 64 | if key == 'isp': 65 | isp += value 66 | if key == 'org': 67 | org += str(value) 68 | if key == 'ports': 69 | value = list(value) 70 | value.sort() 71 | ports.extend(value) 72 | if key == 'product': 73 | product += value 74 | 75 | technologies = list(set(technologies)) 76 | 77 | self.tracker[ip] = {'asn': asn.strip(), 'domains': domains, 'hostnames': hostnames, 78 | 'ip_str': ip_str.strip(), 'isp': isp.strip(), 'org': org.strip(), 79 | 'ports': ports, 'product': product.strip(), 80 | 'server': server.strip(), 'technologies': technologies, 'title': title.strip()} 81 | 82 | return self.tracker 83 | except exception.APIError: 84 | print(f'{ip}: Not in Shodan') 85 | self.tracker[ip] = 'Not in Shodan' 86 | except Exception as e: 87 | # print(f'Error occurred in the Shodan IP search module: {e}') 88 | self.tracker[ip] = f'Error occurred 
in the Shodan IP search module: {e}' 89 | finally: 90 | return self.tracker 91 | -------------------------------------------------------------------------------- /theHarvester/discovery/sublist3r.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchSublist3r: 6 | 7 | def __init__(self, word): 8 | self.word = word 9 | self.totalhosts = list 10 | self.proxy = False 11 | 12 | async def do_search(self): 13 | url = f'https://api.sublist3r.com/search.php?domain={self.word}' 14 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 15 | self.totalhosts: list = response[0] 16 | 17 | async def get_hostnames(self) -> Type[list]: 18 | return self.totalhosts 19 | 20 | async def process(self, proxy=False): 21 | self.proxy = proxy 22 | await self.do_search() 23 | -------------------------------------------------------------------------------- /theHarvester/discovery/takeover.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | import re 3 | 4 | 5 | class TakeOver: 6 | 7 | def __init__(self, hosts): 8 | # NOTE THIS MODULE IS ACTIVE RECON 9 | self.hosts = hosts 10 | self.results = "" 11 | self.totalresults = "" 12 | self.proxy = False 13 | # Thank you to https://github.com/EdOverflow/can-i-take-over-xyz for these fingerprints 14 | self.fingerprints = {"'Trying to access your account?'": 'Campaign Monitor', 15 | '404 Not Found': 'Fly.io', 16 | '404 error unknown site!': 'Pantheon', 17 | 'Do you want to register *.wordpress.com?': 'Wordpress', 18 | 'Domain uses DO name serves with no records in DO.': 'Digital Ocean', 19 | "It looks like you may have taken a wrong turn somewhere. Don't worry...it happens to all of us.": 'LaunchRock', 20 | 'No Site For Domain': 'Kinsta', 21 | 'No settings were found for this company:': 'Help Scout', 22 | 'Project doesnt exist... yet!': 'Readme.io', 23 | 'Repository not found': 'Bitbucket', 24 | 'The feed has not been found.': 'Feedpress', 25 | 'No such app': 'Heroku', 26 | 'The specified bucket does not exist': 'AWS/S3', 27 | 'The thing you were looking for is no longer here, or never was': 'Ghost', 28 | "There isn't a Github Pages site here.": 'Github', 29 | 'This UserVoice subdomain is currently available!': 'UserVoice', 30 | "Uh oh. 
That page doesn't exist.": 'Intercom', 31 | "We could not find what you're looking for.": 'Help Juice', 32 | "Whatever you were looking for doesn't currently exist at this address": 'Tumblr', 33 | 'is not a registered InCloud YouTrack': 'JetBrains', 34 | 'page not found': 'Uptimerobot', 35 | 'project not found': 'Surge.sh'} 36 | 37 | async def check(self, url, resp): 38 | # Simple function that takes response and checks if any fingerprints exists 39 | # If a fingerprint exists figures out which one and prints it out 40 | regex = re.compile("(?=(" + "|".join(map(re.escape, list(self.fingerprints.keys()))) + "))") 41 | # Sanitize fingerprints 42 | matches = re.findall(regex, resp) 43 | for match in matches: 44 | print(f'\t\033[91m Takeover detected: {url}\033[1;32;40m') 45 | if match in self.fingerprints.keys(): 46 | # Sanity check as to not error out 47 | print(f'\t\033[91m Type of takeover is: {self.fingerprints[match]}\033[1;32;40m') 48 | 49 | async def do_take(self): 50 | try: 51 | if len(self.hosts) > 0: 52 | tup_resps: list = await AsyncFetcher.fetch_all(self.hosts, takeover=True, proxy=self.proxy) 53 | # Returns a list of tuples in this format: (url, response) 54 | tup_resps = [tup for tup in tup_resps if tup[1] != ''] 55 | # Filter out responses whose responses are empty strings (indicates errored) 56 | for url, resp in tup_resps: 57 | await self.check(url, resp) 58 | else: 59 | return 60 | except Exception as e: 61 | print(e) 62 | 63 | async def process(self, proxy=False): 64 | self.proxy = proxy 65 | await self.do_take() 66 | -------------------------------------------------------------------------------- /theHarvester/discovery/threatcrowd.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchThreatcrowd: 6 | 7 | def __init__(self, word): 8 | self.word = word.replace(' ', '%20') 9 | self.hostnames = list() 10 | self.ips = list() 11 | self.proxy = False 12 | 13 | async def do_search(self): 14 | base_url = f'https://www.threatcrowd.org/searchApi/v2/domain/report/?domain={self.word}' 15 | headers = {'User-Agent': Core.get_user_agent()} 16 | try: 17 | responses = await AsyncFetcher.fetch_all([base_url], headers=headers, proxy=self.proxy, json=True) 18 | resp = responses[0] 19 | self.ips = {ip['ip_address'] for ip in resp['resolutions'] if len(ip['ip_address']) > 4} 20 | self.hostnames = set(list(resp['subdomains'])) 21 | except Exception as e: 22 | print(e) 23 | 24 | async def get_ips(self) -> List: 25 | return self.ips 26 | 27 | async def get_hostnames(self) -> List: 28 | return self.hostnames 29 | 30 | async def process(self, proxy=False): 31 | self.proxy = proxy 32 | await self.do_search() 33 | await self.get_hostnames() 34 | -------------------------------------------------------------------------------- /theHarvester/discovery/threatminer.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchThreatminer: 6 | 7 | def __init__(self, word): 8 | self.word = word 9 | self.totalhosts = list 10 | self.totalips = list 11 | self.proxy = False 12 | 13 | async def do_search(self): 14 | url = f'https://api.threatminer.org/v2/domain.php?q={self.word}&rt=5' 15 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 16 | self.totalhosts: set = {host for host in response[0]['results']} 17 | second_url = 
f'https://api.threatminer.org/v2/domain.php?q={self.word}&rt=2' 18 | secondresp = await AsyncFetcher.fetch_all([second_url], json=True, proxy=self.proxy) 19 | try: 20 | self.totalips: set = {resp['ip'] for resp in secondresp[0]['results']} 21 | except TypeError: 22 | pass 23 | 24 | async def get_hostnames(self) -> Type[list]: 25 | return self.totalhosts 26 | 27 | async def get_ips(self) -> Type[list]: 28 | return self.totalips 29 | 30 | async def process(self, proxy=False): 31 | self.proxy = proxy 32 | await self.do_search() 33 | -------------------------------------------------------------------------------- /theHarvester/discovery/urlscan.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchUrlscan: 6 | def __init__(self, word): 7 | self.word = word 8 | self.totalhosts = list() 9 | self.totalips = list() 10 | self.interestingurls = list() 11 | self.totalasns = list() 12 | self.proxy = False 13 | 14 | async def do_search(self): 15 | url = f'https://urlscan.io/api/v1/search/?q=domain:{self.word}' 16 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 17 | resp = response[0] 18 | self.totalhosts = {f"{page['page']['domain']}" for page in resp['results']} 19 | self.totalips = {f"{page['page']['ip']}" for page in resp['results'] if 'ip' in page['page'].keys()} 20 | self.interestingurls = {f"{page['page']['url']}" for page in resp['results'] if self.word in page['page']['url'] and 'url' in page['page'].keys()} 21 | self.totalasns = {f"{page['page']['asn']}" for page in resp['results'] if 'asn' in page['page'].keys()} 22 | 23 | async def get_hostnames(self) -> List: 24 | return self.totalhosts 25 | 26 | async def get_ips(self) -> List: 27 | return self.totalips 28 | 29 | async def get_interestingurls(self) -> List: 30 | return self.interestingurls 31 | 32 | async def get_asns(self) -> List: 33 | return self.totalasns 34 | 35 | async def process(self, proxy=False): 36 | self.proxy = proxy 37 | await self.do_search() 38 | -------------------------------------------------------------------------------- /theHarvester/discovery/virustotal.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | 4 | 5 | class SearchVirustotal: 6 | 7 | def __init__(self, word): 8 | self.key = Core.virustotal_key() 9 | if self.key is None: 10 | raise MissingKey('virustotal') 11 | self.word = word 12 | self.proxy = False 13 | self.hostnames = [] 14 | 15 | async def do_search(self): 16 | # TODO determine if more endpoints can yield useful info given a domain 17 | # based on: https://developers.virustotal.com/reference/domains-relationships 18 | # base_url = "https://www.virustotal.com/api/v3/domains/domain/subdomains?limit=40" 19 | headers = { 20 | 'User-Agent': Core.get_user_agent(), 21 | "Accept": "application/json", 22 | "x-apikey": self.key 23 | } 24 | base_url = f"https://www.virustotal.com/api/v3/domains/{self.word}/subdomains?limit=40" 25 | cursor = '' 26 | count = 0 27 | fail_counter = 0 28 | counter = 0 29 | breakcon = False 30 | while True: 31 | if breakcon: 32 | break 33 | # rate limit is 4 per minute 34 | # TODO add timer logic if proven to be needed 35 | # in the meantime sleeping 16 seconds should eliminate hitting the rate limit 36 | # in case rate limit is hit, fail counter exists and sleep for 65 seconds 37 | send_url = base_url + 
"&cursor=" + cursor if cursor != '' and len(cursor) > 2 else base_url 38 | responses = await AsyncFetcher.fetch_all([send_url], headers=headers, proxy=self.proxy, json=True) 39 | jdata = responses[0] 40 | if 'data' not in jdata.keys(): 41 | await asyncio.sleep(60 + 5) 42 | fail_counter += 1 43 | if 'meta' in jdata.keys(): 44 | cursor = jdata['meta']['cursor'] if 'cursor' in jdata['meta'].keys() else '' 45 | if len(cursor) == 0 and 'data' in jdata.keys(): 46 | # if cursor no longer is within the meta field have hit last entry 47 | breakcon = True 48 | count += jdata['meta']['count'] 49 | if count == 0 or fail_counter >= 2: 50 | break 51 | if 'data' in jdata.keys(): 52 | data = jdata['data'] 53 | self.hostnames.extend(await self.parse_hostnames(data, self.word)) 54 | counter += 1 55 | await asyncio.sleep(16) 56 | self.hostnames = list(sorted(set(self.hostnames))) 57 | # verify domains such as x.x.com.multicdn.x.com are parsed properly 58 | self.hostnames = [host for host in self.hostnames if ((len(host.split('.')) >= 3) and host.split('.')[-2] == self.word.split('.')[-2])] 59 | 60 | async def get_hostnames(self) -> list: 61 | return self.hostnames 62 | 63 | @staticmethod 64 | async def parse_hostnames(data, word): 65 | total_subdomains = set() 66 | for attribute in data: 67 | total_subdomains.add(attribute['id'].replace('"', '').replace('www.', '')) 68 | attributes = attribute['attributes'] 69 | total_subdomains.update( 70 | {value['value'].replace('"', '').replace('www.', '') for value in attributes['last_dns_records'] if 71 | word in value['value']}) 72 | if 'last_https_certificate' in attributes.keys(): 73 | total_subdomains.update({value.replace('"', '').replace('www.', '') for value in 74 | attributes['last_https_certificate']['extensions']['subject_alternative_name'] 75 | if word in value}) 76 | total_subdomains = list(sorted(total_subdomains)) 77 | # Other false positives may occur over time and yes there are other ways to parse this, feel free to implement 78 | # them and submit a PR or raise an issue if you run into this filtering not being enough 79 | # TODO determine if parsing 'v=spf1 include:_spf-x.acme.com include:_spf-x.acme.com' is worth parsing 80 | total_subdomains = [x for x in total_subdomains if not str(x).endswith('edgekey.net') and not str(x).endswith('akadns.net') and 'include:_spf' not in str(x)] 81 | total_subdomains.sort() 82 | return total_subdomains 83 | 84 | async def process(self, proxy=False): 85 | self.proxy = proxy 86 | await self.do_search() 87 | -------------------------------------------------------------------------------- /theHarvester/discovery/yahoosearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import * 2 | from theHarvester.parsers import myparser 3 | 4 | 5 | class SearchYahoo: 6 | 7 | def __init__(self, word, limit): 8 | self.word = word 9 | self.total_results = "" 10 | self.server = 'search.yahoo.com' 11 | self.limit = limit 12 | self.proxy = False 13 | 14 | async def do_search(self): 15 | base_url = f'https://{self.server}/search?p=%40{self.word}&b=xx&pz=10' 16 | headers = { 17 | 'Host': self.server, 18 | 'User-agent': Core.get_user_agent() 19 | } 20 | urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit] 21 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy) 22 | for response in responses: 23 | self.total_results += response 24 | 25 | async def process(self): 26 | await self.do_search() 27 | 28 | 
async def get_emails(self): 29 | rawres = myparser.Parser(self.total_results, self.word) 30 | toparse_emails = await rawres.emails() 31 | emails = set() 32 | # strip out numbers and dashes for emails that look like xxx-xxx-xxxemail@host.tld 33 | for email in toparse_emails: 34 | email = str(email) 35 | if '-' in email and email[0].isdigit() and email.index('-') <= 9: 36 | while email[0] == '-' or email[0].isdigit(): 37 | email = email[1:] 38 | emails.add(email) 39 | return list(emails) 40 | 41 | async def get_hostnames(self, proxy=False): 42 | self.proxy = proxy 43 | rawres = myparser.Parser(self.total_results, self.word) 44 | return await rawres.hostnames() 45 | -------------------------------------------------------------------------------- /theHarvester/discovery/zoomeyesearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import * 2 | from theHarvester.lib.core import * 3 | from theHarvester.parsers import myparser 4 | import asyncio 5 | import re 6 | 7 | 8 | class SearchZoomEye: 9 | 10 | def __init__(self, word, limit): 11 | self.word = word 12 | self.limit = limit 13 | self.key = Core.zoomeye_key() 14 | # NOTE for ZoomEye you get a system recharge on the 1st of every month 15 | # Which resets your balance to 10000 requests 16 | # If you wish to extract as many subdomains as possible visit the fetch_subdomains 17 | # To see how 18 | if self.key is None: 19 | raise MissingKey('zoomeye') 20 | self.baseurl = 'https://api.zoomeye.org/host/search' 21 | self.proxy = False 22 | self.totalasns = list() 23 | self.totalhosts = list() 24 | self.interestingurls = list() 25 | self.totalips = list() 26 | self.totalemails = list() 27 | # Regex used is directly from: https://github.com/GerbenJavado/LinkFinder/blob/master/linkfinder.py#L29 28 | # Maybe one day it will be a pip package 29 | # Regardless LinkFinder is an amazing tool! 30 | self.iurl_regex = r""" 31 | (?:"|') # Start newline delimiter 32 | ( 33 | ((?:[a-zA-Z]{1,10}://|//) # Match a scheme [a-Z]*1-10 or // 34 | [^"'/]{1,}\. # Match a domainname (any character + dot) 35 | [a-zA-Z]{2,}[^"']{0,}) # The domainextension and/or path 36 | | 37 | ((?:/|\.\./|\./) # Start with /,../,./ 38 | [^"'><,;| *()(%%$^/\\\[\]] # Next character can't be... 39 | [^"'><,;|()]{1,}) # Rest of the characters can't be 40 | | 41 | ([a-zA-Z0-9_\-/]{1,}/ # Relative endpoint with / 42 | [a-zA-Z0-9_\-/]{1,} # Resource name 43 | \.(?:[a-zA-Z]{1,4}|action) # Rest + extension (length 1-4 or action) 44 | (?:[\?|#][^"|']{0,}|)) # ? or # mark with parameters 45 | | 46 | ([a-zA-Z0-9_\-/]{1,}/ # REST API (no extension) with / 47 | [a-zA-Z0-9_\-/]{3,} # Proper REST endpoints usually have 3+ chars 48 | (?:[\?|#][^"|']{0,}|)) # ? or # mark with parameters 49 | | 50 | ([a-zA-Z0-9_\-]{1,} # filename 51 | \.(?:php|asp|aspx|jsp|json| 52 | action|html|js|txt|xml) # . + extension 53 | (?:[\?|#][^"|']{0,}|)) # ? 
or # mark with parameters 54 | ) 55 | (?:"|') # End newline delimiter 56 | """ 57 | self.iurl_regex = re.compile(self.iurl_regex, re.VERBOSE) 58 | 59 | async def fetch_subdomains(self): 60 | # Based on docs from: https://www.zoomeye.org/doc#search-sub-domain-ip 61 | headers = { 62 | 'API-KEY': self.key, 63 | 'User-Agent': Core.get_user_agent() 64 | } 65 | 66 | subdomain_search_endpoint = f'https://api.zoomeye.org/domain/search?q={self.word}&type=0&' 67 | 68 | response = await AsyncFetcher.fetch_all([subdomain_search_endpoint + 'page=1'], 69 | json=True, proxy=self.proxy, headers=headers) 70 | # Make initial request to determine total number of subdomains 71 | resp = response[0] 72 | if resp['status'] != 200: 73 | return 74 | total = resp['total'] 75 | # max number of results per request seems to be 30 76 | # NOTE: If you wish to get as many subdomains as possible 77 | # Change the line below to: 78 | # self.limit = (total // 30) + 1 79 | self.limit = self.limit if total > self.limit else (total // 30) + 1 80 | self.totalhosts.extend([item["name"] for item in resp["list"]]) 81 | for i in range(2, self.limit): 82 | response = await AsyncFetcher.fetch_all([subdomain_search_endpoint + f'page={i}'], 83 | json=True, proxy=self.proxy, headers=headers) 84 | resp = response[0] 85 | if resp['status'] != 200: 86 | return 87 | found_subdomains = [item["name"] for item in resp["list"]] 88 | if len(found_subdomains) == 0: 89 | break 90 | self.totalhosts.extend(found_subdomains) 91 | if i % 10 == 0: 92 | await asyncio.sleep(get_delay() + 1) 93 | 94 | async def do_search(self): 95 | headers = { 96 | 'API-KEY': self.key, 97 | 'User-Agent': Core.get_user_agent() 98 | } 99 | # Fetch subdomains first 100 | await self.fetch_subdomains() 101 | params = ( 102 | ('query', f'site:{self.word}'), 103 | ('page', '1'), 104 | ) 105 | response = await AsyncFetcher.fetch_all([self.baseurl], json=True, proxy=self.proxy, headers=headers, 106 | params=params) 107 | # First request determines how many pages there in total 108 | resp = response[0] 109 | total_pages = int(resp['available']) 110 | self.limit = self.limit if total_pages > self.limit else total_pages 111 | self.limit = 3 if self.limit == 2 else self.limit 112 | cur_page = 2 if self.limit >= 2 else -1 113 | # Means there is only one page 114 | # hostnames, emails, ips, asns, iurls 115 | nomatches_counter = 0 116 | # cur_page = -1 117 | if cur_page == -1: 118 | # No need to do loop just parse and leave 119 | if 'matches' in resp.keys(): 120 | hostnames, emails, ips, asns, iurls = await self.parse_matches(resp['matches']) 121 | self.totalhosts.extend(hostnames) 122 | self.totalemails.extend(emails) 123 | self.totalips.extend(ips) 124 | self.totalasns.extend(asns) 125 | self.interestingurls.extend(iurls) 126 | else: 127 | if 'matches' in resp.keys(): 128 | # Parse out initial results and then continue to loop 129 | hostnames, emails, ips, asns, iurls = await self.parse_matches(resp['matches']) 130 | self.totalhosts.extend(hostnames) 131 | self.totalemails.extend(emails) 132 | self.totalips.extend(ips) 133 | self.totalasns.extend(asns) 134 | self.interestingurls.extend(iurls) 135 | 136 | for num in range(2, self.limit): 137 | # print(f'Currently on page: {num}') 138 | params = ( 139 | ('query', f'site:{self.word}'), 140 | ('page', f'{num}'), 141 | ) 142 | response = await AsyncFetcher.fetch_all([self.baseurl], json=True, proxy=self.proxy, headers=headers, 143 | params=params) 144 | resp = response[0] 145 | if 'matches' not in resp.keys(): 146 | print(f'Your resp: 
{resp}') 147 | print('Match not found in keys') 148 | break 149 | 150 | hostnames, emails, ips, asns, iurls = await self.parse_matches(resp['matches']) 151 | 152 | if len(hostnames) == 0 and len(emails) == 0 and len(ips) == 0 \ 153 | and len(asns) == 0 and len(iurls) == 0: 154 | nomatches_counter += 1 155 | 156 | if nomatches_counter >= 5: 157 | break 158 | 159 | self.totalhosts.extend(hostnames) 160 | self.totalemails.extend(emails) 161 | self.totalips.extend(ips) 162 | self.totalasns.extend(asns) 163 | self.interestingurls.extend(iurls) 164 | 165 | if num % 10 == 0: 166 | await asyncio.sleep(get_delay() + 1) 167 | 168 | async def parse_matches(self, matches): 169 | # Helper function to parse items from match json 170 | # ips = {match["ip"] for match in matches} 171 | ips = set() 172 | iurls = set() 173 | hostnames = set() 174 | asns = set() 175 | emails = set() 176 | for match in matches: 177 | try: 178 | ips.add(match['ip']) 179 | 180 | if 'geoinfo' in match.keys(): 181 | asns.add(int(match['geoinfo']['asn'])) 182 | 183 | if 'rdns_new' in match.keys(): 184 | rdns_new = match['rdns_new'] 185 | 186 | if ',' in rdns_new: 187 | parts = str(rdns_new).split(',') 188 | rdns_new = parts[0] 189 | if len(parts) == 2: 190 | hostnames.add(parts[1]) 191 | rdns_new = rdns_new[:-1] if rdns_new[-1] == '.' else rdns_new 192 | hostnames.add(rdns_new) 193 | else: 194 | rdns_new = rdns_new[:-1] if rdns_new[-1] == '.' else rdns_new 195 | hostnames.add(rdns_new) 196 | 197 | if 'rdns' in match.keys(): 198 | rdns = match['rdns'] 199 | rdns = rdns[:-1] if rdns[-1] == '.' else rdns 200 | hostnames.add(rdns) 201 | 202 | if 'portinfo' in match.keys(): 203 | # re. 204 | temp_emails = set(await self.parse_emails(match['portinfo']['banner'])) 205 | emails.update(temp_emails) 206 | hostnames.update(set(await self.parse_hostnames(match['portinfo']['banner']))) 207 | iurls = {str(iurl.group(1)).replace('"', '') for iurl 208 | in re.finditer(self.iurl_regex, match['portinfo']['banner']) 209 | if self.word in str(iurl.group(1))} 210 | except Exception as e: 211 | print(f'An exception has occurred: {e}') 212 | return hostnames, emails, ips, asns, iurls 213 | 214 | async def process(self, proxy=False): 215 | self.proxy = proxy 216 | await self.do_search() # Only need to do it once. 
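    # The get_* coroutines below return de-duplicated views of the lists populated
    # by fetch_subdomains() and the paged host search performed in do_search().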
217 | 218 | async def parse_emails(self, content): 219 | rawres = myparser.Parser(content, self.word) 220 | return await rawres.emails() 221 | 222 | async def parse_hostnames(self, content): 223 | rawres = myparser.Parser(content, self.word) 224 | return await rawres.hostnames() 225 | 226 | async def get_hostnames(self): 227 | return set(self.totalhosts) 228 | 229 | async def get_emails(self): 230 | return set(self.totalemails) 231 | 232 | async def get_ips(self): 233 | return set(self.totalips) 234 | 235 | async def get_asns(self): 236 | return set(self.totalasns) 237 | 238 | async def get_interestingurls(self): 239 | return set(self.interestingurls) 240 | -------------------------------------------------------------------------------- /theHarvester/lib/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['hostchecker'] 2 | -------------------------------------------------------------------------------- /theHarvester/lib/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/theHarvester/lib/api/__init__.py -------------------------------------------------------------------------------- /theHarvester/lib/api/api.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import List 3 | import os 4 | from fastapi import FastAPI, Header, Query, Request 5 | from fastapi.responses import HTMLResponse, UJSONResponse 6 | from slowapi import Limiter, _rate_limit_exceeded_handler 7 | from slowapi.errors import RateLimitExceeded 8 | from slowapi.util import get_remote_address 9 | from starlette.responses import RedirectResponse 10 | from starlette.staticfiles import StaticFiles 11 | 12 | from theHarvester import __main__ 13 | 14 | limiter = Limiter(key_func=get_remote_address) 15 | app = FastAPI(title='Restful Harvest', description='Rest API for theHarvester powered by FastAPI', version='0.0.2') 16 | app.state.limiter = limiter 17 | app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler) 18 | 19 | # This is where we will host files that arise if the user specifies a filename 20 | try: 21 | app.mount('/static', StaticFiles(directory='theHarvester/lib/api/static/'), name='static') 22 | except RuntimeError: 23 | static_path = os.path.expanduser('~/.local/share/theHarvester/static/') 24 | if not os.path.isdir(static_path): 25 | os.makedirs(static_path) 26 | app.mount('/static', StaticFiles(directory='~/.local/share/theHarvester/static/'), name='static') 27 | 28 | 29 | @app.get('/', response_class=HTMLResponse) 30 | async def root(*, user_agent: str = Header(None)): 31 | # very basic user agent filtering 32 | if user_agent and ('gobuster' in user_agent or 'sqlmap' in user_agent or 'rustbuster' in user_agent): 33 | response = RedirectResponse(app.url_path_for('bot')) 34 | return response 35 | 36 | html = """ 37 | 38 | 39 | 40 | theHarvester API 41 | 47 | 48 | 49 |
50 | 51 | 52 | theHarvester logo 53 | 54 | 55 | 56 | 57 | """ 58 | return html 59 | 60 | 61 | @app.get('/nicebot') 62 | async def bot(): 63 | # nice bot 64 | string = {'bot': 'These are not the droids you are looking for'} 65 | return string 66 | 67 | 68 | @app.get('/sources', response_class=UJSONResponse) 69 | @limiter.limit('5/minute') 70 | async def getsources(request: Request): 71 | # Endpoint for user to query for available sources theHarvester supports 72 | # Rate limit of 5 requests per minute 73 | sources = __main__.Core.get_supportedengines() 74 | return {'sources': sources} 75 | 76 | 77 | @app.get('/dnsbrute', response_class=UJSONResponse) 78 | @limiter.limit('5/minute') 79 | async def dnsbrute(request: Request, user_agent: str = Header(None), 80 | domain: str = Query(..., description='Domain to be brute forced')): 81 | # Endpoint for user to signal to do DNS brute forcing 82 | # Rate limit of 5 requests per minute 83 | # basic user agent filtering 84 | if user_agent and ('gobuster' in user_agent or 'sqlmap' in user_agent or 'rustbuster' in user_agent): 85 | response = RedirectResponse(app.url_path_for('bot')) 86 | return response 87 | dns_bruteforce = await __main__.start(argparse.Namespace(dns_brute=True, 88 | dns_lookup=False, 89 | dns_server=False, 90 | dns_tld=False, 91 | domain=domain, 92 | filename='', 93 | google_dork=False, 94 | limit=500, 95 | proxies=False, 96 | shodan=False, 97 | source=','.join([]), 98 | start=0, 99 | take_over=False, 100 | virtual_host=False)) 101 | return {'dns_bruteforce': dns_bruteforce} 102 | 103 | 104 | @app.get('/query', response_class=UJSONResponse) 105 | @limiter.limit('2/minute') 106 | async def query(request: Request, dns_server: str = Query(""), user_agent: str = Header(None), 107 | dns_brute: bool = Query(False), 108 | dns_lookup: bool = Query(False), 109 | dns_tld: bool = Query(False), 110 | filename: str = Query(""), 111 | google_dork: bool = Query(False), proxies: bool = Query(False), shodan: bool = Query(False), 112 | take_over: bool = Query(False), virtual_host: bool = Query(False), 113 | source: List[str] = Query(..., description='Data sources to query comma separated with no space'), 114 | limit: int = Query(500), start: int = Query(0), 115 | domain: str = Query(..., description='Domain to be harvested')): 116 | 117 | # Query function that allows user to query theHarvester rest API 118 | # Rate limit of 2 requests per minute 119 | # basic user agent filtering 120 | if user_agent and ('gobuster' in user_agent or 'sqlmap' in user_agent or 'rustbuster' in user_agent): 121 | response = RedirectResponse(app.url_path_for('bot')) 122 | return response 123 | try: 124 | asns, iurls, twitter_people_list, \ 125 | linkedin_people_list, linkedin_links, \ 126 | aurls, aips, aemails, ahosts = await __main__.start(argparse.Namespace(dns_brute=dns_brute, 127 | dns_lookup=dns_lookup, 128 | dns_server=dns_server, 129 | dns_tld=dns_tld, 130 | domain=domain, 131 | filename=filename, 132 | google_dork=google_dork, 133 | limit=limit, 134 | proxies=proxies, 135 | shodan=shodan, 136 | source=','.join(source), 137 | start=start, 138 | take_over=take_over, 139 | virtual_host=virtual_host)) 140 | 141 | return {'asns': asns, 'interesting_urls': iurls, 142 | 'twitter_people': twitter_people_list, 143 | 'linkedin_people': linkedin_people_list, 144 | 'linkedin_links': linkedin_links, 145 | 'trello_urls': aurls, 146 | 'ips': aips, 147 | 'emails': aemails, 148 | 'hosts': ahosts} 149 | except Exception: 150 | return {'exception': 'Please contact the server 
administrator to check the issue'} 151 | -------------------------------------------------------------------------------- /theHarvester/lib/api/api_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example script to query theHarvester rest API, obtain results, and write out to stdout as well as an html 3 | """ 4 | 5 | import asyncio 6 | import aiohttp 7 | import netaddr 8 | 9 | 10 | async def fetch_json(session, url): 11 | async with session.get(url) as response: 12 | return await response.json() 13 | 14 | 15 | async def fetch(session, url): 16 | async with session.get(url) as response: 17 | return await response.text() 18 | 19 | 20 | async def main(): 21 | """ 22 | Just a simple example of how to interact with the rest api 23 | you can easily use requests instead of aiohttp or whatever you best see fit 24 | """ 25 | url = "http://127.0.0.1:5000" 26 | domain = "netflix.com" 27 | query_url = f'{url}/query?limit=300&source=bing,baidu,duckduckgo,dogpile&domain={domain}' 28 | async with aiohttp.ClientSession() as session: 29 | fetched_json = await fetch_json(session, query_url) 30 | total_asns = fetched_json['asns'] 31 | interesting_urls = fetched_json['interesting_urls'] 32 | twitter_people_list_tracker = fetched_json['twitter_people'] 33 | linkedin_people_list_tracker = fetched_json['linkedin_people'] 34 | linkedin_links_tracker = fetched_json['linkedin_links'] 35 | trello_urls = fetched_json['trello_urls'] 36 | ips = fetched_json['ips'] 37 | emails = fetched_json['emails'] 38 | hosts = fetched_json['hosts'] 39 | 40 | if len(total_asns) > 0: 41 | print(f'\n[*] ASNS found: {len(total_asns)}') 42 | print('--------------------') 43 | total_asns = list(sorted(set(total_asns))) 44 | for asn in total_asns: 45 | print(asn) 46 | 47 | if len(interesting_urls) > 0: 48 | print(f'\n[*] Interesting Urls found: {len(interesting_urls)}') 49 | print('--------------------') 50 | interesting_urls = list(sorted(set(interesting_urls))) 51 | for iurl in interesting_urls: 52 | print(iurl) 53 | 54 | if len(twitter_people_list_tracker) == 0: 55 | print('\n[*] No Twitter users found.\n\n') 56 | else: 57 | if len(twitter_people_list_tracker) >= 1: 58 | print('\n[*] Twitter Users found: ' + str(len(twitter_people_list_tracker))) 59 | print('---------------------') 60 | twitter_people_list_tracker = list(sorted(set(twitter_people_list_tracker))) 61 | for usr in twitter_people_list_tracker: 62 | print(usr) 63 | 64 | if len(linkedin_people_list_tracker) == 0: 65 | print('\n[*] No LinkedIn users found.\n\n') 66 | else: 67 | if len(linkedin_people_list_tracker) >= 1: 68 | print('\n[*] LinkedIn Users found: ' + str(len(linkedin_people_list_tracker))) 69 | print('---------------------') 70 | linkedin_people_list_tracker = list(sorted(set(linkedin_people_list_tracker))) 71 | for usr in linkedin_people_list_tracker: 72 | print(usr) 73 | 74 | if len(linkedin_links_tracker) == 0: 75 | print(f'\n[*] LinkedIn Links found: {len(linkedin_links_tracker)}') 76 | linkedin_links_tracker = list(sorted(set(linkedin_links_tracker))) 77 | print('---------------------') 78 | for link in linkedin_links_tracker: 79 | print(link) 80 | 81 | length_urls = len(trello_urls) 82 | total = length_urls 83 | print('\n[*] Trello URLs found: ' + str(total)) 84 | print('--------------------') 85 | all_urls = list(sorted(set(trello_urls))) 86 | for url in sorted(all_urls): 87 | print(url) 88 | 89 | if len(ips) == 0: 90 | print('\n[*] No IPs found.') 91 | else: 92 | print('\n[*] IPs found: ' + 
str(len(ips))) 93 | print('-------------------') 94 | # use netaddr as the list may contain ipv4 and ipv6 addresses 95 | ip_list = sorted([netaddr.IPAddress(ip.strip()) for ip in set(ips)]) 96 | print('\n'.join(map(str, ip_list))) 97 | 98 | if len(emails) == 0: 99 | print('\n[*] No emails found.') 100 | else: 101 | print('\n[*] Emails found: ' + str(len(emails))) 102 | print('----------------------') 103 | all_emails = sorted(list(set(emails))) 104 | print(('\n'.join(all_emails))) 105 | 106 | if len(hosts) == 0: 107 | print('\n[*] No hosts found.\n\n') 108 | else: 109 | print('\n[*] Hosts found: ' + str(len(hosts))) 110 | print('---------------------') 111 | print('\n'.join(hosts)) 112 | 113 | 114 | if __name__ == '__main__': 115 | asyncio.run(main()) 116 | -------------------------------------------------------------------------------- /theHarvester/lib/api/static/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/theHarvester/lib/api/static/.gitkeep -------------------------------------------------------------------------------- /theHarvester/lib/hostchecker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | """ 4 | Created by laramies on 2008-08-21. 5 | Revised to use aiodns & asyncio on 2019-09-23 6 | """ 7 | 8 | import aiodns 9 | import asyncio 10 | import socket 11 | from typing import Tuple, Any 12 | 13 | 14 | class Checker: 15 | 16 | def __init__(self, hosts: list, nameserver=False): 17 | self.hosts = hosts 18 | self.realhosts: list = [] 19 | self.addresses: set = set() 20 | self.nameserver = [] 21 | if nameserver: 22 | self.nameserver = nameserver 23 | 24 | @staticmethod 25 | async def query(host, resolver) -> Tuple[str, Any]: 26 | try: 27 | result = await resolver.gethostbyname(host, socket.AF_INET) 28 | addresses = result.addresses 29 | if addresses == [] or addresses is None or result is None: 30 | return f"{host}:", tuple() 31 | else: 32 | return f"{host}:{', '.join(map(str, addresses))}", addresses 33 | except Exception: 34 | return f"{host}", tuple() 35 | 36 | async def query_all(self, resolver) -> list: 37 | results = await asyncio.gather(*[asyncio.create_task(self.query(host, resolver)) 38 | for host in self.hosts]) 39 | return results 40 | 41 | async def check(self): 42 | loop = asyncio.get_event_loop() 43 | resolver = aiodns.DNSResolver(loop=loop, timeout=4) if len(self.nameserver) == 0\ 44 | else aiodns.DNSResolver(loop=loop, timeout=4, nameservers=self.nameserver) 45 | results = await self.query_all(resolver) 46 | for host, address in results: 47 | self.realhosts.append(host) 48 | self.addresses.update({addr for addr in address}) 49 | # address may be a list of ips 50 | # and do a set comprehension to remove duplicates 51 | self.realhosts.sort() 52 | self.addresses = list(self.addresses) 53 | return self.realhosts, self.addresses 54 | -------------------------------------------------------------------------------- /theHarvester/lib/stash.py: -------------------------------------------------------------------------------- 1 | import aiosqlite 2 | import datetime 3 | import os 4 | 5 | db_path = os.path.expanduser('~/.local/share/theHarvester') 6 | 7 | if not os.path.isdir(db_path): 8 | os.makedirs(db_path) 9 | 10 | 11 | class StashManager: 12 | 13 | def __init__(self): 14 | self.db = os.path.join(db_path, 'stash.sqlite') 15 | self.results = "" 16 | 
self.totalresults = "" 17 | self.latestscandomain = {} 18 | self.domainscanhistory = [] 19 | self.scanboarddata = {} 20 | self.scanstats = [] 21 | self.latestscanresults = [] 22 | self.previousscanresults = [] 23 | 24 | async def do_init(self): 25 | async with aiosqlite.connect(self.db) as db: 26 | await db.execute( 27 | 'CREATE TABLE IF NOT EXISTS results (domain text, resource text, type text, find_date date, source text)') 28 | await db.commit() 29 | 30 | async def store(self, domain, resource, res_type, source): 31 | self.domain = domain 32 | self.resource = resource 33 | self.type = res_type 34 | self.source = source 35 | self.date = datetime.date.today() 36 | try: 37 | async with aiosqlite.connect(self.db, timeout=30) as db: 38 | await db.execute('INSERT INTO results (domain,resource, type, find_date, source) VALUES (?,?,?,?,?)', 39 | (self.domain, self.resource, self.type, self.date, self.source)) 40 | await db.commit() 41 | except Exception as e: 42 | print(e) 43 | 44 | async def store_all(self, domain, all, res_type, source): 45 | self.domain = domain 46 | self.all = all 47 | self.type = res_type 48 | self.source = source 49 | self.date = datetime.date.today() 50 | master_list = [(self.domain, x, self.type, self.date, self.source) for x in self.all] 51 | async with aiosqlite.connect(self.db, timeout=30) as db: 52 | try: 53 | await db.executemany('INSERT INTO results (domain,resource, type, find_date, source) VALUES (?,?,?,?,?)', 54 | master_list) 55 | await db.commit() 56 | except Exception as e: 57 | print(e) 58 | 59 | async def generatedashboardcode(self, domain): 60 | try: 61 | # TODO refactor into generic method 62 | self.latestscandomain["domain"] = domain 63 | async with aiosqlite.connect(self.db, timeout=30) as conn: 64 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="host"''', 65 | (domain,)) 66 | data = await cursor.fetchone() 67 | self.latestscandomain["host"] = data[0] 68 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="email"''', 69 | (domain,)) 70 | data = await cursor.fetchone() 71 | self.latestscandomain["email"] = data[0] 72 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="ip"''', (domain,)) 73 | data = await cursor.fetchone() 74 | self.latestscandomain["ip"] = data[0] 75 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="vhost"''', 76 | (domain,)) 77 | data = await cursor.fetchone() 78 | self.latestscandomain["vhost"] = data[0] 79 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="shodan"''', 80 | (domain,)) 81 | data = await cursor.fetchone() 82 | self.latestscandomain["shodan"] = data[0] 83 | cursor = await conn.execute('''SELECT MAX(find_date) FROM results WHERE domain=?''', (domain,)) 84 | data = await cursor.fetchone() 85 | self.latestscandomain["latestdate"] = data[0] 86 | latestdate = data[0] 87 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="host"''', 88 | (domain, latestdate,)) 89 | scandetailshost = await cursor.fetchall() 90 | self.latestscandomain["scandetailshost"] = scandetailshost 91 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="email"''', 92 | (domain, latestdate,)) 93 | scandetailsemail = await cursor.fetchall() 94 | self.latestscandomain["scandetailsemail"] = scandetailsemail 95 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? 
AND type="ip"''', 96 | (domain, latestdate,)) 97 | scandetailsip = await cursor.fetchall() 98 | self.latestscandomain["scandetailsip"] = scandetailsip 99 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="vhost"''', 100 | (domain, latestdate,)) 101 | scandetailsvhost = await cursor.fetchall() 102 | self.latestscandomain["scandetailsvhost"] = scandetailsvhost 103 | cursor = await conn.execute( 104 | '''SELECT * FROM results WHERE domain=? AND find_date=? AND type="shodan"''', 105 | (domain, latestdate,)) 106 | scandetailsshodan = await cursor.fetchall() 107 | self.latestscandomain["scandetailsshodan"] = scandetailsshodan 108 | return self.latestscandomain 109 | except Exception as e: 110 | print(e) 111 | 112 | async def getlatestscanresults(self, domain, previousday=False): 113 | try: 114 | async with aiosqlite.connect(self.db, timeout=30) as conn: 115 | if previousday: 116 | try: 117 | cursor = await conn.execute(''' 118 | SELECT DISTINCT(find_date) 119 | FROM results 120 | WHERE find_date=date('now', '-1 day') and domain=?''', (domain,)) 121 | previousscandate = await cursor.fetchone() 122 | if not previousscandate: # When theHarvester runs first time/day this query will return. 123 | self.previousscanresults = ["No results", "No results", "No results", "No results", 124 | "No results"] 125 | else: 126 | cursor = await conn.execute(''' 127 | SELECT find_date, domain, source, type, resource 128 | FROM results 129 | WHERE find_date=? and domain=? 130 | ORDER BY source,type 131 | ''', (previousscandate[0], domain,)) 132 | results = await cursor.fetchall() 133 | self.previousscanresults = results 134 | return self.previousscanresults 135 | except Exception as e: 136 | print(f'Error in getting the previous scan results from the database: {e}') 137 | else: 138 | try: 139 | cursor = await conn.execute('''SELECT MAX(find_date) FROM results WHERE domain=?''', (domain,)) 140 | latestscandate = await cursor.fetchone() 141 | cursor = await conn.execute(''' 142 | SELECT find_date, domain, source, type, resource 143 | FROM results 144 | WHERE find_date=? and domain=? 
145 | ORDER BY source,type 146 | ''', (latestscandate[0], domain,)) 147 | results = await cursor.fetchall() 148 | self.latestscanresults = results 149 | return self.latestscanresults 150 | except Exception as e: 151 | print(f'Error in getting the latest scan results from the database: {e}') 152 | except Exception as e: 153 | print(f'Error connecting to theHarvester database: {e}') 154 | 155 | async def getscanboarddata(self): 156 | try: 157 | async with aiosqlite.connect(self.db, timeout=30) as conn: 158 | 159 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE type="host"''') 160 | data = await cursor.fetchone() 161 | self.scanboarddata["host"] = data[0] 162 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE type="email"''') 163 | data = await cursor.fetchone() 164 | self.scanboarddata["email"] = data[0] 165 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE type="ip"''') 166 | data = await cursor.fetchone() 167 | self.scanboarddata["ip"] = data[0] 168 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE type="vhost"''') 169 | data = await cursor.fetchone() 170 | self.scanboarddata["vhost"] = data[0] 171 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE type="shodan"''') 172 | data = await cursor.fetchone() 173 | self.scanboarddata["shodan"] = data[0] 174 | cursor = await conn.execute('''SELECT COUNT(DISTINCT(domain)) FROM results ''') 175 | data = await cursor.fetchone() 176 | self.scanboarddata["domains"] = data[0] 177 | return self.scanboarddata 178 | except Exception as e: 179 | print(e) 180 | 181 | async def getscanhistorydomain(self, domain): 182 | try: 183 | async with aiosqlite.connect(self.db, timeout=30) as conn: 184 | cursor = await conn.execute('''SELECT DISTINCT(find_date) FROM results WHERE domain=?''', (domain,)) 185 | dates = await cursor.fetchall() 186 | for date in dates: 187 | cursor = await conn.execute( 188 | '''SELECT COUNT(*) from results WHERE domain=? AND type="host" AND find_date=?''', 189 | (domain, date[0])) 190 | counthost = await cursor.fetchone() 191 | cursor = await conn.execute( 192 | '''SELECT COUNT(*) from results WHERE domain=? AND type="email" AND find_date=?''', 193 | (domain, date[0])) 194 | countemail = await cursor.fetchone() 195 | cursor = await conn.execute( 196 | '''SELECT COUNT(*) from results WHERE domain=? AND type="ip" AND find_date=?''', 197 | (domain, date[0])) 198 | countip = await cursor.fetchone() 199 | cursor = await conn.execute( 200 | '''SELECT COUNT(*) from results WHERE domain=? AND type="vhost" AND find_date=?''', 201 | (domain, date[0])) 202 | countvhost = await cursor.fetchone() 203 | cursor = await conn.execute( 204 | '''SELECT COUNT(*) from results WHERE domain=? 
AND type="shodan" AND find_date=?''', 205 | (domain, date[0])) 206 | countshodan = await cursor.fetchone() 207 | results = { 208 | "date": str(date[0]), 209 | "hosts": str(counthost[0]), 210 | "email": str(countemail[0]), 211 | "ip": str(countip[0]), 212 | "vhost": str(countvhost[0]), 213 | "shodan": str(countshodan[0]) 214 | } 215 | self.domainscanhistory.append(results) 216 | return self.domainscanhistory 217 | except Exception as e: 218 | print(e) 219 | 220 | async def getpluginscanstatistics(self): 221 | try: 222 | async with aiosqlite.connect(self.db, timeout=30) as conn: 223 | cursor = await conn.execute(''' 224 | SELECT domain,find_date, type, source, count(*) 225 | FROM results 226 | GROUP BY domain, find_date, type, source 227 | ''') 228 | results = await cursor.fetchall() 229 | self.scanstats = results 230 | return self.scanstats 231 | except Exception as e: 232 | print(e) 233 | 234 | async def latestscanchartdata(self, domain): 235 | try: 236 | async with aiosqlite.connect(self.db, timeout=30) as conn: 237 | self.latestscandomain["domain"] = domain 238 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="host"''', (domain,)) 239 | data = await cursor.fetchone() 240 | self.latestscandomain["host"] = data[0] 241 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="email"''', (domain,)) 242 | data = await cursor.fetchone() 243 | self.latestscandomain["email"] = data[0] 244 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="ip"''', (domain,)) 245 | data = await cursor.fetchone() 246 | self.latestscandomain["ip"] = data[0] 247 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="vhost"''', (domain,)) 248 | data = await cursor.fetchone() 249 | self.latestscandomain["vhost"] = data[0] 250 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="shodan"''', (domain,)) 251 | data = await cursor.fetchone() 252 | self.latestscandomain["shodan"] = data[0] 253 | cursor = await conn.execute('''SELECT MAX(find_date) FROM results WHERE domain=?''', (domain,)) 254 | data = await cursor.fetchone() 255 | self.latestscandomain["latestdate"] = data[0] 256 | latestdate = data[0] 257 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="host"''', (domain, latestdate,)) 258 | scandetailshost = await cursor.fetchall() 259 | self.latestscandomain["scandetailshost"] = scandetailshost 260 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="email"''', (domain, latestdate,)) 261 | scandetailsemail = await cursor.fetchall() 262 | self.latestscandomain["scandetailsemail"] = scandetailsemail 263 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="ip"''', (domain, latestdate,)) 264 | scandetailsip = await cursor.fetchall() 265 | self.latestscandomain["scandetailsip"] = scandetailsip 266 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="vhost"''', (domain, latestdate,)) 267 | scandetailsvhost = await cursor.fetchall() 268 | self.latestscandomain["scandetailsvhost"] = scandetailsvhost 269 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? 
AND type="shodan"''', (domain, latestdate,)) 270 | scandetailsshodan = await cursor.fetchall() 271 | self.latestscandomain["scandetailsshodan"] = scandetailsshodan 272 | return self.latestscandomain 273 | except Exception as e: 274 | print(e) 275 | -------------------------------------------------------------------------------- /theHarvester/parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/theHarvester/parsers/__init__.py -------------------------------------------------------------------------------- /theHarvester/parsers/intelxparser.py: -------------------------------------------------------------------------------- 1 | class Parser: 2 | 3 | def __init__(self): 4 | self.emails = set() 5 | self.hosts = set() 6 | 7 | async def parse_dictionaries(self, results: dict) -> tuple: 8 | """ 9 | Parse method to parse json results 10 | :param results: Dictionary containing a list of dictionaries known as selectors 11 | :return: tuple of emails and hosts 12 | """ 13 | if results is not None: 14 | for dictionary in results["selectors"]: 15 | field = dictionary['selectorvalue'] 16 | if '@' in field: 17 | self.emails.add(field) 18 | else: 19 | field = str(field) 20 | if 'http' in field or 'https' in field: 21 | if field[:5] == 'https': 22 | field = field[8:] 23 | else: 24 | field = field[7:] 25 | self.hosts.add(field.replace(')', '').replace(',', '')) 26 | return self.emails, self.hosts 27 | return None, None 28 | -------------------------------------------------------------------------------- /theHarvester/parsers/myparser.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | class Parser: 5 | 6 | def __init__(self, results, word): 7 | self.results = results 8 | self.word = word 9 | self.temp = [] 10 | 11 | async def genericClean(self): 12 | self.results = self.results.replace('', '').replace('', '').replace('', '').replace('', '') \ 13 | .replace('%3a', '').replace('', '').replace('', '') \ 14 | .replace('', '').replace('', '') 15 | 16 | for search in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C', '%2f', '/', '\\'): 17 | self.results = self.results.replace(search, ' ') 18 | 19 | async def urlClean(self): 20 | self.results = self.results.replace('', '').replace('', '').replace('%2f', '').replace('%3a', '') 21 | for search in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C'): 22 | self.results = self.results.replace(search, ' ') 23 | 24 | async def emails(self): 25 | await self.genericClean() 26 | # Local part is required, charset is flexible. 27 | # https://tools.ietf.org/html/rfc6531 (removed * and () as they provide FP mostly) 28 | reg_emails = re.compile(r'[a-zA-Z0-9.\-_+#~!$&\',;=:]+' + '@' + '[a-zA-Z0-9.-]*' + self.word.replace('www.', '')) 29 | self.temp = reg_emails.findall(self.results) 30 | emails = await self.unique() 31 | true_emails = {str(email)[1:].lower().strip() if len(str(email)) > 1 and str(email)[0] == '.' 
32 | else len(str(email)) > 1 and str(email).lower().strip() for email in emails} 33 | # if email starts with dot shift email string and make sure all emails are lowercase 34 | return true_emails 35 | 36 | async def fileurls(self, file): 37 | urls = [] 38 | reg_urls = re.compile('(.*?)') 60 | temp = reg_hosts.findall(self.results) 61 | for iteration in temp: 62 | if iteration.count(':'): 63 | res = iteration.split(':')[1].split('/')[2] 64 | else: 65 | res = iteration.split('/')[0] 66 | self.temp.append(res) 67 | hostnames = await self.unique() 68 | return hostnames 69 | 70 | async def set(self): 71 | reg_sets = re.compile(r'>[a-zA-Z\d]*') 72 | self.temp = reg_sets.findall(self.results) 73 | sets = [] 74 | for iteration in self.temp: 75 | delete = iteration.replace('>', '') 76 | delete = delete.replace(' list: 86 | return list(set(self.temp)) 87 | -------------------------------------------------------------------------------- /theHarvester/parsers/securitytrailsparser.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List 2 | 3 | 4 | class Parser: 5 | 6 | def __init__(self, word, text): 7 | self.word = word 8 | self.text = text 9 | self.hostnames = set() 10 | self.ips = set() 11 | 12 | async def parse_text(self) -> Union[List, Tuple]: 13 | sub_domain_flag = 0 14 | self.text = str(self.text).splitlines() 15 | # Split lines to get a list of lines. 16 | for index in range(0, len(self.text)): 17 | line = self.text[index].strip() 18 | if '"ip":' in line: 19 | # Extract IP. 20 | ip = '' 21 | for ch in line[7:]: 22 | if ch == '"': 23 | break 24 | else: 25 | ip += ch 26 | self.ips.add(ip) 27 | elif '"subdomains":' in line: 28 | # subdomains start here so set flag to 1 29 | sub_domain_flag = 1 30 | continue 31 | elif sub_domain_flag > 0: 32 | if ']' in line: 33 | sub_domain_flag = 0 34 | else: 35 | if 'www' in self.word: 36 | self.word = str(self.word).replace('www.', '').replace('www', '') 37 | # Remove www from word if entered 38 | self.hostnames.add(str(line).replace('"', '').replace(',', '') + '.' 
+ self.word) 39 | else: 40 | continue 41 | return list(self.ips), list(self.hostnames) 42 | -------------------------------------------------------------------------------- /theHarvester/screenshot/screenshot.py: -------------------------------------------------------------------------------- 1 | """ 2 | Screenshot module that utilizes pyppeteer to asynchronously 3 | take screenshots 4 | """ 5 | 6 | from pyppeteer import launch 7 | import aiohttp 8 | import asyncio 9 | import certifi 10 | from datetime import datetime 11 | import os 12 | import ssl 13 | import sys 14 | 15 | 16 | class ScreenShotter: 17 | 18 | def __init__(self, output): 19 | self.output = output 20 | self.slash = "\\" if 'win' in sys.platform else '/' 21 | self.slash = "" if (self.output[-1] == "\\" or self.output[-1] == "/") else self.slash 22 | 23 | def verify_path(self): 24 | try: 25 | if not os.path.isdir(self.output): 26 | answer = input( 27 | '[+] The output path you have entered does not exist would you like to create it (y/n): ') 28 | if answer.lower() == 'yes' or answer.lower() == 'y': 29 | os.mkdir(self.output) 30 | return True 31 | else: 32 | return False 33 | return True 34 | except Exception as e: 35 | print(f"An exception has occurred while attempting to verify output path's existence: {e}") 36 | return False 37 | 38 | @staticmethod 39 | async def verify_installation(): 40 | # Helper function that verifies pyppeteer & chromium are installed 41 | # If chromium is not installed pyppeteer will prompt user to install it 42 | browser = await launch(headless=True, ignoreHTTPSErrors=True, args=["--no-sandbox"]) 43 | await browser.close() 44 | 45 | @staticmethod 46 | def chunk_list(items, chunk_size): 47 | # Based off of: https://github.com/apache/incubator-sdap-ingester 48 | return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)] 49 | 50 | @staticmethod 51 | async def visit(url): 52 | try: 53 | # print(f'attempting to visit: {url}') 54 | timeout = aiohttp.ClientTimeout(total=35) 55 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 56 | 'Chrome/83.0.4103.106 Safari/537.36'} 57 | url = f'http://{url}' if not url.startswith('http') else url 58 | url = url.replace('www.', '') 59 | sslcontext = ssl.create_default_context(cafile=certifi.where()) 60 | async with aiohttp.ClientSession(timeout=timeout, headers=headers, 61 | connector=aiohttp.TCPConnector(ssl=sslcontext)) as session: 62 | async with session.get(url, verify_ssl=False) as resp: 63 | # TODO fix with origin url, should be there somewhere 64 | text = await resp.text("UTF-8") 65 | return f'http://{url}' if not url.startswith('http') else url, text 66 | except Exception as e: 67 | print(f'An exception has occurred while attempting to visit {url} : {e}') 68 | return "", "" 69 | 70 | async def take_screenshot(self, url): 71 | url = f'http://{url}' if not url.startswith('http') else url 72 | url = url.replace('www.', '') 73 | print(f'Attempting to take a screenshot of: {url}') 74 | browser = await launch(headless=True, ignoreHTTPSErrors=True, args=["--no-sandbox"]) 75 | context = await browser.createIncognitoBrowserContext() 76 | # Create a new page in a pristine context. 
77 | page = await context.newPage() 78 | path = fr'{self.output}{self.slash}{url.replace("http://", "").replace("https://", "")}.png' 79 | date = str(datetime.utcnow()) 80 | try: 81 | # change default timeout from 30 to 35 seconds 82 | page.setDefaultNavigationTimeout(35000) 83 | await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 84 | 'Chrome/83.0.4103.106 Safari/537.36') 85 | await page.goto(url) 86 | await page.screenshot({'path': path}) 87 | except Exception as e: 88 | print(f'An exception has occurred attempting to screenshot: {url} : {e}') 89 | path = "" 90 | finally: 91 | # Clean up everything whether screenshot is taken or not 92 | await asyncio.sleep(2) 93 | await page.close() 94 | await context.close() 95 | await browser.close() 96 | return date, url, path 97 | -------------------------------------------------------------------------------- /wordlists/dorks.txt: -------------------------------------------------------------------------------- 1 | inurl:"contact" 2 | intext:email filetype:log 3 | "Index of /mail" 4 | "admin account info" filetype:log 5 | intext:@ 6 | administrator accounts/ 7 | intitle:"Index of" .bash_history 8 | intitle:"index of" members OR accounts 9 | inurl:/shared/help.php 10 | inurl:public 11 | intitle:index.of inbox 12 | intitle:"Server Administration" 13 | inurl:passwd.txt 14 | robots.txt 15 | php-addressbook "This is the addressbook for *" -warning -------------------------------------------------------------------------------- /wordlists/general/common.txt: -------------------------------------------------------------------------------- 1 | admin 2 | test 3 | hello 4 | uk 5 | login 6 | book 7 | robots.txt 8 | --------------------------------------------------------------------------------
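Usage sketch (not a file from the repository): the Checker class defined in theHarvester/lib/hostchecker.py above can also be driven on its own for ad-hoc resolution. The hostnames below are placeholders, and the import path assumes the theHarvester package is installed or on PYTHONPATH.

import asyncio
from theHarvester.lib.hostchecker import Checker

async def resolve_hosts():
    # Checker takes a list of hostnames and, optionally, a list of nameservers.
    checker = Checker(['www.example.com', 'mail.example.com'])
    resolved, ips = await checker.check()
    print(resolved)  # entries look like 'host:ip1, ip2' (or just the bare host when resolution fails)
    print(ips)       # de-duplicated addresses collected across all hosts

asyncio.run(resolve_hosts())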