├── .dockerignore
├── .flake8
├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE
│   │   └── issue-template.md
│   ├── dependabot.yml
│   └── workflows
│       ├── codeql-analysis.yml
│       ├── dockerci.yml
│       └── theHarvester.yml
├── .gitignore
├── .lgtm.yml
├── Dockerfile
├── README.md
├── README
│   ├── CONTRIBUTING.md
│   ├── COPYING
│   └── LICENSES
├── api-keys.yaml
├── bin
│   ├── restfulHarvest
│   └── theHarvester
├── debian
│   ├── changelog
│   ├── control
│   ├── copyright
│   ├── dirs
│   ├── docs
│   ├── gbp.conf
│   ├── helper-script
│   │   └── theharvester
│   ├── patches
│   │   ├── Disable-a-failing-test-unstable-site.patch
│   │   ├── Improve-data-installation.patch
│   │   └── series
│   ├── rules
│   ├── source
│   │   └── format
│   ├── tests
│   │   └── control
│   ├── theharvester.install
│   ├── theharvester.links
│   ├── upstream
│   │   └── metadata
│   └── watch
├── mypy.ini
├── proxies.yaml
├── pyproject.toml
├── pytest.ini
├── requirements.txt
├── requirements
│   ├── base.txt
│   └── dev.txt
├── restfulHarvest.py
├── setup.cfg
├── setup.py
├── tests
│   ├── __init__.py
│   ├── discovery
│   │   ├── __init__.py
│   │   ├── test_anubis.py
│   │   ├── test_certspotter.py
│   │   ├── test_githubcode.py
│   │   ├── test_omnisint.py
│   │   ├── test_otx.py
│   │   ├── test_qwantsearch.py
│   │   ├── test_sublist3r.py
│   │   └── test_threatminer.py
│   └── test_myparser.py
├── theHarvester-logo.png
├── theHarvester.py
├── theHarvester
│   ├── __init__.py
│   ├── __main__.py
│   ├── discovery
│   │   ├── __init__.py
│   │   ├── anubis.py
│   │   ├── baidusearch.py
│   │   ├── bevigil.py
│   │   ├── binaryedgesearch.py
│   │   ├── bingsearch.py
│   │   ├── bufferoverun.py
│   │   ├── censysearch.py
│   │   ├── certspottersearch.py
│   │   ├── constants.py
│   │   ├── crtsh.py
│   │   ├── dnsdumpster.py
│   │   ├── dnssearch.py
│   │   ├── duckduckgosearch.py
│   │   ├── fullhuntsearch.py
│   │   ├── githubcode.py
│   │   ├── hackertarget.py
│   │   ├── huntersearch.py
│   │   ├── intelxsearch.py
│   │   ├── omnisint.py
│   │   ├── otxsearch.py
│   │   ├── pentesttools.py
│   │   ├── projectdiscovery.py
│   │   ├── qwantsearch.py
│   │   ├── rapiddns.py
│   │   ├── rocketreach.py
│   │   ├── securitytrailssearch.py
│   │   ├── shodansearch.py
│   │   ├── sublist3r.py
│   │   ├── takeover.py
│   │   ├── threatcrowd.py
│   │   ├── threatminer.py
│   │   ├── urlscan.py
│   │   ├── virustotal.py
│   │   ├── yahoosearch.py
│   │   └── zoomeyesearch.py
│   ├── lib
│   │   ├── __init__.py
│   │   ├── api
│   │   │   ├── __init__.py
│   │   │   ├── api.py
│   │   │   ├── api_example.py
│   │   │   └── static
│   │   │       └── .gitkeep
│   │   ├── core.py
│   │   ├── hostchecker.py
│   │   ├── ip-ranges.json
│   │   ├── resolvers.txt
│   │   └── stash.py
│   ├── parsers
│   │   ├── __init__.py
│   │   ├── intelxparser.py
│   │   ├── myparser.py
│   │   └── securitytrailsparser.py
│   └── screenshot
│       └── screenshot.py
└── wordlists
    ├── dns-big.txt
    ├── dns-names.txt
    ├── dorks.txt
    ├── general
    │   └── common.txt
    └── names_small.txt
/.dockerignore:
--------------------------------------------------------------------------------
1 | .github/*
2 | .gitattributes
3 | .idea/
4 | .lgtm.yml
5 | mypy.ini
6 | .pytest_cache
7 | .mypy_cache
8 | tests/*
9 | README/
10 | bin/
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E501, F405, F403, F401, E402
--------------------------------------------------------------------------------
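For reference, the ignored codes are E501 (line too long), F403 ('from module import *' used), F405 (name may be undefined, or defined from star imports), F401 (module imported but unused) and E402 (module-level import not at top of file). A minimal, purely illustrative Python snippet that this configuration would leave unflagged:

import os                    # F401: imported but unused -- ignored by this config
from os.path import *        # F403: wildcard import -- ignored

print(join('tmp', 'out'))    # F405: name comes from a star import -- ignored
import sys                   # E402: import below non-import code -- ignored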
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Set the default behavior, which is to have git automatically determine
2 | # whether a file is a text or binary, unless otherwise specified.
3 |
4 | * text=auto
5 |
6 | # Basic .gitattributes for a python repo.
7 |
8 | # Source files
9 | # ============
10 | *.pxd text diff=python
11 | *.py text diff=python
12 | *.py3 text diff=python
13 | *.pyw text diff=python
14 | *.pyx text diff=python
15 |
16 | # Binary files
17 | # ============
18 | *.db binary
19 | *.p binary
20 | *.pkl binary
21 | *.pyc binary
22 | *.pyd binary
23 | *.pyo binary
24 |
25 | # Note: .db, .p, and .pkl files are associated with the python modules
26 | # ``pickle``, ``dbm.*``, # ``shelve``, ``marshal``, ``anydbm``, & ``bsddb``
27 | # (among others).
28 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: [L1ghtn1ng, NotoriousRebel]
4 | open_collective: # Replace with a single Open Collective username
5 | ko_fi: #
6 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
7 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
8 | liberapay: # Replace with a single Liberapay username
9 | issuehunt: # Replace with a single IssueHunt username
10 | otechie: # Replace with a single Otechie username
11 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
12 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/issue-template.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Issue Template
3 | about: A template for new issues.
4 | title: "[Bug|Feature Request|Other] Short Description of Issue"
5 | labels: ''
6 |
7 | ---
8 |
9 | ## Note: we do not support installing theHarvester on Android
10 |
11 | **Feature Request or Bug or Other**
12 | Feature Request | Bug | Other
13 |
14 | **Describe the feature request or bug or other**
15 | A clear and concise description of what the bug, feature request,
16 | or other request is.
17 |
18 | **To Reproduce**
19 | Steps to reproduce the behaviour:
20 | 1. Run tool like this: '...'
21 | 2. See error
22 |
23 | **Expected behaviour**
24 | A clear and concise description of what you expected to happen.
25 |
26 | **Screenshots**
27 | If possible please add screenshots to help explain your problem.
28 |
29 | **System Information (System that tool is running on):**
30 | - OS: [e.g. Windows 10]
31 | - Version [e.g. 2.7]
32 |
33 | **Additional context**
34 | Add any other context about the problem here.
35 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: github-actions
4 | directory: "/"
5 | schedule:
6 | interval: daily
7 | timezone: Europe/London
8 | - package-ecosystem: pip
9 | directory: "/"
10 | schedule:
11 | interval: daily
12 | timezone: Europe/London
13 | open-pull-requests-limit: 10
14 | target-branch: master
15 | allow:
16 | - dependency-type: direct
17 | - dependency-type: indirect
18 |
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 |
14 | on:
15 | push:
16 | branches: [ master, dev ]
17 | pull_request:
18 | # The branches below must be a subset of the branches above
19 | branches: [ master, dev ]
20 | schedule:
21 | - cron: '19 11 * * 4'
22 |
23 | jobs:
24 | analyze:
25 | name: Analyze
26 | runs-on: ubuntu-latest
27 |
28 | strategy:
29 | fail-fast: false
30 | matrix:
31 | language: [ 'python' ]
32 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
33 | # Learn more:
34 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed
35 |
36 | steps:
37 | - name: Checkout repository
38 | uses: actions/checkout@v3
39 |
40 | # Initializes the CodeQL tools for scanning.
41 | - name: Initialize CodeQL
42 | uses: github/codeql-action/init@v2
43 | with:
44 | languages: ${{ matrix.language }}
45 | # If you wish to specify custom queries, you can do so here or in a config file.
46 | # By default, queries listed here will override any specified in a config file.
47 | # Prefix the list here with "+" to use these queries and those in the config file.
48 | # queries: ./path/to/local/query, your-org/your-repo/queries@main
49 |
50 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
51 | # If this step fails, then you should remove it and run the build manually (see below)
52 | - name: Autobuild
53 | uses: github/codeql-action/autobuild@v2
54 |
55 | # ℹ️ Command-line programs to run using the OS shell.
56 | # 📚 https://git.io/JvXDl
57 |
58 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
59 | # and modify them (or add more) to build your code if your project
60 | # uses a compiled language
61 |
62 | #- run: |
63 | # make bootstrap
64 | # make release
65 |
66 | - name: Perform CodeQL Analysis
67 | uses: github/codeql-action/analyze@v2
68 |
--------------------------------------------------------------------------------
/.github/workflows/dockerci.yml:
--------------------------------------------------------------------------------
1 | name: TheHarvester Docker Image CI
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - uses: actions/checkout@v3
10 | - name: Build the Docker image
11 | run: docker build . --file Dockerfile --tag theharvester:$(date +%s)
--------------------------------------------------------------------------------
/.github/workflows/theHarvester.yml:
--------------------------------------------------------------------------------
1 | ---
2 | name: TheHarvester Python CI
3 |
4 | on:
5 | push:
6 | branches:
7 | - '*'
8 |
9 | pull_request:
10 | branches:
11 | - '*'
12 |
13 | jobs:
14 | Python:
15 | runs-on: ${{ matrix.os }}
16 | strategy:
17 | max-parallel: 8
18 | matrix:
19 | os: [ ubuntu-latest, macos-latest ]
20 | python-version: [ 3.8, 3.9, 3.10.0 ]
21 |
22 | steps:
23 | - uses: actions/checkout@v3
24 | - name: Python ${{ matrix.python-version }}
25 | uses: actions/setup-python@v4
26 | with:
27 | python-version: ${{ matrix.python-version }}
28 | - name: Install dependencies
29 | run: |
30 | pip install --upgrade pip
31 | pip install wheel
32 | pip install -r requirements/dev.txt
33 |
34 | - name: Lint with flake8
35 | run: |
36 | # stop the build if there are Python syntax errors or undefined names
37 | flake8 . --count --show-source --statistics
38 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
39 | flake8 . --count --exit-zero --max-line-length=127 --statistics
40 |
41 | - name: Test with pytest
42 | run: |
43 | pytest
44 |
45 | - name: Static type checking with mypy
46 | run: |
47 | mypy --pretty theHarvester/*/*.py
48 | mypy --pretty theHarvester/*/*/*.py
49 |
50 | - name: Run theHarvester module Anubis
51 | run: |
52 | python theHarvester.py -d apple.com -b anubis
53 |
54 | - name: Run theHarvester module Baidu
55 | run: |
56 | python theHarvester.py -d yale.edu -b baidu
57 |
58 | - name: Run theHarvester module Bing
59 | run: |
60 | python theHarvester.py -d yale.edu -b bing
61 |
62 | - name: Run theHarvester module CertSpotter
63 | run: |
64 | python theHarvester.py -d yale.edu -b certspotter
65 |
66 | - name: Run theHarvester module Crtsh
67 | run: |
68 | python theHarvester.py -d hcl.com -b crtsh
69 |
70 | - name: Run theHarvester module DnsDumpster
71 | run: |
72 | python theHarvester.py -d yale.edu -b dnsdumpster
73 |
74 | - name: Run theHarvester module DuckDuckGo
75 | run: |
76 | python theHarvester.py -d yale.edu -b duckduckgo
77 |
78 | - name: Run theHarvester module HackerTarget
79 | run: |
80 | python theHarvester.py -d yale.edu -b hackertarget
81 |
82 | - name: Run theHarvester module Intelx
83 | run: |
84 | python theHarvester.py -d yale.edu -b intelx
85 |
86 | - name: Run theHarvester module Omnisint
87 | run: |
88 | python theHarvester.py -d yale.edu -b omnisint
89 |
90 | - name: Run theHarvester module Otx
91 | run: |
92 | python theHarvester.py -d yale.edu -b otx
93 |
94 | - name: Run theHarvester module Qwant
95 | run: |
96 | python theHarvester.py -d yale.edu -b qwant
97 |
98 | - name: Run theHarvester module RapidDns
99 | run: |
100 | python theHarvester.py -d yale.edu -b rapiddns
101 |
102 | - name: Run theHarvester module Sublist3r
103 | run: |
104 | python theHarvester.py -d yale.edu -b sublist3r
105 |
106 | - name: Run theHarvester module Threatcrowd
107 | run: |
108 | python theHarvester.py -d yale.edu -b threatcrowd
109 |
110 | - name: Run theHarvester module Threatminer
111 | run: |
112 | python theHarvester.py -d yale.edu -b threatminer
113 |
114 | - name: Run theHarvester module Urlscan
115 | run: |
116 | python theHarvester.py -d yale.edu -b urlscan
117 |
118 | - name: Run theHarvester module Yahoo
119 | run: |
120 | python theHarvester.py -d yale.edu -b yahoo
121 |
122 | - name: Run theHarvester module DNS brute force
123 | run: |
124 | python theHarvester.py -d yale.edu -c
125 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.idea
2 | *.pyc
3 | *.sqlite
4 | *.html
5 | *.htm
6 | *.vscode
7 | *.xml
8 | *.json
9 | debug_results.txt
10 | venv
11 | .mypy_cache
12 | .pytest_cache
13 | build/
14 | dist/
15 | theHarvester.egg-info
16 | api-keys.yaml
17 | .DS_Store
18 | .venv
19 |
--------------------------------------------------------------------------------
/.lgtm.yml:
--------------------------------------------------------------------------------
1 | queries:
2 | - exclude: py/import-and-import-from
3 | - exclude: py/polluting-import
4 | - exclude: py/member-test-non-container
5 |
6 | extraction:
7 | python:
8 | python_setup:
9 | version: 3
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:kinetic
2 | LABEL maintainer="@jay_townsend1 & @NotoriousRebel1"
3 | RUN mkdir /app
4 | WORKDIR /app
5 | COPY . /app
6 | ENV DEBIAN_FRONTEND=noninteractive
7 | RUN apt update && apt dist-upgrade -qy && apt install -qy git python3 python3-pip libffi-dev libxml2-dev libxslt1-dev && /usr/bin/python3 -m pip install --upgrade pip && apt autoremove -qy
8 | RUN /usr/bin/python3 --version && pip3 install --no-cache-dir -r requirements.txt && chmod +x ./*.py
9 | ENTRYPOINT ["/app/theHarvester.py"]
10 | ENTRYPOINT ["/app/restfulHarvest.py", "-H", "0.0.0.0", "-p", "80"]
11 | EXPOSE 80
12 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 |   [](https://lgtm.com/projects/g/laramies/theHarvester/context:python)
4 | [](https://inventory.rawsec.ml/)
5 |
6 | What is this?
7 | -------------
8 | theHarvester is a simple to use, yet powerful tool designed to be used during the reconnaissance stage of a red
9 | team assessment or penetration test. It performs open source intelligence (OSINT) gathering to help determine
10 | a domain's external threat landscape. The tool gathers names, emails, IPs, subdomains, and URLs by using
11 | multiple public resources that include:
12 |
13 | Passive:
14 | --------
15 | * anubis: Anubis-DB - https://github.com/jonluca/anubis
16 |
17 | * bevigil: CloudSEK BeVigil scans mobile applications for OSINT assets and makes them available through an API - https://bevigil.com/osint-api
18 |
19 | * baidu: Baidu search engine - www.baidu.com
20 |
21 | * binaryedge: List of known subdomains from www.binaryedge.io
22 |
23 | * bing: Microsoft search engine - www.bing.com
24 |
25 | * bingapi: Microsoft search engine, through the API (Requires an API key, see below.)
26 |
27 | * bufferoverun: Uses data from Rapid7's Project Sonar - www.rapid7.com/research/project-sonar/
28 |
29 | * censys: [Censys search engine](https://search.censys.io/), will use certificates searches to enumerate subdomains and gather emails (Requires an API key, see below.) - [censys.io](https://censys.io/)
30 |
31 | * certspotter: Cert Spotter monitors Certificate Transparency logs - https://sslmate.com/certspotter/
32 |
33 | * crtsh: Comodo Certificate search - https://crt.sh
34 |
35 | * dnsdumpster: DNSdumpster search engine - https://dnsdumpster.com
36 |
37 | * duckduckgo: DuckDuckGo search engine - www.duckduckgo.com
38 |
39 | * fullhunt: The Next-Generation Attack Surface Security Platform - https://fullhunt.io
40 |
41 | * github-code: GitHub code search engine (Requires a GitHub Personal Access Token, see below.) - www.github.com
42 |
43 | * hackertarget: Online vulnerability scanners and network intelligence to help organizations - https://hackertarget.com
44 |
45 | * hunter: Hunter search engine (Requires an API key, see below.) - www.hunter.io
46 |
47 | * intelx: Intelx search engine (Requires an API key, see below.) - www.intelx.io
48 |
49 | * omnisint: Project Crobat, A Centralised Searchable Open Source Project Sonar DNS Database - https://github.com/Cgboal/SonarSearch
50 |
51 | * otx: AlienVault Open Threat Exchange - https://otx.alienvault.com
52 |
53 | * pentesttools: Powerful Penetration Testing Tools, Easy to Use (Requires an API key, see below.) - https://pentest-tools.com/home
54 |
55 | * projectdiscovery: We actively collect and maintain internet-wide assets data,
56 | to enhance research and analyse changes around DNS for better insights (Requires an API key, see below.) - https://chaos.projectdiscovery.io
57 |
58 | * qwant: Qwant search engine - www.qwant.com
59 |
60 | * rapiddns: DNS query tool which makes it easy to query subdomains or sites sharing the same IP - https://rapiddns.io
61 |
62 | * rocketreach: Access real-time verified personal/professional emails, phone numbers, and social media links. - https://rocketreach.co
63 |
64 | * securityTrails: Security Trails search engine, the world's largest repository of historical DNS data
65 | (Requires an API key, see below.) - www.securitytrails.com
66 |
67 | * shodan: Shodan search engine, will search for ports and banners from discovered hosts (Requires an API key, see below.) - www.shodanhq.com
68 |
69 | * sublist3r: Fast subdomains enumeration tool for penetration testers - https://api.sublist3r.com/search.php?domain=example.com
70 |
71 | * threatcrowd: Open source threat intelligence - www.threatcrowd.org
72 |
73 | * threatminer: Data mining for threat intelligence - https://www.threatminer.org/
74 |
75 | * urlscan: A sandbox for the web that is a URL and website scanner - https://urlscan.io
76 |
77 | * vhost: Bing virtual hosts search
78 |
79 | * virustotal: virustotal.com domain search
80 |
81 | * yahoo: Yahoo search engine
82 |
83 | * zoomeye: Chinese version of Shodan - https://www.zoomeye.org
84 |
85 |
86 | Active:
87 | -------
88 | * DNS brute force: dictionary brute force enumeration
89 | * Screenshots: Take screenshots of subdomains that were found
90 |
91 | Modules that require an API key:
92 | --------------------------------
93 | Documentation to setup API keys can be found at - https://github.com/laramies/theHarvester/wiki/Installation#api-keys
94 |
95 | * bevigil - Free up to 50 queries. Pricing can be found here: https://bevigil.com/pricing/osint
96 | * binaryedge - $10/month
97 | * bing
98 | * censys - API keys are required and can be retrieved from your [Censys account](https://search.censys.io/account/api).
99 | * fullhunt
100 | * github
101 | * hunter - limited to 10 results on the free plan, so you will need to pass the -l 10 switch
102 | * intelx
103 | * pentesttools - $
104 | * projectdiscovery - invite only for now
105 | * rocketreach - $
106 | * securityTrails
107 | * shodan - $
108 | * zoomeye
109 |
110 | Install and dependencies:
111 | -------------------------
112 | * Python 3.7+
113 | * https://github.com/laramies/theHarvester/wiki/Installation
114 |
115 |
116 | Comments, bugs, and requests:
117 | -----------------------------
118 | * [](https://twitter.com/laramies) Christian Martorella @laramies
119 | cmartorella@edge-security.com
120 | * [](https://twitter.com/NotoriousRebel1) Matthew Brown @NotoriousRebel1
121 | * [](https://twitter.com/jay_townsend1) Jay "L1ghtn1ng" Townsend @jay_townsend1
122 |
123 |
124 | Main contributors:
125 | ------------------
126 | * [](https://twitter.com/NotoriousRebel1) Matthew Brown @NotoriousRebel1
127 | * [](https://twitter.com/jay_townsend1) Jay "L1ghtn1ng" Townsend @jay_townsend1
128 | * [](https://twitter.com/discoverscripts) Lee Baird @discoverscripts
129 |
130 |
131 | Thanks:
132 | -------
133 | * John Matherly - Shodan project
134 | * Ahmed Aboul Ela - subdomain names dictionaries (big and small)
135 |
--------------------------------------------------------------------------------
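Beyond the command line, the passive modules above can also be driven from Python. The interface visible in the test suite and in the Debian patch further down (one search class per source, with async process() and get_hostnames() methods) suggests a usage pattern roughly like the following sketch; it mirrors the sublist3r example from the tests and is not a documented API:

import asyncio
from theHarvester.discovery import sublist3r

async def main() -> None:
    # one search class per data source; the constructor takes the target domain
    search = sublist3r.SearchSublist3r('example.com')
    await search.process()                    # query the source
    hostnames = await search.get_hostnames()  # results come back as a list
    print(hostnames)

asyncio.run(main())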
/README/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to theHarvester Project
2 | Welcome to theHarvester project; thank you for wanting to contribute.
3 | The requirements below must be met for a contribution to be accepted.
4 |
5 | # CI
6 | Make sure all CI checks pass and that you do not introduce any alerts from lgtm.
7 |
8 | # Unit Tests
9 | New modules require a unit test for that module; we use pytest.
10 |
11 | # Coding Standards
12 | * No single-letter variables; variable names must describe what they represent or do
13 | * Use static type annotations on functions and similar
14 | * Make sure no errors are reported from mypy
15 | * No issues reported with flake8
16 |
17 | # Submitting Bugs
18 | If you find a bug in a module that you want to submit an issue for and you know how to write Python code,
19 | please create a unit test for that bug (if possible) and submit a fix for it, as it would be a big help to the project.
20 |
--------------------------------------------------------------------------------
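Given the unit-test requirement above, a new module's test can follow the shape of the existing files under tests/discovery (compare the sublist3r test quoted in the Debian patch below). The module and class names here are placeholders; bare async test methods work because pytest-asyncio runs in auto mode (see pytest.ini / pyproject.toml):

import pytest
from theHarvester.discovery import newmodule  # hypothetical new module

class TestNewModule:
    @staticmethod
    def domain() -> str:
        return 'example.com'

    async def test_do_search(self) -> None:
        # run the module once and check that it returns a list of hostnames
        search = newmodule.SearchNewModule(TestNewModule.domain())
        await search.process()
        assert isinstance(await search.get_hostnames(), list)

if __name__ == '__main__':
    pytest.main()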
/README/COPYING:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 2, June 1991
3 |
4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc.
5 | 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
6 | Everyone is permitted to copy and distribute verbatim copies
7 | of this license document, but changing it is not allowed.
8 |
9 | Preamble
10 |
11 | The licenses for most software are designed to take away your
12 | freedom to share and change it. By contrast, the GNU General Public
13 | License is intended to guarantee your freedom to share and change free
14 | software--to make sure the software is free for all its users. This
15 | General Public License applies to most of the Free Software
16 | Foundation's software and to any other program whose authors commit to
17 | using it. (Some other Free Software Foundation software is covered by
18 | the GNU Library General Public License instead.) You can apply it to
19 | your programs, too.
20 |
21 | When we speak of free software, we are referring to freedom, not
22 | price. Our General Public Licenses are designed to make sure that you
23 | have the freedom to distribute copies of free software (and charge for
24 | this service if you wish), that you receive source code or can get it
25 | if you want it, that you can change the software or use pieces of it
26 | in new free programs; and that you know you can do these things.
27 |
28 | To protect your rights, we need to make restrictions that forbid
29 | anyone to deny you these rights or to ask you to surrender the rights.
30 | These restrictions translate to certain responsibilities for you if you
31 | distribute copies of the software, or if you modify it.
32 |
33 | For example, if you distribute copies of such a program, whether
34 | gratis or for a fee, you must give the recipients all the rights that
35 | you have. You must make sure that they, too, receive or can get the
36 | source code. And you must show them these terms so they know their
37 | rights.
38 |
39 | We protect your rights with two steps: (1) copyright the software, and
40 | (2) offer you this license which gives you legal permission to copy,
41 | distribute and/or modify the software.
42 |
43 | Also, for each author's protection and ours, we want to make certain
44 | that everyone understands that there is no warranty for this free
45 | software. If the software is modified by someone else and passed on, we
46 | want its recipients to know that what they have is not the original, so
47 | that any problems introduced by others will not reflect on the original
48 | authors' reputations.
49 |
50 | Finally, any free program is threatened constantly by software
51 | patents. We wish to avoid the danger that redistributors of a free
52 | program will individually obtain patent licenses, in effect making the
53 | program proprietary. To prevent this, we have made it clear that any
54 | patent must be licensed for everyone's free use or not licensed at all.
55 |
56 | The precise terms and conditions for copying, distribution and
57 | modification follow.
58 |
59 | GNU GENERAL PUBLIC LICENSE
60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61 |
62 | 0. This License applies to any program or other work which contains
63 | a notice placed by the copyright holder saying it may be distributed
64 | under the terms of this General Public License. The "Program", below,
65 | refers to any such program or work, and a "work based on the Program"
66 | means either the Program or any derivative work under copyright law:
67 | that is to say, a work containing the Program or a portion of it,
68 | either verbatim or with modifications and/or translated into another
69 | language. (Hereinafter, translation is included without limitation in
70 | the term "modification".) Each licensee is addressed as "you".
71 |
72 | Activities other than copying, distribution and modification are not
73 | covered by this License; they are outside its scope. The act of
74 | running the Program is not restricted, and the output from the Program
75 | is covered only if its contents constitute a work based on the
76 | Program (independent of having been made by running the Program).
77 | Whether that is true depends on what the Program does.
78 |
79 | 1. You may copy and distribute verbatim copies of the Program's
80 | source code as you receive it, in any medium, provided that you
81 | conspicuously and appropriately publish on each copy an appropriate
82 | copyright notice and disclaimer of warranty; keep intact all the
83 | notices that refer to this License and to the absence of any warranty;
84 | and give any other recipients of the Program a copy of this License
85 | along with the Program.
86 |
87 | You may charge a fee for the physical act of transferring a copy, and
88 | you may at your option offer warranty protection in exchange for a fee.
89 |
90 | 2. You may modify your copy or copies of the Program or any portion
91 | of it, thus forming a work based on the Program, and copy and
92 | distribute such modifications or work under the terms of Section 1
93 | above, provided that you also meet all of these conditions:
94 |
95 | a) You must cause the modified files to carry prominent notices
96 | stating that you changed the files and the date of any change.
97 |
98 | b) You must cause any work that you distribute or publish, that in
99 | whole or in part contains or is derived from the Program or any
100 | part thereof, to be licensed as a whole at no charge to all third
101 | parties under the terms of this License.
102 |
103 | c) If the modified program normally reads commands interactively
104 | when run, you must cause it, when started running for such
105 | interactive use in the most ordinary way, to print or display an
106 | announcement including an appropriate copyright notice and a
107 | notice that there is no warranty (or else, saying that you provide
108 | a warranty) and that users may redistribute the program under
109 | these conditions, and telling the user how to view a copy of this
110 | License. (Exception: if the Program itself is interactive but
111 | does not normally print such an announcement, your work based on
112 | the Program is not required to print an announcement.)
113 |
114 | These requirements apply to the modified work as a whole. If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works. But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 |
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 |
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 |
134 | 3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 |
138 | a) Accompany it with the complete corresponding machine-readable
139 | source code, which must be distributed under the terms of Sections
140 | 1 and 2 above on a medium customarily used for software interchange; or,
141 |
142 | b) Accompany it with a written offer, valid for at least three
143 | years, to give any third party, for a charge no more than your
144 | cost of physically performing source distribution, a complete
145 | machine-readable copy of the corresponding source code, to be
146 | distributed under the terms of Sections 1 and 2 above on a medium
147 | customarily used for software interchange; or,
148 |
149 | c) Accompany it with the information you received as to the offer
150 | to distribute corresponding source code. (This alternative is
151 | allowed only for noncommercial distribution and only if you
152 | received the program in object code or executable form with such
153 | an offer, in accord with Subsection b above.)
154 |
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it. For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable. However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 |
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 |
172 | 4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License. Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 |
180 | 5. You are not required to accept this License, since you have not
181 | signed it. However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works. These actions are
183 | prohibited by law if you do not accept this License. Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 |
189 | 6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions. You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 |
197 | 7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License. If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all. For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 |
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 |
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices. Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 |
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 |
229 | 8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded. In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 |
237 | 9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time. Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 |
242 | Each version is given a distinguishing version number. If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation. If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 |
250 | 10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission. For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this. Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 |
258 | NO WARRANTY
259 |
260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 |
270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 |
280 | END OF TERMS AND CONDITIONS
281 |
--------------------------------------------------------------------------------
/README/LICENSES:
--------------------------------------------------------------------------------
1 | Released under the GPL v 2.0.
2 |
3 | If you did not receive a copy of the GPL, try http://www.gnu.org/.
4 |
5 | Copyright 2011 Christian Martorella
6 |
7 | theHarvester is free software; you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation version 2 of the License.
10 |
11 | theHarvester is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 | You should have received a copy of the GNU General Public License along with theHarvester; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 |
--------------------------------------------------------------------------------
/api-keys.yaml:
--------------------------------------------------------------------------------
1 | apikeys:
2 | bevigil:
3 | key:
4 |
5 | binaryedge:
6 | key:
7 |
8 | bing:
9 | key:
10 |
11 | censys:
12 | id:
13 | secret:
14 |
15 | fullhunt:
16 | key:
17 |
18 | github:
19 | key:
20 |
21 | hunter:
22 | key:
23 |
24 | intelx:
25 | key:
26 |
27 | pentestTools:
28 | key:
29 |
30 | projectDiscovery:
31 | key:
32 |
33 | rocketreach:
34 | key:
35 |
36 | securityTrails:
37 | key:
38 |
39 | shodan:
40 | key:
41 |
42 | virustotal:
43 | key:
44 |
45 | zoomeye:
46 | key:
47 |
--------------------------------------------------------------------------------
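The file is plain YAML with every service nested under a single apikeys mapping. A minimal sketch of reading it with PyYAML (python3-yaml is already a dependency); the path is illustrative, and theHarvester's own key-loading logic lives inside the package rather than here:

import yaml

# illustrative location; the Debian package links /etc/theHarvester/api-keys.yaml
with open('api-keys.yaml') as key_file:
    keys = yaml.safe_load(key_file)

# values stay None until a key is filled in
shodan_key = keys['apikeys']['shodan']['key']
censys_id = keys['apikeys']['censys']['id']
censys_secret = keys['apikeys']['censys']['secret']
print(shodan_key, censys_id, censys_secret)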
/bin/restfulHarvest:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import uvicorn
4 |
5 | parser = argparse.ArgumentParser()
6 | parser.add_argument('-H', '--host', default='127.0.0.1', help='IP address to listen on default is 127.0.0.1')
7 | parser.add_argument('-p', '--port', default=5000, help='Port to bind the web server to, default is 5000', type=int)
8 | parser.add_argument('-l', '--log-level', default='info', help='Set logging level, default is info but [critical|error|warning|info|debug|trace] can be set')
9 | parser.add_argument('-r', '--reload', default=False, help='Enable automatic reload used during development of the api', action='store_true')
10 |
11 | args = parser.parse_args()
12 |
13 | if __name__ == '__main__':
14 | uvicorn.run('theHarvester.lib.api.api:app', host=args.host, port=args.port, log_level=args.log_level, reload=args.reload)
15 |
--------------------------------------------------------------------------------
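With the defaults above, the launcher serves theHarvester.lib.api.api:app on 127.0.0.1:5000; something like `restfulHarvest -H 0.0.0.0 -p 8080 -l debug -r` (flags exactly as defined by the argparse options in this script) would listen on all interfaces on port 8080 with debug logging and auto-reload for development.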
/bin/theHarvester:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Note: This script runs theHarvester
3 | import sys
4 | import asyncio
5 | from theHarvester import __main__
6 |
7 | if sys.version_info.major < 3 or sys.version_info.minor < 7:
8 | print('\033[93m[!] Make sure you have Python 3.7+ installed, quitting.\n\n \033[0m')
9 | sys.exit(1)
10 |
11 | if __name__ == '__main__':
12 | platform = sys.platform
13 | if platform == 'win32':
14 | # Required or things will break if trying to take screenshots
15 | import multiprocessing
16 |
17 | multiprocessing.freeze_support()
18 | asyncio.DefaultEventLoopPolicy = asyncio.WindowsSelectorEventLoopPolicy
19 | else:
20 | import uvloop
21 | uvloop.install()
22 |
23 | if "linux" in platform:
24 | import aiomultiprocess
25 |
26 | # As we are not using Windows we can change the spawn method to fork for greater performance
27 | aiomultiprocess.set_context("fork")
28 | asyncio.run(__main__.entry_point())
29 |
--------------------------------------------------------------------------------
/debian/changelog:
--------------------------------------------------------------------------------
1 | theharvester (4.2.0-0parrot3) parrot-updates; urgency=medium
2 |
3 | * Update package dependencies.
4 | * Update helper script.
5 |
6 | -- Lorenzo "Palinuro" Faletra Wed, 21 Dec 2022 15:11:49 +0100
7 |
8 | theharvester (4.2.0-0parrot2) parrot-updates; urgency=medium
9 |
10 | * Rebuild package.
11 |
12 | -- Lorenzo "Palinuro" Faletra Wed, 21 Dec 2022 15:06:12 +0100
13 |
14 | theharvester (4.2.0-0parrot1) parrot-updates; urgency=medium
15 |
16 | * Import new Upstream release.
17 |
18 | -- Lorenzo "Palinuro" Faletra Wed, 21 Dec 2022 12:59:39 +0100
19 |
20 | theharvester (3.2.3-parrot0) rolling-testing; urgency=medium
21 |
22 | * Remove Kali ci scripts
23 | * Init Parrot team info
24 | * Remove old command
25 | * Add launcher
26 | * Edit launcher command
27 |
28 | -- Nong Hoang Tu Thu, 04 Mar 2021 00:39:41 +0700
29 |
30 | theharvester (3.2.3-0kali1) kali-dev; urgency=medium
31 |
32 | * New upstream version 3.2.3
33 | * Add Restrictions: superficial to autopkgtest
34 |
35 | -- Sophie Brun Mon, 08 Feb 2021 11:45:55 +0100
36 |
37 | theharvester (3.2.2-0kali2) kali-dev; urgency=medium
38 |
39 | * Fix installation of the wordlists
40 |
41 | -- Sophie Brun Thu, 07 Jan 2021 10:47:15 +0100
42 |
43 | theharvester (3.2.2-0kali1) kali-dev; urgency=medium
44 |
45 | [ Ben Wilson ]
46 | * Fix email address
47 |
48 | [ Sophie Brun ]
49 | * New upstream version 3.2.2
50 |
51 | -- Sophie Brun Thu, 17 Dec 2020 10:08:13 +0100
52 |
53 | theharvester (3.2.0-0kali1) kali-dev; urgency=medium
54 |
55 | * New upstream version 3.2.0
56 | * Remove merged patches
57 | * Update build-deps and deps
58 | * Update installation to use usptream setup.py
59 | * Add lintian-overrides for breakout-link
60 |
61 | -- Sophie Brun Fri, 11 Sep 2020 09:30:08 +0200
62 |
63 | theharvester (3.1-0kali4) kali-dev; urgency=medium
64 |
65 | * Fix for issue 6450:
66 | - Add a link to the wordlists
67 | - Use an helper-script to change the run directory
68 | - Add a patch to change directory of sqlite db
69 | * Bump Standards-Version to 4.5.0
70 |
71 | -- Sophie Brun Wed, 08 Jul 2020 12:06:35 +0200
72 |
73 | theharvester (3.1-0kali3) kali-dev; urgency=medium
74 |
75 | [ Sven Höper ]
76 | * Add missing depends: python3-yaml
77 | * Packaging: Fix test command
78 |
79 | [ Sophie Brun ]
80 | * Add a script to mention that theharvester command is deprecated
81 |
82 | -- Sophie Brun Wed, 18 Dec 2019 08:44:38 +0100
83 |
84 | theharvester (3.1-0kali2) kali-dev; urgency=medium
85 |
86 | * Add missing depends: python3-dnspython
87 |
88 | -- Sophie Brun Tue, 15 Oct 2019 18:20:09 +0200
89 |
90 | theharvester (3.1-0kali1) kali-dev; urgency=medium
91 |
92 | [ Raphaël Hertzog ]
93 | * Update Vcs-* fields for the move to gitlab.com
94 | * Add GitLab's CI configuration file
95 | * Configure git-buildpackage for Kali
96 | * Update URL in GitLab's CI configuration file
97 |
98 | [ g0tmi1k ]
99 | * New format
100 |
101 | [ Sophie Brun ]
102 | * Update debian/watch
103 | * New upstream version 3.1
104 | * Remove obsolete patches
105 | * Use debhelper-compat 12
106 | * Update packaging to use setup.py
107 | * Bump Standards-Version to 4.4.1
108 |
109 | -- Sophie Brun Tue, 15 Oct 2019 08:59:27 +0200
110 |
111 | theharvester (3.0.6-0kali1) kali-dev; urgency=medium
112 |
113 | * New upstream version 3.0.6
114 |
115 | -- Sophie Brun Thu, 20 Dec 2018 09:27:33 +0100
116 |
117 | theharvester (3.0.5-0kali1) kali-dev; urgency=medium
118 |
119 | * New upstream version 3.0.5
120 | * Add dependency: python3-plotly
121 | * Refresh patch
122 | * Add minimal autopkgtest
123 |
124 | -- Sophie Brun Wed, 19 Dec 2018 11:00:29 +0100
125 |
126 | theharvester (3.0.4-0kali1) kali-dev; urgency=medium
127 |
128 | * New upstream version 3.0.4
129 | * Switch to Python 3
130 | * Add a minimal required version of wfuzz: this is the first version in
131 | Python 3
132 |
133 | -- Sophie Brun Thu, 13 Dec 2018 11:11:59 +0100
134 |
135 | theharvester (3.0.1-0kali1) kali-dev; urgency=medium
136 |
137 | * New upstream version 3.0.1
138 | * Bump Standards-Version to 4.2.1
139 | * Update debian/copyright
140 | * Add missing dependency: python-bs4
141 | * Refresh patch
142 |
143 | -- Sophie Brun Thu, 29 Nov 2018 14:40:11 +0100
144 |
145 | theharvester (3.0-0kali1) kali-dev; urgency=medium
146 |
147 | * Upstream update
148 |
149 | -- Ben Wilson Tue, 09 Oct 2018 12:19:07 +0100
150 |
151 | theharvester (2.7.2~20180322-0kali1) kali-dev; urgency=medium
152 |
153 | * Import new upstream version (Closes: 0004685)
154 | * Bump Standards-Version and use debhelper 11
155 | * Update debian/control and debian/theharvester.install
156 | * Refresh patches
157 | * Add wfuzz as dependency
158 |
159 | -- Sophie Brun Thu, 19 Apr 2018 09:11:32 +0200
160 |
161 | theharvester (2.7-0kali1) kali-dev; urgency=medium
162 |
163 | * Import new upstream release
164 |
165 | -- Sophie Brun Tue, 19 Apr 2016 09:32:54 +0200
166 |
167 | theharvester (2.6-0kali1) kali-dev; urgency=medium
168 |
169 | * Update watch file
170 |
171 | -- Sophie Brun Wed, 27 Jan 2016 10:26:13 +0100
172 |
173 | theharvester (2.6-0kali0) kali; urgency=low
174 |
175 | * Imported new upstream release (Closes: 0002291)
176 |
177 | -- Devon Kearns Tue, 26 May 2015 12:37:58 -0600
178 |
179 | theharvester (2.5+git20150109-0kali0) kali; urgency=medium
180 |
181 | * Imported new upstream release (Closes: 0001961)
182 |
183 | -- Devon Kearns Fri, 09 Jan 2015 12:24:36 -0700
184 |
185 | theharvester (2.2a-1kali2) kali; urgency=low
186 |
187 | * Patched usage output (Closes: 0001251)
188 |
189 | -- Devon Kearns Tue, 20 May 2014 10:49:13 -0600
190 |
191 | theharvester (2.2a-1kali1) kali; urgency=low
192 |
193 | * Updated watch file
194 |
195 | -- Mati Aharoni Sun, 12 Jan 2014 15:40:59 -0500
196 |
197 | theharvester (2.2a-1kali0) kali; urgency=low
198 |
199 | * New upstream version
200 |
201 | -- Devon Kearns Sat, 09 Feb 2013 14:47:29 -0700
202 |
203 | theharvester (2.2-1kali3) kali; urgency=low
204 |
205 | * Cleaned up debian files
206 |
207 | -- balding_parrot Tue, 18 Dec 2012 22:49:17 +0000
208 |
209 | theharvester (2.2-1kali2) kali; urgency=low
210 |
211 | * Removed desktop file
212 |
213 | -- balding_parrot Tue, 18 Dec 2012 22:32:47 +0000
214 |
215 | theharvester (2.2-1kali1) kali; urgency=low
216 |
217 | * Initial release
218 |
219 | -- balding_parrot Tue, 18 Dec 2012 08:21:47 +0000
220 |
--------------------------------------------------------------------------------
/debian/control:
--------------------------------------------------------------------------------
1 | Source: theharvester
2 | Section: utils
3 | Priority: optional
4 | Maintainer: Parrot Dev Team
5 | Uploaders: Lorenzo "Palinuro" Faletra
6 | Build-Depends: debhelper-compat (= 12),
7 | dh-python,
8 | python3-aiohttp,
9 | python3-all,
10 | python3-certifi,
11 | python3-requests,
12 | python3-setuptools,
13 | python3-yaml
14 | Standards-Version: 4.6.1
15 | Homepage: https://github.com/laramies/theHarvester
16 |
17 | Package: theharvester
18 | Architecture: all
19 | Depends: python3,
20 | python3-aiodns (>= 2.0.0),
21 | python3-aiohttp (>= 3.6.2),
22 | python3-aiofiles,
23 | python3-aiomultiprocess (>= 0.8.0),
24 | python3-aiosqlite (>= 0.15.0),
25 | python3-bs4 (>= 4.9.1),
26 | python3-censys (>= 2.1.7),
27 | python3-certifi (>= 2022.6.15),
28 | python3-dnspython (>= 2.0.0),
29 | # python3-fastapi: upstream went from 0.70.0 to 0.79.0
30 | # packaged in Debian, rev deps: theharvester, witnessme
31 | python3-fastapi (>= 0.74.0),
32 | python3-lxml (>= 4.5.2),
33 | python3-netaddr (>= 0.7.19),
34 | python3-ujson,
35 | python3-pyppeteer (>= 1.0.2),
36 | python3-requests (>= 2.23.0),
37 | python3-retrying (>= 1.3.3),
38 | python3-shodan (>= 1.23.0),
39 | python3-slowapi,
40 | python3-starlette,
41 | python3-texttable (>= 1.6.2),
42 | python3-uvicorn,
43 | python3-uvloop (>= 0.14.0),
44 | python3-yaml (>= 5.3.1),
45 | ${misc:Depends},
46 | ${python3:Depends},
47 | Description: tool for gathering e-mail accounts and subdomain names from public sources
48 | The package contains a tool for gathering subdomain names, e-mail addresses,
49 | virtual hosts, open ports/ banners, and employee names from different public
50 | sources (search engines, pgp key servers).
51 |
--------------------------------------------------------------------------------
/debian/copyright:
--------------------------------------------------------------------------------
1 | Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
2 | Upstream-Name: theharvester
3 | Source: https://github.com/laramies/theHarvester
4 |
5 | Files: *
6 | Copyright: 2011 Christian Martorella
7 | License: GPL-2
8 | This package is free software; you can redistribute it and/or modify
9 | it under the terms of the GNU General Public License version 2 as published by
10 | the Free Software Foundation.
11 | .
12 | This package is distributed in the hope that it will be useful,
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | GNU General Public License for more details.
16 | .
17 | You should have received a copy of the GNU General Public License
18 | along with this program. If not, see
19 | .
20 | On Debian systems, the complete text of the GNU General
21 | Public License version 2 can be found in "/usr/share/common-licenses/GPL-2".
22 |
23 | Files: debian/*
24 | Copyright: 2012 balding_parrot
25 | 2018 Sophie Brun
26 | License: GPL-2+
27 | This package is free software; you can redistribute it and/or modify
28 | it under the terms of the GNU General Public License as published by
29 | the Free Software Foundation; either version 2 of the License, or
30 | (at your option) any later version.
31 | .
32 | This package is distributed in the hope that it will be useful,
33 | but WITHOUT ANY WARRANTY; without even the implied warranty of
34 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
35 | GNU General Public License for more details.
36 | .
37 | You should have received a copy of the GNU General Public License
38 | along with this program. If not, see
39 | .
40 | On Debian systems, the complete text of the GNU General
41 | Public License version 2 can be found in "/usr/share/common-licenses/GPL-2".
42 |
--------------------------------------------------------------------------------
/debian/dirs:
--------------------------------------------------------------------------------
1 | usr/bin
2 |
--------------------------------------------------------------------------------
/debian/docs:
--------------------------------------------------------------------------------
1 | README.md
2 |
--------------------------------------------------------------------------------
/debian/gbp.conf:
--------------------------------------------------------------------------------
1 | [DEFAULT]
2 | pristine-tar = True
3 |
4 | [pq]
5 | patch-numbers = False
6 |
7 | [dch]
8 | multimaint-merge = True
9 |
--------------------------------------------------------------------------------
/debian/helper-script/theharvester:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 |
3 | set -e
4 |
5 | echo "this command is deprecated, use theHarvester instead"
6 | /usr/bin/theHarvester "$@"
7 |
--------------------------------------------------------------------------------
/debian/patches/Disable-a-failing-test-unstable-site.patch:
--------------------------------------------------------------------------------
1 | From: Sophie Brun
2 | Date: Tue, 30 Aug 2022 15:37:52 +0200
3 | Subject: Disable a failing test (unstable site)
4 |
5 | ---
6 | tests/discovery/test_sublist3r.py | 9 +++++----
7 | 1 file changed, 5 insertions(+), 4 deletions(-)
8 |
9 | diff --git a/tests/discovery/test_sublist3r.py b/tests/discovery/test_sublist3r.py
10 | index 374095e..1d5fdd1 100644
11 | --- a/tests/discovery/test_sublist3r.py
12 | +++ b/tests/discovery/test_sublist3r.py
13 | @@ -21,10 +21,11 @@ async def test_api(self):
14 | request = requests.get(base_url, headers=headers)
15 | assert request.status_code == 200
16 |
17 | - async def test_do_search(self):
18 | - search = sublist3r.SearchSublist3r(TestSublist3r.domain())
19 | - await search.process()
20 | - assert isinstance(await search.get_hostnames(), list)
21 | +# disable as it fails (unstable site?)
22 | +# async def test_do_search(self):
23 | +# search = sublist3r.SearchSublist3r(TestSublist3r.domain())
24 | +# await search.process()
25 | +# assert isinstance(await search.get_hostnames(), list)
26 |
27 |
28 | if __name__ == '__main__':
29 |
--------------------------------------------------------------------------------
/debian/patches/Improve-data-installation.patch:
--------------------------------------------------------------------------------
1 | From: Sophie Brun
2 | Date: Thu, 7 Jan 2021 10:19:09 +0100
3 | Subject: Improve data installation
4 |
5 | Bug-Kali: https://gitlab.com/kalilinux/packages/theharvester/-/issues/6
6 |
7 | By default the wordlists were installed directly in /etc/theHarvester
8 | instead of /etc/theHarvester/wordlists
9 | ---
10 | setup.py | 12 ++++++++----
11 | 1 file changed, 8 insertions(+), 4 deletions(-)
12 |
13 | diff --git a/setup.py b/setup.py
14 | index 128bd89..34a3ef0 100755
15 | --- a/setup.py
16 | +++ b/setup.py
17 | @@ -26,13 +26,17 @@
18 | ],
19 | data_files=[
20 | ('/etc/theHarvester', [
21 | - 'wordlists/general/common.txt',
22 | + 'api-keys.yaml',
23 | + 'proxies.yaml'
24 | + ]),
25 | + ('/etc/theHarvester/wordlists', [
26 | 'wordlists/dns-big.txt',
27 | 'wordlists/dns-names.txt',
28 | 'wordlists/dorks.txt',
29 | - 'wordlists/names_small.txt',
30 | - 'api-keys.yaml',
31 | - 'proxies.yaml'
32 | + 'wordlists/names_small.txt'
33 | + ]),
34 | + ('/etc/theHarvester/wordlists/general', [
35 | + 'wordlists/general/common.txt'
36 | ]
37 | )
38 | ],
39 |
--------------------------------------------------------------------------------
/debian/patches/series:
--------------------------------------------------------------------------------
1 | Improve-data-installation.patch
2 | Disable-a-failing-test-unstable-site.patch
3 |
--------------------------------------------------------------------------------
/debian/rules:
--------------------------------------------------------------------------------
1 | #!/usr/bin/make -f
2 |
3 | # output every command that modifies files on the build system.
4 | #export DH_VERBOSE = 1
5 |
6 | %:
7 | dh $@ --with python3 --buildsystem=pybuild
8 |
9 | override_dh_auto_test:
10 | # do not run tests during the build: most of the tests require
11 | # network
12 |
--------------------------------------------------------------------------------
/debian/source/format:
--------------------------------------------------------------------------------
1 | 3.0 (quilt)
2 |
--------------------------------------------------------------------------------
/debian/tests/control:
--------------------------------------------------------------------------------
1 | Test-Command: python3 -m pytest tests
2 | Depends: @, python3-pytest, python3-pytest-asyncio
3 |
4 | Test-Command: theHarvester -h
5 | Restrictions: superficial, allow-stderr
6 |
--------------------------------------------------------------------------------
/debian/theharvester.install:
--------------------------------------------------------------------------------
1 | debian/helper-script/* usr/bin
2 |
--------------------------------------------------------------------------------
/debian/theharvester.links:
--------------------------------------------------------------------------------
1 | etc/theHarvester/api-keys.yaml usr/lib/python3/dist-packages/theHarvester/api-keys.yaml
2 | etc/theHarvester/wordlists usr/lib/python3/dist-packages/theHarvester/wordlists
3 |
--------------------------------------------------------------------------------
/debian/upstream/metadata:
--------------------------------------------------------------------------------
1 | ---
2 | Bug-Database: https://github.com/laramies/theHarvester/issues
3 | Bug-Submit: https://github.com/laramies/theHarvester/issues/new
4 | Repository: https://github.com/laramies/theHarvester.git
5 | Repository-Browse: https://github.com/laramies/theHarvester
6 |
--------------------------------------------------------------------------------
/debian/watch:
--------------------------------------------------------------------------------
1 | version=4
2 | opts="filenamemangle=s/.*\/v?V?(.*)\.tar\.gz/theharvester-$1.tar.gz/" \
3 | https://github.com/laramies/theHarvester/tags .*/v?V?(.*)\.tar\.gz
4 |
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | ignore_missing_imports = True
3 | show_traceback = True
4 | show_error_codes = True
5 | namespace_packages = True
6 |
--------------------------------------------------------------------------------
/proxies.yaml:
--------------------------------------------------------------------------------
1 | http:
2 | - ip:port
3 |
--------------------------------------------------------------------------------
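In proxies.yaml the ip:port entry is a placeholder: each proxy goes on its own list item. A filled-in sketch (the addresses below are made up, not part of the repo):

http:
  - 127.0.0.1:8080
  - 10.0.0.5:3128

--------------------------------------------------------------------------------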
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.pytest.ini_options]
2 | minversion = "7.1"
3 | addopts = "--no-header --asyncio-mode=auto"
4 | testpaths = [
5 | "tests",
6 | "tests/discovery/",
7 | ]
--------------------------------------------------------------------------------
/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | minversion = 7.1.1
3 | testpaths = tests
4 | asyncio_mode=auto
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | -r requirements/base.txt
2 |
--------------------------------------------------------------------------------
/requirements/base.txt:
--------------------------------------------------------------------------------
1 | aiodns==3.0.0
2 | aiofiles==0.8.0
3 | aiohttp==3.8.1
4 | aiomultiprocess==0.9.0
5 | aiosqlite==0.17.0
6 | beautifulsoup4==4.11.1
7 | censys==2.1.7
8 | certifi==2022.6.15
9 | dnspython==2.2.1
10 | fastapi==0.79.0
11 | lxml==4.9.1
12 | netaddr==0.8.0
13 | ujson==5.4.0
14 | pyppeteer==1.0.2
15 | PyYAML==6.0
16 | requests==2.28.1
17 | retrying==1.3.3
18 | setuptools==64.0.3
19 | shodan==1.28.0
20 | slowapi==0.1.5
21 | uvicorn==0.18.2
22 | uvloop==0.16.0; platform_system != "Windows"
--------------------------------------------------------------------------------
/requirements/dev.txt:
--------------------------------------------------------------------------------
1 | -r base.txt
2 | flake8==5.0.4
3 | mypy==0.971
4 | mypy-extensions==0.4.3
5 | pyflakes==2.5.0
6 | pytest==7.1.2
7 | pytest-asyncio==0.19.0
8 | types-certifi==2021.10.8.3
9 | types-chardet==5.0.4
10 | types-ujson==5.4.0
11 | types-PyYAML==6.0.11
12 | types-requests==2.28.8
13 | wheel==0.37.1
--------------------------------------------------------------------------------
/restfulHarvest.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import uvicorn
4 |
5 | parser = argparse.ArgumentParser()
6 | parser.add_argument('-H', '--host', default='127.0.0.1', help='IP address to listen on, default is 127.0.0.1')
7 | parser.add_argument('-p', '--port', default=5000, help='Port to bind the web server to, default is 5000', type=int)
8 | parser.add_argument('-l', '--log-level', default='info', help='Set logging level; one of [critical|error|warning|info|debug|trace], default is info')
9 | parser.add_argument('-r', '--reload', default=False, help='Enable automatic reload, used during development of the API', action='store_true')
10 |
11 | args = parser.parse_args()
12 |
13 | if __name__ == '__main__':
14 | uvicorn.run('theHarvester.lib.api.api:app', host=args.host, port=args.port, log_level=args.log_level, reload=args.reload)
15 |
--------------------------------------------------------------------------------
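Given the flags defined above, the REST API server is typically started along these lines (host and port are example values only):

python3 restfulHarvest.py -H 0.0.0.0 -p 8000 -r

This serves theHarvester.lib.api.api:app through uvicorn, listening on all interfaces on port 8000 with auto-reload enabled.
--------------------------------------------------------------------------------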
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E501, F405, F403, E402, F401
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | from theHarvester.lib.core import Core
3 |
4 | with open('README.md', 'r') as fh:
5 | long_description = fh.read()
6 |
7 | setup(
8 | name='theHarvester',
9 | version=Core.version(),
10 | author="Christian Martorella",
11 | author_email="cmartorella@edge-security.com",
12 | description="theHarvester is a very simple, yet effective tool designed to be used in the early stages of a penetration test",
13 | long_description=long_description,
14 | long_description_content_type="text/markdown",
15 | url="https://github.com/laramies/theHarvester",
16 | packages=find_packages(exclude=['tests']),
17 | python_requires='>=3.7',
18 | scripts=['bin/theHarvester',
19 | 'bin/restfulHarvest'],
20 |
21 | classifiers=[
22 | "Programming Language :: Python :: 3",
23 | "Programming Language :: Python :: 3.8",
24 | "Programming Language :: Python :: 3.9",
25 | "Programming Language :: Python :: 3.10",
26 | "License :: OSI Approved :: GNU General Public License v2 (GPLv2)",
27 | "Operating System :: OS Independent",
28 | ],
29 | data_files=[
30 | ('/etc/theHarvester', [
31 | 'wordlists/general/common.txt',
32 | 'wordlists/dns-big.txt',
33 | 'wordlists/dns-names.txt',
34 | 'wordlists/dorks.txt',
35 | 'wordlists/names_small.txt',
36 | 'api-keys.yaml',
37 | 'proxies.yaml'
38 | ]
39 | )
40 | ],
41 | )
42 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/tests/__init__.py
--------------------------------------------------------------------------------
/tests/discovery/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/tests/discovery/__init__.py
--------------------------------------------------------------------------------
/tests/discovery/test_anubis.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf-8
3 | import requests
4 | from theHarvester.lib.core import *
5 | from theHarvester.discovery import anubis
6 | import os
7 | import pytest
8 |
9 | pytestmark = pytest.mark.asyncio
10 | github_ci = os.getenv('GITHUB_ACTIONS')  # GitHub Actions sets this to the string 'true', not the boolean True
11 |
12 |
13 | class TestAnubis:
14 | @staticmethod
15 | def domain() -> str:
16 | return 'apple.com'
17 |
18 | async def test_api(self):
19 | base_url = f'https://jldc.me/anubis/subdomains/{TestAnubis.domain()}'
20 | headers = {'User-Agent': Core.get_user_agent()}
21 | request = requests.get(base_url, headers=headers)
22 | assert request.status_code == 200
23 |
24 | async def test_do_search(self):
25 | search = anubis.SearchAnubis(word=TestAnubis.domain())
26 | await search.do_search()
27 | return await search.get_hostnames()
28 |
29 | async def test_process(self):
30 |         hosts = await self.test_do_search()
31 |         assert len(hosts) > 0
32 |
--------------------------------------------------------------------------------
/tests/discovery/test_certspotter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf-8
3 | from theHarvester.lib.core import *
4 | from theHarvester.discovery import certspottersearch
5 | import os
6 | import requests
7 | import pytest
8 |
9 | pytestmark = pytest.mark.asyncio
10 | github_ci = os.getenv('GITHUB_ACTIONS')  # GitHub Actions sets this to the string 'true', not the boolean True
11 |
12 |
13 | class TestCertspotter(object):
14 | @staticmethod
15 | def domain() -> str:
16 | return 'metasploit.com'
17 |
18 | async def test_api(self):
19 | base_url = f'https://api.certspotter.com/v1/issuances?domain={TestCertspotter.domain()}&expand=dns_names'
20 | headers = {'User-Agent': Core.get_user_agent()}
21 | request = requests.get(base_url, headers=headers)
22 | assert request.status_code == 200
23 |
24 | async def test_search(self):
25 | search = certspottersearch.SearchCertspoter(TestCertspotter.domain())
26 | await search.process()
27 | assert isinstance(await search.get_hostnames(), set)
28 |
29 | async def test_search_no_results(self):
30 | search = certspottersearch.SearchCertspoter('radiant.eu')
31 | await search.process()
32 | assert len(await search.get_hostnames()) == 0
33 |
34 |
35 | if __name__ == '__main__':
36 | pytest.main()
37 |
--------------------------------------------------------------------------------
/tests/discovery/test_githubcode.py:
--------------------------------------------------------------------------------
1 | from theHarvester.discovery import githubcode
2 | from theHarvester.discovery.constants import MissingKey
3 | from theHarvester.lib.core import Core
4 | from unittest.mock import MagicMock
5 | from requests import Response
6 | import pytest
7 |
8 | pytestmark = pytest.mark.asyncio
9 |
10 |
11 | class TestSearchGithubCode:
12 |
13 | class OkResponse:
14 | response = Response()
15 | json = {
16 | "items": [
17 | {
18 | "text_matches": [
19 | {
20 | "fragment": "test1"
21 | }
22 | ]
23 | },
24 | {
25 | "text_matches": [
26 | {
27 | "fragment": "test2"
28 | }
29 | ]
30 | }
31 | ]
32 | }
33 | response.status_code = 200
34 | response.json = MagicMock(return_value=json)
35 |
36 | class FailureResponse:
37 | response = Response()
38 | response.json = MagicMock(return_value={})
39 | response.status_code = 401
40 |
41 | class RetryResponse:
42 | response = Response()
43 | response.json = MagicMock(return_value={})
44 | response.status_code = 403
45 |
46 | class MalformedResponse:
47 | response = Response()
48 | json = {
49 | "items": [
50 | {
51 | "fail": True
52 | },
53 | {
54 | "text_matches": []
55 | },
56 | {
57 | "text_matches": [
58 | {
59 | "weird": "result"
60 | }
61 | ]
62 | }
63 | ]
64 | }
65 | response.json = MagicMock(return_value=json)
66 | response.status_code = 200
67 |
68 | async def test_missing_key(self):
69 | with pytest.raises(MissingKey):
70 | Core.github_key = MagicMock(return_value=None)
71 | githubcode.SearchGithubCode(word="test", limit=500)
72 |
73 | async def test_fragments_from_response(self):
74 | Core.github_key = MagicMock(return_value="lol")
75 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
76 | test_result = await test_class_instance.fragments_from_response(self.OkResponse.response.json())
77 | print('test_result: ', test_result)
78 | assert test_result == ["test1", "test2"]
79 |
80 | async def test_invalid_fragments_from_response(self):
81 | Core.github_key = MagicMock(return_value="lol")
82 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
83 | test_result = await test_class_instance.fragments_from_response(self.MalformedResponse.response.json())
84 | assert test_result == []
85 |
86 | async def test_next_page(self):
87 | Core.github_key = MagicMock(return_value="lol")
88 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
89 | test_result = githubcode.SuccessResult(list(), next_page=2, last_page=4)
90 | assert (2 == await test_class_instance.next_page_or_end(test_result))
91 |
92 | async def test_last_page(self):
93 | Core.github_key = MagicMock(return_value="lol")
94 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
95 | test_result = githubcode.SuccessResult(list(), None, None)
96 | assert (None is await test_class_instance.next_page_or_end(test_result))
97 |
98 | if __name__ == '__main__':
99 | pytest.main()
100 |
--------------------------------------------------------------------------------
/tests/discovery/test_omnisint.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf-8
3 | from theHarvester.lib.core import *
4 | from theHarvester.discovery import omnisint
5 | import os
6 | import requests
7 | import pytest
8 |
9 | pytestmark = pytest.mark.asyncio
10 | github_ci = os.getenv('GITHUB_ACTIONS')  # GitHub Actions sets this to the string 'true', not the boolean True
11 |
12 |
13 | class TestOmnisint(object):
14 | @staticmethod
15 | def domain() -> str:
16 | return 'uber.com'
17 |
18 | @pytest.mark.skipif(github_ci == 'true', reason='Skipping on Github CI due to unstable status code from site')
19 | async def test_api(self):
20 | base_url = f'https://sonar.omnisint.io/all/{TestOmnisint.domain()}'
21 | headers = {'User-Agent': Core.get_user_agent()}
22 | request = requests.get(base_url, headers=headers)
23 | assert request.status_code == 200
24 |
25 | async def test_search(self):
26 | search = omnisint.SearchOmnisint(TestOmnisint.domain())
27 | await search.process()
28 | assert isinstance(await search.get_hostnames(), list)
29 |
30 |
31 | if __name__ == '__main__':
32 | pytest.main()
33 |
--------------------------------------------------------------------------------
/tests/discovery/test_otx.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf-8
3 | from theHarvester.lib.core import *
4 | from theHarvester.discovery import otxsearch
5 | import os
6 | import requests
7 | import pytest
8 |
9 | pytestmark = pytest.mark.asyncio
10 | github_ci = os.getenv('GITHUB_ACTIONS')  # GitHub Actions sets this to the string 'true', not the boolean True
11 |
12 |
13 | class TestOtx(object):
14 | @staticmethod
15 | def domain() -> str:
16 | return 'metasploit.com'
17 |
18 | async def test_api(self):
19 | base_url = f'https://otx.alienvault.com/api/v1/indicators/domain/{TestOtx.domain()}/passive_dns'
20 | headers = {'User-Agent': Core.get_user_agent()}
21 | request = requests.get(base_url, headers=headers)
22 | assert request.status_code == 200
23 |
24 | async def test_search(self):
25 | search = otxsearch.SearchOtx(TestOtx.domain())
26 | await search.process()
27 | assert isinstance(await search.get_hostnames(), set)
28 | assert isinstance(await search.get_ips(), set)
29 |
30 |
31 | if __name__ == '__main__':
32 | pytest.main()
33 |
--------------------------------------------------------------------------------
/tests/discovery/test_qwantsearch.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf-8
3 | from theHarvester.discovery import qwantsearch
4 | import os
5 | import pytest
6 |
7 | pytestmark = pytest.mark.asyncio
8 | github_ci = os.getenv('GITHUB_ACTIONS')  # GitHub Actions sets this to the string 'true', not the boolean True
9 |
10 |
11 | class TestSearchQwant(object):
12 |
13 | @staticmethod
14 | def domain() -> str:
15 | return 'example.com'
16 |
17 | async def test_get_start_offset_return_0(self):
18 | search = qwantsearch.SearchQwant(TestSearchQwant.domain(), 0, 200)
19 | assert search.get_start_offset() == 0
20 |
21 | async def test_get_start_offset_return_50(self):
22 | search = qwantsearch.SearchQwant(TestSearchQwant.domain(), 55, 200)
23 | assert search.get_start_offset() == 50
24 |
25 | async def test_get_start_offset_return_100(self):
26 | search = qwantsearch.SearchQwant(TestSearchQwant.domain(), 100, 200)
27 | assert search.get_start_offset() == 100
28 |
29 | async def test_get_emails(self):
30 | search = qwantsearch.SearchQwant(TestSearchQwant.domain(), 0, 200)
31 | await search.process()
32 | assert isinstance(await search.get_emails(), set)
33 |
34 | async def test_get_hostnames(self):
35 | search = qwantsearch.SearchQwant(TestSearchQwant.domain(), 0, 200)
36 | await search.process()
37 | assert isinstance(await search.get_hostnames(), list)
38 |
--------------------------------------------------------------------------------
/tests/discovery/test_sublist3r.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf-8
3 | import requests
4 | from theHarvester.lib.core import *
5 | from theHarvester.discovery import sublist3r
6 | import os
7 | import pytest
8 |
9 | pytestmark = pytest.mark.asyncio
10 | github_ci = os.getenv('GITHUB_ACTIONS')  # GitHub Actions sets this to the string 'true', not the boolean True
11 |
12 |
13 | class TestSublist3r(object):
14 | @staticmethod
15 | def domain() -> str:
16 | return 'target.com'
17 |
18 | async def test_api(self):
19 | base_url = f'https://api.sublist3r.com/search.php?domain={TestSublist3r.domain()}'
20 | headers = {'User-Agent': Core.get_user_agent()}
21 | request = requests.get(base_url, headers=headers)
22 | assert request.status_code == 200
23 |
24 | async def test_do_search(self):
25 | search = sublist3r.SearchSublist3r(TestSublist3r.domain())
26 | await search.process()
27 | assert isinstance(await search.get_hostnames(), list)
28 |
29 |
30 | if __name__ == '__main__':
31 | pytest.main()
32 |
--------------------------------------------------------------------------------
/tests/discovery/test_threatminer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf-8
3 | import requests
4 | from theHarvester.lib.core import *
5 | from theHarvester.discovery import threatminer
6 | import os
7 | import pytest
8 |
9 | pytestmark = pytest.mark.asyncio
10 | github_ci = os.getenv('GITHUB_ACTIONS')  # GitHub Actions sets this to the string 'true', not the boolean True
11 |
12 |
13 | class TestThreatminer(object):
14 | @staticmethod
15 | def domain() -> str:
16 | return 'target.com'
17 |
18 | async def test_api(self):
19 | base_url = f'https://api.threatminer.org/v2/domain.php?q={TestThreatminer.domain()}&rt=5'
20 | headers = {'User-Agent': Core.get_user_agent()}
21 | request = requests.get(base_url, headers=headers)
22 | assert request.status_code == 200
23 |
24 | async def test_search(self):
25 | search = threatminer.SearchThreatminer(TestThreatminer.domain())
26 | await search.process()
27 | assert isinstance(await search.get_hostnames(), set)
28 | assert isinstance(await search.get_ips(), set)
29 |
30 |
31 | if __name__ == '__main__':
32 | pytest.main()
33 |
--------------------------------------------------------------------------------
/tests/test_myparser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf-8
3 |
4 | from theHarvester.parsers import myparser
5 | import pytest
6 |
7 |
8 | class TestMyParser(object):
9 |
10 | @pytest.mark.asyncio
11 | async def test_emails(self):
12 | word = 'domain.com'
13 | results = '@domain.com***a@domain***banotherdomain.com***c@domain.com***d@sub.domain.com***'
14 | parse = myparser.Parser(results, word)
15 | emails = sorted(await parse.emails())
16 |         assert emails == ['c@domain.com', 'd@sub.domain.com']
17 |
18 |
19 | if __name__ == '__main__':
20 | pytest.main()
21 |
--------------------------------------------------------------------------------
/theHarvester-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/theHarvester-logo.png
--------------------------------------------------------------------------------
/theHarvester.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Note: This script runs theHarvester
3 | import sys
4 | import asyncio
5 | from theHarvester import __main__
6 |
7 | if sys.version_info < (3, 7):
8 | print('\033[93m[!] Make sure you have Python 3.7+ installed, quitting.\n\n \033[0m')
9 | sys.exit(1)
10 |
11 | if __name__ == '__main__':
12 | platform = sys.platform
13 | if platform == 'win32':
14 | # Required or things will break if trying to take screenshots
15 | import multiprocessing
16 |
17 | multiprocessing.freeze_support()
18 | asyncio.DefaultEventLoopPolicy = asyncio.WindowsSelectorEventLoopPolicy
19 | else:
20 | import uvloop
21 | uvloop.install()
22 |
23 | if "linux" in platform:
24 | import aiomultiprocess
25 |
26 | # As we are not using Windows we can change the spawn method to fork for greater performance
27 | aiomultiprocess.set_context("fork")
28 | asyncio.run(__main__.entry_point())
29 |
--------------------------------------------------------------------------------
/theHarvester/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/theHarvester/__init__.py
--------------------------------------------------------------------------------
/theHarvester/discovery/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/theHarvester/discovery/__init__.py
--------------------------------------------------------------------------------
/theHarvester/discovery/anubis.py:
--------------------------------------------------------------------------------
1 | from typing import Type
2 | from theHarvester.lib.core import *
3 |
4 |
5 | class SearchAnubis:
6 |
7 | def __init__(self, word):
8 | self.word = word
9 |         self.totalhosts = []
10 | self.proxy = False
11 |
12 | async def do_search(self):
13 | url = f'https://jldc.me/anubis/subdomains/{self.word}'
14 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy)
15 | self.totalhosts: list = response[0]
16 |
17 | async def get_hostnames(self) -> Type[list]:
18 | return self.totalhosts
19 |
20 | async def process(self, proxy=False):
21 | self.proxy = proxy
22 | await self.do_search()
23 |
--------------------------------------------------------------------------------
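The discovery modules share this shape: construct with a target, await process(), then read the accumulated results. A minimal driver sketch (example.com is a placeholder domain and a live request to jldc.me is made):

import asyncio
from theHarvester.discovery import anubis

async def demo() -> None:
    search = anubis.SearchAnubis(word='example.com')
    await search.process()                # fetches https://jldc.me/anubis/subdomains/example.com
    print(await search.get_hostnames())   # list of subdomains returned by the API

asyncio.run(demo())

--------------------------------------------------------------------------------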
/theHarvester/discovery/baidusearch.py:
--------------------------------------------------------------------------------
1 | from theHarvester.lib.core import *
2 | from theHarvester.parsers import myparser
3 |
4 |
5 | class SearchBaidu:
6 |
7 | def __init__(self, word, limit):
8 | self.word = word
9 | self.total_results = ""
10 | self.server = 'www.baidu.com'
11 | self.hostname = 'www.baidu.com'
12 | self.limit = limit
13 | self.proxy = False
14 |
15 | async def do_search(self):
16 | headers = {
17 | 'Host': self.hostname,
18 | 'User-agent': Core.get_user_agent()
19 | }
20 | base_url = f'https://{self.server}/s?wd=%40{self.word}&pn=xx&oq={self.word}'
21 | urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
22 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy)
23 | for response in responses:
24 | self.total_results += response
25 |
26 | async def process(self, proxy=False):
27 | self.proxy = proxy
28 | await self.do_search()
29 |
30 | async def get_emails(self):
31 | rawres = myparser.Parser(self.total_results, self.word)
32 | return await rawres.emails()
33 |
34 | async def get_hostnames(self):
35 | rawres = myparser.Parser(self.total_results, self.word)
36 | return await rawres.hostnames()
37 |
--------------------------------------------------------------------------------
/theHarvester/discovery/bevigil.py:
--------------------------------------------------------------------------------
1 | from theHarvester.lib.core import *
2 |
3 |
4 | class SearchBeVigil:
5 |
6 | def __init__(self, word):
7 | self.word = word
8 | self.totalhosts = set()
9 | self.interestingurls = set()
10 | self.key = Core.bevigil_key()
11 | self.proxy = False
12 |
13 | async def do_search(self):
14 | subdomain_endpoint = f"https://osint.bevigil.com/api/{self.word}/subdomains/"
15 | url_endpoint = f"https://osint.bevigil.com/api/{self.word}/urls/"
16 | headers = {'X-Access-Token': self.key}
17 |
18 | responses = await AsyncFetcher.fetch_all([subdomain_endpoint], json=True, proxy=self.proxy, headers=headers)
19 | response = responses[0]
20 | for subdomain in response["subdomains"]:
21 | self.totalhosts.add(subdomain)
22 |
23 | responses = await AsyncFetcher.fetch_all([url_endpoint], json=True, proxy=self.proxy, headers=headers)
24 | response = responses[0]
25 | for url in response["urls"]:
26 | self.interestingurls.add(url)
27 |
28 | async def get_hostnames(self) -> set:
29 | return self.totalhosts
30 |
31 | async def get_interestingurls(self) -> set:
32 | return self.interestingurls
33 |
34 | async def process(self, proxy=False):
35 | self.proxy = proxy
36 | await self.do_search()
37 |
--------------------------------------------------------------------------------
/theHarvester/discovery/binaryedgesearch.py:
--------------------------------------------------------------------------------
1 | from theHarvester.discovery.constants import *
2 | import asyncio
3 |
4 |
5 | class SearchBinaryEdge:
6 |
7 | def __init__(self, word, limit):
8 | self.word = word
9 | self.totalhosts = set()
10 | self.proxy = False
11 | self.key = Core.binaryedge_key()
12 | self.limit = 501 if limit >= 501 else limit
13 | self.limit = 2 if self.limit == 1 else self.limit
14 | if self.key is None:
15 | raise MissingKey('binaryedge')
16 |
17 | async def do_search(self):
18 | base_url = f'https://api.binaryedge.io/v2/query/domains/subdomain/{self.word}'
19 | headers = {'X-KEY': self.key, 'User-Agent': Core.get_user_agent()}
20 | for page in range(1, self.limit):
21 | params = {'page': page}
22 | response = await AsyncFetcher.fetch_all([base_url], json=True, proxy=self.proxy, params=params, headers=headers)
23 | responses = response[0]
24 | dct = responses
25 | if ('status' in dct.keys() and 'message' in dct.keys()) and \
26 | (dct['status'] == 400 or 'Bad Parameter' in dct['message'] or 'Error' in dct['message']):
27 | # 400 status code means no more results
28 | break
29 | if 'events' in dct.keys():
30 | if len(dct['events']) == 0:
31 | break
32 | self.totalhosts.update({host for host in dct['events']})
33 | await asyncio.sleep(get_delay())
34 |
35 | async def get_hostnames(self) -> set:
36 | return self.totalhosts
37 |
38 | async def process(self, proxy=False):
39 | self.proxy = proxy
40 | await self.do_search()
41 |
--------------------------------------------------------------------------------
/theHarvester/discovery/bingsearch.py:
--------------------------------------------------------------------------------
1 | from theHarvester.discovery.constants import *
2 | from theHarvester.lib.core import *
3 | from theHarvester.parsers import myparser
4 |
5 |
6 | class SearchBing:
7 |
8 | def __init__(self, word, limit, start):
9 | self.word = word.replace(' ', '%20')
10 | self.results = ""
11 | self.total_results = ""
12 | self.server = 'www.bing.com'
13 | self.apiserver = 'api.search.live.net'
14 | self.hostname = 'www.bing.com'
15 | self.limit = int(limit)
16 | self.bingApi = Core.bing_key()
17 | self.counter = start
18 | self.proxy = False
19 |
20 | async def do_search(self):
21 | headers = {
22 | 'Host': self.hostname,
23 | 'Cookie': 'SRCHHPGUSR=ADLT=DEMOTE&NRSLT=50',
24 | 'Accept-Language': 'en-us,en',
25 | 'User-agent': Core.get_user_agent()
26 | }
27 | base_url = f'https://{self.server}/search?q=%40"{self.word}"&count=50&first=xx'
28 | urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 50) if num <= self.limit]
29 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy)
30 | for response in responses:
31 | self.total_results += response
32 |
33 | async def do_search_api(self):
34 | url = 'https://api.cognitive.microsoft.com/bing/v7.0/search?'
35 | params = {
36 | 'q': self.word,
37 | 'count': str(self.limit),
38 | 'offset': '0',
39 | 'mkt': 'en-us',
40 | 'safesearch': 'Off'
41 | }
42 | headers = {'User-Agent': Core.get_user_agent(), 'Ocp-Apim-Subscription-Key': self.bingApi}
43 |         self.results = (await AsyncFetcher.fetch_all([url], headers=headers, params=params, proxy=self.proxy))[0]
44 | self.total_results += self.results
45 |
46 | async def do_search_vhost(self):
47 | headers = {
48 | 'Host': self.hostname,
49 | 'Cookie': 'mkt=en-US;ui=en-US;SRCHHPGUSR=NEWWND=0&ADLT=DEMOTE&NRSLT=50',
50 | 'Accept-Language': 'en-us,en',
51 | 'User-agent': Core.get_user_agent()
52 | }
53 | base_url = f'http://{self.server}/search?q=ip:{self.word}&go=&count=50&FORM=QBHL&qs=n&first=xx'
54 | urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 50) if num <= self.limit]
55 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy)
56 | for response in responses:
57 | self.total_results += response
58 |
59 | async def get_emails(self):
60 | rawres = myparser.Parser(self.total_results, self.word)
61 | return await rawres.emails()
62 |
63 | async def get_hostnames(self):
64 | rawres = myparser.Parser(self.total_results, self.word)
65 | return await rawres.hostnames()
66 |
67 | async def get_allhostnames(self):
68 | rawres = myparser.Parser(self.total_results, self.word)
69 | return await rawres.hostnames_all()
70 |
71 | async def process(self, api, proxy=False):
72 | self.proxy = proxy
73 |         if api == 'yes':
74 |             if self.bingApi is None:
75 |                 raise MissingKey('BingAPI')
76 |             # a key is present, so query the Bing API endpoint
77 |             await self.do_search_api()
78 |         else:
79 |             await self.do_search()
80 |
81 | print(f'\tSearching {self.counter} results.')
82 |
83 | async def process_vhost(self):
84 | await self.do_search_vhost()
85 |
--------------------------------------------------------------------------------
/theHarvester/discovery/bufferoverun.py:
--------------------------------------------------------------------------------
1 | from theHarvester.lib.core import *
2 | import re
3 |
4 |
5 | class SearchBufferover:
6 | def __init__(self, word):
7 | self.word = word
8 | self.totalhosts = set()
9 | self.totalips = set()
10 | self.proxy = False
11 |
12 | async def do_search(self):
13 | url = f'https://dns.bufferover.run/dns?q={self.word}'
14 | responses = await AsyncFetcher.fetch_all(urls=[url], json=True, proxy=self.proxy)
15 | responses = responses[0]
16 | dct = responses
17 |
18 | if dct['FDNS_A']:
19 | self.totalhosts: set = {
20 | host.split(',')[0].replace('www.', '') if ',' in host and self.word.replace('www.', '') in host.split(',')[
21 | 0] in host else
22 | host.split(',')[1] for host in dct['FDNS_A']}
23 |
24 | self.totalips: set = {ip.split(',')[0] for ip in dct['FDNS_A'] if
25 | re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip.split(',')[0])}
26 |
27 | async def get_hostnames(self) -> set:
28 | return self.totalhosts
29 |
30 | async def get_ips(self) -> set:
31 | return self.totalips
32 |
33 | async def process(self, proxy=False):
34 | self.proxy = proxy
35 | await self.do_search()
36 |
--------------------------------------------------------------------------------
/theHarvester/discovery/censysearch.py:
--------------------------------------------------------------------------------
1 | from theHarvester.discovery.constants import MissingKey
2 | from theHarvester.lib.core import Core
3 | from censys.search import CensysCertificates
4 | from censys.common import __version__
5 | from censys.common.exceptions import (
6 | CensysRateLimitExceededException,
7 | CensysUnauthorizedException,
8 | )
9 |
10 |
11 | class SearchCensys:
12 | def __init__(self, domain, limit=500):
13 | self.word = domain
14 | self.key = Core.censys_key()
15 | if self.key[0] is None or self.key[1] is None:
16 | raise MissingKey("Censys ID and/or Secret")
17 | self.totalhosts = set()
18 | self.emails = set()
19 | self.limit = limit
20 | self.proxy = False
21 |
22 | async def do_search(self):
23 | try:
24 | cert_search = CensysCertificates(
25 | api_id=self.key[0],
26 | api_secret=self.key[1],
27 | user_agent=f"censys/{__version__} (theHarvester/{Core.version()}; +https://github.com/laramies/theHarvester)",
28 | )
29 | except CensysUnauthorizedException:
30 | raise MissingKey('Censys ID and/or Secret')
31 |
32 | query = f"parsed.names: {self.word}"
33 | try:
34 | response = cert_search.search(
35 | query=query,
36 | fields=["parsed.names", "metadata", "parsed.subject.email_address"],
37 | max_records=self.limit,
38 | )
39 | for cert in response:
40 | self.totalhosts.update(cert.get("parsed.names", []))
41 | self.emails.update(cert.get("parsed.subject.email_address", []))
42 | except CensysRateLimitExceededException:
43 | print("Censys rate limit exceeded")
44 |
45 | async def get_hostnames(self) -> set:
46 | return self.totalhosts
47 |
48 | async def get_emails(self) -> set:
49 | return self.emails
50 |
51 | async def process(self, proxy=False):
52 | self.proxy = proxy
53 | await self.do_search()
54 |
--------------------------------------------------------------------------------
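Unlike the keyless modules, this one needs Censys credentials from api-keys.yaml and also collects e-mail addresses from certificates. A rough usage sketch (the domain and limit are placeholders):

import asyncio
from theHarvester.discovery import censysearch

async def demo() -> None:
    # raises MissingKey unless a Censys API ID and secret are configured
    search = censysearch.SearchCensys('example.com', limit=100)
    await search.process()
    print(await search.get_hostnames())
    print(await search.get_emails())

asyncio.run(demo())

--------------------------------------------------------------------------------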
/theHarvester/discovery/certspottersearch.py:
--------------------------------------------------------------------------------
1 | from theHarvester.lib.core import *
2 |
3 |
4 | class SearchCertspoter:
5 |
6 | def __init__(self, word):
7 | self.word = word
8 | self.totalhosts = set()
9 | self.proxy = False
10 |
11 | async def do_search(self) -> None:
12 | base_url = f'https://api.certspotter.com/v1/issuances?domain={self.word}&expand=dns_names'
13 | try:
14 | response = await AsyncFetcher.fetch_all([base_url], json=True, proxy=self.proxy)
15 | response = response[0]
16 | if isinstance(response, list):
17 | for dct in response:
18 | for key, value in dct.items():
19 | if key == 'dns_names':
20 | self.totalhosts.update({name for name in value if name})
21 | elif isinstance(response, dict):
22 | self.totalhosts.update({response['dns_names'] if 'dns_names' in response.keys() else ''}) # type: ignore
23 | else:
24 | self.totalhosts.update({''})
25 | except Exception as e:
26 | print(e)
27 |
28 | async def get_hostnames(self) -> set:
29 | return self.totalhosts
30 |
31 | async def process(self, proxy=False):
32 | self.proxy = proxy
33 | await self.do_search()
34 | print('\tSearching results.')
35 |
--------------------------------------------------------------------------------
/theHarvester/discovery/constants.py:
--------------------------------------------------------------------------------
1 | from theHarvester.lib.core import *
2 | from typing import Union, Optional
3 | import random
4 |
5 | googleUA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 ' \
6 | 'Safari/537.36 '
7 |
8 |
9 | async def splitter(links):
10 | """
11 |     Method that tries to remove duplicate links.
12 |     LinkedIn result lists pull a lot of profiles with the same name,
13 |     so this trims the input down to a unique-ish list.
14 | :param links: list of links to remove duplicates from
15 | :return: unique-ish list
16 | """
17 | unique_list = []
18 | name_check = []
19 | for url in links:
20 | tail = url.split("/")[-1]
21 | if len(tail) == 2 or tail == "zh-cn":
22 | tail = url.split("/")[-2]
23 | name = tail.split("-")
24 | if len(name) > 1:
25 | joined_name = name[0] + name[1]
26 | else:
27 | joined_name = name[0]
28 | if joined_name not in name_check:
29 | unique_list.append(url)
30 | name_check.append(joined_name)
31 | return unique_list
32 |
33 |
34 | def filter(lst):
35 | """
36 | Method that filters list
37 | :param lst: list to be filtered
38 | :return: new filtered list
39 | """
40 | if lst is None:
41 | return []
42 | if not isinstance(lst, set):
43 | lst = set(lst) # Remove duplicates.
44 | new_lst = []
45 | for item in lst:
46 | item = str(item)
47 | if (item[0].isalpha() or item[0].isdigit()) and ('xxx' not in item) and ('..' not in item):
48 | item = item.replace('252f', '').replace('2F', '').replace('2f', '')
49 | new_lst.append(item.lower())
50 | return new_lst
51 |
52 |
53 | def get_delay() -> float:
54 | """Method that is used to generate a random delay"""
55 | return random.randint(1, 3) - .5
56 |
57 |
58 | async def search(text: str) -> bool:
59 | """Helper function to check if Google has blocked traffic.
60 | :param text: See if certain text is returned which means Google is blocking us
61 | :return bool:
62 | """
63 | for line in text.strip().splitlines():
64 | if 'This page appears when Google automatically detects requests coming from your computer network' in line \
65 | or 'http://www.google.com/sorry/index' in line or 'https://www.google.com/sorry/index' in line:
66 | # print('\tGoogle is blocking your IP due to too many automated requests, wait or change your IP')
67 | return True
68 | return False
69 |
70 |
71 | async def google_workaround(visit_url: str) -> Union[bool, str]:
72 | """
73 | Function that makes a request on our behalf, if Google starts to block us
74 | :param visit_url: Url to scrape
75 | :return: Correct html that can be parsed by BS4
76 | """
77 | url = 'https://websniffer.cc/'
78 | data = {
79 | 'Cookie': '',
80 | 'url': visit_url,
81 | 'submit': 'Submit',
82 | 'type': 'GET&http=1.1',
83 | 'uak': str(random.randint(4, 8)) # select random UA to send to Google
84 | }
85 | returned_html = await AsyncFetcher.post_fetch(url, headers={'User-Agent': Core.get_user_agent()}, data=data)
86 | returned_html = "This page appears when Google automatically detects requests coming from your computer network" \
87 | if returned_html == "" else returned_html[0]
88 |
89 | returned_html = "" if 'Please Wait... | Cloudflare' in returned_html else returned_html
90 |
91 |     if len(returned_html) == 0 or await search(returned_html) or '&lt;html' not in returned_html:
92 |         # indicates that Google is serving a captcha to the workaround service
93 |         # that means we will try our second option, which utilizes proxies
94 |         return True
95 |     # the html we get is malformed for BS4 as there are no greater than or less than signs (they come back as HTML entities)
96 |     if '&lt;html&gt;' in returned_html:
97 |         start_index = returned_html.index('&lt;html&gt;')
98 |     else:
99 |         start_index = returned_html.index('&lt;html')
100 |
101 |     end_index = returned_html.index('&lt;/html&gt;') + 1
102 |     correct_html = returned_html[start_index:end_index]
103 |     # Slice the string to get the response's html
104 |     correct_html = ''.join([ch.strip().replace('&lt;', '<').replace('&gt;', '>') for ch in correct_html])
105 | return correct_html
106 |
107 |
108 | class MissingKey(Exception):
109 | """
110 | :raise: When there is a module that has not been provided its API key
111 | """
112 | def __init__(self, source: Optional[str]):
113 | if source:
114 | self.message = f'\n\033[93m[!] Missing API key for {source}. \033[0m'
115 | else:
116 | self.message = '\n\033[93m[!] Missing CSE id. \033[0m'
117 |
118 | def __str__(self) -> str:
119 | return self.message
120 |
--------------------------------------------------------------------------------
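These helpers are mostly consumed by the discovery modules, but they can be exercised on their own; a small sketch with made-up inputs:

from theHarvester.discovery import constants

# keeps entries that start with a letter or digit and lower-cases them (order may vary)
print(constants.filter(['A.example.com', '..broken', 'b.example.com']))

# random pacing delay of 0.5, 1.5 or 2.5 seconds used between requests
print(constants.get_delay())

# modules raise MissingKey when api-keys.yaml holds no key for them ('shodan' is just an example name)
try:
    raise constants.MissingKey('shodan')
except constants.MissingKey as err:
    print(err)

--------------------------------------------------------------------------------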
/theHarvester/discovery/crtsh.py:
--------------------------------------------------------------------------------
1 | from theHarvester.lib.core import *
2 | from typing import List, Set
3 |
4 |
5 | class SearchCrtsh:
6 |
7 | def __init__(self, word):
8 | self.word = word
9 | self.data = set()
10 | self.proxy = False
11 |
12 | async def do_search(self) -> List:
13 | data: set = set()
14 | try:
15 | url = f'https://crt.sh/?q=%25.{self.word}&output=json'
16 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy)
17 | response = response[0]
18 | data = set(
19 | [dct['name_value'][2:] if '*.' == dct['name_value'][:2] else dct['name_value']
20 | for dct in response])
21 | data = {domain for domain in data if (domain[0] != '*' and str(domain[0:4]).isnumeric() is False)}
22 | except Exception as e:
23 | print(e)
24 | clean = []
25 | for x in data:
26 | pre = x.split()
27 | for y in pre:
28 | clean.append(y)
29 | return clean
30 |
31 | async def process(self, proxy=False) -> None:
32 | self.proxy = proxy
33 | data = await self.do_search()
34 | self.data = data
35 |
36 | async def get_hostnames(self) -> Set:
37 | return self.data
38 |
--------------------------------------------------------------------------------
/theHarvester/discovery/dnsdumpster.py:
--------------------------------------------------------------------------------
1 | from theHarvester.lib.core import *
2 | from theHarvester.parsers import myparser
3 | import aiohttp
4 | import asyncio
5 |
6 |
7 | class SearchDnsDumpster:
8 |
9 | def __init__(self, word):
10 | self.word = word.replace(' ', '%20')
11 | self.results = ""
12 | self.totalresults = ""
13 | self.server = 'dnsdumpster.com'
14 | self.proxy = False
15 |
16 | async def do_search(self):
17 | try:
18 | agent = Core.get_user_agent()
19 | headers = {'User-Agent': agent}
20 | session = aiohttp.ClientSession(headers=headers)
21 | # create a session to properly verify
22 | url = f'https://{self.server}'
23 | csrftoken = ''
24 | if self.proxy is False:
25 | async with session.get(url, headers=headers) as resp:
26 | cookies = str(resp.cookies)
27 | cookies = cookies.split('csrftoken=')
28 | csrftoken += cookies[1][:cookies[1].find(';')]
29 | else:
30 | async with session.get(url, headers=headers, proxy=self.proxy) as resp:
31 | cookies = str(resp.cookies)
32 | cookies = cookies.split('csrftoken=')
33 | csrftoken += cookies[1][:cookies[1].find(';')]
34 | await asyncio.sleep(2)
35 |
36 | # extract csrftoken from cookies
37 | data = {
38 |                 'Cookie': f'csrftoken={csrftoken}', 'csrfmiddlewaretoken': csrftoken,
39 | 'targetip': self.word, 'user': 'free'}
40 | headers['Referer'] = url
41 | if self.proxy is False:
42 | async with session.post(url, headers=headers, data=data) as resp:
43 | self.results = await resp.text()
44 | else:
45 | async with session.post(url, headers=headers, data=data, proxy=self.proxy) as resp:
46 | self.results = await resp.text()
47 | await session.close()
48 | except Exception as e:
49 | print(f'An exception occurred: {e}')
50 | self.totalresults += self.results
51 |
52 | async def get_hostnames(self):
53 | rawres = myparser.Parser(self.totalresults, self.word)
54 | return await rawres.hostnames()
55 |
56 | async def process(self, proxy=False):
57 | self.proxy = proxy
58 | await self.do_search() # Only need to do it once.
59 |
--------------------------------------------------------------------------------
/theHarvester/discovery/dnssearch.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | ============
5 | DNS Browsing
6 | ============
7 |
8 | Explore the space around known hosts & ips for extra catches.
9 | """
10 |
11 | import re
12 | import sys
13 |
14 | from aiodns import DNSResolver
15 | from ipaddress import IPv4Network
16 | from typing import Callable, List, Optional
17 | from theHarvester.lib import hostchecker
18 |
19 |
20 | #####################################################################
21 | # DNS FORCE
22 | #####################################################################
23 |
24 |
25 | class DnsForce:
26 |
27 | def __init__(self, domain, dnsserver, verbose=False):
28 | self.domain = domain
29 | self.subdo = False
30 | self.verbose = verbose
31 | # self.dnsserver = [dnsserver] if isinstance(dnsserver, str) else dnsserver
32 | self.dnsserver = list(map(str, dnsserver.split(','))) if isinstance(dnsserver, str) else dnsserver
33 | try:
34 | with open('/etc/theHarvester/wordlists/dns-names.txt', 'r') as file:
35 | self.list = file.readlines()
36 | except FileNotFoundError:
37 | try:
38 | with open('/usr/local/etc/theHarvester/wordlists/dns-names.txt', 'r') as file:
39 | self.list = file.readlines()
40 | except FileNotFoundError:
41 | with open('wordlists/dns-names.txt', 'r') as file:
42 | self.list = file.readlines()
43 | self.domain = domain.replace('www.', '')
44 | self.list = [f'{word.strip()}.{self.domain}' for word in self.list]
45 |
46 | async def run(self):
47 | print(f'Starting DNS brute forcing with {len(self.list)} words')
48 | checker = hostchecker.Checker(
49 | self.list) if self.dnsserver == [] or self.dnsserver == "" or self.dnsserver is None \
50 | else hostchecker.Checker(self.list, nameserver=self.dnsserver)
51 | hosts, ips = await checker.check()
52 | return hosts, ips
53 |
54 |
55 | #####################################################################
56 | # DNS REVERSE
57 | #####################################################################
58 |
59 |
60 | IP_REGEX = r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
61 | PORT_REGEX = r'\d{1,5}'
62 | NETMASK_REGEX = r'\d{1,2}|' + IP_REGEX
63 | NETWORK_REGEX = r'\b({})(?:\:({}))?(?:\/({}))?\b'.format(
64 | IP_REGEX,
65 | PORT_REGEX,
66 | NETMASK_REGEX)
67 |
68 |
69 | def serialize_ip_range(ip: str, netmask: str = '24') -> str:
70 | """
71 | Serialize a network range in a constant format, 'x.x.x.x/y'.
72 |
73 | Parameters
74 | ----------
75 | ip: str.
76 | A serialized ip in the format 'x.x.x.x'.
77 | Extra information like port (':z') or subnet ('/n')
78 | will be ignored.
79 | netmask: str.
80 | The subnet subdivision, represented by a 2 digit netmask.
81 |
82 | Returns
83 | -------
84 | out: str.
85 | The network OSI address, like '192.168.0.0/24'.
86 | """
87 | __ip_matches = re.search(NETWORK_REGEX, ip, re.IGNORECASE)
88 | if __ip_matches and __ip_matches.groups():
89 | __ip = __ip_matches.group(1)
90 | __netmask = netmask if netmask else __ip_matches.group(3)
91 | if __ip and __netmask:
92 | return str(IPv4Network('{}/{}'.format(__ip, __netmask), strict=False))
93 | elif __ip:
94 | return str(IPv4Network('{}/{}'.format(__ip, '24'), strict=False))
95 |
96 | # invalid input ip
97 | return ''
98 |
99 |
100 | def list_ips_in_network_range(iprange: str) -> List[str]:
101 | """
102 | List all the IPs in the range.
103 |
104 | Parameters
105 | ----------
106 | iprange: str.
107 | A serialized ip range, like '1.2.3.0/24'.
108 | The last digit can be set to anything, it will be ignored.
109 |
110 | Returns
111 | -------
112 | out: list.
113 | The list of IPs in the range.
114 | """
115 | try:
116 | __network = IPv4Network(iprange, strict=False)
117 | return [__address.exploded for __address in __network.hosts()]
118 | except Exception:
119 | return []
120 |
121 |
122 | async def reverse_single_ip(ip: str, resolver: DNSResolver) -> str:
123 | """
124 | Reverse a single IP and output the linked CNAME, if it exists.
125 | Parameters
126 | ----------
127 | :param ip: IP address to reverse
128 | :param resolver: DNS server to use
129 |
130 | Returns
131 | -------
132 | :return str: with the corresponding CNAME or None
133 | """
134 | try:
135 | __host = await resolver.gethostbyaddr(ip)
136 | return __host.name if __host else ''
137 | except Exception:
138 | return ''
139 |
140 |
141 | async def reverse_all_ips_in_range(iprange: str, callback: Callable, nameservers: Optional[List[str]] = None) -> None:
142 | """
143 | Reverse all the IPs stored in a network range.
144 | All the queries are made concurrently.
145 |
146 | Parameters
147 | ----------
148 | iprange: str.
149 | An IPv4 range formatted as 'x.x.x.x/y'.
150 | The last 2 digits of the ip can be set to anything,
151 | they will be ignored.
152 | callback: Callable.
153 | Arbitrary postprocessing function.
154 | nameservers: List[str].
155 | Optional list of DNS servers.
156 |
157 | Returns
158 | -------
159 | out: None.
160 | """
161 | __resolver = DNSResolver(timeout=4, nameservers=nameservers)
162 | for __ip in list_ips_in_network_range(iprange):
163 | log_query(__ip)
164 | __host = await reverse_single_ip(ip=__ip, resolver=__resolver)
165 | callback(__host)
166 | log_result(__host)
167 |
168 |
169 | #####################################################################
170 | # IO
171 | #####################################################################
172 |
173 |
174 | def log_query(ip: str) -> None:
175 | """
176 | Display the current query in the console.
177 |
178 | Parameters
179 | ----------
180 | ip: str.
181 | Queried ip.
182 |
183 |     Returns
184 | -------
185 | out: None.
186 | """
187 | sys.stdout.write(chr(27) + '[2K' + chr(27) + '[G')
188 | sys.stdout.write('\r' + ip + ' - ')
189 | sys.stdout.flush()
190 |
191 |
192 | def log_result(host: str) -> None:
193 | """
194 | Display the query result in the console.
195 |
196 | Parameters
197 | ----------
198 | host: str.
199 | Host name returned by the DNS query.
200 |
201 |     Returns
202 | -------
203 | out: None.
204 | """
205 | if host:
206 | print(host)
207 |
208 |
209 | def generate_postprocessing_callback(target: str, **allhosts: List[str]) -> Callable:
210 | """
211 | Postprocess the query results asynchronously too, instead of waiting for
212 | the querying stage to be completely finished.
213 |
214 | Parameters
215 | ----------
216 | target: str.
217 | The domain wanted as TLD.
218 | allhosts: List.
219 | A collection of all the subdomains -of target- found so far.
220 |
221 | Returns
222 | -------
223 | out: Callable.
224 | A function that will update the collection of target subdomains
225 | when the query result is satisfying.
226 | """
227 |
228 | def append_matching_hosts(host: str) -> None:
229 | if host and target in host:
230 | for __name, __hosts in allhosts.items():
231 | if host not in __hosts:
232 | __hosts.append(host)
233 |
234 | return append_matching_hosts
235 |
--------------------------------------------------------------------------------
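The docstrings above describe the expected formats; a worked sketch of the reverse-DNS helpers (the IP range and nameserver are placeholders, and running it issues real PTR queries for the whole /24):

import asyncio
from theHarvester.discovery import dnssearch

network = dnssearch.serialize_ip_range('192.168.0.12', netmask='24')   # -> '192.168.0.0/24'
print(network, len(dnssearch.list_ips_in_network_range(network)))      # 254 usable host addresses

found: list = []
callback = dnssearch.generate_postprocessing_callback('example.com', hosts=found)
asyncio.run(dnssearch.reverse_all_ips_in_range(network, callback, nameservers=['8.8.8.8']))
print(found)  # hostnames whose PTR record contains 'example.com'

--------------------------------------------------------------------------------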
/theHarvester/discovery/duckduckgosearch.py:
--------------------------------------------------------------------------------
1 | from theHarvester.discovery.constants import *
2 | from theHarvester.lib.core import *
3 | from theHarvester.parsers import myparser
4 | import json
5 |
6 |
7 | class SearchDuckDuckGo:
8 |
9 | def __init__(self, word, limit):
10 | self.word = word
11 | self.results = ""
12 | self.totalresults = ""
13 | self.dorks = []
14 | self.links = []
15 | self.database = 'https://duckduckgo.com/?q='
16 | self.api = 'https://api.duckduckgo.com/?q=x&format=json&pretty=1' # Currently using API.
17 | self.quantity = '100'
18 | self.limit = limit
19 | self.proxy = False
20 |
21 | async def do_search(self):
22 | # Do normal scraping.
23 | url = self.api.replace('x', self.word)
24 | headers = {'User-Agent': googleUA}
25 | first_resp = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy)
26 | self.results = first_resp[0]
27 | self.totalresults += self.results
28 | urls = await self.crawl(self.results)
29 | urls = {url for url in urls if len(url) > 5}
30 | all_resps = await AsyncFetcher.fetch_all(urls)
31 | self.totalresults += ''.join(all_resps)
32 |
33 | async def crawl(self, text):
34 | """
35 | Function parses json and returns URLs.
36 | :param text: formatted json
37 | :return: set of URLs
38 | """
39 | urls = set()
40 | try:
41 | load = json.loads(text)
42 | for keys in load.keys(): # Iterate through keys of dict.
43 | val = load.get(keys)
44 | if isinstance(val, int) or isinstance(val, dict) or val is None:
45 | continue
46 | if isinstance(val, list):
47 | if len(val) == 0: # Make sure not indexing an empty list.
48 | continue
49 | val = val[0] # First value should be dict.
50 | if isinstance(val, dict): # Sanity check.
51 | for key in val.keys():
52 | value = val.get(key)
53 | if isinstance(value, str) and value != '' and 'https://' in value or 'http://' in value:
54 | urls.add(value)
55 | if isinstance(val, str) and val != '' and 'https://' in val or 'http://' in val:
56 | urls.add(val)
57 | tmp = set()
58 | for url in urls:
59 |             if '<' in url and 'href=' in url:  # Format is <a href="...">
60 | equal_index = url.index('=')
61 | true_url = ''
62 | for ch in url[equal_index + 1:]:
63 | if ch == '"':
64 | tmp.add(true_url)
65 | break
66 | true_url += ch
67 | else:
68 | if url != '':
69 | tmp.add(url)
70 | return tmp
71 | except Exception as e:
72 | print(f'Exception occurred: {e}')
73 | return []
74 |
75 | async def get_emails(self):
76 | rawres = myparser.Parser(self.totalresults, self.word)
77 | return await rawres.emails()
78 |
79 | async def get_hostnames(self):
80 | rawres = myparser.Parser(self.totalresults, self.word)
81 | return await rawres.hostnames()
82 |
83 | async def process(self, proxy=False):
84 | self.proxy = proxy
85 | await self.do_search() # Only need to search once since using API.
86 |
--------------------------------------------------------------------------------
/theHarvester/discovery/fullhuntsearch.py:
--------------------------------------------------------------------------------
1 | from theHarvester.discovery.constants import *
2 | from theHarvester.lib.core import *
3 |
4 |
5 | class SearchFullHunt:
6 |
7 | def __init__(self, word):
8 | self.word = word
9 | self.key = Core.fullhunt_key()
10 | if self.key is None:
11 | raise MissingKey('fullhunt')
12 | self.total_results = None
13 | self.proxy = False
14 |
15 | async def do_search(self):
16 | url = f'https://fullhunt.io/api/v1/domain/{self.word}/subdomains'
17 | response = await AsyncFetcher.fetch_all([url], json=True, headers={'User-Agent': Core.get_user_agent(),
18 | 'X-API-KEY': self.key},
19 | proxy=self.proxy)
20 | self.total_results = response[0]['hosts']
21 |
22 | async def get_hostnames(self) -> set:
23 | return self.total_results
24 |
25 | async def process(self, proxy=False):
26 | self.proxy = proxy
27 | await self.do_search()
28 |
--------------------------------------------------------------------------------
/theHarvester/discovery/githubcode.py:
--------------------------------------------------------------------------------
1 | from theHarvester.discovery.constants import *
2 | from theHarvester.lib.core import *
3 | from theHarvester.parsers import myparser
4 | from typing import List, Dict, Any, Optional, NamedTuple, Tuple
5 | import asyncio
6 | import aiohttp
7 | import urllib.parse as urlparse
8 | import random
9 |
10 |
11 | class RetryResult(NamedTuple):
12 | time: float
13 |
14 |
15 | class SuccessResult(NamedTuple):
16 | fragments: List[str]
17 | next_page: Optional[int]
18 | last_page: Optional[int]
19 |
20 |
21 | class ErrorResult(NamedTuple):
22 | status_code: int
23 | body: Any
24 |
25 |
26 | class SearchGithubCode:
27 |
28 | def __init__(self, word, limit):
29 | self.word = word
30 | self.total_results = ""
31 | self.server = 'api.github.com'
32 | self.limit = limit
33 | self.counter = 0
34 | self.page = 1
35 | self.key = Core.github_key()
36 | # If you don't have a personal access token, github narrows your search capabilities significantly
37 | # rate limits you more severely
38 | # https://developer.github.com/v3/search/#rate-limit
39 | if self.key is None:
40 | raise MissingKey('Github')
41 | self.proxy = False
42 |
43 | @staticmethod
44 | async def fragments_from_response(json_data: dict) -> List[str]:
45 | items: List[Dict[str, Any]] = json_data.get('items') or list()
46 | fragments: List[str] = list()
47 | for item in items:
48 | matches = item.get("text_matches") or list()
49 | for match in matches:
50 | fragments.append(match.get("fragment"))
51 |
52 | return [fragment for fragment in fragments if fragment is not None]
53 |
54 | @staticmethod
55 | async def page_from_response(page: str, links) -> Optional[Any]:
56 | page_link = links.get(page)
57 | if page_link:
58 | parsed = urlparse.urlparse(str(page_link.get("url")))
59 | params = urlparse.parse_qs(parsed.query)
60 | pages: List[Any] = params.get('page', [None])
61 | page_number = pages[0] and int(pages[0])
62 | return page_number
63 | else:
64 | return None
65 |
66 | async def handle_response(self, response: Tuple[str, dict, int, Any]):
67 | text, json_data, status, links = response
68 | if status == 200:
69 | results = await self.fragments_from_response(json_data)
70 | next_page = await self.page_from_response("next", links)
71 | last_page = await self.page_from_response("last", links)
72 | return SuccessResult(results, next_page, last_page)
73 | elif status == 429 or status == 403:
74 | return RetryResult(60)
75 | else:
76 | try:
77 | return ErrorResult(status, json_data)
78 | except ValueError:
79 | return ErrorResult(status, text)
80 |
81 | async def do_search(self, page: Optional[int]) -> Tuple[str, dict, int, Any]:
82 | if page is None:
83 | url = f'https://{self.server}/search/code?q="{self.word}"'
84 | else:
85 | url = f'https://{self.server}/search/code?q="{self.word}"&page={page}'
86 | headers = {
87 | 'Host': self.server,
88 | 'User-agent': Core.get_user_agent(),
89 | 'Accept': "application/vnd.github.v3.text-match+json",
90 | 'Authorization': f'token {self.key}'
91 | }
92 |
93 | async with aiohttp.ClientSession(headers=headers) as sess:
94 | if self.proxy:
95 | async with sess.get(url, proxy=random.choice(Core.proxy_list())) as resp:
96 | return await resp.text(), await resp.json(), resp.status, resp.links
97 | else:
98 | async with sess.get(url) as resp:
99 | return await resp.text(), await resp.json(), resp.status, resp.links
100 |
101 | @staticmethod
102 | async def next_page_or_end(result: SuccessResult) -> Optional[int]:
103 | if result.next_page is not None:
104 | return result.next_page
105 | else:
106 | return result.last_page
107 |
108 | async def process(self, proxy=False):
109 | self.proxy = proxy
110 | try:
111 | while self.counter <= self.limit and self.page is not None:
112 | api_response = await self.do_search(self.page)
113 | result = await self.handle_response(api_response)
114 | if isinstance(result, SuccessResult):
115 | print(f'\tSearching {self.counter} results.')
116 | for fragment in result.fragments:
117 | self.total_results += fragment
118 | self.counter = self.counter + 1
119 | self.page = await self.next_page_or_end(result)
120 | await asyncio.sleep(get_delay())
121 | elif isinstance(result, RetryResult):
122 | sleepy_time = get_delay() + result.time
123 | print(f'\tRetrying page in {sleepy_time} seconds...')
124 | await asyncio.sleep(sleepy_time)
125 | elif isinstance(result, ErrorResult):
126 | raise Exception(f"\tException occurred: status_code: {result.status_code} reason: {result.body}")
127 | else:
128 | raise Exception("\tUnknown exception occurred")
129 | except Exception as e:
130 | print(f'An exception has occurred: {e}')
131 |
132 | async def get_emails(self):
133 | rawres = myparser.Parser(self.total_results, self.word)
134 | return await rawres.emails()
135 |
136 | async def get_hostnames(self):
137 | rawres = myparser.Parser(self.total_results, self.word)
138 | return await rawres.hostnames()
139 |
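# A minimal usage sketch (an assumption for illustration: a GitHub personal access token is
# configured so that Core.github_key() returns it). process() pages through code-search
# results, and the getters parse emails/hostnames out of the accumulated text fragments:
#
#   import asyncio
#   from theHarvester.discovery.githubcode import SearchGithubCode
#
#   async def demo():
#       engine = SearchGithubCode(word='example.com', limit=100)
#       await engine.process()
#       print(await engine.get_emails())
#       print(await engine.get_hostnames())
#
#   asyncio.run(demo())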
--------------------------------------------------------------------------------
/theHarvester/discovery/hackertarget.py:
--------------------------------------------------------------------------------
1 | from theHarvester.lib.core import *
2 |
3 |
4 | class SearchHackerTarget:
5 | """
6 | Class uses the HackerTarget api to gather subdomains and ips
7 | """
8 |
9 | def __init__(self, word):
10 | self.word = word
11 | self.total_results = ""
12 | self.hostname = 'https://api.hackertarget.com'
13 | self.proxy = False
14 | self.results = None
15 |
16 | async def do_search(self):
17 | headers = {'User-agent': Core.get_user_agent()}
18 | urls = [f'{self.hostname}/hostsearch/?q={self.word}', f'{self.hostname}/reversedns/?q={self.word}']
19 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy)
20 | for response in responses:
21 | self.total_results += response.replace(",", ":")
22 |
23 | async def process(self, proxy=False):
24 | self.proxy = proxy
25 | await self.do_search()
26 |
27 | async def get_hostnames(self) -> list:
28 | return [result for result in self.total_results.splitlines() if 'No PTR records found' not in result]
29 |
--------------------------------------------------------------------------------
/theHarvester/discovery/huntersearch.py:
--------------------------------------------------------------------------------
1 | from theHarvester.discovery.constants import *
2 | from theHarvester.lib.core import *
3 |
4 |
5 | class SearchHunter:
6 |
7 | def __init__(self, word, limit, start):
8 | self.word = word
9 |         # Cap the limit at 10 to match the limit=10 used in the domain-search URL below
10 |         self.limit = 10 if limit > 10 else limit
11 | self.start = start
12 | self.key = Core.hunter_key()
13 | if self.key is None:
14 | raise MissingKey('Hunter')
15 | self.total_results = ""
16 | self.counter = start
17 | self.database = f'https://api.hunter.io/v2/domain-search?domain={self.word}&api_key={self.key}&limit=10'
18 | self.proxy = False
19 | self.hostnames = []
20 | self.emails = []
21 |
22 | async def do_search(self):
23 |         # First determine whether the user account is a free account; this call itself is free
24 | is_free = True
25 | headers = {'User-Agent': Core.get_user_agent()}
26 | acc_info_url = f'https://api.hunter.io/v2/account?api_key={self.key}'
27 | response = await AsyncFetcher.fetch_all([acc_info_url], headers=headers, json=True)
28 | is_free = is_free if 'plan_name' in response[0]['data'].keys() and response[0]['data']['plan_name'].lower() \
29 | == 'free' else False
30 | # Extract total number of requests that are available for account
31 |
32 | total_requests_avail = response[0]['data']['requests']['searches']['available'] - response[0]['data']['requests']['searches']['used']
33 | if is_free:
34 | response = await AsyncFetcher.fetch_all([self.database], headers=headers, proxy=self.proxy, json=True)
35 | self.emails, self.hostnames = await self.parse_resp(json_resp=response[0])
36 | else:
37 | # Determine total number of emails that are available
38 | # As the most emails you can get within one query is 100
39 | # This is only done where paid accounts are in play
40 | hunter_dinfo_url = f'https://api.hunter.io/v2/email-count?domain={self.word}'
41 | response = await AsyncFetcher.fetch_all([hunter_dinfo_url], headers=headers, proxy=self.proxy, json=True)
42 | total_number_reqs = response[0]['data']['total'] // 100
43 | # Parse out meta field within initial JSON response to determine total number of results
44 | if total_requests_avail < total_number_reqs:
45 | print('WARNING: account does not have enough requests to gather all emails')
46 | print(f'Total requests available: {total_requests_avail}, total requests '
47 | f'needed to be made: {total_number_reqs}')
48 | print('RETURNING current results, if you would still like to '
49 |                       'run this module, comment out this if statement')
50 | return
51 | self.limit = 100
52 | # max number of emails you can get per request is 100
53 | # increments of 100 with offset determining where to start
54 | # See docs for more details: https://hunter.io/api-documentation/v2#domain-search
55 | for offset in range(0, 100 * total_number_reqs, 100):
56 |             req_url = f'https://api.hunter.io/v2/domain-search?domain={self.word}&api_key={self.key}&limit={self.limit}&offset={offset}'
57 | response = await AsyncFetcher.fetch_all([req_url], headers=headers, proxy=self.proxy, json=True)
58 | temp_emails, temp_hostnames = await self.parse_resp(response[0])
59 | self.emails.extend(temp_emails)
60 | self.hostnames.extend(temp_hostnames)
61 | await asyncio.sleep(1)
62 |
63 | async def parse_resp(self, json_resp):
64 | emails = list(sorted({email['value'] for email in json_resp['data']['emails']}))
65 | domains = list(sorted({source['domain'] for email in json_resp['data']['emails'] for source in email['sources']
66 | if self.word in source['domain']}))
67 | return emails, domains
68 |
69 | async def process(self, proxy=False):
70 | self.proxy = proxy
71 | await self.do_search() # Only need to do it once.
72 |
73 | async def get_emails(self):
74 | return self.emails
75 |
76 | async def get_hostnames(self):
77 | return self.hostnames
78 |
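# Worked example of the paid-account pagination above (the numbers are illustrative):
# if the email-count endpoint reports 230 emails, then total_number_reqs = 230 // 100 = 2,
# so the loop issues requests with limit=100 at offsets 0 and 100:
#
#   for offset in range(0, 100 * 2, 100):   # offset = 0, then 100
#       ...  # .../v2/domain-search?domain=...&limit=100&offset={offset}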
--------------------------------------------------------------------------------
/theHarvester/discovery/intelxsearch.py:
--------------------------------------------------------------------------------
1 | from theHarvester.discovery.constants import *
2 | from theHarvester.lib.core import *
3 | from theHarvester.parsers import intelxparser
4 | import asyncio
5 | import json
6 | import requests
7 |
8 |
9 | class SearchIntelx:
10 |
11 | def __init__(self, word):
12 | self.word = word
13 | self.key = Core.intelx_key()
14 | if self.key is None:
15 | raise MissingKey('Intelx')
16 | self.database = 'https://2.intelx.io'
17 | self.results = None
18 | self.info = ()
19 | self.limit = 10000
20 | self.proxy = False
21 | self.offset = -1
22 |
23 | async def do_search(self):
24 | try:
25 | # Based on: https://github.com/IntelligenceX/SDK/blob/master/Python/intelxapi.py
26 | # API requests self identification
27 | # https://intelx.io/integrations
28 | headers = {'x-key': self.key, 'User-Agent': f'{Core.get_user_agent()}-theHarvester'}
29 | data = {
30 | "term": self.word,
31 | "buckets": [],
32 | "lookuplevel": 0,
33 | "maxresults": self.limit,
34 | "timeout": 5,
35 | "datefrom": "",
36 | "dateto": "",
37 | "sort": 2,
38 | "media": 0,
39 | "terminate": [],
40 | "target": 0
41 | }
42 |
43 | total_resp = requests.post(f'{self.database}/phonebook/search', headers=headers, json=data)
44 | phonebook_id = json.loads(total_resp.text)['id']
45 | await asyncio.sleep(2)
46 |
47 | # Fetch results from phonebook based on ID
48 | resp = await AsyncFetcher.fetch_all(
49 | [f'{self.database}/phonebook/search/result?id={phonebook_id}&limit={self.limit}&offset={self.offset}'],
50 | headers=headers, json=True, proxy=self.proxy)
51 | resp = resp[0]
52 | self.results = resp
53 | except Exception as e:
54 | print(f'An exception has occurred in Intelx: {e}')
55 |
56 | async def process(self, proxy=False):
57 | self.proxy = proxy
58 | await self.do_search()
59 | intelx_parser = intelxparser.Parser()
60 | self.info = await intelx_parser.parse_dictionaries(self.results)
61 |
62 | async def get_emails(self):
63 | return self.info[0]
64 |
65 | async def get_interestingurls(self):
66 | return self.info[1]
67 |
--------------------------------------------------------------------------------
/theHarvester/discovery/omnisint.py:
--------------------------------------------------------------------------------
1 | from theHarvester.lib.core import *
2 |
3 |
4 | class SearchOmnisint:
5 | def __init__(self, word):
6 | self.word = word
7 | self.totalhosts = set()
8 | self.totalips = set()
9 | self.proxy = False
10 |
11 | async def do_search(self):
12 | base_url = f'https://sonar.omnisint.io/all/{self.word}?page=1'
13 | responses = await AsyncFetcher.fetch_all([base_url], json=True, headers={'User-Agent': Core.get_user_agent()},
14 | proxy=self.proxy)
15 | self.totalhosts = list({host for host in responses[0]})
16 |
17 | async def get_hostnames(self) -> set:
18 | return self.totalhosts
19 |
20 | async def get_ips(self) -> set:
21 | return self.totalips
22 |
23 | async def process(self, proxy=False):
24 | self.proxy = proxy
25 | await self.do_search()
26 |
--------------------------------------------------------------------------------
/theHarvester/discovery/otxsearch.py:
--------------------------------------------------------------------------------
1 | from theHarvester.lib.core import *
2 | import re
3 |
4 |
5 | class SearchOtx:
6 |
7 | def __init__(self, word):
8 | self.word = word
9 | self.totalhosts = set()
10 | self.totalips = set()
11 | self.proxy = False
12 |
13 | async def do_search(self):
14 | url = f'https://otx.alienvault.com/api/v1/indicators/domain/{self.word}/passive_dns'
15 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy)
16 | responses = response[0]
17 | dct = responses
18 | self.totalhosts: set = {host['hostname'] for host in dct['passive_dns']}
19 | # filter out ips that are just called NXDOMAIN
20 | self.totalips: set = {ip['address'] for ip in dct['passive_dns']
21 | if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip['address'])}
22 |
23 | async def get_hostnames(self) -> set:
24 | return self.totalhosts
25 |
26 | async def get_ips(self) -> set:
27 | return self.totalips
28 |
29 | async def process(self, proxy=False):
30 | self.proxy = proxy
31 | await self.do_search()
32 |
--------------------------------------------------------------------------------
/theHarvester/discovery/pentesttools.py:
--------------------------------------------------------------------------------
1 | from theHarvester.discovery.constants import *
2 | from theHarvester.lib.core import *
3 | import json
4 | import time
5 |
6 |
7 | class SearchPentestTools:
8 |
9 | def __init__(self, word):
10 | # Script is largely based off https://pentest-tools.com/public/api_client.py.txt
11 | self.word = word
12 | self.key = Core.pentest_tools_key()
13 | if self.key is None:
14 | raise MissingKey('PentestTools')
15 | self.total_results = []
16 | self.api = f'https://pentest-tools.com/api?key={self.key}'
17 | self.proxy = False
18 |
19 | async def poll(self, scan_id):
20 | while True:
21 |             await asyncio.sleep(3)  # non-blocking delay between status polls
22 | # Get the status of our scan
23 | scan_status_data = {
24 | 'op': 'get_scan_status',
25 | 'scan_id': scan_id
26 | }
27 | responses = await AsyncFetcher.post_fetch(url=self.api, data=json.dumps(scan_status_data), proxy=self.proxy)
28 | res_json = json.loads(responses.strip())
29 | if res_json['op_status'] == 'success':
30 | if res_json['scan_status'] != 'waiting' and res_json['scan_status'] != 'running':
31 | getoutput_data = {
32 | 'op': 'get_output',
33 | 'scan_id': scan_id,
34 | 'output_format': 'json'
35 | }
36 | responses = await AsyncFetcher.post_fetch(url=self.api,
37 | data=json.dumps(getoutput_data),
38 | proxy=self.proxy)
39 |
40 | res_json = json.loads(responses.strip('\n'))
41 | self.total_results = await self.parse_json(res_json)
42 | break
43 | else:
44 | print(f"Operation get_scan_status failed because: {res_json['error']}. {res_json['details']}")
45 | break
46 |
47 | @staticmethod
48 | async def parse_json(json_results):
49 | status = json_results['op_status']
50 | if status == 'success':
51 | scan_tests = json_results['scan_output']['output_json']
52 | output_data = scan_tests[0]['output_data']
53 |             host_to_ip = [f'{subdomain[0]}:{subdomain[1]}' for subdomain in output_data if len(subdomain) > 1]
54 | return host_to_ip
55 | return []
56 |
57 | async def get_hostnames(self) -> list:
58 | return self.total_results
59 |
60 | async def do_search(self):
61 | subdomain_payload = {
62 | 'op': 'start_scan',
63 | 'tool_id': 20,
64 | 'tool_params': {
65 | 'target': f'{self.word}',
66 | 'web_details': 'off',
67 | 'do_smart_search': 'off'
68 | }
69 | }
70 | responses = await AsyncFetcher.post_fetch(url=self.api, data=json.dumps(subdomain_payload), proxy=self.proxy)
71 | res_json = json.loads(responses.strip())
72 | if res_json['op_status'] == 'success':
73 | scan_id = res_json['scan_id']
74 | await self.poll(scan_id)
75 |
76 | async def process(self, proxy=False):
77 | self.proxy = proxy
78 | await self.do_search() # Only need to do it once.
79 |
--------------------------------------------------------------------------------
/theHarvester/discovery/projectdiscovery.py:
--------------------------------------------------------------------------------
1 | from theHarvester.discovery.constants import *
2 | from theHarvester.lib.core import *
3 |
4 |
5 | class SearchDiscovery:
6 |
7 | def __init__(self, word):
8 | self.word = word
9 | self.key = Core.projectdiscovery_key()
10 | if self.key is None:
11 | raise MissingKey('ProjectDiscovery')
12 | self.total_results = None
13 | self.proxy = False
14 |
15 | async def do_search(self):
16 | url = f'https://dns.projectdiscovery.io/dns/{self.word}/subdomains'
17 | response = await AsyncFetcher.fetch_all([url], json=True, headers={'User-Agent': Core.get_user_agent(),
18 | 'Authorization': self.key},
19 | proxy=self.proxy)
20 | self.total_results = [f'{domains}.{self.word}' for domains in response[0]['subdomains']]
21 |
22 | async def get_hostnames(self) -> set:
23 | return self.total_results
24 |
25 | async def process(self, proxy=False):
26 | self.proxy = proxy
27 | await self.do_search()
28 |
--------------------------------------------------------------------------------
/theHarvester/discovery/qwantsearch.py:
--------------------------------------------------------------------------------
1 | import json
2 | import math
3 | from json.decoder import JSONDecodeError
4 |
5 | from theHarvester.lib.core import *
6 | from theHarvester.parsers import myparser
7 |
8 |
9 | class SearchQwant:
10 | def __init__(self, word, start, limit):
11 | self.word = word
12 | self.total_results = ""
13 | self.limit = int(limit)
14 | self.start = int(start)
15 | self.proxy = False
16 |
17 | def get_start_offset(self) -> int:
18 | """
19 |         self.start = 0
20 |         >>> 0
21 |         self.start = 7
22 |         >>> 0
23 |         self.start = 25
24 |         >>> 20
25 |         self.start = 42
26 |         >>> 40
27 | """
28 | start = int(math.floor(self.start / 10.0)) * 10
29 | return max(start, 0)
30 |
31 | async def do_search(self) -> None:
32 | headers = {'User-agent': Core.get_user_agent()}
33 |
34 | start = self.get_start_offset()
35 | limit = self.limit + start
36 | step = 10
37 |
38 | api_urls = [
39 | f"https://api.qwant.com/api/search/web?count=10&offset={str(offset)}&q={self.word}&t=web&r=US&device=desktop&safesearch=0&locale=en_US&uiv=4"
40 | for offset in range(start, limit, step)
41 | ]
42 |
43 | responses = await AsyncFetcher.fetch_all(api_urls, headers=headers, proxy=self.proxy)
44 |
45 | for response in responses:
46 | try:
47 | json_response = json.loads(response)
48 | except JSONDecodeError:
49 | # sometimes error 502 from server
50 | continue
51 |
52 | try:
53 | response_items = json_response['data']['result']['items']
54 | except KeyError:
55 | if json_response.get("status", None) \
56 | and json_response.get("error", None) == 24:
57 | # https://www.qwant.com/anti_robot
58 | print("Rate limit reached - IP Blocked until captcha is solved")
59 | break
60 | continue
61 |
62 | for response_item in response_items:
63 | desc = response_item.get('desc', '')
64 | """
65 | response_item[0]['desc'] = "end of previous description."
66 | response_item[1]['desc'] = "john.doo@company.com start the next description"
67 | total_results = "end of first description.john.doo@company.com"
68 | get_emails() = "description.john.doo@company.com"
69 | """
70 | self.total_results += " "
71 | self.total_results += desc
72 |
73 | async def get_emails(self) -> set:
74 | parser = myparser.Parser(self.total_results, self.word)
75 | return await parser.emails()
76 |
77 | async def get_hostnames(self) -> list:
78 | parser = myparser.Parser(self.total_results, self.word)
79 | return await parser.hostnames()
80 |
81 | async def process(self, proxy=False) -> None:
82 | self.proxy = proxy
83 | await self.do_search()
84 |
--------------------------------------------------------------------------------
/theHarvester/discovery/rapiddns.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | from theHarvester.lib.core import *
3 |
4 |
5 | class SearchRapidDns:
6 |
7 | def __init__(self, word):
8 | self.word = word
9 | self.total_results = []
10 | self.proxy = False
11 |
12 | async def do_search(self):
13 | try:
14 | headers = {'User-agent': Core.get_user_agent()}
15 | # TODO see if it's worth adding sameip searches
16 | # f'{self.hostname}/sameip/{self.word}?full=1#result'
17 | urls = [f'https://rapiddns.io/subdomain/{self.word}?full=1#result']
18 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy)
19 | if len(responses[0]) <= 1:
20 | return self.total_results
21 | soup = BeautifulSoup(responses[0], 'html.parser')
22 | rows = soup.find("table").find("tbody").find_all("tr")
23 | if rows:
24 | # Sanity check
25 | for row in rows:
26 | cells = row.find_all("td")
27 |                         if len(cells) > 0:
28 | # sanity check
29 | subdomain = str(cells[0].get_text())
30 | if cells[-1].get_text() == 'CNAME':
31 | self.total_results.append(f'{subdomain}')
32 | else:
33 | self.total_results.append(f'{subdomain}:{str(cells[1].get_text()).strip()}')
34 | self.total_results = list({domain for domain in self.total_results})
35 | except Exception as e:
36 | print(f'An exception has occurred: {str(e)}')
37 |
38 | async def process(self, proxy=False):
39 | self.proxy = proxy
40 | await self.do_search()
41 |
42 | async def get_hostnames(self):
43 | return self.total_results
44 |
--------------------------------------------------------------------------------
/theHarvester/discovery/rocketreach.py:
--------------------------------------------------------------------------------
1 | from theHarvester.discovery.constants import *
2 | from theHarvester.lib.core import *
3 | import asyncio
4 |
5 |
6 | class SearchRocketReach:
7 |
8 | def __init__(self, word, limit):
9 | self.ips = set()
10 | self.word = word
11 | self.key = Core.rocketreach_key()
12 | if self.key is None:
13 | raise MissingKey('RocketReach')
14 | self.hosts = set()
15 | self.proxy = False
16 | self.baseurl = 'https://api.rocketreach.co/v2/api/search'
17 | self.links = set()
18 | self.limit = limit
19 |
20 | async def do_search(self):
21 | try:
22 | headers = {
23 | 'Api-Key': self.key,
24 | 'Content-Type': 'application/json',
25 | 'User-Agent': Core.get_user_agent()
26 | }
27 |
28 |             next_page = 1  # track pagination
29 | for count in range(1, self.limit):
30 | data = f'{{"query":{{"company_domain": ["{self.word}"]}}, "start": {next_page}, "page_size": 100}}'
31 | result = await AsyncFetcher.post_fetch(self.baseurl, headers=headers, data=data, json=True)
32 | if 'detail' in result.keys() and 'error' in result.keys() and 'Subscribe to a plan to access' in result['detail']:
33 | # No more results can be fetched
34 | break
35 | if 'detail' in result.keys() and 'Request was throttled.' in result['detail']:
36 | # Rate limit has been triggered need to sleep extra
37 | print(f'RocketReach requests have been throttled; '
38 | f'{result["detail"].split(" ", 3)[-1].replace("available", "availability")}')
39 | break
40 | if 'profiles' in dict(result).keys():
41 | if len(result['profiles']) == 0:
42 | break
43 | for profile in result['profiles']:
44 | if 'linkedin_url' in dict(profile).keys():
45 | self.links.add(profile['linkedin_url'])
46 | if 'pagination' in dict(result).keys():
47 | next_page = int(result['pagination']['next'])
48 | if next_page > int(result['pagination']['total']):
49 | break
50 |
51 | await asyncio.sleep(get_delay() + 2)
52 |
53 | except Exception as e:
54 | print(f'An exception has occurred: {e}')
55 |
56 | async def get_links(self):
57 | return self.links
58 |
59 | async def process(self, proxy=False):
60 | self.proxy = proxy
61 | await self.do_search()
62 |
--------------------------------------------------------------------------------
/theHarvester/discovery/securitytrailssearch.py:
--------------------------------------------------------------------------------
1 | from theHarvester.discovery.constants import *
2 | from theHarvester.lib.core import *
3 | from theHarvester.parsers import securitytrailsparser
4 | import asyncio
5 |
6 |
7 | class SearchSecuritytrail:
8 |
9 | def __init__(self, word):
10 | self.word = word
11 | self.key = Core.security_trails_key()
12 | if self.key is None:
13 | raise MissingKey('Securitytrail')
14 | self.results = ""
15 | self.totalresults = ""
16 | self.api = 'https://api.securitytrails.com/v1/'
17 | self.info = ()
18 | self.proxy = False
19 |
20 | async def authenticate(self) -> None:
21 | # Method to authenticate API key before sending requests.
22 | headers = {'APIKEY': self.key}
23 | url = f'{self.api}ping'
24 | auth_responses = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy)
25 | auth_responses = auth_responses[0]
26 | if 'False' in auth_responses or 'Invalid authentication' in auth_responses:
27 |             print('\tKey could not be authenticated, exiting program.')
28 | await asyncio.sleep(2)
29 |
30 | async def do_search(self) -> None:
31 | # https://api.securitytrails.com/v1/domain/domain.com
32 | url = f'{self.api}domain/{self.word}'
33 | headers = {'APIKEY': self.key}
34 | response = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy)
35 | await asyncio.sleep(2) # Not random delay because 2 seconds is required due to rate limit.
36 | self.results = response[0]
37 | self.totalresults += self.results
38 | url += '/subdomains' # Get subdomains now.
39 | subdomain_response = await AsyncFetcher.fetch_all([url], headers=headers, proxy=self.proxy)
40 | await asyncio.sleep(2)
41 | self.results = subdomain_response[0]
42 | self.totalresults += self.results
43 |
44 | async def process(self, proxy=False) -> None:
45 | self.proxy = proxy
46 | await self.authenticate()
47 | await self.do_search()
48 | parser = securitytrailsparser.Parser(word=self.word, text=self.totalresults)
49 | self.info = await parser.parse_text()
50 | # Create parser and set self.info to tuple returned from parsing text.
51 | print('\tDone Searching Results')
52 |
53 | async def get_ips(self) -> set:
54 | return self.info[0]
55 |
56 | async def get_hostnames(self) -> set:
57 | return self.info[1]
58 |
--------------------------------------------------------------------------------
/theHarvester/discovery/shodansearch.py:
--------------------------------------------------------------------------------
1 | from theHarvester.discovery.constants import *
2 | from theHarvester.lib.core import *
3 | from shodan import exception
4 | from shodan import Shodan
5 | from collections import OrderedDict
6 |
7 |
8 | class SearchShodan:
9 |
10 | def __init__(self):
11 | self.key = Core.shodan_key()
12 | if self.key is None:
13 | raise MissingKey('Shodan')
14 | self.api = Shodan(self.key)
15 | self.hostdatarow = []
16 | self.tracker: OrderedDict = OrderedDict()
17 |
18 | async def search_ip(self, ip):
19 | try:
20 | ipaddress = ip
21 | results = self.api.host(ipaddress)
22 | asn = ''
23 | domains = list()
24 | hostnames = list()
25 | ip_str = ''
26 | isp = ''
27 | org = ''
28 | ports = list()
29 | title = ''
30 | server = ''
31 | product = ''
32 | technologies = list()
33 |
34 | data_first_dict = dict(results['data'][0])
35 |
36 | if 'ip_str' in data_first_dict.keys():
37 | ip_str += data_first_dict['ip_str']
38 |
39 | if 'http' in data_first_dict.keys():
40 | http_results_dict = dict(data_first_dict['http'])
41 | if 'title' in http_results_dict.keys():
42 | title_val = str(http_results_dict['title']).strip()
43 | if title_val != 'None':
44 | title += title_val
45 | if 'components' in http_results_dict.keys():
46 | for key in http_results_dict['components'].keys():
47 | technologies.append(key)
48 | if 'server' in http_results_dict.keys():
49 | server_val = str(http_results_dict['server']).strip()
50 | if server_val != 'None':
51 | server += server_val
52 |
53 | for key, value in results.items():
54 | if key == 'asn':
55 | asn += value
56 | if key == 'domains':
57 | value = list(value)
58 | value.sort()
59 | domains.extend(value)
60 | if key == 'hostnames':
61 | value = [host.strip() for host in list(value)]
62 | value.sort()
63 | hostnames.extend(value)
64 | if key == 'isp':
65 | isp += value
66 | if key == 'org':
67 | org += str(value)
68 | if key == 'ports':
69 | value = list(value)
70 | value.sort()
71 | ports.extend(value)
72 | if key == 'product':
73 | product += value
74 |
75 | technologies = list(set(technologies))
76 |
77 | self.tracker[ip] = {'asn': asn.strip(), 'domains': domains, 'hostnames': hostnames,
78 | 'ip_str': ip_str.strip(), 'isp': isp.strip(), 'org': org.strip(),
79 | 'ports': ports, 'product': product.strip(),
80 | 'server': server.strip(), 'technologies': technologies, 'title': title.strip()}
81 |
82 | return self.tracker
83 | except exception.APIError:
84 | print(f'{ip}: Not in Shodan')
85 | self.tracker[ip] = 'Not in Shodan'
86 | except Exception as e:
87 | # print(f'Error occurred in the Shodan IP search module: {e}')
88 | self.tracker[ip] = f'Error occurred in the Shodan IP search module: {e}'
89 | finally:
90 | return self.tracker
91 |
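# A minimal usage sketch (an assumption for illustration: a Shodan API key is configured so
# that Core.shodan_key() returns it). Unlike the other modules, this one is queried per IP
# and returns an OrderedDict mapping each IP to its parsed host data (or an error string):
#
#   import asyncio
#   from theHarvester.discovery.shodansearch import SearchShodan
#
#   async def demo():
#       shodan = SearchShodan()
#       tracker = await shodan.search_ip('8.8.8.8')
#       print(tracker['8.8.8.8'])
#
#   asyncio.run(demo())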
--------------------------------------------------------------------------------
/theHarvester/discovery/sublist3r.py:
--------------------------------------------------------------------------------
1 | from typing import Type
2 | from theHarvester.lib.core import *
3 |
4 |
5 | class SearchSublist3r:
6 |
7 | def __init__(self, word):
8 | self.word = word
9 |         self.totalhosts: list = []
10 | self.proxy = False
11 |
12 | async def do_search(self):
13 | url = f'https://api.sublist3r.com/search.php?domain={self.word}'
14 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy)
15 | self.totalhosts: list = response[0]
16 |
17 |     async def get_hostnames(self) -> list:
18 | return self.totalhosts
19 |
20 | async def process(self, proxy=False):
21 | self.proxy = proxy
22 | await self.do_search()
23 |
--------------------------------------------------------------------------------
/theHarvester/discovery/takeover.py:
--------------------------------------------------------------------------------
1 | from theHarvester.lib.core import *
2 | import re
3 |
4 |
5 | class TakeOver:
6 |
7 | def __init__(self, hosts):
8 | # NOTE THIS MODULE IS ACTIVE RECON
9 | self.hosts = hosts
10 | self.results = ""
11 | self.totalresults = ""
12 | self.proxy = False
13 | # Thank you to https://github.com/EdOverflow/can-i-take-over-xyz for these fingerprints
14 | self.fingerprints = {"'Trying to access your account?'": 'Campaign Monitor',
15 | '404 Not Found': 'Fly.io',
16 | '404 error unknown site!': 'Pantheon',
17 | 'Do you want to register *.wordpress.com?': 'Wordpress',
18 | 'Domain uses DO name serves with no records in DO.': 'Digital Ocean',
19 | "It looks like you may have taken a wrong turn somewhere. Don't worry...it happens to all of us.": 'LaunchRock',
20 | 'No Site For Domain': 'Kinsta',
21 | 'No settings were found for this company:': 'Help Scout',
22 | 'Project doesnt exist... yet!': 'Readme.io',
23 | 'Repository not found': 'Bitbucket',
24 | 'The feed has not been found.': 'Feedpress',
25 | 'No such app': 'Heroku',
26 | 'The specified bucket does not exist': 'AWS/S3',
27 | 'The thing you were looking for is no longer here, or never was': 'Ghost',
28 | "There isn't a Github Pages site here.": 'Github',
29 | 'This UserVoice subdomain is currently available!': 'UserVoice',
30 | "Uh oh. That page doesn't exist.": 'Intercom',
31 | "We could not find what you're looking for.": 'Help Juice',
32 | "Whatever you were looking for doesn't currently exist at this address": 'Tumblr',
33 | 'is not a registered InCloud YouTrack': 'JetBrains',
34 | 'page not found': 'Uptimerobot',
35 | 'project not found': 'Surge.sh'}
36 |
37 | async def check(self, url, resp):
38 | # Simple function that takes response and checks if any fingerprints exists
39 | # If a fingerprint exists figures out which one and prints it out
40 | regex = re.compile("(?=(" + "|".join(map(re.escape, list(self.fingerprints.keys()))) + "))")
41 | # Sanitize fingerprints
42 | matches = re.findall(regex, resp)
43 | for match in matches:
44 | print(f'\t\033[91m Takeover detected: {url}\033[1;32;40m')
45 | if match in self.fingerprints.keys():
46 | # Sanity check as to not error out
47 | print(f'\t\033[91m Type of takeover is: {self.fingerprints[match]}\033[1;32;40m')
48 |
49 | async def do_take(self):
50 | try:
51 | if len(self.hosts) > 0:
52 | tup_resps: list = await AsyncFetcher.fetch_all(self.hosts, takeover=True, proxy=self.proxy)
53 | # Returns a list of tuples in this format: (url, response)
54 | tup_resps = [tup for tup in tup_resps if tup[1] != '']
55 | # Filter out responses whose responses are empty strings (indicates errored)
56 | for url, resp in tup_resps:
57 | await self.check(url, resp)
58 | else:
59 | return
60 | except Exception as e:
61 | print(e)
62 |
63 | async def process(self, proxy=False):
64 | self.proxy = proxy
65 | await self.do_take()
66 |
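# Sketch of what check() does with the fingerprint table above (values are illustrative,
# using only two of the fingerprints):
#
#   import re
#   fingerprints = {'No such app': 'Heroku', 'Repository not found': 'Bitbucket'}
#   regex = re.compile("(?=(" + "|".join(map(re.escape, fingerprints)) + "))")
#   body = "<html>No such app</html>"
#   print(re.findall(regex, body))   # ['No such app'] -> reported as a Heroku takeover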
--------------------------------------------------------------------------------
/theHarvester/discovery/threatcrowd.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | from theHarvester.lib.core import *
3 |
4 |
5 | class SearchThreatcrowd:
6 |
7 | def __init__(self, word):
8 | self.word = word.replace(' ', '%20')
9 | self.hostnames = list()
10 | self.ips = list()
11 | self.proxy = False
12 |
13 | async def do_search(self):
14 | base_url = f'https://www.threatcrowd.org/searchApi/v2/domain/report/?domain={self.word}'
15 | headers = {'User-Agent': Core.get_user_agent()}
16 | try:
17 | responses = await AsyncFetcher.fetch_all([base_url], headers=headers, proxy=self.proxy, json=True)
18 | resp = responses[0]
19 | self.ips = {ip['ip_address'] for ip in resp['resolutions'] if len(ip['ip_address']) > 4}
20 | self.hostnames = set(list(resp['subdomains']))
21 | except Exception as e:
22 | print(e)
23 |
24 | async def get_ips(self) -> List:
25 | return self.ips
26 |
27 | async def get_hostnames(self) -> List:
28 | return self.hostnames
29 |
30 | async def process(self, proxy=False):
31 | self.proxy = proxy
32 | await self.do_search()
33 | await self.get_hostnames()
34 |
--------------------------------------------------------------------------------
/theHarvester/discovery/threatminer.py:
--------------------------------------------------------------------------------
1 | from typing import Type
2 | from theHarvester.lib.core import *
3 |
4 |
5 | class SearchThreatminer:
6 |
7 | def __init__(self, word):
8 | self.word = word
9 |         self.totalhosts: set = set()
10 |         self.totalips: set = set()
11 | self.proxy = False
12 |
13 | async def do_search(self):
14 | url = f'https://api.threatminer.org/v2/domain.php?q={self.word}&rt=5'
15 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy)
16 | self.totalhosts: set = {host for host in response[0]['results']}
17 | second_url = f'https://api.threatminer.org/v2/domain.php?q={self.word}&rt=2'
18 | secondresp = await AsyncFetcher.fetch_all([second_url], json=True, proxy=self.proxy)
19 | try:
20 | self.totalips: set = {resp['ip'] for resp in secondresp[0]['results']}
21 | except TypeError:
22 | pass
23 |
24 |     async def get_hostnames(self) -> set:
25 | return self.totalhosts
26 |
27 |     async def get_ips(self) -> set:
28 | return self.totalips
29 |
30 | async def process(self, proxy=False):
31 | self.proxy = proxy
32 | await self.do_search()
33 |
--------------------------------------------------------------------------------
/theHarvester/discovery/urlscan.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | from theHarvester.lib.core import *
3 |
4 |
5 | class SearchUrlscan:
6 | def __init__(self, word):
7 | self.word = word
8 | self.totalhosts = list()
9 | self.totalips = list()
10 | self.interestingurls = list()
11 | self.totalasns = list()
12 | self.proxy = False
13 |
14 | async def do_search(self):
15 | url = f'https://urlscan.io/api/v1/search/?q=domain:{self.word}'
16 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy)
17 | resp = response[0]
18 | self.totalhosts = {f"{page['page']['domain']}" for page in resp['results']}
19 | self.totalips = {f"{page['page']['ip']}" for page in resp['results'] if 'ip' in page['page'].keys()}
20 |         self.interestingurls = {f"{page['page']['url']}" for page in resp['results'] if 'url' in page['page'].keys() and self.word in page['page']['url']}
21 | self.totalasns = {f"{page['page']['asn']}" for page in resp['results'] if 'asn' in page['page'].keys()}
22 |
23 | async def get_hostnames(self) -> List:
24 | return self.totalhosts
25 |
26 | async def get_ips(self) -> List:
27 | return self.totalips
28 |
29 | async def get_interestingurls(self) -> List:
30 | return self.interestingurls
31 |
32 | async def get_asns(self) -> List:
33 | return self.totalasns
34 |
35 | async def process(self, proxy=False):
36 | self.proxy = proxy
37 | await self.do_search()
38 |
--------------------------------------------------------------------------------
/theHarvester/discovery/virustotal.py:
--------------------------------------------------------------------------------
1 | from theHarvester.discovery.constants import *
2 | from theHarvester.lib.core import *
3 |
4 |
5 | class SearchVirustotal:
6 |
7 | def __init__(self, word):
8 | self.key = Core.virustotal_key()
9 | if self.key is None:
10 | raise MissingKey('virustotal')
11 | self.word = word
12 | self.proxy = False
13 | self.hostnames = []
14 |
15 | async def do_search(self):
16 | # TODO determine if more endpoints can yield useful info given a domain
17 | # based on: https://developers.virustotal.com/reference/domains-relationships
18 | # base_url = "https://www.virustotal.com/api/v3/domains/domain/subdomains?limit=40"
19 | headers = {
20 | 'User-Agent': Core.get_user_agent(),
21 | "Accept": "application/json",
22 | "x-apikey": self.key
23 | }
24 | base_url = f"https://www.virustotal.com/api/v3/domains/{self.word}/subdomains?limit=40"
25 | cursor = ''
26 | count = 0
27 | fail_counter = 0
28 | counter = 0
29 | breakcon = False
30 | while True:
31 | if breakcon:
32 | break
33 | # rate limit is 4 per minute
34 | # TODO add timer logic if proven to be needed
35 | # in the meantime sleeping 16 seconds should eliminate hitting the rate limit
36 |             # if the rate limit is hit anyway, the fail counter increments and we sleep for 65 seconds
37 | send_url = base_url + "&cursor=" + cursor if cursor != '' and len(cursor) > 2 else base_url
38 | responses = await AsyncFetcher.fetch_all([send_url], headers=headers, proxy=self.proxy, json=True)
39 | jdata = responses[0]
40 | if 'data' not in jdata.keys():
41 | await asyncio.sleep(60 + 5)
42 | fail_counter += 1
43 | if 'meta' in jdata.keys():
44 | cursor = jdata['meta']['cursor'] if 'cursor' in jdata['meta'].keys() else ''
45 | if len(cursor) == 0 and 'data' in jdata.keys():
46 | # if cursor no longer is within the meta field have hit last entry
47 | breakcon = True
48 | count += jdata['meta']['count']
49 | if count == 0 or fail_counter >= 2:
50 | break
51 | if 'data' in jdata.keys():
52 | data = jdata['data']
53 | self.hostnames.extend(await self.parse_hostnames(data, self.word))
54 | counter += 1
55 | await asyncio.sleep(16)
56 | self.hostnames = list(sorted(set(self.hostnames)))
57 | # verify domains such as x.x.com.multicdn.x.com are parsed properly
58 | self.hostnames = [host for host in self.hostnames if ((len(host.split('.')) >= 3) and host.split('.')[-2] == self.word.split('.')[-2])]
59 |
60 | async def get_hostnames(self) -> list:
61 | return self.hostnames
62 |
63 | @staticmethod
64 | async def parse_hostnames(data, word):
65 | total_subdomains = set()
66 | for attribute in data:
67 | total_subdomains.add(attribute['id'].replace('"', '').replace('www.', ''))
68 | attributes = attribute['attributes']
69 | total_subdomains.update(
70 | {value['value'].replace('"', '').replace('www.', '') for value in attributes['last_dns_records'] if
71 | word in value['value']})
72 | if 'last_https_certificate' in attributes.keys():
73 | total_subdomains.update({value.replace('"', '').replace('www.', '') for value in
74 | attributes['last_https_certificate']['extensions']['subject_alternative_name']
75 | if word in value})
76 | total_subdomains = list(sorted(total_subdomains))
77 | # Other false positives may occur over time and yes there are other ways to parse this, feel free to implement
78 | # them and submit a PR or raise an issue if you run into this filtering not being enough
79 | # TODO determine if parsing 'v=spf1 include:_spf-x.acme.com include:_spf-x.acme.com' is worth parsing
80 | total_subdomains = [x for x in total_subdomains if not str(x).endswith('edgekey.net') and not str(x).endswith('akadns.net') and 'include:_spf' not in str(x)]
81 | total_subdomains.sort()
82 | return total_subdomains
83 |
84 | async def process(self, proxy=False):
85 | self.proxy = proxy
86 | await self.do_search()
87 |
--------------------------------------------------------------------------------
/theHarvester/discovery/yahoosearch.py:
--------------------------------------------------------------------------------
1 | from theHarvester.lib.core import *
2 | from theHarvester.parsers import myparser
3 |
4 |
5 | class SearchYahoo:
6 |
7 | def __init__(self, word, limit):
8 | self.word = word
9 | self.total_results = ""
10 | self.server = 'search.yahoo.com'
11 | self.limit = limit
12 | self.proxy = False
13 |
14 | async def do_search(self):
15 | base_url = f'https://{self.server}/search?p=%40{self.word}&b=xx&pz=10'
16 | headers = {
17 | 'Host': self.server,
18 | 'User-agent': Core.get_user_agent()
19 | }
20 | urls = [base_url.replace("xx", str(num)) for num in range(0, self.limit, 10) if num <= self.limit]
21 | responses = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy)
22 | for response in responses:
23 | self.total_results += response
24 |
25 | async def process(self):
26 | await self.do_search()
27 |
28 | async def get_emails(self):
29 | rawres = myparser.Parser(self.total_results, self.word)
30 | toparse_emails = await rawres.emails()
31 | emails = set()
32 | # strip out numbers and dashes for emails that look like xxx-xxx-xxxemail@host.tld
33 | for email in toparse_emails:
34 | email = str(email)
35 | if '-' in email and email[0].isdigit() and email.index('-') <= 9:
36 | while email[0] == '-' or email[0].isdigit():
37 | email = email[1:]
38 | emails.add(email)
39 | return list(emails)
40 |
41 | async def get_hostnames(self, proxy=False):
42 | self.proxy = proxy
43 | rawres = myparser.Parser(self.total_results, self.word)
44 | return await rawres.hostnames()
45 |
--------------------------------------------------------------------------------
/theHarvester/discovery/zoomeyesearch.py:
--------------------------------------------------------------------------------
1 | from theHarvester.discovery.constants import *
2 | from theHarvester.lib.core import *
3 | from theHarvester.parsers import myparser
4 | import asyncio
5 | import re
6 |
7 |
8 | class SearchZoomEye:
9 |
10 | def __init__(self, word, limit):
11 | self.word = word
12 | self.limit = limit
13 | self.key = Core.zoomeye_key()
14 | # NOTE for ZoomEye you get a system recharge on the 1st of every month
15 | # Which resets your balance to 10000 requests
16 |         # If you wish to extract as many subdomains as possible, see the comments in the
17 |         # fetch_subdomains method below
18 | if self.key is None:
19 | raise MissingKey('zoomeye')
20 | self.baseurl = 'https://api.zoomeye.org/host/search'
21 | self.proxy = False
22 | self.totalasns = list()
23 | self.totalhosts = list()
24 | self.interestingurls = list()
25 | self.totalips = list()
26 | self.totalemails = list()
27 | # Regex used is directly from: https://github.com/GerbenJavado/LinkFinder/blob/master/linkfinder.py#L29
28 | # Maybe one day it will be a pip package
29 | # Regardless LinkFinder is an amazing tool!
30 | self.iurl_regex = r"""
31 | (?:"|') # Start newline delimiter
32 | (
33 | ((?:[a-zA-Z]{1,10}://|//) # Match a scheme [a-Z]*1-10 or //
34 | [^"'/]{1,}\. # Match a domainname (any character + dot)
35 | [a-zA-Z]{2,}[^"']{0,}) # The domainextension and/or path
36 | |
37 | ((?:/|\.\./|\./) # Start with /,../,./
38 | [^"'><,;| *()(%%$^/\\\[\]] # Next character can't be...
39 | [^"'><,;|()]{1,}) # Rest of the characters can't be
40 | |
41 | ([a-zA-Z0-9_\-/]{1,}/ # Relative endpoint with /
42 | [a-zA-Z0-9_\-/]{1,} # Resource name
43 | \.(?:[a-zA-Z]{1,4}|action) # Rest + extension (length 1-4 or action)
44 | (?:[\?|#][^"|']{0,}|)) # ? or # mark with parameters
45 | |
46 | ([a-zA-Z0-9_\-/]{1,}/ # REST API (no extension) with /
47 | [a-zA-Z0-9_\-/]{3,} # Proper REST endpoints usually have 3+ chars
48 | (?:[\?|#][^"|']{0,}|)) # ? or # mark with parameters
49 | |
50 | ([a-zA-Z0-9_\-]{1,} # filename
51 | \.(?:php|asp|aspx|jsp|json|
52 | action|html|js|txt|xml) # . + extension
53 | (?:[\?|#][^"|']{0,}|)) # ? or # mark with parameters
54 | )
55 | (?:"|') # End newline delimiter
56 | """
57 | self.iurl_regex = re.compile(self.iurl_regex, re.VERBOSE)
58 |
59 | async def fetch_subdomains(self):
60 | # Based on docs from: https://www.zoomeye.org/doc#search-sub-domain-ip
61 | headers = {
62 | 'API-KEY': self.key,
63 | 'User-Agent': Core.get_user_agent()
64 | }
65 |
66 | subdomain_search_endpoint = f'https://api.zoomeye.org/domain/search?q={self.word}&type=0&'
67 |
68 | response = await AsyncFetcher.fetch_all([subdomain_search_endpoint + 'page=1'],
69 | json=True, proxy=self.proxy, headers=headers)
70 | # Make initial request to determine total number of subdomains
71 | resp = response[0]
72 | if resp['status'] != 200:
73 | return
74 | total = resp['total']
75 | # max number of results per request seems to be 30
76 | # NOTE: If you wish to get as many subdomains as possible
77 | # Change the line below to:
78 | # self.limit = (total // 30) + 1
79 | self.limit = self.limit if total > self.limit else (total // 30) + 1
80 | self.totalhosts.extend([item["name"] for item in resp["list"]])
81 | for i in range(2, self.limit):
82 | response = await AsyncFetcher.fetch_all([subdomain_search_endpoint + f'page={i}'],
83 | json=True, proxy=self.proxy, headers=headers)
84 | resp = response[0]
85 | if resp['status'] != 200:
86 | return
87 | found_subdomains = [item["name"] for item in resp["list"]]
88 | if len(found_subdomains) == 0:
89 | break
90 | self.totalhosts.extend(found_subdomains)
91 | if i % 10 == 0:
92 | await asyncio.sleep(get_delay() + 1)
93 |
94 | async def do_search(self):
95 | headers = {
96 | 'API-KEY': self.key,
97 | 'User-Agent': Core.get_user_agent()
98 | }
99 | # Fetch subdomains first
100 | await self.fetch_subdomains()
101 | params = (
102 | ('query', f'site:{self.word}'),
103 | ('page', '1'),
104 | )
105 | response = await AsyncFetcher.fetch_all([self.baseurl], json=True, proxy=self.proxy, headers=headers,
106 | params=params)
107 | # First request determines how many pages there in total
108 | resp = response[0]
109 | total_pages = int(resp['available'])
110 | self.limit = self.limit if total_pages > self.limit else total_pages
111 | self.limit = 3 if self.limit == 2 else self.limit
112 | cur_page = 2 if self.limit >= 2 else -1
113 | # Means there is only one page
114 | # hostnames, emails, ips, asns, iurls
115 | nomatches_counter = 0
116 | # cur_page = -1
117 | if cur_page == -1:
118 | # No need to do loop just parse and leave
119 | if 'matches' in resp.keys():
120 | hostnames, emails, ips, asns, iurls = await self.parse_matches(resp['matches'])
121 | self.totalhosts.extend(hostnames)
122 | self.totalemails.extend(emails)
123 | self.totalips.extend(ips)
124 | self.totalasns.extend(asns)
125 | self.interestingurls.extend(iurls)
126 | else:
127 | if 'matches' in resp.keys():
128 | # Parse out initial results and then continue to loop
129 | hostnames, emails, ips, asns, iurls = await self.parse_matches(resp['matches'])
130 | self.totalhosts.extend(hostnames)
131 | self.totalemails.extend(emails)
132 | self.totalips.extend(ips)
133 | self.totalasns.extend(asns)
134 | self.interestingurls.extend(iurls)
135 |
136 | for num in range(2, self.limit):
137 | # print(f'Currently on page: {num}')
138 | params = (
139 | ('query', f'site:{self.word}'),
140 | ('page', f'{num}'),
141 | )
142 | response = await AsyncFetcher.fetch_all([self.baseurl], json=True, proxy=self.proxy, headers=headers,
143 | params=params)
144 | resp = response[0]
145 | if 'matches' not in resp.keys():
146 | print(f'Your resp: {resp}')
147 | print('Match not found in keys')
148 | break
149 |
150 | hostnames, emails, ips, asns, iurls = await self.parse_matches(resp['matches'])
151 |
152 | if len(hostnames) == 0 and len(emails) == 0 and len(ips) == 0 \
153 | and len(asns) == 0 and len(iurls) == 0:
154 | nomatches_counter += 1
155 |
156 | if nomatches_counter >= 5:
157 | break
158 |
159 | self.totalhosts.extend(hostnames)
160 | self.totalemails.extend(emails)
161 | self.totalips.extend(ips)
162 | self.totalasns.extend(asns)
163 | self.interestingurls.extend(iurls)
164 |
165 | if num % 10 == 0:
166 | await asyncio.sleep(get_delay() + 1)
167 |
168 | async def parse_matches(self, matches):
169 | # Helper function to parse items from match json
170 | # ips = {match["ip"] for match in matches}
171 | ips = set()
172 | iurls = set()
173 | hostnames = set()
174 | asns = set()
175 | emails = set()
176 | for match in matches:
177 | try:
178 | ips.add(match['ip'])
179 |
180 | if 'geoinfo' in match.keys():
181 | asns.add(int(match['geoinfo']['asn']))
182 |
183 | if 'rdns_new' in match.keys():
184 | rdns_new = match['rdns_new']
185 |
186 | if ',' in rdns_new:
187 | parts = str(rdns_new).split(',')
188 | rdns_new = parts[0]
189 | if len(parts) == 2:
190 | hostnames.add(parts[1])
191 | rdns_new = rdns_new[:-1] if rdns_new[-1] == '.' else rdns_new
192 | hostnames.add(rdns_new)
193 | else:
194 | rdns_new = rdns_new[:-1] if rdns_new[-1] == '.' else rdns_new
195 | hostnames.add(rdns_new)
196 |
197 | if 'rdns' in match.keys():
198 | rdns = match['rdns']
199 | rdns = rdns[:-1] if rdns[-1] == '.' else rdns
200 | hostnames.add(rdns)
201 |
202 | if 'portinfo' in match.keys():
203 | # re.
204 | temp_emails = set(await self.parse_emails(match['portinfo']['banner']))
205 | emails.update(temp_emails)
206 | hostnames.update(set(await self.parse_hostnames(match['portinfo']['banner'])))
207 |                     iurls.update({str(iurl.group(1)).replace('"', '') for iurl
208 |                                   in re.finditer(self.iurl_regex, match['portinfo']['banner'])
209 |                                   if self.word in str(iurl.group(1))})
210 | except Exception as e:
211 | print(f'An exception has occurred: {e}')
212 | return hostnames, emails, ips, asns, iurls
213 |
214 | async def process(self, proxy=False):
215 | self.proxy = proxy
216 | await self.do_search() # Only need to do it once.
217 |
218 | async def parse_emails(self, content):
219 | rawres = myparser.Parser(content, self.word)
220 | return await rawres.emails()
221 |
222 | async def parse_hostnames(self, content):
223 | rawres = myparser.Parser(content, self.word)
224 | return await rawres.hostnames()
225 |
226 | async def get_hostnames(self):
227 | return set(self.totalhosts)
228 |
229 | async def get_emails(self):
230 | return set(self.totalemails)
231 |
232 | async def get_ips(self):
233 | return set(self.totalips)
234 |
235 | async def get_asns(self):
236 | return set(self.totalasns)
237 |
238 | async def get_interestingurls(self):
239 | return set(self.interestingurls)
240 |
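# A minimal usage sketch (an assumption for illustration: a ZoomEye API key is configured so
# that Core.zoomeye_key() returns it). process() first pulls subdomains, then pages through
# host-search results; the getters expose the de-duplicated findings:
#
#   import asyncio
#   from theHarvester.discovery.zoomeyesearch import SearchZoomEye
#
#   async def demo():
#       engine = SearchZoomEye(word='example.com', limit=5)
#       await engine.process()
#       print(await engine.get_hostnames())
#       print(await engine.get_ips())
#       print(await engine.get_interestingurls())
#
#   asyncio.run(demo())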
--------------------------------------------------------------------------------
/theHarvester/lib/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ['hostchecker']
2 |
--------------------------------------------------------------------------------
/theHarvester/lib/api/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/theHarvester/lib/api/__init__.py
--------------------------------------------------------------------------------
/theHarvester/lib/api/api.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from typing import List
3 | import os
4 | from fastapi import FastAPI, Header, Query, Request
5 | from fastapi.responses import HTMLResponse, UJSONResponse
6 | from slowapi import Limiter, _rate_limit_exceeded_handler
7 | from slowapi.errors import RateLimitExceeded
8 | from slowapi.util import get_remote_address
9 | from starlette.responses import RedirectResponse
10 | from starlette.staticfiles import StaticFiles
11 |
12 | from theHarvester import __main__
13 |
14 | limiter = Limiter(key_func=get_remote_address)
15 | app = FastAPI(title='Restful Harvest', description='Rest API for theHarvester powered by FastAPI', version='0.0.2')
16 | app.state.limiter = limiter
17 | app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
18 |
19 | # This is where we will host output files that are generated when the user specifies a filename
20 | try:
21 | app.mount('/static', StaticFiles(directory='theHarvester/lib/api/static/'), name='static')
22 | except RuntimeError:
23 | static_path = os.path.expanduser('~/.local/share/theHarvester/static/')
24 | if not os.path.isdir(static_path):
25 | os.makedirs(static_path)
26 |     app.mount('/static', StaticFiles(directory=static_path), name='static')
27 |
28 |
29 | @app.get('/', response_class=HTMLResponse)
30 | async def root(*, user_agent: str = Header(None)):
31 | # very basic user agent filtering
32 | if user_agent and ('gobuster' in user_agent or 'sqlmap' in user_agent or 'rustbuster' in user_agent):
33 | response = RedirectResponse(app.url_path_for('bot'))
34 | return response
35 |
36 | html = """
37 |
38 |
39 |
40 | theHarvester API
41 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 | """
58 | return html
59 |
60 |
61 | @app.get('/nicebot')
62 | async def bot():
63 | # nice bot
64 | string = {'bot': 'These are not the droids you are looking for'}
65 | return string
66 |
67 |
68 | @app.get('/sources', response_class=UJSONResponse)
69 | @limiter.limit('5/minute')
70 | async def getsources(request: Request):
71 | # Endpoint for user to query for available sources theHarvester supports
72 | # Rate limit of 5 requests per minute
73 | sources = __main__.Core.get_supportedengines()
74 | return {'sources': sources}
75 |
76 |
77 | @app.get('/dnsbrute', response_class=UJSONResponse)
78 | @limiter.limit('5/minute')
79 | async def dnsbrute(request: Request, user_agent: str = Header(None),
80 | domain: str = Query(..., description='Domain to be brute forced')):
81 | # Endpoint for user to signal to do DNS brute forcing
82 | # Rate limit of 5 requests per minute
83 | # basic user agent filtering
84 | if user_agent and ('gobuster' in user_agent or 'sqlmap' in user_agent or 'rustbuster' in user_agent):
85 | response = RedirectResponse(app.url_path_for('bot'))
86 | return response
87 | dns_bruteforce = await __main__.start(argparse.Namespace(dns_brute=True,
88 | dns_lookup=False,
89 | dns_server=False,
90 | dns_tld=False,
91 | domain=domain,
92 | filename='',
93 | google_dork=False,
94 | limit=500,
95 | proxies=False,
96 | shodan=False,
97 | source=','.join([]),
98 | start=0,
99 | take_over=False,
100 | virtual_host=False))
101 | return {'dns_bruteforce': dns_bruteforce}
102 |
103 |
104 | @app.get('/query', response_class=UJSONResponse)
105 | @limiter.limit('2/minute')
106 | async def query(request: Request, dns_server: str = Query(""), user_agent: str = Header(None),
107 | dns_brute: bool = Query(False),
108 | dns_lookup: bool = Query(False),
109 | dns_tld: bool = Query(False),
110 | filename: str = Query(""),
111 | google_dork: bool = Query(False), proxies: bool = Query(False), shodan: bool = Query(False),
112 | take_over: bool = Query(False), virtual_host: bool = Query(False),
113 |                 source: List[str] = Query(..., description='Data sources to query, comma-separated with no spaces'),
114 | limit: int = Query(500), start: int = Query(0),
115 | domain: str = Query(..., description='Domain to be harvested')):
116 |
117 | # Query function that allows user to query theHarvester rest API
118 | # Rate limit of 2 requests per minute
119 | # basic user agent filtering
120 | if user_agent and ('gobuster' in user_agent or 'sqlmap' in user_agent or 'rustbuster' in user_agent):
121 | response = RedirectResponse(app.url_path_for('bot'))
122 | return response
123 | try:
124 | asns, iurls, twitter_people_list, \
125 | linkedin_people_list, linkedin_links, \
126 | aurls, aips, aemails, ahosts = await __main__.start(argparse.Namespace(dns_brute=dns_brute,
127 | dns_lookup=dns_lookup,
128 | dns_server=dns_server,
129 | dns_tld=dns_tld,
130 | domain=domain,
131 | filename=filename,
132 | google_dork=google_dork,
133 | limit=limit,
134 | proxies=proxies,
135 | shodan=shodan,
136 | source=','.join(source),
137 | start=start,
138 | take_over=take_over,
139 | virtual_host=virtual_host))
140 |
141 | return {'asns': asns, 'interesting_urls': iurls,
142 | 'twitter_people': twitter_people_list,
143 | 'linkedin_people': linkedin_people_list,
144 | 'linkedin_links': linkedin_links,
145 | 'trello_urls': aurls,
146 | 'ips': aips,
147 | 'emails': aemails,
148 | 'hosts': ahosts}
149 | except Exception:
150 | return {'exception': 'Please contact the server administrator to check the issue'}
151 |
--------------------------------------------------------------------------------
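A minimal sketch of how the endpoints defined above could be exercised with the synchronous requests library; it assumes an instance is already listening on 127.0.0.1:5000 (the address used by api_example.py below), and example.com is a placeholder domain.

import requests

BASE = 'http://127.0.0.1:5000'  # assumed local instance of the REST API

# /sources lists the supported data sources (rate limited to 5 requests per minute)
print(requests.get(f'{BASE}/sources').json())

# /query mirrors the CLI options; source is a comma-separated list and domain is required
params = {'source': 'bing,duckduckgo', 'limit': 100, 'domain': 'example.com'}
print(requests.get(f'{BASE}/query', params=params).json())
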
/theHarvester/lib/api/api_example.py:
--------------------------------------------------------------------------------
1 | """
2 | Example script to query theHarvester rest API, obtain results, and write them to stdout
3 | """
4 |
5 | import asyncio
6 | import aiohttp
7 | import netaddr
8 |
9 |
10 | async def fetch_json(session, url):
11 | async with session.get(url) as response:
12 | return await response.json()
13 |
14 |
15 | async def fetch(session, url):
16 | async with session.get(url) as response:
17 | return await response.text()
18 |
19 |
20 | async def main():
21 | """
22 |     A simple example of how to interact with the rest API;
23 |     you can easily use requests instead of aiohttp, or whatever library you see fit
24 | """
25 | url = "http://127.0.0.1:5000"
26 | domain = "netflix.com"
27 | query_url = f'{url}/query?limit=300&source=bing,baidu,duckduckgo,dogpile&domain={domain}'
28 | async with aiohttp.ClientSession() as session:
29 | fetched_json = await fetch_json(session, query_url)
30 | total_asns = fetched_json['asns']
31 | interesting_urls = fetched_json['interesting_urls']
32 | twitter_people_list_tracker = fetched_json['twitter_people']
33 | linkedin_people_list_tracker = fetched_json['linkedin_people']
34 | linkedin_links_tracker = fetched_json['linkedin_links']
35 | trello_urls = fetched_json['trello_urls']
36 | ips = fetched_json['ips']
37 | emails = fetched_json['emails']
38 | hosts = fetched_json['hosts']
39 |
40 | if len(total_asns) > 0:
41 | print(f'\n[*] ASNS found: {len(total_asns)}')
42 | print('--------------------')
43 | total_asns = list(sorted(set(total_asns)))
44 | for asn in total_asns:
45 | print(asn)
46 |
47 | if len(interesting_urls) > 0:
48 | print(f'\n[*] Interesting Urls found: {len(interesting_urls)}')
49 | print('--------------------')
50 | interesting_urls = list(sorted(set(interesting_urls)))
51 | for iurl in interesting_urls:
52 | print(iurl)
53 |
54 | if len(twitter_people_list_tracker) == 0:
55 | print('\n[*] No Twitter users found.\n\n')
56 | else:
57 | if len(twitter_people_list_tracker) >= 1:
58 | print('\n[*] Twitter Users found: ' + str(len(twitter_people_list_tracker)))
59 | print('---------------------')
60 | twitter_people_list_tracker = list(sorted(set(twitter_people_list_tracker)))
61 | for usr in twitter_people_list_tracker:
62 | print(usr)
63 |
64 | if len(linkedin_people_list_tracker) == 0:
65 | print('\n[*] No LinkedIn users found.\n\n')
66 | else:
67 | if len(linkedin_people_list_tracker) >= 1:
68 | print('\n[*] LinkedIn Users found: ' + str(len(linkedin_people_list_tracker)))
69 | print('---------------------')
70 | linkedin_people_list_tracker = list(sorted(set(linkedin_people_list_tracker)))
71 | for usr in linkedin_people_list_tracker:
72 | print(usr)
73 |
74 |         if len(linkedin_links_tracker) > 0:
75 | print(f'\n[*] LinkedIn Links found: {len(linkedin_links_tracker)}')
76 | linkedin_links_tracker = list(sorted(set(linkedin_links_tracker)))
77 | print('---------------------')
78 | for link in linkedin_links_tracker:
79 | print(link)
80 |
81 | length_urls = len(trello_urls)
82 | total = length_urls
83 | print('\n[*] Trello URLs found: ' + str(total))
84 | print('--------------------')
85 | all_urls = list(sorted(set(trello_urls)))
86 | for url in sorted(all_urls):
87 | print(url)
88 |
89 | if len(ips) == 0:
90 | print('\n[*] No IPs found.')
91 | else:
92 | print('\n[*] IPs found: ' + str(len(ips)))
93 | print('-------------------')
94 | # use netaddr as the list may contain ipv4 and ipv6 addresses
95 | ip_list = sorted([netaddr.IPAddress(ip.strip()) for ip in set(ips)])
96 | print('\n'.join(map(str, ip_list)))
97 |
98 | if len(emails) == 0:
99 | print('\n[*] No emails found.')
100 | else:
101 | print('\n[*] Emails found: ' + str(len(emails)))
102 | print('----------------------')
103 | all_emails = sorted(list(set(emails)))
104 | print(('\n'.join(all_emails)))
105 |
106 | if len(hosts) == 0:
107 | print('\n[*] No hosts found.\n\n')
108 | else:
109 | print('\n[*] Hosts found: ' + str(len(hosts)))
110 | print('---------------------')
111 | print('\n'.join(hosts))
112 |
113 |
114 | if __name__ == '__main__':
115 | asyncio.run(main())
116 |
--------------------------------------------------------------------------------
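As the docstring above notes, the same query can be issued synchronously with requests; a minimal sketch reusing the URL, parameters, and domain from api_example.py:

import requests

resp = requests.get('http://127.0.0.1:5000/query',
                    params={'limit': 300,
                            'source': 'bing,baidu,duckduckgo,dogpile',
                            'domain': 'netflix.com'})
data = resp.json()
print(f"hosts: {len(data.get('hosts', []))}, emails: {len(data.get('emails', []))}")
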
/theHarvester/lib/api/static/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/theHarvester/lib/api/static/.gitkeep
--------------------------------------------------------------------------------
/theHarvester/lib/hostchecker.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 | """
4 | Created by laramies on 2008-08-21.
5 | Revised to use aiodns & asyncio on 2019-09-23
6 | """
7 |
8 | import aiodns
9 | import asyncio
10 | import socket
11 | from typing import Tuple, Any
12 |
13 |
14 | class Checker:
15 |
16 | def __init__(self, hosts: list, nameserver=False):
17 | self.hosts = hosts
18 | self.realhosts: list = []
19 | self.addresses: set = set()
20 | self.nameserver = []
21 | if nameserver:
22 | self.nameserver = nameserver
23 |
24 | @staticmethod
25 | async def query(host, resolver) -> Tuple[str, Any]:
26 | try:
27 | result = await resolver.gethostbyname(host, socket.AF_INET)
28 | addresses = result.addresses
29 | if addresses == [] or addresses is None or result is None:
30 | return f"{host}:", tuple()
31 | else:
32 | return f"{host}:{', '.join(map(str, addresses))}", addresses
33 | except Exception:
34 | return f"{host}", tuple()
35 |
36 | async def query_all(self, resolver) -> list:
37 | results = await asyncio.gather(*[asyncio.create_task(self.query(host, resolver))
38 | for host in self.hosts])
39 | return results
40 |
41 | async def check(self):
42 | loop = asyncio.get_event_loop()
43 | resolver = aiodns.DNSResolver(loop=loop, timeout=4) if len(self.nameserver) == 0\
44 | else aiodns.DNSResolver(loop=loop, timeout=4, nameservers=self.nameserver)
45 | results = await self.query_all(resolver)
46 | for host, address in results:
47 | self.realhosts.append(host)
48 | self.addresses.update({addr for addr in address})
49 | # address may be a list of ips
50 | # and do a set comprehension to remove duplicates
51 | self.realhosts.sort()
52 | self.addresses = list(self.addresses)
53 | return self.realhosts, self.addresses
54 |
--------------------------------------------------------------------------------
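A minimal sketch of how Checker might be driven on its own; the hostnames and the nameserver address are placeholders.

import asyncio
from theHarvester.lib.hostchecker import Checker

async def resolve() -> None:
    # Resolve two hosts against a caller-supplied nameserver
    checker = Checker(['www.example.com', 'mail.example.com'], nameserver=['8.8.8.8'])
    hosts, addresses = await checker.check()
    print(hosts)      # e.g. ['mail.example.com:', 'www.example.com:93.184.216.34']
    print(addresses)  # deduplicated list of resolved addresses

asyncio.run(resolve())
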
/theHarvester/lib/stash.py:
--------------------------------------------------------------------------------
1 | import aiosqlite
2 | import datetime
3 | import os
4 |
5 | db_path = os.path.expanduser('~/.local/share/theHarvester')
6 |
7 | if not os.path.isdir(db_path):
8 | os.makedirs(db_path)
9 |
10 |
11 | class StashManager:
12 |
13 | def __init__(self):
14 | self.db = os.path.join(db_path, 'stash.sqlite')
15 | self.results = ""
16 | self.totalresults = ""
17 | self.latestscandomain = {}
18 | self.domainscanhistory = []
19 | self.scanboarddata = {}
20 | self.scanstats = []
21 | self.latestscanresults = []
22 | self.previousscanresults = []
23 |
24 | async def do_init(self):
25 | async with aiosqlite.connect(self.db) as db:
26 | await db.execute(
27 | 'CREATE TABLE IF NOT EXISTS results (domain text, resource text, type text, find_date date, source text)')
28 | await db.commit()
29 |
30 | async def store(self, domain, resource, res_type, source):
31 | self.domain = domain
32 | self.resource = resource
33 | self.type = res_type
34 | self.source = source
35 | self.date = datetime.date.today()
36 | try:
37 | async with aiosqlite.connect(self.db, timeout=30) as db:
38 | await db.execute('INSERT INTO results (domain,resource, type, find_date, source) VALUES (?,?,?,?,?)',
39 | (self.domain, self.resource, self.type, self.date, self.source))
40 | await db.commit()
41 | except Exception as e:
42 | print(e)
43 |
44 | async def store_all(self, domain, all, res_type, source):
45 | self.domain = domain
46 | self.all = all
47 | self.type = res_type
48 | self.source = source
49 | self.date = datetime.date.today()
50 | master_list = [(self.domain, x, self.type, self.date, self.source) for x in self.all]
51 | async with aiosqlite.connect(self.db, timeout=30) as db:
52 | try:
53 | await db.executemany('INSERT INTO results (domain,resource, type, find_date, source) VALUES (?,?,?,?,?)',
54 | master_list)
55 | await db.commit()
56 | except Exception as e:
57 | print(e)
58 |
59 | async def generatedashboardcode(self, domain):
60 | try:
61 | # TODO refactor into generic method
62 | self.latestscandomain["domain"] = domain
63 | async with aiosqlite.connect(self.db, timeout=30) as conn:
64 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="host"''',
65 | (domain,))
66 | data = await cursor.fetchone()
67 | self.latestscandomain["host"] = data[0]
68 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="email"''',
69 | (domain,))
70 | data = await cursor.fetchone()
71 | self.latestscandomain["email"] = data[0]
72 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="ip"''', (domain,))
73 | data = await cursor.fetchone()
74 | self.latestscandomain["ip"] = data[0]
75 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="vhost"''',
76 | (domain,))
77 | data = await cursor.fetchone()
78 | self.latestscandomain["vhost"] = data[0]
79 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="shodan"''',
80 | (domain,))
81 | data = await cursor.fetchone()
82 | self.latestscandomain["shodan"] = data[0]
83 | cursor = await conn.execute('''SELECT MAX(find_date) FROM results WHERE domain=?''', (domain,))
84 | data = await cursor.fetchone()
85 | self.latestscandomain["latestdate"] = data[0]
86 | latestdate = data[0]
87 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="host"''',
88 | (domain, latestdate,))
89 | scandetailshost = await cursor.fetchall()
90 | self.latestscandomain["scandetailshost"] = scandetailshost
91 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="email"''',
92 | (domain, latestdate,))
93 | scandetailsemail = await cursor.fetchall()
94 | self.latestscandomain["scandetailsemail"] = scandetailsemail
95 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="ip"''',
96 | (domain, latestdate,))
97 | scandetailsip = await cursor.fetchall()
98 | self.latestscandomain["scandetailsip"] = scandetailsip
99 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="vhost"''',
100 | (domain, latestdate,))
101 | scandetailsvhost = await cursor.fetchall()
102 | self.latestscandomain["scandetailsvhost"] = scandetailsvhost
103 | cursor = await conn.execute(
104 | '''SELECT * FROM results WHERE domain=? AND find_date=? AND type="shodan"''',
105 | (domain, latestdate,))
106 | scandetailsshodan = await cursor.fetchall()
107 | self.latestscandomain["scandetailsshodan"] = scandetailsshodan
108 | return self.latestscandomain
109 | except Exception as e:
110 | print(e)
111 |
112 | async def getlatestscanresults(self, domain, previousday=False):
113 | try:
114 | async with aiosqlite.connect(self.db, timeout=30) as conn:
115 | if previousday:
116 | try:
117 | cursor = await conn.execute('''
118 | SELECT DISTINCT(find_date)
119 | FROM results
120 | WHERE find_date=date('now', '-1 day') and domain=?''', (domain,))
121 | previousscandate = await cursor.fetchone()
122 | if not previousscandate: # When theHarvester runs first time/day this query will return.
123 | self.previousscanresults = ["No results", "No results", "No results", "No results",
124 | "No results"]
125 | else:
126 | cursor = await conn.execute('''
127 | SELECT find_date, domain, source, type, resource
128 | FROM results
129 | WHERE find_date=? and domain=?
130 | ORDER BY source,type
131 | ''', (previousscandate[0], domain,))
132 | results = await cursor.fetchall()
133 | self.previousscanresults = results
134 | return self.previousscanresults
135 | except Exception as e:
136 | print(f'Error in getting the previous scan results from the database: {e}')
137 | else:
138 | try:
139 | cursor = await conn.execute('''SELECT MAX(find_date) FROM results WHERE domain=?''', (domain,))
140 | latestscandate = await cursor.fetchone()
141 | cursor = await conn.execute('''
142 | SELECT find_date, domain, source, type, resource
143 | FROM results
144 | WHERE find_date=? and domain=?
145 | ORDER BY source,type
146 | ''', (latestscandate[0], domain,))
147 | results = await cursor.fetchall()
148 | self.latestscanresults = results
149 | return self.latestscanresults
150 | except Exception as e:
151 | print(f'Error in getting the latest scan results from the database: {e}')
152 | except Exception as e:
153 | print(f'Error connecting to theHarvester database: {e}')
154 |
155 | async def getscanboarddata(self):
156 | try:
157 | async with aiosqlite.connect(self.db, timeout=30) as conn:
158 |
159 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE type="host"''')
160 | data = await cursor.fetchone()
161 | self.scanboarddata["host"] = data[0]
162 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE type="email"''')
163 | data = await cursor.fetchone()
164 | self.scanboarddata["email"] = data[0]
165 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE type="ip"''')
166 | data = await cursor.fetchone()
167 | self.scanboarddata["ip"] = data[0]
168 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE type="vhost"''')
169 | data = await cursor.fetchone()
170 | self.scanboarddata["vhost"] = data[0]
171 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE type="shodan"''')
172 | data = await cursor.fetchone()
173 | self.scanboarddata["shodan"] = data[0]
174 | cursor = await conn.execute('''SELECT COUNT(DISTINCT(domain)) FROM results ''')
175 | data = await cursor.fetchone()
176 | self.scanboarddata["domains"] = data[0]
177 | return self.scanboarddata
178 | except Exception as e:
179 | print(e)
180 |
181 | async def getscanhistorydomain(self, domain):
182 | try:
183 | async with aiosqlite.connect(self.db, timeout=30) as conn:
184 | cursor = await conn.execute('''SELECT DISTINCT(find_date) FROM results WHERE domain=?''', (domain,))
185 | dates = await cursor.fetchall()
186 | for date in dates:
187 | cursor = await conn.execute(
188 | '''SELECT COUNT(*) from results WHERE domain=? AND type="host" AND find_date=?''',
189 | (domain, date[0]))
190 | counthost = await cursor.fetchone()
191 | cursor = await conn.execute(
192 | '''SELECT COUNT(*) from results WHERE domain=? AND type="email" AND find_date=?''',
193 | (domain, date[0]))
194 | countemail = await cursor.fetchone()
195 | cursor = await conn.execute(
196 | '''SELECT COUNT(*) from results WHERE domain=? AND type="ip" AND find_date=?''',
197 | (domain, date[0]))
198 | countip = await cursor.fetchone()
199 | cursor = await conn.execute(
200 | '''SELECT COUNT(*) from results WHERE domain=? AND type="vhost" AND find_date=?''',
201 | (domain, date[0]))
202 | countvhost = await cursor.fetchone()
203 | cursor = await conn.execute(
204 | '''SELECT COUNT(*) from results WHERE domain=? AND type="shodan" AND find_date=?''',
205 | (domain, date[0]))
206 | countshodan = await cursor.fetchone()
207 | results = {
208 | "date": str(date[0]),
209 | "hosts": str(counthost[0]),
210 | "email": str(countemail[0]),
211 | "ip": str(countip[0]),
212 | "vhost": str(countvhost[0]),
213 | "shodan": str(countshodan[0])
214 | }
215 | self.domainscanhistory.append(results)
216 | return self.domainscanhistory
217 | except Exception as e:
218 | print(e)
219 |
220 | async def getpluginscanstatistics(self):
221 | try:
222 | async with aiosqlite.connect(self.db, timeout=30) as conn:
223 | cursor = await conn.execute('''
224 | SELECT domain,find_date, type, source, count(*)
225 | FROM results
226 | GROUP BY domain, find_date, type, source
227 | ''')
228 | results = await cursor.fetchall()
229 | self.scanstats = results
230 | return self.scanstats
231 | except Exception as e:
232 | print(e)
233 |
234 | async def latestscanchartdata(self, domain):
235 | try:
236 | async with aiosqlite.connect(self.db, timeout=30) as conn:
237 | self.latestscandomain["domain"] = domain
238 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="host"''', (domain,))
239 | data = await cursor.fetchone()
240 | self.latestscandomain["host"] = data[0]
241 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="email"''', (domain,))
242 | data = await cursor.fetchone()
243 | self.latestscandomain["email"] = data[0]
244 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="ip"''', (domain,))
245 | data = await cursor.fetchone()
246 | self.latestscandomain["ip"] = data[0]
247 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="vhost"''', (domain,))
248 | data = await cursor.fetchone()
249 | self.latestscandomain["vhost"] = data[0]
250 | cursor = await conn.execute('''SELECT COUNT(*) from results WHERE domain=? AND type="shodan"''', (domain,))
251 | data = await cursor.fetchone()
252 | self.latestscandomain["shodan"] = data[0]
253 | cursor = await conn.execute('''SELECT MAX(find_date) FROM results WHERE domain=?''', (domain,))
254 | data = await cursor.fetchone()
255 | self.latestscandomain["latestdate"] = data[0]
256 | latestdate = data[0]
257 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="host"''', (domain, latestdate,))
258 | scandetailshost = await cursor.fetchall()
259 | self.latestscandomain["scandetailshost"] = scandetailshost
260 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="email"''', (domain, latestdate,))
261 | scandetailsemail = await cursor.fetchall()
262 | self.latestscandomain["scandetailsemail"] = scandetailsemail
263 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="ip"''', (domain, latestdate,))
264 | scandetailsip = await cursor.fetchall()
265 | self.latestscandomain["scandetailsip"] = scandetailsip
266 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="vhost"''', (domain, latestdate,))
267 | scandetailsvhost = await cursor.fetchall()
268 | self.latestscandomain["scandetailsvhost"] = scandetailsvhost
269 | cursor = await conn.execute('''SELECT * FROM results WHERE domain=? AND find_date=? AND type="shodan"''', (domain, latestdate,))
270 | scandetailsshodan = await cursor.fetchall()
271 | self.latestscandomain["scandetailsshodan"] = scandetailsshodan
272 | return self.latestscandomain
273 | except Exception as e:
274 | print(e)
275 |
--------------------------------------------------------------------------------
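A minimal sketch of StashManager in isolation; the domain, resource, and source values are placeholders.

import asyncio
from theHarvester.lib.stash import StashManager

async def demo() -> None:
    stash = StashManager()
    await stash.do_init()  # create the results table on first run
    await stash.store('example.com', 'www.example.com', 'host', 'bingsearch')
    print(await stash.getscanboarddata())                 # overall counts per result type
    print(await stash.getlatestscanresults('example.com'))

asyncio.run(demo())
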
/theHarvester/parsers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParrotSec/theharvester/b4b78bf900352ca5dfa5159a0f7c080e703def4e/theHarvester/parsers/__init__.py
--------------------------------------------------------------------------------
/theHarvester/parsers/intelxparser.py:
--------------------------------------------------------------------------------
1 | class Parser:
2 |
3 | def __init__(self):
4 | self.emails = set()
5 | self.hosts = set()
6 |
7 | async def parse_dictionaries(self, results: dict) -> tuple:
8 | """
9 | Parse method to parse json results
10 | :param results: Dictionary containing a list of dictionaries known as selectors
11 | :return: tuple of emails and hosts
12 | """
13 | if results is not None:
14 | for dictionary in results["selectors"]:
15 | field = dictionary['selectorvalue']
16 | if '@' in field:
17 | self.emails.add(field)
18 | else:
19 | field = str(field)
20 | if 'http' in field or 'https' in field:
21 | if field[:5] == 'https':
22 | field = field[8:]
23 | else:
24 | field = field[7:]
25 | self.hosts.add(field.replace(')', '').replace(',', ''))
26 | return self.emails, self.hosts
27 | return None, None
28 |
--------------------------------------------------------------------------------
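A minimal sketch of the parser fed a hand-built selectors dictionary; the selector values are placeholders.

import asyncio
from theHarvester.parsers.intelxparser import Parser

results = {'selectors': [{'selectorvalue': 'alice@example.com'},
                         {'selectorvalue': 'https://sub.example.com/page'}]}

emails, hosts = asyncio.run(Parser().parse_dictionaries(results))
print(emails)  # {'alice@example.com'}
print(hosts)   # {'sub.example.com/page'}
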
/theHarvester/parsers/myparser.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
4 | class Parser:
5 |
6 | def __init__(self, results, word):
7 | self.results = results
8 | self.word = word
9 | self.temp = []
10 |
11 | async def genericClean(self):
12 |         self.results = self.results.replace('<em>', '').replace('<b>', '').replace('</b>', '').replace('</em>', '') \
13 |             .replace('%3a', '').replace('<strong>', '').replace('</strong>', '') \
14 |             .replace('<wbr>', '').replace('</wbr>', '')
15 |
16 | for search in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C', '%2f', '/', '\\'):
17 | self.results = self.results.replace(search, ' ')
18 |
19 | async def urlClean(self):
20 |         self.results = self.results.replace('<em>', '').replace('</em>', '').replace('%2f', '').replace('%3a', '')
21 | for search in ('<', '>', ':', '=', ';', '&', '%3A', '%3D', '%3C'):
22 | self.results = self.results.replace(search, ' ')
23 |
24 | async def emails(self):
25 | await self.genericClean()
26 | # Local part is required, charset is flexible.
27 | # https://tools.ietf.org/html/rfc6531 (removed * and () as they provide FP mostly)
28 | reg_emails = re.compile(r'[a-zA-Z0-9.\-_+#~!$&\',;=:]+' + '@' + '[a-zA-Z0-9.-]*' + self.word.replace('www.', ''))
29 | self.temp = reg_emails.findall(self.results)
30 | emails = await self.unique()
31 | true_emails = {str(email)[1:].lower().strip() if len(str(email)) > 1 and str(email)[0] == '.'
32 | else len(str(email)) > 1 and str(email).lower().strip() for email in emails}
33 | # if email starts with dot shift email string and make sure all emails are lowercase
34 | return true_emails
35 |
36 | async def fileurls(self, file):
37 | urls = []
38 |         reg_urls = re.compile('<a href="(.*?)"')
60 | temp = reg_hosts.findall(self.results)
61 | for iteration in temp:
62 | if iteration.count(':'):
63 | res = iteration.split(':')[1].split('/')[2]
64 | else:
65 | res = iteration.split('/')[0]
66 | self.temp.append(res)
67 | hostnames = await self.unique()
68 | return hostnames
69 |
70 | async def set(self):
71 |         reg_sets = re.compile(r'>[a-zA-Z\d]*</a></font>')
72 | self.temp = reg_sets.findall(self.results)
73 | sets = []
74 | for iteration in self.temp:
75 | delete = iteration.replace('>', '')
76 |             delete = delete.replace('</a</font', '')
77 |             sets.append(delete)
78 |         return sets
85 |     async def unique(self) -> list:
86 |         return list(set(self.temp))
87 |
--------------------------------------------------------------------------------
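A minimal sketch of the parser against a captured page, using only the constructor and the emails() coroutine shown above; the page snippet and domain are placeholders.

import asyncio
from theHarvester.parsers.myparser import Parser

page = 'contact admin@example.com or visit support.example.com'
parser = Parser(page, 'example.com')
print(asyncio.run(parser.emails()))  # e.g. {'admin@example.com'}
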
/theHarvester/parsers/securitytrailsparser.py:
--------------------------------------------------------------------------------
1 | from typing import Union, Tuple, List
2 |
3 |
4 | class Parser:
5 |
6 | def __init__(self, word, text):
7 | self.word = word
8 | self.text = text
9 | self.hostnames = set()
10 | self.ips = set()
11 |
12 | async def parse_text(self) -> Union[List, Tuple]:
13 | sub_domain_flag = 0
14 | self.text = str(self.text).splitlines()
15 | # Split lines to get a list of lines.
16 | for index in range(0, len(self.text)):
17 | line = self.text[index].strip()
18 | if '"ip":' in line:
19 | # Extract IP.
20 | ip = ''
21 | for ch in line[7:]:
22 | if ch == '"':
23 | break
24 | else:
25 | ip += ch
26 | self.ips.add(ip)
27 | elif '"subdomains":' in line:
28 | # subdomains start here so set flag to 1
29 | sub_domain_flag = 1
30 | continue
31 | elif sub_domain_flag > 0:
32 | if ']' in line:
33 | sub_domain_flag = 0
34 | else:
35 | if 'www' in self.word:
36 | self.word = str(self.word).replace('www.', '').replace('www', '')
37 | # Remove www from word if entered
38 | self.hostnames.add(str(line).replace('"', '').replace(',', '') + '.' + self.word)
39 | else:
40 | continue
41 | return list(self.ips), list(self.hostnames)
42 |
--------------------------------------------------------------------------------
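A minimal sketch with a hand-built fragment shaped like the response lines parse_text() walks; the IP and subdomain values are placeholders.

import asyncio
from theHarvester.parsers.securitytrailsparser import Parser

text = '"ip": "93.184.216.34",\n"subdomains": [\n"www",\n"mail",\n]'
ips, hosts = asyncio.run(Parser('example.com', text).parse_text())
print(ips)    # ['93.184.216.34']
print(hosts)  # e.g. ['www.example.com', 'mail.example.com'] (set order may vary)
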
/theHarvester/screenshot/screenshot.py:
--------------------------------------------------------------------------------
1 | """
2 | Screenshot module that utilizes pyppeteer to asynchronously
3 | take screenshots
4 | """
5 |
6 | from pyppeteer import launch
7 | import aiohttp
8 | import asyncio
9 | import certifi
10 | from datetime import datetime
11 | import os
12 | import ssl
13 | import sys
14 |
15 |
16 | class ScreenShotter:
17 |
18 | def __init__(self, output):
19 | self.output = output
20 |         self.slash = "\\" if sys.platform.startswith('win') else '/'
21 | self.slash = "" if (self.output[-1] == "\\" or self.output[-1] == "/") else self.slash
22 |
23 | def verify_path(self):
24 | try:
25 | if not os.path.isdir(self.output):
26 | answer = input(
27 |                     '[+] The output path you have entered does not exist, would you like to create it (y/n): ')
28 | if answer.lower() == 'yes' or answer.lower() == 'y':
29 | os.mkdir(self.output)
30 | return True
31 | else:
32 | return False
33 | return True
34 | except Exception as e:
35 | print(f"An exception has occurred while attempting to verify output path's existence: {e}")
36 | return False
37 |
38 | @staticmethod
39 | async def verify_installation():
40 | # Helper function that verifies pyppeteer & chromium are installed
41 | # If chromium is not installed pyppeteer will prompt user to install it
42 | browser = await launch(headless=True, ignoreHTTPSErrors=True, args=["--no-sandbox"])
43 | await browser.close()
44 |
45 | @staticmethod
46 | def chunk_list(items, chunk_size):
47 | # Based off of: https://github.com/apache/incubator-sdap-ingester
48 | return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]
49 |
50 | @staticmethod
51 | async def visit(url):
52 | try:
53 | # print(f'attempting to visit: {url}')
54 | timeout = aiohttp.ClientTimeout(total=35)
55 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
56 | 'Chrome/83.0.4103.106 Safari/537.36'}
57 | url = f'http://{url}' if not url.startswith('http') else url
58 | url = url.replace('www.', '')
59 | sslcontext = ssl.create_default_context(cafile=certifi.where())
60 | async with aiohttp.ClientSession(timeout=timeout, headers=headers,
61 | connector=aiohttp.TCPConnector(ssl=sslcontext)) as session:
62 | async with session.get(url, verify_ssl=False) as resp:
63 | # TODO fix with origin url, should be there somewhere
64 | text = await resp.text("UTF-8")
65 | return f'http://{url}' if not url.startswith('http') else url, text
66 | except Exception as e:
67 | print(f'An exception has occurred while attempting to visit {url} : {e}')
68 | return "", ""
69 |
70 | async def take_screenshot(self, url):
71 | url = f'http://{url}' if not url.startswith('http') else url
72 | url = url.replace('www.', '')
73 | print(f'Attempting to take a screenshot of: {url}')
74 | browser = await launch(headless=True, ignoreHTTPSErrors=True, args=["--no-sandbox"])
75 | context = await browser.createIncognitoBrowserContext()
76 | # Create a new page in a pristine context.
77 | page = await context.newPage()
78 | path = fr'{self.output}{self.slash}{url.replace("http://", "").replace("https://", "")}.png'
79 | date = str(datetime.utcnow())
80 | try:
81 | # change default timeout from 30 to 35 seconds
82 | page.setDefaultNavigationTimeout(35000)
83 | await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
84 | 'Chrome/83.0.4103.106 Safari/537.36')
85 | await page.goto(url)
86 | await page.screenshot({'path': path})
87 | except Exception as e:
88 | print(f'An exception has occurred attempting to screenshot: {url} : {e}')
89 | path = ""
90 | finally:
91 | # Clean up everything whether screenshot is taken or not
92 | await asyncio.sleep(2)
93 | await page.close()
94 | await context.close()
95 | await browser.close()
96 | return date, url, path
97 |
--------------------------------------------------------------------------------
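A minimal sketch of ScreenShotter usage; the output directory and target host are placeholders, and a chromium download may be triggered on the first headless launch.

import asyncio
from theHarvester.screenshot.screenshot import ScreenShotter

shooter = ScreenShotter('screenshots')
if shooter.verify_path():
    # Confirm pyppeteer can launch a headless browser before screenshotting
    asyncio.run(ScreenShotter.verify_installation())
    date, url, path = asyncio.run(shooter.take_screenshot('example.com'))
    print(date, url, path)
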
/wordlists/dorks.txt:
--------------------------------------------------------------------------------
1 | inurl:"contact"
2 | intext:email filetype:log
3 | "Index of /mail"
4 | "admin account info" filetype:log
5 | intext:@
6 | administrator accounts/
7 | intitle:"Index of" .bash_history
8 | intitle:"index of" members OR accounts
9 | inurl:/shared/help.php
10 | inurl:public
11 | intitle:index.of inbox
12 | intitle:"Server Administration"
13 | inurl:passwd.txt
14 | robots.txt
15 | php-addressbook "This is the addressbook for *" -warning
--------------------------------------------------------------------------------
/wordlists/general/common.txt:
--------------------------------------------------------------------------------
1 | admin
2 | test
3 | hello
4 | uk
5 | login
6 | book
7 | robots.txt
8 |
--------------------------------------------------------------------------------