├── .gitignore
├── .vscode
└── settings.json
├── InfoHunter.py
├── LICENSE
├── README.md
├── api-keys example.yaml
├── api_keys example.json
├── images
├── logo1.png
└── logo2.png
├── proxies.yaml
├── requirements.txt
├── src
├── evaluacion
│   ├── __init__.py
│   └── mejoras.py
├── maigret
│   ├── .dockerignore
│   ├── .githooks
│   │   └── pre-commit
│   ├── .github
│   │   ├── FUNDING.yml
│   │   ├── ISSUE_TEMPLATE
│   │   │   ├── add-a-site.md
│   │   │   ├── bug.md
│   │   │   └── report-false-result.md
│   │   ├── dependabot.yml
│   │   └── workflows
│   │   │   ├── build-docker-image.yml
│   │   │   ├── codeql-analysis.yml
│   │   │   ├── pyinstaller.yml
│   │   │   ├── python-package.yml
│   │   │   ├── python-publish.yml
│   │   │   └── update-site-data.yml
│   ├── .gitignore
│   ├── CHANGELOG.md
│   ├── CODE_OF_CONDUCT.md
│   ├── CONTRIBUTING.md
│   ├── Dockerfile
│   ├── LICENSE
│   ├── MANIFEST.in
│   ├── Makefile
│   ├── README.md
│   ├── docs
│   │   ├── Makefile
│   │   ├── make.bat
│   │   ├── requirements.txt
│   │   └── source
│   │   │   ├── command-line-options.rst
│   │   │   ├── conf.py
│   │   │   ├── development.rst
│   │   │   ├── extracting-information-from-pages.rst
│   │   │   ├── features.rst
│   │   │   ├── index.rst
│   │   │   ├── philosophy.rst
│   │   │   ├── roadmap.rst
│   │   │   ├── settings.rst
│   │   │   ├── supported-identifier-types.rst
│   │   │   ├── tags.rst
│   │   │   └── usage-examples.rst
│   ├── maigret.py
│   ├── maigret
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   ├── __version__.py
│   │   ├── activation.py
│   │   ├── checking.py
│   │   ├── errors.py
│   │   ├── executors.py
│   │   ├── maigret.py
│   │   ├── notify.py
│   │   ├── report.py
│   │   ├── resources
│   │   │   ├── data.json
│   │   │   ├── simple_report.tpl
│   │   │   ├── simple_report_pdf.css
│   │   │   └── simple_report_pdf.tpl
│   │   ├── result.py
│   │   ├── settings.py
│   │   ├── sites.py
│   │   ├── submit.py
│   │   ├── types.py
│   │   └── utils.py
│   ├── pyinstaller
│   │   ├── maigret_standalone.py
│   │   └── requirements.txt
│   ├── pytest.ini
│   ├── requirements.txt
│   ├── setup.cfg
│   ├── setup.py
│   ├── sites.md
│   ├── snapcraft.yaml
│   ├── static
│   │   ├── chat_gitter.svg
│   │   ├── maigret.png
│   │   ├── recursive_search.md
│   │   ├── recursive_search.svg
│   │   ├── report_alexaimephotography_html_screenshot.png
│   │   ├── report_alexaimephotography_xmind_screenshot.png
│   │   ├── report_alexaimephotographycars.html
│   │   └── report_alexaimephotographycars.pdf
│   ├── tests
│   │   ├── __init__.py
│   │   ├── conftest.py
│   │   ├── db.json
│   │   ├── local.json
│   │   ├── test_activation.py
│   │   ├── test_checking.py
│   │   ├── test_cli.py
│   │   ├── test_data.py
│   │   ├── test_executors.py
│   │   ├── test_maigret.py
│   │   ├── test_notify.py
│   │   ├── test_report.py
│   │   ├── test_sites.py
│   │   └── test_utils.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── add_tags.py
│   │   ├── check_engines.py
│   │   ├── import_sites.py
│   │   ├── sites_diff.py
│   │   └── update_site_data.py
│   └── wizard.py
├── recopilacion
│   ├── __init__.py
│   ├── consultas.py
│   ├── extraccion.py
│   └── fuentes.py
├── riesgos
│   ├── __init__.py
│   └── evaluacion.py
├── sherlock
│   ├── .dockerignore
│   ├── .editorconfig
│   ├── .github
│   │   ├── ISSUE_TEMPLATE
│   │   │   ├── bug-report.md
│   │   │   ├── feature-request.md
│   │   │   ├── question.md
│   │   │   ├── reporting-false-negative.md
│   │   │   ├── reporting-false-positive.md
│   │   │   └── site-support-request.md
│   │   └── workflows
│   │   │   ├── main.yml
│   │   │   ├── nightly.yml
│   │   │   ├── pull_request.yml
│   │   │   └── update-site-list.yml
│   ├── .gitignore
│   ├── .replit
│   ├── CODE_OF_CONDUCT.md
│   ├── CONTRIBUTING.md
│   ├── Dockerfile
│   ├── LICENSE
│   ├── README.md
│   ├── docker-compose.yml
│   ├── images
│   │   └── preview.png
│   ├── removed_sites.json
│   ├── removed_sites.md
│   ├── requirements.txt
│   ├── sherlock
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   ├── notify.py
│   │   ├── resources
│   │   │   └── data.json
│   │   ├── result.py
│   │   ├── sherlock.py
│   │   ├── sites.py
│   │   └── tests
│   │   │   ├── __init__.py
│   │   │   ├── all.py
│   │   │   ├── base.py
│   │   │   └── test_multiple_usernames.py
│   ├── site_list.py
│   └── sites.md
└── theHarvester
│   ├── .dockerignore
│   ├── .flake8
│   ├── .git-blame-ignore-revs
│   ├── .gitattributes
│   ├── .github
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE
│   │   └── issue-template.md
│   ├── dependabot.yml
│   └── workflows
│   │   ├── codeql-analysis.yml
│   │   ├── dockerci.yml
│   │   └── theHarvester.yml
│   ├── .gitignore
│   ├── .isort.cfg
│   ├── .pyre_configuration
│   ├── Dockerfile
│   ├── README.md
│   ├── README
│   ├── CONTRIBUTING.md
│   ├── COPYING
│   └── LICENSES
│   ├── docker-compose.yml
│   ├── mypy.ini
│   ├── pyproject.toml
│   ├── pytest.ini
│   ├── requirements.txt
│   ├── requirements
│   ├── base.txt
│   └── dev.txt
│   ├── restfulHarvest.py
│   ├── setup.cfg
│   ├── tests
│   ├── __init__.py
│   ├── discovery
│   │   ├── __init__.py
│   │   ├── test_anubis.py
│   │   ├── test_certspotter.py
│   │   ├── test_githubcode.py
│   │   └── test_otx.py
│   └── test_myparser.py
│   ├── theHarvester-logo.png
│   ├── theHarvester-logo.webp
│   ├── theHarvester.py
│   └── theHarvester
│   ├── __init__.py
│   ├── __main__.py
│   ├── data
│   ├── proxies.yaml
│   └── wordlists
│   │   ├── dns-big.txt
│   │   ├── dns-names.txt
│   │   ├── dorks.txt
│   │   ├── general
│   │   └── common.txt
│   │   └── names_small.txt
│   ├── discovery
│   ├── __init__.py
│   ├── anubis.py
│   ├── baidusearch.py
│   ├── bevigil.py
│   ├── binaryedgesearch.py
│   ├── bingsearch.py
│   ├── bravesearch.py
│   ├── bufferoverun.py
│   ├── censysearch.py
│   ├── certspottersearch.py
│   ├── constants.py
│   ├── criminalip.py
│   ├── crtsh.py
│   ├── dnsdumpster.py
│   ├── dnssearch.py
│   ├── duckduckgosearch.py
│   ├── fullhuntsearch.py
│   ├── githubcode.py
│   ├── hackertarget.py
│   ├── huntersearch.py
│   ├── intelxsearch.py
│   ├── netlas.py
│   ├── onyphe.py
│   ├── otxsearch.py
│   ├── pentesttools.py
│   ├── projectdiscovery.py
│   ├── rapiddns.py
│   ├── rocketreach.py
│   ├── searchhunterhow.py
│   ├── securitytrailssearch.py
│   ├── shodansearch.py
│   ├── sitedossier.py
│   ├── subdomaincenter.py
│   ├── subdomainfinderc99.py
│   ├── takeover.py
│   ├── threatminer.py
│   ├── tombasearch.py
│   ├── urlscan.py
│   ├── virustotal.py
│   ├── yahoosearch.py
│   └── zoomeyesearch.py
│   ├── parsers
│   ├── __init__.py
│   ├── intelxparser.py
│   ├── myparser.py
│   └── securitytrailsparser.py
│   ├── restfulHarvest.py
│   ├── screenshot
│   ├── __init__.py
│   └── screenshot.py
│   └── theHarvester.py
└── wordlists
├── dns-big.txt
├── dns-names.txt
├── dorks.txt
├── general
└── common.txt
└── names_small.txt
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "[python]": {
3 |         "editor.defaultFormatter": "ms-python.black-formatter"
4 |     },
5 |     "python.formatting.provider": "none"
6 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/maigret/maigret/errors.py:
--------------------------------------------------------------------------------
51 |     'Мы не нашли страницу': CheckError(
52 | 'Resolving', 'MegaFon 404 page'
53 | ),
54 | 'Доступ к информационному ресурсу ограничен на основании Федерального закона': CheckError(
55 | 'Censorship', 'MGTS'
56 | ),
57 | 'Incapsula incident ID': CheckError('Bot protection', 'Incapsula'),
58 | 'Сайт заблокирован хостинг-провайдером': CheckError(
59 | 'Site-specific', 'Site is disabled (Beget)'
60 | ),
61 | }
62 |
63 | ERRORS_TYPES = {
64 | 'Captcha': 'Try to switch to another IP address or to use service cookies',
65 | 'Bot protection': 'Try to switch to another IP address',
66 | 'Censorship': 'Switch to another internet service provider',
67 | 'Request timeout': 'Try to increase timeout or to switch to another internet service provider',
68 | 'Connecting failure': 'Try to decrease number of parallel connections (e.g. -n 10)',
69 | }
70 |
71 | # TODO: checking for reason
72 | ERRORS_REASONS = {
73 | 'Login required': 'Add authorization cookies through `--cookies-jar-file` (see cookies.txt)',
74 | }
75 |
76 | TEMPORARY_ERRORS_TYPES = [
77 | 'Request timeout',
78 | 'Unknown',
79 | 'Request failed',
80 | 'Connecting failure',
81 | 'HTTP',
82 | 'Proxy',
83 | 'Interrupted',
84 | 'Connection lost',
85 | ]
86 |
87 | THRESHOLD = 3 # percent
88 |
89 |
90 | def is_important(err_data):
91 | return err_data['perc'] >= THRESHOLD
92 |
93 |
94 | def is_permanent(err_type):
95 | return err_type not in TEMPORARY_ERRORS_TYPES
96 |
97 |
98 | def detect(text):
99 | for flag, err in COMMON_ERRORS.items():
100 | if flag in text:
101 | return err
102 | return None
103 |
104 |
105 | def solution_of(err_type) -> str:
106 | return ERRORS_TYPES.get(err_type, '')
107 |
108 |
109 | def extract_and_group(search_res: QueryResultWrapper) -> List[Dict[str, Any]]:
110 | errors_counts: Dict[str, int] = {}
111 | for r in search_res.values():
112 | if r and isinstance(r, dict) and r.get('status'):
113 | if not isinstance(r['status'], QueryResult):
114 | continue
115 |
116 | err = r['status'].error
117 | if not err:
118 | continue
119 | errors_counts[err.type] = errors_counts.get(err.type, 0) + 1
120 |
121 | counts = []
122 | for err, count in sorted(errors_counts.items(), key=lambda x: x[1], reverse=True):
123 | counts.append(
124 | {
125 | 'err': err,
126 | 'count': count,
127 |                 'perc': round(100 * count / len(search_res), 2),
128 | }
129 | )
130 |
131 | return counts
132 |
--------------------------------------------------------------------------------
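
A minimal usage sketch of the error helpers above (assumes the maigret package is importable; the sample page body and values are invented):

    from maigret.errors import detect, solution_of

    page_html = "<html>Incapsula incident ID: 42</html>"  # invented sample body
    err = detect(page_html)  # returns a CheckError or None
    if err:
        print(err.type)               # "Bot protection"
        print(solution_of(err.type))  # "Try to switch to another IP address"
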
/src/maigret/maigret/resources/simple_report_pdf.css:
--------------------------------------------------------------------------------
1 | h2 {
2 | font-size: 30px;
3 | width: 100%;
4 | display:block;
5 | }
6 | h3 {
7 | font-size: 25px;
8 | width: 100%;
9 | display:block;
10 | }
11 | h4 {
12 | font-size: 20px;
13 | width: 100%;
14 | display:block;
15 | }
16 | p {
17 | margin: 0 0 5px;
18 | display: block;
19 | }
20 |
21 |
22 | table {
23 | margin-bottom: 10px;
24 | width:100%;
25 | }
26 | th {
27 | font-weight: bold;
28 | }
29 | th,td,caption {
30 | padding: 4px 10px 4px 5px;
31 | }
32 | table tr:nth-child(even) td,
33 | table tr.even td {
34 | background-color: #e5ecf9;
35 | }
36 |
37 | div {
38 | border-bottom-color: #3e3e3e;
39 | border-bottom-width: 1px;
40 | border-bottom-style: solid;
41 | }
42 | .invalid-button {
43 | position: absolute;
44 | left: 10px;
45 | }
--------------------------------------------------------------------------------
/src/maigret/maigret/result.py:
--------------------------------------------------------------------------------
1 | """Maigret Result Module
2 |
3 | This module defines various objects for recording the results of queries.
4 | """
5 | from enum import Enum
6 |
7 |
8 | class QueryStatus(Enum):
9 | """Query Status Enumeration.
10 |
11 | Describes status of query about a given username.
12 | """
13 |
14 | CLAIMED = "Claimed" # Username Detected
15 | AVAILABLE = "Available" # Username Not Detected
16 | UNKNOWN = "Unknown" # Error Occurred While Trying To Detect Username
17 | ILLEGAL = "Illegal" # Username Not Allowable For This Site
18 |
19 | def __str__(self):
20 | """Convert Object To String.
21 |
22 | Keyword Arguments:
23 | self -- This object.
24 |
25 | Return Value:
26 | Nicely formatted string to get information about this object.
27 | """
28 | return self.value
29 |
30 |
31 | class QueryResult:
32 | """Query Result Object.
33 |
34 | Describes result of query about a given username.
35 | """
36 |
37 | def __init__(
38 | self,
39 | username,
40 | site_name,
41 | site_url_user,
42 | status,
43 | ids_data=None,
44 | query_time=None,
45 | context=None,
46 | error=None,
47 |         tags=None,
48 | ):
49 | """Create Query Result Object.
50 |
51 | Contains information about a specific method of detecting usernames on
52 | a given type of web sites.
53 |
54 | Keyword Arguments:
55 | self -- This object.
56 | username -- String indicating username that query result
57 | was about.
58 | site_name -- String which identifies site.
59 | site_url_user -- String containing URL for username on site.
60 | NOTE: The site may or may not exist: this
61 | just indicates what the name would
62 | be, if it existed.
63 | status -- Enumeration of type QueryStatus() indicating
64 | the status of the query.
65 | query_time -- Time (in seconds) required to perform query.
66 | Default of None.
67 | context -- String indicating any additional context
68 | about the query. For example, if there was
69 | an error, this might indicate the type of
70 | error that occurred.
71 | Default of None.
72 | ids_data -- Extracted from website page info about other
73 | usernames and inner ids.
74 |
75 | Return Value:
76 | Nothing.
77 | """
78 |
79 | self.username = username
80 | self.site_name = site_name
81 | self.site_url_user = site_url_user
82 | self.status = status
83 | self.query_time = query_time
84 | self.context = context
85 | self.ids_data = ids_data
86 |         self.tags = tags or []  # avoid a shared mutable default argument
87 | self.error = error
88 |
89 | def json(self):
90 | return {
91 | "username": self.username,
92 | "site_name": self.site_name,
93 | "url": self.site_url_user,
94 | "status": str(self.status),
95 | "ids": self.ids_data or {},
96 | "tags": self.tags,
97 | }
98 |
99 | def is_found(self):
100 | return self.status == QueryStatus.CLAIMED
101 |
102 | def __str__(self):
103 | """Convert Object To String.
104 |
105 | Keyword Arguments:
106 | self -- This object.
107 |
108 | Return Value:
109 | Nicely formatted string to get information about this object.
110 | """
111 | status = str(self.status)
112 | if self.context is not None:
113 | # There is extra context information available about the results.
114 | # Append it to the normal response text.
115 | status += f" ({self.context})"
116 |
117 | return status
118 |
--------------------------------------------------------------------------------
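
A minimal sketch of how QueryResult is constructed and consumed (all values invented):

    from maigret.result import QueryResult, QueryStatus

    result = QueryResult(
        username="alice",
        site_name="GitHub",
        site_url_user="https://github.com/alice",
        status=QueryStatus.CLAIMED,
    )
    assert result.is_found()
    print(result)         # "Claimed"
    print(result.json())  # plain dict, ready for JSON serialization
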
/src/maigret/maigret/settings.py:
--------------------------------------------------------------------------------
1 | import os
2 | import os.path as path
3 | import json
4 | from typing import List
5 |
6 | SETTINGS_FILES_PATHS = [
7 | path.join(path.dirname(path.realpath(__file__)), "resources/settings.json"),
8 |     path.expanduser('~/.maigret/settings.json'),
9 | path.join(os.getcwd(), 'settings.json'),
10 | ]
11 |
12 |
13 | class Settings:
14 |     # main maigret settings
15 | retries_count: int
16 | sites_db_path: str
17 | timeout: int
18 | max_connections: int
19 | recursive_search: bool
20 | info_extracting: bool
21 | cookie_jar_file: str
22 | ignore_ids_list: List
23 | reports_path: str
24 | proxy_url: str
25 | tor_proxy_url: str
26 | i2p_proxy_url: str
27 | domain_search: bool
28 | scan_all_sites: bool
29 | top_sites_count: int
30 | scan_disabled_sites: bool
31 | scan_sites_list: List
32 | self_check_enabled: bool
33 | print_not_found: bool
34 | print_check_errors: bool
35 | colored_print: bool
36 | show_progressbar: bool
37 | report_sorting: str
38 | json_report_type: str
39 | txt_report: bool
40 | csv_report: bool
41 | xmind_report: bool
42 | pdf_report: bool
43 | html_report: bool
44 | graph_report: bool
45 |
46 | # submit mode settings
47 | presence_strings: list
48 | supposed_usernames: list
49 |
50 | def __init__(self):
51 | pass
52 |
53 | def load(self, paths=None):
54 | was_inited = False
55 |
56 | if not paths:
57 | paths = SETTINGS_FILES_PATHS
58 |
59 | for filename in paths:
60 | data = {}
61 |
62 | try:
63 | with open(filename, "r", encoding="utf-8") as file:
64 | data = json.load(file)
65 | except FileNotFoundError:
66 |                 # treat as a normal situation
67 | pass
68 | except Exception as error:
69 | return False, ValueError(
70 | f"Problem with parsing json contents of "
71 | f"settings file '{filename}': {str(error)}."
72 | )
73 |
74 | self.__dict__.update(data)
75 | if data:
76 | was_inited = True
77 |
78 | return (
79 | was_inited,
80 | f'None of the default settings files found: {", ".join(paths)}',
81 | )
82 |
83 | @property
84 | def json(self):
85 | return self.__dict__
86 |
--------------------------------------------------------------------------------
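
Settings.load() returns a (success, error) pair rather than raising, so a caller sketch looks roughly like this:

    from maigret.settings import Settings

    settings = Settings()
    loaded, error = settings.load()  # tries SETTINGS_FILES_PATHS in order
    if not loaded:
        print(f"Could not load settings: {error}")
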
/src/maigret/maigret/types.py:
--------------------------------------------------------------------------------
1 | from typing import Callable, List, Dict, Tuple, Any
2 |
3 |
4 | # search query
5 | QueryDraft = Tuple[Callable, List, Dict]
6 |
7 | # options dict
8 | QueryOptions = Dict[str, Any]
9 |
10 | # TODO: throw out
11 | QueryResultWrapper = Dict[str, Any]
12 |
--------------------------------------------------------------------------------
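
A QueryDraft is a deferred call: the executors (exercised in tests/test_executors.py below) unpack it as (function, args, kwargs). A minimal sketch with an invented coroutine:

    from maigret.types import QueryDraft

    async def check_site(username: str, timeout: int = 10):
        ...  # placeholder body

    draft: QueryDraft = (check_site, ["alice"], {"timeout": 5})
    func, args, kwargs = draft
    coroutine = func(*args, **kwargs)  # awaited later by an executor
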
/src/maigret/maigret/utils.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | import ast
3 | import difflib
4 | import re
5 | import random
6 | from typing import Any
7 |
8 |
9 | DEFAULT_USER_AGENTS = [
10 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
11 | ]
12 |
13 |
14 | class CaseConverter:
15 | @staticmethod
16 | def camel_to_snake(camelcased_string: str) -> str:
17 |         return re.sub(r"(?<!^)(?=[A-Z])", "_", camelcased_string).lower()
18 | 
19 |     @staticmethod
20 |     def snake_to_camel(snakecased_string: str) -> str:
21 | formatted = "".join(word.title() for word in snakecased_string.split("_"))
22 | result = formatted[0].lower() + formatted[1:]
23 | return result
24 |
25 | @staticmethod
26 | def snake_to_title(snakecased_string: str) -> str:
27 | words = snakecased_string.split("_")
28 | words[0] = words[0].title()
29 | return " ".join(words)
30 |
31 |
32 | def is_country_tag(tag: str) -> bool:
33 | """detect if tag represent a country"""
34 | return bool(re.match("^([a-zA-Z]){2}$", tag)) or tag == "global"
35 |
36 |
37 | def enrich_link_str(link: str) -> str:
38 | link = link.strip()
39 | if link.startswith("www.") or (link.startswith("http") and "//" in link):
40 |         return f'<a class="auto-link" href="{link}">{link}</a>'
41 | return link
42 |
43 |
44 | class URLMatcher:
45 | _HTTP_URL_RE_STR = "^https?://(www.|m.)?(.+)$"
46 | HTTP_URL_RE = re.compile(_HTTP_URL_RE_STR)
47 | UNSAFE_SYMBOLS = ".?"
48 |
49 | @classmethod
50 | def extract_main_part(self, url: str) -> str:
51 | match = self.HTTP_URL_RE.search(url)
52 | if match and match.group(2):
53 | return match.group(2).rstrip("/")
54 |
55 | return ""
56 |
57 | @classmethod
58 | def make_profile_url_regexp(self, url: str, username_regexp: str = ""):
59 | url_main_part = self.extract_main_part(url)
60 | for c in self.UNSAFE_SYMBOLS:
61 | url_main_part = url_main_part.replace(c, f"\\{c}")
62 | prepared_username_regexp = (username_regexp or ".+?").lstrip('^').rstrip('$')
63 |
64 | url_regexp = url_main_part.replace(
65 | "{username}", f"({prepared_username_regexp})"
66 | )
67 | regexp_str = self._HTTP_URL_RE_STR.replace("(.+)", url_regexp)
68 |
69 | return re.compile(regexp_str, re.IGNORECASE)
70 |
71 |
72 | def ascii_data_display(data: str) -> Any:
73 |     return ast.literal_eval(data)  # parses repr()-style strings like "['a', 'b']"
74 |
75 |
76 | def get_dict_ascii_tree(items, prepend="", new_line=True):
77 |     new_result = b'\xe2\x94\x9c'.decode()   # "├"
78 |     line_symbol = b'\xe2\x94\x80'.decode()  # "─"; distinct name, so the new_line argument is not shadowed
79 |     last_result = b'\xe2\x94\x94'.decode()  # "└"
80 |     skip_result = b'\xe2\x94\x82'.decode()  # "│"
81 |
82 | text = ""
83 | for num, item in enumerate(items):
84 | box_symbol = (
85 |             new_result + line_symbol if num != len(items) - 1 else last_result + line_symbol
86 | )
87 |
88 |         if isinstance(item, tuple):
89 | field_name, field_value = item
90 | if field_value.startswith("['"):
91 | is_last_item = num == len(items) - 1
92 | prepend_symbols = " " * 3 if is_last_item else f" {skip_result} "
93 | data = ascii_data_display(field_value)
94 | field_value = get_dict_ascii_tree(data, prepend_symbols)
95 | text += f"\n{prepend}{box_symbol}{field_name}: {field_value}"
96 | else:
97 | text += f"\n{prepend}{box_symbol} {item}"
98 |
99 | if not new_line:
100 | text = text[1:]
101 |
102 | return text
103 |
104 |
105 | def get_random_user_agent():
106 | return random.choice(DEFAULT_USER_AGENTS)
107 |
108 |
109 | def get_match_ratio(base_strs: list):
110 | def get_match_inner(s: str):
111 | return round(
112 | max(
113 | [
114 | difflib.SequenceMatcher(a=s.lower(), b=s2.lower()).ratio()
115 | for s2 in base_strs
116 | ]
117 | ),
118 | 2,
119 | )
120 |
121 | return get_match_inner
122 |
--------------------------------------------------------------------------------
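
A sketch of the helpers above in action (site URL and username invented; per the regexp construction in make_profile_url_regexp, the username lands in match group 2):

    from maigret.utils import CaseConverter, URLMatcher

    print(CaseConverter.camel_to_snake("checkType"))                      # check_type
    print(URLMatcher.extract_main_part("https://www.github.com/alice/"))  # github.com/alice

    regexp = URLMatcher.make_profile_url_regexp(
        "https://github.com/{username}", username_regexp="[a-zA-Z0-9-]+"
    )
    match = regexp.match("https://github.com/alice")
    print(match.group(2) if match else None)  # alice
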
/src/maigret/pyinstaller/maigret_standalone.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import asyncio
3 |
4 | import maigret
5 |
6 | if __name__ == "__main__":
7 | asyncio.run(maigret.cli())
--------------------------------------------------------------------------------
/src/maigret/pyinstaller/requirements.txt:
--------------------------------------------------------------------------------
1 | maigret @ https://github.com/soxoj/maigret/archive/refs/heads/main.zip
2 | pefile==2022.5.30
3 | psutil==5.9.5
4 | pyinstaller @ https://github.com/pyinstaller/pyinstaller/archive/develop.zip
5 | pywin32-ctypes==0.2.0
--------------------------------------------------------------------------------
/src/maigret/pytest.ini:
--------------------------------------------------------------------------------
1 | # pytest.ini
2 | [pytest]
3 | filterwarnings =
4 | error
5 | ignore::UserWarning
6 | asyncio_mode=auto
--------------------------------------------------------------------------------
/src/maigret/requirements.txt:
--------------------------------------------------------------------------------
1 | aiodns==3.0.0
2 | aiohttp==3.8.3
3 | aiohttp-socks==0.7.1
4 | arabic-reshaper==2.1.4
5 | async-timeout==4.0.2
6 | attrs==22.2.0
7 | certifi==2022.12.7
8 | chardet==5.0.0
9 | colorama==0.4.6
10 | future==0.18.3
11 | future-annotations==1.0.0
12 | html5lib==1.1
13 | idna==3.4
14 | Jinja2==3.1.2
15 | lxml==4.9.2
16 | MarkupSafe==2.1.1
17 | mock==4.0.3
18 | multidict==6.0.4
19 | pycountry==22.3.5
20 | PyPDF2==2.10.8
21 | PySocks==1.7.1
22 | python-bidi==0.4.2
23 | requests==2.28.2
24 | requests-futures==1.0.0
25 | six==1.16.0
26 | socid-extractor>=0.0.21
27 | soupsieve==2.3.2.post1
28 | stem==1.8.1
29 | torrequest==0.1.0
30 | tqdm==4.65.0
31 | typing-extensions==4.5.0
32 | webencodings==0.5.1
33 | xhtml2pdf==0.2.8
34 | XMind==1.2.0
35 | yarl==1.8.2
36 | networkx==2.6.3
37 | pyvis==0.2.1
38 | reportlab==3.6.12
39 | cloudscraper==1.2.66
40 |
--------------------------------------------------------------------------------
/src/maigret/setup.cfg:
--------------------------------------------------------------------------------
1 | [egg_info]
2 | tag_build =
3 | tag_date = 0
4 |
5 | [flake8]
6 | per-file-ignores = __init__.py:F401
7 |
8 | [mypy]
9 | ignore_missing_imports = True
--------------------------------------------------------------------------------
/src/maigret/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import (
2 | setup,
3 | find_packages,
4 | )
5 |
6 |
7 | with open('README.md') as fh:
8 | long_description = fh.read()
9 |
10 | with open('requirements.txt') as rf:
11 | requires = rf.read().splitlines()
12 |
13 | setup(name='maigret',
14 | version='0.4.4',
15 | description='Collect a dossier on a person by username from a huge number of sites',
16 | long_description=long_description,
17 | long_description_content_type="text/markdown",
18 | url='https://github.com/soxoj/maigret',
19 | install_requires=requires,
20 | entry_points={'console_scripts': ['maigret = maigret.maigret:run']},
21 | packages=find_packages(exclude=["tests*"]),
22 | include_package_data=True,
23 | author='Soxoj',
24 | author_email='soxoj@protonmail.com',
25 | license='MIT',
26 | zip_safe=False)
27 |
--------------------------------------------------------------------------------
/src/maigret/snapcraft.yaml:
--------------------------------------------------------------------------------
1 | name: maigret2
2 | adopt-info: maigret2
3 | summary: SOCMINT / Instagram
4 | description: |
5 | Test Test Test
6 |
7 | license: MIT
8 |
9 | base: core20
10 | grade: stable
11 | confinement: strict
12 | compression: lzo
13 |
14 | architectures:
15 | - build-on: amd64
16 |
17 | apps:
18 | maigret2:
19 | command: bin/maigret
20 | environment:
21 | LC_ALL: C.UTF-8
22 | plugs:
23 | - home
24 | - network
25 |
26 | parts:
27 | maigret2:
28 | plugin: python
29 | source: https://github.com/soxoj/maigret
30 | source-type: git
31 |
32 | build-packages:
33 | - python3-pip
34 | - python3-six
35 | - python3
36 |
37 | stage-packages:
38 | - python3
39 | - python3-six
40 |
41 | override-pull: |
42 | snapcraftctl pull
43 | snapcraftctl set-version "$(git describe --tags | sed 's/^v//' | cut -d "-" -f1)"
44 |
--------------------------------------------------------------------------------
/src/maigret/static/chat_gitter.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/maigret/static/maigret.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/maigret/static/maigret.png
--------------------------------------------------------------------------------
/src/maigret/static/report_alexaimephotography_html_screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/maigret/static/report_alexaimephotography_html_screenshot.png
--------------------------------------------------------------------------------
/src/maigret/static/report_alexaimephotography_xmind_screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/maigret/static/report_alexaimephotography_xmind_screenshot.png
--------------------------------------------------------------------------------
/src/maigret/static/report_alexaimephotographycars.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/maigret/static/report_alexaimephotographycars.pdf
--------------------------------------------------------------------------------
/src/maigret/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/maigret/tests/__init__.py
--------------------------------------------------------------------------------
/src/maigret/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import logging
3 | import os
4 |
5 | import pytest
6 | from _pytest.mark import Mark
7 |
8 | from maigret.sites import MaigretDatabase
9 | from maigret.maigret import setup_arguments_parser
10 | from maigret.settings import Settings
11 |
12 |
13 | CUR_PATH = os.path.dirname(os.path.realpath(__file__))
14 | JSON_FILE = os.path.join(CUR_PATH, '../maigret/resources/data.json')
15 | SETTINGS_FILE = os.path.join(CUR_PATH, '../maigret/resources/settings.json')
16 | TEST_JSON_FILE = os.path.join(CUR_PATH, 'db.json')
17 | LOCAL_TEST_JSON_FILE = os.path.join(CUR_PATH, 'local.json')
18 | empty_mark = Mark('', (), {})
19 |
20 |
21 | def by_slow_marker(item):
22 | return item.get_closest_marker('slow', default=empty_mark)
23 |
24 |
25 | def pytest_collection_modifyitems(items):
26 | items.sort(key=by_slow_marker, reverse=False)
27 |
28 |
29 | def get_test_reports_filenames():
30 |     return glob.glob('report_*', recursive=False)
31 |
32 |
33 | def remove_test_reports():
34 | reports_list = get_test_reports_filenames()
35 | for f in reports_list:
36 | os.remove(f)
37 | logging.error(f'Removed test reports {reports_list}')
38 |
39 |
40 | @pytest.fixture(scope='session')
41 | def default_db():
42 | return MaigretDatabase().load_from_file(JSON_FILE)
43 |
44 |
45 | @pytest.fixture(scope='function')
46 | def test_db():
47 | return MaigretDatabase().load_from_file(TEST_JSON_FILE)
48 |
49 |
50 | @pytest.fixture(scope='function')
51 | def local_test_db():
52 | return MaigretDatabase().load_from_file(LOCAL_TEST_JSON_FILE)
53 |
54 |
55 | @pytest.fixture(autouse=True)
56 | def reports_autoclean():
57 | remove_test_reports()
58 | yield
59 | remove_test_reports()
60 |
61 |
62 | @pytest.fixture(scope='session')
63 | def argparser():
64 | settings = Settings()
65 | settings.load([SETTINGS_FILE])
66 | return setup_arguments_parser(settings)
67 |
68 |
69 | @pytest.fixture(scope="session")
70 | def httpserver_listen_address():
71 | return ("localhost", 8989)
72 |
--------------------------------------------------------------------------------
/src/maigret/tests/db.json:
--------------------------------------------------------------------------------
1 | {
2 | "engines": {},
3 | "sites": {
4 | "GooglePlayStore": {
5 | "tags": ["global", "us"],
6 | "disabled": false,
7 | "checkType": "status_code",
8 | "alexaRank": 1,
9 | "url": "https://play.google.com/store/apps/developer?id={username}",
10 | "urlMain": "https://play.google.com/store",
11 | "usernameClaimed": "Facebook_nosuchname",
12 | "usernameUnclaimed": "noonewouldeverusethis7"
13 | },
14 | "Reddit": {
15 | "tags": ["news", "social", "us"],
16 | "checkType": "status_code",
17 | "presenseStrs": ["totalKarma"],
18 | "disabled": true,
19 | "alexaRank": 17,
20 | "url": "https://www.reddit.com/user/{username}",
21 | "urlMain": "https://www.reddit.com/",
22 | "usernameClaimed": "blue",
23 | "usernameUnclaimed": "noonewouldeverusethis7"
24 | }
25 | }
26 | }
--------------------------------------------------------------------------------
/src/maigret/tests/local.json:
--------------------------------------------------------------------------------
1 | {
2 | "engines": {},
3 | "sites": {
4 | "StatusCode": {
5 | "checkType": "status_code",
6 | "url": "http://localhost:8989/url?id={username}",
7 | "urlMain": "http://localhost:8989/",
8 | "usernameClaimed": "claimed",
9 | "usernameUnclaimed": "unclaimed"
10 | },
11 | "Message": {
12 | "checkType": "message",
13 | "url": "http://localhost:8989/url?id={username}",
14 | "urlMain": "http://localhost:8989/",
15 | "presenseStrs": ["user", "profile"],
16 | "absenseStrs": ["not found", "404"],
17 | "usernameClaimed": "claimed",
18 | "usernameUnclaimed": "unclaimed"
19 | }
20 | }
21 | }
--------------------------------------------------------------------------------
/src/maigret/tests/test_activation.py:
--------------------------------------------------------------------------------
1 | """Maigret activation test functions"""
2 | import json
3 |
4 | import aiohttp
5 | import pytest
6 | from mock import Mock
7 |
8 | from maigret.activation import ParsingActivator, import_aiohttp_cookies
9 |
10 | COOKIES_TXT = """# HTTP Cookie File downloaded with cookies.txt by Genuinous @genuinous
11 | # This file can be used by wget, curl, aria2c and other standard compliant tools.
12 | # Usage Examples:
13 | # 1) wget -x --load-cookies cookies.txt "https://xss.is/search/"
14 | # 2) curl --cookie cookies.txt "https://xss.is/search/"
15 | # 3) aria2c --load-cookies cookies.txt "https://xss.is/search/"
16 | #
17 | xss.is FALSE / TRUE 0 xf_csrf test
18 | xss.is FALSE / TRUE 1642709308 xf_user tset
19 | .xss.is TRUE / FALSE 0 muchacho_cache test
20 | .xss.is TRUE / FALSE 1924905600 132_evc test
21 | httpbin.org FALSE / FALSE 0 a b
22 | """
23 |
24 |
25 | @pytest.mark.skip(reason="periodically fails")
26 | @pytest.mark.slow
27 | def test_twitter_activation(default_db):
28 | twitter_site = default_db.sites_dict['Twitter']
29 | token1 = twitter_site.headers['x-guest-token']
30 |
31 | ParsingActivator.twitter(twitter_site, Mock())
32 | token2 = twitter_site.headers['x-guest-token']
33 |
34 | assert token1 != token2
35 |
36 |
37 | @pytest.mark.asyncio
38 | async def test_import_aiohttp_cookies():
39 | cookies_filename = 'cookies_test.txt'
40 | with open(cookies_filename, 'w') as f:
41 | f.write(COOKIES_TXT)
42 |
43 | cookie_jar = import_aiohttp_cookies(cookies_filename)
44 | assert list(cookie_jar._cookies.keys()) == ['xss.is', 'httpbin.org']
45 |
46 | url = 'https://httpbin.org/cookies'
47 | connector = aiohttp.TCPConnector(ssl=False)
48 | session = aiohttp.ClientSession(
49 | connector=connector, trust_env=True, cookie_jar=cookie_jar
50 | )
51 |
52 | response = await session.get(url=url)
53 | result = json.loads(await response.content.read())
54 | await session.close()
55 |
56 | assert result == {'cookies': {'a': 'b'}}
57 |
--------------------------------------------------------------------------------
/src/maigret/tests/test_checking.py:
--------------------------------------------------------------------------------
1 | from mock import Mock
2 | import pytest
3 |
4 | from maigret import search
5 |
6 |
7 | def site_result_except(server, username, **kwargs):
8 | query = f'id={username}'
9 | server.expect_request('/url', query_string=query).respond_with_data(**kwargs)
10 |
11 |
12 | @pytest.mark.slow
13 | @pytest.mark.asyncio
14 | async def test_checking_by_status_code(httpserver, local_test_db):
15 | sites_dict = local_test_db.sites_dict
16 |
17 | site_result_except(httpserver, 'claimed', status=200)
18 | site_result_except(httpserver, 'unclaimed', status=404)
19 |
20 | result = await search('claimed', site_dict=sites_dict, logger=Mock())
21 | assert result['StatusCode']['status'].is_found() is True
22 |
23 | result = await search('unclaimed', site_dict=sites_dict, logger=Mock())
24 | assert result['StatusCode']['status'].is_found() is False
25 |
26 |
27 | @pytest.mark.slow
28 | @pytest.mark.asyncio
29 | async def test_checking_by_message_positive_full(httpserver, local_test_db):
30 | sites_dict = local_test_db.sites_dict
31 |
32 | site_result_except(httpserver, 'claimed', response_data="user profile")
33 | site_result_except(httpserver, 'unclaimed', response_data="404 not found")
34 |
35 | result = await search('claimed', site_dict=sites_dict, logger=Mock())
36 | assert result['Message']['status'].is_found() is True
37 |
38 | result = await search('unclaimed', site_dict=sites_dict, logger=Mock())
39 | assert result['Message']['status'].is_found() is False
40 |
41 |
42 | @pytest.mark.slow
43 | @pytest.mark.asyncio
44 | async def test_checking_by_message_positive_part(httpserver, local_test_db):
45 | sites_dict = local_test_db.sites_dict
46 |
47 | site_result_except(httpserver, 'claimed', response_data="profile")
48 | site_result_except(httpserver, 'unclaimed', response_data="404")
49 |
50 | result = await search('claimed', site_dict=sites_dict, logger=Mock())
51 | assert result['Message']['status'].is_found() is True
52 |
53 | result = await search('unclaimed', site_dict=sites_dict, logger=Mock())
54 | assert result['Message']['status'].is_found() is False
55 |
56 |
57 | @pytest.mark.slow
58 | @pytest.mark.asyncio
59 | async def test_checking_by_message_negative(httpserver, local_test_db):
60 | sites_dict = local_test_db.sites_dict
61 |
62 | site_result_except(httpserver, 'claimed', response_data="")
63 | site_result_except(httpserver, 'unclaimed', response_data="user 404")
64 |
65 | result = await search('claimed', site_dict=sites_dict, logger=Mock())
66 | assert result['Message']['status'].is_found() is False
67 |
68 | result = await search('unclaimed', site_dict=sites_dict, logger=Mock())
69 | assert result['Message']['status'].is_found() is True
70 |
--------------------------------------------------------------------------------
/src/maigret/tests/test_cli.py:
--------------------------------------------------------------------------------
1 | """Maigret command-line arguments parsing tests"""
2 | from argparse import Namespace
3 | from typing import Dict, Any
4 |
5 | DEFAULT_ARGS: Dict[str, Any] = {
6 | 'all_sites': False,
7 | 'connections': 100,
8 | 'cookie_file': None,
9 | 'csv': False,
10 | 'db_file': 'resources/data.json',
11 | 'debug': False,
12 | 'disable_extracting': False,
13 | 'disable_recursive_search': False,
14 | 'folderoutput': 'reports',
15 | 'html': False,
16 | 'graph': False,
17 | 'id_type': 'username',
18 | 'ignore_ids_list': [],
19 | 'info': False,
20 | 'json': '',
21 | 'new_site_to_submit': False,
22 | 'no_color': False,
23 | 'no_progressbar': False,
24 | 'parse_url': '',
25 | 'pdf': False,
26 | 'print_check_errors': False,
27 | 'print_not_found': False,
28 | 'proxy': None,
29 | 'reports_sorting': 'default',
30 | 'retries': 1,
31 | 'self_check': False,
32 | 'site_list': [],
33 | 'stats': False,
34 | 'tags': '',
35 | 'timeout': 30,
36 | 'tor_proxy': 'socks5://127.0.0.1:9050',
37 | 'i2p_proxy': 'http://127.0.0.1:4444',
38 | 'top_sites': 500,
39 | 'txt': False,
40 | 'use_disabled_sites': False,
41 | 'username': [],
42 | 'verbose': False,
43 | 'with_domains': False,
44 | 'xmind': False,
45 | }
46 |
47 |
48 | def test_args_search_mode(argparser):
49 | args = argparser.parse_args('username'.split())
50 |
51 | assert args.username == ['username']
52 |
53 | want_args = dict(DEFAULT_ARGS)
54 | want_args.update({'username': ['username']})
55 |
56 | assert args == Namespace(**want_args)
57 |
58 |
59 | def test_args_search_mode_several_usernames(argparser):
60 | args = argparser.parse_args('username1 username2'.split())
61 |
62 | assert args.username == ['username1', 'username2']
63 |
64 | want_args = dict(DEFAULT_ARGS)
65 | want_args.update({'username': ['username1', 'username2']})
66 |
67 | assert args == Namespace(**want_args)
68 |
69 |
70 | def test_args_self_check_mode(argparser):
71 | args = argparser.parse_args('--self-check --site GitHub'.split())
72 |
73 | want_args = dict(DEFAULT_ARGS)
74 | want_args.update(
75 | {
76 | 'self_check': True,
77 | 'site_list': ['GitHub'],
78 | 'username': [],
79 | }
80 | )
81 |
82 | assert args == Namespace(**want_args)
83 |
84 |
85 | def test_args_multiple_sites(argparser):
86 | args = argparser.parse_args(
87 | '--site GitHub VK --site PornHub --site Taringa,Steam'.split()
88 | )
89 |
90 | want_args = dict(DEFAULT_ARGS)
91 | want_args.update(
92 | {
93 | 'site_list': ['GitHub', 'PornHub', 'Taringa,Steam'],
94 | 'username': ['VK'],
95 | }
96 | )
97 |
98 | assert args == Namespace(**want_args)
99 |
--------------------------------------------------------------------------------
/src/maigret/tests/test_data.py:
--------------------------------------------------------------------------------
1 | """Maigret data test functions"""
2 |
3 | from maigret.utils import is_country_tag
4 |
5 |
6 | def test_tags_validity(default_db):
7 | unknown_tags = set()
8 |
9 | tags = default_db._tags
10 |
11 | for site in default_db.sites:
12 | for tag in filter(lambda x: not is_country_tag(x), site.tags):
13 | if tag not in tags:
14 | unknown_tags.add(tag)
15 |
16 | assert unknown_tags == set()
17 |
--------------------------------------------------------------------------------
/src/maigret/tests/test_executors.py:
--------------------------------------------------------------------------------
1 | """Maigret checking logic test functions"""
2 | import pytest
3 | import asyncio
4 | import logging
5 | from maigret.executors import (
6 | AsyncioSimpleExecutor,
7 | AsyncioProgressbarExecutor,
8 | AsyncioProgressbarSemaphoreExecutor,
9 | AsyncioProgressbarQueueExecutor,
10 | )
11 |
12 | logger = logging.getLogger(__name__)
13 |
14 |
15 | async def func(n):
16 | await asyncio.sleep(0.1 * (n % 3))
17 | return n
18 |
19 |
20 | @pytest.mark.asyncio
21 | async def test_simple_asyncio_executor():
22 | tasks = [(func, [n], {}) for n in range(10)]
23 | executor = AsyncioSimpleExecutor(logger=logger)
24 | assert await executor.run(tasks) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
25 | assert executor.execution_time > 0.2
26 | assert executor.execution_time < 0.3
27 |
28 |
29 | @pytest.mark.asyncio
30 | async def test_asyncio_progressbar_executor():
31 | tasks = [(func, [n], {}) for n in range(10)]
32 |
33 | executor = AsyncioProgressbarExecutor(logger=logger)
34 | # no guarantees for the results order
35 | assert sorted(await executor.run(tasks)) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
36 | assert executor.execution_time > 0.2
37 | assert executor.execution_time < 0.3
38 |
39 |
40 | @pytest.mark.asyncio
41 | async def test_asyncio_progressbar_semaphore_executor():
42 | tasks = [(func, [n], {}) for n in range(10)]
43 |
44 | executor = AsyncioProgressbarSemaphoreExecutor(logger=logger, in_parallel=5)
45 | # no guarantees for the results order
46 | assert sorted(await executor.run(tasks)) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
47 | assert executor.execution_time > 0.2
48 | assert executor.execution_time < 0.4
49 |
50 |
51 | @pytest.mark.asyncio
52 | async def test_asyncio_progressbar_queue_executor():
53 | tasks = [(func, [n], {}) for n in range(10)]
54 |
55 | executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=2)
56 | assert await executor.run(tasks) == [0, 1, 3, 2, 4, 6, 7, 5, 9, 8]
57 | assert executor.execution_time > 0.5
58 | assert executor.execution_time < 0.6
59 |
60 | executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=3)
61 | assert await executor.run(tasks) == [0, 3, 1, 4, 6, 2, 7, 9, 5, 8]
62 | assert executor.execution_time > 0.4
63 | assert executor.execution_time < 0.5
64 |
65 | executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=5)
66 | assert await executor.run(tasks) in (
67 | [0, 3, 6, 1, 4, 7, 9, 2, 5, 8],
68 | [0, 3, 6, 1, 4, 9, 7, 2, 5, 8],
69 | )
70 | assert executor.execution_time > 0.3
71 | assert executor.execution_time < 0.4
72 |
73 | executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=10)
74 | assert await executor.run(tasks) == [0, 3, 6, 9, 1, 4, 7, 2, 5, 8]
75 | assert executor.execution_time > 0.2
76 | assert executor.execution_time < 0.3
77 |
--------------------------------------------------------------------------------
/src/maigret/tests/test_notify.py:
--------------------------------------------------------------------------------
1 | from maigret.errors import CheckError
2 | from maigret.notify import QueryNotifyPrint
3 | from maigret.result import QueryStatus, QueryResult
4 |
5 |
6 | def test_notify_illegal():
7 | n = QueryNotifyPrint(color=False)
8 |
9 | assert (
10 | n.update(
11 | QueryResult(
12 | username="test",
13 | status=QueryStatus.ILLEGAL,
14 | site_name="TEST_SITE",
15 | site_url_user="http://example.com/test",
16 | )
17 | )
18 | == "[-] TEST_SITE: Illegal Username Format For This Site!"
19 | )
20 |
21 |
22 | def test_notify_claimed():
23 | n = QueryNotifyPrint(color=False)
24 |
25 | assert (
26 | n.update(
27 | QueryResult(
28 | username="test",
29 | status=QueryStatus.CLAIMED,
30 | site_name="TEST_SITE",
31 | site_url_user="http://example.com/test",
32 | )
33 | )
34 | == "[+] TEST_SITE: http://example.com/test"
35 | )
36 |
37 |
38 | def test_notify_available():
39 | n = QueryNotifyPrint(color=False)
40 |
41 | assert (
42 | n.update(
43 | QueryResult(
44 | username="test",
45 | status=QueryStatus.AVAILABLE,
46 | site_name="TEST_SITE",
47 | site_url_user="http://example.com/test",
48 | )
49 | )
50 | == "[-] TEST_SITE: Not found!"
51 | )
52 |
53 |
54 | def test_notify_unknown():
55 | n = QueryNotifyPrint(color=False)
56 | result = QueryResult(
57 | username="test",
58 | status=QueryStatus.UNKNOWN,
59 | site_name="TEST_SITE",
60 | site_url_user="http://example.com/test",
61 | )
62 | result.error = CheckError('Type', 'Reason')
63 |
64 | assert n.update(result) == "[?] TEST_SITE: Type error: Reason"
65 |
--------------------------------------------------------------------------------
/src/maigret/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/maigret/utils/__init__.py
--------------------------------------------------------------------------------
/src/maigret/utils/add_tags.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import random
3 | from argparse import ArgumentParser, RawDescriptionHelpFormatter
4 |
5 | from maigret.maigret import MaigretDatabase
6 | from maigret.submit import Submitter
7 |
8 |
9 | def update_tags(site):
10 | tags = []
11 | if not site.tags:
12 | print(f'Site {site.name} doesn\'t have tags')
13 | else:
14 | tags = site.tags
15 | print(f'Site {site.name} tags: ' + ', '.join(tags))
16 |
17 | print(f'URL: {site.url_main}')
18 |
19 | new_tags = set(input('Enter new tags: ').split(', '))
20 | if "disabled" in new_tags:
21 | new_tags.remove("disabled")
22 | site.disabled = True
23 |
24 | print(f'Old alexa rank: {site.alexa_rank}')
25 | rank = Submitter.get_alexa_rank(site.url_main)
26 | if rank:
27 | print(f'New alexa rank: {rank}')
28 | site.alexa_rank = rank
29 |
30 | site.tags = [x for x in list(new_tags) if x]
31 |
32 |
33 | if __name__ == '__main__':
34 | parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter
35 | )
36 | parser.add_argument("--base","-b", metavar="BASE_FILE",
37 | dest="base_file", default="maigret/resources/data.json",
38 | help="JSON file with sites data to update.")
39 | parser.add_argument("--name", help="Name of site to check")
40 |
41 | pool = list()
42 |
43 | args = parser.parse_args()
44 |
45 | db = MaigretDatabase()
46 |     db.load_from_file(args.base_file)
47 |
48 | while True:
49 | if args.name:
50 | sites = list(db.ranked_sites_dict(names=[args.name]).values())
51 | site = random.choice(sites)
52 | else:
53 | site = random.choice(db.sites)
54 |
55 | if site.engine == 'uCoz':
56 | continue
57 |
58 | # if not 'in' in site.tags:
59 | # continue
60 |
61 | update_tags(site)
62 |
63 | db.save_to_file(args.base_file)
--------------------------------------------------------------------------------
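
Judging from the argparse options above, a typical invocation (paths and site name are examples) is run from the maigret checkout root:

    python3 utils/add_tags.py --base maigret/resources/data.json --name GitHub
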
/src/maigret/utils/sites_diff.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import difflib
3 | import requests
4 |
5 |
6 | a = requests.get(sys.argv[1]).text
7 | b = requests.get(sys.argv[2]).text
8 |
9 |
10 | tokens_a = set(a.split('"'))
11 | tokens_b = set(b.split('"'))
12 |
13 | a_minus_b = tokens_a.difference(tokens_b)
14 | b_minus_a = tokens_b.difference(tokens_a)
15 |
16 | print(a_minus_b)
17 | print(b_minus_a)
18 |
19 | print(len(a_minus_b))
20 | print(len(b_minus_a))
21 |
22 | desired_strings = ["username", "not found", "пользователь", "profile", "lastname", "firstname", "biography",
23 | "birthday", "репутация", "информация", "e-mail"]
24 |
25 |
26 | def get_match_ratio(x):
27 | return round(max([
28 | difflib.SequenceMatcher(a=x.lower(), b=y).ratio()
29 | for y in desired_strings
30 | ]), 2)
31 |
32 |
33 | RATIO = 0.6
34 |
35 | print(sorted(a_minus_b, key=get_match_ratio, reverse=True)[:10])
36 | print(sorted(b_minus_a, key=get_match_ratio, reverse=True)[:10])
--------------------------------------------------------------------------------
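
sites_diff.py reads two URLs from sys.argv, so a likely invocation (URLs invented) compares the page of an existing profile against the page of a missing one:

    python3 utils/sites_diff.py https://example.com/user/alice https://example.com/user/noonewouldeverusethis7
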
/src/maigret/wizard.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import asyncio
3 | import logging
4 | import maigret
5 |
6 |
7 | # top popular sites from the Maigret database
8 | TOP_SITES_COUNT = 300
9 | # Maigret HTTP requests timeout
10 | TIMEOUT = 10
11 | # max parallel requests
12 | MAX_CONNECTIONS = 50
13 |
14 |
15 | if __name__ == '__main__':
16 | # setup logging and asyncio
17 | logger = logging.getLogger('maigret')
18 | logger.setLevel(logging.WARNING)
19 | loop = asyncio.get_event_loop()
20 |
21 | # setup Maigret
22 | db = maigret.MaigretDatabase().load_from_file('./maigret/resources/data.json')
23 | # also can be downloaded from web
24 | # db = MaigretDatabase().load_from_url(MAIGRET_DB_URL)
25 |
26 | # user input
27 | username = input('Enter username to search: ')
28 |
29 | sites_count_raw = input(
30 | f'Select the number of sites to search ({TOP_SITES_COUNT} for default, {len(db.sites_dict)} max): '
31 | )
32 |     sites_count = int(sites_count_raw or TOP_SITES_COUNT)
33 |
34 | sites = db.ranked_sites_dict(top=sites_count)
35 |
36 | show_progressbar_raw = input('Do you want to show a progressbar? [Yn] ')
37 | show_progressbar = show_progressbar_raw.lower() != 'n'
38 |
39 | extract_info_raw = input(
40 | 'Do you want to extract additional info from accounts\' pages? [Yn] '
41 | )
42 | extract_info = extract_info_raw.lower() != 'n'
43 |
44 | use_notifier_raw = input(
45 | 'Do you want to use notifier for displaying results while searching? [Yn] '
46 | )
47 | use_notifier = use_notifier_raw.lower() != 'n'
48 |
49 | notifier = None
50 | if use_notifier:
51 | notifier = maigret.Notifier(print_found_only=True, skip_check_errors=True)
52 |
53 | # search!
54 | search_func = maigret.search(
55 | username=username,
56 | site_dict=sites,
57 | timeout=TIMEOUT,
58 | logger=logger,
59 | max_connections=MAX_CONNECTIONS,
60 | query_notify=notifier,
61 | no_progressbar=(not show_progressbar),
62 | is_parsing_enabled=extract_info,
63 | )
64 |
65 | results = loop.run_until_complete(search_func)
66 |
67 | input('Search completed. Press any key to show results.')
68 |
69 | for sitename, data in results.items():
70 | is_found = data['status'].is_found()
71 | print(f'{sitename} - {"Found!" if is_found else "Not found"}')
72 |
--------------------------------------------------------------------------------
/src/recopilacion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/recopilacion/__init__.py
--------------------------------------------------------------------------------
/src/recopilacion/extraccion.py:
--------------------------------------------------------------------------------
1 | def procesar_resultados():
2 | pass
--------------------------------------------------------------------------------
/src/riesgos/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/riesgos/__init__.py
--------------------------------------------------------------------------------
/src/sherlock/.dockerignore:
--------------------------------------------------------------------------------
1 | .git/
2 | .vscode/
3 | screenshot/
4 | tests/
5 | *.txt
6 | !/requirements.txt
7 | venv/
8 |
9 |
--------------------------------------------------------------------------------
/src/sherlock/.editorconfig:
--------------------------------------------------------------------------------
1 | root = true
2 |
3 | [*]
4 | indent_style = space
5 | indent_size = 2
6 | end_of_line = lf
7 | charset = utf-8
8 | trim_trailing_whitespace = true
9 | insert_final_newline = true
10 | curly_bracket_next_line = false
11 | spaces_around_operators = true
12 |
13 | [*.{markdown,md}]
14 | trim_trailing_whitespace = false
15 |
16 | [*.py]
17 | indent_size = 4
18 | quote_type = double
19 |
--------------------------------------------------------------------------------
/src/sherlock/.github/ISSUE_TEMPLATE/bug-report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Report a bug in Sherlock's functionality
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 |
18 |
19 |
20 | ## Checklist
21 |
25 |
26 | - [ ] I'm reporting a bug in Sherlock's functionality
27 | - [ ] The bug I'm reporting is not a false positive or a false negative
28 | - [ ] I've verified that I'm running the latest version of Sherlock
29 | - [ ] I've checked for similar bug reports including closed ones
30 | - [ ] I've checked for pull requests that attempt to fix this bug
31 |
32 | ## Description
33 |
37 |
38 | WRITE DESCRIPTION HERE
39 |
--------------------------------------------------------------------------------
/src/sherlock/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Request a new functionality for Sherlock
4 | title: ''
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 |
18 |
19 | ## Checklist
20 |
24 | - [ ] I'm reporting a feature request
25 | - [ ] I've checked for similar feature requests including closed ones
26 |
27 | ## Description
28 |
31 |
32 | WRITE DESCRIPTION HERE
33 |
--------------------------------------------------------------------------------
/src/sherlock/.github/ISSUE_TEMPLATE/question.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Question
3 | about: Ask us a question
4 | title: ''
5 | labels: question
6 | assignees: ''
7 |
8 | ---
9 |
10 |
18 |
19 | ## Checklist
20 |
24 | - [ ] I'm asking a question regarding Sherlock
25 | - [ ] My question is not a tech support question.
26 |
27 | **We are not your tech support**.
28 | If you have questions related to `pip`, `git`, or something that is not related to Sherlock, please ask them on [Stack Overflow](https://stackoverflow.com/) or [r/learnpython](https://www.reddit.com/r/learnpython/)
29 |
30 |
31 | ## Question
32 |
33 | ASK YOUR QUESTION HERE
34 |
--------------------------------------------------------------------------------
/src/sherlock/.github/ISSUE_TEMPLATE/reporting-false-negative.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Reporting false negative
3 | about: Reporting a site that is returning false negatives
4 | title: ''
5 | labels: false negative
6 | assignees: ''
7 |
8 | ---
9 |
10 |
18 |
19 | ## Checklist
20 |
24 | - [ ] I'm reporting a website that is returning **false negative** results
25 | - [ ] I've checked for similar site support requests including closed ones
26 | - [ ] I've checked for pull requests attempting to fix this false negative
27 | - [ ] I'm only reporting **one** site (create a separate issue for each site)
28 |
29 | ## Description
30 |
33 |
34 | WRITE DESCRIPTION HERE
35 |
--------------------------------------------------------------------------------
/src/sherlock/.github/ISSUE_TEMPLATE/reporting-false-positive.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Reporting false positive
3 | about: Reporting a site that is returning false positives
4 | title: ''
5 | labels: false positive
6 | assignees: ''
7 |
8 | ---
9 |
10 |
18 |
19 | ## Checklist
20 |
24 | - [ ] I'm reporting a website that is returning **false positive** results
25 | - [ ] I've checked for similar site support requests including closed ones
26 | - [ ] I've checked for pull requests attempting to fix this false positive
27 | - [ ] I'm only reporting **one** site (create a separate issue for each site)
28 |
29 | ## Description
30 |
33 |
34 | WRITE DESCRIPTION HERE
35 |
--------------------------------------------------------------------------------
/src/sherlock/.github/ISSUE_TEMPLATE/site-support-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Site support request
3 | about: Request support for a new site
4 | title: ''
5 | labels: site support request
6 | assignees: ''
7 |
8 | ---
9 |
10 |
18 |
19 | ## Checklist
20 |
24 |
25 | - [ ] I'm requesting support for a new site
26 | - [ ] I've checked for similar site support requests including closed ones
27 | - [ ] I've checked that the site I am requesting has not been removed in the past and is not documented in [removed_sites.md](https://github.com/sherlock-project/sherlock/blob/master/removed_sites.md)
28 | - [ ] The site I am requesting support for is not a pornographic website
29 | - [ ] I'm only requesting support of **one** website (create a separate issue for each site)
30 |
31 | ## Description
32 |
36 |
37 | URL:
38 |
--------------------------------------------------------------------------------
/src/sherlock/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 |
7 | jobs:
8 | build:
9 | runs-on: ubuntu-latest
10 | strategy:
11 | matrix:
12 | python-version: [3.7, 3.8, 3.9, "3.10", 3.11]
13 |
14 | steps:
15 | - uses: actions/checkout@v3
16 | - name: Set up Python ${{ matrix.python-version }}
17 | uses: actions/setup-python@v4
18 | with:
19 | python-version: ${{ matrix.python-version }}
20 | - name: Install Dependencies
21 | run: |
22 | python -m pip install --upgrade pip
23 | pip install ruff flake8 pytest
24 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
25 | - name: Lint with ruff
26 | run: |
27 | # stop the build if there are Python syntax errors or undefined names
28 | ruff . --format=github --select=E9,F63,F7,F82
29 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
30 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
31 | - name: Sherlock Site Detect Tests
32 | run: |
33 | cd sherlock && python -m unittest tests.all.SherlockDetectTests --verbose
34 |
--------------------------------------------------------------------------------
/src/sherlock/.github/workflows/nightly.yml:
--------------------------------------------------------------------------------
1 | name: Nightly
2 |
3 | on:
4 | schedule:
5 | # Run Nightly Tests At 3AM (The Hour Of The Wolf) Every Day
6 | - cron: '0 3 * * *'
7 |
8 | jobs:
9 | build:
10 | runs-on: ubuntu-latest
11 | strategy:
12 | matrix:
13 | python-version: [3.x]
14 |
15 | steps:
16 | - uses: actions/checkout@v3
17 | - name: Set up Python ${{ matrix.python-version }}
18 | uses: actions/setup-python@v4
19 | with:
20 | python-version: ${{ matrix.python-version }}
21 | - name: Install Dependencies
22 | run: |
23 | python -m pip install --upgrade pip
24 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
25 | - name: Sherlock Site Coverage Tests
26 | run: |
27 | cd sherlock && python -m unittest tests.all.SherlockSiteCoverageTests --verbose
28 |
--------------------------------------------------------------------------------
/src/sherlock/.github/workflows/pull_request.yml:
--------------------------------------------------------------------------------
1 | name: Pull Request Action
2 |
3 | on:
4 | pull_request:
5 | branches: [ master ]
6 |
7 | jobs:
8 | getchange:
9 | runs-on: ubuntu-latest
10 | outputs:
11 | matrix: ${{ steps.changes.outputs.matrix }}
12 | steps:
13 | - id: changes
14 | run: |
15 | URL="https://api.github.com/repos/sherlock-project/sherlock/pulls/${{ github.event.pull_request.number }}/files"
16 | FILES=$(curl -s -X GET -G $URL | jq -r '.[] | .filename')
17 |         if echo $FILES | grep -q "\.json"; then
18 | echo "::set-output name=matrix::{\"include\":[{\"python\":\"3.x\"}]}"
19 | else
20 |           echo "::set-output name=matrix::{\"include\":[{\"python\":\"3.7\"},{\"python\":\"3.8\"},{\"python\":\"3.9\"},{\"python\":\"3.10\"},{\"python\":\"3.11\"}]}"
21 | fi
22 | build:
23 | needs: [getchange]
24 | runs-on: ubuntu-latest
25 | strategy:
26 | matrix: ${{ fromJson(needs.getchange.outputs.matrix) }}
27 |
28 | steps:
29 | - uses: actions/checkout@v3
30 | - name: Set up Python ${{ matrix.python }}
31 | uses: actions/setup-python@v4
32 | with:
33 | python-version: ${{ matrix.python }}
34 | - name: Install Dependencies
35 | run: |
36 | python -m pip install --upgrade pip
37 | pip install flake8 pytest
38 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
39 | - name: Lint With flake8
40 | run: |
41 | # stop the build if there are Python syntax errors or undefined names
42 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
43 |
44 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
45 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
46 | - name: Sherlock Site Detect Tests
47 | run: |
48 | cd sherlock && python -m unittest tests.all.SherlockDetectTests --verbose
49 |
--------------------------------------------------------------------------------
/src/sherlock/.github/workflows/update-site-list.yml:
--------------------------------------------------------------------------------
1 | name: Update Site List
2 |
3 | # Trigger the workflow when changes are pushed to the main branch
4 | # and the changes include the sherlock/resources/data.json file
5 | on:
6 | push:
7 | branches:
8 | - master
9 | paths:
10 | - sherlock/resources/data.json
11 |
12 | jobs:
13 | sync-json-data:
14 | # Use the latest version of Ubuntu as the runner environment
15 | runs-on: ubuntu-latest
16 |
17 | steps:
18 | # Check out the code at the specified pull request head commit
19 | - name: Checkout code
20 | uses: actions/checkout@v3
21 | with:
22 | ref: ${{ github.event.pull_request.head.sha }}
23 | fetch-depth: 0
24 |
25 | # Install Python 3
26 | - name: Install Python
27 | uses: actions/setup-python@v4
28 | with:
29 | python-version: '3.x'
30 |
31 | # Execute the site_list.py Python script
32 | - name: Execute site_list.py
33 | run: python site_list.py
34 |
35 | # Commit any changes made by the script
36 | - name: Commit files
37 | run: |
38 | git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
39 | git config --local user.name "github-actions[bot]"
40 | if ! git diff --exit-code; then
41 | git commit -a -m "Updated Site List"
42 | fi
43 |
44 | # Push the changes to the remote repository
45 | - name: Push changes
46 | uses: ad-m/github-push-action@master
47 | with:
48 | github_token: ${{ secrets.GITHUB_TOKEN }}
49 | branch: ${{ github.ref }}
50 |
--------------------------------------------------------------------------------
/src/sherlock/.gitignore:
--------------------------------------------------------------------------------
1 | # Virtual Environment
2 | venv/
3 |
4 | # Editor Configurations
5 | .vscode/
6 | .idea/
7 |
8 | # Python
9 | __pycache__/
10 |
11 | # Pip
12 | src/
13 |
14 | # Jupyter Notebook
15 | .ipynb_checkpoints
16 | *.ipynb
17 |
18 | # Output files, except requirements.txt
19 | *.txt
20 | !requirements.txt
21 |
22 | # Comma-Separated Values (CSV) Reports
23 | *.csv
24 |
25 | #XLSX Reports
26 | *.xlsx
27 |
28 | # Excluded sites list
29 | tests/.excluded_sites
30 |
31 | # MacOS Folder Metadata File
32 | .DS_Store
33 |
34 | # Vim swap files
35 | *.swp
36 |
--------------------------------------------------------------------------------
/src/sherlock/.replit:
--------------------------------------------------------------------------------
1 | language = "python3"
2 | run = ""
3 |
--------------------------------------------------------------------------------
/src/sherlock/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How To Contribute To Sherlock
2 | First off, thank you for the help!
3 |
4 | There are many ways to contribute. Here is a high-level grouping.
5 |
6 | ## Adding New Sites
7 |
8 | Please look at the Wiki entry on
9 | [adding new sites](https://github.com/sherlock-project/sherlock/wiki/Adding-Sites-To-Sherlock)
10 | to understand the issues.
11 |
12 | Any new site that is added needs both a claimed username and an unclaimed username
13 | documented in the site data (see the example entry at the end of this file). This
14 | allows the regression tests to ensure that everything is working.
15 |
16 | It is required that a contributor test any new sites by either running the full tests, or running
17 | a site-specific query against the claimed and unclaimed usernames.
18 |
19 | It is not required that a contributor run the
20 | [site_list.py](https://github.com/sherlock-project/sherlock/blob/master/site_list.py)
21 | script.
22 |
23 | If there are performance problems with a site (e.g. slow to respond, unreliable uptime, ...), then
24 | the site may be removed from the list. The
25 | [removed_sites.md](https://github.com/sherlock-project/sherlock/blob/master/removed_sites.md)
26 | file contains sites that were included at one time in Sherlock, but had to be removed for
27 | one reason or another.
28 |
29 | ## Adding New Functionality
30 |
31 | Please ensure that the content on your branch passes all tests before submitting a pull request.
32 |
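33 | ## Example Site Data Entry
34 |
35 | For illustration only, here is a minimal sketch of the shape of a site entry in
36 | the site data. The site name, URLs, and usernames below are hypothetical
37 | placeholders; see the Wiki entry above for the full set of supported fields:
38 |
39 | ```json
40 | "HypotheticalSite": {
41 |     "errorType": "status_code",
42 |     "url": "https://hypothetical.example/{}",
43 |     "urlMain": "https://hypothetical.example/",
44 |     "username_claimed": "blue",
45 |     "username_unclaimed": "noonewouldeverusethis7"
46 | }
47 | ```
48 |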
--------------------------------------------------------------------------------
/src/sherlock/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.11-slim-bullseye as build
2 | WORKDIR /wheels
3 |
4 | COPY requirements.txt /opt/sherlock/
5 | RUN apt-get update \
6 | && apt-get install -y build-essential \
7 | && pip3 wheel -r /opt/sherlock/requirements.txt
8 |
9 | FROM python:3.11-slim-bullseye
10 | WORKDIR /opt/sherlock
11 |
12 | ARG VCS_REF
13 | ARG VCS_URL="https://github.com/sherlock-project/sherlock"
14 |
15 | LABEL org.label-schema.vcs-ref=$VCS_REF \
16 | org.label-schema.vcs-url=$VCS_URL
17 |
18 | COPY --from=build /wheels /wheels
19 | COPY . /opt/sherlock/
20 |
21 | RUN pip3 install --no-cache-dir -r requirements.txt -f /wheels \
22 | && rm -rf /wheels
23 |
24 | WORKDIR /opt/sherlock/sherlock
25 |
26 | ENTRYPOINT ["python", "sherlock.py"]
27 |
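28 | # Minimal usage sketch (comment only; the image tag below is an arbitrary
29 | # example, not part of the original file):
30 | #   docker build -t sherlock .
31 | #   docker run --rm -t sherlock <username>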
--------------------------------------------------------------------------------
/src/sherlock/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Sherlock Project
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/sherlock/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2'
2 |
3 | services:
4 | sherlock:
5 | build: .
6 | volumes:
7 | - "./results:/opt/sherlock/results"
8 |
--------------------------------------------------------------------------------
/src/sherlock/images/preview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/sherlock/images/preview.png
--------------------------------------------------------------------------------
/src/sherlock/requirements.txt:
--------------------------------------------------------------------------------
1 | certifi>=2019.6.16
2 | colorama>=0.4.1
3 | PySocks>=1.7.0
4 | requests>=2.22.0
5 | requests-futures>=1.0.0
6 | stem>=1.8.0
7 | torrequest>=0.1.0
8 | pandas>=1.0.0
9 | openpyxl<=3.0.10
10 | exrex>=0.11.0
--------------------------------------------------------------------------------
/src/sherlock/sherlock/__init__.py:
--------------------------------------------------------------------------------
1 | """ Sherlock Module
2 |
3 | This module contains the main logic to search for usernames at social
4 | networks.
5 |
6 | """
7 |
--------------------------------------------------------------------------------
/src/sherlock/sherlock/__main__.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3
2 |
3 | """
4 | Sherlock: Find Usernames Across Social Networks Module
5 |
6 | This module contains the main logic to search for usernames at social
7 | networks.
8 | """
9 |
10 | import sys
11 |
12 |
13 | if __name__ == "__main__":
14 | # Check if the user is using the correct version of Python
15 | python_version = sys.version.split()[0]
16 |
17 | if sys.version_info < (3, 6):
18 | print("Sherlock requires Python 3.6+\nYou are using Python %s, which is not supported by Sherlock" % (python_version))
19 | sys.exit(1)
20 |
21 | import sherlock
22 | sherlock.main()
23 |
--------------------------------------------------------------------------------
/src/sherlock/sherlock/result.py:
--------------------------------------------------------------------------------
1 | """Sherlock Result Module
2 |
3 | This module defines various objects for recording the results of queries.
4 | """
5 | from enum import Enum
6 |
7 |
8 | class QueryStatus(Enum):
9 | """Query Status Enumeration.
10 |
11 | Describes status of query about a given username.
12 | """
13 | CLAIMED = "Claimed" # Username Detected
14 | AVAILABLE = "Available" # Username Not Detected
15 | UNKNOWN = "Unknown" # Error Occurred While Trying To Detect Username
16 | ILLEGAL = "Illegal" # Username Not Allowable For This Site
17 |
18 | def __str__(self):
19 | """Convert Object To String.
20 |
21 | Keyword Arguments:
22 | self -- This object.
23 |
24 | Return Value:
25 | Nicely formatted string to get information about this object.
26 | """
27 | return self.value
28 |
29 | class QueryResult:
30 | """Query Result Object.
31 |
32 | Describes result of query about a given username.
33 | """
34 | def __init__(self, username, site_name, site_url_user, status,
35 | query_time=None, context=None):
36 | """Create Query Result Object.
37 |
38 | Contains information about a specific method of detecting usernames on
39 | a given type of web sites.
40 |
41 | Keyword Arguments:
42 | self -- This object.
43 | username -- String indicating username that query result
44 | was about.
45 | site_name -- String which identifies site.
46 | site_url_user -- String containing URL for username on site.
47 | NOTE: The site may or may not exist: this
48 | just indicates what the name would
49 | be, if it existed.
50 | status -- Enumeration of type QueryStatus() indicating
51 | the status of the query.
52 | query_time -- Time (in seconds) required to perform query.
53 | Default of None.
54 | context -- String indicating any additional context
55 | about the query. For example, if there was
56 | an error, this might indicate the type of
57 | error that occurred.
58 | Default of None.
59 |
60 | Return Value:
61 | Nothing.
62 | """
63 |
64 | self.username = username
65 | self.site_name = site_name
66 | self.site_url_user = site_url_user
67 | self.status = status
68 | self.query_time = query_time
69 | self.context = context
70 |
71 | return
72 |
73 | def __str__(self):
74 | """Convert Object To String.
75 |
76 | Keyword Arguments:
77 | self -- This object.
78 |
79 | Return Value:
80 | Nicely formatted string to get information about this object.
81 | """
82 | status = str(self.status)
83 | if self.context is not None:
84 | # There is extra context information available about the results.
85 | # Append it to the normal response text.
86 | status += f" ({self.context})"
87 |
88 | return status
89 |
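90 | if __name__ == "__main__":
91 |     # Minimal usage sketch (illustration only; the username, site name, and
92 |     # URL are hypothetical): build a result for a claimed username and print
93 |     # its status string.
94 |     example = QueryResult(username="alice",
95 |                           site_name="HypotheticalSite",
96 |                           site_url_user="https://hypothetical.example/alice",
97 |                           status=QueryStatus.CLAIMED,
98 |                           query_time=0.42)
99 |     print(example)  # -> Claimed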
--------------------------------------------------------------------------------
/src/sherlock/sherlock/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """Sherlock Tests
2 |
3 | This package contains various submodules used to run tests.
4 | """
5 |
--------------------------------------------------------------------------------
/src/sherlock/sherlock/tests/test_multiple_usernames.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import sys
3 | sys.path.append('../')
4 | import sherlock as sh
5 |
6 | checksymbols = ["_", "-", "."]
7 |
8 | """Test for multiple usernames.
9 |
10 | This test ensures that the function MultipleUsernames works properly. More
11 | specifically, different scenarios are tested, and only usernames that contain
12 | the specific sequence {?} should expand into multiple usernames.
13 | """
14 | class TestMultipleUsernames(unittest.TestCase):
15 |     def test_area(self):
16 |         test_usernames = ["test{?}test", "test{?feo", "test"]
17 |         for name in test_usernames:
18 |             if sh.CheckForParameter(name):
19 |                 # A well-formed {?} placeholder expands once per check symbol.
20 |                 self.assertEqual(sh.MultipleUsernames(name), ["test_test", "test-test", "test.test"])
21 |             else:
22 |                 # Names without a well-formed {?} placeholder are left unchanged.
23 |                 self.assertEqual(name, name)
--------------------------------------------------------------------------------
/src/sherlock/site_list.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # This module generates the listing of supported sites which can be found in
3 | # sites.md. It also organizes all the sites in alphanumeric order
4 | import json
5 |
6 | # Read the data.json file
7 | with open("sherlock/resources/data.json", "r", encoding="utf-8") as data_file:
8 | data = json.load(data_file)
9 |
10 | # Sort the social networks in alphanumeric order
11 | social_networks = sorted(data.items())
12 |
13 | # Write the list of supported sites to sites.md
14 | with open("sites.md", "w") as site_file:
15 | site_file.write(f"## List Of Supported Sites ({len(social_networks)} Sites In Total!)\n")
16 | for social_network, info in social_networks:
17 | url_main = info["urlMain"]
18 | is_nsfw = "**(NSFW)**" if info.get("isNSFW") else ""
19 | site_file.write(f"1.  [{social_network}]({url_main}) {is_nsfw}\n")
20 |
21 | # Overwrite the data.json file with sorted data
22 | with open("sherlock/resources/data.json", "w") as data_file:
23 | sorted_data = json.dumps(data, indent=2, sort_keys=True)
24 | data_file.write(sorted_data)
25 | data_file.write("\n")
26 |
27 | print("Finished updating supported site listing!")
28 |
--------------------------------------------------------------------------------
/src/theHarvester/.dockerignore:
--------------------------------------------------------------------------------
1 | .github/*
2 | .gitattributes
3 | .idea/
4 | .lgtm.yml
5 | mypy.ini
6 | .pytest_cache
7 | .mypy_cache
8 | tests/*
9 | README/
10 | bin/
--------------------------------------------------------------------------------
/src/theHarvester/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E203, E501, F405, F403, F401, E402, W503
--------------------------------------------------------------------------------
/src/theHarvester/.git-blame-ignore-revs:
--------------------------------------------------------------------------------
1 | # #1492 run `black .` and `isort .`
2 | c13843ec0d513ac7f9c35b7bd0501fa46e356415
--------------------------------------------------------------------------------
/src/theHarvester/.gitattributes:
--------------------------------------------------------------------------------
1 | # Set the default behavior, which is to have git automatically determine
2 | # whether a file is a text or binary, unless otherwise specified.
3 |
4 | * text=auto
5 |
6 | # Basic .gitattributes for a python repo.
7 |
8 | # Source files
9 | # ============
10 | *.pxd text diff=python
11 | *.py text diff=python
12 | *.py3 text diff=python
13 | *.pyw text diff=python
14 | *.pyx text diff=python
15 |
16 | # Binary files
17 | # ============
18 | *.db binary
19 | *.p binary
20 | *.pkl binary
21 | *.pyc binary
22 | *.pyd binary
23 | *.pyo binary
24 |
25 | # Note: .db, .p, and .pkl files are associated with the python modules
26 | # ``pickle``, ``dbm.*``, # ``shelve``, ``marshal``, ``anydbm``, & ``bsddb``
27 | # (among others).
28 |
--------------------------------------------------------------------------------
/src/theHarvester/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: [L1ghtn1ng, NotoriousRebel]
4 | open_collective: # Replace with a single Open Collective username
5 | ko_fi: #
6 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
7 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
8 | liberapay: # Replace with a single Liberapay username
9 | issuehunt: # Replace with a single IssueHunt username
10 | otechie: # Replace with a single Otechie username
11 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
12 |
--------------------------------------------------------------------------------
/src/theHarvester/.github/ISSUE_TEMPLATE/issue-template.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Issue Template
3 | about: A template for new issues.
4 | title: "[Bug|Feature Request|Other] Short Description of Issue"
5 | labels: ''
6 |
7 | ---
8 |
9 | ## Note: we do not support installing theHarvester on Android
10 |
11 | **Feature Request or Bug or Other**
12 | Feature Request | Bug | Other
13 |
14 | **Describe the feature request or bug or other**
15 | A clear and concise description of what the bug, feature request,
16 | or other request is.
17 |
18 | **To Reproduce**
19 | Steps to reproduce the behaviour:
20 | 1. Run tool like this: '...'
21 | 2. See error
22 |
23 | **Expected behaviour**
24 | A clear and concise description of what you expected to happen.
25 |
26 | **Screenshots**
27 | If possible please add screenshots to help explain your problem.
28 |
29 | **System Information (System that tool is running on):**
30 | - OS: [e.g. Windows10]
31 | - Version [e.g. 2.7]
32 |
33 | **Additional context**
34 | Add any other context about the problem here.
35 |
--------------------------------------------------------------------------------
/src/theHarvester/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 | - package-ecosystem: github-actions
4 | directory: "/"
5 | schedule:
6 | interval: daily
7 | timezone: Europe/London
8 | - package-ecosystem: pip
9 | directory: "/"
10 | schedule:
11 | interval: daily
12 | timezone: Europe/London
13 | open-pull-requests-limit: 10
14 | target-branch: master
15 | allow:
16 | - dependency-type: direct
17 | - dependency-type: indirect
18 |
--------------------------------------------------------------------------------
/src/theHarvester/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 |
14 | on:
15 | push:
16 | branches: [ master, dev ]
17 | pull_request:
18 | # The branches below must be a subset of the branches above
19 | branches: [ master, dev ]
20 | schedule:
21 | - cron: '19 11 * * 4'
22 |
23 | jobs:
24 | analyze:
25 | name: Analyze
26 | runs-on: ubuntu-latest
27 |
28 | strategy:
29 | fail-fast: false
30 | matrix:
31 | language: [ 'python' ]
32 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ]
33 | # Learn more:
34 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed
35 |
36 | steps:
37 | - name: Checkout repository
38 | uses: actions/checkout@v4
39 |
40 | # Initializes the CodeQL tools for scanning.
41 | - name: Initialize CodeQL
42 | uses: github/codeql-action/init@v3
43 | with:
44 | languages: ${{ matrix.language }}
45 | # If you wish to specify custom queries, you can do so here or in a config file.
46 | # By default, queries listed here will override any specified in a config file.
47 | # Prefix the list here with "+" to use these queries and those in the config file.
48 | # queries: ./path/to/local/query, your-org/your-repo/queries@main
49 |
50 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
51 | # If this step fails, then you should remove it and run the build manually (see below)
52 | - name: Autobuild
53 | uses: github/codeql-action/autobuild@v3
54 |
55 | # ℹ️ Command-line programs to run using the OS shell.
56 | # 📚 https://git.io/JvXDl
57 |
58 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
59 | # and modify them (or add more) to build your code if your project
60 | # uses a compiled language
61 |
62 | #- run: |
63 | # make bootstrap
64 | # make release
65 |
66 | - name: Perform CodeQL Analysis
67 | uses: github/codeql-action/analyze@v3
68 |
--------------------------------------------------------------------------------
/src/theHarvester/.github/workflows/dockerci.yml:
--------------------------------------------------------------------------------
1 | name: TheHarvester Docker Image CI
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - uses: actions/checkout@v4
10 | - name: Build the Docker image
11 | run: docker build . --file Dockerfile --tag theharvester:$(date +%s)
--------------------------------------------------------------------------------
/src/theHarvester/.github/workflows/theHarvester.yml:
--------------------------------------------------------------------------------
1 | ---
2 | name: TheHarvester Python CI
3 |
4 | on:
5 | push:
6 | branches:
7 | - '*'
8 |
9 | pull_request:
10 | branches:
11 | - '*'
12 |
13 | jobs:
14 | Python:
15 | runs-on: ${{ matrix.os }}
16 | strategy:
17 | max-parallel: 8
18 | matrix:
19 | os: [ ubuntu-latest, macos-latest ]
20 | python-version: [ 3.10.12, 3.11 ]
21 |
22 | steps:
23 | - uses: actions/checkout@v4
24 | - name: Python ${{ matrix.python-version }}
25 | uses: actions/setup-python@v5
26 | with:
27 | python-version: ${{ matrix.python-version }}
28 | - name: Install dependencies
29 | run: |
30 | pip install --upgrade pip
31 | pip install .[dev]
32 |
33 | - name: Lint with black
34 | run: |
35 | black . --diff --check
36 |
37 | - name: Lint with isort
38 | run: |
39 | isort . --diff --check
40 |
41 | - name: Lint with flake8
42 | run: |
43 | # stop the build if there are Python syntax errors or undefined names
44 | flake8 . --count --show-source --statistics --config .flake8
45 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
46 | flake8 . --count --exit-zero --max-line-length=127 --statistics --config .flake8
47 |
48 | - name: Test with pytest
49 | run: |
50 | pytest
51 |
52 | - name: Run theHarvester module Anubis
53 | run: |
54 | theHarvester -d apple.com -b anubis
55 |
56 | - name: Run theHarvester module Baidu
57 | run: |
58 | theHarvester -d yale.edu -b baidu
59 |
60 | - name: Run theHarvester module Bing
61 | run: |
62 | theHarvester -d yale.edu -b bing
63 |
64 | - name: Run theHarvester module CertSpotter
65 | run: |
66 | theHarvester -d yale.edu -b certspotter
67 |
68 | - name: Run theHarvester module Crtsh
69 | run: |
70 | theHarvester -d hcl.com -b crtsh
71 |
72 | - name: Run theHarvester module DnsDumpster
73 | run: |
74 | theHarvester -d yale.edu -b dnsdumpster
75 |
76 | - name: Run theHarvester module DuckDuckGo
77 | run: |
78 | theHarvester -d yale.edu -b duckduckgo
79 |
80 | - name: Run theHarvester module HackerTarget
81 | run: |
82 | theHarvester -d yale.edu -b hackertarget
83 |
84 | - name: Run theHarvester module Intelx
85 | run: |
86 | theHarvester -d yale.edu -b intelx
87 |
88 | - name: Run theHarvester module Otx
89 | run: |
90 | theHarvester -d yale.edu -b otx
91 |
92 | - name: Run theHarvester module RapidDns
93 | run: |
94 | theHarvester -d yale.edu -b rapiddns
95 |
96 | - name: Run theHarvester module Threatminer
97 | run: |
98 | theHarvester -d yale.edu -b threatminer
99 |
100 | - name: Run theHarvester module Urlscan
101 | run: |
102 | theHarvester -d yale.edu -b urlscan
103 |
104 | - name: Run theHarvester module Yahoo
105 | run: |
106 | theHarvester -d yale.edu -b yahoo
107 |
108 | - name: Run theHarvester module DNS brute force
109 | run: |
110 | theHarvester -d yale.edu -c
111 |
112 | - name: Static type checking with mypy
113 | run: |
114 | mypy --pretty theHarvester/*/*.py
115 | mypy --pretty theHarvester/*/*/*.py
116 |
--------------------------------------------------------------------------------
/src/theHarvester/.gitignore:
--------------------------------------------------------------------------------
1 | *.idea
2 | *.pyc
3 | *.sqlite
4 | *.html
5 | *.htm
6 | *.vscode
7 | *.xml
8 | *.json
9 | debug_results.txt
10 | venv
11 | .mypy_cache
12 | .pytest_cache
13 | build/
14 | dist/
15 | theHarvester.egg-info
16 | api-keys.yaml
17 | .DS_Store
18 | .venv
19 | .pyre
20 |
--------------------------------------------------------------------------------
/src/theHarvester/.isort.cfg:
--------------------------------------------------------------------------------
1 | [isort]
2 | profile = black
3 |
--------------------------------------------------------------------------------
/src/theHarvester/.pyre_configuration:
--------------------------------------------------------------------------------
1 | {
2 | "site_package_search_strategy": "pep561",
3 | "source_directories": [
4 | "."
5 | ]
6 | }
7 |
--------------------------------------------------------------------------------
/src/theHarvester/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM alpine:3
2 | LABEL maintainer="@jay_townsend1 & @NotoriousRebel1"
3 | RUN apk update && apk upgrade --available && apk add --no-cache musl-dev git libffi-dev gcc python3-dev pipx libxml2-dev libxslt-dev bash
4 | RUN mkdir -p /root/.local/share/theHarvester/static/
5 | RUN pipx install git+https://github.com/laramies/theHarvester.git
6 | RUN pipx ensurepath
7 | ENTRYPOINT ["/root/.local/bin/restfulHarvest", "-H", "0.0.0.0", "-p", "80"]
8 | EXPOSE 80
9 |
--------------------------------------------------------------------------------
/src/theHarvester/README/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to theHarvester Project
2 | Welcome to theHarvester project; thank you for wanting to contribute.
3 | The following requirements must be met for a contribution to be accepted.
4 |
5 | # CI
6 | Make sure all CI checks pass and that you do not introduce any alerts from lgtm.
7 |
8 | # Unit Tests
9 | New modules require a unit test, and we use pytest for testing.
10 |
11 | # Coding Standards
12 | * No single-letter variables; variable names must describe what they hold or do
13 | * Use static typing on functions and methods (see the example at the end of this file)
14 | * Make sure no errors are reported by mypy
15 | * No issues reported by flake8
16 |
17 | # Submitting Bugs
18 | If you find a bug in a module and know how to write Python code, please open an issue,
19 | create a unit test for the bug (if possible), and submit a fix; it is a big help to the project.
20 |
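21 | # Example
22 | As an illustration of the standards above, a new helper function and its pytest
23 | test might look like the following sketch (the function name and the test data
24 | are hypothetical, not part of theHarvester):
25 |
26 | ```python
27 | def extract_matching_hostnames(raw_response: str, domain: str) -> list[str]:
28 |     """Return sorted, de-duplicated tokens from raw_response that end in domain."""
29 |     candidate_tokens = raw_response.split()
30 |     matching_hostnames = {token for token in candidate_tokens if token.endswith(domain)}
31 |     return sorted(matching_hostnames)
32 |
33 |
34 | def test_extract_matching_hostnames() -> None:
35 |     raw_response = "a.example.com noise b.example.com a.example.com"
36 |     expected_hostnames = ["a.example.com", "b.example.com"]
37 |     assert extract_matching_hostnames(raw_response, "example.com") == expected_hostnames
38 | ```
39 |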
--------------------------------------------------------------------------------
/src/theHarvester/README/LICENSES:
--------------------------------------------------------------------------------
1 | Released under the GPL v 2.0.
2 |
3 | If you did not receive a copy of the GPL, try http://www.gnu.org/.
4 |
5 | Copyright 2011 Christian Martorella
6 |
7 | theHarvester is free software; you can redistribute it and/or modify
8 | it under the terms of the GNU General Public License as published by
9 | the Free Software Foundation version 2 of the License.
10 |
11 | theHarvester is distributed in the hope that it will be useful,
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | GNU General Public License for more details.
15 |
16 | You should have received a copy of the GNU General Public License
17 | along with theHarvester; if not, write to the Free Software
18 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 |
--------------------------------------------------------------------------------
/src/theHarvester/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "3.9"
2 | services:
3 | theharvester.svc.local:
4 | container_name: theHarvester
5 | volumes:
6 | - ./api-keys.yaml:/root/.theHarvester/api-keys.yaml
7 | - ./api-keys.yaml:/etc/theHarvester/api-keys.yaml
8 | - ./proxies.yaml:/etc/theHarvester/proxies.yaml
9 | - ./proxies.yaml:/root/.theHarvester/proxies.yaml
10 | build: .
11 | ports:
12 | - "8080:80"
13 |
14 | networks:
15 | default:
16 | name: app_theHarvester_network
17 |
--------------------------------------------------------------------------------
/src/theHarvester/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | ignore_missing_imports = True
3 | show_traceback = True
4 | show_error_codes = True
5 | namespace_packages = True
6 |
--------------------------------------------------------------------------------
/src/theHarvester/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "theHarvester"
3 | description = "theHarvester is a very simple, yet effective tool designed to be used in the early stages of a penetration test"
4 | readme = "README.md"
5 | authors = [
6 | { name = "Christian Martorella", email = "cmartorella@edge-security.com" },
7 | { name = "Jay Townsend", email = "jay@cybermon.uk" },
8 | { name = "Matthew Brown", email = "36310667+NotoriousRebel@users.noreply.github.com" },
9 | ]
10 | requires-python = ">=3.9"
11 | urls.Homepage = "https://github.com/laramies/theHarvester"
12 | classifiers = [
13 | "Programming Language :: Python :: 3",
14 | "Programming Language :: Python :: 3.9",
15 | "Programming Language :: Python :: 3.10",
16 | "Programming Language :: Python :: 3.11",
17 | "License :: OSI Approved :: GNU General Public License v2 (GPLv2)",
18 | "Operating System :: OS Independent",
19 | ]
20 | dynamic = ["dependencies", "optional-dependencies", "version"]
21 |
22 | [project.scripts]
23 | theHarvester = "theHarvester.theHarvester:main"
24 | restfulHarvest = "theHarvester.restfulHarvest:main"
25 |
26 | [tool.setuptools.dynamic]
27 | version = { attr = "theHarvester.lib.version.VERSION" }
28 | dependencies = { file = "requirements/base.txt" }
29 | optional-dependencies.dev = { file = "requirements/dev.txt" }
30 |
31 | [tool.setuptools.packages.find]
32 | include = ["theHarvester*"]
33 |
34 | [tool.setuptools.package-data]
35 | "*" = ["*.txt", "*.yaml"]
36 |
37 | [tool.pytest.ini_options]
38 | minversion = "7.1"
39 | addopts = "--no-header --asyncio-mode=auto"
40 | testpaths = [
41 | "tests",
42 | "tests/discovery/",
43 | ]
44 |
45 | [build-system]
46 | requires = ["setuptools>=68"]
47 | build-backend = "setuptools.build_meta"
48 |
--------------------------------------------------------------------------------
/src/theHarvester/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | minversion = 7.1.1
3 | testpaths = tests
4 | asyncio_mode=auto
--------------------------------------------------------------------------------
/src/theHarvester/requirements.txt:
--------------------------------------------------------------------------------
1 | -r requirements/base.txt
2 |
--------------------------------------------------------------------------------
/src/theHarvester/requirements/base.txt:
--------------------------------------------------------------------------------
1 | aiodns==3.1.1
2 | aiofiles==23.2.1
3 | aiohttp==3.9.3
4 | aiomultiprocess==0.9.0
5 | aiosqlite==0.19.0
6 | beautifulsoup4==4.12.3
7 | censys==2.2.11
8 | certifi==2024.2.2
9 | dnspython==2.5.0
10 | fastapi==0.109.0
11 | lxml==5.1.0
12 | netaddr==0.10.1
13 | ujson==5.9.0
14 | pyppeteer==1.0.2
15 | PyYAML==6.0.1
16 | python-dateutil==2.8.2
17 | requests==2.31.0
18 | retrying==1.3.4
19 | setuptools==69.0.3
20 | shodan==1.31.0
21 | slowapi==0.1.8
22 | uvicorn==0.27.0.post1
23 | uvloop==0.19.0; platform_system != "Windows"
24 |
--------------------------------------------------------------------------------
/src/theHarvester/requirements/dev.txt:
--------------------------------------------------------------------------------
1 | black==24.1.1
2 | flake8==7.0.0
3 | isort==5.13.2
4 | mypy==1.8.0
5 | mypy-extensions==1.0.0
6 | pydantic==2.5.3
7 | pyre-check==0.9.19
8 | pyflakes==3.2.0
9 | pytest==7.4.4
10 | pytest-asyncio==0.23.4
11 | types-certifi==2021.10.8.3
12 | types-chardet==5.0.4.6
13 | types-ujson==5.9.0.0
14 | types-PyYAML==6.0.12.12
15 | types-requests==2.31.0.6 # 2.31.0.7 introduced a regression
16 | types-python-dateutil==2.8.19.20240106
17 | wheel==0.42.0
--------------------------------------------------------------------------------
/src/theHarvester/restfulHarvest.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from theHarvester.restfulHarvest import main
3 |
4 | if __name__ == "__main__":
5 | main()
6 |
--------------------------------------------------------------------------------
/src/theHarvester/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E501, F405, F403, E402, F401, F402
--------------------------------------------------------------------------------
/src/theHarvester/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/theHarvester/tests/__init__.py
--------------------------------------------------------------------------------
/src/theHarvester/tests/discovery/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/theHarvester/tests/discovery/__init__.py
--------------------------------------------------------------------------------
/src/theHarvester/tests/discovery/test_anubis.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf-8
3 | import os
4 | from typing import Optional
5 |
6 | import pytest
7 | import requests
8 | from _pytest.mark.structures import MarkDecorator
9 |
10 | from theHarvester.discovery import anubis
11 | from theHarvester.lib.core import *
12 |
13 | pytestmark: MarkDecorator = pytest.mark.asyncio
14 | github_ci: Optional[str] = os.getenv(
15 | "GITHUB_ACTIONS"
16 | )  # GitHub Actions sets this to the string "true" (not the boolean True)
17 |
18 |
19 | class TestAnubis:
20 | @staticmethod
21 | def domain() -> str:
22 | return "apple.com"
23 |
24 | async def test_api(self) -> None:
25 | base_url = f"https://jldc.me/anubis/subdomains/{TestAnubis.domain()}"
26 | headers = {"User-Agent": Core.get_user_agent()}
27 | request = requests.get(base_url, headers=headers)
28 | assert request.status_code == 200
29 |
30 | async def test_do_search(self):
31 | search = anubis.SearchAnubis(word=TestAnubis.domain())
32 | await search.do_search()
33 | return await search.get_hostnames()
34 |
35 |     async def test_process(self) -> None:
36 |         # Run the search once and assert on its result (searching twice is redundant).
37 |         assert len(await self.test_do_search()) > 0
38 |
--------------------------------------------------------------------------------
/src/theHarvester/tests/discovery/test_certspotter.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf-8
3 | import os
4 | from typing import Optional
5 |
6 | import pytest
7 | import requests
8 | from _pytest.mark.structures import MarkDecorator
9 |
10 | from theHarvester.discovery import certspottersearch
11 | from theHarvester.lib.core import *
12 |
13 | pytestmark: MarkDecorator = pytest.mark.asyncio
14 | github_ci: Optional[str] = os.getenv(
15 | "GITHUB_ACTIONS"
16 | )  # GitHub Actions sets this to the string "true" (not the boolean True)
17 |
18 |
19 | class TestCertspotter(object):
20 | @staticmethod
21 | def domain() -> str:
22 | return "metasploit.com"
23 |
24 | async def test_api(self) -> None:
25 | base_url = f"https://api.certspotter.com/v1/issuances?domain={TestCertspotter.domain()}&expand=dns_names"
26 | headers = {"User-Agent": Core.get_user_agent()}
27 | request = requests.get(base_url, headers=headers)
28 | assert request.status_code == 200
29 |
30 | async def test_search(self) -> None:
31 | search = certspottersearch.SearchCertspoter(TestCertspotter.domain())
32 | await search.process()
33 | assert isinstance(await search.get_hostnames(), set)
34 |
35 |
36 | if __name__ == "__main__":
37 | pytest.main()
38 |
--------------------------------------------------------------------------------
/src/theHarvester/tests/discovery/test_githubcode.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import MagicMock
2 |
3 | import pytest
4 | from _pytest.mark.structures import MarkDecorator
5 | from requests import Response
6 |
7 | from theHarvester.discovery import githubcode
8 | from theHarvester.discovery.constants import MissingKey
9 | from theHarvester.lib.core import Core
10 |
11 | pytestmark: MarkDecorator = pytest.mark.asyncio
12 |
13 |
14 | class TestSearchGithubCode:
15 | class OkResponse:
16 | response = Response()
17 | json = {
18 | "items": [
19 | {"text_matches": [{"fragment": "test1"}]},
20 | {"text_matches": [{"fragment": "test2"}]},
21 | ]
22 | }
23 | response.status_code = 200
24 | response.json = MagicMock(return_value=json)
25 |
26 | class FailureResponse:
27 | response = Response()
28 | response.json = MagicMock(return_value={})
29 | response.status_code = 401
30 |
31 | class RetryResponse:
32 | response = Response()
33 | response.json = MagicMock(return_value={})
34 | response.status_code = 403
35 |
36 | class MalformedResponse:
37 | response = Response()
38 | json = {
39 | "items": [
40 | {"fail": True},
41 | {"text_matches": []},
42 | {"text_matches": [{"weird": "result"}]},
43 | ]
44 | }
45 | response.json = MagicMock(return_value=json)
46 | response.status_code = 200
47 |
48 | async def test_missing_key(self) -> None:
49 | with pytest.raises(MissingKey):
50 | Core.github_key = MagicMock(return_value=None)
51 | githubcode.SearchGithubCode(word="test", limit=500)
52 |
53 | async def test_fragments_from_response(self) -> None:
54 | Core.github_key = MagicMock(return_value="lol")
55 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
56 | test_result = await test_class_instance.fragments_from_response(
57 | self.OkResponse.response.json()
58 | )
59 | print("test_result: ", test_result)
60 | assert test_result == ["test1", "test2"]
61 |
62 | async def test_invalid_fragments_from_response(self) -> None:
63 | Core.github_key = MagicMock(return_value="lol")
64 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
65 | test_result = await test_class_instance.fragments_from_response(
66 | self.MalformedResponse.response.json()
67 | )
68 | assert test_result == []
69 |
70 | async def test_next_page(self) -> None:
71 | Core.github_key = MagicMock(return_value="lol")
72 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
73 | test_result = githubcode.SuccessResult(list(), next_page=2, last_page=4)
74 | assert 2 == await test_class_instance.next_page_or_end(test_result)
75 |
76 | async def test_last_page(self) -> None:
77 | Core.github_key = MagicMock(return_value="lol")
78 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500)
79 | test_result = githubcode.SuccessResult(list(), None, None)
80 | assert None is await test_class_instance.next_page_or_end(test_result)
81 |
82 | if __name__ == "__main__":
83 | pytest.main()
84 |
--------------------------------------------------------------------------------
/src/theHarvester/tests/discovery/test_otx.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf-8
3 | import os
4 | from typing import Optional
5 |
6 | import pytest
7 | import requests
8 | from _pytest.mark.structures import MarkDecorator
9 |
10 | from theHarvester.discovery import otxsearch
11 | from theHarvester.lib.core import *
12 |
13 | pytestmark: MarkDecorator = pytest.mark.asyncio
14 | github_ci: Optional[str] = os.getenv(
15 | "GITHUB_ACTIONS"
16 | )  # GitHub Actions sets this to the string "true" (not the boolean True)
17 |
18 |
19 | class TestOtx(object):
20 | @staticmethod
21 | def domain() -> str:
22 | return "cybermon.uk"
23 |
24 | async def test_api(self) -> None:
25 | base_url = f"https://otx.alienvault.com/api/v1/indicators/domain/{TestOtx.domain()}/passive_dns"
26 | headers = {"User-Agent": Core.get_user_agent()}
27 | request = requests.get(base_url, headers=headers)
28 | assert request.status_code == 200
29 |
30 | async def test_search(self) -> None:
31 | search = otxsearch.SearchOtx(TestOtx.domain())
32 | await search.process()
33 | assert isinstance(await search.get_hostnames(), set)
34 | assert isinstance(await search.get_ips(), set)
35 |
36 |
37 | if __name__ == "__main__":
38 | pytest.main()
39 |
--------------------------------------------------------------------------------
/src/theHarvester/tests/test_myparser.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # coding=utf-8
3 |
4 | import pytest
5 |
6 | from theHarvester.parsers import myparser
7 |
8 |
9 | class TestMyParser(object):
10 | @pytest.mark.asyncio
11 | async def test_emails(self) -> None:
12 | word = "domain.com"
13 | results = "@domain.com***a@domain***banotherdomain.com***c@domain.com***d@sub.domain.com***"
14 | parse = myparser.Parser(results, word)
15 | emails = sorted(await parse.emails())
16 |         assert emails == ["c@domain.com", "d@sub.domain.com"]
17 |
18 |
19 | if __name__ == "__main__":
20 | pytest.main()
21 |
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/theHarvester/theHarvester-logo.png
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester-logo.webp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/theHarvester/theHarvester-logo.webp
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # Note: This script runs theHarvester
3 | import sys
4 |
5 | # Check the interpreter version before importing the package so that users on
6 | # unsupported versions get a clear message instead of an import-time error.
7 | if sys.version_info < (3, 9):
8 |     print("\033[93m[!] Make sure you have Python 3.9+ installed, quitting.\n\n \033[0m")
9 |     sys.exit(1)
10 |
11 | from theHarvester.theHarvester import main
12 |
13 | if __name__ == "__main__":
14 |     main()
13 |
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["hostchecker"]
2 |
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester/data/proxies.yaml:
--------------------------------------------------------------------------------
1 | http:
2 | - ip:port
3 |
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester/data/wordlists/dorks.txt:
--------------------------------------------------------------------------------
1 | inurl:"contact"
2 | intext:email filetype:log
3 | "Index of /mail"
4 | "admin account info" filetype:log
5 | intext:@
6 | administrator accounts/
7 | intitle:"Index of" .bash_history
8 | intitle:"index of" members OR accounts
9 | inurl:/shared/help.php
10 | inurl:public
11 | intitle:index.of inbox
12 | intitle:"Server Administration"
13 | inurl:passwd.txt
14 | robots.txt
15 | php-addressbook "This is the addressbook for *" -warning
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester/data/wordlists/general/common.txt:
--------------------------------------------------------------------------------
1 | admin
2 | test
3 | hello
4 | uk
5 | login
6 | book
7 | robots.txt
8 |
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester/discovery/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/theHarvester/theHarvester/discovery/__init__.py
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester/discovery/anubis.py:
--------------------------------------------------------------------------------
1 | from theHarvester.lib.core import AsyncFetcher
2 |
3 |
4 | class SearchAnubis:
5 | def __init__(self, word) -> None:
6 | self.word = word
7 | self.totalhosts: list = []
8 | self.proxy = False
9 |
10 | async def do_search(self) -> None:
11 | url = f"https://jldc.me/anubis/subdomains/{self.word}"
12 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy)
13 | self.totalhosts = response[0]
14 |
15 | async def get_hostnames(self) -> list:
16 | return self.totalhosts
17 |
18 | async def process(self, proxy: bool = False) -> None:
19 | self.proxy = proxy
20 | await self.do_search()
21 |
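22 |
23 | if __name__ == "__main__":
24 |     # Minimal usage sketch (illustration only; "example.com" is a placeholder
25 |     # domain): run one passive subdomain lookup and print the hostnames.
26 |     import asyncio
27 |
28 |     async def _demo() -> None:
29 |         search = SearchAnubis(word="example.com")
30 |         await search.process()
31 |         print(await search.get_hostnames())
32 |
33 |     asyncio.run(_demo())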
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester/discovery/baidusearch.py:
--------------------------------------------------------------------------------
1 | from theHarvester.lib.core import AsyncFetcher, Core
2 | from theHarvester.parsers import myparser
3 |
4 |
5 | class SearchBaidu:
6 | def __init__(self, word, limit) -> None:
7 | self.word = word
8 | self.total_results = ""
9 | self.server = "www.baidu.com"
10 | self.hostname = "www.baidu.com"
11 | self.limit = limit
12 | self.proxy = False
13 |
14 | async def do_search(self) -> None:
15 | headers = {"Host": self.hostname, "User-agent": Core.get_user_agent()}
16 | base_url = f"https://{self.server}/s?wd=%40{self.word}&pn=xx&oq={self.word}"
17 | urls = [
18 | base_url.replace("xx", str(num))
19 | for num in range(0, self.limit, 10)
20 | if num <= self.limit
21 | ]
22 | responses = await AsyncFetcher.fetch_all(
23 | urls, headers=headers, proxy=self.proxy
24 | )
25 | for response in responses:
26 | self.total_results += response
27 |
28 | async def process(self, proxy: bool = False) -> None:
29 | self.proxy = proxy
30 | await self.do_search()
31 |
32 | async def get_emails(self):
33 | rawres = myparser.Parser(self.total_results, self.word)
34 | return await rawres.emails()
35 |
36 | async def get_hostnames(self):
37 | rawres = myparser.Parser(self.total_results, self.word)
38 | return await rawres.hostnames()
39 |
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester/discovery/bevigil.py:
--------------------------------------------------------------------------------
1 | from typing import Set
2 |
3 | from theHarvester.discovery.constants import MissingKey
4 | from theHarvester.lib.core import AsyncFetcher, Core
5 |
6 |
7 | class SearchBeVigil:
8 | def __init__(self, word) -> None:
9 | self.word = word
10 | self.totalhosts: Set = set()
11 | self.interestingurls: Set = set()
12 | self.key = Core.bevigil_key()
13 | if self.key is None:
14 | self.key = ""
15 | raise MissingKey("bevigil")
16 | self.proxy = False
17 |
18 | async def do_search(self) -> None:
19 | subdomain_endpoint = f"https://osint.bevigil.com/api/{self.word}/subdomains/"
20 | url_endpoint = f"https://osint.bevigil.com/api/{self.word}/urls/"
21 | headers = {"X-Access-Token": self.key}
22 |
23 | responses = await AsyncFetcher.fetch_all(
24 | [subdomain_endpoint], json=True, proxy=self.proxy, headers=headers
25 | )
26 | response = responses[0]
27 | for subdomain in response["subdomains"]:
28 | self.totalhosts.add(subdomain)
29 |
30 | responses = await AsyncFetcher.fetch_all(
31 | [url_endpoint], json=True, proxy=self.proxy, headers=headers
32 | )
33 | response = responses[0]
34 | for url in response["urls"]:
35 | self.interestingurls.add(url)
36 |
37 | async def get_hostnames(self) -> set:
38 | return self.totalhosts
39 |
40 | async def get_interestingurls(self) -> set:
41 | return self.interestingurls
42 |
43 | async def process(self, proxy: bool = False) -> None:
44 | self.proxy = proxy
45 | await self.do_search()
46 |
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester/discovery/binaryedgesearch.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from typing import Set
3 |
4 | from theHarvester.discovery.constants import MissingKey, get_delay
5 | from theHarvester.lib.core import AsyncFetcher, Core
6 |
7 |
8 | class SearchBinaryEdge:
9 | def __init__(self, word, limit) -> None:
10 | self.word = word
11 | self.totalhosts: Set = set()
12 | self.proxy = False
13 | self.key = Core.binaryedge_key()
14 | self.limit = 501 if limit >= 501 else limit
15 | self.limit = 2 if self.limit == 1 else self.limit
16 | if self.key is None:
17 | raise MissingKey("binaryedge")
18 |
19 | async def do_search(self) -> None:
20 | base_url = f"https://api.binaryedge.io/v2/query/domains/subdomain/{self.word}"
21 | headers = {"X-KEY": self.key, "User-Agent": Core.get_user_agent()}
22 | for page in range(1, self.limit):
23 | params = {"page": page}
24 | response = await AsyncFetcher.fetch_all(
25 | [base_url], json=True, proxy=self.proxy, params=params, headers=headers
26 | )
27 | responses = response[0]
28 | dct = responses
29 | if ("status" in dct.keys() and "message" in dct.keys()) and (
30 | dct["status"] == 400
31 | or "Bad Parameter" in dct["message"]
32 | or "Error" in dct["message"]
33 | ):
34 | # 400 status code means no more results
35 | break
36 | if "events" in dct.keys():
37 | if len(dct["events"]) == 0:
38 | break
39 |                 self.totalhosts.update(dct["events"])
40 | await asyncio.sleep(get_delay())
41 |
42 | async def get_hostnames(self) -> set:
43 | return self.totalhosts
44 |
45 | async def process(self, proxy: bool = False) -> None:
46 | self.proxy = proxy
47 | await self.do_search()
48 |
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester/discovery/bingsearch.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 |
3 | from theHarvester.discovery.constants import MissingKey
4 | from theHarvester.lib.core import AsyncFetcher, Core
5 | from theHarvester.parsers import myparser
6 |
7 |
8 | class SearchBing:
9 | def __init__(self, word, limit, start) -> None:
10 | self.word = word.replace(" ", "%20")
11 | self.results: list[Any] = []
12 | self.total_results = ""
13 | self.server = "www.bing.com"
14 | self.apiserver = "api.search.live.net"
15 | self.hostname = "www.bing.com"
16 | self.limit = int(limit)
17 | self.bingApi = Core.bing_key()
18 | self.counter = start
19 | self.proxy = False
20 |
21 | async def do_search(self) -> None:
22 | headers = {
23 | "Host": self.hostname,
24 | "Cookie": "SRCHHPGUSR=ADLT=DEMOTE&NRSLT=50",
25 | "Accept-Language": "en-us,en",
26 | "User-agent": Core.get_user_agent(),
27 | }
28 | base_url = f'https://{self.server}/search?q=%40"{self.word}"&count=50&first=xx'
29 | urls = [
30 | base_url.replace("xx", str(num))
31 | for num in range(0, self.limit, 50)
32 | if num <= self.limit
33 | ]
34 | responses = await AsyncFetcher.fetch_all(
35 | urls, headers=headers, proxy=self.proxy
36 | )
37 | for response in responses:
38 | self.total_results += response
39 |
40 | async def do_search_api(self) -> None:
41 | url = "https://api.bing.microsoft.com/v7.0/search?"
42 | params = {
43 | "q": self.word,
44 | "count": str(self.limit),
45 | "offset": "0",
46 | "mkt": "en-us",
47 | "safesearch": "Off",
48 | }
49 | headers = {
50 | "User-Agent": Core.get_user_agent(),
51 | "Ocp-Apim-Subscription-Key": self.bingApi,
52 | }
53 | self.results = await AsyncFetcher.fetch_all(
54 | [url], headers=headers, params=params, proxy=self.proxy
55 | )
56 | for res in self.results:
57 | self.total_results += res
58 |
59 | async def do_search_vhost(self) -> None:
60 | headers = {
61 | "Host": self.hostname,
62 | "Cookie": "mkt=en-US;ui=en-US;SRCHHPGUSR=NEWWND=0&ADLT=DEMOTE&NRSLT=50",
63 | "Accept-Language": "en-us,en",
64 | "User-agent": Core.get_user_agent(),
65 | }
66 | base_url = f"http://{self.server}/search?q=ip:{self.word}&go=&count=50&FORM=QBHL&qs=n&first=xx"
67 | urls = [
68 | base_url.replace("xx", str(num))
69 | for num in range(0, self.limit, 50)
70 | if num <= self.limit
71 | ]
72 | responses = await AsyncFetcher.fetch_all(
73 | urls, headers=headers, proxy=self.proxy
74 | )
75 | for response in responses:
76 | self.total_results += response
77 |
78 | async def get_emails(self):
79 | rawres = myparser.Parser(self.total_results, self.word)
80 | return await rawres.emails()
81 |
82 | async def get_hostnames(self):
83 | rawres = myparser.Parser(self.total_results, self.word)
84 | return await rawres.hostnames()
85 |
86 | async def get_allhostnames(self):
87 | rawres = myparser.Parser(self.total_results, self.word)
88 | return await rawres.hostnames_all()
89 |
90 | async def process(self, api, proxy: bool = False) -> None:
91 | self.proxy = proxy
92 | if api == "yes":
93 | if self.bingApi is None:
94 | raise MissingKey("BingAPI")
95 | await self.do_search_api()
96 | else:
97 | await self.do_search()
98 | print(f"\tSearching {self.counter} results.")
99 |
100 | async def process_vhost(self) -> None:
101 | await self.do_search_vhost()
102 |
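
As a usage illustration, a minimal driver for SearchBing might look like the following; the import path mirrors the file location shown above, while example.com and the limit/start values are illustrative, not taken from the project.

import asyncio

from theHarvester.discovery.bingsearch import SearchBing

async def main():
    bing = SearchBing("example.com", limit=100, start=0)
    # api="yes" calls the Bing Web Search API and raises MissingKey
    # when Core.bing_key() returns None; anything else scrapes
    # www.bing.com with the cookie-based headers above
    await bing.process(api="no", proxy=False)
    print(await bing.get_emails())
    print(await bing.get_allhostnames())

asyncio.run(main())
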
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester/discovery/bravesearch.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 |
3 | from theHarvester.discovery.constants import get_delay
4 | from theHarvester.lib.core import AsyncFetcher, Core
5 | from theHarvester.parsers import myparser
6 |
7 |
8 | class SearchBrave:
9 | def __init__(self, word, limit):
10 | self.word = word
11 | self.results = ""
12 | self.totalresults = ""
13 | self.server = "https://search.brave.com/search?q="
14 | self.limit = limit
15 | self.proxy = False
16 |
17 | async def do_search(self):
18 | headers = {"User-Agent": Core.get_user_agent()}
19 | for query in [f'"{self.word}"', f"site:{self.word}"]:
20 | try:
21 | for offset in range(0, 50):
22 |                     # only two query forms ("word" and site:word) are used to keep the request count down
23 | current_url = f"{self.server}{query}&offset={offset}&source=web&show_local=0&spellcheck=0"
24 | resp = await AsyncFetcher.fetch_all(
25 | [current_url], headers=headers, proxy=self.proxy
26 | )
27 | self.results = resp[0]
28 | self.totalresults += self.results
29 |                     # stop paging when Brave returns a no-results page or a
30 |                     # bot-detection / captcha page
31 |                     if (
32 |                         "Not many great matches came back for your search" in resp[0]
33 |                         or "Your request has been flagged as being suspicious and Brave Search"
34 |                         in resp[0]
35 |                         or ("Prove" in resp[0] and "robot" in resp[0])
36 |                         or "Robot" in resp[0]
37 |                     ):
38 | break
39 | await asyncio.sleep(get_delay() + 15)
40 | except Exception as e:
41 | print(f"An exception has occurred in bravesearch: {e}")
42 | await asyncio.sleep(get_delay() + 80)
43 | continue
44 |
45 | async def get_emails(self):
46 | rawres = myparser.Parser(self.totalresults, self.word)
47 | return await rawres.emails()
48 |
49 | async def get_hostnames(self):
50 | rawres = myparser.Parser(self.totalresults, self.word)
51 | return await rawres.hostnames()
52 |
53 | async def process(self, proxy=False):
54 | self.proxy = proxy
55 | await self.do_search()
56 |
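
Note the pacing: do_search() sleeps get_delay() + 15 seconds between result pages and get_delay() + 80 after an exception, so even small runs take minutes. A minimal driver sketch, with the import path inferred from the file location above:

import asyncio

from theHarvester.discovery.bravesearch import SearchBrave

async def main():
    brave = SearchBrave("example.com", 50)
    await brave.process(proxy=False)  # slow by design; see the sleeps above
    print(await brave.get_emails())
    print(await brave.get_hostnames())

asyncio.run(main())
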
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester/discovery/bufferoverun.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import Set
3 |
4 | from theHarvester.discovery.constants import MissingKey
5 | from theHarvester.lib.core import AsyncFetcher, Core
6 |
7 |
8 | class SearchBufferover:
9 | def __init__(self, word) -> None:
10 | self.word = word
11 | self.totalhosts: Set = set()
12 | self.totalips: Set = set()
13 | self.key = Core.bufferoverun_key()
14 | if self.key is None:
15 | raise MissingKey("bufferoverun")
16 | self.proxy = False
17 |
18 | async def do_search(self) -> None:
19 | url = f"https://tls.bufferover.run/dns?q={self.word}"
20 | response = await AsyncFetcher.fetch_all(
21 | [url],
22 | json=True,
23 | headers={"User-Agent": Core.get_user_agent(), "x-api-key": f"{self.key}"},
24 | proxy=self.proxy,
25 | )
26 | dct = response[0]
27 | if dct["Results"]:
28 | self.totalhosts = {
29 | host.split(",")
30 | if "," in host
31 | and self.word.replace("www.", "") in host.split(",")[0] in host
32 | else host.split(",")[4]
33 | for host in dct["Results"]
34 | }
35 |
36 | self.totalips = {
37 | ip.split(",")[0]
38 | for ip in dct["Results"]
39 | if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip.split(",")[0])
40 | }
41 |
42 | async def get_hostnames(self) -> set:
43 | return self.totalhosts
44 |
45 | async def get_ips(self) -> set:
46 | return self.totalips
47 |
48 | async def process(self, proxy: bool = False) -> None:
49 | self.proxy = proxy
50 | await self.do_search()
51 |
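
Because SearchBufferover raises MissingKey from its constructor rather than from process(), callers have to wrap instantiation. A sketch, assuming a tls.bufferover.run key has been configured for Core.bufferoverun_key():

import asyncio

from theHarvester.discovery.bufferoverun import SearchBufferover
from theHarvester.discovery.constants import MissingKey

async def main():
    try:
        engine = SearchBufferover("example.com")  # raises without a key
    except MissingKey as err:
        print(err)
        return
    await engine.process(proxy=False)
    print(await engine.get_hostnames())
    print(await engine.get_ips())

asyncio.run(main())
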
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester/discovery/censysearch.py:
--------------------------------------------------------------------------------
1 | from typing import Set
2 |
3 | from censys.common import __version__
4 | from censys.common.exceptions import (
5 | CensysRateLimitExceededException,
6 | CensysUnauthorizedException,
7 | )
8 | from censys.search import CensysCerts
9 |
10 | from theHarvester.discovery.constants import MissingKey
11 | from theHarvester.lib.core import Core
12 | from theHarvester.lib.version import version as theharvester_version
13 |
14 |
15 | class SearchCensys:
16 | def __init__(self, domain, limit: int = 500) -> None:
17 | self.word = domain
18 | self.key = Core.censys_key()
19 | if self.key[0] is None or self.key[1] is None:
20 | raise MissingKey("Censys ID and/or Secret")
21 | self.totalhosts: Set = set()
22 | self.emails: Set = set()
23 | self.limit = limit
24 | self.proxy = False
25 |
26 | async def do_search(self) -> None:
27 | try:
28 | cert_search = CensysCerts(
29 | api_id=self.key[0],
30 | api_secret=self.key[1],
31 | user_agent=f"censys-python/{__version__} (theHarvester/{thehavester_version}); +https://github.com/laramies/theHarvester)",
32 | )
33 | except CensysUnauthorizedException:
34 | raise MissingKey("Censys ID and/or Secret")
35 |
36 | query = f"names: {self.word}"
37 | try:
38 | response = cert_search.search(
39 | query=query,
40 | fields=["names", "parsed.subject.email_address"],
41 | max_records=self.limit,
42 | )
43 | for cert in response():
44 | self.totalhosts.update(cert.get("names", []))
45 | email_address = (
46 | cert.get("parsed", {}).get("subject", {}).get("email_address", [])
47 | )
48 | self.emails.update(email_address)
49 | except CensysRateLimitExceededException:
50 | print("Censys rate limit exceeded")
51 |
52 | async def get_hostnames(self) -> set:
53 | return self.totalhosts
54 |
55 | async def get_emails(self) -> set:
56 | return self.emails
57 |
58 | async def process(self, proxy: bool = False) -> None:
59 | self.proxy = proxy
60 | await self.do_search()
61 |
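
One caveat worth knowing: the censys SDK client used in do_search() is synchronous, so despite the async signature the whole search blocks the event loop while it runs. A usage sketch, assuming a Censys API ID and secret are configured (the constructor raises MissingKey otherwise):

import asyncio

from theHarvester.discovery.censysearch import SearchCensys

async def main():
    censys = SearchCensys("example.com", limit=500)
    await censys.process()  # blocks the loop; the SDK call is synchronous
    print(await censys.get_hostnames())
    print(await censys.get_emails())

asyncio.run(main())
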
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester/discovery/certspottersearch.py:
--------------------------------------------------------------------------------
1 | from typing import Set
2 |
3 | from theHarvester.lib.core import AsyncFetcher
4 |
5 |
6 | class SearchCertspoter:
7 | def __init__(self, word) -> None:
8 | self.word = word
9 | self.totalhosts: Set = set()
10 | self.proxy = False
11 |
12 | async def do_search(self) -> None:
13 | base_url = f"https://api.certspotter.com/v1/issuances?domain={self.word}&expand=dns_names"
14 | try:
15 | response = await AsyncFetcher.fetch_all(
16 | [base_url], json=True, proxy=self.proxy
17 | )
18 | response = response[0]
19 | if isinstance(response, list):
20 | for dct in response:
21 | for key, value in dct.items():
22 | if key == "dns_names":
23 | self.totalhosts.update({name for name in value if name})
24 |             elif isinstance(response, dict):
25 |                 # dns_names holds a list of hostnames; pass it to update()
26 |                 # directly (wrapping the list in a set raised TypeError)
27 |                 self.totalhosts.update(response.get("dns_names", []))
28 | except Exception as e:
29 | print(e)
30 |
31 | async def get_hostnames(self) -> set:
32 | return self.totalhosts
33 |
34 | async def process(self, proxy: bool = False) -> None:
35 | self.proxy = proxy
36 | await self.do_search()
37 | print("\tSearching results.")
38 |
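
The issuances endpoint used here needs no API key, so the module can be exercised directly; note the class is spelled SearchCertspoter in the source. A minimal driver, with the import path taken from the file location above:

import asyncio

from theHarvester.discovery.certspottersearch import SearchCertspoter

async def main():
    spotter = SearchCertspoter("example.com")
    await spotter.process(proxy=False)
    for host in sorted(await spotter.get_hostnames()):
        print(host)

asyncio.run(main())
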
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester/discovery/crtsh.py:
--------------------------------------------------------------------------------
1 | from typing import List, Set
2 |
3 | from theHarvester.lib.core import AsyncFetcher
4 |
5 |
6 | class SearchCrtsh:
7 | def __init__(self, word) -> None:
8 | self.word = word
9 | self.data: List = []
10 | self.proxy = False
11 |
12 | async def do_search(self) -> List:
13 | data: Set = set()
14 | try:
15 | url = f"https://crt.sh/?q=%25.{self.word}&output=json"
16 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy)
17 | response = response[0]
18 |             # build a set of names, stripping the leading "*." from
19 |             # wildcard certificate entries
20 |             data = {
21 |                 dct["name_value"][2:]
22 |                 if "*." == dct["name_value"][:2]
23 |                 else dct["name_value"]
24 |                 for dct in response
25 |             }
26 | data = {
27 | domain
28 | for domain in data
29 | if (domain[0] != "*" and str(domain[0:4]).isnumeric() is False)
30 | }
31 | except Exception as e:
32 | print(e)
33 | clean: List = []
34 | for x in data:
35 | pre = x.split()
36 | for y in pre:
37 | clean.append(y)
38 | return clean
39 |
40 | async def process(self, proxy: bool = False) -> None:
41 | self.proxy = proxy
42 | data = await self.do_search()
43 | self.data = data
44 |
45 | async def get_hostnames(self) -> list:
46 | return self.data
47 |
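
Unlike the set-based modules above, get_hostnames() here returns a list, and the whitespace split in do_search() can introduce duplicates, so callers may want to deduplicate. A minimal sketch:

import asyncio

from theHarvester.discovery.crtsh import SearchCrtsh

async def main():
    crt = SearchCrtsh("example.com")
    await crt.process(proxy=False)
    hosts = set(await crt.get_hostnames())  # dedupe the returned list
    print(f"{len(hosts)} unique names from crt.sh")

asyncio.run(main())
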
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester/discovery/dnsdumpster.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 |
3 | import aiohttp
4 |
5 | from theHarvester.lib.core import Core
6 | from theHarvester.parsers import myparser
7 |
8 |
9 | class SearchDnsDumpster:
10 | def __init__(self, word) -> None:
11 | self.word = word.replace(" ", "%20")
12 | self.results = ""
13 | self.totalresults = ""
14 | self.server = "dnsdumpster.com"
15 | self.proxy = False
16 |
17 | async def do_search(self) -> None:
18 | try:
19 | agent = Core.get_user_agent()
20 | headers = {"User-Agent": agent}
21 |             session = aiohttp.ClientSession(headers=headers)
22 |             # a single session so the csrftoken cookie from the GET is reused on the POST
23 | url = f"https://{self.server}"
24 | csrftoken = ""
25 | if self.proxy is False:
26 | async with session.get(url, headers=headers) as resp:
27 | resp_cookies = str(resp.cookies)
28 | cookies = resp_cookies.split("csrftoken=")
29 | csrftoken += cookies[1][: cookies[1].find(";")]
30 | else:
31 | async with session.get(url, headers=headers, proxy=self.proxy) as resp:
32 | resp_cookies = str(resp.cookies)
33 | cookies = resp_cookies.split("csrftoken=")
34 | csrftoken += cookies[1][: cookies[1].find(";")]
35 | await asyncio.sleep(5)
36 |
37 |             # build the POST form, echoing the csrftoken extracted above
38 | data = {
39 | "Cookie": f"csfrtoken={csrftoken}",
40 | "csrfmiddlewaretoken": csrftoken,
41 | "targetip": self.word,
42 | "user": "free",
43 | }
44 | headers["Referer"] = url
45 | if self.proxy is False:
46 | async with session.post(url, headers=headers, data=data) as resp:
47 | self.results = await resp.text()
48 | else:
49 | async with session.post(
50 | url, headers=headers, data=data, proxy=self.proxy
51 | ) as resp:
52 | self.results = await resp.text()
53 | await session.close()
54 | except Exception as e:
55 | print(f"An exception occurred: {e}")
56 | self.totalresults += self.results
57 |
58 | async def get_hostnames(self):
59 | rawres = myparser.Parser(self.totalresults, self.word)
60 | return await rawres.hostnames()
61 |
62 | async def process(self, proxy: bool = False) -> None:
63 | self.proxy = proxy
64 | await self.do_search() # Only need to do it once.
65 |
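
The flow above is a two-step CSRF dance: a GET to harvest the csrftoken cookie, a pause, then the POST with the token echoed back in the form. Note that the session is only closed on the success path; an exception mid-flow leaks it. A minimal driver sketch, with the import path inferred from the file location above:

import asyncio

from theHarvester.discovery.dnsdumpster import SearchDnsDumpster

async def main():
    dumpster = SearchDnsDumpster("example.com")
    await dumpster.process(proxy=False)  # GET for the token, then POST
    print(await dumpster.get_hostnames())

asyncio.run(main())
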
--------------------------------------------------------------------------------
/src/theHarvester/theHarvester/discovery/duckduckgosearch.py:
--------------------------------------------------------------------------------
1 | import ujson
2 |
3 | from theHarvester.lib.core import AsyncFetcher, Core
4 | from theHarvester.parsers import myparser
5 |
6 |
7 | class SearchDuckDuckGo:
8 | def __init__(self, word, limit) -> None:
9 | self.word = word
10 | self.results = ""
11 | self.totalresults = ""
12 | self.dorks: list = []
13 | self.links: list = []
14 | self.database = "https://duckduckgo.com/?q="
15 | self.api = "https://api.duckduckgo.com/?q=x&format=json&pretty=1" # Currently using API.
16 | self.quantity = "100"
17 | self.limit = limit
18 | self.proxy = False
19 |
20 | async def do_search(self) -> None:
21 | # Do normal scraping.
22 | url = self.api.replace("x", self.word)
23 | headers = {"User-Agent": Core.get_user_agent()}
24 | first_resp = await AsyncFetcher.fetch_all(
25 | [url], headers=headers, proxy=self.proxy
26 | )
27 | self.results = first_resp[0]
28 | self.totalresults += self.results
29 | urls = await self.crawl(self.results)
30 | urls = {url for url in urls if len(url) > 5}
31 |         all_resps = await AsyncFetcher.fetch_all(urls, headers=headers, proxy=self.proxy)
32 | self.totalresults += "".join(all_resps)
33 |
34 | async def crawl(self, text):
35 | """
36 | Function parses json and returns URLs.
37 | :param text: formatted json
38 | :return: set of URLs
39 | """
40 | urls = set()
41 | try:
42 | load = ujson.loads(text)
43 | for keys in load.keys(): # Iterate through keys of dict.
44 | val = load.get(keys)
45 |
46 | if isinstance(val, int) or isinstance(val, dict) or val is None:
47 | continue
48 |
49 | if isinstance(val, list):
50 | if len(val) == 0: # Make sure not indexing an empty list.
51 | continue
52 |                     val = val[0]  # the first element should be a dict
53 |
54 | if isinstance(val, dict): # Validation check.
55 | for key in val.keys():
56 | value = val.get(key)
57 |                         if (
58 |                             isinstance(value, str)
59 |                             and value != ""
60 |                             # parentheses bind the scheme test to the str case
61 |                             and ("https://" in value or "http://" in value)
62 |                         ):
63 | urls.add(value)
64 |
65 |                 if (
66 |                     isinstance(val, str)
67 |                     and val != ""
68 |                     # parentheses keep the scheme test on the str branch
69 |                     and ("https://" in val or "http://" in val)
70 |                 ):
71 |                     urls.add(val)
72 | tmp = set()
73 | for url in urls:
74 | if (
75 | "<" in url and "href=" in url
76 | ): # Format is