├── .gitignore ├── .vscode └── settings.json ├── InfoHunter.py ├── LICENSE ├── README.md ├── api-keys example.yaml ├── api_keys example.json ├── images ├── logo1.png └── logo2.png ├── proxies.yaml ├── requirements.txt ├── src ├── evaluacion │ ├── __init__.py │ └── mejoras.py ├── maigret │ ├── .dockerignore │ ├── .githooks │ │ └── pre-commit │ ├── .github │ │ ├── FUNDING.yml │ │ ├── ISSUE_TEMPLATE │ │ │ ├── add-a-site.md │ │ │ ├── bug.md │ │ │ └── report-false-result.md │ │ ├── dependabot.yml │ │ └── workflows │ │ │ ├── build-docker-image.yml │ │ │ ├── codeql-analysis.yml │ │ │ ├── pyinstaller.yml │ │ │ ├── python-package.yml │ │ │ ├── python-publish.yml │ │ │ └── update-site-data.yml │ ├── .gitignore │ ├── CHANGELOG.md │ ├── CODE_OF_CONDUCT.md │ ├── CONTRIBUTING.md │ ├── Dockerfile │ ├── LICENSE │ ├── MANIFEST.in │ ├── Makefile │ ├── README.md │ ├── docs │ │ ├── Makefile │ │ ├── make.bat │ │ ├── requirements.txt │ │ └── source │ │ │ ├── command-line-options.rst │ │ │ ├── conf.py │ │ │ ├── development.rst │ │ │ ├── extracting-information-from-pages.rst │ │ │ ├── features.rst │ │ │ ├── index.rst │ │ │ ├── philosophy.rst │ │ │ ├── roadmap.rst │ │ │ ├── settings.rst │ │ │ ├── supported-identifier-types.rst │ │ │ ├── tags.rst │ │ │ └── usage-examples.rst │ ├── maigret.py │ ├── maigret │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── __version__.py │ │ ├── activation.py │ │ ├── checking.py │ │ ├── errors.py │ │ ├── executors.py │ │ ├── maigret.py │ │ ├── notify.py │ │ ├── report.py │ │ ├── resources │ │ │ ├── data.json │ │ │ ├── simple_report.tpl │ │ │ ├── simple_report_pdf.css │ │ │ └── simple_report_pdf.tpl │ │ ├── result.py │ │ ├── settings.py │ │ ├── sites.py │ │ ├── submit.py │ │ ├── types.py │ │ └── utils.py │ ├── pyinstaller │ │ ├── maigret_standalone.py │ │ └── requirements.txt │ ├── pytest.ini │ ├── requirements.txt │ ├── setup.cfg │ ├── setup.py │ ├── sites.md │ ├── snapcraft.yaml │ ├── static │ │ ├── chat_gitter.svg │ │ ├── maigret.png │ │ ├── recursive_search.md │ │ ├── recursive_search.svg │ │ ├── report_alexaimephotography_html_screenshot.png │ │ ├── report_alexaimephotography_xmind_screenshot.png │ │ ├── report_alexaimephotographycars.html │ │ └── report_alexaimephotographycars.pdf │ ├── tests │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── db.json │ │ ├── local.json │ │ ├── test_activation.py │ │ ├── test_checking.py │ │ ├── test_cli.py │ │ ├── test_data.py │ │ ├── test_executors.py │ │ ├── test_maigret.py │ │ ├── test_notify.py │ │ ├── test_report.py │ │ ├── test_sites.py │ │ └── test_utils.py │ ├── utils │ │ ├── __init__.py │ │ ├── add_tags.py │ │ ├── check_engines.py │ │ ├── import_sites.py │ │ ├── sites_diff.py │ │ └── update_site_data.py │ └── wizard.py ├── recopilacion │ ├── __init__.py │ ├── consultas.py │ ├── extraccion.py │ └── fuentes.py ├── riesgos │ ├── __init__.py │ └── evaluacion.py ├── sherlock │ ├── .dockerignore │ ├── .editorconfig │ ├── .github │ │ ├── ISSUE_TEMPLATE │ │ │ ├── bug-report.md │ │ │ ├── feature-request.md │ │ │ ├── question.md │ │ │ ├── reporting-false-negative.md │ │ │ ├── reporting-false-positive.md │ │ │ └── site-support-request.md │ │ └── workflows │ │ │ ├── main.yml │ │ │ ├── nightly.yml │ │ │ ├── pull_request.yml │ │ │ └── update-site-list.yml │ ├── .gitignore │ ├── .replit │ ├── CODE_OF_CONDUCT.md │ ├── CONTRIBUTING.md │ ├── Dockerfile │ ├── LICENSE │ ├── README.md │ ├── docker-compose.yml │ ├── images │ │ └── preview.png │ ├── removed_sites.json │ ├── removed_sites.md │ ├── requirements.txt │ ├── sherlock │ │ ├── __init__.py │ │ ├── 
__main__.py │ │ ├── notify.py │ │ ├── resources │ │ │ └── data.json │ │ ├── result.py │ │ ├── sherlock.py │ │ ├── sites.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── all.py │ │ │ ├── base.py │ │ │ └── test_multiple_usernames.py │ ├── site_list.py │ └── sites.md └── theHarvester │ ├── .dockerignore │ ├── .flake8 │ ├── .git-blame-ignore-revs │ ├── .gitattributes │ ├── .github │ ├── FUNDING.yml │ ├── ISSUE_TEMPLATE │ │ └── issue-template.md │ ├── dependabot.yml │ └── workflows │ │ ├── codeql-analysis.yml │ │ ├── dockerci.yml │ │ └── theHarvester.yml │ ├── .gitignore │ ├── .isort.cfg │ ├── .pyre_configuration │ ├── Dockerfile │ ├── README.md │ ├── README │ ├── CONTRIBUTING.md │ ├── COPYING │ └── LICENSES │ ├── docker-compose.yml │ ├── mypy.ini │ ├── pyproject.toml │ ├── pytest.ini │ ├── requirements.txt │ ├── requirements │ ├── base.txt │ └── dev.txt │ ├── restfulHarvest.py │ ├── setup.cfg │ ├── tests │ ├── __init__.py │ ├── discovery │ │ ├── __init__.py │ │ ├── test_anubis.py │ │ ├── test_certspotter.py │ │ ├── test_githubcode.py │ │ └── test_otx.py │ └── test_myparser.py │ ├── theHarvester-logo.png │ ├── theHarvester-logo.webp │ ├── theHarvester.py │ └── theHarvester │ ├── __init__.py │ ├── __main__.py │ ├── data │ ├── proxies.yaml │ └── wordlists │ │ ├── dns-big.txt │ │ ├── dns-names.txt │ │ ├── dorks.txt │ │ ├── general │ │ └── common.txt │ │ └── names_small.txt │ ├── discovery │ ├── __init__.py │ ├── anubis.py │ ├── baidusearch.py │ ├── bevigil.py │ ├── binaryedgesearch.py │ ├── bingsearch.py │ ├── bravesearch.py │ ├── bufferoverun.py │ ├── censysearch.py │ ├── certspottersearch.py │ ├── constants.py │ ├── criminalip.py │ ├── crtsh.py │ ├── dnsdumpster.py │ ├── dnssearch.py │ ├── duckduckgosearch.py │ ├── fullhuntsearch.py │ ├── githubcode.py │ ├── hackertarget.py │ ├── huntersearch.py │ ├── intelxsearch.py │ ├── netlas.py │ ├── onyphe.py │ ├── otxsearch.py │ ├── pentesttools.py │ ├── projectdiscovery.py │ ├── rapiddns.py │ ├── rocketreach.py │ ├── searchhunterhow.py │ ├── securitytrailssearch.py │ ├── shodansearch.py │ ├── sitedossier.py │ ├── subdomaincenter.py │ ├── subdomainfinderc99.py │ ├── takeover.py │ ├── threatminer.py │ ├── tombasearch.py │ ├── urlscan.py │ ├── virustotal.py │ ├── yahoosearch.py │ └── zoomeyesearch.py │ ├── parsers │ ├── __init__.py │ ├── intelxparser.py │ ├── myparser.py │ └── securitytrailsparser.py │ ├── restfulHarvest.py │ ├── screenshot │ ├── __init__.py │ └── screenshot.py │ └── theHarvester.py └── wordlists ├── dns-big.txt ├── dns-names.txt ├── dorks.txt ├── general └── common.txt └── names_small.txt /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "[python]": { 3 | "editor.defaultFormatter": "ms-python.black-formatter" 4 | }, 5 | "python.formatting.provider": "none" 6 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | InfoHunter logo
3 | </div>
4 | 5 |   6 | 7 | # 🔎 InfoHunter 8 | 9 | InfoHunter es una herramienta de código abierto para la recopilación de información en OSINT, diseñada para proteger la seguridad y privacidad de personas y empresas. 10 | 11 | ![Licencia](https://img.shields.io/github/license/sweetnight19/InfoHunter) 12 | [![Estado del Proyecto](https://img.shields.io/badge/Estado-En%20Desarrollo-yellow.svg)](https://github.com/sweetnight19/InfoHunter) 13 | ![GitHub release (latest by date)](https://img.shields.io/github/v/release/sweetnight19/infohunter) 14 | ![GitHub all releases](https://img.shields.io/github/downloads/Sweetnight19/InfoHunter/total) 15 | ![GitHub Repo stars](https://img.shields.io/github/stars/sweetnight19/infohunter?style=plastic) 16 | 17 |   18 | 19 | ## 📚 Descripción del proyecto 20 | 21 | InfoHunter es una herramienta desarrollada en Python que utiliza técnicas de OSINT (Open Source Intelligence) para recopilar información relevante de diversas fuentes en línea. La herramienta se enfoca en identificar y analizar información para crear perfiles completos de las personas o empresas investigadas, así como para identificar posibles riesgos de seguridad o privacidad. Además, proporciona medidas y buenas prácticas para proteger la privacidad y seguridad de los sujetos de interés. 22 | 23 | ## 🚀 Funcionalidades principales 24 | 25 | - Recopilación de información de fuentes en línea. 26 | - Análisis de la información para crear perfiles completos. 27 | - Identificación y evaluación de riesgos de seguridad o privacidad. 28 | - Medidas y buenas prácticas para proteger la privacidad y seguridad. 29 | - Evaluación de la eficacia de las medidas implementadas. 30 | 31 | ## 🔧 Instalación 32 | 33 | 1. Clona el repositorio de GitHub: 34 | 35 | ```bash 36 | git clone https://github.com/sweetnight19/InfoHunter.git 37 | 38 | ``` 39 | 40 | 2. Instala las dependencias: 41 | 42 | ```bash 43 | pip install -r requirements.txt 44 | ``` 45 | 46 | ## 📖 Uso 47 | 48 | 1. Ejecuta el archivo InfoHunter.py: 49 | 50 | ```bash 51 | python InfoHunter.py 52 | 53 | ``` 54 | 55 | 2. Sigue las instrucciones en la interfaz de línea de comandos para utilizar las funcionalidades de InfoHunter. 56 | 57 | ## 🗺️ Roadmap 58 | 59 | A continuación se muestra el plan de desarrollo para el proyecto: 60 | 61 | - [x] Implementación de la búsqueda de información en fuentes abiertas. 62 | - [x] Análisis de datos recopilados para crear perfiles de personas y empresas. 63 | - [x] Identificación de posibles riesgos de seguridad y privacidad. 64 | - [x] Establecimiento de medidas para proteger la privacidad y seguridad. 65 | - [x] Evaluación de la eficacia de las medidas implementadas. 66 | 67 | ## 👤 Autor 68 | 69 | - Sweetnight19 70 | - Email: sweetnight19@protonmail.com 71 | - GitHub: [@sweetnight19](https://github.com/sweetnight19) 72 | 73 | ## 🤝 Contribuciones 74 | 75 | Las contribuciones son bienvenidas. Si deseas contribuir a este proyecto, sigue los siguientes pasos: 76 | 77 | 1. Haz un fork del repositorio. 78 | 2. Crea una nueva rama para tu contribución. 79 | 3. Realiza tus modificaciones y mejoras. 80 | 4. Envía un pull request. 81 | 82 | ## 📜 Licencia 83 | 84 | Este proyecto está licenciado bajo la [Licencia GPL v3](https://www.gnu.org/licenses/gpl-3.0.en.html). 
85 | -------------------------------------------------------------------------------- /api-keys example.yaml: -------------------------------------------------------------------------------- 1 | apikeys: 2 | bevigil: 3 | key: "API-KEY" 4 | 5 | binaryedge: 6 | key: "API-KEY" 7 | 8 | bing: 9 | key: 10 | 11 | bufferoverun: 12 | key: 13 | 14 | censys: 15 | id: "API-KEY" 16 | secret: "API-KEY" 17 | 18 | criminalip: 19 | key: 20 | 21 | fullhunt: 22 | key: "API-KEY" 23 | 24 | github: 25 | key: 26 | 27 | hunter: 28 | key: "API-KEY" 29 | 30 | intelx: 31 | key: "API-KEY" 32 | 33 | netlas: 34 | key: "API-KEY" 35 | 36 | pentestTools: 37 | key: 38 | 39 | projectDiscovery: 40 | key: 41 | 42 | rocketreach: 43 | key: 44 | 45 | securityTrails: 46 | key: "API-KEY" 47 | 48 | shodan: 49 | key: "API-KEY" 50 | 51 | virustotal: 52 | key: "API-KEY" 53 | 54 | zoomeye: 55 | key: "API-KEY" 56 | -------------------------------------------------------------------------------- /api_keys example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pyhunter": "API-KEY", 3 | "hibp-api-key": "API-KEY", 4 | "breachdirectory": "API-KEY", 5 | "similar-web": "API-KEY" 6 | } -------------------------------------------------------------------------------- /images/logo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/images/logo1.png -------------------------------------------------------------------------------- /images/logo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/images/logo2.png -------------------------------------------------------------------------------- /proxies.yaml: -------------------------------------------------------------------------------- 1 | http: 2 | - ip:port 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiodns==3.1.1 2 | aiofiles==23.2.1 3 | aiohttp==3.9.3 4 | aiomultiprocess==0.9.0 5 | aiosignal==1.3.1 6 | aiosqlite==0.19.0 7 | annotated-types==0.6.0 8 | anyio==3.7.1 9 | appdirs==1.4.4 10 | argcomplete==3.2.2 11 | attrs==23.2.0 12 | backoff==2.2.1 13 | beautifulsoup4==4.12.3 14 | censys==2.2.11 15 | certifi==2024.2.2 16 | cffi==1.16.0 17 | chardet==5.2.0 18 | charset-normalizer==3.3.2 19 | click==8.1.7 20 | click-plugins==1.1.1 21 | colorama==0.4.6 22 | Deprecated==1.2.14 23 | dnspython==2.5.0 24 | fastapi==0.109.2 25 | filelock==3.13.1 26 | fpdf==1.7.2 27 | frozenlist==1.4.1 28 | h11==0.14.0 29 | idna==3.6 30 | importlib-metadata==7.0.1 31 | importlib-resources==6.1.1 32 | limits==3.7.0 33 | lxml==5.1.0 34 | markdown-it-py==3.0.0 35 | mdurl==0.1.2 36 | multidict==6.0.5 37 | netaddr==0.10.1 38 | packaging==23.2 39 | pillow==10.2.0 40 | pycares==4.4.0 41 | pycparser==2.21 42 | pydantic==2.6.1 43 | pydantic_core==2.16.2 44 | pyee==8.2.2 45 | pyfiglet==1.0.2 46 | Pygments==2.17.2 47 | pyhunter==1.7 48 | pyppeteer==1.0.2 49 | python-dateutil==2.8.2 50 | PyYAML==6.0.1 51 | reportlab==4.1.0 52 | requests==2.31.0 53 | requests-file==2.0.0 54 | retrying==1.3.4 55 | rich==13.7.0 56 | setuptools==69.0.3 57 | shodan==1.31.0 58 | six==1.16.0 59 | slowapi==0.1.9 60 | sniffio==1.3.0 61 | soupsieve==2.5 62 | starlette==0.36.3 63 | theHarvester==0.0.1 64 | 
tldextract==5.1.1 65 | tqdm==4.66.2 66 | typing_extensions==4.9.0 67 | ujson==5.9.0 68 | urllib3==1.26.18 69 | uvicorn==0.27.0.post1 70 | websockets==10.4 71 | wrapt==1.16.0 72 | XlsxWriter==3.1.9 73 | yarl==1.9.4 74 | zipp==3.17.0 75 | -------------------------------------------------------------------------------- /src/evaluacion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/evaluacion/__init__.py -------------------------------------------------------------------------------- /src/maigret/.dockerignore: -------------------------------------------------------------------------------- 1 | .git/ 2 | .vscode/ 3 | static/ 4 | tests/ 5 | *.txt 6 | !/requirements.txt 7 | venv/ 8 | 9 | -------------------------------------------------------------------------------- /src/maigret/.githooks/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python3 ./utils/update_site_data.py 3 | -------------------------------------------------------------------------------- /src/maigret/.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | patreon: soxoj 4 | -------------------------------------------------------------------------------- /src/maigret/.github/ISSUE_TEMPLATE/add-a-site.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Add a site 3 | about: I want to add a new site for Maigret checks 4 | title: New site 5 | labels: new-site 6 | assignees: soxoj 7 | 8 | --- 9 | 10 | Link to the site main page: https://example.com 11 | Link to an existing account: https://example.com/users/john 12 | Link to a nonexistent account: https://example.com/users/noonewouldeverusethis7 13 | Tags: photo, us, ... 
14 | 
--------------------------------------------------------------------------------
/src/maigret/.github/ISSUE_TEMPLATE/bug.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Maigret bug report
 3 | about: I want to report a bug in Maigret functionality
 4 | title: ''
 5 | labels: bug
 6 | assignees: soxoj
 7 | 
 8 | ---
 9 | 
10 | ## Checklist
11 | 
12 | - [ ] I'm reporting a bug in Maigret functionality
13 | - [ ] I've checked for similar bug reports, including closed ones
14 | - [ ] I've checked for pull requests that attempt to fix this bug
15 | 
16 | ## Description
17 | 
18 | Info about the Maigret version you are running and your environment (`--version`, operating system, ISP provider):
19 | 
20 | 
21 | How to reproduce this bug (command-line options / conditions):
22 | 
23 | 
24 | 
25 | 
26 | 
27 | 
28 | 
29 | 
--------------------------------------------------------------------------------
/src/maigret/.github/ISSUE_TEMPLATE/report-false-result.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Report invalid result
 3 | about: I want to report an invalid result of a Maigret search
 4 | title: Invalid result
 5 | labels: false-result
 6 | assignees: soxoj
 7 | 
 8 | ---
 9 | 
10 | Invalid link:
11 | 
12 | 
19 | 
20 | - [ ] I'm sure that the link leads to a "not found" page
21 | 
--------------------------------------------------------------------------------
/src/maigret/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   - package-ecosystem: "pip"
4 |     directory: "/"
5 |     schedule:
6 |       interval: "daily"
7 | 
--------------------------------------------------------------------------------
/src/maigret/.github/workflows/build-docker-image.yml:
--------------------------------------------------------------------------------
 1 | name: Build docker image and push to DockerHub
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ main ]
 6 | 
 7 | jobs:
 8 |   docker:
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |       -
12 |         name: Set up QEMU
13 |         uses: docker/setup-qemu-action@v1
14 |       -
15 |         name: Set up Docker Buildx
16 |         uses: docker/setup-buildx-action@v1
17 |       -
18 |         name: Login to DockerHub
19 |         uses: docker/login-action@v1
20 |         with:
21 |           username: ${{ secrets.DOCKER_HUB_USERNAME }}
22 |           password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
23 |       -
24 |         name: Build and push
25 |         id: docker_build
26 |         uses: docker/build-push-action@v2
27 |         with:
28 |           push: true
29 |           tags: ${{ secrets.DOCKER_HUB_USERNAME }}/maigret:latest
30 |           platforms: linux/amd64,linux/arm64
31 |       -
32 |         name: Image digest
33 |         run: echo ${{ steps.docker_build.outputs.digest }}
34 | 
--------------------------------------------------------------------------------
/src/maigret/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
 1 | # For most projects, this workflow file will not need changing; you simply need
 2 | # to commit it to your repository.
 3 | #
 4 | # You may wish to alter this file to override the set of languages analyzed,
 5 | # or to provide custom queries or build logic.
 6 | #
 7 | # ******** NOTE ********
 8 | # We have attempted to detect the languages in your repository. Please check
 9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ main ] 17 | schedule: 18 | - cron: '23 6 * * 6' 19 | 20 | jobs: 21 | analyze: 22 | name: Analyze 23 | runs-on: ubuntu-latest 24 | permissions: 25 | actions: read 26 | contents: read 27 | security-events: write 28 | 29 | strategy: 30 | fail-fast: false 31 | matrix: 32 | language: [ 'python' ] 33 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 34 | # Learn more about CodeQL language support at https://git.io/codeql-language-support 35 | 36 | steps: 37 | - name: Checkout repository 38 | uses: actions/checkout@v2 39 | 40 | # Initializes the CodeQL tools for scanning. 41 | - name: Initialize CodeQL 42 | uses: github/codeql-action/init@v1 43 | with: 44 | languages: ${{ matrix.language }} 45 | # If you wish to specify custom queries, you can do so here or in a config file. 46 | # By default, queries listed here will override any specified in a config file. 47 | # Prefix the list here with "+" to use these queries and those in the config file. 48 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 49 | 50 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 51 | # If this step fails, then you should remove it and run the build manually (see below) 52 | - name: Autobuild 53 | uses: github/codeql-action/autobuild@v1 54 | 55 | # ℹ️ Command-line programs to run using the OS shell. 56 | # 📚 https://git.io/JvXDl 57 | 58 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 59 | # and modify them (or add more) to build your code if your project 60 | # uses a compiled language 61 | 62 | #- run: | 63 | # make bootstrap 64 | # make release 65 | 66 | - name: Perform CodeQL Analysis 67 | uses: github/codeql-action/analyze@v1 68 | -------------------------------------------------------------------------------- /src/maigret/.github/workflows/pyinstaller.yml: -------------------------------------------------------------------------------- 1 | name: Package exe with PyInstaller - Windows 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | 7 | jobs: 8 | build: 9 | 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: PyInstaller Windows 15 | uses: JackMcKew/pyinstaller-action-windows@main 16 | with: 17 | path: pyinstaller 18 | 19 | - uses: actions/upload-artifact@v2 20 | with: 21 | name: maigret_standalone_win32 22 | path: pyinstaller/dist/windows # or path/to/artifact 23 | -------------------------------------------------------------------------------- /src/maigret/.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | name: Linting and testing 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | types: [opened, synchronize, reopened] 9 | 10 | jobs: 11 | build: 12 | 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: [3.7, 3.8, 3.9] 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install -r test-requirements.txt 28 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 29 | - name: Test with pytest 30 | run: | 31 | pytest --reruns 3 --reruns-delay 5 32 | 
-------------------------------------------------------------------------------- /src/maigret/.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: '3.x' 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install setuptools wheel twine 22 | - name: Build and publish 23 | env: 24 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 25 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 26 | run: | 27 | python setup.py sdist bdist_wheel 28 | twine upload dist/* 29 | -------------------------------------------------------------------------------- /src/maigret/.github/workflows/update-site-data.yml: -------------------------------------------------------------------------------- 1 | name: Update sites rating and statistics 2 | 3 | on: 4 | pull_request: 5 | branches: [ dev ] 6 | types: [opened, synchronize] 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout repository 13 | uses: actions/checkout@v2.3.2 14 | with: 15 | ref: ${{ github.event.pull_request.head.sha }} 16 | fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository. 17 | 18 | - name: build application 19 | run: | 20 | pip3 install . 21 | python3 ./utils/update_site_data.py --empty-only 22 | 23 | - name: Commit and push changes 24 | run: | 25 | git config --global user.name "Maigret autoupdate" 26 | git config --global user.email "soxoj@protonmail.com" 27 | echo `git name-rev ${{ github.event.pull_request.head.sha }} --name-only` 28 | export BRANCH=`git name-rev ${{ github.event.pull_request.head.sha }} --name-only | sed 's/remotes\/origin\///'` 29 | echo $BRANCH 30 | git remote -v 31 | git checkout $BRANCH 32 | git add sites.md 33 | git commit -m "Updated site list and statistics" 34 | git push origin $BRANCH -------------------------------------------------------------------------------- /src/maigret/.gitignore: -------------------------------------------------------------------------------- 1 | # Virtual Environment 2 | venv/ 3 | 4 | # Editor Configurations 5 | .vscode/ 6 | .idea/ 7 | 8 | # Python 9 | __pycache__/ 10 | 11 | # Pip 12 | src/ 13 | 14 | # Jupyter Notebook 15 | .ipynb_checkpoints 16 | *.ipynb 17 | 18 | # Logs and backups 19 | *.log 20 | *.bak 21 | 22 | # Output files, except requirements.txt 23 | *.txt 24 | !requirements.txt 25 | 26 | # Comma-Separated Values (CSV) Reports 27 | *.csv 28 | 29 | # MacOS Folder Metadata File 30 | .DS_Store 31 | /reports/ 32 | 33 | # Testing 34 | .coverage 35 | dist/ 36 | htmlcov/ 37 | /test_* 38 | 39 | # Maigret files 40 | settings.json 41 | -------------------------------------------------------------------------------- /src/maigret/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | Hey! I'm really glad you're reading this. Maigret contains a lot of sites, and it is very hard to keep all the sites operational. That's why any fix is important. 4 | 5 | ## How to add a new site 6 | 7 | #### Beginner level 8 | 9 | You can use Maigret **submit mode** (`maigret --submit URL`) to add a new site or update an existing site. 
In this mode Maigret does an automatic analysis of the given account URL or site main page URL to determine the site engine and the methods to check account presence. After the check Maigret asks if you want to add the site; answering y/Y will rewrite the local database.
10 | 
11 | #### Advanced level
12 | 
13 | You can edit [the database JSON file](https://github.com/soxoj/maigret/blob/main/maigret/resources/data.json) (`./maigret/resources/data.json`) manually.
14 | 
15 | ## Testing
16 | 
17 | There are CI checks for every PR to the Maigret repository, but it is better to run `make format`, `make lint` and `make test` locally to ensure you've made correct changes.
18 | 
19 | ## Submitting changes
20 | 
21 | To submit your changes you must [send a GitHub PR](https://github.com/soxoj/maigret/pulls) to the Maigret project.
22 | Always write a clear log message for your commits. One-line messages are fine for small changes, but bigger changes should look like this:
23 | 
24 |     $ git commit -m "A brief summary of the commit
25 |     >
26 |     > A paragraph describing what changed and its impact."
27 | 
28 | ## Coding conventions
29 | 
30 | Start reading the code and you'll get the hang of it. ;)
31 | 
--------------------------------------------------------------------------------
/src/maigret/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.9-slim
 2 | LABEL maintainer="Soxoj <soxoj@protonmail.com>"
 3 | WORKDIR /app
 4 | RUN pip install --no-cache-dir --upgrade pip
 5 | RUN apt-get update && \
 6 |     apt-get install --no-install-recommends -y \
 7 |     gcc \
 8 |     musl-dev \
 9 |     libxml2 \
10 |     libxml2-dev \
11 |     libxslt-dev \
12 |     && \
13 |     rm -rf /var/lib/apt/lists/* /tmp/*
14 | COPY . .
15 | RUN YARL_NO_EXTENSIONS=1 python3 -m pip install --no-cache-dir .
16 | ENTRYPOINT ["maigret"]
17 | 
--------------------------------------------------------------------------------
/src/maigret/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Sherlock Project
 4 | Copyright (c) 2020-2021 Soxoj
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | 
--------------------------------------------------------------------------------
/src/maigret/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.md
3 | include requirements.txt
4 | include maigret/resources/*
5 | 
--------------------------------------------------------------------------------
/src/maigret/Makefile:
--------------------------------------------------------------------------------
 1 | LINT_FILES=maigret wizard.py tests
 2 | 
 3 | test:
 4 | 	coverage run --source=./maigret -m pytest tests
 5 | 	coverage report -m
 6 | 	coverage html
 7 | 
 8 | rerun-tests:
 9 | 	pytest --lf -vv
10 | 
11 | lint:
12 | 	@echo 'syntax errors or undefined names'
13 | 	flake8 --count --select=E9,F63,F7,F82 --show-source --statistics ${LINT_FILES} maigret.py
14 | 
15 | 	@echo 'warning'
16 | 	flake8 --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --ignore=E731,W503,E501 ${LINT_FILES} maigret.py
17 | 
18 | 	@echo 'mypy'
19 | 	mypy ${LINT_FILES}
20 | 
21 | speed:
22 | 	time python3 ./maigret.py --version
23 | 	python3 -c "import timeit; t = timeit.Timer('import maigret'); print(t.timeit(number = 1000000))"
24 | 	python3 -X importtime -c "import maigret" 2> maigret-import.log
25 | 	python3 -m tuna maigret-import.log
26 | 
27 | format:
28 | 	@echo 'black'
29 | 	black --skip-string-normalization ${LINT_FILES}
30 | 
31 | pull:
32 | 	git stash
33 | 	git checkout main
34 | 	git pull origin main
35 | 	git stash pop
36 | 
37 | clean:
38 | 	rm -rf reports htmlcov dist
39 | 
40 | install:
41 | 	pip3 install .
42 | 
--------------------------------------------------------------------------------
/src/maigret/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 
--------------------------------------------------------------------------------
/src/maigret/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 
--------------------------------------------------------------------------------
/src/maigret/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx-copybutton
2 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | 
 3 | # -- Project information
 4 | 
 5 | project = 'Maigret'
 6 | copyright = '2021, soxoj'
 7 | author = 'soxoj'
 8 | 
 9 | release = '0.4.4'
10 | version = '0.4.4'
11 | 
12 | # -- General configuration
13 | 
14 | extensions = [
15 |     'sphinx.ext.duration',
16 |     'sphinx.ext.doctest',
17 |     'sphinx.ext.autodoc',
18 |     'sphinx.ext.autosummary',
19 |     'sphinx.ext.intersphinx',
20 |     'sphinx_copybutton'
21 | ]
22 | 
23 | intersphinx_mapping = {
24 |     'python': ('https://docs.python.org/3/', None),
25 |     'sphinx': ('https://www.sphinx-doc.org/en/master/', None),
26 | }
27 | intersphinx_disabled_domains = ['std']
28 | 
29 | templates_path = ['_templates']
30 | 
31 | # -- Options for HTML output
32 | 
33 | html_theme = 'sphinx_rtd_theme'
34 | 
35 | # -- Options for EPUB output
36 | epub_show_urls = 'footnote'
37 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/development.rst:
--------------------------------------------------------------------------------
 1 | .. _development:
 2 | 
 3 | Development
 4 | ==============
 5 | 
 6 | Testing
 7 | -------
 8 | 
 9 | It is recommended to use Python 3.7/3.8 for testing due to some conflicts in 3.9.
10 | 
11 | Install the test requirements:
12 | 
13 | .. code-block:: console
14 | 
15 |     pip install -r test-requirements.txt
16 | 
17 | 
18 | Use the following commands to check Maigret:
19 | 
20 | .. code-block:: console
21 | 
22 |     # run linter and typing checks
23 |     # order of checks:
24 |     # - critical syntax errors or undefined names
25 |     # - flake checks
26 |     # - mypy checks
27 |     make lint
28 | 
29 |     # run testing with a coverage html report
30 |     # current test coverage is 60%
31 |     make test
32 | 
33 |     # open the html report
34 |     open htmlcov/index.html
35 | 
36 | 
37 | How to publish a new version of Maigret
38 | ---------------------------------------
39 | 
40 | **Collaborator rights are required; write to Soxoj to get them.**
41 | 
42 | To publish a new version you must first create a new branch in the repository
43 | with a bumped version number and an up-to-date changelog. After that you
44 | must create a release, and a GitHub action automatically creates a new
45 | PyPI package.
46 | 
47 | - New branch example: https://github.com/soxoj/maigret/commit/e520418f6a25d7edacde2d73b41a8ae7c80ddf39
48 | - Release example: https://github.com/soxoj/maigret/releases/tag/v0.4.1
49 | 
50 | 1. Make a new branch locally with a new version name. Check the current version number here: https://pypi.org/project/maigret/.
51 |    **Increase only the patch version (the third number)** if there are no breaking changes.
52 | 
53 | .. code-block:: console
54 | 
55 |     git checkout -b 0.4.0
56 | 
57 | 2. Update the Maigret version in three files manually:
58 | 
59 |    - setup.py
60 |    - maigret/__version__.py
61 |    - docs/source/conf.py
62 | 
63 | 3. Create a new empty text section at the beginning of the `CHANGELOG.md` file with the current date:
64 | 
65 | .. code-block:: console
66 | 
67 |     ## [0.4.0] - 2022-01-03
68 | 
69 | 4. Get the auto-generated release notes:
70 | 
71 |    - Open https://github.com/soxoj/maigret/releases/new
72 |    - Click `Choose a tag`, enter `v0.4.0` (your version)
73 |    - Click `Create new tag`
74 |    - Press `+ Auto-generate release notes`
75 |    - Copy all the text from the description text field below
76 |    - Paste it into the empty text section in `CHANGELOG.md`
77 |    - Remove the redundant line `## What's Changed`, and the `## New Contributors` section if it exists
78 |    - *Close the new release page*
79 | 
80 | 5. Commit all the changes, push, and make a pull request
81 | 
82 | .. code-block:: console
83 | 
84 |     git add -p
85 |     git commit -m 'Bump to YOUR VERSION'
86 |     git push origin head
87 | 
88 | 
89 | 6. Merge the pull request
90 | 
91 | 7. Create a new release
92 | 
93 |    - Open https://github.com/soxoj/maigret/releases/new again
94 |    - Click `Choose a tag`
95 |    - Enter the actual version in the format `v0.4.0`
96 |    - Also enter the actual version in the field `Release title`
97 |    - Click `Create new tag`
98 |    - Press `+ Auto-generate release notes`
99 |    - **Press the "Publish release" button**
100 | 
101 | 8. That's all; now you can simply wait for the push to PyPI. You can monitor it on the Actions page: https://github.com/soxoj/maigret/actions/workflows/python-publish.yml
--------------------------------------------------------------------------------
/src/maigret/docs/source/extracting-information-from-pages.rst:
--------------------------------------------------------------------------------
 1 | .. _extracting-information-from-pages:
 2 | 
 3 | Extracting information from pages
 4 | =================================
 5 | Maigret can parse URLs and the content of web pages to extract info about the account owner and other meta information.
 6 | 
 7 | You must specify the URL with the option ``--parse``; it can be a link to an account or to an online document. For the list of supported sites, `see here `_.
 8 | 
 9 | After the parsing phase ends, Maigret will start the search phase using the :doc:`supported identifiers <supported-identifier-types>` found (usernames, ids, etc.).
10 | 
11 | Examples
12 | --------
13 | .. code-block:: console
14 | 
15 |     $ maigret --parse https://docs.google.com/spreadsheets/d/1HtZKMLRXNsZ0HjtBmo0Gi03nUPiJIA4CC4jTYbCAnXw/edit\#gid\=0
16 | 
17 |     Scanning webpage by URL https://docs.google.com/spreadsheets/d/1HtZKMLRXNsZ0HjtBmo0Gi03nUPiJIA4CC4jTYbCAnXw/edit#gid=0...
18 |     ┣╸org_name: Gooten
19 |     ┗╸mime_type: application/vnd.google-apps.ritz
20 |     Scanning webpage by URL https://clients6.google.com/drive/v2beta/files/1HtZKMLRXNsZ0HjtBmo0Gi03nUPiJIA4CC4jTYbCAnXw?fields=alternateLink%2CcopyRequiresWriterPermission%2CcreatedDate%2Cdescription%2CdriveId%2CfileSize%2CiconLink%2Cid%2Clabels(starred%2C%20trashed)%2ClastViewedByMeDate%2CmodifiedDate%2Cshared%2CteamDriveId%2CuserPermission(id%2Cname%2CemailAddress%2Cdomain%2Crole%2CadditionalRoles%2CphotoLink%2Ctype%2CwithLink)%2Cpermissions(id%2Cname%2CemailAddress%2Cdomain%2Crole%2CadditionalRoles%2CphotoLink%2Ctype%2CwithLink)%2Cparents(id)%2Ccapabilities(canMoveItemWithinDrive%2CcanMoveItemOutOfDrive%2CcanMoveItemOutOfTeamDrive%2CcanAddChildren%2CcanEdit%2CcanDownload%2CcanComment%2CcanMoveChildrenWithinDrive%2CcanRename%2CcanRemoveChildren%2CcanMoveItemIntoTeamDrive)%2Ckind&supportsTeamDrives=true&enforceSingleParent=true&key=AIzaSyC1eQ1xj69IdTMeii5r7brs3R90eck-m7k...
21 |     ┣╸created_at: 2016-02-16T18:51:52.021Z
22 |     ┣╸updated_at: 2019-10-23T17:15:47.157Z
23 |     ┣╸gaia_id: 15696155517366416778
24 |     ┣╸fullname: Nadia Burgess
25 |     ┣╸email: nadia@gooten.com
26 |     ┣╸image: https://lh3.googleusercontent.com/a-/AOh14GheZe1CyNa3NeJInWAl70qkip4oJ7qLsD8vDy6X=s64
27 |     ┗╸email_username: nadia
28 | 
29 | .. code-block:: console
30 | 
31 |     $ maigret.py --parse https://steamcommunity.com/profiles/76561199113454789
32 |     Scanning webpage by URL https://steamcommunity.com/profiles/76561199113454789...
33 |     ┣╸steam_id: 76561199113454789
34 |     ┣╸nickname: Pok
35 |     ┗╸username: Machine42
36 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/features.rst:
--------------------------------------------------------------------------------
 1 | .. _features:
 2 | 
 3 | Features
 4 | ========
 5 | 
 6 | This is the list of Maigret features.
 7 | 
 8 | Personal info gathering
 9 | -----------------------
10 | 
11 | Maigret does the `parsing of account webpages and extraction `_ of personal info, links to other profiles, etc.
12 | Extracted info is displayed as an additional result in the CLI output and as tables in HTML and PDF reports.
13 | Also, Maigret uses the found ids and usernames from links to start a recursive search.
14 | 
15 | Enabled by default, can be disabled with ``--no-extracting``.
16 | 
17 | Recursive search
18 | ----------------
19 | 
20 | Maigret can extract some :ref:`common ids <supported-identifier-types>` and usernames from links on the account page (people often place links to their other accounts) and immediately start new searches. All the gathered information will be displayed in the CLI output and reports.
21 | 
22 | Enabled by default, can be disabled with ``--no-recursion``.
23 | 
24 | Reports
25 | -------
26 | 
27 | Maigret currently supports HTML, PDF, TXT, XMind 8 mindmap, and JSON reports.
28 | 
29 | HTML/PDF reports contain:
30 | 
31 | - the profile photo
32 | - all the gathered personal info
33 | - additional information about supposed personal data (full name, gender, location), resulting from statistics of all found accounts
34 | 
35 | Also, there is a short text report in the CLI output after the end of the search phase.
36 | 
37 | **Warning**: XMind 8 mindmaps are incompatible with XMind 2022!
38 | 
39 | Tags
40 | ----
41 | 
42 | The Maigret site database is very big (and will get bigger), and it may be overhead to run a search across all the sites.
43 | Also, it is often hard to tell which sites are more interesting in the case of a certain person.
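
For example, a country- or topic-scoped run looks like this (the commands mirror the :doc:`usage examples <usage-examples>` page; ``machine42`` is the sample username used throughout these docs):

.. code-block:: console

    maigret machine42 --tags us,jp
    maigret machine42 --tags coding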
44 | 
45 | Tags markup allows selecting a subset of sites by interests (photo, messaging, finance, etc.) or by country. Tags of found accounts are grouped and displayed in the reports.
46 | 
47 | See the full description :doc:`in the Tags page <tags>`.
48 | 
49 | Censorship and captcha detection
50 | --------------------------------
51 | 
52 | Maigret can detect common errors such as censorship stub pages, CloudFlare captcha pages, and others.
53 | If you get more than 3% errors of a certain type in a session, you'll get a warning message in the CLI output with recommendations for improving performance and avoiding problems.
54 | 
55 | Retries
56 | -------
57 | 
58 | Maigret will retry requests that failed with temporary errors (connection failures, proxy errors, etc.).
59 | 
60 | One attempt by default; can be changed with the option ``--retries N``.
61 | 
62 | Archives and mirrors checking
63 | -----------------------------
64 | 
65 | The Maigret database contains not only the original websites, but also mirrors, archives, and aggregators. For example:
66 | 
67 | - `Reddit BigData search `_
68 | - `Picuki `_, an Instagram mirror
69 | - `Twitter shadowban `_ checker
70 | 
71 | It allows getting additional info about the person and checking the existence of the account even if the main site is unavailable (bot protection, captcha, etc.)
72 | 
73 | Simple API
74 | ----------
75 | 
76 | Maigret can be easily integrated with the use of the Python package `maigret `_.
77 | 
78 | Example: the official `Telegram bot `_
79 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. _index:
 2 | 
 3 | Welcome to the Maigret docs!
 4 | ============================
 5 | 
 6 | **Maigret** is an easy-to-use and powerful OSINT tool for collecting a dossier on a person by username only.
 7 | 
 8 | This is achieved by checking for accounts on a huge number of sites and gathering all the available information from web pages.
 9 | 
10 | The project's main goal is to give OSINT researchers and pentesters a **universal tool** to get maximum information about a subject and to integrate it with other tools in automation pipelines.
11 | 
12 | You may be interested in:
13 | -------------------------
14 | - :doc:`Command line options description <command-line-options>` and :doc:`usage examples <usage-examples>`
15 | - :doc:`Features list <features>`
16 | - :doc:`Project roadmap <roadmap>`
17 | 
18 | .. toctree::
19 |     :hidden:
20 |     :caption: Sections
21 | 
22 |     command-line-options
23 |     extracting-information-from-pages
24 |     features
25 |     philosophy
26 |     roadmap
27 |     supported-identifier-types
28 |     tags
29 |     usage-examples
30 |     settings
31 |     development
32 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/philosophy.rst:
--------------------------------------------------------------------------------
 1 | .. _philosophy:
 2 | 
 3 | Philosophy
 4 | ==========
 5 | 
 6 | TL;DR: Username => Dossier
 7 | 
 8 | Maigret is designed to gather all the available information about a person by their username.
 9 | 
10 | What kind of information is this? First, links to the person's accounts. Secondly, all the machine-extractable
11 | pieces of info, such as: other usernames, full name, URLs of the person's images, birthday, location (country,
12 | city, etc.), gender.
13 | 
14 | All this information forms a dossier, and it is also useful for other tools and analytical purposes.
15 | Each collected piece of data has a label of a certain format (for example, ``follower_count`` for the number
16 | of subscribers or ``created_at`` for the account creation time) so that it can be parsed and analyzed by various
17 | systems and stored in databases.
18 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/roadmap.rst:
--------------------------------------------------------------------------------
 1 | .. _roadmap:
 2 | 
 3 | Roadmap
 4 | =======
 5 | 
 6 | .. figure:: https://i.imgur.com/kk8cFdR.png
 7 |     :target: https://i.imgur.com/kk8cFdR.png
 8 |     :align: center
 9 | 
10 | Current status
11 | --------------
12 | 
13 | - Sites DB stats - ok
14 | - Scan sessions stats - ok
15 | - Site engine autodetect - ok
16 | - Engines for all the sites - WIP
17 | - Unified reporting flow - ok
18 | - Retries - ok
19 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/settings.rst:
--------------------------------------------------------------------------------
 1 | .. _settings:
 2 | 
 3 | Settings
 4 | ==============
 5 | 
 6 | Options are also configurable through settings files. See the
 7 | `settings JSON file `_
 8 | for the list of currently supported options.
 9 | 
10 | On startup Maigret tries to load configuration from the following sources, in exactly this order:
11 | 
12 | .. code-block:: console
13 | 
14 |     # relative path, based on the installed package path
15 |     resources/settings.json
16 | 
17 |     # absolute path, configuration file in the home directory
18 |     ~/.maigret/settings.json
19 | 
20 |     # relative path, based on the current working directory
21 |     settings.json
22 | 
23 | Missing any of these files is not an error.
24 | If the next settings file contains an already-known option,
25 | that option will be overwritten. So it is possible to make
26 | custom configurations for different users and directories.
27 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/supported-identifier-types.rst:
--------------------------------------------------------------------------------
 1 | .. _supported-identifier-types:
 2 | 
 3 | Supported identifier types
 4 | ==========================
 5 | 
 6 | Maigret can search not only by ordinary usernames, but also by certain common identifiers. Below is a list of all currently supported identifiers.
 7 | 
 8 | - **gaia_id** - Google inner numeric user identifier; in former times it was placed in a Google Plus account URL.
 9 | - **steam_id** - Steam inner numeric user identifier.
10 | - **wikimapia_uid** - Wikimapia.org inner numeric user identifier.
11 | - **uidme_uguid** - uID.me inner numeric user identifier.
12 | - **yandex_public_id** - Yandex sites inner letter-based user identifier. See also: `YaSeeker `_.
13 | - **vk_id** - VK.com inner numeric user identifier.
14 | - **ok_id** - OK.ru inner numeric user identifier.
15 | - **yelp_userid** - Yelp inner user identifier.
16 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/tags.rst:
--------------------------------------------------------------------------------
 1 | .. _tags:
 2 | 
 3 | Tags
 4 | ====
 5 | 
 6 | The use of tags allows you to select a subset of sites from the big Maigret DB for a search.
 7 | 
 8 | **Warning: the tags markup is not stable yet.**
 9 | 
10 | There are several types of tags:
11 | 
12 | 1. **Country codes**: ``us``, ``jp``, ``br``... (`ISO 3166-1 alpha-2 `_).
   These tags reflect the site language and the regional origin of its users, and are then used to locate the owner of a username. If the regional origin is difficult to establish, or a site is positioned as worldwide, `no country code is given`. There can be multiple country-code tags for one site.
13 | 
14 | 2. **Site engines**. Most of them are forum engines now: ``uCoz``, ``vBulletin``, ``XenForo`` et al. The full list of engines is stored in the Maigret database.
15 | 
16 | 3. **Sites' subject/type and the interests of its users**. For the moment, the full list of "standard" tags is only `present in the source code `_.
17 | 
18 | Usage
19 | -----
20 | ``--tags us,jp`` -- search on US and Japanese sites (actually marked as such in the Maigret database)
21 | 
22 | ``--tags coding`` -- search on sites related to software development.
23 | 
24 | ``--tags ucoz`` -- search on uCoz sites only (mostly CIS countries)
25 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/usage-examples.rst:
--------------------------------------------------------------------------------
 1 | .. _usage-examples:
 2 | 
 3 | Usage examples
 4 | ==============
 5 | 
 6 | Start a search for accounts with username ``machine42`` on the top 500 sites from the Maigret DB.
 7 | 
 8 | .. code-block:: console
 9 | 
10 |     maigret machine42
11 | 
12 | Start a search for accounts with username ``machine42`` on **all sites** from the Maigret DB.
13 | 
14 | .. code-block:: console
15 | 
16 |     maigret machine42 -a
17 | 
18 | Start a search [...] and generate HTML and PDF reports.
19 | 
20 | .. code-block:: console
21 | 
22 |     maigret machine42 -a -HP
23 | 
24 | Start a search for accounts with username ``machine42`` only on Facebook.
25 | 
26 | .. code-block:: console
27 | 
28 |     maigret machine42 --site Facebook
29 | 
30 | Extract information from a Steam page by URL and start a search for accounts with the found username ``machine42``.
31 | 
32 | .. code-block:: console
33 | 
34 |     maigret --parse https://steamcommunity.com/profiles/76561199113454789
35 | 
36 | Start a search for accounts with username ``machine42`` only on US and Japanese sites.
37 | 
38 | .. code-block:: console
39 | 
40 |     maigret machine42 --tags us,jp
41 | 
42 | Start a search for accounts with username ``machine42`` only on sites related to software development.
43 | 
44 | .. code-block:: console
45 | 
46 |     maigret machine42 --tags coding
47 | 
48 | Start a search for accounts with username ``machine42`` on uCoz sites only (mostly CIS countries).
49 | 
50 | .. code-block:: console
51 | 
52 |     maigret machine42 --tags ucoz
53 | 
54 | 
--------------------------------------------------------------------------------
/src/maigret/maigret.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import asyncio
 3 | import sys
 4 | 
 5 | from maigret.maigret import main
 6 | 
 7 | 
 8 | def run():
 9 |     try:
10 |         if sys.version_info.minor >= 10:
11 |             asyncio.run(main())
12 |         else:
13 |             loop = asyncio.get_event_loop()
14 |             loop.run_until_complete(main())
15 |     except KeyboardInterrupt:
16 |         print('Maigret is interrupted.')
17 |         sys.exit(1)
18 | 
19 | 
20 | if __name__ == "__main__":
21 |     run()
22 | 
--------------------------------------------------------------------------------
/src/maigret/maigret/__init__.py:
--------------------------------------------------------------------------------
 1 | """Maigret"""
 2 | 
 3 | __title__ = 'Maigret'
 4 | __package__ = 'maigret'
 5 | __author__ = 'Soxoj'
 6 | __author_email__ = 'soxoj@protonmail.com'
 7 | 
 8 | 
 9 | from .__version__ import __version__
10 | from .checking import maigret as search
11 | from .maigret import main as cli
12 | from .sites import MaigretEngine, MaigretSite, MaigretDatabase
13 | from .notify import QueryNotifyPrint as Notifier
14 | 
--------------------------------------------------------------------------------
/src/maigret/maigret/__main__.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python3
 2 | 
 3 | """
 4 | Maigret entrypoint
 5 | """
 6 | 
 7 | import asyncio
 8 | 
 9 | from .maigret import main
10 | 
11 | if __name__ == "__main__":
12 |     asyncio.run(main())
13 | 
--------------------------------------------------------------------------------
/src/maigret/maigret/__version__.py:
--------------------------------------------------------------------------------
1 | """Maigret version file"""
2 | 
3 | __version__ = '0.4.4'
4 | 
--------------------------------------------------------------------------------
/src/maigret/maigret/activation.py:
--------------------------------------------------------------------------------
 1 | from http.cookiejar import MozillaCookieJar
 2 | from http.cookies import Morsel
 3 | 
 4 | from aiohttp import CookieJar
 5 | 
 6 | 
 7 | class ParsingActivator:
 8 |     @staticmethod
 9 |     def twitter(site, logger, cookies={}):
10 |         headers = dict(site.headers)
11 |         del headers["x-guest-token"]
12 |         import requests
13 | 
14 |         r = requests.post(site.activation["url"], headers=headers)
15 |         logger.info(r)
16 |         j = r.json()
17 |         guest_token = j[site.activation["src"]]
18 |         site.headers["x-guest-token"] = guest_token
19 | 
20 |     @staticmethod
21 |     def vimeo(site, logger, cookies={}):
22 |         headers = dict(site.headers)
23 |         if "Authorization" in headers:
24 |             del headers["Authorization"]
25 |         import requests
26 | 
27 |         r = requests.get(site.activation["url"], headers=headers)
28 |         jwt_token = r.json()["jwt"]
29 |         site.headers["Authorization"] = "jwt " + jwt_token
30 | 
31 |     @staticmethod
32 |     def spotify(site, logger, cookies={}):
33 |         headers = dict(site.headers)
34 |         if "Authorization" in headers:
35 |             del headers["Authorization"]
36 |         import requests
37 | 
38 |         r = requests.get(site.activation["url"])
39 |         bearer_token = r.json()["accessToken"]
40 |         site.headers["authorization"] = f"Bearer {bearer_token}"
41 | 
42 | 
43 | def import_aiohttp_cookies(cookiestxt_filename):
44 |     cookies_obj = MozillaCookieJar(cookiestxt_filename)
45 |     cookies_obj.load(ignore_discard=True, ignore_expires=True)
46 | 
47 |     cookies = CookieJar()
48 | 
49 |     cookies_list = []
50 |     for domain in cookies_obj._cookies.values():
51 |         for key, cookie in list(domain.values())[0].items():
52 |             c = Morsel()
53 |             c.set(key, cookie.value, cookie.value)
54 |             c["domain"] = cookie.domain
55 |             c["path"] = cookie.path
56 |             cookies_list.append((key, c))
57 | 
58 |     cookies.update_cookies(cookies_list)
59 | 
60 |     return cookies
61 | 
--------------------------------------------------------------------------------
/src/maigret/maigret/errors.py:
--------------------------------------------------------------------------------
  1 | from typing import Dict, List, Any
  2 | 
  3 | from .result import QueryResult
  4 | from .types import QueryResultWrapper
  5 | 
  6 | 
  7 | # error got as a result of completed search query
  8 | class CheckError:
  9 |     _type = 'Unknown'
 10 |     _desc = ''
 11 | 
 12 |     def __init__(self, typename, desc=''):
 13 |         self._type = typename
 14 |         self._desc = desc
 15 | 
 16 |     def __str__(self):
 17 |         if not self._desc:
 18 |             return f'{self._type} error'
 19 | 
 20 |         return f'{self._type} error: {self._desc}'
 21 | 
 22 |     @property
 23 |     def type(self):
 24 |         return self._type
 25 | 
 26 |     @property
 27 |     def desc(self):
 28 |         return self._desc
 29 | 
 30 | 
 31 | COMMON_ERRORS = {
 32 |     'Attention Required! | Cloudflare': CheckError(
 33 |         'Captcha', 'Cloudflare'
 34 |     ),
 35 |     'Please stand by, while we are checking your browser': CheckError(
 36 |         'Bot protection', 'Cloudflare'
 37 |     ),
 38 |     'Checking your browser before accessing': CheckError(
 39 |         'Bot protection', 'Cloudflare'
 40 |     ),
 41 |     'This website is using a security service to protect itself from online attacks.': CheckError(
 42 |         'Access denied', 'Cloudflare'
 43 |     ),
 44 |     'Доступ ограничен': CheckError('Censorship', 'Rostelecom'),
 45 |     'document.getElementById(\'validate_form_submit\').disabled=true': CheckError(
 46 |         'Captcha', 'Mail.ru'
 47 |     ),
 48 |     'Verifying your browser, please wait... DDoS Protection by Blazingfast.io': CheckError(
 49 |         'Bot protection', 'Blazingfast'
 50 |     ),
 51 |     '404 Мы не нашли страницу': CheckError(
 52 |         'Resolving', 'MegaFon 404 page'
 53 |     ),
 54 |     'Доступ к информационному ресурсу ограничен на основании Федерального закона': CheckError(
 55 |         'Censorship', 'MGTS'
 56 |     ),
 57 |     'Incapsula incident ID': CheckError('Bot protection', 'Incapsula'),
 58 |     'Сайт заблокирован хостинг-провайдером': CheckError(
 59 |         'Site-specific', 'Site is disabled (Beget)'
 60 |     ),
 61 | }
 62 | 
 63 | ERRORS_TYPES = {
 64 |     'Captcha': 'Try to switch to another IP address or to use service cookies',
 65 |     'Bot protection': 'Try to switch to another IP address',
 66 |     'Censorship': 'Switch to another internet service provider',
 67 |     'Request timeout': 'Try to increase timeout or to switch to another internet service provider',
 68 |     'Connecting failure': 'Try to decrease number of parallel connections (e.g. -n 10)',
 69 | }
 70 | 
 71 | # TODO: checking for reason
 72 | ERRORS_REASONS = {
 73 |     'Login required': 'Add authorization cookies through `--cookies-jar-file` (see cookies.txt)',
 74 | }
 75 | 
 76 | TEMPORARY_ERRORS_TYPES = [
 77 |     'Request timeout',
 78 |     'Unknown',
 79 |     'Request failed',
 80 |     'Connecting failure',
 81 |     'HTTP',
 82 |     'Proxy',
 83 |     'Interrupted',
 84 |     'Connection lost',
 85 | ]
 86 | 
 87 | THRESHOLD = 3  # percent
 88 | 
 89 | 
 90 | def is_important(err_data):
 91 |     return err_data['perc'] >= THRESHOLD
 92 | 
 93 | 
 94 | def is_permanent(err_type):
 95 |     return err_type not in TEMPORARY_ERRORS_TYPES
 96 | 
 97 | 
 98 | def detect(text):
 99 |     for flag, err in COMMON_ERRORS.items():
100 |         if flag in text:
101 |             return err
102 |     return None
103 | 
104 | 
105 | def solution_of(err_type) -> str:
106 |     return ERRORS_TYPES.get(err_type, '')
107 | 
108 | 
109 | def extract_and_group(search_res: QueryResultWrapper) -> List[Dict[str, Any]]:
110 |     errors_counts: Dict[str, int] = {}
111 |     for r in search_res.values():
112 |         if r and isinstance(r, dict) and r.get('status'):
113 |             if not isinstance(r['status'], QueryResult):
114 |                 continue
115 | 
116 |             err = r['status'].error
117 |             if not err:
118 |                 continue
119 |             errors_counts[err.type] = errors_counts.get(err.type, 0) + 1
120 | 
121 |     counts = []
122 |     for err, count in sorted(errors_counts.items(), key=lambda x: x[1], reverse=True):
123 |         counts.append(
124 |             {
125 |                 'err': err,
126 |                 'count': count,
127 |                 'perc': round(count / len(search_res), 2) * 100,
128 |             }
129 |         )
130 | 
131 |     return counts
132 | 
--------------------------------------------------------------------------------
/src/maigret/maigret/resources/simple_report_pdf.css:
--------------------------------------------------------------------------------
 1 | h2 {
 2 |     font-size: 30px;
 3 |     width: 100%;
 4 |     display:block;
 5 | }
 6 | h3 {
 7 |     font-size: 25px;
 8 |     width: 100%;
 9 |     display:block;
10 | }
11 | h4 {
12 |     font-size: 20px;
13 |     width: 100%;
14 |     display:block;
15 | }
16 | p {
17 |     margin: 0 0 5px;
18 |     display: block;
19 | }
20 | 
21 | 
22 | table {
23 |     margin-bottom: 10px;
24 |     width:100%;
25 | }
26 | th {
27 |     font-weight: bold;
28 | }
29 | th,td,caption {
30 |     padding: 4px 10px 4px 5px;
31 | }
32 | table tr:nth-child(even) td,
33 | table tr.even td {
34 |     background-color: #e5ecf9;
35 | }
36 | 
37 | div {
38 |     border-bottom-color: #3e3e3e;
39 |     border-bottom-width: 1px;
40 |     border-bottom-style: solid;
41 | }
42 | .invalid-button {
43 |     position: absolute;
44 |     left: 10px;
45 | }
--------------------------------------------------------------------------------
/src/maigret/maigret/result.py:
--------------------------------------------------------------------------------
  1 | """Maigret Result Module
Module 2 | 3 | This module defines various objects for recording the results of queries. 4 | """ 5 | from enum import Enum 6 | 7 | 8 | class QueryStatus(Enum): 9 | """Query Status Enumeration. 10 | 11 | Describes status of query about a given username. 12 | """ 13 | 14 | CLAIMED = "Claimed" # Username Detected 15 | AVAILABLE = "Available" # Username Not Detected 16 | UNKNOWN = "Unknown" # Error Occurred While Trying To Detect Username 17 | ILLEGAL = "Illegal" # Username Not Allowable For This Site 18 | 19 | def __str__(self): 20 | """Convert Object To String. 21 | 22 | Keyword Arguments: 23 | self -- This object. 24 | 25 | Return Value: 26 | Nicely formatted string to get information about this object. 27 | """ 28 | return self.value 29 | 30 | 31 | class QueryResult: 32 | """Query Result Object. 33 | 34 | Describes result of query about a given username. 35 | """ 36 | 37 | def __init__( 38 | self, 39 | username, 40 | site_name, 41 | site_url_user, 42 | status, 43 | ids_data=None, 44 | query_time=None, 45 | context=None, 46 | error=None, 47 | tags=[], 48 | ): 49 | """Create Query Result Object. 50 | 51 | Contains information about a specific method of detecting usernames on 52 | a given type of web sites. 53 | 54 | Keyword Arguments: 55 | self -- This object. 56 | username -- String indicating username that query result 57 | was about. 58 | site_name -- String which identifies site. 59 | site_url_user -- String containing URL for username on site. 60 | NOTE: The site may or may not exist: this 61 | just indicates what the name would 62 | be, if it existed. 63 | status -- Enumeration of type QueryStatus() indicating 64 | the status of the query. 65 | query_time -- Time (in seconds) required to perform query. 66 | Default of None. 67 | context -- String indicating any additional context 68 | about the query. For example, if there was 69 | an error, this might indicate the type of 70 | error that occurred. 71 | Default of None. 72 | ids_data -- Extracted from website page info about other 73 | usernames and inner ids. 74 | 75 | Return Value: 76 | Nothing. 77 | """ 78 | 79 | self.username = username 80 | self.site_name = site_name 81 | self.site_url_user = site_url_user 82 | self.status = status 83 | self.query_time = query_time 84 | self.context = context 85 | self.ids_data = ids_data 86 | self.tags = tags 87 | self.error = error 88 | 89 | def json(self): 90 | return { 91 | "username": self.username, 92 | "site_name": self.site_name, 93 | "url": self.site_url_user, 94 | "status": str(self.status), 95 | "ids": self.ids_data or {}, 96 | "tags": self.tags, 97 | } 98 | 99 | def is_found(self): 100 | return self.status == QueryStatus.CLAIMED 101 | 102 | def __str__(self): 103 | """Convert Object To String. 104 | 105 | Keyword Arguments: 106 | self -- This object. 107 | 108 | Return Value: 109 | Nicely formatted string to get information about this object. 110 | """ 111 | status = str(self.status) 112 | if self.context is not None: 113 | # There is extra context information available about the results. 114 | # Append it to the normal response text. 
115 | status += f" ({self.context})" 116 | 117 | return status 118 | -------------------------------------------------------------------------------- /src/maigret/maigret/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as path 3 | import json 4 | from typing import List 5 | 6 | SETTINGS_FILES_PATHS = [ 7 | path.join(path.dirname(path.realpath(__file__)), "resources/settings.json"), 8 | '~/.maigret/settings.json', 9 | path.join(os.getcwd(), 'settings.json'), 10 | ] 11 | 12 | 13 | class Settings: 14 | # main maigret settings 15 | retries_count: int 16 | sites_db_path: str 17 | timeout: int 18 | max_connections: int 19 | recursive_search: bool 20 | info_extracting: bool 21 | cookie_jar_file: str 22 | ignore_ids_list: List 23 | reports_path: str 24 | proxy_url: str 25 | tor_proxy_url: str 26 | i2p_proxy_url: str 27 | domain_search: bool 28 | scan_all_sites: bool 29 | top_sites_count: int 30 | scan_disabled_sites: bool 31 | scan_sites_list: List 32 | self_check_enabled: bool 33 | print_not_found: bool 34 | print_check_errors: bool 35 | colored_print: bool 36 | show_progressbar: bool 37 | report_sorting: str 38 | json_report_type: str 39 | txt_report: bool 40 | csv_report: bool 41 | xmind_report: bool 42 | pdf_report: bool 43 | html_report: bool 44 | graph_report: bool 45 | 46 | # submit mode settings 47 | presence_strings: list 48 | supposed_usernames: list 49 | 50 | def __init__(self): 51 | pass 52 | 53 | def load(self, paths=None): 54 | was_inited = False 55 | 56 | if not paths: 57 | paths = SETTINGS_FILES_PATHS 58 | 59 | for filename in paths: 60 | data = {} 61 | 62 | try: 63 | with open(filename, "r", encoding="utf-8") as file: 64 | data = json.load(file) 65 | except FileNotFoundError: 66 | # treat as a normal situation 67 | pass 68 | except Exception as error: 69 | return False, ValueError( 70 | f"Problem with parsing json contents of " 71 | f"settings file '{filename}': {str(error)}." 72 | ) 73 | 74 | self.__dict__.update(data) 75 | if data: 76 | was_inited = True 77 | 78 | return ( 79 | was_inited, 80 | f'None of the default settings files found: {", ".join(paths)}', 81 | ) 82 | 83 | @property 84 | def json(self): 85 | return self.__dict__ 86 | -------------------------------------------------------------------------------- /src/maigret/maigret/types.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, List, Dict, Tuple, Any 2 | 3 | 4 | # search query 5 | QueryDraft = Tuple[Callable, List, Dict] 6 | 7 | # options dict 8 | QueryOptions = Dict[str, Any] 9 | 10 | # TODO: throw out 11 | QueryResultWrapper = Dict[str, Any] 12 | -------------------------------------------------------------------------------- /src/maigret/maigret/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | import ast 3 | import difflib 4 | import re 5 | import random 6 | from typing import Any 7 | 8 | 9 | DEFAULT_USER_AGENTS = [ 10 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36", 11 | ] 12 | 13 | 14 | class CaseConverter: 15 | @staticmethod 16 | def camel_to_snake(camelcased_string: str) -> str: 17 | return re.sub(r"(?<!^)(?=[A-Z])", "_", camelcased_string).lower() 18 | 19 | @staticmethod 20 | def snake_to_camel(snakecased_string: str) -> str:
21 | formatted = "".join(word.title() for word in snakecased_string.split("_")) 22 | result = formatted[0].lower() + formatted[1:] 23 | return result 24 | 25 | @staticmethod 26 | def snake_to_title(snakecased_string: str) -> str: 27 | words = snakecased_string.split("_") 28 | words[0] = words[0].title() 29 | return " ".join(words) 30 | 31 | 32 | def is_country_tag(tag: str) -> bool: 33 | """detect if tag represent a country""" 34 | return bool(re.match("^([a-zA-Z]){2}$", tag)) or tag == "global" 35 | 36 | 37 | def enrich_link_str(link: str) -> str: 38 | link = link.strip() 39 | if link.startswith("www.") or (link.startswith("http") and "//" in link): 40 | return f'<a href="{link}">{link}</a>' 41 | return link 42 | 43 | 44 | class URLMatcher: 45 | _HTTP_URL_RE_STR = "^https?://(www.|m.)?(.+)$" 46 | HTTP_URL_RE = re.compile(_HTTP_URL_RE_STR) 47 | UNSAFE_SYMBOLS = ".?" 48 | 49 | @classmethod 50 | def extract_main_part(self, url: str) -> str: 51 | match = self.HTTP_URL_RE.search(url) 52 | if match and match.group(2): 53 | return match.group(2).rstrip("/") 54 | 55 | return "" 56 | 57 | @classmethod 58 | def make_profile_url_regexp(self, url: str, username_regexp: str = ""): 59 | url_main_part = self.extract_main_part(url) 60 | for c in self.UNSAFE_SYMBOLS: 61 | url_main_part = url_main_part.replace(c, f"\\{c}") 62 | prepared_username_regexp = (username_regexp or ".+?").lstrip('^').rstrip('$') 63 | 64 | url_regexp = url_main_part.replace( 65 | "{username}", f"({prepared_username_regexp})" 66 | ) 67 | regexp_str = self._HTTP_URL_RE_STR.replace("(.+)", url_regexp) 68 | 69 | return re.compile(regexp_str, re.IGNORECASE) 70 | 71 | 72 | def ascii_data_display(data: str) -> Any: 73 | return ast.literal_eval(data) 74 | 75 | 76 | def get_dict_ascii_tree(items, prepend="", new_line=True): 77 | new_result = b'\xe2\x94\x9c'.decode() 78 | new_line = b'\xe2\x94\x80'.decode() 79 | last_result = b'\xe2\x94\x94'.decode() 80 | skip_result = b'\xe2\x94\x82'.decode() 81 | 82 | text = "" 83 | for num, item in enumerate(items): 84 | box_symbol = ( 85 | new_result + new_line if num != len(items) - 1 else last_result + new_line 86 | ) 87 | 88 | if type(item) == tuple: 89 | field_name, field_value = item 90 | if field_value.startswith("['"): 91 | is_last_item = num == len(items) - 1 92 | prepend_symbols = " " * 3 if is_last_item else f" {skip_result} " 93 | data = ascii_data_display(field_value) 94 | field_value = get_dict_ascii_tree(data, prepend_symbols) 95 | text += f"\n{prepend}{box_symbol}{field_name}: {field_value}" 96 | else: 97 | text += f"\n{prepend}{box_symbol} {item}" 98 | 99 | if not new_line: 100 | text = text[1:] 101 | 102 | return text 103 | 104 | 105 | def get_random_user_agent(): 106 | return random.choice(DEFAULT_USER_AGENTS) 107 | 108 | 109 | def get_match_ratio(base_strs: list): 110 | def get_match_inner(s: str): 111 | return round( 112 | max( 113 | [ 114 | difflib.SequenceMatcher(a=s.lower(), b=s2.lower()).ratio() 115 | for s2 in base_strs 116 | ] 117 | ), 118 | 2, 119 | ) 120 | 121 | return get_match_inner 122 | -------------------------------------------------------------------------------- /src/maigret/pyinstaller/maigret_standalone.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import asyncio 3 | 4 | import maigret 5 | 6 | if __name__ == "__main__": 7 | asyncio.run(maigret.cli()) -------------------------------------------------------------------------------- /src/maigret/pyinstaller/requirements.txt: 
-------------------------------------------------------------------------------- 1 | maigret @ https://github.com/soxoj/maigret/archive/refs/heads/main.zip 2 | pefile==2022.5.30 3 | psutil==5.9.5 4 | pyinstaller @ https://github.com/pyinstaller/pyinstaller/archive/develop.zip 5 | pywin32-ctypes==0.2.0 -------------------------------------------------------------------------------- /src/maigret/pytest.ini: -------------------------------------------------------------------------------- 1 | # pytest.ini 2 | [pytest] 3 | filterwarnings = 4 | error 5 | ignore::UserWarning 6 | asyncio_mode=auto -------------------------------------------------------------------------------- /src/maigret/requirements.txt: -------------------------------------------------------------------------------- 1 | aiodns==3.0.0 2 | aiohttp==3.8.3 3 | aiohttp-socks==0.7.1 4 | arabic-reshaper==2.1.4 5 | async-timeout==4.0.2 6 | attrs==22.2.0 7 | certifi==2022.12.7 8 | chardet==5.0.0 9 | colorama==0.4.6 10 | future==0.18.3 11 | future-annotations==1.0.0 12 | html5lib==1.1 13 | idna==3.4 14 | Jinja2==3.1.2 15 | lxml==4.9.2 16 | MarkupSafe==2.1.1 17 | mock==4.0.3 18 | multidict==6.0.4 19 | pycountry==22.3.5 20 | PyPDF2==2.10.8 21 | PySocks==1.7.1 22 | python-bidi==0.4.2 23 | requests==2.28.2 24 | requests-futures==1.0.0 25 | six==1.16.0 26 | socid-extractor>=0.0.21 27 | soupsieve==2.3.2.post1 28 | stem==1.8.1 29 | torrequest==0.1.0 30 | tqdm==4.65.0 31 | typing-extensions==4.5.0 32 | webencodings==0.5.1 33 | xhtml2pdf==0.2.8 34 | XMind==1.2.0 35 | yarl==1.8.2 36 | networkx==2.6.3 37 | pyvis==0.2.1 38 | reportlab==3.6.12 39 | cloudscraper==1.2.66 40 | -------------------------------------------------------------------------------- /src/maigret/setup.cfg: -------------------------------------------------------------------------------- 1 | [egg_info] 2 | tag_build = 3 | tag_date = 0 4 | 5 | [flake8] 6 | per-file-ignores = __init__.py:F401 7 | 8 | [mypy] 9 | ignore_missing_imports = True -------------------------------------------------------------------------------- /src/maigret/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import ( 2 | setup, 3 | find_packages, 4 | ) 5 | 6 | 7 | with open('README.md') as fh: 8 | long_description = fh.read() 9 | 10 | with open('requirements.txt') as rf: 11 | requires = rf.read().splitlines() 12 | 13 | setup(name='maigret', 14 | version='0.4.4', 15 | description='Collect a dossier on a person by username from a huge number of sites', 16 | long_description=long_description, 17 | long_description_content_type="text/markdown", 18 | url='https://github.com/soxoj/maigret', 19 | install_requires=requires, 20 | entry_points={'console_scripts': ['maigret = maigret.maigret:run']}, 21 | packages=find_packages(exclude=["tests*"]), 22 | include_package_data=True, 23 | author='Soxoj', 24 | author_email='soxoj@protonmail.com', 25 | license='MIT', 26 | zip_safe=False) 27 | -------------------------------------------------------------------------------- /src/maigret/snapcraft.yaml: -------------------------------------------------------------------------------- 1 | name: maigret2 2 | adopt-info: maigret2 3 | summary: SOCMINT / Instagram 4 | description: | 5 | Test Test Test 6 | 7 | license: MIT 8 | 9 | base: core20 10 | grade: stable 11 | confinement: strict 12 | compression: lzo 13 | 14 | architectures: 15 | - build-on: amd64 16 | 17 | apps: 18 | maigret2: 19 | command: bin/maigret 20 | environment: 21 | LC_ALL: C.UTF-8 22 | plugs: 23 | - home 24 
| - network 25 | 26 | parts: 27 | maigret2: 28 | plugin: python 29 | source: https://github.com/soxoj/maigret 30 | source-type: git 31 | 32 | build-packages: 33 | - python3-pip 34 | - python3-six 35 | - python3 36 | 37 | stage-packages: 38 | - python3 39 | - python3-six 40 | 41 | override-pull: | 42 | snapcraftctl pull 43 | snapcraftctl set-version "$(git describe --tags | sed 's/^v//' | cut -d "-" -f1)" 44 | -------------------------------------------------------------------------------- /src/maigret/static/chat_gitter.svg: -------------------------------------------------------------------------------- 1 | chaton gitter -------------------------------------------------------------------------------- /src/maigret/static/maigret.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/maigret/static/maigret.png -------------------------------------------------------------------------------- /src/maigret/static/report_alexaimephotography_html_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/maigret/static/report_alexaimephotography_html_screenshot.png -------------------------------------------------------------------------------- /src/maigret/static/report_alexaimephotography_xmind_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/maigret/static/report_alexaimephotography_xmind_screenshot.png -------------------------------------------------------------------------------- /src/maigret/static/report_alexaimephotographycars.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/maigret/static/report_alexaimephotographycars.pdf -------------------------------------------------------------------------------- /src/maigret/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/maigret/tests/__init__.py -------------------------------------------------------------------------------- /src/maigret/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import logging 3 | import os 4 | 5 | import pytest 6 | from _pytest.mark import Mark 7 | 8 | from maigret.sites import MaigretDatabase 9 | from maigret.maigret import setup_arguments_parser 10 | from maigret.settings import Settings 11 | 12 | 13 | CUR_PATH = os.path.dirname(os.path.realpath(__file__)) 14 | JSON_FILE = os.path.join(CUR_PATH, '../maigret/resources/data.json') 15 | SETTINGS_FILE = os.path.join(CUR_PATH, '../maigret/resources/settings.json') 16 | TEST_JSON_FILE = os.path.join(CUR_PATH, 'db.json') 17 | LOCAL_TEST_JSON_FILE = os.path.join(CUR_PATH, 'local.json') 18 | empty_mark = Mark('', (), {}) 19 | 20 | 21 | def by_slow_marker(item): 22 | return item.get_closest_marker('slow', default=empty_mark) 23 | 24 | 25 | def pytest_collection_modifyitems(items): 26 | items.sort(key=by_slow_marker, reverse=False) 27 | 28 | 29 | def get_test_reports_filenames(): 30 | return 
glob.glob(os.path.join('report_*'), recursive=False) 31 | 32 | 33 | def remove_test_reports(): 34 | reports_list = get_test_reports_filenames() 35 | for f in reports_list: 36 | os.remove(f) 37 | logging.error(f'Removed test reports {reports_list}') 38 | 39 | 40 | @pytest.fixture(scope='session') 41 | def default_db(): 42 | return MaigretDatabase().load_from_file(JSON_FILE) 43 | 44 | 45 | @pytest.fixture(scope='function') 46 | def test_db(): 47 | return MaigretDatabase().load_from_file(TEST_JSON_FILE) 48 | 49 | 50 | @pytest.fixture(scope='function') 51 | def local_test_db(): 52 | return MaigretDatabase().load_from_file(LOCAL_TEST_JSON_FILE) 53 | 54 | 55 | @pytest.fixture(autouse=True) 56 | def reports_autoclean(): 57 | remove_test_reports() 58 | yield 59 | remove_test_reports() 60 | 61 | 62 | @pytest.fixture(scope='session') 63 | def argparser(): 64 | settings = Settings() 65 | settings.load([SETTINGS_FILE]) 66 | return setup_arguments_parser(settings) 67 | 68 | 69 | @pytest.fixture(scope="session") 70 | def httpserver_listen_address(): 71 | return ("localhost", 8989) 72 | -------------------------------------------------------------------------------- /src/maigret/tests/db.json: -------------------------------------------------------------------------------- 1 | { 2 | "engines": {}, 3 | "sites": { 4 | "GooglePlayStore": { 5 | "tags": ["global", "us"], 6 | "disabled": false, 7 | "checkType": "status_code", 8 | "alexaRank": 1, 9 | "url": "https://play.google.com/store/apps/developer?id={username}", 10 | "urlMain": "https://play.google.com/store", 11 | "usernameClaimed": "Facebook_nosuchname", 12 | "usernameUnclaimed": "noonewouldeverusethis7" 13 | }, 14 | "Reddit": { 15 | "tags": ["news", "social", "us"], 16 | "checkType": "status_code", 17 | "presenseStrs": ["totalKarma"], 18 | "disabled": true, 19 | "alexaRank": 17, 20 | "url": "https://www.reddit.com/user/{username}", 21 | "urlMain": "https://www.reddit.com/", 22 | "usernameClaimed": "blue", 23 | "usernameUnclaimed": "noonewouldeverusethis7" 24 | } 25 | } 26 | } -------------------------------------------------------------------------------- /src/maigret/tests/local.json: -------------------------------------------------------------------------------- 1 | { 2 | "engines": {}, 3 | "sites": { 4 | "StatusCode": { 5 | "checkType": "status_code", 6 | "url": "http://localhost:8989/url?id={username}", 7 | "urlMain": "http://localhost:8989/", 8 | "usernameClaimed": "claimed", 9 | "usernameUnclaimed": "unclaimed" 10 | }, 11 | "Message": { 12 | "checkType": "message", 13 | "url": "http://localhost:8989/url?id={username}", 14 | "urlMain": "http://localhost:8989/", 15 | "presenseStrs": ["user", "profile"], 16 | "absenseStrs": ["not found", "404"], 17 | "usernameClaimed": "claimed", 18 | "usernameUnclaimed": "unclaimed" 19 | } 20 | } 21 | } -------------------------------------------------------------------------------- /src/maigret/tests/test_activation.py: -------------------------------------------------------------------------------- 1 | """Maigret activation test functions""" 2 | import json 3 | 4 | import aiohttp 5 | import pytest 6 | from mock import Mock 7 | 8 | from maigret.activation import ParsingActivator, import_aiohttp_cookies 9 | 10 | COOKIES_TXT = """# HTTP Cookie File downloaded with cookies.txt by Genuinous @genuinous 11 | # This file can be used by wget, curl, aria2c and other standard compliant tools. 
12 | # Usage Examples: 13 | # 1) wget -x --load-cookies cookies.txt "https://xss.is/search/" 14 | # 2) curl --cookie cookies.txt "https://xss.is/search/" 15 | # 3) aria2c --load-cookies cookies.txt "https://xss.is/search/" 16 | # 17 | xss.is FALSE / TRUE 0 xf_csrf test 18 | xss.is FALSE / TRUE 1642709308 xf_user tset 19 | .xss.is TRUE / FALSE 0 muchacho_cache test 20 | .xss.is TRUE / FALSE 1924905600 132_evc test 21 | httpbin.org FALSE / FALSE 0 a b 22 | """ 23 | 24 | 25 | @pytest.mark.skip(reason="periodically fails") 26 | @pytest.mark.slow 27 | def test_twitter_activation(default_db): 28 | twitter_site = default_db.sites_dict['Twitter'] 29 | token1 = twitter_site.headers['x-guest-token'] 30 | 31 | ParsingActivator.twitter(twitter_site, Mock()) 32 | token2 = twitter_site.headers['x-guest-token'] 33 | 34 | assert token1 != token2 35 | 36 | 37 | @pytest.mark.asyncio 38 | async def test_import_aiohttp_cookies(): 39 | cookies_filename = 'cookies_test.txt' 40 | with open(cookies_filename, 'w') as f: 41 | f.write(COOKIES_TXT) 42 | 43 | cookie_jar = import_aiohttp_cookies(cookies_filename) 44 | assert list(cookie_jar._cookies.keys()) == ['xss.is', 'httpbin.org'] 45 | 46 | url = 'https://httpbin.org/cookies' 47 | connector = aiohttp.TCPConnector(ssl=False) 48 | session = aiohttp.ClientSession( 49 | connector=connector, trust_env=True, cookie_jar=cookie_jar 50 | ) 51 | 52 | response = await session.get(url=url) 53 | result = json.loads(await response.content.read()) 54 | await session.close() 55 | 56 | assert result == {'cookies': {'a': 'b'}} 57 | -------------------------------------------------------------------------------- /src/maigret/tests/test_checking.py: -------------------------------------------------------------------------------- 1 | from mock import Mock 2 | import pytest 3 | 4 | from maigret import search 5 | 6 | 7 | def site_result_except(server, username, **kwargs): 8 | query = f'id={username}' 9 | server.expect_request('/url', query_string=query).respond_with_data(**kwargs) 10 | 11 | 12 | @pytest.mark.slow 13 | @pytest.mark.asyncio 14 | async def test_checking_by_status_code(httpserver, local_test_db): 15 | sites_dict = local_test_db.sites_dict 16 | 17 | site_result_except(httpserver, 'claimed', status=200) 18 | site_result_except(httpserver, 'unclaimed', status=404) 19 | 20 | result = await search('claimed', site_dict=sites_dict, logger=Mock()) 21 | assert result['StatusCode']['status'].is_found() is True 22 | 23 | result = await search('unclaimed', site_dict=sites_dict, logger=Mock()) 24 | assert result['StatusCode']['status'].is_found() is False 25 | 26 | 27 | @pytest.mark.slow 28 | @pytest.mark.asyncio 29 | async def test_checking_by_message_positive_full(httpserver, local_test_db): 30 | sites_dict = local_test_db.sites_dict 31 | 32 | site_result_except(httpserver, 'claimed', response_data="user profile") 33 | site_result_except(httpserver, 'unclaimed', response_data="404 not found") 34 | 35 | result = await search('claimed', site_dict=sites_dict, logger=Mock()) 36 | assert result['Message']['status'].is_found() is True 37 | 38 | result = await search('unclaimed', site_dict=sites_dict, logger=Mock()) 39 | assert result['Message']['status'].is_found() is False 40 | 41 | 42 | @pytest.mark.slow 43 | @pytest.mark.asyncio 44 | async def test_checking_by_message_positive_part(httpserver, local_test_db): 45 | sites_dict = local_test_db.sites_dict 46 | 47 | site_result_except(httpserver, 'claimed', response_data="profile") 48 | site_result_except(httpserver, 'unclaimed', 
response_data="404") 49 | 50 | result = await search('claimed', site_dict=sites_dict, logger=Mock()) 51 | assert result['Message']['status'].is_found() is True 52 | 53 | result = await search('unclaimed', site_dict=sites_dict, logger=Mock()) 54 | assert result['Message']['status'].is_found() is False 55 | 56 | 57 | @pytest.mark.slow 58 | @pytest.mark.asyncio 59 | async def test_checking_by_message_negative(httpserver, local_test_db): 60 | sites_dict = local_test_db.sites_dict 61 | 62 | site_result_except(httpserver, 'claimed', response_data="") 63 | site_result_except(httpserver, 'unclaimed', response_data="user 404") 64 | 65 | result = await search('claimed', site_dict=sites_dict, logger=Mock()) 66 | assert result['Message']['status'].is_found() is False 67 | 68 | result = await search('unclaimed', site_dict=sites_dict, logger=Mock()) 69 | assert result['Message']['status'].is_found() is True 70 | -------------------------------------------------------------------------------- /src/maigret/tests/test_cli.py: -------------------------------------------------------------------------------- 1 | """Maigret command-line arguments parsing tests""" 2 | from argparse import Namespace 3 | from typing import Dict, Any 4 | 5 | DEFAULT_ARGS: Dict[str, Any] = { 6 | 'all_sites': False, 7 | 'connections': 100, 8 | 'cookie_file': None, 9 | 'csv': False, 10 | 'db_file': 'resources/data.json', 11 | 'debug': False, 12 | 'disable_extracting': False, 13 | 'disable_recursive_search': False, 14 | 'folderoutput': 'reports', 15 | 'html': False, 16 | 'graph': False, 17 | 'id_type': 'username', 18 | 'ignore_ids_list': [], 19 | 'info': False, 20 | 'json': '', 21 | 'new_site_to_submit': False, 22 | 'no_color': False, 23 | 'no_progressbar': False, 24 | 'parse_url': '', 25 | 'pdf': False, 26 | 'print_check_errors': False, 27 | 'print_not_found': False, 28 | 'proxy': None, 29 | 'reports_sorting': 'default', 30 | 'retries': 1, 31 | 'self_check': False, 32 | 'site_list': [], 33 | 'stats': False, 34 | 'tags': '', 35 | 'timeout': 30, 36 | 'tor_proxy': 'socks5://127.0.0.1:9050', 37 | 'i2p_proxy': 'http://127.0.0.1:4444', 38 | 'top_sites': 500, 39 | 'txt': False, 40 | 'use_disabled_sites': False, 41 | 'username': [], 42 | 'verbose': False, 43 | 'with_domains': False, 44 | 'xmind': False, 45 | } 46 | 47 | 48 | def test_args_search_mode(argparser): 49 | args = argparser.parse_args('username'.split()) 50 | 51 | assert args.username == ['username'] 52 | 53 | want_args = dict(DEFAULT_ARGS) 54 | want_args.update({'username': ['username']}) 55 | 56 | assert args == Namespace(**want_args) 57 | 58 | 59 | def test_args_search_mode_several_usernames(argparser): 60 | args = argparser.parse_args('username1 username2'.split()) 61 | 62 | assert args.username == ['username1', 'username2'] 63 | 64 | want_args = dict(DEFAULT_ARGS) 65 | want_args.update({'username': ['username1', 'username2']}) 66 | 67 | assert args == Namespace(**want_args) 68 | 69 | 70 | def test_args_self_check_mode(argparser): 71 | args = argparser.parse_args('--self-check --site GitHub'.split()) 72 | 73 | want_args = dict(DEFAULT_ARGS) 74 | want_args.update( 75 | { 76 | 'self_check': True, 77 | 'site_list': ['GitHub'], 78 | 'username': [], 79 | } 80 | ) 81 | 82 | assert args == Namespace(**want_args) 83 | 84 | 85 | def test_args_multiple_sites(argparser): 86 | args = argparser.parse_args( 87 | '--site GitHub VK --site PornHub --site Taringa,Steam'.split() 88 | ) 89 | 90 | want_args = dict(DEFAULT_ARGS) 91 | want_args.update( 92 | { 93 | 'site_list': ['GitHub', 'PornHub', 
'Taringa,Steam'], 94 | 'username': ['VK'], 95 | } 96 | ) 97 | 98 | assert args == Namespace(**want_args) 99 | -------------------------------------------------------------------------------- /src/maigret/tests/test_data.py: -------------------------------------------------------------------------------- 1 | """Maigret data test functions""" 2 | 3 | from maigret.utils import is_country_tag 4 | 5 | 6 | def test_tags_validity(default_db): 7 | unknown_tags = set() 8 | 9 | tags = default_db._tags 10 | 11 | for site in default_db.sites: 12 | for tag in filter(lambda x: not is_country_tag(x), site.tags): 13 | if tag not in tags: 14 | unknown_tags.add(tag) 15 | 16 | assert unknown_tags == set() 17 | -------------------------------------------------------------------------------- /src/maigret/tests/test_executors.py: -------------------------------------------------------------------------------- 1 | """Maigret checking logic test functions""" 2 | import pytest 3 | import asyncio 4 | import logging 5 | from maigret.executors import ( 6 | AsyncioSimpleExecutor, 7 | AsyncioProgressbarExecutor, 8 | AsyncioProgressbarSemaphoreExecutor, 9 | AsyncioProgressbarQueueExecutor, 10 | ) 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | async def func(n): 16 | await asyncio.sleep(0.1 * (n % 3)) 17 | return n 18 | 19 | 20 | @pytest.mark.asyncio 21 | async def test_simple_asyncio_executor(): 22 | tasks = [(func, [n], {}) for n in range(10)] 23 | executor = AsyncioSimpleExecutor(logger=logger) 24 | assert await executor.run(tasks) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 25 | assert executor.execution_time > 0.2 26 | assert executor.execution_time < 0.3 27 | 28 | 29 | @pytest.mark.asyncio 30 | async def test_asyncio_progressbar_executor(): 31 | tasks = [(func, [n], {}) for n in range(10)] 32 | 33 | executor = AsyncioProgressbarExecutor(logger=logger) 34 | # no guarantees for the results order 35 | assert sorted(await executor.run(tasks)) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 36 | assert executor.execution_time > 0.2 37 | assert executor.execution_time < 0.3 38 | 39 | 40 | @pytest.mark.asyncio 41 | async def test_asyncio_progressbar_semaphore_executor(): 42 | tasks = [(func, [n], {}) for n in range(10)] 43 | 44 | executor = AsyncioProgressbarSemaphoreExecutor(logger=logger, in_parallel=5) 45 | # no guarantees for the results order 46 | assert sorted(await executor.run(tasks)) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 47 | assert executor.execution_time > 0.2 48 | assert executor.execution_time < 0.4 49 | 50 | 51 | @pytest.mark.asyncio 52 | async def test_asyncio_progressbar_queue_executor(): 53 | tasks = [(func, [n], {}) for n in range(10)] 54 | 55 | executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=2) 56 | assert await executor.run(tasks) == [0, 1, 3, 2, 4, 6, 7, 5, 9, 8] 57 | assert executor.execution_time > 0.5 58 | assert executor.execution_time < 0.6 59 | 60 | executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=3) 61 | assert await executor.run(tasks) == [0, 3, 1, 4, 6, 2, 7, 9, 5, 8] 62 | assert executor.execution_time > 0.4 63 | assert executor.execution_time < 0.5 64 | 65 | executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=5) 66 | assert await executor.run(tasks) in ( 67 | [0, 3, 6, 1, 4, 7, 9, 2, 5, 8], 68 | [0, 3, 6, 1, 4, 9, 7, 2, 5, 8], 69 | ) 70 | assert executor.execution_time > 0.3 71 | assert executor.execution_time < 0.4 72 | 73 | executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=10) 74 | assert await executor.run(tasks) == [0, 
3, 6, 9, 1, 4, 7, 2, 5, 8] 75 | assert executor.execution_time > 0.2 76 | assert executor.execution_time < 0.3 77 | -------------------------------------------------------------------------------- /src/maigret/tests/test_notify.py: -------------------------------------------------------------------------------- 1 | from maigret.errors import CheckError 2 | from maigret.notify import QueryNotifyPrint 3 | from maigret.result import QueryStatus, QueryResult 4 | 5 | 6 | def test_notify_illegal(): 7 | n = QueryNotifyPrint(color=False) 8 | 9 | assert ( 10 | n.update( 11 | QueryResult( 12 | username="test", 13 | status=QueryStatus.ILLEGAL, 14 | site_name="TEST_SITE", 15 | site_url_user="http://example.com/test", 16 | ) 17 | ) 18 | == "[-] TEST_SITE: Illegal Username Format For This Site!" 19 | ) 20 | 21 | 22 | def test_notify_claimed(): 23 | n = QueryNotifyPrint(color=False) 24 | 25 | assert ( 26 | n.update( 27 | QueryResult( 28 | username="test", 29 | status=QueryStatus.CLAIMED, 30 | site_name="TEST_SITE", 31 | site_url_user="http://example.com/test", 32 | ) 33 | ) 34 | == "[+] TEST_SITE: http://example.com/test" 35 | ) 36 | 37 | 38 | def test_notify_available(): 39 | n = QueryNotifyPrint(color=False) 40 | 41 | assert ( 42 | n.update( 43 | QueryResult( 44 | username="test", 45 | status=QueryStatus.AVAILABLE, 46 | site_name="TEST_SITE", 47 | site_url_user="http://example.com/test", 48 | ) 49 | ) 50 | == "[-] TEST_SITE: Not found!" 51 | ) 52 | 53 | 54 | def test_notify_unknown(): 55 | n = QueryNotifyPrint(color=False) 56 | result = QueryResult( 57 | username="test", 58 | status=QueryStatus.UNKNOWN, 59 | site_name="TEST_SITE", 60 | site_url_user="http://example.com/test", 61 | ) 62 | result.error = CheckError('Type', 'Reason') 63 | 64 | assert n.update(result) == "[?] 
TEST_SITE: Type error: Reason" 65 | -------------------------------------------------------------------------------- /src/maigret/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/maigret/utils/__init__.py -------------------------------------------------------------------------------- /src/maigret/utils/add_tags.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import random 3 | from argparse import ArgumentParser, RawDescriptionHelpFormatter 4 | 5 | from maigret.maigret import MaigretDatabase 6 | from maigret.submit import Submitter 7 | 8 | 9 | def update_tags(site): 10 | tags = [] 11 | if not site.tags: 12 | print(f'Site {site.name} doesn\'t have tags') 13 | else: 14 | tags = site.tags 15 | print(f'Site {site.name} tags: ' + ', '.join(tags)) 16 | 17 | print(f'URL: {site.url_main}') 18 | 19 | new_tags = set(input('Enter new tags: ').split(', ')) 20 | if "disabled" in new_tags: 21 | new_tags.remove("disabled") 22 | site.disabled = True 23 | 24 | print(f'Old alexa rank: {site.alexa_rank}') 25 | rank = Submitter.get_alexa_rank(site.url_main) 26 | if rank: 27 | print(f'New alexa rank: {rank}') 28 | site.alexa_rank = rank 29 | 30 | site.tags = [x for x in list(new_tags) if x] 31 | 32 | 33 | if __name__ == '__main__': 34 | parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter 35 | ) 36 | parser.add_argument("--base","-b", metavar="BASE_FILE", 37 | dest="base_file", default="maigret/resources/data.json", 38 | help="JSON file with sites data to update.") 39 | parser.add_argument("--name", help="Name of site to check") 40 | 41 | pool = list() 42 | 43 | args = parser.parse_args() 44 | 45 | db = MaigretDatabase() 46 | db.load_from_file(args.base_file).sites 47 | 48 | while True: 49 | if args.name: 50 | sites = list(db.ranked_sites_dict(names=[args.name]).values()) 51 | site = random.choice(sites) 52 | else: 53 | site = random.choice(db.sites) 54 | 55 | if site.engine == 'uCoz': 56 | continue 57 | 58 | # if not 'in' in site.tags: 59 | # continue 60 | 61 | update_tags(site) 62 | 63 | db.save_to_file(args.base_file) -------------------------------------------------------------------------------- /src/maigret/utils/sites_diff.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import difflib 3 | import requests 4 | 5 | 6 | a = requests.get(sys.argv[1]).text 7 | b = requests.get(sys.argv[2]).text 8 | 9 | 10 | tokens_a = set(a.split('"')) 11 | tokens_b = set(b.split('"')) 12 | 13 | a_minus_b = tokens_a.difference(tokens_b) 14 | b_minus_a = tokens_b.difference(tokens_a) 15 | 16 | print(a_minus_b) 17 | print(b_minus_a) 18 | 19 | print(len(a_minus_b)) 20 | print(len(b_minus_a)) 21 | 22 | desired_strings = ["username", "not found", "пользователь", "profile", "lastname", "firstname", "biography", 23 | "birthday", "репутация", "информация", "e-mail"] 24 | 25 | 26 | def get_match_ratio(x): 27 | return round(max([ 28 | difflib.SequenceMatcher(a=x.lower(), b=y).ratio() 29 | for y in desired_strings 30 | ]), 2) 31 | 32 | 33 | RATIO = 0.6 34 | 35 | print(sorted(a_minus_b, key=get_match_ratio, reverse=True)[:10]) 36 | print(sorted(b_minus_a, key=get_match_ratio, reverse=True)[:10]) -------------------------------------------------------------------------------- /src/maigret/wizard.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import asyncio 3 | import logging 4 | import maigret 5 | 6 | 7 | # top popular sites from the Maigret database 8 | TOP_SITES_COUNT = 300 9 | # Maigret HTTP requests timeout 10 | TIMEOUT = 10 11 | # max parallel requests 12 | MAX_CONNECTIONS = 50 13 | 14 | 15 | if __name__ == '__main__': 16 | # setup logging and asyncio 17 | logger = logging.getLogger('maigret') 18 | logger.setLevel(logging.WARNING) 19 | loop = asyncio.get_event_loop() 20 | 21 | # setup Maigret 22 | db = maigret.MaigretDatabase().load_from_file('./maigret/resources/data.json') 23 | # also can be downloaded from web 24 | # db = MaigretDatabase().load_from_url(MAIGRET_DB_URL) 25 | 26 | # user input 27 | username = input('Enter username to search: ') 28 | 29 | sites_count_raw = input( 30 | f'Select the number of sites to search ({TOP_SITES_COUNT} for default, {len(db.sites_dict)} max): ' 31 | ) 32 | sites_count = int(sites_count_raw) or TOP_SITES_COUNT 33 | 34 | sites = db.ranked_sites_dict(top=sites_count) 35 | 36 | show_progressbar_raw = input('Do you want to show a progressbar? [Yn] ') 37 | show_progressbar = show_progressbar_raw.lower() != 'n' 38 | 39 | extract_info_raw = input( 40 | 'Do you want to extract additional info from accounts\' pages? [Yn] ' 41 | ) 42 | extract_info = extract_info_raw.lower() != 'n' 43 | 44 | use_notifier_raw = input( 45 | 'Do you want to use notifier for displaying results while searching? [Yn] ' 46 | ) 47 | use_notifier = use_notifier_raw.lower() != 'n' 48 | 49 | notifier = None 50 | if use_notifier: 51 | notifier = maigret.Notifier(print_found_only=True, skip_check_errors=True) 52 | 53 | # search! 54 | search_func = maigret.search( 55 | username=username, 56 | site_dict=sites, 57 | timeout=TIMEOUT, 58 | logger=logger, 59 | max_connections=MAX_CONNECTIONS, 60 | query_notify=notifier, 61 | no_progressbar=(not show_progressbar), 62 | is_parsing_enabled=extract_info, 63 | ) 64 | 65 | results = loop.run_until_complete(search_func) 66 | 67 | input('Search completed. Press any key to show results.') 68 | 69 | for sitename, data in results.items(): 70 | is_found = data['status'].is_found() 71 | print(f'{sitename} - {"Found!" 
if is_found else "Not found"}') 72 | -------------------------------------------------------------------------------- /src/recopilacion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/recopilacion/__init__.py -------------------------------------------------------------------------------- /src/recopilacion/extraccion.py: -------------------------------------------------------------------------------- 1 | def procesar_resultados(): 2 | pass -------------------------------------------------------------------------------- /src/riesgos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/riesgos/__init__.py -------------------------------------------------------------------------------- /src/sherlock/.dockerignore: -------------------------------------------------------------------------------- 1 | .git/ 2 | .vscode/ 3 | screenshot/ 4 | tests/ 5 | *.txt 6 | !/requirements.txt 7 | venv/ 8 | 9 | -------------------------------------------------------------------------------- /src/sherlock/.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 2 6 | end_of_line = lf 7 | charset = utf-8 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | curly_bracket_next_line = false 11 | spaces_around_operators = true 12 | 13 | [*.{markdown,md}] 14 | trim_trailing_whitespace = false 15 | 16 | [*.py] 17 | indent_size = 4 18 | quote_type = double 19 | -------------------------------------------------------------------------------- /src/sherlock/.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Report a bug in Sherlock's functionality 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | 18 | 19 | 20 | ## Checklist 21 | 25 | 26 | - [ ] I'm reporting a bug in Sherlock's functionality 27 | - [ ] The bug I'm reporting is not a false positive or a false negative 28 | - [ ] I've verified that I'm running the latest version of Sherlock 29 | - [ ] I've checked for similar bug reports including closed ones 30 | - [ ] I've checked for pull requests that attempt to fix this bug 31 | 32 | ## Description 33 | 37 | 38 | WRITE DESCRIPTION HERE 39 | -------------------------------------------------------------------------------- /src/sherlock/.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Request a new functionality for Sherlock 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | 18 | 19 | ## Checklist 20 | 24 | - [ ] I'm reporting a feature request 25 | - [ ] I've checked for similar feature requests including closed ones 26 | 27 | ## Description 28 | 31 | 32 | WRITE DESCRIPTION HERE 33 | -------------------------------------------------------------------------------- /src/sherlock/.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask us a question 4 | title: '' 5 | labels: question 6 | assignees: '' 7 | 8 | --- 9 | 10 | 18 | 19 | ## 
Checklist 20 | 24 | - [ ] I'm asking a question regarding Sherlock 25 | - [ ] My question is not a tech support question. 26 | 27 | **We are not your tech support**. 28 | If you have questions related to `pip`, `git`, or something that is not related to Sherlock, please ask them on [Stack Overflow](https://stackoverflow.com/) or [r/learnpython](https://www.reddit.com/r/learnpython/) 29 | 30 | 31 | ## Question 32 | 33 | ASK YOUR QUESTION HERE 34 | -------------------------------------------------------------------------------- /src/sherlock/.github/ISSUE_TEMPLATE/reporting-false-negative.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Reporting false negative 3 | about: Reporting a site that is returning false positives 4 | title: '' 5 | labels: false negative 6 | assignees: '' 7 | 8 | --- 9 | 10 | 18 | 19 | ## Checklist 20 | 24 | - [ ] I'm reporting a website that is returning **false negative** results 25 | - [ ] I've checked for similar site support requests including closed ones 26 | - [ ] I've checked for pull requests attempting to fix this false negative 27 | - [ ] I'm only reporting **one** site (create a separate issue for each site) 28 | 29 | ## Description 30 | 33 | 34 | WRITE DESCRIPTION HERE 35 | -------------------------------------------------------------------------------- /src/sherlock/.github/ISSUE_TEMPLATE/reporting-false-positive.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Reporting false positive 3 | about: Reporting a site that is returning false positives 4 | title: '' 5 | labels: false positive 6 | assignees: '' 7 | 8 | --- 9 | 10 | 18 | 19 | ## Checklist 20 | 24 | - [ ] I'm reporting a website that is returning **false positive** results 25 | - [ ] I've checked for similar site support requests including closed ones 26 | - [ ] I've checked for pull requests attempting to fix this false positive 27 | - [ ] I'm only reporting **one** site (create a separate issue for each site) 28 | 29 | ## Description 30 | 33 | 34 | WRITE DESCRIPTION HERE 35 | -------------------------------------------------------------------------------- /src/sherlock/.github/ISSUE_TEMPLATE/site-support-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Site support request 3 | about: Request support for a new site 4 | title: '' 5 | labels: site support request 6 | assignees: '' 7 | 8 | --- 9 | 10 | 18 | 19 | ## Checklist 20 | 24 | 25 | - [ ] I'm requesting support for a new site 26 | - [ ] I've checked for similar site support requests including closed ones 27 | - [ ] I've checked that the site I am requesting has not been removed in the past and is not documented in [removed_sites.md](https://github.com/sherlock-project/sherlock/blob/master/removed_sites.md) 28 | - [ ] The site I am requesting support for is not a pornographic website 29 | - [ ] I'm only requesting support of **one** website (create a separate issue for each site) 30 | 31 | ## Description 32 | 36 | 37 | URL: 38 | -------------------------------------------------------------------------------- /src/sherlock/.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | matrix: 12 | python-version: [3.7, 3.8, 3.9, "3.10", 3.11] 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | - 
name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v4 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Install Dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install ruff flake8 pytest 24 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 25 | - name: Lint with ruff 26 | run: | 27 | # stop the build if there are Python syntax errors or undefined names 28 | ruff . --format=github --select=E9,F63,F7,F82 29 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 30 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 31 | - name: Sherlock Site Detect Tests 32 | run: | 33 | cd sherlock && python -m unittest tests.all.SherlockDetectTests --verbose 34 | -------------------------------------------------------------------------------- /src/sherlock/.github/workflows/nightly.yml: -------------------------------------------------------------------------------- 1 | name: Nightly 2 | 3 | on: 4 | schedule: 5 | # Run Nightly Tests At 3AM (The Hour Of The Wolf) Every Day 6 | - cron: '0 3 * * *' 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | python-version: [3.x] 14 | 15 | steps: 16 | - uses: actions/checkout@v3 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v4 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - name: Install Dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 25 | - name: Sherlock Site Coverage Tests 26 | run: | 27 | cd sherlock && python -m unittest tests.all.SherlockSiteCoverageTests --verbose 28 | -------------------------------------------------------------------------------- /src/sherlock/.github/workflows/pull_request.yml: -------------------------------------------------------------------------------- 1 | name: Pull Request Action 2 | 3 | on: 4 | pull_request: 5 | branches: [ master ] 6 | 7 | jobs: 8 | getchange: 9 | runs-on: ubuntu-latest 10 | outputs: 11 | matrix: ${{ steps.changes.outputs.matrix }} 12 | steps: 13 | - id: changes 14 | run: | 15 | URL="https://api.github.com/repos/sherlock-project/sherlock/pulls/${{ github.event.pull_request.number }}/files" 16 | FILES=$(curl -s -X GET -G $URL | jq -r '.[] | .filename') 17 | if echo $FILES | grep -q ".json"; then 18 | echo "::set-output name=matrix::{\"include\":[{\"python\":\"3.x\"}]}" 19 | else 20 | echo "::set-output name=matrix::{\"include\":[{\"python\":\"3.7\"},{\"python\":\"3.8\"}]},{\"python\":\"3.9\"},{\"python\":\"3.10\"}]},{\"python\":\"3.11\"}]}" 21 | fi 22 | build: 23 | needs: [getchange] 24 | runs-on: ubuntu-latest 25 | strategy: 26 | matrix: ${{ fromJson(needs.getchange.outputs.matrix) }} 27 | 28 | steps: 29 | - uses: actions/checkout@v3 30 | - name: Set up Python ${{ matrix.python }} 31 | uses: actions/setup-python@v4 32 | with: 33 | python-version: ${{ matrix.python }} 34 | - name: Install Dependencies 35 | run: | 36 | python -m pip install --upgrade pip 37 | pip install flake8 pytest 38 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 39 | - name: Lint With flake8 40 | run: | 41 | # stop the build if there are Python syntax errors or undefined names 42 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 43 | 44 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 45 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 46 | - name: Sherlock Site Detect Tests 47 | run: | 48 | cd sherlock && python -m unittest tests.all.SherlockDetectTests --verbose 49 | -------------------------------------------------------------------------------- /src/sherlock/.github/workflows/update-site-list.yml: -------------------------------------------------------------------------------- 1 | name: Update Site List 2 | 3 | # Trigger the workflow when changes are pushed to the main branch 4 | # and the changes include the sherlock/resources/data.json file 5 | on: 6 | push: 7 | branches: 8 | - master 9 | paths: 10 | - sherlock/resources/data.json 11 | 12 | jobs: 13 | sync-json-data: 14 | # Use the latest version of Ubuntu as the runner environment 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | # Check out the code at the specified pull request head commit 19 | - name: Checkout code 20 | uses: actions/checkout@v3 21 | with: 22 | ref: ${{ github.event.pull_request.head.sha }} 23 | fetch-depth: 0 24 | 25 | # Install Python 3 26 | - name: Install Python 27 | uses: actions/setup-python@v4 28 | with: 29 | python-version: '3.x' 30 | 31 | # Execute the site_list.py Python script 32 | - name: Execute site_list.py 33 | run: python site_list.py 34 | 35 | # Commit any changes made by the script 36 | - name: Commit files 37 | run: | 38 | git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com" 39 | git config --local user.name "github-actions[bot]" 40 | if ! git diff --exit-code; then 41 | git commit -a -m "Updated Site List" 42 | fi 43 | 44 | # Push the changes to the remote repository 45 | - name: Push changes 46 | uses: ad-m/github-push-action@master 47 | with: 48 | github_token: ${{ secrets.GITHUB_TOKEN }} 49 | branch: ${{ github.ref }} 50 | -------------------------------------------------------------------------------- /src/sherlock/.gitignore: -------------------------------------------------------------------------------- 1 | # Virtual Environment 2 | venv/ 3 | 4 | # Editor Configurations 5 | .vscode/ 6 | .idea/ 7 | 8 | # Python 9 | __pycache__/ 10 | 11 | # Pip 12 | src/ 13 | 14 | # Jupyter Notebook 15 | .ipynb_checkpoints 16 | *.ipynb 17 | 18 | # Output files, except requirements.txt 19 | *.txt 20 | !requirements.txt 21 | 22 | # Comma-Separated Values (CSV) Reports 23 | *.csv 24 | 25 | #XLSX Reports 26 | *.xlsx 27 | 28 | # Excluded sites list 29 | tests/.excluded_sites 30 | 31 | # MacOS Folder Metadata File 32 | .DS_Store 33 | 34 | # Vim swap files 35 | *.swp 36 | -------------------------------------------------------------------------------- /src/sherlock/.replit: -------------------------------------------------------------------------------- 1 | language = "python3" 2 | run = "" 3 | -------------------------------------------------------------------------------- /src/sherlock/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How To Contribute To Sherlock 2 | First off, thank you for the help! 3 | 4 | There are many ways to contribute. Here is some high level grouping. 5 | 6 | ## Adding New Sites 7 | 8 | Please look at the Wiki entry on 9 | [adding new sites](https://github.com/sherlock-project/sherlock/wiki/Adding-Sites-To-Sherlock) 10 | to understand the issues. 11 | 12 | Any new sites that are added need to have a username that has been claimed, and one 13 | that is unclaimed documented in the site data. 
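For illustration, a new site entry documents both usernames roughly like this (the `ExampleSite` entry and all of its values are hypothetical; mirror the exact fields used by existing entries in `sherlock/resources/data.json`):

```json
"ExampleSite": {
  "errorType": "status_code",
  "url": "https://example.com/{}",
  "urlMain": "https://example.com/",
  "username_claimed": "blue",
  "username_unclaimed": "noonewouldeverusethis7"
}
```

Here `username_claimed` must point at a real, existing account, and `username_unclaimed` must be a name known to be free.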
This allows the regression tests 14 | to ensure that everything is working. 15 | 16 | It is required that a contributor test any new sites by either running the full tests, or running 17 | a site-specific query against the claimed and unclaimed usernames. 18 | 19 | It is not required that a contributor run the 20 | [site_list.py](https://github.com/sherlock-project/sherlock/blob/master/site_list.py) 21 | script. 22 | 23 | If there are performance problems with a site (e.g. slow to respond, unreliable uptime, ...), then 24 | the site may be removed from the list. The 25 | [removed_sites.md](https://github.com/sherlock-project/sherlock/blob/master/removed_sites.md) 26 | file contains sites that were included at one time in Sherlock, but had to be removed for 27 | one reason or another. 28 | 29 | ## Adding New Functionality 30 | 31 | Please ensure that the content on your branch passes all tests before submitting a pull request. 32 | -------------------------------------------------------------------------------- /src/sherlock/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim-bullseye as build 2 | WORKDIR /wheels 3 | 4 | COPY requirements.txt /opt/sherlock/ 5 | RUN apt-get update \ 6 | && apt-get install -y build-essential \ 7 | && pip3 wheel -r /opt/sherlock/requirements.txt 8 | 9 | FROM python:3.11-slim-bullseye 10 | WORKDIR /opt/sherlock 11 | 12 | ARG VCS_REF 13 | ARG VCS_URL="https://github.com/sherlock-project/sherlock" 14 | 15 | LABEL org.label-schema.vcs-ref=$VCS_REF \ 16 | org.label-schema.vcs-url=$VCS_URL 17 | 18 | COPY --from=build /wheels /wheels 19 | COPY . /opt/sherlock/ 20 | 21 | RUN pip3 install --no-cache-dir -r requirements.txt -f /wheels \ 22 | && rm -rf /wheels 23 | 24 | WORKDIR /opt/sherlock/sherlock 25 | 26 | ENTRYPOINT ["python", "sherlock.py"] 27 | -------------------------------------------------------------------------------- /src/sherlock/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Sherlock Project 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/sherlock/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | 3 | services: 4 | sherlock: 5 | build: . 
6 | volumes: 7 | - "./results:/opt/sherlock/results" 8 | -------------------------------------------------------------------------------- /src/sherlock/images/preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/sherlock/images/preview.png -------------------------------------------------------------------------------- /src/sherlock/requirements.txt: -------------------------------------------------------------------------------- 1 | certifi>=2019.6.16 2 | colorama>=0.4.1 3 | PySocks>=1.7.0 4 | requests>=2.22.0 5 | requests-futures>=1.0.0 6 | stem>=1.8.0 7 | torrequest>=0.1.0 8 | pandas>=1.0.0 9 | openpyxl<=3.0.10 10 | exrex>=0.11.0 -------------------------------------------------------------------------------- /src/sherlock/sherlock/__init__.py: -------------------------------------------------------------------------------- 1 | """ Sherlock Module 2 | 3 | This module contains the main logic to search for usernames at social 4 | networks. 5 | 6 | """ 7 | -------------------------------------------------------------------------------- /src/sherlock/sherlock/__main__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | """ 4 | Sherlock: Find Usernames Across Social Networks Module 5 | 6 | This module contains the main logic to search for usernames at social 7 | networks. 8 | """ 9 | 10 | import sys 11 | 12 | 13 | if __name__ == "__main__": 14 | # Check if the user is using the correct version of Python 15 | python_version = sys.version.split()[0] 16 | 17 | if sys.version_info < (3, 6): 18 | print("Sherlock requires Python 3.6+\nYou are using Python %s, which is not supported by Sherlock" % (python_version)) 19 | sys.exit(1) 20 | 21 | import sherlock 22 | sherlock.main() 23 | -------------------------------------------------------------------------------- /src/sherlock/sherlock/result.py: -------------------------------------------------------------------------------- 1 | """Sherlock Result Module 2 | 3 | This module defines various objects for recording the results of queries. 4 | """ 5 | from enum import Enum 6 | 7 | 8 | class QueryStatus(Enum): 9 | """Query Status Enumeration. 10 | 11 | Describes status of query about a given username. 12 | """ 13 | CLAIMED = "Claimed" # Username Detected 14 | AVAILABLE = "Available" # Username Not Detected 15 | UNKNOWN = "Unknown" # Error Occurred While Trying To Detect Username 16 | ILLEGAL = "Illegal" # Username Not Allowable For This Site 17 | 18 | def __str__(self): 19 | """Convert Object To String. 20 | 21 | Keyword Arguments: 22 | self -- This object. 23 | 24 | Return Value: 25 | Nicely formatted string to get information about this object. 26 | """ 27 | return self.value 28 | 29 | class QueryResult(): 30 | """Query Result Object. 31 | 32 | Describes result of query about a given username. 33 | """ 34 | def __init__(self, username, site_name, site_url_user, status, 35 | query_time=None, context=None): 36 | """Create Query Result Object. 37 | 38 | Contains information about a specific method of detecting usernames on 39 | a given type of web sites. 40 | 41 | Keyword Arguments: 42 | self -- This object. 43 | username -- String indicating username that query result 44 | was about. 45 | site_name -- String which identifies site. 46 | site_url_user -- String containing URL for username on site. 
47 | NOTE: The site may or may not exist: this 48 | just indicates what the name would 49 | be, if it existed. 50 | status -- Enumeration of type QueryStatus() indicating 51 | the status of the query. 52 | query_time -- Time (in seconds) required to perform query. 53 | Default of None. 54 | context -- String indicating any additional context 55 | about the query. For example, if there was 56 | an error, this might indicate the type of 57 | error that occurred. 58 | Default of None. 59 | 60 | Return Value: 61 | Nothing. 62 | """ 63 | 64 | self.username = username 65 | self.site_name = site_name 66 | self.site_url_user = site_url_user 67 | self.status = status 68 | self.query_time = query_time 69 | self.context = context 70 | 71 | return 72 | 73 | def __str__(self): 74 | """Convert Object To String. 75 | 76 | Keyword Arguments: 77 | self -- This object. 78 | 79 | Return Value: 80 | Nicely formatted string to get information about this object. 81 | """ 82 | status = str(self.status) 83 | if self.context is not None: 84 | # There is extra context information available about the results. 85 | # Append it to the normal response text. 86 | status += f" ({self.context})" 87 | 88 | return status 89 | -------------------------------------------------------------------------------- /src/sherlock/sherlock/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Sherlock Tests 2 | 3 | This package contains various submodules used to run tests. 4 | """ 5 | -------------------------------------------------------------------------------- /src/sherlock/sherlock/tests/test_multiple_usernames.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | sys.path.append('../') 4 | import sherlock as sh 5 | 6 | checksymbols = ["_", "-", "."] 7 | 8 | """Test for multiple usernames. 9 | 10 | This test ensures that the function MultipleUsernames works properly. More specifically, 11 | different scenarios are tested, and only usernames that contain the specific sequence {?} 12 | should return positive. 13 | """ 14 | class TestMultipleUsernames(unittest.TestCase): 15 | def test_area(self): 16 | test_usernames = ["test{?}test", "test{?feo", "test"] 17 | for name in test_usernames: 18 | if sh.CheckForParameter(name): 19 | self.assertEqual(sh.MultipleUsernames(name), ["test_test", "test-test", "test.test"]) 20 | else: 21 | self.assertEqual(name, name) -------------------------------------------------------------------------------- /src/sherlock/site_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # This module generates the listing of supported sites which can be found in 3 | # sites.md.
It also organizes all the sites in alphanumeric order 4 | import json 5 | 6 | # Read the data.json file 7 | with open("sherlock/resources/data.json", "r", encoding="utf-8") as data_file: 8 | data = json.load(data_file) 9 | 10 | # Sort the social networks in alphanumeric order 11 | social_networks = sorted(data.items()) 12 | 13 | # Write the list of supported sites to sites.md 14 | with open("sites.md", "w") as site_file: 15 | site_file.write(f"## List Of Supported Sites ({len(social_networks)} Sites In Total!)\n") 16 | for social_network, info in social_networks: 17 | url_main = info["urlMain"] 18 | is_nsfw = "**(NSFW)**" if info.get("isNSFW") else "" 19 | site_file.write(f"1. ![](https://www.google.com/s2/favicons?domain={url_main}) [{social_network}]({url_main}) {is_nsfw}\n") 20 | 21 | # Overwrite the data.json file with sorted data 22 | with open("sherlock/resources/data.json", "w") as data_file: 23 | sorted_data = json.dumps(data, indent=2, sort_keys=True) 24 | data_file.write(sorted_data) 25 | data_file.write("\n") 26 | 27 | print("Finished updating supported site listing!") 28 | -------------------------------------------------------------------------------- /src/theHarvester/.dockerignore: -------------------------------------------------------------------------------- 1 | .github/* 2 | .gitattributes 3 | .idea/ 4 | .lgtm.yml 5 | mypy.ini 6 | .pytest_cache 7 | .mypy_cache 8 | tests/* 9 | README/ 10 | bin/ -------------------------------------------------------------------------------- /src/theHarvester/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E501, F405, F403, F401, E402, W503 -------------------------------------------------------------------------------- /src/theHarvester/.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # #1492 run `black .` and `isort .` 2 | c13843ec0d513ac7f9c35b7bd0501fa46e356415 -------------------------------------------------------------------------------- /src/theHarvester/.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, which is to have git automatically determine 2 | # whether a file is a text or binary, unless otherwise specified. 3 | 4 | * text=auto 5 | 6 | # Basic .gitattributes for a python repo. 7 | 8 | # Source files 9 | # ============ 10 | *.pxd text diff=python 11 | *.py text diff=python 12 | *.py3 text diff=python 13 | *.pyw text diff=python 14 | *.pyx text diff=python 15 | 16 | # Binary files 17 | # ============ 18 | *.db binary 19 | *.p binary 20 | *.pkl binary 21 | *.pyc binary 22 | *.pyd binary 23 | *.pyo binary 24 | 25 | # Note: .db, .p, and .pkl files are associated with the python modules 26 | # ``pickle``, ``dbm.*``, # ``shelve``, ``marshal``, ``anydbm``, & ``bsddb`` 27 | # (among others). 
28 | -------------------------------------------------------------------------------- /src/theHarvester/.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [L1ghtn1ng, NotoriousRebel] 4 | open_collective: # Replace with a single Open Collective username 5 | ko_fi: # 6 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 7 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 8 | liberapay: # Replace with a single Liberapay username 9 | issuehunt: # Replace with a single IssueHunt username 10 | otechie: # Replace with a single Otechie username 11 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 12 | -------------------------------------------------------------------------------- /src/theHarvester/.github/ISSUE_TEMPLATE/issue-template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Issue Template 3 | about: A template for new issues. 4 | title: "[Bug|Feature Request|Other] Short Description of Issue" 5 | labels: '' 6 | 7 | --- 8 | 9 | ## Note: we do not support installing theHarvester on Android 10 | 11 | **Feature Request, Bug, or Other** 12 | Feature Request | Bug | Other 13 | 14 | **Describe the feature request, bug, or other request** 15 | A clear and concise description of what the bug, feature request, 16 | or other request is. 17 | 18 | **To Reproduce** 19 | Steps to reproduce the behaviour: 20 | 1. Run the tool like this: '...' 21 | 2. See the error 22 | 23 | **Expected behaviour** 24 | A clear and concise description of what you expected to happen. 25 | 26 | **Screenshots** 27 | If possible, please add screenshots to help explain your problem. 28 | 29 | **System Information (System that tool is running on):** 30 | - OS: [e.g. Windows 10] 31 | - Version [e.g. 2.7] 32 | 33 | **Additional context** 34 | Add any other context about the problem here. 35 | -------------------------------------------------------------------------------- /src/theHarvester/.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: github-actions 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | timezone: Europe/London 8 | - package-ecosystem: pip 9 | directory: "/" 10 | schedule: 11 | interval: daily 12 | timezone: Europe/London 13 | open-pull-requests-limit: 10 14 | target-branch: master 15 | allow: 16 | - dependency-type: direct 17 | - dependency-type: indirect 18 | -------------------------------------------------------------------------------- /src/theHarvester/.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages.
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master, dev ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master, dev ] 20 | schedule: 21 | - cron: '19 11 * * 4' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | 28 | strategy: 29 | fail-fast: false 30 | matrix: 31 | language: [ 'python' ] 32 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 33 | # Learn more: 34 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 35 | 36 | steps: 37 | - name: Checkout repository 38 | uses: actions/checkout@v4 39 | 40 | # Initializes the CodeQL tools for scanning. 41 | - name: Initialize CodeQL 42 | uses: github/codeql-action/init@v3 43 | with: 44 | languages: ${{ matrix.language }} 45 | # If you wish to specify custom queries, you can do so here or in a config file. 46 | # By default, queries listed here will override any specified in a config file. 47 | # Prefix the list here with "+" to use these queries and those in the config file. 48 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 49 | 50 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 51 | # If this step fails, then you should remove it and run the build manually (see below) 52 | - name: Autobuild 53 | uses: github/codeql-action/autobuild@v3 54 | 55 | # ℹ️ Command-line programs to run using the OS shell. 56 | # 📚 https://git.io/JvXDl 57 | 58 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 59 | # and modify them (or add more) to build your code if your project 60 | # uses a compiled language 61 | 62 | #- run: | 63 | # make bootstrap 64 | # make release 65 | 66 | - name: Perform CodeQL Analysis 67 | uses: github/codeql-action/analyze@v3 68 | -------------------------------------------------------------------------------- /src/theHarvester/.github/workflows/dockerci.yml: -------------------------------------------------------------------------------- 1 | name: TheHarvester Docker Image CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - name: Build the Docker image 11 | run: docker build . --file Dockerfile --tag theharvester:$(date +%s) -------------------------------------------------------------------------------- /src/theHarvester/.github/workflows/theHarvester.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: TheHarvester Python CI 3 | 4 | on: 5 | push: 6 | branches: 7 | - '*' 8 | 9 | pull_request: 10 | branches: 11 | - '*' 12 | 13 | jobs: 14 | Python: 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | max-parallel: 8 18 | matrix: 19 | os: [ ubuntu-latest, macos-latest ] 20 | python-version: [ 3.10.12, 3.11 ] 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | - name: Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install dependencies 29 | run: | 30 | pip install --upgrade pip 31 | pip install .[dev] 32 | 33 | - name: Lint with black 34 | run: | 35 | black . --diff --check 36 | 37 | - name: Lint with isort 38 | run: | 39 | isort . 
--diff --check 40 | 41 | - name: Lint with flake8 42 | run: | 43 | # stop the build if there are Python syntax errors or undefined names 44 | flake8 . --count --show-source --statistics --config .flake8 45 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 46 | flake8 . --count --exit-zero --max-line-length=127 --statistics --config .flake8 47 | 48 | - name: Test with pytest 49 | run: | 50 | pytest 51 | 52 | - name: Run theHarvester module Anubis 53 | run: | 54 | theHarvester -d apple.com -b anubis 55 | 56 | - name: Run theHarvester module Baidu 57 | run: | 58 | theHarvester -d yale.edu -b baidu 59 | 60 | - name: Run theHarvester module Bing 61 | run: | 62 | theHarvester -d yale.edu -b bing 63 | 64 | - name: Run theHarvester module CertSpotter 65 | run: | 66 | theHarvester -d yale.edu -b certspotter 67 | 68 | - name: Run theHarvester module Crtsh 69 | run: | 70 | theHarvester -d hcl.com -b crtsh 71 | 72 | - name: Run theHarvester module DnsDumpster 73 | run: | 74 | theHarvester -d yale.edu -b dnsdumpster 75 | 76 | - name: Run theHarvester module DuckDuckGo 77 | run: | 78 | theHarvester -d yale.edu -b duckduckgo 79 | 80 | - name: Run theHarvester module HackerTarget 81 | run: | 82 | theHarvester -d yale.edu -b hackertarget 83 | 84 | - name: Run theHarvester module Intelx 85 | run: | 86 | theHarvester -d yale.edu -b intelx 87 | 88 | - name: Run theHarvester module Otx 89 | run: | 90 | theHarvester -d yale.edu -b otx 91 | 92 | - name: Run theHarvester module RapidDns 93 | run: | 94 | theHarvester -d yale.edu -b rapiddns 95 | 96 | - name: Run theHarvester module Threatminer 97 | run: | 98 | theHarvester -d yale.edu -b threatminer 99 | 100 | - name: Run theHarvester module Urlscan 101 | run: | 102 | theHarvester -d yale.edu -b urlscan 103 | 104 | - name: Run theHarvester module Yahoo 105 | run: | 106 | theHarvester -d yale.edu -b yahoo 107 | 108 | - name: Run theHarvester module DNS brute force 109 | run: | 110 | theHarvester -d yale.edu -c 111 | 112 | - name: Static type checking with mypy 113 | run: | 114 | mypy --pretty theHarvester/*/*.py 115 | mypy --pretty theHarvester/*/*/*.py 116 | -------------------------------------------------------------------------------- /src/theHarvester/.gitignore: -------------------------------------------------------------------------------- 1 | *.idea 2 | *.pyc 3 | *.sqlite 4 | *.html 5 | *.htm 6 | *.vscode 7 | *.xml 8 | *.json 9 | debug_results.txt 10 | venv 11 | .mypy_cache 12 | .pytest_cache 13 | build/ 14 | dist/ 15 | theHarvester.egg-info 16 | api-keys.yaml 17 | .DS_Store 18 | .venv 19 | .pyre 20 | -------------------------------------------------------------------------------- /src/theHarvester/.isort.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | profile = black 3 | -------------------------------------------------------------------------------- /src/theHarvester/.pyre_configuration: -------------------------------------------------------------------------------- 1 | { 2 | "site_package_search_strategy": "pep561", 3 | "source_directories": [ 4 | "." 
5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /src/theHarvester/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:3 2 | LABEL maintainer="@jay_townsend1 & @NotoriousRebel1" 3 | RUN apk update && apk upgrade --available && apk add --no-cache musl-dev git libffi-dev gcc python3-dev pipx libxml2-dev libxslt-dev bash 4 | RUN mkdir -p "~/.local/share/theHarvester/static/" 5 | RUN pipx install git+https://github.com/laramies/theHarvester.git 6 | RUN pipx ensurepath 7 | ENTRYPOINT ["/root/.local/bin/restfulHarvest", "-H", "0.0.0.0", "-p", "80"] 8 | EXPOSE 80 9 | -------------------------------------------------------------------------------- /src/theHarvester/README/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to theHarvester Project 2 | Welcome to theHarvester project, so you would like to contribute. 3 | The following below must be met to get accepted. 4 | 5 | # CI 6 | Make sure all CI passes and you do not introduce any alerts from lgtm. 7 | 8 | # Unit Tests 9 | For new modules a unit test for that module is required and we use pytest. 10 | 11 | # Coding Standards 12 | * No single letter variables and variable names must represent the action that it is performing 13 | * Have static typing on functions etc 14 | * Make sure no errors are reported from mypy 15 | * No issues reported with flake8 16 | 17 | # Submitting Bugs 18 | If you find a bug in a module that you want to submit an issue for and know how to write python code. 19 | Please create a unit test for that bug(If possible) and submit a fix for it as it would be a big help to the project. 20 | -------------------------------------------------------------------------------- /src/theHarvester/README/LICENSES: -------------------------------------------------------------------------------- 1 | Released under the GPL v 2.0. 2 | 3 | If you did not receive a copy of the GPL, try http://www.gnu.org/. 4 | 5 | Copyright 2011 Christian Martorella 6 | 7 | theHarvester is free software; you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation version 2 of the License. 10 | 11 | theHarvester is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 | -------------------------------------------------------------------------------- /src/theHarvester/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | services: 3 | theharvester.svc.local: 4 | container_name: theHarvester 5 | volumes: 6 | - ./api-keys.yaml:/root/.theHarvester/api-keys.yaml 7 | - ./api-keys.yaml:/etc/theHarvester/api-keys.yaml 8 | - ./proxies.yaml:/etc/theHarvester/proxies.yaml 9 | - ./proxies.yaml:/root/.theHarvester/proxies.yaml 10 | build: . 
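# NOTE (comment added for clarity, describing the compose file above): the image is built from the local Dockerfile, whose ENTRYPOINT starts restfulHarvest listening on container port 80; the mapping below publishes it on host port 8080, and the bind-mounted api-keys.yaml/proxies.yaml supply configuration.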
11 | ports: 12 | - "8080:80" 13 | 14 | networks: 15 | default: 16 | name: app_theHarvester_network 17 | -------------------------------------------------------------------------------- /src/theHarvester/mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True 3 | show_traceback = True 4 | show_error_codes = True 5 | namespace_packages = True 6 | -------------------------------------------------------------------------------- /src/theHarvester/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "theHarvester" 3 | description = "theHarvester is a very simple, yet effective tool designed to be used in the early stages of a penetration test" 4 | readme = "README.md" 5 | authors = [ 6 | { name = "Christian Martorella", email = "cmartorella@edge-security.com" }, 7 | { name = "Jay Townsend", email = "jay@cybermon.uk" }, 8 | { name = "Matthew Brown", email = "36310667+NotoriousRebel@users.noreply.github.com" }, 9 | ] 10 | requires-python = ">=3.9" 11 | urls.Homepage = "https://github.com/laramies/theHarvester" 12 | classifiers = [ 13 | "Programming Language :: Python :: 3", 14 | "Programming Language :: Python :: 3.9", 15 | "Programming Language :: Python :: 3.10", 16 | "Programming Language :: Python :: 3.11", 17 | "License :: OSI Approved :: GNU General Public License v2 (GPLv2)", 18 | "Operating System :: OS Independent", 19 | ] 20 | dynamic = ["dependencies", "optional-dependencies", "version"] 21 | 22 | [project.scripts] 23 | theHarvester = "theHarvester.theHarvester:main" 24 | restfulHarvest = "theHarvester.restfulHarvest:main" 25 | 26 | [tool.setuptools.dynamic] 27 | version = { attr = "theHarvester.lib.version.VERSION" } 28 | dependencies = { file = "requirements/base.txt" } 29 | optional-dependencies.dev = { file = "requirements/dev.txt" } 30 | 31 | [tool.setuptools.packages.find] 32 | include = ["theHarvester*"] 33 | 34 | [tool.setuptools.package-data] 35 | "*" = ["*.txt", "*.yaml"] 36 | 37 | [tool.pytest.ini_options] 38 | minversion = "7.1" 39 | addopts = "--no-header --asyncio-mode=auto" 40 | testpaths = [ 41 | "tests", 42 | "tests/discovery/", 43 | ] 44 | 45 | [build-system] 46 | requires = ["setuptools>=68"] 47 | build-backend = "setuptools.build_meta" 48 | -------------------------------------------------------------------------------- /src/theHarvester/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | minversion = 7.1.1 3 | testpaths = tests 4 | asyncio_mode=auto -------------------------------------------------------------------------------- /src/theHarvester/requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements/base.txt 2 | -------------------------------------------------------------------------------- /src/theHarvester/requirements/base.txt: -------------------------------------------------------------------------------- 1 | aiodns==3.1.1 2 | aiofiles==23.2.1 3 | aiohttp==3.9.3 4 | aiomultiprocess==0.9.0 5 | aiosqlite==0.19.0 6 | beautifulsoup4==4.12.3 7 | censys==2.2.11 8 | certifi==2024.2.2 9 | dnspython==2.5.0 10 | fastapi==0.109.0 11 | lxml==5.1.0 12 | netaddr==0.10.1 13 | ujson==5.9.0 14 | pyppeteer==1.0.2 15 | PyYAML==6.0.1 16 | python-dateutil==2.8.2 17 | requests==2.31.0 18 | retrying==1.3.4 19 | setuptools==69.0.3 20 | shodan==1.31.0 21 | slowapi==0.1.8 22 | uvicorn==0.27.0.post1 23 | uvloop==0.19.0; 
platform_system != "Windows" 24 | -------------------------------------------------------------------------------- /src/theHarvester/requirements/dev.txt: -------------------------------------------------------------------------------- 1 | black==24.1.1 2 | flake8==7.0.0 3 | isort==5.13.2 4 | mypy==1.8.0 5 | mypy-extensions==1.0.0 6 | pydantic==2.5.3 7 | pyre-check==0.9.19 8 | pyflakes==3.2.0 9 | pytest==7.4.4 10 | pytest-asyncio==0.23.4 11 | types-certifi==2021.10.8.3 12 | types-chardet==5.0.4.6 13 | types-ujson==5.9.0.0 14 | types-PyYAML==6.0.12.12 15 | types-requests==2.31.0.6 # 2.31.0.7 introduced a regression 16 | types-python-dateutil==2.8.19.20240106 17 | wheel==0.42.0 -------------------------------------------------------------------------------- /src/theHarvester/restfulHarvest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from theHarvester.restfulHarvest import main 3 | 4 | if __name__ == "__main__": 5 | main() 6 | -------------------------------------------------------------------------------- /src/theHarvester/setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501, F405, F403, E402, F401, F402 -------------------------------------------------------------------------------- /src/theHarvester/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/theHarvester/tests/__init__.py -------------------------------------------------------------------------------- /src/theHarvester/tests/discovery/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/theHarvester/tests/discovery/__init__.py -------------------------------------------------------------------------------- /src/theHarvester/tests/discovery/test_anubis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | import os 4 | from typing import Optional 5 | 6 | import pytest 7 | import requests 8 | from _pytest.mark.structures import MarkDecorator 9 | 10 | from theHarvester.discovery import anubis 11 | from theHarvester.lib.core import * 12 | 13 | pytestmark: MarkDecorator = pytest.mark.asyncio 14 | github_ci: Optional[str] = os.getenv( 15 | "GITHUB_ACTIONS" 16 | ) # Github set this to be the following: true instead of True 17 | 18 | 19 | class TestAnubis: 20 | @staticmethod 21 | def domain() -> str: 22 | return "apple.com" 23 | 24 | async def test_api(self) -> None: 25 | base_url = f"https://jldc.me/anubis/subdomains/{TestAnubis.domain()}" 26 | headers = {"User-Agent": Core.get_user_agent()} 27 | request = requests.get(base_url, headers=headers) 28 | assert request.status_code == 200 29 | 30 | async def test_do_search(self): 31 | search = anubis.SearchAnubis(word=TestAnubis.domain()) 32 | await search.do_search() 33 | return await search.get_hostnames() 34 | 35 | async def test_process(self) -> None: 36 | await self.test_do_search() 37 | assert len(await self.test_do_search()) > 0 38 | -------------------------------------------------------------------------------- /src/theHarvester/tests/discovery/test_certspotter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 
# coding=utf-8 3 | import os 4 | from typing import Optional 5 | 6 | import pytest 7 | import requests 8 | from _pytest.mark.structures import MarkDecorator 9 | 10 | from theHarvester.discovery import certspottersearch 11 | from theHarvester.lib.core import * 12 | 13 | pytestmark: MarkDecorator = pytest.mark.asyncio 14 | github_ci: Optional[str] = os.getenv( 15 | "GITHUB_ACTIONS" 16 | ) # Github set this to be the following: true instead of True 17 | 18 | 19 | class TestCertspotter(object): 20 | @staticmethod 21 | def domain() -> str: 22 | return "metasploit.com" 23 | 24 | async def test_api(self) -> None: 25 | base_url = f"https://api.certspotter.com/v1/issuances?domain={TestCertspotter.domain()}&expand=dns_names" 26 | headers = {"User-Agent": Core.get_user_agent()} 27 | request = requests.get(base_url, headers=headers) 28 | assert request.status_code == 200 29 | 30 | async def test_search(self) -> None: 31 | search = certspottersearch.SearchCertspoter(TestCertspotter.domain()) 32 | await search.process() 33 | assert isinstance(await search.get_hostnames(), set) 34 | 35 | 36 | if __name__ == "__main__": 37 | pytest.main() 38 | -------------------------------------------------------------------------------- /src/theHarvester/tests/discovery/test_githubcode.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | 3 | import pytest 4 | from _pytest.mark.structures import MarkDecorator 5 | from requests import Response 6 | 7 | from theHarvester.discovery import githubcode 8 | from theHarvester.discovery.constants import MissingKey 9 | from theHarvester.lib.core import Core 10 | 11 | pytestmark: MarkDecorator = pytest.mark.asyncio 12 | 13 | 14 | class TestSearchGithubCode: 15 | class OkResponse: 16 | response = Response() 17 | json = { 18 | "items": [ 19 | {"text_matches": [{"fragment": "test1"}]}, 20 | {"text_matches": [{"fragment": "test2"}]}, 21 | ] 22 | } 23 | response.status_code = 200 24 | response.json = MagicMock(return_value=json) 25 | 26 | class FailureResponse: 27 | response = Response() 28 | response.json = MagicMock(return_value={}) 29 | response.status_code = 401 30 | 31 | class RetryResponse: 32 | response = Response() 33 | response.json = MagicMock(return_value={}) 34 | response.status_code = 403 35 | 36 | class MalformedResponse: 37 | response = Response() 38 | json = { 39 | "items": [ 40 | {"fail": True}, 41 | {"text_matches": []}, 42 | {"text_matches": [{"weird": "result"}]}, 43 | ] 44 | } 45 | response.json = MagicMock(return_value=json) 46 | response.status_code = 200 47 | 48 | async def test_missing_key(self) -> None: 49 | with pytest.raises(MissingKey): 50 | Core.github_key = MagicMock(return_value=None) 51 | githubcode.SearchGithubCode(word="test", limit=500) 52 | 53 | async def test_fragments_from_response(self) -> None: 54 | Core.github_key = MagicMock(return_value="lol") 55 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 56 | test_result = await test_class_instance.fragments_from_response( 57 | self.OkResponse.response.json() 58 | ) 59 | print("test_result: ", test_result) 60 | assert test_result == ["test1", "test2"] 61 | 62 | async def test_invalid_fragments_from_response(self) -> None: 63 | Core.github_key = MagicMock(return_value="lol") 64 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 65 | test_result = await test_class_instance.fragments_from_response( 66 | self.MalformedResponse.response.json() 67 | ) 68 | assert test_result == [] 69 
| 70 | async def test_next_page(self) -> None: 71 | Core.github_key = MagicMock(return_value="lol") 72 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 73 | test_result = githubcode.SuccessResult(list(), next_page=2, last_page=4) 74 | assert 2 == await test_class_instance.next_page_or_end(test_result) 75 | 76 | async def test_last_page(self) -> None: 77 | Core.github_key = MagicMock(return_value="lol") 78 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 79 | test_result = githubcode.SuccessResult(list(), None, None) 80 | assert None is await test_class_instance.next_page_or_end(test_result) 81 | 82 | if __name__ == "__main__": 83 | pytest.main() 84 | -------------------------------------------------------------------------------- /src/theHarvester/tests/discovery/test_otx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | import os 4 | from typing import Optional 5 | 6 | import pytest 7 | import requests 8 | from _pytest.mark.structures import MarkDecorator 9 | 10 | from theHarvester.discovery import otxsearch 11 | from theHarvester.lib.core import * 12 | 13 | pytestmark: MarkDecorator = pytest.mark.asyncio 14 | github_ci: Optional[str] = os.getenv( 15 | "GITHUB_ACTIONS" 16 | ) # Github set this to be the following: true instead of True 17 | 18 | 19 | class TestOtx(object): 20 | @staticmethod 21 | def domain() -> str: 22 | return "cybermon.uk" 23 | 24 | async def test_api(self) -> None: 25 | base_url = f"https://otx.alienvault.com/api/v1/indicators/domain/{TestOtx.domain()}/passive_dns" 26 | headers = {"User-Agent": Core.get_user_agent()} 27 | request = requests.get(base_url, headers=headers) 28 | assert request.status_code == 200 29 | 30 | async def test_search(self) -> None: 31 | search = otxsearch.SearchOtx(TestOtx.domain()) 32 | await search.process() 33 | assert isinstance(await search.get_hostnames(), set) 34 | assert isinstance(await search.get_ips(), set) 35 | 36 | 37 | if __name__ == "__main__": 38 | pytest.main() 39 | -------------------------------------------------------------------------------- /src/theHarvester/tests/test_myparser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | 4 | import pytest 5 | 6 | from theHarvester.parsers import myparser 7 | 8 | 9 | class TestMyParser(object): 10 | @pytest.mark.asyncio 11 | async def test_emails(self) -> None: 12 | word = "domain.com" 13 | results = "@domain.com***a@domain***banotherdomain.com***c@domain.com***d@sub.domain.com***" 14 | parse = myparser.Parser(results, word) 15 | emails = sorted(await parse.emails()) 16 | assert emails == ["c@domain.com", "d@sub.domain.com"] 17 | 18 | 19 | if __name__ == "__main__": 20 | pytest.main() 21 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/theHarvester/theHarvester-logo.png -------------------------------------------------------------------------------- /src/theHarvester/theHarvester-logo.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/theHarvester/theHarvester-logo.webp
-------------------------------------------------------------------------------- /src/theHarvester/theHarvester.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Note: This script runs theHarvester 3 | import sys 4 | 5 | from theHarvester.theHarvester import main 6 | 7 | if sys.version_info.major < 3 or sys.version_info.minor < 9: 8 | print("\033[93m[!] Make sure you have Python 3.9+ installed, quitting.\n\n \033[0m") 9 | sys.exit(1) 10 | 11 | if __name__ == "__main__": 12 | main() 13 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["hostchecker"] 2 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/data/proxies.yaml: -------------------------------------------------------------------------------- 1 | http: 2 | - ip:port 3 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/data/wordlists/dorks.txt: -------------------------------------------------------------------------------- 1 | inurl:"contact" 2 | intext:email filetype:log 3 | "Index of /mail" 4 | "admin account info" filetype:log 5 | intext:@ 6 | administrator accounts/ 7 | intitle:"Index of" .bash_history 8 | intitle:"index of" members OR accounts 9 | inurl:/shared/help.php 10 | inurl:public 11 | intitle:index.of inbox 12 | intitle:"Server Administration" 13 | inurl:passwd.txt 14 | robots.txt 15 | php-addressbook "This is the addressbook for *" -warning -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/data/wordlists/general/common.txt: -------------------------------------------------------------------------------- 1 | admin 2 | test 3 | hello 4 | uk 5 | login 6 | book 7 | robots.txt 8 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/theHarvester/theHarvester/discovery/__init__.py -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/anubis.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher 2 | 3 | 4 | class SearchAnubis: 5 | def __init__(self, word) -> None: 6 | self.word = word 7 | self.totalhosts: list = [] 8 | self.proxy = False 9 | 10 | async def do_search(self) -> None: 11 | url = f"https://jldc.me/anubis/subdomains/{self.word}" 12 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 13 | self.totalhosts = response[0] 14 | 15 | async def get_hostnames(self) -> list: 16 | return self.totalhosts 17 | 18 | async def process(self, proxy: bool = False) -> None: 19 | self.proxy = proxy 20 | await self.do_search() 21 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/baidusearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher, Core 2 | from theHarvester.parsers import myparser 3 | 4 | 5 | class SearchBaidu: 6 | 
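"""Scrape Baidu web search for a target domain (docstring added for clarity; it summarizes the methods visible below): builds paginated query URLs, fetches them asynchronously, and parses emails/hostnames from the combined responses."""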
def __init__(self, word, limit) -> None: 7 | self.word = word 8 | self.total_results = "" 9 | self.server = "www.baidu.com" 10 | self.hostname = "www.baidu.com" 11 | self.limit = limit 12 | self.proxy = False 13 | 14 | async def do_search(self) -> None: 15 | headers = {"Host": self.hostname, "User-agent": Core.get_user_agent()} 16 | base_url = f"https://{self.server}/s?wd=%40{self.word}&pn=xx&oq={self.word}" 17 | urls = [ 18 | base_url.replace("xx", str(num)) 19 | for num in range(0, self.limit, 10) 20 | if num <= self.limit 21 | ] 22 | responses = await AsyncFetcher.fetch_all( 23 | urls, headers=headers, proxy=self.proxy 24 | ) 25 | for response in responses: 26 | self.total_results += response 27 | 28 | async def process(self, proxy: bool = False) -> None: 29 | self.proxy = proxy 30 | await self.do_search() 31 | 32 | async def get_emails(self): 33 | rawres = myparser.Parser(self.total_results, self.word) 34 | return await rawres.emails() 35 | 36 | async def get_hostnames(self): 37 | rawres = myparser.Parser(self.total_results, self.word) 38 | return await rawres.hostnames() 39 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/bevigil.py: -------------------------------------------------------------------------------- 1 | from typing import Set 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchBeVigil: 8 | def __init__(self, word) -> None: 9 | self.word = word 10 | self.totalhosts: Set = set() 11 | self.interestingurls: Set = set() 12 | self.key = Core.bevigil_key() 13 | if self.key is None: 14 | self.key = "" 15 | raise MissingKey("bevigil") 16 | self.proxy = False 17 | 18 | async def do_search(self) -> None: 19 | subdomain_endpoint = f"https://osint.bevigil.com/api/{self.word}/subdomains/" 20 | url_endpoint = f"https://osint.bevigil.com/api/{self.word}/urls/" 21 | headers = {"X-Access-Token": self.key} 22 | 23 | responses = await AsyncFetcher.fetch_all( 24 | [subdomain_endpoint], json=True, proxy=self.proxy, headers=headers 25 | ) 26 | response = responses[0] 27 | for subdomain in response["subdomains"]: 28 | self.totalhosts.add(subdomain) 29 | 30 | responses = await AsyncFetcher.fetch_all( 31 | [url_endpoint], json=True, proxy=self.proxy, headers=headers 32 | ) 33 | response = responses[0] 34 | for url in response["urls"]: 35 | self.interestingurls.add(url) 36 | 37 | async def get_hostnames(self) -> set: 38 | return self.totalhosts 39 | 40 | async def get_interestingurls(self) -> set: 41 | return self.interestingurls 42 | 43 | async def process(self, proxy: bool = False) -> None: 44 | self.proxy = proxy 45 | await self.do_search() 46 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/binaryedgesearch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Set 3 | 4 | from theHarvester.discovery.constants import MissingKey, get_delay 5 | from theHarvester.lib.core import AsyncFetcher, Core 6 | 7 | 8 | class SearchBinaryEdge: 9 | def __init__(self, word, limit) -> None: 10 | self.word = word 11 | self.totalhosts: Set = set() 12 | self.proxy = False 13 | self.key = Core.binaryedge_key() 14 | self.limit = 501 if limit >= 501 else limit 15 | self.limit = 2 if self.limit == 1 else self.limit 16 | if self.key is None: 17 | raise MissingKey("binaryedge") 18 | 19 | async def 
do_search(self) -> None: 20 | base_url = f"https://api.binaryedge.io/v2/query/domains/subdomain/{self.word}" 21 | headers = {"X-KEY": self.key, "User-Agent": Core.get_user_agent()} 22 | for page in range(1, self.limit): 23 | params = {"page": page} 24 | response = await AsyncFetcher.fetch_all( 25 | [base_url], json=True, proxy=self.proxy, params=params, headers=headers 26 | ) 27 | responses = response[0] 28 | dct = responses 29 | if ("status" in dct.keys() and "message" in dct.keys()) and ( 30 | dct["status"] == 400 31 | or "Bad Parameter" in dct["message"] 32 | or "Error" in dct["message"] 33 | ): 34 | # 400 status code means no more results 35 | break 36 | if "events" in dct.keys(): 37 | if len(dct["events"]) == 0: 38 | break 39 | self.totalhosts.update({host for host in dct["events"]}) 40 | await asyncio.sleep(get_delay()) 41 | 42 | async def get_hostnames(self) -> set: 43 | return self.totalhosts 44 | 45 | async def process(self, proxy: bool = False) -> None: 46 | self.proxy = proxy 47 | await self.do_search() 48 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/bingsearch.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | from theHarvester.parsers import myparser 6 | 7 | 8 | class SearchBing: 9 | def __init__(self, word, limit, start) -> None: 10 | self.word = word.replace(" ", "%20") 11 | self.results: list[Any] = [] 12 | self.total_results = "" 13 | self.server = "www.bing.com" 14 | self.apiserver = "api.search.live.net" 15 | self.hostname = "www.bing.com" 16 | self.limit = int(limit) 17 | self.bingApi = Core.bing_key() 18 | self.counter = start 19 | self.proxy = False 20 | 21 | async def do_search(self) -> None: 22 | headers = { 23 | "Host": self.hostname, 24 | "Cookie": "SRCHHPGUSR=ADLT=DEMOTE&NRSLT=50", 25 | "Accept-Language": "en-us,en", 26 | "User-agent": Core.get_user_agent(), 27 | } 28 | base_url = f'https://{self.server}/search?q=%40"{self.word}"&count=50&first=xx' 29 | urls = [ 30 | base_url.replace("xx", str(num)) 31 | for num in range(0, self.limit, 50) 32 | if num <= self.limit 33 | ] 34 | responses = await AsyncFetcher.fetch_all( 35 | urls, headers=headers, proxy=self.proxy 36 | ) 37 | for response in responses: 38 | self.total_results += response 39 | 40 | async def do_search_api(self) -> None: 41 | url = "https://api.bing.microsoft.com/v7.0/search?" 
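# Comment added for clarity: Bing Web Search v7 REST endpoint; the query, result count, and market are passed via the params dict below, and the request is authenticated with the Ocp-Apim-Subscription-Key header built from self.bingApi.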
42 | params = { 43 | "q": self.word, 44 | "count": str(self.limit), 45 | "offset": "0", 46 | "mkt": "en-us", 47 | "safesearch": "Off", 48 | } 49 | headers = { 50 | "User-Agent": Core.get_user_agent(), 51 | "Ocp-Apim-Subscription-Key": self.bingApi, 52 | } 53 | self.results = await AsyncFetcher.fetch_all( 54 | [url], headers=headers, params=params, proxy=self.proxy 55 | ) 56 | for res in self.results: 57 | self.total_results += res 58 | 59 | async def do_search_vhost(self) -> None: 60 | headers = { 61 | "Host": self.hostname, 62 | "Cookie": "mkt=en-US;ui=en-US;SRCHHPGUSR=NEWWND=0&ADLT=DEMOTE&NRSLT=50", 63 | "Accept-Language": "en-us,en", 64 | "User-agent": Core.get_user_agent(), 65 | } 66 | base_url = f"http://{self.server}/search?q=ip:{self.word}&go=&count=50&FORM=QBHL&qs=n&first=xx" 67 | urls = [ 68 | base_url.replace("xx", str(num)) 69 | for num in range(0, self.limit, 50) 70 | if num <= self.limit 71 | ] 72 | responses = await AsyncFetcher.fetch_all( 73 | urls, headers=headers, proxy=self.proxy 74 | ) 75 | for response in responses: 76 | self.total_results += response 77 | 78 | async def get_emails(self): 79 | rawres = myparser.Parser(self.total_results, self.word) 80 | return await rawres.emails() 81 | 82 | async def get_hostnames(self): 83 | rawres = myparser.Parser(self.total_results, self.word) 84 | return await rawres.hostnames() 85 | 86 | async def get_allhostnames(self): 87 | rawres = myparser.Parser(self.total_results, self.word) 88 | return await rawres.hostnames_all() 89 | 90 | async def process(self, api, proxy: bool = False) -> None: 91 | self.proxy = proxy 92 | if api == "yes": 93 | if self.bingApi is None: 94 | raise MissingKey("BingAPI") 95 | await self.do_search_api() 96 | else: 97 | await self.do_search() 98 | print(f"\tSearching {self.counter} results.") 99 | 100 | async def process_vhost(self) -> None: 101 | await self.do_search_vhost() 102 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/bravesearch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from theHarvester.discovery.constants import get_delay 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | from theHarvester.parsers import myparser 6 | 7 | 8 | class SearchBrave: 9 | def __init__(self, word, limit): 10 | self.word = word 11 | self.results = "" 12 | self.totalresults = "" 13 | self.server = "https://search.brave.com/search?q=" 14 | self.limit = limit 15 | self.proxy = False 16 | 17 | async def do_search(self): 18 | headers = {"User-Agent": Core.get_user_agent()} 19 | for query in [f'"{self.word}"', f"site:{self.word}"]: 20 | try: 21 | for offset in range(0, 50): 22 | # To reduce the total number of requests, only two queries are made "self.word" and site:self.word 23 | current_url = f"{self.server}{query}&offset={offset}&source=web&show_local=0&spellcheck=0" 24 | resp = await AsyncFetcher.fetch_all( 25 | [current_url], headers=headers, proxy=self.proxy 26 | ) 27 | self.results = resp[0] 28 | self.totalresults += self.results 29 | # if 'Results from Microsoft Bing.' 
in resp[0] \ 30 | if ( 31 | "Not many great matches came back for your search" in resp[0] 32 | or "Your request has been flagged as being suspicious and Brave Search" 33 | in resp[0] 34 | or "Prove" in resp[0] 35 | and "robot" in resp[0] 36 | or "Robot" in resp[0] 37 | ): 38 | break 39 | await asyncio.sleep(get_delay() + 15) 40 | except Exception as e: 41 | print(f"An exception has occurred in bravesearch: {e}") 42 | await asyncio.sleep(get_delay() + 80) 43 | continue 44 | 45 | async def get_emails(self): 46 | rawres = myparser.Parser(self.totalresults, self.word) 47 | return await rawres.emails() 48 | 49 | async def get_hostnames(self): 50 | rawres = myparser.Parser(self.totalresults, self.word) 51 | return await rawres.hostnames() 52 | 53 | async def process(self, proxy=False): 54 | self.proxy = proxy 55 | await self.do_search() 56 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/bufferoverun.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Set 3 | 4 | from theHarvester.discovery.constants import MissingKey 5 | from theHarvester.lib.core import AsyncFetcher, Core 6 | 7 | 8 | class SearchBufferover: 9 | def __init__(self, word) -> None: 10 | self.word = word 11 | self.totalhosts: Set = set() 12 | self.totalips: Set = set() 13 | self.key = Core.bufferoverun_key() 14 | if self.key is None: 15 | raise MissingKey("bufferoverun") 16 | self.proxy = False 17 | 18 | async def do_search(self) -> None: 19 | url = f"https://tls.bufferover.run/dns?q={self.word}" 20 | response = await AsyncFetcher.fetch_all( 21 | [url], 22 | json=True, 23 | headers={"User-Agent": Core.get_user_agent(), "x-api-key": f"{self.key}"}, 24 | proxy=self.proxy, 25 | ) 26 | dct = response[0] 27 | if dct["Results"]: 28 | self.totalhosts = { 29 | host.split(",") 30 | if "," in host 31 | and self.word.replace("www.", "") in host.split(",")[0] in host 32 | else host.split(",")[4] 33 | for host in dct["Results"] 34 | } 35 | 36 | self.totalips = { 37 | ip.split(",")[0] 38 | for ip in dct["Results"] 39 | if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip.split(",")[0]) 40 | } 41 | 42 | async def get_hostnames(self) -> set: 43 | return self.totalhosts 44 | 45 | async def get_ips(self) -> set: 46 | return self.totalips 47 | 48 | async def process(self, proxy: bool = False) -> None: 49 | self.proxy = proxy 50 | await self.do_search() 51 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/censysearch.py: -------------------------------------------------------------------------------- 1 | from typing import Set 2 | 3 | from censys.common import __version__ 4 | from censys.common.exceptions import ( 5 | CensysRateLimitExceededException, 6 | CensysUnauthorizedException, 7 | ) 8 | from censys.search import CensysCerts 9 | 10 | from theHarvester.discovery.constants import MissingKey 11 | from theHarvester.lib.core import Core 12 | from theHarvester.lib.version import version as thehavester_version 13 | 14 | 15 | class SearchCensys: 16 | def __init__(self, domain, limit: int = 500) -> None: 17 | self.word = domain 18 | self.key = Core.censys_key() 19 | if self.key[0] is None or self.key[1] is None: 20 | raise MissingKey("Censys ID and/or Secret") 21 | self.totalhosts: Set = set() 22 | self.emails: Set = set() 23 | self.limit = limit 24 | self.proxy = False 25 | 26 | async def do_search(self) -> None: 27 | try: 28 | 
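# Comment added for clarity: construct the Censys certificate-search client here; the descriptive User-Agent below lets Censys attribute the traffic to theHarvester, and an invalid ID/secret raises CensysUnauthorizedException, surfaced as MissingKey.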
cert_search = CensysCerts( 29 | api_id=self.key[0], 30 | api_secret=self.key[1], 31 | user_agent=f"censys-python/{__version__} (theHarvester/{thehavester_version}); +https://github.com/laramies/theHarvester)", 32 | ) 33 | except CensysUnauthorizedException: 34 | raise MissingKey("Censys ID and/or Secret") 35 | 36 | query = f"names: {self.word}" 37 | try: 38 | response = cert_search.search( 39 | query=query, 40 | fields=["names", "parsed.subject.email_address"], 41 | max_records=self.limit, 42 | ) 43 | for cert in response(): 44 | self.totalhosts.update(cert.get("names", [])) 45 | email_address = ( 46 | cert.get("parsed", {}).get("subject", {}).get("email_address", []) 47 | ) 48 | self.emails.update(email_address) 49 | except CensysRateLimitExceededException: 50 | print("Censys rate limit exceeded") 51 | 52 | async def get_hostnames(self) -> set: 53 | return self.totalhosts 54 | 55 | async def get_emails(self) -> set: 56 | return self.emails 57 | 58 | async def process(self, proxy: bool = False) -> None: 59 | self.proxy = proxy 60 | await self.do_search() 61 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/certspottersearch.py: -------------------------------------------------------------------------------- 1 | from typing import Set 2 | 3 | from theHarvester.lib.core import AsyncFetcher 4 | 5 | 6 | class SearchCertspoter: 7 | def __init__(self, word) -> None: 8 | self.word = word 9 | self.totalhosts: Set = set() 10 | self.proxy = False 11 | 12 | async def do_search(self) -> None: 13 | base_url = f"https://api.certspotter.com/v1/issuances?domain={self.word}&expand=dns_names" 14 | try: 15 | response = await AsyncFetcher.fetch_all( 16 | [base_url], json=True, proxy=self.proxy 17 | ) 18 | response = response[0] 19 | if isinstance(response, list): 20 | for dct in response: 21 | for key, value in dct.items(): 22 | if key == "dns_names": 23 | self.totalhosts.update({name for name in value if name}) 24 | elif isinstance(response, dict): 25 | self.totalhosts.update({response["dns_names"] if "dns_names" in response.keys() else ""}) # type: ignore 26 | else: 27 | self.totalhosts.update({""}) 28 | except Exception as e: 29 | print(e) 30 | 31 | async def get_hostnames(self) -> set: 32 | return self.totalhosts 33 | 34 | async def process(self, proxy: bool = False) -> None: 35 | self.proxy = proxy 36 | await self.do_search() 37 | print("\tSearching results.") 38 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/crtsh.py: -------------------------------------------------------------------------------- 1 | from typing import List, Set 2 | 3 | from theHarvester.lib.core import AsyncFetcher 4 | 5 | 6 | class SearchCrtsh: 7 | def __init__(self, word) -> None: 8 | self.word = word 9 | self.data: List = [] 10 | self.proxy = False 11 | 12 | async def do_search(self) -> List: 13 | data: Set = set() 14 | try: 15 | url = f"https://crt.sh/?q=%25.{self.word}&output=json" 16 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 17 | response = response[0] 18 | data = set( 19 | [ 20 | dct["name_value"][2:] 21 | if "*." 
== dct["name_value"][:2] 22 | else dct["name_value"] 23 | for dct in response 24 | ] 25 | ) 26 | data = { 27 | domain 28 | for domain in data 29 | if (domain[0] != "*" and str(domain[0:4]).isnumeric() is False) 30 | } 31 | except Exception as e: 32 | print(e) 33 | clean: List = [] 34 | for x in data: 35 | pre = x.split() 36 | for y in pre: 37 | clean.append(y) 38 | return clean 39 | 40 | async def process(self, proxy: bool = False) -> None: 41 | self.proxy = proxy 42 | data = await self.do_search() 43 | self.data = data 44 | 45 | async def get_hostnames(self) -> list: 46 | return self.data 47 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/dnsdumpster.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | import aiohttp 4 | 5 | from theHarvester.lib.core import Core 6 | from theHarvester.parsers import myparser 7 | 8 | 9 | class SearchDnsDumpster: 10 | def __init__(self, word) -> None: 11 | self.word = word.replace(" ", "%20") 12 | self.results = "" 13 | self.totalresults = "" 14 | self.server = "dnsdumpster.com" 15 | self.proxy = False 16 | 17 | async def do_search(self) -> None: 18 | try: 19 | agent = Core.get_user_agent() 20 | headers = {"User-Agent": agent} 21 | session = aiohttp.ClientSession(headers=headers) 22 | # create a session to properly verify 23 | url = f"https://{self.server}" 24 | csrftoken = "" 25 | if self.proxy is False: 26 | async with session.get(url, headers=headers) as resp: 27 | resp_cookies = str(resp.cookies) 28 | cookies = resp_cookies.split("csrftoken=") 29 | csrftoken += cookies[1][: cookies[1].find(";")] 30 | else: 31 | async with session.get(url, headers=headers, proxy=self.proxy) as resp: 32 | resp_cookies = str(resp.cookies) 33 | cookies = resp_cookies.split("csrftoken=") 34 | csrftoken += cookies[1][: cookies[1].find(";")] 35 | await asyncio.sleep(5) 36 | 37 | # extract csrftoken from cookies 38 | data = { 39 | "Cookie": f"csfrtoken={csrftoken}", 40 | "csrfmiddlewaretoken": csrftoken, 41 | "targetip": self.word, 42 | "user": "free", 43 | } 44 | headers["Referer"] = url 45 | if self.proxy is False: 46 | async with session.post(url, headers=headers, data=data) as resp: 47 | self.results = await resp.text() 48 | else: 49 | async with session.post( 50 | url, headers=headers, data=data, proxy=self.proxy 51 | ) as resp: 52 | self.results = await resp.text() 53 | await session.close() 54 | except Exception as e: 55 | print(f"An exception occurred: {e}") 56 | self.totalresults += self.results 57 | 58 | async def get_hostnames(self): 59 | rawres = myparser.Parser(self.totalresults, self.word) 60 | return await rawres.hostnames() 61 | 62 | async def process(self, proxy: bool = False) -> None: 63 | self.proxy = proxy 64 | await self.do_search() # Only need to do it once. 
65 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/duckduckgosearch.py: -------------------------------------------------------------------------------- 1 | import ujson 2 | 3 | from theHarvester.lib.core import AsyncFetcher, Core 4 | from theHarvester.parsers import myparser 5 | 6 | 7 | class SearchDuckDuckGo: 8 | def __init__(self, word, limit) -> None: 9 | self.word = word 10 | self.results = "" 11 | self.totalresults = "" 12 | self.dorks: list = [] 13 | self.links: list = [] 14 | self.database = "https://duckduckgo.com/?q=" 15 | self.api = "https://api.duckduckgo.com/?q=x&format=json&pretty=1" # Currently using API. 16 | self.quantity = "100" 17 | self.limit = limit 18 | self.proxy = False 19 | 20 | async def do_search(self) -> None: 21 | # Do normal scraping. 22 | url = self.api.replace("x", self.word) 23 | headers = {"User-Agent": Core.get_user_agent()} 24 | first_resp = await AsyncFetcher.fetch_all( 25 | [url], headers=headers, proxy=self.proxy 26 | ) 27 | self.results = first_resp[0] 28 | self.totalresults += self.results 29 | urls = await self.crawl(self.results) 30 | urls = {url for url in urls if len(url) > 5} 31 | all_resps = await AsyncFetcher.fetch_all(urls) 32 | self.totalresults += "".join(all_resps) 33 | 34 | async def crawl(self, text): 35 | """ 36 | Parse the JSON response and return any URLs found. 37 | :param text: JSON-formatted response text 38 | :return: set of URLs 39 | """ 40 | urls = set() 41 | try: 42 | load = ujson.loads(text) 43 | for keys in load.keys(): # Iterate through keys of dict. 44 | val = load.get(keys) 45 | 46 | if isinstance(val, int) or isinstance(val, dict) or val is None: 47 | continue 48 | 49 | if isinstance(val, list): 50 | if len(val) == 0: # Make sure not indexing an empty list. 51 | continue 52 | val = val[0] # The first value should be a dict. 53 | 54 | if isinstance(val, dict): # Validation check. 55 | for key in val.keys(): 56 | value = val.get(key) 57 | if ( 58 | isinstance(value, str) 59 | and value != "" 60 | and ("https://" in value 61 | or "http://" in value) 62 | ): 63 | urls.add(value) 64 | 65 | if ( 66 | isinstance(val, str) 67 | and val != "" 68 | and ("https://" in val 69 | or "http://" in val) 70 | ): 71 | urls.add(val) 72 | tmp = set() 73 | for url in urls: 74 | if ( 75 | "<" in url and "href=" in url 76 | ): # Format is 77 | equal_index = url.index("=") 78 | true_url = "" 79 | for ch in url[equal_index + 1 :]: 80 | if ch == '"': 81 | tmp.add(true_url) 82 | break 83 | true_url += ch 84 | else: 85 | if url != "": 86 | tmp.add(url) 87 | return tmp 88 | except Exception as e: 89 | print(f"Exception occurred: {e}") 90 | return set() 91 | 92 | async def get_emails(self): 93 | rawres = myparser.Parser(self.totalresults, self.word) 94 | return await rawres.emails() 95 | 96 | async def get_hostnames(self): 97 | rawres = myparser.Parser(self.totalresults, self.word) 98 | return await rawres.hostnames() 99 | 100 | async def process(self, proxy: bool = False) -> None: 101 | self.proxy = proxy 102 | await self.do_search() # Only need to search once since using API. 
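# --- Editor's usage sketch (not part of the original module) ---
# Minimal driver for SearchDuckDuckGo; "limit" is stored but the API code
# path above issues a single query, so any small value works here.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        engine = SearchDuckDuckGo("example.com", limit=50)
        await engine.process()
        print(await engine.get_hostnames())
        print(await engine.get_emails())

    asyncio.run(_demo())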
103 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/fullhuntsearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import MissingKey 2 | from theHarvester.lib.core import AsyncFetcher, Core 3 | 4 | 5 | class SearchFullHunt: 6 | def __init__(self, word) -> None: 7 | self.word = word 8 | self.key = Core.fullhunt_key() 9 | if self.key is None: 10 | raise MissingKey("fullhunt") 11 | self.total_results = None 12 | self.proxy = False 13 | 14 | async def do_search(self) -> None: 15 | url = f"https://fullhunt.io/api/v1/domain/{self.word}/subdomains" 16 | response = await AsyncFetcher.fetch_all( 17 | [url], 18 | json=True, 19 | headers={"User-Agent": Core.get_user_agent(), "X-API-KEY": self.key}, 20 | proxy=self.proxy, 21 | ) 22 | self.total_results = response[0]["hosts"] 23 | 24 | async def get_hostnames(self): 25 | return self.total_results 26 | 27 | async def process(self, proxy: bool = False) -> None: 28 | self.proxy = proxy 29 | await self.do_search() 30 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/hackertarget.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher, Core 2 | 3 | 4 | class SearchHackerTarget: 5 | """ 6 | Class uses the HackerTarget api to gather subdomains and ips 7 | """ 8 | 9 | def __init__(self, word) -> None: 10 | self.word = word 11 | self.total_results = "" 12 | self.hostname = "https://api.hackertarget.com" 13 | self.proxy = False 14 | self.results = None 15 | 16 | async def do_search(self) -> None: 17 | headers = {"User-agent": Core.get_user_agent()} 18 | urls = [ 19 | f"{self.hostname}/hostsearch/?q={self.word}", 20 | f"{self.hostname}/reversedns/?q={self.word}", 21 | ] 22 | responses = await AsyncFetcher.fetch_all( 23 | urls, headers=headers, proxy=self.proxy 24 | ) 25 | for response in responses: 26 | self.total_results += response.replace(",", ":") 27 | 28 | async def process(self, proxy: bool = False) -> None: 29 | self.proxy = proxy 30 | await self.do_search() 31 | 32 | async def get_hostnames(self) -> list: 33 | return [ 34 | result 35 | for result in self.total_results.splitlines() 36 | if "No PTR records found" not in result 37 | ] 38 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/intelxsearch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Any 3 | 4 | import requests 5 | import ujson 6 | 7 | from theHarvester.discovery.constants import MissingKey 8 | from theHarvester.lib.core import AsyncFetcher, Core 9 | from theHarvester.parsers import intelxparser 10 | 11 | 12 | class SearchIntelx: 13 | def __init__(self, word) -> None: 14 | self.word = word 15 | self.key = Core.intelx_key() 16 | if self.key is None: 17 | raise MissingKey("Intelx") 18 | self.database = "https://2.intelx.io" 19 | self.results: Any = None 20 | self.info: tuple[Any, ...] 
= () 21 | self.limit: int = 10000 22 | self.proxy = False 23 | self.offset = -1 24 | 25 | async def do_search(self) -> None: 26 | try: 27 | # Based on: https://github.com/IntelligenceX/SDK/blob/master/Python/intelxapi.py 28 | # API requests self identification 29 | # https://intelx.io/integrations 30 | headers = { 31 | "x-key": self.key, 32 | "User-Agent": f"{Core.get_user_agent()}-theHarvester", 33 | } 34 | data = { 35 | "term": self.word, 36 | "buckets": [], 37 | "lookuplevel": 0, 38 | "maxresults": self.limit, 39 | "timeout": 5, 40 | "datefrom": "", 41 | "dateto": "", 42 | "sort": 2, 43 | "media": 0, 44 | "terminate": [], 45 | "target": 0, 46 | } 47 | 48 | total_resp = requests.post( 49 | f"{self.database}/phonebook/search", headers=headers, json=data 50 | ) 51 | phonebook_id = ujson.loads(total_resp.text)["id"] 52 | await asyncio.sleep(5) 53 | 54 | # Fetch results from phonebook based on ID 55 | resp = await AsyncFetcher.fetch_all( 56 | [ 57 | f"{self.database}/phonebook/search/result?id={phonebook_id}&limit={self.limit}&offset={self.offset}" 58 | ], 59 | headers=headers, 60 | json=True, 61 | proxy=self.proxy, 62 | ) 63 | resp = resp[0] 64 | self.results = resp # TODO: give self.results more appropriate typing 65 | except Exception as e: 66 | print(f"An exception has occurred in Intelx: {e}") 67 | 68 | async def process(self, proxy: bool = False): 69 | self.proxy = proxy 70 | await self.do_search() 71 | intelx_parser = intelxparser.Parser() 72 | # TODO: give self.info more appropriate typing 73 | self.info = await intelx_parser.parse_dictionaries(self.results) 74 | 75 | async def get_emails(self): 76 | return self.info[0] 77 | 78 | async def get_interestingurls(self): 79 | # TODO parse add return hostnames for subdomains of urls 80 | return self.info[1] 81 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/netlas.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import MissingKey 2 | from theHarvester.lib.core import AsyncFetcher, Core 3 | 4 | 5 | class SearchNetlas: 6 | def __init__(self, word) -> None: 7 | self.word = word 8 | self.totalhosts: list = [] 9 | self.totalips: list = [] 10 | self.key = Core.netlas_key() 11 | if self.key is None: 12 | raise MissingKey("netlas") 13 | self.proxy = False 14 | 15 | async def do_search(self) -> None: 16 | api = f"https://app.netlas.io/api/domains/?q=*.{self.word}&source_type=include&start=0&fields=*" 17 | headers = {"X-API-Key": self.key} 18 | response = await AsyncFetcher.fetch_all( 19 | [api], json=True, headers=headers, proxy=self.proxy 20 | ) 21 | for domain in response[0]["items"]: 22 | self.totalhosts.append(domain["data"]["domain"]) 23 | 24 | async def get_hostnames(self) -> list: 25 | return self.totalhosts 26 | 27 | async def process(self, proxy: bool = False) -> None: 28 | self.proxy = proxy 29 | await self.do_search() 30 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/otxsearch.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Set 3 | 4 | from theHarvester.lib.core import AsyncFetcher 5 | 6 | 7 | class SearchOtx: 8 | def __init__(self, word) -> None: 9 | self.word = word 10 | self.totalhosts: Set = set() 11 | self.totalips: Set = set() 12 | self.proxy = False 13 | 14 | async def do_search(self) -> None: 15 | url = 
f"https://otx.alienvault.com/api/v1/indicators/domain/{self.word}/passive_dns" 16 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 17 | responses = response[0] 18 | dct = responses 19 | self.totalhosts = {host["hostname"] for host in dct["passive_dns"]} 20 | # filter out ips that are just called NXDOMAIN 21 | self.totalips = { 22 | ip["address"] 23 | for ip in dct["passive_dns"] 24 | if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip["address"]) 25 | } 26 | 27 | async def get_hostnames(self) -> set: 28 | return self.totalhosts 29 | 30 | async def get_ips(self) -> set: 31 | return self.totalips 32 | 33 | async def process(self, proxy: bool = False) -> None: 34 | self.proxy = proxy 35 | await self.do_search() 36 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/pentesttools.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import List 3 | 4 | import ujson 5 | 6 | from theHarvester.discovery.constants import MissingKey 7 | from theHarvester.lib.core import AsyncFetcher, Core 8 | 9 | 10 | class SearchPentestTools: 11 | def __init__(self, word) -> None: 12 | # Script is largely based off https://pentest-tools.com/public/api_client.py.txt 13 | self.word = word 14 | self.key = Core.pentest_tools_key() 15 | if self.key is None: 16 | raise MissingKey("PentestTools") 17 | self.total_results: List = [] 18 | self.api = f"https://pentest-tools.com/api?key={self.key}" 19 | self.proxy = False 20 | 21 | async def poll(self, scan_id): 22 | while True: 23 | time.sleep(3) 24 | # Get the status of our scan 25 | scan_status_data = {"op": "get_scan_status", "scan_id": scan_id} 26 | responses = await AsyncFetcher.post_fetch( 27 | url=self.api, data=ujson.dumps(scan_status_data), proxy=self.proxy 28 | ) 29 | res_json = ujson.loads(responses.strip()) 30 | if res_json["op_status"] == "success": 31 | if ( 32 | res_json["scan_status"] != "waiting" 33 | and res_json["scan_status"] != "running" 34 | ): 35 | getoutput_data = { 36 | "op": "get_output", 37 | "scan_id": scan_id, 38 | "output_format": "json", 39 | } 40 | responses = await AsyncFetcher.post_fetch( 41 | url=self.api, data=ujson.dumps(getoutput_data), proxy=self.proxy 42 | ) 43 | 44 | res_json = ujson.loads(responses.strip("\n")) 45 | self.total_results = await self.parse_json(res_json) 46 | break 47 | else: 48 | print( 49 | f"Operation get_scan_status failed because: {res_json['error']}. 
{res_json['details']}" 50 | ) 51 | break 52 | 53 | @staticmethod 54 | async def parse_json(json_results): 55 | status = json_results["op_status"] 56 | if status == "success": 57 | scan_tests = json_results["scan_output"]["output_json"] 58 | output_data = scan_tests[0]["output_data"] 59 | host_to_ip = [ 60 | f"{subdomain[0]}:{subdomain[1]}" 61 | for subdomain in output_data 62 | if len(subdomain) > 0 63 | ] 64 | return host_to_ip 65 | return [] 66 | 67 | async def get_hostnames(self) -> list: 68 | return self.total_results 69 | 70 | async def do_search(self) -> None: 71 | subdomain_payload = { 72 | "op": "start_scan", 73 | "tool_id": 20, 74 | "tool_params": { 75 | "target": f"{self.word}", 76 | "web_details": "off", 77 | "do_smart_search": "off", 78 | }, 79 | } 80 | responses = await AsyncFetcher.post_fetch( 81 | url=self.api, data=ujson.dumps(subdomain_payload), proxy=self.proxy 82 | ) 83 | res_json = ujson.loads(responses.strip()) 84 | if res_json["op_status"] == "success": 85 | scan_id = res_json["scan_id"] 86 | await self.poll(scan_id) 87 | 88 | async def process(self, proxy: bool = False) -> None: 89 | self.proxy = proxy 90 | await self.do_search() # Only need to do it once. 91 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/projectdiscovery.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import MissingKey 2 | from theHarvester.lib.core import AsyncFetcher, Core 3 | 4 | 5 | class SearchDiscovery: 6 | def __init__(self, word) -> None: 7 | self.word = word 8 | self.key = Core.projectdiscovery_key() 9 | if self.key is None: 10 | raise MissingKey("ProjectDiscovery") 11 | self.total_results = None 12 | self.proxy = False 13 | 14 | async def do_search(self): 15 | url = f"https://dns.projectdiscovery.io/dns/{self.word}/subdomains" 16 | response = await AsyncFetcher.fetch_all( 17 | [url], 18 | json=True, 19 | headers={"User-Agent": Core.get_user_agent(), "Authorization": self.key}, 20 | proxy=self.proxy, 21 | ) 22 | self.total_results = [ 23 | f"{domains}.{self.word}" for domains in response[0]["subdomains"] 24 | ] 25 | 26 | async def get_hostnames(self): 27 | return self.total_results 28 | 29 | async def process(self, proxy: bool = False) -> None: 30 | self.proxy = proxy 31 | await self.do_search() 32 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/rapiddns.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | from theHarvester.lib.core import AsyncFetcher, Core 4 | 5 | 6 | class SearchRapidDns: 7 | def __init__(self, word) -> None: 8 | self.word = word 9 | self.total_results: list = [] 10 | self.proxy = False 11 | 12 | async def do_search(self): 13 | try: 14 | headers = {"User-agent": Core.get_user_agent()} 15 | # TODO see if it's worth adding sameip searches 16 | # f'{self.hostname}/sameip/{self.word}?full=1#result' 17 | urls = [f"https://rapiddns.io/subdomain/{self.word}?full=1#result"] 18 | responses = await AsyncFetcher.fetch_all( 19 | urls, headers=headers, proxy=self.proxy 20 | ) 21 | if len(responses[0]) <= 1: 22 | return self.total_results 23 | soup = BeautifulSoup(responses[0], "html.parser") 24 | rows = soup.find("table").find("tbody").find_all("tr") 25 | if rows: 26 | # Validation check 27 | for row in rows: 28 | cells = row.find_all("td") 29 | if len(cells) > 0: 30 | # sanity check 
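# (editor's note) Each RapidDNS table row is assumed to be laid out as
# [domain, address, type, ...]: cells[0] holds the subdomain and
# cells[-1] the record type, which is why CNAME rows are emitted below
# without the address column.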
31 | subdomain = str(cells[0].get_text()) 32 | if cells[-1].get_text() == "CNAME": 33 | self.total_results.append(f"{subdomain}") 34 | else: 35 | self.total_results.append( 36 | f"{subdomain}:{str(cells[1].get_text()).strip()}" 37 | ) 38 | self.total_results = list({domain for domain in self.total_results}) 39 | except Exception as e: 40 | print(f"An exception has occurred: {str(e)}") 41 | 42 | async def process(self, proxy: bool = False) -> None: 43 | self.proxy = proxy 44 | await self.do_search() 45 | 46 | async def get_hostnames(self): 47 | return self.total_results 48 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/rocketreach.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Set 3 | 4 | from theHarvester.discovery.constants import MissingKey, get_delay 5 | from theHarvester.lib.core import AsyncFetcher, Core 6 | 7 | 8 | class SearchRocketReach: 9 | def __init__(self, word, limit) -> None: 10 | self.ips: Set = set() 11 | self.word = word 12 | self.key = Core.rocketreach_key() 13 | if self.key is None: 14 | raise MissingKey("RocketReach") 15 | self.hosts: Set = set() 16 | self.proxy = False 17 | self.baseurl = "https://rocketreach.co/api/v2/person/search" 18 | self.links: Set = set() 19 | self.limit = limit 20 | 21 | async def do_search(self) -> None: 22 | try: 23 | headers = { 24 | "Api-Key": self.key, 25 | "Content-Type": "application/json", 26 | "User-Agent": Core.get_user_agent(), 27 | } 28 | 29 | next_page = 1 # track pagination 30 | for count in range(1, self.limit): 31 | data = f'{{"query":{{"company_domain": ["{self.word}"]}}, "start": {next_page}, "page_size": 100}}' 32 | result = await AsyncFetcher.post_fetch( 33 | self.baseurl, headers=headers, data=data, json=True 34 | ) 35 | if ( 36 | "detail" in result.keys() 37 | and "error" in result.keys() 38 | and "Subscribe to a plan to access" in result["detail"] 39 | ): 40 | # No more results can be fetched 41 | break 42 | if ( 43 | "detail" in result.keys() 44 | and "Request was throttled." 
in result["detail"] 45 | ): 46 | # Rate limit has been triggered need to sleep extra 47 | print( 48 | f"RocketReach requests have been throttled; " 49 | f'{result["detail"].split(" ", 3)[-1].replace("available", "availability")}' 50 | ) 51 | break 52 | if "profiles" in dict(result).keys(): 53 | if len(result["profiles"]) == 0: 54 | break 55 | for profile in result["profiles"]: 56 | if "linkedin_url" in dict(profile).keys(): 57 | self.links.add(profile["linkedin_url"]) 58 | if "pagination" in dict(result).keys(): 59 | next_page = int(result["pagination"]["next"]) 60 | if next_page > int(result["pagination"]["total"]): 61 | break 62 | 63 | await asyncio.sleep(get_delay() + 5) 64 | 65 | except Exception as e: 66 | print(f"An exception has occurred: {e}") 67 | 68 | async def get_links(self): 69 | return self.links 70 | 71 | async def process(self, proxy: bool = False) -> None: 72 | self.proxy = proxy 73 | await self.do_search() 74 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/searchhunterhow.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from datetime import datetime 3 | from typing import Set 4 | 5 | from dateutil.relativedelta import relativedelta 6 | 7 | from theHarvester.discovery.constants import MissingKey 8 | from theHarvester.lib.core import AsyncFetcher, Core 9 | 10 | 11 | class SearchHunterHow: 12 | def __init__(self, word) -> None: 13 | self.word = word 14 | self.total_hostnames: Set = set() 15 | self.key = Core.hunterhow_key() 16 | if self.key is None: 17 | raise MissingKey("hunterhow") 18 | self.proxy = False 19 | 20 | async def do_search(self) -> None: 21 | # https://hunter.how/search-api 22 | query = f'domain.suffix="{self.word}"' 23 | # second_query = f'domain="{self.word}"' 24 | encoded_query = base64.urlsafe_b64encode(query.encode("utf-8")).decode("ascii") 25 | page = 1 26 | page_size = 100 # can be either: 10,20,50,100) 27 | # The interval between the start time and the end time cannot exceed one year 28 | # Can not exceed one year, but years=1 does not work due to their backend, 364 will suffice 29 | today = datetime.today() 30 | one_year_ago = today - relativedelta(days=364) 31 | start_time = one_year_ago.strftime("%Y-%m-%d") 32 | end_time = today.strftime("%Y-%m-%d") 33 | # two_years_ago = one_year_ago - relativedelta(days=364) 34 | # start_time = two_years_ago.strftime('%Y-%m-%d') 35 | # end_time = one_year_ago.strftime('%Y-%m-%d') 36 | url = ( 37 | "https://api.hunter.how/search?api-key=%s&query=%s&page=%d&page_size=%d&start_time=%s&end_time=%s" 38 | % ( 39 | # self.key, encoded_query, page, page_size, start_time, end_time 40 | self.key, 41 | encoded_query, 42 | page, 43 | page_size, 44 | start_time, 45 | end_time, 46 | ) 47 | ) 48 | # print(f'Sending url: {url}') 49 | response = await AsyncFetcher.fetch_all( 50 | [url], 51 | json=True, 52 | headers={"User-Agent": Core.get_user_agent(), "x-api-key": f"{self.key}"}, 53 | proxy=self.proxy, 54 | ) 55 | dct = response[0] 56 | # print(f'json response: ') 57 | # print(dct) 58 | if "code" in dct.keys(): 59 | if dct["code"] == 40001: 60 | print(f'Code 40001 indicates for searchhunterhow: {dct["message"]}') 61 | return 62 | # total = dct['data']['total'] 63 | # TODO determine if total is ever 100 how to get more subdomains? 
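# (editor's sketch for the TODO above, untested) hunter.how paginates its
# results, so one hedged fix is to read the commented-out
# total = dct['data']['total'] above and repeat the request with
# page += 1 (re-building the url each time) until page * page_size >= total.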
64 | for sub in dct["data"]["list"]: 65 | self.total_hostnames.add(sub["domain"]) 66 | 67 | async def get_hostnames(self) -> set: 68 | return self.total_hostnames 69 | 70 | async def process(self, proxy: bool = False) -> None: 71 | self.proxy = proxy 72 | await self.do_search() 73 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/securitytrailssearch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Sequence 3 | 4 | from theHarvester.discovery.constants import MissingKey 5 | from theHarvester.lib.core import AsyncFetcher, Core 6 | from theHarvester.parsers import securitytrailsparser 7 | 8 | 9 | class SearchSecuritytrail: 10 | def __init__(self, word) -> None: 11 | self.word = word 12 | self.key = Core.security_trails_key() 13 | if self.key is None: 14 | raise MissingKey("Securitytrail") 15 | self.results = "" 16 | self.totalresults = "" 17 | self.api = "https://api.securitytrails.com/v1/" 18 | self.info: tuple[set, set] = (set(), set()) 19 | self.proxy = False 20 | 21 | async def authenticate(self) -> None: 22 | # Method to authenticate API key before sending requests. 23 | headers = {"APIKEY": self.key} 24 | url = f"{self.api}ping" 25 | auth_responses = await AsyncFetcher.fetch_all( 26 | [url], headers=headers, proxy=self.proxy 27 | ) 28 | auth_responses = auth_responses[0] 29 | if "False" in auth_responses or "Invalid authentication" in auth_responses: 30 | print("\tKey could not be authenticated; exiting program.") 31 | await asyncio.sleep(5) 32 | 33 | async def do_search(self) -> None: 34 | # https://api.securitytrails.com/v1/domain/domain.com 35 | url = f"{self.api}domain/{self.word}" 36 | headers = {"APIKEY": self.key} 37 | response = await AsyncFetcher.fetch_all( 38 | [url], headers=headers, proxy=self.proxy 39 | ) 40 | await asyncio.sleep( 41 | 5 42 | ) # Fixed (non-random) delay; the rate limit requires at least 2 seconds between requests. 43 | self.results = response[0] 44 | self.totalresults += self.results 45 | url += "/subdomains" # Get subdomains now. 46 | subdomain_response = await AsyncFetcher.fetch_all( 47 | [url], headers=headers, proxy=self.proxy 48 | ) 49 | await asyncio.sleep(5) 50 | self.results = subdomain_response[0] 51 | self.totalresults += self.results 52 | 53 | async def process(self, proxy: bool = False) -> None: 54 | self.proxy = proxy 55 | await self.authenticate() 56 | await self.do_search() 57 | parser = securitytrailsparser.Parser(word=self.word, text=self.totalresults) 58 | self.info = await parser.parse_text() 59 | # parse_text() returns a (ips, hostnames) tuple; see the getters below. 
60 | print("\tDone Searching Results") 61 | 62 | async def get_ips(self) -> set: 63 | return self.info[0] 64 | 65 | async def get_hostnames(self) -> set: 66 | return self.info[1] 67 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/shodansearch.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from typing import List 3 | 4 | from shodan import Shodan, exception 5 | 6 | from theHarvester.discovery.constants import MissingKey 7 | from theHarvester.lib.core import Core 8 | 9 | 10 | class SearchShodan: 11 | def __init__(self) -> None: 12 | self.key = Core.shodan_key() 13 | if self.key is None: 14 | raise MissingKey("Shodan") 15 | self.api = Shodan(self.key) 16 | self.hostdatarow: List = [] 17 | self.tracker: OrderedDict = OrderedDict() 18 | 19 | async def search_ip(self, ip) -> OrderedDict: 20 | try: 21 | ipaddress = ip 22 | results = self.api.host(ipaddress) 23 | asn = "" 24 | domains: List = list() 25 | hostnames: List = list() 26 | ip_str = "" 27 | isp = "" 28 | org = "" 29 | ports: List = list() 30 | title = "" 31 | server = "" 32 | product = "" 33 | technologies: List = list() 34 | 35 | data_first_dict = dict(results["data"][0]) 36 | 37 | if "ip_str" in data_first_dict.keys(): 38 | ip_str += data_first_dict["ip_str"] 39 | 40 | if "http" in data_first_dict.keys(): 41 | http_results_dict = dict(data_first_dict["http"]) 42 | if "title" in http_results_dict.keys(): 43 | title_val = str(http_results_dict["title"]).strip() 44 | if title_val != "None": 45 | title += title_val 46 | if "components" in http_results_dict.keys(): 47 | for key in http_results_dict["components"].keys(): 48 | technologies.append(key) 49 | if "server" in http_results_dict.keys(): 50 | server_val = str(http_results_dict["server"]).strip() 51 | if server_val != "None": 52 | server += server_val 53 | 54 | for key, value in results.items(): 55 | if key == "asn": 56 | asn += value 57 | if key == "domains": 58 | value = list(value) 59 | value.sort() 60 | domains.extend(value) 61 | if key == "hostnames": 62 | value = [host.strip() for host in list(value)] 63 | value.sort() 64 | hostnames.extend(value) 65 | if key == "isp": 66 | isp += value 67 | if key == "org": 68 | org += str(value) 69 | if key == "ports": 70 | value = list(value) 71 | value.sort() 72 | ports.extend(value) 73 | if key == "product": 74 | product += value 75 | 76 | technologies = list(set(technologies)) 77 | 78 | self.tracker[ip] = { 79 | "asn": asn.strip(), 80 | "domains": domains, 81 | "hostnames": hostnames, 82 | "ip_str": ip_str.strip(), 83 | "isp": isp.strip(), 84 | "org": org.strip(), 85 | "ports": ports, 86 | "product": product.strip(), 87 | "server": server.strip(), 88 | "technologies": technologies, 89 | "title": title.strip(), 90 | } 91 | 92 | return self.tracker 93 | except exception.APIError: 94 | print(f"{ip}: Not in Shodan") 95 | self.tracker[ip] = "Not in Shodan" 96 | except Exception as e: 97 | # print(f'Error occurred in the Shodan IP search module: {e}') 98 | self.tracker[ip] = f"Error occurred in the Shodan IP search module: {e}" 99 | finally: 100 | return self.tracker 101 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/subdomaincenter.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher, Core 2 | 3 | 4 | class SubdomainCenter: 5 | def 
__init__(self, word): 6 | self.word = word 7 | self.results = set() 8 | self.server = "https://api.subdomain.center/?domain=" 9 | self.proxy = False 10 | 11 | async def do_search(self): 12 | headers = {"User-Agent": Core.get_user_agent()} 13 | try: 14 | current_url = f"{self.server}{self.word}" 15 | resp = await AsyncFetcher.fetch_all( 16 | [current_url], headers=headers, proxy=self.proxy, json=True 17 | ) 18 | self.results = resp[0] 19 | self.results = { 20 | sub[4:] if sub[:4] == "www." and sub[4:] else sub 21 | for sub in self.results 22 | } 23 | except Exception as e: 24 | print(f"An exception has occurred in SubdomainCenter on : {e}") 25 | 26 | async def get_hostnames(self): 27 | return self.results 28 | 29 | async def process(self, proxy=False): 30 | self.proxy = proxy 31 | await self.do_search() 32 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/subdomainfinderc99.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Set 3 | 4 | import ujson 5 | from bs4 import BeautifulSoup 6 | 7 | from theHarvester.discovery.constants import get_delay 8 | from theHarvester.lib.core import AsyncFetcher, Core 9 | from theHarvester.parsers import myparser 10 | 11 | 12 | class SearchSubdomainfinderc99: 13 | def __init__(self, word) -> None: 14 | self.word = word 15 | self.total_results: Set = set() 16 | self.proxy = False 17 | # TODO add api support 18 | self.server = "https://subdomainfinder.c99.nl/" 19 | self.totalresults = "" 20 | 21 | async def do_search(self) -> None: 22 | # Based on https://gist.github.com/th3gundy/bc83580cbe04031e9164362b33600962 23 | headers = {"User-Agent": Core.get_user_agent()} 24 | resp = await AsyncFetcher.fetch_all( 25 | [self.server], headers=headers, proxy=self.proxy 26 | ) 27 | data = await self.get_csrf_params(resp[0]) 28 | 29 | data["scan_subdomains"] = "" 30 | data["domain"] = self.word 31 | data["privatequery"] = "on" 32 | await asyncio.sleep(get_delay()) 33 | second_resp = await AsyncFetcher.post_fetch( 34 | self.server, headers=headers, proxy=self.proxy, data=ujson.dumps(data) 35 | ) 36 | 37 | # print(second_resp) 38 | self.totalresults += second_resp 39 | # y = await self.get_hostnames() 40 | # print(list(sorted(y))) 41 | # print(f'Found: {len(y)} subdomains') 42 | 43 | # regex = r"value='(https://subdomainfinder\.c99\.nl/scans/\d{4}-\d{2}-\d{2}/" + self.word + r")'" 44 | # match = re.search(regex, second_resp) 45 | # if match: 46 | # print(match.group(1)) 47 | 48 | async def get_hostnames(self): 49 | rawres = myparser.Parser(self.totalresults, self.word) 50 | return await rawres.hostnames() 51 | 52 | async def process(self, proxy: bool = False) -> None: 53 | self.proxy = proxy 54 | await self.do_search() 55 | 56 | @staticmethod 57 | async def get_csrf_params(data): 58 | csrf_params = {} 59 | html = BeautifulSoup(data, "html.parser").find("div", {"class": "input-group"}) 60 | for c in html.find_all("input"): 61 | try: 62 | csrf_params[c.get("name")] = c.get("value") 63 | except Exception: 64 | continue 65 | 66 | return csrf_params 67 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/threatminer.py: -------------------------------------------------------------------------------- 1 | from typing import Set 2 | 3 | from theHarvester.lib.core import AsyncFetcher 4 | 5 | 6 | class SearchThreatminer: 7 | def __init__(self, word) -> None: 8 | 
self.word = word 9 | self.totalhosts: Set = set() 10 | self.totalips: Set = set() 11 | self.proxy = False 12 | 13 | async def do_search(self) -> None: 14 | url = f"https://api.threatminer.org/v2/domain.php?q={self.word}&rt=5" 15 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 16 | self.totalhosts = {host for host in response[0]["results"]} 17 | second_url = f"https://api.threatminer.org/v2/domain.php?q={self.word}&rt=2" 18 | secondresp = await AsyncFetcher.fetch_all( 19 | [second_url], json=True, proxy=self.proxy 20 | ) 21 | try: 22 | self.totalips = {resp["ip"] for resp in secondresp[0]["results"]} 23 | except TypeError: 24 | pass 25 | 26 | async def get_hostnames(self) -> Set: 27 | return self.totalhosts 28 | 29 | async def get_ips(self) -> Set: 30 | return self.totalips 31 | 32 | async def process(self, proxy: bool = False) -> None: 33 | self.proxy = proxy 34 | await self.do_search() 35 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/urlscan.py: -------------------------------------------------------------------------------- 1 | from typing import Set 2 | 3 | from theHarvester.lib.core import AsyncFetcher 4 | 5 | 6 | class SearchUrlscan: 7 | def __init__(self, word) -> None: 8 | self.word = word 9 | self.totalhosts: Set = set() 10 | self.totalips: Set = set() 11 | self.interestingurls: Set = set() 12 | self.totalasns: Set = set() 13 | self.proxy = False 14 | 15 | async def do_search(self) -> None: 16 | url = f"https://urlscan.io/api/v1/search/?q=domain:{self.word}" 17 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 18 | resp = response[0] 19 | self.totalhosts = {f"{page['page']['domain']}" for page in resp["results"]} 20 | self.totalips = { 21 | f"{page['page']['ip']}" 22 | for page in resp["results"] 23 | if "ip" in page["page"].keys() 24 | } 25 | self.interestingurls = { 26 | f"{page['page']['url']}" 27 | for page in resp["results"] 28 | if self.word in page["page"]["url"] and "url" in page["page"].keys() 29 | } 30 | self.totalasns = { 31 | f"{page['page']['asn']}" 32 | for page in resp["results"] 33 | if "asn" in page["page"].keys() 34 | } 35 | 36 | async def get_hostnames(self) -> Set: 37 | return self.totalhosts 38 | 39 | async def get_ips(self) -> Set: 40 | return self.totalips 41 | 42 | async def get_interestingurls(self) -> Set: 43 | return self.interestingurls 44 | 45 | async def get_asns(self) -> Set: 46 | return self.totalasns 47 | 48 | async def process(self, proxy: bool = False) -> None: 49 | self.proxy = proxy 50 | await self.do_search() 51 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/yahoosearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher, Core 2 | from theHarvester.parsers import myparser 3 | 4 | 5 | class SearchYahoo: 6 | def __init__(self, word, limit) -> None: 7 | self.word = word 8 | self.total_results = "" 9 | self.server = "search.yahoo.com" 10 | self.limit = limit 11 | self.proxy = False 12 | 13 | async def do_search(self) -> None: 14 | base_url = f"https://{self.server}/search?p=%40{self.word}&b=xx&pz=10" 15 | headers = {"Host": self.server, "User-agent": Core.get_user_agent()} 16 | urls = [ 17 | base_url.replace("xx", str(num)) 18 | for num in range(0, self.limit, 10) 19 | if num <= self.limit 20 | ] 21 | responses = await AsyncFetcher.fetch_all( 22 | urls, 
headers=headers, proxy=self.proxy 23 | ) 24 | for response in responses: 25 | self.total_results += response 26 | 27 | async def process(self, proxy: bool = False) -> None: 28 | self.proxy = proxy 29 | await self.do_search() 30 | 31 | async def get_emails(self): 32 | rawres = myparser.Parser(self.total_results, self.word) 33 | toparse_emails = await rawres.emails() 34 | emails = set() 35 | # strip out numbers and dashes for emails that look like xxx-xxx-xxxemail@host.tld 36 | for email in toparse_emails: 37 | email = str(email) 38 | if "-" in email and email[0].isdigit() and email.index("-") <= 9: 39 | while email[0] == "-" or email[0].isdigit(): 40 | email = email[1:] 41 | emails.add(email) 42 | return list(emails) 43 | 44 | async def get_hostnames(self, proxy: bool = False): 45 | self.proxy = proxy 46 | rawres = myparser.Parser(self.total_results, self.word) 47 | return await rawres.hostnames() 48 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/theHarvester/theHarvester/parsers/__init__.py -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/parsers/intelxparser.py: -------------------------------------------------------------------------------- 1 | from typing import Set 2 | 3 | 4 | class Parser: 5 | def __init__(self) -> None: 6 | self.emails: Set = set() 7 | self.hosts: Set = set() 8 | 9 | async def parse_dictionaries(self, results: dict) -> tuple: 10 | """ 11 | Parse method to parse json results 12 | :param results: Dictionary containing a list of dictionaries known as selectors 13 | :return: tuple of emails and hosts 14 | """ 15 | if results is not None: 16 | for dictionary in results["selectors"]: 17 | field = dictionary["selectorvalue"] 18 | if "@" in field: 19 | self.emails.add(field) 20 | else: 21 | field = str(field) 22 | if "http" in field or "https" in field: 23 | if field[:5] == "https": 24 | field = field[8:] 25 | else: 26 | field = field[7:] 27 | self.hosts.add(field.replace(")", "").replace(",", "")) 28 | return self.emails, self.hosts 29 | return None, None 30 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/parsers/securitytrailsparser.py: -------------------------------------------------------------------------------- 1 | from typing import List, Set, Tuple, Union 2 | 3 | 4 | class Parser: 5 | def __init__(self, word, text) -> None: 6 | self.word = word 7 | self.text = text 8 | self.hostnames: Set = set() 9 | self.ips: Set = set() 10 | 11 | async def parse_text(self) -> Tuple[set, set]: 12 | sub_domain_flag = 0 13 | self.text = str(self.text).splitlines() 14 | # Split lines to get a list of lines. 15 | for index in range(0, len(self.text)): 16 | line = self.text[index].strip() 17 | if '"ip":' in line: 18 | # Extract IP. 
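# (editor's note) line[7:] skips the leading '"ip": "' prefix, then the
# loop below copies characters up to the closing quote. If the response
# is guaranteed to be valid JSON, json.loads() would be a sturdier
# alternative to this line-by-line scan.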
19 | ip = "" 20 | for ch in line[7:]: 21 | if ch == '"': 22 | break 23 | else: 24 | ip += ch 25 | self.ips.add(ip) 26 | elif '"subdomains":' in line: 27 | # subdomains start here so set flag to 1 28 | sub_domain_flag = 1 29 | continue 30 | elif sub_domain_flag > 0: 31 | if "]" in line: 32 | sub_domain_flag = 0 33 | else: 34 | if "www" in self.word: 35 | self.word = ( 36 | str(self.word).replace("www.", "").replace("www", "") 37 | ) 38 | # Remove www from word if entered 39 | self.hostnames.add( 40 | str(line).replace('"', "").replace(",", "") + "." + self.word 41 | ) 42 | else: 43 | continue 44 | return self.ips, self.hostnames 45 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/restfulHarvest.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import uvicorn 4 | 5 | 6 | def main(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument( 9 | "-H", 10 | "--host", 11 | default="127.0.0.1", 12 | help="IP address to listen on default is 127.0.0.1", 13 | ) 14 | parser.add_argument( 15 | "-p", 16 | "--port", 17 | default=5000, 18 | help="Port to bind the web server to, default is 5000", 19 | type=int, 20 | ) 21 | parser.add_argument( 22 | "-l", 23 | "--log-level", 24 | default="info", 25 | help="Set logging level, default is info but [critical|error|warning|info|debug|trace] can be set", 26 | ) 27 | parser.add_argument( 28 | "-r", 29 | "--reload", 30 | default=False, 31 | help="Enable automatic reload used during development of the api", 32 | action="store_true", 33 | ) 34 | 35 | args: argparse.Namespace = parser.parse_args() 36 | uvicorn.run( 37 | "theHarvester.lib.api.api:app", 38 | host=args.host, 39 | port=args.port, 40 | log_level=args.log_level, 41 | reload=args.reload, 42 | ) 43 | 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/screenshot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/theHarvester/theHarvester/screenshot/__init__.py -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/theHarvester.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import sys 3 | 4 | from theHarvester import __main__ 5 | 6 | 7 | def main(): 8 | platform = sys.platform 9 | if platform == "win32": 10 | # Required or things will break if trying to take screenshots 11 | import multiprocessing 12 | 13 | multiprocessing.freeze_support() 14 | asyncio.DefaultEventLoopPolicy = asyncio.WindowsSelectorEventLoopPolicy 15 | else: 16 | import uvloop 17 | 18 | uvloop.install() 19 | 20 | if "linux" in platform: 21 | import aiomultiprocess 22 | 23 | # As we are not using Windows, we can change the spawn method to fork for greater performance 24 | aiomultiprocess.set_context("fork") 25 | asyncio.run(__main__.entry_point()) 26 | -------------------------------------------------------------------------------- /wordlists/dorks.txt: -------------------------------------------------------------------------------- 1 | inurl:"contact" 2 | intext:email filetype:log 3 | "Index of /mail" 4 | "admin account info" filetype:log 5 | intext:@ 6 | administrator accounts/ 7 | intitle:"Index of" .bash_history 8 | intitle:"index of" 
members OR accounts 9 | inurl:/shared/help.php 10 | inurl:public 11 | intitle:index.of inbox 12 | intitle:"Server Administration" 13 | inurl:passwd.txt 14 | robots.txt 15 | php-addressbook "This is the addressbook for *" -warning -------------------------------------------------------------------------------- /wordlists/general/common.txt: -------------------------------------------------------------------------------- 1 | admin 2 | test 3 | hello 4 | uk 5 | login 6 | book 7 | robots.txt 8 | --------------------------------------------------------------------------------