├── .gitignore ├── .vscode └── settings.json ├── InfoHunter.py ├── LICENSE ├── README.md ├── api-keys example.yaml ├── api_keys example.json ├── images ├── logo1.png └── logo2.png ├── proxies.yaml ├── requirements.txt ├── src ├── evaluacion │ ├── __init__.py │ └── mejoras.py ├── maigret │ ├── .dockerignore │ ├── .githooks │ │ └── pre-commit │ ├── .github │ │ ├── FUNDING.yml │ │ ├── ISSUE_TEMPLATE │ │ │ ├── add-a-site.md │ │ │ ├── bug.md │ │ │ └── report-false-result.md │ │ ├── dependabot.yml │ │ └── workflows │ │ │ ├── build-docker-image.yml │ │ │ ├── codeql-analysis.yml │ │ │ ├── pyinstaller.yml │ │ │ ├── python-package.yml │ │ │ ├── python-publish.yml │ │ │ └── update-site-data.yml │ ├── .gitignore │ ├── CHANGELOG.md │ ├── CODE_OF_CONDUCT.md │ ├── CONTRIBUTING.md │ ├── Dockerfile │ ├── LICENSE │ ├── MANIFEST.in │ ├── Makefile │ ├── README.md │ ├── docs │ │ ├── Makefile │ │ ├── make.bat │ │ ├── requirements.txt │ │ └── source │ │ │ ├── command-line-options.rst │ │ │ ├── conf.py │ │ │ ├── development.rst │ │ │ ├── extracting-information-from-pages.rst │ │ │ ├── features.rst │ │ │ ├── index.rst │ │ │ ├── philosophy.rst │ │ │ ├── roadmap.rst │ │ │ ├── settings.rst │ │ │ ├── supported-identifier-types.rst │ │ │ ├── tags.rst │ │ │ └── usage-examples.rst │ ├── maigret.py │ ├── maigret │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── __version__.py │ │ ├── activation.py │ │ ├── checking.py │ │ ├── errors.py │ │ ├── executors.py │ │ ├── maigret.py │ │ ├── notify.py │ │ ├── report.py │ │ ├── resources │ │ │ ├── data.json │ │ │ ├── simple_report.tpl │ │ │ ├── simple_report_pdf.css │ │ │ └── simple_report_pdf.tpl │ │ ├── result.py │ │ ├── settings.py │ │ ├── sites.py │ │ ├── submit.py │ │ ├── types.py │ │ └── utils.py │ ├── pyinstaller │ │ ├── maigret_standalone.py │ │ └── requirements.txt │ ├── pytest.ini │ ├── requirements.txt │ ├── setup.cfg │ ├── setup.py │ ├── sites.md │ ├── snapcraft.yaml │ ├── static │ │ ├── chat_gitter.svg │ │ ├── maigret.png │ │ ├── recursive_search.md │ │ ├── recursive_search.svg │ │ ├── report_alexaimephotography_html_screenshot.png │ │ ├── report_alexaimephotography_xmind_screenshot.png │ │ ├── report_alexaimephotographycars.html │ │ └── report_alexaimephotographycars.pdf │ ├── tests │ │ ├── __init__.py │ │ ├── conftest.py │ │ ├── db.json │ │ ├── local.json │ │ ├── test_activation.py │ │ ├── test_checking.py │ │ ├── test_cli.py │ │ ├── test_data.py │ │ ├── test_executors.py │ │ ├── test_maigret.py │ │ ├── test_notify.py │ │ ├── test_report.py │ │ ├── test_sites.py │ │ └── test_utils.py │ ├── utils │ │ ├── __init__.py │ │ ├── add_tags.py │ │ ├── check_engines.py │ │ ├── import_sites.py │ │ ├── sites_diff.py │ │ └── update_site_data.py │ └── wizard.py ├── recopilacion │ ├── __init__.py │ ├── consultas.py │ ├── extraccion.py │ └── fuentes.py ├── riesgos │ ├── __init__.py │ └── evaluacion.py ├── sherlock │ ├── .dockerignore │ ├── .editorconfig │ ├── .github │ │ ├── ISSUE_TEMPLATE │ │ │ ├── bug-report.md │ │ │ ├── feature-request.md │ │ │ ├── question.md │ │ │ ├── reporting-false-negative.md │ │ │ ├── reporting-false-positive.md │ │ │ └── site-support-request.md │ │ └── workflows │ │ │ ├── main.yml │ │ │ ├── nightly.yml │ │ │ ├── pull_request.yml │ │ │ └── update-site-list.yml │ ├── .gitignore │ ├── .replit │ ├── CODE_OF_CONDUCT.md │ ├── CONTRIBUTING.md │ ├── Dockerfile │ ├── LICENSE │ ├── README.md │ ├── docker-compose.yml │ ├── images │ │ └── preview.png │ ├── removed_sites.json │ ├── removed_sites.md │ ├── requirements.txt │ ├── sherlock │ │ ├── __init__.py │ │ ├── 
__main__.py │ │ ├── notify.py │ │ ├── resources │ │ │ └── data.json │ │ ├── result.py │ │ ├── sherlock.py │ │ ├── sites.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── all.py │ │ │ ├── base.py │ │ │ └── test_multiple_usernames.py │ ├── site_list.py │ └── sites.md └── theHarvester │ ├── .dockerignore │ ├── .flake8 │ ├── .git-blame-ignore-revs │ ├── .gitattributes │ ├── .github │ ├── FUNDING.yml │ ├── ISSUE_TEMPLATE │ │ └── issue-template.md │ ├── dependabot.yml │ └── workflows │ │ ├── codeql-analysis.yml │ │ ├── dockerci.yml │ │ └── theHarvester.yml │ ├── .gitignore │ ├── .isort.cfg │ ├── .pyre_configuration │ ├── Dockerfile │ ├── README.md │ ├── README │ ├── CONTRIBUTING.md │ ├── COPYING │ └── LICENSES │ ├── docker-compose.yml │ ├── mypy.ini │ ├── pyproject.toml │ ├── pytest.ini │ ├── requirements.txt │ ├── requirements │ ├── base.txt │ └── dev.txt │ ├── restfulHarvest.py │ ├── setup.cfg │ ├── tests │ ├── __init__.py │ ├── discovery │ │ ├── __init__.py │ │ ├── test_anubis.py │ │ ├── test_certspotter.py │ │ ├── test_githubcode.py │ │ └── test_otx.py │ └── test_myparser.py │ ├── theHarvester-logo.png │ ├── theHarvester-logo.webp │ ├── theHarvester.py │ └── theHarvester │ ├── __init__.py │ ├── __main__.py │ ├── data │ ├── proxies.yaml │ └── wordlists │ │ ├── dns-big.txt │ │ ├── dns-names.txt │ │ ├── dorks.txt │ │ ├── general │ │ └── common.txt │ │ └── names_small.txt │ ├── discovery │ ├── __init__.py │ ├── anubis.py │ ├── baidusearch.py │ ├── bevigil.py │ ├── binaryedgesearch.py │ ├── bingsearch.py │ ├── bravesearch.py │ ├── bufferoverun.py │ ├── censysearch.py │ ├── certspottersearch.py │ ├── constants.py │ ├── criminalip.py │ ├── crtsh.py │ ├── dnsdumpster.py │ ├── dnssearch.py │ ├── duckduckgosearch.py │ ├── fullhuntsearch.py │ ├── githubcode.py │ ├── hackertarget.py │ ├── huntersearch.py │ ├── intelxsearch.py │ ├── netlas.py │ ├── onyphe.py │ ├── otxsearch.py │ ├── pentesttools.py │ ├── projectdiscovery.py │ ├── rapiddns.py │ ├── rocketreach.py │ ├── searchhunterhow.py │ ├── securitytrailssearch.py │ ├── shodansearch.py │ ├── sitedossier.py │ ├── subdomaincenter.py │ ├── subdomainfinderc99.py │ ├── takeover.py │ ├── threatminer.py │ ├── tombasearch.py │ ├── urlscan.py │ ├── virustotal.py │ ├── yahoosearch.py │ └── zoomeyesearch.py │ ├── parsers │ ├── __init__.py │ ├── intelxparser.py │ ├── myparser.py │ └── securitytrailsparser.py │ ├── restfulHarvest.py │ ├── screenshot │ ├── __init__.py │ └── screenshot.py │ └── theHarvester.py └── wordlists ├── dns-big.txt ├── dns-names.txt ├── dorks.txt ├── general └── common.txt └── names_small.txt /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "[python]": { 3 | "editor.defaultFormatter": "ms-python.black-formatter" 4 | }, 5 | "python.formatting.provider": "none" 6 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | InfoHunter logo
3 | </div>
4 | 5 |   6 | 7 | # 🔎 InfoHunter 8 | 9 | InfoHunter es una herramienta de código abierto para la recopilación de información en OSINT, diseñada para proteger la seguridad y privacidad de personas y empresas. 10 | 11 | ![Licencia](https://img.shields.io/github/license/sweetnight19/InfoHunter) 12 | [![Estado del Proyecto](https://img.shields.io/badge/Estado-En%20Desarrollo-yellow.svg)](https://github.com/sweetnight19/InfoHunter) 13 | ![GitHub release (latest by date)](https://img.shields.io/github/v/release/sweetnight19/infohunter) 14 | ![GitHub all releases](https://img.shields.io/github/downloads/Sweetnight19/InfoHunter/total) 15 | ![GitHub Repo stars](https://img.shields.io/github/stars/sweetnight19/infohunter?style=plastic) 16 | 17 |   18 | 19 | ## 📚 Descripción del proyecto 20 | 21 | InfoHunter es una herramienta desarrollada en Python que utiliza técnicas de OSINT (Open Source Intelligence) para recopilar información relevante de diversas fuentes en línea. La herramienta se enfoca en identificar y analizar información para crear perfiles completos de las personas o empresas investigadas, así como para identificar posibles riesgos de seguridad o privacidad. Además, proporciona medidas y buenas prácticas para proteger la privacidad y seguridad de los sujetos de interés. 22 | 23 | ## 🚀 Funcionalidades principales 24 | 25 | - Recopilación de información de fuentes en línea. 26 | - Análisis de la información para crear perfiles completos. 27 | - Identificación y evaluación de riesgos de seguridad o privacidad. 28 | - Medidas y buenas prácticas para proteger la privacidad y seguridad. 29 | - Evaluación de la eficacia de las medidas implementadas. 30 | 31 | ## 🔧 Instalación 32 | 33 | 1. Clona el repositorio de GitHub: 34 | 35 | ```bash 36 | git clone https://github.com/sweetnight19/InfoHunter.git 37 | 38 | ``` 39 | 40 | 2. Instala las dependencias: 41 | 42 | ```bash 43 | pip install -r requirements.txt 44 | ``` 45 | 46 | ## 📖 Uso 47 | 48 | 1. Ejecuta el archivo InfoHunter.py: 49 | 50 | ```bash 51 | python InfoHunter.py 52 | 53 | ``` 54 | 55 | 2. Sigue las instrucciones en la interfaz de línea de comandos para utilizar las funcionalidades de InfoHunter. 56 | 57 | ## 🗺️ Roadmap 58 | 59 | A continuación se muestra el plan de desarrollo para el proyecto: 60 | 61 | - [x] Implementación de la búsqueda de información en fuentes abiertas. 62 | - [x] Análisis de datos recopilados para crear perfiles de personas y empresas. 63 | - [x] Identificación de posibles riesgos de seguridad y privacidad. 64 | - [x] Establecimiento de medidas para proteger la privacidad y seguridad. 65 | - [x] Evaluación de la eficacia de las medidas implementadas. 66 | 67 | ## 👤 Autor 68 | 69 | - Sweetnight19 70 | - Email: sweetnight19@protonmail.com 71 | - GitHub: [@sweetnight19](https://github.com/sweetnight19) 72 | 73 | ## 🤝 Contribuciones 74 | 75 | Las contribuciones son bienvenidas. Si deseas contribuir a este proyecto, sigue los siguientes pasos: 76 | 77 | 1. Haz un fork del repositorio. 78 | 2. Crea una nueva rama para tu contribución. 79 | 3. Realiza tus modificaciones y mejoras. 80 | 4. Envía un pull request. 81 | 82 | ## 📜 Licencia 83 | 84 | Este proyecto está licenciado bajo la [Licencia GPL v3](https://www.gnu.org/licenses/gpl-3.0.en.html). 
85 | -------------------------------------------------------------------------------- /api-keys example.yaml: -------------------------------------------------------------------------------- 1 | apikeys: 2 | bevigil: 3 | key: "API-KEY" 4 | 5 | binaryedge: 6 | key: "API-KEY" 7 | 8 | bing: 9 | key: 10 | 11 | bufferoverun: 12 | key: 13 | 14 | censys: 15 | id: "API-KEY" 16 | secret: "API-KEY" 17 | 18 | criminalip: 19 | key: 20 | 21 | fullhunt: 22 | key: "API-KEY" 23 | 24 | github: 25 | key: 26 | 27 | hunter: 28 | key: "API-KEY" 29 | 30 | intelx: 31 | key: "API-KEY" 32 | 33 | netlas: 34 | key: "API-KEY" 35 | 36 | pentestTools: 37 | key: 38 | 39 | projectDiscovery: 40 | key: 41 | 42 | rocketreach: 43 | key: 44 | 45 | securityTrails: 46 | key: "API-KEY" 47 | 48 | shodan: 49 | key: "API-KEY" 50 | 51 | virustotal: 52 | key: "API-KEY" 53 | 54 | zoomeye: 55 | key: "API-KEY" 56 | -------------------------------------------------------------------------------- /api_keys example.json: -------------------------------------------------------------------------------- 1 | { 2 | "pyhunter": "API-KEY", 3 | "hibp-api-key": "API-KEY", 4 | "breachdirectory": "API-KEY", 5 | "similar-web": "API-KEY" 6 | } -------------------------------------------------------------------------------- /images/logo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/images/logo1.png -------------------------------------------------------------------------------- /images/logo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/images/logo2.png -------------------------------------------------------------------------------- /proxies.yaml: -------------------------------------------------------------------------------- 1 | http: 2 | - ip:port 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiodns==3.1.1 2 | aiofiles==23.2.1 3 | aiohttp==3.9.3 4 | aiomultiprocess==0.9.0 5 | aiosignal==1.3.1 6 | aiosqlite==0.19.0 7 | annotated-types==0.6.0 8 | anyio==3.7.1 9 | appdirs==1.4.4 10 | argcomplete==3.2.2 11 | attrs==23.2.0 12 | backoff==2.2.1 13 | beautifulsoup4==4.12.3 14 | censys==2.2.11 15 | certifi==2024.2.2 16 | cffi==1.16.0 17 | chardet==5.2.0 18 | charset-normalizer==3.3.2 19 | click==8.1.7 20 | click-plugins==1.1.1 21 | colorama==0.4.6 22 | Deprecated==1.2.14 23 | dnspython==2.5.0 24 | fastapi==0.109.2 25 | filelock==3.13.1 26 | fpdf==1.7.2 27 | frozenlist==1.4.1 28 | h11==0.14.0 29 | idna==3.6 30 | importlib-metadata==7.0.1 31 | importlib-resources==6.1.1 32 | limits==3.7.0 33 | lxml==5.1.0 34 | markdown-it-py==3.0.0 35 | mdurl==0.1.2 36 | multidict==6.0.5 37 | netaddr==0.10.1 38 | packaging==23.2 39 | pillow==10.2.0 40 | pycares==4.4.0 41 | pycparser==2.21 42 | pydantic==2.6.1 43 | pydantic_core==2.16.2 44 | pyee==8.2.2 45 | pyfiglet==1.0.2 46 | Pygments==2.17.2 47 | pyhunter==1.7 48 | pyppeteer==1.0.2 49 | python-dateutil==2.8.2 50 | PyYAML==6.0.1 51 | reportlab==4.1.0 52 | requests==2.31.0 53 | requests-file==2.0.0 54 | retrying==1.3.4 55 | rich==13.7.0 56 | setuptools==69.0.3 57 | shodan==1.31.0 58 | six==1.16.0 59 | slowapi==0.1.9 60 | sniffio==1.3.0 61 | soupsieve==2.5 62 | starlette==0.36.3 63 | theHarvester==0.0.1 64 | 
tldextract==5.1.1 65 | tqdm==4.66.2 66 | typing_extensions==4.9.0 67 | ujson==5.9.0 68 | urllib3==1.26.18 69 | uvicorn==0.27.0.post1 70 | websockets==10.4 71 | wrapt==1.16.0 72 | XlsxWriter==3.1.9 73 | yarl==1.9.4 74 | zipp==3.17.0 75 | -------------------------------------------------------------------------------- /src/evaluacion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/evaluacion/__init__.py -------------------------------------------------------------------------------- /src/maigret/.dockerignore: -------------------------------------------------------------------------------- 1 | .git/ 2 | .vscode/ 3 | static/ 4 | tests/ 5 | *.txt 6 | !/requirements.txt 7 | venv/ 8 | 9 | -------------------------------------------------------------------------------- /src/maigret/.githooks/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | python3 ./utils/update_site_data.py 3 | -------------------------------------------------------------------------------- /src/maigret/.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | patreon: soxoj 4 | -------------------------------------------------------------------------------- /src/maigret/.github/ISSUE_TEMPLATE/add-a-site.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Add a site 3 | about: I want to add a new site for Maigret checks 4 | title: New site 5 | labels: new-site 6 | assignees: soxoj 7 | 8 | --- 9 | 10 | Link to the site main page: https://example.com 11 | Link to an existing account: https://example.com/users/john 12 | Link to a nonexistent account: https://example.com/users/noonewouldeverusethis7 13 | Tags: photo, us, ... 
14 | 
--------------------------------------------------------------------------------
/src/maigret/.github/ISSUE_TEMPLATE/bug.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Maigret bug report
 3 | about: I want to report a bug in Maigret functionality
 4 | title: ''
 5 | labels: bug
 6 | assignees: soxoj
 7 | 
 8 | ---
 9 | 
10 | ## Checklist
11 | 
12 | - [ ] I'm reporting a bug in Maigret functionality
13 | - [ ] I've checked for similar bug reports, including closed ones
14 | - [ ] I've checked for pull requests that attempt to fix this bug
15 | 
16 | ## Description
17 | 
18 | Info about the Maigret version you are running and your environment (`--version`, operating system, ISP provider):
19 | 
20 | 
21 | How to reproduce this bug (command-line options / conditions):
22 | 
23 | 
24 | 
25 | 
26 | 
27 | 
28 | 
29 | 
--------------------------------------------------------------------------------
/src/maigret/.github/ISSUE_TEMPLATE/report-false-result.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Report invalid result
 3 | about: I want to report an invalid result of a Maigret search
 4 | title: Invalid result
 5 | labels: false-result
 6 | assignees: soxoj
 7 | 
 8 | ---
 9 | 
10 | Invalid link:
11 | 
12 | 
19 | 
20 | - [ ] I'm sure that the link leads to a "not found" page
21 | 
--------------------------------------------------------------------------------
/src/maigret/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   - package-ecosystem: "pip"
4 |     directory: "/"
5 |     schedule:
6 |       interval: "daily"
7 | 
--------------------------------------------------------------------------------
/src/maigret/.github/workflows/build-docker-image.yml:
--------------------------------------------------------------------------------
 1 | name: Build docker image and push to DockerHub
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ main ]
 6 | 
 7 | jobs:
 8 |   docker:
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |       -
12 |         name: Set up QEMU
13 |         uses: docker/setup-qemu-action@v1
14 |       -
15 |         name: Set up Docker Buildx
16 |         uses: docker/setup-buildx-action@v1
17 |       -
18 |         name: Login to DockerHub
19 |         uses: docker/login-action@v1
20 |         with:
21 |           username: ${{ secrets.DOCKER_HUB_USERNAME }}
22 |           password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
23 |       -
24 |         name: Build and push
25 |         id: docker_build
26 |         uses: docker/build-push-action@v2
27 |         with:
28 |           push: true
29 |           tags: ${{ secrets.DOCKER_HUB_USERNAME }}/maigret:latest
30 |           platforms: linux/amd64,linux/arm64
31 |       -
32 |         name: Image digest
33 |         run: echo ${{ steps.docker_build.outputs.digest }}
34 | 
--------------------------------------------------------------------------------
/src/maigret/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
 1 | # For most projects, this workflow file will not need changing; you simply need
 2 | # to commit it to your repository.
 3 | #
 4 | # You may wish to alter this file to override the set of languages analyzed,
 5 | # or to provide custom queries or build logic.
 6 | #
 7 | # ******** NOTE ********
 8 | # We have attempted to detect the languages in your repository. Please check
 9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ main ] 17 | schedule: 18 | - cron: '23 6 * * 6' 19 | 20 | jobs: 21 | analyze: 22 | name: Analyze 23 | runs-on: ubuntu-latest 24 | permissions: 25 | actions: read 26 | contents: read 27 | security-events: write 28 | 29 | strategy: 30 | fail-fast: false 31 | matrix: 32 | language: [ 'python' ] 33 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 34 | # Learn more about CodeQL language support at https://git.io/codeql-language-support 35 | 36 | steps: 37 | - name: Checkout repository 38 | uses: actions/checkout@v2 39 | 40 | # Initializes the CodeQL tools for scanning. 41 | - name: Initialize CodeQL 42 | uses: github/codeql-action/init@v1 43 | with: 44 | languages: ${{ matrix.language }} 45 | # If you wish to specify custom queries, you can do so here or in a config file. 46 | # By default, queries listed here will override any specified in a config file. 47 | # Prefix the list here with "+" to use these queries and those in the config file. 48 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 49 | 50 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 51 | # If this step fails, then you should remove it and run the build manually (see below) 52 | - name: Autobuild 53 | uses: github/codeql-action/autobuild@v1 54 | 55 | # ℹ️ Command-line programs to run using the OS shell. 56 | # 📚 https://git.io/JvXDl 57 | 58 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 59 | # and modify them (or add more) to build your code if your project 60 | # uses a compiled language 61 | 62 | #- run: | 63 | # make bootstrap 64 | # make release 65 | 66 | - name: Perform CodeQL Analysis 67 | uses: github/codeql-action/analyze@v1 68 | -------------------------------------------------------------------------------- /src/maigret/.github/workflows/pyinstaller.yml: -------------------------------------------------------------------------------- 1 | name: Package exe with PyInstaller - Windows 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | 7 | jobs: 8 | build: 9 | 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: PyInstaller Windows 15 | uses: JackMcKew/pyinstaller-action-windows@main 16 | with: 17 | path: pyinstaller 18 | 19 | - uses: actions/upload-artifact@v2 20 | with: 21 | name: maigret_standalone_win32 22 | path: pyinstaller/dist/windows # or path/to/artifact 23 | -------------------------------------------------------------------------------- /src/maigret/.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | name: Linting and testing 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | types: [opened, synchronize, reopened] 9 | 10 | jobs: 11 | build: 12 | 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: [3.7, 3.8, 3.9] 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install -r test-requirements.txt 28 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 29 | - name: Test with pytest 30 | run: | 31 | pytest --reruns 3 --reruns-delay 5 32 | 
-------------------------------------------------------------------------------- /src/maigret/.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: '3.x' 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install setuptools wheel twine 22 | - name: Build and publish 23 | env: 24 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 25 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 26 | run: | 27 | python setup.py sdist bdist_wheel 28 | twine upload dist/* 29 | -------------------------------------------------------------------------------- /src/maigret/.github/workflows/update-site-data.yml: -------------------------------------------------------------------------------- 1 | name: Update sites rating and statistics 2 | 3 | on: 4 | pull_request: 5 | branches: [ dev ] 6 | types: [opened, synchronize] 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout repository 13 | uses: actions/checkout@v2.3.2 14 | with: 15 | ref: ${{ github.event.pull_request.head.sha }} 16 | fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository. 17 | 18 | - name: build application 19 | run: | 20 | pip3 install . 21 | python3 ./utils/update_site_data.py --empty-only 22 | 23 | - name: Commit and push changes 24 | run: | 25 | git config --global user.name "Maigret autoupdate" 26 | git config --global user.email "soxoj@protonmail.com" 27 | echo `git name-rev ${{ github.event.pull_request.head.sha }} --name-only` 28 | export BRANCH=`git name-rev ${{ github.event.pull_request.head.sha }} --name-only | sed 's/remotes\/origin\///'` 29 | echo $BRANCH 30 | git remote -v 31 | git checkout $BRANCH 32 | git add sites.md 33 | git commit -m "Updated site list and statistics" 34 | git push origin $BRANCH -------------------------------------------------------------------------------- /src/maigret/.gitignore: -------------------------------------------------------------------------------- 1 | # Virtual Environment 2 | venv/ 3 | 4 | # Editor Configurations 5 | .vscode/ 6 | .idea/ 7 | 8 | # Python 9 | __pycache__/ 10 | 11 | # Pip 12 | src/ 13 | 14 | # Jupyter Notebook 15 | .ipynb_checkpoints 16 | *.ipynb 17 | 18 | # Logs and backups 19 | *.log 20 | *.bak 21 | 22 | # Output files, except requirements.txt 23 | *.txt 24 | !requirements.txt 25 | 26 | # Comma-Separated Values (CSV) Reports 27 | *.csv 28 | 29 | # MacOS Folder Metadata File 30 | .DS_Store 31 | /reports/ 32 | 33 | # Testing 34 | .coverage 35 | dist/ 36 | htmlcov/ 37 | /test_* 38 | 39 | # Maigret files 40 | settings.json 41 | -------------------------------------------------------------------------------- /src/maigret/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to contribute 2 | 3 | Hey! I'm really glad you're reading this. Maigret contains a lot of sites, and it is very hard to keep all the sites operational. That's why any fix is important. 4 | 5 | ## How to add a new site 6 | 7 | #### Beginner level 8 | 9 | You can use Maigret **submit mode** (`maigret --submit URL`) to add a new site or update an existing site. 
In this mode Maigret does an automatic analysis of the given account URL or site main page URL to determine the site engine and the methods to check account presence. After the check Maigret asks if you want to add the site; answering y/Y will rewrite the local database.
10 | 
11 | #### Advanced level
12 | 
13 | You can edit [the database JSON file](https://github.com/soxoj/maigret/blob/main/maigret/resources/data.json) (`./maigret/resources/data.json`) manually.
14 | 
15 | ## Testing
16 | 
17 | There are CI checks for every PR to the Maigret repository, but it is better to run `make format`, `make lint` and `make test` locally to ensure you've made correct changes.
18 | 
19 | ## Submitting changes
20 | 
21 | To submit your changes you must [send a GitHub PR](https://github.com/soxoj/maigret/pulls) to the Maigret project.
22 | Always write a clear log message for your commits. One-line messages are fine for small changes, but bigger changes should look like this:
23 | 
24 |     $ git commit -m "A brief summary of the commit
25 |     >
26 |     > A paragraph describing what changed and its impact."
27 | 
28 | ## Coding conventions
29 | 
30 | Start reading the code and you'll get the hang of it. ;)
31 | 
--------------------------------------------------------------------------------
/src/maigret/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.9-slim
 2 | LABEL maintainer="Soxoj <soxoj@protonmail.com>"
 3 | WORKDIR /app
 4 | RUN pip install --no-cache-dir --upgrade pip
 5 | RUN apt-get update && \
 6 |     apt-get install --no-install-recommends -y \
 7 |     gcc \
 8 |     musl-dev \
 9 |     libxml2 \
10 |     libxml2-dev \
11 |     libxslt-dev \
12 |     && \
13 |     rm -rf /var/lib/apt/lists/* /tmp/*
14 | COPY . .
15 | RUN YARL_NO_EXTENSIONS=1 python3 -m pip install --no-cache-dir .
16 | ENTRYPOINT ["maigret"]
17 | 
--------------------------------------------------------------------------------
/src/maigret/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Sherlock Project
 4 | Copyright (c) 2020-2021 Soxoj
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | 
--------------------------------------------------------------------------------
/src/maigret/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.md
3 | include requirements.txt
4 | include maigret/resources/*
5 | 
--------------------------------------------------------------------------------
/src/maigret/Makefile:
--------------------------------------------------------------------------------
 1 | LINT_FILES=maigret wizard.py tests
 2 | 
 3 | test:
 4 | 	coverage run --source=./maigret -m pytest tests
 5 | 	coverage report -m
 6 | 	coverage html
 7 | 
 8 | rerun-tests:
 9 | 	pytest --lf -vv
10 | 
11 | lint:
12 | 	@echo 'syntax errors or undefined names'
13 | 	flake8 --count --select=E9,F63,F7,F82 --show-source --statistics ${LINT_FILES} maigret.py
14 | 
15 | 	@echo 'warning'
16 | 	flake8 --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --ignore=E731,W503,E501 ${LINT_FILES} maigret.py
17 | 
18 | 	@echo 'mypy'
19 | 	mypy ${LINT_FILES}
20 | 
21 | speed:
22 | 	time python3 ./maigret.py --version
23 | 	python3 -c "import timeit; t = timeit.Timer('import maigret'); print(t.timeit(number = 1000000))"
24 | 	python3 -X importtime -c "import maigret" 2> maigret-import.log
25 | 	python3 -m tuna maigret-import.log
26 | 
27 | format:
28 | 	@echo 'black'
29 | 	black --skip-string-normalization ${LINT_FILES}
30 | 
31 | pull:
32 | 	git stash
33 | 	git checkout main
34 | 	git pull origin main
35 | 	git stash pop
36 | 
37 | clean:
38 | 	rm -rf reports htmlcov dist
39 | 
40 | install:
41 | 	pip3 install .
42 | 
--------------------------------------------------------------------------------
/src/maigret/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 
--------------------------------------------------------------------------------
/src/maigret/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 
--------------------------------------------------------------------------------
/src/maigret/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx-copybutton
2 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | 
 3 | # -- Project information
 4 | 
 5 | project = 'Maigret'
 6 | copyright = '2021, soxoj'
 7 | author = 'soxoj'
 8 | 
 9 | release = '0.4.4'
10 | version = '0.4.4'
11 | 
12 | # -- General configuration
13 | 
14 | extensions = [
15 |     'sphinx.ext.duration',
16 |     'sphinx.ext.doctest',
17 |     'sphinx.ext.autodoc',
18 |     'sphinx.ext.autosummary',
19 |     'sphinx.ext.intersphinx',
20 |     'sphinx_copybutton'
21 | ]
22 | 
23 | intersphinx_mapping = {
24 |     'python': ('https://docs.python.org/3/', None),
25 |     'sphinx': ('https://www.sphinx-doc.org/en/master/', None),
26 | }
27 | intersphinx_disabled_domains = ['std']
28 | 
29 | templates_path = ['_templates']
30 | 
31 | # -- Options for HTML output
32 | 
33 | html_theme = 'sphinx_rtd_theme'
34 | 
35 | # -- Options for EPUB output
36 | epub_show_urls = 'footnote'
37 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/development.rst:
--------------------------------------------------------------------------------
 1 | .. _development:
 2 | 
 3 | Development
 4 | ==============
 5 | 
 6 | Testing
 7 | -------
 8 | 
 9 | It is recommended to use Python 3.7/3.8 for testing due to some conflicts in 3.9.
10 | 
11 | Install the test requirements:
12 | 
13 | .. code-block:: console
14 | 
15 |     pip install -r test-requirements.txt
16 | 
17 | 
18 | Use the following commands to check Maigret:
19 | 
20 | .. code-block:: console
21 | 
22 |     # run linter and typing checks
23 |     # order of checks:
24 |     # - critical syntax errors or undefined names
25 |     # - flake checks
26 |     # - mypy checks
27 |     make lint
28 | 
29 |     # run testing with a coverage html report
30 |     # current test coverage is 60%
31 |     make test
32 | 
33 |     # open the html report
34 |     open htmlcov/index.html
35 | 
36 | 
37 | How to publish a new version of Maigret
38 | ---------------------------------------
39 | 
40 | **Collaborator rights are required; write to Soxoj to get them.**
41 | 
42 | To publish a new version you must first create a new branch in the repository
43 | with a bumped version number and an up-to-date changelog. After that you
44 | must create a release, and a GitHub action automatically creates a new
45 | PyPI package.
46 | 
47 | - New branch example: https://github.com/soxoj/maigret/commit/e520418f6a25d7edacde2d73b41a8ae7c80ddf39
48 | - Release example: https://github.com/soxoj/maigret/releases/tag/v0.4.1
49 | 
50 | 1. Make a new branch locally with a new version name. Check the current version number here: https://pypi.org/project/maigret/.
51 |    **Increase only the patch version (the third number)** if there are no breaking changes.
52 | 
53 | .. code-block:: console
54 | 
55 |     git checkout -b 0.4.0
56 | 
57 | 2. Update the Maigret version in three files manually:
58 | 
59 |    - setup.py
60 |    - maigret/__version__.py
61 |    - docs/source/conf.py
62 | 
63 | 3. Create a new empty text section at the beginning of the `CHANGELOG.md` file with the current date:
64 | 
65 | .. code-block:: console
66 | 
67 |     ## [0.4.0] - 2022-01-03
68 | 
69 | 4. Get the auto-generated release notes:
70 | 
71 |    - Open https://github.com/soxoj/maigret/releases/new
72 |    - Click `Choose a tag`, enter `v0.4.0` (your version)
73 |    - Click `Create new tag`
74 |    - Press `+ Auto-generate release notes`
75 |    - Copy all the text from the description text field below
76 |    - Paste it into the empty text section in `CHANGELOG.md`
77 |    - Remove the redundant line `## What's Changed`, and the `## New Contributors` section if it exists
78 |    - *Close the new release page*
79 | 
80 | 5. Commit all the changes, push, and make a pull request
81 | 
82 | .. code-block:: console
83 | 
84 |     git add -p
85 |     git commit -m 'Bump to YOUR VERSION'
86 |     git push origin head
87 | 
88 | 
89 | 6. Merge the pull request
90 | 
91 | 7. Create a new release
92 | 
93 |    - Open https://github.com/soxoj/maigret/releases/new again
94 |    - Click `Choose a tag`
95 |    - Enter the actual version in the format `v0.4.0`
96 |    - Also enter the actual version in the field `Release title`
97 |    - Click `Create new tag`
98 |    - Press `+ Auto-generate release notes`
99 |    - **Press the "Publish release" button**
100 | 
101 | 8. That's all; now you can simply wait for the push to PyPI. You can monitor it on the Actions page: https://github.com/soxoj/maigret/actions/workflows/python-publish.yml
--------------------------------------------------------------------------------
/src/maigret/docs/source/extracting-information-from-pages.rst:
--------------------------------------------------------------------------------
 1 | .. _extracting-information-from-pages:
 2 | 
 3 | Extracting information from pages
 4 | =================================
 5 | Maigret can parse URLs and the content of web pages to extract info about the account owner and other meta information.
 6 | 
 7 | You must specify the URL with the option ``--parse``; it can be a link to an account or to an online document. For the list of supported sites, `see here `_.
 8 | 
 9 | After the parsing phase ends, Maigret will start the search phase using the :doc:`supported identifiers <supported-identifier-types>` found (usernames, ids, etc.).
10 | 
11 | Examples
12 | --------
13 | .. code-block:: console
14 | 
15 |     $ maigret --parse https://docs.google.com/spreadsheets/d/1HtZKMLRXNsZ0HjtBmo0Gi03nUPiJIA4CC4jTYbCAnXw/edit\#gid\=0
16 | 
17 |     Scanning webpage by URL https://docs.google.com/spreadsheets/d/1HtZKMLRXNsZ0HjtBmo0Gi03nUPiJIA4CC4jTYbCAnXw/edit#gid=0...
18 |     ┣╸org_name: Gooten
19 |     ┗╸mime_type: application/vnd.google-apps.ritz
20 |     Scanning webpage by URL https://clients6.google.com/drive/v2beta/files/1HtZKMLRXNsZ0HjtBmo0Gi03nUPiJIA4CC4jTYbCAnXw?fields=alternateLink%2CcopyRequiresWriterPermission%2CcreatedDate%2Cdescription%2CdriveId%2CfileSize%2CiconLink%2Cid%2Clabels(starred%2C%20trashed)%2ClastViewedByMeDate%2CmodifiedDate%2Cshared%2CteamDriveId%2CuserPermission(id%2Cname%2CemailAddress%2Cdomain%2Crole%2CadditionalRoles%2CphotoLink%2Ctype%2CwithLink)%2Cpermissions(id%2Cname%2CemailAddress%2Cdomain%2Crole%2CadditionalRoles%2CphotoLink%2Ctype%2CwithLink)%2Cparents(id)%2Ccapabilities(canMoveItemWithinDrive%2CcanMoveItemOutOfDrive%2CcanMoveItemOutOfTeamDrive%2CcanAddChildren%2CcanEdit%2CcanDownload%2CcanComment%2CcanMoveChildrenWithinDrive%2CcanRename%2CcanRemoveChildren%2CcanMoveItemIntoTeamDrive)%2Ckind&supportsTeamDrives=true&enforceSingleParent=true&key=AIzaSyC1eQ1xj69IdTMeii5r7brs3R90eck-m7k...
21 |     ┣╸created_at: 2016-02-16T18:51:52.021Z
22 |     ┣╸updated_at: 2019-10-23T17:15:47.157Z
23 |     ┣╸gaia_id: 15696155517366416778
24 |     ┣╸fullname: Nadia Burgess
25 |     ┣╸email: nadia@gooten.com
26 |     ┣╸image: https://lh3.googleusercontent.com/a-/AOh14GheZe1CyNa3NeJInWAl70qkip4oJ7qLsD8vDy6X=s64
27 |     ┗╸email_username: nadia
28 | 
29 | .. code-block:: console
30 | 
31 |     $ maigret.py --parse https://steamcommunity.com/profiles/76561199113454789
32 |     Scanning webpage by URL https://steamcommunity.com/profiles/76561199113454789...
33 |     ┣╸steam_id: 76561199113454789
34 |     ┣╸nickname: Pok
35 |     ┗╸username: Machine42
36 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/features.rst:
--------------------------------------------------------------------------------
 1 | .. _features:
 2 | 
 3 | Features
 4 | ========
 5 | 
 6 | This is the list of Maigret features.
 7 | 
 8 | Personal info gathering
 9 | -----------------------
10 | 
11 | Maigret does the `parsing of account webpages and extraction `_ of personal info, links to other profiles, etc.
12 | Extracted info is displayed as an additional result in the CLI output and as tables in HTML and PDF reports.
13 | Also, Maigret uses the found ids and usernames from links to start a recursive search.
14 | 
15 | Enabled by default, can be disabled with ``--no-extracting``.
16 | 
17 | Recursive search
18 | ----------------
19 | 
20 | Maigret can extract some :ref:`common ids <supported-identifier-types>` and usernames from links on the account page (people often place links to their other accounts) and immediately start new searches. All the gathered information will be displayed in the CLI output and reports.
21 | 
22 | Enabled by default, can be disabled with ``--no-recursion``.
23 | 
24 | Reports
25 | -------
26 | 
27 | Maigret currently supports HTML, PDF, TXT, XMind 8 mindmap, and JSON reports.
28 | 
29 | HTML/PDF reports contain:
30 | 
31 | - the profile photo
32 | - all the gathered personal info
33 | - additional information about supposed personal data (full name, gender, location), resulting from statistics of all found accounts
34 | 
35 | Also, there is a short text report in the CLI output after the end of the search phase.
36 | 
37 | **Warning**: XMind 8 mindmaps are incompatible with XMind 2022!
38 | 
39 | Tags
40 | ----
41 | 
42 | The Maigret site database is very big (and will get bigger), and it may be overhead to run a search across all the sites.
43 | Also, it is often hard to tell which sites are more interesting in the case of a certain person.
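
For example, a country- or topic-scoped run looks like this (the commands mirror the :doc:`usage examples <usage-examples>` page; ``machine42`` is the sample username used throughout these docs):

.. code-block:: console

    maigret machine42 --tags us,jp
    maigret machine42 --tags coding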
44 | 
45 | Tags markup allows selecting a subset of sites by interests (photo, messaging, finance, etc.) or by country. Tags of found accounts are grouped and displayed in the reports.
46 | 
47 | See the full description :doc:`in the Tags page <tags>`.
48 | 
49 | Censorship and captcha detection
50 | --------------------------------
51 | 
52 | Maigret can detect common errors such as censorship stub pages, CloudFlare captcha pages, and others.
53 | If you get more than 3% errors of a certain type in a session, you'll get a warning message in the CLI output with recommendations for improving performance and avoiding problems.
54 | 
55 | Retries
56 | -------
57 | 
58 | Maigret will retry requests that failed with temporary errors (connection failures, proxy errors, etc.).
59 | 
60 | One attempt by default; can be changed with the option ``--retries N``.
61 | 
62 | Archives and mirrors checking
63 | -----------------------------
64 | 
65 | The Maigret database contains not only the original websites, but also mirrors, archives, and aggregators. For example:
66 | 
67 | - `Reddit BigData search `_
68 | - `Picuki `_, an Instagram mirror
69 | - `Twitter shadowban `_ checker
70 | 
71 | It allows getting additional info about the person and checking the existence of the account even if the main site is unavailable (bot protection, captcha, etc.)
72 | 
73 | Simple API
74 | ----------
75 | 
76 | Maigret can be easily integrated with the use of the Python package `maigret `_.
77 | 
78 | Example: the official `Telegram bot `_
79 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. _index:
 2 | 
 3 | Welcome to the Maigret docs!
 4 | ============================
 5 | 
 6 | **Maigret** is an easy-to-use and powerful OSINT tool for collecting a dossier on a person by username only.
 7 | 
 8 | This is achieved by checking for accounts on a huge number of sites and gathering all the available information from web pages.
 9 | 
10 | The project's main goal is to give OSINT researchers and pentesters a **universal tool** to get maximum information about a subject and to integrate it with other tools in automation pipelines.
11 | 
12 | You may be interested in:
13 | -------------------------
14 | - :doc:`Command line options description <command-line-options>` and :doc:`usage examples <usage-examples>`
15 | - :doc:`Features list <features>`
16 | - :doc:`Project roadmap <roadmap>`
17 | 
18 | .. toctree::
19 |     :hidden:
20 |     :caption: Sections
21 | 
22 |     command-line-options
23 |     extracting-information-from-pages
24 |     features
25 |     philosophy
26 |     roadmap
27 |     supported-identifier-types
28 |     tags
29 |     usage-examples
30 |     settings
31 |     development
32 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/philosophy.rst:
--------------------------------------------------------------------------------
 1 | .. _philosophy:
 2 | 
 3 | Philosophy
 4 | ==========
 5 | 
 6 | TL;DR: Username => Dossier
 7 | 
 8 | Maigret is designed to gather all the available information about a person by their username.
 9 | 
10 | What kind of information is this? First, links to the person's accounts. Secondly, all the machine-extractable
11 | pieces of info, such as: other usernames, full name, URLs of the person's images, birthday, location (country,
12 | city, etc.), gender.
13 | 
14 | All this information forms a dossier, and it is also useful for other tools and analytical purposes.
15 | Each collected piece of data has a label of a certain format (for example, ``follower_count`` for the number
16 | of subscribers or ``created_at`` for the account creation time) so that it can be parsed and analyzed by various
17 | systems and stored in databases.
18 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/roadmap.rst:
--------------------------------------------------------------------------------
 1 | .. _roadmap:
 2 | 
 3 | Roadmap
 4 | =======
 5 | 
 6 | .. figure:: https://i.imgur.com/kk8cFdR.png
 7 |     :target: https://i.imgur.com/kk8cFdR.png
 8 |     :align: center
 9 | 
10 | Current status
11 | --------------
12 | 
13 | - Sites DB stats - ok
14 | - Scan sessions stats - ok
15 | - Site engine autodetect - ok
16 | - Engines for all the sites - WIP
17 | - Unified reporting flow - ok
18 | - Retries - ok
19 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/settings.rst:
--------------------------------------------------------------------------------
 1 | .. _settings:
 2 | 
 3 | Settings
 4 | ==============
 5 | 
 6 | Options are also configurable through settings files. See the
 7 | `settings JSON file `_
 8 | for the list of currently supported options.
 9 | 
10 | On startup Maigret tries to load configuration from the following sources, in exactly this order:
11 | 
12 | .. code-block:: console
13 | 
14 |     # relative path, based on the installed package path
15 |     resources/settings.json
16 | 
17 |     # absolute path, configuration file in the home directory
18 |     ~/.maigret/settings.json
19 | 
20 |     # relative path, based on the current working directory
21 |     settings.json
22 | 
23 | Missing any of these files is not an error.
24 | If the next settings file contains an already-known option,
25 | that option will be overwritten. So it is possible to make
26 | custom configurations for different users and directories.
27 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/supported-identifier-types.rst:
--------------------------------------------------------------------------------
 1 | .. _supported-identifier-types:
 2 | 
 3 | Supported identifier types
 4 | ==========================
 5 | 
 6 | Maigret can search not only by ordinary usernames, but also by certain common identifiers. Below is a list of all currently supported identifiers.
 7 | 
 8 | - **gaia_id** - Google inner numeric user identifier; in former times it was placed in a Google Plus account URL.
 9 | - **steam_id** - Steam inner numeric user identifier.
10 | - **wikimapia_uid** - Wikimapia.org inner numeric user identifier.
11 | - **uidme_uguid** - uID.me inner numeric user identifier.
12 | - **yandex_public_id** - Yandex sites inner letter-based user identifier. See also: `YaSeeker `_.
13 | - **vk_id** - VK.com inner numeric user identifier.
14 | - **ok_id** - OK.ru inner numeric user identifier.
15 | - **yelp_userid** - Yelp inner user identifier.
16 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/tags.rst:
--------------------------------------------------------------------------------
 1 | .. _tags:
 2 | 
 3 | Tags
 4 | ====
 5 | 
 6 | The use of tags allows you to select a subset of sites from the big Maigret DB for a search.
 7 | 
 8 | **Warning: the tags markup is not stable yet.**
 9 | 
10 | There are several types of tags:
11 | 
12 | 1. **Country codes**: ``us``, ``jp``, ``br``... (`ISO 3166-1 alpha-2 `_).
   These tags reflect the site language and the regional origin of its users, and are then used to locate the owner of a username. If the regional origin is difficult to establish, or a site is positioned as worldwide, `no country code is given`. There can be multiple country-code tags for one site.
13 | 
14 | 2. **Site engines**. Most of them are forum engines now: ``uCoz``, ``vBulletin``, ``XenForo`` et al. The full list of engines is stored in the Maigret database.
15 | 
16 | 3. **Sites' subject/type and the interests of its users**. For the moment, the full list of "standard" tags is only `present in the source code `_.
17 | 
18 | Usage
19 | -----
20 | ``--tags us,jp`` -- search on US and Japanese sites (actually marked as such in the Maigret database)
21 | 
22 | ``--tags coding`` -- search on sites related to software development.
23 | 
24 | ``--tags ucoz`` -- search on uCoz sites only (mostly CIS countries)
25 | 
--------------------------------------------------------------------------------
/src/maigret/docs/source/usage-examples.rst:
--------------------------------------------------------------------------------
 1 | .. _usage-examples:
 2 | 
 3 | Usage examples
 4 | ==============
 5 | 
 6 | Start a search for accounts with username ``machine42`` on the top 500 sites from the Maigret DB.
 7 | 
 8 | .. code-block:: console
 9 | 
10 |     maigret machine42
11 | 
12 | Start a search for accounts with username ``machine42`` on **all sites** from the Maigret DB.
13 | 
14 | .. code-block:: console
15 | 
16 |     maigret machine42 -a
17 | 
18 | Start a search [...] and generate HTML and PDF reports.
19 | 
20 | .. code-block:: console
21 | 
22 |     maigret machine42 -a -HP
23 | 
24 | Start a search for accounts with username ``machine42`` only on Facebook.
25 | 
26 | .. code-block:: console
27 | 
28 |     maigret machine42 --site Facebook
29 | 
30 | Extract information from a Steam page by URL and start a search for accounts with the found username ``machine42``.
31 | 
32 | .. code-block:: console
33 | 
34 |     maigret --parse https://steamcommunity.com/profiles/76561199113454789
35 | 
36 | Start a search for accounts with username ``machine42`` only on US and Japanese sites.
37 | 
38 | .. code-block:: console
39 | 
40 |     maigret machine42 --tags us,jp
41 | 
42 | Start a search for accounts with username ``machine42`` only on sites related to software development.
43 | 
44 | .. code-block:: console
45 | 
46 |     maigret machine42 --tags coding
47 | 
48 | Start a search for accounts with username ``machine42`` on uCoz sites only (mostly CIS countries).
49 | 
50 | .. code-block:: console
51 | 
52 |     maigret machine42 --tags ucoz
53 | 
54 | 
--------------------------------------------------------------------------------
/src/maigret/maigret.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import asyncio
 3 | import sys
 4 | 
 5 | from maigret.maigret import main
 6 | 
 7 | 
 8 | def run():
 9 |     try:
10 |         if sys.version_info.minor >= 10:
11 |             asyncio.run(main())
12 |         else:
13 |             loop = asyncio.get_event_loop()
14 |             loop.run_until_complete(main())
15 |     except KeyboardInterrupt:
16 |         print('Maigret is interrupted.')
17 |         sys.exit(1)
18 | 
19 | 
20 | if __name__ == "__main__":
21 |     run()
22 | 
--------------------------------------------------------------------------------
/src/maigret/maigret/__init__.py:
--------------------------------------------------------------------------------
 1 | """Maigret"""
 2 | 
 3 | __title__ = 'Maigret'
 4 | __package__ = 'maigret'
 5 | __author__ = 'Soxoj'
 6 | __author_email__ = 'soxoj@protonmail.com'
 7 | 
 8 | 
 9 | from .__version__ import __version__
10 | from .checking import maigret as search
11 | from .maigret import main as cli
12 | from .sites import MaigretEngine, MaigretSite, MaigretDatabase
13 | from .notify import QueryNotifyPrint as Notifier
14 | 
--------------------------------------------------------------------------------
/src/maigret/maigret/__main__.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python3
 2 | 
 3 | """
 4 | Maigret entrypoint
 5 | """
 6 | 
 7 | import asyncio
 8 | 
 9 | from .maigret import main
10 | 
11 | if __name__ == "__main__":
12 |     asyncio.run(main())
13 | 
--------------------------------------------------------------------------------
/src/maigret/maigret/__version__.py:
--------------------------------------------------------------------------------
1 | """Maigret version file"""
2 | 
3 | __version__ = '0.4.4'
4 | 
--------------------------------------------------------------------------------
/src/maigret/maigret/activation.py:
--------------------------------------------------------------------------------
 1 | from http.cookiejar import MozillaCookieJar
 2 | from http.cookies import Morsel
 3 | 
 4 | from aiohttp import CookieJar
 5 | 
 6 | 
 7 | class ParsingActivator:
 8 |     @staticmethod
 9 |     def twitter(site, logger, cookies={}):
10 |         headers = dict(site.headers)
11 |         del headers["x-guest-token"]
12 |         import requests
13 | 
14 |         r = requests.post(site.activation["url"], headers=headers)
15 |         logger.info(r)
16 |         j = r.json()
17 |         guest_token = j[site.activation["src"]]
18 |         site.headers["x-guest-token"] = guest_token
19 | 
20 |     @staticmethod
21 |     def vimeo(site, logger, cookies={}):
22 |         headers = dict(site.headers)
23 |         if "Authorization" in headers:
24 |             del headers["Authorization"]
25 |         import requests
26 | 
27 |         r = requests.get(site.activation["url"], headers=headers)
28 |         jwt_token = r.json()["jwt"]
29 |         site.headers["Authorization"] = "jwt " + jwt_token
30 | 
31 |     @staticmethod
32 |     def spotify(site, logger, cookies={}):
33 |         headers = dict(site.headers)
34 |         if "Authorization" in headers:
35 |             del headers["Authorization"]
36 |         import requests
37 | 
38 |         r = requests.get(site.activation["url"])
39 |         bearer_token = r.json()["accessToken"]
40 |         site.headers["authorization"] = f"Bearer {bearer_token}"
41 | 
42 | 
43 | def import_aiohttp_cookies(cookiestxt_filename):
44 |     cookies_obj = MozillaCookieJar(cookiestxt_filename)
45 |     cookies_obj.load(ignore_discard=True, ignore_expires=True)
46 | 
47 |     cookies = CookieJar()
48 | 
49 |     cookies_list = []
50 |     for domain in cookies_obj._cookies.values():
51 |         for key, cookie in list(domain.values())[0].items():
52 |             c = Morsel()
53 |             c.set(key, cookie.value, cookie.value)
54 |             c["domain"] = cookie.domain
55 |             c["path"] = cookie.path
56 |             cookies_list.append((key, c))
57 | 
58 |     cookies.update_cookies(cookies_list)
59 | 
60 |     return cookies
61 | 
--------------------------------------------------------------------------------
/src/maigret/maigret/errors.py:
--------------------------------------------------------------------------------
  1 | from typing import Dict, List, Any
  2 | 
  3 | from .result import QueryResult
  4 | from .types import QueryResultWrapper
  5 | 
  6 | 
  7 | # error got as a result of completed search query
  8 | class CheckError:
  9 |     _type = 'Unknown'
 10 |     _desc = ''
 11 | 
 12 |     def __init__(self, typename, desc=''):
 13 |         self._type = typename
 14 |         self._desc = desc
 15 | 
 16 |     def __str__(self):
 17 |         if not self._desc:
 18 |             return f'{self._type} error'
 19 | 
 20 |         return f'{self._type} error: {self._desc}'
 21 | 
 22 |     @property
 23 |     def type(self):
 24 |         return self._type
 25 | 
 26 |     @property
 27 |     def desc(self):
 28 |         return self._desc
 29 | 
 30 | 
 31 | COMMON_ERRORS = {
 32 |     'Attention Required! | Cloudflare': CheckError(
 33 |         'Captcha', 'Cloudflare'
 34 |     ),
 35 |     'Please stand by, while we are checking your browser': CheckError(
 36 |         'Bot protection', 'Cloudflare'
 37 |     ),
 38 |     'Checking your browser before accessing': CheckError(
 39 |         'Bot protection', 'Cloudflare'
 40 |     ),
 41 |     'This website is using a security service to protect itself from online attacks.': CheckError(
 42 |         'Access denied', 'Cloudflare'
 43 |     ),
 44 |     'Доступ ограничен': CheckError('Censorship', 'Rostelecom'),
 45 |     'document.getElementById(\'validate_form_submit\').disabled=true': CheckError(
 46 |         'Captcha', 'Mail.ru'
 47 |     ),
 48 |     'Verifying your browser, please wait... DDoS Protection by Blazingfast.io': CheckError(
 49 |         'Bot protection', 'Blazingfast'
 50 |     ),
 51 |     '404 Мы не нашли страницу': CheckError(
 52 |         'Resolving', 'MegaFon 404 page'
 53 |     ),
 54 |     'Доступ к информационному ресурсу ограничен на основании Федерального закона': CheckError(
 55 |         'Censorship', 'MGTS'
 56 |     ),
 57 |     'Incapsula incident ID': CheckError('Bot protection', 'Incapsula'),
 58 |     'Сайт заблокирован хостинг-провайдером': CheckError(
 59 |         'Site-specific', 'Site is disabled (Beget)'
 60 |     ),
 61 | }
 62 | 
 63 | ERRORS_TYPES = {
 64 |     'Captcha': 'Try to switch to another IP address or to use service cookies',
 65 |     'Bot protection': 'Try to switch to another IP address',
 66 |     'Censorship': 'Switch to another internet service provider',
 67 |     'Request timeout': 'Try to increase timeout or to switch to another internet service provider',
 68 |     'Connecting failure': 'Try to decrease number of parallel connections (e.g. -n 10)',
 69 | }
 70 | 
 71 | # TODO: checking for reason
 72 | ERRORS_REASONS = {
 73 |     'Login required': 'Add authorization cookies through `--cookies-jar-file` (see cookies.txt)',
 74 | }
 75 | 
 76 | TEMPORARY_ERRORS_TYPES = [
 77 |     'Request timeout',
 78 |     'Unknown',
 79 |     'Request failed',
 80 |     'Connecting failure',
 81 |     'HTTP',
 82 |     'Proxy',
 83 |     'Interrupted',
 84 |     'Connection lost',
 85 | ]
 86 | 
 87 | THRESHOLD = 3  # percent
 88 | 
 89 | 
 90 | def is_important(err_data):
 91 |     return err_data['perc'] >= THRESHOLD
 92 | 
 93 | 
 94 | def is_permanent(err_type):
 95 |     return err_type not in TEMPORARY_ERRORS_TYPES
 96 | 
 97 | 
 98 | def detect(text):
 99 |     for flag, err in COMMON_ERRORS.items():
100 |         if flag in text:
101 |             return err
102 |     return None
103 | 
104 | 
105 | def solution_of(err_type) -> str:
106 |     return ERRORS_TYPES.get(err_type, '')
107 | 
108 | 
109 | def extract_and_group(search_res: QueryResultWrapper) -> List[Dict[str, Any]]:
110 |     errors_counts: Dict[str, int] = {}
111 |     for r in search_res.values():
112 |         if r and isinstance(r, dict) and r.get('status'):
113 |             if not isinstance(r['status'], QueryResult):
114 |                 continue
115 | 
116 |             err = r['status'].error
117 |             if not err:
118 |                 continue
119 |             errors_counts[err.type] = errors_counts.get(err.type, 0) + 1
120 | 
121 |     counts = []
122 |     for err, count in sorted(errors_counts.items(), key=lambda x: x[1], reverse=True):
123 |         counts.append(
124 |             {
125 |                 'err': err,
126 |                 'count': count,
127 |                 'perc': round(count / len(search_res), 2) * 100,
128 |             }
129 |         )
130 | 
131 |     return counts
132 | 
--------------------------------------------------------------------------------
/src/maigret/maigret/resources/simple_report_pdf.css:
--------------------------------------------------------------------------------
 1 | h2 {
 2 |     font-size: 30px;
 3 |     width: 100%;
 4 |     display:block;
 5 | }
 6 | h3 {
 7 |     font-size: 25px;
 8 |     width: 100%;
 9 |     display:block;
10 | }
11 | h4 {
12 |     font-size: 20px;
13 |     width: 100%;
14 |     display:block;
15 | }
16 | p {
17 |     margin: 0 0 5px;
18 |     display: block;
19 | }
20 | 
21 | 
22 | table {
23 |     margin-bottom: 10px;
24 |     width:100%;
25 | }
26 | th {
27 |     font-weight: bold;
28 | }
29 | th,td,caption {
30 |     padding: 4px 10px 4px 5px;
31 | }
32 | table tr:nth-child(even) td,
33 | table tr.even td {
34 |     background-color: #e5ecf9;
35 | }
36 | 
37 | div {
38 |     border-bottom-color: #3e3e3e;
39 |     border-bottom-width: 1px;
40 |     border-bottom-style: solid;
41 | }
42 | .invalid-button {
43 |     position: absolute;
44 |     left: 10px;
45 | }
--------------------------------------------------------------------------------
/src/maigret/maigret/result.py:
--------------------------------------------------------------------------------
  1 | """Maigret Result Module
Module 2 | 3 | This module defines various objects for recording the results of queries. 4 | """ 5 | from enum import Enum 6 | 7 | 8 | class QueryStatus(Enum): 9 | """Query Status Enumeration. 10 | 11 | Describes status of query about a given username. 12 | """ 13 | 14 | CLAIMED = "Claimed" # Username Detected 15 | AVAILABLE = "Available" # Username Not Detected 16 | UNKNOWN = "Unknown" # Error Occurred While Trying To Detect Username 17 | ILLEGAL = "Illegal" # Username Not Allowable For This Site 18 | 19 | def __str__(self): 20 | """Convert Object To String. 21 | 22 | Keyword Arguments: 23 | self -- This object. 24 | 25 | Return Value: 26 | Nicely formatted string to get information about this object. 27 | """ 28 | return self.value 29 | 30 | 31 | class QueryResult: 32 | """Query Result Object. 33 | 34 | Describes result of query about a given username. 35 | """ 36 | 37 | def __init__( 38 | self, 39 | username, 40 | site_name, 41 | site_url_user, 42 | status, 43 | ids_data=None, 44 | query_time=None, 45 | context=None, 46 | error=None, 47 | tags=[], 48 | ): 49 | """Create Query Result Object. 50 | 51 | Contains information about a specific method of detecting usernames on 52 | a given type of web sites. 53 | 54 | Keyword Arguments: 55 | self -- This object. 56 | username -- String indicating username that query result 57 | was about. 58 | site_name -- String which identifies site. 59 | site_url_user -- String containing URL for username on site. 60 | NOTE: The site may or may not exist: this 61 | just indicates what the name would 62 | be, if it existed. 63 | status -- Enumeration of type QueryStatus() indicating 64 | the status of the query. 65 | query_time -- Time (in seconds) required to perform query. 66 | Default of None. 67 | context -- String indicating any additional context 68 | about the query. For example, if there was 69 | an error, this might indicate the type of 70 | error that occurred. 71 | Default of None. 72 | ids_data -- Extracted from website page info about other 73 | usernames and inner ids. 74 | 75 | Return Value: 76 | Nothing. 77 | """ 78 | 79 | self.username = username 80 | self.site_name = site_name 81 | self.site_url_user = site_url_user 82 | self.status = status 83 | self.query_time = query_time 84 | self.context = context 85 | self.ids_data = ids_data 86 | self.tags = tags 87 | self.error = error 88 | 89 | def json(self): 90 | return { 91 | "username": self.username, 92 | "site_name": self.site_name, 93 | "url": self.site_url_user, 94 | "status": str(self.status), 95 | "ids": self.ids_data or {}, 96 | "tags": self.tags, 97 | } 98 | 99 | def is_found(self): 100 | return self.status == QueryStatus.CLAIMED 101 | 102 | def __str__(self): 103 | """Convert Object To String. 104 | 105 | Keyword Arguments: 106 | self -- This object. 107 | 108 | Return Value: 109 | Nicely formatted string to get information about this object. 110 | """ 111 | status = str(self.status) 112 | if self.context is not None: 113 | # There is extra context information available about the results. 114 | # Append it to the normal response text. 
115 | status += f" ({self.context})" 116 | 117 | return status 118 | -------------------------------------------------------------------------------- /src/maigret/maigret/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as path 3 | import json 4 | from typing import List 5 | 6 | SETTINGS_FILES_PATHS = [ 7 | path.join(path.dirname(path.realpath(__file__)), "resources/settings.json"), 8 | '~/.maigret/settings.json', 9 | path.join(os.getcwd(), 'settings.json'), 10 | ] 11 | 12 | 13 | class Settings: 14 | # main maigret settings 15 | retries_count: int 16 | sites_db_path: str 17 | timeout: int 18 | max_connections: int 19 | recursive_search: bool 20 | info_extracting: bool 21 | cookie_jar_file: str 22 | ignore_ids_list: List 23 | reports_path: str 24 | proxy_url: str 25 | tor_proxy_url: str 26 | i2p_proxy_url: str 27 | domain_search: bool 28 | scan_all_sites: bool 29 | top_sites_count: int 30 | scan_disabled_sites: bool 31 | scan_sites_list: List 32 | self_check_enabled: bool 33 | print_not_found: bool 34 | print_check_errors: bool 35 | colored_print: bool 36 | show_progressbar: bool 37 | report_sorting: str 38 | json_report_type: str 39 | txt_report: bool 40 | csv_report: bool 41 | xmind_report: bool 42 | pdf_report: bool 43 | html_report: bool 44 | graph_report: bool 45 | 46 | # submit mode settings 47 | presence_strings: list 48 | supposed_usernames: list 49 | 50 | def __init__(self): 51 | pass 52 | 53 | def load(self, paths=None): 54 | was_inited = False 55 | 56 | if not paths: 57 | paths = SETTINGS_FILES_PATHS 58 | 59 | for filename in paths: 60 | data = {} 61 | 62 | try: 63 | with open(filename, "r", encoding="utf-8") as file: 64 | data = json.load(file) 65 | except FileNotFoundError: 66 | # treat as a normal situation 67 | pass 68 | except Exception as error: 69 | return False, ValueError( 70 | f"Problem with parsing json contents of " 71 | f"settings file '{filename}': {str(error)}." 72 | ) 73 | 74 | self.__dict__.update(data) 75 | if data: 76 | was_inited = True 77 | 78 | return ( 79 | was_inited, 80 | f'None of the default settings files found: {", ".join(paths)}', 81 | ) 82 | 83 | @property 84 | def json(self): 85 | return self.__dict__ 86 | -------------------------------------------------------------------------------- /src/maigret/maigret/types.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, List, Dict, Tuple, Any 2 | 3 | 4 | # search query 5 | QueryDraft = Tuple[Callable, List, Dict] 6 | 7 | # options dict 8 | QueryOptions = Dict[str, Any] 9 | 10 | # TODO: throw out 11 | QueryResultWrapper = Dict[str, Any] 12 | -------------------------------------------------------------------------------- /src/maigret/maigret/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | import ast 3 | import difflib 4 | import re 5 | import random 6 | from typing import Any 7 | 8 | 9 | DEFAULT_USER_AGENTS = [ 10 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36", 11 | ] 12 | 13 | 14 | class CaseConverter: 15 | @staticmethod 16 | def camel_to_snake(camelcased_string: str) -> str: 17 | return re.sub(r"(?<!^)(?=[A-Z])", "_", camelcased_string).lower() 18 | 19 | @staticmethod 20 | def snake_to_camel(snakecased_string: str) -> str:
21 | formatted = "".join(word.title() for word in snakecased_string.split("_")) 22 | result = formatted[0].lower() + formatted[1:] 23 | return result 24 | 25 | @staticmethod 26 | def snake_to_title(snakecased_string: str) -> str: 27 | words = snakecased_string.split("_") 28 | words[0] = words[0].title() 29 | return " ".join(words) 30 | 31 | 32 | def is_country_tag(tag: str) -> bool: 33 | """detect if tag represent a country""" 34 | return bool(re.match("^([a-zA-Z]){2}$", tag)) or tag == "global" 35 | 36 | 37 | def enrich_link_str(link: str) -> str: 38 | link = link.strip() 39 | if link.startswith("www.") or (link.startswith("http") and "//" in link): 40 | return f'<a href="{link}">{link}</a>' 41 | return link 42 | 43 | 44 | class URLMatcher: 45 | _HTTP_URL_RE_STR = "^https?://(www.|m.)?(.+)$" 46 | HTTP_URL_RE = re.compile(_HTTP_URL_RE_STR) 47 | UNSAFE_SYMBOLS = ".?" 48 | 49 | @classmethod 50 | def extract_main_part(self, url: str) -> str: 51 | match = self.HTTP_URL_RE.search(url) 52 | if match and match.group(2): 53 | return match.group(2).rstrip("/") 54 | 55 | return "" 56 | 57 | @classmethod 58 | def make_profile_url_regexp(self, url: str, username_regexp: str = ""): 59 | url_main_part = self.extract_main_part(url) 60 | for c in self.UNSAFE_SYMBOLS: 61 | url_main_part = url_main_part.replace(c, f"\\{c}") 62 | prepared_username_regexp = (username_regexp or ".+?").lstrip('^').rstrip('$') 63 | 64 | url_regexp = url_main_part.replace( 65 | "{username}", f"({prepared_username_regexp})" 66 | ) 67 | regexp_str = self._HTTP_URL_RE_STR.replace("(.+)", url_regexp) 68 | 69 | return re.compile(regexp_str, re.IGNORECASE) 70 | 71 | 72 | def ascii_data_display(data: str) -> Any: 73 | return ast.literal_eval(data) 74 | 75 | 76 | def get_dict_ascii_tree(items, prepend="", new_line=True): 77 | new_result = b'\xe2\x94\x9c'.decode() 78 | new_line = b'\xe2\x94\x80'.decode() 79 | last_result = b'\xe2\x94\x94'.decode() 80 | skip_result = b'\xe2\x94\x82'.decode() 81 | 82 | text = "" 83 | for num, item in enumerate(items): 84 | box_symbol = ( 85 | new_result + new_line if num != len(items) - 1 else last_result + new_line 86 | ) 87 | 88 | if type(item) == tuple: 89 | field_name, field_value = item 90 | if field_value.startswith("['"): 91 | is_last_item = num == len(items) - 1 92 | prepend_symbols = " " * 3 if is_last_item else f" {skip_result} " 93 | data = ascii_data_display(field_value) 94 | field_value = get_dict_ascii_tree(data, prepend_symbols) 95 | text += f"\n{prepend}{box_symbol}{field_name}: {field_value}" 96 | else: 97 | text += f"\n{prepend}{box_symbol} {item}" 98 | 99 | if not new_line: 100 | text = text[1:] 101 | 102 | return text 103 | 104 | 105 | def get_random_user_agent(): 106 | return random.choice(DEFAULT_USER_AGENTS) 107 | 108 | 109 | def get_match_ratio(base_strs: list): 110 | def get_match_inner(s: str): 111 | return round( 112 | max( 113 | [ 114 | difflib.SequenceMatcher(a=s.lower(), b=s2.lower()).ratio() 115 | for s2 in base_strs 116 | ] 117 | ), 118 | 2, 119 | ) 120 | 121 | return get_match_inner 122 | -------------------------------------------------------------------------------- /src/maigret/pyinstaller/maigret_standalone.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import asyncio 3 | 4 | import maigret 5 | 6 | if __name__ == "__main__": 7 | asyncio.run(maigret.cli()) -------------------------------------------------------------------------------- /src/maigret/pyinstaller/requirements.txt: 
-------------------------------------------------------------------------------- 1 | maigret @ https://github.com/soxoj/maigret/archive/refs/heads/main.zip 2 | pefile==2022.5.30 3 | psutil==5.9.5 4 | pyinstaller @ https://github.com/pyinstaller/pyinstaller/archive/develop.zip 5 | pywin32-ctypes==0.2.0 -------------------------------------------------------------------------------- /src/maigret/pytest.ini: -------------------------------------------------------------------------------- 1 | # pytest.ini 2 | [pytest] 3 | filterwarnings = 4 | error 5 | ignore::UserWarning 6 | asyncio_mode=auto -------------------------------------------------------------------------------- /src/maigret/requirements.txt: -------------------------------------------------------------------------------- 1 | aiodns==3.0.0 2 | aiohttp==3.8.3 3 | aiohttp-socks==0.7.1 4 | arabic-reshaper==2.1.4 5 | async-timeout==4.0.2 6 | attrs==22.2.0 7 | certifi==2022.12.7 8 | chardet==5.0.0 9 | colorama==0.4.6 10 | future==0.18.3 11 | future-annotations==1.0.0 12 | html5lib==1.1 13 | idna==3.4 14 | Jinja2==3.1.2 15 | lxml==4.9.2 16 | MarkupSafe==2.1.1 17 | mock==4.0.3 18 | multidict==6.0.4 19 | pycountry==22.3.5 20 | PyPDF2==2.10.8 21 | PySocks==1.7.1 22 | python-bidi==0.4.2 23 | requests==2.28.2 24 | requests-futures==1.0.0 25 | six==1.16.0 26 | socid-extractor>=0.0.21 27 | soupsieve==2.3.2.post1 28 | stem==1.8.1 29 | torrequest==0.1.0 30 | tqdm==4.65.0 31 | typing-extensions==4.5.0 32 | webencodings==0.5.1 33 | xhtml2pdf==0.2.8 34 | XMind==1.2.0 35 | yarl==1.8.2 36 | networkx==2.6.3 37 | pyvis==0.2.1 38 | reportlab==3.6.12 39 | cloudscraper==1.2.66 40 | -------------------------------------------------------------------------------- /src/maigret/setup.cfg: -------------------------------------------------------------------------------- 1 | [egg_info] 2 | tag_build = 3 | tag_date = 0 4 | 5 | [flake8] 6 | per-file-ignores = __init__.py:F401 7 | 8 | [mypy] 9 | ignore_missing_imports = True -------------------------------------------------------------------------------- /src/maigret/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import ( 2 | setup, 3 | find_packages, 4 | ) 5 | 6 | 7 | with open('README.md') as fh: 8 | long_description = fh.read() 9 | 10 | with open('requirements.txt') as rf: 11 | requires = rf.read().splitlines() 12 | 13 | setup(name='maigret', 14 | version='0.4.4', 15 | description='Collect a dossier on a person by username from a huge number of sites', 16 | long_description=long_description, 17 | long_description_content_type="text/markdown", 18 | url='https://github.com/soxoj/maigret', 19 | install_requires=requires, 20 | entry_points={'console_scripts': ['maigret = maigret.maigret:run']}, 21 | packages=find_packages(exclude=["tests*"]), 22 | include_package_data=True, 23 | author='Soxoj', 24 | author_email='soxoj@protonmail.com', 25 | license='MIT', 26 | zip_safe=False) 27 | -------------------------------------------------------------------------------- /src/maigret/snapcraft.yaml: -------------------------------------------------------------------------------- 1 | name: maigret2 2 | adopt-info: maigret2 3 | summary: SOCMINT / Instagram 4 | description: | 5 | Test Test Test 6 | 7 | license: MIT 8 | 9 | base: core20 10 | grade: stable 11 | confinement: strict 12 | compression: lzo 13 | 14 | architectures: 15 | - build-on: amd64 16 | 17 | apps: 18 | maigret2: 19 | command: bin/maigret 20 | environment: 21 | LC_ALL: C.UTF-8 22 | plugs: 23 | - home 24 
| - network 25 | 26 | parts: 27 | maigret2: 28 | plugin: python 29 | source: https://github.com/soxoj/maigret 30 | source-type: git 31 | 32 | build-packages: 33 | - python3-pip 34 | - python3-six 35 | - python3 36 | 37 | stage-packages: 38 | - python3 39 | - python3-six 40 | 41 | override-pull: | 42 | snapcraftctl pull 43 | snapcraftctl set-version "$(git describe --tags | sed 's/^v//' | cut -d "-" -f1)" 44 | -------------------------------------------------------------------------------- /src/maigret/static/chat_gitter.svg: -------------------------------------------------------------------------------- 1 | chaton gitter -------------------------------------------------------------------------------- /src/maigret/static/maigret.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/maigret/static/maigret.png -------------------------------------------------------------------------------- /src/maigret/static/report_alexaimephotography_html_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/maigret/static/report_alexaimephotography_html_screenshot.png -------------------------------------------------------------------------------- /src/maigret/static/report_alexaimephotography_xmind_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/maigret/static/report_alexaimephotography_xmind_screenshot.png -------------------------------------------------------------------------------- /src/maigret/static/report_alexaimephotographycars.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/maigret/static/report_alexaimephotographycars.pdf -------------------------------------------------------------------------------- /src/maigret/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/maigret/tests/__init__.py -------------------------------------------------------------------------------- /src/maigret/tests/conftest.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import logging 3 | import os 4 | 5 | import pytest 6 | from _pytest.mark import Mark 7 | 8 | from maigret.sites import MaigretDatabase 9 | from maigret.maigret import setup_arguments_parser 10 | from maigret.settings import Settings 11 | 12 | 13 | CUR_PATH = os.path.dirname(os.path.realpath(__file__)) 14 | JSON_FILE = os.path.join(CUR_PATH, '../maigret/resources/data.json') 15 | SETTINGS_FILE = os.path.join(CUR_PATH, '../maigret/resources/settings.json') 16 | TEST_JSON_FILE = os.path.join(CUR_PATH, 'db.json') 17 | LOCAL_TEST_JSON_FILE = os.path.join(CUR_PATH, 'local.json') 18 | empty_mark = Mark('', (), {}) 19 | 20 | 21 | def by_slow_marker(item): 22 | return item.get_closest_marker('slow', default=empty_mark) 23 | 24 | 25 | def pytest_collection_modifyitems(items): 26 | items.sort(key=by_slow_marker, reverse=False) 27 | 28 | 29 | def get_test_reports_filenames(): 30 | return 
glob.glob(os.path.join('report_*'), recursive=False) 31 | 32 | 33 | def remove_test_reports(): 34 | reports_list = get_test_reports_filenames() 35 | for f in reports_list: 36 | os.remove(f) 37 | logging.error(f'Removed test reports {reports_list}') 38 | 39 | 40 | @pytest.fixture(scope='session') 41 | def default_db(): 42 | return MaigretDatabase().load_from_file(JSON_FILE) 43 | 44 | 45 | @pytest.fixture(scope='function') 46 | def test_db(): 47 | return MaigretDatabase().load_from_file(TEST_JSON_FILE) 48 | 49 | 50 | @pytest.fixture(scope='function') 51 | def local_test_db(): 52 | return MaigretDatabase().load_from_file(LOCAL_TEST_JSON_FILE) 53 | 54 | 55 | @pytest.fixture(autouse=True) 56 | def reports_autoclean(): 57 | remove_test_reports() 58 | yield 59 | remove_test_reports() 60 | 61 | 62 | @pytest.fixture(scope='session') 63 | def argparser(): 64 | settings = Settings() 65 | settings.load([SETTINGS_FILE]) 66 | return setup_arguments_parser(settings) 67 | 68 | 69 | @pytest.fixture(scope="session") 70 | def httpserver_listen_address(): 71 | return ("localhost", 8989) 72 | -------------------------------------------------------------------------------- /src/maigret/tests/db.json: -------------------------------------------------------------------------------- 1 | { 2 | "engines": {}, 3 | "sites": { 4 | "GooglePlayStore": { 5 | "tags": ["global", "us"], 6 | "disabled": false, 7 | "checkType": "status_code", 8 | "alexaRank": 1, 9 | "url": "https://play.google.com/store/apps/developer?id={username}", 10 | "urlMain": "https://play.google.com/store", 11 | "usernameClaimed": "Facebook_nosuchname", 12 | "usernameUnclaimed": "noonewouldeverusethis7" 13 | }, 14 | "Reddit": { 15 | "tags": ["news", "social", "us"], 16 | "checkType": "status_code", 17 | "presenseStrs": ["totalKarma"], 18 | "disabled": true, 19 | "alexaRank": 17, 20 | "url": "https://www.reddit.com/user/{username}", 21 | "urlMain": "https://www.reddit.com/", 22 | "usernameClaimed": "blue", 23 | "usernameUnclaimed": "noonewouldeverusethis7" 24 | } 25 | } 26 | } -------------------------------------------------------------------------------- /src/maigret/tests/local.json: -------------------------------------------------------------------------------- 1 | { 2 | "engines": {}, 3 | "sites": { 4 | "StatusCode": { 5 | "checkType": "status_code", 6 | "url": "http://localhost:8989/url?id={username}", 7 | "urlMain": "http://localhost:8989/", 8 | "usernameClaimed": "claimed", 9 | "usernameUnclaimed": "unclaimed" 10 | }, 11 | "Message": { 12 | "checkType": "message", 13 | "url": "http://localhost:8989/url?id={username}", 14 | "urlMain": "http://localhost:8989/", 15 | "presenseStrs": ["user", "profile"], 16 | "absenseStrs": ["not found", "404"], 17 | "usernameClaimed": "claimed", 18 | "usernameUnclaimed": "unclaimed" 19 | } 20 | } 21 | } -------------------------------------------------------------------------------- /src/maigret/tests/test_activation.py: -------------------------------------------------------------------------------- 1 | """Maigret activation test functions""" 2 | import json 3 | 4 | import aiohttp 5 | import pytest 6 | from mock import Mock 7 | 8 | from maigret.activation import ParsingActivator, import_aiohttp_cookies 9 | 10 | COOKIES_TXT = """# HTTP Cookie File downloaded with cookies.txt by Genuinous @genuinous 11 | # This file can be used by wget, curl, aria2c and other standard compliant tools. 
12 | # Usage Examples: 13 | # 1) wget -x --load-cookies cookies.txt "https://xss.is/search/" 14 | # 2) curl --cookie cookies.txt "https://xss.is/search/" 15 | # 3) aria2c --load-cookies cookies.txt "https://xss.is/search/" 16 | # 17 | xss.is FALSE / TRUE 0 xf_csrf test 18 | xss.is FALSE / TRUE 1642709308 xf_user tset 19 | .xss.is TRUE / FALSE 0 muchacho_cache test 20 | .xss.is TRUE / FALSE 1924905600 132_evc test 21 | httpbin.org FALSE / FALSE 0 a b 22 | """ 23 | 24 | 25 | @pytest.mark.skip(reason="periodically fails") 26 | @pytest.mark.slow 27 | def test_twitter_activation(default_db): 28 | twitter_site = default_db.sites_dict['Twitter'] 29 | token1 = twitter_site.headers['x-guest-token'] 30 | 31 | ParsingActivator.twitter(twitter_site, Mock()) 32 | token2 = twitter_site.headers['x-guest-token'] 33 | 34 | assert token1 != token2 35 | 36 | 37 | @pytest.mark.asyncio 38 | async def test_import_aiohttp_cookies(): 39 | cookies_filename = 'cookies_test.txt' 40 | with open(cookies_filename, 'w') as f: 41 | f.write(COOKIES_TXT) 42 | 43 | cookie_jar = import_aiohttp_cookies(cookies_filename) 44 | assert list(cookie_jar._cookies.keys()) == ['xss.is', 'httpbin.org'] 45 | 46 | url = 'https://httpbin.org/cookies' 47 | connector = aiohttp.TCPConnector(ssl=False) 48 | session = aiohttp.ClientSession( 49 | connector=connector, trust_env=True, cookie_jar=cookie_jar 50 | ) 51 | 52 | response = await session.get(url=url) 53 | result = json.loads(await response.content.read()) 54 | await session.close() 55 | 56 | assert result == {'cookies': {'a': 'b'}} 57 | -------------------------------------------------------------------------------- /src/maigret/tests/test_checking.py: -------------------------------------------------------------------------------- 1 | from mock import Mock 2 | import pytest 3 | 4 | from maigret import search 5 | 6 | 7 | def site_result_except(server, username, **kwargs): 8 | query = f'id={username}' 9 | server.expect_request('/url', query_string=query).respond_with_data(**kwargs) 10 | 11 | 12 | @pytest.mark.slow 13 | @pytest.mark.asyncio 14 | async def test_checking_by_status_code(httpserver, local_test_db): 15 | sites_dict = local_test_db.sites_dict 16 | 17 | site_result_except(httpserver, 'claimed', status=200) 18 | site_result_except(httpserver, 'unclaimed', status=404) 19 | 20 | result = await search('claimed', site_dict=sites_dict, logger=Mock()) 21 | assert result['StatusCode']['status'].is_found() is True 22 | 23 | result = await search('unclaimed', site_dict=sites_dict, logger=Mock()) 24 | assert result['StatusCode']['status'].is_found() is False 25 | 26 | 27 | @pytest.mark.slow 28 | @pytest.mark.asyncio 29 | async def test_checking_by_message_positive_full(httpserver, local_test_db): 30 | sites_dict = local_test_db.sites_dict 31 | 32 | site_result_except(httpserver, 'claimed', response_data="user profile") 33 | site_result_except(httpserver, 'unclaimed', response_data="404 not found") 34 | 35 | result = await search('claimed', site_dict=sites_dict, logger=Mock()) 36 | assert result['Message']['status'].is_found() is True 37 | 38 | result = await search('unclaimed', site_dict=sites_dict, logger=Mock()) 39 | assert result['Message']['status'].is_found() is False 40 | 41 | 42 | @pytest.mark.slow 43 | @pytest.mark.asyncio 44 | async def test_checking_by_message_positive_part(httpserver, local_test_db): 45 | sites_dict = local_test_db.sites_dict 46 | 47 | site_result_except(httpserver, 'claimed', response_data="profile") 48 | site_result_except(httpserver, 'unclaimed', 
response_data="404") 49 | 50 | result = await search('claimed', site_dict=sites_dict, logger=Mock()) 51 | assert result['Message']['status'].is_found() is True 52 | 53 | result = await search('unclaimed', site_dict=sites_dict, logger=Mock()) 54 | assert result['Message']['status'].is_found() is False 55 | 56 | 57 | @pytest.mark.slow 58 | @pytest.mark.asyncio 59 | async def test_checking_by_message_negative(httpserver, local_test_db): 60 | sites_dict = local_test_db.sites_dict 61 | 62 | site_result_except(httpserver, 'claimed', response_data="") 63 | site_result_except(httpserver, 'unclaimed', response_data="user 404") 64 | 65 | result = await search('claimed', site_dict=sites_dict, logger=Mock()) 66 | assert result['Message']['status'].is_found() is False 67 | 68 | result = await search('unclaimed', site_dict=sites_dict, logger=Mock()) 69 | assert result['Message']['status'].is_found() is True 70 | -------------------------------------------------------------------------------- /src/maigret/tests/test_cli.py: -------------------------------------------------------------------------------- 1 | """Maigret command-line arguments parsing tests""" 2 | from argparse import Namespace 3 | from typing import Dict, Any 4 | 5 | DEFAULT_ARGS: Dict[str, Any] = { 6 | 'all_sites': False, 7 | 'connections': 100, 8 | 'cookie_file': None, 9 | 'csv': False, 10 | 'db_file': 'resources/data.json', 11 | 'debug': False, 12 | 'disable_extracting': False, 13 | 'disable_recursive_search': False, 14 | 'folderoutput': 'reports', 15 | 'html': False, 16 | 'graph': False, 17 | 'id_type': 'username', 18 | 'ignore_ids_list': [], 19 | 'info': False, 20 | 'json': '', 21 | 'new_site_to_submit': False, 22 | 'no_color': False, 23 | 'no_progressbar': False, 24 | 'parse_url': '', 25 | 'pdf': False, 26 | 'print_check_errors': False, 27 | 'print_not_found': False, 28 | 'proxy': None, 29 | 'reports_sorting': 'default', 30 | 'retries': 1, 31 | 'self_check': False, 32 | 'site_list': [], 33 | 'stats': False, 34 | 'tags': '', 35 | 'timeout': 30, 36 | 'tor_proxy': 'socks5://127.0.0.1:9050', 37 | 'i2p_proxy': 'http://127.0.0.1:4444', 38 | 'top_sites': 500, 39 | 'txt': False, 40 | 'use_disabled_sites': False, 41 | 'username': [], 42 | 'verbose': False, 43 | 'with_domains': False, 44 | 'xmind': False, 45 | } 46 | 47 | 48 | def test_args_search_mode(argparser): 49 | args = argparser.parse_args('username'.split()) 50 | 51 | assert args.username == ['username'] 52 | 53 | want_args = dict(DEFAULT_ARGS) 54 | want_args.update({'username': ['username']}) 55 | 56 | assert args == Namespace(**want_args) 57 | 58 | 59 | def test_args_search_mode_several_usernames(argparser): 60 | args = argparser.parse_args('username1 username2'.split()) 61 | 62 | assert args.username == ['username1', 'username2'] 63 | 64 | want_args = dict(DEFAULT_ARGS) 65 | want_args.update({'username': ['username1', 'username2']}) 66 | 67 | assert args == Namespace(**want_args) 68 | 69 | 70 | def test_args_self_check_mode(argparser): 71 | args = argparser.parse_args('--self-check --site GitHub'.split()) 72 | 73 | want_args = dict(DEFAULT_ARGS) 74 | want_args.update( 75 | { 76 | 'self_check': True, 77 | 'site_list': ['GitHub'], 78 | 'username': [], 79 | } 80 | ) 81 | 82 | assert args == Namespace(**want_args) 83 | 84 | 85 | def test_args_multiple_sites(argparser): 86 | args = argparser.parse_args( 87 | '--site GitHub VK --site PornHub --site Taringa,Steam'.split() 88 | ) 89 | 90 | want_args = dict(DEFAULT_ARGS) 91 | want_args.update( 92 | { 93 | 'site_list': ['GitHub', 'PornHub', 
'Taringa,Steam'], 94 | 'username': ['VK'], 95 | } 96 | ) 97 | 98 | assert args == Namespace(**want_args) 99 | -------------------------------------------------------------------------------- /src/maigret/tests/test_data.py: -------------------------------------------------------------------------------- 1 | """Maigret data test functions""" 2 | 3 | from maigret.utils import is_country_tag 4 | 5 | 6 | def test_tags_validity(default_db): 7 | unknown_tags = set() 8 | 9 | tags = default_db._tags 10 | 11 | for site in default_db.sites: 12 | for tag in filter(lambda x: not is_country_tag(x), site.tags): 13 | if tag not in tags: 14 | unknown_tags.add(tag) 15 | 16 | assert unknown_tags == set() 17 | -------------------------------------------------------------------------------- /src/maigret/tests/test_executors.py: -------------------------------------------------------------------------------- 1 | """Maigret checking logic test functions""" 2 | import pytest 3 | import asyncio 4 | import logging 5 | from maigret.executors import ( 6 | AsyncioSimpleExecutor, 7 | AsyncioProgressbarExecutor, 8 | AsyncioProgressbarSemaphoreExecutor, 9 | AsyncioProgressbarQueueExecutor, 10 | ) 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | async def func(n): 16 | await asyncio.sleep(0.1 * (n % 3)) 17 | return n 18 | 19 | 20 | @pytest.mark.asyncio 21 | async def test_simple_asyncio_executor(): 22 | tasks = [(func, [n], {}) for n in range(10)] 23 | executor = AsyncioSimpleExecutor(logger=logger) 24 | assert await executor.run(tasks) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 25 | assert executor.execution_time > 0.2 26 | assert executor.execution_time < 0.3 27 | 28 | 29 | @pytest.mark.asyncio 30 | async def test_asyncio_progressbar_executor(): 31 | tasks = [(func, [n], {}) for n in range(10)] 32 | 33 | executor = AsyncioProgressbarExecutor(logger=logger) 34 | # no guarantees for the results order 35 | assert sorted(await executor.run(tasks)) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 36 | assert executor.execution_time > 0.2 37 | assert executor.execution_time < 0.3 38 | 39 | 40 | @pytest.mark.asyncio 41 | async def test_asyncio_progressbar_semaphore_executor(): 42 | tasks = [(func, [n], {}) for n in range(10)] 43 | 44 | executor = AsyncioProgressbarSemaphoreExecutor(logger=logger, in_parallel=5) 45 | # no guarantees for the results order 46 | assert sorted(await executor.run(tasks)) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 47 | assert executor.execution_time > 0.2 48 | assert executor.execution_time < 0.4 49 | 50 | 51 | @pytest.mark.asyncio 52 | async def test_asyncio_progressbar_queue_executor(): 53 | tasks = [(func, [n], {}) for n in range(10)] 54 | 55 | executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=2) 56 | assert await executor.run(tasks) == [0, 1, 3, 2, 4, 6, 7, 5, 9, 8] 57 | assert executor.execution_time > 0.5 58 | assert executor.execution_time < 0.6 59 | 60 | executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=3) 61 | assert await executor.run(tasks) == [0, 3, 1, 4, 6, 2, 7, 9, 5, 8] 62 | assert executor.execution_time > 0.4 63 | assert executor.execution_time < 0.5 64 | 65 | executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=5) 66 | assert await executor.run(tasks) in ( 67 | [0, 3, 6, 1, 4, 7, 9, 2, 5, 8], 68 | [0, 3, 6, 1, 4, 9, 7, 2, 5, 8], 69 | ) 70 | assert executor.execution_time > 0.3 71 | assert executor.execution_time < 0.4 72 | 73 | executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=10) 74 | assert await executor.run(tasks) == [0, 
3, 6, 9, 1, 4, 7, 2, 5, 8] 75 | assert executor.execution_time > 0.2 76 | assert executor.execution_time < 0.3 77 | -------------------------------------------------------------------------------- /src/maigret/tests/test_notify.py: -------------------------------------------------------------------------------- 1 | from maigret.errors import CheckError 2 | from maigret.notify import QueryNotifyPrint 3 | from maigret.result import QueryStatus, QueryResult 4 | 5 | 6 | def test_notify_illegal(): 7 | n = QueryNotifyPrint(color=False) 8 | 9 | assert ( 10 | n.update( 11 | QueryResult( 12 | username="test", 13 | status=QueryStatus.ILLEGAL, 14 | site_name="TEST_SITE", 15 | site_url_user="http://example.com/test", 16 | ) 17 | ) 18 | == "[-] TEST_SITE: Illegal Username Format For This Site!" 19 | ) 20 | 21 | 22 | def test_notify_claimed(): 23 | n = QueryNotifyPrint(color=False) 24 | 25 | assert ( 26 | n.update( 27 | QueryResult( 28 | username="test", 29 | status=QueryStatus.CLAIMED, 30 | site_name="TEST_SITE", 31 | site_url_user="http://example.com/test", 32 | ) 33 | ) 34 | == "[+] TEST_SITE: http://example.com/test" 35 | ) 36 | 37 | 38 | def test_notify_available(): 39 | n = QueryNotifyPrint(color=False) 40 | 41 | assert ( 42 | n.update( 43 | QueryResult( 44 | username="test", 45 | status=QueryStatus.AVAILABLE, 46 | site_name="TEST_SITE", 47 | site_url_user="http://example.com/test", 48 | ) 49 | ) 50 | == "[-] TEST_SITE: Not found!" 51 | ) 52 | 53 | 54 | def test_notify_unknown(): 55 | n = QueryNotifyPrint(color=False) 56 | result = QueryResult( 57 | username="test", 58 | status=QueryStatus.UNKNOWN, 59 | site_name="TEST_SITE", 60 | site_url_user="http://example.com/test", 61 | ) 62 | result.error = CheckError('Type', 'Reason') 63 | 64 | assert n.update(result) == "[?] 
TEST_SITE: Type error: Reason" 65 | -------------------------------------------------------------------------------- /src/maigret/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/maigret/utils/__init__.py -------------------------------------------------------------------------------- /src/maigret/utils/add_tags.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import random 3 | from argparse import ArgumentParser, RawDescriptionHelpFormatter 4 | 5 | from maigret.maigret import MaigretDatabase 6 | from maigret.submit import Submitter 7 | 8 | 9 | def update_tags(site): 10 | tags = [] 11 | if not site.tags: 12 | print(f'Site {site.name} doesn\'t have tags') 13 | else: 14 | tags = site.tags 15 | print(f'Site {site.name} tags: ' + ', '.join(tags)) 16 | 17 | print(f'URL: {site.url_main}') 18 | 19 | new_tags = set(input('Enter new tags: ').split(', ')) 20 | if "disabled" in new_tags: 21 | new_tags.remove("disabled") 22 | site.disabled = True 23 | 24 | print(f'Old alexa rank: {site.alexa_rank}') 25 | rank = Submitter.get_alexa_rank(site.url_main) 26 | if rank: 27 | print(f'New alexa rank: {rank}') 28 | site.alexa_rank = rank 29 | 30 | site.tags = [x for x in list(new_tags) if x] 31 | 32 | 33 | if __name__ == '__main__': 34 | parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter 35 | ) 36 | parser.add_argument("--base","-b", metavar="BASE_FILE", 37 | dest="base_file", default="maigret/resources/data.json", 38 | help="JSON file with sites data to update.") 39 | parser.add_argument("--name", help="Name of site to check") 40 | 41 | pool = list() 42 | 43 | args = parser.parse_args() 44 | 45 | db = MaigretDatabase() 46 | db.load_from_file(args.base_file).sites 47 | 48 | while True: 49 | if args.name: 50 | sites = list(db.ranked_sites_dict(names=[args.name]).values()) 51 | site = random.choice(sites) 52 | else: 53 | site = random.choice(db.sites) 54 | 55 | if site.engine == 'uCoz': 56 | continue 57 | 58 | # if not 'in' in site.tags: 59 | # continue 60 | 61 | update_tags(site) 62 | 63 | db.save_to_file(args.base_file) -------------------------------------------------------------------------------- /src/maigret/utils/sites_diff.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import difflib 3 | import requests 4 | 5 | 6 | a = requests.get(sys.argv[1]).text 7 | b = requests.get(sys.argv[2]).text 8 | 9 | 10 | tokens_a = set(a.split('"')) 11 | tokens_b = set(b.split('"')) 12 | 13 | a_minus_b = tokens_a.difference(tokens_b) 14 | b_minus_a = tokens_b.difference(tokens_a) 15 | 16 | print(a_minus_b) 17 | print(b_minus_a) 18 | 19 | print(len(a_minus_b)) 20 | print(len(b_minus_a)) 21 | 22 | desired_strings = ["username", "not found", "пользователь", "profile", "lastname", "firstname", "biography", 23 | "birthday", "репутация", "информация", "e-mail"] 24 | 25 | 26 | def get_match_ratio(x): 27 | return round(max([ 28 | difflib.SequenceMatcher(a=x.lower(), b=y).ratio() 29 | for y in desired_strings 30 | ]), 2) 31 | 32 | 33 | RATIO = 0.6 34 | 35 | print(sorted(a_minus_b, key=get_match_ratio, reverse=True)[:10]) 36 | print(sorted(b_minus_a, key=get_match_ratio, reverse=True)[:10]) -------------------------------------------------------------------------------- /src/maigret/wizard.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import asyncio 3 | import logging 4 | import maigret 5 | 6 | 7 | # top popular sites from the Maigret database 8 | TOP_SITES_COUNT = 300 9 | # Maigret HTTP requests timeout 10 | TIMEOUT = 10 11 | # max parallel requests 12 | MAX_CONNECTIONS = 50 13 | 14 | 15 | if __name__ == '__main__': 16 | # setup logging and asyncio 17 | logger = logging.getLogger('maigret') 18 | logger.setLevel(logging.WARNING) 19 | loop = asyncio.get_event_loop() 20 | 21 | # setup Maigret 22 | db = maigret.MaigretDatabase().load_from_file('./maigret/resources/data.json') 23 | # also can be downloaded from web 24 | # db = MaigretDatabase().load_from_url(MAIGRET_DB_URL) 25 | 26 | # user input 27 | username = input('Enter username to search: ') 28 | 29 | sites_count_raw = input( 30 | f'Select the number of sites to search ({TOP_SITES_COUNT} for default, {len(db.sites_dict)} max): ' 31 | ) 32 | sites_count = int(sites_count_raw) or TOP_SITES_COUNT 33 | 34 | sites = db.ranked_sites_dict(top=sites_count) 35 | 36 | show_progressbar_raw = input('Do you want to show a progressbar? [Yn] ') 37 | show_progressbar = show_progressbar_raw.lower() != 'n' 38 | 39 | extract_info_raw = input( 40 | 'Do you want to extract additional info from accounts\' pages? [Yn] ' 41 | ) 42 | extract_info = extract_info_raw.lower() != 'n' 43 | 44 | use_notifier_raw = input( 45 | 'Do you want to use notifier for displaying results while searching? [Yn] ' 46 | ) 47 | use_notifier = use_notifier_raw.lower() != 'n' 48 | 49 | notifier = None 50 | if use_notifier: 51 | notifier = maigret.Notifier(print_found_only=True, skip_check_errors=True) 52 | 53 | # search! 54 | search_func = maigret.search( 55 | username=username, 56 | site_dict=sites, 57 | timeout=TIMEOUT, 58 | logger=logger, 59 | max_connections=MAX_CONNECTIONS, 60 | query_notify=notifier, 61 | no_progressbar=(not show_progressbar), 62 | is_parsing_enabled=extract_info, 63 | ) 64 | 65 | results = loop.run_until_complete(search_func) 66 | 67 | input('Search completed. Press any key to show results.') 68 | 69 | for sitename, data in results.items(): 70 | is_found = data['status'].is_found() 71 | print(f'{sitename} - {"Found!" 
if is_found else "Not found"}') 72 | -------------------------------------------------------------------------------- /src/recopilacion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/recopilacion/__init__.py -------------------------------------------------------------------------------- /src/recopilacion/extraccion.py: -------------------------------------------------------------------------------- 1 | def procesar_resultados(): 2 | pass -------------------------------------------------------------------------------- /src/riesgos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/riesgos/__init__.py -------------------------------------------------------------------------------- /src/sherlock/.dockerignore: -------------------------------------------------------------------------------- 1 | .git/ 2 | .vscode/ 3 | screenshot/ 4 | tests/ 5 | *.txt 6 | !/requirements.txt 7 | venv/ 8 | 9 | -------------------------------------------------------------------------------- /src/sherlock/.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 2 6 | end_of_line = lf 7 | charset = utf-8 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | curly_bracket_next_line = false 11 | spaces_around_operators = true 12 | 13 | [*.{markdown,md}] 14 | trim_trailing_whitespace = false 15 | 16 | [*.py] 17 | indent_size = 4 18 | quote_type = double 19 | -------------------------------------------------------------------------------- /src/sherlock/.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Report a bug in Sherlock's functionality 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | 18 | 19 | 20 | ## Checklist 21 | 25 | 26 | - [ ] I'm reporting a bug in Sherlock's functionality 27 | - [ ] The bug I'm reporting is not a false positive or a false negative 28 | - [ ] I've verified that I'm running the latest version of Sherlock 29 | - [ ] I've checked for similar bug reports including closed ones 30 | - [ ] I've checked for pull requests that attempt to fix this bug 31 | 32 | ## Description 33 | 37 | 38 | WRITE DESCRIPTION HERE 39 | -------------------------------------------------------------------------------- /src/sherlock/.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Request a new functionality for Sherlock 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | 18 | 19 | ## Checklist 20 | 24 | - [ ] I'm reporting a feature request 25 | - [ ] I've checked for similar feature requests including closed ones 26 | 27 | ## Description 28 | 31 | 32 | WRITE DESCRIPTION HERE 33 | -------------------------------------------------------------------------------- /src/sherlock/.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask us a question 4 | title: '' 5 | labels: question 6 | assignees: '' 7 | 8 | --- 9 | 10 | 18 | 19 | ## 
Checklist 20 | 24 | - [ ] I'm asking a question regarding Sherlock 25 | - [ ] My question is not a tech support question. 26 | 27 | **We are not your tech support**. 28 | If you have questions related to `pip`, `git`, or something that is not related to Sherlock, please ask them on [Stack Overflow](https://stackoverflow.com/) or [r/learnpython](https://www.reddit.com/r/learnpython/) 29 | 30 | 31 | ## Question 32 | 33 | ASK YOUR QUESTION HERE 34 | -------------------------------------------------------------------------------- /src/sherlock/.github/ISSUE_TEMPLATE/reporting-false-negative.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Reporting false negative 3 | about: Reporting a site that is returning false positives 4 | title: '' 5 | labels: false negative 6 | assignees: '' 7 | 8 | --- 9 | 10 | 18 | 19 | ## Checklist 20 | 24 | - [ ] I'm reporting a website that is returning **false negative** results 25 | - [ ] I've checked for similar site support requests including closed ones 26 | - [ ] I've checked for pull requests attempting to fix this false negative 27 | - [ ] I'm only reporting **one** site (create a separate issue for each site) 28 | 29 | ## Description 30 | 33 | 34 | WRITE DESCRIPTION HERE 35 | -------------------------------------------------------------------------------- /src/sherlock/.github/ISSUE_TEMPLATE/reporting-false-positive.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Reporting false positive 3 | about: Reporting a site that is returning false positives 4 | title: '' 5 | labels: false positive 6 | assignees: '' 7 | 8 | --- 9 | 10 | 18 | 19 | ## Checklist 20 | 24 | - [ ] I'm reporting a website that is returning **false positive** results 25 | - [ ] I've checked for similar site support requests including closed ones 26 | - [ ] I've checked for pull requests attempting to fix this false positive 27 | - [ ] I'm only reporting **one** site (create a separate issue for each site) 28 | 29 | ## Description 30 | 33 | 34 | WRITE DESCRIPTION HERE 35 | -------------------------------------------------------------------------------- /src/sherlock/.github/ISSUE_TEMPLATE/site-support-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Site support request 3 | about: Request support for a new site 4 | title: '' 5 | labels: site support request 6 | assignees: '' 7 | 8 | --- 9 | 10 | 18 | 19 | ## Checklist 20 | 24 | 25 | - [ ] I'm requesting support for a new site 26 | - [ ] I've checked for similar site support requests including closed ones 27 | - [ ] I've checked that the site I am requesting has not been removed in the past and is not documented in [removed_sites.md](https://github.com/sherlock-project/sherlock/blob/master/removed_sites.md) 28 | - [ ] The site I am requesting support for is not a pornographic website 29 | - [ ] I'm only requesting support of **one** website (create a separate issue for each site) 30 | 31 | ## Description 32 | 36 | 37 | URL: 38 | -------------------------------------------------------------------------------- /src/sherlock/.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | strategy: 11 | matrix: 12 | python-version: [3.7, 3.8, 3.9, "3.10", 3.11] 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | - 
name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v4 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Install Dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install ruff flake8 pytest 24 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 25 | - name: Lint with ruff 26 | run: | 27 | # stop the build if there are Python syntax errors or undefined names 28 | ruff . --format=github --select=E9,F63,F7,F82 29 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 30 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 31 | - name: Sherlock Site Detect Tests 32 | run: | 33 | cd sherlock && python -m unittest tests.all.SherlockDetectTests --verbose 34 | -------------------------------------------------------------------------------- /src/sherlock/.github/workflows/nightly.yml: -------------------------------------------------------------------------------- 1 | name: Nightly 2 | 3 | on: 4 | schedule: 5 | # Run Nightly Tests At 3AM (The Hour Of The Wolf) Every Day 6 | - cron: '0 3 * * *' 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | python-version: [3.x] 14 | 15 | steps: 16 | - uses: actions/checkout@v3 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v4 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - name: Install Dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 25 | - name: Sherlock Site Coverage Tests 26 | run: | 27 | cd sherlock && python -m unittest tests.all.SherlockSiteCoverageTests --verbose 28 | -------------------------------------------------------------------------------- /src/sherlock/.github/workflows/pull_request.yml: -------------------------------------------------------------------------------- 1 | name: Pull Request Action 2 | 3 | on: 4 | pull_request: 5 | branches: [ master ] 6 | 7 | jobs: 8 | getchange: 9 | runs-on: ubuntu-latest 10 | outputs: 11 | matrix: ${{ steps.changes.outputs.matrix }} 12 | steps: 13 | - id: changes 14 | run: | 15 | URL="https://api.github.com/repos/sherlock-project/sherlock/pulls/${{ github.event.pull_request.number }}/files" 16 | FILES=$(curl -s -X GET -G $URL | jq -r '.[] | .filename') 17 | if echo $FILES | grep -q ".json"; then 18 | echo "::set-output name=matrix::{\"include\":[{\"python\":\"3.x\"}]}" 19 | else 20 | echo "::set-output name=matrix::{\"include\":[{\"python\":\"3.7\"},{\"python\":\"3.8\"}]},{\"python\":\"3.9\"},{\"python\":\"3.10\"}]},{\"python\":\"3.11\"}]}" 21 | fi 22 | build: 23 | needs: [getchange] 24 | runs-on: ubuntu-latest 25 | strategy: 26 | matrix: ${{ fromJson(needs.getchange.outputs.matrix) }} 27 | 28 | steps: 29 | - uses: actions/checkout@v3 30 | - name: Set up Python ${{ matrix.python }} 31 | uses: actions/setup-python@v4 32 | with: 33 | python-version: ${{ matrix.python }} 34 | - name: Install Dependencies 35 | run: | 36 | python -m pip install --upgrade pip 37 | pip install flake8 pytest 38 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 39 | - name: Lint With flake8 40 | run: | 41 | # stop the build if there are Python syntax errors or undefined names 42 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 43 | 44 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 45 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 46 | - name: Sherlock Site Detect Tests 47 | run: | 48 | cd sherlock && python -m unittest tests.all.SherlockDetectTests --verbose 49 | -------------------------------------------------------------------------------- /src/sherlock/.github/workflows/update-site-list.yml: -------------------------------------------------------------------------------- 1 | name: Update Site List 2 | 3 | # Trigger the workflow when changes are pushed to the main branch 4 | # and the changes include the sherlock/resources/data.json file 5 | on: 6 | push: 7 | branches: 8 | - master 9 | paths: 10 | - sherlock/resources/data.json 11 | 12 | jobs: 13 | sync-json-data: 14 | # Use the latest version of Ubuntu as the runner environment 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | # Check out the code at the specified pull request head commit 19 | - name: Checkout code 20 | uses: actions/checkout@v3 21 | with: 22 | ref: ${{ github.event.pull_request.head.sha }} 23 | fetch-depth: 0 24 | 25 | # Install Python 3 26 | - name: Install Python 27 | uses: actions/setup-python@v4 28 | with: 29 | python-version: '3.x' 30 | 31 | # Execute the site_list.py Python script 32 | - name: Execute site_list.py 33 | run: python site_list.py 34 | 35 | # Commit any changes made by the script 36 | - name: Commit files 37 | run: | 38 | git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com" 39 | git config --local user.name "github-actions[bot]" 40 | if ! git diff --exit-code; then 41 | git commit -a -m "Updated Site List" 42 | fi 43 | 44 | # Push the changes to the remote repository 45 | - name: Push changes 46 | uses: ad-m/github-push-action@master 47 | with: 48 | github_token: ${{ secrets.GITHUB_TOKEN }} 49 | branch: ${{ github.ref }} 50 | -------------------------------------------------------------------------------- /src/sherlock/.gitignore: -------------------------------------------------------------------------------- 1 | # Virtual Environment 2 | venv/ 3 | 4 | # Editor Configurations 5 | .vscode/ 6 | .idea/ 7 | 8 | # Python 9 | __pycache__/ 10 | 11 | # Pip 12 | src/ 13 | 14 | # Jupyter Notebook 15 | .ipynb_checkpoints 16 | *.ipynb 17 | 18 | # Output files, except requirements.txt 19 | *.txt 20 | !requirements.txt 21 | 22 | # Comma-Separated Values (CSV) Reports 23 | *.csv 24 | 25 | #XLSX Reports 26 | *.xlsx 27 | 28 | # Excluded sites list 29 | tests/.excluded_sites 30 | 31 | # MacOS Folder Metadata File 32 | .DS_Store 33 | 34 | # Vim swap files 35 | *.swp 36 | -------------------------------------------------------------------------------- /src/sherlock/.replit: -------------------------------------------------------------------------------- 1 | language = "python3" 2 | run = "" 3 | -------------------------------------------------------------------------------- /src/sherlock/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How To Contribute To Sherlock 2 | First off, thank you for the help! 3 | 4 | There are many ways to contribute. Here is some high level grouping. 5 | 6 | ## Adding New Sites 7 | 8 | Please look at the Wiki entry on 9 | [adding new sites](https://github.com/sherlock-project/sherlock/wiki/Adding-Sites-To-Sherlock) 10 | to understand the issues. 11 | 12 | Any new sites that are added need to have a username that has been claimed, and one 13 | that is unclaimed documented in the site data. 
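For illustration, a new site entry documents both usernames roughly like this (the `ExampleSite` entry and all of its values are hypothetical; mirror the exact fields used by existing entries in `sherlock/resources/data.json`):

```json
"ExampleSite": {
  "errorType": "status_code",
  "url": "https://example.com/{}",
  "urlMain": "https://example.com/",
  "username_claimed": "blue",
  "username_unclaimed": "noonewouldeverusethis7"
}
```

Here `username_claimed` must point at a real, existing account, and `username_unclaimed` must be a name known to be free.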
This allows the regression tests 14 | to ensure that everything is working. 15 | 16 | It is required that a contributor test any new sites by either running the full tests, or running 17 | a site-specific query against the claimed and unclaimed usernames. 18 | 19 | It is not required that a contributor run the 20 | [site_list.py](https://github.com/sherlock-project/sherlock/blob/master/site_list.py) 21 | script. 22 | 23 | If there are performance problems with a site (e.g. slow to respond, unreliable uptime, ...), then 24 | the site may be removed from the list. The 25 | [removed_sites.md](https://github.com/sherlock-project/sherlock/blob/master/removed_sites.md) 26 | file contains sites that were included at one time in Sherlock, but had to be removed for 27 | one reason or another. 28 | 29 | ## Adding New Functionality 30 | 31 | Please ensure that the content on your branch passes all tests before submitting a pull request. 32 | -------------------------------------------------------------------------------- /src/sherlock/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim-bullseye as build 2 | WORKDIR /wheels 3 | 4 | COPY requirements.txt /opt/sherlock/ 5 | RUN apt-get update \ 6 | && apt-get install -y build-essential \ 7 | && pip3 wheel -r /opt/sherlock/requirements.txt 8 | 9 | FROM python:3.11-slim-bullseye 10 | WORKDIR /opt/sherlock 11 | 12 | ARG VCS_REF 13 | ARG VCS_URL="https://github.com/sherlock-project/sherlock" 14 | 15 | LABEL org.label-schema.vcs-ref=$VCS_REF \ 16 | org.label-schema.vcs-url=$VCS_URL 17 | 18 | COPY --from=build /wheels /wheels 19 | COPY . /opt/sherlock/ 20 | 21 | RUN pip3 install --no-cache-dir -r requirements.txt -f /wheels \ 22 | && rm -rf /wheels 23 | 24 | WORKDIR /opt/sherlock/sherlock 25 | 26 | ENTRYPOINT ["python", "sherlock.py"] 27 | -------------------------------------------------------------------------------- /src/sherlock/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Sherlock Project 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/sherlock/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | 3 | services: 4 | sherlock: 5 | build: . 
6 | volumes: 7 | - "./results:/opt/sherlock/results" 8 | -------------------------------------------------------------------------------- /src/sherlock/images/preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/sherlock/images/preview.png -------------------------------------------------------------------------------- /src/sherlock/requirements.txt: -------------------------------------------------------------------------------- 1 | certifi>=2019.6.16 2 | colorama>=0.4.1 3 | PySocks>=1.7.0 4 | requests>=2.22.0 5 | requests-futures>=1.0.0 6 | stem>=1.8.0 7 | torrequest>=0.1.0 8 | pandas>=1.0.0 9 | openpyxl<=3.0.10 10 | exrex>=0.11.0 -------------------------------------------------------------------------------- /src/sherlock/sherlock/__init__.py: -------------------------------------------------------------------------------- 1 | """ Sherlock Module 2 | 3 | This module contains the main logic to search for usernames at social 4 | networks. 5 | 6 | """ 7 | -------------------------------------------------------------------------------- /src/sherlock/sherlock/__main__.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | """ 4 | Sherlock: Find Usernames Across Social Networks Module 5 | 6 | This module contains the main logic to search for usernames at social 7 | networks. 8 | """ 9 | 10 | import sys 11 | 12 | 13 | if __name__ == "__main__": 14 | # Check if the user is using the correct version of Python 15 | python_version = sys.version.split()[0] 16 | 17 | if sys.version_info < (3, 6): 18 | print("Sherlock requires Python 3.6+\nYou are using Python %s, which is not supported by Sherlock" % (python_version)) 19 | sys.exit(1) 20 | 21 | import sherlock 22 | sherlock.main() 23 | -------------------------------------------------------------------------------- /src/sherlock/sherlock/result.py: -------------------------------------------------------------------------------- 1 | """Sherlock Result Module 2 | 3 | This module defines various objects for recording the results of queries. 4 | """ 5 | from enum import Enum 6 | 7 | 8 | class QueryStatus(Enum): 9 | """Query Status Enumeration. 10 | 11 | Describes status of query about a given username. 12 | """ 13 | CLAIMED = "Claimed" # Username Detected 14 | AVAILABLE = "Available" # Username Not Detected 15 | UNKNOWN = "Unknown" # Error Occurred While Trying To Detect Username 16 | ILLEGAL = "Illegal" # Username Not Allowable For This Site 17 | 18 | def __str__(self): 19 | """Convert Object To String. 20 | 21 | Keyword Arguments: 22 | self -- This object. 23 | 24 | Return Value: 25 | Nicely formatted string to get information about this object. 26 | """ 27 | return self.value 28 | 29 | class QueryResult(): 30 | """Query Result Object. 31 | 32 | Describes result of query about a given username. 33 | """ 34 | def __init__(self, username, site_name, site_url_user, status, 35 | query_time=None, context=None): 36 | """Create Query Result Object. 37 | 38 | Contains information about a specific method of detecting usernames on 39 | a given type of web sites. 40 | 41 | Keyword Arguments: 42 | self -- This object. 43 | username -- String indicating username that query result 44 | was about. 45 | site_name -- String which identifies site. 46 | site_url_user -- String containing URL for username on site. 
47 | NOTE: The site may or may not exist: this 48 | just indicates what the name would 49 | be, if it existed. 50 | status -- Enumeration of type QueryStatus() indicating 51 | the status of the query. 52 | query_time -- Time (in seconds) required to perform query. 53 | Default of None. 54 | context -- String indicating any additional context 55 | about the query. For example, if there was 56 | an error, this might indicate the type of 57 | error that occurred. 58 | Default of None. 59 | 60 | Return Value: 61 | Nothing. 62 | """ 63 | 64 | self.username = username 65 | self.site_name = site_name 66 | self.site_url_user = site_url_user 67 | self.status = status 68 | self.query_time = query_time 69 | self.context = context 70 | 71 | return 72 | 73 | def __str__(self): 74 | """Convert Object To String. 75 | 76 | Keyword Arguments: 77 | self -- This object. 78 | 79 | Return Value: 80 | Nicely formatted string to get information about this object. 81 | """ 82 | status = str(self.status) 83 | if self.context is not None: 84 | # There is extra context information available about the results. 85 | # Append it to the normal response text. 86 | status += f" ({self.context})" 87 | 88 | return status 89 | -------------------------------------------------------------------------------- /src/sherlock/sherlock/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Sherlock Tests 2 | 3 | This package contains various submodules used to run tests. 4 | """ 5 | -------------------------------------------------------------------------------- /src/sherlock/sherlock/tests/test_multiple_usernames.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | sys.path.append('../') 4 | import sherlock as sh 5 | 6 | checksymbols = ["_", "-", "."] 7 | 8 | """Test for multiple usernames. 9 | 10 | This test ensures that the function MultipleUsernames works properly. More specifically, 11 | different scenarios are tested, and only usernames that contain the specific sequence {?} 12 | should return positive. 13 | """ 14 | class TestMultipleUsernames(unittest.TestCase): 15 | def test_area(self): 16 | test_usernames = ["test{?}test", "test{?feo", "test"] 17 | for name in test_usernames: 18 | if sh.CheckForParameter(name): 19 | self.assertEqual(sh.MultipleUsernames(name), ["test_test", "test-test", "test.test"]) 20 | else: 21 | self.assertEqual(name, name) -------------------------------------------------------------------------------- /src/sherlock/site_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # This module generates the listing of supported sites which can be found in 3 | # sites.md.
It also organizes all the sites in alphanumeric order 4 | import json 5 | 6 | # Read the data.json file 7 | with open("sherlock/resources/data.json", "r", encoding="utf-8") as data_file: 8 | data = json.load(data_file) 9 | 10 | # Sort the social networks in alphanumeric order 11 | social_networks = sorted(data.items()) 12 | 13 | # Write the list of supported sites to sites.md 14 | with open("sites.md", "w") as site_file: 15 | site_file.write(f"## List Of Supported Sites ({len(social_networks)} Sites In Total!)\n") 16 | for social_network, info in social_networks: 17 | url_main = info["urlMain"] 18 | is_nsfw = "**(NSFW)**" if info.get("isNSFW") else "" 19 | site_file.write(f"1. ![](https://www.google.com/s2/favicons?domain={url_main}) [{social_network}]({url_main}) {is_nsfw}\n") 20 | 21 | # Overwrite the data.json file with sorted data 22 | with open("sherlock/resources/data.json", "w") as data_file: 23 | sorted_data = json.dumps(data, indent=2, sort_keys=True) 24 | data_file.write(sorted_data) 25 | data_file.write("\n") 26 | 27 | print("Finished updating supported site listing!") 28 | -------------------------------------------------------------------------------- /src/theHarvester/.dockerignore: -------------------------------------------------------------------------------- 1 | .github/* 2 | .gitattributes 3 | .idea/ 4 | .lgtm.yml 5 | mypy.ini 6 | .pytest_cache 7 | .mypy_cache 8 | tests/* 9 | README/ 10 | bin/ -------------------------------------------------------------------------------- /src/theHarvester/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E501, F405, F403, F401, E402, W503 -------------------------------------------------------------------------------- /src/theHarvester/.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # #1492 run `black .` and `isort .` 2 | c13843ec0d513ac7f9c35b7bd0501fa46e356415 -------------------------------------------------------------------------------- /src/theHarvester/.gitattributes: -------------------------------------------------------------------------------- 1 | # Set the default behavior, which is to have git automatically determine 2 | # whether a file is a text or binary, unless otherwise specified. 3 | 4 | * text=auto 5 | 6 | # Basic .gitattributes for a python repo. 7 | 8 | # Source files 9 | # ============ 10 | *.pxd text diff=python 11 | *.py text diff=python 12 | *.py3 text diff=python 13 | *.pyw text diff=python 14 | *.pyx text diff=python 15 | 16 | # Binary files 17 | # ============ 18 | *.db binary 19 | *.p binary 20 | *.pkl binary 21 | *.pyc binary 22 | *.pyd binary 23 | *.pyo binary 24 | 25 | # Note: .db, .p, and .pkl files are associated with the python modules 26 | # ``pickle``, ``dbm.*``, # ``shelve``, ``marshal``, ``anydbm``, & ``bsddb`` 27 | # (among others). 
28 | -------------------------------------------------------------------------------- /src/theHarvester/.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [L1ghtn1ng, NotoriousRebel] 4 | open_collective: # Replace with a single Open Collective username 5 | ko_fi: # 6 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 7 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 8 | liberapay: # Replace with a single Liberapay username 9 | issuehunt: # Replace with a single IssueHunt username 10 | otechie: # Replace with a single Otechie username 11 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 12 | -------------------------------------------------------------------------------- /src/theHarvester/.github/ISSUE_TEMPLATE/issue-template.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Issue Template 3 | about: A template for new issues. 4 | title: "[Bug|Feature Request|Other] Short Description of Issue" 5 | labels: '' 6 | 7 | --- 8 | 9 | ## Note: we do not support installing theHarvester on Android 10 | 11 | **Feature Request, Bug, or Other** 12 | Feature Request | Bug | Other 13 | 14 | **Describe the feature request, bug, or other request** 15 | A clear and concise description of what the bug, feature request, 16 | or other request is. 17 | 18 | **To Reproduce** 19 | Steps to reproduce the behaviour: 20 | 1. Run the tool like this: '...' 21 | 2. See the error 22 | 23 | **Expected behaviour** 24 | A clear and concise description of what you expected to happen. 25 | 26 | **Screenshots** 27 | If possible, please add screenshots to help explain your problem. 28 | 29 | **System Information (System that tool is running on):** 30 | - OS: [e.g. Windows 10] 31 | - Version [e.g. 2.7] 32 | 33 | **Additional context** 34 | Add any other context about the problem here. 35 | -------------------------------------------------------------------------------- /src/theHarvester/.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: github-actions 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | timezone: Europe/London 8 | - package-ecosystem: pip 9 | directory: "/" 10 | schedule: 11 | interval: daily 12 | timezone: Europe/London 13 | open-pull-requests-limit: 10 14 | target-branch: master 15 | allow: 16 | - dependency-type: direct 17 | - dependency-type: indirect 18 | -------------------------------------------------------------------------------- /src/theHarvester/.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages.
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master, dev ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master, dev ] 20 | schedule: 21 | - cron: '19 11 * * 4' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | 28 | strategy: 29 | fail-fast: false 30 | matrix: 31 | language: [ 'python' ] 32 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python' ] 33 | # Learn more: 34 | # https://docs.github.com/en/free-pro-team@latest/github/finding-security-vulnerabilities-and-errors-in-your-code/configuring-code-scanning#changing-the-languages-that-are-analyzed 35 | 36 | steps: 37 | - name: Checkout repository 38 | uses: actions/checkout@v4 39 | 40 | # Initializes the CodeQL tools for scanning. 41 | - name: Initialize CodeQL 42 | uses: github/codeql-action/init@v3 43 | with: 44 | languages: ${{ matrix.language }} 45 | # If you wish to specify custom queries, you can do so here or in a config file. 46 | # By default, queries listed here will override any specified in a config file. 47 | # Prefix the list here with "+" to use these queries and those in the config file. 48 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 49 | 50 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 51 | # If this step fails, then you should remove it and run the build manually (see below) 52 | - name: Autobuild 53 | uses: github/codeql-action/autobuild@v3 54 | 55 | # ℹ️ Command-line programs to run using the OS shell. 56 | # 📚 https://git.io/JvXDl 57 | 58 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 59 | # and modify them (or add more) to build your code if your project 60 | # uses a compiled language 61 | 62 | #- run: | 63 | # make bootstrap 64 | # make release 65 | 66 | - name: Perform CodeQL Analysis 67 | uses: github/codeql-action/analyze@v3 68 | -------------------------------------------------------------------------------- /src/theHarvester/.github/workflows/dockerci.yml: -------------------------------------------------------------------------------- 1 | name: TheHarvester Docker Image CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v4 10 | - name: Build the Docker image 11 | run: docker build . --file Dockerfile --tag theharvester:$(date +%s) -------------------------------------------------------------------------------- /src/theHarvester/.github/workflows/theHarvester.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: TheHarvester Python CI 3 | 4 | on: 5 | push: 6 | branches: 7 | - '*' 8 | 9 | pull_request: 10 | branches: 11 | - '*' 12 | 13 | jobs: 14 | Python: 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | max-parallel: 8 18 | matrix: 19 | os: [ ubuntu-latest, macos-latest ] 20 | python-version: [ 3.10.12, 3.11 ] 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | - name: Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install dependencies 29 | run: | 30 | pip install --upgrade pip 31 | pip install .[dev] 32 | 33 | - name: Lint with black 34 | run: | 35 | black . --diff --check 36 | 37 | - name: Lint with isort 38 | run: | 39 | isort . 
--diff --check 40 | 41 | - name: Lint with flake8 42 | run: | 43 | # stop the build if there are Python syntax errors or undefined names 44 | flake8 . --count --show-source --statistics --config .flake8 45 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 46 | flake8 . --count --exit-zero --max-line-length=127 --statistics --config .flake8 47 | 48 | - name: Test with pytest 49 | run: | 50 | pytest 51 | 52 | - name: Run theHarvester module Anubis 53 | run: | 54 | theHarvester -d apple.com -b anubis 55 | 56 | - name: Run theHarvester module Baidu 57 | run: | 58 | theHarvester -d yale.edu -b baidu 59 | 60 | - name: Run theHarvester module Bing 61 | run: | 62 | theHarvester -d yale.edu -b bing 63 | 64 | - name: Run theHarvester module CertSpotter 65 | run: | 66 | theHarvester -d yale.edu -b certspotter 67 | 68 | - name: Run theHarvester module Crtsh 69 | run: | 70 | theHarvester -d hcl.com -b crtsh 71 | 72 | - name: Run theHarvester module DnsDumpster 73 | run: | 74 | theHarvester -d yale.edu -b dnsdumpster 75 | 76 | - name: Run theHarvester module DuckDuckGo 77 | run: | 78 | theHarvester -d yale.edu -b duckduckgo 79 | 80 | - name: Run theHarvester module HackerTarget 81 | run: | 82 | theHarvester -d yale.edu -b hackertarget 83 | 84 | - name: Run theHarvester module Intelx 85 | run: | 86 | theHarvester -d yale.edu -b intelx 87 | 88 | - name: Run theHarvester module Otx 89 | run: | 90 | theHarvester -d yale.edu -b otx 91 | 92 | - name: Run theHarvester module RapidDns 93 | run: | 94 | theHarvester -d yale.edu -b rapiddns 95 | 96 | - name: Run theHarvester module Threatminer 97 | run: | 98 | theHarvester -d yale.edu -b threatminer 99 | 100 | - name: Run theHarvester module Urlscan 101 | run: | 102 | theHarvester -d yale.edu -b urlscan 103 | 104 | - name: Run theHarvester module Yahoo 105 | run: | 106 | theHarvester -d yale.edu -b yahoo 107 | 108 | - name: Run theHarvester module DNS brute force 109 | run: | 110 | theHarvester -d yale.edu -c 111 | 112 | - name: Static type checking with mypy 113 | run: | 114 | mypy --pretty theHarvester/*/*.py 115 | mypy --pretty theHarvester/*/*/*.py 116 | -------------------------------------------------------------------------------- /src/theHarvester/.gitignore: -------------------------------------------------------------------------------- 1 | *.idea 2 | *.pyc 3 | *.sqlite 4 | *.html 5 | *.htm 6 | *.vscode 7 | *.xml 8 | *.json 9 | debug_results.txt 10 | venv 11 | .mypy_cache 12 | .pytest_cache 13 | build/ 14 | dist/ 15 | theHarvester.egg-info 16 | api-keys.yaml 17 | .DS_Store 18 | .venv 19 | .pyre 20 | -------------------------------------------------------------------------------- /src/theHarvester/.isort.cfg: -------------------------------------------------------------------------------- 1 | [isort] 2 | profile = black 3 | -------------------------------------------------------------------------------- /src/theHarvester/.pyre_configuration: -------------------------------------------------------------------------------- 1 | { 2 | "site_package_search_strategy": "pep561", 3 | "source_directories": [ 4 | "." 
5 | ] 6 | } 7 | -------------------------------------------------------------------------------- /src/theHarvester/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:3 2 | LABEL maintainer="@jay_townsend1 & @NotoriousRebel1" 3 | RUN apk update && apk upgrade --available && apk add --no-cache musl-dev git libffi-dev gcc python3-dev pipx libxml2-dev libxslt-dev bash 4 | RUN mkdir -p "~/.local/share/theHarvester/static/" 5 | RUN pipx install git+https://github.com/laramies/theHarvester.git 6 | RUN pipx ensurepath 7 | ENTRYPOINT ["/root/.local/bin/restfulHarvest", "-H", "0.0.0.0", "-p", "80"] 8 | EXPOSE 80 9 | -------------------------------------------------------------------------------- /src/theHarvester/README/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to theHarvester Project 2 | Welcome to theHarvester project, so you would like to contribute. 3 | The following below must be met to get accepted. 4 | 5 | # CI 6 | Make sure all CI passes and you do not introduce any alerts from lgtm. 7 | 8 | # Unit Tests 9 | For new modules a unit test for that module is required and we use pytest. 10 | 11 | # Coding Standards 12 | * No single letter variables and variable names must represent the action that it is performing 13 | * Have static typing on functions etc 14 | * Make sure no errors are reported from mypy 15 | * No issues reported with flake8 16 | 17 | # Submitting Bugs 18 | If you find a bug in a module that you want to submit an issue for and know how to write python code. 19 | Please create a unit test for that bug(If possible) and submit a fix for it as it would be a big help to the project. 20 | -------------------------------------------------------------------------------- /src/theHarvester/README/LICENSES: -------------------------------------------------------------------------------- 1 | Released under the GPL v 2.0. 2 | 3 | If you did not receive a copy of the GPL, try http://www.gnu.org/. 4 | 5 | Copyright 2011 Christian Martorella 6 | 7 | theHarvester is free software; you can redistribute it and/or modify 8 | it under the terms of the GNU General Public License as published by 9 | the Free Software Foundation version 2 of the License. 10 | 11 | theHarvester is distributed in the hope that it will be useful, 12 | but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | GNU General Public License for more details. 15 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 | -------------------------------------------------------------------------------- /src/theHarvester/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | services: 3 | theharvester.svc.local: 4 | container_name: theHarvester 5 | volumes: 6 | - ./api-keys.yaml:/root/.theHarvester/api-keys.yaml 7 | - ./api-keys.yaml:/etc/theHarvester/api-keys.yaml 8 | - ./proxies.yaml:/etc/theHarvester/proxies.yaml 9 | - ./proxies.yaml:/root/.theHarvester/proxies.yaml 10 | build: . 
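# NOTE (comment added for clarity, describing the compose file above): the image is built from the local Dockerfile, whose ENTRYPOINT starts restfulHarvest listening on container port 80; the mapping below publishes it on host port 8080, and the bind-mounted api-keys.yaml/proxies.yaml supply configuration.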
11 | ports: 12 | - "8080:80" 13 | 14 | networks: 15 | default: 16 | name: app_theHarvester_network 17 | -------------------------------------------------------------------------------- /src/theHarvester/mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True 3 | show_traceback = True 4 | show_error_codes = True 5 | namespace_packages = True 6 | -------------------------------------------------------------------------------- /src/theHarvester/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "theHarvester" 3 | description = "theHarvester is a very simple, yet effective tool designed to be used in the early stages of a penetration test" 4 | readme = "README.md" 5 | authors = [ 6 | { name = "Christian Martorella", email = "cmartorella@edge-security.com" }, 7 | { name = "Jay Townsend", email = "jay@cybermon.uk" }, 8 | { name = "Matthew Brown", email = "36310667+NotoriousRebel@users.noreply.github.com" }, 9 | ] 10 | requires-python = ">=3.9" 11 | urls.Homepage = "https://github.com/laramies/theHarvester" 12 | classifiers = [ 13 | "Programming Language :: Python :: 3", 14 | "Programming Language :: Python :: 3.9", 15 | "Programming Language :: Python :: 3.10", 16 | "Programming Language :: Python :: 3.11", 17 | "License :: OSI Approved :: GNU General Public License v2 (GPLv2)", 18 | "Operating System :: OS Independent", 19 | ] 20 | dynamic = ["dependencies", "optional-dependencies", "version"] 21 | 22 | [project.scripts] 23 | theHarvester = "theHarvester.theHarvester:main" 24 | restfulHarvest = "theHarvester.restfulHarvest:main" 25 | 26 | [tool.setuptools.dynamic] 27 | version = { attr = "theHarvester.lib.version.VERSION" } 28 | dependencies = { file = "requirements/base.txt" } 29 | optional-dependencies.dev = { file = "requirements/dev.txt" } 30 | 31 | [tool.setuptools.packages.find] 32 | include = ["theHarvester*"] 33 | 34 | [tool.setuptools.package-data] 35 | "*" = ["*.txt", "*.yaml"] 36 | 37 | [tool.pytest.ini_options] 38 | minversion = "7.1" 39 | addopts = "--no-header --asyncio-mode=auto" 40 | testpaths = [ 41 | "tests", 42 | "tests/discovery/", 43 | ] 44 | 45 | [build-system] 46 | requires = ["setuptools>=68"] 47 | build-backend = "setuptools.build_meta" 48 | -------------------------------------------------------------------------------- /src/theHarvester/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | minversion = 7.1.1 3 | testpaths = tests 4 | asyncio_mode=auto -------------------------------------------------------------------------------- /src/theHarvester/requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements/base.txt 2 | -------------------------------------------------------------------------------- /src/theHarvester/requirements/base.txt: -------------------------------------------------------------------------------- 1 | aiodns==3.1.1 2 | aiofiles==23.2.1 3 | aiohttp==3.9.3 4 | aiomultiprocess==0.9.0 5 | aiosqlite==0.19.0 6 | beautifulsoup4==4.12.3 7 | censys==2.2.11 8 | certifi==2024.2.2 9 | dnspython==2.5.0 10 | fastapi==0.109.0 11 | lxml==5.1.0 12 | netaddr==0.10.1 13 | ujson==5.9.0 14 | pyppeteer==1.0.2 15 | PyYAML==6.0.1 16 | python-dateutil==2.8.2 17 | requests==2.31.0 18 | retrying==1.3.4 19 | setuptools==69.0.3 20 | shodan==1.31.0 21 | slowapi==0.1.8 22 | uvicorn==0.27.0.post1 23 | uvloop==0.19.0; 
platform_system != "Windows" 24 | -------------------------------------------------------------------------------- /src/theHarvester/requirements/dev.txt: -------------------------------------------------------------------------------- 1 | black==24.1.1 2 | flake8==7.0.0 3 | isort==5.13.2 4 | mypy==1.8.0 5 | mypy-extensions==1.0.0 6 | pydantic==2.5.3 7 | pyre-check==0.9.19 8 | pyflakes==3.2.0 9 | pytest==7.4.4 10 | pytest-asyncio==0.23.4 11 | types-certifi==2021.10.8.3 12 | types-chardet==5.0.4.6 13 | types-ujson==5.9.0.0 14 | types-PyYAML==6.0.12.12 15 | types-requests==2.31.0.6 # 2.31.0.7 introduced a regression 16 | types-python-dateutil==2.8.19.20240106 17 | wheel==0.42.0 -------------------------------------------------------------------------------- /src/theHarvester/restfulHarvest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from theHarvester.restfulHarvest import main 3 | 4 | if __name__ == "__main__": 5 | main() 6 | -------------------------------------------------------------------------------- /src/theHarvester/setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E501, F405, F403, E402, F401, F402 -------------------------------------------------------------------------------- /src/theHarvester/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/theHarvester/tests/__init__.py -------------------------------------------------------------------------------- /src/theHarvester/tests/discovery/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/theHarvester/tests/discovery/__init__.py -------------------------------------------------------------------------------- /src/theHarvester/tests/discovery/test_anubis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | import os 4 | from typing import Optional 5 | 6 | import pytest 7 | import requests 8 | from _pytest.mark.structures import MarkDecorator 9 | 10 | from theHarvester.discovery import anubis 11 | from theHarvester.lib.core import * 12 | 13 | pytestmark: MarkDecorator = pytest.mark.asyncio 14 | github_ci: Optional[str] = os.getenv( 15 | "GITHUB_ACTIONS" 16 | ) # Github set this to be the following: true instead of True 17 | 18 | 19 | class TestAnubis: 20 | @staticmethod 21 | def domain() -> str: 22 | return "apple.com" 23 | 24 | async def test_api(self) -> None: 25 | base_url = f"https://jldc.me/anubis/subdomains/{TestAnubis.domain()}" 26 | headers = {"User-Agent": Core.get_user_agent()} 27 | request = requests.get(base_url, headers=headers) 28 | assert request.status_code == 200 29 | 30 | async def test_do_search(self): 31 | search = anubis.SearchAnubis(word=TestAnubis.domain()) 32 | await search.do_search() 33 | return await search.get_hostnames() 34 | 35 | async def test_process(self) -> None: 36 | await self.test_do_search() 37 | assert len(await self.test_do_search()) > 0 38 | -------------------------------------------------------------------------------- /src/theHarvester/tests/discovery/test_certspotter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 
# coding=utf-8 3 | import os 4 | from typing import Optional 5 | 6 | import pytest 7 | import requests 8 | from _pytest.mark.structures import MarkDecorator 9 | 10 | from theHarvester.discovery import certspottersearch 11 | from theHarvester.lib.core import * 12 | 13 | pytestmark: MarkDecorator = pytest.mark.asyncio 14 | github_ci: Optional[str] = os.getenv( 15 | "GITHUB_ACTIONS" 16 | ) # Github set this to be the following: true instead of True 17 | 18 | 19 | class TestCertspotter(object): 20 | @staticmethod 21 | def domain() -> str: 22 | return "metasploit.com" 23 | 24 | async def test_api(self) -> None: 25 | base_url = f"https://api.certspotter.com/v1/issuances?domain={TestCertspotter.domain()}&expand=dns_names" 26 | headers = {"User-Agent": Core.get_user_agent()} 27 | request = requests.get(base_url, headers=headers) 28 | assert request.status_code == 200 29 | 30 | async def test_search(self) -> None: 31 | search = certspottersearch.SearchCertspoter(TestCertspotter.domain()) 32 | await search.process() 33 | assert isinstance(await search.get_hostnames(), set) 34 | 35 | 36 | if __name__ == "__main__": 37 | pytest.main() 38 | -------------------------------------------------------------------------------- /src/theHarvester/tests/discovery/test_githubcode.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | 3 | import pytest 4 | from _pytest.mark.structures import MarkDecorator 5 | from requests import Response 6 | 7 | from theHarvester.discovery import githubcode 8 | from theHarvester.discovery.constants import MissingKey 9 | from theHarvester.lib.core import Core 10 | 11 | pytestmark: MarkDecorator = pytest.mark.asyncio 12 | 13 | 14 | class TestSearchGithubCode: 15 | class OkResponse: 16 | response = Response() 17 | json = { 18 | "items": [ 19 | {"text_matches": [{"fragment": "test1"}]}, 20 | {"text_matches": [{"fragment": "test2"}]}, 21 | ] 22 | } 23 | response.status_code = 200 24 | response.json = MagicMock(return_value=json) 25 | 26 | class FailureResponse: 27 | response = Response() 28 | response.json = MagicMock(return_value={}) 29 | response.status_code = 401 30 | 31 | class RetryResponse: 32 | response = Response() 33 | response.json = MagicMock(return_value={}) 34 | response.status_code = 403 35 | 36 | class MalformedResponse: 37 | response = Response() 38 | json = { 39 | "items": [ 40 | {"fail": True}, 41 | {"text_matches": []}, 42 | {"text_matches": [{"weird": "result"}]}, 43 | ] 44 | } 45 | response.json = MagicMock(return_value=json) 46 | response.status_code = 200 47 | 48 | async def test_missing_key(self) -> None: 49 | with pytest.raises(MissingKey): 50 | Core.github_key = MagicMock(return_value=None) 51 | githubcode.SearchGithubCode(word="test", limit=500) 52 | 53 | async def test_fragments_from_response(self) -> None: 54 | Core.github_key = MagicMock(return_value="lol") 55 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 56 | test_result = await test_class_instance.fragments_from_response( 57 | self.OkResponse.response.json() 58 | ) 59 | print("test_result: ", test_result) 60 | assert test_result == ["test1", "test2"] 61 | 62 | async def test_invalid_fragments_from_response(self) -> None: 63 | Core.github_key = MagicMock(return_value="lol") 64 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 65 | test_result = await test_class_instance.fragments_from_response( 66 | self.MalformedResponse.response.json() 67 | ) 68 | assert test_result == [] 69 
| 70 | async def test_next_page(self) -> None: 71 | Core.github_key = MagicMock(return_value="lol") 72 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 73 | test_result = githubcode.SuccessResult(list(), next_page=2, last_page=4) 74 | assert 2 == await test_class_instance.next_page_or_end(test_result) 75 | 76 | async def test_last_page(self) -> None: 77 | Core.github_key = MagicMock(return_value="lol") 78 | test_class_instance = githubcode.SearchGithubCode(word="test", limit=500) 79 | test_result = githubcode.SuccessResult(list(), None, None) 80 | assert None is await test_class_instance.next_page_or_end(test_result) 81 | 82 | if __name__ == "__main__": 83 | pytest.main() 84 | -------------------------------------------------------------------------------- /src/theHarvester/tests/discovery/test_otx.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | import os 4 | from typing import Optional 5 | 6 | import pytest 7 | import requests 8 | from _pytest.mark.structures import MarkDecorator 9 | 10 | from theHarvester.discovery import otxsearch 11 | from theHarvester.lib.core import * 12 | 13 | pytestmark: MarkDecorator = pytest.mark.asyncio 14 | github_ci: Optional[str] = os.getenv( 15 | "GITHUB_ACTIONS" 16 | ) # Github set this to be the following: true instead of True 17 | 18 | 19 | class TestOtx(object): 20 | @staticmethod 21 | def domain() -> str: 22 | return "cybermon.uk" 23 | 24 | async def test_api(self) -> None: 25 | base_url = f"https://otx.alienvault.com/api/v1/indicators/domain/{TestOtx.domain()}/passive_dns" 26 | headers = {"User-Agent": Core.get_user_agent()} 27 | request = requests.get(base_url, headers=headers) 28 | assert request.status_code == 200 29 | 30 | async def test_search(self) -> None: 31 | search = otxsearch.SearchOtx(TestOtx.domain()) 32 | await search.process() 33 | assert isinstance(await search.get_hostnames(), set) 34 | assert isinstance(await search.get_ips(), set) 35 | 36 | 37 | if __name__ == "__main__": 38 | pytest.main() 39 | -------------------------------------------------------------------------------- /src/theHarvester/tests/test_myparser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # coding=utf-8 3 | 4 | import pytest 5 | 6 | from theHarvester.parsers import myparser 7 | 8 | 9 | class TestMyParser(object): 10 | @pytest.mark.asyncio 11 | async def test_emails(self) -> None: 12 | word = "domain.com" 13 | results = "@domain.com***a@domain***banotherdomain.com***c@domain.com***d@sub.domain.com***" 14 | parse = myparser.Parser(results, word) 15 | emails = sorted(await parse.emails()) 16 | assert emails == ["c@domain.com", "d@sub.domain.com"] 17 | 18 | 19 | if __name__ == "__main__": 20 | pytest.main() 21 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/theHarvester/theHarvester-logo.png -------------------------------------------------------------------------------- /src/theHarvester/theHarvester-logo.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/theHarvester/theHarvester-logo.webp
-------------------------------------------------------------------------------- /src/theHarvester/theHarvester.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Note: This script runs theHarvester 3 | import sys 4 | 5 | from theHarvester.theHarvester import main 6 | 7 | if sys.version_info.major < 3 or sys.version_info.minor < 9: 8 | print("\033[93m[!] Make sure you have Python 3.9+ installed, quitting.\n\n \033[0m") 9 | sys.exit(1) 10 | 11 | if __name__ == "__main__": 12 | main() 13 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["hostchecker"] 2 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/data/proxies.yaml: -------------------------------------------------------------------------------- 1 | http: 2 | - ip:port 3 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/data/wordlists/dorks.txt: -------------------------------------------------------------------------------- 1 | inurl:"contact" 2 | intext:email filetype:log 3 | "Index of /mail" 4 | "admin account info" filetype:log 5 | intext:@ 6 | administrator accounts/ 7 | intitle:"Index of" .bash_history 8 | intitle:"index of" members OR accounts 9 | inurl:/shared/help.php 10 | inurl:public 11 | intitle:index.of inbox 12 | intitle:"Server Administration" 13 | inurl:passwd.txt 14 | robots.txt 15 | php-addressbook "This is the addressbook for *" -warning -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/data/wordlists/general/common.txt: -------------------------------------------------------------------------------- 1 | admin 2 | test 3 | hello 4 | uk 5 | login 6 | book 7 | robots.txt 8 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/theHarvester/theHarvester/discovery/__init__.py -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/anubis.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher 2 | 3 | 4 | class SearchAnubis: 5 | def __init__(self, word) -> None: 6 | self.word = word 7 | self.totalhosts: list = [] 8 | self.proxy = False 9 | 10 | async def do_search(self) -> None: 11 | url = f"https://jldc.me/anubis/subdomains/{self.word}" 12 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 13 | self.totalhosts = response[0] 14 | 15 | async def get_hostnames(self) -> list: 16 | return self.totalhosts 17 | 18 | async def process(self, proxy: bool = False) -> None: 19 | self.proxy = proxy 20 | await self.do_search() 21 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/baidusearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher, Core 2 | from theHarvester.parsers import myparser 3 | 4 | 5 | class SearchBaidu: 6 | 
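"""Scrape Baidu web search for a target domain (docstring added for clarity; it summarizes the methods visible below): builds paginated query URLs, fetches them asynchronously, and parses emails/hostnames from the combined responses."""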
def __init__(self, word, limit) -> None: 7 | self.word = word 8 | self.total_results = "" 9 | self.server = "www.baidu.com" 10 | self.hostname = "www.baidu.com" 11 | self.limit = limit 12 | self.proxy = False 13 | 14 | async def do_search(self) -> None: 15 | headers = {"Host": self.hostname, "User-agent": Core.get_user_agent()} 16 | base_url = f"https://{self.server}/s?wd=%40{self.word}&pn=xx&oq={self.word}" 17 | urls = [ 18 | base_url.replace("xx", str(num)) 19 | for num in range(0, self.limit, 10) 20 | if num <= self.limit 21 | ] 22 | responses = await AsyncFetcher.fetch_all( 23 | urls, headers=headers, proxy=self.proxy 24 | ) 25 | for response in responses: 26 | self.total_results += response 27 | 28 | async def process(self, proxy: bool = False) -> None: 29 | self.proxy = proxy 30 | await self.do_search() 31 | 32 | async def get_emails(self): 33 | rawres = myparser.Parser(self.total_results, self.word) 34 | return await rawres.emails() 35 | 36 | async def get_hostnames(self): 37 | rawres = myparser.Parser(self.total_results, self.word) 38 | return await rawres.hostnames() 39 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/bevigil.py: -------------------------------------------------------------------------------- 1 | from typing import Set 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | 6 | 7 | class SearchBeVigil: 8 | def __init__(self, word) -> None: 9 | self.word = word 10 | self.totalhosts: Set = set() 11 | self.interestingurls: Set = set() 12 | self.key = Core.bevigil_key() 13 | if self.key is None: 14 | self.key = "" 15 | raise MissingKey("bevigil") 16 | self.proxy = False 17 | 18 | async def do_search(self) -> None: 19 | subdomain_endpoint = f"https://osint.bevigil.com/api/{self.word}/subdomains/" 20 | url_endpoint = f"https://osint.bevigil.com/api/{self.word}/urls/" 21 | headers = {"X-Access-Token": self.key} 22 | 23 | responses = await AsyncFetcher.fetch_all( 24 | [subdomain_endpoint], json=True, proxy=self.proxy, headers=headers 25 | ) 26 | response = responses[0] 27 | for subdomain in response["subdomains"]: 28 | self.totalhosts.add(subdomain) 29 | 30 | responses = await AsyncFetcher.fetch_all( 31 | [url_endpoint], json=True, proxy=self.proxy, headers=headers 32 | ) 33 | response = responses[0] 34 | for url in response["urls"]: 35 | self.interestingurls.add(url) 36 | 37 | async def get_hostnames(self) -> set: 38 | return self.totalhosts 39 | 40 | async def get_interestingurls(self) -> set: 41 | return self.interestingurls 42 | 43 | async def process(self, proxy: bool = False) -> None: 44 | self.proxy = proxy 45 | await self.do_search() 46 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/binaryedgesearch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Set 3 | 4 | from theHarvester.discovery.constants import MissingKey, get_delay 5 | from theHarvester.lib.core import AsyncFetcher, Core 6 | 7 | 8 | class SearchBinaryEdge: 9 | def __init__(self, word, limit) -> None: 10 | self.word = word 11 | self.totalhosts: Set = set() 12 | self.proxy = False 13 | self.key = Core.binaryedge_key() 14 | self.limit = 501 if limit >= 501 else limit 15 | self.limit = 2 if self.limit == 1 else self.limit 16 | if self.key is None: 17 | raise MissingKey("binaryedge") 18 | 19 | async def 
do_search(self) -> None: 20 | base_url = f"https://api.binaryedge.io/v2/query/domains/subdomain/{self.word}" 21 | headers = {"X-KEY": self.key, "User-Agent": Core.get_user_agent()} 22 | for page in range(1, self.limit): 23 | params = {"page": page} 24 | response = await AsyncFetcher.fetch_all( 25 | [base_url], json=True, proxy=self.proxy, params=params, headers=headers 26 | ) 27 | responses = response[0] 28 | dct = responses 29 | if ("status" in dct.keys() and "message" in dct.keys()) and ( 30 | dct["status"] == 400 31 | or "Bad Parameter" in dct["message"] 32 | or "Error" in dct["message"] 33 | ): 34 | # 400 status code means no more results 35 | break 36 | if "events" in dct.keys(): 37 | if len(dct["events"]) == 0: 38 | break 39 | self.totalhosts.update({host for host in dct["events"]}) 40 | await asyncio.sleep(get_delay()) 41 | 42 | async def get_hostnames(self) -> set: 43 | return self.totalhosts 44 | 45 | async def process(self, proxy: bool = False) -> None: 46 | self.proxy = proxy 47 | await self.do_search() 48 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/bingsearch.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from theHarvester.discovery.constants import MissingKey 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | from theHarvester.parsers import myparser 6 | 7 | 8 | class SearchBing: 9 | def __init__(self, word, limit, start) -> None: 10 | self.word = word.replace(" ", "%20") 11 | self.results: list[Any] = [] 12 | self.total_results = "" 13 | self.server = "www.bing.com" 14 | self.apiserver = "api.search.live.net" 15 | self.hostname = "www.bing.com" 16 | self.limit = int(limit) 17 | self.bingApi = Core.bing_key() 18 | self.counter = start 19 | self.proxy = False 20 | 21 | async def do_search(self) -> None: 22 | headers = { 23 | "Host": self.hostname, 24 | "Cookie": "SRCHHPGUSR=ADLT=DEMOTE&NRSLT=50", 25 | "Accept-Language": "en-us,en", 26 | "User-agent": Core.get_user_agent(), 27 | } 28 | base_url = f'https://{self.server}/search?q=%40"{self.word}"&count=50&first=xx' 29 | urls = [ 30 | base_url.replace("xx", str(num)) 31 | for num in range(0, self.limit, 50) 32 | if num <= self.limit 33 | ] 34 | responses = await AsyncFetcher.fetch_all( 35 | urls, headers=headers, proxy=self.proxy 36 | ) 37 | for response in responses: 38 | self.total_results += response 39 | 40 | async def do_search_api(self) -> None: 41 | url = "https://api.bing.microsoft.com/v7.0/search?" 
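# Comment added for clarity: Bing Web Search v7 REST endpoint; the query, result count, and market are passed via the params dict below, and the request is authenticated with the Ocp-Apim-Subscription-Key header built from self.bingApi.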
42 | params = { 43 | "q": self.word, 44 | "count": str(self.limit), 45 | "offset": "0", 46 | "mkt": "en-us", 47 | "safesearch": "Off", 48 | } 49 | headers = { 50 | "User-Agent": Core.get_user_agent(), 51 | "Ocp-Apim-Subscription-Key": self.bingApi, 52 | } 53 | self.results = await AsyncFetcher.fetch_all( 54 | [url], headers=headers, params=params, proxy=self.proxy 55 | ) 56 | for res in self.results: 57 | self.total_results += res 58 | 59 | async def do_search_vhost(self) -> None: 60 | headers = { 61 | "Host": self.hostname, 62 | "Cookie": "mkt=en-US;ui=en-US;SRCHHPGUSR=NEWWND=0&ADLT=DEMOTE&NRSLT=50", 63 | "Accept-Language": "en-us,en", 64 | "User-agent": Core.get_user_agent(), 65 | } 66 | base_url = f"http://{self.server}/search?q=ip:{self.word}&go=&count=50&FORM=QBHL&qs=n&first=xx" 67 | urls = [ 68 | base_url.replace("xx", str(num)) 69 | for num in range(0, self.limit, 50) 70 | if num <= self.limit 71 | ] 72 | responses = await AsyncFetcher.fetch_all( 73 | urls, headers=headers, proxy=self.proxy 74 | ) 75 | for response in responses: 76 | self.total_results += response 77 | 78 | async def get_emails(self): 79 | rawres = myparser.Parser(self.total_results, self.word) 80 | return await rawres.emails() 81 | 82 | async def get_hostnames(self): 83 | rawres = myparser.Parser(self.total_results, self.word) 84 | return await rawres.hostnames() 85 | 86 | async def get_allhostnames(self): 87 | rawres = myparser.Parser(self.total_results, self.word) 88 | return await rawres.hostnames_all() 89 | 90 | async def process(self, api, proxy: bool = False) -> None: 91 | self.proxy = proxy 92 | if api == "yes": 93 | if self.bingApi is None: 94 | raise MissingKey("BingAPI") 95 | await self.do_search_api() 96 | else: 97 | await self.do_search() 98 | print(f"\tSearching {self.counter} results.") 99 | 100 | async def process_vhost(self) -> None: 101 | await self.do_search_vhost() 102 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/bravesearch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from theHarvester.discovery.constants import get_delay 4 | from theHarvester.lib.core import AsyncFetcher, Core 5 | from theHarvester.parsers import myparser 6 | 7 | 8 | class SearchBrave: 9 | def __init__(self, word, limit): 10 | self.word = word 11 | self.results = "" 12 | self.totalresults = "" 13 | self.server = "https://search.brave.com/search?q=" 14 | self.limit = limit 15 | self.proxy = False 16 | 17 | async def do_search(self): 18 | headers = {"User-Agent": Core.get_user_agent()} 19 | for query in [f'"{self.word}"', f"site:{self.word}"]: 20 | try: 21 | for offset in range(0, 50): 22 | # To reduce the total number of requests, only two queries are made "self.word" and site:self.word 23 | current_url = f"{self.server}{query}&offset={offset}&source=web&show_local=0&spellcheck=0" 24 | resp = await AsyncFetcher.fetch_all( 25 | [current_url], headers=headers, proxy=self.proxy 26 | ) 27 | self.results = resp[0] 28 | self.totalresults += self.results 29 | # if 'Results from Microsoft Bing.' 
in resp[0] \ 30 | if ( 31 | "Not many great matches came back for your search" in resp[0] 32 | or "Your request has been flagged as being suspicious and Brave Search" 33 | in resp[0] 34 | or "Prove" in resp[0] 35 | and "robot" in resp[0] 36 | or "Robot" in resp[0] 37 | ): 38 | break 39 | await asyncio.sleep(get_delay() + 15) 40 | except Exception as e: 41 | print(f"An exception has occurred in bravesearch: {e}") 42 | await asyncio.sleep(get_delay() + 80) 43 | continue 44 | 45 | async def get_emails(self): 46 | rawres = myparser.Parser(self.totalresults, self.word) 47 | return await rawres.emails() 48 | 49 | async def get_hostnames(self): 50 | rawres = myparser.Parser(self.totalresults, self.word) 51 | return await rawres.hostnames() 52 | 53 | async def process(self, proxy=False): 54 | self.proxy = proxy 55 | await self.do_search() 56 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/bufferoverun.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Set 3 | 4 | from theHarvester.discovery.constants import MissingKey 5 | from theHarvester.lib.core import AsyncFetcher, Core 6 | 7 | 8 | class SearchBufferover: 9 | def __init__(self, word) -> None: 10 | self.word = word 11 | self.totalhosts: Set = set() 12 | self.totalips: Set = set() 13 | self.key = Core.bufferoverun_key() 14 | if self.key is None: 15 | raise MissingKey("bufferoverun") 16 | self.proxy = False 17 | 18 | async def do_search(self) -> None: 19 | url = f"https://tls.bufferover.run/dns?q={self.word}" 20 | response = await AsyncFetcher.fetch_all( 21 | [url], 22 | json=True, 23 | headers={"User-Agent": Core.get_user_agent(), "x-api-key": f"{self.key}"}, 24 | proxy=self.proxy, 25 | ) 26 | dct = response[0] 27 | if dct["Results"]: 28 | self.totalhosts = { 29 | host.split(",") 30 | if "," in host 31 | and self.word.replace("www.", "") in host.split(",")[0] in host 32 | else host.split(",")[4] 33 | for host in dct["Results"] 34 | } 35 | 36 | self.totalips = { 37 | ip.split(",")[0] 38 | for ip in dct["Results"] 39 | if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip.split(",")[0]) 40 | } 41 | 42 | async def get_hostnames(self) -> set: 43 | return self.totalhosts 44 | 45 | async def get_ips(self) -> set: 46 | return self.totalips 47 | 48 | async def process(self, proxy: bool = False) -> None: 49 | self.proxy = proxy 50 | await self.do_search() 51 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/censysearch.py: -------------------------------------------------------------------------------- 1 | from typing import Set 2 | 3 | from censys.common import __version__ 4 | from censys.common.exceptions import ( 5 | CensysRateLimitExceededException, 6 | CensysUnauthorizedException, 7 | ) 8 | from censys.search import CensysCerts 9 | 10 | from theHarvester.discovery.constants import MissingKey 11 | from theHarvester.lib.core import Core 12 | from theHarvester.lib.version import version as thehavester_version 13 | 14 | 15 | class SearchCensys: 16 | def __init__(self, domain, limit: int = 500) -> None: 17 | self.word = domain 18 | self.key = Core.censys_key() 19 | if self.key[0] is None or self.key[1] is None: 20 | raise MissingKey("Censys ID and/or Secret") 21 | self.totalhosts: Set = set() 22 | self.emails: Set = set() 23 | self.limit = limit 24 | self.proxy = False 25 | 26 | async def do_search(self) -> None: 27 | try: 28 | 
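# Comment added for clarity: construct the Censys certificate-search client here; the descriptive User-Agent below lets Censys attribute the traffic to theHarvester, and an invalid ID/secret raises CensysUnauthorizedException, surfaced as MissingKey.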
cert_search = CensysCerts( 29 | api_id=self.key[0], 30 | api_secret=self.key[1], 31 | user_agent=f"censys-python/{__version__} (theHarvester/{thehavester_version}); +https://github.com/laramies/theHarvester)", 32 | ) 33 | except CensysUnauthorizedException: 34 | raise MissingKey("Censys ID and/or Secret") 35 | 36 | query = f"names: {self.word}" 37 | try: 38 | response = cert_search.search( 39 | query=query, 40 | fields=["names", "parsed.subject.email_address"], 41 | max_records=self.limit, 42 | ) 43 | for cert in response(): 44 | self.totalhosts.update(cert.get("names", [])) 45 | email_address = ( 46 | cert.get("parsed", {}).get("subject", {}).get("email_address", []) 47 | ) 48 | self.emails.update(email_address) 49 | except CensysRateLimitExceededException: 50 | print("Censys rate limit exceeded") 51 | 52 | async def get_hostnames(self) -> set: 53 | return self.totalhosts 54 | 55 | async def get_emails(self) -> set: 56 | return self.emails 57 | 58 | async def process(self, proxy: bool = False) -> None: 59 | self.proxy = proxy 60 | await self.do_search() 61 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/certspottersearch.py: -------------------------------------------------------------------------------- 1 | from typing import Set 2 | 3 | from theHarvester.lib.core import AsyncFetcher 4 | 5 | 6 | class SearchCertspoter: 7 | def __init__(self, word) -> None: 8 | self.word = word 9 | self.totalhosts: Set = set() 10 | self.proxy = False 11 | 12 | async def do_search(self) -> None: 13 | base_url = f"https://api.certspotter.com/v1/issuances?domain={self.word}&expand=dns_names" 14 | try: 15 | response = await AsyncFetcher.fetch_all( 16 | [base_url], json=True, proxy=self.proxy 17 | ) 18 | response = response[0] 19 | if isinstance(response, list): 20 | for dct in response: 21 | for key, value in dct.items(): 22 | if key == "dns_names": 23 | self.totalhosts.update({name for name in value if name}) 24 | elif isinstance(response, dict): 25 | self.totalhosts.update({response["dns_names"] if "dns_names" in response.keys() else ""}) # type: ignore 26 | else: 27 | self.totalhosts.update({""}) 28 | except Exception as e: 29 | print(e) 30 | 31 | async def get_hostnames(self) -> set: 32 | return self.totalhosts 33 | 34 | async def process(self, proxy: bool = False) -> None: 35 | self.proxy = proxy 36 | await self.do_search() 37 | print("\tSearching results.") 38 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/crtsh.py: -------------------------------------------------------------------------------- 1 | from typing import List, Set 2 | 3 | from theHarvester.lib.core import AsyncFetcher 4 | 5 | 6 | class SearchCrtsh: 7 | def __init__(self, word) -> None: 8 | self.word = word 9 | self.data: List = [] 10 | self.proxy = False 11 | 12 | async def do_search(self) -> List: 13 | data: Set = set() 14 | try: 15 | url = f"https://crt.sh/?q=%25.{self.word}&output=json" 16 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 17 | response = response[0] 18 | data = set( 19 | [ 20 | dct["name_value"][2:] 21 | if "*." 
== dct["name_value"][:2] 22 | else dct["name_value"] 23 | for dct in response 24 | ] 25 | ) 26 | data = { 27 | domain 28 | for domain in data 29 | if (domain[0] != "*" and str(domain[0:4]).isnumeric() is False) 30 | } 31 | except Exception as e: 32 | print(e) 33 | clean: List = [] 34 | for x in data: 35 | pre = x.split() 36 | for y in pre: 37 | clean.append(y) 38 | return clean 39 | 40 | async def process(self, proxy: bool = False) -> None: 41 | self.proxy = proxy 42 | data = await self.do_search() 43 | self.data = data 44 | 45 | async def get_hostnames(self) -> list: 46 | return self.data 47 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/dnsdumpster.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | import aiohttp 4 | 5 | from theHarvester.lib.core import Core 6 | from theHarvester.parsers import myparser 7 | 8 | 9 | class SearchDnsDumpster: 10 | def __init__(self, word) -> None: 11 | self.word = word.replace(" ", "%20") 12 | self.results = "" 13 | self.totalresults = "" 14 | self.server = "dnsdumpster.com" 15 | self.proxy = False 16 | 17 | async def do_search(self) -> None: 18 | try: 19 | agent = Core.get_user_agent() 20 | headers = {"User-Agent": agent} 21 | session = aiohttp.ClientSession(headers=headers) 22 | # create a session to properly verify 23 | url = f"https://{self.server}" 24 | csrftoken = "" 25 | if self.proxy is False: 26 | async with session.get(url, headers=headers) as resp: 27 | resp_cookies = str(resp.cookies) 28 | cookies = resp_cookies.split("csrftoken=") 29 | csrftoken += cookies[1][: cookies[1].find(";")] 30 | else: 31 | async with session.get(url, headers=headers, proxy=self.proxy) as resp: 32 | resp_cookies = str(resp.cookies) 33 | cookies = resp_cookies.split("csrftoken=") 34 | csrftoken += cookies[1][: cookies[1].find(";")] 35 | await asyncio.sleep(5) 36 | 37 | # extract csrftoken from cookies 38 | data = { 39 | "Cookie": f"csfrtoken={csrftoken}", 40 | "csrfmiddlewaretoken": csrftoken, 41 | "targetip": self.word, 42 | "user": "free", 43 | } 44 | headers["Referer"] = url 45 | if self.proxy is False: 46 | async with session.post(url, headers=headers, data=data) as resp: 47 | self.results = await resp.text() 48 | else: 49 | async with session.post( 50 | url, headers=headers, data=data, proxy=self.proxy 51 | ) as resp: 52 | self.results = await resp.text() 53 | await session.close() 54 | except Exception as e: 55 | print(f"An exception occurred: {e}") 56 | self.totalresults += self.results 57 | 58 | async def get_hostnames(self): 59 | rawres = myparser.Parser(self.totalresults, self.word) 60 | return await rawres.hostnames() 61 | 62 | async def process(self, proxy: bool = False) -> None: 63 | self.proxy = proxy 64 | await self.do_search() # Only need to do it once. 
65 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/duckduckgosearch.py: -------------------------------------------------------------------------------- 1 | import ujson 2 | 3 | from theHarvester.lib.core import AsyncFetcher, Core 4 | from theHarvester.parsers import myparser 5 | 6 | 7 | class SearchDuckDuckGo: 8 | def __init__(self, word, limit) -> None: 9 | self.word = word 10 | self.results = "" 11 | self.totalresults = "" 12 | self.dorks: list = [] 13 | self.links: list = [] 14 | self.database = "https://duckduckgo.com/?q=" 15 | self.api = "https://api.duckduckgo.com/?q=x&format=json&pretty=1" # Currently using API. 16 | self.quantity = "100" 17 | self.limit = limit 18 | self.proxy = False 19 | 20 | async def do_search(self) -> None: 21 | # Do normal scraping. 22 | url = self.api.replace("x", self.word) 23 | headers = {"User-Agent": Core.get_user_agent()} 24 | first_resp = await AsyncFetcher.fetch_all( 25 | [url], headers=headers, proxy=self.proxy 26 | ) 27 | self.results = first_resp[0] 28 | self.totalresults += self.results 29 | urls = await self.crawl(self.results) 30 | urls = {url for url in urls if len(url) > 5} 31 | all_resps = await AsyncFetcher.fetch_all(urls) 32 | self.totalresults += "".join(all_resps) 33 | 34 | async def crawl(self, text): 35 | """ 36 | Parse the JSON response and return any URLs found. 37 | :param text: JSON-formatted response text 38 | :return: set of URLs 39 | """ 40 | urls = set() 41 | try: 42 | load = ujson.loads(text) 43 | for keys in load.keys(): # Iterate through keys of dict. 44 | val = load.get(keys) 45 | 46 | if isinstance(val, int) or isinstance(val, dict) or val is None: 47 | continue 48 | 49 | if isinstance(val, list): 50 | if len(val) == 0: # Make sure not indexing an empty list. 51 | continue 52 | val = val[0] # The first value should be a dict. 53 | 54 | if isinstance(val, dict): # Validation check. 55 | for key in val.keys(): 56 | value = val.get(key) 57 | if ( 58 | isinstance(value, str) 59 | and value != "" 60 | and ("https://" in value 61 | or "http://" in value) 62 | ): 63 | urls.add(value) 64 | 65 | if ( 66 | isinstance(val, str) 67 | and val != "" 68 | and ("https://" in val 69 | or "http://" in val) 70 | ): 71 | urls.add(val) 72 | tmp = set() 73 | for url in urls: 74 | if ( 75 | "<" in url and "href=" in url 76 | ): # Format is 77 | equal_index = url.index("=") 78 | true_url = "" 79 | for ch in url[equal_index + 1 :]: 80 | if ch == '"': 81 | tmp.add(true_url) 82 | break 83 | true_url += ch 84 | else: 85 | if url != "": 86 | tmp.add(url) 87 | return tmp 88 | except Exception as e: 89 | print(f"Exception occurred: {e}") 90 | return set() 91 | 92 | async def get_emails(self): 93 | rawres = myparser.Parser(self.totalresults, self.word) 94 | return await rawres.emails() 95 | 96 | async def get_hostnames(self): 97 | rawres = myparser.Parser(self.totalresults, self.word) 98 | return await rawres.hostnames() 99 | 100 | async def process(self, proxy: bool = False) -> None: 101 | self.proxy = proxy 102 | await self.do_search() # Only need to search once since using API. 
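# --- Editor's usage sketch (not part of the original module) ---
# Minimal driver for SearchDuckDuckGo; "limit" is stored but the API code
# path above issues a single query, so any small value works here.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        engine = SearchDuckDuckGo("example.com", limit=50)
        await engine.process()
        print(await engine.get_hostnames())
        print(await engine.get_emails())

    asyncio.run(_demo())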
103 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/fullhuntsearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import MissingKey 2 | from theHarvester.lib.core import AsyncFetcher, Core 3 | 4 | 5 | class SearchFullHunt: 6 | def __init__(self, word) -> None: 7 | self.word = word 8 | self.key = Core.fullhunt_key() 9 | if self.key is None: 10 | raise MissingKey("fullhunt") 11 | self.total_results = None 12 | self.proxy = False 13 | 14 | async def do_search(self) -> None: 15 | url = f"https://fullhunt.io/api/v1/domain/{self.word}/subdomains" 16 | response = await AsyncFetcher.fetch_all( 17 | [url], 18 | json=True, 19 | headers={"User-Agent": Core.get_user_agent(), "X-API-KEY": self.key}, 20 | proxy=self.proxy, 21 | ) 22 | self.total_results = response[0]["hosts"] 23 | 24 | async def get_hostnames(self): 25 | return self.total_results 26 | 27 | async def process(self, proxy: bool = False) -> None: 28 | self.proxy = proxy 29 | await self.do_search() 30 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/hackertarget.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher, Core 2 | 3 | 4 | class SearchHackerTarget: 5 | """ 6 | Class uses the HackerTarget api to gather subdomains and ips 7 | """ 8 | 9 | def __init__(self, word) -> None: 10 | self.word = word 11 | self.total_results = "" 12 | self.hostname = "https://api.hackertarget.com" 13 | self.proxy = False 14 | self.results = None 15 | 16 | async def do_search(self) -> None: 17 | headers = {"User-agent": Core.get_user_agent()} 18 | urls = [ 19 | f"{self.hostname}/hostsearch/?q={self.word}", 20 | f"{self.hostname}/reversedns/?q={self.word}", 21 | ] 22 | responses = await AsyncFetcher.fetch_all( 23 | urls, headers=headers, proxy=self.proxy 24 | ) 25 | for response in responses: 26 | self.total_results += response.replace(",", ":") 27 | 28 | async def process(self, proxy: bool = False) -> None: 29 | self.proxy = proxy 30 | await self.do_search() 31 | 32 | async def get_hostnames(self) -> list: 33 | return [ 34 | result 35 | for result in self.total_results.splitlines() 36 | if "No PTR records found" not in result 37 | ] 38 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/intelxsearch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Any 3 | 4 | import requests 5 | import ujson 6 | 7 | from theHarvester.discovery.constants import MissingKey 8 | from theHarvester.lib.core import AsyncFetcher, Core 9 | from theHarvester.parsers import intelxparser 10 | 11 | 12 | class SearchIntelx: 13 | def __init__(self, word) -> None: 14 | self.word = word 15 | self.key = Core.intelx_key() 16 | if self.key is None: 17 | raise MissingKey("Intelx") 18 | self.database = "https://2.intelx.io" 19 | self.results: Any = None 20 | self.info: tuple[Any, ...] 
= () 21 | self.limit: int = 10000 22 | self.proxy = False 23 | self.offset = -1 24 | 25 | async def do_search(self) -> None: 26 | try: 27 | # Based on: https://github.com/IntelligenceX/SDK/blob/master/Python/intelxapi.py 28 | # API requests self identification 29 | # https://intelx.io/integrations 30 | headers = { 31 | "x-key": self.key, 32 | "User-Agent": f"{Core.get_user_agent()}-theHarvester", 33 | } 34 | data = { 35 | "term": self.word, 36 | "buckets": [], 37 | "lookuplevel": 0, 38 | "maxresults": self.limit, 39 | "timeout": 5, 40 | "datefrom": "", 41 | "dateto": "", 42 | "sort": 2, 43 | "media": 0, 44 | "terminate": [], 45 | "target": 0, 46 | } 47 | 48 | total_resp = requests.post( 49 | f"{self.database}/phonebook/search", headers=headers, json=data 50 | ) 51 | phonebook_id = ujson.loads(total_resp.text)["id"] 52 | await asyncio.sleep(5) 53 | 54 | # Fetch results from phonebook based on ID 55 | resp = await AsyncFetcher.fetch_all( 56 | [ 57 | f"{self.database}/phonebook/search/result?id={phonebook_id}&limit={self.limit}&offset={self.offset}" 58 | ], 59 | headers=headers, 60 | json=True, 61 | proxy=self.proxy, 62 | ) 63 | resp = resp[0] 64 | self.results = resp # TODO: give self.results more appropriate typing 65 | except Exception as e: 66 | print(f"An exception has occurred in Intelx: {e}") 67 | 68 | async def process(self, proxy: bool = False): 69 | self.proxy = proxy 70 | await self.do_search() 71 | intelx_parser = intelxparser.Parser() 72 | # TODO: give self.info more appropriate typing 73 | self.info = await intelx_parser.parse_dictionaries(self.results) 74 | 75 | async def get_emails(self): 76 | return self.info[0] 77 | 78 | async def get_interestingurls(self): 79 | # TODO parse add return hostnames for subdomains of urls 80 | return self.info[1] 81 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/netlas.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import MissingKey 2 | from theHarvester.lib.core import AsyncFetcher, Core 3 | 4 | 5 | class SearchNetlas: 6 | def __init__(self, word) -> None: 7 | self.word = word 8 | self.totalhosts: list = [] 9 | self.totalips: list = [] 10 | self.key = Core.netlas_key() 11 | if self.key is None: 12 | raise MissingKey("netlas") 13 | self.proxy = False 14 | 15 | async def do_search(self) -> None: 16 | api = f"https://app.netlas.io/api/domains/?q=*.{self.word}&source_type=include&start=0&fields=*" 17 | headers = {"X-API-Key": self.key} 18 | response = await AsyncFetcher.fetch_all( 19 | [api], json=True, headers=headers, proxy=self.proxy 20 | ) 21 | for domain in response[0]["items"]: 22 | self.totalhosts.append(domain["data"]["domain"]) 23 | 24 | async def get_hostnames(self) -> list: 25 | return self.totalhosts 26 | 27 | async def process(self, proxy: bool = False) -> None: 28 | self.proxy = proxy 29 | await self.do_search() 30 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/otxsearch.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Set 3 | 4 | from theHarvester.lib.core import AsyncFetcher 5 | 6 | 7 | class SearchOtx: 8 | def __init__(self, word) -> None: 9 | self.word = word 10 | self.totalhosts: Set = set() 11 | self.totalips: Set = set() 12 | self.proxy = False 13 | 14 | async def do_search(self) -> None: 15 | url = 
f"https://otx.alienvault.com/api/v1/indicators/domain/{self.word}/passive_dns" 16 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 17 | responses = response[0] 18 | dct = responses 19 | self.totalhosts = {host["hostname"] for host in dct["passive_dns"]} 20 | # filter out ips that are just called NXDOMAIN 21 | self.totalips = { 22 | ip["address"] 23 | for ip in dct["passive_dns"] 24 | if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip["address"]) 25 | } 26 | 27 | async def get_hostnames(self) -> set: 28 | return self.totalhosts 29 | 30 | async def get_ips(self) -> set: 31 | return self.totalips 32 | 33 | async def process(self, proxy: bool = False) -> None: 34 | self.proxy = proxy 35 | await self.do_search() 36 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/pentesttools.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import List 3 | 4 | import ujson 5 | 6 | from theHarvester.discovery.constants import MissingKey 7 | from theHarvester.lib.core import AsyncFetcher, Core 8 | 9 | 10 | class SearchPentestTools: 11 | def __init__(self, word) -> None: 12 | # Script is largely based off https://pentest-tools.com/public/api_client.py.txt 13 | self.word = word 14 | self.key = Core.pentest_tools_key() 15 | if self.key is None: 16 | raise MissingKey("PentestTools") 17 | self.total_results: List = [] 18 | self.api = f"https://pentest-tools.com/api?key={self.key}" 19 | self.proxy = False 20 | 21 | async def poll(self, scan_id): 22 | while True: 23 | time.sleep(3) 24 | # Get the status of our scan 25 | scan_status_data = {"op": "get_scan_status", "scan_id": scan_id} 26 | responses = await AsyncFetcher.post_fetch( 27 | url=self.api, data=ujson.dumps(scan_status_data), proxy=self.proxy 28 | ) 29 | res_json = ujson.loads(responses.strip()) 30 | if res_json["op_status"] == "success": 31 | if ( 32 | res_json["scan_status"] != "waiting" 33 | and res_json["scan_status"] != "running" 34 | ): 35 | getoutput_data = { 36 | "op": "get_output", 37 | "scan_id": scan_id, 38 | "output_format": "json", 39 | } 40 | responses = await AsyncFetcher.post_fetch( 41 | url=self.api, data=ujson.dumps(getoutput_data), proxy=self.proxy 42 | ) 43 | 44 | res_json = ujson.loads(responses.strip("\n")) 45 | self.total_results = await self.parse_json(res_json) 46 | break 47 | else: 48 | print( 49 | f"Operation get_scan_status failed because: {res_json['error']}. 
{res_json['details']}" 50 | ) 51 | break 52 | 53 | @staticmethod 54 | async def parse_json(json_results): 55 | status = json_results["op_status"] 56 | if status == "success": 57 | scan_tests = json_results["scan_output"]["output_json"] 58 | output_data = scan_tests[0]["output_data"] 59 | host_to_ip = [ 60 | f"{subdomain[0]}:{subdomain[1]}" 61 | for subdomain in output_data 62 | if len(subdomain) > 0 63 | ] 64 | return host_to_ip 65 | return [] 66 | 67 | async def get_hostnames(self) -> list: 68 | return self.total_results 69 | 70 | async def do_search(self) -> None: 71 | subdomain_payload = { 72 | "op": "start_scan", 73 | "tool_id": 20, 74 | "tool_params": { 75 | "target": f"{self.word}", 76 | "web_details": "off", 77 | "do_smart_search": "off", 78 | }, 79 | } 80 | responses = await AsyncFetcher.post_fetch( 81 | url=self.api, data=ujson.dumps(subdomain_payload), proxy=self.proxy 82 | ) 83 | res_json = ujson.loads(responses.strip()) 84 | if res_json["op_status"] == "success": 85 | scan_id = res_json["scan_id"] 86 | await self.poll(scan_id) 87 | 88 | async def process(self, proxy: bool = False) -> None: 89 | self.proxy = proxy 90 | await self.do_search() # Only need to do it once. 91 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/projectdiscovery.py: -------------------------------------------------------------------------------- 1 | from theHarvester.discovery.constants import MissingKey 2 | from theHarvester.lib.core import AsyncFetcher, Core 3 | 4 | 5 | class SearchDiscovery: 6 | def __init__(self, word) -> None: 7 | self.word = word 8 | self.key = Core.projectdiscovery_key() 9 | if self.key is None: 10 | raise MissingKey("ProjectDiscovery") 11 | self.total_results = None 12 | self.proxy = False 13 | 14 | async def do_search(self): 15 | url = f"https://dns.projectdiscovery.io/dns/{self.word}/subdomains" 16 | response = await AsyncFetcher.fetch_all( 17 | [url], 18 | json=True, 19 | headers={"User-Agent": Core.get_user_agent(), "Authorization": self.key}, 20 | proxy=self.proxy, 21 | ) 22 | self.total_results = [ 23 | f"{domains}.{self.word}" for domains in response[0]["subdomains"] 24 | ] 25 | 26 | async def get_hostnames(self): 27 | return self.total_results 28 | 29 | async def process(self, proxy: bool = False) -> None: 30 | self.proxy = proxy 31 | await self.do_search() 32 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/rapiddns.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | from theHarvester.lib.core import AsyncFetcher, Core 4 | 5 | 6 | class SearchRapidDns: 7 | def __init__(self, word) -> None: 8 | self.word = word 9 | self.total_results: list = [] 10 | self.proxy = False 11 | 12 | async def do_search(self): 13 | try: 14 | headers = {"User-agent": Core.get_user_agent()} 15 | # TODO see if it's worth adding sameip searches 16 | # f'{self.hostname}/sameip/{self.word}?full=1#result' 17 | urls = [f"https://rapiddns.io/subdomain/{self.word}?full=1#result"] 18 | responses = await AsyncFetcher.fetch_all( 19 | urls, headers=headers, proxy=self.proxy 20 | ) 21 | if len(responses[0]) <= 1: 22 | return self.total_results 23 | soup = BeautifulSoup(responses[0], "html.parser") 24 | rows = soup.find("table").find("tbody").find_all("tr") 25 | if rows: 26 | # Validation check 27 | for row in rows: 28 | cells = row.find_all("td") 29 | if len(cells) > 0: 30 | # sanity check 
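# (editor's note) Each RapidDNS table row is assumed to be laid out as
# [domain, address, type, ...]: cells[0] holds the subdomain and
# cells[-1] the record type, which is why CNAME rows are emitted below
# without the address column.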
31 | subdomain = str(cells[0].get_text()) 32 | if cells[-1].get_text() == "CNAME": 33 | self.total_results.append(f"{subdomain}") 34 | else: 35 | self.total_results.append( 36 | f"{subdomain}:{str(cells[1].get_text()).strip()}" 37 | ) 38 | self.total_results = list({domain for domain in self.total_results}) 39 | except Exception as e: 40 | print(f"An exception has occurred: {str(e)}") 41 | 42 | async def process(self, proxy: bool = False) -> None: 43 | self.proxy = proxy 44 | await self.do_search() 45 | 46 | async def get_hostnames(self): 47 | return self.total_results 48 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/rocketreach.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Set 3 | 4 | from theHarvester.discovery.constants import MissingKey, get_delay 5 | from theHarvester.lib.core import AsyncFetcher, Core 6 | 7 | 8 | class SearchRocketReach: 9 | def __init__(self, word, limit) -> None: 10 | self.ips: Set = set() 11 | self.word = word 12 | self.key = Core.rocketreach_key() 13 | if self.key is None: 14 | raise MissingKey("RocketReach") 15 | self.hosts: Set = set() 16 | self.proxy = False 17 | self.baseurl = "https://rocketreach.co/api/v2/person/search" 18 | self.links: Set = set() 19 | self.limit = limit 20 | 21 | async def do_search(self) -> None: 22 | try: 23 | headers = { 24 | "Api-Key": self.key, 25 | "Content-Type": "application/json", 26 | "User-Agent": Core.get_user_agent(), 27 | } 28 | 29 | next_page = 1 # track pagination 30 | for count in range(1, self.limit): 31 | data = f'{{"query":{{"company_domain": ["{self.word}"]}}, "start": {next_page}, "page_size": 100}}' 32 | result = await AsyncFetcher.post_fetch( 33 | self.baseurl, headers=headers, data=data, json=True 34 | ) 35 | if ( 36 | "detail" in result.keys() 37 | and "error" in result.keys() 38 | and "Subscribe to a plan to access" in result["detail"] 39 | ): 40 | # No more results can be fetched 41 | break 42 | if ( 43 | "detail" in result.keys() 44 | and "Request was throttled." 
in result["detail"] 45 | ): 46 | # Rate limit has been triggered need to sleep extra 47 | print( 48 | f"RocketReach requests have been throttled; " 49 | f'{result["detail"].split(" ", 3)[-1].replace("available", "availability")}' 50 | ) 51 | break 52 | if "profiles" in dict(result).keys(): 53 | if len(result["profiles"]) == 0: 54 | break 55 | for profile in result["profiles"]: 56 | if "linkedin_url" in dict(profile).keys(): 57 | self.links.add(profile["linkedin_url"]) 58 | if "pagination" in dict(result).keys(): 59 | next_page = int(result["pagination"]["next"]) 60 | if next_page > int(result["pagination"]["total"]): 61 | break 62 | 63 | await asyncio.sleep(get_delay() + 5) 64 | 65 | except Exception as e: 66 | print(f"An exception has occurred: {e}") 67 | 68 | async def get_links(self): 69 | return self.links 70 | 71 | async def process(self, proxy: bool = False) -> None: 72 | self.proxy = proxy 73 | await self.do_search() 74 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/searchhunterhow.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from datetime import datetime 3 | from typing import Set 4 | 5 | from dateutil.relativedelta import relativedelta 6 | 7 | from theHarvester.discovery.constants import MissingKey 8 | from theHarvester.lib.core import AsyncFetcher, Core 9 | 10 | 11 | class SearchHunterHow: 12 | def __init__(self, word) -> None: 13 | self.word = word 14 | self.total_hostnames: Set = set() 15 | self.key = Core.hunterhow_key() 16 | if self.key is None: 17 | raise MissingKey("hunterhow") 18 | self.proxy = False 19 | 20 | async def do_search(self) -> None: 21 | # https://hunter.how/search-api 22 | query = f'domain.suffix="{self.word}"' 23 | # second_query = f'domain="{self.word}"' 24 | encoded_query = base64.urlsafe_b64encode(query.encode("utf-8")).decode("ascii") 25 | page = 1 26 | page_size = 100 # can be either: 10,20,50,100) 27 | # The interval between the start time and the end time cannot exceed one year 28 | # Can not exceed one year, but years=1 does not work due to their backend, 364 will suffice 29 | today = datetime.today() 30 | one_year_ago = today - relativedelta(days=364) 31 | start_time = one_year_ago.strftime("%Y-%m-%d") 32 | end_time = today.strftime("%Y-%m-%d") 33 | # two_years_ago = one_year_ago - relativedelta(days=364) 34 | # start_time = two_years_ago.strftime('%Y-%m-%d') 35 | # end_time = one_year_ago.strftime('%Y-%m-%d') 36 | url = ( 37 | "https://api.hunter.how/search?api-key=%s&query=%s&page=%d&page_size=%d&start_time=%s&end_time=%s" 38 | % ( 39 | # self.key, encoded_query, page, page_size, start_time, end_time 40 | self.key, 41 | encoded_query, 42 | page, 43 | page_size, 44 | start_time, 45 | end_time, 46 | ) 47 | ) 48 | # print(f'Sending url: {url}') 49 | response = await AsyncFetcher.fetch_all( 50 | [url], 51 | json=True, 52 | headers={"User-Agent": Core.get_user_agent(), "x-api-key": f"{self.key}"}, 53 | proxy=self.proxy, 54 | ) 55 | dct = response[0] 56 | # print(f'json response: ') 57 | # print(dct) 58 | if "code" in dct.keys(): 59 | if dct["code"] == 40001: 60 | print(f'Code 40001 indicates for searchhunterhow: {dct["message"]}') 61 | return 62 | # total = dct['data']['total'] 63 | # TODO determine if total is ever 100 how to get more subdomains? 
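# (editor's sketch for the TODO above, untested) hunter.how paginates its
# results, so one hedged fix is to read the commented-out
# total = dct['data']['total'] above and repeat the request with
# page += 1 (re-building the url each time) until page * page_size >= total.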
64 | for sub in dct["data"]["list"]: 65 | self.total_hostnames.add(sub["domain"]) 66 | 67 | async def get_hostnames(self) -> set: 68 | return self.total_hostnames 69 | 70 | async def process(self, proxy: bool = False) -> None: 71 | self.proxy = proxy 72 | await self.do_search() 73 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/securitytrailssearch.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Sequence 3 | 4 | from theHarvester.discovery.constants import MissingKey 5 | from theHarvester.lib.core import AsyncFetcher, Core 6 | from theHarvester.parsers import securitytrailsparser 7 | 8 | 9 | class SearchSecuritytrail: 10 | def __init__(self, word) -> None: 11 | self.word = word 12 | self.key = Core.security_trails_key() 13 | if self.key is None: 14 | raise MissingKey("Securitytrail") 15 | self.results = "" 16 | self.totalresults = "" 17 | self.api = "https://api.securitytrails.com/v1/" 18 | self.info: tuple[set, set] = (set(), set()) 19 | self.proxy = False 20 | 21 | async def authenticate(self) -> None: 22 | # Method to authenticate API key before sending requests. 23 | headers = {"APIKEY": self.key} 24 | url = f"{self.api}ping" 25 | auth_responses = await AsyncFetcher.fetch_all( 26 | [url], headers=headers, proxy=self.proxy 27 | ) 28 | auth_responses = auth_responses[0] 29 | if "False" in auth_responses or "Invalid authentication" in auth_responses: 30 | print("\tKey could not be authenticated; exiting program.") 31 | await asyncio.sleep(5) 32 | 33 | async def do_search(self) -> None: 34 | # https://api.securitytrails.com/v1/domain/domain.com 35 | url = f"{self.api}domain/{self.word}" 36 | headers = {"APIKEY": self.key} 37 | response = await AsyncFetcher.fetch_all( 38 | [url], headers=headers, proxy=self.proxy 39 | ) 40 | await asyncio.sleep( 41 | 5 42 | ) # Fixed (non-random) delay; the rate limit requires at least 2 seconds between requests. 43 | self.results = response[0] 44 | self.totalresults += self.results 45 | url += "/subdomains" # Get subdomains now. 46 | subdomain_response = await AsyncFetcher.fetch_all( 47 | [url], headers=headers, proxy=self.proxy 48 | ) 49 | await asyncio.sleep(5) 50 | self.results = subdomain_response[0] 51 | self.totalresults += self.results 52 | 53 | async def process(self, proxy: bool = False) -> None: 54 | self.proxy = proxy 55 | await self.authenticate() 56 | await self.do_search() 57 | parser = securitytrailsparser.Parser(word=self.word, text=self.totalresults) 58 | self.info = await parser.parse_text() 59 | # parse_text() returns a (ips, hostnames) tuple; see the getters below. 
60 | print("\tDone Searching Results") 61 | 62 | async def get_ips(self) -> set: 63 | return self.info[0] 64 | 65 | async def get_hostnames(self) -> set: 66 | return self.info[1] 67 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/shodansearch.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from typing import List 3 | 4 | from shodan import Shodan, exception 5 | 6 | from theHarvester.discovery.constants import MissingKey 7 | from theHarvester.lib.core import Core 8 | 9 | 10 | class SearchShodan: 11 | def __init__(self) -> None: 12 | self.key = Core.shodan_key() 13 | if self.key is None: 14 | raise MissingKey("Shodan") 15 | self.api = Shodan(self.key) 16 | self.hostdatarow: List = [] 17 | self.tracker: OrderedDict = OrderedDict() 18 | 19 | async def search_ip(self, ip) -> OrderedDict: 20 | try: 21 | ipaddress = ip 22 | results = self.api.host(ipaddress) 23 | asn = "" 24 | domains: List = list() 25 | hostnames: List = list() 26 | ip_str = "" 27 | isp = "" 28 | org = "" 29 | ports: List = list() 30 | title = "" 31 | server = "" 32 | product = "" 33 | technologies: List = list() 34 | 35 | data_first_dict = dict(results["data"][0]) 36 | 37 | if "ip_str" in data_first_dict.keys(): 38 | ip_str += data_first_dict["ip_str"] 39 | 40 | if "http" in data_first_dict.keys(): 41 | http_results_dict = dict(data_first_dict["http"]) 42 | if "title" in http_results_dict.keys(): 43 | title_val = str(http_results_dict["title"]).strip() 44 | if title_val != "None": 45 | title += title_val 46 | if "components" in http_results_dict.keys(): 47 | for key in http_results_dict["components"].keys(): 48 | technologies.append(key) 49 | if "server" in http_results_dict.keys(): 50 | server_val = str(http_results_dict["server"]).strip() 51 | if server_val != "None": 52 | server += server_val 53 | 54 | for key, value in results.items(): 55 | if key == "asn": 56 | asn += value 57 | if key == "domains": 58 | value = list(value) 59 | value.sort() 60 | domains.extend(value) 61 | if key == "hostnames": 62 | value = [host.strip() for host in list(value)] 63 | value.sort() 64 | hostnames.extend(value) 65 | if key == "isp": 66 | isp += value 67 | if key == "org": 68 | org += str(value) 69 | if key == "ports": 70 | value = list(value) 71 | value.sort() 72 | ports.extend(value) 73 | if key == "product": 74 | product += value 75 | 76 | technologies = list(set(technologies)) 77 | 78 | self.tracker[ip] = { 79 | "asn": asn.strip(), 80 | "domains": domains, 81 | "hostnames": hostnames, 82 | "ip_str": ip_str.strip(), 83 | "isp": isp.strip(), 84 | "org": org.strip(), 85 | "ports": ports, 86 | "product": product.strip(), 87 | "server": server.strip(), 88 | "technologies": technologies, 89 | "title": title.strip(), 90 | } 91 | 92 | return self.tracker 93 | except exception.APIError: 94 | print(f"{ip}: Not in Shodan") 95 | self.tracker[ip] = "Not in Shodan" 96 | except Exception as e: 97 | # print(f'Error occurred in the Shodan IP search module: {e}') 98 | self.tracker[ip] = f"Error occurred in the Shodan IP search module: {e}" 99 | finally: 100 | return self.tracker 101 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/subdomaincenter.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher, Core 2 | 3 | 4 | class SubdomainCenter: 5 | def 
__init__(self, word): 6 | self.word = word 7 | self.results = set() 8 | self.server = "https://api.subdomain.center/?domain=" 9 | self.proxy = False 10 | 11 | async def do_search(self): 12 | headers = {"User-Agent": Core.get_user_agent()} 13 | try: 14 | current_url = f"{self.server}{self.word}" 15 | resp = await AsyncFetcher.fetch_all( 16 | [current_url], headers=headers, proxy=self.proxy, json=True 17 | ) 18 | self.results = resp[0] 19 | self.results = { 20 | sub[4:] if sub[:4] == "www." and sub[4:] else sub 21 | for sub in self.results 22 | } 23 | except Exception as e: 24 | print(f"An exception has occurred in SubdomainCenter on : {e}") 25 | 26 | async def get_hostnames(self): 27 | return self.results 28 | 29 | async def process(self, proxy=False): 30 | self.proxy = proxy 31 | await self.do_search() 32 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/subdomainfinderc99.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Set 3 | 4 | import ujson 5 | from bs4 import BeautifulSoup 6 | 7 | from theHarvester.discovery.constants import get_delay 8 | from theHarvester.lib.core import AsyncFetcher, Core 9 | from theHarvester.parsers import myparser 10 | 11 | 12 | class SearchSubdomainfinderc99: 13 | def __init__(self, word) -> None: 14 | self.word = word 15 | self.total_results: Set = set() 16 | self.proxy = False 17 | # TODO add api support 18 | self.server = "https://subdomainfinder.c99.nl/" 19 | self.totalresults = "" 20 | 21 | async def do_search(self) -> None: 22 | # Based on https://gist.github.com/th3gundy/bc83580cbe04031e9164362b33600962 23 | headers = {"User-Agent": Core.get_user_agent()} 24 | resp = await AsyncFetcher.fetch_all( 25 | [self.server], headers=headers, proxy=self.proxy 26 | ) 27 | data = await self.get_csrf_params(resp[0]) 28 | 29 | data["scan_subdomains"] = "" 30 | data["domain"] = self.word 31 | data["privatequery"] = "on" 32 | await asyncio.sleep(get_delay()) 33 | second_resp = await AsyncFetcher.post_fetch( 34 | self.server, headers=headers, proxy=self.proxy, data=ujson.dumps(data) 35 | ) 36 | 37 | # print(second_resp) 38 | self.totalresults += second_resp 39 | # y = await self.get_hostnames() 40 | # print(list(sorted(y))) 41 | # print(f'Found: {len(y)} subdomains') 42 | 43 | # regex = r"value='(https://subdomainfinder\.c99\.nl/scans/\d{4}-\d{2}-\d{2}/" + self.word + r")'" 44 | # match = re.search(regex, second_resp) 45 | # if match: 46 | # print(match.group(1)) 47 | 48 | async def get_hostnames(self): 49 | rawres = myparser.Parser(self.totalresults, self.word) 50 | return await rawres.hostnames() 51 | 52 | async def process(self, proxy: bool = False) -> None: 53 | self.proxy = proxy 54 | await self.do_search() 55 | 56 | @staticmethod 57 | async def get_csrf_params(data): 58 | csrf_params = {} 59 | html = BeautifulSoup(data, "html.parser").find("div", {"class": "input-group"}) 60 | for c in html.find_all("input"): 61 | try: 62 | csrf_params[c.get("name")] = c.get("value") 63 | except Exception: 64 | continue 65 | 66 | return csrf_params 67 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/threatminer.py: -------------------------------------------------------------------------------- 1 | from typing import Set 2 | 3 | from theHarvester.lib.core import AsyncFetcher 4 | 5 | 6 | class SearchThreatminer: 7 | def __init__(self, word) -> None: 8 | 
self.word = word 9 | self.totalhosts: Set = set() 10 | self.totalips: Set = set() 11 | self.proxy = False 12 | 13 | async def do_search(self) -> None: 14 | url = f"https://api.threatminer.org/v2/domain.php?q={self.word}&rt=5" 15 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 16 | self.totalhosts = {host for host in response[0]["results"]} 17 | second_url = f"https://api.threatminer.org/v2/domain.php?q={self.word}&rt=2" 18 | secondresp = await AsyncFetcher.fetch_all( 19 | [second_url], json=True, proxy=self.proxy 20 | ) 21 | try: 22 | self.totalips = {resp["ip"] for resp in secondresp[0]["results"]} 23 | except TypeError: 24 | pass 25 | 26 | async def get_hostnames(self) -> Set: 27 | return self.totalhosts 28 | 29 | async def get_ips(self) -> Set: 30 | return self.totalips 31 | 32 | async def process(self, proxy: bool = False) -> None: 33 | self.proxy = proxy 34 | await self.do_search() 35 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/urlscan.py: -------------------------------------------------------------------------------- 1 | from typing import Set 2 | 3 | from theHarvester.lib.core import AsyncFetcher 4 | 5 | 6 | class SearchUrlscan: 7 | def __init__(self, word) -> None: 8 | self.word = word 9 | self.totalhosts: Set = set() 10 | self.totalips: Set = set() 11 | self.interestingurls: Set = set() 12 | self.totalasns: Set = set() 13 | self.proxy = False 14 | 15 | async def do_search(self) -> None: 16 | url = f"https://urlscan.io/api/v1/search/?q=domain:{self.word}" 17 | response = await AsyncFetcher.fetch_all([url], json=True, proxy=self.proxy) 18 | resp = response[0] 19 | self.totalhosts = {f"{page['page']['domain']}" for page in resp["results"]} 20 | self.totalips = { 21 | f"{page['page']['ip']}" 22 | for page in resp["results"] 23 | if "ip" in page["page"].keys() 24 | } 25 | self.interestingurls = { 26 | f"{page['page']['url']}" 27 | for page in resp["results"] 28 | if self.word in page["page"]["url"] and "url" in page["page"].keys() 29 | } 30 | self.totalasns = { 31 | f"{page['page']['asn']}" 32 | for page in resp["results"] 33 | if "asn" in page["page"].keys() 34 | } 35 | 36 | async def get_hostnames(self) -> Set: 37 | return self.totalhosts 38 | 39 | async def get_ips(self) -> Set: 40 | return self.totalips 41 | 42 | async def get_interestingurls(self) -> Set: 43 | return self.interestingurls 44 | 45 | async def get_asns(self) -> Set: 46 | return self.totalasns 47 | 48 | async def process(self, proxy: bool = False) -> None: 49 | self.proxy = proxy 50 | await self.do_search() 51 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/discovery/yahoosearch.py: -------------------------------------------------------------------------------- 1 | from theHarvester.lib.core import AsyncFetcher, Core 2 | from theHarvester.parsers import myparser 3 | 4 | 5 | class SearchYahoo: 6 | def __init__(self, word, limit) -> None: 7 | self.word = word 8 | self.total_results = "" 9 | self.server = "search.yahoo.com" 10 | self.limit = limit 11 | self.proxy = False 12 | 13 | async def do_search(self) -> None: 14 | base_url = f"https://{self.server}/search?p=%40{self.word}&b=xx&pz=10" 15 | headers = {"Host": self.server, "User-agent": Core.get_user_agent()} 16 | urls = [ 17 | base_url.replace("xx", str(num)) 18 | for num in range(0, self.limit, 10) 19 | if num <= self.limit 20 | ] 21 | responses = await AsyncFetcher.fetch_all( 22 | urls, 
headers=headers, proxy=self.proxy 23 | ) 24 | for response in responses: 25 | self.total_results += response 26 | 27 | async def process(self, proxy: bool = False) -> None: 28 | self.proxy = proxy 29 | await self.do_search() 30 | 31 | async def get_emails(self): 32 | rawres = myparser.Parser(self.total_results, self.word) 33 | toparse_emails = await rawres.emails() 34 | emails = set() 35 | # strip out numbers and dashes for emails that look like xxx-xxx-xxxemail@host.tld 36 | for email in toparse_emails: 37 | email = str(email) 38 | if "-" in email and email[0].isdigit() and email.index("-") <= 9: 39 | while email[0] == "-" or email[0].isdigit(): 40 | email = email[1:] 41 | emails.add(email) 42 | return list(emails) 43 | 44 | async def get_hostnames(self, proxy: bool = False): 45 | self.proxy = proxy 46 | rawres = myparser.Parser(self.total_results, self.word) 47 | return await rawres.hostnames() 48 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/theHarvester/theHarvester/parsers/__init__.py -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/parsers/intelxparser.py: -------------------------------------------------------------------------------- 1 | from typing import Set 2 | 3 | 4 | class Parser: 5 | def __init__(self) -> None: 6 | self.emails: Set = set() 7 | self.hosts: Set = set() 8 | 9 | async def parse_dictionaries(self, results: dict) -> tuple: 10 | """ 11 | Parse method to parse json results 12 | :param results: Dictionary containing a list of dictionaries known as selectors 13 | :return: tuple of emails and hosts 14 | """ 15 | if results is not None: 16 | for dictionary in results["selectors"]: 17 | field = dictionary["selectorvalue"] 18 | if "@" in field: 19 | self.emails.add(field) 20 | else: 21 | field = str(field) 22 | if "http" in field or "https" in field: 23 | if field[:5] == "https": 24 | field = field[8:] 25 | else: 26 | field = field[7:] 27 | self.hosts.add(field.replace(")", "").replace(",", "")) 28 | return self.emails, self.hosts 29 | return None, None 30 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/parsers/securitytrailsparser.py: -------------------------------------------------------------------------------- 1 | from typing import List, Set, Tuple, Union 2 | 3 | 4 | class Parser: 5 | def __init__(self, word, text) -> None: 6 | self.word = word 7 | self.text = text 8 | self.hostnames: Set = set() 9 | self.ips: Set = set() 10 | 11 | async def parse_text(self) -> Tuple[set, set]: 12 | sub_domain_flag = 0 13 | self.text = str(self.text).splitlines() 14 | # Split lines to get a list of lines. 15 | for index in range(0, len(self.text)): 16 | line = self.text[index].strip() 17 | if '"ip":' in line: 18 | # Extract IP. 
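# (editor's note) line[7:] skips the leading '"ip": "' prefix, then the
# loop below copies characters up to the closing quote. If the response
# is guaranteed to be valid JSON, json.loads() would be a sturdier
# alternative to this line-by-line scan.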
19 | ip = "" 20 | for ch in line[7:]: 21 | if ch == '"': 22 | break 23 | else: 24 | ip += ch 25 | self.ips.add(ip) 26 | elif '"subdomains":' in line: 27 | # subdomains start here so set flag to 1 28 | sub_domain_flag = 1 29 | continue 30 | elif sub_domain_flag > 0: 31 | if "]" in line: 32 | sub_domain_flag = 0 33 | else: 34 | if "www" in self.word: 35 | self.word = ( 36 | str(self.word).replace("www.", "").replace("www", "") 37 | ) 38 | # Remove www from word if entered 39 | self.hostnames.add( 40 | str(line).replace('"', "").replace(",", "") + "." + self.word 41 | ) 42 | else: 43 | continue 44 | return self.ips, self.hostnames 45 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/restfulHarvest.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import uvicorn 4 | 5 | 6 | def main(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument( 9 | "-H", 10 | "--host", 11 | default="127.0.0.1", 12 | help="IP address to listen on default is 127.0.0.1", 13 | ) 14 | parser.add_argument( 15 | "-p", 16 | "--port", 17 | default=5000, 18 | help="Port to bind the web server to, default is 5000", 19 | type=int, 20 | ) 21 | parser.add_argument( 22 | "-l", 23 | "--log-level", 24 | default="info", 25 | help="Set logging level, default is info but [critical|error|warning|info|debug|trace] can be set", 26 | ) 27 | parser.add_argument( 28 | "-r", 29 | "--reload", 30 | default=False, 31 | help="Enable automatic reload used during development of the api", 32 | action="store_true", 33 | ) 34 | 35 | args: argparse.Namespace = parser.parse_args() 36 | uvicorn.run( 37 | "theHarvester.lib.api.api:app", 38 | host=args.host, 39 | port=args.port, 40 | log_level=args.log_level, 41 | reload=args.reload, 42 | ) 43 | 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/screenshot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sweetnight19/InfoHunter/d8de853d42017328b4f36483068ef99a052161fb/src/theHarvester/theHarvester/screenshot/__init__.py -------------------------------------------------------------------------------- /src/theHarvester/theHarvester/theHarvester.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import sys 3 | 4 | from theHarvester import __main__ 5 | 6 | 7 | def main(): 8 | platform = sys.platform 9 | if platform == "win32": 10 | # Required or things will break if trying to take screenshots 11 | import multiprocessing 12 | 13 | multiprocessing.freeze_support() 14 | asyncio.DefaultEventLoopPolicy = asyncio.WindowsSelectorEventLoopPolicy 15 | else: 16 | import uvloop 17 | 18 | uvloop.install() 19 | 20 | if "linux" in platform: 21 | import aiomultiprocess 22 | 23 | # As we are not using Windows, we can change the spawn method to fork for greater performance 24 | aiomultiprocess.set_context("fork") 25 | asyncio.run(__main__.entry_point()) 26 | -------------------------------------------------------------------------------- /wordlists/dorks.txt: -------------------------------------------------------------------------------- 1 | inurl:"contact" 2 | intext:email filetype:log 3 | "Index of /mail" 4 | "admin account info" filetype:log 5 | intext:@ 6 | administrator accounts/ 7 | intitle:"Index of" .bash_history 8 | intitle:"index of" 
members OR accounts 9 | inurl:/shared/help.php 10 | inurl:public 11 | intitle:index.of inbox 12 | intitle:"Server Administration" 13 | inurl:passwd.txt 14 | robots.txt 15 | php-addressbook "This is the addressbook for *" -warning -------------------------------------------------------------------------------- /wordlists/general/common.txt: -------------------------------------------------------------------------------- 1 | admin 2 | test 3 | hello 4 | uk 5 | login 6 | book 7 | robots.txt 8 | --------------------------------------------------------------------------------