├── src
    └── godork
    │   ├── __init__.py
    │   ├── utils
    │       ├── __init__.py
    │       ├── colors.py
    │       ├── exceptions.py
    │       ├── parse.py
    │       ├── banner.py
    │       └── user_agents.py
    │   ├── helpers
    │       ├── __init__.py
    │       ├── options.py
    │       ├── extractor.py
    │       ├── reports.py
    │       └── console.py
    │   ├── services
    │       ├── __init__.py
    │       ├── driver.py
    │       ├── version.py
    │       ├── requester.py
    │       ├── recaptcha.py
    │       └── scrape.py
    │   └── godork.py
├── MANIFEST.in
├── requirements.txt
├── .dockerignore
├── LICENSE
├── setup.py
├── Dockerfile
├── .gitignore
└── README.md


/src/godork/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/godork/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/godork/helpers/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/godork/services/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE
3 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | bs4
 2 | pydub
 3 | psutil
 4 | aiohttp
 5 | asyncio
 6 | selenium
 7 | setuptools
 8 | SpeechRecognition
 9 | webdriver-manager
10 | undetected-chromedriver
11 | 


--------------------------------------------------------------------------------
/src/godork/utils/colors.py:
--------------------------------------------------------------------------------
 1 | class Bgcolor:
 2 | 
 3 |     # Just a bunch of colors in one place.
 4 |     
 5 |     DEFAULT  = '\033[0m'
 6 |     WARNING  = '\033[33m'
 7 |     PURPLE   = '\033[35m'
 8 |     GREEN    = '\033[32m'
 9 |     BOLD     = '\033[1m'
10 |     GRAY     = '\033[2m'
11 |     BLUE     = '\033[34m'
12 |     CYAN     = '\033[36m'
13 |     RED      = '\033[31m'


--------------------------------------------------------------------------------
/src/godork/utils/exceptions.py:
--------------------------------------------------------------------------------
 1 | class GodorkException(Exception):
 2 |     # This exception is used for general errors
 3 |     pass
 4 | 
 5 | class GodorkTimeout(TimeoutError):
 6 |     # This exception is used for timeout errors
 7 |     pass
 8 | 
 9 | class GodorkMaxRetries(Exception):
10 |     # This exception is used for max retries errors
11 |     pass
12 | 
13 | class GodorkNoData(Exception):
14 |     # This exception is used for no data errors
15 |     pass


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
 1 | # Version control
 2 | .git
 3 | .github
 4 | .gitignore
 5 | 
 6 | # Environment and cache
 7 | .venv
 8 | .env
 9 | .env.local
10 | __pycache__
11 | *.pyc
12 | *.pyo
13 | *.pyd
14 | .Python
15 | .pytest_cache
16 | .pdm-build
17 | 
18 | # Distribution / packaging
19 | dist
20 | build
21 | *.egg-info
22 | 
23 | # Development
24 | .vscode
25 | .idea
26 | *.swp
27 | *.swo
28 | 
29 | # Docs
30 | docs/site
31 | # Notebooks
32 | notebooks/.ipynb_checkpoints
33 | 
34 | # Docker
35 | Dockerfile
36 | .dockerignore 
37 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023-2025 Thunder
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/godork/godork.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import asyncio
 4 | 
 5 | from .utils.colors import Bgcolor
 6 | from .helpers.console import Console
 7 | from .helpers.options import OptionParser
 8 | from .services.version import check_version
 9 | from .services.scrape import Scraper
10 | 
def main():
    """Entry point of the ``godork`` command.

    Runs the version check, parses the command line and, when a dork was
    given, builds a ``Scraper`` and drives its ``run_with_async`` coroutine
    with ``asyncio.run``. Without a dork an error/usage text is printed and
    the function returns. A ``KeyboardInterrupt`` raised while scraping is
    answered with a goodbye message.
    """
    check_version()

    options = OptionParser.argument_parser()

    # An empty --dorks value means there is nothing to search for.
    if len(options.dorks) == 0:
        usage_error = f"""{Bgcolor.RED}error{Bgcolor.DEFAULT}: the following required arguments were not provided:
  --dorks <DORKS>
              
usage: godork --dorks <DORKS>

For more information, try 'godork --help'"""
        print(usage_error)
        return

    scraper_settings = {
        "dorks": options.dorks,
        "proxy": options.proxy,
        "debug": options.debug,
        "retries": options.retries,
        "max_retries": options.max_retries,
        # The parsed value of --no-headless is handed over unchanged as the
        # headless switch.
        "headless_mode": options.no_headless,
    }
    scrape = Scraper(**scraper_settings)

    try:
        asyncio.run(scrape.run_with_async())
    except KeyboardInterrupt:
        farewell = Console().text_format('info', msg='We appreciate your use of our tool ;) Goodbye!')
        print(f"\r{farewell}")
38 | 
39 | if __name__ == '__main__':
40 |     main()


--------------------------------------------------------------------------------
/src/godork/utils/parse.py:
--------------------------------------------------------------------------------
 1 | from urllib.parse import urlparse, parse_qs, unquote
 2 | 
 3 | def get_page_num(url):
 4 |     """
 5 |     This function handles URL parsing, extracts query parameters, gets their values and returns the value
 6 |     """
 7 | 
 8 |     parsed = urlparse(unquote(url))
 9 |     query_params = parse_qs(parsed.query)
10 | 
11 |     return query_params["start"][0]
12 | 
def get_query(url):
    """Return the search query (``q`` parameter) carried by *url*.

    *url* is converted with ``str()`` and percent-decoded as a whole before
    parsing. If the decoded query has a ``continue`` parameter, that value is
    parsed as a URL and its ``q`` is returned. Otherwise the ``q`` of *url*
    itself is returned.

    Raises ``KeyError`` when the relevant query has no ``q`` parameter.
    """

    outer_params = parse_qs(urlparse(unquote(str(url))).query)

    if "continue" not in outer_params:
        return outer_params["q"][0]

    # The real search URL travels inside the ``continue`` parameter.
    continue_url = outer_params["continue"][0]
    inner_params = parse_qs(urlparse(continue_url).query)
    return inner_params["q"][0]
25 | 
def set_page_num(num):
    """Convert a result offset into a page number.

    *num* is passed through ``int()``; every full block of ten counts as one
    page and the numbering starts at 1 (0-9 -> 1, 10-19 -> 2, ...).
    """

    completed_pages, _ = divmod(int(num), 10)
    return completed_pages + 1
32 | 
def no_data(data_title):
    """Tell whether *data_title* is an empty collection.

    Returns ``True`` when ``len(data_title)`` is below 1. Objects that have no
    length at all (for example ``None``) are not reported as "no data" and
    yield ``False``.
    """

    try:
        return len(data_title) < 1
    except TypeError:
        # Only the "object has no len()" case is tolerated. A bare ``except``
        # would also swallow KeyboardInterrupt and SystemExit.
        return False


--------------------------------------------------------------------------------
/src/godork/utils/banner.py:
--------------------------------------------------------------------------------
"""
Startup banner for the program.

The print_banner function prints an ASCII-art banner when the program is launched,
so the user immediately sees which version is running and what its status is.

Purpose:

    * The banner enhances the user experience by visually indicating:

        - The tool's name or identity (in this case, associated with thd3r & societyprojects)
        - The current version of the tool (CURRENT_VERSION)
        - The status of the tool (e.g., latest, outdated, or other custom labels)

"""
15 | 
def print_banner(status, version):
    """Print the ASCII-art startup banner.

    Args:
        status: Value interpolated next to the fourth line of the logo.
        version: Value interpolated next to the third line of the logo.
    """
    # Raw f-string: the backslashes of the logo stay literal while
    # {version} and {status} are still substituted.
    banner = rf"""
                             __         __  
               ___ ____  ___/ /__  ____/ /__
              / _ `/ _ \/ _  / _ \/ __/  '_/  {version}
              \_, /\___/\_,_/\___/_/ /_/\_\    {status}
             /___/                                                                                                            
                        thd3r & societyprojects                       
    """
    print(banner)
26 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from setuptools import setup, find_packages
 4 | 
 5 | version = re.search(
 6 |     r'^CURRENT_VERSION\s*=\s*"(.*)"',
 7 |     open('src/godork/services/version.py').read(),
 8 |     re.M
 9 | ).group(1)
10 | 
11 | setup(
12 |     name='godork',
13 |     version=version,
14 |     author='Thunder (@thd3r)',
15 |     author_email='thd3r@proton.me',
16 |     description='Advanced & Fast Google Dorking Tool',
17 |     packages=find_packages(where='src'),
18 |     package_dir={'godork': 'src/godork'},
19 |     install_requires=[
20 |         'bs4',
21 |         'rich',
22 |         'pydub',
23 |         'psutil',
24 |         'aiohttp',
25 |         'asyncio',
26 |         'selenium',
27 |         'setuptools',
28 |         'SpeechRecognition',
29 |         'webdriver-manager',
30 |         'undetected-chromedriver',
31 |     ],
32 |     entry_points={
33 |         'console_scripts': [
34 |             'godork = godork.godork:main'
35 |         ]
36 |     },
37 |     license='MIT',
38 |     url='https://github.com/thd3r/godork',
39 |     long_description=open('README.md').read(),
40 |     long_description_content_type='text/markdown',
41 |     keywords=['godork', 'google dorks', 'google dorking'],
42 |     classifiers=(
43 |         'Development Status :: 4 - Beta',
44 |         'Natural Language :: English',
45 |         'Programming Language :: Python :: 3',
46 |     )
47 | )
48 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Use a slim base image
 2 | FROM python:3.12-slim
 3 | 
 4 | # Set environment variables
 5 | ENV PYTHONUNBUFFERED=1 \
 6 |     PIP_NO_CACHE_DIR=1 \
 7 |     PIP_DISABLE_PIP_VERSION_CHECK=1
 8 | 
 9 | # Install system dependencies
10 | RUN apt-get update && apt-get install -y \
11 |     git \
12 |     wget \
13 |     ffmpeg \
14 |     unzip \
15 |     && apt-get clean \
16 |     && python -m pip install --upgrade pip
17 | 
18 | # Set working directory
19 | WORKDIR /app
20 | 
21 | # Download Chrome browser and install
22 | RUN wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
23 |     && apt -f install \
24 |     && apt-get install ./google-chrome-stable_current_amd64.deb -y 
25 | 
26 | # Take the Chrome version and put it through the files
27 | RUN google-chrome --version | cut -d ' ' -f3 | while read -r line; do echo $line > /tmp/google-version.txt; done
28 | 
29 | # Download chromedriver based on Chrome browser version
30 | RUN cat /tmp/google-version.txt | while read -r version; do wget https://storage.googleapis.com/chrome-for-testing-public/$version/linux64/chromedriver-linux64.zip; done
31 | 
32 | # Extract chromedriver and move the path
33 | RUN unzip /app/chromedriver-linux64.zip && cp /app/chromedriver-linux64/chromedriver /usr/bin
34 | 
35 | # Remove tracks
36 | RUN rm /app/google-chrome-stable_current_amd64.deb && rm -rf /app/chromedriver-linux64
37 | 
38 | # Copy the entire project
39 | COPY . /app/
40 | 
41 | # Install dependencies using pip
42 | RUN pip install -r requirements.txt
43 | 
44 | # Install the godork tool
45 | RUN python setup.py install
46 | 
47 | # Set entrypoint
48 | ENTRYPOINT ["godork"]
49 | 


--------------------------------------------------------------------------------
/src/godork/utils/user_agents.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | 
 3 | USER_AGENTS = [
 4 |     "Mozilla/5.0 (Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0",
 5 |     "Mozilla/5.0 (Linux x86_64; rv:96.0) Gecko/20100101 Firefox/96.0",
 6 |     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:57.0) Gecko/20100101 Firefox/57.0",
 7 |     "Mozilla/5.0 (Macintosh; Intel Mac OS X 12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
 8 |     "Mozilla/5.0 (Macintosh; Intel Mac OS X 12.1; rv:91.0) Gecko/20100101 Firefox/91.0",
 9 |     "Mozilla/5.0 (Macintosh; Intel Mac OS X 12.1; rv:96.0) Gecko/20100101 Firefox/96.0",
10 |     "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
11 |     "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
12 |     "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
13 |     "Mozilla/5.0 (Windows NT 10.0; WOW64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 OPR/83.0.4254.16",
14 |     "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0",
15 |     "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0",
16 |     "Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:96.0) Gecko/20100101 Firefox/96.0",
17 |     "Mozilla/5.0 (X11; Linux i686; rv:91.0) Gecko/20100101 Firefox/91.0",
18 |     "Mozilla/5.0 (X11; Linux i686; rv:96.0) Gecko/20100101 Firefox/96.0",
19 |     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
20 |     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
21 |     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
22 |     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
23 |     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 OPR/83.0.4254.16",
24 |     "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/63.0.3239.84 Chrome/63.0.3239.84 Safari/537.36",
25 |     "Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0",
26 |     "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0",
27 |     "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:96.0) Gecko/20100101 Firefox/96.0"
28 | ]
29 | 
# Chosen once, when this module is first imported: everything that imports
# `random_agent` within the same process gets the same User-Agent string.
random_agent = random.choice(USER_AGENTS)


--------------------------------------------------------------------------------
/src/godork/helpers/options.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | class OptionParser:
 4 | 
 5 |     """
 6 |     The OptionParser class is a lightweight, static utility that serves as the command-line interface (CLI) parser for the GoDork tool. 
 7 |     It leverages Python's built-in argparse module to handle user input from the terminal, making the tool both flexible and easy to configure.
 8 | 
 9 |     Purpose:
10 | 
11 |         * This class is responsible for:
12 | 
13 |             - Defining all supported command-line options
14 |             - Validating and parsing input from the terminal
15 |             - Returning the parsed arguments for use within the application
16 | 
17 |     Key Features:
18 | 
19 |         1. argument_parser() (static method)
20 | 
21 |             * This is the core method of the class. 
22 |               It creates and configures an ArgumentParser instance with several options to tailor the scraping behavior.
23 |             * The method returns a parsed Namespace object that contains all user-specified or default values. This object is then used throughout the tool to control execution flow and feature toggles.
24 | 
25 |     """
26 | 
27 |     @staticmethod
28 |     def argument_parser():
29 |         parser = argparse.ArgumentParser(
30 |             prog="godork",
31 |             usage="%(prog)s [OPTIONS] "
32 |         )
33 |         parser.add_argument(
34 |             "-v",
35 |             "--version",
36 |             action="version",
37 |             version=f"%(prog)s 2.0.5",
38 |         )
39 |         parser.add_argument(
40 |             "-d",
41 |             "--dorks",
42 |             action="store",
43 |             default="",
44 |             help="single dork or file containing multiple dorks"
45 |         )
46 |         parser.add_argument(
47 |             "-p",
48 |             "--proxy",
49 |             action="store",
50 |             help="http proxy to use with godork (e.g. http://127.0.0.1:8080)"
51 |         )
52 |         parser.add_argument(
53 |             "--retries",
54 |             type=int,
55 |             action="store",
56 |             default=40,
57 |             help="retries when request is blocked (default: 40)"
58 |         )
59 |         parser.add_argument(
60 |             "--max-retries",
61 |             type=int,
62 |             action="store",
63 |             default=2,
64 |             help="max attempts to bypass protection mechanisms (default: 2)"
65 |         )
66 |         parser.add_argument(
67 |             "--debug",
68 |             action="store_true",
69 |             default=False,
70 |             help="show detailed logs and error for debugging"
71 |         )
72 |         parser.add_argument(
73 |             "--no-headless",
74 |             action="store_false",
75 |             default=True,
76 |             help="run in graphical mode when bypassing"
77 |         )
78 | 
79 |         return parser.parse_args()


--------------------------------------------------------------------------------
/src/godork/services/driver.py:
--------------------------------------------------------------------------------
 1 | import undetected_chromedriver as uc
 2 | 
 3 | from ..utils.user_agents import random_agent
 4 | 
 5 | from webdriver_manager.chrome import ChromeDriverManager
 6 | from selenium.webdriver import ChromeService
 7 | 
# NOTE: evaluated at import time -- merely importing this module calls
# ChromeDriverManager().install() and stores whatever it returns.
# Presumably that is the local path of the chromedriver binary and the call
# may download it; confirm against the webdriver-manager documentation.
CHROME_DRIVER_PATH = ChromeDriverManager().install() 
 9 | 
class SeleniumDriver:

    """
    Context manager that creates, configures and disposes of a Chrome WebDriver
    built with undetected-chromedriver (uc).

    Usage:

        with SeleniumDriver(headless_mode=True) as driver:
            ...

    Behavior:

        1. Initialization (__init__):

            * Stores the headless_mode flag and sets the driver attribute to None.

        2. Enter (__enter__):

            * Builds a ChromeService from CHROME_DRIVER_PATH.
            * Configures the Chrome options:

                - --disable-blink-features=AutomationControlled
                - --disable-extensions, --disable-gpu, --disable-dev-shm-usage, --no-sandbox
                - --user-agent set to the module-level random_agent
                - --headless=new when the headless flag is truthy

            * Creates the uc.Chrome instance, sets a page load timeout of 10 seconds
              and returns the driver for use inside the with block.
            * If configuring the freshly started browser fails, the browser is shut
              down before the error propagates (__exit__ is not called when
              __enter__ raises).

        3. Exit (__exit__):

            * Quits the driver if one was created and resets the driver attribute to None.
              Exceptions raised inside the with block are not suppressed.

    """

    def __init__(self, headless_mode:bool):
        self.headless = headless_mode
        self.driver = None

    def __enter__(self):
        chrome_service = ChromeService(CHROME_DRIVER_PATH)

        options = uc.ChromeOptions()
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_argument("--disable-extensions")
        options.add_argument("--disable-gpu")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--no-sandbox")
        options.add_argument(f"--user-agent={random_agent}")

        if self.headless:
            options.add_argument("--headless=new")

        self.driver = uc.Chrome(service=chrome_service, options=options)
        try:
            self.driver.set_page_load_timeout(10)
        except BaseException:
            # The with statement never reaches __exit__ when __enter__ raises,
            # so the already running browser has to be closed here.
            self._quit_driver()
            raise
        return self.driver

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._quit_driver()

    def _quit_driver(self):
        # Quit the current driver, if any, and forget it so it is quit only once.
        driver, self.driver = self.driver, None
        if driver:
            driver.quit()
67 | 


--------------------------------------------------------------------------------
/src/godork/helpers/extractor.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from ..utils.colors import Bgcolor
 4 | from ..utils.parse import no_data
 5 | from .console import Console
 6 | from ..utils.exceptions import GodorkNoData
 7 | 
 8 | from urllib.parse import urlparse, unquote
 9 | from datetime import datetime
10 | from bs4 import BeautifulSoup
11 | 
def extract_pages(html):
    """
    Find every ``aria-label="Page N"`` attribute in *html* and return the N of
    the last occurrence, as a string.

    Raises ``IndexError`` when *html* contains no such attribute.
    """

    page_label = re.compile(r'aria-label=\"Page ([0-9]+)\"')
    page_numbers = page_label.findall(html)

    return page_numbers[-1]
19 | 
def extract_title(html):
    """
    Collect the text of every <h3> element in *html*, skipping headings whose
    text contains "Google Search Console" or "Google Search".

    Returns the list of remaining heading texts, stripped of surrounding whitespace.
    """

    soup = BeautifulSoup(html, "html.parser")
    heading_texts = (heading.getText() for heading in soup.find_all("h3"))

    return [
        text.strip()
        for text in heading_texts
        if not re.search("Google Search Console|Google Search", text)
    ]
33 | 
def extract_link(text):
    """
    Pull the result links out of the search page markup in *text*.

    Three href patterns are tried. A captured target is kept when it starts
    with "http" and its host is not one of the ``*.google.com`` hosts that
    appear in *text*. Kept links are percent-decoded.

    Returns the list of links in the order they were found.
    """

    # Hosts under google.com mentioned anywhere in the markup are excluded.
    google_hosts = re.findall(r'https?://([a-zA-Z0-9\-.]+\.google\.com)', text)

    href_pattern = re.compile(
        r'\"><a href=\"\/url\?q=(.*?)&amp|href=\"/url\?q=(.*?)&amp;sa=U&amp;ved=|&amp;url=(.*?)&amp;ved='
    )

    # findall yields one 3-tuple per match; groups of the alternatives that
    # did not match are empty strings, so joining the distinct values leaves
    # the captured target.
    candidates = (
        "".join(dict.fromkeys(groups)) for groups in href_pattern.findall(text)
    )

    return [
        unquote(candidate)
        for candidate in candidates
        if candidate.startswith(('http', 'https'))
        and urlparse(candidate).netloc not in google_hosts
    ]
55 | 
def extract_data(html, reports, metadata):
    """
    Extract titles and links from *html*, then report and print them.

    *metadata* supplies the "query" and "num_page" values used in messages and
    in the JSON report. When no title is found, ``GodorkNoData`` is raised.
    When both titles and links exist, a summary is written to the log file and
    the console, a JSON report entry is stored through *reports*, and each
    title is printed with the link at the same position (titles without a
    matching link are not printed).
    """

    query = metadata.get("query")
    num_page = metadata.get("num_page")

    data_title = extract_title(html)
    data_links = extract_link(html)

    if no_data(data_title):
        raise GodorkNoData(f"No data can be collected on page {num_page}")

    # Nothing to report unless both lists have content.
    if not data_title or not data_links:
        return

    summary = f"Found {len(data_title)} title and {len(data_links)} links on page {num_page}"
    reports.logs_report("info", data=summary)
    Console().log_print("info", msg=summary)

    report_entry = {
        "timestamp": str(datetime.now()),
        "query": query,
        "page": num_page,
        "size_page": len(html),
        "data_output": {
            "title": data_title,
            "links": data_links
        },
    }
    reports.json_report(report_entry)

    # zip stops at the shorter list, so surplus titles are skipped.
    for title, link in zip(data_title, data_links):
        print(f"{title} [{Bgcolor.GREEN}{link}{Bgcolor.DEFAULT}]")


--------------------------------------------------------------------------------
/src/godork/helpers/reports.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | 
 4 | from datetime import datetime
 5 | from .console import Console
 6 | 
 7 | class Reports:
 8 | 
 9 |     """
10 |     The Reports class is a centralized utility designed to manage logging and reporting for the GoDork tool. 
11 |     Its primary role is to handle output storage by writing execution logs and structured JSON results to organized directories, ensuring traceability and easy analysis of scraping sessions.
12 | 
13 |     Purpose:
14 | 
15 |         * This class automates the creation, formatting, and saving of:
16 | 
17 |             - Log files in plain text (human-readable)
18 |             - JSON reports for structured, machine-readable data
19 |             - Organized directories for persistent reporting
20 |     
21 |     Key Features:
22 | 
23 |         1. Initialization (__init__)
24 | 
25 |             * Upon initialization:
26 | 
27 |                 - Determines the appropriate temp directory (Windows or Unix-based systems)
28 |                 - Sets up paths for logs and JSON reports using timestamps
29 |                 - Automatically creates required directories (logs and json) under /tmp/godork/reports (or %TEMP%/godork/reports on Windows)
30 |                 - Initializes the Console utility for consistent and colored terminal output
31 | 
32 |         2. write_file_json(filename, data)
33 | 
34 |             * Appends structured data to a JSON file. Ideal for storing detailed metadata or search results.
35 | 
36 |         3. write_file_text(filename, data)
37 | 
38 |             * Appends plain text to a given file. Primarily used for saving logs and console-style outputs.
39 | 
40 |         4. logs_report(status, data)
41 | 
42 |             * Handles writing formatted log entries (with timestamps and status levels like INFO, ERROR, DEBUG) to the log file. Uses the Console class for formatting consistency.
43 | 
44 |         5. json_report(data)
45 | 
46 |             * Writes a JSON entry to the report file. Useful for capturing individual result items in structured form.
47 | 
48 |     Report Paths:
49 | 
50 |         * Logs: Saved under reports/logs/ with timestamped filenames.
51 |         * JSON: Saved under reports/json/ for structured result data.
52 | 
53 |     Error Handling:
54 | 
55 |         * Both logs_report() and json_report() include internal exception handling. If writing to a file fails, an error is printed to the console, ensuring that such failures are visible but non-fatal.
56 | 
57 |     """
58 | 
59 |     def __init__(self):
60 |         self.temp_dir = os.getenv("TEMP") if os.name == "nt" else "/tmp"
61 |         self.base_dir = f"{self.temp_dir}/godork/reports"
62 | 
63 |         self.log_file = f"{self.base_dir}/logs/{str(datetime.now().strftime('%Y-%m-%d-%H:%M:%S'))}_godork.log"
64 |         self.json_file = f"{self.base_dir}/json/{str(datetime.now().strftime('%Y-%m-%d-%H:%M:%S'))}_godork.json"
65 | 
66 |         self.console = Console()
67 | 
68 |         try:
69 |             os.makedirs(f"{self.base_dir}/logs")
70 |             os.makedirs(f"{self.base_dir}/json")
71 |         except FileExistsError:
72 |             self.base_dir = self.base_dir
73 | 
74 |     def write_file_json(self, filename, data):
75 |         with open(filename, "at") as f:
76 |             try:
77 |                 f.write(json.dumps(data, indent=4, ensure_ascii=False) + os.linesep)
78 |             finally:
79 |                 f.close()
80 | 
81 |     def write_file_text(self, filename, data):
82 |         with open(filename, "at") as f:
83 |             try:
84 |                 f.write(str(data) + os.linesep)
85 |             finally:
86 |                 f.close()
87 | 
88 |     def logs_report(self, status, data):
89 |         try:
90 |             self.write_file_text(self.log_file, data=self.console.out_log_format(status, msg=data))
91 |         except Exception as err:
92 |             self.console.log_print("error", msg=err)
93 | 
94 |     def json_report(self, data):
95 |         try:
96 |             self.write_file_json(self.json_file, data=data)
97 |         except Exception as err:
98 |             self.console.log_print("error", msg=err)


--------------------------------------------------------------------------------
/src/godork/services/version.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import requests
 3 | 
 4 | from .requester import Requester
 5 | from ..utils.banner import print_banner
 6 | from ..utils.colors import Bgcolor
 7 | 
 8 | """
 9 | The release_version() function is a simple yet effective mechanism for retrieving the latest release version of a tool from GitHub. 
10 | It ensures that users are informed about the availability of newer versions and provides a fallback in case the release information cannot be fetched. 
11 | This function is crucial for maintaining the tool up to date, offering smooth integration with GitHub's release system and preventing potential downtime or errors caused by outdated versions.
12 | 
13 | Key Elements:
14 | 
15 |     1. CURRENT_VERSION:
16 | 
17 |         * This constant represents the current, locally installed version of the software tool.
18 |         * This version is used as a fallback in the event that the latest release cannot be fetched from the GitHub repository.
19 | 
20 |     2. release_version() Function:
21 | 
22 |         * This function checks the latest release version of the "godork" tool from its official GitHub repository.
23 |         * It performs the following steps:
24 | 
25 |             - Creating a Session: A session is created using the requests.session() to enable persistent connections, allowing efficient HTTP requests.
26 |             - Making a GET Request: The function sends a GET request to GitHub's API endpoint that provides details about the latest release (https://api.github.com/repos/thd3r/godork/releases/latest).
27 | 
28 |         * Handling the Response:
29 | 
30 |             - If the request is successful, the response is expected to be in JSON format. The json.loads() function is used to parse the response body.
31 |             - The parsed data contains various details about the latest release, including the tag name (which represents the version) and the release notes (body). These are returned from the function.
32 | 
33 |         * Error Handling:
34 | 
35 |             - If the request fails for any reason (e.g., network issues, API issues), 
36 |               the function catches the exception and returns the current local version (CURRENT_VERSION) along with None for the release notes.
37 | 
38 | Key Points:
39 | 
40 |         * GitHub API Integration: The function leverages GitHub's API to fetch the latest release information for the "godork" tool, ensuring that users can easily stay up to date with the latest version.
41 |         * Fallback Mechanism: If there are any issues fetching the release version, the current local version is returned to avoid errors in the application.
42 |         * Error Handling: The use of a try-except block ensures that even if the request to the GitHub API fails, the program will continue running without crashing, and the user will receive information about the current version of the software.
43 | 
44 | Usage Scenario:
45 | 
46 |     This function is typically used as part of a larger update management system, where it can be called to check whether a new release is available for the software. 
47 |     If a newer version is found, it can trigger an update process, or if no update is found, the system can reassure the user that they are already using the latest version.
48 | 
49 | """
50 | 
# Version tag of this installed copy. check_version() compares it with the
# latest GitHub release tag, and release_version() returns it as the fallback
# when the release lookup fails.
CURRENT_VERSION = "v2.6.2"
52 | 
def _version_key(tag):
    """Return the leading dotted-numeric part of a tag as a tuple of ints.

    ``"v2.10.1"`` becomes ``(2, 10, 1)``; a tag with no leading number yields
    an empty tuple, which sorts before every real version.
    """
    import re

    match = re.match(r"\s*v?(\d+(?:\.\d+)*)", str(tag), flags=re.IGNORECASE)
    if match is None:
        return ()
    return tuple(int(part) for part in match.group(1).split("."))


def check_version():
    """Print the banner, marking this copy as "latest" or "outdated".

    The latest release tag is obtained from ``release_version()`` and compared
    with ``CURRENT_VERSION`` numerically, component by component, so that
    e.g. ``v2.10.0`` is correctly treated as newer than ``v2.6.2`` (a plain
    string comparison orders them the other way round).

    The banner is printed exactly once:
      * "outdated" when no release tag is available or the release is newer;
      * "latest" otherwise, including when the local version is ahead of the
        published release.
    """
    release_vers, _ = release_version()

    is_outdated = (
        release_vers is None
        or _version_key(CURRENT_VERSION) < _version_key(release_vers)
    )
    if is_outdated:
        status = f"{Bgcolor.RED}outdated{Bgcolor.DEFAULT}"
    else:
        status = f"{Bgcolor.GREEN}latest{Bgcolor.DEFAULT}"

    print_banner(status=status, version=CURRENT_VERSION)
61 | 
def release_version():
    """Look up the latest published godork release on GitHub.

    Sends a GET request (10 second timeout) to the repository's
    ``releases/latest`` API endpoint and reads ``tag_name`` and ``body`` from
    the JSON payload.

    Returns:
        tuple: ``(tag_name, body)`` on success.  On any failure (network error,
        invalid JSON, missing keys) the fallback ``(CURRENT_VERSION, None)``
        is returned so that the caller never has to handle an exception.
    """
    try:
        # The context manager closes the session's pooled connections even
        # when the request or the parsing below raises.
        with requests.session() as session:
            response = Requester().reqwest(
                session,
                "GET",
                url="https://api.github.com/repos/thd3r/godork/releases/latest",
                timeout=10,
            )
            data_json = json.loads(response.text)
        return data_json["tag_name"], data_json["body"]
    except Exception:
        # Deliberately broad best-effort fallback, but no longer a bare
        # ``except:`` so KeyboardInterrupt / SystemExit still propagate.
        return CURRENT_VERSION, None


--------------------------------------------------------------------------------
/src/godork/services/requester.py:
--------------------------------------------------------------------------------
 1 | from ..utils.user_agents import random_agent
 2 | 
 3 | class Requester:
 4 | 
 5 |     """
 6 |     The Requester class is a Python utility designed for making HTTP requests with customizable options for both synchronous and asynchronous operations. 
 7 |     It simplifies sending requests with custom headers, proxies, cookies, and additional parameters, while also handling both standard and asynchronous HTTP methods.
 8 | 
 9 |     Key Features:
10 | 
11 |         1. Initialization (__init__):
12 | 
13 |             * The class initializes a default set of HTTP headers, including a User-Agent string (which is randomly chosen), Accept, Accept-Language, and Referer. 
14 |               These headers are typically used to simulate real user traffic, helping to avoid detection by web servers or bot protection mechanisms.
15 |             * It also initializes an empty dictionary, response_dict, to store response content asynchronously.
16 | 
17 |         2. HTTP Request (Synchronous) - reqwest:
18 | 
19 |             * This method sends a synchronous HTTP request using the session.request() function from the requests library.
20 |             * It takes various arguments:
21 | 
22 |                 - method: The HTTP method (e.g., GET, POST).
23 |                 - url: The target URL for the request.
24 |                 - Additional keyword arguments (kwargs) include optional parameters such as proxies, request parameters (params), timeouts, cookies, custom headers, and the ability to allow redirects.
25 | 
26 |             * The method sends the request and returns the response object.
27 |             * This method is ideal for situations where blocking operations (synchronous requests) are acceptable.
28 | 
29 |         3. HTTP Request (Asynchronous) - aioreqwest:
30 | 
31 |             * This method allows for asynchronous HTTP requests using aiohttp, making it more suitable for high-performance web scraping or API interactions that require non-blocking calls.
32 |             * The function accepts similar parameters as the synchronous version, with the main difference being the use of async with to handle the asynchronous nature of the request.
33 |             * Upon receiving the response, it updates response_dict with the body content of the response, allowing asynchronous access to the data. 
34 |               The function then returns the response object, providing an efficient way to handle multiple requests concurrently.
35 |             * This method is ideal for situations requiring non-blocking I/O operations, such as when dealing with large-scale web scraping or API calls.
36 |     
37 |     The class leverages both requests for traditional synchronous requests and aiohttp for asynchronous tasks, offering flexibility depending on the needs of the application.
38 | 
39 |     """
40 | 
41 |     def __init__(self):
42 |         self.headers = {
43 |             "User-Agent": str(random_agent),
44 |             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
45 |             "Accept-Language": "en-US,en;q=0.5",
46 |             "Referer": "https://www.google.com/",
47 |         }
48 | 
49 |     def reqwest(self, session, method, url, **kwargs):
50 |         response = session.request(
51 |             method=method,
52 |             url=url,
53 |             proxies=kwargs.get("proxy"),
54 |             params=kwargs.get("params"),
55 |             timeout=kwargs.get("timeout"),
56 |             cookies=kwargs.get("cookies"),
57 |             headers=self.headers if not kwargs.get("headers") else kwargs.get("headers"),
58 |             allow_redirects=kwargs.get("redirects")
59 |         )
60 |         return response
61 | 
62 |     async def aioreqwest(self, session, method, url, **kwargs):
63 |         async with session.request(
64 |             method=method,
65 |             url=url,
66 |             proxy=kwargs.get("proxy"),
67 |             params=kwargs.get("params"),
68 |             timeout=kwargs.get("timeout"),
69 |             cookies=kwargs.get("cookies"),
70 |             headers=self.headers if not kwargs.get("headers") else kwargs.get("headers"),
71 |             allow_redirects=kwargs.get("redirects")
72 |         ) as response:
73 |             body = await response.text()
74 |             return response, body


--------------------------------------------------------------------------------
/src/godork/helpers/console.py:
--------------------------------------------------------------------------------
  1 | from datetime import datetime
  2 | 
  3 | from ..utils.colors import Bgcolor
  4 | 
  5 | class Console:
  6 | 
  7 |     """
  8 |     The Console class provides a lightweight and extensible logging utility for managing output in the terminal with structured formatting, colors, and real-time timestamps. 
  9 |     It's designed to enhance the developer and user experience by improving readability, debugging, and log categorization during runtime.
 10 | 
 11 |     Purpose:
 12 | 
 13 |         * This class handles all console-related operations for:
 14 | 
 15 |             - Structured logging with colored labels (INFO, ERROR, DEBUG, etc.)
 16 |             - Debug control, allowing conditional logging based on a debug flag
 17 |             - Human-readable messages with timestamps
 18 |             - Graceful error handling for incorrect log usage
 19 | 
 20 |     Key Features:
 21 | 
 22 |         1. debugging(self, debug, msg)
 23 | 
 24 |             * Conditionally logs a debug message only if the debug flag is True.
 25 |             * Uses log_print() to format and display the message in a consistent "DEBUG" style.
 26 |             * Helps developers toggle verbose logs without modifying other parts of the code.
 27 | 
 28 |         2. log_print(self, status, msg)
 29 | 
 30 |             * Main method to output a log message with timestamp and status level.
 31 |             * Calls out_log_format() to structure the message before printing.
 32 | 
 33 |         3. out_log_format(self, status, msg)
 34 | 
 35 |             * Formats log messages with timestamps, colors, and labeled tags (INFO, ERROR, DEBUG, WARNING).
 36 |             * Accepts a status string to determine the log level.
 37 |             * Each status type is styled using ANSI escape codes (through Bgcolor) for color coding.
 38 |             * Falls back to an error message if an unknown status is passed, and terminates the program.
 39 | 
 40 |         4. text_format(self, status, msg)
 41 | 
 42 |             * Similar to out_log_format(), but without timestamps.
 43 |             * Intended for simpler, inline use when a timestamp isn't required.
 44 |             * Returns a stylized message string, ideal for banners, summaries, or compact logs.
 45 | 
 46 |     Error Handling:
 47 | 
 48 |         * Both out_log_format and text_format include checks to ensure only supported status values are used. If not, they:
 49 | 
 50 |             - Log an error message.
 51 |             - Exit the program with status code 1.
 52 | 
 53 |     """
 54 | 
 55 |     def debugging(self, debug, msg):
 56 |         if debug == True:
 57 |             self.log_print("debug", msg=f"{Bgcolor.GRAY}{msg}{Bgcolor.DEFAULT}")
 58 | 
 59 |     def log_print(self, status, msg):
 60 |         print(self.out_log_format(status, msg))
 61 | 
 62 |     def out_log_format(self, status, msg):
 63 |         log_time = str(datetime.now().strftime('%Y/%m/%d %H:%M:%S'))
 64 | 
 65 |         if status.lower() == "info":
 66 |             detailed_info = f"[{Bgcolor.CYAN}{log_time}{Bgcolor.DEFAULT}] [{Bgcolor.BLUE}INFO{Bgcolor.DEFAULT}] {msg}"
 67 |             return detailed_info
 68 | 
 69 |         if status.lower() == "error":
 70 |             detailed_error = f"[{Bgcolor.CYAN}{log_time}{Bgcolor.DEFAULT}] [{Bgcolor.RED}EROR{Bgcolor.DEFAULT}] {msg}"
 71 |             return detailed_error
 72 | 
 73 |         if status.lower() == "debug":
 74 |             detailed_debug = f"[{Bgcolor.CYAN}{log_time}{Bgcolor.DEFAULT}] [{Bgcolor.PURPLE}DBUG{Bgcolor.DEFAULT}] {msg}"
 75 |             return detailed_debug
 76 | 
 77 |         if status.lower() == "warning":
 78 |             detailed_warning = f"[{Bgcolor.CYAN}{log_time}{Bgcolor.DEFAULT}] [{Bgcolor.WARNING}WARN{Bgcolor.DEFAULT}] {msg}"
 79 |             return detailed_warning
 80 | 
 81 |         if status.lower() not in ["info", "debug", "error", "warning"]:
 82 |             self.log_print(status="error", msg="status=REQUIRED args required with msg:{}".format(msg))
 83 |             exit(1)
 84 | 
 85 |     def text_format(self, status, msg):
 86 |         if status.lower() == "info":
 87 |             detailed_info = f"[{Bgcolor.BLUE}INFO{Bgcolor.DEFAULT}] {msg}"
 88 |             return detailed_info
 89 | 
 90 |         if status.lower() == "error":
 91 |             detailed_error = f"[{Bgcolor.RED}EROR{Bgcolor.DEFAULT}] {msg}"
 92 |             return detailed_error
 93 | 
 94 |         if status.lower() == "debug":
 95 |             detailed_debug = f"[{Bgcolor.PURPLE}DBUG{Bgcolor.DEFAULT}] {msg}"
 96 |             return detailed_debug
 97 | 
 98 |         if status.lower() == "warning":
 99 |             detailed_warning = f"[{Bgcolor.WARNING}WARN{Bgcolor.DEFAULT}] {msg}"
100 |             return detailed_warning
101 | 
102 |         if status.lower() not in ["info", "debug", "error", "warning"]:
103 |             self.log_print(status="error", msg="status=REQUIRED args required with msg:{}".format(msg))
104 |             exit(1)


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | !libs/lume/scripts/build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | cover/
 54 | 
 55 | # Translations
 56 | *.mo
 57 | *.pot
 58 | 
 59 | # Django stuff:
 60 | *.log
 61 | local_settings.py
 62 | db.sqlite3
 63 | db.sqlite3-journal
 64 | 
 65 | # Flask stuff:
 66 | instance/
 67 | .webassets-cache
 68 | 
 69 | # Scrapy stuff:
 70 | .scrapy
 71 | 
 72 | # Sphinx documentation
 73 | docs/_build/
 74 | 
 75 | # PyBuilder
 76 | .pybuilder/
 77 | target/
 78 | 
 79 | # Jupyter Notebook
 80 | .ipynb_checkpoints
 81 | 
 82 | # IPython
 83 | profile_default/
 84 | ipython_config.py
 85 | 
 86 | .pdm.toml
 87 | .pdm-python
 88 | .pdm-build/
 89 | 
 90 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
 91 | __pypackages__/
 92 | 
 93 | # Celery stuff
 94 | celerybeat-schedule
 95 | celerybeat.pid
 96 | 
 97 | # SageMath parsed files
 98 | *.sage.py
 99 | 
100 | # Environments
101 | .env
102 | .venv
103 | env/
104 | venv/
105 | ENV/
106 | env.bak/
107 | venv.bak/
108 | 
109 | # Spyder project settings
110 | .spyderproject
111 | .spyproject
112 | 
113 | # Rope project settings
114 | .ropeproject
115 | 
116 | # mkdocs documentation
117 | /site
118 | 
119 | # mypy
120 | .mypy_cache/
121 | .dmypy.json
122 | dmypy.json
123 | 
124 | # Scripts
125 | server/scripts/
126 | 
127 | # Pyre type checker
128 | .pyre/
129 | 
130 | # pytype static type analyzer
131 | .pytype/
132 | 
133 | # Cython debug symbols
134 | cython_debug/
135 | 
136 | # Ruff stuff:
137 | .ruff_cache/
138 | 
139 | # PyPI configuration file
140 | .pypirc
141 | 
142 | # Conda
143 | .conda/
144 | 
145 | # Local environment
146 | .env.local
147 | 
148 | # macOS DS_Store
149 | .DS_Store
150 | 
151 | weights/
152 | weights/icon_detect/
153 | weights/icon_detect/model.pt
154 | weights/icon_detect/model.pt.zip
155 | weights/icon_detect/model.pt.zip.part*
156 | 
157 | libs/omniparser/weights/icon_detect/model.pt
158 | 
159 | # Example test data and output
160 | examples/test_data/
161 | examples/output/
162 | 
163 | /screenshots/
164 | 
165 | /experiments/
166 | 
167 | /logs/
168 | 
169 | # Xcode
170 | #
171 | # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore
172 | 
173 | ## User settings
174 | xcuserdata/
175 | 
176 | ## Obj-C/Swift specific
177 | *.hmap
178 | 
179 | ## App packaging
180 | *.ipa
181 | *.dSYM.zip
182 | *.dSYM
183 | 
184 | ## Playgrounds
185 | timeline.xctimeline
186 | playground.xcworkspace
187 | 
188 | # Swift Package Manager
189 | #
190 | # Add this line if you want to avoid checking in source code from Swift Package Manager dependencies.
191 | # Packages/
192 | # Package.pins
193 | # Package.resolved
194 | # *.xcodeproj
195 | #
196 | # Xcode automatically generates this directory with a .xcworkspacedata file and xcuserdata
197 | # hence it is not needed unless you have added a package configuration file to your project
198 | .swiftpm/
199 | .build/
200 | 
201 | # CocoaPods
202 | #
203 | # We recommend against adding the Pods directory to your .gitignore. However
204 | # you should judge for yourself, the pros and cons are mentioned at:
205 | # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
206 | #
207 | # Pods/
208 | #
209 | # Add this line if you want to avoid checking in source code from the Xcode workspace
210 | # *.xcworkspace
211 | 
212 | # Carthage
213 | #
214 | # Add this line if you want to avoid checking in source code from Carthage dependencies.
215 | # Carthage/Checkouts
216 | Carthage/Build/
217 | 
218 | # fastlane
219 | #
220 | # It is recommended to not store the screenshots in the git repo.
221 | # Instead, use fastlane to re-generate the screenshots whenever they are needed.
222 | # For more information about the recommended setup visit:
223 | # https://docs.fastlane.tools/best-practices/source-control/#source-control
224 | fastlane/report.xml
225 | fastlane/Preview.html
226 | fastlane/screenshots/**/*.png
227 | fastlane/test_output
228 | 
229 | # Ignore folder
230 | ignore
231 | 
232 | # .release
233 | .release/
234 | 
235 | # Shared folder
236 | shared
237 | 
238 | # Trajectories
239 | trajectories/
240 | 
241 | # Installation ID Storage
242 | .storage/
243 | 
244 | # Gradio settings
245 | .gradio_settings.json
246 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <h1 align="left">
  2 |   Godork - Advanced & Fast Google Dorking Tool
  3 | </h1>
  4 | 
  5 | <div align="left">
  6 |   <a href="https://python.org"><img src="https://img.shields.io/badge/Built%20with-Python-Blue"></a>
  7 |   <a href="https://opensource.org/licenses/MIT"><img src="https://img.shields.io/badge/license-MIT-_red.svg"></a>
  8 |   <a href="https://github.com/thd3r/godork/releases"><img src="https://img.shields.io/github/release/thd3r/godork.svg"></a>
  9 |   <a href="https://pypi.python.org/pypi/godork/"><img src="https://img.shields.io/pypi/v/godork.svg"></a>
 10 |   <a href="https://github.com/thd3r/godork/issues?q=is%3Aissue+is%3Aclosed"><img src="https://img.shields.io/github/issues-closed-raw/thd3r/godork?color=dark-green&label=issues%20fixed"></a>
 11 | </div>
 12 | 
 13 | ```sh
 14 |                 __         __  
 15 |   ___ ____  ___/ /__  ____/ /__
 16 |  / _ `/ _ \/ _  / _ \/ __/  '_/  v2.6.2
 17 |  \_, /\___/\_,_/\___/_/ /_/\_\    latest
 18 | /___/                                                                                                            
 19 |            thd3r & societyprojects
 20 | ```
 21 | 
**Godork** is a high-performance tool designed to scrape links and titles from Google search results using the [asyncio](https://docs.python.org/3/library/asyncio.html) library, which enables efficient cooperative multitasking. Combined with [aiohttp](https://docs.aiohttp.org), this tool allows you to quickly and reliably extract URLs along with their corresponding titles. Additionally, Godork is capable of bypassing restrictions imposed by network providers, ensuring uninterrupted access to search data.
 23 | 
 24 | ## ✨ Why Godork?
 25 | 
 26 | * ⚡ Blazing-fast performance using asynchronous HTTP requests (aiohttp)
 27 | 
 28 | * 🔍 Automated dork execution with support for lists, batches, and single queries
 29 | 
 30 | * 🌐 Proxy-ready: Bypass restrictions and stay anonymous with HTTP proxy integration
 31 | 
 32 | * 🕶️ Headless browser mode with Selenium to defeat CAPTCHAs and JS-based blocks
 33 | 
 34 | * 🐳 Docker-compatible: Seamlessly containerize and deploy in any environment
 35 | 
 36 | ## Resources
 37 | - [Requirements](#requirements)
 38 | - [Installation](#installation)
 39 | 	- [Install with pip](#install-with-pip)
 40 | - [Options](#options)
 41 | - [Example Usage](#example-usage)
 42 |   - [Basic dorking](#basic-dorking)
 43 |   - [Batch mode](#batch-mode)
 44 | - [Help & Bugs](#help--bugs)
 45 | - [Contributors](#contributors-heart)
 46 | - [License](#license)
 47 | - [Support](#support)
 48 | 
 49 | 
 50 | ## Requirements
 51 | 
 52 | ```sh
 53 | # This is required for the pydub library
 54 | $ sudo apt install ffmpeg
 55 | 
 56 | # Check the version of the google-chrome browser
 57 | $ google-chrome --version
 58 | 
 59 | # If the browser version does not exist run this command
 60 | $ wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
 61 | $ sudo apt -f install
 62 | $ sudo dpkg -i google-chrome-stable_current_amd64.deb
 63 | 
 64 | # After that, take the version from your google-chrome browser and place it here
 65 | $ wget https://storage.googleapis.com/chrome-for-testing-public/{PUT_THAT_VERSION_HERE}/linux64/chromedriver-linux64.zip
 66 | $ unzip chromedriver-linux64.zip
 67 | $ cd chromedriver-linux64 
 68 | $ sudo mv chromedriver /usr/bin
 69 | ```
 70 | 
 71 | ## Installation
 72 | 
 73 | **Godork** requires **python 3.8** or higher to install successfully
 74 | 
 75 | ### Install with pip:
 76 | 
 77 | ```sh
 78 | pip install godork
 79 | ```
 80 | 
 81 | ## Options
 82 | 
 83 | | Option            | Type         | Description                             	      |
 84 | |-------------------|--------------|------------------------------------------------|
 85 | | -v, --version     | flag         | displays the current version of godork |
 86 | | -d, --dorks       | string       | single dork or file containing multiple dorks            |
 87 | | -p, --proxy       | string       | http proxy to use with godork (e.g. http://127.0.0.1:8080) |
 88 | | --retries         | integer      | retries when request is blocked (default: 40) |
 89 | | --max-retries     | integer      | max attempts to bypass protection mechanisms (default: 2) |
| --debug           | boolean      | show detailed logs and errors for debugging |
 91 | | --no-headless     | boolean      | run in graphical mode when bypassing |
 92 | 
 93 | ## Example Usage
 94 | 
 95 | ### Basic dorking:
 96 | 
 97 | ```sh
 98 | godork --dorks "intitle:index.of site:example.com"
 99 | ```
100 | 
101 | > [!WARNING]
102 | > Developers assume no liability and are not responsible for any issue or damage.
103 | 
104 | ### Batch mode:
105 | 
106 | ```sh
107 | godork --dorks dorks.txt --proxy http://127.0.0.1:8080 --no-headless
108 | ```
109 | 
110 | ## Help & Bugs
111 | 
If you are still confused or have found a bug, please [open an issue](https://github.com/thd3r/godork/issues). All bug reports are appreciated; some features have not been tested yet due to lack of free time.
113 | 
114 | ## Contributors :heart:
115 | 
116 | <p align="left">
117 | <a href="https://github.com/societyprojects"><img src="https://avatars.githubusercontent.com/u/181974230?s=400&v=4" width="50" height="50" alt="" style="max-width: 100%;"></a>
118 | </p>
119 | 
120 | ## License
121 | 
Licensed under the [MIT License](https://github.com/thd3r/godork/blob/main/LICENSE).
123 | 
124 | Contributions are welcome :) feel free to fork, suggest improvements, or submit pull requests.
125 | 
126 | ## Support
127 | 
128 | <a href="https://www.buymeacoffee.com/thd3r" target="_blank"><img src="https://cdn.buymeacoffee.com/buttons/v2/default-yellow.png" alt="Buy Me A Coffee" style="height: 60px !important;width: 217px !important;" ></a>
129 | 


--------------------------------------------------------------------------------
/src/godork/services/recaptcha.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | import pydub
  4 | import urllib
  5 | import asyncio
  6 | import speech_recognition
  7 | 
  8 | from tempfile import gettempdir
  9 | from datetime import datetime
 10 | 
 11 | from ..helpers.console import Console
 12 | from ..helpers.reports import Reports
 13 | from ..utils.colors import Bgcolor
 14 | from ..utils.exceptions import GodorkException, GodorkTimeout
 15 | 
 16 | from selenium.webdriver.common.by import By
 17 | from selenium.webdriver.common.keys import Keys
 18 | from selenium.webdriver.support.ui import WebDriverWait
 19 | from selenium.webdriver.support import expected_conditions as EC
 20 | from selenium.common.exceptions import TimeoutException, NoSuchElementException
 21 | 
 22 | class RecaptchaBypass:
 23 | 
 24 |     """
 25 |     The RecaptchaBypass class is an advanced solution for bypassing reCAPTCHA v2 challenges, specifically designed for handling audio-based CAPTCHAs. 
 26 |     Using Selenium, undetected-chromedriver, and various media-processing libraries, it automates the process of solving the CAPTCHA by downloading, converting, and transcribing audio challenges. 
 27 |     The class also incorporates detailed logging, debugging, and error handling to ensure smooth and efficient operation, even in environments with strict bot protection.
 28 | 
 29 |     Key Features:
 30 |     
 31 |         1. Initialization (__init__):
 32 | 
 33 |             * The class is initialized with debug and headless_mode flags, allowing control over the debugging output and 
 34 |               whether the browser runs in headless mode (without a visible UI).
 35 |             * It also initializes instances of Console and Reports to handle logging and reporting during execution.
 36 | 
 37 |         2. reCAPTCHA Handling (recaptcha_service):
 38 | 
 39 |             * This asynchronous method automates the process of solving a reCAPTCHA by interacting with the CAPTCHA iframe, clicking the checkbox, and navigating through the audio challenge.
 40 |             * The method performs several actions in sequence:
 41 | 
 42 |                 - Switching to the reCAPTCHA iframe.
 43 |                 - Clicking the reCAPTCHA checkbox and waiting for it to become clickable.
 44 |                 - Switching back to the default frame.
 45 |                 - Locating the audio challenge iframe and clicking the audio button.
 46 |                 - Retrieving the audio source URL.
 47 |                 - Downloading, converting, and decoding the audio to extract the CAPTCHA key.
 48 |                 - Entering the transcribed key and submitting the response.
 49 | 
 50 |         3. Audio CAPTCHA Processing:
 51 | 
 52 |             * The handle_audio_captcha method is responsible for downloading the audio file, converting it from MP3 to WAV format, and 
 53 |               decoding it to extract the text. The transcription is handled using the speech_recognition library.
 54 |             * Temporary files (MP3 and WAV) are cleaned up after processing to ensure no unnecessary files remain on the system.
 55 | 
 56 |         4. Audio Download and Conversion:
 57 | 
 58 |             * The download_audio method downloads the audio file to a temporary directory.
 59 |             * The convert_mp3_to_wav method converts the downloaded MP3 audio to WAV format using the pydub library.
 60 | 
 61 |         5. Transcription and Cleanup:
 62 | 
 63 |             * The audio is transcribed using the Google Speech Recognition API. If transcription fails, appropriate exceptions are raised.
 64 |             * Temporary audio files are cleaned up after the process, ensuring proper resource management.
 65 | 
 66 |         6. Error Handling:
 67 | 
 68 |             * If any CAPTCHA challenge cannot be completed, or if the IP address is blocked, relevant error messages are logged and displayed. 
 69 |               The system gracefully handles exceptions like NoSuchElementException and TimeoutException, ensuring robust operation.
 70 | 
 71 |         7. IP Blocking Detection (is_blocked):
 72 | 
 73 |             * This method checks if the IP address has been blocked by detecting the "captcha body text" indicating a block. 
 74 |               It helps in identifying if reCAPTCHA protection is preventing further attempts.
 75 | 
 76 |         8. Solve CAPTCHA (solve_captcha):
 77 | 
 78 |             * This asynchronous method accepts a URL, launches a Selenium WebDriver, and 
 79 |               attempts to solve the CAPTCHA on the page using the previously mentioned methods.
 80 |             * The process is wrapped in a try-except block to handle errors gracefully, with reports and console logs to provide real-time feedback.
 81 | 
 82 |     """
 83 | 
 84 |     def __init__(self, debug:bool, headless_mode:bool):
 85 |         self.console = Console()
 86 |         self.reports = Reports()
 87 | 
 88 |         self.debug = debug
 89 |         self.headless = headless_mode
 90 | 
 91 |         self.wait = None
 92 |     
 93 |     async def recaptcha_service(self, driver):
 94 |         # Switching to iframe containing reCAPTCHA
 95 |         self.reports.logs_report("debug", data="Switching to iframe containing reCAPTCHA")
 96 |         self.console.debugging(self.debug, msg="Switching to iframe containing reCAPTCHA")
 97 | 
 98 |         try:
 99 |             iframe_inner = driver.find_element(By.XPATH, "//iframe[@title='reCAPTCHA']")
100 |             driver.switch_to.frame(iframe_inner)
101 |         except NoSuchElementException:
102 |             raise GodorkException("Failed to locate reCAPTCHA iframe element")
103 | 
104 |         time.sleep(1)
105 | 
106 |         # Click on the recaptcha
107 |         self.reports.logs_report("debug", data="Clicking the reCAPTCHA checkbox")
108 |         self.console.debugging(self.debug, msg="Clicking the reCAPTCHA checkbox")
109 | 
110 |         try:
111 |             self.wait.until(
112 |                 EC.element_to_be_clickable((By.CSS_SELECTOR, ".rc-anchor-content"))
113 |             ).click()
114 |         except TimeoutException:
115 |             raise GodorkTimeout("Failed to click reCAPTCHA checkbox")
116 | 
117 |         # Switch back to the default frame
118 |         driver.switch_to.default_content()
119 | 
120 |         time.sleep(1)
121 | 
122 |         # Locating audio challenge iframe
123 |         self.reports.logs_report("debug", data="Locating audio challenge iframe")
124 |         self.console.debugging(self.debug, msg="Locating audio challenge iframe")
125 | 
126 |         try:
127 |             iframe = driver.find_element(By.XPATH, "//iframe[contains(@title, 'recaptcha')]")
128 |             driver.switch_to.frame(iframe)
129 |         except NoSuchElementException:
130 |             raise GodorkException("Failed to locate reCAPTCHA iframe element")
131 | 
132 |         # Click on the audio button
133 |         self.reports.logs_report("debug", data="Clicking the audio button")
134 |         self.console.debugging(self.debug, msg="Clicking the audio button")
135 | 
136 |         try:
137 |             self.wait.until(
138 |                 EC.element_to_be_clickable((By.CSS_SELECTOR, "#recaptcha-audio-button"))
139 |             ).click()
140 |         except TimeoutException:
141 |             raise GodorkTimeout("Failed to click audio button")
142 | 
143 |         time.sleep(1)
144 | 
145 |         # Wait for the audio source to load
146 |         self.reports.logs_report("debug", data="Waiting for the audio source to load completely")
147 |         self.console.debugging(self.debug, msg="Waiting for the audio source to load completely")
148 | 
149 |         try:
150 |             audio_source = self.wait.until(
151 |                 EC.presence_of_element_located((By.CSS_SELECTOR, "#audio-source"))
152 |             )
153 |             src = audio_source.get_attribute("src")
154 |             self.reports.logs_report("debug", data=f"Getting the audio URL {src}")
155 |             self.console.debugging(self.debug, msg=f"Getting the audio URL {src}")
156 |         except TimeoutException:
157 |             raise GodorkTimeout("Failed to load audio source")
158 | 
159 |         # Download, convert, and decode audio reCAPTCHA
160 |         try:
161 |             key = await self.handle_audio_captcha(src)
162 |         except (speech_recognition.exceptions.UnknownValueError, speech_recognition.exceptions.RequestError):
163 |             raise GodorkException("Failed to recognize")
164 | 
165 |         # Input the key
166 |         self.reports.logs_report("debug", data="Entering the transcribed phrase")
167 |         self.console.debugging(self.debug, msg="Entering the transcribed phrase")
168 | 
169 |         try:
170 |             self.wait.until(
171 |                 EC.presence_of_element_located((By.CSS_SELECTOR, "#audio-response"))
172 |             ).send_keys(key.lower())
173 |         except TimeoutException:
174 |             raise GodorkTimeout("Failed to input key")
175 | 
176 |         # Submit the key
177 |         self.reports.logs_report("debug", data="Submitting the phrase")
178 |         self.console.debugging(self.debug, msg="Submitting the phrase")
179 | 
180 |         try:
181 |             self.wait.until(
182 |                 EC.presence_of_element_located((By.CSS_SELECTOR, "#audio-response"))
183 |             ).send_keys(Keys.RETURN)
184 |         except TimeoutException:
185 |             raise GodorkTimeout("Failed to submit key")
186 | 
187 |         # Waiting briefly for reCAPTCHA to process the input
188 |         self.reports.logs_report("debug", data="Waiting briefly for reCAPTCHA to process the input")
189 |         self.console.debugging(self.debug, msg="Waiting briefly for reCAPTCHA to process the input")
190 | 
191 |         time.sleep(3)
192 | 
193 |         if self.is_blocked(driver):
194 |             return
195 | 
196 |         self.console.log_print("info", msg="Successfully bypassed v2 protection")
197 |     
198 |     async def handle_audio_captcha(self, src_url):
199 |         """Main handler to download, convert and decode audio CAPTCHA"""
200 |         mp3_path, wav_path = self.get_temp_audio_paths()
201 | 
202 |         self.download_audio(src_url, mp3_path)
203 |         self.convert_mp3_to_wav(mp3_path, wav_path)
204 | 
205 |         try:
206 |             phrase = await self.async_decode_audio(wav_path)
207 |         finally:
208 |             # Delete temporary files
209 |             self.reports.logs_report("debug", data="Deleting temporary audio files")
210 |             self.console.debugging(self.debug, msg="Deleting temporary audio files")
211 |             self.cleanup_temp_files(mp3_path, wav_path)
212 | 
213 |         return phrase
214 |     
215 |     def download_audio(self, src, save_path):
216 |         self.reports.logs_report("debug", data="Downloading the audio to the temp folder")
217 |         self.console.debugging(self.debug, msg="Downloading the audio to the temp folder")
218 |         
219 |         urllib.request.urlretrieve(src, save_path)
220 | 
221 |     def convert_mp3_to_wav(self, mp3_path, wav_path):
222 |         self.reports.logs_report("debug", data="Converting MP3 to WAV format")
223 |         self.console.debugging(self.debug, msg="Converting MP3 to WAV format")
224 | 
225 |         sound = pydub.AudioSegment.from_mp3(mp3_path)
226 |         sound.export(wav_path, format="wav")
227 | 
228 |     async def async_decode_audio(self, wav_path):
229 |         loop = asyncio.get_event_loop()
230 |         return await loop.run_in_executor(None, self.decode_audio, wav_path)
231 |     
232 |     def decode_audio(self, wav_path):
233 |         self.reports.logs_report("debug", data="Transcribing the audio content")
234 |         self.console.debugging(self.debug, msg="Transcribing the audio content")
235 | 
236 |         recognizer = speech_recognition.Recognizer()
237 |         with speech_recognition.AudioFile(wav_path) as source:
238 |             audio = recognizer.record(source)
239 |         return recognizer.recognize_google(audio)
240 | 
241 |     def cleanup_temp_files(self, *paths):
242 |         for path in paths:
243 |             try:
244 |                 os.remove(path)
245 |             except Exception as e:
246 |                 self.reports.logs_report("warning", data=f"Failed to delete {path}: {e}")
247 |                 self.console.log_print("warning", f"Failed to delete {path}: {e}")
248 | 
249 |     def get_temp_audio_paths(self):
250 |         timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
251 |         mp3 = os.path.join(gettempdir(), f"{timestamp}.mp3")
252 |         wav = os.path.join(gettempdir(), f"{timestamp}.wav")
253 | 
254 |         return mp3, wav
255 | 
256 |     def get_text_blocked(self, driver):
257 |         try:
258 |             recaptcha_header = driver.find_element(By.CLASS_NAME, "rc-doscaptcha-body-text")
259 |             return recaptcha_header
260 |         except NoSuchElementException:
261 |             return None
262 | 
263 |     def is_blocked(self, driver):
264 |         blocked = self.get_text_blocked(driver)
265 |         if blocked is not None:
266 |             self.reports.logs_report("error", data=f"Failed to bypass v2 protection. IP has been blocked! {Bgcolor.BLUE}reason{Bgcolor.DEFAULT}:{blocked.text}")
267 |             self.console.log_print("error", msg=f"Failed to bypass v2 protection. IP has been blocked! {Bgcolor.BLUE}reason{Bgcolor.DEFAULT}:{blocked.text}")
268 | 
269 |         if driver.current_url == "https://www.google.com/sorry/index":
270 |             self.reports.logs_report("error", data="Unexpected response comes from search engines")
271 |             self.console.log_print("error", msg="Unexpected response comes from search engines")
272 |     
273 |     async def solve_captcha(self, driver, url):
274 |         self.reports.logs_report("debug", data=f"Bad URL {url}")
275 |         self.console.debugging(self.debug, msg=f"Bad URL {url}")
276 | 
277 |         self.wait = WebDriverWait(driver, 5)
278 | 
279 |         driver.get(url)
280 |         await self.recaptcha_service(driver)


--------------------------------------------------------------------------------
/src/godork/services/scrape.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import os
  3 | import time
  4 | import random
  5 | import psutil
  6 | 
  7 | from aiohttp import ClientSession, TCPConnector
  8 | 
  9 | from ..utils.colors import Bgcolor
 10 | from ..utils.exceptions import GodorkException, GodorkTimeout, GodorkNoData, GodorkMaxRetries
 11 | from ..utils.parse import get_query, get_page_num, set_page_num
 12 | from ..helpers.console import Console
 13 | from ..helpers.reports import Reports
 14 | from ..helpers.extractor import extract_pages, extract_data
 15 | from .requester import Requester
 16 | from .driver import SeleniumDriver
 17 | from .recaptcha import RecaptchaBypass
 18 | 
 19 | class Scraper:
 20 | 
 21 |     """
 22 |     The Scraper class is a sophisticated tool designed for scraping Google search results, efficiently managing retries, handling CAPTCHAs, and extracting valuable data like links and titles. 
 23 |     It provides both synchronous and asynchronous scraping capabilities, ensuring flexibility and robustness while interacting with Google's search engine.
 24 |     
 25 |     Key Features:
 26 | 
 27 |         1. Initialization (__init__):
 28 | 
 29 |             * The class initializes a number of key parameters like:
 30 | 
 31 |                 - Dorks: A list of search queries (either from a file or input string).
 32 |                 - Debugging and Proxy Settings: Configuration for debugging and using proxies.
 33 |                 - Retries: Mechanism to retry failed requests with a configurable retry count and maximum retry limit.
 34 |                 - Headless Mode: Configuration to run the scraper in headless mode for browser interactions.
 35 | 
 36 |             * The scraper utilizes several components for functionality:
 37 | 
 38 |                 - Console: For logging and output management.
 39 |                 - Reports: For generating detailed reports about the scraping process.
 40 |                 - Requester: For handling the HTTP requests (both synchronous and asynchronous).
 41 |                 - RecaptchaBypass: A service for bypassing CAPTCHA protections encountered during scraping.
 42 | 
 43 |         2. Parameter Construction (params):
 44 | 
 45 |             * This method generates the request parameters for querying Google search results, including:
 46 | 
 47 |                 - q: The search query.
 48 |                 - client: Randomly selecting a user agent string (chrome, firefox, ubuntu, etc.)
 49 |                 - start: The page number for pagination.
 50 | 
 51 |         3. Asynchronous Connection Handling (reuse_connection):
 52 | 
 53 |             * This function manages retries when CAPTCHA protection is triggered on Google search results pages.
 54 |             * It calls the RecaptchaBypass.solve_captcha method to handle CAPTCHA challenges.
 55 |             * If CAPTCHA is detected, the method tries to bypass it by interacting with the page's reCAPTCHA service and retries the process for a set number of attempts.
 56 | 
 57 |         4. Fetching URLs (fetch_urls):
 58 | 
 59 |             * This method initiates an HTTP GET request to retrieve search result pages.
 60 |             * It handles various HTTP response codes:
 61 | 
 62 |                 - 200 OK: Processes valid search results and extracts data.
 63 |                 - 3xx Redirects: Detects CAPTCHA challenges and attempts to bypass them.
 64 |                 - 4xx and 5xx Errors: Logs client and server errors.
 65 | 
 66 |             * It attempts to bypass reCAPTCHA challenges automatically when detected.
 67 |             * If the request fails, it retries a set number of times before throwing an error.
 68 | 
 69 |         5. Fetching Links (fetch_links):
 70 | 
 71 |             * This method handles the core logic of iterating through search queries (dorks) and fetching search result pages.
 72 |             * For each query, it sends requests to multiple pages (using the params method to adjust the page number) and attempts to extract links and titles from the result.
 73 |             * It also gracefully handles exceptions such as timeouts and CAPTCHA protection issues, retrying requests when necessary.
 74 | 
 75 |         6. Running the Scraper (run_with_async):
 76 | 
 77 |             * This method serves as the entry point for running the asynchronous scraper.
 78 |             * It starts by printing introductory messages, including warnings about using the scraper responsibly.
 79 |             * Using async with ClientSession, it establishes a session to interact with Google search, managing retries and exceptions along the way.
 80 |             * Once the scraping process completes, the session is closed, and the final report is saved.
 81 | 
 82 |     """
 83 | 
 84 |     def __init__(self, dorks, proxy, debug, retries, max_retries, headless_mode):
 85 |         self.base_url = "https://www.google.com/search"
 86 |     
 87 |         self.dorks = dorks.strip().splitlines() if not os.path.isfile(dorks) else open(dorks, 'r').read().strip().splitlines()
 88 |         self.proxy = proxy
 89 |         self.debug = debug
 90 |         self.retries = retries
 91 |         self.max_retries = max_retries
 92 |         self.headless = headless_mode
 93 | 
 94 |         self.console = Console()
 95 |         self.reports = Reports()
 96 |         self.requester = Requester()
 97 |         self.recaptcha_service = RecaptchaBypass(debug, headless_mode=headless_mode)
 98 | 
 99 |     def get_memory_usage(self):
100 |         process = psutil.Process(os.getpid())
101 |         print(self.console.text_format("info", msg=f"Memory usage: {process.memory_info().rss / 1024 ** 2:.2f} MB"))
102 | 
103 |     def params(self, query, page):
104 |         return {
105 |             "q": query,
106 |             "channel": "fs",
107 |             "client": random.choice(["chrome", "firefox", "ubuntu", "gws"]),
108 |             "start": page,
109 |         }
110 |     
111 |     async def reuse_connection(self, session, url, retry_count=0, **kwargs):
112 |         num_page = kwargs.get("num_page")
113 | 
114 |         if retry_count >= self.max_retries:
115 |             raise GodorkMaxRetries("Maximum retries attempts reached for solving v2 protection")
116 | 
117 |         self.reports.logs_report("info", data="Initiating v2 bypass...")
118 |         self.console.log_print("info", msg="Initiating v2 bypass...")
119 | 
120 |         with SeleniumDriver(headless_mode=self.headless) as driver:
121 |             try:
122 |                 await self.recaptcha_service.solve_captcha(driver, url)
123 | 
124 |                 target_url = driver.current_url
125 |                 data_html = driver.page_source
126 | 
127 |                 try:
128 |                     last_page = extract_pages(data_html)
129 |                     self.reports.logs_report("info", data=f"Total known pages: {last_page}")
130 |                     self.console.log_print("info", msg=f"Total known pages: {last_page}")
131 |                 except IndexError:
132 |                     pass
133 |                 
134 |                 extract_data(data_html, reports=self.reports, metadata={"query": get_query(url), "num_page": set_page_num(num_page)})
135 | 
136 |                 await self.fetch_urls(session, url=target_url, params=None)
137 |             
138 |             except (GodorkException, GodorkTimeout) as err:
139 |                 self.reports.logs_report("error", data=f"Failed to bypass v2 protection. {Bgcolor.BLUE}reason{Bgcolor.DEFAULT}:{err}")
140 |                 self.console.log_print("error", msg=f"Failed to bypass v2 protection. {Bgcolor.BLUE}reason{Bgcolor.DEFAULT}:{err}")
141 | 
142 |                 self.reports.logs_report("info", data=f"Retrying bypass of v2 protection (attempt: {retry_count+1}) on page {set_page_num(num_page)}")
143 |                 self.console.log_print("info", msg=f"Retrying bypass of v2 protection (attempt: {retry_count+1}) on page {set_page_num(num_page)}")
144 | 
145 |                 await self.reuse_connection(session, url, num_page=num_page, retry_count=retry_count + 1)
146 |             finally:
147 |                 driver.quit()
148 | 
149 |     async def fetch_urls(self, session, url, **kwargs):
150 |         num_page = kwargs.get("params")["start"] if kwargs.get("params") is not None else get_page_num(url)
151 |         i = 0
152 | 
153 |         while True:
154 |             i += 1
155 |             response, data_html = await self.requester.aioreqwest(
156 |                 session,
157 |                 method="GET",
158 |                 url=url,
159 |                 proxy=self.proxy,
160 |                 params=kwargs.get("params"),
161 |                 timeout=10,
162 |                 redirects=False
163 |             )
164 |             
165 |             self.reports.logs_report("debug", data=f"Initiating request to {str(response.url)}")
166 |             self.console.debugging(self.debug, msg=f"Initiating request to {str(response.url)}")
167 | 
168 |             self.reports.logs_report("debug", data=f"Getting response status {response.status}")
169 |             self.console.debugging(self.debug, msg=f"Getting response status {response.status}")
170 | 
171 |             if "Google Search" not in re.findall("<title>(.*?)</title>", data_html):
172 | 
173 |                 if response.status == 200:
174 |                     try:
175 |                         last_page = extract_pages(data_html)
176 |                         self.reports.logs_report("info", data=f"Total known pages: {last_page}")
177 |                         self.console.log_print("info", msg=f"Total known pages: {last_page}")
178 |                     except IndexError:
179 |                         pass
180 | 
181 |                     extract_data(data_html, reports=self.reports, metadata={"query": get_query(response.url), "num_page": set_page_num(num_page)})
182 | 
183 |                 if 300 <= response.status <= 399 and "https://www.google.com/sorry/index" in response.headers.get("Location"):
184 |                     url_redirection = response.headers["Location"]
185 | 
186 |                     self.reports.logs_report("debug", data=f"Getting the redirect URL {url_redirection}")
187 |                     self.console.debugging(self.debug, msg=f"Getting the redirect URL {url_redirection}")
188 | 
189 |                     self.reports.logs_report("warning", data="Requests were blocked due to provider-side protection")
190 |                     self.console.log_print("warning", msg="Requests were blocked due to provider-side protection    ")
191 |                     
192 |                     self.reports.logs_report("warning", data=f"reCAPTCHA detected on the page {set_page_num(num_page)}")
193 |                     self.console.debugging(self.debug, msg=f"reCAPTCHA detected on the page {set_page_num(num_page)}")
194 | 
195 |                     await self.reuse_connection(session, url=url_redirection, num_page=num_page, retry_count=0)
196 | 
197 |                 if 400 <= response.status <= 499:
198 |                     self.reports.logs_report("error", data=f"Failed to fetch request on page {set_page_num(num_page)} {Bgcolor.BLUE}reason{Bgcolor.DEFAULT}:Client error occurred")
199 |                     self.console.log_print("error", msg=f"Failed to fetch request on page {set_page_num(num_page)} {Bgcolor.BLUE}reason{Bgcolor.DEFAULT}:Client error occurred")
200 | 
201 |                 if 500 <= response.status <= 599:
202 |                     self.reports.logs_report("error", data=f"Failed to fetch request on page {set_page_num(num_page)} {Bgcolor.BLUE}reason{Bgcolor.DEFAULT}:Server error occurred")
203 |                     self.console.log_print("error", msg=f"Failed to fetch request on page {set_page_num(num_page)} {Bgcolor.BLUE}reason{Bgcolor.DEFAULT}:Server error occurred")
204 | 
205 |                 break
206 | 
207 |             if i >= self.retries:
208 |                 raise GodorkMaxRetries("The request failed after reaching the maximum number of retries attempts")
209 | 
210 |             else:
211 |                 print(f"\r{self.console.out_log_format('warning', msg=f'Unexpected provider response. Retrying (request: {i})')}", flush=True, end="\r")
212 |                     
213 |     async def fetch_links(self, session, url):
214 |         for query in self.dorks:
215 |             self.reports.logs_report("info", data=f"{Bgcolor.BOLD}Starting enumeration for {query}{Bgcolor.DEFAULT}")
216 |             self.console.log_print("info", msg=f"{Bgcolor.BOLD}Starting enumeration for {query}{Bgcolor.DEFAULT}")
217 | 
218 |             for i in range(0, 501, 10):
219 |                 self.console.debugging(self.debug, msg=f"Performing an HTTP GET request on page {set_page_num(i)}")
220 |                 self.reports.logs_report("info", data=f"Performing an HTTP GET request on page {set_page_num(i)}")
221 | 
222 |                 try:
223 |                     await self.fetch_urls(session, url=url, params=self.params(query=query, page=i))
224 |                 except GodorkMaxRetries as err:
225 |                     self.reports.logs_report("warning", data=err)
226 |                     self.console.log_print("warning", msg=err)
227 |         
228 |                     self.reports.logs_report("info", data="Try using the `--no-headless` option to make changes")
229 |                     self.console.log_print("info", msg="Try using the `--no-headless` option to make changes")
230 |                     break
231 |                 except GodorkNoData as err:
232 |                     self.reports.logs_report("info", data=err)
233 |                     self.console.log_print("info", msg=err)
234 |                     break
235 |                 except Exception as err:
236 |                     self.reports.logs_report("error", data=err)
237 |                     self.console.log_print("error", msg=err)
238 |                     break
239 | 
240 |     async def run_with_async(self):
241 |         print(self.console.text_format("info", msg="A high-speed scraper for collecting links and titles from Google search results"))
242 |         print(self.console.text_format("warning", msg="Use with caution. You are responsible for your actions"))
243 |         print(self.console.text_format("warning", msg="Developers assume no liability and are not responsible for any issue or damage"))
244 | 
245 |         time.sleep(1)
246 | 
247 |         async with ClientSession(connector=TCPConnector(ssl=False if self.proxy else True)) as session:
248 |             try:
249 |                 await self.fetch_links(session, url=self.base_url)
250 |             finally:
251 |                 await session.close()
252 | 
253 |         print(self.console.text_format("info", msg="Report saved to {}".format(self.reports.base_dir)))
254 |         self.get_memory_usage()


--------------------------------------------------------------------------------