├── postscraper.py
├── Groogle.py
└── README.md


/postscraper.py:
--------------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed


class AdvancedWebScraper:
    def __init__(self):
        # Maps each URL to the paragraph text extracted from it.
        self.results = {}

    def extract_data(self, url):
        """Fetch a page and return the text of its <p> tags, or None on failure."""
        try:
            response = requests.get(url, allow_redirects=True, timeout=5)
        except requests.RequestException:
            return None
        if response.status_code != 200:
            return None
        soup = BeautifulSoup(response.content, "html.parser")
        paragraphs = [paragraph.text.strip() for paragraph in soup.find_all("p")]
        # Join with newlines so separate paragraphs do not run together.
        return "\n".join(paragraphs)


if __name__ == "__main__":
    scraper = AdvancedWebScraper()
    urls = ["http://example.com", "http://example.org"]
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(scraper.extract_data, url): url for url in urls}
        for future in as_completed(futures):
            url = futures[future]          # URL associated with this future
            data = future.result()         # Extracted text (None on failure)
            scraper.results[url] = data    # Store the result keyed by URL

    # Print the results
    for url, data in scraper.results.items():
        print(f"{url}: {data}")
--------------------------------------------------------------------------------
/Groogle.py:
--------------------------------------------------------------------------------
# Groogle : Groq + Google

import os
from concurrent.futures import ThreadPoolExecutor, as_completed

from dotenv import load_dotenv
from googlesearch import search
from groq import Groq

from postscraper import AdvancedWebScraper

# Load the Groq API key from a .env file or the environment.
load_dotenv()

prompt = input("ask me anything: ")
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# Ask a small model to turn the prompt into a single Google search query.
query_generator = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": f"generate only 1 google search query for this prompt ({prompt}) to get help, put it inside quotes",
        }
    ],
    model="llama3-8b-8192",
)

# The model is told to wrap the query in double quotes; keep only the quoted parts.
query = " ".join(query_generator.choices[0].message.content.split('"')[1::2])

print(query)

scraper = AdvancedWebScraper()

# Search Google for the generated query and scrape the top results concurrently.
urls = list(search(query, num_results=5))
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = {executor.submit(scraper.extract_data, url): url for url in urls}
    for future in as_completed(futures):
        url = futures[future]          # URL associated with this future
        data = future.result()         # Scraped paragraph text (None on failure)
        scraper.results[url] = data    # Store the result keyed by URL

# for url, data in scraper.results.items():
#     print(f"\n{url}:\n```{data}```")

# Ask a larger model to answer the original prompt using the scraped results.
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": f"answer this prompt '{prompt}' using following web results: \n {scraper.results}",
        }
    ],
    model="llama3-70b-8192",
)
answer = chat_completion.choices[0].message.content
print(answer)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Groogle

Groogle is a Python application that combines the Groq API with Google search to answer questions from live web content.
It uses the Groq API to turn your prompt into a Google search query, retrieves the top results, scrapes their text with a web scraper, and then calls the Groq API again to generate an answer from the prompt and the scraped data.

## Features

- **Groq API Integration**: Uses Groq chat completions to generate both the search query and the final answer.
- **Google Search**: Performs a Google search to find relevant web results.
- **Web Scraping**: Extracts paragraph text from the result pages with a web scraper.
- **Concurrent Processing**: Scrapes the result pages concurrently to speed up the process.
- **Environment Variables**: Loads the API key from environment variables for secure access.

## Usage

1. Install the required packages by running `pip install -r requirements.txt` (see the sketch below).
2. Set your Groq API key in your environment variables as `GROQ_API_KEY`.
3. Run the script and enter your prompt when asked.
4. The script will generate a Google search query, perform the search, scrape the results, and use the Groq API to generate an answer based on the prompt and the extracted data.
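
The repository tree above does not show a `requirements.txt`, so the following is only a sketch of what it would likely contain, inferred from the imports in `postscraper.py` and `Groogle.py` (the PyPI package names, in particular `googlesearch-python` for the `googlesearch` module, are assumptions):

```text
groq
python-dotenv
googlesearch-python
requests
beautifulsoup4
```

Because the script calls `load_dotenv()`, the key can either be exported in the shell or kept in a `.env` file next to the scripts; a hypothetical run might look like:

```bash
# Either export the key (placeholder value shown) ...
export GROQ_API_KEY="your-groq-api-key"
# ... or put GROQ_API_KEY=your-groq-api-key in a .env file next to the scripts.
python Groogle.py
```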
## Note

- The script requires an active internet connection to perform Google searches and scrape web results.
- The `googlesearch` package is convenient for small queries but is not well suited to large-scale or high-volume searching.
- The scraper in `postscraper.py` only collects `<p>` text, so it will not extract useful content from every website.
- Concurrency relies on the `concurrent.futures` module, which may not be the best fit for every use case.

## Contributing

Contributions are welcome! Please open an issue or submit a pull request for any changes or improvements.

## License

This project is licensed under the [MIT License](LICENSE).
--------------------------------------------------------------------------------