├── requirements.txt
├── example.py
├── LICENSE
├── README.md
└── perpclone
└── __init__.py
/requirements.txt:
--------------------------------------------------------------------------------
readability-lxml
beautifulsoup4
Markdown
requests
openai
ujson
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
from perpclone import perplexity_clone
import webbrowser
import markdown
import tempfile
import openai

# Ask for the API key
openai.api_key = input("Enter your OpenAI API key: ")

# Ask for the prompt
prompt = input("Enter a prompt: ")

# Send the prompt to the perplexity_clone function
result = perplexity_clone(
    prompt,
    verbose=True
)

# Convert the result to HTML and add the prompt as a header
html = f"<h1>{prompt}</h1>" + markdown.markdown(result.replace("\n", "<br>"))

# Add CSS to the HTML content
html = f"""
<html>
<head>
<style>
body {{
    font-family: sans-serif;
    max-width: 800px;
    margin: 40px auto;
}}
</style>
</head>
<body>
{html}
</body>
</html>
"""

# Save the HTML to a temporary file
with tempfile.NamedTemporaryFile("w", delete=False, suffix=".html") as f:
    url = "file://" + f.name
    f.write(html)

# Open the HTML file in the browser
webbrowser.open(url)
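
# Note: "file://" + f.name produces a usable URL on macOS and Linux; on Windows,
# pathlib.Path(f.name).as_uri() is a more portable way to build the file URL.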
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Mason Barnes

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Perplexity-Clone-Python
A near-perfect replica of Perplexity AI's "Search" function in Python, heavily inspired by [clarity-ai](https://github.com/mckaywrigley/clarity-ai).

## Usage
This Perplexity clone can be used from Python in just a few lines. Here is an example, which can also be found in `example.py`:
```python
from perpclone import perplexity_clone
import webbrowser
import markdown
import tempfile
import openai

# Ask for the API key
openai.api_key = input("Enter your OpenAI API key: ")

# Ask for the prompt
prompt = input("Enter a prompt: ")

# Send the prompt to the perplexity_clone function
result = perplexity_clone(
    prompt,
    verbose=True
)

# Convert the result to HTML and add the prompt as a header
html = f"<h1>{prompt}</h1>" + markdown.markdown(result.replace("\n", "<br>"))

# Add CSS to the HTML content
html = f"""
<html>
<head>
<style>
body {{
    font-family: sans-serif;
    max-width: 800px;
    margin: 40px auto;
}}
</style>
</head>
<body>
{html}
</body>
</html>
"""

# Save the HTML to a temporary file
with tempfile.NamedTemporaryFile("w", delete=False, suffix=".html") as f:
    url = "file://" + f.name
    f.write(html)

# Open the HTML file in the browser
webbrowser.open(url)
```
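
`perplexity_clone` also accepts an optional `proxies` dictionary, which it passes through to `requests` for the Google search and every page fetch. A minimal sketch, assuming a hypothetical local proxy endpoint:
```python
from perpclone import perplexity_clone

# Placeholder proxy address; replace with your own
proxies = {"http": "http://127.0.0.1:8080", "https": "http://127.0.0.1:8080"}
result = perplexity_clone("What is the new Discord username system?", proxies=proxies, verbose=True)
```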

## Requirements
Requirements can be installed via the `requirements.txt` file. You will also need an [API key from OpenAI](https://openai.com/product), optionally with GPT-4 access if you would like higher-quality completions.
```bash
pip install -r requirements.txt
```
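
If you would rather not paste the key on every run, one option (not shown in this repo) is to read it from an environment variable before calling `perplexity_clone`:
```python
import os
import openai

# Assumes you have exported OPENAI_API_KEY in your shell beforehand
openai.api_key = os.environ["OPENAI_API_KEY"]
```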

## Credits
Again, huge credit to Mckay Wrigley's [clarity-ai](https://github.com/mckaywrigley/clarity-ai). Much of this project's code was ported from his.
--------------------------------------------------------------------------------
/perpclone/__init__.py:
--------------------------------------------------------------------------------
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse, parse_qs
from readability import Document
from datetime import datetime
from bs4 import BeautifulSoup
import requests
import openai
import ujson

COMPLETION_MODEL = "gpt-3.5-turbo"
SOURCE_COUNT = 5
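# COMPLETION_MODEL can be swapped for a stronger chat model such as "gpt-4",
# assuming your API key has access to it (see the README's Requirements section).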

def generate_search_query(text: str, model="gpt-3.5-turbo") -> str:
    """
    Uses OpenAI's ChatCompletions to generate a search query from a given text.

    ### Example:
    For the text `What is the new Discord username system?`, a search query similar to `discord new username system` would be generated.
    """
    return openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "Given a query, respond with the Google search query that would best help to answer the query. Don't use search operators. Respond with only the Google query and nothing else."},
            {"role": "user", "content": text}
        ]
    )["choices"][0]["message"]["content"]

def get_google_search_links(query: str, source_count: int = SOURCE_COUNT, proxies: dict = None) -> list[str]:
    """
    Scrapes the official Google search page using the `requests` module and returns the first `source_count` links.
    """
    url = f"https://www.google.com/search?q={query}"
    response = requests.get(url, proxies=proxies)
    soup = BeautifulSoup(response.text, "html.parser")
    link_tags = soup.find_all("a")

    links = []
    for link in link_tags:
        href = link.get("href")
        if href and href.startswith("/url?q="):
            # Pull the real target URL out of Google's "/url?q=..." redirect
            cleaned_href = parse_qs(urlparse(href).query)["q"][0]
            if cleaned_href not in links:
                links.append(cleaned_href)

    # Keep at most one link per domain, skipping Google itself and social media
    exclude_list = ["google", "facebook", "twitter", "instagram", "youtube", "tiktok"]
    filtered_links = []
    for link in links:
        domain = urlparse(link).hostname
        if not domain or any(site in domain for site in exclude_list):
            continue
        if not any(urlparse(l).hostname == domain for l in filtered_links):
            filtered_links.append(link)

    return filtered_links[:source_count]

def scrape_text_from_links(links: list, proxies: dict = None) -> list[dict]:
    """
    Uses a `ThreadPoolExecutor` to run `scrape_text_from_link` on each link in `links` concurrently, allowing for lightning-fast scraping.
    """
    if not links:
        return []

    with ThreadPoolExecutor(max_workers=len(links)) as executor:
        results = list(executor.map(scrape_text_from_link, links, [proxies] * len(links)))

    for i, result in enumerate(results, start=1):
        result["result_number"] = i

    return results

def scrape_text_from_link(link: str, proxies: dict = None) -> dict:
    """
    Uses the `requests` module to scrape the text from a given link, and then uses the `readability-lxml` module along with `BeautifulSoup4` to parse the text into a readable format.
    """
    response = requests.get(link, proxies=proxies)

    # Extract the main article content, then strip the remaining HTML tags
    doc = Document(response.text)
    parsed = doc.summary()
    soup = BeautifulSoup(parsed, "html.parser")
    source_text = soup.get_text()

    # Truncate very long pages before summarizing to stay within model limits
    return {"url": link, "text": summarize_text(source_text[:50000])}

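# Illustrative example of the returned shape (the URL is a placeholder):
#   scrape_text_from_link("https://example.com/article")
#   -> {"url": "https://example.com/article", "text": "<~100-word summary>"}
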
def summarize_text(text: str, model="gpt-3.5-turbo-16k") -> str:
    """
    Uses OpenAI's ChatCompletions to summarize a given text.
    """
    return openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "Given text, respond with the summarized text (no more than 100 words) and nothing else."},
            {"role": "user", "content": text}
        ]
    )["choices"][0]["message"]["content"]

def search(query: str, proxies: dict = None) -> tuple[list[str], list[dict]]:
    """
    Takes a query, gets the top Google search links for it, and scrapes the text from each link.
    Returns a tuple of the list of links and a list of dictionaries, each containing the URL and the summarized text from that link.
    """
    links = get_google_search_links(query, proxies=proxies)
    sources = scrape_text_from_links(links, proxies=proxies)

    return links, sources

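# Illustrative example (the query is a placeholder):
#   links, sources = search("discord new username system")
#   where each entry in "sources" looks like
#   {"url": "...", "text": "<summary>", "result_number": 1}
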
def perplexity_clone(query: str, proxies: dict = None, verbose=False) -> str:
    """
    A clone of Perplexity AI's "Search" feature. This function takes a query as input and returns Markdown-formatted text containing a response to the query with cited sources.
    """
    formatted_time = datetime.utcnow().strftime("%A, %B %d, %Y %H:%M:%S UTC")

    search_query = generate_search_query(query)
    if verbose:
        print(f"Searching \"{search_query}\"...")
    links, sources = search(search_query, proxies=proxies)

    result = openai.ChatCompletion.create(
        model=COMPLETION_MODEL,
        messages=[
            {"role": "system", "content": "Generate a comprehensive and informative answer for a given question solely based on the provided web Search Results (URL and Summary). You must only use information from the provided search results. Use an unbiased and journalistic tone. Use this current date and time: " + formatted_time + ". Combine search results together into a coherent answer. Do not repeat text. Cite search results using [${number}] notation, and don't link the citations. Only cite the most relevant results that answer the question accurately. If different results refer to different entities with the same name, write separate answers for each entity."},
            {"role": "user", "content": ujson.dumps(sources)},
            {"role": "user", "content": query}
        ]
    )["choices"][0]["message"]["content"]

    # Turn "[n]" citations into Markdown links to the corresponding sources
    for i, link in enumerate(links, start=1):
        result = result.replace(f"[{i}]", f"[[{i}]]({link})")

    return result
--------------------------------------------------------------------------------