├── requirements.txt
├── example.py
├── LICENSE
├── README.md
└── perpclone
└── __init__.py
/requirements.txt:
--------------------------------------------------------------------------------
readability-lxml
beautifulsoup4
Markdown
requests
openai
ujson
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
from perpclone import perplexity_clone
import webbrowser
import markdown
import tempfile
import openai

# Ask for the API key
openai.api_key = input("Enter your OpenAI API key: ")

# Ask for the prompt
prompt = input("Enter a prompt: ")

# Send the prompt to the perplexity_clone function
result = perplexity_clone(
    prompt,
    verbose=True
)

# Convert the result to HTML and add the prompt as a header
html = f"<h1>{prompt}</h1>" + markdown.markdown(result.replace("\n", "<br>"))

# Add CSS to the HTML content
html = f"""
<html>
<head>
<style>
body {{
    font-family: sans-serif;
    max-width: 800px;
    margin: 40px auto;
}}
</style>
</head>
<body>
{html}
</body>
</html>
"""

# Save the HTML to a temporary file
with tempfile.NamedTemporaryFile("w", delete=False, suffix=".html") as f:
    url = "file://" + f.name
    f.write(html)

# Open the HTML file in the browser
webbrowser.open(url)
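
# Note: "file://" + f.name produces a usable URL on macOS and Linux; on Windows,
# pathlib.Path(f.name).as_uri() is a more portable way to build the file URL.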
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Mason Barnes

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Perplexity-Clone-Python
A near-perfect replica of Perplexity AI's "Search" function in Python, heavily inspired by [clarity-ai](https://github.com/mckaywrigley/clarity-ai).

## Usage
This Perplexity clone can be used from Python in just a few lines. Here is an example, which can also be found in `example.py`:
```python
from perpclone import perplexity_clone
import webbrowser
import markdown
import tempfile
import openai

# Ask for the API key
openai.api_key = input("Enter your OpenAI API key: ")

# Ask for the prompt
prompt = input("Enter a prompt: ")

# Send the prompt to the perplexity_clone function
result = perplexity_clone(
    prompt,
    verbose=True
)

# Convert the result to HTML and add the prompt as a header
html = f"<h1>{prompt}</h1>" + markdown.markdown(result.replace("\n", "<br>"))

# Add CSS to the HTML content
html = f"""
<html>
<head>
<style>
body {{
    font-family: sans-serif;
    max-width: 800px;
    margin: 40px auto;
}}
</style>
</head>
<body>
{html}
</body>
</html>
"""

# Save the HTML to a temporary file
with tempfile.NamedTemporaryFile("w", delete=False, suffix=".html") as f:
    url = "file://" + f.name
    f.write(html)

# Open the HTML file in the browser
webbrowser.open(url)
```
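
`perplexity_clone` also accepts an optional `proxies` dictionary, which it passes through to `requests` for the Google search and every page fetch. A minimal sketch, assuming a hypothetical local proxy endpoint:
```python
from perpclone import perplexity_clone

# Placeholder proxy address; replace with your own
proxies = {"http": "http://127.0.0.1:8080", "https": "http://127.0.0.1:8080"}
result = perplexity_clone("What is the new Discord username system?", proxies=proxies, verbose=True)
```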

## Requirements
Requirements can be installed via the `requirements.txt` file. You will also need an [API key from OpenAI](https://openai.com/product), optionally with GPT-4 access if you would like higher-quality completions.
```bash
pip install -r requirements.txt
```
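
If you would rather not paste the key on every run, one option (not shown in this repo) is to read it from an environment variable before calling `perplexity_clone`:
```python
import os
import openai

# Assumes you have exported OPENAI_API_KEY in your shell beforehand
openai.api_key = os.environ["OPENAI_API_KEY"]
```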

## Credits
Again, huge credit to Mckay Wrigley's [clarity-ai](https://github.com/mckaywrigley/clarity-ai). Much of this project's code was ported from his.
--------------------------------------------------------------------------------
/perpclone/__init__.py:
--------------------------------------------------------------------------------
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse, parse_qs
from readability import Document
from datetime import datetime
from bs4 import BeautifulSoup
import requests
import openai
import ujson

COMPLETION_MODEL = "gpt-3.5-turbo"
SOURCE_COUNT = 5
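# COMPLETION_MODEL can be swapped for a stronger chat model such as "gpt-4",
# assuming your API key has access to it (see the README's Requirements section).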

def generate_search_query(text: str, model="gpt-3.5-turbo") -> str:
    """
    Uses OpenAI's ChatCompletions to generate a search query from a given text.

    ### Example:
    For the text `What is the new Discord username system?`, a search query similar to `discord new username system` would be generated.
    """
    return openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "Given a query, respond with the Google search query that would best help to answer the query. Don't use search operators. Respond with only the Google query and nothing else."},
            {"role": "user", "content": text}
        ]
    )["choices"][0]["message"]["content"]

def get_google_search_links(query: str, source_count: int = SOURCE_COUNT, proxies: dict = None) -> list[str]:
    """
    Scrapes the official Google search page using the `requests` module and returns the first `source_count` links.
    """
    url = f"https://www.google.com/search?q={query}"
    response = requests.get(url, proxies=proxies)
    soup = BeautifulSoup(response.text, "html.parser")
    link_tags = soup.find_all("a")

    links = []
    for link in link_tags:
        href = link.get("href")
        if href and href.startswith("/url?q="):
            # Pull the real target URL out of Google's "/url?q=..." redirect
            cleaned_href = parse_qs(urlparse(href).query)["q"][0]
            if cleaned_href not in links:
                links.append(cleaned_href)

    # Keep at most one link per domain, skipping Google itself and social media
    exclude_list = ["google", "facebook", "twitter", "instagram", "youtube", "tiktok"]
    filtered_links = []
    for link in links:
        domain = urlparse(link).hostname
        if not domain or any(site in domain for site in exclude_list):
            continue
        if not any(urlparse(l).hostname == domain for l in filtered_links):
            filtered_links.append(link)

    return filtered_links[:source_count]

def scrape_text_from_links(links: list, proxies: dict = None) -> list[dict]:
    """
    Uses a `ThreadPoolExecutor` to run `scrape_text_from_link` on each link in `links` concurrently, allowing for lightning-fast scraping.
    """
    if not links:
        return []

    with ThreadPoolExecutor(max_workers=len(links)) as executor:
        results = list(executor.map(scrape_text_from_link, links, [proxies] * len(links)))

    for i, result in enumerate(results, start=1):
        result["result_number"] = i

    return results

def scrape_text_from_link(link: str, proxies: dict = None) -> dict:
    """
    Uses the `requests` module to scrape the text from a given link, and then uses the `readability-lxml` module along with `BeautifulSoup4` to parse the text into a readable format.
    """
    response = requests.get(link, proxies=proxies)

    # Extract the main article content, then strip the remaining HTML tags
    doc = Document(response.text)
    parsed = doc.summary()
    soup = BeautifulSoup(parsed, "html.parser")
    source_text = soup.get_text()

    # Truncate very long pages before summarizing to stay within model limits
    return {"url": link, "text": summarize_text(source_text[:50000])}

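# Illustrative example of the returned shape (the URL is a placeholder):
#   scrape_text_from_link("https://example.com/article")
#   -> {"url": "https://example.com/article", "text": "<~100-word summary>"}
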
def summarize_text(text: str, model="gpt-3.5-turbo-16k") -> str:
    """
    Uses OpenAI's ChatCompletions to summarize a given text.
    """
    return openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "Given text, respond with the summarized text (no more than 100 words) and nothing else."},
            {"role": "user", "content": text}
        ]
    )["choices"][0]["message"]["content"]

def search(query: str, proxies: dict = None) -> tuple[list[str], list[dict]]:
    """
    Takes a query, gets the top Google search links for it, and scrapes the text from each link.
    Returns a tuple of the list of links and a list of dictionaries, each containing the URL and the summarized text from that link.
    """
    links = get_google_search_links(query, proxies=proxies)
    sources = scrape_text_from_links(links, proxies=proxies)

    return links, sources

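# Illustrative example (the query is a placeholder):
#   links, sources = search("discord new username system")
#   where each entry in "sources" looks like
#   {"url": "...", "text": "<summary>", "result_number": 1}
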
def perplexity_clone(query: str, proxies: dict = None, verbose=False) -> str:
    """
    A clone of Perplexity AI's "Search" feature. This function takes a query as input and returns Markdown-formatted text containing a response to the query with cited sources.
    """
    formatted_time = datetime.utcnow().strftime("%A, %B %d, %Y %H:%M:%S UTC")

    search_query = generate_search_query(query)
    if verbose:
        print(f"Searching \"{search_query}\"...")
    links, sources = search(search_query, proxies=proxies)

    result = openai.ChatCompletion.create(
        model=COMPLETION_MODEL,
        messages=[
            {"role": "system", "content": "Generate a comprehensive and informative answer for a given question solely based on the provided web Search Results (URL and Summary). You must only use information from the provided search results. Use an unbiased and journalistic tone. Use this current date and time: " + formatted_time + ". Combine search results together into a coherent answer. Do not repeat text. Cite search results using [${number}] notation, and don't link the citations. Only cite the most relevant results that answer the question accurately. If different results refer to different entities with the same name, write separate answers for each entity."},
            {"role": "user", "content": ujson.dumps(sources)},
            {"role": "user", "content": query}
        ]
    )["choices"][0]["message"]["content"]

    # Turn "[n]" citations into Markdown links to the corresponding sources
    for i, link in enumerate(links, start=1):
        result = result.replace(f"[{i}]", f"[[{i}]]({link})")

    return result
--------------------------------------------------------------------------------