├── README.md
├── __pycache__
└── logic.cpython-310.pyc
├── app.py
├── demo.py
├── logic.py
└── templates
└── index.html
/README.md:
--------------------------------------------------------------------------------
1 | Here's an article I wrote explaining the project:
2 | https://wandb.ai/byyoung3/ML-NEWS2/reports/Building-a-real-time-answer-engine-with-Llama-3-1-405B-and-W-B-Weave--Vmlldzo4ODk3OTIz
3 |
4 |
5 |
6 | Hire me, @perplexity (haha) -- @openai, you too.
7 |
8 |
9 |
10 | also
11 |
12 | I'm working on a maps-focused version of this for finding stores, businesses, gyms, etc. nearby. If you want to be involved (code with me or fund me), email me at byyoung3@gmail.com
13 |
--------------------------------------------------------------------------------
/__pycache__/logic.cpython-310.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bdytx5/open_answer_engine/f632276585ea9081b8d2a1b5e1569b75fb092af2/__pycache__/logic.cpython-310.pyc
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import threading
4 | import nest_asyncio
5 | import asyncio
6 | from flask import Flask, request, render_template, jsonify
7 | from logic import Search, Model
8 |
# Vertex AI connection settings used when building the Model client.
PROJECT_ID = "your poject id"
API_ENDPOINT = "us-central1-aiplatform.googleapis.com"
REGION = "us-central1"

# Patch asyncio so an event loop can be run from inside a running one.
nest_asyncio.apply()

# Folder that receives the page screenshots.
videos_folder = r"./download"

# Start from an empty folder: create it when missing, otherwise delete the
# files and symlinks it contains (sub-directories are left untouched).
if not os.path.exists(videos_folder):
    os.makedirs(videos_folder)
else:
    for entry_name in os.listdir(videos_folder):
        entry_path = os.path.join(videos_folder, entry_name)
        removable = os.path.isfile(entry_path) or os.path.islink(entry_path)
        if removable:
            os.unlink(entry_path)

# Event cleared at the start of each search and set once the wait is over.
stop_flag = threading.Event()

# Text of the most recent model answer, served by the /results route.
response_storage = ""

app = Flask(__name__)
36 |
@app.route('/')
def index():
    """Render and return the ``index.html`` template for the root URL."""
    landing_page = render_template('index.html')
    return landing_page
40 |
@app.route('/search', methods=['POST'])
def search():
    """Handle a search form submission and compute the answer.

    Reads the ``query`` form field and runs ``run_search_and_ocr`` to
    completion. The answer is not part of this response: it is stored in the
    module-level ``response_storage`` and fetched through ``/results``.

    Returns:
        A JSON body ``{'status': 'Search started'}`` on success, or a JSON
        ``error`` body with HTTP status 400 when no query was supplied.
    """
    query = request.form.get('query')
    if not query or not query.strip():
        # Without this guard a missing field is passed on as None and ends
        # up in the model prompt as the literal text "None".
        return jsonify({'error': 'Missing query'}), 400

    # Seconds each page is given to settle before its screenshot is taken.
    delay = 1

    # Clear the stop flag before running the function
    stop_flag.clear()

    # asyncio.run blocks until the whole pipeline has finished, so the answer
    # is already stored by the time this request returns.
    asyncio.run(run_search_and_ocr(query, delay))
    return jsonify({'status': 'Search started'})
52 |
async def run_search_and_ocr(query, delay):
    """Answer ``query`` and store the answer in ``response_storage``.

    The model is first asked whether a web search is needed. If so, the
    result pages are screenshotted in a worker thread, this coroutine waits a
    fixed 15 seconds, and whatever screenshots exist by then are OCR'd into a
    context string that accompanies the query.

    Args:
        query: The user's question.
        delay: Seconds each page is given to settle before its screenshot.
    """
    global response_storage
    context = ""
    if Search.decide_search(query):
        urls = Search.get_search_results(query, num_results=20)
        process_thread = threading.Thread(target=Search.process_urls, args=(urls, delay))
        process_thread.start()
        await asyncio.sleep(15)
        # NOTE(review): Search.process_urls does not read any stop flag, so
        # setting it does not interrupt the worker; it may still be writing
        # screenshots while the OCR step below reads the folder.
        stop_flag.set()
        if process_thread.is_alive():
            # Zero timeout: a liveness poke only, it never blocks.
            process_thread.join(timeout=0)

        # get_context_from_ocr_results returns None when there are no
        # screenshots; normalise so "None" is not sent to the model as help.
        context = Search.get_context_from_ocr_results() or ""

    model = Model(endpoint=API_ENDPOINT, region=REGION, project_id=PROJECT_ID)
    response = model.query_model_non_stream(query, context)
    # Always store a string: /results calls .splitlines() on this value.
    response_storage = response or ""
70 |
71 |
@app.route('/results', methods=['GET'])
def get_results():
    """Return the most recently stored answer as a JSON list of lines.

    Returns:
        A JSON body ``{'results': [...]}``; the list is empty when no answer
        has been stored yet.
    """
    # Guard against a non-string value (e.g. None from a failed model call)
    # so this route cannot raise AttributeError on .splitlines().
    stored_answer = response_storage or ""
    return jsonify({'results': stored_answer.splitlines()})
76 |
if __name__ == "__main__":
    # Debug mode stays on by default, as before, for local development.
    # Setting FLASK_DEBUG to 0/false/no/off disables it so the interactive
    # debugger is not exposed when the app is reachable by others.
    debug_enabled = os.environ.get("FLASK_DEBUG", "1").strip().lower() not in ("0", "false", "no", "off")
    app.run(debug=debug_enabled)
79 |
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import pytesseract
4 | from PIL import Image
5 | from googlesearch import search
6 | import asyncio
7 | from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
8 | from concurrent.futures import ThreadPoolExecutor
9 | import threading
10 | import nest_asyncio
11 | import requests
12 | import json
13 | import subprocess
14 | import concurrent
15 | import sys
16 | import weave
17 |
18 |
19 | import requests
20 | import json
21 | import subprocess
# Google Cloud project used when building the Model client.
gcp_proj = "your project id"

# Start a Weave project so the @weave.op() calls below are recorded.
weave.init("answer_engine")

# Patch asyncio so an event loop can be run from inside a running one.
nest_asyncio.apply()

# Folder that receives the page screenshots.
videos_folder = r"./download"

# Start from an empty folder: create it when missing, otherwise delete the
# files and symlinks it contains (sub-directories are left untouched).
if not os.path.exists(videos_folder):
    os.makedirs(videos_folder)
else:
    for entry_name in os.listdir(videos_folder):
        entry_path = os.path.join(videos_folder, entry_name)
        removable = os.path.isfile(entry_path) or os.path.islink(entry_path)
        if removable:
            os.unlink(entry_path)

# Event cleared before a search starts and set once the wait is over.
stop_flag = threading.Event()
43 |
class Search:
    """Static helpers for the search -> screenshot -> OCR pipeline.

    Screenshots are written to, and later read back from, the module-level
    ``videos_folder`` directory.
    """

    @staticmethod
    def get_search_results(query, num_results=5):
        """Return the result URLs for ``query`` as a list.

        Args:
            query: Search text passed to ``search``.
            num_results: Number of results requested from ``search``.
        """
        return list(search(query, num_results=num_results))

    @staticmethod
    async def download_screenshot(url, delay, index):
        """Load ``url`` in headless Chromium and save a full-page screenshot.

        Args:
            url: Page to open.
            delay: Seconds to wait after navigation before capturing.
            index: Number used in the file name ``Screenshot_{index}.png``.

        Returns:
            The screenshot path, or ``None`` when loading or capturing
            failed (the failure is printed, not raised).
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            file_name = f'{videos_folder}/Screenshot_{index}.png'
            try:
                # Context/page creation sits inside the try so the browser is
                # closed by the finally clause even if either call fails.
                context = await browser.new_context()
                page = await context.new_page()
                # Hard 5 second cap on navigation.
                await asyncio.wait_for(page.goto(url), timeout=5)
                await page.set_viewport_size({"width": 1920, "height": 1080})
                await page.wait_for_timeout(delay * 1000)
                await page.screenshot(path=file_name, full_page=True)
                print(f"Screenshot saved as {file_name}!")
            except (PlaywrightTimeoutError, asyncio.TimeoutError):
                print(f"Timeout occurred while loading {url}")
                file_name = None
            except Exception as e:
                print(f"Unexpected error occurred: {e}")
                file_name = None
            finally:
                await browser.close()
            return file_name

    @staticmethod
    def _clear_download_folder():
        """Empty ``videos_folder``, creating it first if it is missing."""
        os.makedirs(videos_folder, exist_ok=True)
        for entry_name in os.listdir(videos_folder):
            entry_path = os.path.join(videos_folder, entry_name)
            if os.path.isfile(entry_path) or os.path.islink(entry_path):
                os.unlink(entry_path)
            elif os.path.isdir(entry_path):
                try:
                    os.rmdir(entry_path)
                except OSError as e:
                    # os.rmdir only removes empty directories. A leftover
                    # non-empty one is harmless (OCR only reads plain files),
                    # so report it instead of aborting the whole run.
                    print(f"Could not remove directory {entry_path}: {e}")

    @staticmethod
    def process_urls(urls, delay):
        """Clear the download folder, then screenshot every URL concurrently.

        Runs the downloads on a private event loop, so it can be used as a
        thread target. Nothing in this method reads ``stop_flag``; once
        started it runs until every download has finished or failed.

        Args:
            urls: Iterable of page URLs.
            delay: Seconds each page is given to settle before capture.

        Returns:
            One entry per URL: a screenshot path, ``None``, or the exception
            raised by that download.
        """
        Search._clear_download_folder()

        async def _process_urls():
            tasks = [Search.download_screenshot(url, delay, index) for index, url in enumerate(urls)]
            return await asyncio.gather(*tasks, return_exceptions=True)

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            return loop.run_until_complete(_process_urls())
        finally:
            # Close the per-call loop; otherwise every call leaks a loop and
            # its file descriptors.
            try:
                loop.run_until_complete(loop.shutdown_asyncgens())
            finally:
                asyncio.set_event_loop(None)
                loop.close()

    @staticmethod
    def perform_ocr(image_path):
        """Return the Tesseract text for ``image_path``; ``None`` in, ``None`` out."""
        if image_path is None:
            return None
        # The context manager releases the image's file handle promptly.
        with Image.open(image_path) as img:
            tesseract_text = pytesseract.image_to_string(img)
        print(f"Tesseract OCR text for {image_path}:")
        print(tesseract_text)
        return tesseract_text

    @staticmethod
    def ocr_results_from_screenshots(screenshots):
        """Run OCR on every screenshot in a thread pool.

        Args:
            screenshots: Iterable of image paths (``None`` entries allowed).

        Returns:
            The OCR results in completion order, not input order. Images
            whose OCR raised are reported via ``print`` and omitted.
        """
        ocr_results = []
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(Search.perform_ocr, screenshot) for screenshot in screenshots]
            for future in concurrent.futures.as_completed(futures):
                try:
                    ocr_results.append(future.result())
                except Exception as e:
                    print(f"An error occurred during OCR processing: {e}")
        return ocr_results

    @staticmethod
    def get_context_from_ocr_results():
        """Build the model context from the files currently in ``videos_folder``.

        Returns:
            The OCR texts joined by spaces, with each text cut to 1000
            characters and the whole string cut to 3000, or ``None`` when the
            folder holds no files.
        """
        screenshots = [
            os.path.join(videos_folder, f)
            for f in os.listdir(videos_folder)
            if os.path.isfile(os.path.join(videos_folder, f))
        ]

        if not screenshots:
            print("No valid screenshots to process.")
            return None

        # Perform OCR on downloaded screenshots and prepare the context
        ocr_results = Search.ocr_results_from_screenshots(screenshots)
        ocr_results = [val[:1000] for val in ocr_results if isinstance(val, str)]
        return " ".join(ocr_results)[:3000]

    @staticmethod
    def decide_search(query):
        """Ask the model whether ``query`` needs a web search; return its verdict."""
        model = Model(endpoint="us-central1-aiplatform.googleapis.com", region="us-central1", project_id=gcp_proj)
        return model.query_model_for_search_decision(query)
136 |
137 |
class Model:
    """Client for the ``meta/llama3-405b-instruct-maas`` chat-completions endpoint.

    Requests are authenticated with a bearer token obtained from the
    ``gcloud`` command-line tool.
    """

    MODEL_NAME = "meta/llama3-405b-instruct-maas"
    # (connect, read) timeouts in seconds so a stalled connection cannot hang
    # the caller forever. The read limit is generous on the assumption that a
    # large model can take a while to answer -- tune as needed.
    REQUEST_TIMEOUT = (10, 300)

    def __init__(self, endpoint, region, project_id):
        """Store the API host, region and project used to build the URL."""
        self.endpoint = endpoint
        self.region = region
        self.project_id = project_id

    def get_access_token(self):
        """Return the token printed by ``gcloud auth print-access-token``.

        Raises:
            subprocess.CalledProcessError: If the command exits non-zero.
        """
        return subprocess.check_output("gcloud auth print-access-token", shell=True).decode('utf-8').strip()

    def _chat_completion(self, prompt):
        """Send ``prompt`` as a single user message and return the reply text.

        Returns ``None`` when the request fails, the status is not 200, or
        the response carries no choices. Failures are printed, not raised.
        """
        access_token = self.get_access_token()
        headers = {
            "Authorization": f"Bearer {access_token}",
            "Content-Type": "application/json"
        }
        data = {
            "model": self.MODEL_NAME,
            "stream": False,
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        }
        url = f"https://{self.endpoint}/v1beta1/projects/{self.project_id}/locations/{self.region}/endpoints/openapi/chat/completions"
        try:
            response = requests.post(url, headers=headers, json=data, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(f"Error: {e}")
            return None

        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            print(response.text)
            return None

        choices = response.json().get("choices") or []
        if not choices:
            return None
        return choices[0]["message"]["content"]

    @weave.op()
    def query_model_non_stream(self, query, context):
        """Answer ``query``, optionally helped by ``context``.

        Args:
            query: The user's question.
            context: Supporting text; ``""`` or ``None`` means none.

        Returns:
            The model's reply, or ``""`` when no reply could be obtained.
        """
        # Truthiness rather than `!= ""` so a None context is not formatted
        # into the prompt as the text "None".
        if context:
            q = "Answer the question {}. You can use this as help: {}".format(query, context)
        else:
            q = query

        reply = self._chat_completion(q)
        return reply if reply is not None else ""

    @weave.op()
    def query_model_for_search_decision(self, query):
        """Ask the model whether answering ``query`` needs a web search.

        Returns:
            ``True`` when the lower-cased reply contains ``'yes'``; ``False``
            otherwise, including when no reply could be obtained.
        """
        # NOTE(review): "might be in you initial training set" probably means
        # "might not be in your ..."; the prompt is left byte-identical here
        # because rewording it changes what the model is asked.
        prompt = f"Do we need a web search to answer the question: {query}? usually questions that are asking about time related details or new inforamtion that might be in you initial training set will require a web search. Also information that could be subject to change is also a good to double check with search. Respond with 'yes' or 'no'."
        reply = self._chat_completion(prompt)
        if reply is None:
            return False
        decision = reply.strip().lower()
        return 'yes' in decision
212 |
async def main():
    """Run the pipeline once for a fixed demo query and print the answer.

    Searches the web, screenshots the results in a worker thread, waits a
    fixed 10 seconds, OCRs whatever screenshots exist by then and sends the
    resulting context to the model along with the query.
    """
    query = "what is the date tomorrow"
    delay = 1  # seconds each page is given to settle before its screenshot

    # Get search results
    urls = Search.get_search_results(query, num_results=10)

    # Clear the stop flag before running the function
    stop_flag.clear()

    thread = threading.Thread(target=Search.process_urls, args=(urls, delay))
    thread.start()

    await asyncio.sleep(10)
    # NOTE(review): Search.process_urls never reads stop_flag, so setting it
    # does not interrupt the worker; it may still be writing screenshots
    # while the OCR step below reads the folder.
    stop_flag.set()

    if thread.is_alive():
        print("Thread is still running. Attempting to stop.")
        # Zero timeout: a liveness poke only, it never blocks.
        thread.join(timeout=0)

    print("done searching...")

    # get_context_from_ocr_results returns None when there are no
    # screenshots; normalise so "None" is not sent to the model as help.
    context = Search.get_context_from_ocr_results() or ""

    # Instantiate and query the model
    model = Model(endpoint="us-central1-aiplatform.googleapis.com", region="us-central1", project_id=gcp_proj)
    res = model.query_model_non_stream(query, context)
    print(res)


if __name__ == "__main__":
    asyncio.run(main())
248 |
--------------------------------------------------------------------------------
/logic.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | import os
4 | import pytesseract
5 | from PIL import Image
6 | from googlesearch import search
7 | import asyncio
8 | from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
9 | from concurrent.futures import ThreadPoolExecutor
10 | import threading
11 | import nest_asyncio
12 | import requests
13 | import json
14 | import subprocess
15 | import concurrent
16 | import weave
17 |
# Vertex AI connection settings used when building the Model client.
PROJECT_ID = "your poject id"
API_ENDPOINT = "us-central1-aiplatform.googleapis.com"
REGION = "us-central1"

# Start a Weave project so the @weave.op() calls below are recorded.
weave.init("answer_engine")

# Patch asyncio so an event loop can be run from inside a running one.
nest_asyncio.apply()

# Folder that receives the page screenshots.
videos_folder = r"./download"

# Start from an empty folder: create it when missing, otherwise delete the
# files and symlinks it contains (sub-directories are left untouched).
if not os.path.exists(videos_folder):
    os.makedirs(videos_folder)
else:
    for entry_name in os.listdir(videos_folder):
        entry_path = os.path.join(videos_folder, entry_name)
        removable = os.path.isfile(entry_path) or os.path.islink(entry_path)
        if removable:
            os.unlink(entry_path)

# Module-level event; nothing in this module sets or reads it.
stop_flag = threading.Event()
42 |
43 |
class Search:
    """Static helpers for the search -> screenshot -> OCR pipeline.

    Screenshots are written to, and later read back from, the module-level
    ``videos_folder`` directory.
    """

    @staticmethod
    def get_search_results(query, num_results=5):
        """Return the result URLs for ``query`` as a list.

        Args:
            query: Search text passed to ``search``.
            num_results: Number of results requested from ``search``.
        """
        return list(search(query, num_results=num_results))

    @staticmethod
    async def download_screenshot(url, delay, index):
        """Load ``url`` in headless Chromium and save a full-page screenshot.

        Args:
            url: Page to open.
            delay: Seconds to wait after navigation before capturing.
            index: Number used in the file name ``Screenshot_{index}.png``.

        Returns:
            The screenshot path, or ``None`` when loading or capturing
            failed (the failure is printed, not raised).
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            file_name = f'{videos_folder}/Screenshot_{index}.png'
            try:
                # Context/page creation sits inside the try so the browser is
                # closed by the finally clause even if either call fails.
                context = await browser.new_context()
                page = await context.new_page()
                # Hard 5 second cap on navigation.
                await asyncio.wait_for(page.goto(url), timeout=5)
                await page.set_viewport_size({"width": 1920, "height": 1080})
                await page.wait_for_timeout(delay * 1000)
                await page.screenshot(path=file_name, full_page=True)
                print(f"Screenshot saved as {file_name}!")
            except (PlaywrightTimeoutError, asyncio.TimeoutError):
                print(f"Timeout occurred while loading {url}")
                file_name = None
            except Exception as e:
                print(f"Unexpected error occurred: {e}")
                file_name = None
            finally:
                await browser.close()
            return file_name

    @staticmethod
    def _clear_download_folder():
        """Empty ``videos_folder``, creating it first if it is missing."""
        os.makedirs(videos_folder, exist_ok=True)
        for entry_name in os.listdir(videos_folder):
            entry_path = os.path.join(videos_folder, entry_name)
            if os.path.isfile(entry_path) or os.path.islink(entry_path):
                os.unlink(entry_path)
            elif os.path.isdir(entry_path):
                try:
                    os.rmdir(entry_path)
                except OSError as e:
                    # os.rmdir only removes empty directories. A leftover
                    # non-empty one is harmless (OCR only reads plain files),
                    # so report it instead of aborting the whole run.
                    print(f"Could not remove directory {entry_path}: {e}")

    @staticmethod
    def process_urls(urls, delay):
        """Clear the download folder, then screenshot every URL concurrently.

        Runs the downloads on a private event loop, so it can be used as a
        thread target. Nothing in this method reads ``stop_flag``; once
        started it runs until every download has finished or failed.

        Args:
            urls: Iterable of page URLs.
            delay: Seconds each page is given to settle before capture.

        Returns:
            One entry per URL: a screenshot path, ``None``, or the exception
            raised by that download.
        """
        Search._clear_download_folder()

        async def _process_urls():
            tasks = [Search.download_screenshot(url, delay, index) for index, url in enumerate(urls)]
            return await asyncio.gather(*tasks, return_exceptions=True)

        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            return loop.run_until_complete(_process_urls())
        finally:
            # Close the per-call loop; otherwise every call leaks a loop and
            # its file descriptors.
            try:
                loop.run_until_complete(loop.shutdown_asyncgens())
            finally:
                asyncio.set_event_loop(None)
                loop.close()

    @staticmethod
    def perform_ocr(image_path):
        """Return the Tesseract text for ``image_path``; ``None`` in, ``None`` out."""
        if image_path is None:
            return None
        # The context manager releases the image's file handle promptly.
        with Image.open(image_path) as img:
            tesseract_text = pytesseract.image_to_string(img)
        print(f"Tesseract OCR text for {image_path}:")
        print(tesseract_text)
        return tesseract_text

    @staticmethod
    def ocr_results_from_screenshots(screenshots):
        """Run OCR on every screenshot in a thread pool.

        Args:
            screenshots: Iterable of image paths (``None`` entries allowed).

        Returns:
            The OCR results in completion order, not input order. Images
            whose OCR raised are reported via ``print`` and omitted.
        """
        ocr_results = []
        with ThreadPoolExecutor() as executor:
            futures = [executor.submit(Search.perform_ocr, screenshot) for screenshot in screenshots]
            for future in concurrent.futures.as_completed(futures):
                try:
                    ocr_results.append(future.result())
                except Exception as e:
                    print(f"An error occurred during OCR processing: {e}")
        return ocr_results

    @staticmethod
    def get_context_from_ocr_results():
        """Build the model context from the files currently in ``videos_folder``.

        Returns:
            The OCR texts joined by spaces, with each text cut to 1000
            characters and the whole string cut to 3000, or ``None`` when the
            folder holds no files.
        """
        screenshots = [
            os.path.join(videos_folder, f)
            for f in os.listdir(videos_folder)
            if os.path.isfile(os.path.join(videos_folder, f))
        ]

        if not screenshots:
            print("No valid screenshots to process.")
            return None

        # Perform OCR on downloaded screenshots and prepare the context
        ocr_results = Search.ocr_results_from_screenshots(screenshots)
        ocr_results = [val[:1000] for val in ocr_results if isinstance(val, str)]
        return " ".join(ocr_results)[:3000]

    @staticmethod
    def decide_search(query):
        """Ask the model whether ``query`` needs a web search; return its verdict."""
        model = Model(endpoint=API_ENDPOINT, region=REGION, project_id=PROJECT_ID)
        return model.query_model_for_search_decision(query)
137 |
138 |
class Model:
    """Client for the ``meta/llama3-405b-instruct-maas`` chat-completions endpoint.

    Requests are authenticated with a bearer token obtained from the
    ``gcloud`` command-line tool.
    """

    MODEL_NAME = "meta/llama3-405b-instruct-maas"
    # (connect, read) timeouts in seconds so a stalled connection cannot hang
    # the caller forever. The read limit is generous on the assumption that a
    # large model can take a while to answer -- tune as needed.
    REQUEST_TIMEOUT = (10, 300)

    def __init__(self, endpoint, region, project_id):
        """Store the API host, region and project used to build the URL."""
        self.endpoint = endpoint
        self.region = region
        self.project_id = project_id

    def get_access_token(self):
        """Return the token printed by ``gcloud auth print-access-token``.

        Raises:
            subprocess.CalledProcessError: If the command exits non-zero.
        """
        return subprocess.check_output("gcloud auth print-access-token", shell=True).decode('utf-8').strip()

    def _chat_completion(self, prompt):
        """Send ``prompt`` as a single user message and return the reply text.

        Returns ``None`` when the request fails, the status is not 200, or
        the response carries no choices. Failures are printed, not raised.
        """
        access_token = self.get_access_token()
        headers = {
            "Authorization": f"Bearer {access_token}",
            "Content-Type": "application/json"
        }
        data = {
            "model": self.MODEL_NAME,
            "stream": False,
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        }
        url = f"https://{self.endpoint}/v1beta1/projects/{self.project_id}/locations/{self.region}/endpoints/openapi/chat/completions"
        try:
            response = requests.post(url, headers=headers, json=data, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(f"Error: {e}")
            return None

        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            print(response.text)
            return None

        choices = response.json().get("choices") or []
        if not choices:
            return None
        return choices[0]["message"]["content"]

    @weave.op()
    def query_model_non_stream(self, query, context):
        """Answer ``query``, optionally helped by ``context``.

        Args:
            query: The user's question.
            context: Supporting text; ``""`` or ``None`` means none.

        Returns:
            The model's reply, or ``""`` when no reply could be obtained.
        """
        # Truthiness rather than `!= ""` so a None context is not formatted
        # into the prompt as the text "None".
        if context:
            q = "Answer the question {}. You can use this as help: {}".format(query, context)
        else:
            q = query

        reply = self._chat_completion(q)
        return reply if reply is not None else ""

    @weave.op()
    def query_model_for_search_decision(self, query):
        """Ask the model whether answering ``query`` needs a web search.

        Returns:
            ``True`` when the lower-cased reply contains ``'yes'``; ``False``
            otherwise, including when no reply could be obtained.
        """
        # NOTE(review): "might be in you initial training set" probably means
        # "might not be in your ..."; the prompt is left byte-identical here
        # because rewording it changes what the model is asked.
        prompt = f"Do we need a web search to answer the question: {query}? usually questions that are asking about time related details or new inforamtion that might be in you initial training set will require a web search. Also information that could be subject to change is also a good to double check with search. Respond with 'yes' or 'no'."
        reply = self._chat_completion(prompt)
        if reply is None:
            return False
        decision = reply.strip().lower()
        return 'yes' in decision
214 |
215 |
--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |