├── requirements.txt
├── .gitignore
├── LICENSE
├── llm_config.py
├── web_scraper.py
├── llm_wrapper.py
├── strategic_analysis_parser.py
├── README.md
├── llm_response_parser.py
├── Web-LLM.py
├── Self_Improving_Search.py
└── research_manager.py


/requirements.txt:
--------------------------------------------------------------------------------
 1 | llama-cpp-python
 2 | duckduckgo-search
 3 | colorama
 4 | requests
 5 | beautifulsoup4
 6 | trafilatura
 7 | readchar
 8 | keyboard
 9 | windows-curses; sys_platform == 'win32'
10 | tqdm
11 | urllib3
12 | openai>=1.0.0
13 | anthropic>=0.7.0
14 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__
 2 | venv
 3 | logs
 4 | modelfile
 5 | research_session_*
 6 | 
 7 | # Python
 8 | __pycache__/
 9 | *.py[cod]
10 | *$py.class
11 | 
12 | # Virtual Environment
13 | venv/
14 | env/
15 | .env
16 | 
17 | # IDEs and Editors
18 | .vscode/
19 | .idea/
20 | *.swp
21 | *.swo
22 | 
23 | # OS generated files
24 | .DS_Store
25 | Thumbs.db
26 | 
27 | # Logs
28 | *.log
29 | 
30 | # Model files (if they're large, you might want to exclude them)
31 | *.gguf
32 | 
33 | # Distribution / packaging
34 | dist/
35 | build/
36 | *.egg-info/
37 | 
38 | # Jupyter Notebook
39 | .ipynb_checkpoints
40 | 
41 | # Other
42 | *.bak
43 | *.tmp
44 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 James Warburton
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/llm_config.py:
--------------------------------------------------------------------------------
 1 | # llm_config.py
 2 | 
# Selects which configuration dict get_llm_config() returns.
# Options: 'ollama', 'openai', 'anthropic'
# NOTE: get_llm_config() also recognises 'llama_cpp', but this file defines no
# LLM_CONFIG_LLAMA_CPP, so selecting it fails until that dict is added.
LLM_TYPE = "ollama"

# LLM settings for Ollama
LLM_CONFIG_OLLAMA = {
    "llm_type": "ollama",
    "base_url": "http://localhost:11434",  # default Ollama server URL
    "model_name": "custom-phi3-32k-Q4_K_M",  # Replace with your Ollama model name
    "temperature": 0.7,
    "top_p": 0.9,
    "n_ctx": 55000,  # context window size; LLMWrapper sends it to Ollama as 'num_ctx'
    "stop": ["User:", "\n\n"]  # generation stops at the first of these sequences
}

# LLM settings for OpenAI
# WARNING: This application makes frequent API calls during research operations.
# With a paid API service (OpenAI/Anthropic) costs can accumulate quickly, so
# proceed with caution and monitor your API usage carefully.
LLM_CONFIG_OPENAI = {
    "llm_type": "openai",
    "api_key": "",  # Fallback only: LLMWrapper prefers the OPENAI_API_KEY environment variable
    "base_url": None,  # Optional: Set to use alternative OpenAI-compatible endpoints
    "model_name": "gpt-4o",  # Required: Specify the model to use
    "temperature": 0.7,
    "top_p": 0.9,
    "max_tokens": 4096,
    "stop": ["User:", "\n\n"],
    "presence_penalty": 0,
    "frequency_penalty": 0
}

# LLM settings for Anthropic
# WARNING: This application makes frequent API calls during research operations.
# With a paid API service (OpenAI/Anthropic) costs can accumulate quickly, so
# proceed with caution and monitor your API usage carefully.
LLM_CONFIG_ANTHROPIC = {
    "llm_type": "anthropic",
    "api_key": "",  # Fallback only: LLMWrapper prefers the ANTHROPIC_API_KEY environment variable
    "model_name": "claude-3-5-sonnet-latest",  # Required: Specify the model to use
    "temperature": 0.7,
    "top_p": 0.9,
    "max_tokens": 4096,
    "stop": ["User:", "\n\n"]
}
46 | 
def get_llm_config():
    """Return the configuration dict selected by the module-level ``LLM_TYPE``.

    Returns:
        The ``LLM_CONFIG_*`` dict that corresponds to ``LLM_TYPE``.

    Raises:
        ValueError: If ``LLM_TYPE`` is not a recognised backend, or if it is
            recognised but its configuration dict is not defined in this
            module (e.g. ``"llama_cpp"`` without ``LLM_CONFIG_LLAMA_CPP``).
    """
    config_names = {
        "llama_cpp": "LLM_CONFIG_LLAMA_CPP",
        "ollama": "LLM_CONFIG_OLLAMA",
        "openai": "LLM_CONFIG_OPENAI",
        "anthropic": "LLM_CONFIG_ANTHROPIC",
    }

    config_name = config_names.get(LLM_TYPE)
    if config_name is None:
        raise ValueError(f"Invalid LLM_TYPE: {LLM_TYPE}")

    # Look the dict up by name so a backend whose config was never defined
    # produces a clear configuration error instead of a bare NameError.
    try:
        return globals()[config_name]
    except KeyError:
        raise ValueError(
            f"LLM_TYPE is '{LLM_TYPE}' but {config_name} is not defined in llm_config.py"
        ) from None
58 | 


--------------------------------------------------------------------------------
/web_scraper.py:
--------------------------------------------------------------------------------
  1 | import requests
  2 | from bs4 import BeautifulSoup
  3 | from urllib.robotparser import RobotFileParser
  4 | from urllib.parse import urlparse, urljoin
  5 | import time
  6 | import logging
  7 | from concurrent.futures import ThreadPoolExecutor, as_completed
  8 | import re
  9 | 
# Configure the root logger when this module is imported (INFO level,
# "time - level - message" format) and create the logger used below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
 13 | 
 14 | class WebScraper:
 15 |     def __init__(self, user_agent="WebLLMAssistant/1.0 (+https://github.com/YourUsername/Web-LLM-Assistant-Llama-cpp)",
 16 |                  rate_limit=1, timeout=10, max_retries=3):
 17 |         self.session = requests.Session()
 18 |         self.session.headers.update({"User-Agent": user_agent})
 19 |         self.robot_parser = RobotFileParser()
 20 |         self.rate_limit = rate_limit
 21 |         self.timeout = timeout
 22 |         self.max_retries = max_retries
 23 |         self.last_request_time = {}
 24 | 
 25 |     def can_fetch(self, url):
 26 |         # parsed_url = urlparse(url)
 27 |         # robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
 28 |         # self.robot_parser.set_url(robots_url)
 29 |         # try:
 30 |         #     self.robot_parser.read()
 31 |         #     return self.robot_parser.can_fetch(self.session.headers["User-Agent"], url)
 32 |         # except Exception as e:
 33 |         #     logger.warning(f"Error reading robots.txt for {url}: {e}")
 34 |             return True  # ignore robots.txt
 35 | 
 36 |     def respect_rate_limit(self, url):
 37 |         domain = urlparse(url).netloc
 38 |         current_time = time.time()
 39 |         if domain in self.last_request_time:
 40 |             time_since_last_request = current_time - self.last_request_time[domain]
 41 |             if time_since_last_request < self.rate_limit:
 42 |                 time.sleep(self.rate_limit - time_since_last_request)
 43 |         self.last_request_time[domain] = time.time()
 44 | 
 45 |     def scrape_page(self, url):
 46 |         if not self.can_fetch(url):
 47 |             logger.info(f"Robots.txt disallows scraping: {url}")
 48 |             return None
 49 | 
 50 |         for attempt in range(self.max_retries):
 51 |             try:
 52 |                 self.respect_rate_limit(url)
 53 |                 response = self.session.get(url, timeout=self.timeout)
 54 |                 response.raise_for_status()
 55 |                 return self.extract_content(response.text, url)
 56 |             except requests.RequestException as e:
 57 |                 logger.warning(f"Error scraping {url} (attempt {attempt + 1}/{self.max_retries}): {e}")
 58 |                 if attempt == self.max_retries - 1:
 59 |                     logger.error(f"Failed to scrape {url} after {self.max_retries} attempts")
 60 |                     return None
 61 |                 time.sleep(2 ** attempt)  # Exponential backoff
 62 | 
 63 |     def extract_content(self, html, url):
 64 |         soup = BeautifulSoup(html, 'html.parser')
 65 | 
 66 |         # Remove unwanted elements
 67 |         for element in soup(["script", "style", "nav", "footer", "header"]):
 68 |             element.decompose()
 69 | 
 70 |         # Extract title
 71 |         title = soup.title.string if soup.title else ""
 72 | 
 73 |         # Try to find main content
 74 |         main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
 75 | 
 76 |         if main_content:
 77 |             paragraphs = main_content.find_all('p')
 78 |         else:
 79 |             paragraphs = soup.find_all('p')
 80 | 
 81 |         # Extract text from paragraphs
 82 |         text = ' '.join([p.get_text().strip() for p in paragraphs])
 83 | 
 84 |         # If no paragraphs found, get all text
 85 |         if not text:
 86 |             text = soup.get_text()
 87 | 
 88 |         # Clean up whitespace
 89 |         text = re.sub(r'\s+', ' ', text).strip()
 90 | 
 91 |         # Extract and resolve links
 92 |         links = [urljoin(url, a['href']) for a in soup.find_all('a', href=True)]
 93 | 
 94 |         return {
 95 |             "url": url,
 96 |             "title": title,
 97 |             "content": text[:2400],  # Limit to first 2400 characters
 98 |             "links": links[:10]  # Limit to first 10 links
 99 |         }
100 | 
def scrape_multiple_pages(urls, max_workers=5):
    """Scrape several URLs concurrently with one shared ``WebScraper``.

    Args:
        urls: Iterable of page URLs to fetch.
        max_workers: Size of the thread pool.

    Returns:
        A dict mapping each successfully scraped URL to the dict returned by
        ``WebScraper.scrape_page``. URLs that failed are logged and omitted.
    """
    shared_scraper = WebScraper()
    scraped = {}

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = {}
        for target in urls:
            pending[pool.submit(shared_scraper.scrape_page, target)] = target

        for finished in as_completed(pending):
            source_url = pending[finished]
            try:
                page = finished.result()
            except Exception as exc:
                logger.error(f"{source_url} generated an exception: {exc}")
                continue

            if not page:
                logger.warning(f"Failed to scrape: {source_url}")
                continue

            scraped[source_url] = page
            logger.info(f"Successfully scraped: {source_url}")

    return scraped
120 | 
# Entry point used by the rest of the application
def get_web_content(urls):
    """Scrape ``urls`` and return a dict mapping each scraped URL to its text content."""
    content_by_url = {}
    for page_url, page in scrape_multiple_pages(urls).items():
        if page:
            content_by_url[page_url] = page['content']
    return content_by_url
125 | 
# Standalone can_fetch function
def can_fetch(url):
    """Return whether ``url`` may be scraped.

    robots.txt is intentionally ignored, so every URL is allowed. The
    disabled implementation read ``<scheme>://<host>/robots.txt`` with
    ``RobotFileParser`` and asked it whether user agent ``"*"`` may fetch
    ``url``.
    """
    return True
138 | 
if __name__ == "__main__":
    # Manual smoke test: scrape a few public pages and show a preview of each.
    demo_urls = [
        "https://en.wikipedia.org/wiki/Web_scraping",
        "https://example.com",
        "https://www.python.org"
    ]
    for page_url, page_text in get_web_content(demo_urls).items():
        print(f"Content from {page_url}:")
        print(page_text[:500])  # Preview only the first 500 characters
        print("\n---\n")
150 | 


--------------------------------------------------------------------------------
/llm_wrapper.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from llama_cpp import Llama
  3 | import requests
  4 | import json
  5 | from llm_config import get_llm_config
  6 | from openai import OpenAI
  7 | from anthropic import Anthropic
  8 | 
  9 | class LLMWrapper:
 10 |     def __init__(self):
 11 |         self.llm_config = get_llm_config()
 12 |         self.llm_type = self.llm_config.get('llm_type', 'llama_cpp')
 13 |         
 14 |         if self.llm_type == 'llama_cpp':
 15 |             self.llm = self._initialize_llama_cpp()
 16 |         elif self.llm_type == 'ollama':
 17 |             self.base_url = self.llm_config.get('base_url', 'http://localhost:11434')
 18 |             self.model_name = self.llm_config.get('model_name', 'your_model_name')
 19 |         elif self.llm_type == 'openai':
 20 |             self._initialize_openai()
 21 |         elif self.llm_type == 'anthropic':
 22 |             self._initialize_anthropic()
 23 |         else:
 24 |             raise ValueError(f"Unsupported LLM type: {self.llm_type}")
 25 | 
 26 |     def _initialize_llama_cpp(self):
 27 |         return Llama(
 28 |             model_path=self.llm_config.get('model_path'),
 29 |             n_ctx=self.llm_config.get('n_ctx', 55000),
 30 |             n_gpu_layers=self.llm_config.get('n_gpu_layers', 0),
 31 |             n_threads=self.llm_config.get('n_threads', 8),
 32 |             verbose=False
 33 |         )
 34 | 
 35 |     def _initialize_openai(self):
 36 |         api_key = os.getenv('OPENAI_API_KEY') or self.llm_config.get('api_key')
 37 |         if not api_key:
 38 |             raise ValueError("OpenAI API key not found. Set OPENAI_API_KEY environment variable.")
 39 |         
 40 |         base_url = self.llm_config.get('base_url')
 41 |         model_name = self.llm_config.get('model_name')
 42 |         
 43 |         if not model_name:
 44 |             raise ValueError("OpenAI model name not specified in config")
 45 |             
 46 |         client_kwargs = {'api_key': api_key}
 47 |         if base_url:
 48 |             client_kwargs['base_url'] = base_url
 49 |             
 50 |         self.client = OpenAI(**client_kwargs)
 51 |         self.model_name = model_name
 52 | 
 53 |     def _initialize_anthropic(self):
 54 |         api_key = os.getenv('ANTHROPIC_API_KEY') or self.llm_config.get('api_key')
 55 |         if not api_key:
 56 |             raise ValueError("Anthropic API key not found. Set ANTHROPIC_API_KEY environment variable.")
 57 |             
 58 |         model_name = self.llm_config.get('model_name')
 59 |         if not model_name:
 60 |             raise ValueError("Anthropic model name not specified in config")
 61 |             
 62 |         self.client = Anthropic(api_key=api_key)
 63 |         self.model_name = model_name
 64 | 
 65 |     def generate(self, prompt, **kwargs):
 66 |         if self.llm_type == 'llama_cpp':
 67 |             llama_kwargs = self._prepare_llama_kwargs(kwargs)
 68 |             response = self.llm(prompt, **llama_kwargs)
 69 |             return response['choices'][0]['text'].strip()
 70 |         elif self.llm_type == 'ollama':
 71 |             return self._ollama_generate(prompt, **kwargs)
 72 |         elif self.llm_type == 'openai':
 73 |             return self._openai_generate(prompt, **kwargs)
 74 |         elif self.llm_type == 'anthropic':
 75 |             return self._anthropic_generate(prompt, **kwargs)
 76 |         else:
 77 |             raise ValueError(f"Unsupported LLM type: {self.llm_type}")
 78 | 
 79 |     def _ollama_generate(self, prompt, **kwargs):
 80 |         url = f"{self.base_url}/api/generate"
 81 |         data = {
 82 |             'model': self.model_name,
 83 |             'prompt': prompt,
 84 |             'options': {
 85 |                 'temperature': kwargs.get('temperature', self.llm_config.get('temperature', 0.7)),
 86 |                 'top_p': kwargs.get('top_p', self.llm_config.get('top_p', 0.9)),
 87 |                 'stop': kwargs.get('stop', self.llm_config.get('stop', [])),
 88 |                 'num_predict': kwargs.get('max_tokens', self.llm_config.get('max_tokens', 55000)),
 89 |                 'num_ctx': self.llm_config.get('n_ctx', 55000)
 90 |             }
 91 |         }
 92 |         response = requests.post(url, json=data, stream=True)
 93 |         if response.status_code != 200:
 94 |             raise Exception(f"Ollama API request failed with status {response.status_code}: {response.text}")
 95 |         text = ''.join(json.loads(line)['response'] for line in response.iter_lines() if line)
 96 |         return text.strip()
 97 | 
 98 |     def _openai_generate(self, prompt, **kwargs):
 99 |         try:
100 |             response = self.client.chat.completions.create(
101 |                 model=self.model_name,
102 |                 messages=[{"role": "user", "content": prompt}],
103 |                 temperature=kwargs.get('temperature', self.llm_config.get('temperature', 0.7)),
104 |                 top_p=kwargs.get('top_p', self.llm_config.get('top_p', 0.9)),
105 |                 max_tokens=kwargs.get('max_tokens', self.llm_config.get('max_tokens', 4096)),
106 |                 stop=kwargs.get('stop', self.llm_config.get('stop', [])),
107 |                 presence_penalty=self.llm_config.get('presence_penalty', 0),
108 |                 frequency_penalty=self.llm_config.get('frequency_penalty', 0)
109 |             )
110 |             return response.choices[0].message.content.strip()
111 |         except Exception as e:
112 |             raise Exception(f"OpenAI API request failed: {str(e)}")
113 | 
114 |     def _anthropic_generate(self, prompt, **kwargs):
115 |         try:
116 |             response = self.client.messages.create(
117 |                 model=self.model_name,
118 |                 max_tokens=kwargs.get('max_tokens', self.llm_config.get('max_tokens', 4096)),
119 |                 temperature=kwargs.get('temperature', self.llm_config.get('temperature', 0.7)),
120 |                 top_p=kwargs.get('top_p', self.llm_config.get('top_p', 0.9)),
121 |                 messages=[{
122 |                     "role": "user",
123 |                     "content": prompt
124 |                 }]
125 |             )
126 |             return response.content[0].text.strip()
127 |         except Exception as e:
128 |             raise Exception(f"Anthropic API request failed: {str(e)}")
129 | 
130 |     def _cleanup(self):
131 |         """Force terminate any running LLM processes"""
132 |         if self.llm_type == 'ollama':
133 |             try:
134 |                 # Force terminate Ollama process
135 |                 requests.post(f"{self.base_url}/api/terminate")
136 |             except:
137 |                 pass
138 | 
139 |             try:
140 |                 # Also try to terminate via subprocess if needed
141 |                 import subprocess
142 |                 subprocess.run(['pkill', '-f', 'ollama'], capture_output=True)
143 |             except:
144 |                 pass
145 | 
146 |     def _prepare_llama_kwargs(self, kwargs):
147 |         llama_kwargs = {
148 |             'max_tokens': kwargs.get('max_tokens', self.llm_config.get('max_tokens', 55000)),
149 |             'temperature': kwargs.get('temperature', self.llm_config.get('temperature', 0.7)),
150 |             'top_p': kwargs.get('top_p', self.llm_config.get('top_p', 0.9)),
151 |             'stop': kwargs.get('stop', self.llm_config.get('stop', [])),
152 |             'echo': False,
153 |         }
154 |         return llama_kwargs
155 | 


--------------------------------------------------------------------------------
/strategic_analysis_parser.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Dict, Optional, Union
  2 | import re
  3 | import logging
  4 | from dataclasses import dataclass
  5 | from datetime import datetime
  6 | 
  7 | @dataclass
  8 | class ResearchFocus:
  9 |     """Represents a specific area of research focus"""
 10 |     area: str
 11 |     priority: int
 12 |     source_query: str = ""
 13 |     timestamp: str = ""
 14 |     search_queries: List[str] = None
 15 | 
 16 |     def __post_init__(self):
 17 |         if not self.timestamp:
 18 |             self.timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 19 |         if self.search_queries is None:
 20 |             self.search_queries = []
 21 | 
 22 | @dataclass
 23 | class AnalysisResult:
 24 |     """Contains the complete analysis result"""
 25 |     original_question: str
 26 |     focus_areas: List[ResearchFocus]
 27 |     raw_response: str
 28 |     timestamp: str = ""
 29 |     confidence_score: float = 0.0
 30 | 
 31 |     def __post_init__(self):
 32 |         if not self.timestamp:
 33 |             self.timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 34 | 
# Module-level logger; StrategicAnalysisParser obtains the same logger via
# logging.getLogger(__name__) in its constructor.
logger = logging.getLogger(__name__)
 37 | 
 38 | class StrategicAnalysisParser:
 39 |     """Enhanced parser with improved pattern matching and validation"""
 40 |     def __init__(self):
 41 |         self.patterns = {
 42 |             'original_question': [
 43 |                 r"(?i)original question analysis:\s*(.*?)(?=research gap|$)",
 44 |                 r"(?i)original query:\s*(.*?)(?=research gap|$)",
 45 |                 r"(?i)research question:\s*(.*?)(?=research gap|$)",
 46 |                 r"(?i)topic analysis:\s*(.*?)(?=research gap|$)"
 47 |             ],
 48 |             'research_gaps': [
 49 |                 r"(?i)research gaps?:\s*",
 50 |                 r"(?i)gaps identified:\s*",
 51 |                 r"(?i)areas for research:\s*",
 52 |                 r"(?i)investigation areas:\s*"
 53 |             ],
 54 |             'priority': [
 55 |                 r"(?i)priority:\s*(\d+)",
 56 |                 r"(?i)priority level:\s*(\d+)",
 57 |                 r"(?i)\(priority:\s*(\d+)\)",
 58 |                 r"(?i)importance:\s*(\d+)"
 59 |             ]
 60 |         }
 61 |         self.logger = logging.getLogger(__name__)
 62 | 
 63 |     def parse_analysis(self, llm_response: str) -> Optional[AnalysisResult]:
 64 |         """Main parsing method with improved validation"""
 65 |         try:
 66 |             # Clean and normalize the response
 67 |             cleaned_response = self._clean_text(llm_response)
 68 | 
 69 |             # Extract original question with validation
 70 |             original_question = self._extract_original_question(cleaned_response)
 71 |             if not original_question:
 72 |                 self.logger.warning("Failed to extract original question")
 73 |                 original_question = "Original question extraction failed"
 74 | 
 75 |             # Extract and validate research areas
 76 |             focus_areas = self._extract_research_areas(cleaned_response)
 77 |             focus_areas = self._normalize_focus_areas(focus_areas)
 78 | 
 79 |             # Calculate confidence score
 80 |             confidence_score = self._calculate_confidence_score(original_question, focus_areas)
 81 | 
 82 |             return AnalysisResult(
 83 |                 original_question=original_question,
 84 |                 focus_areas=focus_areas,
 85 |                 raw_response=llm_response,
 86 |                 confidence_score=confidence_score
 87 |             )
 88 | 
 89 |         except Exception as e:
 90 |             self.logger.error(f"Error in parse_analysis: {str(e)}")
 91 |             return None
 92 | 
 93 |     def _clean_text(self, text: str) -> str:
 94 |         """Clean and normalize text for parsing"""
 95 |         text = re.sub(r'\n{3,}', '\n\n', text)
 96 |         text = re.sub(r'\s{2,}', ' ', text)
 97 |         text = re.sub(r'(\d+\))', r'\1.', text)
 98 |         return text.strip()
 99 | 
100 |     def _extract_original_question(self, text: str) -> str:
101 |         """Extract original question with improved matching"""
102 |         for pattern in self.patterns['original_question']:
103 |             match = re.search(pattern, text, re.DOTALL)
104 |             if match:
105 |                 return self._clean_text(match.group(1))
106 |         return ""
107 | 
108 |     def _extract_research_areas(self, text: str) -> List[ResearchFocus]:
109 |         """Extract research areas with enhanced validation"""
110 |         areas = []
111 |         for pattern in self.patterns['research_gaps']:
112 |             gap_match = re.search(pattern, text)
113 |             if gap_match:
114 |                 sections = re.split(r'\n\s*\d+[\.)]\s+', text[gap_match.end():])
115 |                 sections = [s for s in sections if s.strip()]
116 | 
117 |                 for section in sections:
118 |                     focus = self._parse_research_focus(section)
119 |                     if focus and self._is_valid_focus(focus):
120 |                         areas.append(focus)
121 |                 break
122 |         return areas
123 | 
124 |     def _parse_research_focus(self, text: str) -> Optional[ResearchFocus]:
125 |         """Parse research focus with improved validation without reasoning."""
126 |         try:
127 |             # Extract area
128 |             area = text.split('\n')[0].strip()
129 | 
130 |             # Extract and validate priority
131 |             priority = self._extract_priority(text)
132 | 
133 |             # Return ResearchFocus without reasoning
134 |             return ResearchFocus(
135 |                 area=area,
136 |                 priority=priority
137 |             )
138 | 
139 |         except Exception as e:
140 |             self.logger.error(f"Error parsing research focus: {str(e)}")
141 |             return None
142 | 
143 |     def _extract_priority(self, text: str) -> int:
144 |         """Extract priority with validation"""
145 |         for pattern in self.patterns['priority']:
146 |             priority_match = re.search(pattern, text)
147 |             if priority_match:
148 |                 try:
149 |                     priority = int(priority_match.group(1))
150 |                     return max(1, min(5, priority))
151 |                 except ValueError:
152 |                     continue
153 |         return 3  # Default priority
154 | 
155 |     def _is_valid_focus(self, focus: ResearchFocus) -> bool:
156 |         """Validate research focus completeness and quality"""
157 |         if not focus.area:  # Only check if area exists and isn't empty
158 |             return False
159 |         if focus.priority < 1 or focus.priority > 5:
160 |             return False
161 |         return True
162 | 
163 |     def _normalize_focus_areas(self, areas: List[ResearchFocus]) -> List[ResearchFocus]:
164 |         """Normalize and validate focus areas"""
165 |         normalized = []
166 |         for area in areas:
167 |             if not area.area.strip():
168 |                 continue
169 | 
170 |             area.priority = max(1, min(5, area.priority))
171 | 
172 |             if self._is_valid_focus(area):
173 |                 normalized.append(area)
174 | 
175 |         # Sort by priority (highest first) but don't add any filler areas
176 |         normalized.sort(key=lambda x: x.priority, reverse=True)
177 | 
178 |         return normalized
179 | 
180 |     def _calculate_confidence_score(self, question: str, areas: List[ResearchFocus]) -> float:
181 |         """Calculate confidence score for analysis quality"""
182 |         score = 0.0
183 | 
184 |         # Question quality (0.3)
185 |         if question and len(question.split()) >= 3:
186 |             score += 0.3
187 | 
188 |         # Areas quality (0.7)
189 |         if areas:
190 |             # Valid areas ratio (0.35) - now based on proportion that are valid vs total
191 |             num_areas = len(areas)
192 |             if num_areas > 0:  # Avoid division by zero
193 |                 valid_areas = sum(1 for a in areas if self._is_valid_focus(a))
194 |                 score += 0.35 * (valid_areas / num_areas)
195 | 
196 |             # Priority distribution (0.35) - now based on having different priorities
197 |             if num_areas > 0:  # Avoid division by zero
198 |                 unique_priorities = len(set(a.priority for a in areas))
199 |                 score += 0.35 * (unique_priorities / num_areas)
200 | 
201 |         return round(score, 2)
202 | 
203 |     def format_analysis_result(self, result: AnalysisResult) -> str:
204 |         """Format analysis result for display without reasoning."""
205 |         formatted = [
206 |             "Strategic Analysis Result",
207 |             "=" * 80,
208 |             f"\nOriginal Question Analysis:\n{result.original_question}\n",
209 |             f"Analysis Confidence Score: {result.confidence_score}",
210 |             "\nResearch Focus Areas:"
211 |         ]
212 | 
213 |         for i, focus in enumerate(result.focus_areas, 1):
214 |             formatted.extend([
215 |                 f"\n{i}. {focus.area}",
216 |                 f"   Priority: {focus.priority}"
217 |             ])
218 | 
219 |         return "\n".join(formatted)
220 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Automated-AI-Web-Researcher-Ollama
  2 | 
  3 | ## Description
  4 | Automated-AI-Web-Researcher is an innovative research assistant that leverages locally run large language models through Ollama to conduct thorough, automated online research on any given topic or question. Unlike traditional LLM interactions, this tool actually performs structured research by breaking down queries into focused research areas, systematically investigating each area via web searching and scraping relevant websites, and compiling its findings. The findings are automatically saved into a text document with all the content found and links to the sources. Whenever you want it to stop its research, you can input a command, which will terminate the research. The LLM will then review all of the content it found and provide a comprehensive final summary of your original topic or question. Afterward, you can ask the LLM questions about its research findings.
  5 | 
  6 | ## Project Demonstration
  7 | [![My Project Demo](https://img.youtube.com/vi/hS7Q1B8N1mQ/0.jpg)](https://youtu.be/hS7Q1B8N1mQ "My Project Demo")
  8 | 
  9 | Click the image above to watch the demonstration of my project.
 10 | 
 11 | ## Here's How It Works:
 12 | 1. You provide a research query (e.g., "What year will the global population begin to decrease rather than increase according to research?").
 13 | 2. The LLM analyzes your query and generates 5 specific research focus areas, each with assigned priorities based on relevance to the topic or question.
 14 | 3. Starting with the highest priority area, the LLM:
 15 |     - Formulates targeted search queries
 16 |     - Performs web searches
 17 |     - Analyzes search results, selecting the most relevant web pages
 18 |     - Scrapes and extracts relevant information from the selected web pages
 19 |     - Documents all content found during the research session into a research text file, including links to the websites that the content was retrieved from
 20 | 4. After investigating all focus areas, the LLM generates new focus areas based on the information found and repeats its research cycle, often discovering new relevant focus areas based on previous findings, leading to interesting and novel research focuses in some cases.
 21 | 5. You can let it research as long as you like, with the ability to input a quit command at any time. This will stop the research and cause the LLM to review all the content collected so far in full, generating a comprehensive summary in response to your original query or topic.
 22 | 6. The LLM will then enter a conversation mode where you can ask specific questions about the research findings if desired.
 23 | 
 24 | The key distinction is that this isn't just a chatbot—it's an automated research assistant that methodically investigates topics and maintains a documented research trail, all from a single question or topic of your choosing. Depending on your system and model, it can perform over a hundred searches and content retrievals in a relatively short amount of time. You can leave it running and return to a full text document with over a hundred pieces of content from relevant websites and then have it summarize the findings, after which you can ask it questions about what it found.
 25 | 
 26 | ## Features
 27 | - Automated research planning with prioritized focus areas
 28 | - Systematic web searching and content analysis
 29 | - All research content and source URLs saved into a detailed text document
 30 | - Research summary generation
 31 | - Post-research Q&A capability about findings
 32 | - Self-improving search mechanism
 33 | - Rich console output with status indicators
 34 | - Comprehensive answer synthesis using web-sourced information
 35 | - Research conversation mode for exploring findings
 36 | 
 37 | ## Installation
 38 | **Note:** To use on Windows, follow the instructions on the [/feature/windows-support](https://github.com/TheBlewish/Automated-AI-Web-Researcher-Ollama/tree/feature/windows-support) branch. For Linux and macOS, use this main branch and follow the steps below:
 39 | 
 40 | 1. **Clone the repository:**
 41 | 
 42 |     ```sh
 43 |     git clone https://github.com/TheBlewish/Automated-AI-Web-Researcher-Ollama
 44 |     cd Automated-AI-Web-Researcher-Ollama
 45 |     ```
 46 | 
 47 | 2. **Create and activate a virtual environment:**
 48 | 
 49 |     ```sh
 50 |     python -m venv venv
 51 |     source venv/bin/activate
 52 |     ```
 53 | 
 54 | 3. **Install dependencies:**
 55 | 
 56 |     ```sh
 57 |     pip install -r requirements.txt
 58 |     ```
 59 | 
 60 | 4. **Install and configure Ollama:**
 61 | 
 62 |     Install Ollama following the instructions at [https://ollama.ai](https://ollama.ai).
 63 | 
 64 |     Choose the model you want to use, preferably one with the context length required for lots of searches (`phi3:3.8b-mini-128k-instruct` or `phi3:14b-medium-128k-instruct` are recommended).
 65 | 
 66 | 5. Go to the llm_config.py file which should have an ollama section that looks like this:
 67 | 
 68 | ```python
 69 | LLM_CONFIG_OLLAMA = {
 70 |     "llm_type": "ollama",
 71 |     "base_url": "http://localhost:11434",  # default Ollama server URL
 72 |     "model_name": "custom-phi3-32k-Q4_K_M",  # Replace with your Ollama model name
 73 |     "temperature": 0.7,
 74 |     "top_p": 0.9,
 75 |     "n_ctx": 55000,
 76 |     "stop": ["User:", "\n\n"]
 77 | }
 78 | ```
 79 | Then replace the value of "model_name" (to the left of the comment that says "Replace with your Ollama model name") with the name of the model you have set up in Ollama for use with the program. You can also change 'n_ctx' to set the desired context size.
 80 |    
 81 | 
 82 | ## Usage
 83 | 1. **Start Ollama:**
 84 | 
 85 |     ```sh
 86 |     ollama serve
 87 |     ```
 88 | 
 89 | 2. **Run the researcher:**
 90 | 
 91 |     ```sh
 92 |     python Web-LLM.py
 93 |     ```
 94 | 
 95 | 3. **Start a research session:**
 96 |     - Type `@` followed by your research query.
 97 |     - Press `CTRL+D` to submit.
 98 |     - Example: `@What year is the global population projected to start declining?`
 99 | 
100 | 4. **During research, you can use the following commands by typing the associated letter and submitting with `CTRL+D`:**
101 |     - Use `s` to show status.
102 |     - Use `f` to show the current focus.
103 |     - Use `p` to pause and assess research progress, which will give you an assessment from the LLM after reviewing the entire research content to determine whether it can answer your query with the content collected so far. It will then wait for you to input one of two commands: `c` to continue with the research or `q` to terminate it, resulting in a summary as if you had terminated it without using the pause feature.
104 |     - Use `q` to quit research.
105 | 
106 | 5. **After the research completes:**
107 |     - Wait for the summary to be generated and review the LLM's findings.
108 |     - Enter conversation mode to ask specific questions about its findings.
109 |     - Access the detailed research content found, available in a research session text file which will be located in the program's directory. This includes:
110 |         - All retrieved content
111 |         - Source URLs for all of the information
112 |         - Focus areas investigated
113 |         - Generated summary
114 | 
115 | ## Configuration
116 | The LLM settings can be modified in `llm_config.py`. You must specify your model name in the configuration for the researcher to function. The default configuration is optimized for research tasks with the specified Phi-3 model.
117 | 
118 | ## Current Status
119 | This is a prototype that demonstrates functional automated research capabilities. While still in development, it successfully performs structured research tasks. It has been tested and works well with the `phi3:3.8b-mini-128k-instruct` model when the context is set as advised previously.
120 | 
121 | ## Dependencies
122 | - Ollama
123 | - Python packages listed in `requirements.txt`
124 | - Recommended models: `phi3:3.8b-mini-128k-instruct` or `phi3:14b-medium-128k-instruct` (with custom context length as specified)
125 | 
126 | ## Contributing
127 | Contributions are welcome! This is a prototype with room for improvements and new features.
128 | 
129 | ## License
130 | This project is licensed under the MIT License—see the [LICENSE](LICENSE) file for details.
131 | 
132 | ## Acknowledgments
133 | - Ollama team for their local LLM runtime
134 | - DuckDuckGo for their search API
135 | 
136 | ## Personal Note
137 | This tool represents an attempt to bridge the gap between simple LLM interactions and genuine research capabilities. By structuring the research process and maintaining documentation, it aims to provide more thorough and verifiable results than traditional LLM conversations. It also represents an attempt to improve on my previous project, 'Web-LLM-Assistant-Llamacpp-Ollama,' which simply gave LLMs the ability to search and scrape websites to answer questions. Unlike its predecessor, I feel this program takes that capability and uses it in a novel and very useful way. As a very new programmer, with this being my second ever program, I feel very good about the result. I hope that it hits the mark!
138 | 
139 | Given how much I have been using it myself, unlike the previous program, which felt more like a novelty than an actual tool, this is actually quite useful and unique—but I am quite biased!
140 | 
141 | Please enjoy! And feel free to submit any suggestions for improvements so that we can make this automated AI researcher even more capable.
142 | 
143 | ## Disclaimer
144 | This project is for educational purposes only. Ensure you comply with the terms of service of all APIs and services used.
145 | 


--------------------------------------------------------------------------------
/llm_response_parser.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from typing import Dict, List, Union, Optional
  3 | import logging
  4 | import json
  5 | from strategic_analysis_parser import StrategicAnalysisParser, AnalysisResult, ResearchFocus
  6 | 
  7 | # Set up logging
  8 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
  9 | logger = logging.getLogger(__name__)
 10 | 
 11 | class UltimateLLMResponseParser:
 12 |     def __init__(self):
 13 |         self.decision_keywords = {
 14 |             'refine': ['refine', 'need more info', 'insufficient', 'unclear', 'more research', 'additional search'],
 15 |             'answer': ['answer', 'sufficient', 'enough info', 'can respond', 'adequate', 'comprehensive']
 16 |         }
 17 |         self.section_identifiers = [
 18 |             ('decision', r'(?i)decision\s*:'),
 19 |             ('reasoning', r'(?i)reasoning\s*:'),
 20 |             ('selected_results', r'(?i)selected results\s*:'),
 21 |             ('response', r'(?i)response\s*:')
 22 |         ]
 23 |         # Initialize strategic analysis parser
 24 |         self.strategic_parser = StrategicAnalysisParser()
 25 | 
 26 |     def parse_llm_response(self, response: str, mode: str = 'search') -> Dict[str, Union[str, List[int], AnalysisResult]]:
 27 |         """
 28 |         Parse LLM response based on mode
 29 | 
 30 |         Args:
 31 |             response (str): The LLM's response text
 32 |             mode (str): 'search' for web search, 'research' for strategic analysis
 33 | 
 34 |         Returns:
 35 |             Dict containing parsed response
 36 |         """
 37 |         logger.info(f"Starting to parse LLM response in {mode} mode")
 38 | 
 39 |         if mode == 'research':
 40 |             return self._parse_research_response(response)
 41 | 
 42 |         # Original search mode parsing
 43 |         result = {
 44 |             'decision': None,
 45 |             'reasoning': None,
 46 |             'selected_results': [],
 47 |             'response': None
 48 |         }
 49 | 
 50 |         parsing_strategies = [
 51 |             self._parse_structured_response,
 52 |             self._parse_json_response,
 53 |             self._parse_unstructured_response,
 54 |             self._parse_implicit_response
 55 |         ]
 56 | 
 57 |         for strategy in parsing_strategies:
 58 |             try:
 59 |                 parsed_result = strategy(response)
 60 |                 if self._is_valid_result(parsed_result):
 61 |                     result.update(parsed_result)
 62 |                     logger.info(f"Successfully parsed using strategy: {strategy.__name__}")
 63 |                     break
 64 |             except Exception as e:
 65 |                 logger.warning(f"Error in parsing strategy {strategy.__name__}: {str(e)}")
 66 | 
 67 |         if not self._is_valid_result(result):
 68 |             logger.warning("All parsing strategies failed. Using fallback parsing.")
 69 |             result = self._fallback_parsing(response)
 70 | 
 71 |         result = self._post_process_result(result)
 72 | 
 73 |         logger.info("Finished parsing LLM response")
 74 |         return result
 75 | 
 76 |     def _parse_research_response(self, response: str) -> Dict[str, Union[str, AnalysisResult]]:
 77 |         """Handle research mode specific parsing"""
 78 |         try:
 79 |             analysis_result = self.strategic_parser.parse_analysis(response)
 80 |             if analysis_result:
 81 |                 return {
 82 |                     'mode': 'research',
 83 |                     'analysis_result': analysis_result,
 84 |                     'error': None
 85 |                 }
 86 |             else:
 87 |                 logger.error("Failed to parse strategic analysis")
 88 |                 return {
 89 |                     'mode': 'research',
 90 |                     'analysis_result': None,
 91 |                     'error': 'Failed to parse strategic analysis'
 92 |                 }
 93 |         except Exception as e:
 94 |             logger.error(f"Error in research response parsing: {str(e)}")
 95 |             return {
 96 |                 'mode': 'research',
 97 |                 'analysis_result': None,
 98 |                 'error': str(e)
 99 |             }
100 | 
101 |     def parse_search_query(self, query_response: str) -> Dict[str, str]:
102 |         """Parse search query formulation response"""
103 |         try:
104 |             lines = query_response.strip().split('\n')
105 |             result = {
106 |                 'query': '',
107 |                 'time_range': 'none'
108 |             }
109 | 
110 |             for line in lines:
111 |                 if ':' in line:
112 |                     key, value = line.split(':', 1)
113 |                     key = key.strip().lower()
114 |                     value = value.strip()
115 | 
116 |                     if 'query' in key:
117 |                         result['query'] = self._clean_query(value)
118 |                     elif 'time' in key or 'range' in key:
119 |                         result['time_range'] = self._validate_time_range(value)
120 | 
121 |             return result
122 |         except Exception as e:
123 |             logger.error(f"Error parsing search query: {str(e)}")
124 |             return {'query': '', 'time_range': 'none'}
125 | 
126 |     def _parse_structured_response(self, response: str) -> Dict[str, Union[str, List[int]]]:
127 |         result = {}
128 |         for key, pattern in self.section_identifiers:
129 |             match = re.search(f'{pattern}(.*?)(?={"|".join([p for k, p in self.section_identifiers if k != key])}|$)',
130 |                             response, re.IGNORECASE | re.DOTALL)
131 |             if match:
132 |                 result[key] = match.group(1).strip()
133 | 
134 |         if 'selected_results' in result:
135 |             result['selected_results'] = self._extract_numbers(result['selected_results'])
136 | 
137 |         return result
138 | 
139 |     def _parse_json_response(self, response: str) -> Dict[str, Union[str, List[int]]]:
140 |         try:
141 |             json_match = re.search(r'\{.*\}', response, re.DOTALL)
142 |             if json_match:
143 |                 json_str = json_match.group(0)
144 |                 parsed_json = json.loads(json_str)
145 |                 return {k: v for k, v in parsed_json.items()
146 |                        if k in ['decision', 'reasoning', 'selected_results', 'response']}
147 |         except json.JSONDecodeError:
148 |             pass
149 |         return {}
150 | 
151 |     def _parse_unstructured_response(self, response: str) -> Dict[str, Union[str, List[int]]]:
152 |         result = {}
153 |         lines = response.split('\n')
154 |         current_section = None
155 | 
156 |         for line in lines:
157 |             section_match = re.match(r'(.+?)[:.-](.+)', line)
158 |             if section_match:
159 |                 key = self._match_section_to_key(section_match.group(1))
160 |                 if key:
161 |                     current_section = key
162 |                     result[key] = section_match.group(2).strip()
163 |             elif current_section:
164 |                 result[current_section] += ' ' + line.strip()
165 | 
166 |         if 'selected_results' in result:
167 |             result['selected_results'] = self._extract_numbers(result['selected_results'])
168 | 
169 |         return result
170 | 
171 |     def _parse_implicit_response(self, response: str) -> Dict[str, Union[str, List[int]]]:
172 |         result = {}
173 | 
174 |         decision = self._infer_decision(response)
175 |         if decision:
176 |             result['decision'] = decision
177 | 
178 |         numbers = self._extract_numbers(response)
179 |         if numbers:
180 |             result['selected_results'] = numbers
181 | 
182 |         if not result:
183 |             result['response'] = response.strip()
184 | 
185 |         return result
186 | 
187 |     def _fallback_parsing(self, response: str) -> Dict[str, Union[str, List[int]]]:
188 |         return {
189 |             'decision': self._infer_decision(response),
190 |             'reasoning': None,
191 |             'selected_results': self._extract_numbers(response),
192 |             'response': response.strip()
193 |         }
194 | 
195 |     def _post_process_result(self, result: Dict[str, Union[str, List[int]]]) -> Dict[str, Union[str, List[int]]]:
196 |         if result['decision'] not in ['refine', 'answer']:
197 |             result['decision'] = self._infer_decision(str(result))
198 | 
199 |         if not isinstance(result['selected_results'], list):
200 |             result['selected_results'] = self._extract_numbers(str(result['selected_results']))
201 | 
202 |         result['selected_results'] = result['selected_results'][:2]
203 | 
204 |         if not result['reasoning']:
205 |             result['reasoning'] = f"Based on the {'presence' if result['selected_results'] else 'absence'} of selected results and the overall content."
206 | 
207 |         if not result['response']:
208 |             result['response'] = result.get('reasoning', 'No clear response found.')
209 | 
210 |         return result
211 | 
212 |     def _match_section_to_key(self, section: str) -> Optional[str]:
213 |         for key, pattern in self.section_identifiers:
214 |             if re.search(pattern, section, re.IGNORECASE):
215 |                 return key
216 |         return None
217 | 
218 |     def _extract_numbers(self, text: str) -> List[int]:
219 |         return [int(num) for num in re.findall(r'\b(?:10|[1-9])\b', text)]
220 | 
221 |     def _infer_decision(self, text: str) -> str:
222 |         text = text.lower()
223 |         refine_score = sum(text.count(keyword) for keyword in self.decision_keywords['refine'])
224 |         answer_score = sum(text.count(keyword) for keyword in self.decision_keywords['answer'])
225 |         return 'refine' if refine_score > answer_score else 'answer'
226 | 
227 |     def _is_valid_result(self, result: Dict[str, Union[str, List[int]]]) -> bool:
228 |         return bool(result.get('decision') or result.get('response') or result.get('selected_results'))
229 | 
230 |     def _clean_query(self, query: str) -> str:
231 |         """Clean and validate search query"""
232 |         query = re.sub(r'["\'\[\]]', '', query)
233 |         query = re.sub(r'\s+', ' ', query)
234 |         return query.strip()[:100]
235 | 
236 |     def _validate_time_range(self, time_range: str) -> str:
237 |         """Validate time range value"""
238 |         valid_ranges = ['d', 'w', 'm', 'y', 'none']
239 |         time_range = time_range.lower()
240 |         return time_range if time_range in valid_ranges else 'none'
241 | 


--------------------------------------------------------------------------------
/Web-LLM.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import os
  3 | from colorama import init, Fore, Style
  4 | import logging
  5 | import time
  6 | from io import StringIO
  7 | from Self_Improving_Search import EnhancedSelfImprovingSearch
  8 | from llm_config import get_llm_config
  9 | from llm_response_parser import UltimateLLMResponseParser
 10 | from llm_wrapper import LLMWrapper
 11 | from strategic_analysis_parser import StrategicAnalysisParser
 12 | from research_manager import ResearchManager
 13 | 
 14 | # Initialize colorama
 15 | if os.name == 'nt':  # Windows-specific initialization
 16 |     init(convert=True, strip=False, wrap=True)
 17 | else:
 18 |     init()
 19 | 
 20 | # Set up logging
 21 | log_directory = 'logs'
 22 | if not os.path.exists(log_directory):
 23 |     os.makedirs(log_directory)
 24 | 
 25 | logger = logging.getLogger(__name__)
 26 | logger.setLevel(logging.INFO)
 27 | log_file = os.path.join(log_directory, 'web_llm.log')
 28 | file_handler = logging.FileHandler(log_file)
 29 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 30 | file_handler.setFormatter(formatter)
 31 | logger.handlers = []
 32 | logger.addHandler(file_handler)
 33 | logger.propagate = False
 34 | 
 35 | # Disable other loggers
 36 | for name in logging.root.manager.loggerDict:
 37 |     if name != __name__:
 38 |         logging.getLogger(name).disabled = True
 39 | 
 40 | class OutputRedirector:
 41 |     def __init__(self, stream=None):
 42 |         self.stream = stream or StringIO()
 43 |         self.original_stdout = sys.stdout
 44 |         self.original_stderr = sys.stderr
 45 | 
 46 |     def __enter__(self):
 47 |         sys.stdout = self.stream
 48 |         sys.stderr = self.stream
 49 |         return self.stream
 50 | 
 51 |     def __exit__(self, exc_type, exc_val, exc_tb):
 52 |         sys.stdout = self.original_stdout
 53 |         sys.stderr = self.original_stderr
 54 | 
 55 | def print_header():
 56 |     print(Fore.CYAN + Style.BRIGHT + """
 57 |     ╔══════════════════════════════════════════════════════════╗
 58 |     ║             🌐 Advanced Research Assistant 🤖             ║
 59 |     ╚══════════════════════════════════════════════════════════╝
 60 |     """ + Style.RESET_ALL)
 61 |     print(Fore.YELLOW + """
 62 |     Welcome to the Advanced Research Assistant!
 63 | 
 64 |     Usage:
 65 |     - Start your research query with '@'
 66 |       Example: "@analyze the impact of AI on healthcare"
 67 | 
 68 |     Press CTRL+D (Linux/Mac) or CTRL+Z (Windows) to submit input.
 69 |     """ + Style.RESET_ALL)
 70 |     
 71 | def get_multiline_input() -> str:
 72 |     """Get multiline input using raw terminal mode for reliable CTRL+D handling"""
 73 |     print(f"{Fore.GREEN}📝 Enter your message (Press CTRL+D to submit):{Style.RESET_ALL}")
 74 |     lines = []
 75 | 
 76 |     import termios
 77 |     import tty
 78 |     import sys
 79 | 
 80 |     # Save original terminal settings
 81 |     fd = sys.stdin.fileno()
 82 |     old_settings = termios.tcgetattr(fd)
 83 | 
 84 |     try:
 85 |         # Set terminal to raw mode
 86 |         tty.setraw(fd)
 87 | 
 88 |         current_line = []
 89 |         while True:
 90 |             # Read one character at a time
 91 |             char = sys.stdin.read(1)
 92 | 
 93 |             # CTRL+D detection
 94 |             if not char or ord(char) == 4:  # EOF or CTRL+D
 95 |                 sys.stdout.write('\n')  # New line for clean display
 96 |                 if current_line:
 97 |                     lines.append(''.join(current_line))
 98 |                 return ' '.join(lines).strip()
 99 | 
100 |             # Handle special characters
101 |             elif ord(char) == 13:  # Enter
102 |                 sys.stdout.write('\n')
103 |                 lines.append(''.join(current_line))
104 |                 current_line = []
105 | 
106 |             elif ord(char) == 127:  # Backspace
107 |                 if current_line:
108 |                     current_line.pop()
109 |                     sys.stdout.write('\b \b')  # Erase character
110 | 
111 |             elif ord(char) == 3:  # CTRL+C
112 |                 sys.stdout.write('\n')
113 |                 return 'q'
114 | 
115 |             # Normal character
116 |             elif 32 <= ord(char) <= 126:  # Printable characters
117 |                 current_line.append(char)
118 |                 sys.stdout.write(char)
119 | 
120 |             # Flush output
121 |             sys.stdout.flush()
122 | 
123 |     finally:
124 |         # Restore terminal settings
125 |         termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
126 |         print()  # New line for clean display
127 | 
def initialize_system():
    """Check the configured LLM backend and build the core components.

    Returns:
        ``(llm_wrapper, parser, search_engine, research_manager)`` on success,
        or four ``None`` values when any step fails (the error is logged and
        printed).
    """
    try:
        print(Fore.YELLOW + "Initializing system..." + Style.RESET_ALL)

        llm_config = get_llm_config()
        backend = llm_config['llm_type']

        if backend == 'ollama':
            # The Ollama server must answer on its base URL before going on.
            import requests
            try:
                reply = requests.get(llm_config['base_url'], timeout=5)
                if reply.status_code != 200:
                    raise ConnectionError("Cannot connect to Ollama server")
            except requests.exceptions.RequestException:
                raise ConnectionError(
                    "\nCannot connect to Ollama server!"
                    "\nPlease ensure:"
                    "\n1. Ollama is installed"
                    "\n2. Ollama server is running (try 'ollama serve')"
                    "\n3. The model specified in llm_config.py is pulled"
                )
        elif backend == 'llama_cpp':
            # A llama.cpp backend needs an existing model file.
            model_path = llm_config.get('model_path')
            model_available = bool(model_path) and os.path.exists(model_path)
            if not model_available:
                raise FileNotFoundError(
                    f"\nLLama.cpp model not found at: {model_path}"
                    "\nPlease ensure model path in llm_config.py is correct"
                )

        # Hide anything printed while the components are being created.
        with OutputRedirector():
            llm_wrapper = LLMWrapper()
            try:
                # Smoke test: the model must return a non-empty reply.
                if not llm_wrapper.generate("Test", max_tokens=10):
                    raise ConnectionError("LLM failed to generate response")
            except Exception as e:
                raise ConnectionError(f"LLM test failed: {str(e)}")

            parser = UltimateLLMResponseParser()
            search_engine = EnhancedSelfImprovingSearch(llm_wrapper, parser)
            research_manager = ResearchManager(llm_wrapper, parser, search_engine)

        print(Fore.GREEN + "System initialized successfully." + Style.RESET_ALL)
        return llm_wrapper, parser, search_engine, research_manager
    except Exception as e:
        logger.error(f"Error initializing system: {str(e)}", exc_info=True)
        print(Fore.RED + f"System initialization failed: {str(e)}" + Style.RESET_ALL)
        return (None,) * 4
175 | 
def handle_research_mode(research_manager, query):
    """Run one research session: start it, serve commands, then summarise.

    While the research manager reports itself active, the commands 's'
    (status), 'f' (current focus) and 'q' (quit) are read from the user.
    Afterwards the research is terminated, its summary printed and, if a
    summary exists, conversation mode is started.
    """
    print(f"{Fore.CYAN}Initiating research mode...{Style.RESET_ALL}")

    try:
        # Start the research
        research_manager.start_research(query)

        submit_key = "CTRL+D" if os.name != 'nt' else "CTRL+Z"
        command_help = (
            f"\n{Fore.YELLOW}Research Running. Available Commands:{Style.RESET_ALL}",
            f"Type command and press {submit_key}:",
            "'s' = Show status",
            "'f' = Show focus",
            "'q' = Quit research",
        )
        for help_line in command_help:
            print(help_line)

        while research_manager.is_active():
            try:
                command = get_multiline_input().strip().lower()

                if command == 'q':
                    break

                if command == 's':
                    print("\n" + research_manager.get_progress())
                    continue

                if command != 'f':
                    # Unknown commands are ignored.
                    continue

                if not research_manager.current_focus:
                    print(f"\n{Fore.YELLOW}No current focus area{Style.RESET_ALL}")
                    continue

                print(f"\n{Fore.CYAN}Current Focus:{Style.RESET_ALL}")
                print(f"Area: {research_manager.current_focus.area}")
                print(f"Priority: {research_manager.current_focus.priority}")
                print(f"Reasoning: {research_manager.current_focus.reasoning}")
            except KeyboardInterrupt:
                break

        # Get final summary first
        summary = research_manager.terminate_research()

        # Ensure research UI is fully cleaned up
        research_manager._cleanup_research_ui()

        # Now in main terminal, show summary
        print(f"\n{Fore.GREEN}Research Summary:{Style.RESET_ALL}")
        print(summary)

        # Only NOW start conversation mode if we have a valid summary
        has_summary = research_manager.research_complete and research_manager.research_summary
        if has_summary:
            time.sleep(0.5)  # Small delay to ensure clean transition
            research_manager.start_conversation_mode()

    except KeyboardInterrupt:
        print(f"\n{Fore.YELLOW}Research interrupted.{Style.RESET_ALL}")
        research_manager.terminate_research()
    except Exception as e:
        print(f"\n{Fore.RED}Research error: {str(e)}{Style.RESET_ALL}")
        research_manager.terminate_research()
232 | 
def main():
    """Run the interactive assistant until the user quits.

    Prints the header, initialises the system and then loops reading input:
    '@<query>' starts research mode, '/<query>' requests search mode,
    'help' re-prints the header and '@quit', 'quit', 'q' or an empty
    submission exit. On the way out the research UI and curses are cleaned
    up and the process is ended with ``os._exit(0)``.
    """
    print_header()
    research_manager = None  # bound up front so the cleanup below can rely on it
    try:
        llm, parser, search_engine, research_manager = initialize_system()
        if not all([llm, parser, search_engine, research_manager]):
            return

        while True:
            try:
                # Get input with improved CTRL+D handling
                user_input = get_multiline_input()

                # Handle immediate CTRL+D (empty input)
                if user_input == "":
                    user_input = "@quit"  # Convert empty CTRL+D to quit command

                user_input = user_input.strip()

                # Check for special quit markers
                if user_input in ["@quit", "quit", "q"]:
                    print(Fore.YELLOW + "\nGoodbye!" + Style.RESET_ALL)
                    break

                if not user_input:
                    continue

                if user_input.lower() == 'help':
                    print_header()
                    continue

                if user_input.startswith('/'):
                    search_query = user_input[1:].strip()
                    # No handle_search_mode is defined or imported in this
                    # module, so look it up instead of letting a NameError
                    # surface as a generic error message.
                    search_handler = globals().get('handle_search_mode')
                    if search_handler is None:
                        print(f"{Fore.RED}Search mode is not available. "
                              f"Please start your query with '@' for research.{Style.RESET_ALL}")
                    else:
                        search_handler(search_engine, search_query)

                elif user_input.startswith('@'):
                    research_query = user_input[1:].strip()
                    handle_research_mode(research_manager, research_query)

                else:
                    print(f"{Fore.RED}Please start with '/' for search or '@' for research.{Style.RESET_ALL}")

            except KeyboardInterrupt:
                print(f"\n{Fore.YELLOW}Exiting program...{Style.RESET_ALL}")
                break

            except Exception as e:
                logger.error(f"Error in main loop: {str(e)}")
                print(f"{Fore.RED}An error occurred: {str(e)}{Style.RESET_ALL}")
                continue

    except KeyboardInterrupt:
        print(f"\n{Fore.YELLOW}Program terminated by user.{Style.RESET_ALL}")

    except Exception as e:
        logger.critical(f"Critical error: {str(e)}")
        print(f"{Fore.RED}Critical error: {str(e)}{Style.RESET_ALL}")

    finally:
        # Ensure proper cleanup on exit. Cleanup is best effort: failures are
        # logged but must never prevent the process from exiting.
        try:
            if research_manager and hasattr(research_manager, 'ui'):
                research_manager.ui.cleanup()
        except Exception as e:
            logger.warning(f"Research UI cleanup failed: {str(e)}")

        try:
            # curses was never imported in this module, so the original call
            # could only fail with a NameError; import it where it is needed.
            import curses
            curses.endwin()
        except Exception as e:
            # ImportError where curses is unavailable, curses.error when
            # curses was never initialised.
            logger.debug(f"curses.endwin() skipped: {str(e)}")

        os._exit(0)
300 | 
# Run the interactive assistant only when this file is executed as a script.
if __name__ == "__main__":
    main()
303 | 


--------------------------------------------------------------------------------
/Self_Improving_Search.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import re
  3 | import os
  4 | from typing import List, Dict, Tuple, Union
  5 | from colorama import Fore, Style
  6 | import logging
  7 | import sys
  8 | from io import StringIO
  9 | from web_scraper import get_web_content, can_fetch
 10 | from llm_config import get_llm_config
 11 | from llm_response_parser import UltimateLLMResponseParser
 12 | from llm_wrapper import LLMWrapper
 13 | from urllib.parse import urlparse
 14 | 
 15 | # Set up logging
 16 | log_directory = 'logs'
 17 | if not os.path.exists(log_directory):
 18 |     os.makedirs(log_directory)
 19 | 
 20 | # Configure logger
 21 | logger = logging.getLogger(__name__)
 22 | logger.setLevel(logging.INFO)
 23 | log_file = os.path.join(log_directory, 'llama_output.log')
 24 | file_handler = logging.FileHandler(log_file)
 25 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 26 | file_handler.setFormatter(formatter)
 27 | logger.handlers = []
 28 | logger.addHandler(file_handler)
 29 | logger.propagate = False
 30 | 
 31 | # Suppress other loggers
 32 | for name in ['root', 'duckduckgo_search', 'requests', 'urllib3']:
 33 |     logging.getLogger(name).setLevel(logging.WARNING)
 34 |     logging.getLogger(name).handlers = []
 35 |     logging.getLogger(name).propagate = False
 36 | 
 37 | class OutputRedirector:
 38 |     def __init__(self, stream=None):
 39 |         self.stream = stream or StringIO()
 40 |         self.original_stdout = sys.stdout
 41 |         self.original_stderr = sys.stderr
 42 | 
 43 |     def __enter__(self):
 44 |         sys.stdout = self.stream
 45 |         sys.stderr = self.stream
 46 |         return self.stream
 47 | 
 48 |     def __exit__(self, exc_type, exc_val, exc_tb):
 49 |         sys.stdout = self.original_stdout
 50 |         sys.stderr = self.original_stderr
 51 | 
 52 | class EnhancedSelfImprovingSearch:
 53 |     def __init__(self, llm: LLMWrapper, parser: UltimateLLMResponseParser, max_attempts: int = 5):
 54 |         self.llm = llm
 55 |         self.parser = parser
 56 |         self.max_attempts = max_attempts
 57 |         self.llm_config = get_llm_config()
 58 | 
 59 |     @staticmethod
 60 |     def initialize_llm():
 61 |         llm_wrapper = LLMWrapper()
 62 |         return llm_wrapper
 63 | 
 64 |     def print_thinking(self):
 65 |         print(Fore.MAGENTA + "🧠 Thinking..." + Style.RESET_ALL)
 66 | 
 67 |     def print_searching(self):
 68 |         print(Fore.MAGENTA + "📝 Searching..." + Style.RESET_ALL)
 69 | 
 70 |     def search_and_improve(self, user_query: str) -> str:
 71 |         attempt = 0
 72 |         while attempt < self.max_attempts:
 73 |             print(f"\n{Fore.CYAN}Search attempt {attempt + 1}:{Style.RESET_ALL}")
 74 |             self.print_searching()
 75 | 
 76 |             try:
 77 |                 formulated_query, time_range = self.formulate_query(user_query, attempt)
 78 | 
 79 |                 print(f"{Fore.YELLOW}Original query: {user_query}{Style.RESET_ALL}")
 80 |                 print(f"{Fore.YELLOW}Formulated query: {formulated_query}{Style.RESET_ALL}")
 81 |                 print(f"{Fore.YELLOW}Time range: {time_range}{Style.RESET_ALL}")
 82 | 
 83 |                 if not formulated_query:
 84 |                     print(f"{Fore.RED}Error: Empty search query. Retrying...{Style.RESET_ALL}")
 85 |                     attempt += 1
 86 |                     continue
 87 | 
 88 |                 search_results = self.perform_search(formulated_query, time_range)
 89 | 
 90 |                 if not search_results:
 91 |                     print(f"{Fore.RED}No results found. Retrying with a different query...{Style.RESET_ALL}")
 92 |                     attempt += 1
 93 |                     continue
 94 | 
 95 |                 self.display_search_results(search_results)
 96 | 
 97 |                 selected_urls = self.select_relevant_pages(search_results, user_query)
 98 | 
 99 |                 if not selected_urls:
100 |                     print(f"{Fore.RED}No relevant URLs found. Retrying...{Style.RESET_ALL}")
101 |                     attempt += 1
102 |                     continue
103 | 
104 |                 print(Fore.MAGENTA + "⚙️ Scraping selected pages..." + Style.RESET_ALL)
105 |                 # Scraping is done without OutputRedirector to ensure messages are visible
106 |                 scraped_content = self.scrape_content(selected_urls)
107 | 
108 |                 if not scraped_content:
109 |                     print(f"{Fore.RED}Failed to scrape content. Retrying...{Style.RESET_ALL}")
110 |                     attempt += 1
111 |                     continue
112 | 
113 |                 self.display_scraped_content(scraped_content)
114 | 
115 |                 self.print_thinking()
116 | 
117 |                 with OutputRedirector() as output:
118 |                     evaluation, decision = self.evaluate_scraped_content(user_query, scraped_content)
119 |                 llm_output = output.getvalue()
120 |                 logger.info(f"LLM Output in evaluate_scraped_content:\n{llm_output}")
121 | 
122 |                 print(f"{Fore.MAGENTA}Evaluation: {evaluation}{Style.RESET_ALL}")
123 |                 print(f"{Fore.MAGENTA}Decision: {decision}{Style.RESET_ALL}")
124 | 
125 |                 if decision == "answer":
126 |                     return self.generate_final_answer(user_query, scraped_content)
127 |                 elif decision == "refine":
128 |                     print(f"{Fore.YELLOW}Refining search...{Style.RESET_ALL}")
129 |                     attempt += 1
130 |                 else:
131 |                     print(f"{Fore.RED}Unexpected decision. Proceeding to answer.{Style.RESET_ALL}")
132 |                     return self.generate_final_answer(user_query, scraped_content)
133 | 
134 |             except Exception as e:
135 |                 print(f"{Fore.RED}An error occurred during search attempt. Check the log file for details.{Style.RESET_ALL}")
136 |                 logger.error(f"An error occurred during search: {str(e)}", exc_info=True)
137 |                 attempt += 1
138 | 
139 |         return self.synthesize_final_answer(user_query)
140 | 
141 |     def evaluate_scraped_content(self, user_query: str, scraped_content: Dict[str, str]) -> Tuple[str, str]:
142 |         user_query_short = user_query[:200]
143 |         prompt = f"""
144 | Evaluate if the following scraped content contains sufficient information to answer the user's question comprehensively:
145 | 
146 | User's question: "{user_query_short}"
147 | 
148 | Scraped Content:
149 | {self.format_scraped_content(scraped_content)}
150 | 
151 | Your task:
152 | 1. Determine if the scraped content provides enough relevant and detailed information to answer the user's question thoroughly.
153 | 2. If the information is sufficient, decide to 'answer'. If more information or clarification is needed, decide to 'refine' the search.
154 | 
155 | Respond using EXACTLY this format:
156 | Evaluation: [Your evaluation of the scraped content]
157 | Decision: [ONLY 'answer' if content is sufficient, or 'refine' if more information is needed]
158 | """
159 |         max_retries = 3
160 |         for attempt in range(max_retries):
161 |             try:
162 |                 response_text = self.llm.generate(prompt, max_tokens=200, stop=None)
163 |                 evaluation, decision = self.parse_evaluation_response(response_text)
164 |                 if decision in ['answer', 'refine']:
165 |                     return evaluation, decision
166 |             except Exception as e:
167 |                 logger.warning(f"Error in evaluate_scraped_content (attempt {attempt + 1}): {str(e)}")
168 | 
169 |         logger.warning("Failed to get a valid decision in evaluate_scraped_content. Defaulting to 'refine'.")
170 |         return "Failed to evaluate content.", "refine"
171 | 
172 |     def parse_evaluation_response(self, response: str) -> Tuple[str, str]:
173 |         evaluation = ""
174 |         decision = ""
175 |         for line in response.strip().split('\n'):
176 |             if line.startswith('Evaluation:'):
177 |                 evaluation = line.split(':', 1)[1].strip()
178 |             elif line.startswith('Decision:'):
179 |                 decision = line.split(':', 1)[1].strip().lower()
180 |         return evaluation, decision
181 | 
182 |     def formulate_query(self, user_query: str, attempt: int) -> Tuple[str, str]:
183 |         user_query_short = user_query[:200]
184 |         prompt = f"""
185 | Based on the following user question, formulate a concise and effective search query:
186 | "{user_query_short}"
187 | Your task:
188 | 1. Create a search query of 2-5 words that will yield relevant results.
189 | 2. Determine if a specific time range is needed for the search.
190 | Time range options:
191 | - 'd': Limit results to the past day. Use for very recent events or rapidly changing information.
192 | - 'w': Limit results to the past week. Use for recent events or topics with frequent updates.
193 | - 'm': Limit results to the past month. Use for relatively recent information or ongoing events.
194 | - 'y': Limit results to the past year. Use for annual events or information that changes yearly.
195 | - 'none': No time limit. Use for historical information or topics not tied to a specific time frame.
196 | Respond in the following format:
197 | Search query: [Your 2-5 word query]
198 | Time range: [d/w/m/y/none]
199 | Do not provide any additional information or explanation.
200 | """
201 |         max_retries = 3
202 |         for retry in range(max_retries):
203 |             with OutputRedirector() as output:
204 |                 response_text = self.llm.generate(prompt, max_tokens=50, stop=None)
205 |             llm_output = output.getvalue()
206 |             logger.info(f"LLM Output in formulate_query:\n{llm_output}")
207 |             query, time_range = self.parse_query_response(response_text)
208 |             if query and time_range:
209 |                 return query, time_range
210 |         return self.fallback_query(user_query), "none"
211 | 
212 |     def parse_query_response(self, response: str) -> Tuple[str, str]:
213 |         query = ""
214 |         time_range = "none"
215 |         for line in response.strip().split('\n'):
216 |             if ":" in line:
217 |                 key, value = line.split(":", 1)
218 |                 key = key.strip().lower()
219 |                 value = value.strip()
220 |                 if "query" in key:
221 |                     query = self.clean_query(value)
222 |                 elif "time" in key or "range" in key:
223 |                     time_range = self.validate_time_range(value)
224 |         return query, time_range
225 | 
226 |     def clean_query(self, query: str) -> str:
227 |         query = re.sub(r'["\'\[\]]', '', query)
228 |         query = re.sub(r'\s+', ' ', query)
229 |         return query.strip()[:100]
230 | 
231 |     def validate_time_range(self, time_range: str) -> str:
232 |         valid_ranges = ['d', 'w', 'm', 'y', 'none']
233 |         time_range = time_range.lower()
234 |         return time_range if time_range in valid_ranges else 'none'
235 | 
236 |     def fallback_query(self, user_query: str) -> str:
237 |         words = user_query.split()
238 |         return " ".join(words[:5])
239 | 
240 |     def perform_search(self, query: str, time_range: str) -> List[Dict]:
241 |         if not query:
242 |             return []
243 | 
244 |         from duckduckgo_search import DDGS
245 |         max_retries = 3
246 |         base_delay = 2  # Base delay in seconds
247 | 
248 |         for retry in range(max_retries):
249 |             try:
250 |                 # Add delay that increases with each retry
251 |                 if retry > 0:
252 |                     delay = base_delay * (2 ** (retry - 1))  # Exponential backoff
253 |                     print(f"{Fore.YELLOW}Rate limit hit. Waiting {delay} seconds before retry {retry + 1}/{max_retries}...{Style.RESET_ALL}")
254 |                     time.sleep(delay)
255 | 
256 |                 with DDGS() as ddgs:
257 |                     try:
258 |                         with OutputRedirector() as output:
259 |                             if time_range and time_range != 'none':
260 |                                 results = list(ddgs.text(query, timelimit=time_range, max_results=10))
261 |                             else:
262 |                                 results = list(ddgs.text(query, max_results=10))
263 |                             
264 |                             ddg_output = output.getvalue()
265 |                             logger.info(f"DDG Output in perform_search:\n{ddg_output}")
266 |                             
267 |                             # If we get here, search was successful
268 |                             return [{'number': i+1, **result} for i, result in enumerate(results)]
269 |                             
270 |                     except Exception as e:
271 |                         if 'Ratelimit' in str(e):
272 |                             if retry == max_retries - 1:
273 |                                 print(f"{Fore.RED}Final rate limit attempt failed: {str(e)}{Style.RESET_ALL}")
274 |                                 return []
275 |                             continue  # Try again with delay
276 |                         else:
277 |                             print(f"{Fore.RED}Search error: {str(e)}{Style.RESET_ALL}")
278 |                             return []
279 | 
280 |             except Exception as e:
281 |                 print(f"{Fore.RED}Outer error: {str(e)}{Style.RESET_ALL}")
282 |                 return []
283 | 
284 |         print(f"{Fore.RED}All retry attempts failed for query: {query}{Style.RESET_ALL}")
285 |         return []
286 | 
287 |     def display_search_results(self, results: List[Dict]) -> None:
288 |         """Display search results with minimal output"""
289 |         try:
290 |             if not results:
291 |                 return
292 | 
293 |             # Only show search success status
294 |             print(f"\nSearch query sent to DuckDuckGo: {self.last_query}")
295 |             print(f"Time range sent to DuckDuckGo: {self.last_time_range}")
296 |             print(f"Number of results: {len(results)}")
297 | 
298 |         except Exception as e:
299 |             logger.error(f"Error displaying search results: {str(e)}")
300 | 
301 |     def select_relevant_pages(self, search_results: List[Dict], user_query: str) -> List[str]:
302 |         prompt = f"""
303 | Given the following search results for the user's question: "{user_query}"
304 | Select the 2 most relevant results to scrape and analyze. Explain your reasoning for each selection.
305 | 
306 | Search Results:
307 | {self.format_results(search_results)}
308 | 
309 | Instructions:
310 | 1. You MUST select exactly 2 result numbers from the search results.
311 | 2. Choose the results that are most likely to contain comprehensive and relevant information to answer the user's question.
312 | 3. Provide a brief reason for each selection.
313 | 
314 | You MUST respond using EXACTLY this format and nothing else:
315 | 
316 | Selected Results: [Two numbers corresponding to the selected results]
317 | Reasoning: [Your reasoning for the selections]
318 | """
319 | 
320 |         max_retries = 3
321 |         for retry in range(max_retries):
322 |             with OutputRedirector() as output:
323 |                 response_text = self.llm.generate(prompt, max_tokens=200, stop=None)
324 |             llm_output = output.getvalue()
325 |             logger.info(f"LLM Output in select_relevant_pages:\n{llm_output}")
326 | 
327 |             parsed_response = self.parse_page_selection_response(response_text)
328 |             if parsed_response and self.validate_page_selection_response(parsed_response, len(search_results)):
329 |                 selected_urls = [result['href'] for result in search_results if result['number'] in parsed_response['selected_results']]
330 | 
331 |                 allowed_urls = [url for url in selected_urls if can_fetch(url)]
332 |                 if allowed_urls:
333 |                     return allowed_urls
334 |                 else:
335 |                     print(f"{Fore.YELLOW}Warning: All selected URLs are disallowed by robots.txt. Retrying selection.{Style.RESET_ALL}")
336 |             else:
337 |                 print(f"{Fore.YELLOW}Warning: Invalid page selection. Retrying.{Style.RESET_ALL}")
338 | 
339 |         print(f"{Fore.YELLOW}Warning: All attempts to select relevant pages failed. Falling back to top allowed results.{Style.RESET_ALL}")
340 |         allowed_urls = [result['href'] for result in search_results if can_fetch(result['href'])][:2]
341 |         return allowed_urls
342 | 
343 |     def parse_page_selection_response(self, response: str) -> Dict[str, Union[List[int], str]]:
344 |         lines = response.strip().split('\n')
345 |         parsed = {}
346 |         for line in lines:
347 |             if line.startswith('Selected Results:'):
348 |                 parsed['selected_results'] = [int(num.strip()) for num in re.findall(r'\d+', line)]
349 |             elif line.startswith('Reasoning:'):
350 |                 parsed['reasoning'] = line.split(':', 1)[1].strip()
351 |         return parsed if 'selected_results' in parsed and 'reasoning' in parsed else None
352 | 
353 |     def validate_page_selection_response(self, parsed_response: Dict[str, Union[List[int], str]], num_results: int) -> bool:
354 |         if len(parsed_response['selected_results']) != 2:
355 |             return False
356 |         if any(num < 1 or num > num_results for num in parsed_response['selected_results']):
357 |             return False
358 |         return True
359 | 
360 |     def format_results(self, results: List[Dict]) -> str:
361 |         formatted_results = []
362 |         for result in results:
363 |             formatted_result = f"{result['number']}. Title: {result.get('title', 'N/A')}\n"
364 |             formatted_result += f"   Snippet: {result.get('body', 'N/A')[:200]}...\n"
365 |             formatted_result += f"   URL: {result.get('href', 'N/A')}\n"
366 |             formatted_results.append(formatted_result)
367 |         return "\n".join(formatted_results)
368 | 
369 |     def scrape_content(self, urls: List[str]) -> Dict[str, str]:
370 |         scraped_content = {}
371 |         blocked_urls = []
372 |         for url in urls:
373 |             robots_allowed = can_fetch(url)
374 |             if robots_allowed:
375 |                 content = get_web_content([url])
376 |                 if content:
377 |                     scraped_content.update(content)
378 |                     print(Fore.YELLOW + f"Successfully scraped: {url}" + Style.RESET_ALL)
379 |                     logger.info(f"Successfully scraped: {url}")
380 |                 else:
381 |                     print(Fore.RED + f"Robots.txt disallows scraping of {url}" + Style.RESET_ALL)
382 |                     logger.warning(f"Robots.txt disallows scraping of {url}")
383 |             else:
384 |                 blocked_urls.append(url)
385 |                 print(Fore.RED + f"Warning: Robots.txt disallows scraping of {url}" + Style.RESET_ALL)
386 |                 logger.warning(f"Robots.txt disallows scraping of {url}")
387 | 
388 |         print(Fore.CYAN + f"Scraped content received for {len(scraped_content)} URLs" + Style.RESET_ALL)
389 |         logger.info(f"Scraped content received for {len(scraped_content)} URLs")
390 | 
391 |         if blocked_urls:
392 |             print(Fore.RED + f"Warning: {len(blocked_urls)} URL(s) were not scraped due to robots.txt restrictions." + Style.RESET_ALL)
393 |             logger.warning(f"{len(blocked_urls)} URL(s) were not scraped due to robots.txt restrictions: {', '.join(blocked_urls)}")
394 | 
395 |         return scraped_content
396 | 
397 |     def display_scraped_content(self, scraped_content: Dict[str, str]):
398 |         print(f"\n{Fore.CYAN}Scraped Content:{Style.RESET_ALL}")
399 |         for url, content in scraped_content.items():
400 |             print(f"{Fore.GREEN}URL: {url}{Style.RESET_ALL}")
401 |             print(f"Content: {content[:4000]}...\n")
402 | 
403 |     def generate_final_answer(self, user_query: str, scraped_content: Dict[str, str]) -> str:
404 |         user_query_short = user_query[:200]
405 |         prompt = f"""
406 | You are an AI assistant. Provide a comprehensive and detailed answer to the following question using ONLY the information provided in the scraped content. Do not include any references or mention any sources. Answer directly and thoroughly.
407 | 
408 | Question: "{user_query_short}"
409 | 
410 | Scraped Content:
411 | {self.format_scraped_content(scraped_content)}
412 | 
413 | Important Instructions:
414 | 1. Do not use phrases like "Based on the absence of selected results" or similar.
415 | 2. If the scraped content does not contain enough information to answer the question, say so explicitly and explain what information is missing.
416 | 3. Provide as much relevant detail as possible from the scraped content.
417 | 
418 | Answer:
419 | """
420 |         max_retries = 3
421 |         for attempt in range(max_retries):
422 |             with OutputRedirector() as output:
423 |                 response_text = self.llm.generate(prompt, max_tokens=1024, stop=None)
424 |             llm_output = output.getvalue()
425 |             logger.info(f"LLM Output in generate_final_answer:\n{llm_output}")
426 |             if response_text:
427 |                 logger.info(f"LLM Response:\n{response_text}")
428 |                 return response_text
429 | 
430 |         error_message = "I apologize, but I couldn't generate a satisfactory answer based on the available information."
431 |         logger.warning(f"Failed to generate a response after {max_retries} attempts. Returning error message.")
432 |         return error_message
433 | 
434 |     def format_scraped_content(self, scraped_content: Dict[str, str]) -> str:
435 |         formatted_content = []
436 |         for url, content in scraped_content.items():
437 |             content = re.sub(r'\s+', ' ', content)
438 |             formatted_content.append(f"Content from {url}:\n{content}\n")
439 |         return "\n".join(formatted_content)
440 | 
441 |     def synthesize_final_answer(self, user_query: str) -> str:
442 |         prompt = f"""
443 | After multiple search attempts, we couldn't find a fully satisfactory answer to the user's question: "{user_query}"
444 | 
445 | Please provide the best possible answer you can, acknowledging any limitations or uncertainties.
446 | If appropriate, suggest ways the user might refine their question or where they might find more information.
447 | 
448 | Respond in a clear, concise, and informative manner.
449 | """
450 |         try:
451 |             with OutputRedirector() as output:
452 |                 response_text = self.llm.generate(prompt, max_tokens=self.llm_config.get('max_tokens', 1024), stop=self.llm_config.get('stop', None))
453 |             llm_output = output.getvalue()
454 |             logger.info(f"LLM Output in synthesize_final_answer:\n{llm_output}")
455 |             if response_text:
456 |                 return response_text.strip()
457 |         except Exception as e:
458 |             logger.error(f"Error in synthesize_final_answer: {str(e)}", exc_info=True)
459 |         return "I apologize, but after multiple attempts, I wasn't able to find a satisfactory answer to your question. Please try rephrasing your question or breaking it down into smaller, more specific queries."
460 | 
461 | # End of EnhancedSelfImprovingSearch class
462 | 


--------------------------------------------------------------------------------
/research_manager.py:
--------------------------------------------------------------------------------
   1 | import os
   2 | import sys
   3 | import threading
   4 | import time
   5 | import re
   6 | import json
   7 | import logging
   8 | import curses
   9 | import signal
  10 | from typing import List, Dict, Set, Optional, Tuple, Union
  11 | from dataclasses import dataclass
  12 | from queue import Queue
  13 | from datetime import datetime
  14 | from io import StringIO
  15 | from colorama import init, Fore, Style
  16 | import select
  17 | import termios
  18 | import tty
  19 | from threading import Event
  20 | from urllib.parse import urlparse
  21 | from pathlib import Path
  22 | 
  23 | # Initialize colorama for cross-platform color support
  24 | if os.name == 'nt':  # Windows-specific initialization
  25 |     init(convert=True, strip=False, wrap=True)
  26 | else:
  27 |     init()
  28 | 
  29 | # Set up logging
  30 | log_directory = 'logs'
  31 | if not os.path.exists(log_directory):
  32 |     os.makedirs(log_directory)
  33 | 
  34 | logger = logging.getLogger(__name__)
  35 | logger.setLevel(logging.INFO)
  36 | log_file = os.path.join(log_directory, 'research_llm.log')
  37 | file_handler = logging.FileHandler(log_file)
  38 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
  39 | file_handler.setFormatter(formatter)
  40 | logger.handlers = []
  41 | logger.addHandler(file_handler)
  42 | logger.propagate = False
  43 | 
  44 | # Suppress other loggers
  45 | for name in logging.root.manager.loggerDict:
  46 |     if name != __name__:
  47 |         logging.getLogger(name).disabled = True
  48 | 
  49 | @dataclass
  50 | class ResearchFocus:
  51 |     """Represents a specific area of research focus"""
  52 |     area: str
  53 |     priority: int
  54 |     source_query: str = ""
  55 |     timestamp: str = ""
  56 |     search_queries: List[str] = None
  57 | 
  58 |     def __post_init__(self):
  59 |         if not self.timestamp:
  60 |             self.timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
  61 |         if self.search_queries is None:
  62 |             self.search_queries = []
  63 | 
  64 | @dataclass
  65 | class AnalysisResult:
  66 |     """Contains the complete analysis result"""
  67 |     original_question: str
  68 |     focus_areas: List[ResearchFocus]
  69 |     raw_response: str
  70 |     timestamp: str = ""
  71 | 
  72 |     def __post_init__(self):
  73 |         if not self.timestamp:
  74 |             self.timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
  75 | 
  76 | class StrategicAnalysisParser:
  77 |     def __init__(self, llm=None):
  78 |         self.llm = llm
  79 |         self.logger = logging.getLogger(__name__)
  80 |         # Simplify patterns to match exactly what we expect
  81 |         self.patterns = {
  82 |             'priority': [
  83 |                 r"Priority:\s*(\d+)",  # Match exactly what's in our prompt
  84 |             ]
  85 |         }
  86 | 
  87 |     def strategic_analysis(self, original_query: str) -> Optional[AnalysisResult]:
  88 |         """Generate and process research areas with retries until success"""
  89 |         max_retries = 3
  90 |         try:
  91 |             self.logger.info("Starting strategic analysis...")
  92 |             prompt = f"""
  93 | You must select exactly 5 areas to investigate in order to explore and gather information to answer the research question:
  94 | "{original_query}"
  95 | 
  96 | You MUST provide exactly 5 areas numbered 1-5. Each must have a priority, YOU MUST ensure that you only assign one priority per area.
  97 | Assign priority based on the likelihood of a focus area being investigated to provide information that directly will allow you to respond to "{original_query}" with 5 being most likely and 1 being least.
  98 | Follow this EXACT format without any deviations or additional text:
  99 | 
 100 | 1. [First research topic]
 101 | Priority: [number 1-5]
 102 | 
 103 | 2. [Second research topic]
 104 | Priority: [number 1-5]
 105 | 
 106 | 3. [Third research topic]
 107 | Priority: [number 1-5]
 108 | 
 109 | 4. [Fourth research topic]
 110 | Priority: [number 1-5]
 111 | 
 112 | 5. [Fifth research topic]
 113 | Priority: [number 1-5]
 114 | """
 115 |             for attempt in range(max_retries):
 116 |                 response = self.llm.generate(prompt, max_tokens=1000)
 117 |                 focus_areas = self._extract_research_areas(response)
 118 | 
 119 |                 if focus_areas:  # If we got any valid areas
 120 |                     # Sort by priority (highest first)
 121 |                     focus_areas.sort(key=lambda x: x.priority, reverse=True)
 122 | 
 123 |                     return AnalysisResult(
 124 |                         original_question=original_query,
 125 |                         focus_areas=focus_areas,
 126 |                         raw_response=response,
 127 |                         timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 128 |                     )
 129 |                 else:
 130 |                     self.logger.warning(f"Attempt {attempt + 1}: No valid areas generated, retrying...")
 131 |                     print(f"\nRetrying research area generation (Attempt {attempt + 1}/{max_retries})...")
 132 | 
 133 |             # If all retries failed, try one final time with a stronger prompt
 134 |             prompt += "\n\nIMPORTANT: You MUST provide exactly 5 research areas with priorities. This is crucial."
 135 |             response = self.llm.generate(prompt, max_tokens=1000)
 136 |             focus_areas = self._extract_research_areas(response)
 137 | 
 138 |             if focus_areas:
 139 |                 focus_areas.sort(key=lambda x: x.priority, reverse=True)
 140 |                 return AnalysisResult(
 141 |                     original_question=original_query,
 142 |                     focus_areas=focus_areas,
 143 |                     raw_response=response,
 144 |                     timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 145 |                 )
 146 | 
 147 |             self.logger.error("Failed to generate any valid research areas after all attempts")
 148 |             return None
 149 | 
 150 |         except Exception as e:
 151 |             self.logger.error(f"Error in strategic analysis: {str(e)}")
 152 |             return None
 153 | 
 154 |     def _extract_research_areas(self, text: str) -> List[ResearchFocus]:
 155 |         """Extract research areas with enhanced parsing to handle priorities in various formats."""
 156 |         areas = []
 157 |         lines = text.strip().split('\n')
 158 | 
 159 |         current_area = None
 160 |         current_priority = None
 161 | 
 162 |         for i in range(len(lines)):
 163 |             line = lines[i].strip()
 164 |             if not line:
 165 |                 continue
 166 | 
 167 |             # Check for numbered items (e.g., '1. Area Name')
 168 |             number_match = re.match(r'^(\d+)\.\s*(.*)', line)
 169 |             if number_match:
 170 |                 # If we have a previous area, add it to our list
 171 |                 if current_area is not None:
 172 |                     areas.append(ResearchFocus(
 173 |                         area=current_area.strip(' -:'),
 174 |                         priority=current_priority or 3,
 175 |                     ))
 176 |                 # Start a new area
 177 |                 area_line = number_match.group(2)
 178 | 
 179 |                 # Search for 'priority' followed by a number, anywhere in the area_line
 180 |                 priority_inline_match = re.search(
 181 |                     r'(?i)\bpriority\b\s*(?:[:=]?\s*)?(\d+)', area_line)
 182 |                 if priority_inline_match:
 183 |                     # Extract and set the priority
 184 |                     try:
 185 |                         current_priority = int(priority_inline_match.group(1))
 186 |                         current_priority = max(1, min(5, current_priority))
 187 |                     except ValueError:
 188 |                         current_priority = 3  # Default priority if parsing fails
 189 |                     # Remove the 'priority' portion from area_line
 190 |                     area_line = area_line[:priority_inline_match.start()] + area_line[priority_inline_match.end():]
 191 |                     area_line = area_line.strip(' -:')
 192 |                 else:
 193 |                     current_priority = None  # Priority might be on the next line
 194 | 
 195 |                 current_area = area_line.strip()
 196 | 
 197 |             elif re.match(r'(?i)^priority\s*(?:[:=]?\s*)?(\d+)', line):
 198 |                 # Extract priority from the line following the area
 199 |                 try:
 200 |                     priority_match = re.match(r'(?i)^priority\s*(?:[:=]?\s*)?(\d+)', line)
 201 |                     current_priority = int(priority_match.group(1))
 202 |                     current_priority = max(1, min(5, current_priority))
 203 |                 except (ValueError, IndexError):
 204 |                     current_priority = 3  # Default priority if parsing fails
 205 | 
 206 |             # Check if this is the last line or the next line is a new area
 207 |             next_line_is_new_area = (i + 1 < len(lines)) and re.match(r'^\d+\.', lines[i + 1].strip())
 208 |             if next_line_is_new_area or i + 1 == len(lines):
 209 |                 if current_area is not None:
 210 |                     # Append the current area and priority to the list
 211 |                     areas.append(ResearchFocus(
 212 |                         area=current_area.strip(' -:'),
 213 |                         priority=current_priority or 3,
 214 |                     ))
 215 |                     current_area = None
 216 |                     current_priority = None
 217 | 
 218 |         return areas
 219 | 
 220 |     def _clean_text(self, text: str) -> str:
 221 |         """Clean and normalize text"""
 222 |         text = re.sub(r'\s+', ' ', text)
 223 |         text = re.sub(r'(\d+\))', r'\1.', text)
 224 |         text = re.sub(r'(?i)priority:', 'P:', text)
 225 |         return text.strip()
 226 | 
 227 |     def _add_area(self, areas: List[ResearchFocus], area: str, priority: Optional[int]):
 228 |         """Add area with basic validation"""
 229 |         if not area or len(area.split()) < 3:  # Basic validation
 230 |             return
 231 | 
 232 |         areas.append(ResearchFocus(
 233 |             area=area,
 234 |             priority=priority or 3,
 235 |             timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
 236 |             search_queries=[]
 237 |         ))
 238 | 
 239 |     def _normalize_focus_areas(self, areas: List[ResearchFocus]) -> List[ResearchFocus]:
 240 |         """Normalize and prepare final list of areas"""
 241 |         if not areas:
 242 |             return []
 243 | 
 244 |         # Sort by priority
 245 |         areas.sort(key=lambda x: x.priority, reverse=True)
 246 | 
 247 |         # Ensure priorities are properly spread
 248 |         for i, area in enumerate(areas):
 249 |             area.priority = max(1, min(5, area.priority))
 250 | 
 251 |         return areas[:5]
 252 | 
 253 |     def format_analysis_result(self, result: AnalysisResult) -> str:
 254 |         """Format the results for display"""
 255 |         if not result:
 256 |             return "No valid analysis result generated."
 257 | 
 258 |         formatted = [
 259 |             f"\nResearch Areas for: {result.original_question}\n"
 260 |         ]
 261 | 
 262 |         for i, focus in enumerate(result.focus_areas, 1):
 263 |             formatted.extend([
 264 |                 f"\n{i}. {focus.area}",
 265 |                 f"   Priority: {focus.priority}"
 266 |             ])
 267 | 
 268 |         return "\n".join(formatted)
 269 | 
 270 | class OutputRedirector:
 271 |     """Redirects stdout and stderr to a string buffer"""
 272 |     def __init__(self, stream=None):
 273 |         self.stream = stream or StringIO()
 274 |         self.original_stdout = sys.stdout
 275 |         self.original_stderr = sys.stderr
 276 | 
 277 |     def __enter__(self):
 278 |         sys.stdout = self.stream
 279 |         sys.stderr = self.stream
 280 |         return self.stream
 281 | 
 282 |     def __exit__(self, exc_type, exc_val, exc_tb):
 283 |         sys.stdout = self.original_stdout
 284 |         sys.stderr = self.original_stderr
 285 | 
 286 | class TerminalUI:
 287 |     """Manages terminal display with fixed input area at bottom"""
 288 |     def __init__(self):
 289 |         self.stdscr = None
 290 |         self.input_win = None
 291 |         self.output_win = None
 292 |         self.status_win = None
 293 |         self.max_y = 0
 294 |         self.max_x = 0
 295 |         self.input_buffer = ""
 296 |         self.is_setup = False
 297 |         self.old_terminal_settings = None
 298 |         self.should_terminate = Event()
 299 |         self.shutdown_event = Event()
 300 |         self.research_thread = None
 301 |         self.last_display_height = 0  # Track display height for corruption fix
 302 | 
 303 | 
 304 |     def setup(self):
 305 |         """Initialize the terminal UI"""
 306 |         if self.is_setup:
 307 |             return
 308 | 
 309 |         # Save terminal settings
 310 |         if not os.name == 'nt':  # Unix-like systems
 311 |             self.old_terminal_settings = termios.tcgetattr(sys.stdin.fileno())
 312 | 
 313 |         self.stdscr = curses.initscr()
 314 |         curses.start_color()
 315 |         curses.noecho()
 316 |         curses.cbreak()
 317 |         self.stdscr.keypad(True)
 318 | 
 319 |         # Enable only scroll wheel events, not all mouse events
 320 |         # curses.mousemask(curses.BUTTON4_PRESSED | curses.BUTTON5_PRESSED)
 321 | 
 322 |         # Remove this line that was causing the spam
 323 |         # print('\033[?1003h')  # We don't want mouse movement events
 324 | 
 325 |         # Get terminal dimensions
 326 |         self.max_y, self.max_x = self.stdscr.getmaxyx()
 327 | 
 328 |         # Create windows
 329 |         self.output_win = curses.newwin(self.max_y - 4, self.max_x, 0, 0)
 330 |         self.status_win = curses.newwin(1, self.max_x, self.max_y - 4, 0)
 331 |         self.input_win = curses.newwin(3, self.max_x, self.max_y - 3, 0)
 332 | 
 333 |         # Setup colors
 334 |         curses.init_pair(1, curses.COLOR_GREEN, curses.COLOR_BLACK)
 335 |         curses.init_pair(2, curses.COLOR_CYAN, curses.COLOR_BLACK)
 336 |         curses.init_pair(3, curses.COLOR_YELLOW, curses.COLOR_BLACK)
 337 | 
 338 |         # Enable scrolling
 339 |         self.output_win.scrollok(True)
 340 |         self.output_win.idlok(True)
 341 |         self.input_win.scrollok(True)
 342 | 
 343 |         self.is_setup = True
 344 |         self._refresh_input_prompt()
 345 | 
 346 |     def cleanup(self):
 347 |         """Public cleanup method with enhanced terminal restoration"""
 348 |         if not self.is_setup:
 349 |             return
 350 |         try:
 351 |             # Ensure all windows are properly closed
 352 |             for win in [self.input_win, self.output_win, self.status_win]:
 353 |                 if win:
 354 |                     win.clear()
 355 |                     win.refresh()
 356 | 
 357 |             # Restore terminal state
 358 |             if self.stdscr:
 359 |                 self.stdscr.keypad(False)
 360 |                 curses.nocbreak()
 361 |                 curses.echo()
 362 |                 curses.endwin()
 363 | 
 364 |             # Restore original terminal settings
 365 |             if self.old_terminal_settings and not os.name == 'nt':
 366 |                 termios.tcsetattr(
 367 |                     sys.stdin.fileno(),
 368 |                     termios.TCSADRAIN,
 369 |                     self.old_terminal_settings
 370 |                 )
 371 |         except Exception as e:
 372 |             logger.error(f"Error during terminal cleanup: {str(e)}")
 373 |         finally:
 374 |             self.is_setup = False
 375 |             self.stdscr = None
 376 |             self.input_win = None
 377 |             self.output_win = None
 378 |             self.status_win = None
 379 | 
 380 |     def _cleanup(self):
 381 |         """Enhanced resource cleanup with better process handling"""
 382 |         self.should_terminate.set()
 383 | 
 384 |         # Handle research thread with improved termination
 385 |         if self.research_thread and self.research_thread.is_alive():
 386 |             try:
 387 |                 self.research_thread.join(timeout=1.0)
 388 |                 if self.research_thread.is_alive():
 389 |                     import ctypes
 390 |                     ctypes.pythonapi.PyThreadState_SetAsyncExc(
 391 |                         ctypes.c_long(self.research_thread.ident),
 392 |                         ctypes.py_object(SystemExit))
 393 |                     time.sleep(0.1)  # Give thread time to exit
 394 |                     if self.research_thread.is_alive():  # Double-check
 395 |                         ctypes.pythonapi.PyThreadState_SetAsyncExc(
 396 |                             ctypes.c_long(self.research_thread.ident),
 397 |                             0)  # Reset exception
 398 |             except Exception as e:
 399 |                 logger.error(f"Error terminating research thread: {str(e)}")
 400 | 
 401 |         # Clean up LLM with improved error handling
 402 |         if hasattr(self, 'llm') and hasattr(self.llm, '_cleanup'):
 403 |             try:
 404 |                 self.llm.cleanup()
 405 |             except Exception as e:
 406 |                 logger.error(f"Error cleaning up LLM: {str(e)}")
 407 | 
 408 |         # Ensure terminal is restored
 409 |         try:
 410 |             curses.endwin()
 411 |         except:
 412 |             pass
 413 | 
 414 |         # Final cleanup of UI
 415 |         self.cleanup()
 416 | 
 417 |     def _refresh_input_prompt(self, prompt="Enter command: "):
 418 |         """Refresh the fixed input prompt at bottom with display fix"""
 419 |         if not self.is_setup:
 420 |             return
 421 | 
 422 |         try:
 423 |             # Clear the entire input window first
 424 |             self.input_win.clear()
 425 | 
 426 |             # Calculate proper cursor position
 427 |             cursor_y = 0
 428 |             cursor_x = len(prompt) + len(self.input_buffer)
 429 | 
 430 |             # Add the prompt and buffer
 431 |             self.input_win.addstr(0, 0, f"{prompt}{self.input_buffer}", curses.color_pair(1))
 432 | 
 433 |             # Position cursor correctly
 434 |             try:
 435 |                 self.input_win.move(cursor_y, cursor_x)
 436 |             except curses.error:
 437 |                 pass  # Ignore if cursor would be off-screen
 438 | 
 439 |             self.input_win.refresh()
 440 |         except curses.error:
 441 |             pass
 442 | 
 443 |     def update_output(self, text: str):
 444 |         """Update output window with display corruption fix"""
 445 |         if not self.is_setup:
 446 |             return
 447 | 
 448 |         try:
 449 |             # Clean ANSI escape codes
 450 |             clean_text = re.sub(r'\x1b\[[0-9;]*[mK]', '', text)
 451 | 
 452 |             # Store current position
 453 |             current_y, _ = self.output_win.getyx()
 454 | 
 455 |             # Clear any potential corruption
 456 |             if current_y > self.last_display_height:
 457 |                 self.output_win.clear()
 458 | 
 459 |             self.output_win.addstr(clean_text + "\n", curses.color_pair(2))
 460 |             new_y, _ = self.output_win.getyx()
 461 |             self.last_display_height = new_y
 462 | 
 463 |             self.output_win.refresh()
 464 |             self._refresh_input_prompt()
 465 |         except curses.error:
 466 |             pass
 467 | 
 468 |     def update_status(self, text: str):
 469 |         """Update the status line above input area"""
 470 |         if not self.is_setup:
 471 |             return
 472 | 
 473 |         try:
 474 |             self.status_win.clear()
 475 |             self.status_win.addstr(0, 0, text, curses.color_pair(3))
 476 |             self.status_win.refresh()
 477 |             self._refresh_input_prompt()  # Ensure prompt is refreshed after status update
 478 |         except curses.error:
 479 |             pass
 480 | 
 481 |     def get_input(self, prompt: Optional[str] = None) -> Optional[str]:
 482 |         """Enhanced input handling with mouse scroll support"""
 483 |         try:
 484 |             if prompt:
 485 |                 self.update_status(prompt)
 486 |             if not self.is_setup:
 487 |                 self.setup()
 488 |             self.input_buffer = ""
 489 |             self._refresh_input_prompt()
 490 | 
 491 |             while True:
 492 |                 if self.should_terminate.is_set():
 493 |                     return None
 494 | 
 495 |                 try:
 496 |                     ch = self.input_win.getch()
 497 | 
 498 |                     if ch == curses.KEY_MOUSE:
 499 |                         try:
 500 |                             mouse_event = curses.getmouse()
 501 |                             # Ignore mouse events entirely for now
 502 |                             continue
 503 |                         except curses.error:
 504 |                             continue
 505 | 
 506 |                     if ch == 4:  # Ctrl+D
 507 |                         result = self.input_buffer.strip()
 508 |                         self.input_buffer = ""
 509 |                         if not result:
 510 |                             self.cleanup()
 511 |                             return "@quit"
 512 |                         return result
 513 | 
 514 |                     elif ch == 3:  # Ctrl+C
 515 |                         self.should_terminate.set()
 516 |                         self.cleanup()
 517 |                         return "@quit"
 518 | 
 519 |                     elif ch == ord('\n'):  # Enter
 520 |                         result = self.input_buffer.strip()
 521 |                         if result:
 522 |                             self.input_buffer = ""
 523 |                             return result
 524 |                         continue
 525 | 
 526 |                     elif ch == curses.KEY_BACKSPACE or ch == 127:  # Backspace
 527 |                         if self.input_buffer:
 528 |                             self.input_buffer = self.input_buffer[:-1]
 529 |                             self._refresh_input_prompt()
 530 | 
 531 |                     elif 32 <= ch <= 126:  # Printable characters
 532 |                         self.input_buffer += chr(ch)
 533 |                         self._refresh_input_prompt()
 534 | 
 535 |                 except KeyboardInterrupt:
 536 |                     self.should_terminate.set()
 537 |                     self.cleanup()
 538 |                     return "@quit"
 539 |                 except curses.error:
 540 |                     self._refresh_input_prompt()
 541 | 
 542 |         except Exception as e:
 543 |             logger.error(f"Error in get_input: {str(e)}")
 544 |             self.should_terminate.set()
 545 |             self.cleanup()
 546 |             return "@quit"
 547 | 
 548 |     def force_exit(self):
 549 |         """Force immediate exit with enhanced cleanup"""
 550 |         try:
 551 |             self.should_terminate.set()
 552 |             self.shutdown_event.set()
 553 |             self._cleanup()  # Call private cleanup first
 554 |             self.cleanup()   # Then public cleanup
 555 |             curses.endwin()  # Final attempt to restore terminal
 556 |         except:
 557 |             pass
 558 |         finally:
 559 |             os._exit(0)  # Ensure exit
 560 | 
 561 | class NonBlockingInput:
 562 |     """Handles non-blocking keyboard input for Unix-like systems"""
 563 |     def __init__(self):
 564 |         self.old_settings = None
 565 | 
 566 |     def __enter__(self):
 567 |         if os.name == 'nt':  # Windows
 568 |             return self
 569 |         self.old_settings = termios.tcgetattr(sys.stdin)
 570 |         tty.setcbreak(sys.stdin.fileno())
 571 |         return self
 572 | 
 573 |     def __exit__(self, type, value, traceback):
 574 |         if os.name != 'nt':  # Unix-like
 575 |             termios.tcsetattr(sys.stdin, termios.TCSADRAIN, self.old_settings)
 576 | 
 577 |     def check_input(self, timeout=0.1):
 578 |         """Check for input without blocking, cross-platform"""
 579 |         if os.name == 'nt':  # Windows
 580 |             import msvcrt
 581 |             if msvcrt.kbhit():
 582 |                 return msvcrt.getch().decode('utf-8')
 583 |             return None
 584 |         else:  # Unix-like
 585 |             ready_to_read, _, _ = select.select([sys.stdin], [], [], timeout)
 586 |             if ready_to_read:
 587 |                 return sys.stdin.read(1)
 588 |             return None
 589 | 
 590 | class ResearchManager:
 591 |     """Manages the research process including analysis, search, and documentation"""
 592 |     def __init__(self, llm_wrapper, parser, search_engine, max_searches_per_cycle: int = 5):
 593 |         self.llm = llm_wrapper
 594 |         self.parser = parser
 595 |         self.search_engine = search_engine
 596 |         self.max_searches = max_searches_per_cycle
 597 |         self.should_terminate = threading.Event()
 598 |         self.shutdown_event = Event()
 599 |         self.research_started = threading.Event()
 600 |         self.research_thread = None
 601 |         self.thinking = False
 602 |         self.stop_words = {
 603 |             'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'i',
 604 |             'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at'
 605 |         }
 606 | 
 607 |         # State tracking
 608 |         self.searched_urls: Set[str] = set()
 609 |         self.current_focus: Optional[ResearchFocus] = None
 610 |         self.original_query: str = ""
 611 |         self.focus_areas: List[ResearchFocus] = []
 612 |         self.is_running = False
 613 | 
 614 |         # New conversation mode attributes
 615 |         self.research_complete = False
 616 |         self.research_summary = ""
 617 |         self.conversation_active = False
 618 |         self.research_content = ""
 619 | 
 620 |         # Initialize document paths
 621 |         self.document_path = None
 622 |         self.session_files = []
 623 | 
 624 |         # Initialize UI and parser
 625 |         self.ui = TerminalUI()
 626 |         self.strategic_parser = StrategicAnalysisParser(llm=self.llm)
 627 | 
 628 |         # Initialize new flags for pausing and assessment
 629 |         self.research_paused = False
 630 |         self.awaiting_user_decision = False
 631 | 
 632 |         # Setup signal handlers
 633 |         signal.signal(signal.SIGINT, self._signal_handler)
 634 |         signal.signal(signal.SIGTERM, self._signal_handler)
 635 | 
 636 |     def _signal_handler(self, signum, frame):
 637 |         """Handle interrupt signals"""
 638 |         self.shutdown_event.set()
 639 |         self.should_terminate.set()
 640 |         self._cleanup()
 641 | 
 642 |     def print_thinking(self):
 643 |         """Display thinking indicator to user"""
 644 |         self.ui.update_output("🧠 Thinking...")
 645 | 
 646 |     @staticmethod
 647 |     def get_initial_input() -> str:
 648 |         """Get the initial research query from user"""
 649 |         print(f"{Fore.GREEN}📝 Enter your message (Press CTRL+D to submit):{Style.RESET_ALL}")
 650 |         lines = []
 651 |         try:
 652 |             while True:
 653 |                 line = input()
 654 |                 if line:  # Only add non-empty lines
 655 |                     lines.append(line)
 656 |                 if not line:  # Empty line (just Enter pressed)
 657 |                     break
 658 |         except EOFError:  # Ctrl+D pressed
 659 |             pass
 660 |         except KeyboardInterrupt:  # Ctrl+C pressed
 661 |             print("\nOperation cancelled")
 662 |             sys.exit(0)
 663 | 
 664 |         return " ".join(lines).strip()
 665 | 
 666 |     def formulate_search_queries(self, focus_area: ResearchFocus) -> List[str]:
 667 |         """Generate search queries for a focus area"""
 668 |         try:
 669 |             self.print_thinking()
 670 | 
 671 |             prompt = f"""
 672 | In order to research this query/topic:
 673 | 
 674 | Context: {self.original_query}
 675 | 
 676 | Base a search query to investigate the following research focus, which is related to the original query/topic:
 677 | 
 678 | Area: {focus_area.area}
 679 | 
 680 | Create a search query that will yield specific, search results thare are directly relevant to your focus area.
 681 | Format your response EXACTLY like this:
 682 | 
 683 | Search query: [Your 2-5 word query]
 684 | Time range: [d/w/m/y/none]
 685 | 
 686 | Do not provide any additional information or explanation, note that the time range allows you to see results within a time range (d is within the last day, w is within the last week, m is within the last month, y is within the last year, and none is results from anytime, only select one, using only the corresponding letter for whichever of these options you select as indicated in the response format) use your judgement as many searches will not require a time range and some may depending on what the research focus is.
 687 | """
 688 |             response_text = self.llm.generate(prompt, max_tokens=50, stop=None)
 689 |             query, time_range = self.parse_query_response(response_text)
 690 | 
 691 |             if not query:
 692 |                 self.ui.update_output(f"{Fore.RED}Error: Empty search query. Using focus area as query...{Style.RESET_ALL}")
 693 |                 return [focus_area.area]
 694 | 
 695 |             self.ui.update_output(f"{Fore.YELLOW}Original focus: {focus_area.area}{Style.RESET_ALL}")
 696 |             self.ui.update_output(f"{Fore.YELLOW}Formulated query: {query}{Style.RESET_ALL}")
 697 |             self.ui.update_output(f"{Fore.YELLOW}Time range: {time_range}{Style.RESET_ALL}")
 698 | 
 699 |             return [query]
 700 | 
 701 |         except Exception as e:
 702 |             logger.error(f"Error formulating query: {str(e)}")
 703 |             return [focus_area.area]
 704 | 
 705 |     def parse_search_query(self, query_response: str) -> Dict[str, str]:
 706 |         """Parse search query formulation response with improved time range detection"""
 707 |         try:
 708 |             lines = query_response.strip().split('\n')
 709 |             result = {
 710 |                 'query': '',
 711 |                 'time_range': 'none'
 712 |             }
 713 | 
 714 |             # First try to find standard format
 715 |             for line in lines:
 716 |                 if ':' in line:
 717 |                     key, value = line.split(':', 1)
 718 |                     key = key.strip().lower()
 719 |                     value = value.strip()
 720 | 
 721 |                     if 'query' in key:
 722 |                         result['query'] = self._clean_query(value)
 723 |                     elif ('time' in key or 'range' in key) and value.strip().lower() in ['d', 'w', 'm', 'y', 'none']:
 724 |                         result['time_range'] = value.strip().lower()
 725 | 
 726 |             # If no time range found, look for individual characters
 727 |             if result['time_range'] == 'none':
 728 |                 # Get all text except the query itself
 729 |                 full_text = query_response.lower()
 730 |                 if result['query']:
 731 |                     full_text = full_text.replace(result['query'].lower(), '')
 732 | 
 733 |                 # Look for isolated d, w, m, or y characters
 734 |                 time_chars = set()
 735 |                 for char in ['d', 'w', 'm', 'y']:
 736 |                     # Check if char exists by itself (not part of another word)
 737 |                     matches = re.finditer(r'\b' + char + r'\b', full_text)
 738 |                     for match in matches:
 739 |                         # Verify it's not part of a word
 740 |                         start, end = match.span()
 741 |                         if (start == 0 or not full_text[start-1].isalpha()) and \
 742 |                            (end == len(full_text) or not full_text[end].isalpha()):
 743 |                             time_chars.add(char)
 744 | 
 745 |                 # If exactly one time char found, use it
 746 |                 if len(time_chars) == 1:
 747 |                     result['time_range'] = time_chars.pop()
 748 | 
 749 |             return result
 750 |         except Exception as e:
 751 |             logger.error(f"Error parsing search query: {str(e)}")
 752 |             return {'query': '', 'time_range': 'none'}
 753 | 
 754 |     def _cleanup(self):
 755 |         """Enhanced cleanup to handle conversation mode"""
 756 |         self.conversation_active = False
 757 |         self.should_terminate.set()
 758 | 
 759 |         if self.research_thread and self.research_thread.is_alive():
 760 |             try:
 761 |                 self.research_thread.join(timeout=1.0)
 762 |                 if self.research_thread.is_alive():
 763 |                     import ctypes
 764 |                     ctypes.pythonapi.PyThreadState_SetAsyncExc(
 765 |                         ctypes.c_long(self.research_thread.ident),
 766 |                         ctypes.py_object(SystemExit)
 767 |                     )
 768 |             except Exception as e:
 769 |                 logger.error(f"Error terminating research thread: {str(e)}")
 770 | 
 771 |         if hasattr(self.llm, 'cleanup'):
 772 |             try:
 773 |                 self.llm.cleanup()
 774 |             except Exception as e:
 775 |                 logger.error(f"Error cleaning up LLM: {str(e)}")
 776 | 
 777 |         if hasattr(self.ui, 'cleanup'):
 778 |             self.ui.cleanup()
 779 | 
 780 |     def _initialize_document(self):
 781 |         """Initialize research session document"""
 782 |         try:
 783 |             # Get all existing research session files
 784 |             self.session_files = []
 785 |             for file in os.listdir():
 786 |                 if file.startswith("research_session_") and file.endswith(".txt"):
 787 |                     try:
 788 |                         num = int(file.split("_")[2].split(".")[0])
 789 |                         self.session_files.append(num)
 790 |                     except ValueError:
 791 |                         continue
 792 | 
 793 |             # Determine next session number
 794 |             next_session = 1 if not self.session_files else max(self.session_files) + 1
 795 |             self.document_path = f"research_session_{next_session}.txt"
 796 | 
 797 |             # Initialize the new document
 798 |             with open(self.document_path, 'w', encoding='utf-8') as f:
 799 |                 f.write(f"Research Session {next_session}\n")
 800 |                 f.write(f"Topic: {self.original_query}\n")
 801 |                 f.write(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
 802 |                 f.write("="*80 + "\n\n")
 803 |                 f.flush()
 804 | 
 805 |         except Exception as e:
 806 |             logger.error(f"Error initializing document: {str(e)}")
 807 |             self.document_path = "research_findings.txt"
 808 |             with open(self.document_path, 'w', encoding='utf-8') as f:
 809 |                 f.write("Research Findings:\n\n")
 810 |                 f.flush()
 811 | 
 812 |     def add_to_document(self, content: str, source_url: str, focus_area: str):
 813 |         """Add research findings to current session document"""
 814 |         try:
 815 |             with open(self.document_path, 'a', encoding='utf-8') as f:
 816 |                 if source_url not in self.searched_urls:
 817 |                     f.write(f"\n{'='*80}\n")
 818 |                     f.write(f"Research Focus: {focus_area}\n")
 819 |                     f.write(f"Source: {source_url}\n")
 820 |                     f.write(f"Content:\n{content}\n")
 821 |                     f.write(f"{'='*80}\n")
 822 |                     f.flush()
 823 |                     self.searched_urls.add(source_url)
 824 |                     self.ui.update_output(f"Added content from: {source_url}")
 825 |         except Exception as e:
 826 |             logger.error(f"Error adding to document: {str(e)}")
 827 |             self.ui.update_output(f"Error saving content: {str(e)}")
 828 | 
 829 |     def _process_search_results(self, results: Dict[str, str], focus_area: str):
 830 |         """Process and store search results"""
 831 |         if not results:
 832 |             return
 833 | 
 834 |         for url, content in results.items():
 835 |             if url not in self.searched_urls:
 836 |                 self.add_to_document(content, url, focus_area)
 837 | 
 838 |     def _research_loop(self):
 839 |         """Main research loop with comprehensive functionality"""
 840 |         self.is_running = True
 841 |         try:
 842 |             self.research_started.set()
 843 | 
 844 |             while not self.should_terminate.is_set() and not self.shutdown_event.is_set():
 845 |                 # Check if research is paused
 846 |                 if self.research_paused:
 847 |                     time.sleep(1)
 848 |                     continue
 849 | 
 850 |                 self.ui.update_output("\nAnalyzing research progress...")
 851 | 
 852 |                 # Generate focus areas
 853 |                 self.ui.update_output("\nGenerating research focus areas...")
 854 |                 analysis_result = self.strategic_parser.strategic_analysis(self.original_query)
 855 | 
 856 |                 if not analysis_result:
 857 |                     self.ui.update_output("\nFailed to generate analysis result. Retrying...")
 858 |                     continue
 859 | 
 860 |                 focus_areas = analysis_result.focus_areas
 861 |                 if not focus_areas:
 862 |                     self.ui.update_output("\nNo valid focus areas generated. Retrying...")
 863 |                     continue
 864 | 
 865 |                 self.ui.update_output(f"\nGenerated {len(focus_areas)} research areas:")
 866 |                 for i, focus in enumerate(focus_areas, 1):
 867 |                     self.ui.update_output(f"\nArea {i}: {focus.area}")
 868 |                     self.ui.update_output(f"Priority: {focus.priority}")
 869 | 
 870 |                 # Process each focus area in priority order
 871 |                 for focus_area in focus_areas:
 872 |                     if self.should_terminate.is_set():
 873 |                         break
 874 | 
 875 |                     # Check if research is paused
 876 |                     while self.research_paused and not self.should_terminate.is_set():
 877 |                         time.sleep(1)
 878 | 
 879 |                     if self.should_terminate.is_set():
 880 |                         break
 881 | 
 882 |                     self.current_focus = focus_area
 883 |                     self.ui.update_output(f"\nInvestigating: {focus_area.area}")
 884 | 
 885 |                     queries = self.formulate_search_queries(focus_area)
 886 |                     if not queries:
 887 |                         continue
 888 | 
 889 |                     for query in queries:
 890 |                         if self.should_terminate.is_set():
 891 |                             break
 892 | 
 893 |                         # Check if research is paused
 894 |                         while self.research_paused and not self.should_terminate.is_set():
 895 |                             time.sleep(1)
 896 | 
 897 |                         if self.should_terminate.is_set():
 898 |                             break
 899 | 
 900 |                         try:
 901 |                             self.ui.update_output(f"\nSearching: {query}")
 902 |                             results = self.search_engine.perform_search(query, time_range='none')
 903 | 
 904 |                             if results:
 905 |                                 # self.search_engine.display_search_results(results)
 906 |                                 selected_urls = self.search_engine.select_relevant_pages(results, query)
 907 | 
 908 |                                 if selected_urls:
 909 |                                     self.ui.update_output("\n⚙️ Scraping selected pages...")
 910 |                                     scraped_content = self.search_engine.scrape_content(selected_urls)
 911 |                                     if scraped_content:
 912 |                                         for url, content in scraped_content.items():
 913 |                                             if url not in self.searched_urls:
 914 |                                                 self.add_to_document(content, url, focus_area.area)
 915 | 
 916 |                         except Exception as e:
 917 |                             logger.error(f"Error in search: {str(e)}")
 918 |                             self.ui.update_output(f"Error during search: {str(e)}")
 919 | 
 920 |                     if self.check_document_size():
 921 |                         self.ui.update_output("\nDocument size limit reached. Finalizing research.")
 922 |                         return
 923 | 
 924 |                 # After processing all areas, cycle back to generate new ones
 925 |                 self.ui.update_output("\nAll current focus areas investigated. Generating new areas...")
 926 | 
 927 |         except Exception as e:
 928 |             logger.error(f"Error in research loop: {str(e)}")
 929 |             self.ui.update_output(f"Error in research process: {str(e)}")
 930 |         finally:
 931 |             self.is_running = False
 932 | 
 933 |     def start_research(self, topic: str):
 934 |         """Start research with new session document"""
 935 |         try:
 936 |             self.ui.setup()
 937 |             self.original_query = topic
 938 |             self._initialize_document()
 939 | 
 940 |             self.ui.update_output(f"Starting research on: {topic}")
 941 |             self.ui.update_output(f"Session document: {self.document_path}")
 942 |             self.ui.update_output("\nCommands available during research:")
 943 |             self.ui.update_output("'s' = Show status")
 944 |             self.ui.update_output("'f' = Show current focus")
 945 |             self.ui.update_output("'p' = Pause and assess the research progress")  # New command
 946 |             self.ui.update_output("'q' = Quit research\n")
 947 | 
 948 |             # Reset events
 949 |             self.should_terminate.clear()
 950 |             self.research_started.clear()
 951 |             self.research_paused = False  # Ensure research is not paused at the start
 952 |             self.awaiting_user_decision = False
 953 | 
 954 |             # Start research thread
 955 |             self.research_thread = threading.Thread(target=self._research_loop, daemon=True)
 956 |             self.research_thread.start()
 957 | 
 958 |             # Wait for research to actually start
 959 |             if not self.research_started.wait(timeout=10):
 960 |                 self.ui.update_output("Error: Research failed to start within timeout period")
 961 |                 self.should_terminate.set()
 962 |                 return
 963 | 
 964 |             while not self.should_terminate.is_set():
 965 |                 cmd = self.ui.get_input("Enter command: ")
 966 |                 if cmd is None or self.shutdown_event.is_set():
 967 |                     if self.should_terminate.is_set() and not self.research_complete:
 968 |                         self.ui.update_output("\nGenerating research summary... please wait...")
 969 |                         summary = self.terminate_research()
 970 |                         self.ui.update_output("\nFinal Research Summary:")
 971 |                         self.ui.update_output(summary)
 972 |                     break
 973 |                 if cmd:
 974 |                     self._handle_command(cmd)
 975 | 
 976 |         except Exception as e:
 977 |             logger.error(f"Error in research process: {str(e)}")
 978 |         finally:
 979 |             self._cleanup()
 980 | 
 981 |     def check_document_size(self) -> bool:
 982 |         """Check if document size is approaching context limit"""
 983 |         try:
 984 |             with open(self.document_path, 'r', encoding='utf-8') as f:
 985 |                 content = f.read()
 986 |             estimated_tokens = len(content.split()) * 1.3
 987 |             max_tokens = self.llm.llm_config.get('n_ctx', 2048)
 988 |             current_ratio = estimated_tokens / max_tokens
 989 | 
 990 |             if current_ratio > 0.8:
 991 |                 logger.warning(f"Document size at {current_ratio*100:.1f}% of context limit")
 992 |                 self.ui.update_output(f"Warning: Document size at {current_ratio*100:.1f}% of context limit")
 993 | 
 994 |             return current_ratio > 0.9
 995 |         except Exception as e:
 996 |             logger.error(f"Error checking document size: {str(e)}")
 997 |             return True
 998 | 
 999 |     def _handle_command(self, cmd: str):
1000 |         """Handle user commands during research"""
1001 |         if cmd.lower() == 's':
1002 |             self.ui.update_output(self.get_progress())
1003 |         elif cmd.lower() == 'f':
1004 |             if self.current_focus:
1005 |                 self.ui.update_output("\nCurrent Focus:")
1006 |                 self.ui.update_output(f"Area: {self.current_focus.area}")
1007 |                 self.ui.update_output(f"Priority: {self.current_focus.priority}")
1008 |             else:
1009 |                 self.ui.update_output("\nNo current focus area")
1010 |         elif cmd.lower() == 'p':
1011 |             self.pause_and_assess()
1012 |         elif cmd.lower() == 'q':
1013 |             self.ui.update_output("\nInitiating research termination...")
1014 |             self.should_terminate.set()
1015 |             self.ui.update_output("\nGenerating research summary... please wait...")
1016 |             summary = self.terminate_research()
1017 |             self.ui.update_output("\nFinal Research Summary:")
1018 |             self.ui.update_output(summary)
1019 | 
1020 |     def pause_and_assess(self):
1021 |         """Pause the research and assess if the collected content is sufficient."""
1022 |         try:
1023 |             # Pause the research thread
1024 |             self.ui.update_output("\nPausing research for assessment...")
1025 |             self.research_paused = True
1026 | 
1027 |             # Start progress indicator in a separate thread
1028 |             self.summary_ready = False
1029 |             indicator_thread = threading.Thread(
1030 |                 target=self.show_progress_indicator,
1031 |                 args=("Assessing the researched information...",)
1032 |             )
1033 |             indicator_thread.daemon = True
1034 |             indicator_thread.start()
1035 | 
1036 |             # Read the current research content
1037 |             if not os.path.exists(self.document_path):
1038 |                 self.summary_ready = True
1039 |                 indicator_thread.join()
1040 |                 self.ui.update_output("No research data found to assess.")
1041 |                 self.research_paused = False
1042 |                 return
1043 | 
1044 |             with open(self.document_path, 'r', encoding='utf-8') as f:
1045 |                 content = f.read().strip()
1046 | 
1047 |             if not content:
1048 |                 self.summary_ready = True
1049 |                 indicator_thread.join()
1050 |                 self.ui.update_output("No research data was collected to assess.")
1051 |                 self.research_paused = False
1052 |                 return
1053 | 
1054 |             # Prepare the prompt for the AI assessment
1055 |             assessment_prompt = f"""
1056 | Based on the following research content, please assess whether the original query "{self.original_query}" can be answered sufficiently with the collected information.
1057 | 
1058 | Research Content:
1059 | {content}
1060 | 
1061 | Instructions:
1062 | 1. If the research content provides enough information to answer the original query in detail, respond with: "The research is sufficient to answer the query."
1063 | 2. If not, respond with: "The research is insufficient and it would be advisable to continue gathering information."
1064 | 3. Do not provide any additional information or details.
1065 | 
1066 | Assessment:
1067 | """
1068 | 
1069 |             # Generate the assessment
1070 |             assessment = self.llm.generate(assessment_prompt, max_tokens=200)
1071 | 
1072 |             # Stop the progress indicator
1073 |             self.summary_ready = True
1074 |             indicator_thread.join()
1075 | 
1076 |             # Display the assessment
1077 |             self.ui.update_output("\nAssessment Result:")
1078 |             self.ui.update_output(assessment.strip())
1079 | 
1080 |             # Provide user with options to continue or quit
1081 |             self.ui.update_output("\nEnter 'c' to continue the research or 'q' to terminate and generate the summary.")
1082 |             self.awaiting_user_decision = True  # Flag to indicate we are waiting for user's decision
1083 | 
1084 |             while self.awaiting_user_decision:
1085 |                 cmd = self.ui.get_input("Enter command ('c' to continue, 'q' to quit): ")
1086 |                 if cmd is None:
1087 |                     continue  # Ignore invalid inputs
1088 |                 cmd = cmd.strip().lower()
1089 |                 if cmd == 'c':
1090 |                     self.ui.update_output("\nResuming research...")
1091 |                     self.research_paused = False
1092 |                     self.awaiting_user_decision = False
1093 |                 elif cmd == 'q':
1094 |                     self.ui.update_output("\nTerminating research and generating summary...")
1095 |                     self.awaiting_user_decision = False
1096 |                     self.should_terminate.set()
1097 |                     summary = self.terminate_research()
1098 |                     self.ui.update_output("\nFinal Research Summary:")
1099 |                     self.ui.update_output(summary)
1100 |                     break
1101 |                 else:
1102 |                     self.ui.update_output("Invalid command. Please enter 'c' to continue or 'q' to quit.")
1103 | 
1104 |         except Exception as e:
1105 |             logger.error(f"Error during pause and assess: {str(e)}")
1106 |             self.ui.update_output(f"Error during assessment: {str(e)}")
1107 |             self.research_paused = False
1108 |         finally:
1109 |             self.summary_ready = True  # Ensure the indicator thread can exit
1110 | 
1111 |     def get_progress(self) -> str:
1112 |         """Get current research progress"""
1113 |         return f"""
1114 | Research Progress:
1115 | - Original Query: {self.original_query}
1116 | - Sources analyzed: {len(self.searched_urls)}
1117 | - Status: {'Active' if self.is_running else 'Stopped'}
1118 | - Current focus: {self.current_focus.area if self.current_focus else 'Initializing'}
1119 | """
1120 | 
1121 |     def is_active(self) -> bool:
1122 |         """Check if research is currently active"""
1123 |         return self.is_running and self.research_thread and self.research_thread.is_alive()
1124 | 
1125 |     def terminate_research(self) -> str:
1126 |         """Terminate research and return to main terminal"""
1127 |         try:
1128 |             print("Initiating research termination...")
1129 |             sys.stdout.flush()
1130 | 
1131 |             # Start progress indicator in a separate thread immediately
1132 |             indicator_thread = threading.Thread(target=self.show_progress_indicator)
1133 |             indicator_thread.daemon = True
1134 |             indicator_thread.start()
1135 | 
1136 |             if not os.path.exists(self.document_path):
1137 |                 self.summary_ready = True
1138 |                 indicator_thread.join(timeout=1.0)
1139 |                 self._cleanup()
1140 |                 return "No research data found to summarize."
1141 | 
1142 |             with open(self.document_path, 'r', encoding='utf-8') as f:
1143 |                 content = f.read().strip()
1144 |                 self.research_content = content  # Store for conversation mode
1145 | 
1146 |             if not content or content == "Research Findings:\n\n":
1147 |                 self.summary_ready = True
1148 |                 indicator_thread.join(timeout=1.0)
1149 |                 self._cleanup()
1150 |                 return "No research data was collected to summarize."
1151 | 
1152 |             try:
1153 |                 # Generate summary using LLM
1154 |                 summary_prompt = f"""
1155 |                 Analyze the following content to provide a comprehensive research summary and a response to the user's original query "{self.original_query}" ensuring that you conclusively answer the query in detail:
1156 | 
1157 |                 Research Content:
1158 |                 {content}
1159 | 
1160 |                 Important Instructions:
1161 |                 > Summarize the research findings that are relevant to the Original topic/question: "{self.original_query}"
1162 |                 > Ensure that in your summary you directly answer the original question/topic conclusively to the best of your ability in detail.
1163 |                 > Read the original topic/question again "{self.original_query}" and abide by any additional instructions that it contains, exactly as instructed in your summary otherwise provide it normally should it not have any specific instructions
1164 | 
1165 |                 Summary:
1166 |                 """
1167 | 
1168 |                 summary = self.llm.generate(summary_prompt, max_tokens=4000)
1169 | 
1170 |                 # Signal that summary is complete to stop the progress indicator
1171 |                 self.summary_ready = True
1172 |                 indicator_thread.join(timeout=1.0)
1173 | 
1174 |                 # Store summary and mark research as complete
1175 |                 self.research_summary = summary
1176 |                 self.research_complete = True
1177 | 
1178 |                 # Format summary
1179 |                 formatted_summary = f"""
1180 |                 {'='*80}
1181 |                 RESEARCH SUMMARY
1182 |                 {'='*80}
1183 | 
1184 |                 Original Query: {self.original_query}
1185 |                 Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
1186 | 
1187 |                 {summary}
1188 | 
1189 |                 {'='*80}
1190 |                 End of Summary
1191 |                 {'='*80}
1192 |                 """
1193 | 
1194 |                 # Write to document
1195 |                 with open(self.document_path, 'a', encoding='utf-8') as f:
1196 |                     f.write("\n\n" + formatted_summary)
1197 | 
1198 |                 # Clean up research UI
1199 |                 if hasattr(self, 'ui') and self.ui:
1200 |                     self.ui.cleanup()
1201 | 
1202 |                 return formatted_summary
1203 | 
1204 |             except Exception as e:
1205 |                 self.summary_ready = True
1206 |                 indicator_thread.join(timeout=1.0)
1207 |                 raise e
1208 | 
1209 |         except Exception as e:
1210 |             error_msg = f"Error generating summary: {str(e)}"
1211 |             logger.error(error_msg)
1212 |             return error_msg
1213 | 
1214 |         finally:
1215 |             # Clean up research UI
1216 |             self._cleanup_research_ui()
1217 | 
1218 |     def show_progress_indicator(self, message="Generating summary, please wait..."):
1219 |         """Show a rotating progress indicator until the summary is ready."""
1220 |         symbols = ['|', '/', '-', '\\']
1221 |         idx = 0
1222 |         self.summary_ready = False  # Track whether the summary is complete
1223 |         while not self.summary_ready:
1224 |             sys.stdout.write(f"\r{message} {symbols[idx]}")
1225 |             sys.stdout.flush()
1226 |             idx = (idx + 1) % len(symbols)
1227 |             time.sleep(0.2)  # Adjust the speed of the rotation if needed
1228 |         sys.stdout.write("\r" + " " * (len(message) + 2) + "\r")  # Clear the line when done
1229 | 
1230 |     def _cleanup_research_ui(self):
1231 |         """Clean up just the research UI components"""
1232 |         if hasattr(self, 'ui') and self.ui:
1233 |             self.ui.cleanup()
1234 | 
1235 |     def show_thinking_indicator(self, message: str, stop_flag_name: str):
1236 |         """Show a rotating thinking indicator with custom message"""
1237 |         symbols = ['|', '/', '-', '\\']
1238 |         idx = 0
1239 |         while getattr(self, stop_flag_name):  # Use dynamic attribute lookup
1240 |             sys.stdout.write(f"\r{message} {symbols[idx]}")
1241 |             sys.stdout.flush()
1242 |             idx = (idx + 1) % len(symbols)
1243 |             time.sleep(0.2)
1244 |         sys.stdout.write("\r" + " " * (len(message) + 2) + "\r")  # Clear the line when done
1245 | 
1246 |     def start_conversation_mode(self):
1247 |         """Start interactive conversation mode with CTRL+D input handling and thinking indicator"""
1248 |         self.conversation_active = True
1249 |         self.thinking = False
1250 | 
1251 |         # Print header with clear instructions
1252 |         print("\n" + "="*80)
1253 |         print(Fore.CYAN + "Research Conversation Mode" + Style.RESET_ALL)
1254 |         print("="*80)
1255 |         print(Fore.YELLOW + "\nInstructions:")
1256 |         print("- Type your question and press CTRL+D to submit")
1257 |         print("- Type 'quit' and press CTRL+D to exit")
1258 |         print("- Your messages appear in green")
1259 |         print("- AI responses appear in cyan" + Style.RESET_ALL + "\n")
1260 | 
1261 |         while self.conversation_active:
1262 |             try:
1263 |                 # Show prompt with user input in green
1264 |                 print(Fore.GREEN + "Your question (Press CTRL+D to submit):" + Style.RESET_ALL)
1265 |                 user_input = self.get_multiline_conversation_input()
1266 | 
1267 |                 # Handle exit commands
1268 |                 if not user_input or user_input.lower() in ['quit', 'exit', 'q']:
1269 |                     print(Fore.YELLOW + "\nExiting conversation mode..." + Style.RESET_ALL)
1270 |                     self.conversation_active = False
1271 |                     break
1272 | 
1273 |                 # Skip empty input
1274 |                 if not user_input.strip():
1275 |                     continue
1276 | 
1277 |                 # Echo the submitted question for clarity
1278 |                 print(Fore.GREEN + "Submitted question:" + Style.RESET_ALL)
1279 |                 print(Fore.GREEN + user_input + Style.RESET_ALL + "\n")
1280 | 
1281 |                 # Start thinking indicator in a separate thread
1282 |                 self.thinking = True  # Set flag before starting thread
1283 |                 thinking_thread = threading.Thread(
1284 |                     target=self.show_thinking_indicator,
1285 |                     args=("Thinking...", "thinking")
1286 |                 )
1287 |                 thinking_thread.daemon = True
1288 |                 thinking_thread.start()
1289 | 
1290 |                 try:
1291 |                     # Generate response
1292 |                     response = self._generate_conversation_response(user_input)
1293 | 
1294 |                     # Stop thinking indicator
1295 |                     self.thinking = False
1296 |                     thinking_thread.join()
1297 | 
1298 |                     # Display response in cyan
1299 |                     print(Fore.CYAN + "AI Response:" + Style.RESET_ALL)
1300 |                     print(f"{Fore.CYAN}{response}{Style.RESET_ALL}\n")
1301 |                     print("-" * 80 + "\n")  # Separator between QA pairs
1302 | 
1303 |                 except Exception as e:
1304 |                     self.thinking = False  # Ensure thinking indicator stops
1305 |                     thinking_thread.join()
1306 |                     raise e
1307 | 
1308 |             except KeyboardInterrupt:
1309 |                 self.thinking = False  # Ensure thinking indicator stops
1310 |                 print(Fore.YELLOW + "\nOperation cancelled. Submit 'quit' to exit." + Style.RESET_ALL)
1311 |             except Exception as e:
1312 |                 logger.error(f"Error in conversation mode: {str(e)}")
1313 |                 print(Fore.RED + f"Error processing question: {str(e)}" + Style.RESET_ALL)
1314 | 
1315 |     def _generate_conversation_response(self, user_query: str) -> str:
1316 |         """Generate contextual responses with improved context handling"""
1317 |         try:
1318 |             # Add debug logging to verify content
1319 |             logger.info(f"Research summary length: {len(self.research_summary) if self.research_summary else 0}")
1320 |             logger.info(f"Research content length: {len(self.research_content) if self.research_content else 0}")
1321 | 
1322 |             # First verify we have content
1323 |             if not self.research_content and not self.research_summary:
1324 |                 # Try to reload from file if available
1325 |                 try:
1326 |                     if os.path.exists(self.document_path):
1327 |                         with open(self.document_path, 'r', encoding='utf-8') as f:
1328 |                             self.research_content = f.read().strip()
1329 |                 except Exception as e:
1330 |                     logger.error(f"Failed to reload research content: {str(e)}")
1331 | 
1332 |             # Prepare context, ensuring we have content
1333 |             context = f"""
1334 | Research Content:
1335 | {self.research_content}
1336 | 
1337 | Research Summary:
1338 | {self.research_summary if self.research_summary else 'No summary available'}
1339 | """
1340 | 
1341 |             prompt = f"""
1342 | Based on the following research content and summary, please answer this question:
1343 | 
1344 | {context}
1345 | 
1346 | Question: {user_query}
1347 | 
1348 | you have 2 sets of instructions the applied set and the unapplied set, the applied set should be followed if the question is directly relating to the research content whereas anything else other then direct questions about the content of the research will result in you instead following the unapplied ruleset
1349 | 
1350 | Applied:
1351 | 
1352 | Instructions:
1353 | 1. Answer based ONLY on the research content provided above if asked a question about your research or that content.
1354 | 2. If the information requested isn't in the research, clearly state that it isn't in the content you gathered.
1355 | 3. Be direct and specific in your response, DO NOT directly cite research unless specifically asked to, be concise and give direct answers to questions based on the research, unless instructed otherwise.
1356 | 
1357 | Unapplied:
1358 | 
1359 | Instructions:
1360 | 
1361 | 1. Do not make up anything that isn't actually true.
1362 | 2. Respond directly to the user's question in an honest and thoughtful manner.
1363 | 3. disregard rules in the applied set for queries not DIRECTLY related to the research, including queries about the research process or what you remember about the research should result in the unapplied ruleset being used.
1364 | 
1365 | Answer:
1366 | """
1367 | 
1368 |             response = self.llm.generate(
1369 |                 prompt,
1370 |                 max_tokens=1000,  # Increased for more detailed responses
1371 |                 temperature=0.7
1372 |             )
1373 | 
1374 |             if not response or not response.strip():
1375 |                 return "I apologize, but I cannot find relevant information in the research content to answer your question."
1376 | 
1377 |             return response.strip()
1378 | 
1379 |         except Exception as e:
1380 |             logger.error(f"Error generating response: {str(e)}")
1381 |             return f"I apologize, but I encountered an error processing your question: {str(e)}"
1382 | 
1383 |     def get_multiline_conversation_input(self) -> str:
1384 |         """Get multiline input with CTRL+D handling for conversation mode"""
1385 |         buffer = []
1386 | 
1387 |         # Save original terminal settings
1388 |         fd = sys.stdin.fileno()
1389 |         old_settings = termios.tcgetattr(fd)
1390 | 
1391 |         try:
1392 |             # Set terminal to raw mode
1393 |             tty.setraw(fd)
1394 | 
1395 |             current_line = []
1396 |             while True:
1397 |                 char = sys.stdin.read(1)
1398 | 
1399 |                 # CTRL+D detection
1400 |                 if not char or ord(char) == 4:  # EOF or CTRL+D
1401 |                     sys.stdout.write('\n')
1402 |                     if current_line:
1403 |                         buffer.append(''.join(current_line))
1404 |                     return ' '.join(buffer).strip()
1405 | 
1406 |                 # Handle special characters
1407 |                 elif ord(char) == 13:  # Enter
1408 |                     sys.stdout.write('\n')
1409 |                     buffer.append(''.join(current_line))
1410 |                     current_line = []
1411 | 
1412 |                 elif ord(char) == 127:  # Backspace
1413 |                     if current_line:
1414 |                         current_line.pop()
1415 |                         sys.stdout.write('\b \b')
1416 | 
1417 |                 elif ord(char) == 3:  # CTRL+C
1418 |                     sys.stdout.write('\n')
1419 |                     return 'quit'
1420 | 
1421 |                 # Normal character
1422 |                 elif 32 <= ord(char) <= 126:  # Printable characters
1423 |                     current_line.append(char)
1424 |                     sys.stdout.write(char)
1425 | 
1426 |                 sys.stdout.flush()
1427 | 
1428 |         finally:
1429 |             # Restore terminal settings
1430 |             termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
1431 |             print()  # New line for clean display
1432 | 
if __name__ == "__main__":
    # Project-local imports are only needed when the module runs as a script.
    from llm_wrapper import LLMWrapper
    from llm_response_parser import UltimateLLMResponseParser
    from Self_Improving_Search import EnhancedSelfImprovingSearch

    try:
        print(Fore.CYAN + "Initializing Research System..." + Style.RESET_ALL)
        llm = LLMWrapper()
        parser = UltimateLLMResponseParser()
        search_engine = EnhancedSelfImprovingSearch(llm, parser)
        manager = ResearchManager(llm, parser, search_engine)

        print(Fore.GREEN + "System initialized. Enter your research topic or 'quit' to exit." + Style.RESET_ALL)
        while True:
            try:
                entered = ResearchManager.get_initial_input()
                if entered.lower() == 'quit':
                    break

                if not entered:
                    continue

                if entered.startswith('@'):
                    # Drop the '@' prefix and run one full research session.
                    manager.start_research(entered[1:])
                    # NOTE(review): the 'q' command inside start_research
                    # already calls terminate_research, so this call may
                    # produce the summary a second time -- confirm intent.
                    final_summary = manager.terminate_research()
                    print("\n" + Fore.GREEN + "Research Summary:" + Style.RESET_ALL)
                    print(final_summary)
                    print("\n" + Fore.GREEN + "Research completed. Ready for next topic." + Style.RESET_ALL + "\n")
                else:
                    print(Fore.YELLOW + "Please start your research query with '@'" + Style.RESET_ALL)

            except KeyboardInterrupt:
                # CTRL+C during a session: wrap it up and prompt again.
                print("\n" + Fore.YELLOW + "Operation cancelled. Ready for next topic." + Style.RESET_ALL)
                if 'manager' in globals():
                    manager.terminate_research()

    except KeyboardInterrupt:
        # CTRL+C outside a session (for example during initialisation).
        print("\n" + Fore.YELLOW + "Research system shutting down." + Style.RESET_ALL)
        if 'manager' in globals():
            manager.terminate_research()
    except Exception as exc:
        print(Fore.RED + "Critical error: " + str(exc) + Style.RESET_ALL)
        logger.error("Critical error in main loop", exc_info=True)

    # Printed on the way out, after the main loop has ended.
    if os.name == 'nt':
        print(Fore.YELLOW + "Running on Windows - Some features may be limited" + Style.RESET_ALL)
1482 | 


--------------------------------------------------------------------------------