├── requirements.txt ├── .gitignore ├── LICENSE ├── llm_config.py ├── web_scraper.py ├── llm_wrapper.py ├── strategic_analysis_parser.py ├── README.md ├── llm_response_parser.py ├── Web-LLM.py ├── Self_Improving_Search.py └── research_manager.py /requirements.txt: -------------------------------------------------------------------------------- 1 | llama-cpp-python 2 | duckduckgo-search 3 | colorama 4 | requests 5 | beautifulsoup4 6 | trafilatura 7 | readchar 8 | keyboard 9 | windows-curses; sys_platform == 'win32' 10 | tqdm 11 | urllib3 12 | openai>=1.0.0 13 | anthropic>=0.7.0 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | venv 3 | logs 4 | modelfile 5 | research_session_* 6 | 7 | # Python 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # Virtual Environment 13 | venv/ 14 | env/ 15 | .env 16 | 17 | # IDEs and Editors 18 | .vscode/ 19 | .idea/ 20 | *.swp 21 | *.swo 22 | 23 | # OS generated files 24 | .DS_Store 25 | Thumbs.db 26 | 27 | # Logs 28 | *.log 29 | 30 | # Model files (if they're large, you might want to exclude them) 31 | *.gguf 32 | 33 | # Distribution / packaging 34 | dist/ 35 | build/ 36 | *.egg-info/ 37 | 38 | # Jupyter Notebook 39 | .ipynb_checkpoints 40 | 41 | # Other 42 | *.bak 43 | *.tmp 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 James Warburton 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to 
whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /llm_config.py: -------------------------------------------------------------------------------- 1 | # llm_config.py 2 | 3 | LLM_TYPE = "ollama" # Options: 'ollama', 'openai', 'anthropic' 4 | 5 | # LLM settings for Ollama 6 | LLM_CONFIG_OLLAMA = { 7 | "llm_type": "ollama", 8 | "base_url": "http://localhost:11434", # default Ollama server URL 9 | "model_name": "custom-phi3-32k-Q4_K_M", # Replace with your Ollama model name 10 | "temperature": 0.7, 11 | "top_p": 0.9, 12 | "n_ctx": 55000, 13 | "stop": ["User:", "\n\n"] 14 | } 15 | 16 | # LLM settings for OpenAI 17 | # WARNING: This application makes frequent API calls during research operations. If using paid API services 18 | # (OpenAI/Anthropic), this can result in significant costs accumulating quickly - proceed with caution and 19 | # monitor your API usage carefully if it's paid. 
# LLM settings for OpenAI
# WARNING: This application makes frequent API calls during research operations. If using paid API services
# (OpenAI/Anthropic), this can result in significant costs accumulating quickly - proceed with caution and
# monitor your API usage carefully if it's paid.
LLM_CONFIG_OPENAI = {
    "llm_type": "openai",
    "api_key": "",  # Set via environment variable OPENAI_API_KEY
    "base_url": None,  # Optional: Set to use alternative OpenAI-compatible endpoints
    "model_name": "gpt-4o",  # Required: Specify the model to use
    "temperature": 0.7,
    "top_p": 0.9,
    "max_tokens": 4096,
    "stop": ["User:", "\n\n"],
    "presence_penalty": 0,
    "frequency_penalty": 0
}

# LLM settings for Anthropic
# WARNING: This application makes frequent API calls during research operations. If using paid API services
# (OpenAI/Anthropic), this can result in significant costs accumulating quickly - proceed with caution and
# monitor your API usage carefully if it's paid.
LLM_CONFIG_ANTHROPIC = {
    "llm_type": "anthropic",
    "api_key": "",  # Set via environment variable ANTHROPIC_API_KEY
    "model_name": "claude-3-5-sonnet-latest",  # Required: Specify the model to use
    "temperature": 0.7,
    "top_p": 0.9,
    "max_tokens": 4096,
    "stop": ["User:", "\n\n"]
}


def get_llm_config(llm_type=None):
    """Return the configuration dict for the selected LLM back end.

    Args:
        llm_type: Back end to look up ('llama_cpp', 'ollama', 'openai' or
            'anthropic'). Defaults to the module-level ``LLM_TYPE``.

    Returns:
        The matching module-level ``LLM_CONFIG_*`` dict (the same object, not
        a copy).

    Raises:
        ValueError: If the type is unknown, or if it is known but its
            ``LLM_CONFIG_*`` constant is not defined in this module.
    """
    selected = LLM_TYPE if llm_type is None else llm_type

    # Map to constant *names* and resolve them at call time. The previous
    # implementation referenced LLM_CONFIG_LLAMA_CPP directly, which is not
    # defined here, so selecting 'llama_cpp' crashed with NameError instead of
    # the ValueError this function otherwise raises for unusable types.
    config_names = {
        "llama_cpp": "LLM_CONFIG_LLAMA_CPP",
        "ollama": "LLM_CONFIG_OLLAMA",
        "openai": "LLM_CONFIG_OPENAI",
        "anthropic": "LLM_CONFIG_ANTHROPIC",
    }
    constant_name = config_names.get(selected)
    if constant_name is None:
        raise ValueError(f"Invalid LLM_TYPE: {selected}")

    try:
        return globals()[constant_name]
    except KeyError:
        raise ValueError(
            f"Invalid LLM_TYPE: {selected} ({constant_name} is not defined in llm_config.py)"
        ) from None
logger = logging.getLogger(__name__)


class WebScraper:
    """Fetch pages through one shared ``requests`` session and reduce them to text.

    Requests to the same domain are spaced at least ``rate_limit`` seconds
    apart, and failed requests are retried with exponential backoff.

    NOTE: ``last_request_time`` is a plain dict updated without any locking,
    while ``scrape_multiple_pages`` calls one instance from several threads;
    the per-domain spacing is therefore best-effort under concurrency.
    """

    def __init__(self, user_agent="WebLLMAssistant/1.0 (+https://github.com/YourUsername/Web-LLM-Assistant-Llama-cpp)",
                 rate_limit=1, timeout=10, max_retries=3):
        """Create the session.

        Args:
            user_agent: Value sent in the ``User-Agent`` header.
            rate_limit: Minimum seconds between two requests to one domain.
            timeout: Per-request timeout in seconds.
            max_retries: Number of attempts made for each URL.
        """
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})
        self.robot_parser = RobotFileParser()
        self.rate_limit = rate_limit
        self.timeout = timeout
        self.max_retries = max_retries
        self.last_request_time = {}  # domain -> time.time() of the latest request

    def can_fetch(self, url):
        """Return True for every URL: robots.txt checking is deliberately disabled."""
        return True  # ignore robots.txt

    def respect_rate_limit(self, url):
        """Sleep as needed so requests to ``url``'s domain stay ``rate_limit`` seconds apart."""
        domain = urlparse(url).netloc
        previous = self.last_request_time.get(domain)
        if previous is not None:
            elapsed = time.time() - previous
            if elapsed < self.rate_limit:
                time.sleep(self.rate_limit - elapsed)
        self.last_request_time[domain] = time.time()

    def scrape_page(self, url):
        """Download ``url`` and return its extracted content.

        Returns:
            The dict produced by ``extract_content``, or ``None`` when the
            page is disallowed or every attempt failed.
        """
        if not self.can_fetch(url):
            logger.info(f"Robots.txt disallows scraping: {url}")
            return None

        for attempt in range(self.max_retries):
            try:
                self.respect_rate_limit(url)
                response = self.session.get(url, timeout=self.timeout)
                response.raise_for_status()
                return self.extract_content(response.text, url)
            except requests.RequestException as e:
                logger.warning(f"Error scraping {url} (attempt {attempt + 1}/{self.max_retries}): {e}")
                if attempt == self.max_retries - 1:
                    logger.error(f"Failed to scrape {url} after {self.max_retries} attempts")
                    return None
                time.sleep(2 ** attempt)  # Exponential backoff

        # Reached only when max_retries <= 0; make the "nothing scraped" result explicit.
        return None

    @staticmethod
    def _resolve_links(base_url, hrefs, limit=10):
        """Resolve ``hrefs`` against ``base_url``, skipping ones urljoin rejects.

        ``urljoin`` raises ValueError for some malformed targets (for example
        an unbalanced ``[`` in the host); one bad anchor must not discard the
        whole page, so such links are dropped.
        """
        resolved = []
        for href in hrefs:
            try:
                resolved.append(urljoin(base_url, href))
            except ValueError:
                continue
        return resolved[:limit]

    def extract_content(self, html, url):
        """Reduce ``html`` to a dict with keys ``url``, ``title``, ``content`` and ``links``.

        ``content`` is capped at 2400 characters and ``links`` at 10 entries.
        """
        soup = BeautifulSoup(html, 'html.parser')

        # Remove unwanted elements
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        # ``.string`` is None when <title> is empty or has child tags; always
        # hand back a plain str.
        title = str(soup.title.string or "") if soup.title else ""

        # Prefer a dedicated content container, falling back to the whole page
        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
        paragraphs = (main_content or soup).find_all('p')

        # Extract text from paragraphs
        text = ' '.join(p.get_text().strip() for p in paragraphs)

        # If no paragraphs found, get all text
        if not text:
            text = soup.get_text()

        # Clean up whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        # Extract and resolve links
        links = self._resolve_links(url, (a['href'] for a in soup.find_all('a', href=True)))

        return {
            "url": url,
            "title": title,
            "content": text[:2400],  # Limit to first 2400 characters
            "links": links  # Already limited to the first 10 links
        }


def scrape_multiple_pages(urls, max_workers=5):
    """Scrape ``urls`` concurrently with one shared ``WebScraper``.

    Returns:
        Dict mapping each successfully scraped URL to its extracted content;
        failures are logged and omitted.
    """
    scraper = WebScraper()
    results = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {executor.submit(scraper.scrape_page, url): url for url in urls}
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                data = future.result()
                if data:
                    results[url] = data
                    logger.info(f"Successfully scraped: {url}")
                else:
                    logger.warning(f"Failed to scrape: {url}")
            except Exception as exc:
                logger.error(f"{url} generated an exception: {exc}")

    return results
class LLMWrapper:
    """Single ``generate(prompt)`` front end over llama.cpp, Ollama, OpenAI and Anthropic.

    The back end is chosen by the ``llm_type`` entry of the dict returned by
    ``get_llm_config()``; per-call keyword arguments override the configured
    sampling settings.
    """

    def __init__(self):
        """Load the configuration and initialise the selected back end.

        Raises:
            ValueError: If ``llm_type`` is not one of the supported values, or
                a required API key / model name is missing.
        """
        self.llm_config = get_llm_config()
        self.llm_type = self.llm_config.get('llm_type', 'llama_cpp')

        if self.llm_type == 'llama_cpp':
            self.llm = self._initialize_llama_cpp()
        elif self.llm_type == 'ollama':
            self.base_url = self.llm_config.get('base_url', 'http://localhost:11434')
            self.model_name = self.llm_config.get('model_name', 'your_model_name')
        elif self.llm_type == 'openai':
            self._initialize_openai()
        elif self.llm_type == 'anthropic':
            self._initialize_anthropic()
        else:
            raise ValueError(f"Unsupported LLM type: {self.llm_type}")

    def _initialize_llama_cpp(self):
        """Build the in-process ``Llama`` model from the configured path and sizes."""
        return Llama(
            model_path=self.llm_config.get('model_path'),
            n_ctx=self.llm_config.get('n_ctx', 55000),
            n_gpu_layers=self.llm_config.get('n_gpu_layers', 0),
            n_threads=self.llm_config.get('n_threads', 8),
            verbose=False
        )

    def _initialize_openai(self):
        """Create the OpenAI client; the environment variable wins over the config key."""
        api_key = os.getenv('OPENAI_API_KEY') or self.llm_config.get('api_key')
        if not api_key:
            raise ValueError("OpenAI API key not found. Set OPENAI_API_KEY environment variable.")

        base_url = self.llm_config.get('base_url')
        model_name = self.llm_config.get('model_name')

        if not model_name:
            raise ValueError("OpenAI model name not specified in config")

        client_kwargs = {'api_key': api_key}
        if base_url:
            client_kwargs['base_url'] = base_url

        self.client = OpenAI(**client_kwargs)
        self.model_name = model_name

    def _initialize_anthropic(self):
        """Create the Anthropic client; the environment variable wins over the config key."""
        api_key = os.getenv('ANTHROPIC_API_KEY') or self.llm_config.get('api_key')
        if not api_key:
            raise ValueError("Anthropic API key not found. Set ANTHROPIC_API_KEY environment variable.")

        model_name = self.llm_config.get('model_name')
        if not model_name:
            raise ValueError("Anthropic model name not specified in config")

        self.client = Anthropic(api_key=api_key)
        self.model_name = model_name

    def generate(self, prompt, **kwargs):
        """Return the model's completion for ``prompt`` as a stripped string.

        Args:
            prompt: Text sent to the model.
            **kwargs: Optional overrides (``temperature``, ``top_p``,
                ``max_tokens``, ``stop``) for the configured values.

        Raises:
            ValueError: If ``self.llm_type`` is not supported.
            Exception: If the back-end request fails.
        """
        if self.llm_type == 'llama_cpp':
            llama_kwargs = self._prepare_llama_kwargs(kwargs)
            response = self.llm(prompt, **llama_kwargs)
            return response['choices'][0]['text'].strip()
        elif self.llm_type == 'ollama':
            return self._ollama_generate(prompt, **kwargs)
        elif self.llm_type == 'openai':
            return self._openai_generate(prompt, **kwargs)
        elif self.llm_type == 'anthropic':
            return self._anthropic_generate(prompt, **kwargs)
        else:
            raise ValueError(f"Unsupported LLM type: {self.llm_type}")

    @staticmethod
    def _collect_ollama_stream(lines):
        """Join the ``response`` fragments of a streamed reply.

        Args:
            lines: Iterable of raw JSON lines (``bytes`` or ``str``); empty
                keep-alive lines are ignored.

        Returns:
            The concatenated text, stripped.

        Raises:
            Exception: If a line carries an ``error`` field.
        """
        pieces = []
        for line in lines:
            if not line:
                continue
            chunk = json.loads(line)
            if chunk.get('error'):
                raise Exception(f"Ollama API request failed: {chunk['error']}")
            # A line without 'response' used to raise KeyError; treat it as
            # contributing no text instead.
            pieces.append(chunk.get('response') or '')
        return ''.join(pieces).strip()

    def _ollama_generate(self, prompt, **kwargs):
        """POST ``prompt`` to ``{base_url}/api/generate`` and return the streamed text."""
        url = f"{self.base_url}/api/generate"
        data = {
            'model': self.model_name,
            'prompt': prompt,
            'options': {
                'temperature': kwargs.get('temperature', self.llm_config.get('temperature', 0.7)),
                'top_p': kwargs.get('top_p', self.llm_config.get('top_p', 0.9)),
                'stop': kwargs.get('stop', self.llm_config.get('stop', [])),
                'num_predict': kwargs.get('max_tokens', self.llm_config.get('max_tokens', 55000)),
                'num_ctx': self.llm_config.get('n_ctx', 55000)
            }
        }
        # Connect timeout only: fail fast when the server is unreachable, but
        # never cut off a slow generation. The ``with`` releases the streamed
        # connection, which was previously left open.
        with requests.post(url, json=data, stream=True, timeout=(10, None)) as response:
            if response.status_code != 200:
                raise Exception(f"Ollama API request failed with status {response.status_code}: {response.text}")
            return self._collect_ollama_stream(response.iter_lines())

    def _openai_generate(self, prompt, **kwargs):
        """Send ``prompt`` as a single user message to the chat completions API."""
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": prompt}],
                temperature=kwargs.get('temperature', self.llm_config.get('temperature', 0.7)),
                top_p=kwargs.get('top_p', self.llm_config.get('top_p', 0.9)),
                max_tokens=kwargs.get('max_tokens', self.llm_config.get('max_tokens', 4096)),
                stop=kwargs.get('stop', self.llm_config.get('stop', [])),
                presence_penalty=self.llm_config.get('presence_penalty', 0),
                frequency_penalty=self.llm_config.get('frequency_penalty', 0)
            )
            # ``content`` may be None; return "" rather than failing on .strip()
            return (response.choices[0].message.content or "").strip()
        except Exception as e:
            raise Exception(f"OpenAI API request failed: {str(e)}") from e

    def _anthropic_generate(self, prompt, **kwargs):
        """Send ``prompt`` as a single user message to the Anthropic messages API.

        Note: the configured ``stop`` sequences are not forwarded to this
        back end.
        """
        try:
            response = self.client.messages.create(
                model=self.model_name,
                max_tokens=kwargs.get('max_tokens', self.llm_config.get('max_tokens', 4096)),
                temperature=kwargs.get('temperature', self.llm_config.get('temperature', 0.7)),
                top_p=kwargs.get('top_p', self.llm_config.get('top_p', 0.9)),
                messages=[{
                    "role": "user",
                    "content": prompt
                }]
            )
            return (response.content[0].text or "").strip()
        except Exception as e:
            raise Exception(f"Anthropic API request failed: {str(e)}") from e

    def _cleanup(self):
        """Force terminate any running LLM processes (best effort, Ollama only)."""
        if self.llm_type != 'ollama':
            return

        import subprocess

        try:
            # NOTE(review): /api/terminate does not appear in Ollama's published
            # API reference - confirm the endpoint exists. The timeout keeps an
            # unresponsive server from hanging shutdown.
            requests.post(f"{self.base_url}/api/terminate", timeout=5)
        except requests.RequestException:
            pass  # best effort: server may already be gone

        try:
            # NOTE(review): ``pkill -f`` matches the full command line, so every
            # process whose arguments contain "ollama" is signalled - confirm
            # that this breadth is intended.
            subprocess.run(['pkill', '-f', 'ollama'], capture_output=True)
        except (OSError, subprocess.SubprocessError):
            pass  # best effort: pkill is unavailable on some platforms

    def _prepare_llama_kwargs(self, kwargs):
        """Merge per-call overrides with configured defaults for a llama.cpp call."""
        llama_kwargs = {
            'max_tokens': kwargs.get('max_tokens', self.llm_config.get('max_tokens', 55000)),
            'temperature': kwargs.get('temperature', self.llm_config.get('temperature', 0.7)),
            'top_p': kwargs.get('top_p', self.llm_config.get('top_p', 0.9)),
            'stop': kwargs.get('stop', self.llm_config.get('stop', [])),
            'echo': False,
        }
        return llama_kwargs
@dataclass
class ResearchFocus:
    """Represents a specific area of research focus"""
    area: str
    priority: int
    source_query: str = ""
    timestamp: str = ""
    # None (the default) is replaced by a fresh list in __post_init__
    search_queries: Optional[List[str]] = None

    def __post_init__(self):
        if not self.timestamp:
            self.timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        if self.search_queries is None:
            self.search_queries = []


@dataclass
class AnalysisResult:
    """Contains the complete analysis result"""
    original_question: str
    focus_areas: List[ResearchFocus]
    raw_response: str
    timestamp: str = ""
    confidence_score: float = 0.0

    def __post_init__(self):
        if not self.timestamp:
            self.timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")


# Set up logging
logger = logging.getLogger(__name__)


class StrategicAnalysisParser:
    """Parse an LLM "strategic analysis" reply into a question and prioritised focus areas."""

    def __init__(self):
        # Alternative headings the model may use for each section; the first
        # pattern that matches wins.
        self.patterns = {
            'original_question': [
                r"(?i)original question analysis:\s*(.*?)(?=research gap|$)",
                r"(?i)original query:\s*(.*?)(?=research gap|$)",
                r"(?i)research question:\s*(.*?)(?=research gap|$)",
                r"(?i)topic analysis:\s*(.*?)(?=research gap|$)"
            ],
            'research_gaps': [
                r"(?i)research gaps?:\s*",
                r"(?i)gaps identified:\s*",
                r"(?i)areas for research:\s*",
                r"(?i)investigation areas:\s*"
            ],
            'priority': [
                r"(?i)priority:\s*(\d+)",
                r"(?i)priority level:\s*(\d+)",
                r"(?i)\(priority:\s*(\d+)\)",
                r"(?i)importance:\s*(\d+)"
            ]
        }
        self.logger = logging.getLogger(__name__)

    def parse_analysis(self, llm_response: str) -> Optional[AnalysisResult]:
        """Parse ``llm_response`` into an ``AnalysisResult``.

        Returns:
            The parsed result, or ``None`` if any step raised (the error is
            logged). A missing question is replaced by a placeholder string
            rather than treated as a failure.
        """
        try:
            # Clean and normalize the response
            cleaned_response = self._clean_text(llm_response)

            # Extract original question with validation
            original_question = self._extract_original_question(cleaned_response)
            if not original_question:
                self.logger.warning("Failed to extract original question")
                original_question = "Original question extraction failed"

            # Extract and validate research areas
            focus_areas = self._extract_research_areas(cleaned_response)
            focus_areas = self._normalize_focus_areas(focus_areas)

            # Calculate confidence score
            confidence_score = self._calculate_confidence_score(original_question, focus_areas)

            return AnalysisResult(
                original_question=original_question,
                focus_areas=focus_areas,
                raw_response=llm_response,
                confidence_score=confidence_score
            )

        except Exception as e:
            self.logger.error(f"Error in parse_analysis: {str(e)}")
            return None

    def _clean_text(self, text: str) -> str:
        """Normalise whitespace and list markers while keeping line structure.

        Line breaks are preserved because ``_extract_research_areas`` splits
        numbered items on them. The previous ``\\s{2,}`` collapse turned blank
        lines into a single space, merging every item into one.
        """
        text = text.replace('\r\n', '\n')
        # Drop trailing spaces/tabs so "blank" lines are truly empty
        text = re.sub(r'[^\S\n]+\n', '\n', text)
        # At most one blank line in a row
        text = re.sub(r'\n{3,}', '\n\n', text)
        # Collapse runs of horizontal whitespace only (never newlines)
        text = re.sub(r'[^\S\n]{2,}', ' ', text)
        # Normalise a leading "1)" marker to "1."; limited to line starts so
        # text such as "(priority: 3)" is left untouched
        text = re.sub(r'(?m)^([^\S\n]*\d+)\)', r'\1.', text)
        return text.strip()

    def _extract_original_question(self, text: str) -> str:
        """Return the text after the first matching question heading, or ""."""
        for pattern in self.patterns['original_question']:
            match = re.search(pattern, text, re.DOTALL)
            if match:
                return self._clean_text(match.group(1))
        return ""

    def _extract_research_areas(self, text: str) -> List[ResearchFocus]:
        """Split the section after the first matching gaps heading into focus areas."""
        areas = []
        for pattern in self.patterns['research_gaps']:
            gap_match = re.search(pattern, text)
            if gap_match:
                # The heading pattern's trailing \s* swallows the newline in
                # front of the first item; restore it so "1." is recognised as
                # a marker instead of staying glued to the first area's name.
                remainder = '\n' + text[gap_match.end():]
                sections = re.split(r'\n\s*\d+[\.)]\s+', remainder)
                sections = [s for s in sections if s.strip()]

                for section in sections:
                    focus = self._parse_research_focus(section)
                    if focus and self._is_valid_focus(focus):
                        areas.append(focus)
                break
        return areas

    def _parse_research_focus(self, text: str) -> Optional[ResearchFocus]:
        """Build a ``ResearchFocus`` from one list item: first line is the area."""
        try:
            # Extract area
            area = text.split('\n')[0].strip()

            # Extract and validate priority
            priority = self._extract_priority(text)

            # Return ResearchFocus without reasoning
            return ResearchFocus(
                area=area,
                priority=priority
            )

        except Exception as e:
            self.logger.error(f"Error parsing research focus: {str(e)}")
            return None

    def _extract_priority(self, text: str) -> int:
        """Return the first priority found, clamped to 1-5; default is 3."""
        for pattern in self.patterns['priority']:
            priority_match = re.search(pattern, text)
            if priority_match:
                try:
                    priority = int(priority_match.group(1))
                    return max(1, min(5, priority))
                except ValueError:
                    continue
        return 3  # Default priority

    def _is_valid_focus(self, focus: ResearchFocus) -> bool:
        """Return True when the focus has a non-empty area and a priority in 1-5."""
        if not focus.area:  # Only check if area exists and isn't empty
            return False
        if focus.priority < 1 or focus.priority > 5:
            return False
        return True

    def _normalize_focus_areas(self, areas: List[ResearchFocus]) -> List[ResearchFocus]:
        """Drop blank areas, clamp priorities in place, and sort highest priority first."""
        normalized = []
        for area in areas:
            if not area.area.strip():
                continue

            area.priority = max(1, min(5, area.priority))

            if self._is_valid_focus(area):
                normalized.append(area)

        # Sort by priority (highest first) but don't add any filler areas
        normalized.sort(key=lambda x: x.priority, reverse=True)

        return normalized

    def _calculate_confidence_score(self, question: str, areas: List[ResearchFocus]) -> float:
        """Score parse quality in [0, 1].

        0.3 for a question of at least three words, 0.35 scaled by the share
        of valid areas, and 0.35 scaled by the share of distinct priorities.
        """
        score = 0.0

        # Question quality (0.3)
        if question and len(question.split()) >= 3:
            score += 0.3

        # Areas quality (0.7); an empty list contributes nothing
        if areas:
            num_areas = len(areas)

            # Valid areas ratio (0.35)
            valid_areas = sum(1 for a in areas if self._is_valid_focus(a))
            score += 0.35 * (valid_areas / num_areas)

            # Priority distribution (0.35) - rewards having different priorities
            unique_priorities = len(set(a.priority for a in areas))
            score += 0.35 * (unique_priorities / num_areas)

        return round(score, 2)

    def format_analysis_result(self, result: AnalysisResult) -> str:
        """Format analysis result for display without reasoning."""
        formatted = [
            "Strategic Analysis Result",
            "=" * 80,
            f"\nOriginal Question Analysis:\n{result.original_question}\n",
            f"Analysis Confidence Score: {result.confidence_score}",
            "\nResearch Focus Areas:"
        ]

        for i, focus in enumerate(result.focus_areas, 1):
            formatted.extend([
                f"\n{i}. {focus.area}",
                f"   Priority: {focus.priority}"
            ])

        return "\n".join(formatted)
Starting with the highest priority area, the LLM: 15 | - Formulates targeted search queries 16 | - Performs web searches 17 | - Analyzes search results, selecting the most relevant web pages 18 | - Scrapes and extracts relevant information from the selected web pages 19 | - Documents all content found during the research session into a research text file, including links to the websites that the content was retrieved from 20 | 4. After investigating all focus areas, the LLM generates new focus areas based on the information found and repeats its research cycle, often discovering new relevant focus areas based on previous findings, leading to interesting and novel research focuses in some cases. 21 | 5. You can let it research as long as you like, with the ability to input a quit command at any time. This will stop the research and cause the LLM to review all the content collected so far in full, generating a comprehensive summary in response to your original query or topic. 22 | 6. The LLM will then enter a conversation mode where you can ask specific questions about the research findings if desired. 23 | 24 | The key distinction is that this isn't just a chatbot—it's an automated research assistant that methodically investigates topics and maintains a documented research trail, all from a single question or topic of your choosing. Depending on your system and model, it can perform over a hundred searches and content retrievals in a relatively short amount of time. You can leave it running and return to a full text document with over a hundred pieces of content from relevant websites and then have it summarize the findings, after which you can ask it questions about what it found. 
25 | 26 | ## Features 27 | - Automated research planning with prioritized focus areas 28 | - Systematic web searching and content analysis 29 | - All research content and source URLs saved into a detailed text document 30 | - Research summary generation 31 | - Post-research Q&A capability about findings 32 | - Self-improving search mechanism 33 | - Rich console output with status indicators 34 | - Comprehensive answer synthesis using web-sourced information 35 | - Research conversation mode for exploring findings 36 | 37 | ## Installation 38 | **Note:** To use on Windows, follow the instructions on the [/feature/windows-support](https://github.com/TheBlewish/Automated-AI-Web-Researcher-Ollama/tree/feature/windows-support) branch. For Linux and macOS, use this main branch and follow the steps below: 39 | 40 | 1. **Clone the repository:** 41 | 42 | ```sh 43 | git clone https://github.com/TheBlewish/Automated-AI-Web-Researcher-Ollama 44 | cd Automated-AI-Web-Researcher-Ollama 45 | ``` 46 | 47 | 2. **Create and activate a virtual environment:** 48 | 49 | ```sh 50 | python -m venv venv 51 | source venv/bin/activate 52 | ``` 53 | 54 | 3. **Install dependencies:** 55 | 56 | ```sh 57 | pip install -r requirements.txt 58 | ``` 59 | 60 | 4. **Install and configure Ollama:** 61 | 62 | Install Ollama following the instructions at [https://ollama.ai](https://ollama.ai). 63 | 64 | Choose the model you want to use. It is recommended to pick one with the context length required for a large number of searches (`phi3:3.8b-mini-128k-instruct` or `phi3:14b-medium-128k-instruct` are recommended). 65 | 66 | 5. 
Open the `llm_config.py` file, which should have an Ollama section that looks like this: 67 | 68 | ```python 69 | LLM_CONFIG_OLLAMA = { 70 | "llm_type": "ollama", 71 | "base_url": "http://localhost:11434", # default Ollama server URL 72 | "model_name": "custom-phi3-32k-Q4_K_M", # Replace with your Ollama model name 73 | "temperature": 0.7, 74 | "top_p": 0.9, 75 | "n_ctx": 55000, 76 | "stop": ["User:", "\n\n"] 77 | ``` 78 | 79 | Then change the value of the "model_name" entry (the one marked `# Replace with your Ollama model name`) to the name of the model you have set up in Ollama for use with the program. You can also change "n_ctx" to set the desired context size. 80 | 81 | 82 | ## Usage 83 | 1. **Start Ollama:** 84 | 85 | ```sh 86 | ollama serve 87 | ``` 88 | 89 | 2. **Run the researcher:** 90 | 91 | ```sh 92 | python Web-LLM.py 93 | ``` 94 | 95 | 3. **Start a research session:** 96 | - Type `@` followed by your research query. 97 | - Press `CTRL+D` to submit. 98 | - Example: `@What year is the global population projected to start declining?` 99 | 100 | 4. **During research, you can use the following commands by typing the associated letter and submitting with `CTRL+D`:** 101 | - Use `s` to show status. 102 | - Use `f` to show the current focus. 103 | - Use `p` to pause and assess research progress, which will give you an assessment from the LLM after reviewing the entire research content to determine whether it can answer your query with the content collected so far. It will then wait for you to input one of two commands: `c` to continue with the research or `q` to terminate it, resulting in a summary as if you had terminated it without using the pause feature. 104 | - Use `q` to quit research. 105 | 106 | 5. **After the research completes:** 107 | - Wait for the summary to be generated and review the LLM's findings. 108 | - Enter conversation mode to ask specific questions about its findings. 
109 | - Access the detailed research content found, available in a research session text file which will be located in the program's directory. This includes: 110 | - All retrieved content 111 | - Source URLs for all of the information 112 | - Focus areas investigated 113 | - Generated summary 114 | 115 | ## Configuration 116 | The LLM settings can be modified in `llm_config.py`. You must specify your model name in the configuration for the researcher to function. The default configuration is optimized for research tasks with the specified Phi-3 model. 117 | 118 | ## Current Status 119 | This is a prototype that demonstrates functional automated research capabilities. While still in development, it successfully performs structured research tasks. It has been tested and works well with the `phi3:3.8b-mini-128k-instruct` model when the context is set as advised previously. 120 | 121 | ## Dependencies 122 | - Ollama 123 | - Python packages listed in `requirements.txt` 124 | - Recommended models: `phi3:3.8b-mini-128k-instruct` or `phi3:14b-medium-128k-instruct` (with custom context length as specified) 125 | 126 | ## Contributing 127 | Contributions are welcome! This is a prototype with room for improvements and new features. 128 | 129 | ## License 130 | This project is licensed under the MIT License—see the [LICENSE](LICENSE) file for details. 131 | 132 | ## Acknowledgments 133 | - Ollama team for their local LLM runtime 134 | - DuckDuckGo for their search API 135 | 136 | ## Personal Note 137 | This tool represents an attempt to bridge the gap between simple LLM interactions and genuine research capabilities. By structuring the research process and maintaining documentation, it aims to provide more thorough and verifiable results than traditional LLM conversations. It also represents an attempt to improve on my previous project, 'Web-LLM-Assistant-Llamacpp-Ollama,' which simply gave LLMs the ability to search and scrape websites to answer questions. 
Unlike its predecessor, I feel this program takes that capability and uses it in a novel and very useful way. As a very new programmer, with this being my second ever program, I feel very good about the result. I hope that it hits the mark! 138 | 139 | Given how much I have been using it myself, unlike the previous program, which felt more like a novelty than an actual tool, this is actually quite useful and unique—but I am quite biased! 140 | 141 | Please enjoy! And feel free to submit any suggestions for improvements so that we can make this automated AI researcher even more capable. 142 | 143 | ## Disclaimer 144 | This project is for educational purposes only. Ensure you comply with the terms of service of all APIs and services used. 145 | -------------------------------------------------------------------------------- /llm_response_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Dict, List, Union, Optional 3 | import logging 4 | import json 5 | from strategic_analysis_parser import StrategicAnalysisParser, AnalysisResult, ResearchFocus 6 | 7 | # Set up logging 8 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 9 | logger = logging.getLogger(__name__) 10 | 11 | class UltimateLLMResponseParser: 12 | def __init__(self): 13 | self.decision_keywords = { 14 | 'refine': ['refine', 'need more info', 'insufficient', 'unclear', 'more research', 'additional search'], 15 | 'answer': ['answer', 'sufficient', 'enough info', 'can respond', 'adequate', 'comprehensive'] 16 | } 17 | self.section_identifiers = [ 18 | ('decision', r'(?i)decision\s*:'), 19 | ('reasoning', r'(?i)reasoning\s*:'), 20 | ('selected_results', r'(?i)selected results\s*:'), 21 | ('response', r'(?i)response\s*:') 22 | ] 23 | # Initialize strategic analysis parser 24 | self.strategic_parser = StrategicAnalysisParser() 25 | 26 | def parse_llm_response(self, response: str, mode: str = 
'search') -> Dict[str, Union[str, List[int], AnalysisResult]]: 27 | """ 28 | Parse LLM response based on mode 29 | 30 | Args: 31 | response (str): The LLM's response text 32 | mode (str): 'search' for web search, 'research' for strategic analysis 33 | 34 | Returns: 35 | Dict containing parsed response 36 | """ 37 | logger.info(f"Starting to parse LLM response in {mode} mode") 38 | 39 | if mode == 'research': 40 | return self._parse_research_response(response) 41 | 42 | # Original search mode parsing 43 | result = { 44 | 'decision': None, 45 | 'reasoning': None, 46 | 'selected_results': [], 47 | 'response': None 48 | } 49 | 50 | parsing_strategies = [ 51 | self._parse_structured_response, 52 | self._parse_json_response, 53 | self._parse_unstructured_response, 54 | self._parse_implicit_response 55 | ] 56 | 57 | for strategy in parsing_strategies: 58 | try: 59 | parsed_result = strategy(response) 60 | if self._is_valid_result(parsed_result): 61 | result.update(parsed_result) 62 | logger.info(f"Successfully parsed using strategy: {strategy.__name__}") 63 | break 64 | except Exception as e: 65 | logger.warning(f"Error in parsing strategy {strategy.__name__}: {str(e)}") 66 | 67 | if not self._is_valid_result(result): 68 | logger.warning("All parsing strategies failed. 
Using fallback parsing.") 69 | result = self._fallback_parsing(response) 70 | 71 | result = self._post_process_result(result) 72 | 73 | logger.info("Finished parsing LLM response") 74 | return result 75 | 76 | def _parse_research_response(self, response: str) -> Dict[str, Union[str, AnalysisResult]]: 77 | """Handle research mode specific parsing""" 78 | try: 79 | analysis_result = self.strategic_parser.parse_analysis(response) 80 | if analysis_result: 81 | return { 82 | 'mode': 'research', 83 | 'analysis_result': analysis_result, 84 | 'error': None 85 | } 86 | else: 87 | logger.error("Failed to parse strategic analysis") 88 | return { 89 | 'mode': 'research', 90 | 'analysis_result': None, 91 | 'error': 'Failed to parse strategic analysis' 92 | } 93 | except Exception as e: 94 | logger.error(f"Error in research response parsing: {str(e)}") 95 | return { 96 | 'mode': 'research', 97 | 'analysis_result': None, 98 | 'error': str(e) 99 | } 100 | 101 | def parse_search_query(self, query_response: str) -> Dict[str, str]: 102 | """Parse search query formulation response""" 103 | try: 104 | lines = query_response.strip().split('\n') 105 | result = { 106 | 'query': '', 107 | 'time_range': 'none' 108 | } 109 | 110 | for line in lines: 111 | if ':' in line: 112 | key, value = line.split(':', 1) 113 | key = key.strip().lower() 114 | value = value.strip() 115 | 116 | if 'query' in key: 117 | result['query'] = self._clean_query(value) 118 | elif 'time' in key or 'range' in key: 119 | result['time_range'] = self._validate_time_range(value) 120 | 121 | return result 122 | except Exception as e: 123 | logger.error(f"Error parsing search query: {str(e)}") 124 | return {'query': '', 'time_range': 'none'} 125 | 126 | def _parse_structured_response(self, response: str) -> Dict[str, Union[str, List[int]]]: 127 | result = {} 128 | for key, pattern in self.section_identifiers: 129 | match = re.search(f'{pattern}(.*?)(?={"|".join([p for k, p in self.section_identifiers if k != key])}|$)', 
130 | response, re.IGNORECASE | re.DOTALL) 131 | if match: 132 | result[key] = match.group(1).strip() 133 | 134 | if 'selected_results' in result: 135 | result['selected_results'] = self._extract_numbers(result['selected_results']) 136 | 137 | return result 138 | 139 | def _parse_json_response(self, response: str) -> Dict[str, Union[str, List[int]]]: 140 | try: 141 | json_match = re.search(r'\{.*\}', response, re.DOTALL) 142 | if json_match: 143 | json_str = json_match.group(0) 144 | parsed_json = json.loads(json_str) 145 | return {k: v for k, v in parsed_json.items() 146 | if k in ['decision', 'reasoning', 'selected_results', 'response']} 147 | except json.JSONDecodeError: 148 | pass 149 | return {} 150 | 151 | def _parse_unstructured_response(self, response: str) -> Dict[str, Union[str, List[int]]]: 152 | result = {} 153 | lines = response.split('\n') 154 | current_section = None 155 | 156 | for line in lines: 157 | section_match = re.match(r'(.+?)[:.-](.+)', line) 158 | if section_match: 159 | key = self._match_section_to_key(section_match.group(1)) 160 | if key: 161 | current_section = key 162 | result[key] = section_match.group(2).strip() 163 | elif current_section: 164 | result[current_section] += ' ' + line.strip() 165 | 166 | if 'selected_results' in result: 167 | result['selected_results'] = self._extract_numbers(result['selected_results']) 168 | 169 | return result 170 | 171 | def _parse_implicit_response(self, response: str) -> Dict[str, Union[str, List[int]]]: 172 | result = {} 173 | 174 | decision = self._infer_decision(response) 175 | if decision: 176 | result['decision'] = decision 177 | 178 | numbers = self._extract_numbers(response) 179 | if numbers: 180 | result['selected_results'] = numbers 181 | 182 | if not result: 183 | result['response'] = response.strip() 184 | 185 | return result 186 | 187 | def _fallback_parsing(self, response: str) -> Dict[str, Union[str, List[int]]]: 188 | return { 189 | 'decision': self._infer_decision(response), 190 | 
'reasoning': None, 191 | 'selected_results': self._extract_numbers(response), 192 | 'response': response.strip() 193 | } 194 | 195 | def _post_process_result(self, result: Dict[str, Union[str, List[int]]]) -> Dict[str, Union[str, List[int]]]: 196 | if result['decision'] not in ['refine', 'answer']: 197 | result['decision'] = self._infer_decision(str(result)) 198 | 199 | if not isinstance(result['selected_results'], list): 200 | result['selected_results'] = self._extract_numbers(str(result['selected_results'])) 201 | 202 | result['selected_results'] = result['selected_results'][:2] 203 | 204 | if not result['reasoning']: 205 | result['reasoning'] = f"Based on the {'presence' if result['selected_results'] else 'absence'} of selected results and the overall content." 206 | 207 | if not result['response']: 208 | result['response'] = result.get('reasoning', 'No clear response found.') 209 | 210 | return result 211 | 212 | def _match_section_to_key(self, section: str) -> Optional[str]: 213 | for key, pattern in self.section_identifiers: 214 | if re.search(pattern, section, re.IGNORECASE): 215 | return key 216 | return None 217 | 218 | def _extract_numbers(self, text: str) -> List[int]: 219 | return [int(num) for num in re.findall(r'\b(?:10|[1-9])\b', text)] 220 | 221 | def _infer_decision(self, text: str) -> str: 222 | text = text.lower() 223 | refine_score = sum(text.count(keyword) for keyword in self.decision_keywords['refine']) 224 | answer_score = sum(text.count(keyword) for keyword in self.decision_keywords['answer']) 225 | return 'refine' if refine_score > answer_score else 'answer' 226 | 227 | def _is_valid_result(self, result: Dict[str, Union[str, List[int]]]) -> bool: 228 | return bool(result.get('decision') or result.get('response') or result.get('selected_results')) 229 | 230 | def _clean_query(self, query: str) -> str: 231 | """Clean and validate search query""" 232 | query = re.sub(r'["\'\[\]]', '', query) 233 | query = re.sub(r'\s+', ' ', query) 234 | 
return query.strip()[:100] 235 | 236 | def _validate_time_range(self, time_range: str) -> str: 237 | """Validate time range value""" 238 | valid_ranges = ['d', 'w', 'm', 'y', 'none'] 239 | time_range = time_range.lower() 240 | return time_range if time_range in valid_ranges else 'none' 241 | -------------------------------------------------------------------------------- /Web-LLM.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from colorama import init, Fore, Style 4 | import logging 5 | import time 6 | from io import StringIO 7 | from Self_Improving_Search import EnhancedSelfImprovingSearch 8 | from llm_config import get_llm_config 9 | from llm_response_parser import UltimateLLMResponseParser 10 | from llm_wrapper import LLMWrapper 11 | from strategic_analysis_parser import StrategicAnalysisParser 12 | from research_manager import ResearchManager 13 | 14 | # Initialize colorama 15 | if os.name == 'nt': # Windows-specific initialization 16 | init(convert=True, strip=False, wrap=True) 17 | else: 18 | init() 19 | 20 | # Set up logging 21 | log_directory = 'logs' 22 | if not os.path.exists(log_directory): 23 | os.makedirs(log_directory) 24 | 25 | logger = logging.getLogger(__name__) 26 | logger.setLevel(logging.INFO) 27 | log_file = os.path.join(log_directory, 'web_llm.log') 28 | file_handler = logging.FileHandler(log_file) 29 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 30 | file_handler.setFormatter(formatter) 31 | logger.handlers = [] 32 | logger.addHandler(file_handler) 33 | logger.propagate = False 34 | 35 | # Disable other loggers 36 | for name in logging.root.manager.loggerDict: 37 | if name != __name__: 38 | logging.getLogger(name).disabled = True 39 | 40 | class OutputRedirector: 41 | def __init__(self, stream=None): 42 | self.stream = stream or StringIO() 43 | self.original_stdout = sys.stdout 44 | self.original_stderr = sys.stderr 45 | 46 | def 
__enter__(self): 47 | sys.stdout = self.stream 48 | sys.stderr = self.stream 49 | return self.stream 50 | 51 | def __exit__(self, exc_type, exc_val, exc_tb): 52 | sys.stdout = self.original_stdout 53 | sys.stderr = self.original_stderr 54 | 55 | def print_header(): 56 | print(Fore.CYAN + Style.BRIGHT + """ 57 | ╔══════════════════════════════════════════════════════════╗ 58 | ║ 🌐 Advanced Research Assistant 🤖 ║ 59 | ╚══════════════════════════════════════════════════════════╝ 60 | """ + Style.RESET_ALL) 61 | print(Fore.YELLOW + """ 62 | Welcome to the Advanced Research Assistant! 63 | 64 | Usage: 65 | - Start your research query with '@' 66 | Example: "@analyze the impact of AI on healthcare" 67 | 68 | Press CTRL+D (Linux/Mac) or CTRL+Z (Windows) to submit input. 69 | """ + Style.RESET_ALL) 70 | 71 | def get_multiline_input() -> str: 72 | """Get multiline input using raw terminal mode for reliable CTRL+D handling""" 73 | print(f"{Fore.GREEN}📝 Enter your message (Press CTRL+D to submit):{Style.RESET_ALL}") 74 | lines = [] 75 | 76 | import termios 77 | import tty 78 | import sys 79 | 80 | # Save original terminal settings 81 | fd = sys.stdin.fileno() 82 | old_settings = termios.tcgetattr(fd) 83 | 84 | try: 85 | # Set terminal to raw mode 86 | tty.setraw(fd) 87 | 88 | current_line = [] 89 | while True: 90 | # Read one character at a time 91 | char = sys.stdin.read(1) 92 | 93 | # CTRL+D detection 94 | if not char or ord(char) == 4: # EOF or CTRL+D 95 | sys.stdout.write('\n') # New line for clean display 96 | if current_line: 97 | lines.append(''.join(current_line)) 98 | return ' '.join(lines).strip() 99 | 100 | # Handle special characters 101 | elif ord(char) == 13: # Enter 102 | sys.stdout.write('\n') 103 | lines.append(''.join(current_line)) 104 | current_line = [] 105 | 106 | elif ord(char) == 127: # Backspace 107 | if current_line: 108 | current_line.pop() 109 | sys.stdout.write('\b \b') # Erase character 110 | 111 | elif ord(char) == 3: # CTRL+C 112 | 
sys.stdout.write('\n') 113 | return 'q' 114 | 115 | # Normal character 116 | elif 32 <= ord(char) <= 126: # Printable characters 117 | current_line.append(char) 118 | sys.stdout.write(char) 119 | 120 | # Flush output 121 | sys.stdout.flush() 122 | 123 | finally: 124 | # Restore terminal settings 125 | termios.tcsetattr(fd, termios.TCSADRAIN, old_settings) 126 | print() # New line for clean display 127 | 128 | def initialize_system(): 129 | """Initialize system with proper error checking""" 130 | try: 131 | print(Fore.YELLOW + "Initializing system..." + Style.RESET_ALL) 132 | 133 | llm_config = get_llm_config() 134 | if llm_config['llm_type'] == 'ollama': 135 | import requests 136 | try: 137 | response = requests.get(llm_config['base_url'], timeout=5) 138 | if response.status_code != 200: 139 | raise ConnectionError("Cannot connect to Ollama server") 140 | except requests.exceptions.RequestException: 141 | raise ConnectionError( 142 | "\nCannot connect to Ollama server!" 143 | "\nPlease ensure:" 144 | "\n1. Ollama is installed" 145 | "\n2. Ollama server is running (try 'ollama serve')" 146 | "\n3. 
The model specified in llm_config.py is pulled" 147 | ) 148 | elif llm_config['llm_type'] == 'llama_cpp': 149 | model_path = llm_config.get('model_path') 150 | if not model_path or not os.path.exists(model_path): 151 | raise FileNotFoundError( 152 | f"\nLLama.cpp model not found at: {model_path}" 153 | "\nPlease ensure model path in llm_config.py is correct" 154 | ) 155 | 156 | with OutputRedirector() as output: 157 | llm_wrapper = LLMWrapper() 158 | try: 159 | test_response = llm_wrapper.generate("Test", max_tokens=10) 160 | if not test_response: 161 | raise ConnectionError("LLM failed to generate response") 162 | except Exception as e: 163 | raise ConnectionError(f"LLM test failed: {str(e)}") 164 | 165 | parser = UltimateLLMResponseParser() 166 | search_engine = EnhancedSelfImprovingSearch(llm_wrapper, parser) 167 | research_manager = ResearchManager(llm_wrapper, parser, search_engine) 168 | 169 | print(Fore.GREEN + "System initialized successfully." + Style.RESET_ALL) 170 | return llm_wrapper, parser, search_engine, research_manager 171 | except Exception as e: 172 | logger.error(f"Error initializing system: {str(e)}", exc_info=True) 173 | print(Fore.RED + f"System initialization failed: {str(e)}" + Style.RESET_ALL) 174 | return None, None, None, None 175 | 176 | def handle_research_mode(research_manager, query): 177 | """Handles research mode operations""" 178 | print(f"{Fore.CYAN}Initiating research mode...{Style.RESET_ALL}") 179 | 180 | try: 181 | # Start the research 182 | research_manager.start_research(query) 183 | 184 | submit_key = "CTRL+Z" if os.name == 'nt' else "CTRL+D" 185 | print(f"\n{Fore.YELLOW}Research Running. 
Available Commands:{Style.RESET_ALL}") 186 | print(f"Type command and press {submit_key}:") 187 | print("'s' = Show status") 188 | print("'f' = Show focus") 189 | print("'q' = Quit research") 190 | 191 | while research_manager.is_active(): 192 | try: 193 | command = get_multiline_input().strip().lower() 194 | if command == 's': 195 | print("\n" + research_manager.get_progress()) 196 | elif command == 'f': 197 | if research_manager.current_focus: 198 | print(f"\n{Fore.CYAN}Current Focus:{Style.RESET_ALL}") 199 | print(f"Area: {research_manager.current_focus.area}") 200 | print(f"Priority: {research_manager.current_focus.priority}") 201 | print(f"Reasoning: {research_manager.current_focus.reasoning}") 202 | else: 203 | print(f"\n{Fore.YELLOW}No current focus area{Style.RESET_ALL}") 204 | elif command == 'q': 205 | break 206 | except KeyboardInterrupt: 207 | break 208 | 209 | # Get final summary first 210 | summary = research_manager.terminate_research() 211 | 212 | # Ensure research UI is fully cleaned up 213 | research_manager._cleanup_research_ui() 214 | 215 | # Now in main terminal, show summary 216 | print(f"\n{Fore.GREEN}Research Summary:{Style.RESET_ALL}") 217 | print(summary) 218 | 219 | # Only NOW start conversation mode if we have a valid summary 220 | if research_manager.research_complete and research_manager.research_summary: 221 | time.sleep(0.5) # Small delay to ensure clean transition 222 | research_manager.start_conversation_mode() 223 | 224 | return 225 | 226 | except KeyboardInterrupt: 227 | print(f"\n{Fore.YELLOW}Research interrupted.{Style.RESET_ALL}") 228 | research_manager.terminate_research() 229 | except Exception as e: 230 | print(f"\n{Fore.RED}Research error: {str(e)}{Style.RESET_ALL}") 231 | research_manager.terminate_research() 232 | 233 | def main(): 234 | print_header() 235 | try: 236 | llm, parser, search_engine, research_manager = initialize_system() 237 | if not all([llm, parser, search_engine, research_manager]): 238 | return 239 | 
240 | while True: 241 | try: 242 | # Get input with improved CTRL+D handling 243 | user_input = get_multiline_input() 244 | 245 | # Handle immediate CTRL+D (empty input) 246 | if user_input == "": 247 | user_input = "@quit" # Convert empty CTRL+D to quit command 248 | 249 | user_input = user_input.strip() 250 | 251 | # Check for special quit markers 252 | if user_input in ["@quit", "quit", "q"]: 253 | print(Fore.YELLOW + "\nGoodbye!" + Style.RESET_ALL) 254 | break 255 | 256 | if not user_input: 257 | continue 258 | 259 | if user_input.lower() == 'help': 260 | print_header() 261 | continue 262 | 263 | if user_input.startswith('/'): 264 | search_query = user_input[1:].strip() 265 | handle_search_mode(search_engine, search_query) 266 | 267 | elif user_input.startswith('@'): 268 | research_query = user_input[1:].strip() 269 | handle_research_mode(research_manager, research_query) 270 | 271 | else: 272 | print(f"{Fore.RED}Please start with '/' for search or '@' for research.{Style.RESET_ALL}") 273 | 274 | except KeyboardInterrupt: 275 | print(f"\n{Fore.YELLOW}Exiting program...{Style.RESET_ALL}") 276 | break 277 | 278 | except Exception as e: 279 | logger.error(f"Error in main loop: {str(e)}") 280 | print(f"{Fore.RED}An error occurred: {str(e)}{Style.RESET_ALL}") 281 | continue 282 | 283 | except KeyboardInterrupt: 284 | print(f"\n{Fore.YELLOW}Program terminated by user.{Style.RESET_ALL}") 285 | 286 | except Exception as e: 287 | logger.critical(f"Critical error: {str(e)}") 288 | print(f"{Fore.RED}Critical error: {str(e)}{Style.RESET_ALL}") 289 | 290 | finally: 291 | # Ensure proper cleanup on exit 292 | try: 293 | if 'research_manager' in locals() and research_manager: 294 | if hasattr(research_manager, 'ui'): 295 | research_manager.ui.cleanup() 296 | curses.endwin() 297 | except: 298 | pass 299 | os._exit(0) 300 | 301 | if __name__ == "__main__": 302 | main() 303 | -------------------------------------------------------------------------------- 
/Self_Improving_Search.py: -------------------------------------------------------------------------------- 1 | import time 2 | import re 3 | import os 4 | from typing import List, Dict, Tuple, Union 5 | from colorama import Fore, Style 6 | import logging 7 | import sys 8 | from io import StringIO 9 | from web_scraper import get_web_content, can_fetch 10 | from llm_config import get_llm_config 11 | from llm_response_parser import UltimateLLMResponseParser 12 | from llm_wrapper import LLMWrapper 13 | from urllib.parse import urlparse 14 | 15 | # Set up logging 16 | log_directory = 'logs' 17 | if not os.path.exists(log_directory): 18 | os.makedirs(log_directory) 19 | 20 | # Configure logger 21 | logger = logging.getLogger(__name__) 22 | logger.setLevel(logging.INFO) 23 | log_file = os.path.join(log_directory, 'llama_output.log') 24 | file_handler = logging.FileHandler(log_file) 25 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 26 | file_handler.setFormatter(formatter) 27 | logger.handlers = [] 28 | logger.addHandler(file_handler) 29 | logger.propagate = False 30 | 31 | # Suppress other loggers 32 | for name in ['root', 'duckduckgo_search', 'requests', 'urllib3']: 33 | logging.getLogger(name).setLevel(logging.WARNING) 34 | logging.getLogger(name).handlers = [] 35 | logging.getLogger(name).propagate = False 36 | 37 | class OutputRedirector: 38 | def __init__(self, stream=None): 39 | self.stream = stream or StringIO() 40 | self.original_stdout = sys.stdout 41 | self.original_stderr = sys.stderr 42 | 43 | def __enter__(self): 44 | sys.stdout = self.stream 45 | sys.stderr = self.stream 46 | return self.stream 47 | 48 | def __exit__(self, exc_type, exc_val, exc_tb): 49 | sys.stdout = self.original_stdout 50 | sys.stderr = self.original_stderr 51 | 52 | class EnhancedSelfImprovingSearch: 53 | def __init__(self, llm: LLMWrapper, parser: UltimateLLMResponseParser, max_attempts: int = 5): 54 | self.llm = llm 55 | self.parser = parser 56 | 
self.max_attempts = max_attempts 57 | self.llm_config = get_llm_config() 58 | 59 | @staticmethod 60 | def initialize_llm(): 61 | llm_wrapper = LLMWrapper() 62 | return llm_wrapper 63 | 64 | def print_thinking(self): 65 | print(Fore.MAGENTA + "🧠 Thinking..." + Style.RESET_ALL) 66 | 67 | def print_searching(self): 68 | print(Fore.MAGENTA + "📝 Searching..." + Style.RESET_ALL) 69 | 70 | def search_and_improve(self, user_query: str) -> str: 71 | attempt = 0 72 | while attempt < self.max_attempts: 73 | print(f"\n{Fore.CYAN}Search attempt {attempt + 1}:{Style.RESET_ALL}") 74 | self.print_searching() 75 | 76 | try: 77 | formulated_query, time_range = self.formulate_query(user_query, attempt) 78 | 79 | print(f"{Fore.YELLOW}Original query: {user_query}{Style.RESET_ALL}") 80 | print(f"{Fore.YELLOW}Formulated query: {formulated_query}{Style.RESET_ALL}") 81 | print(f"{Fore.YELLOW}Time range: {time_range}{Style.RESET_ALL}") 82 | 83 | if not formulated_query: 84 | print(f"{Fore.RED}Error: Empty search query. Retrying...{Style.RESET_ALL}") 85 | attempt += 1 86 | continue 87 | 88 | search_results = self.perform_search(formulated_query, time_range) 89 | 90 | if not search_results: 91 | print(f"{Fore.RED}No results found. Retrying with a different query...{Style.RESET_ALL}") 92 | attempt += 1 93 | continue 94 | 95 | self.display_search_results(search_results) 96 | 97 | selected_urls = self.select_relevant_pages(search_results, user_query) 98 | 99 | if not selected_urls: 100 | print(f"{Fore.RED}No relevant URLs found. Retrying...{Style.RESET_ALL}") 101 | attempt += 1 102 | continue 103 | 104 | print(Fore.MAGENTA + "⚙️ Scraping selected pages..." + Style.RESET_ALL) 105 | # Scraping is done without OutputRedirector to ensure messages are visible 106 | scraped_content = self.scrape_content(selected_urls) 107 | 108 | if not scraped_content: 109 | print(f"{Fore.RED}Failed to scrape content. 
Retrying...{Style.RESET_ALL}") 110 | attempt += 1 111 | continue 112 | 113 | self.display_scraped_content(scraped_content) 114 | 115 | self.print_thinking() 116 | 117 | with OutputRedirector() as output: 118 | evaluation, decision = self.evaluate_scraped_content(user_query, scraped_content) 119 | llm_output = output.getvalue() 120 | logger.info(f"LLM Output in evaluate_scraped_content:\n{llm_output}") 121 | 122 | print(f"{Fore.MAGENTA}Evaluation: {evaluation}{Style.RESET_ALL}") 123 | print(f"{Fore.MAGENTA}Decision: {decision}{Style.RESET_ALL}") 124 | 125 | if decision == "answer": 126 | return self.generate_final_answer(user_query, scraped_content) 127 | elif decision == "refine": 128 | print(f"{Fore.YELLOW}Refining search...{Style.RESET_ALL}") 129 | attempt += 1 130 | else: 131 | print(f"{Fore.RED}Unexpected decision. Proceeding to answer.{Style.RESET_ALL}") 132 | return self.generate_final_answer(user_query, scraped_content) 133 | 134 | except Exception as e: 135 | print(f"{Fore.RED}An error occurred during search attempt. Check the log file for details.{Style.RESET_ALL}") 136 | logger.error(f"An error occurred during search: {str(e)}", exc_info=True) 137 | attempt += 1 138 | 139 | return self.synthesize_final_answer(user_query) 140 | 141 | def evaluate_scraped_content(self, user_query: str, scraped_content: Dict[str, str]) -> Tuple[str, str]: 142 | user_query_short = user_query[:200] 143 | prompt = f""" 144 | Evaluate if the following scraped content contains sufficient information to answer the user's question comprehensively: 145 | 146 | User's question: "{user_query_short}" 147 | 148 | Scraped Content: 149 | {self.format_scraped_content(scraped_content)} 150 | 151 | Your task: 152 | 1. Determine if the scraped content provides enough relevant and detailed information to answer the user's question thoroughly. 153 | 2. If the information is sufficient, decide to 'answer'. If more information or clarification is needed, decide to 'refine' the search. 
154 | 155 | Respond using EXACTLY this format: 156 | Evaluation: [Your evaluation of the scraped content] 157 | Decision: [ONLY 'answer' if content is sufficient, or 'refine' if more information is needed] 158 | """ 159 | max_retries = 3 160 | for attempt in range(max_retries): 161 | try: 162 | response_text = self.llm.generate(prompt, max_tokens=200, stop=None) 163 | evaluation, decision = self.parse_evaluation_response(response_text) 164 | if decision in ['answer', 'refine']: 165 | return evaluation, decision 166 | except Exception as e: 167 | logger.warning(f"Error in evaluate_scraped_content (attempt {attempt + 1}): {str(e)}") 168 | 169 | logger.warning("Failed to get a valid decision in evaluate_scraped_content. Defaulting to 'refine'.") 170 | return "Failed to evaluate content.", "refine" 171 | 172 | def parse_evaluation_response(self, response: str) -> Tuple[str, str]: 173 | evaluation = "" 174 | decision = "" 175 | for line in response.strip().split('\n'): 176 | if line.startswith('Evaluation:'): 177 | evaluation = line.split(':', 1)[1].strip() 178 | elif line.startswith('Decision:'): 179 | decision = line.split(':', 1)[1].strip().lower() 180 | return evaluation, decision 181 | 182 | def formulate_query(self, user_query: str, attempt: int) -> Tuple[str, str]: 183 | user_query_short = user_query[:200] 184 | prompt = f""" 185 | Based on the following user question, formulate a concise and effective search query: 186 | "{user_query_short}" 187 | Your task: 188 | 1. Create a search query of 2-5 words that will yield relevant results. 189 | 2. Determine if a specific time range is needed for the search. 190 | Time range options: 191 | - 'd': Limit results to the past day. Use for very recent events or rapidly changing information. 192 | - 'w': Limit results to the past week. Use for recent events or topics with frequent updates. 193 | - 'm': Limit results to the past month. Use for relatively recent information or ongoing events. 
194 | - 'y': Limit results to the past year. Use for annual events or information that changes yearly. 195 | - 'none': No time limit. Use for historical information or topics not tied to a specific time frame. 196 | Respond in the following format: 197 | Search query: [Your 2-5 word query] 198 | Time range: [d/w/m/y/none] 199 | Do not provide any additional information or explanation. 200 | """ 201 | max_retries = 3 202 | for retry in range(max_retries): 203 | with OutputRedirector() as output: 204 | response_text = self.llm.generate(prompt, max_tokens=50, stop=None) 205 | llm_output = output.getvalue() 206 | logger.info(f"LLM Output in formulate_query:\n{llm_output}") 207 | query, time_range = self.parse_query_response(response_text) 208 | if query and time_range: 209 | return query, time_range 210 | return self.fallback_query(user_query), "none" 211 | 212 | def parse_query_response(self, response: str) -> Tuple[str, str]: 213 | query = "" 214 | time_range = "none" 215 | for line in response.strip().split('\n'): 216 | if ":" in line: 217 | key, value = line.split(":", 1) 218 | key = key.strip().lower() 219 | value = value.strip() 220 | if "query" in key: 221 | query = self.clean_query(value) 222 | elif "time" in key or "range" in key: 223 | time_range = self.validate_time_range(value) 224 | return query, time_range 225 | 226 | def clean_query(self, query: str) -> str: 227 | query = re.sub(r'["\'\[\]]', '', query) 228 | query = re.sub(r'\s+', ' ', query) 229 | return query.strip()[:100] 230 | 231 | def validate_time_range(self, time_range: str) -> str: 232 | valid_ranges = ['d', 'w', 'm', 'y', 'none'] 233 | time_range = time_range.lower() 234 | return time_range if time_range in valid_ranges else 'none' 235 | 236 | def fallback_query(self, user_query: str) -> str: 237 | words = user_query.split() 238 | return " ".join(words[:5]) 239 | 240 | def perform_search(self, query: str, time_range: str) -> List[Dict]: 241 | if not query: 242 | return [] 243 | 244 | from 
duckduckgo_search import DDGS 245 | max_retries = 3 246 | base_delay = 2 # Base delay in seconds 247 | 248 | for retry in range(max_retries): 249 | try: 250 | # Add delay that increases with each retry 251 | if retry > 0: 252 | delay = base_delay * (2 ** (retry - 1)) # Exponential backoff 253 | print(f"{Fore.YELLOW}Rate limit hit. Waiting {delay} seconds before retry {retry + 1}/{max_retries}...{Style.RESET_ALL}") 254 | time.sleep(delay) 255 | 256 | with DDGS() as ddgs: 257 | try: 258 | with OutputRedirector() as output: 259 | if time_range and time_range != 'none': 260 | results = list(ddgs.text(query, timelimit=time_range, max_results=10)) 261 | else: 262 | results = list(ddgs.text(query, max_results=10)) 263 | 264 | ddg_output = output.getvalue() 265 | logger.info(f"DDG Output in perform_search:\n{ddg_output}") 266 | 267 | # If we get here, search was successful 268 | return [{'number': i+1, **result} for i, result in enumerate(results)] 269 | 270 | except Exception as e: 271 | if 'Ratelimit' in str(e): 272 | if retry == max_retries - 1: 273 | print(f"{Fore.RED}Final rate limit attempt failed: {str(e)}{Style.RESET_ALL}") 274 | return [] 275 | continue # Try again with delay 276 | else: 277 | print(f"{Fore.RED}Search error: {str(e)}{Style.RESET_ALL}") 278 | return [] 279 | 280 | except Exception as e: 281 | print(f"{Fore.RED}Outer error: {str(e)}{Style.RESET_ALL}") 282 | return [] 283 | 284 | print(f"{Fore.RED}All retry attempts failed for query: {query}{Style.RESET_ALL}") 285 | return [] 286 | 287 | def display_search_results(self, results: List[Dict]) -> None: 288 | """Display search results with minimal output""" 289 | try: 290 | if not results: 291 | return 292 | 293 | # Only show search success status 294 | print(f"\nSearch query sent to DuckDuckGo: {self.last_query}") 295 | print(f"Time range sent to DuckDuckGo: {self.last_time_range}") 296 | print(f"Number of results: {len(results)}") 297 | 298 | except Exception as e: 299 | logger.error(f"Error displaying 
search results: {str(e)}") 300 | 301 | def select_relevant_pages(self, search_results: List[Dict], user_query: str) -> List[str]: 302 | prompt = f""" 303 | Given the following search results for the user's question: "{user_query}" 304 | Select the 2 most relevant results to scrape and analyze. Explain your reasoning for each selection. 305 | 306 | Search Results: 307 | {self.format_results(search_results)} 308 | 309 | Instructions: 310 | 1. You MUST select exactly 2 result numbers from the search results. 311 | 2. Choose the results that are most likely to contain comprehensive and relevant information to answer the user's question. 312 | 3. Provide a brief reason for each selection. 313 | 314 | You MUST respond using EXACTLY this format and nothing else: 315 | 316 | Selected Results: [Two numbers corresponding to the selected results] 317 | Reasoning: [Your reasoning for the selections] 318 | """ 319 | 320 | max_retries = 3 321 | for retry in range(max_retries): 322 | with OutputRedirector() as output: 323 | response_text = self.llm.generate(prompt, max_tokens=200, stop=None) 324 | llm_output = output.getvalue() 325 | logger.info(f"LLM Output in select_relevant_pages:\n{llm_output}") 326 | 327 | parsed_response = self.parse_page_selection_response(response_text) 328 | if parsed_response and self.validate_page_selection_response(parsed_response, len(search_results)): 329 | selected_urls = [result['href'] for result in search_results if result['number'] in parsed_response['selected_results']] 330 | 331 | allowed_urls = [url for url in selected_urls if can_fetch(url)] 332 | if allowed_urls: 333 | return allowed_urls 334 | else: 335 | print(f"{Fore.YELLOW}Warning: All selected URLs are disallowed by robots.txt. Retrying selection.{Style.RESET_ALL}") 336 | else: 337 | print(f"{Fore.YELLOW}Warning: Invalid page selection. Retrying.{Style.RESET_ALL}") 338 | 339 | print(f"{Fore.YELLOW}Warning: All attempts to select relevant pages failed. 
def parse_page_selection_response(self, response: str) -> Union[Dict[str, Union[List[int], str]], None]:
    """Parse the LLM's page-selection reply into a dictionary.

    The reply is expected to contain one line starting with
    ``Selected Results:`` (every integer found on that line is collected)
    and one line starting with ``Reasoning:``. Leading and trailing
    whitespace on each line is ignored, because model output is frequently
    indented.

    Args:
        response: Raw text returned by the LLM; may be empty or ``None``.

    Returns:
        ``{'selected_results': [int, ...], 'reasoning': str}`` when both
        lines are present, otherwise ``None``.
    """
    # An empty/None generation cannot be parsed; report "no parse" rather
    # than raising AttributeError so the caller's retry loop keeps going.
    if not response:
        return None

    parsed = {}
    for raw_line in response.splitlines():
        line = raw_line.strip()
        if line.startswith('Selected Results:'):
            parsed['selected_results'] = [int(num) for num in re.findall(r'\d+', line)]
        elif line.startswith('Reasoning:'):
            # Split only on the first colon so colons inside the reasoning survive.
            parsed['reasoning'] = line.split(':', 1)[1].strip()

    if 'selected_results' in parsed and 'reasoning' in parsed:
        return parsed
    return None
def scrape_content(self, urls: List[str]) -> Dict[str, str]:
    """Scrape every URL that robots.txt permits and collect the results.

    Each URL is checked with ``can_fetch`` first. Allowed URLs are passed
    one at a time to ``get_web_content``; whatever mapping it returns is
    merged into the result (presumably URL -> page text; confirm against
    ``get_web_content``). Progress and problems are printed and logged.

    Args:
        urls: Candidate page URLs.

    Returns:
        The merged mapping of scraped content; empty when nothing could be
        scraped.
    """
    scraped_content = {}
    blocked_urls = []
    for url in urls:
        if not can_fetch(url):
            blocked_urls.append(url)
            print(Fore.RED + f"Warning: Robots.txt disallows scraping of {url}" + Style.RESET_ALL)
            logger.warning(f"Robots.txt disallows scraping of {url}")
            continue

        content = get_web_content([url])
        if content:
            scraped_content.update(content)
            print(Fore.YELLOW + f"Successfully scraped: {url}" + Style.RESET_ALL)
            logger.info(f"Successfully scraped: {url}")
        else:
            # robots.txt allowed this URL, so an empty result is a fetch or
            # extraction failure; do not blame robots.txt for it.
            print(Fore.RED + f"Warning: No content could be scraped from {url}" + Style.RESET_ALL)
            logger.warning(f"No content could be scraped from {url}")

    print(Fore.CYAN + f"Scraped content received for {len(scraped_content)} URLs" + Style.RESET_ALL)
    logger.info(f"Scraped content received for {len(scraped_content)} URLs")

    if blocked_urls:
        print(Fore.RED + f"Warning: {len(blocked_urls)} URL(s) were not scraped due to robots.txt restrictions." + Style.RESET_ALL)
        logger.warning(f"{len(blocked_urls)} URL(s) were not scraped due to robots.txt restrictions: {', '.join(blocked_urls)}")

    return scraped_content
def format_scraped_content(self, scraped_content: Dict[str, str]) -> str:
    """Render scraped pages as one text block for inclusion in a prompt.

    Every run of whitespace inside a page's text is collapsed to a single
    space, each page is introduced by a ``Content from <url>:`` header, and
    the per-page sections are joined with a newline.

    Args:
        scraped_content: Mapping of URL to scraped page text.

    Returns:
        The combined text; an empty string for an empty mapping.
    """
    whitespace_run = re.compile(r'\s+')
    sections = [
        f"Content from {page_url}:\n{whitespace_run.sub(' ', page_text)}\n"
        for page_url, page_text in scraped_content.items()
    ]
    return "\n".join(sections)
@dataclass
class ResearchFocus:
    """A single area of investigation produced for a research question.

    ``timestamp`` and ``search_queries`` are filled in by ``__post_init__``
    when the caller leaves them at their defaults.
    """
    # Free-text description of the area to investigate.
    area: str
    # Relative importance of the area (plain int; not validated here).
    priority: int
    # Query this focus was derived from; empty when not recorded.
    source_query: str = ""
    # Creation time as "YYYY-MM-DD HH:MM:SS"; empty means "stamp with now".
    timestamp: str = ""
    # Queries issued for this area; None is replaced by a fresh list so
    # instances never share one list object.
    search_queries: List[str] = None

    def __post_init__(self):
        """Compute the per-instance defaults that cannot be class-level values."""
        self.timestamp = self.timestamp or datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        if self.search_queries is None:
            self.search_queries = []
def _extract_research_areas(self, text: str) -> List[ResearchFocus]:
    """Extract numbered research areas and their priorities from LLM output.

    A line of the form ``N. <topic>`` starts a new area. Its priority may be
    written inline on the same line (``... Priority: 4``) or on a following
    line that begins with ``Priority``. Priorities are clamped to 1-5 and
    default to 3 when none is given. Blank lines are ignored.

    Args:
        text: Raw model response.

    Returns:
        The areas in the order they appear in ``text``; empty if none found.
    """
    numbered_item = re.compile(r'^(\d+)\.\s*(.*)')
    inline_priority = re.compile(r'(?i)\bpriority\b\s*(?:[:=]?\s*)?(\d+)')
    priority_line = re.compile(r'(?i)^priority\s*(?:[:=]?\s*)?(\d+)')

    areas = []
    current_area = None
    current_priority = None

    for raw_line in text.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        number_match = numbered_item.match(line)
        if number_match:
            # A new numbered item closes the one collected so far.
            if current_area is not None:
                areas.append(ResearchFocus(
                    area=current_area.strip(' -:'),
                    priority=current_priority or 3,
                ))

            area_line = number_match.group(2)
            inline_match = inline_priority.search(area_line)
            if inline_match:
                # (\d+) always parses as an int, so no ValueError handling is needed.
                current_priority = max(1, min(5, int(inline_match.group(1))))
                # Cut the "priority N" fragment out of the topic text.
                area_line = area_line[:inline_match.start()] + area_line[inline_match.end():]
                area_line = area_line.strip(' -:')
            else:
                current_priority = None  # Priority might be on a later line
            current_area = area_line.strip()
            continue

        separate_match = priority_line.match(line)
        if separate_match:
            current_priority = max(1, min(5, int(separate_match.group(1))))

    # Flush the last collected area unconditionally so it is kept even when
    # the text does not end on a priority line.
    if current_area is not None:
        areas.append(ResearchFocus(
            area=current_area.strip(' -:'),
            priority=current_priority or 3,
        ))

    return areas
class OutputRedirector:
    """Context manager that points ``sys.stdout`` and ``sys.stderr`` at one buffer.

    On entry both streams are replaced by ``stream`` (a new ``StringIO`` when
    none is supplied) and that buffer is returned for inspection. On exit the
    streams recorded when the object was constructed are put back.
    Exceptions raised inside the ``with`` body are not suppressed.
    """

    def __init__(self, stream=None):
        # The streams to restore are captured here, at construction time,
        # not when the context is entered.
        self.original_stdout, self.original_stderr = sys.stdout, sys.stderr
        self.stream = stream if stream else StringIO()

    def __enter__(self):
        sys.stdout = sys.stderr = self.stream
        return self.stream

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout, sys.stderr = self.original_stdout, self.original_stderr
def _cleanup(self):
    """Stop background work, release the LLM, and restore the terminal.

    Steps, in order:
      1. Set ``should_terminate`` so cooperative loops can stop.
      2. Give a live research thread one second to finish; if it is still
         running, inject ``SystemExit`` into it, and clear that pending
         exception again if the thread is still alive shortly afterwards.
      3. Call ``self.llm.cleanup()`` when an ``llm`` attribute exposing a
         ``cleanup`` method is present.
      4. End curses mode and run the public ``cleanup()``.

    Failures in steps 2 and 3 are logged and do not abort the remaining steps.
    """
    self.should_terminate.set()

    # Handle research thread termination
    worker = self.research_thread
    if worker and worker.is_alive():
        try:
            worker.join(timeout=1.0)
            if worker.is_alive():
                import ctypes
                thread_id = ctypes.c_long(worker.ident)
                # Ask the interpreter to raise SystemExit inside the worker.
                ctypes.pythonapi.PyThreadState_SetAsyncExc(
                    thread_id,
                    ctypes.py_object(SystemExit))
                time.sleep(0.1)  # Give thread time to exit
                if worker.is_alive():  # Double-check
                    # None is passed as a NULL pointer, which clears the
                    # pending asynchronous exception for that thread.
                    ctypes.pythonapi.PyThreadState_SetAsyncExc(thread_id, None)
        except Exception as e:
            logger.error(f"Error terminating research thread: {str(e)}")

    # Clean up the LLM. The attribute probed must be the method actually
    # called ('cleanup'); checking for '_cleanup' could skip the call or
    # invoke a method that does not exist.
    llm = getattr(self, 'llm', None)
    if llm is not None and hasattr(llm, 'cleanup'):
        try:
            llm.cleanup()
        except Exception as e:
            logger.error(f"Error cleaning up LLM: {str(e)}")

    # Ensure terminal is restored. endwin() raises curses.error when curses
    # was never initialised; only that error is ignored, so
    # KeyboardInterrupt/SystemExit still propagate.
    try:
        curses.endwin()
    except curses.error:
        pass

    # Final cleanup of UI
    self.cleanup()
def update_status(self, text: str):
    """Replace the contents of the status line shown above the input area.

    Does nothing until the UI has been set up. The status window is
    cleared, ``text`` is drawn at its top-left corner using colour pair 3,
    and the input prompt is redrawn afterwards. Drawing errors reported by
    curses are ignored.

    Args:
        text: Message to show on the status line.
    """
    if self.is_setup:
        try:
            status = self.status_win
            status.clear()
            status.addstr(0, 0, text, curses.color_pair(3))
            status.refresh()
            # Redraw the prompt after the status line has been repainted.
            self._refresh_input_prompt()
        except curses.error:
            pass
class NonBlockingInput:
    """Context manager for polling the keyboard without blocking.

    On non-Windows systems, entering the context saves the terminal
    attributes of stdin and switches it to cbreak mode; leaving the context
    restores the saved attributes. On Windows (``os.name == 'nt'``) the
    terminal is left untouched and ``msvcrt`` is used for polling.
    """

    def __init__(self):
        # Terminal attributes captured by __enter__; stays None on Windows.
        self.old_settings = None

    def __enter__(self):
        if os.name != 'nt':  # Unix-like
            self.old_settings = termios.tcgetattr(sys.stdin)
            tty.setcbreak(sys.stdin.fileno())
        return self

    def __exit__(self, type, value, traceback):
        if os.name == 'nt':  # Windows: nothing was changed on entry
            return
        termios.tcsetattr(sys.stdin, termios.TCSADRAIN, self.old_settings)

    def check_input(self, timeout=0.1):
        """Return one pending character, or None when no input is waiting.

        ``timeout`` (seconds) is how long ``select`` waits on non-Windows
        systems; it is not used on Windows.
        """
        if os.name == 'nt':  # Windows
            import msvcrt
            return msvcrt.getch().decode('utf-8') if msvcrt.kbhit() else None

        # Unix-like
        readable = select.select([sys.stdin], [], [], timeout)[0]
        return sys.stdin.read(1) if readable else None
@staticmethod
def get_initial_input() -> str:
    """Read the initial research query from standard input.

    Lines are collected until an empty line is entered or end-of-file
    (Ctrl+D) is reached. Ctrl+C prints a notice and exits the process with
    status 0.

    Returns:
        The collected lines joined with single spaces and stripped.
    """
    print(f"{Fore.GREEN}📝 Enter your message (Press CTRL+D to submit):{Style.RESET_ALL}")
    collected = []
    try:
        # iter(input, "") keeps calling input() until it returns an empty line.
        for entered in iter(input, ""):
            collected.append(entered)
    except EOFError:  # Ctrl+D pressed
        pass
    except KeyboardInterrupt:  # Ctrl+C pressed
        print("\nOperation cancelled")
        sys.exit(0)

    return " ".join(collected).strip()
def parse_search_query(self, query_response: str) -> Dict[str, str]:
    """Pull the search query and time range out of a query-formulation reply.

    First pass: every ``key: value`` line is inspected. A key containing
    "query" supplies the query (passed through ``self._clean_query``); a key
    containing "time" or "range" supplies the time range when its value is
    one of d/w/m/y/none.

    Second pass (only if the time range is still "none"): the reply, with
    the query text removed, is scanned for stand-alone letters d, w, m, y.
    If exactly one distinct letter is found it becomes the time range.

    Returns:
        ``{'query': ..., 'time_range': ...}``; on any exception the error is
        logged and ``{'query': '', 'time_range': 'none'}`` is returned.
    """
    try:
        parsed = {
            'query': '',
            'time_range': 'none'
        }

        # Standard "key: value" format
        for raw_line in query_response.strip().split('\n'):
            label, separator, remainder = raw_line.partition(':')
            if not separator:
                continue
            label = label.strip().lower()
            remainder = remainder.strip()

            if 'query' in label:
                parsed['query'] = self._clean_query(remainder)
                continue

            candidate = remainder.lower()
            if ('time' in label or 'range' in label) and candidate in ['d', 'w', 'm', 'y', 'none']:
                parsed['time_range'] = candidate

        if parsed['time_range'] != 'none':
            return parsed

        # Fallback: look for an isolated time letter outside the query text
        haystack = query_response.lower()
        if parsed['query']:
            haystack = haystack.replace(parsed['query'].lower(), '')

        def stands_alone(hit):
            """True when the match is not directly adjacent to a letter."""
            begin, finish = hit.span()
            left_clear = begin == 0 or not haystack[begin - 1].isalpha()
            right_clear = finish == len(haystack) or not haystack[finish].isalpha()
            return left_clear and right_clear

        letters_found = {
            letter
            for letter in ['d', 'w', 'm', 'y']
            if any(stands_alone(hit) for hit in re.finditer(r'\b' + letter + r'\b', haystack))
        }

        # Several different letters are ambiguous, so only a single one is used.
        if len(letters_found) == 1:
            parsed['time_range'] = letters_found.pop()

        return parsed
    except Exception as e:
        logger.error(f"Error parsing search query: {str(e)}")
        return {'query': '', 'time_range': 'none'}
{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") 802 | f.write("="*80 + "\n\n") 803 | f.flush() 804 | 805 | except Exception as e: 806 | logger.error(f"Error initializing document: {str(e)}") 807 | self.document_path = "research_findings.txt" 808 | with open(self.document_path, 'w', encoding='utf-8') as f: 809 | f.write("Research Findings:\n\n") 810 | f.flush() 811 | 812 | def add_to_document(self, content: str, source_url: str, focus_area: str): 813 | """Add research findings to current session document""" 814 | try: 815 | with open(self.document_path, 'a', encoding='utf-8') as f: 816 | if source_url not in self.searched_urls: 817 | f.write(f"\n{'='*80}\n") 818 | f.write(f"Research Focus: {focus_area}\n") 819 | f.write(f"Source: {source_url}\n") 820 | f.write(f"Content:\n{content}\n") 821 | f.write(f"{'='*80}\n") 822 | f.flush() 823 | self.searched_urls.add(source_url) 824 | self.ui.update_output(f"Added content from: {source_url}") 825 | except Exception as e: 826 | logger.error(f"Error adding to document: {str(e)}") 827 | self.ui.update_output(f"Error saving content: {str(e)}") 828 | 829 | def _process_search_results(self, results: Dict[str, str], focus_area: str): 830 | """Process and store search results""" 831 | if not results: 832 | return 833 | 834 | for url, content in results.items(): 835 | if url not in self.searched_urls: 836 | self.add_to_document(content, url, focus_area) 837 | 838 | def _research_loop(self): 839 | """Main research loop with comprehensive functionality""" 840 | self.is_running = True 841 | try: 842 | self.research_started.set() 843 | 844 | while not self.should_terminate.is_set() and not self.shutdown_event.is_set(): 845 | # Check if research is paused 846 | if self.research_paused: 847 | time.sleep(1) 848 | continue 849 | 850 | self.ui.update_output("\nAnalyzing research progress...") 851 | 852 | # Generate focus areas 853 | self.ui.update_output("\nGenerating research focus areas...") 854 | analysis_result = 
self.strategic_parser.strategic_analysis(self.original_query) 855 | 856 | if not analysis_result: 857 | self.ui.update_output("\nFailed to generate analysis result. Retrying...") 858 | continue 859 | 860 | focus_areas = analysis_result.focus_areas 861 | if not focus_areas: 862 | self.ui.update_output("\nNo valid focus areas generated. Retrying...") 863 | continue 864 | 865 | self.ui.update_output(f"\nGenerated {len(focus_areas)} research areas:") 866 | for i, focus in enumerate(focus_areas, 1): 867 | self.ui.update_output(f"\nArea {i}: {focus.area}") 868 | self.ui.update_output(f"Priority: {focus.priority}") 869 | 870 | # Process each focus area in priority order 871 | for focus_area in focus_areas: 872 | if self.should_terminate.is_set(): 873 | break 874 | 875 | # Check if research is paused 876 | while self.research_paused and not self.should_terminate.is_set(): 877 | time.sleep(1) 878 | 879 | if self.should_terminate.is_set(): 880 | break 881 | 882 | self.current_focus = focus_area 883 | self.ui.update_output(f"\nInvestigating: {focus_area.area}") 884 | 885 | queries = self.formulate_search_queries(focus_area) 886 | if not queries: 887 | continue 888 | 889 | for query in queries: 890 | if self.should_terminate.is_set(): 891 | break 892 | 893 | # Check if research is paused 894 | while self.research_paused and not self.should_terminate.is_set(): 895 | time.sleep(1) 896 | 897 | if self.should_terminate.is_set(): 898 | break 899 | 900 | try: 901 | self.ui.update_output(f"\nSearching: {query}") 902 | results = self.search_engine.perform_search(query, time_range='none') 903 | 904 | if results: 905 | # self.search_engine.display_search_results(results) 906 | selected_urls = self.search_engine.select_relevant_pages(results, query) 907 | 908 | if selected_urls: 909 | self.ui.update_output("\n⚙️ Scraping selected pages...") 910 | scraped_content = self.search_engine.scrape_content(selected_urls) 911 | if scraped_content: 912 | for url, content in scraped_content.items(): 
913 | if url not in self.searched_urls: 914 | self.add_to_document(content, url, focus_area.area) 915 | 916 | except Exception as e: 917 | logger.error(f"Error in search: {str(e)}") 918 | self.ui.update_output(f"Error during search: {str(e)}") 919 | 920 | if self.check_document_size(): 921 | self.ui.update_output("\nDocument size limit reached. Finalizing research.") 922 | return 923 | 924 | # After processing all areas, cycle back to generate new ones 925 | self.ui.update_output("\nAll current focus areas investigated. Generating new areas...") 926 | 927 | except Exception as e: 928 | logger.error(f"Error in research loop: {str(e)}") 929 | self.ui.update_output(f"Error in research process: {str(e)}") 930 | finally: 931 | self.is_running = False 932 | 933 | def start_research(self, topic: str): 934 | """Start research with new session document""" 935 | try: 936 | self.ui.setup() 937 | self.original_query = topic 938 | self._initialize_document() 939 | 940 | self.ui.update_output(f"Starting research on: {topic}") 941 | self.ui.update_output(f"Session document: {self.document_path}") 942 | self.ui.update_output("\nCommands available during research:") 943 | self.ui.update_output("'s' = Show status") 944 | self.ui.update_output("'f' = Show current focus") 945 | self.ui.update_output("'p' = Pause and assess the research progress") # New command 946 | self.ui.update_output("'q' = Quit research\n") 947 | 948 | # Reset events 949 | self.should_terminate.clear() 950 | self.research_started.clear() 951 | self.research_paused = False # Ensure research is not paused at the start 952 | self.awaiting_user_decision = False 953 | 954 | # Start research thread 955 | self.research_thread = threading.Thread(target=self._research_loop, daemon=True) 956 | self.research_thread.start() 957 | 958 | # Wait for research to actually start 959 | if not self.research_started.wait(timeout=10): 960 | self.ui.update_output("Error: Research failed to start within timeout period") 961 | 
def check_document_size(self) -> bool:
    """Report whether the session document is close to the LLM context limit.

    Token usage is approximated as 1.3 tokens per whitespace-separated word
    and compared with ``n_ctx`` from ``self.llm.llm_config``. A missing,
    ``None`` or non-positive ``n_ctx`` falls back to 2048 instead of raising
    and being reported as "limit reached".

    A warning is logged and shown in the UI once the estimate exceeds 80%
    of the limit.

    Returns:
        True when the estimate exceeds 90% of the limit, or when the
        document cannot be read or measured (errors are logged and treated
        as "stop").
    """
    default_context = 2048
    try:
        with open(self.document_path, 'r', encoding='utf-8') as f:
            content = f.read()

        estimated_tokens = len(content.split()) * 1.3

        configured = self.llm.llm_config.get('n_ctx', default_context)
        # Guard the division below against None / 0 / negative settings.
        max_tokens = configured if configured and configured > 0 else default_context
        current_ratio = estimated_tokens / max_tokens

        if current_ratio > 0.8:
            logger.warning(f"Document size at {current_ratio*100:.1f}% of context limit")
            self.ui.update_output(f"Warning: Document size at {current_ratio*100:.1f}% of context limit")

        return current_ratio > 0.9
    except Exception as e:
        logger.error(f"Error checking document size: {str(e)}")
        return True
cmd.lower() == 'q': 1013 | self.ui.update_output("\nInitiating research termination...") 1014 | self.should_terminate.set() 1015 | self.ui.update_output("\nGenerating research summary... please wait...") 1016 | summary = self.terminate_research() 1017 | self.ui.update_output("\nFinal Research Summary:") 1018 | self.ui.update_output(summary) 1019 | 1020 | def pause_and_assess(self): 1021 | """Pause the research and assess if the collected content is sufficient.""" 1022 | try: 1023 | # Pause the research thread 1024 | self.ui.update_output("\nPausing research for assessment...") 1025 | self.research_paused = True 1026 | 1027 | # Start progress indicator in a separate thread 1028 | self.summary_ready = False 1029 | indicator_thread = threading.Thread( 1030 | target=self.show_progress_indicator, 1031 | args=("Assessing the researched information...",) 1032 | ) 1033 | indicator_thread.daemon = True 1034 | indicator_thread.start() 1035 | 1036 | # Read the current research content 1037 | if not os.path.exists(self.document_path): 1038 | self.summary_ready = True 1039 | indicator_thread.join() 1040 | self.ui.update_output("No research data found to assess.") 1041 | self.research_paused = False 1042 | return 1043 | 1044 | with open(self.document_path, 'r', encoding='utf-8') as f: 1045 | content = f.read().strip() 1046 | 1047 | if not content: 1048 | self.summary_ready = True 1049 | indicator_thread.join() 1050 | self.ui.update_output("No research data was collected to assess.") 1051 | self.research_paused = False 1052 | return 1053 | 1054 | # Prepare the prompt for the AI assessment 1055 | assessment_prompt = f""" 1056 | Based on the following research content, please assess whether the original query "{self.original_query}" can be answered sufficiently with the collected information. 1057 | 1058 | Research Content: 1059 | {content} 1060 | 1061 | Instructions: 1062 | 1. 
If the research content provides enough information to answer the original query in detail, respond with: "The research is sufficient to answer the query." 1063 | 2. If not, respond with: "The research is insufficient and it would be advisable to continue gathering information." 1064 | 3. Do not provide any additional information or details. 1065 | 1066 | Assessment: 1067 | """ 1068 | 1069 | # Generate the assessment 1070 | assessment = self.llm.generate(assessment_prompt, max_tokens=200) 1071 | 1072 | # Stop the progress indicator 1073 | self.summary_ready = True 1074 | indicator_thread.join() 1075 | 1076 | # Display the assessment 1077 | self.ui.update_output("\nAssessment Result:") 1078 | self.ui.update_output(assessment.strip()) 1079 | 1080 | # Provide user with options to continue or quit 1081 | self.ui.update_output("\nEnter 'c' to continue the research or 'q' to terminate and generate the summary.") 1082 | self.awaiting_user_decision = True # Flag to indicate we are waiting for user's decision 1083 | 1084 | while self.awaiting_user_decision: 1085 | cmd = self.ui.get_input("Enter command ('c' to continue, 'q' to quit): ") 1086 | if cmd is None: 1087 | continue # Ignore invalid inputs 1088 | cmd = cmd.strip().lower() 1089 | if cmd == 'c': 1090 | self.ui.update_output("\nResuming research...") 1091 | self.research_paused = False 1092 | self.awaiting_user_decision = False 1093 | elif cmd == 'q': 1094 | self.ui.update_output("\nTerminating research and generating summary...") 1095 | self.awaiting_user_decision = False 1096 | self.should_terminate.set() 1097 | summary = self.terminate_research() 1098 | self.ui.update_output("\nFinal Research Summary:") 1099 | self.ui.update_output(summary) 1100 | break 1101 | else: 1102 | self.ui.update_output("Invalid command. 
def terminate_research(self) -> str:
    """Stop the research session and return a formatted final summary.

    Reads the session document, asks the LLM to summarise it with respect
    to ``self.original_query``, appends the formatted summary to the
    document and stores the raw content and summary on the instance
    (``research_content``, ``research_summary``, ``research_complete``).
    A console progress indicator runs in a daemon thread while this
    happens.

    Returns:
        The formatted summary on success; a short explanatory message when
        the document is missing, empty or only holds the
        ``"Research Findings:"`` placeholder header; or an
        ``"Error generating summary: ..."`` message if anything fails.
        Exceptions are logged, never propagated.
    """
    try:
        print("Initiating research termination...")
        sys.stdout.flush()

        # Start progress indicator in a separate thread immediately
        indicator_thread = threading.Thread(target=self.show_progress_indicator)
        indicator_thread.daemon = True
        indicator_thread.start()

        def stop_indicator() -> None:
            self.summary_ready = True
            indicator_thread.join(timeout=1.0)
            if indicator_thread.is_alive():
                # show_progress_indicator resets summary_ready to False when
                # it starts; if it started after the assignment above, the
                # flag has to be raised again or the spinner never stops.
                self.summary_ready = True
                indicator_thread.join(timeout=1.0)

        try:
            if not os.path.exists(self.document_path):
                stop_indicator()
                self._cleanup()
                return "No research data found to summarize."

            with open(self.document_path, 'r', encoding='utf-8') as f:
                content = f.read().strip()
            self.research_content = content  # Store for conversation mode

            # The content is stripped, so the placeholder header must be
            # compared without its trailing newlines.
            if not content or content == "Research Findings:":
                stop_indicator()
                self._cleanup()
                return "No research data was collected to summarize."

            # Generate summary using LLM
            summary_prompt = f"""
Analyze the following content to provide a comprehensive research summary and a response to the user's original query "{self.original_query}" ensuring that you conclusively answer the query in detail:

Research Content:
{content}

Important Instructions:
> Summarize the research findings that are relevant to the Original topic/question: "{self.original_query}"
> Ensure that in your summary you directly answer the original question/topic conclusively to the best of your ability in detail.
> Read the original topic/question again "{self.original_query}" and abide by any additional instructions that it contains, exactly as instructed in your summary otherwise provide it normally should it not have any specific instructions

Summary:
"""

            summary = self.llm.generate(summary_prompt, max_tokens=4000)

            # Summary is complete: stop the progress indicator
            stop_indicator()

            # Store summary and mark research as complete
            self.research_summary = summary
            self.research_complete = True

            rule = '=' * 80
            formatted_summary = f"""
{rule}
RESEARCH SUMMARY
{rule}

Original Query: {self.original_query}
Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

{summary}

{rule}
End of Summary
{rule}
"""

            # Write to document
            with open(self.document_path, 'a', encoding='utf-8') as f:
                f.write("\n\n" + formatted_summary)

            return formatted_summary

        except Exception:
            # Never leave the spinner running when reading, summarising or
            # writing fails; the outer handler turns this into a message.
            stop_indicator()
            raise

    except Exception as e:
        error_msg = f"Error generating summary: {str(e)}"
        logger.error(error_msg)
        return error_msg

    finally:
        # Clean up research UI (runs on every path, so no extra call above)
        self._cleanup_research_ui()
+ "\nInstructions:") 1256 | print("- Type your question and press CTRL+D to submit") 1257 | print("- Type 'quit' and press CTRL+D to exit") 1258 | print("- Your messages appear in green") 1259 | print("- AI responses appear in cyan" + Style.RESET_ALL + "\n") 1260 | 1261 | while self.conversation_active: 1262 | try: 1263 | # Show prompt with user input in green 1264 | print(Fore.GREEN + "Your question (Press CTRL+D to submit):" + Style.RESET_ALL) 1265 | user_input = self.get_multiline_conversation_input() 1266 | 1267 | # Handle exit commands 1268 | if not user_input or user_input.lower() in ['quit', 'exit', 'q']: 1269 | print(Fore.YELLOW + "\nExiting conversation mode..." + Style.RESET_ALL) 1270 | self.conversation_active = False 1271 | break 1272 | 1273 | # Skip empty input 1274 | if not user_input.strip(): 1275 | continue 1276 | 1277 | # Echo the submitted question for clarity 1278 | print(Fore.GREEN + "Submitted question:" + Style.RESET_ALL) 1279 | print(Fore.GREEN + user_input + Style.RESET_ALL + "\n") 1280 | 1281 | # Start thinking indicator in a separate thread 1282 | self.thinking = True # Set flag before starting thread 1283 | thinking_thread = threading.Thread( 1284 | target=self.show_thinking_indicator, 1285 | args=("Thinking...", "thinking") 1286 | ) 1287 | thinking_thread.daemon = True 1288 | thinking_thread.start() 1289 | 1290 | try: 1291 | # Generate response 1292 | response = self._generate_conversation_response(user_input) 1293 | 1294 | # Stop thinking indicator 1295 | self.thinking = False 1296 | thinking_thread.join() 1297 | 1298 | # Display response in cyan 1299 | print(Fore.CYAN + "AI Response:" + Style.RESET_ALL) 1300 | print(f"{Fore.CYAN}{response}{Style.RESET_ALL}\n") 1301 | print("-" * 80 + "\n") # Separator between QA pairs 1302 | 1303 | except Exception as e: 1304 | self.thinking = False # Ensure thinking indicator stops 1305 | thinking_thread.join() 1306 | raise e 1307 | 1308 | except KeyboardInterrupt: 1309 | self.thinking = False # 
def _generate_conversation_response(self, user_query: str) -> str:
    """Answer a follow-up question using the collected research as context.

    The prompt embeds ``self.research_content`` and
    ``self.research_summary``. If the content is missing it is reloaded
    from ``self.document_path`` when that file exists, whether or not a
    summary is available, and a textual placeholder is used for any part
    that is still missing so the prompt never contains a literal ``None``.

    Args:
        user_query: The question typed by the user in conversation mode.

    Returns:
        The stripped LLM response, a fixed apology when the response is
        empty, or an apology containing the error text when generation
        fails. Exceptions are logged, never propagated.
    """
    try:
        # Debug logging to verify what context is available
        logger.info(f"Research summary length: {len(self.research_summary) if self.research_summary else 0}")
        logger.info(f"Research content length: {len(self.research_content) if self.research_content else 0}")

        # Reload the raw content whenever it is missing
        if not self.research_content:
            try:
                if os.path.exists(self.document_path):
                    with open(self.document_path, 'r', encoding='utf-8') as f:
                        self.research_content = f.read().strip()
            except Exception as e:
                logger.error(f"Failed to reload research content: {str(e)}")

        content_text = self.research_content if self.research_content else 'No research content available'
        summary_text = self.research_summary if self.research_summary else 'No summary available'

        # Prepare context, ensuring both sections always hold text
        context = f"""
Research Content:
{content_text}

Research Summary:
{summary_text}
"""

        prompt = f"""
Based on the following research content and summary, please answer this question:

{context}

Question: {user_query}

you have 2 sets of instructions the applied set and the unapplied set, the applied set should be followed if the question is directly relating to the research content whereas anything else other then direct questions about the content of the research will result in you instead following the unapplied ruleset

Applied:

Instructions:
1. Answer based ONLY on the research content provided above if asked a question about your research or that content.
2. If the information requested isn't in the research, clearly state that it isn't in the content you gathered.
3. Be direct and specific in your response, DO NOT directly cite research unless specifically asked to, be concise and give direct answers to questions based on the research, unless instructed otherwise.

Unapplied:

Instructions:

1. Do not make up anything that isn't actually true.
2. Respond directly to the user's question in an honest and thoughtful manner.
3. disregard rules in the applied set for queries not DIRECTLY related to the research, including queries about the research process or what you remember about the research should result in the unapplied ruleset being used.

Answer:
"""

        response = self.llm.generate(
            prompt,
            max_tokens=1000,  # Increased for more detailed responses
            temperature=0.7
        )

        if not response or not response.strip():
            return "I apologize, but I cannot find relevant information in the research content to answer your question."

        return response.strip()

    except Exception as e:
        logger.error(f"Error generating response: {str(e)}")
        return f"I apologize, but I encountered an error processing your question: {str(e)}"
parser = UltimateLLMResponseParser() 1442 | search_engine = EnhancedSelfImprovingSearch(llm, parser) 1443 | manager = ResearchManager(llm, parser, search_engine) 1444 | 1445 | print(f"{Fore.GREEN}System initialized. Enter your research topic or 'quit' to exit.{Style.RESET_ALL}") 1446 | while True: 1447 | try: 1448 | topic = ResearchManager.get_initial_input() 1449 | if topic.lower() == 'quit': 1450 | break 1451 | 1452 | if not topic: 1453 | continue 1454 | 1455 | if not topic.startswith('@'): 1456 | print(f"{Fore.YELLOW}Please start your research query with '@'{Style.RESET_ALL}") 1457 | continue 1458 | 1459 | topic = topic[1:] # Remove @ prefix 1460 | manager.start_research(topic) 1461 | summary = manager.terminate_research() 1462 | print(f"\n{Fore.GREEN}Research Summary:{Style.RESET_ALL}") 1463 | print(summary) 1464 | print(f"\n{Fore.GREEN}Research completed. Ready for next topic.{Style.RESET_ALL}\n") 1465 | 1466 | except KeyboardInterrupt: 1467 | print(f"\n{Fore.YELLOW}Operation cancelled. Ready for next topic.{Style.RESET_ALL}") 1468 | if 'manager' in locals(): 1469 | manager.terminate_research() 1470 | continue 1471 | 1472 | except KeyboardInterrupt: 1473 | print(f"\n{Fore.YELLOW}Research system shutting down.{Style.RESET_ALL}") 1474 | if 'manager' in locals(): 1475 | manager.terminate_research() 1476 | except Exception as e: 1477 | print(f"{Fore.RED}Critical error: {str(e)}{Style.RESET_ALL}") 1478 | logger.error("Critical error in main loop", exc_info=True) 1479 | 1480 | if os.name == 'nt': 1481 | print(f"{Fore.YELLOW}Running on Windows - Some features may be limited{Style.RESET_ALL}") 1482 | --------------------------------------------------------------------------------