└── smolResearcher.py

/smolResearcher.py:
--------------------------------------------------------------------------------
import asyncio
import json
import os
import re
from typing import List
from urllib.parse import urlparse

import aiofiles
import pandas as pd
import requests
from huggingface_hub import login
from markdownify import markdownify
from requests.exceptions import RequestException
from smolagents import ToolCallingAgent, DuckDuckGoSearchTool, HfApiModel, Tool


class VisitWebpageTool(Tool):
    """Tool that visits a webpage and returns its content as markdown."""
    name = "visit_webpage"
    description = "Visits a webpage and returns its content as markdown."
    inputs = {
        "url": {
            "type": "string",
            "description": "The URL of the webpage to visit."
        }
    }
    output_type = "string"

    def forward(self, url: str) -> str:
        """Visits a webpage and returns its content as markdown."""
        try:
            response = requests.get(url, timeout=20)  # Timeout so a dead host cannot hang the run
            response.raise_for_status()
            markdown_content = markdownify(response.text).strip()
            markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
            return markdown_content[:5000]  # Trim content to reduce tokens
        except RequestException as e:
            return f"Error fetching the webpage: {str(e)}"
        except Exception as e:
            return f"An unexpected error occurred: {str(e)}"


class DataCleanerAgent:
    def __init__(self):
        pass

    def clean_description(self, text: str) -> str | None:
        """Removes website links and decodes JSON string escapes in a description."""
        if text is None:
            return None
        text = re.sub(r"https?://\S+", "", text)  # Remove URLs
        try:
            # Decode JSON string escapes (e.g. \n, \"); leave the text unchanged on failure
            text = json.loads(f'"{text}"')
        except json.JSONDecodeError:
            pass
        return str(text).strip()

    def clean_url(self, url: str) -> str | None:
        """Returns the value only if it is a single well-formed URL with no extra characters."""
        if url is None:
            return None
        cleaned_url = str(url).strip()
        if re.fullmatch(r"https?://\S+", cleaned_url):  # fullmatch rejects trailing junk after the URL
            return cleaned_url
        return None


class SeedDescriptionAgent:
    def __init__(self, csv_path: str, model_id: str, hf_token: str, preferred_sources: List[str] = None):
        self.csv_path = csv_path
        print(f"Initializing agent with CSV: {csv_path}")
        self.df = pd.read_csv(csv_path)
        print(f"DataFrame loaded with {len(self.df)} rows.")
        self.missing_seeds = self.df[self.df['Description'].isna()].index.tolist()
        print(f"Missing seeds indices: {self.missing_seeds}")
        self.model = HfApiModel(model_id)
        self.agent = ToolCallingAgent(
            tools=[DuckDuckGoSearchTool(), VisitWebpageTool()],
            model=self.model,
            max_steps=2,
        )
        self.hf_token = hf_token
        self.processed_count = 0
        self.preferred_sources = preferred_sources or []  # Initialize preferred sources
        self.cleaner = DataCleanerAgent()

    def format_prompt(self, search_term: str, first_search: bool = True, preferred_source: str = None) -> str:
        if first_search:
            if preferred_source:
                return f"""
You are an expert in seed descriptions. Find a concise description of "{search_term}" from the seed company "{preferred_source}".
If the description is not available from "{preferred_source}", respond with the word "NEXT" only, and I will give you another source. Do not use any other words.
If you find the description at this source, include ONLY the URL of the seed company's *product page* and the description of the seed.
"""
            return f"""
You are an expert in seed descriptions. Find a concise description of "{search_term}" from a seed company that sells seeds.
Include ONLY the URL of the seed company's *product page* and the seed description. Do not include listing or category URLs, and do not include any other text besides the product URL and the description. Avoid blogs or articles.
Provide a concise, plain-text description only, with no JSON, and only provide a result if you find the exact variety name on the page.
"""
        # Doubled braces survive the f-string so the placeholders can be filled
        # later with str.format() in check_description_with_url.
        return f"""
You are an expert in seed descriptions. Double-check the existing description for "{search_term}" against the provided URL to confirm it is accurate.
If the information given to you matches the search you perform, answer with only the variety description and the URL.
Provide a concise, plain-text description only, with no JSON.
CURRENT_DESCRIPTION: {{current_description}}
PROVIDED_URL: {{provided_url}}
"""

    def clean_variety_name(self, variety_name: str) -> str:
        """Removes parentheses and their contents from the variety name."""
        return re.sub(r'\s*\([^)]*\)', '', variety_name).strip()

    async def run(self):
        login(token=self.hf_token)
        for index in self.missing_seeds:
            print(f"\n--- Processing index: {index} ---")
            crop = self.df.loc[index, 'Crop']
            variety = self.df.loc[index, 'Variety']
            search_term = f"{crop} {self.clean_variety_name(variety)}"  # Clean the search term
            print(f"  Search term: {search_term}")
            seed_url = None
            description = None

            # Try preferred sources first
            for source in self.preferred_sources:
                print(f"  Trying preferred source: {source}")
                first_prompt = self.format_prompt(search_term, preferred_source=source)
                try:
                    first_response = self.agent.run(first_prompt)
                    print(f"  Initial LLM response for {search_term} from {source}: {first_response}")
                    if first_response and first_response.strip() != "NEXT":
                        description, urls = self.extract_description_and_urls(first_response)
                        seed_url = self.filter_seed_company_url(urls)
                        if seed_url:
                            print(f"  Extracted URL: {seed_url}")
                            page_content = await self.get_page_content(seed_url)  # Fetch the page
                            # if page_content and self.verify_variety_in_content(page_content, search_term):
                            if page_content:  # Variety verification intentionally disabled; restore the line above to re-enable
                                print("  Variety verification passed (DISABLED).")
                                break  # Found a valid URL, stop trying other preferred sources
                            else:
                                seed_url = None
                                print(f"  Warning: {search_term} not found on website {source}")
                        else:
                            print("  No seed company URL found in response.")
                except Exception as e:
                    print(f"  Error processing {search_term} from {source}: {e}")
                    continue

            # If no URL came from the preferred sources, fall back to a general search.
            if not seed_url:
                print("  Trying general search.")
                first_prompt = self.format_prompt(search_term)
                try:
                    first_response = self.agent.run(first_prompt)
                    print(f"  Initial LLM response for {search_term} (general search): {first_response}")
                    if first_response:
                        description, urls = self.extract_description_and_urls(first_response)
                        seed_url = self.filter_seed_company_url(urls)
                        if seed_url:
                            print(f"  Extracted URL: {seed_url}")
                            page_content = await self.get_page_content(seed_url)  # Fetch the page
                            # if not (page_content and self.verify_variety_in_content(page_content, search_term)):
                            if not page_content:  # Variety verification intentionally disabled; restore the line above to re-enable
                                seed_url = None
                                print(f"  Warning: {search_term} not found on website")
                        else:
                            print("  No seed company URL found in response.")
                except Exception as e:
                    print(f"  Error processing {search_term} with general search: {e}")

            print(f"  Before DataFrame update - seed_url: {seed_url}, description: {description}")  # DEBUG PRINT

            if seed_url:  # If we get a URL at all, we try to use it.
                if description:  # If we get a description with the URL
                    print(f"  Found URL: {seed_url} and description.")
                    updated_desc = await self.check_description_with_url(
                        search_term, description, seed_url
                    )
                    if updated_desc is None:
                        print(f"  check_description_with_url failed for {search_term}")
                    else:
                        print(f"  Updated description: {updated_desc}")
                        description = updated_desc  # Use updated description

                    if description:  # Check again after potentially updating
                        print(f"  Assigning to DataFrame - Description: {description}, Info Source: {seed_url}")  # DEBUG PRINT
                        self.df.loc[index, 'Description'] = self.cleaner.clean_description(description)
                        self.df.loc[index, 'Info Source'] = self.cleaner.clean_url(seed_url)
                        print(f"  DataFrame updated - Description: {self.df.loc[index, 'Description']}, Info Source: {self.df.loc[index, 'Info Source']}")

                else:  # If we only find a URL with no description, check the page for the description
                    print(f"  Found URL: {seed_url}, but no description")
                    page_content = await self.get_page_content(seed_url)
                    if page_content:  # Verification disabled - always use page content if available
                        print(f"  Description for {search_term} found on page (verification disabled); adding page content as description")
                        description = page_content  # Use page content as description
                        print(f"  Assigning to DataFrame (page content) - Description: {description}, Info Source: {seed_url}")  # DEBUG PRINT
                        self.df.loc[index, 'Description'] = self.cleaner.clean_description(description)
                        self.df.loc[index, 'Info Source'] = self.cleaner.clean_url(seed_url)
                        print(f"  DataFrame updated - Description: {self.df.loc[index, 'Description']}, Info Source: {self.df.loc[index, 'Info Source']}")
                    else:  # If no description is found on the page, use None
                        print(f"  Description for {search_term} not found on page.")
                        print("  Assigning None to DataFrame - Description: None, Info Source: None")  # DEBUG PRINT
                        self.df.loc[index, 'Description'] = None
                        self.df.loc[index, 'Info Source'] = None
                        print(f"  DataFrame updated - Description: {self.df.loc[index, 'Description']}, Info Source: {self.df.loc[index, 'Info Source']}")
            else:
                # Assign None for both when nothing is found
                print("  No URL found. Assigning None to DataFrame - Description: None, Info Source: None")  # DEBUG PRINT
                self.df.loc[index, 'Description'] = None
                self.df.loc[index, 'Info Source'] = None
                print(f"  DataFrame updated - Description: {self.df.loc[index, 'Description']}, Info Source: {self.df.loc[index, 'Info Source']}")

            print(f"  DataFrame before save (index {index}):")
            print(self.df.loc[index, ['Crop', 'Variety', 'Description', 'Info Source']])

            # Save the DataFrame to a distinctly named temporary CSV to confirm the row was updated correctly.
            temp_csv_path = f"{self.csv_path}.temp_{index}.csv"
            await self.async_save_csv(temp_csv_path, index)

            # Save after every row (main CSV)
            await self.async_save_csv(self.csv_path, index)
            self.processed_count += 1

            # Brief pause between saves to rule out filesystem timing effects
            await asyncio.sleep(0.1)

        print("Finished processing all rows.")

    async def get_page_content(self, url: str) -> str | None:
        """Visits the given webpage and returns up to the first 5,000 characters as markdown."""
        visit_tool = VisitWebpageTool()
        content = await asyncio.to_thread(visit_tool.forward, url)
        if isinstance(content, str):
            return content
        return None

    def verify_variety_in_content(self, content: str, variety: str) -> bool:
        """Verifies that the given variety name appears in the webpage content."""
        if not content:
            return False
        return variety.lower() in content.lower()

    def extract_description_and_urls(self, response: str) -> tuple[str, List[str]]:
        """Extracts the description and URLs from a response string."""
        urls = re.findall(r"(https?://\S+)", response)
        description = response.strip()
        return description, urls

    def filter_seed_company_url(self, urls: List[str]) -> str | None:
        """Filters URLs to find a seed company website."""
        seed_company_keywords = [
            "seed", "seeds", "seedcompany", "seed-company", "seedbank", "nursery", "grow", "garden",
            "botanical", "plant", "farm", "agri", "agriculture", "shop", "store"
        ]
        for url in urls:
            domain = urlparse(url).netloc.lower()
            if any(keyword in domain for keyword in seed_company_keywords):
                if not any(x in url.lower() for x in ["blog", "article"]):
                    return url
        return None

    async def check_description_with_url(self, search_term: str, description: str, url: str) -> str | None:
        """Checks the description against a single URL and returns an updated one if needed."""
        if not url:
            return None

        second_prompt = self.format_prompt(search_term, first_search=False).format(
            current_description=description, provided_url=url
        )
        try:
            second_response = self.agent.run(second_prompt)  # agent.run is synchronous, so no await
            print(f"  Secondary LLM response for {search_term}: {second_response}")
            if second_response and second_response.strip() != description.strip():
                return second_response.strip()
            return None
        except Exception as e:
            print(f"  Error during check_description_with_url for {search_term}: {e}")
            return None

    async def async_save_csv(self, path: str, index: int):
        try:
            # Back up the main CSV if it already exists
            if os.path.exists(path) and path == self.csv_path:
                backup_path = f"{path}.bak"
                try:
                    os.replace(path, backup_path)  # os.replace overwrites a stale backup on all platforms
                    print(f"  Existing CSV backed up to {backup_path} before async save.")
                except Exception as e:
                    print(f"  Error backing up file {path}: {e}")
                    print("  Please ensure that the file is not open in another program and that permissions are configured correctly.")

            # Use aiofiles to save the CSV asynchronously
            async with aiofiles.open(path, mode='w', encoding='utf-8') as f:
                await f.write(self.df.to_csv(index=False))
            print(f"  Async CSV file saved at {path} for index {index}")

        except Exception as e:
            print(f"  Error saving async CSV: {e}")
            print("  Please ensure that the file is not open in another program and that permissions are configured correctly.")


# Example usage
async def main():
    csv_path = "CSV_PATH"  # Replace with the path to your CSV.
    model_id = "Qwen/Qwen2.5-Coder-32B-Instruct"
    hf_token = "HF_TOKEN"  # Replace with your token.
    preferred_sources = [
        "exchange.seedsavers.org",
    ]
    agent = SeedDescriptionAgent(csv_path, model_id, hf_token, preferred_sources=preferred_sources)
    await agent.run()

if __name__ == "__main__":
    asyncio.run(main())
--------------------------------------------------------------------------------
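A minimal launcher sketch for running the agent without hard-coding credentials. The HF_TOKEN environment variable name and the seeds.csv path are illustrative assumptions, not part of the file above; it only assumes smolResearcher.py is on the import path.

# run_researcher.py -- hypothetical companion script
import asyncio
import os

from smolResearcher import SeedDescriptionAgent

async def main():
    agent = SeedDescriptionAgent(
        csv_path="seeds.csv",  # assumed CSV filename
        model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
        hf_token=os.environ["HF_TOKEN"],  # read the token from the environment (assumed var name)
        preferred_sources=["exchange.seedsavers.org"],
    )
    await agent.run()

if __name__ == "__main__":
    asyncio.run(main())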