└── smolResearcher.py

/smolResearcher.py:
--------------------------------------------------------------------------------
import asyncio
import json
import os
import re
from typing import List
from urllib.parse import urlparse

import aiofiles
import pandas as pd
import requests
from huggingface_hub import login
from markdownify import markdownify
from requests.exceptions import RequestException
from smolagents import ToolCallingAgent, DuckDuckGoSearchTool, HfApiModel, Tool


class VisitWebpageTool(Tool):
    """Tool that visits a webpage and returns its content as markdown."""
    name = "visit_webpage"
    description = "Visits a webpage and returns its content as markdown."
    inputs = {
        "url": {
            "type": "string",
            "description": "The URL of the webpage to visit."
        }
    }
    output_type = "string"

    def forward(self, url: str) -> str:
        """Visits a webpage and returns its content as markdown."""
        try:
            response = requests.get(url, timeout=20)  # Timeout so a dead host cannot hang the run
            response.raise_for_status()
            markdown_content = markdownify(response.text).strip()
            markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
            return markdown_content[:5000]  # Trim content to reduce tokens
        except RequestException as e:
            return f"Error fetching the webpage: {str(e)}"
        except Exception as e:
            return f"An unexpected error occurred: {str(e)}"


class DataCleanerAgent:
    def __init__(self):
        pass

    def clean_description(self, text: str) -> str | None:
        """Removes website links and decodes JSON string escapes in a description."""
        if text is None:
            return None
        text = re.sub(r"https?://\S+", "", text)  # Remove URLs
        try:
            # Decode JSON string escapes (e.g. \n, \"); leave the text unchanged on failure
            text = json.loads(f'"{text}"')
        except json.JSONDecodeError:
            pass
        return str(text).strip()

    def clean_url(self, url: str) -> str | None:
        """Returns the value only if it is a single well-formed URL with no extra characters."""
        if url is None:
            return None
        cleaned_url = str(url).strip()
        if re.fullmatch(r"https?://\S+", cleaned_url):  # fullmatch rejects trailing junk after the URL
            return cleaned_url
        return None


class SeedDescriptionAgent:
    def __init__(self, csv_path: str, model_id: str, hf_token: str, preferred_sources: List[str] = None):
        self.csv_path = csv_path
        print(f"Initializing agent with CSV: {csv_path}")
        self.df = pd.read_csv(csv_path)
        print(f"DataFrame loaded with {len(self.df)} rows.")
        self.missing_seeds = self.df[self.df['Description'].isna()].index.tolist()
        print(f"Missing seeds indices: {self.missing_seeds}")
        self.model = HfApiModel(model_id)
        self.agent = ToolCallingAgent(
            tools=[DuckDuckGoSearchTool(), VisitWebpageTool()],
            model=self.model,
            max_steps=2,
        )
        self.hf_token = hf_token
        self.processed_count = 0
        self.preferred_sources = preferred_sources or []  # Initialize preferred sources
        self.cleaner = DataCleanerAgent()

    def format_prompt(self, search_term: str, first_search: bool = True, preferred_source: str = None) -> str:
        if first_search:
            if preferred_source:
                return f"""
You are an expert in seed descriptions. Find a concise description of "{search_term}" from the seed company "{preferred_source}".
If the description is not available from "{preferred_source}", respond with the word "NEXT" only, and I will give you another source. Do not use any other words.
If you find the description at this source, include ONLY the URL of the seed company's *product page* and the description of the seed.
"""
            return f"""
You are an expert in seed descriptions. Find a concise description of "{search_term}" from a seed company that sells seeds.
Include ONLY the URL of the seed company's *product page* and the seed description. Do not include listing or category URLs, and do not include any other text besides the product URL and the description. Avoid blogs or articles.
Provide a concise, plain-text description only, with no JSON, and only provide a result if you find the exact variety name on the page.
"""
        # Doubled braces survive the f-string so the placeholders can be filled
        # later with str.format() in check_description_with_url.
        return f"""
You are an expert in seed descriptions. Double-check the existing description for "{search_term}" against the provided URL to confirm it is accurate.
If the information given to you matches the search you perform, answer with only the variety description and the URL.
Provide a concise, plain-text description only, with no JSON.
CURRENT_DESCRIPTION: {{current_description}}
PROVIDED_URL: {{provided_url}}
"""

    def clean_variety_name(self, variety_name: str) -> str:
        """Removes parentheses and their contents from the variety name."""
        return re.sub(r'\s*\([^)]*\)', '', variety_name).strip()

    async def run(self):
        login(token=self.hf_token)
        for index in self.missing_seeds:
            print(f"\n--- Processing index: {index} ---")
            crop = self.df.loc[index, 'Crop']
            variety = self.df.loc[index, 'Variety']
            search_term = f"{crop} {self.clean_variety_name(variety)}"  # Clean the search term
            print(f"  Search term: {search_term}")
            seed_url = None
            description = None

            # Try preferred sources first
            for source in self.preferred_sources:
                print(f"  Trying preferred source: {source}")
                first_prompt = self.format_prompt(search_term, preferred_source=source)
                try:
                    first_response = self.agent.run(first_prompt)
                    print(f"  Initial LLM response for {search_term} from {source}: {first_response}")
                    if first_response and first_response.strip() != "NEXT":
                        description, urls = self.extract_description_and_urls(first_response)
                        seed_url = self.filter_seed_company_url(urls)
                        if seed_url:
                            print(f"  Extracted URL: {seed_url}")
                            page_content = await self.get_page_content(seed_url)  # Fetch the page
                            # if page_content and self.verify_variety_in_content(page_content, search_term):
                            if page_content:  # Variety verification intentionally disabled; restore the line above to re-enable
                                print("  Variety verification passed (DISABLED).")
                                break  # Found a valid URL, stop trying other preferred sources
                            else:
                                seed_url = None
                                print(f"  Warning: {search_term} not found on website {source}")
                        else:
                            print("  No seed company URL found in response.")
                except Exception as e:
                    print(f"  Error processing {search_term} from {source}: {e}")
                    continue

            # If no URL came from the preferred sources, fall back to a general search.
            if not seed_url:
                print("  Trying general search.")
                first_prompt = self.format_prompt(search_term)
                try:
                    first_response = self.agent.run(first_prompt)
                    print(f"  Initial LLM response for {search_term} (general search): {first_response}")
                    if first_response:
                        description, urls = self.extract_description_and_urls(first_response)
                        seed_url = self.filter_seed_company_url(urls)
                        if seed_url:
                            print(f"  Extracted URL: {seed_url}")
                            page_content = await self.get_page_content(seed_url)  # Fetch the page
                            # if not (page_content and self.verify_variety_in_content(page_content, search_term)):
                            if not page_content:  # Variety verification intentionally disabled; restore the line above to re-enable
                                seed_url = None
                                print(f"  Warning: {search_term} not found on website")
                        else:
                            print("  No seed company URL found in response.")
                except Exception as e:
                    print(f"  Error processing {search_term} with general search: {e}")

            print(f"  Before DataFrame update - seed_url: {seed_url}, description: {description}")  # DEBUG PRINT

            if seed_url:  # If we get a URL at all, we try to use it.
                if description:  # If we get a description with the URL
                    print(f"  Found URL: {seed_url} and description.")
                    updated_desc = await self.check_description_with_url(
                        search_term, description, seed_url
                    )
                    if updated_desc is None:
                        print(f"  check_description_with_url failed for {search_term}")
                    else:
                        print(f"  Updated description: {updated_desc}")
                        description = updated_desc  # Use updated description

                    if description:  # Check again after potentially updating
                        print(f"  Assigning to DataFrame - Description: {description}, Info Source: {seed_url}")  # DEBUG PRINT
                        self.df.loc[index, 'Description'] = self.cleaner.clean_description(description)
                        self.df.loc[index, 'Info Source'] = self.cleaner.clean_url(seed_url)
                        print(f"  DataFrame updated - Description: {self.df.loc[index, 'Description']}, Info Source: {self.df.loc[index, 'Info Source']}")

                else:  # If we only find a URL with no description, check the page for the description
                    print(f"  Found URL: {seed_url}, but no description")
                    page_content = await self.get_page_content(seed_url)
                    if page_content:  # Verification disabled - always use page content if available
                        print(f"  Description for {search_term} found on page (verification disabled); adding page content as description")
                        description = page_content  # Use page content as description
                        print(f"  Assigning to DataFrame (page content) - Description: {description}, Info Source: {seed_url}")  # DEBUG PRINT
                        self.df.loc[index, 'Description'] = self.cleaner.clean_description(description)
                        self.df.loc[index, 'Info Source'] = self.cleaner.clean_url(seed_url)
                        print(f"  DataFrame updated - Description: {self.df.loc[index, 'Description']}, Info Source: {self.df.loc[index, 'Info Source']}")
                    else:  # If no description is found on the page, use None
                        print(f"  Description for {search_term} not found on page.")
                        print("  Assigning None to DataFrame - Description: None, Info Source: None")  # DEBUG PRINT
                        self.df.loc[index, 'Description'] = None
                        self.df.loc[index, 'Info Source'] = None
                        print(f"  DataFrame updated - Description: {self.df.loc[index, 'Description']}, Info Source: {self.df.loc[index, 'Info Source']}")
            else:
                # Assign None for both when nothing is found
                print("  No URL found. Assigning None to DataFrame - Description: None, Info Source: None")  # DEBUG PRINT
                self.df.loc[index, 'Description'] = None
                self.df.loc[index, 'Info Source'] = None
                print(f"  DataFrame updated - Description: {self.df.loc[index, 'Description']}, Info Source: {self.df.loc[index, 'Info Source']}")

            print(f"  DataFrame before save (index {index}):")
            print(self.df.loc[index, ['Crop', 'Variety', 'Description', 'Info Source']])

            # Save the DataFrame to a distinctly named temporary CSV to confirm the row was updated correctly.
            temp_csv_path = f"{self.csv_path}.temp_{index}.csv"
            await self.async_save_csv(temp_csv_path, index)

            # Save after every row (main CSV)
            await self.async_save_csv(self.csv_path, index)
            self.processed_count += 1

            # Brief pause between saves to rule out filesystem timing effects
            await asyncio.sleep(0.1)

        print("Finished processing all rows.")

    async def get_page_content(self, url: str) -> str | None:
        """Visits the given webpage and returns up to the first 5,000 characters as markdown."""
        visit_tool = VisitWebpageTool()
        content = await asyncio.to_thread(visit_tool.forward, url)
        if isinstance(content, str):
            return content
        return None

    def verify_variety_in_content(self, content: str, variety: str) -> bool:
        """Verifies that the given variety name appears in the webpage content."""
        if not content:
            return False
        return variety.lower() in content.lower()

    def extract_description_and_urls(self, response: str) -> tuple[str, List[str]]:
        """Extracts the description and URLs from a response string."""
        urls = re.findall(r"(https?://\S+)", response)
        description = response.strip()
        return description, urls

    def filter_seed_company_url(self, urls: List[str]) -> str | None:
        """Filters URLs to find a seed company website."""
        seed_company_keywords = [
            "seed", "seeds", "seedcompany", "seed-company", "seedbank", "nursery", "grow", "garden",
            "botanical", "plant", "farm", "agri", "agriculture", "shop", "store"
        ]
        for url in urls:
            domain = urlparse(url).netloc.lower()
            if any(keyword in domain for keyword in seed_company_keywords):
                if not any(x in url.lower() for x in ["blog", "article"]):
                    return url
        return None

    async def check_description_with_url(self, search_term: str, description: str, url: str) -> str | None:
        """Checks the description against a single URL and returns an updated one if needed."""
        if not url:
            return None

        second_prompt = self.format_prompt(search_term, first_search=False).format(
            current_description=description, provided_url=url
        )
        try:
            second_response = self.agent.run(second_prompt)  # agent.run is synchronous, so no await
            print(f"  Secondary LLM response for {search_term}: {second_response}")
            if second_response and second_response.strip() != description.strip():
                return second_response.strip()
            return None
        except Exception as e:
            print(f"  Error during check_description_with_url for {search_term}: {e}")
            return None

    async def async_save_csv(self, path: str, index: int):
        try:
            # Back up the main CSV if it already exists
            if os.path.exists(path) and path == self.csv_path:
                backup_path = f"{path}.bak"
                try:
                    os.replace(path, backup_path)  # os.replace overwrites a stale backup on all platforms
                    print(f"  Existing CSV backed up to {backup_path} before async save.")
                except Exception as e:
                    print(f"  Error backing up file {path}: {e}")
                    print("  Please ensure that the file is not open in another program and that permissions are configured correctly.")

            # Use aiofiles to save the CSV asynchronously
            async with aiofiles.open(path, mode='w', encoding='utf-8') as f:
                await f.write(self.df.to_csv(index=False))
            print(f"  Async CSV file saved at {path} for index {index}")

        except Exception as e:
            print(f"  Error saving async CSV: {e}")
            print("  Please ensure that the file is not open in another program and that permissions are configured correctly.")


# Example usage
async def main():
    csv_path = "CSV_PATH"  # Replace with the path to your CSV.
    model_id = "Qwen/Qwen2.5-Coder-32B-Instruct"
    hf_token = "HF_TOKEN"  # Replace with your token.
    preferred_sources = [
        "exchange.seedsavers.org",
    ]
    agent = SeedDescriptionAgent(csv_path, model_id, hf_token, preferred_sources=preferred_sources)
    await agent.run()

if __name__ == "__main__":
    asyncio.run(main())
--------------------------------------------------------------------------------
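A minimal launcher sketch for running the agent without hard-coding credentials. The HF_TOKEN environment variable name and the seeds.csv path are illustrative assumptions, not part of the file above; it only assumes smolResearcher.py is on the import path.

# run_researcher.py -- hypothetical companion script
import asyncio
import os

from smolResearcher import SeedDescriptionAgent

async def main():
    agent = SeedDescriptionAgent(
        csv_path="seeds.csv",  # assumed CSV filename
        model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
        hf_token=os.environ["HF_TOKEN"],  # read the token from the environment (assumed var name)
        preferred_sources=["exchange.seedsavers.org"],
    )
    await agent.run()

if __name__ == "__main__":
    asyncio.run(main())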