├── .gitignore ├── images └── Screenshot 2024-12-17 220208.png ├── requirements.txt ├── .env ├── prompts ├── summarise_markdown_page.md ├── reviewer.md ├── summariser.md └── relevance_check.md ├── email_template.md ├── README.md └── email_script.py /.gitignore: -------------------------------------------------------------------------------- 1 | .conda/ 2 | *.md 3 | *.json 4 | -------------------------------------------------------------------------------- /images/Screenshot 2024-12-17 220208.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whitew1994WW/email_research_assistant/HEAD/images/Screenshot 2024-12-17 220208.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | langchain-community 2 | langchain-core 3 | langchain-openai 4 | langgraph 5 | git+https://github.com/sendinblue/APIv3-python-library.git 6 | firecrawl-py 7 | beautifulsoup4 -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY=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 2 | DESTINATION_EMAIL=whitew1994@gmail.com 3 | SCRAPING_API_KEY=c0bFSYJ2aaJQGjz3E03iL8fFCXC6kIM6Da0QAq5I3z4vR4TzWVnDqBGAW60y33FqGRENnAo67ZS0jFlM9D 4 | SENDINGBLUE_API_KEY=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 5 | SERPER_API_KEY=2cfb0efb4ce760a4061a4234229967b41207dff5 6 | -------------------------------------------------------------------------------- /prompts/summarise_markdown_page.md: -------------------------------------------------------------------------------- 1 | # Role 2 | 3 | You are a detail-oriented researcher, reading 
online articles on behalf of a CTO with a technical background. 4 | 5 | # Task 6 | 7 | You will be given a markdown file of an online article. Your task is to read the article and provide a summary of the content, suitable for the CTO. 8 | 9 | # Output 10 | 11 | Markdown formatted text containing a detailed summary of the article, including relevant people, companies, and technologies mentioned. 12 | 13 | # Input 14 | 15 | {markdown_input} 16 | 17 | -------------------------------------------------------------------------------- /email_template.md: -------------------------------------------------------------------------------- 1 | # Big Company Updates 2 | 3 | ## Key Highlights 4 | 5 | - Highlight 1: 6 | Short description 7 | - Highlight 2 8 | Short description 9 | - Highlight 3 10 | Short description 11 | ... Up to 8 highlights 12 | 13 | ## Deeper Dive 14 | 15 | ### 1 16 | ...... 17 | 18 | ### 2 19 | Continue up to 8 20 | 21 | # Industry trends 22 | 23 | ## Key Highlights 24 | 25 | - Highlight 1: 26 | Short description 27 | - Highlight 2 28 | Short description 29 | - Highlight 3 30 | Short description 31 | ... Up to 8 highlights 32 | 33 | ## Deeper Dive 34 | 35 | ### 1 36 | ...... 37 | 38 | ### 2 39 | 40 | Continue up to 8 41 | 42 | -------------------------------------------------------------------------------- /prompts/reviewer.md: -------------------------------------------------------------------------------- 1 | # Role 2 | 3 | You are an editor acting in a document reviewing capacity, with an eye for detail and how readable and consumable the document is. 4 | 5 | You are reviewing the content for a CTO, who is interested in AI and the latest industry trends. 6 | 7 | # Task 8 | 9 | You are reviewing a daily email summary for the CTO that has been created by a research assistant. 
The general requirements for the email are: 10 | 11 | - Light hearted 12 | - Links included with each reference 13 | - Emojis included where appropriate 14 | - The email should have a high level summary at the top, but also further detail if the CTO should read the full article 15 | 16 | Be harsh, and provide feedback on the email, you have high standards. 17 | 18 | # Output Format 19 | 20 | You should provide feedback on the email, and provide suggestions for improvement. When you are happy with the email, you should indicate this by marking the email as "approved". 21 | -------------------------------------------------------------------------------- /prompts/summariser.md: -------------------------------------------------------------------------------- 1 | # Role 2 | 3 | You are a research assistant who is working for a busy CTO. 4 | 5 | # Task 6 | 7 | You are given a list of summaries of online articles, and their links, and you need to provide a summary email of the content. You need to make sure that the summary is comprehensive, light hearted, and easy to read. You should also include emojis to make it easier to read. You should also include links to the original articles. 8 | 9 | You also need to send a message to the reviewer, asking for feedback on the summary. The reviewer will decide if the summary is approved or not. If not, you will need to provide a new summary. 10 | 11 | # Output Format 12 | 13 | You need to provide an output summary in html format, following this markdown template (note that the template is in markdown format, but the output should be in html format): 14 | 15 | ```markdown 16 | {input_template} 17 | ``` 18 | 19 | The deep dive section should be significantly more detailed than the high level summary section. 
20 | 21 | # Input Summaries 22 | 23 | {list_of_summaries} -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Google Search Email Research Assistant - Deployed to Render 2 | 3 | Deploy an email research assistant to send you a daily summary of the latest AI research (or whatever you want). 4 | 5 | Required API keys: 6 | - Serper API key (for Google search https://serper.dev/) 7 | - Scraping API key (for scraping urls & rendering JS https://scrapingfish.com/) 8 | - Sendinblue API key (for sending emails https://app.brevo.com/) 9 | - OpenAI API key (for summarizing the results https://platform.openai.com) 10 | 11 | ## Setup for Render deployment 12 | 13 | 1. Point a render CRON job to this repository 14 | 2. Set the environment variables in the render dashboard as per the image below: 15 | 16 | ![Render Environment Variables](./images/Screenshot%202024-12-17%20220208.png) 17 | 18 | 3. Set the entry point to `python email_script.py` 19 | 20 | That's it! 21 | 22 | ## Setup for local development 23 | 24 | 1. Install the required packages `pip install -r requirements.txt` 25 | 2. Set the environment variables in the .env file 26 | 3. Run the script with `python email_script.py` 27 | 28 | -------------------------------------------------------------------------------- /prompts/relevance_check.md: -------------------------------------------------------------------------------- 1 | # Role 2 | 3 | You are a research assistant who is working for a busy CTO. 4 | 5 | # Context 6 | 7 | The CTO is trying to stay up to date with the latest news and information around particular topics. 8 | 9 | # Task 10 | 11 | You are searching through google, and trying to determine which links are potentially relevant and worth exploring further. 
12 | 13 | You will be asked to review search results and to determine if they are relevant to explore further, or if they are not relevant. 14 | 15 | You will be given a list of search results, and you will need to determine which 5 are most relevant, and detail why. 16 | 17 | # Example of what is relevant 18 | 19 | 20 | - Agentic AI updates from the large AI players: 21 | -- OpenAI 22 | -- Meta 23 | -- Perplexity 24 | -- Gemini 25 | -- Mistral 26 | -- Cursor 27 | - How people are using agents in industry, from LinkedIn, Twitter, etc. 28 | - News articles about AI agents. 29 | 30 | # Example of what is not relevant 31 | 32 | - Startup blogs that are likely for marketing purposes 33 | - Technical Blog that isn't an industry use case or update 34 | 35 | # Output 36 | 37 | You will need to output the ID of the 5 most relevant search results, and a short explanation for why they are relevant. 38 | 39 | # Input 40 | 41 | Search results: 42 | 43 | ```json 44 | {input_search_results} 45 | ``` 46 | 47 | -------------------------------------------------------------------------------- /email_script.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Email script for generating and sending AI research summaries. 4 | 5 | This script searches for AI-related content, summarizes it, and sends a daily email digest. 
6 | """ 7 | 8 | from __future__ import print_function 9 | import json 10 | import os 11 | import pathlib 12 | import re 13 | from typing import List, Dict, Any, Literal, Annotated 14 | 15 | import requests 16 | from bs4 import BeautifulSoup 17 | from langchain.schema import HumanMessage, AIMessage 18 | from langchain_core.prompts import ChatPromptTemplate 19 | from langchain_openai import ChatOpenAI 20 | from langgraph.graph import StateGraph, START, END 21 | from langgraph.graph.message import add_messages 22 | from pydantic import BaseModel, Field 23 | from typing_extensions import TypedDict 24 | import sib_api_v3_sdk 25 | from sib_api_v3_sdk.rest import ApiException 26 | 27 | 28 | # Configuration 29 | SEARCH_TERMS = [ 30 | "Agentic AI", 31 | "OpenAI LinkedIn", 32 | "Perplexity LinkedIn", 33 | "Meta AI LinkedIn", 34 | "Anthropic LinkedIn" 35 | ] 36 | 37 | required_environment_variables = [ 38 | "SERPER_API_KEY", 39 | "SCRAPING_API_KEY", 40 | "SENDINGBLUE_API_KEY", 41 | "OPENAI_API_KEY" 42 | ] 43 | 44 | def validate_environment_variables(): 45 | """Validate environment variables.""" 46 | for var in required_environment_variables: 47 | if os.getenv(var) is None: 48 | raise ValueError(f"Environment variable {var} is not set") 49 | 50 | class ResultRelevance(BaseModel): 51 | """Model for storing relevance check results.""" 52 | explanation: str 53 | id: str 54 | 55 | 56 | class RelevanceCheckOutput(BaseModel): 57 | """Model for storing all relevant results.""" 58 | relevant_results: List[ResultRelevance] 59 | 60 | 61 | class State(TypedDict): 62 | """State management for the LangGraph workflow.""" 63 | messages: Annotated[list, add_messages] 64 | summaries: List[dict] 65 | approved: bool 66 | created_summaries: Annotated[List[dict], Field(description="The summaries created by the summariser")] 67 | email_template: str 68 | 69 | 70 | class SummariserOutput(BaseModel): 71 | """Output format for the summarizer.""" 72 | email_summary: str = Field(description="The summary 
email of the content") 73 | message: str = Field(description="A message to the reviewer requesting feedback") 74 | 75 | 76 | class ReviewerOutput(BaseModel): 77 | """Output format for the reviewer.""" 78 | approved: bool = Field(description="Whether the summary is approved") 79 | message: str = Field(description="Feedback message from the reviewer") 80 | 81 | 82 | def search_serper(search_query: str) -> List[Dict[str, Any]]: 83 | """ 84 | Search Google using the Serper API. 85 | 86 | Args: 87 | search_query: The search term to query 88 | 89 | Returns: 90 | List of search results with title, link, snippet, etc. 91 | """ 92 | url = "https://google.serper.dev/search" 93 | 94 | payload = json.dumps({ 95 | "q": search_query, 96 | "gl": "gb", 97 | "num": 20, 98 | "tbs": "qdr:d" 99 | }) 100 | 101 | headers = { 102 | 'X-API-KEY': os.getenv("SERPER_API_KEY"), 103 | 'Content-Type': 'application/json' 104 | } 105 | 106 | response = requests.post(url, headers=headers, data=payload) 107 | results = response.json() 108 | if 'organic' not in results: 109 | raise ValueError(f"No organic results found in results {results} for search query {search_query}") 110 | results_list = results['organic'] 111 | 112 | return [ 113 | { 114 | 'title': result['title'], 115 | 'link': result['link'], 116 | 'snippet': result['snippet'], 117 | 'search_term': search_query, 118 | 'id': idx 119 | } 120 | for idx, result in enumerate(results_list, 1) 121 | ] 122 | 123 | 124 | 125 | def load_prompt(prompt_name: str) -> str: 126 | """Load a prompt template from file.""" 127 | with open(f"prompts/{prompt_name}.md", "r") as file: 128 | return file.read() 129 | 130 | 131 | def check_search_relevance(search_results: Dict[str, Any]) -> RelevanceCheckOutput: 132 | """ 133 | Analyze search results and determine the most relevant ones. 
134 | 135 | Args: 136 | search_results: Dictionary containing search results to analyze 137 | 138 | Returns: 139 | RelevanceCheckOutput containing the most relevant results and explanation 140 | """ 141 | prompt = load_prompt("relevance_check") 142 | prompt_template = ChatPromptTemplate.from_messages([("system", prompt)]) 143 | llm = ChatOpenAI(model="gpt-4o").with_structured_output(RelevanceCheckOutput) 144 | 145 | return (prompt_template | llm).invoke({'input_search_results': search_results}) 146 | 147 | 148 | def convert_html_to_markdown(html_content: str) -> str: 149 | """Convert HTML content to markdown format.""" 150 | soup = BeautifulSoup(html_content, 'html.parser') 151 | 152 | # Convert headers 153 | for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']): 154 | level = int(h.name[1]) 155 | h.replace_with('#' * level + ' ' + h.get_text() + '\n\n') 156 | 157 | # Convert links 158 | for a in soup.find_all('a'): 159 | href = a.get('href', '') 160 | text = a.get_text() 161 | if href and text: 162 | a.replace_with(f'[{text}]({href})') 163 | 164 | # Convert formatting 165 | for tag, marker in [ 166 | (['b', 'strong'], '**'), 167 | (['i', 'em'], '*') 168 | ]: 169 | for element in soup.find_all(tag): 170 | element.replace_with(f'{marker}{element.get_text()}{marker}') 171 | 172 | # Convert lists 173 | for ul in soup.find_all('ul'): 174 | for li in ul.find_all('li'): 175 | li.replace_with(f'- {li.get_text()}\n') 176 | 177 | for ol in soup.find_all('ol'): 178 | for i, li in enumerate(ol.find_all('li'), 1): 179 | li.replace_with(f'{i}. {li.get_text()}\n') 180 | 181 | # Clean up text 182 | text = soup.get_text() 183 | return re.sub(r'\n\s*\n', '\n\n', text).strip() 184 | 185 | 186 | def scrape_and_save_markdown(relevant_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 187 | """ 188 | Scrape HTML content from URLs and save as markdown. 
189 | 190 | Args: 191 | relevant_results: List of dictionaries containing search results 192 | 193 | Returns: 194 | List of dictionaries containing markdown content and metadata 195 | """ 196 | pathlib.Path("scraped_markdown").mkdir(exist_ok=True) 197 | markdown_contents = [] 198 | 199 | for result in relevant_results: 200 | if 'link' not in result: 201 | continue 202 | 203 | payload = { 204 | "api_key": os.getenv("SCRAPING_API_KEY"), 205 | "url": result['link'], 206 | "render_js": "true" 207 | } 208 | 209 | response = requests.get("https://scraping.narf.ai/api/v1/", params=payload) 210 | if response.status_code != 200: 211 | print(f"Failed to fetch {result['link']}: Status code {response.status_code}") 212 | continue 213 | 214 | filename = f"{result.get('id', hash(result['link']))}.md" 215 | filepath = os.path.join("scraped_markdown", filename) 216 | 217 | markdown_content = convert_html_to_markdown(response.content.decode()) 218 | 219 | with open(filepath, 'w', encoding='utf-8') as f: 220 | f.write(markdown_content) 221 | 222 | markdown_contents.append({ 223 | 'url': result['link'], 224 | 'filepath': filepath, 225 | 'markdown': markdown_content, 226 | 'title': result.get('title', ''), 227 | 'id': result.get('id', '') 228 | }) 229 | 230 | print(f"Successfully downloaded and saved {len(markdown_contents)} pages as markdown") 231 | return markdown_contents 232 | 233 | 234 | def generate_summaries(markdown_contents: List[Dict[str, Any]]) -> List[Dict[str, str]]: 235 | """ 236 | Generate summaries for markdown content using gpt-4o. 
237 | 238 | Args: 239 | markdown_contents: List of dictionaries containing markdown content 240 | 241 | Returns: 242 | List of dictionaries containing summaries and URLs 243 | """ 244 | pathlib.Path("markdown_summaries").mkdir(exist_ok=True) 245 | summary_prompt = load_prompt("summarise_markdown_page") 246 | summary_template = ChatPromptTemplate.from_messages([("system", summary_prompt)]) 247 | llm = ChatOpenAI(model="gpt-4o") 248 | summary_chain = summary_template | llm 249 | 250 | summaries = [] 251 | for content in markdown_contents: 252 | try: 253 | summary = summary_chain.invoke({ 254 | 'markdown_input': ' '.join(content['markdown'].split()[:2000]) 255 | }) 256 | 257 | summary_filename = f"summary_{content['id']}.md" 258 | summary_filepath = os.path.join("markdown_summaries", summary_filename) 259 | 260 | with open(summary_filepath, 'w', encoding='utf-8') as f: 261 | f.write(summary.content) 262 | 263 | summaries.append({ 264 | 'markdown_summary': summary.content, 265 | 'url': content['url'] 266 | }) 267 | 268 | except Exception as e: 269 | print(f"Failed to summarize {content['filepath']}: {str(e)}") 270 | 271 | print(f"Successfully generated {len(summaries)} summaries") 272 | return summaries 273 | 274 | 275 | def summariser(state: State) -> Dict: 276 | """Generate email summary from the state.""" 277 | summariser_output = llm_summariser.invoke({ 278 | "messages": state["messages"], 279 | "list_of_summaries": state["summaries"], 280 | "input_template": state["email_template"] 281 | }) 282 | new_messages = [ 283 | AIMessage(content=summariser_output.email_summary), 284 | AIMessage(content=summariser_output.message) 285 | ] 286 | return { 287 | "messages": new_messages, 288 | "created_summaries": [summariser_output.email_summary] 289 | } 290 | 291 | 292 | def reviewer(state: State) -> Dict: 293 | """Review the generated summary.""" 294 | converted_messages = [ 295 | HumanMessage(content=msg.content) if isinstance(msg, AIMessage) 296 | else 
AIMessage(content=msg.content) if isinstance(msg, HumanMessage) 297 | else msg 298 | for msg in state["messages"] 299 | ] 300 | 301 | state["messages"] = converted_messages 302 | reviewer_output = llm_reviewer.invoke({"messages": state["messages"]}) 303 | 304 | return { 305 | "messages": [HumanMessage(content=reviewer_output.message)], 306 | "approved": reviewer_output.approved 307 | } 308 | 309 | 310 | def conditional_edge(state: State) -> Literal["summariser", END]: 311 | """Determine next step based on approval status.""" 312 | return END if state["approved"] else "summariser" 313 | 314 | 315 | def send_email(email_content: str): 316 | """Send email using Sendinblue API.""" 317 | configuration = sib_api_v3_sdk.Configuration() 318 | configuration.api_key['api-key'] = os.getenv("SENDINGBLUE_API_KEY") 319 | 320 | api_instance = sib_api_v3_sdk.TransactionalEmailsApi(sib_api_v3_sdk.ApiClient(configuration)) 321 | 322 | email_params = { 323 | "subject": "Daily AI Research Summary", 324 | "sender": {"name": "Will White", "email": os.getenv("DESTINATION_EMAIL")}, 325 | "html_content": email_content, 326 | "to": [{"email": os.getenv("DESTINATION_EMAIL"), "name": "Will White"}], 327 | "params": {"subject": "Daily AI Research Summary"} 328 | } 329 | 330 | try: 331 | api_response = api_instance.send_transac_email( 332 | sib_api_v3_sdk.SendSmtpEmail(**email_params) 333 | ) 334 | print(api_response) 335 | except ApiException as e: 336 | print(f"Failed to send email: {e}") 337 | 338 | 339 | def main(): 340 | """Main execution flow.""" 341 | try: 342 | validate_environment_variables() 343 | except ValueError as e: 344 | with open(".env", "w") as f: 345 | # Load environment variables from .env file 346 | for line in f: 347 | if '=' in line: 348 | key, value = line.strip().split('=', 1) 349 | os.environ[key] = value 350 | print("Loaded environment variables from .env file") 351 | 352 | # Search and filter results 353 | relevant_results = [] 354 | for search_term in SEARCH_TERMS: 
355 | results = search_serper(search_term) 356 | filtered_results = check_search_relevance(results) 357 | relevant_ids = [r.id for r in filtered_results.relevant_results] 358 | filtered_results = [r for r in results if str(r['id']) in relevant_ids] 359 | relevant_results.extend(filtered_results) 360 | 361 | # Process content 362 | markdown_contents = scrape_and_save_markdown(relevant_results) 363 | summaries = generate_summaries(markdown_contents) 364 | 365 | # Set up LLM workflow 366 | llm = ChatOpenAI(model="gpt-4o") 367 | 368 | with open("email_template.md", "r") as f: 369 | email_template = f.read() 370 | 371 | summariser_prompt = ChatPromptTemplate.from_messages([ 372 | ("system", load_prompt("summariser")), 373 | ("placeholder", "{messages}"), 374 | ]) 375 | 376 | reviewer_prompt = ChatPromptTemplate.from_messages([ 377 | ("system", load_prompt("reviewer")), 378 | ("placeholder", "{messages}"), 379 | ]) 380 | 381 | global llm_summariser, llm_reviewer 382 | llm_summariser = summariser_prompt | llm.with_structured_output(SummariserOutput) 383 | llm_reviewer = reviewer_prompt | llm.with_structured_output(ReviewerOutput) 384 | 385 | # Configure and run graph 386 | graph_builder = StateGraph(State) 387 | graph_builder.add_node("summariser", summariser) 388 | graph_builder.add_node("reviewer", reviewer) 389 | graph_builder.add_edge(START, "summariser") 390 | graph_builder.add_edge("summariser", "reviewer") 391 | graph_builder.add_conditional_edges('reviewer', conditional_edge) 392 | 393 | graph = graph_builder.compile() 394 | output = graph.invoke({"summaries": summaries, "email_template": email_template}) 395 | 396 | # Send final email 397 | send_email(output["created_summaries"][-1]) 398 | 399 | 400 | if __name__ == "__main__": 401 | main() 402 | --------------------------------------------------------------------------------