├── .gitignore ├── images └── Screenshot 2024-12-17 220208.png ├── requirements.txt ├── .env ├── prompts ├── summarise_markdown_page.md ├── reviewer.md ├── summariser.md └── relevance_check.md ├── email_template.md ├── README.md └── email_script.py /.gitignore: -------------------------------------------------------------------------------- 1 | .conda/ 2 | *.md 3 | *.json 4 | -------------------------------------------------------------------------------- /images/Screenshot 2024-12-17 220208.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/whitew1994WW/email_research_assistant/HEAD/images/Screenshot 2024-12-17 220208.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | langchain-community 2 | langchain-core 3 | langchain-openai 4 | langgraph 5 | git+https://github.com/sendinblue/APIv3-python-library.git 6 | firecrawl-py 7 | beautifulsoup4 -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY=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 2 | DESTINATION_EMAIL=whitew1994@gmail.com 3 | SCRAPING_API_KEY=c0bFSYJ2aaJQGjz3E03iL8fFCXC6kIM6Da0QAq5I3z4vR4TzWVnDqBGAW60y33FqGRENnAo67ZS0jFlM9D 4 | SENDINGBLUE_API_KEY=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 5 | SERPER_API_KEY=2cfb0efb4ce760a4061a4234229967b41207dff5 6 | -------------------------------------------------------------------------------- /prompts/summarise_markdown_page.md: -------------------------------------------------------------------------------- 1 | # Role 2 | 3 | You are a detail-oriented researcher, reading 
online articles on behalf of a CTO with a technical background. 4 | 5 | # Task 6 | 7 | You will be given a markdown file of an online article. Your task is to read the article and provide a summary of the content, suitable for the CTO. 8 | 9 | # Output 10 | 11 | Markdown formatted text containing a detailed summary of the article, including relevant people, companies, and technologies mentioned. 12 | 13 | # Input 14 | 15 | {markdown_input} 16 | 17 | -------------------------------------------------------------------------------- /email_template.md: -------------------------------------------------------------------------------- 1 | # Big Company Updates 2 | 3 | ## Key Highlights 4 | 5 | - Highlight 1: 6 | Short description 7 | - Highlight 2 8 | Short description 9 | - Highlight 3 10 | Short description 11 | ... Up to 8 highlights 12 | 13 | ## Deeper Dive 14 | 15 | ### 1 16 | ...... 17 | 18 | ### 2 19 | Continue up to 8 20 | 21 | # Industry trends 22 | 23 | ## Key Highlights 24 | 25 | - Highlight 1: 26 | Short description 27 | - Highlight 2 28 | Short description 29 | - Highlight 3 30 | Short description 31 | ... Up to 8 highlights 32 | 33 | ## Deeper Dive 34 | 35 | ### 1 36 | ...... 37 | 38 | ### 2 39 | 40 | Continue up to 8 41 | 42 | -------------------------------------------------------------------------------- /prompts/reviewer.md: -------------------------------------------------------------------------------- 1 | # Role 2 | 3 | You are an editor acting in a document reviewing capacity, with an eye for detail and how readable and consumable the document is. 4 | 5 | You are reviewing the content for a CTO, who is interested in AI and the latest industry trends. 6 | 7 | # Task 8 | 9 | You are reviewing a daily email summary for the CTO that has been created by a research assistant. 
The general requirements for the email are: 10 | 11 | - Light hearted 12 | - Links included with each reference 13 | - Emojis included where appropriate 14 | - The email should have a high level summary at the top, but also further detail if the CTO should read the full article 15 | 16 | Be harsh, and provide feedback on the email, you have high standards. 17 | 18 | # Output Format 19 | 20 | You should provide feedback on the email, and provide suggestions for improvement. When you are happy with the email, you should indicate this by marking the email as "approved". 21 | -------------------------------------------------------------------------------- /prompts/summariser.md: -------------------------------------------------------------------------------- 1 | # Role 2 | 3 | You are a research assistant who is working for a busy CTO. 4 | 5 | # Task 6 | 7 | You are given a list of summaries of online articles, and their links, and you need to provide a summary email of the content. You need to make sure that the summary is comprehensive, light hearted, and easy to read. You should also include emojis to make it easier to read. You should also include links to the original articles. 8 | 9 | You also need to send a message to the reviewer, asking for feedback on the summary. The reviewer will decide if the summary is approved or not. If not, you will need to provide a new summary. 10 | 11 | # Output Format 12 | 13 | You need to provide an output summary in html format, following this markdown template (note that the template is in markdown format, but the output should be in html format): 14 | 15 | ```markdown 16 | {input_template} 17 | ``` 18 | 19 | The deep dive section should be significantly more detailed than the high level summary section. 
20 | 21 | # Input Summaries 22 | 23 | {list_of_summaries} -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Google Search Email Research Assistant - Deployed to Render 2 | 3 | Deploy an email research assistant to send you a daily summary of the latest AI research (or whatever you want). 4 | 5 | Required API keys: 6 | - Serper API key (for Google search https://serper.dev/) 7 | - Scraping API key (for scraping urls & rendering JS https://scrapingfish.com/) 8 | - Sendinblue API key (for sending emails https://app.brevo.com/) 9 | - OpenAI API key (for summarizing the results https://platform.openai.com) 10 | 11 | ## Setup for Render deployment 12 | 13 | 1. Point a render CRON job to this repository 14 | 2. Set the environment variables in the render dashboard as per the image below: 15 | 16 | ![Render Environment Variables](./images/Screenshot%202024-12-17%20220208.png) 17 | 18 | 3. Set the entry point to `python email_script.py` 19 | 20 | That's it! 21 | 22 | ## Setup for local development 23 | 24 | 1. Install the required packages `pip install -r requirements.txt` 25 | 2. Set the environment variables in the .env file 26 | 3. Run the script with `python email_script.py` 27 | 28 | -------------------------------------------------------------------------------- /prompts/relevance_check.md: -------------------------------------------------------------------------------- 1 | # Role 2 | 3 | You are a research assistant who is working for a busy CTO. 4 | 5 | # Context 6 | 7 | The CTO is trying to stay up to date with the latest news and information around particular topics. 8 | 9 | # Task 10 | 11 | You are searching through google, and trying to determine which links are potentially relevant and worth exploring further. 
12 | 13 | You will be asked to review search results and to determine if they are relevant to explore further, or if they are not relevant. 14 | 15 | You will be given a list of search results, and you will need to determine which 5 are most relevant, and detail why. 16 | 17 | # Example of what is relevant 18 | 19 | 20 | - Agentic AI updates from the large AI players: 21 | -- OpenAI 22 | -- Meta 23 | -- Perplexity 24 | -- Gemini 25 | -- Mistral 26 | -- Cursor 27 | - How people are using agents in industry, from LinkedIn, Twitter, etc. 28 | - News articles about AI agents. 29 | 30 | # Example of what is not relevant 31 | 32 | - Startup blogs that are likely for marketing purposes 33 | - Technical Blog that isn't an industry use case or update 34 | 35 | # Output 36 | 37 | You will need to output the ID of the 5 most relevant search results, and a short explanation for why they are relevant. 38 | 39 | # Input 40 | 41 | Search results: 42 | 43 | ```json 44 | {input_search_results} 45 | ``` 46 | 47 | -------------------------------------------------------------------------------- /email_script.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Email script for generating and sending AI research summaries. 4 | 5 | This script searches for AI-related content, summarizes it, and sends a daily email digest. 
6 | """ 7 | 8 | from __future__ import print_function 9 | import json 10 | import os 11 | import pathlib 12 | import re 13 | from typing import List, Dict, Any, Literal, Annotated 14 | 15 | import requests 16 | from bs4 import BeautifulSoup 17 | from langchain.schema import HumanMessage, AIMessage 18 | from langchain_core.prompts import ChatPromptTemplate 19 | from langchain_openai import ChatOpenAI 20 | from langgraph.graph import StateGraph, START, END 21 | from langgraph.graph.message import add_messages 22 | from pydantic import BaseModel, Field 23 | from typing_extensions import TypedDict 24 | import sib_api_v3_sdk 25 | from sib_api_v3_sdk.rest import ApiException 26 | 27 | 28 | # Configuration 29 | SEARCH_TERMS = [ 30 | "Agentic AI", 31 | "OpenAI LinkedIn", 32 | "Perplexity LinkedIn", 33 | "Meta AI LinkedIn", 34 | "Anthropic LinkedIn" 35 | ] 36 | 37 | required_environment_variables = [ 38 | "SERPER_API_KEY", 39 | "SCRAPING_API_KEY", 40 | "SENDINGBLUE_API_KEY", 41 | "OPENAI_API_KEY" 42 | ] 43 | 44 | def validate_environment_variables(): 45 | """Validate environment variables.""" 46 | for var in required_environment_variables: 47 | if os.getenv(var) is None: 48 | raise ValueError(f"Environment variable {var} is not set") 49 | 50 | class ResultRelevance(BaseModel): 51 | """Model for storing relevance check results.""" 52 | explanation: str 53 | id: str 54 | 55 | 56 | class RelevanceCheckOutput(BaseModel): 57 | """Model for storing all relevant results.""" 58 | relevant_results: List[ResultRelevance] 59 | 60 | 61 | class State(TypedDict): 62 | """State management for the LangGraph workflow.""" 63 | messages: Annotated[list, add_messages] 64 | summaries: List[dict] 65 | approved: bool 66 | created_summaries: Annotated[List[dict], Field(description="The summaries created by the summariser")] 67 | email_template: str 68 | 69 | 70 | class SummariserOutput(BaseModel): 71 | """Output format for the summarizer.""" 72 | email_summary: str = Field(description="The summary 
email of the content") 73 | message: str = Field(description="A message to the reviewer requesting feedback") 74 | 75 | 76 | class ReviewerOutput(BaseModel): 77 | """Output format for the reviewer.""" 78 | approved: bool = Field(description="Whether the summary is approved") 79 | message: str = Field(description="Feedback message from the reviewer") 80 | 81 | 82 | def search_serper(search_query: str) -> List[Dict[str, Any]]: 83 | """ 84 | Search Google using the Serper API. 85 | 86 | Args: 87 | search_query: The search term to query 88 | 89 | Returns: 90 | List of search results with title, link, snippet, etc. 91 | """ 92 | url = "https://google.serper.dev/search" 93 | 94 | payload = json.dumps({ 95 | "q": search_query, 96 | "gl": "gb", 97 | "num": 20, 98 | "tbs": "qdr:d" 99 | }) 100 | 101 | headers = { 102 | 'X-API-KEY': os.getenv("SERPER_API_KEY"), 103 | 'Content-Type': 'application/json' 104 | } 105 | 106 | response = requests.post(url, headers=headers, data=payload) 107 | results = response.json() 108 | if 'organic' not in results: 109 | raise ValueError(f"No organic results found in results {results} for search query {search_query}") 110 | results_list = results['organic'] 111 | 112 | return [ 113 | { 114 | 'title': result['title'], 115 | 'link': result['link'], 116 | 'snippet': result['snippet'], 117 | 'search_term': search_query, 118 | 'id': idx 119 | } 120 | for idx, result in enumerate(results_list, 1) 121 | ] 122 | 123 | 124 | 125 | def load_prompt(prompt_name: str) -> str: 126 | """Load a prompt template from file.""" 127 | with open(f"prompts/{prompt_name}.md", "r") as file: 128 | return file.read() 129 | 130 | 131 | def check_search_relevance(search_results: Dict[str, Any]) -> RelevanceCheckOutput: 132 | """ 133 | Analyze search results and determine the most relevant ones. 
134 | 135 | Args: 136 | search_results: Dictionary containing search results to analyze 137 | 138 | Returns: 139 | RelevanceCheckOutput containing the most relevant results and explanation 140 | """ 141 | prompt = load_prompt("relevance_check") 142 | prompt_template = ChatPromptTemplate.from_messages([("system", prompt)]) 143 | llm = ChatOpenAI(model="gpt-4o").with_structured_output(RelevanceCheckOutput) 144 | 145 | return (prompt_template | llm).invoke({'input_search_results': search_results}) 146 | 147 | 148 | def convert_html_to_markdown(html_content: str) -> str: 149 | """Convert HTML content to markdown format.""" 150 | soup = BeautifulSoup(html_content, 'html.parser') 151 | 152 | # Convert headers 153 | for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']): 154 | level = int(h.name[1]) 155 | h.replace_with('#' * level + ' ' + h.get_text() + '\n\n') 156 | 157 | # Convert links 158 | for a in soup.find_all('a'): 159 | href = a.get('href', '') 160 | text = a.get_text() 161 | if href and text: 162 | a.replace_with(f'[{text}]({href})') 163 | 164 | # Convert formatting 165 | for tag, marker in [ 166 | (['b', 'strong'], '**'), 167 | (['i', 'em'], '*') 168 | ]: 169 | for element in soup.find_all(tag): 170 | element.replace_with(f'{marker}{element.get_text()}{marker}') 171 | 172 | # Convert lists 173 | for ul in soup.find_all('ul'): 174 | for li in ul.find_all('li'): 175 | li.replace_with(f'- {li.get_text()}\n') 176 | 177 | for ol in soup.find_all('ol'): 178 | for i, li in enumerate(ol.find_all('li'), 1): 179 | li.replace_with(f'{i}. {li.get_text()}\n') 180 | 181 | # Clean up text 182 | text = soup.get_text() 183 | return re.sub(r'\n\s*\n', '\n\n', text).strip() 184 | 185 | 186 | def scrape_and_save_markdown(relevant_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 187 | """ 188 | Scrape HTML content from URLs and save as markdown. 
189 | 190 | Args: 191 | relevant_results: List of dictionaries containing search results 192 | 193 | Returns: 194 | List of dictionaries containing markdown content and metadata 195 | """ 196 | pathlib.Path("scraped_markdown").mkdir(exist_ok=True) 197 | markdown_contents = [] 198 | 199 | for result in relevant_results: 200 | if 'link' not in result: 201 | continue 202 | 203 | payload = { 204 | "api_key": os.getenv("SCRAPING_API_KEY"), 205 | "url": result['link'], 206 | "render_js": "true" 207 | } 208 | 209 | response = requests.get("https://scraping.narf.ai/api/v1/", params=payload) 210 | if response.status_code != 200: 211 | print(f"Failed to fetch {result['link']}: Status code {response.status_code}") 212 | continue 213 | 214 | filename = f"{result.get('id', hash(result['link']))}.md" 215 | filepath = os.path.join("scraped_markdown", filename) 216 | 217 | markdown_content = convert_html_to_markdown(response.content.decode()) 218 | 219 | with open(filepath, 'w', encoding='utf-8') as f: 220 | f.write(markdown_content) 221 | 222 | markdown_contents.append({ 223 | 'url': result['link'], 224 | 'filepath': filepath, 225 | 'markdown': markdown_content, 226 | 'title': result.get('title', ''), 227 | 'id': result.get('id', '') 228 | }) 229 | 230 | print(f"Successfully downloaded and saved {len(markdown_contents)} pages as markdown") 231 | return markdown_contents 232 | 233 | 234 | def generate_summaries(markdown_contents: List[Dict[str, Any]]) -> List[Dict[str, str]]: 235 | """ 236 | Generate summaries for markdown content using gpt-4o. 
237 | 238 | Args: 239 | markdown_contents: List of dictionaries containing markdown content 240 | 241 | Returns: 242 | List of dictionaries containing summaries and URLs 243 | """ 244 | pathlib.Path("markdown_summaries").mkdir(exist_ok=True) 245 | summary_prompt = load_prompt("summarise_markdown_page") 246 | summary_template = ChatPromptTemplate.from_messages([("system", summary_prompt)]) 247 | llm = ChatOpenAI(model="gpt-4o") 248 | summary_chain = summary_template | llm 249 | 250 | summaries = [] 251 | for content in markdown_contents: 252 | try: 253 | summary = summary_chain.invoke({ 254 | 'markdown_input': ' '.join(content['markdown'].split()[:2000]) 255 | }) 256 | 257 | summary_filename = f"summary_{content['id']}.md" 258 | summary_filepath = os.path.join("markdown_summaries", summary_filename) 259 | 260 | with open(summary_filepath, 'w', encoding='utf-8') as f: 261 | f.write(summary.content) 262 | 263 | summaries.append({ 264 | 'markdown_summary': summary.content, 265 | 'url': content['url'] 266 | }) 267 | 268 | except Exception as e: 269 | print(f"Failed to summarize {content['filepath']}: {str(e)}") 270 | 271 | print(f"Successfully generated {len(summaries)} summaries") 272 | return summaries 273 | 274 | 275 | def summariser(state: State) -> Dict: 276 | """Generate email summary from the state.""" 277 | summariser_output = llm_summariser.invoke({ 278 | "messages": state["messages"], 279 | "list_of_summaries": state["summaries"], 280 | "input_template": state["email_template"] 281 | }) 282 | new_messages = [ 283 | AIMessage(content=summariser_output.email_summary), 284 | AIMessage(content=summariser_output.message) 285 | ] 286 | return { 287 | "messages": new_messages, 288 | "created_summaries": [summariser_output.email_summary] 289 | } 290 | 291 | 292 | def reviewer(state: State) -> Dict: 293 | """Review the generated summary.""" 294 | converted_messages = [ 295 | HumanMessage(content=msg.content) if isinstance(msg, AIMessage) 296 | else 
AIMessage(content=msg.content) if isinstance(msg, HumanMessage) 297 | else msg 298 | for msg in state["messages"] 299 | ] 300 | 301 | state["messages"] = converted_messages 302 | reviewer_output = llm_reviewer.invoke({"messages": state["messages"]}) 303 | 304 | return { 305 | "messages": [HumanMessage(content=reviewer_output.message)], 306 | "approved": reviewer_output.approved 307 | } 308 | 309 | 310 | def conditional_edge(state: State) -> Literal["summariser", END]: 311 | """Determine next step based on approval status.""" 312 | return END if state["approved"] else "summariser" 313 | 314 | 315 | def send_email(email_content: str): 316 | """Send email using Sendinblue API.""" 317 | configuration = sib_api_v3_sdk.Configuration() 318 | configuration.api_key['api-key'] = os.getenv("SENDINGBLUE_API_KEY") 319 | 320 | api_instance = sib_api_v3_sdk.TransactionalEmailsApi(sib_api_v3_sdk.ApiClient(configuration)) 321 | 322 | email_params = { 323 | "subject": "Daily AI Research Summary", 324 | "sender": {"name": "Will White", "email": os.getenv("DESTINATION_EMAIL")}, 325 | "html_content": email_content, 326 | "to": [{"email": os.getenv("DESTINATION_EMAIL"), "name": "Will White"}], 327 | "params": {"subject": "Daily AI Research Summary"} 328 | } 329 | 330 | try: 331 | api_response = api_instance.send_transac_email( 332 | sib_api_v3_sdk.SendSmtpEmail(**email_params) 333 | ) 334 | print(api_response) 335 | except ApiException as e: 336 | print(f"Failed to send email: {e}") 337 | 338 | 339 | def main(): 340 | """Main execution flow.""" 341 | try: 342 | validate_environment_variables() 343 | except ValueError as e: 344 | with open(".env", "w") as f: 345 | # Load environment variables from .env file 346 | for line in f: 347 | if '=' in line: 348 | key, value = line.strip().split('=', 1) 349 | os.environ[key] = value 350 | print("Loaded environment variables from .env file") 351 | 352 | # Search and filter results 353 | relevant_results = [] 354 | for search_term in SEARCH_TERMS: 
355 | results = search_serper(search_term) 356 | filtered_results = check_search_relevance(results) 357 | relevant_ids = [r.id for r in filtered_results.relevant_results] 358 | filtered_results = [r for r in results if str(r['id']) in relevant_ids] 359 | relevant_results.extend(filtered_results) 360 | 361 | # Process content 362 | markdown_contents = scrape_and_save_markdown(relevant_results) 363 | summaries = generate_summaries(markdown_contents) 364 | 365 | # Set up LLM workflow 366 | llm = ChatOpenAI(model="gpt-4o") 367 | 368 | with open("email_template.md", "r") as f: 369 | email_template = f.read() 370 | 371 | summariser_prompt = ChatPromptTemplate.from_messages([ 372 | ("system", load_prompt("summariser")), 373 | ("placeholder", "{messages}"), 374 | ]) 375 | 376 | reviewer_prompt = ChatPromptTemplate.from_messages([ 377 | ("system", load_prompt("reviewer")), 378 | ("placeholder", "{messages}"), 379 | ]) 380 | 381 | global llm_summariser, llm_reviewer 382 | llm_summariser = summariser_prompt | llm.with_structured_output(SummariserOutput) 383 | llm_reviewer = reviewer_prompt | llm.with_structured_output(ReviewerOutput) 384 | 385 | # Configure and run graph 386 | graph_builder = StateGraph(State) 387 | graph_builder.add_node("summariser", summariser) 388 | graph_builder.add_node("reviewer", reviewer) 389 | graph_builder.add_edge(START, "summariser") 390 | graph_builder.add_edge("summariser", "reviewer") 391 | graph_builder.add_conditional_edges('reviewer', conditional_edge) 392 | 393 | graph = graph_builder.compile() 394 | output = graph.invoke({"summaries": summaries, "email_template": email_template}) 395 | 396 | # Send final email 397 | send_email(output["created_summaries"][-1]) 398 | 399 | 400 | if __name__ == "__main__": 401 | main() 402 | --------------------------------------------------------------------------------