├── .gitattributes
├── .gitignore
├── app.py
├── custom_tools.py
├── email_cleaning.py
├── extract_faq.py
└── mbox_to_csv.py


/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | .env
3 | __pycache__/custom_tools.cpython-310.pyc
4 | email_pairs.csv
5 | faq.csv
6 | past_email_mbox.csv
7 | Sent.mbox
8 | 


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
 1 | from dotenv import find_dotenv, load_dotenv
 2 | 
 3 | from langchain.agents import initialize_agent
 4 | from langchain.agents import AgentType
 5 | from langchain.chat_models import ChatOpenAI
 6 | from langchain.prompts import MessagesPlaceholder
 7 | from langchain.memory import ConversationSummaryBufferMemory
 8 | from langchain.chains.summarize import load_summarize_chain
 9 | from langchain.schema import SystemMessage
10 | from custom_tools import CreateEmailDraftTool, GenerateEmailResponseTool, ReplyEmailTool, EscalateTool, ProspectResearchTool, CategoriseEmailTool
11 | 
# Load variables from a .env file into the process environment before any
# client is constructed.
load_dotenv()
# Chat model used both by the agent and by its summarising memory below.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")

# System prompt handed to the agent through agent_kwargs.
system_message = SystemMessage(
    content="""
    You are an email inbox assistant of an AI youtube channel called "AI Jason", 
    who is creating AI educational content, 
    Your goal is to handle all the incoming emails by categorising them based on 
    guideline and decide on next steps
    """
)

# Tools the agent may call; all of them are defined in custom_tools.py.
tools = [
    CategoriseEmailTool(),
    ProspectResearchTool(),
    EscalateTool(),
    ReplyEmailTool(),
    CreateEmailDraftTool(),
    GenerateEmailResponseTool(),
]

# The placeholder's variable_name must match the memory_key used below
# ("memory") so the stored conversation is injected into the prompt.
agent_kwargs = {
    "extra_prompt_messages": [MessagesPlaceholder(variable_name="memory")],
    "system_message": system_message,
}
memory = ConversationSummaryBufferMemory(
    memory_key="memory", return_messages=True, llm=llm, max_token_limit=1000)

agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.OPENAI_FUNCTIONS,
    verbose=True,
    agent_kwargs=agent_kwargs,
    memory=memory,
)


# Placeholder input; replace the body with a real email thread to try the agent.
test_email = """
xxxxxxx
"""

# Runs the agent once on the test email as soon as this file is executed.
agent({"input": test_email})
55 | 


--------------------------------------------------------------------------------
/custom_tools.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from dotenv import load_dotenv, find_dotenv
  3 | import openai
  4 | 
  5 | from langchain import PromptTemplate
  6 | from langchain.agents import initialize_agent, Tool
  7 | from langchain.agents import AgentType
  8 | from langchain.chat_models import ChatOpenAI
  9 | from langchain.prompts import MessagesPlaceholder
 10 | from langchain.memory import ConversationSummaryBufferMemory
 11 | from langchain.text_splitter import RecursiveCharacterTextSplitter
 12 | from langchain.chains.summarize import load_summarize_chain
 13 | from langchain.tools import BaseTool
 14 | from pydantic import BaseModel, Field
 15 | from typing import Type
 16 | from bs4 import BeautifulSoup
 17 | import requests
 18 | import json
 19 | from langchain.schema import SystemMessage
 20 | 
# Locate and load the nearest .env file, then configure the OpenAI client
# from the OPENAI_API_KEY environment variable (None if it is not set).
load_dotenv(find_dotenv())
openai.api_key = os.environ.get("OPENAI_API_KEY")
# Module-wide chat model shared by summary() and prospect_research().
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")
 24 | 
 25 | 
 26 | # CATEGORISE EMAIL
 27 | def check_consulting_email(lates_reply: str):
 28 |     prompt = f"""
 29 |     EMAIL: {lates_reply}
 30 |     ---
 31 | 
 32 |     Above is an email about Job offer / consulting; Your goal is identify if all information above is mentioned:
 33 |     1. What's the problem the prospect is trying to solve? 
 34 |     2. Their budget
 35 | 
 36 |     If all info above is collected, return YES, otherwise, return NO; (Return ONLY YES or NO)
 37 | 
 38 |     ANSWER: 
 39 |     """
 40 | 
 41 |     all_needs_collected_result = openai.ChatCompletion.create(
 42 |         model="gpt-4",
 43 |         messages=[
 44 |             {"role": "user", "content": prompt}
 45 |         ]
 46 |     )
 47 | 
 48 |     all_needs_collected = all_needs_collected_result["choices"][0]["message"]["content"]
 49 | 
 50 |     return all_needs_collected
 51 | 
 52 | 
 53 | def categorise_email(lates_reply: str):
 54 |     categorise_prompt = f"""
 55 |     EMAIL: {lates_reply}
 56 |     ---
 57 | 
 58 |     Your goal is to categorise the email based on categories below:
 59 | 
 60 |     1. COLLABORATION/SPONSORSHIP: These are emails where companies or individuals are reaching out to propose a collaboration or sponsorship opportunity with AI Jason. They often include details about their product or service and how they envision the partnership.
 61 | 
 62 |     2. JOB_OFFER/CONSULTING: These emails involve individuals or companies reaching out to AI Jason with a specific job or project they want him to work on. This could range from developing an AI application to leading a specific activity.
 63 | 
 64 |     3. QUESTIONS: These emails involve individuals reaching out to AI Jason with specific questions or inquiries. This could be about his videos, his knowledge on a specific topic, or his thoughts on a specific AI tool or technology.
 65 | 
 66 |     4. NON_REPLY: These are auto emails that don't need any response or involve companies or individuals reaching out to AI Jason to offer their services. This could be a marketing agency offering to help him find sponsorship opportunities or a company offering a specific tool or service they think he might find useful.
 67 | 
 68 |     5. OTHER: These are emails that don't fit into any of the above categories.
 69 | 
 70 |     CATEGORY (Return ONLY the category name in capital):
 71 |     """
 72 | 
 73 |     category_result = openai.ChatCompletion.create(
 74 |         model="gpt-4",
 75 |         messages=[
 76 |             {"role": "user", "content": categorise_prompt}
 77 |         ]
 78 |     )
 79 | 
 80 |     category = category_result["choices"][0]["message"]["content"]
 81 | 
 82 |     if category == "JOB_OFFER/CONSULTING":
 83 |         all_needs_collected = check_consulting_email(lates_reply)
 84 |         if all_needs_collected == "YES":
 85 |             return {
 86 |                 "Step 1": """Forward the email to jason.zhou.design@gmail.com, with summary of 1.What's the problem the prospect is trying to solve?  2.Their budget"""
 87 |             }
 88 |         else:
 89 |             return {
 90 |                 "Step 1": "Generate email response to the prospect to collect further info based on guidelines",
 91 |                 "Step 2": "Send generated email response to prospect",
 92 |             }
 93 |     else:
 94 |         if category == "COLLABORATION/SPONSORSHIP":
 95 |             return {
 96 |                 "Step 1": "Research about the prospect & company",
 97 |                 "Step 2": "Forward the email to jason.zhou.design@gmail.com, with the research results included"
 98 |             }
 99 |         else:
100 |             if category == "NON_REPLY":
101 |                 return f"This email has already been taken care of or replied before, nothing needs to be done now"
102 |             else:
103 |                 return {
104 |                     "Step 1": "Generate email response based on guidelines",
105 |                     "Step 2": "Create email draft with the generated response"
106 |                 }
107 | 
108 | 
class CategoriseEmailInput(BaseModel):
    """Inputs for categorise_email"""
    lates_reply: str = Field(description="Latest reply from the prospect ")
111 | 
112 | 
class CategoriseEmailTool(BaseTool):
    """Agent tool that wraps categorise_email."""
    name = "categorise_email"
    description = "use this to categorise email to decide what to do next"
    args_schema: Type[BaseModel] = CategoriseEmailInput

    def _run(self, lates_reply: str):
        """Categorise the email and return the next steps."""
        return categorise_email(lates_reply)

    def _arun(self, lates_reply: str):
        """Async execution is not implemented; mirrors the _run signature."""
        raise NotImplementedError(
            "categorise_email does not support async")
124 | 
125 | 
126 | # WRITE EMAIL
def generate_email_response(email_thread: str, category: str):
    """Generate a reply by POSTing the thread to the Relevance AI studio endpoint.

    Args:
        email_thread: The full original email thread.
        category: "CONSULTING FOLLOW UP" selects the info-collecting goal;
            any other value selects the generic "write email response" goal.

    Returns:
        The response body text on success, or a short failure message when
        the request cannot be completed or the server reports an error.
    """
    url = "https://api-f1db6c.stack.tryrelevance.com/latest/studios/6af484b0-a8bf-4545-91b8-75d46ac8f354/trigger_limited"

    headers = {
        "Content-Type": "application/json"
    }

    if category == "CONSULTING FOLLOW UP":
        goal = "for each consulting email, we need to collect 1. Their use case & problem they are trying to solve 2. Their budget; Try to collect those info from them"
    else:
        goal = "write email response"

    data = {
        "params": {
            "client_email": email_thread,
            "goal": goal,
        },
        "project": "f86edbc1-fcb6-41f9-b9b6-be14a6f06412"
    }

    # Without a timeout requests can block forever; report failures as text
    # so the calling agent gets an observation instead of a crash.
    try:
        response = requests.post(url, headers=headers, json=data, timeout=120)
    except requests.RequestException as err:
        return f"Failed to generate email response: {err}"

    # Do not hand an error page to the agent as if it were the generated email.
    if not response.ok:
        return f"HTTP request failed with status code {response.status_code}"

    return response.text
149 | 
150 | 
class GenerateEmailResponseInput(BaseModel):
    """Inputs for generate_email_response"""
    email_thread: str = Field(description="The original full email thread")
    category: str = Field(
        description='category of email, can ONLY be "CONSULTING FOLLOW UP" or "OTHER" ')
156 | 
157 | 
class GenerateEmailResponseTool(BaseTool):
    """Agent tool that wraps generate_email_response."""
    name = "generate_email_response"
    description = "use this to generate the email response based on specific guidelines, voice & tone & knowledge for AI Jason"
    args_schema: Type[BaseModel] = GenerateEmailResponseInput

    def _run(self, email_thread: str, category: str):
        """Generate a reply for the given thread and category."""
        return generate_email_response(email_thread, category)

    def _arun(self, email_thread: str, category: str):
        """Async execution is not implemented; mirrors the _run signature."""
        raise NotImplementedError(
            "generate_email_response does not support async")
168 | 
169 | 
170 | # RESEARCH AGENT
171 | 
def summary(objective, content):
    """Summarise `content` with respect to `objective` via a map-reduce chain.

    The text is split into chunks, each chunk is summarised, and the partial
    summaries are combined using the same prompt template.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n"], chunk_size=10000, chunk_overlap=500)
    chunks = splitter.create_documents([content])

    # One template serves both the map step and the combine step.
    prompt_template = PromptTemplate(
        template="""
    Write a summary of the following text for {objective}:
    "{text}"
    SUMMARY:
    """,
        input_variables=["text", "objective"],
    )

    chain = load_summarize_chain(
        llm=llm,
        chain_type='map_reduce',
        map_prompt=prompt_template,
        combine_prompt=prompt_template,
        verbose=False,
    )

    return chain.run(input_documents=chunks, objective=objective)
195 | 
196 | 
def scrape_website(objective: str, url: str):
    """Fetch a page through browserless and return its text, summarised if long.

    Args:
        objective: The task the agent was given; forwarded to summary() when
            the page text is longer than 10000 characters.
        url: Address of the page to scrape.

    Returns:
        The page text, a summary of it, or a failure message string when the
        request cannot be completed or does not return HTTP 200.
    """
    headers = {
        'Cache-Control': 'no-cache',
        'Content-Type': 'application/json',
    }

    # NOTE(review): the API token is embedded in the URL; consider reading it
    # from the environment instead of committing it to source control.
    endpoint = "https://chrome.browserless.io/content?token=xxxxxxxxxxxxxxxxxxxxxxxxxxx"

    # A timeout keeps the agent from hanging on an unresponsive page; network
    # errors are reported as text, matching the status-code branch below.
    try:
        response = requests.post(
            endpoint, headers=headers, json={"url": url}, timeout=60)
    except requests.RequestException as err:
        return f"HTTP request failed: {err}"

    if response.status_code != 200:
        return f"HTTP request failed with status code {response.status_code}"

    text = BeautifulSoup(response.content, "html.parser").get_text()
    if len(text) > 10000:
        return summary(objective, text)
    return text
230 | 
231 | 
class ScrapeWebsiteInput(BaseModel):
    """Inputs for scrape_website"""
    # Forwarded to summary() when the scraped text is long.
    objective: str = Field(
        description="The objective & task that users give to the agent")
    url: str = Field(description="The url of the website to be scraped")
237 | 
238 | 
class ScrapeWebsiteTool(BaseTool):
    """Agent tool that wraps scrape_website."""
    name = "scrape_website"
    description = "useful when you need to get data from a website url, passing both url and objective to the function; DO NOT make up any url, the url should only be from the search results"
    args_schema: Type[BaseModel] = ScrapeWebsiteInput

    def _run(self, objective: str, url: str):
        """Scrape the url and return its text or a summary of it."""
        return scrape_website(objective, url)

    def _arun(self, objective: str, url: str):
        """Async execution is not implemented; mirrors the _run signature."""
        raise NotImplementedError(
            "scrape_website does not support async")
250 | 
251 | 
def search(query):
    """Run a Google search through the serper.dev API.

    Args:
        query: The search query string.

    Returns:
        The raw response body text, or a failure message string when the
        request cannot be completed.
    """
    url = "https://google.serper.dev/search"

    # NOTE(review): the API key is hard-coded here; consider loading it from
    # the environment instead of committing it to source control.
    headers = {
        'X-API-KEY': 'xxxxxxxxxxxxxxxxxxxxx',
        'Content-Type': 'application/json'
    }

    # A timeout prevents the agent from blocking indefinitely; network errors
    # are returned as text so the agent can react to them.
    try:
        response = requests.post(
            url, headers=headers, json={"q": query}, timeout=30)
    except requests.RequestException as err:
        return f"Search request failed: {err}"

    return response.text
267 | 
268 | 
def prospect_research(email_or_name: str, company: str):
    """Run a nested research agent about a prospect and their company.

    A fresh OPENAI_FUNCTIONS agent with a search tool and a scraping tool is
    built on every call and asked what the company does and who the person is.

    Returns:
        Whatever the agent call returns for the research request.
    """
    researcher_prompt = SystemMessage(
        content="""You are a world class researcher, who can do detailed research on any topic and produce facts based results; 
                you do not make things up, you will try as hard as possible to gather facts & data to back up the research
                
                Please make sure you complete the objective above with the following rules:
                1/ You should do enough research to gather as much information as possible about the objective
                2/ If there are url of relevant links & articles, you will scrape it to gather more information
                3/ After scraping & search, you should think "is there any new things i should search & scraping based on the data I collected to increase research quality?" If answer is yes, continue; But don't do this more than 3 iteratins
                4/ You should not make things up, you should only write facts & data that you have gathered
                5/ In the final output, You should include all reference data & links to back up your research; You should include all reference data & links to back up your research
                6/ In the final output, You should include all reference data & links to back up your research; You should include all reference data & links to back up your research"""
    )

    research_tools = [
        Tool(
            name="Search",
            func=search,
            description="useful for when you need to answer questions about current events, data. You should ask targeted questions"
        ),
        ScrapeWebsiteTool(),
    ]

    # The memory key must match the placeholder's variable name below.
    conversation_memory = ConversationSummaryBufferMemory(
        memory_key="memory", return_messages=True, llm=llm, max_token_limit=1000)

    researcher = initialize_agent(
        research_tools,
        llm,
        agent=AgentType.OPENAI_FUNCTIONS,
        verbose=False,
        agent_kwargs={
            "extra_prompt_messages": [MessagesPlaceholder(variable_name="memory")],
            "system_message": researcher_prompt,
        },
        memory=conversation_memory,
    )

    request = f"Research about {company} and {email_or_name}; What does the company do, and who the person is"
    return researcher({"input": request})
313 | 
314 | 
class ProspectResearchInput(BaseModel):
    """Inputs for prospect_research"""
    email_or_name: str = Field(
        description="The original email address or name of prospect")
    company: str = Field(description="The company name of prospect")
320 | 
321 | 
class ProspectResearchTool(BaseTool):
    """Agent tool that wraps prospect_research."""
    name = "prospect_research"
    description = "useful when you need to research about a prospect, passing both email and company to the function, return the summary of its company as well as the prospect"
    args_schema: Type[BaseModel] = ProspectResearchInput

    def _run(self, email_or_name: str, company: str):
        """Research the prospect and their company."""
        return prospect_research(email_or_name, company)

    def _arun(self, email_or_name: str, company: str):
        """Async execution is not implemented; mirrors the _run signature."""
        raise NotImplementedError(
            "prospect_research does not support async")
332 | 
333 | 
334 | # ESCALATE
335 | 
def escalate(original_email_address: str, message: str, additional_context: str):
    """Escalate an email to Jason by POSTing it to a Zapier webhook.

    Args:
        original_email_address: Address the email came from.
        message: The original email thread / message.
        additional_context: Research results or consulting details to attach.

    Returns:
        A confirmation sentence on HTTP 200, otherwise a failure message
        string (previously a tuple, which is not a usable tool observation).
    """
    url = 'https://hooks.zapier.com/hooks/catch/15616669/38qwq19/'

    data = {
        "prospect email": original_email_address,
        "prospect message": message,
        "additional context": additional_context
    }

    # Bound the wait and report network errors as text for the agent.
    try:
        response = requests.post(url, data=data, timeout=30)
    except requests.RequestException as err:
        return f"Failed to send POST request: {err}"

    if response.status_code == 200:
        return 'This email has been escalated to Jason, he will take care of it from here, nothing needs to be done now'
    return f"Failed to send POST request: {response.status_code}"
355 | 
356 | 
class EscalateInput(BaseModel):
    """Inputs for escalate"""
    message: str = Field(
        description="The original email thread & message that was received, cc the original copy for escalation")
    original_email_address: str = Field(
        description="The email address that sent the message/email")
    additional_context: str = Field(
        description="additional context about the prospect, can be the company/prospct background research OR the consulting request details like use case, budget, etc.")
365 | 
366 | 
class EscalateTool(BaseTool):
    """Agent tool that wraps escalate."""
    name = "escalate_to_jason"
    description = "useful when you need to escalate the case to jason or others, passing both message and original_email_address to the function"
    args_schema: Type[BaseModel] = EscalateInput

    def _run(self, original_email_address: str, message: str, additional_context: str):
        """Forward the email and its context to Jason."""
        return escalate(original_email_address, message, additional_context)

    def _arun(self, original_email_address: str, message: str, additional_context: str):
        """Async execution is not implemented; mirrors the _run signature."""
        raise NotImplementedError(
            "escalate_to_jason does not support async")
377 | 
378 | 
379 | # REPLY EMAIL
def reply_email(message: str, email_address: str, subject: str, *, dry_run: bool = True):
    """Send a reply through the Zapier webhook, or pretend to in dry-run mode.

    The original implementation returned unconditionally before the sending
    code, leaving it unreachable. That short-circuit is kept as the default
    (``dry_run=True``) so existing callers behave exactly as before; pass
    ``dry_run=False`` to actually POST the reply.

    Args:
        message: Body of the reply.
        email_address: Destination address.
        subject: Subject line of the email.
        dry_run: When True (default) nothing is sent and the confirmation
            sentence is returned immediately.

    Returns:
        A status sentence describing the outcome.
    """
    if dry_run:
        # NOTE: reports success to the agent even though nothing was sent.
        return f"An email has been sent to {email_address}"

    url = 'https://hooks.zapier.com/hooks/catch/15616669/38qaaau/'

    data = {
        "Email": email_address,
        "Subject": subject,
        "Reply": message
    }

    # Bound the wait and report network errors as text for the agent.
    try:
        response = requests.post(url, data=data, timeout=30)
    except requests.RequestException as err:
        return f"Failed to send POST request: {err}"

    if response.status_code == 200:
        return 'Email reply has been created successfully'
    return f"Failed to send POST request: {response.status_code}"
401 | 
402 | 
class ReplyEmailInput(BaseModel):
    """Inputs for reply_email"""
    message: str = Field(
        description="The generated response message to be sent to the email address")
    email_address: str = Field(
        description="Destination email address to send email to")
    subject: str = Field(description="subject of the email")
410 | 
411 | 
class ReplyEmailTool(BaseTool):
    """Agent tool that wraps reply_email."""
    name = "reply_email"
    description = "use this to send emails"
    args_schema: Type[BaseModel] = ReplyEmailInput

    def _run(self, message: str, email_address: str, subject: str):
        """Send the reply to the given address."""
        return reply_email(message, email_address, subject)

    def _arun(self, message: str, email_address: str, subject: str):
        """Async execution is not implemented; mirrors the _run signature."""
        raise NotImplementedError("reply_email does not support async")
422 | 
423 | 
424 | # CREATE EMAIL DRAFT
def create_email_draft(prospect_email_address: str, subject: str, generated_reply: str):
    """Create an email draft for review by POSTing it to a Zapier webhook.

    Args:
        prospect_email_address: Address the draft is for.
        subject: Subject of the original email.
        generated_reply: Reply text to place in the draft.

    Returns:
        A confirmation sentence on HTTP 200, otherwise a failure message
        string (previously a tuple, which is not a usable tool observation).
    """
    url = 'https://hooks.zapier.com/hooks/catch/15616669/38ikw12/'

    data = {
        "email": prospect_email_address,
        "subject": subject,
        "reply": generated_reply
    }

    # Bound the wait and report network errors as text for the agent.
    try:
        response = requests.post(url, data=data, timeout=30)
    except requests.RequestException as err:
        return f"Failed to send POST request: {err}"

    if response.status_code == 200:
        return 'Email draft has been created successfully'
    return f"Failed to send POST request: {response.status_code}"
444 | 
445 | 
class CreateEmailDraftInput(BaseModel):
    """Inputs for create_email_draft"""
    prospect_email_address: str = Field(
        description="The prospect's email address")
    subject: str = Field(description="The original email subject")
    generated_reply: str = Field(
        description="Generated email reply to prospect")
453 | 
454 | 
class CreateEmailDraftTool(BaseTool):
    """Agent tool that wraps create_email_draft."""
    name = "create_email_draft"
    description = "use this to create email draft for jason to review & send"
    args_schema: Type[BaseModel] = CreateEmailDraftInput

    def _run(self, prospect_email_address: str, subject: str, generated_reply: str):
        """Create a draft containing the generated reply."""
        return create_email_draft(prospect_email_address, subject, generated_reply)

    def _arun(self, prospect_email_address: str, subject: str, generated_reply: str):
        """Async execution is not implemented; mirrors the _run signature."""
        raise NotImplementedError("create_email_draft does not support async")
465 | 


--------------------------------------------------------------------------------
/email_cleaning.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from dotenv import find_dotenv, load_dotenv
 3 | import openai
 4 | import json, csv
 5 | 
# Locate and load the nearest .env file, then configure the OpenAI client
# from the OPENAI_API_KEY environment variable (None if it is not set).
load_dotenv(find_dotenv())
openai.api_key = os.environ.get("OPENAI_API_KEY")
 8 | 
 9 | def parse_email(email_thread):
10 | 
11 |     system_prompt = """
12 |     You are an expert of convert raw email thread into original message / reply pairs. 
13 |     You are given a raw email thread that Jason reply to others, your goal is to convert it into original message / reply pairs. 
14 |     - orignal_message: the last message sent to Jason, if it is a long email thread, only take the last message
15 |     - jason_reply: Jason's reply to the original message
16 | 
17 |     if there is only one message in the thread, that should be jason_reply
18 | 
19 |     The exported format should look something like 
20 |     {
21 |         "original_message": "xxxx",
22 |         "jason_reply": "xxxx"
23 |     }
24 |     """
25 | 
26 |     response = openai.ChatCompletion.create(
27 |         model="gpt-4",
28 |         messages=[
29 |             {"role": "system", "content": system_prompt},
30 |             {"role": "user", "content": email_thread}
31 |         ]
32 |     )
33 | 
34 |     return response["choices"][0]["message"]["content"]
35 | 
def process_csv(input_csv_path, output_csv_path):
    """Convert every email body in a CSV into an original-message / reply pair.

    Reads the 'Body' column of each input row, sends it to parse_email, and
    writes the extracted pairs to a new CSV with the header
    ``original_message, jason_reply``.

    Rows whose model output is not a JSON object are reported and skipped, so
    a single malformed reply no longer aborts the run before anything is
    written.

    Args:
        input_csv_path: Path of the CSV produced from the mailbox.
        output_csv_path: Path of the CSV to write the pairs to.
    """
    processed_data = []

    with open(input_csv_path, newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            json_string = parse_email(row['Body'])
            print(json_string)

            try:
                json_data = json.loads(json_string)
            except json.JSONDecodeError as err:
                print(f"Skipping row: model output is not valid JSON ({err})")
                continue

            # The model may answer with a list or a bare string; only an
            # object carries the two keys we need.
            if not isinstance(json_data, dict):
                print("Skipping row: model output is not a JSON object")
                continue

            processed_data.append([
                json_data.get('original_message', ''),
                json_data.get('jason_reply', ''),
            ])

    with open(output_csv_path, mode='w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['original_message', 'jason_reply'])
        csv_writer.writerows(processed_data)
58 | 
# Paths to your input and output CSV files
# NOTE(review): mbox_to_csv.py writes 'past_email_mbox.csv'; this input name
# differs — confirm it is an intentionally renamed/cleaned copy.
input_csv_path = 'past_email_final_mboxt.csv'
output_csv_path = 'email_pairs.csv'

# Call the function to process the CSV file (runs whenever this file is executed)
process_csv(input_csv_path, output_csv_path)
65 | 


--------------------------------------------------------------------------------
/extract_faq.py:
--------------------------------------------------------------------------------
 1 | import csv
 2 | import json
 3 | from dotenv import find_dotenv, load_dotenv
 4 | from langchain.prompts import PromptTemplate
 5 | from langchain.chat_models import ChatOpenAI
 6 | from langchain.text_splitter import RecursiveCharacterTextSplitter
 7 | from langchain.chains.summarize import load_summarize_chain
 8 | 
# Load variables from a .env file into the environment before building the model.
load_dotenv()
# Chat model used by the summarise chain in extract_faq().
llm = ChatOpenAI(temperature=0, model_name="gpt-4")
11 | 
def load_csv(file_path):
    """Read a CSV file into a list of row dictionaries keyed by the header.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list with one dict per data row; empty when the file has no rows.
    """
    # newline='' lets the csv module handle line endings inside quoted fields,
    # and UTF-8 matches how the other scripts in this project write their CSVs
    # instead of relying on the platform default encoding.
    with open(file_path, 'r', newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))
25 | 
def extract_faq(text_data):
    """Distil FAQ entries from past email text with a map-reduce LLM chain.

    The text is chunked, each chunk is mined for FAQ candidates, the
    candidates are consolidated by a combine prompt, and the chain output is
    parsed as JSON.

    Args:
        text_data: Past email text as a single string.

    Returns:
        The parsed JSON produced by the chain.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False,
    )
    chunks = splitter.split_text(text_data)
    documents = splitter.create_documents(chunks)

    # Prompt applied to every chunk.
    per_chunk_template = PromptTemplate(
        template="""
    PAST EMAILS:
    {text}
    ----

    You are a smart AI assistant, above is some past emails from AI Jason (an AI youtuber), 
    your goal is to learn & extract common FAQ about AI Jason 
    (include both question & answer, return results in JSON):
    """,
        input_variables=["text"],
    )

    # Prompt that merges the per-chunk results into one array.
    merge_template = PromptTemplate(
        template="""
    The following is set of FAQ about AI Jason (an AI youtuber):
    {text}
    Take these and distill it into a final, consolidated array of faq, 
    include both question & answer (in JSON format). 
    
    array of FAQ:
    """,
        input_variables=["text"],
    )

    chain = load_summarize_chain(
        llm=llm,
        chain_type='map_reduce',
        map_prompt=per_chunk_template,
        combine_prompt=merge_template,
        verbose=True,
    )

    return json.loads(chain.run(documents))
69 | 
def save_json_to_csv(data, file_name):
    """Write a list of dictionaries to a CSV file with a header row.

    The columns are the union of all keys in first-seen order, so entries
    with extra or missing keys are written rather than raising; missing
    values are left empty. An empty ``data`` produces an empty file instead
    of an IndexError.

    Args:
        data: List of dicts (for example the FAQ entries).
        file_name: Path of the CSV file to create or overwrite.
    """
    # dict.fromkeys de-duplicates while preserving first-seen key order.
    fieldnames = list(dict.fromkeys(key for entry in data for key in entry))

    with open(file_name, mode='w', newline='', encoding='utf-8') as file:
        if not fieldnames:
            return

        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
84 | 
85 | 
# Load the message / reply pairs produced by email_cleaning.py
past_emails = load_csv("email_pairs.csv")

# Extracting Jason's replies and serialising them into one JSON string
jasons_replies = [entry["jason_reply"] for entry in past_emails]
jasons_replies_string = json.dumps(jasons_replies)

# Distil the replies into FAQ entries (runs whenever this file is executed)
faqs = extract_faq(jasons_replies_string)

# Persist the FAQ entries as CSV
save_json_to_csv(faqs, "faq.csv")
96 | 
97 | 


--------------------------------------------------------------------------------
/mbox_to_csv.py:
--------------------------------------------------------------------------------
 1 | import mailbox
 2 | import csv
 3 | from email import policy
 4 | from email.parser import BytesParser
 5 | 
 6 | def get_body(message):
 7 |     if message.is_multipart():
 8 |         for part in message.walk():
 9 |             if part.is_multipart():
10 |                 for subpart in part.walk():
11 |                     if subpart.get_content_type() == 'text/plain':
12 |                         return subpart.get_payload(decode=True)
13 |             elif part.get_content_type() == 'text/plain':
14 |                 return part.get_payload(decode=True)
15 |     else:
16 |         return message.get_payload(decode=True)
17 | 
def mbox_to_csv(mbox_file_path, csv_file_path):
    """Export the messages of an mbox file to a CSV file.

    One row is written per message with the columns Subject, From, Date, To,
    Message-ID and Body. The body comes from get_body, is decoded as UTF-8
    with undecodable bytes replaced, and is flattened onto a single line.

    Args:
        mbox_file_path: Path of the mbox file to read.
        csv_file_path: Path of the CSV file to create or overwrite.
    """
    mbox = mailbox.mbox(mbox_file_path)

    # Close the mailbox even if writing fails so its file handle is released.
    try:
        with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Subject', 'From', 'Date', 'To', 'Message-ID', 'Body'])

            for message in mbox:
                body = get_body(message)
                if body:
                    # Newlines become spaces so each body stays on one line.
                    body = body.decode('utf-8', errors='replace').replace('\n', ' ').replace('\r', '')
                else:
                    body = ''
                writer.writerow([
                    message['subject'],
                    message['from'],
                    message['date'],
                    message['to'],
                    message['message-id'],
                    body
                ])
    finally:
        mbox.close()
39 | 
# Usage: runs the export whenever this file is executed.
mbox_file_path = 'Sent.mbox'  # replace with the path to your MBOX file
csv_file_path = 'past_email_mbox.csv'  # replace with the desired path for the output CSV file
mbox_to_csv(mbox_file_path, csv_file_path)
44 | 


--------------------------------------------------------------------------------