├── .env.example
├── .gitignore
├── LICENSE
├── README.md
├── embeddings.py
├── models.py
├── prompt.md
├── qa.py
├── requirements.txt
└── upload_data.py

/.env.example:
--------------------------------------------------------------------------------
OPENAI_API_KEY=
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.env
__pycache__/
logs/
*.log
data/
.chroma/
.mypy_cache/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Alex Meyer

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Readwise Chat

This is a repository for building a chatbot on top of your Readwise highlights. It's a bit experimental and there are a number of ways to potentially improve it, but it's a start.

## How to use

Before doing anything else, you'll need a pro [Readwise account](https://readwise.io/) so that you can export your highlights via CSV.

Assuming you have an account, go to the [Exports Page](https://readwise.io/export) in your Readwise dashboard. From there, go to the CSV Export section and click the "Export" button. This will download a CSV file to your computer. Remember where this file is saved.

Next, install all the dependencies:

```
pip install -r requirements.txt
```

Then, create your own `.env` file by copying the `.env.example` file and filling in the values:

```
cp .env.example .env
```

Next, upload the CSV file you downloaded from Readwise by running:

```
python upload_data.py -f /path/to/your/csv/file
```

If this is your first time running this and you have a lot of highlights, this step could take a while.

Once it's finished running, you can start the chatbot by running:

```
python qa.py
```

This will kick off a chat with the bot. Based on what you ask it, it should pull in references to your highlights as you go, updating those references based on where the conversation heads. However, it's not perfect and it may not always work the way you'd expect.

For best results, try starting the conversation with a specific question about a particular book and topic. The more specific you are, the better the results should be.

## How to improve

There are a few different ways you could improve the bot, including:
- Changing up the prompt; prompt engineering goes a long way
- Changing the strategy for pulling embeddings during the conversation; I haven't yet found an approach that both feels like a natural conversation and keeps the bot's supporting data up to date
- Giving the user options for which books the bot has at its disposal before starting the chat (a rough sketch follows this list)
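
As a concrete example of that last idea, here's a rough sketch of a pre-chat book picker. It's hypothetical and not part of this repo: it assumes the `data/highlights.db` database that `upload_data.py` creates, and the `choose_books` helper name is made up for illustration:

```python
# A minimal sketch of a pre-chat book picker (hypothetical, not part of this repo).
# Assumes data/highlights.db exists, i.e. upload_data.py has already been run.
import sqlite3

def choose_books() -> list[str]:
    conn = sqlite3.connect("data/highlights.db")
    titles = [row[0] for row in conn.execute("SELECT DISTINCT book FROM highlights ORDER BY book")]
    conn.close()
    for i, title in enumerate(titles, start=1):
        print(f"{i}. {title}")
    picks = input("Pick book numbers (comma-separated), or press Enter for all: ").strip()
    if not picks:
        return titles
    return [titles[int(i.strip()) - 1] for i in picks.split(",")]
```

The chosen titles could then be folded into the retrieval query, or used as a `where` filter if book metadata were added to the chroma collection.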
And I'm sure there are a ton of other ways you could improve it. If you have any ideas, feel free to open an issue or a PR.
--------------------------------------------------------------------------------
/embeddings.py:
--------------------------------------------------------------------------------
import os
from ast import literal_eval
from typing import Iterator

import chromadb
import numpy as np
import pandas as pd
import tiktoken
from chromadb.utils import embedding_functions
from dotenv import load_dotenv
from openai import OpenAI
from tenacity import retry, wait_random_exponential, stop_after_attempt

# Load environment variables before creating the OpenAI client so that
# OPENAI_API_KEY is available when the client reads it
load_dotenv()

client = OpenAI()
chroma_client = chromadb.PersistentClient(path="data/chroma")

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
EMBEDDINGS_MODEL = "text-embedding-3-small"
OPENAI_EMBEDDING_ENCODING = "cl100k_base"  # this is the encoding for text-embedding-3-small
MAX_EMBEDDING_TOKENS = 8191  # the maximum for text-embedding-3-small is 8191
EMBEDDINGS_INDEX_NAME = "book-notes"
BATCH_SIZE = 100

# Models a simple batch generator that makes chunks out of an input DataFrame
class BatchGenerator:
    def __init__(self, batch_size: int = 10) -> None:
        self.batch_size = batch_size

    # Makes chunks out of an input DataFrame
    def to_batches(self, df: pd.DataFrame) -> Iterator[pd.DataFrame]:
        splits = self.splits_num(df.shape[0])
        if splits <= 1:
            yield df
        else:
            for chunk in np.array_split(df, splits):
                yield chunk

    # Determines how many chunks the DataFrame should be split into
    def splits_num(self, elements: int) -> int:
        return round(elements / self.batch_size)

    __call__ = to_batches

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text: str, model=EMBEDDINGS_MODEL) -> list[float]:
    text = text.replace("\n", " ")  # OpenAI says removing newlines leads to better performance
    response = client.embeddings.create(
        input=text,
        model=model
    )
    return response.data[0].embedding
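# Example usage (a sketch; assumes OPENAI_API_KEY is set in your .env):
#
#     vec = get_embedding("The map is not the territory")
#     len(vec)  # text-embedding-3-small returns a 1536-dimensional vector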
def get_embeddings(df: pd.DataFrame):
    print('Getting embeddings...')
    encoding = tiktoken.get_encoding(OPENAI_EMBEDDING_ENCODING)
    # Omit any rows that are too long to embed
    df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
    df = df[df.n_tokens <= MAX_EMBEDDING_TOKENS].copy()
    df["embedding"] = df.combined.apply(lambda x: get_embedding(x, model=EMBEDDINGS_MODEL))
    print('Done getting embeddings.')
    return df

def compute_embeddings(df: pd.DataFrame):
    return {
        idx: get_embedding(r.combined) for idx, r in df.iterrows()
    }

def load_embeddings(filepath: str = 'data/embeddings/book_notes_w_embeddings.csv'):
    """Load the dataset with the embeddings from a CSV file."""
    df = pd.read_csv(filepath)
    # Convert embeddings back to lists (they are stored as strings in the CSV)
    df['embedding'] = df.embedding.apply(literal_eval)
    # Convert id to string
    df['id'] = df['id'].apply(str)
    return df

def load_dataset_for_embeddings(df: pd.DataFrame):
    """Configure the dataset for embeddings."""
    try:
        # Keep only the columns we need
        df = df[['id', 'highlight', 'book', 'author', 'note', 'location', 'location_type']].copy()
        # Append each row's note only when that row actually has one
        note_part = df['note'].fillna('').astype(str).str.strip().apply(
            lambda n: f"; Note: {n}" if n else ''
        )
        df['combined'] = (
            "Title: " + df['book'].str.strip().fillna('') + "; " +
            "Author: " + df['author'].str.strip().fillna('') + "; " +
            "Highlight: " + df['highlight'].str.strip().fillna('') +
            note_part
        )
        # Convert id to string
        df['id'] = df['id'].apply(str)
        return df
    except Exception as e:
        print(f"Error configuring dataset for embeddings: {e}")
        return df

def save_embeddings(df: pd.DataFrame, output_path: str = 'data/embeddings/book_notes_w_embeddings.csv'):
    """Save the dataset with the embeddings to a CSV file."""
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    if os.path.exists(output_path):
        # Append the new data to the existing data
        existing_df = pd.read_csv(output_path)
        df = pd.concat([existing_df, df], ignore_index=True)
    df.to_csv(output_path, index=False)
    print(f"Saved embeddings to {output_path}.")

# Using chromadb for embeddings search
def add_embeddings_to_chroma(df: pd.DataFrame):
    print(f'Adding {len(df)} embeddings to chromadb...')
    ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=OPENAI_API_KEY,
        model_name=EMBEDDINGS_MODEL
    )
    collection = chroma_client.get_or_create_collection(
        name=EMBEDDINGS_INDEX_NAME,
        embedding_function=ef
    )

    # Create a batch generator and add each batch to the collection
    df_batcher = BatchGenerator(BATCH_SIZE)
    for batch_df in df_batcher(df):
        collection.add(
            embeddings=batch_df['embedding'].tolist(),
            documents=batch_df['combined'].tolist(),
            ids=batch_df['id'].tolist()
        )
    print('Done adding to chromadb.')

def query_embeddings_chroma(query: str, n_results: int = 5):
    query_embedding = get_embedding(query)
    ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=OPENAI_API_KEY,
        model_name=EMBEDDINGS_MODEL
    )
    collection = chroma_client.get_collection(
        name=EMBEDDINGS_INDEX_NAME,
        embedding_function=ef
    )
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results
    )
    ids = results["ids"][0]
    distances = results["distances"][0]
    # Pair each id with its distance; a smaller distance means a closer match,
    # so sort ascending to put the most relevant documents first
    relevant_docs = sorted(zip(distances, ids))
    return relevant_docs
--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
from dataclasses import dataclass
from typing import Optional

@dataclass(frozen=True)
class Message:
    """A single chat message: a role plus optional content."""
    role: str
    content: Optional[str] = None

    def render(self):
        result = self.role + ":"
        if self.content is not None:
            result += " " + self.content
        return result
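# Example:
#
#     msg = Message(role="user", content="What does the author say about habits?")
#     msg.render()  # -> 'user: What does the author say about habits?'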
--------------------------------------------------------------------------------
/prompt.md:
--------------------------------------------------------------------------------
You're a helpful assistant that answers questions based on relevant information from book highlights and notes. Your job is to provide answers to user questions by using the provided book highlights. Your answers should be concise and provide both the answer to the question and the source of the information you used to answer it. They should also be in the third person.

If the question isn't clear or you don't understand it, you can ask the user for clarification. If you cannot answer the user's question based on the information provided, say "Sorry, but I can't answer that question".

Here's the relevant information you may use to answer the user's question:
$relevant_information
--------------------------------------------------------------------------------
/qa.py:
--------------------------------------------------------------------------------
import logging
import os
import sqlite3
from dataclasses import asdict

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

from embeddings import query_embeddings_chroma
from models import Message

# Load environment variables before creating the OpenAI client so that
# OPENAI_API_KEY is available when the client reads it
load_dotenv()
client = OpenAI()

# Make sure the log directory exists before configuring file-based logging
os.makedirs('logs', exist_ok=True)
logging.basicConfig(filename='logs/qa.log', level=logging.INFO)

CHAT_MODEL = "gpt-4o"
MODEL_TEMPERATURE = 0.0

def stream_gpt_response(prompt: str, messages: list[Message]):
    """Streams the chat model's response to the given prompt and conversation."""
    conversation_messages = [{"role": "system", "content": prompt}]
    conversation_messages += [asdict(message) for message in messages]
    response = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=conversation_messages,
        temperature=MODEL_TEMPERATURE,
        stream=True
    )
    return response

def ask_gpt_chat(prompt: str, messages: list[Message]):
    """Returns the chat model's complete (non-streamed) response to the given prompt."""
    conversation_messages = [{"role": "system", "content": prompt}]
    conversation_messages += [asdict(message) for message in messages]
    response = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=conversation_messages,
        temperature=MODEL_TEMPERATURE
    )
    return response.choices[0].message.content

def get_data_for_ids(ids: list) -> pd.DataFrame:
    # Connect to the database
    conn = sqlite3.connect("data/highlights.db")
    c = conn.cursor()
    # Get the data for the given ids, using placeholders rather than
    # interpolating the ids directly into the SQL
    placeholders = ",".join("?" * len(ids))
    c.execute(
        "SELECT id, highlight, book, author, note, location, location_type "
        f"FROM highlights WHERE id IN ({placeholders})",
        ids
    )
    data = c.fetchall()
    df = pd.DataFrame(data, columns=['id', 'highlight', 'book', 'author', 'note', 'location', 'location_type'])
    conn.close()
    return df
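# Example (a sketch; assumes data/highlights.db was created by upload_data.py
# and that these ids exist):
#
#     df = get_data_for_ids(["1", "2", "3"])
#     print(df[["book", "highlight"]])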
def setup_prompt(relevant_docs) -> str:
    """Builds the system prompt by filling in the relevant highlights."""
    formatted_docs = []
    relevant_data = get_data_for_ids(ids=[doc[1] for doc in relevant_docs])
    for _, row in relevant_data.iterrows():
        title = row['book']
        highlight = row['highlight']
        formatted_string = f"Title: {title}\n"
        if pd.notna(row['location_type']) and pd.notna(row['location']):
            location_type = row['location_type']
            location_value = row['location']
            location_string = f"{location_type}: {location_value}"
            formatted_string += f"{location_string}\n"
        formatted_string += f"Highlight: {highlight}\n"
        if pd.notna(row['note']):
            note = row['note']
            note_string = f"My Notes: {note}"
            formatted_string += f"{note_string}\n"
        formatted_docs.append(formatted_string)

    with open('prompt.md') as f:
        prompt = f.read()
    prompt = prompt.replace("$relevant_information", "\n".join(formatted_docs))

    return prompt

if __name__ == "__main__":
    conversation_messages = []
    while (user_input := input('You: ').strip()) != "":
        relevant_docs = query_embeddings_chroma(query=user_input, n_results=10)
        prompt = setup_prompt(relevant_docs)
        conversation_messages.append(Message(role="user", content=user_input))
        answer = stream_gpt_response(prompt, conversation_messages)
        print('\nBot: ')
        complete_answer: list[str] = []
        for chunk in answer:
            # Some stream events arrive without choices, so guard before indexing
            if not chunk.choices:
                continue
            answer_text = chunk.choices[0].delta.content or ""
            print(answer_text, end='', flush=True)
            complete_answer.append(answer_text)

        print('\n')
        complete_answer_string = ''.join(complete_answer)
        conversation_messages.append(Message(role="assistant", content=complete_answer_string))
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
chromadb==0.5.0
numpy==1.24.2
openai==1.30.3
pandas==1.5.3
python-dotenv==0.21.1
tenacity==8.2.3
tiktoken==0.3.1
--------------------------------------------------------------------------------
/upload_data.py:
--------------------------------------------------------------------------------
import argparse
import datetime
import glob
import os
import shutil
import sqlite3
import time

import pandas as pd

from embeddings import (
    get_embeddings,
    save_embeddings,
    load_dataset_for_embeddings,
    add_embeddings_to_chroma,
)

def create_new_file(filepath: str):
    current_time = str(int(time.time() * 1000))
    file_name = os.path.basename(filepath).split(".")[0]
    file_name = f"{file_name}_{current_time}.csv"
    new_file_path = f"data/highlights/{file_name}"
    # Make sure the destination directory exists, then copy the file there
    os.makedirs("data/highlights", exist_ok=True)
    shutil.copy(filepath, new_file_path)
    return new_file_path

def get_most_recently_updated_highlights_csv_file():
    # Get the most recent highlights data CSV file
    list_of_files = glob.glob('data/highlights/*.csv')
    if not list_of_files:
        return None
    latest_file = max(list_of_files, key=os.path.getctime)
    return latest_file

def compare_csv_files(old_file_path: str, new_file_path: str):
    # Read in both CSV files
    old_data = pd.read_csv(old_file_path)
    new_data = pd.read_csv(new_file_path)

    # Find rows that have been removed (present in the old file but not the new one)
    removed_data = old_data.merge(new_data, on=list(old_data.columns), how='left', indicator=True)
    removed_data = removed_data[removed_data['_merge'] == 'left_only'].drop(columns='_merge')

    # Find rows that have been added (present in the new file but not the old one)
    added_data = new_data.merge(old_data, on=list(old_data.columns), how='left', indicator=True)
    added_data = added_data[added_data['_merge'] == 'left_only'].drop(columns='_merge')

    # Return the changed data
    return removed_data, added_data
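# Example (a sketch with hypothetical file names): if old.csv contains rows
# {A, B} and new.csv contains rows {B, C}, then
#
#     removed, added = compare_csv_files("old.csv", "new.csv")
#
# leaves removed holding {A} and added holding {C}.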
def update_db(added_data, removed_data):
    # Connect to the database
    conn = sqlite3.connect("data/highlights.db")
    c = conn.cursor()

    # Remove the removed data from the database (matched by highlight text)
    for _, row in removed_data.iterrows():
        c.execute("DELETE FROM highlights WHERE highlight = :highlight", {"highlight": row["Highlight"]})

    # Add the new data to the database
    for _, row in added_data.iterrows():
        c.execute("""INSERT INTO highlights
                     (highlight, book, author, note, location, location_type)
                     VALUES (?, ?, ?, ?, ?, ?)""", (
            row["Highlight"],
            row["Book Title"],
            row["Book Author"],
            row["Note"],
            row["Location"],
            row["Location Type"]
        ))

    # Commit the changes
    conn.commit()
    conn.close()

def create_db(added_data: pd.DataFrame):
    # Make sure the data directory exists; sqlite3 creates the db file itself
    os.makedirs("data", exist_ok=True)
    # Connect to the database
    conn = sqlite3.connect("data/highlights.db")
    c = conn.cursor()

    # Create the table
    c.execute("""CREATE TABLE IF NOT EXISTS highlights
                 (id INTEGER PRIMARY KEY,
                  highlight TEXT,
                  book TEXT,
                  author TEXT,
                  note TEXT,
                  location TEXT,
                  location_type TEXT,
                  created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                  updated_at DATETIME DEFAULT CURRENT_TIMESTAMP)""")
    conn.commit()

    for _, row in added_data.iterrows():
        c.execute("""INSERT INTO highlights
                     (highlight, book, author, note, location, location_type)
                     VALUES (?, ?, ?, ?, ?, ?)""", (
            row["Highlight"],
            row["Book Title"],
            row["Book Author"],
            row["Note"],
            row["Location"],
            row["Location Type"]
        ))

    # Commit the changes
    conn.commit()
    conn.close()

def get_most_recently_added_data_from_db(since: str):
    # Connect to the database
    conn = sqlite3.connect("data/highlights.db")
    c = conn.cursor()
    # Get the most recently added data, using a placeholder for the timestamp
    c.execute(
        "SELECT id, highlight, book, author, note, location, location_type "
        "FROM highlights WHERE created_at >= ?",
        (since,)
    )
    data = c.fetchall()
    df = pd.DataFrame(data, columns=['id', 'highlight', 'book', 'author', 'note', 'location', 'location_type'])
    conn.close()
    return df
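# Example (a sketch; the timestamp is an arbitrary illustration):
#
#     df = get_most_recently_added_data_from_db(since="2024-01-01 00:00:00")
#     print(f"{len(df)} highlights added since then")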
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file", help="Specify the file to upload.", required=True)
    args = parser.parse_args()
    filepath = args.file
    # Get the most recently updated CSV file before uploading the new one
    most_recent_filepath = get_most_recently_updated_highlights_csv_file()
    # Upload the new file
    new_filepath = create_new_file(filepath)
    if most_recent_filepath:
        # Compare the new file to the most recently updated file
        removed_data, added_data = compare_csv_files(most_recent_filepath, new_filepath)
        # Process the data that changed
        update_db(added_data, removed_data)
    else:
        # No prior data, so create a new database with the data
        added_data = pd.read_csv(new_filepath)
        create_db(added_data)
    # Get the most recently changed data, then get and save the embeddings for it.
    # The cutoff is recovered from the suffix create_new_file appended to the file
    # name; sqlite's CURRENT_TIMESTAMP is in UTC, so convert the cutoff to UTC too.
    current_time = new_filepath.split('_')[-1].split('.')[0]
    timestamp = int(current_time)
    dt = datetime.datetime.fromtimestamp(timestamp / 1000, tz=datetime.timezone.utc)
    date = dt.strftime('%Y-%m-%d %H:%M:%S')
    df = get_most_recently_added_data_from_db(since=date)
    df = load_dataset_for_embeddings(df=df)
    df = get_embeddings(df)
    # Save the embeddings to a CSV file, just in case
    save_embeddings(df, 'data/embeddings/book_notes_w_embeddings.csv')
    # Upload embeddings to chroma
    add_embeddings_to_chroma(df)
--------------------------------------------------------------------------------