├── .env.example
├── .gitignore
├── LICENSE
├── README.md
├── embeddings.py
├── models.py
├── prompt.md
├── qa.py
├── requirements.txt
└── upload_data.py

/.env.example:
--------------------------------------------------------------------------------
OPENAI_API_KEY=
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.env
__pycache__/
logs/
*.log
data/
.chroma/
.mypy_cache/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Alex Meyer

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Readwise Chat

This is a repository for building a chatbot on top of your Readwise highlights. It's a bit experimental and there are a number of ways to potentially improve it, but it's a start.

## How to use

Before doing anything else, you'll need a pro [Readwise account](https://readwise.io/) so that you can export your highlights via CSV.

Assuming you have an account, go to the [Exports Page](https://readwise.io/export) in your Readwise dashboard. From there, go to the CSV Export section and click the "Export" button. This will download a CSV file to your computer. Remember where this file is saved.

Next, install all the dependencies:

```
pip install -r requirements.txt
```

Then, create your own `.env` file by copying the `.env.example` file and filling in the values:

```
cp .env.example .env
```

Next, upload the CSV file you downloaded from Readwise by running:

```
python upload_data.py -f /path/to/your/csv/file
```

If this is your first time running this and you have a lot of highlights, this step could take a while.

Once it's finished running, you can start the chatbot by running:

```
python qa.py
```

This will kick off a chat with the bot. Based on what you ask it, it should pull in references to your highlights as you go, updating those references based on where the conversation heads. However, it's not perfect and it may not always work the way you'd expect.

For best results, try starting the conversation with a specific question about a particular book and topic. The more specific you are, the better the results should be.

## How to improve

There are a few different ways you could improve the bot, including:
- Changing up the prompt; prompt engineering goes a long way
- Changing the strategy for pulling embeddings during the conversation; I haven't yet found an approach that both feels like a natural conversation and keeps the bot's supporting data up to date
- Giving the user options for which books the bot has at its disposal before starting the chat (a rough sketch follows this list)
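
As a concrete example of that last idea, here's a rough sketch of a pre-chat book picker. It's hypothetical and not part of this repo: it assumes the `data/highlights.db` database that `upload_data.py` creates, and the `choose_books` helper name is made up for illustration:

```python
# A minimal sketch of a pre-chat book picker (hypothetical, not part of this repo).
# Assumes data/highlights.db exists, i.e. upload_data.py has already been run.
import sqlite3

def choose_books() -> list[str]:
    conn = sqlite3.connect("data/highlights.db")
    titles = [row[0] for row in conn.execute("SELECT DISTINCT book FROM highlights ORDER BY book")]
    conn.close()
    for i, title in enumerate(titles, start=1):
        print(f"{i}. {title}")
    picks = input("Pick book numbers (comma-separated), or press Enter for all: ").strip()
    if not picks:
        return titles
    return [titles[int(i.strip()) - 1] for i in picks.split(",")]
```

The chosen titles could then be folded into the retrieval query, or used as a `where` filter if book metadata were added to the chroma collection.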
And I'm sure there are a ton of other ways you could improve it. If you have any ideas, feel free to open an issue or a PR.
--------------------------------------------------------------------------------
/embeddings.py:
--------------------------------------------------------------------------------
import os
from ast import literal_eval
from typing import Iterator

import chromadb
import numpy as np
import pandas as pd
import tiktoken
from chromadb.utils import embedding_functions
from dotenv import load_dotenv
from openai import OpenAI
from tenacity import retry, wait_random_exponential, stop_after_attempt

# Load environment variables before creating the OpenAI client so that
# OPENAI_API_KEY is available when the client reads it
load_dotenv()

client = OpenAI()
chroma_client = chromadb.PersistentClient(path="data/chroma")

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
EMBEDDINGS_MODEL = "text-embedding-3-small"
OPENAI_EMBEDDING_ENCODING = "cl100k_base"  # this is the encoding for text-embedding-3-small
MAX_EMBEDDING_TOKENS = 8191  # the maximum for text-embedding-3-small is 8191
EMBEDDINGS_INDEX_NAME = "book-notes"
BATCH_SIZE = 100

# Models a simple batch generator that makes chunks out of an input DataFrame
class BatchGenerator:
    def __init__(self, batch_size: int = 10) -> None:
        self.batch_size = batch_size

    # Makes chunks out of an input DataFrame
    def to_batches(self, df: pd.DataFrame) -> Iterator[pd.DataFrame]:
        splits = self.splits_num(df.shape[0])
        if splits <= 1:
            yield df
        else:
            for chunk in np.array_split(df, splits):
                yield chunk

    # Determines how many chunks the DataFrame should be split into
    def splits_num(self, elements: int) -> int:
        return round(elements / self.batch_size)

    __call__ = to_batches

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text: str, model=EMBEDDINGS_MODEL) -> list[float]:
    text = text.replace("\n", " ")  # OpenAI says removing newlines leads to better performance
    response = client.embeddings.create(
        input=text,
        model=model
    )
    return response.data[0].embedding
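# Example usage (a sketch; assumes OPENAI_API_KEY is set in your .env):
#
#     vec = get_embedding("The map is not the territory")
#     len(vec)  # text-embedding-3-small returns a 1536-dimensional vector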
def get_embeddings(df: pd.DataFrame):
    print('Getting embeddings...')
    encoding = tiktoken.get_encoding(OPENAI_EMBEDDING_ENCODING)
    # Omit any rows that are too long to embed
    df["n_tokens"] = df.combined.apply(lambda x: len(encoding.encode(x)))
    df = df[df.n_tokens <= MAX_EMBEDDING_TOKENS].copy()
    df["embedding"] = df.combined.apply(lambda x: get_embedding(x, model=EMBEDDINGS_MODEL))
    print('Done getting embeddings.')
    return df

def compute_embeddings(df: pd.DataFrame):
    return {
        idx: get_embedding(r.combined) for idx, r in df.iterrows()
    }

def load_embeddings(filepath: str = 'data/embeddings/book_notes_w_embeddings.csv'):
    """Load the dataset with the embeddings from a CSV file."""
    df = pd.read_csv(filepath)
    # Convert embeddings back to lists (they are stored as strings in the CSV)
    df['embedding'] = df.embedding.apply(literal_eval)
    # Convert id to string
    df['id'] = df['id'].apply(str)
    return df

def load_dataset_for_embeddings(df: pd.DataFrame):
    """Configure the dataset for embeddings."""
    try:
        # Keep only the columns we need
        df = df[['id', 'highlight', 'book', 'author', 'note', 'location', 'location_type']].copy()
        # Append each row's note only when that row actually has one
        note_part = df['note'].fillna('').astype(str).str.strip().apply(
            lambda n: f"; Note: {n}" if n else ''
        )
        df['combined'] = (
            "Title: " + df['book'].str.strip().fillna('') + "; " +
            "Author: " + df['author'].str.strip().fillna('') + "; " +
            "Highlight: " + df['highlight'].str.strip().fillna('') +
            note_part
        )
        # Convert id to string
        df['id'] = df['id'].apply(str)
        return df
    except Exception as e:
        print(f"Error configuring dataset for embeddings: {e}")
        return df

def save_embeddings(df: pd.DataFrame, output_path: str = 'data/embeddings/book_notes_w_embeddings.csv'):
    """Save the dataset with the embeddings to a CSV file."""
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    if os.path.exists(output_path):
        # Append the new data to the existing data
        existing_df = pd.read_csv(output_path)
        df = pd.concat([existing_df, df], ignore_index=True)
    df.to_csv(output_path, index=False)
    print(f"Saved embeddings to {output_path}.")

# Using chromadb for embeddings search
def add_embeddings_to_chroma(df: pd.DataFrame):
    print(f'Adding {len(df)} embeddings to chromadb...')
    ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=OPENAI_API_KEY,
        model_name=EMBEDDINGS_MODEL
    )
    collection = chroma_client.get_or_create_collection(
        name=EMBEDDINGS_INDEX_NAME,
        embedding_function=ef
    )

    # Create a batch generator and add each batch to the collection
    df_batcher = BatchGenerator(BATCH_SIZE)
    for batch_df in df_batcher(df):
        collection.add(
            embeddings=batch_df['embedding'].tolist(),
            documents=batch_df['combined'].tolist(),
            ids=batch_df['id'].tolist()
        )
    print('Done adding to chromadb.')

def query_embeddings_chroma(query: str, n_results: int = 5):
    query_embedding = get_embedding(query)
    ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=OPENAI_API_KEY,
        model_name=EMBEDDINGS_MODEL
    )
    collection = chroma_client.get_collection(
        name=EMBEDDINGS_INDEX_NAME,
        embedding_function=ef
    )
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results
    )
    ids = results["ids"][0]
    distances = results["distances"][0]
    # Pair each id with its distance; a smaller distance means a closer match,
    # so sort ascending to put the most relevant documents first
    relevant_docs = sorted(zip(distances, ids))
    return relevant_docs
--------------------------------------------------------------------------------
/models.py:
--------------------------------------------------------------------------------
from dataclasses import dataclass
from typing import Optional

@dataclass(frozen=True)
class Message:
    """A single chat message: a role plus optional content."""
    role: str
    content: Optional[str] = None

    def render(self):
        result = self.role + ":"
        if self.content is not None:
            result += " " + self.content
        return result
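# Example:
#
#     msg = Message(role="user", content="What does the author say about habits?")
#     msg.render()  # -> 'user: What does the author say about habits?'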
--------------------------------------------------------------------------------
/prompt.md:
--------------------------------------------------------------------------------
You're a helpful assistant that answers questions based on relevant information from book highlights and notes. Your job is to provide answers to user questions by using the provided book highlights. Your answers should be concise and provide both the answer to the question and the source of the information you used to answer it. They should also be in the third person.

If the question isn't clear or you don't understand it, you can ask the user for clarification. If you cannot answer the user's question based on the information provided, say "Sorry, but I can't answer that question".

Here's the relevant information you may use to answer the user's question:
$relevant_information
--------------------------------------------------------------------------------
/qa.py:
--------------------------------------------------------------------------------
import logging
import os
import sqlite3
from dataclasses import asdict

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

from embeddings import query_embeddings_chroma
from models import Message

# Load environment variables before creating the OpenAI client so that
# OPENAI_API_KEY is available when the client reads it
load_dotenv()
client = OpenAI()

# Make sure the log directory exists before configuring file-based logging
os.makedirs('logs', exist_ok=True)
logging.basicConfig(filename='logs/qa.log', level=logging.INFO)

CHAT_MODEL = "gpt-4o"
MODEL_TEMPERATURE = 0.0

def stream_gpt_response(prompt: str, messages: list[Message]):
    """Streams the chat model's response to the given prompt and conversation."""
    conversation_messages = [{"role": "system", "content": prompt}]
    conversation_messages += [asdict(message) for message in messages]
    response = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=conversation_messages,
        temperature=MODEL_TEMPERATURE,
        stream=True
    )
    return response

def ask_gpt_chat(prompt: str, messages: list[Message]):
    """Returns the chat model's complete (non-streamed) response to the given prompt."""
    conversation_messages = [{"role": "system", "content": prompt}]
    conversation_messages += [asdict(message) for message in messages]
    response = client.chat.completions.create(
        model=CHAT_MODEL,
        messages=conversation_messages,
        temperature=MODEL_TEMPERATURE
    )
    return response.choices[0].message.content

def get_data_for_ids(ids: list) -> pd.DataFrame:
    # Connect to the database
    conn = sqlite3.connect("data/highlights.db")
    c = conn.cursor()
    # Get the data for the given ids, using placeholders rather than
    # interpolating the ids directly into the SQL
    placeholders = ",".join("?" * len(ids))
    c.execute(
        "SELECT id, highlight, book, author, note, location, location_type "
        f"FROM highlights WHERE id IN ({placeholders})",
        ids
    )
    data = c.fetchall()
    df = pd.DataFrame(data, columns=['id', 'highlight', 'book', 'author', 'note', 'location', 'location_type'])
    conn.close()
    return df
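# Example (a sketch; assumes data/highlights.db was created by upload_data.py
# and that these ids exist):
#
#     df = get_data_for_ids(["1", "2", "3"])
#     print(df[["book", "highlight"]])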
def setup_prompt(relevant_docs) -> str:
    """Builds the system prompt by filling in the relevant highlights."""
    formatted_docs = []
    relevant_data = get_data_for_ids(ids=[doc[1] for doc in relevant_docs])
    for _, row in relevant_data.iterrows():
        title = row['book']
        highlight = row['highlight']
        formatted_string = f"Title: {title}\n"
        if pd.notna(row['location_type']) and pd.notna(row['location']):
            location_type = row['location_type']
            location_value = row['location']
            location_string = f"{location_type}: {location_value}"
            formatted_string += f"{location_string}\n"
        formatted_string += f"Highlight: {highlight}\n"
        if pd.notna(row['note']):
            note = row['note']
            note_string = f"My Notes: {note}"
            formatted_string += f"{note_string}\n"
        formatted_docs.append(formatted_string)

    with open('prompt.md') as f:
        prompt = f.read()
    prompt = prompt.replace("$relevant_information", "\n".join(formatted_docs))

    return prompt

if __name__ == "__main__":
    conversation_messages = []
    while (user_input := input('You: ').strip()) != "":
        relevant_docs = query_embeddings_chroma(query=user_input, n_results=10)
        prompt = setup_prompt(relevant_docs)
        conversation_messages.append(Message(role="user", content=user_input))
        answer = stream_gpt_response(prompt, conversation_messages)
        print('\nBot: ')
        complete_answer: list[str] = []
        for chunk in answer:
            # Some stream events arrive without choices, so guard before indexing
            if not chunk.choices:
                continue
            answer_text = chunk.choices[0].delta.content or ""
            print(answer_text, end='', flush=True)
            complete_answer.append(answer_text)

        print('\n')
        complete_answer_string = ''.join(complete_answer)
        conversation_messages.append(Message(role="assistant", content=complete_answer_string))
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
chromadb==0.5.0
numpy==1.24.2
openai==1.30.3
pandas==1.5.3
python-dotenv==0.21.1
tenacity==8.2.3
tiktoken==0.3.1
--------------------------------------------------------------------------------
/upload_data.py:
--------------------------------------------------------------------------------
import argparse
import datetime
import glob
import os
import shutil
import sqlite3
import time

import pandas as pd

from embeddings import (
    get_embeddings,
    save_embeddings,
    load_dataset_for_embeddings,
    add_embeddings_to_chroma,
)

def create_new_file(filepath: str):
    current_time = str(int(time.time() * 1000))
    file_name = os.path.basename(filepath).split(".")[0]
    file_name = f"{file_name}_{current_time}.csv"
    new_file_path = f"data/highlights/{file_name}"
    # Make sure the destination directory exists, then copy the file there
    os.makedirs("data/highlights", exist_ok=True)
    shutil.copy(filepath, new_file_path)
    return new_file_path

def get_most_recently_updated_highlights_csv_file():
    # Get the most recent highlights data CSV file
    list_of_files = glob.glob('data/highlights/*.csv')
    if not list_of_files:
        return None
    latest_file = max(list_of_files, key=os.path.getctime)
    return latest_file

def compare_csv_files(old_file_path: str, new_file_path: str):
    # Read in both CSV files
    old_data = pd.read_csv(old_file_path)
    new_data = pd.read_csv(new_file_path)

    # Find rows that have been removed (present in the old file but not the new one)
    removed_data = old_data.merge(new_data, on=list(old_data.columns), how='left', indicator=True)
    removed_data = removed_data[removed_data['_merge'] == 'left_only'].drop(columns='_merge')

    # Find rows that have been added (present in the new file but not the old one)
    added_data = new_data.merge(old_data, on=list(old_data.columns), how='left', indicator=True)
    added_data = added_data[added_data['_merge'] == 'left_only'].drop(columns='_merge')

    # Return the changed data
    return removed_data, added_data
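# Example (a sketch with hypothetical file names): if old.csv contains rows
# {A, B} and new.csv contains rows {B, C}, then
#
#     removed, added = compare_csv_files("old.csv", "new.csv")
#
# leaves removed holding {A} and added holding {C}.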
def update_db(added_data, removed_data):
    # Connect to the database
    conn = sqlite3.connect("data/highlights.db")
    c = conn.cursor()

    # Remove the removed data from the database (matched by highlight text)
    for _, row in removed_data.iterrows():
        c.execute("DELETE FROM highlights WHERE highlight = :highlight", {"highlight": row["Highlight"]})

    # Add the new data to the database
    for _, row in added_data.iterrows():
        c.execute("""INSERT INTO highlights
                     (highlight, book, author, note, location, location_type)
                     VALUES (?, ?, ?, ?, ?, ?)""", (
            row["Highlight"],
            row["Book Title"],
            row["Book Author"],
            row["Note"],
            row["Location"],
            row["Location Type"]
        ))

    # Commit the changes
    conn.commit()
    conn.close()

def create_db(added_data: pd.DataFrame):
    # Make sure the data directory exists; sqlite3 creates the db file itself
    os.makedirs("data", exist_ok=True)
    # Connect to the database
    conn = sqlite3.connect("data/highlights.db")
    c = conn.cursor()

    # Create the table
    c.execute("""CREATE TABLE IF NOT EXISTS highlights
                 (id INTEGER PRIMARY KEY,
                  highlight TEXT,
                  book TEXT,
                  author TEXT,
                  note TEXT,
                  location TEXT,
                  location_type TEXT,
                  created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
                  updated_at DATETIME DEFAULT CURRENT_TIMESTAMP)""")
    conn.commit()

    for _, row in added_data.iterrows():
        c.execute("""INSERT INTO highlights
                     (highlight, book, author, note, location, location_type)
                     VALUES (?, ?, ?, ?, ?, ?)""", (
            row["Highlight"],
            row["Book Title"],
            row["Book Author"],
            row["Note"],
            row["Location"],
            row["Location Type"]
        ))

    # Commit the changes
    conn.commit()
    conn.close()

def get_most_recently_added_data_from_db(since: str):
    # Connect to the database
    conn = sqlite3.connect("data/highlights.db")
    c = conn.cursor()
    # Get the most recently added data, using a placeholder for the timestamp
    c.execute(
        "SELECT id, highlight, book, author, note, location, location_type "
        "FROM highlights WHERE created_at >= ?",
        (since,)
    )
    data = c.fetchall()
    df = pd.DataFrame(data, columns=['id', 'highlight', 'book', 'author', 'note', 'location', 'location_type'])
    conn.close()
    return df
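# Example (a sketch; the timestamp is an arbitrary illustration):
#
#     df = get_most_recently_added_data_from_db(since="2024-01-01 00:00:00")
#     print(f"{len(df)} highlights added since then")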
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file", help="Specify the file to upload.", required=True)
    args = parser.parse_args()
    filepath = args.file
    # Get the most recently updated CSV file before uploading the new one
    most_recent_filepath = get_most_recently_updated_highlights_csv_file()
    # Upload the new file
    new_filepath = create_new_file(filepath)
    if most_recent_filepath:
        # Compare the new file to the most recently updated file
        removed_data, added_data = compare_csv_files(most_recent_filepath, new_filepath)
        # Process the data that changed
        update_db(added_data, removed_data)
    else:
        # No prior data, so create a new database with the data
        added_data = pd.read_csv(new_filepath)
        create_db(added_data)
    # Get the most recently changed data, then get and save the embeddings for it.
    # The cutoff is recovered from the suffix create_new_file appended to the file
    # name; sqlite's CURRENT_TIMESTAMP is in UTC, so convert the cutoff to UTC too.
    current_time = new_filepath.split('_')[-1].split('.')[0]
    timestamp = int(current_time)
    dt = datetime.datetime.fromtimestamp(timestamp / 1000, tz=datetime.timezone.utc)
    date = dt.strftime('%Y-%m-%d %H:%M:%S')
    df = get_most_recently_added_data_from_db(since=date)
    df = load_dataset_for_embeddings(df=df)
    df = get_embeddings(df)
    # Save the embeddings to a CSV file, just in case
    save_embeddings(df, 'data/embeddings/book_notes_w_embeddings.csv')
    # Upload embeddings to chroma
    add_embeddings_to_chroma(df)
--------------------------------------------------------------------------------