├── .env.example ├── .gitignore ├── LICENSE ├── README.md ├── app.py ├── food_review.csv └── requirements.txt /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY=YOUR_OPENAI_API_KEY 2 | NOMIC_TOKEN=YOUR_NOMIC_TOKEN -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Daniel Avila 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Visual Embeddings 2 | This Python script uses the OpenAI API to analyze Amazon food reviews by encoding them into embeddings and conducting semantic similarity searches. 3 | 4 | Check the full article here: [Medium](https://medium.com) 5 | 6 | ## Code Overview 7 | 8 | The code begins by importing necessary libraries and loading environmental variables that store sensitive information. It then defines a `main()` function, which is later invoked to run the program. 9 | 10 | In `main()`, we first read a CSV file containing Amazon food reviews into a pandas dataframe. We then set the webpage title and icon using Streamlit, another necessary library for this script. 11 | 12 | Using Streamlit forms, the code collects a search sentence and, upon form submission, applies an embedding function from OpenAI to the sentence to create a vector representation of its semantics - a semantic search term. 13 | 14 | The high-dimensional embeddings are then visualized in 3D using PCA, with each review colored by its distance to the search term. 15 | 16 | Upon calculation of cosine similarities between the search term vector and individual review embeddings, the 20 reviews with the highest similarities are displayed, showcasing the power of semantic similarity search. 17 | 18 | Lastly, the script maps embeddings to the `nomic` database, allowing the program to interact with the `nomic` embedding mapping and storage tool. 19 | 20 | ## Requirements 21 | 22 | - Python 3 23 | - pandas 24 | - NumPy 25 | - scikit-learn 26 | - Streamlit 27 | - dotenv 28 | - Nomic 29 | - Openai 30 | 31 | ## How to Use 32 | 1. Install all dependencies `pip install -r requirements.txt` 33 | 2. Set up environment variables for the OPENAI_API_KEY & NOMIC_TOKEN for authentication 34 | 3. 
Run `streamlit run app.py` 35 | 4. Open a web browser to the provided localhost URL 36 | 5. Interact with the visualizations and use the semantic search functionality on the webpage 37 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import openai 3 | from openai.embeddings_utils import get_embedding, cosine_similarity 4 | from sklearn.manifold import TSNE 5 | from sklearn.decomposition import PCA 6 | import streamlit as st 7 | from matplotlib import cm 8 | import pandas as pd 9 | import numpy as np 10 | from ast import literal_eval 11 | import nomic 12 | from nomic import atlas 13 | import matplotlib.pyplot as plt 14 | import matplotlib 15 | import numpy as np 16 | 17 | from dotenv import load_dotenv 18 | load_dotenv() 19 | MODEL = "text-embedding-ada-002" 20 | st.set_page_config(page_title="Visual Embeddings and Similarity", page_icon="🤖", layout="wide") 21 | 22 | def main(): 23 | # sidebar with openai api key and nomic token 24 | st.sidebar.title("Credentials") 25 | st.sidebar.write("OpenAI API Key") 26 | openai_api_key = st.sidebar.text_input("Enter your OpenAI API Key", value=os.getenv("OPENAI_API_KEY")) 27 | st.sidebar.write("Nomic Token") 28 | nomic_token = st.sidebar.text_input("Enter your Nomic Token", value=os.getenv("NOMIC_TOKEN")) 29 | 30 | openai.api_key = os.getenv("OPENAI_API_KEY") 31 | nomic.login(os.getenv("NOMIC_TOKEN")) 32 | 33 | # get data 34 | datafile_path = "food_review.csv" 35 | # show only columns ProductId, Score, Summary, Text, n_tokens, embedding 36 | df = pd.read_csv(datafile_path, usecols=[0,1,3, 5, 7, 8]) 37 | st.title("Visual Embeddings and Similarity") 38 | st.write("Amazon food reviews dataset") 39 | st.write(df) 40 | 41 | st.write("Search similarity") 42 | form = st.form('Embeddings') 43 | question = form.text_input("Enter a sentence to search for semantic similarity", value="I love this 
soup") 44 | btn = form.form_submit_button("Run") 45 | 46 | if btn: 47 | # si openai api key no es none y nomic token no es none 48 | if openai_api_key is not None and nomic_token is not None: 49 | with st.spinner("Loading"): 50 | search_term_vector = get_embedding(question, engine="text-embedding-ada-002") 51 | search_term_vector = np.array(search_term_vector) 52 | 53 | matrix = np.array(df.embedding.apply(literal_eval).to_list()) 54 | 55 | # Compute distances to the search_term_vector 56 | distances = np.linalg.norm(matrix - search_term_vector, axis=1) 57 | df['distance_to_search_term'] = distances 58 | 59 | # Normalize the distances to range 0-1 for coloring 60 | df['normalized_distance'] = (df['distance_to_search_term'] - df['distance_to_search_term'].min()) / (df['distance_to_search_term'].max() - df['distance_to_search_term'].min()) 61 | 62 | # 2D visualization 63 | # Create a t-SNE model and transform the data 64 | # tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200) 65 | # vis_dims = tsne.fit_transform(matrix) 66 | 67 | # colors = cm.rainbow(df['normalized_distance']) 68 | # x = [x for x,y in vis_dims] 69 | # y = [y for x,y in vis_dims] 70 | 71 | # # Plot points with colors corresponding to their distance from search_term_vector 72 | # plt2.scatter(x, y, color=colors, alpha=0.3) 73 | 74 | # # Set title and plot 75 | # plt2.title("Similarity to search term visualized in language using t-SNE") 76 | 77 | #3D visualization PCA 78 | pca = PCA(n_components=3) 79 | vis_dims_pca = pca.fit_transform(matrix) 80 | question_vis = vis_dims_pca.tolist() 81 | 82 | fig = plt.figure(figsize=(10, 5)) 83 | ax = fig.add_subplot(projection='3d') 84 | cmap = plt.get_cmap("tab20") 85 | 86 | # Plot question_vis 87 | ax.scatter(question_vis[0][0], question_vis[0][1], question_vis[0][2], color=cmap(0), s=100, label="Search term") 88 | # Plot other points 89 | for i, point in enumerate(vis_dims_pca): 90 | ax.scatter(point[0], point[1], 
point[2], color=cmap(df['normalized_distance'][i]), alpha=0.3) 91 | ax.set_title("Similarity to search term visualized in language using PCA") 92 | ax.legend() 93 | plt.show() 94 | 95 | 96 | # Convert 'embedding' column to numpy arrays 97 | df['embedding'] = df['embedding'].apply(lambda x: np.array(literal_eval(x))) 98 | df["similarities"] = df['embedding'].apply(lambda x: cosine_similarity(x, search_term_vector)) 99 | 100 | st.title("Visual embedding of the search term and the 20 most similar sentences") 101 | #create two columns 102 | col1, col2 = st.columns(2) 103 | #col1 104 | #show st.plot in col1 105 | col1.pyplot(plt) 106 | 107 | #col2 108 | # #show st.plot in col2 109 | # col2.pyplot(fig) 110 | 111 | #col3 112 | #show df in col2, but only the columns, text and similarities 113 | col2.write(df[['similarities','Text']].sort_values("similarities", ascending=False).head(20)) 114 | 115 | # Convert to a list of lists of floats 116 | st.title("Nomic mappping embeddings") 117 | embeddings = np.array(df.embedding.to_list()) 118 | df = df.drop('embedding', axis=1) 119 | df = df.rename(columns={'Unnamed: 0': 'id'}) 120 | 121 | data = df.to_dict('records') 122 | project = atlas.map_embeddings(embeddings=embeddings, data=data, 123 | id_field='id', 124 | colorable_fields=['Score']) 125 | # Convert project to a string before getting link information 126 | project_str = str(project) 127 | 128 | st.text(project_str) 129 | # Split the project string at the colon and take the second part (index 1) 130 | project_link = project_str.split(':', 1)[1] 131 | 132 | # Trim any leading or trailing whitespace 133 | project_link = project_link.strip() 134 | 135 | # Crea un iframe con la URL y muéstralo con Streamlit 136 | st.markdown(f'', unsafe_allow_html=True) 137 | else: 138 | st.write("Please enter your OpenAI API Key and Nomic Token in the sidebar") 139 | if __name__ == "__main__": 140 | main() -------------------------------------------------------------------------------- 
/requirements.txt: -------------------------------------------------------------------------------- 1 | openai 2 | streamlit 3 | pandas 4 | numpy 5 | nomic 6 | matplotlib 7 | plotly 8 | scipy 9 | scikit-learn 10 | python-dotenv --------------------------------------------------------------------------------