├── .env.example ├── .gitignore ├── LICENSE ├── README.md ├── app.py ├── food_review.csv └── requirements.txt /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY=YOUR_OPENAI_API_KEY 2 | NOMIC_TOKEN=YOUR_NOMIC_TOKEN -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Daniel Avila 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Visual Embeddings 2 | This Python script uses the OpenAI API to analyze Amazon food reviews by encoding them into embeddings and conducting semantic similarity searches. 3 | 4 | Check the full article here: [Medium](https://medium.com) 5 | 6 | ## Code Overview 7 | 8 | The code begins by importing necessary libraries and loading environmental variables that store sensitive information. It then defines a `main()` function, which is later invoked to run the program. 9 | 10 | In `main()`, we first read a CSV file containing Amazon food reviews into a pandas dataframe. We then set the webpage title and icon using Streamlit, another necessary library for this script. 11 | 12 | Using Streamlit forms, the code collects a search sentence and, upon form submission, applies an embedding function from OpenAI to the sentence to create a vector representation of its semantics - a semantic search term. 13 | 14 | The high-dimensional embeddings are then visualized in 3D using PCA, with each review colored by its distance to the search term. 15 | 16 | Upon calculation of cosine similarities between the search term vector and individual review embeddings, the 20 reviews with the highest similarities are displayed, showcasing the power of semantic similarity search. 17 | 18 | Lastly, the script maps embeddings to the `nomic` database, allowing the program to interact with the `nomic` embedding mapping and storage tool. 19 | 20 | ## Requirements 21 | 22 | - Python 3 23 | - pandas 24 | - NumPy 25 | - scikit-learn 26 | - Streamlit 27 | - dotenv 28 | - Nomic 29 | - Openai 30 | 31 | ## How to Use 32 | 1. Install all dependencies `pip install -r requirements.txt` 33 | 2. Set up environment variables for the OPENAI_API_KEY & NOMIC_TOKEN for authentication 34 | 3. 
Run `streamlit run app.py` 35 | 4. Open a web browser to the provided localhost URL 36 | 5. Interact with the visualizations and use the semantic search functionality on the webpage 37 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import openai 3 | from openai.embeddings_utils import get_embedding, cosine_similarity 4 | from sklearn.manifold import TSNE 5 | from sklearn.decomposition import PCA 6 | import streamlit as st 7 | from matplotlib import cm 8 | import pandas as pd 9 | import numpy as np 10 | from ast import literal_eval 11 | import nomic 12 | from nomic import atlas 13 | import matplotlib.pyplot as plt 14 | import matplotlib 15 | import numpy as np 16 | 17 | from dotenv import load_dotenv 18 | load_dotenv() 19 | MODEL = "text-embedding-ada-002" 20 | st.set_page_config(page_title="Visual Embeddings and Similarity", page_icon="🤖", layout="wide") 21 | 22 | def main(): 23 | # sidebar with openai api key and nomic token 24 | st.sidebar.title("Credentials") 25 | st.sidebar.write("OpenAI API Key") 26 | openai_api_key = st.sidebar.text_input("Enter your OpenAI API Key", value=os.getenv("OPENAI_API_KEY")) 27 | st.sidebar.write("Nomic Token") 28 | nomic_token = st.sidebar.text_input("Enter your Nomic Token", value=os.getenv("NOMIC_TOKEN")) 29 | 30 | openai.api_key = os.getenv("OPENAI_API_KEY") 31 | nomic.login(os.getenv("NOMIC_TOKEN")) 32 | 33 | # get data 34 | datafile_path = "food_review.csv" 35 | # show only columns ProductId, Score, Summary, Text, n_tokens, embedding 36 | df = pd.read_csv(datafile_path, usecols=[0,1,3, 5, 7, 8]) 37 | st.title("Visual Embeddings and Similarity") 38 | st.write("Amazon food reviews dataset") 39 | st.write(df) 40 | 41 | st.write("Search similarity") 42 | form = st.form('Embeddings') 43 | question = form.text_input("Enter a sentence to search for semantic similarity", value="I love this 
soup") 44 | btn = form.form_submit_button("Run") 45 | 46 | if btn: 47 | # si openai api key no es none y nomic token no es none 48 | if openai_api_key is not None and nomic_token is not None: 49 | with st.spinner("Loading"): 50 | search_term_vector = get_embedding(question, engine="text-embedding-ada-002") 51 | search_term_vector = np.array(search_term_vector) 52 | 53 | matrix = np.array(df.embedding.apply(literal_eval).to_list()) 54 | 55 | # Compute distances to the search_term_vector 56 | distances = np.linalg.norm(matrix - search_term_vector, axis=1) 57 | df['distance_to_search_term'] = distances 58 | 59 | # Normalize the distances to range 0-1 for coloring 60 | df['normalized_distance'] = (df['distance_to_search_term'] - df['distance_to_search_term'].min()) / (df['distance_to_search_term'].max() - df['distance_to_search_term'].min()) 61 | 62 | # 2D visualization 63 | # Create a t-SNE model and transform the data 64 | # tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200) 65 | # vis_dims = tsne.fit_transform(matrix) 66 | 67 | # colors = cm.rainbow(df['normalized_distance']) 68 | # x = [x for x,y in vis_dims] 69 | # y = [y for x,y in vis_dims] 70 | 71 | # # Plot points with colors corresponding to their distance from search_term_vector 72 | # plt2.scatter(x, y, color=colors, alpha=0.3) 73 | 74 | # # Set title and plot 75 | # plt2.title("Similarity to search term visualized in language using t-SNE") 76 | 77 | #3D visualization PCA 78 | pca = PCA(n_components=3) 79 | vis_dims_pca = pca.fit_transform(matrix) 80 | question_vis = vis_dims_pca.tolist() 81 | 82 | fig = plt.figure(figsize=(10, 5)) 83 | ax = fig.add_subplot(projection='3d') 84 | cmap = plt.get_cmap("tab20") 85 | 86 | # Plot question_vis 87 | ax.scatter(question_vis[0][0], question_vis[0][1], question_vis[0][2], color=cmap(0), s=100, label="Search term") 88 | # Plot other points 89 | for i, point in enumerate(vis_dims_pca): 90 | ax.scatter(point[0], point[1], 
point[2], color=cmap(df['normalized_distance'][i]), alpha=0.3) 91 | ax.set_title("Similarity to search term visualized in language using PCA") 92 | ax.legend() 93 | plt.show() 94 | 95 | 96 | # Convert 'embedding' column to numpy arrays 97 | df['embedding'] = df['embedding'].apply(lambda x: np.array(literal_eval(x))) 98 | df["similarities"] = df['embedding'].apply(lambda x: cosine_similarity(x, search_term_vector)) 99 | 100 | st.title("Visual embedding of the search term and the 20 most similar sentences") 101 | #create two columns 102 | col1, col2 = st.columns(2) 103 | #col1 104 | #show st.plot in col1 105 | col1.pyplot(plt) 106 | 107 | #col2 108 | # #show st.plot in col2 109 | # col2.pyplot(fig) 110 | 111 | #col3 112 | #show df in col2, but only the columns, text and similarities 113 | col2.write(df[['similarities','Text']].sort_values("similarities", ascending=False).head(20)) 114 | 115 | # Convert to a list of lists of floats 116 | st.title("Nomic mappping embeddings") 117 | embeddings = np.array(df.embedding.to_list()) 118 | df = df.drop('embedding', axis=1) 119 | df = df.rename(columns={'Unnamed: 0': 'id'}) 120 | 121 | data = df.to_dict('records') 122 | project = atlas.map_embeddings(embeddings=embeddings, data=data, 123 | id_field='id', 124 | colorable_fields=['Score']) 125 | # Convert project to a string before getting link information 126 | project_str = str(project) 127 | 128 | st.text(project_str) 129 | # Split the project string at the colon and take the second part (index 1) 130 | project_link = project_str.split(':', 1)[1] 131 | 132 | # Trim any leading or trailing whitespace 133 | project_link = project_link.strip() 134 | 135 | # Crea un iframe con la URL y muéstralo con Streamlit 136 | st.markdown(f'', unsafe_allow_html=True) 137 | else: 138 | st.write("Please enter your OpenAI API Key and Nomic Token in the sidebar") 139 | if __name__ == "__main__": 140 | main() -------------------------------------------------------------------------------- 
/requirements.txt: -------------------------------------------------------------------------------- 1 | openai 2 | streamlit 3 | pandas 4 | numpy 5 | nomic 6 | matplotlib 7 | plotly 8 | scipy 9 | scikit-learn 10 | python-dotenv --------------------------------------------------------------------------------