├── .gitignore
├── README.md
├── data
    └── blog.json
├── graph.ipynb
├── main.py
├── output
    └── blog_embedding.png
├── poetry.lock
└── pyproject.toml


/.gitignore:
--------------------------------------------------------------------------------
1 | .venv
2 | .DS_Store
3 | __pycache__


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Blog Embeddings
2 | 
3 | An analysis of two years of daily blogging on [matt-rickard.com](https://matt-rickard.com). 
4 | 
5 | 1. I embedded all my posts using BERT (a transformers model pre-trained on a large corpus of English data). BERT uses 768-dimensional vectors.
6 | 
7 | 2. Then I ran them through t-SNE (t-distributed stochastic neighbor embedding, a fancy way to visualize high-dimensional data by translating them to two dimensions. 
8 | 
9 | 3. Finally, I separated the two-dimensional space into equally sized bins and asked GPT-3.5 to develop a category name for each set of post titles. 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import time
  4 | 
  5 | import numpy as np
  6 | import openai
  7 | import pandas as pd
  8 | import torch
  9 | from sklearn.manifold import TSNE
 10 | from transformers import AutoModel, AutoTokenizer
 11 | 
 12 | 
 13 | def read_json_data(file_path):
 14 |     with open(file_path) as f:
 15 |         data = json.load(f)
 16 |     return data
 17 | 
 18 | 
 19 | def generate_category(titles):
 20 |     openai.api_key = os.getenv("OPENAI_API_KEY")
 21 | 
 22 |     titles_str = ", ".join(titles)
 23 | 
 24 |     messages = [
 25 |         {"role": "system", "content": "You are a helpful assistant."},
 26 |         {"role": "user", "content": f"""These are the titles: {titles_str}. What is the overarching topic of these titles? 
 27 | - Be as specific as possible. 
 28 | - Only use a few words. 
 29 | - You are writing for a highly technical audience. 
 30 | - Only respond with the topic and nothing else."""},
 31 |     ]
 32 | 
 33 |     while True:
 34 |         try:
 35 |             # Use the chat models
 36 |             response = openai.ChatCompletion.create(
 37 |                 model="gpt-3.5-turbo",
 38 |                 messages=messages,
 39 |                 timeout=5,  
 40 |             )
 41 | 
 42 |             category = response['choices'][0]['message']['content']
 43 |             print(f"Category: {category}")
 44 |             return category
 45 |         except openai.error.OpenAIError:
 46 |             print("An error occurred. Retrying...")
 47 |             time.sleep(2)
 48 | 
 49 | 
 50 | def embedding_metadata(embeddings_2d, titles, grid_size=20):
 51 |     df = pd.DataFrame(embeddings_2d, columns=["x", "y"])
 52 |     df["title"] = titles
 53 | 
 54 |     x_min, x_max = (df["x"].min() // grid_size) * grid_size, (df["x"].max() // grid_size + 1) * grid_size
 55 |     y_min, y_max = (df["y"].min() // grid_size) * grid_size, (df["y"].max() // grid_size + 1) * grid_size
 56 | 
 57 |     x_bins = np.arange(x_min, x_max, grid_size)
 58 |     y_bins = np.arange(y_min, y_max, grid_size)
 59 | 
 60 |     df["x_bin"] = np.digitize(df["x"], x_bins)
 61 |     df["y_bin"] = np.digitize(df["y"], y_bins)
 62 | 
 63 |     groups = df.groupby(["x_bin", "y_bin"])["title"].apply(list)
 64 | 
 65 |     ret = []
 66 |     for (x_bin, y_bin), titles in groups.items():
 67 |         x_bounds = (x_bins[x_bin-1], x_bins[x_bin-1] + grid_size)
 68 |         y_bounds = (y_bins[y_bin-1], y_bins[y_bin-1] + grid_size)
 69 |         category = generate_category(titles)
 70 | 
 71 |         ret.append({
 72 |             "x_bounds": x_bounds,
 73 |             "y_bounds": y_bounds,
 74 |             "titles": titles,
 75 |             "category": category
 76 |         })
 77 | 
 78 |     return ret
 79 | 
 80 | def count_elements(embeddings_2d):
 81 |     x_values, y_values = embeddings_2d[:,0], embeddings_2d[:,1]
 82 | 
 83 |     x_min, x_max = np.min(x_values), np.max(x_values)
 84 |     y_min, y_max = np.min(y_values), np.max(y_values)
 85 | 
 86 |     x_mid = (x_max + x_min) / 2
 87 |     y_mid = (y_max + y_min) / 2
 88 | 
 89 |     quadrants = {
 90 |         "Q1": {"x": (x_mid, x_max), "y": (y_mid, y_max)},
 91 |         "Q2": {"x": (x_min, x_mid), "y": (y_mid, y_max)},
 92 |         "Q3": {"x": (x_min, x_mid), "y": (y_min, y_mid)},
 93 |         "Q4": {"x": (x_mid, x_max), "y": (y_min, y_mid)}
 94 |     }
 95 | 
 96 |     counts = {q: 0 for q in quadrants.keys()}
 97 | 
 98 |     for x, y in embeddings_2d:
 99 |         for q, bounds in quadrants.items():
100 |             if bounds["x"][0] <= x <= bounds["x"][1] and bounds["y"][0] <= y <= bounds["y"][1]:
101 |                 counts[q] += 1
102 | 
103 |     return counts
104 | 
105 | def filter_posts(posts):
106 |     filtered_posts = [post for post in posts if post['status'] == 'published' and post['plaintext'] is not None and post['title'] is not None]
107 |     return filtered_posts
108 | 
109 | def extract_titles_texts(filtered_posts):
110 |     titles = [post['title'] for post in filtered_posts]
111 |     texts = [post['plaintext'] for post in filtered_posts]
112 |     return titles, texts
113 | 
114 | def get_data():
115 |     data = read_json_data('data/blog.json')
116 |     posts = data['db'][0]['data']['posts']
117 |     filtered_posts = filter_posts(posts)
118 |     titles, texts = extract_titles_texts(filtered_posts)
119 |     return titles, texts
120 | 
121 | def load_model_and_tokenizer(model_name="bert-base-uncased"):
122 |     tokenizer = AutoTokenizer.from_pretrained(model_name)
123 |     model = AutoModel.from_pretrained(model_name)
124 |     return tokenizer, model
125 | 
126 | def get_embeddings(tokenizer, model, texts):
127 |     inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
128 |     with torch.no_grad():
129 |         outputs = model(**inputs)
130 |     embeddings = outputs.last_hidden_state.mean(dim=1).numpy()
131 |     return embeddings
132 | 
133 | def reduce_dimensions(embeddings, n_components=2):
134 |     tsne = TSNE(n_components=n_components, random_state=0)
135 |     embeddings_2d = tsne.fit_transform(embeddings)
136 |     return embeddings_2d
137 | 
138 | def write_embeddings_to_json(embeddings_2d, titles, json_file_path):
139 |     data = []
140 |     for i, title in enumerate(titles):
141 |         data.append({
142 |             'title': title,
143 |             'x': embeddings_2d[i, 0].tolist(), 
144 |             'y': embeddings_2d[i, 1].tolist()
145 |         })
146 | 
147 |     with open(json_file_path, 'w') as json_file:
148 |         json.dump(data, json_file)


--------------------------------------------------------------------------------
/output/blog_embedding.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/r2d4/blog-embeddings/33fb089fc6686cc0291d39a8028328eb8fc710be/output/blog_embedding.png


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "2yearblog"
 3 | version = "0.1.0"
 4 | description = ""
 5 | authors = ["Matt Rickard <git@matt-rickard.com>"]
 6 | readme = "README.md"
 7 | 
 8 | [tool.poetry.dependencies]
 9 | python = ">=3.8,<3.12"
10 | transformers = "^4.29.2"
11 | torch = "^2.0.1"
12 | matplotlib = "^3.7.1"
13 | scikit-learn = "^1.2.2"
14 | mplcursors = "^0.5.2"
15 | plotly = "^5.14.1"
16 | pandas = "^2.0.1"
17 | nbformat = "^5.8.0"
18 | openai = "^0.27.7"
19 | 
20 | 
21 | [tool.poetry.group.dev.dependencies]
22 | ipykernel = "^6.23.1"
23 | 
24 | [build-system]
25 | requires = ["poetry-core"]
26 | build-backend = "poetry.core.masonry.api"
27 | 


--------------------------------------------------------------------------------