├── .gitignore ├── README.md ├── data └── blog.json ├── graph.ipynb ├── main.py ├── output └── blog_embedding.png ├── poetry.lock └── pyproject.toml /.gitignore: -------------------------------------------------------------------------------- 1 | .venv 2 | .DS_Store 3 | __pycache__ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Blog Embeddings 2 | 3 | An analysis of two years of daily blogging on [matt-rickard.com](https://matt-rickard.com). 4 | 5 | 1. I embedded all my posts using BERT (a transformers model pre-trained on a large corpus of English data). BERT uses 768-dimensional vectors. 6 | 7 | 2. Then I ran them through t-SNE (t-distributed stochastic neighbor embedding, a fancy way to visualize high-dimensional data by translating them to two dimensions. 8 | 9 | 3. Finally, I separated the two-dimensional space into equally sized bins and asked GPT-3.5 to develop a category name for each set of post titles. -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | 5 | import numpy as np 6 | import openai 7 | import pandas as pd 8 | import torch 9 | from sklearn.manifold import TSNE 10 | from transformers import AutoModel, AutoTokenizer 11 | 12 | 13 | def read_json_data(file_path): 14 | with open(file_path) as f: 15 | data = json.load(f) 16 | return data 17 | 18 | 19 | def generate_category(titles): 20 | openai.api_key = os.getenv("OPENAI_API_KEY") 21 | 22 | titles_str = ", ".join(titles) 23 | 24 | messages = [ 25 | {"role": "system", "content": "You are a helpful assistant."}, 26 | {"role": "user", "content": f"""These are the titles: {titles_str}. What is the overarching topic of these titles? 27 | - Be as specific as possible. 28 | - Only use a few words. 29 | - You are writing for a highly technical audience. 30 | - Only respond with the topic and nothing else."""}, 31 | ] 32 | 33 | while True: 34 | try: 35 | # Use the chat models 36 | response = openai.ChatCompletion.create( 37 | model="gpt-3.5-turbo", 38 | messages=messages, 39 | timeout=5, 40 | ) 41 | 42 | category = response['choices'][0]['message']['content'] 43 | print(f"Category: {category}") 44 | return category 45 | except openai.error.OpenAIError: 46 | print("An error occurred. Retrying...") 47 | time.sleep(2) 48 | 49 | 50 | def embedding_metadata(embeddings_2d, titles, grid_size=20): 51 | df = pd.DataFrame(embeddings_2d, columns=["x", "y"]) 52 | df["title"] = titles 53 | 54 | x_min, x_max = (df["x"].min() // grid_size) * grid_size, (df["x"].max() // grid_size + 1) * grid_size 55 | y_min, y_max = (df["y"].min() // grid_size) * grid_size, (df["y"].max() // grid_size + 1) * grid_size 56 | 57 | x_bins = np.arange(x_min, x_max, grid_size) 58 | y_bins = np.arange(y_min, y_max, grid_size) 59 | 60 | df["x_bin"] = np.digitize(df["x"], x_bins) 61 | df["y_bin"] = np.digitize(df["y"], y_bins) 62 | 63 | groups = df.groupby(["x_bin", "y_bin"])["title"].apply(list) 64 | 65 | ret = [] 66 | for (x_bin, y_bin), titles in groups.items(): 67 | x_bounds = (x_bins[x_bin-1], x_bins[x_bin-1] + grid_size) 68 | y_bounds = (y_bins[y_bin-1], y_bins[y_bin-1] + grid_size) 69 | category = generate_category(titles) 70 | 71 | ret.append({ 72 | "x_bounds": x_bounds, 73 | "y_bounds": y_bounds, 74 | "titles": titles, 75 | "category": category 76 | }) 77 | 78 | return ret 79 | 80 | def count_elements(embeddings_2d): 81 | x_values, y_values = embeddings_2d[:,0], embeddings_2d[:,1] 82 | 83 | x_min, x_max = np.min(x_values), np.max(x_values) 84 | y_min, y_max = np.min(y_values), np.max(y_values) 85 | 86 | x_mid = (x_max + x_min) / 2 87 | y_mid = (y_max + y_min) / 2 88 | 89 | quadrants = { 90 | "Q1": {"x": (x_mid, x_max), "y": (y_mid, y_max)}, 91 | "Q2": {"x": (x_min, x_mid), "y": (y_mid, y_max)}, 92 | "Q3": {"x": (x_min, x_mid), "y": (y_min, y_mid)}, 93 | "Q4": {"x": (x_mid, x_max), "y": (y_min, y_mid)} 94 | } 95 | 96 | counts = {q: 0 for q in quadrants.keys()} 97 | 98 | for x, y in embeddings_2d: 99 | for q, bounds in quadrants.items(): 100 | if bounds["x"][0] <= x <= bounds["x"][1] and bounds["y"][0] <= y <= bounds["y"][1]: 101 | counts[q] += 1 102 | 103 | return counts 104 | 105 | def filter_posts(posts): 106 | filtered_posts = [post for post in posts if post['status'] == 'published' and post['plaintext'] is not None and post['title'] is not None] 107 | return filtered_posts 108 | 109 | def extract_titles_texts(filtered_posts): 110 | titles = [post['title'] for post in filtered_posts] 111 | texts = [post['plaintext'] for post in filtered_posts] 112 | return titles, texts 113 | 114 | def get_data(): 115 | data = read_json_data('data/blog.json') 116 | posts = data['db'][0]['data']['posts'] 117 | filtered_posts = filter_posts(posts) 118 | titles, texts = extract_titles_texts(filtered_posts) 119 | return titles, texts 120 | 121 | def load_model_and_tokenizer(model_name="bert-base-uncased"): 122 | tokenizer = AutoTokenizer.from_pretrained(model_name) 123 | model = AutoModel.from_pretrained(model_name) 124 | return tokenizer, model 125 | 126 | def get_embeddings(tokenizer, model, texts): 127 | inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True) 128 | with torch.no_grad(): 129 | outputs = model(**inputs) 130 | embeddings = outputs.last_hidden_state.mean(dim=1).numpy() 131 | return embeddings 132 | 133 | def reduce_dimensions(embeddings, n_components=2): 134 | tsne = TSNE(n_components=n_components, random_state=0) 135 | embeddings_2d = tsne.fit_transform(embeddings) 136 | return embeddings_2d 137 | 138 | def write_embeddings_to_json(embeddings_2d, titles, json_file_path): 139 | data = [] 140 | for i, title in enumerate(titles): 141 | data.append({ 142 | 'title': title, 143 | 'x': embeddings_2d[i, 0].tolist(), 144 | 'y': embeddings_2d[i, 1].tolist() 145 | }) 146 | 147 | with open(json_file_path, 'w') as json_file: 148 | json.dump(data, json_file) -------------------------------------------------------------------------------- /output/blog_embedding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/r2d4/blog-embeddings/33fb089fc6686cc0291d39a8028328eb8fc710be/output/blog_embedding.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "2yearblog" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Matt Rickard "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = ">=3.8,<3.12" 10 | transformers = "^4.29.2" 11 | torch = "^2.0.1" 12 | matplotlib = "^3.7.1" 13 | scikit-learn = "^1.2.2" 14 | mplcursors = "^0.5.2" 15 | plotly = "^5.14.1" 16 | pandas = "^2.0.1" 17 | nbformat = "^5.8.0" 18 | openai = "^0.27.7" 19 | 20 | 21 | [tool.poetry.group.dev.dependencies] 22 | ipykernel = "^6.23.1" 23 | 24 | [build-system] 25 | requires = ["poetry-core"] 26 | build-backend = "poetry.core.masonry.api" 27 | --------------------------------------------------------------------------------