├── requirements.txt ├── README.md └── embedding.py /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.20.0 2 | pandas>=1.3.0 3 | matplotlib>=3.4.0 4 | seaborn>=0.11.0 5 | scikit-learn>=1.0.0 6 | scipy>=1.7.0 7 | flask>=2.0.0 8 | google-generativeai>=0.3.0 9 | anthropic>=0.8.0 10 | requests>=2.27.0 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Embedding Analysis Tool for SEO 3 | 4 | Read blog post here: https://metehan.ai/blog/embedding-seo-tool-analysis/ 5 | 6 | ## What This Tool Does 7 | 8 | This tool provides in-depth analysis of content embeddings to help with SEO optimization. It: 9 | 10 | - Generates 3072-dimension embeddings of your content using Google's Gemini API 11 | 12 | - Creates visualizations of the embedding data including: 13 | 14 | - Overview of all embedding dimensions 15 | 16 | - Top dimensions by magnitude 17 | 18 | - Activation distribution histogram 19 | 20 | - Dimension clusters heatmap 21 | 22 | - PCA visualization of dimension segments 23 | 24 | - Calculates key metrics like mean values, standard deviation, significant dimensions, etc. 25 | 26 | - Identifies dimension clusters that may represent semantic features 27 | 28 | - Uses Claude 3.7 Sonnet to analyze the embedding patterns and provide: 29 | 30 | - Content quality assessment 31 | 32 | - Identification of semantic structures 33 | 34 | - SEO optimization recommendations 35 | 36 | - Analysis of topical strengths and weaknesses 37 | 38 | ## How to Use This Tool 39 | 40 | ### Setup: 41 | 42 | - Install the requirements: 43 | 44 | pip install -r requirements.txt 45 | 46 | 47 | 2. Replace the API keys in the code: 48 | 49 | - GOOGLE_API_KEY - Your Google API key with access to Gemini models 50 | 51 | - ANTHROPIC_API_KEY - Your Anthropic API key with access to Claude 3.7 Sonnet 52 | 53 | - Run the application: 54 | 55 | python embedding.py 56 | 57 | 58 | - Open your browser and navigate to: 59 | 60 | http://127.0.0.1:5000 61 | 62 | 63 | ### Using the Tool: 64 | 65 | - Paste your content (article, blog post, web page) into the text area 66 | 67 | - Click "Analyze Content" 68 | 69 | - Wait for the analysis to complete (this can take 30-60 seconds) 70 | 71 | - Review the visualizations and analysis: 72 | 73 | - The embedding overview shows activation patterns across all dimensions 74 | 75 | - The top dimensions chart shows which dimensions are most important 76 | 77 | - The dimension clusters visualization helps identify related features 78 | 79 | - The metrics section shows key statistical indicators 80 | 81 | - The Claude analysis provides actionable SEO recommendations 82 | 83 | This tool helps you understand how AI "sees" your content, which semantic features are prominent, and how to optimize for better search engine performance based on embedding patterns. 84 | -------------------------------------------------------------------------------- /embedding.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib 6 | matplotlib.use('Agg') # Use non-interactive backend 7 | import matplotlib.pyplot as plt 8 | from matplotlib.colors import LinearSegmentedColormap 9 | import seaborn as sns 10 | from sklearn.decomposition import PCA 11 | from sklearn.manifold import TSNE 12 | from sklearn.cluster import KMeans 13 | import scipy.stats as stats 14 | import base64 15 | from io import BytesIO 16 | import requests 17 | from flask import Flask, request, render_template_string, jsonify 18 | import google.generativeai as genai 19 | from anthropic import Anthropic 20 | 21 | # Configure APIs 22 | GOOGLE_API_KEY = "xxx" # Replace with your actual API key 23 | ANTHROPIC_API_KEY = "xxx" # Replace with your actual API key 24 | 25 | # Initialize Gemini client 26 | genai.configure(api_key=GOOGLE_API_KEY) 27 | anthropic_client = Anthropic(api_key=ANTHROPIC_API_KEY) 28 | 29 | app = Flask(__name__) 30 | 31 | # Custom JSON encoder to handle NumPy types 32 | class NumpyEncoder(json.JSONEncoder): 33 | def default(self, obj): 34 | if isinstance(obj, np.integer): 35 | return int(obj) 36 | if isinstance(obj, np.floating): 37 | return float(obj) 38 | if isinstance(obj, np.ndarray): 39 | return obj.tolist() 40 | return super(NumpyEncoder, self).default(obj) 41 | 42 | # Configure Flask to use the custom encoder 43 | app.json_encoder = NumpyEncoder 44 | 45 | def get_embedding(text): 46 | """Get embedding from Google Gemini API""" 47 | try: 48 | # FIXED: Use the correct method to get embeddings from Gemini 49 | # Use gemini-embedding-exp-03-07 model as specified 50 | response = genai.embed_content( 51 | model="models/gemini-embedding-exp-03-07", 52 | content=text, 53 | ) 54 | embedding = response["embedding"] 55 | return embedding 56 | except Exception as e: 57 | print(f"Error getting embedding: {e}") 58 | # Return a random embedding for testing if API fails 59 | print("Using random embedding instead") 60 | return np.random.normal(0, 0.1, 3072).tolist() 61 | 62 | def analyze_with_claude(embedding_data, content_snippet): 63 | """Get analysis from Claude 3.7 Sonnet""" 64 | try: 65 | message = anthropic_client.messages.create( 66 | model="claude-3-7-sonnet-latest", 67 | max_tokens=8000, 68 | temperature=1, 69 | thinking={ 70 | "type": "enabled", 71 | "budget_tokens": 4000 72 | }, 73 | system="You are an expert in SEO and NLP embedding analysis. Analyze the provided embedding data to extract insights about content quality, semantic structure, and SEO optimization opportunities. Focus on activation patterns, dimension clusters, and quality indicators. Provide actionable recommendations.", 74 | messages=[ 75 | { 76 | "role": "user", 77 | "content": f"""Analyze this 3k-dimension embedding data from a content piece. Focus on quality indicators, semantic structure, and SEO implications. 78 | 79 | CONTENT SNIPPET (first 4500 chars): 80 | {content_snippet[:18500]}... 81 | 82 | EMBEDDING DATA STATISTICS: 83 | - Dimension count: {len(embedding_data)} 84 | - Mean value: {np.mean(embedding_data):.6f} 85 | - Standard deviation: {np.std(embedding_data):.6f} 86 | - Min value: {np.min(embedding_data):.6f} at dimension {np.argmin(embedding_data)} 87 | - Max value: {np.max(embedding_data):.6f} at dimension {np.argmax(embedding_data)} 88 | - Top 5 dimensions by magnitude: {sorted(range(len(embedding_data)), key=lambda i: abs(embedding_data[i]), reverse=True)[:5]} 89 | 90 | Provide a concise analysis focusing on: 91 | 1. Content quality assessment based on embedding patterns 92 | 2. Key dimension clusters and their likely semantic functions 93 | 3. SEO optimization recommendations based on the embedding structure 94 | 4. Potential topical strengths and weaknesses""" 95 | } 96 | ] 97 | ) 98 | 99 | # Extract text content from Claude's response 100 | # The content might be a list of content blocks or a single text block 101 | if hasattr(message.content, '__iter__') and not isinstance(message.content, str): 102 | # If content is an iterable (like a list of content blocks) 103 | extracted_text = "" 104 | for block in message.content: 105 | if hasattr(block, 'text'): 106 | extracted_text += block.text 107 | elif isinstance(block, str): 108 | extracted_text += block 109 | return extracted_text 110 | elif hasattr(message.content, 'text'): 111 | # If content is a single TextBlock object 112 | return message.content.text 113 | else: 114 | # If content is already a string or something else 115 | return str(message.content) 116 | 117 | except Exception as e: 118 | print(f"Error getting Claude analysis: {e}") 119 | return "Error getting analysis from Claude. Please check your API key and try again." 120 | 121 | def plot_embedding_overview(embedding): 122 | """Create overview plot of embedding values""" 123 | plt.figure(figsize=(12, 6)) 124 | plt.plot(range(len(embedding)), embedding) 125 | plt.axhline(y=0, color='r', linestyle='-', alpha=0.3) 126 | plt.title('Embedding Values Across All 3k Dimensions') 127 | plt.xlabel('Dimension') 128 | plt.ylabel('Value') 129 | plt.grid(True, alpha=0.3) 130 | 131 | # Save plot to base64 string 132 | buf = BytesIO() 133 | plt.savefig(buf, format='png', dpi=100) 134 | plt.close() 135 | buf.seek(0) 136 | return base64.b64encode(buf.read()).decode('utf-8') 137 | 138 | def plot_top_dimensions(embedding): 139 | """Plot top dimensions by magnitude""" 140 | # Get indices of top 20 dimensions by magnitude 141 | top_indices = sorted(range(len(embedding)), key=lambda i: abs(embedding[i]), reverse=True)[:20] 142 | top_values = [embedding[i] for i in top_indices] 143 | 144 | plt.figure(figsize=(12, 6)) 145 | colors = ['blue' if v >= 0 else 'red' for v in top_values] 146 | plt.bar(range(len(top_indices)), top_values, color=colors) 147 | plt.xticks(range(len(top_indices)), top_indices, rotation=45) 148 | plt.title('Top 20 Dimensions by Magnitude') 149 | plt.xlabel('Dimension Index') 150 | plt.ylabel('Value') 151 | plt.grid(True, alpha=0.3) 152 | 153 | # Save plot to base64 string 154 | buf = BytesIO() 155 | plt.savefig(buf, format='png', dpi=100) 156 | plt.close() 157 | buf.seek(0) 158 | return base64.b64encode(buf.read()).decode('utf-8') 159 | 160 | def plot_dimension_clusters(embedding): 161 | """Plot dimension clusters heatmap""" 162 | # Reshape embedding to highlight patterns 163 | embedding_reshaped = np.array(embedding).reshape(64, 48) 164 | 165 | plt.figure(figsize=(12, 8)) 166 | # Create a custom colormap from blue to white to red 167 | cmap = LinearSegmentedColormap.from_list('BrBG', ['blue', 'white', 'red'], N=256) 168 | plt.imshow(embedding_reshaped, cmap=cmap, aspect='auto') 169 | plt.colorbar(label='Activation Value') 170 | plt.title('Embedding Clusters Heatmap (Reshaped to 64x48)') 171 | plt.xlabel('Dimension Group') 172 | plt.ylabel('Dimension Group') 173 | 174 | # Save plot to base64 string 175 | buf = BytesIO() 176 | plt.savefig(buf, format='png', dpi=100) 177 | plt.close() 178 | buf.seek(0) 179 | return base64.b64encode(buf.read()).decode('utf-8') 180 | 181 | def plot_pca(embedding): 182 | """Plot PCA visualization of embedding dimensions""" 183 | # Create a 2D array where each row is a segment of the original embedding 184 | segment_size = 256 185 | num_segments = len(embedding) // segment_size 186 | data_matrix = np.zeros((num_segments, segment_size)) 187 | 188 | # Fill the matrix with segments 189 | for i in range(num_segments): 190 | start = i * segment_size 191 | end = start + segment_size 192 | data_matrix[i] = embedding[start:end] 193 | 194 | # Apply PCA 195 | if num_segments > 1: 196 | pca = PCA(n_components=2) 197 | pca_results = pca.fit_transform(data_matrix) 198 | 199 | plt.figure(figsize=(10, 8)) 200 | plt.scatter(pca_results[:, 0], pca_results[:, 1]) 201 | 202 | # Label each point with its segment range 203 | for i in range(num_segments): 204 | start = i * segment_size 205 | end = start + segment_size - 1 206 | plt.annotate(f"{start}-{end}", 207 | (pca_results[i, 0], pca_results[i, 1]), 208 | fontsize=8) 209 | 210 | plt.title('PCA of Embedding Segments') 211 | plt.xlabel('Principal Component 1') 212 | plt.ylabel('Principal Component 2') 213 | plt.grid(True, alpha=0.3) 214 | else: 215 | # If we don't have enough segments, create a simpler visualization 216 | plt.figure(figsize=(10, 8)) 217 | plt.text(0.5, 0.5, "Not enough segments for PCA visualization", 218 | ha='center', va='center', fontsize=12) 219 | plt.axis('off') 220 | 221 | # Save plot to base64 string 222 | buf = BytesIO() 223 | plt.savefig(buf, format='png', dpi=100) 224 | plt.close() 225 | buf.seek(0) 226 | return base64.b64encode(buf.read()).decode('utf-8') 227 | 228 | def plot_activation_histogram(embedding): 229 | """Plot histogram of embedding activation values""" 230 | plt.figure(figsize=(10, 6)) 231 | plt.hist(embedding, bins=50, alpha=0.7, color='skyblue', edgecolor='black') 232 | plt.axvline(x=0, color='r', linestyle='--', alpha=0.7) 233 | plt.title('Distribution of Embedding Values') 234 | plt.xlabel('Value') 235 | plt.ylabel('Frequency') 236 | plt.grid(True, alpha=0.3) 237 | 238 | # Save plot to base64 string 239 | buf = BytesIO() 240 | plt.savefig(buf, format='png', dpi=100) 241 | plt.close() 242 | buf.seek(0) 243 | return base64.b64encode(buf.read()).decode('utf-8') 244 | 245 | def analyze_embedding(embedding): 246 | """Analyze embedding for key metrics""" 247 | embedding = np.array(embedding) # Convert to numpy array for easier processing 248 | abs_embedding = np.abs(embedding) 249 | 250 | # Calculate key metrics - CONVERT NUMPY TYPES TO PYTHON NATIVE TYPES 251 | metrics = { 252 | "dimension_count": int(len(embedding)), 253 | "mean_value": float(np.mean(embedding)), 254 | "std_dev": float(np.std(embedding)), 255 | "min_value": float(np.min(embedding)), 256 | "min_dimension": int(np.argmin(embedding)), 257 | "max_value": float(np.max(embedding)), 258 | "max_dimension": int(np.argmax(embedding)), 259 | "median_value": float(np.median(embedding)), 260 | "positive_count": int(np.sum(embedding > 0)), 261 | "negative_count": int(np.sum(embedding < 0)), 262 | "zero_count": int(np.sum(embedding == 0)), 263 | "abs_mean": float(np.mean(abs_embedding)), 264 | "significant_dims": int(np.sum(abs_embedding > 0.1)) 265 | } 266 | 267 | # Find activation clusters 268 | significant_threshold = 0.1 269 | significant_dims = np.where(abs_embedding > significant_threshold)[0] 270 | 271 | # Find clusters (dimensions that are close to each other) 272 | clusters = [] 273 | if len(significant_dims) > 0: 274 | current_cluster = [int(significant_dims[0])] # Convert to int 275 | 276 | for i in range(1, len(significant_dims)): 277 | if significant_dims[i] - significant_dims[i-1] <= 5: # If dimensions are close 278 | current_cluster.append(int(significant_dims[i])) # Convert to int 279 | else: 280 | if len(current_cluster) > 0: 281 | clusters.append(current_cluster) 282 | current_cluster = [int(significant_dims[i])] # Convert to int 283 | 284 | if len(current_cluster) > 0: 285 | clusters.append(current_cluster) 286 | 287 | # Filter to meaningful clusters (more than 1 dimension) 288 | clusters = [c for c in clusters if len(c) > 1] 289 | 290 | # Format clusters for display 291 | cluster_info = [] 292 | for i, cluster in enumerate(clusters): 293 | values = [float(embedding[dim]) for dim in cluster] # Convert to float 294 | cluster_info.append({ 295 | "id": i+1, 296 | "dimensions": [int(d) for d in cluster], # Convert to int 297 | "start_dim": int(min(cluster)), 298 | "end_dim": int(max(cluster)), 299 | "size": int(len(cluster)), 300 | "avg_value": float(np.mean(values)), 301 | "max_value": float(np.max(values)), 302 | "max_dim": int(cluster[np.argmax(values)]) 303 | }) 304 | 305 | # Top dimensions by magnitude 306 | top_indices = sorted(range(len(embedding)), key=lambda i: abs(embedding[i]), reverse=True)[:10] 307 | top_dimensions = [{"dimension": int(idx), "value": float(embedding[idx])} for idx in top_indices] 308 | 309 | return { 310 | "metrics": metrics, 311 | "clusters": cluster_info, 312 | "top_dimensions": top_dimensions 313 | } 314 | 315 | # HTML template (single page application) 316 | HTML_TEMPLATE = """ 317 | 318 | 319 | 320 | 321 | Embedding Analysis Tool for SEO by metehan.ai 322 | 323 | 324 | 339 | 340 | 341 |
342 |

Embedding Analysis Tool for SEO by metehan.ai

343 | 344 |
345 |

Content Input

346 |
347 |
348 | 349 | 352 |
353 |
354 | 357 |
358 |
359 |
360 | 361 | 365 | 366 | 417 |
418 | 419 | 530 | 531 | 532 | """ 533 | 534 | @app.route('/') 535 | def index(): 536 | return render_template_string(HTML_TEMPLATE) 537 | 538 | @app.route('/analyze', methods=['POST']) 539 | def analyze(): 540 | data = request.json 541 | content = data.get('content', '') 542 | 543 | # Get embedding from Gemini API 544 | embedding = get_embedding(content) 545 | 546 | # Generate charts 547 | overview_chart = plot_embedding_overview(embedding) 548 | top_dimensions_chart = plot_top_dimensions(embedding) 549 | clusters_chart = plot_dimension_clusters(embedding) 550 | pca_chart = plot_pca(embedding) 551 | histogram_chart = plot_activation_histogram(embedding) 552 | 553 | # Analyze embedding 554 | analysis = analyze_embedding(embedding) 555 | 556 | # Get Claude analysis 557 | claude_analysis = analyze_with_claude(embedding, content) 558 | 559 | # Return all data 560 | return jsonify({ 561 | 'overview_chart': overview_chart, 562 | 'top_dimensions_chart': top_dimensions_chart, 563 | 'clusters_chart': clusters_chart, 564 | 'pca_chart': pca_chart, 565 | 'histogram_chart': histogram_chart, 566 | 'analysis': analysis, 567 | 'claude_analysis': claude_analysis 568 | }) 569 | 570 | if __name__ == '__main__': 571 | print("Starting Embedding Analysis Tool...") 572 | print("If you didn't, Please replace YOUR_GOOGLE_API_KEY and YOUR_ANTHROPIC_API_KEY with your actual API keys.") 573 | print("Visit http://127.0.0.1:5000 in your browser to use the tool.") 574 | app.run(debug=True) 575 | --------------------------------------------------------------------------------