├── requirements.txt ├── demo.py ├── LICENSE ├── crawl.py ├── README.md └── search.py /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | numpy 3 | scikit-learn 4 | sentence-transformers 5 | openai 6 | 7 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | from search import PaperSearcher 2 | 3 | # Use local model (free) 4 | searcher = PaperSearcher('iclr2026_papers.json', model_type='local') 5 | 6 | # Or use OpenAI (better quality) 7 | # searcher = PaperSearcher('iclr2026_papers.json', model_type='openai') 8 | 9 | searcher.compute_embeddings() 10 | 11 | examples = [ 12 | { 13 | "title": "Improving Developer Emotion Classification via LLM-Based Augmentation", 14 | "abstract": "Detecting developer emotion in the informative data stream of technical commit messages..." 15 | }, 16 | ] 17 | 18 | results = searcher.search(examples=examples, top_k=100) 19 | 20 | searcher.display(results, n=10) 21 | searcher.save(results, 'results.json') 22 | 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 gyj155 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /crawl.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import time 4 | 5 | def fetch_submissions(venue_id, offset=0, limit=1000): 6 | url = "https://api2.openreview.net/notes" 7 | params = { 8 | "content.venueid": venue_id, 9 | "details": "replyCount,invitation", 10 | "limit": limit, 11 | "offset": offset, 12 | "sort": "number:desc" 13 | } 14 | headers = {"User-Agent": "Mozilla/5.0"} 15 | response = requests.get(url, params=params, headers=headers) 16 | response.raise_for_status() 17 | return response.json() 18 | 19 | def crawl_papers(venue_id, output_file): 20 | all_papers = [] 21 | offset = 0 22 | limit = 1000 23 | 24 | print(f"Fetching papers from {venue_id}...") 25 | 26 | while True: 27 | data = fetch_submissions(venue_id, offset, limit) 28 | notes = data.get("notes", []) 29 | 30 | if not notes: 31 | break 32 | 33 | for note in notes: 34 | paper = { 35 | "id": note.get("id"), 36 | "number": note.get("number"), 37 | "title": note.get("content", {}).get("title", {}).get("value", ""), 38 | "authors": note.get("content", {}).get("authors", {}).get("value", []), 39 | "abstract": note.get("content", {}).get("abstract", {}).get("value", ""), 40 | "keywords": note.get("content", {}).get("keywords", {}).get("value", []), 41 | "primary_area": note.get("content", {}).get("primary_area", {}).get("value", ""), 42 | "forum_url": f"https://openreview.net/forum?id={note.get('id')}" 43 | } 44 | all_papers.append(paper) 45 | 46 | print(f"Fetched {len(notes)} papers (total: {len(all_papers)})") 47 | 48 | if len(notes) < limit: 49 | break 50 | 51 | offset += limit 52 | time.sleep(0.5) 53 | 54 | with open(output_file, "w", encoding="utf-8") as f: 55 | json.dump(all_papers, f, ensure_ascii=False, indent=2) 56 | 57 | print(f"\nTotal: {len(all_papers)} papers") 58 | print(f"Saved to {output_file}") 59 | return all_papers 60 | 61 | if __name__ == "__main__": 62 | crawl_papers( 63 | venue_id="ICLR.cc/2026/Conference/Submission", 64 | output_file="iclr2026_papers.json" 65 | ) 66 | 67 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Paper Semantic Search 2 | 3 | Find similar papers using semantic search. Supports both local models (free) and the OpenAI API (better quality). 4 | 5 | ## Features 6 | 7 | - Crawl papers from OpenReview (e.g., ICLR2026 submissions) 8 | - Semantic search with example papers or text queries 9 | - Automatic embedding caching 10 | - Embedding model support: open-source (e.g., all-MiniLM-L6-v2) or OpenAI 11 | 12 | ## Quick Start 13 | 14 | ```bash 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | ### 1. Prepare Papers 19 | 20 | ```python 21 | from crawl import crawl_papers 22 | 23 | crawl_papers( 24 | venue_id="ICLR.cc/2026/Conference/Submission", 25 | output_file="iclr2026_papers.json" 26 | ) 27 | ``` 28 |
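Each record in the output JSON has the fields written by `crawl.py` (`id`, `number`, `title`, `authors`, `abstract`, `keywords`, `primary_area`, `forum_url`). A minimal sketch for sanity-checking the crawl output, assuming the file name from the example above:

```python
import json

# Load the crawled papers and peek at the first record
with open("iclr2026_papers.json", encoding="utf-8") as f:
    papers = json.load(f)

print(f"{len(papers)} papers")
print(papers[0]["title"])
print(papers[0]["forum_url"])
```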
29 | ### 2. Search Papers 30 | 31 | ```python 32 | from search import PaperSearcher 33 | 34 | # Local model (free) 35 | searcher = PaperSearcher('iclr2026_papers.json', model_type='local') 36 | 37 | # OpenAI model (better, requires API key) 38 | # export OPENAI_API_KEY='your-key' 39 | # searcher = PaperSearcher('iclr2026_papers.json', model_type='openai') 40 | 41 | searcher.compute_embeddings() 42 | 43 | # Search with example papers that you are interested in 44 | examples = [ 45 | { 46 | "title": "Your paper title", 47 | "abstract": "Your paper abstract..." 48 | } 49 | ] 50 | 51 | results = searcher.search(examples=examples, top_k=100) 52 | 53 | # Or search with a text query 54 | results = searcher.search(query="interesting topics", top_k=100) 55 | 56 | searcher.display(results, n=10) 57 | searcher.save(results, 'results.json') 58 | ``` 59 | 60 | 61 | 62 | ## How It Works 63 | 64 | 1. Paper titles and abstracts are converted to embeddings 2. Embeddings are cached automatically 3. Your query is embedded using the same model 4. Cosine similarity finds the most similar papers 5. Results are ranked by similarity score 69 | 70 | ## Cache 71 | 72 | Embeddings are cached as `cache_<name>_<hash>_<model>.npy` next to the papers file. Delete the cache file to recompute. 73 | 74 | ## Example Output 75 | 76 | ``` 77 | ================================================================================ 78 | Top 100 Results (showing 10) 79 | ================================================================================ 80 | 81 | 1. [0.8456] Paper a 82 | #12345 | foundation or frontier models, including LLMs 83 | https://openreview.net/forum?id=xxx 84 | 85 | 2. [0.8234] Paper b 86 | #12346 | applications to robotics, autonomy, planning 87 | https://openreview.net/forum?id=yyy 88 | ``` 89 | 90 | ## Tips 91 | 92 | - For best results, use 1-5 example papers or a short paragraph describing the topic you are interested in 93 | - The local model is good enough for most cases 94 | - Use the OpenAI model for critical searches (~$1 for 18k queries) 95 | 96 | If it's useful, please consider giving it a star~ -------------------------------------------------------------------------------- /search.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | import hashlib 5 | from pathlib import Path 6 | from sklearn.metrics.pairwise import cosine_similarity 7 | 8 | class PaperSearcher: 9 | def __init__(self, papers_file, model_type="openai", api_key=None, base_url=None): 10 | with open(papers_file, 'r', encoding='utf-8') as f: 11 | self.papers = json.load(f) 12 | 13 | self.model_type = model_type 14 | self.cache_file = self._get_cache_file(papers_file, model_type) 15 | self.embeddings = None 16 | 17 | if model_type == "openai": 18 | from openai import OpenAI 19 | self.client = OpenAI( 20 | api_key=api_key or os.getenv('OPENAI_API_KEY'), 21 | base_url=base_url 22 | ) 23 | self.model_name = "text-embedding-3-large" 24 | else: 25 | from sentence_transformers import SentenceTransformer 26 | self.model = SentenceTransformer('all-MiniLM-L6-v2') 27 | self.model_name = "all-MiniLM-L6-v2" 28 | 29 | self._load_cache() 30 | 31 | def _get_cache_file(self, papers_file, model_type): 32 | base_name = Path(papers_file).stem 33 | file_hash = hashlib.md5(papers_file.encode()).hexdigest()[:8] 34 | cache_name = f"cache_{base_name}_{file_hash}_{model_type}.npy" 35 | return str(Path(papers_file).parent / cache_name) 36 | 37 | def _load_cache(self): 38 | if os.path.exists(self.cache_file): 39 | try: 40 | self.embeddings =
np.load(self.cache_file) 41 | if len(self.embeddings) == len(self.papers): 42 | print(f"Loaded cache: {self.embeddings.shape}") 43 | return True 44 | self.embeddings = None 45 | except: 46 | self.embeddings = None 47 | return False 48 | 49 | def _save_cache(self): 50 | np.save(self.cache_file, self.embeddings) 51 | print(f"Saved cache: {self.cache_file}") 52 | 53 | def _create_text(self, paper): 54 | parts = [] 55 | if paper.get('title'): 56 | parts.append(f"Title: {paper['title']}") 57 | if paper.get('abstract'): 58 | parts.append(f"Abstract: {paper['abstract']}") 59 | if paper.get('keywords'): 60 | kw = ', '.join(paper['keywords']) if isinstance(paper['keywords'], list) else paper['keywords'] 61 | parts.append(f"Keywords: {kw}") 62 | return ' '.join(parts) 63 | 64 | def _embed_openai(self, texts): 65 | if isinstance(texts, str): 66 | texts = [texts] 67 | 68 | embeddings = [] 69 | batch_size = 100 70 | 71 | for i in range(0, len(texts), batch_size): 72 | batch = texts[i:i + batch_size] 73 | response = self.client.embeddings.create(input=batch, model=self.model_name) 74 | embeddings.extend([item.embedding for item in response.data]) 75 | 76 | return np.array(embeddings) 77 | 78 | def _embed_local(self, texts): 79 | if isinstance(texts, str): 80 | texts = [texts] 81 | return self.model.encode(texts, show_progress_bar=len(texts) > 100) 82 | 83 | def compute_embeddings(self, force=False): 84 | if self.embeddings is not None and not force: 85 | print("Using cached embeddings") 86 | return self.embeddings 87 | 88 | print(f"Computing embeddings ({self.model_name})...") 89 | texts = [self._create_text(p) for p in self.papers] 90 | 91 | if self.model_type == "openai": 92 | self.embeddings = self._embed_openai(texts) 93 | else: 94 | self.embeddings = self._embed_local(texts) 95 | 96 | print(f"Computed: {self.embeddings.shape}") 97 | self._save_cache() 98 | return self.embeddings 99 | 100 | def search(self, examples=None, query=None, top_k=100): 101 | if self.embeddings is None: 102 | self.compute_embeddings() 103 | 104 | if examples: 105 | texts = [] 106 | for ex in examples: 107 | text = f"Title: {ex['title']}" 108 | if ex.get('abstract'): 109 | text += f" Abstract: {ex['abstract']}" 110 | texts.append(text) 111 | 112 | if self.model_type == "openai": 113 | embs = self._embed_openai(texts) 114 | else: 115 | embs = self._embed_local(texts) 116 | 117 | query_emb = np.mean(embs, axis=0).reshape(1, -1) 118 | 119 | elif query: 120 | if self.model_type == "openai": 121 | query_emb = self._embed_openai(query).reshape(1, -1) 122 | else: 123 | query_emb = self._embed_local(query).reshape(1, -1) 124 | else: 125 | raise ValueError("Provide either examples or query") 126 | 127 | similarities = cosine_similarity(query_emb, self.embeddings)[0] 128 | top_indices = np.argsort(similarities)[::-1][:top_k] 129 | 130 | return [{ 131 | 'paper': self.papers[idx], 132 | 'similarity': float(similarities[idx]) 133 | } for idx in top_indices] 134 | 135 | def display(self, results, n=10): 136 | print(f"\n{'='*80}") 137 | print(f"Top {len(results)} Results (showing {min(n, len(results))})") 138 | print(f"{'='*80}\n") 139 | 140 | for i, result in enumerate(results[:n], 1): 141 | paper = result['paper'] 142 | sim = result['similarity'] 143 | 144 | print(f"{i}. 
[{sim:.4f}] {paper['title']}") 145 | print(f" #{paper.get('number', 'N/A')} | {paper.get('primary_area', 'N/A')}") 146 | print(f" {paper['forum_url']}\n") 147 | 148 | def save(self, results, output_file): 149 | with open(output_file, 'w', encoding='utf-8') as f: 150 | json.dump({ 151 | 'model': self.model_name, 152 | 'total': len(results), 153 | 'results': results 154 | }, f, ensure_ascii=False, indent=2) 155 | print(f"Saved to {output_file}") 156 | 157 | --------------------------------------------------------------------------------