├── requirements.txt ├── demo.py ├── LICENSE ├── crawl.py ├── README.md └── search.py /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | numpy 3 | scikit-learn 4 | sentence-transformers 5 | openai 6 | 7 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | from search import PaperSearcher 2 | 3 | # Use local model (free) 4 | searcher = PaperSearcher('iclr2026_papers.json', model_type='local') 5 | 6 | # Or use OpenAI (better quality) 7 | # searcher = PaperSearcher('iclr2026_papers.json', model_type='openai') 8 | 9 | searcher.compute_embeddings() 10 | 11 | examples = [ 12 | { 13 | "title": "Improving Developer Emotion Classification via LLM-Based Augmentation", 14 | "abstract": "Detecting developer emotion in the informative data stream of technical commit messages..." 15 | }, 16 | ] 17 | 18 | results = searcher.search(examples=examples, top_k=100) 19 | 20 | searcher.display(results, n=10) 21 | searcher.save(results, 'results.json') 22 | 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 gyj155 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /crawl.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import time 4 | 5 | def fetch_submissions(venue_id, offset=0, limit=1000): 6 | url = "https://api2.openreview.net/notes" 7 | params = { 8 | "content.venueid": venue_id, 9 | "details": "replyCount,invitation", 10 | "limit": limit, 11 | "offset": offset, 12 | "sort": "number:desc" 13 | } 14 | headers = {"User-Agent": "Mozilla/5.0"} 15 | response = requests.get(url, params=params, headers=headers) 16 | response.raise_for_status() 17 | return response.json() 18 | 19 | def crawl_papers(venue_id, output_file): 20 | all_papers = [] 21 | offset = 0 22 | limit = 1000 23 | 24 | print(f"Fetching papers from {venue_id}...") 25 | 26 | while True: 27 | data = fetch_submissions(venue_id, offset, limit) 28 | notes = data.get("notes", []) 29 | 30 | if not notes: 31 | break 32 | 33 | for note in notes: 34 | paper = { 35 | "id": note.get("id"), 36 | "number": note.get("number"), 37 | "title": note.get("content", {}).get("title", {}).get("value", ""), 38 | "authors": note.get("content", {}).get("authors", {}).get("value", []), 39 | "abstract": note.get("content", {}).get("abstract", {}).get("value", ""), 40 | "keywords": note.get("content", {}).get("keywords", {}).get("value", []), 41 | "primary_area": note.get("content", {}).get("primary_area", {}).get("value", ""), 42 | "forum_url": f"https://openreview.net/forum?id={note.get('id')}" 43 | } 44 | all_papers.append(paper) 45 | 46 | print(f"Fetched {len(notes)} papers (total: {len(all_papers)})") 47 | 48 | if len(notes) < limit: 49 | break 50 | 51 | offset += limit 52 | time.sleep(0.5) 53 | 54 | with open(output_file, "w", encoding="utf-8") as f: 55 | json.dump(all_papers, f, ensure_ascii=False, indent=2) 56 | 57 | print(f"\nTotal: {len(all_papers)} papers") 58 | print(f"Saved to {output_file}") 59 | return all_papers 60 | 61 | if __name__ == "__main__": 62 | crawl_papers( 63 | venue_id="ICLR.cc/2026/Conference/Submission", 64 | output_file="iclr2026_papers.json" 65 | ) 66 | 67 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Paper Semantic Search 2 | 3 | Find similar papers using semantic search. Supports both local models (free) and the OpenAI API (better quality). 4 | 5 | ## Features 6 | 7 | - Crawl papers from OpenReview (e.g., ICLR2026 submissions) 8 | - Semantic search with example papers or text queries 9 | - Automatic embedding caching 10 | - Embedding model support: open-source (e.g., all-MiniLM-L6-v2) or OpenAI 11 | 12 | ## Quick Start 13 | 14 | ```bash 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | ### 1. Prepare Papers 19 | 20 | ```python 21 | from crawl import crawl_papers 22 | 23 | crawl_papers( 24 | venue_id="ICLR.cc/2026/Conference/Submission", 25 | output_file="iclr2026_papers.json" 26 | ) 27 | ``` 28 |
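Each record in the output JSON has the fields written by `crawl.py` (`id`, `number`, `title`, `authors`, `abstract`, `keywords`, `primary_area`, `forum_url`). A minimal sketch for sanity-checking the crawl output, assuming the file name from the example above:

```python
import json

# Load the crawled papers and peek at the first record
with open("iclr2026_papers.json", encoding="utf-8") as f:
    papers = json.load(f)

print(f"{len(papers)} papers")
print(papers[0]["title"])
print(papers[0]["forum_url"])
```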
29 | ### 2. Search Papers 30 | 31 | ```python 32 | from search import PaperSearcher 33 | 34 | # Local model (free) 35 | searcher = PaperSearcher('iclr2026_papers.json', model_type='local') 36 | 37 | # OpenAI model (better, requires API key) 38 | # export OPENAI_API_KEY='your-key' 39 | # searcher = PaperSearcher('iclr2026_papers.json', model_type='openai') 40 | 41 | searcher.compute_embeddings() 42 | 43 | # Search with example papers that you are interested in 44 | examples = [ 45 | { 46 | "title": "Your paper title", 47 | "abstract": "Your paper abstract..." 48 | } 49 | ] 50 | 51 | results = searcher.search(examples=examples, top_k=100) 52 | 53 | # Or search with a text query 54 | results = searcher.search(query="interesting topics", top_k=100) 55 | 56 | searcher.display(results, n=10) 57 | searcher.save(results, 'results.json') 58 | ``` 59 | 60 | 61 | 62 | ## How It Works 63 | 64 | 1. Paper titles and abstracts are converted to embeddings 2. Embeddings are cached automatically 3. Your query is embedded using the same model 4. Cosine similarity finds the most similar papers 5. Results are ranked by similarity score 69 | 70 | ## Cache 71 | 72 | Embeddings are cached as `cache_<name>_<hash>_<model>.npy` next to the papers file. Delete the cache file to recompute. 73 | 74 | ## Example Output 75 | 76 | ``` 77 | ================================================================================ 78 | Top 100 Results (showing 10) 79 | ================================================================================ 80 | 81 | 1. [0.8456] Paper a 82 | #12345 | foundation or frontier models, including LLMs 83 | https://openreview.net/forum?id=xxx 84 | 85 | 2. [0.8234] Paper b 86 | #12346 | applications to robotics, autonomy, planning 87 | https://openreview.net/forum?id=yyy 88 | ``` 89 | 90 | ## Tips 91 | 92 | - For best results, use 1-5 example papers or a short paragraph describing the topic you are interested in 93 | - The local model is good enough for most cases 94 | - Use the OpenAI model for critical searches (~$1 for 18k queries) 95 | 96 | If it's useful, please consider giving it a star~ -------------------------------------------------------------------------------- /search.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | import hashlib 5 | from pathlib import Path 6 | from sklearn.metrics.pairwise import cosine_similarity 7 | 8 | class PaperSearcher: 9 | def __init__(self, papers_file, model_type="openai", api_key=None, base_url=None): 10 | with open(papers_file, 'r', encoding='utf-8') as f: 11 | self.papers = json.load(f) 12 | 13 | self.model_type = model_type 14 | self.cache_file = self._get_cache_file(papers_file, model_type) 15 | self.embeddings = None 16 | 17 | if model_type == "openai": 18 | from openai import OpenAI 19 | self.client = OpenAI( 20 | api_key=api_key or os.getenv('OPENAI_API_KEY'), 21 | base_url=base_url 22 | ) 23 | self.model_name = "text-embedding-3-large" 24 | else: 25 | from sentence_transformers import SentenceTransformer 26 | self.model = SentenceTransformer('all-MiniLM-L6-v2') 27 | self.model_name = "all-MiniLM-L6-v2" 28 | 29 | self._load_cache() 30 | 31 | def _get_cache_file(self, papers_file, model_type): 32 | base_name = Path(papers_file).stem 33 | file_hash = hashlib.md5(papers_file.encode()).hexdigest()[:8] 34 | cache_name = f"cache_{base_name}_{file_hash}_{model_type}.npy" 35 | return str(Path(papers_file).parent / cache_name) 36 | 37 | def _load_cache(self): 38 | if os.path.exists(self.cache_file): 39 | try: 40 | self.embeddings =
np.load(self.cache_file) 41 | if len(self.embeddings) == len(self.papers): 42 | print(f"Loaded cache: {self.embeddings.shape}") 43 | return True 44 | self.embeddings = None 45 | except: 46 | self.embeddings = None 47 | return False 48 | 49 | def _save_cache(self): 50 | np.save(self.cache_file, self.embeddings) 51 | print(f"Saved cache: {self.cache_file}") 52 | 53 | def _create_text(self, paper): 54 | parts = [] 55 | if paper.get('title'): 56 | parts.append(f"Title: {paper['title']}") 57 | if paper.get('abstract'): 58 | parts.append(f"Abstract: {paper['abstract']}") 59 | if paper.get('keywords'): 60 | kw = ', '.join(paper['keywords']) if isinstance(paper['keywords'], list) else paper['keywords'] 61 | parts.append(f"Keywords: {kw}") 62 | return ' '.join(parts) 63 | 64 | def _embed_openai(self, texts): 65 | if isinstance(texts, str): 66 | texts = [texts] 67 | 68 | embeddings = [] 69 | batch_size = 100 70 | 71 | for i in range(0, len(texts), batch_size): 72 | batch = texts[i:i + batch_size] 73 | response = self.client.embeddings.create(input=batch, model=self.model_name) 74 | embeddings.extend([item.embedding for item in response.data]) 75 | 76 | return np.array(embeddings) 77 | 78 | def _embed_local(self, texts): 79 | if isinstance(texts, str): 80 | texts = [texts] 81 | return self.model.encode(texts, show_progress_bar=len(texts) > 100) 82 | 83 | def compute_embeddings(self, force=False): 84 | if self.embeddings is not None and not force: 85 | print("Using cached embeddings") 86 | return self.embeddings 87 | 88 | print(f"Computing embeddings ({self.model_name})...") 89 | texts = [self._create_text(p) for p in self.papers] 90 | 91 | if self.model_type == "openai": 92 | self.embeddings = self._embed_openai(texts) 93 | else: 94 | self.embeddings = self._embed_local(texts) 95 | 96 | print(f"Computed: {self.embeddings.shape}") 97 | self._save_cache() 98 | return self.embeddings 99 | 100 | def search(self, examples=None, query=None, top_k=100): 101 | if self.embeddings is None: 102 | self.compute_embeddings() 103 | 104 | if examples: 105 | texts = [] 106 | for ex in examples: 107 | text = f"Title: {ex['title']}" 108 | if ex.get('abstract'): 109 | text += f" Abstract: {ex['abstract']}" 110 | texts.append(text) 111 | 112 | if self.model_type == "openai": 113 | embs = self._embed_openai(texts) 114 | else: 115 | embs = self._embed_local(texts) 116 | 117 | query_emb = np.mean(embs, axis=0).reshape(1, -1) 118 | 119 | elif query: 120 | if self.model_type == "openai": 121 | query_emb = self._embed_openai(query).reshape(1, -1) 122 | else: 123 | query_emb = self._embed_local(query).reshape(1, -1) 124 | else: 125 | raise ValueError("Provide either examples or query") 126 | 127 | similarities = cosine_similarity(query_emb, self.embeddings)[0] 128 | top_indices = np.argsort(similarities)[::-1][:top_k] 129 | 130 | return [{ 131 | 'paper': self.papers[idx], 132 | 'similarity': float(similarities[idx]) 133 | } for idx in top_indices] 134 | 135 | def display(self, results, n=10): 136 | print(f"\n{'='*80}") 137 | print(f"Top {len(results)} Results (showing {min(n, len(results))})") 138 | print(f"{'='*80}\n") 139 | 140 | for i, result in enumerate(results[:n], 1): 141 | paper = result['paper'] 142 | sim = result['similarity'] 143 | 144 | print(f"{i}. 
[{sim:.4f}] {paper['title']}") 145 | print(f" #{paper.get('number', 'N/A')} | {paper.get('primary_area', 'N/A')}") 146 | print(f" {paper['forum_url']}\n") 147 | 148 | def save(self, results, output_file): 149 | with open(output_file, 'w', encoding='utf-8') as f: 150 | json.dump({ 151 | 'model': self.model_name, 152 | 'total': len(results), 153 | 'results': results 154 | }, f, ensure_ascii=False, indent=2) 155 | print(f"Saved to {output_file}") 156 | 157 | --------------------------------------------------------------------------------