"""Helpers for downloading arXiv PDFs and splitting their text into chunks."""
from io import BytesIO


def pdf_to_text(url, timeout=30):
    """Download the PDF at *url* and return its extracted plain text.

    Args:
        url: Direct link to a PDF document.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller forever.

    Returns:
        The text produced by pdfminer's ``extract_text``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    # Imported here rather than at module level so the pure-text helpers
    # below stay importable without the HTTP/PDF stack installed.
    import requests
    from pdfminer.high_level import extract_text

    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of feeding an HTML error page to pdfminer.
    response.raise_for_status()
    return extract_text(BytesIO(response.content))


def create_chunks(text, chunk_size, overlap):
    """Split *text* into chunks of whitespace-separated words.

    Consecutive chunks share ``overlap`` words. The last chunk may be shorter
    than ``chunk_size``; no chunk is emitted that is fully contained in the
    one before it.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk (must be positive).
        overlap: Number of words repeated between neighbouring chunks
            (``0 <= overlap < chunk_size``).

    Returns:
        A list of chunk strings; empty when *text* contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be positive')
    if not 0 <= overlap < chunk_size:
        raise ValueError('overlap must satisfy 0 <= overlap < chunk_size')

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        if start + chunk_size >= len(words):
            # This window already reached the final word; any further window
            # would only repeat words that are part of this chunk.
            break
    return chunks


def abs_to_pdf(url):
    """Turn an arXiv abstract URL (``.../abs/<id>``) into its PDF URL.

    Only the ``/abs/`` path segment is rewritten, so an identifier that
    happens to contain the letters "abs" is left untouched.
    """
    return url.replace('/abs/', '/pdf/', 1)
8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **ARXIV QUERY & SUMMARIZE** 2 | 3 | A simple Python script that keyword searches papers and summarizes the abstract in simple English using OpenAI's ChatGPT API. Research for everyone! 4 | 5 | Feel free to customize / PR. 6 | 7 | **Installation:** 8 | ``` 9 | # add your API key to a .env file as SECRET_KEY=<your key> 10 | pip install -r requirements.txt 11 | python main.py 12 | ``` 13 | 14 | **Bugs:** 15 | - Selecting multiple papers to summarize will sometimes fail 16 | 17 | **Features:** 18 | - first summarizes paper abstract to help you make a decision if you want the full summary 19 | - summarizes the entire paper if selected 20 | - select GPT version 21 | - use your own summary prompt 22 | - add your own tags 23 | - saves papers as txt 24 | - ignores already saved papers 25 | - runs in console, prints summaries in console 26 | 27 | **To do later:** 28 | - add costs 29 | - retry on fail & better error handling 30 | - search for new papers added since the last time the script ran 31 | 32 | **Example output:** 33 | 34 | papers/Fast_Distributed_Inference_Serving_for_Large_Language_Models.txt/ 35 | 36 | Title: Fast Distributed Inference Serving for Large Language Models 37 | 38 | URL: http://arxiv.org/abs/2305.05920v1 39 | 40 | Summary: FastServe is a new system that helps large language models (LLMs) to work faster and more efficiently. It uses a special scheduling method that allows it to complete tasks more quickly by breaking them down into smaller parts. This means that it can handle more tasks at once and complete them faster than other systems. FastServe also has a special memory management system that helps it to work more efficiently with LLMs. 
"""Find recent arXiv papers by title keyword and summarize them with OpenAI.

Running the script fetches the newest matching papers, stores a short
abstract summary for each one under ``papers/``, and then lets the user pick
papers whose full PDF text should be summarized as well.
"""
import datetime
import os
import re
from urllib.parse import urlencode

import dateutil.parser
import openai
import requests
import tiktoken
from bs4 import BeautifulSoup
# NOTE: provided by the python-dotenv package, which requirements.txt does
# not list; install it alongside the pinned requirements.
from dotenv import load_dotenv

from pdf import pdf_to_text, create_chunks, abs_to_pdf

load_dotenv()  # take environment variables from .env.

openai.api_key = os.getenv("SECRET_KEY")
gpt_version = "gpt-3.5-turbo"

keywords = ['large language models', 'openai']
max_papers = 5  # max amount of papers to summarize

summary_prompt = 'Summarize the user input in 3 sentences using simple english'
full_summary_prompt = 'The following user input is an excerpt from a research paper. Produce a short-length summary of ' \
                      'the text using simple English while retaining salient points or key insights. If there is text ' \
                      'not related to the research, ' \
                      'such as references, do not include it in the summary.'

# research should not be older than this date
oldest_date = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=7)

base_url = 'http://export.arxiv.org/api/query?'
page_size = 50  # results requested per arXiv API call
request_timeout = 30  # seconds before an arXiv request is abandoned

# Prompt and completion share one context window, so room for the reply and
# for the chat message framing has to be kept free when sizing chunks.
context_limit = 4096
max_summary_tokens = 200
message_overhead_tokens = 16  # rough allowance for role/framing tokens


def _paper_path(title):
    """Return the text-file path used to store summaries for *title*."""
    safe_title = re.sub(r'\W+', '_', title)
    return f'papers/{safe_title}.txt'


def _chat_summary(system_prompt, text):
    """Ask the chat model to summarize *text* following *system_prompt*."""
    completion = openai.ChatCompletion.create(
        model=gpt_version,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text}
        ],
        temperature=0,
        max_tokens=max_summary_tokens
    )
    return completion['choices'][0]['message']['content']


def fetch_recent_papers():
    """Query arXiv and return up to ``max_papers`` new, recent papers.

    Papers older than ``oldest_date`` and papers that already have a summary
    file under ``papers/`` are skipped.

    Returns:
        A list of dicts with the keys ``title``, ``id``, ``abstract`` and
        ``published``.

    Raises:
        requests.RequestException: If the arXiv API cannot be reached or
            answers with an error status.
    """
    query = ' OR '.join(f'ti:"{kw}"' for kw in keywords)
    found = []
    start = 0

    while True:
        # urlencode escapes spaces, quotes and reserved characters in keywords.
        url = base_url + urlencode({
            'search_query': query,
            'start': start,
            'max_results': page_size,
            'sortBy': 'submittedDate',
            'sortOrder': 'descending',
        })
        response = requests.get(url, timeout=request_timeout)
        response.raise_for_status()
        entries = BeautifulSoup(response.text, 'xml').find_all('entry')

        if not entries:
            return found

        for entry in entries:
            published = dateutil.parser.parse(entry.published.text)
            # Results arrive newest first, so the first entry that is too old
            # means everything after it is too old as well.
            if published < oldest_date or len(found) >= max_papers:
                return found

            if os.path.exists(_paper_path(entry.title.text)):
                continue

            found.append({
                'title': entry.title.text,
                'id': entry.id.text,
                'abstract': entry.summary.text,
                'published': published
            })

        if len(found) >= max_papers:
            return found

        start += page_size


def summarize_abstracts(papers):
    """Summarize each paper's abstract, save it to disk and store it on the dict."""
    os.makedirs('papers', exist_ok=True)
    for paper in papers:
        print('creating abstract summary for ' + paper['title'])
        summary = _chat_summary(summary_prompt, paper['abstract'])
        print('complete')

        # Save the summary to a text file
        with open(_paper_path(paper['title']), 'w', encoding='utf-8') as f:
            f.write(f"Title: {paper['title']}\n")
            f.write(f"URL: {paper['id']}\n")
            f.write(f"Summary: {summary}\n")

        # Save the summary in the papers dictionary
        paper['summary'] = summary


def _chunks_within_budget(text):
    """Split *text* into chunks that each fit the model's prompt budget.

    Starts from 1750-word chunks with a 100-word overlap and shrinks both by
    10% until every chunk fits.

    Raises:
        ValueError: If no chunk size can satisfy the token budget.
    """
    encoding = tiktoken.encoding_for_model(gpt_version)
    budget = (context_limit - max_summary_tokens - message_overhead_tokens
              - len(encoding.encode(full_summary_prompt)))
    chunk_size = 1750
    overlap = 100

    while True:
        chunks = create_chunks(text, chunk_size, overlap)
        if all(len(encoding.encode(chunk)) <= budget for chunk in chunks):
            return chunks
        chunk_size = int(chunk_size * 0.9)
        overlap = int(overlap * 0.9)
        if chunk_size < 1:
            raise ValueError('text cannot be split to fit the model context window')


def _summarize_text(text):
    """Summarize *text* chunk by chunk until a single summary remains.

    Raises:
        ValueError: If there is no text to summarize.
    """
    while True:
        chunks = _chunks_within_budget(text)
        if not chunks:
            raise ValueError('no text to summarize')

        print("# Chunks: " + str(len(chunks)))
        summaries = []
        for position, chunk in enumerate(chunks, 1):
            print("summarizing chunk: " + str(position))
            summaries.append(_chat_summary(full_summary_prompt, chunk))

        if len(chunks) == 1:
            return summaries[0]
        # Several partial summaries: condense them in another pass. They are
        # re-chunked first so a very long paper cannot overflow the context.
        text = ' '.join(summaries)


def full_summary(paper):
    """Summarize the full PDF of *paper* and append the result to its file.

    Args:
        paper: Paper dict as produced by ``fetch_recent_papers``; its ``id``
            is the arXiv abstract URL and its ``title`` names the file.

    Returns:
        The final summary, which is also stored as ``paper['final_summary']``.

    Raises:
        ValueError: If no text could be extracted from the PDF.
    """
    pdf_url = abs_to_pdf(paper['id'])
    pdf_text = pdf_to_text(pdf_url)
    final_summary = _summarize_text(pdf_text)

    with open(_paper_path(paper['title']), 'a', encoding='utf-8') as f:
        f.write(f"Final Summary: {final_summary}\n")
    paper['final_summary'] = final_summary
    print("====FULL SUMMARY====")
    return final_summary


def _parse_selection(selection, paper_count):
    """Convert user input such as ``'1 2 3'`` into valid 1-based paper numbers.

    Raises:
        ValueError: If an entry is not an integer.
        IndexError: If a number is outside ``1..paper_count``.
    """
    numbers = [int(token) for token in selection.split()]
    for number in numbers:
        # Reject 0 and negatives explicitly; list indexing would otherwise
        # quietly pick a paper from the end of the list.
        if not 1 <= number <= paper_count:
            raise IndexError(f'paper number {number} is out of range')
    return numbers


def main():
    """Run the search, print abstract summaries and offer full summaries."""
    papers = fetch_recent_papers()
    if not papers:
        print('No new papers found.')
        return

    summarize_abstracts(papers)

    # Print the abstract titles and summaries
    print("\n" + "====" * 5 + " (* ̄▽ ̄)b" + "\n")
    for i, paper in enumerate(papers, 1):
        print(f'{i}. {paper["title"]}\n   {paper["summary"]}\n')

    # Allow the user to select papers to fully summarize
    summarized_papers = set()
    while len(summarized_papers) < len(papers):
        selection = input(
            "Enter paper numbers to summarize, separated by spaces like so '1 2 3', or 'q' to quit: ").strip()
        if selection.lower() == 'q':
            return

        try:
            numbers = _parse_selection(selection, len(papers))
        except ValueError:
            print("Invalid selection, please make sure you're entering numbers separated by spaces.")
            continue
        except IndexError:
            print("Invalid selection, please make sure the numbers you're entering correspond to available papers.")
            continue

        for number in numbers:
            if number in summarized_papers:  # already summarized earlier
                continue
            try:
                print(full_summary(papers[number - 1]))
            except (ValueError, requests.RequestException, openai.error.OpenAIError) as err:
                # One failing paper should not abort the rest of the selection.
                print(f'Could not summarize paper {number}: {err}')
                continue
            summarized_papers.add(number)

    print("All papers have been summarized.")


if __name__ == '__main__':
    main()