├── app
│   ├── __init__.py
│   ├── utils.py
│   ├── validator.py
│   ├── pdf_document.py
│   ├── summary.py
│   ├── magic.py
│   ├── youtube_video.py
│   ├── main.py
│   └── clustered_summary.py
├── requirements.txt
├── LICENSE
├── .gitignore
└── README.md

--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
streamlit==1.38.0
youtube-transcript-api==0.6.2
pypdf2==3.0.1
tiktoken==0.7.0
openai==1.42.0
scikit-learn==1.5.1
numpy==2.0.2

--------------------------------------------------------------------------------
/app/utils.py:
--------------------------------------------------------------------------------
import tiktoken


def count_tokens(text: str) -> int:
    """Count tokens with the cl100k_base encoding used by the OpenAI models here."""
    encoding = tiktoken.get_encoding("cl100k_base")
    token_list = encoding.encode(text, disallowed_special=())
    return len(token_list)

--------------------------------------------------------------------------------
/app/validator.py:
--------------------------------------------------------------------------------
from typing import List

from utils import count_tokens


class Validator:

    MAX_TOKENS = 1_000_000

    @staticmethod
    def validate_text(text: str) -> List[str]:
        errors = []

        if not text:
            errors.append("No text found.")
            return errors

        if count_tokens(text) > Validator.MAX_TOKENS:
            errors.append(
                "The text is too large. Please provide a text with fewer than 1,000,000 tokens."
            )

        return errors

--------------------------------------------------------------------------------
/app/pdf_document.py:
--------------------------------------------------------------------------------
import io

import PyPDF2


class PdfDocument:
    def __init__(self, pdf_bytes: io.BytesIO):
        self.pdf_bytes = pdf_bytes
        self._text_content = None

    @property
    def text_content(self) -> str:
        if self._text_content is not None:
            return self._text_content
        pdf_reader = PyPDF2.PdfReader(self.pdf_bytes)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()
        # Cache the sanitized text so repeated reads return the same content.
        self._text_content = self.sanitize_text(text)
        return self._text_content

    @staticmethod
    def sanitize_text(text: str) -> str:
        # Strip OpenAI/tiktoken special tokens so an uploaded document
        # cannot smuggle them into the prompts.
        special_tokens = [
            "<|endoftext|>",
            "<|fim_prefix|>",
            "<|fim_middle|>",
            "<|fim_suffix|>",
            "<|endofprompt|>",
        ]
        for special in special_tokens:
            text = text.replace(special, "")
        return text
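
Editor's sketch (not part of the repository): how the three modules above compose. It assumes the pinned requirements are installed and the app/ directory is the working directory; "paper.pdf" is a placeholder path.

import io

from pdf_document import PdfDocument
from utils import count_tokens
from validator import Validator

with open("paper.pdf", "rb") as f:  # hypothetical input file
    text = PdfDocument(io.BytesIO(f.read())).text_content

print(count_tokens(text))             # cl100k_base token count
print(Validator.validate_text(text))  # [] when the text passes validation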
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Ethan

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/app/summary.py:
--------------------------------------------------------------------------------
from magic import Magician


class Summary(Magician):

    SUMMARIZE_SYSTEM_MESSAGE = """
You will be given a complete %s. It will be enclosed in triple single quotes.
Please provide a comprehensive and cohesive summary of the %s, focusing on the key points and main ideas, while maintaining clarity and conciseness.

Format your summary in HTML. It should be structured as follows:

- A short, bullet form list of key takeaways.
- A well-formatted easy-to-read synopsis, structured like an essay that summarizes the document cohesively.
- A conclusion that ties all the ideas together.

Format for maximum readability and clarity.
"""

    def __init__(self, text: str, media_type: str):
        super().__init__()
        self.text = text
        self.media_type = media_type

    def get_summary(self) -> str:
        system_message = self.SUMMARIZE_SYSTEM_MESSAGE % (
            self.media_type,
            self.media_type,
        )
        user_message = f"'''{self.text}'''"
        full_summary = self.wave_wand(system_message, user_message)

        return self.extract_code(full_summary)

--------------------------------------------------------------------------------
/app/magic.py:
--------------------------------------------------------------------------------
import re
from typing import Dict, List

import openai
import tiktoken


class Magician:
    MAGIC_WAND = "gpt-4o-mini"
    OTHER_MAGIC_WAND = "text-embedding-3-small"
    # text-embedding-3-small accepts at most 8,191 input tokens, so longer
    # inputs are truncated to a representative prefix before embedding.
    MAX_EMBEDDING_TOKENS = 8_000

    def __init__(self):
        # Reads the API key from the OPENAI_API_KEY environment variable.
        self.client = openai.OpenAI()

    def wave_wand(self, system_message: str, user_message: str) -> str:
        """Run a single chat completion and return the assistant's text."""
        return (
            self.client.chat.completions.create(
                model=self.MAGIC_WAND,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message},
                ],
            )
            .choices[0]
            .message.content
        )

    def create_magic_numbers(self, text: str) -> Dict[str, List[float]]:
        """Embed `text`, truncating it to the embedding model's input limit."""
        encoding = tiktoken.get_encoding("cl100k_base")
        tokens = encoding.encode(text, disallowed_special=())
        if len(tokens) > self.MAX_EMBEDDING_TOKENS:
            text_to_embed = encoding.decode(tokens[: self.MAX_EMBEDDING_TOKENS])
        else:
            text_to_embed = text
        response = self.client.embeddings.create(
            input=text_to_embed,
            model=self.OTHER_MAGIC_WAND,
        )
        # Key by the original (untruncated) text so callers can look it up.
        return {text: response.data[0].embedding}

    @staticmethod
    def extract_code(markdown_text: str) -> str:
        # Models often wrap the HTML answer in a fenced block (e.g. ```html ... ```).
        # Unwrap any such fences, then drop newlines to leave one HTML string.
        cleaned_text = re.sub(
            r"```[\w]*\n(.*?)```", r"\1", markdown_text, flags=re.DOTALL
        ).replace("\n", "")

        return cleaned_text
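
Editor's sketch (not part of the repository): the smallest end-to-end use of the two classes above, assuming OPENAI_API_KEY is set in the environment and app/ is the working directory. The sample text is made up.

from summary import Summary

article = "Streamlit turns Python scripts into shareable web apps. " * 40
html_summary = Summary(article, "article").get_summary()
print(html_summary)  # HTML with key takeaways, a synopsis, and a conclusion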
--------------------------------------------------------------------------------
/app/youtube_video.py:
--------------------------------------------------------------------------------
from typing import List, Optional
from urllib.parse import urlparse, parse_qs

from youtube_transcript_api import YouTubeTranscriptApi


class YouTubeVideo:

    def __init__(self, video_url: str):
        self.video_url = video_url
        self.video_id = self.get_youtube_video_id()

    def get_transcript(self) -> Optional[str]:
        transcript = YouTubeTranscriptApi.get_transcript(self.video_id)
        if not transcript:
            return None
        return self.transcript_to_plain_text(transcript)

    def get_youtube_video_id(self) -> str:
        # Handles youtu.be short links and youtube.com /watch, /embed/, and /v/ URLs.
        query = urlparse(self.video_url)
        if query.hostname == "youtu.be":
            return query.path[1:]
        if query.hostname in ("www.youtube.com", "youtube.com"):
            if query.path == "/watch":
                return parse_qs(query.query)["v"][0]
            if query.path.startswith("/embed/"):
                return query.path.split("/")[2]
            if query.path.startswith("/v/"):
                return query.path.split("/")[2]
        raise ValueError("Invalid YouTube URL")

    @staticmethod
    def transcript_to_plain_text(transcript: List[dict]) -> str:
        return " ".join([entry["text"] for entry in transcript])
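
Editor's sketch (not part of the repository): the URL formats get_youtube_video_id accepts. The video ID is illustrative.

from youtube_video import YouTubeVideo

print(YouTubeVideo("https://youtu.be/abc123XYZ_w").video_id)                  # abc123XYZ_w
print(YouTubeVideo("https://www.youtube.com/watch?v=abc123XYZ_w").video_id)  # abc123XYZ_w
print(YouTubeVideo("https://www.youtube.com/embed/abc123XYZ_w").video_id)    # abc123XYZ_w
YouTubeVideo("https://example.com/video")  # raises ValueError("Invalid YouTube URL")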
document", 22 | type=["pdf"], 23 | ) 24 | 25 | if input_method == "Enter a YouTube URL": 26 | youtube_url = st.text_input("Enter a YouTube URL") 27 | 28 | st.sidebar.markdown("# [Contact me by email!](mailto:ethanujohnston@gmail.com)") 29 | st.sidebar.markdown( 30 | "# [Check out my other projects!](https://github.com/e-johnstonn)" 31 | ) 32 | st.sidebar.markdown("# [Twitter / X](https://x.com/ethanjdev)") 33 | 34 | if st.button("Summarize"): 35 | if input_method == "Upload a document": 36 | if uploaded_file is None: 37 | st.warning("Please upload a file.") 38 | return 39 | summarize_file(uploaded_file) 40 | 41 | else: 42 | if not youtube_url: 43 | st.warning("Please enter a YouTube URL.") 44 | return 45 | summarize_youtube(youtube_url) 46 | 47 | 48 | def summarize_file(uploaded_file: UploadedFile): 49 | document = PdfDocument(uploaded_file) 50 | validation_errors = Validator.validate_text(document.text_content) 51 | 52 | if validation_errors: 53 | st.warning(f"Invalid input: {','.join(validation_errors)}") 54 | return 55 | 56 | st.markdown(run_summary(document.text_content, f"document"), unsafe_allow_html=True) 57 | 58 | 59 | def summarize_youtube(youtube_url: str): 60 | video = YouTubeVideo(youtube_url) 61 | transcript = video.get_transcript() 62 | validation_errors = Validator.validate_text(transcript) 63 | 64 | if validation_errors: 65 | st.warning(validation_errors) 66 | return 67 | 68 | st.markdown(run_summary(transcript, "youtube video"), unsafe_allow_html=True) 69 | 70 | 71 | def run_summary(text: str, media_type: str): 72 | tokens = count_tokens(text) 73 | 74 | if tokens > 100_000: 75 | return ClusteredSummary(text, media_type).get_summary() 76 | else: 77 | return Summary(text, media_type).get_summary() 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # **GPT 3.5/4/4o-mini Powered Document Summarizer** 2 | 3 | This is a tool that takes a text document (PDF or TXT) or YouTube transcript and generates a concise summary using GPT-4O-Mini, GPT-4 or GPT-3.5-turbo. It can accurately summarize hundreds of pages of text. It's built with Python and Streamlit and leverages the langchain library for text processing. 4 | While the final output is generated with the latest GPT family model from OpenAI, GPT-4O-Mini (one of the LLMs that powers ChatGPT), only a small portion of the overall document is used in the prompts. Before any call is made to either LLM, the document is separated into 5 | small sections that contain the majority of the meaning of the document. 6 | 7 | Summarize your documents here (no API key required): https://gpt-document-summarizer.streamlit.app/ 8 | 9 | ## Features 10 | 11 | - Supports PDF and TXT file formats 12 | - Utilizes GPT-4 or GPT-3.5-turbo for generating summaries 13 | - Automatic clustering of the input document to identify key sections 14 | - Customizable number of clusters for the summarization process 15 | 16 | ## Usage 17 | 18 | 1. Launch the Streamlit app by running `streamlit run main.py` 19 | 2. Upload a document (TXT or PDF) to summarize. 20 | 3. Enter your OpenAI API key if the free usage cap has been hit. 21 | 4. Choose whether to use GPT-4 for the summarization (recommended, requires GPT-4 API access). 22 | 5. Click the "Summarize" button and wait for the result. 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# **GPT-4o-mini Powered Document Summarizer**

This is a tool that takes a PDF document or a YouTube video and generates a concise summary using GPT-4o-mini (one of the LLMs that powers ChatGPT). It can accurately summarize hundreds of pages of text. It's built with Python and Streamlit.
For very large documents, only a small portion of the overall text is used in the final prompt: the document is split into chunks, embedded, and clustered, and a handful of representative sections that carry the majority of the document's meaning are summarized.

Summarize your documents here (no API key required): https://gpt-document-summarizer.streamlit.app/

## Features

- Supports PDF uploads and YouTube URLs
- Uses GPT-4o-mini for summaries and text-embedding-3-small for embeddings
- Automatic clustering of large documents (over 100,000 tokens) to identify key sections
- Rejects inputs larger than 1,000,000 tokens

## Usage

1. Set the `OPENAI_API_KEY` environment variable to your OpenAI API key.
2. Launch the Streamlit app from the `app/` directory by running `streamlit run main.py`.
3. Upload a PDF document or enter a YouTube URL.
4. Click the "Summarize" button and wait for the result.

## Modules

- `main.py`: Streamlit app main file
- `utils.py`: Token counting with tiktoken
- `validator.py`: Input validation
- `pdf_document.py`: PDF text extraction and sanitization
- `youtube_video.py`: YouTube URL parsing and transcript fetching
- `magic.py`: Thin wrapper around the OpenAI chat and embedding APIs
- `summary.py`: Single-pass summarization for smaller documents
- `clustered_summary.py`: Chunked, clustered summarization for large documents

## Main Functions

- `main()`: Entry point for the Streamlit app
- `summarize_file()`: Validates an uploaded PDF and displays the generated summary
- `summarize_youtube()`: Fetches and validates a video transcript and displays the generated summary
- `run_summary()`: Routes documents over 100,000 tokens to `ClusteredSummary`, everything else to `Summary`

## Utility Functions

- `count_tokens()`: Counts the number of tokens in a text string
- `Validator.validate_text()`: Validates user input and returns warnings for invalid inputs
- `PdfDocument.text_content`: Converts an uploaded PDF to a sanitized text string
- `YouTubeVideo.get_transcript()`: Fetches a transcript and flattens it to plain text
- `ClusteredSummary.cluster_chunks()`: Picks the chunk closest to each cluster center
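
The clustering step works roughly as sketched below (an editor's illustration, not code from this repo): chunk embeddings are clustered with KMeans and the chunk nearest each center is kept as that cluster's representative. Random vectors stand in for real embeddings so the sketch runs without an API key.

```python
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(20, 8))  # 20 chunks, 8-dim stand-in embeddings
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10).fit(embeddings)

# For each cluster, keep the index of the chunk closest to its center.
representatives = [
    int(np.argmin(np.linalg.norm(embeddings - center, axis=1)))
    for center in kmeans.cluster_centers_
]
print(representatives)
```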
--------------------------------------------------------------------------------
/app/clustered_summary.py:
--------------------------------------------------------------------------------
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List

import numpy as np
from sklearn.cluster import KMeans

from magic import Magician
from utils import count_tokens


class ClusteredSummary(Magician):

    MIN_CHUNK_SIZE = 50_000
    NUM_CLUSTERS = 8
    MAX_THREADS = 4

    SUMMARIZE_CHUNK_SYSTEM_MESSAGE = """
You will be given a single section from a %s. It will be enclosed in triple single quotes.
Please provide a comprehensive and cohesive summary of the excerpt, focusing on the key points and main ideas, while maintaining clarity and conciseness."""

    SUMMARIZE_ALL_CHUNKS_SYSTEM_MESSAGE = """
You will be given a list of summaries from a %s. They will be enclosed in triple single quotes.
Please provide a cohesive summary of the %s, focusing on the key points and main ideas, while maintaining clarity and conciseness.

Format your summary in HTML. It should be structured as follows:

- A short, bullet form list of key takeaways.
- A well-formatted easy-to-read synopsis, structured like an essay that summarizes the document cohesively.
- A conclusion that ties all the ideas together.

Format for maximum readability and clarity.
"""

    def __init__(self, text: str, media_type: str):
        super().__init__()
        self.text = text
        self.media_type = media_type

    def get_summary(self) -> str:
        text_chunks = self.chunk_text()
        clustered_chunks = self.cluster_chunks(text_chunks)
        summaries: List[str] = [""] * len(clustered_chunks)

        # Summarize the representative chunks in parallel, preserving their order.
        with ThreadPoolExecutor(max_workers=self.MAX_THREADS) as executor:
            future_to_index = {
                executor.submit(self.get_summary_for_chunk, chunk): i
                for i, chunk in enumerate(clustered_chunks)
            }
            for future in as_completed(future_to_index):
                index = future_to_index[future]
                summaries[index] = future.result()

        system_message = self.SUMMARIZE_ALL_CHUNKS_SYSTEM_MESSAGE % (
            self.media_type,
            self.media_type,
        )
        user_message = "\n".join([f"'''{summary}'''" for summary in summaries])
        full_summary = self.wave_wand(system_message, user_message)

        return self.extract_code(full_summary)

    def get_summary_for_chunk(self, chunk: str) -> str:
        system_message = self.SUMMARIZE_CHUNK_SYSTEM_MESSAGE % self.media_type
        user_message = f"'''{chunk}'''"
        return self.wave_wand(system_message, user_message)

    def chunk_text(self) -> List[str]:
        # Split the text into roughly equal chunks of at most MIN_CHUNK_SIZE tokens,
        # converting the token budget to characters via the average token length.
        total_tokens = count_tokens(self.text)
        min_chunk_size = min(self.MIN_CHUNK_SIZE, total_tokens // 2)
        num_chunks = max(1, total_tokens // min_chunk_size)
        chunk_size = total_tokens // num_chunks
        avg_token_length = len(self.text) // total_tokens
        chunk_size_in_chars = chunk_size * avg_token_length
        text_chunks = self._split_text_by_characters(self.text, chunk_size_in_chars)

        return text_chunks

    def cluster_chunks(self, chunks: List[str]) -> List[str]:
        if len(chunks) < self.NUM_CLUSTERS:
            return chunks

        embeddings = [self.create_magic_numbers(chunk)[chunk] for chunk in chunks]
        embeddings_matrix = np.array(embeddings)

        kmeans = KMeans(n_clusters=self.NUM_CLUSTERS, random_state=42).fit(
            embeddings_matrix
        )

        # Keep the chunk nearest each cluster center as that cluster's representative.
        closest_chunks = []
        for cluster_idx in range(self.NUM_CLUSTERS):
            cluster_center = kmeans.cluster_centers_[cluster_idx]
            distances = np.linalg.norm(embeddings_matrix - cluster_center, axis=1)
            closest_chunk_idx = np.argmin(distances)
            closest_chunks.append(chunks[closest_chunk_idx])

        return closest_chunks

    def _split_text_by_characters(
        self, text: str, chunk_size_in_chars: int
    ) -> List[str]:
        chunks = []
        start = 0

        while start < len(text):
            end = start + chunk_size_in_chars

            # Prefer to break at the last space inside the window...
            if end < len(text):
                end = text.rfind(" ", start, end) + 1

            # ...but fall back to a hard cut if no space was found.
            if end <= start:
                end = start + chunk_size_in_chars

            chunks.append(text[start:end].strip())
            start = end

        return chunks
--------------------------------------------------------------------------------
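
Editor's sketch (not part of the repository): exercising the offline chunking path above. Constructing the class builds an OpenAI client, so OPENAI_API_KEY must be set even though no request is made here.

from clustered_summary import ClusteredSummary

cs = ClusteredSummary("word " * 20_000, "document")
print(len(cs.chunk_text()))  # a handful of roughly equal chunks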