├── app
│   ├── __init__.py
│   ├── utils.py
│   ├── validator.py
│   ├── pdf_document.py
│   ├── summary.py
│   ├── magic.py
│   ├── youtube_video.py
│   ├── main.py
│   └── clustered_summary.py
├── requirements.txt
├── LICENSE
├── .gitignore
└── README.md

--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
streamlit==1.38.0
youtube-transcript-api==0.6.2
pypdf2==3.0.1
tiktoken==0.7.0
openai==1.42.0
scikit-learn==1.5.1
numpy==2.0.2

--------------------------------------------------------------------------------
/app/utils.py:
--------------------------------------------------------------------------------
import tiktoken


def count_tokens(text: str) -> int:
    """Count tokens with the cl100k_base encoding used by the OpenAI models here."""
    encoding = tiktoken.get_encoding("cl100k_base")
    token_list = encoding.encode(text, disallowed_special=())
    return len(token_list)

--------------------------------------------------------------------------------
/app/validator.py:
--------------------------------------------------------------------------------
from typing import List

from utils import count_tokens


class Validator:

    MAX_TOKENS = 1_000_000

    @staticmethod
    def validate_text(text: str) -> List[str]:
        errors = []

        if not text:
            errors.append("No text found.")
            return errors

        if count_tokens(text) > Validator.MAX_TOKENS:
            errors.append(
                "The text is too large. Please provide a text with fewer than 1,000,000 tokens."
            )

        return errors

--------------------------------------------------------------------------------
/app/pdf_document.py:
--------------------------------------------------------------------------------
import io

import PyPDF2


class PdfDocument:
    def __init__(self, pdf_bytes: io.BytesIO):
        self.pdf_bytes = pdf_bytes
        self._text_content = None

    @property
    def text_content(self) -> str:
        if self._text_content is not None:
            return self._text_content
        pdf_reader = PyPDF2.PdfReader(self.pdf_bytes)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()
        # Cache the sanitized text so repeated reads return the same content.
        self._text_content = self.sanitize_text(text)
        return self._text_content

    @staticmethod
    def sanitize_text(text: str) -> str:
        # Strip OpenAI/tiktoken special tokens so an uploaded document
        # cannot smuggle them into the prompts.
        special_tokens = [
            "<|endoftext|>",
            "<|fim_prefix|>",
            "<|fim_middle|>",
            "<|fim_suffix|>",
            "<|endofprompt|>",
        ]
        for special in special_tokens:
            text = text.replace(special, "")
        return text
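
Editor's sketch (not part of the repository): how the three modules above compose. It assumes the pinned requirements are installed and the app/ directory is the working directory; "paper.pdf" is a placeholder path.

import io

from pdf_document import PdfDocument
from utils import count_tokens
from validator import Validator

with open("paper.pdf", "rb") as f:  # hypothetical input file
    text = PdfDocument(io.BytesIO(f.read())).text_content

print(count_tokens(text))             # cl100k_base token count
print(Validator.validate_text(text))  # [] when the text passes validation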
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Ethan

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/app/summary.py:
--------------------------------------------------------------------------------
from magic import Magician


class Summary(Magician):

    SUMMARIZE_SYSTEM_MESSAGE = """
You will be given a complete %s. It will be enclosed in triple single quotes.
Please provide a comprehensive and cohesive summary of the %s, focusing on the key points and main ideas, while maintaining clarity and conciseness.

Format your summary in HTML. It should be structured as follows:

- A short, bullet form list of key takeaways.
- A well-formatted easy-to-read synopsis, structured like an essay that summarizes the document cohesively.
- A conclusion that ties all the ideas together.

Format for maximum readability and clarity.
"""

    def __init__(self, text: str, media_type: str):
        super().__init__()
        self.text = text
        self.media_type = media_type

    def get_summary(self) -> str:
        system_message = self.SUMMARIZE_SYSTEM_MESSAGE % (
            self.media_type,
            self.media_type,
        )
        user_message = f"'''{self.text}'''"
        full_summary = self.wave_wand(system_message, user_message)

        return self.extract_code(full_summary)

--------------------------------------------------------------------------------
/app/magic.py:
--------------------------------------------------------------------------------
import re
from typing import Dict, List

import openai
import tiktoken


class Magician:
    MAGIC_WAND = "gpt-4o-mini"
    OTHER_MAGIC_WAND = "text-embedding-3-small"
    # text-embedding-3-small accepts at most 8,191 input tokens, so longer
    # inputs are truncated to a representative prefix before embedding.
    MAX_EMBEDDING_TOKENS = 8_000

    def __init__(self):
        # Reads the API key from the OPENAI_API_KEY environment variable.
        self.client = openai.OpenAI()

    def wave_wand(self, system_message: str, user_message: str) -> str:
        """Run a single chat completion and return the assistant's text."""
        return (
            self.client.chat.completions.create(
                model=self.MAGIC_WAND,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message},
                ],
            )
            .choices[0]
            .message.content
        )

    def create_magic_numbers(self, text: str) -> Dict[str, List[float]]:
        """Embed `text`, truncating it to the embedding model's input limit."""
        encoding = tiktoken.get_encoding("cl100k_base")
        tokens = encoding.encode(text, disallowed_special=())
        if len(tokens) > self.MAX_EMBEDDING_TOKENS:
            text_to_embed = encoding.decode(tokens[: self.MAX_EMBEDDING_TOKENS])
        else:
            text_to_embed = text
        response = self.client.embeddings.create(
            input=text_to_embed,
            model=self.OTHER_MAGIC_WAND,
        )
        # Key by the original (untruncated) text so callers can look it up.
        return {text: response.data[0].embedding}

    @staticmethod
    def extract_code(markdown_text: str) -> str:
        # Models often wrap the HTML answer in a fenced block (e.g. ```html ... ```).
        # Unwrap any such fences, then drop newlines to leave one HTML string.
        cleaned_text = re.sub(
            r"```[\w]*\n(.*?)```", r"\1", markdown_text, flags=re.DOTALL
        ).replace("\n", "")

        return cleaned_text
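
Editor's sketch (not part of the repository): the smallest end-to-end use of the two classes above, assuming OPENAI_API_KEY is set in the environment and app/ is the working directory. The sample text is made up.

from summary import Summary

article = "Streamlit turns Python scripts into shareable web apps. " * 40
html_summary = Summary(article, "article").get_summary()
print(html_summary)  # HTML with key takeaways, a synopsis, and a conclusion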
--------------------------------------------------------------------------------
/app/youtube_video.py:
--------------------------------------------------------------------------------
from typing import List, Optional
from urllib.parse import urlparse, parse_qs

from youtube_transcript_api import YouTubeTranscriptApi


class YouTubeVideo:

    def __init__(self, video_url: str):
        self.video_url = video_url
        self.video_id = self.get_youtube_video_id()

    def get_transcript(self) -> Optional[str]:
        transcript = YouTubeTranscriptApi.get_transcript(self.video_id)
        if not transcript:
            return None
        return self.transcript_to_plain_text(transcript)

    def get_youtube_video_id(self) -> str:
        # Handles youtu.be short links and youtube.com /watch, /embed/, and /v/ URLs.
        query = urlparse(self.video_url)
        if query.hostname == "youtu.be":
            return query.path[1:]
        if query.hostname in ("www.youtube.com", "youtube.com"):
            if query.path == "/watch":
                return parse_qs(query.query)["v"][0]
            if query.path.startswith("/embed/"):
                return query.path.split("/")[2]
            if query.path.startswith("/v/"):
                return query.path.split("/")[2]
        raise ValueError("Invalid YouTube URL")

    @staticmethod
    def transcript_to_plain_text(transcript: List[dict]) -> str:
        return " ".join([entry["text"] for entry in transcript])
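
Editor's sketch (not part of the repository): the URL formats get_youtube_video_id accepts. The video ID is illustrative.

from youtube_video import YouTubeVideo

print(YouTubeVideo("https://youtu.be/abc123XYZ_w").video_id)                  # abc123XYZ_w
print(YouTubeVideo("https://www.youtube.com/watch?v=abc123XYZ_w").video_id)  # abc123XYZ_w
print(YouTubeVideo("https://www.youtube.com/embed/abc123XYZ_w").video_id)    # abc123XYZ_w
YouTubeVideo("https://example.com/video")  # raises ValueError("Invalid YouTube URL")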
document", 22 | type=["pdf"], 23 | ) 24 | 25 | if input_method == "Enter a YouTube URL": 26 | youtube_url = st.text_input("Enter a YouTube URL") 27 | 28 | st.sidebar.markdown("# [Contact me by email!](mailto:ethanujohnston@gmail.com)") 29 | st.sidebar.markdown( 30 | "# [Check out my other projects!](https://github.com/e-johnstonn)" 31 | ) 32 | st.sidebar.markdown("# [Twitter / X](https://x.com/ethanjdev)") 33 | 34 | if st.button("Summarize"): 35 | if input_method == "Upload a document": 36 | if uploaded_file is None: 37 | st.warning("Please upload a file.") 38 | return 39 | summarize_file(uploaded_file) 40 | 41 | else: 42 | if not youtube_url: 43 | st.warning("Please enter a YouTube URL.") 44 | return 45 | summarize_youtube(youtube_url) 46 | 47 | 48 | def summarize_file(uploaded_file: UploadedFile): 49 | document = PdfDocument(uploaded_file) 50 | validation_errors = Validator.validate_text(document.text_content) 51 | 52 | if validation_errors: 53 | st.warning(f"Invalid input: {','.join(validation_errors)}") 54 | return 55 | 56 | st.markdown(run_summary(document.text_content, f"document"), unsafe_allow_html=True) 57 | 58 | 59 | def summarize_youtube(youtube_url: str): 60 | video = YouTubeVideo(youtube_url) 61 | transcript = video.get_transcript() 62 | validation_errors = Validator.validate_text(transcript) 63 | 64 | if validation_errors: 65 | st.warning(validation_errors) 66 | return 67 | 68 | st.markdown(run_summary(transcript, "youtube video"), unsafe_allow_html=True) 69 | 70 | 71 | def run_summary(text: str, media_type: str): 72 | tokens = count_tokens(text) 73 | 74 | if tokens > 100_000: 75 | return ClusteredSummary(text, media_type).get_summary() 76 | else: 77 | return Summary(text, media_type).get_summary() 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # **GPT 3.5/4/4o-mini Powered Document Summarizer** 2 | 3 | This is a tool that takes a text document (PDF or TXT) or YouTube transcript and generates a concise summary using GPT-4O-Mini, GPT-4 or GPT-3.5-turbo. It can accurately summarize hundreds of pages of text. It's built with Python and Streamlit and leverages the langchain library for text processing. 4 | While the final output is generated with the latest GPT family model from OpenAI, GPT-4O-Mini (one of the LLMs that powers ChatGPT), only a small portion of the overall document is used in the prompts. Before any call is made to either LLM, the document is separated into 5 | small sections that contain the majority of the meaning of the document. 6 | 7 | Summarize your documents here (no API key required): https://gpt-document-summarizer.streamlit.app/ 8 | 9 | ## Features 10 | 11 | - Supports PDF and TXT file formats 12 | - Utilizes GPT-4 or GPT-3.5-turbo for generating summaries 13 | - Automatic clustering of the input document to identify key sections 14 | - Customizable number of clusters for the summarization process 15 | 16 | ## Usage 17 | 18 | 1. Launch the Streamlit app by running `streamlit run main.py` 19 | 2. Upload a document (TXT or PDF) to summarize. 20 | 3. Enter your OpenAI API key if the free usage cap has been hit. 21 | 4. Choose whether to use GPT-4 for the summarization (recommended, requires GPT-4 API access). 22 | 5. Click the "Summarize" button and wait for the result. 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# **GPT-4o-mini Powered Document Summarizer**

This is a tool that takes a PDF document or a YouTube video and generates a concise summary using GPT-4o-mini (one of the LLMs that powers ChatGPT). It can accurately summarize hundreds of pages of text. It's built with Python and Streamlit.
For very large documents, only a small portion of the overall text is used in the final prompt: the document is split into chunks, embedded, and clustered, and a handful of representative sections that carry the majority of the document's meaning are summarized.

Summarize your documents here (no API key required): https://gpt-document-summarizer.streamlit.app/

## Features

- Supports PDF uploads and YouTube URLs
- Uses GPT-4o-mini for summaries and text-embedding-3-small for embeddings
- Automatic clustering of large documents (over 100,000 tokens) to identify key sections
- Rejects inputs larger than 1,000,000 tokens

## Usage

1. Set the `OPENAI_API_KEY` environment variable to your OpenAI API key.
2. Launch the Streamlit app from the `app/` directory by running `streamlit run main.py`.
3. Upload a PDF document or enter a YouTube URL.
4. Click the "Summarize" button and wait for the result.

## Modules

- `main.py`: Streamlit app main file
- `utils.py`: Token counting with tiktoken
- `validator.py`: Input validation
- `pdf_document.py`: PDF text extraction and sanitization
- `youtube_video.py`: YouTube URL parsing and transcript fetching
- `magic.py`: Thin wrapper around the OpenAI chat and embedding APIs
- `summary.py`: Single-pass summarization for smaller documents
- `clustered_summary.py`: Chunked, clustered summarization for large documents

## Main Functions

- `main()`: Entry point for the Streamlit app
- `summarize_file()`: Validates an uploaded PDF and displays the generated summary
- `summarize_youtube()`: Fetches and validates a video transcript and displays the generated summary
- `run_summary()`: Routes documents over 100,000 tokens to `ClusteredSummary`, everything else to `Summary`

## Utility Functions

- `count_tokens()`: Counts the number of tokens in a text string
- `Validator.validate_text()`: Validates user input and returns warnings for invalid inputs
- `PdfDocument.text_content`: Converts an uploaded PDF to a sanitized text string
- `YouTubeVideo.get_transcript()`: Fetches a transcript and flattens it to plain text
- `ClusteredSummary.cluster_chunks()`: Picks the chunk closest to each cluster center
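
The clustering step works roughly as sketched below (an editor's illustration, not code from this repo): chunk embeddings are clustered with KMeans and the chunk nearest each center is kept as that cluster's representative. Random vectors stand in for real embeddings so the sketch runs without an API key.

```python
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(20, 8))  # 20 chunks, 8-dim stand-in embeddings
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10).fit(embeddings)

# For each cluster, keep the index of the chunk closest to its center.
representatives = [
    int(np.argmin(np.linalg.norm(embeddings - center, axis=1)))
    for center in kmeans.cluster_centers_
]
print(representatives)
```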
--------------------------------------------------------------------------------
/app/clustered_summary.py:
--------------------------------------------------------------------------------
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List

import numpy as np
from sklearn.cluster import KMeans

from magic import Magician
from utils import count_tokens


class ClusteredSummary(Magician):

    MIN_CHUNK_SIZE = 50_000
    NUM_CLUSTERS = 8
    MAX_THREADS = 4

    SUMMARIZE_CHUNK_SYSTEM_MESSAGE = """
You will be given a single section from a %s. It will be enclosed in triple single quotes.
Please provide a comprehensive and cohesive summary of the excerpt, focusing on the key points and main ideas, while maintaining clarity and conciseness."""

    SUMMARIZE_ALL_CHUNKS_SYSTEM_MESSAGE = """
You will be given a list of summaries from a %s. They will be enclosed in triple single quotes.
Please provide a cohesive summary of the %s, focusing on the key points and main ideas, while maintaining clarity and conciseness.

Format your summary in HTML. It should be structured as follows:

- A short, bullet form list of key takeaways.
- A well-formatted easy-to-read synopsis, structured like an essay that summarizes the document cohesively.
- A conclusion that ties all the ideas together.

Format for maximum readability and clarity.
"""

    def __init__(self, text: str, media_type: str):
        super().__init__()
        self.text = text
        self.media_type = media_type

    def get_summary(self) -> str:
        text_chunks = self.chunk_text()
        clustered_chunks = self.cluster_chunks(text_chunks)
        summaries: List[str] = [""] * len(clustered_chunks)

        # Summarize the representative chunks in parallel, preserving their order.
        with ThreadPoolExecutor(max_workers=self.MAX_THREADS) as executor:
            future_to_index = {
                executor.submit(self.get_summary_for_chunk, chunk): i
                for i, chunk in enumerate(clustered_chunks)
            }
            for future in as_completed(future_to_index):
                index = future_to_index[future]
                summaries[index] = future.result()

        system_message = self.SUMMARIZE_ALL_CHUNKS_SYSTEM_MESSAGE % (
            self.media_type,
            self.media_type,
        )
        user_message = "\n".join([f"'''{summary}'''" for summary in summaries])
        full_summary = self.wave_wand(system_message, user_message)

        return self.extract_code(full_summary)

    def get_summary_for_chunk(self, chunk: str) -> str:
        system_message = self.SUMMARIZE_CHUNK_SYSTEM_MESSAGE % self.media_type
        user_message = f"'''{chunk}'''"
        return self.wave_wand(system_message, user_message)

    def chunk_text(self) -> List[str]:
        # Split the text into roughly equal chunks of at most MIN_CHUNK_SIZE tokens,
        # converting the token budget to characters via the average token length.
        total_tokens = count_tokens(self.text)
        min_chunk_size = min(self.MIN_CHUNK_SIZE, total_tokens // 2)
        num_chunks = max(1, total_tokens // min_chunk_size)
        chunk_size = total_tokens // num_chunks
        avg_token_length = len(self.text) // total_tokens
        chunk_size_in_chars = chunk_size * avg_token_length
        text_chunks = self._split_text_by_characters(self.text, chunk_size_in_chars)

        return text_chunks

    def cluster_chunks(self, chunks: List[str]) -> List[str]:
        if len(chunks) < self.NUM_CLUSTERS:
            return chunks

        embeddings = [self.create_magic_numbers(chunk)[chunk] for chunk in chunks]
        embeddings_matrix = np.array(embeddings)

        kmeans = KMeans(n_clusters=self.NUM_CLUSTERS, random_state=42).fit(
            embeddings_matrix
        )

        # Keep the chunk nearest each cluster center as that cluster's representative.
        closest_chunks = []
        for cluster_idx in range(self.NUM_CLUSTERS):
            cluster_center = kmeans.cluster_centers_[cluster_idx]
            distances = np.linalg.norm(embeddings_matrix - cluster_center, axis=1)
            closest_chunk_idx = np.argmin(distances)
            closest_chunks.append(chunks[closest_chunk_idx])

        return closest_chunks

    def _split_text_by_characters(
        self, text: str, chunk_size_in_chars: int
    ) -> List[str]:
        chunks = []
        start = 0

        while start < len(text):
            end = start + chunk_size_in_chars

            # Prefer to break at the last space inside the window...
            if end < len(text):
                end = text.rfind(" ", start, end) + 1

            # ...but fall back to a hard cut if no space was found.
            if end <= start:
                end = start + chunk_size_in_chars

            chunks.append(text[start:end].strip())
            start = end

        return chunks
--------------------------------------------------------------------------------
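
Editor's sketch (not part of the repository): exercising the offline chunking path above. Constructing the class builds an OpenAI client, so OPENAI_API_KEY must be set even though no request is made here.

from clustered_summary import ClusteredSummary

cs = ClusteredSummary("word " * 20_000, "document")
print(len(cs.chunk_text()))  # a handful of roughly equal chunks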