├── README.md
├── app.py
├── app1.py
├── app2.py
├── app3.py
└── requirement.txt
/README.md:
--------------------------------------------------------------------------------
1 | # Document-Summarization
2 | A document summarization app built with a large language model (LLM) and the LangChain framework. It uses a pre-trained T5 model and its tokenizer from the Hugging Face Transformers library.
3 | A summarization pipeline built on that model generates the summary.
4 |
5 | 1. Import Statements:
6 | - It begins by importing necessary libraries like Streamlit, Langchain, Transformers, and other Python libraries.
7 |
8 | 2. Model and Tokenizer Loading:
9 | - The code loads a pre-trained T5 model (a Transformer-based model) and its associated tokenizer from the Hugging Face Transformers library.
10 | This model is used for text summarization.
11 |
12 | 3. File Loader and Preprocessing:
13 | - The `file_preprocessing` function loads a PDF file using the Langchain library and splits it into smaller text chunks. These text chunks are later used for
14 | summarization.
15 |
16 | 4. LLM Pipeline:
17 | - The `llm_pipeline` function sets up a summarization pipeline using the pre-trained T5 model and tokenizer. It takes the preprocessed text as input and generates
18 | a summary using the model.
19 |
20 | 5. Streamlit Setup:
21 | - The Streamlit app is set up with a title and an option to upload a PDF file.
22 |
23 | 6. Main Function:
24 | - The `main` function is the entry point of the app.
25 | - It provides a file upload button and a "Summarize" button.
26 | - When a PDF file is uploaded and the "Summarize" button is clicked, it displays the uploaded PDF on the left side and the generated summary on the right side
27 | of the Streamlit app.
28 |
29 | 7. HTML Display of PDF:
30 | - The `displayPDF` function converts the uploaded PDF file into base64 format and embeds it in an HTML iframe, allowing the PDF to be displayed in the app.
31 |
32 | 8. Streamlit Configuration:
33 | - The app's layout is configured to be "wide" using `st.set_page_config`.
34 |
35 | 9. Running the App:
36 | - The app is launched when the script is run as the main module (`if __name__ == "__main__": main()`).
37 |
38 | The main functionality of this app is to upload a PDF document, process it, and then display both the PDF and a summarized version of the document.
39 | It utilizes a pre-trained language model for text summarization and Streamlit for creating a user-friendly interface. Users can upload PDFs and quickly obtain
40 | summarized content from them.
41 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from langchain.text_splitter import RecursiveCharacterTextSplitter
3 | from langchain.document_loaders import PyPDFLoader, DirectoryLoader
4 | from langchain.chains.summarize import load_summarize_chain
5 | from transformers import T5Tokenizer, T5ForConditionalGeneration
6 | from transformers import pipeline
7 | import torch
8 | import base64
9 |
# Model and tokenizer loading
checkpoint = "MBZUAI/LaMini-Flan-T5-248M"


@st.cache_resource(show_spinner=False)
def _load_model(name):
    """Load the tokenizer and seq2seq model for checkpoint *name*.

    Streamlit re-executes this script on every user interaction, so the
    loaded objects are cached to avoid re-reading the weights on each rerun.
    show_spinner=False keeps this call from emitting a UI element ahead of
    st.set_page_config(), which has to be the first Streamlit command.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = T5Tokenizer.from_pretrained(name)
    loaded_model = T5ForConditionalGeneration.from_pretrained(
        name, device_map='auto', torch_dtype=torch.float32
    )
    return loaded_tokenizer, loaded_model


tokenizer, base_model = _load_model(checkpoint)
14 |
# File loader and preprocessing
def file_preprocessing(file, chunk_size=200, chunk_overlap=0):
    """Load a PDF and return its text as a single string.

    The pages are split into chunks and re-joined with a single space.
    The previous version concatenated chunks created with a 50-character
    overlap and no separator, which repeated text at every chunk boundary
    and glued the last word of one chunk to the first word of the next.

    Args:
        file: Path of the PDF file to read.
        chunk_size: Maximum chunk length passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks. Defaults to 0
            because the chunks are joined back into one string.

    Returns:
        The chunk texts joined by single spaces ("" when there are none).
    """
    loader = PyPDFLoader(file)
    # load() instead of load_and_split(): the latter pre-splits with the
    # splitter's default settings (which include an overlap) before our own
    # split — TODO confirm against the installed langchain version.
    pages = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    texts = text_splitter.split_documents(pages)
    return " ".join(text.page_content for text in texts)
26 |
# LLM pipeline
def llm_pipeline(filepath):
    """Summarize the PDF stored at *filepath* and return the summary text.

    Builds a 'summarization' pipeline around the module-level model and
    tokenizer, feeds it the text produced by file_preprocessing(), and
    returns the 'summary_text' field of the first result.
    """
    generation_limits = {"max_length": 500, "min_length": 50}
    summarizer = pipeline(
        'summarization',
        model=base_model,
        tokenizer=tokenizer,
        **generation_limits,
    )
    document_text = file_preprocessing(filepath)
    first_result = summarizer(document_text)[0]
    return first_result['summary_text']
39 |
# Function to display the PDF of a given file.
# Deliberately not wrapped in @st.cache_data: the cache key would be the path
# string only, so re-uploading a different file under the same name would
# keep showing the previously cached document.
def displayPDF(file):
    """Render the PDF at path *file* inside the current Streamlit container.

    The file is read as bytes, base64-encoded and embedded in an HTML
    <iframe> through a data: URI. Returns None.
    """
    # Opening file from file path
    with open(file, "rb") as f:
        base64_pdf = base64.b64encode(f.read()).decode('utf-8')

    # Embedding PDF in HTML (the template was previously an empty string, so
    # the encoded document was never shown)
    pdf_display = (
        f'<iframe src="data:application/pdf;base64,{base64_pdf}" '
        'width="100%" height="600" type="application/pdf"></iframe>'
    )

    # Displaying File
    st.markdown(pdf_display, unsafe_allow_html=True)
52 |
# Streamlit code
st.set_page_config(layout="wide")


def main():
    """Streamlit entry point: upload a PDF, show it, and show its summary.

    After a file is uploaded and "Summarize" is clicked, the upload is saved
    under data/, displayed in the left column and summarized in the right.
    """
    import os  # local import: only this function needs it

    st.title("Document Summarization App")

    uploaded_file = st.file_uploader("Upload your PDF file", type=['pdf'])
    if uploaded_file is None:
        return
    if not st.button("Summarize"):
        return

    col1, col2 = st.columns(2)

    # The target directory may not exist on a fresh checkout.
    os.makedirs("data", exist_ok=True)
    # The name is supplied by the client; keep only its final component so
    # the file cannot be written outside data/.
    filepath = "data/" + os.path.basename(uploaded_file.name)
    with open(filepath, "wb") as temp_file:
        # getvalue() returns the whole upload regardless of stream position.
        temp_file.write(uploaded_file.getvalue())

    with col1:
        st.info("Uploaded File")
        displayPDF(filepath)

    with col2:
        summary = llm_pipeline(filepath)
        st.info("Summarization Complete")
        st.success(summary)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/app1.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
3 | import torch
4 | import base64
5 | from langchain.text_splitter import RecursiveCharacterTextSplitter
6 | from langchain.document_loaders import PyPDFLoader
7 |
# Model and tokenizer loading
# Alternatives the author noted as giving good results:
#   "google/pegasus-large", "t5-large"
model_name = "facebook/bart-large"


@st.cache_resource(show_spinner=False)
def _load_model(name):
    """Load the tokenizer and seq2seq model for checkpoint *name*.

    Streamlit re-executes this script on every user interaction, so the
    loaded objects are cached to avoid re-reading the weights on each rerun.
    show_spinner=False keeps this call from emitting a UI element ahead of
    st.set_page_config(), which has to be the first Streamlit command.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    return (
        AutoTokenizer.from_pretrained(name),
        AutoModelForSeq2SeqLM.from_pretrained(name),
    )


tokenizer, base_model = _load_model(model_name)
14 |
# File loader and preprocessing
def file_preprocessing(file, chunk_size=200, chunk_overlap=0):
    """Load a PDF and return its text as a single string.

    The pages are split into chunks and re-joined with a single space.
    The previous version concatenated chunks created with a 50-character
    overlap and no separator, which repeated text at every chunk boundary
    and glued the last word of one chunk to the first word of the next.

    Args:
        file: Path of the PDF file to read.
        chunk_size: Maximum chunk length passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks. Defaults to 0
            because the chunks are joined back into one string.

    Returns:
        The chunk texts joined by single spaces ("" when there are none).
    """
    loader = PyPDFLoader(file)
    # load() instead of load_and_split(): the latter pre-splits with the
    # splitter's default settings (which include an overlap) before our own
    # split — TODO confirm against the installed langchain version.
    pages = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    texts = text_splitter.split_documents(pages)
    return " ".join(text.page_content for text in texts)
26 |
# LLM pipeline
def llm_pipeline(filepath):
    """Summarize the PDF stored at *filepath* and return the summary text.

    Builds a 'summarization' pipeline around the module-level model and
    tokenizer, runs it on the text from file_preprocessing(), and returns
    the 'summary_text' field of the first result.
    """
    pipe_sum = pipeline(
        'summarization',
        model=base_model,
        tokenizer=tokenizer,
        min_length=50,
    )
    input_text = file_preprocessing(filepath)
    # truncation=True clips the input to the tokenizer's maximum length.
    # Without it, a document longer than the model's position limit makes the
    # forward pass fail instead of producing a summary.
    result = pipe_sum(input_text, truncation=True)
    return result[0]['summary_text']
39 |
# Function to display the PDF of a given file
def displayPDF(file):
    """Render the PDF at path *file* inside the current Streamlit container.

    The file is read as bytes, base64-encoded and embedded in an HTML
    <iframe> through a data: URI. Returns None.
    """
    # Opening file from file path
    with open(file, "rb") as f:
        base64_pdf = base64.b64encode(f.read()).decode('utf-8')
    # Embedding PDF in HTML (the template was previously an empty string, so
    # the encoded document was never shown)
    pdf_display = (
        f'<iframe src="data:application/pdf;base64,{base64_pdf}" '
        'width="100%" height="600" type="application/pdf"></iframe>'
    )
    # Displaying File
    st.markdown(pdf_display, unsafe_allow_html=True)
49 |
# Streamlit code
st.set_page_config(layout="wide")


def main():
    """Streamlit entry point: upload a PDF, show it, and show its summary.

    After a file is uploaded and "Summarize" is clicked, the upload is saved
    under data/, displayed in the left column and summarized in the right.
    """
    import os  # local import: only this function needs it

    st.title("Document Summarization App")
    uploaded_file = st.file_uploader("Upload your PDF file", type=['pdf'])
    if uploaded_file is None:
        return
    if not st.button("Summarize"):
        return

    col1, col2 = st.columns(2)

    # The target directory may not exist on a fresh checkout.
    os.makedirs("data", exist_ok=True)
    # The name is supplied by the client; keep only its final component so
    # the file cannot be written outside data/.
    filepath = "data/" + os.path.basename(uploaded_file.name)
    with open(filepath, "wb") as temp_file:
        # getvalue() returns the whole upload regardless of stream position.
        temp_file.write(uploaded_file.getvalue())

    with col1:
        st.info("Uploaded File")
        displayPDF(filepath)
    with col2:
        summary = llm_pipeline(filepath)
        st.info("Summarization Complete")
        st.success(summary)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/app2.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import faiss
3 | import torch
4 | import numpy as np
5 | import base64
6 | from langchain.text_splitter import RecursiveCharacterTextSplitter
7 | from langchain.document_loaders import PyPDFLoader
8 | from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
9 |
# Initialize Faiss index and storage
dimension = 768  # Change this dimension to match your language model's output dimension
index = faiss.IndexFlatL2(dimension)  # You can choose a different index type if needed
doc_vectors = []  # List to store document vectors

# Load tokenizer and model
checkpoint = "MBZUAI/LaMini-Flan-T5-248M"


@st.cache_resource(show_spinner=False)
def _load_model(name):
    """Load the tokenizer and seq2seq model for checkpoint *name*.

    Streamlit re-executes this script on every user interaction, so the
    loaded objects are cached to avoid re-reading the weights on each rerun.
    show_spinner=False keeps this call from emitting a UI element ahead of
    st.set_page_config(), which has to be the first Streamlit command.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = T5Tokenizer.from_pretrained(name)
    loaded_model = T5ForConditionalGeneration.from_pretrained(
        name, device_map='auto', torch_dtype=torch.float32
    )
    return loaded_tokenizer, loaded_model


tokenizer, base_model = _load_model(checkpoint)
19 |
# Vectorize text using the language model's encoder
def vectorize_text(text):
    """Embed *text* as a mean-pooled encoder vector.

    Only the encoder is run. Calling the full seq2seq model without decoder
    inputs raises, and its output has no ``last_hidden_state`` attribute.
    Padding positions are excluded from the mean through the attention mask.

    Args:
        text: The string to embed (truncated to 512 tokens).

    Returns:
        A float32 NumPy array with one row per input text.
    """
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, padding=True, max_length=512
    )
    # The model may have been placed on an accelerator by device_map='auto'.
    inputs = {name: tensor.to(base_model.device) for name, tensor in inputs.items()}
    with torch.no_grad():
        hidden = base_model.get_encoder()(**inputs).last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    # clamp() guards against division by zero for an all-padding row.
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.cpu().numpy().astype('float32')
26 |
# Load a PDF, split it into chunks, and store one vector per chunk
def file_preprocessing_and_vectorization(file):
    """Split the PDF at *file* into chunks and index a vector for each one.

    Every chunk is embedded with vectorize_text(); the vector is appended to
    the module-level ``doc_vectors`` list and added to the module-level
    Faiss ``index``.

    Returns:
        The list of chunk documents produced by the text splitter.
    """
    loader = PyPDFLoader(file)
    pages = loader.load_and_split()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
    texts = text_splitter.split_documents(pages)

    for text in texts:
        # Faiss expects a 2-D (n_vectors, dimension) float32 array. Wrapping
        # the vector in another list, as before, produced a 3-D array that
        # index.add() rejects.
        vector = np.ascontiguousarray(
            vectorize_text(text.page_content), dtype='float32'
        ).reshape(1, -1)
        doc_vectors.append(vector)
        index.add(vector)  # Add vector to the index

    return texts
40 |
# Streamlit code
st.set_page_config(layout="wide")


# main() calls the two helpers below; they were previously not defined
# anywhere in this module, so clicking "Summarize" raised NameError.
def displayPDF(file):
    """Render the PDF at path *file* as a base64 data: URI inside an <iframe>."""
    with open(file, "rb") as f:
        base64_pdf = base64.b64encode(f.read()).decode('utf-8')
    pdf_display = (
        f'<iframe src="data:application/pdf;base64,{base64_pdf}" '
        'width="100%" height="600" type="application/pdf"></iframe>'
    )
    st.markdown(pdf_display, unsafe_allow_html=True)


def llm_pipeline(input_text):
    """Summarize *input_text* with the module-level model and return the text."""
    pipe_sum = pipeline(
        'summarization',
        model=base_model,
        tokenizer=tokenizer,
        max_length=500,
        min_length=50,
    )
    return pipe_sum(input_text)[0]['summary_text']


def main():
    """Streamlit entry point: upload a PDF, show it, index it, summarize it.

    After a file is uploaded and "Summarize" is clicked, the upload is saved
    under data/, displayed in the left column, and its chunks are vectorized
    and summarized in the right column.
    """
    import os  # local import: only this function needs it

    st.title("Document Summarization App")
    uploaded_file = st.file_uploader("Upload your PDF file", type=['pdf'])
    if uploaded_file is None:
        return
    if not st.button("Summarize"):
        return

    col1, col2 = st.columns(2)

    # The target directory may not exist on a fresh checkout.
    os.makedirs("data", exist_ok=True)
    # The name is supplied by the client; keep only its final component so
    # the file cannot be written outside data/.
    filepath = "data/" + os.path.basename(uploaded_file.name)
    with open(filepath, "wb") as temp_file:
        # getvalue() returns the whole upload regardless of stream position.
        temp_file.write(uploaded_file.getvalue())

    with col1:
        st.info("Uploaded File")
        displayPDF(filepath)
    with col2:
        texts = file_preprocessing_and_vectorization(filepath)
        # Summarize the whole document rather than only the first chunk.
        # NOTE(review): the chunks are produced with a 50-character overlap,
        # so short spans repeat at chunk boundaries in this joined text.
        input_text = " ".join(chunk.page_content for chunk in texts)
        if not input_text.strip():
            # Covers PDFs with no extractable text; texts[0] used to raise
            # IndexError here.
            st.warning("No extractable text was found in the uploaded PDF.")
            return
        summary = llm_pipeline(input_text)
        st.info("Summarization Complete")
        st.success(summary)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/app3.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import faiss
3 | import numpy as np
4 | import base64
5 | import torch
6 | from transformers import T5Tokenizer, T5ForConditionalGeneration, pipeline
7 |
8 |
# Initialize Faiss index and storage
dimension = 768  # Change this dimension to match your language model's output dimension
num_clusters = 1000  # Adjust the number of clusters based on your requirements
num_sub_quantizers = 64  # Adjust the number of sub-quantizers for IndexIVFPQ; must divide `dimension`
bits_per_code = 8  # Bits used to encode each sub-quantizer index
quantizer = faiss.IndexFlatL2(dimension)  # Coarse quantizer; kept referenced for the index's lifetime
# IndexIVFPQ takes (quantizer, d, nlist, M, nbits). The previous call omitted
# the vector dimension and the bit count, so the remaining arguments were
# shifted into the wrong positions.
index = faiss.IndexIVFPQ(quantizer, dimension, num_clusters, num_sub_quantizers, bits_per_code)
# NOTE(review): an IVF-PQ index has to be trained (index.train) on
# representative vectors before index.add() can be used; nothing in this
# module does that yet.
doc_ids = []  # List to store document IDs for retrieval
doc_vectors = []  # List to store document vectors

# Load tokenizer and model
checkpoint = "MBZUAI/LaMini-Flan-T5-248M"


@st.cache_resource(show_spinner=False)
def _load_model(name):
    """Load the tokenizer and seq2seq model for checkpoint *name*.

    Streamlit re-executes this script on every user interaction, so the
    loaded objects are cached to avoid re-reading the weights on each rerun.
    show_spinner=False keeps this call from emitting a UI element ahead of
    st.set_page_config(), which has to be the first Streamlit command.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = T5Tokenizer.from_pretrained(name)
    loaded_model = T5ForConditionalGeneration.from_pretrained(
        name, device_map='auto', torch_dtype=torch.float32
    )
    return loaded_tokenizer, loaded_model


tokenizer, base_model = _load_model(checkpoint)
21 |
# Function to vectorize text using the language model's encoder
def vectorize_text(text):
    """Embed *text* as a mean-pooled encoder vector.

    Only the encoder is run. Calling the full seq2seq model without decoder
    inputs raises, and its output has no ``last_hidden_state`` attribute.
    Padding positions are excluded from the mean through the attention mask.

    Args:
        text: The string to embed (truncated to 512 tokens).

    Returns:
        A float32 NumPy array with one row per input text.
    """
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, padding=True, max_length=512
    )
    # The model may have been placed on an accelerator by device_map='auto'.
    inputs = {name: tensor.to(base_model.device) for name, tensor in inputs.items()}
    with torch.no_grad():
        hidden = base_model.get_encoder()(**inputs).last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    # clamp() guards against division by zero for an all-padding row.
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.cpu().numpy().astype('float32')
28 |
# Function to display the PDF of a given file
def displayPDF(file):
    """Render the PDF at path *file* inside the current Streamlit container.

    The file is read as bytes, base64-encoded and embedded in an HTML
    <iframe> through a data: URI. Returns None.
    """
    with open(file, "rb") as f:
        base64_pdf = base64.b64encode(f.read()).decode('utf-8')
    # The template was previously an empty string, so the encoded document
    # was never shown.
    pdf_display = (
        f'<iframe src="data:application/pdf;base64,{base64_pdf}" '
        'width="100%" height="600" type="application/pdf"></iframe>'
    )
    st.markdown(pdf_display, unsafe_allow_html=True)
35 |
# Document summarization pipeline using the language model
def llm_pipeline(input_text):
    """Return the summary of *input_text*.

    A 'summarization' pipeline is built around the module-level model and
    tokenizer; the 'summary_text' field of its first result is returned.
    """
    generation_limits = {"max_length": 500, "min_length": 50}
    summarizer = pipeline(
        'summarization',
        model=base_model,
        tokenizer=tokenizer,
        **generation_limits,
    )
    first_result = summarizer(input_text)[0]
    return first_result['summary_text']
48 |
# Main Streamlit application
st.set_page_config(layout="wide")


def main():
    """Streamlit entry point: upload a PDF, show it, and show its summary.

    After a file is uploaded and "Summarize" is clicked, the upload is saved
    under data/, displayed in the left column, and the text extracted from
    it is summarized in the right column.
    """
    import os  # local import: only this function needs it

    # Same loader the sibling app modules use; imported here because this
    # module does not import it at the top.
    from langchain.document_loaders import PyPDFLoader

    st.title("Document Summarization App")
    uploaded_file = st.file_uploader("Upload your PDF file", type=['pdf'])

    if uploaded_file is None:
        return
    if not st.button("Summarize"):
        return

    col1, col2 = st.columns(2)

    # The target directory may not exist on a fresh checkout.
    os.makedirs("data", exist_ok=True)
    # The name is supplied by the client; keep only its final component so
    # the file cannot be written outside data/.
    filepath = "data/" + os.path.basename(uploaded_file.name)

    with open(filepath, "wb") as temp_file:
        # getvalue() returns the whole upload regardless of stream position.
        temp_file.write(uploaded_file.getvalue())
    with col1:
        st.info("Uploaded File")
        displayPDF(filepath)

    with col2:
        # Extract the document text. Decoding the raw PDF bytes as UTF-8, as
        # before, fed PDF object syntax and binary streams to the model
        # rather than the document's content.
        pages = PyPDFLoader(filepath).load()
        input_text = "\n".join(page.page_content for page in pages)
        if not input_text.strip():
            st.warning("No extractable text was found in the uploaded PDF.")
            return
        summary = llm_pipeline(input_text)
        st.info("Summarization Complete")
        st.success(summary)


if __name__ == "__main__":
    main()
76 |
--------------------------------------------------------------------------------
/requirement.txt:
--------------------------------------------------------------------------------
1 | langchain
2 | sentence_transformers
3 | torch
4 | sentencepiece
5 | transformers
6 | accelerate
7 | pypdf
8 | tiktoken
9 | streamlit
10 | chromadb
--------------------------------------------------------------------------------