├── DocGPT
│   ├── app.py
│   └── requirements.txt
└── README.md

/DocGPT/app.py:
--------------------------------------------------------------------------------
from dotenv import load_dotenv
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback
from scipy.cluster import hierarchy  # required by the dendrogram branch below
from docx import Document
from pptx import Presentation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def extract_column_name(prompt):
    # Return the word that follows "of", "on", or "from" in the prompt,
    # stripped of trailing punctuation (so "mean of age?" yields "age").
    keywords = ["of", "on", "from"]
    words = prompt.split()
    for i, word in enumerate(words):
        if word in keywords and i < len(words) - 1:
            return words[i + 1].strip("'\"?.,!")
    return None

def extract_person_name(query):
    # Return the word that follows "name" in the query (currently unused).
    words = query.split()
    for i, word in enumerate(words):
        if word.lower() == "name" and i < len(words) - 1:
            return words[i + 1]
    return None
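# Illustrative parser behavior (hedged examples with made-up prompts, shown
# here as comments rather than executable code):
#
#   extract_column_name("show the mean of age")  -> "age"
#   extract_column_name("histogram of salary")   -> "salary"
#   extract_column_name("describe the data")     -> None   (no keyword found)
#   extract_person_name("what is the name John") -> "John"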
def main():
    load_dotenv()
    st.set_page_config(
        page_title="DocGPT",
        page_icon="📚",
        initial_sidebar_state="expanded",
    )

    # The original custom CSS was lost in extraction; hiding Streamlit's
    # default menu and footer is a plausible placeholder reconstruction.
    st.markdown(
        """
        <style>
        #MainMenu {visibility: hidden;}
        footer {visibility: hidden;}
        </style>
        """,
        unsafe_allow_html=True,
    )
    st.header("DocGPT 📚")

    uploaded_file = st.file_uploader("Upload your document", type=["pdf", "docx", "txt", "pptx", "csv", "xlsx"], key="file_uploader")

    if uploaded_file is not None:
        if uploaded_file.type == 'application/pdf':
            pdf_reader = PdfReader(uploaded_file)
            text = ""
            for page in pdf_reader.pages:
                # extract_text() can return None for image-only pages
                text += page.extract_text() or ""
        elif uploaded_file.type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
            doc = Document(uploaded_file)
            text = "\n".join([para.text for para in doc.paragraphs])
        elif uploaded_file.type == 'text/plain':
            text = uploaded_file.read().decode('utf-8')
        elif uploaded_file.type == 'application/vnd.openxmlformats-officedocument.presentationml.presentation':
            ppt = Presentation(uploaded_file)
            text = ""
            for slide in ppt.slides:
                for shape in slide.shapes:
                    if hasattr(shape, 'text'):
                        text += shape.text + "\n"
        elif uploaded_file.type == 'text/csv':
            df = pd.read_csv(uploaded_file)
            st.write("Uploaded CSV Data:")
            st.write(df)

            prompt = st.text_input("Enter a data science analysis prompt:")

            if prompt:
                st.write("Analysis Result:")
                prompt_lower = prompt.lower()
                column = extract_column_name(prompt)
                if column is not None:
                    if "mean" in prompt_lower or "average" in prompt_lower:
                        st.write(f"Mean of '{column}': {df[column].mean()}")

                    elif "median" in prompt_lower:
                        st.write(f"Median of '{column}': {df[column].median()}")

                    elif "mode" in prompt_lower:
                        mode_values = df[column].mode()
                        if not mode_values.empty:
                            st.write(f"Mode of '{column}': {', '.join(map(str, mode_values))}")
                        else:
                            st.write(f"No mode found in '{column}'.")

                    elif "histogram" in prompt_lower:
                        plt.hist(df[column], bins=20)
                        st.pyplot(plt)
                        plt.clf()  # clear the shared figure so plots don't stack across reruns

                    elif "scatterplot" in prompt_lower:
                        x_column = st.selectbox("Select the X-axis column:", df.columns)
                        plt.scatter(df[x_column], df[column])
                        st.pyplot(plt)
                        plt.clf()

                    elif "count" in prompt_lower:
                        st.write(f"Count of '{column}': {df[column].count()}")

                    elif "sum" in prompt_lower:
                        st.write(f"Sum of '{column}': {df[column].sum()}")

                    elif "null" in prompt_lower:
                        st.write(f"Null value count in '{column}': {df[column].isnull().sum()}")

                    elif "min" in prompt_lower:
                        st.write(f"Min value in '{column}': {df[column].min()}")

                    elif "max" in prompt_lower:
                        st.write(f"Max value in '{column}': {df[column].max()}")

                    elif "line plot" in prompt_lower:
                        x_column = st.selectbox("Select the X-axis column:", df.columns)
                        y_column = st.selectbox("Select the Y-axis column:", df.columns)
                        plt.plot(df[x_column], df[y_column])
                        st.pyplot(plt)
                        plt.clf()

                    elif "scatter chart" in prompt_lower:
                        x_column = st.selectbox("Select the X-axis column:", df.columns)
                        y_column = st.selectbox("Select the Y-axis column:", df.columns)
                        plt.scatter(df[x_column], df[y_column])
                        st.pyplot(plt)
                        plt.clf()

                    elif "correlation chart" in prompt_lower:
                        # numeric_only avoids errors on non-numeric columns (pandas >= 1.5)
                        corr_matrix = df.corr(numeric_only=True)
                        sns.heatmap(corr_matrix, annot=True)
                        st.pyplot(plt)
                        plt.clf()

                    elif "heatmap" in prompt_lower:
                        sns.heatmap(df.corr(numeric_only=True), annot=True, cmap="coolwarm")
                        st.pyplot(plt)
                        plt.clf()

                    elif "bubble chart" in prompt_lower:
                        x_column = st.selectbox("Select the X-axis column:", df.columns)
                        y_column = st.selectbox("Select the Y-axis column:", df.columns)
                        size_column = st.selectbox("Select the size column:", df.columns)
                        plt.scatter(df[x_column], df[y_column], s=df[size_column])
                        st.pyplot(plt)
                        plt.clf()

                    elif "radar chart" in prompt_lower:
                        # Radar charts are not implemented yet; see the sketch below.
                        pass
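                        # A hedged sketch of one possible radar-chart implementation,
                        # kept as comments because the original leaves this branch
                        # unimplemented. It assumes the numeric columns form the axes
                        # and their means the plotted values:
                        #
                        #   numeric_df = df.select_dtypes(include="number")
                        #   angles = np.linspace(0, 2 * np.pi, len(numeric_df.columns), endpoint=False)
                        #   values = numeric_df.mean().values
                        #   ax = plt.figure().add_subplot(polar=True)
                        #   # close the polygon by repeating the first point
                        #   ax.plot(np.append(angles, angles[0]), np.append(values, values[0]))
                        #   ax.set_xticks(angles)
                        #   ax.set_xticklabels(numeric_df.columns)
                        #   st.pyplot(plt)
                        #   plt.clf()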
                    elif "ridge plot" in prompt_lower:
                        # seaborn has no ridgeplot() function; overlapping KDE plots
                        # of the numeric columns give a similar effect.
                        for col in df.select_dtypes(include="number").columns:
                            sns.kdeplot(df[col], fill=True, label=col)
                        plt.legend()
                        st.pyplot(plt)
                        plt.clf()

                    elif "dendrogram" in prompt_lower:
                        corr_matrix = df.corr(numeric_only=True)
                        linkage_matrix = hierarchy.linkage(corr_matrix, method='ward')
                        hierarchy.dendrogram(linkage_matrix, labels=corr_matrix.index.tolist())
                        st.pyplot(plt)
                        plt.clf()

                    else:
                        st.write("Unsupported analysis prompt.")

                else:
                    st.write("Column not specified in the prompt.")

            return
        elif uploaded_file.type == 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
            df = pd.read_excel(uploaded_file, engine="openpyxl")
            st.write("Uploaded Excel Data:")
            st.write(df)

            user_prompt = st.text_input("Enter a prompt for Excel analysis:")

            if user_prompt:
                st.write("Excel Analysis Result:")
                user_prompt_lower = user_prompt.lower()
                column = extract_column_name(user_prompt)
                if column is not None:
                    if "mean" in user_prompt_lower or "average" in user_prompt_lower:
                        st.write(f"Mean of '{column}': {df[column].mean()}")

                    # Add more analysis options here based on user prompts

                    else:
                        st.write("Unsupported analysis prompt.")

                else:
                    st.write("Column not specified in the prompt.")

            return
        else:
            st.warning("Unsupported file type. Please upload a PDF, DOCX, TXT, PPTX, XLSX, or CSV file.")
            return

        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=2000,
            chunk_overlap=400,
            length_function=len
        )
        chunks = text_splitter.split_text(text)

        embeddings = OpenAIEmbeddings()
        knowledge_base = FAISS.from_texts(chunks, embedding=embeddings)

        user_question = st.text_input("Ask a question about your document:")
        if user_question:
            docs = knowledge_base.similarity_search(user_question)

            llm = OpenAI()
            chain = load_qa_chain(llm, chain_type="stuff")
            with get_openai_callback() as cb:  # cb tracks token usage for the call
                response = chain.run(input_documents=docs, question=user_question)

            st.write(response)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/DocGPT/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NirmalNagaraj/DocGPT/c78bed873f405df8977d7c762fcd42c0d9ab41e1/DocGPT/requirements.txt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# DocGPT - Your Intelligent Document Analysis and Data Science Chatbot

Welcome to the official GitHub repository for DocGPT, your go-to tool for advanced document analysis and data science tasks. DocGPT is powered by OpenAI's language models and can analyze documents in several formats: PDF, DOCX, TXT, PPTX, CSV, and XLSX.

## What Can DocGPT Do?

DocGPT is a versatile chatbot with the following key features:
- **Document Analysis:** Extract valuable insights and information from PDF, DOCX, TXT, and PPTX documents with ease.
- **Data Science:** Perform data analysis and execute data science prompts against CSV and XLSX files, making it useful for researchers and data professionals.
- **OpenAI Integration:** DocGPT uses OpenAI's language model for question answering and requires an OpenAI API key for full functionality.

## Getting Your OpenAI API Key

To use DocGPT with full capabilities, you'll need an OpenAI API key. Here's how to obtain one:
1. Visit [OpenAI's website](https://openai.com) and create an account.
2. Navigate to the API section and follow the instructions to generate your API key.
3. Once you have your key, you can proceed to configure DocGPT.

## Installation and Execution

Follow these steps to clone, set up, and run DocGPT:

1. Clone this repository to your local machine using Git:
   ```
   git clone https://github.com/NirmalNagaraj/DocGPT.git
   ```

2. Navigate to the project directory:
   ```
   cd DocGPT
   ```

3. Install the required Python packages using pip:
   ```
   pip install -r requirements.txt
   ```

4. Create a `.env` file in the project root and add your OpenAI API key:
   ```
   OPENAI_API_KEY="YOUR_OPENAI_API_KEY"
   ```

5. Run the application using Streamlit:
   ```
   streamlit run app.py
   ```

6. Open your web browser and access the DocGPT interface at `http://localhost:8501`.

That's it! You're now ready to use DocGPT for document analysis and data science tasks.
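## Example Prompts

For CSV and XLSX uploads, the built-in parser treats the word immediately after "of", "on", or "from" as the column name. The prompts below are illustrative sketches — the column names are hypothetical, so substitute your own:

```
mean of Age
median of Salary
histogram of Salary
scatterplot of Revenue
count of Orders
```

For PDF, DOCX, TXT, and PPTX uploads, simply type a natural-language question about the document's contents.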

--------------------------------------------------------------------------------