├── .devcontainer └── devcontainer.json ├── README.md ├── app.py └── requirements.txt /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Python 3", 3 | // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile 4 | "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye", 5 | "customizations": { 6 | "codespaces": { 7 | "openFiles": [ 8 | "README.md", 9 | "app.py" 10 | ] 11 | }, 12 | "vscode": { 13 | "settings": {}, 14 | "extensions": [ 15 | "ms-python.python", 16 | "ms-python.vscode-pylance" 17 | ] 18 | } 19 | }, 20 | "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y Reading Table of Contents to locate financial section...") 51 | toc_pages = [] 52 | 53 | toc_pattern = re.compile(r'^(.*?)\s*([.\s]*)\s*(\d+)$') 54 | financial_section_keyword = re.compile(r'(?i)financial\s+(statements|information)') 55 | 56 | start_page = -1 57 | 58 | with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf: 59 | for i in range(min(20, len(pdf.pages))): 60 | page_text = pdf.pages[i].extract_text() 61 | if not page_text: continue 62 | 63 | for line in page_text.split('\n'): 64 | if financial_section_keyword.search(line): 65 | match = toc_pattern.search(line) 66 | if match: 67 | try: 68 | 69 | page_num = int(match.group(3)) - 1 70 | start_page = page_num 71 | st.write(f" ✅ Found 'Financial Statements' section in ToC, starting around page {page_num + 1}.") 72 | 73 | return range(start_page, min(start_page + 40, len(pdf.pages))) 74 | except ValueError: 75 | continue 76 | 77 | st.warning("Could not find 'Financial Statements' in the Table of Contents. Falling back to full document search.") 78 | return None 79 | 80 | def find_financial_statements(pdf_bytes, page_range=None): 81 | """ 82 | Step 2: The Detective. 83 | Searches within a given page range and uses a scoring system to find the best tables. 84 | If no page range is provided, it searches the whole document as a fallback. 85 | """ 86 | 87 | anchor_keywords = { 88 | "balance_sheet": ["total equity and liabilities", "total assets"], 89 | "profit_and_loss": ["revenue from operations", "profit for the period"], 90 | "cash_flow": ["net cash generated from operating activities", "cash flow from operating activities"], 91 | } 92 | statement_keywords = { 93 | "balance_sheet": re.compile(r'(?i)BALANCE\s+SHEET'), 94 | "profit_and_loss": re.compile(r'(?i)PROFIT\s+&\s+LOSS|PROFIT\s+AND\s+LOSS'), 95 | "cash_flow": re.compile(r'(?i)CASH\s+FLOW\s+STATEMENT') 96 | } 97 | extracted_data = {} 98 | 99 | with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf: 100 | search_range = page_range if page_range else range(len(pdf.pages)) 101 | 102 | for statement_name, keyword_regex in statement_keywords.items(): 103 | best_table_overall = None 104 | highest_score_overall = -1 105 | 106 | for page_num in search_range: 107 | page = pdf.pages[page_num] 108 | page_text = page.extract_text(x_tolerance=2) 109 | if not page_text or not keyword_regex.search(page_text): 110 | continue 111 | 112 | st.write(f" -> Evaluating tables for '{statement_name}' on page {page_num + 1}...") 113 | tables = page.extract_tables() 114 | if not tables: continue 115 | 116 | for table_data in tables: 117 | if not table_data or len(table_data) < 3: continue 118 | current_score = 0 119 | header = [str(h).replace('\n', ' ').strip() for h in table_data[0]] 120 | year_pattern = re.compile(r'\b(20\d{2}|FY\d{2})\b') 121 | years_found = len(year_pattern.findall(" ".join(header))) 122 | current_score += years_found * 5 123 | current_score += len(table_data) 124 | first_column_text = " ".join([str(row[0]).lower() for row in table_data[1:] if row and len(row)>0]) 125 | for keyword in anchor_keywords[statement_name]: 126 | if keyword in first_column_text: 127 | current_score += 10 128 | if current_score > highest_score_overall: 129 | highest_score_overall = current_score 130 | best_table_overall = table_data 131 | 132 | if best_table_overall: 133 | header = [str(h).replace('\n', ' ').strip() for h in best_table_overall[0]] 134 | df = pd.DataFrame(best_table_overall[1:], columns=header) 135 | df = df.dropna(how='all') 136 | extracted_data[statement_name] = df 137 | st.write(f" ✅ Selected best table for '{statement_name}' (Score: {highest_score_overall}).") 138 | 139 | if not extracted_data: 140 | st.warning("Could not find and validate any standard financial statements.") 141 | return None 142 | 143 | return extracted_data 144 | 145 | @st.cache_resource 146 | def get_agent_components(_pdf_bytes): 147 | st.info("Step 1: Extracting financial tables from PDF...") 148 | 149 | 150 | page_range = find_financial_section_pages(_pdf_bytes) 151 | financial_data_dfs = find_financial_statements(_pdf_bytes, page_range) 152 | 153 | if not financial_data_dfs: return None, None, None 154 | 155 | @st.cache_resource 156 | def get_agent(_pdf_bytes, _company_name): 157 | """ 158 | This is the new, fully functional get_agent function. 159 | It performs the entire pipeline: extract, ingest and agent setup. 160 | The underscores in the arguments (_pdf_bytes, _company_name) tell Streamlit's cachethis this function's output depends on the content of these arguments. 161 | """ 162 | 163 | # extracting 164 | st.info("Step 1: Extracting financial tables from PDF...") 165 | financial_data_dfs = find_financial_statements(_pdf_bytes) 166 | if not financial_data_dfs: 167 | return None, None 168 | 169 | # ingesting 170 | st.info("Step 2: Preparing agent's memory (Vector Store)...") 171 | documents = [] 172 | for statement_name, df in financial_data_dfs.items(): 173 | content = f"This is the {statement_name.replace('_', ' ').title()}.\n\n{df.to_markdown(index=False)}" 174 | doc = Document(page_content=content, metadata={"source": statement_name}) 175 | documents.append(doc) 176 | 177 | embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001") 178 | vector_store = FAISS.from_documents(documents, embedding=embeddings) 179 | retriever = vector_store.as_retriever() 180 | st.write("✅ Agent's memory is ready.") 181 | 182 | # agent and tool setup 183 | st.info("Step 3: Initializing AI Agent and Tools...") 184 | llm = ChatGoogleGenerativeAI( 185 | model="gemini-2.5-pro", 186 | temperature=0, 187 | safety_settings={ 188 | HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE 189 | }, 190 | ) 191 | 192 | document_qa_prompt = ChatPromptTemplate.from_template("Answer based on context:\n{context}\nQuestion: {input}") 193 | document_chain = create_stuff_documents_chain(llm, document_qa_prompt) 194 | 195 | class FinancialSearchInput(BaseModel): 196 | query: str = Field(description="A detailed question to ask the financial statements.") 197 | 198 | financial_statement_tool = Tool( 199 | name="financial_statement_search", 200 | func=lambda query: document_chain.invoke({"input": query, "context": retriever.invoke(query)}), 201 | description="Search for info within the company's financial statements.", 202 | args_schema=FinancialSearchInput 203 | ) 204 | 205 | web_search_tool = TavilySearchResults(name="web_search") 206 | 207 | tools = [financial_statement_tool, web_search_tool] 208 | 209 | llm_with_tools = llm.bind_tools(tools) 210 | st.write("✅ Agent and tools are ready.") 211 | 212 | return llm_with_tools, tools, retriever 213 | 214 | def get_conversational_chain(retriever): 215 | """Creates the conversational RAG chain for the chat feature.""" 216 | llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.2) 217 | 218 | contextualize_q_system_prompt = """Given a chat history and the latest user question \ 219 | which might reference context in the chat history, formulate a standalone question \ 220 | which can be understood without the chat history. Do NOT answer the question, \ 221 | just reformulate it if needed and otherwise return it as is.""" 222 | contextualize_q_prompt = ChatPromptTemplate.from_messages( 223 | [("system", contextualize_q_system_prompt), MessagesPlaceholder("chat_history"), ("human", "{input}")] 224 | ) 225 | history_aware_retriever = create_history_aware_retriever(llm, retriever, contextualize_q_prompt) 226 | 227 | qa_system_prompt = """You are an expert financial analyst assistant. Your primary function is to answer questions based **only** on the financial document context provided to you. 228 | 229 | Your Rules: 230 | 1. Strictly Grounded: You must base your answers exclusively on the information found in the `Context` provided. Do not use any external knowledge or make assumptions. 231 | 2. Handle Missing Information: If the answer to a question cannot be found in the context, you **must** respond with: "The answer is not available in the provided document context." 232 | 3. Be Specific: When answering, if possible, refer to the specific financial statement (e.g., "According to the Profit & Loss statement..."). 233 | 4. Quote Figures: If a question involves numbers or financial data, provide the exact figures and currency units (e.g., ₹ Crores) mentioned in the document. 234 | 5. Be Concise: Keep your answers direct and to the point. 235 | 236 | Context: 237 | {context} 238 | """ 239 | 240 | qa_prompt = ChatPromptTemplate.from_messages( 241 | [ 242 | ("system", qa_system_prompt), 243 | MessagesPlaceholder(variable_name="chat_history"), 244 | ("human", "{input}"), 245 | ] 246 | ) 247 | question_answer_chain = create_stuff_documents_chain(llm, qa_prompt) 248 | 249 | rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain) 250 | return rag_chain 251 | 252 | # streamlit UI 253 | 254 | if "analysis_report" not in st.session_state: 255 | st.session_state.analysis_report = "" 256 | if "chat_history" not in st.session_state: 257 | st.session_state.chat_history = [] 258 | if "components_ready" not in st.session_state: 259 | st.session_state.components_ready = False 260 | 261 | with st.sidebar: 262 | st.header("Setup") 263 | uploaded_file = st.file_uploader("Choose a DRHP PDF file", type="pdf") 264 | company_name_input = st.text_input("Enter the Company Name", placeholder="e.g. Reliance Industries Limited, SpaceX") 265 | 266 | if st.button("Process Document"): 267 | if uploaded_file is not None and company_name_input: 268 | with st.spinner("Analysis in progress... This may take a few minutes."): 269 | pdf_bytes = uploaded_file.read() 270 | 271 | llm_with_tools, tools, retriever = get_agent(pdf_bytes, company_name_input) 272 | st.session_state.llm_with_tools = llm_with_tools 273 | st.session_state.tools = tools 274 | st.session_state.company_name = company_name_input 275 | st.session_state.conversation_chain = get_conversational_chain(retriever) 276 | 277 | # reset chat history for new document 278 | st.session_state.chat_history = [] 279 | st.session_state.analysis_report = "" 280 | 281 | st.success("Document processed! You can now use the analysis and chat features.") 282 | st.session_state.components_ready = True 283 | 284 | else: 285 | st.warning("Please upload a PDF and enter the company name.") 286 | 287 | 288 | tab1, tab2 = st.tabs(["🤖 Full IPO Analysis", "💬 Chat with Document"]) 289 | 290 | with tab1: 291 | st.header("Automated IPO Analysis") 292 | st.write("After processing a document, click the button below to run the full analysis.") 293 | 294 | if st.button("Run Full Analysis"): 295 | if st.session_state.components_ready: 296 | with st.spinner("Agent is running the full analysis... This may take a few minutes."): 297 | tool_map = {tool.name: tool for tool in st.session_state.tools} 298 | master_query = f""" 299 | Your mission is to generate a comprehensive IPO analysis report for **{company_name_input}**. Execute the following steps meticulously and synthesize the findings into a structured final report. 300 | 301 | **Step 1: In-Depth Financial Statement Analysis** 302 | Use the `financial_statement_search` tool to analyze the company's financials from the provided documents. You must investigate and report on the following: 303 | - **Balance Sheet:** Are reserves and surplus consistently growing? Are total borrowings consistently decreasing? 304 | - **Profit & Loss Statement:** Is the revenue from operations consistently growing? Is the profit after tax growing in proportion, or are margins shrinking? 305 | - **Cash Flow Statement:** Is the net cash flow from operating activities consistently positive? 306 | 307 | **IMPORTANT: If any of the financial statements are missing from the context, you must explicitly state that in your report and proceed with the analysis based on the available information.** 308 | 309 | **Step 2: External Market and Valuation Analysis** 310 | Use the `web_search` tool to gather the latest market data. You must find and report on: 311 | - The IPO's price band and the resulting P/E (Price-to-Earnings) ratio. 312 | - The average P/E ratio for the company's specific industry in India. 313 | - The P/E ratios of at least 2-3 key listed peer companies. 314 | - The current Grey Market Premium (GMP) for the IPO. 315 | 316 | **Step 3: Synthesize and Generate the Final Report** 317 | Once you have gathered all the necessary information, generate the final answer. The report **must** be structured exactly as follows: 318 | 319 | ### **IPO Analysis: {company_name_input}** 320 | 321 | **1. Financial Health Analysis:** 322 | - **Balance Sheet Verdict:** (Your findings on reserves and borrowings. Conclude: Strong, Average, or Weak). 323 | - **P&L Statement Verdict:** (Your findings on revenue and profit growth/margins. Conclude: Strong, Average, or Weak). 324 | - **Cash Flow Verdict:** (Your findings on cash from operations. Conclude: Positive, Negative, or **Data Not Available**). 325 | 326 | **2. Valuation Analysis:** 327 | - **Valuation Verdict:** (Your comparison of the IPO's P/E against industry/peer P/Es. Conclude: Aggressive, Fairly Priced, or Attractive). 328 | 329 | **3. Market Sentiment:** 330 | - **GMP Verdict:** (Your findings on the GMP and what it indicates). 331 | 332 | **4. Key Strengths & Risks:** 333 | - **Strengths:** (A bulleted list of 2-3 key positive points). 334 | - **Risks:** (A bulleted list of 2-3 key risks or concerns). 335 | 336 | **5. Final Verdict:** 337 | - (A clear recommendation: **SUBSCRIBE**, **AVOID**, or **SUBSCRIBE FOR LISTING GAINS**), followed by a concise justification. 338 | """ 339 | 340 | history = [HumanMessage(content=master_query)] 341 | final_answer = None 342 | for i in range(15): 343 | response = st.session_state.llm_with_tools.invoke(history) 344 | if not response.tool_calls: 345 | final_answer = response.content 346 | break 347 | history.append(response) 348 | for tool_call in response.tool_calls: 349 | tool_name, tool_args = tool_call["name"], tool_call["args"] 350 | if tool_name in tool_map: 351 | observation = tool_map[tool_name].invoke(tool_args) 352 | else: 353 | observation = f"Error: Tool '{tool_name}' not found." 354 | history.append(ToolMessage(content=str(observation), tool_call_id=tool_call["id"])) 355 | st.session_state.analysis_report = final_answer 356 | st.success("Analysis Complete!") 357 | else: 358 | st.warning("Please process a document in the sidebar first.") 359 | 360 | if st.session_state.analysis_report: 361 | st.markdown("---") 362 | st.subheader("Comprehensive IPO Analysis Report") 363 | st.markdown(st.session_state.analysis_report) 364 | 365 | with tab2: 366 | st.header(f"Chat with the Document") 367 | st.write("Ask any question about the financial data in the uploaded PDF.") 368 | 369 | if not st.session_state.components_ready: 370 | st.warning("Please process a document in the sidebar first to enable chat.") 371 | else: 372 | for message in st.session_state.chat_history: 373 | with st.chat_message(message["role"]): 374 | st.markdown(message["content"]) 375 | 376 | if user_question := st.chat_input("e.g., What was the total revenue in the last reported year?"): 377 | st.session_state.chat_history.append({"role": "user", "content": user_question}) 378 | with st.chat_message("user"): 379 | st.markdown(user_question) 380 | 381 | with st.chat_message("assistant"): 382 | with st.spinner("Thinking..."): 383 | response = st.session_state.conversation_chain.invoke( 384 | {"chat_history": st.session_state.chat_history, "input": user_question} 385 | ) 386 | bot_response = response["answer"] 387 | st.markdown(bot_response) 388 | 389 | st.session_state.chat_history.append({"role": "assistant", "content": bot_response}) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit 2 | python-dotenv 3 | langchain 4 | langchain-google-genai 5 | langchain-community 6 | faiss-cpu 7 | tavily-python 8 | pdfplumber 9 | pandas 10 | pydantic 11 | google-generativeai 12 | nest_asyncio --------------------------------------------------------------------------------