├── data
│   └── hi.txt
├── sec3_transactions_2017.pdf
├── sec3_transactions_2018.pdf
├── sec3_transactions_2019.pdf
├── sec3_transactions_2020.pdf
├── sec3_transactions_2021.pdf
├── sec3_transactions_2022.pdf
├── p1sec3_transactions_2023.pdf
├── p2sec3_transactions_2024.pdf
├── requirements.txt
├── .devcontainer
│   └── devcontainer.json
└── streamlit_app_9-28-23.py

--------------------------------------------------------------------------------
/data/hi.txt:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/sec3_transactions_2017.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/danshorstein/tc_lookup/main/sec3_transactions_2017.pdf
--------------------------------------------------------------------------------
/sec3_transactions_2018.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/danshorstein/tc_lookup/main/sec3_transactions_2018.pdf
--------------------------------------------------------------------------------
/sec3_transactions_2019.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/danshorstein/tc_lookup/main/sec3_transactions_2019.pdf
--------------------------------------------------------------------------------
/sec3_transactions_2020.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/danshorstein/tc_lookup/main/sec3_transactions_2020.pdf
--------------------------------------------------------------------------------
/sec3_transactions_2021.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/danshorstein/tc_lookup/main/sec3_transactions_2021.pdf
--------------------------------------------------------------------------------
/sec3_transactions_2022.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/danshorstein/tc_lookup/main/sec3_transactions_2022.pdf
--------------------------------------------------------------------------------
/p1sec3_transactions_2023.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/danshorstein/tc_lookup/main/p1sec3_transactions_2023.pdf
--------------------------------------------------------------------------------
/p2sec3_transactions_2024.pdf:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/danshorstein/tc_lookup/main/p2sec3_transactions_2024.pdf
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.12.2
2 | Flask==3.0.0
3 | pandas==2.1.1
4 | pdfplumber==0.10.2
5 | pyngrok==7.0.0
6 | Requests==2.31.0
7 | services==0.1.1
8 | streamlit==1.26.0
9 | xlsxwriter
10 | 
--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "Python 3",
3 |   // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
4 |   "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
5 |   "customizations": {
6 |     "codespaces": {
7 |       "openFiles": [
8 |         "README.md",
9 |         "streamlit_app_9-28-23.py"
10 |       ]
11 |     },
12 |     "vscode": {
13 |       "settings": {},
14 |       "extensions": [
15 |         "ms-python.python",
16 |         "ms-python.vscode-pylance"
17 |       ]
18 |     },
19 |   },
20 |   "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y <packages.txt; [ -f requirements.txt ] && pip3 install --user -r requirements.txt; pip3 install --user streamlit; echo '✅ Packages installed and Requirements met'",
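
Note: the devcontainer above appears to follow the standard Streamlit Codespaces template, so dependencies are installed automatically inside a Codespace. For a plain local run, a minimal sketch (assuming Python 3.11 and the pinned versions in requirements.txt above) is:

    pip install -r requirements.txt
    streamlit run streamlit_app_9-28-23.py
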
--------------------------------------------------------------------------------
/streamlit_app_9-28-23.py:
--------------------------------------------------------------------------------
225 |         if any([isinstance(row[f'Debit{i}'], str) and len(row[f'Debit{i}']) > 0 and row[f'Debit{i}'][0] == '4' for i in range(1, 4)]) or \
226 |             any([isinstance(row[f'Credit{i}'], str) and len(row[f'Credit{i}']) > 0 and row[f'Credit{i}'][0] == '4' for i in range(1, 4)]):
227 |             return 'Potentially missing Proprietary'
228 |         else:
229 |             return 'Potentially missing Budgetary'
230 |     return ''
231 | 
232 | 
233 | 
234 | def process_and_analyze_uploaded_csv_with_categories(uploaded_file, df_tc_lookup, categories_selected):
235 |     """
236 |     Process and analyze the uploaded CSV considering selected TC categories and flagging potential concerns.
237 |     """
238 |     if 'analyzed_data' in st.session_state:
239 |         return st.session_state['analyzed_data']
240 | 
241 |     df_uploaded = pd.read_csv(uploaded_file)
242 | 
243 |     for col in df_uploaded.columns:
244 |         if col.startswith(('Debit', 'Credit')):
245 |             df_uploaded[col] = df_uploaded[col].apply(lambda x: str(x)[:6] if pd.notnull(x) else '')
246 | 
247 |     df_uploaded['Matching TCs'] = None
248 |     df_uploaded['Match Type'] = None
249 |     # df_uploaded['Potential Concern'] = None
250 | 
251 |     for idx, row in df_uploaded.iterrows():
252 |         drs = [row['Debit1'], row['Debit2'], row['Debit3']]
253 |         crs = [row['Credit1'], row['Credit2'], row['Credit3']]
254 |         drs = [dr for dr in drs if dr is not None]
255 |         crs = [cr for cr in crs if cr is not None]
256 | 
257 |         common_sglas = set(drs) & set(crs)
258 | 
259 |         for sgl in common_sglas:
260 |             if sgl in drs and sgl in crs:
261 |                 if len(drs) > len(crs):
262 |                     drs.remove(sgl)
263 |                 else:
264 |                     crs.remove(sgl)
265 |         # for sgl in common_sglas:
266 |         #     dr_count = drs.count(sgl)
267 |         #     cr_count = crs.count(sgl)
268 |         #     cancel_count = min(dr_count, cr_count)
269 |         #     for _ in range(cancel_count):
270 |         #         drs.remove(sgl)
271 |         #         crs.remove(sgl)
272 | 
273 |         exact_matches = filter_tc_tool(df_tc_lookup[df_tc_lookup.index.str.startswith(tuple(categories_selected))], drs=drs, crs=crs)
274 | 
275 |         if not exact_matches.empty:
276 |             df_uploaded.at[idx, 'Matching TCs'] = ", ".join(exact_matches.index.to_list())
277 |             df_uploaded.at[idx, 'Match Type'] = "Exact Matches"
278 | 
279 |             # # Check for potential concern
280 |             # for tc in exact_matches.index:
281 |             #     bd_exists = any(exact_matches.loc[tc, col] for col in ['Budgetary_Debits', 'Budgetary_Credits'])
282 |             #     pr_exists = any(exact_matches.loc[tc, col] for col in ['Proprietary_Debits', 'Proprietary_Credits'])
283 |             #     if (bd_exists and not pr_exists) or (pr_exists and not bd_exists):
284 |             #         df_uploaded.at[idx, 'Potential Concern'] = 'Yes'
285 |             #         break
286 |         else:
287 |             close_matches = truncate_search(df_tc_lookup[df_tc_lookup.index.str.startswith(tuple(categories_selected))], drs, crs)
288 |             if not close_matches.empty:
289 |                 df_uploaded.at[idx, 'Matching TCs'] = ", ".join(close_matches.index.to_list())
290 |                 df_uploaded.at[idx, 'Match Type'] = "Close Matches"
291 | 
292 |     st.session_state['analyzed_data'] = df_uploaded
293 |     return df_uploaded
294 | 
295 | 
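# Illustrative note on the SGL-cancellation step above, using hypothetical
# account numbers (not taken from any TC lookup table): if a row yields
#     drs = ['480100', '310700']   and   crs = ['480100']
# then common_sglas == {'480100'}, and because len(drs) > len(crs) the shared
# account is removed from the debit side, leaving drs = ['310700'] and
# crs = ['480100'] before filter_tc_tool() is applied to the lookup table
# restricted to the selected category prefixes. Earlier in the function, every
# Debit*/Credit* cell is truncated to its first six characters and missing
# values become empty strings.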
296 | # def process_and_analyze_uploaded_csv_with_session_state(uploaded_file, df_tc_lookup):
297 | #     """
298 | #     Process and analyze the uploaded CSV using Streamlit's session state to avoid unnecessary re-analysis.
299 | #     """
300 | #     # Check if the analysis has already been done and stored in session state
301 | #     if 'analyzed_data' in st.session_state:
302 | #         return st.session_state['analyzed_data']
303 | 
304 | #     # If not, perform the analysis
305 | #     df_uploaded = process_and_analyze_uploaded_csv(uploaded_file, df_tc_lookup)
306 | 
307 | #     # Store the analysis results in session state
308 | #     st.session_state['analyzed_data'] = df_uploaded
309 | 
310 | #     return df_uploaded
311 | 
312 | 
313 | 
314 | 
315 | def generate_excel_from_analysis(df_results, df_tc_lookup):
316 |     """
317 |     Generate an Excel file based on the analysis results.
318 |     """
319 |     towrite = io.BytesIO()
320 | 
321 |     with pd.ExcelWriter(towrite, engine='xlsxwriter') as writer:
322 |         # Main tab: Resulting Analysis
323 |         df_results.to_excel(writer, sheet_name='Analysis Results', index=False)
324 | 
325 |         # Second tab: Filtered Trans Codes
326 |         # Extract unique TCs from the results and filter the TC lookup table
327 |         unique_tcs = set(tc for tcs in df_results['Matching TCs'].dropna() for tc in tcs.split(", "))
328 |         df_filtered_tcs = df_tc_lookup[df_tc_lookup.index.isin(unique_tcs)]
329 |         df_filtered_tcs.to_excel(writer, sheet_name='Filtered Trans Codes', index=True)
330 | 
331 |     towrite.seek(0)
332 |     return towrite
333 | 
334 | def streamlit_app():
335 |     st.title("Fed TC Lookup Tool")
336 | 
337 |     tab1, tab2 = st.tabs(['TC Analysis', 'File Analysis'])
338 | 
339 |     with tab1:
340 | 
341 |         with st.sidebar:
342 | 
343 | 
344 |             # Dropdown for fiscal year selection
345 |             fiscal_years = get_fiscal_years_from_filenames()
346 |             selected_fy = st.selectbox("Select Fiscal Year", fiscal_years)
347 | 
348 |             # Loading data for the selected fiscal year
349 |             df = load_tc_lookup_tool(selected_fy)
350 | 
351 | 
352 |             # User input
353 |             col1, col2 = st.columns(2)
354 |             with col1:
355 |                 drs = [st.text_input(f"Debit {i+1}", value='') for i in range(3)]
356 |             with col2:
357 |                 crs = [st.text_input(f"Credit {i+1}", value='') for i in range(3)]
358 | 
359 |             keyword = st.text_input("Keyword/Phrase Filter", value='')
360 | 
361 |             # Filter data based on user input
362 |             exact_matches = filter_tc_tool(df, drs=drs, crs=crs)
363 |             if keyword:
364 |                 exact_matches = keyword_filter(exact_matches, keyword)
365 | 
366 |             st.session_state['exact_matches'] = exact_matches
367 | 
368 | 
369 |             # Check for close matches if no exact matches are found
370 |             if exact_matches.empty:
371 | 
372 |                 # Check for close matches using truncation-based search
373 |                 close_matches = truncate_search(df, drs, crs)
374 |                 if keyword:
375 |                     close_matches = keyword_filter(close_matches, keyword)
376 | 
377 |                 st.session_state['close_matches'] = close_matches
378 | 
379 |         if 'exact_matches' not in st.session_state:
380 |             st.session_state['exact_matches'] = pd.DataFrame()
381 |         exact_matches = st.session_state['exact_matches']
382 | 
383 |         if 'close_matches' not in st.session_state:
384 |             st.session_state['close_matches'] = pd.DataFrame()
385 |         close_matches = st.session_state['close_matches']
386 | 
387 |         if not exact_matches.empty:
388 |             display_summary_table(exact_matches, 'Filtered Results Summary')
389 |         elif not close_matches.empty:
390 |             display_summary_table(close_matches, 'Filtered Results Summary')
391 |         # else:
392 |         #     display_summary_table(df, 'Filtered Results Summary')
393 | 
394 | 
395 | 
396 | 
397 |         # Display exact matches
398 |         st.subheader("Exact Matches")
399 | 
400 | 
401 |         # Link to the source PDF file
402 |         if selected_fy == 2024:
403 |             st.write("[FY2024 Transactions updated September 2023](https://raw.githubusercontent.com/danshorstein/tc_lookup/main/p2sec3_transactions_2024.pdf)")
404 |         elif selected_fy == 2023:
405 |             st.write("[FY2023 Transactions updated September 2023](https://raw.githubusercontent.com/danshorstein/tc_lookup/main/p1sec3_transactions_2023.pdf)")
406 | 
407 | 
408 |         # Download as Excel functionality
409 |         try:
410 |             towrite = io.BytesIO()
411 | 
412 |             # Create an Excel writer object
413 |             with pd.ExcelWriter(towrite, engine='xlsxwriter') as writer:
414 |                 # Sheet 1: Filtering Criteria and Summary Table
415 |                 criteria_data = {
416 |                     'Criteria': ['Debits', 'Debits', 'Debits', 'Credits', 'Credits', 'Credits', 'Keyword', 'Fiscal Year'],
417 |                     'Values': [drs[0], drs[1], drs[2], crs[0], crs[1], crs[2], keyword, selected_fy]
418 |                 }
419 |                 criteria_df = pd.DataFrame(criteria_data)
420 |                 criteria_df.to_excel(writer, sheet_name='Criteria & Summary', startrow=0, startcol=0, index=False)
421 | 
422 |                 # Get the summary table as a DataFrame
423 |                 if not exact_matches.empty:
424 |                     summary_df = display_summary_table(exact_matches, 'Filtered Results Summary', return_df=True)
425 |                 elif not close_matches.empty:
426 |                     summary_df = display_summary_table(close_matches, 'Filtered Results Summary', return_df=True)
427 |                 else:
428 |                     summary_df = display_summary_table(df, 'Filtered Results Summary', return_df=True)
429 | 
430 |                 # Add the summary table below the filtering criteria
431 |                 summary_df.to_excel(writer, sheet_name='Criteria & Summary', startrow=len(criteria_df) + 2)
432 | 
433 |                 # Sheet 2: Resulting Trans Codes
434 |                 if not exact_matches.empty:
435 |                     exact_matches.to_excel(writer, sheet_name='Trans Codes', index=True, header=True)
436 |                 else:
437 |                     close_matches.to_excel(writer, sheet_name='Trans Codes', index=True, header=True)
438 | 
439 |             towrite.seek(0)
440 |             st.download_button(
441 |                 label="Download Matches as Excel",
442 |                 data=towrite,
443 |                 file_name=f"matches_fy{selected_fy}.xlsx",
444 |                 mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
445 |             )
446 | 
447 |         except Exception as e:
448 |             st.write(f'EXCEPTION! {e}')
449 | 
450 | 
451 | 
452 |         if not exact_matches.empty:
453 |             st.dataframe(exact_matches,
454 |                          column_config=col_config,
455 |                          height=600)
456 | 
457 |         else:
458 |             st.write("No exact matches found.")
459 |             if not close_matches.empty:
460 |                 st.subheader("Close Matches")
461 |                 st.dataframe(close_matches,
462 |                              column_config=col_config,
463 |                              height=600)
464 | 
465 |     with tab2:
466 |         st.header("Upload CSV for Analysis")
467 |         uploaded_file = st.file_uploader("Choose a CSV file", type="csv")
468 | 
469 |         # Create the template DataFrame
470 |         template_df = pd.DataFrame(columns=["ID", "Debit1", "Debit2", "Debit3", "Credit1", "Credit2", "Credit3"])
471 | 
472 |         # Convert the DataFrame to a CSV in-memory object
473 |         csv_data = template_df.to_csv(index=False)
474 | 
475 |         # Offer the CSV for download
476 |         st.download_button(
477 |             label="Download CSV Template",
478 |             data=csv_data,
479 |             file_name="upload_template.csv",
480 |             mime="text/csv",
481 |         )
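        # Example of a filled-in upload file (hypothetical SGL account values, for
        # illustration only): the header row must match the template above, and each
        # Debit*/Credit* cell holds one account; only the first six characters of
        # each cell are used during analysis.
        #
        #   ID,Debit1,Debit2,Debit3,Credit1,Credit2,Credit3
        #   1,480100,,,490100,,
        #   2,610000,,,490200,480100,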
482 | 
483 |         # Transaction Code Categories for selection
484 |         tc_categories = {
485 |             'A': 'A. Funding',
486 |             'B': 'B. Disb and Pbls',
487 |             'C': 'C. Coll and Recvs',
488 |             'D': 'D. Adj/Write-offs/Reclass',
489 |             'E': 'E. Accr/Nonbudg Transfers',
490 |             'F': 'F. Yearend',
491 |             'G': 'G. Memo Entries',
492 |             'H': 'H. Specialized Entries'
493 |         }
494 | 
495 |         selected_values = st.multiselect('Select Trans Code Categories', list(tc_categories.values()), default=list(tc_categories.values()))
496 | 
497 |         categories_selected = [key for key, value in tc_categories.items() if value in selected_values]
498 | 
499 |         # If there's a change in the selected categories, clear the analysis results
500 |         if 'selected_categories' not in st.session_state or set(st.session_state['selected_categories']) != set(categories_selected):
501 |             st.session_state['selected_categories'] = categories_selected
502 |             if 'analyzed_data' in st.session_state:
503 |                 del st.session_state['analyzed_data']
504 | 
505 |         if uploaded_file:
506 |             # Generate a hash of the uploaded file to uniquely identify it
507 |             file_hash = hash(uploaded_file.getvalue())
508 | 
509 |             # Check if the file is different from the previously uploaded file
510 |             if 'uploaded_file_hash' not in st.session_state or st.session_state['uploaded_file_hash'] != file_hash:
511 |                 # Update the session state with the new file's hash
512 |                 st.session_state['uploaded_file_hash'] = file_hash
513 | 
514 |                 # Clear the analysis results from the session state
515 |                 if 'analyzed_data' in st.session_state:
516 |                     del st.session_state['analyzed_data']
517 | 
518 |             fiscal_years = get_fiscal_years_from_filenames()
519 |             selected_fy = fiscal_years[0]
520 |             df_tc_lookup = load_tc_lookup_tool(selected_fy)
521 |             df_results = process_and_analyze_uploaded_csv_with_categories(uploaded_file, df_tc_lookup, categories_selected)
522 |             df_results['Potential Concern'] = df_results.apply(lambda row: potential_concern(row, df_tc_lookup), axis=1)
523 | 
524 |             st.subheader("Analysis Results")
525 |             st.write(df_results)
526 | 
527 |             excel_data = generate_excel_from_analysis(df_results, df_tc_lookup)
528 |             st.download_button(
529 |                 label="Download Analysis as Excel",
530 |                 data=excel_data,
531 |                 file_name="analysis_results.xlsx",
532 |                 mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
533 |             )
534 | 
535 | 
536 | 
537 | streamlit_app()
538 | 
--------------------------------------------------------------------------------
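
Note on the category filter in the File Analysis tab: the multiselect labels map back to their single-letter keys, and the lookup table is then restricted by TC index prefix. A minimal sketch (hypothetical TC codes, assuming the same df_tc_lookup index used above):

    categories_selected = ['A', 'B']
    filtered = df_tc_lookup[df_tc_lookup.index.str.startswith(tuple(categories_selected))]
    # keeps rows whose TC code starts with 'A' or 'B' (e.g. 'A102', 'B110') and drops
    # the rest (e.g. 'C108'), mirroring the selection logic at lines 495-497 and the
    # prefix filter inside process_and_analyze_uploaded_csv_with_categories().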