├── .gitignore ├── .streamlit ├── config-BLACK.toml ├── config-DARK.toml ├── config.toml └── secrets.toml.sample ├── .vscode ├── launch.json └── settings.json ├── LICENSE ├── README.md ├── app.py ├── app_about.py ├── app_llm_data_query.py ├── app_llm_docs_query.py ├── app_llm_knowlege_graph_gen.py ├── app_state.py ├── common.py ├── data ├── CustomMacroModel_L_A.csv └── GCFS Countries.xlsx ├── db └── empty-for-github.txt ├── docs └── cdaniel-future-is-predictable-master.pdf ├── func_prompt.py ├── globals.py ├── graph_frontend ├── __init__.py └── index.html ├── images ├── a12i_logo_circle_transparent.png ├── app-demo.gif ├── favicon.ico └── snapshot-01.png ├── requirements.txt ├── run_app.cmd ├── static ├── favicon.ico ├── knowledge_graph ├── knowledge_graph.gv ├── knowledge_graph.gv.pdf └── knowledge_graph.png ├── storage └── empty-for-github.txt └── streamlit_debug.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | /.streamlit/secrets.toml 162 | /_ARCHIVE/docs 163 | /backup_this_project.cmd 164 | /weaviate_connect_test.py 165 | /langchain_streaming_test.py 166 | /images/a12i_logo_block_circle.png 167 | /images/a12i_logo_block_circle_transparent.png 168 | /images/a12i_logo_circle.png 169 | /docs/AppArchGuide2.0.pdf 170 | /docs/DesignPrincipleSummaries.doc 171 | /docs/InformationSystemsDesignPrinciples.docx 172 | /docs/Service Architecture Pocket Guide.pdf 173 | /docs/_VPandP_Reference.pdf 174 | /db/*.sqbpro 175 | /storage/docstore.json 176 | /storage/index_store.json 177 | /storage/vector_store.json 178 | /db/*.sqlite3 179 | /NOTES.md 180 | -------------------------------------------------------------------------------- /.streamlit/config-BLACK.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | primaryColor = "#7792E3" 3 | backgroundColor = "black" 4 | secondaryBackgroundColor = "black" 5 | textColor = "#DCDCDC" 6 | font = "monospace" 7 | 8 | -------------------------------------------------------------------------------- /.streamlit/config-DARK.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | # Primary accent for interactive elements 3 | primaryColor = '#FF4B4B' 4 | 5 | # Background color for the main content area 6 | backgroundColor = '#0E1117' 7 | 8 | # Background color for sidebar and most interactive widgets 9 | secondaryBackgroundColor = '#262730' 10 | 11 | # Color used for almost all text 12 | textColor = '#FAFAFA' 13 | 14 | # Font family for all text in the app, except code blocks 15 | # Accepted values (serif | sans serif | monospace) 16 | # Default: "sans serif" 17 | font = "sans serif" -------------------------------------------------------------------------------- /.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | base = "light" 3 | font = "sans serif" 4 | 
-------------------------------------------------------------------------------- /.streamlit/secrets.toml.sample: -------------------------------------------------------------------------------- 1 | IS_CLOUD_DEPLOYMENT='true' # 'true' = deployed on st cloud | 'false' = deployed locally 2 | OPENAI_API_KEY='' 3 | WEAVIATE_API_KEY='' 4 | WEAVIATE_URL='https://.weaviate.network' 5 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python: Current File", 9 | "type": "python", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal", 13 | "args": [] 14 | }, 15 | { 16 | "name": "Python Streamlit", 17 | "type": "python", 18 | "request": "launch", 19 | "module": "streamlit", 20 | "args": ["run", "${file}", "--server.port", "6974"], 21 | "justMyCode": true, 22 | "redirectOutput": true, 23 | "logToFile": true, 24 | "pathMappings": [ 25 | { 26 | "localRoot": "${workspaceFolder}", 27 | "remoteRoot": "." 28 | } 29 | ] 30 | }, 31 | { 32 | "name": "Python: debugpy Remote Attach", 33 | "type": "python", 34 | "request": "attach", 35 | "connect": { 36 | "port": 7777, 37 | "host": "127.0.0.1", 38 | }, 39 | "justMyCode": false, 40 | "redirectOutput": true, 41 | "logToFile": true, 42 | "pathMappings": [ 43 | { 44 | "localRoot": "${workspaceFolder}", 45 | "remoteRoot": "." 
46 | } 47 | ] 48 | // "debugAdapterPath": "${workspaceFolder}/src/debugpy/adapter", 49 | } 50 | ] 51 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "git.ignoreLimitWarning": true, 3 | "docwriter.style": "Auto-detect", 4 | "python.analysis.typeCheckingMode": "off", 5 | "python.defaultInterpreterPath": "C:\\ProgramData\\Anaconda3\\python.exe", 6 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Arvindra 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # LLM | DOC Q&A | KNOWLEDGE GRAPH | EXCEL DATA CHAT 3 | > _Integrated LLM-based document and data Q&A with knowledge graph visualization_ 4 | 5 | > Arvindra Sehmi, A12i (CloudOpti Ltd.) | [LinkedIn](https://www.linkedin.com/in/asehmi/) 6 | 7 | > Updated: 9 December, 2023 8 | 9 | --- 10 | 11 | ### Introduction 12 | 13 | I built this app because I'm writing some chapters for an upcoming book on Streamlit. This app helps me digest a large quantity of information from articles and documents I have on the subject of Software Architecture. I wanted to be able to ask questions about the documents and get answers, and also to visualize the answers in a knowledge graph. I also wanted to upload Excel files and ask questions about the data in the files. 14 | 15 | The application is a typical LLM application, with the addition of a knowledge graph visualization. The app is built in Python using Streamlit. I was inspired by [instagraph](https://github.com/yoheinakajima/instagraph) and re-implemented its graph plot as a Streamlit custom component. I use the Weaviate Cloud (vector) Store (WCS) for document and data indexing. OpenAI, LangChain, and LlamaIndex LLM programming frameworks play an important role too. The application supports local filestore indexing in addition to WCS. OpenAI embeddings are used and the OpenAI API is called, directly or via the other LLM frameworks, for question answering. Hence, you will need an OpenAI API key to use the application. Various LLM models are used for question answering, including the GPT-4-Turbo and GPT-4 models. They are used for bot chat and completions. Token usage is tracked and costs are estimated. 16 | 17 | The application is deployed on Streamlit Cloud. When deployed in the cloud, the application uses WCS. 
When deployed locally, the application can be configured to use LlamaIndex to store its index in the local file system. 18 | 19 | ![snapshot](./images/snapshot-01.png) 20 | 21 | ### Streamlit App Demo 22 | 23 | In this demo: 24 | 25 | 1. The user selects or enters a question to query over documents or data which have been indexed into Weaviate (a cloud-based vector store) 26 | 2. The app displays the question answer and generates a knowledge graph to complement the answer 27 | 3. The user can upload an Excel file which can be displayed and queried using natural language 28 | 4. The app allows the user to enter their OpenAI API key and select the model(s) to use for question answering 29 | 5. The app displays a per-query cost estimate and a running total of the cost of the queries 30 | 31 | ![st_demo](./images/app-demo.gif) 32 | 33 | ### Try the demo app yourself 34 | 35 | The application can be seen running in the Streamlit Cloud at the link below: 36 | 37 | [![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://docs-n-data-knowledge-app.streamlit.app/) 38 | 39 | **NOTE:** You will need to enter your own OpenAI API key. The key is ephemeral and not stored permanently in the application. Once entered, the API Key input box will be hidden and you can start using the app. To re-enter the API Key, a button is provided to clear the current key from memory, after which you can re-enter another key. 40 | 41 | ### Installation 42 | 43 | Ensure you have installed package requirements with the commands: 44 | 45 | ```bash 46 | # change to the Streamlit app folder, e.g. 47 | cd ./docs-n-data-knowledge-app 48 | pip install -r requirements.txt 49 | ``` 50 | 51 | **Important:** Modify the `secrets.toml` file in the application `.streamlit` root based on the example available in `secrets.toml.sample`. 
52 | 53 | ```bash 54 | OPENAI_API_KEY='' 55 | WEAVIATE_API_KEY='' 56 | WEAVIATE_URL='https://.weaviate.network' 57 | IS_CLOUD_DEPLOYMENT='true' # 'true' = deployed on st cloud | 'false' = deployed locally 58 | ``` 59 | 60 | In `globals.py` you can change the following variables to affect application behaviour: 61 | 62 | ```python 63 | # See: https://openai.com/pricing 64 | LANG_MODEL_PRICING = { 65 | # Friendly aliases used in app 66 | 'gpt-4': {'input': 0.03, 'output': 0.06}, # per 1000 tokens 67 | 'gpt-4-turbo': {'input': 0.01, 'output': 0.03}, # per 1000 tokens 68 | # Actual model names used in app 69 | 'gpt-4-1106-preview': {'input': 0.01, 'output': 0.03}, # per 1000 tokens 70 | 'gpt-3.5-turbo-instruct': {'input': 0.0015, 'output': 0.002}, # per 1000 tokens 71 | } 72 | 73 | VECTOR_STORE = 'Weaviate' # 'Weaviate' | 'Local' 74 | 75 | # Sample questions for the Document Q&A functionality, based on the topic of _my_ indexed documents 76 | SAMPLE_QUESTIONS = [ 77 | "None", # required 78 | "Summarize the most important concepts in a high performance software application", 79 | "Summarize the Wardley mapping technique", 80 | # : 81 | # ETC. 82 | # : 83 | "Most important factors of high performing teams", 84 | ] 85 | ``` 86 | 87 | Now run Streamlit with `app.py`: 88 | 89 | ```bash 90 | # I prefer to set the port number too 91 | streamlit run --server.port 4010 app.py 92 | ``` 93 | 94 | **NOTE:** Whilst there is some clean-up of the structured data expected in the LLM responses, LLMs don't always return data you expect. You might therefore encounter errors. If you do, try changing the LLM model selected and re-run your queries. 95 | 96 | ### TODO 97 | 98 | - Possibly, remove the data page functionality from app and create a separate project for it 99 | - Implement file upload document Q&A 100 | 101 | --- 102 | 103 | If you enjoyed this app, please consider starring this repository. 104 | 105 | Thanks! 
106 | 107 | Arvindra -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import streamlit as st 4 | 5 | import streamlit_debug 6 | streamlit_debug.set(flag=True, wait_for_client=False, host='localhost', port=7777) 7 | 8 | st.set_page_config(page_title='ChatGPT Pandas CSV Streamlit App', page_icon='🤖', initial_sidebar_state='expanded', layout='wide') 9 | # Remove blank space between top of page and content 10 | st.markdown("", unsafe_allow_html=True) 11 | st.markdown(" ", unsafe_allow_html=True) 12 | 13 | from app_state import (state, init_app_state, reset_app_state, _set_state_cb) 14 | init_app_state() # ensure all state variables are initialized 15 | 16 | from globals import SAMPLE_QUESTIONS 17 | 18 | logging.basicConfig(level=logging.INFO) 19 | 20 | # APP CALLBACKS ----------------------------------------------------------------- 21 | 22 | def _set_openai_api_key_cb(): 23 | if not state.text_input_openai_api_key.startswith('sk-'): 24 | st.warning('Please enter your OpenAI API key!', icon='⚠') 25 | return 26 | state.openai_api_key = state.text_input_openai_api_key 27 | os.environ['OPENAI_API_KEY'] = state.openai_api_key 28 | 29 | def _clear_openai_api_key_cb(): 30 | state.openai_api_key = '' 31 | os.environ['OPENAI_API_KEY'] = state.openai_api_key 32 | 33 | # DATA CHAT PAGE ---------------------------------------------------------------- 34 | 35 | def _openai_api_key_guard(): 36 | # Guardrail for API Key 37 | if not state.openai_api_key: 38 | st.error('🔑 Please enter your OpenAI API Key in the settings sidebar. 🔑') 39 | st.info( 40 | 'This value is ephemeral and not stored permanently.\n\n' 41 | 'Once entered, the API Key input box will be removed, and you can start using the app.\n\n' 42 | 'To re-enter the API Key, click the global settings button to clear the current key from memory.' 
43 | ) 44 | with st.sidebar: 45 | # api key 46 | st.text_input( 47 | '🔑 OpenAI API Key', 48 | value=state.openai_api_key, 49 | placeholder='sk-...', 50 | type='password', 51 | on_change=_set_openai_api_key_cb, 52 | help='Enter your OpenAI API Key', 53 | key='text_input_openai_api_key' 54 | ) 55 | st.stop() 56 | 57 | # Guardrail for API Key 58 | _openai_api_key_guard() 59 | 60 | # Once past the guardrails, import the rest of the app which depends on OpenAI API key 61 | import app_llm_data_query, app_llm_docs_query, app_llm_knowlege_graph_gen, app_about 62 | 63 | def start(): 64 | # Sidebar 65 | with st.sidebar: 66 | st.image('./images/a12i_logo_circle_transparent.png') 67 | top_level_options = ['Document Q&A | Knowedge Graph', 'Data Chat', 'About'] 68 | st.subheader('What would you like to do?') 69 | top_level = st.radio( 70 | 'What would you like to do?', 71 | top_level_options, index=0, 72 | label_visibility='collapsed', horizontal=False 73 | ) 74 | 75 | # Document Q&A | Knowledge Graph 76 | if top_level == top_level_options[0]: 77 | c1, _ = st.columns([1, 1.5]) 78 | with c1: 79 | # Title and description 80 | st.subheader('Document Q&A ❣️ Knowledge Graph') 81 | st.caption( 82 | '📑 Ask a question based on pre-uploaded documents on the subject of **Software Architecture**. You can ask questions on any topic ' 83 | 'in as much detail as you like. For your convenience, some sample questions are provided below.' 
84 | ) 85 | c1, _, c3, _ = st.columns([1, 0.075, 1, 1.5]) 86 | with c1: 87 | st.markdown('### **1️⃣ Ask a question**') 88 | user_input = st.text_input( 89 | "Enter question here...", 90 | placeholder="Enter text 🖋️ or URL 🔗", 91 | label_visibility="collapsed", 92 | key="user_text_input" 93 | ) 94 | example_selection = st.selectbox( 95 | "📑 You can choose a sample question here instead", 96 | options=SAMPLE_QUESTIONS, 97 | index=0, 98 | key="examples_selectbox" 99 | ) 100 | 101 | with c3: 102 | user_input_confirmed = False 103 | include_knowledge_graph = False 104 | radio_options = [user_input, example_selection] if user_input and (user_input != example_selection) else ([example_selection] if example_selection != "None" else []) 105 | if radio_options: 106 | st.markdown('### **2️⃣ Confirm your question**') 107 | with st.form(key="confirm_input_form"): 108 | st.radio( 109 | "Confirm input", options=radio_options, 110 | label_visibility="collapsed", 111 | horizontal=True, 112 | key="confirm_input" 113 | ) 114 | c1, c2, _ = st.columns([1, 1, 1.5]) 115 | with c1: 116 | user_input_confirmed = st.form_submit_button( 117 | label="Confirm and get answer", type='primary', 118 | on_click=_set_state_cb, kwargs={ 119 | 'user_input': "confirm_input", 120 | 'estimated_cost_doc': 'estimated_cost_reset', 121 | 'estimated_cost_graph': 'estimated_cost_reset', 122 | } 123 | ) 124 | with c2: 125 | include_knowledge_graph = st.checkbox('Include knowledge graph', value=False) 126 | 127 | if state.user_input: 128 | st.markdown(f'###### ✅ Confirmed question: _{state.user_input}_') 129 | st.markdown(f'###### ✅ Include knowledge graph: _{include_knowledge_graph}_') 130 | else: 131 | st.markdown('###### ❌ No question confirmed yet') 132 | 133 | st.markdown('---') 134 | 135 | c1, _, c3 = st.columns([1.5, 0.25, 1]) 136 | with c1: 137 | response = app_llm_docs_query.main('Document Q&A', user_input_confirmed) 138 | with c3: 139 | if include_knowledge_graph: 140 | 
app_llm_knowlege_graph_gen.main('Knowledge Graph', user_input_confirmed, response) 141 | 142 | # Simple Excel Data Q&A 143 | if top_level == top_level_options[1]: 144 | c1, _ = st.columns([1, 2]) 145 | with c1: 146 | st.subheader('🔢 Simple Excel Data Q&A') 147 | app_llm_data_query.main('Data Chat') 148 | 149 | # About / Display README.md 150 | if top_level == top_level_options[2]: 151 | st.subheader('📖 Readme') 152 | app_about.main() 153 | 154 | with st.sidebar: 155 | st.markdown('---') 156 | 157 | with st.expander('#### Cost Estimation', expanded=True): 158 | st.markdown(f'**Cumulative: ${state.cumulative_cost:.2f}**') 159 | st.markdown(f'Data query: ${state.estimated_cost_data:.2f}') 160 | st.markdown(f'Doc query: ${state.estimated_cost_doc:.2f}') 161 | st.markdown(f'Graph query: ${state.estimated_cost_graph:.2f}') 162 | 163 | st.markdown('#### Global Settings') 164 | if st.button('Reset app state', type='primary', help='Clear results cache and app state (optional). Will clear cost estimations too!'): 165 | reset_app_state() 166 | app_llm_data_query.get_llm_data_query_response.clear() 167 | app_llm_docs_query.get_llm_doc_query_response.clear() 168 | app_llm_knowlege_graph_gen.get_llm_graph_data_response.clear() 169 | st.experimental_rerun() 170 | st.button('Clear OpenAI API key', on_click=_clear_openai_api_key_cb, type='primary', help='Clear OpenAI API key (optional)') 171 | 172 | with st.expander('Debug State (excluding private keys)', expanded=False): 173 | display_state = {k: v for k, v in state.items() if not ('openai' in k or 'weaviate' in k)} 174 | st.write(display_state) 175 | 176 | st.subheader('About') 177 | st.sidebar.info('Integrated LLM-based document and data Q&A with knowledge graph visualization.\n\n' + \ 178 | '(c) 2023. A12i (CloudOpti Ltd.) 
All rights reserved.') 179 | 180 | if __name__ == '__main__': 181 | start() 182 | -------------------------------------------------------------------------------- /app_about.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | def main(): 4 | c1, _, c3, _ = st.columns([2,0.25,1,1]) 5 | with c1: 6 | with open('./README.md', 'r', encoding='utf-8') as f: 7 | readme_lines = f.readlines() 8 | readme_buffer = [] 9 | for line in readme_lines: 10 | if '![snapshot](./images/snapshot-01.png)' in line: 11 | st.markdown(' '.join(readme_buffer)) 12 | st.image('./images/snapshot-01.png') 13 | readme_buffer.clear() 14 | elif '![st_demo](./images/app-demo.gif)' in line: 15 | st.markdown(' '.join(readme_buffer)) 16 | st.image('./images/app-demo.gif') 17 | readme_buffer.clear() 18 | else: 19 | readme_buffer.append(line) 20 | st.markdown(' '.join(readme_buffer), unsafe_allow_html=True) 21 | 22 | with c3: 23 | st.markdown(''' 24 | ### About 🎈Streamlit 25 | 26 | Streamlit is a Python library that allows the creation of interactive, data-driven web applications in Python. 27 | [Streamlit](https://streamlit.io) is an open-source app framework for Machine Learning and Data Science teams. 28 | You can create beautiful data apps in minutes, not weeks. All in pure Python. It's not just for Data Science, though. 29 | 30 | With its component extensibility architecture, you can build and integrate most kinds of web frontends into Streamlit apps. 31 | 32 | Streamlit is fast-becoming a de facto standard for building Generative AI and LLM apps in Python. 
33 | 34 | ##### Resources 35 | 36 | - [Build powerful generative AI apps with Streamlit](https://streamlit.io/generative-ai) 37 | - [Streamlit Documentation](https://docs.streamlit.io/) 38 | - [Streamlit Blog](https://blog.streamlit.io/) 39 | - [Cheat sheet](https://docs.streamlit.io/library/cheatsheet) 40 | - [Book](https://www.amazon.com/dp/180056550X) (Getting Started with Streamlit for Data Science) 41 | - [Blog](https://blog.streamlit.io/how-to-master-streamlit-for-data-science/) (How to master Streamlit for data science) 42 | 43 | ##### Deploy 44 | 45 | Once you've created an app you can use the [Community Cloud](https://streamlit.io/cloud) to deploy, manage, and share your app, in just a few clicks. ''') 46 | -------------------------------------------------------------------------------- /app_llm_data_query.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sqlite3 3 | import pandas as pd 4 | # from sqlalchemy import create_engine 5 | from sqlalchemy.pool import StaticPool 6 | from langchain.callbacks import get_openai_callback 7 | from langchain.llms import OpenAI 8 | from langchain.utilities.sql_database import SQLDatabase 9 | from langchain_experimental.sql import SQLDatabaseChain 10 | import retry 11 | import logging 12 | 13 | logging.basicConfig(level=logging.ERROR) 14 | 15 | import streamlit as st 16 | 17 | from globals import ( 18 | DB_FILE, OPENAI_MODELS_COMPLETIONS, 19 | DEFAULT_MODEL_CONFIG, LANG_MODEL_PRICING 20 | ) 21 | from app_state import (state, init_app_state, _set_state_cb) 22 | init_app_state() # ensure all state variables are initialized 23 | 24 | # DATA ------------------------------------------------------------------------- 25 | 26 | @st.cache_data(persist='disk') 27 | def csv_to_df(excel_file): 28 | df = pd.read_csv(excel_file) 29 | return df 30 | 31 | @st.cache_data(persist='disk') 32 | def excel_to_df(excel_file): 33 | # 
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html 34 | # New in Pandas version 1.3.0. 35 | # The engine xlrd now only supports old-style .xls files. When engine=None, the following logic will be used to determine the engine: 36 | # If path_or_buffer is an OpenDocument format (.odf, .ods, .odt), then odf will be used. 37 | # Otherwise if path_or_buffer is an xls format, xlrd will be used. 38 | # Otherwise if path_or_buffer is in xlsb format, pyxlsb will be used. 39 | # Otherwise openpyxl will be used. 40 | # 41 | # import openpyxl 42 | # df = pd.read_excel(excel_file, engine=openpyxl) 43 | # 44 | # Therefore... do not need to provide "engine" when using a "path_or_buffer" 45 | df = pd.read_excel(excel_file, engine='openpyxl') 46 | return df 47 | 48 | def prepare_data(df): 49 | df.columns = [x.replace(' ', '_').lower() for x in df.columns] 50 | return df 51 | 52 | @st.cache_resource() 53 | def db_connection(): 54 | return sqlite3.connect(DB_FILE , check_same_thread=False) 55 | 56 | @st.cache_resource() 57 | def sql_database(table): 58 | # create db engine 59 | # eng = create_engine( 60 | # url=f'sqlite:///file:{DB_FILE}&cache=shared', 61 | # poolclass=StaticPool, # single connection for requests 62 | # creator=lambda: db_connection(), 63 | # ) 64 | # db = SQLDatabase(engine=eng) 65 | 66 | db = SQLDatabase.from_uri( 67 | database_uri = f'sqlite:///file:{DB_FILE}&cache=shared', 68 | include_tables=[table], # we include only one table to save tokens in the prompt :) 69 | sample_rows_in_table_info=2, # we only need 2 rows to get the table info 70 | engine_args={'poolclass': StaticPool, 'creator': lambda: db_connection()}, 71 | ) 72 | return db 73 | 74 | # OPENAI DATA QUERY ------------------------------------------------------------ 75 | 76 | # create OpenAI LLM connection 77 | # NOTE: relies on environment key in case you want to 78 | # remove entering the key in the app 79 | def get_llm( 80 | model_name: str = 
def get_llm(
        model_name: str = DEFAULT_MODEL_CONFIG['completions_model'],
        temperature: float = DEFAULT_MODEL_CONFIG['temperature'],
        top_p: float = DEFAULT_MODEL_CONFIG['top_p'],
        max_tokens: int = DEFAULT_MODEL_CONFIG['max_tokens'],
        max_retries: int = 3,
        streaming: bool = False,
):
    """Build a LangChain OpenAI completions client.

    The API key is read from the OPENAI_API_KEY environment variable; all
    sampling settings default to the app-wide DEFAULT_MODEL_CONFIG.
    """
    return OpenAI(
        openai_api_key=os.environ['OPENAI_API_KEY'],
        model_name=model_name,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        max_retries=max_retries,
        streaming=streaming,
    )

@retry.retry(tries=2, delay=5, backoff=3, jitter=(1, 5), max_delay=60, logger=logging.getLogger("LLM DATA QUERY (get_llm_data_query_response)"))
def get_llm_data_query_response(query, table, model_name=DEFAULT_MODEL_CONFIG['completions_model'], intermediate_steps=False, limit=3):
    """Answer a plain-English data query against `table` via SQLDatabaseChain.

    Parameters
    ----------
    query : str
        Natural-language question to translate into SQL. Must be non-empty.
    table : str
        Name of the SQL table to query.
    model_name : str
        OpenAI completions model to use.
    intermediate_steps : bool
        When True, return the full chain output (a dict including the
        intermediate SQL steps); otherwise return just the answer string.
    limit : int
        Maximum number of rows the generated SQL may return (chain `top_k`).

    Returns
    -------
    dict | str
        Chain result: a dict when `intermediate_steps` is True, else a string.

    Raises
    ------
    ValueError
        If `query` is empty.
    """
    if not query:
        # BUG FIX: previously an empty query fell through the `if query:` guard
        # and `result` was referenced while unbound (UnboundLocalError).
        raise ValueError('query must be a non-empty string')

    model_config = {
        'model_name': model_name,
        'temperature': 0,     # override settings = do not hallucinate!
        'top_p': state.top_p,
        'max_tokens': 2000,   # override settings
    }
    llm = get_llm(**model_config)

    # create SQLDatabaseChain LLM connection
    db_chain = SQLDatabaseChain.from_llm(
        llm=llm, db=sql_database(table), verbose=True,
        # use_query_checker=True,
        return_intermediate_steps=intermediate_steps,
        top_k=limit
    )

    # run query, counting tokens for the cost estimate
    with get_openai_callback() as token_counter:
        # BUG FIX: honour the `intermediate_steps` parameter rather than the
        # global `state.intermediate_steps` — the caller passes it explicitly
        # and the chain above was already configured from the parameter.
        if intermediate_steps:
            result = db_chain(query)
        else:
            result = db_chain.run(query)

        print('---- Data SQL Query ----', '\n',
              'LLM Prompt Tokens:', token_counter.prompt_tokens, '\n',
              'LLM Completion Tokens:', token_counter.completion_tokens, '\n',
              'Total LLM Token Count:', token_counter.total_tokens)

        # NOTE(review): pricing is keyed on state.completions_model — assumes it
        # always matches `model_name`; confirm callers cannot diverge.
        estimated_cost = ((token_counter.prompt_tokens / 1000.0) * LANG_MODEL_PRICING[state.completions_model]['input']) + \
                         ((token_counter.completion_tokens / 1000.0) * LANG_MODEL_PRICING[state.completions_model]['output'])
        print('Data SQL Query Estimated Cost: $', estimated_cost)
        state.estimated_cost_data = estimated_cost
        state.cumulative_cost += estimated_cost

    return result

# DATA CHAT PAGE ----------------------------------------------------------------

def main(title):
    """Render the data-chat page: model settings, file upload and query form."""
    # Sidebar
    with st.sidebar:
        st.markdown(f'#### {title} Settings')
        st.selectbox(
            'OpenAI model', options=OPENAI_MODELS_COMPLETIONS,
            on_change=_set_state_cb, kwargs={'completions_model': 'selectbox_data_completions_model_name'},
            index=OPENAI_MODELS_COMPLETIONS.index(state.completions_model),
            help='Allowed models. Accuracy, speed, token consumption and costs will vary.',
            key='selectbox_data_completions_model_name'
        )
        # results limit
        st.number_input(
            'Results limit', value=state.limit, min_value=1, max_value=10, step=1,
            on_change=_set_state_cb, kwargs={'limit': 'number_input_limit'},
            help='Limit the number of results returned, which can improve performance and save OpenAI costs',
            key='number_input_limit'
        )

    # Body
    st.subheader('Upload Data')
    excel_file = st.file_uploader('Choose an Excel file on your computer', type=['xlsx', 'csv'], accept_multiple_files=False)
    if excel_file is None:
        return

    # CSV uploads may arrive under several MIME types depending on browser/OS
    if excel_file.type in ['application/vnd.ms-excel', 'application/octet-stream', 'text/csv']:
        df = csv_to_df(excel_file)
    else:  # 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        df = excel_to_df(excel_file)

    if st.checkbox('Show Data', value=False):
        st.dataframe(df)

    # commit data to sql (replace any previous upload)
    data = prepare_data(df)
    data.to_sql(state.db_table, db_connection(), if_exists='replace', index=False)

    st.subheader('Query Data')
    with st.form(key='data_chat_form', clear_on_submit=False):
        # user query
        st.text_input(
            'Enter a data query in plain English', value=state.query,
            help='Enter a question based on the uploaded dataset. Add as much detail as you like. '
                 'E.g., "What is X of Y in the table. Limit to 10 results, and format as JSON showing X and Y values only."',
            key='text_input_query_data'
        )
        st.checkbox(
            'Show Intermediate Steps', value=state.intermediate_steps,
            key='checkbox_intermediate_steps'
        )
        apply_query = st.form_submit_button(
            label='Ask', type='primary',
            on_click=_set_state_cb, kwargs={
                'intermediate_steps': 'checkbox_intermediate_steps',
                'query': 'text_input_query_data',
                'estimated_cost_data': 'estimated_cost_reset',
            },
        )

    if apply_query and state.query and state.openai_api_key:
        # Constrain the model to the uploaded columns and a bare SQL answer
        query = state.query + f' Strictly use only these data columns "{list(data.columns)}". ' + \
                'Do not wrap the SQL statement in quotes. Do not embelish the answer with any additional text.'
        result = get_llm_data_query_response(
            query, state.db_table,
            model_name=state.completions_model,
            intermediate_steps=state.intermediate_steps,
            limit=state.limit
        )
        if state.intermediate_steps:
            with st.expander('Intermediate Steps', expanded=False):
                st.write(state.completions_model)
                st.write(result['intermediate_steps'])
            st.text(result['result'])
        else:
            st.text(result)
    elif apply_query and not state.query:
        st.info('Please enter a query above.')
@st.cache_data(ttl=60*60, show_spinner=False)
def get_llm_doc_query_response(
    query_prompt, model_name: str = DEFAULT_MODEL_CONFIG['completions_model'],
    _service_context=ServiceContext.from_defaults()
):
    """Query the document vector index and return the answer text.

    Results are cached by Streamlit for an hour, keyed on `query_prompt` and
    `model_name` (the underscore prefix excludes `_service_context` from the
    cache key).

    NOTE(review): the `_service_context` default is evaluated once at import
    time and shared across calls — confirm this is intended.
    """
    # load index
    # LOCAL STORE
    if VECTOR_STORE == 'Local':
        # rebuild storage context from the on-disk persistence directory
        storage_context = StorageContext.from_defaults(persist_dir='./storage')
        index = load_index_from_storage(storage_context)

    # WEAVIATE CLOUD STORE
    elif VECTOR_STORE == 'Weaviate':
        vector_store = WeaviateVectorStore(weaviate_client=wc, index_name="Documents", text_key="content")
        # set up the index
        index = VectorStoreIndex.from_vector_store(vector_store=vector_store, service_context=_service_context)

    else:
        raise ValueError(f'Unknown vector store {VECTOR_STORE}')

    # get query engine over the index and run the query
    query_engine = index.as_query_engine()
    response = query_engine.query(query_prompt)
    # normalize bullet characters for markdown rendering
    response = response.response.replace('•', '*')
    return response

def main(title, user_input_confirmed=False):
    """Render the document Q&A page; return the LLM response when one is made."""
    # Count token usage for cost estimation
    token_counter = TokenCountingHandler(
        tokenizer=tiktoken.encoding_for_model(state.completions_model).encode,
        verbose=False  # set to true to see usage printed to the console
    )
    callback_manager = CallbackManager([token_counter])
    service_context = ServiceContext.from_defaults(callback_manager=callback_manager)

    def _index_documents():
        # (Re)build the vector index over everything in ./docs
        documents = SimpleDirectoryReader('docs').load_data()

        # LOCAL STORE
        # NOTE: Disallow if cloud deployment (temporary fix for public demo and/or if you
        # don't have required file permissions or disk space)
        if not json.loads(st.secrets['IS_CLOUD_DEPLOYMENT']) and VECTOR_STORE == 'Local':
            # construct an index over these documents... saved in memory
            index = VectorStoreIndex.from_documents(documents, show_progress=True, service_context=service_context)
            # save index on disk
            index.storage_context.persist(persist_dir='./storage')

        # WEAVIATE CLOUD STORE
        elif VECTOR_STORE == 'Weaviate':
            # drop and recreate the schema class before re-indexing
            wc.schema.delete_class("Documents")
            class_obj = {
                "class": "Documents",
                "vectorizer": "text2vec-openai",
                "moduleConfig": {
                    "text2vec-openai": {},
                    "generative-openai": {}
                }
            }
            wc.schema.create_class(class_obj)
            # chunk up the documents into nodes
            parser = SimpleNodeParser.from_defaults(chunk_size=1024, chunk_overlap=20)
            nodes = parser.get_nodes_from_documents(documents, show_progress=True)
            # construct vector store
            vector_store = WeaviateVectorStore(weaviate_client=wc, index_name="Documents", text_key="content")
            # setting up the storage for the embeddings
            storage_context = StorageContext.from_defaults(vector_store=vector_store)
            # set up the index
            index = VectorStoreIndex(nodes, storage_context=storage_context, show_progress=True, service_context=service_context)

        else:
            raise ValueError(f'Unknown vector store {VECTOR_STORE}')

        print('---- Document Q&A ----', '\n',
              'Indexing Embedding Tokens: ', token_counter.total_embedding_token_count, '\n')

    with st.sidebar:
        st.markdown(f'#### {title} Settings')
        st.selectbox(
            'OpenAI model', options=OPENAI_MODELS_COMPLETIONS,
            on_change=_set_state_cb, kwargs={'completions_model': 'selectbox_docs_completions_model_name'},
            index=OPENAI_MODELS_COMPLETIONS.index(state.completions_model),
            help='Allowed models. Accuracy, speed, token consumption and costs will vary.',
            key='selectbox_docs_completions_model_name'
        )
        include_history = st.checkbox('Include history in prompts', value=False)
        if st.button('Clear history'):
            state.questions = []
            state.past = []
            # BUG FIX: also clear the generated (question, answer) pairs so all
            # three history lists stay in sync.
            state.generated = []
        # NOTE: Hide indexing button if cloud deployment (temporary fix for public demo)
        if not json.loads(st.secrets['IS_CLOUD_DEPLOYMENT']) and st.button('Index documents'):
            with st.spinner("Indexing..."):
                _index_documents()

    # GPT completion models can not handle web sites, so we scrape the URL in the user input
    user_input = state.user_input
    if user_input.strip().startswith('http'):
        scraped_texts = scrape_articles([user_input])['text']
        user_input = scraped_texts[0] if scraped_texts else user_input
    user_input = user_input.replace('\n', ' ').replace('\r', '') if user_input else user_input

    if include_history:
        context = '\n\n'.join([f'| Question: "{q}" | Answer: "{a}" |' for q, a in zip(state.questions, state.past)])
        refinement = \
            'Finally, return results in markdown text, include bullet point format where appropriate. ' + \
            'Add additional web links at the end of the response if this is useful.'
        prompt_template = "Given this context ### {context} ###. Answer or summarize this: ### {doc_query} ###. {refinement}"
        prompt = PromptTemplate(input_variables=['context', 'doc_query', 'refinement'], template=prompt_template)
        query_prompt = prompt.format(context=context, doc_query=user_input, refinement=refinement)
    else:
        refinement = \
            'Return results in markdown text, include bullet point format where appropriate. ' + \
            'Add additional web links at the end of the response if this is useful.'
        prompt_template = "Answer or summarize this: ### {doc_query} ###. {refinement}"
        prompt = PromptTemplate(input_variables=['doc_query', 'refinement'], template=prompt_template)
        query_prompt = prompt.format(doc_query=user_input, refinement=refinement)

    if user_input_confirmed and state.user_input:
        with st.spinner("Generating query answer..."):
            try:
                # warm the cache so the display path below hits it
                response = get_llm_doc_query_response(query_prompt, model_name=state.completions_model, _service_context=service_context)
                print('---- Document Q&A ----', '\n',
                      'Embedding Tokens: ', token_counter.total_embedding_token_count, '\n',
                      'LLM Prompt Tokens: ', token_counter.prompt_llm_token_count, '\n',
                      'LLM Completion Tokens: ', token_counter.completion_llm_token_count, '\n',
                      'Total LLM Token Count: ', token_counter.total_llm_token_count)
            except Exception as ex:
                # FIX: removed redundant f-prefix (no placeholders in message)
                st.warning('Index does not exist. Please index some documents.')
                st.error(str(ex))
                return

    if state.user_input:
        st.subheader('🙋🏽 Answer')
        with st.spinner("Generating query answer..."):
            try:
                # This will use cached response!
                response = get_llm_doc_query_response(query_prompt, model_name=state.completions_model, _service_context=service_context)
            except Exception as ex:
                st.warning('Index does not exist. Please index some documents.')
                st.error(str(ex))
                return

        if state.user_input not in state.questions:
            state.questions.append(state.user_input)
            state.generated.append((state.user_input, response))
            state.past.append(response)

        st.markdown(response)

        with st.expander('View conversation history', expanded=False):
            st.markdown('\n\n'.join([f'---\n**Question**\n\n{q}\n\n**Answer**\n\n{a}' for q, a in zip(state.questions, state.past)]))

        estimated_cost = ((token_counter.prompt_llm_token_count / 1000.0) * LANG_MODEL_PRICING[state.completions_model]['input']) + \
                         ((token_counter.completion_llm_token_count / 1000.0) * LANG_MODEL_PRICING[state.completions_model]['output'])
        print('Document Q&A Estimated Cost: $', estimated_cost)
        state.estimated_cost_doc = estimated_cost
        state.cumulative_cost += estimated_cost

        return response
def correct_json(response_data):
    """
    Corrects the JSON response from OpenAI to be valid JSON.

    Strips trailing commas before closing braces/brackets and quotes bare
    (unquoted) object keys.

    NOTE(review): the first replace below looks like a no-op (space -> space);
    the original intent was probably to replace non-breaking spaces — confirm.
    """
    # clean up the response data JSON
    response_data = response_data.replace(' ', ' ').replace(',\n }', '\n }')
    # For good measure: quote bare keys, then drop trailing commas
    response_data = re.sub(
        r',\s*}', '}', re.sub(
            r',\s*]', ']', re.sub(
                r'(\w+)\s*:', r'"\1":',
                response_data
            )))
    return response_data

@st.cache_data(ttl=60*60, show_spinner=False)
def get_llm_graph_data_response(user_input, model_name=DEFAULT_MODEL_CONFIG['chat_model']):
    """Call the OpenAI function-calling API to extract knowledge-graph JSON.

    Returns the corrected JSON arguments string produced by the forced
    `knowledge_graph` function call, an error-message string on API failure,
    or None when `user_input` is empty.
    """
    if not user_input:
        return None
    print(f"OpenAI call ({model_name})")
    try:
        model_config = {
            'model': model_name,
            'temperature': state.temperature,
            'top_p': state.top_p,
            'max_tokens': state.max_tokens,
        }
        completion = openai.chat.completions.create(
            messages=json.loads(SafeFormatter().format(json.dumps(func_prompt.MESSAGES), user_input=user_input)),
            functions=func_prompt.FUNCTIONS,
            function_call=func_prompt.FUNCTION_CALL,
            **model_config
        )
    except openai.RateLimitError as e:
        # request limit exceeded or something.
        return str(e)
    except Exception as e:
        # general exception handling
        # NOTE(review): error strings returned here are non-JSON; downstream
        # consumers rely on get_graph_data's fallback to handle them.
        return str(e)

    response_data = completion.choices[0].message.function_call.arguments
    # clean up the response data JSON
    response_data = correct_json(response_data)

    # cost estimate based on the currently selected chat model's pricing
    estimated_cost = ((completion.usage.prompt_tokens / 1000.0) * LANG_MODEL_PRICING[state.chat_model]['input']) + \
                     ((completion.usage.completion_tokens / 1000.0) * LANG_MODEL_PRICING[state.chat_model]['output'])
    print('Knowledge Graph Generation Estimated Cost: $', estimated_cost)
    state.estimated_cost_graph = estimated_cost
    state.cumulative_cost += estimated_cost

    return response_data

# Function to generate a graph image using Graphviz
def generate_knowledge_graph(response_data):
    """Render the knowledge-graph JSON with Graphviz.

    Returns a dict with the Digraph ('dot') and, when the GraphViz executable
    is available (non-cloud deployment), paths to the rendered PNG and .gv.
    """
    dot = Digraph(comment="Knowledge Graph")
    response_dict = json.loads(response_data)

    # Add nodes to the graph
    for node in response_dict.get("nodes", []):
        dot.node(node["id"], f"{node['label']} ({node['type']})")

    # Add edges to the graph
    for edge in response_dict.get("edges", []):
        dot.edge(edge["from"], edge["to"], label=edge["relationship"])

    # Requires GraphViz executable, so we can't use it in Streamlit Cloud
    if json.loads(st.secrets['IS_CLOUD_DEPLOYMENT']):
        return {'dot': dot, 'png': None, 'gv': None}
    else:
        # Render and visualize
        dot.render("./static/knowledge_graph.gv", view=False)
        # Render to PNG format and save it
        dot.render("./static/knowledge_graph", format="png", view=False)
        return {'dot': dot, 'png': "./static/knowledge_graph.png", 'gv': "./static/knowledge_graph.gv"}

def get_graph_data(response_data):
    """Convert knowledge-graph JSON into Cytoscape-style graph elements.

    Falls back to empty node/edge lists when `response_data` is not valid
    JSON (e.g. an API error string) or lacks the expected keys.
    """
    try:
        response_dict = json.loads(response_data)
        nodes = [
            {
                "data": {
                    "id": node["id"],
                    "label": node["label"],
                    "color": node.get("color", "defaultColor"),
                }
            }
            for node in response_dict["nodes"]
        ]
        edges = [
            {
                "data": {
                    "source": edge["from"],
                    "target": edge["to"],
                    "label": edge["relationship"],
                    "color": edge.get("color", "defaultColor"),
                }
            }
            for edge in response_dict["edges"]
        ]
        return {"elements": {"nodes": nodes, "edges": edges}}
    except (json.JSONDecodeError, KeyError, TypeError):
        # BUG FIX: narrowed from a bare `except:` so programming errors and
        # KeyboardInterrupt/SystemExit are no longer silently swallowed.
        return {"elements": {"nodes": [], "edges": []}}

# UTILITY ---------------------------------------------------------------------

def image_html_fragments(image, text, image_style=None, text_style=None):
    """Build HTML fragments embedding `image` (base64) alongside `text`.

    NOTE(review): the img/anchor tag markup appears to have been stripped from
    this source — the f-strings below carry no tags, and `img_b64` is unused
    as a result. Restore the original tag templates before relying on output.
    """
    with open(image, 'rb') as img_f:
        img_b64 = base64.b64encode(img_f.read()).decode('utf-8')

    img_style = image_style if image_style else "height: 200px; margin: 3px;"
    image_tag_html = f''
    image_download_link = f'Download'

    # style copied from dev tools
    span_style = text_style if text_style else "font-weight: 600; font-size: 1.75rem;"
    span_style = (f'font-family: Source Sans Pro, sans-serif; {span_style}'
                  'color: rgb(49, 51, 63); letter-spacing: -0.005em;'
                  'padding: 0.5rem 0px 1rem; margin: 0px; line-height: 1.2;'
                  'text-size-adjust: 100%; -webkit-font-smoothing: auto;'
                  'position: relative; vertical-align:middle;')
    text_html = f'{text}'

    image_html = f'{text_html}  {image_tag_html}'

    return {'image_html': image_html, 'image_tag_html': image_tag_html, 'image_download_link': image_download_link}

# MAIN ------------------------------------------------------------------------

def main(title, user_input_confirmed=False, response=None):
    """Render the knowledge-graph page: settings, generation, display modes."""
    # Sidebar
    with st.sidebar:
        st.markdown(f'#### {title} Settings')
        st.selectbox(
            'OpenAI model', options=OPENAI_MODELS_CHAT,
            on_change=_set_state_cb, kwargs={'chat_model': 'selectbox_graph_chat_model_name'},
            index=OPENAI_MODELS_CHAT.index(state.chat_model),
            help='Allowed models. Accuracy, speed, token consumption and costs will vary.',
            key='selectbox_graph_chat_model_name'
        )

    # GPT chat models can handle web sites, so we can keep URLs in the user input
    user_input = state.user_input if state.user_input.strip().startswith('http') else response
    user_input = user_input.replace('\n', ' ').replace('\r', '') if user_input else user_input

    if user_input_confirmed and user_input:
        with st.spinner("Generating knowledge graph (this takes a while)..."):
            response_data = get_llm_graph_data_response(user_input, model_name=state.chat_model)

    if user_input:
        st.subheader('💡 Answer Knowledge Graph')
        # This will use cached response!
        with st.spinner("Generating knowledge graph (this takes a while)..."):
            response_data = get_llm_graph_data_response(user_input, model_name=state.chat_model)

        c1, c2, _ = st.columns([2, 1, 3])
        with c1:
            radio_options = ["Interactive", "Static", "Data"]
            radio_option = st.radio('Knowledge graph options', options=radio_options, horizontal=True)
        with c2:
            height = st.slider("Adjust image height", 100, 1000, 750, 50)

        if radio_option == radio_options[0]:
            from graph_frontend import graph_component

            # NOTE: This component doesn't actually return any data, so handle_event is a no-op
            def run_component(props):
                value = graph_component(key='graph', **props)
                return value

            def handle_event(value):
                if value is not None:
                    st.write('Received from graph component: ', value)

            props = {
                'data': {'graph': get_graph_data(response_data)},
                'graph_height': height,
                'show_graph_data': False,
            }
            handle_event(run_component(props))

        if radio_option == radio_options[1]:
            graph_data = generate_knowledge_graph(response_data)
            # If graphviz executable is available, then we'll have a PNG to download or display
            if graph_data['png']:
                image_html_frags = image_html_fragments(
                    graph_data['png'], '',
                    image_style=f"height: {height}px; margin: 5px;",
                    text_style="font-weight: 600; font-size: 1.75rem;"
                )
                st.markdown(f"{image_html_frags['image_download_link']}", unsafe_allow_html=True)

            # Display using Streamlit's D3.js graphviz renderer
            st.graphviz_chart(graph_data['dot'])

        if radio_option == radio_options[2]:
            st.json(get_graph_data(response_data), expanded=True)
import os
import openai
import weaviate
import streamlit as st

from globals import (DEFAULT_MODEL_CONFIG, DB_TABLE)

# MAIN APP STATE ----------------------------------------------------------------

# Alias for Streamlit's per-session state object; shared by all pages.
state = st.session_state

# Initial state builder
def build_initial_state():
    """Assemble the app's initial session-state dict.

    API keys are looked up in Streamlit secrets first, then the process
    environment (the workaround for running Streamlit on Heroku). Only masked
    key fingerprints are ever logged.
    """
    openai_api_key = None
    if st.secrets.get('OPENAI_API_KEY', None):
        print('settings', 'OPENAI_API_KEY found')
        openai_api_key = st.secrets['OPENAI_API_KEY']
    else:
        print('settings OPENAI_API_KEY not found!')
        # Try get OpenAI api key from os env
        # (this is the workaround for using Streamlit in Heroku)
        if os.environ.get('OPENAI_API_KEY', None):
            print('os.environ', 'OPENAI_API_KEY found')
            openai_api_key = os.environ['OPENAI_API_KEY']
            # NOTE(review): openai.api_key is only set on the env-var path, not
            # when the key comes from Streamlit secrets — confirm intended.
            openai.api_key = os.getenv("OPENAI_API_KEY")

    # Log a masked fingerprint, never the full secret
    if openai_api_key:
        print('openai_api_key', 'sk_...' + openai_api_key[-5:], '\n')
    else:
        print('openai_api_key', 'NULL', '\n')

    weaviate_api_key = st.secrets.get('WEAVIATE_API_KEY', None)
    if weaviate_api_key:
        print('weaviate_api_key', weaviate_api_key[:5] + '...' + weaviate_api_key[-5:], '\n')
    else:
        print('weaviate_api_key', 'NULL', '\n')

    WEAVIATE_URL = st.secrets.get('WEAVIATE_URL', None)

    initial_state = {
        # MAIN APP STATE
        'openai_api_key': openai_api_key,
        'weaviate_api_key': weaviate_api_key,
        'WEAVIATE_URL': WEAVIATE_URL,
        'menu_choice': 0,

        # DATA PAGE STATE
        'limit': 3,
        'query': '',
        'intermediate_steps': True,
        'db_table': DB_TABLE,
        'generated': [],
        'past': [],
        'questions': [],

        # KNOWLEDGE GRAPH PAGE STATE
        'user_input': '',

        # MODEL STATE
        'chat_model': DEFAULT_MODEL_CONFIG['chat_model'],
        'completions_model': DEFAULT_MODEL_CONFIG['completions_model'],
        'temperature': DEFAULT_MODEL_CONFIG['temperature'],
        'top_p': DEFAULT_MODEL_CONFIG['top_p'],
        'max_tokens': DEFAULT_MODEL_CONFIG['max_tokens'],

        'estimated_cost_reset': 0,
        'estimated_cost_data': 0,
        'estimated_cost_doc': 0,
        'estimated_cost_graph': 0,
        'cumulative_cost': 0,
    }

    return initial_state

# State initializer
def init_app_state():
    """Seed missing keys into session state without clobbering existing ones."""
    initial_state = build_initial_state()
    for k, v in initial_state.items():
        # BUG FIX: membership test instead of truthiness — previously any key
        # holding a falsy value (0, '', False, []) was re-seeded to its default
        # on every rerun, clobbering user choices such as unticked checkboxes.
        if k not in state:
            setattr(state, k, v)

# State resetter
def reset_app_state():
    """Force every session-state key back to its initial value."""
    initial_state = build_initial_state()
    for k, v in initial_state.items():
        setattr(state, k, v)

# STATE CALLBACK ----------------------------------------------------

# generic callback to set state
def _set_state_cb(**kwargs):
    """Copy widget values into named state keys (state_key <- widget_key).

    Skips widgets whose current value is None (widget not rendered yet).
    """
    for state_key, widget_key in kwargs.items():
        val = state.get(widget_key, None)
        # FIX: dropped the dead `or val == ""` clause — an empty string is
        # already `not None`, so the extra test could never change the result.
        if val is not None:
            setattr(state, state_key, val)
from colorama import init as colorama_init
from colorama import Fore
from colorama import Style

# --------------------------------------------------------------------------------

# TODO: Make this HttpException compatible
# https://stackoverflow.com/questions/64501193/fastapi-how-to-use-httpexception-in-responses
# https://plainenglish.io/blog/3-ways-to-handle-errors-in-fastapi-that-you-need-to-know-e1199e833039
# https://christophergs.com/tutorials/ultimate-fastapi-tutorial-pt-5-basic-error-handling/

# Error handler
class AppError(Exception):
    """Application error carrying a message and an HTTP-style status code."""
    def __init__(self, error, status_code):
        self.error = error              # human-readable error message
        self.status_code = status_code  # HTTP-style status code


def throw_if_nulls(df):
    """Raise AppError(500) if `df` contains any NULL/NaN values."""
    # .any() short-circuits; equivalent to the former sum(...) > 0 check
    if df.isnull().values.any():
        raise AppError(
            error=f'NULL values found in dataframe ({list(df.columns)})!!',
            status_code=500
        )

# --------------------------------------------------------------------------------

colorama_init()

# Colored console logging helpers
def print_red(msg):
    print(f'{Fore.RED}{msg}{Style.RESET_ALL}')
def print_blue(msg):
    print(f'{Fore.BLUE}{msg}{Style.RESET_ALL}')
def print_green(msg):
    print(f'{Fore.GREEN}{msg}{Style.RESET_ALL}')
def print_yellow(msg):
    print(f'{Fore.YELLOW}{msg}{Style.RESET_ALL}')
def print_cyan(msg):
    print(f'{Fore.CYAN}{msg}{Style.RESET_ALL}')
def print_magenta(msg):
    print(f'{Fore.MAGENTA}{msg}{Style.RESET_ALL}')

# WEB SCRAPER -----------------------------------------------------------------

# https://newspaper.readthedocs.io/en/latest/user_guide/quickstart.html
import newspaper
# https://www.crummy.com/software/BeautifulSoup/
from bs4 import BeautifulSoup
import htmldate
import dateutil
import datefinder
import random
import time

def scrape_articles(source_urls):
    """Scrape news articles from `source_urls` using the newspaper package.

    Returns a dict of parallel lists keyed by 'title', 'author', 'date',
    'text', 'keywords', 'summary' and 'url'. URLs that fail to scrape are
    logged and skipped, so the lists may be shorter than the input.
    """
    article_titles = []
    article_authors = []
    article_dates = []
    article_texts = []
    article_keywords = []
    article_summaries = []
    article_urls = []

    # The result dict shares the list objects above, so the helper can append
    # directly to them.
    articles_dict = \
        {'title': article_titles, 'author': article_authors, 'date': article_dates,
         'text': article_texts, 'keywords': article_keywords, 'summary': article_summaries,
         'url': article_urls}

    def _newspaper_scraper_helper(url):
        # Scrape one URL and append its fields to the shared lists above.
        HEADERS = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0',
                   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                   'Referer': 'https://google.com/'}
        config = newspaper.Config()
        config.headers = HEADERS
        config.request_timeout = 10

        article = newspaper.Article(url=url, language='en')
        article.download()
        article.parse()
        article.nlp()

        article_titles.append(article.title)
        article_authors.append(article.authors)

        # Fall back to heuristics when newspaper can't find a publish date
        if article.publish_date is None:
            publish_date = alternative_get_publish_date(article)
        else:
            publish_date = parse_date_str(str(article.publish_date))
        article_dates.append(publish_date)

        # Prefix the text with the publish date when we have one
        if publish_date:
            article_text = f'Published: {publish_date.strftime("%d %B %Y")}\n\n{article.text}'
        else:
            article_text = article.text
        article_text = article_text.replace('\n\n', '\n').replace('\r\n', '\n')
        article_texts.append(article_text)

        article_keywords.append(article.keywords)
        article_summaries.append(article.summary)
        article_urls.append(article.url)
        # FIX: removed a dead local rebuild-and-return of articles_dict here;
        # the caller ignored the return value and the shared lists already
        # feed the outer dict.

    for url in source_urls:
        # Use a variable sleep between calls (good netizenship!)
        t = random.choice([0.1, 0.25, 0.5, 0.75, 1., 1.1, 1.25, 1.5])
        jitter = random.random()
        time.sleep(t + jitter)

        try:
            _newspaper_scraper_helper(url.strip())
        except Exception as e:
            print('!!Newspaper Exception!!', '\n', e)
            continue

    return articles_dict

# if newspaper can't find, then use bs4, datefinder, htmldate
def alternative_get_publish_date(article):
    """Best-effort publish-date extraction: bs4, then datefinder, then htmldate.

    Returns a datetime, or None when every strategy fails.
    """
    # try bs4
    soup = BeautifulSoup(article.html, features="lxml")
    # class=newsdate is specific to one site I was scraping (you can change this to suit your needs)
    para = soup.find('p', attrs={'class': 'newsdate'})
    if para:
        return parse_date_str(para.next)

    # try datefinder (first date found in the article body)
    try:
        return next(datefinder.find_dates(article.text))
    except StopIteration:
        pass
    except Exception:
        # FIX: narrowed from a bare `except:`; datefinder can raise on odd
        # inputs — treat any failure as "not found" rather than swallowing
        # KeyboardInterrupt/SystemExit too.
        pass

    # try htmldate
    htmldt = htmldate.find_date(article.html, extensive_search=True, original_date=True)
    if htmldt:
        return parse_date_str(htmldt)

    return None

def parse_date_str(date_str):
    """Parse a date string to a datetime; return None when empty or unparseable."""
    if date_str:
        try:
            return dateutil.parser.parse(date_str)
        except (ValueError, OverflowError, AttributeError, TypeError):
            # nearly all parse failures are due to URL dates without a day
            # specifier, e.g. /2014/04/
            return None
    return None  # explicit: empty input parses to None

# --------------------------------------------------------------------------------
# Partial ("safe") formatting, based on https://stackoverflow.com/a/34033230
#
# NOTE: (@asehmi) Modified to preserve the format strings when a field's value
# is None — important when a prompt template is built up incrementally — and
# to deal with embedded JSON strings which contain braces.
import string
class SafeFormatter(string.Formatter):
    """string.Formatter that leaves unknown/None fields intact instead of raising."""
    def vformat(self, format_string, args, kwargs):
        tokens = []
        for (lit, name, spec, conv) in self.parse(format_string):
            # re-escape braces that parse() unescaped
            # NOTE: (@asehmi) Modified to deal with embedded JSON strings which contain braces
            # FIX: guard against empty literals (adjacent fields like "{a}{b}"
            # yield lit == '', and lit[0] raised IndexError)
            if lit and (lit[0] in ['{', '}'] or lit[-1] in ['{', '}']):
                lit = lit.replace('{', '{{{{').replace('}', '}}}}')
            else:
                lit = lit.replace('{', '{{').replace('}', '}}')
            # only lit is non-None at the end of the string
            if name is None:
                tokens.append(lit)
            else:
                # but conv and spec are None if unused
                conv = '!' + conv if conv else ''
                spec = ':' + spec if spec else ''
                # name includes indexing ([blah]) and attributes (.blah)
                # so get just the first part
                fp = name.split('[')[0].split('.')[0]
                # treat as normal if fp is empty (an implicit positional arg),
                # a digit (an explicit positional arg) or if it is in kwargs
                # NOTE: (@asehmi) Modified to preserve the format if fp's value is None
                # NOTE(review): positional fields (`{}`/`{0}`) index into kwargs
                # here and would raise KeyError — confirm positional fields are
                # never used with this formatter.
                if (not fp or fp.isdigit() or fp in kwargs) and kwargs[fp] is not None:
                    tokens.extend([lit, '{', name, conv, spec, '}'])
                # otherwise escape the braces
                else:
                    tokens.extend([lit, '{{', name, conv, spec, '}}'])
        format_string = ''.join(tokens)  # put the string back together
        # finally call the default formatter
        return string.Formatter.vformat(self, format_string, args, kwargs)
# Prompt assets for the LLM knowledge-graph generation feature.

# Single-turn chat template; '{user_input}' is bound at call time.
MESSAGES = [
    {
        "role": "user",
        "content": "Help me understand following by describing as a detailed knowledge graph: {user_input}",
    },
]

# Forces the model to invoke the knowledge_graph function.
FUNCTION_CALL = {"name": "knowledge_graph"}

# JSON schema for graph metadata (all fields optional).
_METADATA_SCHEMA = {
    "type": "object",
    "properties": {
        "createdDate": {"type": "string"},
        "lastUpdated": {"type": "string"},
        "description": {"type": "string"},
    },
}

# JSON schema for a single node; 'color' is required so rendering can
# differentiate node categories.
_NODE_SCHEMA = {
    "type": "object",
    "properties": {
        "id": {"type": "string"},
        "label": {"type": "string"},
        "type": {"type": "string"},
        "color": {"type": "string"},
        "properties": {
            "type": "object",
            "description": "Additional attributes for the node",
        },
    },
    "required": ["id", "label", "type", "color"],
}

# JSON schema for a single directed edge; 'color' is required for rendering.
_EDGE_SCHEMA = {
    "type": "object",
    "properties": {
        "from": {"type": "string"},
        "to": {"type": "string"},
        "relationship": {"type": "string"},
        "direction": {"type": "string"},
        "color": {"type": "string"},
        "properties": {
            "type": "object",
            "description": "Additional attributes for the edge",
        },
    },
    "required": ["from", "to", "relationship", "color"],
}

# OpenAI function-calling declaration consumed by the chat completion API.
FUNCTIONS = [{
    "name": "knowledge_graph",
    "description":
        "Generate a knowledge graph with entities and relationships. "
        "Use the colors to help differentiate between different node or edge types/categories. "
        "Always provide light pastel colors that work well with black font.",
    "parameters": {
        "type": "object",
        "properties": {
            "metadata": _METADATA_SCHEMA,
            "nodes": {"type": "array", "items": _NODE_SCHEMA},
            "edges": {"type": "array", "items": _EDGE_SCHEMA},
        },
        "required": ["nodes", "edges"],
    },
}]
# CONSTANTS --------------------------------------------------------------------

# SQLite database location, assembled from its path components.
_BASE_DB_PATH = '.'
_DB_PATH = 'db'
_DB_NAME = 'gptdb.sqlite3'

DB_FILE = f'{_BASE_DB_PATH}/{_DB_PATH}/{_DB_NAME}'
DB_TABLE = 'data'

# List available models with:
# curl https://api.openai.com/v1/models -H "Content-Type: application/json" -H "Authorization: Bearer %OPENAI_API_KEY%"
# Actual model names used in app for selectors
OPENAI_MODELS_CHAT = ['gpt-4-1106-preview', 'gpt-4']
OPENAI_MODELS_COMPLETIONS = ['gpt-3.5-turbo-instruct']

# Default generation settings applied to both chat and completions calls.
DEFAULT_MODEL_CONFIG = dict(
    chat_model=OPENAI_MODELS_CHAT[0],
    completions_model=OPENAI_MODELS_COMPLETIONS[0],
    temperature=0.1,
    top_p=0.9,
    max_tokens=2048,
)

# Mapping from friendly name to actual model name.
LANG_MODELS = {
    # Friendly aliases used in app
    'gpt-4': 'gpt-4',
    'gpt-4-turbo': 'gpt-4-1106-preview',
    # Actual model names used in app
    'gpt-4-1106-preview': 'gpt-4-1106-preview',
    'gpt-3.5-turbo-instruct': 'gpt-3.5-turbo-instruct',
}

# Cost per 1000 tokens; see: https://openai.com/pricing
LANG_MODEL_PRICING = {
    # Friendly aliases used in app
    'gpt-4': {'input': 0.03, 'output': 0.06},
    'gpt-4-turbo': {'input': 0.01, 'output': 0.03},
    # Actual model names used in app
    'gpt-4-1106-preview': {'input': 0.01, 'output': 0.03},
    'gpt-3.5-turbo-instruct': {'input': 0.0015, 'output': 0.002},
}

# Document vector store backend: 'Weaviate' | 'Local'
VECTOR_STORE = 'Weaviate'

# Canned questions offered in the UI ('None' disables the selection).
SAMPLE_QUESTIONS = [
    "None",
    "Summarize the most important concepts in a high performance software application",
    "Summarize the Wardley mapping technique",
    "Summarize the Viewpoints and Perspectives software solutions design methodology",
    "What are the most important considerations when architecting a software solution",
    "Build a 5-part learning plan on how to become a software architect. Detail each part with a short description and bullet points.",
    "Machine learning model training, deployment and operations",
    "What is a knowledge graph",
    "What is a graph neural network",
    "https://en.wikipedia.org/wiki/Graph_theory",
    "Most important factors of high performing teams",
]
33 | 34 | 35 |
36 |
37 |
38 |
39 | 40 | 212 | 213 | 214 | -------------------------------------------------------------------------------- /images/a12i_logo_circle_transparent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asehmi/docs-n-data-knowledge-app/746806c3ab3bee5288a790cb9294130d6a5a86ef/images/a12i_logo_circle_transparent.png -------------------------------------------------------------------------------- /images/app-demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asehmi/docs-n-data-knowledge-app/746806c3ab3bee5288a790cb9294130d6a5a86ef/images/app-demo.gif -------------------------------------------------------------------------------- /images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asehmi/docs-n-data-knowledge-app/746806c3ab3bee5288a790cb9294130d6a5a86ef/images/favicon.ico -------------------------------------------------------------------------------- /images/snapshot-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asehmi/docs-n-data-knowledge-app/746806c3ab3bee5288a790cb9294130d6a5a86ef/images/snapshot-01.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit 2 | langchain-experimental==0.0.45 3 | llama-index==0.9.13 4 | llama-cpp-python 5 | sentence_transformers 6 | weaviate-client==3.24.1 7 | openai==1.1.2 8 | sqlalchemy 9 | debugpy 10 | openpyxl 11 | PyPDF2 12 | pypdf 13 | docx2txt 14 | PyCryptodome 15 | graphviz==0.20.1 16 | networkx==3.1 17 | beautifulsoup4==4.12.2 18 | colorama==0.4.5 19 | newspaper3k==0.2.8 20 | htmldate 21 | datefinder 22 | retry 23 | 
-------------------------------------------------------------------------------- /run_app.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | echo ============================================================ 3 | echo === OPEN A BROWSER WINDOW AT "http://[ip-address]:6974/" === 4 | echo ============================================================ 5 | streamlit run --server.port=6974 --server.headless=false app.py %1 %2 %3 %4 %5 %6 %7 %8 %9 6 | -------------------------------------------------------------------------------- /static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asehmi/docs-n-data-knowledge-app/746806c3ab3bee5288a790cb9294130d6a5a86ef/static/favicon.ico -------------------------------------------------------------------------------- /static/knowledge_graph: -------------------------------------------------------------------------------- 1 | // Knowledge Graph 2 | digraph { 3 | learning_plan [label="5-Part Learning Plan (Plan)"] 4 | architecture_objectives [label="Identify Architecture Objectives (Step)"] 5 | key_scenarios [label="Key Scenarios (Step)"] 6 | application_overview [label="Application Overview (Step)"] 7 | key_hotspots [label="Key Hotspots (Step)"] 8 | candidate_solutions [label="Candidate Solutions (Step)"] 9 | clear_objectives [label="Clear Objectives (Objective)"] 10 | precise_objectives [label="Precise Objectives (Objective)"] 11 | focus_design [label="Focus Design (Activity)"] 12 | evaluate_architectures [label="Evaluate Architectures (Activity)"] 13 | understand_application [label="Understand Application (Activity)"] 14 | connect_real_world [label="Connect to Real World (Activity)"] 15 | identify_hotspots [label="Identify Hotspots (Activity)"] 16 | create_candidate_architecture [label="Create Candidate Architecture (Activity)"] 17 | evaluate_candidate_architecture [label="Evaluate Candidate Architecture 
(Activity)"] 18 | app_arch_guide [label="Application Architecture Guide 2.0a (Resource)"] 19 | learning_plan -> architecture_objectives [label=includes] 20 | learning_plan -> key_scenarios [label=includes] 21 | learning_plan -> application_overview [label=includes] 22 | learning_plan -> key_hotspots [label=includes] 23 | learning_plan -> candidate_solutions [label=includes] 24 | architecture_objectives -> clear_objectives [label="aims to establish"] 25 | architecture_objectives -> precise_objectives [label="aims to establish"] 26 | key_scenarios -> focus_design [label=utilizes] 27 | key_scenarios -> evaluate_architectures [label=utilizes] 28 | application_overview -> understand_application [label=requires] 29 | application_overview -> connect_real_world [label=requires] 30 | key_hotspots -> identify_hotspots [label=involves] 31 | candidate_solutions -> create_candidate_architecture [label=involves] 32 | candidate_solutions -> evaluate_candidate_architecture [label=involves] 33 | learning_plan -> app_arch_guide [label="referenced by"] 34 | } 35 | -------------------------------------------------------------------------------- /static/knowledge_graph.gv: -------------------------------------------------------------------------------- 1 | // Knowledge Graph 2 | digraph { 3 | learning_plan [label="5-Part Learning Plan (Plan)"] 4 | architecture_objectives [label="Identify Architecture Objectives (Step)"] 5 | key_scenarios [label="Key Scenarios (Step)"] 6 | application_overview [label="Application Overview (Step)"] 7 | key_hotspots [label="Key Hotspots (Step)"] 8 | candidate_solutions [label="Candidate Solutions (Step)"] 9 | clear_objectives [label="Clear Objectives (Objective)"] 10 | precise_objectives [label="Precise Objectives (Objective)"] 11 | focus_design [label="Focus Design (Activity)"] 12 | evaluate_architectures [label="Evaluate Architectures (Activity)"] 13 | understand_application [label="Understand Application (Activity)"] 14 | connect_real_world 
[label="Connect to Real World (Activity)"] 15 | identify_hotspots [label="Identify Hotspots (Activity)"] 16 | create_candidate_architecture [label="Create Candidate Architecture (Activity)"] 17 | evaluate_candidate_architecture [label="Evaluate Candidate Architecture (Activity)"] 18 | app_arch_guide [label="Application Architecture Guide 2.0a (Resource)"] 19 | learning_plan -> architecture_objectives [label=includes] 20 | learning_plan -> key_scenarios [label=includes] 21 | learning_plan -> application_overview [label=includes] 22 | learning_plan -> key_hotspots [label=includes] 23 | learning_plan -> candidate_solutions [label=includes] 24 | architecture_objectives -> clear_objectives [label="aims to establish"] 25 | architecture_objectives -> precise_objectives [label="aims to establish"] 26 | key_scenarios -> focus_design [label=utilizes] 27 | key_scenarios -> evaluate_architectures [label=utilizes] 28 | application_overview -> understand_application [label=requires] 29 | application_overview -> connect_real_world [label=requires] 30 | key_hotspots -> identify_hotspots [label=involves] 31 | candidate_solutions -> create_candidate_architecture [label=involves] 32 | candidate_solutions -> evaluate_candidate_architecture [label=involves] 33 | learning_plan -> app_arch_guide [label="referenced by"] 34 | } 35 | -------------------------------------------------------------------------------- /static/knowledge_graph.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asehmi/docs-n-data-knowledge-app/746806c3ab3bee5288a790cb9294130d6a5a86ef/static/knowledge_graph.gv.pdf -------------------------------------------------------------------------------- /static/knowledge_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asehmi/docs-n-data-knowledge-app/746806c3ab3bee5288a790cb9294130d6a5a86ef/static/knowledge_graph.png 
-------------------------------------------------------------------------------- /storage/empty-for-github.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asehmi/docs-n-data-knowledge-app/746806c3ab3bee5288a790cb9294130d6a5a86ef/storage/empty-for-github.txt -------------------------------------------------------------------------------- /streamlit_debug.py: -------------------------------------------------------------------------------- 1 | # How to use: 2 | # 3 | # [1] Ensure you have `debugpy` installed: 4 | # 5 | # > pip install debugpy 6 | # 7 | # [2] In your main streamlit app: 8 | # 9 | # import streamlit_debug 10 | # streamlit_debug.set(flag=True, wait_for_client=True, host='localhost', port=8765) 11 | # 12 | # `flag=True` will initiate a debug session. `wait_for_client=True` will wait for a debug client to attach when 13 | # the streamlit app is run before hitting your next debug breakpoint. `wait_for_client=False` will not wait. 14 | # 15 | # If using VS Code, you need this config in your `.vscode/launch.json` file: 16 | # 17 | # { 18 | # // Use IntelliSense to learn about possible attributes. 19 | # // Hover to view descriptions of existing attributes. 20 | # // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 21 | # "version": "0.2.0", 22 | # "configurations": [ 23 | # { 24 | # "name": "Python: Current File", 25 | # "type": "python", 26 | # "request": "launch", 27 | # "program": "${file}", 28 | # "console": "integratedTerminal", 29 | # "env": {"DEBUG": "true"} 30 | # }, 31 | # { 32 | # "name": "Python: debugpy Remote Attach", 33 | # "type": "python", 34 | # "request": "attach", 35 | # "connect": { 36 | # "port": 8765, 37 | # "host": "127.0.0.1", 38 | # }, 39 | # "justMyCode": false, 40 | # "redirectOutput": true, 41 | # "logToFile": true, 42 | # "pathMappings": [ 43 | # { 44 | # "localRoot": "${workspaceFolder}", 45 | # "remoteRoot": "." 
# The port numbers you use need to match - in `streamlit_debug.set()` and
# `launch.json`. It should NOT be the same port that streamlit is started on.
#
# When `flag=True` and `wait_for_client=True`, you must activate the
# "Python: debugpy Remote Attach" debug session from VS Code.

import streamlit as st
import logging

# Module-level flag mirroring the last value passed to set().
_DEBUG = False

def set(flag: bool=False, wait_for_client=False, host='localhost', port=8765):
    """Activate (or skip) debugpy remote debugging for a Streamlit app.

    Safe to call on every script re-run: the debugpy listener is started at
    most once per session, tracked via st.session_state.debugging.

    Args:
        flag: When True, start (or reuse) a debugpy listener.
        wait_for_client: When True, block until a debug client attaches.
        host: Interface for the debugpy listener (not the Streamlit host).
        port: Port for the debugpy listener; must match launch.json and must
            NOT be the port Streamlit itself is served on.
    """
    global _DEBUG
    _DEBUG = flag
    try:
        # To prevent debugpy loading again and again because of
        # Streamlit's execution model, we need to track debugging state
        if 'debugging' not in st.session_state:
            st.session_state.debugging = None

        if _DEBUG and not st.session_state.debugging:
            # https://code.visualstudio.com/docs/python/debugging
            import debugpy
            if not debugpy.is_client_connected():
                debugpy.listen((host, port))
                if wait_for_client:
                    logging.info('>>> Waiting for debug client attach... <<<')
                    debugpy.wait_for_client()  # Only blocks when you always want to manually attach the debugger
                    logging.info('>>> ...attached! <<<')
                    # debugpy.breakpoint()

            if st.session_state.debugging is None:
                # FIX: use 'is None' (identity) rather than '== None'
                logging.info('>>> Remote debugging activated (host=%s, port=%s) <<<', host, port)
            st.session_state.debugging = True

        if not _DEBUG:
            if st.session_state.debugging is None:
                logging.info('>>> Remote debugging is NOT active <<<')
            st.session_state.debugging = False
    except Exception:
        # Best-effort: remote debugging is unavailable in some deployments
        # (e.g. cloud); log and continue rather than crash the app.
        # FIX: was a bare 'except:' which also swallowed SystemExit and
        # KeyboardInterrupt and hid all failures silently.
        logging.exception('streamlit_debug.set() failed; continuing without debugger')