├── .gitignore ├── .streamlit ├── config-BLACK.toml ├── config-DARK.toml ├── config.toml └── secrets.toml.sample ├── .vscode ├── launch.json └── settings.json ├── LICENSE ├── README.md ├── app.py ├── app_about.py ├── app_llm_data_query.py ├── app_llm_docs_query.py ├── app_llm_knowlege_graph_gen.py ├── app_state.py ├── common.py ├── data ├── CustomMacroModel_L_A.csv └── GCFS Countries.xlsx ├── db └── empty-for-github.txt ├── docs └── cdaniel-future-is-predictable-master.pdf ├── func_prompt.py ├── globals.py ├── graph_frontend ├── __init__.py └── index.html ├── images ├── a12i_logo_circle_transparent.png ├── app-demo.gif ├── favicon.ico └── snapshot-01.png ├── requirements.txt ├── run_app.cmd ├── static ├── favicon.ico ├── knowledge_graph ├── knowledge_graph.gv ├── knowledge_graph.gv.pdf └── knowledge_graph.png ├── storage └── empty-for-github.txt └── streamlit_debug.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | /.streamlit/secrets.toml 162 | /_ARCHIVE/docs 163 | /backup_this_project.cmd 164 | /weaviate_connect_test.py 165 | /langchain_streaming_test.py 166 | /images/a12i_logo_block_circle.png 167 | /images/a12i_logo_block_circle_transparent.png 168 | /images/a12i_logo_circle.png 169 | /docs/AppArchGuide2.0.pdf 170 | /docs/DesignPrincipleSummaries.doc 171 | /docs/InformationSystemsDesignPrinciples.docx 172 | /docs/Service Architecture Pocket Guide.pdf 173 | /docs/_VPandP_Reference.pdf 174 | /db/*.sqbpro 175 | /storage/docstore.json 176 | /storage/index_store.json 177 | /storage/vector_store.json 178 | /db/*.sqlite3 179 | /NOTES.md 180 | -------------------------------------------------------------------------------- /.streamlit/config-BLACK.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | primaryColor = "#7792E3" 3 | backgroundColor = "black" 4 | secondaryBackgroundColor = "black" 5 | textColor = "#DCDCDC" 6 | font = "monospace" 7 | 8 | -------------------------------------------------------------------------------- /.streamlit/config-DARK.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | # Primary accent for interactive elements 3 | primaryColor = '#FF4B4B' 4 | 5 | # Background color for the main content area 6 | backgroundColor = '#0E1117' 7 | 8 | # Background color for sidebar and most interactive widgets 9 | secondaryBackgroundColor = '#262730' 10 | 11 | # Color used for almost all text 12 | textColor = '#FAFAFA' 13 | 14 | # Font family for all text in the app, except code blocks 15 | # Accepted values (serif | sans serif | monospace) 16 | # Default: "sans serif" 17 | font = "sans serif" -------------------------------------------------------------------------------- /.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | base = "light" 3 | font = "sans serif" 4 | 
-------------------------------------------------------------------------------- /.streamlit/secrets.toml.sample: -------------------------------------------------------------------------------- 1 | IS_CLOUD_DEPLOYMENT='true' # 'true' = deployed on st cloud | 'false' = deployed locally 2 | OPENAI_API_KEY='' 3 | WEAVIATE_API_KEY='' 4 | WEAVIATE_URL='https://.weaviate.network' 5 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python: Current File", 9 | "type": "python", 10 | "request": "launch", 11 | "program": "${file}", 12 | "console": "integratedTerminal", 13 | "args": [] 14 | }, 15 | { 16 | "name": "Python Streamlit", 17 | "type": "python", 18 | "request": "launch", 19 | "module": "streamlit", 20 | "args": ["run", "${file}", "--server.port", "6974"], 21 | "justMyCode": true, 22 | "redirectOutput": true, 23 | "logToFile": true, 24 | "pathMappings": [ 25 | { 26 | "localRoot": "${workspaceFolder}", 27 | "remoteRoot": "." 28 | } 29 | ] 30 | }, 31 | { 32 | "name": "Python: debugpy Remote Attach", 33 | "type": "python", 34 | "request": "attach", 35 | "connect": { 36 | "port": 7777, 37 | "host": "127.0.0.1", 38 | }, 39 | "justMyCode": false, 40 | "redirectOutput": true, 41 | "logToFile": true, 42 | "pathMappings": [ 43 | { 44 | "localRoot": "${workspaceFolder}", 45 | "remoteRoot": "." 
46 | } 47 | ] 48 | // "debugAdapterPath": "${workspaceFolder}/src/debugpy/adapter", 49 | } 50 | ] 51 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "git.ignoreLimitWarning": true, 3 | "docwriter.style": "Auto-detect", 4 | "python.analysis.typeCheckingMode": "off", 5 | "python.defaultInterpreterPath": "C:\\ProgramData\\Anaconda3\\python.exe", 6 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Arvindra 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # LLM | DOC Q&A | KNOWLEDGE GRAPH | EXCEL DATA CHAT 3 | > _Integrated LLM-based document and data Q&A with knowledge graph visualization_ 4 | 5 | > Arvindra Sehmi, A12i (CloudOpti Ltd.) | [LinkedIn](https://www.linkedin.com/in/asehmi/) 6 | 7 | > Updated: 9 December, 2023 8 | 9 | --- 10 | 11 | ### Introduction 12 | 13 | I built this app because I'm writing some chapters for an upcoming book on Streamlit. This app helps me digest a large quantity of information from articles and documents I have on the subject of Software Architecture. I wanted to be able to ask questions about the documents and get answers, and also to visualize the answers in a knowledge graph. I also wanted to upload Excel files and ask questions about the data in the files. 14 | 15 | The application is a typical LLM application, with the addition of a knowledge graph visualization. The app is built in Python using Streamlit. I was inspired by [instagraph](https://github.com/yoheinakajima/instagraph) and re-implemented its graph plot as a Streamlit custom component. I use the Weaviate Cloud (vector) Store (WCS) for document and data indexing. OpenAI, LangChain, and LlamaIndex LLM programming frameworks play an important role too. The application supports local filestore indexing in addition to WCS. OpenAI embeddings are used and the OpenAI API is called, directly or via the other LLM frameworks, for question answering. Hence, you will need an OpenAI API key to use the application. Various LLM models are used for question answering, including the GPT-4-Turbo and GPT-4 models. They are used for bot chat and completions. Token usage is tracked and costs are estimated. 16 | 17 | The application is deployed on Streamlit Cloud. When deployed in the cloud, the application uses WCS. 
When deployed locally, the application can be configured to use LlamaIndex to store its index in the local file system. 18 | 19 | ![snapshot](./images/snapshot-01.png) 20 | 21 | ### Streamlit App Demo 22 | 23 | In this demo: 24 | 25 | 1. The user selects or enters a question to query over documents or data which have been indexed into Weaviate (a cloud-based vector store) 26 | 2. The app displays the question answer and generates a knowledge graph to complement the answer 27 | 3. The user can upload an Excel file which can be displayed and queried using natural language 28 | 4. The app allows the user to enter their OpenAI API key and select the model(s) to use for question answering 29 | 5. The app displays a per-query cost estimate and a running total of the cost of the queries 30 | 31 | ![st_demo](./images/app-demo.gif) 32 | 33 | ### Try the demo app yourself 34 | 35 | The application can be seen running in the Streamlit Cloud at the link below: 36 | 37 | [![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://docs-n-data-knowledge-app.streamlit.app/) 38 | 39 | **NOTE:** You will need to enter your own OpenAI API key. The key is ephemeral and not stored permanently in the application. Once entered, the API Key input box will be hidden and you can start using the app. To re-enter the API Key, a button is provided to clear the current key from memory, after which you can re-enter another key. 40 | 41 | ### Installation 42 | 43 | Ensure you have installed package requirements with the commands: 44 | 45 | ```bash 46 | # change to the Streamlit app folder, e.g. 47 | cd ./docs-n-data-knowledge-app 48 | pip install -r requirements.txt 49 | ``` 50 | 51 | **Important:** Modify the `secrets.toml` file in the application `.streamlit` root based on the example available in `secrets.toml.sample`. 
52 | 53 | ```bash 54 | OPENAI_API_KEY='' 55 | WEAVIATE_API_KEY='' 56 | WEAVIATE_URL='https://.weaviate.network' 57 | IS_CLOUD_DEPLOYMENT='true' # 'true' = deployed on st cloud | 'false' = deployed locally 58 | ``` 59 | 60 | In `globals.py` you can change the following variables to affect application behaviour: 61 | 62 | ```python 63 | # See: https://openai.com/pricing 64 | LANG_MODEL_PRICING = { 65 | # Friendly aliases used in app 66 | 'gpt-4': {'input': 0.03, 'output': 0.06}, # per 1000 tokens 67 | 'gpt-4-turbo': {'input': 0.01, 'output': 0.03}, # per 1000 tokens 68 | # Actual model names used in app 69 | 'gpt-4-1106-preview': {'input': 0.01, 'output': 0.03}, # per 1000 tokens 70 | 'gpt-3.5-turbo-instruct': {'input': 0.0015, 'output': 0.002}, # per 1000 tokens 71 | } 72 | 73 | VECTOR_STORE = 'Weaviate' # 'Weaviate' | 'Local' 74 | 75 | # Sample questions for the Document Q&A functionality, based on the topic of _my_ indexed documents 76 | SAMPLE_QUESTIONS = [ 77 | "None", # required 78 | "Summarize the most important concepts in a high performance software application", 79 | "Summarize the Wardley mapping technique", 80 | # : 81 | # ETC. 82 | # : 83 | "Most important factors of high performing teams", 84 | ] 85 | ``` 86 | 87 | Now run Streamlit with `app.py`: 88 | 89 | ```bash 90 | # I prefer to set the port number too 91 | streamlit run --server.port 4010 app.py 92 | ``` 93 | 94 | **NOTE:** Whilst there is some clean-up of the structured data expected in the LLM responses, LLMs don't always return data you expect. You might therefore encounter errors. If you do, try changing the LLM model selected and re-run your queries. 95 | 96 | ### TODO 97 | 98 | - Possibly, remove the data page functionality from app and create a separate project for it 99 | - Implement file upload document Q&A 100 | 101 | --- 102 | 103 | If you enjoyed this app, please consider starring this repository. 104 | 105 | Thanks! 
106 | 107 | Arvindra -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import streamlit as st 4 | 5 | import streamlit_debug 6 | streamlit_debug.set(flag=True, wait_for_client=False, host='localhost', port=7777) 7 | 8 | st.set_page_config(page_title='ChatGPT Pandas CSV Streamlit App', page_icon='🤖', initial_sidebar_state='expanded', layout='wide') 9 | # Remove blank space between top of page and content 10 | st.markdown("", unsafe_allow_html=True) 11 | st.markdown(" ", unsafe_allow_html=True) 12 | 13 | from app_state import (state, init_app_state, reset_app_state, _set_state_cb) 14 | init_app_state() # ensure all state variables are initialized 15 | 16 | from globals import SAMPLE_QUESTIONS 17 | 18 | logging.basicConfig(level=logging.INFO) 19 | 20 | # APP CALLBACKS ----------------------------------------------------------------- 21 | 22 | def _set_openai_api_key_cb(): 23 | if not state.text_input_openai_api_key.startswith('sk-'): 24 | st.warning('Please enter your OpenAI API key!', icon='⚠') 25 | return 26 | state.openai_api_key = state.text_input_openai_api_key 27 | os.environ['OPENAI_API_KEY'] = state.openai_api_key 28 | 29 | def _clear_openai_api_key_cb(): 30 | state.openai_api_key = '' 31 | os.environ['OPENAI_API_KEY'] = state.openai_api_key 32 | 33 | # DATA CHAT PAGE ---------------------------------------------------------------- 34 | 35 | def _openai_api_key_guard(): 36 | # Guardrail for API Key 37 | if not state.openai_api_key: 38 | st.error('🔑 Please enter your OpenAI API Key in the settings sidebar. 🔑') 39 | st.info( 40 | 'This value is ephemeral and not stored permanently.\n\n' 41 | 'Once entered, the API Key input box will be removed, and you can start using the app.\n\n' 42 | 'To re-enter the API Key, click the global settings button to clear the current key from memory.' 
43 | ) 44 | with st.sidebar: 45 | # api key 46 | st.text_input( 47 | '🔑 OpenAI API Key', 48 | value=state.openai_api_key, 49 | placeholder='sk-...', 50 | type='password', 51 | on_change=_set_openai_api_key_cb, 52 | help='Enter your OpenAI API Key', 53 | key='text_input_openai_api_key' 54 | ) 55 | st.stop() 56 | 57 | # Guardrail for API Key 58 | _openai_api_key_guard() 59 | 60 | # Once past the guardrails, import the rest of the app which depends on OpenAI API key 61 | import app_llm_data_query, app_llm_docs_query, app_llm_knowlege_graph_gen, app_about 62 | 63 | def start(): 64 | # Sidebar 65 | with st.sidebar: 66 | st.image('./images/a12i_logo_circle_transparent.png') 67 | top_level_options = ['Document Q&A | Knowedge Graph', 'Data Chat', 'About'] 68 | st.subheader('What would you like to do?') 69 | top_level = st.radio( 70 | 'What would you like to do?', 71 | top_level_options, index=0, 72 | label_visibility='collapsed', horizontal=False 73 | ) 74 | 75 | # Document Q&A | Knowledge Graph 76 | if top_level == top_level_options[0]: 77 | c1, _ = st.columns([1, 1.5]) 78 | with c1: 79 | # Title and description 80 | st.subheader('Document Q&A ❣️ Knowledge Graph') 81 | st.caption( 82 | '📑 Ask a question based on pre-uploaded documents on the subject of **Software Architecture**. You can ask questions on any topic ' 83 | 'in as much detail as you like. For your convenience, some sample questions are provided below.' 
84 | ) 85 | c1, _, c3, _ = st.columns([1, 0.075, 1, 1.5]) 86 | with c1: 87 | st.markdown('### **1️⃣ Ask a question**') 88 | user_input = st.text_input( 89 | "Enter question here...", 90 | placeholder="Enter text 🖋️ or URL 🔗", 91 | label_visibility="collapsed", 92 | key="user_text_input" 93 | ) 94 | example_selection = st.selectbox( 95 | "📑 You can choose a sample question here instead", 96 | options=SAMPLE_QUESTIONS, 97 | index=0, 98 | key="examples_selectbox" 99 | ) 100 | 101 | with c3: 102 | user_input_confirmed = False 103 | include_knowledge_graph = False 104 | radio_options = [user_input, example_selection] if user_input and (user_input != example_selection) else ([example_selection] if example_selection != "None" else []) 105 | if radio_options: 106 | st.markdown('### **2️⃣ Confirm your question**') 107 | with st.form(key="confirm_input_form"): 108 | st.radio( 109 | "Confirm input", options=radio_options, 110 | label_visibility="collapsed", 111 | horizontal=True, 112 | key="confirm_input" 113 | ) 114 | c1, c2, _ = st.columns([1, 1, 1.5]) 115 | with c1: 116 | user_input_confirmed = st.form_submit_button( 117 | label="Confirm and get answer", type='primary', 118 | on_click=_set_state_cb, kwargs={ 119 | 'user_input': "confirm_input", 120 | 'estimated_cost_doc': 'estimated_cost_reset', 121 | 'estimated_cost_graph': 'estimated_cost_reset', 122 | } 123 | ) 124 | with c2: 125 | include_knowledge_graph = st.checkbox('Include knowledge graph', value=False) 126 | 127 | if state.user_input: 128 | st.markdown(f'###### ✅ Confirmed question: _{state.user_input}_') 129 | st.markdown(f'###### ✅ Include knowledge graph: _{include_knowledge_graph}_') 130 | else: 131 | st.markdown('###### ❌ No question confirmed yet') 132 | 133 | st.markdown('---') 134 | 135 | c1, _, c3 = st.columns([1.5, 0.25, 1]) 136 | with c1: 137 | response = app_llm_docs_query.main('Document Q&A', user_input_confirmed) 138 | with c3: 139 | if include_knowledge_graph: 140 | 
app_llm_knowlege_graph_gen.main('Knowledge Graph', user_input_confirmed, response) 141 | 142 | # Simple Excel Data Q&A 143 | if top_level == top_level_options[1]: 144 | c1, _ = st.columns([1, 2]) 145 | with c1: 146 | st.subheader('🔢 Simple Excel Data Q&A') 147 | app_llm_data_query.main('Data Chat') 148 | 149 | # About / Display README.md 150 | if top_level == top_level_options[2]: 151 | st.subheader('📖 Readme') 152 | app_about.main() 153 | 154 | with st.sidebar: 155 | st.markdown('---') 156 | 157 | with st.expander('#### Cost Estimation', expanded=True): 158 | st.markdown(f'**Cumulative: ${state.cumulative_cost:.2f}**') 159 | st.markdown(f'Data query: ${state.estimated_cost_data:.2f}') 160 | st.markdown(f'Doc query: ${state.estimated_cost_doc:.2f}') 161 | st.markdown(f'Graph query: ${state.estimated_cost_graph:.2f}') 162 | 163 | st.markdown('#### Global Settings') 164 | if st.button('Reset app state', type='primary', help='Clear results cache and app state (optional). Will clear cost estimations too!'): 165 | reset_app_state() 166 | app_llm_data_query.get_llm_data_query_response.clear() 167 | app_llm_docs_query.get_llm_doc_query_response.clear() 168 | app_llm_knowlege_graph_gen.get_llm_graph_data_response.clear() 169 | st.experimental_rerun() 170 | st.button('Clear OpenAI API key', on_click=_clear_openai_api_key_cb, type='primary', help='Clear OpenAI API key (optional)') 171 | 172 | with st.expander('Debug State (excluding private keys)', expanded=False): 173 | display_state = {k: v for k, v in state.items() if not ('openai' in k or 'weaviate' in k)} 174 | st.write(display_state) 175 | 176 | st.subheader('About') 177 | st.sidebar.info('Integrated LLM-based document and data Q&A with knowledge graph visualization.\n\n' + \ 178 | '(c) 2023. A12i (CloudOpti Ltd.) 
All rights reserved.') 179 | 180 | if __name__ == '__main__': 181 | start() 182 | -------------------------------------------------------------------------------- /app_about.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | def main(): 4 | c1, _, c3, _ = st.columns([2,0.25,1,1]) 5 | with c1: 6 | with open('./README.md', 'r', encoding='utf-8') as f: 7 | readme_lines = f.readlines() 8 | readme_buffer = [] 9 | for line in readme_lines: 10 | if '![snapshot](./images/snapshot-01.png)' in line: 11 | st.markdown(' '.join(readme_buffer)) 12 | st.image('./images/snapshot-01.png') 13 | readme_buffer.clear() 14 | elif '![st_demo](./images/app-demo.gif)' in line: 15 | st.markdown(' '.join(readme_buffer)) 16 | st.image('./images/app-demo.gif') 17 | readme_buffer.clear() 18 | else: 19 | readme_buffer.append(line) 20 | st.markdown(' '.join(readme_buffer), unsafe_allow_html=True) 21 | 22 | with c3: 23 | st.markdown(''' 24 | ### About 🎈Streamlit 25 | 26 | Streamlit is a Python library that allows the creation of interactive, data-driven web applications in Python. 27 | [Streamlit](https://streamlit.io) is an open-source app framework for Machine Learning and Data Science teams. 28 | You can create beautiful data apps in minutes, not weeks. All in pure Python. It's not just for Data Science, though. 29 | 30 | With its component extensibility architecture, you can build and integrate most kinds of web frontends into Streamlit apps. 31 | 32 | Streamlit is fast-becoming a de facto standard for building Generative AI and LLM apps in Python. 
33 | 34 | ##### Resources 35 | 36 | - [Build powerful generative AI apps with Streamlit](https://streamlit.io/generative-ai) 37 | - [Streamlit Documentation](https://docs.streamlit.io/) 38 | - [Streamlit Blog](https://blog.streamlit.io/) 39 | - [Cheat sheet](https://docs.streamlit.io/library/cheatsheet) 40 | - [Book](https://www.amazon.com/dp/180056550X) (Getting Started with Streamlit for Data Science) 41 | - [Blog](https://blog.streamlit.io/how-to-master-streamlit-for-data-science/) (How to master Streamlit for data science) 42 | 43 | ##### Deploy 44 | 45 | Once you've created an app you can use the [Community Cloud](https://streamlit.io/cloud) to deploy, manage, and share your app, in just a few clicks. ''') 46 | -------------------------------------------------------------------------------- /app_llm_data_query.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sqlite3 3 | import pandas as pd 4 | # from sqlalchemy import create_engine 5 | from sqlalchemy.pool import StaticPool 6 | from langchain.callbacks import get_openai_callback 7 | from langchain.llms import OpenAI 8 | from langchain.utilities.sql_database import SQLDatabase 9 | from langchain_experimental.sql import SQLDatabaseChain 10 | import retry 11 | import logging 12 | 13 | logging.basicConfig(level=logging.ERROR) 14 | 15 | import streamlit as st 16 | 17 | from globals import ( 18 | DB_FILE, OPENAI_MODELS_COMPLETIONS, 19 | DEFAULT_MODEL_CONFIG, LANG_MODEL_PRICING 20 | ) 21 | from app_state import (state, init_app_state, _set_state_cb) 22 | init_app_state() # ensure all state variables are initialized 23 | 24 | # DATA ------------------------------------------------------------------------- 25 | 26 | @st.cache_data(persist='disk') 27 | def csv_to_df(excel_file): 28 | df = pd.read_csv(excel_file) 29 | return df 30 | 31 | @st.cache_data(persist='disk') 32 | def excel_to_df(excel_file): 33 | # 
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_excel.html 34 | # New in Pandas version 1.3.0. 35 | # The engine xlrd now only supports old-style .xls files. When engine=None, the following logic will be used to determine the engine: 36 | # If path_or_buffer is an OpenDocument format (.odf, .ods, .odt), then odf will be used. 37 | # Otherwise if path_or_buffer is an xls format, xlrd will be used. 38 | # Otherwise if path_or_buffer is in xlsb format, pyxlsb will be used. 39 | # Otherwise openpyxl will be used. 40 | # 41 | # import openpyxl 42 | # df = pd.read_excel(excel_file, engine=openpyxl) 43 | # 44 | # Therefore... do not need to provide "engine" when using a "path_or_buffer" 45 | df = pd.read_excel(excel_file, engine='openpyxl') 46 | return df 47 | 48 | def prepare_data(df): 49 | df.columns = [x.replace(' ', '_').lower() for x in df.columns] 50 | return df 51 | 52 | @st.cache_resource() 53 | def db_connection(): 54 | return sqlite3.connect(DB_FILE , check_same_thread=False) 55 | 56 | @st.cache_resource() 57 | def sql_database(table): 58 | # create db engine 59 | # eng = create_engine( 60 | # url=f'sqlite:///file:{DB_FILE}&cache=shared', 61 | # poolclass=StaticPool, # single connection for requests 62 | # creator=lambda: db_connection(), 63 | # ) 64 | # db = SQLDatabase(engine=eng) 65 | 66 | db = SQLDatabase.from_uri( 67 | database_uri = f'sqlite:///file:{DB_FILE}&cache=shared', 68 | include_tables=[table], # we include only one table to save tokens in the prompt :) 69 | sample_rows_in_table_info=2, # we only need 2 rows to get the table info 70 | engine_args={'poolclass': StaticPool, 'creator': lambda: db_connection()}, 71 | ) 72 | return db 73 | 74 | # OPENAI DATA QUERY ------------------------------------------------------------ 75 | 76 | # create OpenAI LLM connection 77 | # NOTE: relies on environment key in case you want to 78 | # remove entering the key in the app 79 | def get_llm( 80 | model_name: str = 
def get_llm(
        model_name: str = DEFAULT_MODEL_CONFIG['completions_model'],
        temperature: float = DEFAULT_MODEL_CONFIG['temperature'],
        top_p: float = DEFAULT_MODEL_CONFIG['top_p'],
        max_tokens: int = DEFAULT_MODEL_CONFIG['max_tokens'],
        max_retries: int = 3,
        streaming: bool = False,
):
    """Build a LangChain OpenAI completions client.

    The API key is read from the OPENAI_API_KEY environment variable; all
    sampling settings default to the app-wide DEFAULT_MODEL_CONFIG.
    """
    return OpenAI(
        openai_api_key=os.environ['OPENAI_API_KEY'],
        model_name=model_name,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        max_retries=max_retries,
        streaming=streaming,
    )

@retry.retry(tries=2, delay=5, backoff=3, jitter=(1, 5), max_delay=60, logger=logging.getLogger("LLM DATA QUERY (get_llm_data_query_response)"))
def get_llm_data_query_response(query, table, model_name=DEFAULT_MODEL_CONFIG['completions_model'], intermediate_steps=False, limit=3):
    """Answer a plain-English data query against `table` via SQLDatabaseChain.

    Parameters
    ----------
    query : str
        Natural-language question to translate into SQL. Must be non-empty.
    table : str
        Name of the SQL table to query.
    model_name : str
        OpenAI completions model to use.
    intermediate_steps : bool
        When True, return the full chain output (a dict including the
        intermediate SQL steps); otherwise return just the answer string.
    limit : int
        Maximum number of rows the generated SQL may return (chain `top_k`).

    Returns
    -------
    dict | str
        Chain result: a dict when `intermediate_steps` is True, else a string.

    Raises
    ------
    ValueError
        If `query` is empty.
    """
    if not query:
        # BUG FIX: previously an empty query fell through the `if query:` guard
        # and `result` was referenced while unbound (UnboundLocalError).
        raise ValueError('query must be a non-empty string')

    model_config = {
        'model_name': model_name,
        'temperature': 0,     # override settings = do not hallucinate!
        'top_p': state.top_p,
        'max_tokens': 2000,   # override settings
    }
    llm = get_llm(**model_config)

    # create SQLDatabaseChain LLM connection
    db_chain = SQLDatabaseChain.from_llm(
        llm=llm, db=sql_database(table), verbose=True,
        # use_query_checker=True,
        return_intermediate_steps=intermediate_steps,
        top_k=limit
    )

    # run query, counting tokens for the cost estimate
    with get_openai_callback() as token_counter:
        # BUG FIX: honour the `intermediate_steps` parameter rather than the
        # global `state.intermediate_steps` — the caller passes it explicitly
        # and the chain above was already configured from the parameter.
        if intermediate_steps:
            result = db_chain(query)
        else:
            result = db_chain.run(query)

        print('---- Data SQL Query ----', '\n',
              'LLM Prompt Tokens:', token_counter.prompt_tokens, '\n',
              'LLM Completion Tokens:', token_counter.completion_tokens, '\n',
              'Total LLM Token Count:', token_counter.total_tokens)

        # NOTE(review): pricing is keyed on state.completions_model — assumes it
        # always matches `model_name`; confirm callers cannot diverge.
        estimated_cost = ((token_counter.prompt_tokens / 1000.0) * LANG_MODEL_PRICING[state.completions_model]['input']) + \
                         ((token_counter.completion_tokens / 1000.0) * LANG_MODEL_PRICING[state.completions_model]['output'])
        print('Data SQL Query Estimated Cost: $', estimated_cost)
        state.estimated_cost_data = estimated_cost
        state.cumulative_cost += estimated_cost

    return result

# DATA CHAT PAGE ----------------------------------------------------------------

def main(title):
    """Render the data-chat page: model settings, file upload and query form."""
    # Sidebar
    with st.sidebar:
        st.markdown(f'#### {title} Settings')
        st.selectbox(
            'OpenAI model', options=OPENAI_MODELS_COMPLETIONS,
            on_change=_set_state_cb, kwargs={'completions_model': 'selectbox_data_completions_model_name'},
            index=OPENAI_MODELS_COMPLETIONS.index(state.completions_model),
            help='Allowed models. Accuracy, speed, token consumption and costs will vary.',
            key='selectbox_data_completions_model_name'
        )
        # results limit
        st.number_input(
            'Results limit', value=state.limit, min_value=1, max_value=10, step=1,
            on_change=_set_state_cb, kwargs={'limit': 'number_input_limit'},
            help='Limit the number of results returned, which can improve performance and save OpenAI costs',
            key='number_input_limit'
        )

    # Body
    st.subheader('Upload Data')
    excel_file = st.file_uploader('Choose an Excel file on your computer', type=['xlsx', 'csv'], accept_multiple_files=False)
    if excel_file is None:
        return

    # CSV uploads may arrive under several MIME types depending on browser/OS
    if excel_file.type in ['application/vnd.ms-excel', 'application/octet-stream', 'text/csv']:
        df = csv_to_df(excel_file)
    else:  # 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        df = excel_to_df(excel_file)

    if st.checkbox('Show Data', value=False):
        st.dataframe(df)

    # commit data to sql (replace any previous upload)
    data = prepare_data(df)
    data.to_sql(state.db_table, db_connection(), if_exists='replace', index=False)

    st.subheader('Query Data')
    with st.form(key='data_chat_form', clear_on_submit=False):
        # user query
        st.text_input(
            'Enter a data query in plain English', value=state.query,
            help='Enter a question based on the uploaded dataset. Add as much detail as you like. '
                 'E.g., "What is X of Y in the table. Limit to 10 results, and format as JSON showing X and Y values only."',
            key='text_input_query_data'
        )
        st.checkbox(
            'Show Intermediate Steps', value=state.intermediate_steps,
            key='checkbox_intermediate_steps'
        )
        apply_query = st.form_submit_button(
            label='Ask', type='primary',
            on_click=_set_state_cb, kwargs={
                'intermediate_steps': 'checkbox_intermediate_steps',
                'query': 'text_input_query_data',
                'estimated_cost_data': 'estimated_cost_reset',
            },
        )

    if apply_query and state.query and state.openai_api_key:
        # Constrain the model to the uploaded columns and a bare SQL answer
        query = state.query + f' Strictly use only these data columns "{list(data.columns)}". ' + \
                'Do not wrap the SQL statement in quotes. Do not embelish the answer with any additional text.'
        result = get_llm_data_query_response(
            query, state.db_table,
            model_name=state.completions_model,
            intermediate_steps=state.intermediate_steps,
            limit=state.limit
        )
        if state.intermediate_steps:
            with st.expander('Intermediate Steps', expanded=False):
                st.write(state.completions_model)
                st.write(result['intermediate_steps'])
            st.text(result['result'])
        else:
            st.text(result)
    elif apply_query and not state.query:
        st.info('Please enter a query above.')
@st.cache_data(ttl=60*60, show_spinner=False)
def get_llm_doc_query_response(
    query_prompt, model_name: str = DEFAULT_MODEL_CONFIG['completions_model'],
    _service_context=ServiceContext.from_defaults()
):
    """Query the document vector index and return the answer text.

    Results are cached by Streamlit for an hour, keyed on `query_prompt` and
    `model_name` (the underscore prefix excludes `_service_context` from the
    cache key).

    NOTE(review): the `_service_context` default is evaluated once at import
    time and shared across calls — confirm this is intended.
    """
    # load index
    # LOCAL STORE
    if VECTOR_STORE == 'Local':
        # rebuild storage context from the on-disk persistence directory
        storage_context = StorageContext.from_defaults(persist_dir='./storage')
        index = load_index_from_storage(storage_context)

    # WEAVIATE CLOUD STORE
    elif VECTOR_STORE == 'Weaviate':
        vector_store = WeaviateVectorStore(weaviate_client=wc, index_name="Documents", text_key="content")
        # set up the index
        index = VectorStoreIndex.from_vector_store(vector_store=vector_store, service_context=_service_context)

    else:
        raise ValueError(f'Unknown vector store {VECTOR_STORE}')

    # get query engine over the index and run the query
    query_engine = index.as_query_engine()
    response = query_engine.query(query_prompt)
    # normalize bullet characters for markdown rendering
    response = response.response.replace('•', '*')
    return response

def main(title, user_input_confirmed=False):
    """Render the document Q&A page; return the LLM response when one is made."""
    # Count token usage for cost estimation
    token_counter = TokenCountingHandler(
        tokenizer=tiktoken.encoding_for_model(state.completions_model).encode,
        verbose=False  # set to true to see usage printed to the console
    )
    callback_manager = CallbackManager([token_counter])
    service_context = ServiceContext.from_defaults(callback_manager=callback_manager)

    def _index_documents():
        # (Re)build the vector index over everything in ./docs
        documents = SimpleDirectoryReader('docs').load_data()

        # LOCAL STORE
        # NOTE: Disallow if cloud deployment (temporary fix for public demo and/or if you
        # don't have required file permissions or disk space)
        if not json.loads(st.secrets['IS_CLOUD_DEPLOYMENT']) and VECTOR_STORE == 'Local':
            # construct an index over these documents... saved in memory
            index = VectorStoreIndex.from_documents(documents, show_progress=True, service_context=service_context)
            # save index on disk
            index.storage_context.persist(persist_dir='./storage')

        # WEAVIATE CLOUD STORE
        elif VECTOR_STORE == 'Weaviate':
            # drop and recreate the schema class before re-indexing
            wc.schema.delete_class("Documents")
            class_obj = {
                "class": "Documents",
                "vectorizer": "text2vec-openai",
                "moduleConfig": {
                    "text2vec-openai": {},
                    "generative-openai": {}
                }
            }
            wc.schema.create_class(class_obj)
            # chunk up the documents into nodes
            parser = SimpleNodeParser.from_defaults(chunk_size=1024, chunk_overlap=20)
            nodes = parser.get_nodes_from_documents(documents, show_progress=True)
            # construct vector store
            vector_store = WeaviateVectorStore(weaviate_client=wc, index_name="Documents", text_key="content")
            # setting up the storage for the embeddings
            storage_context = StorageContext.from_defaults(vector_store=vector_store)
            # set up the index
            index = VectorStoreIndex(nodes, storage_context=storage_context, show_progress=True, service_context=service_context)

        else:
            raise ValueError(f'Unknown vector store {VECTOR_STORE}')

        print('---- Document Q&A ----', '\n',
              'Indexing Embedding Tokens: ', token_counter.total_embedding_token_count, '\n')

    with st.sidebar:
        st.markdown(f'#### {title} Settings')
        st.selectbox(
            'OpenAI model', options=OPENAI_MODELS_COMPLETIONS,
            on_change=_set_state_cb, kwargs={'completions_model': 'selectbox_docs_completions_model_name'},
            index=OPENAI_MODELS_COMPLETIONS.index(state.completions_model),
            help='Allowed models. Accuracy, speed, token consumption and costs will vary.',
            key='selectbox_docs_completions_model_name'
        )
        include_history = st.checkbox('Include history in prompts', value=False)
        if st.button('Clear history'):
            state.questions = []
            state.past = []
            # BUG FIX: also clear the generated (question, answer) pairs so all
            # three history lists stay in sync.
            state.generated = []
        # NOTE: Hide indexing button if cloud deployment (temporary fix for public demo)
        if not json.loads(st.secrets['IS_CLOUD_DEPLOYMENT']) and st.button('Index documents'):
            with st.spinner("Indexing..."):
                _index_documents()

    # GPT completion models can not handle web sites, so we scrape the URL in the user input
    user_input = state.user_input
    if user_input.strip().startswith('http'):
        scraped_texts = scrape_articles([user_input])['text']
        user_input = scraped_texts[0] if scraped_texts else user_input
    user_input = user_input.replace('\n', ' ').replace('\r', '') if user_input else user_input

    if include_history:
        context = '\n\n'.join([f'| Question: "{q}" | Answer: "{a}" |' for q, a in zip(state.questions, state.past)])
        refinement = \
            'Finally, return results in markdown text, include bullet point format where appropriate. ' + \
            'Add additional web links at the end of the response if this is useful.'
        prompt_template = "Given this context ### {context} ###. Answer or summarize this: ### {doc_query} ###. {refinement}"
        prompt = PromptTemplate(input_variables=['context', 'doc_query', 'refinement'], template=prompt_template)
        query_prompt = prompt.format(context=context, doc_query=user_input, refinement=refinement)
    else:
        refinement = \
            'Return results in markdown text, include bullet point format where appropriate. ' + \
            'Add additional web links at the end of the response if this is useful.'
        prompt_template = "Answer or summarize this: ### {doc_query} ###. {refinement}"
        prompt = PromptTemplate(input_variables=['doc_query', 'refinement'], template=prompt_template)
        query_prompt = prompt.format(doc_query=user_input, refinement=refinement)

    if user_input_confirmed and state.user_input:
        with st.spinner("Generating query answer..."):
            try:
                # warm the cache so the display path below hits it
                response = get_llm_doc_query_response(query_prompt, model_name=state.completions_model, _service_context=service_context)
                print('---- Document Q&A ----', '\n',
                      'Embedding Tokens: ', token_counter.total_embedding_token_count, '\n',
                      'LLM Prompt Tokens: ', token_counter.prompt_llm_token_count, '\n',
                      'LLM Completion Tokens: ', token_counter.completion_llm_token_count, '\n',
                      'Total LLM Token Count: ', token_counter.total_llm_token_count)
            except Exception as ex:
                # FIX: removed redundant f-prefix (no placeholders in message)
                st.warning('Index does not exist. Please index some documents.')
                st.error(str(ex))
                return

    if state.user_input:
        st.subheader('🙋🏽 Answer')
        with st.spinner("Generating query answer..."):
            try:
                # This will use cached response!
                response = get_llm_doc_query_response(query_prompt, model_name=state.completions_model, _service_context=service_context)
            except Exception as ex:
                st.warning('Index does not exist. Please index some documents.')
                st.error(str(ex))
                return

        if state.user_input not in state.questions:
            state.questions.append(state.user_input)
            state.generated.append((state.user_input, response))
            state.past.append(response)

        st.markdown(response)

        with st.expander('View conversation history', expanded=False):
            st.markdown('\n\n'.join([f'---\n**Question**\n\n{q}\n\n**Answer**\n\n{a}' for q, a in zip(state.questions, state.past)]))

        estimated_cost = ((token_counter.prompt_llm_token_count / 1000.0) * LANG_MODEL_PRICING[state.completions_model]['input']) + \
                         ((token_counter.completion_llm_token_count / 1000.0) * LANG_MODEL_PRICING[state.completions_model]['output'])
        print('Document Q&A Estimated Cost: $', estimated_cost)
        state.estimated_cost_doc = estimated_cost
        state.cumulative_cost += estimated_cost

        return response
def correct_json(response_data):
    """
    Corrects the JSON response from OpenAI to be valid JSON.

    Strips trailing commas before closing braces/brackets and quotes bare
    (unquoted) object keys.

    NOTE(review): the first replace below looks like a no-op (space -> space);
    the original intent was probably to replace non-breaking spaces — confirm.
    """
    # clean up the response data JSON
    response_data = response_data.replace(' ', ' ').replace(',\n }', '\n }')
    # For good measure: quote bare keys, then drop trailing commas
    response_data = re.sub(
        r',\s*}', '}', re.sub(
            r',\s*]', ']', re.sub(
                r'(\w+)\s*:', r'"\1":',
                response_data
            )))
    return response_data

@st.cache_data(ttl=60*60, show_spinner=False)
def get_llm_graph_data_response(user_input, model_name=DEFAULT_MODEL_CONFIG['chat_model']):
    """Call the OpenAI function-calling API to extract knowledge-graph JSON.

    Returns the corrected JSON arguments string produced by the forced
    `knowledge_graph` function call, an error-message string on API failure,
    or None when `user_input` is empty.
    """
    if not user_input:
        return None
    print(f"OpenAI call ({model_name})")
    try:
        model_config = {
            'model': model_name,
            'temperature': state.temperature,
            'top_p': state.top_p,
            'max_tokens': state.max_tokens,
        }
        completion = openai.chat.completions.create(
            messages=json.loads(SafeFormatter().format(json.dumps(func_prompt.MESSAGES), user_input=user_input)),
            functions=func_prompt.FUNCTIONS,
            function_call=func_prompt.FUNCTION_CALL,
            **model_config
        )
    except openai.RateLimitError as e:
        # request limit exceeded or something.
        return str(e)
    except Exception as e:
        # general exception handling
        # NOTE(review): error strings returned here are non-JSON; downstream
        # consumers rely on get_graph_data's fallback to handle them.
        return str(e)

    response_data = completion.choices[0].message.function_call.arguments
    # clean up the response data JSON
    response_data = correct_json(response_data)

    # cost estimate based on the currently selected chat model's pricing
    estimated_cost = ((completion.usage.prompt_tokens / 1000.0) * LANG_MODEL_PRICING[state.chat_model]['input']) + \
                     ((completion.usage.completion_tokens / 1000.0) * LANG_MODEL_PRICING[state.chat_model]['output'])
    print('Knowledge Graph Generation Estimated Cost: $', estimated_cost)
    state.estimated_cost_graph = estimated_cost
    state.cumulative_cost += estimated_cost

    return response_data

# Function to generate a graph image using Graphviz
def generate_knowledge_graph(response_data):
    """Render the knowledge-graph JSON with Graphviz.

    Returns a dict with the Digraph ('dot') and, when the GraphViz executable
    is available (non-cloud deployment), paths to the rendered PNG and .gv.
    """
    dot = Digraph(comment="Knowledge Graph")
    response_dict = json.loads(response_data)

    # Add nodes to the graph
    for node in response_dict.get("nodes", []):
        dot.node(node["id"], f"{node['label']} ({node['type']})")

    # Add edges to the graph
    for edge in response_dict.get("edges", []):
        dot.edge(edge["from"], edge["to"], label=edge["relationship"])

    # Requires GraphViz executable, so we can't use it in Streamlit Cloud
    if json.loads(st.secrets['IS_CLOUD_DEPLOYMENT']):
        return {'dot': dot, 'png': None, 'gv': None}
    else:
        # Render and visualize
        dot.render("./static/knowledge_graph.gv", view=False)
        # Render to PNG format and save it
        dot.render("./static/knowledge_graph", format="png", view=False)
        return {'dot': dot, 'png': "./static/knowledge_graph.png", 'gv': "./static/knowledge_graph.gv"}

def get_graph_data(response_data):
    """Convert knowledge-graph JSON into Cytoscape-style graph elements.

    Falls back to empty node/edge lists when `response_data` is not valid
    JSON (e.g. an API error string) or lacks the expected keys.
    """
    try:
        response_dict = json.loads(response_data)
        nodes = [
            {
                "data": {
                    "id": node["id"],
                    "label": node["label"],
                    "color": node.get("color", "defaultColor"),
                }
            }
            for node in response_dict["nodes"]
        ]
        edges = [
            {
                "data": {
                    "source": edge["from"],
                    "target": edge["to"],
                    "label": edge["relationship"],
                    "color": edge.get("color", "defaultColor"),
                }
            }
            for edge in response_dict["edges"]
        ]
        return {"elements": {"nodes": nodes, "edges": edges}}
    except (json.JSONDecodeError, KeyError, TypeError):
        # BUG FIX: narrowed from a bare `except:` so programming errors and
        # KeyboardInterrupt/SystemExit are no longer silently swallowed.
        return {"elements": {"nodes": [], "edges": []}}

# UTILITY ---------------------------------------------------------------------

def image_html_fragments(image, text, image_style=None, text_style=None):
    """Build HTML fragments embedding `image` (base64) alongside `text`.

    NOTE(review): the img/anchor tag markup appears to have been stripped from
    this source — the f-strings below carry no tags, and `img_b64` is unused
    as a result. Restore the original tag templates before relying on output.
    """
    with open(image, 'rb') as img_f:
        img_b64 = base64.b64encode(img_f.read()).decode('utf-8')

    img_style = image_style if image_style else "height: 200px; margin: 3px;"
    image_tag_html = f''
    image_download_link = f'Download'

    # style copied from dev tools
    span_style = text_style if text_style else "font-weight: 600; font-size: 1.75rem;"
    span_style = (f'font-family: Source Sans Pro, sans-serif; {span_style}'
                  'color: rgb(49, 51, 63); letter-spacing: -0.005em;'
                  'padding: 0.5rem 0px 1rem; margin: 0px; line-height: 1.2;'
                  'text-size-adjust: 100%; -webkit-font-smoothing: auto;'
                  'position: relative; vertical-align:middle;')
    text_html = f'{text}'

    image_html = f'{text_html}  {image_tag_html}'

    return {'image_html': image_html, 'image_tag_html': image_tag_html, 'image_download_link': image_download_link}

# MAIN ------------------------------------------------------------------------

def main(title, user_input_confirmed=False, response=None):
    """Render the knowledge-graph page: settings, generation, display modes."""
    # Sidebar
    with st.sidebar:
        st.markdown(f'#### {title} Settings')
        st.selectbox(
            'OpenAI model', options=OPENAI_MODELS_CHAT,
            on_change=_set_state_cb, kwargs={'chat_model': 'selectbox_graph_chat_model_name'},
            index=OPENAI_MODELS_CHAT.index(state.chat_model),
            help='Allowed models. Accuracy, speed, token consumption and costs will vary.',
            key='selectbox_graph_chat_model_name'
        )

    # GPT chat models can handle web sites, so we can keep URLs in the user input
    user_input = state.user_input if state.user_input.strip().startswith('http') else response
    user_input = user_input.replace('\n', ' ').replace('\r', '') if user_input else user_input

    if user_input_confirmed and user_input:
        with st.spinner("Generating knowledge graph (this takes a while)..."):
            response_data = get_llm_graph_data_response(user_input, model_name=state.chat_model)

    if user_input:
        st.subheader('💡 Answer Knowledge Graph')
        # This will use cached response!
        with st.spinner("Generating knowledge graph (this takes a while)..."):
            response_data = get_llm_graph_data_response(user_input, model_name=state.chat_model)

        c1, c2, _ = st.columns([2, 1, 3])
        with c1:
            radio_options = ["Interactive", "Static", "Data"]
            radio_option = st.radio('Knowledge graph options', options=radio_options, horizontal=True)
        with c2:
            height = st.slider("Adjust image height", 100, 1000, 750, 50)

        if radio_option == radio_options[0]:
            from graph_frontend import graph_component

            # NOTE: This component doesn't actually return any data, so handle_event is a no-op
            def run_component(props):
                value = graph_component(key='graph', **props)
                return value

            def handle_event(value):
                if value is not None:
                    st.write('Received from graph component: ', value)

            props = {
                'data': {'graph': get_graph_data(response_data)},
                'graph_height': height,
                'show_graph_data': False,
            }
            handle_event(run_component(props))

        if radio_option == radio_options[1]:
            graph_data = generate_knowledge_graph(response_data)
            # If graphviz executable is available, then we'll have a PNG to download or display
            if graph_data['png']:
                image_html_frags = image_html_fragments(
                    graph_data['png'], '',
                    image_style=f"height: {height}px; margin: 5px;",
                    text_style="font-weight: 600; font-size: 1.75rem;"
                )
                st.markdown(f"{image_html_frags['image_download_link']}", unsafe_allow_html=True)

            # Display using Streamlit's D3.js graphviz renderer
            st.graphviz_chart(graph_data['dot'])

        if radio_option == radio_options[2]:
            st.json(get_graph_data(response_data), expanded=True)
import os
import openai
import weaviate
import streamlit as st

from globals import (DEFAULT_MODEL_CONFIG, DB_TABLE)

# MAIN APP STATE ----------------------------------------------------------------

# Alias for Streamlit's per-session state object; shared by all pages.
state = st.session_state

# Initial state builder
def build_initial_state():
    """Assemble the app's initial session-state dict.

    API keys are looked up in Streamlit secrets first, then the process
    environment (the workaround for running Streamlit on Heroku). Only masked
    key fingerprints are ever logged.
    """
    openai_api_key = None
    if st.secrets.get('OPENAI_API_KEY', None):
        print('settings', 'OPENAI_API_KEY found')
        openai_api_key = st.secrets['OPENAI_API_KEY']
    else:
        print('settings OPENAI_API_KEY not found!')
        # Try get OpenAI api key from os env
        # (this is the workaround for using Streamlit in Heroku)
        if os.environ.get('OPENAI_API_KEY', None):
            print('os.environ', 'OPENAI_API_KEY found')
            openai_api_key = os.environ['OPENAI_API_KEY']
            # NOTE(review): openai.api_key is only set on the env-var path, not
            # when the key comes from Streamlit secrets — confirm intended.
            openai.api_key = os.getenv("OPENAI_API_KEY")

    # Log a masked fingerprint, never the full secret
    if openai_api_key:
        print('openai_api_key', 'sk_...' + openai_api_key[-5:], '\n')
    else:
        print('openai_api_key', 'NULL', '\n')

    weaviate_api_key = st.secrets.get('WEAVIATE_API_KEY', None)
    if weaviate_api_key:
        print('weaviate_api_key', weaviate_api_key[:5] + '...' + weaviate_api_key[-5:], '\n')
    else:
        print('weaviate_api_key', 'NULL', '\n')

    WEAVIATE_URL = st.secrets.get('WEAVIATE_URL', None)

    initial_state = {
        # MAIN APP STATE
        'openai_api_key': openai_api_key,
        'weaviate_api_key': weaviate_api_key,
        'WEAVIATE_URL': WEAVIATE_URL,
        'menu_choice': 0,

        # DATA PAGE STATE
        'limit': 3,
        'query': '',
        'intermediate_steps': True,
        'db_table': DB_TABLE,
        'generated': [],
        'past': [],
        'questions': [],

        # KNOWLEDGE GRAPH PAGE STATE
        'user_input': '',

        # MODEL STATE
        'chat_model': DEFAULT_MODEL_CONFIG['chat_model'],
        'completions_model': DEFAULT_MODEL_CONFIG['completions_model'],
        'temperature': DEFAULT_MODEL_CONFIG['temperature'],
        'top_p': DEFAULT_MODEL_CONFIG['top_p'],
        'max_tokens': DEFAULT_MODEL_CONFIG['max_tokens'],

        'estimated_cost_reset': 0,
        'estimated_cost_data': 0,
        'estimated_cost_doc': 0,
        'estimated_cost_graph': 0,
        'cumulative_cost': 0,
    }

    return initial_state

# State initializer
def init_app_state():
    """Seed missing keys into session state without clobbering existing ones."""
    initial_state = build_initial_state()
    for k, v in initial_state.items():
        # BUG FIX: membership test instead of truthiness — previously any key
        # holding a falsy value (0, '', False, []) was re-seeded to its default
        # on every rerun, clobbering user choices such as unticked checkboxes.
        if k not in state:
            setattr(state, k, v)

# State resetter
def reset_app_state():
    """Force every session-state key back to its initial value."""
    initial_state = build_initial_state()
    for k, v in initial_state.items():
        setattr(state, k, v)

# STATE CALLBACK ----------------------------------------------------

# generic callback to set state
def _set_state_cb(**kwargs):
    """Copy widget values into named state keys (state_key <- widget_key).

    Skips widgets whose current value is None (widget not rendered yet).
    """
    for state_key, widget_key in kwargs.items():
        val = state.get(widget_key, None)
        # FIX: dropped the dead `or val == ""` clause — an empty string is
        # already `not None`, so the extra test could never change the result.
        if val is not None:
            setattr(state, state_key, val)
from colorama import init as colorama_init
from colorama import Fore
from colorama import Style

# --------------------------------------------------------------------------------

# TODO: Make this HttpException compatible
# https://stackoverflow.com/questions/64501193/fastapi-how-to-use-httpexception-in-responses
# https://plainenglish.io/blog/3-ways-to-handle-errors-in-fastapi-that-you-need-to-know-e1199e833039
# https://christophergs.com/tutorials/ultimate-fastapi-tutorial-pt-5-basic-error-handling/

# Error handler
class AppError(Exception):
    """Application error carrying a message and an HTTP-style status code."""
    def __init__(self, error, status_code):
        self.error = error              # human-readable error message
        self.status_code = status_code  # HTTP-style status code


def throw_if_nulls(df):
    """Raise AppError(500) if `df` contains any NULL/NaN values."""
    # .any() short-circuits; equivalent to the former sum(...) > 0 check
    if df.isnull().values.any():
        raise AppError(
            error=f'NULL values found in dataframe ({list(df.columns)})!!',
            status_code=500
        )

# --------------------------------------------------------------------------------

colorama_init()

# Colored console logging helpers
def print_red(msg):
    print(f'{Fore.RED}{msg}{Style.RESET_ALL}')
def print_blue(msg):
    print(f'{Fore.BLUE}{msg}{Style.RESET_ALL}')
def print_green(msg):
    print(f'{Fore.GREEN}{msg}{Style.RESET_ALL}')
def print_yellow(msg):
    print(f'{Fore.YELLOW}{msg}{Style.RESET_ALL}')
def print_cyan(msg):
    print(f'{Fore.CYAN}{msg}{Style.RESET_ALL}')
def print_magenta(msg):
    print(f'{Fore.MAGENTA}{msg}{Style.RESET_ALL}')

# WEB SCRAPER -----------------------------------------------------------------

# https://newspaper.readthedocs.io/en/latest/user_guide/quickstart.html
import newspaper
# https://www.crummy.com/software/BeautifulSoup/
from bs4 import BeautifulSoup
import htmldate
import dateutil
import datefinder
import random
import time

def scrape_articles(source_urls):
    """Scrape news articles from `source_urls` using the newspaper package.

    Returns a dict of parallel lists keyed by 'title', 'author', 'date',
    'text', 'keywords', 'summary' and 'url'. URLs that fail to scrape are
    logged and skipped, so the lists may be shorter than the input.
    """
    article_titles = []
    article_authors = []
    article_dates = []
    article_texts = []
    article_keywords = []
    article_summaries = []
    article_urls = []

    # The result dict shares the list objects above, so the helper can append
    # directly to them.
    articles_dict = \
        {'title': article_titles, 'author': article_authors, 'date': article_dates,
         'text': article_texts, 'keywords': article_keywords, 'summary': article_summaries,
         'url': article_urls}

    def _newspaper_scraper_helper(url):
        # Scrape one URL and append its fields to the shared lists above.
        HEADERS = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:78.0) Gecko/20100101 Firefox/78.0',
                   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                   'Referer': 'https://google.com/'}
        config = newspaper.Config()
        config.headers = HEADERS
        config.request_timeout = 10

        article = newspaper.Article(url=url, language='en')
        article.download()
        article.parse()
        article.nlp()

        article_titles.append(article.title)
        article_authors.append(article.authors)

        # Fall back to heuristics when newspaper can't find a publish date
        if article.publish_date is None:
            publish_date = alternative_get_publish_date(article)
        else:
            publish_date = parse_date_str(str(article.publish_date))
        article_dates.append(publish_date)

        # Prefix the text with the publish date when we have one
        if publish_date:
            article_text = f'Published: {publish_date.strftime("%d %B %Y")}\n\n{article.text}'
        else:
            article_text = article.text
        article_text = article_text.replace('\n\n', '\n').replace('\r\n', '\n')
        article_texts.append(article_text)

        article_keywords.append(article.keywords)
        article_summaries.append(article.summary)
        article_urls.append(article.url)
        # FIX: removed a dead local rebuild-and-return of articles_dict here;
        # the caller ignored the return value and the shared lists already
        # feed the outer dict.

    for url in source_urls:
        # Use a variable sleep between calls (good netizenship!)
        t = random.choice([0.1, 0.25, 0.5, 0.75, 1., 1.1, 1.25, 1.5])
        jitter = random.random()
        time.sleep(t + jitter)

        try:
            _newspaper_scraper_helper(url.strip())
        except Exception as e:
            print('!!Newspaper Exception!!', '\n', e)
            continue

    return articles_dict

# if newspaper can't find, then use bs4, datefinder, htmldate
def alternative_get_publish_date(article):
    """Best-effort publish-date extraction: bs4, then datefinder, then htmldate.

    Returns a datetime, or None when every strategy fails.
    """
    # try bs4
    soup = BeautifulSoup(article.html, features="lxml")
    # class=newsdate is specific to one site I was scraping (you can change this to suit your needs)
    para = soup.find('p', attrs={'class': 'newsdate'})
    if para:
        return parse_date_str(para.next)

    # try datefinder (first date found in the article body)
    try:
        return next(datefinder.find_dates(article.text))
    except StopIteration:
        pass
    except Exception:
        # FIX: narrowed from a bare `except:`; datefinder can raise on odd
        # inputs — treat any failure as "not found" rather than swallowing
        # KeyboardInterrupt/SystemExit too.
        pass

    # try htmldate
    htmldt = htmldate.find_date(article.html, extensive_search=True, original_date=True)
    if htmldt:
        return parse_date_str(htmldt)

    return None

def parse_date_str(date_str):
    """Parse a date string to a datetime; return None when empty or unparseable."""
    if date_str:
        try:
            return dateutil.parser.parse(date_str)
        except (ValueError, OverflowError, AttributeError, TypeError):
            # nearly all parse failures are due to URL dates without a day
            # specifier, e.g. /2014/04/
            return None
    return None  # explicit: empty input parses to None

# --------------------------------------------------------------------------------
# Partial ("safe") formatting, based on https://stackoverflow.com/a/34033230
#
# NOTE: (@asehmi) Modified to preserve the format strings when a field's value
# is None — important when a prompt template is built up incrementally — and
# to deal with embedded JSON strings which contain braces.
import string
class SafeFormatter(string.Formatter):
    """string.Formatter that leaves unknown/None fields intact instead of raising."""
    def vformat(self, format_string, args, kwargs):
        tokens = []
        for (lit, name, spec, conv) in self.parse(format_string):
            # re-escape braces that parse() unescaped
            # NOTE: (@asehmi) Modified to deal with embedded JSON strings which contain braces
            # FIX: guard against empty literals (adjacent fields like "{a}{b}"
            # yield lit == '', and lit[0] raised IndexError)
            if lit and (lit[0] in ['{', '}'] or lit[-1] in ['{', '}']):
                lit = lit.replace('{', '{{{{').replace('}', '}}}}')
            else:
                lit = lit.replace('{', '{{').replace('}', '}}')
            # only lit is non-None at the end of the string
            if name is None:
                tokens.append(lit)
            else:
                # but conv and spec are None if unused
                conv = '!' + conv if conv else ''
                spec = ':' + spec if spec else ''
                # name includes indexing ([blah]) and attributes (.blah)
                # so get just the first part
                fp = name.split('[')[0].split('.')[0]
                # treat as normal if fp is empty (an implicit positional arg),
                # a digit (an explicit positional arg) or if it is in kwargs
                # NOTE: (@asehmi) Modified to preserve the format if fp's value is None
                # NOTE(review): positional fields (`{}`/`{0}`) index into kwargs
                # here and would raise KeyError — confirm positional fields are
                # never used with this formatter.
                if (not fp or fp.isdigit() or fp in kwargs) and kwargs[fp] is not None:
                    tokens.extend([lit, '{', name, conv, spec, '}'])
                # otherwise escape the braces
                else:
                    tokens.extend([lit, '{{', name, conv, spec, '}}'])
        format_string = ''.join(tokens)  # put the string back together
        # finally call the default formatter
        return string.Formatter.vformat(self, format_string, args, kwargs)
# Prompt assets for the LLM knowledge-graph generation feature.

# Single-turn chat template; '{user_input}' is bound at call time.
MESSAGES = [
    {
        "role": "user",
        "content": "Help me understand following by describing as a detailed knowledge graph: {user_input}",
    },
]

# Forces the model to invoke the knowledge_graph function.
FUNCTION_CALL = {"name": "knowledge_graph"}

# JSON schema for graph metadata (all fields optional).
_METADATA_SCHEMA = {
    "type": "object",
    "properties": {
        "createdDate": {"type": "string"},
        "lastUpdated": {"type": "string"},
        "description": {"type": "string"},
    },
}

# JSON schema for a single node; 'color' is required so rendering can
# differentiate node categories.
_NODE_SCHEMA = {
    "type": "object",
    "properties": {
        "id": {"type": "string"},
        "label": {"type": "string"},
        "type": {"type": "string"},
        "color": {"type": "string"},
        "properties": {
            "type": "object",
            "description": "Additional attributes for the node",
        },
    },
    "required": ["id", "label", "type", "color"],
}

# JSON schema for a single directed edge; 'color' is required for rendering.
_EDGE_SCHEMA = {
    "type": "object",
    "properties": {
        "from": {"type": "string"},
        "to": {"type": "string"},
        "relationship": {"type": "string"},
        "direction": {"type": "string"},
        "color": {"type": "string"},
        "properties": {
            "type": "object",
            "description": "Additional attributes for the edge",
        },
    },
    "required": ["from", "to", "relationship", "color"],
}

# OpenAI function-calling declaration consumed by the chat completion API.
FUNCTIONS = [{
    "name": "knowledge_graph",
    "description":
        "Generate a knowledge graph with entities and relationships. "
        "Use the colors to help differentiate between different node or edge types/categories. "
        "Always provide light pastel colors that work well with black font.",
    "parameters": {
        "type": "object",
        "properties": {
            "metadata": _METADATA_SCHEMA,
            "nodes": {"type": "array", "items": _NODE_SCHEMA},
            "edges": {"type": "array", "items": _EDGE_SCHEMA},
        },
        "required": ["nodes", "edges"],
    },
}]
# CONSTANTS --------------------------------------------------------------------

# SQLite database location, assembled from its path components.
_BASE_DB_PATH = '.'
_DB_PATH = 'db'
_DB_NAME = 'gptdb.sqlite3'

DB_FILE = f'{_BASE_DB_PATH}/{_DB_PATH}/{_DB_NAME}'
DB_TABLE = 'data'

# List available models with:
# curl https://api.openai.com/v1/models -H "Content-Type: application/json" -H "Authorization: Bearer %OPENAI_API_KEY%"
# Actual model names used in app for selectors
OPENAI_MODELS_CHAT = ['gpt-4-1106-preview', 'gpt-4']
OPENAI_MODELS_COMPLETIONS = ['gpt-3.5-turbo-instruct']

# Default generation settings applied to both chat and completions calls.
DEFAULT_MODEL_CONFIG = dict(
    chat_model=OPENAI_MODELS_CHAT[0],
    completions_model=OPENAI_MODELS_COMPLETIONS[0],
    temperature=0.1,
    top_p=0.9,
    max_tokens=2048,
)

# Mapping from friendly name to actual model name.
LANG_MODELS = {
    # Friendly aliases used in app
    'gpt-4': 'gpt-4',
    'gpt-4-turbo': 'gpt-4-1106-preview',
    # Actual model names used in app
    'gpt-4-1106-preview': 'gpt-4-1106-preview',
    'gpt-3.5-turbo-instruct': 'gpt-3.5-turbo-instruct',
}

# Cost per 1000 tokens; see: https://openai.com/pricing
LANG_MODEL_PRICING = {
    # Friendly aliases used in app
    'gpt-4': {'input': 0.03, 'output': 0.06},
    'gpt-4-turbo': {'input': 0.01, 'output': 0.03},
    # Actual model names used in app
    'gpt-4-1106-preview': {'input': 0.01, 'output': 0.03},
    'gpt-3.5-turbo-instruct': {'input': 0.0015, 'output': 0.002},
}

# Document vector store backend: 'Weaviate' | 'Local'
VECTOR_STORE = 'Weaviate'

# Canned questions offered in the UI ('None' disables the selection).
SAMPLE_QUESTIONS = [
    "None",
    "Summarize the most important concepts in a high performance software application",
    "Summarize the Wardley mapping technique",
    "Summarize the Viewpoints and Perspectives software solutions design methodology",
    "What are the most important considerations when architecting a software solution",
    "Build a 5-part learning plan on how to become a software architect. Detail each part with a short description and bullet points.",
    "Machine learning model training, deployment and operations",
    "What is a knowledge graph",
    "What is a graph neural network",
    "https://en.wikipedia.org/wiki/Graph_theory",
    "Most important factors of high performing teams",
]
33 | 34 | 35 |
36 |
37 |
38 |
39 | 40 | 212 | 213 | 214 | -------------------------------------------------------------------------------- /images/a12i_logo_circle_transparent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asehmi/docs-n-data-knowledge-app/746806c3ab3bee5288a790cb9294130d6a5a86ef/images/a12i_logo_circle_transparent.png -------------------------------------------------------------------------------- /images/app-demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asehmi/docs-n-data-knowledge-app/746806c3ab3bee5288a790cb9294130d6a5a86ef/images/app-demo.gif -------------------------------------------------------------------------------- /images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asehmi/docs-n-data-knowledge-app/746806c3ab3bee5288a790cb9294130d6a5a86ef/images/favicon.ico -------------------------------------------------------------------------------- /images/snapshot-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asehmi/docs-n-data-knowledge-app/746806c3ab3bee5288a790cb9294130d6a5a86ef/images/snapshot-01.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit 2 | langchain-experimental==0.0.45 3 | llama-index==0.9.13 4 | llama-cpp-python 5 | sentence_transformers 6 | weaviate-client==3.24.1 7 | openai==1.1.2 8 | sqlalchemy 9 | debugpy 10 | openpyxl 11 | PyPDF2 12 | pypdf 13 | docx2txt 14 | PyCryptodome 15 | graphviz==0.20.1 16 | networkx==3.1 17 | beautifulsoup4==4.12.2 18 | colorama==0.4.5 19 | newspaper3k==0.2.8 20 | htmldate 21 | datefinder 22 | retry 23 | 
-------------------------------------------------------------------------------- /run_app.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | echo ============================================================ 3 | echo === OPEN A BROWSER WINDOW AT "http://[ip-address]:6974/" === 4 | echo ============================================================ 5 | streamlit run --server.port=6974 --server.headless=false app.py %1 %2 %3 %4 %5 %6 %7 %8 %9 6 | -------------------------------------------------------------------------------- /static/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asehmi/docs-n-data-knowledge-app/746806c3ab3bee5288a790cb9294130d6a5a86ef/static/favicon.ico -------------------------------------------------------------------------------- /static/knowledge_graph: -------------------------------------------------------------------------------- 1 | // Knowledge Graph 2 | digraph { 3 | learning_plan [label="5-Part Learning Plan (Plan)"] 4 | architecture_objectives [label="Identify Architecture Objectives (Step)"] 5 | key_scenarios [label="Key Scenarios (Step)"] 6 | application_overview [label="Application Overview (Step)"] 7 | key_hotspots [label="Key Hotspots (Step)"] 8 | candidate_solutions [label="Candidate Solutions (Step)"] 9 | clear_objectives [label="Clear Objectives (Objective)"] 10 | precise_objectives [label="Precise Objectives (Objective)"] 11 | focus_design [label="Focus Design (Activity)"] 12 | evaluate_architectures [label="Evaluate Architectures (Activity)"] 13 | understand_application [label="Understand Application (Activity)"] 14 | connect_real_world [label="Connect to Real World (Activity)"] 15 | identify_hotspots [label="Identify Hotspots (Activity)"] 16 | create_candidate_architecture [label="Create Candidate Architecture (Activity)"] 17 | evaluate_candidate_architecture [label="Evaluate Candidate Architecture 
(Activity)"] 18 | app_arch_guide [label="Application Architecture Guide 2.0a (Resource)"] 19 | learning_plan -> architecture_objectives [label=includes] 20 | learning_plan -> key_scenarios [label=includes] 21 | learning_plan -> application_overview [label=includes] 22 | learning_plan -> key_hotspots [label=includes] 23 | learning_plan -> candidate_solutions [label=includes] 24 | architecture_objectives -> clear_objectives [label="aims to establish"] 25 | architecture_objectives -> precise_objectives [label="aims to establish"] 26 | key_scenarios -> focus_design [label=utilizes] 27 | key_scenarios -> evaluate_architectures [label=utilizes] 28 | application_overview -> understand_application [label=requires] 29 | application_overview -> connect_real_world [label=requires] 30 | key_hotspots -> identify_hotspots [label=involves] 31 | candidate_solutions -> create_candidate_architecture [label=involves] 32 | candidate_solutions -> evaluate_candidate_architecture [label=involves] 33 | learning_plan -> app_arch_guide [label="referenced by"] 34 | } 35 | -------------------------------------------------------------------------------- /static/knowledge_graph.gv: -------------------------------------------------------------------------------- 1 | // Knowledge Graph 2 | digraph { 3 | learning_plan [label="5-Part Learning Plan (Plan)"] 4 | architecture_objectives [label="Identify Architecture Objectives (Step)"] 5 | key_scenarios [label="Key Scenarios (Step)"] 6 | application_overview [label="Application Overview (Step)"] 7 | key_hotspots [label="Key Hotspots (Step)"] 8 | candidate_solutions [label="Candidate Solutions (Step)"] 9 | clear_objectives [label="Clear Objectives (Objective)"] 10 | precise_objectives [label="Precise Objectives (Objective)"] 11 | focus_design [label="Focus Design (Activity)"] 12 | evaluate_architectures [label="Evaluate Architectures (Activity)"] 13 | understand_application [label="Understand Application (Activity)"] 14 | connect_real_world 
[label="Connect to Real World (Activity)"] 15 | identify_hotspots [label="Identify Hotspots (Activity)"] 16 | create_candidate_architecture [label="Create Candidate Architecture (Activity)"] 17 | evaluate_candidate_architecture [label="Evaluate Candidate Architecture (Activity)"] 18 | app_arch_guide [label="Application Architecture Guide 2.0a (Resource)"] 19 | learning_plan -> architecture_objectives [label=includes] 20 | learning_plan -> key_scenarios [label=includes] 21 | learning_plan -> application_overview [label=includes] 22 | learning_plan -> key_hotspots [label=includes] 23 | learning_plan -> candidate_solutions [label=includes] 24 | architecture_objectives -> clear_objectives [label="aims to establish"] 25 | architecture_objectives -> precise_objectives [label="aims to establish"] 26 | key_scenarios -> focus_design [label=utilizes] 27 | key_scenarios -> evaluate_architectures [label=utilizes] 28 | application_overview -> understand_application [label=requires] 29 | application_overview -> connect_real_world [label=requires] 30 | key_hotspots -> identify_hotspots [label=involves] 31 | candidate_solutions -> create_candidate_architecture [label=involves] 32 | candidate_solutions -> evaluate_candidate_architecture [label=involves] 33 | learning_plan -> app_arch_guide [label="referenced by"] 34 | } 35 | -------------------------------------------------------------------------------- /static/knowledge_graph.gv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asehmi/docs-n-data-knowledge-app/746806c3ab3bee5288a790cb9294130d6a5a86ef/static/knowledge_graph.gv.pdf -------------------------------------------------------------------------------- /static/knowledge_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asehmi/docs-n-data-knowledge-app/746806c3ab3bee5288a790cb9294130d6a5a86ef/static/knowledge_graph.png 
-------------------------------------------------------------------------------- /storage/empty-for-github.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asehmi/docs-n-data-knowledge-app/746806c3ab3bee5288a790cb9294130d6a5a86ef/storage/empty-for-github.txt -------------------------------------------------------------------------------- /streamlit_debug.py: -------------------------------------------------------------------------------- 1 | # How to use: 2 | # 3 | # [1] Ensure you have `debugpy` installed: 4 | # 5 | # > pip install debugpy 6 | # 7 | # [2] In your main streamlit app: 8 | # 9 | # import streamlit_debug 10 | # streamlit_debug.set(flag=True, wait_for_client=True, host='localhost', port=8765) 11 | # 12 | # `flag=True` will initiate a debug session. `wait_for_client=True` will wait for a debug client to attach when 13 | # the streamlit app is run before hitting your next debug breakpoint. `wait_for_client=False` will not wait. 14 | # 15 | # If using VS Code, you need this config in your `.vscode/launch.json` file: 16 | # 17 | # { 18 | # // Use IntelliSense to learn about possible attributes. 19 | # // Hover to view descriptions of existing attributes. 20 | # // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 21 | # "version": "0.2.0", 22 | # "configurations": [ 23 | # { 24 | # "name": "Python: Current File", 25 | # "type": "python", 26 | # "request": "launch", 27 | # "program": "${file}", 28 | # "console": "integratedTerminal", 29 | # "env": {"DEBUG": "true"} 30 | # }, 31 | # { 32 | # "name": "Python: debugpy Remote Attach", 33 | # "type": "python", 34 | # "request": "attach", 35 | # "connect": { 36 | # "port": 8765, 37 | # "host": "127.0.0.1", 38 | # }, 39 | # "justMyCode": false, 40 | # "redirectOutput": true, 41 | # "logToFile": true, 42 | # "pathMappings": [ 43 | # { 44 | # "localRoot": "${workspaceFolder}", 45 | # "remoteRoot": "." 
# The port numbers you use need to match - in `streamlit_debug.set()` and
# `launch.json`. It should NOT be the same port that streamlit is started on.
#
# When `flag=True` and `wait_for_client=True`, you must activate the
# "Python: debugpy Remote Attach" debug session from VS Code.

import streamlit as st
import logging

# Module-level flag mirroring the last value passed to set().
_DEBUG = False

def set(flag: bool=False, wait_for_client=False, host='localhost', port=8765):
    """Activate (or skip) debugpy remote debugging for a Streamlit app.

    Safe to call on every script re-run: the debugpy listener is started at
    most once per session, tracked via st.session_state.debugging.

    Args:
        flag: When True, start (or reuse) a debugpy listener.
        wait_for_client: When True, block until a debug client attaches.
        host: Interface for the debugpy listener (not the Streamlit host).
        port: Port for the debugpy listener; must match launch.json and must
            NOT be the port Streamlit itself is served on.
    """
    global _DEBUG
    _DEBUG = flag
    try:
        # To prevent debugpy loading again and again because of
        # Streamlit's execution model, we need to track debugging state
        if 'debugging' not in st.session_state:
            st.session_state.debugging = None

        if _DEBUG and not st.session_state.debugging:
            # https://code.visualstudio.com/docs/python/debugging
            import debugpy
            if not debugpy.is_client_connected():
                debugpy.listen((host, port))
                if wait_for_client:
                    logging.info('>>> Waiting for debug client attach... <<<')
                    debugpy.wait_for_client()  # Only blocks when you always want to manually attach the debugger
                    logging.info('>>> ...attached! <<<')
                    # debugpy.breakpoint()

            if st.session_state.debugging is None:
                # FIX: use 'is None' (identity) rather than '== None'
                logging.info('>>> Remote debugging activated (host=%s, port=%s) <<<', host, port)
            st.session_state.debugging = True

        if not _DEBUG:
            if st.session_state.debugging is None:
                logging.info('>>> Remote debugging is NOT active <<<')
            st.session_state.debugging = False
    except Exception:
        # Best-effort: remote debugging is unavailable in some deployments
        # (e.g. cloud); log and continue rather than crash the app.
        # FIX: was a bare 'except:' which also swallowed SystemExit and
        # KeyboardInterrupt and hid all failures silently.
        logging.exception('streamlit_debug.set() failed; continuing without debugger')