├── .vscode └── settings.json ├── pyproject.toml ├── app.py ├── README.md ├── knowledge_base.py └── .gitignore /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.linting.enabled": false 3 | } -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "website-to-knowledge-base" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["JimZer "] 6 | readme = "README.md" 7 | packages = [{include = "website_to_knowledge_base"}] 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.10" 11 | openai = "^0.27.4" 12 | langchain = "^0.0.144" 13 | unstructured = "^0.5.13" 14 | chromadb = "^0.3.21" 15 | tiktoken = "^0.3.3" 16 | python-dotenv = "^1.0.0" 17 | loguru = "^0.7.0" 18 | streamlit = "^1.21.0" 19 | 20 | 21 | [tool.poetry.group.dev.dependencies] 22 | black = "^23.3.0" 23 | isort = "^5.12.0" 24 | 25 | [build-system] 26 | requires = ["poetry-core"] 27 | build-backend = "poetry.core.masonry.api" 28 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | 4 | from knowledge_base import KnowledgeBase 5 | 6 | # Page setup 7 | st.set_page_config(page_title="Website to AI-Powered Knowledge Base", page_icon="🐍") 8 | st.title("AI-Powered Knowledge Base") 9 | 10 | # Remove whitespace from the top of the page and sidebar 11 | st.markdown( 12 | """ 13 | 27 | """, 28 | unsafe_allow_html=True, 29 | ) 30 | 31 | st.markdown("## Config") 32 | 33 | col1, col2 = st.columns(2) 34 | 35 | with col1: 36 | sitemap_url = st.text_input("URL to the website sitemap", value="") 37 | 38 | with col2: 39 | pattern = st.text_input("URL filter pattern (optional)", value="") 40 | 41 | 42 | st.markdown("## Ask") 43 | 44 
@st.cache_resource
def get_knowledge_base(url, pattern):
    """Build (and cache) the KnowledgeBase for a sitemap URL and filter pattern.

    st.cache_resource keys the cache on (url, pattern), so changing either
    input triggers a rebuild while repeat queries reuse the existing index.
    """
    return KnowledgeBase(
        sitemap_url=url,
        pattern=pattern,
        chunk_size=8000,
        chunk_overlap=3000,
    )


@st.cache_resource
def get_answer(url, pattern, query):
    """Answer `query` against the knowledge base built from (url, pattern).

    Cached per (url, pattern, query) triple, so repeating the same question
    does not re-query the LLM.
    """
    # BUG FIX: the original read the module-level `sitemap_url` instead of the
    # `url` parameter, so the cache key and the knowledge base actually queried
    # could disagree. Use the parameter the cache is keyed on.
    kb = get_knowledge_base(url, pattern)
    return kb.ask(query)


# BUG FIX: the pattern is labelled "(optional)" in the UI and KnowledgeBase
# accepts an empty pattern (it simply skips filtering), so only the sitemap
# URL is required before building the knowledge base.
if sitemap_url:
    with st.spinner("Getting the knowledge base ready, this may take a bit ..."):
        kb = get_knowledge_base(sitemap_url, pattern)

    query = st.text_input("Question", value="")

    if query:
        with st.spinner("Getting the answer ..."):
            result = get_answer(sitemap_url, pattern, query)

        st.markdown("### Answer")
        st.markdown(result["answer"])
        st.markdown("### Sources")
        # `sources` comes back as a newline-separated string; render as bullets.
        st.markdown("\n ".join([f"- {x}" for x in result["sources"].split("\n")]))
Finally it loads the similarity search results as context for a LLM (for now ChatGPT) to find relevant answers and cite the sources 15 | 16 | It also provides a Streamlit-based web interface for an easy-to-use experience. 17 | 18 | ## Files 19 | 20 | - `knowledge_base.py`: The main module that creates the KnowledgeBase class. This class is responsible for loading and processing the website content, creating the document index, and querying the LLM model for answers. 21 | - `app.py`: A Streamlit web application that provides a user interface for querying the AI-powered knowledge base. 22 | 23 | ## Installation 24 | 25 | 1. Clone the repository: 26 | 27 | ``` 28 | git clone git@github.com:bitswired/website-to-knowledge-base.git 29 | ``` 30 | 31 | 2. Install the project with poetry: 32 | 33 | ``` 34 | poetry install 35 | ``` 36 | 37 | ## Usage 38 | 39 | ### Knowledge Base 40 | 41 | To use the KnowledgeBase class, follow these steps: 42 | 43 | 1. Import the KnowledgeBase class: 44 | 45 | ```python 46 | from knowledge_base import KnowledgeBase 47 | ``` 48 | 49 | 2. Instantiate the KnowledgeBase with the appropriate sitemap URL and pattern (optional): 50 | 51 | ```python 52 | kb = KnowledgeBase( 53 | sitemap_url="https://nextjs.org/sitemap.xml", 54 | pattern="docs/api-refe", 55 | chunk_size=8000, 56 | chunk_overlap=3000, 57 | ) 58 | ``` 59 | 60 | 3. Ask a question: 61 | 62 | ```python 63 | result = kb.ask("How do I deploy my Next.js app?") 64 | print(result) 65 | ``` 66 | 67 | ### Web Application 68 | 69 | To run the Streamlit web application, execute the following command in your terminal: 70 | 71 | ``` 72 | streamlit run app.py 73 | ``` 74 | 75 | The web app will open in your default browser. Enter the URL to the website's sitemap, an optional filter pattern for the URLs, and your question. The AI-powered knowledge base will return an answer based on the content of the website. 
from typing import Optional

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import UnstructuredURLLoader
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

import requests
import xml.etree.ElementTree as ET
from dotenv import load_dotenv
from loguru import logger

# Load OPENAI_API_KEY (and any other settings) from a local .env file.
load_dotenv()

# The sitemaps.org protocol pins this namespace for every sitemap document.
_SITEMAP_NS = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"}


def extract_urls_from_sitemap(sitemap: str) -> list[str]:
    """
    Extract all page URLs from a sitemap XML string.

    Args:
        sitemap (str): The sitemap XML document as a string.

    Returns:
        A list of URLs extracted from the sitemap. Malformed <url> entries
        that carry no <loc> child are skipped.

    Raises:
        xml.etree.ElementTree.ParseError: If `sitemap` is not well-formed XML.
    """
    root = ET.fromstring(sitemap)

    urls = []
    for url in root.findall("ns:url", _SITEMAP_NS):
        loc = url.find("ns:loc", _SITEMAP_NS)
        # FIX: the original called `.text` unconditionally and raised
        # AttributeError on a <url> element without a <loc> child.
        if loc is not None and loc.text:
            urls.append(loc.text)

    return urls


class KnowledgeBase:
    """In-memory question-answering index over a website's content.

    Downloads a sitemap, loads every (optionally filtered) page, splits the
    pages into overlapping chunks, embeds them into a Chroma vector store,
    and exposes a retrieval-QA-with-sources chain via `ask()`.
    """

    def __init__(
        self,
        sitemap_url: str,
        chunk_size: int,
        chunk_overlap: int,
        pattern: Optional[str] = None,
    ):
        """
        Build the knowledge base eagerly (network + embedding work happens here).

        Args:
            sitemap_url: URL of the sitemap.xml listing the pages to index.
            chunk_size: Maximum characters per document chunk.
            chunk_overlap: Characters of overlap between consecutive chunks.
            pattern: Optional substring; only URLs containing it are indexed.

        Raises:
            requests.HTTPError: If the sitemap request returns an error status.
        """
        logger.info("Building the knowledge base ...")

        logger.info("Loading sitemap from {sitemap_url} ...", sitemap_url=sitemap_url)
        # FIX: bound the request and surface HTTP errors instead of handing an
        # error page (or hanging forever) to the XML parser.
        response = requests.get(sitemap_url, timeout=30)
        response.raise_for_status()
        urls = extract_urls_from_sitemap(response.text)

        if pattern:
            logger.info("Filtering URLs with pattern {pattern} ...", pattern=pattern)
            urls = [x for x in urls if pattern in x]
        logger.info("{n} URLs extracted", n=len(urls))

        logger.info("Loading URLs content ...")
        loader = UnstructuredURLLoader(urls)
        data = loader.load()

        logger.info("Splitting documents in chunks ...")
        doc_splitter = CharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        docs = doc_splitter.split_documents(data)
        logger.info("{n} chunks created", n=len(docs))

        logger.info("Building the vector database ...")
        embeddings = OpenAIEmbeddings()
        docsearch = Chroma.from_documents(docs, embeddings)

        logger.info("Building the retrieval chain ...")
        # map_reduce: each retrieved chunk is summarized separately, then the
        # partial answers are combined — keeps each LLM call within context.
        self.chain = RetrievalQAWithSourcesChain.from_chain_type(
            ChatOpenAI(),
            chain_type="map_reduce",
            retriever=docsearch.as_retriever(),
        )

        logger.info("Knowledge base created!")

    def ask(self, query: str) -> dict:
        """Answer `query`; returns a dict with "answer" and "sources" keys."""
        return self.chain({"question": query}, return_only_outputs=True)


if __name__ == "__main__":
    # Build the knowledge base over the Next.js API reference docs.
    kb = KnowledgeBase(
        sitemap_url="https://nextjs.org/sitemap.xml",
        pattern="docs/api-refe",
        chunk_size=8000,
        chunk_overlap=3000,
    )

    # Ask a question.
    res = kb.ask("How do I deploy my Next.js app?")
    # FIX: the original computed the answer and silently discarded it.
    print(res)
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | #.idea/ 165 | 166 | ### Python Patch ### 167 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 168 | poetry.toml 169 | 170 | # ruff 171 | .ruff_cache/ 172 | 173 | # LSP config files 174 | pyrightconfig.json 175 | 176 | # End of https://www.toptal.com/developers/gitignore/api/python 177 | 178 | .chroma --------------------------------------------------------------------------------