├── models ├── __init__.py ├── imported_urls.py ├── imported_pages.py ├── session.py ├── question.py └── conversation.py ├── tools ├── __init__.py └── focused_labs_q_and_a_tool.py ├── .python-version ├── ArchitectureOverview.png ├── config.py ├── conversation_repository.py ├── init-db.sql ├── requirements.txt ├── tests ├── accuracy_test.py ├── test_text_cleaner.py ├── test_query_service.py ├── accuracy_test_runner.py ├── test_utils.py └── questions.txt ├── database.py ├── pinecone_database.py ├── utils.py ├── query_service.py ├── main.py ├── import_service.py ├── .gitignore ├── agent.py ├── logger.py ├── README.md └── text_cleaner.py /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10.7 2 | -------------------------------------------------------------------------------- /ArchitectureOverview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/focused-dot-io/ai-knowledge-base-demo/HEAD/ArchitectureOverview.png -------------------------------------------------------------------------------- /models/imported_urls.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class ImportedUrls(BaseModel): 5 | page_urls: list -------------------------------------------------------------------------------- /models/imported_pages.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class ImportedPages(BaseModel): 5 | page_ids: list 6 | -------------------------------------------------------------------------------- /models/session.py: -------------------------------------------------------------------------------- 1 | from uuid import UUID 2 | 3 | from pydantic import BaseModel 4 | 5 | 6 | class Session(BaseModel): 7 | session_id: UUID -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | CHAT_MODEL = 'gpt-3.5-turbo' 2 | EMBEDDING_MODEL = 'text-embedding-ada-002' 3 | PINECONE_INDEX = "fl-kb-hub" 4 | PINECONE_ENVIRONMENT = "asia-southeast1-gcp-free" 5 | -------------------------------------------------------------------------------- /models/question.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from uuid import UUID 3 | 4 | from pydantic import BaseModel 5 | 6 | 7 | class Question(BaseModel): 8 | text: str 9 | role: str 10 | session_id: Optional[UUID] -------------------------------------------------------------------------------- /conversation_repository.py: -------------------------------------------------------------------------------- 1 | from fastapi import Depends 2 | 3 | from database import get_db 4 | from models.conversation import Conversation 5 | 6 | 7 | def create_conversation(db: Depends(get_db()), conversation: Conversation): 8 | db.add(conversation) 9 | db.commit() 10 | -------------------------------------------------------------------------------- /init-db.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE conversation 2 | ( 3 | id SERIAL PRIMARY KEY, 4 | session_id UUID NOT NULL, 5 | created_at timestamp NOT NULL, 6 | question varchar NOT NULL, 7 | response varchar NOT NULL, 8 | error_message varchar 9 | ); 10 | -------------------------------------------------------------------------------- /models/conversation.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, Integer, String, UUID 2 | 3 | from database import Base 4 | 5 | 6 | class Conversation(Base): 7 | __tablename__ = "conversation" 8 | id = Column(Integer, primary_key=True, index=True) 9 | session_id = Column(UUID, unique=True, index=True) 10 | created_at = Column(String) 11 | question = Column(String) 12 | response = Column(String) 13 | error_message = Column(String) 14 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | openai==0.27.1 2 | redis==4.5.1 3 | requests==2.28.2 4 | termcolor==2.2.0 5 | jupyter 6 | ipykernel 7 | textract 8 | llama-index==0.6.24 9 | langchain==0.0.239 10 | python-dotenv==1.0.0 11 | httpx 12 | fastapi~=0.98.0 13 | uvicorn~=0.22.0 14 | pinecone-client==2.2.2 15 | transformers 16 | fastapi-sessions 17 | google-api-python-client 18 | google-auth-httplib2 19 | google-auth-oauthlib 20 | google~=3.0.0 21 | pydantic~=1.10.12 22 | SQLAlchemy~=2.0.19 23 | psycopg2-binary -------------------------------------------------------------------------------- /tests/accuracy_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from agent import Agent 4 | from logger import save_question, save_error 5 | 6 | 7 | def accuracy_test(spreadsheet_id, question): 8 | agent = Agent(personality="human") 9 | try: 10 | answer = agent.query_agent(question) 11 | response_formatted = json.loads(answer, strict=False) 12 | save_question(question=question, answer=response_formatted, sheet_id=spreadsheet_id) 13 | except Exception as e: 14 | save_error(question, str(e), spreadsheet_id) 15 | raise e 16 | -------------------------------------------------------------------------------- /database.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dotenv import load_dotenv 4 | from sqlalchemy import create_engine 5 | from sqlalchemy.ext.declarative import declarative_base 6 | from sqlalchemy.orm import sessionmaker 7 | 8 | load_dotenv() 9 | SQLALCHEMY_DATABASE_URL = f"postgresql://doadmin:{os.getenv('DB_PASSWORD')}@db-ai-knowledge-base-do-user-14399519-0.b.db.ondigitalocean.com:25060/{os.getenv('DB_NAME')}?sslmode=require" 10 | 11 | engine = create_engine( 12 | SQLALCHEMY_DATABASE_URL 13 | ) 14 | SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) 15 | 16 | Base = declarative_base() 17 | 18 | 19 | def get_db(): 20 | db = SessionLocal() 21 | try: 22 | yield db 23 | finally: 24 | db.close() 25 | -------------------------------------------------------------------------------- /pinecone_database.py: -------------------------------------------------------------------------------- 1 | from llama_index.vector_stores import PineconeVectorStore 2 | from llama_index import StorageContext, GPTVectorStoreIndex, VectorStoreIndex 3 | from dotenv import load_dotenv 4 | from config import PINECONE_INDEX, PINECONE_ENVIRONMENT 5 | 6 | load_dotenv() 7 | 8 | 9 | def get_pinecone_index(): 10 | storage_context = get_pinecone_storage_context() 11 | index = VectorStoreIndex([], storage_context=storage_context) 12 | return index 13 | 14 | 15 | def get_pinecone_storage_context(): 16 | vector_store = get_vector_store() 17 | return StorageContext.from_defaults(vector_store=vector_store) 18 | 19 | 20 | def get_vector_store(): 21 | return PineconeVectorStore( 22 | index_name=PINECONE_INDEX, 23 | environment=PINECONE_ENVIRONMENT, 24 | # metadata_filters=metadata_filters 25 | ) -------------------------------------------------------------------------------- /tests/test_text_cleaner.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import text_cleaner 4 | 5 | 6 | class TextCleanerTest(unittest.TestCase): 7 | 8 | def test_removes_emojis(self): 9 | self.assertEqual(text_cleaner.remove_emoji('😀 hi'), ' hi') 10 | self.assertEqual(text_cleaner.remove_emoji(' hi'), ' hi') 11 | 12 | def test_remove_specific_characters(self): 13 | self.assertEqual(text_cleaner.remove_specific_characters('hi£££££££'), 'hi') 14 | self.assertEqual(text_cleaner.remove_specific_characters('hi'), 'hi') 15 | self.assertEqual(text_cleaner.remove_specific_characters('hi.'), 'hi.') 16 | 17 | def test_replace_contractions(self): 18 | self.assertEqual(text_cleaner.replace_contractions('hi btw ma\'am'), 'hi by the way madam') 19 | self.assertEqual(text_cleaner.replace_contractions('hi by the way madam'), 'hi by the way madam') 20 | 21 | 22 | if __name__ == '__main__': 23 | unittest.main() 24 | -------------------------------------------------------------------------------- /tests/test_query_service.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import patch, Mock 3 | 4 | import query_service 5 | from agent import Agent 6 | from models.question import Question 7 | 8 | mock_response = f""" 9 | {{ 10 | "result": "Here's an answer to a question!", 11 | "sources": [] 12 | }}""" 13 | 14 | 15 | def mock_response_func(cls, *args, **kwargs): 16 | return mock_response 17 | 18 | 19 | class TestQueryService(unittest.TestCase): 20 | @patch.object(Agent, 'query_agent', mock_response_func) 21 | @patch("conversation_repository.create_conversation") 22 | def test_query_service_query_calls_repo_to_log(self, create_convo_repo: Mock): 23 | under_test = query_service.QueryService() 24 | q = Question(text="Hello question!", session_id="2ddc72f3-b04d-4516-ac80-cff3619eccd4", role="human") 25 | under_test.query( 26 | question=q) 27 | create_convo_repo.assert_called_once() 28 | 29 | 30 | if __name__ == '__main__': 31 | unittest.main() 32 | -------------------------------------------------------------------------------- /tools/focused_labs_q_and_a_tool.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pinecone 4 | from dotenv import load_dotenv 5 | from langchain.chains import RetrievalQA 6 | from langchain.chat_models import ChatOpenAI, openai 7 | from langchain.embeddings import OpenAIEmbeddings 8 | from langchain.vectorstores import Pinecone 9 | from config import PINECONE_ENVIRONMENT, PINECONE_INDEX, EMBEDDING_MODEL 10 | 11 | load_dotenv() 12 | OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') 13 | openai.api_key = OPENAI_API_KEY 14 | 15 | 16 | def create_vector_db_tool(llm: ChatOpenAI): 17 | pinecone.init( 18 | api_key=os.getenv('PINECONE_API_KEY'), 19 | environment=PINECONE_ENVIRONMENT 20 | ) 21 | text_field = "text" 22 | 23 | index = pinecone.Index(PINECONE_INDEX) 24 | 25 | embedding_model = OpenAIEmbeddings( 26 | model=EMBEDDING_MODEL, 27 | openai_api_key=OPENAI_API_KEY 28 | ) 29 | 30 | vectorstore = Pinecone( 31 | index, embedding_model.embed_query, text_field 32 | ) 33 | return RetrievalQA.from_chain_type( 34 | llm=llm, 35 | chain_type="stuff", 36 | return_source_documents=True, 37 | input_key="question", 38 | retriever=vectorstore.as_retriever(search_kwargs={"k": 3}) 39 | ) 40 | -------------------------------------------------------------------------------- /tests/accuracy_test_runner.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from datetime import datetime 3 | 4 | from accuracy_test import accuracy_test 5 | from logger import create_sheet_in_folder 6 | 7 | SHARED_FOLDER_ID = '1O2TcHSz8UhSSJoRzvP7QHhnSLVurY5cC' 8 | 9 | 10 | def ask_questions(question_file_name, questions): 11 | timestamp = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ") 12 | sheet_name = "{file_name}-{ts}".format(file_name=question_file_name.replace('.', '_'), ts=timestamp) 13 | sheet_id = create_sheet_in_folder(sheet_name, folder_id=SHARED_FOLDER_ID, sheet_range='Sheet1', sheet_data=[ 14 | [ 15 | 'Timestamp(UTC)', 16 | 'Session id', 17 | 'Question', 18 | 'Answer', 19 | 'Sources', 20 | 'Error Message', 21 | 'Accuracy(1 - 5 where 5 is the best)', 22 | 'Comments' 23 | ] 24 | ]) 25 | for question in questions: 26 | try: 27 | if not question.strip().startswith('#'): 28 | accuracy_test(sheet_id, question.strip()) 29 | except ValueError as e: 30 | print(f"Error when asking question {question.strip()}: str(e))") 31 | 32 | 33 | if __name__ == "__main__": 34 | question_file = open(sys.argv[1], 'r') 35 | questions = question_file.readlines() 36 | ask_questions(question_file.name, questions) 37 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def output_response(response) -> None: 5 | """ 6 | You may be wondering why aren't we streaming the response using the openai completion API 7 | This is currently in beta in the langchain library, I will update this example 8 | to showcase this as implementation details may change 9 | Since it's flagged as beta adding it here may cause confusion as most 10 | likely it will be changed again within a few weeks 11 | For now output_response will simulate streaming for the purpose of illustration 12 | Args: 13 | response: text output generated by ChatGPT 14 | """ 15 | if not response: 16 | print("There's no response.") 17 | else: 18 | print(response) 19 | print("-----") 20 | 21 | 22 | def is_answer_formatted_in_json(answer): 23 | try: 24 | json.loads(answer, strict=False) 25 | return True 26 | except ValueError: 27 | return False 28 | 29 | 30 | def format_escape_characters(s): 31 | return s.replace('"', '\\"').replace("\n", "\\n") 32 | 33 | 34 | def transform_source_docs(result): 35 | formatted_result_string = format_escape_characters(result["result"]) 36 | if 'source_documents' in result.keys(): 37 | return f""" 38 | {{ 39 | "result": "{formatted_result_string}", 40 | "sources": {json.dumps([i.metadata for i in result['source_documents']])} 41 | }}""" 42 | return f""" 43 | {{ 44 | "result": "{formatted_result_string}", 45 | "sources": [] 46 | }}""" 47 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import unittest 3 | 4 | from langchain.schema import Document 5 | 6 | from utils import transform_source_docs 7 | 8 | mock_result = { 9 | "source_documents": [], 10 | "result": "here's a response!" 11 | } 12 | 13 | mock_result_with_docs = { 14 | "source_documents": [Document( 15 | page_content='some page content', 16 | metadata={'URL': 'url', 17 | 'doc_id': 'doc_id', 18 | 'document_id': 'document_id', 19 | 'node_info': '{"start": 0, "end": 504, "_node_type": "1"}', 20 | 'page_id': 'page_id', 21 | 'ref_doc_id': 'ref_doc_id', 22 | 'relationships': '{"1": "relationship_id"}', 23 | 'title': 'A page title here!'})], 24 | "result": "here's a response!" 25 | } 26 | 27 | 28 | class TestUtils(unittest.TestCase): 29 | 30 | def test_transform_source_docs(self): 31 | response_transformed = transform_source_docs(mock_result) 32 | self.assertIn("here's a response!", response_transformed) 33 | self.assertIn('"sources": []', response_transformed) 34 | json.loads(response_transformed) 35 | 36 | def test_transform_source_docs_with_content(self): 37 | response_transformed = transform_source_docs(mock_result_with_docs) 38 | self.assertIn("here's a response!", response_transformed) 39 | self.assertIn('A page title here', response_transformed) 40 | response = json.loads(response_transformed) 41 | self.assertEqual(1, len(response['sources'])) 42 | 43 | 44 | if __name__ == '__main__': 45 | unittest.main() 46 | -------------------------------------------------------------------------------- /query_service.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import json 3 | from uuid import uuid4 4 | import conversation_repository 5 | from agent import Agent 6 | from database import get_db 7 | from models.conversation import Conversation 8 | from models.question import Question 9 | from models.session import Session 10 | 11 | 12 | class QueryService: 13 | 14 | def __init__(self): 15 | self.agents = {} 16 | 17 | def _create_query_session(self, personality): 18 | session_id = uuid4() 19 | self.agents[session_id] = Agent(personality=personality) 20 | return session_id 21 | 22 | def query(self, question: Question): 23 | session_id = question.session_id 24 | if session_id not in self.agents: 25 | session_id = self._create_query_session(personality=question.role) 26 | try: 27 | agent = self.agents[session_id] 28 | answer = agent.query_agent(user_input=question.text) 29 | response_formatted = json.loads(answer, strict=False) 30 | except Exception as e: 31 | conversation_repository.create_conversation( 32 | db=next(get_db()), 33 | conversation=Conversation(session_id=session_id, question=question.text, created_at=datetime.now(), 34 | response="", error_message=str(e))) 35 | raise e 36 | try: 37 | conversation_repository.create_conversation( 38 | db=next(get_db()), 39 | conversation=Conversation(session_id=session_id, question=question.text, created_at=datetime.now(), 40 | response=response_formatted['result'])) 41 | except Exception as e: 42 | print(f"Failed to log response. Error: {e}") 43 | return {"response": response_formatted, "session_id": session_id} 44 | 45 | def delete_query_session(self, session: Session): 46 | if session.session_id in self.agents: 47 | self.agents.pop(session.session_id) 48 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | from contextlib import asynccontextmanager 4 | 5 | import uvicorn 6 | from dotenv import load_dotenv 7 | from fastapi import FastAPI 8 | from fastapi.middleware.cors import CORSMiddleware 9 | 10 | from import_service import import_web_scrape_data, import_notion_data 11 | from models.imported_pages import ImportedPages 12 | from models.imported_urls import ImportedUrls 13 | from models.question import Question 14 | from models.session import Session 15 | from query_service import QueryService 16 | 17 | load_dotenv() 18 | 19 | allowed_origins = [ 20 | "http://localhost:3000", 21 | "https://fl-ai-knowledgehub-h27h6.ondigitalocean.app/", 22 | "https://dev-kb-xxl7y.ondigitalocean.app/", 23 | "https://chat.withfocus.com/", 24 | "https://chat.focusedlabs.io/" 25 | ] 26 | 27 | query_service = QueryService() 28 | 29 | 30 | def init_logging(): 31 | logging.basicConfig(stream=sys.stdout, level=logging.INFO) 32 | logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout)) 33 | 34 | 35 | 36 | @asynccontextmanager 37 | async def lifespan(app: FastAPI): 38 | init_logging() 39 | 40 | yield 41 | 42 | 43 | app = FastAPI(lifespan=lifespan) 44 | 45 | app.add_middleware( 46 | CORSMiddleware, 47 | allow_origins=allowed_origins, 48 | allow_credentials=True, 49 | allow_methods=["*"], 50 | allow_headers=["*"], 51 | ) 52 | 53 | 54 | @app.post("/load-notion-docs") 55 | def load_notion_documents(imported_pages: ImportedPages): 56 | print(f"Loading the following notion docs {imported_pages}") 57 | import_notion_data(imported_pages.page_ids) 58 | return {"status": "Notion Docs Loaded"} 59 | 60 | 61 | @app.post("/load-website-docs") 62 | def load_web_scrape_documents(website: ImportedUrls): 63 | print(f"Loading following web scraped docs {website.page_urls}") 64 | import_web_scrape_data(website.page_urls) 65 | 66 | 67 | @app.get("/") 68 | async def root(): 69 | return {"message": "Hello World"} 70 | 71 | 72 | @app.post("/query/") 73 | async def query(question: Question): 74 | return query_service.query(question=question) 75 | 76 | 77 | @app.post("/delete_session") 78 | async def delete_session(session: Session): 79 | query_service.delete_query_session(session) 80 | 81 | 82 | if __name__ == "__main__": 83 | uvicorn.run(app, host="0.0.0.0", port=8000) 84 | -------------------------------------------------------------------------------- /tests/questions.txt: -------------------------------------------------------------------------------- 1 | What does Focused Labs do? 2 | Tell me a brief history of Focused Labs. 3 | What type of work does Focused Labs excel at? 4 | What are Focused Labs company values? 5 | Tell me more about what listen first means? 6 | Tell me more about what learn why means? 7 | Tell me more about what love your craft means? 8 | Where is the Denver office located? 9 | Where is the Chicago office located? 10 | Tell me about the Chicago office, what is it like? 11 | Where are the Focused Labs offices located? 12 | Who owns Focused Labs? 13 | Who is the current CEO? 14 | Who are the founders and what is their background? 15 | Who is the leadership team? 16 | How many employees do you have? 17 | Who are the software engineers? 18 | As a potential employee, what programming languages should I know? 19 | What type of skills do you need to be a software engineer here? 20 | What programming languages are Focused labs developers good at? 21 | What does a typical day look like? 22 | What are your working hours? 23 | What does a typical team look like at Focused Labs? 24 | As a potential employee, how long will I be on each project? 25 | What kinds of projects does FL work on? 26 | What clients has Focused Labs worked with? 27 | Tell me more about your work with Hertz 28 | What are your benefits? 29 | What is the expected compensation package for an employee? 30 | What healthcare options does focused labs offer? 31 | What is the typical process of working with FL as a customer? 32 | Why should I trust Focused Labs? 33 | Focused Labs doesn't have expertise in my domain or any particular business domain. What makes you capable of doing the work in my industry? 34 | I have an idea but don't know where to start. How can Focused Labs help? 35 | Can focused Labs help me migrate from on premise to AWS? 36 | Do I want to be a customer? 37 | Why should I work with FL over another similar company or competitor? 38 | What are Focused Labs' rates? 39 | What are my rate negotiation options? 40 | Why is the work you deliver worth those rates? 41 | What are the benefits of custom built software? 42 | What is TDD? 43 | What is agile? 44 | What is Focused Labs' policy on remote work? 45 | What is the company culture like? 46 | What jobs are available now? 47 | As a potential employee, what does Focused Labs' process look like? 48 | As a prospective employee, do I have to always pair program? 49 | Why do you have an AI chatbot? 50 | How do I contact Focused Labs? 51 | Does Focused Labs know a lot about AI? 52 | Write me a haiku about Focused Labs? 53 | If Focused Labs were an animal, what animal would it be? -------------------------------------------------------------------------------- /import_service.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from langchain.chat_models import ChatOpenAI 4 | 5 | from config import CHAT_MODEL 6 | from pinecone_database import get_pinecone_storage_context 7 | from llama_index import NotionPageReader, VectorStoreIndex, download_loader, LLMPredictor, ServiceContext 8 | import os 9 | from dotenv import load_dotenv 10 | from text_cleaner import normalize_text 11 | import http.client 12 | 13 | load_dotenv() 14 | NOTION_API_KEY = os.getenv('NOTION_API_KEY') 15 | 16 | page_titles = [{}] 17 | 18 | 19 | def get_llm_predictor(): 20 | return LLMPredictor(llm=ChatOpenAI(temperature=0, max_tokens=512, model_name=CHAT_MODEL)) 21 | 22 | 23 | def get_service_context(): 24 | llm_predictor_chatgpt = get_llm_predictor() 25 | return ServiceContext.from_defaults(llm_predictor=llm_predictor_chatgpt) 26 | 27 | 28 | def get_notion_metadata(page_id): 29 | try: 30 | headers = {'Authorization': f'Bearer {NOTION_API_KEY}', 'Notion-Version': '2022-06-28'} 31 | connection = http.client.HTTPSConnection("api.notion.com") 32 | 33 | connection.request("GET", f"/v1/pages/{page_id}/properties/title", headers=headers) 34 | page_title = json.loads(connection.getresponse().read()) 35 | 36 | connection.request("GET", f"/v1/pages/{page_id}", headers=headers) 37 | page_url = json.loads(connection.getresponse().read()) 38 | 39 | return {"page_title": page_title['results'][0]['title']['plain_text'], "page_url": page_url['url']} 40 | except Exception as e: 41 | print(f"Failed to retrieve notion metadata{e} for page id: {page_id}") 42 | return {"page_title": "", "page_url": ""} 43 | 44 | 45 | def import_notion_data(page_ids): 46 | documents = NotionPageReader(integration_token=NOTION_API_KEY).load_data(page_ids=page_ids) 47 | for document in documents: 48 | document_metadata = get_notion_metadata(page_id=document.extra_info["page_id"]) 49 | url = document_metadata['page_url'] 50 | title = document_metadata['page_title'] 51 | document.extra_info.update({"URL": url, "title": title}) 52 | document.metadata = ({"URL": url, "title": title}) 53 | document.text = normalize_text(document.text) 54 | 55 | index = VectorStoreIndex.from_documents(documents, 56 | storage_context=get_pinecone_storage_context(), 57 | service_context=get_service_context()) 58 | return index 59 | 60 | 61 | def import_web_scrape_data(urls: list): 62 | BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader") 63 | 64 | loader = BeautifulSoupWebReader() 65 | documents = loader.load_data(urls=urls) 66 | 67 | for document in documents: 68 | document.text = normalize_text(document.text) 69 | 70 | index = VectorStoreIndex.from_documents(documents, 71 | storage_context=get_pinecone_storage_context(), 72 | service_context=get_service_context()) 73 | return index 74 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Local environment variables 2 | .env 3 | Pipfile 4 | 5 | # Credentials for google cloud APIs 6 | service_account_credentials.json 7 | 8 | .idea/ 9 | 10 | # Byte-compiled / optimized / DLL files 11 | __pycache__/ 12 | *.py[cod] 13 | *$py.class 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | share/python-wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .nox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | *.py,cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | cover/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | .pybuilder/ 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # IPython 91 | profile_default/ 92 | ipython_config.py 93 | 94 | # pyenv 95 | # For a library or package, you might want to ignore these files since the code is 96 | # intended to run in multiple environments; otherwise, check them in: 97 | # .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 104 | #Pipfile.lock 105 | 106 | # poetry 107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 108 | # This is especially recommended for binary packages to ensure reproducibility, and is more 109 | # commonly ignored for libraries. 110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 111 | #poetry.lock 112 | 113 | # pdm 114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 115 | #pdm.lock 116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 117 | # in version control. 118 | # https://pdm.fming.dev/#use-with-ide 119 | .pdm.toml 120 | 121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 122 | __pypackages__/ 123 | 124 | # Celery stuff 125 | celerybeat-schedule 126 | celerybeat.pid 127 | 128 | # SageMath parsed files 129 | *.sage.py 130 | 131 | # Environments 132 | .env 133 | .venv 134 | env/ 135 | venv/ 136 | ENV/ 137 | env.bak/ 138 | venv.bak/ 139 | 140 | # Spyder project settings 141 | .spyderproject 142 | .spyproject 143 | 144 | # Rope project settings 145 | .ropeproject 146 | 147 | # mkdocs documentation 148 | /site 149 | 150 | # mypy 151 | .mypy_cache/ 152 | .dmypy.json 153 | dmypy.json 154 | 155 | # Pyre type checker 156 | .pyre/ 157 | 158 | # pytype static type analyzer 159 | .pytype/ 160 | 161 | # Cython debug symbols 162 | cython_debug/ 163 | 164 | # PyCharm 165 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 166 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 167 | # and can be added to the global gitignore or merged into this file. For a more nuclear 168 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 169 | #.idea/ 170 | -------------------------------------------------------------------------------- /agent.py: -------------------------------------------------------------------------------- 1 | from langchain.agents import Tool, ConversationalChatAgent, AgentExecutor 2 | from langchain.chains import RetrievalQA 3 | from langchain.chat_models import ChatOpenAI 4 | from langchain.memory import ConversationBufferWindowMemory 5 | 6 | from config import CHAT_MODEL 7 | from tools.focused_labs_q_and_a_tool import create_vector_db_tool 8 | from utils import is_answer_formatted_in_json, output_response, transform_source_docs 9 | 10 | 11 | class Agent: 12 | 13 | def __init__(self, personality): 14 | self.personality = personality 15 | self.llm = ChatOpenAI(temperature=0, model_name=CHAT_MODEL) 16 | self.agent_executor = self.create_agent_executor() 17 | 18 | def create_agent_executor(self): 19 | q_and_a_tool = create_vector_db_tool(llm=self.llm) 20 | tools = [ 21 | Tool( 22 | name="Focused Labs QA", 23 | return_direct=True, 24 | func=lambda query: _parse_source_docs(q_and_a_tool, query), 25 | description="useful for when you need to answer questions about Focused Labs" 26 | ) 27 | ] 28 | memory = ConversationBufferWindowMemory(llm=self.llm, k=10, memory_key="chat_history", return_messages=True, 29 | human_prefix="user", ai_prefix="assistant", input_key="input") 30 | custom_agent = ConversationalChatAgent.from_llm_and_tools(llm=self.llm, 31 | tools=tools, 32 | verbose=True, 33 | max_iterations=3, 34 | handle_parsing_errors=True, 35 | memory=memory, 36 | input_variables=["input", "chat_history", 37 | "agent_scratchpad"], 38 | system_message= 39 | f""" 40 | Have a conversation with a human, answering the 41 | following as best you can and try to use a tool to help. 42 | You have access to the following tools: 43 | Focused Labs QA-useful for when you need to answer 44 | questions about Focused Labs. If you don't know the 45 | answer don't make one up, just say "Hmm, I'm not sure 46 | please contact work@focusedlabs.io for further assistance." 47 | Answer questions from the perspective of a {self.personality}""" 48 | ) 49 | return AgentExecutor.from_agent_and_tools(agent=custom_agent, tools=tools, memory=memory, 50 | verbose=True) 51 | 52 | def query_agent(self, user_input): 53 | try: 54 | response = self.agent_executor.run(input=user_input) 55 | if is_answer_formatted_in_json(response): 56 | return response 57 | return f""" 58 | {{ 59 | "result": "{response}", 60 | "sources": [] 61 | }}""" 62 | 63 | except ValueError as e: 64 | response = str(e) 65 | response_prefix = "Could not parse LLM output: `\nAI: " 66 | if not response.startswith(response_prefix): 67 | raise e 68 | response_suffix = "`" 69 | if response.startswith(response_prefix): 70 | response = response[len(response_prefix):] 71 | if response.endswith(response_suffix): 72 | response = response[:-len(response_suffix)] 73 | output_response(response) 74 | return response 75 | 76 | 77 | def _parse_source_docs(q_and_a_tool: RetrievalQA, query: str): 78 | result = q_and_a_tool({"question": query}) 79 | return transform_source_docs(result) 80 | -------------------------------------------------------------------------------- /logger.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import base64 4 | import os 5 | import pickle 6 | from datetime import datetime 7 | 8 | from dotenv import load_dotenv 9 | from google.auth.transport.requests import Request 10 | from google_auth_oauthlib.flow import InstalledAppFlow 11 | from googleapiclient.discovery import build 12 | from googleapiclient.errors import HttpError 13 | 14 | load_dotenv() 15 | 16 | 17 | def authenticate(): 18 | creds = None 19 | 20 | token_base64 = os.getenv("GOOGLE_CREDS_TOKEN") 21 | if token_base64: 22 | token_data = base64.b64decode(token_base64) 23 | 24 | creds = pickle.loads(token_data) 25 | 26 | if not creds or not creds.valid: 27 | if creds and creds.expired and creds.refresh_token: 28 | creds.refresh(Request()) 29 | else: 30 | flow = InstalledAppFlow.from_client_secrets_file( 31 | '../service_account_credentials.json', ['https://www.googleapis.com/auth/drive.file', 32 | 'https://www.googleapis.com/auth/spreadsheets']) 33 | creds = flow.run_local_server(port=0) 34 | token_data = pickle.dumps(creds) 35 | 36 | token_base64 = base64.b64encode(token_data).decode() 37 | print("GOOGLE_CREDS_TOKEN in Base64:", token_base64) 38 | 39 | return creds 40 | 41 | 42 | def save_question(question, answer, sheet_id=os.getenv("GOOGLE_API_SPREADSHEET_ID"), 43 | sheet_range=os.getenv("GOOGLE_API_RANGE_NAME"), session_id=""): 44 | try: 45 | creds = authenticate() 46 | sources = "\n".join([i['URL'] for i in answer["sources"]]) if len(answer["sources"]) > 0 else "" 47 | append_values(creds, sheet_id, 48 | sheet_range, "USER_ENTERED", 49 | [ 50 | [ 51 | str(datetime.utcnow()), 52 | str(session_id), 53 | question, 54 | answer["result"], 55 | sources 56 | ] 57 | ]) 58 | except Exception as e: 59 | print(f"Error returned from google authentication: {e}") 60 | 61 | 62 | def save_error(question, message, sheet_id=os.getenv("GOOGLE_API_SPREADSHEET_ID"), 63 | sheet_range=os.getenv("GOOGLE_API_RANGE_NAME"), session_id=""): 64 | try: 65 | creds = authenticate() 66 | append_values(creds, sheet_id, sheet_range, 67 | "USER_ENTERED", 68 | [ 69 | [ 70 | str(datetime.utcnow()), 71 | str(session_id), 72 | question, 73 | "", 74 | "", 75 | message 76 | ] 77 | ]) 78 | except Exception as e: 79 | print(f"Error returned from google authentication: {e}") 80 | 81 | 82 | def create_sheet_in_folder(sheet_name, folder_id, sheet_range=None, sheet_data=None): 83 | creds = authenticate() 84 | 85 | drive_service = build('drive', 'v3', credentials=creds, cache_discovery=False) 86 | 87 | file_metadata = { 88 | 'name': sheet_name, 89 | 'mimeType': 'application/vnd.google-apps.spreadsheet', 90 | 'parents': [folder_id] 91 | } 92 | 93 | file = drive_service.files().create(body=file_metadata, fields='id', supportsAllDrives=True).execute() 94 | 95 | if sheet_data is not None: 96 | append_values(creds, file.get('id'), sheet_range, "USER_ENTERED", sheet_data) 97 | 98 | print('Created new spreadsheet: https://docs.google.com/spreadsheets/d/%s' % file.get('id')) 99 | return file.get('id') 100 | 101 | 102 | def append_values(creds, sheet_id, range_name, value_input_option, values): 103 | try: 104 | sheet_service = build('sheets', 'v4', credentials=creds, cache_discovery=False) 105 | 106 | body = { 107 | 'values': values 108 | } 109 | result = sheet_service.spreadsheets().values().append( 110 | spreadsheetId=sheet_id, range=range_name, 111 | valueInputOption=value_input_option, body=body).execute() 112 | return result 113 | 114 | except HttpError as error: 115 | print(f"An error occurred when trying to append values to google sheet {sheet_id}: {error}") 116 | return error 117 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Focused Labs Knowledge Base Demo · https://chat.withfocus.com/ 2 | 3 | ## Table of Contents 4 | 1. [Focused Labs Knowledge Base Demo](#focused-labs-knowledge-base-demo--httpschatwithfocuscom) 5 | 2. [Why?](#why) 6 | - [Why build this? Why does this matter?](#why-build-this-why-does-this-matter) 7 | 3. [What?](#what) 8 | - [What is in this repository?](#what-is-in-this-repository) 9 | - [Data Inputs](#data-inputs) 10 | 4. [Prerequisites](#prerequisites) 11 | 5. [Getting the demo running...](#getting-the-demo-running) 12 | - [Edit Configuration Files](#edit-configuration-files) 13 | - [Update `.env`](#update-env) 14 | - [Update `config.py`](#update-configpy) 15 | - [The API](#the-api) 16 | - [Run](#run) 17 | - [Endpoints](#endpoints) 18 | - [Run accuracy test](#run-accuracy-test) 19 | 20 | This sample project demonstrates a possible implementation of a domain specific AI Knowledge Base using Focused Labs as 21 | the example. 22 | 23 | The UI (frontend code) lives in [this repository](https://github.com/focused-labs/knowledge-base-demo-ui). 24 | 25 | ## Why? 26 | 27 | #### Why build this? Why does this matter? 28 | 29 | - AI driven solutions will empower organizations to build on top of existing infrastructure and unlock legacy 30 | - Customized AI ChatBots accelerate product development by making disparate and complex information easy to find 31 | - Unblock teams to focus on what matters - building working software - rather than chasing down people and documentation 32 | 33 | ## What? 34 | 35 | #### What is in this repository? 36 | 37 | A python codebase that harnesses the power of semantic vector search fueled by advanced LLMs. 38 | 39 | ![Overview](ArchitectureOverview.png) 40 | 41 | ### Data Inputs: 42 | 43 | 1. Our external Notion wiki 44 | 2. Our public website: [Focused Labs](https://focusedlabs.io/) 45 | 46 | ## Prerequisites 47 | 48 | 1. A Pinecone Vector Database. You can create a free account [at Pinecone's website](https://www.pinecone.io/). 49 | 2. A Open AI API account (api key). You can sign up [at Open AI's website](https://platform.openai.com/signup). 50 | 3. Python (and your favorite IDE). We are using python v3.10.7. 51 | 4. (Optional) Notion API Key. 52 | 53 | 54 | ## Getting the demo running... 55 | 56 | ### Edit Configuration Files 57 | 58 | #### Update `.env` 59 | 60 | ``` 61 | OPENAI_API_KEY = "" 62 | NOTION_API_KEY = "" 63 | PINECONE_API_KEY = "" 64 | ``` 65 | 66 | #### Update `config.py` 67 | 68 | ``` 69 | PINECONE_INDEX = "" 70 | PINECONE_ENVIRONMENT = "" 71 | ``` 72 | 73 | (Optional) You can explore other models. We recommend using the current configuration for best results.Then, ask a teammate to add you to the Focused 74 | Labs Knowledge Base Hub project. 75 | 76 | ### The API 77 | 78 | #### Run 79 | 80 | ``` 81 | uvicorn main:app 82 | ``` 83 | 84 | Add `--reload` if you make a code change the app will restart on its own. 85 | 86 | #### Endpoints 87 | 88 | ``` 89 | 1. Endpoint: / 90 | Description: A simple endpoint that returns a "Hello World" message. 91 | Method: GET 92 | 93 | 94 | 2. Endpoint: /load-notion-docs 95 | Description: Loads documents from Notion based on provided Notion page IDs. 96 | Method: POST 97 | Payload: json 98 | { 99 | "page_ids": ["", "", ...] 100 | } 101 | 102 | 3. Endpoint: /load-website-docs 103 | Description: Loads documents based on provided URLs that are web scraped. 104 | Method: POST 105 | Payload: json 106 | { 107 | "page_urls": ["", "", ...] 108 | } 109 | 110 | 4. Endpoint: /query/ 111 | Description: Accepts a question and role and returns the appropriate query result. 112 | Method: POST 113 | Payload: json 114 | { 115 | "text": "", 116 | "role": "" 117 | } 118 | Description: Accepts a question and returns the appropriate query result. 119 | 120 | 5. Endpoint: /delete_session 121 | Description: Deletes a conversation based on the provided session ID. 122 | Method: POST 123 | Payload: json 124 | { 125 | "session_id": "" 126 | } 127 | 128 | ``` 129 | 130 | ### Run accuracy test 131 | 132 | Cannot be run via command line. 133 | 134 | Edit the run configuration for `accuracy_test_runner.py` and add the file that contains the list of questions you want 135 | to ask. Ex: `questions.txt` 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /text_cleaner.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | import openai 5 | from dotenv import load_dotenv 6 | 7 | load_dotenv() 8 | openai.api_key = os.getenv('OPENAI_API_KEY') 9 | 10 | 11 | def remove_emoji(string): 12 | emoji_pattern = re.compile("[" 13 | u"\U0001F600-\U0001F64F" # emoticons 14 | u"\U0001F300-\U0001F5FF" # symbols & pictographs 15 | u"\U0001F680-\U0001F6FF" # transport & map symbols 16 | u"\U0001F1E0-\U0001F1FF" # flags (iOS) 17 | u"\U00002702-\U000027B0" 18 | u"\U000024C2-\U0001F251" 19 | "]+", flags=re.UNICODE) 20 | return emoji_pattern.sub(r'', string) 21 | 22 | 23 | def remove_specific_characters(string): 24 | puncts = ['\u200d', '?', '....', '..', '...', '#', '"', '|', "'", 25 | '[', ']', '>', '=', '*', '+', '\\', 26 | '•', '~', '£', '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', 27 | '″', '′', 'Â', '█', 28 | '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', 29 | '¦', '║', '―', '¥', '▓', 30 | '—', '‹', '─', '▒', ':', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', 31 | '▲', 'è', '¸', '¾', 32 | 'Ã', '⋅', '‘', '∞', '∙', ')', '↓', '、', '│', '(', '»', ',', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', 33 | '▬', '❤', 'ï', 'Ø', 34 | '¹', '≤', '‡', '√', '!', '🅰', '🅱'] 35 | 36 | for punct in puncts: 37 | string = string.replace(punct, "") 38 | 39 | return string.replace(" ", " ").replace("\n", "; ").replace("\t", " ").replace("\xa0", "") 40 | 41 | 42 | def replace_contractions(string): 43 | contraction_colloq_dict = {"btw": "by the way", "ain't": "is not", "aren't": "are not", "can't": "cannot", 44 | "'cause": "because", "could've": "could have", "couldn't": "could not", 45 | "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", 46 | "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'll": "he will", 47 | "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", 48 | "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 49 | "I'll've": "I will have", "I'm": "I am", "I've": "I have", "i'd": "i would", 50 | "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have", "i'm": "i am", 51 | "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", 52 | "it'll": "it will", "it'll've": "it will have", "it's": "it is", "let's": "let us", 53 | "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not", 54 | "mightn't've": "might not have", "must've": "must have", "mustn't": "must not", 55 | "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have", 56 | "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", 57 | "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", 58 | "she'd": "she would", "she'd've": "she would have", "she'll": "she will", 59 | "she'll've": "she will have", "she's": "she is", "should've": "should have", 60 | "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have", 61 | "so's": "so as", "this's": "this is", "that'd": "that would", 62 | "that'd've": "that would have", "that's": "that is", "there'd": "there would", 63 | "there'd've": "there would have", "there's": "there is", "here's": "here is", 64 | "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 65 | "they'll've": "they will have", "they're": "they are", "they've": "they have", 66 | "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", 67 | "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", 68 | "weren't": "were not", "what'll": "what will", "what'll've": "what will have", 69 | "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is", 70 | "when've": "when have", "where'd": "where did", "where's": "where is", 71 | "where've": "where have", "who'll": "who will", "who'll've": "who will have", 72 | "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", 73 | "will've": "will have", "won't": "will not", "won't've": "will not have", 74 | "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", 75 | "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have", 76 | "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would", 77 | "you'd've": "you would have"} 78 | 79 | for contraction, replacement in contraction_colloq_dict.items(): 80 | string = string.replace(contraction, replacement) 81 | 82 | return string 83 | 84 | 85 | def normalize_text(text): 86 | text = remove_emoji(text) 87 | text = replace_contractions(text) 88 | text = remove_specific_characters(text) 89 | return text 90 | --------------------------------------------------------------------------------