├── models
    ├── __init__.py
    ├── imported_urls.py
    ├── imported_pages.py
    ├── session.py
    ├── question.py
    └── conversation.py
├── tools
    ├── __init__.py
    └── focused_labs_q_and_a_tool.py
├── .python-version
├── ArchitectureOverview.png
├── config.py
├── conversation_repository.py
├── init-db.sql
├── requirements.txt
├── tests
    ├── accuracy_test.py
    ├── test_text_cleaner.py
    ├── test_query_service.py
    ├── accuracy_test_runner.py
    ├── test_utils.py
    └── questions.txt
├── database.py
├── pinecone_database.py
├── utils.py
├── query_service.py
├── main.py
├── import_service.py
├── .gitignore
├── agent.py
├── logger.py
├── README.md
└── text_cleaner.py


/models/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tools/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.10.7
2 | 


--------------------------------------------------------------------------------
/ArchitectureOverview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/focused-dot-io/ai-knowledge-base-demo/HEAD/ArchitectureOverview.png


--------------------------------------------------------------------------------
/models/imported_urls.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 | 
3 | 
# Request body of the website-import endpoint.
class ImportedUrls(BaseModel):
    # URLs of the pages to scrape and index
    page_urls: list


--------------------------------------------------------------------------------
/models/imported_pages.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 | 
3 | 
# Request body of the Notion-import endpoint.
class ImportedPages(BaseModel):
    # Ids of the Notion pages to load and index
    page_ids: list
6 | 


--------------------------------------------------------------------------------
/models/session.py:
--------------------------------------------------------------------------------
1 | from uuid import UUID
2 | 
3 | from pydantic import BaseModel
4 | 
5 | 
# Request body identifying one query session.
class Session(BaseModel):
    # Id of the session, as returned by the query endpoint
    session_id: UUID


--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
# OpenAI chat model handed to ChatOpenAI
CHAT_MODEL = "gpt-3.5-turbo"
# OpenAI model used to embed text for vector search
EMBEDDING_MODEL = "text-embedding-ada-002"
# Pinecone index name and the environment hosting it
PINECONE_INDEX = "fl-kb-hub"
PINECONE_ENVIRONMENT = "asia-southeast1-gcp-free"
5 | 


--------------------------------------------------------------------------------
/models/question.py:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | from uuid import UUID
 3 | 
 4 | from pydantic import BaseModel
 5 | 
 6 | 
 7 | class Question(BaseModel):
 8 |     text: str
 9 |     role: str
10 |     session_id: Optional[UUID]


--------------------------------------------------------------------------------
/conversation_repository.py:
--------------------------------------------------------------------------------
 1 | from fastapi import Depends
 2 | 
 3 | from database import get_db
 4 | from models.conversation import Conversation
 5 | 
 6 | 
 7 | def create_conversation(db: Depends(get_db()), conversation: Conversation):
 8 |     db.add(conversation)
 9 |     db.commit()
10 | 


--------------------------------------------------------------------------------
/init-db.sql:
--------------------------------------------------------------------------------
 1 | CREATE TABLE conversation
 2 | (
 3 |     id           SERIAL PRIMARY KEY,
 4 |     session_id   UUID      NOT NULL,
 5 |     created_at   timestamp NOT NULL,
 6 |     question     varchar NOT NULL,
 7 |     response     varchar NOT NULL,
 8 |     error_message varchar
 9 | );
10 | 


--------------------------------------------------------------------------------
/models/conversation.py:
--------------------------------------------------------------------------------
from sqlalchemy import Column, DateTime, Integer, String, UUID
 2 | 
 3 | from database import Base
 4 | 
 5 | 
 6 | class Conversation(Base):
 7 |     __tablename__ = "conversation"
 8 |     id = Column(Integer, primary_key=True, index=True)
 9 |     session_id = Column(UUID, unique=True, index=True)
10 |     created_at = Column(String)
11 |     question = Column(String)
12 |     response = Column(String)
13 |     error_message = Column(String)
14 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | openai==0.27.1
 2 | redis==4.5.1
 3 | requests==2.28.2
 4 | termcolor==2.2.0
 5 | jupyter
 6 | ipykernel
 7 | textract
 8 | llama-index==0.6.24
 9 | langchain==0.0.239
10 | python-dotenv==1.0.0
11 | httpx
12 | fastapi~=0.98.0
13 | uvicorn~=0.22.0
14 | pinecone-client==2.2.2
15 | transformers
16 | fastapi-sessions
17 | google-api-python-client
18 | google-auth-httplib2
19 | google-auth-oauthlib
20 | google~=3.0.0
21 | pydantic~=1.10.12
22 | SQLAlchemy~=2.0.19
23 | psycopg2-binary


--------------------------------------------------------------------------------
/tests/accuracy_test.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | from agent import Agent
 4 | from logger import save_question, save_error
 5 | 
 6 | 
 7 | def accuracy_test(spreadsheet_id, question):
 8 |     agent = Agent(personality="human")
 9 |     try:
10 |         answer = agent.query_agent(question)
11 |         response_formatted = json.loads(answer, strict=False)
12 |         save_question(question=question, answer=response_formatted, sheet_id=spreadsheet_id)
13 |     except Exception as e:
14 |         save_error(question, str(e), spreadsheet_id)
15 |         raise e
16 | 


--------------------------------------------------------------------------------
/database.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from dotenv import load_dotenv
 4 | from sqlalchemy import create_engine
 5 | from sqlalchemy.ext.declarative import declarative_base
 6 | from sqlalchemy.orm import sessionmaker
 7 | 
 8 | load_dotenv()
 9 | SQLALCHEMY_DATABASE_URL = f"postgresql://doadmin:{os.getenv('DB_PASSWORD')}@db-ai-knowledge-base-do-user-14399519-0.b.db.ondigitalocean.com:25060/{os.getenv('DB_NAME')}?sslmode=require"
10 | 
11 | engine = create_engine(
12 |     SQLALCHEMY_DATABASE_URL
13 | )
14 | SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
15 | 
16 | Base = declarative_base()
17 | 
18 | 
def get_db():
    """Yield a new ``SessionLocal`` session and close it afterwards.

    This is a generator: the session is closed when the generator is resumed
    past the ``yield``, closed, or finalized.
    """
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
25 | 


--------------------------------------------------------------------------------
/pinecone_database.py:
--------------------------------------------------------------------------------
 1 | from llama_index.vector_stores import PineconeVectorStore
 2 | from llama_index import StorageContext, GPTVectorStoreIndex, VectorStoreIndex
 3 | from dotenv import load_dotenv
 4 | from config import PINECONE_INDEX, PINECONE_ENVIRONMENT
 5 | 
 6 | load_dotenv()
 7 | 
 8 | 
 9 | def get_pinecone_index():
10 |     storage_context = get_pinecone_storage_context()
11 |     index = VectorStoreIndex([], storage_context=storage_context)
12 |     return index
13 | 
14 | 
def get_pinecone_storage_context():
    """Return a default ``StorageContext`` whose vector store is the Pinecone store."""
    return StorageContext.from_defaults(vector_store=get_vector_store())
18 | 
19 | 
def get_vector_store():
    """Create the ``PineconeVectorStore`` for the configured index and environment."""
    store_settings = {
        "index_name": PINECONE_INDEX,
        "environment": PINECONE_ENVIRONMENT,
    }
    return PineconeVectorStore(**store_settings)


--------------------------------------------------------------------------------
/tests/test_text_cleaner.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | import text_cleaner
 4 | 
 5 | 
 6 | class TextCleanerTest(unittest.TestCase):
 7 | 
 8 |     def test_removes_emojis(self):
 9 |         self.assertEqual(text_cleaner.remove_emoji('😀 hi'), ' hi')
10 |         self.assertEqual(text_cleaner.remove_emoji(' hi'), ' hi')
11 | 
12 |     def test_remove_specific_characters(self):
13 |         self.assertEqual(text_cleaner.remove_specific_characters('hi£££££££'), 'hi')
14 |         self.assertEqual(text_cleaner.remove_specific_characters('hi'), 'hi')
15 |         self.assertEqual(text_cleaner.remove_specific_characters('hi.'), 'hi.')
16 | 
17 |     def test_replace_contractions(self):
18 |         self.assertEqual(text_cleaner.replace_contractions('hi btw ma\'am'), 'hi by the way madam')
19 |         self.assertEqual(text_cleaner.replace_contractions('hi by the way madam'), 'hi by the way madam')
20 | 
21 | 
# Run the suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()
24 | 


--------------------------------------------------------------------------------
/tests/test_query_service.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | from unittest.mock import patch, Mock
 3 | 
 4 | import query_service
 5 | from agent import Agent
 6 | from models.question import Question
 7 | 
 8 | mock_response = f"""
 9 |             {{
10 |                 "result": "Here's an answer to a question!",
11 |                 "sources": []
12 |             }}"""
13 | 
14 | 
def mock_response_func(cls, *args, **kwargs):
    """Replacement for ``Agent.query_agent``: ignore all arguments, return the canned answer."""
    canned_answer = mock_response
    return canned_answer
17 | 
18 | 
class TestQueryService(unittest.TestCase):
    """Unit tests for ``query_service.QueryService``."""

    # Agent.__init__ builds the chat model and the agent executor; stub it so
    # this test needs neither API keys nor network access.
    @patch.object(Agent, '__init__', lambda self, *args, **kwargs: None)
    @patch.object(Agent, 'query_agent', mock_response_func)
    @patch("conversation_repository.create_conversation")
    def test_query_service_query_calls_repo_to_log(self, create_convo_repo: Mock):
        under_test = query_service.QueryService()
        q = Question(text="Hello question!", session_id="2ddc72f3-b04d-4516-ac80-cff3619eccd4", role="human")

        result = under_test.query(question=q)

        # The exchange is logged exactly once and the parsed answer is returned.
        create_convo_repo.assert_called_once()
        self.assertEqual("Here's an answer to a question!", result["response"]["result"])
28 | 
29 | 
# Run the suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()
32 | 


--------------------------------------------------------------------------------
/tools/focused_labs_q_and_a_tool.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import pinecone
 4 | from dotenv import load_dotenv
 5 | from langchain.chains import RetrievalQA
 6 | from langchain.chat_models import ChatOpenAI, openai
 7 | from langchain.embeddings import OpenAIEmbeddings
 8 | from langchain.vectorstores import Pinecone
 9 | from config import PINECONE_ENVIRONMENT, PINECONE_INDEX, EMBEDDING_MODEL
10 | 
# Make values from a local .env file visible through the environment.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# NOTE(review): `openai` here is the name imported from langchain.chat_models,
# so this sets an attribute on that object, not on the `openai` package —
# confirm this is what was intended.
openai.api_key = OPENAI_API_KEY
14 | 
15 | 
def create_vector_db_tool(llm: ChatOpenAI):
    """Build a ``RetrievalQA`` chain that answers from the Pinecone index.

    Args:
        llm: Chat model the chain uses to compose answers.

    Returns:
        A "stuff" RetrievalQA chain that takes its input under the
        ``question`` key, retrieves with ``k=3`` and returns source documents.
    """
    pinecone.init(
        environment=PINECONE_ENVIRONMENT,
        api_key=os.getenv('PINECONE_API_KEY'),
    )
    pinecone_index = pinecone.Index(PINECONE_INDEX)
    embedder = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=OPENAI_API_KEY)

    # "text" is the field name handed to the Pinecone vector store wrapper.
    store = Pinecone(pinecone_index, embedder.embed_query, "text")
    retriever = store.as_retriever(search_kwargs={"k": 3})

    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        input_key="question",
        return_source_documents=True,
    )
40 | 


--------------------------------------------------------------------------------
/tests/accuracy_test_runner.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from datetime import datetime
 3 | 
 4 | from accuracy_test import accuracy_test
 5 | from logger import create_sheet_in_folder
 6 | 
 7 | SHARED_FOLDER_ID = '1O2TcHSz8UhSSJoRzvP7QHhnSLVurY5cC'
 8 | 
 9 | 
def ask_questions(question_file_name, questions):
    """Run every question through ``accuracy_test`` and log to a new sheet.

    A sheet named ``<file name with dots replaced by underscores>-<UTC
    timestamp>`` is created in the shared folder with a header row, then each
    question is asked in turn.

    Args:
        question_file_name: Name of the questions file; used for the sheet name.
        questions: Iterable of question lines. Blank lines and lines starting
            with ``#`` are skipped.
    """
    timestamp = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
    sheet_name = f"{question_file_name.replace('.', '_')}-{timestamp}"
    header_row = [
        'Timestamp(UTC)',
        'Session id',
        'Question',
        'Answer',
        'Sources',
        'Error Message',
        'Accuracy(1 - 5 where 5 is the best)',
        'Comments',
    ]
    sheet_id = create_sheet_in_folder(sheet_name, folder_id=SHARED_FOLDER_ID, sheet_range='Sheet1',
                                      sheet_data=[header_row])
    for line in questions:
        question = line.strip()
        # Nothing to ask for empty lines; '#' marks a commented-out question.
        if not question or question.startswith('#'):
            continue
        try:
            accuracy_test(sheet_id, question)
        except ValueError as e:
            # ValueError covers unparsable answers; keep going with the rest.
            print(f"Error when asking question {question}: {e}")
31 | 
32 | 
if __name__ == "__main__":
    # Usage: the first command-line argument is the path of the questions file.
    # The context manager closes the file as soon as its lines are read.
    with open(sys.argv[1], 'r') as question_file:
        questions = question_file.readlines()
    ask_questions(question_file.name, questions)
37 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | 
 4 | def output_response(response) -> None:
 5 |     """
 6 |     You may be wondering why aren't we streaming the response using the openai completion API
 7 |     This is currently in beta in the langchain library, I will update this example
 8 |     to showcase this as implementation details may change
 9 |     Since it's flagged as beta adding it here may cause confusion as most
10 |     likely it will be changed again within a few weeks
11 |     For now output_response will simulate streaming for the purpose of illustration
12 |     Args:
13 |         response: text output generated by ChatGPT
14 |     """
15 |     if not response:
16 |         print("There's no response.")
17 |     else:
18 |         print(response)
19 |     print("-----")
20 | 
21 | 
def is_answer_formatted_in_json(answer):
    """Return True when ``answer`` parses as JSON, False otherwise.

    Parsing is lenient about control characters inside strings
    (``strict=False``). Values that are not text at all, such as ``None``,
    are reported as not-JSON instead of raising.
    """
    try:
        json.loads(answer, strict=False)
    except (ValueError, TypeError):
        # ValueError: malformed JSON; TypeError: not str/bytes/bytearray.
        return False
    return True
28 | 
29 | 
def format_escape_characters(s):
    """Escape ``s`` for embedding between double quotes in a JSON document.

    Backslashes, double quotes, newlines, carriage returns and tabs are
    replaced by their JSON escape sequences in a single pass, so a backslash
    introduced by one replacement is never escaped a second time.

    Args:
        s: The raw text.

    Returns:
        The escaped text (without surrounding quotes).
    """
    json_escapes = str.maketrans({
        "\\": "\\\\",
        '"': '\\"',
        "\n": "\\n",
        "\r": "\\r",
        "\t": "\\t",
    })
    return s.translate(json_escapes)
32 | 
33 | 
def transform_source_docs(result):
    """Render a QA result as a JSON document with ``result`` and ``sources``.

    Args:
        result: Mapping with the answer text under ``"result"`` and,
            optionally, ``"source_documents"``: objects whose ``metadata``
            attribute is JSON-serializable.

    Returns:
        A JSON string ``{"result": <text>, "sources": [<metadata>, ...]}``;
        ``sources`` is empty when no source documents are present.
    """
    # json.dumps produces a fully escaped JSON string literal (quotes,
    # backslashes, control characters), including the surrounding quotes.
    result_literal = json.dumps(result["result"], ensure_ascii=False)
    if 'source_documents' in result.keys():
        sources_literal = json.dumps([doc.metadata for doc in result['source_documents']])
        return f"""
            {{
            "result": {result_literal},
            "sources": {sources_literal}
            }}"""
    return f"""
        {{
        "result": {result_literal},
        "sources": []
        }}"""
47 | 


--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import unittest
 3 | 
 4 | from langchain.schema import Document
 5 | 
 6 | from utils import transform_source_docs
 7 | 
 8 | mock_result = {
 9 |     "source_documents": [],
10 |     "result": "here's a response!"
11 | }
12 | 
13 | mock_result_with_docs = {
14 |     "source_documents": [Document(
15 |         page_content='some page content',
16 |         metadata={'URL': 'url',
17 |                   'doc_id': 'doc_id',
18 |                   'document_id': 'document_id',
19 |                   'node_info': '{"start": 0, "end": 504, "_node_type": "1"}',
20 |                   'page_id': 'page_id',
21 |                   'ref_doc_id': 'ref_doc_id',
22 |                   'relationships': '{"1": "relationship_id"}',
23 |                   'title': 'A page title here!'})],
24 |     "result": "here's a response!"
25 | }
26 | 
27 | 
class TestUtils(unittest.TestCase):
    """Checks that ``transform_source_docs`` emits parseable JSON with the expected content."""

    def test_transform_source_docs(self):
        rendered = transform_source_docs(mock_result)
        self.assertIn("here's a response!", rendered)
        self.assertIn('"sources": []', rendered)
        # Must parse as JSON; raises otherwise.
        json.loads(rendered)

    def test_transform_source_docs_with_content(self):
        rendered = transform_source_docs(mock_result_with_docs)
        self.assertIn("here's a response!", rendered)
        self.assertIn('A page title here', rendered)
        parsed = json.loads(rendered)
        self.assertEqual(1, len(parsed['sources']))
42 | 
43 | 
# Run the suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()
46 | 


--------------------------------------------------------------------------------
/query_service.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime
 2 | import json
 3 | from uuid import uuid4
 4 | import conversation_repository
 5 | from agent import Agent
 6 | from database import get_db
 7 | from models.conversation import Conversation
 8 | from models.question import Question
 9 | from models.session import Session
10 | 
11 | 
class QueryService:
    """Routes questions to per-session agents and logs every exchange."""

    def __init__(self):
        # Live agents keyed by session id
        self.agents = {}

    def _create_query_session(self, personality):
        """Create an agent under a newly generated session id and return the id."""
        session_id = uuid4()
        self.agents[session_id] = Agent(personality=personality)
        return session_id

    @staticmethod
    def _log_conversation(**fields):
        """Insert one ``Conversation`` row stamped with the current time.

        Args:
            **fields: Column values for the row (session_id, question,
                response and optionally error_message).
        """
        db_provider = get_db()
        try:
            conversation_repository.create_conversation(
                db=next(db_provider),
                conversation=Conversation(created_at=datetime.now(), **fields))
        finally:
            # Closing the generator runs get_db()'s cleanup at a defined
            # point, after the insert has finished.
            db_provider.close()

    def query(self, question: Question):
        """Answer a question with the session's agent and log the exchange.

        A new session (and agent) is created when the question's session id
        is not known to this service.

        Args:
            question: The question text, the role used as personality for a
                new agent, and an optional session id.

        Returns:
            ``{"response": <parsed JSON answer>, "session_id": <session id>}``.

        Raises:
            Exception: Whatever the agent call or the JSON parsing raised.
                The failure is logged first, on a best-effort basis.
        """
        session_id = question.session_id
        if session_id not in self.agents:
            session_id = self._create_query_session(personality=question.role)
        try:
            agent = self.agents[session_id]
            answer = agent.query_agent(user_input=question.text)
            response_formatted = json.loads(answer, strict=False)
        except Exception as e:
            try:
                self._log_conversation(session_id=session_id, question=question.text,
                                       response="", error_message=str(e))
            except Exception as log_error:
                # A logging failure must not replace the error being reported.
                print(f"Failed to log error. Error: {log_error}")
            raise
        try:
            self._log_conversation(session_id=session_id, question=question.text,
                                   response=response_formatted['result'])
        except Exception as e:
            # Logging is best effort; the caller still gets the answer.
            print(f"Failed to log response. Error: {e}")
        return {"response": response_formatted, "session_id": session_id}

    def delete_query_session(self, session: Session):
        """Drop the agent for ``session.session_id``; unknown ids are ignored."""
        self.agents.pop(session.session_id, None)
48 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import sys
 3 | from contextlib import asynccontextmanager
 4 | 
 5 | import uvicorn
 6 | from dotenv import load_dotenv
 7 | from fastapi import FastAPI
 8 | from fastapi.middleware.cors import CORSMiddleware
 9 | 
10 | from import_service import import_web_scrape_data, import_notion_data
11 | from models.imported_pages import ImportedPages
12 | from models.imported_urls import ImportedUrls
13 | from models.question import Question
14 | from models.session import Session
15 | from query_service import QueryService
16 | 
17 | load_dotenv()
18 | 
# Origins allowed to call this API from a browser. An Origin header is
# scheme://host[:port] with no path, and the CORS middleware compares it to
# these strings, so entries must not end with a slash.
allowed_origins = [
    "http://localhost:3000",
    "https://fl-ai-knowledgehub-h27h6.ondigitalocean.app",
    "https://dev-kb-xxl7y.ondigitalocean.app",
    "https://chat.withfocus.com",
    "https://chat.focusedlabs.io",
]
26 | 
27 | query_service = QueryService()
28 | 
29 | 
def init_logging():
    """Send root-logger output to stdout, attaching at most one stdout handler.

    ``basicConfig`` installs a stdout handler (and the INFO level) only when
    the root logger has no handlers yet. If it was already configured some
    other way, a stdout handler is added explicitly. It is never added twice,
    so records are not printed in duplicate.
    """
    root_logger = logging.getLogger()
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    writes_to_stdout = any(
        isinstance(handler, logging.StreamHandler) and getattr(handler, "stream", None) is sys.stdout
        for handler in root_logger.handlers
    )
    if not writes_to_stdout:
        root_logger.addHandler(logging.StreamHandler(stream=sys.stdout))
33 | 
34 | 
35 | 
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Lifespan hook: set up logging, then hand control to the application."""
    init_logging()
    yield
41 | 
42 | 
app = FastAPI(lifespan=lifespan)

# CORS: only the listed origins, with credentials, any method and any header.
_cors_settings = {
    "allow_origins": allowed_origins,
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_settings)
52 | 
53 | 
@app.post("/load-notion-docs")
def load_notion_documents(imported_pages: ImportedPages):
    # Import the requested Notion pages, then acknowledge.
    print(f"Loading the following notion docs {imported_pages}")
    import_notion_data(imported_pages.page_ids)
    acknowledgement = {"status": "Notion Docs Loaded"}
    return acknowledgement
59 | 
60 | 
@app.post("/load-website-docs")
def load_web_scrape_documents(website: ImportedUrls):
    # Scrape and index the requested pages, then acknowledge in the same
    # shape as the Notion import endpoint.
    print(f"Loading following web scraped docs {website.page_urls}")
    import_web_scrape_data(website.page_urls)
    return {"status": "Website Docs Loaded"}
65 | 
66 | 
@app.get("/")
async def root():
    # Fixed greeting payload
    greeting = {"message": "Hello World"}
    return greeting
70 | 
71 | 
@app.post("/query/")
def query(question: Question):
    # Plain ``def`` on purpose: QueryService.query is synchronous, and FastAPI
    # runs non-async endpoints in a worker thread, so a slow answer does not
    # block the event loop for other requests.
    return query_service.query(question=question)
75 | 
76 | 
@app.post("/delete_session")
async def delete_session(session: Session):
    # Drop the agent kept for this session; nothing is returned.
    query_service.delete_query_session(session)
    return None
80 | 
81 | 
# Serve the app on all interfaces, port 8000, when run as a script.
if __name__ == "__main__":
    uvicorn.run(app, port=8000, host="0.0.0.0")
84 | 


--------------------------------------------------------------------------------
/tests/questions.txt:
--------------------------------------------------------------------------------
 1 | What does Focused Labs do?
 2 | Tell me a brief history of Focused Labs.
 3 | What type of work does Focused Labs excel at?
 4 | What are Focused Labs company values?
 5 | Tell me more about what listen first means?
 6 | Tell me more about what learn why means?
 7 | Tell me more about what love your craft means?
 8 | Where is the Denver office located?
 9 | Where is the Chicago office located?
10 | Tell me about the Chicago office, what is it like?
11 | Where are the Focused Labs offices located?
12 | Who owns Focused Labs?
13 | Who is the current CEO?
14 | Who are the founders and what is their background?
15 | Who is the leadership team?
16 | How many employees do you have?
17 | Who are the software engineers?
18 | As a potential employee, what programming languages should I know?
19 | What type of skills do you need to be a software engineer here?
20 | What programming languages are Focused labs developers good at?
21 | What does a typical day look like?
22 | What are your working hours?
23 | What does a typical team look like at Focused Labs?
24 | As a potential employee, how long will I be on each project?
25 | What kinds of projects does FL work on?
26 | What clients has Focused Labs worked with?
27 | Tell me more about your work with Hertz
28 | What are your benefits?
29 | What is the expected compensation package for an employee?
30 | What healthcare options does focused labs offer?
31 | What is the typical process of working with FL as a customer?
32 | Why should I trust Focused Labs?
33 | Focused Labs doesn't have expertise in my domain or any particular business domain. What makes you capable of doing the work in my industry?
34 | I have an idea but don't know where to start. How can Focused Labs help?
35 | Can focused Labs help me migrate from on premise to AWS?
36 | Do I want to be a customer?
37 | Why should I work with FL over another similar company or competitor?
38 | What are Focused Labs' rates?
39 | What are my rate negotiation options?
40 | Why is the work you deliver worth those rates?
41 | What are the benefits of custom built software?
42 | What is TDD?
43 | What is agile?
44 | What is Focused Labs' policy on remote work?
45 | What is the company culture like?
46 | What jobs are available now?
47 | As a potential employee, what does Focused Labs' process look like?
48 | As a prospective employee, do I have to always pair program?
49 | Why do you have an AI chatbot?
50 | How do I contact Focused Labs?
51 | Does Focused Labs know a lot about AI?
52 | Write me a haiku about Focused Labs?
53 | If Focused Labs were an animal, what animal would it be?


--------------------------------------------------------------------------------
/import_service.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | from langchain.chat_models import ChatOpenAI
 4 | 
 5 | from config import CHAT_MODEL
 6 | from pinecone_database import get_pinecone_storage_context
 7 | from llama_index import NotionPageReader, VectorStoreIndex, download_loader, LLMPredictor, ServiceContext
 8 | import os
 9 | from dotenv import load_dotenv
10 | from text_cleaner import normalize_text
11 | import http.client
12 | 
# Make values from a local .env file visible through the environment.
load_dotenv()
NOTION_API_KEY = os.environ.get("NOTION_API_KEY")

# NOTE(review): not referenced by the functions in this module — confirm
# whether anything else still relies on it.
page_titles = [dict()]
17 | 
18 | 
def get_llm_predictor():
    """Return an ``LLMPredictor`` around the configured chat model (temperature 0, 512 max tokens)."""
    chat_model = ChatOpenAI(model_name=CHAT_MODEL, temperature=0, max_tokens=512)
    return LLMPredictor(llm=chat_model)
21 | 
22 | 
def get_service_context():
    """Return a default ``ServiceContext`` that uses ``get_llm_predictor()``."""
    return ServiceContext.from_defaults(llm_predictor=get_llm_predictor())
26 | 
27 | 
def get_notion_metadata(page_id):
    """Fetch the title and URL of a Notion page.

    Best effort: on any failure the error is printed and empty strings are
    returned, so a single bad page does not abort an import.

    Args:
        page_id: Id of the Notion page.

    Returns:
        ``{"page_title": <str>, "page_url": <str>}``; both values are ``""``
        when the lookup failed.
    """
    connection = None
    try:
        headers = {'Authorization': f'Bearer {NOTION_API_KEY}', 'Notion-Version': '2022-06-28'}
        # The timeout keeps an unresponsive API from stalling the import.
        connection = http.client.HTTPSConnection("api.notion.com", timeout=30)

        connection.request("GET", f"/v1/pages/{page_id}/properties/title", headers=headers)
        page_title = json.loads(connection.getresponse().read())

        connection.request("GET", f"/v1/pages/{page_id}", headers=headers)
        page_url = json.loads(connection.getresponse().read())

        return {"page_title": page_title['results'][0]['title']['plain_text'], "page_url": page_url['url']}
    except Exception as e:
        print(f"Failed to retrieve notion metadata for page id {page_id}: {e}")
        return {"page_title": "", "page_url": ""}
    finally:
        # Release the socket on both the success and the failure path.
        if connection is not None:
            connection.close()
43 | 
44 | 
def import_notion_data(page_ids):
    """Load Notion pages, enrich and normalize them, and index them in Pinecone.

    Each document gets the page's URL and title (from ``get_notion_metadata``)
    merged into ``extra_info`` and set as ``metadata``, and its text passed
    through ``normalize_text``.

    Args:
        page_ids: Ids of the Notion pages to load.

    Returns:
        The ``VectorStoreIndex`` built from the documents.
    """
    reader = NotionPageReader(integration_token=NOTION_API_KEY)
    documents = reader.load_data(page_ids=page_ids)

    for doc in documents:
        page_info = get_notion_metadata(page_id=doc.extra_info["page_id"])
        url_and_title = {"URL": page_info['page_url'], "title": page_info['page_title']}
        doc.extra_info.update(url_and_title)
        # Separate dict object so metadata and extra_info do not share state.
        doc.metadata = dict(url_and_title)
        doc.text = normalize_text(doc.text)

    return VectorStoreIndex.from_documents(
        documents,
        storage_context=get_pinecone_storage_context(),
        service_context=get_service_context(),
    )
59 | 
60 | 
def import_web_scrape_data(urls: list):
    """Scrape the given URLs, normalize the text, and index it in Pinecone.

    Args:
        urls: URLs of the pages to load with the BeautifulSoup web reader.

    Returns:
        The ``VectorStoreIndex`` built from the scraped documents.
    """
    reader_cls = download_loader("BeautifulSoupWebReader")
    scraped_docs = reader_cls().load_data(urls=urls)

    for doc in scraped_docs:
        doc.text = normalize_text(doc.text)

    return VectorStoreIndex.from_documents(
        scraped_docs,
        storage_context=get_pinecone_storage_context(),
        service_context=get_service_context(),
    )
74 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Local environment variables
  2 | .env
  3 | Pipfile
  4 | 
  5 | # Credentials for google cloud APIs
  6 | service_account_credentials.json
  7 | 
  8 | .idea/
  9 | 
 10 | # Byte-compiled / optimized / DLL files
 11 | __pycache__/
 12 | *.py[cod]
 13 | *$py.class
 14 | 
 15 | # C extensions
 16 | *.so
 17 | 
 18 | # Distribution / packaging
 19 | .Python
 20 | build/
 21 | develop-eggs/
 22 | dist/
 23 | downloads/
 24 | eggs/
 25 | .eggs/
 26 | lib/
 27 | lib64/
 28 | parts/
 29 | sdist/
 30 | var/
 31 | wheels/
 32 | share/python-wheels/
 33 | *.egg-info/
 34 | .installed.cfg
 35 | *.egg
 36 | MANIFEST
 37 | 
 38 | # PyInstaller
 39 | #  Usually these files are written by a python script from a template
 40 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 41 | *.manifest
 42 | *.spec
 43 | 
 44 | # Installer logs
 45 | pip-log.txt
 46 | pip-delete-this-directory.txt
 47 | 
 48 | # Unit test / coverage reports
 49 | htmlcov/
 50 | .tox/
 51 | .nox/
 52 | .coverage
 53 | .coverage.*
 54 | .cache
 55 | nosetests.xml
 56 | coverage.xml
 57 | *.cover
 58 | *.py,cover
 59 | .hypothesis/
 60 | .pytest_cache/
 61 | cover/
 62 | 
 63 | # Translations
 64 | *.mo
 65 | *.pot
 66 | 
 67 | # Django stuff:
 68 | *.log
 69 | local_settings.py
 70 | db.sqlite3
 71 | db.sqlite3-journal
 72 | 
 73 | # Flask stuff:
 74 | instance/
 75 | .webassets-cache
 76 | 
 77 | # Scrapy stuff:
 78 | .scrapy
 79 | 
 80 | # Sphinx documentation
 81 | docs/_build/
 82 | 
 83 | # PyBuilder
 84 | .pybuilder/
 85 | target/
 86 | 
 87 | # Jupyter Notebook
 88 | .ipynb_checkpoints
 89 | 
 90 | # IPython
 91 | profile_default/
 92 | ipython_config.py
 93 | 
 94 | # pyenv
 95 | #   For a library or package, you might want to ignore these files since the code is
 96 | #   intended to run in multiple environments; otherwise, check them in:
 97 | # .python-version
 98 | 
 99 | # pipenv
100 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
101 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
102 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
103 | #   install all needed dependencies.
104 | #Pipfile.lock
105 | 
106 | # poetry
107 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
108 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
109 | #   commonly ignored for libraries.
110 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
111 | #poetry.lock
112 | 
113 | # pdm
114 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
115 | #pdm.lock
116 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
117 | #   in version control.
118 | #   https://pdm.fming.dev/#use-with-ide
119 | .pdm.toml
120 | 
121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
122 | __pypackages__/
123 | 
124 | # Celery stuff
125 | celerybeat-schedule
126 | celerybeat.pid
127 | 
128 | # SageMath parsed files
129 | *.sage.py
130 | 
131 | # Environments
132 | .env
133 | .venv
134 | env/
135 | venv/
136 | ENV/
137 | env.bak/
138 | venv.bak/
139 | 
140 | # Spyder project settings
141 | .spyderproject
142 | .spyproject
143 | 
144 | # Rope project settings
145 | .ropeproject
146 | 
147 | # mkdocs documentation
148 | /site
149 | 
150 | # mypy
151 | .mypy_cache/
152 | .dmypy.json
153 | dmypy.json
154 | 
155 | # Pyre type checker
156 | .pyre/
157 | 
158 | # pytype static type analyzer
159 | .pytype/
160 | 
161 | # Cython debug symbols
162 | cython_debug/
163 | 
164 | # PyCharm
165 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
166 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
167 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
168 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
169 | #.idea/
170 | 


--------------------------------------------------------------------------------
/agent.py:
--------------------------------------------------------------------------------
 1 | from langchain.agents import Tool, ConversationalChatAgent, AgentExecutor
 2 | from langchain.chains import RetrievalQA
 3 | from langchain.chat_models import ChatOpenAI
 4 | from langchain.memory import ConversationBufferWindowMemory
 5 | 
 6 | from config import CHAT_MODEL
 7 | from tools.focused_labs_q_and_a_tool import create_vector_db_tool
 8 | from utils import is_answer_formatted_in_json, output_response, transform_source_docs
 9 | 
10 | 
class Agent:
    """Conversational agent that answers questions about Focused Labs.

    The agent wraps a chat model, a single vector-database question-answering
    tool, and a sliding-window conversation memory inside an ``AgentExecutor``.
    """

    def __init__(self, personality):
        """Create the chat model and the agent executor.

        Args:
            personality: Text interpolated into the system prompt as the
                perspective the agent should answer from.
        """
        self.personality = personality
        self.llm = ChatOpenAI(temperature=0, model_name=CHAT_MODEL)
        self.agent_executor = self.create_agent_executor()

    def create_agent_executor(self):
        """Build the ``AgentExecutor`` used by :meth:`query_agent`.

        Returns:
            An executor combining a ``ConversationalChatAgent``, the
            "Focused Labs QA" tool and a 10-message window memory.
        """
        q_and_a_tool = create_vector_db_tool(llm=self.llm)
        tools = [
            Tool(
                name="Focused Labs QA",
                # The tool output is handed back to the caller as the final answer.
                return_direct=True,
                func=lambda query: _parse_source_docs(q_and_a_tool, query),
                description="useful for when you need to answer questions about Focused Labs"
            )
        ]
        memory = ConversationBufferWindowMemory(llm=self.llm, k=10, memory_key="chat_history", return_messages=True,
                                                human_prefix="user", ai_prefix="assistant", input_key="input")
        # NOTE(review): max_iterations, handle_parsing_errors and memory look like
        # executor-level options; confirm they have any effect when passed to the agent.
        custom_agent = ConversationalChatAgent.from_llm_and_tools(llm=self.llm,
                                                                  tools=tools,
                                                                  verbose=True,
                                                                  max_iterations=3,
                                                                  handle_parsing_errors=True,
                                                                  memory=memory,
                                                                  input_variables=["input", "chat_history",
                                                                                   "agent_scratchpad"],
                                                                  system_message=
                                                                  f"""
                                                                  Have a conversation with a human, answering the 
                                                                  following as best you can and try to use a tool to help. 
                                                                  You have access to the following tools: 
                                                                  Focused Labs QA-useful for when you need to answer
                                                                  questions about Focused Labs. If you don't know the 
                                                                  answer don't make one up, just say "Hmm, I'm not sure 
                                                                  please contact work@focusedlabs.io for further assistance."
                                                                  Answer questions from the perspective of a {self.personality}"""
                                                                  )
        return AgentExecutor.from_agent_and_tools(agent=custom_agent, tools=tools, memory=memory,
                                                  verbose=True)

    def query_agent(self, user_input):
        """Run the agent on ``user_input`` and return its answer as a string.

        Args:
            user_input: The user's question.

        Returns:
            The agent response unchanged when it is already JSON; otherwise a
            JSON object string with the response under ``"result"`` and an
            empty ``"sources"`` list. When the executor fails with an
            output-parsing ``ValueError``, the raw model text extracted from
            the error message is returned instead.

        Raises:
            ValueError: Any ``ValueError`` that is not an output-parsing error.
        """
        # Local import keeps this block self-contained; json is standard library.
        import json

        try:
            response = self.agent_executor.run(input=user_input)
            if is_answer_formatted_in_json(response):
                return response
            # Serialise with json.dumps so quotes, backslashes and newlines in the
            # model's answer are escaped and the payload is always valid JSON.
            return json.dumps({"result": response, "sources": []})

        except ValueError as e:
            response = str(e)
            response_prefix = "Could not parse LLM output: `\nAI: "
            if not response.startswith(response_prefix):
                raise
            # The error message embeds the model's free-text reply between the
            # prefix and a closing backtick; recover it.
            response = response[len(response_prefix):]
            response_suffix = "`"
            if response.endswith(response_suffix):
                response = response[:-len(response_suffix)]
            output_response(response)
            # NOTE(review): this path returns plain text, not JSON — confirm callers handle it.
            return response
75 | 
76 | 
def _parse_source_docs(q_and_a_tool: RetrievalQA, query: str):
    """Ask the QA tool ``query`` and return its result passed through ``transform_source_docs``."""
    return transform_source_docs(q_and_a_tool({"question": query}))
80 | 


--------------------------------------------------------------------------------
/logger.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | 
  3 | import base64
  4 | import os
  5 | import pickle
  6 | from datetime import datetime
  7 | 
  8 | from dotenv import load_dotenv
  9 | from google.auth.transport.requests import Request
 10 | from google_auth_oauthlib.flow import InstalledAppFlow
 11 | from googleapiclient.discovery import build
 12 | from googleapiclient.errors import HttpError
 13 | 
# Load variables from a .env file into the process environment. This must run
# at import time, before the definitions below: save_question and save_error
# read GOOGLE_API_* values with os.getenv in their default arguments.
load_dotenv()
 15 | 
 16 | 
 17 | def authenticate():
 18 |     creds = None
 19 | 
 20 |     token_base64 = os.getenv("GOOGLE_CREDS_TOKEN")
 21 |     if token_base64:
 22 |         token_data = base64.b64decode(token_base64)
 23 | 
 24 |         creds = pickle.loads(token_data)
 25 | 
 26 |     if not creds or not creds.valid:
 27 |         if creds and creds.expired and creds.refresh_token:
 28 |             creds.refresh(Request())
 29 |         else:
 30 |             flow = InstalledAppFlow.from_client_secrets_file(
 31 |                 '../service_account_credentials.json', ['https://www.googleapis.com/auth/drive.file',
 32 |                                                         'https://www.googleapis.com/auth/spreadsheets'])
 33 |             creds = flow.run_local_server(port=0)
 34 |             token_data = pickle.dumps(creds)
 35 | 
 36 |             token_base64 = base64.b64encode(token_data).decode()
 37 |             print("GOOGLE_CREDS_TOKEN in Base64:", token_base64)
 38 | 
 39 |     return creds
 40 | 
 41 | 
 42 | def save_question(question, answer, sheet_id=os.getenv("GOOGLE_API_SPREADSHEET_ID"),
 43 |                   sheet_range=os.getenv("GOOGLE_API_RANGE_NAME"), session_id=""):
 44 |     try:
 45 |         creds = authenticate()
 46 |         sources = "\n".join([i['URL'] for i in answer["sources"]]) if len(answer["sources"]) > 0 else ""
 47 |         append_values(creds, sheet_id,
 48 |                       sheet_range, "USER_ENTERED",
 49 |                       [
 50 |                           [
 51 |                               str(datetime.utcnow()),
 52 |                               str(session_id),
 53 |                               question,
 54 |                               answer["result"],
 55 |                               sources
 56 |                           ]
 57 |                       ])
 58 |     except Exception as e:
 59 |         print(f"Error returned from google authentication: {e}")
 60 | 
 61 | 
 62 | def save_error(question, message, sheet_id=os.getenv("GOOGLE_API_SPREADSHEET_ID"),
 63 |                sheet_range=os.getenv("GOOGLE_API_RANGE_NAME"), session_id=""):
 64 |     try:
 65 |         creds = authenticate()
 66 |         append_values(creds, sheet_id, sheet_range,
 67 |                       "USER_ENTERED",
 68 |                       [
 69 |                           [
 70 |                               str(datetime.utcnow()),
 71 |                               str(session_id),
 72 |                               question,
 73 |                               "",
 74 |                               "",
 75 |                               message
 76 |                           ]
 77 |                       ])
 78 |     except Exception as e:
 79 |         print(f"Error returned from google authentication: {e}")
 80 | 
 81 | 
 82 | def create_sheet_in_folder(sheet_name, folder_id, sheet_range=None, sheet_data=None):
 83 |     creds = authenticate()
 84 | 
 85 |     drive_service = build('drive', 'v3', credentials=creds, cache_discovery=False)
 86 | 
 87 |     file_metadata = {
 88 |         'name': sheet_name,
 89 |         'mimeType': 'application/vnd.google-apps.spreadsheet',
 90 |         'parents': [folder_id]
 91 |     }
 92 | 
 93 |     file = drive_service.files().create(body=file_metadata, fields='id', supportsAllDrives=True).execute()
 94 | 
 95 |     if sheet_data is not None:
 96 |         append_values(creds, file.get('id'), sheet_range, "USER_ENTERED", sheet_data)
 97 | 
 98 |     print('Created new spreadsheet: https://docs.google.com/spreadsheets/d/%s' % file.get('id'))
 99 |     return file.get('id')
100 | 
101 | 
def append_values(creds, sheet_id, range_name, value_input_option, values):
    """Append rows to a Google Sheet.

    Args:
        creds: Credentials used to build the Sheets service.
        sheet_id: Id of the spreadsheet to append to.
        range_name: Range the rows are appended to.
        value_input_option: Passed through as ``valueInputOption``.
        values: List of rows, each a list of cell values.

    Returns:
        The API response on success. On ``HttpError`` the error is printed and
        the error object itself is returned instead of being raised.
    """
    try:
        sheets_api = build('sheets', 'v4', credentials=creds, cache_discovery=False)
        append_request = sheets_api.spreadsheets().values().append(
            spreadsheetId=sheet_id,
            range=range_name,
            valueInputOption=value_input_option,
            body={'values': values},
        )
        return append_request.execute()
    except HttpError as error:
        print(f"An error occurred when trying to append values to google sheet {sheet_id}: {error}")
        return error
117 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Focused Labs Knowledge Base Demo · https://chat.withfocus.com/
  2 | 
  3 | ## Table of Contents
  4 | 1. [Focused Labs Knowledge Base Demo](#focused-labs-knowledge-base-demo--httpschatwithfocuscom)
  5 | 2. [Why?](#why)
  6 |     - [Why build this? Why does this matter?](#why-build-this-why-does-this-matter)
  7 | 3. [What?](#what)
  8 |     - [What is in this repository?](#what-is-in-this-repository)
  9 |     - [Data Inputs](#data-inputs)
 10 | 4. [Prerequisites](#prerequisites)
 11 | 5. [Getting the demo running...](#getting-the-demo-running)
 12 |     - [Edit Configuration Files](#edit-configuration-files)
 13 |         - [Update `.env`](#update-env)
 14 |         - [Update `config.py`](#update-configpy)
 15 |     - [The API](#the-api)
 16 |         - [Run](#run)
 17 |         - [Endpoints](#endpoints)
 18 |     - [Run accuracy test](#run-accuracy-test)
 19 | 
 20 | This sample project demonstrates a possible implementation of a domain specific AI Knowledge Base using Focused Labs as
 21 | the example.
 22 | 
 23 | The UI (frontend code) lives in [this repository](https://github.com/focused-labs/knowledge-base-demo-ui).
 24 | 
 25 | ## Why?
 26 | 
 27 | #### Why build this? Why does this matter?
 28 | 
 29 | - AI driven solutions will empower organizations to build on top of existing infrastructure and unlock legacy
 30 | - Customized AI ChatBots accelerate product development by making disparate and complex information easy to find
 31 | - Unblock teams to focus on what matters - building working software - rather than chasing down people and documentation
 32 | 
 33 | ## What?
 34 | 
 35 | #### What is in this repository?
 36 | 
 37 | A python codebase that harnesses the power of semantic vector search fueled by advanced LLMs.
 38 | 
 39 | ![Overview](ArchitectureOverview.png)
 40 | 
 41 | ### Data Inputs:
 42 | 
 43 | 1. Our external Notion wiki
 44 | 2. Our public website: [Focused Labs](https://focusedlabs.io/)
 45 | 
 46 | ## Prerequisites
 47 | 
 48 | 1. A Pinecone Vector Database. You can create a free account [at Pinecone's website](https://www.pinecone.io/).
 49 | 2. An OpenAI API account (API key). You can sign up [at OpenAI's website](https://platform.openai.com/signup).
 50 | 3. Python (and your favorite IDE). We are using python v3.10.7.
 51 | 4. (Optional) Notion API Key.
 52 | 
 53 | 
 54 | ## Getting the demo running...
 55 | 
 56 | ### Edit Configuration Files
 57 | 
 58 | #### Update `.env`
 59 | 
 60 | ```
 61 | OPENAI_API_KEY = "<Open AI API Token Secure Note>"
 62 | NOTION_API_KEY = "<Notion API Token Secure Note>"
 63 | PINECONE_API_KEY = "<retrieve value Pinecone DB DEV credential>"
 64 | ```
 65 | 
 66 | #### Update `config.py`
 67 | 
 68 | ```
 69 | PINECONE_INDEX = "<name of your index, ex: "focusedlabs-pinecone-index">"
 70 | PINECONE_ENVIRONMENT = "<name of your Pinecone environment, ex: asia-southeast1-gcp-free>"
 71 | ```
 72 | 
 73 | (Optional) You can explore other models. We recommend using the current configuration for best results. Then, ask a teammate to add you to the Focused
 74 | Labs Knowledge Base Hub project.
 75 | 
 76 | ### The API
 77 | 
 78 | #### Run
 79 | 
 80 | ```
 81 |  uvicorn main:app
 82 | ```
 83 | 
 84 | Add `--reload` so that the app restarts on its own whenever you make a code change.
 85 | 
 86 | #### Endpoints
 87 | 
 88 | ```
 89 | 1. Endpoint: /
 90 |    Description: A simple endpoint that returns a "Hello World" message.
 91 |    Method: GET
 92 |    
 93 |    
 94 | 2. Endpoint: /load-notion-docs
 95 |    Description: Loads documents from Notion based on provided Notion page IDs.
 96 |    Method: POST
 97 |    Payload: json
 98 |        {
 99 |          "page_ids": ["<NOTION_PAGE_ID_1>", "<NOTION_PAGE_ID_2>", ...]
100 |        }
101 |    
102 | 3. Endpoint: /load-website-docs
103 |    Description: Loads documents based on provided URLs that are web scraped.
104 |    Method: POST
105 |    Payload: json
106 |        {
107 |          "page_urls": ["<URL_1>", "<URL_2>", ...]
108 |        }
109 |    
110 | 4. Endpoint: /query/
111 |    Description: Accepts a question and role and returns the appropriate query result.
112 |    Method: POST
113 |    Payload: json
114 |         {
115 |           "text": "<user question here>",
116 |           "role": "<user persona, ex: potential customer>"
117 |         }
118 |    Description: Accepts a question and returns the appropriate query result.
119 |    
120 | 5. Endpoint: /delete_session
121 |    Description: Deletes a conversation based on the provided session ID.
122 |    Method: POST
123 |    Payload: json
124 |        {
125 |          "session_id": "<YOUR_SESSION_ID>"
126 |        }
127 |    
128 | ```
129 | 
130 | ### Run accuracy test
131 | 
132 | Cannot be run via command line.
133 | 
134 | Edit the run configuration for `accuracy_test_runner.py` and add the file that contains the list of questions you want
135 | to ask. Ex: `questions.txt`
136 | 
137 | 
138 | 
139 | 


--------------------------------------------------------------------------------
/text_cleaner.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import re
 3 | 
 4 | import openai
 5 | from dotenv import load_dotenv
 6 | 
# Load variables from a .env file, then hand the OpenAI key to the client library.
# NOTE(review): no function in this module calls openai — confirm the key is needed here.
load_dotenv()
openai.api_key = os.getenv('OPENAI_API_KEY')
 9 | 
10 | 
def remove_emoji(string):
    """Return ``string`` with every character in the listed code-point ranges removed.

    Note that the final range (U+24C2 to U+1F251) is very wide: besides emoji
    it also spans other scripts such as the CJK Unified Ideographs, so that
    text is stripped as well.
    """
    stripped_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    return re.sub("[" + stripped_ranges + "]+", "", string, flags=re.UNICODE)
21 | 
22 | 
def remove_specific_characters(string):
    """Strip a fixed set of punctuation/symbol tokens and normalise whitespace.

    Tokens are removed one after another in list order. Afterwards, in this
    order: each pair of spaces becomes one space (single pass), newlines become
    "; ", tabs become a space, and non-breaking spaces are dropped.
    """
    # Order matters: '..' runs before '...', so a three-dot ellipsis is reduced
    # to a single '.' and the '...' entry never finds a match.
    unwanted_tokens = (
        '\u200d', '?', '....', '..', '...', '#', '"', '|', "'",
        '[', ']', '>', '=', '*', '+', '\\',
        '•', '~', '£', '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§',
        '″', '′', 'Â', '█',
        '½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═',
        '¦', '║', '―', '¥', '▓',
        '—', '‹', '─', '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤',
        '▲', 'è', '¸', '¾',
        'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗',
        '▬', '❤', 'ï', 'Ø',
        '¹', '≤', '‡', '√', '!', '🅰', '🅱',
    )

    cleaned = string
    for token in unwanted_tokens:
        cleaned = cleaned.replace(token, "")

    whitespace_fixes = (("  ", " "), ("\n", "; "), ("\t", " "), ("\xa0", ""))
    for old, new in whitespace_fixes:
        cleaned = cleaned.replace(old, new)
    return cleaned
40 | 
41 | 
def replace_contractions(string):
    """Expand English contractions and a few colloquialisms in ``string``.

    Each table entry is applied as a plain, case-sensitive substring
    replacement, in table order. Matching is therefore not limited to whole
    words, and an earlier short entry can rewrite part of a longer one before
    the longer entry is reached.
    """
    expansions = {"btw": "by the way", "ain't": "is not", "aren't": "are not", "can't": "cannot",
                  "'cause": "because", "could've": "could have", "couldn't": "could not",
                  "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not",
                  "hasn't": "has not", "haven't": "have not", "he'd": "he would", "he'll": "he will",
                  "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will",
                  "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
                  "I'll've": "I will have", "I'm": "I am", "I've": "I have", "i'd": "i would",
                  "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have", "i'm": "i am",
                  "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
                  "it'll": "it will", "it'll've": "it will have", "it's": "it is", "let's": "let us",
                  "ma'am": "madam", "mayn't": "may not", "might've": "might have", "mightn't": "might not",
                  "mightn't've": "might not have", "must've": "must have", "mustn't": "must not",
                  "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have",
                  "o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have",
                  "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                  "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
                  "she'll've": "she will have", "she's": "she is", "should've": "should have",
                  "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have",
                  "so's": "so as", "this's": "this is", "that'd": "that would",
                  "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                  "there'd've": "there would have", "there's": "there is", "here's": "here is",
                  "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
                  "they'll've": "they will have", "they're": "they are", "they've": "they have",
                  "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have",
                  "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have",
                  "weren't": "were not", "what'll": "what will", "what'll've": "what will have",
                  "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is",
                  "when've": "when have", "where'd": "where did", "where's": "where is",
                  "where've": "where have", "who'll": "who will", "who'll've": "who will have",
                  "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have",
                  "will've": "will have", "won't": "will not", "won't've": "will not have",
                  "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
                  "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have",
                  "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would",
                  "you'd've": "you would have"}

    expanded = string
    for short_form in expansions:
        expanded = expanded.replace(short_form, expansions[short_form])
    return expanded
83 | 
84 | 
def normalize_text(text):
    """Clean ``text`` by removing emoji, expanding contractions, then stripping special characters.

    Contractions are expanded before special characters are stripped because
    the stripping step removes the apostrophes the contraction table matches on.
    """
    cleaning_steps = (remove_emoji, replace_contractions, remove_specific_characters)
    for step in cleaning_steps:
        text = step(text)
    return text
90 | 


--------------------------------------------------------------------------------