├── .gitignore
├── README.md
├── app.py
├── assets
│   └── mx_bot.png
├── data
│   └── data.txt
├── generate_index.py
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
/Pipfile
/Pipfile.lock
.env
__pycache__/
.idea/
index_*.json
/data/*
/data/data.txt

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# AI-powered FAQ Chatbot
Build your own ChatGPT-powered FAQ / knowledge-base chatbot that can infer and answer queries based on the information provided via an external data source.

## Requirements for running it locally on your laptop ##

* Windows / Mac / Linux with Git installed
* Python 3.8+
* MessengerX.io API Token - FREE for Indie Developers
* OpenAI API Key
* Ngrok for tunneling

### Install requirements ###
```bash
pip install -r requirements.txt
```

### Create a new .env file in the root directory ###
* For Linux / Mac
```bash
nano -w .env
```

* For Windows
```bash
type nul > .env
```

### Add the following values to your .env file
```bash
OPENAI_API_KEY= # your OpenAI API key
MESSENGERX_BASE_URL=https://ganglia.machaao.com
OVERRIDE_INDEX_CHECK=False
```
* Leave ```OVERRIDE_INDEX_CHECK``` set to ```False``` if you want the bot to automatically
retrain your index whenever you update your data source; set it to ```True``` to skip that
check and always reuse the previously saved index

### Add your data sources to the ```data``` directory
* The files can be of any supported type (.txt, .pdf, etc.) and you can add multiple files. It's totally flexible

### Run your chatbot app on your local server
```bash
python app.py
```

### Start an ngrok.io tunnel in a new terminal (local development) ###
```
ngrok http 5000
```

### Update your webhook to receive messages ###
Update your bot Webhook URL at the [MessengerX.io Portal](https://portal.messengerx.io) with the URL provided by ngrok
```
https://<your-ngrok-url>/machaao/hook
```

* If your ```NGROK URL``` is ```https://e9fe-115-187-40-104.in.ngrok.io``` then your bot
settings page should look like this 👇🏻
![figure](/assets/mx_bot.png)

### Your chatbot is now ready to start receiving incoming messages from users
```bash
# HappyCoding
```
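
### Optional: simulate an incoming message locally (sketch) ###
A minimal sketch for local debugging, assuming you have your MessengerX bot token at hand. It mirrors what ```app.py``` expects on the webhook: a JWT signed with the bot token whose claims carry a ```messaging``` list. The file name ```local_test.py``` and the test user id are illustrative only, and the bot will still try to call the MessengerX send-message API when it replies.
```python
# local_test.py - hypothetical helper, not included in this repo
import jwt
import requests

BOT_TOKEN = "<your-messengerx-bot-token>"  # assumption: your real token from portal.messengerx.io

# Mirror the claim structure that extract_data() in app.py decodes
claims = {"sub": {"messaging": [{"message_data": {"text": "Hi"}}]}}
raw = jwt.encode(claims, BOT_TOKEN, algorithm="HS512")

resp = requests.post(
    "http://localhost:5000/machaao/hook",
    json={"raw": raw},
    headers={"bot-token": BOT_TOKEN, "machaao-user-id": "local-test-user"},
)
print(resp.status_code, resp.text)
```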

--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
import json
import os
import sys
import traceback

import jwt
from dotenv import load_dotenv
from flask import Flask, request, Response
from machaao import Machaao

from generate_index import ExtDataIndex

load_dotenv()
app = Flask(__name__)

MESSENGERX_BASE_URL = os.environ.get("MESSENGERX_BASE_URL")
# Read only to verify the key is set; llama_index / langchain pick up OPENAI_API_KEY from the environment
OPEN_AI_KEY = os.environ.get("OPENAI_API_KEY")

param_list = [MESSENGERX_BASE_URL, OPEN_AI_KEY]

for param in param_list:
    if param is None:
        raise Exception("Environment variables not set in .env file")


def exception_handler(exception, data=None):
    # Log which view/helper raised, plus the exception details
    caller = sys._getframe(1).f_code.co_name
    print(f"{caller} function failed")
    if hasattr(exception, 'message'):
        print(exception.message)
    else:
        print("Unexpected error: ", sys.exc_info()[0])

    if data is not None:
        # If the user and bot token are known, tell the user the bot is unavailable
        machaao = Machaao(data['api_token'], MESSENGERX_BASE_URL)

        payload = {
            "identifier": "BROADCAST_FB_QUICK_REPLIES",
            "users": [data['user_id']],
            "message": {
                "text": data['output_text'],
                "quick_replies": [{
                    "content_type": "text",
                    "payload": "Hi",
                    "title": "👋🏻 Hi"
                }]
            }
        }

        response = machaao.send_message(payload)
        return Response(
            mimetype="application/json",
            response=json.dumps({
                "error": True,
                "message": response.text
            }),
            status=400,
        )

    else:
        return Response(
            mimetype="application/json",
            response=json.dumps({
                "error": True,
                "message": "Server error occurred. Check logs for details"
            }),
            status=400,
        )


@app.route('/', methods=['GET'])
def index():
    return "ok"


def extract_data(api_token, req):
    messaging = None
    user_id = req.headers.get("machaao-user-id", None)
    raw = req.json["raw"]

    if raw != "":
        # Incoming payloads are JWTs signed with the bot token using HS512
        inp = jwt.decode(str(raw), api_token, algorithms=["HS512"])
        sub = inp.get("sub", None)
        if sub and type(sub) is dict:
            sub = json.dumps(sub)

        if sub:
            decoded = json.loads(sub)
            messaging = decoded.get("messaging", None)

    return {
        "user_id": user_id,
        "messaging": messaging
    }


@app.route('/machaao/hook', methods=['POST'])
def receive():
    _api_token = None
    _user_id = None
    try:
        _api_token = request.headers["bot-token"]
        _user_id = request.headers["machaao-user-id"]

        if not _api_token:
            return Response(
                mimetype="application/json",
                response=json.dumps({
                    "error": True,
                    "message": "Invalid Request, Check your token"
                }),
                status=400,
            )

        return incoming(request)

    except KeyError as k:
        traceback.print_exc(file=sys.stdout)
        return exception_handler(k)
    except AttributeError as a:
        traceback.print_exc(file=sys.stdout)
        return exception_handler(a)
    except Exception as e:
        traceback.print_exc(file=sys.stdout)
        if _user_id and _api_token:
            data = dict()
            data['user_id'] = _user_id
            data['api_token'] = _api_token
            data['output_text'] = "Oops!! Our bot is currently unavailable"
            return exception_handler(e, data)
        else:
            return exception_handler(e)


def incoming(req):
    api_token = req.headers.get("bot-token", None)
    incoming_data = extract_data(api_token, req)
    print(f"incoming: {incoming_data}")

    machaao = Machaao(api_token, MESSENGERX_BASE_URL)
    messaging = incoming_data["messaging"]
    user_id = incoming_data["user_id"]
    message = messaging[0]["message_data"]["text"].lower()

    if message == "hi":
        output_text = "Hello!! I am an FAQ bot demonstrating the indexing performance of llama index on your " \
                      "external data source. Ask me an FAQ relevant to your data source"
    else:
        # Answer everything else by querying the llama_index vector index built from the data directory
        output_text = str(idx_obj.query(message)).strip()

    payload = {
        "identifier": "BROADCAST_FB_QUICK_REPLIES",
        "users": [user_id],
        "message": {
            "text": output_text,
            "quick_replies": [{
                "content_type": "text",
                "payload": "Hi",
                "title": "👋🏻 Hi"
            }]
        }
    }

    response = machaao.send_message(payload)
    output_payload = {
        "success": True,
        "message": response.text
    }

    return Response(
        mimetype="application/json",
        response=json.dumps(output_payload),
        status=200,
    )


if __name__ == '__main__':
    # Build or load the external-data index once at startup, then start the Flask dev server
    idx_obj = ExtDataIndex()
    app.run(debug=True)

--------------------------------------------------------------------------------
/assets/mx_bot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/machaao/chatgpt-faq/129ee76071c4b11f3daaebb3212d77db09ad3684/assets/mx_bot.png

--------------------------------------------------------------------------------
/data/data.txt:
--------------------------------------------------------------------------------
Q- Who was the only Sikh President of India?
A: Giani Zail Singh.

Q- Which city hosted the first ever Asian Games?
A: New Delhi.

Q- Which Indian Prime Minister held the shortest term in office?
A: Gulzarilal Nanda.

Q- Where did India win the first Cricket World Cup?
A: The Lords stadium in London.

Q- In which year was Rashtrapati Bhavan built?
A: 1929.

Q- Who was India’s first female playback singer?
A: Rajkumari Dubey.

Q- Who is the only spiritual personality to be awarded the Bharat Ratna, the highest civilian award in India?
A: Mother Teresa (also called Saint Mother Teresa of Kolkata).

Q- Where is the largest Hindu Temple in the world?
A: Angkor Vat, Cambodia.

Q- Which are the two newest ministries of the Indian government?
A: Ministry of Skills Development & Entrepreneurship and Ministry of AYUSH

Q- How many times has India won the Hockey World Cup?
A: Once.
--------------------------------------------------------------------------------
/generate_index.py:
--------------------------------------------------------------------------------
import os
import pathlib

from langchain import OpenAI
from llama_index import SimpleDirectoryReader, GPTSimpleVectorIndex, LLMPredictor, PromptHelper


class ExtDataIndex:
    def __init__(self):
        self.data_loc = 'data'
        self.model_name = "gpt-3.5-turbo"
        self.index_name = f"index_{self.model_name}.json"
        self.root = os.path.dirname(os.path.abspath(__file__))
        # Parse the optional OVERRIDE_INDEX_CHECK flag ("True"/"False") without resorting to eval
        self.override_latest_index_check = os.environ.get('OVERRIDE_INDEX_CHECK', None)
        if self.override_latest_index_check is not None:
            self.override_latest_index_check = self.override_latest_index_check.strip().lower() == "true"
        self.index = self.load_index()

    def check_is_data_source_updated(self):
        if self.override_latest_index_check:
            print("Skipping latest index check")
            return False

        idx_f = pathlib.Path(f"{self.root}/{self.index_name}")
        data_f = pathlib.Path(f"{self.root}/{self.data_loc}")

        # Compare modification times of the saved index file and the data directory
        data_modified_time = data_f.stat().st_mtime
        idx_modified_time = idx_f.stat().st_mtime

        if idx_modified_time < data_modified_time:
            print("Data source has been updated. Creating new index")
            return True
        else:
            print("Data source has not been updated")
            return False

    def query(self, query_str):
        resp = self.index.query(query_str, mode='default')
        print(f'Question: {query_str}')
        print(resp)
        return resp

    def load_index(self):
        idx_loaded = False
        idx_exists = os.path.exists(f"{self.root}/{self.index_name}")
        data_path = f"{self.root}/{self.data_loc}"

        data_dir_contents = os.listdir(data_path)

        if not data_dir_contents:
            print("Please add a data source")
            raise Exception("No data source present")

        if idx_exists:
            idx_updated = self.check_is_data_source_updated()
            if not idx_updated:
                print("Index loaded from disk")
                index = GPTSimpleVectorIndex.load_from_disk(f"{self.index_name}")
                idx_loaded = True

        if not idx_loaded:
            index = self.build_index()

        return index

    def build_index(self):
        # define prompt helper
        # set maximum input size
        max_input_size = 4096
        # set number of output tokens
        num_output = 256
        # set maximum chunk overlap
        max_chunk_overlap = 20
        prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)
        llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name=self.model_name))

        documents = SimpleDirectoryReader(f'{self.data_loc}').load_data()

        index = GPTSimpleVectorIndex(
            documents,
            llm_predictor=llm_predictor,
            prompt_helper=prompt_helper,
        )

        print("New index created and saved to disk")
        index.save_to_disk(f"{self.index_name}")
        return index

--------------------------------------------------------------------------------
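
A standalone usage sketch (illustrative, not a file in this repo): since ```ExtDataIndex``` builds or loads the vector index in its constructor, the index can be exercised without starting the Flask app. The sample question comes from ```data/data.txt```; the script name ```query_example.py``` is an assumption.
```python
# query_example.py - hypothetical helper, not included in this repo
from dotenv import load_dotenv

from generate_index import ExtDataIndex

load_dotenv()  # make OPENAI_API_KEY / OVERRIDE_INDEX_CHECK from .env visible to generate_index.py

idx = ExtDataIndex()  # loads index_gpt-3.5-turbo.json if it is still fresh, otherwise rebuilds it from data/
print(idx.query("Which city hosted the first ever Asian Games?"))
```
Running this once also leaves the saved index file on disk, so the first real chat request handled by ```app.py``` does not pay the indexing cost.
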
/requirements.txt:
--------------------------------------------------------------------------------
aiohttp==3.8.4
aiosignal==1.3.1
anyio==3.6.2
async-timeout==4.0.2
asyncio==3.4.3
attrs==22.2.0
beautifulsoup4==4.11.2
bs4==0.0.1
certifi==2022.12.7
charset-normalizer==3.0.1
click==8.1.3
colorama==0.4.6
dataclasses-json==0.5.7
docopt==0.6.2
filelock==3.9.0
Flask==2.2.3
Flask-API==3.0.post1
Flask-Executor==1.0.0
frozenlist==1.3.3
greenlet==2.0.2
h11==0.14.0
httpcore==0.16.3
httpx==0.23.3
huggingface-hub==0.12.1
idna==3.4
importlib-metadata==6.0.0
itsdangerous==2.1.2
Jinja2==3.1.2
joblib==1.2.0
langchain==0.0.92
llama-index==0.4.8
machaao==0.3.18
MarkupSafe==2.1.2
marshmallow==3.19.0
marshmallow-enum==1.5.1
multidict==6.0.4
mypy-extensions==1.0.0
nltk==3.8.1
numpy==1.24.2
openai==0.26.5
packaging==23.0
pandas==1.5.3
Pillow==9.4.0
pipreqs==0.4.11
pydantic==1.10.5
PyJWT==2.6.0
python-dateutil==2.8.2
python-dotenv==1.0.0
pytz==2022.7.1
PyYAML==6.0
regex==2022.10.31
requests==2.28.2
rfc3986==1.5.0
six==1.16.0
sniffio==1.3.0
soupsieve==2.4
SQLAlchemy==1.4.46
tenacity==8.1.0
tokenizers==0.13.2
torch==1.13.1+cu116
torchaudio==0.13.1+cu116
torchvision==0.14.1+cu116
tqdm==4.64.1
transformers==4.26.1
typing-inspect==0.8.0
typing_extensions==4.5.0
urllib3==1.26.14
websockets==10.4
Werkzeug==2.2.3
yarg==0.1.9
yarl==1.8.2
zipp==3.15.0
--------------------------------------------------------------------------------
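
A note on the pinned PyTorch builds (an observation, not part of the repo): ```torch==1.13.1+cu116```, ```torchaudio==0.13.1+cu116``` and ```torchvision==0.14.1+cu116``` are CUDA-tagged wheels that are generally served from the PyTorch package index rather than PyPI. If the plain ```pip install -r requirements.txt``` from the README cannot resolve them, pointing pip at that index usually helps; CPU-only machines can instead relax the pins to the plain ```1.13.1``` / ```0.13.1``` / ```0.14.1``` releases.
```bash
pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu116
```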