├── .gitignore
├── README.md
├── localbot.py
├── localbot_adapted.py
├── requirements.txt
├── slack_manifest.json
└── slackbot.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.venv

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Connector dev bot

This is the code for the [tutorial published on the Airbyte blog](https://airbyte.com/tutorials/chat-with-your-data-using-openai-pinecone-airbyte-and-langchain).

It implements a chatbot that uses Pinecone to store contextual information, LangChain to orchestrate an LLM, and the Slack SDK to provide a Slack bot that can answer Airbyte connector builder-related questions on Slack.

If you like this project, leave us a star ⭐ on the main [Airbyte repo](https://github.com/airbytehq/airbyte/blob/master/README.md)!

## How to run

You need a local Python installation.

* Follow the tutorial to create a Pinecone index and populate it with data via Airbyte
* Run `python -m venv .venv` to create a virtual environment (`.venv` is already excluded by the `.gitignore`)
* Run `source .venv/bin/activate` to activate the virtual environment
* Run `pip install -r requirements.txt` to install the dependencies

### Run the bot locally in your terminal

* Run `export PINECONE_API_KEY=` to set the Pinecone API key
* Run `export PINECONE_INDEX_NAME=` to set the Pinecone index name
* Run `export PINECONE_ENV=` to set the Pinecone environment
* Run `export OPENAI_API_KEY=` to set the OpenAI API key
* Run `python localbot.py` to start the bot (`localbot_adapted.py` uses improved prompts for better results)

### Run the bot on Slack

* Use the `slack_manifest.json` file to create a Slack app and install it in your workspace.
* Run `export PINECONE_API_KEY=` to set the Pinecone API key
* Run `export PINECONE_INDEX_NAME=` to set the Pinecone index name
* Run `export PINECONE_ENV=` to set the Pinecone environment
* Run `export OPENAI_API_KEY=` to set the OpenAI API key
* Run `export SLACK_APP_TOKEN=` to set the Slack app token
* Run `export SLACK_BOT_TOKEN=` to set the Slack bot token
* Run `python slackbot.py` to start the bot (a quick way to verify these variables are set is sketched right after this README)

Again, leave us a star ⭐ on the main [Airbyte repo](https://github.com/airbytehq/airbyte/blob/master/README.md)!
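Since both run modes depend on a handful of environment variables, a small pre-flight check can save a confusing stack trace later. Here is a minimal sketch, assuming the variable names from the README above; the script itself (`check_env.py`) is hypothetical and not part of the repo:

```python
# check_env.py (hypothetical helper): fail fast if required variables are unset.
import os
import sys

REQUIRED = ["PINECONE_API_KEY", "PINECONE_INDEX_NAME", "PINECONE_ENV", "OPENAI_API_KEY"]
SLACK = ["SLACK_APP_TOKEN", "SLACK_BOT_TOKEN"]  # only needed for slackbot.py

def check(names):
    missing = [name for name in names if not os.environ.get(name)]
    if missing:
        sys.exit(f"Missing environment variables: {', '.join(missing)}")

check(REQUIRED)
if "--slack" in sys.argv:  # pass --slack when you intend to run slackbot.py
    check(SLACK)
print("Environment looks complete.")
```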
--------------------------------------------------------------------------------
/localbot.py:
--------------------------------------------------------------------------------
# This script runs the most basic bot locally in your terminal

import os

import pinecone
from langchain.chains import RetrievalQA
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import Pinecone

embeddings = OpenAIEmbeddings()
pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["PINECONE_ENV"])
index = pinecone.Index(os.environ["PINECONE_INDEX_NAME"])
vector_store = Pinecone(index, embeddings.embed_query, "text")

qa = RetrievalQA.from_chain_type(llm=OpenAI(temperature=0), chain_type="stuff", retriever=vector_store.as_retriever())

print("Connector development help bot. What do you want to know?")
while True:
    query = input("")
    answer = qa.run(query)
    print(answer)
    print("\nWhat else can I help you with?")

--------------------------------------------------------------------------------
/localbot_adapted.py:
--------------------------------------------------------------------------------
# This script runs the bot with improved prompts locally in your terminal

import os
from typing import List

import pinecone
from langchain.chains import RetrievalQA
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.schema.document import Document
from langchain.vectorstores import Pinecone
from langchain.vectorstores.base import VectorStoreRetriever

embeddings = OpenAIEmbeddings()
pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["PINECONE_ENV"])
index = pinecone.Index(os.environ["PINECONE_INDEX_NAME"])
vector_store = Pinecone(index, embeddings.embed_query, "text")

prompt_template = """You are a question-answering bot operating on Github issues and documentation pages for a product called connector builder. The documentation pages document what can be done, the issues document future plans and bugs. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. State where you got this information from (and the Github issue number if applicable), but only do so if you used the information in your answer.

{context}

Question: {question}
Helpful Answer:"""
prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)


class ContextualRetriever(VectorStoreRetriever):
    """Retriever that prefixes each retrieved chunk with its source type."""

    def _get_relevant_documents(self, query: str, *, run_manager) -> List[Document]:
        docs = super()._get_relevant_documents(query, run_manager=run_manager)
        return [self.format_doc(doc) for doc in docs]

    def format_doc(self, doc: Document) -> Document:
        if doc.metadata["_airbyte_stream"] == "item_collection":
            doc.page_content = f"Excerpt from documentation page: {doc.page_content}"
        elif doc.metadata["_airbyte_stream"] == "issues":
            doc.page_content = f"Excerpt from Github issue: {doc.page_content}, issue number: {int(doc.metadata['number']):d}, issue state: {doc.metadata['state']}"
        elif doc.metadata["_airbyte_stream"] == "threads" or doc.metadata["_airbyte_stream"] == "channel_messages":
            doc.page_content = f"Excerpt from Slack thread: {doc.page_content}"
        return doc


qa = RetrievalQA.from_chain_type(llm=OpenAI(temperature=0), chain_type="stuff", retriever=ContextualRetriever(vectorstore=vector_store), chain_type_kwargs={"prompt": prompt})

print("Connector development help bot. What do you want to know?")
while True:
    query = input("")
    answer = qa.run(query)
    print(answer)
    print("\nWhat else can I help you with?")
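To see what `ContextualRetriever` actually feeds the LLM, here is a hypothetical usage sketch (not part of the repo). The `Document` is hand-built with the metadata fields the code expects from the Airbyte-synced records (`_airbyte_stream`, `number`, `state`), and the issue number is made up for illustration:

```python
# Assumes the definitions from localbot_adapted.py above are in scope.
from langchain.schema.document import Document

doc = Document(
    page_content="The builder UI does not support OAuth yet.",
    metadata={"_airbyte_stream": "issues", "number": 12345, "state": "open"},
)
retriever = ContextualRetriever(vectorstore=vector_store)
print(retriever.format_doc(doc).page_content)
# Excerpt from Github issue: The builder UI does not support OAuth yet., issue number: 12345, issue state: open
```

The prefix tells the model whether a chunk describes current behavior (documentation) or plans and bugs (issues), which is exactly the distinction the prompt template asks it to reason about.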
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pinecone-client
langchain
openai
slack_sdk
tiktoken

--------------------------------------------------------------------------------
/slack_manifest.json:
--------------------------------------------------------------------------------
{
    "display_information": {
        "name": "Connector Development Helper Bot",
        "description": "Helpful bot to answer questions around connector development",
        "background_color": "#303133"
    },
    "features": {
        "bot_user": {
            "display_name": "Airbyte Connector Dev Help",
            "always_online": false
        }
    },
    "oauth_config": {
        "scopes": {
            "user": [
                "channels:history"
            ],
            "bot": [
                "channels:history",
                "channels:join",
                "channels:read",
                "files:read",
                "groups:read",
                "links:read",
                "reactions:read",
                "remote_files:read",
                "team:read",
                "usergroups:read",
                "users.profile:read",
                "users:read",
                "app_mentions:read",
                "chat:write"
            ]
        }
    },
    "settings": {
        "event_subscriptions": {
            "bot_events": [
                "app_mention"
            ]
        },
        "interactivity": {
            "is_enabled": true
        },
        "org_deploy_enabled": false,
        "socket_mode_enabled": true,
        "token_rotation_enabled": false
    }
}
--------------------------------------------------------------------------------
/slackbot.py:
--------------------------------------------------------------------------------
# This script runs the bot on Slack

import os
from threading import Event
from typing import List

import pinecone
from langchain.chains import RetrievalQA
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.schema.document import Document
from langchain.vectorstores import Pinecone
from langchain.vectorstores.base import VectorStoreRetriever
from slack_sdk import WebClient
from slack_sdk.socket_mode import SocketModeClient
from slack_sdk.socket_mode.request import SocketModeRequest
from slack_sdk.socket_mode.response import SocketModeResponse

# Initialize Pinecone
embeddings = OpenAIEmbeddings()
pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment=os.environ["PINECONE_ENV"])
index = pinecone.Index(os.environ["PINECONE_INDEX_NAME"])
vector_store = Pinecone(index, embeddings.embed_query, "text")


# Define prompts
prompt_template = """You are a question-answering bot operating on Github issues and documentation pages for a product called connector builder. The documentation pages document what can be done, the issues document future plans and bugs. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. State where you got this information from (and the Github issue number if applicable), but only do so if you used the information in your answer.

{context}

Question: {question}
Helpful Answer:"""
prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)


class ContextualRetriever(VectorStoreRetriever):
    """Retriever that prefixes each retrieved chunk with its source type."""

    def _get_relevant_documents(self, query: str, *, run_manager) -> List[Document]:
        docs = super()._get_relevant_documents(query, run_manager=run_manager)
        return [self.format_doc(doc) for doc in docs]

    def format_doc(self, doc: Document) -> Document:
        if doc.metadata["_airbyte_stream"] == "item_collection":
            doc.page_content = f"Excerpt from documentation page: {doc.page_content}"
        elif doc.metadata["_airbyte_stream"] == "issues":
            doc.page_content = f"Excerpt from Github issue: {doc.page_content}, issue number: {int(doc.metadata['number']):d}, issue state: {doc.metadata['state']}"
        elif doc.metadata["_airbyte_stream"] == "threads" or doc.metadata["_airbyte_stream"] == "channel_messages":
            doc.page_content = f"Excerpt from Slack thread: {doc.page_content}"
        return doc


# Initialize the QA system
qa = RetrievalQA.from_chain_type(llm=OpenAI(temperature=0), chain_type="stuff", retriever=ContextualRetriever(vectorstore=vector_store), chain_type_kwargs={"prompt": prompt})


# Wire it up to Slack
slack_web_client = WebClient(token=os.environ["SLACK_BOT_TOKEN"])

# Remember already-answered messages so duplicate deliveries don't trigger duplicate replies
# (a bounded variant is sketched after this file)
handled_messages = {}

def process(client: SocketModeClient, socket_mode_request: SocketModeRequest):
    # Acknowledge the request right away: slack_sdk ignores listener return values,
    # so the ack has to be sent explicitly or Slack keeps redelivering the event.
    client.send_socket_mode_response(SocketModeResponse(envelope_id=socket_mode_request.envelope_id))
    if socket_mode_request.type == "events_api":
        event = socket_mode_request.payload.get("event", {})
        client_msg_id = event.get("client_msg_id")
        if event.get("type") == "app_mention" and not handled_messages.get(client_msg_id):
            handled_messages[client_msg_id] = True
            channel_id = event.get("channel")
            text = event.get("text")
            result = qa.run(text)
            slack_web_client.chat_postMessage(channel=channel_id, text=result)

socket_mode_client = SocketModeClient(
    app_token=os.environ["SLACK_APP_TOKEN"],
    web_client=slack_web_client
)
socket_mode_client.socket_mode_request_listeners.append(process)

socket_mode_client.connect()
print("listening")
Event().wait()

--------------------------------------------------------------------------------
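The `handled_messages` dict deduplicates redelivered events, but it grows for as long as the process runs. A possible hardening, sketched here as an assumption rather than part of the tutorial, is to cap it with an `OrderedDict` that evicts the oldest entries:

```python
# Hypothetical bounded replacement for the handled_messages dict in slackbot.py.
from collections import OrderedDict

class BoundedSet:
    def __init__(self, max_size: int = 1000):
        self._items: OrderedDict = OrderedDict()
        self._max_size = max_size

    def add(self, key) -> None:
        self._items[key] = True
        if len(self._items) > self._max_size:
            self._items.popitem(last=False)  # drop the oldest entry

    def __contains__(self, key) -> bool:
        return key in self._items

# usage in process():
#     if client_msg_id not in handled_messages:
#         handled_messages.add(client_msg_id)
#         ...
```

A cap of a thousand entries is generous here: Slack only redelivers an event a few times over a short window, so a duplicate's id will still be in the cache when it arrives.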