├── subgraphs_faq.xlsx ├── requirements.txt ├── LICENSE ├── add_embeddings.py ├── telegram-group-data-collection.py ├── README.md ├── discord-channel-data-collection.py ├── .gitignore └── telegram-bot.py /subgraphs_faq.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/balakhonoff/AnythingGPT/HEAD/subgraphs_faq.xlsx -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==2.0.3 2 | requests==2.31.0 3 | python-telegram-bot==13.6.0 4 | Telethon==1.29.0 5 | openai==0.27.8 6 | openpyxl==3.1.2 7 | matplotlib==3.7.2 8 | plotly==5.15.0 9 | scipy==1.10.1 10 | scikit-learn==1.3.0 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Kirill Balakhonov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
# (tail of /LICENSE -- text unchanged, kept as comments)
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# --------------------------------------------------------------------------------
# /add_embeddings.py:
# --------------------------------------------------------------------------------
import pandas as pd
import openai
import argparse


# Create an Argument Parser object
parser = argparse.ArgumentParser(description='Adding embeddings for each line of csv file')

# Add the arguments
parser.add_argument('--openai_api_key', type=str, help='API KEY of OpenAI API to create contextual embeddings for each line')
parser.add_argument('--file', type=str, help='A source CSV file with the text data')
parser.add_argument('--colname', type=str, help='Column name with the texts')

# Parse the command-line arguments
args = parser.parse_args()

# Access the argument values
openai.api_key = args.openai_api_key
file = args.file
colname = args.colname

# Anything that is not a .csv is handed to the Excel reader
if file.lower().endswith('.csv'):
    df = pd.read_csv(file)
else:
    df = pd.read_excel(file)

# filter NAs
df = df[~df[colname].isna()]
# Keep only questions: a literal "?" search (no regex, so no escaping needed);
# na=False drops non-string cells instead of producing an invalid boolean mask
df = df[df[colname].str.contains('?', regex=False, na=False)]


def get_embedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding of ``text`` or ``None`` if every attempt fails.

    Newlines are replaced by spaces before the request. The call is retried
    up to three times to ride out transient OpenAI API errors; each failure
    is reported on stdout instead of being swallowed silently.
    """
    max_try = 3
    text = str(text).replace("\n", " ")
    for attempt in range(1, max_try + 1):
        try:
            return openai.Embedding.create(input=[text], model=model)['data'][0]['embedding']
        except Exception as err:  # best effort: report and retry
            print(f'Embedding request failed ({attempt}/{max_try}): {err!r}')
    return None


def process_row(x):
    """Embed one cell of the text column with the ADA model."""
    return get_embedding(x, model='text-embedding-ada-002')


df['ada_embedding'] = df[colname].apply(process_row)

# Rows whose embedding could not be computed would be written as empty cells
# and later break the bot when it parses the column, so drop them loudly.
failed = df['ada_embedding'].isna()
if failed.any():
    print(f'Warning: dropping {int(failed.sum())} row(s) without an embedding')
    df = df[~failed]

# NOTE: file[:-4] strips exactly four characters, so for ".xlsx" inputs a dot
# remains ("faq.xlsx" -> "faq._question_embed.csv"). The README's bot example
# uses that exact name, therefore the naming is kept as is.
df.to_csv(file[:-4] + '_question_embed.csv', index=False)


# --------------------------------------------------------------------------------
# /telegram-group-data-collection.py:
# --------------------------------------------------------------------------------
import pandas as pd
import argparse
from telethon import TelegramClient

# Create an Argument Parser object
parser = argparse.ArgumentParser(description='Telegram Group Data Collection Script')

# Add the arguments
parser.add_argument('--app_id', type=int, help='Telegram APP id from https://my.telegram.org/apps')
parser.add_argument('--app_hash', type=str, help='Telegram APP hash from https://my.telegram.org/apps')
parser.add_argument('--phone_number', type=str, help='Telegram user phone number with the leading "+"')
parser.add_argument('--password', type=str, help='Telegram user password')
parser.add_argument('--group_name', type=str, help='Telegram group public name without "@"')
parser.add_argument('--limit_messages', type=int, help='Number of last messages to download')

# Parse the command-line arguments
args = parser.parse_args()

# Access the argument values
app_id = args.app_id
app_hash = args.app_hash
phone_number = args.phone_number
password = args.password
group_name = args.group_name
limit_messages = args.limit_messages


async def main():
    """Fetch the last messages of the group and save them to a CSV file.

    Only messages that carry a sender user id and a text are kept. The rows
    are sorted by date and written to
    ``../telegram_messages_<group_name>.csv``.
    """
    messages = await client.get_messages(group_name, limit=limit_messages)
    columns = ['date', 'user_id', 'raw_text', 'views', 'forwards', 'text', 'chan', 'id']

    # Collect plain dicts and build the frame once: concatenating a DataFrame
    # per message copies all previous rows every time (quadratic).
    rows = []
    for m in messages:
        if m is None:
            continue
        # Messages without a user sender (e.g. posted on behalf of a channel)
        # have no from_id.user_id and are skipped.
        user_id = getattr(getattr(m, 'from_id', None), 'user_id', None)
        if user_id is None:
            continue
        rows.append({'date': m.date, 'user_id': user_id, 'raw_text': m.raw_text, 'views': m.views,
                     'forwards': m.forwards, 'text': m.text, 'chan': group_name, 'id': m.id})

    df = pd.DataFrame(rows, columns=columns)
    df = df.dropna(subset=['text'])
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date').reset_index(drop=True)

    df.to_csv(f'../telegram_messages_{group_name}.csv', index=False)

client = TelegramClient('session', app_id, app_hash)
client.start(phone=phone_number, password=password)

with client:
    client.loop.run_until_complete(main())


# --------------------------------------------------------------------------------
# /README.md:  (text unchanged, kept as comments)
# --------------------------------------------------------------------------------
# # AnythingGPT
# The Python micro framework for building knowledge base powered ChatGPT assistants
#
# # Install
# 1. Fork&Clone
# 2. Go to the project directory
# 3. Set up an environment and install needed libraries
# ```buildoutcfg
# virtualenv -p python3.8 .venv
# source .venv/bin/activate
# pip install -r requirements.txt
# ```
#
# # Using the manually compiled knowledge base of questions and answers.
# - To do so, create a xlsx file using ./subgraphs_faq.xlsx as an example
# - This file will contain questions and answers
# - Questions must be with at least one "?" mark, otherwise a question will be excluded
# - If you are going to use only manually compiled knowledge base you can skip the next two optional sections
#
# # Collect data from discord (optional)
# - First, open a discord channel in your browser and get the channel ID from the URL https://discord.com/channels/xxx/{CHANNEL_ID}
# - Second, being on the discord channel page, start typing anything, then open developer tools -> Network -> Find "typing" -> Headers -> Authorization.
23 | - Third, run the script with the obtained parameters 24 | ```buildoutcfg 25 | source .venv/bin/activate 26 | python discord-channel-data-collection.py --channel_id=123456 --authorization_key="123456qwerty" 27 | ``` 28 | 29 | # Collect data from telegram chat (optional) 30 | - First, create an app using https://my.telegram.org/apps and get app_id and app_hash 31 | - Second, find a group name that you are going to use 32 | - Third, run the script with the obtained parameters from your telegram user creds: 33 | ```buildoutcfg 34 | source .venv/bin/activate 35 | python telegram-group-data-collection.py --app_id=123456 --app_hash="123456qwerty" --phone_number="+xxxxxx" --password="qwerty123" --group_name="xxx" --limit_messages=100 36 | ``` 37 | 38 | # Add contextual ADA embeddings to the csv file with texts 39 | - For any csv or excel file which has a column with text one can run this script to save all texts with "?" together with embeddings in a new file 40 | - You need an OpenAI API key to run this for embedding generation 41 | 42 | Example command: 43 | ```buildoutcfg 44 | python add_embeddings.py --openai_api_key="xxx" --file="./subgraphs_faq.xlsx" --colname="Question" 45 | ``` 46 | 47 | # Run the example telegram bot 48 | - The bot which will answer to the questions considering the provided topic 49 | - Also it will strongly follow the provided knowledge base 50 | 51 | Example command: 52 | ```buildoutcfg 53 | python telegram-bot.py --openai_api_key="xxx" --telegram_bot_token="xxx" --file="./subgraphs_faq._question_embed.csv" --topic="The Graph subgraph development" 54 | ``` 55 | 56 | # Modify the files to solve your own task 57 | and star this repository🙂 58 | 59 | 60 | -------------------------------------------------------------------------------- /discord-channel-data-collection.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import pandas as pd 4 | import argparse 5 | 6 | # 
# Create an Argument Parser object
parser = argparse.ArgumentParser(description='Discord Channel Data Collection Script')

# Add the arguments
parser.add_argument('--channel_id', type=str, help='Channel ID from the URL of a channel in browser https://discord.com/channels/xxx/{CHANNEL_ID}')
parser.add_argument('--authorization_key', type=str, help='Authorization Key. Being on the discord channel page, start typing anything, then open developer tools -> Network -> Find "typing" -> Headers -> Authorization.')

# Parse the command-line arguments
args = parser.parse_args()

# Access the argument values
channel_id = args.channel_id
authorization_key = args.authorization_key

# Print the argument values
print(f"Channel ID: {channel_id}")
# The authorization key is a user credential: only report whether it was
# supplied, never echo it to the terminal or to captured logs.
print(f"Authorization Key: {'***' if authorization_key else None}")


def retrieve_messages(channel_id, authorization_key):
    """Download a Discord channel's messages and save them as CSV.

    The channel is read page by page (100 messages per request), each page
    requesting the messages before the last one already seen, until an
    empty page is returned. The description of every embed is appended to
    the message text. The result is written to
    ``../discord_messages_<channel_id>.csv``.

    Args:
        channel_id: Channel ID taken from the channel URL.
        authorization_key: Value of the ``Authorization`` request header.

    Raises:
        requests.HTTPError: If Discord answers with an error status
            (bad key, missing access, rate limit, ...).
        ValueError: If the response body is not a list of messages.
    """
    limit = 100
    url = f'https://discord.com/api/v9/channels/{channel_id}/messages'
    headers = {
        'authorization': authorization_key
    }
    columns = ['id', 'dt', 'text', 'author_id', 'author_username', 'is_bot', 'is_reply', 'id_reply']

    # Rows are gathered as dicts and turned into a DataFrame once at the end;
    # concatenating one frame per message re-copies all earlier rows.
    rows = []
    last_message_id = None

    while True:
        params = {'limit': limit}
        if last_message_id is not None:
            params['before'] = last_message_id

        r = requests.get(url, params=params, headers=headers, timeout=30)
        # An error payload is a JSON object; iterating it as if it were the
        # message list would only fail later with an opaque TypeError.
        r.raise_for_status()
        page = r.json()
        if not isinstance(page, list):
            raise ValueError(f'Unexpected response from Discord API: {page!r}')
        if not page:
            break

        for value in page:
            reference = value.get('message_reference') or {}
            is_reply = 'message_id' in reference
            id_reply = reference['message_id'] if is_reply else '0'

            # Message text followed by the descriptions of its embeds
            text = value['content']
            for embed in value.get('embeds', []):
                if 'description' in embed:
                    if text != '':
                        text += ' ' + embed['description']
                    else:
                        text = embed['description']

            author = value['author']
            rows.append({
                'id': value['id'],
                'dt': value['timestamp'],
                'text': text,
                'author_id': author['id'],
                'author_username': author['username'],
                'is_bot': author.get('bot', False),
                'is_reply': is_reply,
                'id_reply': id_reply,
            })

        # The next page is everything before the last message of this page
        last_message_id = page[-1]['id']

        print('number of messages we collected is', len(rows))

    # Save DataFrame to a CSV file
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(f'../discord_messages_{channel_id}.csv', index=False)


if __name__ == '__main__':
    retrieve_messages(channel_id, authorization_key)

# --------------------------------------------------------------------------------
# /.gitignore:  (entries unchanged, kept as comments)
# --------------------------------------------------------------------------------
# # Byte-compiled / optimized / DLL files
# __pycache__/
# *.py[cod]
# *$py.class
#
# # C extensions
# *.so
#
# # Distribution / packaging
# .Python
# build/
# develop-eggs/
# dist/
# downloads/
# eggs/
# .eggs/
# lib/
# lib64/
# parts/
# sdist/
# var/
# wheels/
# share/python-wheels/
# *.egg-info/
# .installed.cfg
# *.egg
# MANIFEST
#
# # PyInstaller
# # Usually these files are written by a python script from a template
# # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
# (end of /.gitignore -- entries unchanged, kept as comments)
# .idea/
#
# session*
# session.session
# session.session-journal
# session (2).session-journal
#
# --------------------------------------------------------------------------------
# /telegram-bot.py:
# --------------------------------------------------------------------------------
import threading
import telegram
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters

import openai
from openai.embeddings_utils import cosine_similarity

import numpy as np
import pandas as pd

import argparse
import functools

# Create an Argument Parser object
parser = argparse.ArgumentParser(description='Run the bot which uses prepared knowledge base enriched with contextual embeddings')

# Add the arguments
parser.add_argument('--openai_api_key', type=str, help='API KEY of OpenAI API to create contextual embeddings for each line')
parser.add_argument('--telegram_bot_token', type=str, help='A telegram bot token obtained via @BotFather')
parser.add_argument('--file', type=str, help='A source CSV file with the questions, answers and embeddings')
parser.add_argument('--topic', type=str, help='Write the topic to add a default context for the bot')
parser.add_argument('--start_message', type=str, help="The text that will be shown to the users after they click /start button/command", default="Hello, World!")
parser.add_argument('--model', type=str, help='A model of ChatGPT which will be used', default='gpt-3.5-turbo-16k')
# The value is used as a row count (DataFrame.head), so it must be parsed as an
# int; with type=str any value given on the command line arrived as a string.
parser.add_argument('--num_top_qa', type=int, help="The number of top similar questions' answers as a context", default=3)

# Parse the command-line arguments
args = parser.parse_args()

# Access the argument values
openai.api_key = args.openai_api_key
token = args.telegram_bot_token
file = args.file
topic = args.topic
model = args.model
num_top_qa = args.num_top_qa
start_message = args.start_message

# reading QA file with embeddings
df_qa = pd.read_csv(file)
# NOTE(security): eval() executes whatever Python expression is stored in the
# CSV. Only load files produced by add_embeddings.py; for untrusted files
# switch to ast.literal_eval / json.loads.
df_qa['ada_embedding'] = df_qa.ada_embedding.apply(eval).apply(np.array)


def retry_on_error(func):
    """Decorator: call ``func`` up to three times, re-raising the last error.

    Every failed attempt is reported on stdout. When all attempts fail, the
    exception of the final attempt propagates to the caller.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        max_retries = 3
        last_error = None
        for i in range(max_retries):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                # Python unbinds `e` when the except clause ends, so the
                # exception has to be stored to be re-raised after the loop.
                last_error = e
                print(f"Error occurred, retrying ({i+1}/{max_retries} attempts)...")
        # If all retries failed, raise the last exception
        raise last_error

    return wrapper


@retry_on_error
def call_chatgpt(*args, **kwargs):
    """Forward the arguments to ``openai.ChatCompletion.create`` with retries."""
    return openai.ChatCompletion.create(*args, **kwargs)


def get_embedding(text, model="text-embedding-ada-002"):
    """Return the OpenAI embedding of ``text`` (newlines replaced by spaces)."""
    text = text.replace("\n", " ")
    return openai.Embedding.create(input=[text], model=model)['data'][0]['embedding']


def search_similar(df, product_description, n=3, pprint=True):
    """Return the ``n`` rows of ``df`` most similar to ``product_description``.

    The text is embedded and compared (cosine similarity) with the
    ``ada_embedding`` column. The result is a new frame with an extra
    ``similarities`` column, sorted from most to least similar.

    ``df`` itself is not modified: this function runs in one worker thread
    per incoming message, and writing a column into the shared frame would
    let concurrent requests overwrite each other's scores. ``pprint`` is
    unused and kept only for signature compatibility.
    """
    embedding = get_embedding(product_description, model='text-embedding-ada-002')
    similarities = df.ada_embedding.apply(lambda x: cosine_similarity(x, embedding))
    scored = df.assign(similarities=similarities)
    # int(): tolerate a count that arrived as a string from the command line
    return scored.sort_values('similarities', ascending=False).head(int(n))


def collect_text_qa(df):
    """Render the ``Question``/``Answer`` columns of ``df`` as prompt text."""
    text = ''.join(
        f"Q: <{row['Question']}>\nA: <{row['Answer']}>\n\n"
        for _, row in df.iterrows()
    )
    print('len qa', len(text.split(' ')))
    return text


def telegram_message_format(text):
    """Split ``text`` into chunks that fit Telegram's 4096-character limit.

    Always returns a list with at least one element (``[text]`` when the
    text already fits, including the empty string).
    """
    max_message_length = 4096
    parts = [text[i:i + max_message_length] for i in range(0, len(text), max_message_length)]
    return parts or [text]


def collect_full_prompt(question, qa_prompt, chat_prompt=None):
    """Build the user prompt sent to the chat model.

    Args:
        question: The user's question.
        qa_prompt: Text with the most similar Q&As from the knowledge base.
        chat_prompt: Optional extra context taken from chat logs.
    """
    prompt = f'I need to get an answer to the question related to the topic of "{topic}": ' + "{{{" + question + "}}}. "
    prompt += '\n\nPossibly, you might find an answer in these Q&As [use the information only if it is actually relevant and useful for the question answering]: \n\n' + qa_prompt
    # edit if you need to use this also
    if chat_prompt is not None:
        prompt += "---------\nIf you didn't find a clear answer in the Q&As, possibly, these talks from chats might be helpful to answer properly [use the information only if it is actually relevant and useful for the question answering]: \n\n" + chat_prompt
    prompt += f'\nFinally, only if the information above was not enough you can use your knowledge in the topic of "{topic}" to answer the question.'

    return prompt


def start(update, context):
    """Handle /start: send the configured greeting to the user."""
    user = update.effective_user
    context.bot.send_message(chat_id=user.id, text=start_message)


def message_handler(update, context):
    """Handle a text message in a separate thread so polling is not blocked."""
    thread = threading.Thread(target=long_running_task, args=(update, context))
    thread.start()


def long_running_task(update, context):
    """Answer one user message: search the knowledge base, then ask ChatGPT.

    Each stage reports its own failure to the user and stops; the underlying
    error is printed so that it can actually be debugged.
    """
    user = update.effective_user
    context.bot.send_message(chat_id=user.id, text='🕰️⏰🕙⏱️⏳...')

    try:
        question = update.message.text.strip()
    except Exception as e:
        print(f'Could not read message text: {e!r}')
        context.bot.send_message(chat_id=user.id,
                                 text="🤔It seems like you're sending not text to the bot. Currently, the bot can only work with text requests.")
        return

    try:
        qa_found = search_similar(df_qa, question, n=int(num_top_qa))
        qa_prompt = collect_text_qa(qa_found)
        full_prompt = collect_full_prompt(question, qa_prompt)
    except Exception as e:
        print(f'Knowledge base search failed: {e!r}')
        context.bot.send_message(chat_id=user.id,
                                 text="Search failed. Debug needed.")
        return

    try:
        print(full_prompt)
        completion = call_chatgpt(
            model=model,
            n=1,
            messages=[{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": full_prompt}]
        )
        result = completion['choices'][0]['message']['content']
    except Exception as e:
        print(f'ChatGPT request failed: {e!r}')
        context.bot.send_message(chat_id=user.id,
                                 text='It seems like the OpenAI service is responding with errors. Try sending the request again.')
        return

    parts = telegram_message_format(result)
    for part in parts:
        update.message.reply_text(part, reply_to_message_id=update.message.message_id)


bot = telegram.Bot(token=token)
updater = Updater(token=token, use_context=True)
dispatcher = updater.dispatcher

dispatcher.add_handler(CommandHandler("start", start, filters=Filters.chat_type.private))
dispatcher.add_handler(MessageHandler(~Filters.command & Filters.text, message_handler))

updater.start_polling()
# Block the main thread until SIGINT/SIGTERM so Ctrl-C shuts the bot down cleanly
updater.idle()
# --------------------------------------------------------------------------------