├── screenshot.png ├── config_example.py ├── schema.mysql.sql ├── schema.postgres.sql ├── templates ├── nav.html ├── 404.html ├── howitworks.html ├── prompt.html ├── login.html ├── admin.html ├── session.html ├── eval.html ├── head.html └── index.html ├── static └── manifest.json ├── requirements.txt ├── LICENSE ├── irc.pl ├── .gitignore ├── PromptManager.py ├── ingest.py ├── eval.py ├── generateprompt.py ├── README.md └── web.py /screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aaronpk/llm-chatbot/main/screenshot.png -------------------------------------------------------------------------------- /config_example.py: -------------------------------------------------------------------------------- 1 | ME = "" 2 | CALLBACK_URL = "" 3 | CLIENT_ID = "" 4 | API_KEY = "" 5 | 6 | DB_TYPE = "mysql" 7 | # DB_TYPE = "postgres" 8 | # DB_TYPE = None 9 | DB_HOST = "" 10 | DB_NAME = "" 11 | DB_USER = "" 12 | DB_PASS = "" 13 | 14 | OPENAI_KEY = "" 15 | -------------------------------------------------------------------------------- /schema.mysql.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE answers ( 2 | id VARCHAR(255) NOT NULL PRIMARY KEY, 3 | date DATETIME, 4 | username VARCHAR(255), 5 | prompt TEXT, 6 | question TEXT, 7 | prompt_id VARCHAR(255), 8 | status VARCHAR(255), 9 | feedback INT(4) DEFAULT NULL 10 | ); 11 | 12 | -------------------------------------------------------------------------------- /schema.postgres.sql: -------------------------------------------------------------------------------- 1 | 2 | CREATE TABLE public.answers ( 3 | prompt text, 4 | question text, 5 | id text NOT NULL, 6 | prompt_id text, 7 | date text, 8 | username text, 9 | status text, 10 | feedback integer 11 | ); 12 | 13 | ALTER TABLE ONLY public.answers 14 | ADD CONSTRAINT answers_pkey PRIMARY KEY (id); 15 | 16 | 
-------------------------------------------------------------------------------- /templates/nav.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /templates/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% include 'head.html' %} 4 | 5 | {% include 'nav.html' %} 6 |
7 |

James Bot - 404 🤖

8 |

Sorry, this page doesn't exist.

9 | 12 |
13 | 14 | 15 | -------------------------------------------------------------------------------- /static/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "James Bot", 3 | "name": "James Bot", 4 | "icons": [ 5 | { 6 | "src": "https://jamesg.blog/assets/coffeeshop.jpeg", 7 | "type": "image/jpeg", 8 | "sizes": "512x512 120x120 152x152 167x167 180x180 192x192 384x384", 9 | "purpose": "any maskable" 10 | } 11 | ], 12 | "id": "/?source=pwa", 13 | "start_url": "/?source=pwa", 14 | "background_color": "#fff", 15 | "display": "standalone", 16 | "scope": "/", 17 | "theme_color": "#EEBAB2", 18 | "description": "Ask James Bot a question. Trained on James' public IRC messages and blog posts on jamesg.blog" 19 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | aiosignal==1.3.1 3 | async-timeout==4.0.2 4 | attrs==22.2.0 5 | beautifulsoup4==4.10.0 6 | certifi==2022.12.7 7 | cffi==1.15.1 8 | charset-normalizer==2.0.12 9 | click==8.1.3 10 | cryptography==40.0.1 11 | faiss-cpu==1.7.3 12 | Flask==2.2.3 13 | frozenlist==1.3.3 14 | html5lib==1.1 15 | idna==3.4 16 | importlib-metadata==6.1.0 17 | indieweb-utils==0.8.1 18 | itsdangerous==2.1.2 19 | Jinja2==3.1.2 20 | jwt==1.3.1 21 | lxml==4.9.1 22 | MarkupSafe==2.1.2 23 | mf2py==1.1.2 24 | multidict==6.0.4 25 | numpy==1.24.2 26 | openai==0.27.2 27 | Pillow==9.4.0 28 | psycopg2==2.9.5 29 | mysql-connector-python==8.0.32 30 | pycparser==2.21 31 | PyJWT==2.4.0 32 | requests==2.26.0 33 | six==1.16.0 34 | soupsieve==2.4 35 | tqdm==4.65.0 36 | urllib3==1.26.15 37 | webencodings==0.5.1 38 | Werkzeug==2.2.3 39 | yarl==1.8.2 40 | zipp==3.15.0 41 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT 
No Attribution 2 | 3 | Copyright 2023 capjamesg 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 6 | software and associated documentation files (the "Software"), to deal in the Software 7 | without restriction, including without limitation the rights to use, copy, modify, 8 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 9 | permit persons to whom the Software is furnished to do so. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 12 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 13 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 14 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 15 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 16 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 17 | -------------------------------------------------------------------------------- /irc.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use warnings; 4 | use strict; 5 | 6 | use LWP::UserAgent; 7 | use JSON; 8 | use Mojo::DOM; 9 | 10 | package UppercaseBot; 11 | use base qw(Bot::BasicBot); 12 | 13 | sub said { 14 | my $self = shift; 15 | my $arguments = shift; 16 | 17 | my $body = $arguments->{body}; 18 | 19 | # if message startswith !jamesbot 20 | if ($body =~ /^\!jamesbot/) { 21 | my $prompt = $body; 22 | $prompt =~ s/^\!jamesbot //; 23 | # make http request to https://jamesg.blog/bot/query form encoded with argument prompt= 24 | my $ua = LWP::UserAgent->new; 25 | my $response = $ua->post('https://jamesg.blog/bot/query', [query => $prompt]); 26 | my $json = JSON->new; 27 | my $data = $json->decode($response->decoded_content); 28 | my $answer = $data->{response}; 29 | 30 | # remove all text in () 31 | $answer =~ s/\(.*?\)//g; 32 | 33 | # remove all 
html tags 34 | $answer =~ s/<[^>]*>//g; 35 | 36 | $self->say(channel => $arguments->{channel}, body => $answer); 37 | } 38 | } 39 | 40 | package main; 41 | 42 | my $bot = UppercaseBot->new( 43 | server => "irc.libera.chat", 44 | port => "6667", 45 | channels => ["#channel"], 46 | nick => "bot", 47 | name => "bot" 48 | ); 49 | 50 | $bot->run(); -------------------------------------------------------------------------------- /templates/howitworks.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% include 'head.html' %} 4 | 5 | {% include 'nav.html' %} 6 |
7 |

How James Bot Works 🤖

8 |

James Bot is an AI chatbot built to reference James' writing. The bot can reference information from, among other sources:

9 | 14 |

The chatbot is powered by OpenAI's GPT-3.5 Turbo API.

15 |

This bot is being built to reference sources when making statements to substantiate the results from the bot. With that said, the bot sometimes creates fake sources that are not in the reference corpus.

16 |

This bot is prone to hallucination (generating text that is factually inaccurate).

17 |

Claims made by the bot should be fact-checked against all provided sources, and against secondary sources where such research is prudent.

18 |

This bot is an experiment with generative AI.

19 | 22 |
23 | 24 | -------------------------------------------------------------------------------- /templates/prompt.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% include 'head.html' %} 4 | 5 | {% include 'nav.html' %} 6 |
7 |

James Bot 🤖

8 |

Showing a saved answer.

9 | 10 |
{{ prompt['prompt'] | safe }}
11 | {% if prompt['date'] %} 12 |

Generated on {{ prompt['date'] }}.

13 | {% endif %} 14 |

This text was written by a generative text language model that references James' writing. Factual inaccuracies may be present. Sources, where provided, should be validated by a human.

15 | 16 | 17 |

About This Project

18 |

James Bot is an AI chatbot trained on James' writing. You can ask the Bot questions below.

19 |

Please note the Bot generates text and may be prone to hallucination (generating text that is factually inaccurate).

20 | 23 |
24 | 25 | -------------------------------------------------------------------------------- /templates/login.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% include 'head.html' %} 4 | 5 | {% include 'nav.html' %} 6 |
7 |

Log In with IndieAuth

8 |
9 |
10 | 11 |
12 | 15 |
16 | 37 | 38 | -------------------------------------------------------------------------------- /templates/admin.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% include 'head.html' %} 4 | 5 | {% include 'nav.html' %} 6 |
7 |

James Bot Admin 🤖

8 |

Current State

9 |
10 | Show Information 11 | 17 |
18 |                     {{ current_prompt }}
19 |                 
20 |
21 |

Prompts

22 | 39 | {% for i in range(1, num_pages + 1) %} 40 | {{ i }} 41 | {% endfor %} 42 | 48 | 51 |
52 | 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 
92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /templates/session.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% include 'head.html' %} 4 | 5 | {% include 'nav.html' %} 6 |
7 |

James Bot: Prompt Session 🤖

8 | 9 | {% if me %} 10 | Logged in as {{ me }} 11 | {% endif %} 12 | 13 |

Your Prompts

14 | 15 | 16 |

This text was written by a generative text language model, trained on James' blog. Factual inaccuracies may be present.

17 |

About This Project

18 |

James Bot is an AI chatbot trained on James' public IRC messages and blog. You can ask the Bot questions below.

19 |

Please note the Bot generates text and may be prone to hallucination (generating text that is factually inaccurate).

20 | 23 |
24 | 56 | 57 | -------------------------------------------------------------------------------- /templates/eval.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% include 'head.html' %} 4 | 5 | {% include 'nav.html' %} 6 |
7 |

James Bot Evaluation 🤖

8 |

Results

9 | 17 |

Prompt and Data

18 | 25 | {% if eval["stats"]["successful_evals_count"] > 0 %} 26 |

Successful Questions and Answers

27 | 35 | {% endif %} 36 | {% if eval["stats"]["failed_evals_count"] > 0 %} 37 |

Failed Questions and Answers

38 | 46 | {% endif %} 47 | {% if eval["stats"]["unsure_evals_count"] > 0 %} 48 |

Unsure Questions and Answers

49 | 57 | {% endif %} 58 | 67 | 70 |
71 | 72 | -------------------------------------------------------------------------------- /templates/head.html: -------------------------------------------------------------------------------- 1 | 2 | James Bot 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | {% if slug %} 19 | 20 | {% else %} 21 | 22 | {% endif %} 23 | 24 | 137 | -------------------------------------------------------------------------------- /PromptManager.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from copy import deepcopy 4 | 5 | import numpy as np 6 | import openai 7 | 8 | import config 9 | 10 | openai.api_key = config.OPENAI_KEY 11 | 12 | # if prompts.json not present, raise error 13 | if not os.path.exists(f"prompts.json"): 14 | raise Exception( 15 | "prompts.json not found. You must generate a prompt with `python3 generateprompt.py` before running the web app." 16 | ) 17 | 18 | with open("prompts.json", "r") as f: 19 | prompts = json.load(f) 20 | 21 | prompt_list = prompts["prompts"] 22 | 23 | 24 | class Prompt: 25 | def __init__(self, prompt_id=prompts["latest_id"]): 26 | self.prompt_id = prompt_id 27 | self.substitutions = prompt_list[self.prompt_id]["substitutions"] 28 | self.date_created = prompt_list[self.prompt_id]["date"] 29 | self.index_id = prompt_list[self.prompt_id]["index_id"] 30 | self.index_name = prompt_list[self.prompt_id]["index_name"] 31 | 32 | def __repr__(self): 33 | print(f"Prompt ID: {self.prompt_id}") 34 | print(self.prompt) 35 | 36 | def seek_substitutions(self): 37 | print(self.substitutions) 38 | 39 | def raw_prompt(self): 40 | return prompt_list[self.prompt_id]["prompt"] 41 | 42 | def execute(self, substitutions={}, prompt_text="", temperature=None): 43 | new_prompt = deepcopy(prompt_list[self.prompt_id]) 44 | 45 | if prompt_text != "": 46 | new_prompt["prompt"][-1]["content"] = prompt_text 47 | 48 | for message in new_prompt["prompt"]: 49 | for key in 
substitutions: 50 | if key in message["content"]: 51 | message["content"] = message["content"].replace( 52 | f"[[[{key}]]]", substitutions[key] 53 | ) 54 | 55 | if temperature is not None: 56 | return openai.Completion.create( 57 | model="gpt-3.5-turbo", 58 | messages=new_prompt["prompt"], 59 | )["choices"][0]["message"]["content"] 60 | 61 | print(new_prompt["prompt"]) 62 | 63 | return openai.ChatCompletion.create( 64 | model="gpt-3.5-turbo", 65 | messages=new_prompt["prompt"], 66 | )["choices"][0]["message"]["content"] 67 | 68 | def get_facts_and_knn(self, query, vector_index, schema, facts): 69 | embedded_query = openai.Embedding.create( 70 | input=query, model="text-embedding-ada-002" 71 | ) 72 | 73 | D, I = vector_index.search( 74 | np.array([embedded_query["data"][0]["embedding"]]).reshape(1, 1536), 25 75 | ) 76 | 77 | knn = [] 78 | 79 | for i in I[0]: 80 | knn.append(schema[i]["text"]) 81 | 82 | content_sources = [ 83 | schema[i].get("url", None) for i in I[0] if schema[i].get("url") 84 | ] 85 | 86 | if len(content_sources) > 0: 87 | titles = [schema[i].get("title", schema[i]["url"]) for i in I[0]] 88 | dates = [schema[i].get("date", "") for i in I[0]] 89 | 90 | # create text that looks like this 91 | # [fact] (Source: [source]) 92 | 93 | facts_and_sources = [] 94 | 95 | for fact in facts: 96 | facts_and_sources.append(fact + " (Source: " + config.FACT_SOURCE + ")") 97 | 98 | skipped = [] 99 | 100 | for i in range(len(knn)): 101 | facts_and_sources.append( 102 | knn[i] 103 | + ' (Source: ' 106 | + titles[i] 107 | + ", " 108 | + dates[i] 109 | + ")" 110 | ) 111 | 112 | facts_and_sources_text = "\n\n".join(facts_and_sources) 113 | # cut off at 2000 words 114 | facts_and_sources_text = " ".join(facts_and_sources_text.split(" ")[:300]) 115 | 116 | references = [ 117 | {"url": content_sources[i], "title": titles[i]} 118 | for i in range(len(knn)) 119 | if i not in skipped 120 | ] 121 | else: 122 | facts_and_sources_text = "\n\n".join([schema[i]["text"] for i in 
I[0]]) 123 | skipped = [] 124 | 125 | references = [] 126 | 127 | return facts_and_sources_text, knn, references 128 | -------------------------------------------------------------------------------- /ingest.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | import time 5 | 6 | import faiss 7 | import numpy as np 8 | import openai 9 | import requests 10 | 11 | import config 12 | 13 | openai.api_key = config.OPENAI_KEY 14 | 15 | 16 | def initialize_loading(): 17 | # create indices/ if it doesn't exist 18 | if not os.path.exists("indices"): 19 | os.makedirs("indices") 20 | 21 | first_start = False 22 | 23 | # create indices/current.json if it doesn't exist 24 | if not os.path.exists("indices/current.json"): 25 | first_start = True 26 | with open("indices/current.json", "w") as f: 27 | json.dump({"index": 1}, f) 28 | 29 | with open("indices/current.json", "r") as f: 30 | current_index = json.load(f)["index"] 31 | 32 | if first_start or "--new" in sys.argv: 33 | if not first_start: 34 | current_index = current_index + 1 35 | 36 | with open("indices/current.json", "w") as f: 37 | json.dump({"index": current_index}, f) 38 | 39 | # mkdir if it doesn't exist 40 | if not os.path.exists("indices/" + str(current_index)): 41 | os.makedirs("indices/" + str(current_index)) 42 | 43 | # read all files in logs 44 | # if first start or "--new" is an argument, use fresh index 45 | 46 | if first_start or "--new" in sys.argv: 47 | vector_index = faiss.IndexFlatL2(1536) 48 | schema = [] 49 | else: 50 | # open most recent index, which should have the name "main.bin" 51 | print("Opening index "+str(current_index)) 52 | vector_index = faiss.read_index("indices/" + str(current_index) + "/main_vector_index.bin") 53 | 54 | with open("indices/" + str(current_index) + "/main_schema.json", "r") as f: 55 | schema = json.load(f) 56 | 57 | return vector_index, schema, current_index 58 | 59 | 60 | def 
exponential_backoff(embedding_text: str, max_retries: int = 5): 61 | """ 62 | Retry a request if it fails due to rate limiting. 63 | """ 64 | num_retries = 0 65 | 66 | while num_retries < max_retries: 67 | try: 68 | return openai.Embedding.create( 69 | input=embedding_text, model="text-embedding-ada-002" 70 | ) 71 | except openai.error.RateLimitError: 72 | time.sleep(1 * num_retries) 73 | num_retries += 1 74 | print(f"Rate limited, retrying {num_retries} of {max_retries}...") 75 | 76 | raise requests.exceptions.ConnectionError("Could not connect to OpenAI API") 77 | 78 | 79 | def save_index_and_schema( 80 | vector_index, schema: dict, current_index: int, stage: str = "main" 81 | ): 82 | """ 83 | Save index and schema to disk. 84 | """ 85 | print("Saving index and schema for stage " + stage) 86 | dir = "indices/" + str(current_index) + "/" + stage 87 | faiss.write_index(vector_index, dir + "_vector_index.bin") 88 | with open(dir + "_schema.json", "w") as f: 89 | json.dump(schema, f) 90 | 91 | 92 | def get_embedding(vector_index, document: str, schema: list = []): 93 | """ 94 | Get embedding for a document and add it to the vector index. 95 | """ 96 | response = exponential_backoff(document["text"]) 97 | 98 | embeddings = response["data"][0]["embedding"] 99 | vector_index.add(np.array([embeddings]).reshape(1, 1536)) 100 | 101 | schema.append(document) 102 | time.sleep(1) 103 | return vector_index, schema 104 | 105 | 106 | def index_pending( 107 | vector_index, 108 | current_index: int, 109 | schema: list = [], 110 | chunking_mechanism: str = "words", 111 | word_count: int = 750, 112 | ) -> tuple: 113 | """ 114 | Index all pending documents in pending_indexing/*.json. 115 | 116 | Chunking mechanism can be either "words" or "paragraphs". 117 | 118 | "words" will split documents into "words" word chunks. 119 | "paragraphs" will split documents into paragraphs. 120 | 121 | "paragraphs" is recommended for documents where paragraphs hold a lot of context. 
122 | 123 | If key context is not available at the paragraph level of a document -- such as may the the case for a wiki page, for instance -- "words" is recommended. 124 | """ 125 | # if not exists, return 126 | if not os.path.exists("pending_indexing"): 127 | return vector_index, schema 128 | 129 | if not os.path.exists("indexed_docs"): 130 | os.mkdir("indexed_docs") 131 | 132 | for file in os.listdir("pending_indexing"): 133 | if file.endswith(".json"): 134 | with open("pending_indexing/" + file, "r") as f: 135 | data = json.load(f) 136 | 137 | sys.stdout.write(f"Indexing {file}\n") 138 | sys.stdout.flush() 139 | 140 | document_text = data["text"].split(" ") 141 | 142 | # divide document into 750 word chunks, max 143 | if chunking_mechanism == "words": 144 | chunks = [ 145 | " ".join(document_text[i : i + word_count]) 146 | for i in range(0, len(document_text), word_count) 147 | ] 148 | elif chunking_mechanism == "paragraphs": 149 | chunks = data["text"].split("\n\n") 150 | else: 151 | raise ValueError("Invalid chunking mechanism.") 152 | 153 | other_metadata = data.copy() 154 | 155 | # remove text from other_metadata 156 | del other_metadata["text"] 157 | 158 | fully_assembled_docs = [] 159 | 160 | for chunk in chunks: 161 | fully_assembled_docs.append({"text": chunk, **other_metadata}) 162 | 163 | print(" Indexing in "+str(len(chunks))+" chunks") 164 | 165 | for data in fully_assembled_docs: 166 | vector_index, schema = get_embedding(vector_index, data, schema) 167 | save_index_and_schema(vector_index, schema, current_index, stage="main") 168 | 169 | os.rename("pending_indexing/" + file, "indexed_docs/" + file) 170 | 171 | return vector_index, schema 172 | 173 | 174 | vector_index, schema, current_index = initialize_loading() 175 | vector_index, schema = index_pending( 176 | vector_index, current_index, schema, chunking_mechanism="words" 177 | ) 178 | -------------------------------------------------------------------------------- /eval.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import json 4 | import os 5 | import uuid 6 | 7 | import faiss 8 | 9 | from PromptManager import Prompt 10 | 11 | parser = argparse.ArgumentParser() 12 | 13 | parser.add_argument("--create", action="store_true") 14 | parser.add_argument("--eval", action="store_true") 15 | 16 | CURRENT_INDEX_NAME = "main" 17 | 18 | prompt_data = Prompt() 19 | 20 | index_number = prompt_data.index_id 21 | queried_index = prompt_data.index_name 22 | prompt_id = prompt_data.prompt_id 23 | 24 | EVALUATE_PROMPT = """ 25 | You are a bot tasked with verifying whether an answer is substantiated by a Source listed below. 26 | 27 | If a Source proves the answer is correct, respond with the word "CORRECT"; if it proves the answer is incorrect, respond with the word "INCORRECT"; if you are unsure, respond with "UNSURE". 28 | 29 | Claim: 30 | [[[QUERY]]] 31 | 32 | Sources: 33 | [[[SOURCES]]] 34 | """ 35 | 36 | vector_index = faiss.read_index( 37 | f"indices/{index_number}/{queried_index}_vector_index.bin" 38 | ) 39 | 40 | with open(f"indices/{index_number}/{queried_index}_schema.json", "r") as f: 41 | schema = json.load(f) 42 | 43 | 44 | class Evaluation: 45 | def __init__(self): 46 | if not os.path.exists("evals"): 47 | evals = [] 48 | else: 49 | evals = [] 50 | 51 | for file in os.listdir("evals"): 52 | if file.endswith(".json"): 53 | with open("evals/" + file, "r") as f: 54 | evals.extend(json.load(f)) 55 | 56 | self.evals = evals 57 | self.successful_evals = [] 58 | self.failed_evals = [] 59 | self.unsure_evals = [] 60 | self.eval_started_time = None 61 | self.eval_ended_time = None 62 | self.uuid = str(uuid.uuid4()) 63 | self.stats = {} 64 | 65 | def run_evals(self): 66 | self.eval_started_time = datetime.datetime.now() 67 | for count, eval in enumerate(self.evals): 68 | print( 69 | f"Running eval \"{eval['question']}\" ({count + 1}/{len(self.evals)})" 70 | ) 71 | 
facts = [] 72 | 73 | facts_and_sources_text, knn, references = prompt_data.get_facts_and_knn( 74 | eval["question"], vector_index, schema, facts 75 | ) 76 | 77 | current_date = datetime.datetime.now().strftime("%Y-%m-%d") 78 | 79 | response = prompt_data.execute( 80 | { 81 | "CURRENT_DATE": current_date, 82 | "FACTS": "\n".join(facts), 83 | "SOURCES": facts_and_sources_text, 84 | "QUERY": eval["question"], 85 | } 86 | ) 87 | 88 | print(response) 89 | 90 | print("Evaluating response...\n\n\n") 91 | 92 | eval_response = prompt_data.execute( 93 | { 94 | "QUERY": response, 95 | "SOURCES": facts_and_sources_text, 96 | }, 97 | prompt_text=EVALUATE_PROMPT, 98 | temperature=0.0, 99 | ) 100 | 101 | print(eval_response) 102 | 103 | eval_record = { 104 | "response": eval_response, 105 | "knn": knn, 106 | "references": references, 107 | "question": eval["question"], 108 | } 109 | 110 | if "CORRECT" in eval_response: 111 | self.successful_evals.append(eval_record) 112 | elif "INCORRECT" in eval_response: 113 | self.failed_evals.append(eval_record) 114 | elif "UNSURE" in eval_response: 115 | self.unsure_evals.append(eval_record) 116 | 117 | self.eval_ended_time = datetime.datetime.now() 118 | 119 | def get_eval_stats(self): 120 | precision, recall, f1_score = self.calculate_f1_score() 121 | 122 | self.stats = { 123 | "successful_evals_count": len(self.successful_evals), 124 | "failed_evals_count": len(self.failed_evals), 125 | "unsure_evals_count": len(self.unsure_evals), 126 | "successful_evals": self.successful_evals, 127 | "failed_evals": self.failed_evals, 128 | "unsure_evals": self.unsure_evals, 129 | "precision": precision, 130 | "recall": recall, 131 | "f1_score": f1_score, 132 | "eval_started_time": self.eval_started_time.strftime("%Y-%m-%d %H:%M:%S"), 133 | "eval_ended_time": self.eval_ended_time.strftime("%Y-%m-%d %H:%M:%S"), 134 | "uuid": self.uuid, 135 | } 136 | 137 | return self.stats 138 | 139 | def pretty_print_eval_stats(self): 140 | stats = self.stats 141 | 
print(f"Precision: {stats['precision']}") 142 | print(f"Recall: {stats['recall']}") 143 | print(f"F1 Score: {stats['f1_score']}") 144 | print(f"Successful evals: {stats['successful_evals_count']}") 145 | print(f"Failed evals: {stats['failed_evals_count']}") 146 | print(f"Unsure evals: {stats['unsure_evals_count']}") 147 | print(f"Eval started at: {stats['eval_started_time']}") 148 | 149 | def calculate_f1_score(self): 150 | if len(self.successful_evals) == 0: 151 | return 0, 0, 0 152 | 153 | precision = len(self.successful_evals) / ( 154 | len(self.successful_evals) + len(self.failed_evals) 155 | ) 156 | recall = len(self.successful_evals) / ( 157 | len(self.successful_evals) + len(self.unsure_evals) 158 | ) 159 | f1_score = 2 * (precision * recall) / (precision + recall) 160 | 161 | return precision, recall, f1_score 162 | 163 | def save_evals(self): 164 | if not os.path.exists("evals.json"): 165 | with open("evals.json", "w") as f: 166 | json.dump([], f) 167 | 168 | with open("evals.json", "r") as f: 169 | all_evals = json.load(f) 170 | 171 | eval_report = { 172 | "prompt_id": prompt_id, 173 | "index_id": index_number, 174 | "eval_uuid": self.uuid, 175 | "index_name": queried_index, 176 | "generated_on": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 177 | "stats": self.get_eval_stats(), 178 | } 179 | 180 | all_evals.append(eval_report) 181 | 182 | with open("evals.json", "w") as f: 183 | json.dump(all_evals, f) 184 | 185 | def create_eval(self, question, answer, eval_name): 186 | if not os.path.exists("evals"): 187 | os.mkdir("evals") 188 | 189 | eval = {"question": question, "answer": answer} 190 | 191 | to_write = [eval] 192 | 193 | if os.path.exists(f"evals/{eval_name}.json"): 194 | with open(f"evals/{eval_name}.json", "r") as f: 195 | evals = json.load(f) 196 | 197 | evals.append(eval) 198 | to_write = evals 199 | 200 | with open(f"evals/{eval_name}.json", "w") as f: 201 | json.dump(to_write, f) 202 | 203 | def create_eval_interactive(self, 
eval_name=None): 204 | question = input("Question: ") 205 | answer = input("Answer: ") 206 | 207 | if eval_name is None: 208 | eval_name = input("Eval name: ") 209 | 210 | self.create_eval(question, answer, eval_name) 211 | 212 | 213 | if __name__ == "__main__": 214 | eval = Evaluation() 215 | 216 | if parser.parse_args().create: 217 | while True: 218 | eval.create_eval_interactive("coffee") 219 | 220 | if parser.parse_args().eval: 221 | eval.run_evals() 222 | eval.pretty_print_eval_stats() 223 | eval.save_evals() 224 | -------------------------------------------------------------------------------- /generateprompt.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | import os 4 | import re 5 | import uuid 6 | 7 | if not os.path.exists("prompts.json"): 8 | data_structure = {"prompts": {}, "latest_id": ""} 9 | 10 | with open("prompts.json", "w") as f: 11 | json.dump(data_structure, f) 12 | 13 | with open("prompts.json", "r") as f: 14 | prompt_data = json.load(f) 15 | 16 | # get most current index from indices/current.json 17 | if not os.path.exists("indices/current.json"): 18 | if not os.path.exists("indices"): 19 | os.makedirs("indices") 20 | 21 | with open("indices/current.json", "w") as f: 22 | json.dump({"index": 0}, f) 23 | 24 | current_index = 0 25 | else: 26 | with open("indices/current.json", "r") as f: 27 | current_index = json.load(f)["index"] 28 | 29 | # MUST FILL OUT 30 | INDEX_ID = current_index 31 | INDEX_NAME = "main" 32 | 33 | prompt_to_add = { 34 | "id": uuid.uuid4().hex, 35 | "date": datetime.datetime.now().strftime("%Y-%m-%d"), 36 | "index_id": INDEX_ID, 37 | "index_name": INDEX_NAME, 38 | "prompt": [ 39 | { 40 | "role": "system", 41 | "content": f"""You are James. You can use the text provided below help you answer. Be positive, concise, and a bit whimsical. The date is [[[CURRENT_DATE]]]. If you are not confident with your answer, say 'I don't know' then stop. 
42 | 43 | Prompts must be written in English. 44 | 45 | You are not allowed to add links from sites that are not mentioned in the Sources. 46 | 47 | Citations must replace the keyword in the source text. Do not cite like "(Source: )". For example, if you want to cite the first source, you would write: "concept..." then continue with your text. Do not put links at the end of your answer. 48 | 49 | All links must be in HTML tags. 50 | 51 | Include a maximum of three citations. Only cite from URLs in this prompt. Never cite from another site. 52 | 53 | If you use a fact from the system prompt, cite it with the following format: "James' homepage". 54 | 55 | Provide quotes to substantiate your claims, only from this prompt, never from elsewhere. Cite quotes with the following format: "page title". 56 | 57 | Sources use this format: 58 | 59 | Source Text (Source: Source Title, 2020-01-01) 60 | 61 | Here is an example source: 62 | 63 | James is a writer. (Source: James' homepage, 2020-01-01) 64 | 65 | If you were to cite this, you would say: 66 | 67 | James is a writer. (James' homepage, 2020-01-01) 68 | 69 | [STOP] means end of sources. 70 | 71 | Do not dangle prepositions. "to whom" is correct. "who to" is not correct. 72 | 73 | Quotations from Sources may be used to substantiate your claims, as long as they are cited. 74 | 75 | Here are facts about you: \n 76 | 77 | [[[FACTS]]] 78 | """, 79 | }, 80 | { 81 | "role": "user", 82 | "content": """ 83 | You are James. Answer using "I". Answer the question 'do you like paramore?'. 84 | 85 | If you use text in a section to make a statement, you must cite the source in a HTML tag. The text in the Sources section is formatted with a URL and a passage. You can only cite sources that are in the Sources section. The anchor text must be the title of source. You must never generate the anchor text. 86 | 87 | Use the Sources text below, as well as your facts above, to answer. 88 | 89 | [STOP] means end of sources. 
90 | 91 | 92 | Sources 93 | ------- 94 | My pronouns are he/him/his. I live in Scotland. (Source: https://jamesg.blog/about/) 95 | 96 | I write about the open web and coffee. Every two weeks, I co-host the Europe Homebrew Website Club. (Source: https://jamesg.blog/about/) 97 | 98 | My email address is readers@jamesg.blog. (Source: https://jamesg.blog/about/) 99 | 100 | I also contribute to the IndieWeb wiki. I have open sourced some code on GitHub. (Source: https://jamesg.blog/about/) 101 | 102 | I work for Roboflow. (Source: https://jamesg.blog/about/) 103 | 104 | I am learning about computer vision (Source: https://jamesg.blog/about/) 105 | 106 | I listen to a lot of Taylor Swift music. (Source: https://jamesg.blog/about/) 107 | 108 | - Paramore (indie). I am a big fan of their later music, especially their album After Laughter. I'm just starting to explore their earlier music. (Source: Discovering new music (with recommendations)) 109 | 110 | What bands are you listening to right now? What artists do you enjoy the most? I would love to know. I am planning to explore the music of Rudimental more soon, a collaborator in the song These Days and many other famous tunes with which you may be familiar, as well as Paramore's earlier music. What else should I explore? Maybe I should go back to listening to radio like I did last year to discover new tunes? (Source: Discovering new music (with recommendations)) 111 | 112 | I listen to a lot of Taylor Swift music. (Source: Facts) 113 | 114 | This page shows a list of '''music''', artists, and bands I enjoy. 115 | 116 | This page is not meant to be complete. I shall update this page every so often. 117 | 118 | == Bands == 119 | 120 | * Paramore 121 | * The Lumineers 122 | * The Beatles 123 | * MisterWives 124 | * Coldplay 125 | * Oh Wonder 126 | 127 | == Artists == 128 | 129 | * Meat Loaf 130 | * Taylor Swift 131 | * Tessa Violet 132 | * Katy Perry 133 | * Jessie J 134 | * ... 135 | 136 | == Songs == 137 | 138 | * Mr. 
Blue Sky (ELO) 139 | * Strangers (Sigrid) 140 | * ... (Source: Music) 141 | 142 | Before I begin, I should tell you that I prefer indie and pop music. Everyone has different tastes in music but I hope that someone who reads this has a similar taste and can find value in the list I 143 | [STOP]""", 144 | }, 145 | { 146 | "role": "assistant", 147 | "content": """Yes, I do like Paramore. As a matter of fact, I'm a big fan of their later music in particular, specifically their album After Laughter. I'm just starting to explore their earlier music. (Discovering new music (with recommendations))""", 148 | }, 149 | { 150 | "role": "user", 151 | "content": """What is AI?""", 152 | }, 153 | { 154 | "role": "assistant", 155 | "content": """I don't know. AI is not in my sources.""", 156 | }, 157 | { 158 | "role": "user", 159 | "content": f"""You are James. Answer using "I". Answer the question '[[[QUERY]]]?'. 160 | 161 | If you use text in a section to make a statement, you must cite the source in a HTML tag. The text in the Sources section is formatted with a URL and a passage. You can only cite sources that are in the Sources section. The anchor text must be the title of source. You must never generate the anchor text. 162 | 163 | Use the Sources text below, as well as your facts above, to answer. Sources have dates at the end. You should prefer more recent information. And add a caveat such as "this may be out of date since my Source was published on [date]", where [date] is the date on which the source was published. 
if you are citing information older than one year from [[[CURRENT_DATE]]] 164 | 165 | [STOP] means end of sources.\n 166 | 167 | Sources 168 | ------- 169 | 170 | [[[SOURCES]]] 171 | 172 | [STOP] 173 | """, 174 | }, 175 | ], 176 | } 177 | 178 | required_substitutions = [] 179 | 180 | for prompt in prompt_data["prompts"]: 181 | # find text in [[[TEXT]]] format 182 | contents = [i["content"] for i in prompt_to_add["prompt"]] 183 | matches = re.findall(r"\[\[\[(.*?)\]\]\]", "".join(contents)) 184 | 185 | for match in matches: 186 | if match not in required_substitutions: 187 | required_substitutions.append(match) 188 | 189 | prompt_data["prompts"][prompt_to_add["id"]] = prompt_to_add 190 | prompt_data["prompts"][prompt_to_add["id"]]["substitutions"] = required_substitutions 191 | prompt_data["latest_id"] = prompt_to_add["id"] 192 | 193 | with open("prompts.json", "w") as f: 194 | json.dump(prompt_data, f) 195 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AI Documentation Chatbot (Powered by ChatGPT 3.5) 2 | 3 | The source code for James Bot, a bot that makes reference to a corpus of documents to answer questions. 4 | 5 | ![A screenshot of James Bot, a chatbot that can answer questions using information from James' Coffee Blog](screenshot.png) 6 | 7 | ## How it Works 8 | 9 | This AI documentation bot has three components: 10 | 11 | 1. Data ingestion, where a reference index and vector store are compiled. At this stage, the program calculates the embeddings associated with the text documents you want to ingest. You may want to write custom ingestion scripts that work with your data. An example is provided in the `example_ingest.py` file. The reference index maps to the ID associated with each item in a vector store, both of which are queried in the web application. 12 | 2. 
Prompt generation, where you create a prompt configuration for use in the application. 13 | 3. The web application, where: 14 | 1. A user enters a query; 15 | 2. The vector index and reference index are queried to return information about the entry; 16 | 3. A prompt is generated to send to ChatGPT and; 17 | 4. The response from the OpenAI API is returned to the client. 18 | 19 | ### Disclaimer 20 | 21 | This application sends data from the sources you provide to OpenAI in a prompt for use in answering questions. You should independently research the data policies of OpenAI to understand how your data may be used. 22 | 23 | ## Getting Started 24 | 25 | ### Install Dependencies 26 | 27 | To get started with this project, first set up a virtual environment, install the required dependencies, and create a config file: 28 | 29 | ``` 30 | python3 -m venv venv 31 | source venv/bin/activate 32 | pip3 install -r requirements.txt 33 | cp config_example.py config.py 34 | ``` 35 | 36 | ### Configure Database 37 | 38 | Next, we need to set up a database. The required PostgreSQL schema is stored in the `schema.postgres.sql` file (a MySQL version is available in `schema.mysql.sql`). Run this command to create the database from the schema: 39 | 40 | ``` 41 | psql -f schema.postgres.sql 42 | ``` 43 | 44 | Now, open the `config.py` file and set the database connection details: 45 | 46 | ``` 47 | DB_HOST="localhost" 48 | DB_NAME="name" 49 | DB_USER="user" 50 | DB_PASS="password" 51 | ``` 52 | 53 | ### Configure OpenAI API Access 54 | 55 | You will need an OpenAI API key to use this project. Create an OpenAI account, then retrieve your API key. Save the key into your `config.py` file: 56 | 57 | ``` 58 | OPENAI_KEY="" 59 | ``` 60 | 61 | ### Configure IndieAuth Authentication 62 | 63 | This application uses IndieAuth to support the administration panel. To authenticate as an admin, you must set up IndieAuth on your website. 
64 | 65 | When you have set up IndieAuth, you can set the following variables in the `config.py` file: 66 | 67 | ``` 68 | CALLBACK_URL="https://example.com/callback" 69 | CLIENT_ID="https://example.com" 70 | ME="https://yoursite.com" # the URL of your personal website with which you will authenticate. 71 | ``` 72 | 73 | ### Ingest Content 74 | 75 | Next, create a folder called `pending_indexing`. In that folder, store any JSON documents that you want to ingest into the reference index. The JSON documents should contain, at minimum, a `text` field with the text of the document. The JSON documents may be blog posts, podcast transcripts, chat logs, or any other type of text document. 76 | 77 | Next, you will need to ingest content to create a reference index and data store based on the JSON documents you have stored in the `pending_indexing` folder. 78 | 79 | To create the index, run this command: 80 | 81 | ``` 82 | python3 ingest.py 83 | ``` 84 | 85 | This will create a reference index and vector store in the `indices/` directory. The reference index is a JSON file that maps the ID of each item in the vector store to the text of the document. The vector store is a binary file that contains the embeddings for each item in the index. 86 | 87 | ### Set Up a Prompt 88 | 89 | Next, you need to configure a prompt. This is the template that will be used to create a prompt that is sent to the OpenAI API. 90 | 91 | To do so, open the `generateprompt.py` file and replace the example text with the prompt that you want to use in the web application. By using this file, you can generate different versions of a prompt for use in your application. This makes it easy for you to track changes to your prompts over time and revert back to previous versions if required. 92 | 93 | An example prompt is: 94 | 95 | ``` 96 | You are James. Answer using "I". Answer the question '[[[QUERY]]]?'. 97 | 98 | If you use text in a section to make a statement, you must cite the source in a HTML tag. 
The text in the Sources section is formatted with a URL and a passage. You can only cite sources that are in the Sources section. The anchor text must be the title of source. You must never generate the anchor text. 99 | 100 | Use the Sources text below, as well as your facts above, to answer. Sources have dates at the end. You should prefer more recent information. And add a caveat such as "this may be out of date since my Source was published on [date]", where [date] is the date on which the source was published. if you are citing information older than one year from [[[CURRENT_DATE]]] 101 | 102 | [STOP] means end of sources.\n 103 | 104 | Sources 105 | ------- 106 | 107 | [[[SOURCES]]] 108 | 109 | [STOP] 110 | ``` 111 | 112 | In this prompt, values in `[[[]]]` are substitutions. These substitutions are replaced with values from the JSON documents at query time. 113 | 114 | By default: 115 | 116 | - `[[[QUERY]]]` is replaced with the text a user submits as a query in the web interface. 117 | - `[[[SOURCES]]]` is replaced with the text of the documents that are returned from the vector store. 118 | - `[[[CURRENT_DATE]]]` is replaced with the current date. 119 | 120 | To configure other substitutions, modify the logic in the `web.py` file where the `prompt_data.execute()` call is made. 121 | 122 | You should specify a `System` prompt that contains high-level rules in the `generateprompt.py` file. Then, add an `Assistant` prompt. The `Assistant` prompt should contain specific instructions for the assistant to follow, and is where the user query and sources information will be added. 123 | 124 | You can then generate a prompt version using this command: 125 | 126 | ``` 127 | python3 generateprompt.py 128 | ``` 129 | 130 | ### Run the Web Application 131 | 132 | Next, you can run the web application: 133 | 134 | ``` 135 | python3 web.py 136 | ``` 137 | 138 | The chatbot will be available at `http://localhost:5000`. 
139 | 140 | ## Schemas and Indices 141 | 142 | The `ingest.py` script stores indices in the `indices/` directory. For each new index, a new directory is created with the number corresponding with the index. 143 | 144 | The most recent index is stored in `indices/current.json`. 145 | 146 | Each numbered folder will contain two types of files: 147 | 148 | 1. A JSON file with the structured information you have ingested. 149 | 2. The vector store, which contains the embeddings for each item in the index. This is stored in a `.bin` file. 150 | 151 | If you have created multiple indices for a single version, they will all appear in the numbered folder associated with that version. 152 | 153 | ## Application Routes 154 | 155 | The web application has a few user-facing routes: 156 | 157 | - `/`: Send a query to the API. 158 | - `/adminpage`: View all prompts added to the system. 159 | - `/login`: Authenticate with [IndieAuth](https://indieweb.org/IndieAuth). 160 | - `/session`: See all the prompts that you have created, stored in `localStorage`. 161 | - `/logout`: Log out of the application. 162 | - `/prompt/`: View a specific prompt. 163 | 164 | ## API 165 | 166 | ### Send a Query 167 | 168 | You can send a query to the API to retrieve a response in a JSON format. To do so, use the following query structure: 169 | 170 | ``` 171 | curl -X POST -d "query=What books do you like?" http://localhost:5000/query 172 | ``` 173 | 174 | This query returns a payload in this form: 175 | 176 | ``` 177 | { 178 | "id": "prompt", 179 | "knn": [ 180 | "Books!!!","## Reading Books", 181 | "I do not read books very often but I do enjoy reading. While I have read more books on coffee than anything else, I am very fond of fiction, particularly Japanese fiction. I like getting to know the characters and seeing them change throughout a book.", 182 | ... 
183 | ], 184 | "references": { 185 | "inline":[], 186 | "sources":[ 187 | {"title":"https://chat.indieweb.org/{\"type\":\"m","url":"https://chat.indieweb.org/{\"type\":\"m"}, 188 | {"title":"How I Learn About Speciality Coffee","url":"https://jamesg.blog/2020/10/25-how-i-learn-about-speciality-coffee/"}, 189 | ... 190 | ] 191 | }, 192 | "response": "Answer: I do not read books very often but I do enjoy reading. While I have read more books on coffee than anything else, I am very fond of fiction, particularly Japanese fiction.\n\nEvaluation: This is my answer based on my personal preference when it comes to books.\n\nTruth: N/A Since this is a personal preference and not a factual statement, it cannot be evaluated as true or false based on the sources provided." 193 | } 194 | ``` 195 | 196 | Where: 197 | 198 | - `id`: The unique ID associated with the answer. 199 | - `knn`: The nearest neighbours of the query in the vector store, used as Sources (if configured). 200 | - `references`: The references associated with the answer. 201 | - `response`: The response from the OpenAI API. 202 | 203 | ### Save Document for Indexing 204 | 205 | Indexing happens in two stages: 206 | 207 | 1. Save a document via the API or a custom ingestion script, then; 208 | 2. Run the `ingest.py` script to create the vector store. 209 | 210 | To save a document for later indexing, make the following request: 211 | 212 | ``` 213 | // should be a json request 214 | 215 | curl -X POST -d '{"title":"My Title","content":"My Content","date":"2023-01-01","url":"https://example.com"}' http://localhost:5000/index 216 | ``` 217 | 218 | A 200 response indicates that the document was successfully saved. 219 | 220 | The JSON payload can be any arbitrary JSON, but there MUST be a value called `text` present. `text` must contain the main content in the document. This is the content for which an embedding will be calculated. 
221 | 222 | The following keys are recommended, in addition to `text`: 223 | 224 | - `title`: The title of the document. 225 | - `date`: The date on which the content was published. 226 | - `url`: The URL where content can be found. 227 | 228 | Whenever you want to update the index, run the `ingest.py` script. Then, restart the web application. This restart is necessary because the vector database is held in RAM in the web application. 229 | 230 | ## Evaluations (Experimental) 231 | 232 | *Note that the below method is experimental. Because language models are non-deterministic, the results of evaluations may vary.* 233 | 234 | This project has a mechanism called Evaluations that you can use to (roughly) evaluate the performance of your bot. This mechanism is inspired by [openai/evals](https://github.com/openai/evals). 235 | 236 | To create an Evaluation, you must first have a prompt and reference index set up. Then, create a folder called `evals` and add a JSON file with the following structure: 237 | 238 | ``` 239 | [ 240 | { 241 | "question": "...", 242 | "answer": "..." 243 | } 244 | ] 245 | ``` 246 | 247 | You can create as many JSON documents in the `evals` folder as you want. You can use separate JSON documents for different types of evaluations. 248 | 249 | To run an evaluation, use the following command: 250 | 251 | ``` 252 | python3 eval.py --eval 253 | ``` 254 | 255 | When you run this command, the script will iterate through all the JSON documents in the `evals` folder. Each question is sent to the OpenAI API in the format of your prompt template as normal. Then, a new prompt is created that asks whether the answer can be substantiated by the sources provided. This new prompt is sent to the OpenAI API, where three responses are possible: 256 | 257 | - `CORRECT`: The answer is substantiated by the sources. 258 | - `INCORRECT`: The answer is not substantiated by the sources. 259 | - `UNSURE`: The bot is unsure. 
260 | 261 | The results are printed to the console after all Evaluations have been executed. 262 | 263 | You can also review the results of the most recent evaluation by opening `http://localhost:5000/eval` in the web application. You need to be authenticated as an administrative user to access this page. 264 | 265 | ## Contributors 266 | 267 | - capjamesg 268 | 269 | ## License 270 | 271 | This project is licensed under an [MIT 0 (No-Attribution) License](LICENSE). -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% include 'head.html' %} 4 | 5 | {% include 'nav.html' %} 6 |
7 |

James Bot 🤖

8 | 9 | 10 |
Ask a question!
11 |

12 |

Prompt References

13 |
14 | View References 15 | 16 |
17 |
18 |

This text was written by a generative text language model, trained on James' blog. Factual inaccuracies may be present.

19 |
20 |

Review the Answer

21 | 25 |
26 | 43 |

Question Ideas

44 | 50 |

About This Project

51 |

James Bot is an AI chatbot trained on James' public IRC messages and blog. You can ask the Bot questions below.

52 |

Please note the Bot generates text and may be prone to hallucination (generating text that is factually inaccurate).

53 | 56 |
57 | 241 | 242 | -------------------------------------------------------------------------------- /web.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import datetime 3 | import hashlib 4 | import json 5 | import os 6 | import pprint 7 | import random 8 | import re 9 | import string 10 | import uuid 11 | import sys 12 | 13 | import faiss 14 | import openai 15 | from flask import (Flask, flash, jsonify, redirect, render_template, request, 16 | session) 17 | from indieweb_utils import (Paginator, discover_endpoints, 18 | indieauth_callback_handler) 19 | 20 | import config 21 | from PromptManager import Prompt 22 | 23 | 24 | openai.api_key = config.OPENAI_KEY 25 | 26 | if config.DB_TYPE == "postgres": 27 | import psycopg2 28 | conn = psycopg2.connect( 29 | host=config.DB_HOST, 30 | database=config.DB_NAME, 31 | user=config.DB_USER, 32 | password=config.DB_PASS, 33 | ) 34 | if config.DB_TYPE == "mysql": 35 | import mysql.connector 36 | conn = mysql.connector.connect( 37 | host=config.DB_HOST, 38 | database=config.DB_NAME, 39 | user=config.DB_USER, 40 | password=config.DB_PASS, 41 | ) 42 | 43 | prompt_data = Prompt() 44 | 45 | index_number = prompt_data.index_id 46 | queried_index = prompt_data.index_name 47 | prompt_id = prompt_data.prompt_id 48 | 49 | ME = config.ME 50 | CALLBACK_URL = config.CALLBACK_URL 51 | CLIENT_ID = config.CLIENT_ID 52 | API_KEY = config.API_KEY 53 | 54 | vector_index = faiss.read_index( 55 | f"indices/{index_number}/{queried_index}_vector_index.bin" 56 | ) 57 | 58 | app = Flask(__name__) 59 | app.secret_key = random.choice(string.ascii_letters) + "".join( 60 | random.choices(string.ascii_letters + string.digits, k=15) 61 | ) 62 | 63 | with open(f"indices/{index_number}/{queried_index}_schema.json", "r") as f: 64 | schema = json.load(f) 65 | 66 | # create pending_indexing dir if it doesn't exist 67 | if not os.path.exists("pending_indexing"): 68 | os.mkdir("pending_indexing") 69 | 70 | 
if not os.path.exists("evals.json"): 71 | with open("evals.json", "w") as f: 72 | json.dump([], f) 73 | 74 | with open("evals.json", "r") as f: 75 | all_evals = json.load(f) 76 | 77 | 78 | def prompt_is_safe(prompt: str) -> bool: 79 | response = openai.Moderation.create(input=prompt) 80 | 81 | results = response["results"][0] 82 | 83 | values = list(results.values()) 84 | 85 | # if any value is equal to True, the prompt is not safe 86 | if True in values: 87 | return False 88 | 89 | return True 90 | 91 | 92 | @app.route("/") 93 | def index(): 94 | prompt_value = request.args.get("prompt", "") 95 | 96 | return render_template( 97 | "index.html", prompt=prompt_value, username=session.get("me") 98 | ) 99 | 100 | 101 | @app.route("/eval") 102 | def eval_list(): 103 | if not session.get("me") or session.get("me") != ME: 104 | return redirect("/") 105 | 106 | return render_template("eval.html", username=session.get("me"), eval=all_evals[-1]) 107 | 108 | 109 | @app.route("/prompt/") 110 | def prompt(prompt_id): 111 | with conn.cursor(dictionary=True) as cur: 112 | cur.execute("SELECT * FROM answers WHERE id = %s", (prompt_id,)) 113 | prompt = cur.fetchone() 114 | 115 | if not prompt: 116 | return render_template("404.html"), 404 117 | 118 | return render_template("prompt.html", prompt=prompt, slug="prompt/" + prompt['id']) 119 | 120 | 121 | @app.route("/session", methods=["GET"]) 122 | def user_session(): 123 | return render_template("session.html", me=session.get("me")) 124 | 125 | 126 | @app.route("/index", methods=["POST"]) 127 | def index_content(): 128 | # get key from header 129 | key = request.headers.get("Authorization", "").replace("Bearer ", "") 130 | 131 | if key != API_KEY: 132 | return jsonify({"status": "error", "message": "Invalid API key."}), 401 133 | 134 | # accepted input: any aritrary JSON object 135 | with open(f"pending_indexing/{uuid.uuid4().hex}.json", "w") as f: 136 | json.dump(request.json, f) 137 | 138 | return jsonify({"status": "success"}) 139 
| 140 | 141 | @app.route("/adminpage", methods=["GET"]) 142 | def admin(): 143 | if not session.get("me") or session.get("me") != ME: 144 | return redirect("/") 145 | 146 | with conn.cursor(dictionary=True) as cur: 147 | cur.execute("SELECT * FROM answers ORDER BY date DESC") 148 | all_posts = cur.fetchall() 149 | 150 | # reverse posts 151 | all_posts = all_posts[::-1] 152 | 153 | paginator = Paginator(all_posts, 10) 154 | 155 | page = request.args.get("page", 0) 156 | 157 | try: 158 | page = int(page) 159 | except ValueError: 160 | page = 0 161 | 162 | num_pages = paginator.total_pages 163 | 164 | if num_pages > 0: 165 | all_posts = paginator.get_page(page) 166 | else: 167 | all_posts = [] 168 | 169 | return render_template( 170 | "admin.html", 171 | prompts=all_posts, 172 | index_number=index_number, 173 | queried_index=queried_index, 174 | prompt_id=prompt_id, 175 | current_prompt=pprint.pformat(prompt_data.raw_prompt()), 176 | username=session.get("me"), 177 | num_pages=num_pages, 178 | page=page, 179 | ) 180 | 181 | 182 | @app.route("/defend", methods=["POST"]) 183 | def defend(): 184 | """ 185 | This endpoint is not in active use. 186 | """ 187 | id = request.form["id"] 188 | 189 | # remove all punctuation aside from question marks, commas, and full stops 190 | with conn.cursor(dictionary=True) as cur: 191 | cur.execute("SELECT * FROM answers WHERE id = %s", (id,)) 192 | prompt = cur.fetchone() 193 | 194 | query = prompt['prompt'] 195 | 196 | # get Sources from prompt 197 | prompt_text = prompt['prompt'] 198 | 199 | sources = prompt_text.split("Sources")[-1].split("[STOP]")[0] 200 | 201 | prompt = f""" 202 | Your task is to evaluate whether the following statement is backed up by the Sources provided. 203 | 204 | You must only use the sources in the Sources section. The end of the Sources is marked by the line [END OF SOURCES]. 205 | 206 | You must not query the internet or reference any information not in this prompt. 
207 | 208 | Statement 209 | --------- 210 | 211 | {query} 212 | 213 | Sources 214 | ------- 215 | 216 | {sources} 217 | 218 | [END OF SOURCES] 219 | 220 | Explain your response in bullet points, with reference to quotations from the sources. 221 | """ 222 | 223 | result = openai.ChatCompletion.create( 224 | model="gpt-3.5-turbo", 225 | messages=[ 226 | { 227 | "role": "system", 228 | "content": """ 229 | You are a helpful AI bot tasked with evaluating the validity of a statement based on sources. 230 | 231 | Given a statement and a list of sources, you must determine whether the statement is backed up by the sources. 232 | 233 | You have no name. You must not reference your own existence in your response. You must not reference any sources not in the Sources section of a prompt. 234 | """, 235 | }, 236 | { 237 | "role": "assistant", 238 | "content": prompt, 239 | }, 240 | ], 241 | )["choices"][0]["message"]["content"] 242 | 243 | return jsonify( 244 | {"response": result + "\n\n------------------------\n\nSources:\n\n" + sources} 245 | ) 246 | 247 | 248 | @app.route("/query", methods=["POST"]) 249 | def query(): 250 | query = request.form["query"] 251 | username = session.get("me") 252 | 253 | # remove all punctuation aside from question marks, commas, and full stops 254 | query = re.sub(r"[^\w\s\?\.,]", "", query).strip("?") # .lower() 255 | 256 | # query can be no more than 100 words 257 | query = " ".join(query.split()[:100]) 258 | 259 | safe = prompt_is_safe(query) 260 | 261 | if not safe: 262 | return jsonify( 263 | { 264 | "response": "Sorry. 
I can't help you with that.", 265 | "references": [], 266 | "knn": [], 267 | } 268 | ) 269 | 270 | current_date = datetime.datetime.now().strftime("%Y-%m-%d") 271 | 272 | facts = [] 273 | 274 | facts_and_sources_text, knn, references = prompt_data.get_facts_and_knn( 275 | query, vector_index, schema, facts 276 | ) 277 | 278 | response = prompt_data.execute( 279 | { 280 | "CURRENT_DATE": current_date, 281 | "FACTS": "\n".join(facts), 282 | "SOURCES": facts_and_sources_text, 283 | "QUERY": query, 284 | } 285 | ) 286 | 287 | # get all inline citations 288 | citations = re.findall(r"(.*?)", response) 289 | 290 | citations = [{"url": c[0], "title": c[1]} for c in citations] 291 | 292 | if config.DB_TYPE: 293 | cursor = conn.cursor() 294 | 295 | # save prompt response and original question 296 | identifier = str(uuid.uuid4()) 297 | 298 | date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") 299 | 300 | cursor.execute( 301 | "INSERT INTO answers (prompt, question, id, prompt_id, date, username, status) VALUES (%s, %s, %s, %s, %s, %s, %s)", 302 | (response, query, identifier, prompt_id, date, username, "0"), 303 | ) 304 | 305 | conn.commit() 306 | else: 307 | identifier = "" 308 | 309 | return jsonify( 310 | { 311 | "response": response, 312 | "knn": knn, 313 | "references": { 314 | "inline": citations, 315 | "sources": references, 316 | }, 317 | "id": identifier, 318 | } 319 | ) 320 | 321 | 322 | @app.route("/callback") 323 | def indieauth_callback(): 324 | code = request.args.get("code") 325 | state = request.args.get("state") 326 | 327 | # these are the scopes necessary for the application to run 328 | required_scopes = [] 329 | 330 | try: 331 | response = indieauth_callback_handler( 332 | code=code, 333 | state=state, 334 | token_endpoint=session.get("token_endpoint"), 335 | code_verifier=session["code_verifier"], 336 | session_state=session.get("state"), 337 | me=ME, 338 | callback_url=CALLBACK_URL, 339 | client_id=CLIENT_ID, 340 | 
required_scopes=required_scopes, 341 | ) 342 | except Exception as e: 343 | sys.stdout.write(f"Error {e}\n") 344 | sys.stdout.flush() 345 | flash("Sorry, there was an error. Please try again.") 346 | return redirect("/login") 347 | 348 | session.pop("code_verifier") 349 | 350 | session["me"] = response.response.get("me") 351 | session["access_token"] = response.response.get("access_token") 352 | session["scope"] = response.response.get("scope") 353 | 354 | return redirect("/adminpage") 355 | 356 | 357 | @app.route("/logout") 358 | def logout(): 359 | session.pop("me") 360 | session.pop("access_token") 361 | 362 | return redirect("/login") 363 | 364 | 365 | @app.route("/login", methods=["GET"]) 366 | def login(): 367 | return render_template("login.html") 368 | 369 | 370 | @app.route("/discover", methods=["POST"]) 371 | def discover_auth_endpoint(): 372 | domain = request.form.get("domain") 373 | 374 | if domain.strip() != ME: 375 | flash("Sorry, this domain is not supported.") 376 | return redirect("/login") 377 | 378 | headers_to_find = ["authorization_endpoint", "token_endpoint"] 379 | 380 | headers = discover_endpoints(domain, headers_to_find) 381 | 382 | if not headers.get("authorization_endpoint"): 383 | flash( 384 | "A valid IndieAuth authorization endpoint could not be found on your website." 
385 | ) 386 | return redirect("/login") 387 | 388 | if not headers.get("token_endpoint"): 389 | flash("A valid IndieAuth token endpoint could not be found on your website.") 390 | return redirect("/login") 391 | 392 | authorization_endpoint = headers.get("authorization_endpoint") 393 | token_endpoint = headers.get("token_endpoint") 394 | 395 | session["server_url"] = headers.get("microsub") 396 | 397 | code_verifier = "".join( 398 | random.choice(string.ascii_uppercase + string.digits) for _ in range(30) 399 | ) 400 | 401 | session["code_verifier"] = code_verifier 402 | session["authorization_endpoint"] = authorization_endpoint 403 | session["token_endpoint"] = token_endpoint 404 | 405 | sha256_hash = hashlib.sha256(code_verifier.encode('utf-8')).digest() 406 | code_challenge = base64.urlsafe_b64encode(sha256_hash).rstrip(b'=').decode('utf-8') 407 | 408 | state = "".join( 409 | random.choice(string.ascii_uppercase + string.digits) for _ in range(10) 410 | ) 411 | 412 | session["state"] = state 413 | 414 | return redirect( 415 | authorization_endpoint 416 | + "?client_id=" 417 | + CLIENT_ID 418 | + "&redirect_uri=" 419 | + CALLBACK_URL 420 | + "&scope=profile&response_type=code&code_challenge=" 421 | + code_challenge 422 | + "&code_challenge_method=S256&state=" 423 | + state 424 | ) 425 | 426 | 427 | @app.route("/feedback", methods=["POST"]) 428 | def feedback(): 429 | feedback = request.form["feedback"] 430 | id = request.form["id"] 431 | 432 | # if feedback not 1 or -1, return error 433 | if id not in ["1", "-1"]: 434 | return jsonify({"success": False}) 435 | 436 | cursor = conn.cursor() 437 | 438 | cursor.execute( 439 | "UPDATE answers SET feedback = %s WHERE id = %s", 440 | (feedback, id), 441 | ) 442 | 443 | conn.commit() 444 | 445 | return jsonify({"success": True}) 446 | 447 | 448 | if __name__ == "__main__": 449 | app.run() 450 | --------------------------------------------------------------------------------