├── .dockerignore ├── .gitignore ├── .python-version ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── config.py ├── data └── books.db ├── logger.py ├── main.py ├── notebooks ├── ask-ey.ipynb ├── embed.ipynb ├── scrape.ipynb └── t-sne.png ├── qa.py ├── railway.toml ├── requirements.txt ├── runtime.txt ├── search.py ├── sql.py ├── summarize.py └── utils.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .github 3 | .gitignore 4 | .idea 5 | .venv 6 | .git 7 | .ipynb_checkpoints 8 | __pycache__ 9 | tests 10 | _releaser 11 | _site 12 | CONTRIBUTING.md 13 | Dockerfile 14 | docker-compose.yml 15 | /vendor -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 
162 | # Local env
163 | .vscode/
164 | .DS_Store
--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.9.7
2 | 
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # syntax=docker/dockerfile:1
2 | # temp stage
3 | FROM python:3.9.7-slim AS builder
4 | 
5 | WORKDIR /app
6 | 
7 | ENV PYTHONDONTWRITEBYTECODE=1
8 | ENV PYTHONUNBUFFERED=1
9 | 
10 | RUN apt-get update && \
11 |     apt-get install -y --no-install-recommends gcc
12 | 
13 | COPY requirements.txt .
14 | RUN pip wheel --no-cache-dir --no-deps --wheel-dir /app/wheels -r requirements.txt
15 | 
16 | 
17 | # final stage
18 | FROM python:3.9.7-slim
19 | 
20 | WORKDIR /app
21 | 
22 | COPY --from=builder /app/wheels /wheels
23 | COPY --from=builder /app/requirements.txt .
24 | 
25 | RUN pip install --no-cache-dir /wheels/*
26 | 
27 | COPY . .
28 | 
29 | ENTRYPOINT ["python3", "main.py"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: build dev prod
2 | 
3 | build:
4 | 	DOCKER_BUILDKIT=1 docker build -t gpt-dev .
5 | 
6 | dev: build
7 | 	docker run -it --rm gpt-dev --env=dev
8 | 
9 | prod: build
10 | 	docker run -it --rm gpt-dev
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # discord-llm
2 | 
3 | Code for [Experimenting with LLMs to Research, Reflect, and Plan](https://eugeneyan.com/writing/llm-experiments/). Disclaimer: The code is disorganized and hacky, and relies largely on LangChain's abstractions. It may be useful as a reference, but **not** as learning material.
4 | 
5 | If you want to try this, update the `.env` file with your own keys. Most functionality, such as summarizing URLs, running SQL queries on `/data/books.db`, and search, should work right out of the box. For Q&A, you'll need to add your own custom indices.
6 | 
7 | ## Discord functionality
8 | - Summarize and ELI5 URLs
9 | - Run basic SQL via a chain or agent
10 | - Run a search query via Google Custom Search
11 | - Q&A on custom indices (Note: You need to add your own indices)
12 | 
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | """
2 | Configurations
3 | """
4 | # Defaults
5 | DEFAULT_MODEL = 'gpt-3.5-turbo'
6 | 
7 | # Config for summarization
8 | TOKENIZER_DICT = {'gpt-3.5-turbo': 'cl100k_base',
9 |                   'gpt-4': 'cl100k_base'}
10 | SUMMARY_MAX_TOKENS_DICT = {'gpt-3.5-turbo': 3800,
11 |                            'gpt-4': 7000}
12 | 
13 | SUMMARY_MODEL = DEFAULT_MODEL
14 | SUMMARY_TOKENIZER = TOKENIZER_DICT[SUMMARY_MODEL]
15 | SUMMARY_MAX_TOKENS = SUMMARY_MAX_TOKENS_DICT[SUMMARY_MODEL]
16 | 
17 | # Config for search
18 | SEARCH_MODEL = DEFAULT_MODEL
19 | 
20 | # Config for SQL
21 | SQL_MODEL = DEFAULT_MODEL
22 | 
23 | # Config for Q&A
24 | QA_MODEL = DEFAULT_MODEL
25 | PINECONE_ENV = 'us-west4-gcp'
26 | PINECONE_INDEX_NAME_EY = 'ask-ey'
27 | PINECONE_INDEX_NAME_BOARD = 'board'
28 | EMBEDDING_MODEL = 'text-embedding-ada-002'
--------------------------------------------------------------------------------
/data/books.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eugeneyan/discord-llm/0308a50b89e432bd7e34656e9b3911939eab7e9f/data/books.db
--------------------------------------------------------------------------------
/logger.py:
--------------------------------------------------------------------------------
1 | """
2 | Logger utility
3 | """
4 | import logging
5 | 
6 | logger = logging.getLogger(__name__)
7 | logger.setLevel(logging.INFO)
8 | formatter = logging.Formatter('%(asctime)s - %(message)s')
9 | 
10 | # create console handler and set level to info
11 | ch = logging.StreamHandler()
12 | ch.setFormatter(formatter)
13 | ch.setLevel(logging.INFO)
14 | 
15 | # add ch to logger
16 | logger.addHandler(ch)
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | """
2 | Bot for a Discord server that uses the OpenAI API for commands.
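Run it directly, as a sketch based on the argparse flags below:
    python main.py              # defaults to --env=prod and uses DISCORD_TOKEN
    python main.py --env=dev    # uses DISCORD_TOKEN_DEV and prefixes commands with 'dev-'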
3 | """ 4 | import argparse 5 | import os 6 | from sqlite3 import OperationalError 7 | 8 | import interactions 9 | from dotenv import load_dotenv 10 | 11 | from config import DEFAULT_MODEL 12 | from logger import logger 13 | from qa import qa_board, qa_ey 14 | from search import search_agent 15 | from sql import sql_agent, sql_chain 16 | from summarize import eli5_url, summarize_url 17 | 18 | # Parse arguments 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--env', type=str, default='prod') 21 | args = parser.parse_args() 22 | logger.info(f'Arguments: {args.__dict__}') 23 | 24 | # Discord arguments 25 | MAX_INITIAL_MESSAGE_LENGTH = 1900 26 | MAX_MESSAGE_LENGTH = 2000 27 | 28 | # Load environment variables 29 | load_dotenv() 30 | TOKEN = os.getenv('DISCORD_TOKEN') 31 | GUILD_ID = os.getenv('DISCORD_GUILD_ID') 32 | CMD_PREFIX = '' 33 | 34 | if args.env == 'dev': 35 | TOKEN = os.getenv('DISCORD_TOKEN_DEV') 36 | CMD_PREFIX = 'dev-' 37 | 38 | bot = interactions.Client(TOKEN) 39 | logger.info(f'Bot initialized: {bot.__dict__}') 40 | 41 | # Define reusable options 42 | OPTIONS_TEMPERATURE = interactions.Option(name='temperature', description='Lower values = more focused responses, higher values = more random', required=False, 43 | type=interactions.OptionType.NUMBER, min_value=0.0, max_value=2.0) 44 | OPTIONS_MODEL = interactions.Option(name='model', description='Model to use', required=False, 45 | type=interactions.OptionType.STRING, 46 | choices=[interactions.Choice(name='gpt-3.5', value='gpt-3.5-turbo'), 47 | interactions.Choice(name='gpt-4', value='gpt-4')]) 48 | OPTIONS_SHOW_SOURCE = interactions.Option(name='show_source', description='Show snippets of source content', required=False, 49 | type=interactions.OptionType.BOOLEAN, 50 | choices=[interactions.Choice(name='yes', value=True), 51 | interactions.Choice(name='no', value=False)]) 52 | 53 | 54 | @bot.command(name=f'{CMD_PREFIX}hello', description='Says hello without hitting any APIs. Used for health checks.', scope=GUILD_ID) 55 | async def _hello(ctx: interactions.CommandContext): 56 | await ctx.send(f'Hello {ctx.author.mention}! 
How are you?')
57 | 
58 | 
59 | @bot.command(name=f'{CMD_PREFIX}summarize', description='Summarizes a URL in bullet points', scope=GUILD_ID,
60 |              options=[interactions.Option(name='url', description='URL to summarize', required=True, type=interactions.OptionType.STRING),
61 |                       OPTIONS_TEMPERATURE, OPTIONS_MODEL])
62 | async def _summarize(ctx: interactions.CommandContext, url: str, temperature: float = None, model: str = DEFAULT_MODEL):
63 |     logger.info(f'Summarize: {url}, Temp: {temperature}, Model: {model}')
64 |     await ctx.defer()
65 |     summary, time = summarize_url(url, temperature, model)
66 |     summary += f'\n\n `Temp: {temperature}, Model: {model}, Time: {time:.2f}s`'
67 |     await ctx.send(f'Here is the summary of {url}:\n\n{summary[:MAX_INITIAL_MESSAGE_LENGTH]}')
68 |     for i in range(MAX_INITIAL_MESSAGE_LENGTH, len(summary), MAX_MESSAGE_LENGTH):
69 |         await ctx.send(f'{summary[i:i+MAX_MESSAGE_LENGTH]}')
70 | 
71 | 
72 | @bot.command(name=f'{CMD_PREFIX}eli5', description='Explains a URL to a five-year-old', scope=GUILD_ID,
73 |              options=[interactions.Option(name='url', description='URL to explain', required=True, type=interactions.OptionType.STRING),
74 |                       OPTIONS_TEMPERATURE, OPTIONS_MODEL])
75 | async def _eli5(ctx: interactions.CommandContext, url: str, temperature: float = None, model: str = DEFAULT_MODEL):
76 |     logger.info(f'ELI5: {url}, Temp: {temperature}, Model: {model}')
77 |     await ctx.defer()
78 |     explanation, time = eli5_url(url, temperature, model)
79 |     explanation += f'\n\n `Temp: {temperature}, Model: {model}, Time: {time:.2f}s`'
80 |     await ctx.send(f'Here is the explanation of {url}:\n\n{explanation[:MAX_INITIAL_MESSAGE_LENGTH]}')
81 |     for i in range(MAX_INITIAL_MESSAGE_LENGTH, len(explanation), MAX_MESSAGE_LENGTH):
82 |         await ctx.send(f'{explanation[i:i+MAX_MESSAGE_LENGTH]}')
83 | 
84 | 
85 | @bot.command(name=f'{CMD_PREFIX}search', description='Searches the internet for a query', scope=GUILD_ID,
86 |              options=[interactions.Option(name='query', description='Query to search for', required=True, type=interactions.OptionType.STRING),
87 |                       OPTIONS_TEMPERATURE, OPTIONS_MODEL])
88 | async def _search_agent(ctx: interactions.CommandContext, query: str, temperature: float = None, model: str = DEFAULT_MODEL):
89 |     logger.info(f'Search: {query}, Temp: {temperature}, Model: {model}')
90 |     await ctx.defer()
91 |     try:
92 |         result, time = search_agent(query, temperature, model)
93 |         result += f'\n\n `Temp: {temperature}, Model: {model}, Time: {time:.2f}s`'
94 |         await ctx.send(f'{result[:MAX_INITIAL_MESSAGE_LENGTH]}')
95 |         for i in range(MAX_INITIAL_MESSAGE_LENGTH, len(result), MAX_MESSAGE_LENGTH):
96 |             await ctx.send(f'{result[i:i+MAX_MESSAGE_LENGTH]}')
97 |     except ValueError as e:
98 |         await ctx.send(f'Error: {e}. 
Please try again.')
99 | 
100 | 
101 | @bot.command(name=f'{CMD_PREFIX}table', description='Describes the books table.', scope=GUILD_ID)
102 | async def _table(ctx: interactions.CommandContext):
103 |     await ctx.send('The books table has the following columns: id, title, author, language, average rating, ratings count, and text reviews count.')
104 | 
105 | 
106 | @bot.command(name=f'{CMD_PREFIX}sql', description='Queries a database via a SQL chain', scope=GUILD_ID,
107 |              options=[interactions.Option(name='query', description='Query to search for', required=True, type=interactions.OptionType.STRING),
108 |                       OPTIONS_TEMPERATURE, OPTIONS_MODEL])
109 | async def _sql_chain(ctx: interactions.CommandContext, query: str, temperature: float = None, model: str = DEFAULT_MODEL):
110 |     logger.info(f'SQL-chain: {query}, Temp: {temperature}, Model: {model}')
111 |     await ctx.defer()
112 |     try:
113 |         result, time = sql_chain(query, temperature, model)
114 |         result += f'\n\n `Temp: {temperature}, Model: {model}, Time: {time:.2f}s`'
115 |         await ctx.send(f'{result[:MAX_INITIAL_MESSAGE_LENGTH]}')
116 |         for i in range(MAX_INITIAL_MESSAGE_LENGTH, len(result), MAX_MESSAGE_LENGTH):
117 |             await ctx.send(f'{result[i:i+MAX_MESSAGE_LENGTH]}')
118 |     except OperationalError as e:
119 |         await ctx.send(f'Error: {e}. Please try again.')
120 | 
121 | 
122 | @bot.command(name=f'{CMD_PREFIX}sql-agent', description='Queries a database via a SQL agent', scope=GUILD_ID,
123 |              options=[interactions.Option(name='query', description='Query to search for', required=True, type=interactions.OptionType.STRING),
124 |                       OPTIONS_TEMPERATURE, OPTIONS_MODEL])
125 | async def _sql_agent(ctx: interactions.CommandContext, query: str, temperature: float = None, model: str = DEFAULT_MODEL):
126 |     logger.info(f'SQL-agent: {query}, Temp: {temperature}, Model: {model}')
127 |     await ctx.defer()
128 |     try:
129 |         result, time = sql_agent(query, temperature, model)
130 |         result += f'\n\n `Temp: {temperature}, Model: {model}, Time: {time:.2f}s`'
131 |         await ctx.send(f'{result[:MAX_INITIAL_MESSAGE_LENGTH]}')
132 |         for i in range(MAX_INITIAL_MESSAGE_LENGTH, len(result), MAX_MESSAGE_LENGTH):
133 |             await ctx.send(f'{result[i:i+MAX_MESSAGE_LENGTH]}')
134 |     except ValueError as e:
135 |         await ctx.send(f'Error: {e}. 
Please try again.')
136 | 
137 | 
138 | @bot.command(name=f'{CMD_PREFIX}ask-ey', description='Asks eugeneyan.com a question', scope=GUILD_ID,
139 |              options=[interactions.Option(name='question', description='Question to ask', required=True, type=interactions.OptionType.STRING),
140 |                       OPTIONS_TEMPERATURE, OPTIONS_MODEL, OPTIONS_SHOW_SOURCE])
141 | async def _ask_ey(ctx: interactions.CommandContext, question: str, temperature: float = None, model: str = DEFAULT_MODEL, show_source: bool = False):
142 |     logger.info(
143 |         f'Ask ey: {question}, Temp: {temperature}, Model: {model}, Show source: {show_source}')
144 |     await ctx.defer()
145 |     # The first element is the answer, the rest are sources
146 |     result_list, time = qa_ey(question, temperature, model)
147 | 
148 |     result = result_list[0]
149 |     result += f'\n\n `Temp: {temperature}, Model: {model}, Time: {time:.2f}s`'
150 |     await ctx.send(f'{result[:MAX_MESSAGE_LENGTH]}')
151 | 
152 |     if show_source:
153 |         # Send sources as individual messages
154 |         for source in result_list[1:]:
155 |             await ctx.send(f'{source[:MAX_MESSAGE_LENGTH]}')
156 | 
157 | 
158 | @bot.command(name=f'{CMD_PREFIX}board', description='Asks the board of advisors a question', scope=GUILD_ID,
159 |              options=[interactions.Option(name='question', description='Question to ask', required=True, type=interactions.OptionType.STRING),
160 |                       OPTIONS_TEMPERATURE, OPTIONS_MODEL, OPTIONS_SHOW_SOURCE])
161 | async def _ask_board(ctx: interactions.CommandContext, question: str, temperature: float = None, model: str = DEFAULT_MODEL, show_source: bool = False):
162 |     logger.info(
163 |         f'Ask board: {question}, Temp: {temperature}, Model: {model}, Show source: {show_source}')
164 |     await ctx.defer()
165 |     # The first element is the answer, the rest are sources
166 |     result_list, time = qa_board(question, temperature, model)
167 | 
168 |     result = result_list[0]
169 |     result += f'\n\n `Temp: {temperature}, Model: {model}, Time: {time:.2f}s`'
170 |     await ctx.send(f'{result[:MAX_MESSAGE_LENGTH]}')
171 | 
172 |     if show_source:
173 |         # Send sources as individual messages
174 |         for source in result_list[1:]:
175 |             await ctx.send(f'{source[:MAX_MESSAGE_LENGTH]}')
176 | 
177 | 
178 | if __name__ == '__main__':
179 |     bot.start()
180 | 
--------------------------------------------------------------------------------
/notebooks/ask-ey.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 11,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import math\n",
10 | "import os\n",
11 | "import pickle\n",
12 | "\n",
13 | "import pandas as pd\n",
14 | "import pinecone\n",
15 | "import requests\n",
16 | "import xmltodict\n",
17 | "from bs4 import BeautifulSoup\n",
18 | "from dotenv import load_dotenv\n",
19 | "from langchain import OpenAI\n",
20 | "from langchain.chains import RetrievalQAWithSourcesChain\n",
21 | "from langchain.embeddings import OpenAIEmbeddings\n",
22 | "from langchain.text_splitter import CharacterTextSplitter\n",
23 | "from langchain.vectorstores import FAISS, Pinecone"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 2,
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "data": {
33 | "text/plain": [
34 | "True"
35 | ]
36 | },
37 | "execution_count": 2,
38 | "metadata": {},
39 | "output_type": "execute_result"
40 | }
41 | ],
42 | "source": [
43 | "load_dotenv()"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 3,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "import sys\n",
53 | "\n",
54 | "sys.path.append('..')\n",
55 | "\n",
56 | "from logger import logger\n",
57 | "from config import PINECONE_ENV"
58 | ]
59 | },
60 | {
61 | "attachments": {},
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "## Scrape URLs"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 4,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "r = requests.get(\"http://eugeneyan.com/sitemap.xml\")\n",
75 | "xml = r.text\n",
76 | "raw = xmltodict.parse(xml)"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 5,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "html = requests.get('https://eugeneyan.com/writing/content-moderation/').text\n",
86 | "soup = BeautifulSoup(html, features=\"html.parser\")"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 255,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "# _paragraphs = soup.find_all('p')\n",
96 | "# _paragraphs[:5]"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 248,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "# paragraphs = []\n",
106 | "\n",
107 | "# for p in _paragraphs:\n",
108 | "#     if 'class' in p.attrs and 'date' in p['class']:\n",
109 | "#         continue\n",
110 | "#     if p.get_text() == 'To cite this content, please use:':\n",
111 | "#         break\n",
112 | "#     paragraphs.append(p.get_text())"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 256,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "# lines = [line.strip() for line in paragraphs]\n",
122 | "# lines[:5]"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 257,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "# lines = [line for line in lines if len(line) > 15]\n",
132 | "# lines[:5]"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 259,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "# print('\\n'.join(line for line in lines if line))"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 6,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "def extract_text_from(url, min_line_length=20, last_paragraph='To cite this content, please use:'):\n",
151 | "    html = requests.get(url).text\n",
152 | "    soup = BeautifulSoup(html, features=\"html.parser\")\n",
153 | "    \n",
154 | "    # Find all paragraphs and exclude everything from the `last_paragraph` marker onwards\n",
155 | "    _paragraphs = soup.find_all('p')\n",
156 | "    \n",
157 | "    paragraphs = []\n",
158 | "    for p in _paragraphs:\n",
159 | "        if 'class' in p.attrs and 'date' in p['class']:\n",
160 | "            continue\n",
161 | "        if p.get_text() == last_paragraph:\n",
162 | "            break\n",
163 | "        paragraphs.append(p.get_text())\n",
164 | "    logger.debug(f'Paragraphs: {paragraphs[0]}')\n",
165 | "    \n",
166 | "    # Strip surrounding whitespace from each paragraph\n",
167 | "    lines = (line.strip() for line in paragraphs)\n",
168 | "    \n",
169 | "    # Drop lines shorter than min_line_length characters\n",
170 | "    lines = (line for line in lines if len(line) > min_line_length)\n",
171 | "    \n",
172 | "    return '\\n'.join(line for line in lines if line)"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 7,
178 | "metadata": {},
179 | "outputs": [
180 | {
181 | "name": "stdout",
182 | "output_type": "stream",
183 | "text": [
184 | "How can we 
improve a machine learning project’s chance of success? Over the years, I’ve explored various mechanisms in both my own projects and those of my team members. Most people who tried these mechanisms ended up adopting them in future projects.\n", 185 | "While these mechanisms were developed with machine learning projects in mind, with a few tweaks, they can be applied to other technical endeavors too.\n", 186 | "If your team is like most teams I’ve been on, you have 2 - 3 problems for every available person. Thus, each member works on 1 or 2 problems simultaneously, with some folks taking 3 or more. And because everyone’s so busy, we barely have time to check in on each other’s projects outside of standup, planning, retrospective, etc.\n", 187 | "This is an anti-pattern. It can lead to a project going off-track for months, or a critical error (e.g., incorrect training data, invalid train-validation split) going undetected until late in the implementation phase.\n", 188 | "One solution is to have a pilot and copilot for each project. The pilot is the main project owner and is in charge of its success (or failure). They own and delegate the work as required though they’re usually responsible for the bulk of design and critical code paths.\n", 189 | "The copilot helps the pilot stay on track, identify critical flaws, and call out blindspots. This includes periodic check-ins, reviewing document drafts and prototypes, and being a mandatory code reviewer. For example, the copilot should challenge the pilot if the proposed design doesn’t solve the business problem, or if the train-validation split is invalid. To be able to spot these issues, the copilot typically has experience in the problem space, or has more experience in general, similar to how senior engineers guide juniors.\n", 190 | "For every 10 hours the pilot spends on the project, the copilot can expect to spend an hour on reviews (10% of the pilot’s effort). While this may seem excessive, copilots have helped avoid costlier rework or abandoning a project due to mistakes that snowballed.\n", 191 | "Pilots and copilots don’t have to be from the same job family. As an applied scientist, I often partner with an engineer who helps with infrastructure, observability, CI/CD, etc. If both scientist and engineer are sufficiently experienced, they can double up as each other’s copilot. As they review each other’s work, knowledge transfer occurs organically and they learn to be effective copilots for other engineers or scientists in future projects.\n", 192 | "Also read more on the dangers of flying solo by Ethan Rosenthal and Vicki Boykis.\n", 193 | "In my earlier projects, because I was overeager, I would immediately jump into the data and begin training models. After watching me go in the wrong direction for a week or two, a merciful senior would share a paper, casually suggesting that it might be helpful to read it. It always was. After letting this happen once too often, I finally learned to start my projects with a literature review.\n", 194 | "For a literature review, I read papers relevant to the problem. I’m biased towards solutions that have been applied in industry though more academic papers have also been helpful.\n", 195 | "While reading these papers, I’m less interested in model architecture and focus on:\n", 196 | "To quickly go through the papers, I adopt the three-pass approach.\n", 197 | "This is similar to a code review but for machine learning prototypes and experiments. 
Once I have initial experiment results, I schedule a review with fellow scientists to ensure I haven’t overlooked any blindspots or committed critical errors.\n", 198 | "During the review, I focus on understanding the methodology and the potential of the current approach. Some questions include:\n", 199 | "To conduct methodology reviews asynchronously, like a code review, we could adopt a tool like DagsHub which supports comments on Jupyter notebooks and data.\n", 200 | "To tie it all together, we timebox each project phase and task. Time constraints help us focus on the most important tasks and not get bogged down in the details. Timeboxing for machine learning projects can be challenging, because compared to engineering projects, the work is relatively ill-defined. Furthermore, a large part of the work is research and experimentation which unfortunately leads to many a dead end.\n", 201 | "But it’s because of these challenges that timeboxing is effective—how much effort should we invest before pivoting? In most industry settings, we don’t have limitless resources to pursue a problem for years.\n", 202 | "(I treat timeboxes differently from estimates. Timeboxes are stretch goals while estimates are project management inputs that indicate the upper bound of effort needed. To convert timeboxes to estimates, I usually multiply by 1.5 - 3.0.)\n", 203 | "Here are three ways to define timeboxes.\n", 204 | "The first—and most aggressive—approach is to take the time spent on similar projects and halve it. This forces us to be scrappy and build a minimum lovable product that we can quickly get feedback on, reducing the iteration cycle. This approach works well in startups and startup-like teams though it can be too intense to adopt all the time.\n", 205 | "A less extreme approach is to set a timebox that is “comfortable yet challenging”. Thus, instead of halving the timebox, we reduce it by 10 - 20%. By deliberately introducing these constraints, we give ourselves the opportunity to reflect on timesinks to avoid and how to achieve more with fewer resources. This is a good default for most seasoned teams.\n", 206 | "Finally, for greenfield projects that may be hard to scope, we can adopt standard timeboxes. For example, we might allocate two weeks for a literature review, four to eight weeks to build a prototype, and three to six months to implement it in production.\n", 207 | "I’ve also written about other mechanisms for machine learning projects, including:\n", 208 | "What mechanisms do you adopt in your machine learning projects? 
Please share below!\n", 209 | "Thanks to Yang Xinyi for reading drafts of this.\n" 210 | ] 211 | } 212 | ], 213 | "source": [ 214 | "print(extract_text_from('https://eugeneyan.com/writing/mechanisms-for-projects/'))" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 8, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "pages = []\n", 224 | "\n", 225 | "for info in raw['urlset']['url']:\n", 226 | " url = info['loc']\n", 227 | " if 'https://eugeneyan.com/writing/' in info['loc']:\n", 228 | " pages.append({'text': extract_text_from(url), 'url': url})" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 12, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "df = pd.DataFrame(pages)\n", 238 | "\n", 239 | "# # Exclude short posts that may be talks and mostly images\n", 240 | "df['text_len'] = df['text'].apply(lambda x: len(x))\n", 241 | "df = df[df['text_len'] > 500]\n", 242 | "df = df.drop(columns=['text_len'])\n", 243 | "\n", 244 | "# Exclude certain urls\n", 245 | "excluded_urls = {''}\n", 246 | "df = df[~df['url'].isin(excluded_urls)]" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 15, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "df.to_parquet('../data/eugeneyan.parquet', compression='gzip')" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [] 264 | }, 265 | { 266 | "attachments": {}, 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "## Split each page into documents" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 264, 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "name": "stderr", 280 | "output_type": "stream", 281 | "text": [ 282 | "2023-03-26 17:06:54,129 - Split https://eugeneyan.com/writing/llm-bio/ into 10 docs\n", 283 | "2023-03-26 17:06:54,131 - Split https://eugeneyan.com/writing/labeling-guidelines/ into 5 docs\n", 284 | "2023-03-26 17:06:54,133 - Split https://eugeneyan.com/writing/content-moderation/ into 14 docs\n", 285 | "2023-03-26 17:06:54,134 - Split https://eugeneyan.com/writing/mechanisms-for-teams/ into 6 docs\n", 286 | "2023-03-26 17:06:54,134 - Split https://eugeneyan.com/writing/mechanisms-for-projects/ into 5 docs\n", 287 | "2023-03-26 17:06:54,135 - Split https://eugeneyan.com/writing/roam-to-obsidian/ into 2 docs\n", 288 | "2023-03-26 17:06:54,135 - Split https://eugeneyan.com/writing/getting-help/ into 3 docs\n", 289 | "2023-03-26 17:06:54,136 - Split https://eugeneyan.com/writing/2022-in-review/ into 6 docs\n", 290 | "2023-03-26 17:06:54,137 - Split https://eugeneyan.com/writing/autoencoders-vs-diffusers/ into 3 docs\n", 291 | "2023-03-26 17:06:54,138 - Split https://eugeneyan.com/writing/text-to-image/ into 16 docs\n", 292 | "2023-03-26 17:06:54,138 - Split https://eugeneyan.com/writing/recsys2022/ into 14 docs\n", 293 | "2023-03-26 17:06:54,139 - Split https://eugeneyan.com/writing/testing-pipelines/ into 10 docs\n", 294 | "2023-03-26 17:06:54,140 - Split https://eugeneyan.com/writing/simplicity/ into 6 docs\n", 295 | "2023-03-26 17:06:54,141 - Split https://eugeneyan.com/writing/uncommon-python/ into 7 docs\n", 296 | "2023-03-26 17:06:54,141 - Split https://eugeneyan.com/writing/15-5/ into 2 docs\n", 297 | "2023-03-26 17:06:54,142 - Split https://eugeneyan.com/writing/design-patterns/ into 7 docs\n", 298 | "2023-03-26 17:06:54,142 - Split 
https://eugeneyan.com/writing/onboarding/ into 9 docs\n", 299 | "2023-03-26 17:06:54,143 - Split https://eugeneyan.com/writing/bandits/ into 14 docs\n", 300 | "2023-03-26 17:06:54,144 - Split https://eugeneyan.com/writing/position-bias/ into 6 docs\n", 301 | "2023-03-26 17:06:54,145 - Split https://eugeneyan.com/writing/counterfactual-evaluation/ into 8 docs\n", 302 | "2023-03-26 17:06:54,146 - Split https://eugeneyan.com/writing/intent-vs-requirements/ into 7 docs\n", 303 | "2023-03-26 17:06:54,147 - Split https://eugeneyan.com/writing/project-quick-start/ into 8 docs\n", 304 | "2023-03-26 17:06:54,148 - Split https://eugeneyan.com/writing/becoming-a-data-leader/ into 2 docs\n", 305 | "2023-03-26 17:06:54,149 - Split https://eugeneyan.com/writing/red-flags/ into 6 docs\n", 306 | "2023-03-26 17:06:54,149 - Split https://eugeneyan.com/writing/how-to-keep-learning/ into 5 docs\n", 307 | "2023-03-26 17:06:54,150 - Split https://eugeneyan.com/writing/2021-year-in-review/ into 6 docs\n", 308 | "2023-03-26 17:06:54,151 - Split https://eugeneyan.com/writing/applyingml/ into 1 docs\n", 309 | "2023-03-26 17:06:54,152 - Split https://eugeneyan.com/writing/what-i-learned-from-writing-online-susan-shu/ into 4 docs\n", 310 | "2023-03-26 17:06:54,152 - Split https://eugeneyan.com/writing/what-i-learned-from-writing-online/ into 6 docs\n", 311 | "2023-03-26 17:06:54,153 - Split https://eugeneyan.com/writing/recsys2021/ into 5 docs\n", 312 | "2023-03-26 17:06:54,154 - Split https://eugeneyan.com/writing/first-rule-of-ml/ into 6 docs\n", 313 | "2023-03-26 17:06:54,155 - Split https://eugeneyan.com/writing/reinforcement-learning-for-recsys-and-search/ into 12 docs\n", 314 | "2023-03-26 17:06:54,155 - Split https://eugeneyan.com/writing/bootstrapping-data-labels/ into 11 docs\n", 315 | "2023-03-26 17:06:54,156 - Split https://eugeneyan.com/writing/mailbag-bootstrap-relevant-docs/ into 2 docs\n", 316 | "2023-03-26 17:06:54,157 - Split https://eugeneyan.com/writing/influencing-without-authority/ into 8 docs\n", 317 | "2023-03-26 17:06:54,158 - Split https://eugeneyan.com/writing/system-design-for-discovery/ into 12 docs\n", 318 | "2023-03-26 17:06:54,159 - Split https://eugeneyan.com/writing/patterns-for-personalization/ into 24 docs\n", 319 | "2023-03-26 17:06:54,159 - Split https://eugeneyan.com/writing/machine-learning-metagame/ into 11 docs\n", 320 | "2023-03-26 17:06:54,160 - Split https://eugeneyan.com/writing/search-query-matching/ into 18 docs\n", 321 | "2023-03-26 17:06:54,161 - Split https://eugeneyan.com/writing/imposter-syndrome-susan/ into 6 docs\n", 322 | "2023-03-26 17:06:54,162 - Split https://eugeneyan.com/writing/imposter-syndrome/ into 8 docs\n", 323 | "2023-03-26 17:06:54,162 - Split https://eugeneyan.com/writing/values-and-superpowers/ into 8 docs\n", 324 | "2023-03-26 17:06:54,163 - Split https://eugeneyan.com/writing/how-to-choose-problems/ into 11 docs\n", 325 | "2023-03-26 17:06:54,164 - Split https://eugeneyan.com/writing/seven-habits-that-shaped-my-decade/ into 13 docs\n", 326 | "2023-03-26 17:06:54,165 - Split https://eugeneyan.com/writing/ml-design-docs/ into 14 docs\n", 327 | "2023-03-26 17:06:54,166 - Split https://eugeneyan.com/writing/writing-docs-why-what-how/ into 9 docs\n", 328 | "2023-03-26 17:06:54,167 - Split https://eugeneyan.com/writing/feature-stores/ into 15 docs\n", 329 | "2023-03-26 17:06:54,168 - Split https://eugeneyan.com/writing/how-to-win-data-hackathon/ into 5 docs\n", 330 | "2023-03-26 17:06:54,169 - Split https://eugeneyan.com/writing/data-science-teams/ 
into 14 docs\n", 331 | "2023-03-26 17:06:54,170 - Split https://eugeneyan.com/writing/you-dont-need-another-mooc/ into 7 docs\n", 332 | "2023-03-26 17:06:54,171 - Split https://eugeneyan.com/writing/mailbag-resume-for-experienced-ds/ into 3 docs\n", 333 | "2023-03-26 17:06:54,173 - Split https://eugeneyan.com/writing/real-time-recommendations/ into 17 docs\n", 334 | "2023-03-26 17:06:54,174 - Split https://eugeneyan.com/writing/2021-roadmap/ into 3 docs\n", 335 | "2023-03-26 17:06:54,174 - Split https://eugeneyan.com/writing/retrospective-2020/ into 6 docs\n", 336 | "2023-03-26 17:06:54,175 - Split https://eugeneyan.com/writing/flying-dagger/ into 2 docs\n", 337 | "2023-03-26 17:06:54,176 - Split https://eugeneyan.com/writing/how-i-reflect-and-plan/ into 2 docs\n", 338 | "2023-03-26 17:06:54,177 - Split https://eugeneyan.com/writing/informal-mentors-alexey-grigorev/ into 17 docs\n", 339 | "2023-03-26 17:06:54,178 - Split https://eugeneyan.com/writing/mailbag-blog-architecture/ into 1 docs\n", 340 | "2023-03-26 17:06:54,179 - Split https://eugeneyan.com/writing/life-lessons-from-machine-learning/ into 10 docs\n", 341 | "2023-03-26 17:06:54,180 - Split https://eugeneyan.com/writing/role-title-mismatch/ into 6 docs\n", 342 | "2023-03-26 17:06:54,181 - Split https://eugeneyan.com/writing/data-science-roles/ into 9 docs\n", 343 | "2023-03-26 17:06:54,182 - Split https://eugeneyan.com/writing/informal-mentors-chip-huyen/ into 8 docs\n", 344 | "2023-03-26 17:06:54,183 - Split https://eugeneyan.com/writing/data-discovery-platforms/ into 12 docs\n", 345 | "2023-03-26 17:06:54,184 - Split https://eugeneyan.com/writing/netlify-back-to-github-pages/ into 2 docs\n", 346 | "2023-03-26 17:06:54,185 - Split https://eugeneyan.com/writing/data-science-portfolio-how-why-what/ into 13 docs\n", 347 | "2023-03-26 17:06:54,186 - Split https://eugeneyan.com/writing/how-to-install-scann-on-mac/ into 2 docs\n", 348 | "2023-03-26 17:06:54,186 - Split https://eugeneyan.com/writing/prototyping-to-get-buy-in/ into 6 docs\n", 349 | "2023-03-26 17:06:54,187 - Split https://eugeneyan.com/writing/writing-and-coding/ into 11 docs\n", 350 | "2023-03-26 17:06:54,188 - Split https://eugeneyan.com/writing/recsys2020/ into 11 docs\n", 351 | "2023-03-26 17:06:54,189 - Split https://eugeneyan.com/writing/present/ into 1 docs\n", 352 | "2023-03-26 17:06:54,190 - Split https://eugeneyan.com/writing/favorite-productivity-coffee-routines-habits/ into 12 docs\n", 353 | "2023-03-26 17:06:54,191 - Split https://eugeneyan.com/writing/how-to-accomplish-more-with-less/ into 9 docs\n", 354 | "2023-03-26 17:06:54,191 - Split https://eugeneyan.com/writing/migrating-to-utterances/ into 1 docs\n", 355 | "2023-03-26 17:06:54,192 - Split https://eugeneyan.com/writing/testing-ml/ into 6 docs\n", 356 | "2023-03-26 17:06:54,193 - Split https://eugeneyan.com/writing/mailbag-pdf-fields/ into 3 docs\n", 357 | "2023-03-26 17:06:54,193 - Split https://eugeneyan.com/writing/why-read-papers/ into 5 docs\n", 358 | "2023-03-26 17:06:54,194 - Split https://eugeneyan.com/writing/mailbag-senior-ds/ into 2 docs\n", 359 | "2023-03-26 17:06:54,195 - Split https://eugeneyan.com/writing/beginners-mind/ into 9 docs\n", 360 | "2023-03-26 17:06:54,196 - Split https://eugeneyan.com/writing/nlp-supervised-learning-survey/ into 17 docs\n", 361 | "2023-03-26 17:06:54,197 - Split https://eugeneyan.com/writing/end-to-end-data-science/ into 14 docs\n", 362 | "2023-03-26 17:06:54,198 - Split https://eugeneyan.com/writing/fastapi-html-checkbox-download/ into 2 docs\n", 363 | 
"2023-03-26 17:06:54,199 - Split https://eugeneyan.com/writing/what-i-did-not-learn-about-writing-in-school/ into 8 docs\n", 364 | "2023-03-26 17:06:54,200 - Split https://eugeneyan.com/writing/georgia-tech-omscs-faq/ into 13 docs\n", 365 | "2023-03-26 17:06:54,200 - Split https://eugeneyan.com/writing/how-to-set-up-html-app-with-fastapi-jinja-forms-templates/ into 1 docs\n", 366 | "2023-03-26 17:06:54,201 - Split https://eugeneyan.com/writing/why-you-need-to-follow-up-after-your-data-science-project/ into 8 docs\n", 367 | "2023-03-26 17:06:54,202 - Split https://eugeneyan.com/writing/what-i-do-during-a-data-science-project-to-ensure-success/ into 5 docs\n", 368 | "2023-03-26 17:06:54,203 - Split https://eugeneyan.com/writing/how-to-update-github-profile-readme-automatically/ into 2 docs\n", 369 | "2023-03-26 17:06:54,204 - Split https://eugeneyan.com/writing/when-giving-your-100-gets-you-less-than-85/ into 2 docs\n", 370 | "2023-03-26 17:06:54,205 - Split https://eugeneyan.com/writing/notes-from-sparkai-summit-application-specific/ into 11 docs\n", 371 | "2023-03-26 17:06:54,207 - Split https://eugeneyan.com/writing/notes-from-sparkai-summit-application-agnostic/ into 7 docs\n", 372 | "2023-03-26 17:06:54,239 - Split https://eugeneyan.com/writing/setting-up-python-project-for-automation-and-collaboration/ into 10 docs\n", 373 | "2023-03-26 17:06:54,240 - Split https://eugeneyan.com/writing/mailbag-ds-requirements/ into 2 docs\n", 374 | "2023-03-26 17:06:54,247 - Split https://eugeneyan.com/writing/why-airflow-jobs-one-day-late/ into 2 docs\n", 375 | "2023-03-26 17:06:54,258 - Split https://eugeneyan.com/writing/what-i-do-before-a-data-science-project-to-ensure-success/ into 11 docs\n", 376 | "2023-03-26 17:06:54,260 - Split https://eugeneyan.com/writing/what-i-love-about-scrum-for-data-science/ into 8 docs\n", 377 | "2023-03-26 17:06:54,261 - Split https://eugeneyan.com/writing/how-to-apply-crockers-law-for-feedback-and-growth/ into 6 docs\n", 378 | "2023-03-26 17:06:54,261 - Split https://eugeneyan.com/writing/practical-guide-to-maintaining-machine-learning/ into 12 docs\n", 379 | "2023-03-26 17:06:54,262 - Split https://eugeneyan.com/writing/challenges-after-deploying-machine-learning/ into 12 docs\n", 380 | "2023-03-26 17:06:54,263 - Split https://eugeneyan.com/writing/how-to-write-david-x-sahil/ into 4 docs\n", 381 | "2023-03-26 17:06:54,264 - Split https://eugeneyan.com/writing/evaluating-ideas-at-a-hackathon/ into 6 docs\n", 382 | "2023-03-26 17:06:54,265 - Split https://eugeneyan.com/writing/serendipity-and-accuracy-in-recommender-systems/ into 9 docs\n", 383 | "2023-03-26 17:06:54,266 - Split https://eugeneyan.com/writing/how-to-give-a-kick-ass-data-science-talk/ into 6 docs\n", 384 | "2023-03-26 17:06:54,267 - Split https://eugeneyan.com/writing/commando-soldier-police-and-your-career/ into 5 docs\n", 385 | "2023-03-26 17:06:54,268 - Split https://eugeneyan.com/writing/note-taking-zettelkasten/ into 5 docs\n", 386 | "2023-03-26 17:06:54,269 - Split https://eugeneyan.com/writing/reading-note-taking-writing/ into 5 docs\n", 387 | "2023-03-26 17:06:54,270 - Split https://eugeneyan.com/writing/experimentation-workflow-with-jupyter-papermill-mlflow/ into 5 docs\n", 388 | "2023-03-26 17:06:54,271 - Split https://eugeneyan.com/writing/psych-grad-to-data-science-lead/ into 10 docs\n", 389 | "2023-03-26 17:06:54,272 - Split https://eugeneyan.com/writing/recommender-systems-graph-and-nlp-pytorch/ into 15 docs\n", 390 | "2023-03-26 17:06:54,272 - Split 
https://eugeneyan.com/writing/recommender-systems-baseline-pytorch/ into 11 docs\n", 391 | "2023-03-26 17:06:54,273 - Split https://eugeneyan.com/writing/omscs-cs6200-introduction-to-operating-systems/ into 7 docs\n", 392 | "2023-03-26 17:06:54,274 - Split https://eugeneyan.com/writing/omscs-cs6750-human-computer-interaction/ into 7 docs\n", 393 | "2023-03-26 17:06:54,275 - Split https://eugeneyan.com/writing/goodbye-wordpress-hello-jekyll into 1 docs\n", 394 | "2023-03-26 17:06:54,276 - Split https://eugeneyan.com/writing/omscs-cs6440-intro-to-health-informatics/ into 7 docs\n", 395 | "2023-03-26 17:06:54,277 - Split https://eugeneyan.com/writing/omscs-cs7646-machine-learning-for-trading/ into 7 docs\n", 396 | "2023-03-26 17:06:54,278 - Split https://eugeneyan.com/writing/what-does-a-data-scientist-really-do/ into 5 docs\n", 397 | "2023-03-26 17:06:54,279 - Split https://eugeneyan.com/writing/data-science-and-agile-frameworks-for-effectiveness/ into 13 docs\n", 398 | "2023-03-26 17:06:54,280 - Split https://eugeneyan.com/writing/data-science-and-agile-what-works-and-what-doesnt/ into 10 docs\n", 399 | "2023-03-26 17:06:54,281 - Split https://eugeneyan.com/writing/omscs-cs6601-artificial-intelligence/ into 7 docs\n", 400 | "2023-03-26 17:06:54,282 - Split https://eugeneyan.com/writing/omscs-cs6460-education-technology/ into 8 docs\n", 401 | "2023-03-26 17:06:54,283 - Split https://eugeneyan.com/writing/omscs-cs7642-reinforcement-learning/ into 6 docs\n", 402 | "2023-03-26 17:06:54,284 - Split https://eugeneyan.com/writing/building-a-strong-data-science-team-culture/ into 3 docs\n", 403 | "2023-03-26 17:06:54,286 - Split https://eugeneyan.com/writing/omscs-cs7641-machine-learning/ into 4 docs\n", 404 | "2023-03-26 17:06:54,287 - Split https://eugeneyan.com/writing/my-first-100-days-as-data-science-lead/ into 4 docs\n", 405 | "2023-03-26 17:06:54,288 - Split https://eugeneyan.com/writing/omscs-cs6300-software-development-process/ into 5 docs\n", 406 | "2023-03-26 17:06:54,289 - Split https://eugeneyan.com/writing/how-to-get-started-in-data-science/ into 7 docs\n", 407 | "2023-03-26 17:06:54,290 - Split https://eugeneyan.com/writing/omscs-cs6476-computer-vision/ into 5 docs\n", 408 | "2023-03-26 17:06:54,290 - Split https://eugeneyan.com/writing/one-way-to-help-a-data-science-team-succeed/ into 2 docs\n", 409 | "2023-03-26 17:06:54,291 - Split https://eugeneyan.com/writing/product-categorization-api-part-3-creating-an-api/ into 4 docs\n", 410 | "2023-03-26 17:06:54,292 - Split https://eugeneyan.com/writing/image-search-is-now-live/ into 2 docs\n", 411 | "2023-03-26 17:06:54,293 - Split https://eugeneyan.com/writing/product-categorization-api-part-2-data-preparation/ into 5 docs\n", 412 | "2023-03-26 17:06:54,294 - Split https://eugeneyan.com/writing/image-categorization-is-now-live/ into 1 docs\n", 413 | "2023-03-26 17:06:54,295 - Split https://eugeneyan.com/writing/im-going-back-to-school/ into 4 docs\n", 414 | "2023-03-26 17:06:54,295 - Split https://eugeneyan.com/writing/sortmyskills-is-now-live/ into 3 docs\n", 415 | "2023-03-26 17:06:54,296 - Split https://eugeneyan.com/writing/product-categorization-api-part-1-data-acquisition-and-formatting/ into 5 docs\n", 416 | "2023-03-26 17:06:54,297 - Split https://eugeneyan.com/writing/thoughts-on-functional-programming-in-scala-course-coursera/ into 3 docs\n", 417 | "2023-03-26 17:06:54,298 - Split https://eugeneyan.com/writing/first-post/ into 1 docs\n", 418 | "2023-03-26 17:06:54,298 - Split 
https://eugeneyan.com/writing/datakind-sg-project-accelerator/ into 9 docs\n", 419 | "2023-03-26 17:06:54,300 - Split https://eugeneyan.com/writing/ into 15 docs\n" 420 | ] 421 | } 422 | ], 423 | "source": [ 424 | "text_splitter = CharacterTextSplitter(chunk_size=1500, separator='\\n')\n", 425 | "\n", 426 | "docs, metadata = [], []\n", 427 | "\n", 428 | "for page in pages:\n", 429 | " splits = text_splitter.split_text(page['text'])\n", 430 | " # for split in splits:\n", 431 | " # docs.append(split)\n", 432 | " # metadata.append({'source': split, 'url': page['url']})\n", 433 | " docs.extend(splits)\n", 434 | " metadata.extend([{'source': page['url']}] * len(splits)) # This Q&A chain relies on the url being in the 'source' key\n", 435 | " logger.info(f'Split {page[\"url\"]} into {len(splits)} docs')" 436 | ] 437 | }, 438 | { 439 | "attachments": {}, 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "## Create a FAISS vector store for offline prototyping" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 265, 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadata)\n", 453 | "with open('../data/faiss_store.pkl', 'wb') as f:\n", 454 | " pickle.dump(store, f) # This is a ~10 MB file" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 266, 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "# question = 'Question for eugeneyan.com: Why is writing important?'\n", 464 | "\n", 465 | "# with open('../data/faiss_store.pkl', 'rb') as f:\n", 466 | "# store = pickle.load(f)\n", 467 | "\n", 468 | "# chain = load_qa_with_sources_chain(ChatOpenAI(temperature=0), verbose=False)\n", 469 | "# response = chain({'input_documents': store.similarity_search(question, 4), \n", 470 | "# 'question': question})" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 267, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "# response" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 268, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "# # VectorDBQAWithSourcesChain is DEPRECATED\n", 489 | "\n", 490 | "# question = 'Question for eugeneyan.com: Why is writing important?'\n", 491 | "\n", 492 | "# with open('../data/faiss_store.pkl', 'rb') as f:\n", 493 | "# store = pickle.load(f)\n", 494 | "\n", 495 | "# llm = ChatOpenAI(temperature=0)\n", 496 | "# chain = VectorDBQAWithSourcesChain.from_chain_type(llm, chain_type='stuff', vectorstore=store)\n", 497 | "# response = chain({'question': question})" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": 303, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "# Use this instead of load_qa_with_sources_chain for more control\n", 507 | "question = 'Question for eugeneyan.com: What is content moderation?'\n", 508 | "\n", 509 | "with open('../data/faiss_store.pkl', 'rb') as f:\n", 510 | " store = pickle.load(f)\n", 511 | "\n", 512 | "llm = OpenAI(temperature=0)\n", 513 | "chain = RetrievalQAWithSourcesChain.from_chain_type(llm, chain_type='stuff', retriever=store.as_retriever(), return_source_documents=True)\n", 514 | "response = chain({'question': question}, return_only_outputs=False)" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": 304, 520 | 
"metadata": {}, 521 | "outputs": [ 522 | { 523 | "data": { 524 | "text/plain": [ 525 | "{'question': 'Question for eugeneyan.com: What is content moderation?',\n", 526 | " 'answer': ' Content moderation is the process of learning and inferring the quality of human-generated content such as product reviews, social media posts, and ads. It involves collecting a set of ground truth, using supervised ML models, and applying heuristics and unsupervised models.\\n',\n", 527 | " 'sources': 'https://eugeneyan.com/writing/content-moderation/',\n", 528 | " 'source_documents': [Document(page_content='Content moderation is the process of learning and inferring the quality of human-generated content such as product reviews, social media posts, and ads. How do we know which are irrelevant, incorrect, or downright harmful? A related problem is detecting anomalous activity such as fraudulent transactions or malicious traffic.\\nTo learn more about building robust content moderation systems, I dug into industry papers and tech blogs on classification, anomaly detection, and search relevance. Here are five patterns I observed:\\nRegardless of whether a heuristic-based, supervised, or unsupervised solution is adopted, we typically start with collecting a set of ground truth. This ground truth can then be used to train supervised ML models as well as evaluate the performance of heuristics and unsupervised models. The ground truth also acts as seed data to bootstrap more labels via active or semi-supervised learning.\\nThe most straightforward way to collect ground truth is to ask users. For Stack Exchange to block spam on their sites, a valuable data source is users flagging posts as spam. These flags were then used to identify and act on spammy users by blocking or rate-limiting them. They were also used as training data for machine learning models.', lookup_str='', metadata={'source': 'https://eugeneyan.com/writing/content-moderation/'}, lookup_index=0),\n", 529 | " Document(page_content='Community Engagement:\\nEugene is passionate about fostering a strong data science community. He is an organizer and mentor for the Data Science Global Impact Challenge, a competition that encourages participants to use data science techniques to address pressing global issues. He is also involved in the AI Singapore initiative, which aims to promote AI and data science in Singapore.\\nIn summary, Eugene Yan is an experienced data scientist and machine learning practitioner known for his work in search and recommendation systems. Through his various roles, writing, and speaking engagements,and mentorship, he has contributed significantly to the field of data science and continues to share his knowledge with the community.\\nThe overall theme seems correct, though it got many details wrong, including:\\nEugene Yan, also known as @eugeneyan, is a data scientist, writer, and entrepreneur based in Singapore. He is widely recognized for his contributions to the data science community, including his popular blog, eugeneyan.com, where he shares insights on data science, machine learning, and personal growth.\\nEugene holds a Bachelor’s degree in Electrical and Electronic Engineering from the National University of Singapore, as well as a Master’s degree in Management Science and Engineering from Stanford University. 
After completing his studies, he worked at several tech companies, including Google, where he served as a software engineer.', lookup_str='', metadata={'source': 'https://eugeneyan.com/writing/llm-bio/'}, lookup_index=0),\n", 530 | " Document(page_content='I think a lot about machine learning. And I think a lot about life. Sometimes, the channels mix and I find certain lessons from machine learning applicable to life.\\nHere are seven lessons. While I assume most readers are familiar with these machine learning concepts, I begin each lesson with a brief explanation.\\nWe clean data so our downstream analysis or machine learning is correct. As Randy Au shares, data cleaning isn’t grunt work; it is the work.\\nWe don’t use data without exploring and cleaning it. Similarly, we shouldn’t consume life’s inputs without assessing and filtering them.\\nTake food for example. How often do we reach for what’s widely available and easy to prepare? Until a few years ago, I was happily munching on a bowl of Sugary-Os cereal daily. Now that I’m more aware of my family’s history with diabetes, I’m more selective and pay greater attention to nutritional content. Also, as age catches up and my metabolism slows, I have to make a conscious effort to eat healthier and avoid junk food.\\nSugar is \"good\" for you (source)\\nIt’s the same with content. News outlets and social media rank information based on virality and advertising dollars. “Empty calorie info-bites” that are easy to consume—but don’t enrich us—circulate faster. Misinformation is rampant. Some content is in poor taste and downright toxic, and attempts to engage don’t end well. For sanity’s sake, just filter it out. Curate your news sources and who you follow on social media.', lookup_str='', metadata={'source': 'https://eugeneyan.com/writing/life-lessons-from-machine-learning/'}, lookup_index=0),\n", 531 | " Document(page_content='Stack Exchange has several layers of defense against spam. The first line of defense is triggered when a spammer posts too often to be humanly possible. The spammer is hit with an HTTP 429 Error (Too Many Requests) and blocked or rate-limited.\\nThe second line of defense is based on heuristics. Specifically, they run posts through an “unholy amount of regular expressions” and some rules. If a post is caught, it is sent to users to check and potentially flag it as spam. If six users flag it as spam (six flags lol), the post is marked as spam and the user is blocked, rate-limited, or prevented from posting.\\nThe final line of defense is a (machine learning?) system that identifies posts most likely to be spam. They shadow-tested it and found it to be extremely accurate. It was catching almost all of the blatantly obvious spam. Eventually, this system was armed to cast three automatic flags and it drastically reduced the time to spam post deletion.\\nTime till spam deletion drops from no auto-flag (red) to 1 auto-flag (green) to 3 auto-flags (orange)\\nCloudflare also combines heuristics and machine learning (and other techniques) to identify bot traffic. 
They shared a comparison: If machine learning inference requires ~50ms, then hundreds of heuristics can be applied at ~20ms.', lookup_str='', metadata={'source': 'https://eugeneyan.com/writing/content-moderation/'}, lookup_index=0)]}" 532 | ] 533 | }, 534 | "execution_count": 304, 535 | "metadata": {}, 536 | "output_type": "execute_result" 537 | } 538 | ], 539 | "source": [ 540 | "response" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": 305, 546 | "metadata": {}, 547 | "outputs": [ 548 | { 549 | "name": "stderr", 550 | "output_type": "stream", 551 | "text": [ 552 | "2023-03-26 17:14:41,484 - Question: Question for eugeneyan.com: What is content moderation?\n", 553 | "2023-03-26 17:14:41,486 - Answer: Content moderation is the process of learning and inferring the quality of human-generated content such as product reviews, social media posts, and ads. It involves collecting a set of ground truth, using supervised ML models, and applying heuristics and unsupervised models.\n", 554 | "\n", 555 | "2023-03-26 17:14:41,488 - Sources: https://eugeneyan.com/writing/content-moderation/\n", 556 | "2023-03-26 17:14:41,489 - URL: https://eugeneyan.com/writing/content-moderation/\n", 557 | "\n", 558 | "2023-03-26 17:14:41,491 - Source: Content moderation is the process of learning and inferring the quality of human-generated content such as product reviews, social media posts, and ads. How do we know which are irrelevant, incorrect, or downright harmful? A related problem is detecting anomalous activity such as fraudulent transactions or malicious traffic.\n", 559 | "To learn more about building robust content moderation systems, I dug into industry papers and tech blogs on classification, anomaly detection, and search relevance. Here are five patterns I observed:\n", 560 | "Regardless of whether a heuristic-based, supervised, or unsupervised solution is adopted, we typically start with collecting a set of ground truth. This ground truth can then be used to train supervised ML models as well as evaluate the performance of heuristics and unsupervised models. The ground truth also acts as seed data to bootstrap more labels via active or semi-supervised learning.\n", 561 | "The most straightforward way to collect ground truth is to ask users. For Stack Exchange to block spam on their sites, a valuable data source is users flagging posts as spam. These flags were then used to identify and act on spammy users by blocking or rate-limiting them. They were also used as training data for machine learning models.\n", 562 | "\n", 563 | "2023-03-26 17:14:41,492 - =====================\n", 564 | "2023-03-26 17:14:41,493 - URL: https://eugeneyan.com/writing/content-moderation/\n", 565 | "\n", 566 | "2023-03-26 17:14:41,494 - Source: Stack Exchange has several layers of defense against spam. The first line of defense is triggered when a spammer posts too often to be humanly possible. The spammer is hit with an HTTP 429 Error (Too Many Requests) and blocked or rate-limited.\n", 567 | "The second line of defense is based on heuristics. Specifically, they run posts through an “unholy amount of regular expressions” and some rules. If a post is caught, it is sent to users to check and potentially flag it as spam. If six users flag it as spam (six flags lol), the post is marked as spam and the user is blocked, rate-limited, or prevented from posting.\n", 568 | "The final line of defense is a (machine learning?) system that identifies posts most likely to be spam. 
They shadow-tested it and found it to be extremely accurate. It was catching almost all of the blatantly obvious spam. Eventually, this system was armed to cast three automatic flags and it drastically reduced the time to spam post deletion.\n", 569 | "Time till spam deletion drops from no auto-flag (red) to 1 auto-flag (green) to 3 auto-flags (orange)\n", 570 | "Cloudflare also combines heuristics and machine learning (and other techniques) to identify bot traffic. They shared a comparison: If machine learning inference requires ~50ms, then hundreds of heuristics can be applied at ~20ms.\n", 571 | "\n", 572 | "2023-03-26 17:14:41,495 - =====================\n" 573 | ] 574 | } 575 | ], 576 | "source": [ 577 | "logger.info(f'Question: {response[\"question\"]}')\n", 578 | "logger.info(f'Answer: {response[\"answer\"]}')\n", 579 | "logger.info(f'Sources: {response[\"sources\"]}')\n", 580 | "\n", 581 | "sources = set(response['sources'].split(', '))\n", 582 | "\n", 583 | "for doc in response['source_documents']:\n", 584 | " if doc.metadata[\"source\"] in sources:\n", 585 | " logger.info(f'URL: {doc.metadata[\"source\"]}\\n')\n", 586 | " logger.info(f'Source: {doc.page_content}\\n')\n", 587 | " logger.info('=====================')" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 306, 593 | "metadata": {}, 594 | "outputs": [ 595 | { 596 | "data": { 597 | "text/plain": [ 598 | "['https://eugeneyan.com/writing/content-moderation/']" 599 | ] 600 | }, 601 | "execution_count": 306, 602 | "metadata": {}, 603 | "output_type": "execute_result" 604 | } 605 | ], 606 | "source": [ 607 | "response['sources'].split(', ')" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": 149, 613 | "metadata": {}, 614 | "outputs": [ 615 | { 616 | "data": { 617 | "text/plain": [ 618 | "Document(page_content='How to Write: Advice from David Perell and Sahil Lavingia\\neugeneyan\\nStart Here\\nWriting\\nSpeaking\\nNewsletter\\nAbout\\nHow to Write: Advice from David Perell and Sahil Lavingia\\n[\\nwriting\\n]\\n· 4 min read\\nWriting is a superpower. Telepathy to be exact.\\nThink about it—through writing, I broadcast ideas from my mind to yours. Across time. Across space. (Yes, the internet plays a big role, but let’s focus on writing.) The more effective your writing, the stronger your telepathic ability.\\nWhy write about writing on this site?\\nWriting is essential for effective data science. Good writing means you get buy-in on ideas, your methodology and experiments can be replicated, and readers understand enough to give feedback. Poor writing gets you zilch (and snores). Business folk have enough trouble understanding data geeks as it is—don’t make it harder with your writing.\\nWriting is an important way to learn. (The other important way is learning.) When writing, you have to organize ideas and prune the unnecessary. Along the way, you find gaps in your understanding, which leads you to more research and learning.\\nWriting becomes more important as your career progresses. Everything is writing—emails, specs, documentation, articles. Code too. As you become more senior, you write more docs, less code. 
Seniors contribute by designing systems and communicating them to teams to implement.', lookup_str='', metadata={'source': 'https://eugeneyan.com/writing/how-to-write-david-x-sahil/'}, lookup_index=0)" 619 | ] 620 | }, 621 | "execution_count": 149, 622 | "metadata": {}, 623 | "output_type": "execute_result" 624 | } 625 | ], 626 | "source": [ 627 | "doc" 628 | ] 629 | }, 630 | { 631 | "attachments": {}, 632 | "cell_type": "markdown", 633 | "metadata": {}, 634 | "source": [ 635 | "## Using pinecone" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": 307, 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [ 644 | "pinecone.init(api_key=os.getenv('PINECONE_API_KEY'), environment=PINECONE_ENV)" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": 311, 650 | "metadata": {}, 651 | "outputs": [ 652 | { 653 | "data": { 654 | "text/plain": [ 655 | "{'dimension': 1536,\n", 656 | " 'index_fullness': 0.0,\n", 657 | " 'namespaces': {},\n", 658 | " 'total_vector_count': 0}" 659 | ] 660 | }, 661 | "execution_count": 311, 662 | "metadata": {}, 663 | "output_type": "execute_result" 664 | } 665 | ], 666 | "source": [ 667 | "index_name = 'ask-ey'\n", 668 | "index = pinecone.Index(index_name)\n", 669 | "\n", 670 | "# Delete and recreate index\n", 671 | "# pinecone.delete_index(index_name)\n", 672 | "pinecone.create_index(index_name, dimension=1536, metric='cosine', pod_type='p2.x1')\n", 673 | "index.describe_index_stats()" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": 313, 679 | "metadata": {}, 680 | "outputs": [], 681 | "source": [ 682 | "# # Initialize with small set of data - \n", 683 | "# p = Pinecone.from_texts(docs[0:2], \n", 684 | "# embeddings, \n", 685 | "# index_name=index_name, \n", 686 | "# metadatas=metadata[0:2])\n", 687 | "\n", 688 | "# index.describe_index_stats()" 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": 314, 694 | "metadata": {}, 695 | "outputs": [ 696 | { 697 | "data": { 698 | "text/plain": [ 699 | "{'dimension': 1536,\n", 700 | " 'index_fullness': 0.0,\n", 701 | " 'namespaces': {},\n", 702 | " 'total_vector_count': 0}" 703 | ] 704 | }, 705 | "execution_count": 314, 706 | "metadata": {}, 707 | "output_type": "execute_result" 708 | } 709 | ], 710 | "source": [ 711 | "embeddings = OpenAIEmbeddings()\n", 712 | "\n", 713 | "# Load existing pinecone index\n", 714 | "store = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)\n", 715 | "index.describe_index_stats()" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": 315, 721 | "metadata": {}, 722 | "outputs": [ 723 | { 724 | "name": "stderr", 725 | "output_type": "stream", 726 | "text": [ 727 | "2023-03-26 17:21:12,696 - Adding chunk 1 of 10 (0 to 100))\n" 728 | ] 729 | }, 730 | { 731 | "data": { 732 | "application/vnd.jupyter.widget-view+json": { 733 | "model_id": "b5874d629b824f268bc671c93796fb3c", 734 | "version_major": 2, 735 | "version_minor": 0 736 | }, 737 | "text/plain": [ 738 | "Upserted vectors: 0%| | 0/100 [00:00 str: 45 | 46 | llm = ChatOpenAI(temperature=temperature, model_name=model) 47 | chain = RetrievalQAWithSourcesChain.from_chain_type(llm, chain_type='stuff', 48 | retriever=store_ey.as_retriever(), 49 | return_source_documents=True) 50 | 51 | response = chain({'question': question}) 52 | pretty_response = prettify_qa_response(response) 53 | 54 | return pretty_response 55 | 56 | 57 | @timer 58 | def qa_board(question: str, temperature: float = None, model: str 
= QA_MODEL) -> str: 59 | 60 | llm = ChatOpenAI(temperature=temperature, model_name=model) 61 | chain = RetrievalQAWithSourcesChain.from_chain_type(llm, chain_type='stuff', 62 | retriever=store_board.as_retriever(), 63 | return_source_documents=True) 64 | 65 | response = chain({'question': question}) 66 | pretty_response = prettify_qa_response(response) 67 | 68 | return pretty_response 69 | -------------------------------------------------------------------------------- /railway.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | builder = "DOCKERFILE" 3 | dockerfilePath = "Dockerfile" 4 | 5 | [deploy] 6 | restartPolicyType = "ON_FAILURE" 7 | restartPolicyMaxRetries = 10 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | aiosignal==1.3.1 3 | anyio==3.6.2 4 | appnope==0.1.3 5 | argon2-cffi==21.3.0 6 | argon2-cffi-bindings==21.2.0 7 | arrow==1.2.3 8 | asttokens==2.2.1 9 | async-timeout==4.0.2 10 | attrs==22.2.0 11 | autopep8==2.0.2 12 | backcall==0.2.0 13 | beautifulsoup4==4.12.0 14 | bleach==6.0.0 15 | cachetools==5.3.0 16 | certifi==2024.7.4 17 | cffi==1.15.1 18 | charset-normalizer==3.1.0 19 | comm==0.1.3 20 | dataclasses-json==0.5.7 21 | debugpy==1.6.6 22 | decorator==5.1.1 23 | defusedxml==0.7.1 24 | discord-py-interactions==4.4.0 25 | discord-py-slash-command==4.2.1 26 | discord.py==2.2.2 27 | dnspython==2.3.0 28 | executing==1.2.0 29 | faiss-cpu==1.7.3 30 | fastjsonschema==2.16.3 31 | fqdn==1.5.1 32 | frozenlist==1.3.3 33 | google-api-core==2.11.0 34 | google-api-python-client==2.82.0 35 | google-auth==2.16.3 36 | google-auth-httplib2==0.1.0 37 | google-search-results==2.4.2 38 | googleapis-common-protos==1.59.0 39 | greenlet==2.0.2 40 | httplib2==0.22.0 41 | idna==3.4 42 | importlib-metadata==6.1.0 43 | install==1.3.5 44 | ipykernel==6.22.0 45 | ipython==8.11.0 46 | ipython-genutils==0.2.0 47 | ipywidgets==8.0.5 48 | isoduration==20.11.0 49 | jedi==0.18.2 50 | Jinja2==3.1.2 51 | jsonpointer==2.3 52 | jsonschema==4.17.3 53 | jupyter==1.0.0 54 | jupyter-console==6.6.3 55 | jupyter-events==0.6.3 56 | jupyter_client==8.1.0 57 | jupyter_core==5.3.0 58 | jupyter_server==2.5.0 59 | jupyter_server_terminals==0.4.4 60 | jupyterlab-pygments==0.2.2 61 | jupyterlab-widgets==3.0.6 62 | langchain==0.0.325 63 | loguru==0.6.0 64 | MarkupSafe==2.1.2 65 | marshmallow==3.19.0 66 | marshmallow-enum==1.5.1 67 | matplotlib-inline==0.1.6 68 | mistune==2.0.5 69 | multidict==6.0.4 70 | mypy-extensions==1.0.0 71 | nbclassic==0.5.3 72 | nbclient==0.7.2 73 | nbconvert==7.2.10 74 | nbformat==5.8.0 75 | nest-asyncio==1.5.6 76 | notebook==6.5.3 77 | notebook_shim==0.2.2 78 | numpy==1.24.2 79 | openai==0.27.2 80 | packaging==23.0 81 | pandas==1.5.3 82 | pandocfilters==1.5.0 83 | parso==0.8.3 84 | pexpect==4.8.0 85 | pickleshare==0.7.5 86 | pinecone-client==2.2.1 87 | platformdirs==3.1.1 88 | prometheus-client==0.16.0 89 | prompt-toolkit==3.0.38 90 | protobuf==4.22.1 91 | psutil==5.9.4 92 | ptyprocess==0.7.0 93 | pure-eval==0.2.2 94 | pyasn1==0.4.8 95 | pyasn1-modules==0.2.8 96 | pycodestyle==2.10.0 97 | pycparser==2.21 98 | pydantic==1.10.7 99 | Pygments==2.14.0 100 | pyparsing==3.0.9 101 | pyrsistent==0.19.3 102 | python-dateutil==2.8.2 103 | python-dotenv==1.0.0 104 | python-json-logger==2.0.7 105 | pytz==2022.7.1 106 | PyYAML==6.0 107 | pyzmq==25.0.2 108 | qtconsole==5.4.1 109 | QtPy==2.3.0 110 | regex==2023.3.23 
111 | requests==2.28.2 112 | rfc3339-validator==0.1.4 113 | rfc3986-validator==0.1.1 114 | rsa==4.9 115 | Send2Trash==1.8.0 116 | six==1.16.0 117 | sniffio==1.3.0 118 | soupsieve==2.4 119 | SQLAlchemy==1.4.47 120 | stack-data==0.6.2 121 | tenacity==8.2.2 122 | terminado==0.17.1 123 | tiktoken==0.3.2 124 | tinycss2==1.2.1 125 | tomli==2.0.1 126 | tornado==6.2 127 | tqdm==4.65.0 128 | traitlets==5.9.0 129 | typing-inspect==0.8.0 130 | typing_extensions==4.5.0 131 | uri-template==1.2.0 132 | uritemplate==4.1.1 133 | urllib3==1.26.15 134 | wcwidth==0.2.6 135 | webcolors==1.12 136 | webencodings==0.5.1 137 | websocket-client==1.5.1 138 | widgetsnbextension==4.0.6 139 | xmltodict==0.13.0 140 | yarl==1.8.2 141 | zipp==3.15.0 142 | -------------------------------------------------------------------------------- /runtime.txt: -------------------------------------------------------------------------------- 1 | # For railway.app 2 | python-3.9.7 -------------------------------------------------------------------------------- /search.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for searching the internet. 3 | """ 4 | from dotenv import load_dotenv 5 | from langchain import LLMChain 6 | from langchain.agents import AgentExecutor, Tool, ZeroShotAgent 7 | from langchain.chat_models import ChatOpenAI 8 | from langchain.utilities import GoogleSearchAPIWrapper 9 | 10 | from config import SEARCH_MODEL 11 | from logger import logger 12 | from utils import prettify_agent_response, timer 13 | 14 | # Create tools 15 | load_dotenv() 16 | search = GoogleSearchAPIWrapper() 17 | TOOLS = [ 18 | Tool( 19 | name="Search", 20 | func=search.run, 21 | description="Useful for when you need to answer questions about current events" 22 | ) 23 | ] 24 | TOOL_STRINGS = "\n".join( 25 | [f"{tool.name}: {tool.description}" for tool in TOOLS]) 26 | TOOL_NAMES = ", ".join([tool.name for tool in TOOLS]) 27 | 28 | PREFIX = """Please answer the following questions as best you can. You have access to the following tools:""" 29 | FORMAT_INSTRUCTIONS = """Please use the following format: 30 | 31 | Question: the input question you must answer 32 | Thought: you should always think about what to do 33 | Action: the action to take, should be one of [{tool_names}] 34 | Action Input: the input to the action 35 | Observation: the result of the action 36 | ... (this Thought/Action/Action Input/Observation can repeat N times) 37 | Thought: I now know the final answer 38 | Final Answer: the final answer to the original input question""" 39 | SUFFIX = """Please begin! 40 | 41 | Question: {input} 42 | Thought: {agent_scratchpad}""" 43 | 44 | FORMAT_INSTRUCTIONS = FORMAT_INSTRUCTIONS.format(tool_names=TOOL_NAMES) 45 | 46 | 47 | # Search agent based on a zero-shot ReAct prompt 48 | @timer 49 | def search_agent(question: str, temperature: float = None, model: str = SEARCH_MODEL) -> str: 50 | """ 51 | Calls OpenAI API and searches the web to find the best answer to a question.
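The agent runs a ReAct-style loop, alternating Thought / Action / Action Input / Observation steps with the Google Search tool until it emits a Final Answer (or hits max_iterations=10).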
52 | """ 53 | # Write prompt and create zero-shot agent 54 | prompt = ZeroShotAgent.create_prompt( 55 | TOOLS, 56 | prefix=PREFIX, 57 | suffix=SUFFIX, 58 | format_instructions=FORMAT_INSTRUCTIONS, 59 | input_variables=['input', 'agent_scratchpad'] 60 | ) 61 | logger.info(prompt.template) 62 | 63 | # Create LLM and call API 64 | llm = ChatOpenAI(temperature=temperature, model_name=model) 65 | llm_chain = LLMChain(llm=llm, prompt=prompt) 66 | 67 | # Create agent with tools 68 | agent = ZeroShotAgent(llm_chain=llm_chain, 69 | tools=TOOLS, tool_names=TOOL_NAMES) 70 | agent_executor = AgentExecutor.from_agent_and_tools(agent=agent, tools=TOOLS, max_iterations=10, 71 | verbose=True, return_intermediate_steps=True) 72 | 73 | response = agent_executor({'input': question}) 74 | pretty_response = prettify_agent_response(response) 75 | 76 | return pretty_response 77 | -------------------------------------------------------------------------------- /sql.py: -------------------------------------------------------------------------------- 1 | """ 2 | Agent that can query a database 3 | Docs: https://langchain.readthedocs.io/en/latest/modules/chains/examples/sqlite.html 4 | Dataset: https://www.kaggle.com/datasets/jealousleopard/goodreadsbooks 5 | """ 6 | from dotenv import load_dotenv 7 | from langchain import LLMChain, OpenAI, SQLDatabase, SQLDatabaseChain 8 | from langchain.agents import (AgentExecutor, Tool, ZeroShotAgent, 9 | create_sql_agent) 10 | from langchain.agents.agent_toolkits import SQLDatabaseToolkit 11 | from langchain.chat_models import ChatOpenAI 12 | from langchain.llms.openai import OpenAI 13 | from langchain.sql_database import SQLDatabase 14 | 15 | from config import SQL_MODEL 16 | from logger import logger 17 | from utils import prettify_agent_response, prettify_chain_response, timer 18 | 19 | # Load env 20 | load_dotenv() 21 | DB = SQLDatabase.from_uri('sqlite:///data/books.db') 22 | TOOLKIT = SQLDatabaseToolkit(db=DB) 23 | TOOLS = TOOLKIT.get_tools() 24 | TOOL_NAMES = [tool.name for tool in TOOLS] 25 | 26 | 27 | PREFIX = """You are an agent designed to interact with a SQL database. 28 | Given an input question, create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer. 29 | Unless the user specifies a specific number of examples they wish to obtain, always limit your query to at most {top_k} results. 30 | You can order the results by a relevant column to return the most interesting examples in the database. 31 | Never query for all the columns from a specific table, only ask for a the few relevant columns given the question. 32 | You have access to tools for interacting with the database. 33 | Only use the below tools. Only use the information returned by the below tools to construct your final answer. 34 | You MUST double check your query before executing it. If you get an error while executing a query, rewrite the query and try again. 35 | 36 | DO NOT make any DML statements (INSERT, UPDATE, DELETE, DROP etc.) to the database. 37 | 38 | If the question does not seem related to the database, just return "I don't know" as the answer. 39 | """ 40 | FORMAT_INSTRUCTIONS = """Use the following format: 41 | 42 | Question: the input question you must answer 43 | Thought: you should always think about what to do 44 | Action: the action to take, should be one of [{tool_names}] 45 | Action Input: the input to the action 46 | Observation: the result of the action 47 | ... 
(this Thought/Action/Action Input/Observation can repeat N times) 48 | Thought: I now know the final answer 49 | Final Answer: the final answer to the original input question""" 50 | SUFFIX = """Begin! 51 | 52 | Question: {input} 53 | Thought: I should look at the tables in the database to see what I can query. 54 | {agent_scratchpad}""" 55 | 56 | 57 | # Defines agent to query a database 58 | @timer 59 | def sql_agent(query: str, temperature: float = 0, top_k: int = 10, model: str = SQL_MODEL) -> str: 60 | """ 61 | Create an agent that can query a database. 62 | """ 63 | # Write prompt and create zero-shot agent 64 | prompt = ZeroShotAgent.create_prompt( 65 | tools=TOOLKIT.get_tools(), 66 | prefix=PREFIX.format(dialect=TOOLKIT.dialect, top_k=top_k), 67 | suffix=SUFFIX, 68 | format_instructions=FORMAT_INSTRUCTIONS, 69 | input_variables=['input', 'agent_scratchpad'] 70 | ) 71 | logger.info(prompt.template) 72 | 73 | # Create LLM and call API 74 | llm = ChatOpenAI(temperature=temperature, model_name=model) 75 | llm_chain = LLMChain(llm=llm, prompt=prompt) 76 | 77 | # Create agent with tools 78 | agent = ZeroShotAgent(llm_chain=llm_chain, 79 | tools=TOOLS, tool_names=TOOL_NAMES) 80 | agent_executor = AgentExecutor.from_agent_and_tools(agent=agent, tools=TOOLS, max_iterations=10, 81 | verbose=True, return_intermediate_steps=True) 82 | 83 | response = agent_executor({'input': query}) 84 | pretty_response = prettify_agent_response(response) 85 | 86 | return pretty_response 87 | 88 | 89 | # Create chain to query database 90 | @timer 91 | def sql_chain(query: str, temperature: float = 0, model: str = SQL_MODEL) -> str: 92 | """ 93 | Create a chain that can query a database. 94 | """ 95 | llm = OpenAI(temperature=temperature, model_name=model) 96 | db_chain = SQLDatabaseChain( 97 | llm=llm, database=DB, verbose=True, return_intermediate_steps=True) 98 | 99 | response = db_chain(query) 100 | logger.info(f'Response: {response}') 101 | pretty_response = prettify_chain_response(response) 102 | return pretty_response 103 | -------------------------------------------------------------------------------- /summarize.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for summarizing text. 3 | """ 4 | import re 5 | from typing import List 6 | 7 | import requests 8 | import tiktoken 9 | from bs4 import BeautifulSoup 10 | from langchain import OpenAI 11 | from langchain.chains.summarize import load_summarize_chain 12 | from langchain.docstore.document import Document 13 | from langchain.prompts import (ChatPromptTemplate, HumanMessagePromptTemplate, 14 | SystemMessagePromptTemplate) 15 | from langchain.text_splitter import TokenTextSplitter 16 | 17 | from config import SUMMARY_MAX_TOKENS, SUMMARY_MODEL, SUMMARY_TOKENIZER 18 | from logger import logger 19 | from utils import timer 20 | 21 | ENC = tiktoken.encoding_for_model(SUMMARY_MODEL) 22 | TEXT_SPLITTER = TokenTextSplitter(encoding_name=SUMMARY_TOKENIZER) 23 | 24 | 25 | # Count the number of tokens in text 26 | def num_tokens(text: str) -> int: 27 | """ 28 | Count the number of tokens in text. 29 | """ 30 | # Reuse the module-level encoder instead of rebuilding it on every call 31 | return len(ENC.encode(text)) 32 | 33 | 34 | # Get text from url 35 | def get_text_from_url(url: str) -> str: 36 | """ 37 | Get text from url. 
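Fetches the page, extracts the visible text with BeautifulSoup, collapses consecutive newlines, and trims the result to SUMMARY_MAX_TOKENS tokens so the downstream prompt stays within the model's context window.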
38 | """ 39 | response = requests.get(url) 40 | soup = BeautifulSoup(response.text, 'html.parser') 41 | text = re.sub(r'\n+', '\n', soup.get_text()) # Remove consecutive newlines 42 | 43 | # Trim text to 1800 tokens 44 | trimmed_text = ENC.decode((ENC.encode(text))[:SUMMARY_MAX_TOKENS]) 45 | 46 | logger.info( 47 | f'{num_tokens(trimmed_text)}/{num_tokens(text)} tokens from {url}') 48 | return trimmed_text 49 | 50 | 51 | # Get docs from text 52 | def get_docs_from_text(text: str) -> list: 53 | """ 54 | Get docs from text. 55 | """ 56 | texts = TEXT_SPLITTER.split_text(text) 57 | docs = [Document(page_content=t) for t in texts] 58 | logger.info(f'Created {len(docs):,} out of {len(texts):,} total docs') 59 | return docs 60 | 61 | 62 | # Remove empty lines from text 63 | def remove_empty_lines(text: str) -> str: 64 | """ 65 | Remove empty lines from text. 66 | """ 67 | return '\n'.join([line for line in text.splitlines() if line.strip()]) 68 | 69 | 70 | # Calls OpenAI API and returns summary of text 71 | def summarize(docs: List[str], temperature: float, model: str) -> str: 72 | """ 73 | Calls OpenAI API and returns summary of text. 74 | """ 75 | # Write prompt 76 | system_msg = """You are a teacher who summarizes documents into easily digestible bullet points.""" 77 | human_msg = """Summarize the following text in bullet points: 78 | 79 | {text} 80 | 81 | Concise summary in bullet points:""" 82 | 83 | messages = [ 84 | SystemMessagePromptTemplate.from_template(system_msg), 85 | HumanMessagePromptTemplate.from_template(human_msg) 86 | ] 87 | 88 | prompt = ChatPromptTemplate.from_messages(messages) 89 | logger.info(f'Prompt: {prompt}, temperature: {temperature}') 90 | 91 | # Create LLM and call API 92 | llm = OpenAI(temperature=temperature, model_name=model) 93 | chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt) 94 | response = chain.run(docs) 95 | logger.info( 96 | f'Results received: {response} ({num_tokens(response)} tokens), temperature: {temperature}') 97 | 98 | return response 99 | 100 | 101 | # Calls OpenAI API and explains the text like the user is a five-year old 102 | def eli5(docs: List[str], temperature: float, model: str) -> str: 103 | """ 104 | Calls OpenAI API and returns explaination for a five year old 105 | """ 106 | # Write prompt 107 | system_msg = """You are a teacher who explains documents to a five-year old.""" 108 | human_msg = """Explain the following text to a five-year old: 109 | 110 | {text} 111 | 112 | Concise explanation:""" 113 | 114 | messages = [ 115 | SystemMessagePromptTemplate.from_template(system_msg), 116 | HumanMessagePromptTemplate.from_template(human_msg) 117 | ] 118 | 119 | prompt = ChatPromptTemplate.from_messages(messages) 120 | logger.info(f'Prompt: {prompt}, temperature: {temperature}') 121 | 122 | # Create LLM and call API 123 | llm = OpenAI(temperature=temperature, model_name=model) 124 | chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt) 125 | response = chain.run(docs) 126 | logger.info( 127 | f'Results received: {response} ({num_tokens(response)} tokens), temperature: {temperature}') 128 | 129 | return response 130 | 131 | 132 | # Summarize text from url 133 | @timer 134 | def summarize_url(url: str, temperature: float = None, model: str = SUMMARY_MODEL) -> str: 135 | """ 136 | Calls OpenAI API and returns summary of text. 
137 | """ 138 | logger.info( 139 | f'summarize: {url} (temperature: {temperature}, model: {model})') 140 | # Get text from url 141 | text = get_text_from_url(url) 142 | docs = get_docs_from_text(text) 143 | response = summarize(docs, temperature, model) 144 | pretty_response = remove_empty_lines(response) 145 | 146 | return pretty_response 147 | 148 | 149 | # Explain like I'm five from url 150 | @timer 151 | def eli5_url(url: str, temperature: float = None, model: str = SUMMARY_MODEL) -> str: 152 | """ 153 | Calls OpenAI API and explains the text like the user is a five-year old. 154 | """ 155 | logger.info(f'eli5: {url} (temperature: {temperature}, model: {model})') 156 | # Get text from url 157 | text = get_text_from_url(url) 158 | docs = get_docs_from_text(text) 159 | response = eli5(docs, temperature, model) 160 | pretty_response = remove_empty_lines(response) 161 | 162 | return pretty_response 163 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions for the project. 3 | """ 4 | import re 5 | from functools import wraps 6 | from time import perf_counter 7 | from typing import Callable 8 | 9 | 10 | # Timer decorator 11 | def timer(func: Callable) -> Callable: 12 | """ 13 | Timer decorator. 14 | """ 15 | @wraps(func) 16 | def wrapper_timer(*args, **kwargs): 17 | start_time = perf_counter() 18 | value = func(*args, **kwargs) 19 | end_time = perf_counter() 20 | run_time = end_time - start_time 21 | print(f'Finished {func.__name__!r} in {run_time:.2f} secs') 22 | return value, run_time 23 | 24 | return wrapper_timer 25 | 26 | 27 | # Prettify langchain agent response 28 | def prettify_agent_response(response: dict, input_key: str = 'input', output_key: str = 'output') -> str: 29 | """ 30 | Pretty print the response from the agent. 31 | """ 32 | pretty_result = '' 33 | 34 | pretty_result += f'**Input:** {response[input_key]}\n\n' 35 | 36 | for step in response['intermediate_steps']: 37 | action = step[0] 38 | result = step[1] 39 | # pretty_result += f'**Tool**: {action.tool} | **Input**: "{action.tool_input}"\n' 40 | pretty_result += f'**Thought:** {action.log}\n' 41 | pretty_result += f'**Observation:** _{result}_\n\n' 42 | 43 | pretty_result += f'\n**Output:** {response[output_key]}' 44 | 45 | return pretty_result 46 | 47 | 48 | # Prettify langchain agent response 49 | def prettify_chain_response(response: dict, input_key: str = 'query', output_key: str = 'result') -> str: 50 | """ 51 | Pretty print the response from the chain. 52 | """ 53 | pretty_result = '' 54 | 55 | pretty_result += f'**Input:** {response[input_key]}\n\n' 56 | 57 | if 'intermediate_steps' in response: 58 | for i, step in enumerate(response['intermediate_steps']): 59 | pretty_result += f'**Step {i}:** {step}\n\n' 60 | 61 | pretty_result += f'\n**Output:** {response[output_key]}' 62 | 63 | return pretty_result 64 | 65 | 66 | # Wrap urls in <> to prevent discord from embedding them 67 | def wrap_urls(text): 68 | return re.sub(r'(https?://\S+)', r'<\1>', text) 69 | 70 | 71 | # Prettify langchain Q&A chain response 72 | def prettify_qa_response(response: dict, question_key: str = 'question', answer_key: str = 'answer') -> str: 73 | """ 74 | Pretty print the response from the Q&A chain. 
75 | """ 76 | result_list = [] 77 | pretty_qa = '' 78 | 79 | pretty_qa += f'**Question:** {response[question_key]}\n\n' 80 | pretty_qa += f'**Answer:** {response[answer_key]}\n' 81 | pretty_qa += f'**Sources:** {wrap_urls(response["sources"])}' 82 | result_list.append(pretty_qa) 83 | 84 | for doc in response['source_documents']: 85 | pretty_source = '' 86 | pretty_source += f'**Source:** {wrap_urls(doc.page_content)}\n\n' 87 | pretty_source += f'**URL:** {wrap_urls(doc.metadata["source"])}' 88 | result_list.append(pretty_source) 89 | 90 | return result_list 91 | --------------------------------------------------------------------------------