├── .dockerignore ├── .gitignore ├── .python-version ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── config.py ├── data └── books.db ├── logger.py ├── main.py ├── notebooks ├── ask-ey.ipynb ├── embed.ipynb ├── scrape.ipynb └── t-sne.png ├── qa.py ├── railway.toml ├── requirements.txt ├── runtime.txt ├── search.py ├── sql.py ├── summarize.py └── utils.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .github 3 | .gitignore 4 | .idea 5 | .venv 6 | .git 7 | .ipynb_checkpoints 8 | __pycache__ 9 | tests 10 | _releaser 11 | _site 12 | CONTRIBUTING.md 13 | Dockerfile 14 | docker-compose.yml 15 | /vendor -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 
162 | # Local env
163 | .vscode/
164 | .DS_Store
--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.9.7
2 | 
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # syntax=docker/dockerfile:1
2 | # temp stage
3 | FROM python:3.9.7-slim AS builder
4 | 
5 | WORKDIR /app
6 | 
7 | ENV PYTHONDONTWRITEBYTECODE=1
8 | ENV PYTHONUNBUFFERED=1
9 | 
10 | RUN apt-get update && \
11 |     apt-get install -y --no-install-recommends gcc
12 | 
13 | COPY requirements.txt .
14 | RUN pip wheel --no-cache-dir --no-deps --wheel-dir /app/wheels -r requirements.txt
15 | 
16 | 
17 | # final stage
18 | FROM python:3.9.7-slim
19 | 
20 | WORKDIR /app
21 | 
22 | COPY --from=builder /app/wheels /wheels
23 | COPY --from=builder /app/requirements.txt .
24 | 
25 | RUN pip install --no-cache-dir /wheels/*
26 | 
27 | COPY . .
28 | 
29 | ENTRYPOINT ["python3", "main.py"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 | 
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 | 1. Definitions.
8 | 
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 | 
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: build dev prod
2 | 
3 | build:
4 | 	DOCKER_BUILDKIT=1 docker build -t gpt-dev .
5 | 
6 | dev: build
7 | 	docker run -it --rm gpt-dev --env=dev
8 | 
9 | prod: build
10 | 	docker run -it --rm gpt-dev
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # discord-llm
2 | 
3 | Code for [Experimenting with LLMs to Research, Reflect, and Plan](https://eugeneyan.com/writing/llm-experiments/). Disclaimer: The code is disorganized and hacky, and relies largely on LangChain's abstractions. It may be useful as a reference, but **not** as learning material.
4 | 
5 | If you want to try this, update the `.env` file with your own keys. Most functionality, such as summarizing URLs, running SQL queries on `/data/books.db`, and search, should work right out of the box. For Q&A, you'll need to add your own custom indices.
6 | 
7 | ## Discord functionality
8 | - Summarize and ELI5 URLs
9 | - Run basic SQL via a chain or agent
10 | - Run a search query via Google Custom Search
11 | - Q&A on custom indices (Note: You need to add your own indices)
12 | 
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | """
2 | Configurations
3 | """
4 | # Defaults
5 | DEFAULT_MODEL = 'gpt-3.5-turbo'
6 | 
7 | # Config for summarization
8 | TOKENIZER_DICT = {'gpt-3.5-turbo': 'cl100k_base',
9 |                   'gpt-4': 'cl100k_base'}
10 | SUMMARY_MAX_TOKENS_DICT = {'gpt-3.5-turbo': 3800,
11 |                            'gpt-4': 7000}
12 | 
13 | SUMMARY_MODEL = DEFAULT_MODEL
14 | SUMMARY_TOKENIZER = TOKENIZER_DICT[SUMMARY_MODEL]
15 | SUMMARY_MAX_TOKENS = SUMMARY_MAX_TOKENS_DICT[SUMMARY_MODEL]
16 | 
17 | # Config for search
18 | SEARCH_MODEL = DEFAULT_MODEL
19 | 
20 | # Config for SQL
21 | SQL_MODEL = DEFAULT_MODEL
22 | 
23 | # Config for Q&A
24 | QA_MODEL = DEFAULT_MODEL
25 | PINECONE_ENV = 'us-west4-gcp'
26 | PINECONE_INDEX_NAME_EY = 'ask-ey'
27 | PINECONE_INDEX_NAME_BOARD = 'board'
28 | EMBEDDING_MODEL = 'text-embedding-ada-002'
--------------------------------------------------------------------------------
/data/books.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eugeneyan/discord-llm/0308a50b89e432bd7e34656e9b3911939eab7e9f/data/books.db
--------------------------------------------------------------------------------
/logger.py:
--------------------------------------------------------------------------------
1 | """
2 | Logger utility
3 | """
4 | import logging
5 | 
6 | logger = logging.getLogger(__name__)
7 | logger.setLevel(logging.INFO)
8 | formatter = logging.Formatter('%(asctime)s - %(message)s')
9 | 
10 | # create console handler and set level to info
11 | ch = logging.StreamHandler()
12 | ch.setFormatter(formatter)
13 | ch.setLevel(logging.INFO)
14 | 
15 | # add ch to logger
16 | logger.addHandler(ch)
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | """
2 | Bot for a Discord server that uses the OpenAI API for commands.
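Run it directly, as a sketch based on the argparse flags below:
    python main.py              # defaults to --env=prod and uses DISCORD_TOKEN
    python main.py --env=dev    # uses DISCORD_TOKEN_DEV and prefixes commands with 'dev-'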
3 | """ 4 | import argparse 5 | import os 6 | from sqlite3 import OperationalError 7 | 8 | import interactions 9 | from dotenv import load_dotenv 10 | 11 | from config import DEFAULT_MODEL 12 | from logger import logger 13 | from qa import qa_board, qa_ey 14 | from search import search_agent 15 | from sql import sql_agent, sql_chain 16 | from summarize import eli5_url, summarize_url 17 | 18 | # Parse arguments 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument('--env', type=str, default='prod') 21 | args = parser.parse_args() 22 | logger.info(f'Arguments: {args.__dict__}') 23 | 24 | # Discord arguments 25 | MAX_INITIAL_MESSAGE_LENGTH = 1900 26 | MAX_MESSAGE_LENGTH = 2000 27 | 28 | # Load environment variables 29 | load_dotenv() 30 | TOKEN = os.getenv('DISCORD_TOKEN') 31 | GUILD_ID = os.getenv('DISCORD_GUILD_ID') 32 | CMD_PREFIX = '' 33 | 34 | if args.env == 'dev': 35 | TOKEN = os.getenv('DISCORD_TOKEN_DEV') 36 | CMD_PREFIX = 'dev-' 37 | 38 | bot = interactions.Client(TOKEN) 39 | logger.info(f'Bot initialized: {bot.__dict__}') 40 | 41 | # Define reusable options 42 | OPTIONS_TEMPERATURE = interactions.Option(name='temperature', description='Lower values = more focused responses, higher values = more random', required=False, 43 | type=interactions.OptionType.NUMBER, min_value=0.0, max_value=2.0) 44 | OPTIONS_MODEL = interactions.Option(name='model', description='Model to use', required=False, 45 | type=interactions.OptionType.STRING, 46 | choices=[interactions.Choice(name='gpt-3.5', value='gpt-3.5-turbo'), 47 | interactions.Choice(name='gpt-4', value='gpt-4')]) 48 | OPTIONS_SHOW_SOURCE = interactions.Option(name='show_source', description='Show snippets of source content', required=False, 49 | type=interactions.OptionType.BOOLEAN, 50 | choices=[interactions.Choice(name='yes', value=True), 51 | interactions.Choice(name='no', value=False)]) 52 | 53 | 54 | @bot.command(name=f'{CMD_PREFIX}hello', description='Says hello without hitting any APIs. Used for health checks.', scope=GUILD_ID) 55 | async def _hello(ctx: interactions.CommandContext): 56 | await ctx.send(f'Hello {ctx.author.mention}! 
How are you?')
57 | 
58 | 
59 | @bot.command(name=f'{CMD_PREFIX}summarize', description='Summarizes a URL in bullet points', scope=GUILD_ID,
60 |              options=[interactions.Option(name='url', description='URL to summarize', required=True, type=interactions.OptionType.STRING),
61 |                       OPTIONS_TEMPERATURE, OPTIONS_MODEL])
62 | async def _summarize(ctx: interactions.CommandContext, url: str, temperature: float = None, model: str = DEFAULT_MODEL):
63 |     logger.info(f'Summarize: {url}, Temp: {temperature}, Model: {model}')
64 |     await ctx.defer()
65 |     summary, time = summarize_url(url, temperature, model)
66 |     summary += f'\n\n `Temp: {temperature}, Model: {model}, Time: {time:.2f}s`'
67 |     await ctx.send(f'Here is the summary of {url}:\n\n{summary[:MAX_INITIAL_MESSAGE_LENGTH]}')
68 |     for i in range(MAX_INITIAL_MESSAGE_LENGTH, len(summary), MAX_MESSAGE_LENGTH):
69 |         await ctx.send(f'{summary[i:i+MAX_MESSAGE_LENGTH]}')
70 | 
71 | 
72 | @bot.command(name=f'{CMD_PREFIX}eli5', description='Explains a URL to a five-year-old', scope=GUILD_ID,
73 |              options=[interactions.Option(name='url', description='URL to explain', required=True, type=interactions.OptionType.STRING),
74 |                       OPTIONS_TEMPERATURE, OPTIONS_MODEL])
75 | async def _eli5(ctx: interactions.CommandContext, url: str, temperature: float = None, model: str = DEFAULT_MODEL):
76 |     logger.info(f'ELI5: {url}, Temp: {temperature}, Model: {model}')
77 |     await ctx.defer()
78 |     explanation, time = eli5_url(url, temperature, model)
79 |     explanation += f'\n\n `Temp: {temperature}, Model: {model}, Time: {time:.2f}s`'
80 |     await ctx.send(f'Here is the explanation of {url}:\n\n{explanation[:MAX_INITIAL_MESSAGE_LENGTH]}')
81 |     for i in range(MAX_INITIAL_MESSAGE_LENGTH, len(explanation), MAX_MESSAGE_LENGTH):
82 |         await ctx.send(f'{explanation[i:i+MAX_MESSAGE_LENGTH]}')
83 | 
84 | 
85 | @bot.command(name=f'{CMD_PREFIX}search', description='Searches the internet for a query', scope=GUILD_ID,
86 |              options=[interactions.Option(name='query', description='Query to search for', required=True, type=interactions.OptionType.STRING),
87 |                       OPTIONS_TEMPERATURE, OPTIONS_MODEL])
88 | async def _search_agent(ctx: interactions.CommandContext, query: str, temperature: float = None, model: str = DEFAULT_MODEL):
89 |     logger.info(f'Search: {query}, Temp: {temperature}, Model: {model}')
90 |     await ctx.defer()
91 |     try:
92 |         result, time = search_agent(query, temperature, model)
93 |         result += f'\n\n `Temp: {temperature}, Model: {model}, Time: {time:.2f}s`'
94 |         await ctx.send(f'{result[:MAX_INITIAL_MESSAGE_LENGTH]}')
95 |         for i in range(MAX_INITIAL_MESSAGE_LENGTH, len(result), MAX_MESSAGE_LENGTH):
96 |             await ctx.send(f'{result[i:i+MAX_MESSAGE_LENGTH]}')
97 |     except ValueError as e:
98 |         await ctx.send(f'Error: {e}. 
Please try again.')
99 | 
100 | 
101 | @bot.command(name=f'{CMD_PREFIX}table', description='Describes the books table.', scope=GUILD_ID)
102 | async def _table(ctx: interactions.CommandContext):
103 |     await ctx.send('The books table has the following columns: id, title, author, language, average rating, ratings count, and text reviews count.')
104 | 
105 | 
106 | @bot.command(name=f'{CMD_PREFIX}sql', description='Queries a database via a SQL chain', scope=GUILD_ID,
107 |              options=[interactions.Option(name='query', description='Query to search for', required=True, type=interactions.OptionType.STRING),
108 |                       OPTIONS_TEMPERATURE, OPTIONS_MODEL])
109 | async def _sql_chain(ctx: interactions.CommandContext, query: str, temperature: float = None, model: str = DEFAULT_MODEL):
110 |     logger.info(f'SQL-chain: {query}, Temp: {temperature}, Model: {model}')
111 |     await ctx.defer()
112 |     try:
113 |         result, time = sql_chain(query, temperature, model)
114 |         result += f'\n\n `Temp: {temperature}, Model: {model}, Time: {time:.2f}s`'
115 |         await ctx.send(f'{result[:MAX_INITIAL_MESSAGE_LENGTH]}')
116 |         for i in range(MAX_INITIAL_MESSAGE_LENGTH, len(result), MAX_MESSAGE_LENGTH):
117 |             await ctx.send(f'{result[i:i+MAX_MESSAGE_LENGTH]}')
118 |     except OperationalError as e:
119 |         await ctx.send(f'Error: {e}. Please try again.')
120 | 
121 | 
122 | @bot.command(name=f'{CMD_PREFIX}sql-agent', description='Queries a database via a SQL agent', scope=GUILD_ID,
123 |              options=[interactions.Option(name='query', description='Query to search for', required=True, type=interactions.OptionType.STRING),
124 |                       OPTIONS_TEMPERATURE, OPTIONS_MODEL])
125 | async def _sql_agent(ctx: interactions.CommandContext, query: str, temperature: float = None, model: str = DEFAULT_MODEL):
126 |     logger.info(f'SQL-agent: {query}, Temp: {temperature}, Model: {model}')
127 |     await ctx.defer()
128 |     try:
129 |         result, time = sql_agent(query, temperature, model)
130 |         result += f'\n\n `Temp: {temperature}, Model: {model}, Time: {time:.2f}s`'
131 |         await ctx.send(f'{result[:MAX_INITIAL_MESSAGE_LENGTH]}')
132 |         for i in range(MAX_INITIAL_MESSAGE_LENGTH, len(result), MAX_MESSAGE_LENGTH):
133 |             await ctx.send(f'{result[i:i+MAX_MESSAGE_LENGTH]}')
134 |     except ValueError as e:
135 |         await ctx.send(f'Error: {e}. 
Please try again.')
136 | 
137 | 
138 | @bot.command(name=f'{CMD_PREFIX}ask-ey', description='Asks eugeneyan.com a question', scope=GUILD_ID,
139 |              options=[interactions.Option(name='question', description='Question to ask', required=True, type=interactions.OptionType.STRING),
140 |                       OPTIONS_TEMPERATURE, OPTIONS_MODEL, OPTIONS_SHOW_SOURCE])
141 | async def _ask_ey(ctx: interactions.CommandContext, question: str, temperature: float = None, model: str = DEFAULT_MODEL, show_source: bool = False):
142 |     logger.info(
143 |         f'Ask ey: {question}, Temp: {temperature}, Model: {model}, Show source: {show_source}')
144 |     await ctx.defer()
145 |     # The first element is the answer, the rest are sources
146 |     result_list, time = qa_ey(question, temperature, model)
147 | 
148 |     result = result_list[0]
149 |     result += f'\n\n `Temp: {temperature}, Model: {model}, Time: {time:.2f}s`'
150 |     await ctx.send(f'{result[:MAX_MESSAGE_LENGTH]}')
151 | 
152 |     if show_source:
153 |         # Send sources as individual messages
154 |         for source in result_list[1:]:
155 |             await ctx.send(f'{source[:MAX_MESSAGE_LENGTH]}')
156 | 
157 | 
158 | @bot.command(name=f'{CMD_PREFIX}board', description='Asks the board of advisors a question', scope=GUILD_ID,
159 |              options=[interactions.Option(name='question', description='Question to ask', required=True, type=interactions.OptionType.STRING),
160 |                       OPTIONS_TEMPERATURE, OPTIONS_MODEL, OPTIONS_SHOW_SOURCE])
161 | async def _ask_board(ctx: interactions.CommandContext, question: str, temperature: float = None, model: str = DEFAULT_MODEL, show_source: bool = False):
162 |     logger.info(
163 |         f'Ask board: {question}, Temp: {temperature}, Model: {model}, Show source: {show_source}')
164 |     await ctx.defer()
165 |     # The first element is the answer, the rest are sources
166 |     result_list, time = qa_board(question, temperature, model)
167 | 
168 |     result = result_list[0]
169 |     result += f'\n\n `Temp: {temperature}, Model: {model}, Time: {time:.2f}s`'
170 |     await ctx.send(f'{result[:MAX_MESSAGE_LENGTH]}')
171 | 
172 |     if show_source:
173 |         # Send sources as individual messages
174 |         for source in result_list[1:]:
175 |             await ctx.send(f'{source[:MAX_MESSAGE_LENGTH]}')
176 | 
177 | 
178 | if __name__ == '__main__':
179 |     bot.start()
180 | 
--------------------------------------------------------------------------------
/notebooks/ask-ey.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 11,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import math\n",
10 | "import os\n",
11 | "import pickle\n",
12 | "\n",
13 | "import pandas as pd\n",
14 | "import pinecone\n",
15 | "import requests\n",
16 | "import xmltodict\n",
17 | "from bs4 import BeautifulSoup\n",
18 | "from dotenv import load_dotenv\n",
19 | "from langchain import OpenAI\n",
20 | "from langchain.chains import RetrievalQAWithSourcesChain\n",
21 | "from langchain.embeddings import OpenAIEmbeddings\n",
22 | "from langchain.text_splitter import CharacterTextSplitter\n",
23 | "from langchain.vectorstores import FAISS, Pinecone"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 2,
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "data": {
33 | "text/plain": [
34 | "True"
35 | ]
36 | },
37 | "execution_count": 2,
38 | "metadata": {},
39 | "output_type": "execute_result"
40 | }
41 | ],
42 | "source": [
43 | "load_dotenv()"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 3,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "import sys\n",
53 | "\n",
54 | "sys.path.append('..')\n",
55 | "\n",
56 | "from logger import logger\n",
57 | "from config import PINECONE_ENV"
58 | ]
59 | },
60 | {
61 | "attachments": {},
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "## Scrape URLs"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 4,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "r = requests.get(\"http://eugeneyan.com/sitemap.xml\")\n",
75 | "xml = r.text\n",
76 | "raw = xmltodict.parse(xml)"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 5,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "html = requests.get('https://eugeneyan.com/writing/content-moderation/').text\n",
86 | "soup = BeautifulSoup(html, features=\"html.parser\")"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 255,
92 | "metadata": {},
93 | "outputs": [],
94 | "source": [
95 | "# _paragraphs = soup.find_all('p')\n",
96 | "# _paragraphs[:5]"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 248,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "# paragraphs = []\n",
106 | "\n",
107 | "# for p in _paragraphs:\n",
108 | "#     if 'class' in p.attrs and 'date' in p['class']:\n",
109 | "#         continue\n",
110 | "#     if p.get_text() == 'To cite this content, please use:':\n",
111 | "#         break\n",
112 | "#     paragraphs.append(p.get_text())"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 256,
118 | "metadata": {},
119 | "outputs": [],
120 | "source": [
121 | "# lines = [line.strip() for line in paragraphs]\n",
122 | "# lines[:5]"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 257,
128 | "metadata": {},
129 | "outputs": [],
130 | "source": [
131 | "# lines = [line for line in lines if len(line) > 15]\n",
132 | "# lines[:5]"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 259,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "# print('\\n'.join(line for line in lines if line))"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 6,
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "def extract_text_from(url, min_line_length=20, last_paragraph='To cite this content, please use:'):\n",
151 | "    html = requests.get(url).text\n",
152 | "    soup = BeautifulSoup(html, features=\"html.parser\")\n",
153 | "    \n",
154 | "    # Find all paragraphs and exclude everything from the `last_paragraph` marker onwards\n",
155 | "    _paragraphs = soup.find_all('p')\n",
156 | "    \n",
157 | "    paragraphs = []\n",
158 | "    for p in _paragraphs:\n",
159 | "        if 'class' in p.attrs and 'date' in p['class']:\n",
160 | "            continue\n",
161 | "        if p.get_text() == last_paragraph:\n",
162 | "            break\n",
163 | "        paragraphs.append(p.get_text())\n",
164 | "    logger.debug(f'Paragraphs: {paragraphs[0]}')\n",
165 | "    \n",
166 | "    # Strip surrounding whitespace from each paragraph\n",
167 | "    lines = (line.strip() for line in paragraphs)\n",
168 | "    \n",
169 | "    # Drop lines shorter than min_line_length characters\n",
170 | "    lines = (line for line in lines if len(line) > min_line_length)\n",
171 | "    \n",
172 | "    return '\\n'.join(line for line in lines if line)"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 7,
178 | "metadata": {},
179 | "outputs": [
180 | {
181 | "name": "stdout",
182 | "output_type": "stream",
183 | "text": [
184 | "How can we 
improve a machine learning project’s chance of success? Over the years, I’ve explored various mechanisms in both my own projects and those of my team members. Most people who tried these mechanisms ended up adopting them in future projects.\n", 185 | "While these mechanisms were developed with machine learning projects in mind, with a few tweaks, they can be applied to other technical endeavors too.\n", 186 | "If your team is like most teams I’ve been on, you have 2 - 3 problems for every available person. Thus, each member works on 1 or 2 problems simultaneously, with some folks taking 3 or more. And because everyone’s so busy, we barely have time to check in on each other’s projects outside of standup, planning, retrospective, etc.\n", 187 | "This is an anti-pattern. It can lead to a project going off-track for months, or a critical error (e.g., incorrect training data, invalid train-validation split) going undetected until late in the implementation phase.\n", 188 | "One solution is to have a pilot and copilot for each project. The pilot is the main project owner and is in charge of its success (or failure). They own and delegate the work as required though they’re usually responsible for the bulk of design and critical code paths.\n", 189 | "The copilot helps the pilot stay on track, identify critical flaws, and call out blindspots. This includes periodic check-ins, reviewing document drafts and prototypes, and being a mandatory code reviewer. For example, the copilot should challenge the pilot if the proposed design doesn’t solve the business problem, or if the train-validation split is invalid. To be able to spot these issues, the copilot typically has experience in the problem space, or has more experience in general, similar to how senior engineers guide juniors.\n", 190 | "For every 10 hours the pilot spends on the project, the copilot can expect to spend an hour on reviews (10% of the pilot’s effort). While this may seem excessive, copilots have helped avoid costlier rework or abandoning a project due to mistakes that snowballed.\n", 191 | "Pilots and copilots don’t have to be from the same job family. As an applied scientist, I often partner with an engineer who helps with infrastructure, observability, CI/CD, etc. If both scientist and engineer are sufficiently experienced, they can double up as each other’s copilot. As they review each other’s work, knowledge transfer occurs organically and they learn to be effective copilots for other engineers or scientists in future projects.\n", 192 | "Also read more on the dangers of flying solo by Ethan Rosenthal and Vicki Boykis.\n", 193 | "In my earlier projects, because I was overeager, I would immediately jump into the data and begin training models. After watching me go in the wrong direction for a week or two, a merciful senior would share a paper, casually suggesting that it might be helpful to read it. It always was. After letting this happen once too often, I finally learned to start my projects with a literature review.\n", 194 | "For a literature review, I read papers relevant to the problem. I’m biased towards solutions that have been applied in industry though more academic papers have also been helpful.\n", 195 | "While reading these papers, I’m less interested in model architecture and focus on:\n", 196 | "To quickly go through the papers, I adopt the three-pass approach.\n", 197 | "This is similar to a code review but for machine learning prototypes and experiments. 
Once I have initial experiment results, I schedule a review with fellow scientists to ensure I haven’t overlooked any blindspots or committed critical errors.\n", 198 | "During the review, I focus on understanding the methodology and the potential of the current approach. Some questions include:\n", 199 | "To conduct methodology reviews asynchronously, like a code review, we could adopt a tool like DagsHub which supports comments on Jupyter notebooks and data.\n", 200 | "To tie it all together, we timebox each project phase and task. Time constraints help us focus on the most important tasks and not get bogged down in the details. Timeboxing for machine learning projects can be challenging, because compared to engineering projects, the work is relatively ill-defined. Furthermore, a large part of the work is research and experimentation which unfortunately leads to many a dead end.\n", 201 | "But it’s because of these challenges that timeboxing is effective—how much effort should we invest before pivoting? In most industry settings, we don’t have limitless resources to pursue a problem for years.\n", 202 | "(I treat timeboxes differently from estimates. Timeboxes are stretch goals while estimates are project management inputs that indicate the upper bound of effort needed. To convert timeboxes to estimates, I usually multiply by 1.5 - 3.0.)\n", 203 | "Here are three ways to define timeboxes.\n", 204 | "The first—and most aggressive—approach is to take the time spent on similar projects and halve it. This forces us to be scrappy and build a minimum lovable product that we can quickly get feedback on, reducing the iteration cycle. This approach works well in startups and startup-like teams though it can be too intense to adopt all the time.\n", 205 | "A less extreme approach is to set a timebox that is “comfortable yet challenging”. Thus, instead of halving the timebox, we reduce it by 10 - 20%. By deliberately introducing these constraints, we give ourselves the opportunity to reflect on timesinks to avoid and how to achieve more with fewer resources. This is a good default for most seasoned teams.\n", 206 | "Finally, for greenfield projects that may be hard to scope, we can adopt standard timeboxes. For example, we might allocate two weeks for a literature review, four to eight weeks to build a prototype, and three to six months to implement it in production.\n", 207 | "I’ve also written about other mechanisms for machine learning projects, including:\n", 208 | "What mechanisms do you adopt in your machine learning projects? 
Please share below!\n", 209 | "Thanks to Yang Xinyi for reading drafts of this.\n" 210 | ] 211 | } 212 | ], 213 | "source": [ 214 | "print(extract_text_from('https://eugeneyan.com/writing/mechanisms-for-projects/'))" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 8, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "pages = []\n", 224 | "\n", 225 | "for info in raw['urlset']['url']:\n", 226 | " url = info['loc']\n", 227 | " if 'https://eugeneyan.com/writing/' in info['loc']:\n", 228 | " pages.append({'text': extract_text_from(url), 'url': url})" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 12, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "df = pd.DataFrame(pages)\n", 238 | "\n", 239 | "# # Exclude short posts that may be talks and mostly images\n", 240 | "df['text_len'] = df['text'].apply(lambda x: len(x))\n", 241 | "df = df[df['text_len'] > 500]\n", 242 | "df = df.drop(columns=['text_len'])\n", 243 | "\n", 244 | "# Exclude certain urls\n", 245 | "excluded_urls = {''}\n", 246 | "df = df[~df['url'].isin(excluded_urls)]" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 15, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "df.to_parquet('../data/eugeneyan.parquet', compression='gzip')" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [] 264 | }, 265 | { 266 | "attachments": {}, 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "## Split each page into documents" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 264, 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "name": "stderr", 280 | "output_type": "stream", 281 | "text": [ 282 | "2023-03-26 17:06:54,129 - Split https://eugeneyan.com/writing/llm-bio/ into 10 docs\n", 283 | "2023-03-26 17:06:54,131 - Split https://eugeneyan.com/writing/labeling-guidelines/ into 5 docs\n", 284 | "2023-03-26 17:06:54,133 - Split https://eugeneyan.com/writing/content-moderation/ into 14 docs\n", 285 | "2023-03-26 17:06:54,134 - Split https://eugeneyan.com/writing/mechanisms-for-teams/ into 6 docs\n", 286 | "2023-03-26 17:06:54,134 - Split https://eugeneyan.com/writing/mechanisms-for-projects/ into 5 docs\n", 287 | "2023-03-26 17:06:54,135 - Split https://eugeneyan.com/writing/roam-to-obsidian/ into 2 docs\n", 288 | "2023-03-26 17:06:54,135 - Split https://eugeneyan.com/writing/getting-help/ into 3 docs\n", 289 | "2023-03-26 17:06:54,136 - Split https://eugeneyan.com/writing/2022-in-review/ into 6 docs\n", 290 | "2023-03-26 17:06:54,137 - Split https://eugeneyan.com/writing/autoencoders-vs-diffusers/ into 3 docs\n", 291 | "2023-03-26 17:06:54,138 - Split https://eugeneyan.com/writing/text-to-image/ into 16 docs\n", 292 | "2023-03-26 17:06:54,138 - Split https://eugeneyan.com/writing/recsys2022/ into 14 docs\n", 293 | "2023-03-26 17:06:54,139 - Split https://eugeneyan.com/writing/testing-pipelines/ into 10 docs\n", 294 | "2023-03-26 17:06:54,140 - Split https://eugeneyan.com/writing/simplicity/ into 6 docs\n", 295 | "2023-03-26 17:06:54,141 - Split https://eugeneyan.com/writing/uncommon-python/ into 7 docs\n", 296 | "2023-03-26 17:06:54,141 - Split https://eugeneyan.com/writing/15-5/ into 2 docs\n", 297 | "2023-03-26 17:06:54,142 - Split https://eugeneyan.com/writing/design-patterns/ into 7 docs\n", 298 | "2023-03-26 17:06:54,142 - Split 
https://eugeneyan.com/writing/onboarding/ into 9 docs\n", 299 | "2023-03-26 17:06:54,143 - Split https://eugeneyan.com/writing/bandits/ into 14 docs\n", 300 | "2023-03-26 17:06:54,144 - Split https://eugeneyan.com/writing/position-bias/ into 6 docs\n", 301 | "2023-03-26 17:06:54,145 - Split https://eugeneyan.com/writing/counterfactual-evaluation/ into 8 docs\n", 302 | "2023-03-26 17:06:54,146 - Split https://eugeneyan.com/writing/intent-vs-requirements/ into 7 docs\n", 303 | "2023-03-26 17:06:54,147 - Split https://eugeneyan.com/writing/project-quick-start/ into 8 docs\n", 304 | "2023-03-26 17:06:54,148 - Split https://eugeneyan.com/writing/becoming-a-data-leader/ into 2 docs\n", 305 | "2023-03-26 17:06:54,149 - Split https://eugeneyan.com/writing/red-flags/ into 6 docs\n", 306 | "2023-03-26 17:06:54,149 - Split https://eugeneyan.com/writing/how-to-keep-learning/ into 5 docs\n", 307 | "2023-03-26 17:06:54,150 - Split https://eugeneyan.com/writing/2021-year-in-review/ into 6 docs\n", 308 | "2023-03-26 17:06:54,151 - Split https://eugeneyan.com/writing/applyingml/ into 1 docs\n", 309 | "2023-03-26 17:06:54,152 - Split https://eugeneyan.com/writing/what-i-learned-from-writing-online-susan-shu/ into 4 docs\n", 310 | "2023-03-26 17:06:54,152 - Split https://eugeneyan.com/writing/what-i-learned-from-writing-online/ into 6 docs\n", 311 | "2023-03-26 17:06:54,153 - Split https://eugeneyan.com/writing/recsys2021/ into 5 docs\n", 312 | "2023-03-26 17:06:54,154 - Split https://eugeneyan.com/writing/first-rule-of-ml/ into 6 docs\n", 313 | "2023-03-26 17:06:54,155 - Split https://eugeneyan.com/writing/reinforcement-learning-for-recsys-and-search/ into 12 docs\n", 314 | "2023-03-26 17:06:54,155 - Split https://eugeneyan.com/writing/bootstrapping-data-labels/ into 11 docs\n", 315 | "2023-03-26 17:06:54,156 - Split https://eugeneyan.com/writing/mailbag-bootstrap-relevant-docs/ into 2 docs\n", 316 | "2023-03-26 17:06:54,157 - Split https://eugeneyan.com/writing/influencing-without-authority/ into 8 docs\n", 317 | "2023-03-26 17:06:54,158 - Split https://eugeneyan.com/writing/system-design-for-discovery/ into 12 docs\n", 318 | "2023-03-26 17:06:54,159 - Split https://eugeneyan.com/writing/patterns-for-personalization/ into 24 docs\n", 319 | "2023-03-26 17:06:54,159 - Split https://eugeneyan.com/writing/machine-learning-metagame/ into 11 docs\n", 320 | "2023-03-26 17:06:54,160 - Split https://eugeneyan.com/writing/search-query-matching/ into 18 docs\n", 321 | "2023-03-26 17:06:54,161 - Split https://eugeneyan.com/writing/imposter-syndrome-susan/ into 6 docs\n", 322 | "2023-03-26 17:06:54,162 - Split https://eugeneyan.com/writing/imposter-syndrome/ into 8 docs\n", 323 | "2023-03-26 17:06:54,162 - Split https://eugeneyan.com/writing/values-and-superpowers/ into 8 docs\n", 324 | "2023-03-26 17:06:54,163 - Split https://eugeneyan.com/writing/how-to-choose-problems/ into 11 docs\n", 325 | "2023-03-26 17:06:54,164 - Split https://eugeneyan.com/writing/seven-habits-that-shaped-my-decade/ into 13 docs\n", 326 | "2023-03-26 17:06:54,165 - Split https://eugeneyan.com/writing/ml-design-docs/ into 14 docs\n", 327 | "2023-03-26 17:06:54,166 - Split https://eugeneyan.com/writing/writing-docs-why-what-how/ into 9 docs\n", 328 | "2023-03-26 17:06:54,167 - Split https://eugeneyan.com/writing/feature-stores/ into 15 docs\n", 329 | "2023-03-26 17:06:54,168 - Split https://eugeneyan.com/writing/how-to-win-data-hackathon/ into 5 docs\n", 330 | "2023-03-26 17:06:54,169 - Split https://eugeneyan.com/writing/data-science-teams/ 
into 14 docs\n", 331 | "2023-03-26 17:06:54,170 - Split https://eugeneyan.com/writing/you-dont-need-another-mooc/ into 7 docs\n", 332 | "2023-03-26 17:06:54,171 - Split https://eugeneyan.com/writing/mailbag-resume-for-experienced-ds/ into 3 docs\n", 333 | "2023-03-26 17:06:54,173 - Split https://eugeneyan.com/writing/real-time-recommendations/ into 17 docs\n", 334 | "2023-03-26 17:06:54,174 - Split https://eugeneyan.com/writing/2021-roadmap/ into 3 docs\n", 335 | "2023-03-26 17:06:54,174 - Split https://eugeneyan.com/writing/retrospective-2020/ into 6 docs\n", 336 | "2023-03-26 17:06:54,175 - Split https://eugeneyan.com/writing/flying-dagger/ into 2 docs\n", 337 | "2023-03-26 17:06:54,176 - Split https://eugeneyan.com/writing/how-i-reflect-and-plan/ into 2 docs\n", 338 | "2023-03-26 17:06:54,177 - Split https://eugeneyan.com/writing/informal-mentors-alexey-grigorev/ into 17 docs\n", 339 | "2023-03-26 17:06:54,178 - Split https://eugeneyan.com/writing/mailbag-blog-architecture/ into 1 docs\n", 340 | "2023-03-26 17:06:54,179 - Split https://eugeneyan.com/writing/life-lessons-from-machine-learning/ into 10 docs\n", 341 | "2023-03-26 17:06:54,180 - Split https://eugeneyan.com/writing/role-title-mismatch/ into 6 docs\n", 342 | "2023-03-26 17:06:54,181 - Split https://eugeneyan.com/writing/data-science-roles/ into 9 docs\n", 343 | "2023-03-26 17:06:54,182 - Split https://eugeneyan.com/writing/informal-mentors-chip-huyen/ into 8 docs\n", 344 | "2023-03-26 17:06:54,183 - Split https://eugeneyan.com/writing/data-discovery-platforms/ into 12 docs\n", 345 | "2023-03-26 17:06:54,184 - Split https://eugeneyan.com/writing/netlify-back-to-github-pages/ into 2 docs\n", 346 | "2023-03-26 17:06:54,185 - Split https://eugeneyan.com/writing/data-science-portfolio-how-why-what/ into 13 docs\n", 347 | "2023-03-26 17:06:54,186 - Split https://eugeneyan.com/writing/how-to-install-scann-on-mac/ into 2 docs\n", 348 | "2023-03-26 17:06:54,186 - Split https://eugeneyan.com/writing/prototyping-to-get-buy-in/ into 6 docs\n", 349 | "2023-03-26 17:06:54,187 - Split https://eugeneyan.com/writing/writing-and-coding/ into 11 docs\n", 350 | "2023-03-26 17:06:54,188 - Split https://eugeneyan.com/writing/recsys2020/ into 11 docs\n", 351 | "2023-03-26 17:06:54,189 - Split https://eugeneyan.com/writing/present/ into 1 docs\n", 352 | "2023-03-26 17:06:54,190 - Split https://eugeneyan.com/writing/favorite-productivity-coffee-routines-habits/ into 12 docs\n", 353 | "2023-03-26 17:06:54,191 - Split https://eugeneyan.com/writing/how-to-accomplish-more-with-less/ into 9 docs\n", 354 | "2023-03-26 17:06:54,191 - Split https://eugeneyan.com/writing/migrating-to-utterances/ into 1 docs\n", 355 | "2023-03-26 17:06:54,192 - Split https://eugeneyan.com/writing/testing-ml/ into 6 docs\n", 356 | "2023-03-26 17:06:54,193 - Split https://eugeneyan.com/writing/mailbag-pdf-fields/ into 3 docs\n", 357 | "2023-03-26 17:06:54,193 - Split https://eugeneyan.com/writing/why-read-papers/ into 5 docs\n", 358 | "2023-03-26 17:06:54,194 - Split https://eugeneyan.com/writing/mailbag-senior-ds/ into 2 docs\n", 359 | "2023-03-26 17:06:54,195 - Split https://eugeneyan.com/writing/beginners-mind/ into 9 docs\n", 360 | "2023-03-26 17:06:54,196 - Split https://eugeneyan.com/writing/nlp-supervised-learning-survey/ into 17 docs\n", 361 | "2023-03-26 17:06:54,197 - Split https://eugeneyan.com/writing/end-to-end-data-science/ into 14 docs\n", 362 | "2023-03-26 17:06:54,198 - Split https://eugeneyan.com/writing/fastapi-html-checkbox-download/ into 2 docs\n", 363 | 
"2023-03-26 17:06:54,199 - Split https://eugeneyan.com/writing/what-i-did-not-learn-about-writing-in-school/ into 8 docs\n", 364 | "2023-03-26 17:06:54,200 - Split https://eugeneyan.com/writing/georgia-tech-omscs-faq/ into 13 docs\n", 365 | "2023-03-26 17:06:54,200 - Split https://eugeneyan.com/writing/how-to-set-up-html-app-with-fastapi-jinja-forms-templates/ into 1 docs\n", 366 | "2023-03-26 17:06:54,201 - Split https://eugeneyan.com/writing/why-you-need-to-follow-up-after-your-data-science-project/ into 8 docs\n", 367 | "2023-03-26 17:06:54,202 - Split https://eugeneyan.com/writing/what-i-do-during-a-data-science-project-to-ensure-success/ into 5 docs\n", 368 | "2023-03-26 17:06:54,203 - Split https://eugeneyan.com/writing/how-to-update-github-profile-readme-automatically/ into 2 docs\n", 369 | "2023-03-26 17:06:54,204 - Split https://eugeneyan.com/writing/when-giving-your-100-gets-you-less-than-85/ into 2 docs\n", 370 | "2023-03-26 17:06:54,205 - Split https://eugeneyan.com/writing/notes-from-sparkai-summit-application-specific/ into 11 docs\n", 371 | "2023-03-26 17:06:54,207 - Split https://eugeneyan.com/writing/notes-from-sparkai-summit-application-agnostic/ into 7 docs\n", 372 | "2023-03-26 17:06:54,239 - Split https://eugeneyan.com/writing/setting-up-python-project-for-automation-and-collaboration/ into 10 docs\n", 373 | "2023-03-26 17:06:54,240 - Split https://eugeneyan.com/writing/mailbag-ds-requirements/ into 2 docs\n", 374 | "2023-03-26 17:06:54,247 - Split https://eugeneyan.com/writing/why-airflow-jobs-one-day-late/ into 2 docs\n", 375 | "2023-03-26 17:06:54,258 - Split https://eugeneyan.com/writing/what-i-do-before-a-data-science-project-to-ensure-success/ into 11 docs\n", 376 | "2023-03-26 17:06:54,260 - Split https://eugeneyan.com/writing/what-i-love-about-scrum-for-data-science/ into 8 docs\n", 377 | "2023-03-26 17:06:54,261 - Split https://eugeneyan.com/writing/how-to-apply-crockers-law-for-feedback-and-growth/ into 6 docs\n", 378 | "2023-03-26 17:06:54,261 - Split https://eugeneyan.com/writing/practical-guide-to-maintaining-machine-learning/ into 12 docs\n", 379 | "2023-03-26 17:06:54,262 - Split https://eugeneyan.com/writing/challenges-after-deploying-machine-learning/ into 12 docs\n", 380 | "2023-03-26 17:06:54,263 - Split https://eugeneyan.com/writing/how-to-write-david-x-sahil/ into 4 docs\n", 381 | "2023-03-26 17:06:54,264 - Split https://eugeneyan.com/writing/evaluating-ideas-at-a-hackathon/ into 6 docs\n", 382 | "2023-03-26 17:06:54,265 - Split https://eugeneyan.com/writing/serendipity-and-accuracy-in-recommender-systems/ into 9 docs\n", 383 | "2023-03-26 17:06:54,266 - Split https://eugeneyan.com/writing/how-to-give-a-kick-ass-data-science-talk/ into 6 docs\n", 384 | "2023-03-26 17:06:54,267 - Split https://eugeneyan.com/writing/commando-soldier-police-and-your-career/ into 5 docs\n", 385 | "2023-03-26 17:06:54,268 - Split https://eugeneyan.com/writing/note-taking-zettelkasten/ into 5 docs\n", 386 | "2023-03-26 17:06:54,269 - Split https://eugeneyan.com/writing/reading-note-taking-writing/ into 5 docs\n", 387 | "2023-03-26 17:06:54,270 - Split https://eugeneyan.com/writing/experimentation-workflow-with-jupyter-papermill-mlflow/ into 5 docs\n", 388 | "2023-03-26 17:06:54,271 - Split https://eugeneyan.com/writing/psych-grad-to-data-science-lead/ into 10 docs\n", 389 | "2023-03-26 17:06:54,272 - Split https://eugeneyan.com/writing/recommender-systems-graph-and-nlp-pytorch/ into 15 docs\n", 390 | "2023-03-26 17:06:54,272 - Split 
https://eugeneyan.com/writing/recommender-systems-baseline-pytorch/ into 11 docs\n", 391 | "2023-03-26 17:06:54,273 - Split https://eugeneyan.com/writing/omscs-cs6200-introduction-to-operating-systems/ into 7 docs\n", 392 | "2023-03-26 17:06:54,274 - Split https://eugeneyan.com/writing/omscs-cs6750-human-computer-interaction/ into 7 docs\n", 393 | "2023-03-26 17:06:54,275 - Split https://eugeneyan.com/writing/goodbye-wordpress-hello-jekyll into 1 docs\n", 394 | "2023-03-26 17:06:54,276 - Split https://eugeneyan.com/writing/omscs-cs6440-intro-to-health-informatics/ into 7 docs\n", 395 | "2023-03-26 17:06:54,277 - Split https://eugeneyan.com/writing/omscs-cs7646-machine-learning-for-trading/ into 7 docs\n", 396 | "2023-03-26 17:06:54,278 - Split https://eugeneyan.com/writing/what-does-a-data-scientist-really-do/ into 5 docs\n", 397 | "2023-03-26 17:06:54,279 - Split https://eugeneyan.com/writing/data-science-and-agile-frameworks-for-effectiveness/ into 13 docs\n", 398 | "2023-03-26 17:06:54,280 - Split https://eugeneyan.com/writing/data-science-and-agile-what-works-and-what-doesnt/ into 10 docs\n", 399 | "2023-03-26 17:06:54,281 - Split https://eugeneyan.com/writing/omscs-cs6601-artificial-intelligence/ into 7 docs\n", 400 | "2023-03-26 17:06:54,282 - Split https://eugeneyan.com/writing/omscs-cs6460-education-technology/ into 8 docs\n", 401 | "2023-03-26 17:06:54,283 - Split https://eugeneyan.com/writing/omscs-cs7642-reinforcement-learning/ into 6 docs\n", 402 | "2023-03-26 17:06:54,284 - Split https://eugeneyan.com/writing/building-a-strong-data-science-team-culture/ into 3 docs\n", 403 | "2023-03-26 17:06:54,286 - Split https://eugeneyan.com/writing/omscs-cs7641-machine-learning/ into 4 docs\n", 404 | "2023-03-26 17:06:54,287 - Split https://eugeneyan.com/writing/my-first-100-days-as-data-science-lead/ into 4 docs\n", 405 | "2023-03-26 17:06:54,288 - Split https://eugeneyan.com/writing/omscs-cs6300-software-development-process/ into 5 docs\n", 406 | "2023-03-26 17:06:54,289 - Split https://eugeneyan.com/writing/how-to-get-started-in-data-science/ into 7 docs\n", 407 | "2023-03-26 17:06:54,290 - Split https://eugeneyan.com/writing/omscs-cs6476-computer-vision/ into 5 docs\n", 408 | "2023-03-26 17:06:54,290 - Split https://eugeneyan.com/writing/one-way-to-help-a-data-science-team-succeed/ into 2 docs\n", 409 | "2023-03-26 17:06:54,291 - Split https://eugeneyan.com/writing/product-categorization-api-part-3-creating-an-api/ into 4 docs\n", 410 | "2023-03-26 17:06:54,292 - Split https://eugeneyan.com/writing/image-search-is-now-live/ into 2 docs\n", 411 | "2023-03-26 17:06:54,293 - Split https://eugeneyan.com/writing/product-categorization-api-part-2-data-preparation/ into 5 docs\n", 412 | "2023-03-26 17:06:54,294 - Split https://eugeneyan.com/writing/image-categorization-is-now-live/ into 1 docs\n", 413 | "2023-03-26 17:06:54,295 - Split https://eugeneyan.com/writing/im-going-back-to-school/ into 4 docs\n", 414 | "2023-03-26 17:06:54,295 - Split https://eugeneyan.com/writing/sortmyskills-is-now-live/ into 3 docs\n", 415 | "2023-03-26 17:06:54,296 - Split https://eugeneyan.com/writing/product-categorization-api-part-1-data-acquisition-and-formatting/ into 5 docs\n", 416 | "2023-03-26 17:06:54,297 - Split https://eugeneyan.com/writing/thoughts-on-functional-programming-in-scala-course-coursera/ into 3 docs\n", 417 | "2023-03-26 17:06:54,298 - Split https://eugeneyan.com/writing/first-post/ into 1 docs\n", 418 | "2023-03-26 17:06:54,298 - Split 
https://eugeneyan.com/writing/datakind-sg-project-accelerator/ into 9 docs\n", 419 | "2023-03-26 17:06:54,300 - Split https://eugeneyan.com/writing/ into 15 docs\n" 420 | ] 421 | } 422 | ], 423 | "source": [ 424 | "text_splitter = CharacterTextSplitter(chunk_size=1500, separator='\\n')\n", 425 | "\n", 426 | "docs, metadata = [], []\n", 427 | "\n", 428 | "for page in pages:\n", 429 | " splits = text_splitter.split_text(page['text'])\n", 430 | " # for split in splits:\n", 431 | " # docs.append(split)\n", 432 | " # metadata.append({'source': split, 'url': page['url']})\n", 433 | " docs.extend(splits)\n", 434 | " metadata.extend([{'source': page['url']}] * len(splits)) # This Q&A chain relies on the url being in the 'source' key\n", 435 | " logger.info(f'Split {page[\"url\"]} into {len(splits)} docs')" 436 | ] 437 | }, 438 | { 439 | "attachments": {}, 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "## Create a FAISS vector store for offline prototyping" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 265, 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadata)\n", 453 | "with open('../data/faiss_store.pkl', 'wb') as f:\n", 454 | " pickle.dump(store, f) # This is a ~10 MB file" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 266, 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "# question = 'Question for eugeneyan.com: Why is writing important?'\n", 464 | "\n", 465 | "# with open('../data/faiss_store.pkl', 'rb') as f:\n", 466 | "# store = pickle.load(f)\n", 467 | "\n", 468 | "# chain = load_qa_with_sources_chain(ChatOpenAI(temperature=0), verbose=False)\n", 469 | "# response = chain({'input_documents': store.similarity_search(question, 4), \n", 470 | "# 'question': question})" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 267, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "# response" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 268, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "# # VectorDBQAWithSourcesChain is DEPRECATED\n", 489 | "\n", 490 | "# question = 'Question for eugeneyan.com: Why is writing important?'\n", 491 | "\n", 492 | "# with open('../data/faiss_store.pkl', 'rb') as f:\n", 493 | "# store = pickle.load(f)\n", 494 | "\n", 495 | "# llm = ChatOpenAI(temperature=0)\n", 496 | "# chain = VectorDBQAWithSourcesChain.from_chain_type(llm, chain_type='stuff', vectorstore=store)\n", 497 | "# response = chain({'question': question})" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": 303, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "# Use this instead of load_qa_with_sources_chain for more control\n", 507 | "question = 'Question for eugeneyan.com: What is content moderation?'\n", 508 | "\n", 509 | "with open('../data/faiss_store.pkl', 'rb') as f:\n", 510 | " store = pickle.load(f)\n", 511 | "\n", 512 | "llm = OpenAI(temperature=0)\n", 513 | "chain = RetrievalQAWithSourcesChain.from_chain_type(llm, chain_type='stuff', retriever=store.as_retriever(), return_source_documents=True)\n", 514 | "response = chain({'question': question}, return_only_outputs=False)" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": 304, 520 | 
"metadata": {}, 521 | "outputs": [ 522 | { 523 | "data": { 524 | "text/plain": [ 525 | "{'question': 'Question for eugeneyan.com: What is content moderation?',\n", 526 | " 'answer': ' Content moderation is the process of learning and inferring the quality of human-generated content such as product reviews, social media posts, and ads. It involves collecting a set of ground truth, using supervised ML models, and applying heuristics and unsupervised models.\\n',\n", 527 | " 'sources': 'https://eugeneyan.com/writing/content-moderation/',\n", 528 | " 'source_documents': [Document(page_content='Content moderation is the process of learning and inferring the quality of human-generated content such as product reviews, social media posts, and ads. How do we know which are irrelevant, incorrect, or downright harmful? A related problem is detecting anomalous activity such as fraudulent transactions or malicious traffic.\\nTo learn more about building robust content moderation systems, I dug into industry papers and tech blogs on classification, anomaly detection, and search relevance. Here are five patterns I observed:\\nRegardless of whether a heuristic-based, supervised, or unsupervised solution is adopted, we typically start with collecting a set of ground truth. This ground truth can then be used to train supervised ML models as well as evaluate the performance of heuristics and unsupervised models. The ground truth also acts as seed data to bootstrap more labels via active or semi-supervised learning.\\nThe most straightforward way to collect ground truth is to ask users. For Stack Exchange to block spam on their sites, a valuable data source is users flagging posts as spam. These flags were then used to identify and act on spammy users by blocking or rate-limiting them. They were also used as training data for machine learning models.', lookup_str='', metadata={'source': 'https://eugeneyan.com/writing/content-moderation/'}, lookup_index=0),\n", 529 | " Document(page_content='Community Engagement:\\nEugene is passionate about fostering a strong data science community. He is an organizer and mentor for the Data Science Global Impact Challenge, a competition that encourages participants to use data science techniques to address pressing global issues. He is also involved in the AI Singapore initiative, which aims to promote AI and data science in Singapore.\\nIn summary, Eugene Yan is an experienced data scientist and machine learning practitioner known for his work in search and recommendation systems. Through his various roles, writing, and speaking engagements,and mentorship, he has contributed significantly to the field of data science and continues to share his knowledge with the community.\\nThe overall theme seems correct, though it got many details wrong, including:\\nEugene Yan, also known as @eugeneyan, is a data scientist, writer, and entrepreneur based in Singapore. He is widely recognized for his contributions to the data science community, including his popular blog, eugeneyan.com, where he shares insights on data science, machine learning, and personal growth.\\nEugene holds a Bachelor’s degree in Electrical and Electronic Engineering from the National University of Singapore, as well as a Master’s degree in Management Science and Engineering from Stanford University. 
After completing his studies, he worked at several tech companies, including Google, where he served as a software engineer.', lookup_str='', metadata={'source': 'https://eugeneyan.com/writing/llm-bio/'}, lookup_index=0),\n", 530 | " Document(page_content='I think a lot about machine learning. And I think a lot about life. Sometimes, the channels mix and I find certain lessons from machine learning applicable to life.\\nHere are seven lessons. While I assume most readers are familiar with these machine learning concepts, I begin each lesson with a brief explanation.\\nWe clean data so our downstream analysis or machine learning is correct. As Randy Au shares, data cleaning isn’t grunt work; it is the work.\\nWe don’t use data without exploring and cleaning it. Similarly, we shouldn’t consume life’s inputs without assessing and filtering them.\\nTake food for example. How often do we reach for what’s widely available and easy to prepare? Until a few years ago, I was happily munching on a bowl of Sugary-Os cereal daily. Now that I’m more aware of my family’s history with diabetes, I’m more selective and pay greater attention to nutritional content. Also, as age catches up and my metabolism slows, I have to make a conscious effort to eat healthier and avoid junk food.\\nSugar is \"good\" for you (source)\\nIt’s the same with content. News outlets and social media rank information based on virality and advertising dollars. “Empty calorie info-bites” that are easy to consume—but don’t enrich us—circulate faster. Misinformation is rampant. Some content is in poor taste and downright toxic, and attempts to engage don’t end well. For sanity’s sake, just filter it out. Curate your news sources and who you follow on social media.', lookup_str='', metadata={'source': 'https://eugeneyan.com/writing/life-lessons-from-machine-learning/'}, lookup_index=0),\n", 531 | " Document(page_content='Stack Exchange has several layers of defense against spam. The first line of defense is triggered when a spammer posts too often to be humanly possible. The spammer is hit with an HTTP 429 Error (Too Many Requests) and blocked or rate-limited.\\nThe second line of defense is based on heuristics. Specifically, they run posts through an “unholy amount of regular expressions” and some rules. If a post is caught, it is sent to users to check and potentially flag it as spam. If six users flag it as spam (six flags lol), the post is marked as spam and the user is blocked, rate-limited, or prevented from posting.\\nThe final line of defense is a (machine learning?) system that identifies posts most likely to be spam. They shadow-tested it and found it to be extremely accurate. It was catching almost all of the blatantly obvious spam. Eventually, this system was armed to cast three automatic flags and it drastically reduced the time to spam post deletion.\\nTime till spam deletion drops from no auto-flag (red) to 1 auto-flag (green) to 3 auto-flags (orange)\\nCloudflare also combines heuristics and machine learning (and other techniques) to identify bot traffic. 
They shared a comparison: If machine learning inference requires ~50ms, then hundreds of heuristics can be applied at ~20ms.', lookup_str='', metadata={'source': 'https://eugeneyan.com/writing/content-moderation/'}, lookup_index=0)]}" 532 | ] 533 | }, 534 | "execution_count": 304, 535 | "metadata": {}, 536 | "output_type": "execute_result" 537 | } 538 | ], 539 | "source": [ 540 | "response" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": 305, 546 | "metadata": {}, 547 | "outputs": [ 548 | { 549 | "name": "stderr", 550 | "output_type": "stream", 551 | "text": [ 552 | "2023-03-26 17:14:41,484 - Question: Question for eugeneyan.com: What is content moderation?\n", 553 | "2023-03-26 17:14:41,486 - Answer: Content moderation is the process of learning and inferring the quality of human-generated content such as product reviews, social media posts, and ads. It involves collecting a set of ground truth, using supervised ML models, and applying heuristics and unsupervised models.\n", 554 | "\n", 555 | "2023-03-26 17:14:41,488 - Sources: https://eugeneyan.com/writing/content-moderation/\n", 556 | "2023-03-26 17:14:41,489 - URL: https://eugeneyan.com/writing/content-moderation/\n", 557 | "\n", 558 | "2023-03-26 17:14:41,491 - Source: Content moderation is the process of learning and inferring the quality of human-generated content such as product reviews, social media posts, and ads. How do we know which are irrelevant, incorrect, or downright harmful? A related problem is detecting anomalous activity such as fraudulent transactions or malicious traffic.\n", 559 | "To learn more about building robust content moderation systems, I dug into industry papers and tech blogs on classification, anomaly detection, and search relevance. Here are five patterns I observed:\n", 560 | "Regardless of whether a heuristic-based, supervised, or unsupervised solution is adopted, we typically start with collecting a set of ground truth. This ground truth can then be used to train supervised ML models as well as evaluate the performance of heuristics and unsupervised models. The ground truth also acts as seed data to bootstrap more labels via active or semi-supervised learning.\n", 561 | "The most straightforward way to collect ground truth is to ask users. For Stack Exchange to block spam on their sites, a valuable data source is users flagging posts as spam. These flags were then used to identify and act on spammy users by blocking or rate-limiting them. They were also used as training data for machine learning models.\n", 562 | "\n", 563 | "2023-03-26 17:14:41,492 - =====================\n", 564 | "2023-03-26 17:14:41,493 - URL: https://eugeneyan.com/writing/content-moderation/\n", 565 | "\n", 566 | "2023-03-26 17:14:41,494 - Source: Stack Exchange has several layers of defense against spam. The first line of defense is triggered when a spammer posts too often to be humanly possible. The spammer is hit with an HTTP 429 Error (Too Many Requests) and blocked or rate-limited.\n", 567 | "The second line of defense is based on heuristics. Specifically, they run posts through an “unholy amount of regular expressions” and some rules. If a post is caught, it is sent to users to check and potentially flag it as spam. If six users flag it as spam (six flags lol), the post is marked as spam and the user is blocked, rate-limited, or prevented from posting.\n", 568 | "The final line of defense is a (machine learning?) system that identifies posts most likely to be spam. 
They shadow-tested it and found it to be extremely accurate. It was catching almost all of the blatantly obvious spam. Eventually, this system was armed to cast three automatic flags and it drastically reduced the time to spam post deletion.\n", 569 | "Time till spam deletion drops from no auto-flag (red) to 1 auto-flag (green) to 3 auto-flags (orange)\n", 570 | "Cloudflare also combines heuristics and machine learning (and other techniques) to identify bot traffic. They shared a comparison: If machine learning inference requires ~50ms, then hundreds of heuristics can be applied at ~20ms.\n", 571 | "\n", 572 | "2023-03-26 17:14:41,495 - =====================\n" 573 | ] 574 | } 575 | ], 576 | "source": [ 577 | "logger.info(f'Question: {response[\"question\"]}')\n", 578 | "logger.info(f'Answer: {response[\"answer\"]}')\n", 579 | "logger.info(f'Sources: {response[\"sources\"]}')\n", 580 | "\n", 581 | "sources = set(response['sources'].split(', '))\n", 582 | "\n", 583 | "for doc in response['source_documents']:\n", 584 | " if doc.metadata[\"source\"] in sources:\n", 585 | " logger.info(f'URL: {doc.metadata[\"source\"]}\\n')\n", 586 | " logger.info(f'Source: {doc.page_content}\\n')\n", 587 | " logger.info('=====================')" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 306, 593 | "metadata": {}, 594 | "outputs": [ 595 | { 596 | "data": { 597 | "text/plain": [ 598 | "['https://eugeneyan.com/writing/content-moderation/']" 599 | ] 600 | }, 601 | "execution_count": 306, 602 | "metadata": {}, 603 | "output_type": "execute_result" 604 | } 605 | ], 606 | "source": [ 607 | "response['sources'].split(', ')" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": 149, 613 | "metadata": {}, 614 | "outputs": [ 615 | { 616 | "data": { 617 | "text/plain": [ 618 | "Document(page_content='How to Write: Advice from David Perell and Sahil Lavingia\\neugeneyan\\nStart Here\\nWriting\\nSpeaking\\nNewsletter\\nAbout\\nHow to Write: Advice from David Perell and Sahil Lavingia\\n[\\nwriting\\n]\\n· 4 min read\\nWriting is a superpower. Telepathy to be exact.\\nThink about it—through writing, I broadcast ideas from my mind to yours. Across time. Across space. (Yes, the internet plays a big role, but let’s focus on writing.) The more effective your writing, the stronger your telepathic ability.\\nWhy write about writing on this site?\\nWriting is essential for effective data science. Good writing means you get buy-in on ideas, your methodology and experiments can be replicated, and readers understand enough to give feedback. Poor writing gets you zilch (and snores). Business folk have enough trouble understanding data geeks as it is—don’t make it harder with your writing.\\nWriting is an important way to learn. (The other important way is learning.) When writing, you have to organize ideas and prune the unnecessary. Along the way, you find gaps in your understanding, which leads you to more research and learning.\\nWriting becomes more important as your career progresses. Everything is writing—emails, specs, documentation, articles. Code too. As you become more senior, you write more docs, less code. 
Seniors contribute by designing systems and communicating them to teams to implement.', lookup_str='', metadata={'source': 'https://eugeneyan.com/writing/how-to-write-david-x-sahil/'}, lookup_index=0)" 619 | ] 620 | }, 621 | "execution_count": 149, 622 | "metadata": {}, 623 | "output_type": "execute_result" 624 | } 625 | ], 626 | "source": [ 627 | "doc" 628 | ] 629 | }, 630 | { 631 | "attachments": {}, 632 | "cell_type": "markdown", 633 | "metadata": {}, 634 | "source": [ 635 | "## Using pinecone" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": 307, 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [ 644 | "pinecone.init(api_key=os.getenv('PINECONE_API_KEY'), environment=PINECONE_ENV)" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": 311, 650 | "metadata": {}, 651 | "outputs": [ 652 | { 653 | "data": { 654 | "text/plain": [ 655 | "{'dimension': 1536,\n", 656 | " 'index_fullness': 0.0,\n", 657 | " 'namespaces': {},\n", 658 | " 'total_vector_count': 0}" 659 | ] 660 | }, 661 | "execution_count": 311, 662 | "metadata": {}, 663 | "output_type": "execute_result" 664 | } 665 | ], 666 | "source": [ 667 | "index_name = 'ask-ey'\n", 668 | "index = pinecone.Index(index_name)\n", 669 | "\n", 670 | "# Delete and recreate index\n", 671 | "# pinecone.delete_index(index_name)\n", 672 | "pinecone.create_index(index_name, dimension=1536, metric='cosine', pod_type='p2.x1')\n", 673 | "index.describe_index_stats()" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": 313, 679 | "metadata": {}, 680 | "outputs": [], 681 | "source": [ 682 | "# # Initialize with small set of data - \n", 683 | "# p = Pinecone.from_texts(docs[0:2], \n", 684 | "# embeddings, \n", 685 | "# index_name=index_name, \n", 686 | "# metadatas=metadata[0:2])\n", 687 | "\n", 688 | "# index.describe_index_stats()" 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": 314, 694 | "metadata": {}, 695 | "outputs": [ 696 | { 697 | "data": { 698 | "text/plain": [ 699 | "{'dimension': 1536,\n", 700 | " 'index_fullness': 0.0,\n", 701 | " 'namespaces': {},\n", 702 | " 'total_vector_count': 0}" 703 | ] 704 | }, 705 | "execution_count": 314, 706 | "metadata": {}, 707 | "output_type": "execute_result" 708 | } 709 | ], 710 | "source": [ 711 | "embeddings = OpenAIEmbeddings()\n", 712 | "\n", 713 | "# Load existing pinecone index\n", 714 | "store = Pinecone.from_existing_index(index_name=index_name, embedding=embeddings)\n", 715 | "index.describe_index_stats()" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": 315, 721 | "metadata": {}, 722 | "outputs": [ 723 | { 724 | "name": "stderr", 725 | "output_type": "stream", 726 | "text": [ 727 | "2023-03-26 17:21:12,696 - Adding chunk 1 of 10 (0 to 100))\n" 728 | ] 729 | }, 730 | { 731 | "data": { 732 | "application/vnd.jupyter.widget-view+json": { 733 | "model_id": "b5874d629b824f268bc671c93796fb3c", 734 | "version_major": 2, 735 | "version_minor": 0 736 | }, 737 | "text/plain": [ 738 | "Upserted vectors: 0%| | 0/100 [00:00 str: 45 | 46 | llm = ChatOpenAI(temperature=temperature, model_name=model) 47 | chain = RetrievalQAWithSourcesChain.from_chain_type(llm, chain_type='stuff', 48 | retriever=store_ey.as_retriever(), 49 | return_source_documents=True) 50 | 51 | response = chain({'question': question}) 52 | pretty_response = prettify_qa_response(response) 53 | 54 | return pretty_response 55 | 56 | 57 | @timer 58 | def qa_board(question: str, temperature: float = None, model: str 
= QA_MODEL) -> str: 59 | 60 | llm = ChatOpenAI(temperature=temperature, model_name=model) 61 | chain = RetrievalQAWithSourcesChain.from_chain_type(llm, chain_type='stuff', 62 | retriever=store_board.as_retriever(), 63 | return_source_documents=True) 64 | 65 | response = chain({'question': question}) 66 | pretty_response = prettify_qa_response(response) 67 | 68 | return pretty_response 69 | -------------------------------------------------------------------------------- /railway.toml: -------------------------------------------------------------------------------- 1 | [build] 2 | builder = "DOCKERFILE" 3 | dockerfilePath = "Dockerfile" 4 | 5 | [deploy] 6 | restartPolicyType = "ON_FAILURE" 7 | restartPolicyMaxRetries = 10 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | aiosignal==1.3.1 3 | anyio==3.6.2 4 | appnope==0.1.3 5 | argon2-cffi==21.3.0 6 | argon2-cffi-bindings==21.2.0 7 | arrow==1.2.3 8 | asttokens==2.2.1 9 | async-timeout==4.0.2 10 | attrs==22.2.0 11 | autopep8==2.0.2 12 | backcall==0.2.0 13 | beautifulsoup4==4.12.0 14 | bleach==6.0.0 15 | cachetools==5.3.0 16 | certifi==2024.7.4 17 | cffi==1.15.1 18 | charset-normalizer==3.1.0 19 | comm==0.1.3 20 | dataclasses-json==0.5.7 21 | debugpy==1.6.6 22 | decorator==5.1.1 23 | defusedxml==0.7.1 24 | discord-py-interactions==4.4.0 25 | discord-py-slash-command==4.2.1 26 | discord.py==2.2.2 27 | dnspython==2.3.0 28 | executing==1.2.0 29 | faiss-cpu==1.7.3 30 | fastjsonschema==2.16.3 31 | fqdn==1.5.1 32 | frozenlist==1.3.3 33 | google-api-core==2.11.0 34 | google-api-python-client==2.82.0 35 | google-auth==2.16.3 36 | google-auth-httplib2==0.1.0 37 | google-search-results==2.4.2 38 | googleapis-common-protos==1.59.0 39 | greenlet==2.0.2 40 | httplib2==0.22.0 41 | idna==3.4 42 | importlib-metadata==6.1.0 43 | install==1.3.5 44 | ipykernel==6.22.0 45 | ipython==8.11.0 46 | ipython-genutils==0.2.0 47 | ipywidgets==8.0.5 48 | isoduration==20.11.0 49 | jedi==0.18.2 50 | Jinja2==3.1.2 51 | jsonpointer==2.3 52 | jsonschema==4.17.3 53 | jupyter==1.0.0 54 | jupyter-console==6.6.3 55 | jupyter-events==0.6.3 56 | jupyter_client==8.1.0 57 | jupyter_core==5.3.0 58 | jupyter_server==2.5.0 59 | jupyter_server_terminals==0.4.4 60 | jupyterlab-pygments==0.2.2 61 | jupyterlab-widgets==3.0.6 62 | langchain==0.0.325 63 | loguru==0.6.0 64 | MarkupSafe==2.1.2 65 | marshmallow==3.19.0 66 | marshmallow-enum==1.5.1 67 | matplotlib-inline==0.1.6 68 | mistune==2.0.5 69 | multidict==6.0.4 70 | mypy-extensions==1.0.0 71 | nbclassic==0.5.3 72 | nbclient==0.7.2 73 | nbconvert==7.2.10 74 | nbformat==5.8.0 75 | nest-asyncio==1.5.6 76 | notebook==6.5.3 77 | notebook_shim==0.2.2 78 | numpy==1.24.2 79 | openai==0.27.2 80 | packaging==23.0 81 | pandas==1.5.3 82 | pandocfilters==1.5.0 83 | parso==0.8.3 84 | pexpect==4.8.0 85 | pickleshare==0.7.5 86 | pinecone-client==2.2.1 87 | platformdirs==3.1.1 88 | prometheus-client==0.16.0 89 | prompt-toolkit==3.0.38 90 | protobuf==4.22.1 91 | psutil==5.9.4 92 | ptyprocess==0.7.0 93 | pure-eval==0.2.2 94 | pyasn1==0.4.8 95 | pyasn1-modules==0.2.8 96 | pycodestyle==2.10.0 97 | pycparser==2.21 98 | pydantic==1.10.7 99 | Pygments==2.14.0 100 | pyparsing==3.0.9 101 | pyrsistent==0.19.3 102 | python-dateutil==2.8.2 103 | python-dotenv==1.0.0 104 | python-json-logger==2.0.7 105 | pytz==2022.7.1 106 | PyYAML==6.0 107 | pyzmq==25.0.2 108 | qtconsole==5.4.1 109 | QtPy==2.3.0 110 | regex==2023.3.23 
111 | requests==2.28.2 112 | rfc3339-validator==0.1.4 113 | rfc3986-validator==0.1.1 114 | rsa==4.9 115 | Send2Trash==1.8.0 116 | six==1.16.0 117 | sniffio==1.3.0 118 | soupsieve==2.4 119 | SQLAlchemy==1.4.47 120 | stack-data==0.6.2 121 | tenacity==8.2.2 122 | terminado==0.17.1 123 | tiktoken==0.3.2 124 | tinycss2==1.2.1 125 | tomli==2.0.1 126 | tornado==6.2 127 | tqdm==4.65.0 128 | traitlets==5.9.0 129 | typing-inspect==0.8.0 130 | typing_extensions==4.5.0 131 | uri-template==1.2.0 132 | uritemplate==4.1.1 133 | urllib3==1.26.15 134 | wcwidth==0.2.6 135 | webcolors==1.12 136 | webencodings==0.5.1 137 | websocket-client==1.5.1 138 | widgetsnbextension==4.0.6 139 | xmltodict==0.13.0 140 | yarl==1.8.2 141 | zipp==3.15.0 142 | -------------------------------------------------------------------------------- /runtime.txt: -------------------------------------------------------------------------------- 1 | # For railway.app 2 | python-3.9.7 -------------------------------------------------------------------------------- /search.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for searching the internet. 3 | """ 4 | from dotenv import load_dotenv 5 | from langchain import LLMChain 6 | from langchain.agents import AgentExecutor, Tool, ZeroShotAgent 7 | from langchain.chat_models import ChatOpenAI 8 | from langchain.utilities import GoogleSearchAPIWrapper 9 | 10 | from config import SEARCH_MODEL 11 | from logger import logger 12 | from utils import prettify_agent_response, timer 13 | 14 | # Create tools 15 | load_dotenv() 16 | search = GoogleSearchAPIWrapper() 17 | TOOLS = [ 18 | Tool( 19 | name="Search", 20 | func=search.run, 21 | description="Useful for when you need to answer questions about current events" 22 | ) 23 | ] 24 | TOOL_STRINGS = "\n".join( 25 | [f"{tool.name}: {tool.description}" for tool in TOOLS]) 26 | TOOL_NAMES = ", ".join([tool.name for tool in TOOLS]) 27 | 28 | PREFIX = """Please answer the following questions as best you can. You have access to the following tools:""" 29 | FORMAT_INSTRUCTIONS = """Please use the following format: 30 | 31 | Question: the input question you must answer 32 | Thought: you should always think about what to do 33 | Action: the action to take, should be one of [{tool_names}] 34 | Action Input: the input to the action 35 | Observation: the result of the action 36 | ... (this Thought/Action/Action Input/Observation can repeat N times) 37 | Thought: I now know the final answer 38 | Final Answer: the final answer to the original input question""" 39 | SUFFIX = """Please begin! 40 | 41 | Question: {input} 42 | Thought: {agent_scratchpad}""" 43 | 44 | FORMAT_INSTRUCTIONS = FORMAT_INSTRUCTIONS.format(tool_names=TOOL_NAMES) 45 | 46 | 47 | # Search agent based on a zero-shot ReAct prompt 48 | @timer 49 | def search_agent(question: str, temperature: float = None, model: str = SEARCH_MODEL) -> str: 50 | """ 51 | Calls OpenAI API and searches the web to find the best answer to a question.
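The agent runs a ReAct-style loop, alternating Thought / Action / Action Input / Observation steps with the Google Search tool until it emits a Final Answer (or hits max_iterations=10).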
52 | """ 53 | # Write prompt and create zero-shot agent 54 | prompt = ZeroShotAgent.create_prompt( 55 | TOOLS, 56 | prefix=PREFIX, 57 | suffix=SUFFIX, 58 | format_instructions=FORMAT_INSTRUCTIONS, 59 | input_variables=['input', 'agent_scratchpad'] 60 | ) 61 | logger.info(prompt.template) 62 | 63 | # Create LLM and call API 64 | llm = ChatOpenAI(temperature=temperature, model_name=model) 65 | llm_chain = LLMChain(llm=llm, prompt=prompt) 66 | 67 | # Create agent with tools 68 | agent = ZeroShotAgent(llm_chain=llm_chain, 69 | tools=TOOLS, tool_names=TOOL_NAMES) 70 | agent_executor = AgentExecutor.from_agent_and_tools(agent=agent, tools=TOOLS, max_iterations=10, 71 | verbose=True, return_intermediate_steps=True) 72 | 73 | response = agent_executor({'input': question}) 74 | pretty_response = prettify_agent_response(response) 75 | 76 | return pretty_response 77 | -------------------------------------------------------------------------------- /sql.py: -------------------------------------------------------------------------------- 1 | """ 2 | Agent that can query a database 3 | Docs: https://langchain.readthedocs.io/en/latest/modules/chains/examples/sqlite.html 4 | Dataset: https://www.kaggle.com/datasets/jealousleopard/goodreadsbooks 5 | """ 6 | from dotenv import load_dotenv 7 | from langchain import LLMChain, OpenAI, SQLDatabase, SQLDatabaseChain 8 | from langchain.agents import (AgentExecutor, Tool, ZeroShotAgent, 9 | create_sql_agent) 10 | from langchain.agents.agent_toolkits import SQLDatabaseToolkit 11 | from langchain.chat_models import ChatOpenAI 12 | from langchain.llms.openai import OpenAI 13 | from langchain.sql_database import SQLDatabase 14 | 15 | from config import SQL_MODEL 16 | from logger import logger 17 | from utils import prettify_agent_response, prettify_chain_response, timer 18 | 19 | # Load env 20 | load_dotenv() 21 | DB = SQLDatabase.from_uri('sqlite:///data/books.db') 22 | TOOLKIT = SQLDatabaseToolkit(db=DB) 23 | TOOLS = TOOLKIT.get_tools() 24 | TOOL_NAMES = [tool.name for tool in TOOLS] 25 | 26 | 27 | PREFIX = """You are an agent designed to interact with a SQL database. 28 | Given an input question, create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer. 29 | Unless the user specifies a specific number of examples they wish to obtain, always limit your query to at most {top_k} results. 30 | You can order the results by a relevant column to return the most interesting examples in the database. 31 | Never query for all the columns from a specific table, only ask for a the few relevant columns given the question. 32 | You have access to tools for interacting with the database. 33 | Only use the below tools. Only use the information returned by the below tools to construct your final answer. 34 | You MUST double check your query before executing it. If you get an error while executing a query, rewrite the query and try again. 35 | 36 | DO NOT make any DML statements (INSERT, UPDATE, DELETE, DROP etc.) to the database. 37 | 38 | If the question does not seem related to the database, just return "I don't know" as the answer. 39 | """ 40 | FORMAT_INSTRUCTIONS = """Use the following format: 41 | 42 | Question: the input question you must answer 43 | Thought: you should always think about what to do 44 | Action: the action to take, should be one of [{tool_names}] 45 | Action Input: the input to the action 46 | Observation: the result of the action 47 | ... 
(this Thought/Action/Action Input/Observation can repeat N times) 48 | Thought: I now know the final answer 49 | Final Answer: the final answer to the original input question""" 50 | SUFFIX = """Begin! 51 | 52 | Question: {input} 53 | Thought: I should look at the tables in the database to see what I can query. 54 | {agent_scratchpad}""" 55 | 56 | 57 | # Defines agent to query a database 58 | @timer 59 | def sql_agent(query: str, temperature: float = 0, top_k: int = 10, model: str = SQL_MODEL) -> str: 60 | """ 61 | Create an agent that can query a database. 62 | """ 63 | # Write prompt and create zero-shot agent 64 | prompt = ZeroShotAgent.create_prompt( 65 | tools=TOOLKIT.get_tools(), 66 | prefix=PREFIX.format(dialect=TOOLKIT.dialect, top_k=top_k), 67 | suffix=SUFFIX, 68 | format_instructions=FORMAT_INSTRUCTIONS, 69 | input_variables=['input', 'agent_scratchpad'] 70 | ) 71 | logger.info(prompt.template) 72 | 73 | # Create LLM and call API 74 | llm = ChatOpenAI(temperature=temperature, model_name=model) 75 | llm_chain = LLMChain(llm=llm, prompt=prompt) 76 | 77 | # Create agent with tools 78 | agent = ZeroShotAgent(llm_chain=llm_chain, 79 | tools=TOOLS, tool_names=TOOL_NAMES) 80 | agent_executor = AgentExecutor.from_agent_and_tools(agent=agent, tools=TOOLS, max_iterations=10, 81 | verbose=True, return_intermediate_steps=True) 82 | 83 | response = agent_executor({'input': query}) 84 | pretty_response = prettify_agent_response(response) 85 | 86 | return pretty_response 87 | 88 | 89 | # Create chain to query database 90 | @timer 91 | def sql_chain(query: str, temperature: float = 0, model: str = SQL_MODEL) -> str: 92 | """ 93 | Create a chain that can query a database. 94 | """ 95 | llm = OpenAI(temperature=temperature, model_name=model) 96 | db_chain = SQLDatabaseChain( 97 | llm=llm, database=DB, verbose=True, return_intermediate_steps=True) 98 | 99 | response = db_chain(query) 100 | logger.info(f'Response: {response}') 101 | pretty_response = prettify_chain_response(response) 102 | return pretty_response 103 | -------------------------------------------------------------------------------- /summarize.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for summarizing text. 3 | """ 4 | import re 5 | from typing import List 6 | 7 | import requests 8 | import tiktoken 9 | from bs4 import BeautifulSoup 10 | from langchain import OpenAI 11 | from langchain.chains.summarize import load_summarize_chain 12 | from langchain.docstore.document import Document 13 | from langchain.prompts import (ChatPromptTemplate, HumanMessagePromptTemplate, 14 | SystemMessagePromptTemplate) 15 | from langchain.text_splitter import TokenTextSplitter 16 | 17 | from config import SUMMARY_MAX_TOKENS, SUMMARY_MODEL, SUMMARY_TOKENIZER 18 | from logger import logger 19 | from utils import timer 20 | 21 | ENC = tiktoken.encoding_for_model(SUMMARY_MODEL) 22 | TEXT_SPLITTER = TokenTextSplitter(encoding_name=SUMMARY_TOKENIZER) 23 | 24 | 25 | # Count the number of tokens in text 26 | def num_tokens(text: str) -> int: 27 | """ 28 | Count the number of tokens in text. 29 | """ 30 | # Reuse the module-level encoder instead of rebuilding it on every call 31 | return len(ENC.encode(text)) 32 | 33 | 34 | # Get text from url 35 | def get_text_from_url(url: str) -> str: 36 | """ 37 | Get text from url. 
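Fetches the page, extracts the visible text with BeautifulSoup, collapses consecutive newlines, and trims the result to SUMMARY_MAX_TOKENS tokens so the downstream prompt stays within the model's context window.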
38 | """ 39 | response = requests.get(url) 40 | soup = BeautifulSoup(response.text, 'html.parser') 41 | text = re.sub(r'\n+', '\n', soup.get_text()) # Remove consecutive newlines 42 | 43 | # Trim text to 1800 tokens 44 | trimmed_text = ENC.decode((ENC.encode(text))[:SUMMARY_MAX_TOKENS]) 45 | 46 | logger.info( 47 | f'{num_tokens(trimmed_text)}/{num_tokens(text)} tokens from {url}') 48 | return trimmed_text 49 | 50 | 51 | # Get docs from text 52 | def get_docs_from_text(text: str) -> list: 53 | """ 54 | Get docs from text. 55 | """ 56 | texts = TEXT_SPLITTER.split_text(text) 57 | docs = [Document(page_content=t) for t in texts] 58 | logger.info(f'Created {len(docs):,} out of {len(texts):,} total docs') 59 | return docs 60 | 61 | 62 | # Remove empty lines from text 63 | def remove_empty_lines(text: str) -> str: 64 | """ 65 | Remove empty lines from text. 66 | """ 67 | return '\n'.join([line for line in text.splitlines() if line.strip()]) 68 | 69 | 70 | # Calls OpenAI API and returns summary of text 71 | def summarize(docs: List[str], temperature: float, model: str) -> str: 72 | """ 73 | Calls OpenAI API and returns summary of text. 74 | """ 75 | # Write prompt 76 | system_msg = """You are a teacher who summarizes documents into easily digestible bullet points.""" 77 | human_msg = """Summarize the following text in bullet points: 78 | 79 | {text} 80 | 81 | Concise summary in bullet points:""" 82 | 83 | messages = [ 84 | SystemMessagePromptTemplate.from_template(system_msg), 85 | HumanMessagePromptTemplate.from_template(human_msg) 86 | ] 87 | 88 | prompt = ChatPromptTemplate.from_messages(messages) 89 | logger.info(f'Prompt: {prompt}, temperature: {temperature}') 90 | 91 | # Create LLM and call API 92 | llm = OpenAI(temperature=temperature, model_name=model) 93 | chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt) 94 | response = chain.run(docs) 95 | logger.info( 96 | f'Results received: {response} ({num_tokens(response)} tokens), temperature: {temperature}') 97 | 98 | return response 99 | 100 | 101 | # Calls OpenAI API and explains the text like the user is a five-year old 102 | def eli5(docs: List[str], temperature: float, model: str) -> str: 103 | """ 104 | Calls OpenAI API and returns explaination for a five year old 105 | """ 106 | # Write prompt 107 | system_msg = """You are a teacher who explains documents to a five-year old.""" 108 | human_msg = """Explain the following text to a five-year old: 109 | 110 | {text} 111 | 112 | Concise explanation:""" 113 | 114 | messages = [ 115 | SystemMessagePromptTemplate.from_template(system_msg), 116 | HumanMessagePromptTemplate.from_template(human_msg) 117 | ] 118 | 119 | prompt = ChatPromptTemplate.from_messages(messages) 120 | logger.info(f'Prompt: {prompt}, temperature: {temperature}') 121 | 122 | # Create LLM and call API 123 | llm = OpenAI(temperature=temperature, model_name=model) 124 | chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt) 125 | response = chain.run(docs) 126 | logger.info( 127 | f'Results received: {response} ({num_tokens(response)} tokens), temperature: {temperature}') 128 | 129 | return response 130 | 131 | 132 | # Summarize text from url 133 | @timer 134 | def summarize_url(url: str, temperature: float = None, model: str = SUMMARY_MODEL) -> str: 135 | """ 136 | Calls OpenAI API and returns summary of text. 
137 | """ 138 | logger.info( 139 | f'summarize: {url} (temperature: {temperature}, model: {model})') 140 | # Get text from url 141 | text = get_text_from_url(url) 142 | docs = get_docs_from_text(text) 143 | response = summarize(docs, temperature, model) 144 | pretty_response = remove_empty_lines(response) 145 | 146 | return pretty_response 147 | 148 | 149 | # Explain like I'm five from url 150 | @timer 151 | def eli5_url(url: str, temperature: float = None, model: str = SUMMARY_MODEL) -> str: 152 | """ 153 | Calls OpenAI API and explains the text like the user is a five-year old. 154 | """ 155 | logger.info(f'eli5: {url} (temperature: {temperature}, model: {model})') 156 | # Get text from url 157 | text = get_text_from_url(url) 158 | docs = get_docs_from_text(text) 159 | response = eli5(docs, temperature, model) 160 | pretty_response = remove_empty_lines(response) 161 | 162 | return pretty_response 163 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions for the project. 3 | """ 4 | import re 5 | from functools import wraps 6 | from time import perf_counter 7 | from typing import Callable 8 | 9 | 10 | # Timer decorator 11 | def timer(func: Callable) -> Callable: 12 | """ 13 | Timer decorator. 14 | """ 15 | @wraps(func) 16 | def wrapper_timer(*args, **kwargs): 17 | start_time = perf_counter() 18 | value = func(*args, **kwargs) 19 | end_time = perf_counter() 20 | run_time = end_time - start_time 21 | print(f'Finished {func.__name__!r} in {run_time:.2f} secs') 22 | return value, run_time 23 | 24 | return wrapper_timer 25 | 26 | 27 | # Prettify langchain agent response 28 | def prettify_agent_response(response: dict, input_key: str = 'input', output_key: str = 'output') -> str: 29 | """ 30 | Pretty print the response from the agent. 31 | """ 32 | pretty_result = '' 33 | 34 | pretty_result += f'**Input:** {response[input_key]}\n\n' 35 | 36 | for step in response['intermediate_steps']: 37 | action = step[0] 38 | result = step[1] 39 | # pretty_result += f'**Tool**: {action.tool} | **Input**: "{action.tool_input}"\n' 40 | pretty_result += f'**Thought:** {action.log}\n' 41 | pretty_result += f'**Observation:** _{result}_\n\n' 42 | 43 | pretty_result += f'\n**Output:** {response[output_key]}' 44 | 45 | return pretty_result 46 | 47 | 48 | # Prettify langchain agent response 49 | def prettify_chain_response(response: dict, input_key: str = 'query', output_key: str = 'result') -> str: 50 | """ 51 | Pretty print the response from the chain. 52 | """ 53 | pretty_result = '' 54 | 55 | pretty_result += f'**Input:** {response[input_key]}\n\n' 56 | 57 | if 'intermediate_steps' in response: 58 | for i, step in enumerate(response['intermediate_steps']): 59 | pretty_result += f'**Step {i}:** {step}\n\n' 60 | 61 | pretty_result += f'\n**Output:** {response[output_key]}' 62 | 63 | return pretty_result 64 | 65 | 66 | # Wrap urls in <> to prevent discord from embedding them 67 | def wrap_urls(text): 68 | return re.sub(r'(https?://\S+)', r'<\1>', text) 69 | 70 | 71 | # Prettify langchain Q&A chain response 72 | def prettify_qa_response(response: dict, question_key: str = 'question', answer_key: str = 'answer') -> str: 73 | """ 74 | Pretty print the response from the Q&A chain. 
75 | """ 76 | result_list = [] 77 | pretty_qa = '' 78 | 79 | pretty_qa += f'**Question:** {response[question_key]}\n\n' 80 | pretty_qa += f'**Answer:** {response[answer_key]}\n' 81 | pretty_qa += f'**Sources:** {wrap_urls(response["sources"])}' 82 | result_list.append(pretty_qa) 83 | 84 | for doc in response['source_documents']: 85 | pretty_source = '' 86 | pretty_source += f'**Source:** {wrap_urls(doc.page_content)}\n\n' 87 | pretty_source += f'**URL:** {wrap_urls(doc.metadata["source"])}' 88 | result_list.append(pretty_source) 89 | 90 | return result_list 91 | --------------------------------------------------------------------------------