├── yt_pic.png
├── evals_graph.png
├── README.md
├── test_unit_test_eval.py
├── unit_test_eval.py
├── LICENSE
├── llama3_research_agent.ipynb
└── EVAL_Testing.ipynb


/yt_pic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ALucek/evals-deepdive/main/yt_pic.png


--------------------------------------------------------------------------------
/evals_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ALucek/evals-deepdive/main/evals_graph.png


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Code From My GenAI Eval Deep Dive Video!
2 | 
3 | [![Watch the GenAI Eval Deep Dive video on YouTube](yt_pic.png)](https://youtu.be/-6D9U5CDKgE)
4 | Click play to watch :)
5 | 


--------------------------------------------------------------------------------
/test_unit_test_eval.py:
--------------------------------------------------------------------------------
 1 | from unit_test_eval import generate
 2 | import pytest
 3 | from langsmith import unit
 4 | from langsmith import expect
 5 | 
 6 | # Basic assertion test
 7 | @unit
 8 | def test_generate():
 9 |     query = "How do I write a for loop with range 10 in Python?"
10 |     output = generate(query)
11 |     assert output == "for i in range(10):"
12 | 
13 | # Using langsmith's expect with .to_contain() method
14 | @unit
15 | def test_generate_2():
16 |     query = "How do I make a list in python?"
17 |     output = generate(query)
18 |     expect(output).to_contain("[]")
19 | 
20 | # Using langsmith's expect with .embedding_distance() and .edit_distance() for fuzzy matching
21 | @unit
22 | def test_generate_3():
23 |     query = "How do I write hello world in Python?"
24 |     reference = 'print("Hello, World!")'
25 |     output = generate(query)
26 |     # Embedding Distance
27 |     expect.embedding_distance(prediction=output, reference=reference).to_be_less_than(0.5)
28 |     # Damerau-Levenshtein Edit Distance
29 |     expect.edit_distance(prediction=output, reference=reference)


--------------------------------------------------------------------------------
/unit_test_eval.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from langchain.prompts import PromptTemplate
 3 | from langchain_core.output_parsers import StrOutputParser
 4 | from langchain_community.chat_models import ChatOllama
 5 | 
 6 | os.environ['LANGCHAIN_TRACING_V2'] = 'true'
 7 | os.environ["LANGCHAIN_PROJECT"] = "Eval Unit Testing"
 8 | 
 9 | llama3 = ChatOllama(model='llama3', temperature=0)
10 | 
11 | prompt = PromptTemplate(  # Llama 3 header-token prompt: system instruction, user question, then an open assistant turn
12 |     template="""
13 |     
14 |     <|begin_of_text|>
15 |     
16 |     <|start_header_id|>system<|end_header_id|> 
17 |     
18 |     You are an AI assistant for generating python. Generate a python code snippet that answers the following question.
19 |     Keep your answer concise, only include the necessary code snippet with no preamble or explanation.
20 |     
21 |     <|eot_id|>
22 |     
23 |     <|start_header_id|>user<|end_header_id|>
24 |     
25 |     Question: {question} 
26 |     Answer: 
27 |     
28 |     <|eot_id|>
29 |     
30 |     <|start_header_id|>assistant<|end_header_id|>""",
31 |     input_variables=["question"],  # the only placeholder in the template; supplied by generate() below
32 | )
33 | 
34 | # Chain
35 | chain = prompt | llama3 | StrOutputParser()
36 | 
37 | # Function
38 | def generate(query):
39 |     response = chain.invoke({'question': query})
40 |     return response
41 | 
42 | out = generate("How do I write a for loop with range 10 in Python?")
43 | print(out)


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/llama3_research_agent.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "0c707a92-6f82-44d8-8de0-fa612064df5e",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# Local Web Research Agent w/ Llama 3 8b\n",
  9 |     "\n",
 10 |     "### [Llama 3 Release](https://llama.meta.com/llama3/)\n",
 11 |     "\n",
 12 |     "### [Ollama Llama 3 Model](https://ollama.com/library/llama3)\n",
 13 |     "---\n",
 14 |     "\n",
 15 |     "![diagram](local_agent_diagram.png)"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "markdown",
 20 |    "id": "715059b3-857c-456d-a740-24e2551d739d",
 21 |    "metadata": {},
 22 |    "source": [
 23 |     "---\n",
 24 |     "[Llama 3 Prompt Format](https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/)\n",
 25 |     "\n",
 26 |     "### Special Tokens used with Meta Llama 3\n",
 27 |     "* **<|begin_of_text|>**: This is equivalent to the BOS token\n",
 28 |     "* **<|eot_id|>**: This signifies the end of the message in a turn.\n",
 29 |     "* **<|start_header_id|>{role}<|end_header_id|>**: These tokens enclose the role for a particular message. The possible roles can be: system, user, assistant.\n",
 30 |     "* **<|end_of_text|>**: This is equivalent to the EOS token. On generating this token, Llama 3 will cease to generate more tokens.\n",
 31 |     "\nA prompt should contain a single system message, can contain multiple alternating user and assistant messages, and always ends with the last user message followed by the assistant header."
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 19,
 37 |    "id": "35f2cb84-6abf-4a6c-8d1f-cdc6474b77ee",
 38 |    "metadata": {},
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "# Displaying final output format\n",
 42 |     "from IPython.display import display, Markdown, Latex\n",
 43 |     "# LangChain Dependencies\n",
 44 |     "from langchain.prompts import PromptTemplate\n",
 45 |     "from langchain_core.output_parsers import JsonOutputParser, StrOutputParser\n",
 46 |     "from langchain_community.chat_models import ChatOllama\n",
 47 |     "from langchain_community.tools import DuckDuckGoSearchRun\n",
 48 |     "from langchain_community.utilities import DuckDuckGoSearchAPIWrapper\n",
 49 |     "from langgraph.graph import END, StateGraph\n",
 50 |     "# For State Graph \n",
 51 |     "from typing_extensions import TypedDict\n",
 52 |     "import os"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": 20,
 58 |    "id": "d39b8539-1bfe-4001-b7b2-6752a77846d5",
 59 |    "metadata": {},
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "# Environment Variables\n",
 63 |     "os.environ['LANGCHAIN_TRACING_V2'] = 'true'\n",
 64 |     "os.environ[\"LANGCHAIN_PROJECT\"] = \"L3 Research Agent\""
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 21,
 70 |    "id": "9b341d1d-0a59-4c03-8558-759ea00171bb",
 71 |    "metadata": {},
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "# Defining LLM\n",
 75 |     "local_llm = 'llama3'\n",
 76 |     "llama3 = ChatOllama(model=local_llm, temperature=0)\n",
 77 |     "llama3_json = ChatOllama(model=local_llm, format='json', temperature=0)"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 22,
 83 |    "id": "4c7813ac-791f-4035-a5ec-04810d5de5f9",
 84 |    "metadata": {},
 85 |    "outputs": [],
 86 |    "source": [
 87 |     "# Web Search Tool\n",
 88 |     "# pip install -U duckduckgo_search==5.3.0b4\n",
 89 |     "# ^ if running into 202 rate limit error\n",
 90 |     "\n",
 91 |     "wrapper = DuckDuckGoSearchAPIWrapper(max_results=15)\n",
 92 |     "web_search_tool = DuckDuckGoSearchRun(api_wrapper=wrapper)\n",
 93 |     "\n",
 94 |     "# Test Run\n",
 95 |     "# resp = web_search_tool.invoke(\"home depot news\")\n",
 96 |     "# resp"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": 23,
102 |    "id": "2d798a81-6ed6-4a4f-a1d9-93b4e3059fee",
103 |    "metadata": {},
104 |    "outputs": [],
105 |    "source": [
106 |     "# Generation Prompt\n",
107 |     "\n",
108 |     "generate_prompt = PromptTemplate(\n",
109 |     "    template=\"\"\"\n",
110 |     "    \n",
111 |     "    <|begin_of_text|>\n",
112 |     "    \n",
113 |     "    <|start_header_id|>system<|end_header_id|> \n",
114 |     "    \n",
115 |     "    You are an AI assistant for Research Question Tasks, that synthesizes web search results. \n",
116 |     "    Strictly use the following pieces of web search context to answer the question. If you don't know the answer, just say that you don't know. \n",
117 |     "    keep the answer concise, but provide all of the details you can in the form of a research report. \n",
118 |     "    Only make direct references to material if provided in the context.\n",
119 |     "    \n",
120 |     "    <|eot_id|>\n",
121 |     "    \n",
122 |     "    <|start_header_id|>user<|end_header_id|>\n",
123 |     "    \n",
124 |     "    Question: {question} \n",
125 |     "    Web Search Context: {context} \n",
126 |     "    Answer: \n",
127 |     "    \n",
128 |     "    <|eot_id|>\n",
129 |     "    \n",
130 |     "    <|start_header_id|>assistant<|end_header_id|>\"\"\",\n",
131 |     "    input_variables=[\"question\", \"context\"],\n",
132 |     ")\n",
133 |     "\n",
134 |     "# Chain\n",
135 |     "generate_chain = generate_prompt | llama3 | StrOutputParser()\n",
136 |     "\n",
137 |     "# Test Run\n",
138 |     "# question = \"How are you?\"\n",
139 |     "# context = \"\"\n",
140 |     "# generation = generate_chain.invoke({\"context\": context, \"question\": question})\n",
141 |     "# print(generation)"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": 24,
147 |    "id": "49fa1965-6bd4-4dfc-9eb8-96c6cff7b639",
148 |    "metadata": {},
149 |    "outputs": [],
150 |    "source": [
151 |     "# Router\n",
152 |     "\n",
153 |     "router_prompt = PromptTemplate(\n",
154 |     "    template=\"\"\"\n",
155 |     "    \n",
156 |     "    <|begin_of_text|>\n",
157 |     "    \n",
158 |     "    <|start_header_id|>system<|end_header_id|>\n",
159 |     "    \n",
160 |     "    You are an expert at routing a user question to either the generation stage or web search. \n",
161 |     "    Use the web search for questions that require more context for a better answer, or recent events.\n",
162 |     "    Otherwise, you can skip and go straight to the generation phase to respond.\n",
163 |     "    You do not need to be stringent with the keywords in the question related to these topics.\n",
164 |     "    Give a binary choice 'web_search' or 'generate' based on the question. \n",
165 |     "    Return the JSON with a single key 'choice' with no premable or explanation. \n",
166 |     "    \n",
167 |     "    Question to route: {question} \n",
168 |     "    \n",
169 |     "    <|eot_id|>\n",
170 |     "    \n",
171 |     "    <|start_header_id|>assistant<|end_header_id|>\n",
172 |     "    \n",
173 |     "    \"\"\",\n",
174 |     "    input_variables=[\"question\"],\n",
175 |     ")\n",
176 |     "\n",
177 |     "# Chain\n",
178 |     "question_router = router_prompt | llama3_json | JsonOutputParser()\n",
179 |     "\n",
180 |     "# Test Run\n",
181 |     "# question = \"What's up?\"\n",
182 |     "# print(question_router.invoke({\"question\": question}))"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": 25,
188 |    "id": "33ab4128-e0b0-4f49-9f36-1d3bf5636715",
189 |    "metadata": {},
190 |    "outputs": [],
191 |    "source": [
192 |     "# Query Transformation\n",
193 |     "\n",
194 |     "query_prompt = PromptTemplate(\n",
195 |     "    template=\"\"\"\n",
196 |     "    \n",
197 |     "    <|begin_of_text|>\n",
198 |     "    \n",
199 |     "    <|start_header_id|>system<|end_header_id|> \n",
200 |     "    \n",
201 |     "    You are an expert at crafting web search queries for research questions.\n",
202 |     "    More often than not, a user will ask a basic question that they wish to learn more about, however it might not be in the best format. \n",
203 |     "    Reword their query to be the most effective web search string possible.\n",
204 |     "    Return the JSON with a single key 'query' with no premable or explanation. \n",
205 |     "    \n",
206 |     "    Question to transform: {question} \n",
207 |     "    \n",
208 |     "    <|eot_id|>\n",
209 |     "    \n",
210 |     "    <|start_header_id|>assistant<|end_header_id|>\n",
211 |     "    \n",
212 |     "    \"\"\",\n",
213 |     "    input_variables=[\"question\"],\n",
214 |     ")\n",
215 |     "\n",
216 |     "# Chain\n",
217 |     "query_chain = query_prompt | llama3_json | JsonOutputParser()\n",
218 |     "\n",
219 |     "# Test Run\n",
220 |     "# question = \"What's happened recently with Macom?\"\n",
221 |     "# print(query_chain.invoke({\"question\": question}))"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "code",
226 |    "execution_count": 26,
227 |    "id": "a1c8e922-3f00-48d6-83cb-cc78a2292838",
228 |    "metadata": {},
229 |    "outputs": [],
230 |    "source": [
231 |     "# Graph State\n",
232 |     "class GraphState(TypedDict):\n",
233 |     "    \"\"\"\n",
234 |     "    Represents the state of our graph.\n",
235 |     "\n",
236 |     "    Attributes:\n",
237 |     "        question: question\n",
238 |     "        generation: LLM generation\n",
239 |     "        search_query: revised question for web search\n",
240 |     "        context: web_search result\n",
241 |     "    \"\"\"\n",
242 |     "    question : str\n",
243 |     "    generation : str\n",
244 |     "    search_query : str\n",
245 |     "    context : str\n",
246 |     "\n",
247 |     "# Node - Generate\n",
248 |     "\n",
249 |     "def generate(state):\n",
250 |     "    \"\"\"\n",
251 |     "    Generate answer\n",
252 |     "\n",
253 |     "    Args:\n",
254 |     "        state (dict): The current graph state\n",
255 |     "\n",
256 |     "    Returns:\n",
257 |     "        state (dict): New key added to state, generation, that contains LLM generation\n",
258 |     "    \"\"\"\n",
259 |     "    \n",
260 |     "    print(\"Step: Generating Final Response\")\n",
261 |     "    question = state[\"question\"]\n",
262 |     "    context = state[\"context\"]\n",
263 |     "\n",
264 |     "    # Answer Generation\n",
265 |     "    generation = generate_chain.invoke({\"context\": context, \"question\": question})\n",
266 |     "    return {\"generation\": generation}\n",
267 |     "\n",
268 |     "# Node - Query Transformation\n",
269 |     "\n",
270 |     "def transform_query(state):\n",
271 |     "    \"\"\"\n",
272 |     "    Transform user question to web search\n",
273 |     "\n",
274 |     "    Args:\n",
275 |     "        state (dict): The current graph state\n",
276 |     "\n",
277 |     "    Returns:\n",
278 |     "        state (dict): Appended search query\n",
279 |     "    \"\"\"\n",
280 |     "    \n",
281 |     "    print(\"Step: Optimizing Query for Web Search\")\n",
282 |     "    question = state['question']\n",
283 |     "    gen_query = query_chain.invoke({\"question\": question})\n",
284 |     "    search_query = gen_query[\"query\"]\n",
285 |     "    return {\"search_query\": search_query}\n",
286 |     "\n",
287 |     "\n",
288 |     "# Node - Web Search\n",
289 |     "def web_search(state):\n",
290 |     "    \"\"\"\n",
291 |     "    Web search based on the question\n",
292 |     "\n",
293 |     "    Args:\n",
294 |     "        state (dict): The current graph state\n",
295 |     "\n",
296 |     "    Returns:\n",
297 |     "        state (dict): Appended web results to context\n",
298 |     "    \"\"\"\n",
299 |     "\n",
300 |     "    search_query = state['search_query']\n",
301 |     "    print(f'Step: Searching the Web for: \"{search_query}\"')\n",
302 |     "    \n",
303 |     "    # Web search tool call\n",
304 |     "    search_result = web_search_tool.invoke(search_query)\n",
305 |     "    return {\"context\": search_result}\n",
306 |     "\n",
307 |     "\n",
308 |     "# Conditional Edge, Routing\n",
309 |     "\n",
310 |     "def route_question(state):\n",
311 |     "    \"\"\"\n",
312 |     "    route question to web search or generation.\n",
313 |     "\n",
314 |     "    Args:\n",
315 |     "        state (dict): The current graph state\n",
316 |     "\n",
317 |     "    Returns:\n",
318 |     "        str: Next node to call\n",
319 |     "    \"\"\"\n",
320 |     "\n",
321 |     "    print(\"Step: Routing Query\")\n",
322 |     "    question = state['question']\n",
323 |     "    output = question_router.invoke({\"question\": question})\n",
324 |     "    if output['choice'] == \"web_search\":\n",
325 |     "        print(\"Step: Routing Query to Web Search\")\n",
326 |     "        return \"websearch\"\n",
327 |     "    elif output['choice'] == 'generate':\n",
328 |     "        print(\"Step: Routing Query to Generation\")\n",
329 |     "        return \"generate\""
330 |    ]
331 |   },
332 |   {
333 |    "cell_type": "code",
334 |    "execution_count": 27,
335 |    "id": "8f665713-e80b-4d86-8015-77ba55506004",
336 |    "metadata": {},
337 |    "outputs": [],
338 |    "source": [
339 |     "# Build the nodes\n",
340 |     "workflow = StateGraph(GraphState)\n",
341 |     "workflow.add_node(\"websearch\", web_search)\n",
342 |     "workflow.add_node(\"transform_query\", transform_query)\n",
343 |     "workflow.add_node(\"generate\", generate)\n",
344 |     "\n",
345 |     "# Build the edges\n",
346 |     "workflow.set_conditional_entry_point(\n",
347 |     "    route_question,\n",
348 |     "    {\n",
349 |     "        \"websearch\": \"transform_query\",\n",
350 |     "        \"generate\": \"generate\",\n",
351 |     "    },\n",
352 |     ")\n",
353 |     "workflow.add_edge(\"transform_query\", \"websearch\")\n",
354 |     "workflow.add_edge(\"websearch\", \"generate\")\n",
355 |     "workflow.add_edge(\"generate\", END)\n",
356 |     "\n",
357 |     "# Compile the workflow\n",
358 |     "local_agent = workflow.compile()"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "code",
363 |    "execution_count": 28,
364 |    "id": "4f53aa05-20b2-420e-9a8f-bf12b1e547ab",
365 |    "metadata": {},
366 |    "outputs": [],
367 |    "source": [
368 |     "from langsmith import traceable\n",
369 |     "\n",
370 |     "@traceable \n",
371 |     "def run_agent(query):\n",
372 |     "    output = local_agent.invoke({\"question\": query})\n",
373 |     "    print(\"=======\")\n",
374 |     "    display(Markdown(output[\"generation\"]))"
375 |    ]
376 |   },
377 |   {
378 |    "cell_type": "code",
379 |    "execution_count": 29,
380 |    "id": "6a1b135d-131e-4276-b40c-12ea8b78c39c",
381 |    "metadata": {},
382 |    "outputs": [
383 |     {
384 |      "name": "stdout",
385 |      "output_type": "stream",
386 |      "text": [
387 |       "Step: Routing Query\n",
388 |       "Step: Routing Query to Web Search\n",
389 |       "Step: Optimizing Query for Web Search\n",
390 |       "Step: Searching the Web for: \"Apple Q3 earnings report\"\n",
391 |       "Step: Generating Final Response\n",
392 |       "=======\n"
393 |      ]
394 |     },
395 |     {
396 |      "data": {
397 |       "text/markdown": [
398 |        "Based on the provided web search context, Apple's Q3 earnings are as follows:\n",
399 |        "\n",
400 |        "* Quarterly revenue: $81.8 billion (down 1% year over year)\n",
401 |        "* Quarterly earnings per diluted share: $1.26 (up 5% year over year)\n",
402 |        "\n",
403 |        "Note that these figures were announced by Apple on August 3, 2023, and the company's CEO Tim Cook and CFO Luca Maestri shared additional details during their Q3 2023 financial results call."
404 |       ],
405 |       "text/plain": [
406 |        "<IPython.core.display.Markdown object>"
407 |       ]
408 |      },
409 |      "metadata": {},
410 |      "output_type": "display_data"
411 |     }
412 |    ],
413 |    "source": [
414 |     "# Test it out!\n",
415 |     "run_agent(\"What's are Apple's q3 earnings\")"
416 |    ]
417 |   },
418 |   {
419 |    "cell_type": "markdown",
420 |    "id": "b56fca5a-b73e-4c19-9f47-ebe0ca08bc79",
421 |    "metadata": {},
422 |    "source": [
423 |     "---\n",
424 |     "# Attaching Evals to Existing Runs\n",
425 |     "\n",
426 |     "What if you have an existing application that's being traced, and you want to insert evaluations at specific parts of the operation?"
427 |    ]
428 |   },
429 |   {
430 |    "cell_type": "markdown",
431 |    "id": "492d1aa9-67df-4dee-8934-81ec77106dd7",
432 |    "metadata": {},
433 |    "source": [
434 |     "### Creating a quick QA dataset to test against"
435 |    ]
436 |   },
437 |   {
438 |    "cell_type": "code",
439 |    "execution_count": 30,
440 |    "id": "29e9bab4-5734-4592-a69b-36ee756b56d5",
441 |    "metadata": {},
442 |    "outputs": [],
443 |    "source": [
444 |     "from langsmith import Client\n",
445 |     "\n",
446 |     "client = Client()\n",
447 |     "\n",
448 |     "examples = [\n",
449 |     "    (\"What Apple's Q3 Earnings?\", \"Apple today announced financial results for its fiscal 2023 third quarter ended July 1, 2023. The Company posted quarterly revenue of $81.8 billion, down 1 percent year over year, and quarterly earnings per diluted share of $1.26, up 5 percent year over year.\"),\n",
450 |     "    (\"What are new apple products?\", \"Apple is refreshing both iPad Pro models with OLED screens, bringing a major update in display quality. There will be two models with screen sizes around 11 and 13 inches, and we are expecting design updates. With the switch to OLED, Apple is cutting down on thickness, and the new iPad Pro models will be much thinner. We're also expecting them to adopt the M3 chip for faster performance, and Apple is planning to debut a new Magic Keyboard that gives the iPad Pro a more Mac-like feel and a new Apple Pencil.  With the 2024 iPad Air refresh, we're getting two models for the first time. The smaller iPad Air will have a 10.9-inch display like the current iPad Air, while the larger version will have a 12.9-inch display like the current 12.9-inch iPad Pro. The iPad Air models will be more affordable than the iPad Pro models, and won't have \\\"Pro\\\" features like ProMotion refresh rates and OLED displays. Rumors are mixed on whether the iPad Air will get the M2 or the M3 chip, but either option will be an improvement over the M1 in the current model.\"),\n",
451 |     "]\n",
452 |     "\n",
453 |     "dataset_name = \"Apple - L3 Agent Testing\"\n",
454 |     "if not client.has_dataset(dataset_name=dataset_name):\n",
455 |     "    dataset = client.create_dataset(dataset_name=dataset_name)\n",
456 |     "    inputs, outputs = zip(\n",
457 |     "        *[({\"input\": input}, {\"expected\": expected}) for input, expected in examples]\n",
458 |     "    )\n",
459 |     "    client.create_examples(inputs=inputs, outputs=outputs, dataset_id=dataset.id)"
460 |    ]
461 |   },
462 |   {
463 |    "cell_type": "markdown",
464 |    "id": "52f0e334-b5ad-4b3c-9a0e-95c047dd7ef9",
465 |    "metadata": {},
466 |    "source": [
467 |     "### Defining Some Custom Evaluators\n",
468 |     "\n",
469 |     "A few notes here: we use structured function calling with OpenAI to create a quick LLM-as-judge evaluator.\n",
470 |     "\n",
471 |     "Also, make sure the path you use to dig into your runs/child_runs matches the actual trace structure. In LangSmith, expand all runs to see exactly how they nest."
472 |    ]
473 |   },
474 |   {
475 |    "cell_type": "code",
476 |    "execution_count": 31,
477 |    "id": "8a90ba29-7799-4c47-b196-0494bbc3a801",
478 |    "metadata": {},
479 |    "outputs": [],
480 |    "source": [
481 |     "from langsmith.evaluation import LangChainStringEvaluator, evaluate\n",
482 |     "from langsmith.schemas import Example, Run\n",
483 |     "from langchain_openai import ChatOpenAI\n",
484 |     "from langchain_core.prompts import ChatPromptTemplate\n",
485 |     "from langchain_core.pydantic_v1 import BaseModel, Field\n",
486 |     "\n",
487 |     "# Search Tool Test\n",
488 |     "def search_retrieval(root_run: Run, example: Example) -> dict:\n",
489 |     "    \"\"\"\n",
490 |     "    A simple evaluator that checks if the retrieved web search contains answer for the question\n",
491 |     "    \"\"\"\n",
492 |     "    # Get documents and answer\n",
493 |     "    agent_run = next(run for run in root_run.child_runs if run.name == \"run_agent\")\n",
494 |     "    LangGraph = next(run for run in agent_run.child_runs if run.name == \"LangGraph\")\n",
495 |     "    search_run = next(run for run in LangGraph.child_runs if run.name == \"websearch\")\n",
496 |     "    context = search_run.outputs[\"context\"]\n",
497 |     "    question = agent_run.inputs[\"query\"]\n",
498 |     "\n",
499 |     "    # Data model\n",
500 |     "    class GradeWebsearch(BaseModel):\n",
501 |     "        \"\"\"Binary score for whether websearch contains question context.\"\"\"\n",
502 |     "\n",
503 |     "        binary_score: int = Field(description=\"Context contains answer to question, 1 or 0\")\n",
504 |     "\n",
505 |     "    # LLM with function call\n",
506 |     "    llm = ChatOpenAI(model=\"gpt-4o\", temperature=0)\n",
507 |     "    structured_websearch_grader = llm.with_structured_output(GradeWebsearch)\n",
508 |     "\n",
509 |     "    # Prompt\n",
510 |     "    system = \"\"\"You are a grader assessing whether an Web search contains the context needed to answer a user query. \\n\n",
511 |     "        Give a binary score 1 or 0, where 1 means that the answer is in the web search results.\"\"\"\n",
512 |     "    websearch_prompt = ChatPromptTemplate.from_messages(\n",
513 |     "        [\n",
514 |     "            (\"system\", system),\n",
515 |     "            (\"human\", \"Web search: \\n\\n {context} \\n\\n Question: {question}\"),\n",
516 |     "        ]\n",
517 |     "    )\n",
518 |     "\n",
519 |     "    websearch_grader = websearch_prompt | structured_websearch_grader\n",
520 |     "    score = websearch_grader.invoke({\"context\": context, \"question\": question})\n",
521 |     "    return {\"key\": \"websearch_verification\", \"score\": int(score.binary_score)}\n",
522 |     "\n",
523 |     "# Hallucination Test\n",
524 |     "def hallucination(root_run: Run, example: Example) -> dict:\n",
525 |     "    \"\"\"\n",
526 |     "    A simple evaluator that checks to see the answer is grounded in the context\n",
527 |     "    \"\"\"\n",
528 |     "    # Get documents and answer\n",
529 |     "    agent_run = next(run for run in root_run.child_runs if run.name == \"run_agent\")\n",
530 |     "    LangGraph = next(run for run in agent_run.child_runs if run.name == \"LangGraph\")\n",
531 |     "    search_run = next(run for run in LangGraph.child_runs if run.name == \"websearch\")\n",
532 |     "    context = search_run.outputs[\"context\"]\n",
533 |     "    generation = LangGraph.outputs[\"generation\"]\n",
534 |     "\n",
535 |     "    # Data model\n",
536 |     "    class GradeHallucinations(BaseModel):\n",
537 |     "        \"\"\"Binary score for hallucination present in generation answer.\"\"\"\n",
538 |     "\n",
539 |     "        binary_score: int = Field(description=\"Answer is grounded in the facts, 1 or 0\")\n",
540 |     "\n",
541 |     "    # LLM with function call\n",
542 |     "    llm = ChatOpenAI(model=\"gpt-4o\", temperature=0)\n",
543 |     "    structured_llm_grader = llm.with_structured_output(GradeHallucinations)\n",
544 |     "\n",
545 |     "    # Prompt\n",
546 |     "    system = \"\"\"You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \\n\n",
547 |     "        Give a binary score 1 or 0, where 1 means that the answer is grounded in / supported by the set of facts.\"\"\"\n",
548 |     "    hallucination_prompt = ChatPromptTemplate.from_messages(\n",
549 |     "        [\n",
550 |     "            (\"system\", system),\n",
551 |     "            (\"human\", \"Set of facts: \\n\\n {context} \\n\\n LLM generation: {generation}\"),\n",
552 |     "        ]\n",
553 |     "    )\n",
554 |     "\n",
555 |     "    hallucination_grader = hallucination_prompt | structured_llm_grader\n",
556 |     "    score = hallucination_grader.invoke({\"context\": context, \"generation\": generation})\n",
557 |     "    return {\"key\": \"answer_hallucination\", \"score\": int(score.binary_score)}"
558 |    ]
559 |   },
560 |   {
561 |    "cell_type": "markdown",
562 |    "id": "02b4202b-18fa-46ff-a814-d9b2fc1e9e42",
563 |    "metadata": {},
564 |    "source": [
565 |     "### Running the Evaluation!"
566 |    ]
567 |   },
568 |   {
569 |    "cell_type": "code",
570 |    "execution_count": 32,
571 |    "id": "482e415d-e08c-4acf-83a3-15e2fb5c28df",
572 |    "metadata": {},
573 |    "outputs": [
574 |     {
575 |      "name": "stdout",
576 |      "output_type": "stream",
577 |      "text": [
578 |       "View the evaluation results for experiment: 'websearch-test-1-4ebcc119' at:\n",
579 |       "https://smith.langchain.com/o/ef6f5694-a2fa-5316-9158-12297cd17350/datasets/e301d2c7-3cfd-4a70-8ecf-2ea308bf9ad4/compare?selectedSessions=86b68fc3-c086-4a77-a730-7bb46c77028f\n",
580 |       "\n",
581 |       "\n"
582 |      ]
583 |     },
584 |     {
585 |      "data": {
586 |       "application/vnd.jupyter.widget-view+json": {
587 |        "model_id": "8b6a92a3483b4201a344e98023a2fdd4",
588 |        "version_major": 2,
589 |        "version_minor": 0
590 |       },
591 |       "text/plain": [
592 |        "0it [00:00, ?it/s]"
593 |       ]
594 |      },
595 |      "metadata": {},
596 |      "output_type": "display_data"
597 |     },
598 |     {
599 |      "name": "stdout",
600 |      "output_type": "stream",
601 |      "text": [
602 |       "Step: Routing Query\n",
603 |       "Step: Routing Query\n",
604 |       "Step: Routing Query to Web Search\n",
605 |       "Step: Optimizing Query for Web Search\n",
606 |       "Step: Routing Query to Web Search\n",
607 |       "Step: Optimizing Query for Web Search\n",
608 |       "Step: Searching the Web for: \"new Apple products\"\n",
609 |       "Step: Searching the Web for: \"Apple Q3 earnings report\"\n",
610 |       "Step: Generating Final Response\n",
611 |       "Step: Generating Final Response\n",
612 |       "=======\n"
613 |      ]
614 |     },
615 |     {
616 |      "data": {
617 |       "text/markdown": [
618 |        "Based on the provided web search context, new Apple products include:\n",
619 |        "\n",
620 |        "* iPad Air: Available in new blue and purple finishes, along with starlight and space gray, starting at $599 for the 11-inch model and $799 for the 13-inch model.\n",
621 |        "* iPhone 15 and iPhone 15 Plus: Feature a gorgeous new design, Dynamic Island, 48MP Main camera, and A16 Bionic chip. They will be available in five colors and have a USB-C connector, contoured edge, and durable color-infused back glass. Pre-orders begin on September 15, with availability starting on September 22.\n",
622 |        "* iPhone 15 Pro and iPhone 15 Pro Max: Available in four stunning new finishes, including black titanium, white titanium, blue titanium, and natural titanium. Pre-orders begin on September 15, with availability starting on September 22.\n",
623 |        "* Apple Watch Series 9: Available in 41mm and 45mm sizes in starlight, midnight, silver, (PRODUCT)RED, and a new pink aluminum case, as well as stainless steel in gold, silver, and graphite cases.\n",
624 |        "* Apple Pencil: A new, more affordable option with pixel-perfect accuracy, low latency, and tilt sensitivity for note taking, sketching, and more. It works with all iPad models that have a USB-C port, including iPad Pro, iPad Air, and iPad mini, and is available for purchase beginning in early November.\n",
625 |        "* Mac Studio: Receiving an update, including the silicon, replacing the M1 Max and M1 Ultra with the M2 Max and M2 Ultra.\n",
626 |        "\n",
627 |        "Note: The article also mentions Apple's upcoming mixed reality headset, which can play back stereoscopic 3D video shot on iPhone 15 Pro."
628 |       ],
629 |       "text/plain": [
630 |        "<IPython.core.display.Markdown object>"
631 |       ]
632 |      },
633 |      "metadata": {},
634 |      "output_type": "display_data"
635 |     },
636 |     {
637 |      "name": "stdout",
638 |      "output_type": "stream",
639 |      "text": [
640 |       "=======\n"
641 |      ]
642 |     },
643 |     {
644 |      "data": {
645 |       "text/markdown": [
646 |        "Based on the provided web search context, Apple's Q3 earnings are as follows:\n",
647 |        "\n",
648 |        "* Quarterly revenue: $81.8 billion, down 1% year over year\n",
649 |        "* Quarterly earnings per diluted share: $1.26, up 5% year over year\n",
650 |        "\n",
651 |        "These figures were announced by Apple in its fiscal 2023 third-quarter earnings report, which was released on August 3, 2023."
652 |       ],
653 |       "text/plain": [
654 |        "<IPython.core.display.Markdown object>"
655 |       ]
656 |      },
657 |      "metadata": {},
658 |      "output_type": "display_data"
659 |     }
660 |    ],
661 |    "source": [
662 |     "experiment_results = evaluate(\n",
663 |     "    lambda inputs: run_agent(inputs[\"input\"]),\n",
664 |     "    data=\"Apple - L3 Agent Testing\",\n",
665 |     "    evaluators=[search_retrieval, hallucination],\n",
666 |     "    experiment_prefix=\"websearch-test-1\"\n",
667 |     ")"
668 |    ]
669 |   },
670 |   {
671 |    "cell_type": "code",
672 |    "execution_count": null,
673 |    "id": "72781a0b-3082-4503-bd76-ddeb2a52efb1",
674 |    "metadata": {},
675 |    "outputs": [],
676 |    "source": []
677 |   }
678 |  ],
679 |  "metadata": {
680 |   "kernelspec": {
681 |    "display_name": "Python 3 (ipykernel)",
682 |    "language": "python",
683 |    "name": "python3"
684 |   },
685 |   "language_info": {
686 |    "codemirror_mode": {
687 |     "name": "ipython",
688 |     "version": 3
689 |    },
690 |    "file_extension": ".py",
691 |    "mimetype": "text/x-python",
692 |    "name": "python",
693 |    "nbconvert_exporter": "python",
694 |    "pygments_lexer": "ipython3",
695 |    "version": "3.12.1"
696 |   }
697 |  },
698 |  "nbformat": 4,
699 |  "nbformat_minor": 5
700 | }
701 | 


--------------------------------------------------------------------------------
/EVAL_Testing.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "id": "8898b7e2-c7cf-49e4-8ee3-81d071736a4b",
   6 |    "metadata": {},
   7 |    "source": [
   8 |     "# LLM Application Evaluations\n",
   9 |     "\n",
  10 |     "**Problem:** LLM applications are very new, and there are limited resources for evaluating their performance. LLMs are dynamic in their output and thus require very custom evaluations. Many teams overlook this step or do their actual testing in production with user feedback. This [doesn't always work out...](https://twitter.com/ChrisJBakke/status/1736533308849443121)\n",
  11 |     "\n",
  12 |     "**Solution:** We will be going over ways to utilize LangChain's offering [LangSmith](https://docs.smith.langchain.com/), a separate piece of software that allows tracing, testing, and evaluation of LLM applications.\n",
  13 |     "\n",
  14 |     "### LangChain's phenomenal summary of the LLM evaluation landscape:\n",
  15 |     "\n",
  16 |     "![x](evals_graph.png)"
  17 |    ]
  18 |   },
  19 |   {
  20 |    "cell_type": "markdown",
  21 |    "id": "fe43f4e9-38ae-4b05-afc1-d9d973d71776",
  22 |    "metadata": {},
  23 |    "source": [
  24 |     "---\n",
  25 |     "### Topics Covered\n",
  26 |     "\n",
  27 |     "#### Adding a Dataset to Langsmith\n",
  28 |     "- Adding the HuggingFace GO_Emotions Dataset to LangSmith\n",
  29 |     "#### Creating and Running Custom Evaluator to Compare LLM Outputs on Classification Task\n",
  30 |     "- Creating a custom evaluator to determine quality of output from different LLMs on an emotion classification task for comparison\n",
  31 |     "#### Using an LLM as an Evaluator\n",
  32 |     "- Using an LLM-as-judge flow for Evaluating an LLM's ability for Question & Answer flows from a dataset made from a blog post\n",
  33 |     "- Both evaluating and comparing OpenAI and Mistral 7b's ability to answer questions\n",
  34 |     "#### Overview of Built-In w/LangChainStringEvaluator\n",
  35 |     "- Chain of Thought QA for contextual accuracy for both GPT-4o and Mistral 7b, then comparing the two\n",
  36 |     "- Using Built-In Criteria, Helpfulness\n",
  37 |     "#### LLM as an Evaluator with Custom Criteria\n",
  38 |     "- Unlabeled objectivity, having an LLM evaluate output without a grounded reference\n",
  39 |     "- Labeled objectivity, having an LLM evaluate output WITH a grounded reference\n",
  40 |     "  - Both as range scores and binary scores\n",
  41 |     "#### Evaluating Existing Evaluations (Summary Evaluation)\n",
  42 |     "- Running evaluations on entire experiments, not just each example\n",
  43 |     "- Pass test/fail test for overall evaluation score example with Mistral 7b\n",
  44 |     "#### Pairwise Evaluations (Comparing Experiments Against Each Other)\n",
  45 |     "- Evaluation for comparing two experiments outputs\n",
  46 |     "- Using an LLM as a judge to show preference towards one output vs another and comparing\n",
  47 |     "#### Unit Tests\n",
  48 |     "- Attaching decorators to pytest tests for evaluation in LangSmith\n",
  49 |     "- Both plain assertions and custom tests (embedding distance, edit distance, contains, etc.) that work better for LLM output\n",
  50 |     "#### Evaluating Specific Parts of Existing Workflows\n",
  51 |     "- How to plug into specific parts of an overall LLM application workflow and run custom evaluations on the different steps\n",
  52 |     "- Adding on a few evaluations to my llama3 web research agent to evaluate document retrieval relevancy, and hallucination measurement\n",
  53 |     "---"
  54 |    ]
  55 |   },
  56 |   {
  57 |    "cell_type": "code",
  58 |    "execution_count": 4,
  59 |    "id": "f0549a3d-0df7-4e7c-b4c7-66e36bfe5fda",
  60 |    "metadata": {},
  61 |    "outputs": [],
  62 |    "source": [
  63 |     "import os\n",
  64 |     "# os.environ['LANGCHAIN_API_KEY'] = ''\n",
  65 |     "os.environ['LANGCHAIN_TRACING_V2'] = 'true'\n",
  66 |     "os.environ['LANGCHAIN_PROJECT'] = 'Eval Testing 1'"
  67 |    ]
  68 |   },
  69 |   {
  70 |    "cell_type": "markdown",
  71 |    "id": "80f19020-3ca3-4fe6-bcd7-39d8ef50bd36",
  72 |    "metadata": {},
  73 |    "source": [
  74 |     "# DataSets\n",
  75 |     "\n",
  76 |     "### Importing a HuggingFace Dataset into LangSmith\n",
  77 |     "\n",
  78 |     "Going to be using one that I have previously used for fine tuning: \n",
  79 |     "\n",
  80 |     "https://huggingface.co/datasets/go_emotions"
  81 |    ]
  82 |   },
  83 |   {
  84 |    "cell_type": "code",
  85 |    "execution_count": 5,
  86 |    "id": "4cdb351e-bba5-45f9-85a7-d348e7347a1c",
  87 |    "metadata": {},
  88 |    "outputs": [
  89 |     {
  90 |      "data": {
  91 |       "text/html": [
  92 |        "<div>\n",
  93 |        "<style scoped>\n",
  94 |        "    .dataframe tbody tr th:only-of-type {\n",
  95 |        "        vertical-align: middle;\n",
  96 |        "    }\n",
  97 |        "\n",
  98 |        "    .dataframe tbody tr th {\n",
  99 |        "        vertical-align: top;\n",
 100 |        "    }\n",
 101 |        "\n",
 102 |        "    .dataframe thead th {\n",
 103 |        "        text-align: right;\n",
 104 |        "    }\n",
 105 |        "</style>\n",
 106 |        "<table border=\"1\" class=\"dataframe\">\n",
 107 |        "  <thead>\n",
 108 |        "    <tr style=\"text-align: right;\">\n",
 109 |        "      <th></th>\n",
 110 |        "      <th>comment</th>\n",
 111 |        "      <th>emotion label</th>\n",
 112 |        "    </tr>\n",
 113 |        "  </thead>\n",
 114 |        "  <tbody>\n",
 115 |        "    <tr>\n",
 116 |        "      <th>0</th>\n",
 117 |        "      <td>Omg i hope this is about [NAME]. I would LOVE ...</td>\n",
 118 |        "      <td>optimism</td>\n",
 119 |        "    </tr>\n",
 120 |        "    <tr>\n",
 121 |        "      <th>1</th>\n",
 122 |        "      <td>Finale</td>\n",
 123 |        "      <td>neutral</td>\n",
 124 |        "    </tr>\n",
 125 |        "    <tr>\n",
 126 |        "      <th>2</th>\n",
 127 |        "      <td>Which suggests nothing in itself. The same mod...</td>\n",
 128 |        "      <td>anger, annoyance</td>\n",
 129 |        "    </tr>\n",
 130 |        "    <tr>\n",
 131 |        "      <th>3</th>\n",
 132 |        "      <td>I double dog dare him.</td>\n",
 133 |        "      <td>neutral</td>\n",
 134 |        "    </tr>\n",
 135 |        "    <tr>\n",
 136 |        "      <th>4</th>\n",
 137 |        "      <td>Believe you me. TLJ is much, much worse.</td>\n",
 138 |        "      <td>disappointment, disgust</td>\n",
 139 |        "    </tr>\n",
 140 |        "  </tbody>\n",
 141 |        "</table>\n",
 142 |        "</div>"
 143 |       ],
 144 |       "text/plain": [
 145 |        "                                             comment            emotion label\n",
 146 |        "0  Omg i hope this is about [NAME]. I would LOVE ...                 optimism\n",
 147 |        "1                                             Finale                  neutral\n",
 148 |        "2  Which suggests nothing in itself. The same mod...         anger, annoyance\n",
 149 |        "3                             I double dog dare him.                  neutral\n",
 150 |        "4           Believe you me. TLJ is much, much worse.  disappointment, disgust"
 151 |       ]
 152 |      },
 153 |      "execution_count": 5,
 154 |      "metadata": {},
 155 |      "output_type": "execute_result"
 156 |     }
 157 |    ],
 158 |    "source": [
 159 |     "# Dataset Import\n",
 160 |     "\n",
 161 |     "# Importing with huggingface datasets package\n",
 162 |     "import pandas as pd\n",
 163 |     "from datasets import load_dataset\n",
 164 |     "\n",
 165 |     "df = load_dataset('go_emotions')\n",
 166 |     "\n",
 167 |     "# creating an emotion index label dictionary\n",
 168 |     "label_index = {\n",
 169 |     "    \"0\": \"admiration\",\n",
 170 |     "    \"1\": \"amusement\",\n",
 171 |     "    \"2\": \"anger\",\n",
 172 |     "    \"3\": \"annoyance\",\n",
 173 |     "    \"4\": \"approval\",\n",
 174 |     "    \"5\": \"caring\",\n",
 175 |     "    \"6\": \"confusion\",\n",
 176 |     "    \"7\": \"curiosity\",\n",
 177 |     "    \"8\": \"desire\",\n",
 178 |     "    \"9\": \"disappointment\",\n",
 179 |     "    \"10\": \"disapproval\",\n",
 180 |     "    \"11\": \"disgust\",\n",
 181 |     "    \"12\": \"embarassment\",\n",
 182 |     "    \"13\": \"excitement\",\n",
 183 |     "    \"14\": \"fear\",\n",
 184 |     "    \"15\": \"gratitude\",\n",
 185 |     "    \"16\": \"grief\",\n",
 186 |     "    \"17\": \"joy\",\n",
 187 |     "    \"18\": \"love\",\n",
 188 |     "    \"19\": \"nervousness\",\n",
 189 |     "    \"20\": \"optimism\",\n",
 190 |     "    \"21\": \"pride\",\n",
 191 |     "    \"22\": \"realization\",\n",
 192 |     "    \"23\": \"relief\",\n",
 193 |     "    \"24\": \"remorse\",\n",
 194 |     "    \"25\": \"sadness\",\n",
 195 |     "    \"26\": \"surprise\",\n",
 196 |     "    \"27\": \"neutral\"\n",
 197 |     "}\n",
 198 |     "\n",
 199 |     "# Pull 21 consecutive comments & emotions (train rows 1001-1021)\n",
 200 |     "data = []\n",
 201 |     "for i in range(1001, 1022):\n",
 202 |     "    comment = df['train'][i]['text']\n",
 203 |     "    label_indices = df['train'][i]['labels']\n",
 204 |     "\n",
 205 |     "    # deal with labels\n",
 206 |     "    if not isinstance(label_indices, list):\n",
 207 |     "        label_indices = [label_indices]\n",
 208 |     "\n",
 209 |     "    # label mapping\n",
 210 |     "    emotions = ', '.join([label_index.get(str(label)) for label in label_indices])\n",
 211 |     "\n",
 212 |     "    data.append((comment, emotions))\n",
 213 |     "\n",
 214 |     "comments_df = pd.DataFrame(data, columns=[\"comment\", \"emotion label\"])\n",
 215 |     "\n",
 216 |     "comments_df.head()"
 217 |    ]
 218 |   },
 219 |   {
 220 |    "cell_type": "code",
 221 |    "execution_count": 6,
 222 |    "id": "b25b6398-d003-413f-9609-d9c283378821",
 223 |    "metadata": {},
 224 |    "outputs": [],
 225 |    "source": [
 226 |     "# Putting dataset into langsmith\n",
 227 |     "from langsmith import Client\n",
 228 |     "\n",
 229 |     "client = Client()\n",
 230 |     "dataset_name = \"go_emotions\"\n",
 231 |     "\n",
 232 |     "# Store\n",
 233 |     "dataset = client.create_dataset(\n",
 234 |     "    dataset_name=dataset_name,\n",
 235 |     "    description=\"Social Media Comment and Emotion from HuggingFace Go Emotions\"\n",
 236 |     ")\n",
 237 |     "client.create_examples(\n",
 238 |     "    inputs=[{\"comment\": q} for q in comments_df['comment']],\n",
 239 |     "    outputs=[{\"emotion\": a} for a in comments_df['emotion label']],\n",
 240 |     "    dataset_id=dataset.id\n",
 241 |     ")"
 242 |    ]
 243 |   },
 244 |   {
 245 |    "cell_type": "markdown",
 246 |    "id": "777445b6-e2ef-489d-b5d3-c3454442e9f7",
 247 |    "metadata": {},
 248 |    "source": [
 249 |     "---\n",
 250 |     "# 1: Comparing Models with Custom Evaluation on LangChain Chain\n",
 251 |     "\n",
 252 |     "Classification Task of emotions with GPT-4o, GPT-3.5-T & Fine Tuned GPT-3.5-T"
 253 |    ]
 254 |   },
 255 |   {
 256 |    "cell_type": "markdown",
 257 |    "id": "c106bc1d-52f2-4843-af5e-732320b0158a",
 258 |    "metadata": {},
 259 |    "source": [
 260 |     "### Setting up First \"LLM App\"\n",
 261 |     "\n",
 262 |     "Emotion classification chain. Take in a social media comment, apply one or more of the 27 emotion labels (or neutral) to it. \n",
 263 |     "\n",
 264 |     "We will be setting up this chain with 3 models. Base GPT-4o, Base GPT-3.5-Turbo, Fine Tuned GPT-3.5-Turbo on the Go Emotions Dataset"
 265 |    ]
 266 |   },
 267 |   {
 268 |    "cell_type": "code",
 269 |    "execution_count": 7,
 270 |    "id": "a32c42a6-d0cd-4d41-81b9-657cc88133df",
 271 |    "metadata": {},
 272 |    "outputs": [],
 273 |    "source": [
 274 |     "# Setting Up Chain\n",
 275 |     "from langchain_openai import ChatOpenAI\n",
 276 |     "from langchain.prompts import ChatPromptTemplate\n",
 277 |     "from langchain_core.runnables import RunnablePassthrough\n",
 278 |     "from langchain_core.output_parsers import StrOutputParser\n",
 279 |     "\n",
 280 |     "emotion_analysis_template = \"\"\"\n",
 281 |     "You are a cutting edge emotion analysis classification assistant.\\\n",
 282 |     "You analyze a comment, and apply one or more emotion labels to it. \\\n",
 283 |     "\n",
 284 |     "The emotion labels are detailed here: \\\n",
 285 |     "\n",
 286 |     "['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']\n",
 287 |     "\n",
 288 |     "Your output should simply be just the respective emotion, and if there are multiple seperated with commas. \\\n",
 289 |     "\n",
 290 |     "The comment is here: {comment}\n",
 291 |     "\"\"\"\n",
 292 |     "\n",
 293 |     "output_parser = StrOutputParser()\n",
 294 |     "\n",
 295 |     "# different models to plug in (plus the fine tuned one!)\n",
 296 |     "gpt4o_llm = ChatOpenAI(temperature=0.0, model=\"gpt-4o\")\n",
 297 |     "ft_llm = ChatOpenAI(temperature=0.0, model=\"ft:gpt-3.5-turbo-0125:personal:go-emotions:95jDha5f\")\n",
 298 |     "gpt35t_llm = ChatOpenAI(temperature=0.0, model=\"gpt-3.5-turbo-0125\")\n",
 299 |     "\n",
 300 |     "emotion_analysis_prompt = ChatPromptTemplate.from_template(emotion_analysis_template)\n",
 301 |     "\n",
 302 |     "analysis_chain_gpt35t = (\n",
 303 |     "    {\"comment\": RunnablePassthrough()} \n",
 304 |     "    | emotion_analysis_prompt\n",
 305 |     "    | gpt35t_llm\n",
 306 |     "    | output_parser\n",
 307 |     ")\n",
 308 |     "\n",
 309 |     "analysis_chain_gpt4o = (\n",
 310 |     "    {\"comment\": RunnablePassthrough()} \n",
 311 |     "    | emotion_analysis_prompt\n",
 312 |     "    | gpt4o_llm\n",
 313 |     "    | output_parser\n",
 314 |     ")\n",
 315 |     "\n",
 316 |     "analysis_chain_ft = (\n",
 317 |     "    {\"comment\": RunnablePassthrough()} \n",
 318 |     "    | emotion_analysis_prompt\n",
 319 |     "    | ft_llm\n",
 320 |     "    | output_parser\n",
 321 |     ")"
 322 |    ]
 323 |   },
 324 |   {
 325 |    "cell_type": "markdown",
 326 |    "id": "40e4e1d5-b7e5-46fe-880c-f195f394e9ef",
 327 |    "metadata": {},
 328 |    "source": [
 329 |     "### Defining a custom evaluator\n",
 330 |     "\n",
 331 |     "Currently we have two pieces of data\n",
 332 |     "1. The dataset social media comment\n",
 333 |     "2. The dataset assigned emotion label(s)\n",
 334 |     "\n",
 335 |     "We want to evaluate model performance on (1) the dataset's social media comment by comparing the model output against (2) the dataset's assigned emotion label(s).\n",
 336 |     "\n",
 337 |     "The function below assigns an \"is_same\" score of 1 for an exact match, 0.5 if the LLM output partially overlaps the expected label(s), or 0 if there is no overlap. The result is returned as a dictionary with a key and a score.\n",
 338 |     "\n",
 339 |     "To set this up, we have to specify the `Run` and `Example`. `Run` is the LLM \"run\" being evaluated, whereas `Example` is what's in the dataset."
 340 |    ]
 341 |   },
 342 |   {
 343 |    "cell_type": "code",
 344 |    "execution_count": 8,
 345 |    "id": "2779b33d-e071-4020-9865-d717351bd59f",
 346 |    "metadata": {},
 347 |    "outputs": [],
 348 |    "source": [
 349 |     "from langsmith.schemas import Run, Example\n",
 350 |     "from langsmith.evaluation import evaluate\n",
 351 |     "\n",
 352 |     "def expected_eval(run: Run, example: Example) -> dict:\n",
 353 |     "    # Getting the emotions and response as a set \n",
 354 |     "    expected_answer = set(example.outputs.get(\"emotion\").split(\", \"))\n",
 355 |     "    response = set(run.outputs.get(\"output\").split(\", \"))\n",
 356 |     "\n",
 357 |     "    # Check if response matches the expected answer exactly\n",
 358 |     "    if response == expected_answer:\n",
 359 |     "        return {\"key\": \"is_same\", \"score\": 1}\n",
 360 |     "    # Check if there is any overlap (partial match)\n",
 361 |     "    elif response & expected_answer:\n",
 362 |     "        return {\"key\": \"is_same\", \"score\": 0.5}\n",
 363 |     "    # No overlap at all\n",
 364 |     "    else:\n",
 365 |     "        return {\"key\": \"is_same\", \"score\": 0}\n",
 366 |     "\n"
 367 |    ]
 368 |   },
 369 |   {
 370 |    "cell_type": "markdown",
 371 |    "id": "790ece44-385c-4f5d-bd90-f37f7d60c74d",
 372 |    "metadata": {},
 373 |    "source": [
 374 |     "### Using evaluate() to run your evaluations\n",
 375 |     "\n",
 376 |     "`evaluate()` needs a few arguments: the function (or, in this case, the chain) to evaluate, the dataset to compare against, the evaluator(s) as a list (multiple can run at a time, hence the list), and an experiment prefix for identification. You can also pass in any metadata as a dictionary.\n",
 377 |     "\n",
 378 |     "#### Evaluating is_same score on base gpt-3.5-turbo output against go_emotions dataset"
 379 |    ]
 380 |   },
 381 |   {
 382 |    "cell_type": "code",
 383 |    "execution_count": 9,
 384 |    "id": "f615d21d-107f-4934-8943-9ff082b6d103",
 385 |    "metadata": {},
 386 |    "outputs": [
 387 |     {
 388 |      "name": "stdout",
 389 |      "output_type": "stream",
 390 |      "text": [
 391 |       "View the evaluation results for experiment: 'test-gpt35t-expected_answer-c71af42f' at:\n",
 392 |       "https://smith.langchain.com/o/ef6f5694-a2fa-5316-9158-12297cd17350/datasets/ab3b5df2-9b7c-42ae-85b2-5a1a3e6bd96d/compare?selectedSessions=b00fe5eb-67c1-4d86-8530-d3378fc7215b\n",
 393 |       "\n",
 394 |       "\n"
 395 |      ]
 396 |     },
 397 |     {
 398 |      "data": {
 399 |       "application/vnd.jupyter.widget-view+json": {
 400 |        "model_id": "d34a5af06e7340b0b9cc92d7b606128e",
 401 |        "version_major": 2,
 402 |        "version_minor": 0
 403 |       },
 404 |       "text/plain": [
 405 |        "0it [00:00, ?it/s]"
 406 |       ]
 407 |      },
 408 |      "metadata": {},
 409 |      "output_type": "display_data"
 410 |     }
 411 |    ],
 412 |    "source": [
 413 |     "# Evaluators\n",
 414 |     "qa_evaluator = [expected_eval]\n",
 415 |     "dataset_name = 'go_emotions'\n",
 416 |     "\n",
 417 |     "# Base Model gpt-3.5-turbo Run\n",
 418 |     "base_gpt35t_eval = evaluate(\n",
 419 |     "    analysis_chain_gpt35t.invoke,\n",
 420 |     "    data=dataset_name,\n",
 421 |     "    evaluators=qa_evaluator,\n",
 422 |     "    experiment_prefix=\"test-gpt35t-expected_answer\",\n",
 423 |     "    metadata={\n",
 424 |     "        \"variant\": \"base model gpt-3.5-turbo\"\n",
 425 |     "    }\n",
 426 |     ")"
 427 |    ]
 428 |   },
 429 |   {
 430 |    "cell_type": "markdown",
 431 |    "id": "6a1ed570-0b32-4b42-81e6-7508c5ef6007",
 432 |    "metadata": {},
 433 |    "source": [
 434 |     "#### Evaluating is_same score on base gpt-4o output against go_emotions dataset"
 435 |    ]
 436 |   },
 437 |   {
 438 |    "cell_type": "code",
 439 |    "execution_count": 10,
 440 |    "id": "0a88f287-37b3-4b3a-9168-7c61f711daf0",
 441 |    "metadata": {},
 442 |    "outputs": [
 443 |     {
 444 |      "name": "stdout",
 445 |      "output_type": "stream",
 446 |      "text": [
 447 |       "View the evaluation results for experiment: 'test-gpt4o-expected_answer-2cd9e55d' at:\n",
 448 |       "https://smith.langchain.com/o/ef6f5694-a2fa-5316-9158-12297cd17350/datasets/ab3b5df2-9b7c-42ae-85b2-5a1a3e6bd96d/compare?selectedSessions=e966e0f8-66cb-470c-8f7b-8e16b104b6da\n",
 449 |       "\n",
 450 |       "\n"
 451 |      ]
 452 |     },
 453 |     {
 454 |      "data": {
 455 |       "application/vnd.jupyter.widget-view+json": {
 456 |        "model_id": "717b86c950964ad2b3c7f5b14f169e7c",
 457 |        "version_major": 2,
 458 |        "version_minor": 0
 459 |       },
 460 |       "text/plain": [
 461 |        "0it [00:00, ?it/s]"
 462 |       ]
 463 |      },
 464 |      "metadata": {},
 465 |      "output_type": "display_data"
 466 |     }
 467 |    ],
 468 |    "source": [
 469 |     "# Base Model gpt-4o Run\n",
 470 |     "base_gpt4o_eval = evaluate(\n",
 471 |     "    analysis_chain_gpt4o.invoke,\n",
 472 |     "    data=dataset_name,\n",
 473 |     "    evaluators=qa_evaluator,\n",
 474 |     "    experiment_prefix=\"test-gpt4o-expected_answer\",\n",
 475 |     "    metadata={\n",
 476 |     "        \"variant\": \"base model gpt-4o\"\n",
 477 |     "    }\n",
 478 |     ")"
 479 |    ]
 480 |   },
 481 |   {
 482 |    "cell_type": "markdown",
 483 |    "id": "90c1008f-157a-4c01-8ae0-a3dbc77bc4bc",
 484 |    "metadata": {},
 485 |    "source": [
 486 |     "#### Evaluating is_same score on fine tuned gpt-3.5-turbo output against go_emotions dataset"
 487 |    ]
 488 |   },
 489 |   {
 490 |    "cell_type": "code",
 491 |    "execution_count": 11,
 492 |    "id": "2b78bf78-f982-4615-af2e-8b1e0d092631",
 493 |    "metadata": {},
 494 |    "outputs": [
 495 |     {
 496 |      "name": "stdout",
 497 |      "output_type": "stream",
 498 |      "text": [
 499 |       "View the evaluation results for experiment: 'test-ft-3.5t-expected_answer-12892071' at:\n",
 500 |       "https://smith.langchain.com/o/ef6f5694-a2fa-5316-9158-12297cd17350/datasets/ab3b5df2-9b7c-42ae-85b2-5a1a3e6bd96d/compare?selectedSessions=6504ca10-7e42-4c56-a4c8-c29264025a5a\n",
 501 |       "\n",
 502 |       "\n"
 503 |      ]
 504 |     },
 505 |     {
 506 |      "data": {
 507 |       "application/vnd.jupyter.widget-view+json": {
 508 |        "model_id": "17506acabedf4a68b1de9eb8534732ed",
 509 |        "version_major": 2,
 510 |        "version_minor": 0
 511 |       },
 512 |       "text/plain": [
 513 |        "0it [00:00, ?it/s]"
 514 |       ]
 515 |      },
 516 |      "metadata": {},
 517 |      "output_type": "display_data"
 518 |     }
 519 |    ],
 520 |    "source": [
 521 |     "# Base Model fine-tuned gpt-3.5-turbo Run\n",
 522 |     "ft_gpt35t_eval = evaluate(\n",
 523 |     "    analysis_chain_ft.invoke,\n",
 524 |     "    data=dataset_name,\n",
 525 |     "    evaluators=qa_evaluator,\n",
 526 |     "    experiment_prefix=\"test-ft-3.5t-expected_answer\",\n",
 527 |     "    metadata={\n",
 528 |     "        \"variant\": \"fine tuned gpt-3.5-turbo\"\n",
 529 |     "    }\n",
 530 |     ")"
 531 |    ]
 532 |   },
 533 |   {
 534 |    "cell_type": "markdown",
 535 |    "id": "cdbdf47f-2370-42c5-90f4-df194450c6d3",
 536 |    "metadata": {},
 537 |    "source": [
 538 |     "---\n",
 539 |     "# 2: Assessing Model Output Using an LLM-As-Judge Approach\n",
 540 |     "\n",
 541 |     "Using built-in evaluators to assess model performance with another model!"
 542 |    ]
 543 |   },
 544 |   {
 545 |    "cell_type": "markdown",
 546 |    "id": "cc2daff1-108a-4d8a-8683-3b9e3c7e7b3d",
 547 |    "metadata": {},
 548 |    "source": [
 549 |     "#### Creating a new dataset of question and answer pairs\n",
 550 |     "\n",
 551 |     "Website of interest: https://lilianweng.github.io/posts/2023-06-23-agent/"
 552 |    ]
 553 |   },
 554 |   {
 555 |    "cell_type": "code",
 556 |    "execution_count": 12,
 557 |    "id": "7517c95e-07ec-46c6-9885-5a58c786c5a4",
 558 |    "metadata": {},
 559 |    "outputs": [],
 560 |    "source": [
 561 |     "# Loading A Web Page\n",
 562 |     "import requests\n",
 563 |     "from bs4 import BeautifulSoup\n",
 564 |     "url = 'https://lilianweng.github.io/posts/2023-06-23-agent/'\n",
 565 |     "response = requests.get(url)\n",
 566 |     "soup = BeautifulSoup(response.content, 'html.parser')\n",
 567 |     "text = [p.text for p in soup.find_all('p')]\n",
 568 |     "full_text = '\\n'.join(text)"
 569 |    ]
 570 |   },
 571 |   {
 572 |    "cell_type": "code",
 573 |    "execution_count": 13,
 574 |    "id": "8bd4fc57-ff83-4de7-8ac4-814c4bf1d717",
 575 |    "metadata": {},
 576 |    "outputs": [],
 577 |    "source": [
 578 |     "# Example Questions\n",
 579 |     "inputs = [\n",
 580 |     "    \"What is the primary function of LLM in autonomous agents?\",\n",
 581 |     "    \"Can you describe the role of 'Planning' in LLM-powered autonomous agents?\",\n",
 582 |     "    \"What types of memory are utilized by LLM-powered agents?\",\n",
 583 |     "    \"How do autonomous agents use tool APIs?\",\n",
 584 |     "    \"What are some challenges faced by LLM-powered autonomous agents in real-world applications?\"\n",
 585 |     "]\n",
 586 |     "\n",
 587 |     "outputs = [\n",
 588 |     "    \"LLM functions as the core controller or 'brain' of autonomous agents, enabling them to handle complex tasks through planning, memory, and tool use.\",\n",
 589 |     "    \"In LLM-powered agents, 'Planning' involves breaking down complex tasks into manageable subgoals, reflecting on past actions, and refining strategies for improved outcomes.\",\n",
 590 |     "    \"LLM-powered agents utilize short-term memory for in-context learning and long-term memory for retaining and recalling information over extended periods, often leveraging external vector stores.\",\n",
 591 |     "    \"Autonomous agents use tool APIs to extend their capabilities beyond the model's weights, allowing access to current information, code execution, and proprietary data.\",\n",
 592 |     "    \"Challenges include managing the complexity of task dependencies, maintaining the stability of model outputs, and ensuring efficient interaction with external models and APIs.\"\n",
 593 |     "]\n",
 594 |     "\n",
 595 |     "# Dataset\n",
 596 |     "qa_pairs = [{\"question\": q, \"answer\": a} for q, a in zip(inputs, outputs)]\n",
 597 |     "df = pd.DataFrame(qa_pairs)"
 598 |    ]
 599 |   },
 600 |   {
 601 |    "cell_type": "code",
 602 |    "execution_count": 14,
 603 |    "id": "0feea2e5-c9b3-450a-b1ce-c275d173d05b",
 604 |    "metadata": {},
 605 |    "outputs": [],
 606 |    "source": [
 607 |     "# Putting dataset into langsmith\n",
 608 |     "from langsmith import Client\n",
 609 |     "\n",
 610 |     "client = Client()\n",
 611 |     "dataset_name = \"agent_dataset\"\n",
 612 |     "\n",
 613 |     "# Store\n",
 614 |     "dataset = client.create_dataset(\n",
 615 |     "    dataset_name=dataset_name,\n",
 616 |     "    description=\"QA pairs Lilian Weng's AI Agents Blog Post.\"\n",
 617 |     ")\n",
 618 |     "client.create_examples(\n",
 619 |     "    inputs=[{\"question\": q} for q in inputs],\n",
 620 |     "    outputs=[{\"answer\": a} for a in outputs],\n",
 621 |     "    dataset_id=dataset.id\n",
 622 |     ")"
 623 |    ]
 624 |   },
 625 |   {
 626 |    "cell_type": "markdown",
 627 |    "id": "3e6c18d9-4fcf-418b-b04f-1252006de752",
 628 |    "metadata": {},
 629 |    "source": [
 630 |     "### Defining \"apps\" to test\n",
 631 |     "\n",
 632 |     "Two LLM \"apps\" will be tested. Both are simple question-answering setups, with the text of the web page above inserted into the prompt as context.\n",
 633 |     "1. OpenAI gpt-4o Q/A\n",
 634 |     "2. Mistral 7b Q/A"
 635 |    ]
 636 |   },
 637 |   {
 638 |    "cell_type": "code",
 639 |    "execution_count": 15,
 640 |    "id": "9a734444-6d0b-4a62-9191-a0ff7f67bb48",
 641 |    "metadata": {},
 642 |    "outputs": [],
 643 |    "source": [
 644 |     "# OpenAI API\n",
 645 |     "import openai\n",
 646 |     "from langsmith.wrappers import wrap_openai\n",
 647 |     "openai_client = wrap_openai(openai.Client())\n",
 648 |     "\n",
 649 |     "def qa_oai(inputs: dict) -> dict:\n",
 650 |     "    system_msg = f\"Answer the user's question in 2-3 sentences using this context: \\n\\n\\n {full_text}\"\n",
 651 |     "    \n",
 652 |     "    messages = [{\"role\": \"system\", \"content\": system_msg},\n",
 653 |     "                {\"role\": \"user\", \"content\": inputs[\"question\"]}]\n",
 654 |     "\n",
 655 |     "    response = openai_client.chat.completions.create(messages=messages, model=\"gpt-4o\")\n",
 656 |     "\n",
 657 |     "    return {\"answer\": response.dict()['choices'][0]['message']['content']}"
 658 |    ]
 659 |   },
 660 |   {
 661 |    "cell_type": "code",
 662 |    "execution_count": 16,
 663 |    "id": "e1554ec4-90b9-489f-97bc-edb31473f0fa",
 664 |    "metadata": {},
 665 |    "outputs": [],
 666 |    "source": [
 667 |     "# Ollama API\n",
 668 |     "import ollama\n",
 669 |     "from langsmith.run_helpers import traceable\n",
 670 |     "\n",
 671 |     "@traceable(run_type=\"llm\")\n",
 672 |     "def call_ollama(messages, model: str):\n",
 673 |     "    stream = ollama.chat(messages=messages, model='mistral', stream=True)\n",
 674 |     "    response = ''\n",
 675 |     "    for chunk in stream:\n",
 676 |     "        print(chunk['message']['content'], end='', flush=True)\n",
 677 |     "        response = response + chunk['message']['content']\n",
 678 |     "    return response\n",
 679 |     "\n",
 680 |     "def qa_mistral(inputs: dict) -> dict:\n",
 681 |     "    system_msg = f\"Answer the user's question using this context: \\n\\n\\n {full_text}\"\n",
 682 |     "    \n",
 683 |     "    messages = [{\"role\": \"system\", \"content\": system_msg},\n",
 684 |     "                {\"role\": \"user\", \"content\": f'Answer the question in 2-3 sentences {inputs[\"question\"]}' }]\n",
 685 |     "    \n",
 686 |     "    response = call_ollama(messages, model=\"mistral\")\n",
 687 |     "\n",
 688 |     "    return {\"answer\": response} "
 689 |    ]
 690 |   },
 691 |   {
 692 |    "cell_type": "markdown",
 693 |    "id": "3ab7800a-4ff9-418d-8bdb-e456d1e1cdf4",
 694 |    "metadata": {},
 695 |    "source": [
 696 |     "### We can now use a built-in feature, the `LangChainStringEvaluator`\n",
 697 |     "\n",
 698 |     "https://docs.smith.langchain.com/old/evaluation/faq/evaluator-implementations\n",
 699 |     "\n",
 700 |     "`LangChainStringEvaluator` has many built-in evaluators and, essentially, evaluates a string based on different `criteria`.\n",
 701 |     "\n",
 702 |     "| Evaluator Name          | Output Key               | Simple Code Example                                                                                      |\n",
 703 |     "|-------------------------|--------------------------|---------------------------------------------------------------------------------------------------------|\n",
 704 |     "| QA                      | correctness              | `LangChainStringEvaluator(\"qa\")`                                                                        |\n",
 705 |     "| Contextual Q&A          | contextual accuracy      | `LangChainStringEvaluator(\"context_qa\")`                                                                |\n",
 706 |     "| Chain of Thought Q&A    | cot contextual accuracy  | `LangChainStringEvaluator(\"cot_qa\")`                                                                    |\n",
 707 |     "| Criteria                | Depends on criteria key  | `LangChainStringEvaluator(\"criteria\", config={ \"criteria\": <criterion> })`                              |\n",
 708 |     "| Labeled Criteria        | Depends on criteria key  | `LangChainStringEvaluator(\"labeled_criteria\", config={ \"criteria\": <criterion> })`                      |\n",
 709 |     "| Score                   | Depends on criteria key  | `LangChainStringEvaluator(\"score_string\", config={ \"criteria\": <criterion>, \"normalize_by\": 10 })`      |\n",
 710 |     "| Labeled Score           | Depends on criteria key  | `LangChainStringEvaluator(\"labeled_score_string\", config={ \"criteria\": <criterion>, \"normalize_by\": 10 })` |\n",
 711 |     "| Embedding Distance      | embedding_cosine_distance| `LangChainStringEvaluator(\"embedding_distance\")`                                                        |\n",
 712 |     "| String Distance         | string_distance          | `LangChainStringEvaluator(\"string_distance\", config={\"distance\": \"damerau_levenshtein\" })`              |\n",
 713 |     "| Exact Match             | exact_match              | `LangChainStringEvaluator(\"exact_match\")`                                                               |\n",
 714 |     "| Regex Match             | regex_match              | `LangChainStringEvaluator(\"regex_match\")`                                                               |\n",
 715 |     "| Json Validity           | json_validity            | `LangChainStringEvaluator(\"json_validity\")`                                                             |\n",
 716 |     "| Json Equality           | json_equality            | `LangChainStringEvaluator(\"json_equality\")`                                                             |\n",
 717 |     "| Json Edit Distance      | json_edit_distance       | `LangChainStringEvaluator(\"json_edit_distance\")`                                                        |\n",
 718 |     "| Json Schema             | json_schema              | `LangChainStringEvaluator(\"json_schema\")`                                                               |\n",
 719 |     "\n",
 720 |     "\n",
 721 |     "`criterion` may be one of the default implemented criteria: `conciseness`, `relevance`, `correctness`, `coherence`, `harmfulness`, `maliciousness`, `helpfulness`, `controversiality`, `misogyny`, and `criminality`.\n",
 722 |     "\n",
 723 |     "Or, you may define your own criteria in a custom dict as follows:\n",
 724 |     "`{ \"criterion_key\": \"criterion description\" }`"
 725 |    ]
 726 |   },
 727 |   {
 728 |    "cell_type": "markdown",
 729 |    "id": "67e45740-c8ba-4040-a5f3-c2ad7a3e272e",
 730 |    "metadata": {},
 731 |    "source": [
 732 |     "For our evaluation, we are going to use `[LangChainStringEvaluator(\"cot_qa\")]` for Chain of Thought contextual accuracy on question answering. This will compare the LLM-generated response to the question with the expected answer from the dataset, using a built-in CoT chain."
 733 |    ]
 734 |   },
 735 |   {
 736 |    "cell_type": "code",
 737 |    "execution_count": 17,
 738 |    "id": "0cc07ae1-509c-4317-a3c5-4a1f18f1c802",
 739 |    "metadata": {},
 740 |    "outputs": [
 741 |     {
 742 |      "name": "stdout",
 743 |      "output_type": "stream",
 744 |      "text": [
 745 |       "View the evaluation results for experiment: 'test-agent-qa-oai-641102aa' at:\n",
 746 |       "https://smith.langchain.com/o/ef6f5694-a2fa-5316-9158-12297cd17350/datasets/8d445ea4-b7d1-4d36-a641-437e4efa4a5b/compare?selectedSessions=4af55dee-9be5-4973-9a56-f16c87ae65aa\n",
 747 |       "\n",
 748 |       "\n"
 749 |      ]
 750 |     },
 751 |     {
 752 |      "data": {
 753 |       "application/vnd.jupyter.widget-view+json": {
 754 |        "model_id": "6de31ee911d34df29aaa87194c020e69",
 755 |        "version_major": 2,
 756 |        "version_minor": 0
 757 |       },
 758 |       "text/plain": [
 759 |        "0it [00:00, ?it/s]"
 760 |       ]
 761 |      },
 762 |      "metadata": {},
 763 |      "output_type": "display_data"
 764 |     }
 765 |    ],
 766 |    "source": [
 767 |     "from langsmith.evaluation import LangChainStringEvaluator\n",
 768 |     "\n",
 769 |     "qa_evaluator = [LangChainStringEvaluator(\"cot_qa\")]\n",
 770 |     "dataset_name = \"agent_dataset\"\n",
 771 |     "\n",
 772 |     "oai_cot_eval = evaluate(\n",
 773 |     "    qa_oai,\n",
 774 |     "    data=dataset_name,\n",
 775 |     "    evaluators=qa_evaluator,\n",
 776 |     "    experiment_prefix=\"test-agent-qa-oai\",\n",
 777 |     "    # Any experiment metadata can be specified here\n",
 778 |     "    metadata={\n",
 779 |     "        \"variant\": \"full website in context window with gpt-4o\"\n",
 780 |     "    }\n",
 781 |     ")"
 782 |    ]
 783 |   },
 784 |   {
 785 |    "cell_type": "code",
 786 |    "execution_count": 18,
 787 |    "id": "70251d04-1061-4a6f-a939-c8831f0f5713",
 788 |    "metadata": {},
 789 |    "outputs": [
 790 |     {
 791 |      "name": "stdout",
 792 |      "output_type": "stream",
 793 |      "text": [
 794 |       "View the evaluation results for experiment: 'test-agent-qa-mistral-6881f27b' at:\n",
 795 |       "https://smith.langchain.com/o/ef6f5694-a2fa-5316-9158-12297cd17350/datasets/8d445ea4-b7d1-4d36-a641-437e4efa4a5b/compare?selectedSessions=07c0e0f4-fee9-42da-ac77-b6cf914997c8\n",
 796 |       "\n",
 797 |       "\n"
 798 |      ]
 799 |     },
 800 |     {
 801 |      "data": {
 802 |       "application/vnd.jupyter.widget-view+json": {
 803 |        "model_id": "93601f9d96794fc1b19eece0da5c8df9",
 804 |        "version_major": 2,
 805 |        "version_minor": 0
 806 |       },
 807 |       "text/plain": [
 808 |        "0it [00:00, ?it/s]"
 809 |       ]
 810 |      },
 811 |      "metadata": {},
 812 |      "output_type": "display_data"
 813 |     },
 814 |     {
 815 |      "name": "stdout",
 816 |      "output_type": "stream",
 817 |      "text": [
 818 |       " Planning plays a crucial role in LLM-powered autonomous agents as it enables them to adjust their actions based on long-term goals and unexpected errors. However, current LLMs face challenges in planning over extended periods and decomposing tasks effectively, making them less robust compared to humans. Techniques such as self-reflection, vector search, tool augmentation, and reinforcement learning are being explored to enhance the planning capabilities of LLMs. LLM-powered agents utilize different types of memory, including dynamic memory for storing and reflecting on past experiences, and external knowledge sources such as databases or APIs for accessing additional information. Some agents also incorporate vector stores and retrieval systems for efficient access to large knowledge pools. Autonomous agents use tool APIs by integrating them with large language models, allowing the agents to access external knowledge and perform specific tasks more efficiently. This enables the agents to expand their capabilities beyond their internal knowledge base and improve their problem-solving abilities. Examples of tool APIs include database access, web search engines, and scientific research tools. The primary function of a Large Language Model (LLM) in autonomous agents is to process natural language instructions and generate responses or actions based on that input. It serves as the brain of the agent, interpreting data from external components and making decisions through reasoning and problem-solving capabilities. Some challenges faced by LLM-powered autonomous agents in real-world applications include finite context length, which limits historical information and instruction inclusion; difficulties in long-term planning and task decomposition; and the reliability of natural language interfaces due to potential formatting errors or rebellious behavior from the models."
 819 |      ]
 820 |     }
 821 |    ],
 822 |    "source": [
 823 |     "mistral_cot_eval = evaluate(\n",
 824 |     "    qa_mistral,\n",
 825 |     "    data=dataset_name,\n",
 826 |     "    evaluators=qa_evaluator,\n",
 827 |     "    experiment_prefix=\"test-agent-qa-mistral\",\n",
 828 |     "    # Any experiment metadata can be specified here\n",
 829 |     "    metadata={\n",
 830 |     "        \"variant\": \"full website in context window with Mistral 7b\"\n",
 831 |     "    }\n",
 832 |     ")"
 833 |    ]
 834 |   },
 835 |   {
 836 |    "cell_type": "markdown",
 837 |    "id": "1cec334e-0e66-4e2e-b281-276cb295086c",
 838 |    "metadata": {},
 839 |    "source": [
 840 |     "### Trying out one more built-in criterion: helpfulness"
 841 |    ]
 842 |   },
 843 |   {
 844 |    "cell_type": "code",
 845 |    "execution_count": 19,
 846 |    "id": "b2b0f316-a21b-4da1-90a0-838f4bd181a0",
 847 |    "metadata": {},
 848 |    "outputs": [
 849 |     {
 850 |      "name": "stdout",
 851 |      "output_type": "stream",
 852 |      "text": [
 853 |       "View the evaluation results for experiment: 'test-agent-qa-oai-helpfulness-3361d6cb' at:\n",
 854 |       "https://smith.langchain.com/o/ef6f5694-a2fa-5316-9158-12297cd17350/datasets/8d445ea4-b7d1-4d36-a641-437e4efa4a5b/compare?selectedSessions=1a6be94f-4f7b-4ef9-863c-ed2be1d3b441\n",
 855 |       "\n",
 856 |       "\n"
 857 |      ]
 858 |     },
 859 |     {
 860 |      "data": {
 861 |       "application/vnd.jupyter.widget-view+json": {
 862 |        "model_id": "acf3dc4b99d84108b9b92e3f420c8daa",
 863 |        "version_major": 2,
 864 |        "version_minor": 0
 865 |       },
 866 |       "text/plain": [
 867 |        "0it [00:00, ?it/s]"
 868 |       ]
 869 |      },
 870 |      "metadata": {},
 871 |      "output_type": "display_data"
 872 |     }
 873 |    ],
 874 |    "source": [
 875 |     "oai_helpfulness_eval = evaluate(\n",
 876 |     "    qa_oai,\n",
 877 |     "    data=dataset_name,\n",
 878 |     "    evaluators=[LangChainStringEvaluator(\"criteria\", config={ \"criteria\": \"helpfulness\" })],\n",
 879 |     "    experiment_prefix=\"test-agent-qa-oai-helpfulness\",\n",
 880 |     "    metadata={\n",
 881 |     "        \"variant\": \"full website in context window with gpt-4o, Helpfulness check\"\n",
 882 |     "    }\n",
 883 |     ")"
 884 |    ]
 885 |   },
 886 |   {
 887 |    "cell_type": "code",
 888 |    "execution_count": 20,
 889 |    "id": "451c9a31-dce9-4180-a8eb-a334d3680155",
 890 |    "metadata": {},
 891 |    "outputs": [
 892 |     {
 893 |      "name": "stdout",
 894 |      "output_type": "stream",
 895 |      "text": [
 896 |       "View the evaluation results for experiment: 'test-agent-qa-mistral-helpfulness-d223aa3b' at:\n",
 897 |       "https://smith.langchain.com/o/ef6f5694-a2fa-5316-9158-12297cd17350/datasets/8d445ea4-b7d1-4d36-a641-437e4efa4a5b/compare?selectedSessions=7245e36d-b143-4cfa-a15c-9f5714885636\n",
 898 |       "\n",
 899 |       "\n"
 900 |      ]
 901 |     },
 902 |     {
 903 |      "data": {
 904 |       "application/vnd.jupyter.widget-view+json": {
 905 |        "model_id": "2ec5ff558e68484995752351ac5ddec3",
 906 |        "version_major": 2,
 907 |        "version_minor": 0
 908 |       },
 909 |       "text/plain": [
 910 |        "0it [00:00, ?it/s]"
 911 |       ]
 912 |      },
 913 |      "metadata": {},
 914 |      "output_type": "display_data"
 915 |     },
 916 |     {
 917 |      "name": "stdout",
 918 |      "output_type": "stream",
 919 |      "text": [
 920 |       " Autonomous agents utilize tool APIs by integrating them with large language models, enabling the agents to execute specific tasks and access external knowledge sources. This modular architecture enhances the capabilities of the agents, allowing them to perform complex operations and interact with various systems. (References: [11], [15], [17], [20]) In LLM-powered autonomous agents, planning plays a crucial role in enabling long-term goal achievement and effective task decomposition. It allows the agent to adjust plans when faced with unexpected errors, making it more robust compared to humans who learn from trial and error. However, current LLMs struggle with reliably generating accurate and formatted outputs for interface communication, which can limit their planning capabilities. Ongoing research focuses on improving these aspects, such as incorporating feedback mechanisms, synergizing reasoning and acting, and developing modular architectures that combine large language models with external knowledge sources and discrete reasoning. LLM-powered agents typically utilize two types of memory: internal memory, which is the agent's own knowledge base and working memory for storing information relevant to the current task, and external memory or databases, which store additional information that the agent may need to access during problem solving. The choice and combination of these memory types depend on the specific use case and design of the LLM-powered agent. In autonomous agents, Large Language Models (LLMs) serve as the intelligent core that processes natural language instructions and generates responses or actions based on the given context. They enable the agent to understand, reason, and generate human-like text, making them essential for tasks involving communication, problem solving, and interaction with external components. 
Some challenges faced by LLM-powered autonomous agents in real-world applications include finite context length, which limits the inclusion of historical information and detailed instructions, and reliability of natural language interfaces, as LLMs may make formatting errors or exhibit rebellious behavior. These limitations impact planning capabilities, task decomposition, and overall robustness of the agents. (Refer to [1-20] for more details.)"
 921 |      ]
 922 |     }
 923 |    ],
 924 |    "source": [
 925 |     "mistral_helpfulness_eval = evaluate(\n",
 926 |     "    qa_mistral,\n",
 927 |     "    data=dataset_name,\n",
 928 |     "    evaluators=[LangChainStringEvaluator(\"criteria\", config={ \"criteria\": \"helpfulness\" })],\n",
 929 |     "    experiment_prefix=\"test-agent-qa-mistral-helpfulness\",\n",
 930 |     "    metadata={\n",
 931 |     "        \"variant\": \"full website in context window with Mistral 7b, helpfulness check\"\n",
 932 |     "    }\n",
 933 |     ")"
 934 |    ]
 935 |   },
 936 |   {
 937 |    "cell_type": "markdown",
 938 |    "id": "99cd9178-d45b-4455-abb0-f590c2ab5ee7",
 939 |    "metadata": {},
 940 |    "source": [
 941 |     "---\n",
 942 |     "# Digging in Deeper: Using LLM-As-A-Judge with Custom Criteria\n",
 943 |     "\n",
 944 |     "Same as above, but with our own custom criteria now!\n",
 945 |     "\n",
 946 |     "* `criteria` for binary scoring\n",
 947 |     "* `score_string` for numeric scoring\n",
 948 |     "* `labeled_criteria` evaluator instructs an LLM to assess if a prediction satisfies the criteria, taking into account the reference label\n",
 949 |     "* `labeled_score_string` evaluator has the LLM score the prediction on a numeric scale based on how well it satisfies the criteria compared to the reference"
 950 |    ]
 951 |   },
 952 |   {
 953 |    "cell_type": "markdown",
 954 |    "id": "b71991cb-4d51-41f1-b84f-69d811793855",
 955 |    "metadata": {},
 956 |    "source": [
 957 |     "### Unlabeled (no ground truth; the LLM assesses at face value)"
 958 |    ]
 959 |   },
 960 |   {
 961 |    "cell_type": "code",
 962 |    "execution_count": 22,
 963 |    "id": "d537165b-3d6e-4791-a8a2-f79fc4f070e4",
 964 |    "metadata": {},
 965 |    "outputs": [
 966 |     {
 967 |      "name": "stderr",
 968 |      "output_type": "stream",
 969 |      "text": [
 970 |       "This chain was only tested with GPT-4. Performance may be significantly worse with other models.\n"
 971 |      ]
 972 |     }
 973 |    ],
 974 |    "source": [
 975 |     "eval_objectivity_scorestring = LangChainStringEvaluator(\n",
 976 |     "    \"score_string\",\n",
 977 |     "    config={\n",
 978 |     "        \"criteria\": {\n",
 979 |     "            \"objectivity\": \"On a scale of 1-10, how unbiased and impartial is this submission? Rate it based on the extent to which it presents information and facts without personal opinions or slanted perspectives, aligning with the objective truth of the topic.\",\n",
 980 |     "        },\n",
 981 |     "        \"normalize_by\": 10,\n",
 982 |     "    }  \n",
 983 |     ")\n",
 984 |     "\n",
 985 |     "eval_objectivity_criteria = LangChainStringEvaluator(\n",
 986 |     "    \"criteria\",\n",
 987 |     "    config={\n",
 988 |     "        \"criteria\": {\n",
 989 |     "            \"objectivity\": \"Is this submission unbiased and impartial, presenting information and facts without personal opinions or slanted perspectives, ensuring it aligns with the objective truth of the topic?\",\n",
 990 |     "        }\n",
 991 |     "    }\n",
 992 |     ")"
 993 |    ]
 994 |   },
 995 |   {
 996 |    "cell_type": "markdown",
 997 |    "id": "03662ffa-1b26-41c8-bbcb-985f4598e188",
 998 |    "metadata": {},
 999 |    "source": [
1000 |     "### Labeled (with reference to data)\n",
1001 |     "\n",
1002 |     "This requires an extra `prepare_data` argument: a function that takes the `Run` and `Example` and returns a dictionary with the `prediction`, `reference`, and `input` keys. Similar to above, these values are pulled from the `Run` (LLM output) or the `Example` (dataset)."
1003 |    ]
1004 |   },
1005 |   {
1006 |    "cell_type": "code",
1007 |    "execution_count": 21,
1008 |    "id": "07b3e5f3-6d54-4c56-a9dd-06545a7161cb",
1009 |    "metadata": {},
1010 |    "outputs": [],
1011 |    "source": [
1012 |     "eval_labeled_objectivity_criteria = LangChainStringEvaluator(\n",
1013 |     "    \"labeled_criteria\",\n",
1014 |     "    config={\n",
1015 |     "        \"criteria\": {\n",
1016 |     "            \"objectivity\": (\n",
1017 |     "                \"Is this submission unbiased and impartial, presenting information and facts without personal opinions or slanted perspectives, ensuring it aligns with the objective truth of the topic?\"\n",
1018 |     "            )\n",
1019 |     "        }\n",
1020 |     "    },\n",
1021 |     "    prepare_data=lambda run, example: {\n",
1022 |     "        \"prediction\": run.outputs[\"answer\"],\n",
1023 |     "        \"reference\": example.outputs[\"answer\"],\n",
1024 |     "        \"input\": example.inputs[\"question\"],   \n",
1025 |     "    }\n",
1026 |     ")\n",
1027 |     "\n",
1028 |     "eval_labeled_objectivity_scorestring = LangChainStringEvaluator(\n",
1029 |     "    \"labeled_score_string\", \n",
1030 |     "    config={\n",
1031 |     "        \"criteria\": { \n",
1032 |     "            \"objectivity\": \"On a scale of 1-10, how unbiased and impartial is this submission? Rate it based on the extent to which it presents information and facts without personal opinions or slanted perspectives, aligning with the objective truth of the topic.\"\n",
1033 |     "        },\n",
1034 |     "        \"normalize_by\": 10,\n",
1035 |     "    },\n",
1036 |     "    prepare_data=lambda run, example: {\n",
1037 |     "        \"prediction\": run.outputs[\"answer\"], \n",
1038 |     "        \"reference\": example.outputs[\"answer\"],\n",
1039 |     "        \"input\": example.inputs[\"question\"],\n",
1040 |     "    }  \n",
1041 |     ")"
1042 |    ]
1043 |   },
1044 |   {
1045 |    "cell_type": "markdown",
1046 |    "id": "e13ec288-e22c-48f1-98d7-aa836fc7031d",
1047 |    "metadata": {},
1048 |    "source": [
1049 |     "### Running multiple evaluators at once in a list "
1050 |    ]
1051 |   },
1052 |   {
1053 |    "cell_type": "code",
1054 |    "execution_count": 23,
1055 |    "id": "2a0ba522-6a83-4d66-ba66-9f7fdb891f38",
1056 |    "metadata": {},
1057 |    "outputs": [],
1058 |    "source": [
1059 |     "unlabeled_evaluators = [eval_objectivity_scorestring, eval_objectivity_criteria]\n",
1060 |     "labeled_evaluators = [eval_labeled_objectivity_criteria, eval_labeled_objectivity_scorestring]\n",
1061 |     "dataset_name = \"agent_dataset\""
1062 |    ]
1063 |   },
1064 |   {
1065 |    "cell_type": "markdown",
1066 |    "id": "9b4d20d2-9682-40e1-b389-392837a5d678",
1067 |    "metadata": {},
1068 |    "source": [
1069 |     "**Unlabeled evaluators with GPT-4o**"
1070 |    ]
1071 |   },
1072 |   {
1073 |    "cell_type": "code",
1074 |    "execution_count": 24,
1075 |    "id": "690e73a5-4b69-4d2b-bdac-76ce0594e54c",
1076 |    "metadata": {},
1077 |    "outputs": [
1078 |     {
1079 |      "name": "stdout",
1080 |      "output_type": "stream",
1081 |      "text": [
1082 |       "View the evaluation results for experiment: 'test-agent-objectivity-unlabeled-oai-e5900284' at:\n",
1083 |       "https://smith.langchain.com/o/ef6f5694-a2fa-5316-9158-12297cd17350/datasets/8d445ea4-b7d1-4d36-a641-437e4efa4a5b/compare?selectedSessions=d7fc5637-a4c0-447e-9f2b-9dcdfa8fa57f\n",
1084 |       "\n",
1085 |       "\n"
1086 |      ]
1087 |     },
1088 |     {
1089 |      "data": {
1090 |       "application/vnd.jupyter.widget-view+json": {
1091 |        "model_id": "21ef032702d84587b5c8b3a039b640eb",
1092 |        "version_major": 2,
1093 |        "version_minor": 0
1094 |       },
1095 |       "text/plain": [
1096 |        "0it [00:00, ?it/s]"
1097 |       ]
1098 |      },
1099 |      "metadata": {},
1100 |      "output_type": "display_data"
1101 |     }
1102 |    ],
1103 |    "source": [
1104 |     "oai_unlabeled_results = evaluate(\n",
1105 |     "    qa_oai,\n",
1106 |     "    data=dataset_name,\n",
1107 |     "    evaluators=unlabeled_evaluators,\n",
1108 |     "    experiment_prefix=\"test-agent-objectivity-unlabeled-oai\",\n",
1109 |     "    metadata={\n",
1110 |     "        \"variant\": \"full website in context window with gpt-4o, unlabeled\"\n",
1111 |     "    }\n",
1112 |     ")"
1113 |    ]
1114 |   },
1115 |   {
1116 |    "cell_type": "markdown",
1117 |    "id": "371e1f39-e8a3-485d-80b8-b25ed7b69280",
1118 |    "metadata": {},
1119 |    "source": [
1120 |     "**Unlabeled evaluators with Mistral 7b**"
1121 |    ]
1122 |   },
1123 |   {
1124 |    "cell_type": "code",
1125 |    "execution_count": 25,
1126 |    "id": "283d50f2-5706-453e-ada1-46895a0e7d4a",
1127 |    "metadata": {
1128 |     "scrolled": true
1129 |    },
1130 |    "outputs": [
1131 |     {
1132 |      "name": "stdout",
1133 |      "output_type": "stream",
1134 |      "text": [
1135 |       "View the evaluation results for experiment: 'test-agent-objectivity-unlabeled-mistral-e18e0d1b' at:\n",
1136 |       "https://smith.langchain.com/o/ef6f5694-a2fa-5316-9158-12297cd17350/datasets/8d445ea4-b7d1-4d36-a641-437e4efa4a5b/compare?selectedSessions=afce084b-30c0-42ab-8c26-6127035dcf88\n",
1137 |       "\n",
1138 |       "\n"
1139 |      ]
1140 |     },
1141 |     {
1142 |      "data": {
1143 |       "application/vnd.jupyter.widget-view+json": {
1144 |        "model_id": "677f5c5de8564c22854252bf50400635",
1145 |        "version_major": 2,
1146 |        "version_minor": 0
1147 |       },
1148 |       "text/plain": [
1149 |        "0it [00:00, ?it/s]"
1150 |       ]
1151 |      },
1152 |      "metadata": {},
1153 |      "output_type": "display_data"
1154 |     },
1155 |     {
1156 |      "name": "stdout",
1157 |      "output_type": "stream",
1158 |      "text": [
1159 |       " The primary function of Large Language Models (LLMs) in autonomous agents is to process natural language inputs and generate appropriate outputs, interacting with external components such as memory and tools. They help in understanding instructions, generating responses, and executing tasks by parsing and interpreting textual data. LLM-powered autonomous agents face several challenges in real-world applications, including finite context length which limits historical information and detailed instructions, reliability of natural language interfaces due to formatting errors and rebellious behavior, and difficulties with long-term planning and task decomposition. These limitations impact the robustness and effectiveness of these agents. (References: [1], [2], [3], [4], [5], [6], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21]) In LLM-powered autonomous agents, planning plays a crucial role by allowing the agent to adjust its actions based on long-term goals and unexpected errors. It helps the agent to explore the solution space effectively and make robust decisions. However, current LLMs face challenges in reliably handling natural language interfaces, finite context length, and long-term planning, which limit their overall effectiveness as autonomous agents. Ongoing research aims to address these limitations by integrating techniques such as self-reflection, tool augmentation, and reinforcement learning. Autonomous agents use tool APIs by integrating them with large language models, enabling the agents to perform specific tasks and access external knowledge sources. This synergistic approach allows agents to expand their capabilities beyond their inherent limitations and solve complex problems in various domains. LLM-powered agents typically utilize two types of memory: external memory, which can be a database or knowledge base, and internal memory, which is the agent's own state and context. 
External memory stores facts and information that the agent does not have access to internally, while internal memory helps the agent keep track of its current state and the context of the conversation."
1160 |      ]
1161 |     }
1162 |    ],
1163 |    "source": [
1164 |     "mistral_unlabeled_results = evaluate(\n",
1165 |     "    qa_mistral,\n",
1166 |     "    data=dataset_name,\n",
1167 |     "    evaluators=unlabeled_evaluators,\n",
1168 |     "    experiment_prefix=\"test-agent-objectivity-unlabeled-mistral\",\n",
1169 |     "    # Any experiment metadata can be specified here\n",
1170 |     "    metadata={\n",
1171 |     "        \"variant\": \"full website in context window with Mistral 7B, unlabeled\"\n",
1172 |     "    }\n",
1173 |     ")"
1174 |    ]
1175 |   },
1176 |   {
1177 |    "cell_type": "markdown",
1178 |    "id": "f832fb63-b3ee-4d18-b909-d5b214ba046d",
1179 |    "metadata": {},
1180 |    "source": [
1181 |     "**Labeled Evaluators with gpt-4o**"
1182 |    ]
1183 |   },
1184 |   {
1185 |    "cell_type": "code",
1186 |    "execution_count": 26,
1187 |    "id": "47d03ff3-56a6-474b-b5e9-ed2cac75fa2a",
1188 |    "metadata": {},
1189 |    "outputs": [
1190 |     {
1191 |      "name": "stdout",
1192 |      "output_type": "stream",
1193 |      "text": [
1194 |       "View the evaluation results for experiment: 'test-agent-objectivity-labeled-oai-a19e3d7e' at:\n",
1195 |       "https://smith.langchain.com/o/ef6f5694-a2fa-5316-9158-12297cd17350/datasets/8d445ea4-b7d1-4d36-a641-437e4efa4a5b/compare?selectedSessions=a337ef96-4f61-4a1f-bef8-37e3e009ce74\n",
1196 |       "\n",
1197 |       "\n"
1198 |      ]
1199 |     },
1200 |     {
1201 |      "data": {
1202 |       "application/vnd.jupyter.widget-view+json": {
1203 |        "model_id": "7915defdd9ef44bcaf832258e37999e4",
1204 |        "version_major": 2,
1205 |        "version_minor": 0
1206 |       },
1207 |       "text/plain": [
1208 |        "0it [00:00, ?it/s]"
1209 |       ]
1210 |      },
1211 |      "metadata": {},
1212 |      "output_type": "display_data"
1213 |     }
1214 |    ],
1215 |    "source": [
1216 |     "# These are now in comparison to the \"reference output\"\n",
1217 |     "oai_labeled_results = evaluate(\n",
1218 |     "    qa_oai,\n",
1219 |     "    data=dataset_name,\n",
1220 |     "    evaluators=labeled_evaluators,\n",
1221 |     "    experiment_prefix=\"test-agent-objectivity-labeled-oai\",\n",
1222 |     "    metadata={\n",
1223 |     "        \"variant\": \"full website in context window with gpt-4o, labeled\"\n",
1224 |     "    }\n",
1225 |     ")"
1226 |    ]
1227 |   },
1228 |   {
1229 |    "cell_type": "markdown",
1230 |    "id": "21588304-31a5-4f44-b1d6-666d56c8070a",
1231 |    "metadata": {},
1232 |    "source": [
1233 |     "**Labeled evaluators with Mistral 7b**"
1234 |    ]
1235 |   },
1236 |   {
1237 |    "cell_type": "code",
1238 |    "execution_count": 27,
1239 |    "id": "5ce4d95d-54f9-4f44-940c-4bfd4106c6c5",
1240 |    "metadata": {},
1241 |    "outputs": [
1242 |     {
1243 |      "name": "stdout",
1244 |      "output_type": "stream",
1245 |      "text": [
1246 |       "View the evaluation results for experiment: 'test-agent-objectivity-labeled-mistral-017ce55d' at:\n",
1247 |       "https://smith.langchain.com/o/ef6f5694-a2fa-5316-9158-12297cd17350/datasets/8d445ea4-b7d1-4d36-a641-437e4efa4a5b/compare?selectedSessions=652ea396-158f-4ade-aff2-81fa78b8d11d\n",
1248 |       "\n",
1249 |       "\n"
1250 |      ]
1251 |     },
1252 |     {
1253 |      "data": {
1254 |       "application/vnd.jupyter.widget-view+json": {
1255 |        "model_id": "db8953d187e547b4a5bf4ab5fcf27b37",
1256 |        "version_major": 2,
1257 |        "version_minor": 0
1258 |       },
1259 |       "text/plain": [
1260 |        "0it [00:00, ?it/s]"
1261 |       ]
1262 |      },
1263 |      "metadata": {},
1264 |      "output_type": "display_data"
1265 |     },
1266 |     {
1267 |      "name": "stdout",
1268 |      "output_type": "stream",
1269 |      "text": [
1270 |       " Some challenges faced by LLM-powered autonomous agents in real-world applications include finite context length, which limits historical information and detailed instructions, making it difficult for long-term planning and effective task decomposition. Another challenge is the reliability of natural language interfaces, as LLMs may exhibit formatting errors or rebellious behavior. These issues make it important to continually research and develop methods to improve the performance and robustness of these agents. LLM-powered agents utilize different types of memory, including dynamic memory for self-reflection and static memory stored in vector stores or databases. They also rely on natural language interfaces to communicate with external components, which can be unreliable due to formatting errors or rebellious behavior. The primary function of a Large Language Model (LLM) in autonomous agents is to process natural language instructions and generate responses or actions based on that input. It serves as the conversational and cognitive component, interpreting data from external components and tools, and executing tasks assigned by the agent's architecture. LLMs help agents understand context, perform reasoning, and interact with their environment, making them essential building blocks for developing sophisticated autonomous systems. In LLM-powered autonomous agents, planning plays a crucial role in guiding the agent's actions based on its current context and long-term goals. It helps the agent adjust its plans when faced with unexpected errors and enables effective exploration of the solution space. However, challenges such as finite context length and reliability of natural language interfaces can limit the agent's planning capabilities. Techniques like self-reflection, in-context reinforcement learning, algorithm distillation, and tool augmentation are being explored to improve planning in LLM-powered autonomous agents. 
Autonomous agents can use Tool APIs to interact with external tools and systems, expanding their capabilities beyond language processing. This enables agents to perform complex tasks that require specialized knowledge or access to external data, making them more versatile and effective in various domains."
1271 |      ]
1272 |     }
1273 |    ],
1274 |    "source": [
1275 |     "mistral_labeled_results = evaluate(\n",
1276 |     "    qa_mistral,\n",
1277 |     "    data=dataset_name,\n",
1278 |     "    evaluators=labeled_evaluators,\n",
1279 |     "    experiment_prefix=\"test-agent-objectivity-labeled-mistral\",\n",
1280 |     "    # Any experiment metadata can be specified here\n",
1281 |     "    metadata={\n",
1282 |     "        \"variant\": \"full website in context window with Mistral 7B, labeled\"\n",
1283 |     "    }\n",
1284 |     ")"
1285 |    ]
1286 |   },
1287 |   {
1288 |    "cell_type": "markdown",
1289 |    "id": "45619028-4521-417b-b7a0-b47a60618a8d",
1290 |    "metadata": {},
1291 |    "source": [
1292 |     "---\n",
1293 |     "# Evaluating existing Evaluations\n",
1294 |     "\n",
1295 |     "What if your evaluation of interest is not at the individual run level, but on the overall experiment level?\n",
1296 |     "\n",
1297 |     "https://docs.smith.langchain.com/how_to_guides/evaluation/evaluate_existing_experiment"
1298 |    ]
1299 |   },
1300 |   {
1301 |    "cell_type": "code",
1302 |    "execution_count": 28,
1303 |    "id": "62231e17-bda6-4571-9943-a263606ac3b3",
1304 |    "metadata": {},
1305 |    "outputs": [
1306 |     {
1307 |      "name": "stderr",
1308 |      "output_type": "stream",
1309 |      "text": [
1310 |       "This chain was only tested with GPT-4. Performance may be significantly worse with other models.\n",
1311 |       "This chain was only tested with GPT-4. Performance may be significantly worse with other models.\n",
1312 |       "This chain was only tested with GPT-4. Performance may be significantly worse with other models.\n"
1313 |      ]
1314 |     },
1315 |     {
1316 |      "name": "stdout",
1317 |      "output_type": "stream",
1318 |      "text": [
1319 |       "View the evaluation results for experiment: 'test-agent-qa-mistral-multicriteria-63912a34' at:\n",
1320 |       "https://smith.langchain.com/o/ef6f5694-a2fa-5316-9158-12297cd17350/datasets/8d445ea4-b7d1-4d36-a641-437e4efa4a5b/compare?selectedSessions=b7f96e55-d16a-4dac-b9ad-44b0cacff931\n",
1321 |       "\n",
1322 |       "\n"
1323 |      ]
1324 |     },
1325 |     {
1326 |      "data": {
1327 |       "application/vnd.jupyter.widget-view+json": {
1328 |        "model_id": "726e7075c1c2454eb88371d3daaae0ca",
1329 |        "version_major": 2,
1330 |        "version_minor": 0
1331 |       },
1332 |       "text/plain": [
1333 |        "0it [00:00, ?it/s]"
1334 |       ]
1335 |      },
1336 |      "metadata": {},
1337 |      "output_type": "display_data"
1338 |     },
1339 |     {
1340 |      "name": "stdout",
1341 |      "output_type": "stream",
1342 |      "text": [
1343 |       " LLM-powered agents typically utilize two types of memory: dynamic memory for storing and retrieving information during conversation or task execution, and external knowledge sources such as databases or APIs to access a larger pool of information. Some agents also use self-reflection and learning mechanisms to store past experiences for future reference. In LLM-powered autonomous agents, 'Planning' is a crucial component that enables the agent to generate a sequence of actions based on its current context and goals. It allows the agent to adjust its plans when faced with unexpected errors, improving its robustness compared to humans who learn from trial and error. Effective planning in LLMs is challenging due to their finite context length and reliability issues with natural language interfaces. Various approaches like reinforcement learning, algorithm distillation, and modular architecture have been explored to enhance the planning capabilities of LLMs. The primary function of Large Language Models (LLMs) in autonomous agents is to process natural language inputs, generate appropriate responses, and perform tasks by interacting with external components such as memory and tools. They are designed to learn from experience and adapt to new situations, making them valuable for solving complex problems and executing multi-step instructions. LLM-powered autonomous agents face several challenges in real-world applications, including finite context length which limits historical information and detailed instructions, reliability of natural language interfaces due to formatting errors and occasional rebellious behavior, and difficulties in long-term planning and task decomposition. (References: [1], [2], [3], [4], [5], [6], [8], [9], [10], [11], [12], [13], [15], [17], [18], [19]) Autonomous agents can use tool APIs by integrating them with large language models, enabling the agents to access and utilize external tools for specific tasks. 
This integration enhances the agent's capabilities, making it more effective in solving complex problems and performing various tasks. Examples of tool-augmented agents include ChemCrow for chemistry tasks, HuggingGPT for AI tasks, and GPT-Engineer for software engineering tasks."
1344 |      ]
1345 |     }
1346 |    ],
1347 |    "source": [
1348 |     "helpfulness_scorestring = LangChainStringEvaluator(\"score_string\", config={ \"criteria\": \"helpfulness\" })\n",
1349 |     "conciseness_scorestring = LangChainStringEvaluator(\"score_string\", config={ \"criteria\": \"conciseness\" })\n",
1350 |     "coherence_scorestring = LangChainStringEvaluator(\"score_string\", config={ \"criteria\": \"coherence\" })\n",
1351 |     "\n",
1352 |     "evaluators = [helpfulness_scorestring, conciseness_scorestring, coherence_scorestring]\n",
1353 |     "\n",
1354 |     "mistral_multicriteria_eval = evaluate(\n",
1355 |     "    qa_mistral,\n",
1356 |     "    data=dataset_name,\n",
1357 |     "    evaluators=evaluators,\n",
1358 |     "    experiment_prefix=\"test-agent-qa-mistral-multicriteria\",\n",
1359 |     "    metadata={\n",
1360 |     "        \"variant\": \"full website in context window with Mistral 7b, helpfulness, conciseness, and coherence check\"\n",
1361 |     "    }\n",
1362 |     ")"
1363 |    ]
1364 |   },
1365 |   {
1366 |    "cell_type": "markdown",
1367 |    "id": "37480cca-b069-4734-ac0f-ea3ab72da3c8",
1368 |    "metadata": {},
1369 |    "source": [
1370 |     "### Set up a Summary Evaluator to look over the entire dataset and determine whether an output was generated\n",
1371 |     "\n",
1372 |     "Our criterion for a pass is that the model successfully outputs an answer for more than 80% of the runs"
1373 |    ]
1374 |   },
1375 |   {
1376 |    "cell_type": "code",
1377 |    "execution_count": 29,
1378 |    "id": "493a1911-3cad-4002-be94-ef557fc81aec",
1379 |    "metadata": {},
1380 |    "outputs": [
1381 |     {
1382 |      "name": "stdout",
1383 |      "output_type": "stream",
1384 |      "text": [
1385 |       "View the evaluation results for experiment: 'test-agent-qa-mistral-multicriteria-63912a34' at:\n",
1386 |       "https://smith.langchain.com/o/ef6f5694-a2fa-5316-9158-12297cd17350/datasets/8d445ea4-b7d1-4d36-a641-437e4efa4a5b/compare?selectedSessions=b7f96e55-d16a-4dac-b9ad-44b0cacff931\n",
1387 |       "\n",
1388 |       "\n"
1389 |      ]
1390 |     },
1391 |     {
1392 |      "data": {
1393 |       "application/vnd.jupyter.widget-view+json": {
1394 |        "model_id": "faf7f9c720b8410a849cc15d5b6e116b",
1395 |        "version_major": 2,
1396 |        "version_minor": 0
1397 |       },
1398 |       "text/plain": [
1399 |        "0it [00:00, ?it/s]"
1400 |       ]
1401 |      },
1402 |      "metadata": {},
1403 |      "output_type": "display_data"
1404 |     },
1405 |     {
1406 |      "data": {
1407 |       "text/plain": [
1408 |        "<ExperimentResults test-agent-qa-mistral-multicriteria-63912a34>"
1409 |       ]
1410 |      },
1411 |      "execution_count": 29,
1412 |      "metadata": {},
1413 |      "output_type": "execute_result"
1414 |     }
1415 |    ],
1416 |    "source": [
1417 |     "from langsmith.evaluation import evaluate_existing\n",
1418 |     "\n",
1419 |     "experiment_name = mistral_multicriteria_eval.experiment_name\n",
1420 |     "\n",
1421 |     "def passed_eval(runs: list, examples: list):\n",
1422 |     "    output = 0\n",
1423 |     "    for i, run in enumerate(runs):\n",
1424 |     "        if run.outputs[\"answer\"]:\n",
1425 |     "            output +=1\n",
1426 |     "    if output / len(runs) > 0.8:\n",
1427 |     "        return {\"key\": \"pass\", \"score\": True}\n",
1428 |     "    else:\n",
1429 |     "        return {\"key\": \"fail\", \"score\": False}\n",
1430 |     "\n",
1431 |     "evaluate_existing(experiment_name, summary_evaluators=[passed_eval])"
1432 |    ]
1433 |   },
1434 |   {
1435 |    "cell_type": "markdown",
1436 |    "id": "83ba1750-f9ea-4dd5-9160-1def16ca5d4a",
1437 |    "metadata": {},
1438 |    "source": [
1439 |     "---\n",
1440 |     "# Pairwise Evaluations\n",
1441 |     "\n",
1442 |     "https://docs.smith.langchain.com/how_to_guides/evaluation/evaluate_pairwise\n",
1443 |     "\n",
1444 |     "Allows you to evaluate existing experiments against each other. Example: an LLM-as-judge stating its preference between two LLM outputs from existing evaluations. This could be useful for comparing the outputs of two small models using a larger model. \n",
1445 |     "\n",
1446 |     "Using this prompt: https://smith.langchain.com/hub/langchain-ai/pairwise-evaluation-2?organizationId=ef6f5694-a2fa-5316-9158-12297cd17350"
1447 |    ]
1448 |   },
1449 |   {
1450 |    "cell_type": "code",
1451 |    "execution_count": 30,
1452 |    "id": "cfe453b6-7ff4-4054-991a-6720fc3e554b",
1453 |    "metadata": {},
1454 |    "outputs": [],
1455 |    "source": [
1456 |     "from langsmith.evaluation import evaluate_comparative\n",
1457 |     "from langchain import hub\n",
1458 |     "from langchain_openai import ChatOpenAI\n",
1459 |     "from langsmith.schemas import Run, Example\n",
1460 |     "prompt = hub.pull(\"langchain-ai/pairwise-evaluation-2\")\n",
1461 |     "\n",
1462 |     "# Example from documentation, using GPT-4o to evaluate preference between two models' outputs\n",
1463 |     "def evaluate_pairwise(runs: list[Run], example: Example):\n",
1464 |     "    scores = {}\n",
1465 |     "\n",
1466 |     "    # Create the model to run your evaluator\n",
1467 |     "    model = ChatOpenAI(model_name=\"gpt-4o\")\n",
1468 |     "\n",
1469 |     "    runnable = prompt | model\n",
1470 |     "    response = runnable.invoke({\n",
1471 |     "        \"question\": example.inputs[\"question\"],\n",
1472 |     "        \"answer_a\": runs[0].outputs[\"answer\"] if runs[0].outputs is not None else \"N/A\",\n",
1473 |     "        \"answer_b\": runs[1].outputs[\"answer\"] if runs[1].outputs is not None else \"N/A\",\n",
1474 |     "    })\n",
1475 |     "    score = response[\"Preference\"]\n",
1476 |     "    if score == 1:\n",
1477 |     "        scores[runs[0].id] = 1\n",
1478 |     "        scores[runs[1].id] = 0\n",
1479 |     "    elif score == 2:\n",
1480 |     "        scores[runs[0].id] = 0\n",
1481 |     "        scores[runs[1].id] = 1\n",
1482 |     "    else:\n",
1483 |     "        scores[runs[0].id] = 0\n",
1484 |     "        scores[runs[1].id] = 0\n",
1485 |     "    return {\"key\": \"ranked_preference\", \"scores\": scores}"
1486 |    ]
1487 |   },
1488 |   {
1489 |    "cell_type": "markdown",
1490 |    "id": "88e3b01c-bc3e-4668-a09a-2f07bb1a6392",
1491 |    "metadata": {},
1492 |    "source": [
1493 |     "### Running Comparative Evaluation\n",
1494 |     "\n",
1495 |     "Going to use our prior experiments on helpfulness using Mistral 7b and GPT-4o"
1496 |    ]
1497 |   },
1498 |   {
1499 |    "cell_type": "code",
1500 |    "execution_count": 31,
1501 |    "id": "af7abd47-fc42-4744-b111-35a5a31c7011",
1502 |    "metadata": {},
1503 |    "outputs": [
1504 |     {
1505 |      "name": "stdout",
1506 |      "output_type": "stream",
1507 |      "text": [
1508 |       "View the pairwise evaluation results at:\n",
1509 |       "https://smith.langchain.com/o/ef6f5694-a2fa-5316-9158-12297cd17350/datasets/8d445ea4-b7d1-4d36-a641-437e4efa4a5b/compare?selectedSessions=1a6be94f-4f7b-4ef9-863c-ed2be1d3b441%2C7245e36d-b143-4cfa-a15c-9f5714885636&comparativeExperiment=dca56cef-79d4-4252-9171-c60b3217ff75\n",
1510 |       "\n",
1511 |       "\n"
1512 |      ]
1513 |     },
1514 |     {
1515 |      "data": {
1516 |       "application/vnd.jupyter.widget-view+json": {
1517 |        "model_id": "b50344bb6f834cf09475ec5a7df7c1c9",
1518 |        "version_major": 2,
1519 |        "version_minor": 0
1520 |       },
1521 |       "text/plain": [
1522 |        "  0%|          | 0/5 [00:00<?, ?it/s]"
1523 |       ]
1524 |      },
1525 |      "metadata": {},
1526 |      "output_type": "display_data"
1527 |     },
1528 |     {
1529 |      "data": {
1530 |       "text/plain": [
1531 |        "<langsmith.evaluation._runner.ComparativeExperimentResults at 0x1772c7fe0>"
1532 |       ]
1533 |      },
1534 |      "execution_count": 31,
1535 |      "metadata": {},
1536 |      "output_type": "execute_result"
1537 |     }
1538 |    ],
1539 |    "source": [
1540 |     "evaluate_comparative(\n",
1541 |     "    # Replace the following array with the names or IDs of your experiments\n",
1542 |     "    [oai_helpfulness_eval.experiment_name, mistral_helpfulness_eval.experiment_name],\n",
1543 |     "    evaluators=[evaluate_pairwise],\n",
1544 |     ")"
1545 |    ]
1546 |   },
1547 |   {
1548 |    "cell_type": "markdown",
1549 |    "id": "ef046fc0-8c0e-46d8-ac87-1a92fbda8a57",
1550 |    "metadata": {},
1551 |    "source": [
1552 |     "---\n",
1553 |     "# Unit Tests w/pytest - VSCode Example\n",
1554 |     "\n",
1555 |     "https://docs.smith.langchain.com/how_to_guides/evaluation/unit_testing\n",
1556 |     "\n",
1557 |     "You can attach the built-in `@unit` decorator to `pytest` tests. These will then be logged with LangSmith and can be viewed/compared similarly to the evaluations we've been over.\n",
1558 |     "\n",
1559 |     "Feedback built in with the `@unit` decorator:\n",
1560 |     "\n",
1561 |     "| Feedback           | Description                                                   | Example                                                                                                                      |\n",
1562 |     "|--------------------|---------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------|\n",
1563 |     "| pass               | Binary pass/fail score, 1 for pass, 0 for fail                | `assert False # Fails`                                                                                                       |\n",
1564 |     "| expectation        | Binary expectation score, 1 if expectation is met, 0 if not   | `expect(prediction).against(lambda x: re.search(r\"\\b[a-f\\d]{8}-[a-f\\d]{4}-[a-f\\d]{4}-[a-f\\d]{4}-[a-f\\d]{12}\\b\", x))`          |\n",
1565 |     "| embedding_distance | Cosine distance between two embeddings                        | `expect.embedding_distance(prediction=prediction, reference=reference)`                                                      |\n",
1566 |     "| edit_distance      | Edit distance between two strings                             | `expect.edit_distance(prediction=prediction, reference=reference)`                                                           |\n",
1567 |     "\n",
1568 |     "\n",
1569 |     "\n",
1570 |     "### `expect` methods\n",
1571 |     "\n",
1572 |     "\n",
1573 |     "| Method              | Description                                                                                   | Parameters                                           |\n",
1574 |     "|---------------------|-----------------------------------------------------------------------------------------------|------------------------------------------------------|\n",
1575 |     "| `to_be_less_than`   | Assert that the expectation value is less than the given value.                               | `value`                                              |\n",
1576 |     "| `to_be_greater_than`| Assert that the expectation value is greater than the given value.                            | `value`                                              |\n",
1577 |     "| `to_be_between`     | Assert that the expectation value is between the given min and max values.                    | `min_value, max_value`                               |\n",
1578 |     "| `to_be_approximately`| Assert that the expectation value is approximately equal to the given value.                  | `value, precision=2`                                 |\n",
1579 |     "| `to_equal`          | Assert that the expectation value equals the given value.                                     | `value`                                              |\n",
1580 |     "| `to_contain`        | Assert that the expectation value contains the given value.                                   | `value`                                              |\n",
1581 |     "| `against`           | Assert the expectation value against a custom function.                                       | `func`                                               |\n",
1582 |     "\n"
1583 |    ]
1584 |   },
1585 |   {
1586 |    "cell_type": "markdown",
1587 |    "id": "48801d92-050a-44bb-9da9-468132ca3326",
1588 |    "metadata": {},
1589 |    "source": [
1590 |     "---\n",
1591 |     "# Hopping over to Llama3 Research Agent Notebook to Assess Attaching Evaluations within Existing Runs"
1592 |    ]
1593 |   },
1594 |   {
1595 |    "cell_type": "code",
1596 |    "execution_count": null,
1597 |    "id": "71068d61-ee98-4e56-9041-40574d057d35",
1598 |    "metadata": {},
1599 |    "outputs": [],
1600 |    "source": []
1601 |   },
1602 |   {
1603 |    "cell_type": "code",
1604 |    "execution_count": null,
1605 |    "id": "a6b053f2-0c96-4929-9ca9-98d78c496276",
1606 |    "metadata": {},
1607 |    "outputs": [],
1608 |    "source": []
1609 |   }
1610 |  ],
1611 |  "metadata": {
1612 |   "kernelspec": {
1613 |    "display_name": "Python 3 (ipykernel)",
1614 |    "language": "python",
1615 |    "name": "python3"
1616 |   },
1617 |   "language_info": {
1618 |    "codemirror_mode": {
1619 |     "name": "ipython",
1620 |     "version": 3
1621 |    },
1622 |    "file_extension": ".py",
1623 |    "mimetype": "text/x-python",
1624 |    "name": "python",
1625 |    "nbconvert_exporter": "python",
1626 |    "pygments_lexer": "ipython3",
1627 |    "version": "3.12.1"
1628 |   }
1629 |  },
1630 |  "nbformat": 4,
1631 |  "nbformat_minor": 5
1632 | }
1633 | 


--------------------------------------------------------------------------------