├── drake_meme.png ├── gen_data └── cloned_repos.dat ├── experiments ├── all_models_loss_plot.png ├── all_models_no_opt_loss_plot.png ├── non_builtins_gpt3_loss_plot.png └── all_models_no_opt_no_flan_t5_loss_plot.png ├── testfunc.py ├── eval_chat_llms ├── chat_llms_classsification_plot.png ├── eval_chat_llms_results.json └── eval_chat_llms.ipynb ├── .gitmodules ├── filter_functions_with_docstrings.py ├── filter_functions_with_docstrings_and_shuffle.py ├── generate_examples.py ├── generate_examples_no_builtins.py └── README.md /drake_meme.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Avmb/inverse_scaling_prize_code_identifier_swap/HEAD/drake_meme.png -------------------------------------------------------------------------------- /gen_data/cloned_repos.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Avmb/inverse_scaling_prize_code_identifier_swap/HEAD/gen_data/cloned_repos.dat -------------------------------------------------------------------------------- /experiments/all_models_loss_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Avmb/inverse_scaling_prize_code_identifier_swap/HEAD/experiments/all_models_loss_plot.png -------------------------------------------------------------------------------- /experiments/all_models_no_opt_loss_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Avmb/inverse_scaling_prize_code_identifier_swap/HEAD/experiments/all_models_no_opt_loss_plot.png -------------------------------------------------------------------------------- /experiments/non_builtins_gpt3_loss_plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Avmb/inverse_scaling_prize_code_identifier_swap/HEAD/experiments/non_builtins_gpt3_loss_plot.png -------------------------------------------------------------------------------- /testfunc.py: -------------------------------------------------------------------------------- 1 | def foo(a, b=1, c="hello"): 2 | """ 3 | Foo 4 | """ 5 | 6 | x = a + b 7 | print(x) 8 | print(2 * len(c) + abs(-1)) 9 | 10 | -------------------------------------------------------------------------------- /eval_chat_llms/chat_llms_classsification_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Avmb/inverse_scaling_prize_code_identifier_swap/HEAD/eval_chat_llms/chat_llms_classsification_plot.png -------------------------------------------------------------------------------- /experiments/all_models_no_opt_no_flan_t5_loss_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Avmb/inverse_scaling_prize_code_identifier_swap/HEAD/experiments/all_models_no_opt_no_flan_t5_loss_plot.png -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "inverse-scaling-eval-pipeline"] 2 | path = inverse-scaling-eval-pipeline 3 | url = git@github.com:Avmb/inverse-scaling-eval-pipeline.git 4 | [submodule "gen_data/pycodesuggest"] 5 | path = gen_data/pycodesuggest 6 | url = git@github.com:uclnlp/pycodesuggest.git 7 | -------------------------------------------------------------------------------- /eval_chat_llms/eval_chat_llms_results.json: -------------------------------------------------------------------------------- 1 | {"claude-instant-v1.1": {"raw_total_accuracy": 206, "total_accuracy": 0.103, "total_accuracy_on_valid": 0.103, "num_valid": 2000, "num_examples": 2000}, "claude-v1.3": 
{"raw_total_accuracy": 350, "total_accuracy": 0.175, "total_accuracy_on_valid": 0.175, "num_valid": 2000, "num_examples": 2000}, "gpt-3.5-turbo-0301": {"raw_total_accuracy": 67, "total_accuracy": 0.0335, "total_accuracy_on_valid": 0.0393885949441505, "num_valid": 1701, "num_examples": 2000}, "gpt-4-0314": {"raw_total_accuracy": 37, "total_accuracy": 0.0185, "total_accuracy_on_valid": 0.018518518518518517, "num_valid": 1998, "num_examples": 2000}} -------------------------------------------------------------------------------- /filter_functions_with_docstrings.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | import sys 4 | import ast 5 | import astunparse 6 | 7 | def usage(): 8 | print("Usage:", file=sys.stderr) 9 | print("%s < file_list > functions_with_docstrings" % sys.argv[0], file=sys.stderr) 10 | sys.exit(-1) 11 | 12 | def process_file(filename): 13 | try: 14 | with open(filename) as in_fs: 15 | file_str = in_fs.read() 16 | file_ast = ast.parse(file_str) 17 | for node in file_ast.body: 18 | if isinstance(node, ast.FunctionDef): 19 | docstr = ast.get_docstring(node) 20 | if docstr == None: 21 | continue 22 | func_str = astunparse.unparse(node).strip() 23 | print(func_str) 24 | print("###") 25 | 26 | except Exception: 27 | print("Can't process file: %s , skipping" % filename) 28 | 29 | def main(): 30 | if (len(sys.argv) != 1): 31 | usage() 32 | 33 | for filename in sys.stdin: 34 | process_file(filename.strip()) 35 | 36 | if __name__ == "__main__": 37 | main() 38 | 39 | -------------------------------------------------------------------------------- /filter_functions_with_docstrings_and_shuffle.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | 3 | import sys 4 | import random 5 | import ast 6 | import astunparse 7 | 8 | def usage(): 9 | print("Usage:", file=sys.stderr) 10 | print("%s < file_list > functions_with_docstrings" % sys.argv[0], file=sys.stderr) 11 | sys.exit(-1) 12 | 13 | def process_file(filename, acc): 14 | try: 15 | with open(filename) as in_fs: 16 | file_str = in_fs.read() 17 | file_ast = ast.parse(file_str) 18 | for node in file_ast.body: 19 | if isinstance(node, ast.FunctionDef): 20 | docstr = ast.get_docstring(node) 21 | if docstr == None: 22 | continue 23 | func_str = astunparse.unparse(node).strip() 24 | acc.append(func_str) 25 | #print(func_str) 26 | #print("###") 27 | 28 | except Exception: 29 | print("Can't process file: %s , skipping" % filename, file=sys.stderr) 30 | 31 | def main(): 32 | if (len(sys.argv) != 1): 33 | usage() 34 | 35 | acc = [] 36 | for filename in sys.stdin: 37 | process_file(filename.strip(), acc) 38 | 39 | random.shuffle(acc) 40 | print("\n###\n".join(acc)) 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | 46 | -------------------------------------------------------------------------------- /generate_examples.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | 3 | import sys 4 | import ast 5 | import astunparse 6 | import builtins 7 | import random 8 | random.seed(42) 9 | import copy 10 | 11 | def usage(): 12 | print("Usage:", file=sys.stderr) 13 | print("%s max_num_examples < functions_with_docstrings > examples" % sys.argv[0], file=sys.stderr) 14 | sys.exit(-1) 15 | 16 | def output_csv(prompt, bad_class, good_class): 17 | prompt_str = '"' + prompt.replace('"', '""') + '"' 18 | #classes_str = '"' + repr([bad_class, good_class]).replace('"', '""').replace("\\n", "\n") + '"' 19 | bad_class = ('""" '+ bad_class+'\n"""').replace('"', '""') 20 | good_class = ('""" '+good_class+'\n"""').replace('"', '""') 21 | classes_str = '"['+bad_class+", "+good_class+']"' 22 | idx_str = "1" 23 | 24 | csv_line = ','.join([prompt_str, classes_str, idx_str]) 25 | print(csv_line) 26 | 27 | def generate_example(found_func_names, func_node): 28 | #print("%s: %s" % (func_node.name, list(found_func_names))) 29 | 30 | # Randomly choose two builtin functions 31 | found_func_names = sorted(list(found_func_names)) 32 | random.shuffle(found_func_names) 33 | f0, f1 = found_func_names[:2] 34 | 35 | # swap statement 36 | swap_statement_str = "%s, %s = %s, %s" % (f0, f1, f1, f0) 37 | 38 | # bad (unmodified) function 39 | bad_func = astunparse.unparse(func_node).strip() 40 | bad_func_lines = bad_func.split("\n") 41 | 42 | # good function with swapped builtins 43 | good_func_node = copy.deepcopy(func_node) 44 | def traverse(node): 45 | if isinstance(node, ast.Name): 46 | if node.id == f0: 47 | node.id = f1 48 | elif node.id == f1: 49 | node.id = f0 50 | for child in ast.iter_child_nodes(node): 51 | traverse(child) 52 | traverse(good_func_node) 53 | good_func = astunparse.unparse(good_func_node).strip() 54 | good_func_lines = good_func.split("\n") 55 | 56 | #print(f0, f1) 57 | #print(swap_statement_str) 58 | #print(bad_func) 59 | #print(good_func) 60 | 61 | # find docstring line 62 | docstring_repr = repr(ast.get_docstring(func_node, 
clean=False)) 63 | for i, line in enumerate(bad_func_lines): 64 | if line.strip() == docstring_repr: 65 | docstring_line_num = i 66 | break 67 | 68 | # prepare prompt 69 | pretty_docstring = ' """' + ast.get_docstring(func_node, clean=False) + '"""' 70 | prompt_lines = [swap_statement_str] + bad_func_lines[:docstring_line_num] + [pretty_docstring] 71 | prompt = "\n".join(prompt_lines) 72 | #print(prompt) 73 | 74 | # bad class 75 | bad_class = "\n" + "\n".join(bad_func_lines[docstring_line_num+1:]) 76 | #print(bad_class) 77 | 78 | # good class 79 | good_class = "\n" + "\n".join(good_func_lines[docstring_line_num+1:]) 80 | #print(good_class) 81 | 82 | # output csv 83 | output_csv(prompt, bad_class, good_class) 84 | 85 | target_func_names = set([x for x in dir(builtins) if callable(eval(x))]) 86 | 87 | num_generated_funcs = 0 88 | 89 | def process_function(func_node): 90 | # find calls to two builtin functions 91 | global num_generated_funcs 92 | 93 | found_func_names = set() 94 | def traverse(node): 95 | if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id in target_func_names: 96 | found_func_names.add(node.func.id) 97 | for child in ast.iter_child_nodes(node): 98 | traverse(child) 99 | traverse(func_node) 100 | 101 | # num of builtins check 102 | if len(found_func_names) < 2: 103 | return 104 | func_str = astunparse.unparse(func_node).strip() 105 | # length check 106 | if len(func_str.split()) > 200: 107 | return 108 | # special char check 109 | if ('"""' in func_str) or ('\\\n' in func_str): 110 | return 111 | 112 | # suitable function 113 | generate_example(found_func_names, func_node) 114 | num_generated_funcs += 1 115 | 116 | def process(max_num_examples): 117 | file_str = sys.stdin.read() 118 | file_str = file_str.replace('\x00','').strip() 119 | functions = [f.strip() for f in file_str.split("###\n")] 120 | for f in functions: 121 | if num_generated_funcs >= max_num_examples: 122 | break 123 | try: 124 | func_node = ast.parse(f) 
125 | except SyntaxError: 126 | continue 127 | for node in func_node.body: 128 | if not isinstance(node, ast.FunctionDef): 129 | continue 130 | process_function(node) 131 | print("Number of examples: %s" % num_generated_funcs, file=sys.stderr) 132 | 133 | def main(): 134 | if (len(sys.argv) != 2): 135 | usage() 136 | 137 | # print csv preamble 138 | print("prompt,classes,answer_index") 139 | 140 | process(int(sys.argv[1])) 141 | 142 | if __name__ == "__main__": 143 | main() 144 | 145 | -------------------------------------------------------------------------------- /generate_examples_no_builtins.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | import sys 4 | import ast 5 | import astunparse 6 | import builtins 7 | import random 8 | random.seed(42) 9 | import copy 10 | 11 | def usage(): 12 | print("Usage:", file=sys.stderr) 13 | print("%s max_num_examples < functions_with_docstrings > examples" % sys.argv[0], file=sys.stderr) 14 | sys.exit(-1) 15 | 16 | def output_csv(prompt, bad_class, good_class): 17 | prompt_str = '"' + prompt.replace('"', '""') + '"' 18 | #classes_str = '"' + repr([bad_class, good_class]).replace('"', '""').replace("\\n", "\n") + '"' 19 | bad_class = ('""" '+ bad_class+'\n"""').replace('"', '""') 20 | good_class = ('""" '+good_class+'\n"""').replace('"', '""') 21 | classes_str = '"['+bad_class+", "+good_class+']"' 22 | idx_str = "1" 23 | 24 | csv_line = ','.join([prompt_str, classes_str, idx_str]) 25 | print(csv_line) 26 | 27 | def generate_example(found_func_names, func_node): 28 | #print("%s: %s" % (func_node.name, list(found_func_names))) 29 | 30 | # Randomly choose two non-builtin functions 31 | found_func_names = sorted(list(found_func_names)) 32 | random.shuffle(found_func_names) 33 | f0, f1 = found_func_names[:2] 34 | 35 | # swap statement 36 | swap_statement_str = "%s, %s = %s, %s" % (f0, f1, f1, f0) 37 | 38 | # bad (unmodified) function 39 | bad_func = 
astunparse.unparse(func_node).strip() 40 | bad_func_lines = bad_func.split("\n") 41 | 42 | # good function with swapped non-builtins 43 | good_func_node = copy.deepcopy(func_node) 44 | def traverse(node): 45 | if isinstance(node, ast.Name): 46 | if node.id == f0: 47 | node.id = f1 48 | elif node.id == f1: 49 | node.id = f0 50 | for child in ast.iter_child_nodes(node): 51 | traverse(child) 52 | traverse(good_func_node) 53 | good_func = astunparse.unparse(good_func_node).strip() 54 | good_func_lines = good_func.split("\n") 55 | 56 | #print(f0, f1) 57 | #print(swap_statement_str) 58 | #print(bad_func) 59 | #print(good_func) 60 | 61 | # find docstring line 62 | docstring_repr = repr(ast.get_docstring(func_node, clean=False)) 63 | for i, line in enumerate(bad_func_lines): 64 | if line.strip() == docstring_repr: 65 | docstring_line_num = i 66 | break 67 | 68 | # prepare prompt 69 | pretty_docstring = ' """' + ast.get_docstring(func_node, clean=False) + '"""' 70 | prompt_lines = [swap_statement_str] + bad_func_lines[:docstring_line_num] + [pretty_docstring] 71 | prompt = "\n".join(prompt_lines) 72 | #print(prompt) 73 | 74 | # bad class 75 | bad_class = "\n" + "\n".join(bad_func_lines[docstring_line_num+1:]) 76 | #print(bad_class) 77 | 78 | # good class 79 | good_class = "\n" + "\n".join(good_func_lines[docstring_line_num+1:]) 80 | #print(good_class) 81 | 82 | # output csv 83 | output_csv(prompt, bad_class, good_class) 84 | 85 | target_func_names = set([x for x in dir(builtins) if callable(eval(x))]) 86 | 87 | num_generated_funcs = 0 88 | 89 | def process_function(func_node): 90 | # find calls to two non-builtin functions 91 | global num_generated_funcs 92 | 93 | found_func_names = set() 94 | possible_internal_func_names = set() 95 | def traverse(node): 96 | if isinstance(node, ast.Call) and isinstance(node.func, ast.Name) and node.func.id not in target_func_names: 97 | found_func_names.add(node.func.id) 98 | if isinstance(node, ast.FunctionDef): 99 | 
possible_internal_func_names.add(node.name) 100 | if hasattr(node, "target"): 101 | if isinstance(node.target, ast.Name): 102 | possible_internal_func_names.add(node.target.id) 103 | if hasattr(node, "targets"): 104 | for target in node.targets: 105 | if isinstance(target, ast.Name): 106 | possible_internal_func_names.add(target.id) 107 | for child in ast.iter_child_nodes(node): 108 | traverse(child) 109 | traverse(func_node) 110 | 111 | # remove names of functions that are possibly defined inside the current top-level function 112 | found_func_names = found_func_names - possible_internal_func_names 113 | # num of non-builtins check 114 | if len(found_func_names) < 2: 115 | return 116 | func_str = astunparse.unparse(func_node).strip() 117 | # length check 118 | if len(func_str.split()) > 200: 119 | return 120 | # special char check 121 | if ('"""' in func_str) or ('\\\n' in func_str): 122 | return 123 | 124 | # suitable function 125 | generate_example(found_func_names, func_node) 126 | num_generated_funcs += 1 127 | 128 | def process(max_num_examples): 129 | file_str = sys.stdin.read() 130 | file_str = file_str.replace('\x00','').strip() 131 | functions = [f.strip() for f in file_str.split("###\n")] 132 | for f in functions: 133 | if num_generated_funcs >= max_num_examples: 134 | break 135 | try: 136 | func_node = ast.parse(f) 137 | except SyntaxError: 138 | continue 139 | for node in func_node.body: 140 | if not isinstance(node, ast.FunctionDef): 141 | continue 142 | process_function(node) 143 | print("Number of examples: %s" % num_generated_funcs, file=sys.stderr) 144 | 145 | def main(): 146 | if (len(sys.argv) != 2): 147 | usage() 148 | 149 | # print csv preamble 150 | print("prompt,classes,answer_index") 151 | 152 | process(int(sys.argv[1])) 153 | 154 | if __name__ == "__main__": 155 | main() 156 | 157 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # The Larger They Are, the Harder They Fail: Language Models do not Recognize Identifier Swaps in Python 2 | 3 | Code for data generation and evaluation. Based on a submission to the Inverse Scaling Prize https://github.com/inverse-scaling/prize , task `python_builtins_swap` 4 | by Antonio Valerio Miceli-Barone amiceli@ed.ac.uk and Fazl Barez f.barez@ed.ac.uk. 5 | Paper: https://arxiv.org/abs/2305.15507 6 | 7 | ## Task description 8 | 9 | We ask the model to complete a python function given a declaration and docstring, but with a caveat: before the function declaration, we add a statement (e.g. `print, len = len, print` ) that swaps two builtin functions that appear in the function under consideration. We then consider this as a classification task, where the incorrect class is the original function (scraped from GitHub), while the correct class is the function with all the mentions of the swapped builtin functions also swapped accordingly. The hypothesis is that larger models will tend to generate more idiomatic but ultimately incorrect code which would not take into account the unusual function swap. 10 | 11 | Drake meme 12 | 13 | ### Why is the task important? 14 | 15 | For this question, explain your hypothesis for why you expect the task described above to demonstrate inverse scaling. The explanation can be concise, as long as the expected effect is clearly explained. 16 | 17 | Example: We expect this task to demonstrate inverse scaling because larger language models are better at picking up on and matching the bias in the question, which will lead them to change their answer more. 18 | 19 | ### Why do you expect to see inverse scaling? 20 | 21 | Larger models may be more prone to reproduce the typical distribution of code seen during training, where unusual swaps (e.g. print with len) are normally not present. 22 | 23 | ### Why is the task novel or surprising? 
24 | 25 | Is inverse scaling on the task novel (not shown in prior work) and/or surprising? Why or why not? 26 | 27 | ### Dataset generation procedure 28 | 29 | 1. We scrape python code from GitHub using https://github.com/uclnlp/pycodesuggest 30 | 2. We extract top-level functions with a docstring 31 | 3. We take each function that calls at least two different builtin functions, randomly select two of these, and then we create a prompt (everything up to the docstring) and a correct and incorrect pair of "classes" (everything after the docstring, with and without the correct substitution) 32 | 33 | ## Code generation 34 | 35 | In order to generate the dataset, first clone the pycodesuggest repository in the `gen_data` directory and scrape python repositories from GitHub. 36 | For this submission we downloaded 559 repositories from the most recent snapshot of GitHub available on 16 Dec 2022. 37 | 38 | We used the command: 39 | ``` 40 | python3 /path_to_pycodesuggest/github-scraper/scraper.py --mode new --outdir=/full_path_to_scrape_output_dir/scrape/ --dbfile=//full_path_to_scrape_output_dir/cloned_repos.dat --githubuser=amiceli --search="CC-BY-4.0 in:readme size:<=200000" 41 | ``` 42 | which we stopped after getting enough repositories. 43 | We did not use the normalization scripts. 44 | 45 | The generated database and file list are available in the gen_data directory 46 | 47 | After the download is complete, run `filter_functions_with_docstrings_and_shuffle.py` and `generate_examples.py` to generate the dataset. We arbitrarily cut off the dataset at 1000 examples. Run `generate_examples_no_builtins.py` to generate the alternate dataset where non-builtin functions are swapped. Both datasets are available in the `cc_4_0_licensed/` directory. 48 | This code depends on `astunparse 1.6.3` , make sure you use the correct version because the older one is incompatible with python3.8 . 
49 | 50 | ## Evaluation 51 | 52 | For our main experiments, clone our modified version of the Inverse Scaling Prize repository `inverse-scaling-eval-pipeline` and follow the instructions. The `experiments/` directory contains a jupyter notebook to generate the plots in the paper. 53 | 54 | For our experiments on the Chat LLMs, use the jupyter notebook in the `eval_chat_llms/` directory. 55 | 56 | ## Results 57 | 58 | All the models tested always prefer the incorrect answer to the correct one, hence classification accuracy is zero. For some model families the preference is more prominent in terms of classification loss for bigger models, resulting in inverse scaling. 59 | 60 | ![Main experimental results](experiments/all_models_loss_plot.png?raw=true "Main experimental results") 61 | 62 | Similar results are observed on the Chat LLMs in the OpenAI family and Anthropic family. 63 | 64 | ![Chat LLMs results](eval_chat_llms/chat_llms_classsification_plot.png?raw=true "Chat LLMs results") 65 | 66 | Inverse scaling is also observed when swapping non-builtin top-level functions. 67 | 68 | ![Non-builtin experiment results](experiments/non_builtins_gpt3_loss_plot.png?raw=true "Non-builtin experiment results") 69 | 70 | LLMs prefer incorrect programs that use functions in a common way to out-of-distribution but correct programs. 71 | 72 | ## Copyright takedown 73 | 74 | If you believe that material you own a copyright to has been included into our dataset and you wish it to be removed, please contact the authors by opening an issue on this GitHub repository. 75 | 76 | ## Cite this work 77 | 78 | Please cite this work as: 79 | ``` 80 | @inproceedings{miceli-barone-etal-2023-larger, 81 | title = "The Larger they are, the Harder they Fail: Language Models do not Recognize Identifier Swaps in Python", 82 | author = "Miceli Barone, Antonio Valerio and 83 | Barez, Fazl and 84 | Cohen, Shay B. 
and 85 | Konstas, Ioannis", 86 | booktitle = "Findings of the Association for Computational Linguistics: ACL 2023", 87 | month = jul, 88 | year = "2023", 89 | address = "Toronto, Canada", 90 | publisher = "Association for Computational Linguistics", 91 | url = "https://aclanthology.org/2023.findings-acl.19", 92 | pages = "272--292", 93 | } 94 | ``` 95 | -------------------------------------------------------------------------------- /eval_chat_llms/eval_chat_llms.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "5d64b071", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import sys, os\n", 11 | "from collections import namedtuple\n", 12 | "import csv\n", 13 | "import ast\n", 14 | "import re\n", 15 | "\n", 16 | "from tqdm import tqdm\n", 17 | "\n", 18 | "import openai, anthropic\n", 19 | "import backoff\n", 20 | "import tiktoken\n", 21 | "\n", 22 | "from langchain.chat_models import ChatOpenAI\n", 23 | "from langchain.chat_models import ChatAnthropic\n", 24 | "from langchain.prompts.chat import (\n", 25 | " ChatPromptTemplate,\n", 26 | " SystemMessagePromptTemplate,\n", 27 | " AIMessagePromptTemplate,\n", 28 | " HumanMessagePromptTemplate,\n", 29 | ")\n", 30 | "from langchain.schema import (\n", 31 | " AIMessage,\n", 32 | " HumanMessage,\n", 33 | " SystemMessage\n", 34 | ")" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "id": "1d4f2d0b", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "oai_api_key_file = \"path/to/openai_key\"\n", 45 | "anthropic_api_key_file = \"path/to/anthropic_key\"\n", 46 | "\n", 47 | "with open(oai_api_key_file, \"r\") as api_key_fs:\n", 48 | " os.environ[\"OPENAI_API_KEY\"] = api_key_fs.read().strip()\n", 49 | " \n", 50 | "with open(anthropic_api_key_file, \"r\") as api_key_fs:\n", 51 | " os.environ[\"ANTHROPIC_API_KEY\"] = api_key_fs.read().strip()" 52 | ] 
53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "id": "bfd3ee1a", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "test_set_file = \"../cc_4_0_licensed/python_builtins_swap.csv\" # Path to test set" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "id": "f58467d7", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "test_set = []\n", 72 | "with open(test_set_file, \"r\") as in_fs:\n", 73 | " reader = csv.reader(in_fs)\n", 74 | " for i, row in enumerate(reader):\n", 75 | " if i == 0:\n", 76 | " continue # skip header\n", 77 | " example = {\"prompt\": row[0], \"classes\": ast.literal_eval(row[1]), \"answer_index\": int(row[2])}\n", 78 | " test_set.append(example)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 6, 84 | "id": "5700c058", 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "prompt_template = \\\n", 89 | "\"\"\"\n", 90 | "Consider the following Python programs:\n", 91 | "\n", 92 | "Program 1:\n", 93 | "\n", 94 | "{program_1}\n", 95 | "\n", 96 | "Program 2:\n", 97 | "\n", 98 | "{program_2}\n", 99 | "\n", 100 | "Which program is more likely to be correct? 
Write only the number of the program and nothing else.\n", 101 | "\"\"\"" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 26, 107 | "id": "43c2dcca", 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "batch_size = 20 # note: actual submitted batches are twice as big since each example is sumbitted in two variants\n", 112 | "max_tokens_per_example = 4000 # Computed using tiktoken, not including the system prompt, assuming gpt-3.5-turbo-0301" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 13, 118 | "id": "08771c15", 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "tokenizer = tiktoken.encoding_for_model(\"gpt-3.5-turbo-0301\")" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 27, 128 | "id": "c1105335", 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "@backoff.on_exception(backoff.expo, Exception)\n", 133 | "def chat_llm_batch(chat_model, batch_messages, **kwargs):\n", 134 | " return chat_model.generate(batch_messages, **kwargs)\n", 135 | "\n", 136 | "def eval_chat_model_batch(chat_model, batch, switch_order):\n", 137 | " b_accuracy, b_num_valid = 0, 0\n", 138 | " batch_messages = []\n", 139 | " y_batch = []\n", 140 | " for example in batch:\n", 141 | " y = example[\"answer_index\"]\n", 142 | " if switch_order:\n", 143 | " y = 1 - y\n", 144 | " x = prompt_template.format(\n", 145 | " program_1 = example[\"prompt\"] + example[\"classes\"][1-y].rstrip() + \"\\n\",\n", 146 | " program_2 = example[\"prompt\"] + example[\"classes\"][y].rstrip() + \"\\n\")\n", 147 | " if len(tokenizer.encode(x)) > max_tokens_per_example:\n", 148 | " # too long, skip\n", 149 | " continue\n", 150 | " x_messages = [\n", 151 | " SystemMessage(content=\"You are a helpful assistant.\"),\n", 152 | " HumanMessage(content=x)]\n", 153 | " batch_messages.append(x_messages)\n", 154 | " y_batch.append(y)\n", 155 | " llm_batch_response = chat_llm_batch(chat_model, 
batch_messages)\n", 156 | " for i, gen in enumerate(llm_batch_response.generations):\n", 157 | " match = re.search(r'(\\d+)', gen[0].text)\n", 158 | " if match is None:\n", 159 | " continue\n", 160 | " gen_class = int(match.group(1)) - 1 # generated class ids should be returned by the llm in the 1, 2 range\n", 161 | " if (gen_class < 0) or (gen_class > 1):\n", 162 | " continue\n", 163 | " if gen_class == y:\n", 164 | " b_accuracy += 1\n", 165 | " b_num_valid += 1\n", 166 | " return b_accuracy, b_num_valid \n", 167 | "\n", 168 | "def eval_chat_model(chat_model, test_set):\n", 169 | " raw_total_accuracy, num_valid, num_examples = 0, 0, 0\n", 170 | " batch = []\n", 171 | " for i in tqdm(range(len(test_set) + 1)):\n", 172 | " if (i >= len(test_set)) or (len(batch) >= batch_size):\n", 173 | " b_accuracy, b_num_valid = eval_chat_model_batch(chat_model, batch, False)\n", 174 | " raw_total_accuracy += b_accuracy\n", 175 | " num_valid += b_num_valid\n", 176 | " num_examples += len(batch)\n", 177 | " b_accuracy, b_num_valid = eval_chat_model_batch(chat_model, batch, True)\n", 178 | " raw_total_accuracy += b_accuracy\n", 179 | " num_valid += b_num_valid\n", 180 | " num_examples += len(batch)\n", 181 | " batch = []\n", 182 | " if i < len(test_set):\n", 183 | " batch.append(test_set[i])\n", 184 | " total_accuracy = float(raw_total_accuracy) / num_examples\n", 185 | " total_accuracy_on_valid = float(raw_total_accuracy) / num_valid\n", 186 | " return {\n", 187 | " \"raw_total_accuracy\": raw_total_accuracy,\n", 188 | " \"total_accuracy\": total_accuracy,\n", 189 | " \"total_accuracy_on_valid\": total_accuracy_on_valid,\n", 190 | " \"num_valid\": num_valid,\n", 191 | " \"num_examples\": num_examples}" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 15, 197 | "id": "73b7d0d6", 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "eval_results = {}" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 16, 207 | "id": 
"21f91b50", 208 | "metadata": {}, 209 | "outputs": [ 210 | { 211 | "name": "stdout", 212 | "output_type": "stream", 213 | "text": [ 214 | "claude-instant-v1.1\n" 215 | ] 216 | }, 217 | { 218 | "name": "stderr", 219 | "output_type": "stream", 220 | "text": [ 221 | "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1001/1001 [19:59<00:00, 1.20s/it]" 222 | ] 223 | }, 224 | { 225 | "name": "stdout", 226 | "output_type": "stream", 227 | "text": [ 228 | "{'raw_total_accuracy': 206, 'total_accuracy': 0.103, 'total_accuracy_on_valid': 0.103, 'num_valid': 2000, 'num_examples': 2000}\n", 229 | "----\n" 230 | ] 231 | }, 232 | { 233 | "name": "stderr", 234 | "output_type": "stream", 235 | "text": [ 236 | "\n" 237 | ] 238 | } 239 | ], 240 | "source": [ 241 | "model_name = \"claude-instant-v1.1\"\n", 242 | "print(model_name)\n", 243 | "chat = ChatAnthropic(model=model_name, temperature=0.0)\n", 244 | "chat_eval = eval_chat_model(chat, test_set)\n", 245 | "print(chat_eval)\n", 246 | "print(\"----\")\n", 247 | "eval_results[model_name] = chat_eval" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 17, 253 | "id": "eec967f4", 254 | "metadata": {}, 255 | "outputs": [ 256 | { 257 | "name": "stdout", 258 | "output_type": "stream", 259 | "text": [ 260 | "claude-v1.3\n" 261 | ] 262 | }, 263 | { 264 | "name": "stderr", 265 | "output_type": "stream", 266 | "text": [ 267 | "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1001/1001 [44:04<00:00, 2.64s/it]" 268 | ] 269 | }, 270 | { 271 | "name": "stdout", 272 | "output_type": "stream", 273 | "text": [ 274 | "{'raw_total_accuracy': 350, 'total_accuracy': 0.175, 'total_accuracy_on_valid': 0.175, 'num_valid': 2000, 'num_examples': 2000}\n", 275 | "----\n" 276 | ] 277 
| }, 278 | { 279 | "name": "stderr", 280 | "output_type": "stream", 281 | "text": [ 282 | "\n" 283 | ] 284 | } 285 | ], 286 | "source": [ 287 | "model_name = \"claude-v1.3\"\n", 288 | "print(model_name)\n", 289 | "chat = ChatAnthropic(model=model_name, temperature=0.0)\n", 290 | "chat_eval = eval_chat_model(chat, test_set)\n", 291 | "print(chat_eval)\n", 292 | "print(\"----\")\n", 293 | "eval_results[model_name] = chat_eval" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 19, 299 | "id": "70146189", 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "gpt-3.5-turbo-0301\n" 307 | ] 308 | }, 309 | { 310 | "name": "stderr", 311 | "output_type": "stream", 312 | "text": [ 313 | " 0%| | 0/1001 [00:00._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID c972c4c81559af3b5b87a200dcc30cfb in your message.).\n", 314 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 733f3116a1f0e13399dc761d63e86d5b in your message.).\n", 315 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. 
(Please include the request ID 648d55296b55331c90f1a708a9bd62c7 in your message.).\n", 316 | " 2%|███▌ | 21/1001 [02:56<2:16:53, 8.38s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 78abc76238809b53f9b07291f1a65258 in your message.).\n", 317 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID cbe735c97c99a54afd478332b013c5ed in your message.).\n", 318 | " 4%|██████▉ | 41/1001 [05:14<2:00:46, 7.55s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 48ddf4dc0e07f85fc3db394a86e6c552 in your message.).\n", 319 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. 
(Please include the request ID e9e2af2e93cbc267679bf53fe51a109f in your message.).\n", 320 | " 6%|██████████▎ | 61/1001 [07:44<1:57:51, 7.52s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 7a9b9aee36b37a43e784d0830f35c10a in your message.).\n", 321 | " 8%|█████████████▊ | 81/1001 [09:06<1:34:43, 6.18s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 3ac3ad0f5c4d17c377a7d75a0bdf8157 in your message.).\n", 322 | " 12%|████████████████████▍ | 121/1001 [12:22<1:20:02, 5.46s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID dec996e5bfdf68528d8986637bf3108a in your message.).\n", 323 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. 
(Please include the request ID fc39bfc5a6d35e87a62107d35277fca7 in your message.).\n", 324 | " 14%|███████████████████████▊ | 141/1001 [15:24<1:35:14, 6.64s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID d6477786da0b945cea8d95863896c327 in your message.).\n", 325 | " 16%|███████████████████████████▏ | 161/1001 [16:53<1:23:21, 5.95s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID d3833f81fdd9fec2df7701cc1483dc3f in your message.).\n", 326 | " 18%|██████████████████████████████▌ | 181/1001 [18:42<1:19:08, 5.79s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised APIError: Gateway timeout. 
{\"error\":{\"code\":524,\"message\":\"Gateway timeout.\",\"param\":null,\"type\":\"cf_gateway_timeout\"}} 524 {'error': {'code': 524, 'message': 'Gateway timeout.', 'param': None, 'type': 'cf_gateway_timeout'}} {'Date': 'Fri, 19 May 2023 03:15:11 GMT', 'Content-Type': 'application/json', 'Content-Length': '92', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7c991de88ac554ca-MAN'}.\n", 327 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 496f5b43f10392e79944b269889b42a3 in your message.).\n", 328 | " 20%|█████████████████████████████████▉ | 201/1001 [30:18<3:16:36, 14.75s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 6af8f193ec2819f0363a67c631400d29 in your message.).\n", 329 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. 
(Please include the request ID 9297c1114defe99a5b9b459d1470d8d0 in your message.).\n", 330 | " 22%|█████████████████████████████████████▎ | 221/1001 [32:28<2:38:51, 12.22s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID bce0e2a4229097fef4d23f837ac4132e in your message.).\n", 331 | " 24%|████████████████████████████████████████▋ | 241/1001 [33:59<2:05:06, 9.88s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 5d1c9c5efcec53a9913a851b0abd6c1c in your message.).\n", 332 | " 26%|████████████████████████████████████████████ | 261/1001 [35:32<1:42:20, 8.30s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID f53ddf65745275ba339510f961c0bcb2 in your message.).\n", 333 | " 28%|███████████████████████████████████████████████▍ | 281/1001 [37:26<1:30:14, 7.52s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. 
(Please include the request ID 19ea986237046f51342d73572cc6c074 in your message.).\n", 334 | " 30%|██████████████████████████████████████████████████▊ | 301/1001 [39:11<1:19:47, 6.84s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 98ef64371c5e795ad3ca04d4a24ca4fd in your message.).\n", 335 | " 32%|██████████████████████████████████████████████████████▏ | 321/1001 [41:09<1:14:10, 6.54s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 30e3584dbb4e7843696d79b1fa83f665 in your message.).\n", 336 | " 34%|█████████████████████████████████████████████████████████▌ | 341/1001 [43:30<1:13:42, 6.70s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 1bf905f0dac6b4c92702626c230b2f4f in your message.).\n", 337 | " 36%|████████████████████████████████████████████████████████████▉ | 361/1001 [45:17<1:07:09, 6.30s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. 
(Please include the request ID f4035d75dd635543f08622cfd86c69ca in your message.).\n", 338 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID e9492f8ec1f661477f4b52449868e0a2 in your message.).\n", 339 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 58ab1ed704edbe3d4a782e3efa1b0804 in your message.).\n", 340 | " 42%|███████████████████████████████████████████████████████████████████████ | 421/1001 [53:32<1:07:23, 6.97s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 72c834add675b33b571e686c2b067d00 in your message.).\n", 341 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. 
(Please include the request ID 0caddff1ad816fb84e83f49a1e37a7b3 in your message.).\n", 342 | " 44%|██████████████████████████████████████████████████████████████████████████▍ | 441/1001 [55:34<1:02:38, 6.71s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID b324e50d8c4110282a8a9af917f0c724 in your message.).\n" 343 | ] 344 | }, 345 | { 346 | "name": "stderr", 347 | "output_type": "stream", 348 | "text": [ 349 | " 48%|██████████████████████████████████████████████████████████████████████████████████▏ | 481/1001 [58:53<50:31, 5.83s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised APIError: Bad gateway. {\"error\":{\"code\":502,\"message\":\"Bad gateway.\",\"param\":null,\"type\":\"cf_bad_gateway\"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Fri, 19 May 2023 03:51:10 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7c9959abca650763-MAN', 'alt-svc': 'h3=\":443\"; ma=86400, h3-29=\":443\"; ma=86400'}.\n", 350 | " 50%|███████████████████████████████████████████████████████████████████████████████████▌ | 501/1001 [1:05:28<1:23:20, 10.00s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. 
You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 2020fb8629e9fd48eeb974cc62481fb1 in your message.).\n", 351 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID af7bb2e3b582bddbcd48ea164b36fb2d in your message.).\n", 352 | " 54%|███████████████████████████████████████████████████████████████████████████████████████████▎ | 541/1001 [1:09:15<59:20, 7.74s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 4274e08ee3a21135720cc16c3e0c960e in your message.).\n", 353 | " 56%|██████████████████████████████████████████████████████████████████████████████████████████████▋ | 561/1001 [1:10:59<51:11, 6.98s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 6ba0f86850591bd07462a749f88ab058 in your message.).\n", 354 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. 
(Please include the request ID c3d5797b533845fd80bbc1393090cb50 in your message.).\n", 355 | " 58%|██████████████████████████████████████████████████████████████████████████████████████████████████ | 581/1001 [1:13:43<51:22, 7.34s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 2a02fb0cd0ce7f57eb623983b3792e0a in your message.).\n", 356 | " 60%|█████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 601/1001 [1:15:25<44:28, 6.67s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 936ac7fe674e2306fab43b20573538f7 in your message.).\n", 357 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 2.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID b294124c9922f82529e844869da4d225 in your message.).\n", 358 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. 
(Please include the request ID 7373f73a07cfd74e79d5fb1f6d811277 in your message.).\n", 359 | " 62%|████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 621/1001 [1:18:19<46:05, 7.28s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID a9a7f7388809a45a3fb4dbdfa6a77350 in your message.).\n", 360 | " 64%|████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 641/1001 [1:20:10<40:32, 6.76s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 2b62b51e0e87101a332677549da0b9a7 in your message.).\n", 361 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 7ebcc84bdaeec7be21746738735456f5 in your message.).\n", 362 | " 66%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 661/1001 [1:22:18<37:41, 6.65s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. 
You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID fbf712d25b96e0040928232e05bdee5b in your message.).\n", 363 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 35b74a1c994aac9566b6f33741f11e24 in your message.).\n" 364 | ] 365 | }, 366 | { 367 | "name": "stderr", 368 | "output_type": "stream", 369 | "text": [ 370 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID efe51d859e825291eccc62b2e2796018 in your message.).\n", 371 | " 68%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉ | 681/1001 [1:24:56<37:28, 7.03s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. 
(Please include the request ID 52bf904a49ade142ef428ef051853999 in your message.).\n", 372 | " 72%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 721/1001 [1:27:49<25:58, 5.57s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 42c459ad2eca5e1b81ec8f88c0d7eee9 in your message.).\n", 373 | " 76%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 761/1001 [1:30:44<19:26, 4.86s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 87b82c18a0a1b33f41cbca55c960aeeb in your message.).\n", 374 | " 82%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 821/1001 [1:34:15<11:30, 3.83s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. 
(Please include the request ID 41a47e96cbe37f3397c5b5166495cef6 in your message.).\n", 375 | " 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 861/1001 [1:36:41<08:28, 3.64s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 3ff19d7b74541c1b5afb2a9f0ac46c1c in your message.).\n", 376 | " 90%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ | 901/1001 [1:39:31<06:21, 3.82s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 39b0247737cc2c2050ac03dcdc2c5269 in your message.).\n", 377 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 66943b060398bcc7d95fb44583c4b6c1 in your message.).\n", 378 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. 
(Please include the request ID b3f23958dd2cef1b88b8fccb16e63a8c in your message.).\n", 379 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 2cc01eacb10be5cdf644e6d02230edfd in your message.).\n", 380 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 20119bfc788cca651c52d853d1194f4f in your message.).\n", 381 | " 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 941/1001 [1:44:24<05:14, 5.24s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 6dcd037fd7fe193836125a6693c3ae51 in your message.).\n", 382 | " 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 961/1001 [1:45:53<03:19, 5.00s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. 
(Please include the request ID 624f7b69e1bd6ab0a1b671899826671c in your message.).\n", 383 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 465a7df4df9531babf1321524f807425 in your message.).\n", 384 | "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1001/1001 [1:49:18<00:00, 6.55s/it]" 385 | ] 386 | }, 387 | { 388 | "name": "stdout", 389 | "output_type": "stream", 390 | "text": [ 391 | "{'raw_total_accuracy': 67, 'total_accuracy': 0.0335, 'total_accuracy_on_valid': 0.0393885949441505, 'num_valid': 1701, 'num_examples': 2000}\n", 392 | "----\n" 393 | ] 394 | }, 395 | { 396 | "name": "stderr", 397 | "output_type": "stream", 398 | "text": [ 399 | "\n" 400 | ] 401 | } 402 | ], 403 | "source": [ 404 | "model_name=\"gpt-3.5-turbo-0301\"\n", 405 | "chat = ChatOpenAI(model_name=model_name, temperature=0.0)\n", 406 | "print(model_name)\n", 407 | "chat_eval = eval_chat_model(chat, test_set)\n", 408 | "print(chat_eval)\n", 409 | "print(\"----\")\n", 410 | "eval_results[model_name] = chat_eval" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 28, 416 | "id": "cc837ea7", 417 | "metadata": {}, 418 | "outputs": [ 419 | { 420 | "name": "stdout", 421 | "output_type": "stream", 422 | "text": [ 423 | "gpt-4-0314\n" 424 | ] 425 | }, 426 | { 427 | "name": "stderr", 428 | "output_type": "stream", 429 | "text": [ 430 | " 72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 721/1001 [37:03<14:16, 3.06s/it]Retrying 
langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised APIError: Bad gateway. {\"error\":{\"code\":502,\"message\":\"Bad gateway.\",\"param\":null,\"type\":\"cf_bad_gateway\"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Fri, 19 May 2023 11:33:58 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7c9bff9a1cee0abf-MAN', 'alt-svc': 'h3=\":443\"; ma=86400, h3-29=\":443\"; ma=86400'}.\n", 431 | " 74%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌ | 741/1001 [43:15<33:26, 7.72s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised APIError: Bad gateway. {\"error\":{\"code\":502,\"message\":\"Bad gateway.\",\"param\":null,\"type\":\"cf_bad_gateway\"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Fri, 19 May 2023 11:39:29 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7c9c07a8dc0b0abf-MAN', 'alt-svc': 'h3=\":443\"; ma=86400, h3-29=\":443\"; ma=86400'}.\n", 432 | "Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised APIError: Bad gateway. 
{\"error\":{\"code\":502,\"message\":\"Bad gateway.\",\"param\":null,\"type\":\"cf_bad_gateway\"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Fri, 19 May 2023 11:45:41 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7c9c10c48a440abf-MAN', 'alt-svc': 'h3=\":443\"; ma=86400, h3-29=\":443\"; ma=86400'}.\n", 433 | " 78%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 781/1001 [55:46<43:39, 11.91s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised APIError: Bad gateway. {\"error\":{\"code\":502,\"message\":\"Bad gateway.\",\"param\":null,\"type\":\"cf_bad_gateway\"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Fri, 19 May 2023 11:52:04 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7c9c1a1b797e0abf-MAN', 'alt-svc': 'h3=\":443\"; ma=86400, h3-29=\":443\"; ma=86400'}.\n", 434 | " 80%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 801/1001 [1:02:04<46:43, 14.02s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised APIError: Bad gateway. 
{\"error\":{\"code\":502,\"message\":\"Bad gateway.\",\"param\":null,\"type\":\"cf_bad_gateway\"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Fri, 19 May 2023 11:58:54 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7c9c2414dc6a0abf-MAN', 'alt-svc': 'h3=\":443\"; ma=86400, h3-29=\":443\"; ma=86400'}.\n", 435 | " 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎ | 861/1001 [1:10:26<21:29, 9.21s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised APIError: Bad gateway. 
{\"error\":{\"code\":502,\"message\":\"Bad gateway.\",\"param\":null,\"type\":\"cf_bad_gateway\"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Fri, 19 May 2023 12:07:25 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7c9c30972daa0abf-MAN', 'alt-svc': 'h3=\":443\"; ma=86400, h3-29=\":443\"; ma=86400'}.\n", 436 | " 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋ | 881/1001 [1:16:48<24:20, 12.17s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised APIError: Bad gateway. 
{\"error\":{\"code\":502,\"message\":\"Bad gateway.\",\"param\":null,\"type\":\"cf_bad_gateway\"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Fri, 19 May 2023 12:13:40 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7c9c39b60c800abf-MAN', 'alt-svc': 'h3=\":443\"; ma=86400, h3-29=\":443\"; ma=86400'}.\n", 437 | " 92%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍ | 921/1001 [1:24:10<14:30, 10.88s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised APIError: Bad gateway. 
{\"error\":{\"code\":502,\"message\":\"Bad gateway.\",\"param\":null,\"type\":\"cf_bad_gateway\"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Fri, 19 May 2023 12:20:28 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7c9c43b4faa10abf-MAN', 'alt-svc': 'h3=\":443\"; ma=86400, h3-29=\":443\"; ma=86400'}.\n", 438 | " 96%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏ | 961/1001 [1:31:23<06:46, 10.15s/it]Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.._completion_with_retry in 1.0 seconds as it raised APIError: Bad gateway. 
{\"error\":{\"code\":502,\"message\":\"Bad gateway.\",\"param\":null,\"type\":\"cf_bad_gateway\"}} 502 {'error': {'code': 502, 'message': 'Bad gateway.', 'param': None, 'type': 'cf_bad_gateway'}} {'Date': 'Fri, 19 May 2023 12:27:50 GMT', 'Content-Type': 'application/json', 'Content-Length': '84', 'Connection': 'keep-alive', 'X-Frame-Options': 'SAMEORIGIN', 'Referrer-Policy': 'same-origin', 'Cache-Control': 'private, max-age=0, no-store, no-cache, must-revalidate, post-check=0, pre-check=0', 'Expires': 'Thu, 01 Jan 1970 00:00:01 GMT', 'Server': 'cloudflare', 'CF-RAY': '7c9c4e6ded4d0abf-MAN', 'alt-svc': 'h3=\":443\"; ma=86400, h3-29=\":443\"; ma=86400'}.\n" 439 | ] 440 | }, 441 | { 442 | "name": "stderr", 443 | "output_type": "stream", 444 | "text": [ 445 | "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1001/1001 [1:38:48<00:00, 5.92s/it]" 446 | ] 447 | }, 448 | { 449 | "name": "stdout", 450 | "output_type": "stream", 451 | "text": [ 452 | "{'raw_total_accuracy': 37, 'total_accuracy': 0.0185, 'total_accuracy_on_valid': 0.018518518518518517, 'num_valid': 1998, 'num_examples': 2000}\n", 453 | "----\n" 454 | ] 455 | }, 456 | { 457 | "name": "stderr", 458 | "output_type": "stream", 459 | "text": [ 460 | "\n" 461 | ] 462 | } 463 | ], 464 | "source": [ 465 | "model_name=\"gpt-4-0314\"\n", 466 | "chat = ChatOpenAI(model_name=model_name, temperature=0.0)\n", 467 | "print(model_name)\n", 468 | "chat_eval = eval_chat_model(chat, test_set)\n", 469 | "print(chat_eval)\n", 470 | "print(\"----\")\n", 471 | "eval_results[model_name] = chat_eval" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "id": "a2c8575a", 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 69, 485 | "id": "81b9df05", 486 | "metadata": {}, 487 | "outputs": [], 488 | 
"source": [ 489 | "import json\n", 490 | "import matplotlib.pyplot as plt\n", 491 | "import numpy as np\n", 492 | "from pathlib import Path" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": 30, 498 | "id": "2ab4c70d", 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "out_file_name = \"eval_chat_llms_results.json\"" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 33, 508 | "id": "1438aa8c", 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "with open(out_file_name, \"w\") as out_fs:\n", 513 | " json.dump(eval_results, out_fs)" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "id": "733358a0", 520 | "metadata": {}, 521 | "outputs": [], 522 | "source": [] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": 34, 527 | "id": "9c51c0c9", 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "plt.rcParams.update({'font.size': 12.0})\n", 532 | "plt.rcParams.update({'figure.titlesize': 'small'})\n", 533 | "plt.rcParams.update({'legend.fontsize': 'small'})" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 73, 539 | "id": "20ab4444", 540 | "metadata": {}, 541 | "outputs": [ 542 | { 543 | "data": { 544 | "image/png": "\n", 545 | "text/plain": [ 546 | "
" 547 | ] 548 | }, 549 | "metadata": {}, 550 | "output_type": "display_data" 551 | } 552 | ], 553 | "source": [ 554 | "title = \"Classification accuracy of chat LLMs\"\n", 555 | "\n", 556 | "model_names_succinct = [\"-\".join(model_name.split(\"-\")[:-1]) for model_name in eval_results.keys()]\n", 557 | "ind = np.arange(len(model_names_succinct))\n", 558 | "width = 0.25\n", 559 | "\n", 560 | "pos_vals = [100.0 * eval_results[model_name]['total_accuracy'] for model_name in eval_results.keys()]\n", 561 | "plt.bar(ind, pos_vals, width, label='Correct', color='blue')\n", 562 | "\n", 563 | "neg_vals = [100.0 * (eval_results[model_name]['num_examples'] - eval_results[model_name]['raw_total_accuracy']) / eval_results[model_name]['num_examples'] for model_name in eval_results.keys()]\n", 564 | "plt.bar(ind+width, neg_vals, width, label='Incorrect', color='red')\n", 565 | "\n", 566 | "invalid_vals = [100.0 * (1.0 - (eval_results[model_name]['num_valid'] / eval_results[model_name]['num_examples'])) for model_name in eval_results.keys()]\n", 567 | "plt.bar(ind+2*width, invalid_vals, width, label='Invalid', color='black')\n", 568 | "\n", 569 | "plt.xlabel(\"Model\")\n", 570 | "plt.title(title)\n", 571 | "plt.xticks(ind + (width), model_names_succinct)\n", 572 | "plt.legend(bbox_to_anchor=(1.05, 1.0), loc='upper left')\n", 573 | "plt.tight_layout()\n", 574 | "\n", 575 | "plt.savefig(Path(\"./\", \"chat_llms_classsification_plot.svg\"), format=\"svg\")\n", 576 | "plt.savefig(Path(\"./\", \"chat_llms_classsification_plot.png\"), format=\"png\")\n", 577 | "plt.savefig(Path(\"./\", \"chat_llms_classsification_plot.pdf\"), format=\"pdf\")" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": 71, 583 | "id": "e6ce861b", 584 | "metadata": {}, 585 | "outputs": [ 586 | { 587 | "data": { 588 | "text/plain": [ 589 | "{'claude-instant-v1.1': {'raw_total_accuracy': 206,\n", 590 | " 'total_accuracy': 0.103,\n", 591 | " 'total_accuracy_on_valid': 0.103,\n", 592 | " 
'num_valid': 2000,\n", 593 | " 'num_examples': 2000},\n", 594 | " 'claude-v1.3': {'raw_total_accuracy': 350,\n", 595 | " 'total_accuracy': 0.175,\n", 596 | " 'total_accuracy_on_valid': 0.175,\n", 597 | " 'num_valid': 2000,\n", 598 | " 'num_examples': 2000},\n", 599 | " 'gpt-3.5-turbo-0301': {'raw_total_accuracy': 67,\n", 600 | " 'total_accuracy': 0.0335,\n", 601 | " 'total_accuracy_on_valid': 0.0393885949441505,\n", 602 | " 'num_valid': 1701,\n", 603 | " 'num_examples': 2000},\n", 604 | " 'gpt-4-0314': {'raw_total_accuracy': 37,\n", 605 | " 'total_accuracy': 0.0185,\n", 606 | " 'total_accuracy_on_valid': 0.018518518518518517,\n", 607 | " 'num_valid': 1998,\n", 608 | " 'num_examples': 2000}}" 609 | ] 610 | }, 611 | "execution_count": 71, 612 | "metadata": {}, 613 | "output_type": "execute_result" 614 | } 615 | ], 616 | "source": [ 617 | "eval_results" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": 74, 623 | "id": "5f9b7e1f", 624 | "metadata": {}, 625 | "outputs": [], 626 | "source": [ 627 | "from IPython.display import Image" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": 75, 633 | "id": "5865c575", 634 | "metadata": {}, 635 | "outputs": [ 636 | { 637 | "data": { 638 | "image/png": "\n", 639 | "text/plain": [ 640 | "" 641 | ] 642 | }, 643 | "execution_count": 75, 644 | "metadata": {}, 645 | "output_type": "execute_result" 646 | } 647 | ], 648 | "source": [ 649 | "Image(\"./chat_llms_classsification_plot.png\")" 650 | ] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": null, 655 | "id": "c715b0ca", 656 | "metadata": {}, 657 | "outputs": [], 658 | "source": [] 659 | } 660 | ], 661 | "metadata": { 662 | "kernelspec": { 663 | "display_name": "Python 3 (ipykernel)", 664 | "language": "python", 665 | "name": "python3" 666 | }, 667 | "language_info": { 668 | "codemirror_mode": { 669 | "name": "ipython", 670 | "version": 3 671 | }, 672 | "file_extension": ".py", 673 | "mimetype": "text/x-python", 
674 | "name": "python", 675 | "nbconvert_exporter": "python", 676 | "pygments_lexer": "ipython3", 677 | "version": "3.8.13" 678 | } 679 | }, 680 | "nbformat": 4, 681 | "nbformat_minor": 5 682 | } 683 | --------------------------------------------------------------------------------