├── llms
│   ├── __init__.py
│   ├── llama_model.py
│   ├── openrouter_model.py
│   ├── groq_model.py
│   ├── anthropic_model.py
│   ├── gemini_model.py
│   ├── cohere_model.py
│   ├── mistral_model.py
│   ├── moonshot_model.py
│   ├── vertexai_model.py
│   └── openai_model.py
├── tests
│   ├── __init__.py
│   ├── what_is_slice_stop.py
│   ├── what_is_formatfloat.py
│   ├── latex_mini_caps.py
│   ├── what_is_blockbyorb.py
│   ├── which_package_sbox.py
│   ├── tokenizer_vocab.py
│   ├── knowledge_llama.py
│   ├── emacs_lisp_silence_cmd.py
│   ├── db9_pinout.py
│   ├── unit_conversion_math.py
│   ├── do_uudecode.py
│   ├── identify_uuencode.py
│   ├── what_is_automodel.py
│   ├── latex_protect.py
│   ├── what_is_inv.py
│   ├── print_hello.py
│   ├── save_expired_html.py
│   ├── latex_redef.py
│   ├── docker_cuda.py
│   ├── freecad_construction.py
│   ├── implement_crc32.py
│   ├── what_is_oraw.py
│   ├── base64_qanda.py
│   ├── python_to_c_loop_update.py
│   ├── program_pipes_python.py
│   ├── program_pipes_cpp.py
│   ├── gitignore_anywhere.py
│   ├── strided_trick.py
│   ├── shorten_python_if_missing.py
│   ├── vague_loop_format.py
│   ├── upython_mqtt.py
│   ├── decompile_py_simple.py
│   ├── regex_remove_5_words.py
│   ├── explain_vbroadcast.py
│   ├── numpy_ix.py
│   ├── simulate_torch_grad.py
│   ├── numba_levenshtein.py
│   ├── python_jpeg.py
│   ├── dedent_code_fn.py
│   ├── program_sqrt.py
│   ├── convert_to_c_simple.py
│   ├── bash_find_dont_contain.py
│   ├── jax_onehot.py
│   ├── generate_string_moves.py
│   ├── python_chess_game_prefix.py
│   ├── aws_ipv6.py
│   ├── hallucinate_reference.py
│   ├── explain_code_prime2.py
│   ├── fix_node_error.py
│   ├── bash_renamer.py
│   ├── draw_flag_bmp.py
│   ├── baking_help.py
│   ├── explain_code_prime.py
│   ├── easy_parser_generator.py
│   ├── bash_list_files_by_size_mod_ten.py
│   ├── numpy_advanced_index.py
│   ├── make_sqlite_table.py
│   ├── python_traceback.py
│   ├── bash_convert_not_overwrite.py
│   ├── torch_to_jnp.py
│   ├── fix_tokenizer.py
│   ├── unholy_matrix.py
│   ├── change_filetype.py
│   ├── fix_append_vs_extend.py
│   ├── make_tree_from_text.py
│   ├── fix_json.py
│   ├── webgl_triangle.py
│   ├── c_weird_expression.py
│   ├── date_news_headlines.py
│   ├── flexbox_webpage.py
│   ├── basic_git_setup.py
│   ├── jnp_nn_bugfix.py
│   ├── why_broken_flask_extra_brace.py
│   ├── faster_l2_diff.py
│   ├── call_rust_from_python.py
│   ├── convert_dp_to_iterative.py
│   ├── vectorize_small_update.py
│   ├── vague_sum_data.py
│   ├── play_20_questions.py
│   ├── fancy_sql_process.py
│   ├── print_hello_poly.py
│   ├── extract_emails.py
│   ├── rewrite_mac_crypto.py
│   ├── rust_word_count.py
│   ├── py_image_resize.py
│   ├── gol_rle_decode.py
│   ├── fix_torch_backward.py
│   ├── merge_into_16.py
│   ├── make_json.py
│   ├── emoji_movies.py
│   ├── git_cherrypick.py
│   ├── python_parallel_wordcount.py
│   ├── debug_innerhtml_eventlistener.py
│   ├── convert_to_c.py
│   ├── fix_with_patch.py
│   ├── shorten_c_function_hard.py
│   ├── implement_assembly_interpreter_by_example.py
│   ├── whisper_merge.py
│   ├── rust_parallel_wordcount.py
│   ├── debug_broken_code_parcount.py
│   ├── fix_threading_issue.py
│   ├── shorten_c_function.py
│   ├── decompile_py_mid.py
│   ├── explore_sql_db.py
│   ├── basic_code_understanding.py
│   ├── find_bug_in_paper.py
│   ├── git_merge.py
│   ├── implement_assembly_interpreter.py
│   ├── data_table_processing.py
│   ├── program_in_new_assembly.py
│   └── extract_references.py
├── setup.sh
├── .gitignore
├── requirements.txt
├── requirements-extra.txt
├── config.json.example
├── Dockerfile
└── llm.py
/llms/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | --------------------------------------------------------------------------------
/tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | --------------------------------------------------------------------------------
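Every test module in this listing builds its check as a left-to-right pipeline, for example `question >> LLMRun() >> SubstringEvaluator(...)`. The `evaluator.py` module that defines these stages is not part of this listing, so the sketch below is only a minimal illustration of the chaining idiom, assuming each stage overloads `>>` and hands its output to the next stage. The class names are borrowed from the tests, but the bodies are simplified stand-ins rather than the repository's real implementations, and the real module also combines evaluators with `&`, `|` and `~`, which is omitted here.

# Illustrative sketch of the `>>` idiom; not a file from this repository.
class Node:
    """One stage of a test pipeline; `a >> b` feeds a's output into b."""
    def __rshift__(self, other):
        return Chain(self, other)

    def __rrshift__(self, other):
        # Lets a plain question string start a chain, as in `question >> LLMRun()`.
        return Chain(Constant(other), self)

    def run(self, value):
        raise NotImplementedError

class Constant(Node):
    """Wraps a fixed value, such as the question text."""
    def __init__(self, value):
        self.value = value

    def run(self, value=None):
        return self.value

class Chain(Node):
    """Runs the left stage, then hands its result to the right stage."""
    def __init__(self, left, right):
        self.left, self.right = left, right

    def run(self, value=None):
        return self.right.run(self.left.run(value))

class SubstringEvaluator(Node):
    """Passes when the expected substring occurs in the upstream text."""
    def __init__(self, needle, lower=False):
        self.needle, self.lower = needle, lower

    def run(self, value):
        hay = value.lower() if self.lower else value
        want = self.needle.lower() if self.lower else self.needle
        return want in hay

# Example: ("some text" >> SubstringEvaluator("some")).run(None) evaluates to True.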
/setup.sh: -------------------------------------------------------------------------------- 1 | docker build -t ubuntu-python-app . -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .API_KEY 2 | config.json 3 | myenv 4 | tmp/* -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | docker 2 | numpy 3 | Pillow 4 | openai 5 | markdown 6 | pygments 7 | selenium 8 | -------------------------------------------------------------------------------- /requirements-extra.txt: -------------------------------------------------------------------------------- 1 | llama-cpp-python 2 | anthropic 3 | google-cloud-aiplatform 4 | cohere 5 | google-generativeai 6 | groq -------------------------------------------------------------------------------- /tests/what_is_slice_stop.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case checks if the model can say how to properly get the end of a slice." 4 | 5 | TAGS = ['explain', 'python'] 6 | 7 | question = """ 8 | How do I get sice(0, 5).end? 9 | """ 10 | 11 | TestWhatIsSlice = question >> LLMRun() >> SubstringEvaluator(".stop") 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestWhatIsSlice)) 15 | -------------------------------------------------------------------------------- /tests/what_is_formatfloat.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case checks if models can format f strings with floats." 4 | 5 | TAGS = ['explain', 'python'] 6 | 7 | question = """ 8 | Python format {sdf} to 3 decimal digits 9 | """ 10 | 11 | TestWhatIsFloatFormat = question >> LLMRun() >> SubstringEvaluator(":.3f}") 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestWhatIsFloatFormat)) 15 | -------------------------------------------------------------------------------- /tests/latex_mini_caps.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model knows some latex macros explained poorly" 4 | 5 | TAGS = ['explain'] 6 | 7 | question = "Latex format the word METHOD like Method but all caps with the other letters lower" 8 | 9 | TextLatexLowerCaps = question >> LLMRun() >> SubstringEvaluator("textsc") 10 | 11 | if __name__ == "__main__": 12 | print(run_test(TextLatexLowerCaps)) 13 | 14 | 15 | -------------------------------------------------------------------------------- /tests/what_is_blockbyorb.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model knows what ERR_BLOCKED_BY_ORB means." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | What does "ERR_BLOCKED_BY_ORB" mean? 
9 | """ 10 | 11 | TestWhatIsBlockByOrb = question >> LLMRun() >> SubstringEvaluator("opaque resource blocking", lower=True) 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestWhatIsBlockByOrb)) 15 | 16 | -------------------------------------------------------------------------------- /tests/which_package_sbox.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case checks if the model knows what latex package to import for the Sbox environment to work." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | What latex package has the Sbox env? 9 | """ 10 | 11 | TestWhereIsSbox = question >> LLMRun() >> SubstringEvaluator("fancybox") 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestWhereIsSbox)) 15 | -------------------------------------------------------------------------------- /tests/tokenizer_vocab.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case is designed to check if the model can print out the tokens in a AutoTokenizer's vocabulary." 4 | 5 | TAGS = ['explain', 'python'] 6 | 7 | question = """ 8 | How do I get all tokens in a AutoTokenizer? 9 | """ 10 | 11 | TestGetVocab = question >> LLMRun() >> SubstringEvaluator("get_vocab") 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestGetVocab)) 15 | 16 | -------------------------------------------------------------------------------- /tests/knowledge_llama.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test the knowledge cutoff of the model to see if it knows the LLAMA-2 hidden dimension size." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | What is the hidden dimension of LLAMA-2 70b? 9 | """ 10 | 11 | TestLlamaKnowledge = question >> LLMRun() >> (SubstringEvaluator("8192") | SubstringEvaluator("8,192")) 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestLlamaKnowledge)) 15 | -------------------------------------------------------------------------------- /tests/emacs_lisp_silence_cmd.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can understand a vague error for an emacs lisp question." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | In emacs lisp how do I silence "Process python-process finished" 9 | """ 10 | 11 | TestLispSilencePython = question >> LLMRun() >> SubstringEvaluator("set-process-sentinel") 12 | 13 | 14 | if __name__ == "__main__": 15 | print(run_test(TestLispSilencePython)) 16 | -------------------------------------------------------------------------------- /tests/db9_pinout.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model knows about old computer ports when prompted ambiguously." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | What port has 5 pins on the top and 4 on the bottom? 
9 | """ 10 | 11 | TestDB9 = question >> LLMRun() >> (SubstringEvaluator("DB-9") | SubstringEvaluator("DB9") | SubstringEvaluator("DE-9") | SubstringEvaluator("DE9")) 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestDB9)) 15 | -------------------------------------------------------------------------------- /tests/unit_conversion_math.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can do basic math with some EE equations." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = "I measure a voltage drop of 30mv over a 2.5 ohm resistor. How long will a 3Ah battery last? Write your ansser 'XXX hours'" 8 | 9 | TestUnitConversion = question >> LLMRun() >> RegexEvaluator("250.{,10}hours") 10 | 11 | if __name__ == "__main__": 12 | print(run_test(TestUnitConversion)) 13 | 14 | 15 | -------------------------------------------------------------------------------- /tests/do_uudecode.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can successfully uudecode a given string." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | uudecode this string: 9 | 10 | H,3DR-S<-"C(X,C> LLMRun() >> (SubstringEvaluator("60826") & SubstringEvaluator("20715") & SubstringEvaluator("19277")) 15 | 16 | 17 | if __name__ == "__main__": 18 | print(run_test(TestUUDecode)) 19 | 20 | -------------------------------------------------------------------------------- /tests/identify_uuencode.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can correctly identify a block of text is uuencoded." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | What encoding scheme does it look like this is encoded with? 9 | 10 | H,3DR-S<-"C(X,C> LLMRun() >> SubstringEvaluator("uuencode", lower=True) 15 | 16 | 17 | if __name__ == "__main__": 18 | print(run_test(TestIsUU)) 19 | 20 | -------------------------------------------------------------------------------- /tests/what_is_automodel.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can interpret vague questions and will respond with the answer I want, not the answer that's easy to find." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | What do I do to fix AutoModel.from_pretrained to make it auto model with lm head 9 | """ 10 | 11 | TestWhatIsAutoModel = question >> LLMRun() >> SubstringEvaluator("AutoModelForCausalLM") 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestWhatIsAutoModel)) 15 | -------------------------------------------------------------------------------- /tests/latex_protect.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can fix a latex newline error in a caption" 4 | 5 | TAGS = ['explain'] 6 | 7 | question = r""" 8 | In latex when I force a newline in a caption with \\ it gives an error saying \Hy@tempa has an extra } runaway argument? 
9 | """ 10 | 11 | # \newline doesn't work the same I need \\ 12 | TestLatexNewline = question >> LLMRun() >> SubstringEvaluator("\\protect") 13 | 14 | if __name__ == "__main__": 15 | print(run_test(TestLatexNewline)) 16 | -------------------------------------------------------------------------------- /tests/what_is_inv.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case is designed to check if the model can correctly identify the Python operator used for the tilde (~) symbol." 4 | 5 | TAGS = ['explain', 'python'] 6 | 7 | question = """ 8 | In python what __thing__ do I use for ~, kind of like how __add__ is for +? 9 | """ 10 | 11 | TestWhatIsInv = question >> LLMRun() >> (SubstringEvaluator("__inv__") | SubstringEvaluator("__invert__")) 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestWhatIsInv)) 15 | -------------------------------------------------------------------------------- /tests/print_hello.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate a basic python program that prints \"hello world\"." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = 'Write a python program that prints the string "hello world" and tell me how it works in a sentence' 8 | 9 | answer = "hello world" 10 | 11 | TestPrintHello = question >> LLMRun() >> ExtractCode(keep_main=True) >> PythonRun() >> SubstringEvaluator(answer) 12 | 13 | 14 | if __name__ == "__main__": 15 | print(run_test(TestPrintHello)) 16 | 17 | -------------------------------------------------------------------------------- /tests/save_expired_html.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model knows how to get the HTML for the entire webpage; not just the body." 4 | 5 | TAGS = ['explain', 'html'] 6 | 7 | question = """ 8 | I have a webpage that's open in a chrome browser but the page no longer exists. What javascript should I type in the console to get a copy of the entire page html? 9 | """ 10 | 11 | TestRecoverExpiredPage = question >> LLMRun() >> SubstringEvaluator("outerHTML") 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestRecoverExpiredPage)) 15 | 16 | -------------------------------------------------------------------------------- /tests/latex_redef.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can use latex \\renewcommand, and do a bit more than what I actually asked." 
4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | Redfine \\paragraph to be \\bigskip\\textbf 9 | """ 10 | 11 | TestLatexRedef = question >> LLMRun() >> ((SubstringEvaluator("\\renewcommand{\\paragraph}") | SubstringEvaluator("\\renewcommand\\paragraph")) & SubstringEvaluator("\\bigskip") & SubstringEvaluator("\\textbf") & SubstringEvaluator("\\noindent")) 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestLatexRedef)) 15 | -------------------------------------------------------------------------------- /tests/docker_cuda.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case checks if the model can debug a docker cuda error" 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | nicholas@server:~/project$ sudo docker start myproject 9 | [sudo] password for nicholas: 10 | Error response from daemon: could not select device driver "" with capabilities: [[gpu]] 11 | Error: failed to start containers: myproject 12 | """ 13 | 14 | TestFixDockerCuda = question >> LLMRun() >> (SubstringEvaluator("nvidia-container-toolkit") | SubstringEvaluator("nvidia-docker2")) 15 | 16 | if __name__ == "__main__": 17 | print(run_test(TestFixDockerCuda)) 18 | -------------------------------------------------------------------------------- /tests/freecad_construction.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model understands a rambling question about how to make construction circle in FreeCAD." 4 | 5 | TAGS = ['explain', 'fun'] 6 | 7 | question = """ 8 | In freecad I want to have a circle but only like for reference not to actually be a circle but just so I can line up other lines to make a hexagon for example, how do I do that 9 | """ 10 | 11 | TestFreeCADCircle = question >> LLMRun() >> (SubstringEvaluator("construction geometry") | SubstringEvaluator("construction circle")) 12 | 13 | 14 | if __name__ == "__main__": 15 | print(run_test(TestFreeCADCircle)) 16 | -------------------------------------------------------------------------------- /tests/implement_crc32.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model understands the CRC-32 spec well enough to implement it." 4 | 5 | TAGS = ['code', 'c'] 6 | import zlib 7 | 8 | question = 'Write a c program that calculates the crc32 hash of the string "correct horse battery staple" and prints it to stdout. Dont call any builtin crc32 functions.' 
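# For reference, the expected answer below is derived from zlib.crc32, i.e. the standard
# CRC-32 used by zip and PNG: reflected polynomial 0xEDB88320, initial value 0xFFFFFFFF,
# final XOR with 0xFFFFFFFF. A minimal table-free Python sketch of that variant follows,
# purely as an aid for comparing against a generated C answer; the harness itself only
# uses zlib.
def crc32_reference(data: bytes) -> int:
    crc = 0xFFFFFFFF
    for byte in data:
        crc ^= byte
        for _ in range(8):
            # Test the low bit before shifting; XOR in the polynomial when it was set.
            crc = (crc >> 1) ^ (0xEDB88320 if crc & 1 else 0)
    return crc ^ 0xFFFFFFFF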
9 | 10 | answer = "%x"%zlib.crc32(b'correct horse battery staple') 11 | 12 | TestCRC32 = question >> LLMRun() >> ExtractCode(keep_main=True) >> Echo() >> CRun() >> SubstringEvaluator(answer, lower=True) 13 | 14 | 15 | if __name__ == "__main__": 16 | print(run_test(TestCRC32)) 17 | 18 | -------------------------------------------------------------------------------- /llms/llama_model.py: -------------------------------------------------------------------------------- 1 | from llama_cpp import Llama 2 | 3 | 4 | class LLAMAModel: 5 | def __init__(self, path): 6 | self.llm = Llama(model_path=path, chat_format="llama-2", n_ctx=1524) 7 | 8 | def make_request(self, conversation, add_image=None, logit_bias=None, max_tokens=None, skip_cache=False): 9 | conversation = [{"role": "user" if i%2 == 0 else "assistant", "content": content} for i,content in enumerate(conversation)] 10 | print("Start chat") 11 | out = self.llm.create_chat_completion( 12 | messages = conversation 13 | ) 14 | print("End chat") 15 | return out['choices'][0]['message']['content'] 16 | -------------------------------------------------------------------------------- /tests/what_is_oraw.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case checks if the model knows lpr commands." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | I have a .ps file that I want to print from my mac. I'm trying to use lpr. But every time I print it all I get is the actual bytes just printed with like hundreds of pages and this is not what I want. I want the printer to just print the ps directly. What command should I use to make this work? 9 | """ 10 | 11 | TestWhatIsLPR = question >> LLMRun() >> (SubstringEvaluator("-o raw") | SubstringEvaluator("-l ")) 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestWhatIsLPR)) 15 | 16 | -------------------------------------------------------------------------------- /config.json.example: -------------------------------------------------------------------------------- 1 | { 2 | "container": "podman", 3 | "hparams": { 4 | "temperature": 0.7 5 | }, 6 | "llms": { 7 | "vertexai": { 8 | "project_id": "TODO" 9 | }, 10 | "openai": { 11 | "api_key": "TODO" 12 | }, 13 | "mistral": { 14 | "api_key": "TODO" 15 | }, 16 | "cohere": { 17 | "api_key": "TODO" 18 | }, 19 | "anthropic": { 20 | "api_key": "TODO" 21 | }, 22 | "moonshot": { 23 | "api_key": "TODO" 24 | }, 25 | "groq": { 26 | "api_key": "TODO" 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /tests/base64_qanda.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model will follow instructions to the letter without lots of cajoling. Thinking in base64 is also interesting." 4 | 5 | TAGS = ['explain', 'fun'] 6 | import base64 7 | 8 | question = """ 9 | Read this base64 string, think about the answer, and type just the answer in base64. You're entire answer must be base64. 
10 | 11 | V2hhdCBpcyBvbmUgeWVhciBhZnRlciB3aGVuIFByZXNpZGVudCBnZW9yZ2Ugd2FzaGluZ3RvbiBib3JuPw== 12 | """ 13 | 14 | TestBase64Thought = question >> LLMRun() >> PyFunc(lambda x: base64.b64decode(x).decode('utf-8')) >> SubstringEvaluator("1733") 15 | 16 | if __name__ == "__main__": 17 | print(run_test(TestBase64Thought)) 18 | 19 | -------------------------------------------------------------------------------- /tests/python_to_c_loop_update.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can convert a python program to c, with a loop that makes it difficult." 4 | 5 | TAGS = ['code', 'python', 'c'] 6 | 7 | question = """ 8 | Convert this to c 9 | 10 | def foo(x): 11 | sum = 0 12 | for i in range(x): 13 | x += i 14 | sum += x 15 | return sum 16 | """ 17 | 18 | test_case, answer = make_c_test([("foo(10)", "265")]) 19 | 20 | 21 | TestPythonToCLoopUpdate = question >> LLMRun() >> ExtractCode() >> CRun(test_case) >> SubstringEvaluator(answer) 22 | 23 | if __name__ == "__main__": 24 | print(run_test(TestPythonToCLoopUpdate)) 25 | -------------------------------------------------------------------------------- /tests/program_pipes_python.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate a python program that defines dataflow DSL." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Write a python class Const, Square, Print so that I can write, for example, 9 | 10 | Cost(5) >> Square() >> Inc() >> Inc() >> Print() 11 | 12 | and the resulting python program will print 27 to the screen. 13 | """ 14 | 15 | test_case, answer = make_python_test([("Const(2) >> Square() >> Inc() >> Square() >> Square() >> Print()", "0")]) 16 | 17 | TestMakeShiftOp = question >> LLMRun() >> ExtractCode() >> PythonRun(test_case) >> SubstringEvaluator("625") 18 | 19 | 20 | if __name__ == "__main__": 21 | print(run_test(TestMakeShiftOp)) 22 | -------------------------------------------------------------------------------- /tests/program_pipes_cpp.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate a C++ program that defines dataflow DSL." 4 | 5 | TAGS = ['code', 'c'] 6 | 7 | question = """ 8 | Write a C++ class Const, Square, Print so that I can write, for example, 9 | 10 | Cost(5) >> Square() >> Inc() >> Inc() >> Print() 11 | 12 | and the resulting python program will print 27 to the screen. 13 | """ 14 | 15 | test_case = """int main() { 16 | Const(2) >> Square() >> Inc() >> Square() >> Square() >> Print(); 17 | return 0; 18 | }""" 19 | 20 | 21 | TestMakeShiftOpC = question >> LLMRun() >> ExtractCode() >> CppRun(test_case) >> SubstringEvaluator("625") 22 | 23 | 24 | if __name__ == "__main__": 25 | print(run_test(TestMakeShiftOpC)) 26 | -------------------------------------------------------------------------------- /tests/gitignore_anywhere.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can understand and interpret a request to gitignore any file called \"foo/.KEYFILE\" regardless of its location in a repository." 
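# The two checks below expect two different gitignore patterns:
#   **/foo/.KEYFILE      .KEYFILE sitting directly inside any directory named foo
#   **/foo/**/.KEYFILE   .KEYFILE at any depth underneath any directory named foo
# The leading **/ is what lets foo itself appear anywhere in the repository.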
4 | 5 | TAGS = ['explain'] 6 | 7 | question1 = """ 8 | gitignore any file called .KEYFILE directly inside a directory foo, which itself is anywhere 9 | """ 10 | 11 | TestWhatIsStarStar = question1 >> LLMRun() >> SubstringEvaluator("**/foo/.KEYFILE") 12 | 13 | question2 = """ 14 | gitignore any file called .KEYFILE anywhere under a directory foo, which itself is anywhere 15 | """ 16 | 17 | TestWhatIsStarStarB = question2 >> LLMRun() >> SubstringEvaluator("**/foo/**/.KEYFILE") 18 | 19 | if __name__ == "__main__": 20 | print(run_test(TestWhatIsStarStarB)) 21 | -------------------------------------------------------------------------------- /tests/strided_trick.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model knows how to use the strided trick with numpy." 4 | 5 | TAGS = ['code', 'python', 'performance'] 6 | 7 | question = """ 8 | Write a function strides() that given an array of tokens T as numpy array, using strided trick, returns [T[0:50], T[10:60], T[20: 70] ... ] 9 | """ 10 | 11 | test_case, answer = make_python_test([("strides(np.arange(100)).shape", "(6, 50)"), 12 | ("strides(np.arange(100)).sum()", "14850")]) 13 | 14 | TestProgramStrided = question >> LLMRun() >> ExtractCode() >> ((PythonRun(test_case) >> (SubstringEvaluator(answer))) & SubstringEvaluator("as_strided")) 15 | 16 | if __name__ == "__main__": 17 | print(run_test(TestProgramStrided)) 18 | -------------------------------------------------------------------------------- /tests/shorten_python_if_missing.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can shorten a line of python with an equal line." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Shorten this python code. Keep the name the same. 9 | ``` 10 | def foo(arr, keys): 11 | return [[x[y] if y in x else -5 for y in keys] for x in arr] 12 | ``` 13 | """ 14 | 15 | test_case, answer = make_python_test([("foo([{1: 2, 3: 4, 5: 6}], [1, 2, 3, 4])", "[[2, -5, 4, -5]]")]) 16 | 17 | TestShortenPyGet = question >> LLMRun() >> (SubstringEvaluator(".get") & (ExtractCode() >> PythonRun(test_case) >> SubstringEvaluator(answer))) 18 | 19 | 20 | 21 | if __name__ == "__main__": 22 | print(run_test(TestShortenPyGet)) 23 | 24 | -------------------------------------------------------------------------------- /tests/vague_loop_format.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can follow vague instructions for how to print IDs following an example." 
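# The check assembled below chains one SubstringEvaluator per id with &: months are
# zero-padded to two digits (01-12) and ids to three digits (001-003), so a passing
# program has to print all 36 ids, 2401_001 through 2412_003.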
4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Write a python program that for ids like 9 | 10 | 2401_002 11 | 12 | For months 1..12 and ids 1..3 13 | 14 | so like 24{month}_{id} 15 | 16 | print all the ids 17 | """ 18 | 19 | evaluator = SubstringEvaluator("2401_001") 20 | for month in range(1, 13): 21 | for id in range(1, 4): 22 | evaluator &= SubstringEvaluator(f"24{month:02}_{id:03}") 23 | 24 | 25 | TestVagueLoopFormat = question >> LLMRun() >> ExtractCode(keep_main=True) >> PythonRun() >> evaluator 26 | 27 | 28 | if __name__ == "__main__": 29 | print(run_test(TestVagueLoopFormat)) 30 | 31 | 32 | -------------------------------------------------------------------------------- /tests/upython_mqtt.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can write upython code with an obscure module." 4 | 5 | TAGS = ['python', 'code'] 6 | 7 | question = """ 8 | Write a python program that I can use to run on my raspberry pi pico w zero that will check for mqtt messages. Don't block, and blink the LED on every loop iteration so I know it's polling. When it gets a message then set the LED solid for a second. 9 | """ 10 | 11 | # crude testing because there isn't an actual pico w to run this on... 12 | TestUPythonMQTT = question >> LLMRun() >> (SubstringEvaluator("check_msg") & SubstringEvaluator("MQTTClient") & SubstringEvaluator("set_callback") & SubstringEvaluator('WLAN') & SubstringEvaluator("STA_IF")) 13 | 14 | if __name__ == "__main__": 15 | print(run_test(TestUPythonMQTT)) 16 | 17 | -------------------------------------------------------------------------------- /tests/decompile_py_simple.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can disassemble a simple Python function from its bytecode." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Disassemble the following python into a function called foo: 9 | 10 | 56 0 LOAD_FAST 0 (a) 11 | 2 LOAD_FAST 1 (b) 12 | 4 LOAD_FAST 0 (a) 13 | 6 BINARY_MULTIPLY 14 | 8 BINARY_ADD 15 | 10 RETURN_VALUE 16 | 17 | """ 18 | 19 | test_case, answer = make_python_test([("foo(5, 7)", "5+7*5")]) 20 | 21 | 22 | TestDisas1 = question >> LLMRun() >> ExtractCode(keep_main=False) >> PythonRun(test_case) >> SubstringEvaluator(answer) 23 | 24 | 25 | if __name__ == "__main__": 26 | print(run_test(TestDisas1)) 27 | -------------------------------------------------------------------------------- /tests/regex_remove_5_words.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write a Python function with a straightforward regex." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Write a function called `match` in Python that uses a regex to find any word repeated 5 times or more. Return that word. If no word is repeated 5 times or more, return an empty string. 
9 | """ 10 | 11 | test_case, answer = make_python_test([("match('hi hi hi hi hi')", '"hi"'), 12 | ("match('hi hi hi hi')", '""'), 13 | ("match('hello this is is is is is a test')", '"is"')]) 14 | 15 | 16 | TestRegex = question >> LLMRun() >> ExtractCode(keep_main=False) >> PythonRun(test_case) >> SubstringEvaluator(answer) 17 | 18 | if __name__ == "__main__": 19 | print(run_test(TestRegex)) 20 | -------------------------------------------------------------------------------- /tests/explain_vbroadcast.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can correctly explain what the VPBROADCASTB instruction does." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | What does the VPBROADCASTB instruction do? 9 | """ 10 | 11 | evaluation = """ 12 | Below is a student's answer to a question about what a particular instruction does: 13 | 14 | Does the student's say something similar to "Load integer data from the source operand (the second operand) and broadcast to all elements of the destination operand (the first operand)."? 15 | 16 | If the answer is correct say "The student passes" otherwise "The student fails". 17 | """ 18 | 19 | TestExplainBroadcast = question >> LLMRun() >> LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("student passes") 20 | 21 | 22 | if __name__ == "__main__": 23 | print(run_test(TestExplainBroadcast)) 24 | -------------------------------------------------------------------------------- /tests/numpy_ix.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can identify the _ix function as a method for simplifying some code." 4 | 5 | TAGS = ['explain', 'python'] 6 | 7 | question = '''I want to vectorize a function 8 | 9 | frame = np.zeros((out_h, out_w, 3), dtype=np.uint8) 10 | for i, val1 in enumerate(rows): 11 | for j, val2 in enumerate(cols): 12 | for j, val3 in enumerate(ch): 13 | # Assuming you want to store the pair as tuples in the matrix 14 | frame[i, j, k] = image[val1, val2, val3] 15 | 16 | out.write(np.array(frame)) 17 | 18 | with a simple numpy function that does something like this what is it called''' 19 | 20 | TestNumpyIx = question >> LLMRun() >> SubstringEvaluator("ix_") 21 | 22 | 23 | if __name__ == "__main__": 24 | print(run_test(TestNumpyIx)) 25 | 26 | -------------------------------------------------------------------------------- /tests/simulate_torch_grad.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case checks if the model can predict what the gradient of a variable is in PyTorch." 
4 | 5 | TAGS = ['explain', 'python'] 6 | 7 | question = """ 8 | What will this function print 9 | 10 | ``` 11 | def diff_round(x, decimals=1): 12 | scale_factor = (10 ** decimals) 13 | x = x * scale_factor 14 | diff = (1 + 1e-2) * x - torch.floor(x) 15 | x = x - diff + (torch.floor(x) + torch.where(diff >= 0.5, 1, 0)) 16 | x = x / scale_factor 17 | return x 18 | 19 | g = torch.tensor([.99, 1.54, 1.9], dtype=torch.float32, requires_grad=True) 20 | loss = torch.sum(diff_round(g, 1)) 21 | loss.backward() 22 | print(g.grad.sum()) 23 | ``` 24 | """ 25 | 26 | TestSimTorchGrad = question >> LLMRun() >> SubstringEvaluator("-0.03") 27 | 28 | if __name__ == "__main__": 29 | print(run_test(TestSimTorchGrad)) 30 | 31 | -------------------------------------------------------------------------------- /tests/numba_levenshtein.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate a numba implementation of the Levenshtein distance algorithm." 4 | 5 | TAGS = ['code', 'python', 'performance'] 6 | 7 | question = """ 8 | Write me a numba implementation of levensthin distance, taking two numpy int arrays. Call it levenshtein. Run it in nopython mode. Character swaps are cost 2. 9 | """ 10 | 11 | test_case, answer = make_python_test([("levenshtein(np.array([3, 5, 2, 4, 8, 7]), np.array([3, 4, 2, 4, 8, 7, 9]))", "3")]) 12 | 13 | 14 | TestProgramNumbaLev = question >> LLMRun() >> ExtractCode(keep_main=False) >> (((SubstringEvaluator("from numba") | SubstringEvaluator("import numba")) & (SubstringEvaluator("nopython") | SubstringEvaluator("njit"))) & PythonRun(test_case) >> SubstringEvaluator(answer)) 15 | 16 | if __name__ == "__main__": 17 | print(run_test(TestProgramNumbaLev)) 18 | -------------------------------------------------------------------------------- /tests/python_jpeg.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can write a program that directly writes a jpeg file. This requires precise understanding of the jpeg spec." 4 | 5 | TAGS = ['code', 'python', 'visual'] 6 | 7 | question = """ 8 | Write a python program that, manually, draws a red triangle on a white background and saves it as a 200x200 gif file. Do not use any graphics libraries. Write the gif byte by byte. Output the gif file to stdout. 9 | 10 | I will evaluate your program by running `python your_soln.py > /tmp/a.gif` and then view the output file. 11 | """ 12 | 13 | TestFlagDraw = question >> LLMRun() >> ExtractCode(keep_main=True) >> PythonRun(out_bytes=True) >> LLMVisionRun("Describe the shapes in this image and their color") >> (SubstringEvaluator("triangle") & SubstringEvaluator("red")) 14 | 15 | if __name__ == "__main__": 16 | print(run_test(TestFlagDraw)) 17 | -------------------------------------------------------------------------------- /tests/dedent_code_fn.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write a Python function that removes excess indentation from a given block of code." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Write a python function `dedent(str)` that takes as input a string of python code and de-dents it maximally. Lines that are entirely blank don't count. 
9 | """ 10 | 11 | test_case, answer = make_python_test([("""dedent(''' print("hello") 12 | print("world") 13 | 14 | if True: 15 | print("true") 16 | else: 17 | print("false")''')""", '''"""print("hello") 18 | print("world") 19 | 20 | if True: 21 | print("true") 22 | else: 23 | print("false")"""''')]) 24 | 25 | 26 | TestDedent = question >> LLMRun() >> ExtractCode(keep_main=True) >> PythonRun(test_case) >> SubstringEvaluator(answer) 27 | 28 | if __name__ == "__main__": 29 | print(run_test(TestDedent)) 30 | 31 | -------------------------------------------------------------------------------- /llms/openrouter_model.py: -------------------------------------------------------------------------------- 1 | 2 | from openai import OpenAI 3 | import json 4 | 5 | from llms.openai_model import OpenAIModel 6 | 7 | 8 | class OpenRouterModel(OpenAIModel): 9 | def __init__(self, name): 10 | config = json.load(open("config.json")) 11 | api_key = config["llms"]["openrouter"]["api_key"].strip() 12 | # Inspired by https://openrouter.ai/docs/quickstart 13 | self.client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=api_key) 14 | self.name = name[len("openrouter/") :] 15 | self.hparams = config["hparams"] 16 | self.hparams.update(config["llms"]["openrouter"].get("hparams") or {}) 17 | 18 | 19 | if __name__ == "__main__": 20 | # WARNING: must be run with PYTHONPATH=. otherwise there will be an import error 21 | # q = sys.stdin.read().strip() 22 | q = "hello there" 23 | print(q + ":", OpenRouterModel("openrouter/openai/o1-mini").make_request([q])) 24 | -------------------------------------------------------------------------------- /tests/program_sqrt.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can implement a sqrt function." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Tell me how to compute sqrt() in python without using the sqrt() function, then give me a function to do this. I want the answer as precise as possible. Call the function my_sqrt().. 9 | """ 10 | 11 | test_case, answer = make_python_test([("round(my_sqrt(12345), 1)", "111.1")]) 12 | 13 | 14 | TestProgramSqrt = question >> LLMRun() >> ExtractCode() >> ((LLMRun("Below is a student question who was supposed to implement square root from scratch. Did they cheat and just call sqrt directly? Think out loud, then finally answer either 'Yes they cheated' or 'No they did not cheat' ", llm=EVAL_LLM) >> ~SubstringEvaluator("they cheated")) & (PythonRun(test_case) >> SubstringEvaluator(answer))) 15 | 16 | 17 | if __name__ == "__main__": 18 | print(run_test(TestProgramSqrt)) 19 | 20 | -------------------------------------------------------------------------------- /tests/convert_to_c_simple.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can rewrite a very simple Python program into an equivalent C program." 4 | 5 | TAGS = ['code', 'c'] 6 | 7 | question = """ 8 | Here's a python program. Please rewrite it in c with the type signature float my_sqrt(float, float). 
9 | 10 | def my_sqrt(number, precision=0.0001): 11 | guess = number / 2 # Start with an initial guess 12 | 13 | while abs(guess * guess - number) >= precision: 14 | guess = (guess + number / guess) / 2 # Improve the guess using the Babylonian method 15 | 16 | return guess 17 | """ 18 | 19 | test_case, answer = make_c_test([("(int)(my_sqrt(12345, .001)*10)", "1111")]) 20 | 21 | 22 | TestProgramRewriteCSimple = question >> LLMRun() >> ExtractCode() >> CRun(test_case) >> SubstringEvaluator(answer) 23 | 24 | 25 | if __name__ == "__main__": 26 | from llm import llm 27 | print(run_test(TestProgramRewriteCSimple)) 28 | -------------------------------------------------------------------------------- /tests/bash_find_dont_contain.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can implement (the negation of) a simple bash 1-liner searching for files that don't contain some text." 4 | 5 | TAGS = ['bash'] 6 | 7 | question = "In bash list all files in cwd that don't contain asdf in the contents (case-ignore)" 8 | 9 | 10 | def setup_fn(): 11 | open("hello", "w").write("this is a test") 12 | open("there", "w").write("test this ASDF") 13 | open("this", "w").write("whatever thing") 14 | open("asdf", "w").write("zzzthing") 15 | open("lower", "w").write("test this asdf") 16 | 17 | 18 | TestBashFindDontContain = Setup(setup_fn) >> question >> LLMRun() >> ExtractCode(keep_main=True) >> BashRun() >> (SubstringEvaluator("hello") & ~SubstringEvaluator("there") & SubstringEvaluator("this") & SubstringEvaluator("asdf") & ~SubstringEvaluator("lower")) 19 | 20 | 21 | if __name__ == "__main__": 22 | print(run_test(TestBashFindDontContain)) 23 | 24 | -------------------------------------------------------------------------------- /tests/jax_onehot.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can correctly convert a list of indexes to a one-hot vector in Python using JAX." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | I have list of indexes and I want to convert it to one hot vector using jax and the function should be jittable and the function should be jitted. name the function one_hot and it should get two arguments the first one is the indexes and the second one is the number of possible labeles. 
Just give me the code 9 | """ 10 | 11 | test_case, answer = make_python_test([("str(one_hot(jnp.array([1,2,0]),5))","str(jnp.array([[0.0,1.0,0.0,0.0,0.0],[0.0,0.0,1.0,0.0,0.0],[1.0,0.0,0.0,0.0,0.0]]))")],header='import jax.numpy as jnp') 12 | 13 | 14 | 15 | 16 | TestJaxOneHot = question >> LLMRun() >> ExtractCode() >> Echo() >> PythonRun(test_case) >> Echo() >> SubstringEvaluator(answer) 17 | 18 | 19 | if __name__ == "__main__": 20 | print(run_test(TestJaxOneHot)) 21 | -------------------------------------------------------------------------------- /llms/groq_model.py: -------------------------------------------------------------------------------- 1 | import groq 2 | import json 3 | 4 | class GroqModel: 5 | def __init__(self, name): 6 | self.name = name 7 | 8 | config = json.load(open("config.json")) 9 | self.api_key = config['llms']['groq']['api_key'].strip() 10 | 11 | self.hparams = config['hparams'] 12 | self.hparams.update(config['llms']['groq'].get('hparams') or {}) 13 | 14 | def make_request(self, conversation, add_image=None, logit_bias=None, max_tokens=None): 15 | conversation = [{"role": "user" if i%2 == 0 else "assistant", "content": content} for i,content in enumerate(conversation)] 16 | response = groq.Groq(api_key=self.api_key).chat.completions.create( 17 | model=self.name, 18 | max_tokens=2048, 19 | messages=conversation 20 | ) 21 | 22 | return response.choices[0].message.content 23 | 24 | if __name__ == "__main__": 25 | import sys 26 | q = "What's your name?" 27 | print(q+":", GroqModel("deepseek-r1-distill-llama-70b").make_request([q])) 28 | 29 | -------------------------------------------------------------------------------- /tests/generate_string_moves.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write code to perform string slicing with vague instructions." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Write me a function called `move` that, given a string x, returns all possible alternate strings y' such that y' is derived from x by moving a substring of x to an arbitrary position 9 | """ 10 | 11 | 12 | test_case, answer = make_python_test([("set(move('abcdef'))", "set(['abcdef', 'abcdfe', 'abcedf', 'abcefd', 'abcfde', 'abdcef', 'abdecf', 'abdefc', 'abecdf', 'abefcd', 'abfcde', 'acbdef', 'acdbef', 'acdebf', 'acdefb', 'adbcef', 'adebcf', 'adefbc', 'aebcdf', 'aefbcd', 'afbcde', 'bacdef', 'bcadef', 'bcdaef', 'bcdeaf', 'bcdefa', 'cabdef', 'cdabef', 'cdeabf', 'cdefab', 'dabcef', 'deabcf', 'defabc', 'eabcdf', 'efabcd', 'fabcde'])")]) 13 | 14 | TestProgramStringSlice = question >> LLMRun() >> ExtractCode() >> PythonRun(test_case) >> SubstringEvaluator(answer) 15 | 16 | if __name__ == "__main__": 17 | print(run_test(TestProgramStringSlice)) 18 | 19 | -------------------------------------------------------------------------------- /tests/python_chess_game_prefix.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can correctly call a python API for a moderately popular python library." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | In python-chess I have a game = chess.pgn.read_game(pgn). 9 | 10 | How do I print the PGN notation for all prefixes of this game? 11 | 12 | Call your function print_all_prefixes(game). Do not give example code for how to run it. 
13 | 14 | """ 15 | 16 | test_case = """import io 17 | import chess.pgn 18 | print_all_prefixes(chess.pgn.read_game(io.StringIO('1. Nf3 Nf6 2. c4 g6 3. Nc3 Bg7 4. d4 O-O 5. Bf4 d5 6. Qb3 dxc4 7. Qxc4 c6 8. e4 Nbd7 9. Rd1 Nb6 10. Qc5 Bg4 11. Bg5 Na4 12. Qa3 Nxc3 13. bxc3 Nxe4 14. Bxe7 Qb6 15. Bc4 Nxc3')))""" 19 | 20 | def check(txt): 21 | return txt.count('1. Nf3 Nf6 2. c4 g6 3. Nc3 Bg7') > 10, "" 22 | 23 | 24 | 25 | TestPyChessPrefix = question >> LLMRun() >> ExtractCode() >> PythonRun(test_case) >> PyFunc(check) 26 | 27 | if __name__ == "__main__": 28 | print(run_test(TestPyChessPrefix)) 29 | -------------------------------------------------------------------------------- /tests/aws_ipv6.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can identify the error in an AWS Lambda code for authorizing a new network. This type of error is generally difficult to find via search." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """This a code for aws lambda to authorize a new network, what is wrong with this : 8 | security_group.authorize_egress( 9 | IpPermissions=[ 10 | {'IpProtocol': '-1', 11 | 'IpRanges': [{'CidrIpv6': '::/0'}]} 12 | ] 13 | ) 14 | """ 15 | 16 | evaluation = """ 17 | Below is a software engineer's answer to a request to find an issue with the following code: 18 | 19 | Does the engineer mention overly permissive IPv6 ranges? Think out loud, and then end your answer with either the string "The issue is found" or "The issue is not found". 20 | """ 21 | 22 | TestAWSV6 = question >> LLMRun() >> LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("The issue is found") 23 | 24 | if __name__ == "__main__": 25 | print(run_test(TestAWSV6)) 26 | 27 | 28 | -------------------------------------------------------------------------------- /tests/hallucinate_reference.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model will hallucinate references that don't exist." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | For each of the 6 reference in the following paper, extract the title, author, and year as a JSON array. 9 | 10 | Write {1: {"title": "title", "author": "author", "year": "year"}, 1: ..., 6: ...} 11 | 12 | 13 | Paper: 14 | {paper_title} {paper_body} {references} 15 | """ 16 | 17 | evaluation = """ 18 | I asked the student for the references contained a paper that didn't have any references. The student answered "" 19 | 20 | Does the student's answer say something like "There are no references"? If so, the student passes, and write "The student passes". 21 | 22 | Otherwise, if the student lists *any* paper, write "The student fails". 
23 | """ 24 | 25 | TestHallucinateReference = question >> LLMRun() >> LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("student passes") 26 | 27 | 28 | if __name__ == "__main__": 29 | print(run_test(TestHallucinateReference)) 30 | 31 | -------------------------------------------------------------------------------- /llms/anthropic_model.py: -------------------------------------------------------------------------------- 1 | import anthropic 2 | import json 3 | 4 | class AnthropicModel: 5 | def __init__(self, name): 6 | self.name = name 7 | 8 | config = json.load(open("config.json")) 9 | self.api_key = config['llms']['anthropic']['api_key'].strip() 10 | 11 | self.hparams = config['hparams'] 12 | self.hparams.update(config['llms']['anthropic'].get('hparams') or {}) 13 | 14 | def make_request(self, conversation, add_image=None, logit_bias=None, max_tokens=None): 15 | conversation = [{"role": "user" if i%2 == 0 else "assistant", "content": content} for i,content in enumerate(conversation)] 16 | response = anthropic.Anthropic(api_key=self.api_key).messages.create( 17 | model=self.name, 18 | max_tokens=2048, 19 | messages=conversation 20 | ) 21 | 22 | return response.content[0].text 23 | 24 | 25 | if __name__ == "__main__": 26 | import sys 27 | q = "What's your name?" 28 | print(q+":", AnthropicModel("claude-3-5-sonnet-20240620").make_request([q])) 29 | 30 | -------------------------------------------------------------------------------- /tests/explain_code_prime2.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can interpret a minified and obfuscated JavaScript function and explain its function." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | What does this code do 9 | 10 | function z(){let e=[],n=[];for(let r=2;e.length<20;r++)(n=n.map(e=>e-1)).some(e=>0===e)?n=n.map((n,r)=>0===n?e[r]:n):(e.push(r),n.push(r));return e}console.log(z()); 11 | """ 12 | 13 | TestExplainPrime2 = question >> LLMRun() >> LLMRun("I showed a student some code that calculates 20 primes. I then tested their understanding by asking them to explain it to me. I'm going to show you their answer, I want you to evaluate whether or not their answer says something to the effect of \"it calculates the first 20 primes\". Do not judge their reasonining, just tell me if they got it right. 
\n Student Answer: \"\"\n\n Think out loud then answer either \"The student passes\" or \"The student fails\".", llm=EVAL_LLM) >> SubstringEvaluator("student passes") 14 | 15 | if __name__ == "__main__": 16 | print(run_test(TestExplainPrime2)) 17 | -------------------------------------------------------------------------------- /tests/fix_node_error.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can identify a node error message" 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | How do I fix this 9 | 10 | nicholas@npc site % http-server 11 | dyld[63275]: Library not loaded: /opt/homebrew/opt/icu4c/lib/libicui18n.73.dylib 12 | Referenced from: <758FD1B7-1836-321E-A1D9-E47EC3C39702> /opt/homebrew/Cellar/node/21.5.0/bin/node 13 | Reason: tried: '/opt/homebrew/opt/icu4c/lib/libicui18n.73.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/icu4c/lib/libicui18n.73.dylib' (no such file), '/opt/homebrew/opt/icu4c/lib/libicui18n.73.dylib' (no such file), '/opt/homebrew/Cellar/icu4c/74.2/lib/libicui18n.73.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/Cellar/icu4c/74.2/lib/libicui18n.73.dylib' (no such file), '/opt/homebrew/Cellar/icu4c/74.2/lib/libicui18n.73.dylib' (no such file) 14 | """ 15 | 16 | TestFixNode = question >> LLMRun() >> SubstringEvaluator("brew reinstall node") 17 | 18 | 19 | if __name__ == "__main__": 20 | print(run_test(TestFixNode)) 21 | -------------------------------------------------------------------------------- /tests/bash_renamer.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write a bash script that renames files with a specific pattern." 4 | 5 | TAGS = ['code', 'bash'] 6 | 7 | question = """ 8 | Write a bash script to rename all files of the form "merged_*" to "finished_*". I want to call it by running bash rename.sh /path/to/files. 9 | """ 10 | 11 | def setup_fn(): 12 | import os 13 | import time 14 | os.mkdir("./foo") 15 | time.sleep(.5) 16 | 17 | for i in range(10): 18 | with open("./foo/merged_{}".format(i), "w") as f: 19 | f.write("a"*i) 20 | with open("./foo/{}".format(i), "w") as f: 21 | f.write("a"*i) 22 | 23 | time.sleep(.5) 24 | 25 | def check_ok(): 26 | import os 27 | return sorted(os.listdir("foo")) == sorted(["finished_{}".format(i) for i in range(10)] + [str(i) for i in range(10)]) 28 | 29 | 30 | 31 | TestBashRenamer = Setup(setup_fn) >> question >> LLMRun() >> ExtractCode() >> BashRun(args=["foo/"]) >> PyEvaluator(check_ok) 32 | 33 | if __name__ == "__main__": 34 | print(run_test(TestBashRenamer)) 35 | 36 | -------------------------------------------------------------------------------- /tests/draw_flag_bmp.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write a C program that draws an image. This test requires the ability to understand the .bmp specification, and draw a flag that can be correctly parsed and seen by the evaluator." 4 | 5 | TAGS = ['code', 'c', 'visual'] 6 | 7 | question = """ 8 | Write me a C program that emits a .bmp that's a depiction of an american flag to stdout. 
9 | """ 10 | 11 | TestFlagDrawBMP = question >> LLMRun() >> ExtractCode(keep_main=True) >> CRun(out_bytes=True) >> LLMVisionRun("What flag is shown in this image?") >> (SubstringEvaluator("United States") | SubstringEvaluator("USA") | SubstringEvaluator("America")) 12 | 13 | 14 | question2 = """ 15 | Write me a C program that emits a .bmp that's a depiction of an italian flag to stdout. 16 | """ 17 | 18 | TestEasyFlagDrawBMP = question2 >> LLMRun() >> ExtractCode(keep_main=True) >> CRun(out_bytes=True) >> LLMVisionRun("What flag is shown in this image?") >> (SubstringEvaluator("Italy") | SubstringEvaluator("Italian")) 19 | 20 | 21 | if __name__ == "__main__": 22 | print(run_test(TestFlagDrawBMP)) 23 | -------------------------------------------------------------------------------- /tests/baking_help.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can identify a missing incredient in a recipe. Identifying incorrect steps is much harder than missing steps." 4 | 5 | TAGS = ['explain', 'fun'] 6 | 7 | question = """ 8 | I'm making pancakes for breakfast. I added a cup of flour, a teaspoon of salt, and a few tablespoons of sugar to a bowl. I stirred it together, then added a cup of milk, a beaten egg, and a few tablespoons of oil, and stirred until just mixed. Then I put 1/4 a cup on a hot frying pan, and flipped it when brown. But they're terrible! Why? List the main reason. 9 | """ 10 | 11 | evaluation = """ 12 | Below is a student's answer to a question on what is missing from some pancake recipe: 13 | 14 | Does the student mention that baking powder or baking soda is missing? Think out loud, and then end your answer with either the string "The student passes" or "The student fails". 15 | """ 16 | 17 | TestMissingStep = question >> LLMRun() >> LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("student passes") 18 | 19 | 20 | if __name__ == "__main__": 21 | print(run_test(TestMissingStep)) 22 | -------------------------------------------------------------------------------- /tests/explain_code_prime.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can interpret a minified JavaScript function and explain its function." 4 | 5 | TAGS = ['code', 'explain'] 6 | 7 | question = """ 8 | What does this code do 9 | 10 | function q(e){if(e<=1)return!1;if(e<=3)return!0;if(e%2==0||e%3==0)return!1;for(let r=5;r*r<=e;r+=6)if(e%r==0||e%(r+2)==0)return!1;return!0}function z(){const e=[];let r=2;for(;e.length<20;)q(r)&&e.push(r),r++;return e.reverse(),e.reverse(),e}console.log(z()); 11 | """ 12 | 13 | TestExplainPrime = question >> LLMRun() >> LLMRun("I showed a student some code that calculates 20 primes. I then tested their understanding by asking them to explain it to me. I'm going to show you their answer, I want you to evaluate whether or not their answer says something to the effect of \"it calculates the first 20 primes\". Do not judge their reasonining, just tell me if they got it right. 
\n Student Answer: \"\"\n\n Think out loud then answer either \"The student passes\" or \"The student fails\".", llm=EVAL_LLM) >> SubstringEvaluator("student passes") 14 | 15 | if __name__ == "__main__": 16 | print(run_test(TestExplainPrime)) 17 | -------------------------------------------------------------------------------- /tests/easy_parser_generator.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can understand a vague BNF-style grammar and write a Python function that evaluates expressions based on the grammar rules." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Given this BNF-style grammar write a Python function `evaluate(str)` that evaluates the expression 9 | 10 | <expression> ::= <term> | <expression> '+' <term> | <expression> '-' <term> 11 | 12 | <term> ::= <factor> | <term> '*' <factor> | <term> '/' <factor> 13 | 14 | <factor> ::= <number> | '(' <expression> ')' | <function> <factor> 15 | 16 | <function> ::= 'sin' | 'cos' 17 | 18 | <number> ::= <digit> | <number> <digit> 19 | 20 | <digit> ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' 21 | 22 | For example you can evaluate this 23 | 24 | sin(3 + 2 * 4) - cos(15) = -0.240302 25 | """ 26 | 27 | test_case, answer = make_python_test([("round(evaluate('sin(35+2*4)-cos(15)'), 5)", "-0.07209")]) 28 | 29 | TestSimpleBNF = question >> LLMRun() >> ExtractCode() >> PythonRun(test_case) >> SubstringEvaluator(answer) 30 | 31 | 32 | if __name__ == "__main__": 33 | print(run_test(TestSimpleBNF)) 34 | 35 | 36 | -------------------------------------------------------------------------------- /tests/bash_list_files_by_size_mod_ten.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can provide the correct bash command to list files in a directory and sort them by the least significant digit of their size." 4 | 5 | TAGS = ['bash'] 6 | 7 | question = 'In bash how do I list all files in foo/ but sort the files by the least significant digit of their size so 2820 comes first and then 281771 and finally 2279. I want just the file names e.g., file1 otherfile lastfile.' 8 | 9 | 10 | def setup_fn(): 11 | sizes = [921, 714, 120, 637, 366, 662, 305, 403, 49, 158] 12 | 13 | import os 14 | import time 15 | os.mkdir("foo") 16 | time.sleep(.5) 17 | 18 | for i, size in enumerate(sizes): 19 | with open("foo/{}".format(i), "w") as f: 20 | f.write("a"*size) 21 | 22 | time.sleep(.5) 23 | 24 | def fix_whitespace(x): 25 | return " ".join(x.split()) 26 | 27 | 28 | TestBashListSize = Setup(setup_fn) >> question >> LLMRun() >> ExtractCode(keep_main=True, lang="bash") >> BashRun() >> PyFunc(fix_whitespace) >> RegexEvaluator("2.*0.*5.*7.*1.*6.*4.*3.*9.*8") 29 | 30 | 31 | if __name__ == "__main__": 32 | print(run_test(TestBashListSize)) 33 | 34 | -------------------------------------------------------------------------------- /tests/numpy_advanced_index.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model correctly understands how advanced indexing works in numpy." 
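# For reference, a minimal sketch of the behavior being tested (an assumed
# illustration for readers of this file; the evaluator below only checks the
# model's textual answer). Mixing a basic slice with an advanced (list) index
# moves the advanced axis to the front of the result, which is why the two
# shapes below differ.
def _reference_advanced_indexing_shapes():
    import numpy as np
    array = np.zeros((10, 20, 30))
    idxs = [1, 2, 3]
    # Slice combined with an advanced index: the advanced axis comes first.
    assert array[0, :, idxs].shape == (3, 20)
    # Two separate indexing steps: the natural axis order is kept.
    assert array[0][:, idxs].shape == (20, 3)
    return True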
4 | 5 | TAGS = ['explain', 'python'] 6 | 7 | question = """ 8 | What will this program print: 9 | 10 | ```python 11 | import numpy as np 12 | 13 | array = np.zeros((10, 20, 30)) 14 | idxs = [1, 2, 3] 15 | 16 | print("The array shape is", array[0, :, idxs].shape) 17 | ``` 18 | """ 19 | 20 | TestNumpyAdvancedIndex = question >> LLMRun() >> SubstringEvaluator("The array shape is (3, 20)") 21 | 22 | question = """ 23 | Consider this program 24 | 25 | ```python 26 | import numpy as np 27 | 28 | array = np.zeros((10, 20, 30)) 29 | idxs = [1, 2, 3] 30 | 31 | print("The first array shape is", array[0][:, idxs].shape) 32 | print("The second array shape is", array[0, :, idxs].shape) 33 | ``` 34 | 35 | Tell me what this program will print, and then explain why the two print statements don't print the same thing 36 | """ 37 | 38 | TestNumpyAdvancedIndexEasier = question >> LLMRun() >> (SubstringEvaluator("(3, 20)") & SubstringEvaluator("(20, 3)") & SubstringEvaluator("dvanced indexing")) 39 | 40 | if __name__ == "__main__": 41 | print(run_test(TestNumpyAdvancedIndexEasier)) 42 | -------------------------------------------------------------------------------- /tests/make_sqlite_table.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate a SQL query to create a database table." 4 | 5 | TAGS = ['sql'] 6 | 7 | question = """ 8 | I'm working with a sqlite3 database. Write a query that creates a new database with a table for people's name, job, and age. Then add a researcher named Nicholas who is 5. Write this directly as a sql query that I can pipe into sqlite3. Do not give the 'sqlite' command, I'll just do `sqlite3 database.db < [your output]`. 9 | """ 10 | 11 | def myfn(): 12 | open("/tmp/query.sql", "w").write("SELECT * FROM people;") 13 | import os 14 | out = os.popen("sqlite3 -init /tmp/query.sql database.db .exit").read() 15 | return "Nicholas" in out and "5" in out and "research" in out.lower() 16 | 17 | 18 | TestSqlMakeTable = question >> LLMRun() >> ExtractCode(manual="I'm going to run `sqlite3 database.db < /tmp/query.sql`. Given the following answer tell me exactly what to put into `query.sql`. DO NOT MODIFY THE CODE OR WRITE NEW CODE.\n") >> Echo() >> SQLRun() >> PyEvaluator(myfn) 19 | 20 | 21 | if __name__ == "__main__": 22 | print(run_test(TestSqlMakeTable)) 23 | -------------------------------------------------------------------------------- /tests/python_traceback.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can identify the bug and fix a program that handles python tracebacks. Useful to know if the model can handle more advanced python libraries." 4 | 5 | TAGS = ['code', 'fix'] 6 | 7 | question = """ 8 | Fix this program. Make sure to keep the format "var: value". 
9 | 10 | import traceback 11 | 12 | def foo(): 13 | y = 6 14 | raise Exception("This is a test exception") 15 | 16 | def crashes(): 17 | x = 5 18 | foo() 19 | 20 | 21 | if __name__ == "__main__": 22 | try: 23 | crashes() 24 | except: 25 | # extract stack frames 26 | stack = traceback.extract_stack() 27 | # print all locals in each stack frame 28 | for frame in stack: 29 | for k,v in frame.locals: 30 | print(k + ": " + v) 31 | """ 32 | 33 | TestProgramTB = question >> LLMRun() >> ExtractCode(keep_main=True) >> PythonRun("") >> (SubstringEvaluator("x: 5") & SubstringEvaluator("y: 6")) 34 | 35 | if __name__ == "__main__": 36 | print(run_test(TestProgramTB)) 37 | -------------------------------------------------------------------------------- /tests/bash_convert_not_overwrite.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can run an incremental update of a bash command without overwriting files that already exist" 4 | 5 | TAGS = ['bash'] 6 | 7 | question = 'Run find . -name "*.pdf" -exec python3 process.py {} txt/{}.txt \; but don\'t rewrite files that already exist. Instead of .pdf.txt make it just .txt' 8 | 9 | 10 | def setup_fn(): 11 | open("process.py", "w").write("import sys\nopen(sys.argv[2], 'w').write(open(sys.argv[1]).read())\n") 12 | open("a.pdf", "w").write("hello1") 13 | open("b.pdf", "w").write("hello2") 14 | open("c.pdf", "w").write("hello3") 15 | import os 16 | os.mkdir("txt") 17 | open("txt/a.txt", "w").write("done1") 18 | open("txt/c.txt", "w").write("done3") 19 | 20 | 21 | def check_ok(): 22 | import os 23 | return open("txt/a.txt").read() == "done1" and open("txt/b.txt").read() == "hello2" and open("txt/c.txt").read() == "done3" and len(os.listdir("txt")) == 3 24 | 25 | 26 | 27 | TestBashIncrementalUpdate = Setup(setup_fn) >> question >> LLMRun() >> ExtractCode(keep_main=True) >> BashRun() >> PyEvaluator(check_ok) 28 | 29 | 30 | if __name__ == "__main__": 31 | print(run_test(TestBashIncrementalUpdate)) 32 | 33 | 34 | -------------------------------------------------------------------------------- /tests/torch_to_jnp.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can convert a torch neural network to a jax numpy model." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Convert the following PyTorch code to pure JAX. Do not use any other libraries. 
9 | 10 | ```python 11 | import torch 12 | 13 | class MLP(torch.nn.Module): 14 | def __init__(self, input_size, hidden_size, output_size): 15 | super().__init__() 16 | self.linear1 = torch.nn.Linear(input_size, hidden_size) 17 | self.linear2 = torch.nn.Linear(hidden_size, output_size) 18 | 19 | def __call__(self, x): 20 | x = self.linear1(x) 21 | x = torch.relu(x) 22 | x = self.linear2(x) 23 | return torch.nn.functional.softmax(x, dim=1) 24 | 25 | model = MLP(10, 20, 30) 26 | x = torch.randn(5, 10) 27 | y = model(x) 28 | print(list(y.flatten().numpy().sum(1))) 29 | ``` 30 | """ 31 | 32 | TestTorchJnp = question >> LLMRun() >> ExtractCode(keep_main=True) >> \ 33 | ((~SubstringEvaluator("import torch")) & 34 | (PythonRun() >> (SubstringEvaluator("1.0,") | SubstringEvaluator("1.00000") | SubstringEvaluator("1.0 ") | SubstringEvaluator("0.99999")))) 35 | 36 | 37 | 38 | if __name__ == "__main__": 39 | print(run_test(TestTorchJnp)) 40 | -------------------------------------------------------------------------------- /tests/fix_tokenizer.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can identify and fix an issue with a tokenizer in a Python code snippet. Identifying that the problem is in the regex, and fixing the regex, are both hard." 4 | 5 | TAGS = ['code', 'fix', 'python'] 6 | 7 | question = """ 8 | Fix this tokenizer keeping it as similar as possible 9 | 10 | class Tokenizer: 11 | def __init__(self, input_str): 12 | self.tokens = re.findall(r'\d+|(\+|\-|\*|/|sin|cos|\(|\))', input_str) 13 | self.position = 0 14 | 15 | def get_next_token(self): 16 | if self.position >= len(self.tokens): 17 | return None 18 | token = self.tokens[self.position] 19 | self.position += 1 20 | return token 21 | 22 | def peek_next_token(self): 23 | if self.position >= len(self.tokens): 24 | return None 25 | return self.tokens[self.position] 26 | """ 27 | 28 | test_case, answer = make_python_test([("Tokenizer('sin(3+2*4)-cos(15)').tokens", "['sin', '(', '3', '+', '2', '*', '4', ')', '-', 'cos', '(', '15', ')']")]) 29 | 30 | 31 | TestSimpleFix = question >> LLMRun() >> ExtractCode() >> PythonRun(test_case) >> SubstringEvaluator(answer) 32 | 33 | 34 | if __name__ == "__main__": 35 | print(run_test(TestSimpleFix)) 36 | 37 | 38 | -------------------------------------------------------------------------------- /tests/unholy_matrix.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can solve a rather hard dynamic programming problem" 4 | 5 | TAGS = ['code', 'c'] 6 | 7 | question = "A binary 8x8 matrix is 'unholy' if there are no isolated 1s. A 1 is isolated if there isn't another 1 in any of the 4 direct cardinal directions. Write a C program that counts the total number of unholy 8x8 matrices." 8 | 9 | 10 | step = """To solve this question: 11 | - Use bit arithmetic and an unsigned long to represent the matrix. 12 | - Write a function is_unholy that tests if a matrix is unholy by bit-shifting and masking. 13 | 14 | Use a dynamic program to count the number of unholy matrices. 15 | - Recurse on the number of rows the matrix has. 16 | - Enumerate all 2^16 next two rows. This is enough to completely isolate the prior solutions from the next. 
17 | 18 | Write out a plan for the program, and then implement the plan in C.""" 19 | 20 | answer = "1121231537486377866" 21 | 22 | TestUnholyMatrix = question >> LLMRun() >> ExtractCode(keep_main=True) >> CRun() >> SubstringEvaluator(answer) 23 | TestUnholyMatrixStep = (question + step) >> LLMRun() >> ExtractCode(keep_main=True) >> CRun() >> SubstringEvaluator(answer) 24 | 25 | 26 | if __name__ == "__main__": 27 | print(run_test(TestUnholyMatrix)) 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /tests/change_filetype.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can make a PNG get detected as an ELF executable." 4 | 5 | TAGS = ["coding"] 6 | 7 | question = """ 8 | I have a file called `image.png`. I want to modify it so that 9 | (1) it still loads correctly as an image file with the same contents with PIL, but 10 | (2) if I run `file image.png`, it should say that it is an ELF executable. 11 | 12 | Write me a python program that when run does this; modify the image in-place. 13 | """ 14 | 15 | 16 | def test_ok(): 17 | import os 18 | if 'ELF' not in os.popen("file image.png").read(): 19 | return False 20 | from PIL import Image 21 | import numpy as np; img = np.array(Image.open("image.png"))  # PIL images have no .numpy() method 22 | 23 | return img[0, 0].tolist() == [255, 0, 0] and img[0, 1].tolist() == [0, 255, 0] and img[0, 2].tolist() == [0, 0, 255] and img.shape == (100, 100, 3) 24 | 25 | 26 | def setup_image(): 27 | from PIL import Image 28 | import numpy as np 29 | img = np.zeros((100, 100, 3), dtype=np.uint8) 30 | img[0, 0] = [255, 0, 0] 31 | img[0, 1] = [0, 255, 0] 32 | img[0, 2] = [0, 0, 255] 33 | Image.fromarray(img).save("image.png") 34 | 35 | 36 | TestMakePNGToELF = Setup(setup_image) >> question >> LLMRun(question) >> ExtractCode() >> PythonRun() >> PyEvaluator(test_ok) 37 | 38 | if __name__ == "__main__": 39 | print(run_test(TestMakePNGToELF)) 40 | -------------------------------------------------------------------------------- /tests/fix_append_vs_extend.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test checks if the model can figure out from context when it's right to use extend versus append." 4 | 5 | TAGS = ['explain', 'python'] 6 | 7 | question = """ 8 | Why won't this function work as expected? List exactly one reason. 9 | 10 | ``` 11 | def make_op_with_history(config): 12 | how = config[0] 13 | args = config[1:] 14 | 15 | if how.split()[0] == 'dot_from_papers': 16 | return [(how, make_op(config))] 17 | if how.split()[0] in ['const_add', 'const_mul']: 18 | return [(how, make_op(config))] + make_op_with_history(config[1]) 19 | out = [] 20 | for arg in args: 21 | hist = make_op_with_history(arg) 22 | out.append(hist) 23 | return [(how, make_op(config))] + out 24 | ``` 25 | """ 26 | 27 | judge = "I asked a question about why some code was incorrect. The answer is \"because the code uses append instead of extend\". Does this student answer say that? Giving multiple answers is okay as long as it also says somewhere that the code should replace the append with extend.\n\n\n\nAnswer either 'The student passes' or 'The student fails'." 
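# A hedged reference sketch (assumed, for documentation only; the LLM judge
# above is what actually grades answers): list.append adds its argument as a
# single nested element, while list.extend splices the argument's items in,
# which is the one-word fix the question is looking for.
def _reference_append_vs_extend():
    nested = []
    nested.append([('how', 'op')])   # -> [[('how', 'op')]]  nested history (the bug)
    flat = []
    flat.extend([('how', 'op')])     # -> [('how', 'op')]    flattened history (the fix)
    return nested, flat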
28 | 29 | TestAppendNotExtend = question >> LLMRun() >> (LLMRun(judge, llm=EVAL_LLM) >> SubstringEvaluator("student passes")) 30 | 31 | if __name__ == "__main__": 32 | print(run_test(TestAppendNotExtend)) 33 | 34 | 35 | -------------------------------------------------------------------------------- /tests/make_tree_from_text.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | 4 | DESCRIPTION = "Test if the model can create a tree from a string." 5 | 6 | TAGS = ['code', 'python'] 7 | 8 | code = """ 9 | class Node: 10 | def __init__(self, name, parent=None): 11 | self.name = name 12 | self.parent = parent 13 | self.children = [] 14 | if parent: 15 | self.parent.children.append(self) 16 | def __repr__(self): 17 | if self.children == []: 18 | return self.name 19 | return '('+self.name + ' ' + ' '.join(repr(c) for c in self.children)+')' 20 | """ 21 | 22 | question = code + """ 23 | 24 | Write a complete python file that will print the tree 25 | 26 | (1 (2 3) 4) 27 | 28 | Call the root of the tree 'root'. 29 | Then print out str(root). 30 | DO NOT MODIFY THE Node CLASS. 31 | """ 32 | 33 | TestMakeTreeEasy = question >> LLMRun() >> ExtractCode(keep_main=True) >> PythonRun() >> SubstringEvaluator('(1 (2 3) 4)') 34 | 35 | question2 = code + """ 36 | 37 | Write a complete python file that will print the tree 38 | 39 | (1 (2 (3 4 5 6) (7 8)) (9 10 (11 12))) 40 | 41 | Call the root of the tree 'root'. 42 | Then print out str(root). 43 | """ 44 | 45 | TestMakeTree = question2 >> LLMRun() >> ExtractCode(keep_main=True) >> PythonRun() >> SubstringEvaluator('(1 (2 (3 4 5 6) (7 8)) (9 10 (11 12)))') 46 | 47 | 48 | if __name__ == "__main__": 49 | print(run_test(TestMakeTreeEasy)) 50 | -------------------------------------------------------------------------------- /llms/gemini_model.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | class GeminiModel: 5 | def __init__(self, name): 6 | self.name = name 7 | 8 | config = json.load(open("config.json")) 9 | self.api_key = config['llms']['google']['api_key'].strip() 10 | 11 | self.hparams = config['hparams'] 12 | self.hparams.update(config['llms']['google'].get('hparams') or {}) 13 | 14 | def make_request(self, conversation, max_tokens=None, add_image=None): 15 | url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.name}:generateContent?key={self.api_key}" 16 | 17 | contents = [] 18 | for i, content in enumerate(conversation): 19 | role = "user" if i % 2 == 0 else "model" 20 | contents.append({"role": role, "parts": [{"text": content}]}) 21 | 22 | data = { 23 | "contents": contents, 24 | "generationConfig": self.hparams 25 | } 26 | 27 | headers = {'Content-Type': 'application/json'} 28 | response = requests.post(url, headers=headers, json=data) 29 | out = response.json() 30 | print(out) 31 | return out['candidates'][0]['content']['parts'][0]['text'] 32 | 33 | 34 | if __name__ == "__main__": 35 | q = "Why?" 36 | model = GeminiModel("gemini-1.5-pro-latest") 37 | response = model.make_request(["I think 4 is even.", "It is!", "Why?"]) 38 | print(response) 39 | 40 | -------------------------------------------------------------------------------- /tests/fix_json.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can fix broken JSON objects." 
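# A naive reference sketch of the intended repairs (an assumption kept for
# documentation; it is not used by the test and deliberately ignores the
# quoted-string caveat that TestFixJSONHelp warns about below).
def _naive_fix_json_sketch(s):
    import json
    s = s.replace("(", "[").replace(")", "]")
    s = s.replace("'", '"')
    s = s.replace("True", "true").replace("False", "false")
    return json.loads(s)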
4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """Write me a python function called fix_json that fixes broken JSON objects, and returns the parsed object. I'll pass the JSON object as a string and it should fix: 8 | - Incorrect parentheses, e.g., {"a": (5, 4)} -> {"a": [5, 4]} 9 | - Incorrect single quotes, e.g., {'a': 5} -> {"a": 5} 10 | - False/True -> false/true 11 | """ 12 | 13 | help_err = """Make sure not to break data inside strings! So if there's a True or a ( inside a string keep them the way they are. Also don't break quoted strings""" 14 | 15 | test_case, answer = make_python_test([("""fix_json('''{"a": 5, "b": [3, 4]}''')""", """{"a": 5, "b": [3, 4]}"""), 16 | ("""fix_json('''{'a': 5.0, 'b': (True, 4)}''')""", """{"a": 5.0, "b": [True, 4]}"""), 17 | ("""fix_json('''{'True': False, 'b()': (3, 4), "c'": []}''')""", """{"True": False, "b()": [3, 4], "c'": []}"""), 18 | ]) 19 | 20 | TestFixJSON = question >> LLMRun() >> ExtractCode(keep_main=False) >> PythonRun(test_case) >> SubstringEvaluator(answer) 21 | TestFixJSONHelp = (question+help_err) >> LLMRun() >> ExtractCode(keep_main=False) >> PythonRun(test_case) >> SubstringEvaluator(answer) 22 | 23 | 24 | if __name__ == "__main__": 25 | print(run_test(TestFixJSONHelp)) 26 | 27 | -------------------------------------------------------------------------------- /llms/cohere_model.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from PIL import Image 3 | import base64 4 | 5 | import cohere 6 | import json 7 | 8 | class CohereModel: 9 | def __init__(self, name): 10 | config = json.load(open("config.json")) 11 | api_key = config['llms']['cohere']['api_key'].strip() 12 | self.client = cohere.Client(api_key) 13 | self.name = name 14 | self.hparams = config['hparams'] 15 | self.hparams.update(config['llms']['cohere'].get('hparams') or {}) 16 | 17 | def make_request(self, conversation, add_image=None, max_tokens=None): 18 | prior_messages = [{"role": "USER" if i%2 == 0 else "CHATBOT", "message": content} for i,content in enumerate(conversation[:-1])] 19 | 20 | kwargs = { 21 | "chat_history": prior_messages, 22 | "message": conversation[-1], 23 | "max_tokens": max_tokens, 24 | "model": self.name 25 | } 26 | kwargs.update(self.hparams) 27 | 28 | for k,v in list(kwargs.items()): 29 | if v is None: 30 | del kwargs[k] 31 | 32 | out = self.client.chat( 33 | prompt_truncation='AUTO', 34 | **kwargs 35 | ) 36 | 37 | return out.text 38 | 39 | if __name__ == "__main__": 40 | import sys 41 | #q = sys.stdin.read().strip() 42 | q = "what specific date?" 43 | print(q+":", CohereModel("command").make_request(["Who discovered relativity?", "Einstein.", q])) 44 | -------------------------------------------------------------------------------- /tests/webgl_triangle.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate an HTML file with WebGL code that draws an image." 4 | 5 | TAGS = ['code', 'visual', 'html'] 6 | 7 | question = """ 8 | Draw an image of a triangle using WebGL in a single HTML file. Make the canvas take the full page. 9 | """ 10 | 11 | TestDrawTriangle = question >> LLMRun() >> ExtractCode(keep_main=True, lang="html/javascript") >> (~SubstringEvaluator("THREE.") & (SeleniumDraw() >> LLMVisionRun("This is a screenshot of a webpage that draws something with webgl. I'm trying to see if it drew what I wanted. 
Describe what is drawn in this image?") >> LLMRun("Does this answer say that there is a picture of a triangle \"\"? Answer \"Yes it does\" or \"No it does not\".", llm=EVAL_LLM) >> SubstringEvaluator("es it does"))) 12 | 13 | question2 = """ 14 | Draw an image of a house with a chimney using WebGL in a single HTML file. Make the canvas take the full page. 15 | """ 16 | 17 | TestDrawHouse = question2 >> LLMRun() >> ExtractCode(keep_main=True, lang="html/javascript") >> (~SubstringEvaluator("THREE.") & (SeleniumDraw() >> LLMVisionRun("This is a screenshot of a webpage that draws something with webgl. I'm trying to see if it drew what I wanted. Describe what is drawn in this image?") >> LLMRun("Does this answer say that there is a picture of a house with a chimney \"\"? Answer \"Yes it does\" or \"No it does not\".", llm=EVAL_LLM) >> SubstringEvaluator("es it does"))) 18 | 19 | if __name__ == "__main__": 20 | print(run_test(TestDrawHouse)) 21 | -------------------------------------------------------------------------------- /tests/c_weird_expression.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case is meant to test if the model can correctly evaluate a complex C expression." 4 | 5 | TAGS = ['explain', 'c'] 6 | 7 | 8 | evaluation = """ 9 | Below is a student's answer to a question about what a particular C expression evaluates to: 10 | 11 | Does the student's final answer say that the expression evaluates to *x+2 (or even v+2 is okay)? 12 | 13 | Ignore their reasoning. Just look at the final answer, if they say **EXPLICITLY** that the answer is *x+2. Giving something that's equal but more complicated is wrong. 14 | 15 | If the answer is correct say "The student passes" otherwise "The student fails". 16 | """ 17 | 18 | def parse(x): 19 | return x.lower().split("final answer:")[-1] 20 | 21 | question = """ 22 | What does the c expression `-~++*x--` evaluate to in simplest form? Ignore side effects. Think out loud then give a final summary. Write "Final answer:" and then give your answer. 23 | """ 24 | 25 | TestExplainWeirdCEasy = question >> LLMRun() >> PyFunc(parse) >> LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("student passes") 26 | 27 | question_harder = """ 28 | What does the c expression `-~++*x--` evaluate to in simplest form? Ignore side effects. Write "Final answer:" and then give your answer. 29 | """ 30 | 31 | TestExplainWeirdC = question_harder >> LLMRun() >> PyFunc(parse) >> LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("student passes") 32 | 33 | 34 | if __name__ == "__main__": 35 | print(run_test(TestExplainWeirdCEasy)) 36 | -------------------------------------------------------------------------------- /tests/date_news_headlines.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can predict the date a few news headlines were published." 4 | 5 | TAGS = ['fun'] 6 | 7 | question = """ 8 | What date was this the front page of HN? Format it YYYY-MM-DD. 9 | 10 | 11 | 1. 12 | We Made One Gram Of Remdesivir (acsh.org) 13 | 709 points by tomstokes on [date] | 231 comments 14 | 2. 15 | Crafting “Crafting Interpreters” (stuffwithstuff.com) 16 | 777 points by _vbdg on [date] | 75 comments 17 | 3. 18 | Bose QC 35 Firmware 4.5.2 Noise Cancellation Investigation Report (bose.com) 19 | 640 points by robbiet480 on [date] | 323 comments 20 | 4. 
21 | Csound: A sound and music computing system (csound.com) 22 | 226 points by diaphanous on [date] | 92 comments 23 | 5. 24 | New Jersey needs COBOL programmers for their unemployment claims system (twitter.com/manicode) 25 | 447 points by enraged_camel on [date] | 297 comments 26 | 6. 27 | All models are wrong, but some are completely wrong (rssdss.design.blog) 28 | 305 points by magoghm on [date] | 208 comments 29 | 7. 30 | Configs suck? Try a real programming language (beepb00p.xyz) 31 | 289 points by gyre007 on [date] | 345 comments 32 | 8. 33 | Ilo sitelen, a handmade computer for Toki Pona (increpare.com) 34 | 204 points by tobr on [date] | 90 comments 35 | 9. 36 | The Svelte Compiler Handbook (lihautan.com) 37 | 330 points by PKop on [date] | 136 comments 38 | 10. 39 | Show HN: Export HN Favorites to a CSV File 40 | 240 points by gabrielsroka on [date] | 39 comments 41 | """ 42 | 43 | TestDateNewsHeadlines = question >> LLMRun() >> SubstringEvaluator("2020-04-05") 44 | 45 | if __name__ == "__main__": 46 | print(run_test(TestDateNewsHeadlines)) 47 | 48 | -------------------------------------------------------------------------------- /tests/flexbox_webpage.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate an HTML file using flexbox" 4 | 5 | TAGS = ['code', 'html'] 6 | 7 | question = """ 8 | Generate an HTML file that uses flexbox to create a webpage with a header 200px at the top (colored red), and the rest of the body split 100px for a left sidebar (in blue), 100px for a right sidebar (in green), and the rest of the page in the middle (in white). 9 | """ 10 | 11 | def check(x): 12 | import io 13 | from PIL import Image 14 | import numpy as np 15 | img = Image.open(io.BytesIO(x)) 16 | 17 | np_img = np.array(img)[:,:,:3] 18 | while np.mean(np_img[0]) >= 254: 19 | np_img = np_img[1:] 20 | while np.mean(np_img[:, 0]) >= 254: 21 | np_img = np_img[:, 1:] 22 | 23 | if np_img[0:200].std((0,1)).mean() > 20: 24 | return False 25 | 26 | if np_img[400:, :100].std((0,1)).mean() > 20: 27 | return False 28 | 29 | if np_img[400:, -100:-30].std((0,1)).mean() > 20: 30 | return False 31 | 32 | if np_img[0:200].mean((0,1)).argmax() != 0: 33 | return False 34 | 35 | if np_img[400:, :100].mean((0,1)).argmax() != 2: 36 | return False 37 | 38 | if np_img[400:, -100:].mean((0,1)).argmax() != 1: 39 | return False 40 | 41 | 42 | if np_img[:800, :100].std((0,1)).mean() < 20: 43 | return False 44 | 45 | if np_img[:800, -100:].std((0,1)).mean() < 20: 46 | return False 47 | 48 | return True 49 | 50 | 51 | 52 | TestFlexbox = question >> LLMRun() >> ExtractCode(keep_main=True, lang="html/javascript") >> SeleniumDraw() >> PyFunc(check) 53 | 54 | if __name__ == "__main__": 55 | print(run_test(TestFlexbox)) 56 | 57 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Ubuntu as a parent image 2 | FROM ubuntu:latest 3 | 4 | # Update the system and install dependencies 5 | RUN apt-get update && apt-get install -y \ 6 | software-properties-common 7 | 8 | # Add the deadsnakes PPA, which contains newer Python versions 9 | RUN add-apt-repository ppa:deadsnakes/ppa 10 | 11 | # Update the system and install Python and build dependencies 12 | RUN apt-get update && apt-get install -y \ 13 | software-properties-common \ 14 | build-essential \ 15 | gcc \ 16 | curl \ 17 | 
sqlite3 \ 18 | gdb \ 19 | libssl-dev 20 | 21 | ENV DEBIAN_FRONTEND noninteractive 22 | 23 | # Install Python 3.12 24 | RUN apt-get update && apt-get install -y \ 25 | python3.12 \ 26 | python3.12-venv \ 27 | python3.12-dev \ 28 | git 29 | 30 | # Remove the EXTERNALLY-MANAGED file to allow pip installations 31 | RUN rm -f /usr/lib/python3.12/EXTERNALLY-MANAGED 32 | 33 | # Download and install pip using the get-pip.py script 34 | RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ 35 | python3.12 get-pip.py && \ 36 | rm get-pip.py 37 | 38 | # Install required Python packages using pip 39 | RUN pip install numpy scipy numba Pillow jax jaxlib python-chess torch 40 | 41 | # Install Rust 42 | RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 43 | 44 | # Ensure Rust binaries are in PATH 45 | ENV PATH="/root/.cargo/bin:${PATH}" 46 | 47 | # Create a symlink for python3 48 | RUN ln -s /usr/bin/python3 /usr/bin/python 49 | 50 | # Set the working directory in the container 51 | WORKDIR /usr/src/app 52 | 53 | # Any additional commands or environment variables can be added here 54 | 55 | # Command to run when the container launches 56 | CMD ["/bin/bash"] 57 | -------------------------------------------------------------------------------- /tests/basic_git_setup.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can guide a user in a conversation to setup a git repo." 4 | 5 | TAGS = ['bash', "git", "agent"] 6 | 7 | question = """ 8 | You are in a repository with two files `a` and `b`. Make a new git repo and then commit them both. 9 | 10 | On each round you should just give me just one command to type, I'll run it, and give you the exact output. I'll get started: 11 | 12 | INPUT: `pwd` 13 | 14 | OUTPUT: `/usr/src/app` 15 | 16 | INPUT: `ls` 17 | 18 | OUTPUT: `main.py a b` 19 | 20 | Now your turn. What do you want to run? Do not explain yourself, just give me exactly one command. Start your response with INPUT:""" 21 | 22 | 23 | def test_ok(): 24 | import os 25 | success = os.popen("git rev-list --all --count").read().strip() == "1" 26 | return success 27 | 28 | def setup(): 29 | import os 30 | open("a", "w").write("a") 31 | open("b", "w").write("b") 32 | os.popen('git config --global user.email "you@example.com"').read() 33 | os.popen('git config --global user.name "Your Name"').read() 34 | 35 | def do_extract(x): 36 | if 'INPUT:' in x: 37 | x = x.split("INPUT:")[1] 38 | try: 39 | to_send = x.split("`")[1] 40 | except: 41 | to_send = x.split("\n")[0] 42 | return to_send 43 | 44 | def do_prepare(x): 45 | print("Preparing to pass back", x) 46 | return f"OUTPUT: `{x}`.\n\nWhat command should I run next? Start your response with INPUT:" 47 | 48 | TestGitSimple = Setup(setup) >> question >> UntilDone(PyEvaluator(test_ok), (LLMConversation() >> PyFunc(do_extract) >> TerminalRun() >> PyFunc(do_prepare)), max_iters=4) >> PyEvaluator(test_ok) 49 | 50 | 51 | if __name__ == "__main__": 52 | print(run_test(TestGitSimple)) 53 | -------------------------------------------------------------------------------- /tests/jnp_nn_bugfix.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can identify and fix a bug in a given jax.numpy function." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Fix the bug in this jnp function. Write out the full correct version of the code. 
9 | 10 | import jax 11 | import jax.numpy as jnp 12 | 13 | def init_mlp_params(input_size, hidden_size, output_size, key): 14 | k1, k2 = jax.random.split(key) 15 | # Initialize weights and biases for the first linear layer 16 | w1 = jax.random.normal(k1, (hidden_size, input_size)) * jnp.sqrt(2.0 / input_size) 17 | b1 = jnp.zeros(hidden_size) 18 | # Initialize weights and biases for the second linear layer 19 | w2 = jax.random.normal(k2, (output_size, hidden_size)) * jnp.sqrt(2.0 / hidden_size) 20 | b2 = jnp.zeros(output_size) 21 | return {'w1': w1, 'b1': b1, 'w2': w2, 'b2': b2} 22 | 23 | def mlp_apply(params, x): 24 | # First linear layer 25 | x = jnp.dot(x, params['w1'].T) + params['b1'] 26 | # Relu activation 27 | x = jax.nn.relu(x) 28 | # Second linear layer 29 | x = jnp.dot(x, params['w2'].T) + params['b2'] 30 | # Softmax activation 31 | return jax.nn.softmax(x, axis=1) 32 | 33 | # Example usage: 34 | key = jax.random.PRNGKey(0) 35 | params = init_mlp_params(10, 20, 30, key) 36 | x = jax.random.normal(key, (5, 10)) # random input similar to torch.randn 37 | y = mlp_apply(params, x) 38 | print(list(y.flatten().sum(1))) # The output sum should be very close to [1, 1, 1, 1, 1] 39 | """ 40 | 41 | TestFixJnpBug = question >> LLMRun() >> ExtractCode(keep_main=True) >> \ 42 | (PythonRun() >> (SubstringEvaluator("1.0,") | SubstringEvaluator("1.00000") | SubstringEvaluator("1.0 ") | SubstringEvaluator("0.99999"))) 43 | 44 | 45 | 46 | if __name__ == "__main__": 47 | print(run_test(TestFixJnpBug)) 48 | -------------------------------------------------------------------------------- /tests/why_broken_flask_extra_brace.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test checks is the model can figure out the user has put an accidental extra brace in the request body." 4 | 5 | TAGS = ['explain', 'python'] 6 | 7 | question = """ 8 | Why isn't this working 9 | 10 | ``` 11 | @app.route('/getsimilar', methods=['GET']) 12 | def getsimilar(): 13 | print('args',request.args) 14 | positive = request.args.get('positive', '') 15 | positive = positive.split("##") 16 | 17 | negative = request.args.get('negative', '') 18 | negative = negative.split("##") 19 | 20 | keep_yes = [i for i,x in enumerate(titles) if x in positive] 21 | set1_yes = emb[keep_yes] 22 | 23 | keep_no = [i for i,x in enumerate(titles) if x in negative] 24 | set1_no = emb[keep_no] 25 | 26 | print("OK", len(set1_yes), len(set1_no)) 27 | ``` 28 | 29 | I see 30 | 31 | ``` 32 | args ImmutableMultiDict([('positive', 'Title: Towards Evaluating the Robustness of Neural Networks\nAuthors: Nicholas Carlini; David Wagner'), ('negative', 'Title: On Evaluating Adversarial Robustness Authors: Nicholas Carlini; Anish Athalye; Nicolas Papernot; Wieland Brendel; Jonas Rauber; Dimitris Tsipras; Ian Goodfellow; Aleksander Madry; Alexey Kurakin}')]) 33 | OK 1 0 34 | ``` 35 | 36 | Why don't I see OK 1 1? Instead it just says OK 1 0. List exactly one reason explaining what's wrong and what one change would fix it. 37 | """ 38 | 39 | judge = "I asked a question about why some code was incorrect. The answer is \"because the input contains an extra curly brace }\". Does this student answer say that?\n\n\n\nAnswer either 'The student passes' or 'The student fails'." 
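# A hedged reference sketch of the root cause (assumed illustration only; the
# LLM judge above decides whether an answer passes): the 'negative' query value
# ends with a stray '}' so it never string-matches the stored title, which is
# why keep_no stays empty and the log shows "OK 1 0".
def _reference_extra_brace_mismatch():
    stored_title = "... Alexey Kurakin"
    sent_title = "... Alexey Kurakin}"  # trailing brace from the malformed request
    return stored_title == sent_title   # False -> no match found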
40 | 41 | TestBrokenExtraBrace = question >> LLMRun() >> (LLMRun(judge, llm=EVAL_LLM) >> SubstringEvaluator("student passes")) 42 | 43 | if __name__ == "__main__": 44 | print(run_test(TestBrokenExtraBrace)) 45 | 46 | 47 | -------------------------------------------------------------------------------- /tests/faster_l2_diff.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can optimize a given Python program for speed and memory efficiency." 4 | 5 | TAGS = ['code', 'performance', 'python'] 6 | 7 | question = """ 8 | I have this program 9 | 10 | import numpy as np 11 | def diff(out, templates): 12 | return (out[:, None, :] - templates[None, :, :]).square().sum(2) 13 | 14 | Make it faster and more memory efficient without expanding the arrays. Call it `diff` still. 15 | """ 16 | 17 | tracer = """ 18 | import tracemalloc 19 | import numpy as np 20 | 21 | if __name__ == "__main__": 22 | tracemalloc.start() # Start tracing 23 | 24 | snapshot1 = tracemalloc.take_snapshot() # Before execution 25 | diff(np.arange(10000).reshape((100, 100)), np.arange(10000).reshape((100, 100))) 26 | snapshot2 = tracemalloc.take_snapshot() # After execution 27 | 28 | # Calculate the difference in memory usage 29 | current, peak = tracemalloc.get_traced_memory() 30 | tracemalloc.stop() # Stop tracing 31 | 32 | if peak < 10**6: 33 | print("Efficient") 34 | """ 35 | 36 | test_case, answer = make_python_test([("diff(np.arange(100).reshape((10, 10)), np.arange(100).reshape((10, 10))).sum()", "1650000")]) 37 | 38 | 39 | TestFastL2 = question >> LLMRun() >> ExtractCode() >> \ 40 | ((PythonRun(test_case) >> SubstringEvaluator(answer)) & 41 | (PythonRun(tracer) >> SubstringEvaluator("Efficient")) & 42 | (LLMRun("Below is a student's answer to a question on writing efficient code. They're supposed to be vectorizing a loop. Is the following code vectorized? It is incorrect to use for loops explicitly. Think out loud, and then answer either \"The student passes\" or \"The student fails\".\n", llm=EVAL_LLM) >> SubstringEvaluator("student passes"))) 43 | 44 | 45 | if __name__ == "__main__": 46 | print(run_test(TestFastL2)) 47 | -------------------------------------------------------------------------------- /tests/call_rust_from_python.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write rust code that can be imported from python and knows how to build it." 4 | 5 | TAGS = ['rust', 'c', 'python', 'code'] 6 | 7 | question = """I want to call some rust code from python on a linux machine. Specifically, I want to be able to run 8 | 9 | python3 -c 'import magic; print(magic.do_add(3, 4))' 10 | 11 | and have it give me the sum of these numbers. Here, do_add should be a rust function that adds two integers. 12 | 13 | Please give me a complete set of instructions for how to make this run, so that when I run this code as-is it will work. 14 | """ 15 | 16 | question2 = """ 17 | Now please convert this answer into a JSON array that packages it all up so I don't need to think and can just copy files to specific locations and then run things. Format your answer 18 | 19 | {"path/to/file": "contents of rust file", 20 | "path/to/python_file": "contents of any necessary python file that exposes do_add", 21 | "build.sh": "contents of a bash file to build the project", 22 | ...
# any other files necessary 23 | } 24 | 25 | 26 | The only thing I will do is put files in the right place and run `bash build.sh`. Do not give any explanation of the code. ONLY ANSWER WITH A SINGLE JSON OBJECT. Your answer should start ```json and end with ```. 27 | """ 28 | 29 | def is_ok(): 30 | import os 31 | os.popen("bash build.sh").read() 32 | 33 | return os.popen("python3 -c 'import magic; print(magic.do_add(31, 11))'").read() == "42\n" 34 | 35 | 36 | TestCallRustFromPy = question >> LLMConversation() >> LLMConversation(question2) >> ExtractJSON() >> MakeFilesFromJSON() >> PyEvaluator(is_ok) 37 | TestCallCFromPy = question.replace("rust", "C") >> LLMConversation() >> LLMConversation(question2.replace("rust", "C")) >> ExtractJSON() >> MakeFilesFromJSON() >> PyEvaluator(is_ok) 38 | 39 | 40 | if __name__ == "__main__": 41 | print(run_test(TestCallRustFromPy)) 42 | 43 | 44 | -------------------------------------------------------------------------------- /tests/convert_dp_to_iterative.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can understand a DP algorithm and then convert it into an iterative implementation." 4 | 5 | TAGS = ['code', 'performance', 'python'] 6 | 7 | my_code = """ 8 | @lru_cache(maxsize=None) 9 | def solve_dp(graph, lines_left, distance_since_last): 10 | if len(graph) < lines_left: return 1e9, [] 11 | if lines_left == 0 and len(graph) == 0: return 0, [] 12 | 13 | # option 1: don't take the line 14 | value = (1e9, []) 15 | if distance_since_last < 10: 16 | value = min(value, solve_dp(graph[1:], lines_left, distance_since_last+1)) 17 | 18 | # option 2: take the line 19 | if lines_left > 0: 20 | sub, how = solve_dp(graph[1:], lines_left-1, 0) 21 | sub += graph[0] + (distance_since_last-8)**2 22 | value = min(value, (sub, how + [len(graph)])) 23 | 24 | return value 25 | """ 26 | 27 | question = f""" 28 | Explain what this code does. Then, convert this recursive function to an iterative one, making explicit the cubic work that's being done. The functions should return exactly the same output but the iterative one should be faster. Call your function `solve_iterative` and match the type signature of the original function. 29 | 30 | {my_code} 31 | """ 32 | 33 | test_case, answer = make_python_test([(f"""0 34 | import random 35 | mylist = list(range(100)) 36 | random.shuffle(mylist) 37 | from functools import lru_cache 38 | 39 | {my_code.replace("solve_dp", "my_solve_dp")} 40 | answer = solve_iterative(tuple(mylist), len(mylist)//8, 4)""", 41 | "my_solve_dp(tuple(mylist), len(mylist)//8, 4)")]) 42 | 43 | def not_recursive(code): 44 | return code.count("solve_iterative") == 1, "" 45 | 46 | TestProgramRemoveDP = question >> LLMRun() >> ExtractCode() >> \ 47 | (PyFunc(not_recursive) & (PythonRun(test_case) >> SubstringEvaluator(answer))) 48 | 49 | if __name__ == "__main__": 50 | print(run_test(TestProgramRemoveDP)) 51 | -------------------------------------------------------------------------------- /tests/vectorize_small_update.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can replace a for loop with a vectorized version." 
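# One possible vectorized answer, kept as a hedged reference sketch (assumed;
# the evaluator below runs the model's code and an LLM judge, not this).
def _reference_vectorized_update(high, highest_number, delta):
    import numpy as np
    candidate = high[highest_number] + delta[highest_number] - delta
    out = np.minimum(high, candidate)
    out[highest_number] = high[highest_number]  # the pivot entry is left unchanged
    return out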
4 | 5 | TAGS = ['code', 'python', 'performance'] 6 | 7 | def update(high, highest_number, delta): 8 | for i in range(len(high)): 9 | if i != highest_number: 10 | high[i] = min(high[i], high[highest_number] + delta[highest_number] - delta[i]) 11 | return high 12 | 13 | question = """ 14 | Vectorize this update loop. Call the function `update`: 15 | 16 | import numpy as np 17 | 18 | def update(high, highest_number, delta): 19 | for i in range(len(high)): 20 | if i != highest_number: 21 | high[i] = min(high[i], high[highest_number] + delta[highest_number] - delta[i]) 22 | return high 23 | """ 24 | 25 | test_case, answer = make_python_test([("list(update(np.array([2, 5, 9, 1, 2, 0, 3, 8]), 4, np.array([1, 2, 1, 4, 3, 2, 1, 2])))", "[2, 3, 4, 1, 2, 0, 3, 3]")], header="import numpy as np") 26 | 27 | evaluation = """ 28 | Below is a student's answer to a question about how to vectorize this loop: 29 | 30 | def update(high, highest_number, delta): 31 | for i in range(len(high)): 32 | if i != highest_number: 33 | high[i] = min(high[i], high[highest_number] + delta[highest_number] - delta[i]) 34 | return high 35 | 36 | Here is their code: 37 | 38 | 39 | 40 | Did the student try to vectorize the loop? Don't tell me if they are correct or not. (I have tests for that separately which will catch any bugs.) Just tell me if they tried to vectorize the loop. End your answer with either the string "The student passes" if they tried to do a vectorized loop, or "The student fails" if they just wrote a for loop again. 41 | """ 42 | 43 | TestVectorizeSmall = question >> LLMRun() >> ExtractCode() >> ((PythonRun(test_case) >> SubstringEvaluator(answer)) & (LLMRun(evaluation) >> SubstringEvaluator("student passes"))) 44 | 45 | if __name__ == "__main__": 46 | print(run_test(TestVectorizeSmall)) 47 | -------------------------------------------------------------------------------- /tests/vague_sum_data.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can infer what data to sum and what to ignore by example with vague instructions." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = ''' 8 | Here's some data 9 | QUERY A361 10 | Found at: 160392127418 11 | Number of times present: 4 12 | 13 | 14 | Found at: 150158692866 15 | Number of times present: 3 16 | 17 | 18 | Found at: 122340445248 19 | Number of times present: 17 20 | 21 | 22 | Found at: 137248651778 23 | Number of times present: 84 24 | 25 | 26 | Found at: 131891981708 27 | Number of times present: 66 28 | 29 | 30 | QUERY 32B57 31 | Found at: 194561247604 32 | Number of times present: 4 33 | 34 | 35 | Found at: 82888490886 36 | Number of times present: 3 37 | 38 | 39 | Found at: 115836535414 40 | Number of times present: 19 41 | 42 | 43 | Found at: 79615924948 44 | Number of times present: 80 45 | 46 | Write a python function called `process` that will take data like this as an argument and return a dict where the keys are the like A361/32B57 and the values are how often its present in total. 
47 | ''' 48 | 49 | data = "QUERY A361\\nFound at: 160392127418\\nNumber of times present: 4\\n\\n\\nFound at: 150158692866\\nNumber of times present: 4\\n\\n\\nFound at: 122340445248\\nNumber of times present: 17\\n\\n\\nFound at: 137248651778\\nNumber of times present: 84\\n\\n\\nFound at: 131891981708\\nNumber of times present: 66\\n\\n\\nQUERY 32B57\\nFound at: 194561247604\\nNumber of times present: 4\\n\\n\\nFound at: 82888490886\\nNumber of times present: 3\\n\\n\\nFound at: 115836535414\\nNumber of times present: 19\\n\\n\\nFound at: 79615924948\\nNumber of times present: 81\\n" 50 | 51 | test_case, answer = make_python_test([(f"process('{data}')", "{'A361': 175, '32B57': 107}")]) 52 | 53 | TestSumSomeData = question >> LLMRun() >> ExtractCode(keep_main=True) >> PythonRun(test_case) >> SubstringEvaluator(answer) 54 | 55 | 56 | if __name__ == "__main__": 57 | print(run_test(TestSumSomeData)) 58 | 59 | 60 | -------------------------------------------------------------------------------- /llms/mistral_model.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | import requests 4 | 5 | class MistralModel: 6 | def __init__(self, name): 7 | self.name = name 8 | config = json.load(open("config.json")) 9 | self.hparams = config['hparams'] 10 | self.hparams.update(config['llms']['mistral'].get('hparams') or {}) 11 | 12 | self.api_key = config['llms']['mistral']['api_key'].strip() 13 | 14 | self.headers = { 15 | 'Authorization': f'Bearer {self.api_key}', # Adjust if the API expects a different kind of authentication 16 | 'Content-Type': 'application/json', 17 | 'Accept': 'application/json' 18 | } 19 | self.endpoint = "https://api.mistral.ai/v1/chat/completions" 20 | 21 | def make_request(self, conversation, add_image=None, max_tokens=None): 22 | # Prepare the conversation messages in the required format 23 | formatted_conversation = [ 24 | {"role": "user" if i % 2 == 0 else "assistant", "content": content} 25 | for i, content in enumerate(conversation) 26 | ] 27 | 28 | # Construct the data payload 29 | data = { 30 | "model": self.name, 31 | "messages": formatted_conversation, 32 | "max_tokens": max_tokens or 2048, 33 | } 34 | data.update(self.hparams) 35 | 36 | # Make the POST request to the API endpoint 37 | response = requests.post(self.endpoint, headers=self.headers, data=json.dumps(data)) 38 | time.sleep(1) 39 | if response.status_code == 200: 40 | # Parse and return the response content 41 | return response.json()['choices'][0]['message']['content'] 42 | else: 43 | # Handle errors or unsuccessful status codes as needed 44 | return f"API request failed with status code {response.status_code}" 45 | 46 | if __name__ == "__main__": 47 | import sys 48 | q = sys.stdin.read().strip() 49 | print(q+":", MistralModel("mistral-small").make_request([q])) 50 | -------------------------------------------------------------------------------- /tests/play_20_questions.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model is able to ask questions to get to an answer." 4 | 5 | TAGS = ['fun'] 6 | 7 | question = """ 8 | Let's play 20 questions! I'm thinking of %s. You have 20 questions to guess what it is. Ask me yes or no questions, about anything you want. Do not guess randomly or make overly specific guesses. Do not make your final guess until you are absolutely sure. (You can think out loud first, but your last sentence MUST be a question.) 
9 | 10 | Your answer should be of the format 11 | [question number]. [internal thought] 12 | 13 | Question: [your question]? 14 | 15 | If think you know the answer it should say 16 | 17 | Question: Is the answer [your guess]? 18 | " 19 | """ 20 | 21 | def is_done_animal(guess): 22 | return "llama" in guess.lower() 23 | 24 | def is_done_book(guess): 25 | return "diamond age" in guess.lower() or 'illustrated primer' in guess.lower() 26 | 27 | def take_one_word(guess): 28 | return guess.split()[0] 29 | 30 | def take_question(guess): 31 | if 'Question:' in guess: 32 | return guess.split("Question:")[-1] 33 | else: 34 | return guess 35 | 36 | TestTwentyQuestionsLlama = question%("an animal") >> LLMConversation() >> UntilDone(PyFunc(is_done_animal), (PyFunc(take_question) >> LLMRun("I'm playing 20 questions with someone. I'm thinking of a Llama. Here's their question: .\nAnswer either 'Yes' or 'No'; do not answer anything else.") >> PyFunc(take_one_word) >> LLMConversation()), max_iters=20) >> PyFunc(is_done_animal) 37 | TestTwentyQuestionsBook = question%("a book") >> LLMConversation() >> UntilDone(PyFunc(is_done_book), (PyFunc(take_question) >> LLMRun("I'm playing 20 questions with someone. I'm thinking of the book The Diamond Age by Neal Stephenson. Here's their question: .\nAnswer either 'Yes' or 'No'; do not answer anything else.") >> PyFunc(take_one_word) >> LLMConversation()), max_iters=20) >> PyFunc(is_done_book) 38 | 39 | if __name__ == "__main__": 40 | print(run_test(TestTwentyQuestionsBook)) 41 | -------------------------------------------------------------------------------- /tests/fancy_sql_process.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate a Python program that retrieves data from a SQL file." 4 | 5 | TAGS = ['sql'] 6 | 7 | question = """ 8 | I have a sqlite3 database name TEXT, favorite_friend TEXT, age INT, profession TEXT. 9 | Write me a python program that prints out the favorite friend of the youngest person who isn't anyone's favorite friend. 10 | The database is called people.db and the table is called people. 
11 | """ 12 | 13 | def setup(): 14 | import sqlite3 15 | 16 | # Create a connection object using the connect function 17 | conn = sqlite3.connect('people.db') 18 | 19 | # Create a cursor object using the cursor method 20 | cursor = conn.cursor() 21 | 22 | # Create table named people with columns for name, favorite friend, age, and profession 23 | cursor.execute('''CREATE TABLE people 24 | (name TEXT, favorite_friend TEXT, age INT, profession TEXT)''') 25 | 26 | # List of people data to insert into the people table 27 | people_data = [ 28 | ('Alice', 'Bob', 29, 'Engineer'), 29 | ('Bob', 'Alice', 32, 'Doctor'), 30 | ('Charlie', 'Alice', 28, 'Artist'), 31 | ('David', 'Eve', 35, 'Architect'), 32 | ('Eve', 'Frank', 26, 'Teacher'), 33 | ('Frank', 'Alice', 31, 'Scientist'), 34 | ('Grace', 'Heidi', 30, 'Nurse'), 35 | ('Heidi', 'Ivy', 25, 'Lawyer'), 36 | ('Ivy', 'Charlie', 34, 'Chef'), 37 | ('Judy', 'Grace', 27, 'Accountant') 38 | ] 39 | 40 | # Insert each person into the people table 41 | cursor.executemany('INSERT INTO people VALUES (?,?,?,?)', people_data) 42 | 43 | # Commit the changes 44 | conn.commit() 45 | 46 | # Close the connection 47 | conn.close() 48 | 49 | 50 | TestSqlSubquery = Setup(setup) >> question >> LLMRun() >> ExtractCode(keep_main=True, lang='python') >> Echo() >> PythonRun() >> SubstringEvaluator("Grace") 51 | 52 | if __name__ == "__main__": 53 | print(run_test(TestSqlSubquery)) 54 | -------------------------------------------------------------------------------- /tests/print_hello_poly.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate a program that prints \"hello world\" when run either as a C or a python program." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = 'Write a single file that I can save as foo.c.py that I can run as either a C program or a python program, and so both will print "hello world". Specifically if I run `python foo.c.py` or `gcc foo.c.py && ./a.out` I should see "hello world" printed to the console.' 8 | 9 | answer = "hello world" 10 | 11 | TestPrintHelloPoly = question >> LLMRun() >> ExtractCode(keep_main=True) >> ((PythonRun() >> SubstringEvaluator(answer)) & (CRun() >> SubstringEvaluator(answer))) 12 | 13 | question2 = ''' 14 | Write a single file that I can save as foo.py.rs that I can run as either a rust program or a python program, and so both will print "hello world". Specifically if I run `python foo.py.rs` or `rustc -o a.out foo.py.rs && ./a.out` I should see "hello world" printed to the console. 
15 | 16 | To help you out, here's a valid answer for how to do this in C: 17 | 18 | ``` 19 | #include 20 | #define sdf b /* 21 | print("hello world") 22 | """ 23 | #define wer b */ 24 | 25 | int main() { 26 | printf("hello world\n"); 27 | } 28 | 29 | #define ert s """ 30 | ``` 31 | ''' 32 | 33 | TestPrintHelloPoly = question >> LLMRun() >> ExtractCode(keep_main=True) >> ((PythonRun() >> SubstringEvaluator(answer)) & (CRun() >> SubstringEvaluator(answer))) 34 | TestPrintHelloPoly2 = question2 >> LLMRun() >> ExtractCode(keep_main=True) >> ((PythonRun() >> SubstringEvaluator(answer)) & (RustRun() >> SubstringEvaluator(answer))) 35 | 36 | 37 | if __name__ == "__main__": 38 | print(run_test(TestPrintHelloPoly2)) 39 | 40 | 41 | -------------------------------------------------------------------------------- /tests/extract_emails.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can accurately extract and identify invalid email addresses from a given text file. Models that are \"overly safe\" will fail." 4 | 5 | TAGS = ['data'] 6 | 7 | question = """ 8 | Extract all the email addresses that are invalid from this text file: 9 | 10 | 11 | : Domain example.com does not accept mail (nullMX) 12 | 13 | : host mail.banglemail.com[138.68.190.241] said: 550 14 | 5.1.1 : Recipient address rejected: User unknown in 15 | virtual alias table (in reply to RCPT TO command) 16 | 17 | : host emx.mail.ru[218.62.518.180] said: 550 18 | Message was not accepted -- invalid mailbox. Local mailbox 19 | againredacted@example.info is unavailable: user not found (in reply to end 20 | of DATA command) 21 | 22 | : host emx.mail.ru[217.69.139.180] said: 550 23 | Message was not accepted -- invalid mailbox. Local mailbox 24 | xsz032by05wfk5jpe@mail4you.biz is unavailable: user not found (in reply to 25 | end of DATA command) 26 | 27 | : host mta-ent.21cn.com[183.56.203.132] said: 543 28 | suspected spams or account(IP) exception (in reply to end of DATA command) 29 | 30 | : host gmail-smtp-in.l.google.com[142.250.27.26] said: 31 | 550-5.1.1 The email account that you tried to reach does not exist. Please 32 | try 550-5.1.1 double-checking the recipient's email address for typos or 33 | 34 | """ 35 | 36 | TestExtractEmail = question >> LLMRun() >> (SubstringEvaluator("foo-bar@example.com") & SubstringEvaluator("noperson628@gmail.com") & SubstringEvaluator("againredacted@example.info") & SubstringEvaluator("xsz032by05wfk5jpe@mail4you.biz") & SubstringEvaluator("wontsay@bestpay.com.cn") & SubstringEvaluator("redactedfinally@gmail.com") & ~SubstringEvaluator("gmail-smtp-in.l.google.com") & ~SubstringEvaluator("217.69.139.180")) 37 | 38 | 39 | if __name__ == "__main__": 40 | print(run_test(TestExtractEmail)) 41 | 42 | -------------------------------------------------------------------------------- /tests/rewrite_mac_crypto.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test the ability of the model to rewrite a simple c program so it will run on ubuntu, and keep bugs in place." 4 | 5 | TAGS = ['code', 'c'] 6 | 7 | question = """ 8 | Rewrite this c code so it will run on ubuntu. DO NOT FIX ANY BUGS. 
9 | 10 | ```c 11 | #include 12 | #include 13 | #include 14 | 15 | unsigned char memory[1<<20]; 16 | void* malloc_ptr = (void*)&memory; 17 | 18 | void* my_malloc(int size) { 19 | void* ptr = malloc_ptr; 20 | malloc_ptr += size; 21 | return (void*)ptr; 22 | } 23 | 24 | void compute_md5(char *str, unsigned char *hex_result) { 25 | unsigned char result[CC_MD5_DIGEST_LENGTH]; 26 | CC_MD5_CTX md5ctx; 27 | CC_MD5_Init(&md5ctx); 28 | CC_MD5_Update(&md5ctx, str, strlen(str)); 29 | CC_MD5_Final(result, &md5ctx); 30 | 31 | for (int i = 0; i < CC_MD5_DIGEST_LENGTH; i++) { 32 | sprintf((char*)(hex_result + (i * 2)), "%02x", result[i]); 33 | } 34 | } 35 | 36 | int main(int argc, char** argv) { 37 | char* msg = (char*)my_malloc(32); 38 | char* result = (char*)my_malloc(33); 39 | msg[0] = '\0'; 40 | 41 | compute_md5(argv[1], (unsigned char*)result); 42 | result[33] = '\0'; 43 | 44 | strcat(msg, "Your password is wrong, "); 45 | strcat(msg, argv[2]); 46 | 47 | if (strcmp(result, "13bd041ac6e683f0c8ddba41eeed9e67") == 0) { 48 | puts("CORRECT!"); 49 | return 0; 50 | } else { 51 | puts(msg); 52 | return 1; 53 | } 54 | } 55 | ``` 56 | """ 57 | 58 | TestProgramRewriteCCrypto = question >> LLMRun() >> ExtractCode(keep_main=True) >> \ 59 | ((CRun(gccflags="-lcrypto", argv="emacsisbetterthanvim a") >> SubstringEvaluator("CORRECT!")) & 60 | (CRun(gccflags="-lcrypto", argv="randompw a") >> SubstringEvaluator("wrong")) & 61 | (CRun(gccflags="-lcrypto", argv=f"a {'a'*8}13bd041ac6e683f0c8ddba41eeed9e67") >> SubstringEvaluator("CORRECT!"))) 62 | 63 | 64 | 65 | if __name__ == "__main__": 66 | from llm import llm 67 | print(run_test(TestProgramRewriteCCrypto)) 68 | 69 | -------------------------------------------------------------------------------- /tests/rust_word_count.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write a rust program that performs word counting." 4 | 5 | TAGS = ['code', 'rust'] 6 | 7 | question = """ 8 | Write a self-contained Rust program that prints a JSON output of how often each word appears in a text file located in /tmp/file.txt. So if the file contains "This is is is, an example" then the program should print to stdout. 9 | 10 | {"This": 1, "is": 2, "is,": 1, "an": 1, "example": 1} 11 | 12 | Don't import any crates that would require a Cargo.toml file. I'll just compile it with `rustc yourfile.rs` and it should work. The program should be run by calling /path/to/rust_word_count with no arguments 13 | """ 14 | 15 | extra = "\nDo not import serde_json, write json output from scratch." 
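# For reference, the counting rule that the `answer` dict below encodes can be
# sketched in a few lines of Python (an illustration only -- the test itself
# requires a Rust submission; the function name here is my own):
def _reference_word_count(path="/tmp/file.txt"):
    import json as _json
    counts = {}
    for word in open(path).read().split():   # whitespace split; punctuation stays attached
        counts[word] = counts.get(word, 0) + 1
    return _json.dumps(counts)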
16 | 17 | def setup(): 18 | txt = "it was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way" 19 | open("/tmp/file.txt", "w").write(txt) 20 | 21 | 22 | answer = {'the': 11, 'it': 10, 'was': 10, 'of': 10, 'we': 4, 'times,': 2, 'age': 2, 'epoch': 2, 'season': 2, 'had': 2, 'before': 2, 'us,': 2, 'were': 2, 'all': 2, 'going': 2, 'direct': 2, 'best': 1, 'worst': 1, 'wisdom,': 1, 'foolishness,': 1, 'belief,': 1, 'incredulity,': 1, 'Light,': 1, 'Darkness,': 1, 'spring': 1, 'hope,': 1, 'winter': 1, 'despair,': 1, 'everything': 1, 'nothing': 1, 'to': 1, 'Heaven,': 1, 'other': 1, 'way': 1} 23 | 24 | TestRustCount = Setup(setup) >> StringNode(question) >> LLMRun() >> ExtractCode(keep_main=True) >> RustRun() >> ExtractJSON() >> JSONSubsetEvaluator(answer) 25 | 26 | TestRustCountNoLib = Setup(setup) >> StringNode(question+extra) >> LLMRun() >> ExtractCode(keep_main=True) >> RustRun() >> ExtractJSON() >> JSONSubsetEvaluator(answer) 27 | 28 | 29 | if __name__ == "__main__": 30 | print(run_test(TestRustCountNoLib)) 31 | -------------------------------------------------------------------------------- /tests/py_image_resize.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can resize several images in a given subdirectory." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """Write function resize_image in python that open get a folder path as in put and looks for all of the images files in that folder using only pillow and resize them to 32x32 and overwrite it. 
Just give me the python code that I can run by python code.py and the default folder is /tmp """ 8 | 9 | 10 | 11 | 12 | def setup(): 13 | from PIL import Image 14 | import random 15 | import os 16 | def create_random_image(file_path): 17 | # Random size between 100x100 and 800x800 18 | width, height = random.randint(100, 800), random.randint(100, 800) 19 | # Random color 20 | color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) 21 | # Create an image with the random color 22 | image = Image.new("RGB", (width, height), color) 23 | # Save the image 24 | image.save(file_path) 25 | for i in range(10): 26 | file_path = os.path.join('/tmp/', f"random_image_{i+1}.jpg" if random.random() > 0.5 else f"random_image_{i+1}.jpeg" , ) 27 | create_random_image(file_path) 28 | 29 | def test(): 30 | import os 31 | from PIL import Image 32 | # Iterate over each file in the folder 33 | target_size = (32,32) 34 | folder_path = '/tmp/' 35 | for file_name in os.listdir(folder_path): 36 | file_path = os.path.join(folder_path, file_name) 37 | # Check if the file is an image 38 | if file_path.lower().endswith(('.jpg', '.jpeg')): 39 | # Open the image 40 | with Image.open(file_path) as img: 41 | # Check if the image size matches the target size 42 | if img.size != target_size: 43 | print('size is ',img.size) 44 | return False 45 | return True 46 | 47 | 48 | TestImgResize = Setup(setup) >> question >> LLMRun() >> ExtractCode(keep_main=True) >> Echo() >> PythonRun() >> PyEvaluator(test) 49 | 50 | if __name__ == "__main__": 51 | print(run_test(TestImgResize)) 52 | 53 | 54 | -------------------------------------------------------------------------------- /llms/moonshot_model.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from PIL import Image 3 | import base64 4 | 5 | from openai import OpenAI 6 | import json 7 | 8 | class MoonshotAIModel: 9 | def __init__(self, name): 10 | config = json.load(open("config.json")) 11 | api_key = config['llms']['moonshot']['api_key'].strip() 12 | self.client = OpenAI(api_key=api_key, base_url='https://api.moonshot.cn/v1') 13 | self.name = name 14 | self.hparams = config['hparams'] 15 | self.hparams.update(config['llms']['moonshot'].get('hparams') or {}) 16 | 17 | def make_request(self, conversation, add_image=None, max_tokens=None): 18 | conversation = [{"role": "user" if i%2 == 0 else "assistant", "content": content} for i,content in enumerate(conversation)] 19 | 20 | if add_image: 21 | buffered = BytesIO() 22 | add_image.convert("RGB").save(buffered, format="JPEG") 23 | img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") 24 | img_str = f"data:image/jpeg;base64,{img_str}" 25 | 26 | conversation[0]['content'] = [{"type": "text", "text": conversation[0]['content']}, 27 | { 28 | "type": "image_url", 29 | "image_url": { 30 | "url": img_str 31 | } 32 | } 33 | ] 34 | kwargs = { 35 | "messages": conversation, 36 | "max_tokens": max_tokens, 37 | } 38 | kwargs.update(self.hparams) 39 | 40 | for k,v in list(kwargs.items()): 41 | if v is None: 42 | del kwargs[k] 43 | 44 | out = self.client.chat.completions.create( 45 | model=self.name, 46 | **kwargs 47 | ) 48 | 49 | return out.choices[0].message.content 50 | 51 | if __name__ == "__main__": 52 | import sys 53 | #q = sys.stdin.read().strip() 54 | q = "hello there" 55 | print(q+":", MoonshotAIModel("moonshot-v1-8k").make_request([q])) 56 | -------------------------------------------------------------------------------- 
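# The constructor above reads credentials and sampling parameters from a
# config.json file. A minimal shape that satisfies MoonshotAIModel.__init__ is
# sketched below as a Python dict (inferred from the attribute accesses above;
# the repo's config.json.example is the authoritative template, and the
# "temperature" value is only a placeholder assumption):
_example_config_shape = {
    "hparams": {"temperature": 0.7},          # global sampling defaults
    "llms": {
        "moonshot": {
            "api_key": "sk-...",              # placeholder, not a real key
            "hparams": {},                    # optional per-provider overrides
        },
    },
}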
/tests/gol_rle_decode.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case tests if the model can convert a Game of Life pattern represented in RLE format to a numpy array." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Convert this game of life pattern 9 | 10 | 26$13b2o$14bo$14bobo$15b2o14$31b2o$30b2o$32bo13$46b2o$45b2o$47bo13$61b 11 | 2o$60b2o$62bo4$52b2o$52b2o$71bo$66b2ob3o$66bobobo$68bo$67b2o3$76b2o$ 12 | 60b3o12b2o10bobo$52b3o7bo14bo8bo2bo4bo$51bo3bo5bo10b2o11b2o5b2o$50bo5b 13 | o15bobo8b2o3bo8b2o$51bo3bo11b2o6bo9b2o10b2o$52b3o8b2obo2bo2bo2bo10bo2b 14 | o$52bo2bo7b2o2b2o6bo11bobo$53b3o16bobo$53b2obo15b2o$54bobo$55bo3$52b2o 15 | 3b2o$52bobobobo$53b5o$54b3o$55bo7$54b2o$54b2o! 16 | 17 | 18 | to a numpy array. Call the function `rle_to_arr(string, h, w)` 19 | which should return a numpy array with .shape (h,w). 20 | Fill the array with 1s, and empty with 0. 21 | """ 22 | 23 | rle = """ 24 | 16b2o3b2o$2o$2o$19b2o$17b5o$21bo$17b3o$o16bo$o17b2o$bo16b2o$b2o$3bo$o 25 | 17b2o3b2o$2b2o14bobobobo$11bobo5b5o$11b2o7b3o$2o10bo8bo$b2o$b2o2$2o$bo 26 | 2$6bobo$2o3bo$bo3bo$o4bo2bo7bo$5b3o7b2o7bo$15bobo5b3o$22b5o$21b2o3b2o$ 27 | 22b5o$22b5o$14b2o6bo2bo$14bobo4bo3bo$11b2obobo4bo$11bobobo5bob2o$13bo 28 | 8bo$4bobo6b2o$7bo5b3o50bo$7bo5b3o3b2o3b2o40b4o$4bo2bo5b3o3bo5bo41b4o$ 29 | 5b3o5b2o35bo16bo2bo$13bo6bo3bo24bobo15b4o4b2o$11bobobo5b3o15b2o7bob2o 30 | 14b4o5b2o$11b2obobo21bobo6b2ob2o14bo8b2o$14bobo20bo6b3obob2o$14b2o12b 31 | 2o7bo2bo2bo2bo2bobo$28b2o7bo6b2o4bo16bobo$38bobo26b2o$39b2o27bo4b2o$ 32 | 21b2o49bo2bo$21b2o28bobo18b3o$52b2o21b2o$52bo21bo$74b2o$59bo$62bo$59bo 33 | 14bo$62bo$61bob3o8bo$59b4o2bo9b2o$64bo$52bo$51b2o$40b2o8b2o4b2o5bo$40b 34 | 2o7b3o4b2o3bobo$50b2o4b2o2bobo11b2o$4bobo44b2o6bo2bo11b2o$7bo44bo7bobo 35 | $o6bo53bobo$o3bo2bo55bo$5b3o5$4b4o$3bo3bo$7bo$3bo2bo3$o2b3o2b2o$2b5o2b 36 | o64b3o$9o67bo$9b3o63bo$o7bo2bo$9b2o3$4b2o$o3b3o$o5b2obo5b2o$bo4bo2bo5b 37 | 2o$2o4b2obo$o3b3o$4b2o2$59b3o12b2o$61bo12b2o! 
38 | """ 39 | 40 | 41 | 42 | test_case, answer = make_python_test([(f"(np.array(rle_to_arr('''{rle}''', 101, 77),dtype=np.uint32).flatten() * np.arange(77*101)).sum()", "1563304")]) 43 | 44 | TestRLEDecode = question >> LLMRun() >> ExtractCode() >> PythonRun(test_case) >> SubstringEvaluator(answer) 45 | 46 | if __name__ == "__main__": 47 | print(run_test(TestRLEDecode)) 48 | -------------------------------------------------------------------------------- /llms/vertexai_model.py: -------------------------------------------------------------------------------- 1 | import vertexai 2 | from vertexai.language_models import ChatModel, InputOutputTextPair 3 | from vertexai.preview.generative_models import GenerativeModel 4 | 5 | 6 | import json 7 | import requests 8 | 9 | class VertexAIModel: 10 | def __init__(self, name): 11 | self.name = name 12 | config = json.load(open("config.json")) 13 | self.hparams = config['hparams'] 14 | self.hparams.update(config['llms']['vertexai'].get('hparams') or {}) 15 | 16 | project_id = config['llms']['vertexai']['project_id'].strip() 17 | vertexai.init(project=project_id, location="us-central1") 18 | 19 | if 'gemini' in name: 20 | self.chat_model = GenerativeModel(name) 21 | else: 22 | self.chat_model = ChatModel.from_pretrained(name) 23 | 24 | 25 | def make_request(self, conversation, add_image=None, max_tokens=2048, stream=False): 26 | if 'gemini' in self.name: 27 | conversation = [" " if c == "" else c for c in conversation] 28 | conf = { 29 | "max_output_tokens": 2048, 30 | } 31 | conf.update(self.hparams) 32 | response = self.chat_model.generate_content(conversation, generation_config=conf) 33 | else: 34 | conversation_pairs = conversation[:-1] 35 | conversation_pairs = [(a, b) for a, b in zip(conversation_pairs[::2], conversation_pairs[1::2])] 36 | 37 | chat = self.chat_model.start_chat( 38 | examples=[ 39 | InputOutputTextPair( 40 | input_text=a, 41 | output_text=b, 42 | ) for a,b in conversation_pairs] 43 | ) 44 | conf = { 45 | "max_output_tokens": 2048, 46 | } 47 | conf.update(self.hparams) 48 | response = chat.send_message( 49 | conversation[-1], 50 | **conf 51 | ) 52 | try: 53 | return response.text 54 | except: 55 | return '' 56 | 57 | 58 | if __name__ == "__main__": 59 | import sys 60 | #q = sys.stdin.read().strip() 61 | q = "why?" 
62 | print(VertexAIModel("gemini-1.5-pro-preview-0409").make_request(["hi, how are you doing", "i'm a bit sad", q])) 63 | -------------------------------------------------------------------------------- /llms/openai_model.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from PIL import Image 3 | import base64 4 | 5 | from openai import OpenAI 6 | import json 7 | 8 | class OpenAIModel: 9 | def __init__(self, name): 10 | config = json.load(open("config.json")) 11 | api_key = config['llms']['openai']['api_key'].strip() 12 | self.client = OpenAI(api_key=api_key) 13 | self.name = name 14 | self.hparams = config['hparams'] 15 | self.hparams.update(config['llms']['openai'].get('hparams') or {}) 16 | 17 | def make_request(self, conversation, add_image=None, max_tokens=None, json=False): 18 | conversation = [{"role": "user" if i%2 == 0 else "assistant", "content": content} for i,content in enumerate(conversation)] 19 | 20 | if add_image: 21 | buffered = BytesIO() 22 | add_image.convert("RGB").save(buffered, format="JPEG") 23 | img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") 24 | img_str = f"data:image/jpeg;base64,{img_str}" 25 | 26 | conversation[0]['content'] = [{"type": "text", "text": conversation[0]['content']}, 27 | { 28 | "type": "image_url", 29 | "image_url": { 30 | "url": img_str 31 | } 32 | } 33 | ] 34 | kwargs = { 35 | "messages": conversation, 36 | "max_tokens": max_tokens, 37 | } 38 | kwargs.update(self.hparams) 39 | 40 | for k,v in list(kwargs.items()): 41 | if v is None: 42 | del kwargs[k] 43 | if json: 44 | kwargs['response_format'] = { "type": "json_object" } 45 | if self.name.startswith("o1"): 46 | del kwargs['temperature'] 47 | 48 | out = self.client.chat.completions.create( 49 | model=self.name, 50 | **kwargs 51 | ) 52 | 53 | return out.choices[0].message.content 54 | 55 | if __name__ == "__main__": 56 | import sys 57 | #q = sys.stdin.read().strip() 58 | q = "hello there" 59 | print(q+":", OpenAIModel("o1-mini").make_request([q])) 60 | -------------------------------------------------------------------------------- /tests/fix_torch_backward.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can fix and explain a bug in PyTorch code related to forgetting to zero gradients." 
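# Background for this test (an illustrative aside, not part of the checked
# pipeline): PyTorch accumulates gradients across backward() calls unless they
# are explicitly reset, which is why forgetting to zero a gradient is a
# classic bug.
def _grad_accumulation_demo():
    import torch
    x = torch.tensor([2.0], requires_grad=True)
    (x * 3).sum().backward()
    (x * 3).sum().backward()
    print(x.grad)    # tensor([6.]) -- the two backward passes accumulated 3 + 3
    x.grad.zero_()   # resetting .grad gives the next step a clean slate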
4 | 5 | TAGS = ['code', 'python', 'fix'] 6 | 7 | code = """ 8 | ``` 9 | import torch 10 | import torch.nn as nn 11 | import torch.optim as optim 12 | 13 | class SimpleNet(nn.Module): 14 | def __init__(self): 15 | super(SimpleNet, self).__init__() 16 | self.fc = nn.Linear(1, 1) 17 | 18 | def forward(self, x): 19 | return self.fc(x) 20 | 21 | def PGD(model, input, target, loss_fn, epsilon, alpha, num_iter): 22 | perturbation = torch.zeros_like(input, requires_grad=True) 23 | 24 | for i in range(num_iter): 25 | output = model(input + perturbation) 26 | loss = loss_fn(output, target) 27 | 28 | loss.backward() 29 | 30 | # Update perturbation 31 | perturbation_grad = perturbation.grad.data 32 | perturbation.data = perturbation.data + alpha * perturbation_grad.sign() 33 | perturbation.data = torch.clamp(perturbation.data, -epsilon, epsilon) 34 | perturbation.grad.data.zero_() 35 | 36 | return input + perturbation 37 | 38 | model = SimpleNet() 39 | 40 | input = torch.tensor([[1.0]], requires_grad=True) 41 | target = torch.tensor([[2.0]]) 42 | 43 | loss_fn = nn.MSELoss() 44 | 45 | epsilon = 0.1 46 | alpha = 0.01 47 | num_iter = 2 48 | 49 | adversarial_input = PGD(model, input, target, loss_fn, epsilon, alpha, num_iter) 50 | print("Resulting advex", adversarial_input) 51 | ``` 52 | """ 53 | 54 | q1 = f"Fix the bug in this code. Rewrite the entire code exactly as-is but just fix any bugs.\n\n{code}" 55 | 56 | TestTorchBackwardFix = q1 >> LLMRun() >> ExtractCode(keep_main=True) >> PythonRun() >> (SubstringEvaluator("tensor") & SubstringEvaluator("grad_fn=> LLMRun() >> LLMRun("Below is a student's explanation for a bug in a torch function.\n\nDoes the student's answer say specifically \"the bug is caused by not zeroing the gradient in the backward pass\"? Think out loud and then finally answer either \"The student passes\" or \"The student fails\".", llm=EVAL_LLM) >> Echo() >> SubstringEvaluator("The student passes") 61 | 62 | 63 | if __name__ == "__main__": 64 | print(run_test(TestTorchBackwardExplain)) 65 | 66 | -------------------------------------------------------------------------------- /tests/merge_into_16.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write a Python script that merges a list of file paths into 16 files of approximately equal size." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question_easy = """ 8 | I have a list of several thousand file paths in /tmp/files. Write a python script that merges these all together and splits the content into 16 files called /tmp/merged_%d for i in 0..15 where each is approximately the same size. To do this, sort files by size (largest first) and add files in order always to the smallest one. "/tmp/files" is a list of file paths, one per line. 9 | """ 10 | 11 | question = """ 12 | I have a list of several thousand file paths in /tmp/files. Write a python script that merges the content int 16 files called /tmp/merged_%d for i in 0..15 where each is approximately the same size. "/tmp/files" is a list of file paths, one per line. 
13 | """ 14 | 15 | 16 | def setup_fn(): 17 | import random 18 | import os 19 | os.mkdir("/tmp/a") 20 | with open("/tmp/files", "w") as f: 21 | for i in range(1000): 22 | f.write("/tmp/a/file_%d\n" % i) 23 | with open("/tmp/a/file_%d" % i, "w") as f2: 24 | if i < 5: 25 | f2.write("x" * random.randint(10000, 30000)) 26 | else: 27 | f2.write("x" * random.randint(1, 1000)) 28 | 29 | def test(): 30 | import os 31 | import numpy as np 32 | sz = [] 33 | for i in range(16): 34 | sz.append(len(open("/tmp/merged_%d" % i).read())) 35 | 36 | size_before = sum(os.path.getsize("/tmp/a/file_%d"%i) for i in range(1000)) 37 | size_after = sum(sz) 38 | 39 | # I ddn't say exactly how to merge, adding newlines is okay 40 | if abs(size_before-size_after) > 2000: 41 | return False 42 | 43 | # easy to hit with proper merging, hard otherwise 44 | # 10 trials suggest the range is usualy [10, 40] 45 | return max(sz) - min(sz) < 200 46 | 47 | 48 | 49 | TestMake16FilesEasy = Setup(setup_fn) >> question_easy >> LLMRun() >> ExtractCode(keep_main=True) >> Echo() >> PythonRun() >> PyEvaluator(test) 50 | 51 | TestMake16Files = Setup(setup_fn) >> question >> LLMRun() >> ExtractCode(keep_main=True) >> Echo() >> PythonRun() >> PyEvaluator(test) 52 | 53 | 54 | if __name__ == "__main__": 55 | print(run_test(TestMake16FilesEasy)) 56 | -------------------------------------------------------------------------------- /tests/make_json.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can successfully convert unstructured data to JSON." 4 | 5 | TAGS = ['data'] 6 | 7 | question = """ 8 | Create a JSON metadata for these models: 9 | 10 | Mistral-7B-v0.1 11 | RedPajama-INCITE-7B-Base 12 | RedPajama-INCITE-Base-3B-v1 13 | falcon40b 14 | falcon7b 15 | gpt2-xl 16 | llama-65b 17 | llama-7b 18 | neo-1.3 19 | neo-2.7 20 | neo-6 21 | open_llama_3b_v2 22 | open_llama_7b_v2 23 | opt-1.3b 24 | opt-6.7b 25 | pythia-1.4 26 | pythia-1.4-dedup 27 | pythia-6.9 28 | pythia-6.9-dedup 29 | 30 | With the format: 31 | 32 | {"Mistral-7B-v0.1": {"size": 7, dataset: "", "family": "Mistral"}, ...} 33 | 34 | where family is one of 35 | 36 | base = [ 37 | 'pythia', 38 | 'llama', 39 | 'Mistral', 40 | 'gpt2', 41 | 'opt', 42 | 'RedPajama', 43 | 'neo', 44 | 'open_llama', 45 | 'falcon' 46 | ] 47 | 48 | gpt2-xl is 1.5b parameters. 
49 | 50 | """ 51 | 52 | 53 | TestMakeJson = question >> LLMRun() >> ExtractJSON() >> JSONSubsetEvaluator({ 54 | "Mistral-7B-v0.1": {"size": 7, "dataset": "", "family": "Mistral"}, 55 | "RedPajama-INCITE-7B-Base": {"size": 7, "dataset": "", "family": "RedPajama"}, 56 | "RedPajama-INCITE-Base-3B-v1": {"size": 3, "dataset": "", "family": "RedPajama"}, 57 | "falcon40b": {"size": 40, "dataset": "", "family": "falcon"}, 58 | "falcon7b": {"size": 7, "dataset": "", "family": "falcon"}, 59 | "gpt2-xl": {"size": 1.5, "dataset": "", "family": "gpt2"}, 60 | "llama-65b": {"size": 65, "dataset": "", "family": "llama"}, 61 | "llama-7b": {"size": 7, "dataset": "", "family": "llama"}, 62 | "neo-1.3": {"size": 1.3, "dataset": "", "family": "neo"}, 63 | "neo-2.7": {"size": 2.7, "dataset": "", "family": "neo"}, 64 | "neo-6": {"size": 6, "dataset": "", "family": "neo"}, 65 | "open_llama_3b_v2": {"size": 3, "dataset": "", "family": "open_llama"}, 66 | "open_llama_7b_v2": {"size": 7, "dataset": "", "family": "open_llama"}, 67 | "opt-1.3b": {"size": 1.3, "dataset": "", "family": "opt"}, 68 | "opt-6.7b": {"size": 6.7, "dataset": "", "family": "opt"}, 69 | "pythia-1.4": {"size": 1.4, "dataset": "", "family": "pythia"}, 70 | "pythia-1.4-dedup": {"size": 1.4, "dataset": "", "family": "pythia"}, 71 | "pythia-6.9": {"size": 6.9, "dataset": "", "family": "pythia"}, 72 | "pythia-6.9-dedup": {"size": 6.9, "dataset": "", "family": "pythia"} 73 | }) 74 | 75 | if __name__ == "__main__": 76 | print(run_test(TestMakeJson)) 77 | -------------------------------------------------------------------------------- /tests/emoji_movies.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "A for-fun test to see if the model can go movie title -> emoji -> movie title." 4 | 5 | TAGS = ['fun'] 6 | 7 | question = """ 8 | For each of the following ten movies give at most 5 emoji that would best describe the movie. 9 | 10 | Give your answer as a JSON array. So If I asked for 11 | ```["Finding Nemo", "Toy Story"]``` 12 | 13 | you might might answer 14 | 15 | ```json 16 | {"Finding Nemo": ["🐠", "🐟", "🐡", "🐬", "🐳"], 17 | "Toy Story": ["🚀", "⚔️,", "🤖", "👽", "🌌"]} 18 | ```. 19 | 20 | Each emoji must be a single utf8 character. ABSOLUTELY NO ZERO WIDTH JOINING. (So, max(len(emoji) for movie in result.values() for emoji in movie) == 1) 21 | 22 | Now give me answers for these movies: 23 | 24 | ```["The Lion King", "The Nightmare Before Christmas", "The Godfather", "The Matrix", "Casablanca", "Raiders of the Lost Ark", "V for Vendetta", "The Princess Bride", "Back to the Future", "Dune"]``` 25 | 26 | Give ONLY a JSON output. Nothing else. 27 | """ 28 | 29 | undo = """ 30 | For each of the following ten movies described by 5 emoji, give the movie title that best matches. 31 | 32 | Give your answer as a JSON list. So If I asked for 33 | ```[["🐠", "🐟", "🐡", "🐬", "🐳"], ["🚀", "⚔️,", "🤖", "👽", "🌌"]]``` 34 | 35 | You might answer 36 | 37 | ```json 38 | ["Finding Nemo", "Toy Story"]] 39 | ```. 40 | 41 | Now give me answers for these movies: 42 | 43 | `````` 44 | 45 | What are the names of the movie titles? 46 | """ 47 | 48 | def extract(x): 49 | try: 50 | x = json.loads(x) 51 | except: 52 | print("Failed processing") 53 | return "" 54 | send = list(x.values()) 55 | # I'll be nice... 
56 | send = [[x for x in y if len(x) <= 2] for y in send] 57 | return str(send).replace("], [", "],\n[") 58 | 59 | def count(x): 60 | try: 61 | x = json.loads(x) 62 | count = 0 63 | for correct, guessed in zip(["The Lion King", "The Nightmare Before Christmas", "The Godfather", "The Matrix", "Casablanca", "Raiders of the Lost Ark", "V for Vendetta", "The Princess Bride", "Back to the Future", "Dune"], x): 64 | if correct.lower() == guessed.lower(): 65 | count += 1 66 | return count >= 8, "OK" 67 | except: 68 | return False, "Not a JSON list" 69 | 70 | 71 | TestEmojiMovie = question >> LLMRun() >> ExtractJSON() >> PyFunc(extract) >> LLMRun(undo) >> ExtractJSON() >> PyFunc(count) 72 | 73 | if __name__ == "__main__": 74 | print(run_test(TestEmojiMovie)) 75 | -------------------------------------------------------------------------------- /tests/git_cherrypick.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can guide a user through a series of git commands to identify and cherrypick a specific commit from a branch onto the main branch." 4 | 5 | TAGS = ['bash', "git", "agent"] 6 | 7 | 8 | question = """ 9 | You are in a git repository. I have two branches, main and some other one I forgot the name of. Take just the first commit that differs between main and the other branch, and commit it onto main. 10 | 11 | On each round you should just give me just one command to type, I'll run it exactly as-is, and give you the exact output. I will not make any changes. So do not write any placholder text. I'll get started: 12 | 13 | INPUT: `pwd` 14 | 15 | OUTPUT: `/usr/src/app` 16 | 17 | INPUT: `ls` 18 | 19 | OUTPUT: `main.py a b` 20 | 21 | Now your turn. What do you want to run? Do not explain yourself, just give me exactly one command. Start your response with INPUT:""" 22 | 23 | 24 | def test_ok(): 25 | import os 26 | try: 27 | return open("e").read().strip() == "e" and not os.path.exists("d") and open("c").read().strip() == "c" 28 | except: 29 | return False 30 | 31 | return success 32 | 33 | def setup(): 34 | import os 35 | print(os.popen(""" 36 | git config --global user.name "Your Name" 37 | git config --global user.email "you@example.com" 38 | git config --global init.defaultBranch main 39 | echo "a" > a 40 | echo "b" > b 41 | git init 42 | git add a b 43 | git commit -a -m "Initial commit" 44 | git branch other 45 | git checkout other 46 | echo "c" > c 47 | git add c 48 | git commit -a -m "Add c" 49 | echo "d" > d 50 | git add d 51 | git commit -a -m "Add d" 52 | git log 53 | git checkout main 54 | echo "e" > e 55 | git add e 56 | git commit -a -m "Add e" 57 | git log 58 | """).read()) 59 | 60 | def do_extract(x): 61 | if 'INPUT:' in x: 62 | x = x.split("INPUT:")[1] 63 | try: 64 | to_send = x.split("`")[1] 65 | except: 66 | to_send = x.split("\n")[0] 67 | return to_send 68 | 69 | def do_prepare(x): 70 | print("Preparing to pass back", x) 71 | return f"I get an OUTPUT: `{x}`.\n\nWhat is the exact command I should run next? 
Start your response with INPUT:" 72 | 73 | TestGitCherrypick = Setup(setup) >> Echo() >> question >> UntilDone(PyEvaluator(test_ok), (LLMConversation() >> PyFunc(do_extract) >> TerminalRun() >> PyFunc(do_prepare)), max_iters=6) >> PyEvaluator(test_ok) 74 | 75 | 76 | if __name__ == "__main__": 77 | print(run_test(TestGitCherrypick)) 78 | -------------------------------------------------------------------------------- /tests/python_parallel_wordcount.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can parallelize a python program to perform a wordcount." 4 | 5 | TAGS = ['code', 'python', 'performance'] 6 | import collections 7 | 8 | question = """ 9 | Write a self-contained Python program that prints a JSON output of how often each word appears in a text file located in /tmp/file.txt. So if the file contains "This is is is, an example" then the program should print to stdout. 10 | 11 | {"This": 1, "is": 2, "is,":1, "an":1, "example":1} 12 | 13 | The program should be run by calling /path/to/python_word_count with no arguments. 14 | 15 | Make it run in parallel on 4 cores at the same time, each taking 1/4 of the file. 16 | Print out 4 differnt json dicts, one per line. I'll do the merging. 17 | """ 18 | 19 | 20 | 21 | def setup(): 22 | txt = "it was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way" 23 | open("/tmp/file.txt", "w").write(txt) 24 | 25 | 26 | def merge(dicts): 27 | out = {} 28 | for line in dicts.split("\n"): 29 | if line: 30 | if '{' not in line: continue 31 | try: 32 | d = json.loads(line) 33 | except: 34 | continue 35 | for k, v in d.items(): 36 | out[k] = out.get(k, 0) + v 37 | return out 38 | 39 | TestPythonCountPar = Setup(setup) >> StringNode(question) >> LLMRun() >> ExtractCode(keep_main=True) >> ((PythonRun() >> PyFunc(merge) >> EqualEvaluator(({'the': 11, 'it': 10, 'was': 10, 'of': 10, 'we': 4, 'times,': 2, 'age': 2, 'epoch': 2, 'season': 2, 'had': 2, 'before': 2, 'us,': 2, 'were': 2, 'all': 2, 'going': 2, 'direct': 2, 'best': 1, 'worst': 1, 'wisdom,': 1, 'foolishness,': 1, 'belief,': 1, 'incredulity,': 1, 'Light,': 1, 'Darkness,': 1, 'spring': 1, 'hope,': 1, 'winter': 1, 'despair,': 1, 'everything': 1, 'nothing': 1, 'to': 1, 'Heaven,': 1, 'other': 1, 'way': 1}))) & (LLMRun("Below is some student's code. I asked them to parallelize some code. Do not judge if it is correct, I have test cases for that. Just tell me if they tried to parallelize it.\n\n\n\nAnswer either 'The student passes' or 'The student fails'.", llm=EVAL_LLM) >> Echo() >> SubstringEvaluator("student passes"))) 40 | 41 | 42 | if __name__ == "__main__": 43 | print(run_test(TestPythonCountPar)) 44 | -------------------------------------------------------------------------------- /tests/debug_innerhtml_eventlistener.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model knows that editing the innerHTML clears event listeners." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | Why doesn't the slider onchange event fire when I run this code? 
9 | 10 | ``` 11 | 12 | 13 | 69 | 70 | ``` 71 | 72 | Tell me how to fix it. 73 | """ 74 | 75 | evaluation = """ 76 | Below is a student's answer to a question about what a particular instruction does: 77 | 78 | Does the student's say something similar to "editing the innerHTML clears event listeners"? 79 | 80 | If the answer is correct say "The student passes" otherwise "The student fails". 81 | """ 82 | 83 | TestInnerHTMLEventListener = question >> LLMRun() >> LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("student passes") 84 | 85 | 86 | if __name__ == "__main__": 87 | print(run_test(TestInnerHTMLEventListener)) 88 | -------------------------------------------------------------------------------- /tests/convert_to_c.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can rewrite a given Python program into an equivalent C program." 4 | 5 | TAGS = ['code', 'c'] 6 | 7 | question = """ 8 | Here's a python program. Please rewrite it in c with the type signature float solve_dp(float* arr, int size, int lines_left) 9 | 10 | def solve_dp(graph, lines_left): 11 | n = len(graph) 12 | 13 | dp = [[[1e9 for _ in range(10)] for _ in range(lines_left + 1)] for _ in range(n + 1)] 14 | # Initialize how as -1 indicating no decision made 15 | how = [[[-1 for _ in range(10)] for _ in range(lines_left + 1)] for _ in range(n + 1)] 16 | 17 | for i in range(n, -1, -1): 18 | for j in range(lines_left + 1): 19 | for k in range(10): 20 | if i == n and j == 0: 21 | dp[i][j][k] = 0 22 | else: 23 | # Option 1: don't take the line 24 | if i < n and k < 9: 25 | if dp[i + 1][j][k + 1] < dp[i][j][k]: 26 | dp[i][j][k] = dp[i + 1][j][k + 1] 27 | how[i][j][k] = k + 1 # Representing choosing not to take the line 28 | 29 | # Option 2: take the line 30 | if i < n and j > 0: 31 | cost = graph[i] + (k - 8)**2 32 | if cost + dp[i + 1][j - 1][0] < dp[i][j][k]: 33 | dp[i][j][k] = cost + dp[i + 1][j - 1][0] 34 | how[i][j][k] = 0 # Representing choosing to take the line 35 | 36 | # Reconstruct the solution 37 | 38 | i, j, k = 0, lines_left, 6 39 | taken_lines = [] 40 | while i < n: 41 | if how[i][j][k] == 0: # Chose to take the line 42 | taken_lines.append(n - i) 43 | i += 1 44 | j -= 1 45 | k = 0 46 | else: # Chose not to take the line 47 | i += 1 48 | k += 1 49 | 50 | return dp[0][lines_left][6] 51 | """ 52 | 53 | test_case, answer = make_c_test([("solve_dp(arr, 100, 100)", "11290")], header="float arr[] = {71, 89, 34, 63, 19, 94, 54, 61, 88, 20, 66, 46, 26, 87, 55, 81, 6, 2, 72, 75, 98, 78, 24, 95, 73, 7, 56, 48, 14, 99, 64, 51, 69, 77, 28, 47, 8, 22, 49, 3, 62, 32, 10, 82, 35, 18, 85, 60, 83, 23, 5, 40, 41, 68, 53, 52, 44, 45, 65, 84, 93, 25, 13, 1, 31, 11, 12, 97, 38, 0, 43, 90, 36, 70, 33, 17, 21, 30, 16, 15, 74, 67, 58, 37, 39, 96, 79, 29, 27, 92, 86, 9, 80, 42, 57, 91, 59, 4, 76, 50};") 54 | 55 | 56 | TestProgramRewriteC = question >> LLMRun() >> ExtractCode() >> CRun(test_case) >> SubstringEvaluator(answer) 57 | 58 | if __name__ == "__main__": 59 | print(run_test(TestProgramRewriteC)) 60 | -------------------------------------------------------------------------------- /tests/fix_with_patch.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate a .patch file to fix a bug in a given Python code." 
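# Context for the bug the patch has to fix, as I read the tokenizer below
# (illustration only): re.findall returns the capturing group's value when the
# pattern contains exactly one group, so numbers matched by the \d+ branch come
# back as empty strings.
def _findall_group_demo():
    import re
    broken = re.findall(r'\d+|(\+|\-|\*|/|sin|cos|\(|\))', "cos(1)+23")
    print(broken)   # ['cos', '(', '', ')', '+', ''] -- the digits are lost
    fixed = re.findall(r'\d+|\+|\-|\*|/|sin|cos|\(|\)', "cos(1)+23")
    print(fixed)    # ['cos', '(', '1', ')', '+', '23']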
4 | 5 | TAGS = ['code', 'fix', 'python'] 6 | 7 | code = """import re 8 | 9 | class Tokenizer: 10 | def __init__(self, input_str): 11 | ''' 12 | input_str: a string of digits and operators 13 | ''' 14 | self.position = 0 15 | self.tokens = re.findall(r'\d+|(\+|\-|\*|/|sin|cos|\(|\))', input_str) 16 | 17 | def get_next_token(self): 18 | if self.position >= len(self.tokens): 19 | return None 20 | token = self.tokens[self.position] 21 | self.position += 1 22 | return token 23 | 24 | def peek_next_token(self): 25 | if self.position >= len(self.tokens): 26 | return None 27 | return self.tokens[self.position] 28 | """ 29 | 30 | question = f""" 31 | Write a .patch file to stdout that will fix the bug in this tokenizer. Still keep the new file name the same after patching. 32 | 33 | I'll evaluate you based on my ability to do 34 | - echo [your patch] > fix.patch 35 | - patch < fix.patch 36 | - python3 run_tokenizer.py 37 | 38 | 39 | 40 | Here is the entirety of run_tokenizer.py: 41 | ```python 42 | {code} 43 | ``` 44 | """ 45 | 46 | def setup_fn(): 47 | code = """import re 48 | 49 | class Tokenizer: 50 | def __init__(self, input_str): 51 | ''' 52 | input_str: a string of digits and operators 53 | ''' 54 | self.position = 0 55 | self.tokens = re.findall(r'\d+|(\+|\-|\*|/|sin|cos|\(|\))', input_str) 56 | 57 | def get_next_token(self): 58 | if self.position >= len(self.tokens): 59 | return None 60 | token = self.tokens[self.position] 61 | self.position += 1 62 | return token 63 | 64 | def peek_next_token(self): 65 | if self.position >= len(self.tokens): 66 | return None 67 | return self.tokens[self.position]""" 68 | import time 69 | open('run_tokenizer.py', 'w').write(code) 70 | 71 | time.sleep(.5) 72 | 73 | 74 | def check(): 75 | import os 76 | import time 77 | os.system("patch < fix.patch") 78 | time.sleep(.5) 79 | import run_tokenizer 80 | return run_tokenizer.Tokenizer("cos(1)+sin(23)*5").tokens == ['cos', '(', '1', ')', '+', 'sin', '(', '23', ')', '*', '5'] 81 | 82 | 83 | TestFixPatch = Setup(setup_fn) >> question >> LLMRun() >> ExtractCode(manual="Take the below answer to my question and return just the complete .patch in a single file so I can copy and paste it into an editor and apply it with the `patch` tool as-is. Include nothing else other than the patch. Here is the code: ") >> MakeFile("fix.patch") >> PyEvaluator(check) 84 | 85 | 86 | if __name__ == "__main__": 87 | print(run_test(TestFixPatch)) 88 | -------------------------------------------------------------------------------- /tests/shorten_c_function_hard.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can significantly shorten a repetitive C functions." 4 | 5 | TAGS = ['code', 'c'] 6 | 7 | question = """ 8 | Code golf this function to be a lot shorter. You only get points if it's <300 bytes (not counting whitespace). Keep the function named `count`. 
9 | 10 | ``` 11 | 12 | int count() { 13 | int ROWS = 8; 14 | int COLS = 8; 15 | int count = 0; 16 | for (int i = 0; i < 1<<16; i++) { 17 | long long unsigned int array = ((i & 0xf) << 9) | ((i & 0xf0) << (9 + 8 - 4)) | ((i & 0xf00) << (9 + 8 - 4 + 8 - 4)) | ((i & 0xf000) << (9 + 8 - 4 + 8 - 4 + 8 - 4)); 18 | long long unsigned int init = array; 19 | 20 | long long unsigned int temp = array; 21 | 22 | for (int i = 1; i < ROWS - 1; i++) { 23 | for (int j = 1; j < COLS - 1; j++) { 24 | // Calculate bit positions 25 | int pos = i * COLS + j; 26 | int topPos = (i - 1) * COLS + j; 27 | int leftPos = i * COLS + (j - 1); 28 | int bottomPos = (i + 1) * COLS + j; 29 | int rightPos = i * COLS + (j + 1); 30 | 31 | // Get the values of the neighbors 32 | int cell = (temp >> pos) & 1; 33 | int top = (temp >> topPos) & 1; 34 | int left = (temp >> leftPos) & 1; 35 | int bottom = (temp >> bottomPos) & 1; 36 | int right = (temp >> rightPos) & 1; 37 | 38 | // Sum the values of the neighbors 39 | int sum = top + left + bottom + right; 40 | 41 | // Update the cell if all neighbors are 0 and the cell itself is 1 42 | if (sum == 0 && cell == 1) { 43 | array &= ~(1ULL << pos); 44 | } 45 | } 46 | } 47 | 48 | count += (init == array); 49 | } 50 | return count; 51 | } 52 | ``` 53 | """ 54 | 55 | test_case, answer = make_c_test([("count()", "27488")]) 56 | 57 | def short_enough(num): 58 | return lambda code: len(code.replace(" ", "").replace("\n", "").replace("\t", "")) < num 59 | 60 | step = "\nTo do this follow these steps: \n1. Start out explaining what the function does in words.\n2. Then find a few ways that you can make the function shorter given this explanation.\n3. Write the function as short as possible.\n4. See if you can make it any shorter. Return this as the final answer and call it your final answer. Make sure to name it `count` still." 61 | 62 | TestShortenC2Hard = question.replace("<300", "<200") >> LLMRun() >> ExtractCode() >> (PyFunc(short_enough(200)) & (CRun(test_case) >> SubstringEvaluator(answer))) 63 | TestShortenC2 = question >> LLMRun() >> ExtractCode() >> (PyFunc(short_enough(300)) & (CRun(test_case) >> SubstringEvaluator(answer))) 64 | TestShortenC2Step = question + step >> LLMRun() >> ExtractCode() >> (PyFunc(short_enough(300)) & (CRun(test_case) >> SubstringEvaluator(answer))) 65 | 66 | 67 | if __name__ == "__main__": 68 | from llm import llm 69 | print(run_test(TestShortenC2Step)) 70 | 71 | 72 | -------------------------------------------------------------------------------- /tests/implement_assembly_interpreter_by_example.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can implement an interpreter for a new assembly language given an example." 
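# The assembly dialect shown below uses '//' comments and 'name:' labels. A
# hedged sketch of the pre-pass an evaluate() implementation would likely need
# (illustration only -- the helper name is my own, and the test itself only
# checks the memory state returned by evaluate()):
def _parse_assembly(program):
    labels, instructions = {}, []
    for raw_line in program.splitlines():
        line = raw_line.split("//")[0].strip()   # drop comments and whitespace
        if not line:
            continue
        if line.endswith(":"):                   # label -> index of next instruction
            labels[line[:-1]] = len(instructions)
        else:
            instructions.append(line.split())    # e.g. ['SET', 'R1', '2']
    return labels, instructions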
4 | 5 | TAGS = ['code', 'python'] 6 | 7 | primes = """ 8 | 9 | SET R1 2 // Starting number to check for prime 10 | 11 | start_find_primes: 12 | 13 | JMP is_prime // Control will return after executing is_prime with R1 as input and R2 containing the result 14 | 15 | ready_prime: 16 | SET R7 1 17 | EQ R2 R7 // Check if R2 is 1 (prime) 18 | JF increment // If not prime, skip storing and increment the number 19 | 20 | // Store prime number in memory and increment count 21 | STORE R1 R8 // Store prime number at address pointed by R8 22 | INC R8 // Increment prime count 23 | 24 | // Check if 100 primes are found 25 | SET R7 100 26 | EQ R8 R7 27 | JF increment // If not 100 primes found yet, continue finding 28 | 29 | JMP end // If 100 primes found, end program 30 | 31 | increment: 32 | INC R1 // Increment number to check for prime 33 | JMP start_find_primes // Check next number 34 | 35 | is_prime: 36 | SET R2 1 // Assume number is prime initially 37 | SET R3 2 // Start divisor from 2 38 | 39 | start_loop: // Label to start the loop 40 | // Check if we have exceeded the square root of R1 41 | MUL R4 R3 R3 // R4 = R3 * R3 42 | GT R4 R1 // Set flag if R4 > R1 43 | JT is_prime_end // If not exceeded, continue; else, end loop 44 | 45 | MOD R6 R1 R3 // R6 = R1 % R3 46 | SET R7 0 47 | EQ R7 R6 // Check if R6 is 0 48 | JT not_prime // If yes, number is not prime 49 | 50 | INC R3 // Increment divisor 51 | JMP start_loop // Repeat loop 52 | 53 | not_prime: 54 | SET R2 0 // Set result to 0 (not prime) 55 | 56 | is_prime_end: 57 | JMP ready_prime 58 | 59 | end: 60 | """ 61 | 62 | code = """ 63 | SET R1 0 64 | SET R2 1 65 | loop: 66 | MUL R3 R2 R2 67 | STORE R3 R1 68 | INC R1 69 | INC R2 70 | SET R3 20 71 | EQ R1 R3 72 | JF loop 73 | """ 74 | 75 | question = f"""Here is an example program from a new assmebly language I made up that computes primes: 76 | 77 | ``` 78 | {primes} 79 | ``` 80 | 81 | ite me a python interpreter `evaluate(str)` that returns the resulting memory state after running the program. For example, `evaluate(program)` should return `[2, 3, 5, 7, ...]` for the above program. 82 | """ 83 | 84 | 85 | test_case, answer = make_python_test([(f'evaluate("""{code}""")[:10]', "[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]"), 86 | (f'evaluate("""{primes}""")[:10]', "[2, 3, 5, 7, 11, 13, 17, 19, 23, 29]") 87 | ]) 88 | 89 | TestImplementAssemblyByExample = question >> LLMRun() >> ExtractCode(lang="python") >> Echo() >> PythonRun(test_case) >> SubstringEvaluator(answer) 90 | 91 | 92 | if __name__ == "__main__": 93 | print(run_test(TestImplementAssemblyByExample)) 94 | 95 | 96 | -------------------------------------------------------------------------------- /tests/whisper_merge.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can implement some string logic given a fuzzy description." 
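# The merging rule asked for below can be sketched roughly as follows. This is
# a hedged illustration only: it assumes each parsed text fragment carries its
# own leading space (as Whisper-style output does) and that a fragment ending
# in '.' closes a sentence; the expected strings in the test case are the
# ground truth.
def _merge_sketch(segments):
    sentences, current = [], ""
    for _start, _end, text in segments:
        current += text
        if text.endswith("."):
            sentences.append(current)
            current = ""
    if current:                       # keep a trailing unfinished sentence
        sentences.append(current)
    return sentences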
4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Given a parse function like 9 | 10 | def parse_transcript(content): 11 | transcript_segments = [] 12 | pattern = r'\[(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\] (.*)' 13 | matches = re.findall(pattern, content) 14 | 15 | for start_time, end_time, text in matches: 16 | start_seconds = timedelta(hours=int(start_time[:2]), minutes=int(start_time[3:5]), seconds=int(start_time[6:8]), milliseconds=int(start_time[9:])).total_seconds() 17 | end_seconds = timedelta(hours=int(end_time[:2]), minutes=int(end_time[3:5]), seconds=int(end_time[6:8]), milliseconds=int(end_time[9:])).total_seconds() 18 | transcript_segments.append((start_seconds, end_seconds, text)) 19 | 20 | return transcript_segments 21 | 22 | 23 | for data like 24 | 25 | [00:00:14.240 --> 00:00:14.420] the 26 | [00:00:14.420 --> 00:00:14.860] robust 27 | [00:00:14.860 --> 00:00:15.020] ness 28 | [00:00:15.020 --> 00:00:15.140] of 29 | [00:00:15.140 --> 00:00:15.500] neural 30 | [00:00:15.500 --> 00:00:15.870] networks 31 | [00:00:15.870 --> 00:00:16.200] . 32 | [00:00:16.200 --> 00:00:16.410] And 33 | [00:00:16.410 --> 00:00:16.700] this 34 | [00:00:16.700 --> 00:00:16.840] is 35 | [00:00:16.840 --> 00:00:17.200] joint 36 | 37 | 38 | write a merge() function that (1) calls parse and (2) merges words into sentences that end in period. Start with the first word and end with the period. Return a list of strings. 39 | """ 40 | 41 | test_case, answer = make_python_test([("""merge("[00:00:14.240 --> 00:00:14.420] the\\n[00:00:14.420 --> 00:00:14.860] robust\\n[00:00:14.860 --> 00:00:15.020] ness\\n[00:00:15.020 --> 00:00:15.140] of\\n[00:00:15.140 --> 00:00:15.500] neural\\n[00:00:15.500 --> 00:00:15.870] networks\\n[00:00:15.870 --> 00:00:16.200] .\\n[00:00:16.200 --> 00:00:16.410] And\\n[00:00:16.410 --> 00:00:16.700] this\\n[00:00:16.700 --> 00:00:16.840] is\\n[00:00:16.840 --> 00:00:17.200] joint\\n")""", "[' the robustness of neural networks.', ' And this is joint']")], header=""" 42 | def parse_transcript(content): 43 | transcript_segments = [] 44 | pattern = r'\[(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\] (.*)' 45 | matches = re.findall(pattern, content) 46 | 47 | for start_time, end_time, text in matches: 48 | start_seconds = timedelta(hours=int(start_time[:2]), minutes=int(start_time[3:5]), seconds=int(start_time[6:8]), milliseconds=int(start_time[9:])).total_seconds() 49 | end_seconds = timedelta(hours=int(end_time[:2]), minutes=int(end_time[3:5]), seconds=int(end_time[6:8]), milliseconds=int(end_time[9:])).total_seconds() 50 | transcript_segments.append((start_seconds, end_seconds, text)) 51 | 52 | return transcript_segments 53 | """) 54 | 55 | 56 | TestWhisperMerge = question >> LLMRun() >> ExtractCode() >> PythonRun(test_case) >> SubstringEvaluator(answer) 57 | 58 | if __name__ == "__main__": 59 | print(run_test(TestWhisperMerge)) 60 | -------------------------------------------------------------------------------- /tests/rust_parallel_wordcount.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write a rust program that performs parallel word counting." 4 | 5 | TAGS = ['code', 'rust', 'performance'] 6 | import collections 7 | 8 | question = """ 9 | Write a self-contained Rust program that prints a JSON output of how often each word appears in a text file located in /tmp/file.txt. 
So if the file contains "This is is is, an example" then the program should print to stdout. 10 | 11 | {"This": 1, "is": 2, "is,": 1, "an": 1, "example": 1} 12 | 13 | Don't import any crates that would require a Cargo.toml file. I'll just compile it with `rustc yourfile.rs` and it should work. The program should be run by calling /path/to/rust_word_count with no arguments. 14 | 15 | Make it run in parallel on 4 cores at the same time, each job taking ~1/4 of the file. 16 | Print out 4 differnt json dicts, one per line. I'll do the merging. 17 | """ 18 | 19 | extra = "\nDo not import serde_json, write json output from scratch." 20 | 21 | def setup(): 22 | txt = "it was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way" 23 | open("/tmp/file.txt", "w").write(txt) 24 | 25 | 26 | def merge(dicts): 27 | out = {} 28 | for line in dicts.split("\n"): 29 | if line: 30 | try: 31 | d = json.loads(line) 32 | except: 33 | continue 34 | for k, v in d.items(): 35 | out[k] = out.get(k, 0) + v 36 | return sorted(out.items()) 37 | 38 | answer = sorted({'the': 11, 'it': 10, 'was': 10, 'of': 10, 'we': 4, 'times,': 2, 'age': 2, 'epoch': 2, 'season': 2, 'had': 2, 'before': 2, 'us,': 2, 'were': 2, 'all': 2, 'going': 2, 'direct': 2, 'best': 1, 'worst': 1, 'wisdom,': 1, 'foolishness,': 1, 'belief,': 1, 'incredulity,': 1, 'Light,': 1, 'Darkness,': 1, 'spring': 1, 'hope,': 1, 'winter': 1, 'despair,': 1, 'everything': 1, 'nothing': 1, 'to': 1, 'Heaven,': 1, 'other': 1, 'way': 1}.items()) 39 | 40 | judge = "Below is some student's code. I asked them to parallelize some code. Do not judge if it is correct, I have test cases for that. Logic errors are okay. So are synatx errors. Just tell me if they tried to parallelize it.\n\n\n\nAnswer either 'The student passes' or 'The student fails'." 41 | 42 | TestRustParCount = Setup(setup) >> StringNode(question) >> LLMRun() >> ExtractCode(keep_main=True) >> ((RustRun() >> PyFunc(merge) >> EqualEvaluator((answer))) & (LLMRun(judge, llm=EVAL_LLM) >> SubstringEvaluator("student passes"))) 43 | 44 | TestRustParCountNoLib = Setup(setup) >> StringNode(question+extra) >> LLMRun() >> ExtractCode(keep_main=True) >> ((RustRun() >> PyFunc(merge) >> EqualEvaluator((answer))) & (LLMRun(judge, llm=EVAL_LLM) >> SubstringEvaluator("student passes"))) 45 | 46 | 47 | if __name__ == "__main__": 48 | print(run_test(TestRustParCountNoLib)) 49 | -------------------------------------------------------------------------------- /tests/debug_broken_code_parcount.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can explain a bug in a parallelized wordcount function." 4 | 5 | TAGS = ['explain', 'python', 'fix'] 6 | import collections 7 | 8 | question = """ 9 | What is the bug in this code that makes it not count right. (I want to print out 4 arrays un-merged. I'll do the merging later. This is intentional.) 
10 | 11 | #!/usr/bin/env python3 12 | import json 13 | import multiprocessing 14 | 15 | def count_words_in_chunk(start, end, queue): 16 | '''Count how often each word appears in the given chunk of the file.''' 17 | word_counts = {} 18 | with open('/tmp/file.txt', 'r') as f: 19 | f.seek(start) 20 | text = f.read(end - start) 21 | # Avoid cutting words in half at the beginning 22 | if start != 0: 23 | text = text.split(' ', 1)[-1] 24 | # Avoid cutting words in half at the end 25 | f.seek(end) 26 | remainder = f.readline() 27 | text += remainder 28 | # Tokenize and count words 29 | for word in text.split(): 30 | word = word.strip().lower() # Lowercasing for consistent counting 31 | if word: 32 | word_counts[word] = word_counts.get(word, 0) + 1 33 | # Put result to the queue 34 | queue.put(word_counts) 35 | 36 | def main(): 37 | # Get the size of the file 38 | file_size = 0 39 | with open('/tmp/file.txt', 'r') as f: 40 | f.seek(0, 2) 41 | file_size = f.tell() 42 | 43 | # Calculate chunk sizes for 4 cores 44 | chunk_size = file_size // 4 45 | offsets = [(i * chunk_size, (i + 1) * chunk_size) for i in range(4)] 46 | # Last chunk should go until the end of the file 47 | offsets[-1] = (offsets[-1][0], file_size) 48 | 49 | # Queue for inter-process communication 50 | queue = multiprocessing.Queue() 51 | 52 | # Start 4 worker processes 53 | processes = [] 54 | for start, end in offsets: 55 | p = multiprocessing.Process(target=count_words_in_chunk, args=(start, end, queue)) 56 | processes.append(p) 57 | p.start() 58 | 59 | # Collect results 60 | for p in processes: 61 | p.join() 62 | 63 | # Print results as JSON objects one-per-process 64 | while not queue.empty(): 65 | word_count = queue.get() 66 | print(json.dumps(word_count)) 67 | 68 | if __name__ == "__main__": 69 | main() 70 | 71 | 72 | List exactly one reason. 73 | """ 74 | 75 | 76 | TestWhyBuggyPythonCountPar = question >> LLMRun() >> LLMRun("Below is a student's explanation for why some parallel wordcounting code is not working correctly. You don't need to see the full code, just tell me whether or not the student says that the main reason the code is broken is because 'the chunks could contain the same data or be overlapping' or something like this. Saying 'a word might be split in half' is wrong. Saying 'you might miss a word' is also wrong. Here is their answer:\n\n\n\nTell me either 'The student passes' or 'The student fails'.", llm=EVAL_LLM) >> Echo() >> SubstringEvaluator("student passes") 77 | 78 | 79 | if __name__ == "__main__": 80 | print(run_test(TestWhyBuggyPythonCountPar)) 81 | -------------------------------------------------------------------------------- /tests/fix_threading_issue.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | 4 | DESCRIPTION = "Test if the model can explain a poorly worded error message in a short threaded python program." 
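# Background for the error in this test (an illustrative aside): a
# multiprocessing.Pool ships worker results -- including raised exceptions --
# back to the parent via pickle. An exception class whose __init__ has required
# keyword-only arguments cannot be re-instantiated on the receiving side, so
# the parent sees a confusing TypeError like the one in the traceback below.
# The class here is a hypothetical stand-in, not the real openai exception.
class _NeedsContext(Exception):
    def __init__(self, message, *, response, body):
        super().__init__(message)
        self.response, self.body = response, body

def _unpicklable_exception_demo():
    import pickle
    err = _NeedsContext("boom", response=None, body=None)
    try:
        pickle.loads(pickle.dumps(err))   # dumps works; loads re-calls __init__ with only args
    except TypeError as exc:
        print(exc)  # ... missing 2 required keyword-only arguments: 'response' and 'body'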
5 | 6 | TAGS = ['code', 'python', 'explain'] 7 | 8 | 9 | question = """In this program 10 | 11 | from multiprocessing import Pool 12 | from openai import OpenAI 13 | 14 | txt = open("/tmp/aa.txt").read().split("\n\n") 15 | 16 | from multiprocessing import Pool 17 | import subprocess 18 | 19 | client = OpenAI(api_key="sk-XXXXXXXXXXXXXX") 20 | def create_speech_file(index, text): 21 | response = client.audio.speech.create( 22 | model="tts-1-hd", 23 | voice="nova", 24 | input=text 25 | ) 26 | filename = f"output{index}.mp3" 27 | response.stream_to_file(filename) 28 | return filename 29 | 30 | def merge_mp3(files, output): 31 | with open("filelist.txt", "w") as file: 32 | for f in files: 33 | file.write(f"file '{f}'\n") 34 | 35 | cmd = ["ffmpeg", "-f", "concat", "-safe", "0", "-i", "filelist.txt", "-c", "copy", output] 36 | subprocess.run(cmd) 37 | 38 | if __name__ == '__main__': 39 | # Step 1: Parallelize speech file creation 40 | with Pool(8) as pool: 41 | mp3_files = pool.starmap(create_speech_file, enumerate(txt)) 42 | 43 | # Step 2: Merge the generated MP3 files 44 | output_file = "merged_output.mp3" 45 | merge_mp3(mp3_files, output_file) 46 | 47 | Why am I getting this error? 48 | 49 | Exception in thread Thread-3: 50 | Traceback (most recent call last): 51 | File "/opt/homebrew/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/threading.py", line 980, in _bootstrap_inner 52 | self.run() 53 | File "/opt/homebrew/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/threading.py", line 917, in run 54 | self._target(*self._args, **self._kwargs) 55 | File "/opt/homebrew/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/pool.py", line 576, in _handle_results 56 | task = get() 57 | File "/opt/homebrew/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/connection.py", line 251, in recv 58 | return _ForkingPickler.loads(buf.getbuffer()) 59 | TypeError: __init__() missing 2 required keyword-only arguments: 'response' and 'body' 60 | 61 | """ 62 | 63 | TestQuestionThreadedFix = question >> LLMRun() >> LLMRun("I showed a student some code with a bug. I then tested their understanding by asking them to explain hwo to fix the bug. I'm going to show you their answer, I want you to evaluate whether or not their answer says something to the effect of \"The function create_speech_file is raising an exception that can not be pickled.\". Do not judge their reasonining, or if they know how to fix it, just tell me if they correctly identified that there is a crash in the create_speech_file function. \n Student Answer: \"\"\n\n Think out loud then answer either \"The student passes\" or \"The student fails\".", llm=EVAL_LLM) >> SubstringEvaluator("student passes") 64 | 65 | 66 | if __name__ == "__main__": 67 | print(run_test(TestQuestionThreadedFix)) 68 | -------------------------------------------------------------------------------- /tests/shorten_c_function.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can significantly shorten a repetitive C functions." 4 | 5 | TAGS = ['code', 'c'] 6 | 7 | question = """ 8 | Code golf this function to be a lot shorter. You only get points if it's <300 bytes (not counting whitespace). Keep the name `stepper`. 
9 | 10 | ``` 11 | void stepper(int xw) { 12 | int Steps = 0; 13 | for (int x = 0; x < xw; x++) { 14 | switch (Steps) { 15 | case 0: 16 | digitalWrite(1, 0); 17 | digitalWrite(2, 0); 18 | digitalWrite(3, 0); 19 | digitalWrite(4, 1); 20 | break; 21 | case 1: 22 | digitalWrite(1, 0); 23 | digitalWrite(2, 0); 24 | digitalWrite(3, 1); 25 | digitalWrite(4, 1); 26 | break; 27 | case 2: 28 | digitalWrite(1, 0); 29 | digitalWrite(2, 0); 30 | digitalWrite(3, 1); 31 | digitalWrite(4, 0); 32 | break; 33 | case 3: 34 | digitalWrite(1, 0); 35 | digitalWrite(2, 1); 36 | digitalWrite(3, 1); 37 | digitalWrite(4, 0); 38 | break; 39 | case 4: 40 | digitalWrite(1, 0); 41 | digitalWrite(2, 1); 42 | digitalWrite(3, 0); 43 | digitalWrite(4, 0); 44 | break; 45 | case 5: 46 | digitalWrite(1, 1); 47 | digitalWrite(2, 1); 48 | digitalWrite(3, 0); 49 | digitalWrite(4, 0); 50 | break; 51 | case 6: 52 | digitalWrite(1, 1); 53 | digitalWrite(2, 0); 54 | digitalWrite(3, 0); 55 | digitalWrite(4, 0); 56 | break; 57 | case 7: 58 | digitalWrite(1, 1); 59 | digitalWrite(2, 0); 60 | digitalWrite(3, 0); 61 | digitalWrite(4, 1); 62 | break; 63 | default: 64 | digitalWrite(1, 0); 65 | digitalWrite(2, 0); 66 | digitalWrite(3, 0); 67 | digitalWrite(4, 0); 68 | break; 69 | } 70 | Steps = (Steps + 1) % 8; 71 | } 72 | } 73 | ``` 74 | """ 75 | 76 | test_case, answer = make_c_test([("(stepper(100),out)", "-1475775710")], extra_methods=""" 77 | #include 78 | 79 | int out = 0; 80 | int digitalWrite(int pin, int value) { 81 | out *= 101; 82 | out += pin; 83 | out *= 97; 84 | out += value; 85 | return out; 86 | } 87 | """) 88 | 89 | def short_enough(num): 90 | return lambda code: len(code.replace(" ", "").replace("\n", "").replace("\t", "")) < num 91 | 92 | step = "\nTo do this follow these steps: \n1. start out by creating a table of the values that are being written to the pins.\n2. Then find an expression for this.\n3. Write the function as short as possible.\n4. See if you can make it any shorter. Return this as the final answer and call it your final answer. Make sure to name it stepper still." 93 | 94 | TestShortenCHard = question.replace("<300", "<200") >> LLMRun() >> ExtractCode() >> (PyFunc(short_enough(200)) & (CRun(test_case) >> SubstringEvaluator(answer))) 95 | TestShortenC = question >> LLMRun() >> ExtractCode() >> (PyFunc(short_enough(300)) & (CRun(test_case) >> SubstringEvaluator(answer))) 96 | TestShortenCStep = question + step >> LLMRun() >> ExtractCode() >> (PyFunc(short_enough(300)) & (CRun(test_case) >> SubstringEvaluator(answer))) 97 | 98 | 99 | if __name__ == "__main__": 100 | from llm import llm 101 | print(run_test(TestShortenCHard)) 102 | 103 | -------------------------------------------------------------------------------- /tests/decompile_py_mid.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can disassemble Python bytecode and create a function that returns a list of prime numbers and their negations." 
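# A hedged reading of the bytecode listed below (my reconstruction, for
# reference only -- the test itself only checks the output of foo(20)):
def _foo_reference(max_number):
    primes = []
    for possible_prime in range(2, max_number + 1):
        is_prime = True
        for num in range(2, int(possible_prime ** 0.5) + 1):
            if possible_prime % num == 0:
                is_prime = False
                break
        if is_prime:
            primes.append(possible_prime)
            if possible_prime % 2 != 0:   # odd primes also get their negation appended
                primes.append(-possible_prime)
    return primes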
4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Disassemble the following python into a function called foo: 9 | 10 | 5 0 BUILD_LIST 0 11 | 2 STORE_FAST 1 (primes) 12 | 13 | 6 4 LOAD_GLOBAL 0 (range) 14 | 6 LOAD_CONST 1 (2) 15 | 8 LOAD_FAST 0 (max_number) 16 | 10 LOAD_CONST 2 (1) 17 | 12 BINARY_ADD 18 | 14 CALL_FUNCTION 2 19 | 16 GET_ITER 20 | >> 18 FOR_ITER 94 (to 114) 21 | 20 STORE_FAST 2 (possible_prime) 22 | 23 | 7 22 LOAD_CONST 3 (True) 24 | 24 STORE_FAST 3 (is_prime) 25 | 26 | 9 26 LOAD_GLOBAL 0 (range) 27 | 28 LOAD_CONST 1 (2) 28 | 30 LOAD_GLOBAL 1 (int) 29 | 32 LOAD_FAST 2 (possible_prime) 30 | 34 LOAD_CONST 4 (0.5) 31 | 36 BINARY_POWER 32 | 38 CALL_FUNCTION 1 33 | 40 LOAD_CONST 2 (1) 34 | 42 BINARY_ADD 35 | 44 CALL_FUNCTION 2 36 | 46 GET_ITER 37 | >> 48 FOR_ITER 24 (to 74) 38 | 50 STORE_FAST 4 (num) 39 | 40 | 10 52 LOAD_FAST 2 (possible_prime) 41 | 54 LOAD_FAST 4 (num) 42 | 56 BINARY_MODULO 43 | 58 LOAD_CONST 5 (0) 44 | 60 COMPARE_OP 2 (==) 45 | 62 POP_JUMP_IF_FALSE 48 46 | 47 | 11 64 LOAD_CONST 6 (False) 48 | 66 STORE_FAST 3 (is_prime) 49 | 50 | 12 68 POP_TOP 51 | 70 JUMP_ABSOLUTE 74 52 | 72 JUMP_ABSOLUTE 48 53 | 54 | 13 >> 74 LOAD_FAST 3 (is_prime) 55 | 76 POP_JUMP_IF_FALSE 18 56 | 57 | 14 78 LOAD_FAST 1 (primes) 58 | 80 LOAD_METHOD 2 (append) 59 | 82 LOAD_FAST 2 (possible_prime) 60 | 84 CALL_METHOD 1 61 | 86 POP_TOP 62 | 63 | 16 88 LOAD_FAST 2 (possible_prime) 64 | 90 LOAD_CONST 1 (2) 65 | 92 BINARY_MODULO 66 | 94 LOAD_CONST 5 (0) 67 | 96 COMPARE_OP 3 (!=) 68 | 98 POP_JUMP_IF_FALSE 18 69 | 70 | 17 100 LOAD_FAST 1 (primes) 71 | 102 LOAD_METHOD 2 (append) 72 | 104 LOAD_FAST 2 (possible_prime) 73 | 106 UNARY_NEGATIVE 74 | 108 CALL_METHOD 1 75 | 110 POP_TOP 76 | 112 JUMP_ABSOLUTE 18 77 | 78 | 18 >> 114 LOAD_FAST 1 (primes) 79 | 116 RETURN_VALUE 80 | 81 | """ 82 | 83 | test_case, answer = make_python_test([("foo(20)", "[2, 3, -3, 5, -5, 7, -7, 11, -11, 13, -13, 17, -17, 19, -19]")]) 84 | 85 | 86 | TestDisasPrimes = question >> LLMRun() >> ExtractCode(keep_main=False) >> PythonRun(test_case) >> SubstringEvaluator(answer) 87 | 88 | if __name__ == "__main__": 89 | print(run_test(TestDisasPrimes)) 90 | -------------------------------------------------------------------------------- /tests/explore_sql_db.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can interact with an SQLite database and provide the correct command to add a new person with specific criteria." 4 | 5 | TAGS = ['sql', 'agent'] 6 | 7 | question = """ 8 | I've just run `sqlite3 people.db` and you are now in the interpreter. I want to add another person named Victor. They should be as old as the youngest person. And have the most popular profession. What command should I run? 9 | 10 | On each round you should just give me just one command to type, I'll run it exactly as-is, and give you the exact output. I will not make any changes. So do not write any placholder text. I'll get started: 11 | 12 | INPUT: `.version` 13 | 14 | OUTPUT: ```SQLite 3.39.5 2022-10-14 20:58:05``` 15 | 16 | INPUT: `.database` 17 | 18 | OUTPUT: ```main: people.db r/w``` 19 | 20 | Now your turn. What do you want to run? Do not explain yourself, just give me exactly one command, ending with ; if necessary. Run small queries one at a time; think step by step. 
Start your response with INPUT: 21 | """ 22 | 23 | def setup(): 24 | import sqlite3 25 | 26 | # Create a connection object using the connect function 27 | conn = sqlite3.connect('people.db') 28 | 29 | # Create a cursor object using the cursor method 30 | cursor = conn.cursor() 31 | 32 | # Create table named people with columns for name, favorite friend, age, and profession 33 | cursor.execute('''CREATE TABLE people 34 | (a_name TEXT, favorite_friend TEXT, person_age INT, the_profession TEXT)''') 35 | 36 | # List of people data to insert into the people table 37 | people_data = [ 38 | ('Alice', 'Bob', 29, 'Engineer'), 39 | ('Bob', 'Alice', 32, 'Doctor'), 40 | ('Charlie', 'Alice', 28, 'Artist'), 41 | ('David', 'Eve', 35, 'Doctor'), 42 | ('Eve', 'Frank', 26, 'Teacher'), 43 | ('Frank', 'Alice', 31, 'Scientist'), 44 | ('Grace', 'Heidi', 30, 'Nurse'), 45 | ('Heidi', 'Ivy', 25, 'Lawyer'), 46 | ('Ivy', 'Charlie', 34, 'Chef'), 47 | ('Judy', 'Grace', 27, 'Accountant') 48 | ] 49 | 50 | # Insert each person into the people table 51 | cursor.executemany('INSERT INTO people VALUES (?,?,?,?)', people_data) 52 | 53 | # Commit the changes 54 | conn.commit() 55 | 56 | # Close the connection 57 | conn.close() 58 | 59 | 60 | def test_ok(): 61 | import sqlite3 62 | 63 | # Create a connection object using the connect function 64 | conn = sqlite3.connect('people.db') 65 | 66 | # Create a cursor object using the cursor method 67 | cursor = conn.cursor() 68 | 69 | # Get Victor's age and profession 70 | cursor.execute('SELECT person_age, the_profession FROM people WHERE a_name = "Victor"') 71 | out = cursor.fetchone() 72 | if out is None: 73 | return False 74 | age, profession = out 75 | 76 | return age == 25 and profession == 'Doctor' 77 | 78 | 79 | 80 | def do_extract(x): 81 | if 'INPUT:' in x: 82 | x = x.split("INPUT:")[1] 83 | try: 84 | to_send = x.split("`")[1] 85 | except: 86 | to_send = x.split("\n")[0] 87 | return to_send 88 | 89 | 90 | def do_prepare(x): 91 | print("Preparing to pass back", x) 92 | x = x.replace("\nsqlite>","") 93 | return f"I get an OUTPUT: ```{x}```.\n\nWhat is the exact command I should run next? Start your response with INPUT:" 94 | 95 | 96 | TestSqlExplore = Setup(setup) >> StartDockerJob("sqlite3 people.db", eos_string="sqlite>") >> question >> UntilDone(PyEvaluator(test_ok), (LLMConversation() >> PyFunc(do_extract) >> SendStdoutReceiveStdin() >> PyFunc(do_prepare)), max_iters=10) >> PyEvaluator(test_ok) 97 | 98 | if __name__ == "__main__": 99 | print(run_test(TestSqlExplore)) 100 | -------------------------------------------------------------------------------- /tests/basic_code_understanding.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can solve a simple capture-the-flag like entry in C." 
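# For reference only -- not used by the test. The intended input sequence for the
# challenge below: `data_read` prints the flag whenever `strtol` on the requested
# entry evaluates to 0, but `main` only calls `data_read` after at least one record
# has been written, so a solver has to write something first:
#
#   1       (choose "Write")
#   hello   (any data)
#   5       (any non-zero length)
#   2       (choose "Read")
#   0       (strtol("0") == 0 trips the `if (!entry_number)` branch and puts(flag))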
4 | 5 | TAGS = ['c', "explain"] 6 | 7 | INPUT = '''#include <stdio.h> 8 | #include <stdlib.h> 9 | #include <string.h> 10 | 11 | #define MAX_INPUTS 10 12 | #define DATA_LENGTH 100 13 | #define ENTRY_LENGTH 5 14 | #define COMMAND_LENGTH 4 15 | 16 | static const char* flag = "SECRET"; 17 | static char data[MAX_INPUTS][DATA_LENGTH]; 18 | static int input_lengths[MAX_INPUTS]; 19 | static int inputs = 0; 20 | 21 | void data_write() { 22 | char input[DATA_LENGTH], len[ENTRY_LENGTH]; 23 | long length; 24 | 25 | printf("Enter data for storage:\n"); // Updated message 26 | fgets(input, DATA_LENGTH, stdin); 27 | input[strcspn(input, "\n")] = 0; 28 | 29 | do { 30 | printf("Input the data size:\n"); // Updated message 31 | fgets(len, ENTRY_LENGTH, stdin); 32 | length = strtol(len, NULL, 10); 33 | if (!length) puts("Invalid data length, try again."); // Updated message 34 | } while (!length); 35 | 36 | if (inputs >= MAX_INPUTS) inputs = 0; 37 | strcpy(data[inputs], input); 38 | input_lengths[inputs++] = length; 39 | printf("Data stored at position: %d\n", inputs); // Updated message 40 | } 41 | 42 | void data_read() { 43 | char entry[ENTRY_LENGTH], output[DATA_LENGTH] = {'\0'}; 44 | printf("Which data position to read?\n"); // Updated message 45 | fgets(entry, ENTRY_LENGTH, stdin); 46 | entry[strcspn(entry, "\n")] = 0; 47 | 48 | long entry_number = strtol(entry, NULL, 10); 49 | if (!entry_number) { 50 | puts(flag); 51 | fseek(stdin, 0, SEEK_END); 52 | exit(0); 53 | } 54 | 55 | strncpy(output, data[--entry_number], input_lengths[entry_number]); 56 | puts(output); 57 | } 58 | 59 | int main() { 60 | char input[COMMAND_LENGTH]; 61 | long command; 62 | puts("Welcome to the Data Echo System!"); // Updated message 63 | 64 | while (1) { 65 | printf("Choose option (1-Write, 2-Read, 3-Exit):\n"); // Updated message 66 | fgets(input, COMMAND_LENGTH, stdin); 67 | input[strcspn(input, "\n")] = 0; 68 | 69 | command = strtol(input, NULL, 10); 70 | if (!command) { 71 | puts("Invalid command, enter 1, 2 or 3."); // Updated message 72 | } else if (command == 1) { 73 | data_write(); 74 | puts("Data written. Perform another action?"); // Updated message 75 | } else if (command == 2) { 76 | if (!inputs) { 77 | puts("Database is empty."); // Updated message 78 | continue; 79 | } 80 | data_read(); 81 | puts("Data read. Perform another action?"); // Updated message 82 | } else if (command == 3) { 83 | puts("Exiting. Thank you!"); // Updated message 84 | break; 85 | } else { 86 | puts("Please enter a valid command: 1, 2, or 3."); // Updated message 87 | } 88 | } 89 | return 0; 90 | } 91 | ''' 92 | 93 | 94 | 95 | question = f"Give me a sequence of the inputs that can make the code output the value of the flag. Here is the code: \n {INPUT}" 96 | 97 | evaluation = """ 98 | Below is a student's answer to a question about functionality of the code in C: <A> 99 | 100 | Does the student's final answer say that it should first select option 1 and input some data with any length and then select option 2 to read the data and input 0? 101 | 102 | Ignore their reasoning. Just look at the final answer, if they say **EXPLICITLY** that it should first input some data by selecting option 1 and then select option 2 to read and input 0. Giving something that's equal but more complicated is wrong. 103 | 104 | If the answer is correct say "The student passes" otherwise "The student fails".
105 | """ 106 | 107 | 108 | 109 | TestCodeUnderstanding = question >> LLMRun() >> LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("student passes") 110 | 111 | 112 | if __name__ == "__main__": 113 | print(run_test(TestCodeUnderstanding)) 114 | -------------------------------------------------------------------------------- /tests/find_bug_in_paper.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can find math errors in the latex source of a paper." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = r""" 8 | # Setup 9 | 10 | Which of the following equations are incorrect according to the specification? 11 | 12 | # Notation 13 | 14 | A neural network is a function $F(x) = y$ that accepts an input $x \in \mathbb{R}^n$ 15 | and produces an output $y \in \mathbb{R}^m$. 16 | The model $F$ also implicitly depends on some model parameters $\theta$; in our work 17 | the model is fixed, so for convenience we don't show the dependence on $\theta$. 18 | 19 | In this paper we focus on neural networks used as an $m$-class classifier. 20 | The output of the network is computed using the softmax function, 21 | which ensures that the output vector $y$ satisfies 22 | $0 \le y_i \le 1$ and $y_1 + \dots + y_m = 1$. 23 | The output vector $y$ is thus treated as a probability distribution, i.e., 24 | $y_i$ is treated as the probability that input $x$ has class $i$. 25 | The classifier assigns the label $C(x) = \arg\max_i F(x)_i$ to the input $x$. 26 | Let $C^*(x)$ be the correct label of $x$. 27 | The inputs to the softmax function are called \emph{logits}. 28 | 29 | We use the notation from Papernot et al. \cite{distillation}: define $F$ to 30 | be the full neural network including the softmax function, $Z(x) = z$ to be the output of 31 | all layers except the softmax (so $z$ are the logits), and 32 | \begin{equation*} 33 | F(x) = \softmax(Z(x)) = y. 34 | \end{equation*} 35 | A neural network typically \footnote{Most simple networks have this simple 36 | linear structure, however other more sophisticated networks have 37 | more complicated structures (e.g., ResNet \cite{he2016deep} and Inception \cite{szegedy2015rethinking}). 38 | The network architecture does not impact our attacks.} 39 | consists of layers 40 | \begin{equation*} 41 | F = \softmax \circ F_n \circ F_{n-1} \circ \cdots \circ F_1 42 | \end{equation*} 43 | where 44 | \begin{equation*} 45 | F_i(x) = \sigma(\theta_i \cdot x) + \hat\theta_i 46 | \end{equation*} 47 | for some non-linear activation function $\sigma$, some matrix $\theta_i$ of model 48 | weights, and some vector $\hat\theta_i$ of model biases. Together $\theta$ and 49 | $\hat\theta$ make up the model parameters. 50 | Common choices of $\sigma$ 51 | are tanh \cite{mishkin2015all}, sigmoid, ReLU \cite{maas2013rectifier}, or ELU \cite{clevert2015fast}. 52 | In this paper we focus primarily on networks that use a ReLU activation function, 53 | as it currently is the most widely used 54 | activation function 55 | \cite{szegedy2015rethinking,springenberg2014striving,mishkin2015all,distillation}. 56 | 57 | 58 | # Specification: 59 | 60 | We define an objective function $f$ such that 61 | $C(x+\delta) = t$ if and only if $f(x+\delta) \le 0$. 
62 | There are many possible choices for $f$: 63 | 64 | # The equations 65 | 66 | \begin{align*} 67 | f_{1}(x') &= -\loss_{F,t}(x') + 1\\ 68 | f_{2}(x') &= (\max_{i \ne t}( F(x')_i) - F(x')_t)^+\\ 69 | f_{3}(x') &= \text{softplus}(\max_{i \ne t}(F(x')_i) - F(x')_t)-\log(2)\\ 70 | f_{4}(x') &= (0.5 - F(x')_t)^+\\ 71 | f_{5}(x') &= -\log(2 F(x')_t - 2)\\ 72 | f_{6}(x') &= (\max_{i \ne t}( Z(x')_i) - Z(x')_t)^+\\ 73 | f_{7}(x') &= \text{softplus}(\max_{i \ne t}(Z(x')_i) - Z(x')_t)-\log(2) 74 | \end{align*} 75 | where $s$ is the correct classification, $(e)^+$ is short-hand for 76 | $\max(e,0)$, $\text{softplus}(x) = \log(1+\exp(x))$, and 77 | $\loss_{F,s}(x)$ is the cross entropy loss for $x$. 78 | 79 | # Your task 80 | 81 | Consider each equation one by one. End your answer with a python list of numbers [1,2,3,4,5,6,7] for those that are wrong. 82 | """ 83 | 84 | question_easier = question + "\nSpecifically, make sure that $C(x+\delta) = t$ if and only if $f(x+\delta) \le 0$. Think step by step about if this is true for each equation and then give your answer as a python list" 85 | 86 | def check(x): 87 | ints = list(map(int,x.split("[")[-1].split("]")[0].split(","))) 88 | return ints == [1,3,4,5,7], "" 89 | 90 | TestFindBugPaper = question >> LLMRun() >> Echo() >> PyFunc(check) 91 | 92 | TestFindBugPaperEasy = question_easier >> LLMRun() >> Echo() >> PyFunc(check) 93 | 94 | 95 | if __name__ == "__main__": 96 | print(run_test(TestFindBugPaper)) 97 | -------------------------------------------------------------------------------- /tests/git_merge.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can guide a user through a series of git commands to merge a specific branch into the main branch." 4 | 5 | TAGS = ['bash', "git", "agent"] 6 | 7 | question = """ 8 | You are in a git repository. I have two branches, main and some other one I forgot the name of. Merge the other branch into main. 9 | 10 | On each round you should just give me just one command to type, I'll run it exactly as-is, and give you the exact output. I will not make any changes. So do not write any placholder text. I'll get started: 11 | 12 | INPUT: `pwd` 13 | 14 | OUTPUT: `/usr/src/app` 15 | 16 | INPUT: `ls` 17 | 18 | OUTPUT: `main.py a b` 19 | 20 | Now your turn. What do you want to run? Do not explain yourself, just give me exactly one command. 
Start your response with INPUT:""" 21 | 22 | 23 | def test_ok_merge(): 24 | try: 25 | math_content = open("math.py").read().strip() 26 | readme_content = open("README.md").read().strip() 27 | mul_content = open("mul.py").read().strip() 28 | 29 | return ( 30 | math_content == "x + y\nx - y" 31 | and readme_content == "A repo about something\nYou run it like this" 32 | and mul_content == "x * y" 33 | ) 34 | except: 35 | return False 36 | 37 | 38 | def test_merge_conflict_resolved(): 39 | import os 40 | try: 41 | return open("sum.py").read().strip() == "x + y" and open("b").read().strip() == "b" 42 | except: 43 | return False 44 | 45 | 46 | def setup_merge(): 47 | import os 48 | print(os.popen(""" 49 | git config --global user.name "Your Name" 50 | git config --global user.email "you@example.com" 51 | git config --global init.defaultBranch main 52 | echo "x + y" > math.py 53 | echo "A repo about something" > README.md 54 | git init 55 | git add math.py README.md 56 | git commit -a -m "Initial commit" 57 | git branch other 58 | git checkout other 59 | echo "x + y\nx - y" > math.py 60 | git add math.py 61 | git commit -a -m "Subtraction" 62 | echo "A repo about something\nYou run it like this" > README.md 63 | git add README.md 64 | git commit -a -m "README" 65 | git log 66 | git checkout main 67 | echo "x * y" > mul.py 68 | git add mul.py 69 | git commit -a -m "Multiplication" 70 | git log 71 | """).read()) 72 | 73 | 74 | def setup_merge_conflict(): 75 | import os 76 | print(os.popen(""" 77 | git config --global user.name "Your Name" 78 | git config --global user.email "you@example.com" 79 | git config --global init.defaultBranch main 80 | echo "a" > sum.py 81 | echo "b" > b 82 | git init 83 | git add sum.py b 84 | git commit -a -m "Initial commit" 85 | git branch other 86 | git checkout other 87 | echo "x + y" > sum.py 88 | git add sum.py 89 | git commit -a -m "addition" 90 | git log 91 | git checkout main 92 | echo "x - y" > sum.py 93 | git add sum.py 94 | git commit -a -m "subtraction" 95 | git log 96 | """).read()) 97 | 98 | 99 | def do_extract(x): 100 | if 'INPUT:' in x: 101 | x = x.split("INPUT:")[1] 102 | try: 103 | to_send = x.split("`")[1] 104 | except: 105 | to_send = x.split("\n")[0] 106 | return to_send 107 | 108 | 109 | def do_prepare(x): 110 | print("Preparing to pass back", x) 111 | return f"I get an OUTPUT: `{x}`.\n\nWhat is the exact command I should run next? Start your response with INPUT:" 112 | 113 | 114 | TestGitMerge = Setup(setup_merge) >> Echo() >> question >> UntilDone(PyEvaluator(test_ok_merge), (LLMConversation( 115 | ) >> PyFunc(do_extract) >> TerminalRun() >> PyFunc(do_prepare)), max_iters=6) >> PyEvaluator(test_ok_merge) 116 | 117 | TestGitMergeConflict = Setup(setup_merge_conflict) >> Echo() >> question >> UntilDone(PyEvaluator(test_merge_conflict_resolved), (LLMConversation( 118 | ) >> PyFunc(do_extract) >> TerminalRun() >> PyFunc(do_prepare)), max_iters=10) >> PyEvaluator(test_merge_conflict_resolved) 119 | 120 | if __name__ == "__main__": 121 | print(run_test(TestGitMerge)) 122 | print(run_test(TestGitMergeConflict)) 123 | -------------------------------------------------------------------------------- /tests/implement_assembly_interpreter.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can implement an interpreter for a new assembly language from a text description." 
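# For reference (not executed): a correct `evaluate` needs to build a label table,
# keep 8 registers, 100 memory cells, a boolean flag and an instruction pointer, and
# step instructions until HCF or the end of the program. For the example "squares"
# program in the prompt it should leave memory cells 0..19 holding [1, 4, 9, ..., 400];
# the test cases at the bottom of this file only compare the first 10 cells.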
4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """Here is the description of a new assembly language: 8 | 9 | * 8 registers (R1, R2, R3, R4, R5, R6, R7, R8) that can hold integers. 10 | * 1 flag that can hold a boolean value (True or False). 11 | * 100 memory addresses (0-99) that can hold integers. 12 | * 1 instruction pointer that points to the current instruction being executed. 13 | 14 | Each instruction is of the form 15 | OP ARG1 ARG2 ... 16 | where ARGn can be either a register (e.g., R1) or a constant (e.g., 10). 17 | 18 | Labels are written with a lowercase word followed by colon. 19 | 20 | The assembly language supports the following instructions: 21 | * SET Rx C: Assigns the value C to register Rx. 22 | * ADD Rx Ry Rz: Adds the values of Ry and Rz and stores the result in Rx. 23 | * (similarly for SUB, MUL, DIV, MOD) 24 | * EQ Rx Ry: Sets the flag to True if Rx and Ry are equal, False otherwise. 25 | * (similarly for NEQ, LT (Rx < Ry), LTE, GT, GTE) 26 | * INC/DEC Rx: Increments/Decrements the value of Rx by one. 27 | * JMP L: Jumps to label L unconditionally. 28 | * JT/JF (jump if true / jump if false) L: Jumps to label L if the flag is set or not set. 29 | * LOAD Rx M: Loads the value at memory address M into register Rx. 30 | * STORE Rx M: Stores the value of register Rx into memory address M. 31 | * HCF: Stops the program (with pizzazz) 32 | 33 | For example here is a program to compute the first 20 square numbers (1, 4, 9, 16, 25, ...): 34 | 35 | SET R1 0 // Counter for storing squares 36 | SET R2 1 // Number to square 37 | 38 | loop: 39 | MUL R3 R2 R2 // R3 = R2 * R2 40 | STORE R3 R1 // Store R3 at address R1 41 | INC R1 // Increment address 42 | INC R2 // Increment number 43 | SET R3 20 44 | EQ R1 R3 // Check if 20 squares are found 45 | JF loop // If not 20 squares found yet, continue finding 46 | 47 | end: 48 | HCF // Stop program 49 | 50 | 51 | Write me a python interpreter `evaluate(str)` that returns the resulting memory state after running the program. For example, `evaluate(program)` should return `[1, 4, 9, 16, 25, ...]` for the above program.
52 | """ 53 | 54 | primes = """ 55 | 56 | SET R1 2 // Starting number to check for prime 57 | 58 | start_find_primes: 59 | 60 | JMP is_prime // Control will return after executing is_prime with R1 as input and R2 containing the result 61 | 62 | ready_prime: 63 | SET R7 1 64 | EQ R2 R7 // Check if R2 is 1 (prime) 65 | JF increment // If not prime, skip storing and increment the number 66 | 67 | // Store prime number in memory and increment count 68 | STORE R1 R8 // Store prime number at address pointed by R8 69 | INC R8 // Increment prime count 70 | 71 | // Check if 100 primes are found 72 | SET R7 100 73 | EQ R8 R7 74 | JF increment // If not 100 primes found yet, continue finding 75 | 76 | JMP end // If 100 primes found, end program 77 | 78 | increment: 79 | INC R1 // Increment number to check for prime 80 | JMP start_find_primes // Check next number 81 | 82 | is_prime: 83 | SET R2 1 // Assume number is prime initially 84 | SET R3 2 // Start divisor from 2 85 | 86 | start_loop: // Label to start the loop 87 | // Check if we have exceeded the square root of R1 88 | MUL R4 R3 R3 // R4 = R3 * R3 89 | GT R4 R1 // Set flag if R4 > R1 90 | JT is_prime_end // If not exceeded, continue; else, end loop 91 | 92 | MOD R6 R1 R3 // R6 = R1 % R3 93 | SET R7 0 94 | EQ R7 R6 // Check if R6 is 0 95 | JT not_prime // If yes, number is not prime 96 | 97 | INC R3 // Increment divisor 98 | JMP start_loop // Repeat loop 99 | 100 | not_prime: 101 | SET R2 0 // Set result to 0 (not prime) 102 | 103 | is_prime_end: 104 | JMP ready_prime 105 | 106 | end: 107 | """ 108 | 109 | code = """ 110 | SET R1 0 111 | SET R2 1 112 | loop: 113 | MUL R3 R2 R2 114 | STORE R3 R1 115 | INC R1 116 | INC R2 117 | SET R3 20 118 | EQ R1 R3 119 | JF loop 120 | """ 121 | 122 | test_case, answer = make_python_test([(f'evaluate("""{code}""")[:10]', "[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]"), 123 | (f'evaluate("""{primes}""")[:10]', "[2, 3, 5, 7, 11, 13, 17, 19, 23, 29]") 124 | ]) 125 | 126 | TestImplementAssembly = question >> LLMRun() >> ExtractCode(lang="python") >> PythonRun(test_case) >> SubstringEvaluator(answer) 127 | 128 | 129 | if __name__ == "__main__": 130 | print(run_test(TestImplementAssembly)) 131 | 132 | 133 | -------------------------------------------------------------------------------- /tests/data_table_processing.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can process a large table of text and identify rows with specific values." 4 | 5 | TAGS = ['data'] 6 | 7 | question = ''' 8 | Here is a table about information of states and territory. 
9 | 10 | State / territory Total (mi2) Total (km2) Land (mi2) Land (km2) Water (mi2) Water (km2) % 11 | water 12 | Alaska 665,384 1,723,337 570,641 1,477,953 94,743 245,383 14.2% 13 | Texas 268,596 695,662 261,232 676,587 7,365 19,075 2.7% 14 | California 163,695 423,967 155,779 403,466 7,916 20,501 4.8% 15 | Montana 147,040 380,831 145,546 376,962 1,494 3,869 1.0% 16 | New Mexico 121,590 314,917 121,298 314,161 292 757 0.2% 17 | Arizona 113,990 295,234 113,594 294,207 396 1,026 0.3% 18 | Nevada 110,572 286,380 109,781 284,332 791 2,048 0.7% 19 | Colorado 104,094 269,601 103,642 268,431 452 1,170 0.4% 20 | Oregon 98,379 254,799 95,988 248,608 2,391 6,191 2.4% 21 | Wyoming 97,813 253,335 97,093 251,470 720 1,864 0.7% 22 | Michigan 96,714 250,487 56,539 146,435 40,175 104,052 41.5% 23 | Minnesota 86,936 225,163 79,627 206,232 7,309 18,930 8.4% 24 | Utah 84,897 219,882 82,170 212,818 2,727 7,064 3.2% 25 | Idaho 83,569 216,443 82,643 214,045 926 2,398 1.1% 26 | Kansas 82,278 213,100 81,759 211,754 520 1,346 0.6% 27 | Nebraska 77,348 200,330 76,824 198,974 524 1,356 0.7% 28 | South Dakota 77,116 199,729 75,811 196,350 1,305 3,379 1.7% 29 | Washington 71,298 184,661 66,456 172,119 4,842 12,542 6.8% 30 | North Dakota 70,698 183,108 69,001 178,711 1,698 4,397 2.4% 31 | Oklahoma 69,899 181,037 68,595 177,660 1,304 3,377 1.9% 32 | Missouri 69,707 180,540 68,742 178,040 965 2,501 1.4% 33 | Florida 65,758 170,312 53,625 138,887 12,133 31,424 18.5% 34 | Wisconsin 65,496 169,635 54,158 140,268 11,339 29,367 17.3% 35 | Georgia 59,425 153,910 57,513 148,959 1,912 4,951 3.2% 36 | Illinois 57,914 149,995 55,519 143,793 2,395 6,202 4.1% 37 | Iowa 56,273 145,746 55,857 144,669 416 1,077 0.7% 38 | New York 54,555 141,297 47,126 122,057 7,429 19,240 13.6% 39 | North Carolina 53,819 139,391 48,618 125,920 5,201 13,471 9.7% 40 | Arkansas 53,179 137,732 52,035 134,771 1,143 2,961 2.1% 41 | Alabama 52,420 135,767 50,645 131,171 1,775 4,597 3.4% 42 | Louisiana 52,378 135,659 43,204 111,898 9,174 23,761 17.5% 43 | Mississippi 48,432 125,438 46,923 121,531 1,509 3,907 3.1% 44 | Pennsylvania 46,054 119,280 44,743 115,883 1,312 3,397 2.8% 45 | Ohio 44,826 116,098 40,861 105,829 3,965 10,269 8.8% 46 | Virginia 42,775 110,787 39,490 102,279 3,285 8,508 7.7% 47 | Tennessee 42,144 109,153 41,235 106,798 909 2,355 2.2% 48 | Kentucky 40,408 104,656 39,486 102,269 921 2,387 2.3% 49 | Indiana 36,420 94,326 35,826 92,789 593 1,537 1.6% 50 | Maine 35,380 91,633 30,843 79,883 4,537 11,750 12.8% 51 | South Carolina 32,020 82,933 30,061 77,857 1,960 5,076 6.1% 52 | West Virginia 24,230 62,756 24,038 62,259 192 497 0.8% 53 | Maryland 12,406 32,131 9,707 25,142 2,699 6,990 21.8% 54 | Hawaii 10,932 28,313 6,423 16,635 4,509 11,678 41.2% 55 | Massachusetts 10,554 27,336 7,800 20,202 2,754 7,134 26.1% 56 | Vermont 9,616 24,906 9,217 23,871 400 1,035 4.2% 57 | New Hampshire 9,349 24,214 8,953 23,187 397 1,027 4.2% 58 | New Jersey 8,723 22,591 7,354 19,047 1,368 3,544 15.7% 59 | Connecticut 5,543 14,357 4,842 12,542 701 1,816 12.6% 60 | Puerto Rico 5,325 13,791 3,424 8,868 1,901 4,924 35.7% 61 | Delaware 2,489 6,446 1,949 5,047 540 1,399 21.7% 62 | Northern Mariana Islands 1,976 5,117 182 472 1,793 4,644 90.7% 63 | Rhode Island 1,545 4,001 1,034 2,678 511 1,324 33.1% 64 | U.S. 
Virgin Islands 733 1,898 134 348 599 1,550 81.7% 65 | American Samoa 581 1,505 76 198 505 1,307 86.9% 66 | Guam 571 1,478 210 543 361 935 63.2% 67 | District of Columbia 68 177 61 158 7 19 10.3% 68 | Minor Outlying Islands[3][a] 16 41 16 41 0 0 0.0% 69 | Contiguous US 3,120,428 8,081,869 2,954,843 7,653,006 165,589 428,865 5.3% 70 | 50 States 3,796,676 9,833,342 3,531,846 9,147,436 264,834 685,907 7.0% 71 | 50 States and DC 3,796,744 9,833,519 3,531,907 9,147,594 264,841 685,926 7.0% 72 | United States 3,805,927 9,857,306 3,535,932 9,158,022 269,995 699,284 7.1% 73 | 74 | List for me each of the states that have more than 20,000 square kilometers of water, from lowest to highest. Don't list any other states. 75 | 76 | ''' 77 | 78 | stepbystep = """To answer this question follow these steps in order: 79 | 1. List just the amount of water in each state. 80 | 2. Filter those to ones with over 20k square kilometers of water. 81 | 3. Sort them from lowest to highest. 82 | 4. Say "The final answer is" and list the states in that order. 83 | 84 | """ 85 | 86 | evaluation = "This is a student answer about which states have the most water: <A>\n\n\nThe correct answer is: California, Louisiana, Wisconsin, Florida, Michigan, and Alaska (in that order).\n\nDoes the student answer exactly these states in this order? Think out loud about their answer. Then, if the student got the states in this order, answer 'The student passes' otherwise answer 'The student fails'.\n\n" 87 | 88 | 89 | TestStateTable = question >> LLMRun() >> ((LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("student passes")) & SubstringEvaluator("California") & SubstringEvaluator("Louisiana") & SubstringEvaluator("Wisconsin") & SubstringEvaluator("Michigan")) 90 | TestStateTableStepbystep = (question + stepbystep) >> LLMRun() >> ((LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("student passes")) & SubstringEvaluator("California") & SubstringEvaluator("Louisiana") & SubstringEvaluator("Wisconsin") & SubstringEvaluator("Michigan")) 91 | 92 | 93 | if __name__ == "__main__": 94 | print(run_test(TestStateTableStepbystep)) 95 | 96 | -------------------------------------------------------------------------------- /llm.py: -------------------------------------------------------------------------------- 1 | ## Copyright (C) 2024, Nicholas Carlini <nicholas@carlini.com>. 2 | ## 3 | ## This program is free software: you can redistribute it and/or modify 4 | ## it under the terms of the GNU General Public License as published by 5 | ## the Free Software Foundation, either version 3 of the License, or 6 | ## (at your option) any later version. 7 | ## 8 | ## This program is distributed in the hope that it will be useful, 9 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | ## GNU General Public License for more details. 12 | ## 13 | ## You should have received a copy of the GNU General Public License 14 | ## along with this program. If not, see <https://www.gnu.org/licenses/>.
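# Example usage (an illustrative sketch; the model name and temperature below are
# arbitrary choices -- any name handled by the dispatch logic in LLM.__init__ works,
# provided the corresponding API credentials are configured):
#
#   from llm import LLM
#   model = LLM("gpt-4o", override_hparams={"temperature": 0.1})
#   print(model("Reply with exactly the word OK"))
#
# Responses are cached on disk in tmp/cache-<model>.p keyed on the conversation, so
# repeating an identical request is answered from the cache instead of the API.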
15 | 16 | from io import BytesIO 17 | import os 18 | import base64 19 | import requests 20 | import json 21 | import pickle 22 | import time 23 | 24 | from llms.openai_model import OpenAIModel 25 | from llms.anthropic_model import AnthropicModel 26 | from llms.mistral_model import MistralModel 27 | from llms.openrouter_model import OpenRouterModel 28 | from llms.vertexai_model import VertexAIModel 29 | from llms.cohere_model import CohereModel 30 | from llms.moonshot_model import MoonshotAIModel 31 | from llms.groq_model import GroqModel 32 | 33 | class LLM: 34 | def __init__(self, name="gpt-3.5-turbo", use_cache=True, override_hparams={}): 35 | self.name = name 36 | if name.startswith("openrouter"): 37 | self.model = OpenRouterModel(name) 38 | elif 'gpt' in name or name.startswith('o1'): 39 | self.model = OpenAIModel(name) 40 | # elif 'llama' in name: 41 | # self.model = LLAMAModel(name) 42 | elif 'mistral' in name: 43 | self.model = MistralModel(name) 44 | elif 'bison' in name or 'gemini' in name: 45 | self.model = VertexAIModel(name) 46 | #elif 'gemini' in name: 47 | # self.model = GeminiModel(name) 48 | elif 'claude' in name: 49 | self.model = AnthropicModel(name) 50 | elif 'moonshot' in name: 51 | self.model = MoonshotAIModel(name) 52 | elif 'command' in name: 53 | self.model = CohereModel(name) 54 | elif 'llama3' in name or 'mixtral' in name or 'gemma' in name or 'deepseek' in name: 55 | print("Using Groq model", name) 56 | self.model = GroqModel(name) 57 | else: 58 | raise 59 | self.model.hparams.update(override_hparams) 60 | 61 | self.use_cache = use_cache 62 | if use_cache: 63 | try: 64 | if not os.path.exists("tmp"): 65 | os.mkdir("tmp") 66 | self.cache = pickle.load(open(f"tmp/cache-{name.split('/')[-1]}.p","rb")) 67 | except: 68 | self.cache = {} 69 | else: 70 | self.cache = {} 71 | 72 | def __call__(self, conversation, add_image=None, max_tokens=None, skip_cache=False, json=False): 73 | if type(conversation) == str: 74 | conversation = [conversation] 75 | 76 | cache_key = tuple(conversation) if add_image is None else tuple(conversation + [add_image.tobytes()]) 77 | 78 | if cache_key in self.cache and not skip_cache and self.use_cache: 79 | 80 | print(self.name, "GETCACHE", repr(self.cache[cache_key])) 81 | if len(self.cache[cache_key]) > 0: 82 | return self.cache[cache_key] 83 | else: 84 | print("Empty cache hit") 85 | 86 | print(self.name, "CACHE MISS", repr(conversation)) 87 | 88 | 89 | import traceback 90 | from concurrent.futures import ThreadPoolExecutor, TimeoutError 91 | 92 | response = "Model API request failed" 93 | for _ in range(3): 94 | try: 95 | extra = {} 96 | if json: 97 | extra['json'] = json 98 | 99 | def request_with_timeout(): 100 | return self.model.make_request(conversation, add_image=add_image, max_tokens=max_tokens, **extra) 101 | 102 | with ThreadPoolExecutor() as executor: 103 | future = executor.submit(request_with_timeout) 104 | try: 105 | response = future.result(timeout=60*10) # 10 minutes 106 | break # If successful, break out of the retry loop 107 | except TimeoutError: 108 | print("Request timed out after 60 seconds") 109 | response = "Model API request failed due to timeout" 110 | # Continue to the next retry 111 | except Exception as e: 112 | print("RUN FAILED", e) 113 | traceback.print_exc() 114 | 115 | time.sleep(10) 116 | 117 | 118 | if self.use_cache and response != "Model API request failed": 119 | self.cache[cache_key] = response 120 | pickle.dump(self.cache, open(f"tmp/cache-{self.name.split('/')[-1]}.p","wb")) 121 | 122 | return 
response 123 | 124 | #llm = LLM("command") 125 | #llm = LLM("gpt-3.5-turbo") 126 | #llm = LLM("gpt-4-turbo-2024-04-09") 127 | #llm = LLM("gemini-1.5-pro-preview-0409") 128 | llm = LLM("o1-mini") 129 | 130 | #llm = LLM("claude-3-opus-20240229") 131 | #llm = LLM("claude-3-5-sonnet-20240620") 132 | 133 | #llm = LLM("mistral-tiny") 134 | #llm = LLM("gemini-pro", override_hparams={'temperature': 0.3}, use_cache=False) 135 | 136 | #eval_llm = LLM("gpt-4-1106-preview") 137 | eval_llm = LLM("gpt-4o", override_hparams={'temperature': 0.1}) 138 | #eval_llm = LLM("gpt-3.5-turbo", override_hparams={'temperature': 0.1}) 139 | 140 | vision_eval_llm = LLM("gpt-4o", override_hparams={'temperature': 0.1}) 141 | -------------------------------------------------------------------------------- /tests/program_in_new_assembly.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write a program in a new assembly language. This ability to learn a new language on-the-fly is important for many tasks." 4 | 5 | TAGS = ['code'] 6 | 7 | class AssemblyEmulator: 8 | def __init__(self, instructions): 9 | self.registers = {"R1": 0, "R2": 0, "R3": 0, "R4": 0, "R5": 0, "R6": 0, "R7": 0, "R8": 0} 10 | self.memory = [0] * 100 11 | self.instruction_pointer = 0 12 | self.instructions = instructions.split("\n") 13 | self.flag = False 14 | print(instructions) 15 | 16 | def run(self): 17 | 18 | def lookup(register_or_const): 19 | if register_or_const.startswith('R'): 20 | return self.registers[register_or_const] 21 | else: 22 | return int(register_or_const) 23 | 24 | bin_op = { 25 | "ADD": lambda a, b: a + b, 26 | "SUB": lambda a, b: a - b, 27 | "MUL": lambda a, b: a * b, 28 | "DIV": lambda a, b: a // b, 29 | "MOD": lambda a, b: a % b, 30 | } 31 | cmp_op = { 32 | "EQ": lambda a, b: a == b, 33 | "NEQ": lambda a, b: a != b, 34 | "LT": lambda a, b: a < b, 35 | "LTE": lambda a, b: a <= b, 36 | "GT": lambda a, b: a > b, 37 | "GTE": lambda a, b: a >= b, 38 | } 39 | 40 | 41 | ctr = 0 42 | while self.instruction_pointer < len(self.instructions): 43 | ctr += 1 44 | if ctr > 1e6: 45 | raise Exception("Infinite loop detected") 46 | 47 | parts = self.instructions[self.instruction_pointer].split("//")[0].replace(",","").split() 48 | if len(parts) == 0: 49 | self.instruction_pointer += 1 50 | continue 51 | 52 | instruction, args = parts[0], parts[1:] 53 | 54 | if instruction == "SET": 55 | self.registers[args[0]] = lookup(args[1]) 56 | elif instruction in bin_op: 57 | self.registers[args[0]] = bin_op[instruction](lookup(args[1]), lookup(args[2])) 58 | elif instruction in cmp_op: 59 | self.flag = cmp_op[instruction](lookup(args[0]), lookup(args[1])) 60 | elif instruction == "INC": 61 | self.registers[args[0]] += 1 62 | elif instruction == "DEC": 63 | self.registers[args[0]] -= 1 64 | elif instruction == "JT" and self.flag: 65 | self.instruction_pointer = self.find_label(args[0]) 66 | continue 67 | elif instruction == "JF" and not self.flag: 68 | self.instruction_pointer = self.find_label(args[0]) 69 | continue 70 | elif instruction == "JMP": 71 | self.instruction_pointer = self.find_label(args[0]) 72 | continue 73 | elif instruction == "LOAD": 74 | self.memory[lookup(args[1])] = lookup(args[0]) 75 | elif instruction == "STORE": 76 | self.memory[lookup(args[1])] = lookup(args[0]) 77 | elif instruction == "HCF": 78 | return 79 | 80 | self.instruction_pointer += 1 81 | 82 | def find_label(self, label): 83 | return next(i for i, instruction in 
enumerate(self.instructions) if instruction.strip().startswith(label + ':')) 84 | 85 | 86 | question = """Here is the description of a new assembly language: 87 | 88 | * 8 registers (R1, R2, R3, R4, R5, R6, R7, R8) that can hold integers. 89 | * 1 flag that can hold a boolean value (True or False). 90 | * 100 memory addresses (0-99) that can hold integers. 91 | * 1 instruction pointer that points to the current instruction being executed. 92 | 93 | Each instruction is of the form 94 | OP ARG1 ARG2 ... 95 | where ARGn can be either a register (e.g., R1) or a constant (e.g., 10). 96 | 97 | Labels are written with a lowercase word followed by colon. 98 | 99 | The assembly language supports the following instructions: 100 | * SET Rx C: Assigns the value C to register Rx. 101 | * ADD Rx Ry Rz: Adds the values of Ry and Rz and stores the result in Rx. 102 | * (similarly for SUB, MUL, DIV, MOD) 103 | * EQ Rx Ry: Sets the flag to True if Rx and Ry are equal, False otherwise. 104 | * (similarly for NEQ, LT (Rx < Ry), LTE, GT, GTE) 105 | * INC/DEC Rx: Increments/Decrements the value of Rx by one. 106 | * JMP L: Jumps to label L unconditionally. 107 | * JT/JF (jump if true / jump if false) L: Jumps to label L if the flag is set or not set. 108 | * LOAD Rx M: Loads the value at memory address M into register Rx. 109 | * STORE Rx M: Stores the value of register Rx into memory address M. 110 | * HCF: Stops the program (with pizzazz) 111 | 112 | Given this assembly language, write a program that {}""" 113 | 114 | def expect(answer): 115 | def evaluate(code): 116 | try: 117 | emulator = AssemblyEmulator(code) 118 | emulator.run() 119 | except Exception as e: 120 | print("EXCEPTION", e) 121 | return False, str(e) 122 | 123 | return answer(emulator.memory), "" 124 | return evaluate 125 | 126 | TestNewAssemblySquareNumbers = question.format("puts the first 20 square (1, 4, 9...) numbers in memory cells 0..19") >> LLMRun() >> ExtractCode(lang="a new assembly language I just made up") >> PyFunc(expect(lambda m: m[:20] == [i*i for i in range(1,21)])) 127 | 128 | TestNewAssemblyPrimeNumbers = question.format("puts the first 20 prime numbers in memory cells 0..19") >> LLMRun() >> ExtractCode(lang="a new assembly language I just made up") >> PyFunc(expect(lambda m: m[:20] == [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71])) 129 | 130 | 131 | if __name__ == "__main__": 132 | print(run_test(TestNewAssemblySquareNumbers)) 133 | 134 | 135 | -------------------------------------------------------------------------------- /tests/extract_references.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can extract paper titles from a block of text." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = '''Extract a list of the titles of the papers from the following list of references. 8 | Start your response with 9 | 10 | ```json 11 | [title_1, title_2, ...] 12 | ``` 13 | 14 | Here's the block of text: 15 | 16 | A Suffix Arrays [45] SHOKRI, R., STRONATI, M., SONG, C., AND 17 | A suffix of length k of a string x are the last k characters (or, SHMATIKOV, V. Membership inference attacks against 18 | tokens) of this string, i.e,. x[−k:] machine learning models. In IEEE Symposium on 19 | . If we want to know: “was Security and Privacy (2017). 20 | 0 100 200 300 [46] SOLDAINI, L. AI2 Dolma: 3 trillion token open corpus 21 | length of k-gram for language model pretraining, 2023.
22 | 104 [47] SOMEPALLI, G., SINGLA, V., GOLDBLUM, M., GEIPING, J., AND GOLDSTEIN, T. Diffusion art or digital 23 | 105 forgery? Investigating data replication in diffusion models. In CVPR (2023). 24 | 106 [48] SOUTHWOOD, T. R. E., AND HENDERSON, P. A. Ecological methods. John Wiley & Sons, 2009. 25 | # generated kgrams [49] TOUVRON, H., LAVRIL, T., IZACARD, G., MARTINET, X., LACHAUX, M.-A., LACROIX, T., ROZIÈRE, B., GOYAL, 26 | in training data N., HAMBRO, E., AZHAR, F., RODRIGUEZ, A., JOULIN, A., GRAVE, E., AND LAMPLE, 27 | Figure 14: The suffix length threshold k significantly impacts G. LLaMA: Open and Efficient Foundation Language 28 | the rate of data determined to be memorized. We set k = 50. Models, 2023. 29 | x [50] TOUVRON, H., MARTIN, L., STONE, K., ALBERT, P., 30 | ′ ALMAHAIRI, A., BABAEI, Y., BASHLYKOV, N., BATRA, S., BHARGAVA, P., BHOSALE, S., ET AL. LLaMA 31 | [−k:] 2: Open foundation and fine-tuned chat models. arXiv 32 | in x”, then we would have to do an O(n) search checking preprint arXiv:2307.09288 (2023). 33 | all suffixes of x. This linear scan is expensive if x is large, [51] TTI. Introducing Falcon 180b. 34 | as it is in training large language models, often terabytes in [52] YEOM, S., GIACOMELLI, I., FREDRIKSON, M., AND 35 | size. Instead, a suffix array will enable us to do this search JHA, S. Privacy risk in machine learning: Analyzing 36 | efficiently in O(logn) time. the connection to overfitting. In IEEE CSF (2018). 37 | A suffix array s over a dataset X, denoted as s(X) is a [53] ZELTERMAN, D. Smooth nonparametric estimation of 38 | data structure that indexes all suffixes of this string in a the quantile function. Journal of statistical planning 39 | lexicographically-sorted ordering. This sorting, as we will and inference 26, 3 (1990), 339–352. 40 | see, is important as it enables efficient binary searches for a [54] ZHANG, S., ROLLER, S., GOYAL, N., ARTETXE, M., 41 | particular substring/suffix. CHEN, M., CHEN, S., DEWAN, C., DIAB, M., LI, X., 42 | In the simplest form, we can consider the suffix array of a LIN, X. V., MIHAYLOV, T., OTT, M., SHLEIFER, S., 43 | word, e.g., x =“banana”. The following is the set of all suffixes SHUSTER, K., SIMIG, D., KOURA, P. S., SRIDHAR, 44 | as obtained by traversing the string backwards and keeping only A., WANG, T., AND ZETTLEMOYER, L. Opt: Open 45 | unique suffixes, in this case, all suffixes: {“a”, “na”, pre-trained transformer language models, 2022. 46 | “ana”, “nana”, “ anana”, “banana”}, which are represented by [55] ZIEGLER, A. Github Copilot research recitation, 2021. 47 | the indices s = {5,4,3,2,1,0}. In this form, we still require [56] ZOU, A., WANG, Z., KOLTER, J. Z., AND FREDRIKSON, M. Universal and transferable adversarial 48 | an O(n) search as there is no ordering. However, a suffix array attacks on aligned language models. arXiv preprint 49 | will store these suffixes in a lexicographically sorted ordering. arXiv:2307.15043 (2023). 50 | ''' 51 | 52 | 53 | answer = set([ 54 | "membership inference attacks against machine learning models", 55 | "ai2 dolma: 3 trillion token open corpus for language model pretraining", 56 | "diffusion art or digital forgery? 
investigating data replication in diffusion models", 57 | "ecological methods", 58 | "llama: open and efficient foundation language models", 59 | "llama 2: open foundation and fine-tuned chat models", 60 | "introducing falcon 180b", 61 | "privacy risk in machine learning: analyzing the connection to overfitting", 62 | "smooth nonparametric estimation of the quantile function", 63 | "opt: open pre-trained transformer language models", 64 | "github copilot research recitation", 65 | "universal and transferable adversarial attacks on aligned language models", 66 | ]) 67 | 68 | def check_ok(dat): 69 | import json 70 | dat = dat.replace("```json", "```") 71 | dat = dat.split("```")[1] 72 | dat = dat.lower().replace(".","") 73 | return set(json.loads(dat)) == answer 74 | 75 | 76 | TestExtractRef = question >> LLMRun() >> PyFunc(check_ok) 77 | 78 | 79 | if __name__ == "__main__": 80 | print(run_test(TestExtractRef)) 81 | 82 | 83 | 84 | 85 | --------------------------------------------------------------------------------