├── llms
│   ├── __init__.py
│   ├── llama_model.py
│   ├── openrouter_model.py
│   ├── groq_model.py
│   ├── anthropic_model.py
│   ├── gemini_model.py
│   ├── cohere_model.py
│   ├── mistral_model.py
│   ├── moonshot_model.py
│   ├── vertexai_model.py
│   └── openai_model.py
├── tests
│   ├── __init__.py
│   ├── what_is_slice_stop.py
│   ├── what_is_formatfloat.py
│   ├── latex_mini_caps.py
│   ├── what_is_blockbyorb.py
│   ├── which_package_sbox.py
│   ├── tokenizer_vocab.py
│   ├── knowledge_llama.py
│   ├── emacs_lisp_silence_cmd.py
│   ├── db9_pinout.py
│   ├── unit_conversion_math.py
│   ├── do_uudecode.py
│   ├── identify_uuencode.py
│   ├── what_is_automodel.py
│   ├── latex_protect.py
│   ├── what_is_inv.py
│   ├── print_hello.py
│   ├── save_expired_html.py
│   ├── latex_redef.py
│   ├── docker_cuda.py
│   ├── freecad_construction.py
│   ├── implement_crc32.py
│   ├── what_is_oraw.py
│   ├── base64_qanda.py
│   ├── python_to_c_loop_update.py
│   ├── program_pipes_python.py
│   ├── program_pipes_cpp.py
│   ├── gitignore_anywhere.py
│   ├── strided_trick.py
│   ├── shorten_python_if_missing.py
│   ├── vague_loop_format.py
│   ├── upython_mqtt.py
│   ├── decompile_py_simple.py
│   ├── regex_remove_5_words.py
│   ├── explain_vbroadcast.py
│   ├── numpy_ix.py
│   ├── simulate_torch_grad.py
│   ├── numba_levenshtein.py
│   ├── python_jpeg.py
│   ├── dedent_code_fn.py
│   ├── program_sqrt.py
│   ├── convert_to_c_simple.py
│   ├── bash_find_dont_contain.py
│   ├── jax_onehot.py
│   ├── generate_string_moves.py
│   ├── python_chess_game_prefix.py
│   ├── aws_ipv6.py
│   ├── hallucinate_reference.py
│   ├── explain_code_prime2.py
│   ├── fix_node_error.py
│   ├── bash_renamer.py
│   ├── draw_flag_bmp.py
│   ├── baking_help.py
│   ├── explain_code_prime.py
│   ├── easy_parser_generator.py
│   ├── bash_list_files_by_size_mod_ten.py
│   ├── numpy_advanced_index.py
│   ├── make_sqlite_table.py
│   ├── python_traceback.py
│   ├── bash_convert_not_overwrite.py
│   ├── torch_to_jnp.py
│   ├── fix_tokenizer.py
│   ├── unholy_matrix.py
│   ├── change_filetype.py
│   ├── fix_append_vs_extend.py
│   ├── make_tree_from_text.py
│   ├── fix_json.py
│   ├── webgl_triangle.py
│   ├── c_weird_expression.py
│   ├── date_news_headlines.py
│   ├── flexbox_webpage.py
│   ├── basic_git_setup.py
│   ├── jnp_nn_bugfix.py
│   ├── why_broken_flask_extra_brace.py
│   ├── faster_l2_diff.py
│   ├── call_rust_from_python.py
│   ├── convert_dp_to_iterative.py
│   ├── vectorize_small_update.py
│   ├── vague_sum_data.py
│   ├── play_20_questions.py
│   ├── fancy_sql_process.py
│   ├── print_hello_poly.py
│   ├── extract_emails.py
│   ├── rewrite_mac_crypto.py
│   ├── rust_word_count.py
│   ├── py_image_resize.py
│   ├── gol_rle_decode.py
│   ├── fix_torch_backward.py
│   ├── merge_into_16.py
│   ├── make_json.py
│   ├── emoji_movies.py
│   ├── git_cherrypick.py
│   ├── python_parallel_wordcount.py
│   ├── debug_innerhtml_eventlistener.py
│   ├── convert_to_c.py
│   ├── fix_with_patch.py
│   ├── shorten_c_function_hard.py
│   ├── implement_assembly_interpreter_by_example.py
│   ├── whisper_merge.py
│   ├── rust_parallel_wordcount.py
│   ├── debug_broken_code_parcount.py
│   ├── fix_threading_issue.py
│   ├── shorten_c_function.py
│   ├── decompile_py_mid.py
│   ├── explore_sql_db.py
│   ├── basic_code_understanding.py
│   ├── find_bug_in_paper.py
│   ├── git_merge.py
│   ├── implement_assembly_interpreter.py
│   ├── data_table_processing.py
│   ├── program_in_new_assembly.py
│   └── extract_references.py
├── setup.sh
├── .gitignore
├── requirements.txt
├── requirements-extra.txt
├── config.json.example
├── Dockerfile
└── llm.py
/llms/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | --------------------------------------------------------------------------------
/tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | --------------------------------------------------------------------------------
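Every test module in this listing builds its check as a left-to-right pipeline, for example `question >> LLMRun() >> SubstringEvaluator(...)`. The `evaluator.py` module that defines these stages is not part of this listing, so the sketch below is only a minimal illustration of the chaining idiom, assuming each stage overloads `>>` and hands its output to the next stage. The class names are borrowed from the tests, but the bodies are simplified stand-ins rather than the repository's real implementations, and the real module also combines evaluators with `&`, `|` and `~`, which is omitted here.

# Illustrative sketch of the `>>` idiom; not a file from this repository.
class Node:
    """One stage of a test pipeline; `a >> b` feeds a's output into b."""
    def __rshift__(self, other):
        return Chain(self, other)

    def __rrshift__(self, other):
        # Lets a plain question string start a chain, as in `question >> LLMRun()`.
        return Chain(Constant(other), self)

    def run(self, value):
        raise NotImplementedError

class Constant(Node):
    """Wraps a fixed value, such as the question text."""
    def __init__(self, value):
        self.value = value

    def run(self, value=None):
        return self.value

class Chain(Node):
    """Runs the left stage, then hands its result to the right stage."""
    def __init__(self, left, right):
        self.left, self.right = left, right

    def run(self, value=None):
        return self.right.run(self.left.run(value))

class SubstringEvaluator(Node):
    """Passes when the expected substring occurs in the upstream text."""
    def __init__(self, needle, lower=False):
        self.needle, self.lower = needle, lower

    def run(self, value):
        hay = value.lower() if self.lower else value
        want = self.needle.lower() if self.lower else self.needle
        return want in hay

# Example: ("some text" >> SubstringEvaluator("some")).run(None) evaluates to True.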
/setup.sh: -------------------------------------------------------------------------------- 1 | docker build -t ubuntu-python-app . -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .API_KEY 2 | config.json 3 | myenv 4 | tmp/* -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | docker 2 | numpy 3 | Pillow 4 | openai 5 | markdown 6 | pygments 7 | selenium 8 | -------------------------------------------------------------------------------- /requirements-extra.txt: -------------------------------------------------------------------------------- 1 | llama-cpp-python 2 | anthropic 3 | google-cloud-aiplatform 4 | cohere 5 | google-generativeai 6 | groq -------------------------------------------------------------------------------- /tests/what_is_slice_stop.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case checks if the model can say how to properly get the end of a slice." 4 | 5 | TAGS = ['explain', 'python'] 6 | 7 | question = """ 8 | How do I get sice(0, 5).end? 9 | """ 10 | 11 | TestWhatIsSlice = question >> LLMRun() >> SubstringEvaluator(".stop") 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestWhatIsSlice)) 15 | -------------------------------------------------------------------------------- /tests/what_is_formatfloat.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case checks if models can format f strings with floats." 4 | 5 | TAGS = ['explain', 'python'] 6 | 7 | question = """ 8 | Python format {sdf} to 3 decimal digits 9 | """ 10 | 11 | TestWhatIsFloatFormat = question >> LLMRun() >> SubstringEvaluator(":.3f}") 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestWhatIsFloatFormat)) 15 | -------------------------------------------------------------------------------- /tests/latex_mini_caps.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model knows some latex macros explained poorly" 4 | 5 | TAGS = ['explain'] 6 | 7 | question = "Latex format the word METHOD like Method but all caps with the other letters lower" 8 | 9 | TextLatexLowerCaps = question >> LLMRun() >> SubstringEvaluator("textsc") 10 | 11 | if __name__ == "__main__": 12 | print(run_test(TextLatexLowerCaps)) 13 | 14 | 15 | -------------------------------------------------------------------------------- /tests/what_is_blockbyorb.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model knows what ERR_BLOCKED_BY_ORB means." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | What does "ERR_BLOCKED_BY_ORB" mean? 
9 | """ 10 | 11 | TestWhatIsBlockByOrb = question >> LLMRun() >> SubstringEvaluator("opaque resource blocking", lower=True) 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestWhatIsBlockByOrb)) 15 | 16 | -------------------------------------------------------------------------------- /tests/which_package_sbox.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case checks if the model knows what latex package to import for the Sbox environment to work." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | What latex package has the Sbox env? 9 | """ 10 | 11 | TestWhereIsSbox = question >> LLMRun() >> SubstringEvaluator("fancybox") 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestWhereIsSbox)) 15 | -------------------------------------------------------------------------------- /tests/tokenizer_vocab.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case is designed to check if the model can print out the tokens in a AutoTokenizer's vocabulary." 4 | 5 | TAGS = ['explain', 'python'] 6 | 7 | question = """ 8 | How do I get all tokens in a AutoTokenizer? 9 | """ 10 | 11 | TestGetVocab = question >> LLMRun() >> SubstringEvaluator("get_vocab") 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestGetVocab)) 15 | 16 | -------------------------------------------------------------------------------- /tests/knowledge_llama.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test the knowledge cutoff of the model to see if it knows the LLAMA-2 hidden dimension size." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | What is the hidden dimension of LLAMA-2 70b? 9 | """ 10 | 11 | TestLlamaKnowledge = question >> LLMRun() >> (SubstringEvaluator("8192") | SubstringEvaluator("8,192")) 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestLlamaKnowledge)) 15 | -------------------------------------------------------------------------------- /tests/emacs_lisp_silence_cmd.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can understand a vague error for an emacs lisp question." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | In emacs lisp how do I silence "Process python-process finished" 9 | """ 10 | 11 | TestLispSilencePython = question >> LLMRun() >> SubstringEvaluator("set-process-sentinel") 12 | 13 | 14 | if __name__ == "__main__": 15 | print(run_test(TestLispSilencePython)) 16 | -------------------------------------------------------------------------------- /tests/db9_pinout.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model knows about old computer ports when prompted ambiguously." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | What port has 5 pins on the top and 4 on the bottom? 
9 | """ 10 | 11 | TestDB9 = question >> LLMRun() >> (SubstringEvaluator("DB-9") | SubstringEvaluator("DB9") | SubstringEvaluator("DE-9") | SubstringEvaluator("DE9")) 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestDB9)) 15 | -------------------------------------------------------------------------------- /tests/unit_conversion_math.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can do basic math with some EE equations." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = "I measure a voltage drop of 30mv over a 2.5 ohm resistor. How long will a 3Ah battery last? Write your ansser 'XXX hours'" 8 | 9 | TestUnitConversion = question >> LLMRun() >> RegexEvaluator("250.{,10}hours") 10 | 11 | if __name__ == "__main__": 12 | print(run_test(TestUnitConversion)) 13 | 14 | 15 | -------------------------------------------------------------------------------- /tests/do_uudecode.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can successfully uudecode a given string." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | uudecode this string: 9 | 10 | H,3DR-S<-"C(X,C> LLMRun() >> (SubstringEvaluator("60826") & SubstringEvaluator("20715") & SubstringEvaluator("19277")) 15 | 16 | 17 | if __name__ == "__main__": 18 | print(run_test(TestUUDecode)) 19 | 20 | -------------------------------------------------------------------------------- /tests/identify_uuencode.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can correctly identify a block of text is uuencoded." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | What encoding scheme does it look like this is encoded with? 9 | 10 | H,3DR-S<-"C(X,C> LLMRun() >> SubstringEvaluator("uuencode", lower=True) 15 | 16 | 17 | if __name__ == "__main__": 18 | print(run_test(TestIsUU)) 19 | 20 | -------------------------------------------------------------------------------- /tests/what_is_automodel.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can interpret vague questions and will respond with the answer I want, not the answer that's easy to find." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | What do I do to fix AutoModel.from_pretrained to make it auto model with lm head 9 | """ 10 | 11 | TestWhatIsAutoModel = question >> LLMRun() >> SubstringEvaluator("AutoModelForCausalLM") 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestWhatIsAutoModel)) 15 | -------------------------------------------------------------------------------- /tests/latex_protect.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can fix a latex newline error in a caption" 4 | 5 | TAGS = ['explain'] 6 | 7 | question = r""" 8 | In latex when I force a newline in a caption with \\ it gives an error saying \Hy@tempa has an extra } runaway argument? 
9 | """ 10 | 11 | # \newline doesn't work the same I need \\ 12 | TestLatexNewline = question >> LLMRun() >> SubstringEvaluator("\\protect") 13 | 14 | if __name__ == "__main__": 15 | print(run_test(TestLatexNewline)) 16 | -------------------------------------------------------------------------------- /tests/what_is_inv.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case is designed to check if the model can correctly identify the Python operator used for the tilde (~) symbol." 4 | 5 | TAGS = ['explain', 'python'] 6 | 7 | question = """ 8 | In python what __thing__ do I use for ~, kind of like how __add__ is for +? 9 | """ 10 | 11 | TestWhatIsInv = question >> LLMRun() >> (SubstringEvaluator("__inv__") | SubstringEvaluator("__invert__")) 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestWhatIsInv)) 15 | -------------------------------------------------------------------------------- /tests/print_hello.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate a basic python program that prints \"hello world\"." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = 'Write a python program that prints the string "hello world" and tell me how it works in a sentence' 8 | 9 | answer = "hello world" 10 | 11 | TestPrintHello = question >> LLMRun() >> ExtractCode(keep_main=True) >> PythonRun() >> SubstringEvaluator(answer) 12 | 13 | 14 | if __name__ == "__main__": 15 | print(run_test(TestPrintHello)) 16 | 17 | -------------------------------------------------------------------------------- /tests/save_expired_html.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model knows how to get the HTML for the entire webpage; not just the body." 4 | 5 | TAGS = ['explain', 'html'] 6 | 7 | question = """ 8 | I have a webpage that's open in a chrome browser but the page no longer exists. What javascript should I type in the console to get a copy of the entire page html? 9 | """ 10 | 11 | TestRecoverExpiredPage = question >> LLMRun() >> SubstringEvaluator("outerHTML") 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestRecoverExpiredPage)) 15 | 16 | -------------------------------------------------------------------------------- /tests/latex_redef.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can use latex \\renewcommand, and do a bit more than what I actually asked." 
4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | Redfine \\paragraph to be \\bigskip\\textbf 9 | """ 10 | 11 | TestLatexRedef = question >> LLMRun() >> ((SubstringEvaluator("\\renewcommand{\\paragraph}") | SubstringEvaluator("\\renewcommand\\paragraph")) & SubstringEvaluator("\\bigskip") & SubstringEvaluator("\\textbf") & SubstringEvaluator("\\noindent")) 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestLatexRedef)) 15 | -------------------------------------------------------------------------------- /tests/docker_cuda.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case checks if the model can debug a docker cuda error" 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | nicholas@server:~/project$ sudo docker start myproject 9 | [sudo] password for nicholas: 10 | Error response from daemon: could not select device driver "" with capabilities: [[gpu]] 11 | Error: failed to start containers: myproject 12 | """ 13 | 14 | TestFixDockerCuda = question >> LLMRun() >> (SubstringEvaluator("nvidia-container-toolkit") | SubstringEvaluator("nvidia-docker2")) 15 | 16 | if __name__ == "__main__": 17 | print(run_test(TestFixDockerCuda)) 18 | -------------------------------------------------------------------------------- /tests/freecad_construction.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model understands a rambling question about how to make construction circle in FreeCAD." 4 | 5 | TAGS = ['explain', 'fun'] 6 | 7 | question = """ 8 | In freecad I want to have a circle but only like for reference not to actually be a circle but just so I can line up other lines to make a hexagon for example, how do I do that 9 | """ 10 | 11 | TestFreeCADCircle = question >> LLMRun() >> (SubstringEvaluator("construction geometry") | SubstringEvaluator("construction circle")) 12 | 13 | 14 | if __name__ == "__main__": 15 | print(run_test(TestFreeCADCircle)) 16 | -------------------------------------------------------------------------------- /tests/implement_crc32.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model understands the CRC-32 spec well enough to implement it." 4 | 5 | TAGS = ['code', 'c'] 6 | import zlib 7 | 8 | question = 'Write a c program that calculates the crc32 hash of the string "correct horse battery staple" and prints it to stdout. Dont call any builtin crc32 functions.' 
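# For reference, the expected answer below is derived from zlib.crc32, i.e. the standard
# CRC-32 used by zip and PNG: reflected polynomial 0xEDB88320, initial value 0xFFFFFFFF,
# final XOR with 0xFFFFFFFF. A minimal table-free Python sketch of that variant follows,
# purely as an aid for comparing against a generated C answer; the harness itself only
# uses zlib.
def crc32_reference(data: bytes) -> int:
    crc = 0xFFFFFFFF
    for byte in data:
        crc ^= byte
        for _ in range(8):
            # Test the low bit before shifting; XOR in the polynomial when it was set.
            crc = (crc >> 1) ^ (0xEDB88320 if crc & 1 else 0)
    return crc ^ 0xFFFFFFFF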
9 | 10 | answer = "%x"%zlib.crc32(b'correct horse battery staple') 11 | 12 | TestCRC32 = question >> LLMRun() >> ExtractCode(keep_main=True) >> Echo() >> CRun() >> SubstringEvaluator(answer, lower=True) 13 | 14 | 15 | if __name__ == "__main__": 16 | print(run_test(TestCRC32)) 17 | 18 | -------------------------------------------------------------------------------- /llms/llama_model.py: -------------------------------------------------------------------------------- 1 | from llama_cpp import Llama 2 | 3 | 4 | class LLAMAModel: 5 | def __init__(self, path): 6 | self.llm = Llama(model_path=path, chat_format="llama-2", n_ctx=1524) 7 | 8 | def make_request(self, conversation, add_image=None, logit_bias=None, max_tokens=None, skip_cache=False): 9 | conversation = [{"role": "user" if i%2 == 0 else "assistant", "content": content} for i,content in enumerate(conversation)] 10 | print("Start chat") 11 | out = self.llm.create_chat_completion( 12 | messages = conversation 13 | ) 14 | print("End chat") 15 | return out['choices'][0]['message']['content'] 16 | -------------------------------------------------------------------------------- /tests/what_is_oraw.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case checks if the model knows lpr commands." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | I have a .ps file that I want to print from my mac. I'm trying to use lpr. But every time I print it all I get is the actual bytes just printed with like hundreds of pages and this is not what I want. I want the printer to just print the ps directly. What command should I use to make this work? 9 | """ 10 | 11 | TestWhatIsLPR = question >> LLMRun() >> (SubstringEvaluator("-o raw") | SubstringEvaluator("-l ")) 12 | 13 | if __name__ == "__main__": 14 | print(run_test(TestWhatIsLPR)) 15 | 16 | -------------------------------------------------------------------------------- /config.json.example: -------------------------------------------------------------------------------- 1 | { 2 | "container": "podman", 3 | "hparams": { 4 | "temperature": 0.7 5 | }, 6 | "llms": { 7 | "vertexai": { 8 | "project_id": "TODO" 9 | }, 10 | "openai": { 11 | "api_key": "TODO" 12 | }, 13 | "mistral": { 14 | "api_key": "TODO" 15 | }, 16 | "cohere": { 17 | "api_key": "TODO" 18 | }, 19 | "anthropic": { 20 | "api_key": "TODO" 21 | }, 22 | "moonshot": { 23 | "api_key": "TODO" 24 | }, 25 | "groq": { 26 | "api_key": "TODO" 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /tests/base64_qanda.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model will follow instructions to the letter without lots of cajoling. Thinking in base64 is also interesting." 4 | 5 | TAGS = ['explain', 'fun'] 6 | import base64 7 | 8 | question = """ 9 | Read this base64 string, think about the answer, and type just the answer in base64. You're entire answer must be base64. 
10 | 11 | V2hhdCBpcyBvbmUgeWVhciBhZnRlciB3aGVuIFByZXNpZGVudCBnZW9yZ2Ugd2FzaGluZ3RvbiBib3JuPw== 12 | """ 13 | 14 | TestBase64Thought = question >> LLMRun() >> PyFunc(lambda x: base64.b64decode(x).decode('utf-8')) >> SubstringEvaluator("1733") 15 | 16 | if __name__ == "__main__": 17 | print(run_test(TestBase64Thought)) 18 | 19 | -------------------------------------------------------------------------------- /tests/python_to_c_loop_update.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can convert a python program to c, with a loop that makes it difficult." 4 | 5 | TAGS = ['code', 'python', 'c'] 6 | 7 | question = """ 8 | Convert this to c 9 | 10 | def foo(x): 11 | sum = 0 12 | for i in range(x): 13 | x += i 14 | sum += x 15 | return sum 16 | """ 17 | 18 | test_case, answer = make_c_test([("foo(10)", "265")]) 19 | 20 | 21 | TestPythonToCLoopUpdate = question >> LLMRun() >> ExtractCode() >> CRun(test_case) >> SubstringEvaluator(answer) 22 | 23 | if __name__ == "__main__": 24 | print(run_test(TestPythonToCLoopUpdate)) 25 | -------------------------------------------------------------------------------- /tests/program_pipes_python.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate a python program that defines dataflow DSL." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Write a python class Const, Square, Print so that I can write, for example, 9 | 10 | Cost(5) >> Square() >> Inc() >> Inc() >> Print() 11 | 12 | and the resulting python program will print 27 to the screen. 13 | """ 14 | 15 | test_case, answer = make_python_test([("Const(2) >> Square() >> Inc() >> Square() >> Square() >> Print()", "0")]) 16 | 17 | TestMakeShiftOp = question >> LLMRun() >> ExtractCode() >> PythonRun(test_case) >> SubstringEvaluator("625") 18 | 19 | 20 | if __name__ == "__main__": 21 | print(run_test(TestMakeShiftOp)) 22 | -------------------------------------------------------------------------------- /tests/program_pipes_cpp.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate a C++ program that defines dataflow DSL." 4 | 5 | TAGS = ['code', 'c'] 6 | 7 | question = """ 8 | Write a C++ class Const, Square, Print so that I can write, for example, 9 | 10 | Cost(5) >> Square() >> Inc() >> Inc() >> Print() 11 | 12 | and the resulting python program will print 27 to the screen. 13 | """ 14 | 15 | test_case = """int main() { 16 | Const(2) >> Square() >> Inc() >> Square() >> Square() >> Print(); 17 | return 0; 18 | }""" 19 | 20 | 21 | TestMakeShiftOpC = question >> LLMRun() >> ExtractCode() >> CppRun(test_case) >> SubstringEvaluator("625") 22 | 23 | 24 | if __name__ == "__main__": 25 | print(run_test(TestMakeShiftOpC)) 26 | -------------------------------------------------------------------------------- /tests/gitignore_anywhere.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can understand and interpret a request to gitignore any file called \"foo/.KEYFILE\" regardless of its location in a repository." 
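# The two checks below expect two different gitignore patterns:
#   **/foo/.KEYFILE      .KEYFILE sitting directly inside any directory named foo
#   **/foo/**/.KEYFILE   .KEYFILE at any depth underneath any directory named foo
# The leading **/ is what lets foo itself appear anywhere in the repository.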
4 | 5 | TAGS = ['explain'] 6 | 7 | question1 = """ 8 | gitignore any file called .KEYFILE directly inside a directory foo, which itself is anywhere 9 | """ 10 | 11 | TestWhatIsStarStar = question1 >> LLMRun() >> SubstringEvaluator("**/foo/.KEYFILE") 12 | 13 | question2 = """ 14 | gitignore any file called .KEYFILE anywhere under a directory foo, which itself is anywhere 15 | """ 16 | 17 | TestWhatIsStarStarB = question2 >> LLMRun() >> SubstringEvaluator("**/foo/**/.KEYFILE") 18 | 19 | if __name__ == "__main__": 20 | print(run_test(TestWhatIsStarStarB)) 21 | -------------------------------------------------------------------------------- /tests/strided_trick.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model knows how to use the strided trick with numpy." 4 | 5 | TAGS = ['code', 'python', 'performance'] 6 | 7 | question = """ 8 | Write a function strides() that given an array of tokens T as numpy array, using strided trick, returns [T[0:50], T[10:60], T[20: 70] ... ] 9 | """ 10 | 11 | test_case, answer = make_python_test([("strides(np.arange(100)).shape", "(6, 50)"), 12 | ("strides(np.arange(100)).sum()", "14850")]) 13 | 14 | TestProgramStrided = question >> LLMRun() >> ExtractCode() >> ((PythonRun(test_case) >> (SubstringEvaluator(answer))) & SubstringEvaluator("as_strided")) 15 | 16 | if __name__ == "__main__": 17 | print(run_test(TestProgramStrided)) 18 | -------------------------------------------------------------------------------- /tests/shorten_python_if_missing.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can shorten a line of python with an equal line." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Shorten this python code. Keep the name the same. 9 | ``` 10 | def foo(arr, keys): 11 | return [[x[y] if y in x else -5 for y in keys] for x in arr] 12 | ``` 13 | """ 14 | 15 | test_case, answer = make_python_test([("foo([{1: 2, 3: 4, 5: 6}], [1, 2, 3, 4])", "[[2, -5, 4, -5]]")]) 16 | 17 | TestShortenPyGet = question >> LLMRun() >> (SubstringEvaluator(".get") & (ExtractCode() >> PythonRun(test_case) >> SubstringEvaluator(answer))) 18 | 19 | 20 | 21 | if __name__ == "__main__": 22 | print(run_test(TestShortenPyGet)) 23 | 24 | -------------------------------------------------------------------------------- /tests/vague_loop_format.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can follow vague instructions for how to print IDs following an example." 
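# The check assembled below chains one SubstringEvaluator per id with &: months are
# zero-padded to two digits (01-12) and ids to three digits (001-003), so a passing
# program has to print all 36 ids, 2401_001 through 2412_003.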
4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Write a python program that for ids like 9 | 10 | 2401_002 11 | 12 | For months 1..12 and ids 1..3 13 | 14 | so like 24{month}_{id} 15 | 16 | print all the ids 17 | """ 18 | 19 | evaluator = SubstringEvaluator("2401_001") 20 | for month in range(1, 13): 21 | for id in range(1, 4): 22 | evaluator &= SubstringEvaluator(f"24{month:02}_{id:03}") 23 | 24 | 25 | TestVagueLoopFormat = question >> LLMRun() >> ExtractCode(keep_main=True) >> PythonRun() >> evaluator 26 | 27 | 28 | if __name__ == "__main__": 29 | print(run_test(TestVagueLoopFormat)) 30 | 31 | 32 | -------------------------------------------------------------------------------- /tests/upython_mqtt.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can write upython code with an obscure module." 4 | 5 | TAGS = ['python', 'code'] 6 | 7 | question = """ 8 | Write a python program that I can use to run on my raspberry pi pico w zero that will check for mqtt messages. Don't block, and blink the LED on every loop iteration so I know it's polling. When it gets a message then set the LED solid for a second. 9 | """ 10 | 11 | # crude testing because there isn't an actual pico w to run this on... 12 | TestUPythonMQTT = question >> LLMRun() >> (SubstringEvaluator("check_msg") & SubstringEvaluator("MQTTClient") & SubstringEvaluator("set_callback") & SubstringEvaluator('WLAN') & SubstringEvaluator("STA_IF")) 13 | 14 | if __name__ == "__main__": 15 | print(run_test(TestUPythonMQTT)) 16 | 17 | -------------------------------------------------------------------------------- /tests/decompile_py_simple.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can disassemble a simple Python function from its bytecode." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Disassemble the following python into a function called foo: 9 | 10 | 56 0 LOAD_FAST 0 (a) 11 | 2 LOAD_FAST 1 (b) 12 | 4 LOAD_FAST 0 (a) 13 | 6 BINARY_MULTIPLY 14 | 8 BINARY_ADD 15 | 10 RETURN_VALUE 16 | 17 | """ 18 | 19 | test_case, answer = make_python_test([("foo(5, 7)", "5+7*5")]) 20 | 21 | 22 | TestDisas1 = question >> LLMRun() >> ExtractCode(keep_main=False) >> PythonRun(test_case) >> SubstringEvaluator(answer) 23 | 24 | 25 | if __name__ == "__main__": 26 | print(run_test(TestDisas1)) 27 | -------------------------------------------------------------------------------- /tests/regex_remove_5_words.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write a Python function with a straightforward regex." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Write a function called `match` in Python that uses a regex to find any word repeated 5 times or more. Return that word. If no word is repeated 5 times or more, return an empty string. 
9 | """ 10 | 11 | test_case, answer = make_python_test([("match('hi hi hi hi hi')", '"hi"'), 12 | ("match('hi hi hi hi')", '""'), 13 | ("match('hello this is is is is is a test')", '"is"')]) 14 | 15 | 16 | TestRegex = question >> LLMRun() >> ExtractCode(keep_main=False) >> PythonRun(test_case) >> SubstringEvaluator(answer) 17 | 18 | if __name__ == "__main__": 19 | print(run_test(TestRegex)) 20 | -------------------------------------------------------------------------------- /tests/explain_vbroadcast.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can correctly explain what the VPBROADCASTB instruction does." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | What does the VPBROADCASTB instruction do? 9 | """ 10 | 11 | evaluation = """ 12 | Below is a student's answer to a question about what a particular instruction does: 13 | 14 | Does the student's say something similar to "Load integer data from the source operand (the second operand) and broadcast to all elements of the destination operand (the first operand)."? 15 | 16 | If the answer is correct say "The student passes" otherwise "The student fails". 17 | """ 18 | 19 | TestExplainBroadcast = question >> LLMRun() >> LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("student passes") 20 | 21 | 22 | if __name__ == "__main__": 23 | print(run_test(TestExplainBroadcast)) 24 | -------------------------------------------------------------------------------- /tests/numpy_ix.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can identify the _ix function as a method for simplifying some code." 4 | 5 | TAGS = ['explain', 'python'] 6 | 7 | question = '''I want to vectorize a function 8 | 9 | frame = np.zeros((out_h, out_w, 3), dtype=np.uint8) 10 | for i, val1 in enumerate(rows): 11 | for j, val2 in enumerate(cols): 12 | for j, val3 in enumerate(ch): 13 | # Assuming you want to store the pair as tuples in the matrix 14 | frame[i, j, k] = image[val1, val2, val3] 15 | 16 | out.write(np.array(frame)) 17 | 18 | with a simple numpy function that does something like this what is it called''' 19 | 20 | TestNumpyIx = question >> LLMRun() >> SubstringEvaluator("ix_") 21 | 22 | 23 | if __name__ == "__main__": 24 | print(run_test(TestNumpyIx)) 25 | 26 | -------------------------------------------------------------------------------- /tests/simulate_torch_grad.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case checks if the model can predict what the gradient of a variable is in PyTorch." 
4 | 5 | TAGS = ['explain', 'python'] 6 | 7 | question = """ 8 | What will this function print 9 | 10 | ``` 11 | def diff_round(x, decimals=1): 12 | scale_factor = (10 ** decimals) 13 | x = x * scale_factor 14 | diff = (1 + 1e-2) * x - torch.floor(x) 15 | x = x - diff + (torch.floor(x) + torch.where(diff >= 0.5, 1, 0)) 16 | x = x / scale_factor 17 | return x 18 | 19 | g = torch.tensor([.99, 1.54, 1.9], dtype=torch.float32, requires_grad=True) 20 | loss = torch.sum(diff_round(g, 1)) 21 | loss.backward() 22 | print(g.grad.sum()) 23 | ``` 24 | """ 25 | 26 | TestSimTorchGrad = question >> LLMRun() >> SubstringEvaluator("-0.03") 27 | 28 | if __name__ == "__main__": 29 | print(run_test(TestSimTorchGrad)) 30 | 31 | -------------------------------------------------------------------------------- /tests/numba_levenshtein.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate a numba implementation of the Levenshtein distance algorithm." 4 | 5 | TAGS = ['code', 'python', 'performance'] 6 | 7 | question = """ 8 | Write me a numba implementation of levensthin distance, taking two numpy int arrays. Call it levenshtein. Run it in nopython mode. Character swaps are cost 2. 9 | """ 10 | 11 | test_case, answer = make_python_test([("levenshtein(np.array([3, 5, 2, 4, 8, 7]), np.array([3, 4, 2, 4, 8, 7, 9]))", "3")]) 12 | 13 | 14 | TestProgramNumbaLev = question >> LLMRun() >> ExtractCode(keep_main=False) >> (((SubstringEvaluator("from numba") | SubstringEvaluator("import numba")) & (SubstringEvaluator("nopython") | SubstringEvaluator("njit"))) & PythonRun(test_case) >> SubstringEvaluator(answer)) 15 | 16 | if __name__ == "__main__": 17 | print(run_test(TestProgramNumbaLev)) 18 | -------------------------------------------------------------------------------- /tests/python_jpeg.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can write a program that directly writes a jpeg file. This requires precise understanding of the jpeg spec." 4 | 5 | TAGS = ['code', 'python', 'visual'] 6 | 7 | question = """ 8 | Write a python program that, manually, draws a red triangle on a white background and saves it as a 200x200 gif file. Do not use any graphics libraries. Write the gif byte by byte. Output the gif file to stdout. 9 | 10 | I will evaluate your program by running `python your_soln.py > /tmp/a.gif` and then view the output file. 11 | """ 12 | 13 | TestFlagDraw = question >> LLMRun() >> ExtractCode(keep_main=True) >> PythonRun(out_bytes=True) >> LLMVisionRun("Describe the shapes in this image and their color") >> (SubstringEvaluator("triangle") & SubstringEvaluator("red")) 14 | 15 | if __name__ == "__main__": 16 | print(run_test(TestFlagDraw)) 17 | -------------------------------------------------------------------------------- /tests/dedent_code_fn.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write a Python function that removes excess indentation from a given block of code." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Write a python function `dedent(str)` that takes as input a string of python code and de-dents it maximally. Lines that are entirely blank don't count. 
9 | """ 10 | 11 | test_case, answer = make_python_test([("""dedent(''' print("hello") 12 | print("world") 13 | 14 | if True: 15 | print("true") 16 | else: 17 | print("false")''')""", '''"""print("hello") 18 | print("world") 19 | 20 | if True: 21 | print("true") 22 | else: 23 | print("false")"""''')]) 24 | 25 | 26 | TestDedent = question >> LLMRun() >> ExtractCode(keep_main=True) >> PythonRun(test_case) >> SubstringEvaluator(answer) 27 | 28 | if __name__ == "__main__": 29 | print(run_test(TestDedent)) 30 | 31 | -------------------------------------------------------------------------------- /llms/openrouter_model.py: -------------------------------------------------------------------------------- 1 | 2 | from openai import OpenAI 3 | import json 4 | 5 | from llms.openai_model import OpenAIModel 6 | 7 | 8 | class OpenRouterModel(OpenAIModel): 9 | def __init__(self, name): 10 | config = json.load(open("config.json")) 11 | api_key = config["llms"]["openrouter"]["api_key"].strip() 12 | # Inspired by https://openrouter.ai/docs/quickstart 13 | self.client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=api_key) 14 | self.name = name[len("openrouter/") :] 15 | self.hparams = config["hparams"] 16 | self.hparams.update(config["llms"]["openrouter"].get("hparams") or {}) 17 | 18 | 19 | if __name__ == "__main__": 20 | # WARNING: must be run with PYTHONPATH=. otherwise there will be an import error 21 | # q = sys.stdin.read().strip() 22 | q = "hello there" 23 | print(q + ":", OpenRouterModel("openrouter/openai/o1-mini").make_request([q])) 24 | -------------------------------------------------------------------------------- /tests/program_sqrt.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can implement a sqrt function." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Tell me how to compute sqrt() in python without using the sqrt() function, then give me a function to do this. I want the answer as precise as possible. Call the function my_sqrt().. 9 | """ 10 | 11 | test_case, answer = make_python_test([("round(my_sqrt(12345), 1)", "111.1")]) 12 | 13 | 14 | TestProgramSqrt = question >> LLMRun() >> ExtractCode() >> ((LLMRun("Below is a student question who was supposed to implement square root from scratch. Did they cheat and just call sqrt directly? Think out loud, then finally answer either 'Yes they cheated' or 'No they did not cheat' ", llm=EVAL_LLM) >> ~SubstringEvaluator("they cheated")) & (PythonRun(test_case) >> SubstringEvaluator(answer))) 15 | 16 | 17 | if __name__ == "__main__": 18 | print(run_test(TestProgramSqrt)) 19 | 20 | -------------------------------------------------------------------------------- /tests/convert_to_c_simple.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can rewrite a very simple Python program into an equivalent C program." 4 | 5 | TAGS = ['code', 'c'] 6 | 7 | question = """ 8 | Here's a python program. Please rewrite it in c with the type signature float my_sqrt(float, float). 
9 | 10 | def my_sqrt(number, precision=0.0001): 11 | guess = number / 2 # Start with an initial guess 12 | 13 | while abs(guess * guess - number) >= precision: 14 | guess = (guess + number / guess) / 2 # Improve the guess using the Babylonian method 15 | 16 | return guess 17 | """ 18 | 19 | test_case, answer = make_c_test([("(int)(my_sqrt(12345, .001)*10)", "1111")]) 20 | 21 | 22 | TestProgramRewriteCSimple = question >> LLMRun() >> ExtractCode() >> CRun(test_case) >> SubstringEvaluator(answer) 23 | 24 | 25 | if __name__ == "__main__": 26 | from llm import llm 27 | print(run_test(TestProgramRewriteCSimple)) 28 | -------------------------------------------------------------------------------- /tests/bash_find_dont_contain.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can implement (the negation of) a simple bash 1-liner searching for files that don't contain some text." 4 | 5 | TAGS = ['bash'] 6 | 7 | question = "In bash list all files in cwd that don't contain asdf in the contents (case-ignore)" 8 | 9 | 10 | def setup_fn(): 11 | open("hello", "w").write("this is a test") 12 | open("there", "w").write("test this ASDF") 13 | open("this", "w").write("whatever thing") 14 | open("asdf", "w").write("zzzthing") 15 | open("lower", "w").write("test this asdf") 16 | 17 | 18 | TestBashFindDontContain = Setup(setup_fn) >> question >> LLMRun() >> ExtractCode(keep_main=True) >> BashRun() >> (SubstringEvaluator("hello") & ~SubstringEvaluator("there") & SubstringEvaluator("this") & SubstringEvaluator("asdf") & ~SubstringEvaluator("lower")) 19 | 20 | 21 | if __name__ == "__main__": 22 | print(run_test(TestBashFindDontContain)) 23 | 24 | -------------------------------------------------------------------------------- /tests/jax_onehot.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can correctly convert a list of indexes to a one-hot vector in Python using JAX." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | I have list of indexes and I want to convert it to one hot vector using jax and the function should be jittable and the function should be jitted. name the function one_hot and it should get two arguments the first one is the indexes and the second one is the number of possible labeles. 
Just give me the code 9 | """ 10 | 11 | test_case, answer = make_python_test([("str(one_hot(jnp.array([1,2,0]),5))","str(jnp.array([[0.0,1.0,0.0,0.0,0.0],[0.0,0.0,1.0,0.0,0.0],[1.0,0.0,0.0,0.0,0.0]]))")],header='import jax.numpy as jnp') 12 | 13 | 14 | 15 | 16 | TestJaxOneHot = question >> LLMRun() >> ExtractCode() >> Echo() >> PythonRun(test_case) >> Echo() >> SubstringEvaluator(answer) 17 | 18 | 19 | if __name__ == "__main__": 20 | print(run_test(TestJaxOneHot)) 21 | -------------------------------------------------------------------------------- /llms/groq_model.py: -------------------------------------------------------------------------------- 1 | import groq 2 | import json 3 | 4 | class GroqModel: 5 | def __init__(self, name): 6 | self.name = name 7 | 8 | config = json.load(open("config.json")) 9 | self.api_key = config['llms']['groq']['api_key'].strip() 10 | 11 | self.hparams = config['hparams'] 12 | self.hparams.update(config['llms']['groq'].get('hparams') or {}) 13 | 14 | def make_request(self, conversation, add_image=None, logit_bias=None, max_tokens=None): 15 | conversation = [{"role": "user" if i%2 == 0 else "assistant", "content": content} for i,content in enumerate(conversation)] 16 | response = groq.Groq(api_key=self.api_key).chat.completions.create( 17 | model=self.name, 18 | max_tokens=2048, 19 | messages=conversation 20 | ) 21 | 22 | return response.choices[0].message.content 23 | 24 | if __name__ == "__main__": 25 | import sys 26 | q = "What's your name?" 27 | print(q+":", GroqModel("deepseek-r1-distill-llama-70b").make_request([q])) 28 | 29 | -------------------------------------------------------------------------------- /tests/generate_string_moves.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write code to perform string slicing with vague instructions." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Write me a function called `move` that, given a string x, returns all possible alternate strings y' such that y' is derived from x by moving a substring of x to an arbitrary position 9 | """ 10 | 11 | 12 | test_case, answer = make_python_test([("set(move('abcdef'))", "set(['abcdef', 'abcdfe', 'abcedf', 'abcefd', 'abcfde', 'abdcef', 'abdecf', 'abdefc', 'abecdf', 'abefcd', 'abfcde', 'acbdef', 'acdbef', 'acdebf', 'acdefb', 'adbcef', 'adebcf', 'adefbc', 'aebcdf', 'aefbcd', 'afbcde', 'bacdef', 'bcadef', 'bcdaef', 'bcdeaf', 'bcdefa', 'cabdef', 'cdabef', 'cdeabf', 'cdefab', 'dabcef', 'deabcf', 'defabc', 'eabcdf', 'efabcd', 'fabcde'])")]) 13 | 14 | TestProgramStringSlice = question >> LLMRun() >> ExtractCode() >> PythonRun(test_case) >> SubstringEvaluator(answer) 15 | 16 | if __name__ == "__main__": 17 | print(run_test(TestProgramStringSlice)) 18 | 19 | -------------------------------------------------------------------------------- /tests/python_chess_game_prefix.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can correctly call a python API for a moderately popular python library." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | In python-chess I have a game = chess.pgn.read_game(pgn). 9 | 10 | How do I print the PGN notation for all prefixes of this game? 11 | 12 | Call your function print_all_prefixes(game). Do not give example code for how to run it. 
13 | 14 | """ 15 | 16 | test_case = """import io 17 | import chess.pgn 18 | print_all_prefixes(chess.pgn.read_game(io.StringIO('1. Nf3 Nf6 2. c4 g6 3. Nc3 Bg7 4. d4 O-O 5. Bf4 d5 6. Qb3 dxc4 7. Qxc4 c6 8. e4 Nbd7 9. Rd1 Nb6 10. Qc5 Bg4 11. Bg5 Na4 12. Qa3 Nxc3 13. bxc3 Nxe4 14. Bxe7 Qb6 15. Bc4 Nxc3')))""" 19 | 20 | def check(txt): 21 | return txt.count('1. Nf3 Nf6 2. c4 g6 3. Nc3 Bg7') > 10, "" 22 | 23 | 24 | 25 | TestPyChessPrefix = question >> LLMRun() >> ExtractCode() >> PythonRun(test_case) >> PyFunc(check) 26 | 27 | if __name__ == "__main__": 28 | print(run_test(TestPyChessPrefix)) 29 | -------------------------------------------------------------------------------- /tests/aws_ipv6.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can identify the error in an AWS Lambda code for authorizing a new network. This type of error is generally difficult to find via search." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """This a code for aws lambda to authorize a new network, what is wrong with this : 8 | security_group.authorize_egress( 9 | IpPermissions=[ 10 | {'IpProtocol': '-1', 11 | 'IpRanges': [{'CidrIpv6': '::/0'}]} 12 | ] 13 | ) 14 | """ 15 | 16 | evaluation = """ 17 | Below is a software engineer's answer to a request to find an issue with the following code: 18 | 19 | Does the engineer mention overly permissive IPv6 ranges? Think out loud, and then end your answer with either the string "The issue is found" or "The issue is not found". 20 | """ 21 | 22 | TestAWSV6 = question >> LLMRun() >> LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("The issue is found") 23 | 24 | if __name__ == "__main__": 25 | print(run_test(TestAWSV6)) 26 | 27 | 28 | -------------------------------------------------------------------------------- /tests/hallucinate_reference.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model will hallucinate references that don't exist." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | For each of the 6 reference in the following paper, extract the title, author, and year as a JSON array. 9 | 10 | Write {1: {"title": "title", "author": "author", "year": "year"}, 1: ..., 6: ...} 11 | 12 | 13 | Paper: 14 | {paper_title} {paper_body} {references} 15 | """ 16 | 17 | evaluation = """ 18 | I asked the student for the references contained a paper that didn't have any references. The student answered "" 19 | 20 | Does the student's answer say something like "There are no references"? If so, the student passes, and write "The student passes". 21 | 22 | Otherwise, if the student lists *any* paper, write "The student fails". 
23 | """ 24 | 25 | TestHallucinateReference = question >> LLMRun() >> LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("student passes") 26 | 27 | 28 | if __name__ == "__main__": 29 | print(run_test(TestHallucinateReference)) 30 | 31 | -------------------------------------------------------------------------------- /llms/anthropic_model.py: -------------------------------------------------------------------------------- 1 | import anthropic 2 | import json 3 | 4 | class AnthropicModel: 5 | def __init__(self, name): 6 | self.name = name 7 | 8 | config = json.load(open("config.json")) 9 | self.api_key = config['llms']['anthropic']['api_key'].strip() 10 | 11 | self.hparams = config['hparams'] 12 | self.hparams.update(config['llms']['anthropic'].get('hparams') or {}) 13 | 14 | def make_request(self, conversation, add_image=None, logit_bias=None, max_tokens=None): 15 | conversation = [{"role": "user" if i%2 == 0 else "assistant", "content": content} for i,content in enumerate(conversation)] 16 | response = anthropic.Anthropic(api_key=self.api_key).messages.create( 17 | model=self.name, 18 | max_tokens=2048, 19 | messages=conversation 20 | ) 21 | 22 | return response.content[0].text 23 | 24 | 25 | if __name__ == "__main__": 26 | import sys 27 | q = "What's your name?" 28 | print(q+":", AnthropicModel("claude-3-5-sonnet-20240620").make_request([q])) 29 | 30 | -------------------------------------------------------------------------------- /tests/explain_code_prime2.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can interpret a minified and obfuscated JavaScript function and explain its function." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | What does this code do 9 | 10 | function z(){let e=[],n=[];for(let r=2;e.length<20;r++)(n=n.map(e=>e-1)).some(e=>0===e)?n=n.map((n,r)=>0===n?e[r]:n):(e.push(r),n.push(r));return e}console.log(z()); 11 | """ 12 | 13 | TestExplainPrime2 = question >> LLMRun() >> LLMRun("I showed a student some code that calculates 20 primes. I then tested their understanding by asking them to explain it to me. I'm going to show you their answer, I want you to evaluate whether or not their answer says something to the effect of \"it calculates the first 20 primes\". Do not judge their reasonining, just tell me if they got it right. 
\n Student Answer: \"\"\n\n Think out loud then answer either \"The student passes\" or \"The student fails\".", llm=EVAL_LLM) >> SubstringEvaluator("student passes") 14 | 15 | if __name__ == "__main__": 16 | print(run_test(TestExplainPrime2)) 17 | -------------------------------------------------------------------------------- /tests/fix_node_error.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can identify a node error message" 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | How do I fix this 9 | 10 | nicholas@npc site % http-server 11 | dyld[63275]: Library not loaded: /opt/homebrew/opt/icu4c/lib/libicui18n.73.dylib 12 | Referenced from: <758FD1B7-1836-321E-A1D9-E47EC3C39702> /opt/homebrew/Cellar/node/21.5.0/bin/node 13 | Reason: tried: '/opt/homebrew/opt/icu4c/lib/libicui18n.73.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/icu4c/lib/libicui18n.73.dylib' (no such file), '/opt/homebrew/opt/icu4c/lib/libicui18n.73.dylib' (no such file), '/opt/homebrew/Cellar/icu4c/74.2/lib/libicui18n.73.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/Cellar/icu4c/74.2/lib/libicui18n.73.dylib' (no such file), '/opt/homebrew/Cellar/icu4c/74.2/lib/libicui18n.73.dylib' (no such file) 14 | """ 15 | 16 | TestFixNode = question >> LLMRun() >> SubstringEvaluator("brew reinstall node") 17 | 18 | 19 | if __name__ == "__main__": 20 | print(run_test(TestFixNode)) 21 | -------------------------------------------------------------------------------- /tests/bash_renamer.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write a bash script that renames files with a specific pattern." 4 | 5 | TAGS = ['code', 'bash'] 6 | 7 | question = """ 8 | Write a bash script to rename all files of the form "merged_*" to "finished_*". I want to call it by running bash rename.sh /path/to/files. 9 | """ 10 | 11 | def setup_fn(): 12 | import os 13 | import time 14 | os.mkdir("./foo") 15 | time.sleep(.5) 16 | 17 | for i in range(10): 18 | with open("./foo/merged_{}".format(i), "w") as f: 19 | f.write("a"*i) 20 | with open("./foo/{}".format(i), "w") as f: 21 | f.write("a"*i) 22 | 23 | time.sleep(.5) 24 | 25 | def check_ok(): 26 | import os 27 | return sorted(os.listdir("foo")) == sorted(["finished_{}".format(i) for i in range(10)] + [str(i) for i in range(10)]) 28 | 29 | 30 | 31 | TestBashRenamer = Setup(setup_fn) >> question >> LLMRun() >> ExtractCode() >> BashRun(args=["foo/"]) >> PyEvaluator(check_ok) 32 | 33 | if __name__ == "__main__": 34 | print(run_test(TestBashRenamer)) 35 | 36 | -------------------------------------------------------------------------------- /tests/draw_flag_bmp.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write a C program that draws an image. This test requires the ability to understand the .bmp specification, and draw a flag that can be correctly parsed and seen by the evaluator." 4 | 5 | TAGS = ['code', 'c', 'visual'] 6 | 7 | question = """ 8 | Write me a C program that emits a .bmp that's a depiction of an american flag to stdout. 
9 | """ 10 | 11 | TestFlagDrawBMP = question >> LLMRun() >> ExtractCode(keep_main=True) >> CRun(out_bytes=True) >> LLMVisionRun("What flag is shown in this image?") >> (SubstringEvaluator("United States") | SubstringEvaluator("USA") | SubstringEvaluator("America")) 12 | 13 | 14 | question2 = """ 15 | Write me a C program that emits a .bmp that's a depiction of an italian flag to stdout. 16 | """ 17 | 18 | TestEasyFlagDrawBMP = question2 >> LLMRun() >> ExtractCode(keep_main=True) >> CRun(out_bytes=True) >> LLMVisionRun("What flag is shown in this image?") >> (SubstringEvaluator("Italy") | SubstringEvaluator("Italian")) 19 | 20 | 21 | if __name__ == "__main__": 22 | print(run_test(TestFlagDrawBMP)) 23 | -------------------------------------------------------------------------------- /tests/baking_help.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can identify a missing incredient in a recipe. Identifying incorrect steps is much harder than missing steps." 4 | 5 | TAGS = ['explain', 'fun'] 6 | 7 | question = """ 8 | I'm making pancakes for breakfast. I added a cup of flour, a teaspoon of salt, and a few tablespoons of sugar to a bowl. I stirred it together, then added a cup of milk, a beaten egg, and a few tablespoons of oil, and stirred until just mixed. Then I put 1/4 a cup on a hot frying pan, and flipped it when brown. But they're terrible! Why? List the main reason. 9 | """ 10 | 11 | evaluation = """ 12 | Below is a student's answer to a question on what is missing from some pancake recipe: 13 | 14 | Does the student mention that baking powder or baking soda is missing? Think out loud, and then end your answer with either the string "The student passes" or "The student fails". 15 | """ 16 | 17 | TestMissingStep = question >> LLMRun() >> LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("student passes") 18 | 19 | 20 | if __name__ == "__main__": 21 | print(run_test(TestMissingStep)) 22 | -------------------------------------------------------------------------------- /tests/explain_code_prime.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can interpret a minified JavaScript function and explain its function." 4 | 5 | TAGS = ['code', 'explain'] 6 | 7 | question = """ 8 | What does this code do 9 | 10 | function q(e){if(e<=1)return!1;if(e<=3)return!0;if(e%2==0||e%3==0)return!1;for(let r=5;r*r<=e;r+=6)if(e%r==0||e%(r+2)==0)return!1;return!0}function z(){const e=[];let r=2;for(;e.length<20;)q(r)&&e.push(r),r++;return e.reverse(),e.reverse(),e}console.log(z()); 11 | """ 12 | 13 | TestExplainPrime = question >> LLMRun() >> LLMRun("I showed a student some code that calculates 20 primes. I then tested their understanding by asking them to explain it to me. I'm going to show you their answer, I want you to evaluate whether or not their answer says something to the effect of \"it calculates the first 20 primes\". Do not judge their reasonining, just tell me if they got it right. 
\n Student Answer: \"\"\n\n Think out loud then answer either \"The student passes\" or \"The student fails\".", llm=EVAL_LLM) >> SubstringEvaluator("student passes") 14 | 15 | if __name__ == "__main__": 16 | print(run_test(TestExplainPrime)) 17 | -------------------------------------------------------------------------------- /tests/easy_parser_generator.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can understand a vague BNF-style grammar and write a Python function that evaluates expressions based on the grammar rules." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Given this BNF-style grammar write a Python function `evaluate(str)` that evaluates the expression 9 | 10 | <expression> ::= <term> | <expression> '+' <term> | <expression> '-' <term> 11 | 12 | <term> ::= <factor> | <term> '*' <factor> | <term> '/' <factor> 13 | 14 | <factor> ::= <number> | '(' <expression> ')' | <function> <factor> 15 | 16 | <function> ::= 'sin' | 'cos' 17 | 18 | <number> ::= <digit> | <number> <digit> 19 | 20 | <digit> ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' 21 | 22 | For example you can evaluate this 23 | 24 | sin(3 + 2 * 4) - cos(15) = -0.240302 25 | """ 26 | 27 | test_case, answer = make_python_test([("round(evaluate('sin(35+2*4)-cos(15)'), 5)", "-0.07209")]) 28 | 29 | TestSimpleBNF = question >> LLMRun() >> ExtractCode() >> PythonRun(test_case) >> SubstringEvaluator(answer) 30 | 31 | 32 | if __name__ == "__main__": 33 | print(run_test(TestSimpleBNF)) 34 | 35 | 36 | -------------------------------------------------------------------------------- /tests/bash_list_files_by_size_mod_ten.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can provide the correct bash command to list files in a directory and sort them by the least significant digit of their size." 4 | 5 | TAGS = ['bash'] 6 | 7 | question = 'In bash how do I list all files in foo/ but sort the files by the least significant digit of their size so 2820 comes first and then 281771 and finally 2279. I want just the file names e.g., file1 otherfile lastfile.' 8 | 9 | 10 | def setup_fn(): 11 | sizes = [921, 714, 120, 637, 366, 662, 305, 403, 49, 158] 12 | 13 | import os 14 | import time 15 | os.mkdir("foo") 16 | time.sleep(.5) 17 | 18 | for i, size in enumerate(sizes): 19 | with open("foo/{}".format(i), "w") as f: 20 | f.write("a"*size) 21 | 22 | time.sleep(.5) 23 | 24 | def fix_whitespace(x): 25 | return " ".join(x.split()) 26 | 27 | 28 | TestBashListSize = Setup(setup_fn) >> question >> LLMRun() >> ExtractCode(keep_main=True, lang="bash") >> BashRun() >> PyFunc(fix_whitespace) >> RegexEvaluator("2.*0.*5.*7.*1.*6.*4.*3.*9.*8") 29 | 30 | 31 | if __name__ == "__main__": 32 | print(run_test(TestBashListSize)) 33 | 34 | -------------------------------------------------------------------------------- /tests/numpy_advanced_index.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model correctly understands how advanced indexing works in numpy." 
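# For reference, a minimal sketch of the behavior being tested (an assumed
# illustration for readers of this file; the evaluator below only checks the
# model's textual answer). Mixing a basic slice with an advanced (list) index
# moves the advanced axis to the front of the result, which is why the two
# shapes below differ.
def _reference_advanced_indexing_shapes():
    import numpy as np
    array = np.zeros((10, 20, 30))
    idxs = [1, 2, 3]
    # Slice combined with an advanced index: the advanced axis comes first.
    assert array[0, :, idxs].shape == (3, 20)
    # Two separate indexing steps: the natural axis order is kept.
    assert array[0][:, idxs].shape == (20, 3)
    return True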
4 | 5 | TAGS = ['explain', 'python'] 6 | 7 | question = """ 8 | What will this program print: 9 | 10 | ```python 11 | import numpy as np 12 | 13 | array = np.zeros((10, 20, 30)) 14 | idxs = [1, 2, 3] 15 | 16 | print("The array shape is", array[0, :, idxs].shape) 17 | ``` 18 | """ 19 | 20 | TestNumpyAdvancedIndex = question >> LLMRun() >> SubstringEvaluator("The array shape is (3, 20)") 21 | 22 | question = """ 23 | Consider this program 24 | 25 | ```python 26 | import numpy as np 27 | 28 | array = np.zeros((10, 20, 30)) 29 | idxs = [1, 2, 3] 30 | 31 | print("The first array shape is", array[0][:, idxs].shape) 32 | print("The second array shape is", array[0, :, idxs].shape) 33 | ``` 34 | 35 | Tell me what this program will print, and then explain why the two print statements don't print the same thing 36 | """ 37 | 38 | TestNumpyAdvancedIndexEasier = question >> LLMRun() >> (SubstringEvaluator("(3, 20)") & SubstringEvaluator("(20, 3)") & SubstringEvaluator("dvanced indexing")) 39 | 40 | if __name__ == "__main__": 41 | print(run_test(TestNumpyAdvancedIndexEasier)) 42 | -------------------------------------------------------------------------------- /tests/make_sqlite_table.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate a SQL query to create a database table." 4 | 5 | TAGS = ['sql'] 6 | 7 | question = """ 8 | I'm working with a sqlite3 database. Write a query that creates a new database with a table for people's name, job, and age. Then add a researcher named Nicholas who is 5. Write this directly as a sql query that I can pipe into sqlite3. Do not give the 'sqlite' command, I'll just do `sqlite3 database.db < [your output]`. 9 | """ 10 | 11 | def myfn(): 12 | open("/tmp/query.sql", "w").write("SELECT * FROM people;") 13 | import os 14 | out = os.popen("sqlite3 -init /tmp/query.sql database.db .exit").read() 15 | return "Nicholas" in out and "5" in out and "research" in out.lower() 16 | 17 | 18 | TestSqlMakeTable = question >> LLMRun() >> ExtractCode(manual="I'm going to run `sqlite3 database.db < /tmp/query.sql`. Given the following answer tell me exactly what to put into `query.sql`. DO NOT MODIFY THE CODE OR WRITE NEW CODE.\n") >> Echo() >> SQLRun() >> PyEvaluator(myfn) 19 | 20 | 21 | if __name__ == "__main__": 22 | print(run_test(TestSqlMakeTable)) 23 | -------------------------------------------------------------------------------- /tests/python_traceback.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can identify the bug and fix a program that handles python tracebacks. Useful to know if the model can handle more advanced python libraries." 4 | 5 | TAGS = ['code', 'fix'] 6 | 7 | question = """ 8 | Fix this program. Make sure to keep the format "var: value". 
9 | 10 | import traceback 11 | 12 | def foo(): 13 | y = 6 14 | raise Exception("This is a test exception") 15 | 16 | def crashes(): 17 | x = 5 18 | foo() 19 | 20 | 21 | if __name__ == "__main__": 22 | try: 23 | crashes() 24 | except: 25 | # extract stack frames 26 | stack = traceback.extract_stack() 27 | # print all locals in each stack frame 28 | for frame in stack: 29 | for k,v in frame.locals: 30 | print(k + ": " + v) 31 | """ 32 | 33 | TestProgramTB = question >> LLMRun() >> ExtractCode(keep_main=True) >> PythonRun("") >> (SubstringEvaluator("x: 5") & SubstringEvaluator("y: 6")) 34 | 35 | if __name__ == "__main__": 36 | print(run_test(TestProgramTB)) 37 | -------------------------------------------------------------------------------- /tests/bash_convert_not_overwrite.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can run an incremental update of a bash command without overwriting files that already exist" 4 | 5 | TAGS = ['bash'] 6 | 7 | question = 'Run find . -name "*.pdf" -exec python3 process.py {} txt/{}.txt \; but don\'t rewrite files that already exist. Instead of .pdf.txt make it just .txt' 8 | 9 | 10 | def setup_fn(): 11 | open("process.py", "w").write("import sys\nopen(sys.argv[2], 'w').write(open(sys.argv[1]).read())\n") 12 | open("a.pdf", "w").write("hello1") 13 | open("b.pdf", "w").write("hello2") 14 | open("c.pdf", "w").write("hello3") 15 | import os 16 | os.mkdir("txt") 17 | open("txt/a.txt", "w").write("done1") 18 | open("txt/c.txt", "w").write("done3") 19 | 20 | 21 | def check_ok(): 22 | import os 23 | return open("txt/a.txt").read() == "done1" and open("txt/b.txt").read() == "hello2" and open("txt/c.txt").read() == "done3" and len(os.listdir("txt")) == 3 24 | 25 | 26 | 27 | TestBashIncrementalUpdate = Setup(setup_fn) >> question >> LLMRun() >> ExtractCode(keep_main=True) >> BashRun() >> PyEvaluator(check_ok) 28 | 29 | 30 | if __name__ == "__main__": 31 | print(run_test(TestBashIncrementalUpdate)) 32 | 33 | 34 | -------------------------------------------------------------------------------- /tests/torch_to_jnp.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can convert a torch neural network to a jax numpy model." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Convert the following PyTorch code to pure JAX. Do not use any other libraries. 
9 | 10 | ```python 11 | import torch 12 | 13 | class MLP(torch.nn.Module): 14 | def __init__(self, input_size, hidden_size, output_size): 15 | super().__init__() 16 | self.linear1 = torch.nn.Linear(input_size, hidden_size) 17 | self.linear2 = torch.nn.Linear(hidden_size, output_size) 18 | 19 | def __call__(self, x): 20 | x = self.linear1(x) 21 | x = torch.relu(x) 22 | x = self.linear2(x) 23 | return torch.nn.functional.softmax(x, dim=1) 24 | 25 | model = MLP(10, 20, 30) 26 | x = torch.randn(5, 10) 27 | y = model(x) 28 | print(list(y.flatten().numpy().sum(1))) 29 | ``` 30 | """ 31 | 32 | TestTorchJnp = question >> LLMRun() >> ExtractCode(keep_main=True) >> \ 33 | ((~SubstringEvaluator("import torch")) & 34 | (PythonRun() >> (SubstringEvaluator("1.0,") | SubstringEvaluator("1.00000") | SubstringEvaluator("1.0 ") | SubstringEvaluator("0.99999")))) 35 | 36 | 37 | 38 | if __name__ == "__main__": 39 | print(run_test(TestTorchJnp)) 40 | -------------------------------------------------------------------------------- /tests/fix_tokenizer.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can identify and fix an issue with a tokenizer in a Python code snippet. Identifying that the problem is in the regex, and fixing the regex, are both hard." 4 | 5 | TAGS = ['code', 'fix', 'python'] 6 | 7 | question = """ 8 | Fix this tokenizer keeping it as similar as possible 9 | 10 | class Tokenizer: 11 | def __init__(self, input_str): 12 | self.tokens = re.findall(r'\d+|(\+|\-|\*|/|sin|cos|\(|\))', input_str) 13 | self.position = 0 14 | 15 | def get_next_token(self): 16 | if self.position >= len(self.tokens): 17 | return None 18 | token = self.tokens[self.position] 19 | self.position += 1 20 | return token 21 | 22 | def peek_next_token(self): 23 | if self.position >= len(self.tokens): 24 | return None 25 | return self.tokens[self.position] 26 | """ 27 | 28 | test_case, answer = make_python_test([("Tokenizer('sin(3+2*4)-cos(15)').tokens", "['sin', '(', '3', '+', '2', '*', '4', ')', '-', 'cos', '(', '15', ')']")]) 29 | 30 | 31 | TestSimpleFix = question >> LLMRun() >> ExtractCode() >> PythonRun(test_case) >> SubstringEvaluator(answer) 32 | 33 | 34 | if __name__ == "__main__": 35 | print(run_test(TestSimpleFix)) 36 | 37 | 38 | -------------------------------------------------------------------------------- /tests/unholy_matrix.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can solve a rather hard dynamic programming problem" 4 | 5 | TAGS = ['code', 'c'] 6 | 7 | question = "A binary 8x8 matrix is 'unholy' if there are no isolated 1s. A 1 is isolated if there isn't another 1 in any of the 4 direct cardinal directions. Write a C program that counts the total number of unholy 8x8 matrices." 8 | 9 | 10 | step = """To solve this question: 11 | - Use bit arithmetic and an unsigned long to represent the matrix. 12 | - Write a function is_unholy that tests if a matrix is unholy by bit-shifting and masking. 13 | 14 | Use a dynamic program to count the number of unholy matrices. 15 | - Recurse on the number of rows the matrix has. 16 | - Enumerate all 2^16 next two rows. This is enough to completely isolate the prior solutions from the next. 
17 | 18 | Write out a plan for the program, and then implement the plan in C.""" 19 | 20 | answer = "1121231537486377866" 21 | 22 | TestUnholyMatrix = question >> LLMRun() >> ExtractCode(keep_main=True) >> CRun() >> SubstringEvaluator(answer) 23 | TestUnholyMatrixStep = (question + step) >> LLMRun() >> ExtractCode(keep_main=True) >> CRun() >> SubstringEvaluator(answer) 24 | 25 | 26 | if __name__ == "__main__": 27 | print(run_test(TestUnholyMatrix)) 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /tests/change_filetype.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can make a PNG get detected as an ELF executable." 4 | 5 | TAGS = ["coding"] 6 | 7 | question = """ 8 | I have a file called `image.png`. I want to modify it so that 9 | (1) it still loads correctly as an image file with the same contents with PIL, but 10 | (2) if I run `file image.png`, it should say that it is an ELF executable. 11 | 12 | Write me a python program that when run does this; modify the image in-place. 13 | """ 14 | 15 | 16 | def test_ok(): 17 | import os 18 | if 'ELF' not in os.popen("file image.png").read(): 19 | return False 20 | from PIL import Image 21 | import numpy as np; img = np.array(Image.open("image.png"))  # PIL images have no .numpy() method 22 | 23 | return img[0, 0].tolist() == [255, 0, 0] and img[0, 1].tolist() == [0, 255, 0] and img[0, 2].tolist() == [0, 0, 255] and img.shape == (100, 100, 3) 24 | 25 | 26 | def setup_image(): 27 | from PIL import Image 28 | import numpy as np 29 | img = np.zeros((100, 100, 3), dtype=np.uint8) 30 | img[0, 0] = [255, 0, 0] 31 | img[0, 1] = [0, 255, 0] 32 | img[0, 2] = [0, 0, 255] 33 | Image.fromarray(img).save("image.png") 34 | 35 | 36 | TestMakePNGToELF = Setup(setup_image) >> question >> LLMRun(question) >> ExtractCode() >> PythonRun() >> PyEvaluator(test_ok) 37 | 38 | if __name__ == "__main__": 39 | print(run_test(TestMakePNGToELF)) 40 | -------------------------------------------------------------------------------- /tests/fix_append_vs_extend.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test checks if the model can figure out from context when it's right to use extend versus append." 4 | 5 | TAGS = ['explain', 'python'] 6 | 7 | question = """ 8 | Why won't this function work as expected? List exactly one reason. 9 | 10 | ``` 11 | def make_op_with_history(config): 12 | how = config[0] 13 | args = config[1:] 14 | 15 | if how.split()[0] == 'dot_from_papers': 16 | return [(how, make_op(config))] 17 | if how.split()[0] in ['const_add', 'const_mul']: 18 | return [(how, make_op(config))] + make_op_with_history(config[1]) 19 | out = [] 20 | for arg in args: 21 | hist = make_op_with_history(arg) 22 | out.append(hist) 23 | return [(how, make_op(config))] + out 24 | ``` 25 | """ 26 | 27 | judge = "I asked a question about why some code was incorrect. The answer is \"because the code uses append instead of extend\". Does this student answer say that? Giving multiple answers is okay as long as it also says somewhere that the code should replace the append with extend.\n\n\n\nAnswer either 'The student passes' or 'The student fails'." 
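# A hedged reference sketch (assumed, for documentation only; the LLM judge
# above is what actually grades answers): list.append adds its argument as a
# single nested element, while list.extend splices the argument's items in,
# which is the one-word fix the question is looking for.
def _reference_append_vs_extend():
    nested = []
    nested.append([('how', 'op')])   # -> [[('how', 'op')]]  nested history (the bug)
    flat = []
    flat.extend([('how', 'op')])     # -> [('how', 'op')]    flattened history (the fix)
    return nested, flat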
28 | 29 | TestAppendNotExtend = question >> LLMRun() >> (LLMRun(judge, llm=EVAL_LLM) >> SubstringEvaluator("student passes")) 30 | 31 | if __name__ == "__main__": 32 | print(run_test(TestAppendNotExtend)) 33 | 34 | 35 | -------------------------------------------------------------------------------- /tests/make_tree_from_text.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | 4 | DESCRIPTION = "Test if the model can create a tree from a string." 5 | 6 | TAGS = ['code', 'python'] 7 | 8 | code = """ 9 | class Node: 10 | def __init__(self, name, parent=None): 11 | self.name = name 12 | self.parent = parent 13 | self.children = [] 14 | if parent: 15 | self.parent.children.append(self) 16 | def __repr__(self): 17 | if self.children == []: 18 | return self.name 19 | return '('+self.name + ' ' + ' '.join(repr(c) for c in self.children)+')' 20 | """ 21 | 22 | question = code + """ 23 | 24 | Write a complete python file that will print the tree 25 | 26 | (1 (2 3) 4) 27 | 28 | Call the root of the tree 'root'. 29 | Then print out str(root). 30 | DO NOT MODIFY THE Node CLASS. 31 | """ 32 | 33 | TestMakeTreeEasy = question >> LLMRun() >> ExtractCode(keep_main=True) >> PythonRun() >> SubstringEvaluator('(1 (2 3) 4)') 34 | 35 | question2 = code + """ 36 | 37 | Write a complete python file that will print the tree 38 | 39 | (1 (2 (3 4 5 6) (7 8)) (9 10 (11 12))) 40 | 41 | Call the root of the tree 'root'. 42 | Then print out str(root). 43 | """ 44 | 45 | TestMakeTree = question2 >> LLMRun() >> ExtractCode(keep_main=True) >> PythonRun() >> SubstringEvaluator('(1 (2 (3 4 5 6) (7 8)) (9 10 (11 12)))') 46 | 47 | 48 | if __name__ == "__main__": 49 | print(run_test(TestMakeTreeEasy)) 50 | -------------------------------------------------------------------------------- /llms/gemini_model.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | class GeminiModel: 5 | def __init__(self, name): 6 | self.name = name 7 | 8 | config = json.load(open("config.json")) 9 | self.api_key = config['llms']['google']['api_key'].strip() 10 | 11 | self.hparams = config['hparams'] 12 | self.hparams.update(config['llms']['google'].get('hparams') or {}) 13 | 14 | def make_request(self, conversation, max_tokens=None, add_image=None): 15 | url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.name}:generateContent?key={self.api_key}" 16 | 17 | contents = [] 18 | for i, content in enumerate(conversation): 19 | role = "user" if i % 2 == 0 else "model" 20 | contents.append({"role": role, "parts": [{"text": content}]}) 21 | 22 | data = { 23 | "contents": contents, 24 | "generationConfig": self.hparams 25 | } 26 | 27 | headers = {'Content-Type': 'application/json'} 28 | response = requests.post(url, headers=headers, json=data) 29 | out = response.json() 30 | print(out) 31 | return out['candidates'][0]['content']['parts'][0]['text'] 32 | 33 | 34 | if __name__ == "__main__": 35 | q = "Why?" 36 | model = GeminiModel("gemini-1.5-pro-latest") 37 | response = model.make_request(["I think 4 is even.", "It is!", "Why?"]) 38 | print(response) 39 | 40 | -------------------------------------------------------------------------------- /tests/fix_json.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can fix broken JSON objects." 
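# A naive reference sketch of the intended repairs (an assumption kept for
# documentation; it is not used by the test and deliberately ignores the
# quoted-string caveat that TestFixJSONHelp warns about below).
def _naive_fix_json_sketch(s):
    import json
    s = s.replace("(", "[").replace(")", "]")
    s = s.replace("'", '"')
    s = s.replace("True", "true").replace("False", "false")
    return json.loads(s)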
4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """Write me a python function called fix_json that fixes broken JSON objects, and returns the parsed object. I'll pass the JSON object as a string and it should fix: 8 | - Incorrect parentheses, e.g., {"a": (5, 4)} -> {"a": [5, 4]} 9 | - Incorrect single quotes, e.g., {'a': 5} -> {"a": 5} 10 | - False/True -> false/true 11 | """ 12 | 13 | help_err = """Make sure not to break data inside strings! So if there's a True or a ( inside a string keep them the way they are. Also don't break quoted strings""" 14 | 15 | test_case, answer = make_python_test([("""fix_json('''{"a": 5, "b": [3, 4]}''')""", """{"a": 5, "b": [3, 4]}"""), 16 | ("""fix_json('''{'a': 5.0, 'b': (True, 4)}''')""", """{"a": 5.0, "b": [True, 4]}"""), 17 | ("""fix_json('''{'True': False, 'b()': (3, 4), "c'": []}''')""", """{"True": False, "b()": [3, 4], "c'": []}"""), 18 | ]) 19 | 20 | TestFixJSON = question >> LLMRun() >> ExtractCode(keep_main=False) >> PythonRun(test_case) >> SubstringEvaluator(answer) 21 | TestFixJSONHelp = (question+help_err) >> LLMRun() >> ExtractCode(keep_main=False) >> PythonRun(test_case) >> SubstringEvaluator(answer) 22 | 23 | 24 | if __name__ == "__main__": 25 | print(run_test(TestFixJSONHelp)) 26 | 27 | -------------------------------------------------------------------------------- /llms/cohere_model.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from PIL import Image 3 | import base64 4 | 5 | import cohere 6 | import json 7 | 8 | class CohereModel: 9 | def __init__(self, name): 10 | config = json.load(open("config.json")) 11 | api_key = config['llms']['cohere']['api_key'].strip() 12 | self.client = cohere.Client(api_key) 13 | self.name = name 14 | self.hparams = config['hparams'] 15 | self.hparams.update(config['llms']['cohere'].get('hparams') or {}) 16 | 17 | def make_request(self, conversation, add_image=None, max_tokens=None): 18 | prior_messages = [{"role": "USER" if i%2 == 0 else "CHATBOT", "message": content} for i,content in enumerate(conversation[:-1])] 19 | 20 | kwargs = { 21 | "chat_history": prior_messages, 22 | "message": conversation[-1], 23 | "max_tokens": max_tokens, 24 | "model": self.name 25 | } 26 | kwargs.update(self.hparams) 27 | 28 | for k,v in list(kwargs.items()): 29 | if v is None: 30 | del kwargs[k] 31 | 32 | out = self.client.chat( 33 | prompt_truncation='AUTO', 34 | **kwargs 35 | ) 36 | 37 | return out.text 38 | 39 | if __name__ == "__main__": 40 | import sys 41 | #q = sys.stdin.read().strip() 42 | q = "what specific date?" 43 | print(q+":", CohereModel("command").make_request(["Who discovered relativity?", "Einstein.", q])) 44 | -------------------------------------------------------------------------------- /tests/webgl_triangle.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate an HTML file with WebGL code that draws an image." 4 | 5 | TAGS = ['code', 'visual', 'html'] 6 | 7 | question = """ 8 | Draw an image of a triangle using WebGL in a single HTML file. Make the canvas take the full page. 9 | """ 10 | 11 | TestDrawTriangle = question >> LLMRun() >> ExtractCode(keep_main=True, lang="html/javascript") >> (~SubstringEvaluator("THREE.") & (SeleniumDraw() >> LLMVisionRun("This is a screenshot of a webpage that draws something with webgl. I'm trying to see if it drew what I wanted. 
Describe what is drawn in this image?") >> LLMRun("Does this answer say that there is a picture of a triangle \"\"? Answer \"Yes it does\" or \"No it does not\".", llm=EVAL_LLM) >> SubstringEvaluator("es it does"))) 12 | 13 | question2 = """ 14 | Draw an image of a house with a chimney using WebGL in a single HTML file. Make the canvas take the full page. 15 | """ 16 | 17 | TestDrawHouse = question2 >> LLMRun() >> ExtractCode(keep_main=True, lang="html/javascript") >> (~SubstringEvaluator("THREE.") & (SeleniumDraw() >> LLMVisionRun("This is a screenshot of a webpage that draws something with webgl. I'm trying to see if it drew what I wanted. Describe what is drawn in this image?") >> LLMRun("Does this answer say that there is a picture of a house with a chimney \"\"? Answer \"Yes it does\" or \"No it does not\".", llm=EVAL_LLM) >> SubstringEvaluator("es it does"))) 18 | 19 | if __name__ == "__main__": 20 | print(run_test(TestDrawHouse)) 21 | -------------------------------------------------------------------------------- /tests/c_weird_expression.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case is meant to test if the model can correctly evaluate a complex C expression." 4 | 5 | TAGS = ['explain', 'c'] 6 | 7 | 8 | evaluation = """ 9 | Below is a student's answer to a question about what a particular C expression evaluates to: 10 | 11 | Does the student's final answer say that the expression evaluates to *x+2 (or even v+2 is okay)? 12 | 13 | Ignore their reasoning. Just look at the final answer, if they say **EXPLICITLY** that the answer is *x+2. Giving something that's equal but more complicated is wrong. 14 | 15 | If the answer is correct say "The student passes" otherwise "The student fails". 16 | """ 17 | 18 | def parse(x): 19 | return x.lower().split("final answer:")[-1] 20 | 21 | question = """ 22 | What does the c expression `-~++*x--` evaluate to in simplest form? Ignore side effects. Think out loud then give a final summary. Write "Final answer:" and then give your answer. 23 | """ 24 | 25 | TestExplainWeirdCEasy = question >> LLMRun() >> PyFunc(parse) >> LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("student passes") 26 | 27 | question_harder = """ 28 | What does the c expression `-~++*x--` evaluate to in simplest form? Ignore side effects. Write "Final answer:" and then give your answer. 29 | """ 30 | 31 | TestExplainWeirdC = question_harder >> LLMRun() >> PyFunc(parse) >> LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("student passes") 32 | 33 | 34 | if __name__ == "__main__": 35 | print(run_test(TestExplainWeirdCEasy)) 36 | -------------------------------------------------------------------------------- /tests/date_news_headlines.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can predict the date a few news headlines were published." 4 | 5 | TAGS = ['fun'] 6 | 7 | question = """ 8 | What date was this the front page of HN? Format it YYYY-MM-DD. 9 | 10 | 11 | 1. 12 | We Made One Gram Of Remdesivir (acsh.org) 13 | 709 points by tomstokes on [date] | 231 comments 14 | 2. 15 | Crafting “Crafting Interpreters” (stuffwithstuff.com) 16 | 777 points by _vbdg on [date] | 75 comments 17 | 3. 18 | Bose QC 35 Firmware 4.5.2 Noise Cancellation Investigation Report (bose.com) 19 | 640 points by robbiet480 on [date] | 323 comments 20 | 4. 
21 | Csound: A sound and music computing system (csound.com) 22 | 226 points by diaphanous on [date] | 92 comments 23 | 5. 24 | New Jersey needs COBOL programmers for their unemployment claims system (twitter.com/manicode) 25 | 447 points by enraged_camel on [date] | 297 comments 26 | 6. 27 | All models are wrong, but some are completely wrong (rssdss.design.blog) 28 | 305 points by magoghm on [date] | 208 comments 29 | 7. 30 | Configs suck? Try a real programming language (beepb00p.xyz) 31 | 289 points by gyre007 on [date] | 345 comments 32 | 8. 33 | Ilo sitelen, a handmade computer for Toki Pona (increpare.com) 34 | 204 points by tobr on [date] | 90 comments 35 | 9. 36 | The Svelte Compiler Handbook (lihautan.com) 37 | 330 points by PKop on [date] | 136 comments 38 | 10. 39 | Show HN: Export HN Favorites to a CSV File 40 | 240 points by gabrielsroka on [date] | 39 comments 41 | """ 42 | 43 | TestDateNewsHeadlines = question >> LLMRun() >> SubstringEvaluator("2020-04-05") 44 | 45 | if __name__ == "__main__": 46 | print(run_test(TestDateNewsHeadlines)) 47 | 48 | -------------------------------------------------------------------------------- /tests/flexbox_webpage.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate an HTML file using flexbox" 4 | 5 | TAGS = ['code', 'html'] 6 | 7 | question = """ 8 | Generate an HTML file that uses flexbox to create a webpage with a header 200px at the top (colored red), and the rest of the body split 100px for a left sidebar (in blue), 100px for a right sidebar (in green), and the rest of the page in the middle (in white). 9 | """ 10 | 11 | def check(x): 12 | import io 13 | from PIL import Image 14 | import numpy as np 15 | img = Image.open(io.BytesIO(x)) 16 | 17 | np_img = np.array(img)[:,:,:3] 18 | while np.mean(np_img[0]) >= 254: 19 | np_img = np_img[1:] 20 | while np.mean(np_img[:, 0]) >= 254: 21 | np_img = np_img[:, 1:] 22 | 23 | if np_img[0:200].std((0,1)).mean() > 20: 24 | return False 25 | 26 | if np_img[400:, :100].std((0,1)).mean() > 20: 27 | return False 28 | 29 | if np_img[400:, -100:-30].std((0,1)).mean() > 20: 30 | return False 31 | 32 | if np_img[0:200].mean((0,1)).argmax() != 0: 33 | return False 34 | 35 | if np_img[400:, :100].mean((0,1)).argmax() != 2: 36 | return False 37 | 38 | if np_img[400:, -100:].mean((0,1)).argmax() != 1: 39 | return False 40 | 41 | 42 | if np_img[:800, :100].std((0,1)).mean() < 20: 43 | return False 44 | 45 | if np_img[:800, -100:].std((0,1)).mean() < 20: 46 | return False 47 | 48 | return True 49 | 50 | 51 | 52 | TestFlexbox = question >> LLMRun() >> ExtractCode(keep_main=True, lang="html/javascript") >> SeleniumDraw() >> PyFunc(check) 53 | 54 | if __name__ == "__main__": 55 | print(run_test(TestFlexbox)) 56 | 57 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Ubuntu as a parent image 2 | FROM ubuntu:latest 3 | 4 | # Update the system and install dependencies 5 | RUN apt-get update && apt-get install -y \ 6 | software-properties-common 7 | 8 | # Add the deadsnakes PPA, which contains newer Python versions 9 | RUN add-apt-repository ppa:deadsnakes/ppa 10 | 11 | # Update the system and install Python and build dependencies 12 | RUN apt-get update && apt-get install -y \ 13 | software-properties-common \ 14 | build-essential \ 15 | gcc \ 16 | curl \ 17 | 
sqlite3 \ 18 | gdb \ 19 | libssl-dev 20 | 21 | ENV DEBIAN_FRONTEND noninteractive 22 | 23 | # Install Python 3.12 24 | RUN apt-get update && apt-get install -y \ 25 | python3.12 \ 26 | python3.12-venv \ 27 | python3.12-dev \ 28 | git 29 | 30 | # Remove the EXTERNALLY-MANAGED file to allow pip installations 31 | RUN rm -f /usr/lib/python3.12/EXTERNALLY-MANAGED 32 | 33 | # Download and install pip using the get-pip.py script 34 | RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ 35 | python3.12 get-pip.py && \ 36 | rm get-pip.py 37 | 38 | # Install required Python packages using pip 39 | RUN pip install numpy scipy numba Pillow jax jaxlib python-chess torch 40 | 41 | # Install Rust 42 | RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 43 | 44 | # Ensure Rust binaries are in PATH 45 | ENV PATH="/root/.cargo/bin:${PATH}" 46 | 47 | # Create a symlink for python3 48 | RUN ln -s /usr/bin/python3 /usr/bin/python 49 | 50 | # Set the working directory in the container 51 | WORKDIR /usr/src/app 52 | 53 | # Any additional commands or environment variables can be added here 54 | 55 | # Command to run when the container launches 56 | CMD ["/bin/bash"] 57 | -------------------------------------------------------------------------------- /tests/basic_git_setup.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can guide a user in a conversation to setup a git repo." 4 | 5 | TAGS = ['bash', "git", "agent"] 6 | 7 | question = """ 8 | You are in a repository with two files `a` and `b`. Make a new git repo and then commit them both. 9 | 10 | On each round you should just give me just one command to type, I'll run it, and give you the exact output. I'll get started: 11 | 12 | INPUT: `pwd` 13 | 14 | OUTPUT: `/usr/src/app` 15 | 16 | INPUT: `ls` 17 | 18 | OUTPUT: `main.py a b` 19 | 20 | Now your turn. What do you want to run? Do not explain yourself, just give me exactly one command. Start your response with INPUT:""" 21 | 22 | 23 | def test_ok(): 24 | import os 25 | success = os.popen("git rev-list --all --count").read().strip() == "1" 26 | return success 27 | 28 | def setup(): 29 | import os 30 | open("a", "w").write("a") 31 | open("b", "w").write("b") 32 | os.popen('git config --global user.email "you@example.com"').read() 33 | os.popen('git config --global user.name "Your Name"').read() 34 | 35 | def do_extract(x): 36 | if 'INPUT:' in x: 37 | x = x.split("INPUT:")[1] 38 | try: 39 | to_send = x.split("`")[1] 40 | except: 41 | to_send = x.split("\n")[0] 42 | return to_send 43 | 44 | def do_prepare(x): 45 | print("Preparing to pass back", x) 46 | return f"OUTPUT: `{x}`.\n\nWhat command should I run next? Start your response with INPUT:" 47 | 48 | TestGitSimple = Setup(setup) >> question >> UntilDone(PyEvaluator(test_ok), (LLMConversation() >> PyFunc(do_extract) >> TerminalRun() >> PyFunc(do_prepare)), max_iters=4) >> PyEvaluator(test_ok) 49 | 50 | 51 | if __name__ == "__main__": 52 | print(run_test(TestGitSimple)) 53 | -------------------------------------------------------------------------------- /tests/jnp_nn_bugfix.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can identify and fix a bug in a given jax.numpy function." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Fix the bug in this jnp function. Write out the full correct version of the code. 
9 | 10 | import jax 11 | import jax.numpy as jnp 12 | 13 | def init_mlp_params(input_size, hidden_size, output_size, key): 14 | k1, k2 = jax.random.split(key) 15 | # Initialize weights and biases for the first linear layer 16 | w1 = jax.random.normal(k1, (hidden_size, input_size)) * jnp.sqrt(2.0 / input_size) 17 | b1 = jnp.zeros(hidden_size) 18 | # Initialize weights and biases for the second linear layer 19 | w2 = jax.random.normal(k2, (output_size, hidden_size)) * jnp.sqrt(2.0 / hidden_size) 20 | b2 = jnp.zeros(output_size) 21 | return {'w1': w1, 'b1': b1, 'w2': w2, 'b2': b2} 22 | 23 | def mlp_apply(params, x): 24 | # First linear layer 25 | x = jnp.dot(x, params['w1'].T) + params['b1'] 26 | # Relu activation 27 | x = jax.nn.relu(x) 28 | # Second linear layer 29 | x = jnp.dot(x, params['w2'].T) + params['b2'] 30 | # Softmax activation 31 | return jax.nn.softmax(x, axis=1) 32 | 33 | # Example usage: 34 | key = jax.random.PRNGKey(0) 35 | params = init_mlp_params(10, 20, 30, key) 36 | x = jax.random.normal(key, (5, 10)) # random input similar to torch.randn 37 | y = mlp_apply(params, x) 38 | print(list(y.flatten().sum(1))) # The output sum should be very close to [1, 1, 1, 1, 1] 39 | """ 40 | 41 | TestFixJnpBug = question >> LLMRun() >> ExtractCode(keep_main=True) >> \ 42 | (PythonRun() >> (SubstringEvaluator("1.0,") | SubstringEvaluator("1.00000") | SubstringEvaluator("1.0 ") | SubstringEvaluator("0.99999"))) 43 | 44 | 45 | 46 | if __name__ == "__main__": 47 | print(run_test(TestFixJnpBug)) 48 | -------------------------------------------------------------------------------- /tests/why_broken_flask_extra_brace.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test checks is the model can figure out the user has put an accidental extra brace in the request body." 4 | 5 | TAGS = ['explain', 'python'] 6 | 7 | question = """ 8 | Why isn't this working 9 | 10 | ``` 11 | @app.route('/getsimilar', methods=['GET']) 12 | def getsimilar(): 13 | print('args',request.args) 14 | positive = request.args.get('positive', '') 15 | positive = positive.split("##") 16 | 17 | negative = request.args.get('negative', '') 18 | negative = negative.split("##") 19 | 20 | keep_yes = [i for i,x in enumerate(titles) if x in positive] 21 | set1_yes = emb[keep_yes] 22 | 23 | keep_no = [i for i,x in enumerate(titles) if x in negative] 24 | set1_no = emb[keep_no] 25 | 26 | print("OK", len(set1_yes), len(set1_no)) 27 | ``` 28 | 29 | I see 30 | 31 | ``` 32 | args ImmutableMultiDict([('positive', 'Title: Towards Evaluating the Robustness of Neural Networks\nAuthors: Nicholas Carlini; David Wagner'), ('negative', 'Title: On Evaluating Adversarial Robustness Authors: Nicholas Carlini; Anish Athalye; Nicolas Papernot; Wieland Brendel; Jonas Rauber; Dimitris Tsipras; Ian Goodfellow; Aleksander Madry; Alexey Kurakin}')]) 33 | OK 1 0 34 | ``` 35 | 36 | Why don't I see OK 1 1? Instead it just says OK 1 0. List exactly one reason explaining what's wrong and what one change would fix it. 37 | """ 38 | 39 | judge = "I asked a question about why some code was incorrect. The answer is \"because the input contains an extra curly brace }\". Does this student answer say that?\n\n\n\nAnswer either 'The student passes' or 'The student fails'." 
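# A hedged reference sketch of the root cause (assumed illustration only; the
# LLM judge above decides whether an answer passes): the 'negative' query value
# ends with a stray '}' so it never string-matches the stored title, which is
# why keep_no stays empty and the log shows "OK 1 0".
def _reference_extra_brace_mismatch():
    stored_title = "... Alexey Kurakin"
    sent_title = "... Alexey Kurakin}"  # trailing brace from the malformed request
    return stored_title == sent_title   # False -> no match found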
40 | 41 | TestBrokenExtraBrace = question >> LLMRun() >> (LLMRun(judge, llm=EVAL_LLM) >> SubstringEvaluator("student passes")) 42 | 43 | if __name__ == "__main__": 44 | print(run_test(TestBrokenExtraBrace)) 45 | 46 | 47 | -------------------------------------------------------------------------------- /tests/faster_l2_diff.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can optimize a given Python program for speed and memory efficiency." 4 | 5 | TAGS = ['code', 'performance', 'python'] 6 | 7 | question = """ 8 | I have this program 9 | 10 | import numpy as np 11 | def diff(out, templates): 12 | return (out[:, None, :] - templates[None, :, :]).square().sum(2) 13 | 14 | Make it faster and more memory efficient without expanding the arrays. Call it `diff` still. 15 | """ 16 | 17 | tracer = """ 18 | import tracemalloc 19 | import numpy as np 20 | 21 | if __name__ == "__main__": 22 | tracemalloc.start() # Start tracing 23 | 24 | snapshot1 = tracemalloc.take_snapshot() # Before execution 25 | diff(np.arange(10000).reshape((100, 100)), np.arange(10000).reshape((100, 100))) 26 | snapshot2 = tracemalloc.take_snapshot() # After execution 27 | 28 | # Calculate the difference in memory usage 29 | current, peak = tracemalloc.get_traced_memory() 30 | tracemalloc.stop() # Stop tracing 31 | 32 | if peak < 10**6: 33 | print("Efficient") 34 | """ 35 | 36 | test_case, answer = make_python_test([("diff(np.arange(100).reshape((10, 10)), np.arange(100).reshape((10, 10))).sum()", "1650000")]) 37 | 38 | 39 | TestFastL2 = question >> LLMRun() >> ExtractCode() >> \ 40 | ((PythonRun(test_case) >> SubstringEvaluator(answer)) & 41 | (PythonRun(tracer) >> SubstringEvaluator("Efficient")) & 42 | (LLMRun("Below is a student's answer to a question on writing efficient code. They're supposed to be vectorizing a loop. Is the following code vectorized? It is incorrect to use for loops explicitly. Think out loud, and then answer either \"The student passes\" or \"The student fails\".\n", llm=EVAL_LLM) >> SubstringEvaluator("student passes"))) 43 | 44 | 45 | if __name__ == "__main__": 46 | print(run_test(TestFastL2)) 47 | -------------------------------------------------------------------------------- /tests/call_rust_from_python.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write rust code that can be imported from python and knows how to build it." 4 | 5 | TAGS = ['rust', 'c', 'python', 'code'] 6 | 7 | question = """I want to call some rust code from python on a linux machine. Specifically, I want to be able to run 8 | 9 | python3 -c 'import magic; print(magic.do_add(3, 4))' 10 | 11 | and have it give me the sum of these numbers. Here, do_add should be a rust function that adds two integers. 12 | 13 | Please give me a complete set of instructions for how to make this run, so that when I run this code as-is it will work. 14 | """ 15 | 16 | question2 = """ 17 | Now please convert this answer into a JSON array that packages it all up so I don't need to think and can just copy files to specific locations and then run things. Format your answer 18 | 19 | {"path/to/file": "contents of rust file", 20 | "path/to/python_file": "contents of any necessary python file that exposes do_add", 21 | "build.sh": "contents of a bash file to build the project", 22 | ...
# any other files necessary 23 | } 24 | 25 | 26 | The only thing I will do is put files in the right place and run `bash build.sh`. Do not give any explanation of the code. ONLY ANSWER WITH A SINGLE JSON OBJECT. Your answer should start ```json and end with ```. 27 | """ 28 | 29 | def is_ok(): 30 | import os 31 | os.popen("bash build.sh").read() 32 | 33 | return os.popen("python3 -c 'import magic; print(magic.do_add(31, 11))'").read() == "42\n" 34 | 35 | 36 | TestCallRustFromPy = question >> LLMConversation() >> LLMConversation(question2) >> ExtractJSON() >> MakeFilesFromJSON() >> PyEvaluator(is_ok) 37 | TestCallCFromPy = question.replace("rust", "C") >> LLMConversation() >> LLMConversation(question2.replace("rust", "C")) >> ExtractJSON() >> MakeFilesFromJSON() >> PyEvaluator(is_ok) 38 | 39 | 40 | if __name__ == "__main__": 41 | print(run_test(TestCallRustFromPy)) 42 | 43 | 44 | -------------------------------------------------------------------------------- /tests/convert_dp_to_iterative.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can understand a DP algorithm and then convert it into an iterative implementation." 4 | 5 | TAGS = ['code', 'performance', 'python'] 6 | 7 | my_code = """ 8 | @lru_cache(maxsize=None) 9 | def solve_dp(graph, lines_left, distance_since_last): 10 | if len(graph) < lines_left: return 1e9, [] 11 | if lines_left == 0 and len(graph) == 0: return 0, [] 12 | 13 | # option 1: don't take the line 14 | value = (1e9, []) 15 | if distance_since_last < 10: 16 | value = min(value, solve_dp(graph[1:], lines_left, distance_since_last+1)) 17 | 18 | # option 2: take the line 19 | if lines_left > 0: 20 | sub, how = solve_dp(graph[1:], lines_left-1, 0) 21 | sub += graph[0] + (distance_since_last-8)**2 22 | value = min(value, (sub, how + [len(graph)])) 23 | 24 | return value 25 | """ 26 | 27 | question = f""" 28 | Explain what this code does. Then, convert this recursive function to an iterative one, making explicit the cubic work that's being done. The functions should return exactly the same output but the iterative one should be faster. Call your function `solve_iterative` and match the type signature of the original function. 29 | 30 | {my_code} 31 | """ 32 | 33 | test_case, answer = make_python_test([(f"""0 34 | import random 35 | mylist = list(range(100)) 36 | random.shuffle(mylist) 37 | from functools import lru_cache 38 | 39 | {my_code.replace("solve_dp", "my_solve_dp")} 40 | answer = solve_iterative(tuple(mylist), len(mylist)//8, 4)""", 41 | "my_solve_dp(tuple(mylist), len(mylist)//8, 4)")]) 42 | 43 | def not_recursive(code): 44 | return code.count("solve_iterative") == 1, "" 45 | 46 | TestProgramRemoveDP = question >> LLMRun() >> ExtractCode() >> \ 47 | (PyFunc(not_recursive) & (PythonRun(test_case) >> SubstringEvaluator(answer))) 48 | 49 | if __name__ == "__main__": 50 | print(run_test(TestProgramRemoveDP)) 51 | -------------------------------------------------------------------------------- /tests/vectorize_small_update.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can replace a for loop with a vectorized version." 
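# One possible vectorized answer, kept as a hedged reference sketch (assumed;
# the evaluator below runs the model's code and an LLM judge, not this).
def _reference_vectorized_update(high, highest_number, delta):
    import numpy as np
    candidate = high[highest_number] + delta[highest_number] - delta
    out = np.minimum(high, candidate)
    out[highest_number] = high[highest_number]  # the pivot entry is left unchanged
    return out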
4 | 5 | TAGS = ['code', 'python', 'performance'] 6 | 7 | def update(high, highest_number, delta): 8 | for i in range(len(high)): 9 | if i != highest_number: 10 | high[i] = min(high[i], high[highest_number] + delta[highest_number] - delta[i]) 11 | return high 12 | 13 | question = """ 14 | Vectorize this update loop. Call the function `update`: 15 | 16 | import numpy as np 17 | 18 | def update(high, highest_number, delta): 19 | for i in range(len(high)): 20 | if i != highest_number: 21 | high[i] = min(high[i], high[highest_number] + delta[highest_number] - delta[i]) 22 | return high 23 | """ 24 | 25 | test_case, answer = make_python_test([("list(update(np.array([2, 5, 9, 1, 2, 0, 3, 8]), 4, np.array([1, 2, 1, 4, 3, 2, 1, 2])))", "[2, 3, 4, 1, 2, 0, 3, 3]")], header="import numpy as np") 26 | 27 | evaluation = """ 28 | Below is a student's answer to a question about how to vectorize this loop: 29 | 30 | def update(high, highest_number, delta): 31 | for i in range(len(high)): 32 | if i != highest_number: 33 | high[i] = min(high[i], high[highest_number] + delta[highest_number] - delta[i]) 34 | return high 35 | 36 | Here is their code: 37 | 38 | 39 | 40 | Did the student try to vectorize the loop? Don't tell me if they are correct or not. (I have tests for that separately which will catch any bugs.) Just tell me if they tried to vectorize the loop. End your answer with either the string "The student passes" if they tried to do a vectorized loop, or "The student fails" if they just wrote a for loop again. 41 | """ 42 | 43 | TestVectorizeSmall = question >> LLMRun() >> ExtractCode() >> ((PythonRun(test_case) >> SubstringEvaluator(answer)) & (LLMRun(evaluation) >> SubstringEvaluator("student passes"))) 44 | 45 | if __name__ == "__main__": 46 | print(run_test(TestVectorizeSmall)) 47 | -------------------------------------------------------------------------------- /tests/vague_sum_data.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can infer what data to sum and what to ignore by example with vague instructions." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = ''' 8 | Here's some data 9 | QUERY A361 10 | Found at: 160392127418 11 | Number of times present: 4 12 | 13 | 14 | Found at: 150158692866 15 | Number of times present: 3 16 | 17 | 18 | Found at: 122340445248 19 | Number of times present: 17 20 | 21 | 22 | Found at: 137248651778 23 | Number of times present: 84 24 | 25 | 26 | Found at: 131891981708 27 | Number of times present: 66 28 | 29 | 30 | QUERY 32B57 31 | Found at: 194561247604 32 | Number of times present: 4 33 | 34 | 35 | Found at: 82888490886 36 | Number of times present: 3 37 | 38 | 39 | Found at: 115836535414 40 | Number of times present: 19 41 | 42 | 43 | Found at: 79615924948 44 | Number of times present: 80 45 | 46 | Write a python function called `process` that will take data like this as an argument and return a dict where the keys are the like A361/32B57 and the values are how often its present in total. 
47 | ''' 48 | 49 | data = "QUERY A361\\nFound at: 160392127418\\nNumber of times present: 4\\n\\n\\nFound at: 150158692866\\nNumber of times present: 4\\n\\n\\nFound at: 122340445248\\nNumber of times present: 17\\n\\n\\nFound at: 137248651778\\nNumber of times present: 84\\n\\n\\nFound at: 131891981708\\nNumber of times present: 66\\n\\n\\nQUERY 32B57\\nFound at: 194561247604\\nNumber of times present: 4\\n\\n\\nFound at: 82888490886\\nNumber of times present: 3\\n\\n\\nFound at: 115836535414\\nNumber of times present: 19\\n\\n\\nFound at: 79615924948\\nNumber of times present: 81\\n" 50 | 51 | test_case, answer = make_python_test([(f"process('{data}')", "{'A361': 175, '32B57': 107}")]) 52 | 53 | TestSumSomeData = question >> LLMRun() >> ExtractCode(keep_main=True) >> PythonRun(test_case) >> SubstringEvaluator(answer) 54 | 55 | 56 | if __name__ == "__main__": 57 | print(run_test(TestSumSomeData)) 58 | 59 | 60 | -------------------------------------------------------------------------------- /llms/mistral_model.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | import requests 4 | 5 | class MistralModel: 6 | def __init__(self, name): 7 | self.name = name 8 | config = json.load(open("config.json")) 9 | self.hparams = config['hparams'] 10 | self.hparams.update(config['llms']['mistral'].get('hparams') or {}) 11 | 12 | self.api_key = config['llms']['mistral']['api_key'].strip() 13 | 14 | self.headers = { 15 | 'Authorization': f'Bearer {self.api_key}', # Adjust if the API expects a different kind of authentication 16 | 'Content-Type': 'application/json', 17 | 'Accept': 'application/json' 18 | } 19 | self.endpoint = "https://api.mistral.ai/v1/chat/completions" 20 | 21 | def make_request(self, conversation, add_image=None, max_tokens=None): 22 | # Prepare the conversation messages in the required format 23 | formatted_conversation = [ 24 | {"role": "user" if i % 2 == 0 else "assistant", "content": content} 25 | for i, content in enumerate(conversation) 26 | ] 27 | 28 | # Construct the data payload 29 | data = { 30 | "model": self.name, 31 | "messages": formatted_conversation, 32 | "max_tokens": max_tokens or 2048, 33 | } 34 | data.update(self.hparams) 35 | 36 | # Make the POST request to the API endpoint 37 | response = requests.post(self.endpoint, headers=self.headers, data=json.dumps(data)) 38 | time.sleep(1) 39 | if response.status_code == 200: 40 | # Parse and return the response content 41 | return response.json()['choices'][0]['message']['content'] 42 | else: 43 | # Handle errors or unsuccessful status codes as needed 44 | return f"API request failed with status code {response.status_code}" 45 | 46 | if __name__ == "__main__": 47 | import sys 48 | q = sys.stdin.read().strip() 49 | print(q+":", MistralModel("mistral-small").make_request([q])) 50 | -------------------------------------------------------------------------------- /tests/play_20_questions.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model is able to ask questions to get to an answer." 4 | 5 | TAGS = ['fun'] 6 | 7 | question = """ 8 | Let's play 20 questions! I'm thinking of %s. You have 20 questions to guess what it is. Ask me yes or no questions, about anything you want. Do not guess randomly or make overly specific guesses. Do not make your final guess until you are absolutely sure. (You can think out loud first, but your last sentence MUST be a question.) 
9 | 10 | Your answer should be of the format 11 | [question number]. [internal thought] 12 | 13 | Question: [your question]? 14 | 15 | If think you know the answer it should say 16 | 17 | Question: Is the answer [your guess]? 18 | " 19 | """ 20 | 21 | def is_done_animal(guess): 22 | return "llama" in guess.lower() 23 | 24 | def is_done_book(guess): 25 | return "diamond age" in guess.lower() or 'illustrated primer' in guess.lower() 26 | 27 | def take_one_word(guess): 28 | return guess.split()[0] 29 | 30 | def take_question(guess): 31 | if 'Question:' in guess: 32 | return guess.split("Question:")[-1] 33 | else: 34 | return guess 35 | 36 | TestTwentyQuestionsLlama = question%("an animal") >> LLMConversation() >> UntilDone(PyFunc(is_done_animal), (PyFunc(take_question) >> LLMRun("I'm playing 20 questions with someone. I'm thinking of a Llama. Here's their question: .\nAnswer either 'Yes' or 'No'; do not answer anything else.") >> PyFunc(take_one_word) >> LLMConversation()), max_iters=20) >> PyFunc(is_done_animal) 37 | TestTwentyQuestionsBook = question%("a book") >> LLMConversation() >> UntilDone(PyFunc(is_done_book), (PyFunc(take_question) >> LLMRun("I'm playing 20 questions with someone. I'm thinking of the book The Diamond Age by Neal Stephenson. Here's their question: .\nAnswer either 'Yes' or 'No'; do not answer anything else.") >> PyFunc(take_one_word) >> LLMConversation()), max_iters=20) >> PyFunc(is_done_book) 38 | 39 | if __name__ == "__main__": 40 | print(run_test(TestTwentyQuestionsBook)) 41 | -------------------------------------------------------------------------------- /tests/fancy_sql_process.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate a Python program that retrieves data from a SQL file." 4 | 5 | TAGS = ['sql'] 6 | 7 | question = """ 8 | I have a sqlite3 database name TEXT, favorite_friend TEXT, age INT, profession TEXT. 9 | Write me a python program that prints out the favorite friend of the youngest person who isn't anyone's favorite friend. 10 | The database is called people.db and the table is called people. 
11 | """ 12 | 13 | def setup(): 14 | import sqlite3 15 | 16 | # Create a connection object using the connect function 17 | conn = sqlite3.connect('people.db') 18 | 19 | # Create a cursor object using the cursor method 20 | cursor = conn.cursor() 21 | 22 | # Create table named people with columns for name, favorite friend, age, and profession 23 | cursor.execute('''CREATE TABLE people 24 | (name TEXT, favorite_friend TEXT, age INT, profession TEXT)''') 25 | 26 | # List of people data to insert into the people table 27 | people_data = [ 28 | ('Alice', 'Bob', 29, 'Engineer'), 29 | ('Bob', 'Alice', 32, 'Doctor'), 30 | ('Charlie', 'Alice', 28, 'Artist'), 31 | ('David', 'Eve', 35, 'Architect'), 32 | ('Eve', 'Frank', 26, 'Teacher'), 33 | ('Frank', 'Alice', 31, 'Scientist'), 34 | ('Grace', 'Heidi', 30, 'Nurse'), 35 | ('Heidi', 'Ivy', 25, 'Lawyer'), 36 | ('Ivy', 'Charlie', 34, 'Chef'), 37 | ('Judy', 'Grace', 27, 'Accountant') 38 | ] 39 | 40 | # Insert each person into the people table 41 | cursor.executemany('INSERT INTO people VALUES (?,?,?,?)', people_data) 42 | 43 | # Commit the changes 44 | conn.commit() 45 | 46 | # Close the connection 47 | conn.close() 48 | 49 | 50 | TestSqlSubquery = Setup(setup) >> question >> LLMRun() >> ExtractCode(keep_main=True, lang='python') >> Echo() >> PythonRun() >> SubstringEvaluator("Grace") 51 | 52 | if __name__ == "__main__": 53 | print(run_test(TestSqlSubquery)) 54 | -------------------------------------------------------------------------------- /tests/print_hello_poly.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate a program that prints \"hello world\" when run either as a C or a python program." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = 'Write a single file that I can save as foo.c.py that I can run as either a C program or a python program, and so both will print "hello world". Specifically if I run `python foo.c.py` or `gcc foo.c.py && ./a.out` I should see "hello world" printed to the console.' 8 | 9 | answer = "hello world" 10 | 11 | TestPrintHelloPoly = question >> LLMRun() >> ExtractCode(keep_main=True) >> ((PythonRun() >> SubstringEvaluator(answer)) & (CRun() >> SubstringEvaluator(answer))) 12 | 13 | question2 = ''' 14 | Write a single file that I can save as foo.py.rs that I can run as either a rust program or a python program, and so both will print "hello world". Specifically if I run `python foo.py.rs` or `rustc -o a.out foo.py.rs && ./a.out` I should see "hello world" printed to the console. 
15 | 16 | To help you out, here's a valid answer for how to do this in C: 17 | 18 | ``` 19 | #include 20 | #define sdf b /* 21 | print("hello world") 22 | """ 23 | #define wer b */ 24 | 25 | int main() { 26 | printf("hello world\n"); 27 | } 28 | 29 | #define ert s """ 30 | ``` 31 | ''' 32 | 33 | TestPrintHelloPoly = question >> LLMRun() >> ExtractCode(keep_main=True) >> ((PythonRun() >> SubstringEvaluator(answer)) & (CRun() >> SubstringEvaluator(answer))) 34 | TestPrintHelloPoly2 = question2 >> LLMRun() >> ExtractCode(keep_main=True) >> ((PythonRun() >> SubstringEvaluator(answer)) & (RustRun() >> SubstringEvaluator(answer))) 35 | 36 | 37 | if __name__ == "__main__": 38 | print(run_test(TestPrintHelloPoly2)) 39 | 40 | 41 | -------------------------------------------------------------------------------- /tests/extract_emails.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can accurately extract and identify invalid email addresses from a given text file. Models that are \"overly safe\" will fail." 4 | 5 | TAGS = ['data'] 6 | 7 | question = """ 8 | Extract all the email addresses that are invalid from this text file: 9 | 10 | 11 | : Domain example.com does not accept mail (nullMX) 12 | 13 | : host mail.banglemail.com[138.68.190.241] said: 550 14 | 5.1.1 : Recipient address rejected: User unknown in 15 | virtual alias table (in reply to RCPT TO command) 16 | 17 | : host emx.mail.ru[218.62.518.180] said: 550 18 | Message was not accepted -- invalid mailbox. Local mailbox 19 | againredacted@example.info is unavailable: user not found (in reply to end 20 | of DATA command) 21 | 22 | : host emx.mail.ru[217.69.139.180] said: 550 23 | Message was not accepted -- invalid mailbox. Local mailbox 24 | xsz032by05wfk5jpe@mail4you.biz is unavailable: user not found (in reply to 25 | end of DATA command) 26 | 27 | : host mta-ent.21cn.com[183.56.203.132] said: 543 28 | suspected spams or account(IP) exception (in reply to end of DATA command) 29 | 30 | : host gmail-smtp-in.l.google.com[142.250.27.26] said: 31 | 550-5.1.1 The email account that you tried to reach does not exist. Please 32 | try 550-5.1.1 double-checking the recipient's email address for typos or 33 | 34 | """ 35 | 36 | TestExtractEmail = question >> LLMRun() >> (SubstringEvaluator("foo-bar@example.com") & SubstringEvaluator("noperson628@gmail.com") & SubstringEvaluator("againredacted@example.info") & SubstringEvaluator("xsz032by05wfk5jpe@mail4you.biz") & SubstringEvaluator("wontsay@bestpay.com.cn") & SubstringEvaluator("redactedfinally@gmail.com") & ~SubstringEvaluator("gmail-smtp-in.l.google.com") & ~SubstringEvaluator("217.69.139.180")) 37 | 38 | 39 | if __name__ == "__main__": 40 | print(run_test(TestExtractEmail)) 41 | 42 | -------------------------------------------------------------------------------- /tests/rewrite_mac_crypto.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test the ability of the model to rewrite a simple c program so it will run on ubuntu, and keep bugs in place." 4 | 5 | TAGS = ['code', 'c'] 6 | 7 | question = """ 8 | Rewrite this c code so it will run on ubuntu. DO NOT FIX ANY BUGS. 
9 | 10 | ```c 11 | #include 12 | #include 13 | #include 14 | 15 | unsigned char memory[1<<20]; 16 | void* malloc_ptr = (void*)&memory; 17 | 18 | void* my_malloc(int size) { 19 | void* ptr = malloc_ptr; 20 | malloc_ptr += size; 21 | return (void*)ptr; 22 | } 23 | 24 | void compute_md5(char *str, unsigned char *hex_result) { 25 | unsigned char result[CC_MD5_DIGEST_LENGTH]; 26 | CC_MD5_CTX md5ctx; 27 | CC_MD5_Init(&md5ctx); 28 | CC_MD5_Update(&md5ctx, str, strlen(str)); 29 | CC_MD5_Final(result, &md5ctx); 30 | 31 | for (int i = 0; i < CC_MD5_DIGEST_LENGTH; i++) { 32 | sprintf((char*)(hex_result + (i * 2)), "%02x", result[i]); 33 | } 34 | } 35 | 36 | int main(int argc, char** argv) { 37 | char* msg = (char*)my_malloc(32); 38 | char* result = (char*)my_malloc(33); 39 | msg[0] = '\0'; 40 | 41 | compute_md5(argv[1], (unsigned char*)result); 42 | result[33] = '\0'; 43 | 44 | strcat(msg, "Your password is wrong, "); 45 | strcat(msg, argv[2]); 46 | 47 | if (strcmp(result, "13bd041ac6e683f0c8ddba41eeed9e67") == 0) { 48 | puts("CORRECT!"); 49 | return 0; 50 | } else { 51 | puts(msg); 52 | return 1; 53 | } 54 | } 55 | ``` 56 | """ 57 | 58 | TestProgramRewriteCCrypto = question >> LLMRun() >> ExtractCode(keep_main=True) >> \ 59 | ((CRun(gccflags="-lcrypto", argv="emacsisbetterthanvim a") >> SubstringEvaluator("CORRECT!")) & 60 | (CRun(gccflags="-lcrypto", argv="randompw a") >> SubstringEvaluator("wrong")) & 61 | (CRun(gccflags="-lcrypto", argv=f"a {'a'*8}13bd041ac6e683f0c8ddba41eeed9e67") >> SubstringEvaluator("CORRECT!"))) 62 | 63 | 64 | 65 | if __name__ == "__main__": 66 | from llm import llm 67 | print(run_test(TestProgramRewriteCCrypto)) 68 | 69 | -------------------------------------------------------------------------------- /tests/rust_word_count.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write a rust program that performs word counting." 4 | 5 | TAGS = ['code', 'rust'] 6 | 7 | question = """ 8 | Write a self-contained Rust program that prints a JSON output of how often each word appears in a text file located in /tmp/file.txt. So if the file contains "This is is is, an example" then the program should print to stdout. 9 | 10 | {"This": 1, "is": 2, "is,": 1, "an": 1, "example": 1} 11 | 12 | Don't import any crates that would require a Cargo.toml file. I'll just compile it with `rustc yourfile.rs` and it should work. The program should be run by calling /path/to/rust_word_count with no arguments 13 | """ 14 | 15 | extra = "\nDo not import serde_json, write json output from scratch." 
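# For reference, the counting rule that the `answer` dict below encodes can be
# sketched in a few lines of Python (an illustration only -- the test itself
# requires a Rust submission; the function name here is my own):
def _reference_word_count(path="/tmp/file.txt"):
    import json as _json
    counts = {}
    for word in open(path).read().split():   # whitespace split; punctuation stays attached
        counts[word] = counts.get(word, 0) + 1
    return _json.dumps(counts)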
16 | 17 | def setup(): 18 | txt = "it was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way" 19 | open("/tmp/file.txt", "w").write(txt) 20 | 21 | 22 | answer = {'the': 11, 'it': 10, 'was': 10, 'of': 10, 'we': 4, 'times,': 2, 'age': 2, 'epoch': 2, 'season': 2, 'had': 2, 'before': 2, 'us,': 2, 'were': 2, 'all': 2, 'going': 2, 'direct': 2, 'best': 1, 'worst': 1, 'wisdom,': 1, 'foolishness,': 1, 'belief,': 1, 'incredulity,': 1, 'Light,': 1, 'Darkness,': 1, 'spring': 1, 'hope,': 1, 'winter': 1, 'despair,': 1, 'everything': 1, 'nothing': 1, 'to': 1, 'Heaven,': 1, 'other': 1, 'way': 1} 23 | 24 | TestRustCount = Setup(setup) >> StringNode(question) >> LLMRun() >> ExtractCode(keep_main=True) >> RustRun() >> ExtractJSON() >> JSONSubsetEvaluator(answer) 25 | 26 | TestRustCountNoLib = Setup(setup) >> StringNode(question+extra) >> LLMRun() >> ExtractCode(keep_main=True) >> RustRun() >> ExtractJSON() >> JSONSubsetEvaluator(answer) 27 | 28 | 29 | if __name__ == "__main__": 30 | print(run_test(TestRustCountNoLib)) 31 | -------------------------------------------------------------------------------- /tests/py_image_resize.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can resize several images in a given subdirectory." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """Write function resize_image in python that open get a folder path as in put and looks for all of the images files in that folder using only pillow and resize them to 32x32 and overwrite it. 
Just give me the python code that I can run by python code.py and the default folder is /tmp """ 8 | 9 | 10 | 11 | 12 | def setup(): 13 | from PIL import Image 14 | import random 15 | import os 16 | def create_random_image(file_path): 17 | # Random size between 100x100 and 800x800 18 | width, height = random.randint(100, 800), random.randint(100, 800) 19 | # Random color 20 | color = (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) 21 | # Create an image with the random color 22 | image = Image.new("RGB", (width, height), color) 23 | # Save the image 24 | image.save(file_path) 25 | for i in range(10): 26 | file_path = os.path.join('/tmp/', f"random_image_{i+1}.jpg" if random.random() > 0.5 else f"random_image_{i+1}.jpeg" , ) 27 | create_random_image(file_path) 28 | 29 | def test(): 30 | import os 31 | from PIL import Image 32 | # Iterate over each file in the folder 33 | target_size = (32,32) 34 | folder_path = '/tmp/' 35 | for file_name in os.listdir(folder_path): 36 | file_path = os.path.join(folder_path, file_name) 37 | # Check if the file is an image 38 | if file_path.lower().endswith(('.jpg', '.jpeg')): 39 | # Open the image 40 | with Image.open(file_path) as img: 41 | # Check if the image size matches the target size 42 | if img.size != target_size: 43 | print('size is ',img.size) 44 | return False 45 | return True 46 | 47 | 48 | TestImgResize = Setup(setup) >> question >> LLMRun() >> ExtractCode(keep_main=True) >> Echo() >> PythonRun() >> PyEvaluator(test) 49 | 50 | if __name__ == "__main__": 51 | print(run_test(TestImgResize)) 52 | 53 | 54 | -------------------------------------------------------------------------------- /llms/moonshot_model.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from PIL import Image 3 | import base64 4 | 5 | from openai import OpenAI 6 | import json 7 | 8 | class MoonshotAIModel: 9 | def __init__(self, name): 10 | config = json.load(open("config.json")) 11 | api_key = config['llms']['moonshot']['api_key'].strip() 12 | self.client = OpenAI(api_key=api_key, base_url='https://api.moonshot.cn/v1') 13 | self.name = name 14 | self.hparams = config['hparams'] 15 | self.hparams.update(config['llms']['moonshot'].get('hparams') or {}) 16 | 17 | def make_request(self, conversation, add_image=None, max_tokens=None): 18 | conversation = [{"role": "user" if i%2 == 0 else "assistant", "content": content} for i,content in enumerate(conversation)] 19 | 20 | if add_image: 21 | buffered = BytesIO() 22 | add_image.convert("RGB").save(buffered, format="JPEG") 23 | img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") 24 | img_str = f"data:image/jpeg;base64,{img_str}" 25 | 26 | conversation[0]['content'] = [{"type": "text", "text": conversation[0]['content']}, 27 | { 28 | "type": "image_url", 29 | "image_url": { 30 | "url": img_str 31 | } 32 | } 33 | ] 34 | kwargs = { 35 | "messages": conversation, 36 | "max_tokens": max_tokens, 37 | } 38 | kwargs.update(self.hparams) 39 | 40 | for k,v in list(kwargs.items()): 41 | if v is None: 42 | del kwargs[k] 43 | 44 | out = self.client.chat.completions.create( 45 | model=self.name, 46 | **kwargs 47 | ) 48 | 49 | return out.choices[0].message.content 50 | 51 | if __name__ == "__main__": 52 | import sys 53 | #q = sys.stdin.read().strip() 54 | q = "hello there" 55 | print(q+":", MoonshotAIModel("moonshot-v1-8k").make_request([q])) 56 | -------------------------------------------------------------------------------- 
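# The constructor above reads credentials and sampling parameters from a
# config.json file. A minimal shape that satisfies MoonshotAIModel.__init__ is
# sketched below as a Python dict (inferred from the attribute accesses above;
# the repo's config.json.example is the authoritative template, and the
# "temperature" value is only a placeholder assumption):
_example_config_shape = {
    "hparams": {"temperature": 0.7},          # global sampling defaults
    "llms": {
        "moonshot": {
            "api_key": "sk-...",              # placeholder, not a real key
            "hparams": {},                    # optional per-provider overrides
        },
    },
}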
/tests/gol_rle_decode.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "This test case tests if the model can convert a Game of Life pattern represented in RLE format to a numpy array." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Convert this game of life pattern 9 | 10 | 26$13b2o$14bo$14bobo$15b2o14$31b2o$30b2o$32bo13$46b2o$45b2o$47bo13$61b 11 | 2o$60b2o$62bo4$52b2o$52b2o$71bo$66b2ob3o$66bobobo$68bo$67b2o3$76b2o$ 12 | 60b3o12b2o10bobo$52b3o7bo14bo8bo2bo4bo$51bo3bo5bo10b2o11b2o5b2o$50bo5b 13 | o15bobo8b2o3bo8b2o$51bo3bo11b2o6bo9b2o10b2o$52b3o8b2obo2bo2bo2bo10bo2b 14 | o$52bo2bo7b2o2b2o6bo11bobo$53b3o16bobo$53b2obo15b2o$54bobo$55bo3$52b2o 15 | 3b2o$52bobobobo$53b5o$54b3o$55bo7$54b2o$54b2o! 16 | 17 | 18 | to a numpy array. Call the function `rle_to_arr(string, h, w)` 19 | which should return a numpy array with .shape (h,w). 20 | Fill the array with 1s, and empty with 0. 21 | """ 22 | 23 | rle = """ 24 | 16b2o3b2o$2o$2o$19b2o$17b5o$21bo$17b3o$o16bo$o17b2o$bo16b2o$b2o$3bo$o 25 | 17b2o3b2o$2b2o14bobobobo$11bobo5b5o$11b2o7b3o$2o10bo8bo$b2o$b2o2$2o$bo 26 | 2$6bobo$2o3bo$bo3bo$o4bo2bo7bo$5b3o7b2o7bo$15bobo5b3o$22b5o$21b2o3b2o$ 27 | 22b5o$22b5o$14b2o6bo2bo$14bobo4bo3bo$11b2obobo4bo$11bobobo5bob2o$13bo 28 | 8bo$4bobo6b2o$7bo5b3o50bo$7bo5b3o3b2o3b2o40b4o$4bo2bo5b3o3bo5bo41b4o$ 29 | 5b3o5b2o35bo16bo2bo$13bo6bo3bo24bobo15b4o4b2o$11bobobo5b3o15b2o7bob2o 30 | 14b4o5b2o$11b2obobo21bobo6b2ob2o14bo8b2o$14bobo20bo6b3obob2o$14b2o12b 31 | 2o7bo2bo2bo2bo2bobo$28b2o7bo6b2o4bo16bobo$38bobo26b2o$39b2o27bo4b2o$ 32 | 21b2o49bo2bo$21b2o28bobo18b3o$52b2o21b2o$52bo21bo$74b2o$59bo$62bo$59bo 33 | 14bo$62bo$61bob3o8bo$59b4o2bo9b2o$64bo$52bo$51b2o$40b2o8b2o4b2o5bo$40b 34 | 2o7b3o4b2o3bobo$50b2o4b2o2bobo11b2o$4bobo44b2o6bo2bo11b2o$7bo44bo7bobo 35 | $o6bo53bobo$o3bo2bo55bo$5b3o5$4b4o$3bo3bo$7bo$3bo2bo3$o2b3o2b2o$2b5o2b 36 | o64b3o$9o67bo$9b3o63bo$o7bo2bo$9b2o3$4b2o$o3b3o$o5b2obo5b2o$bo4bo2bo5b 37 | 2o$2o4b2obo$o3b3o$4b2o2$59b3o12b2o$61bo12b2o! 
38 | """ 39 | 40 | 41 | 42 | test_case, answer = make_python_test([(f"(np.array(rle_to_arr('''{rle}''', 101, 77),dtype=np.uint32).flatten() * np.arange(77*101)).sum()", "1563304")]) 43 | 44 | TestRLEDecode = question >> LLMRun() >> ExtractCode() >> PythonRun(test_case) >> SubstringEvaluator(answer) 45 | 46 | if __name__ == "__main__": 47 | print(run_test(TestRLEDecode)) 48 | -------------------------------------------------------------------------------- /llms/vertexai_model.py: -------------------------------------------------------------------------------- 1 | import vertexai 2 | from vertexai.language_models import ChatModel, InputOutputTextPair 3 | from vertexai.preview.generative_models import GenerativeModel 4 | 5 | 6 | import json 7 | import requests 8 | 9 | class VertexAIModel: 10 | def __init__(self, name): 11 | self.name = name 12 | config = json.load(open("config.json")) 13 | self.hparams = config['hparams'] 14 | self.hparams.update(config['llms']['vertexai'].get('hparams') or {}) 15 | 16 | project_id = config['llms']['vertexai']['project_id'].strip() 17 | vertexai.init(project=project_id, location="us-central1") 18 | 19 | if 'gemini' in name: 20 | self.chat_model = GenerativeModel(name) 21 | else: 22 | self.chat_model = ChatModel.from_pretrained(name) 23 | 24 | 25 | def make_request(self, conversation, add_image=None, max_tokens=2048, stream=False): 26 | if 'gemini' in self.name: 27 | conversation = [" " if c == "" else c for c in conversation] 28 | conf = { 29 | "max_output_tokens": 2048, 30 | } 31 | conf.update(self.hparams) 32 | response = self.chat_model.generate_content(conversation, generation_config=conf) 33 | else: 34 | conversation_pairs = conversation[:-1] 35 | conversation_pairs = [(a, b) for a, b in zip(conversation_pairs[::2], conversation_pairs[1::2])] 36 | 37 | chat = self.chat_model.start_chat( 38 | examples=[ 39 | InputOutputTextPair( 40 | input_text=a, 41 | output_text=b, 42 | ) for a,b in conversation_pairs] 43 | ) 44 | conf = { 45 | "max_output_tokens": 2048, 46 | } 47 | conf.update(self.hparams) 48 | response = chat.send_message( 49 | conversation[-1], 50 | **conf 51 | ) 52 | try: 53 | return response.text 54 | except: 55 | return '' 56 | 57 | 58 | if __name__ == "__main__": 59 | import sys 60 | #q = sys.stdin.read().strip() 61 | q = "why?" 
62 | print(VertexAIModel("gemini-1.5-pro-preview-0409").make_request(["hi, how are you doing", "i'm a bit sad", q])) 63 | -------------------------------------------------------------------------------- /llms/openai_model.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from PIL import Image 3 | import base64 4 | 5 | from openai import OpenAI 6 | import json 7 | 8 | class OpenAIModel: 9 | def __init__(self, name): 10 | config = json.load(open("config.json")) 11 | api_key = config['llms']['openai']['api_key'].strip() 12 | self.client = OpenAI(api_key=api_key) 13 | self.name = name 14 | self.hparams = config['hparams'] 15 | self.hparams.update(config['llms']['openai'].get('hparams') or {}) 16 | 17 | def make_request(self, conversation, add_image=None, max_tokens=None, json=False): 18 | conversation = [{"role": "user" if i%2 == 0 else "assistant", "content": content} for i,content in enumerate(conversation)] 19 | 20 | if add_image: 21 | buffered = BytesIO() 22 | add_image.convert("RGB").save(buffered, format="JPEG") 23 | img_str = base64.b64encode(buffered.getvalue()).decode("utf-8") 24 | img_str = f"data:image/jpeg;base64,{img_str}" 25 | 26 | conversation[0]['content'] = [{"type": "text", "text": conversation[0]['content']}, 27 | { 28 | "type": "image_url", 29 | "image_url": { 30 | "url": img_str 31 | } 32 | } 33 | ] 34 | kwargs = { 35 | "messages": conversation, 36 | "max_tokens": max_tokens, 37 | } 38 | kwargs.update(self.hparams) 39 | 40 | for k,v in list(kwargs.items()): 41 | if v is None: 42 | del kwargs[k] 43 | if json: 44 | kwargs['response_format'] = { "type": "json_object" } 45 | if self.name.startswith("o1"): 46 | del kwargs['temperature'] 47 | 48 | out = self.client.chat.completions.create( 49 | model=self.name, 50 | **kwargs 51 | ) 52 | 53 | return out.choices[0].message.content 54 | 55 | if __name__ == "__main__": 56 | import sys 57 | #q = sys.stdin.read().strip() 58 | q = "hello there" 59 | print(q+":", OpenAIModel("o1-mini").make_request([q])) 60 | -------------------------------------------------------------------------------- /tests/fix_torch_backward.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can fix and explain a bug in PyTorch code related to forgetting to zero gradients." 
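# Background for this test (an illustrative aside, not part of the checked
# pipeline): PyTorch accumulates gradients across backward() calls unless they
# are explicitly reset, which is why forgetting to zero a gradient is a
# classic bug.
def _grad_accumulation_demo():
    import torch
    x = torch.tensor([2.0], requires_grad=True)
    (x * 3).sum().backward()
    (x * 3).sum().backward()
    print(x.grad)    # tensor([6.]) -- the two backward passes accumulated 3 + 3
    x.grad.zero_()   # resetting .grad gives the next step a clean slate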
4 | 5 | TAGS = ['code', 'python', 'fix'] 6 | 7 | code = """ 8 | ``` 9 | import torch 10 | import torch.nn as nn 11 | import torch.optim as optim 12 | 13 | class SimpleNet(nn.Module): 14 | def __init__(self): 15 | super(SimpleNet, self).__init__() 16 | self.fc = nn.Linear(1, 1) 17 | 18 | def forward(self, x): 19 | return self.fc(x) 20 | 21 | def PGD(model, input, target, loss_fn, epsilon, alpha, num_iter): 22 | perturbation = torch.zeros_like(input, requires_grad=True) 23 | 24 | for i in range(num_iter): 25 | output = model(input + perturbation) 26 | loss = loss_fn(output, target) 27 | 28 | loss.backward() 29 | 30 | # Update perturbation 31 | perturbation_grad = perturbation.grad.data 32 | perturbation.data = perturbation.data + alpha * perturbation_grad.sign() 33 | perturbation.data = torch.clamp(perturbation.data, -epsilon, epsilon) 34 | perturbation.grad.data.zero_() 35 | 36 | return input + perturbation 37 | 38 | model = SimpleNet() 39 | 40 | input = torch.tensor([[1.0]], requires_grad=True) 41 | target = torch.tensor([[2.0]]) 42 | 43 | loss_fn = nn.MSELoss() 44 | 45 | epsilon = 0.1 46 | alpha = 0.01 47 | num_iter = 2 48 | 49 | adversarial_input = PGD(model, input, target, loss_fn, epsilon, alpha, num_iter) 50 | print("Resulting advex", adversarial_input) 51 | ``` 52 | """ 53 | 54 | q1 = f"Fix the bug in this code. Rewrite the entire code exactly as-is but just fix any bugs.\n\n{code}" 55 | 56 | TestTorchBackwardFix = q1 >> LLMRun() >> ExtractCode(keep_main=True) >> PythonRun() >> (SubstringEvaluator("tensor") & SubstringEvaluator("grad_fn=> LLMRun() >> LLMRun("Below is a student's explanation for a bug in a torch function.\n\nDoes the student's answer say specifically \"the bug is caused by not zeroing the gradient in the backward pass\"? Think out loud and then finally answer either \"The student passes\" or \"The student fails\".", llm=EVAL_LLM) >> Echo() >> SubstringEvaluator("The student passes") 61 | 62 | 63 | if __name__ == "__main__": 64 | print(run_test(TestTorchBackwardExplain)) 65 | 66 | -------------------------------------------------------------------------------- /tests/merge_into_16.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write a Python script that merges a list of file paths into 16 files of approximately equal size." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question_easy = """ 8 | I have a list of several thousand file paths in /tmp/files. Write a python script that merges these all together and splits the content into 16 files called /tmp/merged_%d for i in 0..15 where each is approximately the same size. To do this, sort files by size (largest first) and add files in order always to the smallest one. "/tmp/files" is a list of file paths, one per line. 9 | """ 10 | 11 | question = """ 12 | I have a list of several thousand file paths in /tmp/files. Write a python script that merges the content int 16 files called /tmp/merged_%d for i in 0..15 where each is approximately the same size. "/tmp/files" is a list of file paths, one per line. 
13 | """ 14 | 15 | 16 | def setup_fn(): 17 | import random 18 | import os 19 | os.mkdir("/tmp/a") 20 | with open("/tmp/files", "w") as f: 21 | for i in range(1000): 22 | f.write("/tmp/a/file_%d\n" % i) 23 | with open("/tmp/a/file_%d" % i, "w") as f2: 24 | if i < 5: 25 | f2.write("x" * random.randint(10000, 30000)) 26 | else: 27 | f2.write("x" * random.randint(1, 1000)) 28 | 29 | def test(): 30 | import os 31 | import numpy as np 32 | sz = [] 33 | for i in range(16): 34 | sz.append(len(open("/tmp/merged_%d" % i).read())) 35 | 36 | size_before = sum(os.path.getsize("/tmp/a/file_%d"%i) for i in range(1000)) 37 | size_after = sum(sz) 38 | 39 | # I ddn't say exactly how to merge, adding newlines is okay 40 | if abs(size_before-size_after) > 2000: 41 | return False 42 | 43 | # easy to hit with proper merging, hard otherwise 44 | # 10 trials suggest the range is usualy [10, 40] 45 | return max(sz) - min(sz) < 200 46 | 47 | 48 | 49 | TestMake16FilesEasy = Setup(setup_fn) >> question_easy >> LLMRun() >> ExtractCode(keep_main=True) >> Echo() >> PythonRun() >> PyEvaluator(test) 50 | 51 | TestMake16Files = Setup(setup_fn) >> question >> LLMRun() >> ExtractCode(keep_main=True) >> Echo() >> PythonRun() >> PyEvaluator(test) 52 | 53 | 54 | if __name__ == "__main__": 55 | print(run_test(TestMake16FilesEasy)) 56 | -------------------------------------------------------------------------------- /tests/make_json.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can successfully convert unstructured data to JSON." 4 | 5 | TAGS = ['data'] 6 | 7 | question = """ 8 | Create a JSON metadata for these models: 9 | 10 | Mistral-7B-v0.1 11 | RedPajama-INCITE-7B-Base 12 | RedPajama-INCITE-Base-3B-v1 13 | falcon40b 14 | falcon7b 15 | gpt2-xl 16 | llama-65b 17 | llama-7b 18 | neo-1.3 19 | neo-2.7 20 | neo-6 21 | open_llama_3b_v2 22 | open_llama_7b_v2 23 | opt-1.3b 24 | opt-6.7b 25 | pythia-1.4 26 | pythia-1.4-dedup 27 | pythia-6.9 28 | pythia-6.9-dedup 29 | 30 | With the format: 31 | 32 | {"Mistral-7B-v0.1": {"size": 7, dataset: "", "family": "Mistral"}, ...} 33 | 34 | where family is one of 35 | 36 | base = [ 37 | 'pythia', 38 | 'llama', 39 | 'Mistral', 40 | 'gpt2', 41 | 'opt', 42 | 'RedPajama', 43 | 'neo', 44 | 'open_llama', 45 | 'falcon' 46 | ] 47 | 48 | gpt2-xl is 1.5b parameters. 
49 | 50 | """ 51 | 52 | 53 | TestMakeJson = question >> LLMRun() >> ExtractJSON() >> JSONSubsetEvaluator({ 54 | "Mistral-7B-v0.1": {"size": 7, "dataset": "", "family": "Mistral"}, 55 | "RedPajama-INCITE-7B-Base": {"size": 7, "dataset": "", "family": "RedPajama"}, 56 | "RedPajama-INCITE-Base-3B-v1": {"size": 3, "dataset": "", "family": "RedPajama"}, 57 | "falcon40b": {"size": 40, "dataset": "", "family": "falcon"}, 58 | "falcon7b": {"size": 7, "dataset": "", "family": "falcon"}, 59 | "gpt2-xl": {"size": 1.5, "dataset": "", "family": "gpt2"}, 60 | "llama-65b": {"size": 65, "dataset": "", "family": "llama"}, 61 | "llama-7b": {"size": 7, "dataset": "", "family": "llama"}, 62 | "neo-1.3": {"size": 1.3, "dataset": "", "family": "neo"}, 63 | "neo-2.7": {"size": 2.7, "dataset": "", "family": "neo"}, 64 | "neo-6": {"size": 6, "dataset": "", "family": "neo"}, 65 | "open_llama_3b_v2": {"size": 3, "dataset": "", "family": "open_llama"}, 66 | "open_llama_7b_v2": {"size": 7, "dataset": "", "family": "open_llama"}, 67 | "opt-1.3b": {"size": 1.3, "dataset": "", "family": "opt"}, 68 | "opt-6.7b": {"size": 6.7, "dataset": "", "family": "opt"}, 69 | "pythia-1.4": {"size": 1.4, "dataset": "", "family": "pythia"}, 70 | "pythia-1.4-dedup": {"size": 1.4, "dataset": "", "family": "pythia"}, 71 | "pythia-6.9": {"size": 6.9, "dataset": "", "family": "pythia"}, 72 | "pythia-6.9-dedup": {"size": 6.9, "dataset": "", "family": "pythia"} 73 | }) 74 | 75 | if __name__ == "__main__": 76 | print(run_test(TestMakeJson)) 77 | -------------------------------------------------------------------------------- /tests/emoji_movies.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "A for-fun test to see if the model can go movie title -> emoji -> movie title." 4 | 5 | TAGS = ['fun'] 6 | 7 | question = """ 8 | For each of the following ten movies give at most 5 emoji that would best describe the movie. 9 | 10 | Give your answer as a JSON array. So If I asked for 11 | ```["Finding Nemo", "Toy Story"]``` 12 | 13 | you might might answer 14 | 15 | ```json 16 | {"Finding Nemo": ["🐠", "🐟", "🐡", "🐬", "🐳"], 17 | "Toy Story": ["🚀", "⚔️,", "🤖", "👽", "🌌"]} 18 | ```. 19 | 20 | Each emoji must be a single utf8 character. ABSOLUTELY NO ZERO WIDTH JOINING. (So, max(len(emoji) for movie in result.values() for emoji in movie) == 1) 21 | 22 | Now give me answers for these movies: 23 | 24 | ```["The Lion King", "The Nightmare Before Christmas", "The Godfather", "The Matrix", "Casablanca", "Raiders of the Lost Ark", "V for Vendetta", "The Princess Bride", "Back to the Future", "Dune"]``` 25 | 26 | Give ONLY a JSON output. Nothing else. 27 | """ 28 | 29 | undo = """ 30 | For each of the following ten movies described by 5 emoji, give the movie title that best matches. 31 | 32 | Give your answer as a JSON list. So If I asked for 33 | ```[["🐠", "🐟", "🐡", "🐬", "🐳"], ["🚀", "⚔️,", "🤖", "👽", "🌌"]]``` 34 | 35 | You might answer 36 | 37 | ```json 38 | ["Finding Nemo", "Toy Story"]] 39 | ```. 40 | 41 | Now give me answers for these movies: 42 | 43 | `````` 44 | 45 | What are the names of the movie titles? 46 | """ 47 | 48 | def extract(x): 49 | try: 50 | x = json.loads(x) 51 | except: 52 | print("Failed processing") 53 | return "" 54 | send = list(x.values()) 55 | # I'll be nice... 
56 | send = [[x for x in y if len(x) <= 2] for y in send] 57 | return str(send).replace("], [", "],\n[") 58 | 59 | def count(x): 60 | try: 61 | x = json.loads(x) 62 | count = 0 63 | for correct, guessed in zip(["The Lion King", "The Nightmare Before Christmas", "The Godfather", "The Matrix", "Casablanca", "Raiders of the Lost Ark", "V for Vendetta", "The Princess Bride", "Back to the Future", "Dune"], x): 64 | if correct.lower() == guessed.lower(): 65 | count += 1 66 | return count >= 8, "OK" 67 | except: 68 | return False, "Not a JSON list" 69 | 70 | 71 | TestEmojiMovie = question >> LLMRun() >> ExtractJSON() >> PyFunc(extract) >> LLMRun(undo) >> ExtractJSON() >> PyFunc(count) 72 | 73 | if __name__ == "__main__": 74 | print(run_test(TestEmojiMovie)) 75 | -------------------------------------------------------------------------------- /tests/git_cherrypick.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can guide a user through a series of git commands to identify and cherrypick a specific commit from a branch onto the main branch." 4 | 5 | TAGS = ['bash', "git", "agent"] 6 | 7 | 8 | question = """ 9 | You are in a git repository. I have two branches, main and some other one I forgot the name of. Take just the first commit that differs between main and the other branch, and commit it onto main. 10 | 11 | On each round you should just give me just one command to type, I'll run it exactly as-is, and give you the exact output. I will not make any changes. So do not write any placholder text. I'll get started: 12 | 13 | INPUT: `pwd` 14 | 15 | OUTPUT: `/usr/src/app` 16 | 17 | INPUT: `ls` 18 | 19 | OUTPUT: `main.py a b` 20 | 21 | Now your turn. What do you want to run? Do not explain yourself, just give me exactly one command. Start your response with INPUT:""" 22 | 23 | 24 | def test_ok(): 25 | import os 26 | try: 27 | return open("e").read().strip() == "e" and not os.path.exists("d") and open("c").read().strip() == "c" 28 | except: 29 | return False 30 | 31 | return success 32 | 33 | def setup(): 34 | import os 35 | print(os.popen(""" 36 | git config --global user.name "Your Name" 37 | git config --global user.email "you@example.com" 38 | git config --global init.defaultBranch main 39 | echo "a" > a 40 | echo "b" > b 41 | git init 42 | git add a b 43 | git commit -a -m "Initial commit" 44 | git branch other 45 | git checkout other 46 | echo "c" > c 47 | git add c 48 | git commit -a -m "Add c" 49 | echo "d" > d 50 | git add d 51 | git commit -a -m "Add d" 52 | git log 53 | git checkout main 54 | echo "e" > e 55 | git add e 56 | git commit -a -m "Add e" 57 | git log 58 | """).read()) 59 | 60 | def do_extract(x): 61 | if 'INPUT:' in x: 62 | x = x.split("INPUT:")[1] 63 | try: 64 | to_send = x.split("`")[1] 65 | except: 66 | to_send = x.split("\n")[0] 67 | return to_send 68 | 69 | def do_prepare(x): 70 | print("Preparing to pass back", x) 71 | return f"I get an OUTPUT: `{x}`.\n\nWhat is the exact command I should run next? 
Start your response with INPUT:" 72 | 73 | TestGitCherrypick = Setup(setup) >> Echo() >> question >> UntilDone(PyEvaluator(test_ok), (LLMConversation() >> PyFunc(do_extract) >> TerminalRun() >> PyFunc(do_prepare)), max_iters=6) >> PyEvaluator(test_ok) 74 | 75 | 76 | if __name__ == "__main__": 77 | print(run_test(TestGitCherrypick)) 78 | -------------------------------------------------------------------------------- /tests/python_parallel_wordcount.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can parallelize a python program to perform a wordcount." 4 | 5 | TAGS = ['code', 'python', 'performance'] 6 | import collections 7 | 8 | question = """ 9 | Write a self-contained Python program that prints a JSON output of how often each word appears in a text file located in /tmp/file.txt. So if the file contains "This is is is, an example" then the program should print to stdout. 10 | 11 | {"This": 1, "is": 2, "is,":1, "an":1, "example":1} 12 | 13 | The program should be run by calling /path/to/python_word_count with no arguments. 14 | 15 | Make it run in parallel on 4 cores at the same time, each taking 1/4 of the file. 16 | Print out 4 differnt json dicts, one per line. I'll do the merging. 17 | """ 18 | 19 | 20 | 21 | def setup(): 22 | txt = "it was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way" 23 | open("/tmp/file.txt", "w").write(txt) 24 | 25 | 26 | def merge(dicts): 27 | out = {} 28 | for line in dicts.split("\n"): 29 | if line: 30 | if '{' not in line: continue 31 | try: 32 | d = json.loads(line) 33 | except: 34 | continue 35 | for k, v in d.items(): 36 | out[k] = out.get(k, 0) + v 37 | return out 38 | 39 | TestPythonCountPar = Setup(setup) >> StringNode(question) >> LLMRun() >> ExtractCode(keep_main=True) >> ((PythonRun() >> PyFunc(merge) >> EqualEvaluator(({'the': 11, 'it': 10, 'was': 10, 'of': 10, 'we': 4, 'times,': 2, 'age': 2, 'epoch': 2, 'season': 2, 'had': 2, 'before': 2, 'us,': 2, 'were': 2, 'all': 2, 'going': 2, 'direct': 2, 'best': 1, 'worst': 1, 'wisdom,': 1, 'foolishness,': 1, 'belief,': 1, 'incredulity,': 1, 'Light,': 1, 'Darkness,': 1, 'spring': 1, 'hope,': 1, 'winter': 1, 'despair,': 1, 'everything': 1, 'nothing': 1, 'to': 1, 'Heaven,': 1, 'other': 1, 'way': 1}))) & (LLMRun("Below is some student's code. I asked them to parallelize some code. Do not judge if it is correct, I have test cases for that. Just tell me if they tried to parallelize it.\n\n\n\nAnswer either 'The student passes' or 'The student fails'.", llm=EVAL_LLM) >> Echo() >> SubstringEvaluator("student passes"))) 40 | 41 | 42 | if __name__ == "__main__": 43 | print(run_test(TestPythonCountPar)) 44 | -------------------------------------------------------------------------------- /tests/debug_innerhtml_eventlistener.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model knows that editing the innerHTML clears event listeners." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = """ 8 | Why doesn't the slider onchange event fire when I run this code? 
9 | 10 | ``` 11 | 12 | 13 | 69 | 70 | ``` 71 | 72 | Tell me how to fix it. 73 | """ 74 | 75 | evaluation = """ 76 | Below is a student's answer to a question about what a particular instruction does: 77 | 78 | Does the student's say something similar to "editing the innerHTML clears event listeners"? 79 | 80 | If the answer is correct say "The student passes" otherwise "The student fails". 81 | """ 82 | 83 | TestInnerHTMLEventListener = question >> LLMRun() >> LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("student passes") 84 | 85 | 86 | if __name__ == "__main__": 87 | print(run_test(TestInnerHTMLEventListener)) 88 | -------------------------------------------------------------------------------- /tests/convert_to_c.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can rewrite a given Python program into an equivalent C program." 4 | 5 | TAGS = ['code', 'c'] 6 | 7 | question = """ 8 | Here's a python program. Please rewrite it in c with the type signature float solve_dp(float* arr, int size, int lines_left) 9 | 10 | def solve_dp(graph, lines_left): 11 | n = len(graph) 12 | 13 | dp = [[[1e9 for _ in range(10)] for _ in range(lines_left + 1)] for _ in range(n + 1)] 14 | # Initialize how as -1 indicating no decision made 15 | how = [[[-1 for _ in range(10)] for _ in range(lines_left + 1)] for _ in range(n + 1)] 16 | 17 | for i in range(n, -1, -1): 18 | for j in range(lines_left + 1): 19 | for k in range(10): 20 | if i == n and j == 0: 21 | dp[i][j][k] = 0 22 | else: 23 | # Option 1: don't take the line 24 | if i < n and k < 9: 25 | if dp[i + 1][j][k + 1] < dp[i][j][k]: 26 | dp[i][j][k] = dp[i + 1][j][k + 1] 27 | how[i][j][k] = k + 1 # Representing choosing not to take the line 28 | 29 | # Option 2: take the line 30 | if i < n and j > 0: 31 | cost = graph[i] + (k - 8)**2 32 | if cost + dp[i + 1][j - 1][0] < dp[i][j][k]: 33 | dp[i][j][k] = cost + dp[i + 1][j - 1][0] 34 | how[i][j][k] = 0 # Representing choosing to take the line 35 | 36 | # Reconstruct the solution 37 | 38 | i, j, k = 0, lines_left, 6 39 | taken_lines = [] 40 | while i < n: 41 | if how[i][j][k] == 0: # Chose to take the line 42 | taken_lines.append(n - i) 43 | i += 1 44 | j -= 1 45 | k = 0 46 | else: # Chose not to take the line 47 | i += 1 48 | k += 1 49 | 50 | return dp[0][lines_left][6] 51 | """ 52 | 53 | test_case, answer = make_c_test([("solve_dp(arr, 100, 100)", "11290")], header="float arr[] = {71, 89, 34, 63, 19, 94, 54, 61, 88, 20, 66, 46, 26, 87, 55, 81, 6, 2, 72, 75, 98, 78, 24, 95, 73, 7, 56, 48, 14, 99, 64, 51, 69, 77, 28, 47, 8, 22, 49, 3, 62, 32, 10, 82, 35, 18, 85, 60, 83, 23, 5, 40, 41, 68, 53, 52, 44, 45, 65, 84, 93, 25, 13, 1, 31, 11, 12, 97, 38, 0, 43, 90, 36, 70, 33, 17, 21, 30, 16, 15, 74, 67, 58, 37, 39, 96, 79, 29, 27, 92, 86, 9, 80, 42, 57, 91, 59, 4, 76, 50};") 54 | 55 | 56 | TestProgramRewriteC = question >> LLMRun() >> ExtractCode() >> CRun(test_case) >> SubstringEvaluator(answer) 57 | 58 | if __name__ == "__main__": 59 | print(run_test(TestProgramRewriteC)) 60 | -------------------------------------------------------------------------------- /tests/fix_with_patch.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can generate a .patch file to fix a bug in a given Python code." 
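# Context for the bug the patch has to fix, as I read the tokenizer below
# (illustration only): re.findall returns the capturing group's value when the
# pattern contains exactly one group, so numbers matched by the \d+ branch come
# back as empty strings.
def _findall_group_demo():
    import re
    broken = re.findall(r'\d+|(\+|\-|\*|/|sin|cos|\(|\))', "cos(1)+23")
    print(broken)   # ['cos', '(', '', ')', '+', ''] -- the digits are lost
    fixed = re.findall(r'\d+|\+|\-|\*|/|sin|cos|\(|\)', "cos(1)+23")
    print(fixed)    # ['cos', '(', '1', ')', '+', '23']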
4 | 5 | TAGS = ['code', 'fix', 'python'] 6 | 7 | code = """import re 8 | 9 | class Tokenizer: 10 | def __init__(self, input_str): 11 | ''' 12 | input_str: a string of digits and operators 13 | ''' 14 | self.position = 0 15 | self.tokens = re.findall(r'\d+|(\+|\-|\*|/|sin|cos|\(|\))', input_str) 16 | 17 | def get_next_token(self): 18 | if self.position >= len(self.tokens): 19 | return None 20 | token = self.tokens[self.position] 21 | self.position += 1 22 | return token 23 | 24 | def peek_next_token(self): 25 | if self.position >= len(self.tokens): 26 | return None 27 | return self.tokens[self.position] 28 | """ 29 | 30 | question = f""" 31 | Write a .patch file to stdout that will fix the bug in this tokenizer. Still keep the new file name the same after patching. 32 | 33 | I'll evaluate you based on my ability to do 34 | - echo [your patch] > fix.patch 35 | - patch < fix.patch 36 | - python3 run_tokenizer.py 37 | 38 | 39 | 40 | Here is the entirety of run_tokenizer.py: 41 | ```python 42 | {code} 43 | ``` 44 | """ 45 | 46 | def setup_fn(): 47 | code = """import re 48 | 49 | class Tokenizer: 50 | def __init__(self, input_str): 51 | ''' 52 | input_str: a string of digits and operators 53 | ''' 54 | self.position = 0 55 | self.tokens = re.findall(r'\d+|(\+|\-|\*|/|sin|cos|\(|\))', input_str) 56 | 57 | def get_next_token(self): 58 | if self.position >= len(self.tokens): 59 | return None 60 | token = self.tokens[self.position] 61 | self.position += 1 62 | return token 63 | 64 | def peek_next_token(self): 65 | if self.position >= len(self.tokens): 66 | return None 67 | return self.tokens[self.position]""" 68 | import time 69 | open('run_tokenizer.py', 'w').write(code) 70 | 71 | time.sleep(.5) 72 | 73 | 74 | def check(): 75 | import os 76 | import time 77 | os.system("patch < fix.patch") 78 | time.sleep(.5) 79 | import run_tokenizer 80 | return run_tokenizer.Tokenizer("cos(1)+sin(23)*5").tokens == ['cos', '(', '1', ')', '+', 'sin', '(', '23', ')', '*', '5'] 81 | 82 | 83 | TestFixPatch = Setup(setup_fn) >> question >> LLMRun() >> ExtractCode(manual="Take the below answer to my question and return just the complete .patch in a single file so I can copy and paste it into an editor and apply it with the `patch` tool as-is. Include nothing else other than the patch. Here is the code: ") >> MakeFile("fix.patch") >> PyEvaluator(check) 84 | 85 | 86 | if __name__ == "__main__": 87 | print(run_test(TestFixPatch)) 88 | -------------------------------------------------------------------------------- /tests/shorten_c_function_hard.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can significantly shorten a repetitive C functions." 4 | 5 | TAGS = ['code', 'c'] 6 | 7 | question = """ 8 | Code golf this function to be a lot shorter. You only get points if it's <300 bytes (not counting whitespace). Keep the function named `count`. 
9 | 10 | ``` 11 | 12 | int count() { 13 | int ROWS = 8; 14 | int COLS = 8; 15 | int count = 0; 16 | for (int i = 0; i < 1<<16; i++) { 17 | long long unsigned int array = ((i & 0xf) << 9) | ((i & 0xf0) << (9 + 8 - 4)) | ((i & 0xf00) << (9 + 8 - 4 + 8 - 4)) | ((i & 0xf000) << (9 + 8 - 4 + 8 - 4 + 8 - 4)); 18 | long long unsigned int init = array; 19 | 20 | long long unsigned int temp = array; 21 | 22 | for (int i = 1; i < ROWS - 1; i++) { 23 | for (int j = 1; j < COLS - 1; j++) { 24 | // Calculate bit positions 25 | int pos = i * COLS + j; 26 | int topPos = (i - 1) * COLS + j; 27 | int leftPos = i * COLS + (j - 1); 28 | int bottomPos = (i + 1) * COLS + j; 29 | int rightPos = i * COLS + (j + 1); 30 | 31 | // Get the values of the neighbors 32 | int cell = (temp >> pos) & 1; 33 | int top = (temp >> topPos) & 1; 34 | int left = (temp >> leftPos) & 1; 35 | int bottom = (temp >> bottomPos) & 1; 36 | int right = (temp >> rightPos) & 1; 37 | 38 | // Sum the values of the neighbors 39 | int sum = top + left + bottom + right; 40 | 41 | // Update the cell if all neighbors are 0 and the cell itself is 1 42 | if (sum == 0 && cell == 1) { 43 | array &= ~(1ULL << pos); 44 | } 45 | } 46 | } 47 | 48 | count += (init == array); 49 | } 50 | return count; 51 | } 52 | ``` 53 | """ 54 | 55 | test_case, answer = make_c_test([("count()", "27488")]) 56 | 57 | def short_enough(num): 58 | return lambda code: len(code.replace(" ", "").replace("\n", "").replace("\t", "")) < num 59 | 60 | step = "\nTo do this follow these steps: \n1. Start out explaining what the function does in words.\n2. Then find a few ways that you can make the function shorter given this explanation.\n3. Write the function as short as possible.\n4. See if you can make it any shorter. Return this as the final answer and call it your final answer. Make sure to name it `count` still." 61 | 62 | TestShortenC2Hard = question.replace("<300", "<200") >> LLMRun() >> ExtractCode() >> (PyFunc(short_enough(200)) & (CRun(test_case) >> SubstringEvaluator(answer))) 63 | TestShortenC2 = question >> LLMRun() >> ExtractCode() >> (PyFunc(short_enough(300)) & (CRun(test_case) >> SubstringEvaluator(answer))) 64 | TestShortenC2Step = question + step >> LLMRun() >> ExtractCode() >> (PyFunc(short_enough(300)) & (CRun(test_case) >> SubstringEvaluator(answer))) 65 | 66 | 67 | if __name__ == "__main__": 68 | from llm import llm 69 | print(run_test(TestShortenC2Step)) 70 | 71 | 72 | -------------------------------------------------------------------------------- /tests/implement_assembly_interpreter_by_example.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can implement an interpreter for a new assembly language given an example." 
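# The assembly dialect shown below uses '//' comments and 'name:' labels. A
# hedged sketch of the pre-pass an evaluate() implementation would likely need
# (illustration only -- the helper name is my own, and the test itself only
# checks the memory state returned by evaluate()):
def _parse_assembly(program):
    labels, instructions = {}, []
    for raw_line in program.splitlines():
        line = raw_line.split("//")[0].strip()   # drop comments and whitespace
        if not line:
            continue
        if line.endswith(":"):                   # label -> index of next instruction
            labels[line[:-1]] = len(instructions)
        else:
            instructions.append(line.split())    # e.g. ['SET', 'R1', '2']
    return labels, instructions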
4 | 5 | TAGS = ['code', 'python'] 6 | 7 | primes = """ 8 | 9 | SET R1 2 // Starting number to check for prime 10 | 11 | start_find_primes: 12 | 13 | JMP is_prime // Control will return after executing is_prime with R1 as input and R2 containing the result 14 | 15 | ready_prime: 16 | SET R7 1 17 | EQ R2 R7 // Check if R2 is 1 (prime) 18 | JF increment // If not prime, skip storing and increment the number 19 | 20 | // Store prime number in memory and increment count 21 | STORE R1 R8 // Store prime number at address pointed by R8 22 | INC R8 // Increment prime count 23 | 24 | // Check if 100 primes are found 25 | SET R7 100 26 | EQ R8 R7 27 | JF increment // If not 100 primes found yet, continue finding 28 | 29 | JMP end // If 100 primes found, end program 30 | 31 | increment: 32 | INC R1 // Increment number to check for prime 33 | JMP start_find_primes // Check next number 34 | 35 | is_prime: 36 | SET R2 1 // Assume number is prime initially 37 | SET R3 2 // Start divisor from 2 38 | 39 | start_loop: // Label to start the loop 40 | // Check if we have exceeded the square root of R1 41 | MUL R4 R3 R3 // R4 = R3 * R3 42 | GT R4 R1 // Set flag if R4 > R1 43 | JT is_prime_end // If not exceeded, continue; else, end loop 44 | 45 | MOD R6 R1 R3 // R6 = R1 % R3 46 | SET R7 0 47 | EQ R7 R6 // Check if R6 is 0 48 | JT not_prime // If yes, number is not prime 49 | 50 | INC R3 // Increment divisor 51 | JMP start_loop // Repeat loop 52 | 53 | not_prime: 54 | SET R2 0 // Set result to 0 (not prime) 55 | 56 | is_prime_end: 57 | JMP ready_prime 58 | 59 | end: 60 | """ 61 | 62 | code = """ 63 | SET R1 0 64 | SET R2 1 65 | loop: 66 | MUL R3 R2 R2 67 | STORE R3 R1 68 | INC R1 69 | INC R2 70 | SET R3 20 71 | EQ R1 R3 72 | JF loop 73 | """ 74 | 75 | question = f"""Here is an example program from a new assmebly language I made up that computes primes: 76 | 77 | ``` 78 | {primes} 79 | ``` 80 | 81 | ite me a python interpreter `evaluate(str)` that returns the resulting memory state after running the program. For example, `evaluate(program)` should return `[2, 3, 5, 7, ...]` for the above program. 82 | """ 83 | 84 | 85 | test_case, answer = make_python_test([(f'evaluate("""{code}""")[:10]', "[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]"), 86 | (f'evaluate("""{primes}""")[:10]', "[2, 3, 5, 7, 11, 13, 17, 19, 23, 29]") 87 | ]) 88 | 89 | TestImplementAssemblyByExample = question >> LLMRun() >> ExtractCode(lang="python") >> Echo() >> PythonRun(test_case) >> SubstringEvaluator(answer) 90 | 91 | 92 | if __name__ == "__main__": 93 | print(run_test(TestImplementAssemblyByExample)) 94 | 95 | 96 | -------------------------------------------------------------------------------- /tests/whisper_merge.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can implement some string logic given a fuzzy description." 
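# The merging rule asked for below can be sketched roughly as follows. This is
# a hedged illustration only: it assumes each parsed text fragment carries its
# own leading space (as Whisper-style output does) and that a fragment ending
# in '.' closes a sentence; the expected strings in the test case are the
# ground truth.
def _merge_sketch(segments):
    sentences, current = [], ""
    for _start, _end, text in segments:
        current += text
        if text.endswith("."):
            sentences.append(current)
            current = ""
    if current:                       # keep a trailing unfinished sentence
        sentences.append(current)
    return sentences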
4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Given a parse function like 9 | 10 | def parse_transcript(content): 11 | transcript_segments = [] 12 | pattern = r'\[(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\] (.*)' 13 | matches = re.findall(pattern, content) 14 | 15 | for start_time, end_time, text in matches: 16 | start_seconds = timedelta(hours=int(start_time[:2]), minutes=int(start_time[3:5]), seconds=int(start_time[6:8]), milliseconds=int(start_time[9:])).total_seconds() 17 | end_seconds = timedelta(hours=int(end_time[:2]), minutes=int(end_time[3:5]), seconds=int(end_time[6:8]), milliseconds=int(end_time[9:])).total_seconds() 18 | transcript_segments.append((start_seconds, end_seconds, text)) 19 | 20 | return transcript_segments 21 | 22 | 23 | for data like 24 | 25 | [00:00:14.240 --> 00:00:14.420] the 26 | [00:00:14.420 --> 00:00:14.860] robust 27 | [00:00:14.860 --> 00:00:15.020] ness 28 | [00:00:15.020 --> 00:00:15.140] of 29 | [00:00:15.140 --> 00:00:15.500] neural 30 | [00:00:15.500 --> 00:00:15.870] networks 31 | [00:00:15.870 --> 00:00:16.200] . 32 | [00:00:16.200 --> 00:00:16.410] And 33 | [00:00:16.410 --> 00:00:16.700] this 34 | [00:00:16.700 --> 00:00:16.840] is 35 | [00:00:16.840 --> 00:00:17.200] joint 36 | 37 | 38 | write a merge() function that (1) calls parse and (2) merges words into sentences that end in period. Start with the first word and end with the period. Return a list of strings. 39 | """ 40 | 41 | test_case, answer = make_python_test([("""merge("[00:00:14.240 --> 00:00:14.420] the\\n[00:00:14.420 --> 00:00:14.860] robust\\n[00:00:14.860 --> 00:00:15.020] ness\\n[00:00:15.020 --> 00:00:15.140] of\\n[00:00:15.140 --> 00:00:15.500] neural\\n[00:00:15.500 --> 00:00:15.870] networks\\n[00:00:15.870 --> 00:00:16.200] .\\n[00:00:16.200 --> 00:00:16.410] And\\n[00:00:16.410 --> 00:00:16.700] this\\n[00:00:16.700 --> 00:00:16.840] is\\n[00:00:16.840 --> 00:00:17.200] joint\\n")""", "[' the robustness of neural networks.', ' And this is joint']")], header=""" 42 | def parse_transcript(content): 43 | transcript_segments = [] 44 | pattern = r'\[(\d{2}:\d{2}:\d{2}\.\d{3}) --> (\d{2}:\d{2}:\d{2}\.\d{3})\] (.*)' 45 | matches = re.findall(pattern, content) 46 | 47 | for start_time, end_time, text in matches: 48 | start_seconds = timedelta(hours=int(start_time[:2]), minutes=int(start_time[3:5]), seconds=int(start_time[6:8]), milliseconds=int(start_time[9:])).total_seconds() 49 | end_seconds = timedelta(hours=int(end_time[:2]), minutes=int(end_time[3:5]), seconds=int(end_time[6:8]), milliseconds=int(end_time[9:])).total_seconds() 50 | transcript_segments.append((start_seconds, end_seconds, text)) 51 | 52 | return transcript_segments 53 | """) 54 | 55 | 56 | TestWhisperMerge = question >> LLMRun() >> ExtractCode() >> PythonRun(test_case) >> SubstringEvaluator(answer) 57 | 58 | if __name__ == "__main__": 59 | print(run_test(TestWhisperMerge)) 60 | -------------------------------------------------------------------------------- /tests/rust_parallel_wordcount.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write a rust program that performs parallel word counting." 4 | 5 | TAGS = ['code', 'rust', 'performance'] 6 | import collections 7 | 8 | question = """ 9 | Write a self-contained Rust program that prints a JSON output of how often each word appears in a text file located in /tmp/file.txt. 
So if the file contains "This is is is, an example" then the program should print to stdout. 10 | 11 | {"This": 1, "is": 2, "is,": 1, "an": 1, "example": 1} 12 | 13 | Don't import any crates that would require a Cargo.toml file. I'll just compile it with `rustc yourfile.rs` and it should work. The program should be run by calling /path/to/rust_word_count with no arguments. 14 | 15 | Make it run in parallel on 4 cores at the same time, each job taking ~1/4 of the file. 16 | Print out 4 differnt json dicts, one per line. I'll do the merging. 17 | """ 18 | 19 | extra = "\nDo not import serde_json, write json output from scratch." 20 | 21 | def setup(): 22 | txt = "it was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us, we had nothing before us, we were all going direct to Heaven, we were all going direct the other way" 23 | open("/tmp/file.txt", "w").write(txt) 24 | 25 | 26 | def merge(dicts): 27 | out = {} 28 | for line in dicts.split("\n"): 29 | if line: 30 | try: 31 | d = json.loads(line) 32 | except: 33 | continue 34 | for k, v in d.items(): 35 | out[k] = out.get(k, 0) + v 36 | return sorted(out.items()) 37 | 38 | answer = sorted({'the': 11, 'it': 10, 'was': 10, 'of': 10, 'we': 4, 'times,': 2, 'age': 2, 'epoch': 2, 'season': 2, 'had': 2, 'before': 2, 'us,': 2, 'were': 2, 'all': 2, 'going': 2, 'direct': 2, 'best': 1, 'worst': 1, 'wisdom,': 1, 'foolishness,': 1, 'belief,': 1, 'incredulity,': 1, 'Light,': 1, 'Darkness,': 1, 'spring': 1, 'hope,': 1, 'winter': 1, 'despair,': 1, 'everything': 1, 'nothing': 1, 'to': 1, 'Heaven,': 1, 'other': 1, 'way': 1}.items()) 39 | 40 | judge = "Below is some student's code. I asked them to parallelize some code. Do not judge if it is correct, I have test cases for that. Logic errors are okay. So are synatx errors. Just tell me if they tried to parallelize it.\n\n\n\nAnswer either 'The student passes' or 'The student fails'." 41 | 42 | TestRustParCount = Setup(setup) >> StringNode(question) >> LLMRun() >> ExtractCode(keep_main=True) >> ((RustRun() >> PyFunc(merge) >> EqualEvaluator((answer))) & (LLMRun(judge, llm=EVAL_LLM) >> SubstringEvaluator("student passes"))) 43 | 44 | TestRustParCountNoLib = Setup(setup) >> StringNode(question+extra) >> LLMRun() >> ExtractCode(keep_main=True) >> ((RustRun() >> PyFunc(merge) >> EqualEvaluator((answer))) & (LLMRun(judge, llm=EVAL_LLM) >> SubstringEvaluator("student passes"))) 45 | 46 | 47 | if __name__ == "__main__": 48 | print(run_test(TestRustParCountNoLib)) 49 | -------------------------------------------------------------------------------- /tests/debug_broken_code_parcount.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can explain a bug in a parallelized wordcount function." 4 | 5 | TAGS = ['explain', 'python', 'fix'] 6 | import collections 7 | 8 | question = """ 9 | What is the bug in this code that makes it not count right. (I want to print out 4 arrays un-merged. I'll do the merging later. This is intentional.) 
10 | 11 | #!/usr/bin/env python3 12 | import json 13 | import multiprocessing 14 | 15 | def count_words_in_chunk(start, end, queue): 16 | '''Count how often each word appears in the given chunk of the file.''' 17 | word_counts = {} 18 | with open('/tmp/file.txt', 'r') as f: 19 | f.seek(start) 20 | text = f.read(end - start) 21 | # Avoid cutting words in half at the beginning 22 | if start != 0: 23 | text = text.split(' ', 1)[-1] 24 | # Avoid cutting words in half at the end 25 | f.seek(end) 26 | remainder = f.readline() 27 | text += remainder 28 | # Tokenize and count words 29 | for word in text.split(): 30 | word = word.strip().lower() # Lowercasing for consistent counting 31 | if word: 32 | word_counts[word] = word_counts.get(word, 0) + 1 33 | # Put result to the queue 34 | queue.put(word_counts) 35 | 36 | def main(): 37 | # Get the size of the file 38 | file_size = 0 39 | with open('/tmp/file.txt', 'r') as f: 40 | f.seek(0, 2) 41 | file_size = f.tell() 42 | 43 | # Calculate chunk sizes for 4 cores 44 | chunk_size = file_size // 4 45 | offsets = [(i * chunk_size, (i + 1) * chunk_size) for i in range(4)] 46 | # Last chunk should go until the end of the file 47 | offsets[-1] = (offsets[-1][0], file_size) 48 | 49 | # Queue for inter-process communication 50 | queue = multiprocessing.Queue() 51 | 52 | # Start 4 worker processes 53 | processes = [] 54 | for start, end in offsets: 55 | p = multiprocessing.Process(target=count_words_in_chunk, args=(start, end, queue)) 56 | processes.append(p) 57 | p.start() 58 | 59 | # Collect results 60 | for p in processes: 61 | p.join() 62 | 63 | # Print results as JSON objects one-per-process 64 | while not queue.empty(): 65 | word_count = queue.get() 66 | print(json.dumps(word_count)) 67 | 68 | if __name__ == "__main__": 69 | main() 70 | 71 | 72 | List exactly one reason. 73 | """ 74 | 75 | 76 | TestWhyBuggyPythonCountPar = question >> LLMRun() >> LLMRun("Below is a student's explanation for why some parallel wordcounting code is not working correctly. You don't need to see the full code, just tell me whether or not the student says that the main reason the code is broken is because 'the chunks could contain the same data or be overlapping' or something like this. Saying 'a word might be split in half' is wrong. Saying 'you might miss a word' is also wrong. Here is their answer:\n\n\n\nTell me either 'The student passes' or 'The student fails'.", llm=EVAL_LLM) >> Echo() >> SubstringEvaluator("student passes") 77 | 78 | 79 | if __name__ == "__main__": 80 | print(run_test(TestWhyBuggyPythonCountPar)) 81 | -------------------------------------------------------------------------------- /tests/fix_threading_issue.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | 4 | DESCRIPTION = "Test if the model can explain a poorly worded error message in a short threaded python program." 
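# Background for the error in this test (an illustrative aside): a
# multiprocessing.Pool ships worker results -- including raised exceptions --
# back to the parent via pickle. An exception class whose __init__ has required
# keyword-only arguments cannot be re-instantiated on the receiving side, so
# the parent sees a confusing TypeError like the one in the traceback below.
# The class here is a hypothetical stand-in, not the real openai exception.
class _NeedsContext(Exception):
    def __init__(self, message, *, response, body):
        super().__init__(message)
        self.response, self.body = response, body

def _unpicklable_exception_demo():
    import pickle
    err = _NeedsContext("boom", response=None, body=None)
    try:
        pickle.loads(pickle.dumps(err))   # dumps works; loads re-calls __init__ with only args
    except TypeError as exc:
        print(exc)  # ... missing 2 required keyword-only arguments: 'response' and 'body'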
5 | 6 | TAGS = ['code', 'python', 'explain'] 7 | 8 | 9 | question = """In this program 10 | 11 | from multiprocessing import Pool 12 | from openai import OpenAI 13 | 14 | txt = open("/tmp/aa.txt").read().split("\n\n") 15 | 16 | from multiprocessing import Pool 17 | import subprocess 18 | 19 | client = OpenAI(api_key="sk-XXXXXXXXXXXXXX") 20 | def create_speech_file(index, text): 21 | response = client.audio.speech.create( 22 | model="tts-1-hd", 23 | voice="nova", 24 | input=text 25 | ) 26 | filename = f"output{index}.mp3" 27 | response.stream_to_file(filename) 28 | return filename 29 | 30 | def merge_mp3(files, output): 31 | with open("filelist.txt", "w") as file: 32 | for f in files: 33 | file.write(f"file '{f}'\n") 34 | 35 | cmd = ["ffmpeg", "-f", "concat", "-safe", "0", "-i", "filelist.txt", "-c", "copy", output] 36 | subprocess.run(cmd) 37 | 38 | if __name__ == '__main__': 39 | # Step 1: Parallelize speech file creation 40 | with Pool(8) as pool: 41 | mp3_files = pool.starmap(create_speech_file, enumerate(txt)) 42 | 43 | # Step 2: Merge the generated MP3 files 44 | output_file = "merged_output.mp3" 45 | merge_mp3(mp3_files, output_file) 46 | 47 | Why am I getting this error? 48 | 49 | Exception in thread Thread-3: 50 | Traceback (most recent call last): 51 | File "/opt/homebrew/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/threading.py", line 980, in _bootstrap_inner 52 | self.run() 53 | File "/opt/homebrew/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/threading.py", line 917, in run 54 | self._target(*self._args, **self._kwargs) 55 | File "/opt/homebrew/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/pool.py", line 576, in _handle_results 56 | task = get() 57 | File "/opt/homebrew/Cellar/python@3.9/3.9.16/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/connection.py", line 251, in recv 58 | return _ForkingPickler.loads(buf.getbuffer()) 59 | TypeError: __init__() missing 2 required keyword-only arguments: 'response' and 'body' 60 | 61 | """ 62 | 63 | TestQuestionThreadedFix = question >> LLMRun() >> LLMRun("I showed a student some code with a bug. I then tested their understanding by asking them to explain hwo to fix the bug. I'm going to show you their answer, I want you to evaluate whether or not their answer says something to the effect of \"The function create_speech_file is raising an exception that can not be pickled.\". Do not judge their reasonining, or if they know how to fix it, just tell me if they correctly identified that there is a crash in the create_speech_file function. \n Student Answer: \"\"\n\n Think out loud then answer either \"The student passes\" or \"The student fails\".", llm=EVAL_LLM) >> SubstringEvaluator("student passes") 64 | 65 | 66 | if __name__ == "__main__": 67 | print(run_test(TestQuestionThreadedFix)) 68 | -------------------------------------------------------------------------------- /tests/shorten_c_function.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can significantly shorten a repetitive C functions." 4 | 5 | TAGS = ['code', 'c'] 6 | 7 | question = """ 8 | Code golf this function to be a lot shorter. You only get points if it's <300 bytes (not counting whitespace). Keep the name `stepper`. 
9 | 10 | ``` 11 | void stepper(int xw) { 12 | int Steps = 0; 13 | for (int x = 0; x < xw; x++) { 14 | switch (Steps) { 15 | case 0: 16 | digitalWrite(1, 0); 17 | digitalWrite(2, 0); 18 | digitalWrite(3, 0); 19 | digitalWrite(4, 1); 20 | break; 21 | case 1: 22 | digitalWrite(1, 0); 23 | digitalWrite(2, 0); 24 | digitalWrite(3, 1); 25 | digitalWrite(4, 1); 26 | break; 27 | case 2: 28 | digitalWrite(1, 0); 29 | digitalWrite(2, 0); 30 | digitalWrite(3, 1); 31 | digitalWrite(4, 0); 32 | break; 33 | case 3: 34 | digitalWrite(1, 0); 35 | digitalWrite(2, 1); 36 | digitalWrite(3, 1); 37 | digitalWrite(4, 0); 38 | break; 39 | case 4: 40 | digitalWrite(1, 0); 41 | digitalWrite(2, 1); 42 | digitalWrite(3, 0); 43 | digitalWrite(4, 0); 44 | break; 45 | case 5: 46 | digitalWrite(1, 1); 47 | digitalWrite(2, 1); 48 | digitalWrite(3, 0); 49 | digitalWrite(4, 0); 50 | break; 51 | case 6: 52 | digitalWrite(1, 1); 53 | digitalWrite(2, 0); 54 | digitalWrite(3, 0); 55 | digitalWrite(4, 0); 56 | break; 57 | case 7: 58 | digitalWrite(1, 1); 59 | digitalWrite(2, 0); 60 | digitalWrite(3, 0); 61 | digitalWrite(4, 1); 62 | break; 63 | default: 64 | digitalWrite(1, 0); 65 | digitalWrite(2, 0); 66 | digitalWrite(3, 0); 67 | digitalWrite(4, 0); 68 | break; 69 | } 70 | Steps = (Steps + 1) % 8; 71 | } 72 | } 73 | ``` 74 | """ 75 | 76 | test_case, answer = make_c_test([("(stepper(100),out)", "-1475775710")], extra_methods=""" 77 | #include 78 | 79 | int out = 0; 80 | int digitalWrite(int pin, int value) { 81 | out *= 101; 82 | out += pin; 83 | out *= 97; 84 | out += value; 85 | return out; 86 | } 87 | """) 88 | 89 | def short_enough(num): 90 | return lambda code: len(code.replace(" ", "").replace("\n", "").replace("\t", "")) < num 91 | 92 | step = "\nTo do this follow these steps: \n1. start out by creating a table of the values that are being written to the pins.\n2. Then find an expression for this.\n3. Write the function as short as possible.\n4. See if you can make it any shorter. Return this as the final answer and call it your final answer. Make sure to name it stepper still." 93 | 94 | TestShortenCHard = question.replace("<300", "<200") >> LLMRun() >> ExtractCode() >> (PyFunc(short_enough(200)) & (CRun(test_case) >> SubstringEvaluator(answer))) 95 | TestShortenC = question >> LLMRun() >> ExtractCode() >> (PyFunc(short_enough(300)) & (CRun(test_case) >> SubstringEvaluator(answer))) 96 | TestShortenCStep = question + step >> LLMRun() >> ExtractCode() >> (PyFunc(short_enough(300)) & (CRun(test_case) >> SubstringEvaluator(answer))) 97 | 98 | 99 | if __name__ == "__main__": 100 | from llm import llm 101 | print(run_test(TestShortenCHard)) 102 | 103 | -------------------------------------------------------------------------------- /tests/decompile_py_mid.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can disassemble Python bytecode and create a function that returns a list of prime numbers and their negations." 
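# A hedged reading of the bytecode listed below (my reconstruction, for
# reference only -- the test itself only checks the output of foo(20)):
def _foo_reference(max_number):
    primes = []
    for possible_prime in range(2, max_number + 1):
        is_prime = True
        for num in range(2, int(possible_prime ** 0.5) + 1):
            if possible_prime % num == 0:
                is_prime = False
                break
        if is_prime:
            primes.append(possible_prime)
            if possible_prime % 2 != 0:   # odd primes also get their negation appended
                primes.append(-possible_prime)
    return primes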
4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """ 8 | Disassemble the following python into a function called foo: 9 | 10 | 5 0 BUILD_LIST 0 11 | 2 STORE_FAST 1 (primes) 12 | 13 | 6 4 LOAD_GLOBAL 0 (range) 14 | 6 LOAD_CONST 1 (2) 15 | 8 LOAD_FAST 0 (max_number) 16 | 10 LOAD_CONST 2 (1) 17 | 12 BINARY_ADD 18 | 14 CALL_FUNCTION 2 19 | 16 GET_ITER 20 | >> 18 FOR_ITER 94 (to 114) 21 | 20 STORE_FAST 2 (possible_prime) 22 | 23 | 7 22 LOAD_CONST 3 (True) 24 | 24 STORE_FAST 3 (is_prime) 25 | 26 | 9 26 LOAD_GLOBAL 0 (range) 27 | 28 LOAD_CONST 1 (2) 28 | 30 LOAD_GLOBAL 1 (int) 29 | 32 LOAD_FAST 2 (possible_prime) 30 | 34 LOAD_CONST 4 (0.5) 31 | 36 BINARY_POWER 32 | 38 CALL_FUNCTION 1 33 | 40 LOAD_CONST 2 (1) 34 | 42 BINARY_ADD 35 | 44 CALL_FUNCTION 2 36 | 46 GET_ITER 37 | >> 48 FOR_ITER 24 (to 74) 38 | 50 STORE_FAST 4 (num) 39 | 40 | 10 52 LOAD_FAST 2 (possible_prime) 41 | 54 LOAD_FAST 4 (num) 42 | 56 BINARY_MODULO 43 | 58 LOAD_CONST 5 (0) 44 | 60 COMPARE_OP 2 (==) 45 | 62 POP_JUMP_IF_FALSE 48 46 | 47 | 11 64 LOAD_CONST 6 (False) 48 | 66 STORE_FAST 3 (is_prime) 49 | 50 | 12 68 POP_TOP 51 | 70 JUMP_ABSOLUTE 74 52 | 72 JUMP_ABSOLUTE 48 53 | 54 | 13 >> 74 LOAD_FAST 3 (is_prime) 55 | 76 POP_JUMP_IF_FALSE 18 56 | 57 | 14 78 LOAD_FAST 1 (primes) 58 | 80 LOAD_METHOD 2 (append) 59 | 82 LOAD_FAST 2 (possible_prime) 60 | 84 CALL_METHOD 1 61 | 86 POP_TOP 62 | 63 | 16 88 LOAD_FAST 2 (possible_prime) 64 | 90 LOAD_CONST 1 (2) 65 | 92 BINARY_MODULO 66 | 94 LOAD_CONST 5 (0) 67 | 96 COMPARE_OP 3 (!=) 68 | 98 POP_JUMP_IF_FALSE 18 69 | 70 | 17 100 LOAD_FAST 1 (primes) 71 | 102 LOAD_METHOD 2 (append) 72 | 104 LOAD_FAST 2 (possible_prime) 73 | 106 UNARY_NEGATIVE 74 | 108 CALL_METHOD 1 75 | 110 POP_TOP 76 | 112 JUMP_ABSOLUTE 18 77 | 78 | 18 >> 114 LOAD_FAST 1 (primes) 79 | 116 RETURN_VALUE 80 | 81 | """ 82 | 83 | test_case, answer = make_python_test([("foo(20)", "[2, 3, -3, 5, -5, 7, -7, 11, -11, 13, -13, 17, -17, 19, -19]")]) 84 | 85 | 86 | TestDisasPrimes = question >> LLMRun() >> ExtractCode(keep_main=False) >> PythonRun(test_case) >> SubstringEvaluator(answer) 87 | 88 | if __name__ == "__main__": 89 | print(run_test(TestDisasPrimes)) 90 | -------------------------------------------------------------------------------- /tests/explore_sql_db.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can interact with an SQLite database and provide the correct command to add a new person with specific criteria." 4 | 5 | TAGS = ['sql', 'agent'] 6 | 7 | question = """ 8 | I've just run `sqlite3 people.db` and you are now in the interpreter. I want to add another person named Victor. They should be as old as the youngest person. And have the most popular profession. What command should I run? 9 | 10 | On each round you should just give me just one command to type, I'll run it exactly as-is, and give you the exact output. I will not make any changes. So do not write any placholder text. I'll get started: 11 | 12 | INPUT: `.version` 13 | 14 | OUTPUT: ```SQLite 3.39.5 2022-10-14 20:58:05``` 15 | 16 | INPUT: `.database` 17 | 18 | OUTPUT: ```main: people.db r/w``` 19 | 20 | Now your turn. What do you want to run? Do not explain yourself, just give me exactly one command, ending with ; if necessary. Run small queries one at a time; think step by step. 
Start your response with INPUT: 21 | """ 22 | 23 | def setup(): 24 | import sqlite3 25 | 26 | # Create a connection object using the connect function 27 | conn = sqlite3.connect('people.db') 28 | 29 | # Create a cursor object using the cursor method 30 | cursor = conn.cursor() 31 | 32 | # Create table named people with columns for name, favorite friend, age, and profession 33 | cursor.execute('''CREATE TABLE people 34 | (a_name TEXT, favorite_friend TEXT, person_age INT, the_profession TEXT)''') 35 | 36 | # List of people data to insert into the people table 37 | people_data = [ 38 | ('Alice', 'Bob', 29, 'Engineer'), 39 | ('Bob', 'Alice', 32, 'Doctor'), 40 | ('Charlie', 'Alice', 28, 'Artist'), 41 | ('David', 'Eve', 35, 'Doctor'), 42 | ('Eve', 'Frank', 26, 'Teacher'), 43 | ('Frank', 'Alice', 31, 'Scientist'), 44 | ('Grace', 'Heidi', 30, 'Nurse'), 45 | ('Heidi', 'Ivy', 25, 'Lawyer'), 46 | ('Ivy', 'Charlie', 34, 'Chef'), 47 | ('Judy', 'Grace', 27, 'Accountant') 48 | ] 49 | 50 | # Insert each person into the people table 51 | cursor.executemany('INSERT INTO people VALUES (?,?,?,?)', people_data) 52 | 53 | # Commit the changes 54 | conn.commit() 55 | 56 | # Close the connection 57 | conn.close() 58 | 59 | 60 | def test_ok(): 61 | import sqlite3 62 | 63 | # Create a connection object using the connect function 64 | conn = sqlite3.connect('people.db') 65 | 66 | # Create a cursor object using the cursor method 67 | cursor = conn.cursor() 68 | 69 | # Get Victor's age and profession 70 | cursor.execute('SELECT person_age, the_profession FROM people WHERE a_name = "Victor"') 71 | out = cursor.fetchone() 72 | if out is None: 73 | return False 74 | age, profession = out 75 | 76 | return age == 25 and profession == 'Doctor' 77 | 78 | 79 | 80 | def do_extract(x): 81 | if 'INPUT:' in x: 82 | x = x.split("INPUT:")[1] 83 | try: 84 | to_send = x.split("`")[1] 85 | except: 86 | to_send = x.split("\n")[0] 87 | return to_send 88 | 89 | 90 | def do_prepare(x): 91 | print("Preparing to pass back", x) 92 | x = x.replace("\nsqlite>","") 93 | return f"I get an OUTPUT: ```{x}```.\n\nWhat is the exact command I should run next? Start your response with INPUT:" 94 | 95 | 96 | TestSqlExplore = Setup(setup) >> StartDockerJob("sqlite3 people.db", eos_string="sqlite>") >> question >> UntilDone(PyEvaluator(test_ok), (LLMConversation() >> PyFunc(do_extract) >> SendStdoutReceiveStdin() >> PyFunc(do_prepare)), max_iters=10) >> PyEvaluator(test_ok) 97 | 98 | if __name__ == "__main__": 99 | print(run_test(TestSqlExplore)) 100 | -------------------------------------------------------------------------------- /tests/basic_code_understanding.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can solve a simple capture-the-flag like entry in C." 
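# For reference only -- not used by the test. The intended input sequence for the
# challenge below: `data_read` prints the flag whenever `strtol` on the requested
# entry evaluates to 0, but `main` only calls `data_read` after at least one record
# has been written, so a solver has to write something first:
#
#   1       (choose "Write")
#   hello   (any data)
#   5       (any non-zero length)
#   2       (choose "Read")
#   0       (strtol("0") == 0 trips the `if (!entry_number)` branch and puts(flag))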
4 | 5 | TAGS = ['c', "explain"] 6 | 7 | INPUT = '''#include <stdio.h> 8 | #include <stdlib.h> 9 | #include <string.h> 10 | 11 | #define MAX_INPUTS 10 12 | #define DATA_LENGTH 100 13 | #define ENTRY_LENGTH 5 14 | #define COMMAND_LENGTH 4 15 | 16 | static const char* flag = "SECRET"; 17 | static char data[MAX_INPUTS][DATA_LENGTH]; 18 | static int input_lengths[MAX_INPUTS]; 19 | static int inputs = 0; 20 | 21 | void data_write() { 22 | char input[DATA_LENGTH], len[ENTRY_LENGTH]; 23 | long length; 24 | 25 | printf("Enter data for storage:\n"); // Updated message 26 | fgets(input, DATA_LENGTH, stdin); 27 | input[strcspn(input, "\n")] = 0; 28 | 29 | do { 30 | printf("Input the data size:\n"); // Updated message 31 | fgets(len, ENTRY_LENGTH, stdin); 32 | length = strtol(len, NULL, 10); 33 | if (!length) puts("Invalid data length, try again."); // Updated message 34 | } while (!length); 35 | 36 | if (inputs >= MAX_INPUTS) inputs = 0; 37 | strcpy(data[inputs], input); 38 | input_lengths[inputs++] = length; 39 | printf("Data stored at position: %d\n", inputs); // Updated message 40 | } 41 | 42 | void data_read() { 43 | char entry[ENTRY_LENGTH], output[DATA_LENGTH] = {'\0'}; 44 | printf("Which data position to read?\n"); // Updated message 45 | fgets(entry, ENTRY_LENGTH, stdin); 46 | entry[strcspn(entry, "\n")] = 0; 47 | 48 | long entry_number = strtol(entry, NULL, 10); 49 | if (!entry_number) { 50 | puts(flag); 51 | fseek(stdin, 0, SEEK_END); 52 | exit(0); 53 | } 54 | 55 | strncpy(output, data[--entry_number], input_lengths[entry_number]); 56 | puts(output); 57 | } 58 | 59 | int main() { 60 | char input[COMMAND_LENGTH]; 61 | long command; 62 | puts("Welcome to the Data Echo System!"); // Updated message 63 | 64 | while (1) { 65 | printf("Choose option (1-Write, 2-Read, 3-Exit):\n"); // Updated message 66 | fgets(input, COMMAND_LENGTH, stdin); 67 | input[strcspn(input, "\n")] = 0; 68 | 69 | command = strtol(input, NULL, 10); 70 | if (!command) { 71 | puts("Invalid command, enter 1, 2 or 3."); // Updated message 72 | } else if (command == 1) { 73 | data_write(); 74 | puts("Data written. Perform another action?"); // Updated message 75 | } else if (command == 2) { 76 | if (!inputs) { 77 | puts("Database is empty."); // Updated message 78 | continue; 79 | } 80 | data_read(); 81 | puts("Data read. Perform another action?"); // Updated message 82 | } else if (command == 3) { 83 | puts("Exiting. Thank you!"); // Updated message 84 | break; 85 | } else { 86 | puts("Please enter a valid command: 1, 2, or 3."); // Updated message 87 | } 88 | } 89 | return 0; 90 | } 91 | ''' 92 | 93 | 94 | 95 | question = f"Give me a sequence of the inputs that can make the code output the value of the flag. Here is the code: \n {INPUT}" 96 | 97 | evaluation = """ 98 | Below is a student's answer to a question about functionality of the code in C: <A> 99 | 100 | Does the student's final answer say that it should first select option 1 and input some data with any length and then select option 2 to read the data and input 0? 101 | 102 | Ignore their reasoning. Just look at the final answer, if they say **EXPLICITLY** that it should first input some data by selecting option 1 and then select option 2 to read and input 0. Giving something that's equal but more complicated is wrong. 103 | 104 | If the answer is correct say "The student passes" otherwise "The student fails".
105 | """ 106 | 107 | 108 | 109 | TestCodeUnderstanding = question >> LLMRun() >> LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("student passes") 110 | 111 | 112 | if __name__ == "__main__": 113 | print(run_test(TestCodeUnderstanding)) 114 | -------------------------------------------------------------------------------- /tests/find_bug_in_paper.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if a model can find math errors in the latex source of a paper." 4 | 5 | TAGS = ['explain'] 6 | 7 | question = r""" 8 | # Setup 9 | 10 | Which of the following equations are incorrect according to the specification? 11 | 12 | # Notation 13 | 14 | A neural network is a function $F(x) = y$ that accepts an input $x \in \mathbb{R}^n$ 15 | and produces an output $y \in \mathbb{R}^m$. 16 | The model $F$ also implicitly depends on some model parameters $\theta$; in our work 17 | the model is fixed, so for convenience we don't show the dependence on $\theta$. 18 | 19 | In this paper we focus on neural networks used as an $m$-class classifier. 20 | The output of the network is computed using the softmax function, 21 | which ensures that the output vector $y$ satisfies 22 | $0 \le y_i \le 1$ and $y_1 + \dots + y_m = 1$. 23 | The output vector $y$ is thus treated as a probability distribution, i.e., 24 | $y_i$ is treated as the probability that input $x$ has class $i$. 25 | The classifier assigns the label $C(x) = \arg\max_i F(x)_i$ to the input $x$. 26 | Let $C^*(x)$ be the correct label of $x$. 27 | The inputs to the softmax function are called \emph{logits}. 28 | 29 | We use the notation from Papernot et al. \cite{distillation}: define $F$ to 30 | be the full neural network including the softmax function, $Z(x) = z$ to be the output of 31 | all layers except the softmax (so $z$ are the logits), and 32 | \begin{equation*} 33 | F(x) = \softmax(Z(x)) = y. 34 | \end{equation*} 35 | A neural network typically \footnote{Most simple networks have this simple 36 | linear structure, however other more sophisticated networks have 37 | more complicated structures (e.g., ResNet \cite{he2016deep} and Inception \cite{szegedy2015rethinking}). 38 | The network architecture does not impact our attacks.} 39 | consists of layers 40 | \begin{equation*} 41 | F = \softmax \circ F_n \circ F_{n-1} \circ \cdots \circ F_1 42 | \end{equation*} 43 | where 44 | \begin{equation*} 45 | F_i(x) = \sigma(\theta_i \cdot x) + \hat\theta_i 46 | \end{equation*} 47 | for some non-linear activation function $\sigma$, some matrix $\theta_i$ of model 48 | weights, and some vector $\hat\theta_i$ of model biases. Together $\theta$ and 49 | $\hat\theta$ make up the model parameters. 50 | Common choices of $\sigma$ 51 | are tanh \cite{mishkin2015all}, sigmoid, ReLU \cite{maas2013rectifier}, or ELU \cite{clevert2015fast}. 52 | In this paper we focus primarily on networks that use a ReLU activation function, 53 | as it currently is the most widely used 54 | activation function 55 | \cite{szegedy2015rethinking,springenberg2014striving,mishkin2015all,distillation}. 56 | 57 | 58 | # Specification: 59 | 60 | We define an objective function $f$ such that 61 | $C(x+\delta) = t$ if and only if $f(x+\delta) \le 0$. 
62 | There are many possible choices for $f$: 63 | 64 | # The equations 65 | 66 | \begin{align*} 67 | f_{1}(x') &= -\loss_{F,t}(x') + 1\\ 68 | f_{2}(x') &= (\max_{i \ne t}( F(x')_i) - F(x')_t)^+\\ 69 | f_{3}(x') &= \text{softplus}(\max_{i \ne t}(F(x')_i) - F(x')_t)-\log(2)\\ 70 | f_{4}(x') &= (0.5 - F(x')_t)^+\\ 71 | f_{5}(x') &= -\log(2 F(x')_t - 2)\\ 72 | f_{6}(x') &= (\max_{i \ne t}( Z(x')_i) - Z(x')_t)^+\\ 73 | f_{7}(x') &= \text{softplus}(\max_{i \ne t}(Z(x')_i) - Z(x')_t)-\log(2) 74 | \end{align*} 75 | where $s$ is the correct classification, $(e)^+$ is short-hand for 76 | $\max(e,0)$, $\text{softplus}(x) = \log(1+\exp(x))$, and 77 | $\loss_{F,s}(x)$ is the cross entropy loss for $x$. 78 | 79 | # Your task 80 | 81 | Consider each equation one by one. End your answer with a python list of numbers [1,2,3,4,5,6,7] for those that are wrong. 82 | """ 83 | 84 | question_easier = question + "\nSpecifically, make sure that $C(x+\delta) = t$ if and only if $f(x+\delta) \le 0$. Think step by step about if this is true for each equation and then give your answer as a python list" 85 | 86 | def check(x): 87 | ints = list(map(int,x.split("[")[-1].split("]")[0].split(","))) 88 | return ints == [1,3,4,5,7], "" 89 | 90 | TestFindBugPaper = question >> LLMRun() >> Echo() >> PyFunc(check) 91 | 92 | TestFindBugPaperEasy = question_easier >> LLMRun() >> Echo() >> PyFunc(check) 93 | 94 | 95 | if __name__ == "__main__": 96 | print(run_test(TestFindBugPaper)) 97 | -------------------------------------------------------------------------------- /tests/git_merge.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can guide a user through a series of git commands to merge a specific branch into the main branch." 4 | 5 | TAGS = ['bash', "git", "agent"] 6 | 7 | question = """ 8 | You are in a git repository. I have two branches, main and some other one I forgot the name of. Merge the other branch into main. 9 | 10 | On each round you should just give me just one command to type, I'll run it exactly as-is, and give you the exact output. I will not make any changes. So do not write any placholder text. I'll get started: 11 | 12 | INPUT: `pwd` 13 | 14 | OUTPUT: `/usr/src/app` 15 | 16 | INPUT: `ls` 17 | 18 | OUTPUT: `main.py a b` 19 | 20 | Now your turn. What do you want to run? Do not explain yourself, just give me exactly one command. 
Start your response with INPUT:""" 21 | 22 | 23 | def test_ok_merge(): 24 | try: 25 | math_content = open("math.py").read().strip() 26 | readme_content = open("README.md").read().strip() 27 | mul_content = open("mul.py").read().strip() 28 | 29 | return ( 30 | math_content == "x + y\nx - y" 31 | and readme_content == "A repo about something\nYou run it like this" 32 | and mul_content == "x * y" 33 | ) 34 | except: 35 | return False 36 | 37 | 38 | def test_merge_conflict_resolved(): 39 | import os 40 | try: 41 | return open("sum.py").read().strip() == "x + y" and open("b").read().strip() == "b" 42 | except: 43 | return False 44 | 45 | 46 | def setup_merge(): 47 | import os 48 | print(os.popen(""" 49 | git config --global user.name "Your Name" 50 | git config --global user.email "you@example.com" 51 | git config --global init.defaultBranch main 52 | echo "x + y" > math.py 53 | echo "A repo about something" > README.md 54 | git init 55 | git add math.py README.md 56 | git commit -a -m "Initial commit" 57 | git branch other 58 | git checkout other 59 | echo "x + y\nx - y" > math.py 60 | git add math.py 61 | git commit -a -m "Subtraction" 62 | echo "A repo about something\nYou run it like this" > README.md 63 | git add README.md 64 | git commit -a -m "README" 65 | git log 66 | git checkout main 67 | echo "x * y" > mul.py 68 | git add mul.py 69 | git commit -a -m "Multiplication" 70 | git log 71 | """).read()) 72 | 73 | 74 | def setup_merge_conflict(): 75 | import os 76 | print(os.popen(""" 77 | git config --global user.name "Your Name" 78 | git config --global user.email "you@example.com" 79 | git config --global init.defaultBranch main 80 | echo "a" > sum.py 81 | echo "b" > b 82 | git init 83 | git add sum.py b 84 | git commit -a -m "Initial commit" 85 | git branch other 86 | git checkout other 87 | echo "x + y" > sum.py 88 | git add sum.py 89 | git commit -a -m "addition" 90 | git log 91 | git checkout main 92 | echo "x - y" > sum.py 93 | git add sum.py 94 | git commit -a -m "subtraction" 95 | git log 96 | """).read()) 97 | 98 | 99 | def do_extract(x): 100 | if 'INPUT:' in x: 101 | x = x.split("INPUT:")[1] 102 | try: 103 | to_send = x.split("`")[1] 104 | except: 105 | to_send = x.split("\n")[0] 106 | return to_send 107 | 108 | 109 | def do_prepare(x): 110 | print("Preparing to pass back", x) 111 | return f"I get an OUTPUT: `{x}`.\n\nWhat is the exact command I should run next? Start your response with INPUT:" 112 | 113 | 114 | TestGitMerge = Setup(setup_merge) >> Echo() >> question >> UntilDone(PyEvaluator(test_ok_merge), (LLMConversation( 115 | ) >> PyFunc(do_extract) >> TerminalRun() >> PyFunc(do_prepare)), max_iters=6) >> PyEvaluator(test_ok_merge) 116 | 117 | TestGitMergeConflict = Setup(setup_merge_conflict) >> Echo() >> question >> UntilDone(PyEvaluator(test_merge_conflict_resolved), (LLMConversation( 118 | ) >> PyFunc(do_extract) >> TerminalRun() >> PyFunc(do_prepare)), max_iters=10) >> PyEvaluator(test_merge_conflict_resolved) 119 | 120 | if __name__ == "__main__": 121 | print(run_test(TestGitMerge)) 122 | print(run_test(TestGitMergeConflict)) 123 | -------------------------------------------------------------------------------- /tests/implement_assembly_interpreter.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can implement an interpreter for a new assembly language from a text description." 
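# For reference (not executed): a correct `evaluate` needs to build a label table,
# keep 8 registers, 100 memory cells, a boolean flag and an instruction pointer, and
# step instructions until HCF or the end of the program. For the example "squares"
# program in the prompt it should leave memory cells 0..19 holding [1, 4, 9, ..., 400];
# the test cases at the bottom of this file only compare the first 10 cells.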
4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = """Here is the description of a new assembly language: 8 | 9 | * 8 registers (R1, R2, R3, R4, R5, R6, R7, R8) that can hold integers. 10 | * 1 flag that can hold a boolean value (True or False). 11 | * 100 memory addresses (0-99) that can hold integers. 12 | * 1 instruction pointer that points to the current instruction being executed. 13 | 14 | Each instruction is of the form 15 | OP ARG1 ARG2 ... 16 | where ARGn can be either a register (e.g., R1) or a constant (e.g., 10). 17 | 18 | Labels are written with a lowercase word followed by colon. 19 | 20 | The assembly language supports the following instructions: 21 | * SET Rx C: Assigns the value C to register Rx. 22 | * ADD Rx Ry Rz: Adds the values of Ry and Rz and stores the result in Rx. 23 | * (similarly for SUB, MUL, DIV, MOD) 24 | * EQ Rx Ry: Sets the flag to True if Rx and Ry are equal, False otherwise. 25 | * (similarly for NEQ, LT (Rx < Ry), LTE, GT, GTE) 26 | * INC/DEC Rx: Increments/Decrements the value of Rx by one. 27 | * JMP L: Jumps to label L unconditionally. 28 | * JT/JF (jump if true / jump if false) L: Jumps to label L if the flag is set or not set. 29 | * LOAD Rx M: Loads the value at memory address M into register Rx. 30 | * STORE Rx M: Stores the value of register Rx into memory address M. 31 | * HCF: Stops the program (with pizzazz) 32 | 33 | For example here is a program to compute the first 20 square numbers (1, 4, 9, 16, 25, ...): 34 | 35 | SET R1 0 // Counter for storing squares 36 | SET R2 1 // Number to square 37 | 38 | loop: 39 | MUL R3 R2 R2 // R3 = R2 * R2 40 | STORE R3 R1 // Store R3 at address R1 41 | INC R1 // Increment address 42 | INC R2 // Increment number 43 | SET R3 20 44 | EQ R1 R3 // Check if 20 squares are found 45 | JF loop // If not 20 squares found yet, continue finding 46 | 47 | end: 48 | HCF // Stop program 49 | 50 | 51 | Write me a python interpreter `evaluate(str)` that returns the resulting memory state after running the program. For example, `evaluate(program)` should return `[1, 4, 9, 16, 25, ...]` for the above program.
52 | """ 53 | 54 | primes = """ 55 | 56 | SET R1 2 // Starting number to check for prime 57 | 58 | start_find_primes: 59 | 60 | JMP is_prime // Control will return after executing is_prime with R1 as input and R2 containing the result 61 | 62 | ready_prime: 63 | SET R7 1 64 | EQ R2 R7 // Check if R2 is 1 (prime) 65 | JF increment // If not prime, skip storing and increment the number 66 | 67 | // Store prime number in memory and increment count 68 | STORE R1 R8 // Store prime number at address pointed by R8 69 | INC R8 // Increment prime count 70 | 71 | // Check if 100 primes are found 72 | SET R7 100 73 | EQ R8 R7 74 | JF increment // If not 100 primes found yet, continue finding 75 | 76 | JMP end // If 100 primes found, end program 77 | 78 | increment: 79 | INC R1 // Increment number to check for prime 80 | JMP start_find_primes // Check next number 81 | 82 | is_prime: 83 | SET R2 1 // Assume number is prime initially 84 | SET R3 2 // Start divisor from 2 85 | 86 | start_loop: // Label to start the loop 87 | // Check if we have exceeded the square root of R1 88 | MUL R4 R3 R3 // R4 = R3 * R3 89 | GT R4 R1 // Set flag if R4 > R1 90 | JT is_prime_end // If not exceeded, continue; else, end loop 91 | 92 | MOD R6 R1 R3 // R6 = R1 % R3 93 | SET R7 0 94 | EQ R7 R6 // Check if R6 is 0 95 | JT not_prime // If yes, number is not prime 96 | 97 | INC R3 // Increment divisor 98 | JMP start_loop // Repeat loop 99 | 100 | not_prime: 101 | SET R2 0 // Set result to 0 (not prime) 102 | 103 | is_prime_end: 104 | JMP ready_prime 105 | 106 | end: 107 | """ 108 | 109 | code = """ 110 | SET R1 0 111 | SET R2 1 112 | loop: 113 | MUL R3 R2 R2 114 | STORE R3 R1 115 | INC R1 116 | INC R2 117 | SET R3 20 118 | EQ R1 R3 119 | JF loop 120 | """ 121 | 122 | test_case, answer = make_python_test([(f'evaluate("""{code}""")[:10]', "[1, 4, 9, 16, 25, 36, 49, 64, 81, 100]"), 123 | (f'evaluate("""{primes}""")[:10]', "[2, 3, 5, 7, 11, 13, 17, 19, 23, 29]") 124 | ]) 125 | 126 | TestImplementAssembly = question >> LLMRun() >> ExtractCode(lang="python") >> PythonRun(test_case) >> SubstringEvaluator(answer) 127 | 128 | 129 | if __name__ == "__main__": 130 | print(run_test(TestImplementAssembly)) 131 | 132 | 133 | -------------------------------------------------------------------------------- /tests/data_table_processing.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can process a large table of text and identify rows with specific values." 4 | 5 | TAGS = ['data'] 6 | 7 | question = ''' 8 | Here is a table about information of states and territory. 
9 | 10 | State / territory Total (mi2) Total (km2) Land (mi2) Land (km2) Water (mi2) Water (km2) % 11 | water 12 | Alaska 665,384 1,723,337 570,641 1,477,953 94,743 245,383 14.2% 13 | Texas 268,596 695,662 261,232 676,587 7,365 19,075 2.7% 14 | California 163,695 423,967 155,779 403,466 7,916 20,501 4.8% 15 | Montana 147,040 380,831 145,546 376,962 1,494 3,869 1.0% 16 | New Mexico 121,590 314,917 121,298 314,161 292 757 0.2% 17 | Arizona 113,990 295,234 113,594 294,207 396 1,026 0.3% 18 | Nevada 110,572 286,380 109,781 284,332 791 2,048 0.7% 19 | Colorado 104,094 269,601 103,642 268,431 452 1,170 0.4% 20 | Oregon 98,379 254,799 95,988 248,608 2,391 6,191 2.4% 21 | Wyoming 97,813 253,335 97,093 251,470 720 1,864 0.7% 22 | Michigan 96,714 250,487 56,539 146,435 40,175 104,052 41.5% 23 | Minnesota 86,936 225,163 79,627 206,232 7,309 18,930 8.4% 24 | Utah 84,897 219,882 82,170 212,818 2,727 7,064 3.2% 25 | Idaho 83,569 216,443 82,643 214,045 926 2,398 1.1% 26 | Kansas 82,278 213,100 81,759 211,754 520 1,346 0.6% 27 | Nebraska 77,348 200,330 76,824 198,974 524 1,356 0.7% 28 | South Dakota 77,116 199,729 75,811 196,350 1,305 3,379 1.7% 29 | Washington 71,298 184,661 66,456 172,119 4,842 12,542 6.8% 30 | North Dakota 70,698 183,108 69,001 178,711 1,698 4,397 2.4% 31 | Oklahoma 69,899 181,037 68,595 177,660 1,304 3,377 1.9% 32 | Missouri 69,707 180,540 68,742 178,040 965 2,501 1.4% 33 | Florida 65,758 170,312 53,625 138,887 12,133 31,424 18.5% 34 | Wisconsin 65,496 169,635 54,158 140,268 11,339 29,367 17.3% 35 | Georgia 59,425 153,910 57,513 148,959 1,912 4,951 3.2% 36 | Illinois 57,914 149,995 55,519 143,793 2,395 6,202 4.1% 37 | Iowa 56,273 145,746 55,857 144,669 416 1,077 0.7% 38 | New York 54,555 141,297 47,126 122,057 7,429 19,240 13.6% 39 | North Carolina 53,819 139,391 48,618 125,920 5,201 13,471 9.7% 40 | Arkansas 53,179 137,732 52,035 134,771 1,143 2,961 2.1% 41 | Alabama 52,420 135,767 50,645 131,171 1,775 4,597 3.4% 42 | Louisiana 52,378 135,659 43,204 111,898 9,174 23,761 17.5% 43 | Mississippi 48,432 125,438 46,923 121,531 1,509 3,907 3.1% 44 | Pennsylvania 46,054 119,280 44,743 115,883 1,312 3,397 2.8% 45 | Ohio 44,826 116,098 40,861 105,829 3,965 10,269 8.8% 46 | Virginia 42,775 110,787 39,490 102,279 3,285 8,508 7.7% 47 | Tennessee 42,144 109,153 41,235 106,798 909 2,355 2.2% 48 | Kentucky 40,408 104,656 39,486 102,269 921 2,387 2.3% 49 | Indiana 36,420 94,326 35,826 92,789 593 1,537 1.6% 50 | Maine 35,380 91,633 30,843 79,883 4,537 11,750 12.8% 51 | South Carolina 32,020 82,933 30,061 77,857 1,960 5,076 6.1% 52 | West Virginia 24,230 62,756 24,038 62,259 192 497 0.8% 53 | Maryland 12,406 32,131 9,707 25,142 2,699 6,990 21.8% 54 | Hawaii 10,932 28,313 6,423 16,635 4,509 11,678 41.2% 55 | Massachusetts 10,554 27,336 7,800 20,202 2,754 7,134 26.1% 56 | Vermont 9,616 24,906 9,217 23,871 400 1,035 4.2% 57 | New Hampshire 9,349 24,214 8,953 23,187 397 1,027 4.2% 58 | New Jersey 8,723 22,591 7,354 19,047 1,368 3,544 15.7% 59 | Connecticut 5,543 14,357 4,842 12,542 701 1,816 12.6% 60 | Puerto Rico 5,325 13,791 3,424 8,868 1,901 4,924 35.7% 61 | Delaware 2,489 6,446 1,949 5,047 540 1,399 21.7% 62 | Northern Mariana Islands 1,976 5,117 182 472 1,793 4,644 90.7% 63 | Rhode Island 1,545 4,001 1,034 2,678 511 1,324 33.1% 64 | U.S. 
Virgin Islands 733 1,898 134 348 599 1,550 81.7% 65 | American Samoa 581 1,505 76 198 505 1,307 86.9% 66 | Guam 571 1,478 210 543 361 935 63.2% 67 | District of Columbia 68 177 61 158 7 19 10.3% 68 | Minor Outlying Islands[3][a] 16 41 16 41 0 0 0.0% 69 | Contiguous US 3,120,428 8,081,869 2,954,843 7,653,006 165,589 428,865 5.3% 70 | 50 States 3,796,676 9,833,342 3,531,846 9,147,436 264,834 685,907 7.0% 71 | 50 States and DC 3,796,744 9,833,519 3,531,907 9,147,594 264,841 685,926 7.0% 72 | United States 3,805,927 9,857,306 3,535,932 9,158,022 269,995 699,284 7.1% 73 | 74 | List for me each of the states that have more than 20,000 square kilometers of water, from lowest to highest. Don't list any other states. 75 | 76 | ''' 77 | 78 | stepbystep = """To answer this question follow these steps in order: 79 | 1. List just the amount of water in each state. 80 | 2. Filter those to ones with over 20k square kilometers of water. 81 | 3. Sort them from lowest to highest. 82 | 4. Say "The final answer is" and list the states in that order. 83 | 84 | """ 85 | 86 | evaluation = "This is a student answer about which states have the most water: <A>\n\n\nThe correct answer is: California, Louisiana, Wisconsin, Florida, Michigan, and Alaska (in that order).\n\nDoes the student answer exactly these states in this order? Think out loud about their answer. Then, if the student got the states in this order, answer 'The student passes' otherwise answer 'The student fails'.\n\n" 87 | 88 | 89 | TestStateTable = question >> LLMRun() >> ((LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("student passes")) & SubstringEvaluator("California") & SubstringEvaluator("Louisiana") & SubstringEvaluator("Wisconsin") & SubstringEvaluator("Michigan")) 90 | TestStateTableStepbystep = (question + stepbystep) >> LLMRun() >> ((LLMRun(evaluation, llm=EVAL_LLM) >> SubstringEvaluator("student passes")) & SubstringEvaluator("California") & SubstringEvaluator("Louisiana") & SubstringEvaluator("Wisconsin") & SubstringEvaluator("Michigan")) 91 | 92 | 93 | if __name__ == "__main__": 94 | print(run_test(TestStateTableStepbystep)) 95 | 96 | -------------------------------------------------------------------------------- /llm.py: -------------------------------------------------------------------------------- 1 | ## Copyright (C) 2024, Nicholas Carlini <nicholas@carlini.com>. 2 | ## 3 | ## This program is free software: you can redistribute it and/or modify 4 | ## it under the terms of the GNU General Public License as published by 5 | ## the Free Software Foundation, either version 3 of the License, or 6 | ## (at your option) any later version. 7 | ## 8 | ## This program is distributed in the hope that it will be useful, 9 | ## but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | ## GNU General Public License for more details. 12 | ## 13 | ## You should have received a copy of the GNU General Public License 14 | ## along with this program. If not, see <https://www.gnu.org/licenses/>.
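# Example usage (an illustrative sketch; the model name and temperature below are
# arbitrary choices -- any name handled by the dispatch logic in LLM.__init__ works,
# provided the corresponding API credentials are configured):
#
#   from llm import LLM
#   model = LLM("gpt-4o", override_hparams={"temperature": 0.1})
#   print(model("Reply with exactly the word OK"))
#
# Responses are cached on disk in tmp/cache-<model>.p keyed on the conversation, so
# repeating an identical request is answered from the cache instead of the API.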
15 | 16 | from io import BytesIO 17 | import os 18 | import base64 19 | import requests 20 | import json 21 | import pickle 22 | import time 23 | 24 | from llms.openai_model import OpenAIModel 25 | from llms.anthropic_model import AnthropicModel 26 | from llms.mistral_model import MistralModel 27 | from llms.openrouter_model import OpenRouterModel 28 | from llms.vertexai_model import VertexAIModel 29 | from llms.cohere_model import CohereModel 30 | from llms.moonshot_model import MoonshotAIModel 31 | from llms.groq_model import GroqModel 32 | 33 | class LLM: 34 | def __init__(self, name="gpt-3.5-turbo", use_cache=True, override_hparams={}): 35 | self.name = name 36 | if name.startswith("openrouter"): 37 | self.model = OpenRouterModel(name) 38 | elif 'gpt' in name or name.startswith('o1'): 39 | self.model = OpenAIModel(name) 40 | # elif 'llama' in name: 41 | # self.model = LLAMAModel(name) 42 | elif 'mistral' in name: 43 | self.model = MistralModel(name) 44 | elif 'bison' in name or 'gemini' in name: 45 | self.model = VertexAIModel(name) 46 | #elif 'gemini' in name: 47 | # self.model = GeminiModel(name) 48 | elif 'claude' in name: 49 | self.model = AnthropicModel(name) 50 | elif 'moonshot' in name: 51 | self.model = MoonshotAIModel(name) 52 | elif 'command' in name: 53 | self.model = CohereModel(name) 54 | elif 'llama3' in name or 'mixtral' in name or 'gemma' in name or 'deepseek' in name: 55 | print("Using Groq model", name) 56 | self.model = GroqModel(name) 57 | else: 58 | raise 59 | self.model.hparams.update(override_hparams) 60 | 61 | self.use_cache = use_cache 62 | if use_cache: 63 | try: 64 | if not os.path.exists("tmp"): 65 | os.mkdir("tmp") 66 | self.cache = pickle.load(open(f"tmp/cache-{name.split('/')[-1]}.p","rb")) 67 | except: 68 | self.cache = {} 69 | else: 70 | self.cache = {} 71 | 72 | def __call__(self, conversation, add_image=None, max_tokens=None, skip_cache=False, json=False): 73 | if type(conversation) == str: 74 | conversation = [conversation] 75 | 76 | cache_key = tuple(conversation) if add_image is None else tuple(conversation + [add_image.tobytes()]) 77 | 78 | if cache_key in self.cache and not skip_cache and self.use_cache: 79 | 80 | print(self.name, "GETCACHE", repr(self.cache[cache_key])) 81 | if len(self.cache[cache_key]) > 0: 82 | return self.cache[cache_key] 83 | else: 84 | print("Empty cache hit") 85 | 86 | print(self.name, "CACHE MISS", repr(conversation)) 87 | 88 | 89 | import traceback 90 | from concurrent.futures import ThreadPoolExecutor, TimeoutError 91 | 92 | response = "Model API request failed" 93 | for _ in range(3): 94 | try: 95 | extra = {} 96 | if json: 97 | extra['json'] = json 98 | 99 | def request_with_timeout(): 100 | return self.model.make_request(conversation, add_image=add_image, max_tokens=max_tokens, **extra) 101 | 102 | with ThreadPoolExecutor() as executor: 103 | future = executor.submit(request_with_timeout) 104 | try: 105 | response = future.result(timeout=60*10) # 10 minutes 106 | break # If successful, break out of the retry loop 107 | except TimeoutError: 108 | print("Request timed out after 60 seconds") 109 | response = "Model API request failed due to timeout" 110 | # Continue to the next retry 111 | except Exception as e: 112 | print("RUN FAILED", e) 113 | traceback.print_exc() 114 | 115 | time.sleep(10) 116 | 117 | 118 | if self.use_cache and response != "Model API request failed": 119 | self.cache[cache_key] = response 120 | pickle.dump(self.cache, open(f"tmp/cache-{self.name.split('/')[-1]}.p","wb")) 121 | 122 | return 
response 123 | 124 | #llm = LLM("command") 125 | #llm = LLM("gpt-3.5-turbo") 126 | #llm = LLM("gpt-4-turbo-2024-04-09") 127 | #llm = LLM("gemini-1.5-pro-preview-0409") 128 | llm = LLM("o1-mini") 129 | 130 | #llm = LLM("claude-3-opus-20240229") 131 | #llm = LLM("claude-3-5-sonnet-20240620") 132 | 133 | #llm = LLM("mistral-tiny") 134 | #llm = LLM("gemini-pro", override_hparams={'temperature': 0.3}, use_cache=False) 135 | 136 | #eval_llm = LLM("gpt-4-1106-preview") 137 | eval_llm = LLM("gpt-4o", override_hparams={'temperature': 0.1}) 138 | #eval_llm = LLM("gpt-3.5-turbo", override_hparams={'temperature': 0.1}) 139 | 140 | vision_eval_llm = LLM("gpt-4o", override_hparams={'temperature': 0.1}) 141 | -------------------------------------------------------------------------------- /tests/program_in_new_assembly.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can write a program in a new assembly language. This ability to learn a new language on-the-fly is important for many tasks." 4 | 5 | TAGS = ['code'] 6 | 7 | class AssemblyEmulator: 8 | def __init__(self, instructions): 9 | self.registers = {"R1": 0, "R2": 0, "R3": 0, "R4": 0, "R5": 0, "R6": 0, "R7": 0, "R8": 0} 10 | self.memory = [0] * 100 11 | self.instruction_pointer = 0 12 | self.instructions = instructions.split("\n") 13 | self.flag = False 14 | print(instructions) 15 | 16 | def run(self): 17 | 18 | def lookup(register_or_const): 19 | if register_or_const.startswith('R'): 20 | return self.registers[register_or_const] 21 | else: 22 | return int(register_or_const) 23 | 24 | bin_op = { 25 | "ADD": lambda a, b: a + b, 26 | "SUB": lambda a, b: a - b, 27 | "MUL": lambda a, b: a * b, 28 | "DIV": lambda a, b: a // b, 29 | "MOD": lambda a, b: a % b, 30 | } 31 | cmp_op = { 32 | "EQ": lambda a, b: a == b, 33 | "NEQ": lambda a, b: a != b, 34 | "LT": lambda a, b: a < b, 35 | "LTE": lambda a, b: a <= b, 36 | "GT": lambda a, b: a > b, 37 | "GTE": lambda a, b: a >= b, 38 | } 39 | 40 | 41 | ctr = 0 42 | while self.instruction_pointer < len(self.instructions): 43 | ctr += 1 44 | if ctr > 1e6: 45 | raise Exception("Infinite loop detected") 46 | 47 | parts = self.instructions[self.instruction_pointer].split("//")[0].replace(",","").split() 48 | if len(parts) == 0: 49 | self.instruction_pointer += 1 50 | continue 51 | 52 | instruction, args = parts[0], parts[1:] 53 | 54 | if instruction == "SET": 55 | self.registers[args[0]] = lookup(args[1]) 56 | elif instruction in bin_op: 57 | self.registers[args[0]] = bin_op[instruction](lookup(args[1]), lookup(args[2])) 58 | elif instruction in cmp_op: 59 | self.flag = cmp_op[instruction](lookup(args[0]), lookup(args[1])) 60 | elif instruction == "INC": 61 | self.registers[args[0]] += 1 62 | elif instruction == "DEC": 63 | self.registers[args[0]] -= 1 64 | elif instruction == "JT" and self.flag: 65 | self.instruction_pointer = self.find_label(args[0]) 66 | continue 67 | elif instruction == "JF" and not self.flag: 68 | self.instruction_pointer = self.find_label(args[0]) 69 | continue 70 | elif instruction == "JMP": 71 | self.instruction_pointer = self.find_label(args[0]) 72 | continue 73 | elif instruction == "LOAD": 74 | self.memory[lookup(args[1])] = lookup(args[0]) 75 | elif instruction == "STORE": 76 | self.memory[lookup(args[1])] = lookup(args[0]) 77 | elif instruction == "HCF": 78 | return 79 | 80 | self.instruction_pointer += 1 81 | 82 | def find_label(self, label): 83 | return next(i for i, instruction in 
enumerate(self.instructions) if instruction.strip().startswith(label + ':')) 84 | 85 | 86 | question = """Here is the description of a new assembly language: 87 | 88 | * 8 registers (R1, R2, R3, R4, R5, R6, R7, R8) that can hold integers. 89 | * 1 flag that can hold a boolean value (True or False). 90 | * 100 memory addresses (0-99) that can hold integers. 91 | * 1 instruction pointer that points to the current instruction being executed. 92 | 93 | Each instruction is of the form 94 | OP ARG1 ARG2 ... 95 | where ARGn can be either a register (e.g., R1) or a constant (e.g., 10). 96 | 97 | Labels are written with a lowercase word followed by colon. 98 | 99 | The assembly language supports the following instructions: 100 | * SET Rx C: Assigns the value C to register Rx. 101 | * ADD Rx Ry Rz: Adds the values of Ry and Rz and stores the result in Rx. 102 | * (similarly for SUB, MUL, DIV, MOD) 103 | * EQ Rx Ry: Sets the flag to True if Rx and Ry are equal, False otherwise. 104 | * (similarly for NEQ, LT (Rx < Ry), LTE, GT, GTE) 105 | * INC/DEC Rx: Increments/Decrements the value of Rx by one. 106 | * JMP L: Jumps to label L unconditionally. 107 | * JT/JF (jump if true / jump if false) L: Jumps to label L if the flag is set or not set. 108 | * LOAD Rx M: Loads the value at memory address M into register Rx. 109 | * STORE Rx M: Stores the value of register Rx into memory address M. 110 | * HCF: Stops the program (with pizzazz) 111 | 112 | Given this assembly language, write a program that {}""" 113 | 114 | def expect(answer): 115 | def evaluate(code): 116 | try: 117 | emulator = AssemblyEmulator(code) 118 | emulator.run() 119 | except Exception as e: 120 | print("EXCEPTION", e) 121 | return False, str(e) 122 | 123 | return answer(emulator.memory), "" 124 | return evaluate 125 | 126 | TestNewAssemblySquareNumbers = question.format("puts the first 20 square (1, 4, 9...) numbers in memory cells 0..19") >> LLMRun() >> ExtractCode(lang="a new assembly language I just made up") >> PyFunc(expect(lambda m: m[:20] == [i*i for i in range(1,21)])) 127 | 128 | TestNewAssemblyPrimeNumbers = question.format("puts the first 20 prime numbers in memory cells 0..19") >> LLMRun() >> ExtractCode(lang="a new assembly language I just made up") >> PyFunc(expect(lambda m: m[:20] == [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71])) 129 | 130 | 131 | if __name__ == "__main__": 132 | print(run_test(TestNewAssemblySquareNumbers)) 133 | 134 | 135 | -------------------------------------------------------------------------------- /tests/extract_references.py: -------------------------------------------------------------------------------- 1 | from evaluator import * 2 | 3 | DESCRIPTION = "Test if the model can extract paper titles from a block of text." 4 | 5 | TAGS = ['code', 'python'] 6 | 7 | question = '''Extract a list of the titles of the papers from the following list of references. 8 | Start your response with 9 | 10 | ```json 11 | [title_1, title_2, ...] 12 | ``` 13 | 14 | Here's the block of text: 15 | 16 | A Suffix Arrays [45] SHOKRI, R., STRONATI, M., SONG, C., AND 17 | A suffix of length k of a string x are the last k characters (or, SHMATIKOV, V. Membership inference attacks against 18 | tokens) of this string, i.e,. x[−k:] machine learning models. In IEEE Symposium on 19 | . If we want to know: “was Security and Privacy (2017). 20 | 0 100 200 300 [46] SOLDAINI, L. AI2 Dolma: 3 trillion token open corpus 21 | length of k-gram for language model pretraining, 2023.
22 | 104 [47] SOMEPALLI, G., SINGLA, V., GOLDBLUM, M., GEIPING, J., AND GOLDSTEIN, T. Diffusion art or digital 23 | 105 forgery? Investigating data replication in diffusion models. In CVPR (2023). 24 | 106 [48] SOUTHWOOD, T. R. E., AND HENDERSON, P. A. Ecological methods. John Wiley & Sons, 2009. 25 | # generated kgrams [49] TOUVRON, H., LAVRIL, T., IZACARD, G., MARTINET, X., LACHAUX, M.-A., LACROIX, T., ROZIÈRE, B., GOYAL, 26 | in training data N., HAMBRO, E., AZHAR, F., RODRIGUEZ, A., JOULIN, A., GRAVE, E., AND LAMPLE, 27 | Figure 14: The suffix length threshold k significantly impacts G. LLaMA: Open and Efficient Foundation Language 28 | the rate of data determined to be memorized. We set k = 50. Models, 2023. 29 | x [50] TOUVRON, H., MARTIN, L., STONE, K., ALBERT, P., 30 | ′ ALMAHAIRI, A., BABAEI, Y., BASHLYKOV, N., BATRA, S., BHARGAVA, P., BHOSALE, S., ET AL. LLaMA 31 | [−k:] 2: Open foundation and fine-tuned chat models. arXiv 32 | in x”, then we would have to do an O(n) search checking preprint arXiv:2307.09288 (2023). 33 | all suffixes of x. This linear scan is expensive if x is large, [51] TTI. Introducing Falcon 180b. 34 | as it is in training large language models, often terabytes in [52] YEOM, S., GIACOMELLI, I., FREDRIKSON, M., AND 35 | size. Instead, a suffix array will enable us to do this search JHA, S. Privacy risk in machine learning: Analyzing 36 | efficiently in O(logn) time. the connection to overfitting. In IEEE CSF (2018). 37 | A suffix array s over a dataset X, denoted as s(X) is a [53] ZELTERMAN, D. Smooth nonparametric estimation of 38 | data structure that indexes all suffixes of this string in a the quantile function. Journal of statistical planning 39 | lexicographically-sorted ordering. This sorting, as we will and inference 26, 3 (1990), 339–352. 40 | see, is important as it enables efficient binary searches for a [54] ZHANG, S., ROLLER, S., GOYAL, N., ARTETXE, M., 41 | particular substring/suffix. CHEN, M., CHEN, S., DEWAN, C., DIAB, M., LI, X., 42 | In the simplest form, we can consider the suffix array of a LIN, X. V., MIHAYLOV, T., OTT, M., SHLEIFER, S., 43 | word, e.g., x =“banana”. The following is the set of all suffixes SHUSTER, K., SIMIG, D., KOURA, P. S., SRIDHAR, 44 | as obtained by traversing the string backwards and keeping only A., WANG, T., AND ZETTLEMOYER, L. Opt: Open 45 | unique suffixes, in this case, all suffixes: {“a”, “na”, pre-trained transformer language models, 2022. 46 | “ana”, “nana”, “ anana”, “banana”}, which are represented by [55] ZIEGLER, A. Github Copilot research recitation, 2021. 47 | the indices s = {5,4,3,2,1,0}. In this form, we still require [56] ZOU, A., WANG, Z., KOLTER, J. Z., AND FREDRIKSON, M. Universal and transferable adversarial 48 | an O(n) search as there is no ordering. However, a suffix array attacks on aligned language models. arXiv preprint 49 | will store these suffixes in a lexicographically sorted ordering. arXiv:2307.15043 (2023). 50 | ''' 51 | 52 | 53 | answer = set([ 54 | "membership inference attacks against machine learning models", 55 | "ai2 dolma: 3 trillion token open corpus for language model pretraining", 56 | "diffusion art or digital forgery? 
investigating data replication in diffusion models", 57 | "ecological methods", 58 | "llama: open and efficient foundation language models", 59 | "llama 2: open foundation and fine-tuned chat models", 60 | "introducing falcon 180b", 61 | "privacy risk in machine learning: analyzing the connection to overfitting", 62 | "smooth nonparametric estimation of the quantile function", 63 | "opt: open pre-trained transformer language models", 64 | "github copilot research recitation", 65 | "universal and transferable adversarial attacks on aligned language models", 66 | ]) 67 | 68 | def check_ok(dat): 69 | import json 70 | dat = dat.replace("```json", "```") 71 | dat = dat.split("```")[1] 72 | dat = dat.lower().replace(".","") 73 | return set(json.loads(dat)) == answer 74 | 75 | 76 | TestExtractRef = question >> LLMRun() >> PyFunc(check_ok) 77 | 78 | 79 | if __name__ == "__main__": 80 | print(run_test(TestExtractRef)) 81 | 82 | 83 | 84 | 85 | --------------------------------------------------------------------------------