├── README.md ├── data ├── benchmark_questions.jsonl ├── final_model_responses │ ├── claude-3-5-sonnet-20241022_responses.jsonl │ ├── gemini-1.5-pro-002_responses.jsonl │ ├── gpt-4o-2024-08-06_responses.jsonl │ ├── llama-3-2-3b-instruct.jsonl │ ├── llama-3-3-70b-instruct.jsonl │ ├── llama3-1-405b-instruct-v1_responses.jsonl │ ├── mistral-large-latest_responses.jsonl │ ├── mixtral-8x7b-instruct.jsonl │ ├── o1-preview_responses.jsonl │ ├── qwen2-5_14b.jsonl │ ├── qwen2-5_72b.jsonl │ └── qwen2_72b.jsonl └── response_template.jsonl ├── main.py ├── requirements.txt └── src ├── conversation.py ├── data_loader.py ├── evaluator.py ├── models ├── base.py ├── factory.py ├── huggingface.py └── openai.py └── result_parser.py /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/README.md -------------------------------------------------------------------------------- /data/benchmark_questions.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/data/benchmark_questions.jsonl -------------------------------------------------------------------------------- /data/final_model_responses/claude-3-5-sonnet-20241022_responses.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/data/final_model_responses/claude-3-5-sonnet-20241022_responses.jsonl -------------------------------------------------------------------------------- /data/final_model_responses/gemini-1.5-pro-002_responses.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/data/final_model_responses/gemini-1.5-pro-002_responses.jsonl -------------------------------------------------------------------------------- /data/final_model_responses/gpt-4o-2024-08-06_responses.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/data/final_model_responses/gpt-4o-2024-08-06_responses.jsonl -------------------------------------------------------------------------------- /data/final_model_responses/llama-3-2-3b-instruct.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/data/final_model_responses/llama-3-2-3b-instruct.jsonl -------------------------------------------------------------------------------- /data/final_model_responses/llama-3-3-70b-instruct.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/data/final_model_responses/llama-3-3-70b-instruct.jsonl -------------------------------------------------------------------------------- /data/final_model_responses/llama3-1-405b-instruct-v1_responses.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/data/final_model_responses/llama3-1-405b-instruct-v1_responses.jsonl -------------------------------------------------------------------------------- /data/final_model_responses/mistral-large-latest_responses.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/data/final_model_responses/mistral-large-latest_responses.jsonl -------------------------------------------------------------------------------- /data/final_model_responses/mixtral-8x7b-instruct.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/data/final_model_responses/mixtral-8x7b-instruct.jsonl -------------------------------------------------------------------------------- /data/final_model_responses/o1-preview_responses.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/data/final_model_responses/o1-preview_responses.jsonl -------------------------------------------------------------------------------- /data/final_model_responses/qwen2-5_14b.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/data/final_model_responses/qwen2-5_14b.jsonl -------------------------------------------------------------------------------- /data/final_model_responses/qwen2-5_72b.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/data/final_model_responses/qwen2-5_72b.jsonl -------------------------------------------------------------------------------- /data/final_model_responses/qwen2_72b.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/data/final_model_responses/qwen2_72b.jsonl -------------------------------------------------------------------------------- /data/response_template.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/data/response_template.jsonl -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/main.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/requirements.txt -------------------------------------------------------------------------------- /src/conversation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/src/conversation.py -------------------------------------------------------------------------------- /src/data_loader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/src/data_loader.py -------------------------------------------------------------------------------- /src/evaluator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/src/evaluator.py -------------------------------------------------------------------------------- /src/models/base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/src/models/base.py -------------------------------------------------------------------------------- /src/models/factory.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/src/models/factory.py -------------------------------------------------------------------------------- /src/models/huggingface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/src/models/huggingface.py -------------------------------------------------------------------------------- /src/models/openai.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/src/models/openai.py -------------------------------------------------------------------------------- /src/result_parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ekwinox117/multi-challenge/HEAD/src/result_parser.py --------------------------------------------------------------------------------