├── .config
    └── pre-commit-config.yaml
├── .env.sample
├── .github
    ├── .codecov.yml
    └── dependabot.yml
├── .gitignore
├── LICENSE
├── README.md
├── agent_as_a_judge
    ├── __init__.py
    ├── agent.py
    ├── config.py
    ├── llm
    │   ├── __init__.py
    │   ├── cost.py
    │   └── provider.py
    ├── module
    │   ├── __init__.py
    │   ├── ask.py
    │   ├── code_search.py
    │   ├── graph.py
    │   ├── locate.py
    │   ├── memory.py
    │   ├── planning.py
    │   ├── prompt
    │   │   ├── __init__.py
    │   │   ├── prompt_ask.py
    │   │   ├── prompt_judge.py
    │   │   ├── prompt_locate.py
    │   │   ├── prompt_planning.py
    │   │   ├── prompt_retrieve.py
    │   │   ├── system_prompt_ask.py
    │   │   ├── system_prompt_judge.py
    │   │   ├── system_prompt_locate.py
    │   │   ├── system_prompt_planning.py
    │   │   └── system_prompt_retrieve.py
    │   ├── read.py
    │   ├── statistics.py
    │   └── text_retrieve.py
    └── utils
    │   ├── __init__.py
    │   ├── count_lines.py
    │   └── truncate.py
├── assets
    ├── aaaj_logo_v6.png
    ├── aaaj_logo_v7.png
    ├── aaaj_sample.md
    ├── ask_sample.md
    ├── dataset.png
    ├── demo.gif
    ├── devai_logo.png
    ├── judge_first.png
    ├── openwiki_1a.jpeg
    ├── openwiki_1b.jpeg
    └── sample.jpeg
├── benchmark
    ├── devai
    │   ├── README.md
    │   ├── constraints.json
    │   ├── instances
    │   │   ├── 01_Image_Classification_ResNet18_Fashion_MNIST_DL.json
    │   │   ├── 02_Maze_Solver_Q_Learning_Gridworld_RL.json
    │   │   ├── 03_Text_Classification_NaiveBayes_20Newsgroups_ML.json
    │   │   ├── 04_Text_Generation_GPT2_Prompts_DL.json
    │   │   ├── 05_Game_Simulation_DQN_CartPole_v1_RL.json
    │   │   ├── 06_Sentiment_Analysis_SVM_Sentiment140_ML.json
    │   │   ├── 07_Image_Super_Resolution_SRCNN_Set5_DL.json
    │   │   ├── 08_Robot_Control_PPO_PyBullet_RL.json
    │   │   ├── 09_Recommendation_System_NCF_MovieLens_ML.json
    │   │   ├── 10_Face_Recognition_FaceNet_LFW_DL.json
    │   │   ├── 11_House_Price_Prediction_LinearRegression_BostonHousing_ML.json
    │   │   ├── 12_Spam_Detection_SVM_Enron_ML.json
    │   │   ├── 13_Style_Transfer_Perceptual_Loss_CustomImages_DL.json
    │   │   ├── 14_Customer_Churn_Prediction_LogisticRegression_Telco_ML.json
    │   │   ├── 15_Image_Captioning_ShowAndTell_Flickr8k_DL.json
    │   │   ├── 16_Credit_Scoring_DecisionTree_GermanCredit_ML.json
    │   │   ├── 17_Heart_Disease_Prediction_XGBoost_UCI_ML.json
    │   │   ├── 18_Image_Enhancement_SRGAN_DIV2K_DL.json
    │   │   ├── 19_Time_Series_Forecasting_Seq2Seq_LSTM_Rossmann_ML.json
    │   │   ├── 20_Car_Price_Prediction_RandomForest_CarPrices_ML.json
    │   │   ├── 21_Iris_Classification_SVM_Iris_ML.json
    │   │   ├── 22_Sentiment_Analysis_LSTM_IMDb_DL.json
    │   │   ├── 23_Wine_Quality_Prediction_DecisionTree_WineQuality_ML.json
    │   │   ├── 24_Diabetes_Prediction_LogisticRegression_PimaIndians_ML.json
    │   │   ├── 25_Speech_Emotion_Recognition_CNN_LSTM_RAVDESS_DL.json
    │   │   ├── 26_Mushroom_Classification_RandomForest_Mushroom_ML.json
    │   │   ├── 27_Image_Generation_DCGAN_MNIST_DL.json
    │   │   ├── 28_Stock_Price_Prediction_LSTM_YahooFinance_ML.json
    │   │   ├── 29_Financial_Time_Series_Prediction_LSTM_ML.json
    │   │   ├── 30_Image_Segmentation_UNet_PascalVOC_DL.json
    │   │   ├── 31_Cancer_Prediction_SVM_BreastCancer_ML.json
    │   │   ├── 32_Weather_Data_Analysis_LinearRegression_Weather_ML.json
    │   │   ├── 33_Object_Detection_YOLOv3_COCO_DL.json
    │   │   ├── 34_Customer_Segmentation_KMeans_CustomerSegmentation_ML.json
    │   │   ├── 35_Loan_Default_Prediction_RandomForest_LendingClub_ML.json
    │   │   ├── 36_Music_Emotion_Classification_SVM_GTZAN_ML.json
    │   │   ├── 37_Lane_Detection_ResNet50_TuSimple_DL.json
    │   │   ├── 38_Object_Tracking_Siamese_OTB50_DL.json
    │   │   ├── 39_Drug_Response_Prediction_SVM_GDSC_ML.json
    │   │   ├── 40_Text_Summarization_BART_CNNDailyMail_DL.json
    │   │   ├── 41_Stock_Classification_KNN_YahooFinance_ML.json
    │   │   ├── 42_Medical_Image_Classification_DenseNet121_ChestXray_DL.json
    │   │   ├── 43_Social_Network_Analysis_GCN_Cora_ML.json
    │   │   ├── 44_Text_Classification_BERT_AGNews_DL.json
    │   │   ├── 45_Product_Recommendation_MatrixFactorization_AmazonReviews_ML.json
    │   │   ├── 46_Speech_Recognition_DeepSpeech_LibriSpeech_DL.json
    │   │   ├── 47_Network_Traffic_Analysis_KMeans_NetworkTraffic_ML.json
    │   │   ├── 48_Stock_Trading_Simulation_PPO_HistoricalData_RL.json
    │   │   ├── 49_Explainable_AI_LIME_Titanic_ML.json
    │   │   ├── 50_Math_Problem_Solving_Transformer_DeepMindMath_DL.json
    │   │   ├── 51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json
    │   │   ├── 52_Devin_AI_Trains_an_AI.json
    │   │   ├── 53_Devin_Upwork_Side_Hustle.json
    │   │   ├── 54_Mock_OpenAI_API_Response_Analyzer_App.json
    │   │   └── 55_SQLite_Database_Viewer_and_Analyzer_App.json
    │   ├── trajectory-schema.json
    │   └── validate_trajectory.py
    ├── judgment
    │   ├── GPT-Pilot
    │   │   ├── agent_as_a_judge
    │   │   │   └── gray_box
    │   │   │   │   ├── 01_Image_Classification_ResNet18_Fashion_MNIST_DL.json
    │   │   │   │   ├── 02_Maze_Solver_Q_Learning_Gridworld_RL.json
    │   │   │   │   ├── 03_Text_Classification_NaiveBayes_20Newsgroups_ML.json
    │   │   │   │   ├── 04_Text_Generation_GPT2_Prompts_DL.json
    │   │   │   │   ├── 05_Game_Simulation_DQN_CartPole_v1_RL.json
    │   │   │   │   ├── 06_Sentiment_Analysis_SVM_Sentiment140_ML.json
    │   │   │   │   ├── 07_Image_Super_Resolution_SRCNN_Set5_DL.json
    │   │   │   │   ├── 08_Robot_Control_PPO_PyBullet_RL.json
    │   │   │   │   ├── 09_Recommendation_System_NCF_MovieLens_ML.json
    │   │   │   │   ├── 10_Face_Recognition_FaceNet_LFW_DL.json
    │   │   │   │   ├── 11_House_Price_Prediction_LinearRegression_BostonHousing_ML.json
    │   │   │   │   ├── 12_Spam_Detection_SVM_Enron_ML.json
    │   │   │   │   ├── 13_Style_Transfer_Perceptual_Loss_CustomImages_DL.json
    │   │   │   │   ├── 14_Customer_Churn_Prediction_LogisticRegression_Telco_ML.json
    │   │   │   │   ├── 15_Image_Captioning_ShowAndTell_Flickr8k_DL.json
    │   │   │   │   ├── 16_Credit_Scoring_DecisionTree_GermanCredit_ML.json
    │   │   │   │   ├── 17_Heart_Disease_Prediction_XGBoost_UCI_ML.json
    │   │   │   │   ├── 18_Image_Enhancement_SRGAN_DIV2K_DL.json
    │   │   │   │   ├── 19_Time_Series_Forecasting_Seq2Seq_LSTM_Rossmann_ML.json
    │   │   │   │   ├── 20_Car_Price_Prediction_RandomForest_CarPrices_ML.json
    │   │   │   │   ├── 21_Iris_Classification_SVM_Iris_ML.json
    │   │   │   │   ├── 22_Sentiment_Analysis_LSTM_IMDb_DL.json
    │   │   │   │   ├── 23_Wine_Quality_Prediction_DecisionTree_WineQuality_ML.json
    │   │   │   │   ├── 24_Diabetes_Prediction_LogisticRegression_PimaIndians_ML.json
    │   │   │   │   ├── 25_Speech_Emotion_Recognition_CNN_LSTM_RAVDESS_DL.json
    │   │   │   │   ├── 26_Mushroom_Classification_RandomForest_Mushroom_ML.json
    │   │   │   │   ├── 27_Image_Generation_DCGAN_MNIST_DL.json
    │   │   │   │   ├── 28_Stock_Price_Prediction_LSTM_YahooFinance_ML.json
    │   │   │   │   ├── 29_Financial_Time_Series_Prediction_LSTM_ML.json
    │   │   │   │   ├── 30_Image_Segmentation_UNet_PascalVOC_DL.json
    │   │   │   │   ├── 31_Cancer_Prediction_SVM_BreastCancer_ML.json
    │   │   │   │   ├── 32_Weather_Data_Analysis_LinearRegression_Weather_ML.json
    │   │   │   │   ├── 33_Object_Detection_YOLOv3_COCO_DL.json
    │   │   │   │   ├── 34_Customer_Segmentation_KMeans_CustomerSegmentation_ML.json
    │   │   │   │   ├── 35_Loan_Default_Prediction_RandomForest_LendingClub_ML.json
    │   │   │   │   ├── 36_Music_Emotion_Classification_SVM_GTZAN_ML.json
    │   │   │   │   ├── 37_Lane_Detection_ResNet50_TuSimple_DL.json
    │   │   │   │   ├── 38_Object_Tracking_Siamese_OTB50_DL.json
    │   │   │   │   ├── 39_Drug_Response_Prediction_SVM_GDSC_ML.json
    │   │   │   │   ├── 40_Text_Summarization_BART_CNNDailyMail_DL.json
    │   │   │   │   ├── 41_Stock_Classification_KNN_YahooFinance_ML.json
    │   │   │   │   ├── 42_Medical_Image_Classification_DenseNet121_ChestXray_DL.json
    │   │   │   │   ├── 43_Social_Network_Analysis_GCN_Cora_ML.json
    │   │   │   │   ├── 44_Text_Classification_BERT_AGNews_DL.json
    │   │   │   │   ├── 45_Product_Recommendation_MatrixFactorization_AmazonReviews_ML.json
    │   │   │   │   ├── 46_Speech_Recognition_DeepSpeech_LibriSpeech_DL.json
    │   │   │   │   ├── 47_Network_Traffic_Analysis_KMeans_NetworkTraffic_ML.json
    │   │   │   │   ├── 48_Stock_Trading_Simulation_PPO_HistoricalData_RL.json
    │   │   │   │   ├── 49_Explainable_AI_LIME_Titanic_ML.json
    │   │   │   │   ├── 50_Math_Problem_Solving_Transformer_DeepMindMath_DL.json
    │   │   │   │   ├── 51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json
    │   │   │   │   ├── 52_Devin_AI_Trains_an_AI.json
    │   │   │   │   ├── 53_Devin_Upwork_Side_Hustle.json
    │   │   │   │   ├── 54_Mock_OpenAI_API_Response_Analyzer_App.json
    │   │   │   │   └── 55_SQLite_Database_Viewer_and_Analyzer_App.json
    │   │   └── human_as_a_judge
    │   │   │   ├── 01_Image_Classification_ResNet18_Fashion_MNIST_DL.json
    │   │   │   ├── 02_Maze_Solver_Q_Learning_Gridworld_RL.json
    │   │   │   ├── 03_Text_Classification_NaiveBayes_20Newsgroups_ML.json
    │   │   │   ├── 04_Text_Generation_GPT2_Prompts_DL.json
    │   │   │   ├── 05_Game_Simulation_DQN_CartPole_v1_RL.json
    │   │   │   ├── 06_Sentiment_Analysis_SVM_Sentiment140_ML.json
    │   │   │   ├── 07_Image_Super_Resolution_SRCNN_Set5_DL.json
    │   │   │   ├── 08_Robot_Control_PPO_PyBullet_RL.json
    │   │   │   ├── 09_Recommendation_System_NCF_MovieLens_ML.json
    │   │   │   ├── 10_Face_Recognition_FaceNet_LFW_DL.json
    │   │   │   ├── 11_House_Price_Prediction_LinearRegression_BostonHousing_ML.json
    │   │   │   ├── 12_Spam_Detection_SVM_Enron_ML.json
    │   │   │   ├── 13_Style_Transfer_Perceptual_Loss_CustomImages_DL.json
    │   │   │   ├── 14_Customer_Churn_Prediction_LogisticRegression_Telco_ML.json
    │   │   │   ├── 15_Image_Captioning_ShowAndTell_Flickr8k_DL.json
    │   │   │   ├── 16_Credit_Scoring_DecisionTree_GermanCredit_ML.json
    │   │   │   ├── 17_Heart_Disease_Prediction_XGBoost_UCI_ML.json
    │   │   │   ├── 18_Image_Enhancement_SRGAN_DIV2K_DL.json
    │   │   │   ├── 19_Time_Series_Forecasting_Seq2Seq_LSTM_Rossmann_ML.json
    │   │   │   ├── 20_Car_Price_Prediction_RandomForest_CarPrices_ML.json
    │   │   │   ├── 21_Iris_Classification_SVM_Iris_ML.json
    │   │   │   ├── 22_Sentiment_Analysis_LSTM_IMDb_DL.json
    │   │   │   ├── 23_Wine_Quality_Prediction_DecisionTree_WineQuality_ML.json
    │   │   │   ├── 24_Diabetes_Prediction_LogisticRegression_PimaIndians_ML.json
    │   │   │   ├── 25_Speech_Emotion_Recognition_CNN_LSTM_RAVDESS_DL.json
    │   │   │   ├── 26_Mushroom_Classification_RandomForest_Mushroom_ML.json
    │   │   │   ├── 27_Image_Generation_DCGAN_MNIST_DL.json
    │   │   │   ├── 28_Stock_Price_Prediction_LSTM_YahooFinance_ML.json
    │   │   │   ├── 29_Financial_Time_Series_Prediction_LSTM_ML.json
    │   │   │   ├── 30_Image_Segmentation_UNet_PascalVOC_DL.json
    │   │   │   ├── 31_Cancer_Prediction_SVM_BreastCancer_ML.json
    │   │   │   ├── 32_Weather_Data_Analysis_LinearRegression_Weather_ML.json
    │   │   │   ├── 33_Object_Detection_YOLOv3_COCO_DL.json
    │   │   │   ├── 34_Customer_Segmentation_KMeans_CustomerSegmentation_ML.json
    │   │   │   ├── 35_Loan_Default_Prediction_RandomForest_LendingClub_ML.json
    │   │   │   ├── 36_Music_Emotion_Classification_SVM_GTZAN_ML.json
    │   │   │   ├── 37_Lane_Detection_ResNet50_TuSimple_DL.json
    │   │   │   ├── 38_Object_Tracking_Siamese_OTB50_DL.json
    │   │   │   ├── 39_Drug_Response_Prediction_SVM_GDSC_ML.json
    │   │   │   ├── 40_Text_Summarization_BART_CNNDailyMail_DL.json
    │   │   │   ├── 41_Stock_Classification_KNN_YahooFinance_ML.json
    │   │   │   ├── 42_Medical_Image_Classification_DenseNet121_ChestXray_DL.json
    │   │   │   ├── 43_Social_Network_Analysis_GCN_Cora_ML.json
    │   │   │   ├── 44_Text_Classification_BERT_AGNews_DL.json
    │   │   │   ├── 45_Product_Recommendation_MatrixFactorization_AmazonReviews_ML.json
    │   │   │   ├── 46_Speech_Recognition_DeepSpeech_LibriSpeech_DL.json
    │   │   │   ├── 47_Network_Traffic_Analysis_KMeans_NetworkTraffic_ML.json
    │   │   │   ├── 48_Stock_Trading_Simulation_PPO_HistoricalData_RL.json
    │   │   │   ├── 49_Explainable_AI_LIME_Titanic_ML.json
    │   │   │   ├── 50_Math_Problem_Solving_Transformer_DeepMindMath_DL.json
    │   │   │   ├── 51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json
    │   │   │   ├── 52_Devin_AI_Trains_an_AI.json
    │   │   │   ├── 53_Devin_Upwork_Side_Hustle.json
    │   │   │   ├── 54_Mock_OpenAI_API_Response_Analyzer_App.json
    │   │   │   └── 55_SQLite_Database_Viewer_and_Analyzer_App.json
    │   ├── MetaGPT
    │   │   ├── agent_as_a_judge
    │   │   │   └── gray_box
    │   │   │   │   ├── 01_Image_Classification_ResNet18_Fashion_MNIST_DL.json
    │   │   │   │   ├── 02_Maze_Solver_Q_Learning_Gridworld_RL.json
    │   │   │   │   ├── 03_Text_Classification_NaiveBayes_20Newsgroups_ML.json
    │   │   │   │   ├── 04_Text_Generation_GPT2_Prompts_DL.json
    │   │   │   │   ├── 05_Game_Simulation_DQN_CartPole_v1_RL.json
    │   │   │   │   ├── 06_Sentiment_Analysis_SVM_Sentiment140_ML.json
    │   │   │   │   ├── 07_Image_Super_Resolution_SRCNN_Set5_DL.json
    │   │   │   │   ├── 08_Robot_Control_PPO_PyBullet_RL.json
    │   │   │   │   ├── 09_Recommendation_System_NCF_MovieLens_ML.json
    │   │   │   │   ├── 10_Face_Recognition_FaceNet_LFW_DL.json
    │   │   │   │   ├── 11_House_Price_Prediction_LinearRegression_BostonHousing_ML.json
    │   │   │   │   ├── 12_Spam_Detection_SVM_Enron_ML.json
    │   │   │   │   ├── 13_Style_Transfer_Perceptual_Loss_CustomImages_DL.json
    │   │   │   │   ├── 14_Customer_Churn_Prediction_LogisticRegression_Telco_ML.json
    │   │   │   │   ├── 15_Image_Captioning_ShowAndTell_Flickr8k_DL.json
    │   │   │   │   ├── 16_Credit_Scoring_DecisionTree_GermanCredit_ML.json
    │   │   │   │   ├── 17_Heart_Disease_Prediction_XGBoost_UCI_ML.json
    │   │   │   │   ├── 18_Image_Enhancement_SRGAN_DIV2K_DL.json
    │   │   │   │   ├── 19_Time_Series_Forecasting_Seq2Seq_LSTM_Rossmann_ML.json
    │   │   │   │   ├── 20_Car_Price_Prediction_RandomForest_CarPrices_ML.json
    │   │   │   │   ├── 21_Iris_Classification_SVM_Iris_ML.json
    │   │   │   │   ├── 22_Sentiment_Analysis_LSTM_IMDb_DL.json
    │   │   │   │   ├── 23_Wine_Quality_Prediction_DecisionTree_WineQuality_ML.json
    │   │   │   │   ├── 24_Diabetes_Prediction_LogisticRegression_PimaIndians_ML.json
    │   │   │   │   ├── 25_Speech_Emotion_Recognition_CNN_LSTM_RAVDESS_DL.json
    │   │   │   │   ├── 26_Mushroom_Classification_RandomForest_Mushroom_ML.json
    │   │   │   │   ├── 27_Image_Generation_DCGAN_MNIST_DL.json
    │   │   │   │   ├── 28_Stock_Price_Prediction_LSTM_YahooFinance_ML.json
    │   │   │   │   ├── 29_Financial_Time_Series_Prediction_LSTM_ML.json
    │   │   │   │   ├── 30_Image_Segmentation_UNet_PascalVOC_DL.json
    │   │   │   │   ├── 31_Cancer_Prediction_SVM_BreastCancer_ML.json
    │   │   │   │   ├── 32_Weather_Data_Analysis_LinearRegression_Weather_ML.json
    │   │   │   │   ├── 33_Object_Detection_YOLOv3_COCO_DL.json
    │   │   │   │   ├── 34_Customer_Segmentation_KMeans_CustomerSegmentation_ML.json
    │   │   │   │   ├── 35_Loan_Default_Prediction_RandomForest_LendingClub_ML.json
    │   │   │   │   ├── 36_Music_Emotion_Classification_SVM_GTZAN_ML.json
    │   │   │   │   ├── 37_Lane_Detection_ResNet50_TuSimple_DL.json
    │   │   │   │   ├── 38_Object_Tracking_Siamese_OTB50_DL.json
    │   │   │   │   ├── 39_Drug_Response_Prediction_SVM_GDSC_ML.json
    │   │   │   │   ├── 40_Text_Summarization_BART_CNNDailyMail_DL.json
    │   │   │   │   ├── 41_Stock_Classification_KNN_YahooFinance_ML.json
    │   │   │   │   ├── 42_Medical_Image_Classification_DenseNet121_ChestXray_DL.json
    │   │   │   │   ├── 43_Social_Network_Analysis_GCN_Cora_ML.json
    │   │   │   │   ├── 44_Text_Classification_BERT_AGNews_DL.json
    │   │   │   │   ├── 45_Product_Recommendation_MatrixFactorization_AmazonReviews_ML.json
    │   │   │   │   ├── 46_Speech_Recognition_DeepSpeech_LibriSpeech_DL.json
    │   │   │   │   ├── 47_Network_Traffic_Analysis_KMeans_NetworkTraffic_ML.json
    │   │   │   │   ├── 48_Stock_Trading_Simulation_PPO_HistoricalData_RL.json
    │   │   │   │   ├── 49_Explainable_AI_LIME_Titanic_ML.json
    │   │   │   │   ├── 50_Math_Problem_Solving_Transformer_DeepMindMath_DL.json
    │   │   │   │   ├── 51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json
    │   │   │   │   ├── 52_Devin_AI_Trains_an_AI.json
    │   │   │   │   ├── 53_Devin_Upwork_Side_Hustle.json
    │   │   │   │   ├── 54_Mock_OpenAI_API_Response_Analyzer_App.json
    │   │   │   │   └── 55_SQLite_Database_Viewer_and_Analyzer_App.json
    │   │   └── human_as_a_judge
    │   │   │   ├── 01_Image_Classification_ResNet18_Fashion_MNIST_DL.json
    │   │   │   ├── 02_Maze_Solver_Q_Learning_Gridworld_RL.json
    │   │   │   ├── 03_Text_Classification_NaiveBayes_20Newsgroups_ML.json
    │   │   │   ├── 04_Text_Generation_GPT2_Prompts_DL.json
    │   │   │   ├── 05_Game_Simulation_DQN_CartPole_v1_RL.json
    │   │   │   ├── 06_Sentiment_Analysis_SVM_Sentiment140_ML.json
    │   │   │   ├── 07_Image_Super_Resolution_SRCNN_Set5_DL.json
    │   │   │   ├── 08_Robot_Control_PPO_PyBullet_RL.json
    │   │   │   ├── 09_Recommendation_System_NCF_MovieLens_ML.json
    │   │   │   ├── 10_Face_Recognition_FaceNet_LFW_DL.json
    │   │   │   ├── 11_House_Price_Prediction_LinearRegression_BostonHousing_ML.json
    │   │   │   ├── 12_Spam_Detection_SVM_Enron_ML.json
    │   │   │   ├── 13_Style_Transfer_Perceptual_Loss_CustomImages_DL.json
    │   │   │   ├── 14_Customer_Churn_Prediction_LogisticRegression_Telco_ML.json
    │   │   │   ├── 15_Image_Captioning_ShowAndTell_Flickr8k_DL.json
    │   │   │   ├── 16_Credit_Scoring_DecisionTree_GermanCredit_ML.json
    │   │   │   ├── 17_Heart_Disease_Prediction_XGBoost_UCI_ML.json
    │   │   │   ├── 18_Image_Enhancement_SRGAN_DIV2K_DL.json
    │   │   │   ├── 19_Time_Series_Forecasting_Seq2Seq_LSTM_Rossmann_ML.json
    │   │   │   ├── 20_Car_Price_Prediction_RandomForest_CarPrices_ML.json
    │   │   │   ├── 21_Iris_Classification_SVM_Iris_ML.json
    │   │   │   ├── 22_Sentiment_Analysis_LSTM_IMDb_DL.json
    │   │   │   ├── 23_Wine_Quality_Prediction_DecisionTree_WineQuality_ML.json
    │   │   │   ├── 24_Diabetes_Prediction_LogisticRegression_PimaIndians_ML.json
    │   │   │   ├── 25_Speech_Emotion_Recognition_CNN_LSTM_RAVDESS_DL.json
    │   │   │   ├── 26_Mushroom_Classification_RandomForest_Mushroom_ML.json
    │   │   │   ├── 27_Image_Generation_DCGAN_MNIST_DL.json
    │   │   │   ├── 28_Stock_Price_Prediction_LSTM_YahooFinance_ML.json
    │   │   │   ├── 29_Financial_Time_Series_Prediction_LSTM_ML.json
    │   │   │   ├── 30_Image_Segmentation_UNet_PascalVOC_DL.json
    │   │   │   ├── 31_Cancer_Prediction_SVM_BreastCancer_ML.json
    │   │   │   ├── 32_Weather_Data_Analysis_LinearRegression_Weather_ML.json
    │   │   │   ├── 33_Object_Detection_YOLOv3_COCO_DL.json
    │   │   │   ├── 34_Customer_Segmentation_KMeans_CustomerSegmentation_ML.json
    │   │   │   ├── 35_Loan_Default_Prediction_RandomForest_LendingClub_ML.json
    │   │   │   ├── 36_Music_Emotion_Classification_SVM_GTZAN_ML.json
    │   │   │   ├── 37_Lane_Detection_ResNet50_TuSimple_DL.json
    │   │   │   ├── 38_Object_Tracking_Siamese_OTB50_DL.json
    │   │   │   ├── 39_Drug_Response_Prediction_SVM_GDSC_ML.json
    │   │   │   ├── 40_Text_Summarization_BART_CNNDailyMail_DL.json
    │   │   │   ├── 41_Stock_Classification_KNN_YahooFinance_ML.json
    │   │   │   ├── 42_Medical_Image_Classification_DenseNet121_ChestXray_DL.json
    │   │   │   ├── 43_Social_Network_Analysis_GCN_Cora_ML.json
    │   │   │   ├── 44_Text_Classification_BERT_AGNews_DL.json
    │   │   │   ├── 45_Product_Recommendation_MatrixFactorization_AmazonReviews_ML.json
    │   │   │   ├── 46_Speech_Recognition_DeepSpeech_LibriSpeech_DL.json
    │   │   │   ├── 47_Network_Traffic_Analysis_KMeans_NetworkTraffic_ML.json
    │   │   │   ├── 48_Stock_Trading_Simulation_PPO_HistoricalData_RL.json
    │   │   │   ├── 49_Explainable_AI_LIME_Titanic_ML.json
    │   │   │   ├── 50_Math_Problem_Solving_Transformer_DeepMindMath_DL.json
    │   │   │   ├── 51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json
    │   │   │   ├── 52_Devin_AI_Trains_an_AI.json
    │   │   │   ├── 53_Devin_Upwork_Side_Hustle.json
    │   │   │   ├── 54_Mock_OpenAI_API_Response_Analyzer_App.json
    │   │   │   └── 55_SQLite_Database_Viewer_and_Analyzer_App.json
    │   └── OpenHands
    │   │   ├── agent_as_a_judge
    │   │       └── gray_box
    │   │       │   ├── 01_Image_Classification_ResNet18_Fashion_MNIST_DL.json
    │   │       │   ├── 02_Maze_Solver_Q_Learning_Gridworld_RL.json
    │   │       │   ├── 03_Text_Classification_NaiveBayes_20Newsgroups_ML.json
    │   │       │   ├── 04_Text_Generation_GPT2_Prompts_DL.json
    │   │       │   ├── 05_Game_Simulation_DQN_CartPole_v1_RL.json
    │   │       │   ├── 06_Sentiment_Analysis_SVM_Sentiment140_ML.json
    │   │       │   ├── 07_Image_Super_Resolution_SRCNN_Set5_DL.json
    │   │       │   ├── 08_Robot_Control_PPO_PyBullet_RL.json
    │   │       │   ├── 09_Recommendation_System_NCF_MovieLens_ML.json
    │   │       │   ├── 10_Face_Recognition_FaceNet_LFW_DL.json
    │   │       │   ├── 11_House_Price_Prediction_LinearRegression_BostonHousing_ML.json
    │   │       │   ├── 12_Spam_Detection_SVM_Enron_ML.json
    │   │       │   ├── 13_Style_Transfer_Perceptual_Loss_CustomImages_DL.json
    │   │       │   ├── 14_Customer_Churn_Prediction_LogisticRegression_Telco_ML.json
    │   │       │   ├── 15_Image_Captioning_ShowAndTell_Flickr8k_DL.json
    │   │       │   ├── 16_Credit_Scoring_DecisionTree_GermanCredit_ML.json
    │   │       │   ├── 17_Heart_Disease_Prediction_XGBoost_UCI_ML.json
    │   │       │   ├── 18_Image_Enhancement_SRGAN_DIV2K_DL.json
    │   │       │   ├── 19_Time_Series_Forecasting_Seq2Seq_LSTM_Rossmann_ML.json
    │   │       │   ├── 20_Car_Price_Prediction_RandomForest_CarPrices_ML.json
    │   │       │   ├── 21_Iris_Classification_SVM_Iris_ML.json
    │   │       │   ├── 22_Sentiment_Analysis_LSTM_IMDb_DL.json
    │   │       │   ├── 23_Wine_Quality_Prediction_DecisionTree_WineQuality_ML.json
    │   │       │   ├── 24_Diabetes_Prediction_LogisticRegression_PimaIndians_ML.json
    │   │       │   ├── 25_Speech_Emotion_Recognition_CNN_LSTM_RAVDESS_DL.json
    │   │       │   ├── 26_Mushroom_Classification_RandomForest_Mushroom_ML.json
    │   │       │   ├── 27_Image_Generation_DCGAN_MNIST_DL.json
    │   │       │   ├── 28_Stock_Price_Prediction_LSTM_YahooFinance_ML.json
    │   │       │   ├── 29_Financial_Time_Series_Prediction_LSTM_ML.json
    │   │       │   ├── 30_Image_Segmentation_UNet_PascalVOC_DL.json
    │   │       │   ├── 31_Cancer_Prediction_SVM_BreastCancer_ML.json
    │   │       │   ├── 32_Weather_Data_Analysis_LinearRegression_Weather_ML.json
    │   │       │   ├── 33_Object_Detection_YOLOv3_COCO_DL.json
    │   │       │   ├── 34_Customer_Segmentation_KMeans_CustomerSegmentation_ML.json
    │   │       │   ├── 35_Loan_Default_Prediction_RandomForest_LendingClub_ML.json
    │   │       │   ├── 36_Music_Emotion_Classification_SVM_GTZAN_ML.json
    │   │       │   ├── 37_Lane_Detection_ResNet50_TuSimple_DL.json
    │   │       │   ├── 38_Object_Tracking_Siamese_OTB50_DL.json
    │   │       │   ├── 39_Drug_Response_Prediction_SVM_GDSC_ML.json
    │   │       │   ├── 40_Text_Summarization_BART_CNNDailyMail_DL.json
    │   │       │   ├── 41_Stock_Classification_KNN_YahooFinance_ML.json
    │   │       │   ├── 42_Medical_Image_Classification_DenseNet121_ChestXray_DL.json
    │   │       │   ├── 43_Social_Network_Analysis_GCN_Cora_ML.json
    │   │       │   ├── 44_Text_Classification_BERT_AGNews_DL.json
    │   │       │   ├── 45_Product_Recommendation_MatrixFactorization_AmazonReviews_ML.json
    │   │       │   ├── 46_Speech_Recognition_DeepSpeech_LibriSpeech_DL.json
    │   │       │   ├── 47_Network_Traffic_Analysis_KMeans_NetworkTraffic_ML.json
    │   │       │   ├── 48_Stock_Trading_Simulation_PPO_HistoricalData_RL.json
    │   │       │   ├── 49_Explainable_AI_LIME_Titanic_ML.json
    │   │       │   ├── 50_Math_Problem_Solving_Transformer_DeepMindMath_DL.json
    │   │       │   ├── 51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json
    │   │       │   ├── 52_Devin_AI_Trains_an_AI.json
    │   │       │   ├── 53_Devin_Upwork_Side_Hustle.json
    │   │       │   ├── 54_Mock_OpenAI_API_Response_Analyzer_App.json
    │   │       │   └── 55_SQLite_Database_Viewer_and_Analyzer_App.json
    │   │   └── human_as_a_judge
    │   │       ├── 01_Image_Classification_ResNet18_Fashion_MNIST_DL.json
    │   │       ├── 02_Maze_Solver_Q_Learning_Gridworld_RL.json
    │   │       ├── 03_Text_Classification_NaiveBayes_20Newsgroups_ML.json
    │   │       ├── 04_Text_Generation_GPT2_Prompts_DL.json
    │   │       ├── 05_Game_Simulation_DQN_CartPole_v1_RL.json
    │   │       ├── 06_Sentiment_Analysis_SVM_Sentiment140_ML.json
    │   │       ├── 07_Image_Super_Resolution_SRCNN_Set5_DL.json
    │   │       ├── 08_Robot_Control_PPO_PyBullet_RL.json
    │   │       ├── 09_Recommendation_System_NCF_MovieLens_ML.json
    │   │       ├── 10_Face_Recognition_FaceNet_LFW_DL.json
    │   │       ├── 11_House_Price_Prediction_LinearRegression_BostonHousing_ML.json
    │   │       ├── 12_Spam_Detection_SVM_Enron_ML.json
    │   │       ├── 13_Style_Transfer_Perceptual_Loss_CustomImages_DL.json
    │   │       ├── 14_Customer_Churn_Prediction_LogisticRegression_Telco_ML.json
    │   │       ├── 15_Image_Captioning_ShowAndTell_Flickr8k_DL.json
    │   │       ├── 16_Credit_Scoring_DecisionTree_GermanCredit_ML.json
    │   │       ├── 17_Heart_Disease_Prediction_XGBoost_UCI_ML.json
    │   │       ├── 18_Image_Enhancement_SRGAN_DIV2K_DL.json
    │   │       ├── 19_Time_Series_Forecasting_Seq2Seq_LSTM_Rossmann_ML.json
    │   │       ├── 20_Car_Price_Prediction_RandomForest_CarPrices_ML.json
    │   │       ├── 21_Iris_Classification_SVM_Iris_ML.json
    │   │       ├── 22_Sentiment_Analysis_LSTM_IMDb_DL.json
    │   │       ├── 23_Wine_Quality_Prediction_DecisionTree_WineQuality_ML.json
    │   │       ├── 24_Diabetes_Prediction_LogisticRegression_PimaIndians_ML.json
    │   │       ├── 25_Speech_Emotion_Recognition_CNN_LSTM_RAVDESS_DL.json
    │   │       ├── 26_Mushroom_Classification_RandomForest_Mushroom_ML.json
    │   │       ├── 27_Image_Generation_DCGAN_MNIST_DL.json
    │   │       ├── 28_Stock_Price_Prediction_LSTM_YahooFinance_ML.json
    │   │       ├── 29_Financial_Time_Series_Prediction_LSTM_ML.json
    │   │       ├── 30_Image_Segmentation_UNet_PascalVOC_DL.json
    │   │       ├── 31_Cancer_Prediction_SVM_BreastCancer_ML.json
    │   │       ├── 32_Weather_Data_Analysis_LinearRegression_Weather_ML.json
    │   │       ├── 33_Object_Detection_YOLOv3_COCO_DL.json
    │   │       ├── 34_Customer_Segmentation_KMeans_CustomerSegmentation_ML.json
    │   │       ├── 35_Loan_Default_Prediction_RandomForest_LendingClub_ML.json
    │   │       ├── 36_Music_Emotion_Classification_SVM_GTZAN_ML.json
    │   │       ├── 37_Lane_Detection_ResNet50_TuSimple_DL.json
    │   │       ├── 38_Object_Tracking_Siamese_OTB50_DL.json
    │   │       ├── 39_Drug_Response_Prediction_SVM_GDSC_ML.json
    │   │       ├── 40_Text_Summarization_BART_CNNDailyMail_DL.json
    │   │       ├── 41_Stock_Classification_KNN_YahooFinance_ML.json
    │   │       ├── 42_Medical_Image_Classification_DenseNet121_ChestXray_DL.json
    │   │       ├── 43_Social_Network_Analysis_GCN_Cora_ML.json
    │   │       ├── 44_Text_Classification_BERT_AGNews_DL.json
    │   │       ├── 45_Product_Recommendation_MatrixFactorization_AmazonReviews_ML.json
    │   │       ├── 46_Speech_Recognition_DeepSpeech_LibriSpeech_DL.json
    │   │       ├── 47_Network_Traffic_Analysis_KMeans_NetworkTraffic_ML.json
    │   │       ├── 48_Stock_Trading_Simulation_PPO_HistoricalData_RL.json
    │   │       ├── 49_Explainable_AI_LIME_Titanic_ML.json
    │   │       ├── 50_Math_Problem_Solving_Transformer_DeepMindMath_DL.json
    │   │       ├── 51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json
    │   │       ├── 52_Devin_AI_Trains_an_AI.json
    │   │       ├── 53_Devin_Upwork_Side_Hustle.json
    │   │       ├── 54_Mock_OpenAI_API_Response_Analyzer_App.json
    │   │       └── 55_SQLite_Database_Viewer_and_Analyzer_App.json
    ├── trajectories
    │   └── OpenHands
    │   │   └── 39_Drug_Response_Prediction_SVM_GDSC_ML.json
    └── workspaces
    │   └── OpenHands
    │       └── 39_Drug_Response_Prediction_SVM_GDSC_ML
    │           ├── gdsc_dataset.csv
    │           ├── results
    │               ├── drug_response_prediction_report.md
    │               ├── drug_response_prediction_report.pdf
    │               ├── performance.txt
    │               └── rmse_scores.png
    │           └── src
    │               ├── data_loader.py
    │               ├── model.py
    │               └── train.py
├── poetry.lock
├── pyproject.toml
└── scripts
    ├── README.md
    ├── run_aaaj.py
    ├── run_ask.py
    ├── run_statistics.py
    ├── run_wiki.py
    └── templates
        └── html
            └── index.html


/.config/pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 | - repo: https://github.com/pre-commit/pre-commit-hooks
 3 |   rev: v4.0.1
 4 |   hooks:
 5 |     - id: trailing-whitespace
 6 |     - id: end-of-file-fixer
 7 |     - id: check-yaml
 8 |     - id: check-json
 9 | - repo: https://github.com/pre-commit/mirrors-mypy
10 |   rev: v0.910
11 |   hooks:
12 |     - id: mypy
13 |       additional_dependencies: ['types-termcolor']
14 |       language: python
15 |       entry: poetry run mypy
16 | 


--------------------------------------------------------------------------------
/.env.sample:
--------------------------------------------------------------------------------
1 | DEFAULT_LLM="gpt-4o-2024-08-06"
2 | OPENAI_API_KEY="sk-***"
3 | PROJECT_DIR="{PATH_TO_THIS_PROJECT}"


--------------------------------------------------------------------------------
/.github/.codecov.yml:
--------------------------------------------------------------------------------
 1 | codecov:
 2 |   notify:
 3 |     wait_for_ci: true
 4 | 
 5 | coverage:
 6 |   status:
 7 |     patch:
 8 |       default:
 9 |         threshold: 100% 
10 |     project:
11 |       default:
12 |         threshold: 5%
13 | comment: false
14 | github_checks:
15 |     annotations: false
16 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | updates:
 3 |   - package-ecosystem: "pip"
 4 |     directory: "/"
 5 |     schedule:
 6 |       interval: "weekly"
 7 |     open-pull-requests-limit: 5
 8 |     assignees:
 9 |       - mczhuge
10 |     labels:
11 |       - "dependencies"
12 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 metauto.ai
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/agent_as_a_judge/__init__.py:
--------------------------------------------------------------------------------
1 | from .llm.provider import LLM
2 | from .llm.cost import Cost
3 | 
4 | __all__ = ["LLM", "Cost"]
5 | 


--------------------------------------------------------------------------------
/agent_as_a_judge/config.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import List, Optional
 3 | from pathlib import Path
 4 | 
 5 | 
 6 | @dataclass
 7 | class AgentConfig:
 8 |     include_dirs: Optional[List[str]] = None
 9 |     exclude_dirs: Optional[List[str]] = None
10 |     exclude_files: Optional[List[str]] = None
11 |     setting: str = "gray_box"
12 |     planning: str = "efficient (no planning)"
13 |     judge_dir: Optional[Path] = None
14 |     workspace_dir: Optional[Path] = None
15 |     instance_dir: Optional[Path] = None
16 |     trajectory_file: Optional[Path] = None
17 | 
18 |     @classmethod
19 |     def from_args(cls, args):
20 | 
21 |         return cls(
22 |             include_dirs=(
23 |                 args.include_dirs
24 |                 if hasattr(args, "include_dirs")
25 |                 else ["src", "results", "models", "data"]
26 |             ),
27 |             exclude_dirs=(
28 |                 args.exclude_dirs
29 |                 if hasattr(args, "exclude_dirs")
30 |                 else ["__pycache__", "env"]
31 |             ),
32 |             exclude_files=(
33 |                 args.exclude_files if hasattr(args, "exclude_files") else [".DS_Store"]
34 |             ),
35 |             setting=args.setting,
36 |             planning=args.planning,
37 |             judge_dir=Path(args.judge_dir),
38 |             workspace_dir=Path(args.workspace_dir),
39 |             instance_dir=Path(args.instance_dir),
40 |             trajectory_file=(
41 |                 Path(args.trajectory_file) if args.trajectory_file else None
42 |             ),
43 |         )
44 | 


--------------------------------------------------------------------------------
/agent_as_a_judge/llm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/agent_as_a_judge/llm/__init__.py


--------------------------------------------------------------------------------
/agent_as_a_judge/llm/cost.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | 
 5 | class Cost:
 6 | 
 7 |     def __init__(self) -> None:
 8 |         self._accumulated_cost: float = 0.0
 9 |         self._costs: list[float] = []
10 | 
11 |     @property
12 |     def accumulated_cost(self) -> float:
13 |         return self._accumulated_cost
14 | 
15 |     @accumulated_cost.setter
16 |     def accumulated_cost(self, value: float) -> None:
17 |         if value < 0:
18 |             raise ValueError("Total cost cannot be negative.")
19 |         self._accumulated_cost = value
20 | 
21 |     @property
22 |     def costs(self) -> list:
23 |         return self._costs
24 | 
25 |     def add_cost(self, value: float) -> None:
26 |         if value < 0:
27 |             raise ValueError("Added cost cannot be negative.")
28 |         self._accumulated_cost += value
29 |         self._costs.append(value)
30 | 
31 |     def get(self):
32 |         return {"accumulated_cost": self._accumulated_cost, "costs": self._costs}
33 | 
34 |     def log(self):
35 |         cost = self.get()
36 |         logs = ""
37 |         for key, value in cost.items():
38 |             logs += f"{key}: {value}\n"
39 |         return logs
40 | 


--------------------------------------------------------------------------------
/agent_as_a_judge/module/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/agent_as_a_judge/module/__init__.py


--------------------------------------------------------------------------------
/agent_as_a_judge/module/memory.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Memory module to store and retrieve historical judgments.
 3 | """
 4 | 
 5 | import os
 6 | import logging
 7 | import json
 8 | from pathlib import Path
 9 | 
10 | 
11 | class Memory:
12 |     def __init__(self, memory_file: Path = None):
13 | 
14 |         self.judgments = []
15 |         self.memory_file = memory_file
16 | 
17 |     def save_to_file(self):
18 |         if not self.memory_file:
19 |             logging.error("No memory file provided.")
20 |             return
21 | 
22 |         try:
23 |             with open(self.memory_file, "w") as file:
24 |                 json.dump({"judge_stats": self.judgments}, file, indent=4)
25 |                 logging.info(
26 |                     f"Saved {len(self.judgments)} judgments to file '{self.memory_file}'."
27 |                 )
28 |         except Exception as e:
29 |             logging.error(f"Failed to save judgments to file '{self.memory_file}': {e}")
30 | 
31 |     def add_judgment(self, criteria: str, satisfied: bool, reason: list):
32 |         new_judgment = {"criteria": criteria, "satisfied": satisfied, "reason": reason}
33 |         self.judgments.append(new_judgment)
34 |         logging.debug(
35 |             f"Added new judgment for criteria: '{criteria}', Satisfied: {satisfied}"
36 |         )
37 | 
38 |     def get_historical_evidence(self) -> str:
39 | 
40 |         if not os.path.exists(self.memory_file):
41 |             logging.error(f"File '{self.memory_file}' not found.")
42 |             return
43 | 
44 |         with open(self.memory_file, "r") as file:
45 |             data = json.load(file)
46 |             self.judgments = data.get("judge_stats", [])
47 |             logging.info(
48 |                 f"Loaded {len(self.judgments)} judgments from file '{self.memory_file}'."
49 |             )
50 | 
51 |         if not self.judgments:
52 |             logging.warning("No historical judgments available.")
53 |             return "No historical judgments available."
54 | 
55 |         historical_evidence = "\n".join(
56 |             self._format_judgment(i, judgment)
57 |             for i, judgment in enumerate(self.judgments, 1)
58 |         )
59 |         logging.info(f"Retrieved {len(self.judgments)} historical judgments.")
60 |         return historical_evidence
61 | 
62 |     @staticmethod
63 |     def _format_judgment(index: int, judgment: dict) -> str:
64 |         criteria = judgment.get("criteria", "No criteria available")
65 |         satisfied = "Yes" if judgment.get("satisfied") else "No"
66 | 
67 |         llm_stats = judgment.get("llm_stats", {})
68 |         reasons = llm_stats.get("reason", [])
69 | 
70 |         if isinstance(reasons, list):
71 |             formatted_reasons = (
72 |                 "\n      ".join(reasons) if reasons else "No reasoning provided"
73 |             )
74 |         else:
75 |             formatted_reasons = reasons if reasons else "No reasoning provided"
76 | 
77 |         output = (
78 |             f"\n{'-'*50}"
79 |             f"\nRequirement {index}:"
80 |             f"\n{'-'*50}"
81 |             f"\nCriteria   : {criteria}"
82 |             f"\nSatisfied  : {satisfied}"
83 |             f"\nReason     :\n      {formatted_reasons}"
84 |             f"\n{'-'*50}\n"
85 |         )
86 | 
87 |         return output
88 | 


--------------------------------------------------------------------------------
/agent_as_a_judge/module/planning.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import re
 3 | import time
 4 | import logging
 5 | from agent_as_a_judge.llm.provider import LLM
 6 | from dotenv import load_dotenv
 7 | from rich.logging import RichHandler
 8 | from agent_as_a_judge.module.prompt.system_prompt_planning import (
 9 |     get_planning_system_prompt,
10 | )
11 | from agent_as_a_judge.module.prompt.prompt_planning import get_planning_prompt
12 | 
13 | logging.basicConfig(
14 |     level=logging.INFO,
15 |     format="%(asctime)s - %(levelname)s - %(message)s",
16 |     handlers=[RichHandler()],
17 | )
18 | load_dotenv()
19 | 
20 | 
21 | class Planning:
22 |     def __init__(self):
23 |         self.llm = LLM(
24 |             model=os.getenv("DEFAULT_LLM"), api_key=os.getenv("OPENAI_API_KEY")
25 |         )
26 | 
27 |     def generate_plan(self, criteria: str) -> dict:
28 |         system_prompt = get_planning_system_prompt("English")  #
29 |         user_prompt = get_planning_prompt(criteria)
30 | 
31 |         messages = [
32 |             {"role": "system", "content": system_prompt},
33 |             {"role": "user", "content": user_prompt},
34 |         ]
35 | 
36 |         start_time = time.time()
37 |         llm_stats = self._llm_inference(messages)
38 |         llm_stats["inference_time"] = time.time() - start_time
39 |         actions = self.parse_plan(llm_stats["llm_response"])
40 | 
41 |         return {"actions": actions, "llm_stats": llm_stats}
42 | 
43 |     def parse_plan(self, plan: str) -> list:
44 |         actions = []
45 |         action_patterns = {
46 |             "user_query": r"\[User Query\]",
47 |             "workspace": r"\[Workspace\]",
48 |             "locate": r"\[Locate\]",
49 |             "read": r"\[Read\]",
50 |             "search": r"\[Search\]",
51 |             "history": r"\[History\]",
52 |             "trajectory": r"\[Trajectory\]",
53 |         }
54 | 
55 |         for line in plan.splitlines():
56 |             for action, pattern in action_patterns.items():
57 |                 if re.search(pattern, line, re.IGNORECASE):
58 |                     actions.append(action)
59 |                     break
60 | 
61 |         return actions
62 | 
63 |     def _llm_inference(self, messages: list) -> dict:
64 | 
65 |         response, cost, accumulated_cost = self.llm.do_completion(
66 |             messages=messages, temperature=0.0
67 |         )
68 | 
69 |         llm_response = response.choices[0].message["content"]
70 |         input_token = response.usage.prompt_tokens
71 |         output_token = response.usage.completion_tokens
72 | 
73 |         return {
74 |             "llm_response": llm_response,
75 |             "input_tokens": input_token,
76 |             "output_tokens": output_token,
77 |             "cost": cost,
78 |             # "accumulated_cost": accumulated_cost
79 |         }
80 | 


--------------------------------------------------------------------------------
/agent_as_a_judge/module/prompt/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/agent_as_a_judge/module/prompt/__init__.py


--------------------------------------------------------------------------------
/agent_as_a_judge/module/prompt/prompt_ask.py:
--------------------------------------------------------------------------------
 1 | def get_ask_prompt(question: str, evidence: str) -> str:
 2 | 
 3 |     return f"""
 4 | Provided below is relevant information about the project or context:
 5 | {evidence}
 6 | 
 7 | Kindly respond to the following user input:
 8 | {question}
 9 | 
10 | As per the guidelines, provide a comprehensive answer referencing specific elements from the provided information where applicable.
11 |     """
12 | 


--------------------------------------------------------------------------------
/agent_as_a_judge/module/prompt/prompt_judge.py:
--------------------------------------------------------------------------------
 1 | def get_judge_prompt(criteria: str, evidence: str) -> str:
 2 | 
 3 |     return f"""
 4 | Provided below is relevant information about the project:
 5 | {evidence}
 6 | 
 7 | Kindly perform an evaluation of the following criteria:
 8 | {criteria}
 9 | 
10 | As per the guidelines, respond with either <SATISFIED> or <UNSATISFIED>, followed by a concise justification that references specific elements from the project information, such as code snippets, data samples, or output results.
11 |     """
12 | 


--------------------------------------------------------------------------------
/agent_as_a_judge/module/prompt/prompt_locate.py:
--------------------------------------------------------------------------------
 1 | def get_prompt_locate(criteria: str, workspace_info: str) -> str:
 2 | 
 3 |     demonstration = """
 4 | Example:
 5 | Suppose the criteria is:
 6 | 'The database functionality is implemented in `src/db.py`, and the logging system is defined in `src/logging.py`.'
 7 | 
 8 | And the workspace information is:
 9 | /project
10 | ├── src
11 | │   ├── db.py
12 | │   ├── logging.py
13 | │   ├── utils.py
14 | └── tests
15 |     ├── test_db.py
16 |     └── test_logging.py
17 | 
18 | Based on the criteria, the following paths (no more than 5) should be returned, each wrapped in dollar signs (`$`):
19 | $/project/src/db.py$
20 | $/project/src/logging.py$
21 |     """
22 | 
23 |     return f"""
24 | Provided below is the structure of the workspace:
25 | {workspace_info}
26 | 
27 | This is the criteria related to the task:
28 | {criteria}
29 | 
30 | Follow the format in the example below and return only the file paths that match the criteria:
31 | {demonstration}
32 |     """
33 | 


--------------------------------------------------------------------------------
/agent_as_a_judge/module/prompt/prompt_planning.py:
--------------------------------------------------------------------------------
 1 | def get_planning_prompt(criteria: str) -> str:
 2 |     """
 3 |     Returns the LLM prompt to generate a step-by-step plan for evaluating or resolving the given criteria.
 4 |     The prompt includes demonstrations to guide the LLM in creating effective plans without repeating the action descriptions.
 5 |     """
 6 |     return f"""
 7 |     You are tasked with generating a list of actions to evaluate or resolve the following requirement. 
 8 |     Select only the necessary actions and arrange them in a logical order to systematically collect evidence and verify whether the requirement is satisfied.
 9 | 
10 |     Requirement: "{criteria}"
11 | 
12 |     Here are some examples of how to create a plan:
13 | 
14 |     Example 1:
15 |     Requirement: "The system must generate a summary report saved as `output/report.txt`."
16 |     Plan:
17 |     - [Locate]: Locate the `output/report.txt` file in the workspace.
18 |     - [Read]: Read the contents of the `report.txt` file to verify it contains the summary report.
19 |     - [Search]: Search the codebase for any functions or methods responsible for generating `report.txt`.
20 | 
21 |     Example 2:
22 |     Requirement: "The machine learning model must be trained and saved as `results/model.pkl`."
23 |     Plan:
24 |     - [Locate]: Locate `results/model.pkl` in the workspace.
25 |     - [Search]: Search for the model training code in the source files.
26 |     - [Read]: Read the model training code to verify it aligns with the specified requirement.
27 |     - [Trajectory]: Analyze the historical development of the model training process to understand any prior modifications.
28 | 
29 |     Now, generate a step-by-step plan for the following requirement:
30 | 
31 |     Requirement: "{criteria}"
32 | 
33 |     Response:
34 |     """
35 | 


--------------------------------------------------------------------------------
/agent_as_a_judge/module/prompt/prompt_retrieve.py:
--------------------------------------------------------------------------------
 1 | def get_text_retrieve_prompt(criteria: str, long_context: str) -> str:
 2 | 
 3 |     return f"""
 4 |         Below is a log of actions, steps, and file operations:
 5 |         {long_context}
 6 | 
 7 |         Summarize concise evidence directly related to the following criteria:
 8 |         {criteria}
 9 | 
10 |         Focus on the last one or two mentions of relevant files or actions. Since I can check the files locally, omit file existence and content details. Provide a brief analysis of the latest status of relevant files or functions. Exclude irrelevant information.
11 |         """
12 | 


--------------------------------------------------------------------------------
/agent_as_a_judge/module/prompt/system_prompt_ask.py:
--------------------------------------------------------------------------------
 1 | def get_ask_system_prompt(language="English"):
 2 | 
 3 |     if language == "English":
 4 |         return """
 5 | You are a knowledgeable assistant capable of answering user queries clearly and accurately.
 6 | Your goal is to respond to the user input provided, using relevant project information and context where necessary.
 7 |         """
 8 |     else:
 9 |         raise NotImplementedError(f"The language '{language}' is not supported.")
10 | 


--------------------------------------------------------------------------------
/agent_as_a_judge/module/prompt/system_prompt_judge.py:
--------------------------------------------------------------------------------
 1 | def get_judge_system_prompt(language="English"):
 2 | 
 3 |     if language == "English":
 4 |         return """
 5 |         You are an advanced AI system serving as an impartial judge for intelligent code generation outputs. Your primary role is to rigorously evaluate whether the agent's outputs satisfy the specified requirements by thoroughly analyzing the provided code, data, and other relevant materials.
 6 | 
 7 |         You will systematically assess aspects such as datasets, model implementations, training procedures, and any task-specific criteria outlined in the requirements. Your evaluations must be objective, detailed, and based solely on the evidence provided.
 8 | 
 9 |         For each requirement, deliver one of the following judgments:
10 | 
11 |         1. <SATISFIED>: Use this if the agent's output fully meets the requirement. Provide a brief and precise explanation demonstrating how the specific criteria are fulfilled.
12 | 
13 |         2. <UNSATISFIED>: Use this if the agent's output does not meet the requirement. Provide a concise explanation indicating the deficiencies or omissions.
14 | 
15 |         Your assessment should reference specific elements such as code snippets, data samples, or output results where appropriate. Ensure that your justifications are clear, precise, and directly related to the criteria.
16 | 
17 |         Respond with either <SATISFIED> or <UNSATISFIED>, followed by your concise justification.
18 |         """
19 | 
20 |     else:
21 |         raise NotImplementedError(f"The language '{language}' is not supported.")
22 | 


--------------------------------------------------------------------------------
/agent_as_a_judge/module/prompt/system_prompt_locate.py:
--------------------------------------------------------------------------------
 1 | def get_system_prompt_locate(language="English"):
 2 | 
 3 |     if language == "English":
 4 |         return """
 5 | You are an advanced AI system specializing in understanding project structures and determining file locations based on provided criteria.
 6 | Your task is to locate specific files in the workspace based on the user's criteria and workspace information.
 7 |         """
 8 |     else:
 9 |         raise NotImplementedError(f"The language '{language}' is not supported.")
10 | 


--------------------------------------------------------------------------------
/agent_as_a_judge/module/prompt/system_prompt_planning.py:
--------------------------------------------------------------------------------
 1 | def get_planning_system_prompt(language="English"):
 2 | 
 3 |     if language == "English":
 4 |         return """
 5 |         You are an advanced AI system tasked with generating a step-by-step plan to help verify whether a project's outputs meet the specified requirements. 
 6 |         Your goal is to generate a series of actions that systematically gather evidence from various sources, such as code, documentation, history, or data, to assess whether the requirement is fully satisfied.
 7 | 
 8 |         The actions you can choose from are listed below. Select the necessary actions based on the requirement and arrange them in a logical order:
 9 |         
10 |         - [User Query]: Use the user's original query to provide context and understand the requirement.
11 |         - [Workspace]: Analyze the overall workspace structure to understand the project’s components and dependencies.
12 |         - [Locate]: Locate specific files or directories in the workspace that may contain relevant information or code.
13 |         - [Read]: Read and examine the contents of files to verify their correctness and relevance to the requirement.
14 |         - [Search]: Search for relevant code snippets, functions, or variables related to the requirement.
15 |         - [History]: Refer to previous judgments, evaluations, or decisions made in earlier iterations or related projects.
16 |         - [Trajectory]: Analyze the historical development or decision-making trajectory of the project, including previous changes or iterations that impacted the current state.
17 | 
18 |         Your task is to select and order the necessary actions that will systematically collect evidence to allow for a thorough evaluation of the requirement.
19 |         """
20 |     else:
21 |         raise NotImplementedError(f"The language '{language}' is not supported.")
22 | 


--------------------------------------------------------------------------------
/agent_as_a_judge/module/prompt/system_prompt_retrieve.py:
--------------------------------------------------------------------------------
 1 | def get_retrieve_system_prompt(language="English"):
 2 |     if language == "English":
 3 |         return """
 4 | You are an advanced AI system specializing in retrieving environmental feedback from project execution trajectories. Your task is to analyze the provided trajectory data and extract information about the most relevant files mentioned in the given criteria.
 5 | 
 6 | Focus on the following:
 7 | 
 8 | 1. Identify the **most recent steps** where the files directly related to the criteria were involved in execution, loading, or saving operations.
 9 | 2. Provide environmental feedback for these files, such as any errors, warnings, or issues encountered during their execution or processing.
10 | 3. Highlight whether any problems occurred that might affect the functionality or success of these files in the project.
11 | 
12 | Your output should be structured as follows:
13 | 
14 | - **<RELEVANT STEPS>**: List the specific steps involving the relevant files, including any environmental feedback such as error messages, execution results, or other issues encountered. Each step should concisely present the key information needed to assess the files' execution status.
15 | 
16 | Avoid including details about file contents or existence, as this information is already available. Focus solely on the environmental feedback related to the execution of the most relevant files.
17 | 
18 | Your goal is to provide clear and concise information that helps determine if there were any execution problems with the files mentioned in the criteria.
19 |         """
20 |     else:
21 |         raise NotImplementedError(f"The language '{language}' is not supported.")
22 | 


--------------------------------------------------------------------------------
/agent_as_a_judge/module/statistics.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from pathlib import Path
 3 | from typing import List
 4 | from rich.logging import RichHandler
 5 | from agent_as_a_judge.module.graph import DevGraph
 6 | 
 7 | logging.basicConfig(
 8 |     level=logging.INFO,
 9 |     format="%(asctime)s - %(levelname)s - %(message)s",
10 |     handlers=[RichHandler()],
11 | )
12 | 
13 | 
14 | class DevStatistics:
15 | 
16 |     def __init__(self, workspace: Path):
17 |         self.workspace = workspace
18 | 
19 |     def count_lines_of_code(self, filepaths: List[Path]) -> (int, int):
20 | 
21 |         total_lines = 0
22 |         total_files = 0
23 | 
24 |         for filepath in filepaths:
25 |             try:
26 |                 with open(filepath, "r", encoding="utf-8") as f:
27 |                     lines = f.readlines()
28 |                     total_lines += len(lines)
29 |                     total_files += 1
30 |             except Exception as e:
31 |                 logging.warning(f"Failed to process file {filepath}: {e}")
32 | 
33 |         return total_lines, total_files
34 | 
35 |     def calculate_statistics(self):
36 | 
37 |         if self.workspace.exists():
38 |             logging.info(f"Processing workspace: {self.workspace.stem}")
39 | 
40 |             dev_graph = DevGraph(
41 |                 root=str(self.workspace),
42 |                 include_dirs=["src", "results", "models"],
43 |                 exclude_dirs=["__pycache__", "env"],
44 |                 exclude_files=[".DS_Store"],
45 |             )
46 | 
47 |             py_files = dev_graph.list_py_files([self.workspace])
48 |             all_files = dev_graph.list_all_files(self.workspace)
49 |             lines_in_workspace, files_in_workspace = self.count_lines_of_code(py_files)
50 |             total_files_in_workspace = len(all_files)
51 |             total_non_code_files_in_workspace = (
52 |                 total_files_in_workspace - files_in_workspace
53 |             )
54 | 
55 |             logging.info(f"  Total files: {total_files_in_workspace}")
56 |             logging.info(f"  Non-Python files: {total_non_code_files_in_workspace}")
57 |             logging.info(f"  Python files: {files_in_workspace}")
58 |             logging.info(f"  Lines of Python code: {lines_in_workspace}")
59 | 
60 |             return (
61 |                 total_files_in_workspace,
62 |                 total_non_code_files_in_workspace,
63 |                 files_in_workspace,
64 |                 lines_in_workspace,
65 |             )
66 | 
67 |         else:
68 |             logging.warning(
69 |                 f"Workspace '{self.workspace.stem}' does not exist. Skipping..."
70 |             )
71 |             return 0, 0, 0, 0
72 | 


--------------------------------------------------------------------------------
/agent_as_a_judge/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from agent_as_a_judge.utils.truncate import truncate_string
2 | from agent_as_a_judge.utils.count_lines import count_lines_of_code
3 | 
4 | 
5 | __all__ = ["truncate_string", "count_lines_of_code"]
6 | 


--------------------------------------------------------------------------------
/agent_as_a_judge/utils/count_lines.py:
--------------------------------------------------------------------------------
 1 | def count_lines_of_code(filepaths):
 2 | 
 3 |     total_lines = 0
 4 |     total_files = 0
 5 |     for filepath in filepaths:
 6 |         with open(filepath, "r", encoding="utf-8") as f:
 7 |             lines = f.readlines()
 8 |             total_lines += len(lines)
 9 |             total_files += 1
10 |     return total_lines, total_files
11 | 


--------------------------------------------------------------------------------
/agent_as_a_judge/utils/truncate.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import logging
 3 | from typing import Union
 4 | import tiktoken
 5 | from dotenv import load_dotenv
 6 | 
 7 | load_dotenv()
 8 | 
 9 | 
10 | def truncate_string(
11 |     info_string: Union[str, None],
12 |     model: str = os.getenv("DEFAULT_LLM"),
13 |     max_tokens: int = 10000,
14 |     drop_mode="middle",
15 | ) -> str:
16 | 
17 |     if info_string is None:
18 |         logging.warning(
19 |             "Received None input for truncation. Returning an empty string."
20 |         )
21 |         return ""
22 | 
23 |     info_string = str(info_string)
24 |     
25 |     try:
26 |         encoding = tiktoken.encoding_for_model(model)
27 |     except KeyError:
28 |         # Fallback to cl100k_base (used by gpt-4) if model not found
29 |         logging.warning(f"Model {model} not found in tiktoken. Using cl100k_base encoding instead.")
30 |         encoding = tiktoken.get_encoding("cl100k_base")
31 |     
32 |     tokens = encoding.encode(info_string, disallowed_special=())
33 | 
34 |     # If tokens exceed the maximum length, we truncate based on the drop_mode
35 |     if len(tokens) > max_tokens:
36 |         # logging.warning(f"Input string exceeds maximum token limit ({max_tokens}). Truncating using {drop_mode} mode.")
37 |         ellipsis = encoding.encode("...")
38 |         ellipsis_len = len(ellipsis)
39 | 
40 |         if drop_mode == "head":
41 |             tokens = ellipsis + tokens[-(max_tokens - ellipsis_len) :]
42 |         elif drop_mode == "middle":
43 |             head_tokens = (max_tokens - ellipsis_len) // 2
44 |             tail_tokens = max_tokens - head_tokens - ellipsis_len
45 |             tokens = tokens[:head_tokens] + ellipsis + tokens[-tail_tokens:]
46 |         elif drop_mode == "tail":
47 |             tokens = tokens[: (max_tokens - ellipsis_len)] + ellipsis
48 | 
49 |         else:
50 |             raise ValueError(
51 |                 f"Unknown drop_mode: {drop_mode}. Supported modes: 'head', 'middle', 'tail'."
52 |             )
53 | 
54 |     return encoding.decode(tokens)
55 | 


--------------------------------------------------------------------------------
/assets/aaaj_logo_v6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/assets/aaaj_logo_v6.png


--------------------------------------------------------------------------------
/assets/aaaj_logo_v7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/assets/aaaj_logo_v7.png


--------------------------------------------------------------------------------
/assets/dataset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/assets/dataset.png


--------------------------------------------------------------------------------
/assets/demo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/assets/demo.gif


--------------------------------------------------------------------------------
/assets/devai_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/assets/devai_logo.png


--------------------------------------------------------------------------------
/assets/judge_first.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/assets/judge_first.png


--------------------------------------------------------------------------------
/assets/openwiki_1a.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/assets/openwiki_1a.jpeg


--------------------------------------------------------------------------------
/assets/openwiki_1b.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/assets/openwiki_1b.jpeg


--------------------------------------------------------------------------------
/assets/sample.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/assets/sample.jpeg


--------------------------------------------------------------------------------
/benchmark/devai/constraints.json:
--------------------------------------------------------------------------------
1 | {
2 |    "generic": "This is a task that requires you to write, execute, and save source code. You have a hard time limit of 30 minutes to produce your programmatic solution to the given task. This time limit includes execution time. The quality of your solution will be judged based on what you left in the working folder by the time 30 minutes expire. Additionally, the hardware you are running on is unknown, and the presence of a GPU is not guaranteed.",
3 |    "is_training_needed": "Keep the time limit in mind when setting hyperparameters for training.",
4 |    "is_kaggle_api_needed": "You can use the Kaggle API credentials stored in `kaggle.json` in your current working directory."
5 | }
6 | 


--------------------------------------------------------------------------------
/benchmark/devai/instances/02_Maze_Solver_Q_Learning_Gridworld_RL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "02_Maze_Solver_Q_Learning_Gridworld_RL",
 3 |     "query": "Can you help me create a system to solve maze-style Gridworld tasks using the Q-learning algorithm? The system should use numpy to make the core calculations more efficient and matplotlib for visualizations. The Q-learning algorithm should be implemented in `src/train.py`, and the aptly-named Gridworld environment should be implemented in `src/env.py` in such a way that one could specific the grid size and start/end positions when instantiating it. The system needs to record the learning curve during training, tracking episodes and their corresponding returns, and save it as `results/figures/learning_curve.png`. Additionally, I'd like you to visualize and save the paths taken by the agent in each episode in a file called `results/figures/path_changes.gif`, and save the trained model as `models/saved_models/q_learning_model.npy`. It would be great to have some form of real-time feedback during training, like seeing the progress or getting updates on how the model is learning. Also, if you can, please try and write the code in a way that's easy to modify or extend later on.",
 4 |     "tags": [
 5 |         "Reinforcement Learning"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The \"Q-learning\" algorithm is used in `src/train.py`.",
12 |             "category": "Machine Learning Method",
13 |             "satisfied": null
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [],
18 |             "criteria": "The \"Gridworld\" environment is defined in `src/env.py` with the ability for a user to specify a grid size and start/end positions.",
19 |             "category": "Dataset or Environment",
20 |             "satisfied": null
21 |         },
22 |         {
23 |             "requirement_id": 2,
24 |             "prerequisites": [
25 |                 0,
26 |                 1
27 |             ],
28 |             "criteria": "Learning curves are recorded during training, and saved as `results/figures/learning_curve.png`. Episodes and returns are recorded.",
29 |             "category": "Visualization",
30 |             "satisfied": null
31 |         },
32 |         {
33 |             "requirement_id": 3,
34 |             "prerequisites": [
35 |                 0,
36 |                 1,
37 |                 2
38 |             ],
39 |             "criteria": "The learned model is saved as `models/saved_models/q_learning_model.npy`.",
40 |             "category": "Save Trained Model",
41 |             "satisfied": null
42 |         },
43 |         {
44 |             "requirement_id": 4,
45 |             "prerequisites": [
46 |                 0,
47 |                 1
48 |             ],
49 |             "criteria": "Paths taken during learning are visualized and saved as `results/figures/path_changes.gif`.",
50 |             "category": "Visualization",
51 |             "satisfied": null
52 |         }
53 |     ],
54 |     "preferences": [
55 |         {
56 |             "preference_id": 0,
57 |             "criteria": "Some real-time progress or feedback during the training process should be displayed.",
58 |             "satisfied": null
59 |         },
60 |         {
61 |             "preference_id": 1,
62 |             "criteria": "The code should be written in a way that's easy to modify or extend later on.",
63 |             "satisfied": null
64 |         }
65 |     ],
66 |     "is_kaggle_api_needed": false,
67 |     "is_training_needed": true,
68 |     "is_web_navigation_needed": false
69 | }


--------------------------------------------------------------------------------
/benchmark/devai/instances/03_Text_Classification_NaiveBayes_20Newsgroups_ML.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "03_Text_Classification_NaiveBayes_20Newsgroups_ML",
 3 |     "query": "Please implement a Naive Bayes classifier for the 20 Newsgroups dataset and save it in a file called `src/model.py`. The dataset should loaded in `src/data_loader.py`. The program should handle data preprocessing, including removing stop words, punctuation, and special characters. Show the improvement of your classifier by generating word clouds before and after training your classifier and saving them as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`. Please calculate and include TF-IDF features when loading the data in `src/data_loader.py`. Lastly, print out a performance report (including precision, recall, and F1-score) and save it as `results/metrics/performance.txt`. The model should be straightforward to interpret, and the final report should be structured clearly for easy review.",
 4 |     "tags": [
 5 |         "Classification",
 6 |         "Natural Language Processing",
 7 |         "Supervised Learning"
 8 |     ],
 9 |     "requirements": [
10 |         {
11 |             "requirement_id": 0,
12 |             "prerequisites": [],
13 |             "criteria": "The \"20 Newsgroups\" dataset is used in `src/data_loader.py`.",
14 |             "category": "Dataset or Environment",
15 |             "satisfied": null
16 |         },
17 |         {
18 |             "requirement_id": 1,
19 |             "prerequisites": [
20 |                 0
21 |             ],
22 |             "criteria": "Data preprocessing is performed, including removing stop words, punctuation, and special characters. Word clouds are visualized before and after training the classifier, and saved as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`.",
23 |             "category": "Data preprocessing and postprocessing",
24 |             "satisfied": null
25 |         },
26 |         {
27 |             "requirement_id": 2,
28 |             "prerequisites": [
29 |                 0,
30 |                 1
31 |             ],
32 |             "criteria": "\"TF-IDF\" features are used when loading the data in `src/data_loader.py`.",
33 |             "category": "Data preprocessing and postprocessing",
34 |             "satisfied": null
35 |         },
36 |         {
37 |             "requirement_id": 3,
38 |             "prerequisites": [],
39 |             "criteria": "A \"Naive Bayes classifier\" is implemented in `src/model.py`.",
40 |             "category": "Machine Learning Method",
41 |             "satisfied": null
42 |         },
43 |         {
44 |             "requirement_id": 4,
45 |             "prerequisites": [
46 |                 0,
47 |                 1,
48 |                 2
49 |             ],
50 |             "criteria": "A performance report, including \"precision,\" \"recall,\" and the \"F1-score,\" is printed and saved as `results/metrics/performance.txt`.",
51 |             "category": "Performance Metrics",
52 |             "satisfied": null
53 |         }
54 |     ],
55 |     "preferences": [
56 |         {
57 |             "preference_id": 0,
58 |             "criteria": "The model should be straightforward to interpret.",
59 |             "satisfied": null
60 |         },
61 |         {
62 |             "preference_id": 1,
63 |             "criteria": "The final report should be structured clearly for easy review.",
64 |             "satisfied": null
65 |         }
66 |     ],
67 |     "is_kaggle_api_needed": false,
68 |     "is_training_needed": true,
69 |     "is_web_navigation_needed": false,
70 |     "hint": "In the query, there is a missing word \"be\" after the word \"should\" in \"The dataset should loaded in `src/data_loader.py`\"."
71 | }


--------------------------------------------------------------------------------
/benchmark/devai/instances/05_Game_Simulation_DQN_CartPole_v1_RL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "05_Game_Simulation_DQN_CartPole_v1_RL",
 3 |     "query": "Create a reliable system to train a DQN agent using PyTorch in an OpenAI Gym environment such as CartPole-v1. Implement DQN using PyTorch in `src/model.py`. The environment should be instantiated in the main file, `src/main.py`. Handle any dimension mismatches between DQN and the environment (like would happen if the Gym environment ID were changed) by logging an error. Also please log the return after each episode to `results/metrics/` and update it after each new episode finishes. Save the final return over episode curve as `results/figures/return_over_time.png` and the final trained model as `models/saved_models/dqn_model.pt`. If possible, add annotations to the return over episode curve to showcase key milestones.",
 4 |     "tags": [
 5 |         "Reinforcement Learning"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The \"DQN\" algorithm is implemented using PyTorch and saved in `src/model.py`.",
12 |             "category": "Machine Learning Method",
13 |             "satisfied": null
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [],
18 |             "criteria": "An \"OpenAI Gym\" environment is instantiated in `src/main.py`.",
19 |             "category": "Dataset or Environment",
20 |             "satisfied": null
21 |         },
22 |         {
23 |             "requirement_id": 2,
24 |             "prerequisites": [
25 |                 0,
26 |                 1
27 |             ],
28 |             "criteria": "The return over time curve is plotted, recording the return of each episode, and saved as `results/figures/reward_over_time.png`.",
29 |             "category": "Visualization",
30 |             "satisfied": null
31 |         },
32 |         {
33 |             "requirement_id": 3,
34 |             "prerequisites": [
35 |                 0,
36 |                 1
37 |             ],
38 |             "criteria": "The trained model is saved as `models/saved_models/dqn_model.pt`.",
39 |             "category": "Save Trained Model",
40 |             "satisfied": null
41 |         },
42 |         {
43 |             "requirement_id": 4,
44 |             "prerequisites": [
45 |                 0,
46 |                 1
47 |             ],
48 |             "criteria": "The return for each episode is logged to `results/metrics/` and updated after each episode finishes.",
49 |             "category": "Performance Metrics",
50 |             "satisfied": null
51 |         }
52 |     ],
53 |     "preferences": [
54 |         {
55 |             "preference_id": 0,
56 |             "criteria": "The system should handle dimension mismatches, logging the issues for easy debugging.",
57 |             "satisfied": null
58 |         },
59 |         {
60 |             "preference_id": 1,
61 |             "criteria": "The return over episode curve has key milestones annotated.",
62 |             "satisfied": null
63 |         }
64 |     ],
65 |     "is_kaggle_api_needed": false,
66 |     "is_training_needed": true,
67 |     "is_web_navigation_needed": false
68 | }


--------------------------------------------------------------------------------
/benchmark/devai/instances/06_Sentiment_Analysis_SVM_Sentiment140_ML.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "06_Sentiment_Analysis_SVM_Sentiment140_ML",
 3 |     "query": "Please help me build a system for sentiment analysis on tweets using the Sentiment140 dataset available from Hugging Face. Load the Sentiment140 dataset and, when loading the data, clean it by removing stop words, punctuation, and special characters, all in `src/data_loader.py`. Use Word2Vec or GloVe for text vectorization. This should occur in `src/data_loader.py`. Next, implement and train an SVM classifier in `src/model.py`. Finally, write a report of the accuracy of the classifier to `results/metrics/accuracy_score.txt`. Ideally, the report should be easily interpretable.",
 4 |     "tags": [
 5 |         "Natural Language Processing",
 6 |         "Supervised Learning"
 7 |     ],
 8 |     "requirements": [
 9 |         {
10 |             "requirement_id": 0,
11 |             "prerequisites": [],
12 |             "criteria": "The \"Sentiment140\" dataset, available from \"Hugging Face,\" is obtained in `src/data_loader.py`.",
13 |             "category": "Dataset or Environment",
14 |             "satisfied": null
15 |         },
16 |         {
17 |             "requirement_id": 1,
18 |             "prerequisites": [
19 |                 0
20 |             ],
21 |             "criteria": "The dataset is cleaned, including by removing stop words, punctuation, and special characters, all in `src/data_loader.py`.",
22 |             "category": "Data preprocessing and postprocessing",
23 |             "satisfied": null
24 |         },
25 |         {
26 |             "requirement_id": 2,
27 |             "prerequisites": [
28 |                 0,
29 |                 1
30 |             ],
31 |             "criteria": "Word embeddings, either \"Word2Vec\" or \"GloVe,\" are used to convert text to vectors in `src/data_loader.py`.",
32 |             "category": "Machine Learning Method",
33 |             "satisfied": null
34 |         },
35 |         {
36 |             "requirement_id": 3,
37 |             "prerequisites": [
38 |                 0,
39 |                 1,
40 |                 2
41 |             ],
42 |             "criteria": "An \"SVM classifier\" is implemented and trained in `src/model.py`.",
43 |             "category": "Machine Learning Method",
44 |             "satisfied": null
45 |         },
46 |         {
47 |             "requirement_id": 4,
48 |             "prerequisites": [
49 |                 1,
50 |                 2,
51 |                 3
52 |             ],
53 |             "criteria": "The accuracy score is printed and saved as `results/metrics/accuracy_score.txt`.",
54 |             "category": "Performance Metrics",
55 |             "satisfied": null
56 |         }
57 |     ],
58 |     "preferences": [
59 |         {
60 |             "preference_id": 0,
61 |             "criteria": "The dataset download process should be reliable, with clear error handling.",
62 |             "satisfied": null
63 |         },
64 |         {
65 |             "preference_id": 1,
66 |             "criteria": "The final accuracy report should be straightforward and easy to interpret.",
67 |             "satisfied": null
68 |         }
69 |     ],
70 |     "is_kaggle_api_needed": false,
71 |     "is_training_needed": true,
72 |     "is_web_navigation_needed": false
73 | }


--------------------------------------------------------------------------------
/benchmark/devai/instances/07_Image_Super_Resolution_SRCNN_Set5_DL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "07_Image_Super_Resolution_SRCNN_Set5_DL",
 3 |     "query": "Hi, I need to create a project for image super-resolution using the SRCNN model with the Set5 dataset (available from `https://huggingface.co/datasets/eugenesiow/Set5`). Load the dataset in `src/data_loader.py`. When loading the data, include image preprocessing steps such as resizing and normalization, all in `src/data_loader.py`. The SRCNN model should be loaded and used in `src/model.py`. Save 5 sets of comparison images, zooming in on details, as `results/figures/super_resolution_compare.png`, and the super-resolution results as `results/figures/super_resolution_results.png`.  The generated images should be high-quality and clearly show improvements.",
 4 |     "tags": [
 5 |         "Computer Vision",
 6 |         "Generative Models"
 7 |     ],
 8 |     "requirements": [
 9 |         {
10 |             "requirement_id": 0,
11 |             "prerequisites": [],
12 |             "criteria": "The \"Set5\" dataset (available from \"Hugging Face\") is loaded in `src/data_loader.py`.",
13 |             "category": "Dataset or Environment",
14 |             "satisfied": null
15 |         },
16 |         {
17 |             "requirement_id": 1,
18 |             "prerequisites": [
19 |                 0
20 |             ],
21 |             "criteria": "Image preprocessing, including resizing and normalization, is performed in `src/data_loader.py`.",
22 |             "category": "Data preprocessing and postprocessing",
23 |             "satisfied": null
24 |         },
25 |         {
26 |             "requirement_id": 2,
27 |             "prerequisites": [],
28 |             "criteria": "The \"SRCNN\" model is used in `src/model.py`.",
29 |             "category": "Machine Learning Method",
30 |             "satisfied": null
31 |         },
32 |         {
33 |             "requirement_id": 3,
34 |             "prerequisites": [
35 |                 0,
36 |                 1,
37 |                 2
38 |             ],
39 |             "criteria": "Five sets of comparison images are saved, with details zoomed in, and saved as `results/figures/super_resolution_compare.png`.",
40 |             "category": "Visualization",
41 |             "satisfied": null
42 |         },
43 |         {
44 |             "requirement_id": 4,
45 |             "prerequisites": [
46 |                 0,
47 |                 1,
48 |                 2
49 |             ],
50 |             "criteria": "Super-resolution results are saved as `results/figures/super_resolution_results.png`.",
51 |             "category": "Visualization",
52 |             "satisfied": null
53 |         }
54 |     ],
55 |     "preferences": [
56 |         {
57 |             "preference_id": 0,
58 |             "criteria": "The project should generate high-quality, clear super-resolution images with detailed comparisons.",
59 |             "satisfied": null
60 |         },
61 |         {
62 |             "preference_id": 1,
63 |             "criteria": "Well-organized output images, highlighting key improvements, should be included.",
64 |             "satisfied": null
65 |         }
66 |     ],
67 |     "is_kaggle_api_needed": false,
68 |     "is_training_needed": true,
69 |     "is_web_navigation_needed": true
70 | }


--------------------------------------------------------------------------------
/benchmark/devai/instances/08_Robot_Control_PPO_PyBullet_RL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "08_Robot_Control_PPO_PyBullet_RL",
 3 |     "query": "I am seeking to implement a project which explores robotic arm control via reinforcement learning in the PyBullet simulation environment with the PPO algorithm. The PyBullet simulator should be imported and a related robotics environment should be loaded in `src/env.py`. The PPO algorithm should be implemented in `src/train.py`. The project should meticulously document the robot's final position, printing and saving it as `data/final_position.txt`. The training return trajectory should be graphed and saved as `results/figures/training_returns.png`. A sample of the robot's motion should be visualized and saved as `results/figures/robot_motion.gif`. A detailed environment setup and reward structure description should be provided in `src/env.py`. Please ensure that any issues with loading URDF files in PyBullet are clearly handled and documented, providing clear error messages or logging for debugging.",
 4 |     "tags": [
 5 |         "Reinforcement Learning"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The \"PyBullet\" simulator is used in `src/env.py`.",
12 |             "category": "Dataset or Environment",
13 |             "satisfied": null
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [],
18 |             "criteria": "The \"PPO\" algorithm is used in `src/train.py`.",
19 |             "category": "Machine Learning Method",
20 |             "satisfied": null
21 |         },
22 |         {
23 |             "requirement_id": 2,
24 |             "prerequisites": [
25 |                 0
26 |             ],
27 |             "criteria": "A detailed environment setup and reward structure description is provided in `src/env.py`.",
28 |             "category": "Dataset or Environment",
29 |             "satisfied": null
30 |         },
31 |         {
32 |             "requirement_id": 3,
33 |             "prerequisites": [
34 |                 0,
35 |                 1,
36 |                 2
37 |             ],
38 |             "criteria": "The robot's final position is printed and saved as `data/final_position.txt`.",
39 |             "category": "Other",
40 |             "satisfied": null
41 |         },
42 |         {
43 |             "requirement_id": 4,
44 |             "prerequisites": [
45 |                 0,
46 |                 1,
47 |                 2
48 |             ],
49 |             "criteria": "The training returns over time curve is recorded and saved as `results/figures/training_returns.png`.",
50 |             "category": "Visualization",
51 |             "satisfied": null
52 |         },
53 |         {
54 |             "requirement_id": 5,
55 |             "prerequisites": [
56 |                 0,
57 |                 1,
58 |                 2
59 |             ],
60 |             "criteria": "A sample of the robot's motion is visualized and saved as `results/figures/robot_motion.gif`.",
61 |             "category": "Visualization",
62 |             "satisfied": null
63 |         }
64 |     ],
65 |     "preferences": [
66 |         {
67 |             "preference_id": 0,
68 |             "criteria": "The system should effectively handle potential issues with loading URDF files in PyBullet, providing clear error messages or logging for debugging.",
69 |             "satisfied": null
70 |         }
71 |     ],
72 |     "is_kaggle_api_needed": false,
73 |     "is_training_needed": true,
74 |     "is_web_navigation_needed": false
75 | }


--------------------------------------------------------------------------------
/benchmark/devai/instances/51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images",
 3 |     "query": "Hi! Please follow the instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) to set up the script mentioned for generating images with hidden text in `src/visualize.py`. Ensure the generated images are of 1080p resolution and saved in `results/figures/`. Create control images embedding the text \"FUTURE\" and save it in `results/figures/`. Please also manually verify that the hidden text is indeed embedded in the generated images.",
 4 |     "tags": [
 5 |         "Computer Vision",
 6 |         "Generative Models",
 7 |         "Natural Language Processing"
 8 |     ],
 9 |     "requirements": [
10 |         {
11 |             "requirement_id": 0,
12 |             "prerequisites": [],
13 |             "criteria": "The instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) are followed to set up the script mentioned for generating images with hidden text in `src/visualize.py`.",
14 |             "category": "Dataset or Environment",
15 |             "satisfied": null
16 |         },
17 |         {
18 |             "requirement_id": 1,
19 |             "prerequisites": [
20 |                 0
21 |             ],
22 |             "criteria": "The generated images are ensured to be of \"1080p\" resolution and saved in `results/figures/`.",
23 |             "category": "Data preprocessing and postprocessing",
24 |             "satisfied": null
25 |         },
26 |         {
27 |             "requirement_id": 2,
28 |             "prerequisites": [
29 |                 0,
30 |                 1
31 |             ],
32 |             "criteria": "Control images embedding the text \"FUTURE,\" is created and saved in `results/figures/`.",
33 |             "category": "Visualization",
34 |             "satisfied": null
35 |         }
36 |     ],
37 |     "preferences": [
38 |         {
39 |             "preference_id": 0,
40 |             "criteria": "The system should be capable of learning and using unfamiliar technologies, adapting to new tools or platforms as required.",
41 |             "satisfied": null
42 |         },
43 |         {
44 |             "preference_id": 1,
45 |             "criteria": "After reviewing the blog post, ControlNet should be successfully run on Modal to produce the images with the concealed messages for user.",
46 |             "satisfied": null
47 |         }
48 |     ],
49 |     "is_kaggle_api_needed": false,
50 |     "is_training_needed": false,
51 |     "is_web_navigation_needed": true,
52 |     "resource": "https://www.cognition.ai/blog/introducing-devin"
53 | }


--------------------------------------------------------------------------------
/benchmark/devai/instances/52_Devin_AI_Trains_an_AI.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "52_Devin_AI_Trains_an_AI",
 3 |     "query": "Can you finetune a 7B LLaMA model using `https://github.com/artidoro/qlora`? Follow the instructions in the repository to finetune the 7B LLaMA model and save it in models/saved_models/. Ensure the necessary environment and dependencies are set up as outlined in `src/env.py`. Download and prepare the datasets required for finetuning the model as specified in `src/data_loader.py`. Complete the finetuning process, ensuring all configurations are properly set in accordance with qlora. Save the finetuned model and training summary, storing them in the specified directory as results/metrics/finetuning_summary.txt.",
 4 |     "tags": [
 5 |         "Generative Models",
 6 |         "Natural Language Processing"
 7 |     ],
 8 |     "requirements": [
 9 |         {
10 |             "requirement_id": 0,
11 |             "prerequisites": [],
12 |             "criteria": "The repository at `https://github.com/artidoro/qlora` has been downloaded.",
13 |             "category": "Machine Learning Method",
14 |             "satisfied": null
15 |         },
16 |         {
17 |             "requirement_id": 1,
18 |             "prerequisites": [
19 |                 0
20 |             ],
21 |             "criteria": "The necessary environment and dependencies are set up.",
22 |             "category": "Dataset or Environment",
23 |             "satisfied": null
24 |         },
25 |         {
26 |             "requirement_id": 2,
27 |             "prerequisites": [
28 |                 0,
29 |                 1
30 |             ],
31 |             "criteria": "The finetuning process is completed, ensuring all configurations are properly set in accordance with \"qlora.\"",
32 |             "category": "Machine Learning Method",
33 |             "satisfied": null
34 |         },
35 |         {
36 |             "requirement_id": 3,
37 |             "prerequisites": [
38 |                 0,
39 |                 1,
40 |                 2
41 |             ],
42 |             "criteria": "The finetuned model and training summary are saved in `models/saved_models/`, storing them in the specified directory as `results/metrics/finetuning_summary.txt`.",
43 |             "category": "Save Trained Model",
44 |             "satisfied": null
45 |         }
46 |     ],
47 |     "preferences": [
48 |         {
49 |             "preference_id": 0,
50 |             "criteria": "The finetuning process should include validation steps to monitor overfitting or other issues.",
51 |             "satisfied": null
52 |         },
53 |         {
54 |             "preference_id": 1,
55 |             "criteria": "A detailed report on the finetuning process, including any challenges faced and how they were overcome, should be generated and saved as `results/finetuning_summary.txt`.",
56 |             "satisfied": null
57 |         }
58 |     ],
59 |     "is_kaggle_api_needed": false,
60 |     "is_training_needed": true,
61 |     "is_web_navigation_needed": true,
62 |     "resource": "https://www.cognition.ai/blog/introducing-devin"
63 | }


--------------------------------------------------------------------------------
/benchmark/devai/instances/53_Devin_Upwork_Side_Hustle.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "53_Devin_Upwork_Side_Hustle",
 3 |     "query": "Hello, I am looking to make inferences with the models in this repository `https://github.com/mahdi65/roadDamageDetection2020`. The system should perform inferences using the models from the repository and save the results in `models/saved_models/`. Sample data should be downloaded and prepared for testing the models in `src/data_loader.py`. Inference should be performed using the provided models on the sample data in `models/saved_models/`. Visualized images showing the detections made by the models should be generated and saved in the `results/figures/` directory. Also, a performance report based on the model's detection results should be generated and saved as `results/metrics/model_performance_report.txt`.",
 4 |     "tags": [
 5 |         "Computer Vision"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The repository at `https://github.com/mahdi65/roadDamageDetection2020` is set up.",
12 |             "category": "Machine Learning Method",
13 |             "satisfied": null
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [
18 |                 0
19 |             ],
20 |             "criteria": "Sample data is downloaded and prepared for testing the models in `src/data_loader.py`.",
21 |             "category": "Dataset or Environment",
22 |             "satisfied": null
23 |         },
24 |         {
25 |             "requirement_id": 2,
26 |             "prerequisites": [
27 |                 0,
28 |                 1
29 |             ],
30 |             "criteria": "Inference is performed using the provided models on the sample data in `models/saved_models/`.",
31 |             "category": "Other",
32 |             "satisfied": null
33 |         },
34 |         {
35 |             "requirement_id": 3,
36 |             "prerequisites": [
37 |                 0,
38 |                 1,
39 |                 2
40 |             ],
41 |             "criteria": "Visualized images showing the detections made by the models are generated and saved in the `results/figures/` directory.",
42 |             "category": "Visualization",
43 |             "satisfied": null
44 |         },
45 |         {
46 |             "requirement_id": 4,
47 |             "prerequisites": [
48 |                 0,
49 |                 1,
50 |                 2,
51 |                 3
52 |             ],
53 |             "criteria": "A performance report based on the model's detection results is generated and saved as `results/metrics/model_performance_report.txt`.",
54 |             "category": "Performance Metrics",
55 |             "satisfied": null
56 |         }
57 |     ],
58 |     "preferences": [
59 |         {
60 |             "preference_id": 0,
61 |             "criteria": "The visualized images should be clear, with detections accurately highlighted for easy interpretation.",
62 |             "satisfied": null
63 |         },
64 |         {
65 |             "preference_id": 1,
66 |             "criteria": "The performance report should include a summary of detection accuracy and any issues encountered during inference.",
67 |             "satisfied": null
68 |         }
69 |     ],
70 |     "is_kaggle_api_needed": false,
71 |     "is_training_needed": false,
72 |     "is_web_navigation_needed": true,
73 |     "resource": "https://www.cognition.ai/blog/introducing-devin"
74 | }


--------------------------------------------------------------------------------
/benchmark/devai/trajectory-schema.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "type": "array",
 3 |   "items": {
 4 |     "type": "object",
 5 |     "properties": {
 6 |       "step": {
 7 |         "type": "integer",
 8 |         "description": "The step number in the trajectory, 0-based."
 9 |       },
10 |       "user_message": {
11 |         "type": ["string", "null"],
12 |         "description": "The message from the external user to the agent. If null, no message was sent."
13 |       },
14 |       "agent": {
15 |         "type": "object",
16 |         "properties": {
17 |           "thought": {
18 |             "type": "string",
19 |             "description": "The agent's thought at this step."
20 |           },
21 |           "action": {
22 |             "type": ["string", "null"],
23 |             "description": "The agent's action sent to the environment. If null, the agent did not take any action, for example, when the agent has finished the task."
24 |           },
25 |           "agent_name": {
26 |             "type": "string",
27 |             "description": "The name of the agent that made the action."
28 |           }
29 |         },
30 |         "required": ["thought", "action"],
31 |         "description": "Everything related to the agent at this step."
32 |       },
33 |       "environment": {
34 |         "type": ["string", "null"],
35 |         "description": "The environment's (shell, python interpreter) response to the action submitted by the agent. If null, the environment was not involved in this step."
36 |       },
37 |       "step_usage": {
38 |         "type": "object",
39 |         "properties": {
40 |           "input_tokens": {
41 |             "type": "integer",
42 |             "description": "The number of input tokens passed as LLM context."
43 |           },
44 |           "output_tokens": {
45 |             "type": "integer",
46 |             "description": "The number of tokens produced by the LLM."
47 |           },
48 |           "model": {
49 |             "type": "string",
50 |             "description": "The name of the LLM model used."
51 |           },
52 |           "cost": {
53 |             "type": "number",
54 |             "description": "The cost of the LLM inference, in USD."
55 |           },
56 |           "llm_inference_time": {
57 |             "type": "number",
58 |             "description": "The time taken by the LLM to generate the output tokens, in seconds."
59 |           },
60 |           "step_execution_time": {
61 |             "type": "number",
62 |             "description": "The time taken to make an entire step including LLM inference and environment execution, in seconds."
63 |           }
64 |         },
65 |         "required": [
66 |           "input_tokens",
67 |           "output_tokens",
68 |           "model",
69 |           "cost",
70 |           "llm_inference_time",
71 |           "step_execution_time"
72 |         ]
73 |       },
74 |       "accumulated_usage": {
75 |         "type": "object",
76 |         "properties": {
77 |           "accumulated_cost": {
78 |             "type": "number",
79 |             "description": "The total cost of the trajectory up to this step, in USD."
80 |           },
81 |           "accumulated_time": {
82 |             "type": "number",
83 |             "description": "The total time taken by the agent to complete the trajectory up to this step, in seconds."
84 |           }
85 |         },
86 |         "required": [
87 |           "accumulated_cost",
88 |           "accumulated_time"
89 |         ]
90 |       }
91 |     },
92 |     "required": ["step", "user_message", "agent", "environment", "step_usage", "accumulated_usage"]
93 |   }
94 | }
95 | 


--------------------------------------------------------------------------------
/benchmark/devai/validate_trajectory.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | import argparse
 4 | import jsonschema
 5 | import jsonschema.exceptions
 6 | 
 7 | 
 8 | def validate_trajectory_data(json_data, json_schema) -> bool:
 9 |     try:
10 |         jsonschema.validate(instance=json_data, schema=json_schema)
11 |         print("JSON is valid")
12 |     except jsonschema.exceptions.ValidationError as err:
13 |         print("JSON is invalid")
14 |         print(err)
15 |         return False
16 |     return True
17 | 
18 | 
19 | def main():
20 |     parser = argparse.ArgumentParser(description='Process a trajectory JSON file.')
21 | 
22 |     parser.add_argument('trajectory_json', metavar='--trajectory-json', type=str,
23 |                         help='Path to the trajectory JSON file')
24 | 
25 |     args = parser.parse_args()
26 | 
27 |     this_file_dir = os.path.dirname(os.path.realpath(__file__))
28 |     schema_path = os.path.join(this_file_dir, 'trajectory-schema.json')
29 | 
30 |     with open(schema_path, 'r') as schema_file:
31 |         json_schema = json.load(schema_file)
32 | 
33 |     with open(args.trajectory_json, 'r') as json_file:
34 |         json_data = json.load(json_file)
35 | 
36 |     return int(validate_trajectory_data(json_data, json_schema))
37 | 
38 | 
39 | if __name__ == '__main__':
40 |     main()
41 | 


--------------------------------------------------------------------------------
/benchmark/judgment/GPT-Pilot/agent_as_a_judge/gray_box/02_Maze_Solver_Q_Learning_Gridworld_RL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "02_Maze_Solver_Q_Learning_Gridworld_RL",
 3 |     "query": "Can you help me create a system to solve maze-style Gridworld tasks using the Q-learning algorithm? The system should use numpy to make the core calculations more efficient and matplotlib for visualizations. The Q-learning algorithm should be implemented in `src/train.py`, and the aptly-named Gridworld environment should be implemented in `src/env.py` in such a way that one could specific the grid size and start/end positions when instantiating it. The system needs to record the learning curve during training, tracking episodes and their corresponding returns, and save it as `results/figures/learning_curve.png`. Additionally, I'd like you to visualize and save the paths taken by the agent in each episode in a file called `results/figures/path_changes.gif`, and save the trained model as `models/saved_models/q_learning_model.npy`. It would be great to have some form of real-time feedback during training, like seeing the progress or getting updates on how the model is learning. Also, if you can, please try and write the code in a way that's easy to modify or extend later on.",
 4 |     "tags": [
 5 |         "Reinforcement Learning"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The \"Q-learning\" algorithm is used in `src/train.py`.",
12 |             "category": "Machine Learning Method",
13 |             "satisfied": true
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [],
18 |             "criteria": "The \"Gridworld\" environment is defined in `src/env.py` with the ability for a user to specify a grid size and start/end positions.",
19 |             "category": "Dataset or Environment",
20 |             "satisfied": true
21 |         },
22 |         {
23 |             "requirement_id": 2,
24 |             "prerequisites": [
25 |                 0,
26 |                 1
27 |             ],
28 |             "criteria": "Learning curves are recorded during training, and saved as `results/figures/learning_curve.png`. Episodes and returns are recorded.",
29 |             "category": "Visualization",
30 |             "satisfied": true
31 |         },
32 |         {
33 |             "requirement_id": 3,
34 |             "prerequisites": [
35 |                 0,
36 |                 1,
37 |                 2
38 |             ],
39 |             "criteria": "The learned model is saved as `models/saved_models/q_learning_model.npy`.",
40 |             "category": "Save Trained Model",
41 |             "satisfied": true
42 |         },
43 |         {
44 |             "requirement_id": 4,
45 |             "prerequisites": [
46 |                 0,
47 |                 1
48 |             ],
49 |             "criteria": "Paths taken during learning are visualized and saved as `results/figures/path_changes.gif`.",
50 |             "category": "Visualization",
51 |             "satisfied": true
52 |         }
53 |     ],
54 |     "preferences": [
55 |         {
56 |             "preference_id": 0,
57 |             "criteria": "Some real-time progress or feedback during the training process should be displayed.",
58 |             "satisfied": null
59 |         },
60 |         {
61 |             "preference_id": 1,
62 |             "criteria": "The code should be written in a way that's easy to modify or extend later on.",
63 |             "satisfied": null
64 |         }
65 |     ],
66 |     "is_kaggle_api_needed": false,
67 |     "is_training_needed": true,
68 |     "is_web_navigation_needed": false
69 | }


--------------------------------------------------------------------------------
/benchmark/judgment/GPT-Pilot/agent_as_a_judge/gray_box/03_Text_Classification_NaiveBayes_20Newsgroups_ML.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "03_Text_Classification_NaiveBayes_20Newsgroups_ML",
 3 |     "query": "Please implement a Naive Bayes classifier for the 20 Newsgroups dataset and save it in a file called `src/model.py`. The dataset should loaded in `src/data_loader.py`. The program should handle data preprocessing, including removing stop words, punctuation, and special characters. Show the improvement of your classifier by generating word clouds before and after training your classifier and saving them as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`. Please calculate and include TF-IDF features when loading the data in `src/data_loader.py`. Lastly, print out a performance report (including precision, recall, and F1-score) and save it as `results/metrics/performance.txt`. The model should be straightforward to interpret, and the final report should be structured clearly for easy review.",
 4 |     "tags": [
 5 |         "Classification",
 6 |         "Natural Language Processing",
 7 |         "Supervised Learning"
 8 |     ],
 9 |     "requirements": [
10 |         {
11 |             "requirement_id": 0,
12 |             "prerequisites": [],
13 |             "criteria": "The \"20 Newsgroups\" dataset is used in `src/data_loader.py`.",
14 |             "category": "Dataset or Environment",
15 |             "satisfied": true
16 |         },
17 |         {
18 |             "requirement_id": 1,
19 |             "prerequisites": [
20 |                 0
21 |             ],
22 |             "criteria": "Data preprocessing is performed, including removing stop words, punctuation, and special characters. Word clouds are visualized before and after training the classifier, and saved as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`.",
23 |             "category": "Data preprocessing and postprocessing",
24 |             "satisfied": false
25 |         },
26 |         {
27 |             "requirement_id": 2,
28 |             "prerequisites": [
29 |                 0,
30 |                 1
31 |             ],
32 |             "criteria": "\"TF-IDF\" features are used when loading the data in `src/data_loader.py`.",
33 |             "category": "Data preprocessing and postprocessing",
34 |             "satisfied": true
35 |         },
36 |         {
37 |             "requirement_id": 3,
38 |             "prerequisites": [],
39 |             "criteria": "A \"Naive Bayes classifier\" is implemented in `src/model.py`.",
40 |             "category": "Machine Learning Method",
41 |             "satisfied": true
42 |         },
43 |         {
44 |             "requirement_id": 4,
45 |             "prerequisites": [
46 |                 0,
47 |                 1,
48 |                 2
49 |             ],
50 |             "criteria": "A performance report, including \"precision,\" \"recall,\" and the \"F1-score,\" is printed and saved as `results/metrics/performance.txt`.",
51 |             "category": "Performance Metrics",
52 |             "satisfied": true
53 |         }
54 |     ],
55 |     "preferences": [
56 |         {
57 |             "preference_id": 0,
58 |             "criteria": "The model should be straightforward to interpret.",
59 |             "satisfied": null
60 |         },
61 |         {
62 |             "preference_id": 1,
63 |             "criteria": "The final report should be structured clearly for easy review.",
64 |             "satisfied": null
65 |         }
66 |     ],
67 |     "is_kaggle_api_needed": false,
68 |     "is_training_needed": true,
69 |     "is_web_navigation_needed": false
70 | }


--------------------------------------------------------------------------------
/benchmark/judgment/GPT-Pilot/agent_as_a_judge/gray_box/05_Game_Simulation_DQN_CartPole_v1_RL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "05_Game_Simulation_DQN_CartPole_v1_RL",
 3 |     "query": "Create a reliable system to train a DQN agent using PyTorch in an OpenAI Gym environment such as CartPole-v1. Implement DQN using PyTorch in `src/model.py`. The environment should be instantiated in the main file, `src/main.py`. Handle any dimension mismatches between DQN and the environment (like would happen if the Gym environment ID were changed) by logging an error. Also please log the return after each episode to `results/metrics/` and update it after each new episode finishes. Save the final return over episode curve as `results/figures/return_over_time.png` and the final trained model as `models/saved_models/dqn_model.pt`. If possible, add annotations to the return over episode curve to showcase key milestones.",
 4 |     "tags": [
 5 |         "Reinforcement Learning"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The \"DQN\" algorithm is implemented using PyTorch and saved in `src/model.py`.",
12 |             "category": "Machine Learning Method",
13 |             "satisfied": true
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [],
18 |             "criteria": "An \"OpenAI Gym\" environment is instantiated in `src/main.py`.",
19 |             "category": "Dataset or Environment",
20 |             "satisfied": true
21 |         },
22 |         {
23 |             "requirement_id": 2,
24 |             "prerequisites": [
25 |                 0,
26 |                 1
27 |             ],
28 |             "criteria": "The return over time curve is plotted, recording the return of each episode, and saved as `results/figures/reward_over_time.png`.",
29 |             "category": "Visualization",
30 |             "satisfied": false
31 |         },
32 |         {
33 |             "requirement_id": 3,
34 |             "prerequisites": [
35 |                 0,
36 |                 1
37 |             ],
38 |             "criteria": "The trained model is saved as `models/saved_models/dqn_model.pt`.",
39 |             "category": "Save Trained Model",
40 |             "satisfied": true
41 |         },
42 |         {
43 |             "requirement_id": 4,
44 |             "prerequisites": [
45 |                 0,
46 |                 1
47 |             ],
48 |             "criteria": "The return for each episode is logged to `results/metrics/` and updated after each episode finishes.",
49 |             "category": "Performance Metrics",
50 |             "satisfied": true
51 |         }
52 |     ],
53 |     "preferences": [
54 |         {
55 |             "preference_id": 0,
56 |             "criteria": "The system should handle dimension mismatches, logging the issues for easy debugging.",
57 |             "satisfied": null
58 |         },
59 |         {
60 |             "preference_id": 1,
61 |             "criteria": "The return over episode curve has key milestones annotated.",
62 |             "satisfied": null
63 |         }
64 |     ],
65 |     "is_kaggle_api_needed": false,
66 |     "is_training_needed": true,
67 |     "is_web_navigation_needed": false
68 | }


--------------------------------------------------------------------------------
/benchmark/judgment/GPT-Pilot/agent_as_a_judge/gray_box/06_Sentiment_Analysis_SVM_Sentiment140_ML.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "06_Sentiment_Analysis_SVM_Sentiment140_ML",
 3 |     "query": "Please help me build a system for sentiment analysis on tweets using the Sentiment140 dataset available from Hugging Face. Load the Sentiment140 dataset and, when loading the data, clean it by removing stop words, punctuation, and special characters, all in `src/data_loader.py`. Use Word2Vec or GloVe for text vectorization. This should occur in `src/data_loader.py`. Next, implement and train an SVM classifier in `src/model.py`. Finally, write a report of the accuracy of the classifier to `results/metrics/accuracy_score.txt`. Ideally, the report should be easily interpretable.",
 4 |     "tags": [
 5 |         "Natural Language Processing",
 6 |         "Supervised Learning"
 7 |     ],
 8 |     "requirements": [
 9 |         {
10 |             "requirement_id": 0,
11 |             "prerequisites": [],
12 |             "criteria": "The \"Sentiment140\" dataset, available from \"Hugging Face,\" is obtained in `src/data_loader.py`.",
13 |             "category": "Dataset or Environment",
14 |             "satisfied": true
15 |         },
16 |         {
17 |             "requirement_id": 1,
18 |             "prerequisites": [
19 |                 0
20 |             ],
21 |             "criteria": "The dataset is cleaned, including by removing stop words, punctuation, and special characters, all in `src/data_loader.py`.",
22 |             "category": "Data preprocessing and postprocessing",
23 |             "satisfied": false
24 |         },
25 |         {
26 |             "requirement_id": 2,
27 |             "prerequisites": [
28 |                 0,
29 |                 1
30 |             ],
31 |             "criteria": "Word embeddings, either \"Word2Vec\" or \"GloVe,\" are used to convert text to vectors in `src/data_loader.py`.",
32 |             "category": "Machine Learning Method",
33 |             "satisfied": true
34 |         },
35 |         {
36 |             "requirement_id": 3,
37 |             "prerequisites": [
38 |                 0,
39 |                 1,
40 |                 2
41 |             ],
42 |             "criteria": "An \"SVM classifier\" is implemented and trained in `src/model.py`.",
43 |             "category": "Machine Learning Method",
44 |             "satisfied": true
45 |         },
46 |         {
47 |             "requirement_id": 4,
48 |             "prerequisites": [
49 |                 1,
50 |                 2,
51 |                 3
52 |             ],
53 |             "criteria": "The accuracy score is printed and saved as `results/metrics/accuracy_score.txt`.",
54 |             "category": "Performance Metrics",
55 |             "satisfied": false
56 |         }
57 |     ],
58 |     "preferences": [
59 |         {
60 |             "preference_id": 0,
61 |             "criteria": "The dataset download process should be reliable, with clear error handling.",
62 |             "satisfied": null
63 |         },
64 |         {
65 |             "preference_id": 1,
66 |             "criteria": "The final accuracy report should be straightforward and easy to interpret.",
67 |             "satisfied": null
68 |         }
69 |     ],
70 |     "is_kaggle_api_needed": false,
71 |     "is_training_needed": true,
72 |     "is_web_navigation_needed": false
73 | }


--------------------------------------------------------------------------------
/benchmark/judgment/GPT-Pilot/agent_as_a_judge/gray_box/07_Image_Super_Resolution_SRCNN_Set5_DL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "07_Image_Super_Resolution_SRCNN_Set5_DL",
 3 |     "query": "Hi, I need to create a project for image super-resolution using the SRCNN model with the Set5 dataset (available from `https://huggingface.co/datasets/eugenesiow/Set5`). Load the dataset in `src/data_loader.py`. When loading the data, include image preprocessing steps such as resizing and normalization, all in `src/data_loader.py`. The SRCNN model should be loaded and used in `src/model.py`. Save 5 sets of comparison images, zooming in on details, as `results/figures/super_resolution_compare.png`, and the super-resolution results as `results/figures/super_resolution_results.png`.  The generated images should be high-quality and clearly show improvements.",
 4 |     "tags": [
 5 |         "Computer Vision",
 6 |         "Generative Models"
 7 |     ],
 8 |     "requirements": [
 9 |         {
10 |             "requirement_id": 0,
11 |             "prerequisites": [],
12 |             "criteria": "The \"Set5\" dataset (available from \"Hugging Face\") is loaded in `src/data_loader.py`.",
13 |             "category": "Dataset or Environment",
14 |             "satisfied": false
15 |         },
16 |         {
17 |             "requirement_id": 1,
18 |             "prerequisites": [
19 |                 0
20 |             ],
21 |             "criteria": "Image preprocessing, including resizing and normalization, is performed in `src/data_loader.py`.",
22 |             "category": "Data preprocessing and postprocessing",
23 |             "satisfied": true
24 |         },
25 |         {
26 |             "requirement_id": 2,
27 |             "prerequisites": [],
28 |             "criteria": "The \"SRCNN\" model is used in `src/model.py`.",
29 |             "category": "Machine Learning Method",
30 |             "satisfied": true
31 |         },
32 |         {
33 |             "requirement_id": 3,
34 |             "prerequisites": [
35 |                 0,
36 |                 1,
37 |                 2
38 |             ],
39 |             "criteria": "Five sets of comparison images are saved, with details zoomed in, and saved as `results/figures/super_resolution_compare.png`.",
40 |             "category": "Visualization",
41 |             "satisfied": false
42 |         },
43 |         {
44 |             "requirement_id": 4,
45 |             "prerequisites": [
46 |                 0,
47 |                 1,
48 |                 2
49 |             ],
50 |             "criteria": "Super-resolution results are saved as `results/figures/super_resolution_results.png`.",
51 |             "category": "Visualization",
52 |             "satisfied": false
53 |         }
54 |     ],
55 |     "preferences": [
56 |         {
57 |             "preference_id": 0,
58 |             "criteria": "The project should generate high-quality, clear super-resolution images with detailed comparisons.",
59 |             "satisfied": null
60 |         },
61 |         {
62 |             "preference_id": 1,
63 |             "criteria": "Well-organized output images, highlighting key improvements, should be included.",
64 |             "satisfied": null
65 |         }
66 |     ],
67 |     "is_kaggle_api_needed": false,
68 |     "is_training_needed": true,
69 |     "is_web_navigation_needed": true
70 | }


--------------------------------------------------------------------------------
/benchmark/judgment/GPT-Pilot/agent_as_a_judge/gray_box/08_Robot_Control_PPO_PyBullet_RL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "08_Robot_Control_PPO_PyBullet_RL",
 3 |     "query": "I am seeking to implement a project which explores robotic arm control via reinforcement learning in the PyBullet simulation environment with the PPO algorithm. The PyBullet simulator should be imported and a related robotics environment should be loaded in `src/env.py`. The PPO algorithm should be implemented in `src/train.py`. The project should meticulously document the robot's final position, printing and saving it as `data/final_position.txt`. The training return trajectory should be graphed and saved as `results/figures/training_returns.png`. A sample of the robot's motion should be visualized and saved as `results/figures/robot_motion.gif`. A detailed environment setup and reward structure description should be provided in `src/env.py`. Please ensure that any issues with loading URDF files in PyBullet are clearly handled and documented, providing clear error messages or logging for debugging.",
 4 |     "tags": [
 5 |         "Reinforcement Learning"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The \"PyBullet\" simulator is used in `src/env.py`.",
12 |             "category": "Dataset or Environment",
13 |             "satisfied": true
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [],
18 |             "criteria": "The \"PPO\" algorithm is used in `src/train.py`.",
19 |             "category": "Machine Learning Method",
20 |             "satisfied": true
21 |         },
22 |         {
23 |             "requirement_id": 2,
24 |             "prerequisites": [
25 |                 0
26 |             ],
27 |             "criteria": "A detailed environment setup and reward structure description is provided in `src/env.py`.",
28 |             "category": "Dataset or Environment",
29 |             "satisfied": false
30 |         },
31 |         {
32 |             "requirement_id": 3,
33 |             "prerequisites": [
34 |                 0,
35 |                 1,
36 |                 2
37 |             ],
38 |             "criteria": "The robot's final position is printed and saved as `data/final_position.txt`.",
39 |             "category": "Other",
40 |             "satisfied": false
41 |         },
42 |         {
43 |             "requirement_id": 4,
44 |             "prerequisites": [
45 |                 0,
46 |                 1,
47 |                 2
48 |             ],
49 |             "criteria": "The training returns over time curve is recorded and saved as `results/figures/training_returns.png`.",
50 |             "category": "Visualization",
51 |             "satisfied": false
52 |         },
53 |         {
54 |             "requirement_id": 5,
55 |             "prerequisites": [
56 |                 0,
57 |                 1,
58 |                 2
59 |             ],
60 |             "criteria": "A sample of the robot's motion is visualized and saved as `results/figures/robot_motion.gif`.",
61 |             "category": "Visualization",
62 |             "satisfied": false
63 |         }
64 |     ],
65 |     "preferences": [
66 |         {
67 |             "preference_id": 0,
68 |             "criteria": "The system should effectively handle potential issues with loading URDF files in PyBullet, providing clear error messages or logging for debugging.",
69 |             "satisfied": null
70 |         }
71 |     ],
72 |     "is_kaggle_api_needed": false,
73 |     "is_training_needed": true,
74 |     "is_web_navigation_needed": false
75 | }


--------------------------------------------------------------------------------
/benchmark/judgment/GPT-Pilot/agent_as_a_judge/gray_box/51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images",
 3 |     "query": "Hi! Please follow the instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) to set up the script mentioned for generating images with hidden text in `src/visualize.py`. Ensure the generated images are of 1080p resolution and saved in `results/figures/`. Create control images embedding the text \"FUTURE\" and save it in `results/figures/`. Please also manually verify that the hidden text is indeed embedded in the generated images.",
 4 |     "tags": [
 5 |         "Computer Vision",
 6 |         "Generative Models",
 7 |         "Natural Language Processing"
 8 |     ],
 9 |     "requirements": [
10 |         {
11 |             "requirement_id": 0,
12 |             "prerequisites": [],
13 |             "criteria": "The instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) are followed to set up the script mentioned for generating images with hidden text in `src/visualize.py`.",
14 |             "category": "Dataset or Environment",
15 |             "satisfied": true
16 |         },
17 |         {
18 |             "requirement_id": 1,
19 |             "prerequisites": [
20 |                 0
21 |             ],
22 |             "criteria": "The generated images are ensured to be of 1080p resolution and saved in results/figures/.",
23 |             "category": "Data preprocessing and postprocessing",
24 |             "satisfied": false
25 |         },
26 |         {
27 |             "requirement_id": 2,
28 |             "prerequisites": [
29 |                 0,
30 |                 1
31 |             ],
32 |             "criteria": "Control images embedding the text \"FUTURE,\" is created and saved in results/figures/.",
33 |             "category": "Visualization",
34 |             "satisfied": true
35 |         }
36 |     ],
37 |     "preferences": [
38 |         {
39 |             "preference_id": 0,
40 |             "criteria": "The system should be capable of learning and using unfamiliar technologies, adapting to new tools or platforms as required.",
41 |             "satisfied": null
42 |         },
43 |         {
44 |             "preference_id": 1,
45 |             "criteria": "After reviewing the blog post, ControlNet should be successfully run on Modal to produce the images with the concealed messages for user.",
46 |             "satisfied": null
47 |         }
48 |     ],
49 |     "is_kaggle_api_needed": false,
50 |     "is_training_needed": false,
51 |     "is_web_navigation_needed": true,
52 |     "resource": "https://www.cognition.ai/blog/introducing-devin"
53 | }


--------------------------------------------------------------------------------
/benchmark/judgment/GPT-Pilot/agent_as_a_judge/gray_box/52_Devin_AI_Trains_an_AI.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "52_Devin_AI_Trains_an_AI",
 3 |     "query": "Can you finetune a 7B LLaMA model using `https://github.com/artidoro/qlora`? Follow the instructions in the repository to finetune the 7B LLaMA model and save it in models/saved_models/. Ensure the necessary environment and dependencies are set up as outlined in `src/env.py`. Download and prepare the datasets required for finetuning the model as specified in `src/data_loader.py`. Complete the finetuning process, ensuring all configurations are properly set in accordance with qlora. Save the finetuned model and training summary, storing them in the specified directory as results/metrics/finetuning_summary.txt.",
 4 |     "tags": [
 5 |         "Generative Models",
 6 |         "Natural Language Processing"
 7 |     ],
 8 |     "requirements": [
 9 |         {
10 |             "requirement_id": 0,
11 |             "prerequisites": [],
12 |             "criteria": "The instructions in the repository at `https://github.com/artidoro/qlora` are followed to finetune a \"7B LLaMA\" model and save it in models/saved_models/.",
13 |             "category": "Machine Learning Method",
14 |             "satisfied": false
15 |         },
16 |         {
17 |             "requirement_id": 1,
18 |             "prerequisites": [
19 |                 0
20 |             ],
21 |             "criteria": "The necessary environment and dependencies are set up as outlined in the repository `src/env.py`.",
22 |             "category": "Dataset or Environment",
23 |             "satisfied": true
24 |         },
25 |         {
26 |             "requirement_id": 2,
27 |             "prerequisites": [
28 |                 0,
29 |                 1
30 |             ],
31 |             "criteria": "The datasets required for finetuning the model are downloaded and prepared in `src/data_loader.py`.",
32 |             "category": "Dataset or Environment",
33 |             "satisfied": true
34 |         },
35 |         {
36 |             "requirement_id": 3,
37 |             "prerequisites": [
38 |                 0,
39 |                 1,
40 |                 2
41 |             ],
42 |             "criteria": "The finetuning process is completed, ensuring all configurations are properly set in accordance with \"qlora.\"",
43 |             "category": "Machine Learning Method",
44 |             "satisfied": false
45 |         },
46 |         {
47 |             "requirement_id": 4,
48 |             "prerequisites": [
49 |                 0,
50 |                 1,
51 |                 2,
52 |                 3
53 |             ],
54 |             "criteria": "The finetuned model and training summary are saved, storing them in the specified directory as `results/metrics/finetuning_summary.txt`.",
55 |             "category": "Save Trained Model",
56 |             "satisfied": false
57 |         }
58 |     ],
59 |     "preferences": [
60 |         {
61 |             "preference_id": 0,
62 |             "criteria": "The finetuning process should include validation steps to monitor overfitting or other issues.",
63 |             "satisfied": null
64 |         },
65 |         {
66 |             "preference_id": 1,
67 |             "criteria": "A detailed report on the finetuning process, including any challenges faced and how they were overcome, should be generated and saved as `results/finetuning_summary.txt`.",
68 |             "satisfied": null
69 |         }
70 |     ],
71 |     "is_kaggle_api_needed": false,
72 |     "is_training_needed": true,
73 |     "is_web_navigation_needed": true,
74 |     "resource": "https://www.cognition.ai/blog/introducing-devin"
75 | }


--------------------------------------------------------------------------------
/benchmark/judgment/GPT-Pilot/agent_as_a_judge/gray_box/53_Devin_Upwork_Side_Hustle.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "53_Devin_Upwork_Side_Hustle",
 3 |     "query": "Hello, I am looking to make inferences with the models in this repository `https://github.com/mahdi65/roadDamageDetection2020`. The system should perform inferences using the models from the repository and save the results in `models/saved_models/`. Sample data should be downloaded and prepared for testing the models in `src/data_loader.py`. Inference should be performed using the provided models on the sample data in `models/saved_models/`. Visualized images showing the detections made by the models should be generated and saved in the `results/figures/` directory. Also, a performance report based on the model's detection results should be generated and saved as `results/metrics/model_performance_report.txt`.",
 4 |     "tags": [
 5 |         "Computer Vision"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The repository at `https://github.com/mahdi65/roadDamageDetection2020` is set up.",
12 |             "category": "Machine Learning Method",
13 |             "satisfied": false
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [
18 |                 0
19 |             ],
20 |             "criteria": "Sample data is downloaded and prepared for testing the models in `src/data_loader.py`.",
21 |             "category": "Dataset or Environment",
22 |             "satisfied": false
23 |         },
24 |         {
25 |             "requirement_id": 2,
26 |             "prerequisites": [
27 |                 0,
28 |                 1
29 |             ],
30 |             "criteria": "Inference is performed using the provided models on the sample data in `models/saved_models/`.",
31 |             "category": "Other",
32 |             "satisfied": false
33 |         },
34 |         {
35 |             "requirement_id": 3,
36 |             "prerequisites": [
37 |                 0,
38 |                 1,
39 |                 2
40 |             ],
41 |             "criteria": "Visualized images showing the detections made by the models are generated and saved in the `results/figures/` directory.",
42 |             "category": "Visualization",
43 |             "satisfied": false
44 |         },
45 |         {
46 |             "requirement_id": 4,
47 |             "prerequisites": [
48 |                 0,
49 |                 1,
50 |                 2,
51 |                 3
52 |             ],
53 |             "criteria": "A performance report based on the model's detection results is generated and saved as `results/metrics/model_performance_report.txt`.",
54 |             "category": "Performance Metrics",
55 |             "satisfied": false
56 |         }
57 |     ],
58 |     "preferences": [
59 |         {
60 |             "preference_id": 0,
61 |             "criteria": "The visualized images should be clear, with detections accurately highlighted for easy interpretation.",
62 |             "satisfied": null
63 |         },
64 |         {
65 |             "preference_id": 1,
66 |             "criteria": "The performance report should include a summary of detection accuracy and any issues encountered during inference.",
67 |             "satisfied": null
68 |         }
69 |     ],
70 |     "is_kaggle_api_needed": false,
71 |     "is_training_needed": false,
72 |     "is_web_navigation_needed": true,
73 |     "resource": "https://www.cognition.ai/blog/introducing-devin"
74 | }


--------------------------------------------------------------------------------
/benchmark/judgment/GPT-Pilot/human_as_a_judge/03_Text_Classification_NaiveBayes_20Newsgroups_ML.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "03_Text_Classification_NaiveBayes_20Newsgroups_ML",
 3 |     "query": "Please implement a Naive Bayes classifier for the 20 Newsgroups dataset and save it in a file called `src/model.py`. The dataset should loaded in `src/data_loader.py`. The program should handle data preprocessing, including removing stop words, punctuation, and special characters. Show the improvement of your classifier by generating word clouds before and after training your classifier and saving them as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`. Please calculate and include TF-IDF features when loading the data in `src/data_loader.py`. Lastly, print out a performance report (including precision, recall, and F1-score) and save it as `results/metrics/performance.txt`. The model should be straightforward to interpret, and the final report should be structured clearly for easy review.",
 4 |     "tags": [
 5 |         "Classification",
 6 |         "Natural Language Processing",
 7 |         "Supervised Learning"
 8 |     ],
 9 |     "requirements": [
10 |         {
11 |             "requirement_id": 0,
12 |             "prerequisites": [],
13 |             "criteria": "The \"20 Newsgroups\" dataset is used in `src/data_loader.py`.",
14 |             "category": "Dataset or Environment",
15 |             "satisfied": true
16 |         },
17 |         {
18 |             "requirement_id": 1,
19 |             "prerequisites": [
20 |                 0
21 |             ],
22 |             "criteria": "Data preprocessing is performed, including removing stop words, punctuation, and special characters. Word clouds are visualized before and after training the classifier, and saved as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`.",
23 |             "category": "Data preprocessing and postprocessing",
24 |             "satisfied": true
25 |         },
26 |         {
27 |             "requirement_id": 2,
28 |             "prerequisites": [
29 |                 0,
30 |                 1
31 |             ],
32 |             "criteria": "\"TF-IDF\" features are used when loading the data in `src/data_loader.py`.",
33 |             "category": "Data preprocessing and postprocessing",
34 |             "satisfied": true
35 |         },
36 |         {
37 |             "requirement_id": 3,
38 |             "prerequisites": [],
39 |             "criteria": "A \"Naive Bayes classifier\" is implemented in `src/model.py`.",
40 |             "category": "Machine Learning Method",
41 |             "satisfied": true
42 |         },
43 |         {
44 |             "requirement_id": 4,
45 |             "prerequisites": [
46 |                 0,
47 |                 1,
48 |                 2
49 |             ],
50 |             "criteria": "A performance report, including \"precision,\" \"recall,\" and the \"F1-score,\" is printed and saved as `results/metrics/performance.txt`.",
51 |             "category": "Performance Metrics",
52 |             "satisfied": true
53 |         }
54 |     ],
55 |     "preferences": [
56 |         {
57 |             "preference_id": 0,
58 |             "criteria": "The model should be straightforward to interpret.",
59 |             "satisfied": null
60 |         },
61 |         {
62 |             "preference_id": 1,
63 |             "criteria": "The final report should be structured clearly for easy review.",
64 |             "satisfied": null
65 |         }
66 |     ],
67 |     "is_kaggle_api_needed": false,
68 |     "is_training_needed": true,
69 |     "is_web_navigation_needed": false,
70 |     "executed_successfully": false,
71 |     "satisfied_all_requirements": true,
72 |     "satisfied_all_preferences": false
73 | }
74 | 


--------------------------------------------------------------------------------
/benchmark/judgment/GPT-Pilot/human_as_a_judge/05_Game_Simulation_DQN_CartPole_v1_RL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "05_Game_Simulation_DQN_CartPole_v1_RL",
 3 |     "query": "Create a reliable system to train a DQN agent using PyTorch in an OpenAI Gym environment such as CartPole-v1. Implement DQN using PyTorch in `src/model.py`. The environment should be instantiated in the main file, `src/main.py`. Handle any dimension mismatches between DQN and the environment (like would happen if the Gym environment ID were changed) by logging an error. Also please log the return after each episode to `results/metrics/` and update it after each new episode finishes. Save the final return over episode curve as `results/figures/return_over_time.png` and the final trained model as `models/saved_models/dqn_model.pt`. If possible, add annotations to the return over episode curve to showcase key milestones.",
 4 |     "tags": [
 5 |         "Reinforcement Learning"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The \"DQN\" algorithm is implemented using PyTorch and saved in `src/model.py`.",
12 |             "category": "Machine Learning Method",
13 |             "satisfied": true
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [],
18 |             "criteria": "An \"OpenAI Gym\" environment is instantiated in `src/main.py`.",
19 |             "category": "Dataset or Environment",
20 |             "satisfied": true
21 |         },
22 |         {
23 |             "requirement_id": 2,
24 |             "prerequisites": [
25 |                 0,
26 |                 1
27 |             ],
28 |             "criteria": "The return over time curve is plotted, recording the return of each episode, and saved as `results/figures/reward_over_time.png`.",
29 |             "category": "Visualization",
30 |             "satisfied": true
31 |         },
32 |         {
33 |             "requirement_id": 3,
34 |             "prerequisites": [
35 |                 0,
36 |                 1
37 |             ],
38 |             "criteria": "The trained model is saved as `models/saved_models/dqn_model.pt`.",
39 |             "category": "Save Trained Model",
40 |             "satisfied": true
41 |         },
42 |         {
43 |             "requirement_id": 4,
44 |             "prerequisites": [
45 |                 0,
46 |                 1
47 |             ],
48 |             "criteria": "The return for each episode is logged to `results/metrics/` and updated after each episode finishes.",
49 |             "category": "Performance Metrics",
50 |             "satisfied": true
51 |         }
52 |     ],
53 |     "preferences": [
54 |         {
55 |             "preference_id": 0,
56 |             "criteria": "The system should handle dimension mismatches, logging the issues for easy debugging.",
57 |             "satisfied": null
58 |         },
59 |         {
60 |             "preference_id": 1,
61 |             "criteria": "The return over episode curve has key milestones annotated.",
62 |             "satisfied": null
63 |         }
64 |     ],
65 |     "is_kaggle_api_needed": false,
66 |     "is_training_needed": true,
67 |     "is_web_navigation_needed": false,
68 |     "executed_successfully": false,
69 |     "satisfied_all_requirements": true,
70 |     "satisfied_all_preferences": false
71 | }
72 | 


--------------------------------------------------------------------------------
/benchmark/judgment/GPT-Pilot/human_as_a_judge/06_Sentiment_Analysis_SVM_Sentiment140_ML.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "06_Sentiment_Analysis_SVM_Sentiment140_ML",
 3 |     "query": "Please help me build a system for sentiment analysis on tweets using the Sentiment140 dataset available from Hugging Face. Load the Sentiment140 dataset and, when loading the data, clean it by removing stop words, punctuation, and special characters, all in `src/data_loader.py`. Use Word2Vec or GloVe for text vectorization. This should occur in `src/data_loader.py`. Next, implement and train an SVM classifier in `src/model.py`. Finally, write a report of the accuracy of the classifier to `results/metrics/accuracy_score.txt`. Ideally, the report should be easily interpretable.",
 4 |     "tags": [
 5 |         "Natural Language Processing",
 6 |         "Supervised Learning"
 7 |     ],
 8 |     "requirements": [
 9 |         {
10 |             "requirement_id": 0,
11 |             "prerequisites": [],
12 |             "criteria": "The \"Sentiment140\" dataset, available from \"Hugging Face,\" is obtained in `src/data_loader.py`.",
13 |             "category": "Dataset or Environment",
14 |             "satisfied": true
15 |         },
16 |         {
17 |             "requirement_id": 1,
18 |             "prerequisites": [
19 |                 0
20 |             ],
21 |             "criteria": "The dataset is cleaned, including by removing stop words, punctuation, and special characters, all in `src/data_loader.py`.",
22 |             "category": "Data preprocessing and postprocessing",
23 |             "satisfied": true
24 |         },
25 |         {
26 |             "requirement_id": 2,
27 |             "prerequisites": [
28 |                 0,
29 |                 1
30 |             ],
31 |             "criteria": "Word embeddings, either \"Word2Vec\" or \"GloVe,\" are used to convert text to vectors in `src/data_loader.py`.",
32 |             "category": "Machine Learning Method",
33 |             "satisfied": true
34 |         },
35 |         {
36 |             "requirement_id": 3,
37 |             "prerequisites": [
38 |                 0,
39 |                 1,
40 |                 2
41 |             ],
42 |             "criteria": "An \"SVM classifier\" is implemented and trained in `src/model.py`.",
43 |             "category": "Machine Learning Method",
44 |             "satisfied": false
45 |         },
46 |         {
47 |             "requirement_id": 4,
48 |             "prerequisites": [
49 |                 1,
50 |                 2,
51 |                 3
52 |             ],
53 |             "criteria": "The accuracy score is printed and saved as `results/metrics/accuracy_score.txt`.",
54 |             "category": "Performance Metrics",
55 |             "satisfied": false
56 |         }
57 |     ],
58 |     "preferences": [
59 |         {
60 |             "preference_id": 0,
61 |             "criteria": "The dataset download process should be reliable, with clear error handling.",
62 |             "satisfied": null
63 |         },
64 |         {
65 |             "preference_id": 1,
66 |             "criteria": "The final accuracy report should be straightforward and easy to interpret.",
67 |             "satisfied": null
68 |         }
69 |     ],
70 |     "is_kaggle_api_needed": false,
71 |     "is_training_needed": true,
72 |     "is_web_navigation_needed": false,
73 |     "executed_successfully": false,
74 |     "satisfied_all_requirements": false,
75 |     "satisfied_all_preferences": false
76 | }
77 | 


--------------------------------------------------------------------------------
/benchmark/judgment/GPT-Pilot/human_as_a_judge/07_Image_Super_Resolution_SRCNN_Set5_DL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "07_Image_Super_Resolution_SRCNN_Set5_DL",
 3 |     "query": "Hi, I need to create a project for image super-resolution using the SRCNN model with the Set5 dataset (available from `https://huggingface.co/datasets/eugenesiow/Set5`). Load the dataset in `src/data_loader.py`. When loading the data, include image preprocessing steps such as resizing and normalization, all in `src/data_loader.py`. The SRCNN model should be loaded and used in `src/model.py`. Save 5 sets of comparison images, zooming in on details, as `results/figures/super_resolution_compare.png`, and the super-resolution results as `results/figures/super_resolution_results.png`.  The generated images should be high-quality and clearly show improvements.",
 4 |     "tags": [
 5 |         "Computer Vision",
 6 |         "Generative Models"
 7 |     ],
 8 |     "requirements": [
 9 |         {
10 |             "requirement_id": 0,
11 |             "prerequisites": [],
12 |             "criteria": "The \"Set5\" dataset (available from \"Hugging Face\") is loaded in `src/data_loader.py`.",
13 |             "category": "Dataset or Environment",
14 |             "satisfied": false
15 |         },
16 |         {
17 |             "requirement_id": 1,
18 |             "prerequisites": [
19 |                 0
20 |             ],
21 |             "criteria": "Image preprocessing, including resizing and normalization, is performed in `src/data_loader.py`.",
22 |             "category": "Data preprocessing and postprocessing",
23 |             "satisfied": true
24 |         },
25 |         {
26 |             "requirement_id": 2,
27 |             "prerequisites": [],
28 |             "criteria": "The \"SRCNN\" model is used in `src/model.py`.",
29 |             "category": "Machine Learning Method",
30 |             "satisfied": false
31 |         },
32 |         {
33 |             "requirement_id": 3,
34 |             "prerequisites": [
35 |                 0,
36 |                 1,
37 |                 2
38 |             ],
39 |             "criteria": "Five sets of comparison images are saved, with details zoomed in, and saved as `results/figures/super_resolution_compare.png`.",
40 |             "category": "Visualization",
41 |             "satisfied": false
42 |         },
43 |         {
44 |             "requirement_id": 4,
45 |             "prerequisites": [
46 |                 0,
47 |                 1,
48 |                 2
49 |             ],
50 |             "criteria": "Super-resolution results are saved as `results/figures/super_resolution_results.png`.",
51 |             "category": "Visualization",
52 |             "satisfied": false
53 |         }
54 |     ],
55 |     "preferences": [
56 |         {
57 |             "preference_id": 0,
58 |             "criteria": "The project should generate high-quality, clear super-resolution images with detailed comparisons.",
59 |             "satisfied": null
60 |         },
61 |         {
62 |             "preference_id": 1,
63 |             "criteria": "Well-organized output images, highlighting key improvements, should be included.",
64 |             "satisfied": null
65 |         }
66 |     ],
67 |     "is_kaggle_api_needed": false,
68 |     "is_training_needed": true,
69 |     "is_web_navigation_needed": true,
70 |     "executed_successfully": false,
71 |     "satisfied_all_requirements": false,
72 |     "satisfied_all_preferences": false
73 | }
74 | 


--------------------------------------------------------------------------------
/benchmark/judgment/GPT-Pilot/human_as_a_judge/51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images",
 3 |     "query": "Hi! Please follow the instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) to set up the script mentioned for generating images with hidden text in `src/visualize.py`. Ensure the generated images are of 1080p resolution and saved in `results/figures/`. Create control images embedding the text \"FUTURE\" and save it in `results/figures/`. Please also manually verify that the hidden text is indeed embedded in the generated images.",
 4 |     "tags": [
 5 |         "Computer Vision",
 6 |         "Generative Models",
 7 |         "Natural Language Processing"
 8 |     ],
 9 |     "requirements": [
10 |         {
11 |             "requirement_id": 0,
12 |             "prerequisites": [],
13 |             "criteria": "The instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) are followed to set up the script mentioned for generating images with hidden text in `src/visualize.py`.",
14 |             "category": "Dataset or Environment",
15 |             "satisfied": false
16 |         },
17 |         {
18 |             "requirement_id": 1,
19 |             "prerequisites": [
20 |                 0
21 |             ],
22 |             "criteria": "The generated images are ensured to be of 1080p resolution and saved in results/figures/.",
23 |             "category": "Data preprocessing and postprocessing",
24 |             "satisfied": false
25 |         },
26 |         {
27 |             "requirement_id": 2,
28 |             "prerequisites": [
29 |                 0,
30 |                 1
31 |             ],
32 |             "criteria": "Control images embedding the text \"FUTURE,\" is created and saved in results/figures/.",
33 |             "category": "Visualization",
34 |             "satisfied": false
35 |         }
36 |     ],
37 |     "preferences": [
38 |         {
39 |             "preference_id": 0,
40 |             "criteria": "The system should be capable of learning and using unfamiliar technologies, adapting to new tools or platforms as required.",
41 |             "satisfied": null
42 |         },
43 |         {
44 |             "preference_id": 1,
45 |             "criteria": "After reviewing the blog post, ControlNet should be successfully run on Modal to produce the images with the concealed messages for user.",
46 |             "satisfied": null
47 |         }
48 |     ],
49 |     "is_kaggle_api_needed": false,
50 |     "is_training_needed": false,
51 |     "is_web_navigation_needed": true,
52 |     "resource": "https://www.cognition.ai/blog/introducing-devin",
53 |     "executed_successfully": false,
54 |     "satisfied_all_requirements": false,
55 |     "satisfied_all_preferences": false
56 | }
57 | 


--------------------------------------------------------------------------------
/benchmark/judgment/GPT-Pilot/human_as_a_judge/53_Devin_Upwork_Side_Hustle.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "53_Devin_Upwork_Side_Hustle",
 3 |     "query": "Hello, I am looking to make inferences with the models in this repository `https://github.com/mahdi65/roadDamageDetection2020`. The system should perform inferences using the models from the repository and save the results in `models/saved_models/`. Sample data should be downloaded and prepared for testing the models in `src/data_loader.py`. Inference should be performed using the provided models on the sample data in `models/saved_models/`. Visualized images showing the detections made by the models should be generated and saved in the `results/figures/` directory. Also, a performance report based on the model's detection results should be generated and saved as `results/metrics/model_performance_report.txt`.",
 4 |     "tags": [
 5 |         "Computer Vision"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The repository at `https://github.com/mahdi65/roadDamageDetection2020` is set up.",
12 |             "category": "Machine Learning Method",
13 |             "satisfied": false
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [
18 |                 0
19 |             ],
20 |             "criteria": "Sample data is downloaded and prepared for testing the models in `src/data_loader.py`.",
21 |             "category": "Dataset or Environment",
22 |             "satisfied": false
23 |         },
24 |         {
25 |             "requirement_id": 2,
26 |             "prerequisites": [
27 |                 0,
28 |                 1
29 |             ],
30 |             "criteria": "Inference is performed using the provided models on the sample data in `models/saved_models/`.",
31 |             "category": "Other",
32 |             "satisfied": false
33 |         },
34 |         {
35 |             "requirement_id": 3,
36 |             "prerequisites": [
37 |                 0,
38 |                 1,
39 |                 2
40 |             ],
41 |             "criteria": "Visualized images showing the detections made by the models are generated and saved in the `results/figures/` directory.",
42 |             "category": "Visualization",
43 |             "satisfied": false
44 |         },
45 |         {
46 |             "requirement_id": 4,
47 |             "prerequisites": [
48 |                 0,
49 |                 1,
50 |                 2,
51 |                 3
52 |             ],
53 |             "criteria": "A performance report based on the model's detection results is generated and saved as `results/metrics/model_performance_report.txt`.",
54 |             "category": "Performance Metrics",
55 |             "satisfied": false
56 |         }
57 |     ],
58 |     "preferences": [
59 |         {
60 |             "preference_id": 0,
61 |             "criteria": "The visualized images should be clear, with detections accurately highlighted for easy interpretation.",
62 |             "satisfied": null
63 |         },
64 |         {
65 |             "preference_id": 1,
66 |             "criteria": "The performance report should include a summary of detection accuracy and any issues encountered during inference.",
67 |             "satisfied": null
68 |         }
69 |     ],
70 |     "is_kaggle_api_needed": false,
71 |     "is_training_needed": false,
72 |     "is_web_navigation_needed": true,
73 |     "resource": "https://www.cognition.ai/blog/introducing-devin",
74 |     "executed_successfully": false,
75 |     "satisfied_all_requirements": false,
76 |     "satisfied_all_preferences": false
77 | }
78 | 


--------------------------------------------------------------------------------
/benchmark/judgment/MetaGPT/agent_as_a_judge/gray_box/02_Maze_Solver_Q_Learning_Gridworld_RL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "02_Maze_Solver_Q_Learning_Gridworld_RL",
 3 |     "query": "Can you help me create a system to solve maze-style Gridworld tasks using the Q-learning algorithm? The system should use numpy to make the core calculations more efficient and matplotlib for visualizations. The Q-learning algorithm should be implemented in `src/train.py`, and the aptly-named Gridworld environment should be implemented in `src/env.py` in such a way that one could specific the grid size and start/end positions when instantiating it. The system needs to record the learning curve during training, tracking episodes and their corresponding returns, and save it as `results/figures/learning_curve.png`. Additionally, I'd like you to visualize and save the paths taken by the agent in each episode in a file called `results/figures/path_changes.gif`, and save the trained model as `models/saved_models/q_learning_model.npy`. It would be great to have some form of real-time feedback during training, like seeing the progress or getting updates on how the model is learning. Also, if you can, please try and write the code in a way that's easy to modify or extend later on.",
 4 |     "tags": [
 5 |         "Reinforcement Learning"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The \"Q-learning\" algorithm is used in `src/train.py`.",
12 |             "category": "Machine Learning Method",
13 |             "satisfied": false
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [],
18 |             "criteria": "The \"Gridworld\" environment is defined in `src/env.py` with the ability for a user to specify a grid size and start/end positions.",
19 |             "category": "Dataset or Environment",
20 |             "satisfied": true
21 |         },
22 |         {
23 |             "requirement_id": 2,
24 |             "prerequisites": [
25 |                 0,
26 |                 1
27 |             ],
28 |             "criteria": "Learning curves are recorded during training, and saved as `results/figures/learning_curve.png`. Episodes and returns are recorded.",
29 |             "category": "Visualization",
30 |             "satisfied": true
31 |         },
32 |         {
33 |             "requirement_id": 3,
34 |             "prerequisites": [
35 |                 0,
36 |                 1,
37 |                 2
38 |             ],
39 |             "criteria": "The learned model is saved as `models/saved_models/q_learning_model.npy`.",
40 |             "category": "Save Trained Model",
41 |             "satisfied": true
42 |         },
43 |         {
44 |             "requirement_id": 4,
45 |             "prerequisites": [
46 |                 0,
47 |                 1
48 |             ],
49 |             "criteria": "Paths taken during learning are visualized and saved as `results/figures/path_changes.gif`.",
50 |             "category": "Visualization",
51 |             "satisfied": true
52 |         }
53 |     ],
54 |     "preferences": [
55 |         {
56 |             "preference_id": 0,
57 |             "criteria": "Some real-time progress or feedback during the training process should be displayed.",
58 |             "satisfied": null
59 |         },
60 |         {
61 |             "preference_id": 1,
62 |             "criteria": "The code should be written in a way that's easy to modify or extend later on.",
63 |             "satisfied": null
64 |         }
65 |     ],
66 |     "is_kaggle_api_needed": false,
67 |     "is_training_needed": true,
68 |     "is_web_navigation_needed": false
69 | }


--------------------------------------------------------------------------------
/benchmark/judgment/MetaGPT/agent_as_a_judge/gray_box/03_Text_Classification_NaiveBayes_20Newsgroups_ML.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "03_Text_Classification_NaiveBayes_20Newsgroups_ML",
 3 |     "query": "Please implement a Naive Bayes classifier for the 20 Newsgroups dataset and save it in a file called `src/model.py`. The dataset should loaded in `src/data_loader.py`. The program should handle data preprocessing, including removing stop words, punctuation, and special characters. Show the improvement of your classifier by generating word clouds before and after training your classifier and saving them as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`. Please calculate and include TF-IDF features when loading the data in `src/data_loader.py`. Lastly, print out a performance report (including precision, recall, and F1-score) and save it as `results/metrics/performance.txt`. The model should be straightforward to interpret, and the final report should be structured clearly for easy review.",
 4 |     "tags": [
 5 |         "Classification",
 6 |         "Natural Language Processing",
 7 |         "Supervised Learning"
 8 |     ],
 9 |     "requirements": [
10 |         {
11 |             "requirement_id": 0,
12 |             "prerequisites": [],
13 |             "criteria": "The \"20 Newsgroups\" dataset is used in `src/data_loader.py`.",
14 |             "category": "Dataset or Environment",
15 |             "satisfied": false
16 |         },
17 |         {
18 |             "requirement_id": 1,
19 |             "prerequisites": [
20 |                 0
21 |             ],
22 |             "criteria": "Data preprocessing is performed, including removing stop words, punctuation, and special characters. Word clouds are visualized before and after training the classifier, and saved as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`.",
23 |             "category": "Data preprocessing and postprocessing",
24 |             "satisfied": false
25 |         },
26 |         {
27 |             "requirement_id": 2,
28 |             "prerequisites": [
29 |                 0,
30 |                 1
31 |             ],
32 |             "criteria": "\"TF-IDF\" features are used when loading the data in `src/data_loader.py`.",
33 |             "category": "Data preprocessing and postprocessing",
34 |             "satisfied": false
35 |         },
36 |         {
37 |             "requirement_id": 3,
38 |             "prerequisites": [],
39 |             "criteria": "A \"Naive Bayes classifier\" is implemented in `src/model.py`.",
40 |             "category": "Machine Learning Method",
41 |             "satisfied": false
42 |         },
43 |         {
44 |             "requirement_id": 4,
45 |             "prerequisites": [
46 |                 0,
47 |                 1,
48 |                 2
49 |             ],
50 |             "criteria": "A performance report, including \"precision,\" \"recall,\" and the \"F1-score,\" is printed and saved as `results/metrics/performance.txt`.",
51 |             "category": "Performance Metrics",
52 |             "satisfied": false
53 |         }
54 |     ],
55 |     "preferences": [
56 |         {
57 |             "preference_id": 0,
58 |             "criteria": "The model should be straightforward to interpret.",
59 |             "satisfied": null
60 |         },
61 |         {
62 |             "preference_id": 1,
63 |             "criteria": "The final report should be structured clearly for easy review.",
64 |             "satisfied": null
65 |         }
66 |     ],
67 |     "is_kaggle_api_needed": false,
68 |     "is_training_needed": true,
69 |     "is_web_navigation_needed": false
70 | }


--------------------------------------------------------------------------------
/benchmark/judgment/MetaGPT/agent_as_a_judge/gray_box/05_Game_Simulation_DQN_CartPole_v1_RL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "05_Game_Simulation_DQN_CartPole_v1_RL",
 3 |     "query": "Create a reliable system to train a DQN agent using PyTorch in an OpenAI Gym environment such as CartPole-v1. Implement DQN using PyTorch in `src/model.py`. The environment should be instantiated in the main file, `src/main.py`. Handle any dimension mismatches between DQN and the environment (like would happen if the Gym environment ID were changed) by logging an error. Also please log the return after each episode to `results/metrics/` and update it after each new episode finishes. Save the final return over episode curve as `results/figures/return_over_time.png` and the final trained model as `models/saved_models/dqn_model.pt`. If possible, add annotations to the return over episode curve to showcase key milestones.",
 4 |     "tags": [
 5 |         "Reinforcement Learning"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The \"DQN\" algorithm is implemented using PyTorch and saved in `src/model.py`.",
12 |             "category": "Machine Learning Method",
13 |             "satisfied": false
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [],
18 |             "criteria": "An \"OpenAI Gym\" environment is instantiated in `src/main.py`.",
19 |             "category": "Dataset or Environment",
20 |             "satisfied": false
21 |         },
22 |         {
23 |             "requirement_id": 2,
24 |             "prerequisites": [
25 |                 0,
26 |                 1
27 |             ],
28 |             "criteria": "The return over time curve is plotted, recording the return of each episode, and saved as `results/figures/reward_over_time.png`.",
29 |             "category": "Visualization",
30 |             "satisfied": false
31 |         },
32 |         {
33 |             "requirement_id": 3,
34 |             "prerequisites": [
35 |                 0,
36 |                 1
37 |             ],
38 |             "criteria": "The trained model is saved as `models/saved_models/dqn_model.pt`.",
39 |             "category": "Save Trained Model",
40 |             "satisfied": false
41 |         },
42 |         {
43 |             "requirement_id": 4,
44 |             "prerequisites": [
45 |                 0,
46 |                 1
47 |             ],
48 |             "criteria": "The return for each episode is logged to `results/metrics/` and updated after each episode finishes.",
49 |             "category": "Performance Metrics",
50 |             "satisfied": true
51 |         }
52 |     ],
53 |     "preferences": [
54 |         {
55 |             "preference_id": 0,
56 |             "criteria": "The system should handle dimension mismatches, logging the issues for easy debugging.",
57 |             "satisfied": null
58 |         },
59 |         {
60 |             "preference_id": 1,
61 |             "criteria": "The return over episode curve has key milestones annotated.",
62 |             "satisfied": null
63 |         }
64 |     ],
65 |     "is_kaggle_api_needed": false,
66 |     "is_training_needed": true,
67 |     "is_web_navigation_needed": false
68 | }


--------------------------------------------------------------------------------
/benchmark/judgment/MetaGPT/agent_as_a_judge/gray_box/06_Sentiment_Analysis_SVM_Sentiment140_ML.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "06_Sentiment_Analysis_SVM_Sentiment140_ML",
 3 |     "query": "Please help me build a system for sentiment analysis on tweets using the Sentiment140 dataset available from Hugging Face. Load the Sentiment140 dataset and, when loading the data, clean it by removing stop words, punctuation, and special characters, all in `src/data_loader.py`. Use Word2Vec or GloVe for text vectorization. This should occur in `src/data_loader.py`. Next, implement and train an SVM classifier in `src/model.py`. Finally, write a report of the accuracy of the classifier to `results/metrics/accuracy_score.txt`. Ideally, the report should be easily interpretable.",
 4 |     "tags": [
 5 |         "Natural Language Processing",
 6 |         "Supervised Learning"
 7 |     ],
 8 |     "requirements": [
 9 |         {
10 |             "requirement_id": 0,
11 |             "prerequisites": [],
12 |             "criteria": "The \"Sentiment140\" dataset, available from \"Hugging Face,\" is obtained in `src/data_loader.py`.",
13 |             "category": "Dataset or Environment",
14 |             "satisfied": false
15 |         },
16 |         {
17 |             "requirement_id": 1,
18 |             "prerequisites": [
19 |                 0
20 |             ],
21 |             "criteria": "The dataset is cleaned, including by removing stop words, punctuation, and special characters, all in `src/data_loader.py`.",
22 |             "category": "Data preprocessing and postprocessing",
23 |             "satisfied": false
24 |         },
25 |         {
26 |             "requirement_id": 2,
27 |             "prerequisites": [
28 |                 0,
29 |                 1
30 |             ],
31 |             "criteria": "Word embeddings, either \"Word2Vec\" or \"GloVe,\" are used to convert text to vectors in `src/data_loader.py`.",
32 |             "category": "Machine Learning Method",
33 |             "satisfied": false
34 |         },
35 |         {
36 |             "requirement_id": 3,
37 |             "prerequisites": [
38 |                 0,
39 |                 1,
40 |                 2
41 |             ],
42 |             "criteria": "An \"SVM classifier\" is implemented and trained in `src/model.py`.",
43 |             "category": "Machine Learning Method",
44 |             "satisfied": false
45 |         },
46 |         {
47 |             "requirement_id": 4,
48 |             "prerequisites": [
49 |                 1,
50 |                 2,
51 |                 3
52 |             ],
53 |             "criteria": "The accuracy score is printed and saved as `results/metrics/accuracy_score.txt`.",
54 |             "category": "Performance Metrics",
55 |             "satisfied": false
56 |         }
57 |     ],
58 |     "preferences": [
59 |         {
60 |             "preference_id": 0,
61 |             "criteria": "The dataset download process should be reliable, with clear error handling.",
62 |             "satisfied": null
63 |         },
64 |         {
65 |             "preference_id": 1,
66 |             "criteria": "The final accuracy report should be straightforward and easy to interpret.",
67 |             "satisfied": null
68 |         }
69 |     ],
70 |     "is_kaggle_api_needed": false,
71 |     "is_training_needed": true,
72 |     "is_web_navigation_needed": false
73 | }


--------------------------------------------------------------------------------
/benchmark/judgment/MetaGPT/agent_as_a_judge/gray_box/07_Image_Super_Resolution_SRCNN_Set5_DL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "07_Image_Super_Resolution_SRCNN_Set5_DL",
 3 |     "query": "Hi, I need to create a project for image super-resolution using the SRCNN model with the Set5 dataset (available from `https://huggingface.co/datasets/eugenesiow/Set5`). Load the dataset in `src/data_loader.py`. When loading the data, include image preprocessing steps such as resizing and normalization, all in `src/data_loader.py`. The SRCNN model should be loaded and used in `src/model.py`. Save 5 sets of comparison images, zooming in on details, as `results/figures/super_resolution_compare.png`, and the super-resolution results as `results/figures/super_resolution_results.png`.  The generated images should be high-quality and clearly show improvements.",
 4 |     "tags": [
 5 |         "Computer Vision",
 6 |         "Generative Models"
 7 |     ],
 8 |     "requirements": [
 9 |         {
10 |             "requirement_id": 0,
11 |             "prerequisites": [],
12 |             "criteria": "The \"Set5\" dataset (available from \"Hugging Face\") is loaded in `src/data_loader.py`.",
13 |             "category": "Dataset or Environment",
14 |             "satisfied": false
15 |         },
16 |         {
17 |             "requirement_id": 1,
18 |             "prerequisites": [
19 |                 0
20 |             ],
21 |             "criteria": "Image preprocessing, including resizing and normalization, is performed in `src/data_loader.py`.",
22 |             "category": "Data preprocessing and postprocessing",
23 |             "satisfied": false
24 |         },
25 |         {
26 |             "requirement_id": 2,
27 |             "prerequisites": [],
28 |             "criteria": "The \"SRCNN\" model is used in `src/model.py`.",
29 |             "category": "Machine Learning Method",
30 |             "satisfied": false
31 |         },
32 |         {
33 |             "requirement_id": 3,
34 |             "prerequisites": [
35 |                 0,
36 |                 1,
37 |                 2
38 |             ],
39 |             "criteria": "Five sets of comparison images are saved, with details zoomed in, and saved as `results/figures/super_resolution_compare.png`.",
40 |             "category": "Visualization",
41 |             "satisfied": false
42 |         },
43 |         {
44 |             "requirement_id": 4,
45 |             "prerequisites": [
46 |                 0,
47 |                 1,
48 |                 2
49 |             ],
50 |             "criteria": "Super-resolution results are saved as `results/figures/super_resolution_results.png`.",
51 |             "category": "Visualization",
52 |             "satisfied": false
53 |         }
54 |     ],
55 |     "preferences": [
56 |         {
57 |             "preference_id": 0,
58 |             "criteria": "The project should generate high-quality, clear super-resolution images with detailed comparisons.",
59 |             "satisfied": null
60 |         },
61 |         {
62 |             "preference_id": 1,
63 |             "criteria": "Well-organized output images, highlighting key improvements, should be included.",
64 |             "satisfied": null
65 |         }
66 |     ],
67 |     "is_kaggle_api_needed": false,
68 |     "is_training_needed": true,
69 |     "is_web_navigation_needed": true
70 | }


--------------------------------------------------------------------------------
/benchmark/judgment/MetaGPT/agent_as_a_judge/gray_box/08_Robot_Control_PPO_PyBullet_RL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "08_Robot_Control_PPO_PyBullet_RL",
 3 |     "query": "I am seeking to implement a project which explores robotic arm control via reinforcement learning in the PyBullet simulation environment with the PPO algorithm. The PyBullet simulator should be imported and a related robotics environment should be loaded in `src/env.py`. The PPO algorithm should be implemented in `src/train.py`. The project should meticulously document the robot's final position, printing and saving it as `data/final_position.txt`. The training return trajectory should be graphed and saved as `results/figures/training_returns.png`. A sample of the robot's motion should be visualized and saved as `results/figures/robot_motion.gif`. A detailed environment setup and reward structure description should be provided in `src/env.py`. Please ensure that any issues with loading URDF files in PyBullet are clearly handled and documented, providing clear error messages or logging for debugging.",
 4 |     "tags": [
 5 |         "Reinforcement Learning"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The \"PyBullet\" simulator is used in `src/env.py`.",
12 |             "category": "Dataset or Environment",
13 |             "satisfied": true
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [],
18 |             "criteria": "The \"PPO\" algorithm is used in `src/train.py`.",
19 |             "category": "Machine Learning Method",
20 |             "satisfied": false
21 |         },
22 |         {
23 |             "requirement_id": 2,
24 |             "prerequisites": [
25 |                 0
26 |             ],
27 |             "criteria": "A detailed environment setup and reward structure description is provided in `src/env.py`.",
28 |             "category": "Dataset or Environment",
29 |             "satisfied": false
30 |         },
31 |         {
32 |             "requirement_id": 3,
33 |             "prerequisites": [
34 |                 0,
35 |                 1,
36 |                 2
37 |             ],
38 |             "criteria": "The robot's final position is printed and saved as `data/final_position.txt`.",
39 |             "category": "Other",
40 |             "satisfied": false
41 |         },
42 |         {
43 |             "requirement_id": 4,
44 |             "prerequisites": [
45 |                 0,
46 |                 1,
47 |                 2
48 |             ],
49 |             "criteria": "The training returns over time curve is recorded and saved as `results/figures/training_returns.png`.",
50 |             "category": "Visualization",
51 |             "satisfied": false
52 |         },
53 |         {
54 |             "requirement_id": 5,
55 |             "prerequisites": [
56 |                 0,
57 |                 1,
58 |                 2
59 |             ],
60 |             "criteria": "A sample of the robot's motion is visualized and saved as `results/figures/robot_motion.gif`.",
61 |             "category": "Visualization",
62 |             "satisfied": false
63 |         }
64 |     ],
65 |     "preferences": [
66 |         {
67 |             "preference_id": 0,
68 |             "criteria": "The system should effectively handle potential issues with loading URDF files in PyBullet, providing clear error messages or logging for debugging.",
69 |             "satisfied": null
70 |         }
71 |     ],
72 |     "is_kaggle_api_needed": false,
73 |     "is_training_needed": true,
74 |     "is_web_navigation_needed": false
75 | }


--------------------------------------------------------------------------------
/benchmark/judgment/MetaGPT/agent_as_a_judge/gray_box/51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images",
 3 |     "query": "Hi! Please follow the instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) to set up the script mentioned for generating images with hidden text in `src/visualize.py`. Ensure the generated images are of 1080p resolution and saved in `results/figures/`. Create control images embedding the text \"FUTURE\" and save it in `results/figures/`. Please also manually verify that the hidden text is indeed embedded in the generated images.",
 4 |     "tags": [
 5 |         "Computer Vision",
 6 |         "Generative Models",
 7 |         "Natural Language Processing"
 8 |     ],
 9 |     "requirements": [
10 |         {
11 |             "requirement_id": 0,
12 |             "prerequisites": [],
13 |             "criteria": "The instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) are followed to set up the script mentioned for generating images with hidden text in `src/visualize.py`.",
14 |             "category": "Dataset or Environment",
15 |             "satisfied": false
16 |         },
17 |         {
18 |             "requirement_id": 1,
19 |             "prerequisites": [
20 |                 0
21 |             ],
22 |             "criteria": "The generated images are ensured to be of 1080p resolution and saved in results/figures/.",
23 |             "category": "Data preprocessing and postprocessing",
24 |             "satisfied": false
25 |         },
26 |         {
27 |             "requirement_id": 2,
28 |             "prerequisites": [
29 |                 0,
30 |                 1
31 |             ],
32 |             "criteria": "Control images embedding the text \"FUTURE,\" is created and saved in results/figures/.",
33 |             "category": "Visualization",
34 |             "satisfied": true
35 |         }
36 |     ],
37 |     "preferences": [
38 |         {
39 |             "preference_id": 0,
40 |             "criteria": "The system should be capable of learning and using unfamiliar technologies, adapting to new tools or platforms as required.",
41 |             "satisfied": null
42 |         },
43 |         {
44 |             "preference_id": 1,
45 |             "criteria": "After reviewing the blog post, ControlNet should be successfully run on Modal to produce the images with the concealed messages for user.",
46 |             "satisfied": null
47 |         }
48 |     ],
49 |     "is_kaggle_api_needed": false,
50 |     "is_training_needed": false,
51 |     "is_web_navigation_needed": true,
52 |     "resource": "https://www.cognition.ai/blog/introducing-devin"
53 | }


--------------------------------------------------------------------------------
/benchmark/judgment/MetaGPT/agent_as_a_judge/gray_box/52_Devin_AI_Trains_an_AI.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "52_Devin_AI_Trains_an_AI",
 3 |     "query": "Can you finetune a 7B LLaMA model using `https://github.com/artidoro/qlora`? Follow the instructions in the repository to finetune the 7B LLaMA model and save it in models/saved_models/. Ensure the necessary environment and dependencies are set up as outlined in `src/env.py`. Download and prepare the datasets required for finetuning the model as specified in `src/data_loader.py`. Complete the finetuning process, ensuring all configurations are properly set in accordance with qlora. Save the finetuned model and training summary, storing them in the specified directory as results/metrics/finetuning_summary.txt.",
 4 |     "tags": [
 5 |         "Generative Models",
 6 |         "Natural Language Processing"
 7 |     ],
 8 |     "requirements": [
 9 |         {
10 |             "requirement_id": 0,
11 |             "prerequisites": [],
12 |             "criteria": "The instructions in the repository at `https://github.com/artidoro/qlora` are followed to finetune a \"7B LLaMA\" model and save it in models/saved_models/.",
13 |             "category": "Machine Learning Method",
14 |             "satisfied": false
15 |         },
16 |         {
17 |             "requirement_id": 1,
18 |             "prerequisites": [
19 |                 0
20 |             ],
21 |             "criteria": "The necessary environment and dependencies are set up as outlined in the repository `src/env.py`.",
22 |             "category": "Dataset or Environment",
23 |             "satisfied": false
24 |         },
25 |         {
26 |             "requirement_id": 2,
27 |             "prerequisites": [
28 |                 0,
29 |                 1
30 |             ],
31 |             "criteria": "The datasets required for finetuning the model are downloaded and prepared in `src/data_loader.py`.",
32 |             "category": "Dataset or Environment",
33 |             "satisfied": false
34 |         },
35 |         {
36 |             "requirement_id": 3,
37 |             "prerequisites": [
38 |                 0,
39 |                 1,
40 |                 2
41 |             ],
42 |             "criteria": "The finetuning process is completed, ensuring all configurations are properly set in accordance with \"qlora.\"",
43 |             "category": "Machine Learning Method",
44 |             "satisfied": false
45 |         },
46 |         {
47 |             "requirement_id": 4,
48 |             "prerequisites": [
49 |                 0,
50 |                 1,
51 |                 2,
52 |                 3
53 |             ],
54 |             "criteria": "The finetuned model and training summary are saved, storing them in the specified directory as `results/metrics/finetuning_summary.txt`.",
55 |             "category": "Save Trained Model",
56 |             "satisfied": false
57 |         }
58 |     ],
59 |     "preferences": [
60 |         {
61 |             "preference_id": 0,
62 |             "criteria": "The finetuning process should include validation steps to monitor overfitting or other issues.",
63 |             "satisfied": null
64 |         },
65 |         {
66 |             "preference_id": 1,
67 |             "criteria": "A detailed report on the finetuning process, including any challenges faced and how they were overcome, should be generated and saved as `results/finetuning_summary.txt`.",
68 |             "satisfied": null
69 |         }
70 |     ],
71 |     "is_kaggle_api_needed": false,
72 |     "is_training_needed": true,
73 |     "is_web_navigation_needed": true,
74 |     "resource": "https://www.cognition.ai/blog/introducing-devin"
75 | }


--------------------------------------------------------------------------------
/benchmark/judgment/MetaGPT/agent_as_a_judge/gray_box/53_Devin_Upwork_Side_Hustle.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "53_Devin_Upwork_Side_Hustle",
 3 |     "query": "Hello, I am looking to make inferences with the models in this repository `https://github.com/mahdi65/roadDamageDetection2020`. The system should perform inferences using the models from the repository and save the results in `models/saved_models/`. Sample data should be downloaded and prepared for testing the models in `src/data_loader.py`. Inference should be performed using the provided models on the sample data in `models/saved_models/`. Visualized images showing the detections made by the models should be generated and saved in the `results/figures/` directory. Also, a performance report based on the model's detection results should be generated and saved as `results/metrics/model_performance_report.txt`.",
 4 |     "tags": [
 5 |         "Computer Vision"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The repository at `https://github.com/mahdi65/roadDamageDetection2020` is set up.",
12 |             "category": "Machine Learning Method",
13 |             "satisfied": false
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [
18 |                 0
19 |             ],
20 |             "criteria": "Sample data is downloaded and prepared for testing the models in `src/data_loader.py`.",
21 |             "category": "Dataset or Environment",
22 |             "satisfied": false
23 |         },
24 |         {
25 |             "requirement_id": 2,
26 |             "prerequisites": [
27 |                 0,
28 |                 1
29 |             ],
30 |             "criteria": "Inference is performed using the provided models on the sample data in `models/saved_models/`.",
31 |             "category": "Other",
32 |             "satisfied": false
33 |         },
34 |         {
35 |             "requirement_id": 3,
36 |             "prerequisites": [
37 |                 0,
38 |                 1,
39 |                 2
40 |             ],
41 |             "criteria": "Visualized images showing the detections made by the models are generated and saved in the `results/figures/` directory.",
42 |             "category": "Visualization",
43 |             "satisfied": false
44 |         },
45 |         {
46 |             "requirement_id": 4,
47 |             "prerequisites": [
48 |                 0,
49 |                 1,
50 |                 2,
51 |                 3
52 |             ],
53 |             "criteria": "A performance report based on the model's detection results is generated and saved as `results/metrics/model_performance_report.txt`.",
54 |             "category": "Performance Metrics",
55 |             "satisfied": false
56 |         }
57 |     ],
58 |     "preferences": [
59 |         {
60 |             "preference_id": 0,
61 |             "criteria": "The visualized images should be clear, with detections accurately highlighted for easy interpretation.",
62 |             "satisfied": null
63 |         },
64 |         {
65 |             "preference_id": 1,
66 |             "criteria": "The performance report should include a summary of detection accuracy and any issues encountered during inference.",
67 |             "satisfied": null
68 |         }
69 |     ],
70 |     "is_kaggle_api_needed": false,
71 |     "is_training_needed": false,
72 |     "is_web_navigation_needed": true,
73 |     "resource": "https://www.cognition.ai/blog/introducing-devin"
74 | }


--------------------------------------------------------------------------------
/benchmark/judgment/MetaGPT/human_as_a_judge/05_Game_Simulation_DQN_CartPole_v1_RL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "05_Game_Simulation_DQN_CartPole_v1_RL",
 3 |     "query": "Create a reliable system to train a DQN agent using PyTorch in an OpenAI Gym environment such as CartPole-v1. Implement DQN using PyTorch in `src/model.py`. The environment should be instantiated in the main file, `src/main.py`. Handle any dimension mismatches between DQN and the environment (like would happen if the Gym environment ID were changed) by logging an error. Also please log the return after each episode to `results/metrics/` and update it after each new episode finishes. Save the final return over episode curve as `results/figures/return_over_time.png` and the final trained model as `models/saved_models/dqn_model.pt`. If possible, add annotations to the return over episode curve to showcase key milestones.",
 4 |     "tags": [
 5 |         "Reinforcement Learning"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The \"DQN\" algorithm is implemented using PyTorch and saved in `src/model.py`.",
12 |             "category": "Machine Learning Method",
13 |             "satisfied": false
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [],
18 |             "criteria": "An \"OpenAI Gym\" environment is instantiated in `src/main.py`.",
19 |             "category": "Dataset or Environment",
20 |             "satisfied": false
21 |         },
22 |         {
23 |             "requirement_id": 2,
24 |             "prerequisites": [
25 |                 0,
26 |                 1
27 |             ],
28 |             "criteria": "The return over time curve is plotted, recording the return of each episode, and saved as `results/figures/reward_over_time.png`.",
29 |             "category": "Visualization",
30 |             "satisfied": false
31 |         },
32 |         {
33 |             "requirement_id": 3,
34 |             "prerequisites": [
35 |                 0,
36 |                 1
37 |             ],
38 |             "criteria": "The trained model is saved as `models/saved_models/dqn_model.pt`.",
39 |             "category": "Save Trained Model",
40 |             "satisfied": true
41 |         },
42 |         {
43 |             "requirement_id": 4,
44 |             "prerequisites": [
45 |                 0,
46 |                 1
47 |             ],
48 |             "criteria": "The return for each episode is logged to `results/metrics/` and updated after each episode finishes.",
49 |             "category": "Performance Metrics",
50 |             "satisfied": true
51 |         }
52 |     ],
53 |     "preferences": [
54 |         {
55 |             "preference_id": 0,
56 |             "criteria": "The system should handle dimension mismatches, logging the issues for easy debugging.",
57 |             "satisfied": null
58 |         },
59 |         {
60 |             "preference_id": 1,
61 |             "criteria": "The return over episode curve has key milestones annotated.",
62 |             "satisfied": null
63 |         }
64 |     ],
65 |     "is_kaggle_api_needed": false,
66 |     "is_training_needed": true,
67 |     "is_web_navigation_needed": false,
68 |     "executed_successfully": false,
69 |     "satisfied_all_requirements": false,
70 |     "satisfied_all_preferences": false
71 | }
72 | 


--------------------------------------------------------------------------------
/benchmark/judgment/MetaGPT/human_as_a_judge/06_Sentiment_Analysis_SVM_Sentiment140_ML.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "06_Sentiment_Analysis_SVM_Sentiment140_ML",
 3 |     "query": "Please help me build a system for sentiment analysis on tweets using the Sentiment140 dataset available from Hugging Face. Load the Sentiment140 dataset and, when loading the data, clean it by removing stop words, punctuation, and special characters, all in `src/data_loader.py`. Use Word2Vec or GloVe for text vectorization. This should occur in `src/data_loader.py`. Next, implement and train an SVM classifier in `src/model.py`. Finally, write a report of the accuracy of the classifier to `results/metrics/accuracy_score.txt`. Ideally, the report should be easily interpretable.",
 4 |     "tags": [
 5 |         "Natural Language Processing",
 6 |         "Supervised Learning"
 7 |     ],
 8 |     "requirements": [
 9 |         {
10 |             "requirement_id": 0,
11 |             "prerequisites": [],
12 |             "criteria": "The \"Sentiment140\" dataset, available from \"Hugging Face,\" is obtained in `src/data_loader.py`.",
13 |             "category": "Dataset or Environment",
14 |             "satisfied": false
15 |         },
16 |         {
17 |             "requirement_id": 1,
18 |             "prerequisites": [
19 |                 0
20 |             ],
21 |             "criteria": "The dataset is cleaned, including by removing stop words, punctuation, and special characters, all in `src/data_loader.py`.",
22 |             "category": "Data preprocessing and postprocessing",
23 |             "satisfied": false
24 |         },
25 |         {
26 |             "requirement_id": 2,
27 |             "prerequisites": [
28 |                 0,
29 |                 1
30 |             ],
31 |             "criteria": "Word embeddings, either \"Word2Vec\" or \"GloVe,\" are used to convert text to vectors in `src/data_loader.py`.",
32 |             "category": "Machine Learning Method",
33 |             "satisfied": false
34 |         },
35 |         {
36 |             "requirement_id": 3,
37 |             "prerequisites": [
38 |                 0,
39 |                 1,
40 |                 2
41 |             ],
42 |             "criteria": "An \"SVM classifier\" is implemented and trained in `src/model.py`.",
43 |             "category": "Machine Learning Method",
44 |             "satisfied": false
45 |         },
46 |         {
47 |             "requirement_id": 4,
48 |             "prerequisites": [
49 |                 1,
50 |                 2,
51 |                 3
52 |             ],
53 |             "criteria": "The accuracy score is printed and saved as `results/metrics/accuracy_score.txt`.",
54 |             "category": "Performance Metrics",
55 |             "satisfied": false
56 |         }
57 |     ],
58 |     "preferences": [
59 |         {
60 |             "preference_id": 0,
61 |             "criteria": "The dataset download process should be reliable, with clear error handling.",
62 |             "satisfied": null
63 |         },
64 |         {
65 |             "preference_id": 1,
66 |             "criteria": "The final accuracy report should be straightforward and easy to interpret.",
67 |             "satisfied": null
68 |         }
69 |     ],
70 |     "is_kaggle_api_needed": false,
71 |     "is_training_needed": true,
72 |     "is_web_navigation_needed": false,
73 |     "executed_successfully": false,
74 |     "satisfied_all_requirements": false,
75 |     "satisfied_all_preferences": false
76 | }
77 | 


--------------------------------------------------------------------------------
/benchmark/judgment/MetaGPT/human_as_a_judge/07_Image_Super_Resolution_SRCNN_Set5_DL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "07_Image_Super_Resolution_SRCNN_Set5_DL",
 3 |     "query": "Hi, I need to create a project for image super-resolution using the SRCNN model with the Set5 dataset (available from `https://huggingface.co/datasets/eugenesiow/Set5`). Load the dataset in `src/data_loader.py`. When loading the data, include image preprocessing steps such as resizing and normalization, all in `src/data_loader.py`. The SRCNN model should be loaded and used in `src/model.py`. Save 5 sets of comparison images, zooming in on details, as `results/figures/super_resolution_compare.png`, and the super-resolution results as `results/figures/super_resolution_results.png`.  The generated images should be high-quality and clearly show improvements.",
 4 |     "tags": [
 5 |         "Computer Vision",
 6 |         "Generative Models"
 7 |     ],
 8 |     "requirements": [
 9 |         {
10 |             "requirement_id": 0,
11 |             "prerequisites": [],
12 |             "criteria": "The \"Set5\" dataset (available from \"Hugging Face\") is loaded in `src/data_loader.py`.",
13 |             "category": "Dataset or Environment",
14 |             "satisfied": false
15 |         },
16 |         {
17 |             "requirement_id": 1,
18 |             "prerequisites": [
19 |                 0
20 |             ],
21 |             "criteria": "Image preprocessing, including resizing and normalization, is performed in `src/data_loader.py`.",
22 |             "category": "Data preprocessing and postprocessing",
23 |             "satisfied": false
24 |         },
25 |         {
26 |             "requirement_id": 2,
27 |             "prerequisites": [],
28 |             "criteria": "The \"SRCNN\" model is used in `src/model.py`.",
29 |             "category": "Machine Learning Method",
30 |             "satisfied": false
31 |         },
32 |         {
33 |             "requirement_id": 3,
34 |             "prerequisites": [
35 |                 0,
36 |                 1,
37 |                 2
38 |             ],
39 |             "criteria": "Five sets of comparison images are saved, with details zoomed in, and saved as `results/figures/super_resolution_compare.png`.",
40 |             "category": "Visualization",
41 |             "satisfied": false
42 |         },
43 |         {
44 |             "requirement_id": 4,
45 |             "prerequisites": [
46 |                 0,
47 |                 1,
48 |                 2
49 |             ],
50 |             "criteria": "Super-resolution results are saved as `results/figures/super_resolution_results.png`.",
51 |             "category": "Visualization",
52 |             "satisfied": false
53 |         }
54 |     ],
55 |     "preferences": [
56 |         {
57 |             "preference_id": 0,
58 |             "criteria": "The project should generate high-quality, clear super-resolution images with detailed comparisons.",
59 |             "satisfied": null
60 |         },
61 |         {
62 |             "preference_id": 1,
63 |             "criteria": "Well-organized output images, highlighting key improvements, should be included.",
64 |             "satisfied": null
65 |         }
66 |     ],
67 |     "is_kaggle_api_needed": false,
68 |     "is_training_needed": true,
69 |     "is_web_navigation_needed": true,
70 |     "executed_successfully": false,
71 |     "satisfied_all_requirements": false,
72 |     "satisfied_all_preferences": false
73 | }
74 | 


--------------------------------------------------------------------------------
/benchmark/judgment/MetaGPT/human_as_a_judge/51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images",
 3 |     "query": "Hi! Please follow the instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) to set up the script mentioned for generating images with hidden text in `src/visualize.py`. Ensure the generated images are of 1080p resolution and saved in `results/figures/`. Create control images embedding the text \"FUTURE\" and save it in `results/figures/`. Please also manually verify that the hidden text is indeed embedded in the generated images.",
 4 |     "tags": [
 5 |         "Computer Vision",
 6 |         "Generative Models",
 7 |         "Natural Language Processing"
 8 |     ],
 9 |     "requirements": [
10 |         {
11 |             "requirement_id": 0,
12 |             "prerequisites": [],
13 |             "criteria": "The instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) are followed to set up the script mentioned for generating images with hidden text in `src/visualize.py`.",
14 |             "category": "Dataset or Environment",
15 |             "satisfied": false
16 |         },
17 |         {
18 |             "requirement_id": 1,
19 |             "prerequisites": [
20 |                 0
21 |             ],
22 |             "criteria": "The generated images are ensured to be of 1080p resolution and saved in results/figures/.",
23 |             "category": "Data preprocessing and postprocessing",
24 |             "satisfied": false
25 |         },
26 |         {
27 |             "requirement_id": 2,
28 |             "prerequisites": [
29 |                 0,
30 |                 1
31 |             ],
32 |             "criteria": "Control images embedding the text \"FUTURE,\" is created and saved in results/figures/.",
33 |             "category": "Visualization",
34 |             "satisfied": false
35 |         }
36 |     ],
37 |     "preferences": [
38 |         {
39 |             "preference_id": 0,
40 |             "criteria": "The system should be capable of learning and using unfamiliar technologies, adapting to new tools or platforms as required.",
41 |             "satisfied": null
42 |         },
43 |         {
44 |             "preference_id": 1,
45 |             "criteria": "After reviewing the blog post, ControlNet should be successfully run on Modal to produce the images with the concealed messages for user.",
46 |             "satisfied": null
47 |         }
48 |     ],
49 |     "is_kaggle_api_needed": false,
50 |     "is_training_needed": false,
51 |     "is_web_navigation_needed": true,
52 |     "resource": "https://www.cognition.ai/blog/introducing-devin",
53 |     "executed_successfully": true,
54 |     "satisfied_all_requirements": false,
55 |     "satisfied_all_preferences": false
56 | }
57 | 


--------------------------------------------------------------------------------
/benchmark/judgment/MetaGPT/human_as_a_judge/53_Devin_Upwork_Side_Hustle.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "53_Devin_Upwork_Side_Hustle",
 3 |     "query": "Hello, I am looking to make inferences with the models in this repository `https://github.com/mahdi65/roadDamageDetection2020`. The system should perform inferences using the models from the repository and save the results in `models/saved_models/`. Sample data should be downloaded and prepared for testing the models in `src/data_loader.py`. Inference should be performed using the provided models on the sample data in `models/saved_models/`. Visualized images showing the detections made by the models should be generated and saved in the `results/figures/` directory. Also, a performance report based on the model's detection results should be generated and saved as `results/metrics/model_performance_report.txt`.",
 4 |     "tags": [
 5 |         "Computer Vision"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The repository at `https://github.com/mahdi65/roadDamageDetection2020` is set up.",
12 |             "category": "Machine Learning Method",
13 |             "satisfied": true
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [
18 |                 0
19 |             ],
20 |             "criteria": "Sample data is downloaded and prepared for testing the models in `src/data_loader.py`.",
21 |             "category": "Dataset or Environment",
22 |             "satisfied": false
23 |         },
24 |         {
25 |             "requirement_id": 2,
26 |             "prerequisites": [
27 |                 0,
28 |                 1
29 |             ],
30 |             "criteria": "Inference is performed using the provided models on the sample data in `models/saved_models/`.",
31 |             "category": "Other",
32 |             "satisfied": false
33 |         },
34 |         {
35 |             "requirement_id": 3,
36 |             "prerequisites": [
37 |                 0,
38 |                 1,
39 |                 2
40 |             ],
41 |             "criteria": "Visualized images showing the detections made by the models are generated and saved in the `results/figures/` directory.",
42 |             "category": "Visualization",
43 |             "satisfied": false
44 |         },
45 |         {
46 |             "requirement_id": 4,
47 |             "prerequisites": [
48 |                 0,
49 |                 1,
50 |                 2,
51 |                 3
52 |             ],
53 |             "criteria": "A performance report based on the model's detection results is generated and saved as `results/metrics/model_performance_report.txt`.",
54 |             "category": "Performance Metrics",
55 |             "satisfied": false
56 |         }
57 |     ],
58 |     "preferences": [
59 |         {
60 |             "preference_id": 0,
61 |             "criteria": "The visualized images should be clear, with detections accurately highlighted for easy interpretation.",
62 |             "satisfied": null
63 |         },
64 |         {
65 |             "preference_id": 1,
66 |             "criteria": "The performance report should include a summary of detection accuracy and any issues encountered during inference.",
67 |             "satisfied": null
68 |         }
69 |     ],
70 |     "is_kaggle_api_needed": false,
71 |     "is_training_needed": false,
72 |     "is_web_navigation_needed": true,
73 |     "resource": "https://www.cognition.ai/blog/introducing-devin",
74 |     "executed_successfully": false,
75 |     "satisfied_all_requirements": false,
76 |     "satisfied_all_preferences": false
77 | }
78 | 


--------------------------------------------------------------------------------
/benchmark/judgment/OpenHands/agent_as_a_judge/gray_box/02_Maze_Solver_Q_Learning_Gridworld_RL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "02_Maze_Solver_Q_Learning_Gridworld_RL",
 3 |     "query": "Can you help me create a system to solve maze-style Gridworld tasks using the Q-learning algorithm? The system should use numpy to make the core calculations more efficient and matplotlib for visualizations. The Q-learning algorithm should be implemented in `src/train.py`, and the aptly-named Gridworld environment should be implemented in `src/env.py` in such a way that one could specific the grid size and start/end positions when instantiating it. The system needs to record the learning curve during training, tracking episodes and their corresponding returns, and save it as `results/figures/learning_curve.png`. Additionally, I'd like you to visualize and save the paths taken by the agent in each episode in a file called `results/figures/path_changes.gif`, and save the trained model as `models/saved_models/q_learning_model.npy`. It would be great to have some form of real-time feedback during training, like seeing the progress or getting updates on how the model is learning. Also, if you can, please try and write the code in a way that's easy to modify or extend later on.",
 4 |     "tags": [
 5 |         "Reinforcement Learning"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The \"Q-learning\" algorithm is used in `src/train.py`.",
12 |             "category": "Machine Learning Method",
13 |             "satisfied": true
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [],
18 |             "criteria": "The \"Gridworld\" environment is defined in `src/env.py` with the ability for a user to specify a grid size and start/end positions.",
19 |             "category": "Dataset or Environment",
20 |             "satisfied": true
21 |         },
22 |         {
23 |             "requirement_id": 2,
24 |             "prerequisites": [
25 |                 0,
26 |                 1
27 |             ],
28 |             "criteria": "Learning curves are recorded during training, and saved as `results/figures/learning_curve.png`. Episodes and returns are recorded.",
29 |             "category": "Visualization",
30 |             "satisfied": false
31 |         },
32 |         {
33 |             "requirement_id": 3,
34 |             "prerequisites": [
35 |                 0,
36 |                 1,
37 |                 2
38 |             ],
39 |             "criteria": "The learned model is saved as `models/saved_models/q_learning_model.npy`.",
40 |             "category": "Save Trained Model",
41 |             "satisfied": false
42 |         },
43 |         {
44 |             "requirement_id": 4,
45 |             "prerequisites": [
46 |                 0,
47 |                 1
48 |             ],
49 |             "criteria": "Paths taken during learning are visualized and saved as `results/figures/path_changes.gif`.",
50 |             "category": "Visualization",
51 |             "satisfied": false
52 |         }
53 |     ],
54 |     "preferences": [
55 |         {
56 |             "preference_id": 0,
57 |             "criteria": "Some real-time progress or feedback during the training process should be displayed.",
58 |             "satisfied": null
59 |         },
60 |         {
61 |             "preference_id": 1,
62 |             "criteria": "The code should be written in a way that's easy to modify or extend later on.",
63 |             "satisfied": null
64 |         }
65 |     ],
66 |     "is_kaggle_api_needed": false,
67 |     "is_training_needed": true,
68 |     "is_web_navigation_needed": false
69 | }


--------------------------------------------------------------------------------
/benchmark/judgment/OpenHands/agent_as_a_judge/gray_box/03_Text_Classification_NaiveBayes_20Newsgroups_ML.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "03_Text_Classification_NaiveBayes_20Newsgroups_ML",
 3 |     "query": "Please implement a Naive Bayes classifier for the 20 Newsgroups dataset and save it in a file called `src/model.py`. The dataset should loaded in `src/data_loader.py`. The program should handle data preprocessing, including removing stop words, punctuation, and special characters. Show the improvement of your classifier by generating word clouds before and after training your classifier and saving them as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`. Please calculate and include TF-IDF features when loading the data in `src/data_loader.py`. Lastly, print out a performance report (including precision, recall, and F1-score) and save it as `results/metrics/performance.txt`. The model should be straightforward to interpret, and the final report should be structured clearly for easy review.",
 4 |     "tags": [
 5 |         "Classification",
 6 |         "Natural Language Processing",
 7 |         "Supervised Learning"
 8 |     ],
 9 |     "requirements": [
10 |         {
11 |             "requirement_id": 0,
12 |             "prerequisites": [],
13 |             "criteria": "The \"20 Newsgroups\" dataset is used in `src/data_loader.py`.",
14 |             "category": "Dataset or Environment",
15 |             "satisfied": true
16 |         },
17 |         {
18 |             "requirement_id": 1,
19 |             "prerequisites": [
20 |                 0
21 |             ],
22 |             "criteria": "Data preprocessing is performed, including removing stop words, punctuation, and special characters. Word clouds are visualized before and after training the classifier, and saved as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`.",
23 |             "category": "Data preprocessing and postprocessing",
24 |             "satisfied": false
25 |         },
26 |         {
27 |             "requirement_id": 2,
28 |             "prerequisites": [
29 |                 0,
30 |                 1
31 |             ],
32 |             "criteria": "\"TF-IDF\" features are used when loading the data in `src/data_loader.py`.",
33 |             "category": "Data preprocessing and postprocessing",
34 |             "satisfied": true
35 |         },
36 |         {
37 |             "requirement_id": 3,
38 |             "prerequisites": [],
39 |             "criteria": "A \"Naive Bayes classifier\" is implemented in `src/model.py`.",
40 |             "category": "Machine Learning Method",
41 |             "satisfied": true
42 |         },
43 |         {
44 |             "requirement_id": 4,
45 |             "prerequisites": [
46 |                 0,
47 |                 1,
48 |                 2
49 |             ],
50 |             "criteria": "A performance report, including \"precision,\" \"recall,\" and the \"F1-score,\" is printed and saved as `results/metrics/performance.txt`.",
51 |             "category": "Performance Metrics",
52 |             "satisfied": false
53 |         }
54 |     ],
55 |     "preferences": [
56 |         {
57 |             "preference_id": 0,
58 |             "criteria": "The model should be straightforward to interpret.",
59 |             "satisfied": null
60 |         },
61 |         {
62 |             "preference_id": 1,
63 |             "criteria": "The final report should be structured clearly for easy review.",
64 |             "satisfied": null
65 |         }
66 |     ],
67 |     "is_kaggle_api_needed": false,
68 |     "is_training_needed": true,
69 |     "is_web_navigation_needed": false
70 | }


--------------------------------------------------------------------------------
/benchmark/judgment/OpenHands/agent_as_a_judge/gray_box/05_Game_Simulation_DQN_CartPole_v1_RL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "05_Game_Simulation_DQN_CartPole_v1_RL",
 3 |     "query": "Create a reliable system to train a DQN agent using PyTorch in an OpenAI Gym environment such as CartPole-v1. Implement DQN using PyTorch in `src/model.py`. The environment should be instantiated in the main file, `src/main.py`. Handle any dimension mismatches between DQN and the environment (like would happen if the Gym environment ID were changed) by logging an error. Also please log the return after each episode to `results/metrics/` and update it after each new episode finishes. Save the final return over episode curve as `results/figures/return_over_time.png` and the final trained model as `models/saved_models/dqn_model.pt`. If possible, add annotations to the return over episode curve to showcase key milestones.",
 4 |     "tags": [
 5 |         "Reinforcement Learning"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The \"DQN\" algorithm is implemented using PyTorch and saved in `src/model.py`.",
12 |             "category": "Machine Learning Method",
13 |             "satisfied": true
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [],
18 |             "criteria": "An \"OpenAI Gym\" environment is instantiated in `src/main.py`.",
19 |             "category": "Dataset or Environment",
20 |             "satisfied": true
21 |         },
22 |         {
23 |             "requirement_id": 2,
24 |             "prerequisites": [
25 |                 0,
26 |                 1
27 |             ],
28 |             "criteria": "The return over time curve is plotted, recording the return of each episode, and saved as `results/figures/reward_over_time.png`.",
29 |             "category": "Visualization",
30 |             "satisfied": false
31 |         },
32 |         {
33 |             "requirement_id": 3,
34 |             "prerequisites": [
35 |                 0,
36 |                 1
37 |             ],
38 |             "criteria": "The trained model is saved as `models/saved_models/dqn_model.pt`.",
39 |             "category": "Save Trained Model",
40 |             "satisfied": false
41 |         },
42 |         {
43 |             "requirement_id": 4,
44 |             "prerequisites": [
45 |                 0,
46 |                 1
47 |             ],
48 |             "criteria": "The return for each episode is logged to `results/metrics/` and updated after each episode finishes.",
49 |             "category": "Performance Metrics",
50 |             "satisfied": false
51 |         }
52 |     ],
53 |     "preferences": [
54 |         {
55 |             "preference_id": 0,
56 |             "criteria": "The system should handle dimension mismatches, logging the issues for easy debugging.",
57 |             "satisfied": null
58 |         },
59 |         {
60 |             "preference_id": 1,
61 |             "criteria": "The return over episode curve has key milestones annotated.",
62 |             "satisfied": null
63 |         }
64 |     ],
65 |     "is_kaggle_api_needed": false,
66 |     "is_training_needed": true,
67 |     "is_web_navigation_needed": false
68 | }


--------------------------------------------------------------------------------
/benchmark/judgment/OpenHands/agent_as_a_judge/gray_box/06_Sentiment_Analysis_SVM_Sentiment140_ML.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "06_Sentiment_Analysis_SVM_Sentiment140_ML",
 3 |     "query": "Please help me build a system for sentiment analysis on tweets using the Sentiment140 dataset available from Hugging Face. Load the Sentiment140 dataset and, when loading the data, clean it by removing stop words, punctuation, and special characters, all in `src/data_loader.py`. Use Word2Vec or GloVe for text vectorization. This should occur in `src/data_loader.py`. Next, implement and train an SVM classifier in `src/model.py`. Finally, write a report of the accuracy of the classifier to `results/metrics/accuracy_score.txt`. Ideally, the report should be easily interpretable.",
 4 |     "tags": [
 5 |         "Natural Language Processing",
 6 |         "Supervised Learning"
 7 |     ],
 8 |     "requirements": [
 9 |         {
10 |             "requirement_id": 0,
11 |             "prerequisites": [],
12 |             "criteria": "The \"Sentiment140\" dataset, available from \"Hugging Face,\" is obtained in `src/data_loader.py`.",
13 |             "category": "Dataset or Environment",
14 |             "satisfied": true
15 |         },
16 |         {
17 |             "requirement_id": 1,
18 |             "prerequisites": [
19 |                 0
20 |             ],
21 |             "criteria": "The dataset is cleaned, including by removing stop words, punctuation, and special characters, all in `src/data_loader.py`.",
22 |             "category": "Data preprocessing and postprocessing",
23 |             "satisfied": true
24 |         },
25 |         {
26 |             "requirement_id": 2,
27 |             "prerequisites": [
28 |                 0,
29 |                 1
30 |             ],
31 |             "criteria": "Word embeddings, either \"Word2Vec\" or \"GloVe,\" are used to convert text to vectors in `src/data_loader.py`.",
32 |             "category": "Machine Learning Method",
33 |             "satisfied": false
34 |         },
35 |         {
36 |             "requirement_id": 3,
37 |             "prerequisites": [
38 |                 0,
39 |                 1,
40 |                 2
41 |             ],
42 |             "criteria": "An \"SVM classifier\" is implemented and trained in `src/model.py`.",
43 |             "category": "Machine Learning Method",
44 |             "satisfied": true
45 |         },
46 |         {
47 |             "requirement_id": 4,
48 |             "prerequisites": [
49 |                 1,
50 |                 2,
51 |                 3
52 |             ],
53 |             "criteria": "The accuracy score is printed and saved as `results/metrics/accuracy_score.txt`.",
54 |             "category": "Performance Metrics",
55 |             "satisfied": false
56 |         }
57 |     ],
58 |     "preferences": [
59 |         {
60 |             "preference_id": 0,
61 |             "criteria": "The dataset download process should be reliable, with clear error handling.",
62 |             "satisfied": null
63 |         },
64 |         {
65 |             "preference_id": 1,
66 |             "criteria": "The final accuracy report should be straightforward and easy to interpret.",
67 |             "satisfied": null
68 |         }
69 |     ],
70 |     "is_kaggle_api_needed": false,
71 |     "is_training_needed": true,
72 |     "is_web_navigation_needed": false
73 | }


--------------------------------------------------------------------------------
/benchmark/judgment/OpenHands/agent_as_a_judge/gray_box/07_Image_Super_Resolution_SRCNN_Set5_DL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "07_Image_Super_Resolution_SRCNN_Set5_DL",
 3 |     "query": "Hi, I need to create a project for image super-resolution using the SRCNN model with the Set5 dataset (available from `https://huggingface.co/datasets/eugenesiow/Set5`). Load the dataset in `src/data_loader.py`. When loading the data, include image preprocessing steps such as resizing and normalization, all in `src/data_loader.py`. The SRCNN model should be loaded and used in `src/model.py`. Save 5 sets of comparison images, zooming in on details, as `results/figures/super_resolution_compare.png`, and the super-resolution results as `results/figures/super_resolution_results.png`.  The generated images should be high-quality and clearly show improvements.",
 4 |     "tags": [
 5 |         "Computer Vision",
 6 |         "Generative Models"
 7 |     ],
 8 |     "requirements": [
 9 |         {
10 |             "requirement_id": 0,
11 |             "prerequisites": [],
12 |             "criteria": "The \"Set5\" dataset (available from \"Hugging Face\") is loaded in `src/data_loader.py`.",
13 |             "category": "Dataset or Environment",
14 |             "satisfied": true
15 |         },
16 |         {
17 |             "requirement_id": 1,
18 |             "prerequisites": [
19 |                 0
20 |             ],
21 |             "criteria": "Image preprocessing, including resizing and normalization, is performed in `src/data_loader.py`.",
22 |             "category": "Data preprocessing and postprocessing",
23 |             "satisfied": false
24 |         },
25 |         {
26 |             "requirement_id": 2,
27 |             "prerequisites": [],
28 |             "criteria": "The \"SRCNN\" model is used in `src/model.py`.",
29 |             "category": "Machine Learning Method",
30 |             "satisfied": true
31 |         },
32 |         {
33 |             "requirement_id": 3,
34 |             "prerequisites": [
35 |                 0,
36 |                 1,
37 |                 2
38 |             ],
39 |             "criteria": "Five sets of comparison images are saved, with details zoomed in, and saved as `results/figures/super_resolution_compare.png`.",
40 |             "category": "Visualization",
41 |             "satisfied": false
42 |         },
43 |         {
44 |             "requirement_id": 4,
45 |             "prerequisites": [
46 |                 0,
47 |                 1,
48 |                 2
49 |             ],
50 |             "criteria": "Super-resolution results are saved as `results/figures/super_resolution_results.png`.",
51 |             "category": "Visualization",
52 |             "satisfied": false
53 |         }
54 |     ],
55 |     "preferences": [
56 |         {
57 |             "preference_id": 0,
58 |             "criteria": "The project should generate high-quality, clear super-resolution images with detailed comparisons.",
59 |             "satisfied": null
60 |         },
61 |         {
62 |             "preference_id": 1,
63 |             "criteria": "Well-organized output images, highlighting key improvements, should be included.",
64 |             "satisfied": null
65 |         }
66 |     ],
67 |     "is_kaggle_api_needed": false,
68 |     "is_training_needed": true,
69 |     "is_web_navigation_needed": true
70 | }


--------------------------------------------------------------------------------
/benchmark/judgment/OpenHands/agent_as_a_judge/gray_box/08_Robot_Control_PPO_PyBullet_RL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "08_Robot_Control_PPO_PyBullet_RL",
 3 |     "query": "I am seeking to implement a project which explores robotic arm control via reinforcement learning in the PyBullet simulation environment with the PPO algorithm. The PyBullet simulator should be imported and a related robotics environment should be loaded in `src/env.py`. The PPO algorithm should be implemented in `src/train.py`. The project should meticulously document the robot's final position, printing and saving it as `data/final_position.txt`. The training return trajectory should be graphed and saved as `results/figures/training_returns.png`. A sample of the robot's motion should be visualized and saved as `results/figures/robot_motion.gif`. A detailed environment setup and reward structure description should be provided in `src/env.py`. Please ensure that any issues with loading URDF files in PyBullet are clearly handled and documented, providing clear error messages or logging for debugging.",
 4 |     "tags": [
 5 |         "Reinforcement Learning"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The \"PyBullet\" simulator is used in `src/env.py`.",
12 |             "category": "Dataset or Environment",
13 |             "satisfied": true
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [],
18 |             "criteria": "The \"PPO\" algorithm is used in `src/train.py`.",
19 |             "category": "Machine Learning Method",
20 |             "satisfied": true
21 |         },
22 |         {
23 |             "requirement_id": 2,
24 |             "prerequisites": [
25 |                 0
26 |             ],
27 |             "criteria": "A detailed environment setup and reward structure description is provided in `src/env.py`.",
28 |             "category": "Dataset or Environment",
29 |             "satisfied": false
30 |         },
31 |         {
32 |             "requirement_id": 3,
33 |             "prerequisites": [
34 |                 0,
35 |                 1,
36 |                 2
37 |             ],
38 |             "criteria": "The robot's final position is printed and saved as `data/final_position.txt`.",
39 |             "category": "Other",
40 |             "satisfied": false
41 |         },
42 |         {
43 |             "requirement_id": 4,
44 |             "prerequisites": [
45 |                 0,
46 |                 1,
47 |                 2
48 |             ],
49 |             "criteria": "The training returns over time curve is recorded and saved as `results/figures/training_returns.png`.",
50 |             "category": "Visualization",
51 |             "satisfied": false
52 |         },
53 |         {
54 |             "requirement_id": 5,
55 |             "prerequisites": [
56 |                 0,
57 |                 1,
58 |                 2
59 |             ],
60 |             "criteria": "A sample of the robot's motion is visualized and saved as `results/figures/robot_motion.gif`.",
61 |             "category": "Visualization",
62 |             "satisfied": false
63 |         }
64 |     ],
65 |     "preferences": [
66 |         {
67 |             "preference_id": 0,
68 |             "criteria": "The system should effectively handle potential issues with loading URDF files in PyBullet, providing clear error messages or logging for debugging.",
69 |             "satisfied": null
70 |         }
71 |     ],
72 |     "is_kaggle_api_needed": false,
73 |     "is_training_needed": true,
74 |     "is_web_navigation_needed": false
75 | }


--------------------------------------------------------------------------------
/benchmark/judgment/OpenHands/agent_as_a_judge/gray_box/51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images",
 3 |     "query": "Hi! Please follow the instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) to set up the script mentioned for generating images with hidden text in `src/visualize.py`. Ensure the generated images are of 1080p resolution and saved in `results/figures/`. Create control images embedding the text \"FUTURE\" and save it in `results/figures/`. Please also manually verify that the hidden text is indeed embedded in the generated images.",
 4 |     "tags": [
 5 |         "Computer Vision",
 6 |         "Generative Models",
 7 |         "Natural Language Processing"
 8 |     ],
 9 |     "requirements": [
10 |         {
11 |             "requirement_id": 0,
12 |             "prerequisites": [],
13 |             "criteria": "The instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) are followed to set up the script mentioned for generating images with hidden text in `src/visualize.py`.",
14 |             "category": "Dataset or Environment",
15 |             "satisfied": true
16 |         },
17 |         {
18 |             "requirement_id": 1,
19 |             "prerequisites": [
20 |                 0
21 |             ],
22 |             "criteria": "The generated images are ensured to be of 1080p resolution and saved in results/figures/.",
23 |             "category": "Data preprocessing and postprocessing",
24 |             "satisfied": false
25 |         },
26 |         {
27 |             "requirement_id": 2,
28 |             "prerequisites": [
29 |                 0,
30 |                 1
31 |             ],
32 |             "criteria": "Control images embedding the text \"FUTURE,\" is created and saved in results/figures/.",
33 |             "category": "Visualization",
34 |             "satisfied": true
35 |         }
36 |     ],
37 |     "preferences": [
38 |         {
39 |             "preference_id": 0,
40 |             "criteria": "The system should be capable of learning and using unfamiliar technologies, adapting to new tools or platforms as required.",
41 |             "satisfied": null
42 |         },
43 |         {
44 |             "preference_id": 1,
45 |             "criteria": "After reviewing the blog post, ControlNet should be successfully run on Modal to produce the images with the concealed messages for user.",
46 |             "satisfied": null
47 |         }
48 |     ],
49 |     "is_kaggle_api_needed": false,
50 |     "is_training_needed": false,
51 |     "is_web_navigation_needed": true,
52 |     "resource": "https://www.cognition.ai/blog/introducing-devin"
53 | }


--------------------------------------------------------------------------------
/benchmark/judgment/OpenHands/agent_as_a_judge/gray_box/52_Devin_AI_Trains_an_AI.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "52_Devin_AI_Trains_an_AI",
 3 |     "query": "Can you finetune a 7B LLaMA model using `https://github.com/artidoro/qlora`? Follow the instructions in the repository to finetune the 7B LLaMA model and save it in models/saved_models/. Ensure the necessary environment and dependencies are set up as outlined in `src/env.py`. Download and prepare the datasets required for finetuning the model as specified in `src/data_loader.py`. Complete the finetuning process, ensuring all configurations are properly set in accordance with qlora. Save the finetuned model and training summary, storing them in the specified directory as results/metrics/finetuning_summary.txt.",
 4 |     "tags": [
 5 |         "Generative Models",
 6 |         "Natural Language Processing"
 7 |     ],
 8 |     "requirements": [
 9 |         {
10 |             "requirement_id": 0,
11 |             "prerequisites": [],
12 |             "criteria": "The instructions in the repository at `https://github.com/artidoro/qlora` are followed to finetune a \"7B LLaMA\" model and save it in models/saved_models/.",
13 |             "category": "Machine Learning Method",
14 |             "satisfied": false
15 |         },
16 |         {
17 |             "requirement_id": 1,
18 |             "prerequisites": [
19 |                 0
20 |             ],
21 |             "criteria": "The necessary environment and dependencies are set up as outlined in the repository `src/env.py`.",
22 |             "category": "Dataset or Environment",
23 |             "satisfied": false
24 |         },
25 |         {
26 |             "requirement_id": 2,
27 |             "prerequisites": [
28 |                 0,
29 |                 1
30 |             ],
31 |             "criteria": "The datasets required for finetuning the model are downloaded and prepared in `src/data_loader.py`.",
32 |             "category": "Dataset or Environment",
33 |             "satisfied": false
34 |         },
35 |         {
36 |             "requirement_id": 3,
37 |             "prerequisites": [
38 |                 0,
39 |                 1,
40 |                 2
41 |             ],
42 |             "criteria": "The finetuning process is completed, ensuring all configurations are properly set in accordance with \"qlora.\"",
43 |             "category": "Machine Learning Method",
44 |             "satisfied": false
45 |         },
46 |         {
47 |             "requirement_id": 4,
48 |             "prerequisites": [
49 |                 0,
50 |                 1,
51 |                 2,
52 |                 3
53 |             ],
54 |             "criteria": "The finetuned model and training summary are saved, storing them in the specified directory as `results/metrics/finetuning_summary.txt`.",
55 |             "category": "Save Trained Model",
56 |             "satisfied": false
57 |         }
58 |     ],
59 |     "preferences": [
60 |         {
61 |             "preference_id": 0,
62 |             "criteria": "The finetuning process should include validation steps to monitor overfitting or other issues.",
63 |             "satisfied": null
64 |         },
65 |         {
66 |             "preference_id": 1,
67 |             "criteria": "A detailed report on the finetuning process, including any challenges faced and how they were overcome, should be generated and saved as `results/finetuning_summary.txt`.",
68 |             "satisfied": null
69 |         }
70 |     ],
71 |     "is_kaggle_api_needed": false,
72 |     "is_training_needed": true,
73 |     "is_web_navigation_needed": true,
74 |     "resource": "https://www.cognition.ai/blog/introducing-devin"
75 | }


--------------------------------------------------------------------------------
/benchmark/judgment/OpenHands/agent_as_a_judge/gray_box/53_Devin_Upwork_Side_Hustle.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "53_Devin_Upwork_Side_Hustle",
 3 |     "query": "Hello, I am looking to make inferences with the models in this repository `https://github.com/mahdi65/roadDamageDetection2020`. The system should perform inferences using the models from the repository and save the results in `models/saved_models/`. Sample data should be downloaded and prepared for testing the models in `src/data_loader.py`. Inference should be performed using the provided models on the sample data in `models/saved_models/`. Visualized images showing the detections made by the models should be generated and saved in the `results/figures/` directory. Also, a performance report based on the model's detection results should be generated and saved as `results/metrics/model_performance_report.txt`.",
 4 |     "tags": [
 5 |         "Computer Vision"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The repository at `https://github.com/mahdi65/roadDamageDetection2020` is set up.",
12 |             "category": "Machine Learning Method",
13 |             "satisfied": true
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [
18 |                 0
19 |             ],
20 |             "criteria": "Sample data is downloaded and prepared for testing the models in `src/data_loader.py`.",
21 |             "category": "Dataset or Environment",
22 |             "satisfied": false
23 |         },
24 |         {
25 |             "requirement_id": 2,
26 |             "prerequisites": [
27 |                 0,
28 |                 1
29 |             ],
30 |             "criteria": "Inference is performed using the provided models on the sample data in `models/saved_models/`.",
31 |             "category": "Other",
32 |             "satisfied": false
33 |         },
34 |         {
35 |             "requirement_id": 3,
36 |             "prerequisites": [
37 |                 0,
38 |                 1,
39 |                 2
40 |             ],
41 |             "criteria": "Visualized images showing the detections made by the models are generated and saved in the `results/figures/` directory.",
42 |             "category": "Visualization",
43 |             "satisfied": false
44 |         },
45 |         {
46 |             "requirement_id": 4,
47 |             "prerequisites": [
48 |                 0,
49 |                 1,
50 |                 2,
51 |                 3
52 |             ],
53 |             "criteria": "A performance report based on the model's detection results is generated and saved as `results/metrics/model_performance_report.txt`.",
54 |             "category": "Performance Metrics",
55 |             "satisfied": false
56 |         }
57 |     ],
58 |     "preferences": [
59 |         {
60 |             "preference_id": 0,
61 |             "criteria": "The visualized images should be clear, with detections accurately highlighted for easy interpretation.",
62 |             "satisfied": null
63 |         },
64 |         {
65 |             "preference_id": 1,
66 |             "criteria": "The performance report should include a summary of detection accuracy and any issues encountered during inference.",
67 |             "satisfied": null
68 |         }
69 |     ],
70 |     "is_kaggle_api_needed": false,
71 |     "is_training_needed": false,
72 |     "is_web_navigation_needed": true,
73 |     "resource": "https://www.cognition.ai/blog/introducing-devin"
74 | }


--------------------------------------------------------------------------------
/benchmark/judgment/OpenHands/human_as_a_judge/05_Game_Simulation_DQN_CartPole_v1_RL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "05_Game_Simulation_DQN_CartPole_v1_RL",
 3 |     "query": "Create a reliable system to train a DQN agent using PyTorch in an OpenAI Gym environment such as CartPole-v1. Implement DQN using PyTorch in `src/model.py`. The environment should be instantiated in the main file, `src/main.py`. Handle any dimension mismatches between DQN and the environment (like would happen if the Gym environment ID were changed) by logging an error. Also please log the return after each episode to `results/metrics/` and update it after each new episode finishes. Save the final return over episode curve as `results/figures/return_over_time.png` and the final trained model as `models/saved_models/dqn_model.pt`. If possible, add annotations to the return over episode curve to showcase key milestones.",
 4 |     "tags": [
 5 |         "Reinforcement Learning"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The \"DQN\" algorithm is implemented using PyTorch and saved in `src/model.py`.",
12 |             "category": "Machine Learning Method",
13 |             "satisfied": true
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [],
18 |             "criteria": "An \"OpenAI Gym\" environment is instantiated in `src/main.py`.",
19 |             "category": "Dataset or Environment",
20 |             "satisfied": true
21 |         },
22 |         {
23 |             "requirement_id": 2,
24 |             "prerequisites": [
25 |                 0,
26 |                 1
27 |             ],
28 |             "criteria": "The return over time curve is plotted, recording the return of each episode, and saved as `results/figures/reward_over_time.png`.",
29 |             "category": "Visualization",
30 |             "satisfied": false
31 |         },
32 |         {
33 |             "requirement_id": 3,
34 |             "prerequisites": [
35 |                 0,
36 |                 1
37 |             ],
38 |             "criteria": "The trained model is saved as `models/saved_models/dqn_model.pt`.",
39 |             "category": "Save Trained Model",
40 |             "satisfied": false
41 |         },
42 |         {
43 |             "requirement_id": 4,
44 |             "prerequisites": [
45 |                 0,
46 |                 1
47 |             ],
48 |             "criteria": "The return for each episode is logged to `results/metrics/` and updated after each episode finishes.",
49 |             "category": "Performance Metrics",
50 |             "satisfied": true
51 |         }
52 |     ],
53 |     "preferences": [
54 |         {
55 |             "preference_id": 0,
56 |             "criteria": "The system should handle dimension mismatches, logging the issues for easy debugging.",
57 |             "satisfied": null
58 |         },
59 |         {
60 |             "preference_id": 1,
61 |             "criteria": "The return over episode curve has key milestones annotated.",
62 |             "satisfied": null
63 |         }
64 |     ],
65 |     "is_kaggle_api_needed": false,
66 |     "is_training_needed": true,
67 |     "is_web_navigation_needed": false,
68 |     "executed_successfully": false,
69 |     "satisfied_all_requirements": false,
70 |     "satisfied_all_preferences": false
71 | }
72 | 


--------------------------------------------------------------------------------
/benchmark/judgment/OpenHands/human_as_a_judge/06_Sentiment_Analysis_SVM_Sentiment140_ML.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "06_Sentiment_Analysis_SVM_Sentiment140_ML",
 3 |     "query": "Please help me build a system for sentiment analysis on tweets using the Sentiment140 dataset available from Hugging Face. Load the Sentiment140 dataset and, when loading the data, clean it by removing stop words, punctuation, and special characters, all in `src/data_loader.py`. Use Word2Vec or GloVe for text vectorization. This should occur in `src/data_loader.py`. Next, implement and train an SVM classifier in `src/model.py`. Finally, write a report of the accuracy of the classifier to `results/metrics/accuracy_score.txt`. Ideally, the report should be easily interpretable.",
 4 |     "tags": [
 5 |         "Natural Language Processing",
 6 |         "Supervised Learning"
 7 |     ],
 8 |     "requirements": [
 9 |         {
10 |             "requirement_id": 0,
11 |             "prerequisites": [],
12 |             "criteria": "The \"Sentiment140\" dataset, available from \"Hugging Face,\" is obtained in `src/data_loader.py`.",
13 |             "category": "Dataset or Environment",
14 |             "satisfied": false
15 |         },
16 |         {
17 |             "requirement_id": 1,
18 |             "prerequisites": [
19 |                 0
20 |             ],
21 |             "criteria": "The dataset is cleaned, including by removing stop words, punctuation, and special characters, all in `src/data_loader.py`.",
22 |             "category": "Data preprocessing and postprocessing",
23 |             "satisfied": true
24 |         },
25 |         {
26 |             "requirement_id": 2,
27 |             "prerequisites": [
28 |                 0,
29 |                 1
30 |             ],
31 |             "criteria": "Word embeddings, either \"Word2Vec\" or \"GloVe,\" are used to convert text to vectors in `src/data_loader.py`.",
32 |             "category": "Machine Learning Method",
33 |             "satisfied": false
34 |         },
35 |         {
36 |             "requirement_id": 3,
37 |             "prerequisites": [
38 |                 0,
39 |                 1,
40 |                 2
41 |             ],
42 |             "criteria": "An \"SVM classifier\" is implemented and trained in `src/model.py`.",
43 |             "category": "Machine Learning Method",
44 |             "satisfied": true
45 |         },
46 |         {
47 |             "requirement_id": 4,
48 |             "prerequisites": [
49 |                 1,
50 |                 2,
51 |                 3
52 |             ],
53 |             "criteria": "The accuracy score is printed and saved as `results/metrics/accuracy_score.txt`.",
54 |             "category": "Performance Metrics",
55 |             "satisfied": false
56 |         }
57 |     ],
58 |     "preferences": [
59 |         {
60 |             "preference_id": 0,
61 |             "criteria": "The dataset download process should be reliable, with clear error handling.",
62 |             "satisfied": null
63 |         },
64 |         {
65 |             "preference_id": 1,
66 |             "criteria": "The final accuracy report should be straightforward and easy to interpret.",
67 |             "satisfied": null
68 |         }
69 |     ],
70 |     "is_kaggle_api_needed": false,
71 |     "is_training_needed": true,
72 |     "is_web_navigation_needed": false,
73 |     "executed_successfully": true,
74 |     "satisfied_all_requirements": false,
75 |     "satisfied_all_preferences": false
76 | }
77 | 


--------------------------------------------------------------------------------
/benchmark/judgment/OpenHands/human_as_a_judge/07_Image_Super_Resolution_SRCNN_Set5_DL.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "07_Image_Super_Resolution_SRCNN_Set5_DL",
 3 |     "query": "Hi, I need to create a project for image super-resolution using the SRCNN model with the Set5 dataset (available from `https://huggingface.co/datasets/eugenesiow/Set5`). Load the dataset in `src/data_loader.py`. When loading the data, include image preprocessing steps such as resizing and normalization, all in `src/data_loader.py`. The SRCNN model should be loaded and used in `src/model.py`. Save 5 sets of comparison images, zooming in on details, as `results/figures/super_resolution_compare.png`, and the super-resolution results as `results/figures/super_resolution_results.png`.  The generated images should be high-quality and clearly show improvements.",
 4 |     "tags": [
 5 |         "Computer Vision",
 6 |         "Generative Models"
 7 |     ],
 8 |     "requirements": [
 9 |         {
10 |             "requirement_id": 0,
11 |             "prerequisites": [],
12 |             "criteria": "The \"Set5\" dataset (available from \"Hugging Face\") is loaded in `src/data_loader.py`.",
13 |             "category": "Dataset or Environment",
14 |             "satisfied": true
15 |         },
16 |         {
17 |             "requirement_id": 1,
18 |             "prerequisites": [
19 |                 0
20 |             ],
21 |             "criteria": "Image preprocessing, including resizing and normalization, is performed in `src/data_loader.py`.",
22 |             "category": "Data preprocessing and postprocessing",
23 |             "satisfied": false
24 |         },
25 |         {
26 |             "requirement_id": 2,
27 |             "prerequisites": [],
28 |             "criteria": "The \"SRCNN\" model is used in `src/model.py`.",
29 |             "category": "Machine Learning Method",
30 |             "satisfied": true
31 |         },
32 |         {
33 |             "requirement_id": 3,
34 |             "prerequisites": [
35 |                 0,
36 |                 1,
37 |                 2
38 |             ],
39 |             "criteria": "Five sets of comparison images are saved, with details zoomed in, and saved as `results/figures/super_resolution_compare.png`.",
40 |             "category": "Visualization",
41 |             "satisfied": false
42 |         },
43 |         {
44 |             "requirement_id": 4,
45 |             "prerequisites": [
46 |                 0,
47 |                 1,
48 |                 2
49 |             ],
50 |             "criteria": "Super-resolution results are saved as `results/figures/super_resolution_results.png`.",
51 |             "category": "Visualization",
52 |             "satisfied": false
53 |         }
54 |     ],
55 |     "preferences": [
56 |         {
57 |             "preference_id": 0,
58 |             "criteria": "The project should generate high-quality, clear super-resolution images with detailed comparisons.",
59 |             "satisfied": null
60 |         },
61 |         {
62 |             "preference_id": 1,
63 |             "criteria": "Well-organized output images, highlighting key improvements, should be included.",
64 |             "satisfied": null
65 |         }
66 |     ],
67 |     "is_kaggle_api_needed": false,
68 |     "is_training_needed": true,
69 |     "is_web_navigation_needed": true,
70 |     "executed_successfully": false,
71 |     "satisfied_all_requirements": false,
72 |     "satisfied_all_preferences": false
73 | }
74 | 


--------------------------------------------------------------------------------
/benchmark/judgment/OpenHands/human_as_a_judge/51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images",
 3 |     "query": "Hi! Please follow the instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) to set up the script mentioned for generating images with hidden text in `src/visualize.py`. Ensure the generated images are of 1080p resolution and saved in `results/figures/`. Create control images embedding the text \"FUTURE\" and save it in `results/figures/`. Please also manually verify that the hidden text is indeed embedded in the generated images.",
 4 |     "tags": [
 5 |         "Computer Vision",
 6 |         "Generative Models",
 7 |         "Natural Language Processing"
 8 |     ],
 9 |     "requirements": [
10 |         {
11 |             "requirement_id": 0,
12 |             "prerequisites": [],
13 |             "criteria": "The instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) are followed to set up the script mentioned for generating images with hidden text in `src/visualize.py`.",
14 |             "category": "Dataset or Environment",
15 |             "satisfied": false
16 |         },
17 |         {
18 |             "requirement_id": 1,
19 |             "prerequisites": [
20 |                 0
21 |             ],
22 |             "criteria": "The generated images are ensured to be of 1080p resolution and saved in results/figures/.",
23 |             "category": "Data preprocessing and postprocessing",
24 |             "satisfied": false
25 |         },
26 |         {
27 |             "requirement_id": 2,
28 |             "prerequisites": [
29 |                 0,
30 |                 1
31 |             ],
32 |             "criteria": "Control images embedding the text \"FUTURE,\" is created and saved in results/figures/.",
33 |             "category": "Visualization",
34 |             "satisfied": true
35 |         }
36 |     ],
37 |     "preferences": [
38 |         {
39 |             "preference_id": 0,
40 |             "criteria": "The system should be capable of learning and using unfamiliar technologies, adapting to new tools or platforms as required.",
41 |             "satisfied": null
42 |         },
43 |         {
44 |             "preference_id": 1,
45 |             "criteria": "After reviewing the blog post, ControlNet should be successfully run on Modal to produce the images with the concealed messages for user.",
46 |             "satisfied": null
47 |         }
48 |     ],
49 |     "is_kaggle_api_needed": false,
50 |     "is_training_needed": false,
51 |     "is_web_navigation_needed": true,
52 |     "resource": "https://www.cognition.ai/blog/introducing-devin",
53 |     "executed_successfully": true,
54 |     "satisfied_all_requirements": false,
55 |     "satisfied_all_preferences": false
56 | }
57 | 


--------------------------------------------------------------------------------
/benchmark/judgment/OpenHands/human_as_a_judge/53_Devin_Upwork_Side_Hustle.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "name": "53_Devin_Upwork_Side_Hustle",
 3 |     "query": "Hello, I am looking to make inferences with the models in this repository `https://github.com/mahdi65/roadDamageDetection2020`. The system should perform inferences using the models from the repository and save the results in `models/saved_models/`. Sample data should be downloaded and prepared for testing the models in `src/data_loader.py`. Inference should be performed using the provided models on the sample data in `models/saved_models/`. Visualized images showing the detections made by the models should be generated and saved in the `results/figures/` directory. Also, a performance report based on the model's detection results should be generated and saved as `results/metrics/model_performance_report.txt`.",
 4 |     "tags": [
 5 |         "Computer Vision"
 6 |     ],
 7 |     "requirements": [
 8 |         {
 9 |             "requirement_id": 0,
10 |             "prerequisites": [],
11 |             "criteria": "The repository at `https://github.com/mahdi65/roadDamageDetection2020` is set up.",
12 |             "category": "Machine Learning Method",
13 |             "satisfied": true
14 |         },
15 |         {
16 |             "requirement_id": 1,
17 |             "prerequisites": [
18 |                 0
19 |             ],
20 |             "criteria": "Sample data is downloaded and prepared for testing the models in `src/data_loader.py`.",
21 |             "category": "Dataset or Environment",
22 |             "satisfied": false
23 |         },
24 |         {
25 |             "requirement_id": 2,
26 |             "prerequisites": [
27 |                 0,
28 |                 1
29 |             ],
30 |             "criteria": "Inference is performed using the provided models on the sample data in `models/saved_models/`.",
31 |             "category": "Other",
32 |             "satisfied": false
33 |         },
34 |         {
35 |             "requirement_id": 3,
36 |             "prerequisites": [
37 |                 0,
38 |                 1,
39 |                 2
40 |             ],
41 |             "criteria": "Visualized images showing the detections made by the models are generated and saved in the `results/figures/` directory.",
42 |             "category": "Visualization",
43 |             "satisfied": false
44 |         },
45 |         {
46 |             "requirement_id": 4,
47 |             "prerequisites": [
48 |                 0,
49 |                 1,
50 |                 2,
51 |                 3
52 |             ],
53 |             "criteria": "A performance report based on the model's detection results is generated and saved as `results/metrics/model_performance_report.txt`.",
54 |             "category": "Performance Metrics",
55 |             "satisfied": false
56 |         }
57 |     ],
58 |     "preferences": [
59 |         {
60 |             "preference_id": 0,
61 |             "criteria": "The visualized images should be clear, with detections accurately highlighted for easy interpretation.",
62 |             "satisfied": null
63 |         },
64 |         {
65 |             "preference_id": 1,
66 |             "criteria": "The performance report should include a summary of detection accuracy and any issues encountered during inference.",
67 |             "satisfied": null
68 |         }
69 |     ],
70 |     "is_kaggle_api_needed": false,
71 |     "is_training_needed": false,
72 |     "is_web_navigation_needed": true,
73 |     "resource": "https://www.cognition.ai/blog/introducing-devin",
74 |     "executed_successfully": false,
75 |     "satisfied_all_requirements": false,
76 |     "satisfied_all_preferences": false
77 | }
78 | 


--------------------------------------------------------------------------------
/benchmark/workspaces/OpenHands/39_Drug_Response_Prediction_SVM_GDSC_ML/results/drug_response_prediction_report.md:
--------------------------------------------------------------------------------
 1 | # Drug Response Prediction Report
 2 | 
 3 | ## Data Preprocessing
 4 | - Loaded the GDSC dataset.
 5 | - Performed feature selection using `SelectKBest` with `f_regression` as the scoring function.
 6 | - Selected the top 10 features based on their scores.
 7 | 
 8 | ## Model Training
 9 | - Implemented a Support Vector Machine (SVM) regressor using `scikit-learn`.
10 | - Created a pipeline with a standard scaler and SVM regressor with a linear kernel.
11 | - Trained the model using the selected features and target values.
12 | 
13 | ## Model Evaluation
14 | - Evaluated the model using 5-fold cross-validation.
15 | - Calculated the Root Mean Squared Error (RMSE) for each fold.
16 | - Saved the performance metrics to `results/metrics/performance.txt`.
17 | 
18 | ## Results
19 | - Selected features: `feature1`, `feature2`, `feature3`, `feature4`, `feature5`, `feature6`, `feature7`, `feature8`, `feature9`, `feature10`.
20 | - Cross-validated RMSE scores: `[0.28540323, 0.3461573, 0.34480114, 0.37766893, 0.28471238]`.
21 | - Mean RMSE: `0.327748593896111`.
22 | - Standard deviation of RMSE: `0.03678846341261786`.
23 | 
24 | ## Visualization
25 | ![RMSE Scores](/workspace/results/figures/rmse_scores.png)
26 | 
27 | The histogram above shows the distribution of the RMSE scores obtained from the cross-validation.
28 | 
29 | ## Conclusion
30 | - The feature selection process helped in identifying the key features that impact the drug response prediction.
31 | - The SVM regressor provided a reasonable prediction performance with a mean RMSE of approximately 0.328.
32 | - The visualization highlights the consistency of the model's performance across different folds.
33 | 


--------------------------------------------------------------------------------
/benchmark/workspaces/OpenHands/39_Drug_Response_Prediction_SVM_GDSC_ML/results/drug_response_prediction_report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/benchmark/workspaces/OpenHands/39_Drug_Response_Prediction_SVM_GDSC_ML/results/drug_response_prediction_report.pdf


--------------------------------------------------------------------------------
/benchmark/workspaces/OpenHands/39_Drug_Response_Prediction_SVM_GDSC_ML/results/performance.txt:
--------------------------------------------------------------------------------
1 | Selected features: Index(['feature1', 'feature2', 'feature3', 'feature4', 'feature5', 'feature6',
2 |        'feature7', 'feature8', 'feature9', 'feature10'],
3 |       dtype='object')
4 | Cross-validated RMSE scores: [0.28540323 0.3461573  0.34480114 0.37766893 0.28471238]
5 | Mean RMSE: 0.327748593896111
6 | Standard deviation of RMSE: 0.03678846341261786
7 | 


--------------------------------------------------------------------------------
/benchmark/workspaces/OpenHands/39_Drug_Response_Prediction_SVM_GDSC_ML/results/rmse_scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/benchmark/workspaces/OpenHands/39_Drug_Response_Prediction_SVM_GDSC_ML/results/rmse_scores.png


--------------------------------------------------------------------------------
/benchmark/workspaces/OpenHands/39_Drug_Response_Prediction_SVM_GDSC_ML/src/data_loader.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from sklearn.feature_selection import SelectKBest, f_regression
 3 | 
 4 | def load_and_select_features(data_path, target_column, k=10):
 5 |     # Load the dataset
 6 |     data = pd.read_csv(data_path)
 7 |     
 8 |     # Separate features and target
 9 |     X = data.drop(columns=[target_column])
10 |     y = data[target_column]
11 |     
12 |     # Perform feature selection
13 |     selector = SelectKBest(score_func=f_regression, k=k)
14 |     X_selected = selector.fit_transform(X, y)
15 |     
16 |     # Get selected feature names
17 |     selected_features = X.columns[selector.get_support()]
18 |     
19 |     return X_selected, y, selected_features
20 | 
21 | if __name__ == "__main__":
22 |     data_path = 'path_to_gdsc_dataset.csv'  # Update this path
23 |     target_column = 'target'  # Update this column name
24 |     X_selected, y, selected_features = load_and_select_features(data_path, target_column)
25 |     print(f"Selected features: {selected_features}")
26 | 


--------------------------------------------------------------------------------
/benchmark/workspaces/OpenHands/39_Drug_Response_Prediction_SVM_GDSC_ML/src/model.py:
--------------------------------------------------------------------------------
 1 | from sklearn.svm import SVR
 2 | from sklearn.pipeline import make_pipeline
 3 | from sklearn.preprocessing import StandardScaler
 4 | 
 5 | def train_svm_regressor(X, y):
 6 |     # Create a pipeline with standard scaler and SVM regressor
 7 |     model = make_pipeline(StandardScaler(), SVR(kernel='linear'))
 8 |     
 9 |     # Train the model
10 |     model.fit(X, y)
11 |     
12 |     return model
13 | 


--------------------------------------------------------------------------------
/benchmark/workspaces/OpenHands/39_Drug_Response_Prediction_SVM_GDSC_ML/src/train.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from sklearn.model_selection import cross_val_score
 3 | import matplotlib.pyplot as plt
 4 | import seaborn as sns
 5 | import os
 6 | from data_loader import load_and_select_features
 7 | from model import train_svm_regressor
 8 | 
 9 | def evaluate_model(data_path, target_column, k=10):
10 |     # Load and select features
11 |     X, y, selected_features = load_and_select_features(data_path, target_column, k)
12 |     
13 |     # Train the model
14 |     model = train_svm_regressor(X, y)
15 |     
16 |     # Perform cross-validation
17 |     scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
18 |     rmse_scores = np.sqrt(-scores)
19 |     
20 |     # Save performance results
21 |     os.makedirs('results/metrics', exist_ok=True)
22 |     with open('results/metrics/performance.txt', 'w') as f:
23 |         f.write(f"Selected features: {selected_features}\n")
24 |         f.write(f"Cross-validated RMSE scores: {rmse_scores}\n")
25 |         f.write(f"Mean RMSE: {rmse_scores.mean()}\n")
26 |         f.write(f"Standard deviation of RMSE: {rmse_scores.std()}\n")
27 |     
28 |     # Visualize regression results
29 |     sns.histplot(rmse_scores, kde=True)
30 |     plt.title('Cross-validated RMSE scores')
31 |     plt.xlabel('RMSE')
32 |     plt.ylabel('Frequency')
33 |     os.makedirs('results/figures', exist_ok=True)
34 |     plt.savefig('results/figures/rmse_scores.png')
35 |     plt.close()
36 | 
37 | if __name__ == "__main__":
38 |     data_path = 'path_to_gdsc_dataset.csv'  # Update this path
39 |     target_column = 'target'  # Update this column name
40 |     evaluate_model(data_path, target_column)
41 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "agent-as-a-judge"
 3 | version = "0.1.5"
 4 | description = "This project contains the source code for the paper [Agent-as-a-Judge: Evaluating Agents with Agents]."
 5 | authors = ["Mingchen Zhuge <mingchen.zhuge@kaust.edu.sa>"]
 6 | license = "MIT License"
 7 | readme = "README.md"
 8 | 
 9 | [tool.poetry.dependencies]
10 | python = "^3.11"
11 | python-dotenv = "^1.0.1"
12 | tiktoken = "^0.9.0"
13 | rich = "^13.9.2"
14 | litellm = "^1.50.0"
15 | tenacity = "^9.0.0"
16 | numpy = "<2.0"
17 | networkx = "^3.3"
18 | spacy = "<3.8.0"
19 | rank-bm25 = "^0.2.2"
20 | sentence-transformers = "^3.1.1"
21 | pandas = "^2.2.3"
22 | docx = "^0.2.4"
23 | markdown = "^3.7"
24 | python-docx = "^1.1.2"
25 | pypdf2 = "^3.0.1"
26 | openpyxl = "^3.1.5"
27 | opencv-python = "^4.10.0.84"
28 | beautifulsoup4 = "^4.12.3"
29 | pylatexenc = "^2.10"
30 | matplotlib = "^3.9.2"
31 | tree-sitter-languages = "1.8.0"
32 | grep-ast = "^0.3.3"
33 | rapidfuzz = "^3.10.0"
34 | tqdm = "^4.66.5"
35 | logging = "^0.4.9.6"
36 | tree-sitter = "0.21.3"
37 | pytest = "^8.3.3"
38 | tf-keras = "^2.17.0"
39 | jinja2 = "^3.1.3"
40 | dotenv = "^0.9.9"
41 | python-pptx = "^1.0.2"
42 | 
43 | 
44 | [tool.poetry.group.dev.dependencies]
45 | pytest = "^8.3.3"
46 | pytest-cov = "^5.0.0"
47 | 
48 | [build-system]
49 | requires = ["poetry-core"]
50 | build-backend = "poetry.core.masonry.api"
51 | 


--------------------------------------------------------------------------------
/scripts/run_statistics.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from pathlib import Path
 3 | import argparse
 4 | import re
 5 | 
 6 | from agent_as_a_judge.module.statistics import DevStatistics
 7 | 
 8 | 
 9 | logging.basicConfig(
10 |     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
11 | )
12 | 
13 | 
14 | def extract_number_from_filename(filename: str) -> int:
15 | 
16 |     match = re.search(r"(\d+)", filename)
17 |     return int(match.group(1)) if match else float("inf")
18 | 
19 | 
20 | def main(instance_dir: Path, workspace_dir: Path):
21 | 
22 |     instance_files = sorted(
23 |         list(instance_dir.glob("*.json")),
24 |         key=lambda f: extract_number_from_filename(f.stem),
25 |     )
26 | 
27 |     logging.info(f"Total instances found: {len(instance_files)}")
28 |     total_py_files = 0
29 |     total_code_lines = 0
30 |     total_files = 0
31 |     total_non_code_files = 0
32 | 
33 |     for instance_file in instance_files:
34 |         instance_name = instance_file.stem
35 |         workspace = workspace_dir / instance_name
36 | 
37 |         dev_statistics = DevStatistics(workspace)
38 |         (
39 |             total_files_in_workspace,
40 |             total_non_code_files_in_workspace,
41 |             py_files_in_workspace,
42 |             lines_in_workspace,
43 |         ) = dev_statistics.calculate_statistics()
44 |         total_py_files += py_files_in_workspace
45 |         total_code_lines += lines_in_workspace
46 |         total_files += total_files_in_workspace
47 |         total_non_code_files += total_non_code_files_in_workspace
48 | 
49 |     logging.info("\nTotal summary across all workspaces:")
50 |     logging.info(f"Total files: {total_files}")
51 |     logging.info(f"Total non-Python files: {total_non_code_files}")
52 |     logging.info(f"Total Python files: {total_py_files}")
53 |     logging.info(f"Total lines of Python code: {total_code_lines}")
54 |     logging.info(
55 |         f"Avg. lines of Python code per workspace: {total_code_lines / len(instance_files):.2f}"
56 |     )
57 |     logging.info(
58 |         f"Avg. python files per workspace: {total_py_files / len(instance_files):.2f}"
59 |     )
60 |     logging.info(
61 |         f"Avg. total files per workspace: {total_files / len(instance_files):.2f}"
62 |     )
63 | 
64 | 
65 | def parse_arguments():
66 | 
67 |     parser = argparse.ArgumentParser(
68 |         description="Run statistics collection for workspaces."
69 |     )
70 |     parser.add_argument(
71 |         "--benchmark_dir",
72 |         type=str,
73 |         required=True,
74 |         help="Base directory for the DevAI benchmark",
75 |     )
76 |     parser.add_argument(
77 |         "--developer_agent", type=str, required=True, help="Name of the developer agent"
78 |     )
79 | 
80 |     return parser.parse_args()
81 | 
82 | 
83 | if __name__ == "__main__":
84 |     args = parse_arguments()
85 |     benchmark_dir = Path(args.benchmark_dir)
86 |     developer_agent = args.developer_agent
87 |     instance_dir = benchmark_dir / "devai/instances"
88 |     workspace_dir = benchmark_dir / f"workspaces/{developer_agent}"
89 |     main(instance_dir, workspace_dir)
90 | 


--------------------------------------------------------------------------------