├── .config └── pre-commit-config.yaml ├── .env.sample ├── .github ├── .codecov.yml └── dependabot.yml ├── .gitignore ├── LICENSE ├── README.md ├── agent_as_a_judge ├── __init__.py ├── agent.py ├── config.py ├── llm │ ├── __init__.py │ ├── cost.py │ └── provider.py ├── module │ ├── __init__.py │ ├── ask.py │ ├── code_search.py │ ├── graph.py │ ├── locate.py │ ├── memory.py │ ├── planning.py │ ├── prompt │ │ ├── __init__.py │ │ ├── prompt_ask.py │ │ ├── prompt_judge.py │ │ ├── prompt_locate.py │ │ ├── prompt_planning.py │ │ ├── prompt_retrieve.py │ │ ├── system_prompt_ask.py │ │ ├── system_prompt_judge.py │ │ ├── system_prompt_locate.py │ │ ├── system_prompt_planning.py │ │ └── system_prompt_retrieve.py │ ├── read.py │ ├── statistics.py │ └── text_retrieve.py └── utils │ ├── __init__.py │ ├── count_lines.py │ └── truncate.py ├── assets ├── aaaj_logo_v6.png ├── aaaj_logo_v7.png ├── aaaj_sample.md ├── ask_sample.md ├── dataset.png ├── demo.gif ├── devai_logo.png ├── judge_first.png ├── openwiki_1a.jpeg ├── openwiki_1b.jpeg └── sample.jpeg ├── benchmark ├── devai │ ├── README.md │ ├── constraints.json │ ├── instances │ │ ├── 01_Image_Classification_ResNet18_Fashion_MNIST_DL.json │ │ ├── 02_Maze_Solver_Q_Learning_Gridworld_RL.json │ │ ├── 03_Text_Classification_NaiveBayes_20Newsgroups_ML.json │ │ ├── 04_Text_Generation_GPT2_Prompts_DL.json │ │ ├── 05_Game_Simulation_DQN_CartPole_v1_RL.json │ │ ├── 06_Sentiment_Analysis_SVM_Sentiment140_ML.json │ │ ├── 07_Image_Super_Resolution_SRCNN_Set5_DL.json │ │ ├── 08_Robot_Control_PPO_PyBullet_RL.json │ │ ├── 09_Recommendation_System_NCF_MovieLens_ML.json │ │ ├── 10_Face_Recognition_FaceNet_LFW_DL.json │ │ ├── 11_House_Price_Prediction_LinearRegression_BostonHousing_ML.json │ │ ├── 12_Spam_Detection_SVM_Enron_ML.json │ │ ├── 13_Style_Transfer_Perceptual_Loss_CustomImages_DL.json │ │ ├── 14_Customer_Churn_Prediction_LogisticRegression_Telco_ML.json │ │ ├── 15_Image_Captioning_ShowAndTell_Flickr8k_DL.json │ │ ├── 16_Credit_Scoring_DecisionTree_GermanCredit_ML.json │ │ ├── 17_Heart_Disease_Prediction_XGBoost_UCI_ML.json │ │ ├── 18_Image_Enhancement_SRGAN_DIV2K_DL.json │ │ ├── 19_Time_Series_Forecasting_Seq2Seq_LSTM_Rossmann_ML.json │ │ ├── 20_Car_Price_Prediction_RandomForest_CarPrices_ML.json │ │ ├── 21_Iris_Classification_SVM_Iris_ML.json │ │ ├── 22_Sentiment_Analysis_LSTM_IMDb_DL.json │ │ ├── 23_Wine_Quality_Prediction_DecisionTree_WineQuality_ML.json │ │ ├── 24_Diabetes_Prediction_LogisticRegression_PimaIndians_ML.json │ │ ├── 25_Speech_Emotion_Recognition_CNN_LSTM_RAVDESS_DL.json │ │ ├── 26_Mushroom_Classification_RandomForest_Mushroom_ML.json │ │ ├── 27_Image_Generation_DCGAN_MNIST_DL.json │ │ ├── 28_Stock_Price_Prediction_LSTM_YahooFinance_ML.json │ │ ├── 29_Financial_Time_Series_Prediction_LSTM_ML.json │ │ ├── 30_Image_Segmentation_UNet_PascalVOC_DL.json │ │ ├── 31_Cancer_Prediction_SVM_BreastCancer_ML.json │ │ ├── 32_Weather_Data_Analysis_LinearRegression_Weather_ML.json │ │ ├── 33_Object_Detection_YOLOv3_COCO_DL.json │ │ ├── 34_Customer_Segmentation_KMeans_CustomerSegmentation_ML.json │ │ ├── 35_Loan_Default_Prediction_RandomForest_LendingClub_ML.json │ │ ├── 36_Music_Emotion_Classification_SVM_GTZAN_ML.json │ │ ├── 37_Lane_Detection_ResNet50_TuSimple_DL.json │ │ ├── 38_Object_Tracking_Siamese_OTB50_DL.json │ │ ├── 39_Drug_Response_Prediction_SVM_GDSC_ML.json │ │ ├── 40_Text_Summarization_BART_CNNDailyMail_DL.json │ │ ├── 41_Stock_Classification_KNN_YahooFinance_ML.json │ │ ├── 42_Medical_Image_Classification_DenseNet121_ChestXray_DL.json │ │ ├── 
43_Social_Network_Analysis_GCN_Cora_ML.json │ │ ├── 44_Text_Classification_BERT_AGNews_DL.json │ │ ├── 45_Product_Recommendation_MatrixFactorization_AmazonReviews_ML.json │ │ ├── 46_Speech_Recognition_DeepSpeech_LibriSpeech_DL.json │ │ ├── 47_Network_Traffic_Analysis_KMeans_NetworkTraffic_ML.json │ │ ├── 48_Stock_Trading_Simulation_PPO_HistoricalData_RL.json │ │ ├── 49_Explainable_AI_LIME_Titanic_ML.json │ │ ├── 50_Math_Problem_Solving_Transformer_DeepMindMath_DL.json │ │ ├── 51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json │ │ ├── 52_Devin_AI_Trains_an_AI.json │ │ ├── 53_Devin_Upwork_Side_Hustle.json │ │ ├── 54_Mock_OpenAI_API_Response_Analyzer_App.json │ │ └── 55_SQLite_Database_Viewer_and_Analyzer_App.json │ ├── trajectory-schema.json │ └── validate_trajectory.py ├── judgment │ ├── GPT-Pilot │ │ ├── agent_as_a_judge │ │ │ └── gray_box │ │ │ │ ├── 01_Image_Classification_ResNet18_Fashion_MNIST_DL.json │ │ │ │ ├── 02_Maze_Solver_Q_Learning_Gridworld_RL.json │ │ │ │ ├── 03_Text_Classification_NaiveBayes_20Newsgroups_ML.json │ │ │ │ ├── 04_Text_Generation_GPT2_Prompts_DL.json │ │ │ │ ├── 05_Game_Simulation_DQN_CartPole_v1_RL.json │ │ │ │ ├── 06_Sentiment_Analysis_SVM_Sentiment140_ML.json │ │ │ │ ├── 07_Image_Super_Resolution_SRCNN_Set5_DL.json │ │ │ │ ├── 08_Robot_Control_PPO_PyBullet_RL.json │ │ │ │ ├── 09_Recommendation_System_NCF_MovieLens_ML.json │ │ │ │ ├── 10_Face_Recognition_FaceNet_LFW_DL.json │ │ │ │ ├── 11_House_Price_Prediction_LinearRegression_BostonHousing_ML.json │ │ │ │ ├── 12_Spam_Detection_SVM_Enron_ML.json │ │ │ │ ├── 13_Style_Transfer_Perceptual_Loss_CustomImages_DL.json │ │ │ │ ├── 14_Customer_Churn_Prediction_LogisticRegression_Telco_ML.json │ │ │ │ ├── 15_Image_Captioning_ShowAndTell_Flickr8k_DL.json │ │ │ │ ├── 16_Credit_Scoring_DecisionTree_GermanCredit_ML.json │ │ │ │ ├── 17_Heart_Disease_Prediction_XGBoost_UCI_ML.json │ │ │ │ ├── 18_Image_Enhancement_SRGAN_DIV2K_DL.json │ │ │ │ ├── 19_Time_Series_Forecasting_Seq2Seq_LSTM_Rossmann_ML.json │ │ │ │ ├── 20_Car_Price_Prediction_RandomForest_CarPrices_ML.json │ │ │ │ ├── 21_Iris_Classification_SVM_Iris_ML.json │ │ │ │ ├── 22_Sentiment_Analysis_LSTM_IMDb_DL.json │ │ │ │ ├── 23_Wine_Quality_Prediction_DecisionTree_WineQuality_ML.json │ │ │ │ ├── 24_Diabetes_Prediction_LogisticRegression_PimaIndians_ML.json │ │ │ │ ├── 25_Speech_Emotion_Recognition_CNN_LSTM_RAVDESS_DL.json │ │ │ │ ├── 26_Mushroom_Classification_RandomForest_Mushroom_ML.json │ │ │ │ ├── 27_Image_Generation_DCGAN_MNIST_DL.json │ │ │ │ ├── 28_Stock_Price_Prediction_LSTM_YahooFinance_ML.json │ │ │ │ ├── 29_Financial_Time_Series_Prediction_LSTM_ML.json │ │ │ │ ├── 30_Image_Segmentation_UNet_PascalVOC_DL.json │ │ │ │ ├── 31_Cancer_Prediction_SVM_BreastCancer_ML.json │ │ │ │ ├── 32_Weather_Data_Analysis_LinearRegression_Weather_ML.json │ │ │ │ ├── 33_Object_Detection_YOLOv3_COCO_DL.json │ │ │ │ ├── 34_Customer_Segmentation_KMeans_CustomerSegmentation_ML.json │ │ │ │ ├── 35_Loan_Default_Prediction_RandomForest_LendingClub_ML.json │ │ │ │ ├── 36_Music_Emotion_Classification_SVM_GTZAN_ML.json │ │ │ │ ├── 37_Lane_Detection_ResNet50_TuSimple_DL.json │ │ │ │ ├── 38_Object_Tracking_Siamese_OTB50_DL.json │ │ │ │ ├── 39_Drug_Response_Prediction_SVM_GDSC_ML.json │ │ │ │ ├── 40_Text_Summarization_BART_CNNDailyMail_DL.json │ │ │ │ ├── 41_Stock_Classification_KNN_YahooFinance_ML.json │ │ │ │ ├── 42_Medical_Image_Classification_DenseNet121_ChestXray_DL.json │ │ │ │ ├── 43_Social_Network_Analysis_GCN_Cora_ML.json │ │ │ │ ├── 
44_Text_Classification_BERT_AGNews_DL.json │ │ │ │ ├── 45_Product_Recommendation_MatrixFactorization_AmazonReviews_ML.json │ │ │ │ ├── 46_Speech_Recognition_DeepSpeech_LibriSpeech_DL.json │ │ │ │ ├── 47_Network_Traffic_Analysis_KMeans_NetworkTraffic_ML.json │ │ │ │ ├── 48_Stock_Trading_Simulation_PPO_HistoricalData_RL.json │ │ │ │ ├── 49_Explainable_AI_LIME_Titanic_ML.json │ │ │ │ ├── 50_Math_Problem_Solving_Transformer_DeepMindMath_DL.json │ │ │ │ ├── 51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json │ │ │ │ ├── 52_Devin_AI_Trains_an_AI.json │ │ │ │ ├── 53_Devin_Upwork_Side_Hustle.json │ │ │ │ ├── 54_Mock_OpenAI_API_Response_Analyzer_App.json │ │ │ │ └── 55_SQLite_Database_Viewer_and_Analyzer_App.json │ │ └── human_as_a_judge │ │ │ ├── 01_Image_Classification_ResNet18_Fashion_MNIST_DL.json │ │ │ ├── 02_Maze_Solver_Q_Learning_Gridworld_RL.json │ │ │ ├── 03_Text_Classification_NaiveBayes_20Newsgroups_ML.json │ │ │ ├── 04_Text_Generation_GPT2_Prompts_DL.json │ │ │ ├── 05_Game_Simulation_DQN_CartPole_v1_RL.json │ │ │ ├── 06_Sentiment_Analysis_SVM_Sentiment140_ML.json │ │ │ ├── 07_Image_Super_Resolution_SRCNN_Set5_DL.json │ │ │ ├── 08_Robot_Control_PPO_PyBullet_RL.json │ │ │ ├── 09_Recommendation_System_NCF_MovieLens_ML.json │ │ │ ├── 10_Face_Recognition_FaceNet_LFW_DL.json │ │ │ ├── 11_House_Price_Prediction_LinearRegression_BostonHousing_ML.json │ │ │ ├── 12_Spam_Detection_SVM_Enron_ML.json │ │ │ ├── 13_Style_Transfer_Perceptual_Loss_CustomImages_DL.json │ │ │ ├── 14_Customer_Churn_Prediction_LogisticRegression_Telco_ML.json │ │ │ ├── 15_Image_Captioning_ShowAndTell_Flickr8k_DL.json │ │ │ ├── 16_Credit_Scoring_DecisionTree_GermanCredit_ML.json │ │ │ ├── 17_Heart_Disease_Prediction_XGBoost_UCI_ML.json │ │ │ ├── 18_Image_Enhancement_SRGAN_DIV2K_DL.json │ │ │ ├── 19_Time_Series_Forecasting_Seq2Seq_LSTM_Rossmann_ML.json │ │ │ ├── 20_Car_Price_Prediction_RandomForest_CarPrices_ML.json │ │ │ ├── 21_Iris_Classification_SVM_Iris_ML.json │ │ │ ├── 22_Sentiment_Analysis_LSTM_IMDb_DL.json │ │ │ ├── 23_Wine_Quality_Prediction_DecisionTree_WineQuality_ML.json │ │ │ ├── 24_Diabetes_Prediction_LogisticRegression_PimaIndians_ML.json │ │ │ ├── 25_Speech_Emotion_Recognition_CNN_LSTM_RAVDESS_DL.json │ │ │ ├── 26_Mushroom_Classification_RandomForest_Mushroom_ML.json │ │ │ ├── 27_Image_Generation_DCGAN_MNIST_DL.json │ │ │ ├── 28_Stock_Price_Prediction_LSTM_YahooFinance_ML.json │ │ │ ├── 29_Financial_Time_Series_Prediction_LSTM_ML.json │ │ │ ├── 30_Image_Segmentation_UNet_PascalVOC_DL.json │ │ │ ├── 31_Cancer_Prediction_SVM_BreastCancer_ML.json │ │ │ ├── 32_Weather_Data_Analysis_LinearRegression_Weather_ML.json │ │ │ ├── 33_Object_Detection_YOLOv3_COCO_DL.json │ │ │ ├── 34_Customer_Segmentation_KMeans_CustomerSegmentation_ML.json │ │ │ ├── 35_Loan_Default_Prediction_RandomForest_LendingClub_ML.json │ │ │ ├── 36_Music_Emotion_Classification_SVM_GTZAN_ML.json │ │ │ ├── 37_Lane_Detection_ResNet50_TuSimple_DL.json │ │ │ ├── 38_Object_Tracking_Siamese_OTB50_DL.json │ │ │ ├── 39_Drug_Response_Prediction_SVM_GDSC_ML.json │ │ │ ├── 40_Text_Summarization_BART_CNNDailyMail_DL.json │ │ │ ├── 41_Stock_Classification_KNN_YahooFinance_ML.json │ │ │ ├── 42_Medical_Image_Classification_DenseNet121_ChestXray_DL.json │ │ │ ├── 43_Social_Network_Analysis_GCN_Cora_ML.json │ │ │ ├── 44_Text_Classification_BERT_AGNews_DL.json │ │ │ ├── 45_Product_Recommendation_MatrixFactorization_AmazonReviews_ML.json │ │ │ ├── 46_Speech_Recognition_DeepSpeech_LibriSpeech_DL.json │ │ │ ├── 
47_Network_Traffic_Analysis_KMeans_NetworkTraffic_ML.json │ │ │ ├── 48_Stock_Trading_Simulation_PPO_HistoricalData_RL.json │ │ │ ├── 49_Explainable_AI_LIME_Titanic_ML.json │ │ │ ├── 50_Math_Problem_Solving_Transformer_DeepMindMath_DL.json │ │ │ ├── 51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json │ │ │ ├── 52_Devin_AI_Trains_an_AI.json │ │ │ ├── 53_Devin_Upwork_Side_Hustle.json │ │ │ ├── 54_Mock_OpenAI_API_Response_Analyzer_App.json │ │ │ └── 55_SQLite_Database_Viewer_and_Analyzer_App.json │ ├── MetaGPT │ │ ├── agent_as_a_judge │ │ │ └── gray_box │ │ │ │ ├── 01_Image_Classification_ResNet18_Fashion_MNIST_DL.json │ │ │ │ ├── 02_Maze_Solver_Q_Learning_Gridworld_RL.json │ │ │ │ ├── 03_Text_Classification_NaiveBayes_20Newsgroups_ML.json │ │ │ │ ├── 04_Text_Generation_GPT2_Prompts_DL.json │ │ │ │ ├── 05_Game_Simulation_DQN_CartPole_v1_RL.json │ │ │ │ ├── 06_Sentiment_Analysis_SVM_Sentiment140_ML.json │ │ │ │ ├── 07_Image_Super_Resolution_SRCNN_Set5_DL.json │ │ │ │ ├── 08_Robot_Control_PPO_PyBullet_RL.json │ │ │ │ ├── 09_Recommendation_System_NCF_MovieLens_ML.json │ │ │ │ ├── 10_Face_Recognition_FaceNet_LFW_DL.json │ │ │ │ ├── 11_House_Price_Prediction_LinearRegression_BostonHousing_ML.json │ │ │ │ ├── 12_Spam_Detection_SVM_Enron_ML.json │ │ │ │ ├── 13_Style_Transfer_Perceptual_Loss_CustomImages_DL.json │ │ │ │ ├── 14_Customer_Churn_Prediction_LogisticRegression_Telco_ML.json │ │ │ │ ├── 15_Image_Captioning_ShowAndTell_Flickr8k_DL.json │ │ │ │ ├── 16_Credit_Scoring_DecisionTree_GermanCredit_ML.json │ │ │ │ ├── 17_Heart_Disease_Prediction_XGBoost_UCI_ML.json │ │ │ │ ├── 18_Image_Enhancement_SRGAN_DIV2K_DL.json │ │ │ │ ├── 19_Time_Series_Forecasting_Seq2Seq_LSTM_Rossmann_ML.json │ │ │ │ ├── 20_Car_Price_Prediction_RandomForest_CarPrices_ML.json │ │ │ │ ├── 21_Iris_Classification_SVM_Iris_ML.json │ │ │ │ ├── 22_Sentiment_Analysis_LSTM_IMDb_DL.json │ │ │ │ ├── 23_Wine_Quality_Prediction_DecisionTree_WineQuality_ML.json │ │ │ │ ├── 24_Diabetes_Prediction_LogisticRegression_PimaIndians_ML.json │ │ │ │ ├── 25_Speech_Emotion_Recognition_CNN_LSTM_RAVDESS_DL.json │ │ │ │ ├── 26_Mushroom_Classification_RandomForest_Mushroom_ML.json │ │ │ │ ├── 27_Image_Generation_DCGAN_MNIST_DL.json │ │ │ │ ├── 28_Stock_Price_Prediction_LSTM_YahooFinance_ML.json │ │ │ │ ├── 29_Financial_Time_Series_Prediction_LSTM_ML.json │ │ │ │ ├── 30_Image_Segmentation_UNet_PascalVOC_DL.json │ │ │ │ ├── 31_Cancer_Prediction_SVM_BreastCancer_ML.json │ │ │ │ ├── 32_Weather_Data_Analysis_LinearRegression_Weather_ML.json │ │ │ │ ├── 33_Object_Detection_YOLOv3_COCO_DL.json │ │ │ │ ├── 34_Customer_Segmentation_KMeans_CustomerSegmentation_ML.json │ │ │ │ ├── 35_Loan_Default_Prediction_RandomForest_LendingClub_ML.json │ │ │ │ ├── 36_Music_Emotion_Classification_SVM_GTZAN_ML.json │ │ │ │ ├── 37_Lane_Detection_ResNet50_TuSimple_DL.json │ │ │ │ ├── 38_Object_Tracking_Siamese_OTB50_DL.json │ │ │ │ ├── 39_Drug_Response_Prediction_SVM_GDSC_ML.json │ │ │ │ ├── 40_Text_Summarization_BART_CNNDailyMail_DL.json │ │ │ │ ├── 41_Stock_Classification_KNN_YahooFinance_ML.json │ │ │ │ ├── 42_Medical_Image_Classification_DenseNet121_ChestXray_DL.json │ │ │ │ ├── 43_Social_Network_Analysis_GCN_Cora_ML.json │ │ │ │ ├── 44_Text_Classification_BERT_AGNews_DL.json │ │ │ │ ├── 45_Product_Recommendation_MatrixFactorization_AmazonReviews_ML.json │ │ │ │ ├── 46_Speech_Recognition_DeepSpeech_LibriSpeech_DL.json │ │ │ │ ├── 47_Network_Traffic_Analysis_KMeans_NetworkTraffic_ML.json │ │ │ │ ├── 48_Stock_Trading_Simulation_PPO_HistoricalData_RL.json │ │ │ │ 
├── 49_Explainable_AI_LIME_Titanic_ML.json │ │ │ │ ├── 50_Math_Problem_Solving_Transformer_DeepMindMath_DL.json │ │ │ │ ├── 51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json │ │ │ │ ├── 52_Devin_AI_Trains_an_AI.json │ │ │ │ ├── 53_Devin_Upwork_Side_Hustle.json │ │ │ │ ├── 54_Mock_OpenAI_API_Response_Analyzer_App.json │ │ │ │ └── 55_SQLite_Database_Viewer_and_Analyzer_App.json │ │ └── human_as_a_judge │ │ │ ├── 01_Image_Classification_ResNet18_Fashion_MNIST_DL.json │ │ │ ├── 02_Maze_Solver_Q_Learning_Gridworld_RL.json │ │ │ ├── 03_Text_Classification_NaiveBayes_20Newsgroups_ML.json │ │ │ ├── 04_Text_Generation_GPT2_Prompts_DL.json │ │ │ ├── 05_Game_Simulation_DQN_CartPole_v1_RL.json │ │ │ ├── 06_Sentiment_Analysis_SVM_Sentiment140_ML.json │ │ │ ├── 07_Image_Super_Resolution_SRCNN_Set5_DL.json │ │ │ ├── 08_Robot_Control_PPO_PyBullet_RL.json │ │ │ ├── 09_Recommendation_System_NCF_MovieLens_ML.json │ │ │ ├── 10_Face_Recognition_FaceNet_LFW_DL.json │ │ │ ├── 11_House_Price_Prediction_LinearRegression_BostonHousing_ML.json │ │ │ ├── 12_Spam_Detection_SVM_Enron_ML.json │ │ │ ├── 13_Style_Transfer_Perceptual_Loss_CustomImages_DL.json │ │ │ ├── 14_Customer_Churn_Prediction_LogisticRegression_Telco_ML.json │ │ │ ├── 15_Image_Captioning_ShowAndTell_Flickr8k_DL.json │ │ │ ├── 16_Credit_Scoring_DecisionTree_GermanCredit_ML.json │ │ │ ├── 17_Heart_Disease_Prediction_XGBoost_UCI_ML.json │ │ │ ├── 18_Image_Enhancement_SRGAN_DIV2K_DL.json │ │ │ ├── 19_Time_Series_Forecasting_Seq2Seq_LSTM_Rossmann_ML.json │ │ │ ├── 20_Car_Price_Prediction_RandomForest_CarPrices_ML.json │ │ │ ├── 21_Iris_Classification_SVM_Iris_ML.json │ │ │ ├── 22_Sentiment_Analysis_LSTM_IMDb_DL.json │ │ │ ├── 23_Wine_Quality_Prediction_DecisionTree_WineQuality_ML.json │ │ │ ├── 24_Diabetes_Prediction_LogisticRegression_PimaIndians_ML.json │ │ │ ├── 25_Speech_Emotion_Recognition_CNN_LSTM_RAVDESS_DL.json │ │ │ ├── 26_Mushroom_Classification_RandomForest_Mushroom_ML.json │ │ │ ├── 27_Image_Generation_DCGAN_MNIST_DL.json │ │ │ ├── 28_Stock_Price_Prediction_LSTM_YahooFinance_ML.json │ │ │ ├── 29_Financial_Time_Series_Prediction_LSTM_ML.json │ │ │ ├── 30_Image_Segmentation_UNet_PascalVOC_DL.json │ │ │ ├── 31_Cancer_Prediction_SVM_BreastCancer_ML.json │ │ │ ├── 32_Weather_Data_Analysis_LinearRegression_Weather_ML.json │ │ │ ├── 33_Object_Detection_YOLOv3_COCO_DL.json │ │ │ ├── 34_Customer_Segmentation_KMeans_CustomerSegmentation_ML.json │ │ │ ├── 35_Loan_Default_Prediction_RandomForest_LendingClub_ML.json │ │ │ ├── 36_Music_Emotion_Classification_SVM_GTZAN_ML.json │ │ │ ├── 37_Lane_Detection_ResNet50_TuSimple_DL.json │ │ │ ├── 38_Object_Tracking_Siamese_OTB50_DL.json │ │ │ ├── 39_Drug_Response_Prediction_SVM_GDSC_ML.json │ │ │ ├── 40_Text_Summarization_BART_CNNDailyMail_DL.json │ │ │ ├── 41_Stock_Classification_KNN_YahooFinance_ML.json │ │ │ ├── 42_Medical_Image_Classification_DenseNet121_ChestXray_DL.json │ │ │ ├── 43_Social_Network_Analysis_GCN_Cora_ML.json │ │ │ ├── 44_Text_Classification_BERT_AGNews_DL.json │ │ │ ├── 45_Product_Recommendation_MatrixFactorization_AmazonReviews_ML.json │ │ │ ├── 46_Speech_Recognition_DeepSpeech_LibriSpeech_DL.json │ │ │ ├── 47_Network_Traffic_Analysis_KMeans_NetworkTraffic_ML.json │ │ │ ├── 48_Stock_Trading_Simulation_PPO_HistoricalData_RL.json │ │ │ ├── 49_Explainable_AI_LIME_Titanic_ML.json │ │ │ ├── 50_Math_Problem_Solving_Transformer_DeepMindMath_DL.json │ │ │ ├── 51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json │ │ │ ├── 52_Devin_AI_Trains_an_AI.json │ │ │ ├── 
53_Devin_Upwork_Side_Hustle.json │ │ │ ├── 54_Mock_OpenAI_API_Response_Analyzer_App.json │ │ │ └── 55_SQLite_Database_Viewer_and_Analyzer_App.json │ └── OpenHands │ │ ├── agent_as_a_judge │ │ └── gray_box │ │ │ ├── 01_Image_Classification_ResNet18_Fashion_MNIST_DL.json │ │ │ ├── 02_Maze_Solver_Q_Learning_Gridworld_RL.json │ │ │ ├── 03_Text_Classification_NaiveBayes_20Newsgroups_ML.json │ │ │ ├── 04_Text_Generation_GPT2_Prompts_DL.json │ │ │ ├── 05_Game_Simulation_DQN_CartPole_v1_RL.json │ │ │ ├── 06_Sentiment_Analysis_SVM_Sentiment140_ML.json │ │ │ ├── 07_Image_Super_Resolution_SRCNN_Set5_DL.json │ │ │ ├── 08_Robot_Control_PPO_PyBullet_RL.json │ │ │ ├── 09_Recommendation_System_NCF_MovieLens_ML.json │ │ │ ├── 10_Face_Recognition_FaceNet_LFW_DL.json │ │ │ ├── 11_House_Price_Prediction_LinearRegression_BostonHousing_ML.json │ │ │ ├── 12_Spam_Detection_SVM_Enron_ML.json │ │ │ ├── 13_Style_Transfer_Perceptual_Loss_CustomImages_DL.json │ │ │ ├── 14_Customer_Churn_Prediction_LogisticRegression_Telco_ML.json │ │ │ ├── 15_Image_Captioning_ShowAndTell_Flickr8k_DL.json │ │ │ ├── 16_Credit_Scoring_DecisionTree_GermanCredit_ML.json │ │ │ ├── 17_Heart_Disease_Prediction_XGBoost_UCI_ML.json │ │ │ ├── 18_Image_Enhancement_SRGAN_DIV2K_DL.json │ │ │ ├── 19_Time_Series_Forecasting_Seq2Seq_LSTM_Rossmann_ML.json │ │ │ ├── 20_Car_Price_Prediction_RandomForest_CarPrices_ML.json │ │ │ ├── 21_Iris_Classification_SVM_Iris_ML.json │ │ │ ├── 22_Sentiment_Analysis_LSTM_IMDb_DL.json │ │ │ ├── 23_Wine_Quality_Prediction_DecisionTree_WineQuality_ML.json │ │ │ ├── 24_Diabetes_Prediction_LogisticRegression_PimaIndians_ML.json │ │ │ ├── 25_Speech_Emotion_Recognition_CNN_LSTM_RAVDESS_DL.json │ │ │ ├── 26_Mushroom_Classification_RandomForest_Mushroom_ML.json │ │ │ ├── 27_Image_Generation_DCGAN_MNIST_DL.json │ │ │ ├── 28_Stock_Price_Prediction_LSTM_YahooFinance_ML.json │ │ │ ├── 29_Financial_Time_Series_Prediction_LSTM_ML.json │ │ │ ├── 30_Image_Segmentation_UNet_PascalVOC_DL.json │ │ │ ├── 31_Cancer_Prediction_SVM_BreastCancer_ML.json │ │ │ ├── 32_Weather_Data_Analysis_LinearRegression_Weather_ML.json │ │ │ ├── 33_Object_Detection_YOLOv3_COCO_DL.json │ │ │ ├── 34_Customer_Segmentation_KMeans_CustomerSegmentation_ML.json │ │ │ ├── 35_Loan_Default_Prediction_RandomForest_LendingClub_ML.json │ │ │ ├── 36_Music_Emotion_Classification_SVM_GTZAN_ML.json │ │ │ ├── 37_Lane_Detection_ResNet50_TuSimple_DL.json │ │ │ ├── 38_Object_Tracking_Siamese_OTB50_DL.json │ │ │ ├── 39_Drug_Response_Prediction_SVM_GDSC_ML.json │ │ │ ├── 40_Text_Summarization_BART_CNNDailyMail_DL.json │ │ │ ├── 41_Stock_Classification_KNN_YahooFinance_ML.json │ │ │ ├── 42_Medical_Image_Classification_DenseNet121_ChestXray_DL.json │ │ │ ├── 43_Social_Network_Analysis_GCN_Cora_ML.json │ │ │ ├── 44_Text_Classification_BERT_AGNews_DL.json │ │ │ ├── 45_Product_Recommendation_MatrixFactorization_AmazonReviews_ML.json │ │ │ ├── 46_Speech_Recognition_DeepSpeech_LibriSpeech_DL.json │ │ │ ├── 47_Network_Traffic_Analysis_KMeans_NetworkTraffic_ML.json │ │ │ ├── 48_Stock_Trading_Simulation_PPO_HistoricalData_RL.json │ │ │ ├── 49_Explainable_AI_LIME_Titanic_ML.json │ │ │ ├── 50_Math_Problem_Solving_Transformer_DeepMindMath_DL.json │ │ │ ├── 51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json │ │ │ ├── 52_Devin_AI_Trains_an_AI.json │ │ │ ├── 53_Devin_Upwork_Side_Hustle.json │ │ │ ├── 54_Mock_OpenAI_API_Response_Analyzer_App.json │ │ │ └── 55_SQLite_Database_Viewer_and_Analyzer_App.json │ │ └── human_as_a_judge │ │ ├── 
01_Image_Classification_ResNet18_Fashion_MNIST_DL.json │ │ ├── 02_Maze_Solver_Q_Learning_Gridworld_RL.json │ │ ├── 03_Text_Classification_NaiveBayes_20Newsgroups_ML.json │ │ ├── 04_Text_Generation_GPT2_Prompts_DL.json │ │ ├── 05_Game_Simulation_DQN_CartPole_v1_RL.json │ │ ├── 06_Sentiment_Analysis_SVM_Sentiment140_ML.json │ │ ├── 07_Image_Super_Resolution_SRCNN_Set5_DL.json │ │ ├── 08_Robot_Control_PPO_PyBullet_RL.json │ │ ├── 09_Recommendation_System_NCF_MovieLens_ML.json │ │ ├── 10_Face_Recognition_FaceNet_LFW_DL.json │ │ ├── 11_House_Price_Prediction_LinearRegression_BostonHousing_ML.json │ │ ├── 12_Spam_Detection_SVM_Enron_ML.json │ │ ├── 13_Style_Transfer_Perceptual_Loss_CustomImages_DL.json │ │ ├── 14_Customer_Churn_Prediction_LogisticRegression_Telco_ML.json │ │ ├── 15_Image_Captioning_ShowAndTell_Flickr8k_DL.json │ │ ├── 16_Credit_Scoring_DecisionTree_GermanCredit_ML.json │ │ ├── 17_Heart_Disease_Prediction_XGBoost_UCI_ML.json │ │ ├── 18_Image_Enhancement_SRGAN_DIV2K_DL.json │ │ ├── 19_Time_Series_Forecasting_Seq2Seq_LSTM_Rossmann_ML.json │ │ ├── 20_Car_Price_Prediction_RandomForest_CarPrices_ML.json │ │ ├── 21_Iris_Classification_SVM_Iris_ML.json │ │ ├── 22_Sentiment_Analysis_LSTM_IMDb_DL.json │ │ ├── 23_Wine_Quality_Prediction_DecisionTree_WineQuality_ML.json │ │ ├── 24_Diabetes_Prediction_LogisticRegression_PimaIndians_ML.json │ │ ├── 25_Speech_Emotion_Recognition_CNN_LSTM_RAVDESS_DL.json │ │ ├── 26_Mushroom_Classification_RandomForest_Mushroom_ML.json │ │ ├── 27_Image_Generation_DCGAN_MNIST_DL.json │ │ ├── 28_Stock_Price_Prediction_LSTM_YahooFinance_ML.json │ │ ├── 29_Financial_Time_Series_Prediction_LSTM_ML.json │ │ ├── 30_Image_Segmentation_UNet_PascalVOC_DL.json │ │ ├── 31_Cancer_Prediction_SVM_BreastCancer_ML.json │ │ ├── 32_Weather_Data_Analysis_LinearRegression_Weather_ML.json │ │ ├── 33_Object_Detection_YOLOv3_COCO_DL.json │ │ ├── 34_Customer_Segmentation_KMeans_CustomerSegmentation_ML.json │ │ ├── 35_Loan_Default_Prediction_RandomForest_LendingClub_ML.json │ │ ├── 36_Music_Emotion_Classification_SVM_GTZAN_ML.json │ │ ├── 37_Lane_Detection_ResNet50_TuSimple_DL.json │ │ ├── 38_Object_Tracking_Siamese_OTB50_DL.json │ │ ├── 39_Drug_Response_Prediction_SVM_GDSC_ML.json │ │ ├── 40_Text_Summarization_BART_CNNDailyMail_DL.json │ │ ├── 41_Stock_Classification_KNN_YahooFinance_ML.json │ │ ├── 42_Medical_Image_Classification_DenseNet121_ChestXray_DL.json │ │ ├── 43_Social_Network_Analysis_GCN_Cora_ML.json │ │ ├── 44_Text_Classification_BERT_AGNews_DL.json │ │ ├── 45_Product_Recommendation_MatrixFactorization_AmazonReviews_ML.json │ │ ├── 46_Speech_Recognition_DeepSpeech_LibriSpeech_DL.json │ │ ├── 47_Network_Traffic_Analysis_KMeans_NetworkTraffic_ML.json │ │ ├── 48_Stock_Trading_Simulation_PPO_HistoricalData_RL.json │ │ ├── 49_Explainable_AI_LIME_Titanic_ML.json │ │ ├── 50_Math_Problem_Solving_Transformer_DeepMindMath_DL.json │ │ ├── 51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json │ │ ├── 52_Devin_AI_Trains_an_AI.json │ │ ├── 53_Devin_Upwork_Side_Hustle.json │ │ ├── 54_Mock_OpenAI_API_Response_Analyzer_App.json │ │ └── 55_SQLite_Database_Viewer_and_Analyzer_App.json ├── trajectories │ └── OpenHands │ │ └── 39_Drug_Response_Prediction_SVM_GDSC_ML.json └── workspaces │ └── OpenHands │ └── 39_Drug_Response_Prediction_SVM_GDSC_ML │ ├── gdsc_dataset.csv │ ├── results │ ├── drug_response_prediction_report.md │ ├── drug_response_prediction_report.pdf │ ├── performance.txt │ └── rmse_scores.png │ └── src │ ├── data_loader.py │ ├── model.py │ └── train.py ├── 
poetry.lock ├── pyproject.toml └── scripts ├── README.md ├── run_aaaj.py ├── run_ask.py ├── run_statistics.py ├── run_wiki.py └── templates └── html └── index.html /.config/pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.0.1 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: check-yaml 8 | - id: check-json 9 | - repo: https://github.com/pre-commit/mirrors-mypy 10 | rev: v0.910 11 | hooks: 12 | - id: mypy 13 | additional_dependencies: ['types-termcolor'] 14 | language: python 15 | entry: poetry run mypy 16 | -------------------------------------------------------------------------------- /.env.sample: -------------------------------------------------------------------------------- 1 | DEFAULT_LLM="gpt-4o-2024-08-06" 2 | OPENAI_API_KEY="sk-***" 3 | PROJECT_DIR="{PATH_TO_THIS_PROJECT}" -------------------------------------------------------------------------------- /.github/.codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | notify: 3 | wait_for_ci: true 4 | 5 | coverage: 6 | status: 7 | patch: 8 | default: 9 | threshold: 100% 10 | project: 11 | default: 12 | threshold: 5% 13 | comment: false 14 | github_checks: 15 | annotations: false 16 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | open-pull-requests-limit: 5 8 | assignees: 9 | - mczhuge 10 | labels: 11 | - "dependencies" 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 metauto.ai 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /agent_as_a_judge/__init__.py: -------------------------------------------------------------------------------- 1 | from .llm.provider import LLM 2 | from .llm.cost import Cost 3 | 4 | __all__ = ["LLM", "Cost"] 5 | -------------------------------------------------------------------------------- /agent_as_a_judge/config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import List, Optional 3 | from pathlib import Path 4 | 5 | 6 | @dataclass 7 | class AgentConfig: 8 | include_dirs: Optional[List[str]] = None 9 | exclude_dirs: Optional[List[str]] = None 10 | exclude_files: Optional[List[str]] = None 11 | setting: str = "gray_box" 12 | planning: str = "efficient (no planning)" 13 | judge_dir: Optional[Path] = None 14 | workspace_dir: Optional[Path] = None 15 | instance_dir: Optional[Path] = None 16 | trajectory_file: Optional[Path] = None 17 | 18 | @classmethod 19 | def from_args(cls, args): 20 | 21 | return cls( 22 | include_dirs=( 23 | args.include_dirs 24 | if getattr(args, "include_dirs", None) is not None 25 | else ["src", "results", "models", "data"] 26 | ), 27 | exclude_dirs=( 28 | args.exclude_dirs 29 | if getattr(args, "exclude_dirs", None) is not None 30 | else ["__pycache__", "env"] 31 | ), 32 | exclude_files=( 33 | args.exclude_files if getattr(args, "exclude_files", None) is not None else [".DS_Store"] 34 | ), 35 | setting=args.setting, 36 | planning=args.planning, 37 | judge_dir=Path(args.judge_dir), 38 | workspace_dir=Path(args.workspace_dir), 39 | instance_dir=Path(args.instance_dir), 40 | trajectory_file=( 41 | Path(args.trajectory_file) if args.trajectory_file else None 42 | ), 43 | ) 44 | -------------------------------------------------------------------------------- /agent_as_a_judge/llm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/agent_as_a_judge/llm/__init__.py -------------------------------------------------------------------------------- /agent_as_a_judge/llm/cost.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | 5 | class Cost: 6 | 7 | def __init__(self) -> None: 8 | self._accumulated_cost: float = 0.0 9 | self._costs: list[float] = [] 10 | 11 | @property 12 | def accumulated_cost(self) -> float: 13 | return self._accumulated_cost 14 | 15 | @accumulated_cost.setter 16 | def accumulated_cost(self, value: float) -> None: 17 | if value < 0: 18 | raise ValueError("Total cost cannot be negative.") 19 | self._accumulated_cost = value 20 | 21 | @property 22 | def costs(self) -> list: 23 | return self._costs 24 | 25 | def add_cost(self, value: float) -> None: 26 | if value < 0: 27 | raise ValueError("Added cost cannot be negative.") 28 | self._accumulated_cost += value 29 | self._costs.append(value) 30 | 31 | def get(self): 32 | return {"accumulated_cost": self._accumulated_cost, "costs": self._costs} 33 | 34 | def log(self): 35 | cost = self.get() 36 | logs = "" 37 | for key, value in cost.items(): 38 | logs += f"{key}: {value}\n" 39 | return logs 40 | -------------------------------------------------------------------------------- /agent_as_a_judge/module/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/agent_as_a_judge/module/__init__.py -------------------------------------------------------------------------------- /agent_as_a_judge/module/memory.py: -------------------------------------------------------------------------------- 1 | """ 2 | Memory module to store and retrieve historical judgments. 3 | """ 4 | 5 | import os 6 | import logging 7 | import json 8 | from pathlib import Path 9 | 10 | 11 | class Memory: 12 | def __init__(self, memory_file: Path = None): 13 | 14 | self.judgments = [] 15 | self.memory_file = memory_file 16 | 17 | def save_to_file(self): 18 | if not self.memory_file: 19 | logging.error("No memory file provided.") 20 | return 21 | 22 | try: 23 | with open(self.memory_file, "w") as file: 24 | json.dump({"judge_stats": self.judgments}, file, indent=4) 25 | logging.info( 26 | f"Saved {len(self.judgments)} judgments to file '{self.memory_file}'." 27 | ) 28 | except Exception as e: 29 | logging.error(f"Failed to save judgments to file '{self.memory_file}': {e}") 30 | 31 | def add_judgment(self, criteria: str, satisfied: bool, reason: list): 32 | new_judgment = {"criteria": criteria, "satisfied": satisfied, "reason": reason} 33 | self.judgments.append(new_judgment) 34 | logging.debug( 35 | f"Added new judgment for criteria: '{criteria}', Satisfied: {satisfied}" 36 | ) 37 | 38 | def get_historical_evidence(self) -> str: 39 | 40 | if not os.path.exists(self.memory_file): 41 | logging.error(f"File '{self.memory_file}' not found.") 42 | return "No historical judgments available." 43 | 44 | with open(self.memory_file, "r") as file: 45 | data = json.load(file) 46 | self.judgments = data.get("judge_stats", []) 47 | logging.info( 48 | f"Loaded {len(self.judgments)} judgments from file '{self.memory_file}'." 49 | ) 50 | 51 | if not self.judgments: 52 | logging.warning("No historical judgments available.") 53 | return "No historical judgments available."
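# NOTE: each stored judgment is rendered below as a numbered, human-readable block by `_format_judgment`, and the blocks are joined into a single evidence string for downstream prompts.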
54 | 55 | historical_evidence = "\n".join( 56 | self._format_judgment(i, judgment) 57 | for i, judgment in enumerate(self.judgments, 1) 58 | ) 59 | logging.info(f"Retrieved {len(self.judgments)} historical judgments.") 60 | return historical_evidence 61 | 62 | @staticmethod 63 | def _format_judgment(index: int, judgment: dict) -> str: 64 | criteria = judgment.get("criteria", "No criteria available") 65 | satisfied = "Yes" if judgment.get("satisfied") else "No" 66 | 67 | # "reason" may sit at the top level (see `add_judgment`) or under "llm_stats" in saved judge stats. 68 | reasons = judgment.get("reason") or judgment.get("llm_stats", {}).get("reason", []) 69 | 70 | if isinstance(reasons, list): 71 | formatted_reasons = ( 72 | "\n ".join(reasons) if reasons else "No reasoning provided" 73 | ) 74 | else: 75 | formatted_reasons = reasons if reasons else "No reasoning provided" 76 | 77 | output = ( 78 | f"\n{'-'*50}" 79 | f"\nRequirement {index}:" 80 | f"\n{'-'*50}" 81 | f"\nCriteria : {criteria}" 82 | f"\nSatisfied : {satisfied}" 83 | f"\nReason :\n {formatted_reasons}" 84 | f"\n{'-'*50}\n" 85 | ) 86 | 87 | return output 88 | -------------------------------------------------------------------------------- /agent_as_a_judge/module/planning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import time 4 | import logging 5 | from agent_as_a_judge.llm.provider import LLM 6 | from dotenv import load_dotenv 7 | from rich.logging import RichHandler 8 | from agent_as_a_judge.module.prompt.system_prompt_planning import ( 9 | get_planning_system_prompt, 10 | ) 11 | from agent_as_a_judge.module.prompt.prompt_planning import get_planning_prompt 12 | 13 | logging.basicConfig( 14 | level=logging.INFO, 15 | format="%(asctime)s - %(levelname)s - %(message)s", 16 | handlers=[RichHandler()], 17 | ) 18 | load_dotenv() 19 | 20 | 21 | class Planning: 22 | def __init__(self): 23 | self.llm = LLM( 24 | model=os.getenv("DEFAULT_LLM"), api_key=os.getenv("OPENAI_API_KEY") 25 | ) 26 | 27 | def generate_plan(self, criteria: str) -> dict: 28 | system_prompt = get_planning_system_prompt("English") 29 | user_prompt = get_planning_prompt(criteria) 30 | 31 | messages = [ 32 | {"role": "system", "content": system_prompt}, 33 | {"role": "user", "content": user_prompt}, 34 | ] 35 | 36 | start_time = time.time() 37 | llm_stats = self._llm_inference(messages) 38 | llm_stats["inference_time"] = time.time() - start_time 39 | actions = self.parse_plan(llm_stats["llm_response"]) 40 | 41 | return {"actions": actions, "llm_stats": llm_stats} 42 | 43 | def parse_plan(self, plan: str) -> list: 44 | actions = [] 45 | action_patterns = { 46 | "user_query": r"\[User Query\]", 47 | "workspace": r"\[Workspace\]", 48 | "locate": r"\[Locate\]", 49 | "read": r"\[Read\]", 50 | "search": r"\[Search\]", 51 | "history": r"\[History\]", 52 | "trajectory": r"\[Trajectory\]", 53 | } 54 | 55 | for line in plan.splitlines(): 56 | for action, pattern in action_patterns.items(): 57 | if re.search(pattern, line, re.IGNORECASE): 58 | actions.append(action) 59 | break 60 | 61 | return actions 62 | 63 | def _llm_inference(self, messages: list) -> dict: 64 | 65 | response, cost, accumulated_cost = self.llm.do_completion( 66 | messages=messages, temperature=0.0 67 | ) 68 | 69 | llm_response = response.choices[0].message["content"] 70 | input_token = response.usage.prompt_tokens 71 | output_token = response.usage.completion_tokens 72 | 73 | return { 74 | "llm_response": llm_response, 75 | "input_tokens": input_token, 76 | "output_tokens": output_token, 77 | "cost": cost, 78 | # "accumulated_cost":
accumulated_cost 79 | } 80 | -------------------------------------------------------------------------------- /agent_as_a_judge/module/prompt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/agent_as_a_judge/module/prompt/__init__.py -------------------------------------------------------------------------------- /agent_as_a_judge/module/prompt/prompt_ask.py: -------------------------------------------------------------------------------- 1 | def get_ask_prompt(question: str, evidence: str) -> str: 2 | 3 | return f""" 4 | Provided below is relevant information about the project or context: 5 | {evidence} 6 | 7 | Kindly respond to the following user input: 8 | {question} 9 | 10 | As per the guidelines, provide a comprehensive answer referencing specific elements from the provided information where applicable. 11 | """ 12 | -------------------------------------------------------------------------------- /agent_as_a_judge/module/prompt/prompt_judge.py: -------------------------------------------------------------------------------- 1 | def get_judge_prompt(criteria: str, evidence: str) -> str: 2 | 3 | return f""" 4 | Provided below is relevant information about the project: 5 | {evidence} 6 | 7 | Kindly perform an evaluation of the following criteria: 8 | {criteria} 9 | 10 | As per the guidelines, respond with either <SATISFIED> or <UNSATISFIED>, followed by a concise justification that references specific elements from the project information, such as code snippets, data samples, or output results. 11 | """ 12 | -------------------------------------------------------------------------------- /agent_as_a_judge/module/prompt/prompt_locate.py: -------------------------------------------------------------------------------- 1 | def get_prompt_locate(criteria: str, workspace_info: str) -> str: 2 | 3 | demonstration = """ 4 | Example: 5 | Suppose the criteria is: 6 | 'The database functionality is implemented in `src/db.py`, and the logging system is defined in `src/logging.py`.' 7 | 8 | And the workspace information is: 9 | /project 10 | ├── src 11 | │ ├── db.py 12 | │ ├── logging.py 13 | │ ├── utils.py 14 | └── tests 15 | ├── test_db.py 16 | └── test_logging.py 17 | 18 | Based on the criteria, the following paths (no more than 5) should be returned, each wrapped in dollar signs (`$`): 19 | $/project/src/db.py$ 20 | $/project/src/logging.py$ 21 | """ 22 | 23 | return f""" 24 | Provided below is the structure of the workspace: 25 | {workspace_info} 26 | 27 | This is the criteria related to the task: 28 | {criteria} 29 | 30 | Follow the format in the example below and return only the file paths that match the criteria: 31 | {demonstration} 32 | """ 33 | -------------------------------------------------------------------------------- /agent_as_a_judge/module/prompt/prompt_planning.py: -------------------------------------------------------------------------------- 1 | def get_planning_prompt(criteria: str) -> str: 2 | """ 3 | Returns the LLM prompt to generate a step-by-step plan for evaluating or resolving the given criteria. 4 | The prompt includes demonstrations to guide the LLM in creating effective plans without repeating the action descriptions. 5 | """ 6 | return f""" 7 | You are tasked with generating a list of actions to evaluate or resolve the following requirement.
8 | Select only the necessary actions and arrange them in a logical order to systematically collect evidence and verify whether the requirement is satisfied. 9 | 10 | Requirement: "{criteria}" 11 | 12 | Here are some examples of how to create a plan: 13 | 14 | Example 1: 15 | Requirement: "The system must generate a summary report saved as `output/report.txt`." 16 | Plan: 17 | - [Locate]: Locate the `output/report.txt` file in the workspace. 18 | - [Read]: Read the contents of the `report.txt` file to verify it contains the summary report. 19 | - [Search]: Search the codebase for any functions or methods responsible for generating `report.txt`. 20 | 21 | Example 2: 22 | Requirement: "The machine learning model must be trained and saved as `results/model.pkl`." 23 | Plan: 24 | - [Locate]: Locate `results/model.pkl` in the workspace. 25 | - [Search]: Search for the model training code in the source files. 26 | - [Read]: Read the model training code to verify it aligns with the specified requirement. 27 | - [Trajectory]: Analyze the historical development of the model training process to understand any prior modifications. 28 | 29 | Now, generate a step-by-step plan for the following requirement: 30 | 31 | Requirement: "{criteria}" 32 | 33 | Response: 34 | """ 35 | -------------------------------------------------------------------------------- /agent_as_a_judge/module/prompt/prompt_retrieve.py: -------------------------------------------------------------------------------- 1 | def get_text_retrieve_prompt(criteria: str, long_context: str) -> str: 2 | 3 | return f""" 4 | Below is a log of actions, steps, and file operations: 5 | {long_context} 6 | 7 | Summarize concise evidence directly related to the following criteria: 8 | {criteria} 9 | 10 | Focus on the last one or two mentions of relevant files or actions. Since I can check the files locally, omit file existence and content details. Provide a brief analysis of the latest status of relevant files or functions. Exclude irrelevant information. 11 | """ 12 | -------------------------------------------------------------------------------- /agent_as_a_judge/module/prompt/system_prompt_ask.py: -------------------------------------------------------------------------------- 1 | def get_ask_system_prompt(language="English"): 2 | 3 | if language == "English": 4 | return """ 5 | You are a knowledgeable assistant capable of answering user queries clearly and accurately. 6 | Your goal is to respond to the user input provided, using relevant project information and context where necessary. 7 | """ 8 | else: 9 | raise NotImplementedError(f"The language '{language}' is not supported.") 10 | -------------------------------------------------------------------------------- /agent_as_a_judge/module/prompt/system_prompt_judge.py: -------------------------------------------------------------------------------- 1 | def get_judge_system_prompt(language="English"): 2 | 3 | if language == "English": 4 | return """ 5 | You are an advanced AI system serving as an impartial judge for intelligent code generation outputs. Your primary role is to rigorously evaluate whether the agent's outputs satisfy the specified requirements by thoroughly analyzing the provided code, data, and other relevant materials. 6 | 7 | You will systematically assess aspects such as datasets, model implementations, training procedures, and any task-specific criteria outlined in the requirements. Your evaluations must be objective, detailed, and based solely on the evidence provided. 
8 | 9 | For each requirement, deliver one of the following judgments: 10 | 11 | 1. <SATISFIED>: Use this if the agent's output fully meets the requirement. Provide a brief and precise explanation demonstrating how the specific criteria are fulfilled. 12 | 13 | 2. <UNSATISFIED>: Use this if the agent's output does not meet the requirement. Provide a concise explanation indicating the deficiencies or omissions. 14 | 15 | Your assessment should reference specific elements such as code snippets, data samples, or output results where appropriate. Ensure that your justifications are clear, precise, and directly related to the criteria. 16 | 17 | Respond with either <SATISFIED> or <UNSATISFIED>, followed by your concise justification. 18 | """ 19 | 20 | else: 21 | raise NotImplementedError(f"The language '{language}' is not supported.") 22 | -------------------------------------------------------------------------------- /agent_as_a_judge/module/prompt/system_prompt_locate.py: -------------------------------------------------------------------------------- 1 | def get_system_prompt_locate(language="English"): 2 | 3 | if language == "English": 4 | return """ 5 | You are an advanced AI system specializing in understanding project structures and determining file locations based on provided criteria. 6 | Your task is to locate specific files in the workspace based on the user's criteria and workspace information. 7 | """ 8 | else: 9 | raise NotImplementedError(f"The language '{language}' is not supported.") 10 | -------------------------------------------------------------------------------- /agent_as_a_judge/module/prompt/system_prompt_planning.py: -------------------------------------------------------------------------------- 1 | def get_planning_system_prompt(language="English"): 2 | 3 | if language == "English": 4 | return """ 5 | You are an advanced AI system tasked with generating a step-by-step plan to help verify whether a project's outputs meet the specified requirements. 6 | Your goal is to generate a series of actions that systematically gather evidence from various sources, such as code, documentation, history, or data, to assess whether the requirement is fully satisfied. 7 | 8 | The actions you can choose from are listed below. Select the necessary actions based on the requirement and arrange them in a logical order: 9 | 10 | - [User Query]: Use the user's original query to provide context and understand the requirement. 11 | - [Workspace]: Analyze the overall workspace structure to understand the project’s components and dependencies. 12 | - [Locate]: Locate specific files or directories in the workspace that may contain relevant information or code. 13 | - [Read]: Read and examine the contents of files to verify their correctness and relevance to the requirement. 14 | - [Search]: Search for relevant code snippets, functions, or variables related to the requirement. 15 | - [History]: Refer to previous judgments, evaluations, or decisions made in earlier iterations or related projects. 16 | - [Trajectory]: Analyze the historical development or decision-making trajectory of the project, including previous changes or iterations that impacted the current state. 17 | 18 | Your task is to select and order the necessary actions that will systematically collect evidence to allow for a thorough evaluation of the requirement.
19 | """ 20 | else: 21 | raise NotImplementedError(f"The language '{language}' is not supported.") 22 | -------------------------------------------------------------------------------- /agent_as_a_judge/module/prompt/system_prompt_retrieve.py: -------------------------------------------------------------------------------- 1 | def get_retrieve_system_prompt(language="English"): 2 | if language == "English": 3 | return """ 4 | You are an advanced AI system specializing in retrieving environmental feedback from project execution trajectories. Your task is to analyze the provided trajectory data and extract information about the most relevant files mentioned in the given criteria. 5 | 6 | Focus on the following: 7 | 8 | 1. Identify the **most recent steps** where the files directly related to the criteria were involved in execution, loading, or saving operations. 9 | 2. Provide environmental feedback for these files, such as any errors, warnings, or issues encountered during their execution or processing. 10 | 3. Highlight whether any problems occurred that might affect the functionality or success of these files in the project. 11 | 12 | Your output should be structured as follows: 13 | 14 | - ****: List the specific steps involving the relevant files, including any environmental feedback such as error messages, execution results, or other issues encountered. Each step should concisely present the key information needed to assess the files' execution status. 15 | 16 | Avoid including details about file contents or existence, as this information is already available. Focus solely on the environmental feedback related to the execution of the most relevant files. 17 | 18 | Your goal is to provide clear and concise information that helps determine if there were any execution problems with the files mentioned in the criteria. 
19 | """ 20 | else: 21 | raise NotImplementedError(f"The language '{language}' is not supported.") 22 | -------------------------------------------------------------------------------- /agent_as_a_judge/module/statistics.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from typing import List 4 | from rich.logging import RichHandler 5 | from agent_as_a_judge.module.graph import DevGraph 6 | 7 | logging.basicConfig( 8 | level=logging.INFO, 9 | format="%(asctime)s - %(levelname)s - %(message)s", 10 | handlers=[RichHandler()], 11 | ) 12 | 13 | 14 | class DevStatistics: 15 | 16 | def __init__(self, workspace: Path): 17 | self.workspace = workspace 18 | 19 | def count_lines_of_code(self, filepaths: List[Path]) -> (int, int): 20 | 21 | total_lines = 0 22 | total_files = 0 23 | 24 | for filepath in filepaths: 25 | try: 26 | with open(filepath, "r", encoding="utf-8") as f: 27 | lines = f.readlines() 28 | total_lines += len(lines) 29 | total_files += 1 30 | except Exception as e: 31 | logging.warning(f"Failed to process file {filepath}: {e}") 32 | 33 | return total_lines, total_files 34 | 35 | def calculate_statistics(self): 36 | 37 | if self.workspace.exists(): 38 | logging.info(f"Processing workspace: {self.workspace.stem}") 39 | 40 | dev_graph = DevGraph( 41 | root=str(self.workspace), 42 | include_dirs=["src", "results", "models"], 43 | exclude_dirs=["__pycache__", "env"], 44 | exclude_files=[".DS_Store"], 45 | ) 46 | 47 | py_files = dev_graph.list_py_files([self.workspace]) 48 | all_files = dev_graph.list_all_files(self.workspace) 49 | lines_in_workspace, files_in_workspace = self.count_lines_of_code(py_files) 50 | total_files_in_workspace = len(all_files) 51 | total_non_code_files_in_workspace = ( 52 | total_files_in_workspace - files_in_workspace 53 | ) 54 | 55 | logging.info(f" Total files: {total_files_in_workspace}") 56 | logging.info(f" Non-Python files: {total_non_code_files_in_workspace}") 57 | logging.info(f" Python files: {files_in_workspace}") 58 | logging.info(f" Lines of Python code: {lines_in_workspace}") 59 | 60 | return ( 61 | total_files_in_workspace, 62 | total_non_code_files_in_workspace, 63 | files_in_workspace, 64 | lines_in_workspace, 65 | ) 66 | 67 | else: 68 | logging.warning( 69 | f"Workspace '{self.workspace.stem}' does not exist. Skipping..." 
70 | ) 71 | return 0, 0, 0, 0 72 | -------------------------------------------------------------------------------- /agent_as_a_judge/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from agent_as_a_judge.utils.truncate import truncate_string 2 | from agent_as_a_judge.utils.count_lines import count_lines_of_code 3 | 4 | 5 | __all__ = ["truncate_string", "count_lines_of_code"] 6 | -------------------------------------------------------------------------------- /agent_as_a_judge/utils/count_lines.py: -------------------------------------------------------------------------------- 1 | def count_lines_of_code(filepaths): 2 | 3 | total_lines = 0 4 | total_files = 0 5 | for filepath in filepaths: 6 | try: 7 | with open(filepath, "r", encoding="utf-8") as f: 8 | lines = f.readlines() 9 | total_lines += len(lines) 10 | total_files += 1 11 | except (OSError, UnicodeDecodeError): 12 | # Skip files that cannot be opened or decoded instead of aborting the whole count. 13 | continue 14 | return total_lines, total_files 15 | -------------------------------------------------------------------------------- /agent_as_a_judge/utils/truncate.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from typing import Union 4 | import tiktoken 5 | from dotenv import load_dotenv 6 | 7 | load_dotenv() 8 | 9 | 10 | def truncate_string( 11 | info_string: Union[str, None], 12 | model: str = os.getenv("DEFAULT_LLM"), 13 | max_tokens: int = 10000, 14 | drop_mode: str = "middle", 15 | ) -> str: 16 | 17 | if info_string is None: 18 | logging.warning( 19 | "Received None input for truncation. Returning an empty string." 20 | ) 21 | return "" 22 | 23 | info_string = str(info_string) 24 | 25 | try: 26 | encoding = tiktoken.encoding_for_model(model) 27 | except (KeyError, AttributeError): 28 | # Fall back to cl100k_base (used by gpt-4) if the model is unknown, or None because DEFAULT_LLM is unset 29 | logging.warning(f"Model {model} not found in tiktoken. Using cl100k_base encoding instead.") 30 | encoding = tiktoken.get_encoding("cl100k_base") 31 | 32 | tokens = encoding.encode(info_string, disallowed_special=()) 33 | 34 | # If tokens exceed the maximum length, we truncate based on the drop_mode 35 | if len(tokens) > max_tokens: 36 | # logging.warning(f"Input string exceeds maximum token limit ({max_tokens}). Truncating using {drop_mode} mode.") 37 | ellipsis = encoding.encode("...") 38 | ellipsis_len = len(ellipsis) 39 | 40 | if drop_mode == "head": 41 | tokens = ellipsis + tokens[-(max_tokens - ellipsis_len) :] 42 | elif drop_mode == "middle": 43 | head_tokens = (max_tokens - ellipsis_len) // 2 44 | tail_tokens = max_tokens - head_tokens - ellipsis_len 45 | tokens = tokens[:head_tokens] + ellipsis + tokens[-tail_tokens:] 46 | elif drop_mode == "tail": 47 | tokens = tokens[: (max_tokens - ellipsis_len)] + ellipsis 48 | 49 | else: 50 | raise ValueError( 51 | f"Unknown drop_mode: {drop_mode}. Supported modes: 'head', 'middle', 'tail'."
52 | ) 53 | 54 | return encoding.decode(tokens) 55 | -------------------------------------------------------------------------------- /assets/aaaj_logo_v6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/assets/aaaj_logo_v6.png -------------------------------------------------------------------------------- /assets/aaaj_logo_v7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/assets/aaaj_logo_v7.png -------------------------------------------------------------------------------- /assets/dataset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/assets/dataset.png -------------------------------------------------------------------------------- /assets/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/assets/demo.gif -------------------------------------------------------------------------------- /assets/devai_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/assets/devai_logo.png -------------------------------------------------------------------------------- /assets/judge_first.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/assets/judge_first.png -------------------------------------------------------------------------------- /assets/openwiki_1a.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/assets/openwiki_1a.jpeg -------------------------------------------------------------------------------- /assets/openwiki_1b.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/assets/openwiki_1b.jpeg -------------------------------------------------------------------------------- /assets/sample.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/assets/sample.jpeg -------------------------------------------------------------------------------- /benchmark/devai/constraints.json: -------------------------------------------------------------------------------- 1 | { 2 | "generic": "This is a task that requires you to write, execute, and save source code. You have a hard time limit of 30 minutes to produce your programmatic solution to the given task. This time limit includes execution time. The quality of your solution will be judged based on what you left in the working folder by the time 30 minutes expire. 
Additionally, the hardware you are running on is unknown, and the presence of a GPU is not guaranteed.", 3 | "is_training_needed": "Keep the time limit in mind when setting hyperparameters for training.", 4 | "is_kaggle_api_needed": "You can use the Kaggle API credentials stored in `kaggle.json` in your current working directory." 5 | } 6 | -------------------------------------------------------------------------------- /benchmark/devai/instances/02_Maze_Solver_Q_Learning_Gridworld_RL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "02_Maze_Solver_Q_Learning_Gridworld_RL", 3 | "query": "Can you help me create a system to solve maze-style Gridworld tasks using the Q-learning algorithm? The system should use numpy to make the core calculations more efficient and matplotlib for visualizations. The Q-learning algorithm should be implemented in `src/train.py`, and the aptly-named Gridworld environment should be implemented in `src/env.py` in such a way that one could specific the grid size and start/end positions when instantiating it. The system needs to record the learning curve during training, tracking episodes and their corresponding returns, and save it as `results/figures/learning_curve.png`. Additionally, I'd like you to visualize and save the paths taken by the agent in each episode in a file called `results/figures/path_changes.gif`, and save the trained model as `models/saved_models/q_learning_model.npy`. It would be great to have some form of real-time feedback during training, like seeing the progress or getting updates on how the model is learning. Also, if you can, please try and write the code in a way that's easy to modify or extend later on.", 4 | "tags": [ 5 | "Reinforcement Learning" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The \"Q-learning\" algorithm is used in `src/train.py`.", 12 | "category": "Machine Learning Method", 13 | "satisfied": null 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [], 18 | "criteria": "The \"Gridworld\" environment is defined in `src/env.py` with the ability for a user to specify a grid size and start/end positions.", 19 | "category": "Dataset or Environment", 20 | "satisfied": null 21 | }, 22 | { 23 | "requirement_id": 2, 24 | "prerequisites": [ 25 | 0, 26 | 1 27 | ], 28 | "criteria": "Learning curves are recorded during training, and saved as `results/figures/learning_curve.png`. 
Episodes and returns are recorded.", 29 | "category": "Visualization", 30 | "satisfied": null 31 | }, 32 | { 33 | "requirement_id": 3, 34 | "prerequisites": [ 35 | 0, 36 | 1, 37 | 2 38 | ], 39 | "criteria": "The learned model is saved as `models/saved_models/q_learning_model.npy`.", 40 | "category": "Save Trained Model", 41 | "satisfied": null 42 | }, 43 | { 44 | "requirement_id": 4, 45 | "prerequisites": [ 46 | 0, 47 | 1 48 | ], 49 | "criteria": "Paths taken during learning are visualized and saved as `results/figures/path_changes.gif`.", 50 | "category": "Visualization", 51 | "satisfied": null 52 | } 53 | ], 54 | "preferences": [ 55 | { 56 | "preference_id": 0, 57 | "criteria": "Some real-time progress or feedback during the training process should be displayed.", 58 | "satisfied": null 59 | }, 60 | { 61 | "preference_id": 1, 62 | "criteria": "The code should be written in a way that's easy to modify or extend later on.", 63 | "satisfied": null 64 | } 65 | ], 66 | "is_kaggle_api_needed": false, 67 | "is_training_needed": true, 68 | "is_web_navigation_needed": false 69 | } -------------------------------------------------------------------------------- /benchmark/devai/instances/03_Text_Classification_NaiveBayes_20Newsgroups_ML.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "03_Text_Classification_NaiveBayes_20Newsgroups_ML", 3 | "query": "Please implement a Naive Bayes classifier for the 20 Newsgroups dataset and save it in a file called `src/model.py`. The dataset should loaded in `src/data_loader.py`. The program should handle data preprocessing, including removing stop words, punctuation, and special characters. Show the improvement of your classifier by generating word clouds before and after training your classifier and saving them as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`. Please calculate and include TF-IDF features when loading the data in `src/data_loader.py`. Lastly, print out a performance report (including precision, recall, and F1-score) and save it as `results/metrics/performance.txt`. The model should be straightforward to interpret, and the final report should be structured clearly for easy review.", 4 | "tags": [ 5 | "Classification", 6 | "Natural Language Processing", 7 | "Supervised Learning" 8 | ], 9 | "requirements": [ 10 | { 11 | "requirement_id": 0, 12 | "prerequisites": [], 13 | "criteria": "The \"20 Newsgroups\" dataset is used in `src/data_loader.py`.", 14 | "category": "Dataset or Environment", 15 | "satisfied": null 16 | }, 17 | { 18 | "requirement_id": 1, 19 | "prerequisites": [ 20 | 0 21 | ], 22 | "criteria": "Data preprocessing is performed, including removing stop words, punctuation, and special characters. 
Word clouds are visualized before and after training the classifier, and saved as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`.", 23 | "category": "Data preprocessing and postprocessing", 24 | "satisfied": null 25 | }, 26 | { 27 | "requirement_id": 2, 28 | "prerequisites": [ 29 | 0, 30 | 1 31 | ], 32 | "criteria": "\"TF-IDF\" features are used when loading the data in `src/data_loader.py`.", 33 | "category": "Data preprocessing and postprocessing", 34 | "satisfied": null 35 | }, 36 | { 37 | "requirement_id": 3, 38 | "prerequisites": [], 39 | "criteria": "A \"Naive Bayes classifier\" is implemented in `src/model.py`.", 40 | "category": "Machine Learning Method", 41 | "satisfied": null 42 | }, 43 | { 44 | "requirement_id": 4, 45 | "prerequisites": [ 46 | 0, 47 | 1, 48 | 2 49 | ], 50 | "criteria": "A performance report, including \"precision,\" \"recall,\" and the \"F1-score,\" is printed and saved as `results/metrics/performance.txt`.", 51 | "category": "Performance Metrics", 52 | "satisfied": null 53 | } 54 | ], 55 | "preferences": [ 56 | { 57 | "preference_id": 0, 58 | "criteria": "The model should be straightforward to interpret.", 59 | "satisfied": null 60 | }, 61 | { 62 | "preference_id": 1, 63 | "criteria": "The final report should be structured clearly for easy review.", 64 | "satisfied": null 65 | } 66 | ], 67 | "is_kaggle_api_needed": false, 68 | "is_training_needed": true, 69 | "is_web_navigation_needed": false, 70 | "hint": "In the query, there is a missing word \"be\" after the word \"should\" in \"The dataset should loaded in `src/data_loader.py`\"." 71 | } -------------------------------------------------------------------------------- /benchmark/devai/instances/05_Game_Simulation_DQN_CartPole_v1_RL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "05_Game_Simulation_DQN_CartPole_v1_RL", 3 | "query": "Create a reliable system to train a DQN agent using PyTorch in an OpenAI Gym environment such as CartPole-v1. Implement DQN using PyTorch in `src/model.py`. The environment should be instantiated in the main file, `src/main.py`. Handle any dimension mismatches between DQN and the environment (like would happen if the Gym environment ID were changed) by logging an error. Also please log the return after each episode to `results/metrics/` and update it after each new episode finishes. Save the final return over episode curve as `results/figures/return_over_time.png` and the final trained model as `models/saved_models/dqn_model.pt`. 
If possible, add annotations to the return over episode curve to showcase key milestones.", 4 | "tags": [ 5 | "Reinforcement Learning" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The \"DQN\" algorithm is implemented using PyTorch and saved in `src/model.py`.", 12 | "category": "Machine Learning Method", 13 | "satisfied": null 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [], 18 | "criteria": "An \"OpenAI Gym\" environment is instantiated in `src/main.py`.", 19 | "category": "Dataset or Environment", 20 | "satisfied": null 21 | }, 22 | { 23 | "requirement_id": 2, 24 | "prerequisites": [ 25 | 0, 26 | 1 27 | ], 28 | "criteria": "The return over time curve is plotted, recording the return of each episode, and saved as `results/figures/reward_over_time.png`.", 29 | "category": "Visualization", 30 | "satisfied": null 31 | }, 32 | { 33 | "requirement_id": 3, 34 | "prerequisites": [ 35 | 0, 36 | 1 37 | ], 38 | "criteria": "The trained model is saved as `models/saved_models/dqn_model.pt`.", 39 | "category": "Save Trained Model", 40 | "satisfied": null 41 | }, 42 | { 43 | "requirement_id": 4, 44 | "prerequisites": [ 45 | 0, 46 | 1 47 | ], 48 | "criteria": "The return for each episode is logged to `results/metrics/` and updated after each episode finishes.", 49 | "category": "Performance Metrics", 50 | "satisfied": null 51 | } 52 | ], 53 | "preferences": [ 54 | { 55 | "preference_id": 0, 56 | "criteria": "The system should handle dimension mismatches, logging the issues for easy debugging.", 57 | "satisfied": null 58 | }, 59 | { 60 | "preference_id": 1, 61 | "criteria": "The return over episode curve has key milestones annotated.", 62 | "satisfied": null 63 | } 64 | ], 65 | "is_kaggle_api_needed": false, 66 | "is_training_needed": true, 67 | "is_web_navigation_needed": false 68 | } -------------------------------------------------------------------------------- /benchmark/devai/instances/06_Sentiment_Analysis_SVM_Sentiment140_ML.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "06_Sentiment_Analysis_SVM_Sentiment140_ML", 3 | "query": "Please help me build a system for sentiment analysis on tweets using the Sentiment140 dataset available from Hugging Face. Load the Sentiment140 dataset and, when loading the data, clean it by removing stop words, punctuation, and special characters, all in `src/data_loader.py`. Use Word2Vec or GloVe for text vectorization. This should occur in `src/data_loader.py`. Next, implement and train an SVM classifier in `src/model.py`. Finally, write a report of the accuracy of the classifier to `results/metrics/accuracy_score.txt`. 
Ideally, the report should be easily interpretable.", 4 | "tags": [ 5 | "Natural Language Processing", 6 | "Supervised Learning" 7 | ], 8 | "requirements": [ 9 | { 10 | "requirement_id": 0, 11 | "prerequisites": [], 12 | "criteria": "The \"Sentiment140\" dataset, available from \"Hugging Face,\" is obtained in `src/data_loader.py`.", 13 | "category": "Dataset or Environment", 14 | "satisfied": null 15 | }, 16 | { 17 | "requirement_id": 1, 18 | "prerequisites": [ 19 | 0 20 | ], 21 | "criteria": "The dataset is cleaned, including by removing stop words, punctuation, and special characters, all in `src/data_loader.py`.", 22 | "category": "Data preprocessing and postprocessing", 23 | "satisfied": null 24 | }, 25 | { 26 | "requirement_id": 2, 27 | "prerequisites": [ 28 | 0, 29 | 1 30 | ], 31 | "criteria": "Word embeddings, either \"Word2Vec\" or \"GloVe,\" are used to convert text to vectors in `src/data_loader.py`.", 32 | "category": "Machine Learning Method", 33 | "satisfied": null 34 | }, 35 | { 36 | "requirement_id": 3, 37 | "prerequisites": [ 38 | 0, 39 | 1, 40 | 2 41 | ], 42 | "criteria": "An \"SVM classifier\" is implemented and trained in `src/model.py`.", 43 | "category": "Machine Learning Method", 44 | "satisfied": null 45 | }, 46 | { 47 | "requirement_id": 4, 48 | "prerequisites": [ 49 | 1, 50 | 2, 51 | 3 52 | ], 53 | "criteria": "The accuracy score is printed and saved as `results/metrics/accuracy_score.txt`.", 54 | "category": "Performance Metrics", 55 | "satisfied": null 56 | } 57 | ], 58 | "preferences": [ 59 | { 60 | "preference_id": 0, 61 | "criteria": "The dataset download process should be reliable, with clear error handling.", 62 | "satisfied": null 63 | }, 64 | { 65 | "preference_id": 1, 66 | "criteria": "The final accuracy report should be straightforward and easy to interpret.", 67 | "satisfied": null 68 | } 69 | ], 70 | "is_kaggle_api_needed": false, 71 | "is_training_needed": true, 72 | "is_web_navigation_needed": false 73 | } -------------------------------------------------------------------------------- /benchmark/devai/instances/07_Image_Super_Resolution_SRCNN_Set5_DL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "07_Image_Super_Resolution_SRCNN_Set5_DL", 3 | "query": "Hi, I need to create a project for image super-resolution using the SRCNN model with the Set5 dataset (available from `https://huggingface.co/datasets/eugenesiow/Set5`). Load the dataset in `src/data_loader.py`. When loading the data, include image preprocessing steps such as resizing and normalization, all in `src/data_loader.py`. The SRCNN model should be loaded and used in `src/model.py`. Save 5 sets of comparison images, zooming in on details, as `results/figures/super_resolution_compare.png`, and the super-resolution results as `results/figures/super_resolution_results.png`. 
The generated images should be high-quality and clearly show improvements.", 4 | "tags": [ 5 | "Computer Vision", 6 | "Generative Models" 7 | ], 8 | "requirements": [ 9 | { 10 | "requirement_id": 0, 11 | "prerequisites": [], 12 | "criteria": "The \"Set5\" dataset (available from \"Hugging Face\") is loaded in `src/data_loader.py`.", 13 | "category": "Dataset or Environment", 14 | "satisfied": null 15 | }, 16 | { 17 | "requirement_id": 1, 18 | "prerequisites": [ 19 | 0 20 | ], 21 | "criteria": "Image preprocessing, including resizing and normalization, is performed in `src/data_loader.py`.", 22 | "category": "Data preprocessing and postprocessing", 23 | "satisfied": null 24 | }, 25 | { 26 | "requirement_id": 2, 27 | "prerequisites": [], 28 | "criteria": "The \"SRCNN\" model is used in `src/model.py`.", 29 | "category": "Machine Learning Method", 30 | "satisfied": null 31 | }, 32 | { 33 | "requirement_id": 3, 34 | "prerequisites": [ 35 | 0, 36 | 1, 37 | 2 38 | ], 39 | "criteria": "Five sets of comparison images are saved, with details zoomed in, and saved as `results/figures/super_resolution_compare.png`.", 40 | "category": "Visualization", 41 | "satisfied": null 42 | }, 43 | { 44 | "requirement_id": 4, 45 | "prerequisites": [ 46 | 0, 47 | 1, 48 | 2 49 | ], 50 | "criteria": "Super-resolution results are saved as `results/figures/super_resolution_results.png`.", 51 | "category": "Visualization", 52 | "satisfied": null 53 | } 54 | ], 55 | "preferences": [ 56 | { 57 | "preference_id": 0, 58 | "criteria": "The project should generate high-quality, clear super-resolution images with detailed comparisons.", 59 | "satisfied": null 60 | }, 61 | { 62 | "preference_id": 1, 63 | "criteria": "Well-organized output images, highlighting key improvements, should be included.", 64 | "satisfied": null 65 | } 66 | ], 67 | "is_kaggle_api_needed": false, 68 | "is_training_needed": true, 69 | "is_web_navigation_needed": true 70 | } -------------------------------------------------------------------------------- /benchmark/devai/instances/08_Robot_Control_PPO_PyBullet_RL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "08_Robot_Control_PPO_PyBullet_RL", 3 | "query": "I am seeking to implement a project which explores robotic arm control via reinforcement learning in the PyBullet simulation environment with the PPO algorithm. The PyBullet simulator should be imported and a related robotics environment should be loaded in `src/env.py`. The PPO algorithm should be implemented in `src/train.py`. The project should meticulously document the robot's final position, printing and saving it as `data/final_position.txt`. The training return trajectory should be graphed and saved as `results/figures/training_returns.png`. A sample of the robot's motion should be visualized and saved as `results/figures/robot_motion.gif`. A detailed environment setup and reward structure description should be provided in `src/env.py`. 
Please ensure that any issues with loading URDF files in PyBullet are clearly handled and documented, providing clear error messages or logging for debugging.", 4 | "tags": [ 5 | "Reinforcement Learning" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The \"PyBullet\" simulator is used in `src/env.py`.", 12 | "category": "Dataset or Environment", 13 | "satisfied": null 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [], 18 | "criteria": "The \"PPO\" algorithm is used in `src/train.py`.", 19 | "category": "Machine Learning Method", 20 | "satisfied": null 21 | }, 22 | { 23 | "requirement_id": 2, 24 | "prerequisites": [ 25 | 0 26 | ], 27 | "criteria": "A detailed environment setup and reward structure description is provided in `src/env.py`.", 28 | "category": "Dataset or Environment", 29 | "satisfied": null 30 | }, 31 | { 32 | "requirement_id": 3, 33 | "prerequisites": [ 34 | 0, 35 | 1, 36 | 2 37 | ], 38 | "criteria": "The robot's final position is printed and saved as `data/final_position.txt`.", 39 | "category": "Other", 40 | "satisfied": null 41 | }, 42 | { 43 | "requirement_id": 4, 44 | "prerequisites": [ 45 | 0, 46 | 1, 47 | 2 48 | ], 49 | "criteria": "The training returns over time curve is recorded and saved as `results/figures/training_returns.png`.", 50 | "category": "Visualization", 51 | "satisfied": null 52 | }, 53 | { 54 | "requirement_id": 5, 55 | "prerequisites": [ 56 | 0, 57 | 1, 58 | 2 59 | ], 60 | "criteria": "A sample of the robot's motion is visualized and saved as `results/figures/robot_motion.gif`.", 61 | "category": "Visualization", 62 | "satisfied": null 63 | } 64 | ], 65 | "preferences": [ 66 | { 67 | "preference_id": 0, 68 | "criteria": "The system should effectively handle potential issues with loading URDF files in PyBullet, providing clear error messages or logging for debugging.", 69 | "satisfied": null 70 | } 71 | ], 72 | "is_kaggle_api_needed": false, 73 | "is_training_needed": true, 74 | "is_web_navigation_needed": false 75 | } -------------------------------------------------------------------------------- /benchmark/devai/instances/51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images", 3 | "query": "Hi! Please follow the instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) to set up the script mentioned for generating images with hidden text in `src/visualize.py`. Ensure the generated images are of 1080p resolution and saved in `results/figures/`. Create control images embedding the text \"FUTURE\" and save it in `results/figures/`. 
Please also manually verify that the hidden text is indeed embedded in the generated images.", 4 | "tags": [ 5 | "Computer Vision", 6 | "Generative Models", 7 | "Natural Language Processing" 8 | ], 9 | "requirements": [ 10 | { 11 | "requirement_id": 0, 12 | "prerequisites": [], 13 | "criteria": "The instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) are followed to set up the script mentioned for generating images with hidden text in `src/visualize.py`.", 14 | "category": "Dataset or Environment", 15 | "satisfied": null 16 | }, 17 | { 18 | "requirement_id": 1, 19 | "prerequisites": [ 20 | 0 21 | ], 22 | "criteria": "The generated images are ensured to be of \"1080p\" resolution and saved in `results/figures/`.", 23 | "category": "Data preprocessing and postprocessing", 24 | "satisfied": null 25 | }, 26 | { 27 | "requirement_id": 2, 28 | "prerequisites": [ 29 | 0, 30 | 1 31 | ], 32 | "criteria": "Control images embedding the text \"FUTURE,\" is created and saved in `results/figures/`.", 33 | "category": "Visualization", 34 | "satisfied": null 35 | } 36 | ], 37 | "preferences": [ 38 | { 39 | "preference_id": 0, 40 | "criteria": "The system should be capable of learning and using unfamiliar technologies, adapting to new tools or platforms as required.", 41 | "satisfied": null 42 | }, 43 | { 44 | "preference_id": 1, 45 | "criteria": "After reviewing the blog post, ControlNet should be successfully run on Modal to produce the images with the concealed messages for user.", 46 | "satisfied": null 47 | } 48 | ], 49 | "is_kaggle_api_needed": false, 50 | "is_training_needed": false, 51 | "is_web_navigation_needed": true, 52 | "resource": "https://www.cognition.ai/blog/introducing-devin" 53 | } -------------------------------------------------------------------------------- /benchmark/devai/instances/52_Devin_AI_Trains_an_AI.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "52_Devin_AI_Trains_an_AI", 3 | "query": "Can you finetune a 7B LLaMA model using `https://github.com/artidoro/qlora`? Follow the instructions in the repository to finetune the 7B LLaMA model and save it in models/saved_models/. Ensure the necessary environment and dependencies are set up as outlined in `src/env.py`. Download and prepare the datasets required for finetuning the model as specified in `src/data_loader.py`. Complete the finetuning process, ensuring all configurations are properly set in accordance with qlora. 
Save the finetuned model and training summary, storing them in the specified directory as results/metrics/finetuning_summary.txt.", 4 | "tags": [ 5 | "Generative Models", 6 | "Natural Language Processing" 7 | ], 8 | "requirements": [ 9 | { 10 | "requirement_id": 0, 11 | "prerequisites": [], 12 | "criteria": "The repository at `https://github.com/artidoro/qlora` has been downloaded.", 13 | "category": "Machine Learning Method", 14 | "satisfied": null 15 | }, 16 | { 17 | "requirement_id": 1, 18 | "prerequisites": [ 19 | 0 20 | ], 21 | "criteria": "The necessary environment and dependencies are set up.", 22 | "category": "Dataset or Environment", 23 | "satisfied": null 24 | }, 25 | { 26 | "requirement_id": 2, 27 | "prerequisites": [ 28 | 0, 29 | 1 30 | ], 31 | "criteria": "The finetuning process is completed, ensuring all configurations are properly set in accordance with \"qlora.\"", 32 | "category": "Machine Learning Method", 33 | "satisfied": null 34 | }, 35 | { 36 | "requirement_id": 3, 37 | "prerequisites": [ 38 | 0, 39 | 1, 40 | 2 41 | ], 42 | "criteria": "The finetuned model and training summary are saved in `models/saved_models/`, storing them in the specified directory as `results/metrics/finetuning_summary.txt`.", 43 | "category": "Save Trained Model", 44 | "satisfied": null 45 | } 46 | ], 47 | "preferences": [ 48 | { 49 | "preference_id": 0, 50 | "criteria": "The finetuning process should include validation steps to monitor overfitting or other issues.", 51 | "satisfied": null 52 | }, 53 | { 54 | "preference_id": 1, 55 | "criteria": "A detailed report on the finetuning process, including any challenges faced and how they were overcome, should be generated and saved as `results/finetuning_summary.txt`.", 56 | "satisfied": null 57 | } 58 | ], 59 | "is_kaggle_api_needed": false, 60 | "is_training_needed": true, 61 | "is_web_navigation_needed": true, 62 | "resource": "https://www.cognition.ai/blog/introducing-devin" 63 | } -------------------------------------------------------------------------------- /benchmark/devai/instances/53_Devin_Upwork_Side_Hustle.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "53_Devin_Upwork_Side_Hustle", 3 | "query": "Hello, I am looking to make inferences with the models in this repository `https://github.com/mahdi65/roadDamageDetection2020`. The system should perform inferences using the models from the repository and save the results in `models/saved_models/`. Sample data should be downloaded and prepared for testing the models in `src/data_loader.py`. Inference should be performed using the provided models on the sample data in `models/saved_models/`. Visualized images showing the detections made by the models should be generated and saved in the `results/figures/` directory. 
Also, a performance report based on the model's detection results should be generated and saved as `results/metrics/model_performance_report.txt`.", 4 | "tags": [ 5 | "Computer Vision" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The repository at `https://github.com/mahdi65/roadDamageDetection2020` is set up.", 12 | "category": "Machine Learning Method", 13 | "satisfied": null 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [ 18 | 0 19 | ], 20 | "criteria": "Sample data is downloaded and prepared for testing the models in `src/data_loader.py`.", 21 | "category": "Dataset or Environment", 22 | "satisfied": null 23 | }, 24 | { 25 | "requirement_id": 2, 26 | "prerequisites": [ 27 | 0, 28 | 1 29 | ], 30 | "criteria": "Inference is performed using the provided models on the sample data in `models/saved_models/`.", 31 | "category": "Other", 32 | "satisfied": null 33 | }, 34 | { 35 | "requirement_id": 3, 36 | "prerequisites": [ 37 | 0, 38 | 1, 39 | 2 40 | ], 41 | "criteria": "Visualized images showing the detections made by the models are generated and saved in the `results/figures/` directory.", 42 | "category": "Visualization", 43 | "satisfied": null 44 | }, 45 | { 46 | "requirement_id": 4, 47 | "prerequisites": [ 48 | 0, 49 | 1, 50 | 2, 51 | 3 52 | ], 53 | "criteria": "A performance report based on the model's detection results is generated and saved as `results/metrics/model_performance_report.txt`.", 54 | "category": "Performance Metrics", 55 | "satisfied": null 56 | } 57 | ], 58 | "preferences": [ 59 | { 60 | "preference_id": 0, 61 | "criteria": "The visualized images should be clear, with detections accurately highlighted for easy interpretation.", 62 | "satisfied": null 63 | }, 64 | { 65 | "preference_id": 1, 66 | "criteria": "The performance report should include a summary of detection accuracy and any issues encountered during inference.", 67 | "satisfied": null 68 | } 69 | ], 70 | "is_kaggle_api_needed": false, 71 | "is_training_needed": false, 72 | "is_web_navigation_needed": true, 73 | "resource": "https://www.cognition.ai/blog/introducing-devin" 74 | } -------------------------------------------------------------------------------- /benchmark/devai/trajectory-schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "array", 3 | "items": { 4 | "type": "object", 5 | "properties": { 6 | "step": { 7 | "type": "integer", 8 | "description": "The step number in the trajectory, 0-based." 9 | }, 10 | "user_message": { 11 | "type": ["string", "null"], 12 | "description": "The message from the external user to the agent. If null, no message was sent." 13 | }, 14 | "agent": { 15 | "type": "object", 16 | "properties": { 17 | "thought": { 18 | "type": "string", 19 | "description": "The agent's thought at this step." 20 | }, 21 | "action": { 22 | "type": ["string", "null"], 23 | "description": "The agent's action sent to the environment. If null, the agent did not take any action, for example, when the agent has finished the task." 24 | }, 25 | "agent_name": { 26 | "type": "string", 27 | "description": "The name of the agent that made the action." 28 | } 29 | }, 30 | "required": ["thought", "action"], 31 | "description": "Everything related to the agent at this step." 32 | }, 33 | "environment": { 34 | "type": ["string", "null"], 35 | "description": "The environment's (shell, python interpreter) response to the action submitted by the agent. 
If null, the environment was not involved in this step." 36 | }, 37 | "step_usage": { 38 | "type": "object", 39 | "properties": { 40 | "input_tokens": { 41 | "type": "integer", 42 | "description": "The number of input tokens passed as LLM context." 43 | }, 44 | "output_tokens": { 45 | "type": "integer", 46 | "description": "The number of tokens produced by the LLM." 47 | }, 48 | "model": { 49 | "type": "string", 50 | "description": "The name of the LLM model used." 51 | }, 52 | "cost": { 53 | "type": "number", 54 | "description": "The cost of the LLM inference, in USD." 55 | }, 56 | "llm_inference_time": { 57 | "type": "number", 58 | "description": "The time taken by the LLM to generate the output tokens, in seconds." 59 | }, 60 | "step_execution_time": { 61 | "type": "number", 62 | "description": "The time taken to make an entire step including LLM inference and environment execution, in seconds." 63 | } 64 | }, 65 | "required": [ 66 | "input_tokens", 67 | "output_tokens", 68 | "model", 69 | "cost", 70 | "llm_inference_time", 71 | "step_execution_time" 72 | ] 73 | }, 74 | "accumulated_usage": { 75 | "type": "object", 76 | "properties": { 77 | "accumulated_cost": { 78 | "type": "number", 79 | "description": "The total cost of the trajectory up to this step, in USD." 80 | }, 81 | "accumulated_time": { 82 | "type": "number", 83 | "description": "The total time taken by the agent to complete the trajectory up to this step, in seconds." 84 | } 85 | }, 86 | "required": [ 87 | "accumulated_cost", 88 | "accumulated_time" 89 | ] 90 | } 91 | }, 92 | "required": ["step", "user_message", "agent", "environment", "step_usage", "accumulated_usage"] 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /benchmark/devai/validate_trajectory.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import argparse 5 | import jsonschema 6 | import jsonschema.exceptions 7 | 8 | 9 | def validate_trajectory_data(json_data, json_schema) -> bool: 10 | try: 11 | jsonschema.validate(instance=json_data, schema=json_schema) 12 | print("JSON is valid") 13 | except jsonschema.exceptions.ValidationError as err: 14 | print("JSON is invalid") 15 | print(err) 16 | return False 17 | return True 18 | 19 | 20 | def main() -> bool: 21 | parser = argparse.ArgumentParser(description='Validate a trajectory JSON file against the trajectory schema.') 22 | 23 | parser.add_argument('trajectory_json', metavar='TRAJECTORY_JSON', type=str, 24 | help='Path to the trajectory JSON file') 25 | 26 | args = parser.parse_args() 27 | 28 | this_file_dir = os.path.dirname(os.path.realpath(__file__)) 29 | schema_path = os.path.join(this_file_dir, 'trajectory-schema.json') 30 | 31 | with open(schema_path, 'r') as schema_file: 32 | json_schema = json.load(schema_file) 33 | 34 | with open(args.trajectory_json, 'r') as json_file: 35 | json_data = json.load(json_file) 36 | 37 | return validate_trajectory_data(json_data, json_schema) 38 | 39 | 40 | if __name__ == '__main__': 41 | # Exit with status 0 if the trajectory is valid and 1 otherwise, so the result is usable in scripts and CI. 42 | sys.exit(0 if main() else 1) 43 | -------------------------------------------------------------------------------- /benchmark/judgment/GPT-Pilot/agent_as_a_judge/gray_box/02_Maze_Solver_Q_Learning_Gridworld_RL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "02_Maze_Solver_Q_Learning_Gridworld_RL", 3 | "query": "Can you help me create a system to solve maze-style Gridworld tasks using the Q-learning algorithm?
The system should use numpy to make the core calculations more efficient and matplotlib for visualizations. The Q-learning algorithm should be implemented in `src/train.py`, and the aptly-named Gridworld environment should be implemented in `src/env.py` in such a way that one could specific the grid size and start/end positions when instantiating it. The system needs to record the learning curve during training, tracking episodes and their corresponding returns, and save it as `results/figures/learning_curve.png`. Additionally, I'd like you to visualize and save the paths taken by the agent in each episode in a file called `results/figures/path_changes.gif`, and save the trained model as `models/saved_models/q_learning_model.npy`. It would be great to have some form of real-time feedback during training, like seeing the progress or getting updates on how the model is learning. Also, if you can, please try and write the code in a way that's easy to modify or extend later on.", 4 | "tags": [ 5 | "Reinforcement Learning" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The \"Q-learning\" algorithm is used in `src/train.py`.", 12 | "category": "Machine Learning Method", 13 | "satisfied": true 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [], 18 | "criteria": "The \"Gridworld\" environment is defined in `src/env.py` with the ability for a user to specify a grid size and start/end positions.", 19 | "category": "Dataset or Environment", 20 | "satisfied": true 21 | }, 22 | { 23 | "requirement_id": 2, 24 | "prerequisites": [ 25 | 0, 26 | 1 27 | ], 28 | "criteria": "Learning curves are recorded during training, and saved as `results/figures/learning_curve.png`. Episodes and returns are recorded.", 29 | "category": "Visualization", 30 | "satisfied": true 31 | }, 32 | { 33 | "requirement_id": 3, 34 | "prerequisites": [ 35 | 0, 36 | 1, 37 | 2 38 | ], 39 | "criteria": "The learned model is saved as `models/saved_models/q_learning_model.npy`.", 40 | "category": "Save Trained Model", 41 | "satisfied": true 42 | }, 43 | { 44 | "requirement_id": 4, 45 | "prerequisites": [ 46 | 0, 47 | 1 48 | ], 49 | "criteria": "Paths taken during learning are visualized and saved as `results/figures/path_changes.gif`.", 50 | "category": "Visualization", 51 | "satisfied": true 52 | } 53 | ], 54 | "preferences": [ 55 | { 56 | "preference_id": 0, 57 | "criteria": "Some real-time progress or feedback during the training process should be displayed.", 58 | "satisfied": null 59 | }, 60 | { 61 | "preference_id": 1, 62 | "criteria": "The code should be written in a way that's easy to modify or extend later on.", 63 | "satisfied": null 64 | } 65 | ], 66 | "is_kaggle_api_needed": false, 67 | "is_training_needed": true, 68 | "is_web_navigation_needed": false 69 | } -------------------------------------------------------------------------------- /benchmark/judgment/GPT-Pilot/agent_as_a_judge/gray_box/03_Text_Classification_NaiveBayes_20Newsgroups_ML.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "03_Text_Classification_NaiveBayes_20Newsgroups_ML", 3 | "query": "Please implement a Naive Bayes classifier for the 20 Newsgroups dataset and save it in a file called `src/model.py`. The dataset should loaded in `src/data_loader.py`. The program should handle data preprocessing, including removing stop words, punctuation, and special characters. 
Show the improvement of your classifier by generating word clouds before and after training your classifier and saving them as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`. Please calculate and include TF-IDF features when loading the data in `src/data_loader.py`. Lastly, print out a performance report (including precision, recall, and F1-score) and save it as `results/metrics/performance.txt`. The model should be straightforward to interpret, and the final report should be structured clearly for easy review.", 4 | "tags": [ 5 | "Classification", 6 | "Natural Language Processing", 7 | "Supervised Learning" 8 | ], 9 | "requirements": [ 10 | { 11 | "requirement_id": 0, 12 | "prerequisites": [], 13 | "criteria": "The \"20 Newsgroups\" dataset is used in `src/data_loader.py`.", 14 | "category": "Dataset or Environment", 15 | "satisfied": true 16 | }, 17 | { 18 | "requirement_id": 1, 19 | "prerequisites": [ 20 | 0 21 | ], 22 | "criteria": "Data preprocessing is performed, including removing stop words, punctuation, and special characters. Word clouds are visualized before and after training the classifier, and saved as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`.", 23 | "category": "Data preprocessing and postprocessing", 24 | "satisfied": false 25 | }, 26 | { 27 | "requirement_id": 2, 28 | "prerequisites": [ 29 | 0, 30 | 1 31 | ], 32 | "criteria": "\"TF-IDF\" features are used when loading the data in `src/data_loader.py`.", 33 | "category": "Data preprocessing and postprocessing", 34 | "satisfied": true 35 | }, 36 | { 37 | "requirement_id": 3, 38 | "prerequisites": [], 39 | "criteria": "A \"Naive Bayes classifier\" is implemented in `src/model.py`.", 40 | "category": "Machine Learning Method", 41 | "satisfied": true 42 | }, 43 | { 44 | "requirement_id": 4, 45 | "prerequisites": [ 46 | 0, 47 | 1, 48 | 2 49 | ], 50 | "criteria": "A performance report, including \"precision,\" \"recall,\" and the \"F1-score,\" is printed and saved as `results/metrics/performance.txt`.", 51 | "category": "Performance Metrics", 52 | "satisfied": true 53 | } 54 | ], 55 | "preferences": [ 56 | { 57 | "preference_id": 0, 58 | "criteria": "The model should be straightforward to interpret.", 59 | "satisfied": null 60 | }, 61 | { 62 | "preference_id": 1, 63 | "criteria": "The final report should be structured clearly for easy review.", 64 | "satisfied": null 65 | } 66 | ], 67 | "is_kaggle_api_needed": false, 68 | "is_training_needed": true, 69 | "is_web_navigation_needed": false 70 | } -------------------------------------------------------------------------------- /benchmark/judgment/GPT-Pilot/agent_as_a_judge/gray_box/05_Game_Simulation_DQN_CartPole_v1_RL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "05_Game_Simulation_DQN_CartPole_v1_RL", 3 | "query": "Create a reliable system to train a DQN agent using PyTorch in an OpenAI Gym environment such as CartPole-v1. Implement DQN using PyTorch in `src/model.py`. The environment should be instantiated in the main file, `src/main.py`. Handle any dimension mismatches between DQN and the environment (like would happen if the Gym environment ID were changed) by logging an error. Also please log the return after each episode to `results/metrics/` and update it after each new episode finishes. Save the final return over episode curve as `results/figures/return_over_time.png` and the final trained model as `models/saved_models/dqn_model.pt`. 
If possible, add annotations to the return over episode curve to showcase key milestones.", 4 | "tags": [ 5 | "Reinforcement Learning" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The \"DQN\" algorithm is implemented using PyTorch and saved in `src/model.py`.", 12 | "category": "Machine Learning Method", 13 | "satisfied": true 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [], 18 | "criteria": "An \"OpenAI Gym\" environment is instantiated in `src/main.py`.", 19 | "category": "Dataset or Environment", 20 | "satisfied": true 21 | }, 22 | { 23 | "requirement_id": 2, 24 | "prerequisites": [ 25 | 0, 26 | 1 27 | ], 28 | "criteria": "The return over time curve is plotted, recording the return of each episode, and saved as `results/figures/reward_over_time.png`.", 29 | "category": "Visualization", 30 | "satisfied": false 31 | }, 32 | { 33 | "requirement_id": 3, 34 | "prerequisites": [ 35 | 0, 36 | 1 37 | ], 38 | "criteria": "The trained model is saved as `models/saved_models/dqn_model.pt`.", 39 | "category": "Save Trained Model", 40 | "satisfied": true 41 | }, 42 | { 43 | "requirement_id": 4, 44 | "prerequisites": [ 45 | 0, 46 | 1 47 | ], 48 | "criteria": "The return for each episode is logged to `results/metrics/` and updated after each episode finishes.", 49 | "category": "Performance Metrics", 50 | "satisfied": true 51 | } 52 | ], 53 | "preferences": [ 54 | { 55 | "preference_id": 0, 56 | "criteria": "The system should handle dimension mismatches, logging the issues for easy debugging.", 57 | "satisfied": null 58 | }, 59 | { 60 | "preference_id": 1, 61 | "criteria": "The return over episode curve has key milestones annotated.", 62 | "satisfied": null 63 | } 64 | ], 65 | "is_kaggle_api_needed": false, 66 | "is_training_needed": true, 67 | "is_web_navigation_needed": false 68 | } -------------------------------------------------------------------------------- /benchmark/judgment/GPT-Pilot/agent_as_a_judge/gray_box/06_Sentiment_Analysis_SVM_Sentiment140_ML.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "06_Sentiment_Analysis_SVM_Sentiment140_ML", 3 | "query": "Please help me build a system for sentiment analysis on tweets using the Sentiment140 dataset available from Hugging Face. Load the Sentiment140 dataset and, when loading the data, clean it by removing stop words, punctuation, and special characters, all in `src/data_loader.py`. Use Word2Vec or GloVe for text vectorization. This should occur in `src/data_loader.py`. Next, implement and train an SVM classifier in `src/model.py`. Finally, write a report of the accuracy of the classifier to `results/metrics/accuracy_score.txt`. 
Ideally, the report should be easily interpretable.", 4 | "tags": [ 5 | "Natural Language Processing", 6 | "Supervised Learning" 7 | ], 8 | "requirements": [ 9 | { 10 | "requirement_id": 0, 11 | "prerequisites": [], 12 | "criteria": "The \"Sentiment140\" dataset, available from \"Hugging Face,\" is obtained in `src/data_loader.py`.", 13 | "category": "Dataset or Environment", 14 | "satisfied": true 15 | }, 16 | { 17 | "requirement_id": 1, 18 | "prerequisites": [ 19 | 0 20 | ], 21 | "criteria": "The dataset is cleaned, including by removing stop words, punctuation, and special characters, all in `src/data_loader.py`.", 22 | "category": "Data preprocessing and postprocessing", 23 | "satisfied": false 24 | }, 25 | { 26 | "requirement_id": 2, 27 | "prerequisites": [ 28 | 0, 29 | 1 30 | ], 31 | "criteria": "Word embeddings, either \"Word2Vec\" or \"GloVe,\" are used to convert text to vectors in `src/data_loader.py`.", 32 | "category": "Machine Learning Method", 33 | "satisfied": true 34 | }, 35 | { 36 | "requirement_id": 3, 37 | "prerequisites": [ 38 | 0, 39 | 1, 40 | 2 41 | ], 42 | "criteria": "An \"SVM classifier\" is implemented and trained in `src/model.py`.", 43 | "category": "Machine Learning Method", 44 | "satisfied": true 45 | }, 46 | { 47 | "requirement_id": 4, 48 | "prerequisites": [ 49 | 1, 50 | 2, 51 | 3 52 | ], 53 | "criteria": "The accuracy score is printed and saved as `results/metrics/accuracy_score.txt`.", 54 | "category": "Performance Metrics", 55 | "satisfied": false 56 | } 57 | ], 58 | "preferences": [ 59 | { 60 | "preference_id": 0, 61 | "criteria": "The dataset download process should be reliable, with clear error handling.", 62 | "satisfied": null 63 | }, 64 | { 65 | "preference_id": 1, 66 | "criteria": "The final accuracy report should be straightforward and easy to interpret.", 67 | "satisfied": null 68 | } 69 | ], 70 | "is_kaggle_api_needed": false, 71 | "is_training_needed": true, 72 | "is_web_navigation_needed": false 73 | } -------------------------------------------------------------------------------- /benchmark/judgment/GPT-Pilot/agent_as_a_judge/gray_box/07_Image_Super_Resolution_SRCNN_Set5_DL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "07_Image_Super_Resolution_SRCNN_Set5_DL", 3 | "query": "Hi, I need to create a project for image super-resolution using the SRCNN model with the Set5 dataset (available from `https://huggingface.co/datasets/eugenesiow/Set5`). Load the dataset in `src/data_loader.py`. When loading the data, include image preprocessing steps such as resizing and normalization, all in `src/data_loader.py`. The SRCNN model should be loaded and used in `src/model.py`. Save 5 sets of comparison images, zooming in on details, as `results/figures/super_resolution_compare.png`, and the super-resolution results as `results/figures/super_resolution_results.png`. 
The generated images should be high-quality and clearly show improvements.", 4 | "tags": [ 5 | "Computer Vision", 6 | "Generative Models" 7 | ], 8 | "requirements": [ 9 | { 10 | "requirement_id": 0, 11 | "prerequisites": [], 12 | "criteria": "The \"Set5\" dataset (available from \"Hugging Face\") is loaded in `src/data_loader.py`.", 13 | "category": "Dataset or Environment", 14 | "satisfied": false 15 | }, 16 | { 17 | "requirement_id": 1, 18 | "prerequisites": [ 19 | 0 20 | ], 21 | "criteria": "Image preprocessing, including resizing and normalization, is performed in `src/data_loader.py`.", 22 | "category": "Data preprocessing and postprocessing", 23 | "satisfied": true 24 | }, 25 | { 26 | "requirement_id": 2, 27 | "prerequisites": [], 28 | "criteria": "The \"SRCNN\" model is used in `src/model.py`.", 29 | "category": "Machine Learning Method", 30 | "satisfied": true 31 | }, 32 | { 33 | "requirement_id": 3, 34 | "prerequisites": [ 35 | 0, 36 | 1, 37 | 2 38 | ], 39 | "criteria": "Five sets of comparison images are saved, with details zoomed in, and saved as `results/figures/super_resolution_compare.png`.", 40 | "category": "Visualization", 41 | "satisfied": false 42 | }, 43 | { 44 | "requirement_id": 4, 45 | "prerequisites": [ 46 | 0, 47 | 1, 48 | 2 49 | ], 50 | "criteria": "Super-resolution results are saved as `results/figures/super_resolution_results.png`.", 51 | "category": "Visualization", 52 | "satisfied": false 53 | } 54 | ], 55 | "preferences": [ 56 | { 57 | "preference_id": 0, 58 | "criteria": "The project should generate high-quality, clear super-resolution images with detailed comparisons.", 59 | "satisfied": null 60 | }, 61 | { 62 | "preference_id": 1, 63 | "criteria": "Well-organized output images, highlighting key improvements, should be included.", 64 | "satisfied": null 65 | } 66 | ], 67 | "is_kaggle_api_needed": false, 68 | "is_training_needed": true, 69 | "is_web_navigation_needed": true 70 | } -------------------------------------------------------------------------------- /benchmark/judgment/GPT-Pilot/agent_as_a_judge/gray_box/08_Robot_Control_PPO_PyBullet_RL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "08_Robot_Control_PPO_PyBullet_RL", 3 | "query": "I am seeking to implement a project which explores robotic arm control via reinforcement learning in the PyBullet simulation environment with the PPO algorithm. The PyBullet simulator should be imported and a related robotics environment should be loaded in `src/env.py`. The PPO algorithm should be implemented in `src/train.py`. The project should meticulously document the robot's final position, printing and saving it as `data/final_position.txt`. The training return trajectory should be graphed and saved as `results/figures/training_returns.png`. A sample of the robot's motion should be visualized and saved as `results/figures/robot_motion.gif`. A detailed environment setup and reward structure description should be provided in `src/env.py`. 
Please ensure that any issues with loading URDF files in PyBullet are clearly handled and documented, providing clear error messages or logging for debugging.", 4 | "tags": [ 5 | "Reinforcement Learning" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The \"PyBullet\" simulator is used in `src/env.py`.", 12 | "category": "Dataset or Environment", 13 | "satisfied": true 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [], 18 | "criteria": "The \"PPO\" algorithm is used in `src/train.py`.", 19 | "category": "Machine Learning Method", 20 | "satisfied": true 21 | }, 22 | { 23 | "requirement_id": 2, 24 | "prerequisites": [ 25 | 0 26 | ], 27 | "criteria": "A detailed environment setup and reward structure description is provided in `src/env.py`.", 28 | "category": "Dataset or Environment", 29 | "satisfied": false 30 | }, 31 | { 32 | "requirement_id": 3, 33 | "prerequisites": [ 34 | 0, 35 | 1, 36 | 2 37 | ], 38 | "criteria": "The robot's final position is printed and saved as `data/final_position.txt`.", 39 | "category": "Other", 40 | "satisfied": false 41 | }, 42 | { 43 | "requirement_id": 4, 44 | "prerequisites": [ 45 | 0, 46 | 1, 47 | 2 48 | ], 49 | "criteria": "The training returns over time curve is recorded and saved as `results/figures/training_returns.png`.", 50 | "category": "Visualization", 51 | "satisfied": false 52 | }, 53 | { 54 | "requirement_id": 5, 55 | "prerequisites": [ 56 | 0, 57 | 1, 58 | 2 59 | ], 60 | "criteria": "A sample of the robot's motion is visualized and saved as `results/figures/robot_motion.gif`.", 61 | "category": "Visualization", 62 | "satisfied": false 63 | } 64 | ], 65 | "preferences": [ 66 | { 67 | "preference_id": 0, 68 | "criteria": "The system should effectively handle potential issues with loading URDF files in PyBullet, providing clear error messages or logging for debugging.", 69 | "satisfied": null 70 | } 71 | ], 72 | "is_kaggle_api_needed": false, 73 | "is_training_needed": true, 74 | "is_web_navigation_needed": false 75 | } -------------------------------------------------------------------------------- /benchmark/judgment/GPT-Pilot/agent_as_a_judge/gray_box/51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images", 3 | "query": "Hi! Please follow the instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) to set up the script mentioned for generating images with hidden text in `src/visualize.py`. Ensure the generated images are of 1080p resolution and saved in `results/figures/`. Create control images embedding the text \"FUTURE\" and save it in `results/figures/`. 
Please also manually verify that the hidden text is indeed embedded in the generated images.", 4 | "tags": [ 5 | "Computer Vision", 6 | "Generative Models", 7 | "Natural Language Processing" 8 | ], 9 | "requirements": [ 10 | { 11 | "requirement_id": 0, 12 | "prerequisites": [], 13 | "criteria": "The instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) are followed to set up the script mentioned for generating images with hidden text in `src/visualize.py`.", 14 | "category": "Dataset or Environment", 15 | "satisfied": true 16 | }, 17 | { 18 | "requirement_id": 1, 19 | "prerequisites": [ 20 | 0 21 | ], 22 | "criteria": "The generated images are ensured to be of 1080p resolution and saved in results/figures/.", 23 | "category": "Data preprocessing and postprocessing", 24 | "satisfied": false 25 | }, 26 | { 27 | "requirement_id": 2, 28 | "prerequisites": [ 29 | 0, 30 | 1 31 | ], 32 | "criteria": "Control images embedding the text \"FUTURE,\" is created and saved in results/figures/.", 33 | "category": "Visualization", 34 | "satisfied": true 35 | } 36 | ], 37 | "preferences": [ 38 | { 39 | "preference_id": 0, 40 | "criteria": "The system should be capable of learning and using unfamiliar technologies, adapting to new tools or platforms as required.", 41 | "satisfied": null 42 | }, 43 | { 44 | "preference_id": 1, 45 | "criteria": "After reviewing the blog post, ControlNet should be successfully run on Modal to produce the images with the concealed messages for user.", 46 | "satisfied": null 47 | } 48 | ], 49 | "is_kaggle_api_needed": false, 50 | "is_training_needed": false, 51 | "is_web_navigation_needed": true, 52 | "resource": "https://www.cognition.ai/blog/introducing-devin" 53 | } -------------------------------------------------------------------------------- /benchmark/judgment/GPT-Pilot/agent_as_a_judge/gray_box/52_Devin_AI_Trains_an_AI.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "52_Devin_AI_Trains_an_AI", 3 | "query": "Can you finetune a 7B LLaMA model using `https://github.com/artidoro/qlora`? Follow the instructions in the repository to finetune the 7B LLaMA model and save it in models/saved_models/. Ensure the necessary environment and dependencies are set up as outlined in `src/env.py`. Download and prepare the datasets required for finetuning the model as specified in `src/data_loader.py`. Complete the finetuning process, ensuring all configurations are properly set in accordance with qlora. 
Save the finetuned model and training summary, storing them in the specified directory as results/metrics/finetuning_summary.txt.", 4 | "tags": [ 5 | "Generative Models", 6 | "Natural Language Processing" 7 | ], 8 | "requirements": [ 9 | { 10 | "requirement_id": 0, 11 | "prerequisites": [], 12 | "criteria": "The instructions in the repository at `https://github.com/artidoro/qlora` are followed to finetune a \"7B LLaMA\" model and save it in models/saved_models/.", 13 | "category": "Machine Learning Method", 14 | "satisfied": false 15 | }, 16 | { 17 | "requirement_id": 1, 18 | "prerequisites": [ 19 | 0 20 | ], 21 | "criteria": "The necessary environment and dependencies are set up as outlined in the repository `src/env.py`.", 22 | "category": "Dataset or Environment", 23 | "satisfied": true 24 | }, 25 | { 26 | "requirement_id": 2, 27 | "prerequisites": [ 28 | 0, 29 | 1 30 | ], 31 | "criteria": "The datasets required for finetuning the model are downloaded and prepared in `src/data_loader.py`.", 32 | "category": "Dataset or Environment", 33 | "satisfied": true 34 | }, 35 | { 36 | "requirement_id": 3, 37 | "prerequisites": [ 38 | 0, 39 | 1, 40 | 2 41 | ], 42 | "criteria": "The finetuning process is completed, ensuring all configurations are properly set in accordance with \"qlora.\"", 43 | "category": "Machine Learning Method", 44 | "satisfied": false 45 | }, 46 | { 47 | "requirement_id": 4, 48 | "prerequisites": [ 49 | 0, 50 | 1, 51 | 2, 52 | 3 53 | ], 54 | "criteria": "The finetuned model and training summary are saved, storing them in the specified directory as `results/metrics/finetuning_summary.txt`.", 55 | "category": "Save Trained Model", 56 | "satisfied": false 57 | } 58 | ], 59 | "preferences": [ 60 | { 61 | "preference_id": 0, 62 | "criteria": "The finetuning process should include validation steps to monitor overfitting or other issues.", 63 | "satisfied": null 64 | }, 65 | { 66 | "preference_id": 1, 67 | "criteria": "A detailed report on the finetuning process, including any challenges faced and how they were overcome, should be generated and saved as `results/finetuning_summary.txt`.", 68 | "satisfied": null 69 | } 70 | ], 71 | "is_kaggle_api_needed": false, 72 | "is_training_needed": true, 73 | "is_web_navigation_needed": true, 74 | "resource": "https://www.cognition.ai/blog/introducing-devin" 75 | } -------------------------------------------------------------------------------- /benchmark/judgment/GPT-Pilot/agent_as_a_judge/gray_box/53_Devin_Upwork_Side_Hustle.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "53_Devin_Upwork_Side_Hustle", 3 | "query": "Hello, I am looking to make inferences with the models in this repository `https://github.com/mahdi65/roadDamageDetection2020`. The system should perform inferences using the models from the repository and save the results in `models/saved_models/`. Sample data should be downloaded and prepared for testing the models in `src/data_loader.py`. Inference should be performed using the provided models on the sample data in `models/saved_models/`. Visualized images showing the detections made by the models should be generated and saved in the `results/figures/` directory. 
Also, a performance report based on the model's detection results should be generated and saved as `results/metrics/model_performance_report.txt`.", 4 | "tags": [ 5 | "Computer Vision" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The repository at `https://github.com/mahdi65/roadDamageDetection2020` is set up.", 12 | "category": "Machine Learning Method", 13 | "satisfied": false 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [ 18 | 0 19 | ], 20 | "criteria": "Sample data is downloaded and prepared for testing the models in `src/data_loader.py`.", 21 | "category": "Dataset or Environment", 22 | "satisfied": false 23 | }, 24 | { 25 | "requirement_id": 2, 26 | "prerequisites": [ 27 | 0, 28 | 1 29 | ], 30 | "criteria": "Inference is performed using the provided models on the sample data in `models/saved_models/`.", 31 | "category": "Other", 32 | "satisfied": false 33 | }, 34 | { 35 | "requirement_id": 3, 36 | "prerequisites": [ 37 | 0, 38 | 1, 39 | 2 40 | ], 41 | "criteria": "Visualized images showing the detections made by the models are generated and saved in the `results/figures/` directory.", 42 | "category": "Visualization", 43 | "satisfied": false 44 | }, 45 | { 46 | "requirement_id": 4, 47 | "prerequisites": [ 48 | 0, 49 | 1, 50 | 2, 51 | 3 52 | ], 53 | "criteria": "A performance report based on the model's detection results is generated and saved as `results/metrics/model_performance_report.txt`.", 54 | "category": "Performance Metrics", 55 | "satisfied": false 56 | } 57 | ], 58 | "preferences": [ 59 | { 60 | "preference_id": 0, 61 | "criteria": "The visualized images should be clear, with detections accurately highlighted for easy interpretation.", 62 | "satisfied": null 63 | }, 64 | { 65 | "preference_id": 1, 66 | "criteria": "The performance report should include a summary of detection accuracy and any issues encountered during inference.", 67 | "satisfied": null 68 | } 69 | ], 70 | "is_kaggle_api_needed": false, 71 | "is_training_needed": false, 72 | "is_web_navigation_needed": true, 73 | "resource": "https://www.cognition.ai/blog/introducing-devin" 74 | } -------------------------------------------------------------------------------- /benchmark/judgment/GPT-Pilot/human_as_a_judge/03_Text_Classification_NaiveBayes_20Newsgroups_ML.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "03_Text_Classification_NaiveBayes_20Newsgroups_ML", 3 | "query": "Please implement a Naive Bayes classifier for the 20 Newsgroups dataset and save it in a file called `src/model.py`. The dataset should loaded in `src/data_loader.py`. The program should handle data preprocessing, including removing stop words, punctuation, and special characters. Show the improvement of your classifier by generating word clouds before and after training your classifier and saving them as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`. Please calculate and include TF-IDF features when loading the data in `src/data_loader.py`. Lastly, print out a performance report (including precision, recall, and F1-score) and save it as `results/metrics/performance.txt`. 
The model should be straightforward to interpret, and the final report should be structured clearly for easy review.", 4 | "tags": [ 5 | "Classification", 6 | "Natural Language Processing", 7 | "Supervised Learning" 8 | ], 9 | "requirements": [ 10 | { 11 | "requirement_id": 0, 12 | "prerequisites": [], 13 | "criteria": "The \"20 Newsgroups\" dataset is used in `src/data_loader.py`.", 14 | "category": "Dataset or Environment", 15 | "satisfied": true 16 | }, 17 | { 18 | "requirement_id": 1, 19 | "prerequisites": [ 20 | 0 21 | ], 22 | "criteria": "Data preprocessing is performed, including removing stop words, punctuation, and special characters. Word clouds are visualized before and after training the classifier, and saved as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`.", 23 | "category": "Data preprocessing and postprocessing", 24 | "satisfied": true 25 | }, 26 | { 27 | "requirement_id": 2, 28 | "prerequisites": [ 29 | 0, 30 | 1 31 | ], 32 | "criteria": "\"TF-IDF\" features are used when loading the data in `src/data_loader.py`.", 33 | "category": "Data preprocessing and postprocessing", 34 | "satisfied": true 35 | }, 36 | { 37 | "requirement_id": 3, 38 | "prerequisites": [], 39 | "criteria": "A \"Naive Bayes classifier\" is implemented in `src/model.py`.", 40 | "category": "Machine Learning Method", 41 | "satisfied": true 42 | }, 43 | { 44 | "requirement_id": 4, 45 | "prerequisites": [ 46 | 0, 47 | 1, 48 | 2 49 | ], 50 | "criteria": "A performance report, including \"precision,\" \"recall,\" and the \"F1-score,\" is printed and saved as `results/metrics/performance.txt`.", 51 | "category": "Performance Metrics", 52 | "satisfied": true 53 | } 54 | ], 55 | "preferences": [ 56 | { 57 | "preference_id": 0, 58 | "criteria": "The model should be straightforward to interpret.", 59 | "satisfied": null 60 | }, 61 | { 62 | "preference_id": 1, 63 | "criteria": "The final report should be structured clearly for easy review.", 64 | "satisfied": null 65 | } 66 | ], 67 | "is_kaggle_api_needed": false, 68 | "is_training_needed": true, 69 | "is_web_navigation_needed": false, 70 | "executed_successfully": false, 71 | "satisfied_all_requirements": true, 72 | "satisfied_all_preferences": false 73 | } 74 | -------------------------------------------------------------------------------- /benchmark/judgment/GPT-Pilot/human_as_a_judge/05_Game_Simulation_DQN_CartPole_v1_RL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "05_Game_Simulation_DQN_CartPole_v1_RL", 3 | "query": "Create a reliable system to train a DQN agent using PyTorch in an OpenAI Gym environment such as CartPole-v1. Implement DQN using PyTorch in `src/model.py`. The environment should be instantiated in the main file, `src/main.py`. Handle any dimension mismatches between DQN and the environment (like would happen if the Gym environment ID were changed) by logging an error. Also please log the return after each episode to `results/metrics/` and update it after each new episode finishes. Save the final return over episode curve as `results/figures/return_over_time.png` and the final trained model as `models/saved_models/dqn_model.pt`. 
If possible, add annotations to the return over episode curve to showcase key milestones.", 4 | "tags": [ 5 | "Reinforcement Learning" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The \"DQN\" algorithm is implemented using PyTorch and saved in `src/model.py`.", 12 | "category": "Machine Learning Method", 13 | "satisfied": true 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [], 18 | "criteria": "An \"OpenAI Gym\" environment is instantiated in `src/main.py`.", 19 | "category": "Dataset or Environment", 20 | "satisfied": true 21 | }, 22 | { 23 | "requirement_id": 2, 24 | "prerequisites": [ 25 | 0, 26 | 1 27 | ], 28 | "criteria": "The return over time curve is plotted, recording the return of each episode, and saved as `results/figures/return_over_time.png`.", 29 | "category": "Visualization", 30 | "satisfied": true 31 | }, 32 | { 33 | "requirement_id": 3, 34 | "prerequisites": [ 35 | 0, 36 | 1 37 | ], 38 | "criteria": "The trained model is saved as `models/saved_models/dqn_model.pt`.", 39 | "category": "Save Trained Model", 40 | "satisfied": true 41 | }, 42 | { 43 | "requirement_id": 4, 44 | "prerequisites": [ 45 | 0, 46 | 1 47 | ], 48 | "criteria": "The return for each episode is logged to `results/metrics/` and updated after each episode finishes.", 49 | "category": "Performance Metrics", 50 | "satisfied": true 51 | } 52 | ], 53 | "preferences": [ 54 | { 55 | "preference_id": 0, 56 | "criteria": "The system should handle dimension mismatches, logging the issues for easy debugging.", 57 | "satisfied": null 58 | }, 59 | { 60 | "preference_id": 1, 61 | "criteria": "The return over episode curve has key milestones annotated.", 62 | "satisfied": null 63 | } 64 | ], 65 | "is_kaggle_api_needed": false, 66 | "is_training_needed": true, 67 | "is_web_navigation_needed": false, 68 | "executed_successfully": false, 69 | "satisfied_all_requirements": true, 70 | "satisfied_all_preferences": false 71 | } 72 | -------------------------------------------------------------------------------- /benchmark/judgment/GPT-Pilot/human_as_a_judge/06_Sentiment_Analysis_SVM_Sentiment140_ML.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "06_Sentiment_Analysis_SVM_Sentiment140_ML", 3 | "query": "Please help me build a system for sentiment analysis on tweets using the Sentiment140 dataset available from Hugging Face. Load the Sentiment140 dataset and, when loading the data, clean it by removing stop words, punctuation, and special characters, all in `src/data_loader.py`. Use Word2Vec or GloVe for text vectorization. This should occur in `src/data_loader.py`. Next, implement and train an SVM classifier in `src/model.py`. Finally, write a report of the accuracy of the classifier to `results/metrics/accuracy_score.txt`. 
Ideally, the report should be easily interpretable.", 4 | "tags": [ 5 | "Natural Language Processing", 6 | "Supervised Learning" 7 | ], 8 | "requirements": [ 9 | { 10 | "requirement_id": 0, 11 | "prerequisites": [], 12 | "criteria": "The \"Sentiment140\" dataset, available from \"Hugging Face,\" is obtained in `src/data_loader.py`.", 13 | "category": "Dataset or Environment", 14 | "satisfied": true 15 | }, 16 | { 17 | "requirement_id": 1, 18 | "prerequisites": [ 19 | 0 20 | ], 21 | "criteria": "The dataset is cleaned, including by removing stop words, punctuation, and special characters, all in `src/data_loader.py`.", 22 | "category": "Data preprocessing and postprocessing", 23 | "satisfied": true 24 | }, 25 | { 26 | "requirement_id": 2, 27 | "prerequisites": [ 28 | 0, 29 | 1 30 | ], 31 | "criteria": "Word embeddings, either \"Word2Vec\" or \"GloVe,\" are used to convert text to vectors in `src/data_loader.py`.", 32 | "category": "Machine Learning Method", 33 | "satisfied": true 34 | }, 35 | { 36 | "requirement_id": 3, 37 | "prerequisites": [ 38 | 0, 39 | 1, 40 | 2 41 | ], 42 | "criteria": "An \"SVM classifier\" is implemented and trained in `src/model.py`.", 43 | "category": "Machine Learning Method", 44 | "satisfied": false 45 | }, 46 | { 47 | "requirement_id": 4, 48 | "prerequisites": [ 49 | 1, 50 | 2, 51 | 3 52 | ], 53 | "criteria": "The accuracy score is printed and saved as `results/metrics/accuracy_score.txt`.", 54 | "category": "Performance Metrics", 55 | "satisfied": false 56 | } 57 | ], 58 | "preferences": [ 59 | { 60 | "preference_id": 0, 61 | "criteria": "The dataset download process should be reliable, with clear error handling.", 62 | "satisfied": null 63 | }, 64 | { 65 | "preference_id": 1, 66 | "criteria": "The final accuracy report should be straightforward and easy to interpret.", 67 | "satisfied": null 68 | } 69 | ], 70 | "is_kaggle_api_needed": false, 71 | "is_training_needed": true, 72 | "is_web_navigation_needed": false, 73 | "executed_successfully": false, 74 | "satisfied_all_requirements": false, 75 | "satisfied_all_preferences": false 76 | } 77 | -------------------------------------------------------------------------------- /benchmark/judgment/GPT-Pilot/human_as_a_judge/07_Image_Super_Resolution_SRCNN_Set5_DL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "07_Image_Super_Resolution_SRCNN_Set5_DL", 3 | "query": "Hi, I need to create a project for image super-resolution using the SRCNN model with the Set5 dataset (available from `https://huggingface.co/datasets/eugenesiow/Set5`). Load the dataset in `src/data_loader.py`. When loading the data, include image preprocessing steps such as resizing and normalization, all in `src/data_loader.py`. The SRCNN model should be loaded and used in `src/model.py`. Save 5 sets of comparison images, zooming in on details, as `results/figures/super_resolution_compare.png`, and the super-resolution results as `results/figures/super_resolution_results.png`. 
The generated images should be high-quality and clearly show improvements.", 4 | "tags": [ 5 | "Computer Vision", 6 | "Generative Models" 7 | ], 8 | "requirements": [ 9 | { 10 | "requirement_id": 0, 11 | "prerequisites": [], 12 | "criteria": "The \"Set5\" dataset (available from \"Hugging Face\") is loaded in `src/data_loader.py`.", 13 | "category": "Dataset or Environment", 14 | "satisfied": false 15 | }, 16 | { 17 | "requirement_id": 1, 18 | "prerequisites": [ 19 | 0 20 | ], 21 | "criteria": "Image preprocessing, including resizing and normalization, is performed in `src/data_loader.py`.", 22 | "category": "Data preprocessing and postprocessing", 23 | "satisfied": true 24 | }, 25 | { 26 | "requirement_id": 2, 27 | "prerequisites": [], 28 | "criteria": "The \"SRCNN\" model is used in `src/model.py`.", 29 | "category": "Machine Learning Method", 30 | "satisfied": false 31 | }, 32 | { 33 | "requirement_id": 3, 34 | "prerequisites": [ 35 | 0, 36 | 1, 37 | 2 38 | ], 39 | "criteria": "Five sets of comparison images, with details zoomed in, are saved as `results/figures/super_resolution_compare.png`.", 40 | "category": "Visualization", 41 | "satisfied": false 42 | }, 43 | { 44 | "requirement_id": 4, 45 | "prerequisites": [ 46 | 0, 47 | 1, 48 | 2 49 | ], 50 | "criteria": "Super-resolution results are saved as `results/figures/super_resolution_results.png`.", 51 | "category": "Visualization", 52 | "satisfied": false 53 | } 54 | ], 55 | "preferences": [ 56 | { 57 | "preference_id": 0, 58 | "criteria": "The project should generate high-quality, clear super-resolution images with detailed comparisons.", 59 | "satisfied": null 60 | }, 61 | { 62 | "preference_id": 1, 63 | "criteria": "Well-organized output images, highlighting key improvements, should be included.", 64 | "satisfied": null 65 | } 66 | ], 67 | "is_kaggle_api_needed": false, 68 | "is_training_needed": true, 69 | "is_web_navigation_needed": true, 70 | "executed_successfully": false, 71 | "satisfied_all_requirements": false, 72 | "satisfied_all_preferences": false 73 | } 74 | -------------------------------------------------------------------------------- /benchmark/judgment/GPT-Pilot/human_as_a_judge/51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images", 3 | "query": "Hi! Please follow the instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) to set up the script mentioned for generating images with hidden text in `src/visualize.py`. Ensure the generated images are of 1080p resolution and saved in `results/figures/`. Create control images embedding the text \"FUTURE\" and save them in `results/figures/`. 
Please also manually verify that the hidden text is indeed embedded in the generated images.", 4 | "tags": [ 5 | "Computer Vision", 6 | "Generative Models", 7 | "Natural Language Processing" 8 | ], 9 | "requirements": [ 10 | { 11 | "requirement_id": 0, 12 | "prerequisites": [], 13 | "criteria": "The instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) are followed to set up the script mentioned for generating images with hidden text in `src/visualize.py`.", 14 | "category": "Dataset or Environment", 15 | "satisfied": false 16 | }, 17 | { 18 | "requirement_id": 1, 19 | "prerequisites": [ 20 | 0 21 | ], 22 | "criteria": "The generated images are ensured to be of 1080p resolution and saved in `results/figures/`.", 23 | "category": "Data preprocessing and postprocessing", 24 | "satisfied": false 25 | }, 26 | { 27 | "requirement_id": 2, 28 | "prerequisites": [ 29 | 0, 30 | 1 31 | ], 32 | "criteria": "Control images embedding the text \"FUTURE\" are created and saved in `results/figures/`.", 33 | "category": "Visualization", 34 | "satisfied": false 35 | } 36 | ], 37 | "preferences": [ 38 | { 39 | "preference_id": 0, 40 | "criteria": "The system should be capable of learning and using unfamiliar technologies, adapting to new tools or platforms as required.", 41 | "satisfied": null 42 | }, 43 | { 44 | "preference_id": 1, 45 | "criteria": "After reviewing the blog post, ControlNet should be successfully run on Modal to produce the images with the concealed messages for the user.", 46 | "satisfied": null 47 | } 48 | ], 49 | "is_kaggle_api_needed": false, 50 | "is_training_needed": false, 51 | "is_web_navigation_needed": true, 52 | "resource": "https://www.cognition.ai/blog/introducing-devin", 53 | "executed_successfully": false, 54 | "satisfied_all_requirements": false, 55 | "satisfied_all_preferences": false 56 | } 57 | -------------------------------------------------------------------------------- /benchmark/judgment/GPT-Pilot/human_as_a_judge/53_Devin_Upwork_Side_Hustle.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "53_Devin_Upwork_Side_Hustle", 3 | "query": "Hello, I am looking to make inferences with the models in this repository `https://github.com/mahdi65/roadDamageDetection2020`. The system should perform inference using the models from the repository and save the results in `models/saved_models/`. Sample data should be downloaded and prepared for testing the models in `src/data_loader.py`. Inference should be performed using the provided models on the sample data in `models/saved_models/`. Visualized images showing the detections made by the models should be generated and saved in the `results/figures/` directory. 
Also, a performance report based on the model's detection results should be generated and saved as `results/metrics/model_performance_report.txt`.", 4 | "tags": [ 5 | "Computer Vision" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The repository at `https://github.com/mahdi65/roadDamageDetection2020` is set up.", 12 | "category": "Machine Learning Method", 13 | "satisfied": false 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [ 18 | 0 19 | ], 20 | "criteria": "Sample data is downloaded and prepared for testing the models in `src/data_loader.py`.", 21 | "category": "Dataset or Environment", 22 | "satisfied": false 23 | }, 24 | { 25 | "requirement_id": 2, 26 | "prerequisites": [ 27 | 0, 28 | 1 29 | ], 30 | "criteria": "Inference is performed using the provided models on the sample data in `models/saved_models/`.", 31 | "category": "Other", 32 | "satisfied": false 33 | }, 34 | { 35 | "requirement_id": 3, 36 | "prerequisites": [ 37 | 0, 38 | 1, 39 | 2 40 | ], 41 | "criteria": "Visualized images showing the detections made by the models are generated and saved in the `results/figures/` directory.", 42 | "category": "Visualization", 43 | "satisfied": false 44 | }, 45 | { 46 | "requirement_id": 4, 47 | "prerequisites": [ 48 | 0, 49 | 1, 50 | 2, 51 | 3 52 | ], 53 | "criteria": "A performance report based on the model's detection results is generated and saved as `results/metrics/model_performance_report.txt`.", 54 | "category": "Performance Metrics", 55 | "satisfied": false 56 | } 57 | ], 58 | "preferences": [ 59 | { 60 | "preference_id": 0, 61 | "criteria": "The visualized images should be clear, with detections accurately highlighted for easy interpretation.", 62 | "satisfied": null 63 | }, 64 | { 65 | "preference_id": 1, 66 | "criteria": "The performance report should include a summary of detection accuracy and any issues encountered during inference.", 67 | "satisfied": null 68 | } 69 | ], 70 | "is_kaggle_api_needed": false, 71 | "is_training_needed": false, 72 | "is_web_navigation_needed": true, 73 | "resource": "https://www.cognition.ai/blog/introducing-devin", 74 | "executed_successfully": false, 75 | "satisfied_all_requirements": false, 76 | "satisfied_all_preferences": false 77 | } 78 | -------------------------------------------------------------------------------- /benchmark/judgment/MetaGPT/agent_as_a_judge/gray_box/02_Maze_Solver_Q_Learning_Gridworld_RL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "02_Maze_Solver_Q_Learning_Gridworld_RL", 3 | "query": "Can you help me create a system to solve maze-style Gridworld tasks using the Q-learning algorithm? The system should use numpy to make the core calculations more efficient and matplotlib for visualizations. The Q-learning algorithm should be implemented in `src/train.py`, and the aptly-named Gridworld environment should be implemented in `src/env.py` in such a way that one could specify the grid size and start/end positions when instantiating it. The system needs to record the learning curve during training, tracking episodes and their corresponding returns, and save it as `results/figures/learning_curve.png`. Additionally, I'd like you to visualize and save the paths taken by the agent in each episode in a file called `results/figures/path_changes.gif`, and save the trained model as `models/saved_models/q_learning_model.npy`. 
It would be great to have some form of real-time feedback during training, like seeing the progress or getting updates on how the model is learning. Also, if you can, please try and write the code in a way that's easy to modify or extend later on.", 4 | "tags": [ 5 | "Reinforcement Learning" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The \"Q-learning\" algorithm is used in `src/train.py`.", 12 | "category": "Machine Learning Method", 13 | "satisfied": false 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [], 18 | "criteria": "The \"Gridworld\" environment is defined in `src/env.py` with the ability for a user to specify a grid size and start/end positions.", 19 | "category": "Dataset or Environment", 20 | "satisfied": true 21 | }, 22 | { 23 | "requirement_id": 2, 24 | "prerequisites": [ 25 | 0, 26 | 1 27 | ], 28 | "criteria": "Learning curves are recorded during training, and saved as `results/figures/learning_curve.png`. Episodes and returns are recorded.", 29 | "category": "Visualization", 30 | "satisfied": true 31 | }, 32 | { 33 | "requirement_id": 3, 34 | "prerequisites": [ 35 | 0, 36 | 1, 37 | 2 38 | ], 39 | "criteria": "The learned model is saved as `models/saved_models/q_learning_model.npy`.", 40 | "category": "Save Trained Model", 41 | "satisfied": true 42 | }, 43 | { 44 | "requirement_id": 4, 45 | "prerequisites": [ 46 | 0, 47 | 1 48 | ], 49 | "criteria": "Paths taken during learning are visualized and saved as `results/figures/path_changes.gif`.", 50 | "category": "Visualization", 51 | "satisfied": true 52 | } 53 | ], 54 | "preferences": [ 55 | { 56 | "preference_id": 0, 57 | "criteria": "Some real-time progress or feedback during the training process should be displayed.", 58 | "satisfied": null 59 | }, 60 | { 61 | "preference_id": 1, 62 | "criteria": "The code should be written in a way that's easy to modify or extend later on.", 63 | "satisfied": null 64 | } 65 | ], 66 | "is_kaggle_api_needed": false, 67 | "is_training_needed": true, 68 | "is_web_navigation_needed": false 69 | } -------------------------------------------------------------------------------- /benchmark/judgment/MetaGPT/agent_as_a_judge/gray_box/03_Text_Classification_NaiveBayes_20Newsgroups_ML.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "03_Text_Classification_NaiveBayes_20Newsgroups_ML", 3 | "query": "Please implement a Naive Bayes classifier for the 20 Newsgroups dataset and save it in a file called `src/model.py`. The dataset should be loaded in `src/data_loader.py`. The program should handle data preprocessing, including removing stop words, punctuation, and special characters. Show the improvement of your classifier by generating word clouds before and after training your classifier and saving them as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`. Please calculate and include TF-IDF features when loading the data in `src/data_loader.py`. Lastly, print out a performance report (including precision, recall, and F1-score) and save it as `results/metrics/performance.txt`. 
The model should be straightforward to interpret, and the final report should be structured clearly for easy review.", 4 | "tags": [ 5 | "Classification", 6 | "Natural Language Processing", 7 | "Supervised Learning" 8 | ], 9 | "requirements": [ 10 | { 11 | "requirement_id": 0, 12 | "prerequisites": [], 13 | "criteria": "The \"20 Newsgroups\" dataset is used in `src/data_loader.py`.", 14 | "category": "Dataset or Environment", 15 | "satisfied": false 16 | }, 17 | { 18 | "requirement_id": 1, 19 | "prerequisites": [ 20 | 0 21 | ], 22 | "criteria": "Data preprocessing is performed, including removing stop words, punctuation, and special characters. Word clouds are visualized before and after training the classifier, and saved as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`.", 23 | "category": "Data preprocessing and postprocessing", 24 | "satisfied": false 25 | }, 26 | { 27 | "requirement_id": 2, 28 | "prerequisites": [ 29 | 0, 30 | 1 31 | ], 32 | "criteria": "\"TF-IDF\" features are used when loading the data in `src/data_loader.py`.", 33 | "category": "Data preprocessing and postprocessing", 34 | "satisfied": false 35 | }, 36 | { 37 | "requirement_id": 3, 38 | "prerequisites": [], 39 | "criteria": "A \"Naive Bayes classifier\" is implemented in `src/model.py`.", 40 | "category": "Machine Learning Method", 41 | "satisfied": false 42 | }, 43 | { 44 | "requirement_id": 4, 45 | "prerequisites": [ 46 | 0, 47 | 1, 48 | 2 49 | ], 50 | "criteria": "A performance report, including \"precision,\" \"recall,\" and the \"F1-score,\" is printed and saved as `results/metrics/performance.txt`.", 51 | "category": "Performance Metrics", 52 | "satisfied": false 53 | } 54 | ], 55 | "preferences": [ 56 | { 57 | "preference_id": 0, 58 | "criteria": "The model should be straightforward to interpret.", 59 | "satisfied": null 60 | }, 61 | { 62 | "preference_id": 1, 63 | "criteria": "The final report should be structured clearly for easy review.", 64 | "satisfied": null 65 | } 66 | ], 67 | "is_kaggle_api_needed": false, 68 | "is_training_needed": true, 69 | "is_web_navigation_needed": false 70 | } -------------------------------------------------------------------------------- /benchmark/judgment/MetaGPT/agent_as_a_judge/gray_box/05_Game_Simulation_DQN_CartPole_v1_RL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "05_Game_Simulation_DQN_CartPole_v1_RL", 3 | "query": "Create a reliable system to train a DQN agent using PyTorch in an OpenAI Gym environment such as CartPole-v1. Implement DQN using PyTorch in `src/model.py`. The environment should be instantiated in the main file, `src/main.py`. Handle any dimension mismatches between DQN and the environment (like would happen if the Gym environment ID were changed) by logging an error. Also please log the return after each episode to `results/metrics/` and update it after each new episode finishes. Save the final return over episode curve as `results/figures/return_over_time.png` and the final trained model as `models/saved_models/dqn_model.pt`. 
If possible, add annotations to the return over episode curve to showcase key milestones.", 4 | "tags": [ 5 | "Reinforcement Learning" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The \"DQN\" algorithm is implemented using PyTorch and saved in `src/model.py`.", 12 | "category": "Machine Learning Method", 13 | "satisfied": false 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [], 18 | "criteria": "An \"OpenAI Gym\" environment is instantiated in `src/main.py`.", 19 | "category": "Dataset or Environment", 20 | "satisfied": false 21 | }, 22 | { 23 | "requirement_id": 2, 24 | "prerequisites": [ 25 | 0, 26 | 1 27 | ], 28 | "criteria": "The return over time curve is plotted, recording the return of each episode, and saved as `results/figures/return_over_time.png`.", 29 | "category": "Visualization", 30 | "satisfied": false 31 | }, 32 | { 33 | "requirement_id": 3, 34 | "prerequisites": [ 35 | 0, 36 | 1 37 | ], 38 | "criteria": "The trained model is saved as `models/saved_models/dqn_model.pt`.", 39 | "category": "Save Trained Model", 40 | "satisfied": false 41 | }, 42 | { 43 | "requirement_id": 4, 44 | "prerequisites": [ 45 | 0, 46 | 1 47 | ], 48 | "criteria": "The return for each episode is logged to `results/metrics/` and updated after each episode finishes.", 49 | "category": "Performance Metrics", 50 | "satisfied": true 51 | } 52 | ], 53 | "preferences": [ 54 | { 55 | "preference_id": 0, 56 | "criteria": "The system should handle dimension mismatches, logging the issues for easy debugging.", 57 | "satisfied": null 58 | }, 59 | { 60 | "preference_id": 1, 61 | "criteria": "The return over episode curve has key milestones annotated.", 62 | "satisfied": null 63 | } 64 | ], 65 | "is_kaggle_api_needed": false, 66 | "is_training_needed": true, 67 | "is_web_navigation_needed": false 68 | } -------------------------------------------------------------------------------- /benchmark/judgment/MetaGPT/agent_as_a_judge/gray_box/06_Sentiment_Analysis_SVM_Sentiment140_ML.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "06_Sentiment_Analysis_SVM_Sentiment140_ML", 3 | "query": "Please help me build a system for sentiment analysis on tweets using the Sentiment140 dataset available from Hugging Face. Load the Sentiment140 dataset and, when loading the data, clean it by removing stop words, punctuation, and special characters, all in `src/data_loader.py`. Use Word2Vec or GloVe for text vectorization. This should occur in `src/data_loader.py`. Next, implement and train an SVM classifier in `src/model.py`. Finally, write a report of the accuracy of the classifier to `results/metrics/accuracy_score.txt`. 
Ideally, the report should be easily interpretable.", 4 | "tags": [ 5 | "Natural Language Processing", 6 | "Supervised Learning" 7 | ], 8 | "requirements": [ 9 | { 10 | "requirement_id": 0, 11 | "prerequisites": [], 12 | "criteria": "The \"Sentiment140\" dataset, available from \"Hugging Face,\" is obtained in `src/data_loader.py`.", 13 | "category": "Dataset or Environment", 14 | "satisfied": false 15 | }, 16 | { 17 | "requirement_id": 1, 18 | "prerequisites": [ 19 | 0 20 | ], 21 | "criteria": "The dataset is cleaned, including by removing stop words, punctuation, and special characters, all in `src/data_loader.py`.", 22 | "category": "Data preprocessing and postprocessing", 23 | "satisfied": false 24 | }, 25 | { 26 | "requirement_id": 2, 27 | "prerequisites": [ 28 | 0, 29 | 1 30 | ], 31 | "criteria": "Word embeddings, either \"Word2Vec\" or \"GloVe,\" are used to convert text to vectors in `src/data_loader.py`.", 32 | "category": "Machine Learning Method", 33 | "satisfied": false 34 | }, 35 | { 36 | "requirement_id": 3, 37 | "prerequisites": [ 38 | 0, 39 | 1, 40 | 2 41 | ], 42 | "criteria": "An \"SVM classifier\" is implemented and trained in `src/model.py`.", 43 | "category": "Machine Learning Method", 44 | "satisfied": false 45 | }, 46 | { 47 | "requirement_id": 4, 48 | "prerequisites": [ 49 | 1, 50 | 2, 51 | 3 52 | ], 53 | "criteria": "The accuracy score is printed and saved as `results/metrics/accuracy_score.txt`.", 54 | "category": "Performance Metrics", 55 | "satisfied": false 56 | } 57 | ], 58 | "preferences": [ 59 | { 60 | "preference_id": 0, 61 | "criteria": "The dataset download process should be reliable, with clear error handling.", 62 | "satisfied": null 63 | }, 64 | { 65 | "preference_id": 1, 66 | "criteria": "The final accuracy report should be straightforward and easy to interpret.", 67 | "satisfied": null 68 | } 69 | ], 70 | "is_kaggle_api_needed": false, 71 | "is_training_needed": true, 72 | "is_web_navigation_needed": false 73 | } -------------------------------------------------------------------------------- /benchmark/judgment/MetaGPT/agent_as_a_judge/gray_box/07_Image_Super_Resolution_SRCNN_Set5_DL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "07_Image_Super_Resolution_SRCNN_Set5_DL", 3 | "query": "Hi, I need to create a project for image super-resolution using the SRCNN model with the Set5 dataset (available from `https://huggingface.co/datasets/eugenesiow/Set5`). Load the dataset in `src/data_loader.py`. When loading the data, include image preprocessing steps such as resizing and normalization, all in `src/data_loader.py`. The SRCNN model should be loaded and used in `src/model.py`. Save 5 sets of comparison images, zooming in on details, as `results/figures/super_resolution_compare.png`, and the super-resolution results as `results/figures/super_resolution_results.png`. 
The generated images should be high-quality and clearly show improvements.", 4 | "tags": [ 5 | "Computer Vision", 6 | "Generative Models" 7 | ], 8 | "requirements": [ 9 | { 10 | "requirement_id": 0, 11 | "prerequisites": [], 12 | "criteria": "The \"Set5\" dataset (available from \"Hugging Face\") is loaded in `src/data_loader.py`.", 13 | "category": "Dataset or Environment", 14 | "satisfied": false 15 | }, 16 | { 17 | "requirement_id": 1, 18 | "prerequisites": [ 19 | 0 20 | ], 21 | "criteria": "Image preprocessing, including resizing and normalization, is performed in `src/data_loader.py`.", 22 | "category": "Data preprocessing and postprocessing", 23 | "satisfied": false 24 | }, 25 | { 26 | "requirement_id": 2, 27 | "prerequisites": [], 28 | "criteria": "The \"SRCNN\" model is used in `src/model.py`.", 29 | "category": "Machine Learning Method", 30 | "satisfied": false 31 | }, 32 | { 33 | "requirement_id": 3, 34 | "prerequisites": [ 35 | 0, 36 | 1, 37 | 2 38 | ], 39 | "criteria": "Five sets of comparison images, with details zoomed in, are saved as `results/figures/super_resolution_compare.png`.", 40 | "category": "Visualization", 41 | "satisfied": false 42 | }, 43 | { 44 | "requirement_id": 4, 45 | "prerequisites": [ 46 | 0, 47 | 1, 48 | 2 49 | ], 50 | "criteria": "Super-resolution results are saved as `results/figures/super_resolution_results.png`.", 51 | "category": "Visualization", 52 | "satisfied": false 53 | } 54 | ], 55 | "preferences": [ 56 | { 57 | "preference_id": 0, 58 | "criteria": "The project should generate high-quality, clear super-resolution images with detailed comparisons.", 59 | "satisfied": null 60 | }, 61 | { 62 | "preference_id": 1, 63 | "criteria": "Well-organized output images, highlighting key improvements, should be included.", 64 | "satisfied": null 65 | } 66 | ], 67 | "is_kaggle_api_needed": false, 68 | "is_training_needed": true, 69 | "is_web_navigation_needed": true 70 | } -------------------------------------------------------------------------------- /benchmark/judgment/MetaGPT/agent_as_a_judge/gray_box/08_Robot_Control_PPO_PyBullet_RL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "08_Robot_Control_PPO_PyBullet_RL", 3 | "query": "I am seeking to implement a project which explores robotic arm control via reinforcement learning in the PyBullet simulation environment with the PPO algorithm. The PyBullet simulator should be imported and a related robotics environment should be loaded in `src/env.py`. The PPO algorithm should be implemented in `src/train.py`. The project should meticulously document the robot's final position, printing and saving it as `data/final_position.txt`. The training return trajectory should be graphed and saved as `results/figures/training_returns.png`. A sample of the robot's motion should be visualized and saved as `results/figures/robot_motion.gif`. A detailed environment setup and reward structure description should be provided in `src/env.py`. 
Please ensure that any issues with loading URDF files in PyBullet are clearly handled and documented, providing clear error messages or logging for debugging.", 4 | "tags": [ 5 | "Reinforcement Learning" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The \"PyBullet\" simulator is used in `src/env.py`.", 12 | "category": "Dataset or Environment", 13 | "satisfied": true 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [], 18 | "criteria": "The \"PPO\" algorithm is used in `src/train.py`.", 19 | "category": "Machine Learning Method", 20 | "satisfied": false 21 | }, 22 | { 23 | "requirement_id": 2, 24 | "prerequisites": [ 25 | 0 26 | ], 27 | "criteria": "A detailed environment setup and reward structure description is provided in `src/env.py`.", 28 | "category": "Dataset or Environment", 29 | "satisfied": false 30 | }, 31 | { 32 | "requirement_id": 3, 33 | "prerequisites": [ 34 | 0, 35 | 1, 36 | 2 37 | ], 38 | "criteria": "The robot's final position is printed and saved as `data/final_position.txt`.", 39 | "category": "Other", 40 | "satisfied": false 41 | }, 42 | { 43 | "requirement_id": 4, 44 | "prerequisites": [ 45 | 0, 46 | 1, 47 | 2 48 | ], 49 | "criteria": "The training returns over time curve is recorded and saved as `results/figures/training_returns.png`.", 50 | "category": "Visualization", 51 | "satisfied": false 52 | }, 53 | { 54 | "requirement_id": 5, 55 | "prerequisites": [ 56 | 0, 57 | 1, 58 | 2 59 | ], 60 | "criteria": "A sample of the robot's motion is visualized and saved as `results/figures/robot_motion.gif`.", 61 | "category": "Visualization", 62 | "satisfied": false 63 | } 64 | ], 65 | "preferences": [ 66 | { 67 | "preference_id": 0, 68 | "criteria": "The system should effectively handle potential issues with loading URDF files in PyBullet, providing clear error messages or logging for debugging.", 69 | "satisfied": null 70 | } 71 | ], 72 | "is_kaggle_api_needed": false, 73 | "is_training_needed": true, 74 | "is_web_navigation_needed": false 75 | } -------------------------------------------------------------------------------- /benchmark/judgment/MetaGPT/agent_as_a_judge/gray_box/51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images", 3 | "query": "Hi! Please follow the instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) to set up the script mentioned for generating images with hidden text in `src/visualize.py`. Ensure the generated images are of 1080p resolution and saved in `results/figures/`. Create control images embedding the text \"FUTURE\" and save them in `results/figures/`. 
Please also manually verify that the hidden text is indeed embedded in the generated images.", 4 | "tags": [ 5 | "Computer Vision", 6 | "Generative Models", 7 | "Natural Language Processing" 8 | ], 9 | "requirements": [ 10 | { 11 | "requirement_id": 0, 12 | "prerequisites": [], 13 | "criteria": "The instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) are followed to set up the script mentioned for generating images with hidden text in `src/visualize.py`.", 14 | "category": "Dataset or Environment", 15 | "satisfied": false 16 | }, 17 | { 18 | "requirement_id": 1, 19 | "prerequisites": [ 20 | 0 21 | ], 22 | "criteria": "The generated images are ensured to be of 1080p resolution and saved in `results/figures/`.", 23 | "category": "Data preprocessing and postprocessing", 24 | "satisfied": false 25 | }, 26 | { 27 | "requirement_id": 2, 28 | "prerequisites": [ 29 | 0, 30 | 1 31 | ], 32 | "criteria": "Control images embedding the text \"FUTURE\" are created and saved in `results/figures/`.", 33 | "category": "Visualization", 34 | "satisfied": true 35 | } 36 | ], 37 | "preferences": [ 38 | { 39 | "preference_id": 0, 40 | "criteria": "The system should be capable of learning and using unfamiliar technologies, adapting to new tools or platforms as required.", 41 | "satisfied": null 42 | }, 43 | { 44 | "preference_id": 1, 45 | "criteria": "After reviewing the blog post, ControlNet should be successfully run on Modal to produce the images with the concealed messages for the user.", 46 | "satisfied": null 47 | } 48 | ], 49 | "is_kaggle_api_needed": false, 50 | "is_training_needed": false, 51 | "is_web_navigation_needed": true, 52 | "resource": "https://www.cognition.ai/blog/introducing-devin" 53 | } -------------------------------------------------------------------------------- /benchmark/judgment/MetaGPT/agent_as_a_judge/gray_box/52_Devin_AI_Trains_an_AI.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "52_Devin_AI_Trains_an_AI", 3 | "query": "Can you finetune a 7B LLaMA model using `https://github.com/artidoro/qlora`? Follow the instructions in the repository to finetune the 7B LLaMA model and save it in `models/saved_models/`. Ensure the necessary environment and dependencies are set up as outlined in `src/env.py`. Download and prepare the datasets required for finetuning the model as specified in `src/data_loader.py`. Complete the finetuning process, ensuring all configurations are properly set in accordance with qlora. 
Save the finetuned model and training summary, storing them in the specified directory as `results/metrics/finetuning_summary.txt`.", 4 | "tags": [ 5 | "Generative Models", 6 | "Natural Language Processing" 7 | ], 8 | "requirements": [ 9 | { 10 | "requirement_id": 0, 11 | "prerequisites": [], 12 | "criteria": "The instructions in the repository at `https://github.com/artidoro/qlora` are followed to finetune a \"7B LLaMA\" model and save it in `models/saved_models/`.", 13 | "category": "Machine Learning Method", 14 | "satisfied": false 15 | }, 16 | { 17 | "requirement_id": 1, 18 | "prerequisites": [ 19 | 0 20 | ], 21 | "criteria": "The necessary environment and dependencies are set up as outlined in `src/env.py`.", 22 | "category": "Dataset or Environment", 23 | "satisfied": false 24 | }, 25 | { 26 | "requirement_id": 2, 27 | "prerequisites": [ 28 | 0, 29 | 1 30 | ], 31 | "criteria": "The datasets required for finetuning the model are downloaded and prepared in `src/data_loader.py`.", 32 | "category": "Dataset or Environment", 33 | "satisfied": false 34 | }, 35 | { 36 | "requirement_id": 3, 37 | "prerequisites": [ 38 | 0, 39 | 1, 40 | 2 41 | ], 42 | "criteria": "The finetuning process is completed, ensuring all configurations are properly set in accordance with \"qlora.\"", 43 | "category": "Machine Learning Method", 44 | "satisfied": false 45 | }, 46 | { 47 | "requirement_id": 4, 48 | "prerequisites": [ 49 | 0, 50 | 1, 51 | 2, 52 | 3 53 | ], 54 | "criteria": "The finetuned model and training summary are saved, storing them in the specified directory as `results/metrics/finetuning_summary.txt`.", 55 | "category": "Save Trained Model", 56 | "satisfied": false 57 | } 58 | ], 59 | "preferences": [ 60 | { 61 | "preference_id": 0, 62 | "criteria": "The finetuning process should include validation steps to monitor overfitting or other issues.", 63 | "satisfied": null 64 | }, 65 | { 66 | "preference_id": 1, 67 | "criteria": "A detailed report on the finetuning process, including any challenges faced and how they were overcome, should be generated and saved as `results/metrics/finetuning_summary.txt`.", 68 | "satisfied": null 69 | } 70 | ], 71 | "is_kaggle_api_needed": false, 72 | "is_training_needed": true, 73 | "is_web_navigation_needed": true, 74 | "resource": "https://www.cognition.ai/blog/introducing-devin" 75 | } -------------------------------------------------------------------------------- /benchmark/judgment/MetaGPT/agent_as_a_judge/gray_box/53_Devin_Upwork_Side_Hustle.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "53_Devin_Upwork_Side_Hustle", 3 | "query": "Hello, I am looking to make inferences with the models in this repository `https://github.com/mahdi65/roadDamageDetection2020`. The system should perform inference using the models from the repository and save the results in `models/saved_models/`. Sample data should be downloaded and prepared for testing the models in `src/data_loader.py`. Inference should be performed using the provided models on the sample data in `models/saved_models/`. Visualized images showing the detections made by the models should be generated and saved in the `results/figures/` directory. 
Also, a performance report based on the model's detection results should be generated and saved as `results/metrics/model_performance_report.txt`.", 4 | "tags": [ 5 | "Computer Vision" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The repository at `https://github.com/mahdi65/roadDamageDetection2020` is set up.", 12 | "category": "Machine Learning Method", 13 | "satisfied": false 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [ 18 | 0 19 | ], 20 | "criteria": "Sample data is downloaded and prepared for testing the models in `src/data_loader.py`.", 21 | "category": "Dataset or Environment", 22 | "satisfied": false 23 | }, 24 | { 25 | "requirement_id": 2, 26 | "prerequisites": [ 27 | 0, 28 | 1 29 | ], 30 | "criteria": "Inference is performed using the provided models on the sample data in `models/saved_models/`.", 31 | "category": "Other", 32 | "satisfied": false 33 | }, 34 | { 35 | "requirement_id": 3, 36 | "prerequisites": [ 37 | 0, 38 | 1, 39 | 2 40 | ], 41 | "criteria": "Visualized images showing the detections made by the models are generated and saved in the `results/figures/` directory.", 42 | "category": "Visualization", 43 | "satisfied": false 44 | }, 45 | { 46 | "requirement_id": 4, 47 | "prerequisites": [ 48 | 0, 49 | 1, 50 | 2, 51 | 3 52 | ], 53 | "criteria": "A performance report based on the model's detection results is generated and saved as `results/metrics/model_performance_report.txt`.", 54 | "category": "Performance Metrics", 55 | "satisfied": false 56 | } 57 | ], 58 | "preferences": [ 59 | { 60 | "preference_id": 0, 61 | "criteria": "The visualized images should be clear, with detections accurately highlighted for easy interpretation.", 62 | "satisfied": null 63 | }, 64 | { 65 | "preference_id": 1, 66 | "criteria": "The performance report should include a summary of detection accuracy and any issues encountered during inference.", 67 | "satisfied": null 68 | } 69 | ], 70 | "is_kaggle_api_needed": false, 71 | "is_training_needed": false, 72 | "is_web_navigation_needed": true, 73 | "resource": "https://www.cognition.ai/blog/introducing-devin" 74 | } -------------------------------------------------------------------------------- /benchmark/judgment/MetaGPT/human_as_a_judge/05_Game_Simulation_DQN_CartPole_v1_RL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "05_Game_Simulation_DQN_CartPole_v1_RL", 3 | "query": "Create a reliable system to train a DQN agent using PyTorch in an OpenAI Gym environment such as CartPole-v1. Implement DQN using PyTorch in `src/model.py`. The environment should be instantiated in the main file, `src/main.py`. Handle any dimension mismatches between DQN and the environment (like would happen if the Gym environment ID were changed) by logging an error. Also please log the return after each episode to `results/metrics/` and update it after each new episode finishes. Save the final return over episode curve as `results/figures/return_over_time.png` and the final trained model as `models/saved_models/dqn_model.pt`. 
If possible, add annotations to the return over episode curve to showcase key milestones.", 4 | "tags": [ 5 | "Reinforcement Learning" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The \"DQN\" algorithm is implemented using PyTorch and saved in `src/model.py`.", 12 | "category": "Machine Learning Method", 13 | "satisfied": false 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [], 18 | "criteria": "An \"OpenAI Gym\" environment is instantiated in `src/main.py`.", 19 | "category": "Dataset or Environment", 20 | "satisfied": false 21 | }, 22 | { 23 | "requirement_id": 2, 24 | "prerequisites": [ 25 | 0, 26 | 1 27 | ], 28 | "criteria": "The return over time curve is plotted, recording the return of each episode, and saved as `results/figures/return_over_time.png`.", 29 | "category": "Visualization", 30 | "satisfied": false 31 | }, 32 | { 33 | "requirement_id": 3, 34 | "prerequisites": [ 35 | 0, 36 | 1 37 | ], 38 | "criteria": "The trained model is saved as `models/saved_models/dqn_model.pt`.", 39 | "category": "Save Trained Model", 40 | "satisfied": true 41 | }, 42 | { 43 | "requirement_id": 4, 44 | "prerequisites": [ 45 | 0, 46 | 1 47 | ], 48 | "criteria": "The return for each episode is logged to `results/metrics/` and updated after each episode finishes.", 49 | "category": "Performance Metrics", 50 | "satisfied": true 51 | } 52 | ], 53 | "preferences": [ 54 | { 55 | "preference_id": 0, 56 | "criteria": "The system should handle dimension mismatches, logging the issues for easy debugging.", 57 | "satisfied": null 58 | }, 59 | { 60 | "preference_id": 1, 61 | "criteria": "The return over episode curve has key milestones annotated.", 62 | "satisfied": null 63 | } 64 | ], 65 | "is_kaggle_api_needed": false, 66 | "is_training_needed": true, 67 | "is_web_navigation_needed": false, 68 | "executed_successfully": false, 69 | "satisfied_all_requirements": false, 70 | "satisfied_all_preferences": false 71 | } 72 | -------------------------------------------------------------------------------- /benchmark/judgment/MetaGPT/human_as_a_judge/06_Sentiment_Analysis_SVM_Sentiment140_ML.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "06_Sentiment_Analysis_SVM_Sentiment140_ML", 3 | "query": "Please help me build a system for sentiment analysis on tweets using the Sentiment140 dataset available from Hugging Face. Load the Sentiment140 dataset and, when loading the data, clean it by removing stop words, punctuation, and special characters, all in `src/data_loader.py`. Use Word2Vec or GloVe for text vectorization. This should occur in `src/data_loader.py`. Next, implement and train an SVM classifier in `src/model.py`. Finally, write a report of the accuracy of the classifier to `results/metrics/accuracy_score.txt`. 
Ideally, the report should be easily interpretable.", 4 | "tags": [ 5 | "Natural Language Processing", 6 | "Supervised Learning" 7 | ], 8 | "requirements": [ 9 | { 10 | "requirement_id": 0, 11 | "prerequisites": [], 12 | "criteria": "The \"Sentiment140\" dataset, available from \"Hugging Face,\" is obtained in `src/data_loader.py`.", 13 | "category": "Dataset or Environment", 14 | "satisfied": false 15 | }, 16 | { 17 | "requirement_id": 1, 18 | "prerequisites": [ 19 | 0 20 | ], 21 | "criteria": "The dataset is cleaned, including by removing stop words, punctuation, and special characters, all in `src/data_loader.py`.", 22 | "category": "Data preprocessing and postprocessing", 23 | "satisfied": false 24 | }, 25 | { 26 | "requirement_id": 2, 27 | "prerequisites": [ 28 | 0, 29 | 1 30 | ], 31 | "criteria": "Word embeddings, either \"Word2Vec\" or \"GloVe,\" are used to convert text to vectors in `src/data_loader.py`.", 32 | "category": "Machine Learning Method", 33 | "satisfied": false 34 | }, 35 | { 36 | "requirement_id": 3, 37 | "prerequisites": [ 38 | 0, 39 | 1, 40 | 2 41 | ], 42 | "criteria": "An \"SVM classifier\" is implemented and trained in `src/model.py`.", 43 | "category": "Machine Learning Method", 44 | "satisfied": false 45 | }, 46 | { 47 | "requirement_id": 4, 48 | "prerequisites": [ 49 | 1, 50 | 2, 51 | 3 52 | ], 53 | "criteria": "The accuracy score is printed and saved as `results/metrics/accuracy_score.txt`.", 54 | "category": "Performance Metrics", 55 | "satisfied": false 56 | } 57 | ], 58 | "preferences": [ 59 | { 60 | "preference_id": 0, 61 | "criteria": "The dataset download process should be reliable, with clear error handling.", 62 | "satisfied": null 63 | }, 64 | { 65 | "preference_id": 1, 66 | "criteria": "The final accuracy report should be straightforward and easy to interpret.", 67 | "satisfied": null 68 | } 69 | ], 70 | "is_kaggle_api_needed": false, 71 | "is_training_needed": true, 72 | "is_web_navigation_needed": false, 73 | "executed_successfully": false, 74 | "satisfied_all_requirements": false, 75 | "satisfied_all_preferences": false 76 | } 77 | -------------------------------------------------------------------------------- /benchmark/judgment/MetaGPT/human_as_a_judge/07_Image_Super_Resolution_SRCNN_Set5_DL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "07_Image_Super_Resolution_SRCNN_Set5_DL", 3 | "query": "Hi, I need to create a project for image super-resolution using the SRCNN model with the Set5 dataset (available from `https://huggingface.co/datasets/eugenesiow/Set5`). Load the dataset in `src/data_loader.py`. When loading the data, include image preprocessing steps such as resizing and normalization, all in `src/data_loader.py`. The SRCNN model should be loaded and used in `src/model.py`. Save 5 sets of comparison images, zooming in on details, as `results/figures/super_resolution_compare.png`, and the super-resolution results as `results/figures/super_resolution_results.png`. 
The generated images should be high-quality and clearly show improvements.", 4 | "tags": [ 5 | "Computer Vision", 6 | "Generative Models" 7 | ], 8 | "requirements": [ 9 | { 10 | "requirement_id": 0, 11 | "prerequisites": [], 12 | "criteria": "The \"Set5\" dataset (available from \"Hugging Face\") is loaded in `src/data_loader.py`.", 13 | "category": "Dataset or Environment", 14 | "satisfied": false 15 | }, 16 | { 17 | "requirement_id": 1, 18 | "prerequisites": [ 19 | 0 20 | ], 21 | "criteria": "Image preprocessing, including resizing and normalization, is performed in `src/data_loader.py`.", 22 | "category": "Data preprocessing and postprocessing", 23 | "satisfied": false 24 | }, 25 | { 26 | "requirement_id": 2, 27 | "prerequisites": [], 28 | "criteria": "The \"SRCNN\" model is used in `src/model.py`.", 29 | "category": "Machine Learning Method", 30 | "satisfied": false 31 | }, 32 | { 33 | "requirement_id": 3, 34 | "prerequisites": [ 35 | 0, 36 | 1, 37 | 2 38 | ], 39 | "criteria": "Five sets of comparison images, with details zoomed in, are saved as `results/figures/super_resolution_compare.png`.", 40 | "category": "Visualization", 41 | "satisfied": false 42 | }, 43 | { 44 | "requirement_id": 4, 45 | "prerequisites": [ 46 | 0, 47 | 1, 48 | 2 49 | ], 50 | "criteria": "Super-resolution results are saved as `results/figures/super_resolution_results.png`.", 51 | "category": "Visualization", 52 | "satisfied": false 53 | } 54 | ], 55 | "preferences": [ 56 | { 57 | "preference_id": 0, 58 | "criteria": "The project should generate high-quality, clear super-resolution images with detailed comparisons.", 59 | "satisfied": null 60 | }, 61 | { 62 | "preference_id": 1, 63 | "criteria": "Well-organized output images, highlighting key improvements, should be included.", 64 | "satisfied": null 65 | } 66 | ], 67 | "is_kaggle_api_needed": false, 68 | "is_training_needed": true, 69 | "is_web_navigation_needed": true, 70 | "executed_successfully": false, 71 | "satisfied_all_requirements": false, 72 | "satisfied_all_preferences": false 73 | } 74 | -------------------------------------------------------------------------------- /benchmark/judgment/MetaGPT/human_as_a_judge/51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images", 3 | "query": "Hi! Please follow the instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) to set up the script mentioned for generating images with hidden text in `src/visualize.py`. Ensure the generated images are of 1080p resolution and saved in `results/figures/`. Create control images embedding the text \"FUTURE\" and save them in `results/figures/`. 
Please also manually verify that the hidden text is indeed embedded in the generated images.", 4 | "tags": [ 5 | "Computer Vision", 6 | "Generative Models", 7 | "Natural Language Processing" 8 | ], 9 | "requirements": [ 10 | { 11 | "requirement_id": 0, 12 | "prerequisites": [], 13 | "criteria": "The instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) are followed to set up the script mentioned for generating images with hidden text in `src/visualize.py`.", 14 | "category": "Dataset or Environment", 15 | "satisfied": false 16 | }, 17 | { 18 | "requirement_id": 1, 19 | "prerequisites": [ 20 | 0 21 | ], 22 | "criteria": "The generated images are ensured to be of 1080p resolution and saved in `results/figures/`.", 23 | "category": "Data preprocessing and postprocessing", 24 | "satisfied": false 25 | }, 26 | { 27 | "requirement_id": 2, 28 | "prerequisites": [ 29 | 0, 30 | 1 31 | ], 32 | "criteria": "Control images embedding the text \"FUTURE\" are created and saved in `results/figures/`.", 33 | "category": "Visualization", 34 | "satisfied": false 35 | } 36 | ], 37 | "preferences": [ 38 | { 39 | "preference_id": 0, 40 | "criteria": "The system should be capable of learning and using unfamiliar technologies, adapting to new tools or platforms as required.", 41 | "satisfied": null 42 | }, 43 | { 44 | "preference_id": 1, 45 | "criteria": "After reviewing the blog post, ControlNet should be successfully run on Modal to produce the images with the concealed messages for the user.", 46 | "satisfied": null 47 | } 48 | ], 49 | "is_kaggle_api_needed": false, 50 | "is_training_needed": false, 51 | "is_web_navigation_needed": true, 52 | "resource": "https://www.cognition.ai/blog/introducing-devin", 53 | "executed_successfully": true, 54 | "satisfied_all_requirements": false, 55 | "satisfied_all_preferences": false 56 | } 57 | -------------------------------------------------------------------------------- /benchmark/judgment/MetaGPT/human_as_a_judge/53_Devin_Upwork_Side_Hustle.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "53_Devin_Upwork_Side_Hustle", 3 | "query": "Hello, I am looking to make inferences with the models in this repository `https://github.com/mahdi65/roadDamageDetection2020`. The system should perform inference using the models from the repository and save the results in `models/saved_models/`. Sample data should be downloaded and prepared for testing the models in `src/data_loader.py`. Inference should be performed using the provided models on the sample data in `models/saved_models/`. Visualized images showing the detections made by the models should be generated and saved in the `results/figures/` directory. 
Also, a performance report based on the model's detection results should be generated and saved as `results/metrics/model_performance_report.txt`.", 4 | "tags": [ 5 | "Computer Vision" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The repository at `https://github.com/mahdi65/roadDamageDetection2020` is set up.", 12 | "category": "Machine Learning Method", 13 | "satisfied": true 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [ 18 | 0 19 | ], 20 | "criteria": "Sample data is downloaded and prepared for testing the models in `src/data_loader.py`.", 21 | "category": "Dataset or Environment", 22 | "satisfied": false 23 | }, 24 | { 25 | "requirement_id": 2, 26 | "prerequisites": [ 27 | 0, 28 | 1 29 | ], 30 | "criteria": "Inference is performed using the provided models on the sample data in `models/saved_models/`.", 31 | "category": "Other", 32 | "satisfied": false 33 | }, 34 | { 35 | "requirement_id": 3, 36 | "prerequisites": [ 37 | 0, 38 | 1, 39 | 2 40 | ], 41 | "criteria": "Visualized images showing the detections made by the models are generated and saved in the `results/figures/` directory.", 42 | "category": "Visualization", 43 | "satisfied": false 44 | }, 45 | { 46 | "requirement_id": 4, 47 | "prerequisites": [ 48 | 0, 49 | 1, 50 | 2, 51 | 3 52 | ], 53 | "criteria": "A performance report based on the model's detection results is generated and saved as `results/metrics/model_performance_report.txt`.", 54 | "category": "Performance Metrics", 55 | "satisfied": false 56 | } 57 | ], 58 | "preferences": [ 59 | { 60 | "preference_id": 0, 61 | "criteria": "The visualized images should be clear, with detections accurately highlighted for easy interpretation.", 62 | "satisfied": null 63 | }, 64 | { 65 | "preference_id": 1, 66 | "criteria": "The performance report should include a summary of detection accuracy and any issues encountered during inference.", 67 | "satisfied": null 68 | } 69 | ], 70 | "is_kaggle_api_needed": false, 71 | "is_training_needed": false, 72 | "is_web_navigation_needed": true, 73 | "resource": "https://www.cognition.ai/blog/introducing-devin", 74 | "executed_successfully": false, 75 | "satisfied_all_requirements": false, 76 | "satisfied_all_preferences": false 77 | } 78 | -------------------------------------------------------------------------------- /benchmark/judgment/OpenHands/agent_as_a_judge/gray_box/02_Maze_Solver_Q_Learning_Gridworld_RL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "02_Maze_Solver_Q_Learning_Gridworld_RL", 3 | "query": "Can you help me create a system to solve maze-style Gridworld tasks using the Q-learning algorithm? The system should use numpy to make the core calculations more efficient and matplotlib for visualizations. The Q-learning algorithm should be implemented in `src/train.py`, and the aptly-named Gridworld environment should be implemented in `src/env.py` in such a way that one could specify the grid size and start/end positions when instantiating it. The system needs to record the learning curve during training, tracking episodes and their corresponding returns, and save it as `results/figures/learning_curve.png`. Additionally, I'd like you to visualize and save the paths taken by the agent in each episode in a file called `results/figures/path_changes.gif`, and save the trained model as `models/saved_models/q_learning_model.npy`. 
It would be great to have some form of real-time feedback during training, like seeing the progress or getting updates on how the model is learning. Also, if you can, please try and write the code in a way that's easy to modify or extend later on.", 4 | "tags": [ 5 | "Reinforcement Learning" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The \"Q-learning\" algorithm is used in `src/train.py`.", 12 | "category": "Machine Learning Method", 13 | "satisfied": true 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [], 18 | "criteria": "The \"Gridworld\" environment is defined in `src/env.py` with the ability for a user to specify a grid size and start/end positions.", 19 | "category": "Dataset or Environment", 20 | "satisfied": true 21 | }, 22 | { 23 | "requirement_id": 2, 24 | "prerequisites": [ 25 | 0, 26 | 1 27 | ], 28 | "criteria": "Learning curves are recorded during training, and saved as `results/figures/learning_curve.png`. Episodes and returns are recorded.", 29 | "category": "Visualization", 30 | "satisfied": false 31 | }, 32 | { 33 | "requirement_id": 3, 34 | "prerequisites": [ 35 | 0, 36 | 1, 37 | 2 38 | ], 39 | "criteria": "The learned model is saved as `models/saved_models/q_learning_model.npy`.", 40 | "category": "Save Trained Model", 41 | "satisfied": false 42 | }, 43 | { 44 | "requirement_id": 4, 45 | "prerequisites": [ 46 | 0, 47 | 1 48 | ], 49 | "criteria": "Paths taken during learning are visualized and saved as `results/figures/path_changes.gif`.", 50 | "category": "Visualization", 51 | "satisfied": false 52 | } 53 | ], 54 | "preferences": [ 55 | { 56 | "preference_id": 0, 57 | "criteria": "Some real-time progress or feedback during the training process should be displayed.", 58 | "satisfied": null 59 | }, 60 | { 61 | "preference_id": 1, 62 | "criteria": "The code should be written in a way that's easy to modify or extend later on.", 63 | "satisfied": null 64 | } 65 | ], 66 | "is_kaggle_api_needed": false, 67 | "is_training_needed": true, 68 | "is_web_navigation_needed": false 69 | } -------------------------------------------------------------------------------- /benchmark/judgment/OpenHands/agent_as_a_judge/gray_box/03_Text_Classification_NaiveBayes_20Newsgroups_ML.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "03_Text_Classification_NaiveBayes_20Newsgroups_ML", 3 | "query": "Please implement a Naive Bayes classifier for the 20 Newsgroups dataset and save it in a file called `src/model.py`. The dataset should be loaded in `src/data_loader.py`. The program should handle data preprocessing, including removing stop words, punctuation, and special characters. Show the improvement of your classifier by generating word clouds before and after training your classifier and saving them as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`. Please calculate and include TF-IDF features when loading the data in `src/data_loader.py`. Lastly, print out a performance report (including precision, recall, and F1-score) and save it as `results/metrics/performance.txt`. 
The model should be straightforward to interpret, and the final report should be structured clearly for easy review.", 4 | "tags": [ 5 | "Classification", 6 | "Natural Language Processing", 7 | "Supervised Learning" 8 | ], 9 | "requirements": [ 10 | { 11 | "requirement_id": 0, 12 | "prerequisites": [], 13 | "criteria": "The \"20 Newsgroups\" dataset is used in `src/data_loader.py`.", 14 | "category": "Dataset or Environment", 15 | "satisfied": true 16 | }, 17 | { 18 | "requirement_id": 1, 19 | "prerequisites": [ 20 | 0 21 | ], 22 | "criteria": "Data preprocessing is performed, including removing stop words, punctuation, and special characters. Word clouds are visualized before and after training the classifier, and saved as `results/figures/wordcloud_before.png` and `results/figures/wordcloud_after.png`.", 23 | "category": "Data preprocessing and postprocessing", 24 | "satisfied": false 25 | }, 26 | { 27 | "requirement_id": 2, 28 | "prerequisites": [ 29 | 0, 30 | 1 31 | ], 32 | "criteria": "\"TF-IDF\" features are used when loading the data in `src/data_loader.py`.", 33 | "category": "Data preprocessing and postprocessing", 34 | "satisfied": true 35 | }, 36 | { 37 | "requirement_id": 3, 38 | "prerequisites": [], 39 | "criteria": "A \"Naive Bayes classifier\" is implemented in `src/model.py`.", 40 | "category": "Machine Learning Method", 41 | "satisfied": true 42 | }, 43 | { 44 | "requirement_id": 4, 45 | "prerequisites": [ 46 | 0, 47 | 1, 48 | 2 49 | ], 50 | "criteria": "A performance report, including \"precision,\" \"recall,\" and the \"F1-score,\" is printed and saved as `results/metrics/performance.txt`.", 51 | "category": "Performance Metrics", 52 | "satisfied": false 53 | } 54 | ], 55 | "preferences": [ 56 | { 57 | "preference_id": 0, 58 | "criteria": "The model should be straightforward to interpret.", 59 | "satisfied": null 60 | }, 61 | { 62 | "preference_id": 1, 63 | "criteria": "The final report should be structured clearly for easy review.", 64 | "satisfied": null 65 | } 66 | ], 67 | "is_kaggle_api_needed": false, 68 | "is_training_needed": true, 69 | "is_web_navigation_needed": false 70 | } -------------------------------------------------------------------------------- /benchmark/judgment/OpenHands/agent_as_a_judge/gray_box/05_Game_Simulation_DQN_CartPole_v1_RL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "05_Game_Simulation_DQN_CartPole_v1_RL", 3 | "query": "Create a reliable system to train a DQN agent using PyTorch in an OpenAI Gym environment such as CartPole-v1. Implement DQN using PyTorch in `src/model.py`. The environment should be instantiated in the main file, `src/main.py`. Handle any dimension mismatches between DQN and the environment (like would happen if the Gym environment ID were changed) by logging an error. Also please log the return after each episode to `results/metrics/` and update it after each new episode finishes. Save the final return over episode curve as `results/figures/return_over_time.png` and the final trained model as `models/saved_models/dqn_model.pt`. 
If possible, add annotations to the return over episode curve to showcase key milestones.", 4 | "tags": [ 5 | "Reinforcement Learning" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The \"DQN\" algorithm is implemented using PyTorch and saved in `src/model.py`.", 12 | "category": "Machine Learning Method", 13 | "satisfied": true 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [], 18 | "criteria": "An \"OpenAI Gym\" environment is instantiated in `src/main.py`.", 19 | "category": "Dataset or Environment", 20 | "satisfied": true 21 | }, 22 | { 23 | "requirement_id": 2, 24 | "prerequisites": [ 25 | 0, 26 | 1 27 | ], 28 | "criteria": "The return over time curve is plotted, recording the return of each episode, and saved as `results/figures/reward_over_time.png`.", 29 | "category": "Visualization", 30 | "satisfied": false 31 | }, 32 | { 33 | "requirement_id": 3, 34 | "prerequisites": [ 35 | 0, 36 | 1 37 | ], 38 | "criteria": "The trained model is saved as `models/saved_models/dqn_model.pt`.", 39 | "category": "Save Trained Model", 40 | "satisfied": false 41 | }, 42 | { 43 | "requirement_id": 4, 44 | "prerequisites": [ 45 | 0, 46 | 1 47 | ], 48 | "criteria": "The return for each episode is logged to `results/metrics/` and updated after each episode finishes.", 49 | "category": "Performance Metrics", 50 | "satisfied": false 51 | } 52 | ], 53 | "preferences": [ 54 | { 55 | "preference_id": 0, 56 | "criteria": "The system should handle dimension mismatches, logging the issues for easy debugging.", 57 | "satisfied": null 58 | }, 59 | { 60 | "preference_id": 1, 61 | "criteria": "The return over episode curve has key milestones annotated.", 62 | "satisfied": null 63 | } 64 | ], 65 | "is_kaggle_api_needed": false, 66 | "is_training_needed": true, 67 | "is_web_navigation_needed": false 68 | } -------------------------------------------------------------------------------- /benchmark/judgment/OpenHands/agent_as_a_judge/gray_box/06_Sentiment_Analysis_SVM_Sentiment140_ML.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "06_Sentiment_Analysis_SVM_Sentiment140_ML", 3 | "query": "Please help me build a system for sentiment analysis on tweets using the Sentiment140 dataset available from Hugging Face. Load the Sentiment140 dataset and, when loading the data, clean it by removing stop words, punctuation, and special characters, all in `src/data_loader.py`. Use Word2Vec or GloVe for text vectorization. This should occur in `src/data_loader.py`. Next, implement and train an SVM classifier in `src/model.py`. Finally, write a report of the accuracy of the classifier to `results/metrics/accuracy_score.txt`. 
Ideally, the report should be easily interpretable.", 4 | "tags": [ 5 | "Natural Language Processing", 6 | "Supervised Learning" 7 | ], 8 | "requirements": [ 9 | { 10 | "requirement_id": 0, 11 | "prerequisites": [], 12 | "criteria": "The \"Sentiment140\" dataset, available from \"Hugging Face,\" is obtained in `src/data_loader.py`.", 13 | "category": "Dataset or Environment", 14 | "satisfied": true 15 | }, 16 | { 17 | "requirement_id": 1, 18 | "prerequisites": [ 19 | 0 20 | ], 21 | "criteria": "The dataset is cleaned, including by removing stop words, punctuation, and special characters, all in `src/data_loader.py`.", 22 | "category": "Data preprocessing and postprocessing", 23 | "satisfied": true 24 | }, 25 | { 26 | "requirement_id": 2, 27 | "prerequisites": [ 28 | 0, 29 | 1 30 | ], 31 | "criteria": "Word embeddings, either \"Word2Vec\" or \"GloVe,\" are used to convert text to vectors in `src/data_loader.py`.", 32 | "category": "Machine Learning Method", 33 | "satisfied": false 34 | }, 35 | { 36 | "requirement_id": 3, 37 | "prerequisites": [ 38 | 0, 39 | 1, 40 | 2 41 | ], 42 | "criteria": "An \"SVM classifier\" is implemented and trained in `src/model.py`.", 43 | "category": "Machine Learning Method", 44 | "satisfied": true 45 | }, 46 | { 47 | "requirement_id": 4, 48 | "prerequisites": [ 49 | 1, 50 | 2, 51 | 3 52 | ], 53 | "criteria": "The accuracy score is printed and saved as `results/metrics/accuracy_score.txt`.", 54 | "category": "Performance Metrics", 55 | "satisfied": false 56 | } 57 | ], 58 | "preferences": [ 59 | { 60 | "preference_id": 0, 61 | "criteria": "The dataset download process should be reliable, with clear error handling.", 62 | "satisfied": null 63 | }, 64 | { 65 | "preference_id": 1, 66 | "criteria": "The final accuracy report should be straightforward and easy to interpret.", 67 | "satisfied": null 68 | } 69 | ], 70 | "is_kaggle_api_needed": false, 71 | "is_training_needed": true, 72 | "is_web_navigation_needed": false 73 | } -------------------------------------------------------------------------------- /benchmark/judgment/OpenHands/agent_as_a_judge/gray_box/07_Image_Super_Resolution_SRCNN_Set5_DL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "07_Image_Super_Resolution_SRCNN_Set5_DL", 3 | "query": "Hi, I need to create a project for image super-resolution using the SRCNN model with the Set5 dataset (available from `https://huggingface.co/datasets/eugenesiow/Set5`). Load the dataset in `src/data_loader.py`. When loading the data, include image preprocessing steps such as resizing and normalization, all in `src/data_loader.py`. The SRCNN model should be loaded and used in `src/model.py`. Save 5 sets of comparison images, zooming in on details, as `results/figures/super_resolution_compare.png`, and the super-resolution results as `results/figures/super_resolution_results.png`. 
The generated images should be high-quality and clearly show improvements.", 4 | "tags": [ 5 | "Computer Vision", 6 | "Generative Models" 7 | ], 8 | "requirements": [ 9 | { 10 | "requirement_id": 0, 11 | "prerequisites": [], 12 | "criteria": "The \"Set5\" dataset (available from \"Hugging Face\") is loaded in `src/data_loader.py`.", 13 | "category": "Dataset or Environment", 14 | "satisfied": true 15 | }, 16 | { 17 | "requirement_id": 1, 18 | "prerequisites": [ 19 | 0 20 | ], 21 | "criteria": "Image preprocessing, including resizing and normalization, is performed in `src/data_loader.py`.", 22 | "category": "Data preprocessing and postprocessing", 23 | "satisfied": false 24 | }, 25 | { 26 | "requirement_id": 2, 27 | "prerequisites": [], 28 | "criteria": "The \"SRCNN\" model is used in `src/model.py`.", 29 | "category": "Machine Learning Method", 30 | "satisfied": true 31 | }, 32 | { 33 | "requirement_id": 3, 34 | "prerequisites": [ 35 | 0, 36 | 1, 37 | 2 38 | ], 39 | "criteria": "Five sets of comparison images are saved, with details zoomed in, and saved as `results/figures/super_resolution_compare.png`.", 40 | "category": "Visualization", 41 | "satisfied": false 42 | }, 43 | { 44 | "requirement_id": 4, 45 | "prerequisites": [ 46 | 0, 47 | 1, 48 | 2 49 | ], 50 | "criteria": "Super-resolution results are saved as `results/figures/super_resolution_results.png`.", 51 | "category": "Visualization", 52 | "satisfied": false 53 | } 54 | ], 55 | "preferences": [ 56 | { 57 | "preference_id": 0, 58 | "criteria": "The project should generate high-quality, clear super-resolution images with detailed comparisons.", 59 | "satisfied": null 60 | }, 61 | { 62 | "preference_id": 1, 63 | "criteria": "Well-organized output images, highlighting key improvements, should be included.", 64 | "satisfied": null 65 | } 66 | ], 67 | "is_kaggle_api_needed": false, 68 | "is_training_needed": true, 69 | "is_web_navigation_needed": true 70 | } -------------------------------------------------------------------------------- /benchmark/judgment/OpenHands/agent_as_a_judge/gray_box/08_Robot_Control_PPO_PyBullet_RL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "08_Robot_Control_PPO_PyBullet_RL", 3 | "query": "I am seeking to implement a project which explores robotic arm control via reinforcement learning in the PyBullet simulation environment with the PPO algorithm. The PyBullet simulator should be imported and a related robotics environment should be loaded in `src/env.py`. The PPO algorithm should be implemented in `src/train.py`. The project should meticulously document the robot's final position, printing and saving it as `data/final_position.txt`. The training return trajectory should be graphed and saved as `results/figures/training_returns.png`. A sample of the robot's motion should be visualized and saved as `results/figures/robot_motion.gif`. A detailed environment setup and reward structure description should be provided in `src/env.py`. 
Please ensure that any issues with loading URDF files in PyBullet are clearly handled and documented, providing clear error messages or logging for debugging.", 4 | "tags": [ 5 | "Reinforcement Learning" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The \"PyBullet\" simulator is used in `src/env.py`.", 12 | "category": "Dataset or Environment", 13 | "satisfied": true 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [], 18 | "criteria": "The \"PPO\" algorithm is used in `src/train.py`.", 19 | "category": "Machine Learning Method", 20 | "satisfied": true 21 | }, 22 | { 23 | "requirement_id": 2, 24 | "prerequisites": [ 25 | 0 26 | ], 27 | "criteria": "A detailed environment setup and reward structure description is provided in `src/env.py`.", 28 | "category": "Dataset or Environment", 29 | "satisfied": false 30 | }, 31 | { 32 | "requirement_id": 3, 33 | "prerequisites": [ 34 | 0, 35 | 1, 36 | 2 37 | ], 38 | "criteria": "The robot's final position is printed and saved as `data/final_position.txt`.", 39 | "category": "Other", 40 | "satisfied": false 41 | }, 42 | { 43 | "requirement_id": 4, 44 | "prerequisites": [ 45 | 0, 46 | 1, 47 | 2 48 | ], 49 | "criteria": "The training returns over time curve is recorded and saved as `results/figures/training_returns.png`.", 50 | "category": "Visualization", 51 | "satisfied": false 52 | }, 53 | { 54 | "requirement_id": 5, 55 | "prerequisites": [ 56 | 0, 57 | 1, 58 | 2 59 | ], 60 | "criteria": "A sample of the robot's motion is visualized and saved as `results/figures/robot_motion.gif`.", 61 | "category": "Visualization", 62 | "satisfied": false 63 | } 64 | ], 65 | "preferences": [ 66 | { 67 | "preference_id": 0, 68 | "criteria": "The system should effectively handle potential issues with loading URDF files in PyBullet, providing clear error messages or logging for debugging.", 69 | "satisfied": null 70 | } 71 | ], 72 | "is_kaggle_api_needed": false, 73 | "is_training_needed": true, 74 | "is_web_navigation_needed": false 75 | } -------------------------------------------------------------------------------- /benchmark/judgment/OpenHands/agent_as_a_judge/gray_box/51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images", 3 | "query": "Hi! Please follow the instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) to set up the script mentioned for generating images with hidden text in `src/visualize.py`. Ensure the generated images are of 1080p resolution and saved in `results/figures/`. Create control images embedding the text \"FUTURE\" and save them in `results/figures/`. 
Please also manually verify that the hidden text is indeed embedded in the generated images.", 4 | "tags": [ 5 | "Computer Vision", 6 | "Generative Models", 7 | "Natural Language Processing" 8 | ], 9 | "requirements": [ 10 | { 11 | "requirement_id": 0, 12 | "prerequisites": [], 13 | "criteria": "The instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) are followed to set up the script mentioned for generating images with hidden text in `src/visualize.py`.", 14 | "category": "Dataset or Environment", 15 | "satisfied": true 16 | }, 17 | { 18 | "requirement_id": 1, 19 | "prerequisites": [ 20 | 0 21 | ], 22 | "criteria": "The generated images are ensured to be of 1080p resolution and saved in `results/figures/`.", 23 | "category": "Data preprocessing and postprocessing", 24 | "satisfied": false 25 | }, 26 | { 27 | "requirement_id": 2, 28 | "prerequisites": [ 29 | 0, 30 | 1 31 | ], 32 | "criteria": "Control images embedding the text \"FUTURE\" are created and saved in `results/figures/`.", 33 | "category": "Visualization", 34 | "satisfied": true 35 | } 36 | ], 37 | "preferences": [ 38 | { 39 | "preference_id": 0, 40 | "criteria": "The system should be capable of learning and using unfamiliar technologies, adapting to new tools or platforms as required.", 41 | "satisfied": null 42 | }, 43 | { 44 | "preference_id": 1, 45 | "criteria": "After reviewing the blog post, ControlNet should be successfully run on Modal to produce the images with the concealed messages for the user.", 46 | "satisfied": null 47 | } 48 | ], 49 | "is_kaggle_api_needed": false, 50 | "is_training_needed": false, 51 | "is_web_navigation_needed": true, 52 | "resource": "https://www.cognition.ai/blog/introducing-devin" 53 | } -------------------------------------------------------------------------------- /benchmark/judgment/OpenHands/agent_as_a_judge/gray_box/52_Devin_AI_Trains_an_AI.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "52_Devin_AI_Trains_an_AI", 3 | "query": "Can you finetune a 7B LLaMA model using `https://github.com/artidoro/qlora`? Follow the instructions in the repository to finetune the 7B LLaMA model and save it in models/saved_models/. Ensure the necessary environment and dependencies are set up as outlined in `src/env.py`. Download and prepare the datasets required for finetuning the model as specified in `src/data_loader.py`. Complete the finetuning process, ensuring all configurations are properly set in accordance with qlora. 
Save the finetuned model and training summary, storing them in the specified directory as results/metrics/finetuning_summary.txt.", 4 | "tags": [ 5 | "Generative Models", 6 | "Natural Language Processing" 7 | ], 8 | "requirements": [ 9 | { 10 | "requirement_id": 0, 11 | "prerequisites": [], 12 | "criteria": "The instructions in the repository at `https://github.com/artidoro/qlora` are followed to finetune a \"7B LLaMA\" model and save it in models/saved_models/.", 13 | "category": "Machine Learning Method", 14 | "satisfied": false 15 | }, 16 | { 17 | "requirement_id": 1, 18 | "prerequisites": [ 19 | 0 20 | ], 21 | "criteria": "The necessary environment and dependencies are set up as outlined in `src/env.py`.", 22 | "category": "Dataset or Environment", 23 | "satisfied": false 24 | }, 25 | { 26 | "requirement_id": 2, 27 | "prerequisites": [ 28 | 0, 29 | 1 30 | ], 31 | "criteria": "The datasets required for finetuning the model are downloaded and prepared in `src/data_loader.py`.", 32 | "category": "Dataset or Environment", 33 | "satisfied": false 34 | }, 35 | { 36 | "requirement_id": 3, 37 | "prerequisites": [ 38 | 0, 39 | 1, 40 | 2 41 | ], 42 | "criteria": "The finetuning process is completed, ensuring all configurations are properly set in accordance with \"qlora.\"", 43 | "category": "Machine Learning Method", 44 | "satisfied": false 45 | }, 46 | { 47 | "requirement_id": 4, 48 | "prerequisites": [ 49 | 0, 50 | 1, 51 | 2, 52 | 3 53 | ], 54 | "criteria": "The finetuned model and training summary are saved, storing them in the specified directory as `results/metrics/finetuning_summary.txt`.", 55 | "category": "Save Trained Model", 56 | "satisfied": false 57 | } 58 | ], 59 | "preferences": [ 60 | { 61 | "preference_id": 0, 62 | "criteria": "The finetuning process should include validation steps to monitor overfitting or other issues.", 63 | "satisfied": null 64 | }, 65 | { 66 | "preference_id": 1, 67 | "criteria": "A detailed report on the finetuning process, including any challenges faced and how they were overcome, should be generated and saved as `results/finetuning_summary.txt`.", 68 | "satisfied": null 69 | } 70 | ], 71 | "is_kaggle_api_needed": false, 72 | "is_training_needed": true, 73 | "is_web_navigation_needed": true, 74 | "resource": "https://www.cognition.ai/blog/introducing-devin" 75 | } -------------------------------------------------------------------------------- /benchmark/judgment/OpenHands/agent_as_a_judge/gray_box/53_Devin_Upwork_Side_Hustle.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "53_Devin_Upwork_Side_Hustle", 3 | "query": "Hello, I am looking to make inferences with the models in this repository `https://github.com/mahdi65/roadDamageDetection2020`. The system should perform inferences using the models from the repository and save the results in `models/saved_models/`. Sample data should be downloaded and prepared for testing the models in `src/data_loader.py`. Inference should be performed using the provided models on the sample data in `models/saved_models/`. Visualized images showing the detections made by the models should be generated and saved in the `results/figures/` directory. 
Also, a performance report based on the model's detection results should be generated and saved as `results/metrics/model_performance_report.txt`.", 4 | "tags": [ 5 | "Computer Vision" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The repository at `https://github.com/mahdi65/roadDamageDetection2020` is set up.", 12 | "category": "Machine Learning Method", 13 | "satisfied": true 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [ 18 | 0 19 | ], 20 | "criteria": "Sample data is downloaded and prepared for testing the models in `src/data_loader.py`.", 21 | "category": "Dataset or Environment", 22 | "satisfied": false 23 | }, 24 | { 25 | "requirement_id": 2, 26 | "prerequisites": [ 27 | 0, 28 | 1 29 | ], 30 | "criteria": "Inference is performed using the provided models on the sample data in `models/saved_models/`.", 31 | "category": "Other", 32 | "satisfied": false 33 | }, 34 | { 35 | "requirement_id": 3, 36 | "prerequisites": [ 37 | 0, 38 | 1, 39 | 2 40 | ], 41 | "criteria": "Visualized images showing the detections made by the models are generated and saved in the `results/figures/` directory.", 42 | "category": "Visualization", 43 | "satisfied": false 44 | }, 45 | { 46 | "requirement_id": 4, 47 | "prerequisites": [ 48 | 0, 49 | 1, 50 | 2, 51 | 3 52 | ], 53 | "criteria": "A performance report based on the model's detection results is generated and saved as `results/metrics/model_performance_report.txt`.", 54 | "category": "Performance Metrics", 55 | "satisfied": false 56 | } 57 | ], 58 | "preferences": [ 59 | { 60 | "preference_id": 0, 61 | "criteria": "The visualized images should be clear, with detections accurately highlighted for easy interpretation.", 62 | "satisfied": null 63 | }, 64 | { 65 | "preference_id": 1, 66 | "criteria": "The performance report should include a summary of detection accuracy and any issues encountered during inference.", 67 | "satisfied": null 68 | } 69 | ], 70 | "is_kaggle_api_needed": false, 71 | "is_training_needed": false, 72 | "is_web_navigation_needed": true, 73 | "resource": "https://www.cognition.ai/blog/introducing-devin" 74 | } -------------------------------------------------------------------------------- /benchmark/judgment/OpenHands/human_as_a_judge/05_Game_Simulation_DQN_CartPole_v1_RL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "05_Game_Simulation_DQN_CartPole_v1_RL", 3 | "query": "Create a reliable system to train a DQN agent using PyTorch in an OpenAI Gym environment such as CartPole-v1. Implement DQN using PyTorch in `src/model.py`. The environment should be instantiated in the main file, `src/main.py`. Handle any dimension mismatches between DQN and the environment (like would happen if the Gym environment ID were changed) by logging an error. Also please log the return after each episode to `results/metrics/` and update it after each new episode finishes. Save the final return over episode curve as `results/figures/return_over_time.png` and the final trained model as `models/saved_models/dqn_model.pt`. 
If possible, add annotations to the return over episode curve to showcase key milestones.", 4 | "tags": [ 5 | "Reinforcement Learning" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The \"DQN\" algorithm is implemented using PyTorch and saved in `src/model.py`.", 12 | "category": "Machine Learning Method", 13 | "satisfied": true 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [], 18 | "criteria": "An \"OpenAI Gym\" environment is instantiated in `src/main.py`.", 19 | "category": "Dataset or Environment", 20 | "satisfied": true 21 | }, 22 | { 23 | "requirement_id": 2, 24 | "prerequisites": [ 25 | 0, 26 | 1 27 | ], 28 | "criteria": "The return over time curve is plotted, recording the return of each episode, and saved as `results/figures/reward_over_time.png`.", 29 | "category": "Visualization", 30 | "satisfied": false 31 | }, 32 | { 33 | "requirement_id": 3, 34 | "prerequisites": [ 35 | 0, 36 | 1 37 | ], 38 | "criteria": "The trained model is saved as `models/saved_models/dqn_model.pt`.", 39 | "category": "Save Trained Model", 40 | "satisfied": false 41 | }, 42 | { 43 | "requirement_id": 4, 44 | "prerequisites": [ 45 | 0, 46 | 1 47 | ], 48 | "criteria": "The return for each episode is logged to `results/metrics/` and updated after each episode finishes.", 49 | "category": "Performance Metrics", 50 | "satisfied": true 51 | } 52 | ], 53 | "preferences": [ 54 | { 55 | "preference_id": 0, 56 | "criteria": "The system should handle dimension mismatches, logging the issues for easy debugging.", 57 | "satisfied": null 58 | }, 59 | { 60 | "preference_id": 1, 61 | "criteria": "The return over episode curve has key milestones annotated.", 62 | "satisfied": null 63 | } 64 | ], 65 | "is_kaggle_api_needed": false, 66 | "is_training_needed": true, 67 | "is_web_navigation_needed": false, 68 | "executed_successfully": false, 69 | "satisfied_all_requirements": false, 70 | "satisfied_all_preferences": false 71 | } 72 | -------------------------------------------------------------------------------- /benchmark/judgment/OpenHands/human_as_a_judge/06_Sentiment_Analysis_SVM_Sentiment140_ML.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "06_Sentiment_Analysis_SVM_Sentiment140_ML", 3 | "query": "Please help me build a system for sentiment analysis on tweets using the Sentiment140 dataset available from Hugging Face. Load the Sentiment140 dataset and, when loading the data, clean it by removing stop words, punctuation, and special characters, all in `src/data_loader.py`. Use Word2Vec or GloVe for text vectorization. This should occur in `src/data_loader.py`. Next, implement and train an SVM classifier in `src/model.py`. Finally, write a report of the accuracy of the classifier to `results/metrics/accuracy_score.txt`. 
Ideally, the report should be easily interpretable.", 4 | "tags": [ 5 | "Natural Language Processing", 6 | "Supervised Learning" 7 | ], 8 | "requirements": [ 9 | { 10 | "requirement_id": 0, 11 | "prerequisites": [], 12 | "criteria": "The \"Sentiment140\" dataset, available from \"Hugging Face,\" is obtained in `src/data_loader.py`.", 13 | "category": "Dataset or Environment", 14 | "satisfied": false 15 | }, 16 | { 17 | "requirement_id": 1, 18 | "prerequisites": [ 19 | 0 20 | ], 21 | "criteria": "The dataset is cleaned, including by removing stop words, punctuation, and special characters, all in `src/data_loader.py`.", 22 | "category": "Data preprocessing and postprocessing", 23 | "satisfied": true 24 | }, 25 | { 26 | "requirement_id": 2, 27 | "prerequisites": [ 28 | 0, 29 | 1 30 | ], 31 | "criteria": "Word embeddings, either \"Word2Vec\" or \"GloVe,\" are used to convert text to vectors in `src/data_loader.py`.", 32 | "category": "Machine Learning Method", 33 | "satisfied": false 34 | }, 35 | { 36 | "requirement_id": 3, 37 | "prerequisites": [ 38 | 0, 39 | 1, 40 | 2 41 | ], 42 | "criteria": "An \"SVM classifier\" is implemented and trained in `src/model.py`.", 43 | "category": "Machine Learning Method", 44 | "satisfied": true 45 | }, 46 | { 47 | "requirement_id": 4, 48 | "prerequisites": [ 49 | 1, 50 | 2, 51 | 3 52 | ], 53 | "criteria": "The accuracy score is printed and saved as `results/metrics/accuracy_score.txt`.", 54 | "category": "Performance Metrics", 55 | "satisfied": false 56 | } 57 | ], 58 | "preferences": [ 59 | { 60 | "preference_id": 0, 61 | "criteria": "The dataset download process should be reliable, with clear error handling.", 62 | "satisfied": null 63 | }, 64 | { 65 | "preference_id": 1, 66 | "criteria": "The final accuracy report should be straightforward and easy to interpret.", 67 | "satisfied": null 68 | } 69 | ], 70 | "is_kaggle_api_needed": false, 71 | "is_training_needed": true, 72 | "is_web_navigation_needed": false, 73 | "executed_successfully": true, 74 | "satisfied_all_requirements": false, 75 | "satisfied_all_preferences": false 76 | } 77 | -------------------------------------------------------------------------------- /benchmark/judgment/OpenHands/human_as_a_judge/07_Image_Super_Resolution_SRCNN_Set5_DL.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "07_Image_Super_Resolution_SRCNN_Set5_DL", 3 | "query": "Hi, I need to create a project for image super-resolution using the SRCNN model with the Set5 dataset (available from `https://huggingface.co/datasets/eugenesiow/Set5`). Load the dataset in `src/data_loader.py`. When loading the data, include image preprocessing steps such as resizing and normalization, all in `src/data_loader.py`. The SRCNN model should be loaded and used in `src/model.py`. Save 5 sets of comparison images, zooming in on details, as `results/figures/super_resolution_compare.png`, and the super-resolution results as `results/figures/super_resolution_results.png`. 
The generated images should be high-quality and clearly show improvements.", 4 | "tags": [ 5 | "Computer Vision", 6 | "Generative Models" 7 | ], 8 | "requirements": [ 9 | { 10 | "requirement_id": 0, 11 | "prerequisites": [], 12 | "criteria": "The \"Set5\" dataset (available from \"Hugging Face\") is loaded in `src/data_loader.py`.", 13 | "category": "Dataset or Environment", 14 | "satisfied": true 15 | }, 16 | { 17 | "requirement_id": 1, 18 | "prerequisites": [ 19 | 0 20 | ], 21 | "criteria": "Image preprocessing, including resizing and normalization, is performed in `src/data_loader.py`.", 22 | "category": "Data preprocessing and postprocessing", 23 | "satisfied": false 24 | }, 25 | { 26 | "requirement_id": 2, 27 | "prerequisites": [], 28 | "criteria": "The \"SRCNN\" model is used in `src/model.py`.", 29 | "category": "Machine Learning Method", 30 | "satisfied": true 31 | }, 32 | { 33 | "requirement_id": 3, 34 | "prerequisites": [ 35 | 0, 36 | 1, 37 | 2 38 | ], 39 | "criteria": "Five sets of comparison images are saved, with details zoomed in, and saved as `results/figures/super_resolution_compare.png`.", 40 | "category": "Visualization", 41 | "satisfied": false 42 | }, 43 | { 44 | "requirement_id": 4, 45 | "prerequisites": [ 46 | 0, 47 | 1, 48 | 2 49 | ], 50 | "criteria": "Super-resolution results are saved as `results/figures/super_resolution_results.png`.", 51 | "category": "Visualization", 52 | "satisfied": false 53 | } 54 | ], 55 | "preferences": [ 56 | { 57 | "preference_id": 0, 58 | "criteria": "The project should generate high-quality, clear super-resolution images with detailed comparisons.", 59 | "satisfied": null 60 | }, 61 | { 62 | "preference_id": 1, 63 | "criteria": "Well-organized output images, highlighting key improvements, should be included.", 64 | "satisfied": null 65 | } 66 | ], 67 | "is_kaggle_api_needed": false, 68 | "is_training_needed": true, 69 | "is_web_navigation_needed": true, 70 | "executed_successfully": false, 71 | "satisfied_all_requirements": false, 72 | "satisfied_all_preferences": false 73 | } 74 | -------------------------------------------------------------------------------- /benchmark/judgment/OpenHands/human_as_a_judge/51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "51_Devin_AI_Software_Engineer_Plants_Secret_Messages_in_Images", 3 | "query": "Hi! Please follow the instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) to set up the script mentioned for generating images with hidden text in `src/visualize.py`. Ensure the generated images are of 1080p resolution and saved in `results/figures/`. Create control images embedding the text \"FUTURE\" and save them in `results/figures/`. 
Please also manually verify that the hidden text is indeed embedded in the generated images.", 4 | "tags": [ 5 | "Computer Vision", 6 | "Generative Models", 7 | "Natural Language Processing" 8 | ], 9 | "requirements": [ 10 | { 11 | "requirement_id": 0, 12 | "prerequisites": [], 13 | "criteria": "The instructions from the blog post [Hidden in Plain Sight](https://www.factsmachine.ai/p/hidden-in-plain-sight) are followed to set up the script mentioned for generating images with hidden text in `src/visualize.py`.", 14 | "category": "Dataset or Environment", 15 | "satisfied": false 16 | }, 17 | { 18 | "requirement_id": 1, 19 | "prerequisites": [ 20 | 0 21 | ], 22 | "criteria": "The generated images are ensured to be of 1080p resolution and saved in `results/figures/`.", 23 | "category": "Data preprocessing and postprocessing", 24 | "satisfied": false 25 | }, 26 | { 27 | "requirement_id": 2, 28 | "prerequisites": [ 29 | 0, 30 | 1 31 | ], 32 | "criteria": "Control images embedding the text \"FUTURE\" are created and saved in `results/figures/`.", 33 | "category": "Visualization", 34 | "satisfied": true 35 | } 36 | ], 37 | "preferences": [ 38 | { 39 | "preference_id": 0, 40 | "criteria": "The system should be capable of learning and using unfamiliar technologies, adapting to new tools or platforms as required.", 41 | "satisfied": null 42 | }, 43 | { 44 | "preference_id": 1, 45 | "criteria": "After reviewing the blog post, ControlNet should be successfully run on Modal to produce the images with the concealed messages for the user.", 46 | "satisfied": null 47 | } 48 | ], 49 | "is_kaggle_api_needed": false, 50 | "is_training_needed": false, 51 | "is_web_navigation_needed": true, 52 | "resource": "https://www.cognition.ai/blog/introducing-devin", 53 | "executed_successfully": true, 54 | "satisfied_all_requirements": false, 55 | "satisfied_all_preferences": false 56 | } 57 | -------------------------------------------------------------------------------- /benchmark/judgment/OpenHands/human_as_a_judge/53_Devin_Upwork_Side_Hustle.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "53_Devin_Upwork_Side_Hustle", 3 | "query": "Hello, I am looking to make inferences with the models in this repository `https://github.com/mahdi65/roadDamageDetection2020`. The system should perform inferences using the models from the repository and save the results in `models/saved_models/`. Sample data should be downloaded and prepared for testing the models in `src/data_loader.py`. Inference should be performed using the provided models on the sample data in `models/saved_models/`. Visualized images showing the detections made by the models should be generated and saved in the `results/figures/` directory. 
Also, a performance report based on the model's detection results should be generated and saved as `results/metrics/model_performance_report.txt`.", 4 | "tags": [ 5 | "Computer Vision" 6 | ], 7 | "requirements": [ 8 | { 9 | "requirement_id": 0, 10 | "prerequisites": [], 11 | "criteria": "The repository at `https://github.com/mahdi65/roadDamageDetection2020` is set up.", 12 | "category": "Machine Learning Method", 13 | "satisfied": true 14 | }, 15 | { 16 | "requirement_id": 1, 17 | "prerequisites": [ 18 | 0 19 | ], 20 | "criteria": "Sample data is downloaded and prepared for testing the models in `src/data_loader.py`.", 21 | "category": "Dataset or Environment", 22 | "satisfied": false 23 | }, 24 | { 25 | "requirement_id": 2, 26 | "prerequisites": [ 27 | 0, 28 | 1 29 | ], 30 | "criteria": "Inference is performed using the provided models on the sample data in `models/saved_models/`.", 31 | "category": "Other", 32 | "satisfied": false 33 | }, 34 | { 35 | "requirement_id": 3, 36 | "prerequisites": [ 37 | 0, 38 | 1, 39 | 2 40 | ], 41 | "criteria": "Visualized images showing the detections made by the models are generated and saved in the `results/figures/` directory.", 42 | "category": "Visualization", 43 | "satisfied": false 44 | }, 45 | { 46 | "requirement_id": 4, 47 | "prerequisites": [ 48 | 0, 49 | 1, 50 | 2, 51 | 3 52 | ], 53 | "criteria": "A performance report based on the model's detection results is generated and saved as `results/metrics/model_performance_report.txt`.", 54 | "category": "Performance Metrics", 55 | "satisfied": false 56 | } 57 | ], 58 | "preferences": [ 59 | { 60 | "preference_id": 0, 61 | "criteria": "The visualized images should be clear, with detections accurately highlighted for easy interpretation.", 62 | "satisfied": null 63 | }, 64 | { 65 | "preference_id": 1, 66 | "criteria": "The performance report should include a summary of detection accuracy and any issues encountered during inference.", 67 | "satisfied": null 68 | } 69 | ], 70 | "is_kaggle_api_needed": false, 71 | "is_training_needed": false, 72 | "is_web_navigation_needed": true, 73 | "resource": "https://www.cognition.ai/blog/introducing-devin", 74 | "executed_successfully": false, 75 | "satisfied_all_requirements": false, 76 | "satisfied_all_preferences": false 77 | } 78 | -------------------------------------------------------------------------------- /benchmark/workspaces/OpenHands/39_Drug_Response_Prediction_SVM_GDSC_ML/results/drug_response_prediction_report.md: -------------------------------------------------------------------------------- 1 | # Drug Response Prediction Report 2 | 3 | ## Data Preprocessing 4 | - Loaded the GDSC dataset. 5 | - Performed feature selection using `SelectKBest` with `f_regression` as the scoring function. 6 | - Selected the top 10 features based on their scores. 7 | 8 | ## Model Training 9 | - Implemented a Support Vector Machine (SVM) regressor using `scikit-learn`. 10 | - Created a pipeline with a standard scaler and SVM regressor with a linear kernel. 11 | - Trained the model using the selected features and target values. 12 | 13 | ## Model Evaluation 14 | - Evaluated the model using 5-fold cross-validation. 15 | - Calculated the Root Mean Squared Error (RMSE) for each fold. 16 | - Saved the performance metrics to `results/metrics/performance.txt`. 17 | 18 | ## Results 19 | - Selected features: `feature1`, `feature2`, `feature3`, `feature4`, `feature5`, `feature6`, `feature7`, `feature8`, `feature9`, `feature10`. 
20 | - Cross-validated RMSE scores: `[0.28540323, 0.3461573, 0.34480114, 0.37766893, 0.28471238]`. 21 | - Mean RMSE: `0.327748593896111`. 22 | - Standard deviation of RMSE: `0.03678846341261786`. 23 | 24 | ## Visualization 25 | ![RMSE Scores](/workspace/results/figures/rmse_scores.png) 26 | 27 | The histogram above shows the distribution of the RMSE scores obtained from the cross-validation. 28 | 29 | ## Conclusion 30 | - The feature selection process helped in identifying the key features that impact the drug response prediction. 31 | - The SVM regressor provided a reasonable prediction performance with a mean RMSE of approximately 0.328. 32 | - The visualization highlights the consistency of the model's performance across different folds. 33 | -------------------------------------------------------------------------------- /benchmark/workspaces/OpenHands/39_Drug_Response_Prediction_SVM_GDSC_ML/results/drug_response_prediction_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/benchmark/workspaces/OpenHands/39_Drug_Response_Prediction_SVM_GDSC_ML/results/drug_response_prediction_report.pdf -------------------------------------------------------------------------------- /benchmark/workspaces/OpenHands/39_Drug_Response_Prediction_SVM_GDSC_ML/results/performance.txt: -------------------------------------------------------------------------------- 1 | Selected features: Index(['feature1', 'feature2', 'feature3', 'feature4', 'feature5', 'feature6', 2 | 'feature7', 'feature8', 'feature9', 'feature10'], 3 | dtype='object') 4 | Cross-validated RMSE scores: [0.28540323 0.3461573 0.34480114 0.37766893 0.28471238] 5 | Mean RMSE: 0.327748593896111 6 | Standard deviation of RMSE: 0.03678846341261786 7 | -------------------------------------------------------------------------------- /benchmark/workspaces/OpenHands/39_Drug_Response_Prediction_SVM_GDSC_ML/results/rmse_scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/metauto-ai/agent-as-a-judge/b23eb69e757cba870f5c3e5e46362b54ac7ad192/benchmark/workspaces/OpenHands/39_Drug_Response_Prediction_SVM_GDSC_ML/results/rmse_scores.png -------------------------------------------------------------------------------- /benchmark/workspaces/OpenHands/39_Drug_Response_Prediction_SVM_GDSC_ML/src/data_loader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.feature_selection import SelectKBest, f_regression 3 | 4 | def load_and_select_features(data_path, target_column, k=10): 5 | # Load the dataset 6 | data = pd.read_csv(data_path) 7 | 8 | # Separate features and target 9 | X = data.drop(columns=[target_column]) 10 | y = data[target_column] 11 | 12 | # Perform feature selection 13 | selector = SelectKBest(score_func=f_regression, k=k) 14 | X_selected = selector.fit_transform(X, y) 15 | 16 | # Get selected feature names 17 | selected_features = X.columns[selector.get_support()] 18 | 19 | return X_selected, y, selected_features 20 | 21 | if __name__ == "__main__": 22 | data_path = 'path_to_gdsc_dataset.csv' # Update this path 23 | target_column = 'target' # Update this column name 24 | X_selected, y, selected_features = load_and_select_features(data_path, target_column) 25 | print(f"Selected features: {selected_features}") 26 | 
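# A hypothetical companion helper (an editorial sketch, not a file from the
# original workspace): if the per-feature F-scores computed by SelectKBest are
# also of interest, the fitted selector has to be kept rather than discarded.
# Relies on the imports at the top of this file and assumes the same CSV
# layout that load_and_select_features expects.
def load_and_score_features(data_path, target_column, k=10):
    data = pd.read_csv(data_path)
    X = data.drop(columns=[target_column])
    y = data[target_column]
    # Fit the selector once, then reuse it for both the scores and the transform
    selector = SelectKBest(score_func=f_regression, k=k).fit(X, y)
    scores = dict(zip(X.columns, selector.scores_))  # per-feature F-statistics
    return selector.transform(X), y, scores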
-------------------------------------------------------------------------------- /benchmark/workspaces/OpenHands/39_Drug_Response_Prediction_SVM_GDSC_ML/src/model.py: -------------------------------------------------------------------------------- 1 | from sklearn.svm import SVR 2 | from sklearn.pipeline import make_pipeline 3 | from sklearn.preprocessing import StandardScaler 4 | 5 | def train_svm_regressor(X, y): 6 | # Create a pipeline with standard scaler and SVM regressor 7 | model = make_pipeline(StandardScaler(), SVR(kernel='linear')) 8 | 9 | # Train the model 10 | model.fit(X, y) 11 | 12 | return model 13 | -------------------------------------------------------------------------------- /benchmark/workspaces/OpenHands/39_Drug_Response_Prediction_SVM_GDSC_ML/src/train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import cross_val_score 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | import os 6 | from data_loader import load_and_select_features 7 | from model import train_svm_regressor 8 | 9 | def evaluate_model(data_path, target_column, k=10): 10 | # Load and select features 11 | X, y, selected_features = load_and_select_features(data_path, target_column, k) 12 | 13 | # Train the model 14 | model = train_svm_regressor(X, y) 15 | 16 | # Perform cross-validation 17 | scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error') 18 | rmse_scores = np.sqrt(-scores) 19 | 20 | # Save performance results 21 | os.makedirs('results/metrics', exist_ok=True) 22 | with open('results/metrics/performance.txt', 'w') as f: 23 | f.write(f"Selected features: {selected_features}\n") 24 | f.write(f"Cross-validated RMSE scores: {rmse_scores}\n") 25 | f.write(f"Mean RMSE: {rmse_scores.mean()}\n") 26 | f.write(f"Standard deviation of RMSE: {rmse_scores.std()}\n") 27 | 28 | # Visualize regression results 29 | sns.histplot(rmse_scores, kde=True) 30 | plt.title('Cross-validated RMSE scores') 31 | plt.xlabel('RMSE') 32 | plt.ylabel('Frequency') 33 | os.makedirs('results/figures', exist_ok=True) 34 | plt.savefig('results/figures/rmse_scores.png') 35 | plt.close() 36 | 37 | if __name__ == "__main__": 38 | data_path = 'path_to_gdsc_dataset.csv' # Update this path 39 | target_column = 'target' # Update this column name 40 | evaluate_model(data_path, target_column) 41 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "agent-as-a-judge" 3 | version = "0.1.5" 4 | description = "This project contains the source code for the paper [Agent-as-a-Judge: Evaluating Agents with Agents]." 
5 | authors = ["Mingchen Zhuge "] 6 | license = "MIT License" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.11" 11 | python-dotenv = "^1.0.1" 12 | tiktoken = "^0.9.0" 13 | rich = "^13.9.2" 14 | litellm = "^1.50.0" 15 | tenacity = "^9.0.0" 16 | numpy = "<2.0" 17 | networkx = "^3.3" 18 | spacy = "<3.8.0" 19 | rank-bm25 = "^0.2.2" 20 | sentence-transformers = "^3.1.1" 21 | pandas = "^2.2.3" 22 | docx = "^0.2.4" 23 | markdown = "^3.7" 24 | python-docx = "^1.1.2" 25 | pypdf2 = "^3.0.1" 26 | openpyxl = "^3.1.5" 27 | opencv-python = "^4.10.0.84" 28 | beautifulsoup4 = "^4.12.3" 29 | pylatexenc = "^2.10" 30 | matplotlib = "^3.9.2" 31 | tree-sitter-languages = "1.8.0" 32 | grep-ast = "^0.3.3" 33 | rapidfuzz = "^3.10.0" 34 | tqdm = "^4.66.5" 35 | logging = "^0.4.9.6" 36 | tree-sitter = "0.21.3" 37 | pytest = "^8.3.3" 38 | tf-keras = "^2.17.0" 39 | jinja2 = "^3.1.3" 40 | dotenv = "^0.9.9" 41 | python-pptx = "^1.0.2" 42 | 43 | 44 | [tool.poetry.group.dev.dependencies] 45 | pytest = "^8.3.3" 46 | pytest-cov = "^5.0.0" 47 | 48 | [build-system] 49 | requires = ["poetry-core"] 50 | build-backend = "poetry.core.masonry.api" 51 | -------------------------------------------------------------------------------- /scripts/run_statistics.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | import argparse 4 | import re 5 | 6 | from agent_as_a_judge.module.statistics import DevStatistics 7 | 8 | 9 | logging.basicConfig( 10 | level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" 11 | ) 12 | 13 | 14 | def extract_number_from_filename(filename: str) -> int: 15 | 16 | match = re.search(r"(\d+)", filename) 17 | return int(match.group(1)) if match else float("inf") 18 | 19 | 20 | def main(instance_dir: Path, workspace_dir: Path): 21 | 22 | instance_files = sorted( 23 | list(instance_dir.glob("*.json")), 24 | key=lambda f: extract_number_from_filename(f.stem), 25 | ) 26 | 27 | logging.info(f"Total instances found: {len(instance_files)}") 28 | total_py_files = 0 29 | total_code_lines = 0 30 | total_files = 0 31 | total_non_code_files = 0 32 | 33 | for instance_file in instance_files: 34 | instance_name = instance_file.stem 35 | workspace = workspace_dir / instance_name 36 | 37 | dev_statistics = DevStatistics(workspace) 38 | ( 39 | total_files_in_workspace, 40 | total_non_code_files_in_workspace, 41 | py_files_in_workspace, 42 | lines_in_workspace, 43 | ) = dev_statistics.calculate_statistics() 44 | total_py_files += py_files_in_workspace 45 | total_code_lines += lines_in_workspace 46 | total_files += total_files_in_workspace 47 | total_non_code_files += total_non_code_files_in_workspace 48 | 49 | logging.info("\nTotal summary across all workspaces:") 50 | logging.info(f"Total files: {total_files}") 51 | logging.info(f"Total non-Python files: {total_non_code_files}") 52 | logging.info(f"Total Python files: {total_py_files}") 53 | logging.info(f"Total lines of Python code: {total_code_lines}") 54 | logging.info( 55 | f"Avg. lines of Python code per workspace: {total_code_lines / len(instance_files):.2f}" 56 | ) 57 | logging.info( 58 | f"Avg. python files per workspace: {total_py_files / len(instance_files):.2f}" 59 | ) 60 | logging.info( 61 | f"Avg. total files per workspace: {total_files / len(instance_files):.2f}" 62 | ) 63 | 64 | 65 | def parse_arguments(): 66 | 67 | parser = argparse.ArgumentParser( 68 | description="Run statistics collection for workspaces." 
69 | ) 70 | parser.add_argument( 71 | "--benchmark_dir", 72 | type=str, 73 | required=True, 74 | help="Base directory for the DevAI benchmark", 75 | ) 76 | parser.add_argument( 77 | "--developer_agent", type=str, required=True, help="Name of the developer agent" 78 | ) 79 | 80 | return parser.parse_args() 81 | 82 | 83 | if __name__ == "__main__": 84 | args = parse_arguments() 85 | benchmark_dir = Path(args.benchmark_dir) 86 | developer_agent = args.developer_agent 87 | instance_dir = benchmark_dir / "devai/instances" 88 | workspace_dir = benchmark_dir / f"workspaces/{developer_agent}" 89 | main(instance_dir, workspace_dir) 90 | --------------------------------------------------------------------------------
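For reference, a minimal usage sketch for `scripts/run_statistics.py` against the layout shown above; the agent name `OpenHands` is an assumption and must match a directory under `benchmark/workspaces/`:

# Equivalent CLI (hypothetical invocation from the repository root):
#   python scripts/run_statistics.py --benchmark_dir benchmark --developer_agent OpenHands
from pathlib import Path

from run_statistics import main  # assumes scripts/ is on sys.path

benchmark_dir = Path("benchmark")
developer_agent = "OpenHands"  # any agent with a workspace directory
main(
    instance_dir=benchmark_dir / "devai/instances",
    workspace_dir=benchmark_dir / "workspaces" / developer_agent,
)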