├── Fig_AUC_over_time.py ├── LICENSE ├── README.md ├── create_fig3.py ├── create_fig4.py ├── create_figs_withTree.py ├── create_full_data_ML_pkl.py ├── create_full_data_gpt_pkl.py ├── data ├── all_evaluation_data.pkl ├── combined_ELO_results_35.txt ├── combined_ELO_results_4o.txt ├── combined_ELO_results_4omini.txt ├── elo_data_gpt35.pkl ├── elo_data_gpt4o.pkl ├── full_concepts.txt ├── full_data_DT_fixed_params.pkl ├── full_data_ML.pkl ├── full_data_gpt35.pkl ├── full_data_gpt4o.pkl └── full_data_gpt4omini.pkl ├── figures ├── Fig3.png ├── Fig4.png ├── Fig4_with_tree.png ├── auc_over_time_final.png ├── scimuse.jpeg └── scimuse_benchmark_5k.png └── hyperparameters ├── Fig_DecisionTree_hyperparameters ├── all_results_10.txt ├── all_results_15.txt ├── all_results_20.txt ├── all_results_25.txt ├── all_results_30.txt ├── all_results_35.txt ├── all_results_40.txt ├── all_results_45.txt ├── all_results_50.txt ├── mean_auc_heatmaps_highres.png └── plot_results.py └── Fig_NN_hyperparameters ├── all_results_15_0.003.txt ├── all_results_25_0.003.txt ├── all_results_35_0.003.txt ├── all_results_45_0.003.txt ├── all_results_5_0.003.txt ├── mean_auc_heatmaps_nn_with_lr_dropout_single_colorbar.png └── plot_results.py /Fig_AUC_over_time.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from sklearn.metrics import roc_auc_score 6 | 7 | def read_elo_results(file_path): 8 | elo_results = [] 9 | with open(file_path, 'r') as file: 10 | lines = file.readlines() 11 | for line in lines: 12 | if not len(line)==1: 13 | id1, id2, winner = line.strip().split(',') 14 | elo_results.append((int(id1), int(id2), int(winner))) 15 | return elo_results 16 | 17 | def update_elo(elo_scores, id1, id2, winner, K=32): 18 | # Compute expected scores 19 | R1 = 10**(elo_scores[id1] / 400) 20 | R2 = 10**(elo_scores[id2] / 400) 21 | E1 = R1 / (R1 + R2) 22 | E2 = R2 / (R1 + R2) 23 | 24 | # Update scores 25 | if winner == 1: 26 | S1, S2 = 1, 0 27 | else: 28 | S1, S2 = 0, 1 29 | 30 | elo_scores[id1] = elo_scores[id1] + K * (S1 - E1) 31 | elo_scores[id2] = elo_scores[id2] + K * (S2 - E2) 32 | 33 | return elo_scores 34 | 35 | if __name__ == "__main__": 36 | 37 | data_dir="data" 38 | os.makedirs(data_dir, exist_ok=True) 39 | file_path = os.path.join(data_dir, 'all_evaluation_data.pkl') 40 | 41 | if os.path.exists(file_path): 42 | with open(file_path, 'rb') as file: 43 | all_data = pickle.load(file) 44 | print("'all_data' has been loaded from the pickle file.") 45 | else: 46 | print(f"{file_path} doesnt exist.") 47 | exit() 48 | 49 | num_of_samples = len(all_data['interest']) 50 | all_auc_labels=['GPT4o mini', 'GPT4o', 'GPT 3.5'] 51 | result_files = ['combined_ELO_results_4omini.txt', 'combined_ELO_results_4o.txt', 'combined_ELO_results_35.txt'] 52 | 53 | all_auc_evolutions=[] 54 | for result_file in result_files: 55 | elo_scores = [1400] * num_of_samples 56 | match_counts = [0] * num_of_samples 57 | elo_results = read_elo_results(os.path.join(data_dir, result_file)) 58 | #random.shuffle(elo_results) 59 | 60 | # Prepare interest data and other relevant variables 61 | interest_data = np.array(all_data['interest']) 62 | 63 | # Initialize list to store AUC values 64 | auc_values = [] 65 | 66 | # Update ELO scores based on results and compute AUC after each update 67 | for idx, (id1, id2, winner) in enumerate(elo_results): 68 | elo_scores = update_elo(elo_scores, id1, id2, winner) 69 | match_counts[id1] += 1 70 | 
match_counts[id2] += 1 71 | 72 | # Compute AUC after every 10th iteration 73 | if (idx + 1) % 1 == 0 or idx == len(elo_results) - 1: 74 | # Compute AUC 75 | ranked_indices = np.argsort(elo_scores)[::-1] 76 | interest_binary = [1 if interest_data[i] >= 4 else 0 for i in ranked_indices] 77 | auc = roc_auc_score(interest_binary, np.sort(elo_scores)[::-1]) 78 | auc_values.append(auc) 79 | print(f'{idx + 1}/{len(elo_results)}: {auc}') 80 | 81 | all_auc_evolutions.append(auc_values) 82 | 83 | # Plot AUC values over the course of the tournament 84 | plt.figure() 85 | plt.plot(all_auc_evolutions[0], label=f'AUC over time ({all_auc_labels[0]})') 86 | plt.plot(all_auc_evolutions[1], label=f'AUC over time ({all_auc_labels[1]})') 87 | plt.plot(all_auc_evolutions[2], label=f'AUC over time ({all_auc_labels[2]})') 88 | plt.xlabel('Match Number') 89 | plt.ylabel('AUC') 90 | plt.title('AUC over the course of the ELO tournament') 91 | plt.legend(loc="lower right") 92 | plt.grid(True) 93 | 94 | save_dir = 'figures' 95 | os.makedirs(save_dir, exist_ok=True) 96 | 97 | auc_plot_file = os.path.join(save_dir, "auc_over_time_final.png") 98 | plt.savefig(auc_plot_file, dpi=300, format='png') 99 | plt.show() 100 | plt.close() 101 | 102 | print(f"AUC over time plot saved to {auc_plot_file}") 103 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Artificial Scientist Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SciMuse 2 | 3 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 4 | [![arXiv](https://img.shields.io/badge/arXiv-2405.17044-b31b1b.svg)](https://arxiv.org/abs/2405.17044) 5 | 6 | ### How interesting are AI-generated research ideas to experienced human researchers, and how can we improve their quality? 
 7 | 8 | 9 | 📖 Read our paper here: \ 10 | [**Interesting Scientific Idea Generation Using Knowledge Graphs and LLMs: Evaluations with 100 Research Group Leaders**](https://arxiv.org/abs/2405.17044)\ 11 | *[Xuemei Gu](mailto:xuemei.gu@mpl.mpg.de), [Mario Krenn](mailto:mario.krenn@mpl.mpg.de)* 12 | 13 | workflow 14 | 15 | > [!NOTE]\ 16 | > The full dynamic knowledge graph can be downloaded at [10.5281/zenodo.13900962](https://doi.org/10.5281/zenodo.13900962) 17 | 18 | ## The SciMuse benchmark 19 | 20 | The SciMuse benchmark tests how well a model can predict expert human researchers' ranking of the scientific interest of personalized research ideas. The higher the model's quality, the better it can predict which ideas experts consider interesting. Ultimately, models with high scores could be used to rank millions of ideas and select a few exceptionally exciting interdisciplinary ones that could vastly accelerate scientific progress; that is the dream. 21 | 22 | In the paper, nearly 3,000 personalized scientific ideas were ranked by more than 100 highly experienced research group leaders (from biology, chemistry, physics, computer science, math, and the humanities). The goal of the SciMuse benchmark is to rank these 3,000 ideas from most interesting to least interesting. For evaluation, we use the AUC of a binary classification task that separates the ideas into high-interest and low-interest categories. 23 | 24 | To achieve this, we establish an ELO ranking for each idea by simulating many matchups between randomly chosen pairs of ideas. In each matchup, the LLM is given two ideas along with five papers from the corresponding researchers, A and B. The LLM then estimates whether researcher A ranked their idea higher than researcher B ranked theirs. The final ELO ranking is used, together with the ground truth, to compute the AUC, and the final result is averaged over 100 random shufflings of the matchup order. (A minimal sketch of this evaluation loop is shown below, after the repository layout.) 25 | 26 | ### Results as of 01.02.2025 27 | 28 | | Name of Model | AUC @ 5000 | 29 | |---------------------|------------| 30 | | Gemini 2 Flash Thinking | 0.6618 | 31 | | GPT o3-mini | 0.6600 | 32 | | GPT o1 | 0.6573 | 33 | | Claude 3.5 Sonnet | 0.6454 | 34 | | DeepSeek R1 | 0.6408 | 35 | | GPT 4o | 0.6303 | 36 | | Grok 2 | 0.6163 | 37 | | GPT 3.5 | 0.5686 | 38 | workflow 39 | 40 | For privacy reasons, both the research questions and the expert rankings are kept private; therefore, this benchmark cannot be part of any model's training data. If you want to help test other models on the benchmark, please write to us ([Xuemei Gu](mailto:xuemei.gu@mpl.mpg.de), [Mario Krenn](mailto:mario.krenn@mpl.mpg.de)). We will need API access to your model for 5000 calls or (ideally) more. 41 | 42 | The curves clearly have not converged yet, meaning the final AUC for infinitely many matchups (and thus the ultimate AUC of the model) is higher than the value at 5000 matchups. However, because the evaluations are costly, we did not run some of the models for more matchups (the GPT o1 evaluation alone costs roughly $300). In any case, the AUC at 5000 matchups is a lower bound on the final AUC and clearly distinguishes the quality of the different models. 43 | 44 | 45 | ## Concept Extraction 46 | 1. Initial Concept Extraction: 47 | We analyzed the titles and abstracts of approximately 2.44 million papers from four preprint datasets using the RAKE algorithm, enhanced with additional stopwords, to extract potential concept candidates. 48 | - Initial filtering retained two-word concepts appearing in at least nine articles. 
49 | - Concepts with more than three words were retained if they appeared in six or more articles. 50 | 51 | 2. Quality Improvement: To enhance the quality of identified concepts, we implemented a suite of automated tools to address domain-independent errors commonly associated with RAKE. We then manually reviewed and removed inaccuracies such as non-conceptual phrases, verbs, and conjunctions. For further details, refer to the [Impact4Cast Paper](https://arxiv.org/abs/2402.08640) and our [GitHub code for concept extraction](https://github.com/artificial-scientist-lab/Impact4Cast/tree/main/create_concepts). 52 | 53 | 3. Further Refinement with GPT: 54 | We used GPT-3.5 to refine the concepts further, which removed 286,311 entries. Using Wikipedia, we restored 40,614 mistakenly removed entries, yielding a final, refined list of 123,128 concepts. For details on prompt engineering, refer to the appendix of the [SciMuse paper](https://arxiv.org/abs/2405.17044). 55 | 56 | The code for generating and refining concepts is available at: 57 | [GitHub - Impact4Cast Concept Extraction](https://github.com/artificial-scientist-lab/Impact4Cast/tree/main/create_concepts). 58 | 59 | 60 | ## Files in this repository for reproducing results 61 | To reproduce the results, download the repository. The file contents are explained in detail below. The code requires [PyTorch](https://github.com/pytorch/pytorch) and [scikit-learn](https://github.com/scikit-learn/scikit-learn). 62 | 63 | **Figure 3** can be reproduced in the following way: 64 | 1. run the file `create_fig3.py` (creates `Fig3.png`) 65 | 66 | **Figure 4** can be reproduced in the following way: 67 | 1. run `create_full_data_ML_pkl.py` to produce `full_data_ML.pkl` (takes less than 15 minutes on a CPU) 68 | 2. run `create_full_data_gpt_pkl.py` to produce `full_data_gpt35.pkl` and `full_data_gpt4o.pkl` (takes less than 15 minutes on a CPU) 69 | 3. run `create_fig4.py` to create the final figure (creates `Fig4.png`) 70 | 71 |
 72 | .
 73 | ├── data                                      # Directory containing datasets
 74 | │   ├── full_concepts.txt                     # Full concept list
 75 | │   ├── all_evaluation_data.pkl               # Human evaluation dataset
 76 | │   ├── full_data_ML.pkl                      # Dataset for supervised neural networks (from create_full_data_ML_pkl.py)
 77 | │   ├── full_data_gpt35.pkl                   # Dataset for GPT-3.5 (from create_full_data_gpt_pkl.py)
 78 | │   ├── full_data_gpt4o.pkl                   # Dataset for GPT-4o (from create_full_data_gpt_pkl.py)
 79 | │   ├── full_data_gpt4omini.pkl               # Dataset for GPT-4o mini
 80 | │   ├── full_data_DT_fixed_params.pkl         # Dataset for the decision tree
 81 | │   ├── elo_data_gpt35.pkl                    # ELO ranking data for GPT-3.5 (from create_full_data_gpt_pkl.py)
 82 | │   ├── elo_data_gpt4o.pkl                    # ELO ranking data for GPT-4o (from create_full_data_gpt_pkl.py)
 83 | │   ├── combined_ELO_results_35.txt           # ELO results for GPT-3.5
 84 | │   ├── combined_ELO_results_4omini.txt       # ELO results for GPT-4o mini
 85 | │   └── combined_ELO_results_4o.txt           # ELO results for GPT-4o
 86 | │
 87 | ├── figures                                   # Directory for storing generated figures
 88 | │
 89 | ├── create_fig3.py                            # Analysis of interest levels vs. knowledge graph features (for Fig. 3)
 90 | ├── create_full_data_ML_pkl.py                # Code for generating supervised ML dataset (full_data_ML.pkl)
 91 | ├── create_full_data_gpt_pkl.py               # Code for generating GPT datasets (full_data_gpt35.pkl, full_data_gpt4o.pkl, etc.)
 92 | ├── create_fig4.py                            # Predicting scientific interest and generating Fig. 4
 93 | ├── create_figs_withTree.py                   # Predicting scientific interest and generating Fig. 4 with the decision tree (for the SI)
 94 | │
 95 | └── Fig_AUC_over_time.py                      # Zero-shot ranking of research suggestions by LLMs (for Fig. 6)
 96 | 
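For reference, here is a minimal sketch of the Elo-plus-AUC evaluation loop described in the benchmark section above. It mirrors the logic of `Fig_AUC_over_time.py` (Elo start value 1400, K = 32, interest scores of 4 or higher count as high interest); the function and variable names are illustrative and not part of the repository code.

```python
import numpy as np
from sklearn.metrics import roc_auc_score

def update_elo(elo, id1, id2, winner, K=32):
    # Expected score of idea id1 under the standard Elo model
    e1 = 1.0 / (1.0 + 10 ** ((elo[id2] - elo[id1]) / 400))
    s1 = 1.0 if winner == 1 else 0.0  # 1 if idea id1 won the matchup
    elo[id1] += K * (s1 - e1)
    elo[id2] += K * ((1.0 - s1) - (1.0 - e1))
    return elo

def auc_from_matchups(matchups, interest, n_ideas):
    """matchups: (id1, id2, winner) triples judged by the LLM;
    interest: expert interest score (1-5) for every idea."""
    elo = [1400.0] * n_ideas
    for id1, id2, winner in matchups:
        elo = update_elo(elo, id1, id2, winner)
    # Binarize the ground truth into high- vs. low-interest ideas
    labels = [1 if score >= 4 else 0 for score in interest]
    return roc_auc_score(labels, np.array(elo))
```

In the paper, this AUC is additionally averaged over 100 random shufflings of the matchup order, and the table above reports the value reached after 5000 matchups.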
97 | 98 | 99 | ## How to cite 100 | 101 | ``` 102 | @article{gu2024generation, 103 | title={Interesting Scientific Idea Generation using Knowledge Graphs and LLMs: Evaluations with 100 Research Group Leaders}, 104 | author={Gu, Xuemei and Krenn, Mario}, 105 | journal={arXiv:2405.17044}, 106 | year={2024} 107 | } 108 | ``` 109 | -------------------------------------------------------------------------------- /create_fig3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import csv 4 | import matplotlib.pyplot as plt 5 | import pickle 6 | 7 | 8 | def select_top_n_percent(all_data, N): 9 | # Calculate the number of entries to select 10 | num_entries = len(all_data['features']) 11 | top_n_count = int(num_entries * (N / 100)) 12 | 13 | # Step 1: Sort the entries by 'impact' feature 14 | # Create a list of tuples where each tuple is (index, impact_value) 15 | impact_values = [(i, features[141]) for i, features in enumerate(all_data['features'])] 16 | 17 | # Sort this list by the impact_value in descending order 18 | sorted_by_impact = sorted(impact_values, key=lambda x: x[1], reverse=True) 19 | 20 | # Step 2: Select the top N% of these entries 21 | top_n_indices = [index for index, _ in sorted_by_impact[:top_n_count]] 22 | 23 | # Step 3: Create a new dictionary with these selected entries 24 | all_data_top_n = { 25 | 'clarity': [all_data['clarity'][i] for i in top_n_indices], 26 | 'interest': [all_data['interest'][i] for i in top_n_indices], 27 | 'features': [all_data['features'][i] for i in top_n_indices] 28 | } 29 | 30 | return all_data_top_n 31 | 32 | 33 | # Node Features: 0-19 (20 elements) 34 | # Node Citation: 20-77 (58 elements) 35 | # Edge Features: 78-98 (21 elements) 36 | # Edge Citation: 99-140 (42 elements) 37 | # Subnet Ov.: 141-142 (2 elements) 38 | 39 | if __name__ == "__main__": 40 | authortype = ['nat', 'soc'] 41 | institutetype = ['nat', 'same', 'soc'] 42 | suggestiontype = ['random', 'semnet'] 43 | 44 | all_features = [0, 14, 20, 26, 75, 87, 137, 143] 45 | inset_range = [[-1, 1], [-1, 1], [-1, 1], [-1, 1], [-1, 1], [-1, 1], [-1, 1], [-1, 1]] 46 | all_titles = ['Degree of node A\n', 'PageRank of node A\n', 'Citation for node A\n', 'Total Citation for node A\n', 47 | "Rank of 1-year citation increase\n for node B", 'Simpson similarity coefficient\nfor pair (A,B)', 48 | 'Total papers on concept A or B\n up to two years ago, minimum count', 49 | "Semantic distance\n"] 50 | 51 | color_map = {100: 'blue', 50: 'green', 25: 'red'} # Different colors for each percentage 52 | 53 | data_dir="data" 54 | file_path = os.path.join(data_dir, 'all_evaluation_data.pkl') 55 | if os.path.exists(file_path): 56 | with open(file_path, 'rb') as file: 57 | all_data = pickle.load(file) 58 | print("'all_data' has been loaded from the pickle file.") 59 | else: 60 | print(f"{file_path} doesnt exist.") 61 | exit() 62 | 63 | # Set up the subplots 64 | fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(30, 15)) # Adjust the figure size as needed 65 | axes = axes.flatten() # Flatten the array of axes to make indexing easier 66 | subplot_labels = ['(a)', '(b)', '(c)', '(d)', '(e)', '(f)', '(g)', '(h)'] 67 | 68 | for i, (curr_feature, curr_range, curr_title) in enumerate(zip(all_features, inset_range, all_titles)): 69 | all_feature_vals_list = [ff[curr_feature] for ff in all_data['features']] 70 | all_feature_vals = np.array(all_feature_vals_list) 71 | mean_val = np.mean(all_feature_vals) 72 | std_val = np.std(all_feature_vals) 73 | 74 
| for percentage in [25, 50, 100]: 75 | all_data_top_n = select_top_n_percent(all_data, percentage) 76 | interest_data = np.array(all_data_top_n['interest']) 77 | all_feature_vals_list = [ff[curr_feature] for ff in all_data_top_n['features']] 78 | all_feature_vals = np.array(all_feature_vals_list) 79 | 80 | # Normalize the values 81 | all_feature_vals_z = (all_feature_vals - mean_val) / std_val 82 | 83 | # Sort by all_feature_vals_z while keeping the correspondence with interest_data 84 | indices = np.argsort(all_feature_vals_z) 85 | sorted_feature_vals = all_feature_vals_z[indices] 86 | sorted_interest_data = interest_data[indices] 87 | 88 | num_parts = 50 89 | avg_interest_data, std_interest_data, avg_feature_vals = [], [], [] 90 | for j in range(num_parts): 91 | index_start = j * len(sorted_feature_vals) // num_parts 92 | index_end = (j + 1) * len(sorted_feature_vals) // num_parts if j != num_parts - 1 else len(sorted_feature_vals) 93 | part_interest_data = sorted_interest_data[index_start:index_end] 94 | part_feature_vals = sorted_feature_vals[index_start:index_end] 95 | avg_interest_data.append(np.mean(part_interest_data)) 96 | std_interest_data.append(np.std(part_interest_data, ddof=1) / np.sqrt(len(part_interest_data))) 97 | avg_feature_vals.append(np.mean(part_feature_vals)) 98 | 99 | if percentage == 100: 100 | curr_label = 'All answers' 101 | else: 102 | curr_label = f'Top {percentage}% impact' 103 | axes[i].errorbar(avg_feature_vals, avg_interest_data, yerr=std_interest_data, fmt='o', capsize=5, color=color_map[percentage], label=curr_label, alpha=0.8) 104 | 105 | # Linear fit 106 | slope, intercept = np.polyfit(avg_feature_vals, avg_interest_data, 1) 107 | fit_line_linear = slope * np.array(avg_feature_vals) + intercept 108 | axes[i].plot(avg_feature_vals, fit_line_linear, 'grey', linestyle='--', linewidth=2, label='Linear Fit (all answers)') 109 | 110 | axes[i].set_title(curr_title, fontsize=24) 111 | axes[i].grid(True) 112 | if i==3: 113 | axes[i].legend(fontsize=22) 114 | 115 | axes[i].tick_params(axis='x', labelsize=23) 116 | axes[i].set_ylim(1, 5) # Adjust as necessary 117 | 118 | y_ticks = np.linspace(1, 5, 5) 119 | axes[i].set_yticks(y_ticks) 120 | axes[i].set_yticklabels(['{:.1f}'.format(y) for y in y_ticks], fontsize=23) 121 | axes[i].text(-0.11, 1.13, subplot_labels[i], transform=axes[i].transAxes, fontsize=28, fontweight='bold', va='top', ha='left') 122 | 123 | for spine in axes[i].spines.values(): 124 | spine.set_linewidth(1.6) # Adjust the thickness here 125 | 126 | # Set common labels more external to the plot area 127 | fig.text(0.5, 0.02, 'Normalized Feature Values', ha='center', va='center', fontsize=26) # Common x-label, more below 128 | fig.text(0.02, 0.5, 'Average Interest', ha='center', va='center', rotation='vertical', fontsize=26) # Common y-label, more left 129 | 130 | #plt.tight_layout(rect=[0.03, 0.03, 0.97, 0.97]) # Adjust layout to not clip content 131 | fig_dir='figures' 132 | os.makedirs(fig_dir, exist_ok=True) 133 | plt.tight_layout(rect=[0.03, 0.03, 1, 1]) 134 | fig.subplots_adjust(hspace=0.25, wspace=0.16) 135 | plt.savefig(os.path.join(fig_dir, 'Fig3.png'), format='png', dpi=300) 136 | plt.show() 137 | plt.close(fig) 138 | -------------------------------------------------------------------------------- /create_fig4.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from sklearn.metrics import auc 6 | 7 | 8 | if 
__name__ == "__main__": 9 | 10 | data_dir="data" 11 | os.makedirs(data_dir, exist_ok=True) 12 | 13 | with open(os.path.join(data_dir, 'full_data_gpt35.pkl'), 'rb') as file: 14 | data_loaded = pickle.load(file) 15 | 16 | ranked_indices_gpt35 = data_loaded['ranked_indices'] 17 | interest_binary_gpt35 = data_loaded['interest_binary'] 18 | auc_values_gpt35 = data_loaded['auc_values'] 19 | fpr_gpt35 = data_loaded['fpr'] 20 | tpr_gpt35 = data_loaded['tpr'] 21 | 22 | with open(os.path.join(data_dir, 'full_data_gpt4o.pkl'), 'rb') as file: 23 | data_loaded = pickle.load(file) 24 | 25 | ranked_indices_gpt4o = data_loaded['ranked_indices'] 26 | interest_binary_gpt4o = data_loaded['interest_binary'] 27 | auc_values_gpt4o = data_loaded['auc_values'] 28 | fpr_gpt4o = data_loaded['fpr'] 29 | tpr_gpt4o = data_loaded['tpr'] 30 | 31 | 32 | with open(os.path.join(data_dir, 'full_data_ML.pkl'), 'rb') as file: 33 | data_loaded = pickle.load(file) 34 | 35 | topNprecision_avg_ML = data_loaded['topNprecision_avg'] 36 | highInterestProb_ML = data_loaded['highInterestProb_ML'] 37 | highInterestProb_rnd = data_loaded['highInterestProb_rnd'] 38 | fpr_ML = data_loaded['fpr'] 39 | tpr_ML = data_loaded['tpr'] 40 | auc_value_ML = auc(fpr_ML, tpr_ML) 41 | 42 | 43 | topN_precision_gpt35 = [sum(interest_binary_gpt35[:i+1]) / (i+1) for i in range(len(interest_binary_gpt35))] 44 | max_precision_gpt35 = [max(interest_binary_gpt35[:i+1]) for i in range(len(interest_binary_gpt35))] 45 | 46 | topN_precision_gpt4o = [sum(interest_binary_gpt4o[:i+1]) / (i+1) for i in range(len(interest_binary_gpt4o))] 47 | max_precision_gpt4o = [max(interest_binary_gpt4o[:i+1]) for i in range(len(interest_binary_gpt4o))] 48 | 49 | # Parameters 50 | n_subsystem = 300 # Size of the subsystems 51 | total_size = len(interest_binary_gpt35) # Assuming all have the same size 52 | iterations = 1000 # Number of iterations for averaging 53 | 54 | # Initialize arrays to hold the cumulative precision and max precision 55 | cumulative_topN_precision_gpt35 = np.zeros(n_subsystem) 56 | cumulative_max_precision_gpt35 = np.zeros(n_subsystem) 57 | 58 | cumulative_topN_precision_gpt4o = np.zeros(n_subsystem) 59 | cumulative_max_precision_gpt4o = np.zeros(n_subsystem) 60 | 61 | 62 | # Loop over the specified number of iterations 63 | for _ in range(iterations): 64 | # Select random indices for the subsystems without changing the order 65 | random_indices = np.sort(np.random.choice(total_size, n_subsystem, replace=False)) 66 | 67 | # Extract subsystems while maintaining the order 68 | interest_binary_gpt35_sub = [interest_binary_gpt35[i] for i in random_indices] 69 | interest_binary_gpt4o_sub = [interest_binary_gpt4o[i] for i in random_indices] 70 | 71 | # Compute precision for the subsystems 72 | topN_precision_gpt35_sub = [sum(interest_binary_gpt35_sub[:i+1]) / (i+1) for i in range(len(interest_binary_gpt35_sub))] 73 | max_precision_gpt35_sub = [max(interest_binary_gpt35_sub[:i+1]) for i in range(len(interest_binary_gpt35_sub))] 74 | 75 | topN_precision_gpt4o_sub = [sum(interest_binary_gpt4o_sub[:i+1]) / (i+1) for i in range(len(interest_binary_gpt4o_sub))] 76 | max_precision_gpt4o_sub = [max(interest_binary_gpt4o_sub[:i+1]) for i in range(len(interest_binary_gpt4o_sub))] 77 | 78 | # Accumulate the results 79 | cumulative_topN_precision_gpt35 += np.array(topN_precision_gpt35_sub) 80 | cumulative_max_precision_gpt35 += np.array(max_precision_gpt35_sub) 81 | 82 | cumulative_topN_precision_gpt4o += np.array(topN_precision_gpt4o_sub) 83 | cumulative_max_precision_gpt4o 
+= np.array(max_precision_gpt4o_sub) 84 | 85 | 86 | # Compute the averages 87 | average_topN_precision_gpt35 = cumulative_topN_precision_gpt35 / iterations 88 | average_max_precision_gpt35 = cumulative_max_precision_gpt35 / iterations 89 | 90 | average_topN_precision_gpt4o = cumulative_topN_precision_gpt4o / iterations 91 | average_max_precision_gpt4o = cumulative_max_precision_gpt4o / iterations 92 | 93 | overall_precision_gpt4o = sum(interest_binary_gpt4o) / len(interest_binary_gpt4o) 94 | 95 | # Create a vector with the same length as interest_binary_gpt4o, filled with the overall precision value 96 | topNprecision_avg_rnd = [overall_precision_gpt4o] * len(interest_binary_gpt4o) 97 | 98 | 99 | # Create a figure with three subplots 100 | fig = plt.figure(figsize=(18, 6)) # Adjusted for three subplots 101 | 102 | label_gpt35='GPT 3.5\n [text, 0-shot]' 103 | label_gpt4o='GPT 4o\n[text, 0-shot]' 104 | label_nn='Neural Net\n[graph, superv.]' 105 | label_rnd='random' 106 | 107 | # Subplot 1: ROC Curve 108 | ax1 = fig.add_subplot(1, 3, 1) 109 | ax1.plot(fpr_gpt35, tpr_gpt35, lw=3, label=f'{label_gpt35}\n(AUC={auc_values_gpt35[-1]:.3f})') 110 | ax1.plot(fpr_gpt4o, tpr_gpt4o, lw=3, label=f'{label_gpt4o}\n(AUC={auc_values_gpt4o[-1]:.3f})') 111 | ax1.plot(fpr_ML, tpr_ML, lw=3, label=f'{label_nn}\n(AUC={auc_value_ML:.3f})') 112 | ax1.plot([0, 1], [0, 1], color='grey', lw=3, linestyle='--',label=f'{label_rnd}\n(AUC={0.500:.3f})') 113 | ax1.set_xlim([0.0, 1.0]) 114 | ax1.set_ylim([0.0, 1.0]) 115 | ax1.set_xlabel('False Positive Rate', fontsize=14) # Consistent font size 116 | ax1.set_ylabel('True Positive Rate', fontsize=14) # Consistent font size 117 | ax1.set_title('Average ROC Curve', fontsize=16) # Consistent title font size 118 | ax1.legend(loc="lower right", fontsize=12) # Consistent legend font size 119 | ax1.grid(True) # Add grid 120 | ax1.tick_params(axis='both', which='major', labelsize=14) # Increase tick label size 121 | ax1.spines['top'].set_linewidth(1.5) # Thicker frame line 122 | ax1.spines['right'].set_linewidth(1.5) 123 | ax1.spines['left'].set_linewidth(1.5) 124 | ax1.spines['bottom'].set_linewidth(1.5) 125 | fig.text(0.0, 0.96, '(a)', fontsize=18, weight='bold') 126 | 127 | # Subplot 2: Top-N Precision for First 300 N Values 128 | ax2 = fig.add_subplot(1, 3, 2) 129 | N = 300 # We will plot for the first 300 N values 130 | ax2.plot(range(1, N+1), average_topN_precision_gpt35[:N], lw=3, label=label_gpt35) 131 | ax2.plot(range(1, N+1), average_topN_precision_gpt4o[:N], lw=3, label=label_gpt4o) 132 | ax2.plot(range(1, len(topNprecision_avg_ML[:N])+1), topNprecision_avg_ML[:N], lw=3, label=label_nn) 133 | ax2.plot(range(1, len(topNprecision_avg_rnd[:N])+1), topNprecision_avg_rnd[:N], lw=3, linestyle='--', color='grey', label=label_rnd) 134 | ax2.set_xlim([1, N]) 135 | ax2.set_ylim([0, 1]) 136 | ax2.set_xlabel('Sorted research suggestion', fontsize=14) # Consistent font size 137 | ax2.set_ylabel('Precision', fontsize=14) # Consistent font size 138 | ax2.set_title('Top-N Precision', fontsize=16) # Consistent title font size 139 | ax2.legend(loc="upper right", fontsize=12) # Consistent legend font size 140 | ax2.grid(True) # Add grid 141 | ax2.tick_params(axis='both', which='major', labelsize=14) 142 | ax2.spines['top'].set_linewidth(1.5) 143 | ax2.spines['right'].set_linewidth(1.5) 144 | ax2.spines['left'].set_linewidth(1.5) 145 | ax2.spines['bottom'].set_linewidth(1.5) 146 | fig.text(0.33, 0.96, '(b)', fontsize=18, weight='bold') 147 | 148 | # Subplot 3: Top-N Precision for First 20 N 
Values (Max Precision) 149 | ax3 = fig.add_subplot(1, 3, 3) 150 | N = 10 # We will plot for the first 20 N values 151 | ax3.plot(range(1, N+1), average_max_precision_gpt35[:N], lw=3, label=label_gpt35) 152 | ax3.plot(range(1, N+1), average_max_precision_gpt4o[:N], lw=3, label=label_gpt4o) 153 | ax3.plot(range(1, len(highInterestProb_ML[:N])+1), highInterestProb_ML[:N], lw=3, label=label_nn) 154 | ax3.plot(range(1, len(highInterestProb_rnd[:N])+1), highInterestProb_rnd[:N], lw=3, linestyle='--', color='grey', label=label_rnd) 155 | ax3.set_xlim([1, N]) 156 | ax3.set_ylim([0, 1]) 157 | ax3.set_xlabel('Sorted research suggestion', fontsize=14) # Consistent font size 158 | ax3.set_ylabel('Probability', fontsize=14) # Consistent font size 159 | ax3.set_title('Top-N Success Probability', fontsize=16) # Consistent title font size 160 | ax3.legend(loc="lower right", fontsize=12) # Consistent legend font size 161 | ax3.grid(True) # Add grid 162 | ax3.tick_params(axis='both', which='major', labelsize=14) 163 | ax3.spines['top'].set_linewidth(1.5) 164 | ax3.spines['right'].set_linewidth(1.5) 165 | ax3.spines['left'].set_linewidth(1.5) 166 | ax3.spines['bottom'].set_linewidth(1.5) 167 | fig.text(0.66, 0.96, '(c)', fontsize=18, weight='bold') 168 | 169 | # Adjust layout to prevent overlap 170 | plt.tight_layout() 171 | # Directory and filename setup 172 | save_dir = 'figures' 173 | filename = 'Fig4.png' 174 | # Ensure the directory exists 175 | os.makedirs(save_dir, exist_ok=True) 176 | 177 | # Full path to save the figure 178 | save_path = os.path.join(save_dir, filename) 179 | 180 | # Save the figure 181 | plt.savefig(save_path, dpi=300, format='png') 182 | 183 | # Show the plot 184 | plt.show() -------------------------------------------------------------------------------- /create_figs_withTree.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from sklearn.metrics import auc 6 | 7 | 8 | if __name__ == "__main__": 9 | 10 | data_dir="data" 11 | os.makedirs(data_dir, exist_ok=True) 12 | with open(os.path.join(data_dir,'full_data_gpt35.pkl'), 'rb') as file: 13 | data_loaded = pickle.load(file) 14 | 15 | ranked_indices_gpt35 = data_loaded['ranked_indices'] 16 | interest_binary_gpt35 = data_loaded['interest_binary'] 17 | auc_values_gpt35 = data_loaded['auc_values'] 18 | fpr_gpt35 = data_loaded['fpr'] 19 | tpr_gpt35 = data_loaded['tpr'] 20 | 21 | with open(os.path.join(data_dir,'full_data_gpt4o.pkl'), 'rb') as file: 22 | data_loaded = pickle.load(file) 23 | 24 | ranked_indices_gpt4o = data_loaded['ranked_indices'] 25 | interest_binary_gpt4o = data_loaded['interest_binary'] 26 | auc_values_gpt4o = data_loaded['auc_values'] 27 | fpr_gpt4o = data_loaded['fpr'] 28 | tpr_gpt4o = data_loaded['tpr'] 29 | 30 | with open(os.path.join(data_dir,'full_data_gpt4omini.pkl'), 'rb') as file: 31 | data_loaded = pickle.load(file) 32 | 33 | ranked_indices_gpt4o_mini = data_loaded['ranked_indices'] 34 | interest_binary_gpt4o_mini = data_loaded['interest_binary'] 35 | auc_values_gpt4o_mini = data_loaded['auc_values'] 36 | fpr_gpt4o_mini = data_loaded['fpr'] 37 | tpr_gpt4o_mini = data_loaded['tpr'] 38 | 39 | with open(os.path.join(data_dir,'full_data_ML.pkl'), 'rb') as file: 40 | data_loaded = pickle.load(file) 41 | 42 | topNprecision_avg_ML = data_loaded['topNprecision_avg'] 43 | highInterestProb_ML = data_loaded['highInterestProb_ML'] 44 | highInterestProb_rnd = data_loaded['highInterestProb_rnd'] 
45 | fpr_ML = data_loaded['fpr'] 46 | tpr_ML = data_loaded['tpr'] 47 | auc_value_ML = auc(fpr_ML, tpr_ML) 48 | 49 | with open(os.path.join(data_dir,'full_data_DT_fixed_params.pkl'), 'rb') as file: 50 | data_loaded = pickle.load(file) 51 | 52 | topNprecision_avg_DT = data_loaded['topNprecision_avg'] 53 | highInterestProb_DT = data_loaded['highInterestProb_ML'] 54 | #highInterestProb_rnd = data_loaded['highInterestProb_rnd'] 55 | fpr_DT = data_loaded['fpr'] 56 | tpr_DT = data_loaded['tpr'] 57 | auc_value_DT = auc(fpr_DT, tpr_DT) 58 | 59 | 60 | topN_precision_gpt35 = [sum(interest_binary_gpt35[:i+1]) / (i+1) for i in range(len(interest_binary_gpt35))] 61 | max_precision_gpt35 = [max(interest_binary_gpt35[:i+1]) for i in range(len(interest_binary_gpt35))] 62 | 63 | topN_precision_gpt4o = [sum(interest_binary_gpt4o[:i+1]) / (i+1) for i in range(len(interest_binary_gpt4o))] 64 | max_precision_gpt4o = [max(interest_binary_gpt4o[:i+1]) for i in range(len(interest_binary_gpt4o))] 65 | 66 | 67 | # Parameters 68 | n_subsystem = 300 # Size of the subsystems 69 | total_size = len(interest_binary_gpt35) # Assuming all have the same size 70 | iterations = 1000 # Number of iterations for averaging 71 | 72 | # Initialize arrays to hold the cumulative precision and max precision 73 | cumulative_topN_precision_gpt35 = np.zeros(n_subsystem) 74 | cumulative_max_precision_gpt35 = np.zeros(n_subsystem) 75 | 76 | cumulative_topN_precision_gpt4o = np.zeros(n_subsystem) 77 | cumulative_max_precision_gpt4o = np.zeros(n_subsystem) 78 | 79 | 80 | cumulative_topN_precision_gpt4o_mini = np.zeros(n_subsystem) 81 | cumulative_max_precision_gpt4o_mini = np.zeros(n_subsystem) 82 | # Loop over the specified number of iterations 83 | for _ in range(iterations): 84 | if _ % 10 ==0: 85 | print(_) 86 | # Select random indices for the subsystems without changing the order 87 | random_indices = np.sort(np.random.choice(total_size, n_subsystem, replace=False)) 88 | 89 | # Extract subsystems while maintaining the order 90 | interest_binary_gpt35_sub = [interest_binary_gpt35[i] for i in random_indices] 91 | interest_binary_gpt4o_sub = [interest_binary_gpt4o[i] for i in random_indices] 92 | interest_binary_gpt4o_mini_sub = [interest_binary_gpt4o_mini[i] for i in random_indices] 93 | 94 | # Compute precision for the subsystems 95 | topN_precision_gpt35_sub = [sum(interest_binary_gpt35_sub[:i+1]) / (i+1) for i in range(len(interest_binary_gpt35_sub))] 96 | max_precision_gpt35_sub = [max(interest_binary_gpt35_sub[:i+1]) for i in range(len(interest_binary_gpt35_sub))] 97 | 98 | topN_precision_gpt4o_sub = [sum(interest_binary_gpt4o_sub[:i+1]) / (i+1) for i in range(len(interest_binary_gpt4o_sub))] 99 | max_precision_gpt4o_sub = [max(interest_binary_gpt4o_sub[:i+1]) for i in range(len(interest_binary_gpt4o_sub))] 100 | 101 | topN_precision_gpt4o_mini_sub = [sum(interest_binary_gpt4o_mini_sub[:i+1]) / (i+1) for i in range(len(interest_binary_gpt4o_mini_sub))] 102 | max_precision_gpt4o_mini_sub = [max(interest_binary_gpt4o_mini_sub[:i+1]) for i in range(len(interest_binary_gpt4o_mini_sub))] 103 | 104 | # Accumulate the results 105 | cumulative_topN_precision_gpt35 += np.array(topN_precision_gpt35_sub) 106 | cumulative_max_precision_gpt35 += np.array(max_precision_gpt35_sub) 107 | 108 | cumulative_topN_precision_gpt4o += np.array(topN_precision_gpt4o_sub) 109 | cumulative_max_precision_gpt4o += np.array(max_precision_gpt4o_sub) 110 | 111 | cumulative_topN_precision_gpt4o_mini += np.array(topN_precision_gpt4o_mini_sub) 112 | 
cumulative_max_precision_gpt4o_mini += np.array(max_precision_gpt4o_mini_sub) 113 | 114 | 115 | # Compute the averages 116 | average_topN_precision_gpt35 = cumulative_topN_precision_gpt35 / iterations 117 | average_max_precision_gpt35 = cumulative_max_precision_gpt35 / iterations 118 | 119 | average_topN_precision_gpt4o = cumulative_topN_precision_gpt4o / iterations 120 | average_max_precision_gpt4o = cumulative_max_precision_gpt4o / iterations 121 | 122 | 123 | average_topN_precision_gpt4o_mini = cumulative_topN_precision_gpt4o_mini / iterations 124 | average_max_precision_gpt4o_mini = cumulative_max_precision_gpt4o_mini / iterations 125 | 126 | overall_precision_gpt4o = sum(interest_binary_gpt4o) / len(interest_binary_gpt4o) 127 | 128 | # Create a vector with the same length as interest_binary_gpt4o, filled with the overall precision value 129 | topNprecision_avg_rnd = [overall_precision_gpt4o] * len(interest_binary_gpt4o) 130 | 131 | 132 | # Create a figure with three subplots 133 | fig = plt.figure(figsize=(18, 6)) # Adjusted for three subplots 134 | 135 | label_gpt35='GPT 3.5\n [text, 0-shot]' 136 | label_gpt4o='GPT 4o\n[text, 0-shot]' 137 | label_gpt4o_mini='GPT 4o-mini\n[text, 0-shot]' 138 | label_nn='Neural Net\n[graph, superv.]' 139 | label_dt='Decision Tree\n[graph, superv.]' 140 | label_rnd='random' 141 | 142 | # Subplot 1: ROC Curve 143 | ax1 = fig.add_subplot(1, 3, 1) 144 | ax1.plot(fpr_gpt35, tpr_gpt35, lw=4, label=f'{label_gpt35}\n(AUC={auc_values_gpt35[-1]:.3f})') 145 | ax1.plot(fpr_gpt4o, tpr_gpt4o, lw=4, label=f'{label_gpt4o}\n(AUC={auc_values_gpt4o[-1]:.3f})') 146 | ax1.plot(fpr_ML, tpr_ML, lw=4, label=f'{label_nn}\n(AUC={auc_value_ML:.3f})') 147 | ax1.plot(fpr_gpt4o_mini, tpr_gpt4o_mini, lw=4, label=f'{label_gpt4o_mini}\n(AUC={auc_values_gpt4o_mini[-1]:.3f})') 148 | ax1.plot(fpr_DT, tpr_DT, lw=4, label=f'{label_dt}\n(AUC={auc_value_DT:.3f})') 149 | 150 | ax1.plot([0, 1], [0, 1], color='grey', lw=4, linestyle='--',label=f'{label_rnd}\n(AUC={0.500:.3f})') 151 | ax1.set_xlim([0.0, 1.0]) 152 | ax1.set_ylim([0.0, 1.0]) 153 | ax1.set_xlabel('False Positive Rate', fontsize=14) # Consistent font size 154 | ax1.set_ylabel('True Positive Rate', fontsize=14) # Consistent font size 155 | ax1.set_title('Average ROC Curve', fontsize=20) # Consistent title font size 156 | ax1.legend(loc="lower right", fontsize=12) # Consistent legend font size 157 | ax1.grid(True) # Add grid 158 | 159 | # Subplot 2: Top-N Precision for First 300 N Values 160 | ax2 = fig.add_subplot(1, 3, 2) 161 | N = 300 # We will plot for the first 300 N values 162 | ax2.plot(range(1, N+1), average_topN_precision_gpt35[:N], lw=4, label=label_gpt35) 163 | ax2.plot(range(1, N+1), average_topN_precision_gpt4o[:N], lw=4, label=label_gpt4o) 164 | ax2.plot(range(1, len(topNprecision_avg_ML[:N])+1), topNprecision_avg_ML[:N], lw=4, label=label_nn) 165 | ax2.plot(range(1, N+1), average_topN_precision_gpt4o_mini[:N], lw=4, label=label_gpt4o_mini) 166 | ax2.plot(range(1, len(topNprecision_avg_DT[:N])+1), topNprecision_avg_DT[:N], lw=4, label=label_dt) 167 | ax2.plot(range(1, len(topNprecision_avg_rnd[:N])+1), topNprecision_avg_rnd[:N], lw=4, linestyle='--', color='grey', label=label_rnd) 168 | ax2.set_xlim([1, N]) 169 | ax2.set_ylim([0, 1]) 170 | ax2.set_xlabel('Sorted research suggestion', fontsize=14) # Consistent font size 171 | ax2.set_ylabel('Precision', fontsize=14) # Consistent font size 172 | ax2.set_title('Top-N Precision', fontsize=20) # Consistent title font size 173 | ax2.legend(loc="upper right", fontsize=12) # 
Consistent legend font size 174 | ax2.grid(True) # Add grid 175 | 176 | # Subplot 3: Top-N Precision for First 20 N Values (Max Precision) 177 | ax3 = fig.add_subplot(1, 3, 3) 178 | N = 10 # We will plot for the first 20 N values 179 | ax3.plot(range(1, N+1), average_max_precision_gpt35[:N], lw=4, label=label_gpt35) 180 | ax3.plot(range(1, N+1), average_max_precision_gpt4o[:N], lw=4, label=label_gpt4o) 181 | ax3.plot(range(1, len(highInterestProb_ML[:N])+1), highInterestProb_ML[:N], lw=4, label=label_nn) 182 | ax3.plot(range(1, N+1), average_max_precision_gpt4o_mini[:N], lw=4, label=label_gpt4o_mini) 183 | ax3.plot(range(1, len(highInterestProb_DT[:N])+1), highInterestProb_DT[:N], lw=4, label=label_dt) 184 | 185 | ax3.plot(range(1, len(highInterestProb_rnd[:N])+1), highInterestProb_rnd[:N], lw=4, linestyle='--', color='grey', label=label_rnd) 186 | ax3.set_xlim([1, N]) 187 | ax3.set_ylim([0, 1]) 188 | ax3.set_xlabel('Sorted research suggestion', fontsize=14) # Consistent font size 189 | ax3.set_ylabel('Probability', fontsize=14) # Consistent font size 190 | ax3.set_title('Top-N Success Probability', fontsize=20) # Consistent title font size 191 | ax3.legend(loc="lower right", fontsize=12) # Consistent legend font size 192 | ax3.grid(True) # Add grid 193 | 194 | # Adjust layout to prevent overlap 195 | plt.tight_layout() 196 | # Directory and filename setup 197 | save_dir = 'figures' 198 | filename = 'Fig4_with_tree.png' 199 | 200 | # Ensure the directory exists 201 | os.makedirs(save_dir, exist_ok=True) 202 | 203 | # Full path to save the figure 204 | save_path = os.path.join(save_dir, filename) 205 | 206 | # Save the figure 207 | plt.savefig(save_path, dpi=300, format='png') 208 | 209 | # Show the plot 210 | plt.show() -------------------------------------------------------------------------------- /create_full_data_ML_pkl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import csv 4 | import matplotlib.pyplot as plt 5 | import time 6 | import pickle 7 | import torch 8 | from torch import nn 9 | import torch.nn.functional as F 10 | import random 11 | from sklearn.metrics import roc_curve, auc, precision_recall_curve 12 | from mpl_toolkits.axes_grid1.inset_locator import inset_axes 13 | from datetime import datetime 14 | 15 | def print_log(log_string): 16 | current_time = datetime.now() 17 | formatted_time = current_time.strftime('%Y-%m-%d %H:%M:%S') 18 | print_string=f"{formatted_time}: {log_string}" 19 | print(print_string) 20 | with open(log_file, 'a') as file: 21 | file.write(print_string+'\n') 22 | 23 | 24 | 25 | class InterestPredictor(nn.Module): 26 | def __init__(self, input_features, neurons_per_layer, dropout_rate): 27 | super(InterestPredictor, self).__init__() 28 | 29 | num_layers = len(neurons_per_layer) 30 | self.layers = nn.ModuleList() 31 | 32 | # Input layer 33 | self.layers.append(nn.Linear(input_features, neurons_per_layer[0])) 34 | self.layers.append(nn.ReLU()) 35 | # Optionally add dropout after the first layer or activation 36 | self.layers.append(nn.Dropout(dropout_rate)) 37 | 38 | # Hidden layers 39 | for i in range(1, num_layers): 40 | self.layers.append(nn.Linear(neurons_per_layer[i-1], neurons_per_layer[i])) 41 | self.layers.append(nn.ReLU()) 42 | # Add dropout after each activation 43 | self.layers.append(nn.Dropout(dropout_rate)) 44 | 45 | # Output layer 46 | # No dropout is applied to the output layer 47 | self.layers.append(nn.Linear(neurons_per_layer[-1], 1)) 48 | 49 | def 
forward(self, x): 50 | for layer in self.layers: 51 | x = layer(x) 52 | return x 53 | 54 | def compute_parameters(nn_structure, first_layer=1): 55 | # Add the input layer size at the beginning and the output size at the end 56 | layers = [first_layer] + nn_structure + [1] 57 | parameters = 0 58 | for i in range(1, len(layers)): 59 | parameters += (layers[i-1] * layers[i]) + layers[i] 60 | return parameters 61 | 62 | 63 | 64 | def normalize_features(all_data): 65 | features_norm=[] 66 | 67 | for curr_feature in range(len(all_data['features'][0])): 68 | all_feature_vals_list = [ff[curr_feature] for ff in all_data['features']] 69 | all_feature_vals = np.array(all_feature_vals_list) 70 | 71 | 72 | # Normalize the values 73 | mean_val = np.mean(all_feature_vals) 74 | std_val = np.std(all_feature_vals) 75 | all_feature_vals_z = (all_feature_vals - mean_val) / std_val 76 | 77 | features_norm.append(all_feature_vals_z) 78 | 79 | features_norm=np.array(features_norm) 80 | return features_norm 81 | 82 | 83 | 84 | def prepare_data(features_norm, interest_data, train_ratio, val_ratio, test_ratio): 85 | # Ensure the split ratios sum to 1 86 | 87 | # Shuffle the data 88 | indices = np.arange(len(interest_data)) 89 | np.random.shuffle(indices) 90 | features_norm = features_norm[:, indices] 91 | interest_data = interest_data[indices] 92 | 93 | # Calculate split indices 94 | train_index = int(len(interest_data) * train_ratio) 95 | val_index = int(len(interest_data) * (train_ratio + val_ratio)) 96 | 97 | # Split the data 98 | X_train = features_norm[:, :train_index].T 99 | y_train = interest_data[:train_index] 100 | 101 | X_val = features_norm[:, train_index:val_index].T 102 | y_val = interest_data[train_index:val_index] 103 | 104 | X_test = features_norm[:, val_index:].T 105 | y_test = interest_data[val_index:] 106 | 107 | # Convert to PyTorch tensors 108 | X_train = torch.tensor(X_train, dtype=torch.float32) 109 | y_train = torch.tensor(y_train, dtype=torch.float32) 110 | X_val = torch.tensor(X_val, dtype=torch.float32) 111 | y_val = torch.tensor(y_val, dtype=torch.float32) 112 | X_test = torch.tensor(X_test, dtype=torch.float32) 113 | y_test = torch.tensor(y_test, dtype=torch.float32) 114 | 115 | 116 | return X_train, y_train, X_val, y_val, X_test, y_test 117 | 118 | 119 | def train(X_train, y_train, X_val, y_val, model, optimizer, epochs=100, patience=10, do_plot=False): 120 | train_losses = [] 121 | val_losses = [] 122 | best_val_loss = float('inf') 123 | epochs_no_improve = 0 124 | best_model = None 125 | 126 | criterion = nn.MSELoss() 127 | 128 | for epoch in range(epochs): 129 | # Training phase 130 | model.train() 131 | optimizer.zero_grad() 132 | outputs = model(X_train) 133 | train_loss = criterion(outputs.squeeze(), y_train) 134 | train_loss.backward() 135 | optimizer.step() 136 | 137 | train_losses.append(train_loss.item()) 138 | 139 | # Evaluation phase 140 | model.eval() 141 | with torch.no_grad(): 142 | predictions = model(X_val) 143 | val_loss = criterion(predictions.squeeze(), y_val) 144 | val_losses.append(val_loss.item()) 145 | 146 | if val_loss < best_val_loss: 147 | best_val_loss = val_loss 148 | best_model = model.state_dict().copy() # Ensure a deep copy is made for the model state 149 | epochs_no_improve = 0 150 | else: 151 | epochs_no_improve += 1 152 | 153 | if epochs_no_improve == patience: 154 | #print(f'Early stopping triggered at epoch {epoch+1}') 155 | break 156 | 157 | #if (epoch+1) % 100 == 0: 158 | # print(f'Epoch {epoch+1}, Training Loss: {train_loss.item()}, Val Loss: 
{val_loss.item()}') 159 | 160 | model.load_state_dict(best_model) 161 | 162 | return model 163 | 164 | 165 | 166 | def evaluate_binary_classification(model, X_test, y_test, iteration=0, do_plot=False): 167 | #print(f'{torch.sum(y_test > 3).float()}/{len(y_test)}') 168 | model.eval() 169 | with torch.no_grad(): 170 | raw_predictions = model(X_test) 171 | predictions_proba = torch.sigmoid(raw_predictions).squeeze() 172 | 173 | # Adjust labels if needed 174 | if sum((y_test > 3)) > 0: 175 | y_test_binary = (y_test > 3).int() 176 | 177 | 178 | 179 | # Get indices that would sort predictions_proba in descending order 180 | sorted_indices = torch.argsort(predictions_proba, descending=True) 181 | sorted_y_test_binary = y_test_binary[sorted_indices] 182 | 183 | random_order = torch.randperm(len(y_test_binary)) 184 | random_y_test_binary = y_test_binary[random_order] 185 | 186 | 187 | # Calculate the cumulative sum of the sorted binary labels 188 | cumulative_sums = torch.cumsum(sorted_y_test_binary, dim=0) 189 | y_cummax=sorted_y_test_binary.cummax(dim=0)[0].numpy() 190 | rnd_cummax=random_y_test_binary.cummax(dim=0)[0].numpy() 191 | 192 | denominators = torch.arange(1, len(sorted_y_test_binary) + 1) 193 | 194 | # Calculate precision for each threshold 195 | precision = cumulative_sums.float() / denominators.float() 196 | precision_numpy = precision.numpy() 197 | 198 | # Calculate ROC AUC as well for comparison 199 | fpr, tpr, _ = roc_curve(y_test_binary.numpy(), predictions_proba.numpy()) 200 | roc_auc = auc(fpr, tpr) 201 | 202 | if do_plot: 203 | # Plot Precision-Recall curve 204 | plt.figure(figsize=(12, 6)) 205 | 206 | plt.plot(precision_numpy, color='blue', lw=2, label=f'Precision curve)') 207 | plt.xlabel('index') 208 | plt.ylabel('Precision') 209 | plt.title('Precision Curve') 210 | plt.legend(loc="lower left") 211 | plt.show() 212 | 213 | # Plot ROC curve 214 | plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.4f)' % roc_auc) 215 | plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') 216 | plt.xlim([0.0, 1.0]) 217 | plt.ylim([0.0, 1.05]) 218 | plt.xlabel('False Positive Rate') 219 | plt.ylabel('True Positive Rate') 220 | plt.title('ROC Curve by fpr, tpr') 221 | plt.legend(loc="lower right") 222 | plt.show() 223 | 224 | 225 | return roc_auc, precision_numpy, fpr, tpr, y_cummax, rnd_cummax 226 | 227 | 228 | 229 | 230 | def select_top_n_percent(all_data, N): 231 | # Calculate the number of entries to select 232 | num_entries = len(all_data['features']) 233 | top_n_count = int(num_entries * (N / 100)) 234 | 235 | # Step 1: Sort the entries by 'impact' feature 236 | # Create a list of tuples where each tuple is (index, impact_value) 237 | impact_values = [(i, features[141]) for i, features in enumerate(all_data['features'])] 238 | 239 | # Sort this list by the impact_value in descending order 240 | sorted_by_impact = sorted(impact_values, key=lambda x: x[1], reverse=True) 241 | 242 | # Step 2: Select the top N% of these entries 243 | top_n_indices = [index for index, _ in sorted_by_impact[:top_n_count]] 244 | 245 | # Step 3: Create a new dictionary with these selected entries 246 | all_data_top_n = { 247 | 'clarity': [all_data['clarity'][i] for i in top_n_indices], 248 | 'interest': [all_data['interest'][i] for i in top_n_indices], 249 | 'features': [all_data['features'][i] for i in top_n_indices] 250 | } 251 | 252 | return all_data_top_n 253 | 254 | 255 | if __name__ == "__main__": 256 | # Configuration 257 | 258 | log_dir='logs_MLdata' 259 | os.makedirs(log_dir, 
exist_ok=True) 260 | 261 | authortype = ['nat', 'soc'] 262 | institutetype = ['nat', 'same', 'soc'] 263 | suggestiontype = ['random', 'semnet'] 264 | 265 | 266 | np.random.seed() 267 | random.seed() 268 | torch_seed = random.randint(0, 2**32 - 1) 269 | torch.manual_seed(torch_seed) 270 | rrrseed=2 271 | np.random.seed(rrrseed) 272 | random.seed(rrrseed) 273 | torch.manual_seed(rrrseed) 274 | 275 | CURR_ID=random.randint(10000000, 99999999) 276 | log_file=os.path.join(log_dir, f"logs_{CURR_ID}.txt") 277 | 278 | # Load and prepare your data outside the loop if it's the same for each iteration 279 | data_dir="data" 280 | os.makedirs(data_dir, exist_ok=True) 281 | file_path = os.path.join(data_dir, 'all_evaluation_data.pkl') 282 | 283 | if os.path.exists(file_path): 284 | with open(file_path, 'rb') as file: 285 | all_data = pickle.load(file) 286 | print("'all_data' has been loaded from the pickle file.") 287 | else: 288 | print(f"{file_path} doesnt exist.") 289 | exit() 290 | 291 | 292 | percentage=100 293 | all_data_top_n = select_top_n_percent(all_data, percentage) 294 | all_data=all_data_top_n 295 | 296 | interest_data = np.array(all_data['interest']) 297 | features_norm = normalize_features(all_data) 298 | 299 | n_features=25 300 | if True: 301 | #for n_features in list(range(10, 100, 5)): 302 | precision_all= np.array([]) 303 | ycummax_all=np.array([]) 304 | rnd_cummax_all=np.array([]) 305 | 306 | fpr_all = np.array([]) # Initialize if not already 307 | tpr_all = np.array([]) # Initialize if not already 308 | auc_scores = [] 309 | std_of_the_mean=1 310 | 311 | curr_feature_list=[143, 6, 66, 76, 40, 24, 30, 15, 2, 72, 12, 36, 17, 10, 20, 34, 0, 4, 7, 14, 26, 16, 38, 5, 28, 68, 22, 8, 42, 13, 19, 9, 32, 70, 62, 64, 74, 18, 11, 3, 25, 1, 35, 135, 55, 23, 136, 27, 39, 53, 137, 31, 43, 60, 65, 112, 37, 29, 51, 123, 124, 117, 41, 125, 129, 130, 118, 119, 33, 50, 71, 131, 21, 63, 138, 54, 88, 52, 75, 132, 140, 59, 87, 56, 89, 103, 133, 77, 96, 139, 102, 73, 58, 69, 104, 67, 93, 48, 94, 90, 91, 97, 92, 122, 106, 99, 79, 100, 45, 61, 121, 95, 49, 142, 78, 80, 46, 98, 111, 120, 47, 101, 84, 44, 107, 128, 81, 134, 82, 85, 113, 116, 114, 127, 105, 115] 312 | #curr_feature_list=[8, 70, 66, 81, 39, 117, 125, 97, 40, 55, 17, 5, 46, 123, 48, 3, 41, 91, 92, 111, 114, 24, 135, 74, 95, 82, 10, 113, 87, 112, 124, 63, 4, 0, 27, 42, 143, 15] 313 | curr_feature_list=curr_feature_list[0:n_features] 314 | curr_neurons_per_layer_list=[50] 315 | curr_lr=0.003 316 | curr_train_ratio=0.75 317 | curr_dropout=0.2 318 | curr_weight_decay=0.0007 319 | 320 | hyperparameters=curr_neurons_per_layer_list, curr_feature_list, curr_lr, curr_train_ratio, curr_dropout, curr_weight_decay 321 | 322 | #hyperparameters=[[87], [8, 70, 66, 81, 39, 117, 125, 97, 40, 55, 17, 5, 46, 123, 48, 3, 41, 91, 92, 111, 114, 24, 135, 74, 95, 82, 10, 113, 87, 112, 124, 63, 4, 0, 27, 42, 143, 15], 0.00257, 0.7547, 0.21, 0.0007] 323 | #curr_neurons_per_layer_list, curr_feature_list, curr_lr, curr_train_ratio, curr_dropout, curr_weight_decay=hyperparameters 324 | 325 | 326 | curr_features_norm = features_norm[curr_feature_list, :] 327 | print_log(f"hyperparameters={hyperparameters}\n") 328 | did_early_stop=False 329 | while len(auc_scores)<10 or std_of_the_mean>1/3*0.01: 330 | X_train, y_train, X_val, y_val, X_test, y_test = prepare_data(curr_features_norm, interest_data, train_ratio=curr_train_ratio, val_ratio=0.9-curr_train_ratio, test_ratio=0.1) 331 | 332 | # Re-instantiate model and optimizer 333 | model = 
InterestPredictor(input_features=len(curr_features_norm), neurons_per_layer=curr_neurons_per_layer_list, dropout_rate=curr_dropout) 334 | optimizer = torch.optim.Adam(model.parameters(), lr=curr_lr, weight_decay=curr_weight_decay) 335 | 336 | criterion = nn.MSELoss() 337 | 338 | # Train the model 339 | #model = train(X_train, y_train, X_val, y_val, model, criterion, optimizer, epochs=1000, patience=200) 340 | model = train(X_train, y_train, X_val, y_val, model, optimizer, epochs=1000, patience=200) 341 | 342 | # Evaluate the model 343 | roc_auc, precision_numpy, fpr, tpr, y_cummax, rnd_cummax = evaluate_binary_classification(model, X_test, y_test, iteration=len(auc_scores), do_plot=False) 344 | 345 | fpr_all = np.concatenate((fpr_all, fpr)) 346 | tpr_all = np.concatenate((tpr_all, tpr)) 347 | 348 | if len(precision_all)==0: 349 | precision_all = precision_numpy 350 | ycummax_all =y_cummax 351 | rnd_cummax_all=rnd_cummax 352 | else: 353 | precision_all += precision_numpy 354 | ycummax_all +=y_cummax 355 | rnd_cummax_all+=rnd_cummax 356 | 357 | if roc_auc!=-1: 358 | auc_scores.append(roc_auc) 359 | std_of_the_mean=np.std(auc_scores)/np.sqrt(len(auc_scores)) 360 | 361 | curr_val=(np.mean(auc_scores)+3*std_of_the_mean) 362 | 363 | if len(auc_scores)%1==0: 364 | #print_log('---') 365 | print_log(f'{len(auc_scores)}: roc_auc={roc_auc:.4f} ({np.mean(auc_scores):.4f}+-{np.std(auc_scores)/np.sqrt(len(auc_scores)):.4f}))') 366 | 367 | 368 | # Sorting fpr_all and tpr_all by fpr 369 | indices_fpr = np.argsort(fpr_all) 370 | fpr_all_sorted = fpr_all[indices_fpr] 371 | tpr_all_sorted = tpr_all[indices_fpr] 372 | 373 | # Number of bins (N) and calculating the size of each bin 374 | N = 100 # Adjust N based on your requirements 375 | bin_size = len(indices_fpr) // N # Using thresholds for binning precision 376 | 377 | # Initialize the bins for averaged data 378 | fpr_bin = np.zeros(N) 379 | tpr_bin = np.zeros(N) 380 | 381 | # Populate the bins by averaging the elements in each bin 382 | for i in range(N): 383 | start_index = i * bin_size 384 | end_index = start_index + bin_size 385 | fpr_bin[i] = np.mean(fpr_all_sorted[start_index:end_index]) 386 | tpr_bin[i] = np.mean(tpr_all_sorted[start_index:end_index]) 387 | 388 | 389 | precision_avg=precision_all/len(auc_scores) 390 | ycummax_avg=ycummax_all/len(auc_scores) 391 | rnd_cummax_avg=rnd_cummax_all/len(auc_scores) 392 | 393 | 394 | # Number of elements in precision_avg 395 | n_elements = len(precision_avg) 396 | 397 | # Calculate average precision using PyTorch 398 | avg_precision = precision_avg[-1] # Calculate average precision 399 | 400 | 401 | fig = plt.figure(figsize=(18, 6)) # Adjusted for three subplots 402 | 403 | # ROC Curve 404 | ax1 = fig.add_subplot(1, 3, 1) 405 | ax1.plot(fpr_bin, tpr_bin, color='darkorange', lw=4, label=f'ML Selection (AUC={np.mean(auc_scores*100):.3f})') 406 | ax1.plot([0, 1], [0, 1], color='navy', linestyle='--', lw=4, label='Random Selection') 407 | ax1.set_xlim([0.0, 1.0]) 408 | ax1.set_ylim([0.0, 1.0]) 409 | ax1.set_xlabel('False Positive Rate', fontsize=14) # Increased font size 410 | ax1.set_ylabel('True Positive Rate', fontsize=14) # Increased font size 411 | ax1.set_title('Average ROC Curve', fontsize=20) # Increased title font size 412 | ax1.legend(loc="lower right", fontsize=12) # Increased legend font size 413 | 414 | # Precision-Threshold Curve 415 | ax2 = fig.add_subplot(1, 3, 2) 416 | values = np.arange(1, len(precision_avg) + 1) 417 | ax2.plot(values, precision_avg, color='darkorange', lw=4, label='ML 
Selection') 418 | ax2.axhline(y=avg_precision.item(), color='navy', linestyle='--', lw=4, label='Random Selection') 419 | ax2.set_ylim([0.0, 1.05]) 420 | ax2.set_xlim([1, len(precision_avg)]) 421 | ax2.set_xlabel('Index of Research Suggestion', fontsize=14) # Increased font size 422 | ax2.set_ylabel('Precision', fontsize=14) # Increased font size 423 | ax2.set_title('Top-N Precision', fontsize=20) # Increased title font size 424 | ax2.legend(loc="lower right", fontsize=12) # Increased legend font size 425 | 426 | # Ycummax-Avg Curve 427 | ax3 = fig.add_subplot(1, 3, 3) 428 | ax3.plot(values[0:20], ycummax_avg[0:20], color='darkorange', lw=4, label='ML Selection') 429 | ax3.plot(values[0:20], rnd_cummax_avg[0:20], color='navy', linestyle='--', lw=4, label='Random Selection') 430 | ax3.set_ylim([0.0, 1.05]) 431 | ax3.set_xlim([1, len(ycummax_avg[0:20])]) 432 | ax3.set_xlabel('Number of Research Suggestion', fontsize=14) # Increased font size 433 | ax3.set_ylabel('Probability', fontsize=14) # Increased font size 434 | ax3.set_title('High-Interest Probability', fontsize=20) # Increased title font size 435 | ax3.legend(loc="lower right", fontsize=12) # Increased legend font size 436 | 437 | plt.tight_layout() 438 | #plt.savefig(f'best_model_{curr_neurons_per_layer_list[0]}_{percentage}_len_{len(curr_feature_list)}_LR_{curr_lr:.4f}_AUC_{np.mean(auc_scores):.4f}.png', dpi=300, format='png') 439 | 440 | plt.show() 441 | 442 | 443 | # Organize your data into a dictionary with the expected keys 444 | data_to_save = { 445 | 'topNprecision_avg': precision_avg, # Renamed for loading 446 | 'highInterestProb_ML': ycummax_avg, # Renamed for loading 447 | 'highInterestProb_rnd': rnd_cummax_avg, # Renamed for loading 448 | 'fpr': fpr_bin, # Matches the expected key 449 | 'tpr': tpr_bin # Matches the expected key 450 | } 451 | 452 | # Save the dictionary to a pickle file 453 | 454 | with open(os.path.join(data_dir,'full_data_ML.pkl'), 'wb') as file: 455 | pickle.dump(data_to_save, file) -------------------------------------------------------------------------------- /create_full_data_gpt_pkl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | import pickle 4 | import json 5 | import csv 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from sklearn.metrics import roc_auc_score, roc_curve 9 | 10 | 11 | def update_elo(elo_scores, id1, id2, winner, K=32): 12 | # Compute expected scores 13 | R1 = 10**(elo_scores[id1] / 400) 14 | R2 = 10**(elo_scores[id2] / 400) 15 | E1 = R1 / (R1 + R2) 16 | E2 = R2 / (R1 + R2) 17 | 18 | # Update scores 19 | if winner == 1: 20 | S1, S2 = 1, 0 21 | else: 22 | S1, S2 = 0, 1 23 | 24 | elo_scores[id1] = elo_scores[id1] + K * (S1 - E1) 25 | elo_scores[id2] = elo_scores[id2] + K * (S2 - E2) 26 | 27 | return elo_scores 28 | 29 | 30 | def normalize_features(data): 31 | features = np.array(data['features']) 32 | return (features - np.mean(features, axis=0)) / np.std(features, axis=0) 33 | 34 | 35 | 36 | if __name__ == "__main__": 37 | # Configuration 38 | for use_gpt4o in [True, False]: 39 | 40 | authortype = ['nat', 'soc'] 41 | institutetype = ['nat', 'same', 'soc'] 42 | suggestiontype = ['random', 'semnet'] 43 | 44 | rrrseed = 2 45 | np.random.seed(rrrseed) 46 | random.seed(rrrseed) 47 | 48 | data_dir="data" 49 | os.makedirs(data_dir, exist_ok=True) 50 | # Load and prepare your data outside the loop if it's the same for each iteration 51 | file_path = os.path.join(data_dir, 'all_evaluation_data.pkl') 52 | 
53 | with open(file_path, 'rb') as file: 54 | all_data = pickle.load(file) 55 | print("'all_data' has been loaded from the pickle file.") 56 | 57 | # Main body 58 | num_of_samples = len(all_data['interest']) 59 | smaller_data = all_data 60 | 61 | if use_gpt4o: 62 | #result_dir = 'results_4o' 63 | file_name=os.path.join(data_dir,'full_data_gpt4o.pkl') 64 | elo_file=os.path.join(data_dir,'elo_data_gpt4o.pkl') 65 | else: 66 | #result_dir = 'results_gpt35' 67 | file_name=os.path.join(data_dir,'full_data_gpt35.pkl') 68 | elo_file=os.path.join(data_dir,'elo_data_gpt35.pkl') 69 | 70 | 71 | # Initialize ELO scores and match counts if file doesn't exist 72 | elo_scores = [1400] * num_of_samples 73 | match_counts = [0] * num_of_samples 74 | 75 | with open(elo_file, 'rb') as file: 76 | elo_results = pickle.load(file) 77 | 78 | 79 | # Update ELO scores based on results 80 | for id1, id2, winner in elo_results: 81 | elo_scores = update_elo(elo_scores, id1, id2, winner) 82 | match_counts[id1] += 1 83 | match_counts[id2] += 1 84 | 85 | interest_data = np.array(smaller_data['interest']) 86 | features_norm = normalize_features(smaller_data) 87 | 88 | # Ranking suggestions by ELO from large to small 89 | ranked_indices = np.argsort(elo_scores)[::-1] 90 | 91 | # High interest is defined as 4 or 5, low interest as 1, 2, or 3 92 | interest_binary = [1 if interest_data[i] >= 4 else 0 for i in ranked_indices] 93 | 94 | # Compute AUC 95 | auc = roc_auc_score(interest_binary, np.sort(elo_scores)[::-1]) 96 | 97 | print(f"AUC: {auc}") 98 | 99 | # Save the results 100 | total_matches = sum(match_counts) // 2 101 | 102 | # Plot the ROC curve 103 | fpr, tpr, _ = roc_curve(interest_binary, np.sort(elo_scores)[::-1]) 104 | 105 | plt.figure() 106 | plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {auc:.2f})') 107 | plt.plot([0, 1], [0, 1], color='grey', lw=2, linestyle='--') 108 | plt.xlim([0.0, 1.0]) 109 | plt.ylim([0.0, 1.05]) 110 | plt.xlabel('False Positive Rate') 111 | plt.ylabel('True Positive Rate') 112 | plt.title('Receiver Operating Characteristic (ROC) Curve') 113 | plt.legend(loc="lower right") 114 | plt.grid(True) 115 | 116 | plt.show() 117 | plt.close() 118 | 119 | 120 | # Create a dictionary with the required keys 121 | data_to_save = { 122 | 'ranked_indices': ranked_indices, # Your ranked indices data 123 | 'interest_binary': interest_binary, # Your binary interest data 124 | 'auc_values': [auc], # Your AUC values 125 | 'fpr': fpr, # Your false positive rate data 126 | 'tpr': tpr # Your true positive rate data 127 | } 128 | 129 | # Save the dictionary to a pickle file 130 | with open(file_name, 'wb') as file: 131 | pickle.dump(data_to_save, file) 132 | -------------------------------------------------------------------------------- /data/all_evaluation_data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/data/all_evaluation_data.pkl -------------------------------------------------------------------------------- /data/elo_data_gpt35.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/data/elo_data_gpt35.pkl -------------------------------------------------------------------------------- /data/elo_data_gpt4o.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/data/elo_data_gpt4o.pkl -------------------------------------------------------------------------------- /data/full_data_DT_fixed_params.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/data/full_data_DT_fixed_params.pkl -------------------------------------------------------------------------------- /data/full_data_ML.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/data/full_data_ML.pkl -------------------------------------------------------------------------------- /data/full_data_gpt35.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/data/full_data_gpt35.pkl -------------------------------------------------------------------------------- /data/full_data_gpt4o.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/data/full_data_gpt4o.pkl -------------------------------------------------------------------------------- /data/full_data_gpt4omini.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/data/full_data_gpt4omini.pkl -------------------------------------------------------------------------------- /figures/Fig3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/figures/Fig3.png -------------------------------------------------------------------------------- /figures/Fig4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/figures/Fig4.png -------------------------------------------------------------------------------- /figures/Fig4_with_tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/figures/Fig4_with_tree.png -------------------------------------------------------------------------------- /figures/auc_over_time_final.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/figures/auc_over_time_final.png -------------------------------------------------------------------------------- /figures/scimuse.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/figures/scimuse.jpeg -------------------------------------------------------------------------------- /figures/scimuse_benchmark_5k.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/figures/scimuse_benchmark_5k.png -------------------------------------------------------------------------------- /hyperparameters/Fig_DecisionTree_hyperparameters/all_results_10.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/hyperparameters/Fig_DecisionTree_hyperparameters/all_results_10.txt -------------------------------------------------------------------------------- /hyperparameters/Fig_DecisionTree_hyperparameters/all_results_15.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/hyperparameters/Fig_DecisionTree_hyperparameters/all_results_15.txt -------------------------------------------------------------------------------- /hyperparameters/Fig_DecisionTree_hyperparameters/all_results_20.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/hyperparameters/Fig_DecisionTree_hyperparameters/all_results_20.txt -------------------------------------------------------------------------------- /hyperparameters/Fig_DecisionTree_hyperparameters/all_results_25.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/hyperparameters/Fig_DecisionTree_hyperparameters/all_results_25.txt -------------------------------------------------------------------------------- /hyperparameters/Fig_DecisionTree_hyperparameters/all_results_30.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/hyperparameters/Fig_DecisionTree_hyperparameters/all_results_30.txt -------------------------------------------------------------------------------- /hyperparameters/Fig_DecisionTree_hyperparameters/all_results_35.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/hyperparameters/Fig_DecisionTree_hyperparameters/all_results_35.txt -------------------------------------------------------------------------------- /hyperparameters/Fig_DecisionTree_hyperparameters/all_results_40.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/hyperparameters/Fig_DecisionTree_hyperparameters/all_results_40.txt -------------------------------------------------------------------------------- /hyperparameters/Fig_DecisionTree_hyperparameters/all_results_45.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/hyperparameters/Fig_DecisionTree_hyperparameters/all_results_45.txt -------------------------------------------------------------------------------- 
/hyperparameters/Fig_DecisionTree_hyperparameters/all_results_50.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/hyperparameters/Fig_DecisionTree_hyperparameters/all_results_50.txt -------------------------------------------------------------------------------- /hyperparameters/Fig_DecisionTree_hyperparameters/mean_auc_heatmaps_highres.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/hyperparameters/Fig_DecisionTree_hyperparameters/mean_auc_heatmaps_highres.png -------------------------------------------------------------------------------- /hyperparameters/Fig_DecisionTree_hyperparameters/plot_results.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | import re 6 | import os 7 | 8 | script_path = os.path.abspath(os.path.dirname(__file__)) 9 | # Change current working directory to the script's location 10 | os.chdir(script_path) 11 | print("current path:", script_path) 12 | # List of num_features values 13 | num_features_list = [10, 15, 20, 25, 30, 35, 40, 45, 50] 14 | 15 | # Set up the figure and axes for a 3x3 grid 16 | fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(18, 15)) 17 | 18 | # Set the color normalization range 19 | vmin = 0.55 20 | vmax = 0.62 21 | 22 | # Flatten the axes array for easy iteration 23 | axes = axes.flatten() 24 | 25 | for idx, num_features in enumerate(num_features_list): 26 | print(num_features) 27 | data = [] 28 | # Read and parse the file 29 | try: 30 | with open(f'all_results_{num_features}.txt', 'r', encoding='latin-1') as file: 31 | for line in file: 32 | line = line.strip() 33 | if not line: 34 | continue # Skip empty lines 35 | # Regex pattern to extract n, m, and Mean AUC 36 | pattern = r'\((\d+),\s*(\d+)\):\s*Test AUC=[\d.]+\s*\(Mean AUC:\s*([\d.]+)\s*±\s*[\d.]+\)' 37 | match = re.match(pattern, line) 38 | if match: 39 | n = int(match.group(1)) 40 | m = int(match.group(2)) 41 | mean_auc = float(match.group(3)) 42 | data.append({'max_depth': n, 'min_samples_leaf': m, 'Mean AUC': mean_auc}) 43 | else: 44 | print(f"Line not matched in file {num_features}: {line}") 45 | except FileNotFoundError: 46 | print(f"File all_results_{num_features}.txt not found.") 47 | continue # Skip to the next num_features value 48 | 49 | # Create a DataFrame 50 | df = pd.DataFrame(data) 51 | 52 | if df.empty: 53 | print(f"No data found in file all_results_{num_features}.txt.") 54 | continue 55 | 56 | # Pivot the DataFrame to create a table 57 | table = df.pivot(index='max_depth', columns='min_samples_leaf', values='Mean AUC') 58 | 59 | # Create the heatmap with reversed colormap and customized annotations 60 | sns.heatmap( 61 | table, 62 | ax=axes[idx], 63 | annot=True, 64 | fmt=".3f", 65 | cmap='YlGnBu_r', # Reversed colormap 66 | vmin=vmin, 67 | vmax=vmax, 68 | annot_kws={"size": 10, "weight": "bold"}, # Bold and larger font 69 | cbar=False # We'll add a single colorbar later 70 | ) 71 | 72 | # Set plot labels and title 73 | axes[idx].set_title(f'{num_features} features') 74 | axes[idx].set_xlabel('min_samples_leaf (m)') 75 | axes[idx].set_ylabel('max_depth (n)') 76 | 77 | 78 | # Add a single colorbar for all heatmaps 79 | 
fig.subplots_adjust(right=0.9) # Adjust the right boundary of the figure 80 | cbar_ax = fig.add_axes([0.92, 0.15, 0.02, 0.7]) # x, y, width, height 81 | norm = plt.Normalize(vmin=vmin, vmax=vmax) 82 | sm = plt.cm.ScalarMappable(cmap='YlGnBu_r', norm=norm) 83 | sm.set_array([]) 84 | fig.colorbar(sm, cax=cbar_ax) 85 | 86 | # Adjust layout and set the overall title 87 | plt.tight_layout(rect=[0, 0, 0.9, 1]) # Leave space for the colorbar 88 | plt.suptitle('Mean AUC for Different Hyper-parameters of Decision Tree', fontsize=16, y=1.02) 89 | 90 | # Save the figure as a high-resolution PNG file 91 | plt.savefig('mean_auc_heatmaps_highres.png', dpi=300, bbox_inches='tight') 92 | 93 | # If you want to display the plot as well, you can uncomment the following line: 94 | plt.show() -------------------------------------------------------------------------------- /hyperparameters/Fig_NN_hyperparameters/all_results_15_0.003.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/hyperparameters/Fig_NN_hyperparameters/all_results_15_0.003.txt -------------------------------------------------------------------------------- /hyperparameters/Fig_NN_hyperparameters/all_results_25_0.003.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/hyperparameters/Fig_NN_hyperparameters/all_results_25_0.003.txt -------------------------------------------------------------------------------- /hyperparameters/Fig_NN_hyperparameters/all_results_35_0.003.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/hyperparameters/Fig_NN_hyperparameters/all_results_35_0.003.txt -------------------------------------------------------------------------------- /hyperparameters/Fig_NN_hyperparameters/all_results_45_0.003.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/hyperparameters/Fig_NN_hyperparameters/all_results_45_0.003.txt -------------------------------------------------------------------------------- /hyperparameters/Fig_NN_hyperparameters/all_results_5_0.003.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/hyperparameters/Fig_NN_hyperparameters/all_results_5_0.003.txt -------------------------------------------------------------------------------- /hyperparameters/Fig_NN_hyperparameters/mean_auc_heatmaps_nn_with_lr_dropout_single_colorbar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/SciMuse/d599365c25eee3e83517d05dc6f8d60b390de9bb/hyperparameters/Fig_NN_hyperparameters/mean_auc_heatmaps_nn_with_lr_dropout_single_colorbar.png -------------------------------------------------------------------------------- /hyperparameters/Fig_NN_hyperparameters/plot_results.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 
import seaborn as sns 5 | import re 6 | import os 7 | 8 | # Get the directory where this script is located 9 | script_path = os.path.abspath(os.path.dirname(__file__)) 10 | 11 | # Change current working directory to the script's location 12 | os.chdir(script_path) 13 | print("current path:", script_path) 14 | 15 | # List of num_features values 16 | num_features_list = [5, 15, 25, 35, 45] 17 | 18 | # Set up the figure 19 | fig = plt.figure(figsize=(25, 10)) 20 | 21 | # Set the color normalization range based on your data 22 | vmin = 0.61 23 | vmax = 0.65 24 | 25 | # First row: Heatmaps for num_features 26 | axes = [] 27 | for idx, num_features in enumerate(num_features_list): 28 | # Calculate position for each subplot 29 | left = 0.05 + idx * 0.18 30 | bottom = 0.55 31 | width = 0.16 32 | height = 0.4 33 | ax = fig.add_axes([left, bottom, width, height]) 34 | axes.append(ax) 35 | print(f"Processing num_features={num_features}") 36 | data = [] 37 | # Read and parse the file 38 | try: 39 | with open(f'all_results_{num_features}_0.003.txt', 'r', encoding='latin-1') as file: 40 | for line in file: 41 | line = line.strip() 42 | if not line: 43 | continue # Skip empty lines 44 | # Regex pattern to extract n, m, and Mean AUC 45 | pattern = r'\((\d+),\s*(\d+)\):\s*Test AUC=[\d.]+\s*\(Mean AUC:\s*([\d.]+)\s*±\s*[\d.]+\)' 46 | match = re.match(pattern, line) 47 | if match: 48 | n = int(match.group(1)) # Number of layers 49 | m = int(match.group(2)) # Number of neurons per layer 50 | mean_auc = float(match.group(3)) 51 | data.append({'layers': n, 'neurons': m, 'Mean AUC': mean_auc}) 52 | else: 53 | print(f"Line not matched in file {num_features}: {line}") 54 | except FileNotFoundError: 55 | print(f"File all_results_{num_features}_0.003.txt not found.") 56 | continue # Skip to the next num_features value 57 | 58 | # Create a DataFrame 59 | df = pd.DataFrame(data) 60 | 61 | if df.empty: 62 | print(f"No data found in file all_results_{num_features}_0.003.txt.") 63 | continue 64 | 65 | # Pivot the DataFrame to create a table 66 | table = df.pivot(index='layers', columns='neurons', values='Mean AUC') 67 | 68 | # Check if the table is empty 69 | if table.empty: 70 | print(f"No data to plot for num_features={num_features}.") 71 | continue 72 | 73 | # Create the heatmap with reversed colormap and customized annotations 74 | sns.heatmap( 75 | table, 76 | ax=ax, 77 | annot=True, 78 | fmt=".3f", 79 | cmap='YlGnBu_r', 80 | vmin=vmin, 81 | vmax=vmax, 82 | annot_kws={"size": 12, "weight": "bold"}, 83 | cbar=False 84 | ) 85 | 86 | # Set plot labels and title 87 | ax.set_title(f'{num_features} features', fontsize=14, y=1.02) 88 | ax.set_xlabel('Neurons per Layer (m)', fontsize=12) 89 | ax.set_ylabel('Number of Layers (n)', fontsize=12) 90 | 91 | # Second row: Heatmaps for Learning Rate and Dropout analysis at num_features=25 92 | 93 | # Learning Rate Heatmap 94 | ax_lr = fig.add_axes([0.22, 0.25, 0.16, 0.15]) 95 | 96 | # Data for LR analysis 97 | mean_aucs_lr = [0.648, 0.648, 0.648] 98 | learning_rates = [0.001, 0.003, 0.009] 99 | columns_lr = [str(lr) for lr in learning_rates] 100 | data_lr = [mean_aucs_lr] 101 | 102 | # Create a DataFrame for LR analysis 103 | df_lr = pd.DataFrame(data_lr, index=['AUC'], columns=columns_lr) 104 | 105 | # Create the heatmap for Learning Rate 106 | sns.heatmap( 107 | df_lr, 108 | ax=ax_lr, 109 | annot=True, 110 | fmt=".3f", 111 | cmap='YlGnBu_r', 112 | vmin=vmin, 113 | vmax=vmax, 114 | cbar=False, 115 | annot_kws={"size": 14, "weight": "bold"} 116 | ) 117 | 118 | # Set labels and 
title for LR heatmap 119 | ax_lr.set_title('Mean AUC for Learning Rates\n(25 features, 1 layer, 50 neurons)', fontsize=12, y=1.1) 120 | ax_lr.set_xlabel('Learning Rate', fontsize=10) 121 | ax_lr.set_ylabel('') 122 | ax_lr.set_yticks([]) 123 | 124 | # Dropout Rate Heatmap 125 | ax_dr = fig.add_axes([0.62, 0.25, 0.16, 0.15]) 126 | 127 | # Data for Dropout analysis 128 | mean_aucs_dr = [0.647, 0.648, 0.649] 129 | dropout_rates = [0.1, 0.2, 0.3] 130 | columns_dr = [str(dr) for dr in dropout_rates] 131 | data_dr = [mean_aucs_dr] 132 | 133 | # Create a DataFrame for Dropout analysis 134 | df_dr = pd.DataFrame(data_dr, index=['AUC'], columns=columns_dr) 135 | 136 | # Create the heatmap for Dropout Rate 137 | sns.heatmap( 138 | df_dr, 139 | ax=ax_dr, 140 | annot=True, 141 | fmt=".3f", 142 | cmap='YlGnBu_r', 143 | vmin=vmin, 144 | vmax=vmax, 145 | cbar=False, 146 | annot_kws={"size": 14, "weight": "bold"} 147 | ) 148 | 149 | # Set labels and title for Dropout heatmap 150 | ax_dr.set_title('Mean AUC for Dropout Rates\n(25 features, 1 layer, 50 neurons)', fontsize=12, y=1.1) 151 | ax_dr.set_xlabel('Dropout Rate', fontsize=10) 152 | ax_dr.set_ylabel('') 153 | ax_dr.set_yticks([]) 154 | 155 | # Adjust the colorbar to cover both rows and move it further to the right 156 | cbar_ax = fig.add_axes([0.94, 0.15, 0.02, 0.75]) 157 | 158 | norm = plt.Normalize(vmin=vmin, vmax=vmax) 159 | sm = plt.cm.ScalarMappable(cmap='YlGnBu_r', norm=norm) 160 | sm.set_array([]) 161 | fig.colorbar(sm, cax=cbar_ax, label='Mean AUC') 162 | 163 | # Set overall title higher to avoid overlap 164 | plt.suptitle('Mean AUC for Different Hyper-parameters of Neural Network', fontsize=18, y=1.05) 165 | 166 | # Save the figure as a high-resolution PNG file 167 | plt.savefig('mean_auc_heatmaps_nn_with_lr_dropout_single_colorbar.png', dpi=300, bbox_inches='tight') 168 | 169 | # Display the plot 170 | plt.show() 171 | --------------------------------------------------------------------------------
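
The analysis scripts above each write a small results dictionary to data/ (keys such as 'fpr', 'tpr', 'topNprecision_avg', 'ranked_indices', 'auc_values'). The snippet below is a minimal usage sketch, not a file from the repository: it assumes the pickles were produced by the scripts above with exactly those keys, and shows one way to load them and overlay the stored ROC curves for a quick side-by-side check (the curve labels are illustrative, not taken from the repo).

import os
import pickle
import matplotlib.pyplot as plt

data_dir = "data"
# File names come from the save calls in the scripts above; the labels are only illustrative.
curves = {
    "Neural network": "full_data_ML.pkl",
    "GPT-4o Elo ranking": "full_data_gpt4o.pkl",
    "GPT-3.5 Elo ranking": "full_data_gpt35.pkl",
}

plt.figure()
for label, file_name in curves.items():
    with open(os.path.join(data_dir, file_name), "rb") as file:
        data = pickle.load(file)  # dict containing 'fpr' and 'tpr' arrays, as saved above
    plt.plot(data["fpr"], data["tpr"], lw=2, label=label)
plt.plot([0, 1], [0, 1], color="grey", lw=2, linestyle="--", label="Random Selection")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.show()

Note that for full_data_ML.pkl the stored 'fpr'/'tpr' are binned averages over repeated training runs, so the overlaid curve is a summary across runs rather than a single-run ROC curve.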