├── LICENSE ├── README.md ├── benchmark_code ├── loops_fcNN.py ├── loops_transformer.py ├── loops_tree.py ├── loops_xgboost.py ├── make_plot_loss_curve.py ├── make_plots.py ├── make_plots_mixed.py └── make_plots_single.py ├── create_concepts ├── Concept_Corpus │ ├── s0_get_preprint_metadata.ipynb │ ├── s1_make_metadate_arxivstyle.ipynb │ ├── s2_combine_all_preprint_metadate.ipynb │ ├── s3_get_concepts.ipynb │ └── s4_improve_concept.ipynb └── Domain_Concept │ ├── full_domain_concepts.txt │ ├── s0_prepare_optics_quantum_data.ipynb │ ├── s1_split_domain_papers.py │ ├── s2_get_domain_concepts.py │ ├── s3_merge_concepts.py │ ├── s4_improve_concepts.ipynb │ └── s5_improve_manually_concepts.py ├── create_dynamic_concepts ├── get_concept_citation.py ├── merge_concept_citation.py └── process_concept_to_pandas_frame.py ├── create_dynamic_edges ├── _get_openalex_workdata.py ├── _get_openalex_workdata_parallel_run1.py ├── get_concept_pairs.py ├── merge_concept_pairs.py └── process_edge_to_pandas_frame.py ├── features_utils.py ├── fpr_example ├── plot_FPR.py └── roc_curve_combined_highres.png ├── general_utils.py ├── miscellaneous ├── Fig2_NeuralNet.png ├── Impact4Cast.png └── KnowledgeGraph.png ├── prepare_adjacency_pagerank.py ├── prepare_eval_data ├── prepare_eval_feature_data.ipynb ├── prepare_eval_feature_data.py └── prepare_eval_feature_data_condition.py ├── prepare_node_pair_citation_data_years.ipynb ├── prepare_unconnected_pair_solution.ipynb ├── preprocess_utils.py ├── train_model_2019_condition.py ├── train_model_2019_individual_feature.py ├── train_model_2019_run.py ├── train_model_2022_run.py └── train_model_utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Artificial Scientist Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /benchmark_code/loops_fcNN.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import random, time 6 | import numpy as np 7 | import pandas as pd 8 | import torch 9 | import torch.nn as nn 10 | import torch.optim as optim 11 | from datetime import datetime, date 12 | import matplotlib 13 | matplotlib.use('Agg') # Use a non-interactive backend suitable for cluster 14 | import matplotlib.pyplot as plt 15 | from sklearn.metrics import roc_auc_score, roc_curve 16 | 17 | 18 | def get_predictions(model, data, solution, eval_batch_size, log_file): 19 | model.eval() # Set the model to evaluation mode 20 | with torch.no_grad(): 21 | data_batches = torch.split(data, eval_batch_size) 22 | total_batches = len(data_batches) 23 | 24 | all_predictions = [] 25 | start_time = time.time() 26 | for i, batch in enumerate(data_batches, start=1): 27 | batch_start_time = time.time() 28 | batch_preds = model(batch).squeeze().cpu().numpy() 29 | all_predictions.append(batch_preds) 30 | batch_time = time.time() - batch_start_time 31 | print_cluster(f"Processed batch {i}/{total_batches} in {batch_time:.2f} seconds", log_file) 32 | 33 | predictions = np.concatenate(all_predictions) 34 | 35 | true_labels = solution.cpu().numpy() # Move labels to CPU 36 | total_time = time.time() - start_time 37 | print_cluster(f"Total prediction time: {total_time:.2f} seconds", log_file) 38 | return predictions, true_labels 39 | 40 | 41 | def plot_auc_roc(true_labels, predictions, save_file, label="Train"): 42 | # Calculate the AUC-ROC score and ROC curve 43 | auc_score = roc_auc_score(true_labels, predictions) 44 | fpr, tpr, thresholds = roc_curve(true_labels, predictions) 45 | 46 | # Plot the ROC curve 47 | plt.figure(figsize=(8, 6)) 48 | plt.plot(fpr, tpr, label=f"{label} AUC = {auc_score:.4f}") 49 | plt.plot([0, 1], [0, 1], 'k--', label=f"Random AUC={0.5}") 50 | plt.xlabel("False Positive Rate (FPR)") 51 | plt.ylabel("True Positive Rate (TPR)") 52 | plt.title(f"ROC Curve -- {label}") 53 | plt.legend(loc="lower right") 54 | plt.grid() 55 | plt.savefig(save_file, dpi=300) 56 | plt.close() 57 | 58 | # Save data used to produce this figure, including predictions and ground truth 59 | data_file = save_file.replace('.png', '.npz') 60 | np.savez( 61 | data_file, 62 | fpr=fpr, 63 | tpr=tpr, 64 | thresholds=thresholds, 65 | auc_score=auc_score, 66 | true_labels=true_labels, 67 | predictions=predictions 68 | ) 69 | return auc_score 70 | 71 | def plot_loss_curve(loss_train, loss_test, save_file, label="year1-year2"): 72 | plt.figure(figsize=(8, 6)) 73 | plt.plot(loss_train, label=f'Train: {label}') 74 | plt.plot(loss_test, label=f'Test: {label}') 75 | plt.title(f"Loss Over Epochs: {label}") 76 | plt.xlabel("Epoch") 77 | plt.ylabel("Loss") 78 | plt.legend() 79 | plt.savefig(save_file, dpi=300) 80 | plt.close() 81 | 82 | # Save data used to produce this figure 83 | data_file = save_file.replace('.png', '.npz') 84 | np.savez(data_file, loss_train=loss_train, loss_test=loss_test) 85 | 86 | 87 | class ff_network(nn.Module): 88 | def __init__(self, input_size, hidden_size, output_size): 89 | super(ff_network, self).__init__() 90 | 91 | act = nn.ReLU() 92 | 93 | self.semnet = nn.Sequential( 94 | nn.Linear(input_size, hidden_size), 95 | act, 96 | nn.Linear(hidden_size, hidden_size), 97 | act, 98 | nn.Linear(hidden_size, hidden_size), 99 | act, 100 | 
nn.Linear(hidden_size, output_size) 101 | ) 102 | 103 | def forward(self, x): 104 | res = self.semnet(x) 105 | return res 106 | 107 | # Hyperparameters that we also use in naming 108 | batch_size = 2048 109 | lr_enc = 1e-4 110 | hidden_size = 600 111 | patience = 500 112 | 113 | # Create folders if needed 114 | neuralNet_folder = "save_neuralNet" 115 | plot_folder = "save_plot" 116 | os.makedirs(neuralNet_folder, exist_ok=True) 117 | os.makedirs(plot_folder, exist_ok=True) 118 | 119 | train_data_folder = "data_for_train" 120 | eval_data_folder = "data_for_eval" 121 | os.makedirs(train_data_folder, exist_ok=True) 122 | os.makedirs(eval_data_folder, exist_ok=True) 123 | 124 | rnd_seed = 42 125 | random.seed(rnd_seed) 126 | torch.manual_seed(rnd_seed) 127 | np.random.seed(rnd_seed) 128 | 129 | # We define the loops as requested: 130 | train_year_spans = [2, 3, 4] # (y2_train - y1_train) 131 | eval_year_spans = [1, 2, 3, 4, 5] # (y2_eval - y1_eval) 132 | IR_list = [10, 50] 133 | 134 | fixed_y2_eval = 2022 135 | 136 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 137 | 138 | 139 | def print_cluster(print_str, log_file): 140 | print(print_str) 141 | with open(log_file, "a") as logfile: 142 | logfile.write(print_str + "\n") 143 | 144 | 145 | for t_span in train_year_spans: 146 | for e_span in eval_year_spans: 147 | # Compute eval years 148 | y2_eval = fixed_y2_eval 149 | y1_eval = y2_eval - e_span 150 | 151 | # Compute train years 152 | # given: y2_train = y1_eval 153 | y2_train = y1_eval 154 | y1_train = y2_train - t_span 155 | 156 | # Define the log file name based on parameters 157 | for IR in IR_list: 158 | # Construct a file name prefix for all files 159 | 160 | file_name_prefix = f"{y1_train}_{y2_train}_{hidden_size}_{lr_enc}_{batch_size}_{patience}_{IR}" 161 | log_file = f'log_fcNN_{file_name_prefix}.txt' 162 | 163 | 164 | # Construct file paths 165 | if y1_train==2013: 166 | train_file = os.path.join(train_data_folder, f"train_data_{y1_train}_{y2_train}_IR{IR}_star.parquet") 167 | eval_file = os.path.join(eval_data_folder, f"eval_data_{y1_eval}_{y2_eval}_IR{IR}_star.parquet") 168 | else: 169 | train_file = os.path.join(train_data_folder, f"train_data_{y1_train}_{y2_train}_IR{IR}.parquet") 170 | eval_file = os.path.join(eval_data_folder, f"eval_data_{y1_eval}_{y2_eval}_IR{IR}.parquet") 171 | 172 | # Check if files exist before proceeding 173 | if not os.path.exists(train_file) or not os.path.exists(eval_file): 174 | print_cluster(f"Skipping (Train: {y1_train}-{y2_train}, Eval: {y1_eval}-{y2_eval}, IR={IR}) because files do not exist.", log_file) 175 | continue 176 | 177 | print_cluster(f"Processing (Train: {y1_train}-{y2_train}, Eval: {y1_eval}-{y2_eval}, IR={IR})...", log_file) 178 | 179 | # Load train data 180 | train_data_pandas = pd.read_parquet(train_file) 181 | all_input_train = train_data_pandas.values 182 | train_data_pandas = pd.DataFrame() # free memory 183 | 184 | # Load eval data 185 | eval_data_pandas = pd.read_parquet(eval_file) 186 | all_input_eval = eval_data_pandas.values 187 | eval_data_pandas = pd.DataFrame() # free memory 188 | 189 | eval_feature_dataset = all_input_eval[:, 3:] # selecting features f0,... 
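            # Column layout assumed by the slicing in this block (inferred from the code itself):
            # column 2 holds the binary label ("solution"), columns 3 onward hold the input features,
            # and the first two columns -- presumably the identifiers of the concept pair -- are
            # never passed to the model.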
190 | eval_solution_dataset = all_input_eval[:, 2] 191 | all_negative_solution_eval = np.all(eval_solution_dataset == 0) 192 | 193 | np.random.shuffle(all_input_train) 194 | input_data = all_input_train[:, 3:] # selecting features 195 | supervised_solution = all_input_train[:, 2] # solutions 196 | 197 | train_test_size = [0.85, 0.15] 198 | idx_train = int(len(input_data) * train_test_size[0]) 199 | input_data_train = input_data[:idx_train] 200 | train_solution = supervised_solution[:idx_train] 201 | 202 | input_data_test = input_data[idx_train:] 203 | test_solution = supervised_solution[idx_train:] 204 | 205 | data_train = torch.tensor(input_data_train, dtype=torch.float32).to(device) 206 | solution_train = torch.tensor(train_solution, dtype=torch.float32).to(device) 207 | 208 | data_test = torch.tensor(input_data_test, dtype=torch.float32).to(device) 209 | solution_test = torch.tensor(test_solution, dtype=torch.float32).to(device) 210 | 211 | input_size = data_train.shape[1] 212 | output_size = 1 213 | 214 | model_semnet = ff_network(input_size, hidden_size, output_size).to(device) 215 | criterion = nn.MSELoss() 216 | optimizer = optim.Adam(model_semnet.parameters(), lr=lr_enc) 217 | 218 | size_of_loss_check = 10000 219 | # Initialize variables for early stopping 220 | best_test_loss = float('inf') 221 | best_epoch = 0 222 | 223 | train_loss_total = [] 224 | test_loss_total = [] 225 | 226 | start_time = time.time() 227 | 228 | num_epochs = 5000000 229 | print_cluster("start training....", log_file) 230 | for epoch in range(num_epochs): 231 | model_semnet.train() 232 | # Randomly select 'batch_size' samples from data_train 233 | indices = np.random.choice(len(data_train), batch_size, replace=False) 234 | batch_data = data_train[indices] 235 | batch_solution = solution_train[indices] 236 | 237 | # Forward pass 238 | optimizer.zero_grad() 239 | predictions = model_semnet(batch_data).squeeze() 240 | real_loss = criterion(predictions, batch_solution) 241 | loss = torch.clamp(real_loss, min=0., max=50000.).double() 242 | loss.backward() 243 | optimizer.step() 244 | 245 | with torch.no_grad(): 246 | model_semnet.eval() 247 | # Evaluate on a subset of the training data 248 | train_predictions = model_semnet(data_train[:size_of_loss_check]).squeeze() 249 | train_loss = criterion(train_predictions, solution_train[:size_of_loss_check]).item() 250 | # Evaluate on a subset of the test data 251 | test_predictions = model_semnet(data_test[:size_of_loss_check]).squeeze() 252 | test_loss = criterion(test_predictions, solution_test[:size_of_loss_check]).item() 253 | 254 | train_loss_total.append(train_loss) 255 | test_loss_total.append(test_loss) 256 | 257 | # Calculate epochs since last best test loss 258 | epochs_since_best = epoch - best_epoch 259 | 260 | # Print progress 261 | elapsed_time = time.time() - start_time 262 | print_cluster(f'epoch {epoch}: Train Loss = {train_loss:.5f}, Test Loss = {test_loss:.5f}, Time = {elapsed_time:.5f}s, ESC: {epochs_since_best}/{patience}', log_file) 263 | start_time = time.time() 264 | 265 | # Check if current test loss is the best so far 266 | if test_loss < best_test_loss: 267 | best_test_loss = test_loss 268 | best_epoch = epoch 269 | # Save the model when a new best is found 270 | did_work=False 271 | while did_work==False: 272 | try: 273 | net_file = os.path.join(neuralNet_folder, f"fcNN_netNet_full_trained_{file_name_prefix}.pt") 274 | torch.save(model_semnet, net_file) 275 | torch.save(model_semnet.state_dict(), net_file.replace("fcNN_netNet_full", 
"fcNN_netNet_state")) 276 | did_work=True 277 | except: 278 | time.sleep(0.1) 279 | # Early stopping: if no improvement in 'patience' epochs, stop training 280 | if epoch - best_epoch > patience: 281 | print_cluster(f'Early stopping triggered at epoch {epoch}. Best test loss {best_test_loss:.5f} was not improved for {patience} epochs.', log_file) 282 | break 283 | 284 | print_cluster("finish training....", log_file) 285 | 286 | # Load the best performing model 287 | net_file = os.path.join(neuralNet_folder, f"fcNN_netNet_full_trained_{file_name_prefix}.pt") 288 | model_semnet = torch.load(net_file, map_location=device) 289 | model_semnet.eval() 290 | 291 | save_loss_file = os.path.join(plot_folder, f"fcNN_loss_curve_{file_name_prefix}.png") 292 | plot_loss_curve(train_loss_total, test_loss_total, save_loss_file, label=f"{y1_train}-{y2_train}") 293 | 294 | print_cluster("start evaluation for train, test and eval if possible....", log_file) 295 | eval_batch_size = 50000 296 | train_predictions, train_labels = get_predictions(model_semnet, data_train, solution_train, eval_batch_size, log_file=log_file) 297 | save_train_auc_file = os.path.join(plot_folder, f"fcNN_train_auc_curve_{file_name_prefix}.png") 298 | curr_auc=plot_auc_roc(train_labels, train_predictions, save_train_auc_file, label="Train") 299 | print_cluster(f"Train AUC: {curr_auc}", log_file) 300 | 301 | test_predictions, test_labels = get_predictions(model_semnet, data_test, solution_test, eval_batch_size, log_file=log_file) 302 | save_test_auc_file = os.path.join(plot_folder, f"fcNN_test_auc_curve_{file_name_prefix}.png") 303 | curr_auc=plot_auc_roc(test_labels, test_predictions, save_test_auc_file, label="Test") 304 | print_cluster(f"Test AUC: {curr_auc}", log_file) 305 | print_cluster("finish auc plot for train, test...", log_file) 306 | 307 | if not all_negative_solution_eval: # contain positive cases 308 | data_eval = torch.tensor(eval_feature_dataset, dtype=torch.float32).to(device) 309 | solution_eval = torch.tensor(eval_solution_dataset, dtype=torch.float32).to(device) 310 | eval_predictions, eval_labels = get_predictions(model_semnet, data_eval, solution_eval, eval_batch_size, log_file=log_file) 311 | save_eval_auc_file = os.path.join(plot_folder, f"fcNN_eval_auc_curve_{file_name_prefix}.png") 312 | curr_auc=plot_auc_roc(eval_labels, eval_predictions, save_eval_auc_file, label="Eval") 313 | print_cluster(f"Eval AUC: {curr_auc}", log_file) 314 | 315 | print_cluster("finish all.....", log_file) 316 | -------------------------------------------------------------------------------- /benchmark_code/loops_tree.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import random, time 6 | import numpy as np 7 | import pandas as pd 8 | from datetime import datetime, date 9 | import matplotlib 10 | matplotlib.use('Agg') # Use a non-interactive backend suitable for cluster 11 | import matplotlib.pyplot as plt 12 | from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error 13 | from sklearn.ensemble import RandomForestClassifier 14 | import joblib 15 | 16 | def print_cluster(print_str, log_file): 17 | print(print_str) 18 | with open(log_file, "a") as logfile: 19 | logfile.write(print_str + "\n") 20 | 21 | def get_predictions(model, data, solution, eval_batch_size, log_file): 22 | # model here is a RandomForestClassifier 23 | # We'll do the predictions in batches if needed 24 | data_batches = [] 25 | n = data.shape[0] 26 | 
idx = 0 27 | while idx < n: 28 | data_batches.append(data[idx:idx+eval_batch_size]) 29 | idx += eval_batch_size 30 | 31 | all_predictions = [] 32 | start_time = time.time() 33 | for i, batch in enumerate(data_batches, start=1): 34 | batch_start_time = time.time() 35 | # For random forest, predict_proba returns probabilities for each class 36 | # We assume solution is binary {0,1}, so we take probability of class 1 37 | batch_preds = model.predict_proba(batch)[:, 1] 38 | all_predictions.append(batch_preds) 39 | batch_time = time.time() - batch_start_time 40 | print_cluster(f"Processed batch {i}/{len(data_batches)} in {batch_time:.2f} seconds", log_file) 41 | 42 | predictions = np.concatenate(all_predictions) 43 | true_labels = solution 44 | total_time = time.time() - start_time 45 | print_cluster(f"Total prediction time: {total_time:.2f} seconds", log_file) 46 | return predictions, true_labels 47 | 48 | def plot_auc_roc(true_labels, predictions, save_file, label="Train"): 49 | # Calculate the AUC-ROC score and ROC curve 50 | auc_score = roc_auc_score(true_labels, predictions) 51 | fpr, tpr, thresholds = roc_curve(true_labels, predictions) 52 | 53 | # Plot the ROC curve 54 | plt.figure(figsize=(8, 6)) 55 | plt.plot(fpr, tpr, label=f"{label} AUC = {auc_score:.4f}") 56 | plt.plot([0, 1], [0, 1], 'k--', label=f"Random AUC={0.5}") 57 | plt.xlabel("False Positive Rate (FPR)") 58 | plt.ylabel("True Positive Rate (TPR)") 59 | plt.title(f"ROC Curve -- {label}") 60 | plt.legend(loc="lower right") 61 | plt.grid() 62 | plt.savefig(save_file, dpi=300) 63 | plt.close() 64 | 65 | # Save data used to produce this figure, including predictions and ground truth 66 | data_file = save_file.replace('.png', '.npz') 67 | np.savez( 68 | data_file, 69 | fpr=fpr, 70 | tpr=tpr, 71 | thresholds=thresholds, 72 | auc_score=auc_score, 73 | true_labels=true_labels, 74 | predictions=predictions 75 | ) 76 | return auc_score 77 | 78 | def plot_loss_curve(loss_train, loss_test, save_file, label="year1-year2"): 79 | plt.figure(figsize=(8, 6)) 80 | plt.plot(loss_train, label=f'Train: {label}') 81 | plt.plot(loss_test, label=f'Test: {label}') 82 | plt.title(f"Loss Over Epochs: {label}") 83 | plt.xlabel("Epoch") 84 | plt.ylabel("Loss") 85 | plt.legend() 86 | plt.savefig(save_file, dpi=300) 87 | plt.close() 88 | 89 | # Save data used to produce this figure 90 | data_file = save_file.replace('.png', '.npz') 91 | np.savez(data_file, loss_train=loss_train, loss_test=loss_test) 92 | 93 | # Dummy variables for same file format of the log files 94 | batch_size = 2048 # Dummy variables for same file format of the log files 95 | lr_enc = 1e-4 # Dummy variables for same file format of the log files 96 | hidden_size = 600 # Dummy variables for same file format of the log files 97 | patience = 500 # Dummy variables for same file format of the log files 98 | # Dummy variables for same file format of the log files 99 | 100 | # Create folders if needed 101 | neuralNet_folder = "save_neuralNet" 102 | plot_folder = "save_plot" 103 | os.makedirs(neuralNet_folder, exist_ok=True) 104 | os.makedirs(plot_folder, exist_ok=True) 105 | 106 | train_data_folder = "data_for_train" 107 | eval_data_folder = "data_for_eval" 108 | os.makedirs(train_data_folder, exist_ok=True) 109 | os.makedirs(eval_data_folder, exist_ok=True) 110 | 111 | rnd_seed = 42 112 | random.seed(rnd_seed) 113 | np.random.seed(rnd_seed) 114 | 115 | train_year_spans = [2, 3, 4] # (y2_train - y1_train) 116 | eval_year_spans = [1, 2, 3, 4, 5] # (y2_eval - y1_eval) 117 | IR_list = [10, 50] 
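# The nested loops below tile the benchmark over these settings: every evaluation window ends
# at fixed_y2_eval, and the training window ends exactly where the evaluation window begins
# (y2_train = y1_eval), so the two intervals only touch at that boundary.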
118 | fixed_y2_eval = 2022 119 | 120 | 121 | for t_span in train_year_spans: 122 | for e_span in eval_year_spans: 123 | # Compute eval years 124 | y2_eval = fixed_y2_eval 125 | y1_eval = y2_eval - e_span 126 | 127 | # Compute train years 128 | # given: y2_train = y1_eval 129 | y2_train = y1_eval 130 | y1_train = y2_train - t_span 131 | 132 | for IR in IR_list: 133 | # Construct a file name prefix for all files 134 | file_name_prefix = f"{y1_train}_{y2_train}_{hidden_size}_{lr_enc}_{batch_size}_{patience}_{IR}" 135 | log_file = f'log_tree_{file_name_prefix}.txt' 136 | 137 | if y1_train==2013: 138 | train_file = os.path.join(train_data_folder, f"train_data_{y1_train}_{y2_train}_IR{IR}_star.parquet") 139 | eval_file = os.path.join(eval_data_folder, f"eval_data_{y1_eval}_{y2_eval}_IR{IR}_star.parquet") 140 | else: 141 | train_file = os.path.join(train_data_folder, f"train_data_{y1_train}_{y2_train}_IR{IR}.parquet") 142 | eval_file = os.path.join(eval_data_folder, f"eval_data_{y1_eval}_{y2_eval}_IR{IR}.parquet") 143 | 144 | # Check if files exist before proceeding 145 | if not os.path.exists(train_file) or not os.path.exists(eval_file): 146 | print_cluster(f"Skipping (Train: {y1_train}-{y2_train}, Eval: {y1_eval}-{y2_eval}, IR={IR}) because files do not exist.", log_file) 147 | continue 148 | 149 | print_cluster(f"Processing (Train: {y1_train}-{y2_train}, Eval: {y1_eval}-{y2_eval}, IR={IR})...", log_file) 150 | 151 | # Load train data 152 | train_data_pandas = pd.read_parquet(train_file) 153 | all_input_train = train_data_pandas.values 154 | train_data_pandas = pd.DataFrame() # free memory 155 | 156 | # Load eval data 157 | eval_data_pandas = pd.read_parquet(eval_file) 158 | all_input_eval = eval_data_pandas.values 159 | eval_data_pandas = pd.DataFrame() # free memory 160 | 161 | eval_feature_dataset = all_input_eval[:, 3:] # selecting features 162 | eval_solution_dataset = all_input_eval[:, 2] 163 | all_negative_solution_eval = np.all(eval_solution_dataset == 0) 164 | 165 | np.random.shuffle(all_input_train) 166 | input_data = all_input_train[:, 3:] # selecting features 167 | supervised_solution = all_input_train[:, 2] # solutions 168 | 169 | train_test_size = [0.85, 0.15] 170 | idx_train = int(len(input_data) * train_test_size[0]) 171 | input_data_train = input_data[:idx_train] 172 | train_solution = supervised_solution[:idx_train] 173 | 174 | input_data_test = input_data[idx_train:] 175 | test_solution = supervised_solution[idx_train:] 176 | 177 | # Initialize the Random Forest classifier 178 | print_cluster("Finished shuffling and splitting training data...", log_file) 179 | 180 | clf = RandomForestClassifier( 181 | n_estimators=300, 182 | max_depth=None, 183 | min_samples_split=25, 184 | min_samples_leaf=10, 185 | n_jobs=-1, 186 | random_state=rnd_seed, 187 | verbose=1, 188 | class_weight='balanced' 189 | ) 190 | 191 | print_cluster("Training the Random Forest classifier...", log_file) 192 | start_time = time.time() 193 | clf.fit(input_data_train, train_solution) 194 | end_time = time.time() 195 | print_cluster(f"Training completed in {end_time - start_time:.2f} seconds.", log_file) 196 | 197 | # We mimic the logic of plotting loss curves by calculating MSE on a subset 198 | size_of_loss_check = 10000 199 | # Compute "loss" as MSE of predictions vs solutions, just for plotting 200 | train_pred_for_loss = clf.predict_proba(input_data_train[:size_of_loss_check])[:,1] 201 | train_loss = mean_squared_error(train_solution[:size_of_loss_check], train_pred_for_loss) 202 | 203 | test_pred_for_loss 
= clf.predict_proba(input_data_test[:size_of_loss_check])[:,1] 204 | test_loss = mean_squared_error(test_solution[:size_of_loss_check], test_pred_for_loss) 205 | 206 | train_loss_total = [train_loss] 207 | test_loss_total = [test_loss] 208 | 209 | # Save the trained model (replace fcNN by tree in filenames) 210 | net_file = os.path.join(neuralNet_folder, f"tree_netNet_full_trained_{file_name_prefix}.pkl") 211 | joblib.dump(clf, net_file) 212 | 213 | # Plot loss curve (will be trivial, just one point) 214 | save_loss_file = os.path.join(plot_folder, f"tree_loss_curve_{file_name_prefix}.png") 215 | plot_loss_curve(train_loss_total, test_loss_total, save_loss_file, label=f"{y1_train}-{y2_train}") 216 | 217 | print_cluster("start evaluation for train, test and eval if possible....", log_file) 218 | eval_batch_size = 50000 219 | 220 | # Load model again to mimic original code 221 | clf = joblib.load(net_file) 222 | 223 | # Get predictions for training set 224 | train_predictions, train_labels = get_predictions(clf, input_data_train, train_solution, eval_batch_size, log_file=log_file) 225 | save_train_auc_file = os.path.join(plot_folder, f"tree_train_auc_curve_{file_name_prefix}.png") 226 | curr_auc = plot_auc_roc(train_labels, train_predictions, save_train_auc_file, label="Train") 227 | print_cluster(f"Train AUC: {curr_auc}", log_file) 228 | 229 | # Get predictions for test set 230 | test_predictions, test_labels = get_predictions(clf, input_data_test, test_solution, eval_batch_size, log_file=log_file) 231 | save_test_auc_file = os.path.join(plot_folder, f"tree_test_auc_curve_{file_name_prefix}.png") 232 | curr_auc = plot_auc_roc(test_labels, test_predictions, save_test_auc_file, label="Test") 233 | print_cluster(f"Test AUC: {curr_auc}", log_file) 234 | print_cluster("finish auc plot for train, test...", log_file) 235 | 236 | if not all_negative_solution_eval: # contain positive cases 237 | eval_predictions, eval_labels = get_predictions(clf, eval_feature_dataset, eval_solution_dataset, eval_batch_size, log_file=log_file) 238 | save_eval_auc_file = os.path.join(plot_folder, f"tree_eval_auc_curve_{file_name_prefix}.png") 239 | curr_auc = plot_auc_roc(eval_labels, eval_predictions, save_eval_auc_file, label="Eval") 240 | print_cluster(f"Eval AUC: {curr_auc}", log_file) 241 | 242 | print_cluster("finish all.....", log_file) 243 | -------------------------------------------------------------------------------- /benchmark_code/loops_xgboost.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import random, time 6 | import numpy as np 7 | import pandas as pd 8 | from datetime import datetime, date 9 | import matplotlib 10 | matplotlib.use('Agg') # Use a non-interactive backend suitable for cluster 11 | import matplotlib.pyplot as plt 12 | from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error 13 | import joblib 14 | import xgboost as xgb 15 | 16 | def print_cluster(print_str, log_file): 17 | print(print_str) 18 | with open(log_file, "a") as logfile: 19 | logfile.write(print_str + "\n") 20 | 21 | def get_predictions(model, data, solution, eval_batch_size, log_file): 22 | # model here is an XGBRegressor 23 | # We'll do the predictions in batches if needed 24 | data_batches = [] 25 | n = data.shape[0] 26 | idx = 0 27 | while idx < n: 28 | data_batches.append(data[idx:idx+eval_batch_size]) 29 | idx += eval_batch_size 30 | 31 | all_predictions = [] 32 | start_time = time.time() 33 | for 
i, batch in enumerate(data_batches, start=1): 34 | batch_start_time = time.time() 35 | # For XGBRegressor, we get a continuous output. We interpret it as probability of class 1. 36 | batch_preds = model.predict(batch) 37 | all_predictions.append(batch_preds) 38 | batch_time = time.time() - batch_start_time 39 | print_cluster(f"Processed batch {i}/{len(data_batches)} in {batch_time:.2f} seconds", log_file) 40 | 41 | predictions = np.concatenate(all_predictions) 42 | true_labels = solution 43 | total_time = time.time() - start_time 44 | print_cluster(f"Total prediction time: {total_time:.2f} seconds", log_file) 45 | return predictions, true_labels 46 | 47 | def plot_auc_roc(true_labels, predictions, save_file, label="Train"): 48 | # Calculate the AUC-ROC score and ROC curve 49 | auc_score = roc_auc_score(true_labels, predictions) 50 | fpr, tpr, thresholds = roc_curve(true_labels, predictions) 51 | 52 | # Plot the ROC curve 53 | plt.figure(figsize=(8, 6)) 54 | plt.plot(fpr, tpr, label=f"{label} AUC = {auc_score:.4f}") 55 | plt.plot([0, 1], [0, 1], 'k--', label=f"Random AUC={0.5}") 56 | plt.xlabel("False Positive Rate (FPR)") 57 | plt.ylabel("True Positive Rate (TPR)") 58 | plt.title(f"ROC Curve -- {label}") 59 | plt.legend(loc="lower right") 60 | plt.grid() 61 | plt.savefig(save_file, dpi=300) 62 | plt.close() 63 | 64 | # Save data used to produce this figure, including predictions and ground truth 65 | data_file = save_file.replace('.png', '.npz') 66 | np.savez( 67 | data_file, 68 | fpr=fpr, 69 | tpr=tpr, 70 | thresholds=thresholds, 71 | auc_score=auc_score, 72 | true_labels=true_labels, 73 | predictions=predictions 74 | ) 75 | return auc_score 76 | 77 | def plot_loss_curve(loss_train, loss_test, save_file, label="year1-year2"): 78 | plt.figure(figsize=(8, 6)) 79 | plt.plot(loss_train, label=f'Train: {label}') 80 | plt.plot(loss_test, label=f'Test: {label}') 81 | plt.title(f"Loss Over Epochs: {label}") 82 | plt.xlabel("Epoch") 83 | plt.ylabel("Loss") 84 | plt.legend() 85 | plt.savefig(save_file, dpi=300) 86 | plt.close() 87 | 88 | # Save data used to produce this figure 89 | data_file = save_file.replace('.png', '.npz') 90 | np.savez(data_file, loss_train=loss_train, loss_test=loss_test) 91 | 92 | # Dummy variables for same file format of the log files 93 | batch_size = 2048 # Dummy variables for same file format of the log files 94 | lr_enc = 1e-4 # Dummy variables for same file format of the log files 95 | hidden_size = 600 # Dummy variables for same file format of the log files 96 | patience = 500 # Dummy variables for same file format of the log files 97 | # Dummy variables for same file format of the log files 98 | 99 | # Create folders if needed 100 | neuralNet_folder = "save_neuralNet" 101 | plot_folder = "save_plot" 102 | os.makedirs(neuralNet_folder, exist_ok=True) 103 | os.makedirs(plot_folder, exist_ok=True) 104 | 105 | train_data_folder = "data_for_train" 106 | eval_data_folder = "data_for_eval" 107 | os.makedirs(train_data_folder, exist_ok=True) 108 | os.makedirs(eval_data_folder, exist_ok=True) 109 | 110 | rnd_seed = 42 111 | random.seed(rnd_seed) 112 | np.random.seed(rnd_seed) 113 | 114 | train_year_spans = [2, 3, 4] # (y2_train - y1_train) 115 | eval_year_spans = [1, 2, 3, 4, 5] # (y2_eval - y1_eval) 116 | IR_list = [10, 50] 117 | fixed_y2_eval = 2022 118 | 119 | 120 | for t_span in train_year_spans: 121 | for e_span in eval_year_spans: 122 | # Compute eval years 123 | y2_eval = fixed_y2_eval 124 | y1_eval = y2_eval - e_span 125 | 126 | # Compute train years 127 | # given: 
y2_train = y1_eval 128 | y2_train = y1_eval 129 | y1_train = y2_train - t_span 130 | 131 | for IR in IR_list: 132 | # Construct a file name prefix for all files 133 | file_name_prefix = f"{y1_train}_{y2_train}_{hidden_size}_{lr_enc}_{batch_size}_{patience}_{IR}" 134 | log_file = f'log_xgboost_{file_name_prefix}.txt' 135 | 136 | if y1_train==2013: 137 | train_file = os.path.join(train_data_folder, f"train_data_{y1_train}_{y2_train}_IR{IR}_star.parquet") 138 | eval_file = os.path.join(eval_data_folder, f"eval_data_{y1_eval}_{y2_eval}_IR{IR}_star.parquet") 139 | else: 140 | train_file = os.path.join(train_data_folder, f"train_data_{y1_train}_{y2_train}_IR{IR}.parquet") 141 | eval_file = os.path.join(eval_data_folder, f"eval_data_{y1_eval}_{y2_eval}_IR{IR}.parquet") 142 | 143 | # Check if files exist before proceeding 144 | if not os.path.exists(train_file) or not os.path.exists(eval_file): 145 | print_cluster(f"Skipping (Train: {y1_train}-{y2_train}, Eval: {y1_eval}-{y2_eval}, IR={IR}) because files do not exist.", log_file) 146 | continue 147 | 148 | print_cluster(f"Processing (Train: {y1_train}-{y2_train}, Eval: {y1_eval}-{y2_eval}, IR={IR})...", log_file) 149 | 150 | # Load train data 151 | train_data_pandas = pd.read_parquet(train_file) 152 | all_input_train = train_data_pandas.values 153 | train_data_pandas = pd.DataFrame() # free memory 154 | 155 | # Load eval data 156 | eval_data_pandas = pd.read_parquet(eval_file) 157 | all_input_eval = eval_data_pandas.values 158 | eval_data_pandas = pd.DataFrame() # free memory 159 | 160 | eval_feature_dataset = all_input_eval[:, 3:] # selecting features 161 | eval_solution_dataset = all_input_eval[:, 2] 162 | all_negative_solution_eval = np.all(eval_solution_dataset == 0) 163 | 164 | np.random.shuffle(all_input_train) 165 | # Taking a smaller slice for demonstration as in the original code (0:100), keep it the same 166 | input_data = all_input_train[:, 3:] # selecting features 167 | supervised_solution = all_input_train[:, 2] # solutions 168 | 169 | train_test_size = [0.85, 0.15] 170 | idx_train = int(len(input_data) * train_test_size[0]) 171 | input_data_train = input_data[:idx_train] 172 | train_solution = supervised_solution[:idx_train] 173 | 174 | input_data_test = input_data[idx_train:] 175 | test_solution = supervised_solution[idx_train:] 176 | 177 | print_cluster("Finished shuffling and splitting training data...", log_file) 178 | 179 | print_cluster("start training with XGBoost....", log_file) 180 | # Define XGBoost model 181 | model = xgb.XGBRegressor( 182 | n_estimators=2000, 183 | learning_rate=0.01, 184 | max_depth=10, 185 | subsample=0.8, 186 | colsample_bytree=0.8, 187 | random_state=rnd_seed, 188 | verbosity=1, 189 | eval_metric="rmse", 190 | early_stopping_rounds=500 # Move this parameter here 191 | ) 192 | 193 | # Fit the model 194 | start_time = time.time() 195 | model.fit( 196 | input_data_train, train_solution, 197 | eval_set=[(input_data_test, test_solution)], 198 | verbose=True 199 | ) 200 | end_time = time.time() 201 | print_cluster(f"Training completed in {end_time - start_time:.2f} seconds.", log_file) 202 | 203 | # Compute "loss" as MSE on train/test subsets 204 | size_of_loss_check = 10000 205 | train_pred_for_loss = model.predict(input_data_train[:size_of_loss_check]) 206 | train_loss = mean_squared_error(train_solution[:size_of_loss_check], train_pred_for_loss) 207 | 208 | test_pred_for_loss = model.predict(input_data_test[:size_of_loss_check]) 209 | test_loss = mean_squared_error(test_solution[:size_of_loss_check], 
test_pred_for_loss) 210 | 211 | train_loss_total = [train_loss] 212 | test_loss_total = [test_loss] 213 | 214 | # Save the trained model (replace tree with xgboost in filenames) 215 | net_file = os.path.join(neuralNet_folder, f"xgboost_netNet_full_trained_{file_name_prefix}.pkl") 216 | joblib.dump(model, net_file) 217 | 218 | # Plot loss curve (will be trivial, just one point) 219 | save_loss_file = os.path.join(plot_folder, f"xgboost_loss_curve_{file_name_prefix}.png") 220 | plot_loss_curve(train_loss_total, test_loss_total, save_loss_file, label=f"{y1_train}-{y2_train}") 221 | 222 | print_cluster("start evaluation for train, test and eval if possible....", log_file) 223 | eval_batch_size = 50000 224 | 225 | # Load model again to mimic original code 226 | model = joblib.load(net_file) 227 | 228 | # Get predictions for training set 229 | train_predictions, train_labels = get_predictions(model, input_data_train, train_solution, eval_batch_size, log_file=log_file) 230 | save_train_auc_file = os.path.join(plot_folder, f"xgboost_train_auc_curve_{file_name_prefix}.png") 231 | curr_auc = plot_auc_roc(train_labels, train_predictions, save_train_auc_file, label="Train") 232 | print_cluster(f"Train AUC: {curr_auc}", log_file) 233 | 234 | # Get predictions for test set 235 | test_predictions, test_labels = get_predictions(model, input_data_test, test_solution, eval_batch_size, log_file=log_file) 236 | save_test_auc_file = os.path.join(plot_folder, f"xgboost_test_auc_curve_{file_name_prefix}.png") 237 | curr_auc = plot_auc_roc(test_labels, test_predictions, save_test_auc_file, label="Test") 238 | print_cluster(f"Test AUC: {curr_auc}", log_file) 239 | print_cluster("finish auc plot for train, test...", log_file) 240 | 241 | if not all_negative_solution_eval: # contain positive cases 242 | eval_predictions, eval_labels = get_predictions(model, eval_feature_dataset, eval_solution_dataset, eval_batch_size, log_file=log_file) 243 | save_eval_auc_file = os.path.join(plot_folder, f"xgboost_eval_auc_curve_{file_name_prefix}.png") 244 | curr_auc = plot_auc_roc(eval_labels, eval_predictions, save_eval_auc_file, label="Eval") 245 | print_cluster(f"Eval AUC: {curr_auc}", log_file) 246 | 247 | print_cluster("finish all.....", log_file) 248 | -------------------------------------------------------------------------------- /benchmark_code/make_plot_loss_curve.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Define the directory and file name 6 | base_dir = "save_plot" # Directory name 7 | file_name = "fcNN_loss_curve_2017_2020_600_0.0001_2048_500_10.npz" 8 | 9 | # Construct full path using os.path 10 | file_path = os.path.join(base_dir, file_name) 11 | 12 | # Load the npz file 13 | if os.path.exists(file_path): 14 | data = np.load(file_path) 15 | loss_train = data['loss_train'] 16 | loss_test = data['loss_test'] 17 | else: 18 | raise FileNotFoundError(f"Error: File '{file_path}' not found.") 19 | 20 | # Main plot 21 | plt.figure(figsize=(10, 6)) 22 | plt.plot(loss_train, label='Train Loss', color='blue') 23 | plt.plot(loss_test, label='Test Loss', color='orange') 24 | plt.xlabel("Epoch", fontsize=24) 25 | plt.ylabel("Loss", fontsize=24) 26 | plt.xticks(fontsize=20) 27 | plt.yticks(fontsize=20) 28 | plt.title("Training and Test Loss Over Epochs", fontsize=28) 29 | plt.legend(fontsize=20) 30 | 31 | # Inset plot: Zoomed in from episode 1000 to the end 32 | inset_start = 1000 33 | ax_inset = 
plt.axes([0.30, 0.30, 0.55, 0.40]) # Position of inset: [x, y, width, height] 34 | ax_inset.plot(range(inset_start, len(loss_train)), loss_train[inset_start:], label='Train Loss', color='blue') 35 | ax_inset.plot(range(inset_start, len(loss_test)), loss_test[inset_start:], label='Test Loss', color='orange') 36 | ax_inset.set_title(f"Zoomed In, Start at Epoch {inset_start}", fontsize=14) 37 | ax_inset.set_xlabel("Epoch", fontsize=16) 38 | ax_inset.set_ylabel("Loss", fontsize=16) 39 | ax_inset.tick_params(axis='both', which='major', labelsize=14) 40 | 41 | plt.tight_layout() 42 | # Save the figure 43 | output_dir = "save_plot_output" 44 | os.makedirs(output_dir, exist_ok=True) 45 | save_path = os.path.join(output_dir, "loss_curve_with_inset.png") 46 | plt.savefig(save_path, dpi=300) 47 | 48 | plt.show() 49 | -------------------------------------------------------------------------------- /benchmark_code/make_plots.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | # Define the model colors to ensure consistency 7 | model_colors = { 8 | 'fcNN': 'blue', 9 | 'tree': 'red', 10 | 'xgboost': 'green', 11 | 'transformer': 'orange' 12 | } 13 | 14 | # (y1_train, y2_train) pairs to consider based on conditions (already given) 15 | train_pairs = [ 16 | (y1, y2) 17 | for y1 in range(2013, 2020) 18 | for y2 in range(2017, 2022) 19 | if y2 - y1 in [2, 3, 4] 20 | ] 21 | 22 | IR_values = [10, 50] 23 | 24 | data_folder = "save_plot" 25 | plot_folder = "full_ROCs" 26 | os.makedirs(plot_folder, exist_ok=True) 27 | 28 | # Helper function to parse filename and extract model, y1, y2, IR 29 | def parse_filename(filename): 30 | base = os.path.basename(filename) 31 | base = base.replace('.npz', '') 32 | parts = base.split('_') 33 | 34 | # Find 'eval' as an anchor 35 | try: 36 | eval_idx = parts.index('eval') 37 | except ValueError: 38 | return None, None, None, None 39 | 40 | # Model name is everything before 'eval' 41 | model_name = '_'.join(parts[:eval_idx]) 42 | 43 | # Check for minimum length 44 | if len(parts) < eval_idx + 7: 45 | return None, None, None, None 46 | 47 | # After eval_auc_curve come y1, y2 and IR 48 | try: 49 | y1_train = int(parts[eval_idx+3]) 50 | y2_train = int(parts[eval_idx+4]) 51 | IR = int(parts[-1]) 52 | except ValueError: 53 | return None, None, None, None 54 | 55 | return model_name, y1_train, y2_train, IR 56 | 57 | print("Collecting files...") 58 | 59 | # Collect all .npz files 60 | all_files = glob.glob(os.path.join(data_folder, "*.npz")) 61 | 62 | 63 | 64 | # Dictionary to store files by (y1, y2, IR) 65 | files_dict = {} 66 | for f in all_files: 67 | if 'single' in f: 68 | # Ignore files containing "single" 69 | continue 70 | model_name, y1_t, y2_t, IR = parse_filename(f) 71 | if model_name is None: 72 | continue 73 | # Ensure model_name is in our known set 74 | if model_name not in ['fcNN', 'tree', 'xgboost', 'transformer']: 75 | continue 76 | 77 | key = (y1_t, y2_t, IR) 78 | if key not in files_dict: 79 | files_dict[key] = {} 80 | if model_name not in files_dict[key]: 81 | files_dict[key][model_name] = [] 82 | files_dict[key][model_name].append(f) 83 | 84 | print(f"Collected files for {len(files_dict)} (y1,y2,IR) combinations.") 85 | 86 | # We want two figures: one for IR=10 and one for IR=50 87 | # Rows: delta_train in [2,3,4] 88 | # Cols: delta_eval in [1,2,3,4,5] 89 | delta_train_values = [2, 3, 4] 90 | delta_eval_values = [1, 2, 3, 4, 5] 91 | 92 | 93 
| 94 | print("Starting plotting...") 95 | 96 | for IR in IR_values: 97 | fig, axes = plt.subplots(len(delta_train_values), len(delta_eval_values), figsize=(20, 12)) 98 | #fig.suptitle(f"Various Training and Evaluation Intervals (IR={IR})", fontsize=24, y=0.96) 99 | 100 | for row, dt in enumerate(delta_train_values): 101 | for col, de in enumerate(delta_eval_values): 102 | ax = axes[row, col] 103 | 104 | # Compute y1, y2 from dt and de: 105 | # delta_eval = 2022 - y2 => y2 = 2022 - delta_eval = 2022 - de 106 | y2 = 2022 - de 107 | # delta_train = y2 - y1 => y1 = y2 - dt 108 | y1 = y2 - dt 109 | 110 | key = (y1, y2, IR) 111 | if key not in files_dict or not files_dict[key]: 112 | # No data for this combination 113 | ax.text(0.5, 0.5, "not enough data for evaluation", 114 | ha='center', va='center', transform=ax.transAxes, fontsize=12) 115 | ax.set_title(f"Train: {y1}-{y2}, Eval: {y2}-2022") 116 | ax.set_xlabel("FPR") 117 | ax.set_ylabel("TPR") 118 | ax.grid(True) 119 | continue 120 | 121 | model_files = files_dict[key] 122 | 123 | # Plot each model's ROC curve (take first file if multiple) 124 | any_plotted = False 125 | for model_name in model_files: 126 | f = model_files[model_name][0] 127 | data = np.load(f) 128 | fpr = data['fpr'] 129 | tpr = data['tpr'] 130 | auc_score = data['auc_score'] 131 | 132 | if model_name=='fcNN': 133 | write_model='fcNN' 134 | if model_name=='tree': 135 | write_model='Forest' 136 | if model_name=='xgboost': 137 | write_model='XGBoost' 138 | if model_name=='transformer': 139 | write_model='Transformer' 140 | ax.plot(fpr, tpr, color=model_colors.get(model_name, 'black'), 141 | label=f"{write_model} AUC={auc_score:.4f}") 142 | any_plotted = True 143 | 144 | if any_plotted: 145 | # Plot the random line 146 | ax.plot([0, 1], [0, 1], 'k--', label="Random AUC=0.5") 147 | ax.legend(loc="lower right") 148 | else: 149 | # If no model plotted (though files_dict said otherwise, just in case) 150 | ax.text(0.5, 0.5, "not enough data for evaluation", 151 | ha='center', va='center', transform=ax.transAxes, fontsize=12) 152 | 153 | ax.set_title(f"Train: {y1}-{y2}, Eval: {y2}-2022") 154 | ax.set_xlabel("FPR") 155 | ax.set_ylabel("TPR") 156 | ax.grid(True) 157 | 158 | plt.tight_layout(rect=[0, 0, 1, 0.95]) 159 | plot_filename = f"comparison_roc_grid_IR{IR}.pdf" 160 | save_path = os.path.join(plot_folder, plot_filename) 161 | plt.savefig(save_path, dpi=300) 162 | plt.close() 163 | print(f"Saved combined figure for IR={IR} to {save_path}") 164 | 165 | print("All plots have been generated.") 166 | -------------------------------------------------------------------------------- /benchmark_code/make_plots_mixed.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | # Directory where npz files are stored 7 | data_folder = "save_plot" 8 | # Directory to save final full ROC plot 9 | output_folder = "full_ROCs_single" 10 | os.makedirs(output_folder, exist_ok=True) 11 | 12 | # The set of y_diff values 13 | y_diff_values = [1, 2, 3, 4, 5] 14 | 15 | # Colors for the two fcNN variants 16 | colors = { 17 | 'single': 'blue', 18 | 'mixed': 'red' 19 | } 20 | 21 | # Create a single figure with 5 columns of subplots, 1 row 22 | fig, axes = plt.subplots(1, 5, figsize=(20, 4)) 23 | 24 | # Add a main title to the entire figure 25 | #fig.suptitle("Training from 2014 -> 2017 (Only fcNN Model Variants)", fontsize=22, y=1.08) 26 | 27 | for i, y_diff in enumerate(y_diff_values): 
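    # Each of the five panels covers one evaluation window (2017 to 2017 + y_diff); the two
    # curves compare the fcNN variant trained on IR=50 data ("single") against the variant
    # trained on IR=10 data ("mixed").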
28 | ax = axes[i] 29 | y2_eval = 2017 + y_diff 30 | 31 | # Patterns for the two fcNN variants: 32 | # single: IR=50 files 33 | single_pattern = f"fcNNsingle_eval_auc_curve_2017_{y2_eval}_*_50.npz" 34 | # mixed: IR=10 files 35 | mixed_pattern = f"fcNNmixed_eval_auc_curve_2017_{y2_eval}_*_10.npz" 36 | 37 | # Search for files 38 | single_files = glob.glob(os.path.join(data_folder, single_pattern)) 39 | mixed_files = glob.glob(os.path.join(data_folder, mixed_pattern)) 40 | 41 | any_data = False 42 | 43 | # Plot single variant if available 44 | if single_files: 45 | data = np.load(single_files[0]) 46 | fpr = data['fpr'] 47 | tpr = data['tpr'] 48 | auc_score = data['auc_score'] 49 | ax.plot(fpr, tpr, label=f"fcNN (IR=50) AUC={auc_score:.4f}", color=colors['single']) 50 | any_data = True 51 | 52 | # Plot mixed variant if available 53 | if mixed_files: 54 | data = np.load(mixed_files[0]) 55 | fpr = data['fpr'] 56 | tpr = data['tpr'] 57 | auc_score = data['auc_score'] 58 | ax.plot(fpr, tpr, label=f"fcNN (IR=10) AUC={auc_score:.4f}", color=colors['mixed']) 59 | any_data = True 60 | 61 | if any_data: 62 | # If at least one curve plotted, show the random line and legend 63 | ax.plot([0, 1], [0, 1], 'k--', label="Random AUC=0.5") 64 | ax.legend(loc="lower right") 65 | else: 66 | # If no data for this subplot 67 | ax.text(0.5, 0.5, "no data available", 68 | ha='center', va='center', transform=ax.transAxes, fontsize=12) 69 | 70 | ax.set_xlabel("FPR") 71 | ax.set_ylabel("TPR") 72 | ax.set_title(f"Evaluation: 2017-{y2_eval}, IR=50") 73 | ax.grid(True) 74 | 75 | plt.tight_layout(rect=[0, 0, 1, 0.95]) 76 | 77 | # Save the single combined figure 78 | output_filename = "full_ROC_grid_fcNN_variants.pdf" 79 | plt.savefig(os.path.join(output_folder, output_filename), dpi=600) 80 | 81 | plt.show() 82 | plt.close() 83 | 84 | print("Single figure with all fcNN variant subplots generated and shown.") 85 | -------------------------------------------------------------------------------- /benchmark_code/make_plots_single.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | # Directory where npz files are stored 7 | data_folder = "save_plot" 8 | # Directory to save final full ROC plot 9 | output_folder = "full_ROCs_single" 10 | os.makedirs(output_folder, exist_ok=True) 11 | 12 | model_colors = { 13 | 'fcNN': 'blue', 14 | 'tree': 'red', 15 | 'xgboost': 'green', 16 | 'transformer': 'orange' 17 | } 18 | 19 | # The set of models to include 20 | models = ['fcNN', 'transformer', 'tree', 'xgboost'] 21 | 22 | # y_diff from 1 to 5 23 | y_diff_values = [1, 2, 3, 4, 5] 24 | # IR values 25 | IR_values = [10, 50] 26 | 27 | # Create a single figure with 2 rows and 5 columns of subplots 28 | fig, axes = plt.subplots(2, 5, figsize=(20, 8)) 29 | 30 | # Add a main title to the entire figure 31 | fig.suptitle("Training from 2014 -> 2017", fontsize=22, y=0.96) 32 | 33 | for i, y_diff in enumerate(y_diff_values): 34 | y1_eval = 2017 35 | y2_eval = 2017 + y_diff 36 | 37 | for j, IR in enumerate(IR_values): 38 | ax = axes[j, i] # j is row index (0 for IR=10, 1 for IR=50), i is column index for y_diff 39 | 40 | any_model_plotted = False # To check if we plotted any model 41 | 42 | # Attempt to plot models 43 | for model in models: 44 | if model == 'fcNN': 45 | pattern = f"{model}single_eval_auc_curve_2017_{y2_eval}_*_{IR}.npz" 46 | else: 47 | # For 'transformer', 'tree', 'xgboost' 48 | pattern = 
f"{model}single_eval_auc_curve_2017_2017_*_{IR}_2017_{y2_eval}.npz" 49 | 50 | search_pattern = os.path.join(data_folder, pattern) 51 | files = glob.glob(search_pattern) 52 | 53 | if not files: 54 | continue 55 | 56 | npz_file = files[0] 57 | data = np.load(npz_file) 58 | fpr = data['fpr'] 59 | tpr = data['tpr'] 60 | auc_score = data['auc_score'] 61 | 62 | 63 | if model=='fcNN': 64 | write_model='fcNN' 65 | if model=='tree': 66 | write_model='Forest' 67 | if model=='xgboost': 68 | write_model='XGBoost' 69 | if model=='transformer': 70 | write_model='Transformer' 71 | 72 | ax.plot(fpr, tpr, label=f"{write_model} AUC={auc_score:.4f}", color=model_colors[model]) 73 | any_model_plotted = True 74 | 75 | if any_model_plotted: 76 | # If we found at least one model, then plot the random diagonal 77 | ax.plot([0, 1], [0, 1], 'k--', label="Random AUC=0.5") 78 | ax.legend(loc="lower right") 79 | else: 80 | # If no model was plotted on this subplot, add a message 81 | ax.text(0.5, 0.5, "not enough data for evaluation", 82 | ha='center', va='center', transform=ax.transAxes, fontsize=12) 83 | 84 | ax.set_xlabel("FPR") 85 | ax.set_ylabel("TPR") 86 | ax.set_title(f"Evaluation: 2017-{y2_eval}, IR={IR}") 87 | ax.grid(True) 88 | 89 | # Adjust layout so things fit nicely, and leave space for suptitle 90 | plt.tight_layout(rect=[0, 0, 1, 0.96]) 91 | 92 | # Save the single combined figure 93 | output_filename = "full_ROC_grid_single.pdf" 94 | plt.savefig(os.path.join(output_folder, output_filename), dpi=600) 95 | 96 | plt.show() 97 | plt.close() 98 | 99 | print("Single figure with all subplots generated and shown.") 100 | -------------------------------------------------------------------------------- /create_concepts/Concept_Corpus/s1_make_metadate_arxivstyle.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "5f8d99ec-1c43-4354-84ae-57c9eee8e3eb", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import json\n", 13 | "import ujson\n", 14 | "\n", 15 | "medrxiv_all_article=[]\n", 16 | " \n", 17 | "with open('medrxiv/medrxiv_metadata_oringal.json', 'r') as f:\n", 18 | " medrxiv_all_article.extend(json.load(f))\n", 19 | "\n", 20 | "medrxiv_article=[]\n", 21 | "medrxiv_article_not_version1=[] ## some papers appear many times with different versions, we only use the first version\n", 22 | "for ii in range(len(medrxiv_all_article)):\n", 23 | " if medrxiv_all_article[ii]['version']==\"1\":\n", 24 | " medrxiv_article.append(medrxiv_all_article[ii])\n", 25 | "\n", 26 | "\n", 27 | "with open('medrxiv-metadata-oai-snapshot.json', 'w') as f:\n", 28 | " f.writelines(map(lambda item: ujson.dumps(item) + '\\n', medrxiv_article))\n", 29 | " " 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "id": "7849fa1d-aa04-4822-a335-7c33965523b8", 36 | "metadata": { 37 | "tags": [] 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "import json\n", 42 | "import ujson\n", 43 | "\n", 44 | "biorxiv_all_article=[]\n", 45 | "with open('biorxiv/biorxiv_metadata_final_all.json', 'r') as f:\n", 46 | " biorxiv_all_article.extend(json.load(f))\n", 47 | "\n", 48 | "biorxiv_article=[]\n", 49 | "\n", 50 | "for ii in range(len(biorxiv_all_article)):\n", 51 | " if biorxiv_all_article[ii]['version']==\"1\":\n", 52 | " biorxiv_article.append(biorxiv_all_article[ii])\n", 53 | " \n", 54 | " \n", 55 | "with open('biorxiv-metadata-oai-snapshot.json', 'w') as f:\n", 56 
| " f.writelines(map(lambda item: ujson.dumps(item) + '\\n', biorxiv_article))\n", 57 | " " 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "id": "9b0b9652-60e2-4fc3-89ce-f871f9520320", 64 | "metadata": { 65 | "tags": [] 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "import json\n", 70 | "import ujson\n", 71 | "\n", 72 | "\n", 73 | "chemrxiv_all_article=[]\n", 74 | "with open('chemrxiv/chemrxiv_metadata_oringal_asc.json', 'r') as f: #chemrxiv_metadata_oringal\n", 75 | " chemrxiv_all_article.extend(json.load(f))\n", 76 | " \n", 77 | "chemrxiv_article=[]\n", 78 | "for ii in range(len(chemrxiv_all_article)):\n", 79 | " chemrxiv_article.append(chemrxiv_all_article[ii]['item'])\n", 80 | " \n", 81 | " \n", 82 | "new_chemrxiv_article = []\n", 83 | "for ii in range(len(chemrxiv_article)):\n", 84 | " new_entry = {}\n", 85 | " new_entry['id'] = chemrxiv_article[ii]['id']\n", 86 | " authors_names = [f\"{author['firstName']} {author['lastName']}\" for author in chemrxiv_article[ii]['authors']]\n", 87 | " new_entry['authors'] = ', '.join(authors_names)\n", 88 | " new_entry['title'] = chemrxiv_article[ii]['title']\n", 89 | " new_entry['doi'] = chemrxiv_article[ii]['doi']\n", 90 | " categories_all = ', '.join([d['name'] for d in chemrxiv_article[ii]['categories']])\n", 91 | " new_entry['categories'] = categories_all\n", 92 | " new_entry['abstract'] = chemrxiv_article[ii]['abstract']\n", 93 | " new_entry['date'] = chemrxiv_article[ii]['submittedDate'][0:10]\n", 94 | " new_entry['version'] = chemrxiv_article[ii]['version']\n", 95 | " new_entry['server'] = 'chemrxiv'\n", 96 | " new_chemrxiv_article.append(new_entry)\n", 97 | " \n", 98 | "# write the new dictionary to a JSON file\n", 99 | "with open('chemrxiv-metadata-oai-snapshot.json', 'w') as f:\n", 100 | " f.writelines(map(lambda item: ujson.dumps(item) + '\\n', new_chemrxiv_article))\n" 101 | ] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Python 3 (ipykernel)", 107 | "language": "python", 108 | "name": "python3" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 3 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython3", 120 | "version": "3.9.7" 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 5 125 | } 126 | -------------------------------------------------------------------------------- /create_concepts/Concept_Corpus/s2_combine_all_preprint_metadate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "e3c27b65-fe14-4077-bb51-71bd7aea6e3b", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import json\n", 13 | "import linecache\n", 14 | "import time\n", 15 | "import jsonlines\n", 16 | "from datetime import datetime, date\n", 17 | "import pickle" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "id": "0701b2af-9795-49e0-a4ff-15b9470ac60e", 23 | "metadata": {}, 24 | "source": [ 25 | "## read biorxiv_json" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "id": "babf3851-6c70-46bd-8626-9b7d6801a3dd", 32 | "metadata": { 33 | "tags": [] 34 | }, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Current: 184839; Read biorxiv: 184839, Elapsed time: 2.6651909351348877 
seconds\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "all_paper_full_infos=[] ### store all papers from bioxiv, chem, med, arxiv \n", 46 | "\n", 47 | "biorxiv_json = 'biorxiv-metadata-oai-snapshot.json'\n", 48 | "starting_date = date(1990,1,1)\n", 49 | "start_time = time.time()\n", 50 | "\n", 51 | "with jsonlines.open(biorxiv_json, 'r') as f:\n", 52 | " for id_of_abstract, line in enumerate(f):\n", 53 | " get_date = datetime.strptime(line['date'], '%Y-%m-%d').date()\n", 54 | " paper_time = (get_date - starting_date).days\n", 55 | " all_paper_full_infos.append([line['server'],line['title'],line['abstract'],paper_time])\n", 56 | "\n", 57 | "num1=len(all_paper_full_infos)\n", 58 | "elapsed_time = time.time() - start_time\n", 59 | "print(f\"Current: {len(all_paper_full_infos)}; Read biorxiv: {len(all_paper_full_infos)}, Elapsed time: {elapsed_time} seconds\")" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "id": "2407d05e-e234-4761-9e46-6a8c9f550cd9", 65 | "metadata": {}, 66 | "source": [ 67 | "## read medrxiv_json" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "id": "fdc41bb7-fda8-4029-83ac-844071ee0134", 74 | "metadata": { 75 | "tags": [] 76 | }, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "Current: 224071; Read medrxiv: 39232, Elapsed time: 0.6739270687103271 seconds\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "\n", 88 | "medrxiv_json = 'medrxiv-metadata-oai-snapshot.json'\n", 89 | "\n", 90 | "start_time = time.time()\n", 91 | "with jsonlines.open(medrxiv_json, 'r') as f:\n", 92 | " for id_of_abstract, line in enumerate(f):\n", 93 | " get_date = datetime.strptime(line['date'], '%Y-%m-%d').date()\n", 94 | " paper_time = (get_date - starting_date).days\n", 95 | " all_paper_full_infos.append([line['server'],line['title'],line['abstract'],paper_time])\n", 96 | "\n", 97 | "num2=len(all_paper_full_infos)\n", 98 | "elapsed_time = time.time() - start_time\n", 99 | "print(f\"Current: {len(all_paper_full_infos)}; Read medrxiv: {len(all_paper_full_infos)-num1}, Elapsed time: {elapsed_time} seconds\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "id": "5e6151f3-eb2a-4eab-aad3-d50246eddb38", 105 | "metadata": {}, 106 | "source": [ 107 | "## read chemrxiv_json" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 4, 113 | "id": "a6b19d82-ffff-4c2a-ba37-57031da11cd8", 114 | "metadata": { 115 | "tags": [] 116 | }, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "Current: 240551; Read chemrxiv: 16480, Elapsed time: 0.25910282135009766 seconds\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "\n", 128 | "chemrxiv_json = 'chemrxiv-metadata-oai-snapshot.json'\n", 129 | "\n", 130 | "start_time = time.time()\n", 131 | "with jsonlines.open(chemrxiv_json, 'r') as f:\n", 132 | " for id_of_abstract, line in enumerate(f):\n", 133 | " get_date = datetime.strptime(line['date'][:10], '%Y-%m-%d').date()\n", 134 | " paper_time = (get_date - starting_date).days\n", 135 | " all_paper_full_infos.append([line['server'],line['title'],line['abstract'],paper_time])\n", 136 | "\n", 137 | "num3=len(all_paper_full_infos)\n", 138 | "elapsed_time = time.time() - start_time\n", 139 | "print(f\"Current: {len(all_paper_full_infos)}; Read chemrxiv: {len(all_paper_full_infos)-num2}, Elapsed time: {elapsed_time} seconds\")" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "id": "6b282f2f-561a-4453-b4cd-5240d062b8ee", 145 | 
"metadata": {}, 146 | "source": [ 147 | "## remove duplicates papers " 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 23, 153 | "id": "e0d91f02-5b36-4814-af9d-d67f8f9a8583", 154 | "metadata": { 155 | "tags": [] 156 | }, 157 | "outputs": [ 158 | { 159 | "name": "stdout", 160 | "output_type": "stream", 161 | "text": [ 162 | "remove duplicates: 28\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "## remove repeated papers (there are some repeated papers)\n", 168 | "paper_infos_unique = set(map(tuple, all_paper_full_infos)) # convert each sublist to a tuple and create a set\n", 169 | "all_paper_infos_unique = list(map(list, paper_infos_unique)) # convert each tuple back to a list and create a list\n", 170 | "\n", 171 | "print(f\"remove duplicates: {len(all_paper_full_infos)-len(all_paper_infos_unique)}\")\n", 172 | "\n", 173 | "with open('all_paper_info_lists_bio_med_chem.pkl', 'wb') as f:\n", 174 | " pickle.dump(all_paper_infos_unique, f)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "id": "40ecc36a-4de1-40de-a22a-152e2efc6488", 180 | "metadata": {}, 181 | "source": [ 182 | "## read arxiv_json " 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 24, 188 | "id": "136d5452-07e0-4d5f-813c-46009fffce65", 189 | "metadata": { 190 | "tags": [] 191 | }, 192 | "outputs": [ 193 | { 194 | "name": "stdout", 195 | "output_type": "stream", 196 | "text": [ 197 | "arxiv: {id_of_abstract}\n", 198 | "Current: 2444442; Read chemrxiv: 2203891, Elapsed time: 44.237696170806885 seconds\n" 199 | ] 200 | } 201 | ], 202 | "source": [ 203 | "start_time = time.time()\n", 204 | "arxiv_json = 'arxiv-metadata-oai-snapshot.json'\n", 205 | "\n", 206 | "with jsonlines.open(arxiv_json, 'r') as f:\n", 207 | " for id_of_abstract, line in enumerate(f):\n", 208 | " get_date = datetime.strptime(line['versions'][0]['created'], '%a, %d %b %Y %H:%M:%S %Z').date()\n", 209 | " paper_time = (get_date - starting_date).days\n", 210 | " all_paper_infos_unique.append(['arxiv',line['title'],line['abstract'],paper_time])\n", 211 | " \n", 212 | "elapsed_time = time.time() - start_time\n", 213 | "print(\"arxiv: {id_of_abstract}\")\n", 214 | "print(f\"Current: {len(all_paper_infos_unique)}; Read chemrxiv: {len(all_paper_infos_unique)-num3}, Elapsed time: {elapsed_time} seconds\")" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "id": "b7af89d0-aac4-4372-ab86-828ce493f0d9", 220 | "metadata": {}, 221 | "source": [ 222 | "## Store all the processed preprint metadata" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "id": "2345c8dd-8b5c-4a87-8ef3-4529581f2063", 229 | "metadata": { 230 | "tags": [] 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "import pickle\n", 235 | "with open('all_paper_info_lists.pkl', 'wb') as f:\n", 236 | " pickle.dump(all_paper_infos_unique, f)" 237 | ] 238 | } 239 | ], 240 | "metadata": { 241 | "kernelspec": { 242 | "display_name": "Python 3 (ipykernel)", 243 | "language": "python", 244 | "name": "python3" 245 | }, 246 | "language_info": { 247 | "codemirror_mode": { 248 | "name": "ipython", 249 | "version": 3 250 | }, 251 | "file_extension": ".py", 252 | "mimetype": "text/x-python", 253 | "name": "python", 254 | "nbconvert_exporter": "python", 255 | "pygments_lexer": "ipython3", 256 | "version": "3.9.7" 257 | } 258 | }, 259 | "nbformat": 4, 260 | "nbformat_minor": 5 261 | } 262 | -------------------------------------------------------------------------------- 
/create_concepts/Concept_Corpus/s3_get_concepts.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "17001bd9-da10-4fe7-a3d4-9c5174944296", 6 | "metadata": {}, 7 | "source": [ 8 | "### load all the processed preprint papers " 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "18ebad34-ea44-4336-8665-ceabec4c5371", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "import pickle\n", 20 | "\n", 21 | "if os.path.exists('all_paper_info_lists.pkl'):\n", 22 | " # open the existing pickle file for reading\n", 23 | " with open('all_paper_info_lists.pkl', 'rb') as f:\n", 24 | " all_paper_lists = pickle.load(f)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "89c4475b-b22e-417d-8f74-64045ea27f90", 30 | "metadata": {}, 31 | "source": [ 32 | "### put title and abstract together, store in to string list" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "bd905ebe-207c-4f63-b869-5e4434343ce4", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "def get_single_article_string(article):\n", 43 | " \n", 44 | " curr_title=article[1] #'title'\n", 45 | " curr_abstract=article[2] #'abstract'\n", 46 | " \n", 47 | " replace_pairs=[['\\n',' '],['-',' '],[' \\\" a','oa'],['\\\" a','ae'],['\\\"a','ae'],[' \\\" o','oe'],['\\\" o','oe'],['\\\"o','oe'],[' \\\" u','ue'],\n", 48 | " ['\\\" u','ue'],['\\\"u','ue'],[' \\' a','a'],[' \\' e','e'],[' \\' o','o'],[\"\\' \", \"\"],[\"\\'\", \"\"],[' ',' '],[' ',' ']]\n", 49 | " \n", 50 | " article_string=(curr_title +' '+ curr_abstract).lower()\n", 51 | " \n", 52 | " for rep_pair in replace_pairs:\n", 53 | " #print(rep_pair)\n", 54 | " \n", 55 | " article_string=article_string.replace(rep_pair[0],rep_pair[1])\n", 56 | " #print(article_string)\n", 57 | " #print('\\n')\n", 58 | " \n", 59 | " return article_string\n", 60 | "\n", 61 | "def get_all_paper_strings(article_lists):\n", 62 | "\n", 63 | " if os.path.exists('all_paper_string_lists.pkl'):\n", 64 | " with open(\"all_paper_string_lists.pkl\", \"rb\") as f:\n", 65 | " all_paper_strings = pickle.load(f)\n", 66 | " \n", 67 | " else:\n", 68 | " all_paper_strings=[]\n", 69 | " cc=0\n", 70 | " for id_of_paper in range(len(article_lists)):\n", 71 | " cc+=1\n", 72 | " if (cc%300000)==0:\n", 73 | " print(str(cc)+'/'+str(len(article_lists)))\n", 74 | "\n", 75 | " all_paper_strings.append(get_single_article_string(article_lists[id_of_paper]))\n", 76 | "\n", 77 | " with open(\"all_paper_string_lists.pkl\", \"wb\") as f:\n", 78 | " pickle.dump(all_paper_strings, f)\n", 79 | " \n", 80 | " return all_paper_strings\n", 81 | "\n", 82 | "\n", 83 | "\n", 84 | "all_article_strings=get_all_paper_strings(all_paper_lists)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "4be3b05e-a8eb-4e1a-a4a6-273f702eac16", 90 | "metadata": {}, 91 | "source": [ 92 | "### Get Concepts from RAKE" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "845be865-8db6-4956-ade5-918b191954dd", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "import time\n", 103 | "import pickle\n", 104 | "import nltk\n", 105 | "from nltk.corpus import stopwords\n", 106 | "from nltk.stem import WordNetLemmatizer\n", 107 | "from rake_nltk import Metric, Rake\n", 108 | "from collections import Counter\n", 109 | "\n", 110 | "starting_time = time.time()\n", 111 | " \n", 112 | 
"wnl=WordNetLemmatizer()\n", 113 | "\n", 114 | "num_of_abstracts=len(all_paper_lists)\n", 115 | "\n", 116 | "personal_stop_list=['presents','us','show','one','two','three','describes','new','approach','many','introduces','http','also','whose', 'prove','select ','take']\n", 117 | "\n", 118 | "nltk_stop_list=nltk.corpus.stopwords.words('english')\n", 119 | "full_stop_list=nltk_stop_list + personal_stop_list\n", 120 | "\n", 121 | "\n", 122 | "all_concepts_from_rake=[]\n", 123 | "cc=0\n", 124 | "for id_of_abstract in range(num_of_abstracts):\n", 125 | " cc+=1\n", 126 | " if (cc%100000)==0:\n", 127 | " print(str(cc)+'/'+str(num_of_abstracts))\n", 128 | " \n", 129 | " \n", 130 | " single_string = get_single_article_string(all_paper_lists[id_of_abstract])\n", 131 | " \n", 132 | " r = Rake(stopwords=full_stop_list, ranking_metric=Metric.WORD_DEGREE, min_length=2, include_repeated_phrases=False)\n", 133 | "\n", 134 | " r.extract_keywords_from_text(single_string)\n", 135 | " ll=r.get_ranked_phrases_with_scores()\n", 136 | " \n", 137 | " all_concepts_from_rake.extend(ll)\n", 138 | "\n", 139 | "\n", 140 | "with open(\"all_concepts_from_rake.pkl\", \"wb\") as output_file:\n", 141 | " pickle.dump(all_concepts_from_rake, output_file)\n", 142 | " \n" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "954d5a11-4106-42e6-aedc-4b2a02c34b17", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [] 152 | } 153 | ], 154 | "metadata": { 155 | "kernelspec": { 156 | "display_name": "asl_semnet", 157 | "language": "python", 158 | "name": "asl_semnet" 159 | }, 160 | "language_info": { 161 | "codemirror_mode": { 162 | "name": "ipython", 163 | "version": 3 164 | }, 165 | "file_extension": ".py", 166 | "mimetype": "text/x-python", 167 | "name": "python", 168 | "nbconvert_exporter": "python", 169 | "pygments_lexer": "ipython3", 170 | "version": "3.10.9" 171 | } 172 | }, 173 | "nbformat": 4, 174 | "nbformat_minor": 5 175 | } 176 | -------------------------------------------------------------------------------- /create_concepts/Domain_Concept/s0_prepare_optics_quantum_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "34f3f857-ce5c-4779-8209-ca0fb47340f6", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import json\n", 13 | "import linecache\n", 14 | "import time\n", 15 | "import jsonlines\n", 16 | "from datetime import datetime, date\n", 17 | "import pickle\n", 18 | "import os" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "id": "2d360712-7cc0-4692-b5fc-69e2e6d2534d", 25 | "metadata": { 26 | "tags": [] 27 | }, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "sub arxiv: 2227429\n", 34 | "Quantum and Optics: 78084; Modified: 78084, Elapsed time: 20.9117271900177 seconds\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "starting_date = date(1990,1,1)\n", 40 | "start_time = time.time()\n", 41 | "\n", 42 | "arxiv_folder='arxiv-snapshot'\n", 43 | "arxiv_json = os.path.join(arxiv_folder,\"arxiv-metadata-oai-snapshot.json\")\n", 44 | "\n", 45 | "arxiv_optics_quantum_original=[]\n", 46 | "arxiv_optics_quantum_modified=[]\n", 47 | "\n", 48 | "with jsonlines.open(arxiv_json, 'r') as f:\n", 49 | " for id_of_abstract, line in enumerate(f):\n", 50 | " if line['categories'] in ['physics.optics','quant-ph']:\n", 51 | " 
arxiv_optics_quantum_original.append(line) ## store the original one\n", 52 | " \n", 53 | " get_date = datetime.strptime(line['versions'][0]['created'], '%a, %d %b %Y %H:%M:%S %Z').date()\n", 54 | " paper_time = (get_date - starting_date).days\n", 55 | " arxiv_optics_quantum_modified.append([line['categories'],line['title'],line['abstract'],paper_time]) ## store modified one\n", 56 | " \n", 57 | "elapsed_time = time.time() - start_time\n", 58 | "print(f\"sub arxiv: {id_of_abstract}\")\n", 59 | "print(f\"Quantum and Optics: {len(arxiv_optics_quantum_original)}; Modified: {len(arxiv_optics_quantum_modified)}, Elapsed time: {elapsed_time} seconds\")" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "id": "431e166c-9e21-4ecf-b741-021fcd919c94", 66 | "metadata": { 67 | "tags": [] 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "store_folder='data'\n", 72 | "\n", 73 | "with open(os.path.join(store_folder,'arxiv_optics_quantum_original.pkl'), 'wb') as f:\n", 74 | " pickle.dump(arxiv_optics_quantum_original, f)\n", 75 | " \n", 76 | "with open(os.path.join(store_folder,'arxiv_optics_quantum_style_modified.pkl'), 'wb') as f:\n", 77 | " pickle.dump(arxiv_optics_quantum_modified, f)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "id": "68c806ae-6789-48b5-ac54-725e73efda08", 84 | "metadata": { 85 | "tags": [] 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "with open(os.path.join(store_folder,'arxiv_optics_quantum_original.json'), 'w') as f:\n", 90 | " json.dump(arxiv_optics_quantum_original, f)\n", 91 | " \n", 92 | "with open(os.path.join(store_folder,'arxiv_optics_quantum_style_modified.json'), 'w') as f:\n", 93 | " json.dump(arxiv_optics_quantum_modified, f)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "id": "cacfa9ab-f568-477b-b7de-cd3b77f7ffd3", 99 | "metadata": {}, 100 | "source": [ 101 | "## make only strings (title+abstract)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 5, 107 | "id": "f2a75bed-b673-4e93-a7e0-775043857bab", 108 | "metadata": { 109 | "tags": [] 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "# ## (Read the modified metadata; [source, title, abstract, time])\n", 114 | "### (Make each article in string, under certain replacements)\n", 115 | "\n", 116 | "def get_single_article_string(article):\n", 117 | " \n", 118 | " curr_title=article[1] #'title'\n", 119 | " curr_abstract=article[2] #'abstract'\n", 120 | " \n", 121 | " replace_pairs=[['\\n',' '],['-',' '],[' \\\" a','oa'],['\\\" a','ae'],['\\\"a','ae'],[' \\\" o','oe'],['\\\" o','oe'],['\\\"o','oe'],[' \\\" u','ue'],\n", 122 | " ['\\\" u','ue'],['\\\"u','ue'],[' \\' a','a'],[' \\' e','e'],[' \\' o','o'],[\"\\' \", \"\"],[\"\\'\", \"\"],[' ',' '],[' ',' ']]\n", 123 | " \n", 124 | " article_string=(curr_title +' '+ curr_abstract).lower()\n", 125 | " \n", 126 | " for rep_pair in replace_pairs:\n", 127 | " #print(rep_pair)\n", 128 | " \n", 129 | " article_string=article_string.replace(rep_pair[0],rep_pair[1])\n", 130 | " #print(article_string)\n", 131 | " #print('\\n')\n", 132 | " \n", 133 | " return article_string\n", 134 | "\n", 135 | "\n", 136 | "def get_all_paper_strings(article_lists, folder_file):\n", 137 | "\n", 138 | " if os.path.exists(os.path.join(folder_file,'arxiv_optics_quantum_paper_strings.pkl')):\n", 139 | " with open(os.path.join(folder_file,'arxiv_optics_quantum_paper_strings.pkl'), \"rb\") as f:\n", 140 | " all_paper_strings = pickle.load(f) # load into all_paper_strings so the return below also works when the cached file exists\n", 141 | " \n", 142 | " else:\n", 143 
| " all_paper_strings=[]\n", 144 | " cc=0\n", 145 | " for id_of_paper in range(len(article_lists)):\n", 146 | " cc+=1\n", 147 | " #if (cc%3000)==0:\n", 148 | " #print(str(cc)+'/'+str(len(article_lists)))\n", 149 | "\n", 150 | " all_paper_strings.append(get_single_article_string(article_lists[id_of_paper]))\n", 151 | "\n", 152 | " with open(os.path.join(folder_file,'arxiv_optics_quantum_paper_strings.pkl'), \"wb\") as f:\n", 153 | " pickle.dump(all_paper_strings, f)\n", 154 | " \n", 155 | " return all_paper_strings \n", 156 | "\n", 157 | "\n", 158 | "all_article_strings=get_all_paper_strings(arxiv_optics_quantum_modified,folder_file=\"data\")" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "id": "121b2bb6-2385-4112-b61b-b0f9f9f8494b", 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Python 3 (ipykernel)", 173 | "language": "python", 174 | "name": "python3" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.10.9" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 5 191 | } 192 | -------------------------------------------------------------------------------- /create_concepts/Domain_Concept/s1_split_domain_papers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from datetime import datetime, date 4 | import pickle 5 | import os 6 | import math 7 | 8 | log_folder='logs' 9 | if not os.path.exists(log_folder): 10 | os.makedirs(log_folder) 11 | 12 | 13 | folder_name="data_seperate" 14 | if not os.path.exists(folder_name): 15 | os.makedirs(folder_name) 16 | 17 | data_folder="Concept_Corpus" 18 | if not os.path.exists(data_folder): 19 | os.makedirs(data_folder) 20 | 21 | with open(os.path.join(data_folder,'arxiv_optics_quantum_paper_strings.pkl'), "rb") as f: 22 | get_all_paper_strings = pickle.load(f) 23 | 24 | log_file = os.path.join(log_folder, 'split_papers_log.txt') 25 | with open(log_file, 'a') as f: 26 | f.write(f"Seperate Optics and Quantum Papers: {len(get_all_paper_strings)}\n") 27 | 28 | 29 | # Determine the number of parts needed 30 | num_parts = math.ceil(len(get_all_paper_strings) / 1000) 31 | 32 | # Store 1000 elements in each part file 33 | for i in range(num_parts): 34 | time_starting=time.time() 35 | start_idx = i * 1000 36 | end_idx = min((i+1)*1000, len(get_all_paper_strings)) 37 | part_data = get_all_paper_strings[start_idx : end_idx] 38 | part_file = os.path.join(folder_name, f'part_{i:02}.pkl') 39 | with open(part_file, 'wb') as f: 40 | pickle.dump(part_data, f) 41 | elapsed_time = time.time() - time_starting 42 | with open(log_file, 'a') as f: 43 | f.write(f"{i}: {(i+1)/num_parts}; Elapsed time: {elapsed_time} seconds \n") -------------------------------------------------------------------------------- /create_concepts/Domain_Concept/s2_get_domain_concepts.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from datetime import datetime, date 4 | import pickle 5 | import os 6 | import random 7 | 8 | 9 | if __name__ == '__main__': 10 | 11 | log_folder='logs' 12 | try: 13 | os.mkdir(log_folder) 14 | except FileExistsError: 15 | pass 16 | 17 | 
data_folder="Concept_Corpus" 18 | data_seperate_folder="data_seperate" 19 | 20 | 21 | concept_folder="concept_seperate" 22 | try: 23 | os.mkdir(concept_folder) 24 | except FileExistsError: 25 | pass 26 | 27 | 28 | concept_list_pkl = os.path.join(data_folder,'full_concept_list.pkl') 29 | 30 | with open(concept_list_pkl, 'rb') as file: 31 | all_concept_lists = pickle.load(file) 32 | 33 | 34 | random.seed() 35 | total_file=78 36 | write_file=0 37 | cc=0 38 | while write_file <= total_file: 39 | 40 | curr_ID = random.randint(0, total_file) 41 | formatted_ID = '{:02d}'.format(curr_ID) 42 | data_file=os.path.join(data_seperate_folder, f'part_{formatted_ID}.pkl') 43 | 44 | concept_file=os.path.join(concept_folder, f'concept_{formatted_ID}.pkl') 45 | 46 | 47 | log_file = os.path.join('logs', 'log_'+formatted_ID+'.txt') 48 | if cc % 10 == 0: 49 | with open(log_file, 'a') as f: 50 | f.write(f'formatted_ID: {formatted_ID}; cc: {cc}, write_file num: {write_file}\n') 51 | cc+=1 52 | 53 | if not os.path.exists(concept_file): 54 | 55 | concepts_for_paper_list=[] 56 | 57 | with open(data_file, 'rb') as file: # read paper 58 | paper_info = pickle.load(file) 59 | 60 | concepts_at_least_one=[] 61 | 62 | # check all papers 63 | for id_paper, cur_paper in enumerate(paper_info): 64 | 65 | concepts_for_single_paper = [] 66 | 67 | for id_concept, cur_concept in enumerate(all_concept_lists): 68 | 69 | if cur_concept in cur_paper: # if the paper contains the concept 70 | concepts_for_single_paper.append(cur_concept) 71 | 72 | concepts_at_least_one.extend(concepts_for_single_paper) ## store the concepts from one paper 73 | 74 | 75 | finish_flag=0 76 | with open(concept_file, "wb") as output_file: 77 | pickle.dump(concepts_at_least_one, output_file) 78 | write_file+=1 79 | 80 | 81 | -------------------------------------------------------------------------------- /create_concepts/Domain_Concept/s3_merge_concepts.py: -------------------------------------------------------------------------------- 1 | # + 2 | import pickle 3 | import os 4 | import time 5 | from datetime import datetime, date 6 | import random 7 | 8 | 9 | # Create SemNet 10 | 11 | if __name__ == '__main__': 12 | 13 | 14 | concept_folder="concept_seperate" 15 | 16 | 17 | total_file=78 18 | 19 | ## finish all 20 | all_concepts_file = os.path.join(concept_folder,'all_concepts.pkl') # edges 21 | 22 | #if not os.path.exists(all_concepts_file1): 23 | 24 | all_concepts=[] 25 | 26 | for id_file in range(total_file+1): # start from 0: 0-78 27 | 28 | file_ID = '{:02d}'.format(id_file) 29 | cur_concept_file=os.path.join(concept_folder, f'concept_{file_ID}.pkl') 30 | 31 | 32 | with open(cur_concept_file, 'rb') as file: 33 | concept_info = pickle.load(file) 34 | 35 | all_concepts.extend(concept_info) 36 | 37 | with open(all_concepts_file, "wb") as output_file: 38 | pickle.dump(all_concepts, output_file) 39 | 40 | 41 | -------------------------------------------------------------------------------- /create_concepts/Domain_Concept/s4_improve_concepts.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "fbff53e8-c63a-4997-9761-b1018ca5c42e", 6 | "metadata": {}, 7 | "source": [ 8 | "##### improve domain concepts " 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "d1317372", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pickle\n", 19 | "import os\n", 20 | "import time\n", 21 | "from datetime import datetime, 
date\n", 22 | "import nltk\n", 23 | "from nltk.corpus import stopwords\n", 24 | "from nltk.stem import WordNetLemmatizer\n", 25 | "from rake_nltk import Metric, Rake\n", 26 | "from collections import Counter\n", 27 | "import re\n", 28 | "from nltk.corpus import wordnet\n", 29 | "import random\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "id": "6f05270b-b134-44e8-9bb8-5a5642b41755", 35 | "metadata": {}, 36 | "source": [ 37 | "##### store" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "id": "bf5d512d-2f85-4782-a665-cbc9a483a42d", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "concept_folder=\"concept_seperate\"\n", 48 | "## finish all \n", 49 | "all_concepts_file = os.path.join(concept_folder,'all_concepts.pkl') # edges\n", 50 | "with open(all_concepts_file, \"rb\") as output_file:\n", 51 | " all_concepts=pickle.load(output_file)\n", 52 | " \n", 53 | "## remove repeated concepts\n", 54 | "unique_concepts = list(set(all_concepts))\n", 55 | "concepts_file='full_domain_concepts.txt' # rename 'full_concepts_form_openalex.txt'\n", 56 | "f = open(concepts_file, \"a\")\n", 57 | "for ii in range(len(unique_concepts)):\n", 58 | " f.write(unique_concepts[ii]+'\\n')\n", 59 | "f.close()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "id": "8433c817", 65 | "metadata": {}, 66 | "source": [ 67 | "##### read the concepts file" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 2, 73 | "id": "5ad70368", 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "17-04-2023 12:06:38; Concepts: 80675 \n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | " \n", 86 | "if os.path.exists(concepts_file):\n", 87 | " # open the existing file for reading \n", 88 | " with open(concepts_file, \"r\") as f:\n", 89 | " modify_full_concept_list = [line.rstrip() for line in f.readlines()]\n", 90 | " \n", 91 | " now_time = datetime.now()\n", 92 | " formatted_time = now_time.strftime(\"%d-%m-%Y %H:%M:%S\")\n", 93 | " print(\"{}; Concepts: {:d} \".format(formatted_time,len(modify_full_concept_list)))" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "id": "421f5569", 99 | "metadata": {}, 100 | "source": [ 101 | "##### filter concepts" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 3, 107 | "id": "464d27f2", 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | "Concepts: 80675 ; Store: 80642; Remove: 33 \n", 115 | "Elapsed time: 0.34 seconds\n", 116 | "17-04-2023 12:06:46; Concepts: 80642 \n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "\n", 122 | "starting_time = time.time()\n", 123 | "\n", 124 | "filter_concept_any=['held','equal','dramatic','slowing','excited','occupied','charged','moving','layer','bi','argument','intuition','experiment','entirely','essentially','built','necessary','take','applicable','employ','visit','visited','herein','facilitates','varying','overlapping','addressed','issues','related','add','adds','dominant','preserve','preserves','preserved','stabilizing','match','manipulating','emerging','processed','data','continuously','analytically','argue','smoothly','connect','connects','connecting','software','matlab','toolbox','standard','industrial','technology','success','equipment','call','analogous','sense','persist','persists','throughout','calculated','useful','difficult','proved']\n", 125 | "\n", 126 | 
"filter_concept_start=['sophisticated','precise','remarkably','consists','gradually','simplified','complete','techniques','partially','presented','iterative','simple','preparation','clear','priori','ae','substantial','sending','protecting','optimized','optimize','optimizing','transmits','transmit','transmitting','transmitted','processing','pre','collect','collected','measured','varied','operating','algorithms','algorithm','robustly','shall','concept','packing','successful','apparent','apparently','readily','adapted','todays','imperfect','seemingly','seeming','shelf','properties','mechanism','phenomenon','behavior','theorem','procedure','usual','form','later','calculating','fundamentally']\n", 127 | "\n", 128 | "filter_concept_end=['illustrates','setup','consisting','set','capable','configuration','complete','borrowed','permit','utilizes','referred','refer','capable','pave','stem','preparation','scheme','optimizes','transmitted','transmit','operating','relate','packed','packing','platform','industry','adapt','adapts','adapted','arrangement','era','device','arrange','arranged','content','procedure','outlined','form','formed','followed','following','calculation']\n", 129 | "\n", 130 | "\n", 131 | "concept_to_remove_pair=['self']\n", 132 | "concept_to_keep_pair=['stabilization']\n", 133 | "\n", 134 | "conditioned_filter_concept_any5=['open']\n", 135 | "conditioned_filter_concept_any3=['driven','component']\n", 136 | "conditioned_filter_concept_any2=['probe','inspired','technique','open','added','transfer','connected','element','exchange']\n", 137 | "\n", 138 | "conditioned_filter_concept_start2=['doubly','probe']\n", 139 | "conditioned_filter_concept_end2=[]\n", 140 | "\n", 141 | "forbidden_continued_strings=['complete measurement','exact numerical','numerical technique','numerical method','complete set','pure entangled','quantum entangled','high fidelity']\n", 142 | "\n", 143 | "improve_full_concept_list=[]\n", 144 | "\n", 145 | "for one_concept in modify_full_concept_list:\n", 146 | " \n", 147 | " separated_words=one_concept.split()\n", 148 | " do_remove=0\n", 149 | " for word in separated_words:\n", 150 | " if word in filter_concept_any:\n", 151 | " do_remove=1\n", 152 | " break\n", 153 | " \n", 154 | " if len(separated_words)<5: ## only for 5 words\n", 155 | " if word in conditioned_filter_concept_any5:\n", 156 | " do_remove=1\n", 157 | " break\n", 158 | "\n", 159 | " if len(separated_words)<=3:\n", 160 | " if word in conditioned_filter_concept_any3:\n", 161 | " do_remove=1\n", 162 | " break\n", 163 | " \n", 164 | " if len(separated_words)==2: ## only for 2 words\n", 165 | " if word in conditioned_filter_concept_any2:\n", 166 | " do_remove=1\n", 167 | " break\n", 168 | "\n", 169 | " \n", 170 | " \n", 171 | " if separated_words[0] in filter_concept_start:\n", 172 | " do_remove=1\n", 173 | " if separated_words[-1] in filter_concept_end:\n", 174 | " do_remove=1\n", 175 | " \n", 176 | " if len(separated_words)==2:\n", 177 | " if separated_words[0] in conditioned_filter_concept_start2: #check the start word \n", 178 | " do_remove=1\n", 179 | " if separated_words[-1] in conditioned_filter_concept_end2: #check the last word \n", 180 | " do_remove=1\n", 181 | "\n", 182 | " if do_remove==0:\n", 183 | " for word in forbidden_continued_strings:\n", 184 | " if word in one_concept:\n", 185 | " do_remove=1\n", 186 | " break\n", 187 | "\n", 188 | " if do_remove==0:\n", 189 | " improve_full_concept_list.append(one_concept)\n", 190 | " \n", 191 | "print(\"Concepts: {:d} ; Store: {:d}; Remove: {:d} 
\".format(len(modify_full_concept_list), len(improve_full_concept_list),len(modify_full_concept_list)-len(improve_full_concept_list)))\n", 192 | "elapsed_time = time.time() - starting_time\n", 193 | "print(\"Elapsed time: {:.2f} seconds\".format(elapsed_time))\n", 194 | "\n", 195 | "now_time = datetime.now()\n", 196 | "formatted_time = now_time.strftime(\"%d-%m-%Y %H:%M:%S\")\n", 197 | "print(\"{}; Concepts: {:d} \".format(formatted_time,len(improve_full_concept_list)))" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "id": "736ce8e2", 203 | "metadata": {}, 204 | "source": [ 205 | "##### restore the file" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 4, 211 | "id": "2cd82ff3-5ffd-4988-83cf-d59fccf32e8e", 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "name": "stdout", 216 | "output_type": "stream", 217 | "text": [ 218 | "txt has been deleted.\n", 219 | "re-create text and store information.\n", 220 | "17-04-2023 12:06:52; Concepts: 80642 \n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "\n", 226 | "# Delete the orginal txt and re-create a new one with the improved concepts \n", 227 | "if os.path.exists(concepts_file):\n", 228 | " os.remove(concepts_file)\n", 229 | " print(\"txt has been deleted.\")\n", 230 | "\n", 231 | " # re-Create the text file \n", 232 | " f = open(concepts_file, \"a\")\n", 233 | " for ii in range(len(improve_full_concept_list)):\n", 234 | " f.write(improve_full_concept_list[ii]+'\\n')\n", 235 | " f.close()\n", 236 | " print(\"re-create text and store information.\") \n", 237 | "else:\n", 238 | " f = open(concepts_file, \"a\")\n", 239 | " for ii in range(len(improve_full_concept_list)):\n", 240 | " f.write(improve_full_concept_list[ii]+'\\n')\n", 241 | " f.close()\n", 242 | " print(\"create text and store information.\")\n", 243 | " \n", 244 | "now_time = datetime.now()\n", 245 | "formatted_time = now_time.strftime(\"%d-%m-%Y %H:%M:%S\")\n", 246 | "print(\"{}; Concepts: {:d} \".format(formatted_time,len(improve_full_concept_list)))\n" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "id": "55cc750e", 252 | "metadata": {}, 253 | "source": [ 254 | "##### additionally, store a pkl file (as a backup)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "id": "1987ab1c", 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "concepts_path_pkl='improved_concepts_form_openalex.pkl'\n", 265 | "with open(concepts_path_pkl, \"wb\") as output_file:\n", 266 | " pickle.dump(improve_full_concept_list, output_file)" 267 | ] 268 | } 269 | ], 270 | "metadata": { 271 | "kernelspec": { 272 | "display_name": "Python 3 (ipykernel)", 273 | "language": "python", 274 | "name": "python3" 275 | }, 276 | "language_info": { 277 | "codemirror_mode": { 278 | "name": "ipython", 279 | "version": 3 280 | }, 281 | "file_extension": ".py", 282 | "mimetype": "text/x-python", 283 | "name": "python", 284 | "nbconvert_exporter": "python", 285 | "pygments_lexer": "ipython3", 286 | "version": "3.9.7" 287 | } 288 | }, 289 | "nbformat": 4, 290 | "nbformat_minor": 5 291 | } 292 | -------------------------------------------------------------------------------- /create_concepts/Domain_Concept/s5_improve_manually_concepts.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, date 3 | 4 | ## repeated initial_num=0,1,2,...., start from 0, the second run will be 1, etc. 
5 | # the new_file_name is the final filtered concepts 6 | initial_num=0 7 | file_name='full_domain_concepts_'+str(initial_num)+'.txt' #'full_concepts_for_openalex_'+str(initial_num)+'.txt' 8 | curr_file = os.path.join("full_concepts",file_name) 9 | new_concept_list=[] 10 | 11 | with open(curr_file, 'r') as file: 12 | lines = file.readlines() 13 | 14 | concept_count=0 15 | for idx, cc in enumerate(lines): 16 | #if cc[0]!='-': 17 | if "-" not in cc: 18 | new_concept_list.append(cc) 19 | concept_count+=1 20 | 21 | now_time = datetime.now() 22 | formatted_time = now_time.strftime("%d-%m-%Y %H:%M:%S") 23 | print(f"{formatted_time}, Concepts: {concept_count} ; Remove: {idx-concept_count+1} ") 24 | 25 | new_num=initial_num+1 26 | new_file_name='full_domain_concepts_'+str(new_num)+'.txt' # the final concept file will be renamed as full_domain_concepts.txt 27 | ### 28 | with open(new_file_name, 'w') as file: 29 | for item in new_concept_list: 30 | file.write(f"{item}") -------------------------------------------------------------------------------- /create_dynamic_concepts/get_concept_citation.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import gzip 3 | import json 4 | import os 5 | import time 6 | from datetime import datetime, date 7 | import pickle 8 | from functools import reduce 9 | import random 10 | import re 11 | 12 | 13 | def get_single_article_string(article): 14 | 15 | curr_title=article['title'] 16 | abstract_inverted_index = article['abstract_inverted_index'] 17 | 18 | # Flatten the inverted index into a list of (position, word) tuples 19 | position_word_list = [(position, word) for word, positions in abstract_inverted_index.items() for position in positions] 20 | 21 | # Sort the list by position and extract the words 22 | sorted_abstract = sorted(position_word_list) 23 | curr_abstract = ' '.join(word for position, word in sorted_abstract) 24 | 25 | # Replace strings according to the replace_pairs list 26 | replace_pairs=[['\n',' '],['-',' '],[' \" a','oa'],['\" a','ae'],['\"a','ae'],[' \" o','oe'],['\" o','oe'],['\"o','oe'],[' \" u','ue'],['\" u','ue'],['\"u','ue'],[' \' a','a'],[' \' e','e'],[' \' o','o'],["\' ", ""],["\'", ""],[' ',' '],[' ',' ']] 27 | 28 | article_string=(curr_title +' '+ curr_abstract).lower() 29 | article_string = reduce(lambda text, pair: text.replace(pair[0], pair[1]), replace_pairs, article_string) 30 | 31 | return article_string 32 | 33 | # Define a sorting key function to extract the date and part number from the path 34 | def get_date_and_part_from_path(path): 35 | date_folder = os.path.dirname(path) 36 | date_str = date_folder.split('=')[-1] 37 | 38 | file_name = os.path.basename(path) 39 | part_str = file_name.split('_')[-1].split('.')[0] 40 | 41 | return date_str, int(part_str) 42 | 43 | def extract_id(filename): 44 | match = re.search(r'log_concept_part_(\d+)_', filename) 45 | if match: 46 | return int(match.group(1)) 47 | return None 48 | 49 | 50 | # define a log foler 51 | log_folder = 'logs_concept' 52 | # define edge_list foler 53 | vertex_folder = 'concept_citation' 54 | vertex_folder_log = 'concept_citation_log' 55 | 56 | try: 57 | if not os.path.exists(log_folder): 58 | os.makedirs(log_folder) 59 | 60 | if not os.path.exists(vertex_folder): 61 | os.makedirs(vertex_folder) 62 | 63 | if not os.path.exists(vertex_folder_log): 64 | os.makedirs(vertex_folder_log) 65 | 66 | except FileExistsError: 67 | pass 68 | 69 | 70 | data_folder="data_concept_graph" 71 | cwd = os.getcwd() 72 | parent_dir 
= os.path.dirname(cwd) 73 | concept_folder = os.path.join(parent_dir, data_folder) 74 | 75 | 76 | #project_path="/u/xmgu/projects/semnet_openalex" 77 | #base_folder=os.path.join(project_path,'openalex_workdata_filtered/data/works/') 78 | 79 | # Define the base folder, date pattern and file pattern 80 | base_folder = 'openalex_workdata_filtered/data/works/' 81 | date_pattern = 'updated_date=*' 82 | file_pattern = 'filtered_part_*.gz' 83 | 84 | # Find all the files matching the pattern 85 | file_paths = glob.glob(f'{base_folder}/{date_pattern}/{file_pattern}') 86 | # Sort the file_paths list in ascending order based on the date and part number 87 | file_paths = sorted(file_paths, key=get_date_and_part_from_path) 88 | 89 | # Define the date range or specific folders to include 90 | start_date = datetime.strptime("2022-12-20", "%Y-%m-%d") 91 | end_date = datetime.strptime("2023-03-28", "%Y-%m-%d") 92 | 93 | # Filter the file_paths list based on the date range or specific folders 94 | curr_run_file_paths = [path for path in file_paths if start_date <= datetime.strptime(get_date_and_part_from_path(path)[0], "%Y-%m-%d") <= end_date] 95 | 96 | rnd_time=random.random()*50 97 | time.sleep(rnd_time) 98 | 99 | # Read all concepts from full_final_concepts/full_domain_concept.txt 100 | concepts_files = os.path.join(concept_folder, 'full_domain_concept.txt') 101 | with open(concepts_files, 'r') as file: 102 | full_concepts = [concept.strip() for concept in file.readlines()] 103 | 104 | # Define a list to store the vertex lists 105 | paper_starting_date = date(1990,1,1) 106 | 107 | write_file=0 108 | 109 | rnd_time=random.random()*60 110 | time.sleep(rnd_time) 111 | 112 | while write_file <=len(curr_run_file_paths): 113 | 114 | curr_ID = random.randint(0, len(curr_run_file_paths)-1) # get a random number between 0 and the number of files 115 | 116 | formatted_ID = '{:03d}'.format(curr_ID) 117 | 118 | edge_file=os.path.join(vertex_folder, 'concept_part_'+formatted_ID+'.gz') 119 | edge_file_log=os.path.join(vertex_folder_log, 'concept_part_'+formatted_ID+'.txt') 120 | log_file_txt=os.path.join(log_folder, 'log_concept_part_'+formatted_ID+'.txt') 121 | log_file_txt_finish=os.path.join(log_folder, 'log_concept_part_'+formatted_ID+'_finish.txt') 122 | log_file_txt_empty=os.path.join(log_folder, 'log_concept_part_'+formatted_ID+'_empty.txt') 123 | 124 | if not os.path.exists(log_file_txt): 125 | current_time=datetime.now() 126 | open(log_file_txt, 'a').close() 127 | 128 | file_path=curr_run_file_paths[curr_ID] 129 | with open(log_file_txt, 'a') as log_file: 130 | log_file.write(f'Current time: {current_time}; Number of files: {len(curr_run_file_paths)}; Number of concepts: {len(full_concepts)}\n\n') 131 | log_file.write(f'Start the File: {file_path}; Current time: {datetime.now()} \n\n') 132 | 133 | with gzip.open(file_path, 'rt') as file: 134 | lines = file.readlines() 135 | 136 | if not lines: # if lines is empty 137 | print(f'File {file_path} is empty') 138 | write_file+=1 139 | with open(log_file_txt_empty, 'a') as log_file: 140 | log_file.write(f'Current File: {file_path}; Paper: {len(lines)}; File is Empty!\n') 141 | 142 | else: 143 | edge_lists=[] 144 | for id_line, line in enumerate(lines): 145 | time_start_line=time.time() 146 | 147 | article_object = json.loads(line) # Load the JSON object 148 | get_date = datetime.strptime(article_object['publication_date'], "%Y-%m-%d").date() 149 | curr_paper_time = (get_date - paper_starting_date).days 150 | curr_all_citations=article_object['cited_by_count'] 151 | 
curr_citations_per_year=article_object['counts_by_year'] 152 | curr_article=get_single_article_string(article_object) 153 | 154 | 155 | # Check if the article contains any of the concepts 156 | concepts_for_single_paper=[] 157 | for id_concept, concept in enumerate(full_concepts): 158 | if concept in curr_article: # if the paper contains the concept; then store its concept index 159 | concepts_for_single_paper.append(id_concept) 160 | 161 | for ii in range(len(concepts_for_single_paper)): 162 | edge_lists.append([concepts_for_single_paper[ii],curr_paper_time,curr_all_citations,curr_citations_per_year]) 163 | 164 | if id_line % 10000 == 0: 165 | with open(log_file_txt, 'a') as log_file: 166 | log_file.write(f'Current File: {file_path}; Paper: {len(lines)}; Processed: {(id_line+1)/len(lines)}; time: {time.time()-time_start_line}\n') 167 | 168 | # Finish the current file, then store edge_lists to a pickle file 169 | with gzip.open(edge_file, 'wb') as output_file: 170 | pickle.dump(edge_lists, output_file) 171 | write_file+=1 172 | 173 | with open(edge_file_log, 'a') as log_file: 174 | log_file.write(f'\nconcept_list={len(edge_lists)}') 175 | 176 | with open(log_file_txt, 'a') as log_file: 177 | log_file.write(f'\n\nFinish Time: {datetime.now()}; Current File: {file_path}; Processed: {write_file}/{len(curr_run_file_paths)}, i.e., {write_file/len(curr_run_file_paths)} \n') 178 | 179 | with open(log_file_txt_finish, 'a') as log_file: 180 | log_file.write(f'\n\nFinish Time: {datetime.now()}; Current File: {file_path} \n') 181 | 182 | rnd_time=random.random()*5 183 | time.sleep(rnd_time) 184 | 185 | else: 186 | # Match file patterns 187 | finish_pattern = os.path.join(log_folder, 'log_concept_part_*_finish.txt') 188 | empty_pattern = os.path.join(log_folder, 'log_concept_part_*_empty.txt') 189 | finished_files = [f for f in glob.glob(finish_pattern) if extract_id(f) in range(0, len(curr_run_file_paths))] 190 | empty_files = [f for f in glob.glob(empty_pattern) if extract_id(f) in range(0, len(curr_run_file_paths))] 191 | 192 | # Count files that match each pattern 193 | total_files = len(finished_files) + len(empty_files) 194 | 195 | # Check if the total count is larger than 391 196 | if total_files >= len(curr_run_file_paths): 197 | print(f"{datetime.now()}:Finish run!") 198 | break 199 | 200 | 201 | 202 | with open("job_finish.txt", 'a') as f: 203 | f.write(f'\nFinish all: {datetime.now()}\n') 204 | 205 | 206 | 207 | 208 | 209 | -------------------------------------------------------------------------------- /create_dynamic_concepts/merge_concept_citation.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import gzip 3 | import json 4 | import os 5 | import time 6 | from datetime import datetime, date 7 | import pickle 8 | from functools import reduce 9 | import random 10 | 11 | 12 | log_folder = 'logs' 13 | if not os.path.exists(log_folder): 14 | os.makedirs(log_folder) 15 | log_files='log_merge_concept_citation.txt' 16 | 17 | # define vertex_list foler 18 | vertex_list_folder = 'concept_citation' 19 | if not os.path.exists(vertex_list_folder): 20 | os.makedirs(vertex_list_folder) 21 | 22 | list_file_names = os.listdir(vertex_list_folder) # List all files in the directory 23 | vertex_file_name_unsorted = [file for file in list_file_names if file.endswith('.gz')] 24 | vertex_lists_files = sorted(vertex_file_name_unsorted) # Sort the file list 25 | 26 | full_vertex_lists = os.path.join(vertex_list_folder,'all_concept_citation.gz') # vertex 27 
| 28 | 29 | with open(os.path.join(log_folder, log_files), 'a') as f: 30 | f.write(f'\nStart: {datetime.now()}\n') 31 | 32 | 33 | full_vertices=[] 34 | empty_count=0 35 | for id_file, curr_vertex_files in enumerate(vertex_lists_files): 36 | 37 | with gzip.open(os.path.join(vertex_list_folder, curr_vertex_files), 'rb') as f: # load the vertex list 38 | vertex_data_list = pickle.load(f) 39 | 40 | if vertex_data_list!=[]: # skip empty files 41 | full_vertices.extend(vertex_data_list) 42 | else: 43 | empty_count+=1 44 | print(f'Empty file: {curr_vertex_files}') 45 | 46 | # write to log file 47 | with open(os.path.join(log_folder, log_files), 'a') as f: 48 | f.write(f'Finish file: {curr_vertex_files}; v: {len(full_vertices)}; Processed: {(id_file+1)/len(vertex_lists_files)}; empty Num: {empty_count}\n') 49 | 50 | # store the vertices list in a gz file 51 | with gzip.open(full_vertex_lists, 'wb') as f: 52 | pickle.dump(full_vertices, f) 53 | 54 | with open(os.path.join(log_folder, log_files), 'a') as f: 55 | f.write(f'\nFinish: {datetime.now()}\n') 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /create_dynamic_concepts/process_concept_to_pandas_frame.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pickle 4 | import gzip 5 | from datetime import datetime, date 6 | import numpy as np 7 | import pandas as pd 8 | import time 9 | import copy 10 | 11 | log_folder = 'logs' # log folder 12 | if not os.path.exists(log_folder): 13 | os.makedirs(log_folder) 14 | 15 | data_folder="concept_citation" 16 | data_file=os.path.join(data_folder,'all_concept_citation.gz') 17 | 18 | 19 | store_folder="data_concept_graph" 20 | cwd = os.getcwd() 21 | parent_dir = os.path.dirname(cwd) # get parent directory 22 | new_dir_path = os.path.join(parent_dir, store_folder) 23 | os.makedirs(new_dir_path, exist_ok=True) 24 | 25 | store_data_file = os.path.join(new_dir_path, "full_dynamic_concept.parquet") 26 | 27 | 28 | logsfile=os.path.join(log_folder,"logs_process_concepts.txt") 29 | starting_time=time.time() 30 | print(f'{datetime.now()}: read full graph') 31 | with open(logsfile+'.txt', "a") as myfile: 32 | myfile.write(f'\n{datetime.now()}: read full graph') 33 | 34 | with gzip.open(data_file, 'rb') as f: # load the edge list 35 | full_dynamic_concept = pickle.load(f) 36 | 37 | with open(logsfile+'.txt', "a") as myfile: 38 | myfile.write(f"\n{datetime.now()}: Done, Total: {len(full_dynamic_concept)}; Elapsed time: {time.time() - starting_time} seconds\n") 39 | 40 | 41 | # process the edge list to make each element with the same size 42 | ## [concept, paper_time, total_citation, citation_per_year] 43 | ## e.g., [7, 10378, 1, [{'year': 2022, 'cited_by_count': 1}]] becomes [7, 10378, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 44 | 45 | starting_time = time.time() 46 | full_dynamic_concept_copy = copy.deepcopy(full_dynamic_concept) 47 | for i, item in enumerate(full_dynamic_concept): 48 | years_data = {year_data['year']: year_data['cited_by_count'] for year_data in item[3]} 49 | new_list = [years_data.get(year, 0) for year in range(2023, 2011, -1)] ## as cited_by_count only contains the last 10 years 50 | full_dynamic_concept_copy[i] = item[:3] + new_list 51 | 52 | if i % 200000 == 0: 53 | with open(logsfile+'.txt', "a") as myfile: 54 | myfile.write(f"\nProcessing item {i+1}/{len(full_dynamic_concept_copy)}") 55 | 56 | 57 | time_start = time.time() 58 | full_concept=np.array(full_dynamic_concept_copy) 
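# (Editor's note, illustrative only - not part of the original script.)
# After the loop above, every row of full_dynamic_concept_copy is expected to hold 3 + 12 = 15 integers:
# [v1, time, ct] plus one citation count per year from 2023 down to 2012. The np.array conversion above
# and the 15 column names passed to pd.DataFrame below rely on this fixed width; under that assumption,
# a quick sanity check would be:
# assert all(len(row) == 15 for row in full_dynamic_concept_copy)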
59 | with open(logsfile+'.txt', "a") as myfile: 60 | myfile.write(f"\nDone, convert array; Elapsed time: {time.time() - time_start} seconds") 61 | 62 | 63 | time_start = time.time() 64 | full_concept_df = pd.DataFrame(full_concept, columns=['v1', 'time', 'ct', 'c2023', 'c2022', 'c2021', 'c2020', 'c2019', 'c2018', 'c2017', 'c2016', 'c2015', 'c2014', 'c2013', 'c2012']) 65 | 66 | full_concept_df.to_parquet(store_data_file, compression='gzip') 67 | 68 | with open(logsfile+'.txt', "a") as myfile: 69 | myfile.write(f"\n{datetime.now()}: Done, full_graph: {len(full_concept_df)}; Elapsed time: {time.time() - time_start} seconds") 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /create_dynamic_edges/_get_openalex_workdata.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from botocore import UNSIGNED 3 | from botocore.config import Config 4 | import gzip 5 | import jsonlines 6 | import json 7 | import os 8 | 9 | 10 | 11 | # Function to filter the JSON objects by the desired keys 12 | def filter_json_objects(json_obj, journal_paper, journal_paper_with_abstract): 13 | desired_keys = ['type', 'title', 'abstract_inverted_index', 'cited_by_count', 'counts_by_year', 'publication_year', 'publication_date'] 14 | # Check if all the desired keys are in the JSON object 15 | if all(key in json_obj for key in desired_keys): 16 | if json_obj['type'] == 'journal-article' and json_obj['title'] not in [{}, None] and json_obj['publication_year'] not in [{}, None] and json_obj['publication_date'] not in [{}, None]: 17 | journal_paper += 1 18 | if json_obj['abstract_inverted_index'] not in [{}, None]: 19 | journal_paper_with_abstract += 1 20 | return {key: json_obj[key] for key in desired_keys}, journal_paper, journal_paper_with_abstract 21 | return None, journal_paper, journal_paper_with_abstract 22 | 23 | 24 | 25 | # check whether a logs folder exists 26 | logs_path = 'logs' 27 | if not os.path.exists(logs_path): 28 | os.makedirs(logs_path) 29 | 30 | 31 | journal_paper = 0 32 | journal_paper_with_abstract = 0 33 | # Create a local directory for the filtered files 34 | local_base_folder = 'openalex_workdata_filtered' 35 | os.makedirs(local_base_folder, exist_ok=True) 36 | 37 | # Configure the S3 client for anonymous access 38 | s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED)) 39 | # Specify the S3 bucket and prefix (folder) 40 | bucket_name = 'openalex' 41 | prefix ='data/works/' 42 | # Iterate through the objects in the specified S3 bucket and prefix 43 | paginator = s3.get_paginator('list_objects_v2') 44 | 45 | for id_page, page in enumerate(paginator.paginate(Bucket=bucket_name, Prefix=prefix)): 46 | 47 | for id_obj, obj in enumerate(page['Contents']): 48 | 49 | if obj['Key'].split('/')[-1] == 'manifest': 50 | continue # Skip the manifest file 51 | 52 | log_filename = os.path.join(logs_path, obj['Key'].split('/')[-2]+'_'+obj['Key'].split('/')[-1]+'_log.txt') 53 | with open(log_filename, 'a') as log_file: 54 | log_file.write(f"Page {id_page}, object {id_obj}; obj['Key']: {obj['Key']}\n") 55 | 56 | # Download and process the gzip-compressed JSON Lines file 57 | s3_object = s3.get_object(Bucket=bucket_name, Key=obj['Key']) 58 | 59 | with gzip.GzipFile(fileobj=s3_object['Body'], mode='r') as gz_file: 60 | with jsonlines.Reader(gz_file) as reader: 61 | filtered_objects = [] 62 | for id_json, json_obj in enumerate(reader): 63 | filtered_obj, journal_paper, journal_paper_with_abstract = 
filter_json_objects(json_obj, journal_paper, journal_paper_with_abstract) 64 | 65 | if filtered_obj is not None: 66 | filtered_objects.append(filtered_obj) 67 | 68 | if id_json % 5000==0: 69 | with open(log_filename, 'a') as log_file: 70 | log_file.write(f"\n Processed {id_json} objects \n") 71 | 72 | # Prepare the local folder structure 73 | local_path_parts = obj['Key'].split('/') 74 | local_filtered_folder = os.path.join(local_base_folder, *local_path_parts[:-1]) 75 | os.makedirs(local_filtered_folder, exist_ok=True) 76 | 77 | # Store the filtered objects in a new gzip-compressed JSON Lines file on the local computer 78 | filtered_file_name = f"filtered_{local_path_parts[-1]}" 79 | filtered_file_path = os.path.join(local_filtered_folder, filtered_file_name) 80 | with gzip.open(filtered_file_path, 'wt') as f: 81 | for item in filtered_objects: 82 | f.write(json.dumps(item) + '\n') 83 | with open(log_filename, 'a') as log_file: 84 | log_file.write(f"Finish writing {filtered_file_path}; until now, journal_paper: {journal_paper}; journal_paper_with_abstract: {journal_paper_with_abstract}\n") 85 | #print(f"Finish writing {obj['Key']}: {filtered_file_path} \n") 86 | 87 | print(f"Finish writing all \n") -------------------------------------------------------------------------------- /create_dynamic_edges/_get_openalex_workdata_parallel_run1.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from botocore import UNSIGNED 3 | from botocore.config import Config 4 | import gzip 5 | import jsonlines 6 | import json 7 | import os 8 | 9 | 10 | 11 | # Function to filter the JSON objects by the desired keys 12 | def filter_json_objects(json_obj, journal_paper, journal_paper_with_abstract): 13 | desired_keys = ['type', 'title', 'abstract_inverted_index', 'cited_by_count', 'counts_by_year', 'publication_year', 'publication_date'] 14 | # Check if all the desired keys are in the JSON object 15 | if all(key in json_obj for key in desired_keys): 16 | if json_obj['type'] == 'journal-article' and json_obj['title'] not in [{}, None] and json_obj['publication_year'] not in [{}, None] and json_obj['publication_date'] not in [{}, None]: 17 | journal_paper += 1 18 | if json_obj['abstract_inverted_index'] not in [{}, None]: 19 | journal_paper_with_abstract += 1 20 | return {key: json_obj[key] for key in desired_keys}, journal_paper, journal_paper_with_abstract 21 | return None, journal_paper, journal_paper_with_abstract 22 | 23 | # check whether a logs folder exists 24 | logs_path = 'logs' 25 | if not os.path.exists(logs_path): 26 | os.makedirs(logs_path) 27 | 28 | 29 | 30 | journal_paper = 0 31 | journal_paper_with_abstract = 0 32 | # Create a local directory for the filtered files 33 | local_base_folder = 'openalex_workdata_filtered' 34 | os.makedirs(local_base_folder, exist_ok=True) 35 | # Configure the S3 client for anonymous access 36 | s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED)) 37 | 38 | # Iterate through the objects in the specified S3 bucket and prefix 39 | paginator = s3.get_paginator('list_objects_v2') 40 | 41 | # Specify the S3 bucket and prefix (folder) as an example here 42 | # change the folder files such that one can do parallel computing with many run code files 43 | process_folder=['updated_date=2023-03-27','updated_date=2023-03-28'] # just an example 44 | 45 | bucket_name = 'openalex' 46 | prefix ='data/works/' 47 | 48 | log_folders = os.path.join(logs_path, 
process_folder[0]+'_'+process_folder[-1].split('=')[1]+'_log.txt') 49 | for id_folder, folder in enumerate(process_folder): 50 | 51 | prefix ='data/works/' 52 | prefix = prefix+folder+'/' 53 | print(f"Process {prefix}, step%: {id_folder/len(process_folder)} \n") 54 | 55 | with open(log_folders, 'a') as log_file: 56 | log_file.write(f"Process {prefix}, progress: {id_folder/len(process_folder)} \n") 57 | 58 | for id_page, page in enumerate(paginator.paginate(Bucket=bucket_name, Prefix=prefix)): 59 | 60 | for id_obj, obj in enumerate(page['Contents']): 61 | 62 | if obj['Key'].split('/')[-1] == 'manifest': 63 | continue # Skip the manifest file 64 | 65 | log_filename = os.path.join(logs_path, obj['Key'].split('/')[-2]+'_'+obj['Key'].split('/')[-1]+'_log.txt') 66 | with open(log_filename, 'a') as log_file: 67 | log_file.write(f"Page {id_page}, object {id_obj}; obj['Key']: {obj['Key']}\n") 68 | 69 | # Download and process the gzip-compressed JSON Lines file 70 | s3_object = s3.get_object(Bucket=bucket_name, Key=obj['Key']) 71 | 72 | with gzip.GzipFile(fileobj=s3_object['Body'], mode='r') as gz_file: 73 | with jsonlines.Reader(gz_file) as reader: 74 | filtered_objects = [] 75 | for id_json, json_obj in enumerate(reader): 76 | filtered_obj, journal_paper, journal_paper_with_abstract = filter_json_objects(json_obj, journal_paper, journal_paper_with_abstract) 77 | 78 | if filtered_obj is not None: 79 | filtered_objects.append(filtered_obj) 80 | 81 | if id_json % 5000==0: 82 | with open(log_filename, 'a') as log_file: 83 | log_file.write(f"\n Processed {id_json} objects") 84 | 85 | # Prepare the local folder structure 86 | local_path_parts = obj['Key'].split('/') 87 | local_filtered_folder = os.path.join(local_base_folder, *local_path_parts[:-1]) 88 | os.makedirs(local_filtered_folder, exist_ok=True) 89 | 90 | # Store the filtered objects in a new gzip-compressed JSON Lines file on the local computer 91 | filtered_file_name = f"filtered_{local_path_parts[-1]}" 92 | filtered_file_path = os.path.join(local_filtered_folder, filtered_file_name) 93 | with gzip.open(filtered_file_path, 'wt') as f: 94 | for item in filtered_objects: 95 | f.write(json.dumps(item) + '\n') 96 | with open(log_filename, 'a') as log_file: 97 | log_file.write(f"Finish writing {filtered_file_path}; until now, journal_paper: {journal_paper}; journal_paper_with_abstract: {journal_paper_with_abstract}\n") 98 | #print(f"Finish writing {obj['Key']}: {filtered_file_path} \n") 99 | 100 | with open(log_folders, 'a') as log_file: 101 | log_file.write(f"Finish {prefix}, progress: {id_folder/len(process_folder)} \nuntil now, journal_paper: {journal_paper}; journal_paper_with_abstract: {journal_paper_with_abstract}\n") 102 | 103 | print(f"Finish writing all \n") -------------------------------------------------------------------------------- /create_dynamic_edges/get_concept_pairs.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import gzip 3 | import json 4 | import os 5 | import time 6 | from datetime import datetime, date 7 | import pickle 8 | from functools import reduce 9 | import random 10 | import re 11 | 12 | def get_single_article_string(article): 13 | 14 | curr_title=article['title'] 15 | abstract_inverted_index = article['abstract_inverted_index'] 16 | 17 | # Flatten the inverted index into a list of (position, word) tuples 18 | position_word_list = [(position, word) for word, positions in abstract_inverted_index.items() for position in positions] 19 | 20 | # Sort the list by 
position and extract the words 21 | sorted_abstract = sorted(position_word_list) 22 | curr_abstract = ' '.join(word for position, word in sorted_abstract) 23 | 24 | # Replace strings according to the replace_pairs list 25 | replace_pairs=[['\n',' '],['-',' '],[' \" a','oa'],['\" a','ae'],['\"a','ae'],[' \" o','oe'],['\" o','oe'],['\"o','oe'],[' \" u','ue'],['\" u','ue'],['\"u','ue'],[' \' a','a'],[' \' e','e'],[' \' o','o'],["\' ", ""],["\'", ""],[' ',' '],[' ',' ']] 26 | 27 | article_string=(curr_title +' '+ curr_abstract).lower() 28 | article_string = reduce(lambda text, pair: text.replace(pair[0], pair[1]), replace_pairs, article_string) 29 | 30 | return article_string 31 | 32 | # Define a sorting key function to extract the date and part number from the path 33 | def get_date_and_part_from_path(path): 34 | date_folder = os.path.dirname(path) 35 | date_str = date_folder.split('=')[-1] 36 | 37 | file_name = os.path.basename(path) 38 | part_str = file_name.split('_')[-1].split('.')[0] 39 | 40 | return date_str, int(part_str) 41 | 42 | 43 | def extract_id(filename): 44 | match = re.search(r'log_edge_part_(\d+)_', filename) 45 | if match: 46 | return int(match.group(1)) 47 | return None 48 | 49 | 50 | # define a log foler 51 | log_folder = 'logs_pair' 52 | # define edge_list foler 53 | edge_folder = 'concept_pair' 54 | edge_folder_log = 'concept_pair_log' 55 | 56 | try: 57 | if not os.path.exists(log_folder): 58 | os.makedirs(log_folder) 59 | 60 | if not os.path.exists(edge_folder): 61 | os.makedirs(edge_folder) 62 | 63 | if not os.path.exists(edge_folder_log): 64 | os.makedirs(edge_folder_log) 65 | 66 | except FileExistsError: 67 | pass 68 | 69 | data_folder="data_concept_graph" # the folder that contain the concept file 70 | cwd = os.getcwd() 71 | parent_dir = os.path.dirname(cwd) 72 | concept_folder = os.path.join(parent_dir, data_folder) 73 | 74 | 75 | # Define the base folder, date pattern and file pattern 76 | #project_path="/u/xmgu/projects/semnet_openalex" # change to your path 77 | #base_folder=os.path.join(project_path,'openalex_workdata_filtered/data/works/') 78 | 79 | base_folder = 'openalex_workdata_filtered/data/works/' 80 | date_pattern = 'updated_date=*' 81 | file_pattern = 'filtered_part_*.gz' 82 | 83 | # Find all the files matching the pattern 84 | file_paths = glob.glob(f'{base_folder}/{date_pattern}/{file_pattern}') 85 | # Sort the file_paths list in ascending order based on the date and part number 86 | file_paths = sorted(file_paths, key=get_date_and_part_from_path) 87 | 88 | # Define the date range or specific folders to include 89 | start_date = datetime.strptime("2022-12-20", "%Y-%m-%d") 90 | end_date = datetime.strptime("2023-03-28", "%Y-%m-%d") 91 | 92 | # Filter the file_paths list based on the date range or specific folders 93 | curr_run_file_paths = [path for path in file_paths if start_date <= datetime.strptime(get_date_and_part_from_path(path)[0], "%Y-%m-%d") <= end_date] 94 | 95 | # Read all concepts from full_final_concepts/full_domain_concept.txt 96 | concepts_files = os.path.join(concept_folder, 'full_domain_concept.txt') 97 | with open(concepts_files, 'r') as file: 98 | full_concepts = [concept.strip() for concept in file.readlines()] 99 | 100 | # Define a list to store the edge lists 101 | paper_starting_date = date(1990,1,1) 102 | write_file=0 103 | 104 | rnd_time=random.random()*60 105 | time.sleep(rnd_time) 106 | 107 | while write_file <=len(curr_run_file_paths): # curr_run_file_paths 108 | 109 | curr_ID = random.randint(0, 
len(curr_run_file_paths)-1) # get a random number between 0 and the number of files 110 | 111 | formatted_ID = '{:03d}'.format(curr_ID) 112 | 113 | edge_file=os.path.join(edge_folder, 'edge_part_'+formatted_ID+'.gz') 114 | edge_file_log=os.path.join(edge_folder_log, 'edge_part_'+formatted_ID+'.txt') 115 | log_file_txt=os.path.join(log_folder, 'log_edge_part_'+formatted_ID+'.txt') 116 | 117 | log_file_txt_finish=os.path.join(log_folder, 'log_edge_part_'+formatted_ID+'_finish.txt') 118 | log_file_txt_empty=os.path.join(log_folder, 'log_edge_part_'+formatted_ID+'_empty.txt') 119 | 120 | if not os.path.exists(log_file_txt): 121 | current_time=datetime.now() 122 | open(log_file_txt, 'a').close() 123 | 124 | file_path=curr_run_file_paths[curr_ID] 125 | with open(log_file_txt, 'a') as log_file: 126 | log_file.write(f'Current time: {current_time}; Number of files: {len(curr_run_file_paths)}; Number of concepts: {len(full_concepts)}\n\n') 127 | log_file.write(f'Start the File: {file_path}; Current time: {datetime.now()} \n\n') 128 | 129 | with gzip.open(file_path, 'rt') as file: 130 | lines = file.readlines() 131 | 132 | if not lines: # if lines is not empty 133 | print(f'File {file_path} is empty') 134 | write_file+=1 135 | with open(log_file_txt_empty, 'a') as log_file: 136 | log_file.write(f'Current File: {file_path}; Paper: {len(lines)}; File is Empty!\n') 137 | 138 | else: 139 | edge_lists=[] 140 | for id_line, line in enumerate(lines): 141 | time_start_line=time.time() 142 | 143 | article_object = json.loads(line) # Load the JSON object 144 | get_date = datetime.strptime(article_object['publication_date'], "%Y-%m-%d").date() 145 | curr_paper_time = (get_date - paper_starting_date).days 146 | curr_all_citations=article_object['cited_by_count'] 147 | curr_citations_per_year=article_object['counts_by_year'] 148 | curr_article=get_single_article_string(article_object) 149 | 150 | # Check if the article contains any of the concepts 151 | concepts_for_single_paper=[] 152 | for id_concept, concept in enumerate(full_concepts): 153 | if concept in curr_article: # if the paper contains the concept; then store its concept index 154 | concepts_for_single_paper.append(id_concept) 155 | 156 | for ii in range(len(concepts_for_single_paper)): 157 | for jj in range(ii+1,len(concepts_for_single_paper)): 158 | edge_lists.append([concepts_for_single_paper[ii],concepts_for_single_paper[jj],curr_paper_time,curr_all_citations,curr_citations_per_year]) 159 | 160 | 161 | if id_line % 10000 == 0: 162 | with open(log_file_txt, 'a') as log_file: 163 | log_file.write(f'Current File: {file_path}; Paper: {len(lines)}; Processed: {(id_line+1)/len(lines)}; time: {time.time()-time_start_line}\n') 164 | 165 | # Finish the current file, then store edge_lists to a pickle file 166 | with gzip.open(edge_file, 'wb') as output_file: 167 | pickle.dump(edge_lists, output_file) 168 | write_file+=1 169 | 170 | with open(edge_file_log, 'a') as log_file: 171 | log_file.write(f'\nedge_list={len(edge_lists)}') 172 | 173 | with open(log_file_txt, 'a') as log_file: 174 | log_file.write(f'\n\nFinish Time: {datetime.now()}; Current File: {file_path}; Processed: {write_file}/{len(curr_run_file_paths)}, i.e., {write_file/len(curr_run_file_paths)} \n') 175 | log_file.write(f'\nedge_list: {len(edge_lists)}\n') 176 | 177 | with open(log_file_txt_finish, 'a') as log_file: 178 | log_file.write(f'\n\nFinish Time: {datetime.now()}; Current File: {file_path} \n') 179 | 180 | rnd_time=random.random()*5 181 | time.sleep(rnd_time) 182 | 183 | else: 184 | 
finish_pattern = os.path.join(log_folder, 'log_edge_part_*_finish.txt') 185 | empty_pattern = os.path.join(log_folder, 'log_edge_part_*_empty.txt') 186 | finished_files = [f for f in glob.glob(finish_pattern) if extract_id(f) in range(0, len(curr_run_file_paths))] 187 | empty_files = [f for f in glob.glob(empty_pattern) if extract_id(f) in range(0, len(curr_run_file_paths))] 188 | 189 | # Count files that match each pattern 190 | total_files = len(finished_files) + len(empty_files) 191 | 192 | # Check if the total count is larger than 391 193 | if total_files >= len(curr_run_file_paths): 194 | print(f"{datetime.now()}:Finish run!") 195 | break 196 | 197 | 198 | 199 | with open("job_finish.txt", 'a') as f: 200 | f.write(f'\nFinish all: {datetime.now()}\n') 201 | 202 | 203 | -------------------------------------------------------------------------------- /create_dynamic_edges/merge_concept_pairs.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import gzip 3 | import json 4 | import os 5 | import time 6 | from datetime import datetime, date 7 | import pickle 8 | from functools import reduce 9 | import random 10 | 11 | 12 | log_folder = 'logs' 13 | if not os.path.exists(log_folder): 14 | os.makedirs(log_folder) 15 | log_files='log_merge_concept_pairs.txt' 16 | 17 | # define edge_list foler 18 | edge_list_folder = 'concept_pair' 19 | if not os.path.exists(edge_list_folder): 20 | os.makedirs(edge_list_folder) 21 | 22 | list_file_names = os.listdir(edge_list_folder) # List all files in the directory 23 | edge_file_name_unsorted = [file for file in list_file_names if file.endswith('.gz')] 24 | edge_lists_files = sorted(edge_file_name_unsorted) # Sort the file list 25 | 26 | full_edge_lists = os.path.join(edge_list_folder,'all_concept_pairs.gz') # edges 27 | 28 | 29 | with open(os.path.join(log_folder, log_files), 'a') as f: 30 | f.write(f'\nStart: {datetime.now()}\n') 31 | 32 | 33 | full_edges=[] 34 | empty_count=0 35 | for id_file, curr_edge_files in enumerate(edge_lists_files): 36 | 37 | with gzip.open(os.path.join(edge_list_folder, curr_edge_files), 'rb') as f: # load the edge list 38 | edge_data_list = pickle.load(f) 39 | 40 | if edge_data_list!=[]: # skip empty files 41 | full_edges.extend(edge_data_list) 42 | else: 43 | empty_count+=1 44 | print(f'Empty file: {curr_edge_files}') 45 | 46 | # write to log file 47 | with open(os.path.join(log_folder, log_files), 'a') as f: 48 | f.write(f'Finish file: {curr_edge_files}; Edges: {len(full_edges)}; Processed: {(id_file+1)/len(edge_lists_files)}; empty Num: {empty_count}\n') 49 | 50 | # store the edge list in a gz file 51 | with gzip.open(full_edge_lists, 'wb') as f: 52 | pickle.dump(full_edges, f) 53 | 54 | with open(os.path.join(log_folder, log_files), 'a') as f: 55 | f.write(f'\nFinish: {datetime.now()}\n') 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /create_dynamic_edges/process_edge_to_pandas_frame.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pickle 4 | import gzip 5 | from datetime import datetime, date 6 | import numpy as np 7 | import pandas as pd 8 | import time 9 | import copy 10 | 11 | log_folder = 'logs' # log folder 12 | if not os.path.exists(log_folder): 13 | os.makedirs(log_folder) 14 | 15 | data_folder="concept_pair" 16 | data_file=os.path.join(data_folder,'all_concept_pairs.gz') 17 | 18 | store_folder="data_concept_graph" 19 | cwd = 
os.getcwd() 20 | parent_dir = os.path.dirname(cwd) # get parent directory 21 | new_dir_path = os.path.join(parent_dir, store_folder) 22 | os.makedirs(new_dir_path, exist_ok=True) 23 | 24 | store_data_file = os.path.join(new_dir_path, "full_dynamic_graph.parquet") 25 | 26 | 27 | 28 | logsfile=os.path.join(log_folder,"logs_process_pairs.txt") 29 | starting_time=time.time() 30 | print(f'{datetime.now()}: read full graph') 31 | with open(logsfile+'.txt', "a") as myfile: 32 | myfile.write(f'\n{datetime.now()}: read full graph') 33 | 34 | with gzip.open(data_file, 'rb') as f: # load the edge list 35 | full_dynamic_edges = pickle.load(f) 36 | 37 | with open(logsfile+'.txt', "a") as myfile: 38 | myfile.write(f"\n{datetime.now()}: Done, Total: {len(full_dynamic_edges)}; Elapsed time: {time.time() - starting_time} seconds\n") 39 | 40 | 41 | # process the edge list to make each element with the same size 42 | ## [concept1, concept2, paper_time, total_citation, citation_per_year] 43 | ## e.g., [7, 414, 10378, 1, [{'year': 2022, 'cited_by_count': 1}]] becomes [7, 414, 10378, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 44 | 45 | starting_time = time.time() 46 | full_dynamic_edges_copy = copy.deepcopy(full_dynamic_edges) 47 | for i, item in enumerate(full_dynamic_edges): 48 | years_data = {year_data['year']: year_data['cited_by_count'] for year_data in item[4]} 49 | new_list = [years_data.get(year, 0) for year in range(2023, 2011, -1)] ## as cited_by_count only contains the last 10 years 50 | full_dynamic_edges_copy[i] = item[:4] + new_list 51 | 52 | if i % 200000 == 0: 53 | with open(logsfile+'.txt', "a") as myfile: 54 | myfile.write(f"\nProcessing item {i+1}/{len(full_dynamic_edges_copy)}") 55 | 56 | 57 | time_start = time.time() 58 | full_graph=np.array(full_dynamic_edges_copy) 59 | with open(logsfile+'.txt', "a") as myfile: 60 | myfile.write(f"\nDone, convert array; Elapsed time: {time.time() - time_start} seconds") 61 | 62 | 63 | time_start = time.time() 64 | full_graph_df = pd.DataFrame(full_graph, columns=['v1', 'v2', 'time', 'ct', 'c2023', 'c2022', 'c2021', 'c2020', 'c2019', 'c2018', 'c2017', 'c2016', 'c2015', 'c2014', 'c2013', 'c2012']) 65 | 66 | full_graph_df.to_parquet(store_data_file, compression='gzip') 67 | 68 | with open(logsfile+'.txt', "a") as myfile: 69 | myfile.write(f"\n{datetime.now()}: Done, full_graph: {len(full_graph_df)}; Elapsed time: {time.time() - time_start} seconds") 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /fpr_example/plot_FPR.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.metrics import roc_curve, auc 4 | 5 | # File names and corresponding imbalance ratios (IR) 6 | files = ["solution_output_10.npy", "solution_output_50.npy", "solution_output_100.npy"] 7 | labels = ["IR=10", "IR=50", "IR=100"] 8 | 9 | # Create a figure with two subplots side by side 10 | fig, axes = plt.subplots(1, 2, figsize=(28, 12)) # Increase the figure width 11 | fig.subplots_adjust(wspace=2) # Add whitespace between subplots 12 | 13 | # Plot for single IR=10 (Left plot) 14 | data = np.load("solution_output_10.npy") 15 | y_true = data[:, 0] 16 | y_scores = data[:, 1] 17 | 18 | fpr, tpr, thresholds = roc_curve(y_true, y_scores) 19 | roc_auc = auc(fpr, tpr) 20 | 21 | axes[0].plot(fpr, tpr, label=f'IR=10 (AUC = {roc_auc:.2f})', color='blue') 22 | 23 | for fpr_value in [0.1, 0.3]: 24 | idx = np.argmin(np.abs(fpr - fpr_value)) 25 | 
tpr_value = tpr[idx] 26 | print(f'For FPR={fpr_value}, TPR={tpr_value:.2f}') 27 | axes[0].plot([fpr_value, fpr_value], [0, tpr_value], linestyle='dotted', color='black', linewidth=3.5) # Vertical line 28 | axes[0].plot([0, fpr_value], [tpr_value, tpr_value], linestyle='dotted', color='black', linewidth=3.5) # Horizontal line 29 | axes[0].text(fpr_value + 0.02, tpr_value - 0.05, f'(FPR={fpr_value}, TPR={tpr_value:.2f})', 30 | fontsize=36, color='black') # Increased font size by 10 31 | 32 | axes[0].set_xlabel('False Positive Rate', fontsize=40) # Increased font size by 10 33 | axes[0].set_ylabel('True Positive Rate', fontsize=40) # Increased font size by 10 34 | axes[0].set_title('ROC Curve with Thresholds (IR=10)', fontsize=44) # Increased font size by 10 35 | axes[0].tick_params(axis='both', which='major', labelsize=36) # Increased font size by 10 36 | axes[0].grid() 37 | axes[0].legend(loc='lower right', fontsize=36) # Increased font size by 10 38 | 39 | # Plot for all IRs (Right plot) 40 | for file, label in zip(files, labels): 41 | data = np.load(file) 42 | y_true = data[:, 0] 43 | y_scores = data[:, 1] 44 | 45 | fpr, tpr, _ = roc_curve(y_true, y_scores) 46 | roc_auc = auc(fpr, tpr) 47 | axes[1].plot(fpr, tpr, label=f'{label} (AUC = {roc_auc:.2f})', linewidth=3) # Increase line width 48 | 49 | 50 | axes[1].plot([0, 1], [0, 1], color='gray', linestyle='--', label='Random') 51 | axes[1].set_xlabel('False Positive Rate', fontsize=40) # Increased font size by 10 52 | axes[1].set_ylabel('True Positive Rate', fontsize=40) # Increased font size by 10 53 | axes[1].set_title('ROC Curve for IR=10, 50, 100', fontsize=44) # Increased font size by 10 54 | axes[1].tick_params(axis='both', which='major', labelsize=36) # Increased font size by 10 55 | axes[1].grid() 56 | axes[1].legend(loc='lower right', fontsize=36) # Increased font size by 10 57 | 58 | # Adjust layout and save the combined plot 59 | plt.tight_layout() 60 | plt.savefig('roc_curve_combined_highres.png', dpi=300) 61 | plt.show() 62 | -------------------------------------------------------------------------------- /fpr_example/roc_curve_combined_highres.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/Impact4Cast/0c1dc9fb31ae92a21b8e38c7356a170d22db3360/fpr_example/roc_curve_combined_highres.png -------------------------------------------------------------------------------- /general_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import random, time 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | import networkx as nx 9 | import pandas as pd 10 | from datetime import datetime, date 11 | from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report 12 | from sklearn.metrics import precision_recall_curve, average_precision_score, confusion_matrix 13 | 14 | 15 | 16 | def flatten(t): 17 | return [item for sublist in t for item in sublist] 18 | 19 | 20 | def format_IR(IR_num, split_type): 21 | """ 22 | make a string, which can be used when storing trained neural network, results, log files, etc. 
23 | """ 24 | if isinstance(IR_num[0], list): # Check if the first element is a list 25 | inner = ''.join(str(num) for num in IR_num[0]) 26 | outer = '{:02d}'.format(IR_num[1]) 27 | return f'T{split_type}_IR_{inner}_{outer}' 28 | else: 29 | return f'T{split_type}_IR_' + '_'.join('{:03d}'.format(num) for num in IR_num) 30 | 31 | 32 | 33 | def make_folders(year_start, split_type, num_class, addition_str): 34 | """ 35 | create folders and subfolders 36 | year_start is the train start year, e.g., 2016 for predicting 2019, the year_start is 2016 37 | split_type is used for setting whether train conditionally or not 38 | note: num_class is always setting to 2, due to binary classfication 39 | As an example: year_start=2016, split_type=0; num_class=2; addition_str='train': 40 | folder: 2016_train, 41 | subfolders: t0_c2_log, t0_c2_net, t0_c2_loss, t0_c2_curve, t0_c2_result 42 | """ 43 | parent_folder = str(year_start)+"_"+ addition_str 44 | if not os.path.exists(parent_folder): 45 | os.mkdir(parent_folder) 46 | 47 | log_folder = os.path.join(parent_folder, f"t{split_type}_c{num_class}_log") 48 | net_folder = os.path.join(parent_folder, f"t{split_type}_c{num_class}_net") 49 | train_folder = os.path.join(parent_folder, f"t{split_type}_c{num_class}_loss") 50 | figure_folder = os.path.join(parent_folder, f"t{split_type}_c{num_class}_curve") 51 | result_folder = os.path.join(parent_folder, f"t{split_type}_c{num_class}_result") 52 | 53 | try: 54 | if not os.path.exists(log_folder): 55 | os.mkdir(log_folder) 56 | 57 | if not os.path.exists(net_folder): 58 | os.mkdir(net_folder) 59 | 60 | if not os.path.exists(train_folder): 61 | os.mkdir(train_folder) 62 | 63 | if not os.path.exists(figure_folder): 64 | os.makedirs(figure_folder) 65 | 66 | if not os.path.exists(result_folder): 67 | os.makedirs(result_folder) 68 | 69 | except FileExistsError: 70 | pass 71 | 72 | save_folders = [net_folder, train_folder, figure_folder, result_folder] 73 | return save_folders, log_folder 74 | 75 | 76 | ######### Plots ############### 77 | def calculate_plot_ROC(true_labels, nn_outputs, user_parameter, figure_name, save_figure_folder): 78 | """ 79 | Plot the ROC curve for binary classification. 80 | 81 | Parameters: 82 | - true_labels: Ground truth binary labels. 83 | - nn_outputs: Raw outputs from the neural network. 
84 |     - user_parameter: some user parameters which are num_class, IR_num, split_type, out_norm; not used here and could be removed
85 |     - figure_name: file name of the stored figure
86 |     - save_figure_folder: the folder to store the figure, usually the t0_c2_curve folder created by make_folders()
87 | 
88 |     return:
89 |     auc_score_number: the AUC value
90 |     """
91 |     num_class, IR_num, split_type, out_norm = user_parameter
92 |     figure_path=os.path.join(save_figure_folder, figure_name)
93 |     # Compute the ROC curve
94 |     fpr, tpr, _ = roc_curve(true_labels, nn_outputs)
95 |     roc_auc = auc(fpr, tpr)
96 | 
97 |     auc_score_number = roc_auc_score(true_labels, nn_outputs)
98 | 
99 |     # Plot the ROC curve
100 |     plt.figure()
101 |     lw = 1.5 # Line width
102 |     plt.plot(fpr, tpr, color='blue', lw=lw, label='ROC curve (AUC = %0.4f)' % roc_auc)
103 |     plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='--', label='baseline')
104 |     plt.xlim([0.0, 1.0])
105 |     plt.ylim([0.0, 1.05])
106 |     plt.xlabel('False Positive Rate')
107 |     plt.ylabel('True Positive Rate')
108 |     plt.title('Receiver Operating Characteristic (ROC)')
109 |     plt.legend(loc="lower right")
110 |     plt.savefig(figure_path,dpi=600)
111 |     plt.show()
112 |     plt.close()
113 | 
114 |     return auc_score_number
115 | 
--------------------------------------------------------------------------------
/miscellaneous/Fig2_NeuralNet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/artificial-scientist-lab/Impact4Cast/0c1dc9fb31ae92a21b8e38c7356a170d22db3360/miscellaneous/Fig2_NeuralNet.png
--------------------------------------------------------------------------------
/miscellaneous/Impact4Cast.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/artificial-scientist-lab/Impact4Cast/0c1dc9fb31ae92a21b8e38c7356a170d22db3360/miscellaneous/Impact4Cast.png
--------------------------------------------------------------------------------
/miscellaneous/KnowledgeGraph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/artificial-scientist-lab/Impact4Cast/0c1dc9fb31ae92a21b8e38c7356a170d22db3360/miscellaneous/KnowledgeGraph.png
--------------------------------------------------------------------------------
/prepare_adjacency_pagerank.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pickle
3 | import gzip
4 | import copy
5 | import random, time
6 | import numpy as np
7 | import matplotlib.pyplot as plt
8 | from scipy import sparse
9 | from scipy.stats import rankdata
10 | import networkx as nx
11 | import pandas as pd
12 | from collections import defaultdict,Counter
13 | from datetime import datetime, date
14 | from itertools import combinations
15 | from features_utils import get_adjacency_matrix, get_pagerank_score
16 | 
17 | NUM_OF_VERTICES=37960 ## number of vertices in the graph
18 | 
19 | time_start = time.time()
20 | data_folder="data_concept_graph" # the folder which stores the full dynamic knowledge graph
21 | 
22 | # Read all concept pairs (edges) together with time and citation information
23 | graph_file=os.path.join(data_folder,"full_dynamic_graph.parquet")
24 | full_edge_dynamic_data = pd.read_parquet(graph_file)
25 | 
26 | print(f"Done, elapsed_time: {time.time() - time_start}\n full_edge_dynamic_data: {len(full_edge_dynamic_data)};\n")
27 | 
28 | log_files="log_adjacent_pagerank.txt" # just for logging the running
situation 29 | 30 | data_folder="data_for_features" # folder to store the generated adjacency_matrix files and pagerank files for different years 31 | years=[2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022] 32 | 33 | start_time1=time.time() 34 | for yy in years: 35 | 36 | print(f"{datetime.now()}: start adjacency_matrix") 37 | with open(log_files, "a") as myfile: 38 | myfile.write(f"\n{datetime.now()}: start adjacency_matrix") 39 | 40 | data_file=os.path.join(data_folder, f"adjacency_matrix_{yy}.gz") 41 | adjacency_matrix_sparse=get_adjacency_matrix(full_edge_dynamic_data, yy, data_file) 42 | print(f"{datetime.now()}: finish adjacency_matrix") 43 | with open(log_files, "a") as myfile: 44 | myfile.write(f"\n{datetime.now()}: finish adjacency_matrix") 45 | 46 | data_file=os.path.join(data_folder,f"pagerank_score_{yy}.gz") 47 | pagerank_score=get_pagerank_score(adjacency_matrix_sparse, data_file) 48 | print(f"{datetime.now()}: finish pagerank_score") 49 | print(f"done, year {yy}: {time.time() - start_time1}s") 50 | with open(log_files, "a") as myfile: 51 | myfile.write(f"\n{datetime.now()}: done, year {yy}: {time.time() - start_time1}s") 52 | start_time1=time.time() 53 | 54 | -------------------------------------------------------------------------------- /prepare_eval_data/prepare_eval_feature_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "e965c883-bc23-437a-822a-6693275a5d54", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "import pickle\n", 12 | "import gzip\n", 13 | "import copy\n", 14 | "import torch\n", 15 | "from torch import nn\n", 16 | "import torch.nn.functional as F\n", 17 | "import random, time\n", 18 | "import numpy as np\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "from scipy import sparse\n", 21 | "from scipy.stats import rankdata\n", 22 | "import networkx as nx\n", 23 | "import pandas as pd\n", 24 | "from collections import defaultdict,Counter\n", 25 | "from datetime import datetime, date\n", 26 | "from itertools import combinations\n", 27 | "from preprocess_utils import *\n", 28 | "from features_utils import *\n", 29 | "from train_model_utils import *\n", 30 | " " 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "c44d8573-7c1d-4114-b5b2-ea2fa1bb5c34", 36 | "metadata": {}, 37 | "source": [ 38 | "## read pairs and solutions data (both)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "873dc1df-b27e-4b96-9a5d-40c151b0b256", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "store_folder=\"data_pair_solution\"\n", 49 | "pair_solution_data1=os.path.join(store_folder,\"unconnected_2019_pair_solution_connected_2022.parquet\")\n", 50 | "pair_solution_data2=os.path.join(store_folder,\"unconnected_2019_pair_solution_unconnected_2022.parquet\")\n", 51 | "\n", 52 | "time_start = time.time()\n", 53 | "eval_pair_solution1 = pd.read_parquet(pair_solution_data1)\n", 54 | "eval_pair_solution1=eval_pair_solution1[['v1','v2','citation']]\n", 55 | "print(f\"Done, read pair_solution_yes: {len(eval_pair_solution1)}; elapsed_time: {time.time() - time_start}\")\n", 56 | "\n", 57 | "time_start = time.time()\n", 58 | "eval_pair_solution2 = pd.read_parquet(pair_solution_data2)\n", 59 | "print(f\"Done, read pair_solution_not: {len(eval_pair_solution2)}; elapsed_time: {time.time() - time_start}\")\n", 60 | "\n", 61 | "time_start = time.time()\n", 
62 | "full_eval_pair_result = pd.concat([eval_pair_solution1, eval_pair_solution2])\n", 63 | "print(f\"Done, combine all: {len(full_eval_pair_result)}; elapsed_time: {time.time() - time_start}\")" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "id": "8a342e3a-2e78-4ea9-96fe-511a347ad92f", 69 | "metadata": {}, 70 | "source": [ 71 | "#### fix random seed" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "id": "627dbfab-4a88-4419-b868-92d285305456", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "day_origin = date(1990,1,1)\n", 82 | "vertex_degree_cutoff=1\n", 83 | "years_delta=3\n", 84 | "min_edges=1\n", 85 | "year_start=2022-years_delta\n", 86 | "\n", 87 | "rnd_seed=42\n", 88 | "random.seed(rnd_seed)\n", 89 | "torch.manual_seed(rnd_seed)\n", 90 | "np.random.seed(rnd_seed)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "7023bfd5-d921-4d7e-8124-e5ce26fe6d9e", 96 | "metadata": {}, 97 | "source": [ 98 | "### randomly 10M " 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "id": "9dfcdb04-357e-4b9d-8a38-17b104ed2b71", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "edges_used=10**7\n", 109 | "num_row = int(min(edges_used, len(full_eval_pair_result)))\n", 110 | "\n", 111 | "time_start = time.time()\n", 112 | "shuffled = full_eval_pair_result.sample(frac=1, random_state=rnd_seed)\n", 113 | "eval_data_pair_solution = shuffled.head(num_row)\n", 114 | "\n", 115 | "print(f\"Done, eval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}\")" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "id": "99cfee20-1576-428c-b6bd-beee3ad65ce2", 121 | "metadata": {}, 122 | "source": [ 123 | "## store unconnected pairs and citation, time information" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "id": "834eec4d-4554-4e15-839b-f6d6f25adab4", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "\n", 134 | "store_eval_folder=\"data_eval\"\n", 135 | "if not os.path.exists(store_eval_folder):\n", 136 | " os.makedirs(store_eval_folder)\n", 137 | "print(f\"store files in {store_eval_folder}.....\")\n", 138 | "\n", 139 | "time_start = time.time()\n", 140 | "store_name=os.path.join(store_eval_folder,\"data_eval_pair_solution.parquet\")\n", 141 | "\n", 142 | "eval_data_pair_solution.to_parquet(store_name, compression='gzip')\n", 143 | "print(f\"eval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}\")\n" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "id": "db9e9baa-cfb2-4274-af0f-b93e4b2346d3", 149 | "metadata": { 150 | "tags": [] 151 | }, 152 | "source": [ 153 | "#### prepare properties" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "1925bd7e-cf68-4f03-90c2-7170d3b36057", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "time_start = time.time()\n", 164 | "data_folder=\"data_concept_graph\"\n", 165 | "graph_file=os.path.join(data_folder,\"full_dynamic_graph.parquet\")\n", 166 | "full_dynamic_graph = pd.read_parquet(graph_file)\n", 167 | "print(f\"{datetime.now()}: Done, read full_dynamic_graph: {len(full_dynamic_graph)}; elapsed_time: {time.time() - time_start}\")" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "id": "81b249b8-ed05-43ff-b84a-b25e850a5e72", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 
177 | "day_origin = date(1990,1,1)\n", 178 | "vertex_degree_cutoff=1\n", 179 | "years_delta=3\n", 180 | "min_edges=1\n", 181 | "year_start=2022-years_delta" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "id": "f18e6f1e-b03e-41d2-8f14-c777b937d0ae", 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "start_time=time.time()\n", 192 | "adj_mat_sparse=[]\n", 193 | "node_neighbor_list=[]\n", 194 | "num_neighbor_list=[]\n", 195 | "for yy in [year_start,year_start-1,year_start-2]:\n", 196 | " data_file=os.path.join(\"data_for_features\", f\"adjacency_matrix_{yy}.gz\")\n", 197 | " adj_mat=get_adjacency_matrix(full_dynamic_graph, yy, data_file)\n", 198 | " adj_mat_sparse.append(adj_mat)\n", 199 | " \n", 200 | " curr_node_neighbor=get_node_neighbor(adj_mat)\n", 201 | " node_neighbor_list.append(curr_node_neighbor)\n", 202 | " \n", 203 | " curr_num_neighbor = np.array(adj_mat.sum(axis=0)).flatten() # array \n", 204 | " num_neighbor_list.append(curr_num_neighbor)\n", 205 | " \n", 206 | "print(f\"{datetime.now()}: Done, adjacency_matrix_sparse; elapsed_time: {time.time() - start_time}\")" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "id": "f2aa8b6c-b523-4663-a562-78df50522415", 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "start_time=time.time()\n", 217 | "vertex_features=get_all_node_feature(adj_mat_sparse, year_start, \"data_for_features\")\n", 218 | "print(f\"{datetime.now()}: Done, vertex_features; elapsed_time: {time.time() - start_time}\")\n" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "id": "a3a909e3-8949-4d13-9c0d-1fa282d4ab45", 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "start_time=time.time()\n", 229 | "vc_feature_list=[]\n", 230 | "for yy in [year_start,year_start-1,year_start-2]:\n", 231 | " data_file=os.path.join(\"data_for_features\", f\"concept_node_citation_data_{yy}.parquet\")\n", 232 | " vc_df=pd.read_parquet(data_file)\n", 233 | " vc_feature=vc_df.values\n", 234 | " vc_feature_list.append(vc_feature)\n", 235 | " \n", 236 | "vertex_cfeatures=get_all_node_cfeature(vc_feature_list)\n", 237 | "print(f\"{datetime.now()}: Done, vertex_cfeatures; elapsed_time: {time.time() - start_time}\") " 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "id": "bb0eb093-f9a0-441d-aa47-9416ae12cc60", 244 | "metadata": { 245 | "tags": [] 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "\n", 250 | "logs_file_name='logs_eval_data_infos'\n", 251 | "time_start = time.time()\n", 252 | "eval_pair_solution=eval_data_pair_solution.values\n", 253 | "unconnected_vertex_pairs=eval_pair_solution[:,:2]\n", 254 | " \n", 255 | "pair_features, pair_cfeatures=get_all_pair_features(vc_feature_list, node_neighbor_list, num_neighbor_list, unconnected_vertex_pairs, logs_file_name)\n", 256 | "\n", 257 | "all_features=[vertex_features, vertex_cfeatures, pair_features, pair_cfeatures]\n", 258 | "\n", 259 | "eval_data_features=get_all_feature(all_features, unconnected_vertex_pairs, logs_file_name)\n", 260 | "\n", 261 | "print(f\"finish; {len(eval_data_features)}; time: {time.time()-time_start}\")" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "id": "c2506f57-621e-48cd-bdf0-ad942cd862f0", 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "time_start = time.time()\n", 272 | "\n", 273 | 
"store_name=os.path.join(store_eval_folder,\"eval_data_pair_feature.parquet\")\n", 274 | "data_eval_2022 = pd.DataFrame(eval_data_features)\n", 275 | "data_eval_2022.to_parquet(store_name, compression='gzip') \n", 276 | "\n", 277 | "print(f\"data_eval_2022: {len(data_eval_2022)}; elapsed_time: {time.time() - time_start}\")" 278 | ] 279 | } 280 | ], 281 | "metadata": { 282 | "kernelspec": { 283 | "display_name": "asl_semnet", 284 | "language": "python", 285 | "name": "asl_semnet" 286 | }, 287 | "language_info": { 288 | "codemirror_mode": { 289 | "name": "ipython", 290 | "version": 3 291 | }, 292 | "file_extension": ".py", 293 | "mimetype": "text/x-python", 294 | "name": "python", 295 | "nbconvert_exporter": "python", 296 | "pygments_lexer": "ipython3", 297 | "version": "3.10.9" 298 | } 299 | }, 300 | "nbformat": 4, 301 | "nbformat_minor": 5 302 | } 303 | -------------------------------------------------------------------------------- /prepare_eval_data/prepare_eval_feature_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import torch 6 | from torch import nn 7 | import torch.nn.functional as F 8 | import random, time 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | from scipy import sparse 12 | from scipy.stats import rankdata 13 | import networkx as nx 14 | import pandas as pd 15 | from collections import defaultdict,Counter 16 | from datetime import datetime, date 17 | from itertools import combinations 18 | from preprocess_utils import * 19 | from features_utils import * 20 | from train_model_utils import * 21 | 22 | 23 | 24 | time_start_begin=time.time() 25 | logs_file_name='logs_eval_data_infos' 26 | store_folder="data_pair_solution" 27 | pair_solution_data1=os.path.join(store_folder,"unconnected_2019_pair_solution_connected_2022_clean.parquet") 28 | pair_solution_data2=os.path.join(store_folder,"unconnected_2019_pair_solution_unconnected_2022.parquet") 29 | 30 | time_start = time.time() 31 | eval_pair_solution1 = pd.read_parquet(pair_solution_data1) 32 | print(f"Done, read pair_solution_yes: {len(eval_pair_solution1)}; elapsed_time: {time.time() - time_start}") 33 | 34 | time_start = time.time() 35 | eval_pair_solution2 = pd.read_parquet(pair_solution_data2) 36 | print(f"Done, read pair_solution_not: {len(eval_pair_solution2)}; elapsed_time: {time.time() - time_start}") 37 | 38 | time_start = time.time() 39 | full_eval_pair_result = pd.concat([eval_pair_solution1, eval_pair_solution2]) 40 | print(f"Done, combine all: {len(full_eval_pair_result)}; elapsed_time: {time.time() - time_start}") 41 | with open(logs_file_name+".txt", "a") as myfile: 42 | myfile.write(f"\nDone, combine all: {len(full_eval_pair_result)}; elapsed_time: {time.time() - time_start}") 43 | 44 | 45 | day_origin = date(1990,1,1) 46 | vertex_degree_cutoff=1 47 | min_edges=1 48 | years_delta=3 49 | year_start=2022-years_delta 50 | 51 | rnd_seed=42 52 | random.seed(rnd_seed) 53 | torch.manual_seed(rnd_seed) 54 | np.random.seed(rnd_seed) 55 | 56 | edges_used=10**7 57 | num_row = int(min(edges_used, len(full_eval_pair_result))) 58 | 59 | time_start = time.time() 60 | shuffled = full_eval_pair_result.sample(frac=1, random_state=rnd_seed) 61 | eval_data_pair_solution = shuffled.head(num_row) 62 | 63 | print(f"Done, eval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}") 64 | with open(logs_file_name+".txt", "a") as myfile: 65 | myfile.write(f"\nDone, 
eval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}") 66 | 67 | 68 | 69 | store_eval_folder="data_eval" # store folder 70 | if not os.path.exists(store_eval_folder): 71 | os.makedirs(store_eval_folder) 72 | print(f"store files in {store_eval_folder}.....") 73 | 74 | ###----- store eval_data_pair_solution -----### 75 | time_start = time.time() 76 | store_name=os.path.join(store_eval_folder,"eval_data_pair_solution.parquet") 77 | eval_data_pair_solution.to_parquet(store_name, compression='gzip') 78 | print(f"eval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}") 79 | with open(logs_file_name+".txt", "a") as myfile: 80 | myfile.write(f"\neval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}") 81 | 82 | 83 | 84 | ###----- prepare features -----### 85 | time_start = time.time() 86 | data_folder="data_concept_graph" # folder that stores the full knowledge graph 87 | graph_file=os.path.join(data_folder,"full_dynamic_graph.parquet") # load the full knowledge graph 88 | full_dynamic_graph = pd.read_parquet(graph_file) 89 | print(f"{datetime.now()}: Done, read full_dynamic_graph: {len(full_dynamic_graph)}; elapsed_time: {time.time() - time_start}") 90 | with open(logs_file_name+".txt", "a") as myfile: 91 | myfile.write(f"\n{datetime.now()}: Done, read full_dynamic_graph: {len(full_dynamic_graph)}; elapsed_time: {time.time() - time_start}") 92 | 93 | 94 | start_time=time.time() 95 | adj_mat_sparse=[] 96 | node_neighbor_list=[] 97 | num_neighbor_list=[] 98 | for yy in [year_start,year_start-1,year_start-2]: 99 | data_file=os.path.join("data_for_features", f"adjacency_matrix_{yy}.gz") 100 | adj_mat=get_adjacency_matrix(full_dynamic_graph, yy, data_file) 101 | adj_mat_sparse.append(adj_mat) 102 | 103 | curr_node_neighbor=get_node_neighbor(adj_mat) 104 | node_neighbor_list.append(curr_node_neighbor) 105 | 106 | curr_num_neighbor = np.array(adj_mat.sum(axis=0)).flatten() 107 | num_neighbor_list.append(curr_num_neighbor) 108 | 109 | print(f"{datetime.now()}: Done, adjacency_matrix_sparse; elapsed_time: {time.time() - start_time}") 110 | with open(logs_file_name+".txt", "a") as myfile: 111 | myfile.write(f"\n{datetime.now()}: Done, adjacency_matrix_sparse; elapsed_time: {time.time() - start_time}") 112 | 113 | 114 | start_time=time.time() 115 | feature_folder="data_for_features" 116 | vertex_features=get_all_node_feature(adj_mat_sparse, year_start, feature_folder) 117 | print(f"{datetime.now()}: Done, vertex_features; elapsed_time: {time.time() - start_time}") 118 | with open(logs_file_name+".txt", "a") as myfile: 119 | myfile.write(f"\n{datetime.now()}: Done, vertex_features; elapsed_time: {time.time() - start_time}") 120 | 121 | 122 | start_time=time.time() 123 | vc_feature_list=[] 124 | for yy in [year_start,year_start-1,year_start-2]: 125 | data_file=os.path.join(feature_folder, f"concept_node_citation_data_{yy}.parquet") 126 | vc_df=pd.read_parquet(data_file) 127 | vc_feature=vc_df.values 128 | vc_feature_list.append(vc_feature) 129 | 130 | vertex_cfeatures=get_all_node_cfeature(vc_feature_list) 131 | print(f"{datetime.now()}: Done, vertex_cfeatures; elapsed_time: {time.time() - start_time}") 132 | with open(logs_file_name+".txt", "a") as myfile: 133 | myfile.write(f"\n{datetime.now()}: Done, vertex_cfeatures; elapsed_time: {time.time() - start_time}") 134 | 135 | 136 | 137 | time_start = time.time() 138 | eval_pair_solution=eval_data_pair_solution.values 139 | 
unconnected_vertex_pairs=eval_pair_solution[:,:2] 140 | 141 | pair_features, pair_cfeatures=get_all_pair_features(vc_feature_list, node_neighbor_list, num_neighbor_list, unconnected_vertex_pairs, logs_file_name) 142 | 143 | all_features=[vertex_features, vertex_cfeatures, pair_features, pair_cfeatures] 144 | 145 | eval_data_features=get_all_feature(all_features, unconnected_vertex_pairs, logs_file_name) 146 | 147 | print(f"finish; {len(eval_data_features)}; time: {time.time()-time_start}") 148 | with open(logs_file_name+".txt", "a") as myfile: 149 | myfile.write(f"\nfinish; {len(eval_data_features)}; time: {time.time()-time_start}") 150 | 151 | 152 | ###----- store eval_data_pair_feature -----### 153 | time_start = time.time() 154 | store_name=os.path.join(store_eval_folder,"eval_data_pair_feature.parquet") 155 | data_eval_2022 = pd.DataFrame(eval_data_features) 156 | 157 | # Convert column names to string 158 | data_eval_2022.columns = data_eval_2022.columns.astype(str) 159 | data_eval_2022.to_parquet(store_name, compression='gzip') 160 | 161 | print(f"store data_eval_2022: {len(data_eval_2022)}; elapsed_time: {time.time() - time_start}") 162 | with open(logs_file_name+".txt", "a") as myfile: 163 | myfile.write(f"\nstore data_eval_2022: {len(data_eval_2022)}; elapsed_time: {time.time() - time_start}") 164 | myfile.write(f"\n\n{datetime.now()}: {time.time() - time_start_begin}") 165 | 166 | 167 | 168 | -------------------------------------------------------------------------------- /prepare_eval_data/prepare_eval_feature_data_condition.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import torch 6 | from torch import nn 7 | import torch.nn.functional as F 8 | import random, time 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | from scipy import sparse 12 | from scipy.stats import rankdata 13 | import networkx as nx 14 | import pandas as pd 15 | from collections import defaultdict,Counter 16 | from datetime import datetime, date 17 | from itertools import combinations 18 | from preprocess_utils import * 19 | from features_utils import * 20 | from train_model_utils import * 21 | 22 | 23 | 24 | time_start_begin=time.time() 25 | logs_file_name='logs_eval_data_infos' 26 | store_folder="data_pair_solution" 27 | pair_solution_data1=os.path.join(store_folder,"unconnected_2019_pair_solution_connected_2022_clean.parquet") 28 | 29 | 30 | time_start = time.time() 31 | full_eval_pair_result = pd.read_parquet(pair_solution_data1) 32 | print(f"Done, combine all: {len(full_eval_pair_result)}; elapsed_time: {time.time() - time_start}") 33 | with open(logs_file_name+".txt", "a") as myfile: 34 | myfile.write(f"\nDone, combine all: {len(full_eval_pair_result)}; elapsed_time: {time.time() - time_start}") 35 | 36 | 37 | day_origin = date(1990,1,1) 38 | vertex_degree_cutoff=1 39 | years_delta=3 40 | min_edges=1 41 | year_start=2022-years_delta 42 | 43 | rnd_seed=42 44 | random.seed(rnd_seed) 45 | torch.manual_seed(rnd_seed) 46 | np.random.seed(rnd_seed) 47 | 48 | edges_used=10**7 49 | num_row = int(min(edges_used, len(full_eval_pair_result))) 50 | 51 | time_start = time.time() 52 | shuffled = full_eval_pair_result.sample(frac=1, random_state=rnd_seed) 53 | eval_data_pair_solution = shuffled.head(num_row) 54 | 55 | print(f"Done, eval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}") 56 | with open(logs_file_name+".txt", "a") as myfile: 57 | 
myfile.write(f"\nDone, eval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}") 58 | 59 | 60 | 61 | store_eval_folder="data_eval" # store folder 62 | if not os.path.exists(store_eval_folder): 63 | os.makedirs(store_eval_folder) 64 | print(f"store files in {store_eval_folder}.....") 65 | 66 | ###----- store eval_data_pair_solution -----### 67 | time_start = time.time() 68 | store_name=os.path.join(store_eval_folder,"eval_data_pair_solution_condition.parquet") 69 | eval_data_pair_solution.to_parquet(store_name, compression='gzip') 70 | print(f"eval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}") 71 | with open(logs_file_name+".txt", "a") as myfile: 72 | myfile.write(f"\neval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}") 73 | 74 | 75 | ###----- prepare features -----### 76 | time_start = time.time() 77 | data_folder="data_concept_graph" # folder that stores the full knowledge graph 78 | graph_file=os.path.join(data_folder,"full_dynamic_graph.parquet") # load the full knowledge graph 79 | full_dynamic_graph = pd.read_parquet(graph_file) 80 | print(f"{datetime.now()}: Done, read full_dynamic_graph: {len(full_dynamic_graph)}; elapsed_time: {time.time() - time_start}") 81 | with open(logs_file_name+".txt", "a") as myfile: 82 | myfile.write(f"\n{datetime.now()}: Done, read full_dynamic_graph: {len(full_dynamic_graph)}; elapsed_time: {time.time() - time_start}") 83 | 84 | 85 | start_time=time.time() 86 | adj_mat_sparse=[] 87 | node_neighbor_list=[] 88 | num_neighbor_list=[] 89 | for yy in [year_start,year_start-1,year_start-2]: 90 | data_file=os.path.join("data_for_features", f"adjacency_matrix_{yy}.gz") 91 | adj_mat=get_adjacency_matrix(full_dynamic_graph, yy, data_file) 92 | adj_mat_sparse.append(adj_mat) 93 | 94 | curr_node_neighbor=get_node_neighbor(adj_mat) 95 | node_neighbor_list.append(curr_node_neighbor) 96 | 97 | curr_num_neighbor = np.array(adj_mat.sum(axis=0)).flatten() # array 98 | num_neighbor_list.append(curr_num_neighbor) 99 | 100 | print(f"{datetime.now()}: Done, adjacency_matrix_sparse; elapsed_time: {time.time() - start_time}") 101 | with open(logs_file_name+".txt", "a") as myfile: 102 | myfile.write(f"\n{datetime.now()}: Done, adjacency_matrix_sparse; elapsed_time: {time.time() - start_time}") 103 | 104 | 105 | start_time=time.time() 106 | vertex_features=get_all_node_feature(adj_mat_sparse, year_start, "data_for_features") 107 | print(f"{datetime.now()}: Done, vertex_features; elapsed_time: {time.time() - start_time}") 108 | with open(logs_file_name+".txt", "a") as myfile: 109 | myfile.write(f"\n{datetime.now()}: Done, vertex_features; elapsed_time: {time.time() - start_time}") 110 | 111 | 112 | start_time=time.time() 113 | vc_feature_list=[] 114 | for yy in [year_start,year_start-1,year_start-2]: 115 | data_file=os.path.join("data_for_features", f"concept_node_citation_data_{yy}.parquet") 116 | vc_df=pd.read_parquet(data_file) 117 | vc_feature=vc_df.values 118 | vc_feature_list.append(vc_feature) 119 | 120 | vertex_cfeatures=get_all_node_cfeature(vc_feature_list) 121 | print(f"{datetime.now()}: Done, vertex_cfeatures; elapsed_time: {time.time() - start_time}") 122 | with open(logs_file_name+".txt", "a") as myfile: 123 | myfile.write(f"\n{datetime.now()}: Done, vertex_cfeatures; elapsed_time: {time.time() - start_time}") 124 | 125 | 126 | 127 | time_start = time.time() 128 | eval_pair_solution=eval_data_pair_solution.values 129 | 
unconnected_vertex_pairs=eval_pair_solution[:,:2] 130 | 131 | pair_features, pair_cfeatures=get_all_pair_features(vc_feature_list, node_neighbor_list, num_neighbor_list, unconnected_vertex_pairs, logs_file_name) 132 | 133 | all_features=[vertex_features, vertex_cfeatures, pair_features, pair_cfeatures] 134 | 135 | eval_data_features=get_all_feature(all_features, unconnected_vertex_pairs, logs_file_name) 136 | 137 | print(f"finish; {len(eval_data_features)}; time: {time.time()-time_start}") 138 | with open(logs_file_name+".txt", "a") as myfile: 139 | myfile.write(f"\nfinish; {len(eval_data_features)}; time: {time.time()-time_start}") 140 | 141 | 142 | ###----- store eval_data_pair_feature -----### 143 | time_start = time.time() 144 | store_name=os.path.join(store_eval_folder,"eval_data_pair_feature_condition.parquet") 145 | data_eval_2022 = pd.DataFrame(eval_data_features) 146 | 147 | # Convert column names to string 148 | data_eval_2022.columns = data_eval_2022.columns.astype(str) 149 | data_eval_2022.to_parquet(store_name, compression='gzip') 150 | 151 | print(f"store data_eval_2022: {len(data_eval_2022)}; elapsed_time: {time.time() - time_start}") 152 | with open(logs_file_name+".txt", "a") as myfile: 153 | myfile.write(f"\nstore data_eval_2022: {len(data_eval_2022)}; elapsed_time: {time.time() - time_start}") 154 | myfile.write(f"\n\n{datetime.now()}: {time.time() - time_start_begin}") 155 | 156 | -------------------------------------------------------------------------------- /prepare_node_pair_citation_data_years.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "434dcf89-1cc0-4077-9c3e-d893f55838c9", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "from datetime import datetime, date\n", 12 | "import random, time\n", 13 | "import numpy as np\n", 14 | "import matplotlib.pyplot as plt\n", 15 | "import pandas as pd\n", 16 | "import torch\n", 17 | "from torch import nn\n", 18 | "from scipy import sparse\n", 19 | "from collections import defaultdict\n", 20 | "import pandas as pd\n", 21 | "import networkx as nx\n", 22 | "import copy\n", 23 | "import gzip\n", 24 | "import pickle\n", 25 | "from scipy.stats import rankdata\n", 26 | "import time" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "cfc6ed42-d6a2-46de-9b8a-bc5994d96b1f", 32 | "metadata": {}, 33 | "source": [ 34 | "### single concept's citation features" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "4f99ff28-b872-4777-8df2-8cfbd1a5031b", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "time_start = time.time()\n", 45 | "data_folder=\"data_concept_graph\"\n", 46 | "\n", 47 | "# Read all concepts together with time, citation information\n", 48 | "dynamic_concept_file=os.path.join(data_folder,\"full_dynamic_concept.parquet\")\n", 49 | "full_concepts_dynamic_data = pd.read_parquet(dynamic_concept_file)\n", 50 | "\n", 51 | "# Read all concepts from full_concepts_for_openalex.txt\n", 52 | "concepts_files = os.path.join(data_folder, 'full_domain_concepts.txt')\n", 53 | "with open(concepts_files, 'r') as file:\n", 54 | " full_concepts = [concept.strip() for concept in file.readlines()]\n", 55 | "\n", 56 | "print(f\"Done, elapsed_time: {time.time() - time_start}\\n full_concepts_dynamic_data: {len(full_concepts_dynamic_data)};\\n full_concept: {len(full_concepts)}\")\n" 57 | ] 58 | }, 59 | { 60 | "cell_type": 
"code", 61 | "execution_count": null, 62 | "id": "c610f570-ce15-4f5b-9bb3-1640bc0a7cab", 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "NUM_OF_VERTICES=len(full_concepts)\n", 67 | "vertex_degree_cutoff=1\n", 68 | "years_delta=3\n", 69 | "min_edges=1" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "977a6454-4b41-4a62-ad2f-ace410365751", 76 | "metadata": { 77 | "tags": [] 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "\n", 82 | "years=[2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]\n", 83 | "\n", 84 | "day_origin = date(1990,1,1)\n", 85 | "all_concepts_df = pd.DataFrame({'v1': range(0, NUM_OF_VERTICES)})\n", 86 | "\n", 87 | "store_folder=\"data_for_features\"\n", 88 | "if not os.path.exists(store_folder):\n", 89 | " os.makedirs(store_folder)\n", 90 | "\n", 91 | "start_time=time.time()\n", 92 | "for yy in years: \n", 93 | " print(f'Year: {yy}')\n", 94 | " day_curr=(date(yy,12,31)- day_origin).days\n", 95 | " columns_to_subtract = [f'c{i}' for i in range(2023, yy, -1)]\n", 96 | " print(columns_to_subtract)\n", 97 | " cols_to_sum = [f'c{i}' for i in range(yy, yy-years_delta, -1)]\n", 98 | " print(cols_to_sum)\n", 99 | " \n", 100 | " dynamic_concepts=full_concepts_dynamic_data[full_concepts_dynamic_data['time']<=day_curr]\n", 101 | " dynamic_concepts_df = dynamic_concepts.copy()\n", 102 | " \n", 103 | " dynamic_concepts_df[f'ct_{yy}'] = dynamic_concepts_df['ct'] - dynamic_concepts_df[columns_to_subtract].sum(axis=1)\n", 104 | " \n", 105 | " dynamic_concepts_df['ct_delta'] = dynamic_concepts_df[cols_to_sum].sum(axis=1)\n", 106 | " \n", 107 | " dynamic_concepts_df=dynamic_concepts_df[['v1', f'c{yy}', f'ct_{yy}', 'ct_delta']]\n", 108 | " \n", 109 | " dynamic_concepts_grouped = dynamic_concepts_df.groupby('v1').agg({f'c{yy}':'sum', f'ct_{yy}':'sum', 'ct_delta':'sum', 'v1':'size'}).rename(columns={'v1':f'num'}).reset_index()\n", 110 | " \n", 111 | " dynamic_concepts_grouped[f'c{yy}_m'] = dynamic_concepts_grouped[f'c{yy}'] / dynamic_concepts_grouped[f'num']\n", 112 | " dynamic_concepts_grouped[f'ct_{yy}_m'] = dynamic_concepts_grouped[f'ct_{yy}'] / dynamic_concepts_grouped[f'num']\n", 113 | " dynamic_concepts_grouped[f'ct_delta_m'] = dynamic_concepts_grouped['ct_delta'] / dynamic_concepts_grouped[f'num']\n", 114 | " \n", 115 | " \n", 116 | " # Merge with all_concepts_df\n", 117 | " dynamic_concepts_data = pd.merge(all_concepts_df, dynamic_concepts_grouped, on='v1', how='left')\n", 118 | " dynamic_concepts_data.fillna(0, inplace=True) # Fill NaN values with 0\n", 119 | " dynamic_concepts_data.sort_values(by='v1')\n", 120 | " \n", 121 | " data_file = os.path.join(store_folder, f\"concept_node_citation_data_{yy}.parquet\")\n", 122 | " dynamic_concepts_data.to_parquet(data_file, compression='gzip')\n", 123 | " print(f\"in {yy}; time: {time.time()-start_time}\\n\")\n", 124 | " start_time=time.time()\n" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "id": "f1147d2c-2fb7-4f12-a89f-e26d3a2d4689", 130 | "metadata": {}, 131 | "source": [ 132 | "### concept pair's citation features" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "aed3dd3b-eb2a-474c-8665-9743514d55d2", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "time_start = time.time()\n", 143 | "data_folder=\"data_concept_graph\"\n", 144 | "\n", 145 | "# Read all concepts together with time, citation information\n", 146 | "graph_file=os.path.join(data_folder,\"full_dynamic_graph.parquet\")\n", 147 | 
"full_edge_dynamic_data = pd.read_parquet(graph_file)\n", 148 | "\n", 149 | "print(f\"Done, elapsed_time: {time.time() - time_start}\\n full_edge_dynamic_data: {len(full_edge_dynamic_data)};\\n\")\n" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "e3f5fc43-a136-4959-8b29-1717eca77bbe", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "\n", 160 | "years=[2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]\n", 161 | "\n", 162 | "day_origin = date(1990,1,1)\n", 163 | " \n", 164 | "store_folder=\"data_for_features\"\n", 165 | "start_time=time.time()\n", 166 | "for yy in years: \n", 167 | " print(f'Year: {yy}')\n", 168 | " day_curr=(date(yy,12,31)- day_origin).days\n", 169 | " columns_to_subtract = [f'c{i}' for i in range(2023, yy, -1)]\n", 170 | " print(columns_to_subtract)\n", 171 | " cols_to_sum = [f'c{i}' for i in range(yy, yy-years_delta, -1)]\n", 172 | " print(cols_to_sum)\n", 173 | " \n", 174 | " dynamic_pairs=full_edge_dynamic_data[full_edge_dynamic_data['time']<=day_curr]\n", 175 | " dynamic_pairs_df = dynamic_pairs.copy()\n", 176 | " \n", 177 | " dynamic_pairs_df[f'ct_{yy}'] = dynamic_pairs_df['ct'] - dynamic_pairs_df[columns_to_subtract].sum(axis=1)\n", 178 | " \n", 179 | " dynamic_pairs_df['ct_delta'] = dynamic_pairs_df[cols_to_sum].sum(axis=1)\n", 180 | " \n", 181 | " dynamic_pairs_df=dynamic_pairs_df[['v1', 'v2', f'c{yy}', f'ct_{yy}', 'ct_delta']]\n", 182 | " \n", 183 | " dynamic_pairs_grouped = dynamic_pairs_df.groupby(['v1','v2']).agg({f'c{yy}':'sum', f'ct_{yy}':'sum', 'ct_delta':'sum', 'v1':'size'}).rename(columns={'v1':f'num'}).reset_index()\n", 184 | " \n", 185 | " dynamic_pairs_grouped[f'c{yy}_m'] = dynamic_pairs_grouped[f'c{yy}'] / dynamic_pairs_grouped[f'num']\n", 186 | " dynamic_pairs_grouped[f'ct_{yy}_m'] = dynamic_pairs_grouped[f'ct_{yy}'] / dynamic_pairs_grouped[f'num']\n", 187 | " dynamic_pairs_grouped[f'ct_delta_m'] = dynamic_pairs_grouped['ct_delta'] / dynamic_pairs_grouped[f'num']\n", 188 | " \n", 189 | " data_file = os.path.join(store_folder, f\"concept_pair_citation_data_{yy}.parquet\")\n", 190 | " dynamic_pairs_grouped.to_parquet(data_file, compression='gzip')\n", 191 | " print(f\"in {yy}; time: {time.time()-start_time}\\n\")\n", 192 | " start_time=time.time()\n", 193 | " " 194 | ] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "asl_semnet", 200 | "language": "python", 201 | "name": "asl_semnet" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": { 205 | "name": "ipython", 206 | "version": 3 207 | }, 208 | "file_extension": ".py", 209 | "mimetype": "text/x-python", 210 | "name": "python", 211 | "nbconvert_exporter": "python", 212 | "pygments_lexer": "ipython3", 213 | "version": "3.10.9" 214 | } 215 | }, 216 | "nbformat": 4, 217 | "nbformat_minor": 5 218 | } 219 | -------------------------------------------------------------------------------- /preprocess_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import random, time 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from scipy import sparse 9 | from scipy.stats import rankdata 10 | import networkx as nx 11 | import pandas as pd 12 | from collections import defaultdict,Counter 13 | from datetime import datetime, date 14 | from itertools import combinations 15 | from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report 16 | from sklearn.metrics import 
precision_recall_curve, average_precision_score, confusion_matrix
17 | from features_utils import *
18 | 
19 | 
20 | def prepare_split_datasets(full_train_data, user_parameter, logs_file_name):
21 |     """
22 |     Split the whole dataset (unconnected pairs) into two classes, positive and negative samples
23 | 
24 |     Parameters:
25 |     - full_train_data: full dataset for training or testing
26 |     - user_parameter: some user parameters which are num_class, IR_num, split_type, out_norm
27 |     - logs_file_name: base name of the run's log file; the function appends "_logs.txt" when writing
28 | 
29 |     return:
30 |     data_subsets: negative and positive samples
31 |     """
32 | 
33 |     num_class, IR_num, split_type, out_norm = user_parameter
34 | 
35 |     time_start=time.time()
36 |     data_subsets = []
37 |     if split_type==0: ## IR_num=[100]; namely negative: citation < IR_num[0], positive: citation >= IR_num[0]
38 |         data_subsets.append(full_train_data[full_train_data[:, 2] < IR_num[0]])  # negative samples, citations below IR_num[0]
39 |         data_subsets.append(full_train_data[full_train_data[:, 2] >= IR_num[0]]) # get all the positive samples, which are the citations >= IR_num[0]
40 |     else: # conditional case
41 |         for i in range(len(IR_num)-1): # e.g., IR_num=[[0,5], 100]; namely negative: citation in [0,5], positive: citation >= 100
42 |             data_subsets.append(full_train_data[(full_train_data[:, 2] >= IR_num[i][0]) & (full_train_data[:, 2] <= IR_num[i][1])])
43 |         data_subsets.append(full_train_data[full_train_data[:, 2] >= IR_num[-1]])
44 | 
45 |     if split_type==0:
46 |         if len(data_subsets[0])<=3*10**7: # usually not met, since data_subsets[0] holds more than 600 million pairs
47 |             num_row_chose=len(data_subsets[0])
48 |         else:
49 |             num_row_chose=min(len(data_subsets[0]),len(data_subsets[1])) # take the same size as the positive cases
50 |     else: # conditional case
51 |         num_row_chose=len(data_subsets[0])
52 | 
53 |     indices = np.random.choice(data_subsets[0].shape[0], size=num_row_chose, replace=False)
54 |     data_subsets[0] = data_subsets[0][indices] # randomly choose num_row_chose negative samples
55 | 
56 |     print(f"dataset len: {len(data_subsets[0])}, {len(data_subsets[1])}; num_row_chose: {num_row_chose}; {time.time()-time_start}s")
57 |     with open(logs_file_name+"_logs.txt", "a") as myfile:
58 |         myfile.write(f"\ndataset len: {len(data_subsets[0])}, {len(data_subsets[1])}; num_row_chose: {num_row_chose}; {time.time()-time_start}s")
59 | 
60 | 
61 |     return data_subsets
62 | 
63 | 
64 | def shuffle_split_datasets(data_subsets, train_valid_test_size):
65 |     """
66 |     Split the dataset into training and testing sets
67 | 
68 |     Parameters:
69 |     - data_subsets: the prepared negative and positive samples (unconnected pairs)
70 |     - train_valid_test_size: split ratio, here train_valid_test_size=[0.85, 0.15, 0.0]; 85% for training, 15% for testing; the evaluation set uses future data
71 | 
72 |     return:
73 |     dataset_train: training dataset
74 |     dataset_test: testing dataset
75 |     """
76 |     dataset_train = []
77 |     dataset_test = []
78 |     for subset in data_subsets:
79 |         np.random.shuffle(subset)
80 |         idx_train = int(len(subset) * train_valid_test_size[0])
81 |         train_set = subset[:idx_train]
82 |         test_set = subset[idx_train:]
83 |         dataset_train.append(train_set)
84 |         dataset_test.append(test_set)
85 | 
86 |     return dataset_train, dataset_test
87 | 
88 | 
89 | def get_pair_solution_datasets(data_subsets, hyper_parameter, user_parameter, logs_file_name):
90 |     """
91 |     prepare the unconnected pairs in year y and their corresponding future citation solutions
92 | 
93 |     Parameters:
94 |     - data_subsets: the prepared negative and positive samples (unconnected pairs)
95 |     - hyper_parameter: batch_size, lr_enc, rnd_seed
96 |     - user_parameter: not used here
97 |     - logs_file_name: txt
file for logging the running status 98 | 99 | return: 100 | train_edge_pair: dataset (unconnected pairs) 101 | train_edge_solution: the corresponding citations 102 | """ 103 | num_class, IR_num, split_type, out_norm = user_parameter # not used here 104 | batch_size, lr_enc, rnd_seed=hyper_parameter 105 | start_time=time.time() 106 | 107 | for idx, subset in enumerate(data_subsets): 108 | 109 | min_num_row=min(batch_size, len(subset)) 110 | num_new_samples = len(subset) - min_num_row ## data_subsets[0]>=batch_size 111 | if num_new_samples<0: # data_subsets[0]= IR_num[0]]) # features for positive samples 152 | else: # conditional case 153 | for i in range(len(IR_num) - 1): 154 | data_input.append(data_feature[(data_solution >= IR_num[i][0]) & (data_solution <=IR_num[i][1])]) 155 | data_input.append(data_feature[data_solution >= IR_num[-1]]) 156 | 157 | print(f"\n finish split_data_features: {time.time()-start_time}") 158 | with open(logs_file_name+"_logs.txt", "a") as myfile: 159 | myfile.write(f"\n finish split_data_features: {time.time()-start_time}") 160 | 161 | return data_feature, data_input 162 | 163 | 164 | 165 | ######### classify_solution ############### 166 | def classify_solution(data_solution, user_parameter): 167 | """ 168 | classfiy the citation solution into 0 and 1 classes 169 | 170 | Parameters: 171 | - data_solution: the citation solution 172 | - user_parameter: num_class, IR_num, split_type, out_norm 173 | 174 | return: 175 | solution_arr: 0-1 numpy array, 0 for negative samples, 1 for postive samples 176 | """ 177 | num_class, IR_num, split_type, out_norm = user_parameter 178 | solution_arr = np.zeros_like(data_solution) 179 | 180 | if split_type==0: ## only binary 181 | solution_arr[data_solution < IR_num[0]] = 0 182 | solution_arr[data_solution >= IR_num[0]] = 1 183 | 184 | else: # conditional case 185 | for i in range(len(IR_num) - 1): # in this work, IR_num=[[0,5], 100] 186 | solution_arr[(data_solution >= IR_num[i][0]) & (data_solution <=IR_num[i][1])] = i # in this case, 0 187 | solution_arr[data_solution >= IR_num[-1]] = len(IR_num) - 1 # in this case, 1 188 | 189 | return solution_arr 190 | 191 | 192 | 193 | 194 | 195 | -------------------------------------------------------------------------------- /train_model_2019_condition.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import torch 6 | from torch import nn 7 | import torch.nn.functional as F 8 | import random, time 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | from scipy import sparse 12 | from scipy.stats import rankdata 13 | import networkx as nx 14 | import pandas as pd 15 | from collections import defaultdict,Counter 16 | from datetime import datetime, date 17 | from itertools import combinations 18 | from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, auc 19 | from general_utils import * 20 | from features_utils import * 21 | from preprocess_utils import * 22 | from train_model_utils import * 23 | 24 | 25 | 26 | rn_time=random.random()*30 27 | time.sleep(rn_time) 28 | 29 | if __name__ == '__main__': 30 | 31 | 32 | split_type=1 # 1 is for conditional case 33 | out_norm=False # we fix this to False, using the raw scores from the neural network output 34 | num_class=2 # binary classfication, fixed 35 | day_origin = date(1990,1,1) # the baseline time 36 | 37 | vertex_degree_cutoff=1 # fixed, the vertex has at least one edge connecting to it 38 | min_edges=1 # fixed, 
minimal number of edges that is considered, not used in the work, can be removed 39 | years_delta=3 # year gap is 3 years 40 | year_start=2019-years_delta # train 2016 for 2019 41 | 42 | graph_parameter=[year_start,years_delta,vertex_degree_cutoff, min_edges] # parameters for the knowledge graph 43 | 44 | # create folders and subfolders 45 | # it will create a main folder: 2016_train_condition that contains subfolders: t0_c2_log, t0_c2_net, t0_c2_loss, t0_c2_curve, t0_c2_result 46 | save_folders, log_folder=make_folders(year_start, split_type, num_class, "train_condition") 47 | 48 | log_run=os.path.join(log_folder,f"train_model_{year_start+years_delta}_run_1") # just a log file to check the running status 49 | with open(log_run+"_logs.txt", "a") as myfile: 50 | myfile.write(f"\n\nstart: {datetime.now()}\n") 51 | 52 | # load the full dynamic graph 53 | start_time = time.time() 54 | data_folder="data_concept_graph" # folder that stores the full knowledge graph 55 | graph_file=os.path.join(data_folder,"full_dynamic_graph.parquet") 56 | full_dynamic_graph = pd.read_parquet(graph_file) 57 | with open(log_run+"_logs.txt", "a") as myfile: 58 | myfile.write(f"\n{datetime.now()}: Done, read full_dynamic_graph: {len(full_dynamic_graph)}; elapsed_time: {time.time() - start_time}") 59 | 60 | # load data for preparing different type of features 61 | feature_folder="data_for_features" # folder that stores data used for preparing features 62 | start_time=time.time() 63 | adj_mat_sparse=[] 64 | node_neighbor_list=[] 65 | num_neighbor_list=[] 66 | for yy in [year_start,year_start-1,year_start-2]: 67 | data_file=os.path.join(feature_folder, f"adjacency_matrix_{yy}.gz") # load the adjacency_matrix file 68 | adj_mat=get_adjacency_matrix(full_dynamic_graph, yy, data_file) 69 | adj_mat_sparse.append(adj_mat) 70 | 71 | curr_node_neighbor=get_node_neighbor(adj_mat) 72 | node_neighbor_list.append(curr_node_neighbor) 73 | 74 | curr_num_neighbor = np.array(adj_mat.sum(axis=0)).flatten() # array 75 | num_neighbor_list.append(curr_num_neighbor) 76 | 77 | with open(log_run+"_logs.txt", "a") as myfile: 78 | myfile.write(f"\n{datetime.now()}: Done, adjacency_matrix_sparse; elapsed_time: {time.time() - start_time}") 79 | 80 | start_time=time.time() 81 | vertex_features=get_all_node_feature(adj_mat_sparse, year_start, feature_folder) # prepare all the node features for a vertex in years y, y-1, y-2 82 | 83 | # load data for preparing different type of citation features 84 | start_time=time.time() 85 | vc_feature_list=[] 86 | for yy in [year_start,year_start-1,year_start-2]: 87 | data_file=os.path.join(feature_folder, f"concept_node_citation_data_{yy}.parquet") # load the citation information for concepts in year yy 88 | vc_df=pd.read_parquet(data_file) 89 | vc_feature=vc_df.values 90 | vc_feature_list.append(vc_feature) 91 | 92 | vertex_cfeatures=get_all_node_cfeature(vc_feature_list) 93 | with open(log_run+"_logs.txt", "a") as myfile: 94 | myfile.write(f"\n{datetime.now()}: Done, vertex_cfeatures; elapsed_time: {time.time() - start_time}") 95 | 96 | pair_cf_parameter=[vc_feature_list, node_neighbor_list, num_neighbor_list, vertex_features, vertex_cfeatures] # later used for pair features and cfeatures 97 | 98 | # load the whole unconnected pairs for training and testing 99 | train_data_folder = 'data_pair_solution' 100 | train_pair_file1=os.path.join(train_data_folder,f"unconnected_{year_start}_pair_solution_connected_{year_start+years_delta}_clean.parquet") 101 | 102 | time_start = time.time() 103 | train_pair_data_yes = 
pd.read_parquet(train_pair_file1) 104 | with open(log_run+"_logs.txt", "a") as myfile: 105 | myfile.write(f"\nDone, read unconnected_{year_start}_pair_solution_connected_{year_start+years_delta}: {len(train_pair_data_yes)}; elapsed_time: {time.time() - time_start}") 106 | 107 | time_start = time.time() 108 | full_train_data=train_pair_data_yes.values 109 | with open(log_run+"_logs.txt", "a") as myfile: 110 | myfile.write(f"\nDone, combine all: {len(full_train_data)}; elapsed_time: {time.time() - time_start}") 111 | 112 | full_dynamic_graph=pd.DataFrame() 113 | train_pair_data_yes=pd.DataFrame() 114 | 115 | # load the evaluation data feature and solutions 116 | eval_folder="data_eval" # folder that stores the evaluatuion datasets, unconnected pairs, features, solutions 117 | start_time = time.time() 118 | eval_file=os.path.join(eval_folder,"eval_data_pair_solution_condition.parquet") 119 | eval_data_features_df = pd.read_parquet(eval_file) 120 | eval_data_solution=eval_data_features_df.values 121 | eval_data_features_df=pd.DataFrame() 122 | with open(log_run+"_logs.txt", "a") as myfile: 123 | myfile.write(f"finish loading eval_data_features; {time.time()-start_time}") 124 | 125 | 126 | start_time = time.time() 127 | eval_file=os.path.join(eval_folder,"eval_data_pair_feature_condition.parquet") 128 | eval_data_features_df = pd.read_parquet(eval_file) 129 | eval_data_features=eval_data_features_df.values 130 | eval_data_features_df=pd.DataFrame() 131 | with open(log_run+"_logs.txt", "a") as myfile: 132 | myfile.write(f"\nfinish loading eval_data_solution; {time.time()-start_time}") 133 | 134 | 135 | IR_min=[5] 136 | IR_max=[100] 137 | 138 | for id_min in IR_min: 139 | 140 | for id_max in IR_max: 141 | 142 | IR_num=[[0,id_min], id_max] # IR_num=[[0,5], 100] 143 | IR_Str=format_IR(IR_num, split_type) 144 | 145 | logs_file_name=os.path.join(log_folder,f"train_model_{year_start+years_delta}_"+IR_Str) 146 | if not os.path.exists(logs_file_name+"_logs.txt"): 147 | current_time=datetime.now() 148 | open(logs_file_name+"_logs.txt", 'a').close() 149 | 150 | batch_size=1000 151 | lr_enc=3*10**-5 152 | rnd_seed=42 153 | hyper_parameter=[batch_size, lr_enc, rnd_seed] 154 | graph_parameter=[year_start,years_delta,vertex_degree_cutoff, min_edges] 155 | user_parameter=[num_class, IR_num, split_type, out_norm] 156 | 157 | impact_classfication(full_train_data, eval_data_features, eval_data_solution[:,2], pair_cf_parameter, hyper_parameter, graph_parameter, user_parameter, save_folders, logs_file_name) 158 | 159 | rn_time=random.random()*30 160 | time.sleep(rn_time) 161 | 162 | else: 163 | pass 164 | 165 | with open(log_run+"_logs.txt", "a") as myfile: 166 | myfile.write(f"\nfinish: {datetime.now()}\n") 167 | 168 | -------------------------------------------------------------------------------- /train_model_2019_individual_feature.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import torch 6 | from torch import nn 7 | import torch.nn.functional as F 8 | import random, time 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | from scipy import sparse 12 | from scipy.stats import rankdata 13 | import networkx as nx 14 | import pandas as pd 15 | from collections import defaultdict,Counter 16 | from datetime import datetime, date 17 | from itertools import combinations 18 | from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, auc 19 | from general_utils import * 20 | from 
preprocess_utils import * 21 | from features_utils import * 22 | from train_model_utils import * 23 | 24 | 25 | 26 | rn_time=random.random()*30 27 | time.sleep(rn_time) 28 | 29 | if __name__ == '__main__': 30 | 31 | 32 | split_type=0 # 1 is for conditional case 33 | out_norm=False # we fix this to False, using the raw scores from the neural network output 34 | num_class=2 # binary classfication, fixed 35 | day_origin = date(1990,1,1) # the baseline time 36 | 37 | vertex_degree_cutoff=1 # fixed, the vertex has at least one edge connecting to it 38 | min_edges=1 # fixed, minimal number of edges that is considered, not used in the work, can be removed 39 | years_delta=3 # year gap is 3 years 40 | year_start=2019-years_delta # train 2016 for 2019 41 | 42 | graph_parameter=[year_start, years_delta, vertex_degree_cutoff, min_edges] # parameters for the knowledge graph 43 | 44 | # create folders and subfolders 45 | # it will create a main folder: 2016_train_each that contains subfolders: t0_c2_log, t0_c2_net, t0_c2_loss, t0_c2_curve, t0_c2_result 46 | save_folders, log_folder=make_folders(year_start, split_type, num_class, "train_each") 47 | 48 | log_run=os.path.join(log_folder,f"train_model_{year_start+years_delta}_single_run") # just a log file to check the running status 49 | with open(log_run+"_logs.txt", "a") as myfile: 50 | myfile.write(f"\n\nstart: {datetime.now()}\n") 51 | 52 | # load the full dynamic graph 53 | start_time = time.time() 54 | data_folder="data_concept_graph" # folder that stores the full knowledge graph 55 | graph_file=os.path.join(data_folder,"full_dynamic_graph.parquet") 56 | full_dynamic_graph = pd.read_parquet(graph_file) 57 | with open(log_run+"_logs.txt", "a") as myfile: 58 | myfile.write(f"\n{datetime.now()}: Done, read full_dynamic_graph: {len(full_dynamic_graph)}; elapsed_time: {time.time() - start_time}") 59 | 60 | # load data for preparing different type of features 61 | feature_folder="data_for_features" # folder that stores data used for preparing features 62 | start_time=time.time() 63 | adj_mat_sparse=[] 64 | node_neighbor_list=[] 65 | num_neighbor_list=[] 66 | for yy in [year_start,year_start-1,year_start-2]: 67 | data_file=os.path.join(feature_folder, f"adjacency_matrix_{yy}.gz") 68 | adj_mat=get_adjacency_matrix(full_dynamic_graph, yy, data_file) 69 | adj_mat_sparse.append(adj_mat) 70 | 71 | curr_node_neighbor=get_node_neighbor(adj_mat) 72 | node_neighbor_list.append(curr_node_neighbor) 73 | 74 | curr_num_neighbor = np.array(adj_mat.sum(axis=0)).flatten() # array 75 | num_neighbor_list.append(curr_num_neighbor) 76 | 77 | with open(log_run+"_logs.txt", "a") as myfile: 78 | myfile.write(f"\n{datetime.now()}: Done, adjacency_matrix_sparse; elapsed_time: {time.time() - start_time}") 79 | 80 | start_time=time.time() 81 | vertex_features=get_all_node_feature(adj_mat_sparse, year_start, feature_folder) 82 | 83 | # load data for preparing different type of citation features 84 | start_time=time.time() 85 | vc_feature_list=[] 86 | for yy in [year_start,year_start-1,year_start-2]: 87 | data_file=os.path.join(feature_folder, f"concept_node_citation_data_{yy}.parquet") 88 | vc_df=pd.read_parquet(data_file) 89 | vc_feature=vc_df.values 90 | vc_feature_list.append(vc_feature) 91 | 92 | vertex_cfeatures=get_all_node_cfeature(vc_feature_list) 93 | with open(log_run+"_logs.txt", "a") as myfile: 94 | myfile.write(f"\n{datetime.now()}: Done, vertex_cfeatures; elapsed_time: {time.time() - start_time}") 95 | 96 | pair_cf_parameter=[vc_feature_list, node_neighbor_list, 
num_neighbor_list, vertex_features, vertex_cfeatures] 97 | 98 | # load the whole unconnected pairs for training and testing 99 | train_data_folder = 'data_pair_solution' # folder that stores the unconnected pairs and their citation informations in the future 100 | train_pair_file1=os.path.join(train_data_folder,f"unconnected_{year_start}_pair_solution_connected_{year_start+years_delta}_clean.parquet") 101 | train_pair_file2=os.path.join(train_data_folder,f"unconnected_{year_start}_pair_solution_unconnected_{year_start+years_delta}.parquet") 102 | 103 | time_start = time.time() 104 | train_pair_data_yes = pd.read_parquet(train_pair_file1) 105 | with open(log_run+"_logs.txt", "a") as myfile: 106 | myfile.write(f"\nDone, read unconnected_{year_start}_pair_solution_connected_{year_start+years_delta}: {len(train_pair_data_yes)}; elapsed_time: {time.time() - time_start}") 107 | 108 | time_start = time.time() 109 | train_pair_data_no = pd.read_parquet(train_pair_file2) 110 | with open(log_run+"_logs.txt", "a") as myfile: 111 | myfile.write(f"\nDone, read unconnected_{year_start}_pair_solution_unconnected_{year_start+years_delta}: {len(train_pair_data_no)}; elapsed_time: {time.time() - time_start}") 112 | 113 | time_start = time.time() 114 | full_train_data=np.concatenate((train_pair_data_yes.values, train_pair_data_no.values), axis=0) 115 | with open(log_run+"_logs.txt", "a") as myfile: 116 | myfile.write(f"\nDone, combine all: {len(full_train_data)}; elapsed_time: {time.time() - time_start}") 117 | 118 | full_dynamic_graph=pd.DataFrame() 119 | train_pair_data_yes=pd.DataFrame() 120 | train_pair_data_no=pd.DataFrame() 121 | 122 | # load the evaluation data feature and solutions 123 | eval_folder="data_eval" # folder that stores the evaluatuion datasets, unconnected pairs, features, solutions 124 | start_time = time.time() 125 | eval_file=os.path.join(eval_folder,"eval_data_pair_solution.parquet") 126 | eval_data_features_df = pd.read_parquet(eval_file) 127 | eval_data_solution=eval_data_features_df.values 128 | eval_data_features_df=pd.DataFrame() 129 | with open(log_run+"_logs.txt", "a") as myfile: 130 | myfile.write(f"finish loading eval_data_features; {time.time()-start_time}") 131 | 132 | 133 | start_time = time.time() 134 | eval_file=os.path.join(eval_folder,"eval_data_pair_feature.parquet") 135 | eval_data_features_df = pd.read_parquet(eval_file) 136 | eval_data_features=eval_data_features_df.values 137 | eval_data_features_df=pd.DataFrame() 138 | with open(log_run+"_logs.txt", "a") as myfile: 139 | myfile.write(f"\nfinish loading eval_data_solution; {time.time()-start_time}") 140 | 141 | ## just run one case, IR=100 142 | num_impact=100 143 | IR_num=[num_impact] 144 | IR_Str=format_IR(IR_num, split_type) 145 | 146 | logs_file_name=os.path.join(log_folder,f"train_model_{year_start+years_delta}_"+IR_Str) 147 | open(logs_file_name+"_logs.txt", 'a').close() 148 | 149 | batch_size=1000 150 | lr_enc=3*10**-5 151 | rnd_seed=42 152 | hyper_parameter=[batch_size, lr_enc, rnd_seed] 153 | graph_parameter=[year_start,years_delta,vertex_degree_cutoff, min_edges] 154 | user_parameter=[num_class, IR_num, split_type, out_norm] 155 | 156 | impact_classfication_single_feature(full_train_data, eval_data_features, eval_data_solution[:,2], pair_cf_parameter, hyper_parameter, graph_parameter, user_parameter, save_folders, logs_file_name) 157 | 158 | with open(log_run+"_logs.txt", "a") as myfile: 159 | myfile.write(f"\nfinish: {datetime.now()}\n\n") 160 | 161 | 162 | 163 | 
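The training scripts above all reduce the prediction target to a 0/1 label derived from a pair's future citation count and the impact range IR_num (see classify_solution in preprocess_utils.py). Below is a minimal, self-contained sketch of that labelling convention; toy_classify and the example citation counts are illustrative stand-ins, not code from the repository, and the conditional branch only covers the two-group case IR_num=[[0,5], 100] used in this work.

import numpy as np

def toy_classify(citations, IR_num, split_type):
    # Toy illustration of the classify_solution() convention: citation counts -> 0/1 labels.
    labels = np.zeros_like(citations)
    if split_type == 0:                      # binary case, e.g. IR_num=[100]
        labels[citations >= IR_num[0]] = 1   # high-impact pairs
    else:                                    # conditional case, e.g. IR_num=[[0, 5], 100]
        labels[(citations >= IR_num[0][0]) & (citations <= IR_num[0][1])] = 0
        labels[citations >= IR_num[-1]] = 1
    return labels

citations = np.array([0, 3, 42, 150, 7])                      # toy future citation counts
print(toy_classify(citations, [100], split_type=0))           # -> [0 0 0 1 0]
print(toy_classify(citations, [[0, 5], 100], split_type=1))   # -> [0 0 0 1 0]

In the conditional case the real pipeline first drops pairs that fall outside both ranges (prepare_split_datasets keeps only the [0,5] and >=100 groups), so the 0 label is only ever assigned to the low-citation group.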
-------------------------------------------------------------------------------- /train_model_2019_run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import torch 6 | from torch import nn 7 | import torch.nn.functional as F 8 | import random, time 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | from scipy import sparse 12 | from scipy.stats import rankdata 13 | import networkx as nx 14 | import pandas as pd 15 | from collections import defaultdict,Counter 16 | from datetime import datetime, date 17 | from itertools import combinations 18 | from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, auc 19 | from general_utils import * 20 | from preprocess_utils import * 21 | from features_utils import * 22 | from train_model_utils import * 23 | 24 | 25 | 26 | rn_time=random.random()*30 27 | time.sleep(rn_time) 28 | 29 | if __name__ == '__main__': 30 | 31 | 32 | split_type=0 # 1 is for conditional case 33 | out_norm=False # we fix this to False, using the raw scores from the neural network output 34 | num_class=2 # binary classfication, fixed 35 | day_origin = date(1990,1,1) # the baseline time 36 | 37 | vertex_degree_cutoff=1 # fixed, the vertex has at least one edge connecting to it 38 | min_edges=1 # fixed, minimal number of edges that is considered, not used in the work, can be removed 39 | years_delta=3 # year gap is 3 years 40 | year_start=2019-years_delta # train 2016 for 2019 41 | 42 | graph_parameter=[year_start, years_delta, vertex_degree_cutoff, min_edges] # parameters for the knowledge graph 43 | 44 | # create folders and subfolders 45 | # it will create a main folder: 2016_train that contains subfolders: t0_c2_log, t0_c2_net, t0_c2_loss, t0_c2_curve, t0_c2_result 46 | save_folders, log_folder=make_folders(year_start, split_type, num_class, "train") 47 | 48 | log_run=os.path.join(log_folder,f"train_model_{year_start+years_delta}_run_1") # just a log file to check the running status 49 | with open(log_run+"_logs.txt", "a") as myfile: 50 | myfile.write(f"\n\nstart: {datetime.now()}\n") 51 | 52 | # load the full dynamic graph 53 | start_time = time.time() 54 | data_folder="data_concept_graph" # folder that stores the full knowledge graph 55 | graph_file=os.path.join(data_folder,"full_dynamic_graph.parquet") 56 | full_dynamic_graph = pd.read_parquet(graph_file) 57 | with open(log_run+"_logs.txt", "a") as myfile: 58 | myfile.write(f"\n{datetime.now()}: Done, read full_dynamic_graph: {len(full_dynamic_graph)}; elapsed_time: {time.time() - start_time}") 59 | 60 | # load data for preparing different type of features 61 | feature_folder="data_for_features" # folder that stores data used for preparing features 62 | start_time=time.time() 63 | adj_mat_sparse=[] 64 | node_neighbor_list=[] 65 | num_neighbor_list=[] 66 | for yy in [year_start,year_start-1,year_start-2]: 67 | data_file=os.path.join(feature_folder, f"adjacency_matrix_{yy}.gz") 68 | adj_mat=get_adjacency_matrix(full_dynamic_graph, yy, data_file) # load the adjacency_matrix file, if not exists, it will automatically generate one 69 | adj_mat_sparse.append(adj_mat) # adjacency_matrix for year y, y-1, y-2 70 | 71 | curr_node_neighbor=get_node_neighbor(adj_mat) 72 | node_neighbor_list.append(curr_node_neighbor) # get the neighbors of vertices for years y, y-1, y-2 73 | 74 | curr_num_neighbor = np.array(adj_mat.sum(axis=0)).flatten() # array 75 | num_neighbor_list.append(curr_num_neighbor) # get the 
number of neighbors of vertices for years y, y-1, y-2 76 | 77 | with open(log_run+"_logs.txt", "a") as myfile: 78 | myfile.write(f"\n{datetime.now()}: Done, adjacency_matrix_sparse; elapsed_time: {time.time() - start_time}") 79 | 80 | start_time=time.time() 81 | vertex_features=get_all_node_feature(adj_mat_sparse, year_start, feature_folder) # prepare all the node features for a vertex in years y, y-1, y-2 82 | 83 | # load data for preparing different type of citation features 84 | start_time=time.time() 85 | vc_feature_list=[] 86 | for yy in [year_start,year_start-1,year_start-2]: 87 | data_file=os.path.join(feature_folder, f"concept_node_citation_data_{yy}.parquet") # load the citation information for concepts in year yy 88 | vc_df=pd.read_parquet(data_file) 89 | vc_feature=vc_df.values 90 | vc_feature_list.append(vc_feature) 91 | 92 | vertex_cfeatures=get_all_node_cfeature(vc_feature_list) 93 | with open(log_run+"_logs.txt", "a") as myfile: 94 | myfile.write(f"\n{datetime.now()}: Done, vertex_cfeatures; elapsed_time: {time.time() - start_time}") 95 | 96 | pair_cf_parameter=[vc_feature_list, node_neighbor_list, num_neighbor_list, vertex_features, vertex_cfeatures] # later used for pair features and cfeatures 97 | 98 | # load the whole unconnected pairs for training and testing 99 | train_data_folder = 'data_pair_solution' # folder that stores the unconnected pairs and their citation informations in the future 100 | train_pair_file1=os.path.join(train_data_folder,f"unconnected_{year_start}_pair_solution_connected_{year_start+years_delta}_clean.parquet") 101 | train_pair_file2=os.path.join(train_data_folder,f"unconnected_{year_start}_pair_solution_unconnected_{year_start+years_delta}.parquet") 102 | 103 | time_start = time.time() 104 | train_pair_data_yes = pd.read_parquet(train_pair_file1) 105 | with open(log_run+"_logs.txt", "a") as myfile: 106 | myfile.write(f"\nDone, read unconnected_{year_start}_pair_solution_connected_{year_start+years_delta}: {len(train_pair_data_yes)}; elapsed_time: {time.time() - time_start}") 107 | 108 | time_start = time.time() 109 | train_pair_data_no = pd.read_parquet(train_pair_file2) 110 | with open(log_run+"_logs.txt", "a") as myfile: 111 | myfile.write(f"\nDone, read unconnected_{year_start}_pair_solution_unconnected_{year_start+years_delta}: {len(train_pair_data_no)}; elapsed_time: {time.time() - time_start}") 112 | 113 | time_start = time.time() 114 | full_train_data=np.concatenate((train_pair_data_yes.values, train_pair_data_no.values), axis=0) 115 | with open(log_run+"_logs.txt", "a") as myfile: 116 | myfile.write(f"\nDone, combine all: {len(full_train_data)}; elapsed_time: {time.time() - time_start}") 117 | 118 | full_dynamic_graph=pd.DataFrame() 119 | train_pair_data_yes=pd.DataFrame() 120 | train_pair_data_no=pd.DataFrame() 121 | 122 | # load the evaluation data feature and solutions 123 | eval_folder="data_eval" # folder that stores the evaluatuion datasets, unconnected pairs, features, solutions 124 | start_time = time.time() 125 | eval_file=os.path.join(eval_folder,"eval_data_pair_solution.parquet") 126 | eval_data_features_df = pd.read_parquet(eval_file) 127 | eval_data_solution=eval_data_features_df.values 128 | eval_data_features_df=pd.DataFrame() 129 | with open(log_run+"_logs.txt", "a") as myfile: 130 | myfile.write(f"finish loading eval_data_features; {time.time()-start_time}") 131 | 132 | 133 | start_time = time.time() 134 | eval_file=os.path.join(eval_folder,"eval_data_pair_feature.parquet") 135 | eval_data_features_df = 
pd.read_parquet(eval_file) 136 | eval_data_features=eval_data_features_df.values 137 | eval_data_features_df=pd.DataFrame() 138 | with open(log_run+"_logs.txt", "a") as myfile: 139 | myfile.write(f"\nfinish loading eval_data_solution; {time.time()-start_time}") 140 | 141 | # train neural networks for different IR number from 1 to 200 142 | IR_start=1 143 | IR_end=40 # IR_end=200 144 | IR_count=IR_start 145 | while IR_count <= IR_end: 146 | 147 | num_impact=random.randint(IR_start, IR_end) # used for parallel computing 148 | IR_num=[num_impact] 149 | IR_Str=format_IR(IR_num, split_type) 150 | 151 | logs_file_name=os.path.join(log_folder,f"train_model_{year_start+years_delta}_"+IR_Str) 152 | if not os.path.exists(logs_file_name+"_logs.txt"): 153 | current_time=datetime.now() 154 | open(logs_file_name+"_logs.txt", 'a').close() 155 | 156 | batch_size=1000 157 | lr_enc=3*10**-5 158 | rnd_seed=42 159 | hyper_parameter=[batch_size, lr_enc, rnd_seed] 160 | graph_parameter=[year_start,years_delta,vertex_degree_cutoff, min_edges] 161 | user_parameter=[num_class, IR_num, split_type, out_norm] 162 | 163 | impact_classfication(full_train_data, eval_data_features, eval_data_solution[:,2], pair_cf_parameter, hyper_parameter, graph_parameter, user_parameter, save_folders, logs_file_name) 164 | 165 | IR_count+=1 166 | rn_time=random.random()*30 167 | time.sleep(rn_time) 168 | 169 | else: 170 | pass 171 | 172 | with open(log_run+"_logs.txt", "a") as myfile: 173 | myfile.write(f"\nfinish: {datetime.now()}\n\n") 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /train_model_2022_run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import torch 6 | from torch import nn 7 | import torch.nn.functional as F 8 | import random, time 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | from scipy import sparse 12 | from scipy.stats import rankdata 13 | import networkx as nx 14 | import pandas as pd 15 | from collections import defaultdict,Counter 16 | from datetime import datetime, date 17 | from itertools import combinations 18 | from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, auc 19 | from general_utils import * 20 | from preprocess_utils import * 21 | from features_utils import * 22 | from train_model_utils import * 23 | 24 | 25 | 26 | rn_time=random.random()*30 27 | time.sleep(rn_time) 28 | 29 | if __name__ == '__main__': 30 | 31 | 32 | split_type=0 # 1 is for conditional case 33 | out_norm=False # we fix this to False, using the raw scores from the neural network output 34 | num_class=2 # binary classfication, fixed 35 | day_origin = date(1990,1,1) # the baseline time 36 | 37 | vertex_degree_cutoff=1 # fixed, the vertex has at least one edge connecting to it 38 | min_edges=1 # fixed, minimal number of edges that is considered, not used in the work, can be removed 39 | years_delta=3 # year gap is 3 years 40 | year_start=2022-years_delta # train 2019 for 2022 41 | 42 | graph_parameter=[year_start,years_delta,vertex_degree_cutoff, min_edges] # parameters for the knowledge graph 43 | 44 | # create folders and subfolders 45 | # it will create a main folder: 2019_train that contains subfolders: t0_c2_log, t0_c2_net, t0_c2_loss, t0_c2_curve, t0_c2_result 46 | save_folders, log_folder=make_folders(year_start, split_type, num_class, "train") 47 | 48 | log_run=os.path.join(log_folder,f"train_model_{year_start+years_delta}_run_1") # 
just a log file to check the running status 49 | with open(log_run+"_logs.txt", "a") as myfile: 50 | myfile.write(f"\n\nstart: {datetime.now()}\n") 51 | 52 | # load the full dynamic graph 53 | start_time = time.time() 54 | data_folder="data_concept_graph" # folder that stores the full knowledge graph 55 | graph_file=os.path.join(data_folder,"full_dynamic_graph.parquet") 56 | full_dynamic_graph = pd.read_parquet(graph_file) 57 | with open(log_run+"_logs.txt", "a") as myfile: 58 | myfile.write(f"\n{datetime.now()}: Done, read full_dynamic_graph: {len(full_dynamic_graph)}; elapsed_time: {time.time() - start_time}") 59 | 60 | # load data for preparing different type of features 61 | feature_folder="data_for_features" # folder that stores data used for preparing features 62 | start_time=time.time() 63 | adj_mat_sparse=[] 64 | node_neighbor_list=[] 65 | num_neighbor_list=[] 66 | for yy in [year_start,year_start-1,year_start-2]: 67 | data_file=os.path.join(feature_folder, f"adjacency_matrix_{yy}.gz") # load the adjacency_matrix file, if not exists, it will automatically generate one 68 | adj_mat=get_adjacency_matrix(full_dynamic_graph, yy, data_file) # adjacency_matrix for year y, y-1, y-2 69 | adj_mat_sparse.append(adj_mat) 70 | 71 | curr_node_neighbor=get_node_neighbor(adj_mat) 72 | node_neighbor_list.append(curr_node_neighbor) # get the neighbors of vertices for years y, y-1, y-2 73 | 74 | curr_num_neighbor = np.array(adj_mat.sum(axis=0)).flatten() 75 | num_neighbor_list.append(curr_num_neighbor) # get the number of neighbors of vertices for years y, y-1, y-2 76 | 77 | with open(log_run+"_logs.txt", "a") as myfile: 78 | myfile.write(f"\n{datetime.now()}: Done, adjacency_matrix_sparse; elapsed_time: {time.time() - start_time}") 79 | 80 | start_time=time.time() 81 | vertex_features=get_all_node_feature(adj_mat_sparse, year_start, feature_folder) # prepare all the node features for a vertex in years y, y-1, y-2 82 | 83 | # load data for preparing different type of citation features 84 | start_time=time.time() 85 | vc_feature_list=[] 86 | for yy in [year_start,year_start-1,year_start-2]: 87 | data_file=os.path.join(feature_folder, f"concept_node_citation_data_{yy}.parquet") # load the citation information for concepts in year yy 88 | vc_df=pd.read_parquet(data_file) 89 | vc_feature=vc_df.values 90 | vc_feature_list.append(vc_feature) 91 | 92 | vertex_cfeatures=get_all_node_cfeature(vc_feature_list) 93 | with open(log_run+"_logs.txt", "a") as myfile: 94 | myfile.write(f"\n{datetime.now()}: Done, vertex_cfeatures; elapsed_time: {time.time() - start_time}") 95 | 96 | pair_cf_parameter=[vc_feature_list, node_neighbor_list, num_neighbor_list, vertex_features, vertex_cfeatures] # later used for pair features and cfeatures 97 | 98 | # load the whole unconnected pairs for training and testing 99 | train_data_folder = 'data_pair_solution' 100 | train_pair_file1=os.path.join(train_data_folder,f"unconnected_{year_start}_pair_solution_connected_{year_start+years_delta}_clean.parquet") 101 | train_pair_file2=os.path.join(train_data_folder,f"unconnected_{year_start}_pair_solution_unconnected_{year_start+years_delta}.parquet") 102 | 103 | time_start = time.time() 104 | train_pair_data_yes = pd.read_parquet(train_pair_file1) 105 | with open(log_run+"_logs.txt", "a") as myfile: 106 | myfile.write(f"\nDone, read unconnected_{year_start}_pair_solution_connected_{year_start+years_delta}: {len(train_pair_data_yes)}; elapsed_time: {time.time() - time_start}") 107 | 108 | time_start = time.time() 109 | 
train_pair_data_no = pd.read_parquet(train_pair_file2) 110 | with open(log_run+"_logs.txt", "a") as myfile: 111 | myfile.write(f"\nDone, read unconnected_{year_start}_pair_solution_unconnected_{year_start+years_delta}: {len(train_pair_data_no)}; elapsed_time: {time.time() - time_start}") 112 | 113 | time_start = time.time() 114 | full_train_data=np.concatenate((train_pair_data_yes.values, train_pair_data_no.values), axis=0) 115 | with open(log_run+"_logs.txt", "a") as myfile: 116 | myfile.write(f"\nDone, combine all: {len(full_train_data)}; elapsed_time: {time.time() - time_start}") 117 | 118 | full_dynamic_graph=pd.DataFrame() 119 | train_pair_data_yes=pd.DataFrame() 120 | train_pair_data_no=pd.DataFrame() 121 | 122 | # train neural networks for different IR number from 1 to 200 123 | IR_start=1 124 | IR_end=40 # IR_end=200 125 | IR_count=IR_start 126 | while IR_count <= IR_end: 127 | 128 | num_impact=random.randint(IR_start, IR_end) # used for parallel computing 129 | IR_num=[num_impact] 130 | IR_Str=format_IR(IR_num, split_type) 131 | 132 | logs_file_name=os.path.join(log_folder,f"train_model_{year_start+years_delta}_"+IR_Str) 133 | if not os.path.exists(logs_file_name+"_logs.txt"): 134 | current_time=datetime.now() 135 | open(logs_file_name+"_logs.txt", 'a').close() 136 | 137 | batch_size=1000 138 | lr_enc=3*10**-5 139 | rnd_seed=42 140 | hyper_parameter=[batch_size, lr_enc, rnd_seed] 141 | graph_parameter=[year_start,years_delta,vertex_degree_cutoff, min_edges] 142 | user_parameter=[num_class, IR_num, split_type, out_norm] 143 | 144 | impact_classfication(full_train_data, [], [], pair_cf_parameter, hyper_parameter, graph_parameter, user_parameter, save_folders, logs_file_name) 145 | 146 | IR_count+=1 147 | rn_time=random.random()*30 148 | time.sleep(rn_time) 149 | 150 | else: 151 | pass 152 | 153 | with open(log_run+"_logs.txt", "a") as myfile: 154 | myfile.write(f"\nfinish: {datetime.now()}\n\n") 155 | 156 | 157 | 158 | --------------------------------------------------------------------------------
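Both train_model_2019_run.py and train_model_2022_run.py sweep the impact range by drawing a random IR value on each iteration and treating the existence of the per-IR log file as a claim, so that several copies of the same script launched in parallel on a cluster end up covering different IR values. The sketch below isolates that pattern; claim_and_train, train_fn and the IR_ file prefix are illustrative names only (the scripts themselves build the file name with format_IR and call impact_classfication).

import os, random, time

def claim_and_train(log_folder, IR_start=1, IR_end=40, train_fn=None):
    os.makedirs(log_folder, exist_ok=True)
    for _ in range(IR_start, IR_end + 1):              # one attempt per IR value, like the while loop in the scripts
        num_impact = random.randint(IR_start, IR_end)  # random pick spreads concurrent jobs over the range
        logs_file_name = os.path.join(log_folder, f"IR_{num_impact}")
        if os.path.exists(logs_file_name + "_logs.txt"):
            continue                                     # another job has already claimed this IR value
        open(logs_file_name + "_logs.txt", "a").close()  # claim it by creating the log file
        if train_fn is not None:
            train_fn(num_impact, logs_file_name)         # e.g. impact_classfication(...) in the real scripts
        time.sleep(random.random() * 30)                 # desynchronise before the next claim

This is best-effort coordination: two jobs can still pick the same IR value between the existence check and the file creation, and the random sleeps at start-up and between iterations only make such collisions unlikely.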