├── LICENSE ├── README.md ├── benchmark_code ├── loops_fcNN.py ├── loops_transformer.py ├── loops_tree.py ├── loops_xgboost.py ├── make_plot_loss_curve.py ├── make_plots.py ├── make_plots_mixed.py └── make_plots_single.py ├── create_concepts ├── Concept_Corpus │ ├── s0_get_preprint_metadata.ipynb │ ├── s1_make_metadate_arxivstyle.ipynb │ ├── s2_combine_all_preprint_metadate.ipynb │ ├── s3_get_concepts.ipynb │ └── s4_improve_concept.ipynb └── Domain_Concept │ ├── full_domain_concepts.txt │ ├── s0_prepare_optics_quantum_data.ipynb │ ├── s1_split_domain_papers.py │ ├── s2_get_domain_concepts.py │ ├── s3_merge_concepts.py │ ├── s4_improve_concepts.ipynb │ └── s5_improve_manually_concepts.py ├── create_dynamic_concepts ├── get_concept_citation.py ├── merge_concept_citation.py └── process_concept_to_pandas_frame.py ├── create_dynamic_edges ├── _get_openalex_workdata.py ├── _get_openalex_workdata_parallel_run1.py ├── get_concept_pairs.py ├── merge_concept_pairs.py └── process_edge_to_pandas_frame.py ├── features_utils.py ├── fpr_example ├── plot_FPR.py └── roc_curve_combined_highres.png ├── general_utils.py ├── miscellaneous ├── Fig2_NeuralNet.png ├── Impact4Cast.png └── KnowledgeGraph.png ├── prepare_adjacency_pagerank.py ├── prepare_eval_data ├── prepare_eval_feature_data.ipynb ├── prepare_eval_feature_data.py └── prepare_eval_feature_data_condition.py ├── prepare_node_pair_citation_data_years.ipynb ├── prepare_unconnected_pair_solution.ipynb ├── preprocess_utils.py ├── train_model_2019_condition.py ├── train_model_2019_individual_feature.py ├── train_model_2019_run.py ├── train_model_2022_run.py └── train_model_utils.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Artificial Scientist Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /benchmark_code/loops_fcNN.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import random, time 6 | import numpy as np 7 | import pandas as pd 8 | import torch 9 | import torch.nn as nn 10 | import torch.optim as optim 11 | from datetime import datetime, date 12 | import matplotlib 13 | matplotlib.use('Agg') # Use a non-interactive backend suitable for cluster 14 | import matplotlib.pyplot as plt 15 | from sklearn.metrics import roc_auc_score, roc_curve 16 | 17 | 18 | def get_predictions(model, data, solution, eval_batch_size, log_file): 19 | model.eval() # Set the model to evaluation mode 20 | with torch.no_grad(): 21 | data_batches = torch.split(data, eval_batch_size) 22 | total_batches = len(data_batches) 23 | 24 | all_predictions = [] 25 | start_time = time.time() 26 | for i, batch in enumerate(data_batches, start=1): 27 | batch_start_time = time.time() 28 | batch_preds = model(batch).squeeze().cpu().numpy() 29 | all_predictions.append(batch_preds) 30 | batch_time = time.time() - batch_start_time 31 | print_cluster(f"Processed batch {i}/{total_batches} in {batch_time:.2f} seconds", log_file) 32 | 33 | predictions = np.concatenate(all_predictions) 34 | 35 | true_labels = solution.cpu().numpy() # Move labels to CPU 36 | total_time = time.time() - start_time 37 | print_cluster(f"Total prediction time: {total_time:.2f} seconds", log_file) 38 | return predictions, true_labels 39 | 40 | 41 | def plot_auc_roc(true_labels, predictions, save_file, label="Train"): 42 | # Calculate the AUC-ROC score and ROC curve 43 | auc_score = roc_auc_score(true_labels, predictions) 44 | fpr, tpr, thresholds = roc_curve(true_labels, predictions) 45 | 46 | # Plot the ROC curve 47 | plt.figure(figsize=(8, 6)) 48 | plt.plot(fpr, tpr, label=f"{label} AUC = {auc_score:.4f}") 49 | plt.plot([0, 1], [0, 1], 'k--', label=f"Random AUC={0.5}") 50 | plt.xlabel("False Positive Rate (FPR)") 51 | plt.ylabel("True Positive Rate (TPR)") 52 | plt.title(f"ROC Curve -- {label}") 53 | plt.legend(loc="lower right") 54 | plt.grid() 55 | plt.savefig(save_file, dpi=300) 56 | plt.close() 57 | 58 | # Save data used to produce this figure, including predictions and ground truth 59 | data_file = save_file.replace('.png', '.npz') 60 | np.savez( 61 | data_file, 62 | fpr=fpr, 63 | tpr=tpr, 64 | thresholds=thresholds, 65 | auc_score=auc_score, 66 | true_labels=true_labels, 67 | predictions=predictions 68 | ) 69 | return auc_score 70 | 71 | def plot_loss_curve(loss_train, loss_test, save_file, label="year1-year2"): 72 | plt.figure(figsize=(8, 6)) 73 | plt.plot(loss_train, label=f'Train: {label}') 74 | plt.plot(loss_test, label=f'Test: {label}') 75 | plt.title(f"Loss Over Epochs: {label}") 76 | plt.xlabel("Epoch") 77 | plt.ylabel("Loss") 78 | plt.legend() 79 | plt.savefig(save_file, dpi=300) 80 | plt.close() 81 | 82 | # Save data used to produce this figure 83 | data_file = save_file.replace('.png', '.npz') 84 | np.savez(data_file, loss_train=loss_train, loss_test=loss_test) 85 | 86 | 87 | class ff_network(nn.Module): 88 | def __init__(self, input_size, hidden_size, output_size): 89 | super(ff_network, self).__init__() 90 | 91 | act = nn.ReLU() 92 | 93 | self.semnet = nn.Sequential( 94 | nn.Linear(input_size, hidden_size), 95 | act, 96 | nn.Linear(hidden_size, hidden_size), 97 | act, 98 | nn.Linear(hidden_size, hidden_size), 99 | act, 100 | 
nn.Linear(hidden_size, output_size) 101 | ) 102 | 103 | def forward(self, x): 104 | res = self.semnet(x) 105 | return res 106 | 107 | # Hyperparameters that we also use in naming 108 | batch_size = 2048 109 | lr_enc = 1e-4 110 | hidden_size = 600 111 | patience = 500 112 | 113 | # Create folders if needed 114 | neuralNet_folder = "save_neuralNet" 115 | plot_folder = "save_plot" 116 | os.makedirs(neuralNet_folder, exist_ok=True) 117 | os.makedirs(plot_folder, exist_ok=True) 118 | 119 | train_data_folder = "data_for_train" 120 | eval_data_folder = "data_for_eval" 121 | os.makedirs(train_data_folder, exist_ok=True) 122 | os.makedirs(eval_data_folder, exist_ok=True) 123 | 124 | rnd_seed = 42 125 | random.seed(rnd_seed) 126 | torch.manual_seed(rnd_seed) 127 | np.random.seed(rnd_seed) 128 | 129 | # We define the loops as requested: 130 | train_year_spans = [2, 3, 4] # (y2_train - y1_train) 131 | eval_year_spans = [1, 2, 3, 4, 5] # (y2_eval - y1_eval) 132 | IR_list = [10, 50] 133 | 134 | fixed_y2_eval = 2022 135 | 136 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 137 | 138 | 139 | def print_cluster(print_str, log_file): 140 | print(print_str) 141 | with open(log_file, "a") as logfile: 142 | logfile.write(print_str + "\n") 143 | 144 | 145 | for t_span in train_year_spans: 146 | for e_span in eval_year_spans: 147 | # Compute eval years 148 | y2_eval = fixed_y2_eval 149 | y1_eval = y2_eval - e_span 150 | 151 | # Compute train years 152 | # given: y2_train = y1_eval 153 | y2_train = y1_eval 154 | y1_train = y2_train - t_span 155 | 156 | # Define the log file name based on parameters 157 | for IR in IR_list: 158 | # Construct a file name prefix for all files 159 | 160 | file_name_prefix = f"{y1_train}_{y2_train}_{hidden_size}_{lr_enc}_{batch_size}_{patience}_{IR}" 161 | log_file = f'log_fcNN_{file_name_prefix}.txt' 162 | 163 | 164 | # Construct file paths 165 | if y1_train==2013: 166 | train_file = os.path.join(train_data_folder, f"train_data_{y1_train}_{y2_train}_IR{IR}_star.parquet") 167 | eval_file = os.path.join(eval_data_folder, f"eval_data_{y1_eval}_{y2_eval}_IR{IR}_star.parquet") 168 | else: 169 | train_file = os.path.join(train_data_folder, f"train_data_{y1_train}_{y2_train}_IR{IR}.parquet") 170 | eval_file = os.path.join(eval_data_folder, f"eval_data_{y1_eval}_{y2_eval}_IR{IR}.parquet") 171 | 172 | # Check if files exist before proceeding 173 | if not os.path.exists(train_file) or not os.path.exists(eval_file): 174 | print_cluster(f"Skipping (Train: {y1_train}-{y2_train}, Eval: {y1_eval}-{y2_eval}, IR={IR}) because files do not exist.", log_file) 175 | continue 176 | 177 | print_cluster(f"Processing (Train: {y1_train}-{y2_train}, Eval: {y1_eval}-{y2_eval}, IR={IR})...", log_file) 178 | 179 | # Load train data 180 | train_data_pandas = pd.read_parquet(train_file) 181 | all_input_train = train_data_pandas.values 182 | train_data_pandas = pd.DataFrame() # free memory 183 | 184 | # Load eval data 185 | eval_data_pandas = pd.read_parquet(eval_file) 186 | all_input_eval = eval_data_pandas.values 187 | eval_data_pandas = pd.DataFrame() # free memory 188 | 189 | eval_feature_dataset = all_input_eval[:, 3:] # selecting features f0,... 
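            # Column layout assumed by the slicing in this block (inferred from the code itself):
            # column 2 holds the binary label ("solution"), columns 3 onward hold the input features,
            # and the first two columns -- presumably the identifiers of the concept pair -- are
            # never passed to the model.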
190 | eval_solution_dataset = all_input_eval[:, 2] 191 | all_negative_solution_eval = np.all(eval_solution_dataset == 0) 192 | 193 | np.random.shuffle(all_input_train) 194 | input_data = all_input_train[:, 3:] # selecting features 195 | supervised_solution = all_input_train[:, 2] # solutions 196 | 197 | train_test_size = [0.85, 0.15] 198 | idx_train = int(len(input_data) * train_test_size[0]) 199 | input_data_train = input_data[:idx_train] 200 | train_solution = supervised_solution[:idx_train] 201 | 202 | input_data_test = input_data[idx_train:] 203 | test_solution = supervised_solution[idx_train:] 204 | 205 | data_train = torch.tensor(input_data_train, dtype=torch.float32).to(device) 206 | solution_train = torch.tensor(train_solution, dtype=torch.float32).to(device) 207 | 208 | data_test = torch.tensor(input_data_test, dtype=torch.float32).to(device) 209 | solution_test = torch.tensor(test_solution, dtype=torch.float32).to(device) 210 | 211 | input_size = data_train.shape[1] 212 | output_size = 1 213 | 214 | model_semnet = ff_network(input_size, hidden_size, output_size).to(device) 215 | criterion = nn.MSELoss() 216 | optimizer = optim.Adam(model_semnet.parameters(), lr=lr_enc) 217 | 218 | size_of_loss_check = 10000 219 | # Initialize variables for early stopping 220 | best_test_loss = float('inf') 221 | best_epoch = 0 222 | 223 | train_loss_total = [] 224 | test_loss_total = [] 225 | 226 | start_time = time.time() 227 | 228 | num_epochs = 5000000 229 | print_cluster("start training....", log_file) 230 | for epoch in range(num_epochs): 231 | model_semnet.train() 232 | # Randomly select 'batch_size' samples from data_train 233 | indices = np.random.choice(len(data_train), batch_size, replace=False) 234 | batch_data = data_train[indices] 235 | batch_solution = solution_train[indices] 236 | 237 | # Forward pass 238 | optimizer.zero_grad() 239 | predictions = model_semnet(batch_data).squeeze() 240 | real_loss = criterion(predictions, batch_solution) 241 | loss = torch.clamp(real_loss, min=0., max=50000.).double() 242 | loss.backward() 243 | optimizer.step() 244 | 245 | with torch.no_grad(): 246 | model_semnet.eval() 247 | # Evaluate on a subset of the training data 248 | train_predictions = model_semnet(data_train[:size_of_loss_check]).squeeze() 249 | train_loss = criterion(train_predictions, solution_train[:size_of_loss_check]).item() 250 | # Evaluate on a subset of the test data 251 | test_predictions = model_semnet(data_test[:size_of_loss_check]).squeeze() 252 | test_loss = criterion(test_predictions, solution_test[:size_of_loss_check]).item() 253 | 254 | train_loss_total.append(train_loss) 255 | test_loss_total.append(test_loss) 256 | 257 | # Calculate epochs since last best test loss 258 | epochs_since_best = epoch - best_epoch 259 | 260 | # Print progress 261 | elapsed_time = time.time() - start_time 262 | print_cluster(f'epoch {epoch}: Train Loss = {train_loss:.5f}, Test Loss = {test_loss:.5f}, Time = {elapsed_time:.5f}s, ESC: {epochs_since_best}/{patience}', log_file) 263 | start_time = time.time() 264 | 265 | # Check if current test loss is the best so far 266 | if test_loss < best_test_loss: 267 | best_test_loss = test_loss 268 | best_epoch = epoch 269 | # Save the model when a new best is found 270 | did_work=False 271 | while did_work==False: 272 | try: 273 | net_file = os.path.join(neuralNet_folder, f"fcNN_netNet_full_trained_{file_name_prefix}.pt") 274 | torch.save(model_semnet, net_file) 275 | torch.save(model_semnet.state_dict(), net_file.replace("fcNN_netNet_full", 
"fcNN_netNet_state")) 276 | did_work=True 277 | except: 278 | time.sleep(0.1) 279 | # Early stopping: if no improvement in 'patience' epochs, stop training 280 | if epoch - best_epoch > patience: 281 | print_cluster(f'Early stopping triggered at epoch {epoch}. Best test loss {best_test_loss:.5f} was not improved for {patience} epochs.', log_file) 282 | break 283 | 284 | print_cluster("finish training....", log_file) 285 | 286 | # Load the best performing model 287 | net_file = os.path.join(neuralNet_folder, f"fcNN_netNet_full_trained_{file_name_prefix}.pt") 288 | model_semnet = torch.load(net_file, map_location=device) 289 | model_semnet.eval() 290 | 291 | save_loss_file = os.path.join(plot_folder, f"fcNN_loss_curve_{file_name_prefix}.png") 292 | plot_loss_curve(train_loss_total, test_loss_total, save_loss_file, label=f"{y1_train}-{y2_train}") 293 | 294 | print_cluster("start evaluation for train, test and eval if possible....", log_file) 295 | eval_batch_size = 50000 296 | train_predictions, train_labels = get_predictions(model_semnet, data_train, solution_train, eval_batch_size, log_file=log_file) 297 | save_train_auc_file = os.path.join(plot_folder, f"fcNN_train_auc_curve_{file_name_prefix}.png") 298 | curr_auc=plot_auc_roc(train_labels, train_predictions, save_train_auc_file, label="Train") 299 | print_cluster(f"Train AUC: {curr_auc}", log_file) 300 | 301 | test_predictions, test_labels = get_predictions(model_semnet, data_test, solution_test, eval_batch_size, log_file=log_file) 302 | save_test_auc_file = os.path.join(plot_folder, f"fcNN_test_auc_curve_{file_name_prefix}.png") 303 | curr_auc=plot_auc_roc(test_labels, test_predictions, save_test_auc_file, label="Test") 304 | print_cluster(f"Test AUC: {curr_auc}", log_file) 305 | print_cluster("finish auc plot for train, test...", log_file) 306 | 307 | if not all_negative_solution_eval: # contain positive cases 308 | data_eval = torch.tensor(eval_feature_dataset, dtype=torch.float32).to(device) 309 | solution_eval = torch.tensor(eval_solution_dataset, dtype=torch.float32).to(device) 310 | eval_predictions, eval_labels = get_predictions(model_semnet, data_eval, solution_eval, eval_batch_size, log_file=log_file) 311 | save_eval_auc_file = os.path.join(plot_folder, f"fcNN_eval_auc_curve_{file_name_prefix}.png") 312 | curr_auc=plot_auc_roc(eval_labels, eval_predictions, save_eval_auc_file, label="Eval") 313 | print_cluster(f"Eval AUC: {curr_auc}", log_file) 314 | 315 | print_cluster("finish all.....", log_file) 316 | -------------------------------------------------------------------------------- /benchmark_code/loops_tree.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import random, time 6 | import numpy as np 7 | import pandas as pd 8 | from datetime import datetime, date 9 | import matplotlib 10 | matplotlib.use('Agg') # Use a non-interactive backend suitable for cluster 11 | import matplotlib.pyplot as plt 12 | from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error 13 | from sklearn.ensemble import RandomForestClassifier 14 | import joblib 15 | 16 | def print_cluster(print_str, log_file): 17 | print(print_str) 18 | with open(log_file, "a") as logfile: 19 | logfile.write(print_str + "\n") 20 | 21 | def get_predictions(model, data, solution, eval_batch_size, log_file): 22 | # model here is a RandomForestClassifier 23 | # We'll do the predictions in batches if needed 24 | data_batches = [] 25 | n = data.shape[0] 26 | 
idx = 0 27 | while idx < n: 28 | data_batches.append(data[idx:idx+eval_batch_size]) 29 | idx += eval_batch_size 30 | 31 | all_predictions = [] 32 | start_time = time.time() 33 | for i, batch in enumerate(data_batches, start=1): 34 | batch_start_time = time.time() 35 | # For random forest, predict_proba returns probabilities for each class 36 | # We assume solution is binary {0,1}, so we take probability of class 1 37 | batch_preds = model.predict_proba(batch)[:, 1] 38 | all_predictions.append(batch_preds) 39 | batch_time = time.time() - batch_start_time 40 | print_cluster(f"Processed batch {i}/{len(data_batches)} in {batch_time:.2f} seconds", log_file) 41 | 42 | predictions = np.concatenate(all_predictions) 43 | true_labels = solution 44 | total_time = time.time() - start_time 45 | print_cluster(f"Total prediction time: {total_time:.2f} seconds", log_file) 46 | return predictions, true_labels 47 | 48 | def plot_auc_roc(true_labels, predictions, save_file, label="Train"): 49 | # Calculate the AUC-ROC score and ROC curve 50 | auc_score = roc_auc_score(true_labels, predictions) 51 | fpr, tpr, thresholds = roc_curve(true_labels, predictions) 52 | 53 | # Plot the ROC curve 54 | plt.figure(figsize=(8, 6)) 55 | plt.plot(fpr, tpr, label=f"{label} AUC = {auc_score:.4f}") 56 | plt.plot([0, 1], [0, 1], 'k--', label=f"Random AUC={0.5}") 57 | plt.xlabel("False Positive Rate (FPR)") 58 | plt.ylabel("True Positive Rate (TPR)") 59 | plt.title(f"ROC Curve -- {label}") 60 | plt.legend(loc="lower right") 61 | plt.grid() 62 | plt.savefig(save_file, dpi=300) 63 | plt.close() 64 | 65 | # Save data used to produce this figure, including predictions and ground truth 66 | data_file = save_file.replace('.png', '.npz') 67 | np.savez( 68 | data_file, 69 | fpr=fpr, 70 | tpr=tpr, 71 | thresholds=thresholds, 72 | auc_score=auc_score, 73 | true_labels=true_labels, 74 | predictions=predictions 75 | ) 76 | return auc_score 77 | 78 | def plot_loss_curve(loss_train, loss_test, save_file, label="year1-year2"): 79 | plt.figure(figsize=(8, 6)) 80 | plt.plot(loss_train, label=f'Train: {label}') 81 | plt.plot(loss_test, label=f'Test: {label}') 82 | plt.title(f"Loss Over Epochs: {label}") 83 | plt.xlabel("Epoch") 84 | plt.ylabel("Loss") 85 | plt.legend() 86 | plt.savefig(save_file, dpi=300) 87 | plt.close() 88 | 89 | # Save data used to produce this figure 90 | data_file = save_file.replace('.png', '.npz') 91 | np.savez(data_file, loss_train=loss_train, loss_test=loss_test) 92 | 93 | # Dummy variables for same file format of the log files 94 | batch_size = 2048 # Dummy variables for same file format of the log files 95 | lr_enc = 1e-4 # Dummy variables for same file format of the log files 96 | hidden_size = 600 # Dummy variables for same file format of the log files 97 | patience = 500 # Dummy variables for same file format of the log files 98 | # Dummy variables for same file format of the log files 99 | 100 | # Create folders if needed 101 | neuralNet_folder = "save_neuralNet" 102 | plot_folder = "save_plot" 103 | os.makedirs(neuralNet_folder, exist_ok=True) 104 | os.makedirs(plot_folder, exist_ok=True) 105 | 106 | train_data_folder = "data_for_train" 107 | eval_data_folder = "data_for_eval" 108 | os.makedirs(train_data_folder, exist_ok=True) 109 | os.makedirs(eval_data_folder, exist_ok=True) 110 | 111 | rnd_seed = 42 112 | random.seed(rnd_seed) 113 | np.random.seed(rnd_seed) 114 | 115 | train_year_spans = [2, 3, 4] # (y2_train - y1_train) 116 | eval_year_spans = [1, 2, 3, 4, 5] # (y2_eval - y1_eval) 117 | IR_list = [10, 50] 
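# The nested loops below tile the benchmark over these settings: every evaluation window ends
# at fixed_y2_eval, and the training window ends exactly where the evaluation window begins
# (y2_train = y1_eval), so the two intervals only touch at that boundary.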
118 | fixed_y2_eval = 2022 119 | 120 | 121 | for t_span in train_year_spans: 122 | for e_span in eval_year_spans: 123 | # Compute eval years 124 | y2_eval = fixed_y2_eval 125 | y1_eval = y2_eval - e_span 126 | 127 | # Compute train years 128 | # given: y2_train = y1_eval 129 | y2_train = y1_eval 130 | y1_train = y2_train - t_span 131 | 132 | for IR in IR_list: 133 | # Construct a file name prefix for all files 134 | file_name_prefix = f"{y1_train}_{y2_train}_{hidden_size}_{lr_enc}_{batch_size}_{patience}_{IR}" 135 | log_file = f'log_tree_{file_name_prefix}.txt' 136 | 137 | if y1_train==2013: 138 | train_file = os.path.join(train_data_folder, f"train_data_{y1_train}_{y2_train}_IR{IR}_star.parquet") 139 | eval_file = os.path.join(eval_data_folder, f"eval_data_{y1_eval}_{y2_eval}_IR{IR}_star.parquet") 140 | else: 141 | train_file = os.path.join(train_data_folder, f"train_data_{y1_train}_{y2_train}_IR{IR}.parquet") 142 | eval_file = os.path.join(eval_data_folder, f"eval_data_{y1_eval}_{y2_eval}_IR{IR}.parquet") 143 | 144 | # Check if files exist before proceeding 145 | if not os.path.exists(train_file) or not os.path.exists(eval_file): 146 | print_cluster(f"Skipping (Train: {y1_train}-{y2_train}, Eval: {y1_eval}-{y2_eval}, IR={IR}) because files do not exist.", log_file) 147 | continue 148 | 149 | print_cluster(f"Processing (Train: {y1_train}-{y2_train}, Eval: {y1_eval}-{y2_eval}, IR={IR})...", log_file) 150 | 151 | # Load train data 152 | train_data_pandas = pd.read_parquet(train_file) 153 | all_input_train = train_data_pandas.values 154 | train_data_pandas = pd.DataFrame() # free memory 155 | 156 | # Load eval data 157 | eval_data_pandas = pd.read_parquet(eval_file) 158 | all_input_eval = eval_data_pandas.values 159 | eval_data_pandas = pd.DataFrame() # free memory 160 | 161 | eval_feature_dataset = all_input_eval[:, 3:] # selecting features 162 | eval_solution_dataset = all_input_eval[:, 2] 163 | all_negative_solution_eval = np.all(eval_solution_dataset == 0) 164 | 165 | np.random.shuffle(all_input_train) 166 | input_data = all_input_train[:, 3:] # selecting features 167 | supervised_solution = all_input_train[:, 2] # solutions 168 | 169 | train_test_size = [0.85, 0.15] 170 | idx_train = int(len(input_data) * train_test_size[0]) 171 | input_data_train = input_data[:idx_train] 172 | train_solution = supervised_solution[:idx_train] 173 | 174 | input_data_test = input_data[idx_train:] 175 | test_solution = supervised_solution[idx_train:] 176 | 177 | # Initialize the Random Forest classifier 178 | print_cluster("Finished shuffling and splitting training data...", log_file) 179 | 180 | clf = RandomForestClassifier( 181 | n_estimators=300, 182 | max_depth=None, 183 | min_samples_split=25, 184 | min_samples_leaf=10, 185 | n_jobs=-1, 186 | random_state=rnd_seed, 187 | verbose=1, 188 | class_weight='balanced' 189 | ) 190 | 191 | print_cluster("Training the Random Forest classifier...", log_file) 192 | start_time = time.time() 193 | clf.fit(input_data_train, train_solution) 194 | end_time = time.time() 195 | print_cluster(f"Training completed in {end_time - start_time:.2f} seconds.", log_file) 196 | 197 | # We mimic the logic of plotting loss curves by calculating MSE on a subset 198 | size_of_loss_check = 10000 199 | # Compute "loss" as MSE of predictions vs solutions, just for plotting 200 | train_pred_for_loss = clf.predict_proba(input_data_train[:size_of_loss_check])[:,1] 201 | train_loss = mean_squared_error(train_solution[:size_of_loss_check], train_pred_for_loss) 202 | 203 | test_pred_for_loss 
= clf.predict_proba(input_data_test[:size_of_loss_check])[:,1] 204 | test_loss = mean_squared_error(test_solution[:size_of_loss_check], test_pred_for_loss) 205 | 206 | train_loss_total = [train_loss] 207 | test_loss_total = [test_loss] 208 | 209 | # Save the trained model (replace fcNN by tree in filenames) 210 | net_file = os.path.join(neuralNet_folder, f"tree_netNet_full_trained_{file_name_prefix}.pkl") 211 | joblib.dump(clf, net_file) 212 | 213 | # Plot loss curve (will be trivial, just one point) 214 | save_loss_file = os.path.join(plot_folder, f"tree_loss_curve_{file_name_prefix}.png") 215 | plot_loss_curve(train_loss_total, test_loss_total, save_loss_file, label=f"{y1_train}-{y2_train}") 216 | 217 | print_cluster("start evaluation for train, test and eval if possible....", log_file) 218 | eval_batch_size = 50000 219 | 220 | # Load model again to mimic original code 221 | clf = joblib.load(net_file) 222 | 223 | # Get predictions for training set 224 | train_predictions, train_labels = get_predictions(clf, input_data_train, train_solution, eval_batch_size, log_file=log_file) 225 | save_train_auc_file = os.path.join(plot_folder, f"tree_train_auc_curve_{file_name_prefix}.png") 226 | curr_auc = plot_auc_roc(train_labels, train_predictions, save_train_auc_file, label="Train") 227 | print_cluster(f"Train AUC: {curr_auc}", log_file) 228 | 229 | # Get predictions for test set 230 | test_predictions, test_labels = get_predictions(clf, input_data_test, test_solution, eval_batch_size, log_file=log_file) 231 | save_test_auc_file = os.path.join(plot_folder, f"tree_test_auc_curve_{file_name_prefix}.png") 232 | curr_auc = plot_auc_roc(test_labels, test_predictions, save_test_auc_file, label="Test") 233 | print_cluster(f"Test AUC: {curr_auc}", log_file) 234 | print_cluster("finish auc plot for train, test...", log_file) 235 | 236 | if not all_negative_solution_eval: # contain positive cases 237 | eval_predictions, eval_labels = get_predictions(clf, eval_feature_dataset, eval_solution_dataset, eval_batch_size, log_file=log_file) 238 | save_eval_auc_file = os.path.join(plot_folder, f"tree_eval_auc_curve_{file_name_prefix}.png") 239 | curr_auc = plot_auc_roc(eval_labels, eval_predictions, save_eval_auc_file, label="Eval") 240 | print_cluster(f"Eval AUC: {curr_auc}", log_file) 241 | 242 | print_cluster("finish all.....", log_file) 243 | -------------------------------------------------------------------------------- /benchmark_code/loops_xgboost.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import random, time 6 | import numpy as np 7 | import pandas as pd 8 | from datetime import datetime, date 9 | import matplotlib 10 | matplotlib.use('Agg') # Use a non-interactive backend suitable for cluster 11 | import matplotlib.pyplot as plt 12 | from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error 13 | import joblib 14 | import xgboost as xgb 15 | 16 | def print_cluster(print_str, log_file): 17 | print(print_str) 18 | with open(log_file, "a") as logfile: 19 | logfile.write(print_str + "\n") 20 | 21 | def get_predictions(model, data, solution, eval_batch_size, log_file): 22 | # model here is an XGBRegressor 23 | # We'll do the predictions in batches if needed 24 | data_batches = [] 25 | n = data.shape[0] 26 | idx = 0 27 | while idx < n: 28 | data_batches.append(data[idx:idx+eval_batch_size]) 29 | idx += eval_batch_size 30 | 31 | all_predictions = [] 32 | start_time = time.time() 33 | for 
i, batch in enumerate(data_batches, start=1): 34 | batch_start_time = time.time() 35 | # For XGBRegressor, we get a continuous output. We interpret it as probability of class 1. 36 | batch_preds = model.predict(batch) 37 | all_predictions.append(batch_preds) 38 | batch_time = time.time() - batch_start_time 39 | print_cluster(f"Processed batch {i}/{len(data_batches)} in {batch_time:.2f} seconds", log_file) 40 | 41 | predictions = np.concatenate(all_predictions) 42 | true_labels = solution 43 | total_time = time.time() - start_time 44 | print_cluster(f"Total prediction time: {total_time:.2f} seconds", log_file) 45 | return predictions, true_labels 46 | 47 | def plot_auc_roc(true_labels, predictions, save_file, label="Train"): 48 | # Calculate the AUC-ROC score and ROC curve 49 | auc_score = roc_auc_score(true_labels, predictions) 50 | fpr, tpr, thresholds = roc_curve(true_labels, predictions) 51 | 52 | # Plot the ROC curve 53 | plt.figure(figsize=(8, 6)) 54 | plt.plot(fpr, tpr, label=f"{label} AUC = {auc_score:.4f}") 55 | plt.plot([0, 1], [0, 1], 'k--', label=f"Random AUC={0.5}") 56 | plt.xlabel("False Positive Rate (FPR)") 57 | plt.ylabel("True Positive Rate (TPR)") 58 | plt.title(f"ROC Curve -- {label}") 59 | plt.legend(loc="lower right") 60 | plt.grid() 61 | plt.savefig(save_file, dpi=300) 62 | plt.close() 63 | 64 | # Save data used to produce this figure, including predictions and ground truth 65 | data_file = save_file.replace('.png', '.npz') 66 | np.savez( 67 | data_file, 68 | fpr=fpr, 69 | tpr=tpr, 70 | thresholds=thresholds, 71 | auc_score=auc_score, 72 | true_labels=true_labels, 73 | predictions=predictions 74 | ) 75 | return auc_score 76 | 77 | def plot_loss_curve(loss_train, loss_test, save_file, label="year1-year2"): 78 | plt.figure(figsize=(8, 6)) 79 | plt.plot(loss_train, label=f'Train: {label}') 80 | plt.plot(loss_test, label=f'Test: {label}') 81 | plt.title(f"Loss Over Epochs: {label}") 82 | plt.xlabel("Epoch") 83 | plt.ylabel("Loss") 84 | plt.legend() 85 | plt.savefig(save_file, dpi=300) 86 | plt.close() 87 | 88 | # Save data used to produce this figure 89 | data_file = save_file.replace('.png', '.npz') 90 | np.savez(data_file, loss_train=loss_train, loss_test=loss_test) 91 | 92 | # Dummy variables for same file format of the log files 93 | batch_size = 2048 # Dummy variables for same file format of the log files 94 | lr_enc = 1e-4 # Dummy variables for same file format of the log files 95 | hidden_size = 600 # Dummy variables for same file format of the log files 96 | patience = 500 # Dummy variables for same file format of the log files 97 | # Dummy variables for same file format of the log files 98 | 99 | # Create folders if needed 100 | neuralNet_folder = "save_neuralNet" 101 | plot_folder = "save_plot" 102 | os.makedirs(neuralNet_folder, exist_ok=True) 103 | os.makedirs(plot_folder, exist_ok=True) 104 | 105 | train_data_folder = "data_for_train" 106 | eval_data_folder = "data_for_eval" 107 | os.makedirs(train_data_folder, exist_ok=True) 108 | os.makedirs(eval_data_folder, exist_ok=True) 109 | 110 | rnd_seed = 42 111 | random.seed(rnd_seed) 112 | np.random.seed(rnd_seed) 113 | 114 | train_year_spans = [2, 3, 4] # (y2_train - y1_train) 115 | eval_year_spans = [1, 2, 3, 4, 5] # (y2_eval - y1_eval) 116 | IR_list = [10, 50] 117 | fixed_y2_eval = 2022 118 | 119 | 120 | for t_span in train_year_spans: 121 | for e_span in eval_year_spans: 122 | # Compute eval years 123 | y2_eval = fixed_y2_eval 124 | y1_eval = y2_eval - e_span 125 | 126 | # Compute train years 127 | # given: 
y2_train = y1_eval 128 | y2_train = y1_eval 129 | y1_train = y2_train - t_span 130 | 131 | for IR in IR_list: 132 | # Construct a file name prefix for all files 133 | file_name_prefix = f"{y1_train}_{y2_train}_{hidden_size}_{lr_enc}_{batch_size}_{patience}_{IR}" 134 | log_file = f'log_xgboost_{file_name_prefix}.txt' 135 | 136 | if y1_train==2013: 137 | train_file = os.path.join(train_data_folder, f"train_data_{y1_train}_{y2_train}_IR{IR}_star.parquet") 138 | eval_file = os.path.join(eval_data_folder, f"eval_data_{y1_eval}_{y2_eval}_IR{IR}_star.parquet") 139 | else: 140 | train_file = os.path.join(train_data_folder, f"train_data_{y1_train}_{y2_train}_IR{IR}.parquet") 141 | eval_file = os.path.join(eval_data_folder, f"eval_data_{y1_eval}_{y2_eval}_IR{IR}.parquet") 142 | 143 | # Check if files exist before proceeding 144 | if not os.path.exists(train_file) or not os.path.exists(eval_file): 145 | print_cluster(f"Skipping (Train: {y1_train}-{y2_train}, Eval: {y1_eval}-{y2_eval}, IR={IR}) because files do not exist.", log_file) 146 | continue 147 | 148 | print_cluster(f"Processing (Train: {y1_train}-{y2_train}, Eval: {y1_eval}-{y2_eval}, IR={IR})...", log_file) 149 | 150 | # Load train data 151 | train_data_pandas = pd.read_parquet(train_file) 152 | all_input_train = train_data_pandas.values 153 | train_data_pandas = pd.DataFrame() # free memory 154 | 155 | # Load eval data 156 | eval_data_pandas = pd.read_parquet(eval_file) 157 | all_input_eval = eval_data_pandas.values 158 | eval_data_pandas = pd.DataFrame() # free memory 159 | 160 | eval_feature_dataset = all_input_eval[:, 3:] # selecting features 161 | eval_solution_dataset = all_input_eval[:, 2] 162 | all_negative_solution_eval = np.all(eval_solution_dataset == 0) 163 | 164 | np.random.shuffle(all_input_train) 165 | # Taking a smaller slice for demonstration as in the original code (0:100), keep it the same 166 | input_data = all_input_train[:, 3:] # selecting features 167 | supervised_solution = all_input_train[:, 2] # solutions 168 | 169 | train_test_size = [0.85, 0.15] 170 | idx_train = int(len(input_data) * train_test_size[0]) 171 | input_data_train = input_data[:idx_train] 172 | train_solution = supervised_solution[:idx_train] 173 | 174 | input_data_test = input_data[idx_train:] 175 | test_solution = supervised_solution[idx_train:] 176 | 177 | print_cluster("Finished shuffling and splitting training data...", log_file) 178 | 179 | print_cluster("start training with XGBoost....", log_file) 180 | # Define XGBoost model 181 | model = xgb.XGBRegressor( 182 | n_estimators=2000, 183 | learning_rate=0.01, 184 | max_depth=10, 185 | subsample=0.8, 186 | colsample_bytree=0.8, 187 | random_state=rnd_seed, 188 | verbosity=1, 189 | eval_metric="rmse", 190 | early_stopping_rounds=500 # Move this parameter here 191 | ) 192 | 193 | # Fit the model 194 | start_time = time.time() 195 | model.fit( 196 | input_data_train, train_solution, 197 | eval_set=[(input_data_test, test_solution)], 198 | verbose=True 199 | ) 200 | end_time = time.time() 201 | print_cluster(f"Training completed in {end_time - start_time:.2f} seconds.", log_file) 202 | 203 | # Compute "loss" as MSE on train/test subsets 204 | size_of_loss_check = 10000 205 | train_pred_for_loss = model.predict(input_data_train[:size_of_loss_check]) 206 | train_loss = mean_squared_error(train_solution[:size_of_loss_check], train_pred_for_loss) 207 | 208 | test_pred_for_loss = model.predict(input_data_test[:size_of_loss_check]) 209 | test_loss = mean_squared_error(test_solution[:size_of_loss_check], 
test_pred_for_loss) 210 | 211 | train_loss_total = [train_loss] 212 | test_loss_total = [test_loss] 213 | 214 | # Save the trained model (replace tree with xgboost in filenames) 215 | net_file = os.path.join(neuralNet_folder, f"xgboost_netNet_full_trained_{file_name_prefix}.pkl") 216 | joblib.dump(model, net_file) 217 | 218 | # Plot loss curve (will be trivial, just one point) 219 | save_loss_file = os.path.join(plot_folder, f"xgboost_loss_curve_{file_name_prefix}.png") 220 | plot_loss_curve(train_loss_total, test_loss_total, save_loss_file, label=f"{y1_train}-{y2_train}") 221 | 222 | print_cluster("start evaluation for train, test and eval if possible....", log_file) 223 | eval_batch_size = 50000 224 | 225 | # Load model again to mimic original code 226 | model = joblib.load(net_file) 227 | 228 | # Get predictions for training set 229 | train_predictions, train_labels = get_predictions(model, input_data_train, train_solution, eval_batch_size, log_file=log_file) 230 | save_train_auc_file = os.path.join(plot_folder, f"xgboost_train_auc_curve_{file_name_prefix}.png") 231 | curr_auc = plot_auc_roc(train_labels, train_predictions, save_train_auc_file, label="Train") 232 | print_cluster(f"Train AUC: {curr_auc}", log_file) 233 | 234 | # Get predictions for test set 235 | test_predictions, test_labels = get_predictions(model, input_data_test, test_solution, eval_batch_size, log_file=log_file) 236 | save_test_auc_file = os.path.join(plot_folder, f"xgboost_test_auc_curve_{file_name_prefix}.png") 237 | curr_auc = plot_auc_roc(test_labels, test_predictions, save_test_auc_file, label="Test") 238 | print_cluster(f"Test AUC: {curr_auc}", log_file) 239 | print_cluster("finish auc plot for train, test...", log_file) 240 | 241 | if not all_negative_solution_eval: # contain positive cases 242 | eval_predictions, eval_labels = get_predictions(model, eval_feature_dataset, eval_solution_dataset, eval_batch_size, log_file=log_file) 243 | save_eval_auc_file = os.path.join(plot_folder, f"xgboost_eval_auc_curve_{file_name_prefix}.png") 244 | curr_auc = plot_auc_roc(eval_labels, eval_predictions, save_eval_auc_file, label="Eval") 245 | print_cluster(f"Eval AUC: {curr_auc}", log_file) 246 | 247 | print_cluster("finish all.....", log_file) 248 | -------------------------------------------------------------------------------- /benchmark_code/make_plot_loss_curve.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | # Define the directory and file name 6 | base_dir = "save_plot" # Directory name 7 | file_name = "fcNN_loss_curve_2017_2020_600_0.0001_2048_500_10.npz" 8 | 9 | # Construct full path using os.path 10 | file_path = os.path.join(base_dir, file_name) 11 | 12 | # Load the npz file 13 | if os.path.exists(file_path): 14 | data = np.load(file_path) 15 | loss_train = data['loss_train'] 16 | loss_test = data['loss_test'] 17 | else: 18 | raise FileNotFoundError(f"Error: File '{file_path}' not found.") 19 | 20 | # Main plot 21 | plt.figure(figsize=(10, 6)) 22 | plt.plot(loss_train, label='Train Loss', color='blue') 23 | plt.plot(loss_test, label='Test Loss', color='orange') 24 | plt.xlabel("Epoch", fontsize=24) 25 | plt.ylabel("Loss", fontsize=24) 26 | plt.xticks(fontsize=20) 27 | plt.yticks(fontsize=20) 28 | plt.title("Training and Test Loss Over Epochs", fontsize=28) 29 | plt.legend(fontsize=20) 30 | 31 | # Inset plot: Zoomed in from episode 1000 to the end 32 | inset_start = 1000 33 | ax_inset = 
plt.axes([0.30, 0.30, 0.55, 0.40]) # Position of inset: [x, y, width, height] 34 | ax_inset.plot(range(inset_start, len(loss_train)), loss_train[inset_start:], label='Train Loss', color='blue') 35 | ax_inset.plot(range(inset_start, len(loss_test)), loss_test[inset_start:], label='Test Loss', color='orange') 36 | ax_inset.set_title(f"Zoomed In, Start at Epoch {inset_start}", fontsize=14) 37 | ax_inset.set_xlabel("Epoch", fontsize=16) 38 | ax_inset.set_ylabel("Loss", fontsize=16) 39 | ax_inset.tick_params(axis='both', which='major', labelsize=14) 40 | 41 | plt.tight_layout() 42 | # Save the figure 43 | output_dir = "save_plot_output" 44 | os.makedirs(output_dir, exist_ok=True) 45 | save_path = os.path.join(output_dir, "loss_curve_with_inset.png") 46 | plt.savefig(save_path, dpi=300) 47 | 48 | plt.show() 49 | -------------------------------------------------------------------------------- /benchmark_code/make_plots.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | # Define the model colors to ensure consistency 7 | model_colors = { 8 | 'fcNN': 'blue', 9 | 'tree': 'red', 10 | 'xgboost': 'green', 11 | 'transformer': 'orange' 12 | } 13 | 14 | # (y1_train, y2_train) pairs to consider based on conditions (already given) 15 | train_pairs = [ 16 | (y1, y2) 17 | for y1 in range(2013, 2020) 18 | for y2 in range(2017, 2022) 19 | if y2 - y1 in [2, 3, 4] 20 | ] 21 | 22 | IR_values = [10, 50] 23 | 24 | data_folder = "save_plot" 25 | plot_folder = "full_ROCs" 26 | os.makedirs(plot_folder, exist_ok=True) 27 | 28 | # Helper function to parse filename and extract model, y1, y2, IR 29 | def parse_filename(filename): 30 | base = os.path.basename(filename) 31 | base = base.replace('.npz', '') 32 | parts = base.split('_') 33 | 34 | # Find 'eval' as an anchor 35 | try: 36 | eval_idx = parts.index('eval') 37 | except ValueError: 38 | return None, None, None, None 39 | 40 | # Model name is everything before 'eval' 41 | model_name = '_'.join(parts[:eval_idx]) 42 | 43 | # Check for minimum length 44 | if len(parts) < eval_idx + 7: 45 | return None, None, None, None 46 | 47 | # After eval_auc_curve come y1, y2 and IR 48 | try: 49 | y1_train = int(parts[eval_idx+3]) 50 | y2_train = int(parts[eval_idx+4]) 51 | IR = int(parts[-1]) 52 | except ValueError: 53 | return None, None, None, None 54 | 55 | return model_name, y1_train, y2_train, IR 56 | 57 | print("Collecting files...") 58 | 59 | # Collect all .npz files 60 | all_files = glob.glob(os.path.join(data_folder, "*.npz")) 61 | 62 | 63 | 64 | # Dictionary to store files by (y1, y2, IR) 65 | files_dict = {} 66 | for f in all_files: 67 | if 'single' in f: 68 | # Ignore files containing "single" 69 | continue 70 | model_name, y1_t, y2_t, IR = parse_filename(f) 71 | if model_name is None: 72 | continue 73 | # Ensure model_name is in our known set 74 | if model_name not in ['fcNN', 'tree', 'xgboost', 'transformer']: 75 | continue 76 | 77 | key = (y1_t, y2_t, IR) 78 | if key not in files_dict: 79 | files_dict[key] = {} 80 | if model_name not in files_dict[key]: 81 | files_dict[key][model_name] = [] 82 | files_dict[key][model_name].append(f) 83 | 84 | print(f"Collected files for {len(files_dict)} (y1,y2,IR) combinations.") 85 | 86 | # We want two figures: one for IR=10 and one for IR=50 87 | # Rows: delta_train in [2,3,4] 88 | # Cols: delta_eval in [1,2,3,4,5] 89 | delta_train_values = [2, 3, 4] 90 | delta_eval_values = [1, 2, 3, 4, 5] 91 | 92 | 93 
| 94 | print("Starting plotting...") 95 | 96 | for IR in IR_values: 97 | fig, axes = plt.subplots(len(delta_train_values), len(delta_eval_values), figsize=(20, 12)) 98 | #fig.suptitle(f"Various Training and Evaluation Intervals (IR={IR})", fontsize=24, y=0.96) 99 | 100 | for row, dt in enumerate(delta_train_values): 101 | for col, de in enumerate(delta_eval_values): 102 | ax = axes[row, col] 103 | 104 | # Compute y1, y2 from dt and de: 105 | # delta_eval = 2022 - y2 => y2 = 2022 - delta_eval = 2022 - de 106 | y2 = 2022 - de 107 | # delta_train = y2 - y1 => y1 = y2 - dt 108 | y1 = y2 - dt 109 | 110 | key = (y1, y2, IR) 111 | if key not in files_dict or not files_dict[key]: 112 | # No data for this combination 113 | ax.text(0.5, 0.5, "not enough data for evaluation", 114 | ha='center', va='center', transform=ax.transAxes, fontsize=12) 115 | ax.set_title(f"Train: {y1}-{y2}, Eval: {y2}-2022") 116 | ax.set_xlabel("FPR") 117 | ax.set_ylabel("TPR") 118 | ax.grid(True) 119 | continue 120 | 121 | model_files = files_dict[key] 122 | 123 | # Plot each model's ROC curve (take first file if multiple) 124 | any_plotted = False 125 | for model_name in model_files: 126 | f = model_files[model_name][0] 127 | data = np.load(f) 128 | fpr = data['fpr'] 129 | tpr = data['tpr'] 130 | auc_score = data['auc_score'] 131 | 132 | if model_name=='fcNN': 133 | write_model='fcNN' 134 | if model_name=='tree': 135 | write_model='Forest' 136 | if model_name=='xgboost': 137 | write_model='XGBoost' 138 | if model_name=='transformer': 139 | write_model='Transformer' 140 | ax.plot(fpr, tpr, color=model_colors.get(model_name, 'black'), 141 | label=f"{write_model} AUC={auc_score:.4f}") 142 | any_plotted = True 143 | 144 | if any_plotted: 145 | # Plot the random line 146 | ax.plot([0, 1], [0, 1], 'k--', label="Random AUC=0.5") 147 | ax.legend(loc="lower right") 148 | else: 149 | # If no model plotted (though files_dict said otherwise, just in case) 150 | ax.text(0.5, 0.5, "not enough data for evaluation", 151 | ha='center', va='center', transform=ax.transAxes, fontsize=12) 152 | 153 | ax.set_title(f"Train: {y1}-{y2}, Eval: {y2}-2022") 154 | ax.set_xlabel("FPR") 155 | ax.set_ylabel("TPR") 156 | ax.grid(True) 157 | 158 | plt.tight_layout(rect=[0, 0, 1, 0.95]) 159 | plot_filename = f"comparison_roc_grid_IR{IR}.pdf" 160 | save_path = os.path.join(plot_folder, plot_filename) 161 | plt.savefig(save_path, dpi=300) 162 | plt.close() 163 | print(f"Saved combined figure for IR={IR} to {save_path}") 164 | 165 | print("All plots have been generated.") 166 | -------------------------------------------------------------------------------- /benchmark_code/make_plots_mixed.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | # Directory where npz files are stored 7 | data_folder = "save_plot" 8 | # Directory to save final full ROC plot 9 | output_folder = "full_ROCs_single" 10 | os.makedirs(output_folder, exist_ok=True) 11 | 12 | # The set of y_diff values 13 | y_diff_values = [1, 2, 3, 4, 5] 14 | 15 | # Colors for the two fcNN variants 16 | colors = { 17 | 'single': 'blue', 18 | 'mixed': 'red' 19 | } 20 | 21 | # Create a single figure with 5 columns of subplots, 1 row 22 | fig, axes = plt.subplots(1, 5, figsize=(20, 4)) 23 | 24 | # Add a main title to the entire figure 25 | #fig.suptitle("Training from 2014 -> 2017 (Only fcNN Model Variants)", fontsize=22, y=1.08) 26 | 27 | for i, y_diff in enumerate(y_diff_values): 
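    # Each of the five panels covers one evaluation window (2017 to 2017 + y_diff); the two
    # curves compare the fcNN variant trained on IR=50 data ("single") against the variant
    # trained on IR=10 data ("mixed").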
28 | ax = axes[i] 29 | y2_eval = 2017 + y_diff 30 | 31 | # Patterns for the two fcNN variants: 32 | # single: IR=50 files 33 | single_pattern = f"fcNNsingle_eval_auc_curve_2017_{y2_eval}_*_50.npz" 34 | # mixed: IR=10 files 35 | mixed_pattern = f"fcNNmixed_eval_auc_curve_2017_{y2_eval}_*_10.npz" 36 | 37 | # Search for files 38 | single_files = glob.glob(os.path.join(data_folder, single_pattern)) 39 | mixed_files = glob.glob(os.path.join(data_folder, mixed_pattern)) 40 | 41 | any_data = False 42 | 43 | # Plot single variant if available 44 | if single_files: 45 | data = np.load(single_files[0]) 46 | fpr = data['fpr'] 47 | tpr = data['tpr'] 48 | auc_score = data['auc_score'] 49 | ax.plot(fpr, tpr, label=f"fcNN (IR=50) AUC={auc_score:.4f}", color=colors['single']) 50 | any_data = True 51 | 52 | # Plot mixed variant if available 53 | if mixed_files: 54 | data = np.load(mixed_files[0]) 55 | fpr = data['fpr'] 56 | tpr = data['tpr'] 57 | auc_score = data['auc_score'] 58 | ax.plot(fpr, tpr, label=f"fcNN (IR=10) AUC={auc_score:.4f}", color=colors['mixed']) 59 | any_data = True 60 | 61 | if any_data: 62 | # If at least one curve plotted, show the random line and legend 63 | ax.plot([0, 1], [0, 1], 'k--', label="Random AUC=0.5") 64 | ax.legend(loc="lower right") 65 | else: 66 | # If no data for this subplot 67 | ax.text(0.5, 0.5, "no data available", 68 | ha='center', va='center', transform=ax.transAxes, fontsize=12) 69 | 70 | ax.set_xlabel("FPR") 71 | ax.set_ylabel("TPR") 72 | ax.set_title(f"Evaluation: 2017-{y2_eval}, IR=50") 73 | ax.grid(True) 74 | 75 | plt.tight_layout(rect=[0, 0, 1, 0.95]) 76 | 77 | # Save the single combined figure 78 | output_filename = "full_ROC_grid_fcNN_variants.pdf" 79 | plt.savefig(os.path.join(output_folder, output_filename), dpi=600) 80 | 81 | plt.show() 82 | plt.close() 83 | 84 | print("Single figure with all fcNN variant subplots generated and shown.") 85 | -------------------------------------------------------------------------------- /benchmark_code/make_plots_single.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | # Directory where npz files are stored 7 | data_folder = "save_plot" 8 | # Directory to save final full ROC plot 9 | output_folder = "full_ROCs_single" 10 | os.makedirs(output_folder, exist_ok=True) 11 | 12 | model_colors = { 13 | 'fcNN': 'blue', 14 | 'tree': 'red', 15 | 'xgboost': 'green', 16 | 'transformer': 'orange' 17 | } 18 | 19 | # The set of models to include 20 | models = ['fcNN', 'transformer', 'tree', 'xgboost'] 21 | 22 | # y_diff from 1 to 5 23 | y_diff_values = [1, 2, 3, 4, 5] 24 | # IR values 25 | IR_values = [10, 50] 26 | 27 | # Create a single figure with 2 rows and 5 columns of subplots 28 | fig, axes = plt.subplots(2, 5, figsize=(20, 8)) 29 | 30 | # Add a main title to the entire figure 31 | fig.suptitle("Training from 2014 -> 2017", fontsize=22, y=0.96) 32 | 33 | for i, y_diff in enumerate(y_diff_values): 34 | y1_eval = 2017 35 | y2_eval = 2017 + y_diff 36 | 37 | for j, IR in enumerate(IR_values): 38 | ax = axes[j, i] # j is row index (0 for IR=10, 1 for IR=50), i is column index for y_diff 39 | 40 | any_model_plotted = False # To check if we plotted any model 41 | 42 | # Attempt to plot models 43 | for model in models: 44 | if model == 'fcNN': 45 | pattern = f"{model}single_eval_auc_curve_2017_{y2_eval}_*_{IR}.npz" 46 | else: 47 | # For 'transformer', 'tree', 'xgboost' 48 | pattern = 
f"{model}single_eval_auc_curve_2017_2017_*_{IR}_2017_{y2_eval}.npz" 49 | 50 | search_pattern = os.path.join(data_folder, pattern) 51 | files = glob.glob(search_pattern) 52 | 53 | if not files: 54 | continue 55 | 56 | npz_file = files[0] 57 | data = np.load(npz_file) 58 | fpr = data['fpr'] 59 | tpr = data['tpr'] 60 | auc_score = data['auc_score'] 61 | 62 | 63 | if model=='fcNN': 64 | write_model='fcNN' 65 | if model=='tree': 66 | write_model='Forest' 67 | if model=='xgboost': 68 | write_model='XGBoost' 69 | if model=='transformer': 70 | write_model='Transformer' 71 | 72 | ax.plot(fpr, tpr, label=f"{write_model} AUC={auc_score:.4f}", color=model_colors[model]) 73 | any_model_plotted = True 74 | 75 | if any_model_plotted: 76 | # If we found at least one model, then plot the random diagonal 77 | ax.plot([0, 1], [0, 1], 'k--', label="Random AUC=0.5") 78 | ax.legend(loc="lower right") 79 | else: 80 | # If no model was plotted on this subplot, add a message 81 | ax.text(0.5, 0.5, "not enough data for evaluation", 82 | ha='center', va='center', transform=ax.transAxes, fontsize=12) 83 | 84 | ax.set_xlabel("FPR") 85 | ax.set_ylabel("TPR") 86 | ax.set_title(f"Evaluation: 2017-{y2_eval}, IR={IR}") 87 | ax.grid(True) 88 | 89 | # Adjust layout so things fit nicely, and leave space for suptitle 90 | plt.tight_layout(rect=[0, 0, 1, 0.96]) 91 | 92 | # Save the single combined figure 93 | output_filename = "full_ROC_grid_single.pdf" 94 | plt.savefig(os.path.join(output_folder, output_filename), dpi=600) 95 | 96 | plt.show() 97 | plt.close() 98 | 99 | print("Single figure with all subplots generated and shown.") 100 | -------------------------------------------------------------------------------- /create_concepts/Concept_Corpus/s1_make_metadate_arxivstyle.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "5f8d99ec-1c43-4354-84ae-57c9eee8e3eb", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import json\n", 13 | "import ujson\n", 14 | "\n", 15 | "medrxiv_all_article=[]\n", 16 | " \n", 17 | "with open('medrxiv/medrxiv_metadata_oringal.json', 'r') as f:\n", 18 | " medrxiv_all_article.extend(json.load(f))\n", 19 | "\n", 20 | "medrxiv_article=[]\n", 21 | "medrxiv_article_not_version1=[] ## some papers appear many times with different versions, we only use the first version\n", 22 | "for ii in range(len(medrxiv_all_article)):\n", 23 | " if medrxiv_all_article[ii]['version']==\"1\":\n", 24 | " medrxiv_article.append(medrxiv_all_article[ii])\n", 25 | "\n", 26 | "\n", 27 | "with open('medrxiv-metadata-oai-snapshot.json', 'w') as f:\n", 28 | " f.writelines(map(lambda item: ujson.dumps(item) + '\\n', medrxiv_article))\n", 29 | " " 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "id": "7849fa1d-aa04-4822-a335-7c33965523b8", 36 | "metadata": { 37 | "tags": [] 38 | }, 39 | "outputs": [], 40 | "source": [ 41 | "import json\n", 42 | "import ujson\n", 43 | "\n", 44 | "biorxiv_all_article=[]\n", 45 | "with open('biorxiv/biorxiv_metadata_final_all.json', 'r') as f:\n", 46 | " biorxiv_all_article.extend(json.load(f))\n", 47 | "\n", 48 | "biorxiv_article=[]\n", 49 | "\n", 50 | "for ii in range(len(biorxiv_all_article)):\n", 51 | " if biorxiv_all_article[ii]['version']==\"1\":\n", 52 | " biorxiv_article.append(biorxiv_all_article[ii])\n", 53 | " \n", 54 | " \n", 55 | "with open('biorxiv-metadata-oai-snapshot.json', 'w') as f:\n", 56 
| " f.writelines(map(lambda item: ujson.dumps(item) + '\\n', biorxiv_article))\n", 57 | " " 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "id": "9b0b9652-60e2-4fc3-89ce-f871f9520320", 64 | "metadata": { 65 | "tags": [] 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "import json\n", 70 | "import ujson\n", 71 | "\n", 72 | "\n", 73 | "chemrxiv_all_article=[]\n", 74 | "with open('chemrxiv/chemrxiv_metadata_oringal_asc.json', 'r') as f: #chemrxiv_metadata_oringal\n", 75 | " chemrxiv_all_article.extend(json.load(f))\n", 76 | " \n", 77 | "chemrxiv_article=[]\n", 78 | "for ii in range(len(chemrxiv_all_article)):\n", 79 | " chemrxiv_article.append(chemrxiv_all_article[ii]['item'])\n", 80 | " \n", 81 | " \n", 82 | "new_chemrxiv_article = []\n", 83 | "for ii in range(len(chemrxiv_article)):\n", 84 | " new_entry = {}\n", 85 | " new_entry['id'] = chemrxiv_article[ii]['id']\n", 86 | " authors_names = [f\"{author['firstName']} {author['lastName']}\" for author in chemrxiv_article[ii]['authors']]\n", 87 | " new_entry['authors'] = ', '.join(authors_names)\n", 88 | " new_entry['title'] = chemrxiv_article[ii]['title']\n", 89 | " new_entry['doi'] = chemrxiv_article[ii]['doi']\n", 90 | " categories_all = ', '.join([d['name'] for d in chemrxiv_article[ii]['categories']])\n", 91 | " new_entry['categories'] = categories_all\n", 92 | " new_entry['abstract'] = chemrxiv_article[ii]['abstract']\n", 93 | " new_entry['date'] = chemrxiv_article[ii]['submittedDate'][0:10]\n", 94 | " new_entry['version'] = chemrxiv_article[ii]['version']\n", 95 | " new_entry['server'] = 'chemrxiv'\n", 96 | " new_chemrxiv_article.append(new_entry)\n", 97 | " \n", 98 | "# write the new dictionary to a JSON file\n", 99 | "with open('chemrxiv-metadata-oai-snapshot.json', 'w') as f:\n", 100 | " f.writelines(map(lambda item: ujson.dumps(item) + '\\n', new_chemrxiv_article))\n" 101 | ] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Python 3 (ipykernel)", 107 | "language": "python", 108 | "name": "python3" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 3 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython3", 120 | "version": "3.9.7" 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 5 125 | } 126 | -------------------------------------------------------------------------------- /create_concepts/Concept_Corpus/s2_combine_all_preprint_metadate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "e3c27b65-fe14-4077-bb51-71bd7aea6e3b", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import json\n", 13 | "import linecache\n", 14 | "import time\n", 15 | "import jsonlines\n", 16 | "from datetime import datetime, date\n", 17 | "import pickle" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "id": "0701b2af-9795-49e0-a4ff-15b9470ac60e", 23 | "metadata": {}, 24 | "source": [ 25 | "## read biorxiv_json" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "id": "babf3851-6c70-46bd-8626-9b7d6801a3dd", 32 | "metadata": { 33 | "tags": [] 34 | }, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Current: 184839; Read biorxiv: 184839, Elapsed time: 2.6651909351348877 
seconds\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "all_paper_full_infos=[] ### store all papers from bioxiv, chem, med, arxiv \n", 46 | "\n", 47 | "biorxiv_json = 'biorxiv-metadata-oai-snapshot.json'\n", 48 | "starting_date = date(1990,1,1)\n", 49 | "start_time = time.time()\n", 50 | "\n", 51 | "with jsonlines.open(biorxiv_json, 'r') as f:\n", 52 | " for id_of_abstract, line in enumerate(f):\n", 53 | " get_date = datetime.strptime(line['date'], '%Y-%m-%d').date()\n", 54 | " paper_time = (get_date - starting_date).days\n", 55 | " all_paper_full_infos.append([line['server'],line['title'],line['abstract'],paper_time])\n", 56 | "\n", 57 | "num1=len(all_paper_full_infos)\n", 58 | "elapsed_time = time.time() - start_time\n", 59 | "print(f\"Current: {len(all_paper_full_infos)}; Read biorxiv: {len(all_paper_full_infos)}, Elapsed time: {elapsed_time} seconds\")" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "id": "2407d05e-e234-4761-9e46-6a8c9f550cd9", 65 | "metadata": {}, 66 | "source": [ 67 | "## read medrxiv_json" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "id": "fdc41bb7-fda8-4029-83ac-844071ee0134", 74 | "metadata": { 75 | "tags": [] 76 | }, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "Current: 224071; Read medrxiv: 39232, Elapsed time: 0.6739270687103271 seconds\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "\n", 88 | "medrxiv_json = 'medrxiv-metadata-oai-snapshot.json'\n", 89 | "\n", 90 | "start_time = time.time()\n", 91 | "with jsonlines.open(medrxiv_json, 'r') as f:\n", 92 | " for id_of_abstract, line in enumerate(f):\n", 93 | " get_date = datetime.strptime(line['date'], '%Y-%m-%d').date()\n", 94 | " paper_time = (get_date - starting_date).days\n", 95 | " all_paper_full_infos.append([line['server'],line['title'],line['abstract'],paper_time])\n", 96 | "\n", 97 | "num2=len(all_paper_full_infos)\n", 98 | "elapsed_time = time.time() - start_time\n", 99 | "print(f\"Current: {len(all_paper_full_infos)}; Read medrxiv: {len(all_paper_full_infos)-num1}, Elapsed time: {elapsed_time} seconds\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "id": "5e6151f3-eb2a-4eab-aad3-d50246eddb38", 105 | "metadata": {}, 106 | "source": [ 107 | "## read chemrxiv_json" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 4, 113 | "id": "a6b19d82-ffff-4c2a-ba37-57031da11cd8", 114 | "metadata": { 115 | "tags": [] 116 | }, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "Current: 240551; Read chemrxiv: 16480, Elapsed time: 0.25910282135009766 seconds\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "\n", 128 | "chemrxiv_json = 'chemrxiv-metadata-oai-snapshot.json'\n", 129 | "\n", 130 | "start_time = time.time()\n", 131 | "with jsonlines.open(chemrxiv_json, 'r') as f:\n", 132 | " for id_of_abstract, line in enumerate(f):\n", 133 | " get_date = datetime.strptime(line['date'][:10], '%Y-%m-%d').date()\n", 134 | " paper_time = (get_date - starting_date).days\n", 135 | " all_paper_full_infos.append([line['server'],line['title'],line['abstract'],paper_time])\n", 136 | "\n", 137 | "num3=len(all_paper_full_infos)\n", 138 | "elapsed_time = time.time() - start_time\n", 139 | "print(f\"Current: {len(all_paper_full_infos)}; Read chemrxiv: {len(all_paper_full_infos)-num2}, Elapsed time: {elapsed_time} seconds\")" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "id": "6b282f2f-561a-4453-b4cd-5240d062b8ee", 145 | 
"metadata": {}, 146 | "source": [ 147 | "## remove duplicates papers " 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 23, 153 | "id": "e0d91f02-5b36-4814-af9d-d67f8f9a8583", 154 | "metadata": { 155 | "tags": [] 156 | }, 157 | "outputs": [ 158 | { 159 | "name": "stdout", 160 | "output_type": "stream", 161 | "text": [ 162 | "remove duplicates: 28\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "## remove repeated papers (there are some repeated papers)\n", 168 | "paper_infos_unique = set(map(tuple, all_paper_full_infos)) # convert each sublist to a tuple and create a set\n", 169 | "all_paper_infos_unique = list(map(list, paper_infos_unique)) # convert each tuple back to a list and create a list\n", 170 | "\n", 171 | "print(f\"remove duplicates: {len(all_paper_full_infos)-len(all_paper_infos_unique)}\")\n", 172 | "\n", 173 | "with open('all_paper_info_lists_bio_med_chem.pkl', 'wb') as f:\n", 174 | " pickle.dump(all_paper_infos_unique, f)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "id": "40ecc36a-4de1-40de-a22a-152e2efc6488", 180 | "metadata": {}, 181 | "source": [ 182 | "## read arxiv_json " 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 24, 188 | "id": "136d5452-07e0-4d5f-813c-46009fffce65", 189 | "metadata": { 190 | "tags": [] 191 | }, 192 | "outputs": [ 193 | { 194 | "name": "stdout", 195 | "output_type": "stream", 196 | "text": [ 197 | "arxiv: {id_of_abstract}\n", 198 | "Current: 2444442; Read chemrxiv: 2203891, Elapsed time: 44.237696170806885 seconds\n" 199 | ] 200 | } 201 | ], 202 | "source": [ 203 | "start_time = time.time()\n", 204 | "arxiv_json = 'arxiv-metadata-oai-snapshot.json'\n", 205 | "\n", 206 | "with jsonlines.open(arxiv_json, 'r') as f:\n", 207 | " for id_of_abstract, line in enumerate(f):\n", 208 | " get_date = datetime.strptime(line['versions'][0]['created'], '%a, %d %b %Y %H:%M:%S %Z').date()\n", 209 | " paper_time = (get_date - starting_date).days\n", 210 | " all_paper_infos_unique.append(['arxiv',line['title'],line['abstract'],paper_time])\n", 211 | " \n", 212 | "elapsed_time = time.time() - start_time\n", 213 | "print(\"arxiv: {id_of_abstract}\")\n", 214 | "print(f\"Current: {len(all_paper_infos_unique)}; Read chemrxiv: {len(all_paper_infos_unique)-num3}, Elapsed time: {elapsed_time} seconds\")" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "id": "b7af89d0-aac4-4372-ab86-828ce493f0d9", 220 | "metadata": {}, 221 | "source": [ 222 | "## Store all the processed preprint metadata" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "id": "2345c8dd-8b5c-4a87-8ef3-4529581f2063", 229 | "metadata": { 230 | "tags": [] 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "import pickle\n", 235 | "with open('all_paper_info_lists.pkl', 'wb') as f:\n", 236 | " pickle.dump(all_paper_infos_unique, f)" 237 | ] 238 | } 239 | ], 240 | "metadata": { 241 | "kernelspec": { 242 | "display_name": "Python 3 (ipykernel)", 243 | "language": "python", 244 | "name": "python3" 245 | }, 246 | "language_info": { 247 | "codemirror_mode": { 248 | "name": "ipython", 249 | "version": 3 250 | }, 251 | "file_extension": ".py", 252 | "mimetype": "text/x-python", 253 | "name": "python", 254 | "nbconvert_exporter": "python", 255 | "pygments_lexer": "ipython3", 256 | "version": "3.9.7" 257 | } 258 | }, 259 | "nbformat": 4, 260 | "nbformat_minor": 5 261 | } 262 | -------------------------------------------------------------------------------- 
/create_concepts/Concept_Corpus/s3_get_concepts.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "17001bd9-da10-4fe7-a3d4-9c5174944296", 6 | "metadata": {}, 7 | "source": [ 8 | "### load all the processed preprint papers " 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "18ebad34-ea44-4336-8665-ceabec4c5371", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "import pickle\n", 20 | "\n", 21 | "if os.path.exists('all_paper_info_lists.pkl'):\n", 22 | " # open the existing pickle file for reading\n", 23 | " with open('all_paper_info_lists.pkl', 'rb') as f:\n", 24 | " all_paper_lists = pickle.load(f)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "89c4475b-b22e-417d-8f74-64045ea27f90", 30 | "metadata": {}, 31 | "source": [ 32 | "### put title and abstract together, store in to string list" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "bd905ebe-207c-4f63-b869-5e4434343ce4", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "def get_single_article_string(article):\n", 43 | " \n", 44 | " curr_title=article[1] #'title'\n", 45 | " curr_abstract=article[2] #'abstract'\n", 46 | " \n", 47 | " replace_pairs=[['\\n',' '],['-',' '],[' \\\" a','oa'],['\\\" a','ae'],['\\\"a','ae'],[' \\\" o','oe'],['\\\" o','oe'],['\\\"o','oe'],[' \\\" u','ue'],\n", 48 | " ['\\\" u','ue'],['\\\"u','ue'],[' \\' a','a'],[' \\' e','e'],[' \\' o','o'],[\"\\' \", \"\"],[\"\\'\", \"\"],[' ',' '],[' ',' ']]\n", 49 | " \n", 50 | " article_string=(curr_title +' '+ curr_abstract).lower()\n", 51 | " \n", 52 | " for rep_pair in replace_pairs:\n", 53 | " #print(rep_pair)\n", 54 | " \n", 55 | " article_string=article_string.replace(rep_pair[0],rep_pair[1])\n", 56 | " #print(article_string)\n", 57 | " #print('\\n')\n", 58 | " \n", 59 | " return article_string\n", 60 | "\n", 61 | "def get_all_paper_strings(article_lists):\n", 62 | "\n", 63 | " if os.path.exists('all_paper_string_lists.pkl'):\n", 64 | " with open(\"all_paper_string_lists.pkl\", \"rb\") as f:\n", 65 | " all_paper_strings = pickle.load(f)\n", 66 | " \n", 67 | " else:\n", 68 | " all_paper_strings=[]\n", 69 | " cc=0\n", 70 | " for id_of_paper in range(len(article_lists)):\n", 71 | " cc+=1\n", 72 | " if (cc%300000)==0:\n", 73 | " print(str(cc)+'/'+str(len(article_lists)))\n", 74 | "\n", 75 | " all_paper_strings.append(get_single_article_string(article_lists[id_of_paper]))\n", 76 | "\n", 77 | " with open(\"all_paper_string_lists.pkl\", \"wb\") as f:\n", 78 | " pickle.dump(all_paper_strings, f)\n", 79 | " \n", 80 | " return all_paper_strings\n", 81 | "\n", 82 | "\n", 83 | "\n", 84 | "all_article_strings=get_all_paper_strings(all_paper_lists)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "4be3b05e-a8eb-4e1a-a4a6-273f702eac16", 90 | "metadata": {}, 91 | "source": [ 92 | "### Get Concepts from RAKE" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "845be865-8db6-4956-ade5-918b191954dd", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "import time\n", 103 | "import pickle\n", 104 | "import nltk\n", 105 | "from nltk.corpus import stopwords\n", 106 | "from nltk.stem import WordNetLemmatizer\n", 107 | "from rake_nltk import Metric, Rake\n", 108 | "from collections import Counter\n", 109 | "\n", 110 | "starting_time = time.time()\n", 111 | " \n", 112 | 
"wnl=WordNetLemmatizer()\n", 113 | "\n", 114 | "num_of_abstracts=len(all_paper_lists)\n", 115 | "\n", 116 | "personal_stop_list=['presents','us','show','one','two','three','describes','new','approach','many','introduces','http','also','whose', 'prove','select ','take']\n", 117 | "\n", 118 | "nltk_stop_list=nltk.corpus.stopwords.words('english')\n", 119 | "full_stop_list=nltk_stop_list + personal_stop_list\n", 120 | "\n", 121 | "\n", 122 | "all_concepts_from_rake=[]\n", 123 | "cc=0\n", 124 | "for id_of_abstract in range(num_of_abstracts):\n", 125 | " cc+=1\n", 126 | " if (cc%100000)==0:\n", 127 | " print(str(cc)+'/'+str(num_of_abstracts))\n", 128 | " \n", 129 | " \n", 130 | " single_string = get_single_article_string(all_paper_lists[id_of_abstract])\n", 131 | " \n", 132 | " r = Rake(stopwords=full_stop_list, ranking_metric=Metric.WORD_DEGREE, min_length=2, include_repeated_phrases=False)\n", 133 | "\n", 134 | " r.extract_keywords_from_text(single_string)\n", 135 | " ll=r.get_ranked_phrases_with_scores()\n", 136 | " \n", 137 | " all_concepts_from_rake.extend(ll)\n", 138 | "\n", 139 | "\n", 140 | "with open(\"all_concepts_from_rake.pkl\", \"wb\") as output_file:\n", 141 | " pickle.dump(all_concepts_from_rake, output_file)\n", 142 | " \n" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "id": "954d5a11-4106-42e6-aedc-4b2a02c34b17", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [] 152 | } 153 | ], 154 | "metadata": { 155 | "kernelspec": { 156 | "display_name": "asl_semnet", 157 | "language": "python", 158 | "name": "asl_semnet" 159 | }, 160 | "language_info": { 161 | "codemirror_mode": { 162 | "name": "ipython", 163 | "version": 3 164 | }, 165 | "file_extension": ".py", 166 | "mimetype": "text/x-python", 167 | "name": "python", 168 | "nbconvert_exporter": "python", 169 | "pygments_lexer": "ipython3", 170 | "version": "3.10.9" 171 | } 172 | }, 173 | "nbformat": 4, 174 | "nbformat_minor": 5 175 | } 176 | -------------------------------------------------------------------------------- /create_concepts/Domain_Concept/s0_prepare_optics_quantum_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "34f3f857-ce5c-4779-8209-ca0fb47340f6", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "import json\n", 13 | "import linecache\n", 14 | "import time\n", 15 | "import jsonlines\n", 16 | "from datetime import datetime, date\n", 17 | "import pickle\n", 18 | "import os" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "id": "2d360712-7cc0-4692-b5fc-69e2e6d2534d", 25 | "metadata": { 26 | "tags": [] 27 | }, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "sub arxiv: 2227429\n", 34 | "Quantum and Optics: 78084; Modified: 78084, Elapsed time: 20.9117271900177 seconds\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "starting_date = date(1990,1,1)\n", 40 | "start_time = time.time()\n", 41 | "\n", 42 | "arxiv_folder='arxiv-snapshot'\n", 43 | "arxiv_json = os.path.join(arxiv_folder,\"arxiv-metadata-oai-snapshot.json\")\n", 44 | "\n", 45 | "arxiv_optics_quantum_original=[]\n", 46 | "arxiv_optics_quantum_modified=[]\n", 47 | "\n", 48 | "with jsonlines.open(arxiv_json, 'r') as f:\n", 49 | " for id_of_abstract, line in enumerate(f):\n", 50 | " if line['categories'] in ['physics.optics','quant-ph']:\n", 51 | " 
arxiv_optics_quantum_original.append(line) ## store the original one\n", 52 | " \n", 53 | " get_date = datetime.strptime(line['versions'][0]['created'], '%a, %d %b %Y %H:%M:%S %Z').date()\n", 54 | " paper_time = (get_date - starting_date).days\n", 55 | " arxiv_optics_quantum_modified.append([line['categories'],line['title'],line['abstract'],paper_time]) ## store modified one\n", 56 | " \n", 57 | "elapsed_time = time.time() - start_time\n", 58 | "print(f\"sub arxiv: {id_of_abstract}\")\n", 59 | "print(f\"Quantum and Optics: {len(arxiv_optics_quantum_original)}; Modified: {len(arxiv_optics_quantum_modified)}, Elapsed time: {elapsed_time} seconds\")" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "id": "431e166c-9e21-4ecf-b741-021fcd919c94", 66 | "metadata": { 67 | "tags": [] 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "store_folder='data'\n", 72 | "\n", 73 | "with open(os.path.join(store_folder,'arxiv_optics_quantum_original.pkl'), 'wb') as f:\n", 74 | " pickle.dump(arxiv_optics_quantum_original, f)\n", 75 | " \n", 76 | "with open(os.path.join(store_folder,'arxiv_optics_quantum_style_modified.pkl'), 'wb') as f:\n", 77 | " pickle.dump(arxiv_optics_quantum_modified, f)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "id": "68c806ae-6789-48b5-ac54-725e73efda08", 84 | "metadata": { 85 | "tags": [] 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "with open(os.path.join(store_folder,'arxiv_optics_quantum_original.json'), 'w') as f:\n", 90 | " json.dump(arxiv_optics_quantum_original, f)\n", 91 | " \n", 92 | "with open(os.path.join(store_folder,'arxiv_optics_quantum_style_modified.json'), 'w') as f:\n", 93 | " json.dump(arxiv_optics_quantum_modified, f)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "id": "cacfa9ab-f568-477b-b7de-cd3b77f7ffd3", 99 | "metadata": {}, 100 | "source": [ 101 | "## make only strings (title+abstract)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 5, 107 | "id": "f2a75bed-b673-4e93-a7e0-775043857bab", 108 | "metadata": { 109 | "tags": [] 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "# ## (Read the modified metadata; [source, title, abstract, time])\n", 114 | "### (Make each article in string, under certain replacements)\n", 115 | "\n", 116 | "def get_single_article_string(article):\n", 117 | " \n", 118 | " curr_title=article[1] #'title'\n", 119 | " curr_abstract=article[2] #'abstract'\n", 120 | " \n", 121 | " replace_pairs=[['\\n',' '],['-',' '],[' \\\" a','oa'],['\\\" a','ae'],['\\\"a','ae'],[' \\\" o','oe'],['\\\" o','oe'],['\\\"o','oe'],[' \\\" u','ue'],\n", 122 | " ['\\\" u','ue'],['\\\"u','ue'],[' \\' a','a'],[' \\' e','e'],[' \\' o','o'],[\"\\' \", \"\"],[\"\\'\", \"\"],[' ',' '],[' ',' ']]\n", 123 | " \n", 124 | " article_string=(curr_title +' '+ curr_abstract).lower()\n", 125 | " \n", 126 | " for rep_pair in replace_pairs:\n", 127 | " #print(rep_pair)\n", 128 | " \n", 129 | " article_string=article_string.replace(rep_pair[0],rep_pair[1])\n", 130 | " #print(article_string)\n", 131 | " #print('\\n')\n", 132 | " \n", 133 | " return article_string\n", 134 | "\n", 135 | "\n", 136 | "def get_all_paper_strings(article_lists, folder_file):\n", 137 | "\n", 138 | " if os.path.exists(os.path.join(folder_file,'arxiv_optics_quantum_paper_strings.pkl')):\n", 139 | " with open(os.path.join(folder_file,'arxiv_optics_quantum_paper_strings.pkl'), \"rb\") as f:\n", 140 | " all_paper_strings = pickle.load(f) # load into all_paper_strings so the return below also works when the cached file exists\n", 141 | " \n", 142 | " else:\n", 143 
| " all_paper_strings=[]\n", 144 | " cc=0\n", 145 | " for id_of_paper in range(len(article_lists)):\n", 146 | " cc+=1\n", 147 | " #if (cc%3000)==0:\n", 148 | " #print(str(cc)+'/'+str(len(article_lists)))\n", 149 | "\n", 150 | " all_paper_strings.append(get_single_article_string(article_lists[id_of_paper]))\n", 151 | "\n", 152 | " with open(os.path.join(folder_file,'arxiv_optics_quantum_paper_strings.pkl'), \"wb\") as f:\n", 153 | " pickle.dump(all_paper_strings, f)\n", 154 | " \n", 155 | " return all_paper_strings \n", 156 | "\n", 157 | "\n", 158 | "all_article_strings=get_all_paper_strings(arxiv_optics_quantum_modified,folder_file=\"data\")" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "id": "121b2bb6-2385-4112-b61b-b0f9f9f8494b", 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Python 3 (ipykernel)", 173 | "language": "python", 174 | "name": "python3" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.10.9" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 5 191 | } 192 | -------------------------------------------------------------------------------- /create_concepts/Domain_Concept/s1_split_domain_papers.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from datetime import datetime, date 4 | import pickle 5 | import os 6 | import math 7 | 8 | log_folder='logs' 9 | if not os.path.exists(log_folder): 10 | os.makedirs(log_folder) 11 | 12 | 13 | folder_name="data_seperate" 14 | if not os.path.exists(folder_name): 15 | os.makedirs(folder_name) 16 | 17 | data_folder="Concept_Corpus" 18 | if not os.path.exists(data_folder): 19 | os.makedirs(data_folder) 20 | 21 | with open(os.path.join(data_folder,'arxiv_optics_quantum_paper_strings.pkl'), "rb") as f: 22 | get_all_paper_strings = pickle.load(f) 23 | 24 | log_file = os.path.join(log_folder, 'split_papers_log.txt') 25 | with open(log_file, 'a') as f: 26 | f.write(f"Seperate Optics and Quantum Papers: {len(get_all_paper_strings)}\n") 27 | 28 | 29 | # Determine the number of parts needed 30 | num_parts = math.ceil(len(get_all_paper_strings) / 1000) 31 | 32 | # Store 1000 elements in each part file 33 | for i in range(num_parts): 34 | time_starting=time.time() 35 | start_idx = i * 1000 36 | end_idx = min((i+1)*1000, len(get_all_paper_strings)) 37 | part_data = get_all_paper_strings[start_idx : end_idx] 38 | part_file = os.path.join(folder_name, f'part_{i:02}.pkl') 39 | with open(part_file, 'wb') as f: 40 | pickle.dump(part_data, f) 41 | elapsed_time = time.time() - time_starting 42 | with open(log_file, 'a') as f: 43 | f.write(f"{i}: {(i+1)/num_parts}; Elapsed time: {elapsed_time} seconds \n") -------------------------------------------------------------------------------- /create_concepts/Domain_Concept/s2_get_domain_concepts.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from datetime import datetime, date 4 | import pickle 5 | import os 6 | import random 7 | 8 | 9 | if __name__ == '__main__': 10 | 11 | log_folder='logs' 12 | try: 13 | os.mkdir(log_folder) 14 | except FileExistsError: 15 | pass 16 | 17 | 
data_folder="Concept_Corpus" 18 | data_seperate_folder="data_seperate" 19 | 20 | 21 | concept_folder="concept_seperate" 22 | try: 23 | os.mkdir(concept_folder) 24 | except FileExistsError: 25 | pass 26 | 27 | 28 | concept_list_pkl = os.path.join(data_folder,'full_concept_list.pkl') 29 | 30 | with open(concept_list_pkl, 'rb') as file: 31 | all_concept_lists = pickle.load(file) 32 | 33 | 34 | random.seed() 35 | total_file=78 36 | write_file=0 37 | cc=0 38 | while write_file <= total_file: 39 | 40 | curr_ID = random.randint(0, total_file) 41 | formatted_ID = '{:02d}'.format(curr_ID) 42 | data_file=os.path.join(data_seperate_folder, f'part_{formatted_ID}.pkl') 43 | 44 | concept_file=os.path.join(concept_folder, f'concept_{formatted_ID}.pkl') 45 | 46 | 47 | log_file = os.path.join('logs', 'log_'+formatted_ID+'.txt') 48 | if cc % 10 == 0: 49 | with open(log_file, 'a') as f: 50 | f.write(f'formatted_ID: {formatted_ID}; cc: {cc}, write_file num: {write_file}\n') 51 | cc+=1 52 | 53 | if not os.path.exists(concept_file): 54 | 55 | concepts_for_paper_list=[] 56 | 57 | with open(data_file, 'rb') as file: # read paper 58 | paper_info = pickle.load(file) 59 | 60 | concepts_at_least_one=[] 61 | 62 | # check all papers 63 | for id_paper, cur_paper in enumerate(paper_info): 64 | 65 | concepts_for_single_paper = [] 66 | 67 | for id_concept, cur_concept in enumerate(all_concept_lists): 68 | 69 | if cur_concept in cur_paper: # if the paper contains the concept 70 | concepts_for_single_paper.append(cur_concept) 71 | 72 | concepts_at_least_one.extend(concepts_for_single_paper) ## store the concepts from one paper 73 | 74 | 75 | finish_flag=0 76 | with open(concept_file, "wb") as output_file: 77 | pickle.dump(concepts_at_least_one, output_file) 78 | write_file+=1 79 | 80 | 81 | -------------------------------------------------------------------------------- /create_concepts/Domain_Concept/s3_merge_concepts.py: -------------------------------------------------------------------------------- 1 | # + 2 | import pickle 3 | import os 4 | import time 5 | from datetime import datetime, date 6 | import random 7 | 8 | 9 | # Create SemNet 10 | 11 | if __name__ == '__main__': 12 | 13 | 14 | concept_folder="concept_seperate" 15 | 16 | 17 | total_file=78 18 | 19 | ## finish all 20 | all_concepts_file = os.path.join(concept_folder,'all_concepts.pkl') # edges 21 | 22 | #if not os.path.exists(all_concepts_file1): 23 | 24 | all_concepts=[] 25 | 26 | for id_file in range(total_file+1): # start from 0: 0-78 27 | 28 | file_ID = '{:02d}'.format(id_file) 29 | cur_concept_file=os.path.join(concept_folder, f'concept_{file_ID}.pkl') 30 | 31 | 32 | with open(cur_concept_file, 'rb') as file: 33 | concept_info = pickle.load(file) 34 | 35 | all_concepts.extend(concept_info) 36 | 37 | with open(all_concepts_file, "wb") as output_file: 38 | pickle.dump(all_concepts, output_file) 39 | 40 | 41 | -------------------------------------------------------------------------------- /create_concepts/Domain_Concept/s4_improve_concepts.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "fbff53e8-c63a-4997-9761-b1018ca5c42e", 6 | "metadata": {}, 7 | "source": [ 8 | "##### improve domain concepts " 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "d1317372", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import pickle\n", 19 | "import os\n", 20 | "import time\n", 21 | "from datetime import datetime, 
date\n", 22 | "import nltk\n", 23 | "from nltk.corpus import stopwords\n", 24 | "from nltk.stem import WordNetLemmatizer\n", 25 | "from rake_nltk import Metric, Rake\n", 26 | "from collections import Counter\n", 27 | "import re\n", 28 | "from nltk.corpus import wordnet\n", 29 | "import random\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "id": "6f05270b-b134-44e8-9bb8-5a5642b41755", 35 | "metadata": {}, 36 | "source": [ 37 | "##### store" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "id": "bf5d512d-2f85-4782-a665-cbc9a483a42d", 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "concept_folder=\"concept_seperate\"\n", 48 | "## finish all \n", 49 | "all_concepts_file = os.path.join(concept_folder,'all_concepts.pkl') # edges\n", 50 | "with open(all_concepts_file, \"rb\") as output_file:\n", 51 | " all_concepts=pickle.load(output_file)\n", 52 | " \n", 53 | "## remove repeated concepts\n", 54 | "unique_concepts = list(set(all_concepts))\n", 55 | "concepts_file='full_domain_concepts.txt' # rename 'full_concepts_form_openalex.txt'\n", 56 | "f = open(concepts_file, \"a\")\n", 57 | "for ii in range(len(unique_concepts)):\n", 58 | " f.write(unique_concepts[ii]+'\\n')\n", 59 | "f.close()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "id": "8433c817", 65 | "metadata": {}, 66 | "source": [ 67 | "##### read the concepts file" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 2, 73 | "id": "5ad70368", 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "17-04-2023 12:06:38; Concepts: 80675 \n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | " \n", 86 | "if os.path.exists(concepts_file):\n", 87 | " # open the existing file for reading \n", 88 | " with open(concepts_file, \"r\") as f:\n", 89 | " modify_full_concept_list = [line.rstrip() for line in f.readlines()]\n", 90 | " \n", 91 | " now_time = datetime.now()\n", 92 | " formatted_time = now_time.strftime(\"%d-%m-%Y %H:%M:%S\")\n", 93 | " print(\"{}; Concepts: {:d} \".format(formatted_time,len(modify_full_concept_list)))" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "id": "421f5569", 99 | "metadata": {}, 100 | "source": [ 101 | "##### filter concepts" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 3, 107 | "id": "464d27f2", 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | "Concepts: 80675 ; Store: 80642; Remove: 33 \n", 115 | "Elapsed time: 0.34 seconds\n", 116 | "17-04-2023 12:06:46; Concepts: 80642 \n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "\n", 122 | "starting_time = time.time()\n", 123 | "\n", 124 | "filter_concept_any=['held','equal','dramatic','slowing','excited','occupied','charged','moving','layer','bi','argument','intuition','experiment','entirely','essentially','built','necessary','take','applicable','employ','visit','visited','herein','facilitates','varying','overlapping','addressed','issues','related','add','adds','dominant','preserve','preserves','preserved','stabilizing','match','manipulating','emerging','processed','data','continuously','analytically','argue','smoothly','connect','connects','connecting','software','matlab','toolbox','standard','industrial','technology','success','equipment','call','analogous','sense','persist','persists','throughout','calculated','useful','difficult','proved']\n", 125 | "\n", 126 | 
"filter_concept_start=['sophisticated','precise','remarkably','consists','gradually','simplified','complete','techniques','partially','presented','iterative','simple','preparation','clear','priori','ae','substantial','sending','protecting','optimized','optimize','optimizing','transmits','transmit','transmitting','transmitted','processing','pre','collect','collected','measured','varied','operating','algorithms','algorithm','robustly','shall','concept','packing','successful','apparent','apparently','readily','adapted','todays','imperfect','seemingly','seeming','shelf','properties','mechanism','phenomenon','behavior','theorem','procedure','usual','form','later','calculating','fundamentally']\n", 127 | "\n", 128 | "filter_concept_end=['illustrates','setup','consisting','set','capable','configuration','complete','borrowed','permit','utilizes','referred','refer','capable','pave','stem','preparation','scheme','optimizes','transmitted','transmit','operating','relate','packed','packing','platform','industry','adapt','adapts','adapted','arrangement','era','device','arrange','arranged','content','procedure','outlined','form','formed','followed','following','calculation']\n", 129 | "\n", 130 | "\n", 131 | "concept_to_remove_pair=['self']\n", 132 | "concept_to_keep_pair=['stabilization']\n", 133 | "\n", 134 | "conditioned_filter_concept_any5=['open']\n", 135 | "conditioned_filter_concept_any3=['driven','component']\n", 136 | "conditioned_filter_concept_any2=['probe','inspired','technique','open','added','transfer','connected','element','exchange']\n", 137 | "\n", 138 | "conditioned_filter_concept_start2=['doubly','probe']\n", 139 | "conditioned_filter_concept_end2=[]\n", 140 | "\n", 141 | "forbidden_continued_strings=['complete measurement','exact numerical','numerical technique','numerical method','complete set','pure entangled','quantum entangled','high fidelity']\n", 142 | "\n", 143 | "improve_full_concept_list=[]\n", 144 | "\n", 145 | "for one_concept in modify_full_concept_list:\n", 146 | " \n", 147 | " separated_words=one_concept.split()\n", 148 | " do_remove=0\n", 149 | " for word in separated_words:\n", 150 | " if word in filter_concept_any:\n", 151 | " do_remove=1\n", 152 | " break\n", 153 | " \n", 154 | " if len(separated_words)<5: ## only for 5 words\n", 155 | " if word in conditioned_filter_concept_any5:\n", 156 | " do_remove=1\n", 157 | " break\n", 158 | "\n", 159 | " if len(separated_words)<=3:\n", 160 | " if word in conditioned_filter_concept_any3:\n", 161 | " do_remove=1\n", 162 | " break\n", 163 | " \n", 164 | " if len(separated_words)==2: ## only for 2 words\n", 165 | " if word in conditioned_filter_concept_any2:\n", 166 | " do_remove=1\n", 167 | " break\n", 168 | "\n", 169 | " \n", 170 | " \n", 171 | " if separated_words[0] in filter_concept_start:\n", 172 | " do_remove=1\n", 173 | " if separated_words[-1] in filter_concept_end:\n", 174 | " do_remove=1\n", 175 | " \n", 176 | " if len(separated_words)==2:\n", 177 | " if separated_words[0] in conditioned_filter_concept_start2: #check the start word \n", 178 | " do_remove=1\n", 179 | " if separated_words[-1] in conditioned_filter_concept_end2: #check the last word \n", 180 | " do_remove=1\n", 181 | "\n", 182 | " if do_remove==0:\n", 183 | " for word in forbidden_continued_strings:\n", 184 | " if word in one_concept:\n", 185 | " do_remove=1\n", 186 | " break\n", 187 | "\n", 188 | " if do_remove==0:\n", 189 | " improve_full_concept_list.append(one_concept)\n", 190 | " \n", 191 | "print(\"Concepts: {:d} ; Store: {:d}; Remove: {:d} 
\".format(len(modify_full_concept_list), len(improve_full_concept_list),len(modify_full_concept_list)-len(improve_full_concept_list)))\n", 192 | "elapsed_time = time.time() - starting_time\n", 193 | "print(\"Elapsed time: {:.2f} seconds\".format(elapsed_time))\n", 194 | "\n", 195 | "now_time = datetime.now()\n", 196 | "formatted_time = now_time.strftime(\"%d-%m-%Y %H:%M:%S\")\n", 197 | "print(\"{}; Concepts: {:d} \".format(formatted_time,len(improve_full_concept_list)))" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "id": "736ce8e2", 203 | "metadata": {}, 204 | "source": [ 205 | "##### restore the file" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 4, 211 | "id": "2cd82ff3-5ffd-4988-83cf-d59fccf32e8e", 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "name": "stdout", 216 | "output_type": "stream", 217 | "text": [ 218 | "txt has been deleted.\n", 219 | "re-create text and store information.\n", 220 | "17-04-2023 12:06:52; Concepts: 80642 \n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "\n", 226 | "# Delete the orginal txt and re-create a new one with the improved concepts \n", 227 | "if os.path.exists(concepts_file):\n", 228 | " os.remove(concepts_file)\n", 229 | " print(\"txt has been deleted.\")\n", 230 | "\n", 231 | " # re-Create the text file \n", 232 | " f = open(concepts_file, \"a\")\n", 233 | " for ii in range(len(improve_full_concept_list)):\n", 234 | " f.write(improve_full_concept_list[ii]+'\\n')\n", 235 | " f.close()\n", 236 | " print(\"re-create text and store information.\") \n", 237 | "else:\n", 238 | " f = open(concepts_file, \"a\")\n", 239 | " for ii in range(len(improve_full_concept_list)):\n", 240 | " f.write(improve_full_concept_list[ii]+'\\n')\n", 241 | " f.close()\n", 242 | " print(\"create text and store information.\")\n", 243 | " \n", 244 | "now_time = datetime.now()\n", 245 | "formatted_time = now_time.strftime(\"%d-%m-%Y %H:%M:%S\")\n", 246 | "print(\"{}; Concepts: {:d} \".format(formatted_time,len(improve_full_concept_list)))\n" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "id": "55cc750e", 252 | "metadata": {}, 253 | "source": [ 254 | "##### additionally, store a pkl file (as a backup)" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "id": "1987ab1c", 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "concepts_path_pkl='improved_concepts_form_openalex.pkl'\n", 265 | "with open(concepts_path_pkl, \"wb\") as output_file:\n", 266 | " pickle.dump(improve_full_concept_list, output_file)" 267 | ] 268 | } 269 | ], 270 | "metadata": { 271 | "kernelspec": { 272 | "display_name": "Python 3 (ipykernel)", 273 | "language": "python", 274 | "name": "python3" 275 | }, 276 | "language_info": { 277 | "codemirror_mode": { 278 | "name": "ipython", 279 | "version": 3 280 | }, 281 | "file_extension": ".py", 282 | "mimetype": "text/x-python", 283 | "name": "python", 284 | "nbconvert_exporter": "python", 285 | "pygments_lexer": "ipython3", 286 | "version": "3.9.7" 287 | } 288 | }, 289 | "nbformat": 4, 290 | "nbformat_minor": 5 291 | } 292 | -------------------------------------------------------------------------------- /create_concepts/Domain_Concept/s5_improve_manually_concepts.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, date 3 | 4 | ## repeated initial_num=0,1,2,...., start from 0, the second run will be 1, etc. 
5 | # the new_file_name is the final filtered concepts 6 | initial_num=0 7 | file_name='full_domain_concepts_'+str(initial_num)+'.txt' #'full_concepts_for_openalex_'+str(initial_num)+'.txt' 8 | curr_file = os.path.join("full_concepts",file_name) 9 | new_concept_list=[] 10 | 11 | with open(curr_file, 'r') as file: 12 | lines = file.readlines() 13 | 14 | concept_count=0 15 | for idx, cc in enumerate(lines): 16 | #if cc[0]!='-': 17 | if "-" not in cc: 18 | new_concept_list.append(cc) 19 | concept_count+=1 20 | 21 | now_time = datetime.now() 22 | formatted_time = now_time.strftime("%d-%m-%Y %H:%M:%S") 23 | print(f"{formatted_time}, Concepts: {concept_count} ; Remove: {idx-concept_count+1} ") 24 | 25 | new_num=initial_num+1 26 | new_file_name='full_domain_concepts_'+str(new_num)+'.txt' # the final concept file will be renamed as full_domain_concepts.txt 27 | ### 28 | with open(new_file_name, 'w') as file: 29 | for item in new_concept_list: 30 | file.write(f"{item}") -------------------------------------------------------------------------------- /create_dynamic_concepts/get_concept_citation.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import gzip 3 | import json 4 | import os 5 | import time 6 | from datetime import datetime, date 7 | import pickle 8 | from functools import reduce 9 | import random 10 | import re 11 | 12 | 13 | def get_single_article_string(article): 14 | 15 | curr_title=article['title'] 16 | abstract_inverted_index = article['abstract_inverted_index'] 17 | 18 | # Flatten the inverted index into a list of (position, word) tuples 19 | position_word_list = [(position, word) for word, positions in abstract_inverted_index.items() for position in positions] 20 | 21 | # Sort the list by position and extract the words 22 | sorted_abstract = sorted(position_word_list) 23 | curr_abstract = ' '.join(word for position, word in sorted_abstract) 24 | 25 | # Replace strings according to the replace_pairs list 26 | replace_pairs=[['\n',' '],['-',' '],[' \" a','oa'],['\" a','ae'],['\"a','ae'],[' \" o','oe'],['\" o','oe'],['\"o','oe'],[' \" u','ue'],['\" u','ue'],['\"u','ue'],[' \' a','a'],[' \' e','e'],[' \' o','o'],["\' ", ""],["\'", ""],[' ',' '],[' ',' ']] 27 | 28 | article_string=(curr_title +' '+ curr_abstract).lower() 29 | article_string = reduce(lambda text, pair: text.replace(pair[0], pair[1]), replace_pairs, article_string) 30 | 31 | return article_string 32 | 33 | # Define a sorting key function to extract the date and part number from the path 34 | def get_date_and_part_from_path(path): 35 | date_folder = os.path.dirname(path) 36 | date_str = date_folder.split('=')[-1] 37 | 38 | file_name = os.path.basename(path) 39 | part_str = file_name.split('_')[-1].split('.')[0] 40 | 41 | return date_str, int(part_str) 42 | 43 | def extract_id(filename): 44 | match = re.search(r'log_concept_part_(\d+)_', filename) 45 | if match: 46 | return int(match.group(1)) 47 | return None 48 | 49 | 50 | # define a log foler 51 | log_folder = 'logs_concept' 52 | # define edge_list foler 53 | vertex_folder = 'concept_citation' 54 | vertex_folder_log = 'concept_citation_log' 55 | 56 | try: 57 | if not os.path.exists(log_folder): 58 | os.makedirs(log_folder) 59 | 60 | if not os.path.exists(vertex_folder): 61 | os.makedirs(vertex_folder) 62 | 63 | if not os.path.exists(vertex_folder_log): 64 | os.makedirs(vertex_folder_log) 65 | 66 | except FileExistsError: 67 | pass 68 | 69 | 70 | data_folder="data_concept_graph" 71 | cwd = os.getcwd() 72 | parent_dir 
= os.path.dirname(cwd) 73 | concept_folder = os.path.join(parent_dir, data_folder) 74 | 75 | 76 | #project_path="/u/xmgu/projects/semnet_openalex" 77 | #base_folder=os.path.join(project_path,'openalex_workdata_filtered/data/works/') 78 | 79 | # Define the base folder, date pattern and file pattern 80 | base_folder = 'openalex_workdata_filtered/data/works/' 81 | date_pattern = 'updated_date=*' 82 | file_pattern = 'filtered_part_*.gz' 83 | 84 | # Find all the files matching the pattern 85 | file_paths = glob.glob(f'{base_folder}/{date_pattern}/{file_pattern}') 86 | # Sort the file_paths list in ascending order based on the date and part number 87 | file_paths = sorted(file_paths, key=get_date_and_part_from_path) 88 | 89 | # Define the date range or specific folders to include 90 | start_date = datetime.strptime("2022-12-20", "%Y-%m-%d") 91 | end_date = datetime.strptime("2023-03-28", "%Y-%m-%d") 92 | 93 | # Filter the file_paths list based on the date range or specific folders 94 | curr_run_file_paths = [path for path in file_paths if start_date <= datetime.strptime(get_date_and_part_from_path(path)[0], "%Y-%m-%d") <= end_date] 95 | 96 | rnd_time=random.random()*50 97 | time.sleep(rnd_time) 98 | 99 | # Read all concepts from full_final_concepts/full_domain_concept.txt 100 | concepts_files = os.path.join(concept_folder, 'full_domain_concept.txt') 101 | with open(concepts_files, 'r') as file: 102 | full_concepts = [concept.strip() for concept in file.readlines()] 103 | 104 | # Define a list to store the vertex lists 105 | paper_starting_date = date(1990,1,1) 106 | 107 | write_file=0 108 | 109 | rnd_time=random.random()*60 110 | time.sleep(rnd_time) 111 | 112 | while write_file <=len(curr_run_file_paths): 113 | 114 | curr_ID = random.randint(0, len(curr_run_file_paths)-1) # get a random number between 0 and the number of files 115 | 116 | formatted_ID = '{:03d}'.format(curr_ID) 117 | 118 | edge_file=os.path.join(vertex_folder, 'concept_part_'+formatted_ID+'.gz') 119 | edge_file_log=os.path.join(vertex_folder_log, 'concept_part_'+formatted_ID+'.txt') 120 | log_file_txt=os.path.join(log_folder, 'log_concept_part_'+formatted_ID+'.txt') 121 | log_file_txt_finish=os.path.join(log_folder, 'log_concept_part_'+formatted_ID+'_finish.txt') 122 | log_file_txt_empty=os.path.join(log_folder, 'log_concept_part_'+formatted_ID+'_empty.txt') 123 | 124 | if not os.path.exists(log_file_txt): 125 | current_time=datetime.now() 126 | open(log_file_txt, 'a').close() 127 | 128 | file_path=curr_run_file_paths[curr_ID] 129 | with open(log_file_txt, 'a') as log_file: 130 | log_file.write(f'Current time: {current_time}; Number of files: {len(curr_run_file_paths)}; Number of concepts: {len(full_concepts)}\n\n') 131 | log_file.write(f'Start the File: {file_path}; Current time: {datetime.now()} \n\n') 132 | 133 | with gzip.open(file_path, 'rt') as file: 134 | lines = file.readlines() 135 | 136 | if not lines: # if lines is empty 137 | print(f'File {file_path} is empty') 138 | write_file+=1 139 | with open(log_file_txt_empty, 'a') as log_file: 140 | log_file.write(f'Current File: {file_path}; Paper: {len(lines)}; File is Empty!\n') 141 | 142 | else: 143 | edge_lists=[] 144 | for id_line, line in enumerate(lines): 145 | time_start_line=time.time() 146 | 147 | article_object = json.loads(line) # Load the JSON object 148 | get_date = datetime.strptime(article_object['publication_date'], "%Y-%m-%d").date() 149 | curr_paper_time = (get_date - paper_starting_date).days 150 | curr_all_citations=article_object['cited_by_count'] 151 | 
curr_citations_per_year=article_object['counts_by_year'] 152 | curr_article=get_single_article_string(article_object) 153 | 154 | 155 | # Check if the article contains any of the concepts 156 | concepts_for_single_paper=[] 157 | for id_concept, concept in enumerate(full_concepts): 158 | if concept in curr_article: # if the paper contains the concept; then store its concept index 159 | concepts_for_single_paper.append(id_concept) 160 | 161 | for ii in range(len(concepts_for_single_paper)): 162 | edge_lists.append([concepts_for_single_paper[ii],curr_paper_time,curr_all_citations,curr_citations_per_year]) 163 | 164 | if id_line % 10000 == 0: 165 | with open(log_file_txt, 'a') as log_file: 166 | log_file.write(f'Current File: {file_path}; Paper: {len(lines)}; Processed: {(id_line+1)/len(lines)}; time: {time.time()-time_start_line}\n') 167 | 168 | # Finish the current file, then store edge_lists to a pickle file 169 | with gzip.open(edge_file, 'wb') as output_file: 170 | pickle.dump(edge_lists, output_file) 171 | write_file+=1 172 | 173 | with open(edge_file_log, 'a') as log_file: 174 | log_file.write(f'\nconcept_list={len(edge_lists)}') 175 | 176 | with open(log_file_txt, 'a') as log_file: 177 | log_file.write(f'\n\nFinish Time: {datetime.now()}; Current File: {file_path}; Processed: {write_file}/{len(curr_run_file_paths)}, i.e., {write_file/len(curr_run_file_paths)} \n') 178 | 179 | with open(log_file_txt_finish, 'a') as log_file: 180 | log_file.write(f'\n\nFinish Time: {datetime.now()}; Current File: {file_path} \n') 181 | 182 | rnd_time=random.random()*5 183 | time.sleep(rnd_time) 184 | 185 | else: 186 | # Match file patterns 187 | finish_pattern = os.path.join(log_folder, 'log_concept_part_*_finish.txt') 188 | empty_pattern = os.path.join(log_folder, 'log_concept_part_*_empty.txt') 189 | finished_files = [f for f in glob.glob(finish_pattern) if extract_id(f) in range(0, len(curr_run_file_paths))] 190 | empty_files = [f for f in glob.glob(empty_pattern) if extract_id(f) in range(0, len(curr_run_file_paths))] 191 | 192 | # Count files that match each pattern 193 | total_files = len(finished_files) + len(empty_files) 194 | 195 | # Check if the total count is larger than 391 196 | if total_files >= len(curr_run_file_paths): 197 | print(f"{datetime.now()}:Finish run!") 198 | break 199 | 200 | 201 | 202 | with open("job_finish.txt", 'a') as f: 203 | f.write(f'\nFinish all: {datetime.now()}\n') 204 | 205 | 206 | 207 | 208 | 209 | -------------------------------------------------------------------------------- /create_dynamic_concepts/merge_concept_citation.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import gzip 3 | import json 4 | import os 5 | import time 6 | from datetime import datetime, date 7 | import pickle 8 | from functools import reduce 9 | import random 10 | 11 | 12 | log_folder = 'logs' 13 | if not os.path.exists(log_folder): 14 | os.makedirs(log_folder) 15 | log_files='log_merge_concept_citation.txt' 16 | 17 | # define vertex_list foler 18 | vertex_list_folder = 'concept_citation' 19 | if not os.path.exists(vertex_list_folder): 20 | os.makedirs(vertex_list_folder) 21 | 22 | list_file_names = os.listdir(vertex_list_folder) # List all files in the directory 23 | vertex_file_name_unsorted = [file for file in list_file_names if file.endswith('.gz')] 24 | vertex_lists_files = sorted(vertex_file_name_unsorted) # Sort the file list 25 | 26 | full_vertex_lists = os.path.join(vertex_list_folder,'all_concept_citation.gz') # vertex 27 
| 28 | 29 | with open(os.path.join(log_folder, log_files), 'a') as f: 30 | f.write(f'\nStart: {datetime.now()}\n') 31 | 32 | 33 | full_vertices=[] 34 | empty_count=0 35 | for id_file, curr_vertex_files in enumerate(vertex_lists_files): 36 | 37 | with gzip.open(os.path.join(vertex_list_folder, curr_vertex_files), 'rb') as f: # load the vertex list 38 | vertex_data_list = pickle.load(f) 39 | 40 | if vertex_data_list!=[]: # skip empty files 41 | full_vertices.extend(vertex_data_list) 42 | else: 43 | empty_count+=1 44 | print(f'Empty file: {curr_vertex_files}') 45 | 46 | # write to log file 47 | with open(os.path.join(log_folder, log_files), 'a') as f: 48 | f.write(f'Finish file: {curr_vertex_files}; v: {len(full_vertices)}; Processed: {(id_file+1)/len(vertex_lists_files)}; empty Num: {empty_count}\n') 49 | 50 | # store the vertices list in a gz file 51 | with gzip.open(full_vertex_lists, 'wb') as f: 52 | pickle.dump(full_vertices, f) 53 | 54 | with open(os.path.join(log_folder, log_files), 'a') as f: 55 | f.write(f'\nFinish: {datetime.now()}\n') 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /create_dynamic_concepts/process_concept_to_pandas_frame.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pickle 4 | import gzip 5 | from datetime import datetime, date 6 | import numpy as np 7 | import pandas as pd 8 | import time 9 | import copy 10 | 11 | log_folder = 'logs' # log folder 12 | if not os.path.exists(log_folder): 13 | os.makedirs(log_folder) 14 | 15 | data_folder="concept_citation" 16 | data_file=os.path.join(data_folder,'all_concept_citation.gz') 17 | 18 | 19 | store_folder="data_concept_graph" 20 | cwd = os.getcwd() 21 | parent_dir = os.path.dirname(cwd) # get parent directory 22 | new_dir_path = os.path.join(parent_dir, store_folder) 23 | os.makedirs(new_dir_path, exist_ok=True) 24 | 25 | store_data_file = os.path.join(new_dir_path, "full_dynamic_concept.parquet") 26 | 27 | 28 | logsfile=os.path.join(log_folder,"logs_process_concepts.txt") 29 | starting_time=time.time() 30 | print(f'{datetime.now()}: read full graph') 31 | with open(logsfile+'.txt', "a") as myfile: 32 | myfile.write(f'\n{datetime.now()}: read full graph') 33 | 34 | with gzip.open(data_file, 'rb') as f: # load the edge list 35 | full_dynamic_concept = pickle.load(f) 36 | 37 | with open(logsfile+'.txt', "a") as myfile: 38 | myfile.write(f"\n{datetime.now()}: Done, Total: {len(full_dynamic_concept)}; Elapsed time: {time.time() - starting_time} seconds\n") 39 | 40 | 41 | # process the edge list to make each element with the same size 42 | ## [concept, paper_time, total_citation, citation_per_year] 43 | ## e.g., [7, 10378, 1, [{'year': 2022, 'cited_by_count': 1}]] becomes [7, 10378, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 44 | 45 | starting_time = time.time() 46 | full_dynamic_concept_copy = copy.deepcopy(full_dynamic_concept) 47 | for i, item in enumerate(full_dynamic_concept): 48 | years_data = {year_data['year']: year_data['cited_by_count'] for year_data in item[3]} 49 | new_list = [years_data.get(year, 0) for year in range(2023, 2011, -1)] ## as cited_by_count only contains the last 10 years 50 | full_dynamic_concept_copy[i] = item[:3] + new_list 51 | 52 | if i % 200000 == 0: 53 | with open(logsfile+'.txt', "a") as myfile: 54 | myfile.write(f"\nProcessing item {i+1}/{len(full_dynamic_concept_copy)}") 55 | 56 | 57 | time_start = time.time() 58 | full_concept=np.array(full_dynamic_concept_copy) 
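# (Editor's note, illustrative only - not part of the original script.)
# After the loop above, every row of full_dynamic_concept_copy is expected to hold 3 + 12 = 15 integers:
# [v1, time, ct] plus one citation count per year from 2023 down to 2012. The np.array conversion above
# and the 15 column names passed to pd.DataFrame below rely on this fixed width; under that assumption,
# a quick sanity check would be:
# assert all(len(row) == 15 for row in full_dynamic_concept_copy)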
59 | with open(logsfile+'.txt', "a") as myfile: 60 | myfile.write(f"\nDone, convert array; Elapsed time: {time.time() - time_start} seconds") 61 | 62 | 63 | time_start = time.time() 64 | full_concept_df = pd.DataFrame(full_concept, columns=['v1', 'time', 'ct', 'c2023', 'c2022', 'c2021', 'c2020', 'c2019', 'c2018', 'c2017', 'c2016', 'c2015', 'c2014', 'c2013', 'c2012']) 65 | 66 | full_concept_df.to_parquet(store_data_file, compression='gzip') 67 | 68 | with open(logsfile+'.txt', "a") as myfile: 69 | myfile.write(f"\n{datetime.now()}: Done, full_graph: {len(full_concept_df)}; Elapsed time: {time.time() - time_start} seconds") 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /create_dynamic_edges/_get_openalex_workdata.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from botocore import UNSIGNED 3 | from botocore.config import Config 4 | import gzip 5 | import jsonlines 6 | import json 7 | import os 8 | 9 | 10 | 11 | # Function to filter the JSON objects by the desired keys 12 | def filter_json_objects(json_obj, journal_paper, journal_paper_with_abstract): 13 | desired_keys = ['type', 'title', 'abstract_inverted_index', 'cited_by_count', 'counts_by_year', 'publication_year', 'publication_date'] 14 | # Check if all the desired keys are in the JSON object 15 | if all(key in json_obj for key in desired_keys): 16 | if json_obj['type'] == 'journal-article' and json_obj['title'] not in [{}, None] and json_obj['publication_year'] not in [{}, None] and json_obj['publication_date'] not in [{}, None]: 17 | journal_paper += 1 18 | if json_obj['abstract_inverted_index'] not in [{}, None]: 19 | journal_paper_with_abstract += 1 20 | return {key: json_obj[key] for key in desired_keys}, journal_paper, journal_paper_with_abstract 21 | return None, journal_paper, journal_paper_with_abstract 22 | 23 | 24 | 25 | # check whether a logs folder exists 26 | logs_path = 'logs' 27 | if not os.path.exists(logs_path): 28 | os.makedirs(logs_path) 29 | 30 | 31 | journal_paper = 0 32 | journal_paper_with_abstract = 0 33 | # Create a local directory for the filtered files 34 | local_base_folder = 'openalex_workdata_filtered' 35 | os.makedirs(local_base_folder, exist_ok=True) 36 | 37 | # Configure the S3 client for anonymous access 38 | s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED)) 39 | # Specify the S3 bucket and prefix (folder) 40 | bucket_name = 'openalex' 41 | prefix ='data/works/' 42 | # Iterate through the objects in the specified S3 bucket and prefix 43 | paginator = s3.get_paginator('list_objects_v2') 44 | 45 | for id_page, page in enumerate(paginator.paginate(Bucket=bucket_name, Prefix=prefix)): 46 | 47 | for id_obj, obj in enumerate(page['Contents']): 48 | 49 | if obj['Key'].split('/')[-1] == 'manifest': 50 | continue # Skip the manifest file 51 | 52 | log_filename = os.path.join(logs_path, obj['Key'].split('/')[-2]+'_'+obj['Key'].split('/')[-1]+'_log.txt') 53 | with open(log_filename, 'a') as log_file: 54 | log_file.write(f"Page {id_page}, object {id_obj}; obj['Key']: {obj['Key']}\n") 55 | 56 | # Download and process the gzip-compressed JSON Lines file 57 | s3_object = s3.get_object(Bucket=bucket_name, Key=obj['Key']) 58 | 59 | with gzip.GzipFile(fileobj=s3_object['Body'], mode='r') as gz_file: 60 | with jsonlines.Reader(gz_file) as reader: 61 | filtered_objects = [] 62 | for id_json, json_obj in enumerate(reader): 63 | filtered_obj, journal_paper, journal_paper_with_abstract = 
filter_json_objects(json_obj, journal_paper, journal_paper_with_abstract) 64 | 65 | if filtered_obj is not None: 66 | filtered_objects.append(filtered_obj) 67 | 68 | if id_json % 5000==0: 69 | with open(log_filename, 'a') as log_file: 70 | log_file.write(f"\n Processed {id_json} objects \n") 71 | 72 | # Prepare the local folder structure 73 | local_path_parts = obj['Key'].split('/') 74 | local_filtered_folder = os.path.join(local_base_folder, *local_path_parts[:-1]) 75 | os.makedirs(local_filtered_folder, exist_ok=True) 76 | 77 | # Store the filtered objects in a new gzip-compressed JSON Lines file on the local computer 78 | filtered_file_name = f"filtered_{local_path_parts[-1]}" 79 | filtered_file_path = os.path.join(local_filtered_folder, filtered_file_name) 80 | with gzip.open(filtered_file_path, 'wt') as f: 81 | for item in filtered_objects: 82 | f.write(json.dumps(item) + '\n') 83 | with open(log_filename, 'a') as log_file: 84 | log_file.write(f"Finish writing {filtered_file_path}; until now, journal_paper: {journal_paper}; journal_paper_with_abstract: {journal_paper_with_abstract}\n") 85 | #print(f"Finish writing {obj['Key']}: {filtered_file_path} \n") 86 | 87 | print(f"Finish writing all \n") -------------------------------------------------------------------------------- /create_dynamic_edges/_get_openalex_workdata_parallel_run1.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from botocore import UNSIGNED 3 | from botocore.config import Config 4 | import gzip 5 | import jsonlines 6 | import json 7 | import os 8 | 9 | 10 | 11 | # Function to filter the JSON objects by the desired keys 12 | def filter_json_objects(json_obj, journal_paper, journal_paper_with_abstract): 13 | desired_keys = ['type', 'title', 'abstract_inverted_index', 'cited_by_count', 'counts_by_year', 'publication_year', 'publication_date'] 14 | # Check if all the desired keys are in the JSON object 15 | if all(key in json_obj for key in desired_keys): 16 | if json_obj['type'] == 'journal-article' and json_obj['title'] not in [{}, None] and json_obj['publication_year'] not in [{}, None] and json_obj['publication_date'] not in [{}, None]: 17 | journal_paper += 1 18 | if json_obj['abstract_inverted_index'] not in [{}, None]: 19 | journal_paper_with_abstract += 1 20 | return {key: json_obj[key] for key in desired_keys}, journal_paper, journal_paper_with_abstract 21 | return None, journal_paper, journal_paper_with_abstract 22 | 23 | # check whether a logs folder exists 24 | logs_path = 'logs' 25 | if not os.path.exists(logs_path): 26 | os.makedirs(logs_path) 27 | 28 | 29 | 30 | journal_paper = 0 31 | journal_paper_with_abstract = 0 32 | # Create a local directory for the filtered files 33 | local_base_folder = 'openalex_workdata_filtered' 34 | os.makedirs(local_base_folder, exist_ok=True) 35 | # Configure the S3 client for anonymous access 36 | s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED)) 37 | 38 | # Iterate through the objects in the specified S3 bucket and prefix 39 | paginator = s3.get_paginator('list_objects_v2') 40 | 41 | # Specify the S3 bucket and prefix (folder) as an example here 42 | # change the folder files such that one can do parallel computing with many run code files 43 | process_folder=['updated_date=2023-03-27','updated_date=2023-03-28'] # just an example 44 | 45 | bucket_name = 'openalex' 46 | prefix ='data/works/' 47 | 48 | log_folders = os.path.join(logs_path, 
process_folder[0]+'_'+process_folder[-1].split('=')[1]+'_log.txt') 49 | for id_folder, folder in enumerate(process_folder): 50 | 51 | prefix ='data/works/' 52 | prefix = prefix+folder+'/' 53 | print(f"Process {prefix}, step%: {id_folder/len(process_folder)} \n") 54 | 55 | with open(log_folders, 'a') as log_file: 56 | log_file.write(f"Process {prefix}, progress: {id_folder/len(process_folder)} \n") 57 | 58 | for id_page, page in enumerate(paginator.paginate(Bucket=bucket_name, Prefix=prefix)): 59 | 60 | for id_obj, obj in enumerate(page['Contents']): 61 | 62 | if obj['Key'].split('/')[-1] == 'manifest': 63 | continue # Skip the manifest file 64 | 65 | log_filename = os.path.join(logs_path, obj['Key'].split('/')[-2]+'_'+obj['Key'].split('/')[-1]+'_log.txt') 66 | with open(log_filename, 'a') as log_file: 67 | log_file.write(f"Page {id_page}, object {id_obj}; obj['Key']: {obj['Key']}\n") 68 | 69 | # Download and process the gzip-compressed JSON Lines file 70 | s3_object = s3.get_object(Bucket=bucket_name, Key=obj['Key']) 71 | 72 | with gzip.GzipFile(fileobj=s3_object['Body'], mode='r') as gz_file: 73 | with jsonlines.Reader(gz_file) as reader: 74 | filtered_objects = [] 75 | for id_json, json_obj in enumerate(reader): 76 | filtered_obj, journal_paper, journal_paper_with_abstract = filter_json_objects(json_obj, journal_paper, journal_paper_with_abstract) 77 | 78 | if filtered_obj is not None: 79 | filtered_objects.append(filtered_obj) 80 | 81 | if id_json % 5000==0: 82 | with open(log_filename, 'a') as log_file: 83 | log_file.write(f"\n Processed {id_json} objects") 84 | 85 | # Prepare the local folder structure 86 | local_path_parts = obj['Key'].split('/') 87 | local_filtered_folder = os.path.join(local_base_folder, *local_path_parts[:-1]) 88 | os.makedirs(local_filtered_folder, exist_ok=True) 89 | 90 | # Store the filtered objects in a new gzip-compressed JSON Lines file on the local computer 91 | filtered_file_name = f"filtered_{local_path_parts[-1]}" 92 | filtered_file_path = os.path.join(local_filtered_folder, filtered_file_name) 93 | with gzip.open(filtered_file_path, 'wt') as f: 94 | for item in filtered_objects: 95 | f.write(json.dumps(item) + '\n') 96 | with open(log_filename, 'a') as log_file: 97 | log_file.write(f"Finish writing {filtered_file_path}; until now, journal_paper: {journal_paper}; journal_paper_with_abstract: {journal_paper_with_abstract}\n") 98 | #print(f"Finish writing {obj['Key']}: {filtered_file_path} \n") 99 | 100 | with open(log_folders, 'a') as log_file: 101 | log_file.write(f"Finish {prefix}, progress: {id_folder/len(process_folder)} \nuntil now, journal_paper: {journal_paper}; journal_paper_with_abstract: {journal_paper_with_abstract}\n") 102 | 103 | print(f"Finish writing all \n") -------------------------------------------------------------------------------- /create_dynamic_edges/get_concept_pairs.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import gzip 3 | import json 4 | import os 5 | import time 6 | from datetime import datetime, date 7 | import pickle 8 | from functools import reduce 9 | import random 10 | import re 11 | 12 | def get_single_article_string(article): 13 | 14 | curr_title=article['title'] 15 | abstract_inverted_index = article['abstract_inverted_index'] 16 | 17 | # Flatten the inverted index into a list of (position, word) tuples 18 | position_word_list = [(position, word) for word, positions in abstract_inverted_index.items() for position in positions] 19 | 20 | # Sort the list by 
position and extract the words 21 | sorted_abstract = sorted(position_word_list) 22 | curr_abstract = ' '.join(word for position, word in sorted_abstract) 23 | 24 | # Replace strings according to the replace_pairs list 25 | replace_pairs=[['\n',' '],['-',' '],[' \" a','oa'],['\" a','ae'],['\"a','ae'],[' \" o','oe'],['\" o','oe'],['\"o','oe'],[' \" u','ue'],['\" u','ue'],['\"u','ue'],[' \' a','a'],[' \' e','e'],[' \' o','o'],["\' ", ""],["\'", ""],[' ',' '],[' ',' ']] 26 | 27 | article_string=(curr_title +' '+ curr_abstract).lower() 28 | article_string = reduce(lambda text, pair: text.replace(pair[0], pair[1]), replace_pairs, article_string) 29 | 30 | return article_string 31 | 32 | # Define a sorting key function to extract the date and part number from the path 33 | def get_date_and_part_from_path(path): 34 | date_folder = os.path.dirname(path) 35 | date_str = date_folder.split('=')[-1] 36 | 37 | file_name = os.path.basename(path) 38 | part_str = file_name.split('_')[-1].split('.')[0] 39 | 40 | return date_str, int(part_str) 41 | 42 | 43 | def extract_id(filename): 44 | match = re.search(r'log_edge_part_(\d+)_', filename) 45 | if match: 46 | return int(match.group(1)) 47 | return None 48 | 49 | 50 | # define a log foler 51 | log_folder = 'logs_pair' 52 | # define edge_list foler 53 | edge_folder = 'concept_pair' 54 | edge_folder_log = 'concept_pair_log' 55 | 56 | try: 57 | if not os.path.exists(log_folder): 58 | os.makedirs(log_folder) 59 | 60 | if not os.path.exists(edge_folder): 61 | os.makedirs(edge_folder) 62 | 63 | if not os.path.exists(edge_folder_log): 64 | os.makedirs(edge_folder_log) 65 | 66 | except FileExistsError: 67 | pass 68 | 69 | data_folder="data_concept_graph" # the folder that contain the concept file 70 | cwd = os.getcwd() 71 | parent_dir = os.path.dirname(cwd) 72 | concept_folder = os.path.join(parent_dir, data_folder) 73 | 74 | 75 | # Define the base folder, date pattern and file pattern 76 | #project_path="/u/xmgu/projects/semnet_openalex" # change to your path 77 | #base_folder=os.path.join(project_path,'openalex_workdata_filtered/data/works/') 78 | 79 | base_folder = 'openalex_workdata_filtered/data/works/' 80 | date_pattern = 'updated_date=*' 81 | file_pattern = 'filtered_part_*.gz' 82 | 83 | # Find all the files matching the pattern 84 | file_paths = glob.glob(f'{base_folder}/{date_pattern}/{file_pattern}') 85 | # Sort the file_paths list in ascending order based on the date and part number 86 | file_paths = sorted(file_paths, key=get_date_and_part_from_path) 87 | 88 | # Define the date range or specific folders to include 89 | start_date = datetime.strptime("2022-12-20", "%Y-%m-%d") 90 | end_date = datetime.strptime("2023-03-28", "%Y-%m-%d") 91 | 92 | # Filter the file_paths list based on the date range or specific folders 93 | curr_run_file_paths = [path for path in file_paths if start_date <= datetime.strptime(get_date_and_part_from_path(path)[0], "%Y-%m-%d") <= end_date] 94 | 95 | # Read all concepts from full_final_concepts/full_domain_concept.txt 96 | concepts_files = os.path.join(concept_folder, 'full_domain_concept.txt') 97 | with open(concepts_files, 'r') as file: 98 | full_concepts = [concept.strip() for concept in file.readlines()] 99 | 100 | # Define a list to store the edge lists 101 | paper_starting_date = date(1990,1,1) 102 | write_file=0 103 | 104 | rnd_time=random.random()*60 105 | time.sleep(rnd_time) 106 | 107 | while write_file <=len(curr_run_file_paths): # curr_run_file_paths 108 | 109 | curr_ID = random.randint(0, 
len(curr_run_file_paths)-1) # get a random number between 0 and the number of files 110 | 111 | formatted_ID = '{:03d}'.format(curr_ID) 112 | 113 | edge_file=os.path.join(edge_folder, 'edge_part_'+formatted_ID+'.gz') 114 | edge_file_log=os.path.join(edge_folder_log, 'edge_part_'+formatted_ID+'.txt') 115 | log_file_txt=os.path.join(log_folder, 'log_edge_part_'+formatted_ID+'.txt') 116 | 117 | log_file_txt_finish=os.path.join(log_folder, 'log_edge_part_'+formatted_ID+'_finish.txt') 118 | log_file_txt_empty=os.path.join(log_folder, 'log_edge_part_'+formatted_ID+'_empty.txt') 119 | 120 | if not os.path.exists(log_file_txt): 121 | current_time=datetime.now() 122 | open(log_file_txt, 'a').close() 123 | 124 | file_path=curr_run_file_paths[curr_ID] 125 | with open(log_file_txt, 'a') as log_file: 126 | log_file.write(f'Current time: {current_time}; Number of files: {len(curr_run_file_paths)}; Number of concepts: {len(full_concepts)}\n\n') 127 | log_file.write(f'Start the File: {file_path}; Current time: {datetime.now()} \n\n') 128 | 129 | with gzip.open(file_path, 'rt') as file: 130 | lines = file.readlines() 131 | 132 | if not lines: # if lines is not empty 133 | print(f'File {file_path} is empty') 134 | write_file+=1 135 | with open(log_file_txt_empty, 'a') as log_file: 136 | log_file.write(f'Current File: {file_path}; Paper: {len(lines)}; File is Empty!\n') 137 | 138 | else: 139 | edge_lists=[] 140 | for id_line, line in enumerate(lines): 141 | time_start_line=time.time() 142 | 143 | article_object = json.loads(line) # Load the JSON object 144 | get_date = datetime.strptime(article_object['publication_date'], "%Y-%m-%d").date() 145 | curr_paper_time = (get_date - paper_starting_date).days 146 | curr_all_citations=article_object['cited_by_count'] 147 | curr_citations_per_year=article_object['counts_by_year'] 148 | curr_article=get_single_article_string(article_object) 149 | 150 | # Check if the article contains any of the concepts 151 | concepts_for_single_paper=[] 152 | for id_concept, concept in enumerate(full_concepts): 153 | if concept in curr_article: # if the paper contains the concept; then store its concept index 154 | concepts_for_single_paper.append(id_concept) 155 | 156 | for ii in range(len(concepts_for_single_paper)): 157 | for jj in range(ii+1,len(concepts_for_single_paper)): 158 | edge_lists.append([concepts_for_single_paper[ii],concepts_for_single_paper[jj],curr_paper_time,curr_all_citations,curr_citations_per_year]) 159 | 160 | 161 | if id_line % 10000 == 0: 162 | with open(log_file_txt, 'a') as log_file: 163 | log_file.write(f'Current File: {file_path}; Paper: {len(lines)}; Processed: {(id_line+1)/len(lines)}; time: {time.time()-time_start_line}\n') 164 | 165 | # Finish the current file, then store edge_lists to a pickle file 166 | with gzip.open(edge_file, 'wb') as output_file: 167 | pickle.dump(edge_lists, output_file) 168 | write_file+=1 169 | 170 | with open(edge_file_log, 'a') as log_file: 171 | log_file.write(f'\nedge_list={len(edge_lists)}') 172 | 173 | with open(log_file_txt, 'a') as log_file: 174 | log_file.write(f'\n\nFinish Time: {datetime.now()}; Current File: {file_path}; Processed: {write_file}/{len(curr_run_file_paths)}, i.e., {write_file/len(curr_run_file_paths)} \n') 175 | log_file.write(f'\nedge_list: {len(edge_lists)}\n') 176 | 177 | with open(log_file_txt_finish, 'a') as log_file: 178 | log_file.write(f'\n\nFinish Time: {datetime.now()}; Current File: {file_path} \n') 179 | 180 | rnd_time=random.random()*5 181 | time.sleep(rnd_time) 182 | 183 | else: 184 | 
finish_pattern = os.path.join(log_folder, 'log_edge_part_*_finish.txt') 185 | empty_pattern = os.path.join(log_folder, 'log_edge_part_*_empty.txt') 186 | finished_files = [f for f in glob.glob(finish_pattern) if extract_id(f) in range(0, len(curr_run_file_paths))] 187 | empty_files = [f for f in glob.glob(empty_pattern) if extract_id(f) in range(0, len(curr_run_file_paths))] 188 | 189 | # Count files that match each pattern 190 | total_files = len(finished_files) + len(empty_files) 191 | 192 | # Check if the total count is larger than 391 193 | if total_files >= len(curr_run_file_paths): 194 | print(f"{datetime.now()}:Finish run!") 195 | break 196 | 197 | 198 | 199 | with open("job_finish.txt", 'a') as f: 200 | f.write(f'\nFinish all: {datetime.now()}\n') 201 | 202 | 203 | -------------------------------------------------------------------------------- /create_dynamic_edges/merge_concept_pairs.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import gzip 3 | import json 4 | import os 5 | import time 6 | from datetime import datetime, date 7 | import pickle 8 | from functools import reduce 9 | import random 10 | 11 | 12 | log_folder = 'logs' 13 | if not os.path.exists(log_folder): 14 | os.makedirs(log_folder) 15 | log_files='log_merge_concept_pairs.txt' 16 | 17 | # define edge_list foler 18 | edge_list_folder = 'concept_pair' 19 | if not os.path.exists(edge_list_folder): 20 | os.makedirs(edge_list_folder) 21 | 22 | list_file_names = os.listdir(edge_list_folder) # List all files in the directory 23 | edge_file_name_unsorted = [file for file in list_file_names if file.endswith('.gz')] 24 | edge_lists_files = sorted(edge_file_name_unsorted) # Sort the file list 25 | 26 | full_edge_lists = os.path.join(edge_list_folder,'all_concept_pairs.gz') # edges 27 | 28 | 29 | with open(os.path.join(log_folder, log_files), 'a') as f: 30 | f.write(f'\nStart: {datetime.now()}\n') 31 | 32 | 33 | full_edges=[] 34 | empty_count=0 35 | for id_file, curr_edge_files in enumerate(edge_lists_files): 36 | 37 | with gzip.open(os.path.join(edge_list_folder, curr_edge_files), 'rb') as f: # load the edge list 38 | edge_data_list = pickle.load(f) 39 | 40 | if edge_data_list!=[]: # skip empty files 41 | full_edges.extend(edge_data_list) 42 | else: 43 | empty_count+=1 44 | print(f'Empty file: {curr_edge_files}') 45 | 46 | # write to log file 47 | with open(os.path.join(log_folder, log_files), 'a') as f: 48 | f.write(f'Finish file: {curr_edge_files}; Edges: {len(full_edges)}; Processed: {(id_file+1)/len(edge_lists_files)}; empty Num: {empty_count}\n') 49 | 50 | # store the edge list in a gz file 51 | with gzip.open(full_edge_lists, 'wb') as f: 52 | pickle.dump(full_edges, f) 53 | 54 | with open(os.path.join(log_folder, log_files), 'a') as f: 55 | f.write(f'\nFinish: {datetime.now()}\n') 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /create_dynamic_edges/process_edge_to_pandas_frame.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pickle 4 | import gzip 5 | from datetime import datetime, date 6 | import numpy as np 7 | import pandas as pd 8 | import time 9 | import copy 10 | 11 | log_folder = 'logs' # log folder 12 | if not os.path.exists(log_folder): 13 | os.makedirs(log_folder) 14 | 15 | data_folder="concept_pair" 16 | data_file=os.path.join(data_folder,'all_concept_pairs.gz') 17 | 18 | store_folder="data_concept_graph" 19 | cwd = 
os.getcwd() 20 | parent_dir = os.path.dirname(cwd) # get parent directory 21 | new_dir_path = os.path.join(parent_dir, store_folder) 22 | os.makedirs(new_dir_path, exist_ok=True) 23 | 24 | store_data_file = os.path.join(new_dir_path, "full_dynamic_graph.parquet") 25 | 26 | 27 | 28 | logsfile=os.path.join(log_folder,"logs_process_pairs.txt") 29 | starting_time=time.time() 30 | print(f'{datetime.now()}: read full graph') 31 | with open(logsfile+'.txt', "a") as myfile: 32 | myfile.write(f'\n{datetime.now()}: read full graph') 33 | 34 | with gzip.open(data_file, 'rb') as f: # load the edge list 35 | full_dynamic_edges = pickle.load(f) 36 | 37 | with open(logsfile+'.txt', "a") as myfile: 38 | myfile.write(f"\n{datetime.now()}: Done, Total: {len(full_dynamic_edges)}; Elapsed time: {time.time() - starting_time} seconds\n") 39 | 40 | 41 | # process the edge list to make each element with the same size 42 | ## [concept1, concept2, paper_time, total_citation, citation_per_year] 43 | ## e.g., [7, 414, 10378, 1, [{'year': 2022, 'cited_by_count': 1}]] becomes [7, 414, 10378, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 44 | 45 | starting_time = time.time() 46 | full_dynamic_edges_copy = copy.deepcopy(full_dynamic_edges) 47 | for i, item in enumerate(full_dynamic_edges): 48 | years_data = {year_data['year']: year_data['cited_by_count'] for year_data in item[4]} 49 | new_list = [years_data.get(year, 0) for year in range(2023, 2011, -1)] ## as cited_by_count only contains the last 10 years 50 | full_dynamic_edges_copy[i] = item[:4] + new_list 51 | 52 | if i % 200000 == 0: 53 | with open(logsfile+'.txt', "a") as myfile: 54 | myfile.write(f"\nProcessing item {i+1}/{len(full_dynamic_edges_copy)}") 55 | 56 | 57 | time_start = time.time() 58 | full_graph=np.array(full_dynamic_edges_copy) 59 | with open(logsfile+'.txt', "a") as myfile: 60 | myfile.write(f"\nDone, convert array; Elapsed time: {time.time() - time_start} seconds") 61 | 62 | 63 | time_start = time.time() 64 | full_graph_df = pd.DataFrame(full_graph, columns=['v1', 'v2', 'time', 'ct', 'c2023', 'c2022', 'c2021', 'c2020', 'c2019', 'c2018', 'c2017', 'c2016', 'c2015', 'c2014', 'c2013', 'c2012']) 65 | 66 | full_graph_df.to_parquet(store_data_file, compression='gzip') 67 | 68 | with open(logsfile+'.txt', "a") as myfile: 69 | myfile.write(f"\n{datetime.now()}: Done, full_graph: {len(full_graph_df)}; Elapsed time: {time.time() - time_start} seconds") 70 | 71 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /fpr_example/plot_FPR.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.metrics import roc_curve, auc 4 | 5 | # File names and corresponding imbalance ratios (IR) 6 | files = ["solution_output_10.npy", "solution_output_50.npy", "solution_output_100.npy"] 7 | labels = ["IR=10", "IR=50", "IR=100"] 8 | 9 | # Create a figure with two subplots side by side 10 | fig, axes = plt.subplots(1, 2, figsize=(28, 12)) # Increase the figure width 11 | fig.subplots_adjust(wspace=2) # Add whitespace between subplots 12 | 13 | # Plot for single IR=10 (Left plot) 14 | data = np.load("solution_output_10.npy") 15 | y_true = data[:, 0] 16 | y_scores = data[:, 1] 17 | 18 | fpr, tpr, thresholds = roc_curve(y_true, y_scores) 19 | roc_auc = auc(fpr, tpr) 20 | 21 | axes[0].plot(fpr, tpr, label=f'IR=10 (AUC = {roc_auc:.2f})', color='blue') 22 | 23 | for fpr_value in [0.1, 0.3]: 24 | idx = np.argmin(np.abs(fpr - fpr_value)) 25 | 
tpr_value = tpr[idx] 26 | print(f'For FPR={fpr_value}, TPR={tpr_value:.2f}') 27 | axes[0].plot([fpr_value, fpr_value], [0, tpr_value], linestyle='dotted', color='black', linewidth=3.5) # Vertical line 28 | axes[0].plot([0, fpr_value], [tpr_value, tpr_value], linestyle='dotted', color='black', linewidth=3.5) # Horizontal line 29 | axes[0].text(fpr_value + 0.02, tpr_value - 0.05, f'(FPR={fpr_value}, TPR={tpr_value:.2f})', 30 | fontsize=36, color='black') # Increased font size by 10 31 | 32 | axes[0].set_xlabel('False Positive Rate', fontsize=40) # Increased font size by 10 33 | axes[0].set_ylabel('True Positive Rate', fontsize=40) # Increased font size by 10 34 | axes[0].set_title('ROC Curve with Thresholds (IR=10)', fontsize=44) # Increased font size by 10 35 | axes[0].tick_params(axis='both', which='major', labelsize=36) # Increased font size by 10 36 | axes[0].grid() 37 | axes[0].legend(loc='lower right', fontsize=36) # Increased font size by 10 38 | 39 | # Plot for all IRs (Right plot) 40 | for file, label in zip(files, labels): 41 | data = np.load(file) 42 | y_true = data[:, 0] 43 | y_scores = data[:, 1] 44 | 45 | fpr, tpr, _ = roc_curve(y_true, y_scores) 46 | roc_auc = auc(fpr, tpr) 47 | axes[1].plot(fpr, tpr, label=f'{label} (AUC = {roc_auc:.2f})', linewidth=3) # Increase line width 48 | 49 | 50 | axes[1].plot([0, 1], [0, 1], color='gray', linestyle='--', label='Random') 51 | axes[1].set_xlabel('False Positive Rate', fontsize=40) # Increased font size by 10 52 | axes[1].set_ylabel('True Positive Rate', fontsize=40) # Increased font size by 10 53 | axes[1].set_title('ROC Curve for IR=10, 50, 100', fontsize=44) # Increased font size by 10 54 | axes[1].tick_params(axis='both', which='major', labelsize=36) # Increased font size by 10 55 | axes[1].grid() 56 | axes[1].legend(loc='lower right', fontsize=36) # Increased font size by 10 57 | 58 | # Adjust layout and save the combined plot 59 | plt.tight_layout() 60 | plt.savefig('roc_curve_combined_highres.png', dpi=300) 61 | plt.show() 62 | -------------------------------------------------------------------------------- /fpr_example/roc_curve_combined_highres.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/artificial-scientist-lab/Impact4Cast/0c1dc9fb31ae92a21b8e38c7356a170d22db3360/fpr_example/roc_curve_combined_highres.png -------------------------------------------------------------------------------- /general_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import random, time 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | import networkx as nx 9 | import pandas as pd 10 | from datetime import datetime, date 11 | from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report 12 | from sklearn.metrics import precision_recall_curve, average_precision_score, confusion_matrix 13 | 14 | 15 | 16 | def flatten(t): 17 | return [item for sublist in t for item in sublist] 18 | 19 | 20 | def format_IR(IR_num, split_type): 21 | """ 22 | make a string, which can be used when storing trained neural network, results, log files, etc. 
23 | """ 24 | if isinstance(IR_num[0], list): # Check if the first element is a list 25 | inner = ''.join(str(num) for num in IR_num[0]) 26 | outer = '{:02d}'.format(IR_num[1]) 27 | return f'T{split_type}_IR_{inner}_{outer}' 28 | else: 29 | return f'T{split_type}_IR_' + '_'.join('{:03d}'.format(num) for num in IR_num) 30 | 31 | 32 | 33 | def make_folders(year_start, split_type, num_class, addition_str): 34 | """ 35 | create folders and subfolders 36 | year_start is the train start year, e.g., 2016 for predicting 2019, the year_start is 2016 37 | split_type is used for setting whether train conditionally or not 38 | note: num_class is always setting to 2, due to binary classfication 39 | As an example: year_start=2016, split_type=0; num_class=2; addition_str='train': 40 | folder: 2016_train, 41 | subfolders: t0_c2_log, t0_c2_net, t0_c2_loss, t0_c2_curve, t0_c2_result 42 | """ 43 | parent_folder = str(year_start)+"_"+ addition_str 44 | if not os.path.exists(parent_folder): 45 | os.mkdir(parent_folder) 46 | 47 | log_folder = os.path.join(parent_folder, f"t{split_type}_c{num_class}_log") 48 | net_folder = os.path.join(parent_folder, f"t{split_type}_c{num_class}_net") 49 | train_folder = os.path.join(parent_folder, f"t{split_type}_c{num_class}_loss") 50 | figure_folder = os.path.join(parent_folder, f"t{split_type}_c{num_class}_curve") 51 | result_folder = os.path.join(parent_folder, f"t{split_type}_c{num_class}_result") 52 | 53 | try: 54 | if not os.path.exists(log_folder): 55 | os.mkdir(log_folder) 56 | 57 | if not os.path.exists(net_folder): 58 | os.mkdir(net_folder) 59 | 60 | if not os.path.exists(train_folder): 61 | os.mkdir(train_folder) 62 | 63 | if not os.path.exists(figure_folder): 64 | os.makedirs(figure_folder) 65 | 66 | if not os.path.exists(result_folder): 67 | os.makedirs(result_folder) 68 | 69 | except FileExistsError: 70 | pass 71 | 72 | save_folders = [net_folder, train_folder, figure_folder, result_folder] 73 | return save_folders, log_folder 74 | 75 | 76 | ######### Plots ############### 77 | def calculate_plot_ROC(true_labels, nn_outputs, user_parameter, figure_name, save_figure_folder): 78 | """ 79 | Plot the ROC curve for binary classification. 80 | 81 | Parameters: 82 | - true_labels: Ground truth binary labels. 83 | - nn_outputs: Raw outputs from the neural network. 
84 |     - user_parameter: some user parameters which are num_class, IR_num, split_type, out_norm; not used here and could be removed
85 |     - figure_name: file name of the stored figure
86 |     - save_figure_folder: the folder to store the figure, usually the t0_c2_curve folder created by make_folders()
87 | 
88 |     return:
89 |     auc_score_number: the AUC value
90 |     """
91 |     num_class, IR_num, split_type, out_norm = user_parameter
92 |     figure_path=os.path.join(save_figure_folder, figure_name)
93 |     # Compute the ROC curve
94 |     fpr, tpr, _ = roc_curve(true_labels, nn_outputs)
95 |     roc_auc = auc(fpr, tpr)
96 | 
97 |     auc_score_number = roc_auc_score(true_labels, nn_outputs)
98 | 
99 |     # Plot the ROC curve
100 |     plt.figure()
101 |     lw = 1.5 # Line width
102 |     plt.plot(fpr, tpr, color='blue', lw=lw, label='ROC curve (AUC = %0.4f)' % roc_auc)
103 |     plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='--', label='baseline')
104 |     plt.xlim([0.0, 1.0])
105 |     plt.ylim([0.0, 1.05])
106 |     plt.xlabel('False Positive Rate')
107 |     plt.ylabel('True Positive Rate')
108 |     plt.title('Receiver Operating Characteristic (ROC)')
109 |     plt.legend(loc="lower right")
110 |     plt.savefig(figure_path,dpi=600)
111 |     plt.show()
112 |     plt.close()
113 | 
114 |     return auc_score_number
115 | 
--------------------------------------------------------------------------------
/miscellaneous/Fig2_NeuralNet.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/artificial-scientist-lab/Impact4Cast/0c1dc9fb31ae92a21b8e38c7356a170d22db3360/miscellaneous/Fig2_NeuralNet.png
--------------------------------------------------------------------------------
/miscellaneous/Impact4Cast.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/artificial-scientist-lab/Impact4Cast/0c1dc9fb31ae92a21b8e38c7356a170d22db3360/miscellaneous/Impact4Cast.png
--------------------------------------------------------------------------------
/miscellaneous/KnowledgeGraph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/artificial-scientist-lab/Impact4Cast/0c1dc9fb31ae92a21b8e38c7356a170d22db3360/miscellaneous/KnowledgeGraph.png
--------------------------------------------------------------------------------
/prepare_adjacency_pagerank.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pickle
3 | import gzip
4 | import copy
5 | import random, time
6 | import numpy as np
7 | import matplotlib.pyplot as plt
8 | from scipy import sparse
9 | from scipy.stats import rankdata
10 | import networkx as nx
11 | import pandas as pd
12 | from collections import defaultdict,Counter
13 | from datetime import datetime, date
14 | from itertools import combinations
15 | from features_utils import get_adjacency_matrix, get_pagerank_score
16 | 
17 | NUM_OF_VERTICES=37960 ## number of vertices in the graph
18 | 
19 | time_start = time.time()
20 | data_folder="data_concept_graph" # the folder which stores the full dynamic knowledge graph
21 | 
22 | # Read all concept pairs (edges) together with time and citation information
23 | graph_file=os.path.join(data_folder,"full_dynamic_graph.parquet")
24 | full_edge_dynamic_data = pd.read_parquet(graph_file)
25 | 
26 | print(f"Done, elapsed_time: {time.time() - time_start}\n full_edge_dynamic_data: {len(full_edge_dynamic_data)};\n")
27 | 
28 | log_files="log_adjacent_pagerank.txt" # just for logging the running
situation 29 | 30 | data_folder="data_for_features" # folder to store the generated adjacency_matrix files and pagerank files for different years 31 | years=[2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022] 32 | 33 | start_time1=time.time() 34 | for yy in years: 35 | 36 | print(f"{datetime.now()}: start adjacency_matrix") 37 | with open(log_files, "a") as myfile: 38 | myfile.write(f"\n{datetime.now()}: start adjacency_matrix") 39 | 40 | data_file=os.path.join(data_folder, f"adjacency_matrix_{yy}.gz") 41 | adjacency_matrix_sparse=get_adjacency_matrix(full_edge_dynamic_data, yy, data_file) 42 | print(f"{datetime.now()}: finish adjacency_matrix") 43 | with open(log_files, "a") as myfile: 44 | myfile.write(f"\n{datetime.now()}: finish adjacency_matrix") 45 | 46 | data_file=os.path.join(data_folder,f"pagerank_score_{yy}.gz") 47 | pagerank_score=get_pagerank_score(adjacency_matrix_sparse, data_file) 48 | print(f"{datetime.now()}: finish pagerank_score") 49 | print(f"done, year {yy}: {time.time() - start_time1}s") 50 | with open(log_files, "a") as myfile: 51 | myfile.write(f"\n{datetime.now()}: done, year {yy}: {time.time() - start_time1}s") 52 | start_time1=time.time() 53 | 54 | -------------------------------------------------------------------------------- /prepare_eval_data/prepare_eval_feature_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "e965c883-bc23-437a-822a-6693275a5d54", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "import pickle\n", 12 | "import gzip\n", 13 | "import copy\n", 14 | "import torch\n", 15 | "from torch import nn\n", 16 | "import torch.nn.functional as F\n", 17 | "import random, time\n", 18 | "import numpy as np\n", 19 | "import matplotlib.pyplot as plt\n", 20 | "from scipy import sparse\n", 21 | "from scipy.stats import rankdata\n", 22 | "import networkx as nx\n", 23 | "import pandas as pd\n", 24 | "from collections import defaultdict,Counter\n", 25 | "from datetime import datetime, date\n", 26 | "from itertools import combinations\n", 27 | "from preprocess_utils import *\n", 28 | "from features_utils import *\n", 29 | "from train_model_utils import *\n", 30 | " " 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "id": "c44d8573-7c1d-4114-b5b2-ea2fa1bb5c34", 36 | "metadata": {}, 37 | "source": [ 38 | "## read pairs and solutions data (both)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "873dc1df-b27e-4b96-9a5d-40c151b0b256", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "store_folder=\"data_pair_solution\"\n", 49 | "pair_solution_data1=os.path.join(store_folder,\"unconnected_2019_pair_solution_connected_2022.parquet\")\n", 50 | "pair_solution_data2=os.path.join(store_folder,\"unconnected_2019_pair_solution_unconnected_2022.parquet\")\n", 51 | "\n", 52 | "time_start = time.time()\n", 53 | "eval_pair_solution1 = pd.read_parquet(pair_solution_data1)\n", 54 | "eval_pair_solution1=eval_pair_solution1[['v1','v2','citation']]\n", 55 | "print(f\"Done, read pair_solution_yes: {len(eval_pair_solution1)}; elapsed_time: {time.time() - time_start}\")\n", 56 | "\n", 57 | "time_start = time.time()\n", 58 | "eval_pair_solution2 = pd.read_parquet(pair_solution_data2)\n", 59 | "print(f\"Done, read pair_solution_not: {len(eval_pair_solution2)}; elapsed_time: {time.time() - time_start}\")\n", 60 | "\n", 61 | "time_start = time.time()\n", 
62 | "full_eval_pair_result = pd.concat([eval_pair_solution1, eval_pair_solution2])\n", 63 | "print(f\"Done, combine all: {len(full_eval_pair_result)}; elapsed_time: {time.time() - time_start}\")" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "id": "8a342e3a-2e78-4ea9-96fe-511a347ad92f", 69 | "metadata": {}, 70 | "source": [ 71 | "#### fix random seed" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "id": "627dbfab-4a88-4419-b868-92d285305456", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "day_origin = date(1990,1,1)\n", 82 | "vertex_degree_cutoff=1\n", 83 | "years_delta=3\n", 84 | "min_edges=1\n", 85 | "year_start=2022-years_delta\n", 86 | "\n", 87 | "rnd_seed=42\n", 88 | "random.seed(rnd_seed)\n", 89 | "torch.manual_seed(rnd_seed)\n", 90 | "np.random.seed(rnd_seed)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "id": "7023bfd5-d921-4d7e-8124-e5ce26fe6d9e", 96 | "metadata": {}, 97 | "source": [ 98 | "### randomly 10M " 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "id": "9dfcdb04-357e-4b9d-8a38-17b104ed2b71", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "edges_used=10**7\n", 109 | "num_row = int(min(edges_used, len(full_eval_pair_result)))\n", 110 | "\n", 111 | "time_start = time.time()\n", 112 | "shuffled = full_eval_pair_result.sample(frac=1, random_state=rnd_seed)\n", 113 | "eval_data_pair_solution = shuffled.head(num_row)\n", 114 | "\n", 115 | "print(f\"Done, eval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}\")" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "id": "99cfee20-1576-428c-b6bd-beee3ad65ce2", 121 | "metadata": {}, 122 | "source": [ 123 | "## store unconnected pairs and citation, time information" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "id": "834eec4d-4554-4e15-839b-f6d6f25adab4", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "\n", 134 | "store_eval_folder=\"data_eval\"\n", 135 | "if not os.path.exists(store_eval_folder):\n", 136 | " os.makedirs(store_eval_folder)\n", 137 | "print(f\"store files in {store_eval_folder}.....\")\n", 138 | "\n", 139 | "time_start = time.time()\n", 140 | "store_name=os.path.join(store_eval_folder,\"data_eval_pair_solution.parquet\")\n", 141 | "\n", 142 | "eval_data_pair_solution.to_parquet(store_name, compression='gzip')\n", 143 | "print(f\"eval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}\")\n" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "id": "db9e9baa-cfb2-4274-af0f-b93e4b2346d3", 149 | "metadata": { 150 | "tags": [] 151 | }, 152 | "source": [ 153 | "#### prepare properties" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "1925bd7e-cf68-4f03-90c2-7170d3b36057", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "time_start = time.time()\n", 164 | "data_folder=\"data_concept_graph\"\n", 165 | "graph_file=os.path.join(data_folder,\"full_dynamic_graph.parquet\")\n", 166 | "full_dynamic_graph = pd.read_parquet(graph_file)\n", 167 | "print(f\"{datetime.now()}: Done, read full_dynamic_graph: {len(full_dynamic_graph)}; elapsed_time: {time.time() - time_start}\")" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "id": "81b249b8-ed05-43ff-b84a-b25e850a5e72", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 
177 | "day_origin = date(1990,1,1)\n", 178 | "vertex_degree_cutoff=1\n", 179 | "years_delta=3\n", 180 | "min_edges=1\n", 181 | "year_start=2022-years_delta" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "id": "f18e6f1e-b03e-41d2-8f14-c777b937d0ae", 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "start_time=time.time()\n", 192 | "adj_mat_sparse=[]\n", 193 | "node_neighbor_list=[]\n", 194 | "num_neighbor_list=[]\n", 195 | "for yy in [year_start,year_start-1,year_start-2]:\n", 196 | " data_file=os.path.join(\"data_for_features\", f\"adjacency_matrix_{yy}.gz\")\n", 197 | " adj_mat=get_adjacency_matrix(full_dynamic_graph, yy, data_file)\n", 198 | " adj_mat_sparse.append(adj_mat)\n", 199 | " \n", 200 | " curr_node_neighbor=get_node_neighbor(adj_mat)\n", 201 | " node_neighbor_list.append(curr_node_neighbor)\n", 202 | " \n", 203 | " curr_num_neighbor = np.array(adj_mat.sum(axis=0)).flatten() # array \n", 204 | " num_neighbor_list.append(curr_num_neighbor)\n", 205 | " \n", 206 | "print(f\"{datetime.now()}: Done, adjacency_matrix_sparse; elapsed_time: {time.time() - start_time}\")" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "id": "f2aa8b6c-b523-4663-a562-78df50522415", 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "start_time=time.time()\n", 217 | "vertex_features=get_all_node_feature(adj_mat_sparse, year_start, \"data_for_features\")\n", 218 | "print(f\"{datetime.now()}: Done, vertex_features; elapsed_time: {time.time() - start_time}\")\n" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "id": "a3a909e3-8949-4d13-9c0d-1fa282d4ab45", 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "start_time=time.time()\n", 229 | "vc_feature_list=[]\n", 230 | "for yy in [year_start,year_start-1,year_start-2]:\n", 231 | " data_file=os.path.join(\"data_for_features\", f\"concept_node_citation_data_{yy}.parquet\")\n", 232 | " vc_df=pd.read_parquet(data_file)\n", 233 | " vc_feature=vc_df.values\n", 234 | " vc_feature_list.append(vc_feature)\n", 235 | " \n", 236 | "vertex_cfeatures=get_all_node_cfeature(vc_feature_list)\n", 237 | "print(f\"{datetime.now()}: Done, vertex_cfeatures; elapsed_time: {time.time() - start_time}\") " 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "id": "bb0eb093-f9a0-441d-aa47-9416ae12cc60", 244 | "metadata": { 245 | "tags": [] 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "\n", 250 | "logs_file_name='logs_eval_data_infos'\n", 251 | "time_start = time.time()\n", 252 | "eval_pair_solution=eval_data_pair_solution.values\n", 253 | "unconnected_vertex_pairs=eval_pair_solution[:,:2]\n", 254 | " \n", 255 | "pair_features, pair_cfeatures=get_all_pair_features(vc_feature_list, node_neighbor_list, num_neighbor_list, unconnected_vertex_pairs, logs_file_name)\n", 256 | "\n", 257 | "all_features=[vertex_features, vertex_cfeatures, pair_features, pair_cfeatures]\n", 258 | "\n", 259 | "eval_data_features=get_all_feature(all_features, unconnected_vertex_pairs, logs_file_name)\n", 260 | "\n", 261 | "print(f\"finish; {len(eval_data_features)}; time: {time.time()-time_start}\")" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "id": "c2506f57-621e-48cd-bdf0-ad942cd862f0", 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "time_start = time.time()\n", 272 | "\n", 273 | 
"store_name=os.path.join(store_eval_folder,\"eval_data_pair_feature.parquet\")\n", 274 | "data_eval_2022 = pd.DataFrame(eval_data_features)\n", 275 | "data_eval_2022.to_parquet(store_name, compression='gzip') \n", 276 | "\n", 277 | "print(f\"data_eval_2022: {len(data_eval_2022)}; elapsed_time: {time.time() - time_start}\")" 278 | ] 279 | } 280 | ], 281 | "metadata": { 282 | "kernelspec": { 283 | "display_name": "asl_semnet", 284 | "language": "python", 285 | "name": "asl_semnet" 286 | }, 287 | "language_info": { 288 | "codemirror_mode": { 289 | "name": "ipython", 290 | "version": 3 291 | }, 292 | "file_extension": ".py", 293 | "mimetype": "text/x-python", 294 | "name": "python", 295 | "nbconvert_exporter": "python", 296 | "pygments_lexer": "ipython3", 297 | "version": "3.10.9" 298 | } 299 | }, 300 | "nbformat": 4, 301 | "nbformat_minor": 5 302 | } 303 | -------------------------------------------------------------------------------- /prepare_eval_data/prepare_eval_feature_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import torch 6 | from torch import nn 7 | import torch.nn.functional as F 8 | import random, time 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | from scipy import sparse 12 | from scipy.stats import rankdata 13 | import networkx as nx 14 | import pandas as pd 15 | from collections import defaultdict,Counter 16 | from datetime import datetime, date 17 | from itertools import combinations 18 | from preprocess_utils import * 19 | from features_utils import * 20 | from train_model_utils import * 21 | 22 | 23 | 24 | time_start_begin=time.time() 25 | logs_file_name='logs_eval_data_infos' 26 | store_folder="data_pair_solution" 27 | pair_solution_data1=os.path.join(store_folder,"unconnected_2019_pair_solution_connected_2022_clean.parquet") 28 | pair_solution_data2=os.path.join(store_folder,"unconnected_2019_pair_solution_unconnected_2022.parquet") 29 | 30 | time_start = time.time() 31 | eval_pair_solution1 = pd.read_parquet(pair_solution_data1) 32 | print(f"Done, read pair_solution_yes: {len(eval_pair_solution1)}; elapsed_time: {time.time() - time_start}") 33 | 34 | time_start = time.time() 35 | eval_pair_solution2 = pd.read_parquet(pair_solution_data2) 36 | print(f"Done, read pair_solution_not: {len(eval_pair_solution2)}; elapsed_time: {time.time() - time_start}") 37 | 38 | time_start = time.time() 39 | full_eval_pair_result = pd.concat([eval_pair_solution1, eval_pair_solution2]) 40 | print(f"Done, combine all: {len(full_eval_pair_result)}; elapsed_time: {time.time() - time_start}") 41 | with open(logs_file_name+".txt", "a") as myfile: 42 | myfile.write(f"\nDone, combine all: {len(full_eval_pair_result)}; elapsed_time: {time.time() - time_start}") 43 | 44 | 45 | day_origin = date(1990,1,1) 46 | vertex_degree_cutoff=1 47 | min_edges=1 48 | years_delta=3 49 | year_start=2022-years_delta 50 | 51 | rnd_seed=42 52 | random.seed(rnd_seed) 53 | torch.manual_seed(rnd_seed) 54 | np.random.seed(rnd_seed) 55 | 56 | edges_used=10**7 57 | num_row = int(min(edges_used, len(full_eval_pair_result))) 58 | 59 | time_start = time.time() 60 | shuffled = full_eval_pair_result.sample(frac=1, random_state=rnd_seed) 61 | eval_data_pair_solution = shuffled.head(num_row) 62 | 63 | print(f"Done, eval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}") 64 | with open(logs_file_name+".txt", "a") as myfile: 65 | myfile.write(f"\nDone, 
eval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}") 66 | 67 | 68 | 69 | store_eval_folder="data_eval" # store folder 70 | if not os.path.exists(store_eval_folder): 71 | os.makedirs(store_eval_folder) 72 | print(f"store files in {store_eval_folder}.....") 73 | 74 | ###----- store eval_data_pair_solution -----### 75 | time_start = time.time() 76 | store_name=os.path.join(store_eval_folder,"eval_data_pair_solution.parquet") 77 | eval_data_pair_solution.to_parquet(store_name, compression='gzip') 78 | print(f"eval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}") 79 | with open(logs_file_name+".txt", "a") as myfile: 80 | myfile.write(f"\neval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}") 81 | 82 | 83 | 84 | ###----- prepare features -----### 85 | time_start = time.time() 86 | data_folder="data_concept_graph" # folder that stores the full knowledge graph 87 | graph_file=os.path.join(data_folder,"full_dynamic_graph.parquet") # load the full knowledge graph 88 | full_dynamic_graph = pd.read_parquet(graph_file) 89 | print(f"{datetime.now()}: Done, read full_dynamic_graph: {len(full_dynamic_graph)}; elapsed_time: {time.time() - time_start}") 90 | with open(logs_file_name+".txt", "a") as myfile: 91 | myfile.write(f"\n{datetime.now()}: Done, read full_dynamic_graph: {len(full_dynamic_graph)}; elapsed_time: {time.time() - time_start}") 92 | 93 | 94 | start_time=time.time() 95 | adj_mat_sparse=[] 96 | node_neighbor_list=[] 97 | num_neighbor_list=[] 98 | for yy in [year_start,year_start-1,year_start-2]: 99 | data_file=os.path.join("data_for_features", f"adjacency_matrix_{yy}.gz") 100 | adj_mat=get_adjacency_matrix(full_dynamic_graph, yy, data_file) 101 | adj_mat_sparse.append(adj_mat) 102 | 103 | curr_node_neighbor=get_node_neighbor(adj_mat) 104 | node_neighbor_list.append(curr_node_neighbor) 105 | 106 | curr_num_neighbor = np.array(adj_mat.sum(axis=0)).flatten() 107 | num_neighbor_list.append(curr_num_neighbor) 108 | 109 | print(f"{datetime.now()}: Done, adjacency_matrix_sparse; elapsed_time: {time.time() - start_time}") 110 | with open(logs_file_name+".txt", "a") as myfile: 111 | myfile.write(f"\n{datetime.now()}: Done, adjacency_matrix_sparse; elapsed_time: {time.time() - start_time}") 112 | 113 | 114 | start_time=time.time() 115 | feature_folder="data_for_features" 116 | vertex_features=get_all_node_feature(adj_mat_sparse, year_start, feature_folder) 117 | print(f"{datetime.now()}: Done, vertex_features; elapsed_time: {time.time() - start_time}") 118 | with open(logs_file_name+".txt", "a") as myfile: 119 | myfile.write(f"\n{datetime.now()}: Done, vertex_features; elapsed_time: {time.time() - start_time}") 120 | 121 | 122 | start_time=time.time() 123 | vc_feature_list=[] 124 | for yy in [year_start,year_start-1,year_start-2]: 125 | data_file=os.path.join(feature_folder, f"concept_node_citation_data_{yy}.parquet") 126 | vc_df=pd.read_parquet(data_file) 127 | vc_feature=vc_df.values 128 | vc_feature_list.append(vc_feature) 129 | 130 | vertex_cfeatures=get_all_node_cfeature(vc_feature_list) 131 | print(f"{datetime.now()}: Done, vertex_cfeatures; elapsed_time: {time.time() - start_time}") 132 | with open(logs_file_name+".txt", "a") as myfile: 133 | myfile.write(f"\n{datetime.now()}: Done, vertex_cfeatures; elapsed_time: {time.time() - start_time}") 134 | 135 | 136 | 137 | time_start = time.time() 138 | eval_pair_solution=eval_data_pair_solution.values 139 | 
unconnected_vertex_pairs=eval_pair_solution[:,:2] 140 | 141 | pair_features, pair_cfeatures=get_all_pair_features(vc_feature_list, node_neighbor_list, num_neighbor_list, unconnected_vertex_pairs, logs_file_name) 142 | 143 | all_features=[vertex_features, vertex_cfeatures, pair_features, pair_cfeatures] 144 | 145 | eval_data_features=get_all_feature(all_features, unconnected_vertex_pairs, logs_file_name) 146 | 147 | print(f"finish; {len(eval_data_features)}; time: {time.time()-time_start}") 148 | with open(logs_file_name+".txt", "a") as myfile: 149 | myfile.write(f"\nfinish; {len(eval_data_features)}; time: {time.time()-time_start}") 150 | 151 | 152 | ###----- store eval_data_pair_feature -----### 153 | time_start = time.time() 154 | store_name=os.path.join(store_eval_folder,"eval_data_pair_feature.parquet") 155 | data_eval_2022 = pd.DataFrame(eval_data_features) 156 | 157 | # Convert column names to string 158 | data_eval_2022.columns = data_eval_2022.columns.astype(str) 159 | data_eval_2022.to_parquet(store_name, compression='gzip') 160 | 161 | print(f"store data_eval_2022: {len(data_eval_2022)}; elapsed_time: {time.time() - time_start}") 162 | with open(logs_file_name+".txt", "a") as myfile: 163 | myfile.write(f"\nstore data_eval_2022: {len(data_eval_2022)}; elapsed_time: {time.time() - time_start}") 164 | myfile.write(f"\n\n{datetime.now()}: {time.time() - time_start_begin}") 165 | 166 | 167 | 168 | -------------------------------------------------------------------------------- /prepare_eval_data/prepare_eval_feature_data_condition.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import torch 6 | from torch import nn 7 | import torch.nn.functional as F 8 | import random, time 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | from scipy import sparse 12 | from scipy.stats import rankdata 13 | import networkx as nx 14 | import pandas as pd 15 | from collections import defaultdict,Counter 16 | from datetime import datetime, date 17 | from itertools import combinations 18 | from preprocess_utils import * 19 | from features_utils import * 20 | from train_model_utils import * 21 | 22 | 23 | 24 | time_start_begin=time.time() 25 | logs_file_name='logs_eval_data_infos' 26 | store_folder="data_pair_solution" 27 | pair_solution_data1=os.path.join(store_folder,"unconnected_2019_pair_solution_connected_2022_clean.parquet") 28 | 29 | 30 | time_start = time.time() 31 | full_eval_pair_result = pd.read_parquet(pair_solution_data1) 32 | print(f"Done, combine all: {len(full_eval_pair_result)}; elapsed_time: {time.time() - time_start}") 33 | with open(logs_file_name+".txt", "a") as myfile: 34 | myfile.write(f"\nDone, combine all: {len(full_eval_pair_result)}; elapsed_time: {time.time() - time_start}") 35 | 36 | 37 | day_origin = date(1990,1,1) 38 | vertex_degree_cutoff=1 39 | years_delta=3 40 | min_edges=1 41 | year_start=2022-years_delta 42 | 43 | rnd_seed=42 44 | random.seed(rnd_seed) 45 | torch.manual_seed(rnd_seed) 46 | np.random.seed(rnd_seed) 47 | 48 | edges_used=10**7 49 | num_row = int(min(edges_used, len(full_eval_pair_result))) 50 | 51 | time_start = time.time() 52 | shuffled = full_eval_pair_result.sample(frac=1, random_state=rnd_seed) 53 | eval_data_pair_solution = shuffled.head(num_row) 54 | 55 | print(f"Done, eval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}") 56 | with open(logs_file_name+".txt", "a") as myfile: 57 | 
myfile.write(f"\nDone, eval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}") 58 | 59 | 60 | 61 | store_eval_folder="data_eval" # store folder 62 | if not os.path.exists(store_eval_folder): 63 | os.makedirs(store_eval_folder) 64 | print(f"store files in {store_eval_folder}.....") 65 | 66 | ###----- store eval_data_pair_solution -----### 67 | time_start = time.time() 68 | store_name=os.path.join(store_eval_folder,"eval_data_pair_solution_condition.parquet") 69 | eval_data_pair_solution.to_parquet(store_name, compression='gzip') 70 | print(f"eval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}") 71 | with open(logs_file_name+".txt", "a") as myfile: 72 | myfile.write(f"\neval_data_pair_solution: {len(eval_data_pair_solution)}; elapsed_time: {time.time() - time_start}") 73 | 74 | 75 | ###----- prepare features -----### 76 | time_start = time.time() 77 | data_folder="data_concept_graph" # folder that stores the full knowledge graph 78 | graph_file=os.path.join(data_folder,"full_dynamic_graph.parquet") # load the full knowledge graph 79 | full_dynamic_graph = pd.read_parquet(graph_file) 80 | print(f"{datetime.now()}: Done, read full_dynamic_graph: {len(full_dynamic_graph)}; elapsed_time: {time.time() - time_start}") 81 | with open(logs_file_name+".txt", "a") as myfile: 82 | myfile.write(f"\n{datetime.now()}: Done, read full_dynamic_graph: {len(full_dynamic_graph)}; elapsed_time: {time.time() - time_start}") 83 | 84 | 85 | start_time=time.time() 86 | adj_mat_sparse=[] 87 | node_neighbor_list=[] 88 | num_neighbor_list=[] 89 | for yy in [year_start,year_start-1,year_start-2]: 90 | data_file=os.path.join("data_for_features", f"adjacency_matrix_{yy}.gz") 91 | adj_mat=get_adjacency_matrix(full_dynamic_graph, yy, data_file) 92 | adj_mat_sparse.append(adj_mat) 93 | 94 | curr_node_neighbor=get_node_neighbor(adj_mat) 95 | node_neighbor_list.append(curr_node_neighbor) 96 | 97 | curr_num_neighbor = np.array(adj_mat.sum(axis=0)).flatten() # array 98 | num_neighbor_list.append(curr_num_neighbor) 99 | 100 | print(f"{datetime.now()}: Done, adjacency_matrix_sparse; elapsed_time: {time.time() - start_time}") 101 | with open(logs_file_name+".txt", "a") as myfile: 102 | myfile.write(f"\n{datetime.now()}: Done, adjacency_matrix_sparse; elapsed_time: {time.time() - start_time}") 103 | 104 | 105 | start_time=time.time() 106 | vertex_features=get_all_node_feature(adj_mat_sparse, year_start, "data_for_features") 107 | print(f"{datetime.now()}: Done, vertex_features; elapsed_time: {time.time() - start_time}") 108 | with open(logs_file_name+".txt", "a") as myfile: 109 | myfile.write(f"\n{datetime.now()}: Done, vertex_features; elapsed_time: {time.time() - start_time}") 110 | 111 | 112 | start_time=time.time() 113 | vc_feature_list=[] 114 | for yy in [year_start,year_start-1,year_start-2]: 115 | data_file=os.path.join("data_for_features", f"concept_node_citation_data_{yy}.parquet") 116 | vc_df=pd.read_parquet(data_file) 117 | vc_feature=vc_df.values 118 | vc_feature_list.append(vc_feature) 119 | 120 | vertex_cfeatures=get_all_node_cfeature(vc_feature_list) 121 | print(f"{datetime.now()}: Done, vertex_cfeatures; elapsed_time: {time.time() - start_time}") 122 | with open(logs_file_name+".txt", "a") as myfile: 123 | myfile.write(f"\n{datetime.now()}: Done, vertex_cfeatures; elapsed_time: {time.time() - start_time}") 124 | 125 | 126 | 127 | time_start = time.time() 128 | eval_pair_solution=eval_data_pair_solution.values 129 | 
unconnected_vertex_pairs=eval_pair_solution[:,:2] 130 | 131 | pair_features, pair_cfeatures=get_all_pair_features(vc_feature_list, node_neighbor_list, num_neighbor_list, unconnected_vertex_pairs, logs_file_name) 132 | 133 | all_features=[vertex_features, vertex_cfeatures, pair_features, pair_cfeatures] 134 | 135 | eval_data_features=get_all_feature(all_features, unconnected_vertex_pairs, logs_file_name) 136 | 137 | print(f"finish; {len(eval_data_features)}; time: {time.time()-time_start}") 138 | with open(logs_file_name+".txt", "a") as myfile: 139 | myfile.write(f"\nfinish; {len(eval_data_features)}; time: {time.time()-time_start}") 140 | 141 | 142 | ###----- store eval_data_pair_feature -----### 143 | time_start = time.time() 144 | store_name=os.path.join(store_eval_folder,"eval_data_pair_feature_condition.parquet") 145 | data_eval_2022 = pd.DataFrame(eval_data_features) 146 | 147 | # Convert column names to string 148 | data_eval_2022.columns = data_eval_2022.columns.astype(str) 149 | data_eval_2022.to_parquet(store_name, compression='gzip') 150 | 151 | print(f"store data_eval_2022: {len(data_eval_2022)}; elapsed_time: {time.time() - time_start}") 152 | with open(logs_file_name+".txt", "a") as myfile: 153 | myfile.write(f"\nstore data_eval_2022: {len(data_eval_2022)}; elapsed_time: {time.time() - time_start}") 154 | myfile.write(f"\n\n{datetime.now()}: {time.time() - time_start_begin}") 155 | 156 | -------------------------------------------------------------------------------- /prepare_node_pair_citation_data_years.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "434dcf89-1cc0-4077-9c3e-d893f55838c9", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "from datetime import datetime, date\n", 12 | "import random, time\n", 13 | "import numpy as np\n", 14 | "import matplotlib.pyplot as plt\n", 15 | "import pandas as pd\n", 16 | "import torch\n", 17 | "from torch import nn\n", 18 | "from scipy import sparse\n", 19 | "from collections import defaultdict\n", 20 | "import pandas as pd\n", 21 | "import networkx as nx\n", 22 | "import copy\n", 23 | "import gzip\n", 24 | "import pickle\n", 25 | "from scipy.stats import rankdata\n", 26 | "import time" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "id": "cfc6ed42-d6a2-46de-9b8a-bc5994d96b1f", 32 | "metadata": {}, 33 | "source": [ 34 | "### single concept's citation features" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "4f99ff28-b872-4777-8df2-8cfbd1a5031b", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "time_start = time.time()\n", 45 | "data_folder=\"data_concept_graph\"\n", 46 | "\n", 47 | "# Read all concepts together with time, citation information\n", 48 | "dynamic_concept_file=os.path.join(data_folder,\"full_dynamic_concept.parquet\")\n", 49 | "full_concepts_dynamic_data = pd.read_parquet(dynamic_concept_file)\n", 50 | "\n", 51 | "# Read all concepts from full_concepts_for_openalex.txt\n", 52 | "concepts_files = os.path.join(data_folder, 'full_domain_concepts.txt')\n", 53 | "with open(concepts_files, 'r') as file:\n", 54 | " full_concepts = [concept.strip() for concept in file.readlines()]\n", 55 | "\n", 56 | "print(f\"Done, elapsed_time: {time.time() - time_start}\\n full_concepts_dynamic_data: {len(full_concepts_dynamic_data)};\\n full_concept: {len(full_concepts)}\")\n" 57 | ] 58 | }, 59 | { 60 | "cell_type": 
"code", 61 | "execution_count": null, 62 | "id": "c610f570-ce15-4f5b-9bb3-1640bc0a7cab", 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "NUM_OF_VERTICES=len(full_concepts)\n", 67 | "vertex_degree_cutoff=1\n", 68 | "years_delta=3\n", 69 | "min_edges=1" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "id": "977a6454-4b41-4a62-ad2f-ace410365751", 76 | "metadata": { 77 | "tags": [] 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "\n", 82 | "years=[2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]\n", 83 | "\n", 84 | "day_origin = date(1990,1,1)\n", 85 | "all_concepts_df = pd.DataFrame({'v1': range(0, NUM_OF_VERTICES)})\n", 86 | "\n", 87 | "store_folder=\"data_for_features\"\n", 88 | "if not os.path.exists(store_folder):\n", 89 | " os.makedirs(store_folder)\n", 90 | "\n", 91 | "start_time=time.time()\n", 92 | "for yy in years: \n", 93 | " print(f'Year: {yy}')\n", 94 | " day_curr=(date(yy,12,31)- day_origin).days\n", 95 | " columns_to_subtract = [f'c{i}' for i in range(2023, yy, -1)]\n", 96 | " print(columns_to_subtract)\n", 97 | " cols_to_sum = [f'c{i}' for i in range(yy, yy-years_delta, -1)]\n", 98 | " print(cols_to_sum)\n", 99 | " \n", 100 | " dynamic_concepts=full_concepts_dynamic_data[full_concepts_dynamic_data['time']<=day_curr]\n", 101 | " dynamic_concepts_df = dynamic_concepts.copy()\n", 102 | " \n", 103 | " dynamic_concepts_df[f'ct_{yy}'] = dynamic_concepts_df['ct'] - dynamic_concepts_df[columns_to_subtract].sum(axis=1)\n", 104 | " \n", 105 | " dynamic_concepts_df['ct_delta'] = dynamic_concepts_df[cols_to_sum].sum(axis=1)\n", 106 | " \n", 107 | " dynamic_concepts_df=dynamic_concepts_df[['v1', f'c{yy}', f'ct_{yy}', 'ct_delta']]\n", 108 | " \n", 109 | " dynamic_concepts_grouped = dynamic_concepts_df.groupby('v1').agg({f'c{yy}':'sum', f'ct_{yy}':'sum', 'ct_delta':'sum', 'v1':'size'}).rename(columns={'v1':f'num'}).reset_index()\n", 110 | " \n", 111 | " dynamic_concepts_grouped[f'c{yy}_m'] = dynamic_concepts_grouped[f'c{yy}'] / dynamic_concepts_grouped[f'num']\n", 112 | " dynamic_concepts_grouped[f'ct_{yy}_m'] = dynamic_concepts_grouped[f'ct_{yy}'] / dynamic_concepts_grouped[f'num']\n", 113 | " dynamic_concepts_grouped[f'ct_delta_m'] = dynamic_concepts_grouped['ct_delta'] / dynamic_concepts_grouped[f'num']\n", 114 | " \n", 115 | " \n", 116 | " # Merge with all_concepts_df\n", 117 | " dynamic_concepts_data = pd.merge(all_concepts_df, dynamic_concepts_grouped, on='v1', how='left')\n", 118 | " dynamic_concepts_data.fillna(0, inplace=True) # Fill NaN values with 0\n", 119 | " dynamic_concepts_data.sort_values(by='v1')\n", 120 | " \n", 121 | " data_file = os.path.join(store_folder, f\"concept_node_citation_data_{yy}.parquet\")\n", 122 | " dynamic_concepts_data.to_parquet(data_file, compression='gzip')\n", 123 | " print(f\"in {yy}; time: {time.time()-start_time}\\n\")\n", 124 | " start_time=time.time()\n" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "id": "f1147d2c-2fb7-4f12-a89f-e26d3a2d4689", 130 | "metadata": {}, 131 | "source": [ 132 | "### concept pair's citation features" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "id": "aed3dd3b-eb2a-474c-8665-9743514d55d2", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "time_start = time.time()\n", 143 | "data_folder=\"data_concept_graph\"\n", 144 | "\n", 145 | "# Read all concepts together with time, citation information\n", 146 | "graph_file=os.path.join(data_folder,\"full_dynamic_graph.parquet\")\n", 147 | 
"full_edge_dynamic_data = pd.read_parquet(graph_file)\n", 148 | "\n", 149 | "print(f\"Done, elapsed_time: {time.time() - time_start}\\n full_edge_dynamic_data: {len(full_edge_dynamic_data)};\\n\")\n" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "e3f5fc43-a136-4959-8b29-1717eca77bbe", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "\n", 160 | "years=[2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]\n", 161 | "\n", 162 | "day_origin = date(1990,1,1)\n", 163 | " \n", 164 | "store_folder=\"data_for_features\"\n", 165 | "start_time=time.time()\n", 166 | "for yy in years: \n", 167 | " print(f'Year: {yy}')\n", 168 | " day_curr=(date(yy,12,31)- day_origin).days\n", 169 | " columns_to_subtract = [f'c{i}' for i in range(2023, yy, -1)]\n", 170 | " print(columns_to_subtract)\n", 171 | " cols_to_sum = [f'c{i}' for i in range(yy, yy-years_delta, -1)]\n", 172 | " print(cols_to_sum)\n", 173 | " \n", 174 | " dynamic_pairs=full_edge_dynamic_data[full_edge_dynamic_data['time']<=day_curr]\n", 175 | " dynamic_pairs_df = dynamic_pairs.copy()\n", 176 | " \n", 177 | " dynamic_pairs_df[f'ct_{yy}'] = dynamic_pairs_df['ct'] - dynamic_pairs_df[columns_to_subtract].sum(axis=1)\n", 178 | " \n", 179 | " dynamic_pairs_df['ct_delta'] = dynamic_pairs_df[cols_to_sum].sum(axis=1)\n", 180 | " \n", 181 | " dynamic_pairs_df=dynamic_pairs_df[['v1', 'v2', f'c{yy}', f'ct_{yy}', 'ct_delta']]\n", 182 | " \n", 183 | " dynamic_pairs_grouped = dynamic_pairs_df.groupby(['v1','v2']).agg({f'c{yy}':'sum', f'ct_{yy}':'sum', 'ct_delta':'sum', 'v1':'size'}).rename(columns={'v1':f'num'}).reset_index()\n", 184 | " \n", 185 | " dynamic_pairs_grouped[f'c{yy}_m'] = dynamic_pairs_grouped[f'c{yy}'] / dynamic_pairs_grouped[f'num']\n", 186 | " dynamic_pairs_grouped[f'ct_{yy}_m'] = dynamic_pairs_grouped[f'ct_{yy}'] / dynamic_pairs_grouped[f'num']\n", 187 | " dynamic_pairs_grouped[f'ct_delta_m'] = dynamic_pairs_grouped['ct_delta'] / dynamic_pairs_grouped[f'num']\n", 188 | " \n", 189 | " data_file = os.path.join(store_folder, f\"concept_pair_citation_data_{yy}.parquet\")\n", 190 | " dynamic_pairs_grouped.to_parquet(data_file, compression='gzip')\n", 191 | " print(f\"in {yy}; time: {time.time()-start_time}\\n\")\n", 192 | " start_time=time.time()\n", 193 | " " 194 | ] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "asl_semnet", 200 | "language": "python", 201 | "name": "asl_semnet" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": { 205 | "name": "ipython", 206 | "version": 3 207 | }, 208 | "file_extension": ".py", 209 | "mimetype": "text/x-python", 210 | "name": "python", 211 | "nbconvert_exporter": "python", 212 | "pygments_lexer": "ipython3", 213 | "version": "3.10.9" 214 | } 215 | }, 216 | "nbformat": 4, 217 | "nbformat_minor": 5 218 | } 219 | -------------------------------------------------------------------------------- /preprocess_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import random, time 6 | import numpy as np 7 | import matplotlib.pyplot as plt 8 | from scipy import sparse 9 | from scipy.stats import rankdata 10 | import networkx as nx 11 | import pandas as pd 12 | from collections import defaultdict,Counter 13 | from datetime import datetime, date 14 | from itertools import combinations 15 | from sklearn.metrics import roc_auc_score, roc_curve, auc, classification_report 16 | from sklearn.metrics import 
precision_recall_curve, average_precision_score, confusion_matrix
17 | from features_utils import *
18 | 
19 | 
20 | def prepare_split_datasets(full_train_data, user_parameter, logs_file_name):
21 |     """
22 |     Split the whole dataset (unconnected pairs) into two classes, positive and negative samples
23 | 
24 |     Parameters:
25 |     - full_train_data: full dataset for training or testing
26 |     - user_parameter: some user parameters which are num_class, IR_num, split_type, out_norm
27 |     - logs_file_name: base name of the run's log file; the function appends "_logs.txt" when writing
28 | 
29 |     return:
30 |     data_subsets: negative and positive samples
31 |     """
32 | 
33 |     num_class, IR_num, split_type, out_norm = user_parameter
34 | 
35 |     time_start=time.time()
36 |     data_subsets = []
37 |     if split_type==0: ## IR_num=[100]; namely negative: citation < IR_num[0], positive: citation >= IR_num[0]
38 |         data_subsets.append(full_train_data[full_train_data[:, 2] < IR_num[0]])  # negative samples, citations below IR_num[0]
39 |         data_subsets.append(full_train_data[full_train_data[:, 2] >= IR_num[0]]) # get all the positive samples, which are the citations >= IR_num[0]
40 |     else: # conditional case
41 |         for i in range(len(IR_num)-1): # e.g., IR_num=[[0,5], 100]; namely negative: citation in [0,5], positive: citation >= 100
42 |             data_subsets.append(full_train_data[(full_train_data[:, 2] >= IR_num[i][0]) & (full_train_data[:, 2] <= IR_num[i][1])])
43 |         data_subsets.append(full_train_data[full_train_data[:, 2] >= IR_num[-1]])
44 | 
45 |     if split_type==0:
46 |         if len(data_subsets[0])<=3*10**7: # usually not met, since data_subsets[0] holds more than 600 million pairs
47 |             num_row_chose=len(data_subsets[0])
48 |         else:
49 |             num_row_chose=min(len(data_subsets[0]),len(data_subsets[1])) # take the same size as the positive cases
50 |     else: # conditional case
51 |         num_row_chose=len(data_subsets[0])
52 | 
53 |     indices = np.random.choice(data_subsets[0].shape[0], size=num_row_chose, replace=False)
54 |     data_subsets[0] = data_subsets[0][indices] # randomly choose num_row_chose negative samples
55 | 
56 |     print(f"dataset len: {len(data_subsets[0])}, {len(data_subsets[1])}; num_row_chose: {num_row_chose}; {time.time()-time_start}s")
57 |     with open(logs_file_name+"_logs.txt", "a") as myfile:
58 |         myfile.write(f"\ndataset len: {len(data_subsets[0])}, {len(data_subsets[1])}; num_row_chose: {num_row_chose}; {time.time()-time_start}s")
59 | 
60 | 
61 |     return data_subsets
62 | 
63 | 
64 | def shuffle_split_datasets(data_subsets, train_valid_test_size):
65 |     """
66 |     Split the dataset into training and testing sets
67 | 
68 |     Parameters:
69 |     - data_subsets: the prepared negative and positive samples (unconnected pairs)
70 |     - train_valid_test_size: split ratio, here train_valid_test_size=[0.85, 0.15, 0.0]; 85% for training, 15% for testing; the evaluation set uses future data
71 | 
72 |     return:
73 |     dataset_train: training dataset
74 |     dataset_test: testing dataset
75 |     """
76 |     dataset_train = []
77 |     dataset_test = []
78 |     for subset in data_subsets:
79 |         np.random.shuffle(subset)
80 |         idx_train = int(len(subset) * train_valid_test_size[0])
81 |         train_set = subset[:idx_train]
82 |         test_set = subset[idx_train:]
83 |         dataset_train.append(train_set)
84 |         dataset_test.append(test_set)
85 | 
86 |     return dataset_train, dataset_test
87 | 
88 | 
89 | def get_pair_solution_datasets(data_subsets, hyper_parameter, user_parameter, logs_file_name):
90 |     """
91 |     prepare the unconnected pairs in year y and their corresponding future citation solutions
92 | 
93 |     Parameters:
94 |     - data_subsets: the prepared negative and positive samples (unconnected pairs)
95 |     - hyper_parameter: batch_size, lr_enc, rnd_seed
96 |     - user_parameter: not used here
97 |     - logs_file_name: txt
file for logging the running status 98 | 99 | return: 100 | train_edge_pair: dataset (unconnected pairs) 101 | train_edge_solution: the corresponding citations 102 | """ 103 | num_class, IR_num, split_type, out_norm = user_parameter # not used here 104 | batch_size, lr_enc, rnd_seed=hyper_parameter 105 | start_time=time.time() 106 | 107 | for idx, subset in enumerate(data_subsets): 108 | 109 | min_num_row=min(batch_size, len(subset)) 110 | num_new_samples = len(subset) - min_num_row ## data_subsets[0]>=batch_size 111 | if num_new_samples<0: # data_subsets[0]= IR_num[0]]) # features for positive samples 152 | else: # conditional case 153 | for i in range(len(IR_num) - 1): 154 | data_input.append(data_feature[(data_solution >= IR_num[i][0]) & (data_solution <=IR_num[i][1])]) 155 | data_input.append(data_feature[data_solution >= IR_num[-1]]) 156 | 157 | print(f"\n finish split_data_features: {time.time()-start_time}") 158 | with open(logs_file_name+"_logs.txt", "a") as myfile: 159 | myfile.write(f"\n finish split_data_features: {time.time()-start_time}") 160 | 161 | return data_feature, data_input 162 | 163 | 164 | 165 | ######### classify_solution ############### 166 | def classify_solution(data_solution, user_parameter): 167 | """ 168 | classfiy the citation solution into 0 and 1 classes 169 | 170 | Parameters: 171 | - data_solution: the citation solution 172 | - user_parameter: num_class, IR_num, split_type, out_norm 173 | 174 | return: 175 | solution_arr: 0-1 numpy array, 0 for negative samples, 1 for postive samples 176 | """ 177 | num_class, IR_num, split_type, out_norm = user_parameter 178 | solution_arr = np.zeros_like(data_solution) 179 | 180 | if split_type==0: ## only binary 181 | solution_arr[data_solution < IR_num[0]] = 0 182 | solution_arr[data_solution >= IR_num[0]] = 1 183 | 184 | else: # conditional case 185 | for i in range(len(IR_num) - 1): # in this work, IR_num=[[0,5], 100] 186 | solution_arr[(data_solution >= IR_num[i][0]) & (data_solution <=IR_num[i][1])] = i # in this case, 0 187 | solution_arr[data_solution >= IR_num[-1]] = len(IR_num) - 1 # in this case, 1 188 | 189 | return solution_arr 190 | 191 | 192 | 193 | 194 | 195 | -------------------------------------------------------------------------------- /train_model_2019_condition.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import torch 6 | from torch import nn 7 | import torch.nn.functional as F 8 | import random, time 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | from scipy import sparse 12 | from scipy.stats import rankdata 13 | import networkx as nx 14 | import pandas as pd 15 | from collections import defaultdict,Counter 16 | from datetime import datetime, date 17 | from itertools import combinations 18 | from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, auc 19 | from general_utils import * 20 | from features_utils import * 21 | from preprocess_utils import * 22 | from train_model_utils import * 23 | 24 | 25 | 26 | rn_time=random.random()*30 27 | time.sleep(rn_time) 28 | 29 | if __name__ == '__main__': 30 | 31 | 32 | split_type=1 # 1 is for conditional case 33 | out_norm=False # we fix this to False, using the raw scores from the neural network output 34 | num_class=2 # binary classfication, fixed 35 | day_origin = date(1990,1,1) # the baseline time 36 | 37 | vertex_degree_cutoff=1 # fixed, the vertex has at least one edge connecting to it 38 | min_edges=1 # fixed, 
minimal number of edges that is considered, not used in the work, can be removed 39 | years_delta=3 # year gap is 3 years 40 | year_start=2019-years_delta # train 2016 for 2019 41 | 42 | graph_parameter=[year_start,years_delta,vertex_degree_cutoff, min_edges] # parameters for the knowledge graph 43 | 44 | # create folders and subfolders 45 | # it will create a main folder: 2016_train_condition that contains subfolders: t0_c2_log, t0_c2_net, t0_c2_loss, t0_c2_curve, t0_c2_result 46 | save_folders, log_folder=make_folders(year_start, split_type, num_class, "train_condition") 47 | 48 | log_run=os.path.join(log_folder,f"train_model_{year_start+years_delta}_run_1") # just a log file to check the running status 49 | with open(log_run+"_logs.txt", "a") as myfile: 50 | myfile.write(f"\n\nstart: {datetime.now()}\n") 51 | 52 | # load the full dynamic graph 53 | start_time = time.time() 54 | data_folder="data_concept_graph" # folder that stores the full knowledge graph 55 | graph_file=os.path.join(data_folder,"full_dynamic_graph.parquet") 56 | full_dynamic_graph = pd.read_parquet(graph_file) 57 | with open(log_run+"_logs.txt", "a") as myfile: 58 | myfile.write(f"\n{datetime.now()}: Done, read full_dynamic_graph: {len(full_dynamic_graph)}; elapsed_time: {time.time() - start_time}") 59 | 60 | # load data for preparing different type of features 61 | feature_folder="data_for_features" # folder that stores data used for preparing features 62 | start_time=time.time() 63 | adj_mat_sparse=[] 64 | node_neighbor_list=[] 65 | num_neighbor_list=[] 66 | for yy in [year_start,year_start-1,year_start-2]: 67 | data_file=os.path.join(feature_folder, f"adjacency_matrix_{yy}.gz") # load the adjacency_matrix file 68 | adj_mat=get_adjacency_matrix(full_dynamic_graph, yy, data_file) 69 | adj_mat_sparse.append(adj_mat) 70 | 71 | curr_node_neighbor=get_node_neighbor(adj_mat) 72 | node_neighbor_list.append(curr_node_neighbor) 73 | 74 | curr_num_neighbor = np.array(adj_mat.sum(axis=0)).flatten() # array 75 | num_neighbor_list.append(curr_num_neighbor) 76 | 77 | with open(log_run+"_logs.txt", "a") as myfile: 78 | myfile.write(f"\n{datetime.now()}: Done, adjacency_matrix_sparse; elapsed_time: {time.time() - start_time}") 79 | 80 | start_time=time.time() 81 | vertex_features=get_all_node_feature(adj_mat_sparse, year_start, feature_folder) # prepare all the node features for a vertex in years y, y-1, y-2 82 | 83 | # load data for preparing different type of citation features 84 | start_time=time.time() 85 | vc_feature_list=[] 86 | for yy in [year_start,year_start-1,year_start-2]: 87 | data_file=os.path.join(feature_folder, f"concept_node_citation_data_{yy}.parquet") # load the citation information for concepts in year yy 88 | vc_df=pd.read_parquet(data_file) 89 | vc_feature=vc_df.values 90 | vc_feature_list.append(vc_feature) 91 | 92 | vertex_cfeatures=get_all_node_cfeature(vc_feature_list) 93 | with open(log_run+"_logs.txt", "a") as myfile: 94 | myfile.write(f"\n{datetime.now()}: Done, vertex_cfeatures; elapsed_time: {time.time() - start_time}") 95 | 96 | pair_cf_parameter=[vc_feature_list, node_neighbor_list, num_neighbor_list, vertex_features, vertex_cfeatures] # later used for pair features and cfeatures 97 | 98 | # load the whole unconnected pairs for training and testing 99 | train_data_folder = 'data_pair_solution' 100 | train_pair_file1=os.path.join(train_data_folder,f"unconnected_{year_start}_pair_solution_connected_{year_start+years_delta}_clean.parquet") 101 | 102 | time_start = time.time() 103 | train_pair_data_yes = 
pd.read_parquet(train_pair_file1) 104 | with open(log_run+"_logs.txt", "a") as myfile: 105 | myfile.write(f"\nDone, read unconnected_{year_start}_pair_solution_connected_{year_start+years_delta}: {len(train_pair_data_yes)}; elapsed_time: {time.time() - time_start}") 106 | 107 | time_start = time.time() 108 | full_train_data=train_pair_data_yes.values 109 | with open(log_run+"_logs.txt", "a") as myfile: 110 | myfile.write(f"\nDone, combine all: {len(full_train_data)}; elapsed_time: {time.time() - time_start}") 111 | 112 | full_dynamic_graph=pd.DataFrame() 113 | train_pair_data_yes=pd.DataFrame() 114 | 115 | # load the evaluation data feature and solutions 116 | eval_folder="data_eval" # folder that stores the evaluatuion datasets, unconnected pairs, features, solutions 117 | start_time = time.time() 118 | eval_file=os.path.join(eval_folder,"eval_data_pair_solution_condition.parquet") 119 | eval_data_features_df = pd.read_parquet(eval_file) 120 | eval_data_solution=eval_data_features_df.values 121 | eval_data_features_df=pd.DataFrame() 122 | with open(log_run+"_logs.txt", "a") as myfile: 123 | myfile.write(f"finish loading eval_data_features; {time.time()-start_time}") 124 | 125 | 126 | start_time = time.time() 127 | eval_file=os.path.join(eval_folder,"eval_data_pair_feature_condition.parquet") 128 | eval_data_features_df = pd.read_parquet(eval_file) 129 | eval_data_features=eval_data_features_df.values 130 | eval_data_features_df=pd.DataFrame() 131 | with open(log_run+"_logs.txt", "a") as myfile: 132 | myfile.write(f"\nfinish loading eval_data_solution; {time.time()-start_time}") 133 | 134 | 135 | IR_min=[5] 136 | IR_max=[100] 137 | 138 | for id_min in IR_min: 139 | 140 | for id_max in IR_max: 141 | 142 | IR_num=[[0,id_min], id_max] # IR_num=[[0,5], 100] 143 | IR_Str=format_IR(IR_num, split_type) 144 | 145 | logs_file_name=os.path.join(log_folder,f"train_model_{year_start+years_delta}_"+IR_Str) 146 | if not os.path.exists(logs_file_name+"_logs.txt"): 147 | current_time=datetime.now() 148 | open(logs_file_name+"_logs.txt", 'a').close() 149 | 150 | batch_size=1000 151 | lr_enc=3*10**-5 152 | rnd_seed=42 153 | hyper_parameter=[batch_size, lr_enc, rnd_seed] 154 | graph_parameter=[year_start,years_delta,vertex_degree_cutoff, min_edges] 155 | user_parameter=[num_class, IR_num, split_type, out_norm] 156 | 157 | impact_classfication(full_train_data, eval_data_features, eval_data_solution[:,2], pair_cf_parameter, hyper_parameter, graph_parameter, user_parameter, save_folders, logs_file_name) 158 | 159 | rn_time=random.random()*30 160 | time.sleep(rn_time) 161 | 162 | else: 163 | pass 164 | 165 | with open(log_run+"_logs.txt", "a") as myfile: 166 | myfile.write(f"\nfinish: {datetime.now()}\n") 167 | 168 | -------------------------------------------------------------------------------- /train_model_2019_individual_feature.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import torch 6 | from torch import nn 7 | import torch.nn.functional as F 8 | import random, time 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | from scipy import sparse 12 | from scipy.stats import rankdata 13 | import networkx as nx 14 | import pandas as pd 15 | from collections import defaultdict,Counter 16 | from datetime import datetime, date 17 | from itertools import combinations 18 | from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, auc 19 | from general_utils import * 20 | from 
preprocess_utils import * 21 | from features_utils import * 22 | from train_model_utils import * 23 | 24 | 25 | 26 | rn_time=random.random()*30 27 | time.sleep(rn_time) 28 | 29 | if __name__ == '__main__': 30 | 31 | 32 | split_type=0 # 1 is for conditional case 33 | out_norm=False # we fix this to False, using the raw scores from the neural network output 34 | num_class=2 # binary classfication, fixed 35 | day_origin = date(1990,1,1) # the baseline time 36 | 37 | vertex_degree_cutoff=1 # fixed, the vertex has at least one edge connecting to it 38 | min_edges=1 # fixed, minimal number of edges that is considered, not used in the work, can be removed 39 | years_delta=3 # year gap is 3 years 40 | year_start=2019-years_delta # train 2016 for 2019 41 | 42 | graph_parameter=[year_start, years_delta, vertex_degree_cutoff, min_edges] # parameters for the knowledge graph 43 | 44 | # create folders and subfolders 45 | # it will create a main folder: 2016_train_each that contains subfolders: t0_c2_log, t0_c2_net, t0_c2_loss, t0_c2_curve, t0_c2_result 46 | save_folders, log_folder=make_folders(year_start, split_type, num_class, "train_each") 47 | 48 | log_run=os.path.join(log_folder,f"train_model_{year_start+years_delta}_single_run") # just a log file to check the running status 49 | with open(log_run+"_logs.txt", "a") as myfile: 50 | myfile.write(f"\n\nstart: {datetime.now()}\n") 51 | 52 | # load the full dynamic graph 53 | start_time = time.time() 54 | data_folder="data_concept_graph" # folder that stores the full knowledge graph 55 | graph_file=os.path.join(data_folder,"full_dynamic_graph.parquet") 56 | full_dynamic_graph = pd.read_parquet(graph_file) 57 | with open(log_run+"_logs.txt", "a") as myfile: 58 | myfile.write(f"\n{datetime.now()}: Done, read full_dynamic_graph: {len(full_dynamic_graph)}; elapsed_time: {time.time() - start_time}") 59 | 60 | # load data for preparing different type of features 61 | feature_folder="data_for_features" # folder that stores data used for preparing features 62 | start_time=time.time() 63 | adj_mat_sparse=[] 64 | node_neighbor_list=[] 65 | num_neighbor_list=[] 66 | for yy in [year_start,year_start-1,year_start-2]: 67 | data_file=os.path.join(feature_folder, f"adjacency_matrix_{yy}.gz") 68 | adj_mat=get_adjacency_matrix(full_dynamic_graph, yy, data_file) 69 | adj_mat_sparse.append(adj_mat) 70 | 71 | curr_node_neighbor=get_node_neighbor(adj_mat) 72 | node_neighbor_list.append(curr_node_neighbor) 73 | 74 | curr_num_neighbor = np.array(adj_mat.sum(axis=0)).flatten() # array 75 | num_neighbor_list.append(curr_num_neighbor) 76 | 77 | with open(log_run+"_logs.txt", "a") as myfile: 78 | myfile.write(f"\n{datetime.now()}: Done, adjacency_matrix_sparse; elapsed_time: {time.time() - start_time}") 79 | 80 | start_time=time.time() 81 | vertex_features=get_all_node_feature(adj_mat_sparse, year_start, feature_folder) 82 | 83 | # load data for preparing different type of citation features 84 | start_time=time.time() 85 | vc_feature_list=[] 86 | for yy in [year_start,year_start-1,year_start-2]: 87 | data_file=os.path.join(feature_folder, f"concept_node_citation_data_{yy}.parquet") 88 | vc_df=pd.read_parquet(data_file) 89 | vc_feature=vc_df.values 90 | vc_feature_list.append(vc_feature) 91 | 92 | vertex_cfeatures=get_all_node_cfeature(vc_feature_list) 93 | with open(log_run+"_logs.txt", "a") as myfile: 94 | myfile.write(f"\n{datetime.now()}: Done, vertex_cfeatures; elapsed_time: {time.time() - start_time}") 95 | 96 | pair_cf_parameter=[vc_feature_list, node_neighbor_list, 
num_neighbor_list, vertex_features, vertex_cfeatures] 97 | 98 | # load the whole unconnected pairs for training and testing 99 | train_data_folder = 'data_pair_solution' # folder that stores the unconnected pairs and their citation informations in the future 100 | train_pair_file1=os.path.join(train_data_folder,f"unconnected_{year_start}_pair_solution_connected_{year_start+years_delta}_clean.parquet") 101 | train_pair_file2=os.path.join(train_data_folder,f"unconnected_{year_start}_pair_solution_unconnected_{year_start+years_delta}.parquet") 102 | 103 | time_start = time.time() 104 | train_pair_data_yes = pd.read_parquet(train_pair_file1) 105 | with open(log_run+"_logs.txt", "a") as myfile: 106 | myfile.write(f"\nDone, read unconnected_{year_start}_pair_solution_connected_{year_start+years_delta}: {len(train_pair_data_yes)}; elapsed_time: {time.time() - time_start}") 107 | 108 | time_start = time.time() 109 | train_pair_data_no = pd.read_parquet(train_pair_file2) 110 | with open(log_run+"_logs.txt", "a") as myfile: 111 | myfile.write(f"\nDone, read unconnected_{year_start}_pair_solution_unconnected_{year_start+years_delta}: {len(train_pair_data_no)}; elapsed_time: {time.time() - time_start}") 112 | 113 | time_start = time.time() 114 | full_train_data=np.concatenate((train_pair_data_yes.values, train_pair_data_no.values), axis=0) 115 | with open(log_run+"_logs.txt", "a") as myfile: 116 | myfile.write(f"\nDone, combine all: {len(full_train_data)}; elapsed_time: {time.time() - time_start}") 117 | 118 | full_dynamic_graph=pd.DataFrame() 119 | train_pair_data_yes=pd.DataFrame() 120 | train_pair_data_no=pd.DataFrame() 121 | 122 | # load the evaluation data feature and solutions 123 | eval_folder="data_eval" # folder that stores the evaluatuion datasets, unconnected pairs, features, solutions 124 | start_time = time.time() 125 | eval_file=os.path.join(eval_folder,"eval_data_pair_solution.parquet") 126 | eval_data_features_df = pd.read_parquet(eval_file) 127 | eval_data_solution=eval_data_features_df.values 128 | eval_data_features_df=pd.DataFrame() 129 | with open(log_run+"_logs.txt", "a") as myfile: 130 | myfile.write(f"finish loading eval_data_features; {time.time()-start_time}") 131 | 132 | 133 | start_time = time.time() 134 | eval_file=os.path.join(eval_folder,"eval_data_pair_feature.parquet") 135 | eval_data_features_df = pd.read_parquet(eval_file) 136 | eval_data_features=eval_data_features_df.values 137 | eval_data_features_df=pd.DataFrame() 138 | with open(log_run+"_logs.txt", "a") as myfile: 139 | myfile.write(f"\nfinish loading eval_data_solution; {time.time()-start_time}") 140 | 141 | ## just run one case, IR=100 142 | num_impact=100 143 | IR_num=[num_impact] 144 | IR_Str=format_IR(IR_num, split_type) 145 | 146 | logs_file_name=os.path.join(log_folder,f"train_model_{year_start+years_delta}_"+IR_Str) 147 | open(logs_file_name+"_logs.txt", 'a').close() 148 | 149 | batch_size=1000 150 | lr_enc=3*10**-5 151 | rnd_seed=42 152 | hyper_parameter=[batch_size, lr_enc, rnd_seed] 153 | graph_parameter=[year_start,years_delta,vertex_degree_cutoff, min_edges] 154 | user_parameter=[num_class, IR_num, split_type, out_norm] 155 | 156 | impact_classfication_single_feature(full_train_data, eval_data_features, eval_data_solution[:,2], pair_cf_parameter, hyper_parameter, graph_parameter, user_parameter, save_folders, logs_file_name) 157 | 158 | with open(log_run+"_logs.txt", "a") as myfile: 159 | myfile.write(f"\nfinish: {datetime.now()}\n\n") 160 | 161 | 162 | 163 | 
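The training scripts above all reduce the prediction target to a 0/1 label derived from a pair's future citation count and the impact range IR_num (see classify_solution in preprocess_utils.py). Below is a minimal, self-contained sketch of that labelling convention; toy_classify and the example citation counts are illustrative stand-ins, not code from the repository, and the conditional branch only covers the two-group case IR_num=[[0,5], 100] used in this work.

import numpy as np

def toy_classify(citations, IR_num, split_type):
    # Toy illustration of the classify_solution() convention: citation counts -> 0/1 labels.
    labels = np.zeros_like(citations)
    if split_type == 0:                      # binary case, e.g. IR_num=[100]
        labels[citations >= IR_num[0]] = 1   # high-impact pairs
    else:                                    # conditional case, e.g. IR_num=[[0, 5], 100]
        labels[(citations >= IR_num[0][0]) & (citations <= IR_num[0][1])] = 0
        labels[citations >= IR_num[-1]] = 1
    return labels

citations = np.array([0, 3, 42, 150, 7])                      # toy future citation counts
print(toy_classify(citations, [100], split_type=0))           # -> [0 0 0 1 0]
print(toy_classify(citations, [[0, 5], 100], split_type=1))   # -> [0 0 0 1 0]

In the conditional case the real pipeline first drops pairs that fall outside both ranges (prepare_split_datasets keeps only the [0,5] and >=100 groups), so the 0 label is only ever assigned to the low-citation group.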
-------------------------------------------------------------------------------- /train_model_2019_run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import torch 6 | from torch import nn 7 | import torch.nn.functional as F 8 | import random, time 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | from scipy import sparse 12 | from scipy.stats import rankdata 13 | import networkx as nx 14 | import pandas as pd 15 | from collections import defaultdict,Counter 16 | from datetime import datetime, date 17 | from itertools import combinations 18 | from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, auc 19 | from general_utils import * 20 | from preprocess_utils import * 21 | from features_utils import * 22 | from train_model_utils import * 23 | 24 | 25 | 26 | rn_time=random.random()*30 27 | time.sleep(rn_time) 28 | 29 | if __name__ == '__main__': 30 | 31 | 32 | split_type=0 # 1 is for conditional case 33 | out_norm=False # we fix this to False, using the raw scores from the neural network output 34 | num_class=2 # binary classfication, fixed 35 | day_origin = date(1990,1,1) # the baseline time 36 | 37 | vertex_degree_cutoff=1 # fixed, the vertex has at least one edge connecting to it 38 | min_edges=1 # fixed, minimal number of edges that is considered, not used in the work, can be removed 39 | years_delta=3 # year gap is 3 years 40 | year_start=2019-years_delta # train 2016 for 2019 41 | 42 | graph_parameter=[year_start, years_delta, vertex_degree_cutoff, min_edges] # parameters for the knowledge graph 43 | 44 | # create folders and subfolders 45 | # it will create a main folder: 2016_train that contains subfolders: t0_c2_log, t0_c2_net, t0_c2_loss, t0_c2_curve, t0_c2_result 46 | save_folders, log_folder=make_folders(year_start, split_type, num_class, "train") 47 | 48 | log_run=os.path.join(log_folder,f"train_model_{year_start+years_delta}_run_1") # just a log file to check the running status 49 | with open(log_run+"_logs.txt", "a") as myfile: 50 | myfile.write(f"\n\nstart: {datetime.now()}\n") 51 | 52 | # load the full dynamic graph 53 | start_time = time.time() 54 | data_folder="data_concept_graph" # folder that stores the full knowledge graph 55 | graph_file=os.path.join(data_folder,"full_dynamic_graph.parquet") 56 | full_dynamic_graph = pd.read_parquet(graph_file) 57 | with open(log_run+"_logs.txt", "a") as myfile: 58 | myfile.write(f"\n{datetime.now()}: Done, read full_dynamic_graph: {len(full_dynamic_graph)}; elapsed_time: {time.time() - start_time}") 59 | 60 | # load data for preparing different type of features 61 | feature_folder="data_for_features" # folder that stores data used for preparing features 62 | start_time=time.time() 63 | adj_mat_sparse=[] 64 | node_neighbor_list=[] 65 | num_neighbor_list=[] 66 | for yy in [year_start,year_start-1,year_start-2]: 67 | data_file=os.path.join(feature_folder, f"adjacency_matrix_{yy}.gz") 68 | adj_mat=get_adjacency_matrix(full_dynamic_graph, yy, data_file) # load the adjacency_matrix file, if not exists, it will automatically generate one 69 | adj_mat_sparse.append(adj_mat) # adjacency_matrix for year y, y-1, y-2 70 | 71 | curr_node_neighbor=get_node_neighbor(adj_mat) 72 | node_neighbor_list.append(curr_node_neighbor) # get the neighbors of vertices for years y, y-1, y-2 73 | 74 | curr_num_neighbor = np.array(adj_mat.sum(axis=0)).flatten() # array 75 | num_neighbor_list.append(curr_num_neighbor) # get the 
number of neighbors of vertices for years y, y-1, y-2 76 | 77 | with open(log_run+"_logs.txt", "a") as myfile: 78 | myfile.write(f"\n{datetime.now()}: Done, adjacency_matrix_sparse; elapsed_time: {time.time() - start_time}") 79 | 80 | start_time=time.time() 81 | vertex_features=get_all_node_feature(adj_mat_sparse, year_start, feature_folder) # prepare all the node features for a vertex in years y, y-1, y-2 82 | 83 | # load data for preparing different type of citation features 84 | start_time=time.time() 85 | vc_feature_list=[] 86 | for yy in [year_start,year_start-1,year_start-2]: 87 | data_file=os.path.join(feature_folder, f"concept_node_citation_data_{yy}.parquet") # load the citation information for concepts in year yy 88 | vc_df=pd.read_parquet(data_file) 89 | vc_feature=vc_df.values 90 | vc_feature_list.append(vc_feature) 91 | 92 | vertex_cfeatures=get_all_node_cfeature(vc_feature_list) 93 | with open(log_run+"_logs.txt", "a") as myfile: 94 | myfile.write(f"\n{datetime.now()}: Done, vertex_cfeatures; elapsed_time: {time.time() - start_time}") 95 | 96 | pair_cf_parameter=[vc_feature_list, node_neighbor_list, num_neighbor_list, vertex_features, vertex_cfeatures] # later used for pair features and cfeatures 97 | 98 | # load the whole unconnected pairs for training and testing 99 | train_data_folder = 'data_pair_solution' # folder that stores the unconnected pairs and their citation informations in the future 100 | train_pair_file1=os.path.join(train_data_folder,f"unconnected_{year_start}_pair_solution_connected_{year_start+years_delta}_clean.parquet") 101 | train_pair_file2=os.path.join(train_data_folder,f"unconnected_{year_start}_pair_solution_unconnected_{year_start+years_delta}.parquet") 102 | 103 | time_start = time.time() 104 | train_pair_data_yes = pd.read_parquet(train_pair_file1) 105 | with open(log_run+"_logs.txt", "a") as myfile: 106 | myfile.write(f"\nDone, read unconnected_{year_start}_pair_solution_connected_{year_start+years_delta}: {len(train_pair_data_yes)}; elapsed_time: {time.time() - time_start}") 107 | 108 | time_start = time.time() 109 | train_pair_data_no = pd.read_parquet(train_pair_file2) 110 | with open(log_run+"_logs.txt", "a") as myfile: 111 | myfile.write(f"\nDone, read unconnected_{year_start}_pair_solution_unconnected_{year_start+years_delta}: {len(train_pair_data_no)}; elapsed_time: {time.time() - time_start}") 112 | 113 | time_start = time.time() 114 | full_train_data=np.concatenate((train_pair_data_yes.values, train_pair_data_no.values), axis=0) 115 | with open(log_run+"_logs.txt", "a") as myfile: 116 | myfile.write(f"\nDone, combine all: {len(full_train_data)}; elapsed_time: {time.time() - time_start}") 117 | 118 | full_dynamic_graph=pd.DataFrame() 119 | train_pair_data_yes=pd.DataFrame() 120 | train_pair_data_no=pd.DataFrame() 121 | 122 | # load the evaluation data feature and solutions 123 | eval_folder="data_eval" # folder that stores the evaluatuion datasets, unconnected pairs, features, solutions 124 | start_time = time.time() 125 | eval_file=os.path.join(eval_folder,"eval_data_pair_solution.parquet") 126 | eval_data_features_df = pd.read_parquet(eval_file) 127 | eval_data_solution=eval_data_features_df.values 128 | eval_data_features_df=pd.DataFrame() 129 | with open(log_run+"_logs.txt", "a") as myfile: 130 | myfile.write(f"finish loading eval_data_features; {time.time()-start_time}") 131 | 132 | 133 | start_time = time.time() 134 | eval_file=os.path.join(eval_folder,"eval_data_pair_feature.parquet") 135 | eval_data_features_df = 
pd.read_parquet(eval_file) 136 | eval_data_features=eval_data_features_df.values 137 | eval_data_features_df=pd.DataFrame() 138 | with open(log_run+"_logs.txt", "a") as myfile: 139 | myfile.write(f"\nfinish loading eval_data_solution; {time.time()-start_time}") 140 | 141 | # train neural networks for different IR number from 1 to 200 142 | IR_start=1 143 | IR_end=40 # IR_end=200 144 | IR_count=IR_start 145 | while IR_count <= IR_end: 146 | 147 | num_impact=random.randint(IR_start, IR_end) # used for parallel computing 148 | IR_num=[num_impact] 149 | IR_Str=format_IR(IR_num, split_type) 150 | 151 | logs_file_name=os.path.join(log_folder,f"train_model_{year_start+years_delta}_"+IR_Str) 152 | if not os.path.exists(logs_file_name+"_logs.txt"): 153 | current_time=datetime.now() 154 | open(logs_file_name+"_logs.txt", 'a').close() 155 | 156 | batch_size=1000 157 | lr_enc=3*10**-5 158 | rnd_seed=42 159 | hyper_parameter=[batch_size, lr_enc, rnd_seed] 160 | graph_parameter=[year_start,years_delta,vertex_degree_cutoff, min_edges] 161 | user_parameter=[num_class, IR_num, split_type, out_norm] 162 | 163 | impact_classfication(full_train_data, eval_data_features, eval_data_solution[:,2], pair_cf_parameter, hyper_parameter, graph_parameter, user_parameter, save_folders, logs_file_name) 164 | 165 | IR_count+=1 166 | rn_time=random.random()*30 167 | time.sleep(rn_time) 168 | 169 | else: 170 | pass 171 | 172 | with open(log_run+"_logs.txt", "a") as myfile: 173 | myfile.write(f"\nfinish: {datetime.now()}\n\n") 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /train_model_2022_run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import gzip 4 | import copy 5 | import torch 6 | from torch import nn 7 | import torch.nn.functional as F 8 | import random, time 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | from scipy import sparse 12 | from scipy.stats import rankdata 13 | import networkx as nx 14 | import pandas as pd 15 | from collections import defaultdict,Counter 16 | from datetime import datetime, date 17 | from itertools import combinations 18 | from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, auc 19 | from general_utils import * 20 | from preprocess_utils import * 21 | from features_utils import * 22 | from train_model_utils import * 23 | 24 | 25 | 26 | rn_time=random.random()*30 27 | time.sleep(rn_time) 28 | 29 | if __name__ == '__main__': 30 | 31 | 32 | split_type=0 # 1 is for conditional case 33 | out_norm=False # we fix this to False, using the raw scores from the neural network output 34 | num_class=2 # binary classfication, fixed 35 | day_origin = date(1990,1,1) # the baseline time 36 | 37 | vertex_degree_cutoff=1 # fixed, the vertex has at least one edge connecting to it 38 | min_edges=1 # fixed, minimal number of edges that is considered, not used in the work, can be removed 39 | years_delta=3 # year gap is 3 years 40 | year_start=2022-years_delta # train 2019 for 2022 41 | 42 | graph_parameter=[year_start,years_delta,vertex_degree_cutoff, min_edges] # parameters for the knowledge graph 43 | 44 | # create folders and subfolders 45 | # it will create a main folder: 2019_train that contains subfolders: t0_c2_log, t0_c2_net, t0_c2_loss, t0_c2_curve, t0_c2_result 46 | save_folders, log_folder=make_folders(year_start, split_type, num_class, "train") 47 | 48 | log_run=os.path.join(log_folder,f"train_model_{year_start+years_delta}_run_1") # 
just a log file to check the running status 49 | with open(log_run+"_logs.txt", "a") as myfile: 50 | myfile.write(f"\n\nstart: {datetime.now()}\n") 51 | 52 | # load the full dynamic graph 53 | start_time = time.time() 54 | data_folder="data_concept_graph" # folder that stores the full knowledge graph 55 | graph_file=os.path.join(data_folder,"full_dynamic_graph.parquet") 56 | full_dynamic_graph = pd.read_parquet(graph_file) 57 | with open(log_run+"_logs.txt", "a") as myfile: 58 | myfile.write(f"\n{datetime.now()}: Done, read full_dynamic_graph: {len(full_dynamic_graph)}; elapsed_time: {time.time() - start_time}") 59 | 60 | # load data for preparing different type of features 61 | feature_folder="data_for_features" # folder that stores data used for preparing features 62 | start_time=time.time() 63 | adj_mat_sparse=[] 64 | node_neighbor_list=[] 65 | num_neighbor_list=[] 66 | for yy in [year_start,year_start-1,year_start-2]: 67 | data_file=os.path.join(feature_folder, f"adjacency_matrix_{yy}.gz") # load the adjacency_matrix file, if not exists, it will automatically generate one 68 | adj_mat=get_adjacency_matrix(full_dynamic_graph, yy, data_file) # adjacency_matrix for year y, y-1, y-2 69 | adj_mat_sparse.append(adj_mat) 70 | 71 | curr_node_neighbor=get_node_neighbor(adj_mat) 72 | node_neighbor_list.append(curr_node_neighbor) # get the neighbors of vertices for years y, y-1, y-2 73 | 74 | curr_num_neighbor = np.array(adj_mat.sum(axis=0)).flatten() 75 | num_neighbor_list.append(curr_num_neighbor) # get the number of neighbors of vertices for years y, y-1, y-2 76 | 77 | with open(log_run+"_logs.txt", "a") as myfile: 78 | myfile.write(f"\n{datetime.now()}: Done, adjacency_matrix_sparse; elapsed_time: {time.time() - start_time}") 79 | 80 | start_time=time.time() 81 | vertex_features=get_all_node_feature(adj_mat_sparse, year_start, feature_folder) # prepare all the node features for a vertex in years y, y-1, y-2 82 | 83 | # load data for preparing different type of citation features 84 | start_time=time.time() 85 | vc_feature_list=[] 86 | for yy in [year_start,year_start-1,year_start-2]: 87 | data_file=os.path.join(feature_folder, f"concept_node_citation_data_{yy}.parquet") # load the citation information for concepts in year yy 88 | vc_df=pd.read_parquet(data_file) 89 | vc_feature=vc_df.values 90 | vc_feature_list.append(vc_feature) 91 | 92 | vertex_cfeatures=get_all_node_cfeature(vc_feature_list) 93 | with open(log_run+"_logs.txt", "a") as myfile: 94 | myfile.write(f"\n{datetime.now()}: Done, vertex_cfeatures; elapsed_time: {time.time() - start_time}") 95 | 96 | pair_cf_parameter=[vc_feature_list, node_neighbor_list, num_neighbor_list, vertex_features, vertex_cfeatures] # later used for pair features and cfeatures 97 | 98 | # load the whole unconnected pairs for training and testing 99 | train_data_folder = 'data_pair_solution' 100 | train_pair_file1=os.path.join(train_data_folder,f"unconnected_{year_start}_pair_solution_connected_{year_start+years_delta}_clean.parquet") 101 | train_pair_file2=os.path.join(train_data_folder,f"unconnected_{year_start}_pair_solution_unconnected_{year_start+years_delta}.parquet") 102 | 103 | time_start = time.time() 104 | train_pair_data_yes = pd.read_parquet(train_pair_file1) 105 | with open(log_run+"_logs.txt", "a") as myfile: 106 | myfile.write(f"\nDone, read unconnected_{year_start}_pair_solution_connected_{year_start+years_delta}: {len(train_pair_data_yes)}; elapsed_time: {time.time() - time_start}") 107 | 108 | time_start = time.time() 109 | 
train_pair_data_no = pd.read_parquet(train_pair_file2) 110 | with open(log_run+"_logs.txt", "a") as myfile: 111 | myfile.write(f"\nDone, read unconnected_{year_start}_pair_solution_unconnected_{year_start+years_delta}: {len(train_pair_data_no)}; elapsed_time: {time.time() - time_start}") 112 | 113 | time_start = time.time() 114 | full_train_data=np.concatenate((train_pair_data_yes.values, train_pair_data_no.values), axis=0) 115 | with open(log_run+"_logs.txt", "a") as myfile: 116 | myfile.write(f"\nDone, combine all: {len(full_train_data)}; elapsed_time: {time.time() - time_start}") 117 | 118 | full_dynamic_graph=pd.DataFrame() 119 | train_pair_data_yes=pd.DataFrame() 120 | train_pair_data_no=pd.DataFrame() 121 | 122 | # train neural networks for different IR number from 1 to 200 123 | IR_start=1 124 | IR_end=40 # IR_end=200 125 | IR_count=IR_start 126 | while IR_count <= IR_end: 127 | 128 | num_impact=random.randint(IR_start, IR_end) # used for parallel computing 129 | IR_num=[num_impact] 130 | IR_Str=format_IR(IR_num, split_type) 131 | 132 | logs_file_name=os.path.join(log_folder,f"train_model_{year_start+years_delta}_"+IR_Str) 133 | if not os.path.exists(logs_file_name+"_logs.txt"): 134 | current_time=datetime.now() 135 | open(logs_file_name+"_logs.txt", 'a').close() 136 | 137 | batch_size=1000 138 | lr_enc=3*10**-5 139 | rnd_seed=42 140 | hyper_parameter=[batch_size, lr_enc, rnd_seed] 141 | graph_parameter=[year_start,years_delta,vertex_degree_cutoff, min_edges] 142 | user_parameter=[num_class, IR_num, split_type, out_norm] 143 | 144 | impact_classfication(full_train_data, [], [], pair_cf_parameter, hyper_parameter, graph_parameter, user_parameter, save_folders, logs_file_name) 145 | 146 | IR_count+=1 147 | rn_time=random.random()*30 148 | time.sleep(rn_time) 149 | 150 | else: 151 | pass 152 | 153 | with open(log_run+"_logs.txt", "a") as myfile: 154 | myfile.write(f"\nfinish: {datetime.now()}\n\n") 155 | 156 | 157 | 158 | --------------------------------------------------------------------------------
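Both train_model_2019_run.py and train_model_2022_run.py sweep the impact range by drawing a random IR value on each iteration and treating the existence of the per-IR log file as a claim, so that several copies of the same script launched in parallel on a cluster end up covering different IR values. The sketch below isolates that pattern; claim_and_train, train_fn and the IR_ file prefix are illustrative names only (the scripts themselves build the file name with format_IR and call impact_classfication).

import os, random, time

def claim_and_train(log_folder, IR_start=1, IR_end=40, train_fn=None):
    os.makedirs(log_folder, exist_ok=True)
    for _ in range(IR_start, IR_end + 1):              # one attempt per IR value, like the while loop in the scripts
        num_impact = random.randint(IR_start, IR_end)  # random pick spreads concurrent jobs over the range
        logs_file_name = os.path.join(log_folder, f"IR_{num_impact}")
        if os.path.exists(logs_file_name + "_logs.txt"):
            continue                                     # another job has already claimed this IR value
        open(logs_file_name + "_logs.txt", "a").close()  # claim it by creating the log file
        if train_fn is not None:
            train_fn(num_impact, logs_file_name)         # e.g. impact_classfication(...) in the real scripts
        time.sleep(random.random() * 30)                 # desynchronise before the next claim

This is best-effort coordination: two jobs can still pick the same IR value between the existence check and the file creation, and the random sleeps at start-up and between iterations only make such collisions unlikely.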