├── .gitignore
├── Licence.md
├── MD_simulation_on_alanine_dipeptide
│   └── current_work
│       ├── .gitignore
│       ├── README.md
│       ├── resources
│       │   └── .gitignore
│       ├── snapshot.sh
│       ├── src
│       │   ├── .gitignore
│       │   ├── ANN_simulation.py
│       │   ├── autoencoders.py
│       │   ├── biased_simulation.py
│       │   ├── biased_simulation_general.py
│       │   ├── config.py
│       │   ├── coordinates_data_files_list.py
│       │   ├── generate_coordinates.py
│       │   ├── helper_func.py
│       │   ├── kernel_tica.py
│       │   ├── main_work.py
│       │   ├── molecule_spec_sutils.py
│       │   ├── remove_water_mol.py
│       │   ├── structural_alignment.py
│       │   ├── tf_load.py
│       │   ├── train_network_and_save_for_iter.py
│       │   └── workqueue.py
│       ├── target
│       │   └── .gitignore
│       └── tests
│           ├── .gitignore
│           ├── ANN_simulation_test.py
│           └── Makefile
├── README.md
├── archive
│   └── plumed_adp.zip
└── figures
    ├── autoencoder_1.png
    ├── autoencoder_2.png
    ├── diagram_autoencoder.png
    └── hierarchical_autoencoder.png

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
**/current_work/.idea/*
*.sge
*.sge.*
*.mat
**/.ipynb_checkpoints/

--------------------------------------------------------------------------------
/Licence.md:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Wei Chen

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/.gitignore:
--------------------------------------------------------------------------------
previous_runs/**
.spyderworkspace
.vscode/**

--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiHelloWorld/accelerated_sampling_with_autoencoder/fe2b98bc81fc0b30db42ca8a83e23adb775a487d/MD_simulation_on_alanine_dipeptide/current_work/README.md

--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/resources/.gitignore:
--------------------------------------------------------------------------------
*.pkl
*.txt
*.pdb
*.png
charmm36.xml

--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/snapshot.sh:
--------------------------------------------------------------------------------
#!/bin/bash

prefix=$1

current_time=$(date -u +%Y%m%d%H%M%S)
dir_name=${prefix}"/ss_"${current_time}

mkdir -p ${dir_name}

for item in README.md resources src target; do
    echo "copying "${item}
    rsync -ar --exclude='.*' ${item} ${dir_name}
done

--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/src/.gitignore:
--------------------------------------------------------------------------------
*.png
nohup.out
temp.py
*.ipynb
ANN.py
ANN.pyc
ANN_wrap.cxx
*.pkl
temp_*.py
temp_*.sh
*.pdb
*_coordinates.txt
temp*.txt
temp/**
*.hdf5
*.npy
*.pdf
temp_model.dot
HDE_paper_notebooks/**

--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/src/ANN_simulation.py:
--------------------------------------------------------------------------------
from config import *  # configuration file
from cluster_management import *
from autoencoders import *
from helper_func import *

"""note that all configurations for a class should be in function __init__(), and take configuration parameters
from config.py
"""

class plotting(object):
    """this class implements different plotting functionality
    """

    def __init__(self, network=None):
        self._network = network

    @staticmethod
    def plot_fve_L_method(fve, CV_min, CV_max, fig, ax):
        temp_fve = np.array(fve).flatten()
        temp_fve = temp_fve.reshape(CV_max - CV_min, temp_fve.shape[0] // (CV_max - CV_min))  # integer division: reshape() requires ints in Python 3
        evaluation_values = np.mean(temp_fve, axis=-1)
        optimal_num, x_data, y_data_left, y_data_right = Sutils.L_method(evaluation_values, list(range(CV_min, CV_max + 1)))
        x_data = [_ - CV_min for _ in x_data]
        ax.plot(x_data, y_data_left)
        ax.plot(x_data, y_data_right)
        ax.scatter(list(range(CV_max - CV_min + 1)), evaluation_values)
        df = pd.DataFrame(temp_fve.T)
        sns.boxplot(df, ax=ax)
        ax.set_xticklabels(list(range(CV_min, CV_max + 1)))
        ax.set_ylim([evaluation_values.min() - 0.1, evaluation_values.max() + 0.1])
        ax.set_xlabel('num of CVs')
        ax.set_ylabel('FVE')
        return fig, ax
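    # A note on the L-method used above (Sutils.L_method is defined elsewhere in this
    # repo and is not shown in this dump): to pick the optimal number of CVs, fit one
    # straight line to the left part of the FVE-vs-num-of-CVs curve and another to the
    # right part, and take the split point that minimizes the total fitting error; that
    # knee is where adding more CVs stops paying off. For instance (hypothetical
    # numbers), FVE = [0.30, 0.55, 0.80, 0.83, 0.85, 0.86] has its knee at the third
    # value: the first three points rise steeply along one line, while the rest lie
    # almost flat along another.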

    def plotting_with_coloring_option(self, plotting_space,
                                      fig_object,
                                      axis_object,
                                      network=None,
                                      input_data_for_plotting=None,  # input could be cossin or Cartesian
                                      color_option='other',
                                      other_coloring=None,
                                      contain_title=True,
                                      title=None,
                                      axis_ranges=None,
                                      contain_colorbar=True,
                                      colorbar_label=None,
                                      smoothing_using_RNR=False,  # smooth the coloring values for data points using RadiusNeighborsRegressor()
                                      variance_using_RNR=False,   # get variance of coloring values over space using RNR
                                      smoothing_radius=0.1,
                                      enable_mousing_clicking_event=False,
                                      related_coor_list_obj=None,
                                      saving_snapshot_mode='single_point'
                                      ):
        """
        by default, we are using training data, and we also allow external data input
        :param related_coor_list_obj: this must be specified when enable_mousing_clicking_event == True
        """
        if enable_mousing_clicking_event and related_coor_list_obj is None:
            raise Exception('related_coor_list_obj not defined!')

        if network is None: network = self._network
        if title is None: title = "plotting in %s, coloring with %s" % (plotting_space, color_option)  # default title
        if input_data_for_plotting is None:
            input_data = self._network._data_set
        else:
            input_data = input_data_for_plotting

        if plotting_space == "PC":
            PCs_to_plot = network.get_PCs(input_data=input_data)
            (x, y) = ([item[0] for item in PCs_to_plot], [item[1] for item in PCs_to_plot])
            labels = ["PC1", "PC2"]
        else:
            raise Exception('plotting_space not defined!')

        # coloring
        if color_option == 'step':
            coloring = list(range(len(x)))
        elif color_option == 'other':
            assert (len(other_coloring) == len(x)), (len(other_coloring), len(x))
            coloring = other_coloring
            if smoothing_using_RNR:  # smooth coloring using RNR
                r_neigh = RadiusNeighborsRegressor(radius=smoothing_radius, weights='uniform')
                temp_coors = [list(item) for item in zip(x, y)]
                r_neigh.fit(temp_coors, coloring)
                coloring = r_neigh.predict(temp_coors)
            elif variance_using_RNR:  # get variance of the coloring values over space, using RNR
                r_neigh = RadiusNeighborsRegressor(radius=smoothing_radius, weights='uniform')
                temp_coors = [list(item) for item in zip(x, y)]
                r_neigh.fit(temp_coors, coloring)
                coloring_mean = r_neigh.predict(temp_coors)
                r_neigh.fit(temp_coors, np.multiply(np.array(coloring), np.array(coloring)))
                coloring_square_mean = r_neigh.predict(temp_coors)
                coloring = coloring_square_mean - np.multiply(coloring_mean, coloring_mean)
        else:
            raise Exception('color_option not defined!')

        im = axis_object.scatter(x, y, s=4, c=coloring, cmap='gist_rainbow', picker=True)
        axis_object.set_xlabel(labels[0])
        axis_object.set_ylabel(labels[1])
        if contain_title:
            axis_object.set_title(title)

        if axis_ranges is not None:
            axis_object.set_xlim(axis_ranges[0])
            axis_object.set_ylim(axis_ranges[1])

        if contain_colorbar:
            temp_colorbar = fig_object.colorbar(im, ax=axis_object)
            if colorbar_label is not None:
                temp_colorbar.set_label(str(colorbar_label))
        # mouse clicking event
        if enable_mousing_clicking_event:
            folder_to_store_these_frames = 'temp_pdb'
            if not os.path.exists(folder_to_store_these_frames):
                subprocess.check_output(['mkdir', folder_to_store_these_frames])

            # should calculate step_interval
            total_num_of_lines_in_coor_files = sum(related_coor_list_obj.get_list_of_line_num_of_coor_data_file())
            step_interval = int(total_num_of_lines_in_coor_files / len(input_data))

            if saving_snapshot_mode == 'multiple_points':
                axis_object.text(-1.2, -1.2, 'save_frames', picker=True, fontsize=12)  # TODO: find better coordinates

                global temp_list_of_coor_index  # TODO: use a better way instead of a global variable
                temp_list_of_coor_index = []
                def onclick(event):
                    global temp_list_of_coor_index
                    if isinstance(event.artist, matplotlib.text.Text):
                        if event.artist.get_text() == 'save_frames':
                            print(temp_list_of_coor_index)
                            related_coor_list_obj.write_pdb_frames_into_file_with_list_of_coor_index(temp_list_of_coor_index,
                                                folder_to_store_these_frames + '/temp_frames.pdb')  # TODO: better naming

                            temp_list_of_coor_index = []  # output pdb file and clean up
                            print('done saving frames!')
                    elif isinstance(event.artist, matplotlib.collections.PathCollection):
                        ind_list = list(event.ind)
                        print('onclick:')
                        temp_list_of_coor_index += [item * step_interval for item in ind_list]  # should include step_interval

                        for item in ind_list:
                            print(item, x[item], y[item])
                    return

            elif saving_snapshot_mode == 'single_point':
                global temp_global_index_click
                temp_global_index_click = 0
                def onclick(event):
                    global temp_global_index_click
                    if isinstance(event.artist, matplotlib.collections.PathCollection):
                        ind_list = list(event.ind)
                        print('onclick:')
                        for item in ind_list:
                            print(item, x[item], y[item])

                        temp_ind_list = [item * step_interval for item in ind_list]  # should include step_interval
                        average_x = sum([x[item] for item in ind_list]) / len(ind_list)
                        average_y = sum([y[item] for item in ind_list]) / len(ind_list)
                        # notation on the graph
                        axis_object.scatter([average_x], [average_y], s=50, marker='s')
                        axis_object.text(average_x, average_y, '%d' % temp_global_index_click, picker=False, fontsize=15)
                        out_file_name = folder_to_store_these_frames + '/%02d_temp_frames_[%f,%f].pdb' % \
                                        (temp_global_index_click, average_x, average_y)

                        temp_global_index_click += 1
                        related_coor_list_obj.write_pdb_frames_into_file_with_list_of_coor_index(temp_ind_list,
                                            out_file_name=out_file_name)
                        # need to verify that PCs generated from this output pdb file are consistent with those in the selected list
                        molecule_type.generate_coordinates_from_pdb_files(path_for_pdb=out_file_name)
                        if CONFIG_48 == "cossin":
                            temp_input_data = molecule_type.get_many_cossin_from_coordinates_in_list_of_files(
                                list_of_files=[out_file_name.replace('.pdb', '_coordinates.npy')])
                        elif CONFIG_48 in ("Cartesian", "pairwise_distance"):  # membership test (the previous `== "Cartesian" or 'pairwise_distance'` was always true)
                            scaling_factor = CONFIG_49
                            temp_input_data = np.load(out_file_name.replace('.pdb', '_coordinates.npy')) / scaling_factor
                            temp_input_data = Sutils.remove_translation(temp_input_data)
                        else:
                            raise Exception("input data type error")

                        PCs_of_points_selected = network.get_PCs(input_data=temp_input_data)
                        assert_almost_equal(PCs_of_points_selected, np.array([[x[item], y[item]] for item in ind_list]), decimal=4)

                    return
            else:
                raise Exception('saving_snapshot_mode error')

            fig_object.canvas.mpl_connect('pick_event', onclick)

        return fig_object, axis_object, im
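    # How the pick events above work, briefly: scatter(..., picker=True) makes the
    # returned PathCollection pickable, so a mouse click near a point fires a single
    # 'pick_event' whose event.ind lists the indices of all points within the pick
    # radius, and whose event.artist identifies the clicked object (a Text for the
    # 'save_frames' label, a PathCollection for data points). mpl_connect() registers
    # onclick() as the handler; note that pick events require an interactive matplotlib
    # backend, not the 'agg' backend selected in config.py.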

    def density_plotting(self, fig_object, axis_object,
                         network=None,
                         data_for_plotting=None,
                         n_levels=40
                         ):
        if network is None: network = self._network
        temp_data = self._network._data_set if data_for_plotting is None else data_for_plotting

        x = [item[0] for item in network.get_PCs(temp_data)]
        y = [item[1] for item in network.get_PCs(temp_data)]

        df = pd.DataFrame({'x': x, 'y': y})
        sns.kdeplot(df.x, df.y, ax=axis_object, n_levels=n_levels)

        return fig_object, axis_object

    @staticmethod
    def plotting_potential_centers(fig_object, axis_object,
                                   list_of_coor_data_files, marker='x'):
        potential_centers = [single_biased_simulation_data(None, item)._potential_center for item in list_of_coor_data_files]
        [x, y] = list(zip(*potential_centers))

        axis_object.scatter(x, y, marker=marker)
        return fig_object, axis_object

    def equilibration_check(self, coor_file_folder,
                            scaling_factor, num_of_splits, save_fig=True,
                            starting_index_of_last_few_frames=0
                            ):
        """this function checks equilibration by plotting each individual run in PC space, colored with 'step';
        note: inputs should be Cartesian coordinates, the case of cossin input is not implemented
        """
        import scipy.stats  # import the submodule explicitly ("import scipy" alone does not guarantee scipy.stats is available)
        ks_stats_list = []
        temp_arrow_list = []
        potential_centers_list = []
        temp_arrow_start_list = []
        _1 = coordinates_data_files_list([coor_file_folder])
        for item in _1.get_list_of_coor_data_files():
            data = np.load(item)[starting_index_of_last_few_frames:] / scaling_factor
            data = Sutils.remove_translation(data)
            potential_centers_list.append([float(item_1) for item_1 in item.split('_pc_[')[1].split(']')[0].split(',')])
            # do analysis using K-S test
            PCs = self._network.get_PCs(data)
            dim_of_PCs = PCs.shape[1]
            PCs = PCs[:int(PCs.shape[0]) // num_of_splits * num_of_splits]  # in case PCs cannot be split evenly
            samples_for_KS_testing = np.split(PCs, num_of_splits)
            ks_stats = max([
                sum(
                    [scipy.stats.ks_2samp(samples_for_KS_testing[xx][:, subindex], samples_for_KS_testing[yy][:, subindex])[0]
                     for subindex in range(dim_of_PCs)
                     ]) / float(dim_of_PCs)
                for xx in range(num_of_splits) for yy in range(xx + 1, num_of_splits)]
            )
            ks_stats_list.append(ks_stats)
            # plot arrow from center of first split to last split
            temp_arrow_start = np.average(samples_for_KS_testing[0], axis=0)
            temp_arrow_end = np.average(samples_for_KS_testing[-1], axis=0)
            temp_arrow = (temp_arrow_end - temp_arrow_start)
            assert (temp_arrow.shape[0] == 2), temp_arrow.shape[0]
            temp_arrow_list.append(temp_arrow)
            temp_arrow_start_list.append(temp_arrow_start)

            fig, ax = plt.subplots()
            self.plotting_with_coloring_option("PC", fig, ax, input_data_for_plotting=data, color_option='step',
                                               title=item.strip().split('/')[-1])
            ax.quiver([temp_arrow_start[0]], [temp_arrow_start[1]], [temp_arrow[0]], [temp_arrow[1]],
                      units="xy", scale=1)
            if save_fig:
                fig.savefig(ax.get_title() + '.png')

        # plotting K-S stats
        potential_centers_list = np.array(potential_centers_list)
        temp_arrow_list = np.array(temp_arrow_list)
        temp_arrow_start_list = np.array(temp_arrow_start_list)
        fig, ax = plt.subplots()
        im = ax.scatter(potential_centers_list[:, 0], potential_centers_list[:, 1], c=ks_stats_list, cmap="Blues")
        col_bar = fig.colorbar(im, ax=ax)
        col_bar.set_label("KS value")
        for pc, arr_start in zip(potential_centers_list, temp_arrow_start_list):
            # connect potential center to starting point of arrow with dashed line
            ax.plot([pc[0], arr_start[0]], [pc[1], arr_start[1]], linestyle='dotted')

        ax.quiver(temp_arrow_start_list[:, 0], temp_arrow_start_list[:, 1],
                  temp_arrow_list[:, 0], temp_arrow_list[:, 1],
                  units='xy', scale=1)
        ax.set_xlabel("PC1")
        ax.set_ylabel("PC2")
        fig.set_size_inches((10, 10))
        fig.savefig("temp_harmonic_centers_and_stats.png")

        return
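
# For reference, a minimal self-contained sketch (hypothetical data, no repo dependencies)
# of the split-and-compare idea behind equilibration_check() above: pieces of a
# well-equilibrated trajectory are samples from the same distribution, so the two-sample
# Kolmogorov-Smirnov statistic between them stays small, while a drifting trajectory
# gives a large one.
def _sketch_ks_equilibration_idea():
    import numpy as np
    import scipy.stats
    rng = np.random.default_rng(0)
    equilibrated = rng.normal(0.0, 1.0, size=2000)  # stationary data
    drifting = np.concatenate([rng.normal(0.0, 1.0, 1000), rng.normal(2.0, 1.0, 1000)])  # mean drifts halfway through
    for name, traj in [('equilibrated', equilibrated), ('drifting', drifting)]:
        first_half, last_half = np.split(traj, 2)
        ks_stat = scipy.stats.ks_2samp(first_half, last_half)[0]
        print(name, ks_stat)  # the drifting trajectory gives a much larger KS statistic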

class machine_independent_run(object):
    def __init__(self):
        return

    @staticmethod
    def run_commands(machine_to_run_simulations, commands, cuda, max_num_failed_jobs):
        if machine_to_run_simulations == 'cluster':
            cluster_management.create_sge_files_for_commands(list_of_commands_to_run=commands,
                                                             run_on_gpu=cuda)
            cluster_management.monitor_status_and_submit_periodically(num=CONFIG_14,
                                                                      monitor_mode='normal',
                                                                      num_of_running_jobs_when_allowed_to_stop=500)  # should not loop forever
        elif machine_to_run_simulations == 'local':
            total_num_failed_jobs = Helper_func.run_multiple_jobs_on_local_machine(commands=commands)
            assert (total_num_failed_jobs < max_num_failed_jobs)
        else:
            raise Exception('machine type error')
        return
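
# Helper_func.run_multiple_jobs_on_local_machine used above lives in helper_func.py,
# which is not shown in this dump; a minimal sketch of what such a runner could look
# like, returning the failure count the way run_commands() expects:
def _sketch_local_job_runner(commands):
    import subprocess
    num_failed = 0
    for cmd in commands:
        # the stored commands are full shell strings, hence shell=True
        if subprocess.call(cmd, shell=True) != 0:
            num_failed += 1
    return num_failed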

class iteration(object):
    def __init__(self, index,
                 network=None  # if you want to start with an existing network, assign a value to "network"
                 ):
        self._index = index
        self._network = network

    @staticmethod
    def preprocessing(machine_to_run_simulations=CONFIG_24, target_folder=None):
        """
        1. align structures
        2. generate coordinate files
        """
        reference_suffix_list = CONFIG_63
        reference_configs = CONFIG_62
        atom_selection_list = CONFIG_64
        assert (len(reference_configs) == len(reference_suffix_list)), (
            len(reference_configs), len(reference_suffix_list))
        num_of_reference_configs = len(reference_configs)
        if target_folder is not None:
            temp_target_folder = target_folder
        else:
            if isinstance(molecule_type, Trp_cage):
                temp_target_folder = '../target/Trp_cage'
            elif isinstance(molecule_type, Alanine_dipeptide):
                temp_target_folder = '../target/Alanine_dipeptide'
            else:
                raise Exception("molecule type error")

        if CONFIG_48 == 'Cartesian':
            for _1 in range(num_of_reference_configs):
                temp_command_list = ['python', 'structural_alignment.py', temp_target_folder,
                                     '--ref', reference_configs[_1], '--suffix', reference_suffix_list[_1],
                                     '--atom_selection', atom_selection_list[_1]
                                     ]
                if machine_to_run_simulations == 'local':
                    subprocess.check_output(temp_command_list)
                elif machine_to_run_simulations == 'cluster':
                    temp_command = ' '.join(['"%s"' % item for item in temp_command_list]) + ' 2> /dev/null '  # TODO: does it work by adding quotation marks to everything?
                    cluster_management.run_a_command_and_wait_on_cluster(command=temp_command)
                else:
                    raise Exception('machine type error')

        molecule_type.generate_coordinates_from_pdb_files(path_for_pdb=temp_target_folder)
        return

    def train_network_and_save(self, machine_to_run_simulations=CONFIG_24,
                               training_interval=1, num_of_trainings=CONFIG_13):
        """num_of_trainings is the number of trainings that we are going to run; we then
        pick the one that has the largest fraction of variance explained (FVE),
        which helps avoid using a network of very poor quality
        """
        command = 'python ../src/train_network_and_save_for_iter.py %d --training_interval %d --num_of_trainings %d' % \
                  (self._index, training_interval, num_of_trainings)
        if machine_to_run_simulations == 'local':
            print(command)
            temp_output = subprocess.check_output(command.strip().split(' ')).decode("utf-8")
        elif machine_to_run_simulations == 'cluster':
            command = 'OMP_NUM_THREADS=6 ' + command
            job_id = cluster_management.run_a_command_and_wait_on_cluster(command=command, ppn=10)
            output_file, _ = cluster_management.get_output_and_err_with_job_id(job_id=job_id)
            temp_output = subprocess.check_output(['cat', output_file]).decode("utf-8")
        else:
            raise Exception('machine type error')
        autoencoder_filename = temp_output.strip().split(
            'excited! this is the name of best network: ')[1].strip().split('\n')[0]  # locate filename in output

        print(temp_output)
        return autoencoder.load_from_pkl_file(autoencoder_filename)

    def run_simulation(self, machine_to_run_simulations=CONFIG_24, commands=None, cuda=None):
        if cuda is None: cuda = (CONFIG_23 == 'CUDA')
        if commands is None:
            commands = self._network.get_commands_for_further_biased_simulations()
        machine_independent_run.run_commands(machine_to_run_simulations, commands, cuda,
                                             CONFIG_31)  # we do not allow more than CONFIG_31 failed simulations in each iteration

        # the next step should run only when the jobs are done, check this
        if CONFIG_29:
            molecule_type.remove_water_mol_and_Cl_from_pdb_file(preserve_original_file=CONFIG_50)
        return
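
# For reference, the alignment step in iteration.preprocessing() above shells out to
# structural_alignment.py once per reference configuration; with hypothetical values
# standing in for the CONFIG_62/CONFIG_63/CONFIG_64 entries, the assembled command
# would look like:
#
#   python structural_alignment.py ../target/Trp_cage --ref ../resources/1l2y.pdb \
#       --suffix 1l2y --atom_selection "backbone"
#
# (structural_alignment.py is part of this repo but its contents are not shown in this dump)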

class simulation_with_ANN_main(object):
    def __init__(self, num_of_iterations=1,
                 initial_iteration=None,  # this is where we start with
                 training_interval=None,
                 ):
        self._num_of_iterations = num_of_iterations
        self._initial_iteration = initial_iteration
        self._training_interval = training_interval
        print("running iterations for system: %s" % CONFIG_30)
        return

    def run_one_iteration(self, one_iteration):
        if one_iteration is None:  # the None check must come before one_iteration is used
            one_iteration = iteration(1, network=None)
        one_iteration.preprocessing()
        if one_iteration._network is None:
            one_iteration._network = one_iteration.train_network_and_save(
                training_interval=self._training_interval)  # train it if it is empty
        one_iteration._network.write_coefficients_of_connections_into_file()
        print('running this iteration #index = %d' % one_iteration._index)
        one_iteration.run_simulation()
        return

    def run_mult_iterations(self, num=None):
        if num is None: num = self._num_of_iterations

        current_iter = self._initial_iteration
        if current_iter is None:
            current_iter = iteration(1, network=None)  # otherwise current_iter._index below would fail
        for _ in range(num):
            self.run_one_iteration(current_iter)
            next_index = current_iter._index + 1
            current_iter = iteration(next_index, None)

        return
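
# The control flow of simulation_with_ANN_main above, reduced to a self-contained
# skeleton (the names here are hypothetical placeholders, not repo functions): each
# iteration preprocesses the accumulated data, trains an autoencoder to obtain CVs if
# none is given, then launches biased simulations along those CVs; the next iteration
# retrains from scratch on the enlarged data set.
def _sketch_accelerated_sampling_loop(num_iterations, preprocess, train, sample):
    network = None
    for index in range(1, num_iterations + 1):
        preprocess()
        if network is None:
            network = train(index)
        sample(network)
        network = None  # force retraining next iteration, as run_mult_iterations does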

class single_biased_simulation_data(object):
    def __init__(self, my_network, file_for_single_biased_simulation_coor):
        """my_network is the corresponding network for this biased simulation"""
        self._file_for_single_biased_simulation_coor = file_for_single_biased_simulation_coor
        self._my_network = my_network
        temp_potential_center_string = file_for_single_biased_simulation_coor.split('_pc_[')[1].split(']')[0]
        self._potential_center = [float(item) for item in temp_potential_center_string.split(',')]
        self._force_constant = float(file_for_single_biased_simulation_coor.split('output_fc_')[1].split('_pc_')[0])
        self._number_of_data = float(subprocess.check_output(['wc', '-l', file_for_single_biased_simulation_coor]).decode("utf-8").split()[0])

        if self._my_network is not None:
            if self._my_network._hidden_layers_type[1] == "Circular":
                self._dimension_of_PCs = self._my_network._node_num[2] // 2
            else:
                self._dimension_of_PCs = self._my_network._node_num[2]

        return

    def get_center_of_data_cloud_in_this_biased_simulation(self, input_data_type):
        if input_data_type == 'cossin':
            PCs = self._my_network.get_PCs(molecule_type.get_many_cossin_from_coordinates_in_list_of_files(
                [self._file_for_single_biased_simulation_coor]))
        elif input_data_type == 'Cartesian':
            scaling_factor = CONFIG_49
            temp_data = np.load(self._file_for_single_biased_simulation_coor) / scaling_factor
            temp_data = Sutils.remove_translation(temp_data)
            PCs = self._my_network.get_PCs(temp_data)
        else:
            raise Exception('error input_data_type')

        assert (len(PCs[0]) == self._dimension_of_PCs)
        assert (len(PCs) == self._number_of_data)
        PCs_transpose = list(zip(*PCs))
        center_of_data_cloud = [sum(x) / len(x) for x in PCs_transpose]
        return center_of_data_cloud

    def get_offset_between_potential_center_and_data_cloud_center(self, input_data_type):
        """see if the push in this biased simulation actually works; a large offset means it
        does not work well
        """
        PCs_average = self.get_center_of_data_cloud_in_this_biased_simulation(input_data_type)
        offset = [PCs_average[item] - self._potential_center[item] for item in range(self._dimension_of_PCs)]
        return offset
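
# The file-name convention single_biased_simulation_data relies on, demonstrated on a
# hypothetical file name (the splits mirror the ones in __init__ above):
def _sketch_parse_biased_output_filename():
    fname = 'out/output_fc_500.000000_pc_[-0.3,0.7].npy'  # hypothetical example
    force_constant = float(fname.split('output_fc_')[1].split('_pc_')[0])
    potential_center = [float(v) for v in fname.split('_pc_[')[1].split(']')[0].split(',')]
    print(force_constant, potential_center)  # 500.0 [-0.3, 0.7]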

--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/src/biased_simulation.py:
--------------------------------------------------------------------------------
"""
This file runs biased simulation for alanine dipeptide only; it serves as the test case
for the more general biased_simulation_general.py, which can easily be extended to other
new systems.
"""

from ANN_simulation import *
from simtk.openmm.app import *
from simtk.openmm import *
from simtk.unit import *
from sys import stdout
import ast, argparse

import os
import datetime

from config import *

parser = argparse.ArgumentParser()
parser.add_argument("record_interval", type=int, help="interval to take snapshots")
parser.add_argument("total_num_of_steps", type=int, help="total number of simulation steps")
parser.add_argument("force_constant", type=float, help="force constant")
parser.add_argument("folder_to_store_output_files", type=str, help="folder to store the output pdb and report files")
parser.add_argument("autoencoder_info_file", type=str, help="file to store autoencoder information (coefficients)")
parser.add_argument("pc_potential_center", type=str, help="potential center (should include 'pc_' as prefix)")
parser.add_argument("--out_traj", type=str, default=None, help="output trajectory file")
parser.add_argument("--layer_types", type=str, default=str(CONFIG_27), help='layer types')
parser.add_argument("--num_of_nodes", type=str, default=str(CONFIG_3[:3]), help='number of nodes in each layer')
parser.add_argument("--temperature", type=int, default=CONFIG_21, help='simulation temperature')
parser.add_argument("--data_type_in_input_layer", type=int, default=1, help='data_type_in_input_layer, 0 = cos/sin, 1 = Cartesian coordinates')
parser.add_argument("--platform", type=str, default=CONFIG_23, help='platform on which the simulation is run')
parser.add_argument("--scaling_factor", type=float, default=float(CONFIG_49), help='scaling_factor for ANN_Force')
parser.add_argument("--starting_pdb_file", type=str, default='../resources/alanine_dipeptide.pdb', help='the input pdb file to start simulation')
parser.add_argument("--starting_frame", type=int, default=0, help="index of starting frame in the starting pdb file")
parser.add_argument("--minimize_energy", type=int, default=1, help='whether to minimize energy (1 = yes, 0 = no)')
parser.add_argument("--equilibration_steps", type=int, default=1000, help="number of steps for the equilibration process")
# next few options are for metadynamics
parser.add_argument("--bias_method", type=str, default='US', help="biasing method for enhanced sampling, US = umbrella sampling, MTD = metadynamics")
parser.add_argument("--MTD_pace", type=int, default=CONFIG_66, help="pace of metadynamics")
parser.add_argument("--MTD_height", type=float, default=CONFIG_67, help="height of metadynamics")
parser.add_argument("--MTD_sigma", type=float, default=CONFIG_68, help="sigma of metadynamics")
parser.add_argument("--MTD_WT", type=int, default=CONFIG_69, help="whether to use the well-tempered version")
parser.add_argument("--MTD_biasfactor", type=float, default=CONFIG_70, help="biasfactor of well-tempered metadynamics")
# following is for plumed script
parser.add_argument("--plumed_file", type=str, default=None, help="plumed script for biasing force, used only when bias_method == plumed_other")
parser.add_argument("--plumed_add_string", type=str, default="", help="additional string to be attached to the end of the plumed script in args.plumed_file")
# note on "force_constant_adjustable" mode:
# the simulation will stop if either:
#   the force constant is greater than or equal to max_force_constant,
#   or the distance between the center of the data cloud and the potential center is smaller than distance_tolerance
parser.add_argument("--fc_adjustable", help="set the force constant to be adjustable", action="store_true")
parser.add_argument("--max_fc", type=float, default=CONFIG_32, help="max force constant (for force_constant_adjustable mode)")
parser.add_argument("--fc_step", type=float, default=CONFIG_34, help="the value by which the force constant is increased each time (for force_constant_adjustable mode)")
parser.add_argument("--distance_tolerance", type=float, default=CONFIG_35, help="max distance allowed between center of data cloud and potential center (for force_constant_adjustable mode)")
parser.add_argument("--autoencoder_file", type=str, help="pkl file that stores the autoencoder (for force_constant_adjustable mode)")
parser.add_argument("--remove_previous", help="remove previous outputs while adjusting force constants", action="store_true")
args = parser.parse_args()
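
# Example invocation (hypothetical paths and values; the six positional arguments are
# the ones declared above, and the potential center must carry the "pc_" prefix):
#
#   python biased_simulation.py 50 50000 500 ../target/ANN \
#       ../resources/autoencoder_info_1.npy pc_-0.3,0.7
#
# i.e. take a snapshot every 50 steps, run 50000 steps in total, and restrain the system
# with force constant 500 around the point (-0.3, 0.7) in CV space.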

record_interval = args.record_interval
total_number_of_steps = args.total_num_of_steps
input_data_type = ['cossin', 'Cartesian', 'pairwise'][args.data_type_in_input_layer]
force_constant = args.force_constant
scaling_factor = args.scaling_factor
layer_types = re.sub(r"\[|\]|\"|\'| ", '', args.layer_types).split(',')
num_of_nodes = re.sub(r"\[|\]|\"|\'| ", '', args.num_of_nodes).split(',')
num_of_nodes = [int(item) for item in num_of_nodes]
out_format = '.dcd' if args.out_traj is None else os.path.splitext(args.out_traj)[1]

if float(force_constant) != 0:
    from ANN import *

folder_to_store_output_files = args.folder_to_store_output_files  # this is used to separate outputs for different networks into different folders
autoencoder_info_file = args.autoencoder_info_file

potential_center = list([float(x) for x in args.pc_potential_center.replace('"', '')
                         .replace('pc_', '').split(',')])  # this API is the generalization for higher-dimensional cases

if not os.path.exists(folder_to_store_output_files):
    try:
        os.makedirs(folder_to_store_output_files)
    except OSError:  # the folder may have been created by a concurrent job in the meantime
        pass
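
# What the potential-center parsing above produces for a typical argument (hypothetical
# value, quoted the way it may arrive from a shell command line):
def _sketch_parse_potential_center():
    pc_arg = '"pc_-0.3,0.7"'
    parsed = [float(x) for x in pc_arg.replace('"', '').replace('pc_', '').split(',')]
    print(parsed)  # [-0.3, 0.7]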

def run_simulation(force_constant):
    assert (os.path.exists(folder_to_store_output_files))
    input_pdb_file_of_molecule = args.starting_pdb_file
    force_field_file = 'amber99sb.xml'
    water_field_file = 'tip3p.xml'
    pdb_reporter_file = '%s/output_fc_%f_pc_%s.pdb' % (folder_to_store_output_files, force_constant, str(potential_center).replace(' ', ''))

    if args.out_traj is not None:
        pdb_reporter_file = args.out_traj

    state_data_reporter_file = pdb_reporter_file.replace('output_fc', 'report_fc').replace('.pdb', '.txt')

    # check if the files already exist
    for item_filename in [pdb_reporter_file, state_data_reporter_file]:
        Helper_func.backup_rename_file_if_exists(item_filename)

    index_of_backbone_atoms = CONFIG_57[0]
    flag_random_seed = 0  # whether we need to fix this random seed

    simulation_temperature = args.temperature
    time_step = CONFIG_22  # simulation time step, in ps

    pdb = PDBFile(input_pdb_file_of_molecule)
    modeller = Modeller(pdb.topology, pdb.getPositions(frame=args.starting_frame))
    solvent_opt = 'no_water'
    if solvent_opt == 'explicit':
        forcefield = ForceField(force_field_file, water_field_file)
        modeller.addSolvent(forcefield, model=water_field_file.split('.xml')[0], boxSize=Vec3(3, 3, 3) * nanometers,
                            ionicStrength=0 * molar)
        system = forcefield.createSystem(modeller.topology, nonbondedMethod=PME, nonbondedCutoff=1.0 * nanometers,
                                         constraints=AllBonds, ewaldErrorTolerance=0.0005)
    else:
        forcefield = ForceField(force_field_file)
        system = forcefield.createSystem(modeller.topology, nonbondedMethod=NoCutoff, constraints=AllBonds)

    if args.bias_method == "US":
        if float(force_constant) != 0:
            force = ANN_Force()
            force.set_layer_types(layer_types)
            force.set_data_type_in_input_layer(args.data_type_in_input_layer)
            force.set_list_of_index_of_atoms_forming_dihedrals_from_index_of_backbone_atoms(index_of_backbone_atoms)
            force.set_index_of_backbone_atoms(index_of_backbone_atoms)
            if args.data_type_in_input_layer == 2:
                force.set_list_of_pair_index_for_distances(CONFIG_80)

            force.set_num_of_nodes(num_of_nodes)
            force.set_potential_center(potential_center)
            force.set_force_constant(float(force_constant))
            unit_scaling = 1.0  # TODO: check unit scaling
            force.set_scaling_factor(float(scaling_factor) / unit_scaling)  # since the default unit is nm in OpenMM

            # TODO: need to fix following for multi-hidden-layer cases
            temp_coeffs, temp_bias = np.load(autoencoder_info_file)
            for item_layer_index in [0, 1]:
                assert (len(temp_coeffs[item_layer_index]) ==
                        num_of_nodes[item_layer_index] * num_of_nodes[item_layer_index + 1]), (len(temp_coeffs[item_layer_index]),
                        (num_of_nodes[item_layer_index], num_of_nodes[item_layer_index + 1]))
                assert (len(temp_bias[item_layer_index]) == num_of_nodes[item_layer_index + 1]), (len(temp_bias[item_layer_index]), num_of_nodes[item_layer_index + 1])

            # need tolist() since the C++ plugin only accepts Python lists
            force.set_coeffients_of_connections([item_w.tolist() for item_w in temp_coeffs])
            force.set_values_of_biased_nodes([item_w.tolist() for item_w in temp_bias])

            system.addForce(force)
    elif args.bias_method == "US_on_phipsi":
        from openmmplumed import PlumedForce
        kappa_string = ','.join([str(force_constant) for _ in potential_center])
        plumed_force_string = """
phi: TORSION ATOMS=5,7,9,15
psi: TORSION ATOMS=7,9,15,17
restraint: RESTRAINT ARG=phi,psi AT=%f,%f KAPPA=%s
PRINT STRIDE=10 ARG=* FILE=COLVAR
""" % (potential_center[0], potential_center[1], kappa_string)
        system.addForce(PlumedForce(plumed_force_string))
    elif args.bias_method == "MTD":
        from openmmplumed import PlumedForce
        plumed_force_string = Alanine_dipeptide.get_expression_script_for_plumed()
        with open(autoencoder_info_file, 'r') as f_in:
            plumed_force_string += f_in.read()

        # note that the dimensionality of MTD is determined by the potential_center string
        plumed_script_ANN_mode = 'ANN'
        if plumed_script_ANN_mode == 'native':
            mtd_output_layer_string = ['l_2_out_%d' % item for item in range(len(potential_center))]
        elif plumed_script_ANN_mode == 'ANN':
            mtd_output_layer_string = ['ann_force.%d' % item for item in range(len(potential_center))]
        else: raise Exception('mode error')

        mtd_output_layer_string = ','.join(mtd_output_layer_string)
        mtd_sigma_string = ','.join([str(args.MTD_sigma) for _ in range(len(potential_center))])
        if args.MTD_WT:
            mtd_well_tempered_string = 'TEMP=%d BIASFACTOR=%f' % (args.temperature, args.MTD_biasfactor)
        else:
            mtd_well_tempered_string = ""
        plumed_force_string += """
metad: METAD ARG=%s PACE=%d HEIGHT=%f SIGMA=%s FILE=temp_MTD_hills.txt %s
PRINT STRIDE=%d ARG=%s,metad.bias FILE=temp_MTD_out.txt
""" % (mtd_output_layer_string, args.MTD_pace, args.MTD_height, mtd_sigma_string, mtd_well_tempered_string,
       record_interval, mtd_output_layer_string)
        # print(plumed_force_string)
        system.addForce(PlumedForce(plumed_force_string))
    elif args.bias_method == "SMD":
        # TODO: this is a temporary version
        from openmmplumed import PlumedForce
        kappa_string = '1000,1000'
        plumed_force_string = """
phi: TORSION ATOMS=5,7,9,15
psi: TORSION ATOMS=7,9,15,17
restraint: MOVINGRESTRAINT ARG=phi,psi AT0=-1.5,1.0 STEP0=0 KAPPA0=%s AT1=1.0,-1.0 STEP1=%d KAPPA1=%s
PRINT STRIDE=10 ARG=* FILE=COLVAR
""" % (kappa_string, total_number_of_steps, kappa_string)
        system.addForce(PlumedForce(plumed_force_string))
    elif args.bias_method == "TMD":  # targeted MD
        # TODO: this is a temporary version
        from openmmplumed import PlumedForce
        kappa_string = '10000'
        plumed_force_string = """
phi: TORSION ATOMS=5,7,9,15
psi: TORSION ATOMS=7,9,15,17
rmsd: RMSD REFERENCE=../resources/alanine_ref_1_TMD.pdb TYPE=OPTIMAL
restraint: MOVINGRESTRAINT ARG=rmsd AT0=0 STEP0=0 KAPPA0=0 AT1=0 STEP1=%d KAPPA1=%s
PRINT STRIDE=10 ARG=* FILE=COLVAR
""" % (total_number_of_steps, kappa_string)
        system.addForce(PlumedForce(plumed_force_string))
    elif args.bias_method == "plumed_other":
        from openmmplumed import PlumedForce
        with open(args.plumed_file, 'r') as f_in:
            plumed_force_string = f_in.read().strip() + args.plumed_add_string
        system.addForce(PlumedForce(plumed_force_string))
    else:
        raise Exception('bias method error')
    # end of biased force
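    # For reference: PLUMED's RESTRAINT used in the US_on_phipsi branch above applies a
    # harmonic bias U(x) = sum_i KAPPA_i / 2 * (x_i - AT_i)**2 (plus an optional linear
    # term, zero by default), so kappa_string supplies one spring constant per CV
    # dimension; see the PLUMED manual entry for RESTRAINT.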

    integrator = LangevinIntegrator(simulation_temperature*kelvin, 1/picosecond, time_step*picoseconds)
    if flag_random_seed:
        integrator.setRandomNumberSeed(1)  # set random seed

    platform = Platform.getPlatformByName(args.platform)
    platform.loadPluginsFromDirectory(CONFIG_25)  # load the plugin from a specific directory

    simulation = Simulation(modeller.topology, system, integrator, platform)
    simulation.context.setPositions(modeller.positions)
    if args.minimize_energy:
        print('begin Minimizing energy...')
        print(datetime.datetime.now())
        simulation.minimizeEnergy()
        print('Done minimizing energy.')
        print(datetime.datetime.now())
    else:
        print('energy minimization not required')

    simulation.step(args.equilibration_steps)  # equilibration runs before any reporter is attached
    if out_format == '.pdb':
        simulation.reporters.append(PDBReporter(pdb_reporter_file, record_interval))
    elif out_format == '.dcd':
        simulation.reporters.append(DCDReporter(pdb_reporter_file.replace('.pdb', '.dcd'), record_interval))
    simulation.reporters.append(StateDataReporter(state_data_reporter_file, record_interval,
                                                  step=True, potentialEnergy=True, kineticEnergy=True, speed=True,
                                                  temperature=True, progress=True, remainingTime=True,
                                                  totalSteps=total_number_of_steps + args.equilibration_steps,
                                                  ))
    simulation.step(total_number_of_steps)

    print('Done biased simulation!')
    return pdb_reporter_file

def get_distance_between_data_cloud_center_and_potential_center(pdb_file):
    coor_file = Alanine_dipeptide().generate_coordinates_from_pdb_files(pdb_file)[0]
    temp_network = autoencoder.load_from_pkl_file(args.autoencoder_file)
    this_simulation_data = single_biased_simulation_data(temp_network, coor_file)
    offset = this_simulation_data.get_offset_between_potential_center_and_data_cloud_center(input_data_type)
    if layer_types[1] == "Circular":
        offset = [min(abs(item), abs(item + 2 * np.pi), abs(item - 2 * np.pi)) for item in offset]
        print("circular offset")
    print('offset = %s' % str(offset))
    distance = sqrt(sum([item * item for item in offset]))
    return distance
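
# Quick check of the circular-offset rule above: for CVs living on a 2*pi-periodic
# circle, a raw offset must be wrapped back into [-pi, pi] before measuring distance.
def _sketch_circular_offset_wrapping():
    import numpy as np
    for raw_offset in [0.3, 3.5, -5.9]:
        wrapped = min(abs(raw_offset), abs(raw_offset + 2 * np.pi), abs(raw_offset - 2 * np.pi))
        print(raw_offset, '->', wrapped)  # 3.5 -> ~2.78, -5.9 -> ~0.38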

def run_simulation_ssages(force_constant):
    ssages_output_file = '%s/output_fc_%f_pc_%s.json' % (
        folder_to_store_output_files, force_constant, str(potential_center).replace(' ', ''))
    subprocess.check_output('python ../src/temp_create_json_ssages.py %s %s %s %s %s' % (
        ssages_output_file, str(potential_center).replace(' ', ''), autoencoder_info_file.replace('.npy', '.txt'),
        ssages_output_file.replace('.json', '.trr'), force_constant), shell=True)
    command = "ssages " + ssages_output_file
    subprocess.check_output(command, shell=True)
    pdb_reporter_file = ssages_output_file.replace('.json', '.pdb')
    subprocess.check_output('mdconvert -o %s %s -t ../resources/alanine_dipeptide.pdb' % (
        pdb_reporter_file, pdb_reporter_file.replace('.pdb', '.trr')), shell=True)
    return pdb_reporter_file


if __name__ == '__main__':
    if not args.fc_adjustable:
        run_simulation(args.force_constant)
    else:
        force_constant = args.force_constant
        distance_of_data_cloud_center = float("inf")
        while force_constant < args.max_fc and distance_of_data_cloud_center > args.distance_tolerance:
            if args.remove_previous:
                try:
                    command = 'rm %s/*%s*' % (folder_to_store_output_files, str(potential_center).replace(' ', ''))
                    command = command.replace('[', '').replace(']', '')
                    subprocess.check_output(command, shell=True)
                    print("removing previous results...")
                except:
                    pass
            pdb_file = run_simulation(force_constant)
            distance_of_data_cloud_center = get_distance_between_data_cloud_center_and_potential_center(pdb_file)
            force_constant += args.fc_step
            print("distance_between_data_cloud_center_and_potential_center = %f" % distance_of_data_cloud_center)

--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/src/biased_simulation_general.py:
--------------------------------------------------------------------------------
from ANN_simulation import *
import datetime, os, argparse
from simtk.openmm.app import *
from simtk.openmm import *
from simtk.unit import *
from sys import stdout
import ast
from config import *

############################ PARAMETERS BEGIN ###############################################################

parser = argparse.ArgumentParser()
parser.add_argument("molecule", type=str, help="type of molecule for the simulation")
parser.add_argument("record_interval", type=int, help="interval to take snapshots")
parser.add_argument("total_num_of_steps", type=int, help="total number of simulation steps")
parser.add_argument("force_constant", type=float, help="force constant")
parser.add_argument("folder_to_store_output_files", type=str, help="folder to store the output pdb and report files")
parser.add_argument("autoencoder_info_file", type=str, help="file to store autoencoder information (coefficients)")
parser.add_argument("pc_potential_center", type=str, help="potential center (should include 'pc_' as prefix)")
parser.add_argument("whether_to_add_water_mol_opt", type=str, help='whether to add water (options: explicit, implicit, water_already_included, no_water)')
parser.add_argument("ensemble_type", type=str, help='simulation ensemble type, either NVT or NPT')
parser.add_argument("--out_traj", type=str, default=None, help="output trajectory file")
parser.add_argument("--layer_types", type=str, default=str(CONFIG_27), help='layer types')
parser.add_argument("--num_of_nodes", type=str, default=str(CONFIG_3[:3]), help='number of nodes in each layer')
parser.add_argument("--scaling_factor", type=float, default=CONFIG_49, help='scaling_factor for ANN_Force')
parser.add_argument("--temperature", type=int, default=300, help='simulation temperature')
parser.add_argument("--starting_pdb_file", type=str, default='auto', help='the input pdb file to start simulation')
parser.add_argument("--starting_frame", type=int, default=0, help="index of starting frame in the starting pdb file")
parser.add_argument("--minimize_energy", type=int, default=1, help='whether to minimize energy (1 = yes, 0 = no)')
parser.add_argument("--data_type_in_input_layer", type=int, default=1, help='data_type_in_input_layer, 0 = cos/sin, 1 = Cartesian coordinates')
parser.add_argument("--platform", type=str, default=CONFIG_23, help='platform on which the simulation is run')
parser.add_argument("--device", type=str, default='none', help='device index to run simulation on')
parser.add_argument("--checkpoint", type=int, default=1, help="whether to save a checkpoint at the end of the simulation")
parser.add_argument("--starting_checkpoint", type=str, default="auto", help='starting checkpoint file used to resume a simulation ("none" means no checkpoint file is provided, "auto" means the checkpoint in the output folder is picked up automatically if it exists)')
parser.add_argument("--equilibration_steps", type=int, default=1000, help="number of steps for the equilibration process") 36 | parser.add_argument("--fast_equilibration", type=int, default=0, help="do fast equilibration by running biased simulation with larger force constant") 37 | parser.add_argument("--remove_eq_file", type=int, default=1, help="remove equilibration pdb files associated with fast equilibration") 38 | parser.add_argument("--auto_equilibration", help="enable auto equilibration so that it will run enough equilibration steps", action="store_true") 39 | # next few options are for metadynamics 40 | parser.add_argument("--bias_method", type=str, default='US', help="biasing method for enhanced sampling, US = umbrella sampling, MTD = metadynamics") 41 | parser.add_argument("--MTD_pace", type=int, default=CONFIG_66, help="pace of metadynamics") 42 | parser.add_argument("--MTD_height", type=float, default=CONFIG_67, help="height of metadynamics") 43 | parser.add_argument("--MTD_sigma", type=float, default=CONFIG_68, help="sigma of metadynamics") 44 | parser.add_argument("--MTD_WT", type=int, default=CONFIG_69, help="whether to use well-tempered version") 45 | parser.add_argument("--MTD_biasfactor", type=float, default=CONFIG_70, help="biasfactor of well-tempered metadynamics") 46 | # following is for plumed script 47 | parser.add_argument("--plumed_file", type=str, default=None, help="plumed script for biasing force, used only when the bias_method == plumed_other") 48 | parser.add_argument("--plumed_add_string", type=str, default="", help="additional string to be attached to the end of plumed script in args.plumed_file") 49 | # note on "force_constant_adjustable" mode: 50 | # the simulation will stop if either: 51 | # force constant is greater or equal to max_force_constant 52 | # or distance between center of data cloud and potential center is smaller than distance_tolerance 53 | parser.add_argument("--fc_adjustable", help="set the force constant to be adjustable", action="store_true") 54 | parser.add_argument("--max_fc", type=float, default=CONFIG_32, help="max force constant (for force_constant_adjustable mode)") 55 | parser.add_argument("--fc_step", type=float, default=CONFIG_34, help="the value by which the force constant is increased each time (for force_constant_adjustable mode)") 56 | parser.add_argument("--distance_tolerance", type=float, default=CONFIG_35, help="max distance allowed between center of data cloud and potential center (for force_constant_adjustable mode)") 57 | parser.add_argument("--autoencoder_file", type=str, help="pkl file that stores autoencoder (for force_constant_adjustable mode)") 58 | parser.add_argument("--remove_previous", help="remove previous outputs while adjusting force constants", action="store_true") 59 | args = parser.parse_args() 60 | 61 | print("start simulation at %s" % datetime.datetime.now()) # to calculate compile time 62 | 63 | record_interval = args.record_interval 64 | total_number_of_steps = args.total_num_of_steps 65 | force_constant = args.force_constant 66 | scaling_factor = args.scaling_factor 67 | num_of_nodes = re.sub("\[|\]|\"|\'| ",'', args.num_of_nodes).split(',') 68 | num_of_nodes = [int(item) for item in num_of_nodes] 69 | out_format = '.dcd' if args.out_traj is None else os.path.splitext(args.out_traj)[1] 70 | 71 | platform = Platform.getPlatformByName(args.platform) 72 | temperature = args.temperature 73 | input_data_type = ['cossin', 'Cartesian', 'pairwise'][args.data_type_in_input_layer] 74 | 75 | if 

if float(force_constant) != 0:
    from ANN import *
    platform.loadPluginsFromDirectory(CONFIG_25)  # load the plugin from a specific directory

folder_to_store_output_files = args.folder_to_store_output_files  # this is used to separate outputs for different networks into different folders
autoencoder_info_file = args.autoencoder_info_file

potential_center = list([float(x) for x in args.pc_potential_center.replace('"', '')
                         .replace('pc_', '').split(',')])  # this API is the generalization for higher-dimensional cases

def run_simulation(force_constant, number_of_simulation_steps):
    if not os.path.exists(folder_to_store_output_files):
        try:
            os.makedirs(folder_to_store_output_files)
        except OSError:
            pass

    assert (os.path.exists(folder_to_store_output_files))

    force_field_file = {'Trp_cage': 'amber03.xml', '2src': 'amber03.xml', '1y57': 'amber03.xml',
                        'BetaHairpin': 'amber03.xml', 'C24': 'charmm36.xml', 'BPTI': 'amber03.xml'
                        }[args.molecule]
    water_field_file = {'Trp_cage': 'tip4pew.xml', '2src': 'tip3p.xml', '1y57': 'tip3p.xml',
                        'BetaHairpin': 'tip3p.xml', 'C24': 'charmm36/spce.xml', 'BPTI': 'tip4pew.xml'}[args.molecule]
    water_model = water_field_file.replace('.xml', '').replace('charmm36/', '')
    ionic_strength = {'Trp_cage': 0 * molar, '2src': 0.5 * .15 * molar, '1y57': 0.5 * .15 * molar,
                      'BetaHairpin': 0 * molar, 'C24': 0 * molar, 'BPTI': 0 * molar}[args.molecule]
    implicit_solvent_force_field = 'amber03_obc.xml'

    pdb_reporter_file = '%s/output_fc_%s_pc_%s_T_%d_%s_%s.pdb' % (folder_to_store_output_files, force_constant,
                                                                  str(potential_center).replace(' ', ''), temperature,
                                                                  args.whether_to_add_water_mol_opt, args.ensemble_type)

    if args.starting_pdb_file == 'auto':
        input_pdb_file_of_molecule = {'Trp_cage': '../resources/1l2y.pdb',
                                      '2src': '../resources/2src.pdb',
                                      '1y57': '../resources/1y57.pdb',
                                      'BetaHairpin': '../resources/BetaHairpin.pdb',
                                      'C24': '../resources/C24.pdb', 'BPTI': '../resources/bpti.pdb'}[args.molecule]
    else:
        input_pdb_file_of_molecule = args.starting_pdb_file
        pdb_reporter_file = pdb_reporter_file.split('.pdb')[0] + '_sf_%s.pdb' % \
            args.starting_pdb_file.split('_sf_')[0].split('.pdb')[0].split('/')[-1]  # 'sf' means 'starting_from'

    print("start_pdb = %s" % input_pdb_file_of_molecule)
    if args.starting_frame != 0:
        pdb_reporter_file = pdb_reporter_file.split('.pdb')[0] + '_ff_%d.pdb' % args.starting_frame  # 'ff' means 'from_frame'

    if args.out_traj is not None:
        pdb_reporter_file = args.out_traj

    state_data_reporter_file = pdb_reporter_file.replace('output_fc', 'report_fc').replace('.pdb', '.txt')
    checkpoint_file = pdb_reporter_file.replace('output_fc', 'checkpoint_fc').replace('.pdb', '.chk')
    if args.fast_equilibration:
        checkpoint_file = checkpoint_file.replace(str(force_constant), str(args.force_constant))

    # check existence
    for item_filename in [pdb_reporter_file, state_data_reporter_file]:
        Helper_func.backup_rename_file_if_exists(item_filename)

    flag_random_seed = 0  # whether we need to fix this random seed
    box_size = {'Trp_cage': 4.5, '2src': 8.0, '1y57': 8.0,
                'BetaHairpin': 5, 'C24': 5, 'BPTI': 5.1263}[args.molecule]
    time_step = CONFIG_22  # simulation time step, in ps

    index_of_backbone_atoms = {'Trp_cage': CONFIG_57[1],
                               '2src': CONFIG_57[2], '1y57': CONFIG_57[2],
                               'BetaHairpin': CONFIG_57[3],
                               'C24': CONFIG_57[4], 'BPTI': None}[args.molecule]

    layer_types = CONFIG_27
    simulation_constraints = HBonds

    pdb = PDBFile(input_pdb_file_of_molecule)
    modeller = Modeller(pdb.topology, pdb.getPositions(frame=args.starting_frame))

    if args.whether_to_add_water_mol_opt == 'explicit':
        forcefield = ForceField(force_field_file, water_field_file)
        modeller.addHydrogens(forcefield)
        modeller.addSolvent(forcefield, model=water_model, boxSize=Vec3(box_size, box_size, box_size)*nanometers,
                            ionicStrength=ionic_strength)
        if not water_model == 'spce': modeller.addExtraParticles(forcefield)
        system = forcefield.createSystem(modeller.topology, nonbondedMethod=PME, nonbondedCutoff=1.0 * nanometers,
                                         constraints=simulation_constraints, ewaldErrorTolerance=0.0005)
    elif args.whether_to_add_water_mol_opt == 'implicit':
        forcefield = ForceField(force_field_file, implicit_solvent_force_field)
        modeller.addHydrogens(forcefield)
        modeller.addExtraParticles(forcefield)
        # use modeller.topology (not pdb.topology): hydrogens and extra particles were just
        # added to the modeller, and the Simulation below is built from modeller.topology
        system = forcefield.createSystem(modeller.topology, nonbondedMethod=CutoffNonPeriodic, nonbondedCutoff=5 * nanometers,
                                         constraints=simulation_constraints, rigidWater=True, removeCMMotion=True)

    elif args.whether_to_add_water_mol_opt == 'no_water' or args.whether_to_add_water_mol_opt == 'water_already_included':
        forcefield = ForceField(force_field_file, water_field_file)
        modeller.addExtraParticles(forcefield)
        modeller.addHydrogens(forcefield)
        system = forcefield.createSystem(modeller.topology, nonbondedMethod=NoCutoff, nonbondedCutoff=1.0 * nanometers,
                                         constraints=simulation_constraints)
    else:
        raise Exception("parameter error")

    # print(modeller.topology.getPeriodicBoxVectors())

    system.addForce(AndersenThermostat(temperature*kelvin, 1/picosecond))
    if args.ensemble_type == "NPT" and args.whether_to_add_water_mol_opt == 'explicit':
        system.addForce(MonteCarloBarostat(1*atmospheres, temperature*kelvin, 25))

    # add custom force (only for biased simulation)
    if args.bias_method == "US":
        if float(force_constant) != 0:
            force = ANN_Force()
            force.set_layer_types(layer_types)
            force.set_data_type_in_input_layer(args.data_type_in_input_layer)
            force.set_list_of_index_of_atoms_forming_dihedrals_from_index_of_backbone_atoms(index_of_backbone_atoms)
            force.set_index_of_backbone_atoms(index_of_backbone_atoms)
            if args.data_type_in_input_layer == 2:
                force.set_list_of_pair_index_for_distances(CONFIG_80)
            force.set_num_of_nodes(num_of_nodes)
            force.set_potential_center(potential_center)
            force.set_force_constant(float(force_constant))
            unit_scaling = 1.0  # TODO: check unit scaling
            force.set_scaling_factor(float(scaling_factor) / unit_scaling)  # since the default unit is nm in OpenMM

            with open(autoencoder_info_file, 'r') as f_in:
                content = f_in.readlines()

            # TODO: need to fix following for multi-hidden-layer cases
            temp_coeffs = [ast.literal_eval(content[0].strip())[0], ast.literal_eval(content[1].strip())[0]]
            temp_bias = [ast.literal_eval(content[2].strip())[0], ast.literal_eval(content[3].strip())[0]]
            for item_layer_index in [0, 1]:
                assert (len(temp_coeffs[item_layer_index]) ==
                        num_of_nodes[item_layer_index] * num_of_nodes[item_layer_index + 1]), \
                    (len(temp_coeffs[item_layer_index]), num_of_nodes[item_layer_index], num_of_nodes[item_layer_index + 1])
                assert (len(temp_bias[item_layer_index]) == num_of_nodes[item_layer_index + 1]), (len(temp_bias[item_layer_index]), num_of_nodes[item_layer_index + 1])

            force.set_coeffients_of_connections(temp_coeffs)
            force.set_values_of_biased_nodes(temp_bias)

            system.addForce(force)
    elif args.bias_method == "MTD":
        from openmmplumed import PlumedForce
        molecule_type = {'Trp_cage': Trp_cage, '2src': Src_kinase, '1y57': Src_kinase, 'BetaHairpin': BetaHairpin}[args.molecule]
        plumed_force_string = molecule_type.get_expression_script_for_plumed()
        with open(autoencoder_info_file, 'r') as f_in:
            plumed_force_string += f_in.read()

        # note that the dimensionality of MTD is determined by the potential_center string
        mtd_output_layer_string = ['l_2_out_%d' % item for item in range(len(potential_center))]
        mtd_output_layer_string = ','.join(mtd_output_layer_string)
        mtd_sigma_string = ','.join([str(args.MTD_sigma) for _ in range(len(potential_center))])
        if args.MTD_WT:
            mtd_well_tempered_string = 'TEMP=%d BIASFACTOR=%f' % (args.temperature, args.MTD_biasfactor)
        else:
            mtd_well_tempered_string = ""
        plumed_force_string += """
metad: METAD ARG=%s PACE=%d HEIGHT=%f SIGMA=%s FILE=temp_MTD_hills.txt %s
PRINT STRIDE=%d ARG=%s,metad.bias FILE=temp_MTD_out.txt
""" % (mtd_output_layer_string, args.MTD_pace, args.MTD_height, mtd_sigma_string, mtd_well_tempered_string,
       record_interval, mtd_output_layer_string)
        system.addForce(PlumedForce(plumed_force_string))
    elif args.bias_method == "TMD":  # targeted MD
        # TODO: this is a temporary version
        from openmmplumed import PlumedForce
        kappa_string = str(args.force_constant)
        plumed_force_string = """
rmsd: RMSD REFERENCE=../resources/1y57_TMD.pdb TYPE=OPTIMAL
restraint: MOVINGRESTRAINT ARG=rmsd AT0=0.4 STEP0=0 KAPPA0=%s AT1=0 STEP1=%d KAPPA1=%s
PRINT STRIDE=500 ARG=* FILE=COLVAR
""" % (kappa_string, total_number_of_steps, kappa_string)
        system.addForce(PlumedForce(plumed_force_string))
    elif args.bias_method == "US_on_ANN_plumed":
        # in this case, all ANN-related parts (including scripts for inputs) have been stored in
        # args.plumed_file; we only need to add the biasing plumed script for umbrella sampling
        from openmmplumed import PlumedForce
        with open(args.plumed_file, 'r') as f_in:
            plumed_force_string = f_in.read()
        arg_string = ','.join(['ann_force.%d' % _2 for _2 in range(len(potential_center))])
        pc_string = ','.join([str(_2) for _2 in potential_center])
        kappa_string = ','.join([str(force_constant) for _ in potential_center])
        plumed_force_string += """\nmypotential: RESTRAINT ARG=%s AT=%s KAPPA=%s""" % (
            arg_string, pc_string, kappa_string,
        )
        system.addForce(PlumedForce(plumed_force_string))
    elif args.bias_method == "plumed_other":
        from openmmplumed import PlumedForce
        with open(args.plumed_file, 'r') as f_in:
            plumed_force_string = f_in.read().strip() + args.plumed_add_string
        system.addForce(PlumedForce(plumed_force_string))
    else:
        raise Exception('bias method error')
    # end add custom force

    integrator = VerletIntegrator(time_step*picoseconds)

    if flag_random_seed:
        integrator.setRandomNumberSeed(1)  # set random seed
== "CUDA" and args.device != 'none': 273 | properties = {'CudaDeviceIndex': args.device} 274 | simulation = Simulation(modeller.topology, system, integrator, platform, properties) 275 | else: 276 | simulation = Simulation(modeller.topology, system, integrator, platform) 277 | # print "positions = " 278 | # print (modeller.positions) 279 | simulation.context.setPositions(modeller.positions) 280 | print(datetime.datetime.now()) 281 | 282 | if args.starting_checkpoint != 'none': 283 | if args.starting_checkpoint == "auto": # restart from checkpoint if it exists 284 | if os.path.isfile(checkpoint_file): 285 | print("resume simulation from %s" % checkpoint_file) 286 | simulation.loadCheckpoint(checkpoint_file) 287 | else: 288 | print("resume simulation from %s" % args.starting_checkpoint) 289 | simulation.loadCheckpoint(args.starting_checkpoint) # the topology is already set by pdb file, and the positions in the pdb file will be overwritten by those in the starting_checkpoing file 290 | 291 | if args.minimize_energy: 292 | print('begin Minimizing energy...') 293 | print(datetime.datetime.now()) 294 | simulation.minimizeEnergy() 295 | print('Done minimizing energy.') 296 | print(datetime.datetime.now()) 297 | else: 298 | print('energy minimization not required') 299 | 300 | print("begin equilibrating...") 301 | print(datetime.datetime.now()) 302 | simulation.step(args.equilibration_steps) 303 | previous_distance_to_potential_center = 100 304 | current_distance_to_potential_center = 90 305 | if args.auto_equilibration: 306 | distance_change_tolerance = 0.05 307 | while abs(previous_distance_to_potential_center - current_distance_to_potential_center) > distance_change_tolerance: 308 | temp_pdb_reporter_file_for_auto_equilibration = pdb_reporter_file.replace('.pdb', '_temp.pdb') 309 | simulation.reporters.append(PDBReporter(temp_pdb_reporter_file_for_auto_equilibration, record_interval)) 310 | simulation.step(args.equilibration_steps) 311 | previous_distance_to_potential_center = current_distance_to_potential_center 312 | current_distance_to_potential_center = get_distance_between_data_cloud_center_and_potential_center( 313 | temp_pdb_reporter_file_for_auto_equilibration) 314 | subprocess.check_output(['rm', temp_pdb_reporter_file_for_auto_equilibration]) 315 | print("previous_distance_to_potential_center = %f\ncurrent_distance_to_potential_center = %f" % ( 316 | previous_distance_to_potential_center, current_distance_to_potential_center 317 | )) 318 | 319 | print("Done equilibration") 320 | print(datetime.datetime.now()) 321 | 322 | if out_format == '.pdb': 323 | simulation.reporters.append(PDBReporter(pdb_reporter_file, record_interval)) 324 | elif out_format == '.dcd': 325 | simulation.reporters.append(DCDReporter(pdb_reporter_file.replace('.pdb', '.dcd'), record_interval)) 326 | simulation.reporters.append(StateDataReporter(state_data_reporter_file, record_interval, time=True, 327 | step=True, potentialEnergy=True, kineticEnergy=True, speed=True, 328 | temperature=True, progress=True, remainingTime=True, volume = True,density=True, 329 | totalSteps=number_of_simulation_steps + args.equilibration_steps, 330 | )) 331 | simulation.step(number_of_simulation_steps) 332 | 333 | if args.checkpoint: 334 | Helper_func.backup_rename_file_if_exists(checkpoint_file) 335 | simulation.saveCheckpoint(checkpoint_file) 336 | 337 | print('Done!') 338 | print(datetime.datetime.now()) 339 | return pdb_reporter_file 340 | 341 | def get_distance_between_data_cloud_center_and_potential_center(pdb_file): 342 | 
coor_file = Trp_cage().generate_coordinates_from_pdb_files(pdb_file)[0] 343 | temp_network = autoencoder.load_from_pkl_file(args.autoencoder_file) 344 | print(coor_file) 345 | this_simulation_data = single_biased_simulation_data(temp_network, coor_file) 346 | offset = this_simulation_data.get_offset_between_potential_center_and_data_cloud_center(input_data_type) 347 | if CONFIG_17[1] == "Circular": 348 | offset = [min(abs(item), abs(item + 2 * np.pi), abs(item - 2 * np.pi)) for item in offset] 349 | print("circular offset") 350 | print('offset = %s' % str(offset)) 351 | distance = sqrt(sum([item * item for item in offset])) 352 | return distance 353 | 354 | if __name__ == '__main__': 355 | if not args.fc_adjustable: 356 | if args.fast_equilibration: 357 | temp_eq_force_constants = [args.force_constant * item for item in [5, 3, 2, 1.5, 1.2]] 358 | temp_eq_num_steps = [int(total_number_of_steps * item) for item in [0.02, 0.05, 0.05, 0.1, 0.1]] 359 | for item_1, item_2 in zip(temp_eq_force_constants, temp_eq_num_steps): 360 | temp_eq_pdb = run_simulation(item_1, item_2) 361 | if args.remove_eq_file: 362 | subprocess.check_output(['rm', temp_eq_pdb]) 363 | 364 | run_simulation(args.force_constant, total_number_of_steps) 365 | 366 | else: 367 | force_constant = args.force_constant 368 | distance_of_data_cloud_center = float("inf") 369 | while force_constant < args.max_fc and distance_of_data_cloud_center > args.distance_tolerance: 370 | if args.remove_previous: 371 | try: 372 | command = 'rm %s/*%s*' % (folder_to_store_output_files, str(potential_center).replace(' ','')) 373 | command = command.replace('[','').replace(']','') 374 | subprocess.check_output(command, shell=True) 375 | print("removing previous results...") 376 | except: 377 | pass 378 | pdb_file = run_simulation(force_constant, total_number_of_steps) 379 | distance_of_data_cloud_center = get_distance_between_data_cloud_center_and_potential_center(pdb_file) 380 | force_constant += args.fc_step 381 | print("distance_between_data_cloud_center_and_potential_center = %f" % distance_of_data_cloud_center) 382 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/src/config.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import copy, pickle, re, os, time, subprocess, datetime, itertools, sys, abc, argparse, matplotlib, glob 3 | matplotlib.use('agg') 4 | sys.path.append('/home/kengyangyao/Dropbox/temp_Linux/temp_research_proj/cluster_management/cm/src') 5 | sys.path.append('/home/kengyangyao/Dropbox/temp_Linux/temp_research_proj/plumed_helper') 6 | from plumed_helper import Plumed_helper 7 | from scipy import io as sciio 8 | import numpy as np, pandas as pd, seaborn as sns 9 | from numpy.testing import assert_almost_equal 10 | from math import * 11 | import matplotlib.pyplot as plt 12 | from sklearn.neighbors import RadiusNeighborsRegressor 13 | import matplotlib 14 | from Bio import PDB 15 | from sklearn.metrics import mean_squared_error 16 | from sklearn import linear_model 17 | from MDAnalysis import Universe 18 | from MDAnalysis.analysis.align import * 19 | from MDAnalysis.analysis.rms import rmsd 20 | from MDAnalysis.analysis.distances import distance_array 21 | 22 | '''This is the configuration file for all Python code in this directory, 23 | it configures all default values/global parameters for constructors/functions 24 | ''' 25 | 26 | 
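# A minimal usage sketch (illustrative only; the variable names on the left are examples,
# not definitions made in this file): modules in src/ consume these settings via a star import,
#
#     from config import *
#     ndim = CONFIG_36     # dimensionality of the CV space
#     fc = CONFIG_9        # force constant, already selected for CONFIG_30 by get_mol_param()
#
# where get_mol_param() (defined below) picks the list entry corresponding to the molecule
# named by CONFIG_30.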
####################################################################### 27 | ############ some global variables and helper functions ############ 28 | ####################################################################### 29 | 30 | CONFIG_30 = "Alanine_dipeptide" # the type of molecule we are studying 31 | WARNING_INFO = "Comment out this line to continue." 32 | 33 | def get_mol_param(parameter_list, molecule_name=CONFIG_30): # get molecule specific parameter using a parameter list 34 | molecule_name_to_index = {"Alanine_dipeptide": 0, "Trp_cage": 1, "Src_kinase": 2, 35 | "BetaHairpin": 3, "C24": 4} 36 | try: result = parameter_list[molecule_name_to_index[molecule_name]] 37 | except: result = None 38 | return result 39 | 40 | def get_index_list_with_selection_statement(pdb_file, atom_selection_statement): 41 | pdb_file_1 = os.path.join(os.path.dirname(__file__), pdb_file) 42 | return (Universe(pdb_file_1).select_atoms(atom_selection_statement).indices + 1).tolist() 43 | 44 | ####################################################################### 45 | ################## configurations ################################## 46 | ####################################################################### 47 | 48 | CONFIG_45 = 'keras' # training backend: "keras" 49 | CONFIG_48 = 'Cartesian' # input data type 50 | CONFIG_76 = 'Cartesian' # output data type 51 | CONFIG_75 = get_mol_param([None, None, None, None, None]) # weights for the expected output (equivalent to modifying error functions) 52 | CONFIG_52 = 64 # number of copies we generate for data augmentation 53 | CONFIG_58 = True # use representative points for training (generated by clustering) 54 | CONFIG_59 = 1000 # number of representative points 55 | 56 | # CONFIG_49 = get_mol_param([5.0, 20.0, 40.0, 20.0, 20.0]) # scaling factor for output for Cartesian coordinates 57 | CONFIG_49 = get_mol_param([0.5, 2.0, 4.0, 2.0, 2.0]) # scaling factor for Cartesian coordinates, be careful about units 58 | CONFIG_1 = ['../target/' + CONFIG_30] # list of directories that contains all coordinates files 59 | 60 | CONFIG_57 = [ 61 | get_index_list_with_selection_statement('../resources/alanine_dipeptide.pdb', 'name C or name CH3 or name CA or name N'), 62 | # get_index_list_with_selection_statement('../resources/alanine_dipeptide.pdb', 'not name H*'), 63 | get_index_list_with_selection_statement('../resources/1l2y.pdb', 'backbone and not name O'), 64 | # get_index_list_with_selection_statement('../resources/2src.pdb', 'backbone and not name O'), 65 | get_index_list_with_selection_statement('../resources/2src.pdb', 66 | '(resid 144:170 or resid 44:58) and not name H*'), 67 | get_index_list_with_selection_statement('../resources/BetaHairpin.pdb', 'backbone and not name O'), 68 | get_index_list_with_selection_statement('../resources/C24.pdb', 'name C*') 69 | ] # index list of atoms for training and biased simulations 70 | 71 | CONFIG_73 = get_mol_param(['name C or name CH3 or name CA or name N', 'name CA', 72 | '(resid 144:170 or resid 44:58) and name CA', 'name CA', 'name C*' 73 | ]) # atom selection for calculating pairwise distances, used only when it is in 'pairwise_distance' mode 74 | temp_CONFIG_80 = get_index_list_with_selection_statement( 75 | get_mol_param(['../resources/alanine_dipeptide.pdb', '../resources/1l2y.pdb', 76 | '../resources/2src.pdb', '../resources/BetaHairpin.pdb', '../resources/C24.pdb']), CONFIG_73 77 | ) 78 | CONFIG_80 = [[temp_CONFIG_80[item_xx], temp_CONFIG_80[item_yy]] 79 | for item_xx in range(len(temp_CONFIG_80)) 80 | for 
item_yy in range(item_xx + 1, len(temp_CONFIG_80))] # pair index list for pairwise distances as input 81 | 82 | CONFIG_17 = ['Tanh', 'Tanh', 'Tanh'] # types of hidden layers 83 | CONFIG_78 = "Linear" # output layer type 84 | CONFIG_79 = True # determine dimensionality of input/output of autoencoder automatically 85 | if CONFIG_76 == 'cossin': 86 | CONFIG_4 = get_mol_param([ 87 | [.5,.4,0, True, [0.001, 0.001, 0.001, 0.001]] if CONFIG_17[1] == "Circular" else [0.3, 0.9, 0, True, [0.00, 0.1, 0.00, 0.00]] 88 | ]) 89 | elif CONFIG_76 == 'Cartesian' or CONFIG_76 == 'combined': 90 | CONFIG_4 = get_mol_param([ 91 | [.5, 0.5, 0, True, 0.0], 92 | [0.3, 0.9, 0, True, 0.0], 93 | [0.3, 0.9, 0, True, 0.0], 94 | [0.3, 0.9, 0, True, 0.0], 95 | [0.3, 0.9, 0, True, 0.0], 96 | ]) # [learning rates, momentum, learning rate decay, nesterov, regularization coeff] 97 | elif CONFIG_76 == 'pairwise_distance': 98 | CONFIG_4 = get_mol_param([ 99 | [0.3, 0.9, 0, True, 0.0], 100 | [1.5, 0.9, 0, True, 0.0], 101 | [1.5, 0.9, 0, True, 0.0], 102 | [0.7, 0.8, 0, True, 0.0] 103 | ]) 104 | else: raise Exception('error') 105 | 106 | CONFIG_5 = 50 # max number of training epochs 107 | CONFIG_6 = None # filename to save this network 108 | CONFIG_36 = 2 # dimensionality 109 | CONFIG_37 = 2 * CONFIG_36 if CONFIG_17[1] == "Circular" else CONFIG_36 # number of nodes in bottleneck layer 110 | 111 | 112 | CONFIG_71 = False # use mixed error function (for Trp_cage only) 113 | CONFIG_62 = get_mol_param([ 114 | ['../resources/alanine_dipeptide.pdb', '../resources/alanine_ref_1.pdb'], 115 | ['../resources/1l2y.pdb', '../resources/Trp_cage_ref_1.pdb'] if not CONFIG_71 else ['../resources/1l2y.pdb', '../resources/1l2y.pdb'], # mixed_err 116 | # ['../resources/2src.pdb', '../resources/2src.pdb'] 117 | ['../resources/2src.pdb'], 118 | ['../resources/BetaHairpin.pdb'], None 119 | ]) # list of reference file 120 | CONFIG_63 = get_mol_param([ 121 | ['', '_1'], 122 | ['', '_1'], 123 | [''], [''], [''] 124 | ] 125 | ) # suffix for each reference configuration 126 | CONFIG_61 = ['_aligned%s_coordinates.npy' % item 127 | for item in CONFIG_63] # alignment_coor_file_suffix_list (we use different suffix for aligned files with respect to different references) 128 | CONFIG_64 = get_mol_param([ 129 | ['backbone', 'backbone'], 130 | ['backbone', 'backbone'] if not CONFIG_71 else ['backbone and resid 2:8', 'backbone'], # mixed_err 131 | # ['backbone and resid 144:170', 'backbone and resid 44:58'] 132 | ['backbone'], 133 | ['backbone'] 134 | ]) # atom selection statement list for structural alignment 135 | CONFIG_55 = len(CONFIG_61) # number of reference configurations used in training 136 | 137 | CONFIG_3 = get_mol_param([ # the structure of ANN: number of nodes in each layer (input/output dim typically determined automatically) 138 | [21, 40, CONFIG_37, 40, 0], 139 | [0, 50, CONFIG_37, 50, 0], 140 | [861, 100, CONFIG_37, 100, 0], 141 | [0, 100, CONFIG_37, 100, 0], 142 | [0, 100, CONFIG_37, 100, 0], 143 | ]) 144 | 145 | if CONFIG_3[-1] == 0: CONFIG_3[-1] = CONFIG_3[0] 146 | 147 | CONFIG_74 = False # whether we start each biased simulation with nearest configuration or a fixed configuration 148 | CONFIG_40 = 'explicit' # whether to include water molecules, option: explicit, implicit, water_already_included, no_water 149 | CONFIG_51 = 'NVT' # simulation ensemble type 150 | CONFIG_42 = False # whether to enable force constant adjustable mode 151 | CONFIG_44 = False # whether to use hierarchical autoencoder 152 | CONFIG_77 = 2 # hierarchical autoencoder 
variant index 153 | CONFIG_13 = 3 # num of trainings to run, and pick best one 154 | CONFIG_31 = 10 # maximum number of failed simulations allowed in each iteration 155 | 156 | CONFIG_56 = get_mol_param([20, 8, 6, 6]) # number of biased simulations running in parallel 157 | CONFIG_14 = 50 # max number of jobs submitted each time 158 | CONFIG_29 = True if CONFIG_40 == 'explicit' else False # whether we need to remove the water molecules from pdb files 159 | CONFIG_50 = False # whether we need to preserve original file if water molecules are removed 160 | 161 | CONFIG_10 = 15 # num of bins for get_boundary_points() 162 | CONFIG_11 = 15 # num of boundary points 163 | 164 | CONFIG_39 = False # set the range of histogram automatically based on min,max values in each dimension 165 | CONFIG_41 = False # whether we reverse the order of sorting of diff_with_neighbors values in get_boundary algorithm 166 | 167 | if CONFIG_17[1] == "Circular": 168 | CONFIG_18 = True # whether we limit the boundary points to be between [-pi, pi], typically works for circularLayer 169 | CONFIG_26 = [[-np.pi, np.pi] for item in range(CONFIG_36)] # range of PCs, for circular case, it is typically [[-np.pi, np.pi],[-np.pi, np.pi]] 170 | elif CONFIG_17[1] == "Tanh": 171 | CONFIG_18 = False 172 | CONFIG_26 = [[-1, 1] for item in range(CONFIG_36)] 173 | else: 174 | raise Exception('Layer not defined') 175 | 176 | CONFIG_33 = CONFIG_3[0] # length of list of cos/sin values, equal to the number of nodes in input layer 177 | CONFIG_12 = '../target/' + CONFIG_30 # folder that contains all pdb files 178 | 179 | CONFIG_65 = "US" # default biasing method 180 | CONFIG_16 = get_mol_param([500, 5000, 2000, 2000]) # record interval (the frequency of writing system state into the file) 181 | CONFIG_8 = get_mol_param([50000, 500000, 200000, 200000]) # num of simulation steps 182 | CONFIG_72 = 0 # enable fast equilibration 183 | # following: for umbrella sampling 184 | CONFIG_9 = get_mol_param([5000, 2000, 3000, 3000]) # force constant for biased simulations 185 | CONFIG_53 = 'fixed' # use fixed/flexible force constants for biased simulation for each iteration 186 | CONFIG_54 = 2.50 * get_mol_param([30.0, 20.0, 15.0, 20.0, 20]) # max external potential energy allowed (in k_BT) 187 | # following: for metadynamics 188 | CONFIG_66 = 500 # pace of metadynamics 189 | CONFIG_67 = 2 # height of metadynamics 190 | CONFIG_68 = 0.1 # sigma of metadynamics 191 | CONFIG_69 = 0 # whether to use well-tempered version 192 | CONFIG_70 = 15 # biasfactor for well-tempered metadynamics 193 | 194 | CONFIG_21 = 300 # simulation temperature 195 | CONFIG_22 = 0.002 # simulation time step, in ps 196 | 197 | CONFIG_23 = get_mol_param(['CPU', 'CUDA', 'CUDA', 'CUDA', 'CUDA']) # simulation platform 198 | 199 | temp_home_directory = str(subprocess.check_output('echo $HOME', shell=True).strip().decode("utf-8")) 200 | if temp_home_directory == "/home/kengyangyao": 201 | CONFIG_24 = 'local' # machine to run the simulations 202 | CONFIG_25 = temp_home_directory + '/.anaconda2/lib/plugins' # this is the directory where the plugin is installed 203 | elif temp_home_directory == "/home/weichen9": 204 | CONFIG_24 = 'cluster' # machine to run the simulations 205 | CONFIG_25 = temp_home_directory + '/.my_softwares/openmm7/lib/plugins' 206 | elif temp_home_directory == "/u/sciteam/chen21": 207 | CONFIG_24 = 'cluster' 208 | CONFIG_25 = temp_home_directory + '/.openmm/lib/plugins' 209 | else: 210 | print('unknown user directory: %s' % temp_home_directory) 211 | 212 | CONFIG_27 = 
CONFIG_17[:2] # layer_types for ANN_Force, it should be consistent with autoencoder 213 | 214 | CONFIG_32 = 5000 # maximum force constant allowed (for force constant adjustable mode) 215 | CONFIG_34 = 500 # force constant step, the value by which the force constant is increased each time (for force constant adjustable mode) 216 | CONFIG_35 = 0.1 # distance tolerance, max distance allowed between center of data cloud and potential center (for force_constant_adjustable mode) 217 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/src/coordinates_data_files_list.py: -------------------------------------------------------------------------------- 1 | from config import * 2 | from helper_func import * 3 | 4 | class coordinates_data_files_list(object): 5 | def __init__(self, 6 | list_of_dir_of_coor_data_files = CONFIG_1, # list of directories that hold coordinates data files 7 | ): 8 | assert (isinstance(list_of_dir_of_coor_data_files, list)) # to avoid passing a plain string to the constructor 9 | self._list_of_dir_of_coor_data_files = list_of_dir_of_coor_data_files 10 | self._list_of_coor_data_files = [] 11 | 12 | for item in self._list_of_dir_of_coor_data_files: 13 | self._list_of_coor_data_files += subprocess.check_output('''find %s -name "*coordinates.npy"''' % item, shell=True).decode("utf-8").strip().split('\n') 14 | 15 | self._list_of_coor_data_files = list(set(self._list_of_coor_data_files)) # remove duplicates 16 | self._list_of_coor_data_files = [x for x in self._list_of_coor_data_files if os.stat(x).st_size > 0] # remove empty files 17 | self._list_of_coor_data_files.sort() # to be consistent 18 | self._list_num_frames = [np.load(_1).shape[0] for _1 in self._list_of_coor_data_files] 19 | 20 | return 21 | 22 | def create_sub_coor_data_files_list_using_filter_conditional(self, filter_conditional): 23 | """ 24 | :param filter_conditional: a lambda conditional expression on file names 25 | :return: a coordinates_data_files_list object 26 | """ 27 | temp_coor_files = list(filter(filter_conditional, self._list_of_coor_data_files)) 28 | return coordinates_data_files_list(temp_coor_files) 29 | 30 | def get_list_of_coor_data_files(self): 31 | return self._list_of_coor_data_files 32 | 33 | def get_coor_data(self, scaling_factor, format='npy'): 34 | result = np.concatenate([ 35 | Helper_func.load_npy(item, format=format) for item in self._list_of_coor_data_files], axis=0) / scaling_factor 36 | assert (sum(self._list_num_frames) == result.shape[0]) 37 | return result 38 | 39 | def get_list_of_corresponding_pdb_dcd(self): 40 | list_of_corresponding_pdb_files = [x.strip().replace('_coordinates.npy', '.pdb') for x in self.get_list_of_coor_data_files()] 41 | for item in range(len(list_of_corresponding_pdb_files)): 42 | if not os.path.exists(list_of_corresponding_pdb_files[item]): 43 | list_of_corresponding_pdb_files[item] = list_of_corresponding_pdb_files[item].replace('.pdb', '.dcd') 44 | try: 45 | assert os.path.exists(list_of_corresponding_pdb_files[item]) 46 | except: 47 | raise Exception('%s does not exist!'
% list_of_corresponding_pdb_files[item]) 48 | 49 | return list_of_corresponding_pdb_files 50 | 51 | def write_pdb_frames_into_file_with_list_of_coor_index(self, list_of_coor_index, out_file_name, verbose=True): 52 | """ 53 | This function picks several frames from pdb files and writes a new pdb file as output. 54 | We can use this together with the mouse-clicking callback implemented in the scatter plot: 55 | first we select a few points interactively in the scatter plot and get the corresponding indices in the data point 56 | list, then we find the corresponding pdb frames with those indices 57 | """ 58 | Helper_func.backup_rename_file_if_exists(out_file_name) 59 | list_of_coor_index.sort() 60 | pdb_files = self.get_list_of_corresponding_pdb_dcd() 61 | accum_sum = np.cumsum(np.array(self._list_num_frames)) # use cumulative sum to find corresponding pdb files 62 | for item in range(len(accum_sum)): 63 | if item == 0: 64 | temp_index_related_to_this_pdb_file = [x for x in list_of_coor_index if x < accum_sum[item]] 65 | else: 66 | temp_index_related_to_this_pdb_file = [x for x in list_of_coor_index if accum_sum[item - 1] <= x < accum_sum[item]] 67 | temp_index_related_to_this_pdb_file = [x - accum_sum[item - 1] for x in temp_index_related_to_this_pdb_file] 68 | temp_index_related_to_this_pdb_file.sort() 69 | 70 | if len(temp_index_related_to_this_pdb_file) != 0: 71 | if verbose: print(pdb_files[item]) 72 | with open(pdb_files[item], 'r') as in_file: 73 | content = in_file.read().split('MODEL')[1:] # remove header 74 | frames_to_use = [content[ii] for ii in temp_index_related_to_this_pdb_file] 75 | with open(out_file_name, 'a') as out_file: 76 | for frame in frames_to_use: 77 | out_file.write("MODEL" + frame) 78 | 79 | return 80 | 81 | def get_pdb_name_and_corresponding_frame_index_with_global_coor_index(self, coor_index): 82 | for item, temp_pdb in zip(self._list_num_frames, self.get_list_of_corresponding_pdb_dcd()): 83 | if coor_index < item: break 84 | else: coor_index -= item 85 | return temp_pdb, coor_index 86 | 87 | def concat_all_pdb_files(self, out_pdb_file): 88 | """ 89 | Why don't I use 'cat' in terminal? Because I want to keep the order consistent with the Python sort() function 90 | """ 91 | with open(out_pdb_file, 'w') as outfile: 92 | for fname in self.get_list_of_corresponding_pdb_dcd(): 93 | with open(fname) as infile: 94 | outfile.write(infile.read()) 95 | return 96 | 97 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/src/generate_coordinates.py: -------------------------------------------------------------------------------- 1 | from ANN_simulation import * 2 | import argparse, subprocess, os 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("mol_type", type=str, help="molecule type of the pdb files") 6 | parser.add_argument("--path", type=str, default="../target", help="specify the directory/file containing the pdb files") 7 | args = parser.parse_args() 8 | 9 | molecule_type = Sutils.create_subclass_instance_using_name(args.mol_type) 10 | temp_path = args.path 11 | 12 | if os.path.exists(temp_path): 13 | molecule_type.generate_coordinates_from_pdb_files(path_for_pdb=temp_path) 14 | else: 15 | print("%s does not exist!"
% temp_path) 16 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/src/helper_func.py: -------------------------------------------------------------------------------- 1 | from config import * 2 | from scipy.special import erf 3 | 4 | class Helper_func(object): 5 | def __init__(self): 6 | return 7 | 8 | @staticmethod 9 | def get_mutual_info_of_two_continuous_vars(temp_var_0, temp_var_1, bins=10, normalization=True): 10 | temp_hist_0, _ = np.histogramdd(temp_var_0, bins=bins) 11 | temp_hist_1, _ = np.histogramdd(temp_var_1, bins=bins) 12 | temp_hist_2, _ = np.histogramdd(np.array([temp_var_0, temp_var_1]).T, bins=bins) 13 | temp_hist_0 /= temp_hist_0.sum() 14 | temp_hist_1 /= temp_hist_1.sum() 15 | temp_hist_2 /= temp_hist_2.sum() 16 | result = np.sum([temp_hist_2[item_x, item_y] * np.log( 17 | temp_hist_2[item_x, item_y] / temp_hist_0[item_x] / temp_hist_1[item_y]) 18 | for item_x in range(bins) for item_y in range(bins) if temp_hist_2[item_x, item_y] != 0]) 19 | if normalization: 20 | entropy_0 = - np.sum(temp_hist_0 * np.log(temp_hist_0)) 21 | entropy_1 = - np.sum(temp_hist_1 * np.log(temp_hist_1)) 22 | result /= (0.5 * (entropy_0 + entropy_1)) 23 | return result 24 | 25 | @staticmethod 26 | def generate_alkane_residue_code_in_openmm_xml(num, name): 27 | print(''' 28 | 29 | 30 | 31 | ''' % name) 32 | for item in range(num - 2): 33 | print(''' 34 | 35 | ''' % (item + 2, item + 2, item + 2)) 36 | print(""" 37 | 38 | 39 | 40 | 41 | 42 | """ % (num, num, num, num)) 43 | for item in range(num - 1): 44 | print(""" 45 | 46 | """ % (item + 1, item + 2, item + 2, item + 2, item + 2, item + 2)) 47 | print(""" 48 | 49 | 50 | """ % (num, num)) 51 | return 52 | 53 | @staticmethod 54 | def check_center_of_mass_is_at_origin(result): 55 | coords_of_center_of_mass_after = [[np.average(result[item, ::3]), np.average(result[item, 1::3]), 56 | np.average(result[item, 2::3])] 57 | for item in range(result.shape[0])] 58 | return np.all(np.abs(np.array(coords_of_center_of_mass_after).flatten()) < 1e-5) 59 | 60 | @staticmethod 61 | def remove_translation(coords): # remove the translational degree of freedom 62 | if len(coords.shape) == 1: # convert 1D array (when there is only one coord) to 2D array 63 | coords = coords.reshape((1, coords.shape[0])) 64 | number_of_atoms = coords.shape[1] // 3 65 | coords_of_center_of_mass = [[np.average(coords[item, ::3]), np.average(coords[item, 1::3]), 66 | np.average(coords[item, 2::3])] * number_of_atoms 67 | for item in range(coords.shape[0])] 68 | result = coords - np.array(coords_of_center_of_mass) 69 | assert Helper_func.check_center_of_mass_is_at_origin(result) 70 | return result 71 | 72 | @staticmethod 73 | def get_gyration_tensor_and_principal_moments(coords): 74 | coords = Helper_func.remove_translation(coords) 75 | temp_coords = coords.reshape(coords.shape[0], coords.shape[1] // 3, 3) 76 | gyration = np.zeros((coords.shape[0], 3, 3)) 77 | for xx in range(3): 78 | for yy in range(3): 79 | gyration[:, xx, yy] = (temp_coords[:, :, xx] * temp_coords[:, :, yy]).mean(axis=-1) 80 | moments_gyration = np.linalg.eig(gyration)[0] 81 | moments_gyration.sort(axis=-1) 82 | return gyration, moments_gyration[:, ::-1] 83 | 84 | @staticmethod 85 | def get_norm_factor(rcut, sig): 86 | rcut2 = rcut*rcut 87 | sig2 = 2.0*sig*sig 88 | normconst = np.sqrt( np.pi * sig2 ) * erf( rcut / (sqrt(2.0)*sig) ) - 2*rcut* np.exp( - rcut2 / sig2 ) 89 | preerf = np.sqrt( 0.5 * np.pi * sig * sig ) / normconst 90 | 
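# Derivation note: the coarse-grained count below uses a Gaussian kernel truncated at rcut
# and shifted down so that it vanishes there,
#     k(r) = exp(-r^2 / (2*sig^2)) - exp(-rcut^2 / (2*sig^2)),   for |r| <= rcut,
# and integrating k(r) over [-rcut, rcut] gives exactly the normconst computed above:
#     sqrt(2*pi*sig^2) * erf(rcut / (sqrt(2)*sig)) - 2*rcut*exp(-rcut^2 / (2*sig^2)).
# preerf and prelinear are the prefactors of the erf and linear terms in the antiderivative
# of k(r)/normconst, which get_cg_count_in_sphere() uses to smooth the neighbor count near
# the cutoff boundary.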
prelinear = np.exp( - rcut2 / sig2 ) / normconst 91 | return normconst, preerf, prelinear 92 | 93 | @staticmethod 94 | def get_cg_count_in_sphere(dis, r_hi, rcut, sig): # get coarse grained counts 95 | # TODO: test if this function is correct 96 | normconst, preerf, prelinear = Helper_func.get_norm_factor(rcut, sig) 97 | hiMinus = r_hi - rcut 98 | hiPlus = r_hi + rcut 99 | count = np.float64((dis <= hiPlus).sum(axis=-1)) 100 | temp_in_boundary_region = ((dis > hiMinus) & (dis <= hiPlus)) 101 | temp_correction = ( 0.5 + preerf * erf( np.sqrt(0.5) * (dis - r_hi)/sig ) \ 102 | - prelinear * (dis - r_hi)) 103 | # print count.shape, temp_in_boundary_region.shape, temp_correction.shape 104 | count -= (temp_in_boundary_region * temp_correction).sum(axis=-1) 105 | actual_count = (dis < r_hi).sum(axis=-1) 106 | return count, actual_count 107 | 108 | @staticmethod 109 | def get_cg_count_in_shell(dis, r_low, r_hi, rcut, sig): 110 | cg_1, actual_1 = Helper_func.get_cg_count_in_sphere(dis, r_hi, rcut, sig) 111 | cg_2, actual_2 = Helper_func.get_cg_count_in_sphere(dis, r_low, rcut, sig) 112 | return cg_1 - cg_2, actual_1 - actual_2 113 | 114 | @staticmethod 115 | def get_cg_count_slice_representation(dis, r_shell_low, r_shell_high, num, rcut, sig): 116 | temp_r = np.linspace(r_shell_low, r_shell_high, num) 117 | r_low_list = temp_r[:-1] 118 | r_high_list = temp_r[1:] 119 | result = [Helper_func.get_cg_count_in_shell(dis, r_low, r_high, rcut, sig)[0] 120 | for (r_low, r_high) in zip(r_low_list, r_high_list)] 121 | return np.concatenate(result, axis=1), temp_r 122 | 123 | @staticmethod 124 | def get_box_length_list_fom_reporter_file(reporter_file, unit): # require unit explicitly 125 | reporter_file_content = np.loadtxt(reporter_file, delimiter=',', usecols=(6,)) # column 6 is volume of box 126 | if unit == 'nm': scaling_factor = 1 127 | elif unit == 'A': scaling_factor = 10 128 | return scaling_factor * np.cbrt(reporter_file_content) 129 | 130 | @staticmethod 131 | def compute_distances_min_image_convention(atoms_pos_1, atoms_pos_2, box_length_list): 132 | # note: box_length may be different for different frames when using NPT, typically is read from reporter file 133 | # shape of atoms_pos_{1,2}: (num of frames, num of atoms * 3) 134 | # output: distance matrix 135 | # why don't we use mdtraj? Because it requires large memory for loading large pdb files 136 | # why don't we use MDAnalysis? Because it is not fast enough (looping over trajectory would take long time) 137 | # this function is especially useful when both atoms_pos_1, atoms_pos_2 are not super long, while the number of frames is large, 138 | # since it vectorizes computation over frames 139 | temp_dis_2 = np.zeros((atoms_pos_1.shape[0], atoms_pos_1.shape[1] // 3, atoms_pos_2.shape[1] // 3)) 140 | for index_1 in range(atoms_pos_1.shape[1] // 3): 141 | # print index_1 142 | for index_2 in range(atoms_pos_2.shape[1] // 3): 143 | temp_diff = atoms_pos_1[:, 3 * index_1: 3 * index_1 + 3] - atoms_pos_2[:, 3 * index_2: 3 * index_2 + 3] 144 | temp_vec = np.array([(item + box_length_list / 2.0) % box_length_list - box_length_list / 2.0 for item in temp_diff.T]) 145 | temp_dis_2[:, index_1, index_2] = np.linalg.norm(temp_vec, axis=0) 146 | return temp_dis_2 147 | 148 | @staticmethod 149 | def get_index_list_of_O_atom_in_water(pdb_file, ignore_TER_line): 150 | """this is used for solvent analysis, e.g. 
biased simulation with PLUMED""" 151 | temp_u = Universe(pdb_file) 152 | atom_sel = temp_u.select_atoms('resname HOH and name O') 153 | if ignore_TER_line: return atom_sel.indices + 1 154 | else: raise Exception('double check your pdb') 155 | 156 | @staticmethod 157 | def get_distances_with_water_for_atom_list(pdb_file, atom_selection, box_length_list): 158 | # box_length information is stored in reporter_file 159 | temp_u = Universe(pdb_file) 160 | water_pos, atoms_pos = [], [] 161 | water_sel = temp_u.select_atoms('resname HOH and name O') 162 | atoms_sel = temp_u.select_atoms(atom_selection) 163 | for _ in temp_u.trajectory: 164 | water_pos.append(water_sel.positions.flatten()) 165 | atoms_pos.append(atoms_sel.positions.flatten()) 166 | atoms_pos = np.array(atoms_pos) 167 | water_pos = np.array(water_pos) 168 | distances = Helper_func.compute_distances_min_image_convention(atoms_pos_1=atoms_pos, atoms_pos_2=water_pos, 169 | box_length_list=box_length_list) 170 | return distances 171 | 172 | @staticmethod 173 | def get_list_of_cg_count_for_atom_list(pdb_file, atom_selection, box_length_list, r_low, r_hi, rcut, sig): 174 | """ cg = coarse grained, atom list is specified by atom_selection """ 175 | distances = Helper_func.get_distances_with_water_for_atom_list(pdb_file, atom_selection, box_length_list) 176 | return Helper_func.get_cg_count_in_shell(distances, r_low, r_hi, rcut, sig) 177 | 178 | @staticmethod 179 | def get_radial_distribution(distances, num, nbins, dr, length): 180 | hist = np.zeros(nbins, ) 181 | for item in distances: 182 | temp_target_index = int(item / dr) 183 | if temp_target_index < nbins: 184 | hist[temp_target_index] += 1.0 / (4 / 3.0 * np.pi) / ( 185 | ((temp_target_index + 1) * dr) ** 3 - ((temp_target_index + 0) * dr) ** 3) 186 | return hist / (num / length ** 3) 187 | 188 | @staticmethod 189 | def backup_rename_file_if_exists(filename): 190 | extension = '.' 
+ filename.split('.')[-1] 191 | if os.path.isfile(filename): # backup file if previous one exists 192 | new_filename = filename + ".bak_" + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + extension 193 | os.rename(filename, new_filename) 194 | else: new_filename = None 195 | return new_filename 196 | 197 | @staticmethod 198 | def attempt_to_save_npy(npy_file, npy_array): 199 | """when trying to save a npy array to a file, if it exists and contains a different value, 200 | then save to another file""" 201 | if npy_file.strip()[-4:] != '.npy': npy_file += '.npy' 202 | original_npy_file = npy_file 203 | index = 0 204 | while True: 205 | if os.path.isfile(npy_file): 206 | content = np.load(npy_file) 207 | if np.all(npy_array == content): 208 | break 209 | else: 210 | npy_file = original_npy_file.replace('.npy', '_%d.npy' % index) 211 | index += 1 212 | else: 213 | np.save(npy_file, npy_array) 214 | break 215 | return npy_file 216 | 217 | @staticmethod 218 | def run_multiple_jobs_on_local_machine(commands, num_of_jobs_in_parallel=CONFIG_56): 219 | total_num_failed_jobs = 0 220 | for item in range(int(len(commands) / num_of_jobs_in_parallel) + 1): 221 | temp_commands_parallel = commands[item * num_of_jobs_in_parallel: (item + 1) * num_of_jobs_in_parallel] 222 | print("running: \t" + '\n'.join(temp_commands_parallel)) 223 | procs_to_run_commands = [subprocess.Popen(_1.strip(), shell=True) for _1 in temp_commands_parallel] 224 | exit_codes = [p.wait() for p in procs_to_run_commands] 225 | total_num_failed_jobs += sum(exit_codes) 226 | return total_num_failed_jobs 227 | 228 | @staticmethod 229 | def shuffle_multiple_arrays(list_of_arrays): 230 | """can be used for shuffle training and validation set to improve sampling""" 231 | indices = np.arange(list_of_arrays[0].shape[0]) 232 | np.random.shuffle(indices) 233 | return [item[indices] for item in list_of_arrays] 234 | 235 | @staticmethod 236 | def find_indices_of_points_in_array_near_each_point_in_ref_list(point_list, ref_list, threshold_r): 237 | """used to find points near a specific point (in the reference list), useful for sampling structures 238 | in a pdb file that are near a specific point in CV space (result is the indices of pdb snapshots) 239 | """ 240 | return [np.where(np.linalg.norm(point_list - item, axis=1) < threshold_r)[0] 241 | for item in ref_list] 242 | 243 | @staticmethod 244 | def tica_inverse_transform(tica, data_list): 245 | from msmbuilder.decomposition import tICA 246 | assert (isinstance(tica, tICA)) 247 | result_list = [] 248 | for data in data_list: 249 | result = np.dot(tica.covariance_.T, np.dot(tica.components_.T, data.T)).T + tica.means_ 250 | assert_almost_equal(tica.transform([result])[0], data) 251 | result_list.append(result) 252 | return result_list 253 | 254 | @staticmethod 255 | def get_autocorr(x_list, lag_time): 256 | return np.corrcoef(np.array([x_list[0:len(x_list) - lag_time], x_list[lag_time:len(x_list)]]))[0, 1] 257 | 258 | @staticmethod 259 | def generate_sequence_with_constant_autocorrelation(constant_autocorrelation, length): 260 | traj_list = [np.random.normal()] 261 | for _ in range(length - 1): 262 | temp_value = np.random.normal(constant_autocorrelation * traj_list[-1], scale=1) 263 | traj_list.append(temp_value) 264 | return traj_list 265 | 266 | @staticmethod 267 | def load_object_from_pkl_file(file_path): 268 | try: 269 | result = pickle.load(open(file_path, 'rb')) 270 | except: # solve encoding issue for python2 -> python3 271 | with open(file_path, 'rb') as ff: 272 | result = 
pickle.load(ff, encoding='latin1') 273 | return result 274 | 275 | @staticmethod 276 | def load_npy(file, format): 277 | if format == 'txt': return np.loadtxt(file) 278 | elif format == 'npy': return np.load(file) 279 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/src/kernel_tica.py: -------------------------------------------------------------------------------- 1 | import numpy as np, pyemma as py 2 | # from msmbuilder.decomposition.tica import tICA 3 | from sklearn.kernel_approximation import Nystroem 4 | 5 | """modified from https://github.com/msmbuilder/msmbuilder/blob/master/msmbuilder/decomposition/ktica.py""" 6 | """reference: [1] Schwantes, Christian R., and Vijay S. Pande. J. Chem Theory Comput. 11.2 (2015): 600--608.""" 7 | 8 | class Kernel_tica(object): 9 | def __init__(self, n_components, lag_time, 10 | gamma, # gamma value for rbf kernel 11 | n_components_nystroem=100, # number of components for Nystroem kernel approximation 12 | landmarks = None, 13 | shrinkage = None, 14 | weights='empirical' # if 'koopman', use Koopman reweighting for tICA (see Wu, Hao, et al. "Variational Koopman models: slow collective variables and molecular kinetics from short off-equilibrium simulations." The Journal of Chemical Physics 146.15 (2017): 154104.) 15 | ): 16 | self._n_components = n_components 17 | self._lag_time = lag_time 18 | self._n_components_nystroem = n_components_nystroem 19 | self._landmarks = landmarks 20 | self._gamma = gamma 21 | self._nystroem = Nystroem(gamma=gamma, n_components=n_components_nystroem) 22 | self._weights = weights 23 | # self._tica = tICA(n_components=n_components, lag_time=lag_time, shrinkage=shrinkage) 24 | self._shrinkage = shrinkage 25 | return 26 | 27 | def fit(self, sequence_list): 28 | if self._landmarks is None: 29 | self._nystroem.fit(np.concatenate(sequence_list)) 30 | else: 31 | print("using landmarks") 32 | self._nystroem.fit(self._landmarks) 33 | sequence_transformed = [self._nystroem.transform(item) for item in sequence_list] 34 | # define tica object at fit() with sequence_list supplied for initialization, as it is required by 35 | # Koopman reweighting 36 | self._tica = py.coordinates.tica(sequence_transformed, lag=self._lag_time, 37 | dim=self._n_components, kinetic_map=True, 38 | weights=self._weights) 39 | return 40 | 41 | def transform(self, sequence_list): 42 | return self._tica.transform( 43 | [self._nystroem.transform(item) for item in sequence_list]) 44 | 45 | def fit_transform(self, sequence_list): 46 | self.fit(sequence_list) 47 | return self.transform(sequence_list) 48 | 49 | def score(self, sequence_list): 50 | model = self.__class__(n_components = self._n_components, lag_time=self._lag_time, gamma=self._gamma, 51 | n_components_nystroem=self._n_components_nystroem, landmarks=self._landmarks, 52 | shrinkage=self._shrinkage) 53 | model.fit(sequence_list) 54 | return np.sum(model._tica.eigenvalues) 55 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/src/main_work.py: -------------------------------------------------------------------------------- 1 | from ANN_simulation import * 2 | import argparse 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--starting_index", type=int, default=1, help="index of starting iteration") 6 | parser.add_argument("--num_of_iterations", type=int, default=10, help="number of iterations to run") 7 | 
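# Example invocation (a sketch; the .pkl filename below is illustrative, not a file shipped with the repo):
#     python main_work.py --starting_index 1 --num_of_iterations 10 \
#         --starting_network_file a_previously_saved_autoencoder.pkl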
parser.add_argument("--starting_network_file", type=str, default=None, help="the network to start with") 8 | parser.add_argument("--training_interval", type=int, default=1, help="training interval") 9 | args = parser.parse_args() 10 | 11 | if args.starting_network_file is None: 12 | starting_network = None 13 | else: 14 | starting_network = autoencoder.load_from_pkl_file(args.starting_network_file) 15 | 16 | init_iter = iteration(index = args.starting_index, network = starting_network) 17 | 18 | a = simulation_with_ANN_main(num_of_iterations = args.num_of_iterations, initial_iteration = init_iter, training_interval=args.training_interval) 19 | a.run_mult_iterations() 20 | 21 | print("Done main work!") 22 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/src/molecule_spec_sutils.py: -------------------------------------------------------------------------------- 1 | """Sutils: simulation unilities, some of them are molecule-specific (implemented as methods in subclasses) 2 | """ 3 | 4 | from config import * 5 | import random, mdtraj as md 6 | from coordinates_data_files_list import * 7 | from sklearn.cluster import KMeans 8 | from helper_func import * 9 | from functools import reduce 10 | 11 | class Sutils(object): 12 | def __init__(self): 13 | return 14 | 15 | @staticmethod 16 | def get_num_of_non_overlapping_hyperspheres_that_filled_explored_phase_space( 17 | pdb_file_list, atom_selection, radius, step_interval=1, shuffle_list=True, 18 | distance_metric='RMSD'): 19 | """ 20 | This functions is used to count how many non-overlapping hyperspheres are needed to fill the explored phase 21 | space, to estimate volumn of explored region 22 | :param atom_selection: atom selection statement for MDAnalysis 23 | :param radius: radius of hyperspheres 24 | :param distance_metric: distance metric of two frames 25 | :return: number of hyperspheres 26 | """ 27 | if shuffle_list: random.shuffle(pdb_file_list) 28 | index = 0 29 | positions_list = [] 30 | for sample_file in pdb_file_list: 31 | sample = Universe(sample_file) 32 | sample_atom_selection = sample.select_atoms(atom_selection) 33 | frame_index_list = list(range(sample.trajectory.n_frames)) 34 | if shuffle_list: random.shuffle(frame_index_list) 35 | for item_index in frame_index_list: 36 | sample.trajectory[item_index] 37 | if index % step_interval == 0: 38 | current_positions = sample_atom_selection.positions 39 | distances_to_previous_frames = np.array( 40 | [Sutils.get_RMSD_after_alignment(item, current_positions) 41 | for item in positions_list]) 42 | if len(distances_to_previous_frames) == 0 or np.all(distances_to_previous_frames > radius): 43 | # need to include a new hypershere 44 | positions_list.append(current_positions) 45 | 46 | index += 1 47 | 48 | return len(positions_list), np.array(positions_list) 49 | 50 | @staticmethod 51 | def mark_and_modify_pdb_for_calculating_RMSD_for_plumed(pdb_file, out_pdb, 52 | atom_index_list, start_idx, item_positions=None): 53 | """ 54 | :param pdb_file: input pdb 55 | :param out_pdb: output reference pdb 56 | :param atom_index_list: index list used to calculate RMSD 57 | :param item_positions: reference positions of selected atoms, set it None if we do not want to modify positions 58 | """ 59 | indices = np.array(atom_index_list) - start_idx # explicitly specify start_idx, to avoid confusion 60 | temp_sample = Universe(pdb_file) 61 | temp_atoms = temp_sample.select_atoms('all') 62 | if not item_positions is None: 63 | 
item_positions = item_positions.reshape((item_positions.shape[0] // 3, 3)) 64 | temp_positions = temp_atoms.positions 65 | temp_positions[indices] = item_positions 66 | temp_atoms.positions = temp_positions 67 | 68 | temp_bfactors = np.zeros(len(temp_atoms)) 69 | temp_bfactors[indices] = 1 70 | temp_atoms.tempfactors = temp_bfactors 71 | temp_atoms.occupancies = temp_bfactors 72 | temp_atoms.write(out_pdb) 73 | return out_pdb 74 | 75 | @staticmethod 76 | def get_plumed_script_that_generate_a_segment_connecting_two_configs( 77 | pdb_1, pdb_2, atom_selection_statement, num_steps, force_constant): 78 | """ 79 | This function uses targeted MD to generate a segment connecting two configurations 80 | :param pdb_1, pdb_2: two ends of the segment 81 | :param atom_selection_statement: atoms for calculating RMSD in targeted MD 82 | """ 83 | atom_list = get_index_list_with_selection_statement(pdb_1, atom_selection_statement) 84 | ref_pdb = pdb_2.replace('.pdb', '_ref.pdb') 85 | Sutils.mark_and_modify_pdb_for_calculating_RMSD_for_plumed(pdb_2, ref_pdb, atom_list, 1) # start_idx = 1, since atom_list from get_index_list_with_selection_statement is 1-based 86 | rmsd_diff = Sutils.metric_RMSD_of_atoms([pdb_1], ref_file=ref_pdb, 
atom_selection_statement=atom_selection_statement, step_interval=100)[0] # TODO: check units 87 | plumed_script = """rmsd: RMSD REFERENCE=%s TYPE=OPTIMAL 88 | restraint: MOVINGRESTRAINT ARG=rmsd AT0=%f STEP0=0 KAPPA0=%f AT1=0 STEP1=%d KAPPA1=%f 89 | PRINT STRIDE=500 ARG=* FILE=COLVAR 90 | """ % (ref_pdb, rmsd_diff, force_constant, num_steps, force_constant) 91 | return plumed_script 92 | 93 | @staticmethod 94 | def prepare_output_Cartesian_coor_with_multiple_ref_structures( 95 | folder_list, 96 | alignment_coor_file_suffix_list, 97 | scaling_factor 98 | ): 99 | my_coor_data_obj = coordinates_data_files_list(list_of_dir_of_coor_data_files=folder_list) 100 | coor_data_obj_input = my_coor_data_obj.create_sub_coor_data_files_list_using_filter_conditional( 101 | lambda x: not 'aligned' in x) 102 | assert (len(alignment_coor_file_suffix_list) == CONFIG_55) 103 | coor_data_obj_output_list = [my_coor_data_obj.create_sub_coor_data_files_list_using_filter_conditional( 104 | lambda x: item in x) for item in alignment_coor_file_suffix_list] 105 | 106 | for item in range(len(alignment_coor_file_suffix_list)): 107 | for _1, _2 in zip(coor_data_obj_input.get_list_of_coor_data_files(), 108 | coor_data_obj_output_list[item].get_list_of_coor_data_files()): 109 | assert (_2 == _1.replace('_coordinates.npy', alignment_coor_file_suffix_list[item])), (_2, _1) 110 | 111 | output_data_set = np.concatenate([Sutils.remove_translation(item.get_coor_data(scaling_factor)) 112 | for item in coor_data_obj_output_list], axis=1) 113 | return output_data_set 114 | 115 | @staticmethod 116 | def select_representative_points(data_set, output_data_set): 117 | # clustering, pick representative points for training, two purposes: 118 | # 1. avoid training results that are too good for densely-sampled regions, but bad for others. 119 | # 2. 
reduce computation cost 121 | print ("selecting representative points...") 122 | kmeans = KMeans(init='k-means++', n_clusters=min(CONFIG_59, output_data_set.shape[0]), n_init=10) 123 | kmeans.fit(output_data_set) 124 | indices_of_representative_points = np.array([np.where(kmeans.labels_ == ii)[0][0] 125 | for ii in range(kmeans.n_clusters)]) 126 | return data_set[indices_of_representative_points], output_data_set[indices_of_representative_points] 127 | 128 | @staticmethod 129 | def create_subclass_instance_using_name(name): 130 | return {'Alanine_dipeptide': Alanine_dipeptide(), 'Trp_cage': Trp_cage()}[name] 131 | 132 | @staticmethod 133 | def load_object_from_pkl_file(file_path): 134 | return Helper_func.load_object_from_pkl_file(file_path) 135 | 136 | @staticmethod 137 | def write_some_frames_into_a_new_file_based_on_index_list_for_pdb_file_list(list_of_files, index_list, new_pdb_file_name): 138 | print("note that order may not be preserved!") 139 | remaining_index_list = index_list 140 | for _1 in list_of_files: 141 | remaining_index_list = Sutils.write_some_frames_into_a_new_file_based_on_index_list(_1, remaining_index_list, new_pdb_file_name) 142 | 143 | # check number of frames to be correct 144 | with open(new_pdb_file_name, 'r') as f_in: 145 | content = f_in.read().strip().split('MODEL')[1:] 146 | assert (len(content) == len(index_list)), (len(content), len(index_list)) 147 | 148 | return 149 | 150 | @staticmethod 151 | def write_some_frames_into_a_new_file_based_on_index_list(pdb_file_name, index_list, new_pdb_file_name=None, 152 | overwrite=False): 153 | if os.stat(pdb_file_name).st_size > 1000000000: raise Exception('file may be too large, try to use other tools') 154 | 155 | if new_pdb_file_name is None: 156 | new_pdb_file_name = pdb_file_name.strip().split('.pdb')[0] + '_someframes.pdb' 157 | 158 | with open(pdb_file_name, 'r') as f_in: 159 | content = [item for item in f_in.readlines() if (not 'REMARK' in item) and (not 'END\n' in item)] 160 | content = ''.join(content) 161 | content = content.split('MODEL')[1:] # remove header 162 | num_of_frames_in_current_file = len(content) 163 | index_for_this_file = [_2 for _2 in index_list if _2 < num_of_frames_in_current_file] 164 | remaining_index_list = [_2 - num_of_frames_in_current_file for _2 in index_list if 165 | _2 >= num_of_frames_in_current_file] 166 | content_to_write = [content[_2] for _2 in index_for_this_file] 167 | 168 | write_flag = 'w' if overwrite else 'a' 169 | with open(new_pdb_file_name, write_flag) as f_out: 170 | for item in content_to_write: 171 | f_out.write("MODEL") 172 | f_out.write(item) 173 | 174 | return remaining_index_list 175 | 176 | @staticmethod 177 | def concat_first_frame_in_all_pdb_files(list_of_pdb_files, new_pdb_file_name): 178 | for item in list_of_pdb_files: 179 | Sutils.write_some_frames_into_a_new_file_based_on_index_list(item, [0], new_pdb_file_name) 180 | return 181 | 182 | @staticmethod 183 | def write_some_frames_into_a_new_file(pdb_file_name, start_index, end_index, step_interval = 1, # start_index included, end_index not included 184 | new_pdb_file_name=None, method=1): 185 | print('writing frames of %s: [%d:%d:%d]...' 
% (pdb_file_name, start_index, end_index, step_interval)) 186 | if new_pdb_file_name is None: 187 | new_pdb_file_name = pdb_file_name.strip().split('.pdb')[0] + '_frame_%d_%d_%d.pdb' % (start_index, end_index, step_interval) 188 | 189 | if method == 0: 190 | if os.stat(pdb_file_name).st_size > 1000000000: raise Exception('file may be too large, try to use other tools') 191 | with open(pdb_file_name, 'r') as f_in: 192 | content = [item for item in f_in.readlines() if (not 'REMARK' in item) and (not 'END\n' in item)] 193 | content = ''.join(content) 194 | content = content.split('MODEL')[1:] # remove header 195 | if end_index == 0: 196 | content_to_write = content[start_index::step_interval] # for selecting last few frames 197 | else: 198 | content_to_write = content[start_index:end_index:step_interval] 199 | 200 | with open(new_pdb_file_name, 'w') as f_out: 201 | for item in content_to_write: 202 | f_out.write("MODEL") 203 | f_out.write(item) 204 | elif method == 1: 205 | index = -1 206 | with open(pdb_file_name, 'r') as f_in, open(new_pdb_file_name, 'w') as f_out: 207 | for item in f_in: 208 | if 'MODEL' in item: index += 1 209 | if (not 'REMARK' in item) and (not 'END\n' in item) and (index % step_interval == 0) \ 210 | and ( 211 | (end_index != 0 and (start_index <= index < end_index)) 212 | or (end_index == 0 and index >= start_index)): 213 | f_out.write(item) 214 | return 215 | 216 | @staticmethod 217 | def data_augmentation(data_set, output_data_set, num_of_copies, is_output_reconstructed_Cartesian=True): 218 | """ 219 | assume that center of mass motion of data_set and output_data_set should be removed. 220 | """ 221 | assert (Sutils.check_center_of_mass_is_at_origin(data_set)) 222 | if is_output_reconstructed_Cartesian: 223 | assert (Sutils.check_center_of_mass_is_at_origin(output_data_set)) 224 | 225 | num_of_data = data_set.shape[0] 226 | output_data_set = np.array(output_data_set.tolist() * num_of_copies) 227 | num_atoms = len(data_set[0]) // 3 228 | data_set = data_set.reshape((num_of_data, num_atoms, 3)) 229 | temp_data_set = [] 230 | for _ in range(num_of_copies): 231 | temp_data_set.append([Sutils.rotating_randomly_around_center_of_mass(x) for x in data_set]) 232 | 233 | data_set = np.concatenate(temp_data_set, axis=0) 234 | data_set = data_set.reshape((num_of_copies * num_of_data, num_atoms * 3)) 235 | return data_set, output_data_set 236 | 237 | @staticmethod 238 | def check_center_of_mass_is_at_origin(result): 239 | return Helper_func.check_center_of_mass_is_at_origin(result=result) 240 | 241 | @staticmethod 242 | def remove_translation(coords): # remove the translational degree of freedom 243 | return Helper_func.remove_translation(coords=coords) 244 | 245 | @staticmethod 246 | def rotating_randomly_around_center_of_mass(coords): 247 | axis_vector = np.random.uniform(0, 1, 3) 248 | angle = np.random.uniform(0, 2 * np.pi) 249 | return Sutils.rotating_around_center_of_mass(coords, axis_vector, angle) 250 | 251 | @staticmethod 252 | def rotating_around_center_of_mass(coords, axis_vector, angle): 253 | center_of_mass = coords.mean(axis=0) 254 | return Sutils.rotating_coordinates(coords, center_of_mass, axis_vector, angle) 255 | 256 | @staticmethod 257 | def rotating_coordinates(coords, fixed_coord, axis_vector, angle): 258 | indices_atoms = list(range(len(coords))) 259 | return Sutils.rotating_group_of_atoms(coords, indices_atoms, fixed_coord, axis_vector, angle) 260 | 261 | @staticmethod 262 | def rotating_group_of_atoms(coords, indices_atoms, fixed_coord, axis_vector, 
angle): 263 | """ 264 | :param coords: coordinates of all atoms 265 | :param indices_atoms: indices of atoms to rotate 266 | :param fixed_coord: coordinates of fixed point 267 | :param axis_vector: rotation axis 268 | :param angle: rotation angle 269 | :return: coordinates of all atoms after rotation 270 | """ 271 | result = copy.deepcopy(coords) # avoid modifying original input 272 | temp_coords = coords[indices_atoms] - fixed_coord # coordinates for rotation 273 | temp_coords = np.array(temp_coords) 274 | cos_value = np.cos(angle); sin_value = np.sin(angle) 275 | axis_vector_length = np.sqrt(np.sum(np.array(axis_vector) ** 2)) 276 | ux = axis_vector[0] / axis_vector_length; uy = axis_vector[1] / axis_vector_length; uz = axis_vector[2] / axis_vector_length 277 | rotation_matrix = np.array([[cos_value + ux ** 2 * (1 - cos_value), 278 | ux * uy * (1 - cos_value) - uz * sin_value, 279 | ux * uz * (1 - cos_value) + uy * sin_value], 280 | [ux * uy * (1 - cos_value) + uz * sin_value, 281 | cos_value + uy ** 2 * (1 - cos_value), 282 | uy * uz * (1 - cos_value) - ux * sin_value], 283 | [ux * uz * (1 - cos_value) - uy * sin_value, 284 | uy * uz * (1 - cos_value) + ux * sin_value, 285 | cos_value + uz ** 2 * (1 - cos_value)]]) 286 | result[indices_atoms] = np.dot(temp_coords, rotation_matrix) + fixed_coord 287 | return result 288 | 289 | @staticmethod 290 | def _generate_coordinates_from_pdb_files(atom_index, file_path=CONFIG_12, format='npy'): 291 | atom_index = [int(_1) for _1 in atom_index] 292 | atom_index = np.array(atom_index) - 1 # note that atom index starts from 1 293 | filenames = subprocess.check_output([ 294 | 'find', file_path, '-name', '*.pdb', '-o', '-name', '*.dcd']).decode("utf-8").strip().split('\n') 295 | output_file_list = [] 296 | 297 | for input_file in filenames: 298 | output_file = input_file[:-4] + '_coordinates.' 
+ format 299 | 300 | output_file_list += [output_file] 301 | if os.path.exists(output_file) and os.path.getmtime(input_file) < os.path.getmtime(output_file): # check modified time 302 | print("coordinate file already exists: %s (remove previous one if needed)" % output_file) 303 | else: 304 | print('generating coordinates of ' + input_file) 305 | mdxyz = md.load(input_file, top=CONFIG_62[0]).xyz 306 | mdxyz = mdxyz[:, atom_index, :].reshape(mdxyz.shape[0], len(atom_index) * 3) 307 | if format == 'txt': np.savetxt(output_file, mdxyz) 308 | elif format == 'npy': np.save(output_file, mdxyz) 309 | 310 | print("Done generating coordinates files\n") 311 | return output_file_list 312 | 313 | @staticmethod 314 | def _get_plumed_script_with_pairwise_dis_as_input(index_atoms, scaling_factor): 315 | return Plumed_helper.get_pairwise_dis(index_atoms, scaling_factor=scaling_factor, 316 | unit_scaling=1.0, out_var_prefix='l_0_out_') 317 | 318 | @staticmethod 319 | def remove_water_mol_and_Cl_from_pdb_file(folder_for_pdb = CONFIG_12, preserve_original_file=True): 320 | """ 321 | This is used to remove water molecules from pdb files, purposes: 322 | - save storage space 323 | - reduce processing time of pdb files 324 | """ 325 | filenames = subprocess.check_output(['find', folder_for_pdb, '-name', '*.pdb']).decode("utf-8").split('\n')[:-1] 326 | for item in filenames: 327 | print('removing water molecules from pdb file: ' + item) 328 | output_file = item[:-4] + '_rm_tmp.pdb' 329 | is_line_removed_flag = False 330 | with open(item, 'r') as f_in, open(output_file, 'w') as f_out: 331 | for line in f_in: 332 | if not 'HOH' in line and not 'CL' in line and not "NA" in line and not 'SPC' in line and not 'pseu' in line: 333 | f_out.write(line) 334 | else: is_line_removed_flag = True 335 | 336 | if not preserve_original_file: 337 | if is_line_removed_flag: 338 | subprocess.check_output(['mv', output_file, item]) 339 | else: 340 | subprocess.check_output(['rm', output_file]) 341 | 342 | print('Done removing water molecules from all pdb files!') 343 | return 344 | 345 | @staticmethod 346 | def get_boundary_points(list_of_points, 347 | range_of_PCs = CONFIG_26, 348 | num_of_bins = CONFIG_10, 349 | num_of_boundary_points = CONFIG_11, 350 | is_circular_boundary = CONFIG_18, 351 | preprocessing = True, 352 | auto_range_for_histogram = CONFIG_39, # set the range of histogram based on min,max values in each dimension 353 | reverse_sorting_mode = CONFIG_41 # whether we reverse the order of sorting of diff_with_neighbors values 354 | ): 355 | """ 356 | :param preprocessing: if True, apply the nonlinear transform -exp(-count) to the histogram (see below), which gives sparsely-sampled bins more weight; empirically this works better than using raw counts 357 | """ 358 | dimensionality = len(list_of_points[0]) 359 | list_of_points = list(zip(*list_of_points)) 360 | assert (len(list_of_points) == dimensionality) 361 | 362 | if is_circular_boundary or not auto_range_for_histogram: 363 | hist_matrix, edges = np.histogramdd(list_of_points, bins= num_of_bins * np.ones(dimensionality), range = range_of_PCs) 364 | else: 365 | temp_hist_range = [[min(item) - (max(item) - min(item)) / (num_of_bins - 2), max(item) + (max(item) - min(item)) / (num_of_bins - 2)]\ 366 | for item in list_of_points] 367 | hist_matrix, edges = np.histogramdd(list_of_points, bins=num_of_bins * np.ones(dimensionality), range=temp_hist_range) 368 | 369 | # following is the main algorithm to find boundary and holes 370 | # simply find the points that are lower than the average of their neighbors (two per dimension) 371 | 372 | if preprocessing: 373 | hist_matrix = np.array([[- np.exp(-
374 | 
375 |         if is_circular_boundary:  # typically works for circular autoencoder
376 |             diff_with_neighbors = hist_matrix - 1.0 / (2 * dimensionality) \
377 |                                   * sum(
378 |                                       [np.roll(hist_matrix, 1, axis=x) + np.roll(hist_matrix, -1, axis=x) for x in list(range(dimensionality))]
379 |                                   )
380 |         else:
381 |             # TODO: code not concise and general enough, fix this later
382 |             diff_with_neighbors = np.zeros(hist_matrix.shape)
383 |             temp_1 = [list(range(item)) for item in hist_matrix.shape]
384 |             for grid_index in itertools.product(*temp_1):
385 |                 neighbor_index_list = [(np.array(grid_index) + temp_2).astype(int) for temp_2 in np.eye(dimensionality)]
386 |                 neighbor_index_list += [(np.array(grid_index) - temp_2).astype(int) for temp_2 in np.eye(dimensionality)]
387 |                 neighbor_index_list = [x for x in neighbor_index_list if np.all(x >= 0) and np.all(x < num_of_bins)]
388 |                 # print "grid_index = %s" % str(grid_index)
389 |                 # print "neighbor_index_list = %s" % str(neighbor_index_list)
390 |                 diff_with_neighbors[tuple(grid_index)] = hist_matrix[tuple(grid_index)] - np.average(
391 |                     [hist_matrix[tuple(temp_2)] for temp_2 in neighbor_index_list]
392 |                 )
393 | 
394 |         # get grid centers
395 |         edge_centers = [0.5 * (np.array(x[1:]) + np.array(x[:-1])) for x in edges]
396 |         grid_centers = np.array(list(itertools.product(*edge_centers)))  # "itertools.product" gives Cartesian/direct product of several lists
397 |         grid_centers = np.reshape(grid_centers, np.append(num_of_bins * np.ones(dimensionality), dimensionality).astype(int))
398 |         # print grid_centers
399 | 
400 |         potential_centers = []
401 | 
402 |         # now sort the under-sampled grid cells (those whose value is lower than the
403 |         # average of their neighbors), ordered by how much lower they are
404 | 
405 |         temp_separate_index = []
406 | 
407 |         for _ in range(dimensionality):
408 |             temp_separate_index.append(list(range(num_of_bins)))
409 | 
410 |         index_of_grids = list(itertools.product(
411 |             *temp_separate_index
412 |         ))
413 | 
414 |         index_of_grids = [x for x in index_of_grids if diff_with_neighbors[x] < 0]   # only apply to grids with diff_with_neighbors value < 0
415 |         sorted_index_of_grids = sorted(index_of_grids, key = lambda x: diff_with_neighbors[x])   # sort based on diff_with_neighbors, return index values
416 |         if reverse_sorting_mode:
417 |             sorted_index_of_grids.reverse()
418 | 
419 |         for index in sorted_index_of_grids[:num_of_boundary_points]:  # note index can be of dimension >= 2
420 |             temp_potential_center = [round(x, 2) for x in grid_centers[index]]
421 |             potential_centers.append(temp_potential_center)
422 | 
423 |         return potential_centers
424 | 
425 |     @staticmethod
426 |     def L_method(evaluation_values, num):
427 |         evaluation_values = np.array(evaluation_values)
428 |         num = np.array(num)
429 |         assert (evaluation_values.shape == num.shape)
430 |         min_weighted_err = float('inf')
431 |         optimal_num = 0
432 |         best_regr = None
433 |         for item in range(1, len(num) - 1):
434 |             y_left = evaluation_values[:item]
435 |             x_left = num[:item].reshape(item, 1)
436 |             y_right = evaluation_values[item - 1:]
437 |             x_right = num[item - 1:].reshape(len(num) - item + 1, 1)
438 |             regr_left = linear_model.LinearRegression()
439 |             regr_left.fit(x_left, y_left)
440 |             y_left_pred = regr_left.predict(x_left)
441 |             regr_right = linear_model.LinearRegression()
442 |             regr_right.fit(x_right, y_right)
443 |             y_right_pred = regr_right.predict(x_right)
444 | 
445 |             err_left = mean_squared_error(y_left, y_left_pred)
446 |             err_right = mean_squared_error(y_right, y_right_pred)
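            # the two fits above implement the L-method: for every candidate split point,
            # fit one line to the left segment and one to the right segment, then (below)
            # keep the split with the smallest size-weighted total fitting error -- the
            # location of the "knee" of the evaluation curve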
weighted_err = (err_left * item + err_right * (len(num) - item + 1)) / (len(num) + 1) 448 | if weighted_err < min_weighted_err: 449 | optimal_num = num[item] 450 | min_weighted_err = weighted_err 451 | best_regr = [regr_left, regr_right] 452 | 453 | x_data = np.linspace(min(num), max(num), 100).reshape(100, 1) 454 | y_data_left = best_regr[0].predict(x_data) 455 | y_data_right = best_regr[1].predict(x_data) 456 | 457 | return optimal_num, x_data, y_data_left, y_data_right 458 | 459 | @staticmethod 460 | def get_RMSD_after_alignment(position_1, position_2): 461 | return rmsd(position_1, position_2, center=True, superposition=True) 462 | 463 | @staticmethod 464 | def metric_RMSD_of_atoms(list_of_files, ref_file='../resources/1l2y.pdb', ref_index=0, 465 | atom_selection_statement="name CA", step_interval=1): 466 | """ 467 | :param atom_selection_statement: could be either 468 | - "name CA" for alpha-carbon atoms only 469 | - "protein" for all atoms 470 | - "backbone" for backbone atoms 471 | - others: see more information here: https://pythonhosted.org/MDAnalysis/documentation_pages/selections.html 472 | """ 473 | ref = Universe(ref_file) 474 | ref_atom_selection = ref.select_atoms(atom_selection_statement) 475 | ref.trajectory[ref_index] 476 | ref_positions = ref_atom_selection.positions 477 | result_rmsd_of_atoms = [] 478 | index = 0 479 | 480 | for sample_file in list_of_files: 481 | sample = Universe(ref_file, sample_file) 482 | sample_atom_selection = sample.select_atoms(atom_selection_statement) 483 | 484 | for _ in sample.trajectory: 485 | if index % step_interval == 0: 486 | result_rmsd_of_atoms.append(Sutils.get_RMSD_after_alignment(ref_positions, 487 | sample_atom_selection.positions)) 488 | 489 | index += 1 490 | return np.array(result_rmsd_of_atoms) 491 | 492 | @staticmethod 493 | def get_positions_from_list_of_pdb(pdb_file_list, atom_selection_statement='name CA'): 494 | positions = [] 495 | for sample_file in pdb_file_list: 496 | sample = Universe(sample_file) 497 | sample_atom_selection = sample.select_atoms(atom_selection_statement) 498 | for _ in sample.trajectory: 499 | positions.append(sample_atom_selection.positions) 500 | return positions 501 | 502 | @staticmethod 503 | def get_RMSD_of_a_point_wrt_neighbors_in_PC_space_with_list_of_pdb(PCs, pdb_file_list, radius=0.1): 504 | """This function calculates RMSD of a configuration with respect to its neighbors in PC space, 505 | the purpose is to see if similar structures (small RMSD) are projected to points close to each other 506 | in PC space. 
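        A hypothetical usage sketch (file names and values are illustrative only):

            PCs = np.array([[0.1, 0.2], [0.12, 0.21], [0.9, 0.8]])   # one row per frame
            pdb_list = ['out_0.pdb', 'out_1.pdb']    # must contain the same frames, in the same order
            avg = Sutils.get_RMSD_of_a_point_wrt_neighbors_in_PC_space_with_list_of_pdb(PCs, pdb_list, radius=0.1)

        Small values indicate that neighbors in PC space are also structurally similar.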
507 | wrt = with respect to 508 | """ 509 | from sklearn.metrics.pairwise import euclidean_distances 510 | positions = Sutils.get_positions_from_list_of_pdb(pdb_file_list) 511 | pairwise_dis_in_PC = euclidean_distances(PCs) 512 | neighbor_matrix = pairwise_dis_in_PC < radius 513 | RMSD_diff_of_neighbors = np.zeros(neighbor_matrix.shape) 514 | for ii in range(len(PCs)): 515 | for jj in range(ii + 1, len(PCs)): 516 | if neighbor_matrix[ii][jj]: 517 | RMSD_diff_of_neighbors[ii, jj] = RMSD_diff_of_neighbors[jj, ii] \ 518 | = Sutils.get_RMSD_after_alignment(positions[ii], positions[jj]) 519 | average_RMSD_wrt_neighbors = [np.average([x for x in RMSD_diff_of_neighbors[ii] if x]) 520 | for ii in range(len(PCs))] 521 | return average_RMSD_wrt_neighbors 522 | 523 | @staticmethod 524 | def get_pairwise_distance_matrices_of_selected_atoms(list_of_files, step_interval=1, atom_selection='name CA'): 525 | distances_list = [] 526 | index = 0 527 | for sample_file in list_of_files: 528 | sample = Universe(sample_file) 529 | sample_atom_selection = sample.select_atoms(atom_selection) 530 | for _ in sample.trajectory: 531 | if index % step_interval == 0: 532 | distances_list.append( 533 | distance_array(sample_atom_selection.positions, sample_atom_selection.positions)) 534 | 535 | index += 1 536 | 537 | return np.array(distances_list) 538 | 539 | @staticmethod 540 | def get_non_repeated_pairwise_distance(list_of_files, step_interval=1, atom_selection='name CA'): 541 | """each element in this result is a list, not a matrix""" 542 | dis_matrix_list = Sutils.get_pairwise_distance_matrices_of_selected_atoms(list_of_files, step_interval, 543 | atom_selection) 544 | num_atoms = dis_matrix_list[0].shape[0] 545 | result = [] 546 | for mat in dis_matrix_list: 547 | p_distances = [] 548 | for item_1 in range(num_atoms): 549 | for item_2 in range(item_1 + 1, num_atoms): 550 | p_distances += [mat[item_1][item_2]] 551 | assert (len(p_distances) == num_atoms * (num_atoms - 1) // 2) 552 | result += [p_distances] 553 | 554 | return np.array(result) 555 | 556 | @staticmethod 557 | def get_non_repeated_pairwise_distance_from_pos_npy(pos_npy): 558 | from sklearn.metrics.pairwise import pairwise_distances 559 | num_atoms = pos_npy.shape[1] // 3 560 | temp_pos_npy = pos_npy.reshape(pos_npy.shape[0], num_atoms, 3) 561 | pairwise_dis = np.array([pairwise_distances(item, item) for item in temp_pos_npy]) 562 | temp_result = np.array( 563 | [[item[_1][_2] for _1 in range(num_atoms) for _2 in range(_1 + 1, num_atoms)] for item in pairwise_dis]) 564 | return temp_result 565 | 566 | @staticmethod 567 | def get_residue_relative_position_list(sample_file): 568 | sample = Universe(sample_file) 569 | temp_heavy_atoms = sample.select_atoms('not name H*') 570 | temp_CA_atoms = sample.select_atoms('name CA') 571 | residue_relative_position_list = [] 572 | 573 | for _ in sample.trajectory: 574 | temp_residue_relative_position_list = [] 575 | for temp_residue_index in sample.residues.resnums: 576 | temp_residue_relative_position_list.append( 577 | temp_heavy_atoms[temp_heavy_atoms.resnums == temp_residue_index].positions \ 578 | - temp_CA_atoms[temp_CA_atoms.resnums == temp_residue_index].positions) 579 | residue_relative_position_list.append(temp_residue_relative_position_list) 580 | return residue_relative_position_list 581 | 582 | 583 | class Alanine_dipeptide(Sutils): 584 | """docstring for Alanine_dipeptide""" 585 | def __init__(self): 586 | super(Alanine_dipeptide, self).__init__() 587 | return 588 | 589 | @staticmethod 590 | def 
get_cossin_from_a_coordinate(a_coordinate): 591 | num_of_coordinates = len(list(a_coordinate)) // 3 592 | a_coordinate = np.array(a_coordinate).reshape(num_of_coordinates, 3) 593 | diff_coordinates = a_coordinate[1:num_of_coordinates, :] - a_coordinate[0:num_of_coordinates - 1,:] # bond vectors 594 | diff_coordinates_1=diff_coordinates[0:num_of_coordinates-2,:];diff_coordinates_2=diff_coordinates[1:num_of_coordinates-1,:] 595 | normal_vectors = np.cross(diff_coordinates_1, diff_coordinates_2) 596 | normal_vectors_normalized = np.array([x / sqrt(np.dot(x,x)) for x in normal_vectors]) 597 | normal_vectors_normalized_1 = normal_vectors_normalized[0:num_of_coordinates-3, :]; normal_vectors_normalized_2 = normal_vectors_normalized[1:num_of_coordinates-2,:] 598 | diff_coordinates_mid = diff_coordinates[1:num_of_coordinates-2] # these are bond vectors in the middle (remove the first and last one), they should be perpendicular to adjacent normal vectors 599 | 600 | cos_of_angles = list(range(len(normal_vectors_normalized_1))) 601 | sin_of_angles_vec = list(range(len(normal_vectors_normalized_1))) 602 | sin_of_angles = list(range(len(normal_vectors_normalized_1))) # initialization 603 | result = [] 604 | 605 | for index in range(len(normal_vectors_normalized_1)): 606 | cos_of_angles[index] = np.dot(normal_vectors_normalized_1[index], normal_vectors_normalized_2[index]) 607 | sin_of_angles_vec[index] = np.cross(normal_vectors_normalized_1[index], normal_vectors_normalized_2[index]) 608 | sin_of_angles[index] = sqrt(np.dot(sin_of_angles_vec[index], sin_of_angles_vec[index])) * np.sign(sum(sin_of_angles_vec[index]) * sum(diff_coordinates_mid[index])) 609 | result += [cos_of_angles[index], sin_of_angles[index]] 610 | 611 | return result 612 | 613 | @staticmethod 614 | def get_many_cossin_from_coordinates(coordinates): 615 | return list(map(Alanine_dipeptide.get_cossin_from_a_coordinate, coordinates)) 616 | 617 | @staticmethod 618 | def get_many_cossin_from_coordinates_in_list_of_files(list_of_files, step_interval=1, format='npy'): 619 | coordinates = [] 620 | for item in list_of_files: 621 | temp_coordinates = Helper_func.load_npy(item, format=format) 622 | # the result could be 1D or 2D numpy array, need further checking 623 | if temp_coordinates.shape[0] != 0: # remove info from empty files 624 | if len(temp_coordinates.shape) == 1: # if 1D numpy array, convert it to 2D array for consistency 625 | temp_coordinates = temp_coordinates[:, None].T 626 | 627 | coordinates += list(temp_coordinates) 628 | 629 | coordinates = coordinates[::step_interval] 630 | result = Alanine_dipeptide.get_many_cossin_from_coordinates(coordinates) 631 | 632 | return result 633 | 634 | @staticmethod 635 | def get_many_dihedrals_from_coordinates_in_file (list_of_files): 636 | # why we need to get dihedrals from a list of coordinate files? 
637 |         # because we will probably need to plot other files outside self._list_of_coor_data_files
638 |         temp = Alanine_dipeptide.get_many_cossin_from_coordinates_in_list_of_files(list_of_files)
639 |         return Alanine_dipeptide.get_many_dihedrals_from_cossin(temp)
640 | 
641 |     @staticmethod
642 |     def get_many_dihedrals_from_cossin(cossin):
643 |         result = []
644 |         for item in cossin:
645 |             assert (len(item) == 8)
646 |             temp_angle = []
647 |             for ii in range(4):
648 |                 temp_angle += [np.arctan2(item[2 * ii + 1], item[2 * ii])]
649 | 
650 |             result += [list(temp_angle)]
651 |         return result
652 | 
653 |     @staticmethod
654 |     def generate_coordinates_from_pdb_files(path_for_pdb=CONFIG_12):
655 |         index_of_backbone_atoms = [str(item) for item in CONFIG_57[0]]
656 |         output_file_list = Sutils._generate_coordinates_from_pdb_files(index_of_backbone_atoms, file_path=path_for_pdb)
657 |         return output_file_list
658 | 
659 |     @staticmethod
660 |     def get_expression_script_for_plumed(scaling_factor=CONFIG_49):
661 |         index_of_backbone_atoms = CONFIG_57[0]
662 |         return Plumed_helper.get_atom_positions(index_of_backbone_atoms, scaling_factor, unit_scaling=1.0)
663 | 
664 | 
665 | class Trp_cage(Sutils):
666 |     """docstring for Trp_cage"""
667 |     def __init__(self):
668 |         super(Trp_cage, self).__init__()
669 |         return
670 | 
671 |     @staticmethod
672 |     def get_cossin_of_a_dihedral_from_four_atoms(coord_1, coord_2, coord_3, coord_4):
673 |         """each parameter is the 3D Cartesian coordinate of an atom"""
674 |         coords_of_four = np.array([coord_1, coord_2, coord_3, coord_4])
675 |         num_of_coordinates = 4
676 |         diff_coordinates = coords_of_four[1:num_of_coordinates, :] - coords_of_four[0:num_of_coordinates - 1,:]  # bond vectors
677 |         diff_coordinates_1=diff_coordinates[0:num_of_coordinates-2,:];diff_coordinates_2=diff_coordinates[1:num_of_coordinates-1,:]
678 |         normal_vectors = np.cross(diff_coordinates_1, diff_coordinates_2)
679 |         normal_vectors_normalized = np.array([x / sqrt(np.dot(x,x)) for x in normal_vectors])
680 |         normal_vectors_normalized_1 = normal_vectors_normalized[0:num_of_coordinates-3, :]; normal_vectors_normalized_2 = normal_vectors_normalized[1:num_of_coordinates-2,:]
681 |         diff_coordinates_mid = diff_coordinates[1:num_of_coordinates-2]   # these are bond vectors in the middle (remove the first and last one), they should be perpendicular to adjacent normal vectors
682 | 
683 |         index = 0
684 |         cos_of_angle = np.dot(normal_vectors_normalized_1[index], normal_vectors_normalized_2[index])
685 |         sin_of_angle_vec = np.cross(normal_vectors_normalized_1[index], normal_vectors_normalized_2[index])
686 |         if sin_of_angle_vec[0] != 0 and diff_coordinates_mid[index][0] != 0:
687 |             component_index = 0
688 |         elif sin_of_angle_vec[1] != 0 and diff_coordinates_mid[index][1] != 0:
689 |             component_index = 1
690 |         else:
691 |             component_index = 2
692 | 
693 |         sin_of_angle = sqrt(np.dot(sin_of_angle_vec, sin_of_angle_vec)) * np.sign(sin_of_angle_vec[component_index] * diff_coordinates_mid[index][component_index])
694 |         try:
695 |             assert (abs(cos_of_angle ** 2 + sin_of_angle ** 2 - 1) < 0.0001)
696 |         except AssertionError:
697 |             print("error: cos^2 x + sin^2 x != 1, it is %f" % (cos_of_angle ** 2 + sin_of_angle ** 2))
698 |             # print ("coordinates of four atoms are:")
699 |             # print (coords_of_four)
700 | 
701 |         return [cos_of_angle, sin_of_angle]
702 | 
703 |     @staticmethod
704 |     def get_coordinates_of_atom_with_index(a_coordinate, index):
705 |         """:param a_coordinate: flattened coordinates of all 60 backbone atoms (20 residues)"""
706 |         return [a_coordinate[3 * index], a_coordinate[3 * index + 1], a_coordinate[3
* index + 2]] 707 | 708 | @staticmethod 709 | def get_cossin_from_a_coordinate(a_coordinate): 710 | total_num_of_residues = 20 711 | list_of_idx_four_atoms = [[[3 * x - 1, 3 * x, 3 * x + 1, 3 * x + 2], 712 | [3 * x, 3 * x + 1, 3 * x + 2, 3 * x + 3]] for x in list(range(total_num_of_residues))] 713 | list_of_idx_four_atoms = reduce(lambda x, y: x + y, list_of_idx_four_atoms) 714 | list_of_idx_four_atoms = [x for x in list_of_idx_four_atoms if x[0] >= 0 and x[3] < 3 * total_num_of_residues] 715 | 716 | assert (len(list_of_idx_four_atoms) == 38) 717 | 718 | result = [] 719 | 720 | for item in list_of_idx_four_atoms: 721 | parameter_list = [Trp_cage.get_coordinates_of_atom_with_index(a_coordinate, x) for x in item] 722 | [cos_value, sin_value] = Trp_cage.get_cossin_of_a_dihedral_from_four_atoms(*parameter_list) 723 | # print(item) 724 | # print(cos_value, sin_value) 725 | result += [cos_value, sin_value] 726 | 727 | return result 728 | 729 | @staticmethod 730 | def get_many_cossin_from_coordinates(coordinates): 731 | return list(map(Trp_cage.get_cossin_from_a_coordinate, coordinates)) 732 | 733 | @staticmethod 734 | def get_many_cossin_from_coordinates_in_list_of_files(list_of_files, step_interval=1, format='npy'): 735 | coordinates = [] 736 | for item in list_of_files: 737 | temp_coordinates = Helper_func.load_npy(item, format=format) # the result could be 1D or 2D numpy array, need further checking 738 | if temp_coordinates.shape[0] != 0: # remove info from empty files 739 | if len(temp_coordinates.shape) == 1: # if 1D numpy array, convert it to 2D array for consistency 740 | temp_coordinates = temp_coordinates[:, None].T 741 | 742 | coordinates += list(temp_coordinates) 743 | 744 | coordinates = coordinates[::step_interval] 745 | result = Trp_cage.get_many_cossin_from_coordinates(coordinates) 746 | 747 | return result 748 | 749 | @staticmethod 750 | def get_many_dihedrals_from_coordinates_in_file (list_of_files, step_interval=1): 751 | # why we need to get dihedrals from a list of coordinate files? 
752 | # because we will probably need to plot other files outside self._list_of_coor_data_files 753 | temp = Trp_cage.get_many_cossin_from_coordinates_in_list_of_files(list_of_files, step_interval) 754 | return Trp_cage.get_many_dihedrals_from_cossin(temp) 755 | 756 | @staticmethod 757 | def get_many_dihedrals_from_cossin(cossin): 758 | result = [] 759 | for item in cossin: 760 | temp_angle = [] 761 | len_of_cos_sin = 76 762 | assert (len(item) == len_of_cos_sin), (len(item), len_of_cos_sin) 763 | for idx_of_angle in range(len_of_cos_sin // 2): 764 | temp_angle += [np.arctan2(item[2 * idx_of_angle + 1], item[2 * idx_of_angle])] 765 | 766 | assert (len(temp_angle) == len_of_cos_sin // 2) 767 | 768 | result += [temp_angle] 769 | 770 | assert (len(result) == len(cossin)) 771 | 772 | return result 773 | 774 | @staticmethod 775 | def generate_coordinates_from_pdb_files(path_for_pdb = CONFIG_12): 776 | index_of_backbone_atoms = [str(item) for item in CONFIG_57[1]] 777 | assert (len(index_of_backbone_atoms) % 3 == 0) 778 | 779 | output_file_list = Sutils._generate_coordinates_from_pdb_files(index_of_backbone_atoms, file_path=path_for_pdb) 780 | 781 | return output_file_list 782 | 783 | @staticmethod 784 | def metric_get_diff_pairwise_distance_matrices_of_alpha_carbon(list_of_files, ref_file ='../resources/1l2y.pdb', step_interval = 1): 785 | ref = Trp_cage.get_pairwise_distance_matrices_of_selected_atoms([ref_file]) 786 | sample = Trp_cage.get_pairwise_distance_matrices_of_selected_atoms(list_of_files, step_interval) 787 | diff = [np.linalg.norm(ref[0] - x) for x in sample] 788 | return diff 789 | 790 | @staticmethod 791 | def metric_get_residue_9_16_salt_bridge_distance(list_of_files, step_interval = 1): 792 | distances_list = [] 793 | index = 0 794 | for sample_file in list_of_files: 795 | sample = Universe(sample_file) 796 | sample_atom_selection_1 = sample.select_atoms("name OD2 and resid 9") 797 | sample_atom_selection_2 = sample.select_atoms("name NH2 and resid 16") 798 | for _ in sample.trajectory: 799 | if index % step_interval == 0: 800 | distances_list.append( 801 | distance_array(sample_atom_selection_1.positions, sample_atom_selection_2.positions)) 802 | 803 | index += 1 804 | 805 | return np.array(distances_list).flatten() 806 | 807 | @staticmethod 808 | def metric_chirality(list_of_files, step_interval=1): 809 | result = [] 810 | index = 0 811 | for temp_file in list_of_files: 812 | temp_universe = Universe(temp_file) 813 | for _ in temp_universe.trajectory: 814 | if index % step_interval == 0: 815 | atom_list = [temp_universe.select_atoms('name CA and resid %d' % item).positions[0] 816 | for item in [1, 9, 14, 20]] 817 | result.append(Trp_cage.get_cossin_of_a_dihedral_from_four_atoms( 818 | atom_list[0], atom_list[1], atom_list[2], atom_list[3])[1]) 819 | index += 1 820 | return np.array(result) 821 | 822 | @staticmethod 823 | def metric_vertical_shift(list_of_files, step_interval=1): 824 | result = [] 825 | index = 0 826 | for temp_file in list_of_files: 827 | temp_universe = Universe(temp_file) 828 | for _ in temp_universe.trajectory: 829 | if index % step_interval == 0: 830 | atom_list = [temp_universe.select_atoms('name CA and resid %d' % item).positions[0] 831 | for item in [1, 11, 20]] 832 | result.append(np.linalg.norm(atom_list[0] - atom_list[1]) - np.linalg.norm(atom_list[2] - atom_list[1])) 833 | index += 1 834 | return np.array(result) 835 | 836 | @staticmethod 837 | def metric_get_number_of_native_contacts(list_of_files, ref_file ='../resources/1l2y.pdb', threshold = 8, 
step_interval = 1): 838 | ref = Trp_cage.get_pairwise_distance_matrices_of_selected_atoms([ref_file]) 839 | sample = Trp_cage.get_pairwise_distance_matrices_of_selected_atoms(list_of_files, step_interval) 840 | 841 | result = [sum(sum(((x < threshold) & (ref[0] < threshold)).astype(int))) for x in sample] 842 | return result 843 | 844 | @staticmethod 845 | def metric_radius_of_gyration(list_of_files, step_interval = 1, atom_selection_statement = "name CA"): 846 | result = [] 847 | index = 0 848 | for item_file in list_of_files: 849 | temp_sample = Universe(item_file) 850 | temp_atoms = temp_sample.select_atoms(atom_selection_statement) 851 | for _ in temp_sample.trajectory: 852 | if index % step_interval == 0: 853 | result.append(temp_atoms.radius_of_gyration()) 854 | index += 1 855 | 856 | return result 857 | 858 | @staticmethod 859 | def get_pairwise_RMSD_after_alignment_for_a_file(sample_file, atom_selection_statement = 'name CA'): 860 | sample_1 = Universe(sample_file); sample_2 = Universe(sample_file) # should use two variables here, otherwise it will be 0, might be related to iterator issue? 861 | sel_1 = sample_1.select_atoms(atom_selection_statement); sel_2 = sample_2.select_atoms(atom_selection_statement) 862 | 863 | return [[rmsd(sel_1.positions, sel_2.positions, center=True, superposition=True) for _2 in sample_2.trajectory] for _1 in sample_1.trajectory] 864 | 865 | @staticmethod 866 | def structure_clustering_in_a_file(sample_file, atom_selection_statement = 'name CA', 867 | write_most_common_class_into_file = False, 868 | output_file_name = None, 869 | eps=0.5, 870 | min_num_of_neighboring_samples = 2 871 | ): 872 | pairwise_RMSD = Trp_cage.get_pairwise_RMSD_after_alignment_for_a_file(sample_file, atom_selection_statement=atom_selection_statement) 873 | from sklearn.cluster import DBSCAN 874 | 875 | dbscan_obj = DBSCAN(metric='precomputed', eps=eps, min_samples=min_num_of_neighboring_samples).fit(pairwise_RMSD) 876 | class_labels = dbscan_obj.labels_ 877 | max_class_label = max(class_labels) 878 | num_in_each_class = {label: np.where(class_labels == label)[0].shape[0] for label in range(-1, max_class_label + 1)} 879 | most_common_class_labels = sorted(list(num_in_each_class.keys()), key=lambda x: num_in_each_class[x], reverse=True) 880 | with open(sample_file, 'r') as in_file: 881 | content = [item for item in in_file.readlines() if not 'REMARK' in item] 882 | content = ''.join(content) 883 | content = content.split('MODEL')[1:] # remove header 884 | assert (len(content) == len(class_labels)) 885 | 886 | if most_common_class_labels[0] == -1: 887 | raise Exception("too many outliers, check if there is actually a cluster, or adjust parameters") 888 | else: 889 | index_of_most_common_class = np.where(class_labels == most_common_class_labels[0])[0] 890 | if write_most_common_class_into_file: 891 | if output_file_name is None: 892 | output_file_name = sample_file.replace('.pdb', '_most_common.pdb') 893 | 894 | frames_to_use = [content[ii] for ii in index_of_most_common_class] 895 | with open(output_file_name, 'w') as out_file: 896 | for frame in frames_to_use: 897 | out_file.write("MODEL" + frame) 898 | 899 | return num_in_each_class, index_of_most_common_class, most_common_class_labels[0] 900 | 901 | @staticmethod 902 | def rotating_dihedral_angles_and_save_to_pdb(input_pdb, target_dihedrals, output_pdb): 903 | pdb_parser = PDB.PDBParser(QUIET=True) 904 | temp_structure = pdb_parser.get_structure('temp', input_pdb) 905 | coor_file = 
Trp_cage.generate_coordinates_from_pdb_files(input_pdb)[0] 906 | current_dihedrals = Trp_cage.get_many_dihedrals_from_coordinates_in_file([coor_file]) 907 | rotation_angles = np.array(target_dihedrals) - np.array(current_dihedrals) 908 | 909 | atom_indices_in_each_residue = [[]] * 20 910 | temp_model = list(temp_structure.get_models())[0] 911 | for _1, item in list(enumerate(temp_model.get_residues())): 912 | atom_indices_in_each_residue[_1] = [int(_2.get_serial_number()) - 1 for _2 in item.get_atoms()] 913 | 914 | for temp_model in temp_structure.get_models(): 915 | atoms_in_this_frame = list(temp_model.get_atoms()) 916 | temp_coords = np.array([_1.get_coord() for _1 in atoms_in_this_frame]) 917 | 918 | for item in range(19): # 19 * 2 = 38 dihedrals in total 919 | C_atom_in_this_residue = list(filter(lambda x: x.get_name() == "C", atoms_in_this_frame))[item] 920 | CA_atom_in_this_residue = list(filter(lambda x: x.get_name() == "CA", atoms_in_this_frame))[item] 921 | CA_atom_in_next_residue = list(filter(lambda x: x.get_name() == "CA", atoms_in_this_frame))[item + 1] 922 | N_atom_in_next_residue = list(filter(lambda x: x.get_name() == "N", atoms_in_this_frame))[item + 1] 923 | 924 | axis_vector_0 = C_atom_in_this_residue.get_coord() - CA_atom_in_this_residue.get_coord() 925 | axis_vector_1 = CA_atom_in_next_residue.get_coord() - N_atom_in_next_residue.get_coord() 926 | 927 | fixed_coord_0 = temp_coords[int(C_atom_in_this_residue.get_serial_number()) - 1] 928 | fixed_coord_1 = temp_coords[int(N_atom_in_next_residue.get_serial_number()) - 1] 929 | 930 | indices_atom_to_rotate = reduce(lambda x, y: x + y, atom_indices_in_each_residue[:item + 1]) 931 | 932 | temp_coords = Sutils.rotating_group_of_atoms(temp_coords, indices_atom_to_rotate, fixed_coord_0, 933 | axis_vector_0, rotation_angles[temp_model.get_id()][2 * item]) 934 | temp_coords = Sutils.rotating_group_of_atoms(temp_coords, indices_atom_to_rotate, fixed_coord_1, 935 | axis_vector_1, rotation_angles[temp_model.get_id()][2 * item + 1]) 936 | 937 | # save coordinates into structure 938 | for _1, item in enumerate(temp_model.get_atoms()): 939 | item.set_coord(temp_coords[_1]) 940 | 941 | io = PDB.PDBIO() 942 | io.set_structure(temp_structure) 943 | io.save(output_pdb) 944 | return 945 | 946 | @staticmethod 947 | def get_expression_script_for_plumed(scaling_factor=CONFIG_49): 948 | index_of_backbone_atoms = CONFIG_57[1] 949 | return Plumed_helper.get_atom_positions(index_of_backbone_atoms, scaling_factor, unit_scaling=1.0) 950 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/src/remove_water_mol.py: -------------------------------------------------------------------------------- 1 | from ANN_simulation import * 2 | import argparse, subprocess 3 | 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--path", type=str, default="", help="specify the directory/file containing the pdb files") 6 | parser.add_argument("--remove_original", help="remove original pdb files", action="store_true") 7 | args = parser.parse_args() 8 | 9 | if args.remove_original: 10 | Sutils.remove_water_mol_and_Cl_from_pdb_file(folder_for_pdb = args.path, preserve_original_file=False) 11 | else: 12 | Sutils.remove_water_mol_and_Cl_from_pdb_file(folder_for_pdb = args.path, preserve_original_file=True) 13 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/src/structural_alignment.py: 
--------------------------------------------------------------------------------
 1 | """
 2 | modified from the code: https://gist.github.com/andersx/6354971
 3 | """
 4 | 
 5 | import Bio.PDB, argparse, subprocess, os
 6 | from MDAnalysis import *
 7 | from MDAnalysis.analysis.align import *
 8 | from MDAnalysis.analysis.rms import rmsd
 9 | 
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument("sample_path", type=str, help="path (file or folder) of pdb file(s) to be aligned")
12 | parser.add_argument("--ignore_aligned_file", type=int, default=1)
13 | parser.add_argument("--ref", type=str, help="reference pdb file")
14 | parser.add_argument("--name", type=str, default=None, help='name of the aligned pdb file')
15 | parser.add_argument('--remove_original', help='remove original pdb file after doing structural alignment', action="store_true")
16 | parser.add_argument('--suffix', type=str, default="", help="string that appends at the end of filename")
17 | parser.add_argument('--atom_selection', type=str, default='backbone', help='atom_selection_statement for alignment')
18 | args = parser.parse_args()
19 | 
20 | ref_pdb = args.ref
21 | 
22 | traj_files = subprocess.check_output([
23 |     'find', args.sample_path, '-name', "*.pdb", '-o', '-name', '*.dcd']).decode("utf-8").strip().split('\n')
24 | if args.ignore_aligned_file:
25 |     traj_files = [x for x in traj_files if not '_aligned' in x]
26 | 
27 | for sample_traj in traj_files:
28 |     print("doing structural alignment for %s" % sample_traj)
29 | 
30 |     if args.name is None:
31 |         output_pdb_file = sample_traj[:-4] + '_aligned%s.pdb' % (args.suffix)
32 |     else:
33 |         output_pdb_file = args.name
34 | 
35 |     if os.path.exists(output_pdb_file) and os.path.getmtime(sample_traj) < os.path.getmtime(output_pdb_file):
36 |         print("aligned file already exists: %s (remove previous one if needed)" % output_pdb_file)
37 |     else:
38 |         ref = Universe(ref_pdb)
39 |         m_traj = Universe(ref_pdb, sample_traj)
40 |         AlignTraj(m_traj, reference=ref, filename=output_pdb_file, select=args.atom_selection).run()
41 |         print("done structural alignment for %s" % sample_traj)
42 | 
43 |     if args.remove_original:
44 |         subprocess.check_output(['rm', sample_traj])
45 |         print("%s removed!" % sample_traj)
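# usage sketch (hypothetical paths): align every *.pdb / *.dcd found under a folder to a
# reference structure, writing <name>_aligned<suffix>.pdb next to each input, e.g.
#   python structural_alignment.py ../target/Trp_cage --ref ../resources/1l2y.pdb --atom_selection 'name CA'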
46 | 
--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/src/tf_load.py:
--------------------------------------------------------------------------------
1 | import os
2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'   # disable tensorflow warning messages (https://stackoverflow.com/questions/35911252/disable-tensorflow-debugging-information)
3 | import tensorflow as tf, keras.backend as K
4 | from keras.models import load_model
5 | config = tf.ConfigProto()
6 | config.gpu_options.allow_growth = True   # avoid tensorflow using all GPU memory
7 | K.tensorflow_backend.set_session(tf.Session(config=config))
--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/src/train_network_and_save_for_iter.py:
--------------------------------------------------------------------------------
 1 | """train autoencoder and save into file
 2 | this file is typically used for running training in an iteration
 3 | """
 4 | 
 5 | from ANN_simulation import *
 6 | import argparse
 7 | 
 8 | parser = argparse.ArgumentParser()
 9 | parser.add_argument("index", type=int, help="index of autoencoder")
10 | parser.add_argument("--training_interval", type=int, default=1, help="training interval")
11 | parser.add_argument("--num_of_trainings", type=int, default=CONFIG_13, help="total number of trainings (and pick the best one to save)")
12 | parser.add_argument("--num_of_copies", type=int, default=CONFIG_52, help="num of copies for data augmentation")
13 | parser.add_argument("--lr_m", type=str, default=None, help="learning rate and momentum")
14 | parser.add_argument("--output_file", type=str, default=None, help="file name to save autoencoder")
15 | parser.add_argument('--data_folder', type=str, default=None, help="folder containing training data")
16 | parser.add_argument('--in_data', type=str, default=None, help="npy file containing pre-computed input data")
17 | parser.add_argument('--out_data', type=str, default=None, help="npy file containing pre-computed output data, if in_data is not None while out_data is None, then out_data is set to be in_data")
18 | parser.add_argument('--node_num', type=str, default=None, help="node number")
19 | parser.add_argument('--batch_size', type=int, default=None, help='batch size')
20 | parser.add_argument('--auto_dim', type=int, default=CONFIG_79, help="automatically determine input/output dim based on data")
21 | parser.add_argument('--auto_scale', type=int, default=0, help="automatically scale inputs and outputs")
22 | parser.add_argument('--save_to_data_files', type=str, default=None, help="save training data to external files if it is not None, example: 'temp_in.npy,temp_out.npy' ")
23 | parser.add_argument('--lag_time', type=int, default=0, help='lag time for time lagged autoencoder')
24 | parser.add_argument('--rec_loss_type', type=int, default=1, help='0: standard rec loss, 1: lagged rec loss, 2: no rec loss (pytorch only)')
25 | parser.add_argument('--rec_weight', type=float, default=1.0, help='weight of reconstruction loss (pytorch only)')
26 | parser.add_argument('--autocorr_weight', type=float, default=1.0, help='weight of autocorrelation loss in the loss function (pytorch only)')
27 | parser.add_argument('--pearson_weight', type=float, default=None, help='weight of pearson loss (pytorch only)')
28 | parser.add_argument('--sf', type=str, default=None, help='model to start with (pytorch only)')
29 | args = 
parser.parse_args() 30 | 31 | def get_data_from_folder(temp_folder, input_type, output_type): 32 | my_coor_data_obj = coordinates_data_files_list( 33 | list_of_dir_of_coor_data_files=[temp_folder]) 34 | coor_data_obj_input = my_coor_data_obj.create_sub_coor_data_files_list_using_filter_conditional( 35 | lambda x: not 'aligned' in x) 36 | if input_type == 'cossin': 37 | data_set = np.array(molecule_type.get_many_cossin_from_coordinates_in_list_of_files( 38 | coor_data_obj_input.get_list_of_coor_data_files(), step_interval=args.training_interval)) 39 | elif input_type == 'Cartesian': 40 | scaling_factor = CONFIG_49 41 | data_set = coor_data_obj_input.get_coor_data(scaling_factor) 42 | data_set = data_set[::args.training_interval] 43 | data_set = Sutils.remove_translation(data_set) 44 | assert (Sutils.check_center_of_mass_is_at_origin(data_set)) 45 | elif input_type == 'pairwise_distance': 46 | data_set = np.array(Sutils.get_non_repeated_pairwise_distance( 47 | coor_data_obj_input.get_list_of_corresponding_pdb_dcd(), step_interval=args.training_interval, 48 | atom_selection=CONFIG_73)) / CONFIG_49 49 | else: 50 | raise Exception('error input type') 51 | 52 | if output_type == 'cossin': 53 | output_data_set = np.array(molecule_type.get_many_cossin_from_coordinates_in_list_of_files( 54 | coor_data_obj_input.get_list_of_coor_data_files(), step_interval=args.training_interval)) 55 | elif output_type == 'Cartesian': 56 | scaling_factor = CONFIG_49 57 | alignment_coor_file_suffix_list = CONFIG_61 58 | output_data_set = Sutils.prepare_output_Cartesian_coor_with_multiple_ref_structures( 59 | [temp_folder], alignment_coor_file_suffix_list, scaling_factor) 60 | output_data_set = output_data_set[::args.training_interval] 61 | mixed_error_function = CONFIG_71 # TODO: refactor this part later 62 | if mixed_error_function: 63 | if CONFIG_30 == "Trp_cage": 64 | output_data_set_1 = Sutils.remove_translation( 65 | output_data_set[:, list(range(9 * 1, 9 * 8))]) # mixed_err 66 | output_data_set_2 = Sutils.remove_translation(output_data_set[:, list(range(180, 360))]) 67 | output_data_set = np.concatenate([4.0 * output_data_set_1, output_data_set_2], 68 | axis=1) # TODO: may modify this relative weight later 69 | elif CONFIG_30 == "Src_kinase": 70 | output_data_set_1 = Sutils.remove_translation( 71 | output_data_set[:, list(range(9 * 143, 9 * 170))]) # mixed_err 72 | output_data_set_2 = Sutils.remove_translation( 73 | output_data_set[:, list(range(2358 + 9 * 43, 2358 + 9 * 58))]) 74 | output_data_set = np.concatenate([output_data_set_1, output_data_set_2], axis=1) 75 | assert (Sutils.check_center_of_mass_is_at_origin(output_data_set)) 76 | elif output_type == 'pairwise_distance': 77 | output_data_set = np.array(Sutils.get_non_repeated_pairwise_distance( 78 | coor_data_obj_input.get_list_of_corresponding_pdb_dcd(), step_interval=args.training_interval, 79 | atom_selection=CONFIG_73)) / CONFIG_49 80 | elif output_type == 'combined': 81 | scaling_factor = CONFIG_49 82 | alignment_coor_file_suffix_list = CONFIG_61 83 | output_data_set = Sutils.prepare_output_Cartesian_coor_with_multiple_ref_structures( 84 | [temp_folder], alignment_coor_file_suffix_list, scaling_factor) 85 | output_data_set = output_data_set[::args.training_interval] 86 | mixed_error_function = CONFIG_71 # TODO: refactor this part later 87 | assert mixed_error_function # mixed error is required 88 | if CONFIG_30 == "Trp_cage": 89 | output_data_set_1 = Sutils.remove_translation(output_data_set[:, list(range(9 * 1, 9 * 8))]) # mixed_err 90 | 
output_data_set_2 = Sutils.remove_translation(output_data_set[:, list(range(180, 360))]) 91 | output_data_set = np.concatenate([4.0 * output_data_set_1, output_data_set_2], 92 | axis=1) # TODO: may modify this relative weight later 93 | else: 94 | raise Exception('not defined') 95 | temp_output_data_set = np.array(Sutils.get_non_repeated_pairwise_distance( 96 | coor_data_obj_input.get_list_of_corresponding_pdb_dcd(), step_interval=args.training_interval, 97 | atom_selection=CONFIG_73)) / CONFIG_49 98 | output_data_set = np.concatenate([output_data_set, temp_output_data_set], axis=1) 99 | else: 100 | raise Exception('error output data type') 101 | return data_set, output_data_set 102 | 103 | # used to process additional arguments 104 | additional_argument_list = {} 105 | if not args.output_file is None: 106 | additional_argument_list['filename_to_save_network'] = args.output_file 107 | if not args.lr_m is None: 108 | temp_lr = float(args.lr_m.strip().split(',')[0]) 109 | temp_momentum = float(args.lr_m.strip().split(',')[1]) 110 | additional_argument_list['network_parameters'] = [temp_lr, temp_momentum, 0, True, CONFIG_4[4]] 111 | if not args.batch_size is None: 112 | additional_argument_list['batch_size'] = args.batch_size 113 | 114 | if args.data_folder is None: 115 | args.data_folder = '../target/' + CONFIG_30 116 | 117 | fraction_of_data_to_be_saved = 1 # save all training data by default 118 | input_data_type, output_data_type = CONFIG_48, CONFIG_76 119 | 120 | # getting training data 121 | if not args.in_data is None: 122 | data_set = np.load(args.in_data) 123 | if args.out_data is None: 124 | output_data_set = data_set 125 | else: 126 | output_data_set = np.load(args.out_data) 127 | else: 128 | data_set, output_data_set = get_data_from_folder(args.data_folder, input_data_type, output_data_type) 129 | 130 | assert (len(data_set) == len(output_data_set)) 131 | use_representative_points_for_training = CONFIG_58 132 | if use_representative_points_for_training: 133 | data_set, output_data_set = Sutils.select_representative_points(data_set, output_data_set) 134 | 135 | if input_data_type == 'Cartesian' and args.in_data is None: 136 | print('applying data augmentation...') 137 | data_set, output_data_set = Sutils.data_augmentation(data_set, output_data_set, args.num_of_copies, 138 | is_output_reconstructed_Cartesian=(output_data_type == 'Cartesian')) 139 | fraction_of_data_to_be_saved = 1.0 / args.num_of_copies 140 | else: 141 | print("data augmentation not applied") 142 | 143 | scaling_factor_for_expected_output = CONFIG_75 # TODO: is this useful? 
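# note (added explanation): if CONFIG_75 is a weight vector w, the block below computes
# output_data_set @ diag(w), i.e. it scales column j of the expected output by w[j], so
# that selected output dimensions contribute more to the reconstruction error, e.g.
#     scaling_factor_for_expected_output = [1.0, 1.0, 4.0]   # hypothetical: up-weight the third dimension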
144 | if not scaling_factor_for_expected_output is None: 145 | print("expected output is weighted by %s" % str(scaling_factor_for_expected_output)) 146 | output_data_set = np.dot(output_data_set, np.diag(scaling_factor_for_expected_output)) 147 | 148 | if args.node_num is None: 149 | temp_node_num = CONFIG_3[:] # deep copy list 150 | else: 151 | temp_node_num = [int(item) for item in args.node_num.split(',')] 152 | 153 | if args.auto_dim: temp_node_num[0], temp_node_num[-1] = data_set.shape[1], output_data_set.shape[1] 154 | additional_argument_list['node_num'] = temp_node_num 155 | 156 | if args.auto_scale: 157 | auto_scaling_factor = np.max(np.abs(data_set)).astype(np.float) 158 | print("auto_scaling_factor = %f" % auto_scaling_factor) 159 | data_set /= auto_scaling_factor 160 | output_data_set /= (np.max(np.abs(output_data_set)).astype(np.float)) 161 | assert np.max(np.abs(data_set)) == 1.0 and np.max(np.abs(output_data_set)) == 1.0 162 | 163 | print("min/max of output = %f, %f, min/max of input = %f, %f" % (np.min(output_data_set), np.max(output_data_set), 164 | np.min(data_set), np.max(data_set))) 165 | 166 | if not args.save_to_data_files is None: 167 | args.save_to_data_files = args.save_to_data_files.split(',') 168 | 169 | if CONFIG_45 == 'keras': 170 | temp_network_list = [autoencoder_Keras(index=args.index, 171 | data_set_for_training=data_set, 172 | output_data_set=output_data_set, 173 | data_files=args.save_to_data_files, 174 | **additional_argument_list 175 | ) for _ in range(args.num_of_trainings)] 176 | elif CONFIG_45 == 'pytorch': 177 | additional_argument_list['rec_loss_type'] = args.rec_loss_type 178 | additional_argument_list['start_from'] = args.sf 179 | additional_argument_list['rec_weight'] = args.rec_weight 180 | additional_argument_list['autocorr_weight'] = args.autocorr_weight 181 | additional_argument_list['pearson_weight'] = args.pearson_weight 182 | temp_network_list = [autoencoder_torch(index=args.index, 183 | data_set_for_training=data_set, 184 | output_data_set=output_data_set, 185 | data_files=args.save_to_data_files, 186 | **additional_argument_list 187 | ) for _ in range(args.num_of_trainings)] 188 | else: 189 | raise Exception ('this training backend not implemented') 190 | 191 | for item in temp_network_list: item.train(lag_time=args.lag_time) 192 | 193 | if len(temp_network_list) == 1: 194 | best_network = temp_network_list[0] 195 | # if np.all(np.isnan(best_network.get_PCs())): 196 | # best_network = None 197 | else: 198 | temp_FVE_list = [item.get_fraction_of_variance_explained() for item in temp_network_list] 199 | max_FVE = np.max(temp_FVE_list) 200 | print('temp_FVE_list = %s, max_FVE = %f' % (str(temp_FVE_list), max_FVE)) 201 | best_network = temp_network_list[temp_FVE_list.index(max_FVE)] 202 | assert (isinstance(best_network, autoencoder)) 203 | assert (best_network.get_fraction_of_variance_explained() == max_FVE) 204 | 205 | best_network.save_into_file(fraction_of_data_to_be_saved=fraction_of_data_to_be_saved) 206 | print("excited! 
this is the name of best network: %s" % best_network._filename_to_save_network)  # this line is used to locate file name of neural network
--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/src/workqueue.py:
--------------------------------------------------------------------------------
 1 | """
 2 | this program takes a file containing all Python programs to run as input,
 3 | puts these programs into a work queue, and makes sure that at any time
 4 | at most n Python programs are running
 5 | 
 6 | ===========================
 7 | input:
 8 | 
 9 | - file containing Python programs to run
10 | - number of programs allowed to run concurrently
11 | - time interval of checking the number of running programs
12 | """
13 | 
14 | import argparse, subprocess, time
15 | 
16 | def main():
17 |     parser = argparse.ArgumentParser()
18 |     parser.add_argument("cmdfile", type=str, help="file containing Python programs to run")
19 |     parser.add_argument("file_finished", type=str, help="file containing the programs finished")
20 |     parser.add_argument("--num", type=int, default=20, help="number of programs allowed to run concurrently")
21 |     parser.add_argument("--interval", type=int, default=10, help="time interval of checking the number of running programs")
22 | 
23 |     args = parser.parse_args()
24 | 
25 |     command_file = args.cmdfile
26 |     num_of_programs_allowed = args.num
27 |     interval = args.interval
28 | 
29 |     with open(command_file, 'r') as cmdf:
30 |         command_list = cmdf.read().strip().split('\n')
31 | 
32 |     command_list = [x for x in command_list if x.strip() != "" and x.strip()[0] != "#"]   # remove empty lines and comment lines
33 | 
34 |     total_num_jobs = len(command_list)
35 |     next_job_index = 0
36 | 
37 |     previous_running_python_jobs = []
38 | 
39 |     while next_job_index < total_num_jobs:
40 |         time.sleep(interval)
41 |         current_running_python_jobs = [x for x in subprocess.check_output(['ps', 'aux']).decode("utf-8").split('\n') if ' python ' in x and not 'python workqueue.py' in x]
42 |         current_running_python_jobs = [' '.join(x.split()[10:]) for x in current_running_python_jobs]   # the 11th column is the command
43 |         # print "current_running_jobs = %s" % str(current_running_python_jobs)
44 | 
45 |         # save finished programs into this file
46 |         with open(args.file_finished, 'a') as file_containing_programs_finished:
47 |             for item in previous_running_python_jobs:
48 |                 if not item in current_running_python_jobs:
49 |                     file_containing_programs_finished.write(item)
50 |                     file_containing_programs_finished.write('\n')
51 | 
52 |         previous_running_python_jobs = current_running_python_jobs
53 | 
54 |         num_of_running_jobs = len(current_running_python_jobs)
55 |         if num_of_running_jobs < num_of_programs_allowed:
56 |             if num_of_programs_allowed - num_of_running_jobs > total_num_jobs - next_job_index:
57 |                 run_programs(command_list, next_job_index, total_num_jobs)
58 |                 next_job_index = total_num_jobs
59 |             else:
60 |                 run_programs(command_list, next_job_index, next_job_index + num_of_programs_allowed - num_of_running_jobs)
61 |                 next_job_index += num_of_programs_allowed - num_of_running_jobs
62 | 
63 |     print("Done all programs in " + args.cmdfile)
64 |     return
65 | 
66 | 
67 | def run_programs(command_list, start_index, end_index, shell=True):
68 |     """
69 |     run programs with index [start_index, end_index - 1]
70 |     """
71 |     for item in range(start_index, end_index):
72 |         command_arg = command_list[item].strip()
73 |         if command_arg != "":
74 |             if command_arg[-1] == "&":
75 |                 command_arg = 
command_arg[:-1] 76 | 77 | print("running command: " + command_arg) 78 | if not shell: command_arg = command_arg.split() 79 | subprocess.Popen(command_arg, shell=shell) 80 | 81 | return 82 | 83 | 84 | if __name__ == '__main__': 85 | main() 86 | 87 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/target/.gitignore: -------------------------------------------------------------------------------- 1 | *.txt 2 | *.pdb 3 | *.png 4 | *.npy 5 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/tests/.gitignore: -------------------------------------------------------------------------------- 1 | .coverage 2 | *.png 3 | *.pdf 4 | *.jpg 5 | *.pkl 6 | dependency/** 7 | *.txt 8 | *.pdb 9 | *.hdf5 10 | *.chk 11 | temp_model.dot 12 | 13 | -------------------------------------------------------------------------------- /MD_simulation_on_alanine_dipeptide/current_work/tests/ANN_simulation_test.py: -------------------------------------------------------------------------------- 1 | '''This is a test for functionality of ANN_simulation.py 2 | ''' 3 | 4 | import sys, os, math, subprocess, matplotlib 5 | from functools import reduce 6 | matplotlib.use('agg') 7 | 8 | sys.path.append('../src/') # add the source file folder 9 | 10 | from ANN_simulation import * 11 | from numpy.testing import assert_almost_equal, assert_equal 12 | 13 | 14 | class test_Sutils(object): 15 | @staticmethod 16 | def test_mark_and_modify_pdb_for_calculating_RMSD_for_plumed(): 17 | temp_out = 'temp_out.pdb' 18 | Sutils.mark_and_modify_pdb_for_calculating_RMSD_for_plumed('../resources/1l2y.pdb', temp_out, 19 | get_index_list_with_selection_statement('../resources/1l2y.pdb', 'name CA')) 20 | a = Universe(temp_out) 21 | b = a.select_atoms('name CA') 22 | assert np.all(b.tempfactors) and np.all(b.occupancies) 23 | b = a.select_atoms('not name CA') 24 | assert not (np.any(b.tempfactors) or np.any(b.occupancies)) 25 | subprocess.check_output(['rm', temp_out]) 26 | return 27 | 28 | @staticmethod 29 | def test_write_some_frames_into_a_new_file(): 30 | input_pdb = '../tests/dependency/temp_output_0.pdb' 31 | output_pdb = "../tests/dependency/temp_output_0_interval_3.pdb" 32 | output_coor = output_pdb.replace('.pdb', '_coordinates.npy') 33 | actual_output_coor = '../tests/dependency/temp_output_0_coor.npy' 34 | for interval in range(3, 10): 35 | Sutils.write_some_frames_into_a_new_file(input_pdb, 0, 0, interval, output_pdb) 36 | if os.path.exists(output_coor): 37 | subprocess.check_output(['rm', output_coor]) 38 | Alanine_dipeptide.generate_coordinates_from_pdb_files(output_pdb) 39 | assert_almost_equal(np.load(output_coor), np.load(actual_output_coor)[::interval]) 40 | subprocess.check_output(['rm', output_coor, output_pdb]) 41 | return 42 | 43 | @staticmethod 44 | def test_get_boundary_points(): 45 | """generate plotting for tests""" 46 | cov = [[0.1, 0], [0, 0.1]] # diagonal covariance 47 | get_points = lambda mean: np.random.multivariate_normal(mean, cov, 50) 48 | points = reduce(lambda x, y: np.concatenate((x, y)), list(map(get_points, [[0, 1], [0, -1]]))) 49 | boundary_points = Sutils.get_boundary_points(points, preprocessing=True) 50 | x, y = list(zip(*points)) 51 | x1, y1 = list(zip(*boundary_points)) 52 | fig, ax = plt.subplots() 53 | ax.scatter(x, y, c='b') 54 | ax.scatter(x1, y1, c='r') 55 | fig.savefig('test_get_boundary_points_noncircular.png') 56 | 57 | points = 
reduce(lambda x, y: np.concatenate((x, y)), list(map(get_points, [[-.8, -.8]]))) 58 | boundary_points = Sutils.get_boundary_points(points, preprocessing=True, is_circular_boundary=True, 59 | range_of_PCs=[[-1, 1], [-1, 1]]) 60 | x, y = list(zip(*points)) 61 | x1, y1 = list(zip(*boundary_points)) 62 | fig, ax = plt.subplots() 63 | ax.scatter(x, y, c='b') 64 | ax.scatter(x1, y1, c='r') 65 | fig.savefig('test_get_boundary_points_circular.png') 66 | return 67 | 68 | @staticmethod 69 | def test_get_boundary_points_2_diagram(): 70 | """diagram for the find_boundary algorithm""" 71 | dimensionality = 2 72 | fig, axes = plt.subplots(2, 2) 73 | fig.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=None, hspace=0.3) 74 | fig.set_size_inches(15, 15) 75 | # hist_matrix = np.random.randint(1, 10, size=(size_of_grid, size_of_grid)) 76 | hist_matrix = [ 77 | [0, 0, 0, 0, 0, 0, 0, 0], 78 | [0, 0, 0, 0, 0, 1, 0, 0], 79 | [0, 0, 3, 5, 3, 2, 1, 0], 80 | [0, 0, 2, 9, 6, 2, 0, 0], 81 | [0, 0, 5, 1, 7, 2, 0, 0], 82 | [0, 1, 2, 9, 8, 1, 0, 0], 83 | [0, 0, 0, 1, 4, 0, 0, 0], 84 | [0, 0, 0, 0, 0, 0, 0, 0], 85 | ] 86 | hist_matrix = np.array(hist_matrix) 87 | hist_matrix_processed = [[- np.exp(- y) for y in x] for x in hist_matrix] # preprocessing process 88 | 89 | diff_with_neighbors = hist_matrix_processed - 1.0 / (2 * dimensionality) * sum( 90 | [np.roll(hist_matrix_processed, 1, axis=x) 91 | + np.roll(hist_matrix_processed, -1, axis=x) for x in range(dimensionality)] 92 | ) 93 | temp_fontsize = 25 94 | sns.heatmap(hist_matrix, ax=axes[0][0], annot=True, cbar=False) 95 | sns.heatmap(hist_matrix_processed, ax=axes[0][1], annot=True, cbar=False) 96 | sns.heatmap(diff_with_neighbors, ax=axes[1][0], annot=True, cbar=False) 97 | sns.heatmap(diff_with_neighbors < 0, ax=axes[1][1], annot=False, cbar=False) 98 | axes[0][0].set_title(r'number of data points $n_i$', fontsize=temp_fontsize) 99 | axes[0][1].set_title(r'$p_i = -\exp{(-n_i)}$', fontsize=temp_fontsize) 100 | axes[1][0].text(2, 8.5, r'$v_i = p_i-\frac{1}{| K_i |}\sum_{j \in K_i} p_j$', fontsize=temp_fontsize) 101 | axes[1][1].set_title('locations of selected cells', fontsize=temp_fontsize) 102 | temp_annotation = ['(a)', '(b)', '(c)', '(d)'] 103 | index = 0 104 | for _1 in axes: 105 | for ax in _1: 106 | ax.set_xlabel('$\\xi_1$', fontsize=temp_fontsize) 107 | ax.set_ylabel('$\\xi_2$', fontsize=temp_fontsize) 108 | ax.text(-0.5, 8.4, temp_annotation[index], fontsize=temp_fontsize - 5) 109 | index += 1 110 | # fig.tight_layout() 111 | fig.savefig('diagram_of_finding_boundary.pdf', format='pdf', bbox_inches='tight') 112 | return 113 | 114 | @staticmethod 115 | def test_L_method(): 116 | evaluation_values = [0, 0.1, 0.5, 0.85, 0.9, 0.93] 117 | nums = list(range(len(evaluation_values))) 118 | opt_num, x_data, y_data_left, y_data_right = Sutils.L_method(evaluation_values, nums) 119 | fig, ax = plt.subplots() 120 | ax.plot(x_data, y_data_left) 121 | ax.plot(x_data, y_data_right) 122 | ax.scatter(nums, evaluation_values) 123 | fig.savefig("L_method.png") 124 | assert (opt_num == 4), opt_num 125 | return 126 | 127 | @staticmethod 128 | def test_rotating_coordinates(): 129 | data = np.loadtxt('../tests/dependency/temp_Trp_cage_data/1l2y_coordinates.txt').reshape((38, 60, 3))[0] 130 | actual = Sutils.rotating_coordinates(data, [0,0,0], [0,0,1], np.pi / 2) 131 | expected = np.array([data[:, 1], - data[:,0], data[:,2]]).T 132 | assert_almost_equal(expected, actual) 133 | return 134 | 135 | @staticmethod 136 | def test__get_expression_script_for_plumed(): 
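        # this test regenerates the PLUMED input script for the Trp-cage system and compares
        # it against a stored reference file; scaling_factor=2.0 below is assumed to match
        # the value used when the reference script was generated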
137 | with open('dependency/expected_plumed_Trp_script.txt', 'r') as my_f: 138 | expected = my_f.read().strip() 139 | actual = Trp_cage.get_expression_script_for_plumed(scaling_factor=2.0).strip() 140 | assert (expected == actual), actual 141 | return 142 | 143 | 144 | # class test_Alanine_dipeptide(object): 145 | # @staticmethod 146 | # def test_get_many_cossin_from_coordiantes_in_list_of_files(): 147 | # list_of_files = ['../tests/dependency/biased_output_fc_1000_x1_0.7_x2_-1.07_coordinates.txt'] 148 | # actual = Alanine_dipeptide().get_many_cossin_from_coordinates_in_list_of_files(list_of_files) 149 | # assert_equal(100, len(actual)) 150 | # assert_equal(8, len(actual[0])) 151 | # expected = np.loadtxt('../tests/dependency/output_cossin.txt') 152 | # assert_almost_equal(expected, actual) 153 | # return 154 | # 155 | # @staticmethod 156 | # def test_get_many_dihedrals_from_cossin(): 157 | # angle = [.4, -.7, math.pi, -.45] 158 | # cossin = [[1, 0, -1, 0, 1, 0, -1, 0], [0, 1, 0, -1, 0, 1, 0, -1], 159 | # reduce(lambda x, y: x + y, [[cos(x), sin(x)] for x in angle]) 160 | # ] 161 | # actual = Alanine_dipeptide().get_many_dihedrals_from_cossin(cossin) 162 | # expected = [[0, 0, 0, 0], [math.pi / 2, -math.pi / 2, math.pi / 2, -math.pi / 2], angle] 163 | # for item in range(len(actual)): 164 | # for index in range(4): 165 | # assert_almost_equal(actual[item][index], expected[item][index], 4) 166 | # return 167 | # 168 | # @staticmethod 169 | # def test_get_many_dihedrals_from_coordinates_in_file(): 170 | # list_of_files = ['../tests/dependency/biased_output_fc_1000_x1_0.7_x2_-1.07_coordinates.txt'] 171 | # actual = Alanine_dipeptide.get_many_dihedrals_from_coordinates_in_file(list_of_files) 172 | # expected = np.loadtxt('../tests/dependency/output_dihedrals.txt') 173 | # assert_almost_equal(actual, expected) 174 | # return 175 | 176 | # @staticmethod 177 | # def test_generate_coordinates_from_pdb_files(): 178 | # pdb_file_name = '../tests/dependency/temp_output_0.pdb' 179 | # actual_output_file = pdb_file_name.replace('.pdb', '_coordinates.txt') 180 | # expected_output_files = '../tests/dependency/temp_output_0_coor.txt' 181 | # for interval in range(1, 10): 182 | # if interval != 1: 183 | # actual_output_file = pdb_file_name.replace('.pdb', '_int_%d_coordinates.txt' % interval) 184 | # if os.path.exists(actual_output_file): 185 | # subprocess.check_output(['rm', actual_output_file]) 186 | # Alanine_dipeptide.generate_coordinates_from_pdb_files(pdb_file_name, step_interval=interval) 187 | # assert_equal(np.loadtxt(actual_output_file), np.loadtxt(expected_output_files)[::interval]) 188 | # subprocess.check_output(['rm', actual_output_file]) 189 | # return 190 | 191 | 192 | class test_Trp_cage(object): 193 | @staticmethod 194 | def test_get_non_repeated_pairwise_distance_as_list_of_alpha_carbon(): 195 | pdb_file_list = ['../tests/dependency/temp_Trp_cage_data/1l2y.pdb'] 196 | a = Trp_cage.get_pairwise_distance_matrices_of_selected_atoms(pdb_file_list) 197 | a = [item.reshape(400, 1) for item in a] 198 | b = Trp_cage.get_non_repeated_pairwise_distance(pdb_file_list) 199 | assert (len(a) == len(b)) 200 | for _1 in range(len(b)): 201 | for _2 in b[_1]: 202 | assert (_2 in a[_1]) 203 | return 204 | 205 | @staticmethod 206 | def test_get_pairwise_distance_matrices_of_alpha_carbon(): 207 | actual = Trp_cage.get_pairwise_distance_matrices_of_selected_atoms(['../tests/dependency/temp_Trp_cage_data/1l2y.pdb'])[0] 208 | expected = 
np.loadtxt("../tests/dependency/test_get_pairwise_distance_matrices_of_alpha_carbon.txt") 209 | assert_almost_equal(actual, expected) 210 | return 211 | 212 | @staticmethod 213 | def test_rotating_dihedral_angles_and_save_to_pdb(): 214 | pdb_file = '../tests/dependency/temp_Trp_cage_data/1l2y.pdb' 215 | output = 'temp_rotating_out.pdb' 216 | target_dihedrals_list = [np.ones((38, 38)), np.zeros((38, 38))] 217 | for target_dihedrals in target_dihedrals_list: 218 | Trp_cage.rotating_dihedral_angles_and_save_to_pdb(pdb_file, target_dihedrals, output) 219 | out_coor_file_list = Trp_cage.generate_coordinates_from_pdb_files(output) 220 | actual_dihedrals = Trp_cage.get_many_dihedrals_from_coordinates_in_file(out_coor_file_list) 221 | print(out_coor_file_list) 222 | # print np.max(np.abs(actual_dihedrals - target_dihedrals)) 223 | assert_almost_equal(actual_dihedrals, target_dihedrals, decimal=2) 224 | 225 | return 226 | 227 | 228 | class test_coordinates_data_files_list(object): 229 | @staticmethod 230 | def test__init__(): 231 | folder = '../tests/dependency/temp_data' 232 | num_of_coor_files = len(subprocess.check_output(['find', folder, '-name', "*_coordinates.npy"]).strip().split()) 233 | a = coordinates_data_files_list([folder]) 234 | assert len(a.get_list_of_coor_data_files()) == num_of_coor_files 235 | assert a._list_num_frames == [100 for _ in range(num_of_coor_files)] 236 | assert sorted(a.get_list_of_coor_data_files()) == a.get_list_of_coor_data_files() 237 | assert len(a.get_list_of_corresponding_pdb_dcd()) == num_of_coor_files 238 | assert sorted(a.get_list_of_corresponding_pdb_dcd()) == a.get_list_of_corresponding_pdb_dcd() 239 | 240 | @staticmethod 241 | def test_create_sub_coor_data_files_list_using_filter_conditional(): 242 | folder = '../tests/dependency/temp_data' 243 | a = coordinates_data_files_list([folder]) 244 | a_sub = a.create_sub_coor_data_files_list_using_filter_conditional(lambda x: '0.7' in x) 245 | for item in a_sub.get_list_of_coor_data_files(): 246 | assert ('0.7' in item) 247 | return 248 | 249 | @staticmethod 250 | def test_get_pdb_name_and_corresponding_frame_index_with_global_coor_index(): 251 | _1 = coordinates_data_files_list(['../tests/dependency/temp_data/']) 252 | pdb_files = _1.get_list_of_corresponding_pdb_dcd() 253 | for item in range(1, 602, 100): 254 | assert (_1.get_pdb_name_and_corresponding_frame_index_with_global_coor_index(item) == (pdb_files[item // 100], 1)) 255 | return 256 | 257 | 258 | class test_autoencoder_Keras(object): 259 | def __init__(self): 260 | my_file_list = coordinates_data_files_list(['../tests/dependency/noncircular_alanine_exploration_data/']) 261 | self._data = np.array(Alanine_dipeptide.get_many_cossin_from_coordinates_in_list_of_files( 262 | my_file_list.get_list_of_coor_data_files())) 263 | self._dihedrals = Alanine_dipeptide.get_many_dihedrals_from_cossin(self._data) 264 | 265 | def test_train(self): 266 | data, dihedrals = self._data, self._dihedrals 267 | hidden_layers_list = [["Tanh", "Tanh", "Tanh", "Tanh", "Tanh", "Tanh", "Tanh"], 268 | ["Sigmoid", "Sigmoid", "Sigmoid", "Sigmoid", "Tanh", "Sigmoid", "Tanh"]] 269 | model_type_list = [autoencoder_Keras, autoencoder_torch] 270 | reg_list = [0.001, 0] 271 | for item_activation in range(2): 272 | for is_hi, hier_var in [(0, 0), (1,1), (1,2)]: # do not test variant 0 for now 273 | for type_index, model_type in enumerate(model_type_list): 274 | model = model_type(1447, data, 275 | data_files=['/tmp/train_in.npy', '/tmp/train_out.npy'], 276 | node_num=[8, 8, 15, 8, 2, 15, 8, 
8, 8], 277 | hidden_layers_types=hidden_layers_list[item_activation], 278 | network_parameters = [0.02, 0.9,0, True, [reg_list[item_activation]]* 8], 279 | batch_size=100, hierarchical=is_hi, hi_variant=hier_var, 280 | epochs=50 281 | ) 282 | model.train() 283 | PCs = model.get_PCs() 284 | [x, y] = list(zip(*PCs)) 285 | psi = [item[2] for item in dihedrals] 286 | fig, ax = plt.subplots() 287 | ax.scatter(x, y, c=psi, cmap='gist_rainbow') 288 | ax.set_title("FVE = %f" % model.get_fraction_of_variance_explained()) 289 | file_name = 'try_model_type_%d_hierarchical_%d_%d_act_%d.pkl' % ( 290 | type_index, is_hi, hier_var, item_activation) 291 | model.save_into_file(file_name) 292 | fig.savefig(file_name.replace('.pkl', '.png')) 293 | return 294 | 295 | def test_train_with_different_mse_weights(self): 296 | data, dihedrals = self._data, self._dihedrals 297 | for _1, weights in enumerate([None, np.array([1,1,0,0,0,0,1,1]), 298 | np.array([0,0,1,1,1,1,0,0]), np.array([1,1,1,1,0,0,0,0])]): 299 | model = autoencoder_Keras(1447, data, 300 | node_num=[8, 8, 15, 8, 2, 15, 8, 8, 8], 301 | hidden_layers_types=["Tanh", "Tanh", "Tanh", "Tanh", "Tanh", "Tanh", "Tanh"], 302 | network_parameters=[0.02, 0.9, 0, True, [0.001] * 8], 303 | batch_size=100, hierarchical=0, 304 | mse_weights=weights 305 | ) 306 | _, history = model.train() 307 | PCs = model.get_PCs() 308 | [x, y] = list(zip(*PCs)) 309 | psi = [item[2] for item in dihedrals] 310 | fig, ax = plt.subplots() 311 | ax.scatter(x, y, c=psi, cmap='gist_rainbow') 312 | model.save_into_file('try_diff_weights_%02d.pkl' % _1) 313 | fig.savefig('try_diff_weights_%02d.png' % _1) 314 | return 315 | 316 | def test_train_2(self): 317 | data, dihedrals = self._data, self._dihedrals 318 | model = autoencoder_Keras(1447, data, 319 | node_num=[8, 15, 4, 15, 8], 320 | hidden_layers_types=["Tanh", "Circular", "Tanh"], 321 | network_parameters = [0.1, 0.4,0, True, [0.001]* 4], 322 | hierarchical=False 323 | ) 324 | model.train() 325 | PCs = model.get_PCs() 326 | [x, y] = list(zip(*PCs)) 327 | psi = [item[2] for item in dihedrals] 328 | fig, ax = plt.subplots() 329 | ax.scatter(x, y, c=psi, cmap='gist_rainbow') 330 | 331 | fig.savefig('try_keras_circular.png') 332 | return 333 | 334 | def test_save_into_file_and_load(self): 335 | data = self._data 336 | model = autoencoder_Keras(1447, data, 337 | node_num=[8, 15, 2, 15, 8], 338 | hidden_layers_types=["Tanh", "Tanh", "Tanh"], 339 | network_parameters=[0.02, 0.9,0, True, [0.001]* 4], 340 | batch_size=50, 341 | data_files=['test_save_into_file.npy', 'test_save_into_file.npy'] 342 | ) 343 | model.train() 344 | model.save_into_file('test_save_into_file.pkl') 345 | model.save_into_file('test_save_into_file_fraction.pkl', fraction_of_data_to_be_saved=0.5) 346 | model.save_into_file('temp_save/complicated/path/temp.pkl') 347 | _ = autoencoder.load_from_pkl_file('test_save_into_file.pkl') 348 | return 349 | 350 | @staticmethod 351 | def check_two_plumed_strings_containing_floats(string_1, string_2): 352 | """due to precision issue, string literals may not be exactly the same for two plumed strings, so we 353 | need to explicitly compare the float values""" 354 | def is_float(s): 355 | try: 356 | float(s) 357 | return True 358 | except ValueError: 359 | return False 360 | split_1 = re.split(' |\n|=|,', string_1) 361 | split_2 = re.split(' |\n|=|,', string_2) 362 | assert (len(split_1) == len(split_2)), (len(split_1), len(split_2)) 363 | for _1, _2 in zip(split_1, split_2): 364 | if is_float(_1): 365 | assert_almost_equal(float(_1), 
float(_2), decimal=4) 366 | else: 367 | assert (_1 == _2), (_1, _2) 368 | return 369 | 370 | def test_get_plumed_script_for_biased_simulation_with_solute_pairwise_dis_and_solvent_cg_input_and_ANN(self): 371 | scaling_factor_v = 26.9704478916 372 | scaling_factor_u = 29.1703348377 373 | r_high = 5.5 374 | atom_indices = list(range(1, 25)) 375 | water_index_string = '75-11421:3' 376 | 377 | ae = autoencoder.load_from_pkl_file('../tests/dependency/solute_plus_solvent_AE/temp_alpha_0.5.pkl') 378 | with open('../tests/dependency/solute_plus_solvent_AE/temp_plumed.txt', 'r') as my_f: 379 | expected_plumed = my_f.read().strip() 380 | plumed_string = ae.get_plumed_script_for_biased_simulation_with_solute_pairwise_dis_and_solvent_cg_input_and_ANN( 381 | list(range(1, 25)), scaling_factor_u, water_index_string, atom_indices, -5, r_high, scaling_factor_v) 382 | self.check_two_plumed_strings_containing_floats(plumed_string, expected_plumed) 383 | 384 | AE = autoencoder.load_from_pkl_file('../tests/dependency/solvent_AE/solvent_test.pkl') 385 | with open('../tests/dependency/solvent_AE/temp_plumed.txt', 'r') as my_f: 386 | expected_plumed = my_f.read().strip() 387 | plumed_string = AE.get_plumed_script_for_biased_simulation_with_INDUS_cg_input_and_ANN( 388 | water_index_string, atom_indices, -5, r_high, scaling_factor_v).strip() 389 | self.check_two_plumed_strings_containing_floats(plumed_string, expected_plumed) 390 | return 391 | 392 | class test_autoencoder_torch(object): 393 | @staticmethod 394 | def test_general_train_save_and_load(): 395 | data = np.random.rand(1000, 21) 396 | a = autoencoder_torch(1447, data, 397 | output_data_set=data, 398 | hierarchical=True, hi_variant=2, 399 | batch_size=500, 400 | node_num=[21, 100, 2, 100, 21], 401 | hidden_layers_types=['tanh', 'tanh', 'tanh'], epochs=10) 402 | a.train(lag_time=10) 403 | a.save_into_file('/tmp/temp_save.pkl') 404 | torch.save(a._ae, '/tmp/temp.df') 405 | model_1 = torch.load('/tmp/temp.df') 406 | torch.save(a._ae.state_dict(), '/tmp/temp_2.df') 407 | model_2 = AE_net([21, 100, 2], [2, 100, 21], activations=a._hidden_layers_type + ['linear'], 408 | hi_variant=2, hierarchical=True).cuda() 409 | model_2.load_state_dict(torch.load('/tmp/temp_2.df')) 410 | data_in = torch.rand(1000, 21).cuda() 411 | assert_almost_equal(model_1(data_in)[0].cpu().data.numpy(), a._ae(data_in)[0].cpu().data.numpy()) 412 | assert_almost_equal(model_2(data_in)[0].cpu().data.numpy(), a._ae(data_in)[0].cpu().data.numpy()) 413 | _ = autoencoder_torch.load_from_pkl_file('/tmp/temp_save.pkl') 414 | return 415 | 416 | @staticmethod 417 | def test_time_lagged_AE_stored_data_saving(): 418 | data = np.random.rand(1000, 21) 419 | a = autoencoder_torch(1447, data, 420 | output_data_set=data, 421 | hierarchical=True, hi_variant=2, 422 | rec_loss_type=1, 423 | batch_size=500, 424 | node_num=[21, 100, 2, 100, 21], 425 | hidden_layers_types=['tanh', 'tanh', 'tanh'], epochs=10) 426 | a.train(lag_time=10) 427 | assert_almost_equal(a._data_set, data[:-10]) 428 | assert_almost_equal(a._output_data_set, data[10:]) 429 | return 430 | 431 | @staticmethod 432 | def test_save_and_load_data(): 433 | data = np.random.rand(1000, 21) 434 | a = autoencoder_torch(1447, data, 435 | output_data_set=data, 436 | hierarchical=True, 437 | batch_size=500, 438 | node_num=[21, 100, 2, 100, 21], epochs=10, 439 | data_files=['data.npy', 'data.npy']) 440 | a.train(lag_time=0) 441 | a.save_into_file('temp_save_pytorch/temp_save.pkl') 442 | assert (os.path.isfile('temp_save_pytorch/data.npy')) 443 | b = 
autoencoder_torch.load_from_pkl_file('temp_save_pytorch/temp_save.pkl') 444 | assert_almost_equal(a._data_set, b._data_set) 445 | assert_almost_equal(a._output_data_set, b._output_data_set) 446 | return 447 | 448 | 449 | class test_biased_simulation(object): 450 | @staticmethod 451 | def helper_biased_simulation_alanine_dipeptide(potential_center): 452 | autoencoder_coeff_file = 'autoencoder_info_9.npy' 453 | autoencoder_pkl_file = '../tests/dependency/test_biased_simulation/network_5.pkl' 454 | my_network = autoencoder.load_from_pkl_file(autoencoder_pkl_file) 455 | assert (isinstance(my_network, autoencoder)) 456 | my_network.write_coefficients_of_connections_into_file(autoencoder_coeff_file) 457 | output_folder = 'temp_output_test_biased_simulation' 458 | 459 | if os.path.exists(output_folder): 460 | subprocess.check_output(['rm', '-rf', output_folder]) 461 | 462 | subprocess.check_output( 463 | 'python ../src/biased_simulation.py 50 5000 5000 %s %s pc_%s --num_of_nodes %s --layer_types %s --platform CPU --data_type_in_input_layer 1' 464 | % (output_folder, autoencoder_coeff_file, potential_center, "21,40,2", "Tanh,Tanh"), 465 | shell=True) 466 | 467 | Alanine_dipeptide.generate_coordinates_from_pdb_files(output_folder) 468 | fig, ax = plt.subplots() 469 | input_data = coordinates_data_files_list([output_folder]).get_coor_data(0.5) 470 | input_data = Sutils.remove_translation(input_data) 471 | PCs = my_network.get_PCs(input_data) 472 | x, y = list(zip(*PCs)) 473 | ax.scatter(x, y, c=list(range(len(x))), cmap='gist_rainbow', s=5) 474 | potential_center_num = [float(item_1) for item_1 in potential_center.split(',')] 475 | ax.scatter([potential_center_num[0]], [potential_center_num[1]], marker='X', s=30) 476 | fig.savefig('test_biased_simulation_%s.png' % potential_center) 477 | subprocess.check_output(['rm', '-rf', output_folder]) 478 | return 479 | 480 | @staticmethod 481 | def test_biased_simulation_alanine_dipeptide(): 482 | for item in ['-0.3,-0.7', '-0.3,-0.5', '-0.2,-0.4', '0,-0.4', '-0.1,-0.5']: 483 | test_biased_simulation.helper_biased_simulation_alanine_dipeptide(item.replace(' ','')) 484 | return 485 | 486 | @staticmethod 487 | def test_biased_simulation_alanine_dipeptide_with_metadynamics(use_well_tempered=0, biasfactor=-1): 488 | autoencoder_pkl_file = '../tests/dependency/test_biased_simulation/network_5.pkl' 489 | output_folder = 'temp_output_test_biased_simulation' 490 | a = autoencoder.load_from_pkl_file(autoencoder_pkl_file) 491 | a.write_expression_script_for_plumed('temp_info.txt', mode='ANN') 492 | subprocess.check_output( 493 | 'python ../src/biased_simulation.py 50 50000 0 %s temp_info.txt pc_0,0 --MTD_pace 100 --platform CPU --bias_method MTD --MTD_biasfactor %f --MTD_WT %d --equilibration_steps 0' 494 | % (output_folder, biasfactor, use_well_tempered), shell=True) 495 | subprocess.check_output(['python', '../src/generate_coordinates.py', 'Alanine_dipeptide', '--path', output_folder]) 496 | fig, axes = plt.subplots(1, 3) 497 | data = np.load( 498 | output_folder + '/output_fc_0.000000_pc_[0.0,0.0]_coordinates.npy') 499 | data /= 0.5 500 | data = Sutils.remove_translation(data) 501 | PCs = a.get_PCs(data) 502 | ax = axes[0] 503 | ax.set_xlabel('CV1') 504 | ax.set_ylabel('CV2') 505 | ax.set_title('CV data generated by autoencoder') 506 | im = ax.scatter(PCs[:, 0], PCs[:, 1], c=list(range(PCs.shape[0])), cmap='gist_rainbow', s=4) 507 | fig.colorbar(im, ax=ax) 508 | 509 | out_data = np.loadtxt('temp_MTD_out.txt') 510 | 511 | ax = axes[1] 512 | im = 
ax.scatter(out_data[:, 1], out_data[:, 2], c=list(range(out_data.shape[0])), cmap='gist_rainbow', s=4) 513 | ax.set_xlabel('CV1') 514 | ax.set_ylabel('CV2') 515 | ax.set_title('CV data generated by PLUMED') 516 | fig.colorbar(im, ax=ax) 517 | 518 | ax = axes[2] 519 | dihedrals = Alanine_dipeptide.get_many_dihedrals_from_cossin( 520 | Alanine_dipeptide.get_many_cossin_from_coordinates(data)) 521 | dihedrals = np.array(dihedrals) 522 | im = ax.scatter(dihedrals[:, 1], dihedrals[:, 2], c=list(range(len(dihedrals))), cmap="gist_rainbow", s=4) 523 | ax.set_xlabel('$\phi$') 524 | ax.set_ylabel('$\psi$') 525 | ax.set_title('data in phi-psi space') 526 | fig.colorbar(im, ax=ax) 527 | fig.set_size_inches((15, 5)) 528 | fig.savefig('metadynamics_biasfactor_%f.png' % biasfactor) 529 | subprocess.check_output(['rm', '-rf', output_folder]) 530 | return 531 | 532 | @staticmethod 533 | def test_biased_simulation_alanine_dipeptide_with_metadynamics_multiple(): 534 | test_biased_simulation.test_biased_simulation_alanine_dipeptide_with_metadynamics(0, -1) 535 | for item in [5, 20, 100]: 536 | test_biased_simulation.test_biased_simulation_alanine_dipeptide_with_metadynamics(1, item) 537 | return 538 | 539 | 540 | class test_Helper_func(object): 541 | @staticmethod 542 | def test_compute_distances_min_image_convention(): 543 | output_pdb = 'out_for_computing_distances.pdb' 544 | subprocess.check_output(['python', '../src/biased_simulation_general.py', 'Trp_cage', '50', '1000', '0', 'temp_out_12345', 545 | 'none', 'pc_0,0', 'explicit', 'NPT', '--platform', 'CUDA', '--device', '0', '--out_traj', output_pdb]) 546 | import mdtraj as md 547 | box_length = 4.5 # in nm 548 | temp_t = md.load(output_pdb) 549 | temp_t.unitcell_lengths, temp_t.unitcell_angles = box_length * np.ones((20, 3)), 90 * np.ones((20, 3)) 550 | temp_u = Universe(output_pdb) 551 | a_sel = temp_u.select_atoms('name N') 552 | b_sel = temp_u.select_atoms('name O and resname HOH') 553 | absolute_index = b_sel.atoms.indices[30] 554 | b_positions = np.array([b_sel.positions for _ in temp_u.trajectory]) 555 | b_positions = b_positions.reshape(20, b_positions.shape[1] * b_positions.shape[2]) 556 | a_positions = np.array([a_sel.positions for _ in temp_u.trajectory]) 557 | a_positions = a_positions.reshape(20, a_positions.shape[1] * a_positions.shape[2]) 558 | result = Helper_func.compute_distances_min_image_convention(a_positions, b_positions, 10 * box_length) 559 | assert_almost_equal(md.compute_distances(temp_t, [[0, absolute_index]]).flatten(), result[:, 0, 30] / 10, decimal=4) 560 | subprocess.check_output(['rm', '-rf', output_pdb, 'temp_out_12345']) 561 | return 562 | 563 | @staticmethod 564 | def test_shuffle_multiple_arrays(): 565 | a = np.random.rand(10, 2) 566 | b1, b2 =Helper_func.shuffle_multiple_arrays([a[:, 0], a[:, 1]]) 567 | for item in range(10): 568 | assert( [b1[item], b2[item]] in a) 569 | return 570 | 571 | @staticmethod 572 | def test_attempt_to_save_npy(): 573 | import shutil 574 | def get_num_files_in_folder(temp_folder): return len(os.listdir(temp_folder)) 575 | a = 2 * np.eye(3, 3) 576 | folder = 'temp_test_save_npy' 577 | if os.path.exists(folder): shutil.rmtree(folder) 578 | os.mkdir(folder) 579 | filename = folder + '/1.npy' 580 | Helper_func.attempt_to_save_npy(filename, a) 581 | assert (os.path.isfile(filename)) 582 | assert (get_num_files_in_folder(folder) == 1) 583 | Helper_func.attempt_to_save_npy(filename, a) 584 | assert (get_num_files_in_folder(folder) == 1) 585 | for item in range(10): 586 | 
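            # expected behavior (inferred from the assertions below): re-saving
            # identical content is a no-op, while modified content is written under
            # a fresh name, so the file count grows by one per distinct array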
            Helper_func.attempt_to_save_npy(filename, a+item)
587 |             assert (get_num_files_in_folder(folder) == item + 1)
588 |         shutil.rmtree(folder)
589 |         return
590 | 
591 | 
592 | class test_others(object):
593 |     @staticmethod
594 |     def test_SphSh_INDUS_PLUMED_plugin():
595 |         # test for the original version of the SphSh INDUS PLUMED plugin provided by Prof. Amish Patel
596 |         num_frames = 20
597 |         potential_center = np.random.random(size=3) * 3
598 |         with open('temp_plumed.txt', 'w') as my_f:
599 |             my_f.write('''
600 | SPHSH ATOMS=306-11390:4 XCEN=%f YCEN=%f ZCEN=%f RLOW=-0.5 RHIGH=0.311 SIGMA=0.01 CUTOFF=0.02 LABEL=sph
601 | SPHSH ATOMS=306-11390:4 XCEN=%f YCEN=%f ZCEN=%f RLOW=0.05 RHIGH=0.311 SIGMA=0.01 CUTOFF=0.02 LABEL=sph_2
602 | RESTRAINT ARG=sph.Ntw AT=5 KAPPA=5 SLOPE=0 LABEL=mypotential
603 | PRINT STRIDE=50 ARG=sph.N,sph.Ntw,sph_2.N,sph_2.Ntw FILE=NDATA''' \
604 |                        % (potential_center[0], potential_center[1], potential_center[2],
605 |                           potential_center[0], potential_center[1], potential_center[2]))  # a "TER" record separates solute and solvent in the pdb file, so the water atom index starts at 306, not 307
606 |         out_pdb = 'temp_plumed/output_fc_0.0_pc_[0.0,0.0]_T_300_explicit_NPT.pdb'
607 |         subprocess.check_output(['python', '../src/biased_simulation_general.py', 'Trp_cage', '50', '1000', '0',
608 |                                  'temp_plumed', 'none', 'pc_0,0', 'explicit', 'NPT', '--platform', 'CUDA',
609 |                                  '--bias_method', 'plumed_other', '--plumed_file', 'temp_plumed.txt',
610 |                                  '--out_traj', out_pdb])
611 |         temp_u = Universe(out_pdb)
612 |         reporter_file = out_pdb.replace('output', 'report').replace('.pdb', '.txt')
613 |         box_length_list = Helper_func.get_box_length_list_fom_reporter_file(reporter_file, unit='A')
614 |         O_sel = temp_u.select_atoms('name O and resname HOH')
615 |         O_coords = np.array([O_sel.positions for _ in temp_u.trajectory]).reshape(num_frames, 2772 * 3)
616 |         distances = Helper_func.compute_distances_min_image_convention(
617 |             10 * np.array([potential_center for _ in range(num_frames)]), O_coords, box_length_list)
618 |         coarse_count, actual_count = Helper_func.get_cg_count_in_sphere(distances, 3.11, 0.2, .1)  # coarse-grained (Ntw) and actual (N) water counts within the sphere
619 |         plumed_count = np.loadtxt('NDATA')
620 |         assert_almost_equal(plumed_count[-num_frames:, 1], actual_count.flatten())
621 |         assert_almost_equal(plumed_count[-num_frames:, 2], coarse_count.flatten(), decimal=2)
622 |         coarse_count_1, actual_count_1 = Helper_func.get_cg_count_in_shell(distances, 0.5, 3.11, 0.2, .1)
623 |         assert_almost_equal(plumed_count[-num_frames:, 3], actual_count_1.flatten())
624 |         assert_almost_equal(plumed_count[-num_frames:, 4], coarse_count_1.flatten(), decimal=2)
625 |         subprocess.check_output(['rm', '-rf', 'temp_plumed', 'NDATA', 'temp_plumed.txt'])
626 |         return
627 | 
628 |     @staticmethod
629 |     def test_SphShMod_INDUS_PLUMED_plugin():
630 |         num_frames = 20
631 |         with open('temp_plumed.txt', 'w') as my_f:
632 |             my_f.write('''
633 | SPHSHMOD ATOMS=306-11390:4 ATOMREF=1 RLOW=-0.5 RHIGH=0.311 SIGMA=0.01 CUTOFF=0.02 LABEL=sph
634 | RESTRAINT ARG=sph.Ntw AT=10 KAPPA=5 SLOPE=0 LABEL=mypotential
635 | PRINT STRIDE=50 ARG=sph.N,sph.Ntw FILE=NDATA''')
636 |         out_pdb = 'temp_plumed/output_fc_0.0_pc_[0.0,0.0]_T_300_explicit_NPT.pdb'
637 |         subprocess.check_output(['python', '../src/biased_simulation_general.py', 'Trp_cage', '50', '1000', '0',
638 |                                  'temp_plumed', 'none', 'pc_0,0', 'explicit', 'NPT', '--platform', 'CUDA',
639 |                                  '--bias_method', 'plumed_other', '--plumed_file', 'temp_plumed.txt',
640 |                                  '--out_traj', out_pdb])
641 |         temp_u = Universe(out_pdb)
642 |         reporter_file = out_pdb.replace('output', 'report').replace('.pdb', '.txt')
643 |         box_length_list = Helper_func.get_box_length_list_fom_reporter_file(reporter_file, unit='A')
644 |         print(box_length_list)
645 |         O_sel = temp_u.select_atoms('name O and resname HOH')
646 |         N_sel = temp_u.select_atoms('resnum 1 and name N')
647 |         O_coords = np.array([O_sel.positions for _ in temp_u.trajectory]).reshape(num_frames, 2772 * 3)
648 |         N_coords = np.array([N_sel.positions for _ in temp_u.trajectory]).reshape(num_frames, 3)
649 |         distances = Helper_func.compute_distances_min_image_convention(N_coords, O_coords, box_length_list)
650 |         coarse_count, actual_count = Helper_func.get_cg_count_in_sphere(distances, 3.11, 0.2, .1)
651 |         plumed_count = np.loadtxt('NDATA')
652 |         assert_almost_equal(plumed_count[-num_frames:, 1], actual_count.flatten())
653 |         assert_almost_equal(plumed_count[-num_frames:, 2], coarse_count.flatten(), decimal=2)
654 |         subprocess.check_output(['rm', '-rf', 'temp_plumed', 'NDATA', 'temp_plumed.txt'])
655 |         return
656 | 
--------------------------------------------------------------------------------
/MD_simulation_on_alanine_dipeptide/current_work/tests/Makefile:
--------------------------------------------------------------------------------
1 | SHELL=/bin/bash
2 | 
3 | test:
4 | 	nosetests --with-coverage --cover-package=../ ANN_simulation_test.py  # nosetests with coverage
5 | 
6 | clean:
7 | 	rm -rf *.pkl *.pyc *.png *.txt *.pdb *.pdf *.hdf5 *.chk bck.* NDATA temp_save *.npy temp_model.dot temp_save_pytorch .coverage *.pth
8 | 
9 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![plumID:19.065](https://www.plumed-nest.org/eggs/19/065/badge.svg)](https://www.plumed-nest.org/eggs/19/065/)
2 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
3 | 
4 | # Accelerated sampling with data-augmented autoencoders
5 | 
6 | This is the framework for running accelerated sampling with data-augmented autoencoders.
7 | 
8 | ## Dependency
9 | 
10 | OpenMM simulation package: https://github.com/pandegroup/openmm
11 | 
12 | ANN_Force biasing force package: https://github.com/weiHelloWorld/ANN_Force
13 | 
14 | Keras: https://github.com/fchollet/keras
15 | 
16 | PyTorch: https://pytorch.org
17 | 
18 | MDAnalysis: https://github.com/MDAnalysis/mdanalysis
19 | 
20 | Nose testing framework: https://github.com/nose-devs/nose
21 | 
22 | PLUMED (ANN included): https://github.com/plumed/plumed2 + https://github.com/weiHelloWorld/plumed_additional
23 | 
24 | cluster management: https://github.com/weiHelloWorld/cluster_management
25 | 
26 | plumed helper: https://github.com/weiHelloWorld/plumed_helper
27 | 
28 | OpenMM-PLUMED force plugin: https://github.com/peastman/openmm-plumed
29 | 
30 | Bayes WHAM free energy calculation package: https://bitbucket.org/andrewlferguson/bayeswham_python
31 | 
32 | Some other Python scientific computing packages (e.g. seaborn, pandas, sklearn) are also needed; it is recommended to install them with Anaconda: https://www.continuum.io/downloads
33 | 
34 | 
35 | ## Installation and preparation
36 | 
37 | No installation is required. You may simply install all dependent packages and check out this repository.
38 | 
39 | It is **highly recommended** to run the tests before running any code, to make sure all packages are correctly installed.
40 | 
41 | ## Testing
42 | 
43 | This package uses the `nose` testing framework. To run the tests, run
44 | 
45 | ```bash
46 | root_dir=MD_simulation_on_alanine_dipeptide/current_work
47 | cd ${root_dir}/tests
48 | make test
49 | ```
50 | 
51 | Tests include numerical unit tests (for cases with clear expected results) and figure plots (for others, such as neural network training).
52 | 
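A single test class can also be selected on its own (a convenience sketch using nose's `file:Class` selector, run from `${root_dir}/tests`; `test_autoencoder_torch` is one of the classes defined in `ANN_simulation_test.py`):

```bash
# run only the PyTorch autoencoder tests
nosetests ANN_simulation_test.py:test_autoencoder_torch
```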
53 | ## 1-minute quick start
54 | 
55 | Go ahead and modify the configuration file `${root_dir}/src/config.py`, then run
56 | 
57 | ```bash
58 | python main_work.py
59 | ```
60 | 
61 | For more options, type
62 | 
63 | ```bash
64 | python main_work.py --help
65 | ```
66 | 
67 | ## Quick introduction to autoencoders
68 | 
69 | A typical autoencoder consists of an encoder ANN and a decoder ANN: the encoder maps inputs to a small number of collective variables (CVs) in the encoding layer, and the decoder tries to reconstruct the inputs (or some variant of the inputs) from the CVs:
70 | 
71 | ![](figures/diagram_autoencoder.png)
72 | 
73 | A typical 5-layer structure is given below:
74 | 
75 | ![](figures/autoencoder_2.png)
76 | 
77 | For traditional autoencoders, we minimize
78 | 
79 | $$E=|A(x)-x|^2 + R$$
80 | 
81 | where $A$ is the autoencoder mapping function and $R$ is a regularization term.
82 | 
83 | To remove external degrees of freedom, we use data-augmented autoencoders, which minimize
84 | 
85 | $$E=|A(x)-L(x)|^2 + R$$
86 | 
87 | where $L$ is the alignment function responsible for data augmentation. It can be written in "cat form" (cat = molecule configuration, little human = alignment function $L$):
88 | 
89 | ![](figures/autoencoder_1.png)
90 | 
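To make this concrete, below is a minimal Keras sketch of the data-augmented objective $E=|A(x)-L(x)|^2 + R$. It is illustrative only, not the implementation in `${root_dir}/src/autoencoders.py`; the 8-15-2-15-8 layout mirrors the alanine dipeptide networks used in the tests, and the two `.npy` file names are placeholder assumptions. The only difference from a traditional autoencoder is that the fitting targets are the aligned configurations $L(x)$ rather than the raw inputs $x$:

```python
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.regularizers import l2

x = np.load('input_coordinates.npy')            # raw configurations x (placeholder file name)
aligned_x = np.load('aligned_coordinates.npy')  # alignment output L(x), same shape as x (placeholder)

# encoder (8 -> 15 -> 2 CVs) followed by decoder (2 -> 15 -> 8); l2 provides the R term
model = Sequential([
    Dense(15, activation='tanh', input_dim=8, kernel_regularizer=l2(0.001)),
    Dense(2, activation='tanh', kernel_regularizer=l2(0.001)),   # encoding layer: the CVs
    Dense(15, activation='tanh', kernel_regularizer=l2(0.001)),
    Dense(8, activation='linear', kernel_regularizer=l2(0.001)),
])
model.compile(optimizer='adam', loss='mse')   # |A(x) - L(x)|^2
model.fit(x, aligned_x, epochs=50, batch_size=100)

# encoder half of the trained network (weights are shared): CV values for each frame
cvs = Sequential(model.layers[:2]).predict(x)
```

In this repository the same idea is wrapped by the `autoencoder_Keras` and `autoencoder_torch` classes (see the `node_num` and `hidden_layers_types` arguments in `${root_dir}/tests/ANN_simulation_test.py`), which also handle hierarchical variants and model saving/loading.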
91 | To reduce the dependence on any specific reference, we apply multiple references to data-augmented autoencoders; the corresponding error function is
92 | 
93 | $$E=\sum_j |A_j(x)-L_j(x)|^2 + R$$
94 | 
95 | where the $A_j$ are autoencoders that share all but the last layer, and $L_j$ is the alignment function with respect to reference $j$.
96 | 
97 | If we want to see the relative importance of these CVs, we construct multiple outputs, with each output taking contributions from a subset of the CVs in the encoding layer. Two possible types of network topology are given below:
98 | 
99 | ![](figures/hierarchical_autoencoder.png)
100 | 
101 | The corresponding error function is then
102 | 
103 | $$E=E_{1}+E_{1,2}+E_{1,2,3}+\dots$$
104 | 
105 | where $E_{1}$ is the reconstruction error when only the 1st CV is used to compute the output, $E_{1,2}$ is the reconstruction error when only the first two CVs are used, and so on.
106 | 
107 | See slides for more information: (TODO)
108 | 
109 | 
110 | ## Directory structure
111 | 
112 | Directories are arranged as follows:
113 | 
114 | ```
115 | ${root_dir}/src: source code
116 | ${root_dir}/target: output of simulation data (pdb files and coordinate files)
117 | ${root_dir}/resources: training results (autoencoders) and reference configuration files (pdb files)
118 | ${root_dir}/tests: test source code
119 | ```
120 | 
121 | 
122 | ## Extensions
123 | 
124 | #### 1. apply to new molecules
125 | 
126 | 1. Create a subclass of `Sutils` for the molecule and implement the corresponding methods in `${root_dir}/src/molecule_spec_sutils.py`.
127 | 
128 | 2. Include molecule-specific information in the configuration file `${root_dir}/src/config.py`, and modify the corresponding configuration settings.
129 | 
130 | 3. Modify the biased simulation file (`${root_dir}/src/biased_simulation_general.py`) for the new molecule.
131 | 
132 | 4. Add molecule-related statements to `${root_dir}/src/ANN_simulation.py` and `${root_dir}/src/autoencoders.py` wherever `Trp_cage` appears.
133 | 
134 | #### 2. use a new neural network architecture or switch to a new training backend
135 | 
136 | 1. Create a subclass of `autoencoder` for the new architecture/backend and implement it. Note that all abstract methods (`@abc.abstractmethod`) must be implemented.
137 | 
138 | 2. Include the new network information in the configuration file `${root_dir}/src/config.py`.
139 | 
140 | #### 3. apply a new potential center selection algorithm
141 | 
142 | Modify the method `Sutils.get_boundary_points()` in `${root_dir}/src/molecule_spec_sutils.py`.
143 | 
144 | #### 4. use a new simulation package
145 | 
146 | Modify `biased_simulation.py` or `biased_simulation_general.py`.
147 | 
148 | ## Citation
149 | 
150 | If you use this code in your work, please cite:
151 | 
152 | - Chen, Wei, and Andrew L. Ferguson. "Molecular enhanced sampling with autoencoders: On‐the‐fly collective variable discovery and accelerated free energy landscape exploration." Journal of Computational Chemistry 39.25 (2018): 2079-2102.
153 | 
154 | - Chen, Wei, Aik Rui Tan, and Andrew L. Ferguson. "Collective variable discovery and enhanced sampling using autoencoders: Innovations in network architecture and error function design." The Journal of Chemical Physics 149.7 (2018): 072312.
155 | 
156 | ## Contact
157 | 
158 | For any questions, feel free to contact weichen9@illinois.edu or open a GitHub issue.
159 | 
--------------------------------------------------------------------------------
/archive/plumed_adp.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiHelloWorld/accelerated_sampling_with_autoencoder/fe2b98bc81fc0b30db42ca8a83e23adb775a487d/archive/plumed_adp.zip
--------------------------------------------------------------------------------
/figures/autoencoder_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiHelloWorld/accelerated_sampling_with_autoencoder/fe2b98bc81fc0b30db42ca8a83e23adb775a487d/figures/autoencoder_1.png
--------------------------------------------------------------------------------
/figures/autoencoder_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiHelloWorld/accelerated_sampling_with_autoencoder/fe2b98bc81fc0b30db42ca8a83e23adb775a487d/figures/autoencoder_2.png
--------------------------------------------------------------------------------
/figures/diagram_autoencoder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiHelloWorld/accelerated_sampling_with_autoencoder/fe2b98bc81fc0b30db42ca8a83e23adb775a487d/figures/diagram_autoencoder.png
--------------------------------------------------------------------------------
/figures/hierarchical_autoencoder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/weiHelloWorld/accelerated_sampling_with_autoencoder/fe2b98bc81fc0b30db42ca8a83e23adb775a487d/figures/hierarchical_autoencoder.png
--------------------------------------------------------------------------------